def test_tokenize(self):
  """Checks `_tokenize` with the default spec and with `lowercase=False`."""
  # Default spec: the expected tokens are lowercased and carry no trailing
  # period.
  default_spec = ms.AverageWordVecModelSpec()
  self.assertEqual(
      default_spec._tokenize('It\'s really good.'),
      ['it\'s', 'really', 'good'])

  # lowercase=False: the expected tokens keep their original casing and the
  # exclamation marks are not part of any token.
  case_preserving_spec = ms.AverageWordVecModelSpec(lowercase=False)
  self.assertEqual(
      case_preserving_spec._tokenize('That is so cool!!!'),
      ['That', 'is', 'so', 'cool'])
def setUp(self):
  """Builds a `seq_len=5` model spec and attaches a small fixed vocab."""
  super(AverageWordVecModelSpecTest, self).setUp()
  self.model_spec = ms.AverageWordVecModelSpec(seq_len=5)

  # Token -> id mapping; ids follow the listing order, starting at 0.
  ordered_tokens = ('<PAD>', '<START>', '<UNKNOWN>', 'good', 'bad')
  self.vocab = collections.OrderedDict(
      (token, token_id) for token_id, token in enumerate(ordered_tokens))
  self.model_spec.vocab = self.vocab
def create(train_data,
           model_spec=None,
           shuffle=False,
           batch_size=32,
           epochs=None,
           validation_data=None):
  """Loads data and trains the model for text classification.

  Args:
    train_data: Training data.
    model_spec: Specification for the model. If None, a new
      `ms.AverageWordVecModelSpec()` is created for this call.
    shuffle: Whether the data should be shuffled.
    batch_size: Batch size for training.
    epochs: Number of epochs for training.
    validation_data: Validation data. If None, skips validation process.

  Returns:
    TextClassifier

  Raises:
    ValueError: If the current TF behavior is not listed in
      `model_spec.compat_tf_versions`.
  """
  # Build the default spec per call. A default instance in the signature is
  # created once, when the function is defined, and would then be shared by
  # every call that omits `model_spec`.
  if model_spec is None:
    model_spec = ms.AverageWordVecModelSpec()

  tf_behavior = compat.get_tf_behavior()
  if tf_behavior not in model_spec.compat_tf_versions:
    raise ValueError('Incompatible versions. Expect {}, but got {}.'.format(
        model_spec.compat_tf_versions, tf_behavior))

  text_classifier = TextClassifier(
      model_spec,
      train_data.index_to_label,
      train_data.num_classes,
      shuffle=shuffle)

  tf.compat.v1.logging.info('Retraining the models...')
  text_classifier.train(train_data, validation_data, epochs, batch_size)

  return text_classifier
def test_from_csv(self):
  """Loads the CSV fixture via `from_csv` and validates the loaded data."""
  csv_path = self._get_csv_file()
  spec = ms.AverageWordVecModelSpec()
  loader_cls = text_dataloader.TextClassifierDataLoader
  loaded = loader_cls.from_csv(
      csv_path,
      text_column='text',
      label_column='label',
      model_spec=spec)
  self._test_data(loaded, spec)
def test_average_wordvec_model_create_v1_incompatible(self):
  """Expects the spec/load/create flow to raise 'Incompatible versions'."""
  expect_incompatible = self.assertRaisesRegex(ValueError,
                                               'Incompatible versions')
  # The whole flow stays inside the context manager, so a matching ValueError
  # from any of the three steps satisfies the expectation.
  with expect_incompatible:
    spec = ms.AverageWordVecModelSpec(seq_len=2)
    dataset = text_dataloader.TextClassifierDataLoader.from_folder(
        self.text_dir, model_spec=spec)
    text_classifier.create(dataset, model_spec=spec)
def test_average_wordvec_model(self):
  """Trains an average-word-vec classifier and runs the shared checks on it."""
  spec = ms.AverageWordVecModelSpec(seq_len=2)
  dataset = text_dataloader.TextClassifierDataLoader.from_folder(
      self.text_dir, model_spec=spec)

  # Splits data, 90% data for training, 10% for testing.
  self.train_data, self.test_data = dataset.split(0.9)

  training_options = dict(
      model_spec=spec, epochs=2, batch_size=4, shuffle=True)
  classifier = text_classifier.create(
      self.train_data, mef.ModelExportFormat.TFLITE, **training_options)

  self._test_accuracy(classifier)
  self._test_export_to_tflite(classifier)
  self._test_predict_top_k(classifier)
def test_get_cache_filenames(self):
  """Checks that cache file names share a prefix that tracks every input."""

  def prefix_for(cache_dir, model_spec, data_name):
    # Returns only the prefix element of the 3-tuple result.
    _, _, found_prefix = dataloader.get_cache_filenames(
        cache_dir=cache_dir, model_spec=model_spec, data_name=data_name)
    return found_prefix

  tfrecord_file, meta_data_file, prefix = dataloader.get_cache_filenames(
      cache_dir='/tmp', model_spec=self.model_spec, data_name='train')
  # Both generated file names start with the returned prefix.
  for cache_file in (tfrecord_file, meta_data_file):
    self.assertTrue(cache_file.startswith(prefix))

  # Changing any single input must yield a different prefix.
  other_dir_prefix = prefix_for('/tmp1', self.model_spec, 'train')
  self.assertNotEqual(other_dir_prefix, prefix)

  other_spec_prefix = prefix_for(
      '/tmp', ms.AverageWordVecModelSpec(seq_len=8), 'train')
  self.assertNotEqual(other_spec_prefix, prefix)

  other_name_prefix = prefix_for('/tmp', self.model_spec, 'test')
  self.assertNotEqual(other_name_prefix, prefix)
def test_average_wordvec_model(self):
  """Runs a minimal training pass and exercises every export/predict check."""
  spec = ms.AverageWordVecModelSpec(seq_len=2)
  dataset = text_dataloader.TextClassifierDataLoader.from_folder(
      self.text_dir, model_spec=spec)

  # Splits the data with a 0.5 fraction.
  # NOTE(review): presumably half for training and half for testing -- confirm
  # against `split`.
  self.train_data, self.test_data = dataset.split(0.5)

  training_options = dict(
      model_spec=spec, epochs=1, batch_size=1, shuffle=True)
  classifier = text_classifier.create(self.train_data, **training_options)

  # A threshold of 0.0 is passed to the accuracy-based checks.
  self._test_accuracy(classifier, threshold=0.0)
  self._test_predict_top_k(classifier)
  self._test_export_to_tflite(classifier, threshold=0.0)
  self._test_export_to_saved_model(classifier)
  self._test_export_labels(classifier)
  self._test_export_vocab(classifier)
  self._test_model_without_training(spec)
def test_from_folder(self):
  """Loads the folder fixture via `from_folder` and validates the result."""
  fixture_dir = self._get_folder_path()
  spec = ms.AverageWordVecModelSpec()
  loader_cls = text_dataloader.TextClassifierDataLoader
  loaded = loader_cls.from_folder(fixture_dir, model_spec=spec)
  self._test_data(loaded, spec)
def from_csv(cls,
             filename,
             text_column,
             label_column,
             fieldnames=None,
             model_spec=None,
             is_training=True,
             delimiter=',',
             quotechar='"',
             shuffle=False,
             cache_dir=None):
  """Loads text with labels from the csv file and preprocesses it.

  Text is preprocessed according to `model_spec`.

  Args:
    filename: Name of the file.
    text_column: String, Column name for input text.
    label_column: String, Column name for labels.
    fieldnames: A sequence, used in csv.DictReader. If fieldnames is omitted,
      the values in the first row of file f will be used as the fieldnames.
    model_spec: Specification for the model. If None, a new
      `ms.AverageWordVecModelSpec()` is created for this call.
    is_training: Whether the loaded data is for training or not.
    delimiter: Character used to separate fields.
    quotechar: Character used to quote fields containing special characters.
    shuffle: boolean, if shuffle, random shuffle data.
    cache_dir: The cache directory to save preprocessed data. If None,
      generates a temporary directory to cache preprocessed data.

  Returns:
    TextDataset containing text, labels and other related info.
  """
  # Build the default spec per call. A default instance in the signature is
  # created once, when the function is defined, and would then be shared by
  # every call that omits `model_spec`.
  if model_spec is None:
    model_spec = ms.AverageWordVecModelSpec()

  csv_name = os.path.basename(filename)

  is_cached, tfrecord_file, meta_data_file, vocab_file = cls._get_cache_info(
      cache_dir, csv_name, model_spec, is_training)
  # If cached, directly loads data from cache directory.
  if is_cached:
    return cls._load_data(tfrecord_file, meta_data_file, model_spec)

  lines = cls._read_csv(filename, fieldnames, delimiter, quotechar)
  if shuffle:
    random.shuffle(lines)

  # Gets labels: the sorted set of distinct values in the label column.
  label_names = sorted({line[label_column] for line in lines})

  # Generates text examples from csv file; the guid combines the csv base
  # name with the row position (after any shuffling).
  examples = [
      classifier_data_lib.InputExample(
          '%s-%d' % (csv_name, i), line[text_column], None,
          line[label_column])
      for i, line in enumerate(lines)
  ]

  # Saves preprocessed data and other assets into files.
  cls._save_data(examples, model_spec, label_names, tfrecord_file,
                 meta_data_file, vocab_file, is_training)

  # Loads data from cache directory.
  return cls._load_data(tfrecord_file, meta_data_file, model_spec)
def from_folder(cls,
                filename,
                model_spec=None,
                is_training=True,
                class_labels=None,
                shuffle=True,
                cache_dir=None):
  """Loads text with labels and preprocesses text according to `model_spec`.

  Assume the text data of the same label are in the same subdirectory. Each
  file is one text.

  Args:
    filename: Path of the folder that holds one subdirectory per label.
    model_spec: Specification for the model. If None, a new
      `ms.AverageWordVecModelSpec()` is created for this call.
    is_training: Whether the loaded data is for training or not.
    class_labels: Class labels that should be considered. Name of the
      subdirectory not in `class_labels` will be ignored. If None, all the
      subdirectories will be considered.
    shuffle: boolean, if shuffle, random shuffle data.
    cache_dir: The cache directory to save preprocessed data. If None,
      generates a temporary directory to cache preprocessed data.

  Returns:
    TextDataset containing text, labels and other related info.

  Raises:
    ValueError: If no text file is found under the folder.
  """
  # Build the default spec per call. A default instance in the signature is
  # created once, when the function is defined, and would then be shared by
  # every call that omits `model_spec`.
  if model_spec is None:
    model_spec = ms.AverageWordVecModelSpec()

  data_root = os.path.abspath(filename)
  folder_name = os.path.basename(data_root)

  is_cached, tfrecord_file, meta_data_file, vocab_file = cls._get_cache_info(
      cache_dir, folder_name, model_spec, is_training)
  # If cached, directly loads data from cache directory.
  if is_cached:
    return cls._load_data(tfrecord_file, meta_data_file, model_spec)

  # Gets paths of all text.
  if class_labels:
    all_text_paths = []
    for class_label in class_labels:
      all_text_paths.extend(
          tf.io.gfile.glob(os.path.join(data_root, class_label) + r'/*'))
  else:
    all_text_paths = list(tf.io.gfile.glob(data_root + r'/*/*'))

  if not all_text_paths:
    raise ValueError('Text size is zero')

  if shuffle:
    random.shuffle(all_text_paths)

  # Gets label and its index.
  if class_labels:
    label_names = sorted(class_labels)
  else:
    label_names = sorted(
        name for name in os.listdir(data_root)
        if os.path.isdir(os.path.join(data_root, name)))

  # Generates text examples from folder. The label of a text is the name of
  # the directory that directly contains its file.
  examples = []
  for i, path in enumerate(all_text_paths):
    with tf.io.gfile.GFile(path, 'r') as f:
      text = f.read()
    guid = '%s-%d' % (folder_name, i)
    label = os.path.basename(os.path.dirname(path))
    examples.append(classifier_data_lib.InputExample(guid, text, None, label))

  # Saves preprocessed data and other assets into files.
  cls._save_data(examples, model_spec, label_names, tfrecord_file,
                 meta_data_file, vocab_file, is_training)

  # Loads data from cache directory.
  return cls._load_data(tfrecord_file, meta_data_file, model_spec)
def setUp(self):
  """Prepares the `seq_len=4` model spec stored on `self.model_spec`."""
  super(DataLoaderTest, self).setUp()
  spec = ms.AverageWordVecModelSpec(seq_len=4)
  self.model_spec = spec