예제 #1
0
    def test_tokenize(self):
        """Checks `_tokenize` with the default spec and with `lowercase=False`."""
        # Default spec: the expected tokens are lower-cased and the trailing
        # period is not part of the last token.
        default_spec = ms.AverageWordVecModelSpec()
        self.assertEqual(
            default_spec._tokenize('It\'s really good.'),
            ['it\'s', 'really', 'good'])

        # lowercase=False: the original casing is expected to be kept and the
        # trailing exclamation marks are not part of the last token.
        cased_spec = ms.AverageWordVecModelSpec(lowercase=False)
        self.assertEqual(
            cased_spec._tokenize('That is so cool!!!'),
            ['That', 'is', 'so', 'cool'])
 def setUp(self):
     """Creates a spec with `seq_len=5` and installs a small fixed vocabulary.

     After this runs, `self.vocab` and `self.model_spec.vocab` refer to the
     same ordered mapping from token to index.
     """
     super(AverageWordVecModelSpecTest, self).setUp()
     spec = ms.AverageWordVecModelSpec(seq_len=5)
     # Indices follow the position of each token in this tuple (0..4).
     tokens = ('<PAD>', '<START>', '<UNKNOWN>', 'good', 'bad')
     vocab = collections.OrderedDict(
         (token, index) for index, token in enumerate(tokens))
     spec.vocab = vocab
     self.model_spec = spec
     self.vocab = vocab
예제 #3
0
def create(train_data,
           model_spec=ms.AverageWordVecModelSpec(),
           shuffle=False,
           batch_size=32,
           epochs=None,
           validation_data=None):
  """Loads data and trains the model for text classification.

  NOTE(review): the default `model_spec` is constructed once, when this
  function is defined, so every call relying on the default shares that single
  spec instance. Pass an explicit spec if that sharing is not wanted.

  Args:
    train_data: Training data. Must expose `index_to_label` and `num_classes`.
    model_spec: Specification for the model. Must expose `compat_tf_versions`.
    shuffle: Whether the data should be shuffled.
    batch_size: Batch size for training.
    epochs: Number of epochs for training.
    validation_data: Validation data. If None, skips validation process.

  Returns:
    The `TextClassifier` instance after its `train` method has been called.

  Raises:
    ValueError: If `compat.get_tf_behavior()` is not listed in
      `model_spec.compat_tf_versions`.
  """
  # Refuse to build a model whose spec does not support the active TF behavior.
  current_behavior = compat.get_tf_behavior()
  supported_versions = model_spec.compat_tf_versions
  if current_behavior not in supported_versions:
    message = 'Incompatible versions. Expect {}, but got {}.'.format(
        supported_versions, current_behavior)
    raise ValueError(message)

  classifier = TextClassifier(
      model_spec,
      train_data.index_to_label,
      train_data.num_classes,
      shuffle=shuffle)

  tf.compat.v1.logging.info('Retraining the models...')
  classifier.train(train_data, validation_data, epochs, batch_size)
  return classifier
 def test_from_csv(self):
     """Loads the CSV fixture via `from_csv` and validates it with `_test_data`."""
     csv_path = self._get_csv_file()
     spec = ms.AverageWordVecModelSpec()
     loader = text_dataloader.TextClassifierDataLoader
     # The fixture is read using its 'text' and 'label' columns.
     loaded = loader.from_csv(
         csv_path, text_column='text', label_column='label', model_spec=spec)
     self._test_data(loaded, spec)
예제 #5
0
 def test_average_wordvec_model_create_v1_incompatible(self):
     """Expects a ValueError matching 'Incompatible versions'.

     NOTE(review): the test name suggests it is meant to run under TF1
     behavior; the configuration that enables that is not visible here.
     """
     # Spec creation, data loading and `create` all run inside the context
     # manager, so a matching ValueError from any of them satisfies the test.
     with self.assertRaisesRegex(ValueError, 'Incompatible versions'):
         spec = ms.AverageWordVecModelSpec(seq_len=2)
         loader = text_dataloader.TextClassifierDataLoader
         dataset = loader.from_folder(self.text_dir, model_spec=spec)
         text_classifier.create(dataset, model_spec=spec)
예제 #6
0
    def test_average_wordvec_model(self):
        """Trains an average-word-vec classifier and runs the model checks."""
        spec = ms.AverageWordVecModelSpec(seq_len=2)
        loader = text_dataloader.TextClassifierDataLoader
        dataset = loader.from_folder(self.text_dir, model_spec=spec)
        # Split with a 0.9 fraction: 90% of the data for training, 10% for
        # testing.
        self.train_data, self.test_data = dataset.split(0.9)

        # The export format is passed positionally as the second argument.
        trained = text_classifier.create(
            self.train_data,
            mef.ModelExportFormat.TFLITE,
            model_spec=spec,
            epochs=2,
            batch_size=4,
            shuffle=True)

        self._test_accuracy(trained)
        self._test_export_to_tflite(trained)
        self._test_predict_top_k(trained)
예제 #7
0
    def test_get_cache_filenames(self):
        """Checks that the cache prefix depends on dir, model spec and data name."""
        tfrecord_file, meta_data_file, base_prefix = (
            dataloader.get_cache_filenames(
                cache_dir='/tmp', model_spec=self.model_spec,
                data_name='train'))
        # Both cache files must be named after the shared prefix.
        for cache_path in (tfrecord_file, meta_data_file):
            self.assertTrue(cache_path.startswith(base_prefix))

        # A different cache directory yields a different prefix.
        other_dir_prefix = dataloader.get_cache_filenames(
            cache_dir='/tmp1', model_spec=self.model_spec,
            data_name='train')[2]
        self.assertNotEqual(other_dir_prefix, base_prefix)

        # A different model spec yields a different prefix.
        other_spec_prefix = dataloader.get_cache_filenames(
            cache_dir='/tmp',
            model_spec=ms.AverageWordVecModelSpec(seq_len=8),
            data_name='train')[2]
        self.assertNotEqual(other_spec_prefix, base_prefix)

        # A different data name yields a different prefix.
        other_name_prefix = dataloader.get_cache_filenames(
            cache_dir='/tmp', model_spec=self.model_spec,
            data_name='test')[2]
        self.assertNotEqual(other_name_prefix, base_prefix)
    def test_average_wordvec_model(self):
        """Trains briefly, then runs accuracy, prediction and export checks."""
        spec = ms.AverageWordVecModelSpec(seq_len=2)
        loader = text_dataloader.TextClassifierDataLoader
        dataset = loader.from_folder(self.text_dir, model_spec=spec)
        # Split with a 0.5 fraction (presumably half for training and half for
        # testing).
        self.train_data, self.test_data = dataset.split(0.5)

        trained = text_classifier.create(
            self.train_data,
            model_spec=spec,
            epochs=1,
            batch_size=1,
            shuffle=True)

        # Accuracy thresholds are 0.0 for this one-epoch run.
        self._test_accuracy(trained, threshold=0.0)
        self._test_predict_top_k(trained)
        self._test_export_to_tflite(trained, threshold=0.0)
        self._test_export_to_saved_model(trained)
        self._test_export_labels(trained)
        self._test_export_vocab(trained)
        self._test_model_without_training(spec)
 def test_from_folder(self):
     """Loads the folder fixture via `from_folder` and validates it."""
     fixture_dir = self._get_folder_path()
     spec = ms.AverageWordVecModelSpec()
     loader = text_dataloader.TextClassifierDataLoader
     loaded = loader.from_folder(fixture_dir, model_spec=spec)
     self._test_data(loaded, spec)
예제 #10
0
  def from_csv(cls,
               filename,
               text_column,
               label_column,
               fieldnames=None,
               model_spec=ms.AverageWordVecModelSpec(),
               is_training=True,
               delimiter=',',
               quotechar='"',
               shuffle=False,
               cache_dir=None):
    """Loads labeled text from a csv file, preprocessed according to `model_spec`.

    When the cache lookup reports a hit, the cached files are loaded directly
    and the csv file is not read.

    NOTE(review): the default `model_spec` is constructed once, when this
    function is defined, so calls relying on the default share one spec
    instance.

    Args:
      filename: Name of the file.
      text_column: String, Column name for input text.
      label_column: String, Column name for labels.
      fieldnames: A sequence, used in csv.DictReader. If fieldnames is omitted,
        the values in the first row of file f will be used as the fieldnames.
      model_spec: Specification for the model.
      is_training: Whether the loaded data is for training or not.
      delimiter: Character used to separate fields.
      quotechar: Character used to quote fields containing special characters.
      shuffle: boolean, if shuffle, random shuffle data.
      cache_dir: The cache directory to save preprocessed data. If None,
        generates a temporary directory to cache preprocessed data.

    Returns:
      The result of `cls._load_data` on the cache files; described by the
      original documentation as a TextDataset containing text, labels and other
      related info.
    """
    csv_name = os.path.basename(filename)
    cache_info = cls._get_cache_info(cache_dir, csv_name, model_spec,
                                     is_training)
    is_cached, tfrecord_file, meta_data_file, vocab_file = cache_info

    if not is_cached:
      rows = cls._read_csv(filename, fieldnames, delimiter, quotechar)
      if shuffle:
        random.shuffle(rows)

      # Label names are the distinct label values, in sorted order.
      label_names = sorted({row[label_column] for row in rows})

      # One example per row; the guid is the csv base name plus the row index
      # (taken after any shuffling).
      examples = [
          classifier_data_lib.InputExample('%s-%d' % (csv_name, idx),
                                           row[text_column], None,
                                           row[label_column])
          for idx, row in enumerate(rows)
      ]

      # Writes preprocessed data and other assets into the cache files.
      cls._save_data(examples, model_spec, label_names, tfrecord_file,
                     meta_data_file, vocab_file, is_training)

    # Cached or freshly written, the data is always read back from the files.
    return cls._load_data(tfrecord_file, meta_data_file, model_spec)
예제 #11
0
  def from_folder(cls,
                  filename,
                  model_spec=ms.AverageWordVecModelSpec(),
                  is_training=True,
                  class_labels=None,
                  shuffle=True,
                  cache_dir=None):
    """Loads labeled text from a folder, preprocessed according to `model_spec`.

    Assumes the text data of the same label are in the same subdirectory and
    that each file is one text. The label of a text is the name of the
    directory that directly contains its file.

    NOTE(review): the default `model_spec` is constructed once, when this
    function is defined, so calls relying on the default share one spec
    instance.

    Args:
      filename: Path of the root folder holding one subdirectory per label.
      model_spec: Specification for the model.
      is_training: Whether the loaded data is for training or not.
      class_labels: Class labels that should be considered. Name of the
        subdirectory not in `class_labels` will be ignored. If None, all the
        subdirectories will be considered.
      shuffle: boolean, if shuffle, random shuffle data.
      cache_dir: The cache directory to save preprocessed data. If None,
        generates a temporary directory to cache preprocessed data.

    Returns:
      The result of `cls._load_data` on the cache files; described by the
      original documentation as a TextDataset containing text, labels and other
      related info.

    Raises:
      ValueError: If no text file is found.
    """
    root_dir = os.path.abspath(filename)
    folder_name = os.path.basename(root_dir)

    cache_info = cls._get_cache_info(cache_dir, folder_name, model_spec,
                                     is_training)
    is_cached, tfrecord_file, meta_data_file, vocab_file = cache_info
    # A cache hit short-circuits all reading of the folder.
    if is_cached:
      return cls._load_data(tfrecord_file, meta_data_file, model_spec)

    # Collects the paths of all text files.
    if class_labels:
      text_paths = [
          text_path for class_label in class_labels
          for text_path in tf.io.gfile.glob(
              os.path.join(root_dir, class_label) + r'/*')
      ]
    else:
      text_paths = list(tf.io.gfile.glob(root_dir + r'/*/*'))

    if not text_paths:
      raise ValueError('Text size is zero')

    if shuffle:
      random.shuffle(text_paths)

    # Label names: the requested labels, or every subdirectory of the root.
    if class_labels:
      label_names = sorted(class_labels)
    else:
      subdirectories = [
          entry for entry in os.listdir(root_dir)
          if os.path.isdir(os.path.join(root_dir, entry))
      ]
      label_names = sorted(subdirectories)

    # One example per file; the guid is the folder name plus the file's
    # position in the (possibly shuffled) path list.
    examples = []
    for idx, text_path in enumerate(text_paths):
      with tf.io.gfile.GFile(text_path, 'r') as text_file:
        content = text_file.read()
      parent_dir_name = os.path.basename(os.path.dirname(text_path))
      examples.append(
          classifier_data_lib.InputExample('%s-%d' % (folder_name, idx),
                                           content, None, parent_dir_name))

    # Writes preprocessed data and other assets into the cache files.
    cls._save_data(examples, model_spec, label_names, tfrecord_file,
                   meta_data_file, vocab_file, is_training)

    # Reads the freshly written data back from the cache files.
    return cls._load_data(tfrecord_file, meta_data_file, model_spec)
예제 #12
0
 def setUp(self):
     """Creates the `seq_len=4` model spec stored on `self.model_spec`."""
     super(DataLoaderTest, self).setUp()
     spec = ms.AverageWordVecModelSpec(seq_len=4)
     self.model_spec = spec