Example #1
def _gen_examples():
  examples = []
  examples.append(
      classifier_data_lib.InputExample(
          guid=0, text_a='Really good.', label='pos'))
  examples.append(
      classifier_data_lib.InputExample(guid=1, text_a='So bad.', label='neg'))
  return examples
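These InputExamples wrap raw strings only; to feed a model they still have to be tokenized into features. A minimal sketch of that step, reusing the convert_single_example signature seen in the examples below (the import paths assume the TF Model Garden layout; the vocab path and label list are placeholders):

from official.nlp.bert import tokenization
from official.nlp.data import classifier_data_lib

# Hypothetical vocab file; any BERT-style vocab.txt works here.
tokenizer = tokenization.FullTokenizer(
    vocab_file='/path/to/vocab.txt', do_lower_case=True)

features = [
    classifier_data_lib.convert_single_example(
        ex_index=i,
        example=example,
        label_list=['neg', 'pos'],  # matches the labels used above
        max_seq_length=128,
        tokenizer=tokenizer)
    for i, example in enumerate(_gen_examples())
]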
Example #2
    def _get_single_story_features(self, story_headline, articles):
        """Converts a list of articles to a tensorflow Example."""
        def get_text_snippet(article):
            if article.text_b:
                return " [SEP] ".join([article.text_a, article.text_b])
            else:
                return article.text_a

        story_features = collections.OrderedDict()
        story_headline_feature = classifier_data_lib.convert_single_example(
            ex_index=self.ex_index,
            example=classifier_data_lib.InputExample(guid=self.guid,
                                                     text_a=story_headline,
                                                     label=self.label),
            label_list=[self.label],
            max_seq_length=self.len_title,
            tokenizer=self.tokenizer)
        if self.include_text_snippet_in_example:
            story_headline_feature.label_id = story_headline
        self._add_feature_with_suffix(feature=story_headline_feature,
                                      suffix="a",
                                      story_features=story_features)
        for (article_index, article) in enumerate(articles):
            if article_index == self.max_num_articles:
                break
            article_feature = classifier_data_lib.convert_single_example(
                ex_index=self.ex_index,
                example=article,
                label_list=[self.label],
                max_seq_length=self.len_passage,
                tokenizer=self.tokenizer)
            if self.include_text_snippet_in_example:
                article_feature.label_id = get_text_snippet(article)
            suffix = chr(ord("b") + article_index)
            self._add_feature_with_suffix(feature=article_feature,
                                          suffix=suffix,
                                          story_features=story_features)

        # Adds empty features as placeholders.
        for article_index in range(len(articles), self.max_num_articles):
            suffix = chr(ord("b") + article_index)
            empty_article = classifier_data_lib.InputExample(guid=self.guid,
                                                             text_a="",
                                                             label=self.label)
            empty_feature = classifier_data_lib.convert_single_example(
                ex_index=self.ex_index,
                example=empty_article,
                label_list=[self.label],
                max_seq_length=self.len_passage,
                tokenizer=self.tokenizer)
            if self.include_text_snippet_in_example:
                empty_feature.label_id = ""
            self._add_feature_with_suffix(feature=empty_feature,
                                          suffix=suffix,
                                          story_features=story_features)
        return story_features
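A note on the suffix scheme above: the headline feature is stored under suffix "a", article features under consecutive letters starting at "b", and missing articles are padded with empty examples up to max_num_articles. A tiny illustration (the value of max_num_articles is a placeholder):

max_num_articles = 3  # placeholder value
suffixes = ['a'] + [chr(ord('b') + i) for i in range(max_num_articles)]
print(suffixes)  # ['a', 'b', 'c', 'd']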
Example #3
def _get_article_content_from_json(self, file_path):
  """Returns (url, InputExample) keeping content extracted from file_path."""
  with tf.io.gfile.GFile(file_path, "r") as article_json_file:
    article = json.load(article_json_file)
    if self.include_article_title_in_passage:
      return article["url"], classifier_data_lib.InputExample(
          guid=self.guid,
          text_a=article["title"],
          text_b=article["maintext"],
          label=self.label)
    else:
      return article["url"], classifier_data_lib.InputExample(
          guid=self.guid, text_a=article["maintext"], label=self.label)
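The JSON file read above must carry at least the three fields the code references. A minimal illustrative document (all values made up):

article_json = {
    'url': 'https://example.com/story',       # used as the returned key
    'title': 'Some headline',                 # text_a when titles are included
    'maintext': 'Body text of the article.',  # text_b, or text_a otherwise
}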
Example #4
    def __to_feature(self, text, label):
        example = classifier_data_lib.InputExample(guid=None,
                                                   text_a=text.numpy(),
                                                   text_b=None,
                                                   label=label.numpy())

        feature = classifier_data_lib.convert_single_example(
            0, example, self.label_list, self.max_seq_length, self.tokenizer)

        return (feature.input_ids, feature.input_mask, feature.segment_ids,
                feature.label_id)
Example #5
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  examples = []
  for (i, line) in enumerate(lines):
    guid = "%s-%s" % (set_type, i)
    text_a = self.process_text_fn(line[0])
    examples.append(
        classifier_data_lib.InputExample(
            guid=guid, text_a=text_a, example_id=i))
  return examples
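Note that these examples carry an example_id but no label, which suggests this variant targets unlabeled prediction data. process_text_fn is whatever normalizer the processor was constructed with; in the Model Garden processors this is commonly tokenization.convert_to_unicode. A sketch under that assumption (SomeProcessor stands in for the surrounding class, which is not shown):

from official.nlp.bert import tokenization

# Hypothetical processor class; only process_text_fn matters here.
processor = SomeProcessor(process_text_fn=tokenization.convert_to_unicode)
examples = processor._create_examples([['first text'], ['second text']], 'test')
print(examples[0].guid)  # 'test-0'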
Example #6
def to_feature(text,
               label,
               label_list=label_list,
               max_seq_length=max_seq_length,
               tokenizer=tokenizer):
    example = classifier_data_lib.InputExample(guid=None,
                                               text_a=text.numpy(),
                                               text_b=None,
                                               label=label.numpy())

    feature = classifier_data_lib.convert_single_example(
        0, example, label_list, max_seq_length, tokenizer)

    return (feature.input_ids, feature.input_mask, feature.segment_ids,
            feature.label_id)
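Because to_feature calls .numpy(), it must run eagerly, so it cannot be mapped over a tf.data.Dataset directly. The usual wrapper, following the pattern in the TensorFlow BERT fine-tuning tutorial, goes through tf.py_function and restores the static shapes it erases (raw_train_ds, max_seq_length, and the dict keys are assumptions):

import tensorflow as tf

def to_feature_map(text, label):
    input_ids, input_mask, segment_ids, label_id = tf.py_function(
        to_feature, inp=[text, label],
        Tout=[tf.int32, tf.int32, tf.int32, tf.int32])
    # py_function loses static shape information; set it back for downstream layers.
    input_ids.set_shape([max_seq_length])
    input_mask.set_shape([max_seq_length])
    segment_ids.set_shape([max_seq_length])
    label_id.set_shape([])
    return ({'input_word_ids': input_ids,
             'input_mask': input_mask,
             'input_type_ids': segment_ids}, label_id)

train_ds = raw_train_ds.map(to_feature_map,
                            num_parallel_calls=tf.data.AUTOTUNE)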
Example #7
    def from_folder(cls,
                    filename,
                    model_spec='average_word_vec',
                    is_training=True,
                    class_labels=None,
                    shuffle=True,
                    cache_dir=None):
        """Loads text with labels and preproecess text according to `model_spec`.

    Assume the text data of the same label are in the same subdirectory. each
    file is one text.

    Args:
      filename: Name of the file.
      model_spec: Specification for the model.
      is_training: Whether the loaded data is for training or not.
      class_labels: Class labels that should be considered. Name of the
        subdirectory not in `class_labels` will be ignored. If None, all the
        subdirectories will be considered.
      shuffle: boolean, if shuffle, random shuffle data.
      cache_dir: The cache directory to save preprocessed data. If None,
        generates a temporary directory to cache preprocessed data.

    Returns:
      TextDataset containing text, labels and other related info.
    """
        model_spec = ms.get(model_spec)
        data_root = os.path.abspath(filename)
        folder_name = os.path.basename(data_root)

        is_cached, tfrecord_file, meta_data_file, vocab_file = cls._get_cache_info(
            cache_dir, folder_name, model_spec, is_training)
        # If cached, directly loads data from cache directory.
        if is_cached:
            return cls._load_data(tfrecord_file, meta_data_file, model_spec)

        # Gets paths of all text.
        if class_labels:
            all_text_paths = []
            for class_label in class_labels:
                all_text_paths.extend(
                    list(
                        tf.io.gfile.glob(
                            os.path.join(data_root, class_label) + r'/*')))
        else:
            all_text_paths = list(tf.io.gfile.glob(data_root + r'/*/*'))

        all_text_size = len(all_text_paths)
        if all_text_size == 0:
            raise ValueError('Text size is zero')

        if shuffle:
            random.shuffle(all_text_paths)

        # Gets label and its index.
        if class_labels:
            label_names = sorted(class_labels)
        else:
            label_names = sorted(
                name for name in os.listdir(data_root)
                if os.path.isdir(os.path.join(data_root, name)))

        # Generates text examples from folder.
        examples = []
        for i, path in enumerate(all_text_paths):
            with tf.io.gfile.GFile(path, 'r') as f:
                text = f.read()
            guid = '%s-%d' % (folder_name, i)
            label = os.path.basename(os.path.dirname(path))
            examples.append(
                classifier_data_lib.InputExample(guid, text, None, label))

        # Saves preprocessed data and other assets into files.
        cls._save_data(examples, model_spec, label_names, tfrecord_file,
                       meta_data_file, vocab_file, is_training)

        # Loads data from cache directory.
        return cls._load_data(tfrecord_file, meta_data_file, model_spec)
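From the glob and label extraction above, the loader expects one subdirectory per label with one file per text. A hypothetical layout and call (the class name TextDataset is taken from the docstring's Returns section):

# text_corpus/
#   neg/  0001.txt, 0002.txt, ...
#   pos/  1001.txt, 1002.txt, ...
data = TextDataset.from_folder('text_corpus',
                               model_spec='average_word_vec',
                               class_labels=['neg', 'pos'])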
Example #8
    def from_csv(cls,
                 filename,
                 text_column,
                 label_column,
                 fieldnames=None,
                 model_spec='average_word_vec',
                 is_training=True,
                 delimiter=',',
                 quotechar='"',
                 shuffle=False,
                 cache_dir=None):
        """Loads text with labels from the csv file and preproecess text according to `model_spec`.

    Args:
      filename: Name of the file.
      text_column: String, Column name for input text.
      label_column: String, Column name for labels.
      fieldnames: A sequence, used in csv.DictReader. If fieldnames is omitted,
        the values in the first row of file f will be used as the fieldnames.
      model_spec: Specification for the model.
      is_training: Whether the loaded data is for training or not.
      delimiter: Character used to separate fields.
      quotechar: Character used to quote fields containing special characters.
      shuffle: boolean, if shuffle, random shuffle data.
      cache_dir: The cache directory to save preprocessed data. If None,
        generates a temporary directory to cache preprocessed data.

    Returns:
      TextDataset containing text, labels and other related info.
    """
        model_spec = ms.get(model_spec)
        csv_name = os.path.basename(filename)

        is_cached, tfrecord_file, meta_data_file, vocab_file = cls._get_cache_info(
            cache_dir, csv_name, model_spec, is_training)
        # If cached, directly loads data from cache directory.
        if is_cached:
            return cls._load_data(tfrecord_file, meta_data_file, model_spec)

        lines = cls._read_csv(filename, fieldnames, delimiter, quotechar)
        if shuffle:
            random.shuffle(lines)

        # Gets labels.
        label_set = set()
        for line in lines:
            label_set.add(line[label_column])
        label_names = sorted(label_set)

        # Generates text examples from csv file.
        examples = []
        for i, line in enumerate(lines):
            text, label = line[text_column], line[label_column]
            guid = '%s-%d' % (csv_name, i)
            examples.append(
                classifier_data_lib.InputExample(guid, text, None, label))

        # Saves preprocessed data and other assets into files.
        cls._save_data(examples, model_spec, label_names, tfrecord_file,
                       meta_data_file, vocab_file, is_training)

        # Loads data from cache directory.
        return cls._load_data(tfrecord_file, meta_data_file, model_spec)
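A matching hypothetical call for the CSV loader; the column names are placeholders and must appear in the CSV header (or in fieldnames):

data = TextDataset.from_csv('reviews.csv',
                            text_column='sentence',
                            label_column='sentiment',
                            model_spec='average_word_vec')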