def _gen_examples():
  """Returns two toy `InputExample`s: one positive, one negative."""
  return [
      classifier_data_lib.InputExample(
          guid=0, text_a='Really good.', label='pos'),
      classifier_data_lib.InputExample(
          guid=1, text_a='So bad.', label='neg'),
  ]
def _get_single_story_features(self, story_headline, articles):
  """Converts a story headline and its articles into an ordered feature dict.

  Produces one feature per slot: suffix "a" for the headline, then "b", "c",
  ... for up to `self.max_num_articles` articles, padding missing articles
  with empty examples so every story yields the same number of slots.
  """

  def snippet(example):
    # Joins title and body with " [SEP] " when both halves are present.
    if example.text_b:
      return " [SEP] ".join([example.text_a, example.text_b])
    return example.text_a

  features = collections.OrderedDict()

  headline_feature = classifier_data_lib.convert_single_example(
      ex_index=self.ex_index,
      example=classifier_data_lib.InputExample(
          guid=self.guid, text_a=story_headline, label=self.label),
      label_list=[self.label],
      max_seq_length=self.len_title,
      tokenizer=self.tokenizer)
  if self.include_text_snippet_in_example:
    # Repurposes label_id to carry the raw text snippet downstream.
    headline_feature.label_id = story_headline
  self._add_feature_with_suffix(
      feature=headline_feature, suffix="a", story_features=features)

  for idx, article in enumerate(articles):
    if idx == self.max_num_articles:
      break
    article_feature = classifier_data_lib.convert_single_example(
        ex_index=self.ex_index,
        example=article,
        label_list=[self.label],
        max_seq_length=self.len_passage,
        tokenizer=self.tokenizer)
    if self.include_text_snippet_in_example:
      article_feature.label_id = snippet(article)
    self._add_feature_with_suffix(
        feature=article_feature,
        suffix=chr(ord("b") + idx),
        story_features=features)

  # Adds empty features as placeholder for the remaining article slots.
  for idx in range(len(articles), self.max_num_articles):
    placeholder = classifier_data_lib.InputExample(
        guid=self.guid, text_a="", label=self.label)
    placeholder_feature = classifier_data_lib.convert_single_example(
        ex_index=self.ex_index,
        example=placeholder,
        label_list=[self.label],
        max_seq_length=self.len_passage,
        tokenizer=self.tokenizer)
    if self.include_text_snippet_in_example:
      placeholder_feature.label_id = ""
    self._add_feature_with_suffix(
        feature=placeholder_feature,
        suffix=chr(ord("b") + idx),
        story_features=features)

  return features
def _get_article_content_from_json(self, file_path):
  """Returns (url, InputExample) keeping content extracted from file_path."""
  with tf.io.gfile.GFile(file_path, "r") as f:
    article = json.load(f)
  if self.include_article_title_in_passage:
    example = classifier_data_lib.InputExample(
        guid=self.guid,
        text_a=article["title"],
        text_b=article["maintext"],
        label=self.label)
  else:
    example = classifier_data_lib.InputExample(
        guid=self.guid, text_a=article["maintext"], label=self.label)
  return article["url"], example
def __to_feature(self, text, label):
  """Converts one (text, label) tensor pair into model input features."""
  input_example = classifier_data_lib.InputExample(
      guid=None, text_a=text.numpy(), text_b=None, label=label.numpy())
  converted = classifier_data_lib.convert_single_example(
      0, input_example, self.label_list, self.max_seq_length, self.tokenizer)
  return (converted.input_ids, converted.input_mask, converted.segment_ids,
          converted.label_id)
def _create_examples(self, lines, set_type):
  """Creates examples for the training and dev sets."""
  # No label is attached here; only the processed text and a per-set guid.
  return [
      classifier_data_lib.InputExample(
          guid="%s-%s" % (set_type, i),
          text_a=self.process_text_fn(line[0]),
          example_id=i)
      for i, line in enumerate(lines)
  ]
def to_feature(text, label, label_list=label_list,
               max_seq_length=max_seq_length, tokenizer=tokenizer):
  """Maps a (text, label) tensor pair to model input feature tensors.

  The defaults bind the enclosing scope's label_list / max_seq_length /
  tokenizer at definition time.
  """
  input_example = classifier_data_lib.InputExample(
      guid=None, text_a=text.numpy(), text_b=None, label=label.numpy())
  feat = classifier_data_lib.convert_single_example(
      0, input_example, label_list, max_seq_length, tokenizer)
  return (feat.input_ids, feat.input_mask, feat.segment_ids, feat.label_id)
def from_folder(cls,
                filename,
                model_spec='average_word_vec',
                is_training=True,
                class_labels=None,
                shuffle=True,
                cache_dir=None):
  """Loads text with labels and preprocesses text according to `model_spec`.

  Assumes that text files with the same label live in the same subdirectory
  and that each file contains one text.

  Args:
    filename: Name of the folder containing one subdirectory per label.
    model_spec: Specification for the model.
    is_training: Whether the loaded data is for training or not.
    class_labels: Class labels that should be considered. Subdirectories whose
      names are not in `class_labels` will be ignored. If None, all the
      subdirectories will be considered.
    shuffle: boolean, if shuffle, random shuffle data.
    cache_dir: The cache directory to save preprocessed data. If None,
      generates a temporary directory to cache preprocessed data.

  Returns:
    TextDataset containing text, labels and other related info.

  Raises:
    ValueError: If no text files are found under `filename`.
  """
  model_spec = ms.get(model_spec)
  data_root = os.path.abspath(filename)
  folder_name = os.path.basename(data_root)

  is_cached, tfrecord_file, meta_data_file, vocab_file = cls._get_cache_info(
      cache_dir, folder_name, model_spec, is_training)
  # If cached, directly loads data from cache directory.
  if is_cached:
    return cls._load_data(tfrecord_file, meta_data_file, model_spec)

  # Gets paths of all text files.
  if class_labels:
    all_text_paths = []
    for class_label in class_labels:
      all_text_paths.extend(
          tf.io.gfile.glob(os.path.join(data_root, class_label) + r'/*'))
  else:
    all_text_paths = list(tf.io.gfile.glob(data_root + r'/*/*'))

  if not all_text_paths:
    raise ValueError('Text size is zero')

  if shuffle:
    random.shuffle(all_text_paths)

  # Gets label names, sorted so the label -> index mapping is deterministic.
  if class_labels:
    label_names = sorted(class_labels)
  else:
    # NOTE(review): os.listdir/os.path.isdir only work for local paths, while
    # the file reads above go through tf.io.gfile — confirm remote (e.g. GCS)
    # folders are not expected here.
    label_names = sorted(
        name for name in os.listdir(data_root)
        if os.path.isdir(os.path.join(data_root, name)))

  # Generates text examples from folder.
  examples = []
  for i, path in enumerate(all_text_paths):
    with tf.io.gfile.GFile(path, 'r') as f:
      text = f.read()
    guid = '%s-%d' % (folder_name, i)
    # The label is the name of the file's immediate parent directory.
    label = os.path.basename(os.path.dirname(path))
    examples.append(classifier_data_lib.InputExample(guid, text, None, label))

  # Saves preprocessed data and other assets into files.
  cls._save_data(examples, model_spec, label_names, tfrecord_file,
                 meta_data_file, vocab_file, is_training)

  # Loads data from cache directory.
  return cls._load_data(tfrecord_file, meta_data_file, model_spec)
def from_csv(cls,
             filename,
             text_column,
             label_column,
             fieldnames=None,
             model_spec='average_word_vec',
             is_training=True,
             delimiter=',',
             quotechar='"',
             shuffle=False,
             cache_dir=None):
  """Loads text with labels from a csv file and preprocesses per `model_spec`.

  Args:
    filename: Name of the file.
    text_column: String, Column name for input text.
    label_column: String, Column name for labels.
    fieldnames: A sequence, used in csv.DictReader. If fieldnames is omitted,
      the values in the first row of file f will be used as the fieldnames.
    model_spec: Specification for the model.
    is_training: Whether the loaded data is for training or not.
    delimiter: Character used to separate fields.
    quotechar: Character used to quote fields containing special characters.
    shuffle: boolean, if shuffle, random shuffle data.
    cache_dir: The cache directory to save preprocessed data. If None,
      generates a temporary directory to cache preprocessed data.

  Returns:
    TextDataset containing text, labels and other related info.
  """
  model_spec = ms.get(model_spec)
  csv_name = os.path.basename(filename)

  is_cached, tfrecord_file, meta_data_file, vocab_file = cls._get_cache_info(
      cache_dir, csv_name, model_spec, is_training)
  # If cached, directly loads data from cache directory.
  if is_cached:
    return cls._load_data(tfrecord_file, meta_data_file, model_spec)

  lines = cls._read_csv(filename, fieldnames, delimiter, quotechar)
  if shuffle:
    random.shuffle(lines)

  # Gets the de-duplicated labels, sorted so the label -> index mapping is
  # deterministic.
  label_names = sorted({line[label_column] for line in lines})

  # Generates text examples from csv file.
  examples = []
  for i, line in enumerate(lines):
    text, label = line[text_column], line[label_column]
    guid = '%s-%d' % (csv_name, i)
    examples.append(classifier_data_lib.InputExample(guid, text, None, label))

  # Saves preprocessed data and other assets into files.
  cls._save_data(examples, model_spec, label_names, tfrecord_file,
                 meta_data_file, vocab_file, is_training)

  # Loads data from cache directory.
  return cls._load_data(tfrecord_file, meta_data_file, model_spec)