def get_dev_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the dev set."""
    lines = self._read_tsv(data_dir)
    examples = []
    for (i, line) in enumerate(lines):
        guid = line[0]
        text_a = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[2])
        examples.append(InputExample(guid=guid, text_a=text_a, label=label))
    return examples

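The processor methods in this section all assume the `InputExample` container and a `_read_tsv` helper from BERT's run_classifier.py. A minimal sketch of both follows, for context; the reference versions read files through `tf.gfile`, while this sketch uses the plain `csv` module so it runs standalone.

import csv


class InputExample(object):
    """A single training/dev/test example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid      # unique id of the example
        self.text_a = text_a  # first (or only) text sequence
        self.text_b = text_b  # optional second sequence for pair tasks
        self.label = label    # string label; a dummy value for unlabeled data


def _read_tsv(input_file, quotechar=None):
    """Reads a tab-separated file into a list of rows (lists of strings)."""
    with open(input_file, "r", encoding="utf-8") as f:
        return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
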
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # skip the header row
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[3])
        text_b = tokenization.convert_to_unicode(line[4])
        if set_type == "test":
            # The test set has no gold labels; use a dummy label.
            label = "0"
        else:
            label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        # Only the test set has a header
        if set_type == "test" and i == 0:
            continue
        guid = "%s-%s" % (set_type, i)
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(line[1])
            label = "0"  # dummy label; the test set is unlabeled
        else:
            text_a = tokenization.convert_to_unicode(line[3])
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples

def get_dev_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # skip the header row
        guid = "dev-%d" % (i)
        language = tokenization.convert_to_unicode(line[0])
        # The XNLI dev file mixes all languages; keep only the target one.
        if language != tokenization.convert_to_unicode(self.language):
            continue
        text_a = tokenization.convert_to_unicode(line[6])
        text_b = tokenization.convert_to_unicode(line[7])
        label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def get_train_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(
        os.path.join(data_dir, "multinli",
                     "multinli.train.%s.tsv" % self.language))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # skip the header row
        guid = "train-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[2])
        # The translated MultiNLI training data uses "contradictory" where
        # XNLI uses "contradiction"; normalize to the XNLI label.
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def get_test_examples(self, data_dir):
    """Gets a collection of `InputExample`s for prediction."""
    lines = self._read_tsv(data_dir)
    examples = []
    for (i, line) in enumerate(lines):
        # The text sits in the last column; the preceding columns hold labels.
        text_a = tokenization.convert_to_unicode(line[-1])
        label = self.change_label_to_id(line[:-1])
        examples.append(InputExample(guid=i, text_a=text_a, label=label))
    return examples

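`change_label_to_id` is not shown in this snippet. A plausible minimal sketch follows, assuming each leading column is a "0"/"1" flag for one class (a multi-label setup); the real mapping depends on this project's label scheme.

def change_label_to_id(self, label_columns):
    # Hypothetical sketch: the actual helper is not shown above. Assumes
    # each leading column is a "0"/"1" flag for one class, so the label
    # becomes a list of ints.
    return [int(flag) for flag in label_columns]
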
def create_tfrecorf_file():
    # `processor`, `label_list`, and `model_params` are assumed to be set up
    # before this point (their setup is not shown here).
    tokenizer = tokenization.FullTokenizer(
        vocab_file=settings.bert_model_vocab_path, do_lower_case=True)
    # Build each TFRecord file only if it does not already exist.
    if not os.path.exists(settings.train_tfrecord_path):
        train_examples = processor.get_train_examples(settings.train_data_path)
        file_based_convert_examples_to_features(
            train_examples, label_list, model_params.max_seq_length,
            tokenizer, settings.train_tfrecord_path)
    if not os.path.exists(settings.dev_tfrecord_path):
        dev_examples = processor.get_dev_examples(settings.dev_data_path)
        file_based_convert_examples_to_features(
            dev_examples, label_list, model_params.max_seq_length,
            tokenizer, settings.dev_tfrecord_path)
    if not os.path.exists(settings.test_tfrecord_path):
        test_examples = processor.get_test_examples(settings.test_data_path)
        file_based_convert_examples_to_features(
            test_examples, label_list, model_params.max_seq_length,
            tokenizer, settings.test_tfrecord_path)


if __name__ == '__main__':
    create_tfrecorf_file()
    # "武汉加油" ("Stay strong, Wuhan"): checks how Chinese text wrapped in
    # "##" is handled by the WordPiece tokenizer.
    test_string = "##武汉加油##"
    print(tokenization.convert_to_unicode(test_string))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=settings.bert_model_vocab_path, do_lower_case=True)
    print(tokenizer.tokenize(test_string))
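
A quick way to sanity-check the generated files is to decode the first record back. This sketch assumes the feature names that BERT's file_based_convert_examples_to_features writes ("input_ids", "input_mask", "segment_ids", "label_ids") and the TF 1.x record iterator; `inspect_tfrecord` is a hypothetical helper name, not part of the code above.

import tensorflow as tf

def inspect_tfrecord(path, n=1):
    """Prints fields of the first `n` examples from a BERT TFRecord file."""
    for i, raw in enumerate(tf.python_io.tf_record_iterator(path)):
        if i >= n:
            break
        example = tf.train.Example()
        example.ParseFromString(raw)
        feature = example.features.feature
        print("input_ids[:10]:", feature["input_ids"].int64_list.value[:10])
        print("label_ids:", feature["label_ids"].int64_list.value)

# e.g. inspect_tfrecord(settings.train_tfrecord_path)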