def get_sentence_examples(self, questions):
    """Yields an InputExample for each (text_a, text_b) question pair."""
    for index, data in enumerate(questions):
        guid = 'test-%d' % index
        text_a = tokenization.convert_to_unicode(str(data[0]))
        text_b = tokenization.convert_to_unicode(str(data[1]))
        label = str(0)  # dummy label; real labels are unknown at test time
        yield InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)

def _load_glue(self, lines, split, text_a_loc, text_b_loc, label_loc,
               skip_first_line=False, eid_offset=0, swap=False):
    examples = []
    for (i, line) in enumerate(lines):
        try:
            if i == 0 and skip_first_line:
                continue
            eid = i - (1 if skip_first_line else 0) + eid_offset
            text_a = tokenization.convert_to_unicode(line[text_a_loc])
            if text_b_loc is None:
                text_b = None
            else:
                text_b = tokenization.convert_to_unicode(line[text_b_loc])
            # Test and diagnostic splits carry no gold labels, so use a dummy.
            if "test" in split or "diagnostic" in split:
                label = self._get_dummy_label()
            else:
                label = tokenization.convert_to_unicode(line[label_loc])
            if swap:
                text_a, text_b = text_b, text_a
            examples.append(
                InputExample(eid=eid, task_name=self.name, text_a=text_a,
                             text_b=text_b, label=label))
        except Exception as ex:
            utils.log("Error constructing example from line", i,
                      "for task", self.name + ":", ex)
            utils.log("Input causing the error:", line)
    return examples

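# `_get_dummy_label` is called above for unlabeled test/diagnostic splits but
# is not defined in this snippet. A sketch of a typical implementation,
# assuming the task exposes its label set via get_labels() (a regression task
# would return 0.0 instead):
def _get_dummy_label(self):
    return self.get_labels()[0]
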
def get_test_examples(self, data_dir):
    file_path = os.path.join(data_dir, 'test.csv')
    test_df = pd.read_csv(file_path, encoding='utf-8')
    test_data = []
    for index, test in enumerate(test_df.values):
        guid = 'test-%d' % index
        text_a = tokenization.convert_to_unicode(str(test[0]))
        text_b = tokenization.convert_to_unicode(str(test[1]))
        label = str(test[2])
        test_data.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return test_data

def _convert_example_to_record(self, example, max_seq_length, tokenizer):
    """Converts a single `Example` into a single `Record`."""
    text_a = tokenization.convert_to_unicode(example.text_a)
    tokens_a = tokenizer.tokenize(text_a)
    tokens_b = None
    if "text_b" in example._fields:
        text_b = tokenization.convert_to_unicode(example.text_b)
        tokens_b = tokenizer.tokenize(text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    tokens = []
    text_type_ids = []
    tokens.append("[CLS]")
    text_type_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        text_type_ids.append(0)
    tokens.append("[SEP]")
    text_type_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            text_type_ids.append(1)
        tokens.append("[SEP]")
        text_type_ids.append(1)

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    position_ids = list(range(2, len(token_ids) + 2))
    label_id = example.label

    Record = namedtuple(
        'Record',
        ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
    qid = None
    if "qid" in example._fields:
        qid = example.qid
    record = Record(
        token_ids=token_ids,
        text_type_ids=text_type_ids,
        position_ids=position_ids,
        label_id=label_id,
        qid=qid)
    return record

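# `_truncate_seq_pair` is called above but not defined in this snippet. A
# sketch of the canonical BERT helper, assuming the same longest-first policy
# is intended here:
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        # Truncate the longer sequence one token at a time so that both
        # sides keep a roughly equal share of the length budget.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
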
def _create_examples(self, lines, pred_type=False):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        # The original only assigned a label when pred_type was True, leaving
        # `label` undefined otherwise. Use a dummy 0.0 for prediction and
        # read the gold score from the third column (assumed layout).
        if pred_type:
            label = 0.0
        else:
            label = float(line[2])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

import tensorflow as tf

def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, 'r') as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()
                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length,
                    short_seq_prob, masked_lm_prob, max_predictions_per_seq,
                    vocab_words, rng))

    rng.shuffle(instances)
    return instances

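# A minimal illustration of the input layout described in the comments above
# (hypothetical file contents; one sentence per line, blank line between docs):
#
#   The first document starts here.
#   It has two sentences.
#
#   The second document is a single sentence.
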
import csv
from tqdm import tqdm

def create_pairexamples_from_tsv_file(file_name):
    """Builds unlabeled sentence-pair examples from a two-column TSV file."""
    with open(file_name, "r", encoding="utf-8") as fp:
        reader = csv.reader(fp, delimiter="\t")
        lines = [line for line in reader]
    examples = []
    for (i, line) in enumerate(tqdm(lines)):
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = 0.0  # dummy label; these examples are for inference only
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def _create_encode_examples(self, lines):
    """Creates single-sentence examples (text_a only) for encoding."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        label = 0.0  # dummy label; encoding does not use labels
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples

import csv

def create_examples_from_tsv_file(file_name):
    """Builds single-sentence examples from the first column of a TSV file."""
    with open(file_name, "r", encoding="utf-8") as fp:
        reader = csv.reader(fp, delimiter="\t")
        lines = [line for line in reader]
    examples = []
    for (i, line) in enumerate(lines):
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = None
        label = "1"  # dummy label
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

from itertools import chain

def tokenize_and_align(tokenizer, words):
    """Splits up words into subword-level tokens."""
    basic_tokenizer = tokenizer.basic_tokenizer
    tokenized_words = []
    for word in words:
        word = tokenization.convert_to_unicode(word)
        word = basic_tokenizer._clean_text(word)
        # Special tokens stay whole; everything else is split on punctuation
        # and then run through the wordpiece tokenizer.
        if word == "[CLS]" or word == "[SEP]":
            word_toks = [word]
        else:
            word_toks = basic_tokenizer._run_split_on_punc(word)
        tokenized_word = []
        for word_tok in word_toks:
            tokenized_word += tokenizer.wordpiece_tokenizer.tokenize(word_tok)
        tokenized_words.append(tokenized_word)
    assert len(tokenized_words) == len(words)
    return list(chain.from_iterable(tokenized_words))

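# Example usage (hypothetical vocab path; assumes BERT's FullTokenizer API and
# the wordpiece example from the BERT README):
#
#   tokenizer = tokenization.FullTokenizer("vocab.txt", do_lower_case=True)
#   tokenize_and_align(tokenizer, ["[CLS]", "unaffable", "[SEP]"])
#   # -> ["[CLS]", "un", "##aff", "##able", "[SEP]"]
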
def _create_examples(self, items, split):
    examples = []
    for eid, item in enumerate(items):
        # text_a is a list of feature strings, one per configured feature.
        text_a = [
            tokenization.convert_to_unicode(item[feature])
            for feature in self.config.features
        ]
        # Abstracts get extra preprocessing; other features pass through.
        text_a = [
            preprocess_abstract(text) if feature == "abstract" else text
            for text, feature in zip(text_a, self.config.features)
        ]
        label = item['journal'].lower()
        examples.append(
            InputExample(eid=eid, task_name=self.name, text_a=text_a,
                         text_b=None, label=label))
    return examples

import json
import re
from tqdm import tqdm

def create_examples_from_json_file(file_name):
    """Builds examples from a JSON-lines file with `title` and `content` fields."""
    with open(file_name, "r", encoding="utf-8") as fp:
        lines = fp.readlines()
    examples = []
    for (i, line) in enumerate(tqdm(lines)):
        guid = "0"
        json_data = json.loads(line.strip())
        # Flatten embedded newlines so each example is a single line of text.
        title = re.sub(r"[\r\n]", " ", json_data["title"])
        content = re.sub(r"[\r\n]", " ", json_data["content"])
        text_a = tokenization.convert_to_unicode(title + " " + content)
        text_b = None
        label = "1"  # dummy label
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def tokenize_and_align(tokenizer, words, cased=False):
    """Splits words into subword tokens, returning one token list per word."""
    words = ["[CLS]"] + list(words) + ["[SEP]"]
    basic_tokenizer = tokenizer.basic_tokenizer
    tokenized_words = []
    for word in words:
        word = tokenization.convert_to_unicode(word)
        word = basic_tokenizer._clean_text(word)
        if word == "[CLS]" or word == "[SEP]":
            word_toks = [word]
        else:
            if not cased:
                word = word.lower()
                word = basic_tokenizer._run_strip_accents(word)
            word_toks = basic_tokenizer._run_split_on_punc(word)
        tokenized_word = []
        for word_tok in word_toks:
            tokenized_word += tokenizer.wordpiece_tokenizer.tokenize(word_tok)
        tokenized_words.append(tokenized_word)
    # Unlike the flat variant above, the nested list preserves the alignment
    # between the original words and their subword tokens.
    assert len(tokenized_words) == len(words)
    return tokenized_words

import re

def read_examples(input_texts):
    """Reads a list of `InputExample`s from a list of input texts."""
    examples = []
    unique_id = 0
    for text in input_texts:
        line = tokenization.convert_to_unicode(text)
        if not line:
            continue  # skip empty texts
        line = line.strip()
        text_a = None
        text_b = None
        # " ||| " separates the two sentences of a pair on a single line.
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples

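# Example: a sentence pair is written on one line separated by " ||| ", while
# a line without the separator becomes a single-sentence example:
#
#   read_examples(["Who wrote Hamlet? ||| Shakespeare wrote Hamlet.",
#                  "A single sentence with no pair."])
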
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # skip the header row
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = None
        # The original only assigned a label for the test split, which left
        # `label` undefined for train/dev; assume the gold label sits in the
        # second column (assumed layout).
        if set_type == "test":
            label = "1"
        else:
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def _text_to_ids(self, text, tokenizer=None, max_len=None,
                 trunc_type="right", keep_sep=True):
    """Convert text to vocab ids."""
    max_len = max_len or self.max_src_len - 1
    tokenizer = tokenizer or self.tokenizer
    text = tokenization.convert_to_unicode(text)
    if self.tokenized_input:
        tokens = text.split(" ")
    else:
        tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens) + [self.sep_id]
    token_ids = self._trunc_token_ids(token_ids, max_len, trunc_type, keep_sep)
    # Position ids are offset: they start at 3 here.
    pos_ids = range(3, len(token_ids) + 3)
    return token_ids, pos_ids

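# `_trunc_token_ids` is referenced above but not defined in this snippet. A
# minimal sketch under assumed semantics: drop ids from the chosen side and,
# when keep_sep is True, ensure the trailing separator id survives truncation.
def _trunc_token_ids(self, token_ids, max_len, trunc_type="right", keep_sep=True):
    if len(token_ids) <= max_len:
        return token_ids
    if trunc_type == "right":
        token_ids = token_ids[:max_len]
        if keep_sep:
            token_ids[-1] = self.sep_id  # restore the separator we cut off
    else:  # "left": keep the tail, which already ends with the sep id
        token_ids = token_ids[-max_len:]
    return token_ids
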
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    if set_type not in ("train", "dev"):
        return examples
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        query = tokenization.convert_to_unicode(line[0])
        cand1 = tokenization.convert_to_unicode(line[1])
        cand2 = tokenization.convert_to_unicode(line[2])
        cand3 = tokenization.convert_to_unicode(line[3])
        # Only the training split carries a gold label; dev uses a dummy 0.
        label = int(line[-1]) if set_type == "train" else 0
        examples.append(
            InputExample(guid=guid, query=query, cand1=cand1, cand2=cand2,
                         cand3=cand3, label=label))
    return examples