def trans_to_index(self, text_as, text_bs):
    """
    Convert the inputs into index representations.
    :param text_as: first input sentences
    :param text_bs: second input sentences
    :return: input ids, input masks, segment ids
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.__vocab_path, do_lower_case=True)
    input_ids = []
    input_masks = []
    segment_ids = []
    for text_a, text_b in zip(text_as, text_bs):
        text_a = tokenization.convert_to_unicode(text_a)
        text_b = tokenization.convert_to_unicode(text_b)
        tokens_a = tokenizer.tokenize(text_a)
        tokens_b = tokenizer.tokenize(text_b)
        # Check whether the two sequences combined exceed the maximum length
        self._truncate_seq_pair(tokens_a, tokens_b, self._sequence_length - 3)
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_ids.append(input_id)
        input_masks.append([1] * len(input_id))
        segment_ids.append([0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1))
    return input_ids, input_masks, segment_ids
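The `_truncate_seq_pair` helper is called above but not shown in this section. A minimal sketch of the standard BERT-style implementation, which trims the longer of the two token lists in place until the pair fits, looks like this (treat it as an assumed implementation, not necessarily the one used here):

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncate a token pair in place to a maximum combined length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        # Always trim the longer sequence so both sides keep roughly equal context.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()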
def create_single_example(index, line, set_type):
    """Creates a single example for the training and dev sets."""
    guid = "%s-%s" % (set_type, index)
    text_a = tokenization.convert_to_unicode(line[1])
    label = tokenization.convert_to_unicode(line[0])
    example = InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
    return example
def input_from_line(line, max_seq_length, tag_to_id):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    string = [w[0].strip() for w in line]
    # chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
    #          for w in string]
    char_line = ' '.join(string)  # join the Chinese characters with spaces
    text = tokenization.convert_to_unicode(char_line)

    tags = ['O' for _ in string]
    labels = ' '.join(tags)  # join the tags with spaces
    labels = tokenization.convert_to_unicode(labels)

    ids, mask, segment_ids, label_ids = convert_single_example(
        char_line=text,
        tag_to_id=tag_to_id,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        label_line=labels)

    import numpy as np
    segment_ids = np.reshape(segment_ids, (1, max_seq_length))
    ids = np.reshape(ids, (1, max_seq_length))
    mask = np.reshape(mask, (1, max_seq_length))
    label_ids = np.reshape(label_ids, (1, max_seq_length))

    return [string, segment_ids, ids, mask, label_ids]
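A minimal usage sketch, assuming the module-level `tokenizer` and the `tag_to_id` label mapping expected by `convert_single_example` are already loaded; the sentence and shapes below are illustrative only:

# Hypothetical example: wrap a raw sentence into the [[char], ...] shape
# that input_from_line expects (w[0] must be the character).
sentence = "今天天气不错"
line = [[ch] for ch in sentence]
string, segment_ids, ids, mask, label_ids = input_from_line(
    line, max_seq_length=128, tag_to_id=tag_to_id)
# segment_ids, ids, mask and label_ids come back as (1, 128) arrays ready to feed.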
def sentence_to_idx(self, text_a, text_b):
    """
    Convert a tokenized sentence pair into an index (idx) representation.
    :return: input ids, input masks, segment ids
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)

    text_a = tokenization.convert_to_unicode(text_a)
    text_b = tokenization.convert_to_unicode(text_b)
    tokens_a = tokenizer.tokenize(text_a)
    tokens_b = tokenizer.tokenize(text_b)
    # Check whether the two sequences combined exceed the maximum length
    self._truncate_seq_pair(tokens_a, tokens_b, self.sequence_length - 3)
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]

    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    segment_id = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

    input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)

    return [input_id], [input_mask], [segment_id]
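`self.padding` is referenced here (and in the single-sentence `sentence_to_idx` below) but not shown. A minimal sketch of what such a helper typically does, truncate if needed and zero-pad each list up to `self.sequence_length`, is given below as an assumption, not the project's actual code:

def padding(self, input_id, input_mask, segment_id):
    """Truncate and zero-pad a single example to the fixed sequence length."""
    input_id = input_id[:self.sequence_length]
    input_mask = input_mask[:self.sequence_length]
    segment_id = segment_id[:self.sequence_length]
    pad_len = self.sequence_length - len(input_id)
    input_id = input_id + [0] * pad_len
    input_mask = input_mask + [0] * pad_len
    segment_id = segment_id + [0] * pad_len
    return input_id, input_mask, segment_id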
def _create_example(self, lines, set_type):
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[0])
        examples.append(InputExample(guid=guid, text=text, label=label))
    return examples
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[1])
        text_b = None
        label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def get_train_examples(self, data_dir):
    """See base class."""
    train_dir = os.path.join(data_dir, "train_data.csv")
    train_data = pd.read_csv(train_dir, header=None,
                             names=['location', 'result', 'fw', 'label'])
    print(train_data.shape)
    set_type = "train"
    examples = []
    for (i, line) in train_data.iterrows():
        if i == 0:
            continue
        guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(str(i)))
        text_a = self.process_text(str(line["result"]))
        text_b = self.process_text(line["fw"])
        if set_type == "test":
            label = "contradiction"
        else:
            label = self.process_text(str(int(float(line["label"]))))
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def get_dev_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "dev-%d" % (i)
        language = tokenization.convert_to_unicode(line[0])
        if language != tokenization.convert_to_unicode(self.language):
            continue
        text_a = tokenization.convert_to_unicode(line[6])
        text_b = tokenization.convert_to_unicode(line[7])
        label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def get_train_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(
        os.path.join(data_dir, "multinli",
                     "multinli.train.%s.tsv" % self.language))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "train-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[2])
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0]))
        text_a = tokenization.convert_to_unicode(line[1])
        text_b = tokenization.convert_to_unicode(line[2])
        if set_type == "test":
            label = "not_entailment"
        else:
            label = tokenization.convert_to_unicode(line[-1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = reader.readline()
                if not FLAGS.spm_model_file:
                    line = tokenization.convert_to_unicode(line)
                if not line:
                    break
                if FLAGS.spm_model_file:
                    line = tokenization.preprocess_text(line, lower=FLAGS.do_lower_case)
                else:
                    line = line.strip()

                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length, short_seq_prob,
                    masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

    rng.shuffle(instances)
    return instances
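For reference, the BERT/ALBERT create_pretraining_data script drives this function roughly as follows; the flag names follow that script, and this is a sketch rather than the exact caller used here:

import random
import tensorflow as tf

# Build the tokenizer and expand the (possibly comma-separated) input pattern.
tokenizer = tokenization.FullTokenizer(
    vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

input_files = []
for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

rng = random.Random(FLAGS.random_seed)
instances = create_training_instances(
    input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
    FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
    FLAGS.max_predictions_per_seq, rng)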
def _create_example(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        # skip header
        if i == 0 and set_type == 'test':
            continue
        guid = line[0]
        text_a = tokenization.convert_to_unicode(line[1])
        if set_type == "test":
            label = self.get_labels()[-1]
        else:
            try:
                label = tokenization.convert_to_unicode(line[2])
            except IndexError:
                logging.exception(line)
                exit(1)
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def prepare_dataset(sentences, max_seq_length, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of examples, each a list containing:
        - the original characters
        - segment ids
        - token ids
        - input mask
        - label ids
    """
    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0].strip() for w in s]
        char_line = ' '.join(string)  # join the Chinese characters with spaces
        text = tokenization.convert_to_unicode(char_line)

        if train:
            tags = [w[-1] for w in s]
        else:
            tags = ['O' for _ in string]

        labels = ' '.join(tags)  # join the tags with spaces
        labels = tokenization.convert_to_unicode(labels)

        ids, mask, segment_ids, label_ids = convert_single_example(
            char_line=text,
            tag_to_id=tag_to_id,
            max_seq_length=max_seq_length,
            tokenizer=tokenizer,
            label_line=labels)
        data.append([string, segment_ids, ids, mask, label_ids])

    return data
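A small usage sketch showing the expected shape of `sentences`, a list of [character, tag] pairs per sentence. The tag set below is illustrative; depending on `convert_single_example`, the real `tag_to_id` mapping may also need entries for special tokens such as [CLS]/[SEP] or padding:

# Hypothetical toy input and label mapping.
sentences = [
    [["我", "O"], ["爱", "O"], ["北", "B-LOC"], ["京", "I-LOC"]],
]
tag_to_id = {"O": 0, "B-LOC": 1, "I-LOC": 2}
data = prepare_dataset(sentences, max_seq_length=32, tag_to_id=tag_to_id, train=True)
string, segment_ids, ids, mask, label_ids = data[0]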
def _create_examples(self, lines, set_type):
    examples = []
    for line in lines:
        qid = line['id']
        question = tokenization.convert_to_unicode(line['question']['stem'])
        answers = np.array([
            tokenization.convert_to_unicode(choice['text'])
            for choice in sorted(line['question']['choices'], key=lambda c: c['label'])
        ])
        # TODO: process_text
        # The test set has no answer key, so use 'A' as a dummy label.
        label = self.LABELS.index(line.get('answerKey', 'A'))
        examples.append(
            InputExample(qid=qid, question=question, answers=answers, label=label))
    return examples
def sentence_to_idx(self, text):
    """
    Convert a tokenized sentence into an index (idx) representation.
    :return: input ids, input masks, segment ids
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)

    text = tokenization.convert_to_unicode(text)
    tokens = tokenizer.tokenize(text)
    # Note: unlike the pair version, no truncation is applied here, so the
    # caller should keep the input within the configured sequence length.
    tokens = ["[CLS]"] + tokens + ["[SEP]"]

    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    segment_id = [0] * len(input_id)

    input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)

    return [input_id], [input_mask], [segment_id]
def trans_to_index(self, inputs):
    """
    Convert the inputs into index representations.
    :param inputs: input texts
    :return: input ids, input masks, segment ids
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=self.__vocab_path, do_lower_case=True)
    input_ids = []
    input_masks = []
    segment_ids = []
    for text in inputs:
        text = tokenization.convert_to_unicode(text)
        tokens = tokenizer.tokenize(text)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_ids.append(input_id)
        input_masks.append([1] * len(input_id))
        segment_ids.append([0] * len(input_id))
    return input_ids, input_masks, segment_ids
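A quick usage sketch; the instance name `data_obj` and the sentences are hypothetical:

# Encode a small batch of single sentences.
texts = ["今天天气不错", "这部电影很好看"]
input_ids, input_masks, segment_ids = data_obj.trans_to_index(texts)
# The returned lists are not yet padded, so per-sentence lengths differ
# until a padding step (like the padding helper sketched earlier) is applied.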
def _to_example(sentences):
    """
    Convert sentences to InputExamples.
    :param sentences: list of strings
    :return: generator of InputExample
    """
    import re
    guid = 0
    for ss in sentences:
        line = tokenization.convert_to_unicode(ss)
        if not line:
            continue
        line = line.strip()
        text_a = None
        text_b = None
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        yield InputExample(guid=guid, text_a=text_a, text_b=text_b)
        guid += 1
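Usage sketch: each yielded InputExample carries either a single sentence or an " ||| "-separated pair split into text_a/text_b. The strings below are illustrative:

sentences = [
    "the cat sat on the mat",
    "a premise sentence ||| its hypothesis sentence",
]
for example in _to_example(sentences):
    print(example.guid, example.text_a, example.text_b)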
def process_text(self, text):
    if self.use_spm:
        return tokenization.preprocess_text(text, lower=self.do_lower_case)
    else:
        return tokenization.convert_to_unicode(text)