import numpy as np


def input_from_line(line, max_seq_length, tag_to_id):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    string = [w[0].strip() for w in line]
    # Join the characters with spaces so the BERT tokenizer treats
    # each Chinese character as a separate token.
    char_line = ' '.join(string)
    text = tokenization.convert_to_unicode(char_line)

    # Tags are unknown at prediction time, so pad with 'O'.
    tags = ['O' for _ in string]
    # Join the tags with spaces to mirror the character line.
    labels = ' '.join(tags)
    labels = tokenization.convert_to_unicode(labels)

    ids, mask, segment_ids, label_ids = convert_single_example(
        char_line=text,
        tag_to_id=tag_to_id,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        label_line=labels)

    # Add a batch dimension of 1 for single-sentence prediction.
    segment_ids = np.reshape(segment_ids, (1, max_seq_length))
    ids = np.reshape(ids, (1, max_seq_length))
    mask = np.reshape(mask, (1, max_seq_length))
    label_ids = np.reshape(label_ids, (1, max_seq_length))
    return [string, segment_ids, ids, mask, label_ids]
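def _demo_input_from_line(tag_to_id, max_seq_length=128):
    # A minimal usage sketch, not part of the original pipeline: it
    # assumes `tag_to_id` is the label mapping saved during training and
    # that model/session setup happens elsewhere in the repo.
    sentence = [[ch] for ch in "我爱北京"]  # "I love Beijing"
    string, segment_ids, ids, mask, label_ids = input_from_line(
        sentence, max_seq_length, tag_to_id)
    # Each array now has shape (1, max_seq_length), ready to feed a
    # single-example batch into the NER model.
    return string, segment_ids, ids, mask, label_ids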
def prepare_dataset(sentences, max_seq_length, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists, one per sentence,
    containing:
        - the raw characters
        - segment ids
        - token ids
        - the input mask
        - label ids
    """
    def f(x):
        # Optional lowercasing, controlled by `lower`.
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [f(w[0].strip()) for w in s]
        # Join the characters with spaces so the BERT tokenizer treats
        # each Chinese character as a separate token.
        char_line = ' '.join(string)
        text = tokenization.convert_to_unicode(char_line)

        if train:
            tags = [w[-1] for w in s]
        else:
            # Tags are unknown outside training, so pad with 'O'.
            tags = ['O' for _ in string]
        # Join the tags with spaces to mirror the character line.
        labels = ' '.join(tags)
        labels = tokenization.convert_to_unicode(labels)

        ids, mask, segment_ids, label_ids = convert_single_example(
            char_line=text,
            tag_to_id=tag_to_id,
            max_seq_length=max_seq_length,
            tokenizer=tokenizer,
            label_line=labels)

        data.append([string, segment_ids, ids, mask, label_ids])
    return data
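def _demo_prepare_dataset(tag_to_id, max_seq_length=128):
    # A minimal usage sketch, assuming sentences in the loader's format:
    # one [char, tag] pair per character. `tag_to_id` is assumed to come
    # from the surrounding repo; the sentence below is illustrative.
    sentences = [[["我", "O"], ["爱", "O"], ["北", "B-LOC"], ["京", "I-LOC"]]]
    train_data = prepare_dataset(sentences, max_seq_length, tag_to_id,
                                 train=True)
    # Each entry is [chars, segment_ids, token_ids, mask, label_ids].
    return train_data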
def get_test_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(curr_path, data_dir))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            # Skip the TSV header row.
            continue
        guid = "test-%d" % i
        text_a = tokenization.convert_to_unicode(line[2])
        text_b = tokenization.convert_to_unicode(line[3])
        # Test data carries no gold labels, so use a dummy "0".
        label = tokenization.convert_to_unicode("0")
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
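def _demo_get_test_examples(processor, data_dir):
    # A hedged usage sketch: `processor` stands in for whatever
    # DataProcessor subclass defines get_test_examples above, and the
    # TSV is assumed to hold text_a and text_b in columns 2 and 3.
    examples = processor.get_test_examples(data_dir)
    for ex in examples[:3]:
        print(ex.guid, ex.text_a, ex.text_b, ex.label)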
import re


def _to_example(sentences):
    """
    Convert raw sentences to InputExamples.
    :param sentences: list of strings
    :return: generator of InputExample
    """
    unique_id = 0
    for ss in sentences:
        line = tokenization.convert_to_unicode(ss)
        if not line:
            continue
        line = line.strip()
        text_a = None
        text_b = None
        # "sentence A ||| sentence B" marks a sentence pair; a line
        # without the separator is a single sentence.
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
        unique_id += 1
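def _demo_to_example():
    # A minimal usage sketch: the " ||| " separator splits a line into
    # a sentence pair, matching the regex in _to_example; a plain line
    # becomes a single-sentence example.
    sentences = ["单句输入", "第一句 ||| 第二句"]  # single / paired input
    for example in _to_example(sentences):
        print(example.unique_id, example.text_a, example.text_b)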