def convert(self, data):
    """Convert a 3D list to a DataSet object.

    :param data: A 3D tensor. Example::

        [
            [ [premise_word_11, ...], [hypothesis_word_11, ...], [label_1] ],
            [ [premise_word_21, ...], [hypothesis_word_21, ...], [label_2] ],
            ...
        ]

    :return: A DataSet object with input fields "premise", "hypothesis",
        "premise_len", "hypothesis_len" and target field "truth".
    """
    data_set = DataSet()
    # Each example unpacks as (premise: list, hypothesis: list, label: str).
    for premise, hypothesis, label in data:
        ins = Instance()
        ins.add_field("premise", premise)
        ins.add_field("hypothesis", hypothesis)
        ins.add_field("truth", label)
        data_set.append(ins)
    # Derive the per-sentence lengths once all instances are in place.
    data_set.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len")
    data_set.apply(lambda ins: len(ins["hypothesis"]), new_field_name="hypothesis_len")
    data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len")
    data_set.set_target("truth")
    return data_set
def convert(self, data):
    """Convert (words, pos_tags) pairs into a DataSet for POS tagging.

    :param data: iterable of items where item[0] is a list of words and
        item[1] is the matching list of POS tags.
    :return: a DataSet with input fields "words" and "seq_len", and
        target field "tags".
    """
    data_set = DataSet()
    for item in data:
        sent_words, sent_pos_tag = item[0], item[1]
        data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
    # BUG FIX: seq_len is the number of words, not len(Instance); the
    # original `len(ins)` measured the Instance object itself.
    data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
    data_set.set_target("tags")
    # BUG FIX: the field created above is named "words" — the original
    # set_input("sent_words") referenced a field that does not exist.
    data_set.set_input("words")
    data_set.set_input("seq_len")
    return data_set
def test_apply(self):
    """apply() should add or overwrite fields, and return a plain list of
    results when new_field_name is omitted."""
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})

    # Adding a new field from an existing one.
    ds.apply(lambda ins: ins["x"][::-1], new_field_name="rx")
    self.assertTrue("rx" in ds.field_arrays)
    self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1])

    # Overwriting an existing field in place.
    ds.apply(lambda ins: len(ins["y"]), new_field_name="y")
    self.assertEqual(ds.field_arrays["y"].content[0], 2)

    # Without new_field_name, apply() returns the list of results.
    res = ds.apply(lambda ins: len(ins["x"]))
    self.assertTrue(isinstance(res, list) and len(res) > 0)
    # BUG FIX: assertTrue(res[0], 4) treated 4 as the failure *message*
    # and passed for any truthy res[0]; assert the expected value instead.
    self.assertEqual(res[0], 4)
def predict(self, content):
    """Run dependency parsing over raw input via the loaded pipeline.

    :param content: input accepted by ``self.pos_tagger.predict``.
    :return: list of per-sentence outputs, each a list of "head/label"
        strings, e.g. [['2/top', '0/root', '4/nn', '2/dep']].
    :raises ValueError: if no pipeline has been loaded yet.
    """
    if not hasattr(self, 'pipeline'):
        raise ValueError("You have to load model first.")

    # Step 1: POS-tag the input, producing "word/tag" tokens.
    # e.g. pos_out = ['这里/NN 是/VB 分词/NN 结果/NN'.split()]
    pos_out = self.pos_tagger.predict(content)

    # Step 2: build the dataset — split each "word/tag" token and prepend
    # a <BOS> marker to both the word and the tag sequence.
    def split_field(part):
        return lambda ins: ['<BOS>'] + [tok.split('/')[part] for tok in ins['wp']]

    dataset = DataSet()
    dataset.add_field('wp', pos_out)
    dataset.apply(split_field(0), new_field_name='words')
    dataset.apply(split_field(1), new_field_name='pos')
    dataset.rename_field("words", "raw_words")

    # Step 3: run the pipeline, then format "head/label" pairs, dropping
    # the leading <BOS> position.
    self.pipeline(dataset)
    dataset.apply(lambda ins: [str(arc) for arc in ins['arc_pred']],
                  new_field_name='arc_pred')
    dataset.apply(
        lambda ins: [head + '/' + lbl
                     for head, lbl in zip(ins['arc_pred'], ins['label_pred_seq'])][1:],
        new_field_name='output')
    # output like: [['2/top', '0/root', '4/nn', '2/dep']]
    return dataset.field_arrays['output'].content
def convert(self, data):
    """Pack raw items into a DataSet, honoring the pos/ner flags.

    Each item starts with its word list; optional pos-tag and ner columns
    follow, with pos first when both flags are set.

    :param data: iterable of items (words[, pos_tags][, ner]).
    :return: a DataSet with a derived "seq_len" field.
    """
    data_set = DataSet()
    for item in data:
        fields = {"words": item[0]}
        # Consume the optional columns in order; `is True` matches the
        # original strict checks exactly.
        col = 1
        if self.pos is True:
            fields["pos_tags"] = item[col]
            col += 1
        if self.ner is True:
            fields["ner"] = item[col]
        data_set.append(Instance(**fields))
    data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
    return data_set
def predict(self, content):
    """
    :param content: list of list of str. Each string is a token(word).
    :return answer: list of list of str. Each string is a tag.
    """
    if not hasattr(self, "pipeline"):
        raise ValueError("You have to load model first.")

    sentence_list = []
    # 1. normalize the input into a list of sentences
    if isinstance(content, str):
        sentence_list.append(content)
    elif isinstance(content, list):
        sentence_list = content

    # 2. build the dataset
    dataset = DataSet()
    dataset.add_field("words", sentence_list)

    # 3. run the pipeline
    self.pipeline(dataset)

    def decode_tags(ins):
        # Merge BMES-style tags back into "word/POS" strings.
        pred_tags = ins["tag"]
        chars = ins["words"]
        words = []
        start_idx = 0
        for idx, tag in enumerate(pred_tags):
            if tag[0] == "S":
                # BUG FIX: the slice must be joined like the "E" branch —
                # concatenating the raw slice with "/" raises TypeError
                # (list + str) when `chars` is a list of str, which is the
                # input type documented above.
                words.append("".join(chars[start_idx:idx + 1]) + "/" + tag[2:])
                start_idx = idx + 1
            elif tag[0] == "E":
                words.append("".join(chars[start_idx:idx + 1]) + "/" + tag[2:])
                start_idx = idx + 1
        return words

    dataset.apply(decode_tags, new_field_name="tag_output")

    output = dataset.field_arrays["tag_output"].content
    if isinstance(content, str):
        return output[0]
    elif isinstance(content, list):
        return output
def test_BucketSampler(self):
    """The sampler must yield exactly one index per instance."""
    sampler = BucketSampler(num_buckets=3, batch_size=16,
                            seq_lens_field_name="seq_len")
    # 10 identical rows (randint is evaluated once); lengths still feed
    # the bucketing via the derived "seq_len" field.
    data_set = DataSet({"x": [[0] * random.randint(1, 10)] * 10,
                        "y": [[5, 6]] * 10})
    data_set.apply(lambda ins: len(ins["x"]), new_field_name="seq_len")
    self.assertEqual(len(sampler(data_set)), 10)
def test_apply(self):
    """apply() with new_field_name adds a field holding reversed copies."""
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})

    def reverse_x(ins):
        return ins["x"][::-1]

    ds.apply(reverse_x, new_field_name="rx")
    self.assertIn("rx", ds.field_arrays)
    self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1])