def convert(self, data):
    """Convert a 3D list into a DataSet object.

    :param data: A 3D tensor. Example::

        [
            [ [premise_word_11, premise_word_12, ...],
              [hypothesis_word_11, hypothesis_word_12, ...],
              [label_1] ],
            [ [premise_word_21, premise_word_22, ...],
              [hypothesis_word_21, hypothesis_word_22, ...],
              [label_2] ],
            ...
        ]

    :return: A DataSet object with input fields "premise", "hypothesis",
        "premise_len", "hypothesis_len" and target field "truth".
    """
    data_set = DataSet()
    # Each example unpacks into (premise words, hypothesis words, label).
    for premise, hypothesis, label in data:
        instance = Instance()
        instance.add_field("premise", premise)
        instance.add_field("hypothesis", hypothesis)
        instance.add_field("truth", label)
        data_set.append(instance)
    # Derive per-sentence length fields from the word lists.
    for src, dst in (("premise", "premise_len"), ("hypothesis", "hypothesis_len")):
        data_set.apply(lambda ins, f=src: len(ins[f]), new_field_name=dst)
    data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len")
    data_set.set_target("truth")
    return data_set
def generate_fake_dataset(num_samples=1000):
    """Build a DataSet with four random fields named "0".."3".

    Each field holds ``num_samples`` integer sequences of random length in
    ``[10, 50)``; every field is then randomly flagged as either an input
    or a target field.

    :param num_samples: number of instances to generate
    :return: a DataSet
    """
    min_len, max_len = 10, 50
    num_features = 4

    data_dict = {}
    for feat_idx in range(num_features):
        # One random length per sample, then a random int sequence per length.
        seq_lens = np.random.randint(min_len, max_len, size=(num_samples))
        data_dict[str(feat_idx)] = [np.random.randint(100, size=n) for n in seq_lens]

    dataset = DataSet(data_dict)
    for feat_idx in range(num_features):
        # Coin flip: mark the field as input or as target.
        if np.random.randint(2) == 0:
            dataset.set_input(str(feat_idx))
        else:
            dataset.set_target(str(feat_idx))
    return dataset
def test_list_of_numpy_to_tensor(self):
    """Numpy-array fields of unequal lengths are padded and collated into
    torch tensors when ``as_numpy=False``.

    Fix: the loop previously only ``print``-ed each batch and asserted
    nothing; assert tensor type and padded shape instead, mirroring
    test_list_of_list_to_tensor which uses the same data values.
    Also renamed the iterator so it no longer shadows the builtin ``iter``.
    """
    ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4]))
                  for _ in range(2)] +
                 [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6]))
                  for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    data_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in data_iter:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_numpy_padding(self):
    """With ``as_numpy=True``, variable-length numpy fields are padded to
    the batch-wide maximum length and returned as numpy arrays.

    Fix: renamed the batch iterator so it no longer shadows the builtin
    ``iter``.
    """
    ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                  "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
    ds.set_input("x")
    ds.set_target("y")
    data_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    for x, y in data_iter:
        # Each batch of 4 holds lengths 1..4, so padding yields (4, 4).
        self.assertEqual(x["x"].shape, (4, 4))
        self.assertEqual(y["y"].shape, (4, 4))
def convert(self, data):
    """Convert a list of (words, pos_tags) pairs into a DataSet.

    :param data: iterable where each item is indexable with item[0] the
        token list and item[1] the matching POS-tag list.
    :return: a DataSet with input fields "words"/"seq_len" and target "tags".
    """
    data_set = DataSet()
    for item in data:
        sent_words, sent_pos_tag = item[0], item[1]
        data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
    # Fix: sequence length is the number of tokens in the "words" field;
    # the old `len(ins)` was applied to the Instance itself.
    data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
    data_set.set_target("tags")
    # Fix: the field added above is named "words"; the old code referenced
    # a non-existent "sent_words" field, which raises KeyError.
    data_set.set_input("words")
    data_set.set_input("seq_len")
    return data_set
def test_dataset_batching(self):
    """Batch yields numpy arrays of batch_size rows with the original
    values preserved when ``as_numpy=True``.

    Fix: renamed the batch iterator so it no longer shadows the builtin
    ``iter``.
    """
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    ds.set_input("x")
    ds.set_target("y")
    data_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    for x, y in data_iter:
        self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
        self.assertEqual(len(x["x"]), 4)
        self.assertEqual(len(y["y"]), 4)
        self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
        self.assertListEqual(list(y["y"][-1]), [5, 6])
def test_numpy_to_tensor(self):
    """Numpy fields are padded and converted to torch tensors when
    ``as_numpy=False``.

    Fix: renamed the batch iterator so it no longer shadows the builtin
    ``iter``.
    """
    ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                  "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
    ds.set_input("x")
    ds.set_target("y")
    data_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in data_iter:
        # Each batch of 4 holds lengths 1..4, so padding yields (4, 4).
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_list_of_list_to_tensor(self):
    """Plain-list fields of unequal lengths are padded and collated into
    torch tensors when ``as_numpy=False``.

    Fix: renamed the batch iterator so it no longer shadows the builtin
    ``iter``.
    """
    ds = DataSet([Instance(x=[1, 2], y=[3, 4])
                  for _ in range(2)] +
                 [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6])
                  for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    data_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in data_iter:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_ModelProcessor(self):
    """ModelProcessor runs the wrapped model over a DataSet and stores the
    predictions under a "pred" field."""
    from fastNLP.models.cnn_text_classification import CNNText
    model = CNNText(100, 100, 5)
    instances = []
    for _ in range(64):
        length = np.random.randint(5, 30)
        word_seq = [np.random.randint(0, 100) for _ in range(length)]
        instances.append(Instance(word_seq=word_seq, seq_lens=length))
    data_set = DataSet(instances)
    data_set.set_input("word_seq", "seq_lens")
    processor = ModelProcessor(model)
    data_set = processor(data_set)
    self.assertTrue("pred" in data_set)
def test_input_target(self):
    """set_input/set_target flag existing fields and raise KeyError for
    unknown field names."""
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    ds.set_input("x")
    ds.set_target("y")
    self.assertTrue(ds.field_arrays["x"].is_input)
    self.assertTrue(ds.field_arrays["y"].is_target)
    with self.assertRaises(KeyError):
        ds.set_input("xxx")
    with self.assertRaises(KeyError):
        # Fix: this case should exercise set_target; the original repeated
        # set_input, leaving set_target's error path untested.
        ds.set_target("yyy")
class CustomizedNER(object):
    """Wrap a trained fastNLP NER model and its saved vocabularies, and
    extract named entities (LOC/PER/ORG) from raw sentences.

    :param modelFile: path to the pickled pytorch model, loaded via
        ``ModelLoader().load_pytorch_model``.
    :param vocabFile: path to a text file whose entire content is ``eval``-ed
        into a dict holding word/target vocabularies and counters
        (keys "wordsWc", "wordsVocab", "targetWc", "targetVocab").
    :param addTarget2Vocab: when True, also build a target vocabulary and
        add the target column to the model's input fields.
    """

    def __init__(self, modelFile, vocabFile, addTarget2Vocab=False):
        # CHAR_INPUT="chars"; the char column is later converted to word indices.
        self._vocabFile = vocabFile
        self._addTarget2Vocab = addTarget2Vocab
        self._CONST_CHAR = Const.CHAR_INPUT
        self._CONST_WORDS = Const.INPUT
        self._CONST_TARGET = Const.TARGET
        # Fields fed to the model: word indices plus sequence length
        # (and optionally the target column, appended below).
        self._input_fields = [self._CONST_WORDS, Const.INPUT_LEN]
        self._word_counter, self._word_vocab, self._target_counter, \
            self._target_vocab, self._target = self._get_vocabs()
        self._vocab4word = Vocabulary()
        self._update_word()
        if self._addTarget2Vocab:
            # Target vocabulary without <unk>/<pad> entries.
            self._vocab4target = Vocabulary(unknown=None, padding=None)
            self._input_fields.append(self._CONST_TARGET)
            self._update_target()
        self._model = Predictor(ModelLoader().load_pytorch_model(modelFile))

    def _target_token(self, word_token, cont, number="", word=""):
        """Group the characters of *cont* into entity runs keyed by the
        concatenated label ids that produced them.

        :param word_token: sequence of predicted label ids, one per char;
            0 is assumed to mean "not an entity" (0 is removed from
            ``self._target`` by ``_tartget_label``).
        :param cont: the raw characters of the sentence, same length.
        :param number: accumulator of label-id digits for the current run.
        :param word: accumulator of surface chars for the current run.
        :return: dict mapping label-id strings to sets of entity strings.

        NOTE(review): the nesting below was reconstructed from a
        whitespace-mangled source; the ``else`` is paired with ``if sign``
        (flush the finished run when a new entity starts) — confirm against
        the original file.
        """
        ret = dict()
        sign = True
        lastIdx = len(word_token) - 1
        for num, token in zip(enumerate(word_token), cont):
            # num is (position, label_id); token is the char at that position.
            if num[1] in self._target:
                if sign:
                    number += str(num[1])
                    word += token
                    # A falsy (0) next label ends the current entity run.
                    if num[0] < lastIdx and not word_token[num[0] + 1]:
                        sign = False
                else:
                    # A new entity begins: flush the finished run and restart
                    # the accumulators from the current char.
                    ret.setdefault(number, set())
                    ret[number].add(word)
                    number = ""
                    word = token
                    sign = True
        # Flush the trailing run, if any.
        if number:
            ret.setdefault(number, set())
            ret[number].add(word)
        return ret

    def _extract_ner(self, tokenNum, token, weighted=False):
        """Classify one entity run by its majority label id.

        :param tokenNum: string of concatenated label-id digits for the run.
        :param token: set of surface strings collected for the run.
        :param weighted: only False is implemented; any other value (and any
            label class not ending in LOC/PER/ORG) falls through and returns
            None — NOTE(review): ``_get_ner`` calls ``.items()`` on this
            result, so a None return would raise; presumably every label in
            ``self._target`` ends with LOC/PER/ORG — confirm.
        """
        if not weighted:
            # Most frequent digit in the run decides the entity class.
            cls = self._target.get(int(max(tokenNum, key=tokenNum.count)), "")
            if cls.endswith("LOC"):
                return {"LOC": [x for x in token]}
            elif cls.endswith("PER"):
                return {"PER": [x for x in token]}
            elif cls.endswith("ORG"):
                return {"ORG": [x for x in token]}

    def _get_ner(self, tokenNumber, tokenWord):
        """Turn one sentence's label ids + chars into {"LOC"/"PER"/"ORG": [entities]}."""
        nerDict = self._target_token(tokenNumber, tokenWord)
        ret = dict()
        for num, token in nerDict.items():
            # Runs backed by a single label id are skipped.
            if len(num) == 1:
                continue
            for k, v in self._extract_ner(num, token).items():
                ret.setdefault(k, list())
                ret[k].extend(v)
        return ret

    def _read_vocab(self):
        """Read and ``eval`` the vocab file into a dict.

        NOTE(review): ``eval`` on file content executes arbitrary code —
        only safe if the vocab file is fully trusted; consider
        ``ast.literal_eval``.
        """
        with open(self._vocabFile, "r", encoding="utf-8") as vocabIn:
            return eval(vocabIn.read())

    def _reverse_dict(self, dic):
        """Invert a dict; on duplicate values the first key wins (setdefault)."""
        ret = dict()
        for key, value in dic.items():
            ret.setdefault(value, key)
        return ret

    def _tartget_label(self, dic):
        """Invert the target vocab (id -> label) and drop id 0 (non-entity)."""
        ret = self._reverse_dict(dic)
        del ret[0]
        return ret

    def _get_vocabs(self):
        """Load all vocab structures from the vocab file.

        :return: (word Counter, word vocab dict, target Counter,
            target vocab dict, reversed target vocab id->label).
        """
        vocabs = self._read_vocab()
        word_count = vocabs.get("wordsWc", dict())
        wordsVocab = vocabs.get("wordsVocab", dict())
        target_count = vocabs.get("targetWc", dict())
        targetVocab = vocabs.get("targetVocab", dict())
        reverseTargetVocab = self._tartget_label(targetVocab)
        return Counter(word_count), wordsVocab, Counter(
            target_count), targetVocab, reverseTargetVocab

    def _update_word(self):
        # Seed the word Vocabulary with the saved vocab and its counts.
        self._vocab4word.update(self._word_vocab)
        self._vocab4word.word_count = self._word_counter

    def _update_target(self):
        # Seed the target Vocabulary with the saved vocab and its counts.
        self._vocab4target.update(self._target_vocab)
        self._vocab4target.word_count = self._target_counter

    @property
    def model(self):
        """The wrapped Predictor.

        NOTE(review): a bare ``raise`` outside an except block raises
        RuntimeError("No active exception to re-raise") — probably meant to
        raise a descriptive exception when the model is missing.
        """
        if not self._model:
            raise
        return self._model

    def formatRowString(self, msg):
        """Build a fresh one-instance DataSet from a raw sentence string."""
        msg = msg.strip()
        tokenized_char = [x for x in msg]
        self._dataset = DataSet()
        if self._addTarget2Vocab:
            ins = Instance(chars=tokenized_char, raw_chars=tokenized_char,
                           target=list(dict(self._target_vocab).keys()))
        else:
            ins = Instance(chars=tokenized_char, raw_chars=tokenized_char)
        self._dataset.append(ins)

    @property
    def dataset(self):
        """Index the chars of the current dataset and mark the input fields.

        Must be accessed after ``formatRowString`` has populated
        ``self._dataset``.
        """
        # if input as dict format:
        # data = DataSet({"raw_chars":[msg], "words":[[x for x in msg]], "seq_len":[len(word_list)]})
        # Build the word table from the dataset's chars column.
        self._vocab4word.from_dataset(self._dataset, field_name=self._CONST_CHAR)
        # Convert the chars column to indices using the vocabulary.
        self._vocab4word.index_dataset(self._dataset, field_name=self._CONST_CHAR,
                                       new_field_name=self._CONST_WORDS)
        if self._addTarget2Vocab:
            self._vocab4target.from_dataset(self._dataset, field_name=self._CONST_TARGET)
            self._vocab4target.index_dataset(self._dataset, field_name=self._CONST_TARGET)
        self._dataset.add_seq_len(self._CONST_CHAR)
        self._dataset.set_input(*self._input_fields)
        return self._dataset

    def _content(self):
        # Yield each sentence of the current dataset as a plain string.
        for line in self._dataset["raw_chars"].content:
            yield "".join(line)

    def result(self, dataset):
        """Yield the NER extraction for each predicted line of *dataset*."""
        ret = self.model.predict(dataset)["pred"]
        for line, cont in zip(ret, self._content()):
            # line[0] is assumed to be a tensor of label ids — TODO confirm
            # the Predictor's output shape.
            yield self._get_ner(line[0].tolist(), cont)