def test_init_assert(self):
    with self.assertRaises(AssertionError):
        _ = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 100})
    with self.assertRaises(AssertionError):
        _ = DataSet([[1, 2, 3, 4]] * 10)
    with self.assertRaises(ValueError):
        _ = DataSet(0.00001)
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(
            Instance(sentence=raw_train.data[i], target=int(raw_train.target[i])))
    train_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(), new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(
            Instance(sentence=raw_test.data[i], target=int(raw_test.target[i])))
    test_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(), new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')
    return train_set, test_set, vocab
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]), target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]), target=int(text_test.target[i])))
    # Build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()
    # Map sentences to index sequences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']], new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']], new_field_name='word_seq')
    # Set the input and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")
    return train_data, test_data, vocab
def get_data():
    dataset_train, dataset_test = get_text_classification_datasets()
    # print(dataset_train.data)
    dic_train = {
        "input": dataset_train.data,
        "target": dataset_train.target
    }
    dic_test = {
        "input": dataset_test.data,
        "target": dataset_test.target
    }
    dataset = DataSet(dic_train)
    test_data = DataSet(dic_test)

    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()),
                        field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x),
                        field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')
    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()),
                          field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x),
                          field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')

    # **************************
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    test_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('target', Const.TARGET)
    test_data.rename_field('words', Const.INPUT)
    test_data.rename_field('seq_len', Const.INPUT_LEN)
    test_data.rename_field('target', Const.TARGET)

    # dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_input(Const.INPUT)
    dataset.set_target(Const.TARGET)
    # test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_input(Const.INPUT)
    test_data.set_target(Const.TARGET)
    # **************************

    # only use train for vocab or train+dev
    train_data, dev_data = dataset.split(0.1)
    # print(len(train_data), len(dev_data), len(test_data))
    # print(train_data[0])
    vocab = Vocabulary(min_freq=10).from_dataset(train_data, field_name=Const.INPUT)
    vocab.index_dataset(train_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    vocab.index_dataset(dev_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    vocab.index_dataset(test_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    # print(test_data[0])
    print(len(vocab))
    return vocab, train_data, dev_data, test_data
def test_get_item_error(self):
    with self.assertRaises(RuntimeError):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        _ = ds[40:]
    with self.assertRaises(KeyError):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        _ = ds["kom"]
def readdata():
    global target_len
    min_count = 10
    # categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')
    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)

    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split() if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower()
                for word in data_t.split() if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=10)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')

    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)
    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab
def test_copy_padder(self):
    from fastNLP.core.field import AutoPadder
    ds = DataSet()
    ds.add_field('idx', [1, 2, 3])
    ds['idx'].set_padder(None)  # workaround of problem 1
    ds.apply_field(lambda x: x, 'idx', 'idx')
    self.assertEqual(ds['idx'].padder, None)  # should be None, but AutoPadder

    ds = DataSet()
    ds.add_field('idx', [1, 2, 3])
    ds.apply_field(lambda x: x, 'idx', 'idx')
    self.assertTrue(isinstance(ds.get_field('idx').padder, AutoPadder))  # should be None, but AutoPadder
def read_file(filename, processing_word=get_processing_word(lowercase=False)):
    dataset = DataSet()
    niter = 0
    with codecs.open(filename, "r", "utf-16") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0 or line.startswith("-DOCSTART-"):
                if len(words) != 0:
                    assert len(words) > 2
                    if niter == 1:
                        print(words, tags)
                    niter += 1
                    dataset.append(Instance(ori_words=words[:-1], ori_tags=tags[:-1]))
                    words, tags = [], []
            else:
                word, tag = line.split()
                word = processing_word(word)
                words.append(word)
                tags.append(tag.lower())

    dataset.apply_field(lambda x: [x[0]], field_name='ori_words', new_field_name='task')
    dataset.apply_field(lambda x: len(x), field_name='ori_tags', new_field_name='seq_len')
    dataset.apply_field(lambda x: expand(x), field_name='ori_words', new_field_name="bi1")
    return dataset
def is_phrase_match_BERT(phrase1, phrase2):
    """
    Determine if two phrases match

    :param phrase1: phrase1
    :param phrase2: phrase2
    """
    from fastNLP import DataSetIter, DataSet
    from fastNLP.core.utils import _move_dict_value_to_device
    from my_bert_match import addWords, addWordPiece, processItem, processNum, addSeqlen

    # 0 for not match, 1 for match
    testset = DataSet({"raw_words": [f"{phrase1}::{phrase2}"]})
    testset.apply(addWords, new_field_name="p_words")
    testset.apply(addWordPiece, new_field_name="t_words")
    testset.apply(processItem, new_field_name="word_pieces")
    testset.apply(processNum, new_field_name="word_nums")
    testset.apply(addSeqlen, new_field_name="seq_len")
    testset.field_arrays["word_pieces"].is_input = True
    testset.field_arrays["seq_len"].is_input = True
    testset.field_arrays["word_nums"].is_input = True
    # print(testset)
    with torch.no_grad():
        bert_model.eval()
        test_batch = DataSetIter(batch_size=1, dataset=testset, sampler=None)
        outputs = []
        for batch_x, batch_y in test_batch:
            _move_dict_value_to_device(batch_x, batch_y, device=device)
            outputs.append(bert_model.forward(batch_x["word_pieces"],
                                              batch_x["word_nums"],
                                              batch_x["seq_len"])['pred'])
        outputs = torch.cat(outputs)
        outputs = torch.nn.functional.softmax(outputs, dim=1)
    return ["Not Match", "Related", "Match"][outputs.argmax().item()]
def test_delete_field(self):
    dd = DataSet()
    dd.add_field("x", [[1, 2, 3]] * 10)
    dd.add_field("y", [[1, 2, 3, 4]] * 10)
    dd.delete_field("x")
    self.assertFalse("x" in dd.field_arrays)
    self.assertTrue("y" in dd.field_arrays)
def test_append(self):
    dd = DataSet()
    for _ in range(3):
        dd.append(Instance(x=[1, 2, 3, 4], y=[5, 6]))
    self.assertEqual(len(dd), 3)
    self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3, 4]] * 3)
    self.assertEqual(dd.field_arrays["y"].content, [[5, 6]] * 3)
def load(path):
    data = DataSet()
    _data = []
    with open(path, "r", encoding="utf-8") as fil:
        fil.readline()  # skip the first line
        for line in fil:
            try:
                tradi, verna = line.strip().split("\t")
            except ValueError:
                continue
            tradi = chinese_tokenizer(tradi)
            verna = chinese_tokenizer(verna)
            vocab.add_word_lst(tradi)
            vocab.add_word_lst(verna)
            _data.append(Instance(traditional=tradi, vernacular=verna))
    random.shuffle(_data)
    for x in _data:
        data.append(x)
    data.set_input("vernacular")
    data.set_target("traditional")
    return data
def process_data(data_path, data_name, test=False, bert=False, input_name='text', target_name='target'):
    print('Processing', data_name)
    schemas = {}
    with open(os.path.join(data_path, "all_50_schemas"), 'rb') as f:
        for i, line in enumerate(f):
            spo = json.loads(line)
            schemas[spo['subject_type'] + spo['predicate'] + spo['object_type']] = i

    # input
    text = []
    # target
    target = []
    with open(os.path.join(data_path, data_name), 'rb') as f:
        for line in f:
            dic = json.loads(line)
            if bert:
                text.append(dic['text'])
            else:
                text.append(list(dic['text']))
            if not test:
                target.append(process_class(schemas, dic['spo_list']))

    if not test:
        data_dict = {
            input_name: text,
            target_name: target
        }
    else:
        data_dict = {input_name: text}
    dataset = DataSet(data=data_dict)
    print('Len', len(dataset))
    print('Sample', dataset[0])
    # exit()
    return dataset
def make_dataset(data):
    dataset = DataSet()
    tot = 0
    for x in data:
        seq = "[CLS] " + x["raw_text"]
        seq = tokenizer.encode(seq)
        """
        seq=["[CLS]"]+word_tokenize(x["raw_text"])
        seq=tokenizer.convert_tokens_to_ids(seq)
        """
        if len(seq) > 512:
            seq = seq[:512]
            tot += 1
            # print(x["raw_text"])
            # print()
        label = int(x["label"])
        ins = Instance(origin=x["raw_text"], seq=seq, label=label, seq_len=len(seq))
        dataset.append(ins)
    dataset.set_input("seq", "seq_len")
    dataset.set_target("label")
    print(dataset[5])
    print("number:", len(dataset), tot)
    print()
    return dataset
def test_roberta_embed_eq_roberta_piece_encoder(self):
    # Mainly check that the embedding results are consistent with those of the word piece encoder
    weight_path = 'test/data_for_tests/embedding/small_roberta'
    ds = DataSet({
        'words': ["this is a texta a sentence".split(), 'this is'.split()]
    })
    encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
    encoder.eval()
    encoder.index_datasets(ds, field_name='words')
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    word_pieces_res = encoder(word_pieces)

    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name='words')
    vocab.index_dataset(ds, field_name='words', new_field_name='words')
    ds.set_input('words')
    words = torch.LongTensor(ds['words'].get([0, 1]))
    embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path,
                             pool_method='first', include_cls_sep=True, pooled_cls=False)
    embed.eval()
    words_res = embed(words)

    # Check that the word-piece handling works as expected
    self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
    self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
    self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
def load(self, folder):
    fns = {
        'dev': '{}_dev.csv'.format(self.lg1_lg2),
        'test': '{}_test500.csv'.format(self.lg1_lg2),
        'train': '{}_train500_10.csv'.format(self.lg1_lg2)
    }
    target_lg = self.lg1_lg2.split('_')[0]
    data_bundle = DataBundle()
    for name, fn in fns.items():
        path = os.path.join(folder, fn)
        ds = DataSet()
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split('\t')
                    if self.lower:
                        ins = Instance(word=parts[1].lower(), definition=parts[-1].lower())
                    else:
                        ins = Instance(word=parts[1], definition=parts[-1])
                    ds.append(ins)
        data_bundle.set_dataset(ds, name=name)

    target_words = {}
    with open(os.path.join(folder, '{}.txt'.format(target_lg)), encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                if self.lower:
                    line = line.lower()
                target_words[line] = 1
    target_words = list(target_words.keys())
    setattr(data_bundle, 'target_words', target_words)
    return data_bundle
def create_dataset():
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles']
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale']
    categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
                  'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
                  'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
                  'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
                  'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
                  'talk.politics.misc', 'talk.religion.misc']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, data_home='../../..')
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, data_home='../../..')

    dataset = DataSet()
    for i in range(len(newsgroups_train.data)):
        if len(newsgroups_train.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_train.data[i],
                                    target=int(newsgroups_train.target[i])))
    for i in range(len(newsgroups_test.data)):
        if len(newsgroups_test.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_test.data[i],
                                    target=int(newsgroups_test.target[i])))

    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    dataset.apply(lambda x: x['sentence'].split(), new_field_name='words')
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
    vocab.index_dataset(dataset, field_name='words', new_field_name='words')

    dataset.set_input('words', 'seq_len')
    dataset.set_target('target')

    train_dev_data, test_data = dataset.split(0.1)
    train_data, dev_data = train_dev_data.split(0.1)
    return vocab, train_data, dev_data, test_data
def load(self, path: str, bigram: bool = False) -> DataSet:
    """
    :param path: str
    :param bigram: whether to add a bigram feature
    :return:
    """
    dataset = DataSet()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # skip empty lines
                continue
            parts = line.split()
            word_lens = map(len, parts)
            chars = list(''.join(parts))
            tags = self._word_len_to_target(word_lens)
            assert len(chars) == len(tags['target'])
            dataset.append(Instance(raw_chars=chars, **tags, seq_len=len(chars)))
    if len(dataset) == 0:
        raise RuntimeError(f"{path} has no valid data.")
    if bigram:
        dataset.apply_field(self._gen_bigram, field_name='raw_chars', new_field_name='bigrams')
    return dataset
def get_joke_data(data_path):
    data_set = DataSet()
    sample_num = 0
    sample_len = []
    if os.path.exists(data_path):
        with open(data_path, 'r', encoding='utf-8') as fin:
            for lid, line in enumerate(fin):
                joke = json.loads(line)
                if joke['support'] > 0:
                    if len(joke['content']) == 0:
                        continue
                    else:
                        instance = Instance(raw_joke=joke['content'])
                        data_set.append(instance)
                        sample_num += 1
                        sample_len.append(len(joke['content']))
    else:
        print("the data path doesn't exist.")
    print("Got {} samples from file.".format(sample_num))

    import random
    for i in range(5):
        idx = random.randint(0, sample_num - 1)
        print("sample {}: {}".format(idx, data_set[idx]['raw_joke']))

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.hist(sample_len, bins=50, range=(0, 1000))
    plt.savefig("./examples.jpg")

    count = 0
    for i in sample_len:
        if i < 255:
            count += 1
    print(count, '/', len(sample_len))
    return data_set
def data_analysis(data_path):
    data_set = DataSet()
    sample_num = 0
    sample_len = []
    scores = []
    if os.path.exists(data_path):
        with open(data_path, 'r', encoding='utf-8') as fin:
            for lid, line in enumerate(fin):
                joke = json.loads(line)
                if len(joke['content']) > 0:
                    scores.append(joke['support'])
                    sample_num += 1
                    sample_len.append(len(joke['content']))
    else:
        print("the data path doesn't exist.")
    print("Got {} samples from file.".format(sample_num))

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.hist(scores, bins=50, range=(0, 1500))
    plt.savefig("./sample_scores.jpg")

    count = 0
    for i in scores:
        if i >= 3:
            count += 1
    print(count, '/', len(sample_len))
    return
def test_bert_embed_eq_bert_piece_encoder(self):
    ds = DataSet({
        'words': ["this is a texta model vocab".split(), 'this is'.split()]
    })
    encoder = BertWordPieceEncoder(model_dir_or_name='test/data_for_tests/embedding/small_bert')
    encoder.eval()
    encoder.index_datasets(ds, field_name='words')
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    word_pieces_res = encoder(word_pieces)

    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name='words')
    vocab.index_dataset(ds, field_name='words', new_field_name='words')
    ds.set_input('words')
    words = torch.LongTensor(ds['words'].get([0, 1]))
    embed = BertEmbedding(vocab,
                          model_dir_or_name='test/data_for_tests/embedding/small_bert',
                          pool_method='first', include_cls_sep=True,
                          pooled_cls=False, min_freq=1)
    embed.eval()
    words_res = embed(words)

    # Check that the word-piece handling works as expected
    self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
    self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
    self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)
def make_dataset(data):
    dataset = DataSet()
    mx = 0
    le = None
    for x, y in zip(data.data, data.target):
        xx = deal(x)
        ins = Instance(sentence=xx, label=int(y))
        if mx < len(xx.split()):
            mx = max(mx, len(xx.split()))
            le = xx
        dataset.append(ins)
    print(mx)

    dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('label', Const.TARGET)
    dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_target(Const.TARGET)
    return dataset
def generate_fake_dataset(num_samples=1000):
    """
    The generated DataSet contains the following fields: {'0': [], '1': [], '2': [], '3': []}

    :param num_samples: number of samples
    :return:
    """
    max_len = 50
    min_len = 10
    num_features = 4

    data_dict = {}
    for i in range(num_features):
        data = []
        lengths = np.random.randint(min_len, max_len, size=(num_samples))
        for length in lengths:
            data.append(np.random.randint(1, 100, size=length))
        data_dict[str(i)] = data

    dataset = DataSet(data_dict)

    for i in range(num_features):
        if np.random.randint(2) == 0:
            dataset.set_input(str(i))
        else:
            dataset.set_target(str(i))
    return dataset
def test_apply_more(self):
    T = DataSet({"a": [1, 2, 3], "b": [2, 4, 5]})
    func_1 = lambda x: {"c": x["a"] * 2, "d": x["a"] ** 2}
    func_2 = lambda x: {"c": x * 3, "d": x ** 3}

    def func_err_1(x):
        if x["a"] == 1:
            return {"e": x["a"] * 2, "f": x["a"] ** 2}
        else:
            return {"e": x["a"] * 2}

    def func_err_2(x):
        if x == 1:
            return {"e": x * 2, "f": x ** 2}
        else:
            return {"e": x * 2}

    T.apply_more(func_1)
    self.assertEqual(list(T["c"]), [2, 4, 6])
    self.assertEqual(list(T["d"]), [1, 4, 9])

    res = T.apply_field_more(func_2, "a", modify_fields=False)
    self.assertEqual(list(T["c"]), [2, 4, 6])
    self.assertEqual(list(T["d"]), [1, 4, 9])
    self.assertEqual(list(res["c"]), [3, 6, 9])
    self.assertEqual(list(res["d"]), [1, 8, 27])

    with self.assertRaises(ApplyResultException) as e:
        T.apply_more(func_err_1)
        print(e)
    with self.assertRaises(ApplyResultException) as e:
        T.apply_field_more(func_err_2, "a")
        print(e)
def read_instances_from_file(file, max_len=400, keep_case=False):
    ''' Collect instances and construct vocab '''

    dataset = DataSet()
    trimmed_sent = 0

    with open(file) as f:
        lines = f.readlines()
        for l in lines:
            l = l.strip().split('\t')
            if len(l) < 2:
                continue
            label = int(l[0])
            sent = l[1]
            if not keep_case:
                sent = sent.lower()
            word_lst = sent.split()
            if len(word_lst) > max_len:
                word_lst = word_lst[:max_len]
                trimmed_sent += 1
            if word_lst:
                dataset.append(Instance(words=word_lst, label=label))

    logger.info('Get {} instances from file {}'.format(len(dataset), file))
    if trimmed_sent:
        logger.info('{} sentences are trimmed. Max sentence length: {}.'.format(trimmed_sent, max_len))

    return dataset
def test_drop(self):
    ds = DataSet({
        "x": [[1, 2, 3, 4]] * 40,
        "y": [[5, 6], [7, 8, 9, 0]] * 20
    })
    ds.drop(lambda ins: len(ins["y"]) < 3, inplace=True)
    self.assertEqual(len(ds), 20)
def test_add_field_v2(self):
    ds = DataSet({"x": [3, 4]})
    ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']],
                 is_input=True, is_target=True)
    # ds.apply(lambda x: [x['x']] * 3, is_input=True, is_target=True, new_field_name='y')
    print(ds)
def test_save_load(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    ds.save("./my_ds.pkl")
    self.assertTrue(os.path.exists("./my_ds.pkl"))
    ds_1 = DataSet.load("./my_ds.pkl")
    os.remove("my_ds.pkl")
def test_eq_transformers(self):
    weight_path = ''
    ds = DataSet({
        'words': ["this is a texta model vocab".split(), 'this is'.split()]
    })
    encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
    encoder.eval()
    encoder.index_datasets(ds, field_name='words')
    word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
    word_pieces_res = encoder(word_pieces)

    import transformers
    input1 = ' '.join(ds[0]['words'])
    input2 = ' '.join(ds[1]['words'])
    tokenizer = transformers.RobertaTokenizer.from_pretrained(weight_path)
    idx_list1 = tokenizer.encode(input1)
    idx_list2 = tokenizer.encode(input2)
    self.assertEqual(idx_list1, ds[0]['word_pieces'])
    self.assertEqual(idx_list2, ds[1]['word_pieces'])

    pad_value = tokenizer.encode('<pad>')[0]
    tensor = torch.nn.utils.rnn.pad_sequence(
        [torch.LongTensor(idx_list1), torch.LongTensor(idx_list2)],
        batch_first=True,
        padding_value=pad_value)
    roberta = transformers.RobertaModel.from_pretrained(weight_path, output_hidden_states=True)
    roberta.eval()
    output, pooled_output, hidden_states = roberta(tensor, attention_mask=tensor.ne(pad_value))
    self.assertEqual((output - word_pieces_res).sum(), 0)
def get_data_bmeso(dataset):
    path = bmeso_data_path + dataset + '.char.bmes'
    data = {'raw_chars': [], 'target': [], 'seq_len': [], 'corpus': [], 'chars': []}
    with open(path, encoding='UTF-8') as file:
        raw_sentence = []
        tags = []
        for line in file:
            if line == '\n' and len(raw_sentence) > 0:
                data['raw_chars'].append(''.join(raw_sentence))
                data['target'].append(tags)
                data['seq_len'].append(len(tags))
                data['corpus'].append('NER-Onto')
                data['chars'].append(raw_sentence)
                raw_sentence = []
                tags = []
            else:
                word, tag = line.strip().split()
                word = process_word(word)
                raw_sentence.append(word)
                if tag.endswith('-PER'):
                    tag = tag[0] + '-NR'
                elif tag.endswith('-LOC'):
                    tag = tag[0] + '-NS'
                elif tag.endswith('-GPE'):
                    tag = tag[0] + '-NS'
                elif tag.endswith('-ORG'):
                    tag = tag[0] + '-NT'
                tags.append(tag)
    data = DataSet(data)
    return data