def _get_vocab(data_list):
    vocab = Vocabulary(unknown=unk_str, padding=pad_str)
    for l in data_list:
        vocab.add_word_lst(l)
    vocab.build_vocab()
    print('vocab', len(vocab))
    return vocab
def test_same_vector5(self):
    # Check that word vectors stay consistent when min_freq is applied.
    word_lst = ["they", "the", "they", "the", 'he', 'he', "a", "A"]
    no_create_word_lst = ['of', "of", "she", "she", 'With', 'with']
    all_words = word_lst[:-2] + no_create_word_lst[:-2]
    vocab = Vocabulary().add_word_lst(word_lst)
    vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d',
                            lower=False, min_freq=2)
    words = torch.LongTensor([[vocab.to_index(word) for word in all_words]])
    words = embed(words)

    min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
    min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    min_freq_embed = StaticEmbedding(min_freq_vocab,
                                     model_dir_or_name='en-glove-6B-100d',
                                     lower=False)
    min_freq_words = torch.LongTensor(
        [[min_freq_vocab.to_index(word.lower()) for word in all_words]])
    min_freq_words = min_freq_embed(min_freq_words)

    for idx in range(len(all_words)):
        word_i, word_j = words[0, idx], min_freq_words[0, idx]
        with self.subTest(idx=idx, word=all_words[idx]):
            assert torch.sum(word_i == word_j).eq(min_freq_embed.embed_size)
def test_save_and_load(self):
    fp = 'vocab_save_test.txt'
    try:
        # Check that word2idx is unchanged and no_create_entry flags survive a round trip.
        words = list('abcdefaddfdkjfe')
        no_create_entry = list('12342331')
        unk = '[UNK]'
        vocab = Vocabulary(unknown=unk, max_size=500)
        vocab.add_word_lst(words)
        vocab.add_word_lst(no_create_entry, no_create_entry=True)
        vocab.save(fp)

        new_vocab = Vocabulary.load(fp)
        for word, index in vocab:
            self.assertEqual(new_vocab.to_index(word), index)
        for word in no_create_entry:
            self.assertTrue(new_vocab._is_word_no_create_entry(word))
        for word in words:
            self.assertFalse(new_vocab._is_word_no_create_entry(word))
        for idx in range(len(vocab)):
            self.assertEqual(vocab.to_word(idx), new_vocab.to_word(idx))
        self.assertEqual(vocab.unknown, new_vocab.unknown)
    finally:
        # Use finally (not a bare except) so assertion failures propagate
        # and the temporary file is always removed.
        import os
        if os.path.exists(fp):
            os.remove(fp)
def test_same_vector4(self):
    # Verify the lower=True behaviour when min_freq is set.
    word_lst = ["The", "the", "the", "The", "a", "A"]
    no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with']
    all_words = word_lst[:-2] + no_create_word_lst[:-2]
    vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
    vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d',
                            lower=True)
    words = torch.LongTensor([[vocab.to_index(word) for word in all_words]])
    words = embed(words)

    lowered_word_lst = [word.lower() for word in word_lst]
    lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst]
    lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
    lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True)
    lowered_embed = StaticEmbedding(lowered_vocab,
                                    model_dir_or_name='en-glove-6B-100d',
                                    lower=False)
    lowered_words = torch.LongTensor(
        [[lowered_vocab.to_index(word.lower()) for word in all_words]])
    lowered_words = lowered_embed(lowered_words)

    for idx in range(len(all_words)):
        word_i, word_j = words[0, idx], lowered_words[0, idx]
        with self.subTest(idx=idx, word=all_words[idx]):
            assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
def test_case(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=10)

    encoder_output = torch.randn(2, 3, 10)
    tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)

    for flag in [True, False]:
        for attention in [True, False]:
            with self.subTest(bind_decoder_input_output_embed=flag,
                              attention=attention):
                decoder = LSTMSeq2SeqDecoder(
                    embed=embed, num_layers=2, hidden_size=10, dropout=0.3,
                    bind_decoder_input_output_embed=flag, attention=attention)
                state = decoder.init_state(encoder_output, encoder_mask)
                output = decoder(tgt_words_idx, state)
                self.assertEqual(tuple(output.size()), (2, 4, len(vocab)))
def test_case(self):
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, embedding_dim=10)
    encoder_output = torch.randn(2, 3, 10)
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)

    for flag in [True, False]:
        with self.subTest(bind_decoder_input_output_embed=flag):
            decoder = TransformerSeq2SeqDecoder(
                embed=embed, pos_embed=None, d_model=10, num_layers=2,
                n_head=5, dim_ff=20, dropout=0.1,
                bind_decoder_input_output_embed=flag)  # use the subTest flag rather than a hard-coded True
            state = decoder.init_state(encoder_output, encoder_mask)
            output = decoder(tokens=torch.randint(0, len(vocab), size=(2, 4)),
                             state=state)
            self.assertEqual(output.size(), (2, 4, len(vocab)))
def word_to_id(glove_data, glove_matrix, vocab_dict_path, file_path):
    if not os.path.exists(glove_data) or not os.path.exists(glove_matrix):
        data, feature_words, user_num, item_num = feature_word(file_path)
        vocab = Vocabulary(max_size=len(feature_words) + 1,
                           unknown='unk', padding='PAD')
        vocab.add_word_lst(feature_words)
        vocab.build_vocab()
        matrix = EmbedLoader.load_with_vocab(vocab_dict_path, vocab)
        matrix = torch.tensor(matrix)
        for d in range(len(data)):
            review = []
            for word in data[d]['reviewText']:
                review.append(vocab.to_index(word))
            data[d]['reviewText'] = review
        with open(glove_data, 'wb') as f:
            pickle.dump(data, f)
        with open(glove_matrix, 'wb') as f:
            pickle.dump(matrix, f)

    with open(glove_data, 'rb') as f:
        glove_data = pickle.load(f)
    with open(glove_matrix, 'rb') as f:
        matrix = pickle.load(f)
    return glove_data, matrix, len(glove_data[0]['reviewText'])
def test_search(self):
    """Semantic search. TypeError: expected dimension <= 2 array or matrix"""
    print('{} test_search {}'.format('-' * 15, '-' * 15))
    texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']

    # Vectorize the texts.
    vocab = Vocabulary()
    for text in texts:
        vocab.add_word_lst(list(text))
    print(len(vocab))

    embed = StaticEmbedding(
        vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
    texts_to_id = [[vocab.to_index(word) for word in list(text)]
                   for text in texts]
    words = torch.LongTensor(texts_to_id)  # convert the texts to indices
    features_vec = embed(words)
    print(features_vec.shape)

    # Build the search index!
    cp = ci.MultiClusterIndex(features_vec.detach().numpy(), texts)

    search_texts = ['朱日和站', '温都尔站', '国电站']
    for text in search_texts:
        texts_to_id = [[vocab.to_index(word) for word in list(text)]]
        words = torch.LongTensor(texts_to_id)  # convert the text to indices
        features_vec = embed(words)
        search_features_vec = features_vec.detach().numpy()
        search_result = cp.search(search_features_vec, k=2, k_clusters=2,
                                  return_distance=True)
        print('text:{}'.format(text))
        print('search_result:{}'.format(search_result))
def test_same_vector3(self):
    # Verify the lower=True option.
    word_lst = ["The", "the"]
    no_create_word_lst = ['of', 'Of', 'With', 'with']
    vocab = Vocabulary().add_word_lst(word_lst)
    vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d',
                            lower=True)
    words = torch.LongTensor(
        [[vocab.to_index(word) for word in word_lst + no_create_word_lst]])
    words = embed(words)

    lowered_word_lst = [word.lower() for word in word_lst]
    lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst]
    lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
    lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True)
    lowered_embed = StaticEmbedding(lowered_vocab,
                                    model_dir_or_name='en-glove-6B-100d',
                                    lower=False)
    lowered_words = torch.LongTensor(
        [[lowered_vocab.to_index(word)
          for word in lowered_word_lst + lowered_no_create_word_lst]])
    lowered_words = lowered_embed(lowered_words)

    all_words = word_lst + no_create_word_lst
    for idx, (word_i, word_j) in enumerate(zip(words[0], lowered_words[0])):
        with self.subTest(idx=idx, word=all_words[idx]):
            assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
def get_vocab(dataset):
    vocabulary = Vocabulary(unknown=unk_str, padding=pad_str)
    for data, _ in dataset:
        vocabulary.add_word_lst(data)
    print('vocab', len(vocabulary))
    print('pad', vocabulary.to_index(pad_str))
    return vocabulary
def test_Index2WordProcessor(self):
    vocab = Vocabulary()
    vocab.add_word_lst(["a", "b", "c", "d", "e"])
    proc = Index2WordProcessor(vocab, "tag_id", "tag")
    data_set = DataSet(
        [Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
    data_set = proc(data_set)
    self.assertTrue("tag" in data_set)
def get_vocabulary(data, min_freq):
    # train data -> vocabulary
    # alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    alphabet = "0123456789,"
    char_list = [c for c in alphabet]
    vocabulary = Vocabulary(padding='<pad>', unknown='<unk>')
    vocabulary.add_word_lst(char_list)
    vocabulary.build_vocab()
    print('vocab size', len(vocabulary), 'pad', vocabulary.padding_idx,
          'unk', vocabulary.unknown_idx)
    return vocabulary
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)
    encoder_output = torch.randn(2, 3, 10)
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)
    return embed, encoder_output, encoder_mask
def get_vocabulary(data, min_freq):
    # train data -> vocabulary
    vocabulary = Vocabulary(min_freq=min_freq, padding='<pad>', unknown='<unk>')
    for filename in data:
        for value in data[filename]:
            for word_list in data[filename][value]['data']:
                vocabulary.add_word_lst(word_list)
    vocabulary.build_vocab()
    print('vocab size', len(vocabulary), 'pad', vocabulary.padding_idx,
          'unk', vocabulary.unknown_idx)
    return vocabulary
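The snippet above relies on the `min_freq` argument: when `build_vocab()` runs, words added fewer than `min_freq` times are dropped and resolve to the unknown index. A minimal sketch of that behaviour (not taken from any of the projects above; standard fastNLP API assumed):

from fastNLP import Vocabulary

# Minimal sketch of the min_freq behaviour assumed above: words added fewer
# than min_freq times are dropped at build time and map to the unknown index.
vocab = Vocabulary(min_freq=2, padding='<pad>', unknown='<unk>')
vocab.add_word_lst(['apple', 'apple', 'banana'])  # 'banana' occurs only once
vocab.build_vocab()

assert vocab.to_index('apple') != vocab.unknown_idx   # frequent enough, kept
assert vocab.to_index('banana') == vocab.unknown_idx  # below min_freq, mapped to <unk>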
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    src_words_idx = torch.LongTensor([[3, 1, 2], [1, 2, 0]])
    tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
    src_seq_len = torch.LongTensor([3, 2])
    tgt_seq_len = torch.LongTensor([4, 2])

    return embed, src_words_idx, tgt_words_idx, src_seq_len, tgt_seq_len
def load_dataset(
        data_dir='/remote-home/ygxu/workspace/Product_all',
        data_path='mr.task.train',
        # bert_dir='/home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12',
        bert_dir='/remote-home/ygxu/workspace/BERT/BERT_English_uncased_L-24_H-1024_A_16',
):
    path = os.path.join(data_dir, data_path)

    ds = DataSet.read_csv(path, headers=('label', 'raw_sentence'), sep='\t')
    ds.apply(lambda x: x['raw_sentence'].lower(),
             new_field_name='raw_sentence')
    ds.apply(lambda x: int(x['label']),
             new_field_name='label_seq', is_target=True)

    def transfer_bert_to_fastnlp(ins):
        result = "[CLS] "
        bert_text = ins['bert_tokenize_list']
        for text in bert_text:
            result += text + " "
        return result.strip()

    with open(os.path.join(bert_dir, 'vocab.txt')) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line[:-1])

    vocab_bert = Vocabulary(unknown=None, padding=None)
    vocab_bert.add_word_lst(vocabs)
    vocab_bert.build_vocab()
    vocab_bert.unknown = '[UNK]'
    vocab_bert.padding = '[PAD]'

    from pytorch_pretrained_bert import BertTokenizer, BertModel
    tokenizer = BertTokenizer.from_pretrained(
        os.path.join(bert_dir, 'vocab.txt'))

    ds.apply(lambda x: tokenizer.tokenize(x['raw_sentence']),
             new_field_name='bert_tokenize_list')
    ds.apply(transfer_bert_to_fastnlp, new_field_name='bert_tokenize')
    ds.apply(lambda x: [vocab_bert.to_index(word)
                        for word in x['bert_tokenize_list']],
             new_field_name='index_words', is_input=True)

    ds.rename_field('index_words', 'tokens')
    ds.apply(lambda x: [1.] * len(x['tokens']),
             new_field_name='masks', is_input=True)
    return ds
def read_vocab(file_name):
    # Read the vocab file.
    with open(file_name) as f:
        lines = f.readlines()
    vocabs = []
    for line in lines:
        vocabs.append(line.strip())

    # Instantiate the Vocabulary.
    vocab = Vocabulary(unknown='<unk>', padding='<pad>')
    # Add the word list to the Vocabulary.
    vocab.add_word_lst(vocabs)
    # Build the vocabulary.
    vocab.build_vocab()
    return vocab
def test_rebuild(self):
    # After build_vocab, adding new words must not change the order of existing words.
    vocab = Vocabulary()
    text = [str(idx) for idx in range(10)]
    vocab.update(text)
    for i in text:
        self.assertEqual(int(i) + 2, vocab.to_index(i))

    indexes = []
    for word, index in vocab:
        indexes.append((word, index))

    vocab.add_word_lst([str(idx) for idx in range(10, 13)])
    for idx, pair in enumerate(indexes):
        self.assertEqual(pair[1], vocab.to_index(pair[0]))
    for i in range(13):
        self.assertEqual(i + 2, vocab.to_index(str(i)))
def test_fit(self):
    """Text encoding."""
    print('{} test_fit {}'.format('-' * 15, '-' * 15))
    texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
    vocab = Vocabulary()
    for text in texts:
        vocab.add_word_lst(list(text))
    print(len(vocab))

    embed = StaticEmbedding(
        vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
    texts_to_id = [[vocab.to_index(word) for word in list(text)]
                   for text in ['朱日和', '东台变']]
    print(texts_to_id)  # [[16, 17, 18], [6, 1, 1]]
    words = torch.LongTensor(texts_to_id)  # convert the texts to indices
    print(embed(words).size())  # torch.Size([2, 3, 100])
def get_vocab(trainset, testset):
    # Build the vocabularies and word2idx mappings.
    # tok
    tok_vocab = Vocabulary()
    tok_vocab.from_dataset(trainset, field_name="tok",
                           no_create_entry_dataset=testset)
    tok_vocab.index_dataset(trainset, testset,
                            field_name="tok", new_field_name="chars")
    tok_vocab.index_dataset(trainset, testset,
                            field_name="asp", new_field_name="aspect")

    # deprel
    dep_vocab = Vocabulary()
    dep_vocab.from_dataset(trainset, field_name="deprel")
    dep_vocab.index_dataset(trainset, testset,
                            field_name="deprel", new_field_name="depidx")

    # pol (target)
    pol_vocab = Vocabulary(padding=None, unknown=None)
    pol_vocab.from_dataset(trainset, field_name="pol")
    pol_vocab.index_dataset(trainset, testset,
                            field_name="pol", new_field_name="target")

    # pos
    pos_vocab = Vocabulary()
    pos_vocab.from_dataset(trainset, field_name="pos")
    pos_vocab.index_dataset(trainset, testset,
                            field_name="pos", new_field_name="posidx")

    # post
    max_len = max(max(trainset["seq_len"]), max(testset["seq_len"]))
    post_vocab = Vocabulary()
    post_vocab.add_word_lst(list(range(-max_len, max_len)))
    post_vocab.index_dataset(trainset, testset,
                             field_name="post", new_field_name="postidx")

    return tok_vocab, pos_vocab, post_vocab, trainset, testset
def handle_data(n_class):
    train_data = get_text_classification_datasets(n_class)
    dataset = DataSet()
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    for i in range(len(train_data.data)):
        ans = remove_punc(train_data.data[i])
        dataset.append(Instance(content=ans, target=int(train_data.target[i])))
    dataset.apply(lambda x: x['content'].lower().split(),
                  new_field_name='words', is_input=True)
    for txt in dataset:
        vocab.add_word_lst(txt['words'])
    vocab.build_vocab()

    # Index each sentence with Vocabulary.to_index(word).
    dataset.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                  new_field_name='index')
    dataset.set_input("index")
    dataset.set_target("target")
    tra, dev = dataset.split(0.2)
    return tra, dev, len(vocab)
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    src_words_idx = [[3, 1, 2], [1, 2]]
    # tgt_words_idx = [[1, 2, 3, 4], [2, 3]]
    src_seq_len = [3, 2]
    # tgt_seq_len = [4, 2]

    ds = DataSet({
        'src_tokens': src_words_idx,
        'src_seq_len': src_seq_len,
        'tgt_tokens': src_words_idx,
        'tgt_seq_len': src_seq_len
    })
    ds.set_input('src_tokens', 'tgt_tokens', 'src_seq_len')
    ds.set_target('tgt_seq_len', 'tgt_tokens')
    return embed, ds
def get_data(filepath):
    data = np.load(filepath, allow_pickle=True)
    data, _, ix2word = data['data'], data['word2ix'].item(), data['ix2word'].item()
    wordlist = []
    for d in data:
        for ix in d:
            wordlist.append(ix2word[ix])

    vocab = Vocabulary(min_freq=10, padding="</s>")
    vocab.add_word_lst(wordlist)
    vocab.build_vocab()
    # vocab = Vocabulary(min_freq=10, padding="</s>").add_word_lst(wordlist).build_vocab()
    vocab_size = len(vocab.word2idx)
    for d in data:
        for i in range(len(d)):
            # d[i] = vocab[vocab.to_word(d[i])]
            if d[i] >= vocab_size:
                d[i] = vocab["<unk>"]
    print(vocab_size)
    return data, vocab
def test1(self):
    # Check that the confusion matrix prints correctly.
    from fastNLP import Vocabulary
    from fastNLP.core.utils import ConfusionMatrix
    import numpy as np

    vocab = Vocabulary(unknown=None, padding=None)
    vocab.add_word_lst(list('abcdef'))

    confusion_matrix = ConfusionMatrix(vocab)
    for _ in range(3):
        length = np.random.randint(1, 5)
        pred = np.random.randint(0, 3, size=(length,))
        target = np.random.randint(0, 3, size=(length,))
        confusion_matrix.add_pred_target(pred, target)
    print(confusion_matrix)

    # Test print_ratio.
    confusion_matrix = ConfusionMatrix(vocab, print_ratio=True)
    for _ in range(3):
        length = np.random.randint(1, 5)
        pred = np.random.randint(0, 3, size=(length,))
        target = np.random.randint(0, 3, size=(length,))
        confusion_matrix.add_pred_target(pred, target)
    print(confusion_matrix)
def test_no_entry(self):
    # Build the vocabulary first, then toggle no_create_entry and check it is tracked correctly.
    text = [
        "FastNLP", "works", "well", "in", "most", "cases", "and", "scales",
        "well", "in", "works", "well", "in", "most", "cases", "scales", "well"
    ]
    vocab = Vocabulary()
    vocab.add_word_lst(text)
    self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))
    vocab.add_word('FastNLP', no_create_entry=True)
    self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))

    vocab.add_word('fastnlp', no_create_entry=True)
    self.assertTrue(vocab._is_word_no_create_entry('fastnlp'))
    vocab.add_word('fastnlp', no_create_entry=False)
    self.assertFalse(vocab._is_word_no_create_entry('fastnlp'))

    vocab.add_word_lst(['1'] * 10, no_create_entry=True)
    self.assertTrue(vocab._is_word_no_create_entry('1'))
    vocab.add_word('1')
    self.assertFalse(vocab._is_word_no_create_entry('1'))
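For context, a minimal sketch (standard fastNLP API assumed, not taken from the test above) of how `no_create_entry` is typically used: words that only appear in dev/test data are registered with `no_create_entry=True`, so they still get an index for lookup but are flagged as entries that `StaticEmbedding` should not give a fresh trainable vector.

from fastNLP import Vocabulary

# Minimal sketch: train words create normal entries, dev/test words are
# registered with no_create_entry=True.
train_words = ['fast', 'nlp', 'fast']
dev_words = ['vocab', 'fast']

vocab = Vocabulary()
vocab.add_word_lst(train_words)                      # create normal entries
vocab.add_word_lst(dev_words, no_create_entry=True)  # dev-only words are flagged

assert not vocab._is_word_no_create_entry('fast')    # also seen in train data
assert vocab._is_word_no_create_entry('vocab')       # only seen with no_create_entry=True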
def equip_chinese_ner_with_skip(datasets, vocabs, embeddings, w_list,
                                word_embedding_path=None,
                                word_min_freq=1, only_train_min_freq=0):
    from utils_ import Trie, get_skip_path
    from functools import partial

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    # for k, v in datasets.items():
    #     v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'skips')

    def skips2skips_l2r(chars, w_trie):
        '''
        :param lexicons: list[[int,int,str]]
        :return: skips_l2r
        '''
        # print(lexicons)
        # print('******')
        lexicons = get_skip_path(chars, w_trie=w_trie)
        # max_len = max(list(map(lambda x: max(x[:2]), lexicons))) + 1 if len(lexicons) != 0 else 0
        result = [[] for _ in range(len(chars))]
        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]
            result[e].append([s, w])
        return result

    def skips2skips_r2l(chars, w_trie):
        '''
        :param lexicons: list[[int,int,str]]
        :return: skips_r2l
        '''
        # print(lexicons)
        # print('******')
        lexicons = get_skip_path(chars, w_trie=w_trie)
        # max_len = max(list(map(lambda x: max(x[:2]), lexicons))) + 1 if len(lexicons) != 0 else 0
        result = [[] for _ in range(len(chars))]
        for lex in lexicons:
            s = lex[0]
            e = lex[1]
            w = lex[2]
            result[s].append([e, w])
        return result

    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_l2r, w_trie=w_trie), 'chars', 'skips_l2r')
    for k, v in datasets.items():
        v.apply_field(partial(skips2skips_r2l, w_trie=w_trie), 'chars', 'skips_r2l')

    # print(v['skips_l2r'][0])
    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x],
                      'skips_l2r', 'skips_l2r_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x],
                      'skips_l2r', 'skips_l2r_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x],
                      'skips_r2l', 'skips_r2l_source')
        v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x],
                      'skips_r2l', 'skips_r2l_word')

    for k, v in datasets.items():
        v.apply_field(lambda x: list(map(len, x)),
                      'skips_l2r_word', 'lexicon_count')
        v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_l2r_word', new_field_name='skips_l2r_word')

        v.apply_field(lambda x: list(map(len, x)),
                      'skips_r2l_word', 'lexicon_count_back')
        v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
                      'skips_r2l_word', new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path,
                                         word_dropout=0)
        embeddings['word'] = word_embedding

    vocabs['char'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                 field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                   field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(datasets['train'], datasets['dev'], datasets['test'],
                                  field_name='target', new_field_name='target')

    return datasets, vocabs, embeddings
def process(self, paths, config, load_vocab_file=True):
    """
    :param paths: dict  path for each dataset
    :param load_vocab_file: bool  build vocab (False) or load vocab (True)
    :return: DataBundle
        datasets: dict  keys correspond to the paths dict
        vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
        embeddings: optional
    """
    vocab_size = config.vocab_size

    def _merge_abstracts(abstracts):
        merged = []
        for abstract in abstracts:
            merged.extend(abstract[:self.max_concat_len] + [SEP])
        if len(abstracts) == 0:
            assert merged == []
        return merged[:-1]

    def _pad_graph_inputs(graph_inputs):
        pad_text_wd = []
        max_len = config.max_graph_enc_steps
        for graph_input in graph_inputs:
            if len(graph_input) < max_len:
                pad_num = max_len - len(graph_input)
                graph_input.extend([PAD_TOKEN] * pad_num)
            else:
                graph_input = graph_input[:max_len]
            pad_text_wd.append(graph_input)

        if len(pad_text_wd) == 0:
            pad_text_wd.append([PAD_TOKEN] * max_len)

        return pad_text_wd

    def _get_nbr_input_len(input_wd):
        enc_len = [min(len(text), config.max_graph_enc_steps)
                   for text in input_wd]
        if len(enc_len) == 0:
            enc_len = [0]
        return enc_len

    def _pad_article(text_wd):
        token_num = len(text_wd)
        max_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_len += self.max_concat_len * self.max_concat_num
        if token_num < max_len:
            padding = [PAD_TOKEN] * (max_len - token_num)
            article = text_wd + padding
        else:
            article = text_wd[:max_len]
        return article

    def _split_list(input_list):
        return [text.split() for text in input_list]

    def sent_tokenize(abstract):
        abs_list = abstract.split(".")
        return [(abst + ".") for abst in abs_list[:-1]]

    def _article_token_mask(text_wd):
        max_enc_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_enc_len += self.max_concat_len * self.max_concat_num
        token_num = len(text_wd)
        if token_num < max_enc_len:
            mask = [1] * token_num + [0] * (max_enc_len - token_num)
        else:
            mask = [1] * max_enc_len
        return mask

    def generate_article_input(text, abstracts):
        if config.neighbor_process == "sep":
            text_wd = text.split()[:config.max_enc_steps]
            text_wd.append(SEP)
            abstracts_wd = _merge_abstracts(abstracts)
            return text_wd + abstracts_wd
        else:
            return text.split()

    def generate_graph_inputs(graph_struct):
        graph_inputs_ = [graph_strut_dict[pid][config.graph_input_type]
                         for pid in graph_struct]
        return _split_list(graph_inputs_[1:])

    def generate_graph_structs(paper_id):
        sub_graph_dict = {}
        sub_graph_set = []

        n_hop = config.n_hop
        max_neighbor_num = config.max_neighbor_num
        k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
        for sub_g in k_nbrs:
            sub_graph_set += sub_g

        for node in sub_graph_set:
            sub_graph_dict[node] = []

        for sub_g in k_nbrs:
            for centre_node in sub_g:
                nbrs = graph_strut_dict[centre_node]['references']
                c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                sub_graph_dict[centre_node].extend(c_nbrs)
                for c_nbr in c_nbrs:
                    sub_graph_dict[c_nbr].append(centre_node)
        # in python 3.6, the first in subgraph dict is source paper
        return sub_graph_dict

    def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
        sub_graph = [[] for _ in range(n_hop + 1)]
        level = 0
        visited = set()
        q = deque()
        q.append([paper_id, level])
        curr_node_num = 0
        while len(q) != 0:
            paper_first = q.popleft()
            paper_id_first, level_first = paper_first
            if level_first > n_hop:
                return sub_graph
            sub_graph[level_first].append(paper_id_first)
            curr_node_num += 1
            if curr_node_num > max_neighbor:
                return sub_graph
            visited.add(paper_id_first)
            for pid in graph_strut_dict[paper_id_first]["references"]:
                if pid not in visited and pid in graph_strut_dict:
                    q.append([pid, level_first + 1])
                    visited.add(pid)

        return sub_graph

    def generate_dgl_graph(paper_id, graph_struct, nodes_num):
        g = dgl.DGLGraph()
        assert len(graph_struct) == nodes_num

        g.add_nodes(len(graph_struct))
        pid2idx = {}
        for index, key_node in enumerate(graph_struct):
            pid2idx[key_node] = index
        assert pid2idx[paper_id] == 0

        for index, key_node in enumerate(graph_struct):
            neighbor = [pid2idx[node] for node in graph_struct[key_node]]
            # add self loop
            neighbor.append(index)
            key_nodes = [index] * len(neighbor)
            g.add_edges(key_nodes, neighbor)
        return g

    train_ds = None
    dataInfo = self.load(paths)

    # pop nodes in train graph in inductive setting
    if config.mode == "test" and self.setting == "inductive":
        dataInfo.datasets.pop("train")

    graph_strut_dict = {}
    for key, ds in dataInfo.datasets.items():
        for ins in ds:
            graph_strut_dict[ins["paper_id"]] = ins

    logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

    for key, ds in dataInfo.datasets.items():
        # process summary
        ds.apply(lambda x: x['abstract'].split(), new_field_name='summary_wd')
        ds.apply(lambda x: sent_tokenize(x['abstract']),
                 new_field_name='abstract_sentences')
        # generate graph
        ds.apply(lambda x: generate_graph_structs(x["paper_id"]),
                 new_field_name="graph_struct")
        ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]),
                 new_field_name='graph_inputs_wd')
        ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1,
                 new_field_name="nodes_num")
        # pad input
        ds.apply(lambda x: generate_article_input(x['introduction'],
                                                  x["graph_inputs_wd"]),
                 new_field_name='input_wd')
        ds.apply(lambda x: _article_token_mask(x["input_wd"]),
                 new_field_name="enc_len_mask")
        ds.apply(lambda x: sum(x["enc_len_mask"]), new_field_name="enc_len")
        ds.apply(lambda x: _pad_article(x["input_wd"]),
                 new_field_name="pad_input_wd")
        ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]),
                 new_field_name="nbr_inputs_len")
        ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]),
                 new_field_name="pad_graph_inputs_wd")
        if key == "train":
            train_ds = ds

    vocab_dict = {}
    if not load_vocab_file:
        logger.info("[INFO] Build new vocab from training dataset!")
        if train_ds is None:
            raise ValueError("Lack train file to build vocabulary!")

        vocabs = Vocabulary(max_size=config.vocab_size - 2,
                            padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
        vocabs.from_dataset(train_ds, field_name=["input_wd", "summary_wd"])
        vocabs.add_word(START_DECODING)
        vocabs.add_word(STOP_DECODING)
        vocab_dict["vocab"] = vocabs
        # save vocab
        with open(os.path.join(config.train_path, "vocab"), "w",
                  encoding="utf8") as f:
            for w, idx in vocabs:
                f.write(str(w) + "\t" + str(idx) + "\n")
        logger.info(
            "build new vocab ends.. please reRun the code with load_vocab = True"
        )
        exit(0)
    else:
        logger.info("[INFO] Load existing vocab from %s!" % config.vocab_path)
        word_list = []
        cnt = 3  # pad and unk
        if config.neighbor_process == "sep":
            cnt += 1
        with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
            for line in vocab_f:
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break

        vocabs = Vocabulary(max_size=vocab_size, padding=PAD_TOKEN,
                            unknown=UNKNOWN_TOKEN)
        vocabs.add_word_lst(word_list)
        vocabs.add(START_DECODING)
        vocabs.add(STOP_DECODING)
        if config.neighbor_process == "sep":
            vocabs.add(SEP)
        vocabs.build_vocab()
        vocab_dict["vocab"] = vocabs

    logger.info(f"vocab size = {len(vocabs)}")
    assert len(vocabs) == config.vocab_size
    dataInfo.set_vocab(vocabs, "vocab")

    for key, dataset in dataInfo.datasets.items():
        # do not process the training set in test mode
        if config.mode == "test" and key == "train":
            continue

        data_dict = {
            "enc_input": [],
            "nbr_inputs": [],
            "graph": [],
            "dec_input": [],
            "target": [],
            "dec_len": [],
            "article_oovs": [],
            "enc_input_extend_vocab": [],
        }
        logger.info(
            f"start construct the input of the model for {key} set, please wait..."
        )
        for instance in dataset:
            graph_inputs = instance["pad_graph_inputs_wd"]
            abstract_sentences = instance["summary_wd"]
            enc_input = instance["pad_input_wd"]
            enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                getting_full_info(enc_input, graph_inputs, abstract_sentences,
                                  dataInfo.vocabs['vocab'], config)
            graph = generate_dgl_graph(instance["paper_id"],
                                       instance["graph_struct"],
                                       instance["nodes_num"])

            data_dict["graph"].append(graph)
            data_dict["enc_input"].append(enc_input)
            data_dict["nbr_inputs"].append(nbr_inputs)
            data_dict["dec_input"].append(dec_input)
            data_dict["target"].append(target)
            data_dict["dec_len"].append(dec_len)
            data_dict["article_oovs"].append(article_oovs)
            data_dict["enc_input_extend_vocab"].append(enc_input_extend_vocab)

        dataset.add_field("enc_input", data_dict["enc_input"])
        dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
        dataset.add_field("dec_input", data_dict["dec_input"])
        dataset.add_field("target", data_dict["target"])
        dataset.add_field("dec_len", data_dict["dec_len"])
        dataset.add_field("article_oovs", data_dict["article_oovs"])
        dataset.add_field("enc_input_extend_vocab",
                          data_dict["enc_input_extend_vocab"])
        dataset.add_field("graph", data_dict["graph"])
        dataset.set_ignore_type('graph')  # without this line, there may be some errors
        dataset.set_input("graph")

        dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len",
                          "enc_input", "enc_len_mask", "dec_input", "dec_len",
                          "article_oovs", "nodes_num",
                          "enc_input_extend_vocab")
        dataset.set_target("target", "article_oovs", "abstract_sentences")

        dataset.delete_field('graph_inputs_wd')
        dataset.delete_field('pad_graph_inputs_wd')
        dataset.delete_field('input_wd')
        dataset.delete_field('pad_input_wd')

    logger.info("------load dataset over---------")
    return dataInfo, vocabs
def change_tag(ins):
    words = ['[unused14]'] + ins['words'][1:]
    return words

for target in target_list:
    all_data[target]['POS-ctb9'].apply(change_tag, new_field_name='words')

print(all_data['train']['POS-ctb9'][0]['words'][:1])

for task in all_data['train'].keys():
    if task.startswith('Parsing'):
        continue
    dataset = all_data['train'][task]
    for word_lst in dataset['words']:
        chars_vocab.add_word_lst(word_lst)

pos_idx = chars_vocab.to_index('[unused14]')
print(pos_idx)

label_vocab['POS'] = Vocabulary().from_dataset(
    all_data['train']['POS-ctb9'], field_name='target')
label_vocab['CWS'] = Vocabulary().from_dataset(
    all_data['train']['CWS-pku'], field_name='target')
label_vocab['NER'] = Vocabulary().from_dataset(
    all_data['train']['NER-msra'], field_name='target')
label_vocab['Parsing'] = torch.load('vocab/parsing_vocab')
label_vocab['pos'] = Vocabulary().from_dataset(
    all_data['train']['Parsing-ctb9'], field_name='pos')

for target in target_list:
dev_file = None
test_file = None
for f in v:
    # example of f: 'health.dev'
    data_type = f.split('.')[1]
    if data_type == 'train':
        train_file = os.path.join(data_path, f)
    elif data_type == 'dev':
        dev_file = os.path.join(data_path, f)
    elif data_type == 'test':
        test_file = os.path.join(data_path, f)
    else:
        raise ValueError('unknown dataset type')

train_set = read_instances_from_file(train_file)
train_set.add_field('task_id', [task_id] * len(train_set))
train_set.apply(lambda x: vocab.add_word_lst(x['words']))

dev_set = read_instances_from_file(dev_file)
dev_set.add_field('task_id', [task_id] * len(dev_set))
dev_set.apply(lambda x: vocab.add_word_lst(x['words']))

test_set = read_instances_from_file(test_file)
test_set.add_field('task_id', [task_id] * len(test_set))
# test_set.apply(lambda x: vocab.add_word_lst(x['words']))

task = Task(task_id, k, train_set, dev_set, test_set)
task_lst.append(task)

logger.info('Building vocabulary...')
vocab.build_vocab()
logger.info('Finished. Size of vocab: {}.'.format(len(vocab)))
from fastNLP.io import WeiboSenti100kPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.io.pipe.qa import CMRC2018Loader
from fastNLP.io import CNXNLILoader
from fastNLP.io import WeiboNERLoader
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

if __name__ == "__main__":
    # Download the sentiment-analysis (classification) data.
    data_bundle = WeiboSenti100kPipe().process_from_file()
    data_bundle.rename_field('chars', 'words')
    # Download BERT.
    embed = BertEmbedding(data_bundle.get_vocab('words'),
                          model_dir_or_name='cn-wwm', include_cls_sep=True)
    # Question answering data.
    data_bundle = CMRC2018Loader().load()
    # Text matching.
    data_bundle = CNXNLILoader().load()
    # NER.
    data_bundle = WeiboNERLoader().load()
    # Embedding.
    vocab = Vocabulary()
    vocab.add_word_lst("你 好 .".split())
    embed = StaticEmbedding(vocab, model_dir_or_name='cn-sgns-literature-word')