def test_gpt2_embedding(self):
    weight_path = 'test/data_for_tests/embedding/small_gpt2'
    vocab = Vocabulary().add_word_lst("this is a texta sentence".split())
    embed = GPT2Embedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1)
    requires_grad = embed.requires_grad
    embed.requires_grad = not requires_grad
    embed.train()
    words = torch.LongTensor([[2, 3, 4, 0]])
    result = embed(words)
    self.assertEqual(result.size(), (1, 4, 16))

    embed = GPT2Embedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1,
                          only_use_pretrain_bpe=False, language_model=True)
    embed.eval()
    words = torch.LongTensor([[2, 3, 4, 0]])
    result = embed(words)
    self.assertEqual(result.size(), (1, 4, 16))
    embed.get_lm_loss()

    vocab.add_word("NotInGpt2")
    embed = GPT2Embedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1,
                          only_use_pretrain_bpe=False, auto_truncate=True, min_freq=1)
    words = torch.LongTensor([[2, 3, 4, 0] * 20])
    result = embed(words)
    self.assertEqual(result.size(), (1, 80, 16))
def get_dataset(data_path):
    print('Getting dataset...')
    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    # print(poetry[0])
    data = DataSet(data=poetry)
    print("Original data:", data[0])
    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))
    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] + [vocabulary.to_index('<eos>')],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] * (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')
    # length = config.sequence_length
    # for i, d in enumerate(data):
    #     if length != len(d['text']):
    #         print("wrong!")
    #         exit()
    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    # print("Dev data:", dev_data[0])
    return train_data, dev_data, vocabulary
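# Usage sketch for get_dataset (assumes a poem corpus file and a `config`
# module with `sequence_length`, both defined elsewhere in this project;
# the path below is a placeholder):
train_data, dev_data, vocab = get_dataset('data/poetry.txt')
sample = train_data[0]
# 'input' is the id sequence without its last token and 'target' without its
# first, i.e. a next-character language-modelling pair of equal length.
assert len(sample['input']) == len(sample['target'])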
def test_load_with_vocab(self):
    vocab = Vocabulary()
    glove = "test/data_for_tests/glove.6B.50d_test.txt"
    word2vec = "test/data_for_tests/word2vec_test.txt"
    vocab.add_word('the')
    vocab.add_word('none')
    g_m = EmbedLoader.load_with_vocab(glove, vocab)
    self.assertEqual(g_m.shape, (4, 50))
    w_m = EmbedLoader.load_with_vocab(word2vec, vocab, normalize=True)
    self.assertEqual(w_m.shape, (4, 50))
    self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 4)
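# Sketch: wrapping the numpy matrix returned by EmbedLoader.load_with_vocab
# in a torch embedding layer. The rows follow the vocab's index order, so
# padding/unknown come first; `embedding_from_matrix` is a hypothetical
# helper, not part of fastNLP.
import numpy as np
import torch

def embedding_from_matrix(matrix: np.ndarray, freeze: bool = True) -> torch.nn.Embedding:
    # from_pretrained copies the weights; freeze=True keeps them fixed during training.
    return torch.nn.Embedding.from_pretrained(torch.FloatTensor(matrix), freeze=freeze)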
def test_case12(self):
    # Check that a transition matrix can be generated from a vocab.
    from fastNLP.modules.decoder.crf import allowed_transitions

    id2label = {0: 'B', 1: 'I', 2: 'O'}
    vocab = Vocabulary(unknown=None, padding=None)
    for idx, tag in id2label.items():
        vocab.add_word(tag)
    expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4),
                    (2, 0), (2, 2), (2, 4), (3, 0), (3, 2)}
    self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True)))

    id2label = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}
    vocab = Vocabulary(unknown=None, padding=None)
    for idx, tag in id2label.items():
        vocab.add_word(tag)
    expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5),
                    (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)}
    self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True)))

    id2label = {0: 'B', 1: 'I', 2: 'O', 3: '<pad>', 4: "<unk>"}
    vocab = Vocabulary()
    for idx, tag in id2label.items():
        vocab.add_word(tag)
    allowed_transitions(vocab, include_start_end=True)

    labels = ['O']
    for label in ['X', 'Y']:
        for tag in 'BI':
            labels.append('{}-{}'.format(tag, label))
    id2label = {idx: label for idx, label in enumerate(labels)}
    expected_res = {(0, 0), (0, 1), (0, 3), (0, 6), (1, 0), (1, 1), (1, 2), (1, 3), (1, 6),
                    (2, 0), (2, 1), (2, 2), (2, 3), (2, 6), (3, 0), (3, 1), (3, 3), (3, 4), (3, 6),
                    (4, 0), (4, 1), (4, 3), (4, 4), (4, 6), (5, 0), (5, 1), (5, 3)}
    vocab = Vocabulary(unknown=None, padding=None)
    for idx, tag in id2label.items():
        vocab.add_word(tag)
    self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True)))

    labels = []
    for label in ['X', 'Y']:
        for tag in 'BMES':
            labels.append('{}-{}'.format(tag, label))
    id2label = {idx: label for idx, label in enumerate(labels)}
    vocab = Vocabulary(unknown=None, padding=None)
    for idx, tag in id2label.items():
        vocab.add_word(tag)
    expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9),
                    (3, 0), (3, 3), (3, 4), (3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6),
                    (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0), (7, 3), (7, 4), (7, 7), (7, 9),
                    (8, 0), (8, 3), (8, 4), (8, 7)}
    self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True)))
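# Sketch: the (from_id, to_id) pairs returned by allowed_transitions are
# typically passed to fastNLP's ConditionalRandomField so that illegal tag
# transitions are masked out (constructor signature assumed from the same
# fastNLP version as the test above).
from fastNLP import Vocabulary
from fastNLP.modules.decoder.crf import ConditionalRandomField, allowed_transitions

tag_vocab = Vocabulary(unknown=None, padding=None)
tag_vocab.add_word_lst(['B', 'I', 'O'])
trans = allowed_transitions(tag_vocab, include_start_end=True)
crf = ConditionalRandomField(num_tags=len(tag_vocab), include_start_end_trans=True,
                             allowed_transitions=trans)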
def load_snli(path, files):
    loader = SNLILoader()
    ds_list = [loader.load(os.path.join(path, f)) for f in files]
    word_v = Vocabulary(min_freq=2)
    tag_v = Vocabulary(unknown=None, padding=None)
    for ds in ds_list:
        ds.apply(lambda x: [w.lower() for w in x['words1']], new_field_name='words1')
        ds.apply(lambda x: [w.lower() for w in x['words2']], new_field_name='words2')
    update_v(word_v, ds_list[0], 'words1')
    update_v(word_v, ds_list[0], 'words2')
    ds_list[0].apply(lambda x: tag_v.add_word(x['target']), new_field_name=None)

    def process_data(ds):
        to_index(word_v, ds, 'words1', C.INPUTS(0))
        to_index(word_v, ds, 'words2', C.INPUTS(1))
        ds.apply(lambda x: tag_v.to_index(x['target']), new_field_name=C.TARGET)
        ds.apply(lambda x: x[C.INPUTS(0)][:MAX_LEN], new_field_name=C.INPUTS(0))
        ds.apply(lambda x: x[C.INPUTS(1)][:MAX_LEN], new_field_name=C.INPUTS(1))
        ds.apply(lambda x: len(x[C.INPUTS(0)]), new_field_name=C.INPUT_LENS(0))
        ds.apply(lambda x: len(x[C.INPUTS(1)]), new_field_name=C.INPUT_LENS(1))
        ds.set_input(C.INPUTS(0), C.INPUTS(1), C.INPUT_LENS(0), C.INPUT_LENS(1))
        ds.set_target(C.TARGET)

    for i in range(len(ds_list)):
        process_data(ds_list[i])
    return ds_list, word_v, tag_v
def load_sst(path, files):
    loaders = [SSTLoader(subtree=sub, fine_grained=True)
               for sub in [True, False, False]]
    ds_list = [loader.load(os.path.join(path, fn))
               for fn, loader in zip(files, loaders)]
    word_v = Vocabulary(min_freq=2)
    tag_v = Vocabulary(unknown=None, padding=None)
    for ds in ds_list:
        ds.apply(lambda x: [w.lower() for w in x['words']], new_field_name='words')
    ds_list[0].drop(lambda x: len(x['words']) < 3)
    update_v(word_v, ds_list[0], 'words')
    ds_list[0].apply(lambda x: tag_v.add_word(x['target']), new_field_name=None)

    def process_data(ds):
        to_index(word_v, ds, 'words', C.INPUT)
        ds.apply(lambda x: tag_v.to_index(x['target']), new_field_name=C.TARGET)
        ds.apply(lambda x: x[C.INPUT][:MAX_LEN], new_field_name=C.INPUT)
        ds.apply(lambda x: len(x['words']), new_field_name=C.INPUT_LEN)
        ds.set_input(C.INPUT, C.INPUT_LEN)
        ds.set_target(C.TARGET)

    for i in range(len(ds_list)):
        process_data(ds_list[i])
    return ds_list, word_v, tag_v
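# update_v and to_index are called by load_snli and load_sst above but not
# defined in this section. A plausible sketch, inferred only from their call
# sites (hypothetical, not the project's confirmed implementation):
def update_v(vocab, ds, field):
    # Add every token of `field` in `ds` to `vocab`.
    ds.apply(lambda x: [vocab.add_word(w) for w in x[field]], new_field_name=None)

def to_index(vocab, ds, field, new_field):
    # Map the tokens of `field` to ids, writing the result into `new_field`.
    ds.apply(lambda x: [vocab.to_index(w) for w in x[field]], new_field_name=new_field)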
def test_no_entry(self):
    # Build the vocabulary first, then vary no_create_entry and check that
    # the entry status of each word is tracked correctly.
    text = ["FastNLP", "works", "well", "in", "most", "cases", "and", "scales", "well", "in",
            "works", "well", "in", "most", "cases", "scales", "well"]
    vocab = Vocabulary()
    vocab.add_word_lst(text)
    self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))
    vocab.add_word('FastNLP', no_create_entry=True)
    self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))

    vocab.add_word('fastnlp', no_create_entry=True)
    self.assertTrue(vocab._is_word_no_create_entry('fastnlp'))
    vocab.add_word('fastnlp', no_create_entry=False)
    self.assertFalse(vocab._is_word_no_create_entry('fastnlp'))

    vocab.add_word_lst(['1'] * 10, no_create_entry=True)
    self.assertTrue(vocab._is_word_no_create_entry('1'))
    vocab.add_word('1')
    self.assertFalse(vocab._is_word_no_create_entry('1'))
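# The idiom this test exercises: words that occur only in dev/test data are
# added with no_create_entry=True, so a pretrained embedding can still look
# them up without reserving a trainable row for them (illustrative sketch
# using the same Vocabulary API as the test):
vocab = Vocabulary()
vocab.add_word_lst(["seen", "in", "train"])                 # real entries
vocab.add_word_lst(["dev", "only"], no_create_entry=True)   # lookup-only entries
assert vocab._is_word_no_create_entry("dev")
assert not vocab._is_word_no_create_entry("train")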
def process(self, paths, config, load_vocab_file=True):
    """
    :param paths: dict  path for each dataset
    :param load_vocab_file: bool  build vocab (False) or load vocab (True)
    :return: DataBundle
        datasets: dict  keys correspond to the paths dict
        vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
        embeddings: optional
    """
    vocab_size = config.vocab_size

    def _merge_abstracts(abstracts):
        merged = []
        for abstract in abstracts:
            merged.extend(abstract[:self.max_concat_len] + [SEP])
        if len(abstracts) == 0:
            assert merged == []
        return merged[:-1]

    def _pad_graph_inputs(graph_inputs):
        pad_text_wd = []
        max_len = config.max_graph_enc_steps
        for graph_input in graph_inputs:
            if len(graph_input) < max_len:
                pad_num = max_len - len(graph_input)
                graph_input.extend([PAD_TOKEN] * pad_num)
            else:
                graph_input = graph_input[:max_len]
            pad_text_wd.append(graph_input)
        if len(pad_text_wd) == 0:
            pad_text_wd.append([PAD_TOKEN] * max_len)
        return pad_text_wd

    def _get_nbr_input_len(input_wd):
        enc_len = [min(len(text), config.max_graph_enc_steps) for text in input_wd]
        if len(enc_len) == 0:
            enc_len = [0]
        return enc_len

    def _pad_article(text_wd):
        token_num = len(text_wd)
        max_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_len += self.max_concat_len * self.max_concat_num
        if token_num < max_len:
            padding = [PAD_TOKEN] * (max_len - token_num)
            article = text_wd + padding
        else:
            article = text_wd[:max_len]
        return article

    def _split_list(input_list):
        return [text.split() for text in input_list]

    def sent_tokenize(abstract):
        abs_list = abstract.split(".")
        return [(abst + ".") for abst in abs_list[:-1]]

    def _article_token_mask(text_wd):
        max_enc_len = config.max_enc_steps
        if config.neighbor_process == "sep":
            max_enc_len += self.max_concat_len * self.max_concat_num
        token_num = len(text_wd)
        if token_num < max_enc_len:
            mask = [1] * token_num + [0] * (max_enc_len - token_num)
        else:
            mask = [1] * max_enc_len
        return mask

    def generate_article_input(text, abstracts):
        if config.neighbor_process == "sep":
            text_wd = text.split()[:config.max_enc_steps]
            text_wd.append(SEP)
            abstracts_wd = _merge_abstracts(abstracts)
            return text_wd + abstracts_wd
        else:
            return text.split()

    def generate_graph_inputs(graph_struct):
        graph_inputs_ = [graph_strut_dict[pid][config.graph_input_type]
                         for pid in graph_struct]
        return _split_list(graph_inputs_[1:])

    def generate_graph_structs(paper_id):
        sub_graph_dict = {}
        sub_graph_set = []
        n_hop = config.n_hop
        max_neighbor_num = config.max_neighbor_num
        k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
        for sub_g in k_nbrs:
            sub_graph_set += sub_g
        for node in sub_graph_set:
            sub_graph_dict[node] = []
        for sub_g in k_nbrs:
            for centre_node in sub_g:
                nbrs = graph_strut_dict[centre_node]['references']
                c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                sub_graph_dict[centre_node].extend(c_nbrs)
                for c_nbr in c_nbrs:
                    sub_graph_dict[c_nbr].append(centre_node)
        # in python 3.6, the first in subgraph dict is source paper
        return sub_graph_dict

    def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
        sub_graph = [[] for _ in range(n_hop + 1)]
        level = 0
        visited = set()
        q = deque()
        q.append([paper_id, level])
        curr_node_num = 0
        while len(q) != 0:
            paper_first = q.popleft()
            paper_id_first, level_first = paper_first
            if level_first > n_hop:
                return sub_graph
            sub_graph[level_first].append(paper_id_first)
            curr_node_num += 1
            if curr_node_num > max_neighbor:
                return sub_graph
            visited.add(paper_id_first)
            for pid in graph_strut_dict[paper_id_first]["references"]:
                if pid not in visited and pid in graph_strut_dict:
                    q.append([pid, level_first + 1])
                    visited.add(pid)
        return sub_graph

    def generate_dgl_graph(paper_id, graph_struct, nodes_num):
        g = dgl.DGLGraph()
        assert len(graph_struct) == nodes_num
        g.add_nodes(len(graph_struct))
        pid2idx = {}
        for index, key_node in enumerate(graph_struct):
            pid2idx[key_node] = index
        assert pid2idx[paper_id] == 0
        for index, key_node in enumerate(graph_struct):
            neighbor = [pid2idx[node] for node in graph_struct[key_node]]
            # add self loop
            neighbor.append(index)
            key_nodes = [index] * len(neighbor)
            g.add_edges(key_nodes, neighbor)
        return g

    train_ds = None
    dataInfo = self.load(paths)
    # pop nodes in train graph in inductive setting
    if config.mode == "test" and self.setting == "inductive":
        dataInfo.datasets.pop("train")

    graph_strut_dict = {}
    for key, ds in dataInfo.datasets.items():
        for ins in ds:
            graph_strut_dict[ins["paper_id"]] = ins
    logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

    for key, ds in dataInfo.datasets.items():
        # process summary
        ds.apply(lambda x: x['abstract'].split(), new_field_name='summary_wd')
        ds.apply(lambda x: sent_tokenize(x['abstract']), new_field_name='abstract_sentences')
        # generate graph
        ds.apply(lambda x: generate_graph_structs(x["paper_id"]), new_field_name="graph_struct")
        ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]), new_field_name='graph_inputs_wd')
        ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1, new_field_name="nodes_num")
        # pad input
        ds.apply(lambda x: generate_article_input(x['introduction'], x["graph_inputs_wd"]),
                 new_field_name='input_wd')
        ds.apply(lambda x: _article_token_mask(x["input_wd"]), new_field_name="enc_len_mask")
        ds.apply(lambda x: sum(x["enc_len_mask"]), new_field_name="enc_len")
        ds.apply(lambda x: _pad_article(x["input_wd"]), new_field_name="pad_input_wd")
        ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]), new_field_name="nbr_inputs_len")
        ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]), new_field_name="pad_graph_inputs_wd")
        if key == "train":
            train_ds = ds

    vocab_dict = {}
    if not load_vocab_file:
        logger.info("[INFO] Build new vocab from training dataset!")
        if train_ds is None:
            raise ValueError("Lack train file to build vocabulary!")
        vocabs = Vocabulary(max_size=config.vocab_size - 2, padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
        vocabs.from_dataset(train_ds, field_name=["input_wd", "summary_wd"])
        vocabs.add_word(START_DECODING)
        vocabs.add_word(STOP_DECODING)
        vocab_dict["vocab"] = vocabs
        # save vocab
        with open(os.path.join(config.train_path, "vocab"), "w", encoding="utf8") as f:
            for w, idx in vocabs:
                f.write(str(w) + "\t" + str(idx) + "\n")
        logger.info("build new vocab ends.. please reRun the code with load_vocab = True")
        exit(0)
    else:
        logger.info("[INFO] Load existing vocab from %s!" % config.vocab_path)
        word_list = []
        cnt = 3  # pad and unk
        if config.neighbor_process == "sep":
            cnt += 1
        with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
            for line in vocab_f:
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break
        vocabs = Vocabulary(max_size=vocab_size, padding=PAD_TOKEN, unknown=UNKNOWN_TOKEN)
        vocabs.add_word_lst(word_list)
        vocabs.add(START_DECODING)
        vocabs.add(STOP_DECODING)
        if config.neighbor_process == "sep":
            vocabs.add(SEP)
        vocabs.build_vocab()
        vocab_dict["vocab"] = vocabs

    logger.info(f"vocab size = {len(vocabs)}")
    assert len(vocabs) == config.vocab_size
    dataInfo.set_vocab(vocabs, "vocab")

    for key, dataset in dataInfo.datasets.items():
        # do not process the training set in test mode
        if config.mode == "test" and key == "train":
            continue
        data_dict = {
            "enc_input": [],
            "nbr_inputs": [],
            "graph": [],
            "dec_input": [],
            "target": [],
            "dec_len": [],
            "article_oovs": [],
            "enc_input_extend_vocab": [],
        }
        logger.info(f"start construct the input of the model for {key} set, please wait...")
        for instance in dataset:
            graph_inputs = instance["pad_graph_inputs_wd"]
            abstract_sentences = instance["summary_wd"]
            enc_input = instance["pad_input_wd"]
            enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                getting_full_info(enc_input, graph_inputs, abstract_sentences,
                                  dataInfo.vocabs['vocab'], config)
            graph = generate_dgl_graph(instance["paper_id"], instance["graph_struct"],
                                       instance["nodes_num"])
            data_dict["graph"].append(graph)
            data_dict["enc_input"].append(enc_input)
            data_dict["nbr_inputs"].append(nbr_inputs)
            data_dict["dec_input"].append(dec_input)
            data_dict["target"].append(target)
            data_dict["dec_len"].append(dec_len)
            data_dict["article_oovs"].append(article_oovs)
            data_dict["enc_input_extend_vocab"].append(enc_input_extend_vocab)

        dataset.add_field("enc_input", data_dict["enc_input"])
        dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
        dataset.add_field("dec_input", data_dict["dec_input"])
        dataset.add_field("target", data_dict["target"])
        dataset.add_field("dec_len", data_dict["dec_len"])
        dataset.add_field("article_oovs", data_dict["article_oovs"])
        dataset.add_field("enc_input_extend_vocab", data_dict["enc_input_extend_vocab"])
        dataset.add_field("graph", data_dict["graph"])
        dataset.set_ignore_type('graph')  # without this line, there may be some errors
        dataset.set_input("graph")

        dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len", "enc_input",
                          "enc_len_mask", "dec_input", "dec_len", "article_oovs",
                          "nodes_num", "enc_input_extend_vocab")
        dataset.set_target("target", "article_oovs", "abstract_sentences")

        dataset.delete_field('graph_inputs_wd')
        dataset.delete_field('pad_graph_inputs_wd')
        dataset.delete_field('input_wd')
        dataset.delete_field('pad_input_wd')

    logger.info("------load dataset over---------")
    return dataInfo, vocabs
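# Toy illustration of the k-hop BFS used by _k_hop_neighbor above, on a
# hand-made citation dict (hypothetical data; the real code walks
# graph_strut_dict built from the datasets, and additionally caps the
# total node count with max_neighbor):
from collections import deque

refs = {"A": ["B", "C"], "B": ["D"], "C": [], "D": []}

def k_hop(paper_id, n_hop, refs):
    sub_graph = [[] for _ in range(n_hop + 1)]
    visited = {paper_id}
    q = deque([(paper_id, 0)])
    while q:
        pid, level = q.popleft()
        if level > n_hop:
            break
        sub_graph[level].append(pid)
        for nbr in refs.get(pid, []):
            if nbr not in visited:
                visited.add(nbr)
                q.append((nbr, level + 1))
    return sub_graph

assert k_hop("A", 2, refs) == [["A"], ["B", "C"], ["D"]]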
def test_add_word(self):
    vocab = Vocabulary()
    for word in text:
        vocab.add_word(word)
    self.assertEqual(vocab.word_count, counter)
def test_only_use_pretrain_word(self):
    def check_word_unk(words, vocab, embed):
        for word in words:
            self.assertListEqual(
                embed(torch.LongTensor([vocab.to_index(word)])).tolist()[0],
                embed(torch.LongTensor([1])).tolist()[0])

    def check_vector_equal(words, vocab, embed, embed_dict, lower=False):
        for word in words:
            index = vocab.to_index(word)
            v1 = embed(torch.LongTensor([index])).tolist()[0]
            if lower:
                word = word.lower()
            v2 = embed_dict[word]
            for v1i, v2i in zip(v1, v2):
                self.assertAlmostEqual(v1i, v2i, places=4)

    embed_dict = read_static_embed('test/data_for_tests/embedding/small_static_embedding/'
                                   'glove.6B.50d_test.txt')

    # Check that only pretrained words are used.
    vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
    vocab.add_word('of', no_create_entry=True)
    embed = StaticEmbedding(vocab,
                            model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/'
                                              'glove.6B.50d_test.txt',
                            only_use_pretrain_word=True)
    # 'notinfile' should be mapped to unk.
    check_vector_equal(['the', 'a', 'of'], vocab, embed, embed_dict)
    check_word_unk(['notinfile'], vocab, embed)

    # Check behaviour with mixed casing.
    vocab = Vocabulary().add_word_lst(['The', 'a', 'notinfile'])
    vocab.add_word('Of', no_create_entry=True)
    embed = StaticEmbedding(vocab,
                            model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/'
                                              'glove.6B.50d_test.txt',
                            only_use_pretrain_word=True)
    check_word_unk(['The', 'Of', 'notinfile'], vocab, embed)  # these words should not be found
    check_vector_equal(['a'], vocab, embed, embed_dict)

    embed = StaticEmbedding(vocab,
                            model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/'
                                              'glove.6B.50d_test.txt',
                            only_use_pretrain_word=True, lower=True)
    check_vector_equal(['The', 'Of', 'a'], vocab, embed, embed_dict, lower=True)
    check_word_unk(['notinfile'], vocab, embed)

    # Check min_freq.
    vocab = Vocabulary().add_word_lst(['The', 'a', 'notinfile1', 'A', 'notinfile2', 'notinfile2'])
    vocab.add_word('Of', no_create_entry=True)
    embed = StaticEmbedding(vocab,
                            model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/'
                                              'glove.6B.50d_test.txt',
                            only_use_pretrain_word=True, lower=True,
                            min_freq=2, only_train_min_freq=True)
    check_vector_equal(['Of', 'a'], vocab, embed, embed_dict, lower=True)
    check_word_unk(['notinfile1', 'The', 'notinfile2'], vocab, embed)
class TextData():
    vocab_size = 0
    dataset_size = 0
    train_size = 0
    test_size = 0
    class_num = 4
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    data_src = "20news"
    data_set = DataSet()
    train_set = DataSet()
    test_set = DataSet()
    dev_set = DataSet()
    vocab = None

    def __init__(self, data_src="20news", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_20news(self, size=4):
        print("Loading 20newsgroups data and tokenize.")
        if size == 20:
            train, test = get_all_20news()
        else:
            train, test = get_text_classification_datasets()
        train_input, test_input = tokenize(train.data, test.data)
        train_target = train.target
        test_target = test.target
        self.class_num = len(train.target_names)
        assert (self.class_num == len(test.target_names))

        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        self.train_set = DataSet({"text": train_input, "class": train_target})
        self.test_set = DataSet({"text": test_input, "class": test_target})

        # Building Fastnlp vocabulary.
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(lambda x: [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)

        # Building id-representation for train_set and test_set.
        print("Building id-representation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set, self.test_set,
                                 field_name='text', new_field_name='words')
        self.train_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.test_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len, field_name='words')
        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, self.seq_limit)
        self.train_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        self.test_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        # self.train_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name="input")
        # self.test_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name='input')

        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        self.test_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        # self.train_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")
        # self.test_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")

    def fetch_csv(self, path=None):
        print("Not implemented now...")
        pass

    def fetch_data(self, path=None):
        if self.data_src == "20news":
            # Loading 20newsgroups data and tokenize.
            self.fetch_20news()
        elif self.data_src == "20news_all":
            self.fetch_20news(size=20)
        else:
            print("No data src...")
        self.train_size = self.train_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size, self.test_size
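# Usage sketch for the 20news TextData wrapper above (assumes the helpers it
# calls, e.g. get_text_classification_datasets() and tokenize(), are
# importable from this project; both are defined elsewhere):
data = TextData(data_src="20news", min_count=10, seq_limit=1000)
train_size, test_size = data.fetch_data()
print(data.vocab_size, data.max_seq_len, train_size, test_size)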
class TextData():
    data_src = "all_data"
    class_num = 2
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    train_set = DataSet()
    val_set = DataSet()
    test_set = DataSet()
    train_size = 0
    val_size = 0
    test_size = 0
    test_projectid = None
    vocab = None
    vocab_size = 0

    def __init__(self, data_src="all_data", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_csv(self, path, text_var="essay", target="is_exciting",
                  subset_num=None, us_rate=None, os_rate=None):
        """
        us_rate: under sampling rate
        os_rate: over sampling rate
        """
        print("Loading data from {} ...".format(path))
        df = pd.read_csv(path)
        # text_vars = ["title", "short_description", "need_statement", "essay"]
        text_vars = text_var  # only select the essay column
        target_var = "y"
        df[target_var] = 0.0
        df[target_var][df[target] == "t"] = 1.0
        df[target_var][df[target] != "t"] = 0.0

        train_df = df[df['split'] == 'train']
        val_df = df[df['split'] == 'val']
        test_df = df[df['split'] == 'test']
        train_num = len(train_df)
        val_num = len(val_df)
        test_num = len(test_df)
        print("nums:({},{},{})".format(train_num, val_num, test_num))

        if os_rate is not None:
            print("Over Sample mode")
            ros = RandomOverSampler(random_state=0)
        elif us_rate is not None:
            print("Under Sample mode")
            train_df_t = train_df[df[target] == "t"]
            train_df_f = train_df[df[target] == "f"]
            t_num = len(train_df_t)
            f_num = len(train_df_f)
            print("Raw train t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_train_t = train_df_t.sample(n=t_num)
            balanced_train_f = train_df_f.sample(n=f_num)
            train_df = pd.concat([balanced_train_t, balanced_train_f]).sample(frac=1)
            print("Balanced train: t:f = {}:{}".format(len(balanced_train_t), len(balanced_train_f)))
            # print("Train 1.0:", len(train_df[train_df[target_var] == 1.0]))

            val_df_t = val_df[df[target] == "t"]
            val_df_f = val_df[df[target] == "f"]
            t_num = len(val_df_t)
            f_num = len(val_df_f)
            print("Raw val t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_val_t = val_df_t.sample(n=t_num)
            balanced_val_f = val_df_f.sample(n=f_num)
            val_df = pd.concat([balanced_val_t, balanced_val_f]).sample(frac=1)
            print("Balanced val: t:f = {}:{}".format(len(balanced_val_t), len(balanced_val_f)))
        else:
            print("No sample mode")

        if subset_num is not None and subset_num > 0:
            print("Get sub set of size {}.".format(subset_num))
            train_df = train_df.sample(n=subset_num)
            val_df = val_df.sample(n=subset_num)
            train_num = len(train_df)
            val_num = len(val_df)
            test_num = len(test_df)
            print("subset nums:({},{},{})".format(train_num, val_num, test_num))

        train_target = train_df[target_var].values
        val_target = val_df[target_var].values
        test_target = test_df[target_var].values
        print("tokenize train set")
        train_input = tokenize(train_df[text_vars].values)
        print("tokenize val set")
        val_input = tokenize(val_df[text_vars].values)
        print("tokenize test set")
        test_input = tokenize(test_df[text_vars].values)
        assert (self.class_num == 2)
        self.test_projectid = test_df['projectid']

        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        if os_rate is not None:
            print("Over Sampling...")
            train_input, train_target = ros.fit_sample(
                np.array(train_input)[:, np.newaxis],
                np.array(train_target)[:, np.newaxis])
            train_input = train_input.squeeze().tolist()
            train_target = train_target.tolist()
            val_input, val_target = ros.fit_sample(
                np.array(val_input)[:, np.newaxis],
                np.array(val_target)[:, np.newaxis])
            val_input = val_input.squeeze().tolist()
            val_target = val_target.tolist()
        self.train_set = DataSet({"text": train_input, "class": train_target})
        self.val_set = DataSet({"text": val_input, "class": val_target})
        self.test_set = DataSet({"text": test_input, "class": test_target})

        # Building Fastnlp vocabulary.
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(lambda x: [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)

        # Building id-representation for train_set, val_set and test_set.
        print("Building id-representation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set, self.val_set, self.test_set,
                                 field_name='text', new_field_name='words')
        self.train_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.val_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.test_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len, field_name='words')
        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, self.seq_limit)
        self.train_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        self.val_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        self.test_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        # self.train_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name="input")
        # self.val_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name='input')
        # self.test_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name='input')

        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        self.val_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        self.test_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        # self.train_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")
        # self.test_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")

    def fetch_data(self, path, text_var="essay", target_var="is_exciting",
                   subset_num=None, us_rate=None, os_rate=None):
        if self.data_src == "all_data":
            self.fetch_csv(path, text_var, target_var, subset_num, us_rate, os_rate)
        else:
            print("No legal data src type:{} ...".format(self.data_src))
            assert (0 == 1)
        self.train_size = self.train_set.get_length()
        self.val_size = self.val_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size, self.val_size, self.test_size