def test():
    """Score the held-out tail of the corpus and hand mispredictions to error_analysis."""
    all_pairs = tp.loadPickle(corpus_save_file)
    # same slice the test set was built with: the last 1999 pairs
    held_out = all_pairs[-2000:-1]
    features = tp.loadPickle('./test_X')
    gold = tp.loadPickle('./test_Y')
    raw_scores = predict(features, gold)
    predicted = [np.argmax(row) for row in raw_scores]
    expected = [np.argmax(row) for row in gold]
    error_analysis(predicted, expected, held_out)
def output_neg_pos():
    """Split the cached corpus into positive and negative cases and dump each to a file.

    Pairs whose label is the string '1' go to ./pos_case, everything else to
    ./neg_case, one pair (its str() form) per line.
    """
    pairs = tp.loadPickle(corpus_save_file)
    # FIX: the original opened/closed the files manually, so an exception while
    # writing leaked both handles; context managers guarantee closure.
    with open('./pos_case', 'w') as pos_f, open('./neg_case', 'w') as neg_f:
        for pair in pairs:
            # label is compared as a string -- that is how it is stored upstream
            if pair.label == '1':
                pos_f.write(str(pair) + '\n')
            else:
                neg_f.write(str(pair) + '\n')
def gen_templates():
    """Mine sentence templates from the corpus and persist them as a trie."""
    tok = tp.loadPickle('../ATEC/tokenizer')
    finder = Template_Finder(tok.tokenize, window=3)
    finder.train(F())
    finder.find(F())
    # keep only templates that actually carry information
    useful = {tpl: score for tpl, score in finder.templates.items() if not tpl.is_trivial()}
    trie = XTrie()
    for tpl, score in useful.items():
        trie[tuple(tpl.words)] = score
    tp.savePickle(trie, '../ATEC/templates')
def output_pari(file):
    """Write every corpus pair to *file*: first sentence, then second, one per line."""
    corpus = tp.loadPickle(corpus_save_file)
    with open(file, 'w') as out:
        for pair in corpus:
            for sentence in (pair.first_sen, pair.second_sen):
                out.write(sentence + '\n')
from collections import Counter

from utils.structures import TrieTree, TrieTreeNode
import utils.textProcess as tp
from utils.nlp_zero import Tokenizer

# jieba.add_word("花呗", freq=100000)
# jieba.add_word("借呗", freq=100000)
# jieba.add_word("外卖", freq=100000)
# jieba.add_word("闲鱼", freq=100000)
# jieba.add_word("更改", freq=100000)

# Raw ATEC training CSVs and the locations of the processed artefacts.
csv_file = './data/atec_nlp_sim_train.csv'
csv_file2 = './data/atec_nlp_sim_train_add.csv'
word_dic_file = './data/processed_data/word_dic.dic'
char_dic_file = './data/processed_data/char_dic.dic'
corpus_save_file = './data/processed_data/corpus'

tokenizer = tp.loadPickle('./tokenizer')


class Pair:
    """One labelled sentence pair from the ATEC similarity corpus."""

    # BUG FIX: the original defined __init__ twice; the second definition
    # silently replaced the first, making the no-argument form dead code.
    # Merged into one __init__ with defaults so both call styles work.
    def __init__(self, id=-1, sen1=None, sen2=None, label=0):
        self.id = id
        # '***' is the corpus' masking token; replace with the single-char '&'
        self.first_sen = sen1.replace('***', '&') if sen1 is not None else None
        self.second_sen = sen2.replace('***', '&') if sen2 is not None else None
        self.label = label

    def cut_word(self):
        # NOTE(review): `res` and `fout` are not defined in the visible part of
        # this method -- the middle of its body appears to be missing from this
        # chunk; confirm against the full file before relying on this.
        fout.write(u' '.join(res).encode('utf-8') + '\n')


def gen_templates():
    """Mine sentence templates from the corpus and persist them as a trie."""
    tokenizer = tp.loadPickle('../ATEC/tokenizer')
    f = Template_Finder(tokenizer.tokenize, window=3)
    f.train(F())
    f.find(F())
    templates = f.templates
    # keep only templates that actually carry information
    templates = {i: j for i, j in templates.items() if not i.is_trivial()}
    trie = XTrie()
    for i, j in templates.items():
        trie[tuple(i.words)] = j
    tp.savePickle(trie, '../ATEC/templates')


if __name__ == '__main__':
    # gen_templates()
    tokenizer = tp.loadPickle('../ATEC/tokenizer')
    templates = tp.loadPickle('../ATEC/templates')
    start = time.time()
    p = Parser(templates, tokenizer.tokenize)
    tree1 = p.parse(u'我的花呗是以前的手机号码,怎么更改成现在的支付宝的号码手机号').plot()
    tree2 = p.parse(u'怎么更改花呗手机号码').plot()
    end = time.time()
    # FIX: `print tree1` / `print tree2` were Python 2 statement syntax while
    # the line below already used print() as a function; normalised both
    # (single-argument print(x) behaves identically on py2 and py3).
    print(tree1)
    print(tree2)
    print(end - start)
    # tp.savePickle(templates, './data/templates')
return acc, precision, recall, f1 def extra_train_data(train_X, train_Y, count=100000): length = len(train_X) for i in range(count): f = random.randint(0, length) s = random.randint(0, length) if f != s: train_X.append([train_X[f][0], train_X[s][0]]) train_Y.append([1.0, 0.0]) return train_X, train_Y # corpus = tp.loadPickle(corpus_save_file) index_dic = tp.loadPickle(char_dic_file) # train_X, train_Y = build_train_set() # train_Y = [[i] for i in train_Y] # train_Y = tp.oneHotLabels(train_Y, 1) # test_X = train_X[-2000:-1] # test_Y = train_Y[-2000:-1] # train_X = train_X[0:-2000] # train_Y = train_Y[0:-2000] # tp.savePickle(train_X,'./train_X') # tp.savePickle(train_Y,'./train_Y') # tp.savePickle(test_X,'./test_X') # tp.savePickle(test_Y,'./test_Y') def train():