def testMAP():
    result_dic = pickleload("./modelsave/pyramidModel0_predict.pkl", "./modelsave/result_dic.pkl")
    true_label_dic = pickleload("./modelsave/pyramidModel0_true.pkl", "./modelsave/true_label_dic.pkl")
    keys = result_dic.keys()
    MAPS = 0
    precisions = 0
    recalls = 0
    for key in keys:
        out = torch.cat(result_dic[key], dim=0)
        print(out)
        print(true_label_dic[key])
        predict_index = torch.topk(out, 2, dim=0)[1].squeeze(1).data.numpy()
        print("Predicted labels:", predict_index)
        print("-------------------------------------")
        precision, recall, MAP = cal_MAP(true_label_dic[key], predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall
    print(len(keys))
    MAPS /= len(keys)
    precisions /= len(keys)
    recalls /= len(keys)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
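# cal_MAP is used throughout this file but defined elsewhere in the repo.
# A minimal sketch of what it presumably computes, assuming `true_label`
# holds the indices of relevant citations and `predict_index` is an ordered
# ranking of predicted indices (this is a hypothetical reading, not the
# repo's actual implementation):
def cal_MAP(true_label, predict_index):
    true_set = set(true_label)
    hits = 0
    ap = 0.0
    for rank, idx in enumerate(predict_index, start=1):
        if idx in true_set:
            hits += 1
            ap += hits / rank  # precision at each relevant hit
    precision = hits / len(predict_index) if len(predict_index) else 0.0
    recall = hits / len(true_set) if true_set else 0.0
    MAP = ap / len(true_set) if true_set else 0.0
    return precision, recall, MAP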
def random_baseline():
    # Renamed from `random` to avoid shadowing the stdlib module at module scope.
    import random
    idf_dic = pickleload("../data2/idf.pkl", "idf.pkl")
    random.seed(1)  # was `random.seed = 1`, which overwrote the seed function instead of seeding
    datas = pickleload("../data2/train_data3.pkl", "../data/train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    result_lis = []
    all_count = 0
    true_count = 0
    for data in tqdm(datas):
        target_content = data["target_tokens"]
        citations = data["citations_tokens"]
        citation_target_dict = dict()
        index = 0
        # print(len(citations))
        new_citations = []
        for citation in citations:
            sel_target = citation["target_tokens"]
            citation_target_dict[index] = sel_target
            index += 1
            new_citations.append(sel_target)
        # print(len(citation_content_dict))
        random_index = random.randint(0, len(citation_target_dict) - 1)
        ref = getTopVsmScore(idf_dic, target_content, new_citations)
        all_count += 1
        if random_index == ref:
            true_count += 1
    print(true_count / all_count)
def test_languageModel(args):
    args.dropout = 0.0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    # dev_data = data[2000: 4000]
    batch = Batch(args)
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)
    dev_batches = batch.lm_dev_batch(dev_data, args.context_limit)
    log_msg = "Input vocabulary size: %d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)
    transform = Transformer(args, input_vec)
    transform.load_state_dict(torch.load("./modelsave/" + "TransformModel0.pkl"))
    if torch.cuda.is_available():
        transform = transform.cuda()
    # Print parameters:
    log_msg = "Model name: %s \n" % (args.loadmodelName)
    logger.info(log_msg)
    print(log_msg)
    result_dic = {}
    true_label_dic = {}
    all_count = 0
    right_count = 0
    loss_func = torch.nn.NLLLoss()
    loss = 0
    for dev_step, dev_batch in enumerate(dev_batches):
        context_idxs = dev_batch['context_idxs']
        seg_indexs = dev_batch['seg_indexs']
        cit_targets = dev_batch['cit_targets']
        targets = dev_batch['targets']
        target_indexs = dev_batch['target_indexs']
        ref_labels = dev_batch['ref_labels']
        id = dev_batch['id']
        print(id)
        context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()
        context_idxs = Variable(context_idxs).cuda()
        seg_indexs = Variable(seg_indexs).cuda()
        targets = Variable(targets).cuda()
        out1, out2 = transform.forward(context_idxs, seg_indexs, context_mask, target_indexs)
        # print(out)
        # Accumulate the mean NLL loss per batch; the original divided the
        # running total by the batch size on every step, skewing the average.
        batch_loss = 0
        for i in range(out1.size(0)):
            batch_loss += loss_func(out1[i], targets[i])
        loss += batch_loss.item() / out1.size(0)
        all_count += 1
        del out1, out2
    print(loss / all_count)
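# The mask helper `function` used to build context_mask is not defined in
# this file. A minimal sketch under the assumption that index 0 is the
# <padding> id (per getWord2index below) and real tokens get 1.0; the exact
# polarity depends on how the Transformer consumes the mask:
def function(idx):
    return 0.0 if idx == 0 else 1.0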
def __init__(self, args):
    self.OOV = 0
    self.OOD = 1  # unknown-token id
    self.word2index = pickleload(args.word2index_pkl, "word2index")
    self.stoplis = {}
    for word in stopwords.words("english"):
        self.stoplis[word] = 1
def matplotDataDisplay():
    import matplotlib.pyplot as plt
    datas = pickleload("./data2/random_train_data.pkl", "./data2/random_train_data.pkl")
    key_dic = {}
    for i in range(len(datas)):
        citations = datas[i]['citations_tokens']
        count = 0
        for j in range(len(citations)):
            if citations[j]['label'] == 1:
                count += 1
        if count not in key_dic:
            key_dic[count] = 1
        else:
            key_dic[count] += 1
    new_key_dic = sorted(key_dic.items(), key=lambda item: item[0], reverse=False)
    print(new_key_dic)
    name_list = []
    num_list = []
    for key, value in new_key_dic:
        name_list.append(str(key))
        num_list.append(value)
    plt.bar(range(len(name_list)), num_list, color='grey', tick_label=name_list)
    plt.xlabel('The number of the alternative citations')
    plt.ylabel('Number of the instances')
    plt.show()
def getWord2vecData():
    '''
    Input format:
    [
        {
            "citStr": "",              # cited author and year
            "context": "",             # the full citation context
            "up_source_tokens": "",
            "down_source_tokens": "",
            "target_tokens": "",
            "citations": [
                {
                    "up_source_tokens": "",
                    "down_source_tokens": "",
                    "target_tokens": ""
                }
                ...
            ]
        }
        ......
    ]
    Writes each processed context plus target out as one line of word2vec training text.
    :return:
    '''
    datas = pickleload("../data2/train_data.pkl", "./data2/train_data.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"])
        up_content = data_process(data['up_source_tokens'])
        down_content = data_process(data['down_source_tokens'])
        writeFile('./word2vec/train_word2vec.txt',
                  up_content + " " + target + " " + down_content + "\n")
def getZooEmbedding():
    source_embedding = pickleload("./word2vec/glove_300.pkl", "glove_300.pkl")
    for i in range(len(source_embedding)):
        print(i)
        str_embedding = [str(j) for j in source_embedding[i]]
        writefile("./match_zoo_data/embedding_dic.txt",
                  str(i) + " " + " ".join(str_embedding) + "\n")
def findBleuSimilar():
    '''
    Input: the same nested citation structure as getWord2vecData().
    Picks the candidate citation most similar to the target by BLEU-1.
    :return:
    '''
    datas = pickleload("./data2/train_data.pkl", "./data2/train_data.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    result_lis = []
    count = 0
    for data in datas:
        target = data["target_tokens"].split(" ")
        # Score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        for index in range(len(citations)):
            citation = citations[index]
            cit_target = citation["target_tokens"].split(" ")
            score = test_bleu(" ".join(cit_target), " ".join(target), 1)
            scores.append(score)
        new_score = sorted(scores, reverse=True)
        # if new_score[0] < 0.5 and new_score[0] != 1:
        #     continue
        best_index = scores.index(new_score[0])
        predict = citations[best_index]['target_tokens']
        result_dic = OrderedDict()
        result_dic["cand_answer"] = predict
        result_dic["ref_answer"] = data["target_tokens"]
        result_lis.append(result_dic)
        count += 1
        print("score:", new_score[0])
        print("Reference:", data["target_tokens"])
        print("Predicted:", predict)
        print("-------------------------------------------------------")
    print(count)
    jsonsave('./rougetest/data/target_data.json', result_lis, "result_lis")
    test_score("./rougetest/data/target_data.json", n_size=1)
    test_score("./rougetest/data/target_data.json", n_size=2)
    test_score("./rougetest/data/target_data.json", n_size=3)
    test_score("./rougetest/data/target_data.json", n_size=4)
def getSingleTrainData():
    '''
    Input: the same nested citation structure as getWord2vecData().
    Builds single (context, citation) training pairs scored by BLEU-1.
    :return:
    '''
    datas = pickleload("../data2/train_data.pkl", "./data2/train_data.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    new_datas = copy(datas)
    train_datas = []
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"])
        # Score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        for index in range(len(citations)):
            citation = citations[index]
            cit_target = data_process(citation["target_tokens"])
            score = test_bleu(cit_target, target, 1)
            scores.append(score)
            new_datas[i]['citations_tokens'][index]["bleu1_score"] = score
            dic = {}
            dic['up_source'] = data_process(data["up_source_tokens"])
            dic['down_source'] = data_process(data["down_source_tokens"])
            dic['target'] = data_process(data["target_tokens"])
            dic['cit_up_source'] = data_process(citation['up_source_tokens'])
            dic['cit_down_source'] = data_process(citation['down_source_tokens'])
            dic['cit_target'] = data_process(citation['target_tokens'])
            dic['bleu1_score'] = score
            if score == 1:  # skip exact duplicates of the target
                continue
            train_datas.append(copy(dic))
    print("Number of training samples:", len(train_datas))
    picklesave(train_datas, "./train_data/single_train_data.pkl", "single_train_data.pkl")
def getRandomData():
    import numpy as np
    datas = pickleload("../data2/train_data2.pkl", "./data2/train_data2.pkl")
    new_datas = []
    ids = range(len(datas))
    permutation = np.random.permutation(ids)
    for i, id in enumerate(permutation):
        new_datas.append(datas[id])
    picklesave(new_datas, "../data2/random_train_data.pkl", "./data2/random_train_data.pkl")
def displayData(topn=1):
    result_dic = pickleload("./modelsave/pyramidModel0_predict.pkl", "./modelsave/result_dic.pkl")
    true_label_dic = pickleload("./modelsave/pyramidModel0_true.pkl", "./modelsave/true_label_dic.pkl")
    keys = result_dic.keys()
    blues = 0
    rouges = 0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    id = 0
    for key in keys:
        up_source_tokens = dev_data[id]["up_source_tokens"]
        target = dev_data[id]["target_tokens"]
        citations = dev_data[id]["citations_tokens"]
        out = torch.cat(result_dic[key], dim=0)
        print("up_source_tokens:", up_source_tokens)
        print("Target:", target)
        predict_index = torch.topk(out, topn, dim=0)[1].squeeze(1).data.numpy()
        bleu = 0
        rouge = 0
        for index in predict_index:
            alternative_citation = citations[index]["target_tokens"]
            if len(target.strip().split(" ")) < 5 or len(alternative_citation.strip().split(" ")) < 5:
                continue
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
            print("Candidate citation:", alternative_citation)
            print("--------------------------")
        bleu = bleu / topn
        rouge = rouge / topn
        blues += bleu
        rouges += rouge
        # print("-----------------------------------------------------")
        id += 1
    blues /= len(keys)
    rouges /= len(keys)
    print("bleu", topn, ":", blues)
    print("rouge:", rouges)
def testRetrievalModelResult(topn=5):
    with open("./result/predict.test.arcii_ranking.txt") as fp:
        lines = fp.readlines()
    result_lis = {}
    last_name = ""
    for line in lines:
        results = line.replace("\n", "").split(" ")
        if last_name != results[0]:
            last_name = results[0]
            result_lis[last_name] = []
        result_lis[last_name].append(int(results[2].split("_")[-1]))
    predict_indexs = [value for key, value in result_lis.items()]
    MAPS = 0
    precisions = 0
    recalls = 0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    bleus = 0
    rouges = 0
    for id in range(len(dev_data)):
        target = dev_data[id]["target_tokens"]
        citations = dev_data[id]['citations_tokens']
        true_label = []
        predict_index = predict_indexs[id][0:topn]
        for i in range(len(citations)):
            citation = citations[i]
            if citation['label'] == 1:
                true_label.append(i)
        bleu = 0
        rouge = 0
        for predict in predict_index:
            # print(predict)
            alternative_citation = citations[predict]["target_tokens"]
            if len(target.strip().split(" ")) < 5 or len(alternative_citation.strip().split(" ")) < 5:
                continue
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
            print(bleu)
            print("------------------------")
        bleus += bleu / len(predict_index)
        rouges += rouge / len(predict_index)
        precision, recall, MAP = cal_MAP(true_label, predict_index)
        precisions += precision
        recalls += recall
        MAPS += MAP
    MAPS /= len(predict_indexs)
    precisions /= len(predict_indexs)
    recalls /= len(predict_indexs)
    bleus /= len(dev_data)
    rouges /= len(dev_data)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
    print("bleu", topn, ":", bleus)
    print("rouge:", rouges)
def getWord2index():
    '''
    Input: the same nested citation structure as getWord2vecData().
    Builds the word2index vocabulary from all tokens occurring more than once.
    :return:
    '''
    datas = pickleload("../data2/train_data.pkl", "./data2/train_data.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    tokenDic = {}
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"]).split(" ")
        up_source = data_process(data["up_source_tokens"]).split(" ")
        down_source = data_process(data["down_source_tokens"]).split(" ")
        word_lis = target + up_source + down_source
        for token in word_lis:
            if token not in tokenDic:
                tokenDic[token] = 1
            else:
                tokenDic[token] += 1
    index = 2
    word2index = {}
    for key, value in tokenDic.items():
        if value > 1:
            word2index[key] = index
            index += 1
    word2index['<padding>'] = 0
    word2index['<unknow>'] = 1
    word2index['<CLS>'] = index
    word2index['<DSP>'] = index + 1
    word2index['<MASK>'] = index + 2
    print(len(word2index), " / ", len(tokenDic), "tokens")
    picklesave(word2index, './word_vec/word2index.pkl', "word2index.pkl")
def statistical_data():
    datas = pickleload("../data2/train_data2.pkl", "./data2/train_data.pkl")
    label_num_dic = {}
    for data in datas:
        citations = data['citations_tokens']
        count = 0
        for citation in citations:
            if citation["label"] == 1:
                count += 1
        if count not in label_num_dic:
            label_num_dic[count] = 1
        else:
            label_num_dic[count] += 1
    for key, value in label_num_dic.items():
        print(key, " : ", value)
def getDataDisplay():
    datas = pickleload("./data2/random_train_data.pkl", "./data2/random_train_data.pkl")
    string = "A different strategy is presented in Fung and Chen ( 2004 ) , where English FrameNet entries are mapped to concepts listed in HowNet , an on-line ontology for Chinese , without consulting a parallel corpus . Then , Chinese sentences with predicates instantiating these concepts are found in a monolingual corpus and their arguments are labeled with FrameNet roles . Other work attempts to alleviate the data requirements for semantic role labeling either by relying on unsupervised learning or by extending existing resources through the use of unlabeled data ."
    for i in range(len(datas)):
        if datas[i]['up_source_tokens'] == string:
            citations = datas[i]['citations_tokens']
            print("Target:", datas[i]["target_tokens"])
            for j in range(len(citations)):
                print(j + 1, ":")
                print("up:", citations[j]['up_source_tokens'])
                print("down:", citations[j]['down_source_tokens'])
                print(citations[j]['target_tokens'])
                print(citations[j]['label'])
                print("-------------------------------------------------------")
def getsomeSample():
    datas = pickleload("../data2/train_data.pkl", "./data2/train_data.pkl")
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"])
        up_source_tokens = data_process(data["up_source_tokens"])
        down_source_tokens = data_process(data["down_source_tokens"])
        # Print each candidate citation
        citations = data["citations_tokens"]
        print("up_context:", up_source_tokens)
        print("down_context:", down_source_tokens)
        print("target_citation:", target)
        scores = []
        for index in range(len(citations)):
            citation = citations[index]
            cit_target = data_process(citation["target_tokens"])
            print(index, " citations:", cit_target)
        print("--------------------------------------------")
def manualselect():
    import random
    datas = pickleload("./data2/random_train_data.pkl", "./data2/random_train_data.pkl")
    select_ids = []
    right_count = 0
    wrong_count = 0
    for _ in range(50):
        id = random.randint(0, len(datas) - 1)  # was randint(0, len(datas)), an off-by-one
        while id in select_ids:
            id = random.randint(0, len(datas) - 1)
        select_ids.append(id)
        data = datas[id]
        up_source = data["up_source_tokens"]
        down_source = data["down_source_tokens"]
        target_citation = data['target_tokens']
        citations = data["citations_tokens"]
        print("up_source:", up_source)
        print("down_source:", down_source)
        for index in range(len(citations)):
            if citations[index]['label'] == 1:
                citation = citations[index]["target_tokens"]
                break
        select_lis = [target_citation, citation]
        order = random.randint(0, 1)
        if order == 0:
            print(select_lis[0], "\n", select_lis[1])
            inputs = input("Enter the index of your choice: ")
            if inputs == "0":
                right_count += 1
            elif inputs == "1":
                wrong_count += 1
        else:
            print(select_lis[1], "\n", select_lis[0])
            inputs = input("Enter the index of your choice: ")
            if inputs == "0":
                wrong_count += 1
            elif inputs == "1":
                right_count += 1
    print("right:", right_count)
    print("wrong:", wrong_count)
def manual_label():
    datas = pickleload("../data2/train_data2.pkl", "./data2/train_data.pkl")
    # golden_train_datas = pickleload("../data/golden_train_data.pkl", "./data/golden_train_data.pkl")
    print(len(datas))
    train_datas = []
    flag_pairs = {}
    for i in range(len(datas)):
        data = datas[i]
        target = data_process(data["target_tokens"])
        # Label each candidate citation by hand
        citations = data["citations_tokens"]
        flag = 0
        for index in range(len(citations)):
            citation = citations[index]
            cand_cit = data_process(citation["target_tokens"])
            if cand_cit + target not in flag_pairs.keys():
                print("Progress:", i, "/", len(datas), " ", index, "/", len(citations))
                print("target:", target)
                print("candidate:", cand_cit)
                label = input("Label: ")
                if str(label) == "1":
                    citations[index]['label'] = 1
                    flag = 1
                else:
                    citations[index]['label'] = 0
                flag_pairs[cand_cit + target] = citations[index]['label']
                flag_pairs[target + cand_cit] = citations[index]['label']
            else:
                if flag_pairs[cand_cit + target] == 1:
                    citations[index]['label'] = 1
                    flag = 1
                else:
                    citations[index]['label'] = 0
        # Persist labeling progress after each document
        picklesave(flag_pairs, "../data/flag_pairs.pkl", "./data/flag_pairs.pkl")
        if flag == 1:
            new_data = datas[i]
            new_data["citations_tokens"] = citations
            train_datas.append(new_data)
        picklesave(train_datas, "../data/golden_train_data.pkl", "./data/golden_train_data.pkl")
def getCsvFile():
    datas = pickleload("./train_data/train_data.pkl", "./train_data/train_data.pkl")
    content_source = []
    target = []
    cit_content_source = []
    cit_target = []
    score = []
    for data in tqdm(datas):
        content_source.append(data['up_source'] + ' ' + data['down_source'])
        target.append(data['target'])
        cit_content_source.append(data['cit_up_source'] + ' ' + data['cit_down_source'])
        cit_target.append(data['cit_target'])
        score.append(data['bleu1_score'])
    train_data = pd.concat(
        [pd.DataFrame(data=content_source, columns=['content_source']),
         pd.DataFrame(data=target, columns=['target']),
         pd.DataFrame(data=cit_content_source, columns=['cit_content_source']),
         pd.DataFrame(data=cit_target, columns=['cit_target']),
         pd.DataFrame(data=score, columns=['score'])],
        axis=1)
    train_data.to_csv("./train_data/train_data.csv")
def getIdf():
    datas = pickleload("./data2/train_data.pkl", "./data/train_data.pkl")
    all_count = len(datas)
    print(len(datas))
    tokenidf_dic = {}
    for data in tqdm(datas):
        up_source_tokens = process(data["up_source_tokens"]).split(" ")
        down_source_tokens = process(data["down_source_tokens"]).split(" ")
        target_tokens = process(data["target_tokens"]).split(" ")
        dic = {}
        for token in up_source_tokens + down_source_tokens + target_tokens:
            if token not in dic:
                dic[token] = 1
        for key in dic.keys():
            if key not in tokenidf_dic:
                tokenidf_dic[key] = 1
            else:
                tokenidf_dic[key] += 1
    new_dic = {}
    for key, value in tokenidf_dic.items():
        new_dic[key] = math.log10(all_count / value)  # idf = log10(N / df)
    picklesave(new_dic, './data2/idf.pkl', "idf")
def getTfidf():
    '''
    Fields referenced downstream:
        up_source, down_source, target,
        cit_up_source, cit_down_source, cit_target, bleu1_score
    :return: (corpus size: 189389)
    '''
    train_data = pickleload('../data/train_data.pkl', 'train_data.pkl')
    sentences = []
    for data in tqdm(train_data):
        up_source = data['up_source_tokens']
        down_source = data['down_source_tokens']
        target = data['target_tokens']
        if up_source != "":
            sentences.append(up_source)
        if down_source != "":
            sentences.append(down_source)
        if target != "":
            sentences.append(target)
    # Build the tf-idf vector space
    vec = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9,
        # strip_accents='unicode',
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1)
    print('Fitting tfidf...')
    vec.fit_transform(sentences)
    pickle.dump(vec, open('./svmModelsave/tfidf_12gram.pkl', 'wb'))  # 25453
    dic = vec.vocabulary_
    print(len(dic))
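# A minimal usage sketch (not part of the original file): load the pickled
# vectorizer and rank candidate strings against a query by cosine similarity.
# The example strings are placeholders.
import pickle
from sklearn.metrics.pairwise import cosine_similarity

def rank_with_tfidf():
    vec = pickle.load(open('./svmModelsave/tfidf_12gram.pkl', 'rb'))
    query_mat = vec.transform(["the query context"])
    cand_mat = vec.transform(["candidate one", "candidate two"])
    print(cosine_similarity(query_mat, cand_mat))  # shape (1, n_candidates)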
def findVSMSimilar():
    '''
    Input: the same nested citation structure as getWord2vecData().
    Picks the most similar citation by idf-weighted cosine similarity (VSM).
    :return:
    '''
    datas = pickleload("./data2/random_train_data.pkl", "./data/train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    idf_dic = pickleload("./data2/idf.pkl", "idf.pkl")
    # datas = datas[0:10]
    print(len(idf_dic))
    print(len(datas))
    result_lis = []
    count = 0
    for data in datas:
        up_source_tokens = process(data["up_source_tokens"]).split(" ")
        down_source_tokens = process(data["down_source_tokens"]).split(" ")
        target = process(data["target_tokens"])
        dic = {}
        for token in up_source_tokens:
            if token not in dic:
                dic[token] = 1
            else:
                dic[token] += 1
        for token in down_source_tokens:
            if token not in dic:
                dic[token] = 1
            else:
                dic[token] += 1
        keys = dic.keys()
        sqrt_source = 0.0
        for key in keys:
            if key in idf_dic:
                dic[key] = dic[key] / (len(up_source_tokens) + len(down_source_tokens)) * idf_dic[key]
            else:
                dic[key] = 0
            sqrt_source += dic[key] * dic[key]
        sqrt_source = math.sqrt(sqrt_source)
        # Score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        # if len(citations) < 20:
        #     continue
        count += 1
        for index in range(len(citations)):
            citation = citations[index]
            cit_up_source_tokens = citation["up_source_tokens"].split(" ")
            cit_down_source_tokens = citation["down_source_tokens"].split(" ")
            cit_target = process(citation["target_tokens"]).split(" ")
            cit_dic = {}
            for token in cit_target:
                if token not in cit_dic:
                    cit_dic[token] = 1
                else:
                    cit_dic[token] += 1
            keys = cit_dic.keys()
            sqrt_cit = 0.0
            for key in keys:
                if key in idf_dic:
                    cit_dic[key] = cit_dic[key] / (len(cit_target)) * idf_dic[key]
                else:
                    cit_dic[key] = 0
                sqrt_cit += cit_dic[key] * cit_dic[key]  # was `+= cit_dic[key]`, which broke the L2 norm
            sqrt_cit = math.sqrt(sqrt_cit)
            # Cosine similarity between context and candidate
            dot = 0.0
            for key in dic.keys():
                if key in cit_dic:
                    dot += dic[key] * cit_dic[key]
            denom = sqrt_source * sqrt_cit
            score = dot / denom if denom != 0 else 0.0  # guard against all-OOV vectors
            scores.append(score)
        new_score = sorted(scores, reverse=True)
        best_index = scores.index(new_score[0])
        predict = citations[best_index]['target_tokens']
        result_dic = OrderedDict()
        result_dic["cand_answer"] = predict
        result_dic["ref_answer"] = target
        result_lis.append(result_dic)
        # print("Up context:", data["up_source_tokens"])
        # print("Down context:", data["down_source_tokens"])
        # print("Reference:", target)
        # print("Predicted:", predict)
        # print("-------------------------------------------------------")
    print(count, " / ", len(datas), " / ", count / len(datas))
    jsonsave('./rougetest/data/similar_data.json', result_lis, "result_lis")
    test_score("./rougetest/data/similar_data.json", n_size=1)
    test_score("./rougetest/data/similar_data.json", n_size=2)
    test_score("./rougetest/data/similar_data.json", n_size=3)
    test_score("./rougetest/data/similar_data.json", n_size=4)
def Bm25_similar():
    '''
    Input: the same nested citation structure as getWord2vecData().
    Ranks candidate citations against the context with BM25.
    :return:
    '''
    datas = pickleload("../data2/random_train_data.pkl", "../data2/random_train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    MAPS = 0
    precisions = 0
    recalls = 0
    for data in tqdm(datas):
        target_up_content = process(data["up_source_tokens"]).split(" ")
        target_down_content = process(data["down_source_tokens"]).split(" ")
        target_content = process(data["target_tokens"])
        content_tokens = target_up_content  # + target_down_content
        citations = data["citations_tokens"]
        citation_content_dict = dict()
        citation_target_dict = dict()
        index = 0
        # print(len(citations))
        ref_lis = []
        for citation in citations:
            sel_up_content = process(citation["up_source_tokens"]).split(" ")
            sel_down_content = process(citation["down_source_tokens"]).split(" ")
            sel_target = process(citation["target_tokens"])
            citation_content_tokens = sel_up_content + sel_target.split(" ") + sel_down_content
            citation_content_dict[str(index)] = citation_content_tokens
            citation_target_dict[str(index)] = sel_target
            if citation['label'] == 1:
                ref_lis.append(index)
            index += 1
        pre_lis = getBm25TopSimilar(content_tokens, citation_content_dict, num=5)
        precision, recall, MAP = cal_MAP(ref_lis, pre_lis)
        MAPS += MAP
        precisions += precision
        recalls += recall
    MAPS /= len(datas)
    precisions /= len(datas)
    recalls /= len(datas)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
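# Hypothetical sketch of getBm25TopSimilar -- the real helper is defined
# elsewhere in this repo. Assumed behavior, matching the call site above:
# score each candidate token list with standard BM25 (k1=1.5, b=0.75 are
# conventional defaults, not confirmed by the source) and return the int
# indices of the top-`num` candidates, which is how cal_MAP consumes pre_lis.
import math

def getBm25TopSimilar(query_tokens, doc_dict, num=5, k1=1.5, b=0.75):
    N = len(doc_dict)
    avgdl = sum(len(d) for d in doc_dict.values()) / max(N, 1)
    # document frequency of each distinct query token
    df = {t: sum(1 for d in doc_dict.values() if t in d) for t in set(query_tokens)}
    scores = {}
    for key, doc in doc_dict.items():
        score = 0.0
        for t in set(query_tokens):
            tf = doc.count(t)
            if tf == 0:
                continue
            idf = math.log((N - df[t] + 0.5) / (df[t] + 0.5) + 1)
            score += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(doc) / avgdl))
        scores[key] = score
    ranked = sorted(scores, key=scores.get, reverse=True)[:num]
    return [int(k) for k in ranked]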
def train(args):
    train_data = pickleload('../Retrieval/train_data/single_train_data.pkl', "traindata")
    batch = Batch(args)
    # source_embedding = pickleload(args.source_emb_mat_pkl, "source_emb_mat_pkl")
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)
    train_batches = batch.train_batch(train_data, args.context_limit, args.num_epoches, args.batch_size)
    log_msg = "Input vocabulary size: %d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)
    model = Transformer(args, input_vec)
    if torch.cuda.is_available():
        model = model.cuda()
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)  # the original called this twice; once is enough
    parameters_trainable = list(filter(lambda p: p.requires_grad, model.parameters()))
    if args.optim == "Adadelta":
        optimizer = torch.optim.Adadelta(parameters_trainable, lr=args.learning_rate,
                                         weight_decay=args.init_weight_decay)
    elif args.optim == "Adam":
        optimizer = torch.optim.Adam(parameters_trainable, lr=args.learning_rate,
                                     weight_decay=args.init_weight_decay)
    elif args.optim == "SGD":
        optimizer = torch.optim.SGD(parameters_trainable, lr=args.learning_rate,
                                    weight_decay=args.init_weight_decay)
    if args.loadmodel == True:
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))
    # Print parameters:
    log_msg = "Optimizer: %s\nLearning rate: %s\nHidden size: %s\nModel save name: %s\n" % (
        args.optim, args.learning_rate, args.d_model, args.modelName)
    # print("dropout:", args.dropout)
    logger.info(log_msg)
    print(log_msg)
    set_epoch = 0
    pbar = tqdm(total=len(train_data) * args.num_epoches // args.batch_size + 1)
    loss_func = torch.nn.NLLLoss()
    print_loss_total = 0
    for train_step, (train_batch, epoch) in enumerate(train_batches):
        pbar.update(1)
        context_idxs = train_batch['context_idxs']
        seg_ids = train_batch['seg_indexs']
        target_indexs = train_batch['target_indexs']
        targets = train_batch['targets']
        labels = train_batch['labels']
        # print("up_context_idxs", up_context_idxs)
        # print("down_context_idxs", down_context_idxs)
        # print("target_idxs", target_idxs)
        # print("-----------------------------------------------------")
        context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()
        context_idxs = Variable(context_idxs).cuda()
        seg_ids = Variable(seg_ids).cuda()
        targets = Variable(targets).cuda()
        labels = Variable(labels).cuda()
        out1, out2 = model.forward(context_idxs, seg_ids, context_mask, target_indexs)
        # Get loss
        optimizer.zero_grad()
        # out1: batch * num_target * word_vec
        # out2: batch * 2
        loss1 = 0
        for i in range(out1.size(0)):
            loss1 += loss_func(out1[i], targets[i])
        loss2 = loss_func(out2, labels)
        loss = loss1 / out1.size(0) + loss2
        # Backward propagation
        loss.backward()
        optimizer.step()
        loss_value = loss.data.item()
        print_loss_total += loss_value
        if train_step % 200 == 0:
            log_msg = 'Epoch: %d, Train_step %d loss: %.4f' % (
                epoch, train_step, print_loss_total / 200)  # was /100, inconsistent with the 200-step window
            logger.debug(log_msg)
            print(log_msg)
            print_loss_total = 0
        if epoch == set_epoch:
            set_epoch += 1
            # Save the model after every epoch
            torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    pbar.close()
def all_doubletrainKey(args):
    data = pickleload('../Retrieval/train_data/small_pairs_random_train_data.pkl',
                      "small_pairs_random_train_data")
    dev_data = pickleload("../data2/random_train_data.pkl", "dev_data")
    train_data = data[0] + data[1] + data[2] + data[3]
    dev_data = dev_data[len(dev_data) * 4 // 5:len(dev_data)]
    batch = Batch(args)
    # source_embedding = pickleload(args.source_emb_mat_pkl, "source_emb_mat_pkl")
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)
    train_batches = batch.double_train_batch(train_data, args.context_limit,
                                             args.num_epoches, args.batch_size)
    log_msg = "Input vocabulary size: %d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)
    transform = Transformer(args, input_vec)
    if torch.cuda.is_available():
        transform = transform.cuda()
    transform.load_state_dict(torch.load("./modelsave/" + "TransformModel0.pkl"))
    model = AllClassifyGetKeyWords(args, transform)
    model = model.cuda()
    if args.loadmodel == True:  # the original repeated this load further down; once is enough
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))
    # for param in model.parameters():
    #     param.data.uniform_(-0.08, 0.08)
    parameters_trainable = list(filter(lambda p: p.requires_grad, model.parameters()))
    if args.optim == "Adadelta":
        optimizer = torch.optim.Adadelta(parameters_trainable, lr=args.learning_rate,
                                         weight_decay=args.init_weight_decay)
    elif args.optim == "Adam":
        optimizer = torch.optim.Adam(parameters_trainable, lr=args.learning_rate,
                                     weight_decay=args.init_weight_decay)
    elif args.optim == "SGD":
        optimizer = torch.optim.SGD(parameters_trainable, lr=args.learning_rate,
                                    weight_decay=args.init_weight_decay)
    # Print parameters:
    log_msg = "Optimizer: %s\nLearning rate: %s\nHidden size: %s\nModel save name: %s\n" % (
        args.optim, args.learning_rate, args.d_model, args.modelName)
    # print("dropout:", args.dropout)
    logger.info(log_msg)
    print(log_msg)
    set_epoch = 1
    pbar = tqdm(total=len(train_data) * args.num_epoches // args.batch_size + 1)

    def loss_func(high_out, low_out, seleout11, seleout12, seleout21, seleout22):
        # Margin ranking loss plus quadratic regularizers pulling the selector outputs toward 7
        ones = torch.ones(high_out.size(0), 1).cuda()
        ones1 = 7 * torch.ones(high_out.size(0), 1).cuda()
        loss = torch.mean(ones - high_out + low_out) \
            + torch.mean((ones1 - seleout11) * (ones1 - seleout11)) \
            + torch.mean((ones1 - seleout12) * (ones1 - seleout12)) \
            + torch.mean((ones1 - seleout21) * (ones1 - seleout21)) \
            + torch.mean((ones1 - seleout22) * (ones1 - seleout22))
        return F.relu(loss), torch.mean(ones - high_out + low_out)

    print_loss_total = 0
    old_accu = 0
    best_epoch = 0  # was only assigned on improvement, which could leave it undefined in the log
    print_loss_total2 = 0
    for train_step, (train_batch, epoch) in enumerate(train_batches):
        pbar.update(1)
        high_context_idxs = train_batch['high_cit_context_idxs']
        high_seg_ids = train_batch['high_seg_indexs']
        low_context_idxs = train_batch['low_cit_context_idxs']
        low_seg_ids = train_batch['low_seg_indexs']
        high_source_context_idxs = train_batch['high_source_context_idxs']
        high_source_seg_indexs = train_batch['high_source_seg_indexs']
        low_source_context_idxs = train_batch['low_source_context_idxs']
        low_source_seg_indexs = train_batch['low_source_seg_indexs']
        high_context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in high_context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()
        low_context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in low_context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()
        high_source_context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in high_source_context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()
        low_source_context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in low_source_context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()
        high_context_idxs = Variable(high_context_idxs).cuda()
        high_seg_ids = Variable(high_seg_ids).cuda()
        low_context_idxs = Variable(low_context_idxs).cuda()
        low_seg_ids = Variable(low_seg_ids).cuda()
        high_source_context_idxs = Variable(high_source_context_idxs).cuda()
        high_source_seg_indexs = Variable(high_source_seg_indexs).cuda()
        low_source_context_idxs = Variable(low_source_context_idxs).cuda()
        low_source_seg_indexs = Variable(low_source_seg_indexs).cuda()
        out1, seleout11, seleout12 = model.forward(high_context_idxs, high_seg_ids,
                                                   high_context_mask,
                                                   high_source_context_idxs,
                                                   high_source_seg_indexs,
                                                   high_source_context_mask)
        out2, seleout21, seleout22 = model.forward(low_context_idxs, low_seg_ids,
                                                   low_context_mask,
                                                   low_source_context_idxs,
                                                   low_source_seg_indexs,
                                                   low_source_context_mask)
        # Get loss
        optimizer.zero_grad()
        loss, loss2 = loss_func(out1, out2, seleout11, seleout12, seleout21, seleout22)
        # Backward propagation
        loss.backward()
        optimizer.step()
        loss_value = loss.data.item()
        print_loss_total += loss_value
        print_loss_total2 += loss2.data.item()
        del out1, out2
        if train_step % 100 == 0:
            log_msg = 'Epoch: %d, Train_step %d loss1: %.4f, loss2:%.4f' % (
                epoch, train_step, print_loss_total / 100, print_loss_total2 / 100)
            logger.debug(log_msg)
            print(log_msg)
            print_loss_total = 0
            print_loss_total2 = 0
        if epoch == set_epoch:
            set_epoch += 1
            # Evaluate on the dev set at each epoch boundary
            dev_batches = batch.dev_batch(dev_data, args.context_limit)
            result_dic = {}
            true_label_dic = {}
            for dev_step, dev_batch in enumerate(dev_batches):
                context_idxs = dev_batch['context_idxs']
                source_context_idxs = dev_batch['source_context_idxs']
                seg_indexs = dev_batch['seg_indexs']
                source_seg_indexs = dev_batch['source_seg_indexs']
                ref_labels = dev_batch['ref_labels']
                id = dev_batch['id']
                context_mask = torch.Tensor(
                    np.array([list(map(function, xx)) for xx in context_idxs.data.numpy()],
                             dtype=np.float32)).cuda()
                source_context_mask = torch.Tensor(
                    np.array([list(map(function, xx)) for xx in source_context_idxs.data.numpy()],
                             dtype=np.float32)).cuda()
                context_idxs = Variable(context_idxs).cuda()
                seg_indexs = Variable(seg_indexs).cuda()
                source_context_idxs = Variable(source_context_idxs).cuda()
                source_seg_indexs = Variable(source_seg_indexs).cuda()
                out, seleout1, seleout2 = model.forward(context_idxs, seg_indexs, context_mask,
                                                        source_context_idxs, source_seg_indexs,
                                                        source_context_mask)
                if id not in result_dic:
                    result_dic[id] = []
                    result_dic[id].append(out.cpu().data)
                    true_label_dic[id] = ref_labels
                else:
                    result_dic[id].append(out.cpu().data)
                del out
            picklesave(result_dic, "./modelsave/all_dev_result_dic22.pkl",
                       "./modelsave/result_dic.pkl")
            picklesave(true_label_dic, "./modelsave/all_dev_true_label_dic22.pkl",
                       "./modelsave/true_label_dic.pkl")
            keys = result_dic.keys()
            MAPS = 0
            precisions = 0
            recalls = 0
            for key in keys:
                out = torch.cat(result_dic[key], dim=0)
                predict_index = torch.topk(out, 2, dim=0)[1].squeeze(1).data.numpy()
                # print("Predicted labels:", predict_index)
                precision, recall, MAP = cal_MAP(true_label_dic[key], predict_index)
                MAPS += MAP
                precisions += precision
                recalls += recall
            MAPS /= len(dev_data)
            precisions /= len(dev_data)
            recalls /= len(dev_data)
            all_loss = MAPS
            if all_loss > old_accu:
                old_accu = all_loss
                torch.save(model.state_dict(), "./modelsave/max" + args.modelName)
                best_epoch = epoch
            # else:
            #     args.learning_rate = args.learning_rate / 2.0
            #     if args.learning_rate <= 1e-6:
            #         args.learning_rate = 1e-6
            #     if args.optim == "Adadelta":
            #         optimizer = torch.optim.Adadelta(parameters_trainable, lr=args.learning_rate,
            #                                          weight_decay=args.init_weight_decay)
            #     elif args.optim == "Adam":
            #         optimizer = torch.optim.Adam(parameters_trainable, lr=args.learning_rate,
            #                                      weight_decay=args.init_weight_decay)
            #     elif args.optim == "SGD":
            #         optimizer = torch.optim.SGD(parameters_trainable, lr=args.learning_rate,
            #                                     weight_decay=args.init_weight_decay)
            log_msg = '\nDev MAP: %.4f P: %.4f R: %.4f\nEpoch with the best score so far: %d' % (
                all_loss, precisions, recalls, best_epoch)
            logger.info(log_msg)
            print(log_msg)
            # Save the model after every epoch
            torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    pbar.close()
def findSimilar():
    '''
    Input: the same nested citation structure as getWord2vecData().
    Ranks candidate citations against the context with getSVMScore.
    :return:
    '''
    datas = pickleload("./data2/random_train_data.pkl", "./data2/random_train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    idf_dic = pickleload("./data2/idf.pkl", "idf.pkl")
    # datas = datas[0:10]
    print(len(idf_dic))
    print(len(datas))
    count = 0
    MAPS = 0
    precisions = 0
    recalls = 0
    for data in tqdm(datas):
        up_source_tokens = process(data["up_source_tokens"])
        down_source_tokens = process(data["down_source_tokens"])
        target = process(data["target_tokens"])
        # Score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        count += 1
        ref_lis = []
        for index in range(len(citations)):
            if citations[index]['label'] == 1:
                ref_lis.append(index)
            citation = citations[index]
            cit_up_source_tokens = process(citation["up_source_tokens"])
            cit_down_source_tokens = process(citation["down_source_tokens"])
            cit_target = process(citation["target_tokens"])
            score = getSVMScore(
                idf_dic, up_source_tokens,
                cit_up_source_tokens + " " + cit_target + " " + cit_down_source_tokens)
            scores.append(score)
        # print("scores:", scores)
        new_score = sorted(scores, reverse=True)
        pre_lis = []
        for i in range(3):
            pre_lis.append(scores.index(new_score[i]))
        # print("Context:", up_source_tokens + " " + down_source_tokens)
        # print("Candidate:", citations[pre_lis[0]]["up_source_tokens"])
        # print("Candidate:", citations[pre_lis[0]]["target_tokens"])
        # print("Candidate:", citations[pre_lis[0]]["down_source_tokens"])
        # print("ref_lis", ref_lis)
        # print("pre_lis", pre_lis)
        precision, recall, MAP = cal_MAP(ref_lis, pre_lis)
        # print("precision:", precision)
        # print("recall:", recall)
        # print("MAP:", MAP)
        # print("-----------------------------------------------")
        MAPS += MAP
        precisions += precision
        recalls += recall
    MAPS /= len(datas)
    precisions /= len(datas)
    recalls /= len(datas)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
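# Hypothetical sketch of getSVMScore -- defined elsewhere in the repo. From
# its call sites it takes the idf dictionary and two token strings and returns
# a similarity score; an idf-weighted cosine (mirroring findVSMSimilar above)
# is a plausible reading, not the confirmed implementation.
import math

def getSVMScore(idf_dic, text_a, text_b):
    def tfidf_vec(text):
        tokens = text.split(" ")
        counts = {}
        for t in tokens:
            counts[t] = counts.get(t, 0) + 1
        # tf * idf, with idf 0 for out-of-vocabulary tokens
        return {t: c / len(tokens) * idf_dic.get(t, 0.0) for t, c in counts.items()}

    va, vb = tfidf_vec(text_a), tfidf_vec(text_b)
    dot = sum(w * vb.get(t, 0.0) for t, w in va.items())
    na = math.sqrt(sum(w * w for w in va.values()))
    nb = math.sqrt(sum(w * w for w in vb.values()))
    return dot / (na * nb) if na and nb else 0.0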
def test(args):
    args.dropout = 0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    # dev_data = data[2000: 4000]
    batch = Batch(args)
    word2index = pickleload(args.word2index_pkl, 'word2index')
    input_vec = len(word2index)
    source_embedding = pickleload("./word2vec/glove_300.pkl", "glove_300.pkl")
    source_embedding = np.array(source_embedding, dtype=np.float32)
    dev_batches = batch.dev_batch(dev_data, args.context_limit, args.citation_limit)
    log_msg = "Input vocabulary size: %d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)
    if args.model == "MatchPyramid":
        model = MatchPyramid(args, input_vec, source_embedding)
    elif args.model == "LstmMatch":
        model = LstmMatch(args, input_vec, source_embedding)
    elif args.model == "Decomposable":
        model = Decomposable(args, input_vec, source_embedding)
    elif args.model == "Inference":
        model = Inference(args, input_vec, source_embedding)
    elif args.model == "ESIM":
        model = ESIM(args, input_vec, source_embedding)
    elif args.model == "ArcII":
        model = ArcII(args, input_vec, source_embedding)
    if args.loadmodel == True:
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))
    if torch.cuda.is_available():
        model = model.cuda()
    # Print parameters:
    log_msg = "Model name: %s \n" % (args.loadmodelName)
    logger.info(log_msg)
    print(log_msg)
    pbar2 = tqdm(total=len(dev_data))
    MAPS = 0
    precisions = 0
    recalls = 0
    blues = 0
    rouges = 0
    topn = 3
    for dev_step, dev_batch in enumerate(dev_batches):
        pbar2.update(1)
        context_idxs = dev_batch['context_idxs']
        cit_context_idxs = dev_batch['cit_context_idxs']
        ref_labels = dev_batch['ref_labels']
        target = dev_batch["targets"]
        citations = dev_batch['citations']
        context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()
        cit_context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in cit_context_idxs.data.numpy()],
                     dtype=np.float32)).cuda()
        context_idxs = Variable(context_idxs).cuda()
        cit_context_idxs = Variable(cit_context_idxs).cuda()
        out = model.forward(context_idxs, cit_context_idxs, context_mask, cit_context_mask)
        # print("Scores:", out)
        # print("True labels:", ref_labels)
        predict_index = torch.topk(out, topn, dim=0)[1].squeeze(1).data.cpu().numpy()
        bleu = 0
        rouge = 0
        for index in predict_index:
            alternative_citation = citations[index]["target_tokens"]
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
            # print("Candidate citation:", alternative_citation)
        bleu = bleu / topn
        rouge = rouge / topn
        blues += bleu
        rouges += rouge
        # print("Predicted labels:", predict_index)
        precision, recall, MAP = cal_MAP(ref_labels, predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall
    MAPS /= len(dev_data)
    precisions /= len(dev_data)
    recalls /= len(dev_data)
    blues /= len(dev_data)
    rouges /= len(dev_data)
    print("MAP:%.4f P:%.4f R:%.4f" % (MAPS, precisions, recalls))
    print("bleu", topn, ":", blues)
    print("rouge:", rouges)
    pbar2.close()
def getSmallPairsTrainData():
    '''
    Input: the same nested citation structure as getWord2vecData().
    Builds high/low-scored citation pairs for pairwise ranking training.
    :return:
    '''
    import random  # moved below the docstring so the docstring stays a docstring
    datas = pickleload("../data2/train_data2.pkl", "./data2/train_data2.pkl")
    idf_dic = pickleload("../data2/idf.pkl", "idf.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    train_datas = []
    train_datas2 = []
    train_spill = []
    q_id = 0
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"])
        # Score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        if len(target) < 50:
            continue
        for index in range(len(citations)):
            citation = citations[index]
            cit_target = data_process(citation["target_tokens"])
            if target == cit_target or len(cit_target) < 50:
                scores.append(0)
            else:
                score = getSVMScore(idf_dic, process_kuohao(target), process_kuohao(cit_target))
                scores.append(score)
        sorted_scores = sorted(scores, reverse=True)
        best_indexs = []
        for j in range(len(sorted_scores)):
            if sorted_scores[j] > 0.1 and j <= 5:
                best_index = scores.index(sorted_scores[j])
                best_indexs.append(best_index)
        if len(best_indexs) == len(citations):
            continue
        for best_index in best_indexs:
            train_data = {}
            train_data['up_source'] = data_process(data["up_source_tokens"])
            train_data['down_source'] = data_process(data["down_source_tokens"])
            train_data['target'] = data_process(data["target_tokens"])
            high_dic = {}
            high_dic['cit_up_source'] = data_process(citations[best_index]['up_source_tokens'])
            high_dic['cit_down_source'] = data_process(citations[best_index]['down_source_tokens'])
            high_dic['cit_target'] = data_process(citations[best_index]['target_tokens'])
            high_dic['bleu1_score'] = scores[best_index]
            # for k in range(len(best_indexs)):
            #     print("target:", train_data['target'])
            #     print("cit_target:", data_process(citations[best_indexs[k]]['target_tokens']))
            #     print("score:", sorted_scores[k])
            #     print("\n")
            # print(len(best_indexs), " / ", len(citations))
            # print("---------------------------------------------")
            low_index = random.randint(0, len(scores) - 1)
            while low_index in best_indexs:
                low_index = random.randint(0, len(scores) - 1)
            if scores[best_index] == scores[low_index] or scores[best_index] == 1.0:
                continue
            low_dic = {}
            low_dic['cit_up_source'] = data_process(citations[low_index]['up_source_tokens'])
            low_dic['cit_down_source'] = data_process(citations[low_index]['down_source_tokens'])
            low_dic['cit_target'] = data_process(citations[low_index]['target_tokens'])
            low_dic['bleu1_score'] = scores[low_index]
            if low_dic['cit_target'] == train_data['target']:
                continue
            train_data['high_dic'] = high_dic
            train_data['low_dic'] = low_dic
            train_spill.append(train_data)
        if i in [len(datas) // 5, len(datas) * 2 // 5, len(datas) * 3 // 5,
                 len(datas) * 4 // 5, len(datas) - 1]:
            train_datas.append(train_spill)
            print(len(train_spill))
            train_spill = []
    print(len(train_datas))
    print(len(train_datas2))  # 26933
    print("Number of training samples:", len(train_datas))
    picklesave(train_datas, "./train_data/small_pairs_train_data.pkl", "small_pairs_train_data.pkl")
def getMatchZooData():
    '''
    Input: the same nested citation structure as getWord2vecData().
    Exports the data in MatchZoo's corpus/relation file format.
    :return:
    '''
    import random  # moved below the docstring so the docstring stays a docstring
    datas = pickleload("../data2/random_train_data.pkl", "./data2/random_train_data.pkl")
    word2index = pickleload("./word2vec/glove_word2index_300.pkl", "./word2vec/glove_word2index_300.pkl")
    print(len(datas))
    q_id = 0
    for i in tqdm(range(len(datas))):
        data = datas[i]
        source_tokens = data_process(data["up_source_tokens"]) + " " \
            + data_process(data["up_source_tokens"])  # the original concatenates up_source twice
        # Export each candidate citation
        citations = data["citations_tokens"]
        writefile('./match_zoo_data/corpus_preprocessed.txt',
                  "Q_" + str(q_id) + "\t250\t" + getSen_index(source_tokens, word2index) + "\n")
        d_id = 0
        for citation in citations:
            score = citation['label']
            citation_tokens = data_process(citation["up_source_tokens"]) + " " \
                + data_process(citation["target_tokens"]) + " " \
                + data_process(citation["up_source_tokens"])
            writefile('./match_zoo_data/corpus_preprocessed.txt',
                      "Q_" + str(q_id) + "D_" + str(d_id) + "\t250\t"
                      + getSen_index(citation_tokens, word2index) + "\n")
            if q_id < len(datas) * 4 // 5:
                writefile('./match_zoo_data/relation_train.txt',
                          str(score) + "\t" + "Q_" + str(q_id) + "\t"
                          + "Q_" + str(q_id) + "D_" + str(d_id) + "\n")
            else:
                writefile('./match_zoo_data/relation_test.txt',
                          str(score) + "\t" + "Q_" + str(q_id) + "\t"
                          + "Q_" + str(q_id) + "D_" + str(d_id) + "\n")
                writefile('./match_zoo_data/relation_valid.txt',
                          str(score) + "\t" + "Q_" + str(q_id) + "\t"
                          + "Q_" + str(q_id) + "D_" + str(d_id) + "\n")
            d_id += 1
        q_id += 1
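# Hypothetical sketch of getSen_index -- the helper is not shown in this file.
# Assumed to map a token string to space-joined vocabulary indices; the
# fallback id 1 for out-of-vocabulary tokens mirrors the <unknow> slot used in
# getWord2index, though the glove word2index here may use a different scheme.
def getSen_index(sentence, word2index):
    return " ".join(str(word2index.get(tok, 1)) for tok in sentence.split(" "))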
def countScores():
    datas = pickleload("./train_data/train_data.pkl", "./train_data/train_data.pkl")
    for data in datas:
        score = data['bleu1_score']
        print(score)