Example #1
def testMAP():
    result_dic = pickleload("./modelsave/pyramidModel0_predict.pkl",
                            "./modelsave/result_dic.pkl")
    true_label_dic = pickleload("./modelsave/pyramidModel0_true.pkl",
                                "./modelsave/true_label_dic.pkl")
    keys = result_dic.keys()
    MAPS = 0
    precisions = 0
    recalls = 0
    for key in keys:
        out = torch.cat(result_dic[key], dim=0)
        print(out)
        print(true_label_dic[key])
        predict_index = torch.topk(out, 2, dim=0)[1].squeeze(1).data.numpy()
        print("predicted labels:", predict_index)
        print("-------------------------------------")
        precision, recall, MAP = cal_MAP(true_label_dic[key], predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall
    print(len(keys))
    MAPS /= len(keys)
    precisions /= len(keys)
    recalls /= len(keys)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
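Every example on this page calls pickleload/picklesave helpers that are not shown here. A minimal sketch of what they presumably look like, assuming the second argument is only a human-readable label used for logging:

import pickle

def pickleload(path, name):
    # "name" is assumed to be a label that is only printed for tracing
    print("loading", name, "from", path)
    with open(path, "rb") as f:
        return pickle.load(f)

def picklesave(obj, path, name):
    print("saving", name, "to", path)
    with open(path, "wb") as f:
        pickle.dump(obj, f)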
Example #2
def random():
	import random
	idf_dic = pickleload("../data2/idf.pkl", "idf.pkl")
	random.seed(1)
	datas = pickleload("../data2/train_data3.pkl", "../data/train_data.pkl")
	datas = datas[len(datas)*4//5 : len(datas)]
	result_lis = []
	all_count = 0
	true_count = 0
	for data in tqdm(datas):
		target_content = data["target_tokens"]
		citations = data["citations_tokens"]
		citation_target_dict = dict()
		index = 0
		# print(len(citations))
		new_citations = []
		for citation in citations:
			sel_target = citation["target_tokens"]
			citation_target_dict[index] = sel_target
			index += 1
			new_citations.append(sel_target)
		# print(len(citation_content_dict))
	random_index = random.randint(0, len(citation_target_dict)-1)
		ref = getTopVsmScore(idf_dic, target_content, new_citations)
		all_count+= 1
		if random_index == ref:
			true_count += 1

	print(true_count/all_count)
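getTopVsmScore is not defined in these snippets. Judging from how it is called above, it ranks the candidate citations against the query with a TF-IDF (vector space model) cosine and returns the index of the best match; a hedged sketch under that assumption:

import math

def getTopVsmScore(idf_dic, query, candidates):
    # build a TF-IDF vector for a whitespace-tokenised string
    def tfidf(text):
        tokens = [t for t in text.split(" ") if t]
        vec = {}
        for tok in tokens:
            vec[tok] = vec.get(tok, 0) + 1
        for tok in vec:
            vec[tok] = vec[tok] / len(tokens) * idf_dic.get(tok, 0.0)
        return vec

    def cosine(a, b):
        dot = sum(v * b.get(t, 0.0) for t, v in a.items())
        na = math.sqrt(sum(v * v for v in a.values()))
        nb = math.sqrt(sum(v * v for v in b.values()))
        return dot / (na * nb) if na and nb else 0.0

    q_vec = tfidf(query)
    scores = [cosine(q_vec, tfidf(c)) for c in candidates]
    return scores.index(max(scores))  # index of the most similar citation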
Example #3
def test_languageModel(args):
    args.dropout = 0.0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    # dev_data = data[2000: 4000]

    batch = Batch(args)
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)

    dev_batches = batch.lm_dev_batch(dev_data, args.context_limit)

    log_msg = "Input vocabulary size: %d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)

    transform = Transformer(args, input_vec)
    transform.load_state_dict(
        torch.load("./modelsave/" + "TransformModel0.pkl"))
    if torch.cuda.is_available():
        transform = transform.cuda()

    # print parameters:
    log_msg = "Model name: %s \n" % (args.loadmodelName)
    logger.info(log_msg)
    print(log_msg)

    result_dic = {}
    true_label_dic = {}
    all_count = 0
    right_count = 0
    loss_func = torch.nn.NLLLoss()
    loss = 0
    for dev_step, dev_batch in enumerate(dev_batches):
        context_idxs = dev_batch['context_idxs']
        seg_indexs = dev_batch['seg_indexs']
        cit_targets = dev_batch['cit_targets']
        targets = dev_batch['targets']
        target_indexs = dev_batch['target_indexs']
        ref_labels = dev_batch['ref_labels']
        id = dev_batch['id']
        print(id)
        context_mask = torch.Tensor(
            np.array(
                [list(map(function, xx)) for xx in context_idxs.data.numpy()],
                dtype=np.float)).cuda()

        context_idxs = Variable(context_idxs).cuda()
        seg_indexs = Variable(seg_indexs).cuda()
        targets = Variable(targets).cuda()
        out1, out2 = transform.forward(context_idxs, seg_indexs, context_mask,
                                       target_indexs)
        # print(out)
        # accumulate the mean NLL over this batch
        batch_loss = 0
        for i in range(out1.size(0)):
            batch_loss += loss_func(out1[i], targets[i])
        loss += batch_loss.item() / out1.size(0)
        all_count += 1
        del out1, out2
    print(loss / all_count)
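Several examples build context_mask with list(map(function, xx)) over the padded index matrix. The helper named function never appears in these snippets; it presumably maps the padding id 0 to 0.0 and every real token id to 1.0:

def function(idx):
    # assumed padding-mask helper: 0 is the <padding> id in word2index
    return 0.0 if idx == 0 else 1.0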
Example #4
def __init__(self, args):
    self.OOV = 0
    self.OOD = 1  # unknown
    self.word2index = pickleload(args.word2index_pkl, "word2index")
    self.stoplis = {}
    for word in stopwords.words("english"):
        self.stoplis[word] = 1
Example #5
def matplotDataDisplay():
    import matplotlib.pyplot as plt
    datas = pickleload("./data2/random_train_data.pkl",
                       "./data2/random_train_data.pkl")
    key_dic = {}
    for i in range(len(datas)):
        citations = datas[i]['citations_tokens']
        count = 0
        for j in range(len(citations)):
            if citations[j]['label'] == 1:
                count += 1
        if count not in key_dic:
            key_dic[count] = 1
        else:
            key_dic[count] += 1

    new_key_dic = sorted(key_dic.items(),
                         key=lambda item: item[0],
                         reverse=False)
    print(new_key_dic)
    name_list = []
    num_list = []
    for key, value in new_key_dic:
        name_list.append(str(key))
        num_list.append(value)
    plt.bar(range(len(name_list)),
            num_list,
            color='grey',
            tick_label=name_list)
    plt.xlabel('The number of the alternative citations')
    plt.ylabel('Number of the instances')
    plt.show()
Example #6
def getWord2vecData():
    '''
        [
                {
                 "citStr":"",  the cited author and year
                 "context":"",  the whole citation passage
                 "up_source_tokens":"",
                 "down_source_tokens":"",
                 "target_tokens":""
                 "citations":[
                                {
                                "up_source_tokens":"",
                                "down_source_tokens":"",
                                "target_tokens":""
                                }
                               ...
                              ]
                }
                ......

            ]
        Find similar citations
        :return:
        '''
    datas = pickleload("../data2/train_data.pkl", "./data2/train_data.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"])
        up_content = data_process(data['up_source_tokens'])
        down_content = data_process(data['down_source_tokens'])
        writeFile('./word2vec/train_word2vec.txt',
                  up_content + " " + target + " " + down_content + "\n")
Example #7
def getZooEmbedding():
    source_embedding = pickleload("./word2vec/glove_300.pkl", "glove_300.pkl")
    for i in range(len(source_embedding)):
        print(i)
        str_embedding = [str(j) for j in source_embedding[i]]
        writefile("./match_zoo_data/embedding_dic.txt",
                  str(i) + " " + " ".join(str_embedding) + "\n")
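The two functions above append lines to text files through writeFile/writefile, which are also not shown. Both spellings are assumed here to be the same trivial append helper:

def writefile(path, text):
    # append a piece of text to the given file
    with open(path, "a", encoding="utf-8") as f:
        f.write(text)

writeFile = writefile  # alias for the alternate spelling used above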
Example #8
def findBleuSimilar():
    '''
    [
            {
             "citStr":"",  the cited author and year
             "context":"",  the whole citation passage
             "up_source":"",
             "down_source":"",
             "target":""
             "citations":[
                          citation0,
                          citation1,
                           ...
                          ]
            }
            ......

        ]
    Find similar citations
    :return:
    '''
    datas = pickleload("./data2/train_data.pkl", "./data2/train_data.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    result_lis = []
    count = 0
    for data in datas:
        target = data["target_tokens"].split(" ")

        # score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        for index in range(len(citations)):
            citation = citations[index]
            cit_target = citation["target_tokens"].split(" ")
            score = test_bleu(" ".join(cit_target), " ".join(target), 1)
            scores.append(score)
        new_score = sorted(scores, reverse=True)

        # if new_score[0] < 0.5 and new_score[0] != 1:
        #     continue
        best_index = scores.index(new_score[0])
        predict = citations[best_index]['target_tokens']
        result_dic = OrderedDict()
        result_dic["cand_answer"] = predict
        result_dic["ref_answer"] = data["target_tokens"]
        result_lis.append(result_dic)
        count += 1
        print("score:", new_score[0])
        print("original:", data["target_tokens"])
        print("predicted:", predict)
        print("-------------------------------------------------------")
    print(count)
    jsonsave('./rougetest/data/target_data.json', result_lis, "result_lis")
    test_score("./rougetest/data/target_data.json", n_size=1)
    test_score("./rougetest/data/target_data.json", n_size=2)
    test_score("./rougetest/data/target_data.json", n_size=3)
    test_score("./rougetest/data/target_data.json", n_size=4)
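findBleuSimilar scores candidates with test_bleu(candidate, reference, n), which is not part of the snippet. A plausible sketch using NLTK's sentence-level BLEU; the project's own implementation may smooth and tokenise differently:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def test_bleu(candidate, reference, n):
    weights = tuple(1.0 / n for _ in range(n))  # uniform n-gram weights
    return sentence_bleu([reference.split(" ")],
                         candidate.split(" "),
                         weights=weights,
                         smoothing_function=SmoothingFunction().method1)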
Example #9
def getSingleTrainData():
    '''
    [
            {
             "citStr":"",  the cited author and year
             "context":"",  the whole citation passage
             "up_source_tokens":"",
             "down_source_tokens":"",
             "target_tokens":""
             "citations":[
                            {
                            "up_source_tokens":"",
                            "down_source_tokens":"",
                            "target_tokens":""
                            }
                           ...
                          ]
            }
            ......

        ]
    Find similar citations
    :return:
    '''
    datas = pickleload("../data2/train_data.pkl", "./data2/train_data.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    new_datas = copy(datas)
    train_datas = []
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"])

        # score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        for index in range(len(citations)):
            citation = citations[index]
            cit_target = data_process(citation["target_tokens"])
            score = test_bleu(cit_target, target, 1)
            scores.append(score)
            new_datas[i]['citations_tokens'][index]["bleu1_score"] = score

            dic = {}
            dic['up_source'] = data_process(data["up_source_tokens"])
            dic['down_source'] = data_process(data["down_source_tokens"])
            dic['target'] = data_process(data["target_tokens"])
            dic['cit_up_source'] = data_process(citation['up_source_tokens'])
            dic['cit_down_source'] = data_process(
                citation['down_source_tokens'])
            dic['cit_target'] = data_process(citation['target_tokens'])
            dic['bleu1_score'] = score
            if score == 1:
                continue
            train_datas.append(copy(dic))
    print("number of training samples:", len(train_datas))
    picklesave(train_datas, "./train_data/single_train_data.pkl",
               "single_train_data.pkl")
Example #10
def getRandomData():
    import numpy as np
    datas = pickleload("../data2/train_data2.pkl", "./data2/train_data2.pkl")
    new_datas = []
    ids = range(len(datas))
    permutation = np.random.permutation(ids)
    for i, id in enumerate(permutation):
        new_datas.append(datas[id])
    picklesave(new_datas, "../data2/random_train_data.pkl",
               "./data2/random_train_data.pkl")
Example #11
def displayData(topn=1):
    result_dic = pickleload("./modelsave/pyramidModel0_predict.pkl",
                            "./modelsave/result_dic.pkl")
    true_label_dic = pickleload("./modelsave/pyramidModel0_true.pkl",
                                "./modelsave/true_label_dic.pkl")
    keys = result_dic.keys()
    blues = 0
    rouges = 0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]

    id = 0
    for key in keys:
        up_source_tokens = dev_data[id]["up_source_tokens"]
        target = dev_data[id]["target_tokens"]
        citations = dev_data[id]["citations_tokens"]
        out = torch.cat(result_dic[key], dim=0)
        print("up_source_tokens:", up_source_tokens)
        print("target:", target)
        predict_index = torch.topk(out, topn, dim=0)[1].squeeze(1).data.numpy()
        bleu = 0
        rouge = 0
        for index in predict_index:
            alternative_citation = citations[index]["target_tokens"]
            if len(target.strip().split(" ")) < 5 or len(
                    alternative_citation.strip().split(" ")) < 5:
                continue
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
            print("candidate citation:", alternative_citation)
        print("--------------------------")
        bleu = bleu / topn
        rouge = rouge / topn
        blues += bleu
        rouges += rouge
        # print("-----------------------------------------------------")
        id += 1
    blues /= len(keys)
    rouges /= len(keys)
    print("bleu", topn, ":", blues)
    print("rouge:", rouges)
Example #12
def testRetrievalModelResult(topn = 5):
    with open("./result/predict.test.arcii_ranking.txt") as fp:
        lines = fp.readlines()
    result_lis = {}
    last_name = ""
    for line in lines:
        results = line.replace("\n", "").split("\t")
        if last_name != results[0]:
            last_name = results[0]
            result_lis[last_name] = []
        result_lis[last_name].append(int(results[2].split("_")[-1]))
    predict_indexs = [value for key, value in result_lis.items()]
    MAPS = 0
    precisions = 0
    recalls = 0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data) * 4 // 5:len(data)]
    bleus = 0
    rouges = 0
    for id in range(len(dev_data)):
        target = dev_data[id]["target_tokens"]
        citations = dev_data[id]['citations_tokens']
        true_label = []
        predict_index = predict_indexs[id][0:topn]
        for i in range(len(citations)):
            citation = citations[i]
            if citation['label'] == 1:
                true_label.append(i)
        bleu = 0
        rouge = 0
        for predict in predict_index:
            # print(predict)
            alternative_citation = citations[predict]["target_tokens"]
            if len(target.strip().split(" ")) < 5 or len(alternative_citation.strip().split(" ")) < 5:
                continue
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
            print(bleu)
        print("------------------------")
        bleus += bleu/len(predict_index)
        rouges += rouge/len(predict_index)
        precision, recall, MAP = cal_MAP(true_label, predict_index)
        precisions += precision
        recalls+= recall
        MAPS += MAP
    MAPS /= len(predict_indexs)
    precisions /= len(predict_indexs)
    recalls /= len(predict_indexs)
    bleus /= len(dev_data)
    rouges /= len(dev_data)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
    print("bleu", topn, ":", bleus)
    print("rouge:", rouges)
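cal_MAP is used throughout these examples for evaluation but never defined. From the call sites it takes the list of relevant citation indices and the ranked predictions and returns precision, recall and average precision for one query; a sketch under that assumption:

def cal_MAP(true_labels, predicted):
    true_set = set(true_labels)
    if not true_set or len(predicted) == 0:
        return 0.0, 0.0, 0.0
    hits = 0
    ap = 0.0
    for rank, idx in enumerate(predicted, start=1):
        if idx in true_set:
            hits += 1
            ap += hits / rank  # precision at each relevant hit
    precision = hits / len(predicted)
    recall = hits / len(true_set)
    ap /= min(len(true_set), len(predicted))
    return precision, recall, ap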
Example #13
def getWord2index():
    '''
    [
            {
             "citStr":"",  the cited author and year
             "context":"",  the whole citation passage
             "up_source_tokens":"",
             "down_source_tokens":"",
             "target_tokens":""
             "citations":[
                            {
                            "up_source_tokens":"",
                            "down_source_tokens":"",
                            "target_tokens":""
                            }
                           ...
                          ]
            }
            ......

        ]
    Find similar citations
    :return:
    '''
    datas = pickleload("../data2/train_data.pkl", "./data2/train_data.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    tokenDic = {}
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"]).split(" ")
        up_source = data_process(data["up_source_tokens"]).split(" ")
        down_source = data_process(data["down_source_tokens"]).split(" ")
        word_lis = target + up_source + down_source
        for token in word_lis:
            if token not in tokenDic:
                tokenDic[token] = 1
            else:
                tokenDic[token] += 1

    index = 2
    word2index = {}
    for key, value in tokenDic.items():
        if value > 1:
            word2index[key] = index
            index += 1
    word2index['<padding>'] = 0
    word2index['<unknow>'] = 1
    word2index['<CLS>'] = index
    word2index['<DSP>'] = index + 1
    word2index['<MASK>'] = index + 2
    print(len(word2index), "  /  ", len(tokenDic), "tokens")
    picklesave(word2index, './word_vec/word2index.pkl', "word2index.pkl")
Example #14
def statistical_data():
    datas = pickleload("../data2/train_data2.pkl", "./data2/train_data.pkl")
    label_num_dic = {}
    for data in datas:
        citations = data['citations_tokens']
        count = 0
        for citation in citations:
            if citation["label"] == 1:
                count += 1
        if count not in label_num_dic:
            label_num_dic[count] = 1
        else:
            label_num_dic[count] += 1
    for key, value in label_num_dic.items():
        print(key, " : ", value)
Example #15
def getDataDisplay():
    datas = pickleload("./data2/random_train_data.pkl",
                       "./data2/random_train_data.pkl")
    string = "A different strategy is presented in Fung and Chen ( 2004 ) , where English FrameNet entries are mapped to concepts listed in HowNet , an on-line ontology for Chinese , without consulting a parallel corpus . Then , Chinese sentences with predicates instantiating these concepts are found in a monolingual corpus and their arguments are labeled with FrameNet roles . Other work attempts to alleviate the data requirements for semantic role labeling either by relying on unsupervised learning or by extending existing resources through the use of unlabeled data ."
    for i in range(len(datas)):
        if datas[i]['up_source_tokens'] == string:
            citations = datas[i]['citations_tokens']
            print("target:", datas[i]["target_tokens"])
            for j in range(len(citations)):
                print(j + 1, ":")
                print("up:", citations[j]['up_source_tokens'])
                print("down:", citations[j]['down_source_tokens'])
                print(citations[j]['target_tokens'])
                print(citations[j]['label'])
                print(
                    "-------------------------------------------------------")
Example #16
def getsomeSample():
    datas = pickleload("../data2/train_data.pkl", "./data2/train_data.pkl")
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"])
        up_source_tokens = data_process(data["up_source_tokens"])
        down_source_tokens = data_process(data["down_source_tokens"])
        # list the candidate citations
        citations = data["citations_tokens"]
        print("up_context:", up_source_tokens)
        print("down_context:", down_source_tokens)
        print("target_citation:", target)
        scores = []
        for index in range(len(citations)):
            citation = citations[index]
            cit_target = data_process(citation["target_tokens"])
            print(index, " citations:", cit_target)
        print("--------------------------------------------")
Example #17
def manualselect():
    import random
    datas = pickleload("./data2/random_train_data.pkl",
                       "./data2/random_train_data.pkl")
    select_ids = []
    right_count = 0
    wrong_count = 0
    for _ in range(50):
        id = random.randint(0, len(datas) - 1)
        while id in select_ids:
            id = random.randint(0, len(datas) - 1)
        select_ids.append(id)

        data = datas[id]
        up_source = data["up_source_tokens"]
        down_source = data["down_source_tokens"]
        target_citation = data['target_tokens']
        citations = data["citations_tokens"]
        print("up_source:", up_source)
        print("down_source:", down_source)

        for index in range(len(citations)):
            if citations[index]['label'] == 1:
                citation = citations[index]["target_tokens"]
                break
        select_lis = [target_citation, citation]

        order = random.randint(0, 1)
        if order == 0:
            print(select_lis[0], "\n", select_lis[1])
            inputs = input("Enter the number of the target you choose: ")
            if inputs == "0":
                right_count += 1
            elif inputs == "1":
                wrong_count += 1
        else:
            print(select_lis[1], "\n", select_lis[0])
            inputs = input("Enter the number of the target you choose: ")
            if inputs == "0":
                wrong_count += 1
            elif inputs == "1":
                right_count += 1
    print("right:", right_count)
    print("wrong:", wrong_count)
Example #18
def manual_label():
    datas = pickleload("../data2/train_data2.pkl", "./data2/train_data.pkl")
    # golden_train_datas = pickleload("../data/golden_train_data.pkl", "./data/golden_train_data.pkl")
    print(len(datas))
    train_datas = []
    flag_pairs = {}
    for i in range(len(datas)):
        data = datas[i]
        target = data_process(data["target_tokens"])
        # label each candidate citation
        citations = data["citations_tokens"]
        flag = 0
        for index in range(len(citations)):
            citation = citations[index]
            cand_cit = data_process(citation["target_tokens"])
            if cand_cit + target not in flag_pairs.keys():
                print("progress:", i, "/", len(datas), "  ", index, "/",
                      len(citations))
                print("target:", target)
                print("candidate:", cand_cit)
                label = input("label: ")
                if str(label) == "1":
                    citations[index]['label'] = 1
                    flag = 1
                else:
                    citations[index]['label'] = 0
                flag_pairs[cand_cit + target] = citations[index]['label']
                flag_pairs[target + cand_cit] = citations[index]['label']
            else:
                if flag_pairs[cand_cit + target] == 1:
                    citations[index]['label'] = 1
                    flag = 1
                else:
                    citations[index]['label'] = 0
        picklesave(flag_pairs, "../data/flag_pairs.pkl",
                   "./data/flag_pairs.pkl")
        if flag == 1:
            new_data = datas[i]
            new_data["citations_tokens"] = citations
            train_datas.append(new_data)
            picklesave(train_datas, "../data/golden_train_data.pkl",
                       "./data/golden_train_data.pkl")
Example #19
def getCsvFile():
    datas = pickleload("./train_data/train_data.pkl",
                       "./train_data/train_data.pkl")
    content_source = []
    target = []
    cit_content_source = []
    cit_target = []
    score = []
    for data in tqdm(datas):
        content_source.append(data['up_source'] + ' ' + data['down_source'])
        target.append(data['target'])
        cit_content_source.append(data['cit_up_source'] + ' ' +
                                  data['cit_down_source'])
        cit_target.append(data['cit_target'])
        score.append(data['bleu1_score'])
    train_data = pd.concat([
        pd.DataFrame(data=content_source, columns=['content_source']),
        pd.DataFrame(data=target, columns=['target']),
        pd.DataFrame(data=cit_content_source, columns=['cit_content_source']),
        pd.DataFrame(data=cit_target, columns=['cit_target']),
        pd.DataFrame(data=score, columns=['score'])
    ], axis=1)
    train_data.to_csv("./train_data/train_data.csv")
Example #20
def getIdf():
    datas = pickleload("./data2/train_data.pkl", "./data/train_data.pkl")
    all_count = len(datas)
    print(len(datas))
    tokenidf_dic = {}
    for data in tqdm(datas):
        up_source_tokens = process(data["up_source_tokens"]).split(" ")
        down_source_tokens = process(data["down_source_tokens"]).split(" ")
        target_tokens = process(data["target_tokens"]).split(" ")
        dic = {}
        for token in up_source_tokens + down_source_tokens + target_tokens:
            if token not in dic:
                dic[token] = 1

        for key in dic.keys():
            if key not in tokenidf_dic:
                tokenidf_dic[key] = 1
            else:
                tokenidf_dic[key] += 1
    new_dic = {}
    for key, value in tokenidf_dic.items():
        new_dic[key] = math.log10(all_count / value)
    picklesave(new_dic, './data2/idf.pkl', "idf")
Example #21
def getTfidf():
    '''
    up_source.append(data['up_source'])
        down_source.append(data['down_source'])
        target.append(data['target'])
        cit_up_source.append(data['cit_up_source'])
        cit_down_source.append(data['cit_down_source'])
        cit_target.append(data['cit_target'])
        score.append(data['bleu1_score'])
    :return:
    count: 189389
    '''
    train_data = pickleload('../data/train_data.pkl', 'train_data.pkl')
    sentences = []
    for data in tqdm(train_data):
        up_source = data['up_source_tokens']
        down_source = data['down_source_tokens']
        target = data['target_tokens']
        if up_source != "":
            sentences.append(up_source)
        if down_source != "":
            sentences.append(down_source)
        if target != "":
            sentences.append(target)
    # build the TF-IDF vector space
    vec = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9,  #strip_accents='unicode',
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1)
    print('start fitting tfidf')
    vec.fit_transform(sentences)
    pickle.dump(vec, open('./svmModelsave/tfidf_12gram.pkl', 'wb'))  #25453
    dic = vec.vocabulary_
    print(len(dic))
Example #22
def findVSMSimilar():
    '''
    [
            {
             "citStr":"",  the cited author and year
             "context":"",  the whole citation passage
             "up_source":"",
             "down_source":"",
             "target":""
             "citations":[
                          citation0,
                          citation1,
                           ...
                          ]
            }
            ......

        ]
    Find similar citations
    :return:
    '''
    datas = pickleload("./data2/random_train_data.pkl",
                       "./data/train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    idf_dic = pickleload("./data2/idf.pkl", "idf.pkl")
    # datas = datas[0:10]
    print(len(idf_dic))
    print(len(datas))
    result_lis = []
    count = 0
    for data in datas:
        up_source_tokens = process(data["up_source_tokens"]).split(" ")
        down_source_tokens = process(data["down_source_tokens"]).split(" ")
        target = process(data["target_tokens"])
        dic = {}
        for token in up_source_tokens:
            if token not in dic:
                dic[token] = 1
            else:
                dic[token] += 1
        for token in down_source_tokens:
            if token not in dic:
                dic[token] = 1
            else:
                dic[token] += 1
        keys = dic.keys()
        sqrt_source = 0.0
        for key in keys:
            if key in idf_dic:
                dic[key] = dic[key] / (len(up_source_tokens) +
                                       len(down_source_tokens)) * idf_dic[key]
            else:
                dic[key] = 0
            sqrt_source += dic[key] * dic[key]
        sqrt_source = math.sqrt(sqrt_source)

        # score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        # if len(citations) < 20:
        #     continue
        count += 1
        for index in range(len(citations)):
            citation = citations[index]
            cit_up_source_tokens = citation["up_source_tokens"].split(" ")
            cit_down_source_tokens = citation["down_source_tokens"].split(" ")
            cit_target = process(citation["target_tokens"]).split(" ")
            cit_dic = {}
            for token in cit_target:
                if token not in cit_dic:
                    cit_dic[token] = 1
                else:
                    cit_dic[token] += 1
            keys = cit_dic.keys()
            sqrt_cit = 0.0
            for key in keys:
                if key in idf_dic:
                    cit_dic[key] = cit_dic[key] / (
                        len(cit_target)) * idf_dic[key]
                else:
                    cit_dic[key] = 0
                sqrt_cit += cit_dic[key] * cit_dic[key]
            sqrt_cit = math.sqrt(sqrt_cit)
            # compute the cosine similarity
            dot = 0.0
            for key in dic.keys():
                if key in cit_dic:
                    dot += dic[key] * cit_dic[key]

            score = dot / (sqrt_source * sqrt_cit)
            scores.append(score)
        new_score = sorted(scores, reverse=True)

        best_index = scores.index(new_score[0])
        predict = citations[best_index]['target_tokens']
        result_dic = OrderedDict()
        result_dic["cand_answer"] = predict
        result_dic["ref_answer"] = target
        result_lis.append(result_dic)
        # print("preceding context:", data["up_source_tokens"])
        # print("following context:", data["down_source_tokens"])
        # print("original:", target)
        # print("predicted:", predict)
        # print("-------------------------------------------------------")
    print(count, "   /    ", len(datas), "   /    ", count / len(datas))
    jsonsave('./rougetest/data/similar_data.json', result_lis, "result_lis")
    test_score("./rougetest/data/similar_data.json", n_size=1)
    test_score("./rougetest/data/similar_data.json", n_size=2)
    test_score("./rougetest/data/similar_data.json", n_size=3)
    test_score("./rougetest/data/similar_data.json", n_size=4)
Example #23
def Bm25_similar():
	'''
	[
            {
             "citStr":"",  the cited author and year
             "context":"",  the whole citation passage
             "up_source_tokens":"",
             "down_source_tokens":"",
             "target_tokens":""
             "citations":[
                          {
                          	"up_source_tokens":
                          	"down_source_tokens":
                          	"target_tokens":
                          }
                           ...
                          ]
            }
            ......

        ]
	:return:
	'''
	datas = pickleload("../data2/random_train_data.pkl", "../data2/random_train_data.pkl")
	datas = datas[len(datas)*4//5:len(datas)]
	MAPS = 0
	precisions= 0
	recalls = 0
	for data in tqdm(datas):
		target_up_content = process(data["up_source_tokens"]).split(" ")
		target_down_content = process(data["down_source_tokens"]).split(" ")
		target_content = process(data["target_tokens"])
		content_tokens = target_up_content #+ target_down_content
		citations = data["citations_tokens"]
		citation_content_dict = dict()
		citation_target_dict = dict()
		index = 0
		# print(len(citations))
		ref_lis = []
		for citation in citations:
			sel_up_content = process(citation["up_source_tokens"]).split(" ")
			sel_down_content =process( citation["down_source_tokens"]).split(" ")
			sel_target = process(citation["target_tokens"])
			citation_content_tokens = sel_up_content + sel_target.split(" ") +sel_down_content
			citation_content_dict[str(index)] = citation_content_tokens
			citation_target_dict[str(index)] = sel_target
			if citation['label'] == 1:
				ref_lis.append(index)
			index += 1

		pre_lis = getBm25TopSimilar(content_tokens, citation_content_dict, num=5)

		precision, recall ,MAP = cal_MAP(ref_lis, pre_lis)
		MAPS += MAP
		precisions += precision
		recalls += recall

	MAPS /= len(datas)
	precisions /= len(datas)
	recalls /= len(datas)
	print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
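getBm25TopSimilar is another undefined helper: it takes the tokenised context, a dict of candidate passages keyed by their string index, and returns the top-num candidate indices. A hedged stand-in built on the rank_bm25 package (the original may implement BM25 differently):

from rank_bm25 import BM25Okapi

def getBm25TopSimilar(query_tokens, candidate_dict, num=5):
    keys = list(candidate_dict.keys())
    corpus = [candidate_dict[k] for k in keys]
    bm25 = BM25Okapi(corpus)
    scores = bm25.get_scores(query_tokens)
    # rank candidates by BM25 score and return their integer indices
    ranked = sorted(range(len(keys)), key=lambda i: scores[i], reverse=True)
    return [int(keys[i]) for i in ranked[:num]]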
Example #24
def train(args):
    train_data = pickleload('../Retrieval/train_data/single_train_data.pkl',
                            "traindata")

    batch = Batch(args)
    # source_embedding = pickleload(args.source_emb_mat_pkl, "source_emb_mat_pkl")
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)

    train_batches = batch.train_batch(train_data, args.context_limit,
                                      args.num_epoches, args.batch_size)

    log_msg = "Input vocabulary size: %d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)

    model = Transformer(args, input_vec)

    if torch.cuda.is_available():
        model = model.cuda()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    parameters_trainable = list(
        filter(lambda p: p.requires_grad, model.parameters()))

    if args.optim == "Adadelta":
        optimizer = torch.optim.Adadelta(parameters_trainable,
                                         lr=args.learning_rate,
                                         weight_decay=args.init_weight_decay)
    elif args.optim == "Adam":
        optimizer = torch.optim.Adam(parameters_trainable,
                                     lr=args.learning_rate,
                                     weight_decay=args.init_weight_decay)
    elif args.optim == "SGD":
        optimizer = torch.optim.SGD(parameters_trainable,
                                    lr=args.learning_rate,
                                    weight_decay=args.init_weight_decay)

    if args.loadmodel == True:
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))

    # print parameters:
    log_msg = "Optimizer: %s \n learning rate: %s \n hidden size: %s\n model save name: %s \n" % (
        args.optim, args.learning_rate, args.d_model, args.modelName)
    # print("dropout:", args.dropout)
    logger.info(log_msg)
    print(log_msg)

    set_epoch = 0
    pbar = tqdm(total=len(train_data) * args.num_epoches // args.batch_size +
                1)

    loss_func = torch.nn.NLLLoss()
    print_loss_total = 0
    for train_step, (train_batch, epoch) in enumerate(train_batches):
        pbar.update(1)
        context_idxs = train_batch['context_idxs']
        seg_ids = train_batch['seg_indexs']
        target_indexs = train_batch['target_indexs']
        targets = train_batch['targets']
        labels = train_batch['labels']

        # print("up_context_idxs",up_context_idxs)
        # print("down_context_idxs",down_context_idxs)
        # print("target_idxs",target_idxs)
        # print("-----------------------------------------------------")

        context_mask = torch.Tensor(
            np.array(
                [list(map(function, xx)) for xx in context_idxs.data.numpy()],
                dtype=np.float)).cuda()

        context_idxs = Variable(context_idxs).cuda()
        seg_ids = Variable(seg_ids).cuda()
        targets = Variable(targets).cuda()
        labels = Variable(labels).cuda()

        out1, out2 = model.forward(context_idxs, seg_ids, context_mask,
                                   target_indexs)
        # Get loss
        optimizer.zero_grad()
        #out1:batch * num_target * word_vec
        #out2:batch * 2
        loss1 = 0
        for i in range(out1.size(0)):
            loss1 += loss_func(out1[i], targets[i])
        loss2 = loss_func(out2, labels)
        loss = loss1 / out1.size(0) + loss2
        # Backward propagation
        loss.backward()
        optimizer.step()
        loss_value = loss.data.item()
        print_loss_total += loss_value

        if train_step % 200 == 0:
            log_msg = 'Epoch: %d, Train_step %d  loss: %.4f' % (
                epoch, train_step, print_loss_total / 200)
            logger.debug(log_msg)
            print(log_msg)
            print_loss_total = 0
        if epoch == set_epoch:
            set_epoch += 1
            # save the model at the end of every epoch
            torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    pbar.close()
Example #25
def all_doubletrainKey(args):
    data = pickleload(
        '../Retrieval/train_data/small_pairs_random_train_data.pkl',
        "small_pairs_random_train_data")
    dev_data = pickleload("../data2/random_train_data.pkl", "dev_data")
    train_data = data[0] + data[1] + data[2] + data[3]
    dev_data = dev_data[len(dev_data) * 4 // 5:len(dev_data)]

    batch = Batch(args)
    # source_embedding = pickleload(args.source_emb_mat_pkl, "source_emb_mat_pkl")
    word2index = pickleload("./word_vec/word2index.pkl", "word2index.pkl")
    input_vec = len(word2index)

    train_batches = batch.double_train_batch(train_data, args.context_limit,
                                             args.num_epoches, args.batch_size)

    log_msg = "Input vocabulary size: %d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)

    transform = Transformer(args, input_vec)

    if torch.cuda.is_available():
        transform = transform.cuda()

    transform.load_state_dict(
        torch.load("./modelsave/" + "TransformModel0.pkl"))

    model = AllClassifyGetKeyWords(args, transform)

    model = model.cuda()
    if args.loadmodel == True:
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))
    # for param in model.parameters():
    #     param.data.uniform_(-0.08, 0.08)
    #     param.data.uniform_(-0.08, 0.08)

    parameters_trainable = list(
        filter(lambda p: p.requires_grad, model.parameters()))

    if args.optim == "Adadelta":
        optimizer = torch.optim.Adadelta(parameters_trainable,
                                         lr=args.learning_rate,
                                         weight_decay=args.init_weight_decay)
    elif args.optim == "Adam":
        optimizer = torch.optim.Adam(parameters_trainable,
                                     lr=args.learning_rate,
                                     weight_decay=args.init_weight_decay)
    elif args.optim == "SGD":
        optimizer = torch.optim.SGD(parameters_trainable,
                                    lr=args.learning_rate,
                                    weight_decay=args.init_weight_decay)

    if args.loadmodel == True:
        model.load_state_dict(torch.load("./modelsave/" + args.loadmodelName))
    # print parameters:
    log_msg = "Optimizer: %s \n learning rate: %s \n hidden size: %s\n model save name: %s \n" % (
        args.optim, args.learning_rate, args.d_model, args.modelName)
    # print("dropout:", args.dropout)
    logger.info(log_msg)
    print(log_msg)

    set_epoch = 1
    pbar = tqdm(total=len(train_data) * args.num_epoches // args.batch_size +
                1)

    def loss_func(high_out, low_out, seleout11, seleout12, seleout21,
                  seleout22):
        ones = torch.ones(high_out.size(0), 1).cuda()
        ones1 = 7 * torch.ones(high_out.size(0), 1).cuda()
        loss = torch.mean(ones - high_out + low_out) + torch.mean((ones1 - seleout11)*(ones1 - seleout11)) + torch.mean((ones1 - seleout12)*(ones1 - seleout12)) + \
               torch.mean((ones1 - seleout21)*(ones1 - seleout21)) + torch.mean((ones1 - seleout22)*(ones1 - seleout22))
        return F.relu(loss), torch.mean(ones - high_out + low_out)

    print_loss_total = 0
    old_accu = 0
    best_epoch = 0
    print_loss_total2 = 0
    for train_step, (train_batch, epoch) in enumerate(train_batches):
        pbar.update(1)
        high_context_idxs = train_batch['high_cit_context_idxs']
        high_seg_ids = train_batch['high_seg_indexs']
        low_context_idxs = train_batch['low_cit_context_idxs']
        low_seg_ids = train_batch['low_seg_indexs']
        high_source_context_idxs = train_batch['high_source_context_idxs']
        high_source_seg_indexs = train_batch['high_source_seg_indexs']
        low_source_context_idxs = train_batch['low_source_context_idxs']
        low_source_seg_indexs = train_batch['low_source_seg_indexs']

        high_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in high_context_idxs.data.numpy()
            ],
                     dtype=np.float)).cuda()
        low_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in low_context_idxs.data.numpy()
            ],
                     dtype=np.float)).cuda()
        high_source_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in high_source_context_idxs.data.numpy()
            ],
                     dtype=np.float)).cuda()
        low_source_context_mask = torch.Tensor(
            np.array([
                list(map(function, xx))
                for xx in low_source_context_idxs.data.numpy()
            ],
                     dtype=np.float)).cuda()

        high_context_idxs = Variable(high_context_idxs).cuda()
        high_seg_ids = Variable(high_seg_ids).cuda()
        low_context_idxs = Variable(low_context_idxs).cuda()
        low_seg_ids = Variable(low_seg_ids).cuda()
        high_source_context_idxs = Variable(high_source_context_idxs).cuda()
        high_source_seg_indexs = Variable(high_source_seg_indexs).cuda()
        low_source_context_idxs = Variable(low_source_context_idxs).cuda()
        low_source_seg_indexs = Variable(low_source_seg_indexs).cuda()

        out1, seleout11, seleout12 = model.forward(high_context_idxs,
                                                   high_seg_ids,
                                                   high_context_mask,
                                                   high_source_context_idxs,
                                                   high_source_seg_indexs,
                                                   high_source_context_mask)
        out2, seleout21, seleout22 = model.forward(low_context_idxs,
                                                   low_seg_ids,
                                                   low_context_mask,
                                                   low_source_context_idxs,
                                                   low_source_seg_indexs,
                                                   low_source_context_mask)
        # Get loss
        optimizer.zero_grad()
        #out1:batch * num_target * word_vec
        #out2:batch * 2
        loss, loss2 = loss_func(out1, out2, seleout11, seleout12, seleout21,
                                seleout22)
        # Backward propagation
        loss.backward()
        optimizer.step()
        loss_value = loss.data.item()
        print_loss_total += loss_value
        print_loss_total2 += loss2.data.item()
        del out1, out2
        if train_step % 100 == 0:
            log_msg = 'Epoch: %d, Train_step %d  loss1: %.4f, loss2:%.4f' % (
                epoch, train_step, print_loss_total / 100,
                print_loss_total2 / 100)
            logger.debug(log_msg)
            print(log_msg)
            print_loss_total = 0
            print_loss_total2 = 0
        if epoch == set_epoch:
            set_epoch += 1
            dev_batches = batch.dev_batch(dev_data, args.context_limit)
            result_dic = {}
            true_label_dic = {}
            for dev_step, dev_batch in enumerate(dev_batches):
                context_idxs = dev_batch['context_idxs']
                source_context_idxs = dev_batch['source_context_idxs']
                seg_indexs = dev_batch['seg_indexs']
                source_seg_indexs = dev_batch['source_seg_indexs']
                ref_labels = dev_batch['ref_labels']
                id = dev_batch['id']

                context_mask = torch.Tensor(
                    np.array([
                        list(map(function, xx))
                        for xx in context_idxs.data.numpy()
                    ],
                             dtype=np.float)).cuda()
                source_context_mask = torch.Tensor(
                    np.array([
                        list(map(function, xx))
                        for xx in source_context_idxs.data.numpy()
                    ],
                             dtype=np.float)).cuda()

                context_idxs = Variable(context_idxs).cuda()
                seg_indexs = Variable(seg_indexs).cuda()
                source_context_idxs = Variable(source_context_idxs).cuda()
                source_seg_indexs = Variable(source_seg_indexs).cuda()
                out, seleout1, seleout2 = model.forward(
                    context_idxs, seg_indexs, context_mask,
                    source_context_idxs, source_seg_indexs,
                    source_context_mask)
                # Get loss
                if id not in result_dic:
                    result_dic[id] = []
                    result_dic[id].append(out.cpu().data)
                    true_label_dic[id] = ref_labels
                else:
                    result_dic[id].append(out.cpu().data)
                del out
            picklesave(result_dic, "./modelsave/all_dev_result_dic22.pkl",
                       "./modelsave/result_dic.pkl")
            picklesave(true_label_dic,
                       "./modelsave/all_dev_true_label_dic22.pkl",
                       "./modelsave/true_label_dic.pkl")
            keys = result_dic.keys()
            MAPS = 0
            precisions = 0
            recalls = 0
            for key in keys:
                out = torch.cat(result_dic[key], dim=0)
                predict_index = torch.topk(out, 2,
                                           dim=0)[1].squeeze(1).data.numpy()
                # print("predicted labels:", predict_index)
                precision, recall, MAP = cal_MAP(true_label_dic[key],
                                                 predict_index)
                MAPS += MAP
                precisions += precision
                recalls += recall

            MAPS /= len(dev_data)
            precisions /= len(dev_data)
            recalls /= len(dev_data)
            all_loss = MAPS
            if all_loss > old_accu:
                old_accu = all_loss
                torch.save(model.state_dict(),
                           "./modelsave/max" + args.modelName)
                best_epoch = epoch
            # else:
            #     args.learning_rate = args.learning_rate / 2.0
            #     if args.learning_rate <= 1e-6:
            #         args.learning_rate = 1e-6
            #     if args.optim == "Adadelta":
            #         optimizer = torch.optim.Adadelta(parameters_trainable, lr=args.learning_rate,
            #                                          weight_decay=args.init_weight_decay)
            #     elif args.optim == "Adam":
            #         optimizer = torch.optim.Adam(parameters_trainable, lr=args.learning_rate,
            #                                      weight_decay=args.init_weight_decay)
            #     elif args.optim == "SGD":
            #         optimizer = torch.optim.SGD(parameters_trainable, lr=args.learning_rate,
            #                                     weight_decay=args.init_weight_decay)
            log_msg = '\nValidation MAP: %.4f  P: %.4f  R: %.4f\n epoch with the best score: %d' % (
                all_loss, precisions, recalls, best_epoch)
            logger.info(log_msg)
            print(log_msg)
            # save the model at the end of every epoch
            torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    torch.save(model.state_dict(), "./modelsave/" + args.modelName)
    pbar.close()
Example #26
def findSimilar():
    '''
    [
            {
             "citStr":"",  the cited author and year
             "context":"",  the whole citation passage
             "up_source":"",
             "down_source":"",
             "target":""
             "citations":[
                          citation0,
                          citation1,
                           ...
                          ]
            }
            ......

        ]
    Find similar citations
    :return:
    '''
    datas = pickleload("./data2/random_train_data.pkl",
                       "./data2/random_train_data.pkl")
    datas = datas[len(datas) * 4 // 5:len(datas)]
    idf_dic = pickleload("./data2/idf.pkl", "idf.pkl")
    # datas = datas[0:10]
    print(len(idf_dic))
    print(len(datas))
    count = 0

    MAPS = 0
    precisions = 0
    recalls = 0
    for data in tqdm(datas):
        up_source_tokens = process(data["up_source_tokens"])
        down_source_tokens = process(data["down_source_tokens"])
        target = process(data["target_tokens"])

        # score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        count += 1
        ref_lis = []
        for index in range(len(citations)):
            if citations[index]['label'] == 1:
                ref_lis.append(index)
            citation = citations[index]
            cit_up_source_tokens = process(citation["up_source_tokens"])
            cit_down_source_tokens = process(citation["down_source_tokens"])
            cit_target = process(citation["target_tokens"])
            score = getSVMScore(
                idf_dic, up_source_tokens, cit_up_source_tokens + " " +
                cit_target + " " + cit_down_source_tokens)
            scores.append(score)
        # print("scores:",scores)
        new_score = sorted(scores, reverse=True)
        pre_lis = []
        for i in range(3):
            pre_lis.append(scores.index(new_score[i]))
        # print("source context:", up_source_tokens + " " + down_source_tokens)
        # print("candidate:", citations[pre_lis[0]]["up_source_tokens"])
        # print("candidate:", citations[pre_lis[0]]["target_tokens"])
        # print("candidate:", citations[pre_lis[0]]["down_source_tokens"])
        # print("ref_lis",ref_lis)
        # print("pre_lis",pre_lis)
        precision, recall, MAP = cal_MAP(ref_lis, pre_lis)
        # print("precision:", precision)
        # print("recall:", recall)
        # print("MAP:", MAP)
        # print("-----------------------------------------------")
        MAPS += MAP
        precisions += precision
        recalls += recall

    MAPS /= len(datas)
    precisions /= len(datas)
    recalls /= len(datas)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
Example #27
def test(args):
    args.dropout = 0
    data = pickleload("../data2/random_train_data.pkl", "traindata")
    dev_data = data[len(data)*4//5:len(data)]
    # dev_data = data[2000: 4000]

    batch = Batch(args)
    word2index = pickleload(args.word2index_pkl, 'word2index')
    input_vec = len(word2index)
    source_embedding = pickleload("./word2vec/glove_300.pkl", "glove_300.pkl")
    source_embedding = np.array(source_embedding, dtype=np.float32)

    dev_batches = batch.dev_batch(dev_data, args.context_limit, args.citation_limit)

    log_msg = "Input vocabulary size: %d" % (input_vec)
    logger.info(log_msg)
    print(log_msg)

    if args.model == "MatchPyramid":
        model = MatchPyramid(args, input_vec, source_embedding)
    elif args.model == "LstmMatch":
        model = LstmMatch(args, input_vec, source_embedding)
    elif args.model == "Decomposable":
        model = Decomposable(args, input_vec, source_embedding)
    elif args.model == "Inference":
        model = Inference(args, input_vec, source_embedding)
    elif args.model == "ESIM":
        model = ESIM(args, input_vec, source_embedding)
    elif args.model == "ArcII":
        model = ArcII(args, input_vec, source_embedding)

    if args.loadmodel ==True:
        model.load_state_dict(torch.load("./modelsave/"+ args.loadmodelName))

    if torch.cuda.is_available():
        model = model.cuda()

    # print parameters:
    log_msg = "Model name: %s \n" % (args.loadmodelName)
    logger.info(log_msg)
    print(log_msg)

    pbar2 = tqdm(total=len(dev_data))
    MAPS = 0
    precisions = 0
    recalls = 0
    blues = 0
    rouges = 0
    for dev_step, dev_batch in enumerate(dev_batches):
        pbar2.update(1)
        context_idxs = dev_batch['context_idxs']
        cit_context_idxs = dev_batch['cit_context_idxs']
        ref_labels = dev_batch['ref_labels']
        target = dev_batch["targets"]
        citations = dev_batch['citations']
        context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in context_idxs.data.numpy()],
                     dtype=np.float)).cuda()
        cit_context_mask = torch.Tensor(
            np.array([list(map(function, xx)) for xx in cit_context_idxs.data.numpy()],
                     dtype=np.float)).cuda()

        context_idxs = Variable(context_idxs).cuda()
        cit_context_idxs = Variable(cit_context_idxs).cuda()

        out = model.forward(context_idxs, cit_context_idxs, context_mask, cit_context_mask)
        # Get loss
        # print("model output:", out)
        # print("true labels:", ref_labels)
        topn = 3
        predict_index = torch.topk(out,topn, dim=0)[1].squeeze(1).data.cpu().numpy()

        bleu = 0
        rouge = 0

        for index in predict_index:
            alternative_citation = citations[index]["target_tokens"]
            bleu += test_bleu(alternative_citation, target, 1)
            rouge += test_rouge(alternative_citation, target)
            # print("candidate citation:", alternative_citation)
        bleu = bleu / topn
        rouge = rouge / topn
        blues += bleu
        rouges += rouge

        # print("predicted labels:", predict_index)
        precision, recall, MAP = cal_MAP(ref_labels, predict_index)
        MAPS += MAP
        precisions += precision
        recalls += recall

    MAPS /= len(dev_data)
    precisions /= len(dev_data)
    recalls /= len(dev_data)
    blues /= len(dev_data)
    rouges /= len(dev_data)
    print("MAP:%.4f  P:%.4f  R:%.4f" % (MAPS, precisions, recalls))
    print("bleu", topn, ":", blues)
    print("rouge:", rouges)
    pbar2.close()
Example #28
def getSmallPairsTrainData():
    import random
    '''
    [
            {
             "citStr":"",  the cited author and year
             "context":"",  the whole citation passage
             "up_source_tokens":"",
             "down_source_tokens":"",
             "target_tokens":""
             "citations":[
                            {
                            "up_source_tokens":"",
                            "down_source_tokens":"",
                            "target_tokens":""
                            }
                           ...
                          ]
            }
            ......

        ]
    Find similar citations
    :return:
    '''
    datas = pickleload("../data2/train_data2.pkl", "./data2/train_data2.pkl")
    idf_dic = pickleload("../data2/idf.pkl", "idf.pkl")
    # datas = datas[len(datas)-1000:len(datas)]
    print(len(datas))
    train_datas = []
    train_datas2 = []
    train_spill = []
    q_id = 0
    for i in tqdm(range(len(datas))):
        data = datas[i]
        target = data_process(data["target_tokens"])
        # score each candidate citation
        citations = data["citations_tokens"]
        scores = []
        if len(target) < 50:
            continue
        for index in range(len(citations)):
            citation = citations[index]
            cit_target = data_process(citation["target_tokens"])
            if target == cit_target or len(cit_target) < 50:
                scores.append(0)
            else:
                score = getSVMScore(idf_dic, process_kuohao(target),
                                    process_kuohao(cit_target))
                scores.append(score)

        sorted_scores = sorted(scores, reverse=True)
        best_indexs = []
        for j in range(len(sorted_scores)):
            if sorted_scores[j] > 0.1 and j <= 5:
                best_index = scores.index(sorted_scores[j])
                best_indexs.append(best_index)
        if len(best_indexs) == len(citations):
            continue
        for best_index in best_indexs:
            train_data = {}
            train_data['up_source'] = data_process(data["up_source_tokens"])
            train_data['down_source'] = data_process(
                data["down_source_tokens"])
            train_data['target'] = data_process(data["target_tokens"])

            high_dic = {}
            high_dic['cit_up_source'] = data_process(
                citations[best_index]['up_source_tokens'])
            high_dic['cit_down_source'] = data_process(
                citations[best_index]['down_source_tokens'])
            high_dic['cit_target'] = data_process(
                citations[best_index]['target_tokens'])
            high_dic['bleu1_score'] = scores[best_index]

            # for k in range(len(best_indexs)):
            #     print("target:", train_data['target'])
            #     print("cit_target:", data_process(citations[best_indexs[k]]['target_tokens']))
            #     print("score:", sorted_scores[k])
            #     print("\n")
            # print(len(best_indexs), "  /   ", len(citations))
            # print("---------------------------------------------")
            low_index = random.randint(0, len(scores) - 1)
            while low_index in best_indexs:
                low_index = random.randint(0, len(scores) - 1)
            if scores[best_index] == scores[low_index] or scores[
                    best_index] == 1.0:
                continue
            low_dic = {}
            low_dic['cit_up_source'] = data_process(
                citations[low_index]['up_source_tokens'])
            low_dic['cit_down_source'] = data_process(
                citations[low_index]['down_source_tokens'])
            low_dic['cit_target'] = data_process(
                citations[low_index]['target_tokens'])
            low_dic['bleu1_score'] = scores[low_index]
            if low_dic['cit_target'] == train_data['target']:
                continue
            train_data['high_dic'] = high_dic
            train_data['low_dic'] = low_dic
            train_spill.append(train_data)

        if i in [
                len(datas) // 5,
                len(datas) * 2 // 5,
                len(datas) * 3 // 5,
                len(datas) * 4 // 5,
                len(datas) - 1
        ]:
            train_datas.append(train_spill)
            print(len(train_spill))
            train_spill = []

    print(len(train_datas))
    print(len(train_datas2))  #26933
    print("number of training samples:", len(train_datas))
    picklesave(train_datas, "./train_data/small_pairs_train_data.pkl",
               "small_pairs_train_data.pkl")
Example #29
def getMatchZooData():
    import random
    '''
    [
            {
             "citStr":"",  the cited author and year
             "context":"",  the whole citation passage
             "up_source_tokens":"",
             "down_source_tokens":"",
             "target_tokens":""
             "citations":[
                            {
                            "up_source_tokens":"",
                            "down_source_tokens":"",
                            "target_tokens":""
                            }
                           ...
                          ]
            }
            ......

        ]
    Find similar citations
    :return:
    '''
    datas = pickleload("../data2/random_train_data.pkl",
                       "./data2/random_train_data.pkl")
    word2index = pickleload("./word2vec/glove_word2index_300.pkl",
                            "./word2vec/glove_word2index_300.pkl")
    print(len(datas))
    q_id = 0
    for i in tqdm(range(len(datas))):
        data = datas[i]
        source_tokens = data_process(
            data["up_source_tokens"]) + " " + data_process(
                data["up_source_tokens"])
        # write out each candidate citation
        citations = data["citations_tokens"]
        writefile(
            './match_zoo_data/corpus_preprocessed.txt', "Q_" + str(q_id) +
            "\t250\t" + getSen_index(source_tokens, word2index) + "\n")

        d_id = 0
        for citation in citations:
            score = citation['label']
            citation_tokens = data_process(
                citation["up_source_tokens"]) + " " + data_process(
                    citation["target_tokens"]) + " " + data_process(
                        citation["up_source_tokens"])
            writefile(
                './match_zoo_data/corpus_preprocessed.txt',
                "Q_" + str(q_id) + "D_" + str(d_id) + "\t250\t" +
                getSen_index(citation_tokens, word2index) + "\n")
            if q_id < len(datas) * 4 // 5:
                writefile(
                    './match_zoo_data/relation_train.txt',
                    str(score) + "\t" + "Q_" + str(q_id) + "\t" + "Q_" +
                    str(q_id) + "D_" + str(d_id) + "\n")
            else:
                writefile(
                    './match_zoo_data/relation_test.txt',
                    str(score) + "\t" + "Q_" + str(q_id) + "\t" + "Q_" +
                    str(q_id) + "D_" + str(d_id) + "\n")
                writefile(
                    './match_zoo_data/relation_valid.txt',
                    str(score) + "\t" + "Q_" + str(q_id) + "\t" + "Q_" +
                    str(q_id) + "D_" + str(d_id) + "\n")
            d_id += 1
        q_id += 1
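getMatchZooData writes MatchZoo's corpus_preprocessed.txt through getSen_index, which is not shown. It presumably converts a whitespace-tokenised sentence into the space-separated word-id string MatchZoo expects; the unknown-word id of 1 below is an assumption about this project's vocabulary layout:

def getSen_index(sentence, word2index, unk_id=1):
    ids = [str(word2index.get(tok, unk_id)) for tok in sentence.split(" ") if tok]
    return " ".join(ids)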
Example #30
def countScores():
    datas = pickleload("./train_data/train_data.pkl",
                       "./train_data/train_data.pkl")
    for data in datas:
        score = data['bleu1_score']
        print(score)