Example #1
def word2vec_sim_en(f_tuple_list):
    print 'load word2vec model...'
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    model = KeyedVectors.load_word2vec_format(
        r'%s/%s' % (macro.DICT_DIR, 'GoogleNews-vectors-negative300.bin'),
        binary=True)
    # model = KeyedVectors.load_word2vec_format(r'%s/cn.skipgram.bin' % (macro.DICT_DIR), binary=True, unicode_errors='ignore')
    auto_sim_list = []
    count = 0
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        try:
            auto_sim = model.similarity(w1, w2)  # cosine similarity in [-1, 1]
            # map cosine similarity from [-1, 1] to a 1-10 score
            auto_sim = utils.convert_sim(auto_sim, mode=0)
        except:  # word pair not in the vocabulary
            auto_sim = -1
            count += 1
        print "w2v:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim,
                                                   auto_sim)
        auto_sim_list.append(auto_sim)

    print 'count=%s/%s' % (len(manu_sim_list) - count, len(manu_sim_list))
    print 'spearman=%0.5f/%0.5f' % (eval.spearman(
        manu_sim_list,
        auto_sim_list), eval.spearman(manu_sim_list, auto_sim_list, True))
    print 'pearson=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list,
        auto_sim_list), eval.pearson(manu_sim_list, auto_sim_list, True))

    return auto_sim_list
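
This example and several below rely on utils.convert_sim to map raw similarity values onto the 1-10 scale of the gold annotations; its implementation is not shown in these excerpts. A minimal sketch, assuming mode=0 rescales a cosine similarity in [-1, 1] and mode=1 rescales a [0, 1] similarity:

def convert_sim(sim, mode=0):
    # hypothetical stand-in for utils.convert_sim, not the project's actual code
    if mode == 0:
        # cosine similarity in [-1, 1] -> score in [1, 10]
        return (sim + 1.0) / 2.0 * 9.0 + 1.0
    # similarity in [0, 1] -> score in [1, 10]
    return sim * 9.0 + 1.0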
Example #2
def final_test_formal_run():
    # load the known word pairs; if a new evaluation pair is already among them, reuse its score directly
    id_list1, gordern_word1_list, gordern_word2_list, manu_sim_list, headline1 = utils.read2wordlist(
        [(macro.CORPUS_DIR, macro.NLPCC_DRY_FILE)], 'tag')
    # mode 'tag': the file carries gold-standard answers
    id_list, word1_list, word2_list, manu_sim_list2, headline2 = utils.read2wordlist(
        [(macro.CORPUS_DIR, macro.NLPCC_FML_GD_FILE)], 'tag')
    model = Word2Vec.load_word2vec_format(
        r'%s/%s' % (macro.MODELS_DIR, macro.DRY_EXT_MIX_BST_W2V_MODEL),
        binary=True)
    auto_sim_list = []
    for id, w1, w2 in zip(id_list, word1_list, word2_list):
        if w1 in gordern_word1_list and w2 == gordern_word2_list[
                gordern_word1_list.index(w1)]:
            auto_sim = manu_sim_list[gordern_word1_list.index(w1)]
            print 'found it in dry run data:::(%s\t%s\t%s)' % (w1, w2,
                                                               auto_sim)
        elif w2 in gordern_word1_list and w1 == gordern_word2_list[
                gordern_word1_list.index(w2)]:
            auto_sim = manu_sim_list[gordern_word1_list.index(w2)]
            print 'found it in dry run data:::(%s\t%s\t%s)' % (w1, w2,
                                                               auto_sim)
        else:
            try:
                auto_sim = model.similarity(w1, w2)  # cosine similarity in [-1, 1]
                if auto_sim <= 0:
                    auto_sim = 1.0
                else:
                    # map (0, 1] cosine similarity onto a (1, 10] score
                    auto_sim = auto_sim * 9 + 1
                # auto_sim = 0.5*(auto_sim+1)*10
                print '%-10s\t%-10s\t%-10s\t%-10s' % (id, w1, w2, auto_sim)
            except:  # word not in the vocabulary
                auto_sim = 1
                print '%-10s\t%-10s\t%-10s\t%-10s' % (id, w1, w2,
                                                      '______Not Found______')
        auto_sim_list.append(auto_sim)

    print eval.spearman(manu_sim_list2, auto_sim_list)
    # write the results to a file
    fw = open('%s/%s' % (macro.RESULTS_DIR, macro.FNL_FML_EXT_MIX_BST_RESULT),
              'w')
    fw.write(headline2)
    for id, w1, w2, auto_sim in zip(id_list, word1_list, word2_list,
                                    auto_sim_list):
        fw.write('%s\t%s\t%s\t%s\n' % (id.encode('utf-8'), w1.encode('utf-8'),
                                       w2.encode('utf-8'), auto_sim))
    print 'test_formal_run:::finished!'
    return
Example #3
    def calculate_sim(self, load_model, ofname, write_flag=True):
        # load the specified w2v model
        w2v_model = Word2Vec.load_word2vec_format(
            r'%s/%s' % (macro.MODELS_DIR, load_model), binary=True)  # C format
        # load the evaluation word-pair corpus
        id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(
            self.f_tuple_list, mode='tag')
        # new header line
        new_headline = headline.strip() + '\tPrediction\n'
        # compute the similarities
        auto_sim_list = []
        for id, w1, w2, manu_sim in zip(id_list, word1_list, word2_list,
                                        manu_sim_list):
            try:
                auto_sim = w2v_model.similarity(w1, w2)  # vector cosine similarity in [-1, 1]
                print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % (
                    id, w1, w2, manu_sim, auto_sim)
            except:  # out-of-vocabulary pair
                auto_sim = 0  # scored 0 before rescaling (kept distinct from a true 1.0)
                print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % (
                    id, w1, w2, manu_sim, '______Not Found______')
            auto_sim = utils.convert_sim(auto_sim, mode=1)  # map the similarity to a 1-10 score
            auto_sim_list.append(auto_sim)

        # optionally write the similarity results to a file
        if write_flag:
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw:
                fw.write(new_headline)
                for id, w1, w2, manu_sim, auto_sim in zip(
                        id_list, word1_list, word2_list, manu_sim_list,
                        auto_sim_list):
                    fw.write('%s\t%s\t%s\t%s\t%s\n' %
                             (str(id), w1.encode('utf-8'), w2.encode('utf-8'),
                              manu_sim, auto_sim))

        # evaluate the results
        r = eval.spearman(manu_sim_list, auto_sim_list)
        p = eval.pearson(manu_sim_list, auto_sim_list)
        print '!!!spearman=%s; pearson=%s' % (r, p)

        # visualize the results
        data = {
            'ID': id_list,
            'Word1': word1_list,
            'Word2': word2_list,
            'Score': manu_sim_list,
            'Prediction': auto_sim_list
        }

        frame = DataFrame(data)
        sns.jointplot("Score",
                      "Prediction",
                      frame,
                      kind='reg',
                      stat_func=eval.spearmanr)
        plt.savefig('%s/%s.jpg' % (macro.PICS_DIR, ofname))

        return word1_list, word2_list, manu_sim_list, auto_sim_list, new_headline
Example #4
def hnet_sim(f_tuple_list):
    '''
    bt_xiepeiyiVerb.dic: each line is one accompanying-sense ("xiepeiyi") verb.
    For every such verb, compute "current verb -- verb in glossary.dat" similarities.
    The number of similarity scores = (lines in bt_xiepeiyiVerb.dic) * (number of verbs in glossary.dat);
    all scores are sorted in descending order and the result is stored in result.txt.
    '''
    generatePlabel = False
    SIMILARITY = True
    BETA = [0.5, 0.2, 0.17, 0.13]
    GAMA = 0.2
    DELTA = 0.2
    ALFA = 1.6
    glossaryfile = '%s/%s' % (macro.DICT_DIR, macro.WN_GLOSS_DICT)
    xiepeiyidic = '%s/%s' % (macro.DICT_DIR, macro.WN_XPY_VERB_DICT)
    sememefile = '%s/%s' % (macro.DICT_DIR, macro.WN_WHOLE_DICT)

    if generatePlabel:
        lines = generateSourcefile(glossaryfile, xiepeiyidic)
        print('There are ' + str(len(lines)) + ' lines!!')

    if SIMILARITY:

        obj = WordSimilarity()

        if obj.init(sememefile, glossaryfile) == False:
            print("[ERROR] init failed!!")

        count = 0
        auto_sim_list = []
        idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
        for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
            auto_sim = obj.calc(w1.encode('utf-8'), w2.encode('utf-8'), BETA, GAMA, DELTA, ALFA)
            if auto_sim >= 0:
                # rescale from [0, 1] to [1, 10]
                auto_sim = utils.convert_sim(auto_sim, mode=1)
                count += 1
            else:
                auto_sim = -1
            auto_sim_list.append(auto_sim)
            print "hownet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim)
        print 'count=%s/%s' % (count, len(manu_sim_list))
        print 'spearman=%0.5f/%0.5f' % (eval.spearman(manu_sim_list, auto_sim_list), eval.spearman(manu_sim_list, auto_sim_list, True))
        print 'pearson=%0.5f/%0.5f' % (eval.pearson(manu_sim_list, auto_sim_list), eval.pearson(manu_sim_list, auto_sim_list, True))
        return auto_sim_list
Example #5
def run1(fname=macro.NLPCC_FML_FILE, ofname=macro.FML_CWORDNET_RESULT, flag=True):
    cmn = 'cmn'
    with open(r'%s/%s' % (macro.CORPUS_DIR, fname), 'r') as reader:
        wordlines = reader.readlines()

    manu_sim_list = []
    auto_sim_list = []

    # flag=False: score only the pairs found in WordNet; flag=True: also keep unfound pairs (scored with default_sim)
    count = 0
    default_sim = -1.0

    writer = open(r'%s/%s' % (macro.RESULTS_DIR, ofname), 'w')
    writer.write(wordlines[0].strip() + '\n')
    for wordline in wordlines[1:]:
        id, word1, word2, manu_sim = wordline.strip().split('\t')
        try:
            synsets1 = wn.synsets(word1.decode('utf-8'), lang=cmn)
            synsets2 = wn.synsets(word2.decode('utf-8'), lang=cmn)
            sim_tmp = []
            for synset1 in synsets1:
                for synset2 in synsets2:
                    score = synset1.path_similarity(synset2)
                    # score = synset1.wup_similarity(synset2)
                    # score = synset1.lch_similarity(synset2)
                    if score is None:
                        score = default_sim
                    sim_tmp.append(score)
            if sim_tmp:
                auto_sim = np.max(sim_tmp)
                # print sim_tmp
                count += 1
            else:
                auto_sim = default_sim

        except:  # lookup failed (word not in the Chinese WordNet)
            auto_sim = default_sim
            print 'word is not in list'
        if auto_sim >= 0 or flag:
            # auto_sim = utils.convert_sim(auto_sim, mode=1)
            manu_sim_list.append(float(manu_sim))
            auto_sim_list.append(auto_sim)
            print "process id= %s [%s,%s] %s %s" % (id, word1, word2, manu_sim, auto_sim)
        writer.write('%s\t%s\t%s\t%s\n' % (id, word1, word2,  str(auto_sim)))
    print 'found_pair=%s/%s' % (count, len(manu_sim_list))
    print 'pearson', eval.pearson(manu_sim_list, auto_sim_list)[0]
    print 'spearman', eval.spearman(manu_sim_list, auto_sim_list)[0]
    writer.close()
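
The core of Example #5 is taking the maximum path similarity over every synset pairing from the Open Multilingual WordNet. A stripped-down sketch of that step, assuming the NLTK WordNet and OMW corpora are installed (the helper name is illustrative, not from the project):

from nltk.corpus import wordnet as wn

def max_path_sim(word1, word2, lang='cmn'):
    # best path similarity over all synset pairs; -1.0 when nothing is found
    scores = [s1.path_similarity(s2)
              for s1 in wn.synsets(word1, lang=lang)
              for s2 in wn.synsets(word2, lang=lang)]
    scores = [s for s in scores if s is not None]
    return max(scores) if scores else -1.0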
    """
Example #6
def cwordnet_sim(f_tuple_list, cmn='cmn'):
    print 'load cwordnet_sim...'
    cwordnet_sim_list = []
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    count = 0
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        auto_sim = cwn_sim(w1, w2, cmn)
        # word pair found in the dictionary
        if auto_sim >= 0:
            count += 1
            # rescale to the 1-10 range
            auto_sim = utils.convert_sim(auto_sim, mode=1)
        else:
            # word pair not found in the dictionary
            auto_sim = -1
        print "cwordnet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim)
        cwordnet_sim_list.append(auto_sim)

    print 'count=%s/%s' % (count, len(manu_sim_list))
    print 'spearman=%0.5f/%0.5f' % (eval.spearman(manu_sim_list, cwordnet_sim_list), eval.spearman(manu_sim_list, cwordnet_sim_list, True))
    print 'pearson=%0.5f/%0.5f' % (eval.pearson(manu_sim_list, cwordnet_sim_list), eval.pearson(manu_sim_list, cwordnet_sim_list, True))
    return cwordnet_sim_list
Example #7
def compare():
    formal_pred_all_features = post.get_value_list(
        macro.CORPUS_DIR + '/features_golden_new.txt', [1, 1, 1, 1, 1, 1, 1])
    formal_pred_selected_features = post.get_value_list(
        macro.CORPUS_DIR + '/features_golden_new.txt', [0, 0, 0, 1, 0, 1, 0])
    dry_pred_all_features = post.get_value_list(
        macro.CORPUS_DIR + '/features_test.txt', [1, 1, 1, 1, 1, 1, 1])
    dry_pred_selected_features = post.get_value_list(
        macro.CORPUS_DIR + '/features_test.txt', [0, 0, 0, 1, 0, 1, 0])
    for result in dry_results:
        idl, w1l, w2l, scores, headline = utils.read2wordlist([
            (macro.RESULTS_DIR, result)
        ])
        print str(
            result) + ' vs dry_pred_all_features spearman: ', eval.spearman(
                dry_pred_all_features,
                scores)[0], 'pearson: ', eval.pearson(dry_pred_all_features,
                                                      scores)[0]

        print str(result) + ' vs dry_pred_selected_features spearman: ', eval.spearman(dry_pred_selected_features,
                                                                                       scores)[0], 'pearson: ', \
            eval.pearson(dry_pred_selected_features, scores)[0]

    for result in formal_results:
        idl, w1l, w2l, scores, headline = utils.read2wordlist([
            (macro.RESULTS_DIR, result)
        ])
        print str(
            result) + ' vs formal_pred_all_features spearman: ', eval.spearman(
                formal_pred_all_features,
                scores)[0], 'pearson: ', eval.pearson(formal_pred_all_features,
                                                      scores)[0]

        print str(result) + ' vs formal_pred_selected_features spearman: ', eval.spearman(formal_pred_selected_features,
                                                                                          scores)[0], 'pearson: ', \
            eval.pearson(formal_pred_selected_features, scores)[0]
Example #8
    def train_ext_vocab_choose_best(self, save_model, result_fname, last_val):
        # load the evaluation word pairs
        id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(self.f_tuple_list, mode='tag')

        # load the training corpus
        sentences = []
        for seg_docs_dir in self.seg_docs_dir_list:
            if type(seg_docs_dir) == tuple:
                sens = utils.atxt2sens(seg_docs_dir[0], seg_docs_dir[1])
            else:
                sens = utils.txts2sens(seg_docs_dir)
            sentences.extend(sens)

        # get the model: load a previously saved model OR train a new word-vector model
        if last_val == -2:
            print 'load previous model....'
            model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, save_model), binary=True)
        else:
            model = Word2Vec(sentences, sg=1, size=300, window=10, negative=0, hs=1, sample=1e-4, workers=8,
                             min_count=5)

        # score the similarities
        auto_sim_list = []
        for w1, w2, manu_sim in zip(word1_list, word2_list, manu_sim_list):
            try:
                auto_sim = model.similarity(w1, w2)  # cosine similarity, rescaled to a 1-10 score below
                auto_sim = utils.convert_sim(auto_sim)
                # print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, auto_sim)
            except:  # pair not found in the vocabulary
                auto_sim = 1  # mark not-found pairs with the integer 1 instead of 1.0
                print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, '______Not Found______')
            auto_sim_list.append(auto_sim)

        # keep the model only if val improves on last_val
        val = eval.spearman(manu_sim_list, auto_sim_list)
        if val > last_val:
            model.save_word2vec_format('%s/%s' % (macro.MODELS_DIR, save_model), binary=True)  # save the model
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, result_fname), 'w') as fw:
                fw.write(headline.strip() + '\tPrediction\n')
                for w1, w2, manu_sim, auto_sim in zip(word1_list, word2_list, manu_sim_list, auto_sim_list):
                    fw.write('%s\t%s\t%s\t%s\n' % (w1.encode('utf-8'), w2.encode('utf-8'), manu_sim, auto_sim))
        else:
            print ':::::::current val=', val
        return val
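
Word2Vec.load_word2vec_format and model.save_word2vec_format belong to the old gensim API and were deprecated and later removed in favor of KeyedVectors. A rough migration sketch for newer gensim versions, with toy data and file names that are assumptions rather than part of the project:

from gensim.models import KeyedVectors, Word2Vec

# training: in gensim >= 4.0 the dimension argument is vector_size (formerly size)
sentences = [[u'word1', u'word2', u'word3'], [u'word1', u'word3']]
model = Word2Vec(sentences, sg=1, vector_size=100, window=10, min_count=1, workers=2)

# the trained vectors live on model.wv; save/load them in the binary word2vec format
model.wv.save_word2vec_format('vectors.bin', binary=True)
kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
print(kv.similarity(u'word1', u'word2'))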
Example #9
def combine_zh_en():
    d = enchant.Dict('en_US')
    _, en_w1_list, en_w2_list, _, _ = utils.read2wordlist([(macro.CORPUS_DIR, 'en_'+macro.NLPCC_FML_FILE)], mode='tag')
    _, _, _, manu_sim_list, _ = utils.read2wordlist([(macro.CORPUS_DIR, macro.NLPCC_FML_FILE)],
                                                          mode='tag')

    # swap in the result file you want to improve here
    # id_list, w1_list, w2_list, manu_sim_list, auto_sim_list, headline = \
    #     utils.read2wordlist([(macro.RESULTS_DIR, macro.FML_ORG_BDNEWS_XIESO_RESULT)], mode='auto_tag')
    id_list, w1_list, w2_list,  auto_sim_list, headline = \
        utils.read2wordlist([(macro.RESULTS_DIR, 'lstm.result')], mode='tag')

    w2v_model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, macro.GOOGLE_EN_W2V_MODEL), binary=True)   # the English model

    fw2 = open(r'%s/%s' % (macro.RESULTS_DIR, macro.FML_ORG_GOOGLE_EN_W2V_RESULT), 'w')
    fw2.write(headline)

    new_auto_sim_list = []
    count = 0
    for id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim in \
            zip(id_list, w1_list, en_w1_list, w2_list, en_w2_list, manu_sim_list, auto_sim_list):
        # print id, '===='
        if d.check(trans_w1) and d.check(trans_w2):
            if len(trans_w1.split()) <= 1 and len(trans_w2.split()) <= 1:
                try:
                    auto_sim = w2v_model.similarity(trans_w1, trans_w2)
                    auto_sim = utils.convert_sim(auto_sim, mode=0)  # map cosine similarity to a 1-10 score
                    count += 1
                except:  # not in the English model's vocabulary; keep the existing auto_sim
                    pass
                print '%s\t%s[%s];%s[%s]\tmanu_sim=%s\tauto_sim=%s' % (id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim)
        new_auto_sim_list.append(float(auto_sim))
        line2 = '%s\t%s\t%s\t%s\t%s\n' % (id, trans_w1, trans_w2, manu_sim, auto_sim)
        fw2.write(line2.encode('utf-8'))
    fw2.close()
    # evaluate the results
    print 'count=', count
    r = eval.spearman(manu_sim_list, new_auto_sim_list)
    p = eval.pearson(manu_sim_list, new_auto_sim_list)
    print '!!!spearman=%s; pearson=%s' % (r, p)
Example #10
def cilin_sim(f_tuple_list):
    cs = loadCilin()
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    result1 = []
    result2 = []
    result3 = []
    count = 0
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        sim1 = cs.similarity(w1, w2)
        sim2 = cs.sim2013(w1, w2)
        sim3 = cs.sim2016(w1, w2)
        # word pair found in the Cilin dictionary
        if sim3 >= 0:
            count += 1
            # rescale to the 1-10 range
            sim1 = utils.convert_sim(sim1, mode=1)
            sim2 = utils.convert_sim(sim2, mode=1)
            sim3 = utils.convert_sim(sim3, mode=1)
        else:
            # pairs not found are treated as highly dissimilar
            sim1, sim2, sim3 = -1, -1, -1
        # collect the three scores
        result1.append(sim1)
        result2.append(sim2)
        result3.append(sim3)
        print "cilin:proc_id= %s [%s,%s] %s (%0.2f, %0.2f, %0.2f)" % (
            id, w1, w2, manu_sim, sim1, sim2, sim3)
    # statistics and evaluation
    print 'found_pair=%s/%s' % (count, len(manu_sim_list))
    print 'sim1: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result1), eval.pearson(
            manu_sim_list, result1, True), eval.spearman(
                manu_sim_list,
                result1), eval.spearman(manu_sim_list, result1, True))
    print 'sim2: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result2), eval.pearson(
            manu_sim_list, result2, True), eval.spearman(
                manu_sim_list,
                result2), eval.spearman(manu_sim_list, result2, True))
    print 'sim3: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result3), eval.pearson(
            manu_sim_list, result3, True), eval.spearman(
                manu_sim_list,
                result3), eval.spearman(manu_sim_list, result3, True))
    return (result1, result2, result3)
Example #11
        values.append(sum / count)
    return values


if __name__ == '__main__':

    golden_score = post.read_score(macro.CORPUS_DIR + '/500_2.csv')
    lst = [0, 0, 0, 0, 0, 0, 0]
    best = []
    i = 0
    max = 0
    while i < 127:
        add(lst)
        data = get_value_list(macro.CORPUS_DIR + '/features_golden_new.txt',
                              lst)
        sp = eval.spearman(data, golden_score)[0]
        if sp > max:
            max = sp
            best = lst
        # if sp > 0.3:
        #     dataset = {
        #         'cal_value': data,
        #         'goldern': golden_score
        #     }
        #     frame = DataFrame(dataset)
        #     sns.jointplot('goldern', 'cal_value', frame, kind='reg', stat_func=eval.spearmanr)
        #
        #     plt.xlim([1,10])
        #     plt.ylim([1,10])
        #     plt.savefig('%s/%s.png' %(macro.PICS_DIR,str(lst)))
        # print lst, sp
Example #12
from Com import macro
from Eval import eval
from Com import utils
import post
import merge

lst = [1] * 7

data = post.get_value_list(macro.CORPUS_DIR + '/features_golden_new.txt', lst)
max = 0
final_list = []
idl, w1l, w2l, score, headline = utils.read2wordlist([(macro.CORPUS_DIR,
                                                       '500_2.csv')])
f_c = macro.RESULTS_DIR + '/evatestdata3_goldern500_cilin.txt'
f_v = macro.RESULTS_DIR + '/fml_org_bdnews_xieso.result'

for mode in range(1, 13):
    score_m = merge.merge_2_list(f_v, f_c, mode)
    sp = eval.spearman(data, score_m)[0]
    pe = eval.pearson(data, score_m)[0]
    temp = score_m
    print macro.MODES[mode - 1], '\t', eval.spearman(
        score, score_m)[0], '\t', eval.pearson(score,
                                               score_m)[0], '\t', sp, '\t', pe
    # idl_p, w1l_p, w2l_p, score_p, headline_p = utils.read2wordlist([(macro.RESULTS_DIR,'best_without_lstm.txt')])

    # pred = merge.merge_2_list(macro.RESULTS_DIR+'/fml_google_en_w2v.result',f_c,mode=macro.MAX)
    # print eval.spearman(pred,score),eval.pearson(pred,score)

    # merge.merge(macro.RESULTS_DIR+'/fml_google_en_w2v.result',f_c,macro.RESULTS_DIR+'/best_without_lstm.txt',macro.MAX)
Example #13
def test_lstm(
        dim_proj=600,  # word embedding dimension and LSTM number of hidden units.
        n_words=100000,  # Vocabulary size
        # sgd, adadelta and rmsprop available, sgd very hard to use, not recommended (probably needs momentum and a decaying learning rate).
        maxlen=100,  # Sequences longer than this are ignored
        batch_size=16,  # The batch size during training.
        valid_batch_size=64,  # The batch size used for validation/test set.
        # Parameter for extra option
        noise_std=0.,
        # This frequently needs a bigger model.
        test_size=-1,  # If >0, we keep only this number of test examples.
        use_dropout=True,  # if False slightly faster, but worse test error
        part=1):
    start_time = time.time()
    idl, w1l, w2l, score_v, headline = utils.read2wordlist([
        (macro.RESULTS_DIR, 'fml_google_en_w2v.result')
    ])
    idl, w1l, w2l, score_goldern, headline = utils.read2wordlist([
        (macro.CORPUS_DIR, '500_2.csv')
    ])

    # Model options
    model_options = locals().copy()
    print("model options", model_options)
    load_data = prepare_input.load_data
    prepare_data = imdb.prepare_data
    print('Loading data')
    train, valid, test, dis_vecs_train, ids_train, dis_vecs_valid, ids_valid, dis_vecs_test, ids_test = load_data(
        n_words=n_words, valid_portion=0.05, maxlen=maxlen, part=part)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep randomly
        # sized examples, so we select a random subset of them.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])
        dis_vecs_test = [dis_vecs_test[n] for n in idx]
        ids_test = [ids_test[n] for n in idx]
    ydim = numpy.max(train[1])

    model_options['ydim'] = ydim + 1
    print('Loading model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params_empty()

    load_params('lstm_model.npz' + str(part) + '.npz', params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred, cost, d1,
     d2) = build_model(tparams, model_options)
    it = get_minibatches_idx(len(test[0]), valid_batch_size)
    probs = pred_probs(f_pred_prob,
                       prepare_data,
                       test,
                       it,
                       dis_vecs=dis_vecs_test)
    scores, id_dis = probs_2_score(probs, ids_test)
    new_score = combine(idl, id_dis, score_v, scores)
    out_file = codecs.open(
        macro.RESULTS_DIR + '/lstm_w2v' + str(part) + '.txt', 'w', 'utf-8')
    out_file.write('ID\tWord1\tWord2\tScore\t\r\n')
    for id, word1, word2, score in zip(idl, w1l, w2l, new_score):
        line = id + '\t' + word1 + '\t' + word2 + '\t' + str(score) + '\r\n'
        out_file.write(line)
    out_file.close()

    idl, w1l, w2l, score_old, headline = utils.read2wordlist([
        (macro.RESULTS_DIR, 'best_without_lstm.txt')
    ])
    f_c = macro.RESULTS_DIR + '/evatestdata3_goldern500_cilin.txt'
    f_v = macro.RESULTS_DIR + '/lstm_w2v' + str(part) + '.txt'
    last_score = merge.merge_2_list(f_v, f_c, macro.MAX)
    temp = eval.spearman(last_score, score_goldern)[0]
    print(eval.spearman(score_old, score_goldern)[0])
    print(temp)

    dataset = {'pred': last_score, 'goldern': score_goldern}
    frame = DataFrame(dataset)
    sns.jointplot('goldern',
                  'pred',
                  frame,
                  kind='reg',
                  stat_func=eval.spearmanr)

    plt.xlim([1, 10])
    plt.ylim([1, 10])
    plt.savefig('%s/%s.png' % (macro.PICS_DIR, ('lstm' + str(part))))
    end_time = time.time()
    print(('Testing took %.1fs' % (end_time - start_time)), file=sys.stderr)
    return last_score
Example #14
def test_2(part):
    pp = numpy.load('lstm_model.npz' + str(part) + '.npz')

    for kk, vv in pp.items():
        if kk == 'Wemb':
            return vv


if __name__ == '__main__':
    for i in range(1, 6):
        train_lstm(dim_proj=600,
                   n_words=100000,
                   max_epochs=100,
                   test_size=-1,
                   part=i)

    last_scores = []
    max_score = []

    for i in range(1, 6):
        last_scores.append(test_lstm(part=i))
    idl, w1l, w2l, score_goldern, headline = utils.read2wordlist([
        (macro.CORPUS_DIR, '500_2.csv')
    ])
    temp = last_scores[0]
    for s in last_scores[1:]:
        max_score = merge2max(temp, s)
        temp = max_score
    print('max_score: ', eval.spearman(max_score, score_goldern),
          eval.pearson(max_score, score_goldern))
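
merge2max, used above to fold the per-part score lists together, is defined elsewhere in the project; a hypothetical helper with the behavior the call sites imply (element-wise maximum of two equally long lists):

def merge2max(scores_a, scores_b):
    # illustrative sketch only: keep the larger of the two scores at each position
    return [max(a, b) for a, b in zip(scores_a, scores_b)]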
Example #15
def single_sims(f_tuple_list, ofname='single_sims'):
    pk_path = '%s/%s.pk' % (macro.RESULTS_DIR, ofname)
    if os.path.exists(pk_path):
        f = open(pk_path, 'rb')
        d = pk.load(f)
        f.close()
    else:
        idl, w1l, w2l, score, headline = utils.read2wordlist(f_tuple_list)
        cilin_sim_list1, cilin_sim_list2, cilin_sim_list3 = cilin_sim(
            f_tuple_list)
        hownet_sim_list = hnet_sim(f_tuple_list)
        cwordnet_sim_list = cwordnet_sim(f_tuple_list)
        w2v_sim_list = word2vec_sim(f_tuple_list)
        jcd_list, ovl_list, dice_list, pmi_list, ngd_list = ir_sim(
            f_tuple_list, '%s_ir_nums0.pk' % ofname)
        d = {
            'id': idl,
            'w1': w1l,
            'w2': w2l,
            'manu_sim': score,
            # 'cilin1': cilin_sim_list1,
            # 'cilin2': cilin_sim_list2,
            'cilin3': cilin_sim_list3,
            'hownet': hownet_sim_list,
            'wordnet': cwordnet_sim_list,
            'word2vec': w2v_sim_list,
            'jaccard': jcd_list,
            'overlap': ovl_list,
            'dice': dice_list,
            'pmi': pmi_list,
            # 'ngd': ngd_list
        }
        f = open(pk_path, 'wb')
        pk.dump(d, f)
        f.close()
    # names = ['id', 'w1', 'w2', 'manu_sim', 'cilin1', 'cilin2', 'cilin3',
    #          'hownet', 'wordnet', 'word2vec', 'jaccard', 'overlap', 'dice', 'pmi']
    names = [
        'id', 'w1', 'w2', 'manu_sim', 'cilin3', 'hownet', 'wordnet',
        'word2vec', 'jaccard', 'overlap', 'dice', 'pmi'
    ]
    df = pd.DataFrame(data=d, columns=names)
    # print df
    # evaluate the results
    from prettytable import PrettyTable
    # x = PrettyTable(["Eval", 'cilin1', 'cilin2', 'cilin3', 'hownet',
    #                  'wordnet', 'word2vec', 'jaccard', 'overlap', 'dice', 'pmi'])
    x = PrettyTable([
        "Eval", 'cilin3', 'hownet', 'wordnet', 'word2vec', 'jaccard',
        'overlap', 'dice', 'pmi'
    ])
    x.align["Eval"] = "l"
    x.padding_width = 1
    x.add_row([
        'Spearman',
        # '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.cilin1), eval.spearman(df.manu_sim, df.cilin1, True)),
        # '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.cilin2), eval.spearman(df.manu_sim, df.cilin2, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.cilin3),
                         eval.spearman(df.manu_sim, df.cilin3, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.hownet),
                         eval.spearman(df.manu_sim, df.hownet, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.wordnet),
                         eval.spearman(df.manu_sim, df.wordnet, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.word2vec),
                         eval.spearman(df.manu_sim, df.word2vec, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.jaccard),
                         eval.spearman(df.manu_sim, df.jaccard, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.overlap),
                         eval.spearman(df.manu_sim, df.overlap, True)),
        '%0.5f/%0.5f' % (eval.spearman(
            df.manu_sim, df.dice), eval.spearman(df.manu_sim, df.dice, True)),
        '%0.5f/%0.5f' % (eval.spearman(
            df.manu_sim, df.pmi), eval.spearman(df.manu_sim, df.pmi, True)),
    ])

    x.add_row([
        'Pearson',
        # '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.cilin1), eval.pearson(df.manu_sim, df.cilin1, True)),
        # '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.cilin2), eval.pearson(df.manu_sim, df.cilin2, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.cilin3),
                         eval.pearson(df.manu_sim, df.cilin3, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.hownet),
                         eval.pearson(df.manu_sim, df.hownet, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.wordnet),
                         eval.pearson(df.manu_sim, df.wordnet, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.word2vec),
                         eval.pearson(df.manu_sim, df.word2vec, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.jaccard),
                         eval.pearson(df.manu_sim, df.jaccard, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.overlap),
                         eval.pearson(df.manu_sim, df.overlap, True)),
        '%0.5f/%0.5f' % (eval.pearson(
            df.manu_sim, df.dice), eval.pearson(df.manu_sim, df.dice, True)),
        '%0.5f/%0.5f' % (eval.pearson(
            df.manu_sim, df.pmi), eval.pearson(df.manu_sim, df.pmi, True)),
    ])
    x.add_row([
        'Count',
        # '%s/%s' % (len(df.manu_sim) - list(df.cilin1).count(-1), len(df.manu_sim)),
        # '%s/%s' % (len(df.manu_sim) - list(df.cilin2).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.cilin3).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.hownet).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.wordnet).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.word2vec).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.jaccard).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.overlap).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.dice).count(-1), len(df.manu_sim)),
        '%s/%s' %
        (len(df.manu_sim) - list(df.pmi).count(-1), len(df.manu_sim)),
    ])
    print x
    df.to_csv('%s/%s.csv' % (macro.RESULTS_DIR, ofname), encoding='gbk')

    # linear combinations of the single measures
    df = df.replace(-1, 0)
    # max
    linear_mean_auto_sims = [row[4:].max() for row in df.values]
    print 'MAX: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))

    # min
    linear_mean_auto_sims = [row[4:].min() for row in df.values]
    print 'MIN: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))
    # mean
    linear_mean_auto_sims = [row[4:].mean() for row in df.values]
    print 'MEAN: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))

    # gmean
    df = df.replace(0, 1)

    linear_mean_auto_sims = [geometric_mean(row[4:]) for row in df.values]
    print 'GMEAN: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))
    return df
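
geometric_mean, used for the GMEAN combination above, is another helper whose definition is not part of this excerpt; a minimal stand-in with the usual definition (only valid for strictly positive scores, which is why the -1/0 placeholders are replaced first):

import numpy as np

def geometric_mean(values):
    # illustrative sketch: nth root of the product, computed in log space for stability
    arr = np.asarray(list(values), dtype=float)
    return float(np.exp(np.log(arr).mean()))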
Example #16
outfile = codecs.open(macro.RESULTS_DIR + '/fml_cilin.txt', 'w', 'utf-8')
outfile.write('\r\n')
for id, w1, w2 in zip(idl, w1l, w2l):
    sim1 = cs.similarity(w1, w2)
    sim2 = cs.sim2013(w1, w2)
    sim3 = cs.sim2016(w1, w2)
    outfile.write(id + '\t' + w1 + '\t' + w2 + '\t' + str(sim3) + '\r\n')
    if sim1 == -1:
        flags.append(0)
    else:
        flags.append(1)
    result1.append(sim1)
    result2.append(sim2)
    result3.append(sim3)
outfile.close()
print eval.spearman(score, result1)[0]
print eval.spearman(score, result2)[0]
print eval.spearman(score, result3)[0]
'''
Scores computed over all word pairs:
0.347925120242
0.352377437382
0.421492611614
'''
score_f = []
result1_f = []
result2_f = []
result3_f = []
for s, r1, r2, r3, flag in zip(score, result1, result2, result3, flags):
    if flag == 1:
        score_f.append(s)
Example #17
def ir_sim(f_tuple_list, ofname='NLPCC_Formal500_single_sims_ir_nums0.pk'):
    print 'ir sim ...'
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    nums_pk_path = '%s/%s' % (macro.RESULTS_DIR, ofname)
    if os.path.exists(nums_pk_path):
        print 'load nums...'
        f = open(nums_pk_path, 'rb')
        n1l, n2l, n3l = pickle.load(f)
        f.close()
    else:
        print 'retrieving hit counts...'
        n1l, n2l, n3l = get_nums_list(w1l, w2l)
        f = open(nums_pk_path, 'wb')
        pickle.dump((n1l, n2l, n3l), f)
        f.close()
    with open(nums_pk_path.split('.')[0]+'_nums.csv', 'w') as fw:
        for id, w1, w2, n1, n2, n3 in zip(idl, w1l, w2l, n1l, n2l, n3l):
            new_line = '%s,%s,%s,%s,%s,%s' % (id, w1, w2, n1, n2, n3)
            fw.write(new_line.encode('gbk')+'\n')

    N = pow(10, 16)  # assumed total number of indexed pages, used by PMI and NGD
    jcd_list, ovl_list, dice_list, pmi_list, ngd_list = [], [], [], [], []
    for num1, num2, num3, id, w1, w2, manu_sim in zip(n1l, n2l, n3l, idl, w1l, w2l, manu_sim_list):
        jcd = utils.convert_sim(web_jaccard(num1, num2, num3), mode=1)
        ovl = utils.convert_sim(web_overlap(num1, num2, num3), mode=1)
        dice = utils.convert_sim(web_dice(num1, num2, num3), mode=1)
        pmi = utils.convert_sim(web_pmi(num1, num2, num3, N), mode=1)
        ngd = utils.convert_sim(web_ngd(num1, num2, num3, N), mode=1)
        jcd_list.append(jcd)
        ovl_list.append(ovl)
        dice_list.append(dice)
        pmi_list.append(pmi)
        ngd_list.append(ngd)
        # print "ir:proc_id= %s [%s,%s] %s (%.5f, %.5f, %.5f, %.5f, %.5f) " % (id, w1, w2, manu_sim, jcd, ovl, dice, pmi, ngd)

    from prettytable import PrettyTable
    x = PrettyTable(["Eval", "jaccard", "overlap", "dice", "pmi", "ngd"])
    x.align["Eval"] = "l"
    x.padding_width = 1
    x.add_row(['Spearman',
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, jcd_list), eval.spearman(manu_sim_list, jcd_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ovl_list), eval.spearman(manu_sim_list, ovl_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, dice_list), eval.spearman(manu_sim_list, dice_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, pmi_list), eval.spearman(manu_sim_list, pmi_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ngd_list), eval.spearman(manu_sim_list, ngd_list, True))])
    x.add_row(['Pearson',
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, jcd_list), eval.pearson(manu_sim_list, jcd_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ovl_list), eval.pearson(manu_sim_list, ovl_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, dice_list), eval.pearson(manu_sim_list, dice_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, pmi_list), eval.pearson(manu_sim_list, pmi_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ngd_list), eval.pearson(manu_sim_list, ngd_list, True)),
               ])
    x.add_row(['Count',
               '%s/%s' % (len(manu_sim_list) - jcd_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - ovl_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - dice_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - pmi_list.count(-1), len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - ngd_list.count(-1), len(manu_sim_list)),
               ])
    print x

    return jcd_list, ovl_list, dice_list, pmi_list, ngd_list
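
web_jaccard, web_overlap, web_dice, web_pmi and web_ngd take the three page-hit counts (hits for w1, hits for w2, hits for both) plus, for the last two, the assumed index size N; their implementations are not included here. The sketches below follow the standard definitions of these web co-occurrence measures and are an assumption about what the helpers do, not the project's verified code:

import math

def web_jaccard(n1, n2, n12):
    # Jaccard coefficient over page counts
    return n12 / float(n1 + n2 - n12) if (n1 + n2 - n12) > 0 else -1

def web_overlap(n1, n2, n12):
    # overlap (Simpson) coefficient
    return n12 / float(min(n1, n2)) if min(n1, n2) > 0 else -1

def web_dice(n1, n2, n12):
    # Dice coefficient
    return 2.0 * n12 / (n1 + n2) if (n1 + n2) > 0 else -1

def web_pmi(n1, n2, n12, N):
    # pointwise mutual information over page counts, normalized by log2(N)
    if n1 == 0 or n2 == 0 or n12 == 0:
        return -1
    return math.log(n12 * float(N) / (n1 * n2), 2) / math.log(N, 2)

def web_ngd(n1, n2, n12, N):
    # normalized Google distance, flipped here so that larger means more similar
    if n1 == 0 or n2 == 0 or n12 == 0:
        return -1
    ngd = (max(math.log(n1), math.log(n2)) - math.log(n12)) / \
          (math.log(N) - min(math.log(n1), math.log(n2)))
    return max(0.0, 1.0 - ngd)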
Example #18
def cilin_run1():
    '''
    Three similarity methods are available:
    cs = CilinSimilarity()
    sim1 = cs.similarity(w1, w2)
    sim2 = cs.sim2013(w1, w2)
    sim3 = cs.sim2016(w1, w2)
    '''

    cs = loadCilin()
    # w1 = u'抄袭'
    # w2 = u'克隆'
    # code1 = cs.get_code(w1)
    # print w1, 'has codes:', code1
    # code2 = cs.get_code(w2)
    # print w2, 'has codes:', code2
    # sim = cs.similarity(w1, w2)
    # print w1, w2, 'final similarity:', sim
    idl, w1l, w2l, score, headline = utils.read2wordlist([
        (macro.CORPUS_DIR, macro.NLPCC_FML_FILE)
    ])
    result1 = []
    result2 = []
    result3 = []
    flags = []
    outfile = codecs.open(macro.RESULTS_DIR + '/fml_cilin.txt', 'w', 'utf-8')
    outfile.write('\r\n')
    for id, w1, w2 in zip(idl, w1l, w2l):
        sim1 = cs.similarity(w1, w2)
        sim2 = cs.sim2013(w1, w2)
        sim3 = cs.sim2016(w1, w2)
        outfile.write(id + '\t' + w1 + '\t' + w2 + '\t' + str(sim3) + '\r\n')
        if sim1 == -1:
            flags.append(0)
        else:
            flags.append(1)
        result1.append(sim1)
        result2.append(sim2)
        result3.append(sim3)
    outfile.close()
    print eval.spearman(score, result1)
    print eval.spearman(score, result2)
    print eval.spearman(score, result3)
    '''
    Scores computed over all word pairs:
    0.347925120242
    0.352377437382
    0.421492611614
    '''
    score_f = []
    result1_f = []
    result2_f = []
    result3_f = []
    for s, r1, r2, r3, flag in zip(score, result1, result2, result3, flags):
        if flag == 1:
            score_f.append(s)
            result1_f.append(r1)
            result2_f.append(r2)
            result3_f.append(r3)
    print '-------------------------'
    print len(score_f)
    print eval.spearman(score_f, result1_f)
    print eval.spearman(score_f, result2_f)
    print eval.spearman(score_f, result3_f)
    print eval.pearson(score_f, result3_f)
Example #19
    # score_m = read_score(macro.CORPUS_DIR+'/merge_result_extract.txt')
    # print golden_score
    # print score_m
    # print spearmanr(golden_score,score_m)
    # eval.spearman(golden_score,score_m)
    last_scores = []
    max_score = []
    #
    #
    for i in range(1, 6):
        last_scores.append(merge.merge_2_list(macro.RESULTS_DIR + '/lstm_w2v' + str(i) + '.txt', f_c, mode=macro.MAX))
    idl, w1l, w2l, score_goldern, headline = utils.read2wordlist([(macro.CORPUS_DIR, '500_2.csv')])
    temp = last_scores[0]
    for s in last_scores[1:]:
        max_score = merge2max(temp, s)
        temp = max_score
    print ('max_score: ', eval.spearman(max_score, score_goldern), eval.pearson(max_score, score_goldern))
    sss = small(max_score)
    print eval.spearman(sss, score_goldern)
    # dataset = {
    #     'pred': max_score,
    #     'goldern': score_goldern
    # }
    # frame = DataFrame(dataset)
    # sns.jointplot('goldern', 'pred', frame, kind='reg', stat_func=eval.spearman)
    #
    # plt.xlim([1, 10])
    # plt.ylim([1, 10])
    # plt.savefig('%s/%s.png' % (macro.PICS_DIR, ('cilin_w2v_trans_lstm_max')))
    # pass