def word2vec_sim_en(f_tuple_list): print 'load word2vec model...' idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list) model = KeyedVectors.load_word2vec_format( r'%s/%s' % (macro.DICT_DIR, 'GoogleNews-vectors-negative300.bin'), binary=True) # model = KeyedVectors.load_word2vec_format(r'%s/cn.skipgram.bin' % (macro.DICT_DIR), binary=True, unicode_errors='ignore') auto_sim_list = [] count = 0 for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list): try: auto_sim = model.similarity(w1, w2) # 将余弦相似度放到0-10得分 # 将余弦相似度-1~1放到1~10得分 auto_sim = utils.convert_sim(auto_sim, mode=0) except: auto_sim = -1 count += 1 print "w2v:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim) auto_sim_list.append(auto_sim) print 'count=%s/%s' % (len(manu_sim_list) - count, len(manu_sim_list)) print 'spearman=%0.5f/%0.5f' % (eval.spearman( manu_sim_list, auto_sim_list), eval.spearman(manu_sim_list, auto_sim_list, True)) print 'pearson=%0.5f/%0.5f' % (eval.pearson( manu_sim_list, auto_sim_list), eval.pearson(manu_sim_list, auto_sim_list, True)) return auto_sim_list
def calculate_sim(self, load_model, ofname, write_flag=True):
    """Score the evaluation word pairs with a stored word2vec model.

    Loads *load_model* from macro.MODELS_DIR, computes the cosine similarity
    for every pair in self.f_tuple_list, rescales it to a 1-10 score, and
    (optionally) writes the predictions to macro.RESULTS_DIR/*ofname*.
    Also prints Spearman/Pearson correlations and saves a regression joint
    plot to macro.PICS_DIR.

    :param load_model: model file name under macro.MODELS_DIR (C binary format).
    :param ofname: output result/plot file name.
    :param write_flag: when True, write the per-pair predictions to file.
    :return: (word1_list, word2_list, manu_sim_list, auto_sim_list, new_headline)
    """
    # Load the requested w2v model (C binary format).
    w2v_model = Word2Vec.load_word2vec_format(
        r'%s/%s' % (macro.MODELS_DIR, load_model), binary=True)  # C format
    # Read the evaluation word-pair corpus.
    id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(
        self.f_tuple_list, mode='tag')
    # New header line for the output file.
    new_headline = headline.strip() + '\tPrediction\n'
    # Compute similarities.
    auto_sim_list = []
    for id, w1, w2, manu_sim in zip(id_list, word1_list, word2_list,
                                    manu_sim_list):
        try:
            auto_sim = w2v_model.similarity(w1, w2)  # vector cosine similarity in [-1, 1]
            print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % (
                id, w1, w2, manu_sim, auto_sim)
        except:
            # NOTE(review): original comment said "assign 1 to distinguish
            # from 1.0" for OOV words, but 0 is assigned here — confirm intent.
            auto_sim = 0
            print '%-10s\t%-10s\t%-10s\t%-10s\t%-10s' % (
                id, w1, w2, manu_sim, '______Not Found______')
        auto_sim = utils.convert_sim(auto_sim, mode=1)  # map cosine similarity onto the 1-10 scale
        auto_sim_list.append(auto_sim)
    # Optionally write the per-pair similarity results to a file.
    if write_flag:
        print 'write result to file...'
        with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw:
            fw.write(new_headline)
            for id, w1, w2, manu_sim, auto_sim in zip(
                    id_list, word1_list, word2_list, manu_sim_list,
                    auto_sim_list):
                fw.write('%s\t%s\t%s\t%s\t%s\n' %
                         (str(id), w1.encode('utf-8'), w2.encode('utf-8'),
                          manu_sim, auto_sim))
    # Evaluate the results.
    r = eval.spearman(manu_sim_list, auto_sim_list)
    p = eval.pearson(manu_sim_list, auto_sim_list)
    print '!!!spearman=%s; pearson=%s' % (r, p)
    # Visualize the results as a regression joint plot.
    data = {
        'ID': id_list,
        'Word1': word1_list,
        'Word2': word2_list,
        'Score': manu_sim_list,
        'Prediction': auto_sim_list
    }
    frame = DataFrame(data)
    sns.jointplot("Score", "Prediction", frame, kind='reg',
                  stat_func=eval.spearmanr)
    plt.savefig('%s/%s.jpg' % (macro.PICS_DIR, ofname))
    return word1_list, word2_list, manu_sim_list, auto_sim_list, new_headline
def hnet_sim(f_tuple_list):
    '''HowNet-based similarity for each word pair.

    Original (Chinese) notes, translated: bt_xiepeiyiVerb.dic holds one
    "accompanying-sense" verb per line.  For each such verb, similarities
    against every verb in glossary.dat are computed, i.e. the number of
    similarities equals #lines(bt_xiepeiyiVerb.dic) * #verbs(glossary.dat);
    all similarities are sorted in descending order into result.txt.

    Returns the list of automatic scores (1-10 scale, or -1 when HowNet has
    no answer), aligned with the pairs read from *f_tuple_list*.
    '''
    generatePlabel = False
    SIMILARITY = True
    # Tuning weights of the HowNet similarity measure.
    BETA = [0.5, 0.2, 0.17, 0.13]
    GAMA = 0.2
    DELTA = 0.2
    ALFA = 1.6
    glossaryfile = '%s/%s' % (macro.DICT_DIR, macro.WN_GLOSS_DICT)
    xiepeiyidic = '%s/%s' % (macro.DICT_DIR, macro.WN_XPY_VERB_DICT)
    sememefile = '%s/%s' % (macro.DICT_DIR, macro.WN_WHOLE_DICT)
    if generatePlabel:
        lines = generateSourcefile(glossaryfile, xiepeiyidic)
        print('There are ' + str(len(lines)) + ' lines!!')
    if SIMILARITY:
        obj = WordSimilarity()
        if obj.init(sememefile, glossaryfile) == False:
            print("[ERROR] init failed!!")
        count = 0  # number of pairs HowNet could score
        auto_sim_list = []
        idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
        for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
            auto_sim = obj.calc(w1.encode('utf-8'), w2.encode('utf-8'), BETA,
                                GAMA, DELTA, ALFA)
            if auto_sim >= 0:
                # Rescale from [0, 1] onto the 1-10 range.
                auto_sim = utils.convert_sim(auto_sim, mode=1)
                count += 1
            else:
                auto_sim = -1
            auto_sim_list.append(auto_sim)
            print "hownet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2,
                                                          manu_sim, auto_sim)
        print 'count=%s/%s' % (count, len(manu_sim_list))
        print 'spearman=%0.5f/%0.5f' % (eval.spearman(
            manu_sim_list, auto_sim_list), eval.spearman(
                manu_sim_list, auto_sim_list, True))
        print 'pearson=%0.5f/%0.5f' % (eval.pearson(
            manu_sim_list, auto_sim_list), eval.pearson(
                manu_sim_list, auto_sim_list, True))
        return auto_sim_list
def run1(fname=macro.NLPCC_FML_FILE, ofname=macro.FML_CWORDNET_RESULT,
         flag=True):
    """Score word pairs with Chinese WordNet (NLTK, Open Multilingual WordNet).

    For each pair, the maximum path_similarity over all synset combinations
    is taken as the automatic score.  Results are written to
    macro.RESULTS_DIR/*ofname* and Pearson/Spearman correlations are printed.

    :param fname: input corpus file under macro.CORPUS_DIR.
    :param ofname: output result file under macro.RESULTS_DIR.
    :param flag: False -> only evaluate pairs that were found;
                 True  -> unfound pairs keep the default score.
    """
    cmn = 'cmn'  # Open Multilingual WordNet language code for Chinese
    with open(r'%s/%s' % (macro.CORPUS_DIR, fname), 'r') as reader:
        wordlines = reader.readlines()
    manu_sim_list = []
    auto_sim_list = []
    # flag=False: score only the pairs that were found; True: unfound pairs
    # receive the default score.
    count = 0  # number of pairs with at least one synset-pair score
    default_sim = -1.0
    # NOTE(review): writer is not closed if an exception escapes the loop;
    # a `with` block would be safer.
    writer = open(r'%s/%s' % (macro.RESULTS_DIR, ofname), 'w')
    writer.write(wordlines[0].strip() + '\n')
    for wordline in wordlines[1:]:
        id, word1, word2, manu_sim = wordline.strip().split('\t')
        try:
            synsets1 = wn.synsets(word1.decode('utf-8'), lang=cmn)
            synsets2 = wn.synsets(word2.decode('utf-8'), lang=cmn)
            sim_tmp = []
            # Score every synset combination; keep the maximum.
            for synset1 in synsets1:
                for synset2 in synsets2:
                    score = synset1.path_similarity(synset2)
                    # score = synset1.wup_similarity(synset2)
                    # score = synset1.lch_similarity(synset2)
                    if score is not None:
                        pass
                    else:
                        # path_similarity returns None for unrelated POS.
                        score = default_sim
                    sim_tmp.append(score)
            if sim_tmp:
                auto_sim = np.max(sim_tmp)
                # print sim_tmp
                count += 1
            else:
                auto_sim = default_sim
        except:
            # NOTE(review): bare except — presumably guards decode/lookup
            # errors; confirm narrower exception types.
            auto_sim = default_sim
            print 'word is not in list'
        if auto_sim >= 0 or flag:
            # auto_sim = utils.convert_sim(auto_sim, mode=1)
            manu_sim_list.append(float(manu_sim))
            auto_sim_list.append(auto_sim)
            print "process id= %s [%s,%s] %s %s" % (id, word1, word2,
                                                    manu_sim, auto_sim)
            writer.write('%s\t%s\t%s\t%s\n' % (id, word1, word2,
                                               str(auto_sim)))
    print 'found_pair=%s/%s' % (count, len(manu_sim_list))
    print 'pearson', eval.pearson(manu_sim_list, auto_sim_list)[0]
    print 'spearman', eval.spearman(manu_sim_list, auto_sim_list)[0]
    writer.close()


# NOTE(review): the triple-quote below opens a commented-out region that
# continues past this block; kept as-is.
"""
def cwordnet_sim(f_tuple_list, cmn='cmn'): print 'load cwordnet_sim...' cwordnet_sim_list = [] idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list) count = 0 for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list): auto_sim = cwn_sim(w1, w2, cmn) # 字典中查找到的词 if auto_sim >= 0: count += 1 # 分制转成1-10 auto_sim = utils.convert_sim(auto_sim, mode=1) else: pass # 未查找到的词 auto_sim = -1 print "cwordnet:proc_id= %s [%s,%s] %s %.2f" % (id, w1, w2, manu_sim, auto_sim) cwordnet_sim_list.append(auto_sim) print 'count=%s/%s' % (count, len(manu_sim_list)) print 'spearman=%0.5f/%0.5f' % (eval.spearman(manu_sim_list, cwordnet_sim_list), eval.spearman(manu_sim_list, cwordnet_sim_list, True)) print 'pearson=%0.5f/%0.5f' % (eval.pearson(manu_sim_list, cwordnet_sim_list), eval.pearson(manu_sim_list, cwordnet_sim_list, True)) return cwordnet_sim_list
def compare(): formal_pred_all_features = post.get_value_list( macro.CORPUS_DIR + '/features_golden_new.txt', [1, 1, 1, 1, 1, 1, 1]) formal_pred_selected_features = post.get_value_list( macro.CORPUS_DIR + '/features_golden_new.txt', [0, 0, 0, 1, 0, 1, 0]) dry_pred_all_features = post.get_value_list( macro.CORPUS_DIR + '/features_test.txt', [1, 1, 1, 1, 1, 1, 1]) dry_pred_selected_features = post.get_value_list( macro.CORPUS_DIR + '/features_test.txt', [0, 0, 0, 1, 0, 1, 0]) for result in dry_results: idl, w1l, w2l, scores, headline = utils.read2wordlist([ (macro.RESULTS_DIR, result) ]) print str( result) + ' vs dry_pred_all_featuers spearman: ', eval.spearman( dry_pred_all_features, scores)[0], 'pearson: ', eval.pearson(dry_pred_all_features, scores)[0] print str(result) + ' vs dry_pred_selected_featuers spearman: ', eval.spearman(dry_pred_selected_features, scores)[0], 'pearson: ', \ eval.pearson(dry_pred_selected_features, scores)[0] for result in formal_results: idl, w1l, w2l, scores, headline = utils.read2wordlist([ (macro.RESULTS_DIR, result) ]) print str( result) + ' vs formal_pred_all_featuers spearman: ', eval.spearman( formal_pred_all_features, scores)[0], 'pearson: ', eval.pearson(formal_pred_all_features, scores)[0] print str(result) + ' vs formal_pred_selected_featuers spearman: ', eval.spearman(formal_pred_selected_features, scores)[0], 'pearson: ', \ eval.pearson(formal_pred_selected_features, scores)[0]
def combine_zh_en():
    """Boost Chinese similarity results with English word2vec scores.

    For each pair, if both machine-translated English words pass the
    en_US spell check and are single tokens, the base score (read from the
    result file) is replaced by the GoogleNews word2vec similarity rescaled
    to 1-10; otherwise the base score is kept.  The combined predictions are
    written to macro.FML_ORG_GOOGLE_EN_W2V_RESULT and Spearman/Pearson
    correlations against the manual scores are printed.
    """
    d = enchant.Dict('en_US')
    # English translations of the word pairs.
    _, en_w1_list, en_w2_list, _, _ = utils.read2wordlist(
        [(macro.CORPUS_DIR, 'en_' + macro.NLPCC_FML_FILE)], mode='tag')
    # Manual (gold) scores.
    _, _, _, manu_sim_list, _ = utils.read2wordlist(
        [(macro.CORPUS_DIR, macro.NLPCC_FML_FILE)], mode='tag')
    # Swap in the result file to be boosted here.
    # id_list, w1_list, w2_list, manu_sim_list, auto_sim_list, headline = \
    #     utils.read2wordlist([(macro.RESULTS_DIR, macro.FML_ORG_BDNEWS_XIESO_RESULT)], mode='auto_tag')
    id_list, w1_list, w2_list, auto_sim_list, headline = \
        utils.read2wordlist([(macro.RESULTS_DIR, 'lstm.result')], mode='tag')
    w2v_model = Word2Vec.load_word2vec_format(
        r'%s/%s' % (macro.MODELS_DIR, macro.GOOGLE_EN_W2V_MODEL),
        binary=True)  # the English model
    fw2 = open(r'%s/%s' % (macro.RESULTS_DIR,
                           macro.FML_ORG_GOOGLE_EN_W2V_RESULT), 'w')
    fw2.write(headline)
    new_auto_sim_list = []
    count = 0  # pairs actually re-scored with the English model
    for id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim in \
            zip(id_list, w1_list, en_w1_list, w2_list, en_w2_list,
                manu_sim_list, auto_sim_list):
        # print id, '===='
        # Only trust the translation when both words are valid single-token
        # English words.
        if d.check(trans_w1) and d.check(trans_w2):
            if len(trans_w1.split()) <= 1 and len(trans_w2.split()) <= 1:
                try:
                    auto_sim = w2v_model.similarity(trans_w1, trans_w2)
                    auto_sim = utils.convert_sim(auto_sim, mode=0)  # map cosine similarity onto the 1-10 scale
                    count += 1
                except:
                    # Best-effort: keep the original score for OOV words.
                    pass
        print '%s\t%s[%s];%s[%s]\tmanu_sim=%s\tauto_sim=%s' % (
            id, w1, trans_w1, w2, trans_w2, manu_sim, auto_sim)
        new_auto_sim_list.append(float(auto_sim))
        line2 = '%s\t%s\t%s\t%s\t%s\n' % (id, trans_w1, trans_w2, manu_sim,
                                          auto_sim)
        fw2.write(line2.encode('utf-8'))
    fw2.close()
    # Evaluate the results.
    print 'count=', count
    r = eval.spearman(manu_sim_list, new_auto_sim_list)
    p = eval.pearson(manu_sim_list, new_auto_sim_list)
    print '!!!spearman=%s; pearson=%s' % (r, p)
def cilin_sim(f_tuple_list):
    """Cilin (synonym thesaurus) similarity for each word pair.

    Three Cilin variants are computed per pair (similarity, sim2013,
    sim2016); found pairs are rescaled to 1-10, unfound pairs get -1 in all
    three lists.  Prints per-variant Pearson/Spearman correlations.

    :param f_tuple_list: (dir, filename) tuples for utils.read2wordlist.
    :return: (result1, result2, result3) score lists, one per variant.
    """
    cs = loadCilin()
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    result1 = []
    result2 = []
    result3 = []
    count = 0  # pairs found in the thesaurus
    for id, w1, w2, manu_sim in zip(idl, w1l, w2l, manu_sim_list):
        sim1 = cs.similarity(w1, w2)
        sim2 = cs.sim2013(w1, w2)
        sim3 = cs.sim2016(w1, w2)
        # Pair found in the dictionary.
        if sim3 >= 0:
            count += 1
            # Rescale onto the 1-10 range.
            sim1 = utils.convert_sim(sim1, mode=1)
            sim2 = utils.convert_sim(sim2, mode=1)
            sim3 = utils.convert_sim(sim3, mode=1)
        else:
            pass
            # Pair not found: treat the similarity as very low.
            sim1, sim2, sim3 = -1, -1, -1
        # push
        result1.append(sim1)
        result2.append(sim2)
        result3.append(sim3)
        print "cilin:proc_id= %s [%s,%s] %s (%0.2f, %0.2f, %0.2f)" % (
            id, w1, w2, manu_sim, sim1, sim2, sim3)
    # Statistics and evaluation.
    print 'found_pair=%s/%s' % (count, len(manu_sim_list))
    print 'sim1: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result1), eval.pearson(
            manu_sim_list, result1, True), eval.spearman(
                manu_sim_list, result1), eval.spearman(manu_sim_list, result1,
                                                       True))
    print 'sim2: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result2), eval.pearson(
            manu_sim_list, result2, True), eval.spearman(
                manu_sim_list, result2), eval.spearman(manu_sim_list, result2,
                                                       True))
    print 'sim3: pearson=%0.5f/%0.5f; spearman=%0.5f/%0.5f' % (eval.pearson(
        manu_sim_list, result3), eval.pearson(
            manu_sim_list, result3, True), eval.spearman(
                manu_sim_list, result3), eval.spearman(manu_sim_list, result3,
                                                       True))
    return (result1, result2, result3)
from Com import macro
from Eval import eval
from Com import utils
import post
import merge

# Script: sweep all merge modes for combining the word2vec result with the
# Cilin result and print correlations against both the golden feature scores
# and the manual scores.
lst = [1] * 7  # use all seven features
data = post.get_value_list(macro.CORPUS_DIR + '/features_golden_new.txt', lst)
max = 0  # NOTE(review): shadows the builtin `max`; appears unused below
final_list = []  # NOTE(review): appears unused below
idl, w1l, w2l, score, headline = utils.read2wordlist([(macro.CORPUS_DIR,
                                                       '500_2.csv')])
f_c = macro.RESULTS_DIR + '/evatestdata3_goldern500_cilin.txt'
f_v = macro.RESULTS_DIR + '/fml_org_bdnews_xieso.result'
for mode in range(1, 13):
    score_m = merge.merge_2_list(f_v, f_c, mode)
    sp = eval.spearman(data, score_m)[0]
    pe = eval.pearson(data, score_m)[0]
    temp = score_m  # NOTE(review): overwritten each iteration, never read
    # Columns: mode name | spearman vs manual | pearson vs manual |
    #          spearman vs features | pearson vs features
    print macro.MODES[mode - 1], '\t', eval.spearman(
        score, score_m)[0], '\t', eval.pearson(score,
                                               score_m)[0], '\t', sp, '\t', pe
# idl_p, w1l_p, w2l_p, score_p, headline_p = utils.read2wordlist([(macro.RESULTS_DIR,'best_without_lstm.txt')])
# pred = merge.merge_2_list(macro.RESULTS_DIR+'/fml_google_en_w2v.result',f_c,mode=macro.MAX)
# print eval.spearman(pred,score),eval.pearson(pred,score)
# merge.merge(macro.RESULTS_DIR+'/fml_google_en_w2v.result',f_c,macro.RESULTS_DIR+'/best_without_lstm.txt',macro.MAX)
def test_2(part):
    """Return the 'Wemb' embedding matrix saved in the checkpoint for *part*.

    Returns None implicitly when the archive has no 'Wemb' entry.
    """
    # NOTE(review): the file name contains '.npz' twice
    # ('lstm_model.npz<part>.npz') — confirm this matches the saved files.
    pp = numpy.load('lstm_model.npz' + str(part) + '.npz')
    for kk, vv in pp.items():
        if kk == 'Wemb':
            return vv


if __name__ == '__main__':
    # Train five LSTM folds, score each, then merge the five score lists
    # element-wise by maximum and evaluate against the golden scores.
    for i in range(1, 6):
        train_lstm(dim_proj=600,
                   n_words=100000,
                   max_epochs=100,
                   test_size=-1,
                   part=i)
    last_scores = []
    max_score = []
    for i in range(1, 6):
        last_scores.append(test_lstm(part=i))
    idl, w1l, w2l, score_goldern, headline = utils.read2wordlist([
        (macro.CORPUS_DIR, '500_2.csv')
    ])
    # Fold the five score lists into one via pairwise element-wise max.
    temp = last_scores[0]
    for s in last_scores[1:]:
        max_score = merge2max(temp, s)
        temp = max_score
    print('max_score: ', eval.spearman(max_score, score_goldern),
          eval.pearson(max_score, score_goldern))
def single_sims(f_tuple_list, ofname='single_sims'):
    """Compute (or load from cache) every individual similarity measure.

    Builds a table with one column per measure (cilin3, hownet, wordnet,
    word2vec, jaccard, overlap, dice, pmi), caches it as a pickle under
    macro.RESULTS_DIR, prints Spearman/Pearson/coverage per measure, writes
    the table to CSV, and finally prints correlations for simple max / min /
    mean / geometric-mean combinations of the measures.

    :param f_tuple_list: (dir, filename) tuples for utils.read2wordlist.
    :param ofname: base name for the cache pickle and CSV output.
    :return: the pandas DataFrame (after the -1 -> 0 and 0 -> 1 replacements
        applied for the combination experiments).
    """
    pk_path = '%s/%s.pk' % (macro.RESULTS_DIR, ofname)
    if os.path.exists(pk_path):
        # Cached measures available: load them.
        f = open(pk_path, 'rb')
        d = pk.load(f)
        f.close()
    else:
        # Compute every measure from scratch and cache the result.
        idl, w1l, w2l, score, headline = utils.read2wordlist(f_tuple_list)
        cilin_sim_list1, cilin_sim_list2, cilin_sim_list3 = cilin_sim(
            f_tuple_list)
        hownet_sim_list = hnet_sim(f_tuple_list)
        cwordnet_sim_list = cwordnet_sim(f_tuple_list)
        w2v_sim_list = word2vec_sim(f_tuple_list)
        jcd_list, ovl_list, dice_list, pmi_list, ngd_list = ir_sim(
            f_tuple_list, '%s_ir_nums0.pk' % ofname)
        d = {
            'id': idl,
            'w1': w1l,
            'w2': w2l,
            'manu_sim': score,
            # 'cilin1': cilin_sim_list1,
            # 'cilin2': cilin_sim_list2,
            'cilin3': cilin_sim_list3,
            'hownet': hownet_sim_list,
            'wordnet': cwordnet_sim_list,
            'word2vec': w2v_sim_list,
            'jaccard': jcd_list,
            'overlap': ovl_list,
            'dice': dice_list,
            'pmi': pmi_list,
            # 'ngd': ngd_list
        }
        f = open(pk_path, 'wb')
        pk.dump(d, f)
        f.close()
    # names = ['id', 'w1', 'w2', 'manu_sim', 'cilin1', 'cilin2', 'cilin3',
    #          'hownet', 'wordnet', 'word2vec', 'jaccard', 'overlap', 'dice', 'pmi']
    names = [
        'id', 'w1', 'w2', 'manu_sim', 'cilin3', 'hownet', 'wordnet',
        'word2vec', 'jaccard', 'overlap', 'dice', 'pmi'
    ]
    df = pd.DataFrame(data=d, columns=names)
    # print df
    # Evaluate each measure and render a summary table.
    from prettytable import PrettyTable
    # x = PrettyTable(["Eval", 'cilin1', 'cilin2', 'cilin3', 'hownet',
    #                  'wordnet', 'word2vec', 'jaccard', 'overlap', 'dice', 'pmi'])
    x = PrettyTable([
        "Eval", 'cilin3', 'hownet', 'wordnet', 'word2vec', 'jaccard',
        'overlap', 'dice', 'pmi'
    ])
    x.align["Eval"] = "l"
    x.padding_width = 1
    # Each cell is "with -1 entries / without -1 entries".
    x.add_row([
        'Spearman',
        # '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.cilin1), eval.spearman(df.manu_sim, df.cilin1, True)),
        # '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.cilin2), eval.spearman(df.manu_sim, df.cilin2, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.cilin3),
                         eval.spearman(df.manu_sim, df.cilin3, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.hownet),
                         eval.spearman(df.manu_sim, df.hownet, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.wordnet),
                         eval.spearman(df.manu_sim, df.wordnet, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.word2vec),
                         eval.spearman(df.manu_sim, df.word2vec, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.jaccard),
                         eval.spearman(df.manu_sim, df.jaccard, True)),
        '%0.5f/%0.5f' % (eval.spearman(df.manu_sim, df.overlap),
                         eval.spearman(df.manu_sim, df.overlap, True)),
        '%0.5f/%0.5f' % (eval.spearman(
            df.manu_sim, df.dice), eval.spearman(df.manu_sim, df.dice, True)),
        '%0.5f/%0.5f' % (eval.spearman(
            df.manu_sim, df.pmi), eval.spearman(df.manu_sim, df.pmi, True)),
    ])
    x.add_row([
        'Pearson',
        # '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.cilin1), eval.pearson(df.manu_sim, df.cilin1, True)),
        # '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.cilin2), eval.pearson(df.manu_sim, df.cilin2, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.cilin3),
                         eval.pearson(df.manu_sim, df.cilin3, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.hownet),
                         eval.pearson(df.manu_sim, df.hownet, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.wordnet),
                         eval.pearson(df.manu_sim, df.wordnet, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.word2vec),
                         eval.pearson(df.manu_sim, df.word2vec, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.jaccard),
                         eval.pearson(df.manu_sim, df.jaccard, True)),
        '%0.5f/%0.5f' % (eval.pearson(df.manu_sim, df.overlap),
                         eval.pearson(df.manu_sim, df.overlap, True)),
        '%0.5f/%0.5f' % (eval.pearson(
            df.manu_sim, df.dice), eval.pearson(df.manu_sim, df.dice, True)),
        '%0.5f/%0.5f' % (eval.pearson(
            df.manu_sim, df.pmi), eval.pearson(df.manu_sim, df.pmi, True)),
    ])
    x.add_row([
        'Count',
        # '%s/%s' % (len(df.manu_sim) - list(df.cilin1).count(-1), len(df.manu_sim)),
        # '%s/%s' % (len(df.manu_sim) - list(df.cilin2).count(-1), len(df.manu_sim)),
        '%s/%s' % (len(df.manu_sim) - list(df.cilin3).count(-1),
                   len(df.manu_sim)),
        '%s/%s' % (len(df.manu_sim) - list(df.hownet).count(-1),
                   len(df.manu_sim)),
        '%s/%s' % (len(df.manu_sim) - list(df.wordnet).count(-1),
                   len(df.manu_sim)),
        '%s/%s' % (len(df.manu_sim) - list(df.word2vec).count(-1),
                   len(df.manu_sim)),
        '%s/%s' % (len(df.manu_sim) - list(df.jaccard).count(-1),
                   len(df.manu_sim)),
        '%s/%s' % (len(df.manu_sim) - list(df.overlap).count(-1),
                   len(df.manu_sim)),
        '%s/%s' % (len(df.manu_sim) - list(df.dice).count(-1),
                   len(df.manu_sim)),
        '%s/%s' % (len(df.manu_sim) - list(df.pmi).count(-1),
                   len(df.manu_sim)),
    ])
    print x
    df.to_csv('%s/%s.csv' % (macro.RESULTS_DIR, ofname), encoding='gbk')
    # Linear combinations of the measures (columns 4+ are the scores).
    df = df.replace(-1, 0)
    # max
    linear_mean_auto_sims = [row[4:].max() for row in df.values]
    print 'MAX: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))
    # min
    linear_mean_auto_sims = [row[4:].min() for row in df.values]
    print 'MIN: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))
    # mean
    linear_mean_auto_sims = [row[4:].mean() for row in df.values]
    print 'MEAN: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))
    # gmean (replace zeros so the geometric mean is defined)
    df = df.replace(0, 1)
    linear_mean_auto_sims = [geometric_mean(row[4:]) for row in df.values]
    print 'GMEAN: pearson=%.5f;spearman=%.5f' % (
        eval.pearson(df.manu_sim, linear_mean_auto_sims),
        eval.spearman(df.manu_sim, linear_mean_auto_sims))
    return df
# Script tail: evaluate the three Cilin variants, first over all pairs, then
# only over the pairs the thesaurus actually found (flags == 1).
print eval.spearman(score, result1)[0]
print eval.spearman(score, result2)[0]
print eval.spearman(score, result3)[0]
'''
计算全部的得分
0.347925120242
0.352377437382
0.421492611614
'''
score_f = []
result1_f = []
result2_f = []
result3_f = []
# Keep only the pairs that were found in the thesaurus.
for s, r1, r2, r3, flag in zip(score, result1, result2, result3, flags):
    if flag == 1:
        score_f.append(s)
        result1_f.append(r1)
        result2_f.append(r2)
        result3_f.append(r3)
print '-------------------------'
print len(score_f)
print eval.spearman(score_f, result1_f)[0]
print eval.spearman(score_f, result2_f)[0]
print eval.spearman(score_f, result3_f)[0]
print eval.pearson(score_f, result3_f)
'''
只计算找到的词的得分, 454/500
0.43210021977
0.433189473938
0.520306265914
'''
# score_m = read_score(macro.CORPUS_DIR+'/merge_result_extract.txt') # print golden_score # print score_m # print spearmanr(golden_score,score_m) # eval.spearman(golden_score,score_m) last_scores = [] max_score = [] # # for i in range(1, 6): last_scores.append(merge.merge_2_list(macro.RESULTS_DIR + '/lstm_w2v' + str(i) + '.txt', f_c, mode=macro.MAX)) idl, w1l, w2l, score_goldern, headline = utils.read2wordlist([(macro.CORPUS_DIR, '500_2.csv')]) temp = last_scores[0] for s in last_scores[1:]: max_score = merge2max(temp, s) temp = max_score print ('max_score: ', eval.spearman(max_score, score_goldern), eval.pearson(max_score, score_goldern)) sss = small(max_score) print eval.spearman(sss, score_goldern) # dataset = { # 'pred': max_score, # 'goldern': score_goldern # } # frame = DataFrame(dataset) # sns.jointplot('goldern', 'pred', frame, kind='reg', stat_func=eval.spearman) # # plt.xlim([1, 10]) # plt.ylim([1, 10]) # plt.savefig('%s/%s.png' % (macro.PICS_DIR, ('cilin_w2v_trans_lstm_max'))) # pass
def train_ext_vocab_choose_best(self, save_model, result_fname, last_val,
                                cur_iter=0):
    """Train (or reload) a word2vec model, evaluate it, and keep the best.

    Scores the evaluation pairs with the model, logs four correlation values
    to iter.log, always saves the model, and writes the per-pair predictions
    to *result_fname* when the new score beats *last_val*.

    :param save_model: model file name under macro.MODELS_DIR.
    :param result_fname: prediction output file under macro.RESULTS_DIR.
    :param last_val: previous best score; the sentinel -2 means "load the
        previously saved model instead of training".
    :param cur_iter: iteration counter used for logging.
    :return: Spearman correlation (tuple) against the fmlCalScore list.
    """
    # Fetch the evaluation word pairs.
    id_list, word1_list, word2_list, manu_sim_list, headline = utils.read2wordlist(
        self.f_tuple_list, mode='tag')
    with open('%s/%s' % (macro.RESULTS_DIR, 'fmlCalScore.txt')) as mss_fr:
        wss_list = [float(l.strip()) for l in mss_fr.readlines()]
    # Gather the training corpus.
    sentences = []
    for seg_docs_dir in self.seg_docs_dir_list:
        if type(seg_docs_dir) == tuple:
            sens = utils.atxt2sens(seg_docs_dir[0], seg_docs_dir[1])
        else:
            sens = utils.txts2sens(seg_docs_dir)
        sentences.extend(sens)
    # Obtain the model: load the previous one OR train fresh embeddings.
    if last_val == -2:
        print 'load previous model....'
        model = Word2Vec.load_word2vec_format(
            r'%s/%s' % (macro.MODELS_DIR, save_model), binary=True)
    else:
        model = Word2Vec(sentences,
                         sg=1,
                         size=300,
                         window=10,
                         negative=0,
                         hs=1,
                         sample=1e-4,
                         workers=8,
                         min_count=5)
    # Score the evaluation pairs.
    auto_sim_list = []
    for w1, w2, manu_sim in zip(word1_list, word2_list, manu_sim_list):
        try:
            auto_sim = model.similarity(w1, w2)
            # Map cosine similarity onto the 1-10 scale.
            auto_sim = utils.convert_sim(auto_sim)
            # print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, auto_sim)
        except:
            auto_sim = 1  # int 1 distinguishes "not found" from a real 1.0
            print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim,
                                                  '______Not Found______')
        auto_sim_list.append(auto_sim)
    # Keep the model with the larger val.
    val = eval.spearman(manu_sim_list, auto_sim_list)
    val2 = eval.spearman(wss_list, auto_sim_list)
    val3 = eval.pearson(manu_sim_list, auto_sim_list)
    val4 = eval.pearson(wss_list, auto_sim_list)
    msg = '%s %s %s %s %s\n' % (cur_iter, val[0], val2[0], val3[0], val4[0])
    print msg
    with open('iter.log', 'a') as fa:
        fa.write(msg)
    model.save_word2vec_format('%s/%s' % (macro.MODELS_DIR, save_model),
                               binary=True)  # save the model
    # NOTE(review): `val` is a (rho, p-value) tuple while `last_val` looks
    # numeric; in Python 2 a tuple always compares greater than a number, so
    # this branch may always be taken — confirm whether `val[0] > last_val`
    # was intended (or whether callers pass the returned tuple back).
    if val > last_val:
        print '%s ::: larger val and write result to file...' % cur_iter
        with open('%s/%s' % (macro.RESULTS_DIR, result_fname), 'w') as fw:
            fw.write(headline.strip() + '\tPrediction\n')
            for w1, w2, manu_sim, auto_sim in zip(word1_list, word2_list,
                                                  manu_sim_list,
                                                  auto_sim_list):
                fw.write('%s\t%s\t%s\t%s\n' %
                         (w1.encode('utf-8'), w2.encode('utf-8'), manu_sim,
                          auto_sim))
    else:
        print ':::::::current val=', val
    return val2
def ir_sim(f_tuple_list, ofname='NLPCC_Formal500_single_sims_ir_nums0.pk'):
    """Web/IR co-occurrence similarities (Jaccard, Overlap, Dice, PMI, NGD).

    Hit counts (n1, n2, n12) per pair are retrieved once and cached as a
    pickle under macro.RESULTS_DIR/*ofname*; on a fresh retrieval they are
    also dumped to a gbk-encoded CSV.  Each measure is rescaled to 1-10 and
    a PrettyTable of Spearman/Pearson/coverage per measure is printed.

    :param f_tuple_list: (dir, filename) tuples for utils.read2wordlist.
    :param ofname: cache pickle file name.
    :return: (jcd_list, ovl_list, dice_list, pmi_list, ngd_list)
    """
    print 'ir sim ...'
    idl, w1l, w2l, manu_sim_list, headline = utils.read2wordlist(f_tuple_list)
    nums_pk_path = '%s/%s' % (macro.RESULTS_DIR, ofname)
    if os.path.exists(nums_pk_path):
        # Hit counts already cached: load them.
        print 'load nums...'
        f = open(nums_pk_path, 'rb')
        n1l, n2l, n3l = pickle.load(f)
        f.close()
    else:
        # Retrieve hit counts from the search engine and cache them.
        print 'retrieval nums...'
        n1l, n2l, n3l = get_nums_list(w1l, w2l)
        f = open(nums_pk_path, 'wb')
        pickle.dump((n1l, n2l, n3l), f)
        f.close()
        with open(nums_pk_path.split('.')[0] + '_nums.csv', 'w') as fw:
            for id, w1, w2, n1, n2, n3 in zip(idl, w1l, w2l, n1l, n2l, n3l):
                new_line = '%s,%s,%s,%s,%s,%s' % (id, w1, w2, n1, n2, n3)
                fw.write(new_line.encode('gbk') + '\n')
    N = pow(10, 16)  # assumed total number of indexed pages (for PMI/NGD)
    jcd_list, ovl_list, dice_list, pmi_list, ngd_list = [], [], [], [], []
    for num1, num2, num3, id, w1, w2, manu_sim in zip(n1l, n2l, n3l, idl,
                                                      w1l, w2l,
                                                      manu_sim_list):
        # Each raw measure is mapped onto the 1-10 range.
        jcd = utils.convert_sim(web_jaccard(num1, num2, num3), mode=1)
        ovl = utils.convert_sim(web_overlap(num1, num2, num3), mode=1)
        dice = utils.convert_sim(web_dice(num1, num2, num3), mode=1)
        pmi = utils.convert_sim(web_pmi(num1, num2, num3, N), mode=1)
        ngd = utils.convert_sim(web_ngd(num1, num2, num3, N), mode=1)
        jcd_list.append(jcd)
        ovl_list.append(ovl)
        dice_list.append(dice)
        pmi_list.append(pmi)
        ngd_list.append(ngd)
        # print "ir:proc_id= %s [%s,%s] %s (%.5f, %.5f, %.5f, %.5f, %.5f) " % (id, w1, w2, manu_sim, jcd, ovl, dice, pmi, ngd)
    from prettytable import PrettyTable
    x = PrettyTable(["Eval", "jaccard", "overlap", "dice", "pmi", "ngd"])
    x.align["Eval"] = "l"
    x.padding_width = 1
    # Each cell is "with -1 entries / without -1 entries".
    x.add_row(['Spearman',
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, jcd_list),
                                eval.spearman(manu_sim_list, jcd_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ovl_list),
                                eval.spearman(manu_sim_list, ovl_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, dice_list),
                                eval.spearman(manu_sim_list, dice_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, pmi_list),
                                eval.spearman(manu_sim_list, pmi_list, True)),
               '%0.3f/%0.3f' % (eval.spearman(manu_sim_list, ngd_list),
                                eval.spearman(manu_sim_list, ngd_list, True))])
    x.add_row(['Pearson',
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, jcd_list),
                                eval.pearson(manu_sim_list, jcd_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ovl_list),
                                eval.pearson(manu_sim_list, ovl_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, dice_list),
                                eval.pearson(manu_sim_list, dice_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, pmi_list),
                                eval.pearson(manu_sim_list, pmi_list, True)),
               '%0.3f/%0.3f' % (eval.pearson(manu_sim_list, ngd_list),
                                eval.pearson(manu_sim_list, ngd_list, True)),
               ])
    x.add_row(['Count',
               '%s/%s' % (len(manu_sim_list) - jcd_list.count(-1),
                          len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - ovl_list.count(-1),
                          len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - dice_list.count(-1),
                          len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - pmi_list.count(-1),
                          len(manu_sim_list)),
               '%s/%s' % (len(manu_sim_list) - ngd_list.count(-1),
                          len(manu_sim_list)),
               ])
    print x
    return jcd_list, ovl_list, dice_list, pmi_list, ngd_list
def cilin_run1():
    '''Evaluate the three Cilin similarity variants on the NLPCC corpus.

    Three computation methods are available:
        cs = CilinSimilarity()
        sim1 = cs.similarity(w1, w2)
        sim2 = cs.sim2013(w1, w2)
        sim3 = cs.sim2016(w1, w2)
    Writes sim3 per pair to fml_cilin.txt and prints correlations over all
    pairs and over found-only pairs.
    '''
    cs = loadCilin()
    # w1 = u'抄袭'
    # w2 = u'克隆'
    # code1 = cs.get_code(w1)
    # print w1, 'codes:', code1
    # code2 = cs.get_code(w2)
    # print w2, 'codes:', code2
    # sim = cs.similarity(w1, w2)
    # print w1, w2, 'final similarity:', sim
    idl, w1l, w2l, score, headline = utils.read2wordlist([
        (macro.CORPUS_DIR, macro.NLPCC_FML_FILE)
    ])
    result1 = []
    result2 = []
    result3 = []
    flags = []  # 1 when the pair was found in the thesaurus, else 0
    outfile = codecs.open(macro.RESULTS_DIR + '/fml_cilin.txt', 'w', 'utf-8')
    outfile.write('\r\n')
    for id, w1, w2 in zip(idl, w1l, w2l):
        sim1 = cs.similarity(w1, w2)
        sim2 = cs.sim2013(w1, w2)
        sim3 = cs.sim2016(w1, w2)
        outfile.write(id + '\t' + w1 + '\t' + w2 + '\t' + str(sim3) + '\r\n')
        if sim1 == -1:
            flags.append(0)
        else:
            flags.append(1)
        result1.append(sim1)
        result2.append(sim2)
        result3.append(sim3)
    outfile.close()
    print eval.spearman(score, result1)
    print eval.spearman(score, result2)
    print eval.spearman(score, result3)
    '''
    计算全部的得分
    0.347925120242
    0.352377437382
    0.421492611614
    '''
    # Re-evaluate using only the pairs the thesaurus found.
    score_f = []
    result1_f = []
    result2_f = []
    result3_f = []
    for s, r1, r2, r3, flag in zip(score, result1, result2, result3, flags):
        if flag == 1:
            score_f.append(s)
            result1_f.append(r1)
            result2_f.append(r2)
            result3_f.append(r3)
    print '-------------------------'
    print len(score_f)
    print eval.spearman(score_f, result1_f)
    print eval.spearman(score_f, result2_f)
    print eval.spearman(score_f, result3_f)
    print eval.pearson(score_f, result3_f)


# NOTE(review): the triple-quote below opens a commented-out region that
# continues past this block; kept as-is.
'''