def get_top_k_match(k, source, targets, source_embeddings, target_embeddings):
    """Return the k targets closest to *source* by Euclidean distance.

    Distances are computed between averaged word vectors, both plain and
    TF-IDF weighted.  Relies on the module-level names DIMENSION,
    cs_word2weight, java_word2weight, euclidean_distances and the
    vector_averaging* helpers.

    Args:
        k: number of matches to keep.
        source: whitespace-separated token string for the source phrase.
        targets: iterable of whitespace-separated target phrase strings.
        source_embeddings / target_embeddings: word-vector lookup tables
            accepted by the vector_averaging* helpers.

    Returns:
        (top_k_average, top_k_average_tfidf): two lists of
        (target, distance) pairs sorted by ascending distance.
    """
    source_tokens = source.split(" ")
    # The source-side vectors do not depend on the loop variable; compute
    # them once instead of once per target (the original recomputed both
    # every iteration).
    src_avg = vector_averaging(source_tokens, source_embeddings, DIMENSION)
    src_avg_tfidf = vector_averaging_with_tfidf(
        source_tokens, source_embeddings, cs_word2weight, DIMENSION)

    result_dict_average = {}
    result_dict_average_tfidf = {}
    for t in targets:
        target_tokens = t.split(" ")
        tgt_avg = vector_averaging(target_tokens, target_embeddings, DIMENSION)
        tgt_avg_tfidf = vector_averaging_with_tfidf(
            target_tokens, target_embeddings, java_word2weight, DIMENSION)
        # euclidean_distances returns a 2-D array; [0][0] extracts the scalar.
        result_dict_average[t] = euclidean_distances(src_avg, tgt_avg)[0][0]
        result_dict_average_tfidf[t] = euclidean_distances(
            src_avg_tfidf, tgt_avg_tfidf)[0][0]

    sorted_result_average = sorted(
        result_dict_average.items(), key=operator.itemgetter(1))
    sorted_result_average_tfidf = sorted(
        result_dict_average_tfidf.items(), key=operator.itemgetter(1))
    return sorted_result_average[:k], sorted_result_average_tfidf[:k]
def print_phrase_cosine_similarity(phrases, cs_vectors, java_vectors):
    """Append pairwise cosine similarities for all phrase pairs to a CSV.

    For every ordered pair (p, p2) of phrases, compares the C# vector of p
    with the Java vector of p2 and appends a "p,p2,similarity" row to
    cosine_sim_phrase.csv.

    NOTE(review): vector_averaging is called without a DIMENSION argument
    here, unlike get_top_k_match — confirm the helper has a matching
    default.
    """
    # Open the output file once instead of re-opening it in append mode for
    # every one of the n*n phrase pairs.
    with open("cosine_sim_phrase.csv", "a") as f:
        for p in phrases:
            # cs_vec depends only on the outer phrase; hoist it out of the
            # inner loop (the original recomputed it n times per phrase).
            cs_vec = vector_averaging(p.split(" "), cs_vectors)
            for p2 in phrases:
                java_vec = vector_averaging(p2.split(" "), java_vectors)
                cos_sim = cosine_similarity(cs_vec, java_vec)[0][0]
                f.write(p + "," + p2 + "," + str(cos_sim) + "\n")
def get_top_k_match(k, source, targets, source_embeddings, target_embeddings):
    """Rank *targets* by Euclidean distance to *source*; keep the top k.

    NOTE(review): this re-defines get_top_k_match — an earlier definition
    of the same name exists in this file, and this later one wins at
    import time.

    Returns:
        Two lists of (target, distance) tuples in ascending distance
        order: one using plain averaged vectors, one using TF-IDF-weighted
        averages.
    """
    result_dict_average = {}
    result_dict_average_tfidf = {}
    result_dict_sum = {}  # retained from the original; currently unused
    src_words = source.split(" ")
    for candidate in targets:
        cand_words = candidate.split(" ")
        plain = euclidean_distances(
            vector_averaging(src_words, source_embeddings, DIMENSION),
            vector_averaging(cand_words, target_embeddings, DIMENSION))[0][0]
        weighted = euclidean_distances(
            vector_averaging_with_tfidf(
                src_words, source_embeddings, cs_word2weight, DIMENSION),
            vector_averaging_with_tfidf(
                cand_words, target_embeddings, java_word2weight,
                DIMENSION))[0][0]
        result_dict_average[candidate] = plain
        result_dict_average_tfidf[candidate] = weighted
    by_distance = operator.itemgetter(1)
    ranked_average = sorted(result_dict_average.items(), key=by_distance)
    ranked_average_tfidf = sorted(
        result_dict_average_tfidf.items(), key=by_distance)
    return ranked_average[:k], ranked_average_tfidf[:k]
# NOTE(review): incomplete fragment — this is the interior of a larger
# evaluation loop whose header is not visible in this chunk.  `expr_cs`,
# `expr_java`, the embedding tables, DIMENSION and the word-weight dicts
# come from the enclosing scope.  The bare `print expr_cs` statements are
# Python 2 syntax, unlike the print() calls elsewhere in this file —
# confirm the intended interpreter version.  Left byte-identical.
# top_5_java = java_vectors.similar_by_vector(cs_vectors[cs_k], topn=5) # # print top_5_java # relevant_list = list() # for element in top_5_java: # if element[0] == cs_k: # relevant_list.append(1) # else: # relevant_list.append(0) # avg_p = average_precision(relevant_list) # avg_p_list_cs.append(relevant_list) # # print avg_p # precision_list_cs.append(avg_p) # except Exception as e: # print e # print avg_p_list_cs # print "MAP CS : " + str(mean_average_precision(avg_p_list_cs)) print expr_cs print expr_java predict_average = cosine_similarity(vector_averaging(expr_cs.split(" "),cs_vectors,DIMENSION),vector_averaging(expr_java.split(" "),java_vectors,DIMENSION))[0][0] predict_average_tfidf = cosine_similarity(vector_averaging_with_tfidf(expr_cs.split(" "),cs_vectors,cs_word2weight,DIMENSION),vector_averaging_with_tfidf(expr_java.split(" "),java_vectors,java_word2weight,DIMENSION))[0][0] new_row = list() new_row.append(str(predict_average)) new_row.append(str(predict_average_tfidf)) with open("./evaluation_result/keywords_result_11_20_include_functions.csv","a") as f: f.write(",".join(new_row) + "\n")
# NOTE(review): incomplete fragment — interior of a row-processing loop;
# `row` and `i` are bound by an enumerate() loop outside this chunk, and
# the fragment ends mid-way through building `new_row` (the write happens
# later, outside view).  Unlike the sibling fragments, the similarity
# calls here omit the DIMENSION argument — confirm the helpers' defaults.
# Left byte-identical.
# C# # diff_1 = process_diff_srcml(row[3],0) # diff_1 = process_source_code(row[3],0) diff_1 = process_diff_srcml2(row[3]) # Java # diff_2 = process_diff_srcml(row[5],1) # diff_2 = process_source_code(row[5],0) diff_2 = process_diff_srcml2(row[5]) print("diff 1 : " + diff_1) print("diff 2 : " + diff_2) # if i > 0 and i != 40: # diff_cs_list.append(process_source_code(row[5])) predict_average = cosine_similarity( vector_averaging(diff_1.split(" "), cs_vectors), vector_averaging(diff_2.split(" "), java_vectors))[0][0] predict_average_tfidf = cosine_similarity( vector_averaging_with_tfidf(diff_1.split(" "), cs_vectors, cs_word2weight), vector_averaging_with_tfidf(diff_2.split(" "), java_vectors, java_word2weight))[0][0] # predict = doc2vec_model.docvecs.similarity_unseen_docs(doc2vec_model,diff_1.split(" "),diff_2.split(" ")) print(predict_average) print(predict_average_tfidf) print(row[7]) new_row = list() new_row.append(row[0]) new_row.append(row[1]) new_row.append(row[7]) new_row.append(row[8])
# NOTE(review): incomplete fragment — a near-duplicate of the earlier
# keyword-evaluation interior; `expr_cs`/`expr_java` come from an
# enclosing scope not visible here, and the chunk ends inside an
# unfinished `with open(...) as f:` whose body is outside view.  The bare
# `print expr_cs` statements are Python 2 syntax — confirm the intended
# interpreter version.  Left byte-identical.
# else: # relevant_list.append(0) # avg_p = average_precision(relevant_list) # avg_p_list_cs.append(relevant_list) # # print avg_p # precision_list_cs.append(avg_p) # except Exception as e: # print e # print avg_p_list_cs # print "MAP CS : " + str(mean_average_precision(avg_p_list_cs)) print expr_cs print expr_java predict_average = cosine_similarity( vector_averaging(expr_cs.split(" "), cs_vectors, DIMENSION), vector_averaging(expr_java.split(" "), java_vectors, DIMENSION))[0][0] predict_average_tfidf = cosine_similarity( vector_averaging_with_tfidf(expr_cs.split(" "), cs_vectors, cs_word2weight, DIMENSION), vector_averaging_with_tfidf(expr_java.split(" "), java_vectors, java_word2weight, DIMENSION))[0][0] new_row = list() new_row.append(str(predict_average)) new_row.append(str(predict_average_tfidf)) with open( "./evaluation_result/keywords_result_11_20_include_functions.csv", "a") as f:
# Score every code-change pair in the codelabel CSV with bi2vec cosine
# similarity and append one result row per input row.
# Input columns used: row[0]/row[1] ids, row[3] C# code, row[5] Java code,
# row[7..9] labels/metadata carried through to the output.
with codecs.open("./codelabel/codelabel_" + PROJECT + ".csv", "r") as f_csv:
    reader = csv.reader(f_csv)
    # Open the result file once, instead of re-opening it in append mode
    # for every input row as the original did.
    with open("./codelabel_result/bi2vec_" + PROJECT + "_10.csv", "a") as f:
        for i, row in enumerate(reader):
            print("###############################")
            print("id : " + row[1])
            diff_1 = process_source_code(row[3], 0)  # C# side
            diff_2 = process_source_code(row[5], 0)  # Java side
            print("diff 1 : " + diff_1)
            print("diff 2 : " + diff_2)
            predict_average = cosine_similarity(
                vector_averaging(diff_1.split(" "), cs_vectors, DIMENSION),
                vector_averaging(diff_2.split(" "), java_vectors,
                                 DIMENSION))[0][0]
            predict_average_tfidf = cosine_similarity(
                vector_averaging_with_tfidf(
                    diff_1.split(" "), cs_vectors, cs_word2weight, DIMENSION),
                vector_averaging_with_tfidf(
                    diff_2.split(" "), java_vectors, java_word2weight,
                    DIMENSION))[0][0]
            print(predict_average)
            print(predict_average_tfidf)
            print(row[7])
            new_row = [row[0], row[1], row[7], row[8], row[9],
                       str(predict_average), str(predict_average_tfidf)]
            f.write(",".join(new_row) + "\n")