예제 #1
0
def get_top_k_match(k, source, targets, source_embeddings, target_embeddings):
    """Return the k targets closest to `source` under two distance measures.

    Each target phrase is compared to the source phrase by Euclidean
    distance between (a) plain averaged word vectors and (b) tf-idf
    weighted averaged word vectors.

    Args:
        k: number of top matches to keep from each ranking.
        source: source-language phrase (space-separated tokens).
        targets: iterable of target-language phrases.
        source_embeddings: word-vector lookup for the source language.
        target_embeddings: word-vector lookup for the target language.

    Returns:
        Tuple of two lists of (target, distance) pairs, each sorted by
        ascending distance and truncated to k entries:
        (plain-average ranking, tf-idf-average ranking).
    """
    result_dict_average = {}
    result_dict_average_tfidf = {}
    # The source tokenization never changes inside the loop; split once.
    source_tokens = source.split(" ")
    for t in targets:
        target_tokens = t.split(" ")
        # Distance between plain averaged embeddings.
        result_dict_average[t] = euclidean_distances(
            vector_averaging(source_tokens, source_embeddings, DIMENSION),
            vector_averaging(target_tokens, target_embeddings,
                             DIMENSION))[0][0]
        # Distance between tf-idf weighted averages; relies on the
        # module-level cs_word2weight / java_word2weight tables.
        result_dict_average_tfidf[t] = euclidean_distances(
            vector_averaging_with_tfidf(source_tokens, source_embeddings,
                                        cs_word2weight, DIMENSION),
            vector_averaging_with_tfidf(target_tokens, target_embeddings,
                                        java_word2weight, DIMENSION))[0][0]

    # Smaller distance means better match, so a plain ascending sort works.
    sorted_result_average = sorted(result_dict_average.items(),
                                   key=operator.itemgetter(1))
    sorted_result_average_tfidf = sorted(result_dict_average_tfidf.items(),
                                         key=operator.itemgetter(1))
    return sorted_result_average[:k], sorted_result_average_tfidf[:k]
예제 #2
0
def print_phrase_cosine_similarity(phrases, cs_vectors, java_vectors):
    """Append pairwise phrase cosine similarities to cosine_sim_phrase.csv.

    For every ordered pair (p, p2) drawn from `phrases` (including p == p2),
    builds averaged word-embedding vectors — C# embeddings for p, Java
    embeddings for p2 — and appends one row "p,p2,similarity" to the CSV.

    Args:
        phrases: iterable of phrases (space-separated tokens).
        cs_vectors: C# word-vector lookup.
        java_vectors: Java word-vector lookup.

    NOTE(review): phrases containing commas would corrupt the CSV rows —
    confirm upstream tokenization guarantees none.
    """
    # Open the output once; the original reopened it in append mode for
    # every pair (N*N opens). The emitted rows are identical.
    with open("cosine_sim_phrase.csv", "a") as f:
        for p in phrases:
            # The C# vector depends only on the outer phrase — hoist it.
            cs_vec = vector_averaging(p.split(" "), cs_vectors)
            for p2 in phrases:
                java_vec = vector_averaging(p2.split(" "), java_vectors)
                cos_sim = cosine_similarity(cs_vec, java_vec)[0][0]
                f.write(p + "," + p2 + "," + str(cos_sim) + "\n")
def get_top_k_match(k, source, targets, source_embeddings, target_embeddings):
    """Rank every target phrase by its Euclidean distance to `source`.

    Two rankings are produced: one from plain averaged word embeddings
    and one from tf-idf weighted averages (weights come from the
    module-level cs_word2weight / java_word2weight tables).

    Returns:
        (top_plain, top_tfidf): two lists of (target, distance) pairs,
        each sorted by ascending distance and truncated to k entries.
    """
    plain_scores = {}
    tfidf_scores = {}

    for candidate in targets:
        # Plain averaged-embedding distance for this candidate.
        plain_scores[candidate] = euclidean_distances(
            vector_averaging(source.split(" "), source_embeddings, DIMENSION),
            vector_averaging(candidate.split(" "), target_embeddings,
                             DIMENSION))[0][0]
        # Same comparison with tf-idf weighted averaging.
        tfidf_scores[candidate] = euclidean_distances(
            vector_averaging_with_tfidf(source.split(" "), source_embeddings,
                                        cs_word2weight, DIMENSION),
            vector_averaging_with_tfidf(candidate.split(" "),
                                        target_embeddings, java_word2weight,
                                        DIMENSION))[0][0]

    # Ascending distance == best match first.
    rank = operator.itemgetter(1)
    top_plain = sorted(plain_scores.items(), key=rank)[:k]
    top_tfidf = sorted(tfidf_scores.items(), key=rank)[:k]
    return top_plain, top_tfidf
예제 #4
0
	# 		top_5_java = java_vectors.similar_by_vector(cs_vectors[cs_k], topn=5)
	# 		# print top_5_java
	# 		relevant_list = list()
			
	# 		for element in top_5_java:
	# 			if element[0] == cs_k:
	# 				relevant_list.append(1)
	# 			else:
	# 				relevant_list.append(0)
	# 		avg_p =  average_precision(relevant_list)
	# 		avg_p_list_cs.append(relevant_list)
	# 		# print avg_p
	# 		precision_list_cs.append(avg_p)
	# 	except Exception as e:
	# 		print e

	# print avg_p_list_cs
	# print "MAP CS : " + str(mean_average_precision(avg_p_list_cs))


			# NOTE(review): fragment of a larger loop — expr_cs, expr_java and
			# the embedding tables are defined outside this excerpt. Python 2
			# print statements.
			print expr_cs
			print expr_java
			# Cosine similarity between plain averaged phrase vectors.
			predict_average = cosine_similarity(vector_averaging(expr_cs.split(" "),cs_vectors,DIMENSION),vector_averaging(expr_java.split(" "),java_vectors,DIMENSION))[0][0]
			# Same comparison using tf-idf weighted averaging.
			predict_average_tfidf = cosine_similarity(vector_averaging_with_tfidf(expr_cs.split(" "),cs_vectors,cs_word2weight,DIMENSION),vector_averaging_with_tfidf(expr_java.split(" "),java_vectors,java_word2weight,DIMENSION))[0][0]

			new_row = list()

			new_row.append(str(predict_average))
			new_row.append(str(predict_average_tfidf))
			# Append both scores as one CSV row.
			with open("./evaluation_result/keywords_result_11_20_include_functions.csv","a") as f:
				f.write(",".join(new_row) + "\n")
예제 #5
0
        # NOTE(review): fragment of a larger loop — `row`, `cs_vectors` and
        # the helper functions are defined outside this excerpt.
        # C#
        # diff_1 = process_diff_srcml(row[3],0)
        # diff_1 = process_source_code(row[3],0)
        diff_1 = process_diff_srcml2(row[3])
        # Java
        # diff_2 = process_diff_srcml(row[5],1)
        # diff_2 = process_source_code(row[5],0)
        diff_2 = process_diff_srcml2(row[5])
        print("diff 1 : " + diff_1)
        print("diff 2 : " + diff_2)
        # if i > 0 and i != 40:
        # 	diff_cs_list.append(process_source_code(row[5]))

        # Cosine similarity between plain averaged phrase vectors
        # (no DIMENSION argument here, unlike the other examples —
        # presumably a different vector_averaging signature; verify).
        predict_average = cosine_similarity(
            vector_averaging(diff_1.split(" "), cs_vectors),
            vector_averaging(diff_2.split(" "), java_vectors))[0][0]
        # Same comparison with tf-idf weighted averaging.
        predict_average_tfidf = cosine_similarity(
            vector_averaging_with_tfidf(diff_1.split(" "), cs_vectors,
                                        cs_word2weight),
            vector_averaging_with_tfidf(diff_2.split(" "), java_vectors,
                                        java_word2weight))[0][0]
        # predict = doc2vec_model.docvecs.similarity_unseen_docs(doc2vec_model,diff_1.split(" "),diff_2.split(" "))
        print(predict_average)
        print(predict_average_tfidf)
        print(row[7])
        # Collect the id/label columns before the scores (continues below
        # the excerpt).
        new_row = list()
        new_row.append(row[0])
        new_row.append(row[1])
        new_row.append(row[7])
        new_row.append(row[8])
예제 #6
0
            # NOTE(review): fragment of a larger loop — expr_cs, expr_java and
            # the embedding tables are defined outside this excerpt. Python 2
            # print statements.
            # 			else:
            # 				relevant_list.append(0)
            # 		avg_p =  average_precision(relevant_list)
            # 		avg_p_list_cs.append(relevant_list)
            # 		# print avg_p
            # 		precision_list_cs.append(avg_p)
            # 	except Exception as e:
            # 		print e

            # print avg_p_list_cs
            # print "MAP CS : " + str(mean_average_precision(avg_p_list_cs))

            print expr_cs
            print expr_java
            # Cosine similarity between plain averaged phrase vectors.
            predict_average = cosine_similarity(
                vector_averaging(expr_cs.split(" "), cs_vectors, DIMENSION),
                vector_averaging(expr_java.split(" "), java_vectors,
                                 DIMENSION))[0][0]
            # Same comparison with tf-idf weighted averaging.
            predict_average_tfidf = cosine_similarity(
                vector_averaging_with_tfidf(expr_cs.split(" "), cs_vectors,
                                            cs_word2weight, DIMENSION),
                vector_averaging_with_tfidf(expr_java.split(" "), java_vectors,
                                            java_word2weight, DIMENSION))[0][0]

            new_row = list()

            new_row.append(str(predict_average))
            new_row.append(str(predict_average_tfidf))
            # NOTE(review): the body of this `with` is truncated by the
            # excerpt — presumably it writes the joined row, as in example #4.
            with open(
                    "./evaluation_result/keywords_result_11_20_include_functions.csv",
                    "a") as f:
예제 #7
0
# Score each C#/Java code pair from the codelabel CSV with two similarity
# measures and append one result row per pair to the bi2vec output CSV.
# Open the output once up front instead of re-opening it for every row.
with codecs.open("./codelabel/codelabel_" + PROJECT + ".csv", "r") as f_csv, \
        open("./codelabel_result/bi2vec_" + PROJECT + "_10.csv", "a") as f_out:
    reader = csv.reader(f_csv)

    for i, row in enumerate(reader):
        print("###############################")
        print("id : " + row[1])
        # Normalize both snippets (C# in row[3], Java in row[5]).
        diff_1 = process_source_code(row[3], 0)
        diff_2 = process_source_code(row[5], 0)
        print("diff 1 : " + diff_1)
        print("diff 2 : " + diff_2)

        # Cosine similarity over plain averaged word vectors...
        predict_average = cosine_similarity(
            vector_averaging(diff_1.split(" "), cs_vectors, DIMENSION),
            vector_averaging(diff_2.split(" "), java_vectors,
                             DIMENSION))[0][0]
        # ...and over tf-idf weighted averages (module-level weight tables).
        predict_average_tfidf = cosine_similarity(
            vector_averaging_with_tfidf(diff_1.split(" "), cs_vectors,
                                        cs_word2weight, DIMENSION),
            vector_averaging_with_tfidf(diff_2.split(" "), java_vectors,
                                        java_word2weight, DIMENSION))[0][0]
        print(predict_average)
        print(predict_average_tfidf)
        print(row[7])

        # id/label columns followed by the two similarity scores.
        new_row = [row[0], row[1], row[7], row[8], row[9],
                   str(predict_average), str(predict_average_tfidf)]
        f_out.write(",".join(new_row) + "\n")