def text2feat(api, api_descriptions, w2v, idf, query_matrix, query_idf_vector):
    """Build the query-similarity features for one API and its description.

    Both ``api`` and ``api_descriptions`` are turned into a matrix / idf
    vector pair with ``feedback.load_matrix`` and scored against the query
    with ``similarity.sim_doc_pair``.  The description similarity is forced
    to ``0`` when ``api_descriptions`` is the literal string ``'null'``.

    Returns a 3-tuple:
        sum_inf      -- ``[api_sim, api_desc_sim]``
        api_inf      -- ``{api: api_sim}``
        api_desc_inf -- ``{api_descriptions: api_desc_sim}``
    """
    api_matrix, api_idf_vector = feedback.load_matrix(api, w2v, idf)
    desc_matrix, desc_idf_vector = feedback.load_matrix(
        api_descriptions, w2v, idf)

    # Score the API and its documentation against the query; the related
    # questions were already obtained during recommendation.
    api_sim = similarity.sim_doc_pair(query_matrix, api_matrix,
                                      query_idf_vector, api_idf_vector)
    api_desc_sim = (
        0 if api_descriptions == 'null'
        else similarity.sim_doc_pair(query_matrix, desc_matrix,
                                     query_idf_vector, desc_idf_vector))

    # The scores for this API, kept together as one column.
    sum_inf = [api_sim, api_desc_sim]

    # Wrap every feature in a dict so that, once the features are available,
    # the top-n related features can be printed directly.
    api_inf = {api: api_sim}
    api_desc_inf = {api_descriptions: api_desc_sim}
    return sum_inf, api_inf, api_desc_inf
def get_sim_query(train, test, w2v, idf):
    """Return the similarity between ``train`` and ``test``.

    Both inputs are converted with ``load_matrix`` and compared with
    ``similarity.sim_doc_pair``.  When ``train`` is empty the result is ``0``
    and nothing is loaded.

    The previous implementation repeated the exact same computation
    ``len(train)`` times inside a loop whose index was never used; the work
    is now done once.
    """
    # Empty input: keep the historical result of 0 without loading anything.
    if len(train) == 0:
        return 0

    train_matrix, train_idf = load_matrix(train, w2v, idf)
    test_matrix, test_idf = load_matrix(test, w2v, idf)
    return similarity.sim_doc_pair(train_matrix, test_matrix,
                                   train_idf, test_idf)
def get_topk_questions(origin_query, query_matrix, query_idf_vector, questions,
                       topk, parent):
    """Return the questions most similar to the query.

    The result is a dictionary whose keys are question ids and whose values
    are the similarity between that question and the query, as computed by
    ``similarity.sim_doc_pair``.

    A question is skipped when

    * it shares its ``parent`` entry with the query itself (duplicate), or
    * none of its answers has a score ``>= 0``.

    The query's id is taken from the last question whose title equals,
    contains, or is contained in ``origin_query`` (``'-1'`` when there is
    none).  That id is registered in ``parent`` if missing, so ``parent`` is
    modified in place.

    At most ``topk`` entries are returned; a ``topk`` that is never reached
    (for example ``0`` or a negative number) yields every scored question.
    """
    query_id = '-1'
    for candidate in questions:
        title = candidate.title
        # The same question should not appear in the dataset.
        if title == origin_query or title in origin_query or origin_query in title:
            query_id = candidate.id

    parent.setdefault(query_id, query_id)

    scored = []
    for candidate in questions:
        is_duplicate = (query_id in parent and candidate.id in parent
                        and parent[query_id] == parent[candidate.id])
        if is_duplicate:
            continue

        # Convert every score first so a malformed one still raises.
        answer_scores = [int(answer.score) for answer in candidate.answers]
        if not any(score >= 0 for score in answer_scores):
            continue

        sim = similarity.sim_doc_pair(query_matrix, candidate.matrix,
                                      query_idf_vector, candidate.idf_vector)
        scored.append((candidate.id, candidate.title, sim))

    # Most similar first; the sort is stable so ties keep their input order.
    scored.sort(key=lambda entry: entry[2], reverse=True)

    # Keep the ids of the top-k most relevant questions.
    top_questions = dict()
    for rank, entry in enumerate(scored, 1):
        top_questions[entry[0]] = entry[2]
        if rank == topk:
            break
    return top_questions
def get_feedback_api(query, answer, query_matrix, query_idf_vector, w2v, idf):
    """Collect feedback APIs whose question is similar to the query.

    ``answer`` is walked row by row and the first row is skipped.  For every
    other row, the entry of ``query`` at the same position is converted with
    ``load_matrix`` and compared with the query through
    ``similarity.sim_doc_pair``.  When the similarity is above ``0.65`` one
    ``[question, api, sim]`` entry is produced per item of the row.

    Returns ``(feeds, feed_sim)``:
        feeds    -- all entries sorted by similarity (descending), padded
                    with ``[0, 0, 0]`` so that there are at least five.
        feed_sim -- the similarities of the first five entries.

    Rows are paired with ``query`` by position.  The former
    ``answer.index(row)`` lookup returned the first row that compared equal,
    so duplicate rows were paired with the wrong question and every lookup
    was a linear scan.
    """
    feeds = []
    for position, row in enumerate(answer):
        if position == 0:
            # NOTE(review): the first row has always been skipped,
            # presumably because it is a header -- confirm against the data.
            continue

        question = query[position]
        question_matrix, question_idf_vector = load_matrix(question, w2v, idf)
        sim = similarity.sim_doc_pair(query_matrix, question_matrix,
                                      query_idf_vector, question_idf_vector)

        # If the query is similar to the feedback question, add that
        # question's API information.
        if sim > 0.65:
            for api in row:
                feeds.append([question, api, sim])

    feeds.sort(key=lambda feed: feed[2], reverse=True)

    # Callers always get at least five entries.
    while len(feeds) < 5:
        feeds.append([0, 0, 0])

    feed_sim = [feed[2] for feed in feeds[:5]]
    return feeds, feed_sim
def recommend_api_class(query_matrix, query_idf_vector, top_questions,
                        questions, javadoc, javadoc_dict_classes, topk):
    """Recommend API classes for a query.

    Arguments (as described by the original comments):
        top_questions -- dictionary of the top-k most relevant questions of
                         the query; key is the question id, value is the
                         similarity between the question and the query.
        questions     -- list of all (API related) StackOverflow questions.
        javadoc       -- list of all API classes.
        javadoc_dict_classes -- maps the text of a ``<code>`` snippet to a
                         fully qualified class name.
        topk          -- maximum number of classes to return; ``-1`` returns
                         all of them.

    For every answer with a non-negative score of a top question, classes are
    collected from Oracle javadoc links and from ``<code>`` snippets.  Each
    mention adds the question's similarity to the class.  The per-class
    average is boosted by ``1 + log2(count) / 10`` and capped at ``1.0``
    (``so_sim``).  ``doc_sim`` is the best similarity between the query and
    any method of the class.  The final score is the harmonic mean of
    ``doc_sim`` and ``so_sim``.

    Returns the fully qualified class names, best score first.

    Changes from the previous version: anchors without an ``href`` attribute
    are skipped instead of raising ``KeyError``, and a zero
    ``doc_sim + so_sim`` yields a score of ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    # Summed similarity of the questions whose answers mention the class.
    api_classes = dict()
    # Number of mentions behind each sum.
    api_classes_count = dict()

    for question in questions:
        if question.id not in top_questions:
            continue
        question_sim = top_questions[question.id]

        for answer in question.answers:
            if int(answer.score) < 0:
                continue
            # NOTE(review): from_encoding only applies when answer.body is
            # bytes; recommend_api parses without it -- confirm which is
            # intended.
            soup = BeautifulSoup(answer.body, 'html.parser',
                                 from_encoding='utf-8')

            mentioned = []
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if href is None:
                    # e.g. <a name="..."> -- there is no target to inspect.
                    continue
                if ('docs.oracle.com/javase/' in href and '/api/' in href
                        and 'html' in href):
                    # pair[0] is the class name (already including the
                    # package, i.e. java.util.Calendar), pair[1] the method.
                    pair = util.parse_api_link(href)
                    mentioned.append(pair[0])

            for code in soup.find_all('code'):
                # Keep only what precedes the first '(' of the snippet.
                snippet = code.get_text().partition('(')[0]
                if snippet in javadoc_dict_classes:
                    mentioned.append(javadoc_dict_classes[snippet])

            for class_name in mentioned:
                if class_name in api_classes:
                    api_classes[class_name] += question_sim
                    api_classes_count[class_name] += 1
                else:
                    api_classes[class_name] = question_sim
                    api_classes_count[class_name] = 1

    # Average similarity, boosted by the number of mentions, capped at 1.0.
    for class_name, total in api_classes.items():
        count = api_classes_count[class_name]
        api_classes[class_name] = min(
            1.0, total / count * (1.0 + math.log(count, 2) / 10))

    api_sim = {}
    for api in javadoc:
        full_name = api.package_name + '.' + api.class_name
        if full_name not in api_classes:
            continue

        doc_sim = 0.0
        for i, method_matrix in enumerate(api.methods_matrix):
            doc_sim = max(
                doc_sim,
                similarity.sim_doc_pair(query_matrix, method_matrix,
                                        query_idf_vector,
                                        api.methods_idf_vector[i]))

        so_sim = api_classes[full_name]
        denominator = doc_sim + so_sim
        # Harmonic mean; defined as 0.0 when the denominator is zero.
        if denominator:
            api_sim[full_name] = 2 * doc_sim * so_sim / denominator
        else:
            api_sim[full_name] = 0.0

    ranked = sorted(api_sim.items(), key=lambda item: item[1], reverse=True)

    recommended_api = list()
    for item in ranked:
        recommended_api.append(item[0])
        if topk != -1 and len(recommended_api) >= topk:
            break
    return recommended_api
def recommend_api(query_matrix, query_idf_vector, top_questions, questions,
                  javadoc, javadoc_dict_methods, topk):
    """Recommend API methods for a query.

    Arguments (as described by the original comments):
        top_questions -- dictionary of the top-k most relevant questions of
                         the query; key is the question id, value is the
                         similarity between the question and the query.
        questions     -- list of all (API related) StackOverflow questions.
        javadoc       -- list of all API classes.
        javadoc_dict_methods -- maps the text of a ``<code>`` snippet to a
                         fully qualified method name.
        topk          -- maximum number of methods to return; ``-1`` returns
                         all of them.

    For every answer with a non-negative score of a top question, methods are
    collected from Oracle javadoc links and from ``<code>`` snippets.  A
    method is counted at most once per question.  Each counted mention adds
    the question's similarity to the method.  The per-method average is
    boosted by ``1 + log2(count) / 10`` and capped at ``1.0`` (``so_sim``).
    ``doc_sim`` is the similarity between the query and the method's own
    matrix.  The final score is the harmonic mean of the two; when a method
    name occurs several times in ``javadoc`` the best score is kept.

    Returns the fully qualified method names, best score first.

    Changes from the previous version: anchors without an ``href`` attribute
    are skipped instead of raising ``KeyError``, and a zero
    ``doc_sim + so_sim`` yields a score of ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    # Summed similarity of the questions whose answers mention the method.
    api_methods = dict()
    # Number of questions behind each sum (float, as before).
    api_methods_count = dict()

    for question in questions:
        if question.id not in top_questions:
            continue
        question_sim = top_questions[question.id]
        # Methods already counted for this question, across all its answers.
        seen_in_question = set()

        for answer in question.answers:
            if int(answer.score) < 0:
                continue
            soup = BeautifulSoup(answer.body, 'html.parser')

            candidates = []
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if href is None:
                    # e.g. <a name="..."> -- there is no target to inspect.
                    continue
                if ('docs.oracle.com/javase/' in href and '/api/' in href
                        and 'html' in href):
                    # pair[0] is the class name, pair[1] the method name.
                    pair = util.parse_api_link(href)
                    if pair[1] != '':
                        candidates.append(pair[0] + '.' + pair[1])

            for code in soup.find_all('code'):
                # Keep only what precedes the first '(' of the snippet.
                snippet = code.get_text().partition('(')[0]
                if snippet in javadoc_dict_methods:
                    candidates.append(javadoc_dict_methods[snippet])

            for method_name in candidates:
                if method_name in seen_in_question:
                    continue
                seen_in_question.add(method_name)
                if method_name in api_methods:
                    api_methods[method_name] += question_sim
                    api_methods_count[method_name] += 1
                else:
                    api_methods[method_name] = question_sim
                    api_methods_count[method_name] = 1.0

    # Average similarity, boosted by the number of mentions, capped at 1.0.
    for method_name, total in api_methods.items():
        count = api_methods_count[method_name]
        api_methods[method_name] = min(
            1.0, total / count * (1.0 + math.log(count, 2) / 10))

    api_sim = {}
    for api in javadoc:
        class_name = api.package_name + '.' + api.class_name
        for i, method in enumerate(api.methods):
            method_name = class_name + '.' + method
            if method_name not in api_methods:
                continue

            doc_sim = similarity.sim_doc_pair(query_matrix,
                                              api.methods_matrix[i],
                                              query_idf_vector,
                                              api.methods_idf_vector[i])
            so_sim = api_methods[method_name]
            denominator = doc_sim + so_sim
            # Harmonic mean; defined as 0.0 when the denominator is zero.
            if denominator:
                score = 2 * doc_sim * so_sim / denominator
            else:
                score = 0.0

            # The same method name can occur more than once: keep the best.
            if method_name in api_sim:
                api_sim[method_name] = max(api_sim[method_name], score)
            else:
                api_sim[method_name] = score

    ranked = sorted(api_sim.items(), key=lambda item: item[1], reverse=True)

    recommended_api = list()
    for item in ranked:
        recommended_api.append(item[0])
        if topk != -1 and len(recommended_api) >= topk:
            break
    return recommended_api