def fuzzy_search(self, path_queries, path_jdk, respond_top_n, path_save):
    """Run a fuzzy regex search for every query and persist hits and commands.

    For each query a broad ``.*w1.*w2...*`` command is issued first; while
    fewer than ``respond_top_n`` hits have been collected and words remain,
    the command is progressively relaxed by dropping word indices listed in
    the query's sort order. Per-query results are pickled under *path_save*.
    """
    queries = cm.load_pkl(path_queries)
    jdk = cm.load_pkl(path_jdk)
    total = respond_top_n
    for i, query in enumerate(queries):
        query_words = list(query[0])
        query_sorts = list(query[1])
        data = []
        cmds = []
        source_hash = []
        # Initial pass: keep every word in the command.
        cmd = '.*' + '.*'.join(query_words) + '.*'
        respond, query_cmd = self.search_respond(cmd, source_hash, jdk)
        data.extend(respond)
        cmds.extend(query_cmd)
        idx = 0
        while len(data) < total and idx < len(query_words):
            # NOTE(review): idx == 0 and idx == 1 both drop [query_sorts[0]]
            # and therefore issue the same command twice; kept as-is to
            # preserve the original behavior.
            dropped = [query_sorts[0]] if idx == 0 else query_sorts[:idx]
            kept = [w for j, w in enumerate(query_words) if j not in dropped]
            cmd = '.*' + '.*'.join(kept) + '.*'
            respond, query_cmd = self.search_respond(cmd, source_hash, jdk)
            data.extend(respond)
            cmds.extend(query_cmd)
            idx += 1
        cm.save_pkl(path_save + 'respond' + str(i) + '.pkl', data)
        cm.save_pkl(path_save + 'cmd' + str(i) + '.pkl', cmds)
        print(str(i) + '-' + str(len(queries)) + ' ' + str(len(data)))
def query_parse(path_from, path_parsed_vocab, path_method_vocab, path_to):
    """Parse raw text queries into per-token feature vectors.

    Each kept token becomes ``[value, pos_tag, para, impact]`` where ``para``
    is 0 (unknown), 1 (found in the method-word vocabulary, possibly via a
    synonym) or 2 (a noun found in the parsed JDK vocabulary), and ``impact``
    is the vocabulary frequency backing that decision. The vectors for all
    queries are pickled to *path_to*.

    Fixes vs. the previous revision:
    - the token's own stem is no longer clobbered inside the synonym loop,
      so the JDK-vocabulary check (`stem in vjdk`) uses the original token's
      stem rather than the last synonym's;
    - the impact of a chosen synonym is taken from the collected frequency
      list instead of indexing ``vword`` with the raw (unstemmed) synonym,
      which raised KeyError for synonyms absent from the vocabulary.
    """
    queries = cm.load_txt(path_from)
    vjdk = dict(cm.load_pkl(path_parsed_vocab))
    vword = dict(cm.load_pkl(path_method_vocab))
    stemmer = PorterStemmer()
    # Boilerplate phrases stripped from queries before tokenization.
    str_replace = [
        'in java', 'using java', 'java', 'how to', 'how do', 'what is'
    ]
    data_queries = list()
    p = 0
    for i in range(len(queries)):
        print(str(i))
        query = queries[i]
        for str_re in str_replace:
            query = query.replace(str_re, '')
        data = []
        tokens = cm.get_tokens(query)
        p += len(tokens)
        tokens = nltk.pos_tag(tokens)
        for token in tokens:
            tvalue = token[0]
            ttype = token[1]
            if ttype in type_all:
                para = 0
                impact = 0
                stem = stemmer.stem(tvalue)
                if stem in vword:
                    para = 1
                    impact = vword[stem]
                else:
                    # Fall back to synonyms; keep the one whose stem is most
                    # frequent in the method-word vocabulary.
                    freq = []
                    syns = cm.get_synonyms(stem)
                    for syn in syns:
                        syn_stem = cm.get_stemmed(syn)
                        freq.append(vword.get(syn_stem, 0))
                    if freq:
                        idx_max_freq = freq.index(max(freq))
                        tvalue = syns[idx_max_freq]
                        para = 1
                        impact = freq[idx_max_freq]
                if ttype in type_nn and stem in vjdk:
                    para = 2
                    impact = vjdk[stem]
                tvalue = cm.get_stemmed(tvalue)
                vector = [tvalue, ttype, para, impact]
                data.append(vector)
        data_queries.append(data)
    cm.save_pkl(path_to, data_queries)
def stat_parsed(path_jdk, file_from, folder_to, total_num=-1):
    """Collect API/JDK vocabulary statistics over pickled API sequences.

    Per method it tracks whether the API sequence contains any qualified
    call (``all_api``), any JDK call (``all_jdk``), and whether any method
    name token appears in the sequence (``relate``). Both vocabularies are
    pickled under *folder_to*.

    total_num: number of methods to process; -1 means all.

    Fix: the empty-sequence guard used ``is not '[]'`` — an identity
    comparison against a string literal, which is essentially always true —
    instead of ``!=``, so empty sequences were never actually skipped.
    """
    base = folder_to
    all_api = 0
    all_jdk = 0
    relate = 0
    vocab_api = collections.defaultdict(int)
    vocab_jdk = collections.defaultdict(int)
    jdk = dict(cm.load_pkl(path_jdk))
    methods = cm.load_pkl(file_from + 'methname.pkl')
    lines = cm.load_pkl(file_from + 'apiseq.pkl')
    assert len(lines) == len(methods)
    total = len(methods) if total_num == -1 else total_num
    for i in range(0, total):
        method_tokens = methods[i]
        apiseq = lines[i].lower()
        if apiseq != '[]':  # was `is not '[]'` -- identity, not equality
            for token in method_tokens:
                if apiseq.find(token) >= 0:
                    relate += 1
                    break
            flag_api = 0
            flag_jdk = 0
            apis = apiseq.split(';')
            if len(apis) > 0:
                for api in apis:
                    if '.' in api:
                        flag_api = 1
                        # Strip call arguments / array / generic suffixes.
                        if api.endswith(')'):
                            api = api[0:api.find('(')]
                        if api.endswith(']'):
                            api = api[0:api.find('[')]
                        if api.endswith('>'):
                            api = api[0:api.find('<')]
                        if api in jdk.keys():
                            flag_jdk = 1
                            vocab_jdk[api] += 1
                        else:
                            vocab_api[api] += 1
            all_api += flag_api
            all_jdk += flag_jdk
        print(
            str(i) + '-' + str(total) + ' ' + '-' + str(all_api) + '-' +
            str(relate) + ' ' + str(len(vocab_api.keys())) + '-' +
            str(len(vocab_jdk.keys())))
    cm.save_pkl(base + 'parsed_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'parsed_vocab_jdk.pkl', vocab_jdk)
def reranking(path_parsed_queries, path_queries, path_jdk, path_fuzzy_search, path_rerank):
    """Re-rank fuzzy-search responses: name match first, then API match.

    Scores every response with ``matcher_name``, keeps the top 100, adds an
    ``matcher_api`` score, re-sorts on both, and writes each query text
    followed by its top-10 source snippets to *path_rerank*.
    """
    queries = cm.load_pkl(path_parsed_queries)
    jdk = cm.load_pkl(path_jdk)
    # Reduce each parsed query to its token values only.
    for i, parsed in enumerate(queries):
        queries[i] = [vector[0] for vector in parsed]
    queries_txt = cm.load_txt(path_queries)
    lines = []
    for i in range(99):  # 50
        respond = cm.load_pkl(path_fuzzy_search + 'respond' + str(i) + '.pkl')
        query_cmd = cm.load_pkl(path_fuzzy_search + 'cmd' + str(i) + '.pkl')
        query = queries[i]
        query_txt = queries_txt[i]
        # Pass 1: score every response by method-name similarity.
        scores = list()
        for j in range(len(respond)):
            print(str(i) + '-50, iter-1, ' + str(j) + '-' + str(len(respond)))
            res = respond[j]['_source']
            line = res['method']
            cmd = query_cmd[j]
            scores.append([j, matcher_name(query, line, cmd)])
        scores.sort(key=operator.itemgetter(1), reverse=True)
        scores = scores[:100]
        # Pass 2: append an API-match score for the surviving top 100.
        for j in range(len(scores)):
            print(str(i + 1) + '-99, iter-2, ' + str(j) + '-' + str(len(scores)))
            idx = scores[j][0]
            res = respond[idx]['_source']
            line = res['parsed']
            scores[j].append(matcher_api(query, line, jdk))
        scores.sort(key=operator.itemgetter(1, 2), reverse=True)
        if '\n' not in query_txt:
            query_txt += '\n'
        lines.append(query_txt)
        results = min(len(scores), 10)
        if len(scores) > 0:
            for j in range(results):
                idx = scores[j][0]
                lines.append(respond[idx]['_source']['source'])
        lines.append('\n')
    cm.save_txt(path_rerank, lines)
def fill_simple_data(self, path):
    """Bulk-index (method, parsed apiseq, raw code) triples into Elasticsearch."""
    code = cm.load_txt(path + 'rawcode.txt')
    apiseq = cm.load_pkl(path + 'apiseq.pkl')
    meth = cm.load_pkl(path + 'raw_methname.pkl')
    # All three inputs must be aligned one-to-one.
    assert len(code) == len(apiseq) and len(code) == len(meth)
    body = [
        {
            "_index": self.index_name,
            "_type": self.doc_type,
            "_source": {
                "method": method,
                "parsed": parsed,
                "source": source
            }
        }
        for method, parsed, source in zip(meth, apiseq, code)
    ]
    helpers.bulk(self.es, body, request_timeout=1000)
def analyze_parsed(path_parsed_vocab, path_parsed):
    """Collapse the parsed-API vocabulary to stemmed simple names.

    Entries are visited in descending (count, key) order, so each stemmed
    simple name keeps the count of its highest-ranked qualified entry.
    """
    vocab = dict(cm.load_pkl(path_parsed_vocab))
    ranked = sorted(vocab.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    collapsed = dict()
    for qualified, count in ranked:
        # Keep only the part after the last '.', lower-cased and stemmed.
        simple = cm.get_stemmed(qualified[str(qualified).rfind('.') + 1:].lower())
        if simple not in collapsed:
            collapsed[simple] = count
    cm.save_pkl(path_parsed, collapsed)
def analyze_method(path_parsed_vocab, path_method):
    """Aggregate vocabulary counts by stemmed, lower-cased key.

    Counts of distinct words that share a stem are summed together; the
    merged vocabulary is pickled to *path_method*.
    """
    vocab = dict(cm.load_pkl(path_parsed_vocab))
    ranked = sorted(vocab.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    merged = dict()
    for word, count in ranked:
        key = cm.get_stemmed(word).lower()
        merged[key] = merged.get(key, 0) + count
    cm.save_pkl(path_method, merged)
def query_parse_tree(path_from, path_to):
    """Rewrite parsed queries as ``[word list, sort-order list]`` pairs.

    Tokens are bucketed by POS class (connectors CC/TO/IN, content words
    VB/NN, everything else) and, within each class, by whether the token was
    resolved directly (``para == 1``) or not; each bucket is ordered by
    ascending impact. Connector indices come first and content-word indices
    last in the final sort order.

    Fix: ``item[2] is 1`` compared identity against an int literal — it only
    worked via CPython's small-int caching and raises a SyntaxWarning on
    modern interpreters; replaced with ``== 1``.
    """
    lines = cm.load_pkl(path_from)
    # sorting words
    for i in range(len(lines)):
        items = lines[i]
        mid_list1 = list()
        mid_list2 = list()
        word_list1 = list()
        word_list2 = list()
        other_list1 = list()
        other_list2 = list()
        for j in range(len(items)):
            item = items[j]
            if item[1] in type_cc + type_to + type_in:
                if item[2] == 1:
                    mid_list1.append([j, items[j][3]])
                else:
                    mid_list2.append([j, items[j][3]])
            elif item[1] in type_vb + type_nn:
                if item[2] == 1:
                    word_list1.append([j, items[j][3]])
                else:
                    word_list2.append([j, items[j][3]])
            else:
                if item[2] == 1:
                    other_list1.append([j, items[j][3]])
                else:
                    other_list2.append([j, items[j][3]])
        # Each bucket is sorted by impact (ascending) before concatenation.
        mid_list1.sort(key=operator.itemgetter(1))
        mid_list2.sort(key=operator.itemgetter(1))
        word_list1.sort(key=operator.itemgetter(1))
        word_list2.sort(key=operator.itemgetter(1))
        other_list1.sort(key=operator.itemgetter(1))
        other_list2.sort(key=operator.itemgetter(1))
        sort_list = mid_list1 + mid_list2 + other_list1 + other_list2 + word_list1 + word_list2
        for j in range(len(sort_list)):
            sort_list[j] = sort_list[j][0]
        query_list = list()
        for item in items:
            query_list.append(item[0])
        lines[i] = [query_list, sort_list]
    cm.save_pkl(path_to, lines)
def stat_method(file_from, folder_to, total_num=-1):
    """Build word, sentence-pattern, and per-POS vocabularies over method names.

    total_num: number of methods to process; -1 means all. Every vocabulary
    is pickled under *folder_to*.
    """
    base = folder_to
    vword = dict()
    vsent = dict()
    vtype_cc = dict()
    vtype_cd = dict()
    vtype_in = dict()
    vtype_to = dict()
    vtype_jj = dict()
    vtype_nn = dict()
    vtype_rb = dict()
    vtype_vb = dict()
    vtype_ot = dict()
    # POS classes checked in order; the first matching bucket wins.
    buckets = [
        (type_cc, vtype_cc, 'cc'),
        (type_cd, vtype_cd, 'cd'),
        (type_jj, vtype_jj, 'jj'),
        (type_nn, vtype_nn, 'nn'),
        (type_rb, vtype_rb, 'rb'),
        (type_vb, vtype_vb, 'vb'),
        (type_in, vtype_in, 'in'),
        (type_to, vtype_to, 'to'),
    ]
    methods = cm.load_pkl(file_from)
    total = len(methods) if total_num == -1 else total_num
    for i in range(0, total):
        print(str(i) + '-' + str(total))
        tokens = methods[i]
        update_vocab_by_tokens(vword, tokens)
        sent = []
        for word, tag in nltk.pos_tag(tokens):
            for tag_set, vocab, label in buckets:
                if tag in tag_set:
                    update_vocab_by_token(vocab, word)
                    sent.append(label)
                    break
            else:
                # Unrecognized tag: record the token-tag pair and raw tag.
                update_vocab_by_token(vtype_ot, str(word) + '-' + str(tag))
                sent.append(tag)
        update_vocab_by_token(vsent, '-'.join(sent))
        print(
            str(i) + '-' + str(total) + ': ' + str(len(vword.keys())) + '-' +
            str(len(vsent.keys())) + ' ' + str(len(vtype_cd.keys())) + '-' +
            str(len(vtype_in.keys())) + '-' + str(len(vtype_to.keys())) + '-' +
            str(len(vtype_jj.keys())) + '-' + str(len(vtype_nn.keys())) + '-' +
            str(len(vtype_rb.keys())) + '-' + str(len(vtype_vb.keys())) + '-' +
            str(len(vtype_ot.keys())) + '-')
    cm.save_pkl(base + 'method_vword.pkl', vword)
    cm.save_pkl(base + 'method_vsent.pkl', vsent)
    cm.save_pkl(base + 'method_vtype_cd.pkl', vtype_cd)
    cm.save_pkl(base + 'method_vtype_in.pkl', vtype_in)
    cm.save_pkl(base + 'method_vtype_to.pkl', vtype_to)
    cm.save_pkl(base + 'method_vtype_jj.pkl', vtype_jj)
    cm.save_pkl(base + 'method_vtype_nn.pkl', vtype_nn)
    cm.save_pkl(base + 'method_vtype_rb.pkl', vtype_rb)
    cm.save_pkl(base + 'method_vtype_vb.pkl', vtype_vb)
    cm.save_pkl(base + 'method_vtype_ot.pkl', vtype_ot)
def fill_data(self, path_formatted_repos):
    """Bulk-load the ten pre-formatted body pickles into Elasticsearch."""
    for chunk in range(10):
        print(str(chunk) + '-9')
        body = cm.load_pkl(path_formatted_repos + 'body' + str(chunk) + '.pkl')
        helpers.bulk(self.es, body, request_timeout=1000)
def stat_parameter_return(path_jdk, folder_method, folder_parameter, folder_return, folder_to, total_files):
    """Collect parameter/return type statistics across extracted method files.

    For each ``method<i>.txt`` with matching ``parameter<i>.txt`` and
    ``return<i>.txt``, counts occurrences of every parameter/return type
    (split into JDK vs. non-JDK vocabularies) and every parameter name, and
    how many methods share a name token with their parameter/return line
    (``con``). All three vocabularies are pickled under *folder_to*.

    Cleanup: the locals previously named ``all`` and ``type`` shadowed
    builtins; renamed without changing behavior or output. The method file
    is now loaded via the already-built ``path`` instead of rebuilding it.
    """
    base = folder_to
    vocab_api = dict()
    vocab_jdk = dict()
    vocab_name = dict()
    total_count = 0
    con = 0
    jdk = dict(cm.load_pkl(path_jdk))
    for i in range(0, total_files):
        path = folder_method + 'method' + str(i) + '.txt'
        if os.path.exists(path):
            lines_method = cm.load_txt(path)
            lines_parameter = cm.load_txt(folder_parameter + 'parameter' + str(i) + '.txt')
            lines_return = cm.load_txt(folder_return + 'return' + str(i) + '.txt')
            for j in range(len(lines_method)):
                print(
                    str(i) + '-' + str(total_files) + ' -' + str(j) + ' ' +
                    str(len(lines_method)) + ' ' + str(con) + ' - ' +
                    str(total_count) + ' ' + str(len(vocab_api.keys())) + '-' +
                    str(len(vocab_jdk.keys())) + '-' +
                    str(len(vocab_name.keys())))
                total_count += 1
                line_method = lines_method[j]
                tokens = cm.get_tokens(cm.camel_split(line_method))
                line_paras = lines_parameter[j].replace('\n', '')
                para_types = []
                para_names = []
                line_return = lines_return[j].replace('\n', '')
                para_types.append(line_return)
                # A method "relates" to its signature when any of its name
                # tokens appears in the parameter or return line.
                line = line_paras + ' ' + line_return
                for token in tokens:
                    if line.find(token) >= 0:
                        con += 1
                        break
                if '[]' not in line_paras:
                    if ';' in line_paras:
                        # Multiple parameters, ';'-separated "type,name" pairs.
                        for line_para in line_paras.split(';'):
                            paras = line_para.split(',')
                            if len(paras) == 2:
                                para_types.append(paras[0])
                                para_names.append(paras[1])
                    else:
                        paras = line_paras.split(',')
                        if len(paras) == 2:
                            para_types.append(paras[0])
                            para_names.append(paras[1])
                for para_type in para_types:
                    if para_type in jdk.keys():
                        vocab_jdk[para_type] = vocab_jdk.get(para_type, 0) + 1
                    else:
                        vocab_api[para_type] = vocab_api.get(para_type, 0) + 1
                for name in para_names:
                    vocab_name[name] = vocab_name.get(name, 0) + 1
    cm.save_pkl(base + 'para_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'para_vocab_jdk.pkl', vocab_jdk)
    cm.save_pkl(base + 'para_vocab_name.pkl', vocab_name)
def stat_parsed(path_jdk, folder_from_method, folder_to_parsed, folder_to, total_files):
    """Collect API/JDK vocabulary statistics over per-file parsed API lists.

    Mirrors the pickle-based variant but reads ``method<i>.txt`` /
    ``parsed<i>.txt`` pairs from disk. Both vocabularies are pickled under
    *folder_to*.

    Fixes: the empty-line guard used ``is not '[]'`` (identity comparison
    against a string literal, essentially always true) instead of ``!=``;
    the local ``all`` shadowed the builtin and was renamed.
    """
    base = folder_to
    vocab_api = dict()
    vocab_jdk = dict()
    total_count = 0
    all_api = 0
    all_jdk = 0
    relate = 0
    jdk = dict(cm.load_pkl(path_jdk))
    for i in range(0, total_files):
        path = folder_from_method + 'method' + str(i) + '.txt'
        if os.path.exists(path):
            methods = cm.load_txt(path)
            lines = cm.load_txt(folder_to_parsed + 'parsed' + str(i) + '.txt')
            for j in range(len(methods)):
                total_count += 1
                line = lines[j].replace('\n', '')
                if line != '[]':  # was `is not '[]'` -- identity, not equality
                    tokens = cm.get_tokens(cm.camel_split(methods[j]))
                    for token in tokens:
                        if line.find(token) >= 0:
                            relate += 1
                            break
                    flag_api = 0
                    flag_jdk = 0
                    apis = line.split(',')
                    if len(apis) > 0:
                        for api in apis:
                            if '.' in api:
                                flag_api = 1
                                # Strip call arguments / array / generic suffixes.
                                if api.endswith(')'):
                                    api = api[0:api.find('(')]
                                if api.endswith(']'):
                                    api = api[0:api.find('[')]
                                if api.endswith('>'):
                                    api = api[0:api.find('<')]
                                if api in jdk.keys():
                                    flag_jdk = 1
                                    if api in vocab_jdk.keys():
                                        vocab_jdk[api] += 1
                                    else:
                                        vocab_jdk[api] = 1
                                else:
                                    if api in vocab_api.keys():
                                        vocab_api[api] += 1
                                    else:
                                        vocab_api[api] = 1
                    all_api += flag_api
                    all_jdk += flag_jdk
                print(
                    str(i) + '-41025 ' + str(j) + '-' + str(len(methods)) +
                    ' ' + str(total_count) + '-' + str(all_api) + '-' +
                    str(all_jdk) + '-' + str(relate) + ' ' +
                    str(len(vocab_api.keys())) + '-' +
                    str(len(vocab_jdk.keys())))
    cm.save_pkl(base + 'parsed_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'parsed_vocab_jdk.pkl', vocab_jdk)