def delete_self_loop(type_list_sl, filepath=None):
    """Remove self-loop links and save the remainder to links_type.json.

    A self-loop is a link whose source and target refer to the same node
    number (e.g. an issue referencing itself).

    Args:
        type_list_sl: list of link dicts, each with 'source' and 'target'
            entries that carry a 'number' field.
        filepath: directory prefix for the output file; the filtered list
            is written to ``filepath + "links_type.json"``.
            NOTE(review): a ``None`` filepath would raise TypeError on the
            concatenation below — callers appear to always pass a path.
    """
    # Keep only links whose endpoints differ (`!=` instead of `not ... ==`).
    type_list = [
        item for item in type_list_sl
        if int(item['source']['number']) != int(item['target']['number'])
    ]
    file_opt.save_json_to_file(filepath + "links_type.json", type_list)
    return
def extract_link_type(response_p, response_i, renew, filepath=None):
    """Extract links between PR/issue nodes, with checkpoint-resume support.

    Args:
        response_p: GraphQL response containing pullRequests nodes.
        response_i: GraphQL response containing issues nodes.
        renew: 1 = (re)extract links from the nodes; 0 = only read the
            previously saved links_type_sl.json.
        filepath: directory prefix for the checkpoint/output JSON files.

    NOTE(review): the extracted/loaded ``links`` list is never returned —
    the function ends with a bare ``return`` — so the ``renew == 0`` branch
    reads the file and discards the result. Presumably callers re-read the
    saved JSON themselves; confirm before changing.
    """
    if renew == 1:
        nodes = response_p['data']['repository']['pullRequests'][
            'nodes'] + response_i['data']['repository']['issues']['nodes']
        if os.path.isfile(filepath + "links_type.json"):
            # links_type.json already exists: resume from the checkpoint
            links = file_opt.read_json_from_file(filepath + "links_type.json")
        else:
            # no checkpoint: start extracting links from scratch
            links = []
        continue_nodes = []
        for node in nodes:
            # Locate the new starting point: the node after the source of
            # the last checkpointed link.
            if links == []:
                continue_nodes = nodes
                break
            else:
                if str(node['number']) == str(links[-1]['source']['number']):
                    continue_nodes = nodes[nodes.index(node) + 1:]
                    break
                else:
                    continue
        if continue_nodes != []:
            for node in tqdm(continue_nodes):
                # Start extracting links for this node (title + referenced
                # events; other extractors are currently disabled).
                links = extract_link_in_title(nodes, node, links)
                # links = extract_link_in_body(nodes, node, links)
                # links = extract_link_in_comment(nodes, node, links)
                # links = extract_link_in_crossReference(nodes, node, links)
                links = extract_link_in_referencedEvent(nodes, node, links)
                # Checkpoint roughly every 100 accumulated links.
                if len(links) % 100 == 0:
                    file_opt.save_json_to_file(filepath + "links_type_sl.json", links)
            # Final save after the whole run.
            file_opt.save_json_to_file(filepath + "links_type_sl.json", links)
    elif renew == 0:
        links = file_opt.read_json_from_file(filepath + "links_type_sl.json")
    return
def extract_link_mode(linkset, renew, save_file_path):
    """Compute (renew == 1) or load (renew == 0) the link-mode partitions.

    Args:
        linkset: the full set of extracted links (used only when renew == 1).
        renew: 1 = recompute partitions from linkset and persist them;
            0 = load previously persisted partitions.
        save_file_path: directory prefix for the partition JSON files.

    Returns:
        A 5-tuple: (one-to-one links, one-to-many links, self-bilateral
        links, bilateral links, link clusters).
    """
    if renew == 1:
        one_one, one_many = parse_1_and_N(linkset)
        clusters = parse_link_cluster(one_one, one_many)
        self_bi, bi = parse_bilateral(linkset)
        # Persist every partition under save_file_path.
        for fname, data in (
            ("link_1_1.json", one_one),
            ("link_1_N.json", one_many),
            ("link_bi.json", bi),
            ("link_self_bi.json", self_bi),
            ("link_cluster.json", clusters),
        ):
            file_opt.save_json_to_file(save_file_path + fname, data)
    elif renew == 0:
        one_one = file_opt.read_json_from_file(save_file_path + "link_1_1.json")
        one_many = file_opt.read_json_from_file(save_file_path + "link_1_N.json")
        self_bi = file_opt.read_json_from_file(save_file_path + "link_self_bi.json")
        bi = file_opt.read_json_from_file(save_file_path + "link_bi.json")
        clusters = file_opt.read_json_from_file(save_file_path + "link_cluster.json")
    return one_one, one_many, self_bi, bi, clusters
def select_repos():
    """Run the repository-selection pipeline and persist the survivors.

    Filter chain: raw repo info -> drop repos with no language -> keep
    common languages -> filter by issue/PR counts; the result is saved and
    turned into a repo list.
    """
    selected = select_iss_pr_number(
        involve_common_language(
            remove_no_language(
                create_initial_info())))
    # Save the filtered repos.
    file_opt.save_json_to_file(
        init.local_data_filepath + "/after_select_respos.json", selected)
    # Build the repository list from the filtered repos.
    create_repo_list(selected)
def search_repos():
    """Fetch the list of repos matching the search query and save it.

    Pages through the search results until the star count of the last
    fetched repo drops to 10000 or below, then writes the accumulated
    response to candidate_repos_info.json.
    """
    result = query_request(queries.search_candidate_repos)
    # Alias the node list; `+=` extends it in place, so `result` sees
    # every appended page.
    nodes = result['data']['search']['nodes']
    while nodes[-1]['stargazerCount'] > 10000:
        # Next page is keyed by the last repo's star count; its first
        # element duplicates that repo, so it is skipped.
        page = query_request(queries.search_candidate_repos,
                             last_star=nodes[-1]['stargazerCount'])
        nodes += page['data']['search']['nodes'][1:]
        print("has finished ", len(nodes))
    file_opt.save_json_to_file(
        init.local_data_filepath + "/candidate_repos_info.json", result)
def request_graphQL(fullname_repo):
    """Fetch all PR and issue data of owner/repo via the GraphQL API.

    The response is checkpointed to response_<type>.json after every page so
    an interrupted fetch can resume from disk; once both node types are
    complete the repo is appended to the finished-repos list.

    Args:
        fullname_repo: (owner, repo) pair identifying the repository.
    """
    owner = fullname_repo[0]
    repo = fullname_repo[1]
    # `node_type` instead of `type` — the original shadowed the builtin.
    # types = ["issues", "pullRequests"]
    for node_type in ["pullRequests", "issues"]:
        count = 0
        output_response_file = init.local_data_filepath + owner + "/" + repo + "/response_" + node_type + ".json"
        if os.path.isfile(output_response_file):
            # Resume from the last checkpoint on disk.
            r = file_opt.read_json_from_file(output_response_file)
        else:
            r = query_request(queries.search_100_nodes, owner, repo, node_type)
        # Alias the per-type subtree; mutations below update `r` in place.
        data = r['data']['repository'][node_type]
        if not data['pageInfo']['hasNextPage']:
            continue
        print("-----------------start fetch " + owner + "/" + repo + "---------------")
        while True:
            count += 1
            print(owner + "/" + repo, count, datetime.now(),
                  data['totalCount'], len(data['nodes']))
            # Checkpoint every page (the original `count % 1 == 0` guard was
            # always true, so the save was unconditional anyway).
            file_opt.save_json_to_file(output_response_file, r)
            earliest_cursor = data['edges'][-1]['cursor']
            # To skip a node GraphQL cannot return (possibly due to its
            # timelineItems), hard-code that node's cursor here instead,
            # e.g. earliest_cursor = "Y3Vyc29yOnYyOpHOHaMMaA==".
            r2 = query_request(queries.search_100_nodes, owner, repo, node_type,
                               last_typenode=earliest_cursor)
            r2 = request_morethan_100_nodes(r2, owner, repo, node_type)
            new_data = r2['data']['repository'][node_type]
            data['pageInfo'] = new_data['pageInfo']
            data['totalCount'] = new_data['totalCount']
            data['edges'] += new_data['edges']
            data['nodes'] += new_data['nodes']
            if not data['pageInfo']['hasNextPage']:
                file_opt.save_json_to_file(output_response_file, r)
                print("-----------------finish fetch " + owner + "/" + repo + "---------------")
                break
    file_opt.save_line_to_file(init.repos_list_finish_graphQL, owner + "/" + repo)
def random_sample(all_link, size=None, output_file=None):
    """Draw a random sample of links and save it as JSON.

    Args:
        all_link: population of links to sample from.
        size: number of links to draw; defaults to the module-level
            ``sample_size`` (the original hard-coded behavior).
        output_file: destination path; defaults to the original hard-coded
            card-sorting path derived from the sample size.

    Raises:
        ValueError: if ``size`` exceeds ``len(all_link)`` (from
            ``random.sample``).
    """
    if size is None:
        size = sample_size
    if output_file is None:
        output_file = "../card_sorting/sample_" + str(size) + "_supply2.json"
    sample = random.sample(all_link, size)
    file_opt.save_json_to_file(output_file, sample)
    return