import os
import time

# Project-local names assumed to be defined/imported elsewhere in this module:
# LOCAL_DATA_PATH, file_list_cache, api, GitHubAPI, localfile, init,
# fetch_raw_diff, parse_diff, filterNonCodeFiles, get_pr_and_issue_numbers.


def fetch_pr_code_info(repo, pr_id, must_in_local=False):
    global file_list_cache
    ind = (repo, pr_id)
    if ind in file_list_cache:
        return file_list_cache[ind]
    path = LOCAL_DATA_PATH + '/pr_data/%s/%s' % (repo, pr_id)
    # if os.path.exists(path + '/toobig.txt'):
    #     return []
    raw_diff_path = path + '/raw_diff.json'
    pull_files_path = path + '/pull_files.json'
    if os.path.exists(raw_diff_path) or os.path.exists(pull_files_path):
        if os.path.exists(raw_diff_path):
            file_list = localfile.get_file(raw_diff_path)
        elif os.path.exists(pull_files_path):
            pull_files = localfile.get_file(pull_files_path)
            file_list = [
                parse_diff(file["file_full_name"], file["changed_code"])
                for file in pull_files
            ]
        else:
            raise Exception('error on fetching local file %s' % path)
    else:
        if must_in_local:
            raise Exception('not found in local')
        file_list = fetch_file_list(repo, pr_id)
    codeOnlyFileList = filterNonCodeFiles(file_list, path)
    if len(codeOnlyFileList) > 0:
        file_list_cache[ind] = codeOnlyFileList
    return codeOnlyFileList
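# filterNonCodeFiles is used above but defined elsewhere in the project. The
# sketch below is only an assumption of its contract, added for readability:
# keep entries whose file name has a source-code extension and drop the rest.
# The entry key ('name'), the extension list, and ignoring `path` are all
# hypothetical; the real helper may differ.
def _filter_non_code_files_sketch(file_list, path):
    CODE_EXTENSIONS = ('.py', '.java', '.c', '.cpp', '.h', '.js', '.go', '.rb')
    return [f for f in file_list
            if f.get('name', '').lower().endswith(CODE_EXTENSIONS)]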
def get_repo_PRlist(repo, type, renew):
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
    # todo: could be extended to analyze forks in the future
    if type == 'fork':
        save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'
    if (os.path.exists(save_path)) and (not renew):
        print("read from local files and return")
        try:
            return localfile.get_file(save_path)
        except:
            pass
    print('file does not exist on local disk, start to fetch a new list for', repo, type)
    if (type == 'pull') or (type == 'issue'):
        ret = api.request('repos/%s/%ss' % (repo, type), state='all', paginate=True)
    else:
        if type == 'branch':
            type = 'branche'  # pluralized to 'branches' by the '%ss' below
        ret = api.request('repos/%s/%ss' % (repo, type), True)
    localfile.write_to_file(save_path, ret)
    return ret
def fetch_file_list(pull, renew=False):
    # Takes a full pull-request object; shadowed by the (repo, num) variant
    # of the same name defined further below.
    repo, num = pull["base"]["repo"]["full_name"], str(pull["number"])
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/' + num + '/raw_diff.json'
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    # Probe page 3 of the file list: a non-empty result means the PR touches
    # too many files to fetch.
    # t = api.get('repos/%s/pulls/%s/files?page=3' % (repo, num))
    t = api.request('repos/%s/pulls/%s/files?page=3' % (repo, num))
    file_list = []
    if len(t) > 0:
        raise Exception('too big', pull['html_url'])
    else:
        li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
        # li = api.request('repos/%s/pulls/%s/files' % (repo, num), True)
        time.sleep(0.8)
        for f in li:
            if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f):
                file_list.append(parse_diff(f['filename'], f['patch']))
    localfile.write_to_file(save_path, file_list)
    return file_list
def get_repo_info_forPR_experiment(repo, type, renew):
    filtered_result = []
    api = GitHubAPI()
    print(init.local_pr_data_dir + repo + '/pull_list.json')
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/pull_list.json'
    if (os.path.exists(save_path)) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
def get_PR(repo, num, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/api.json' % (repo, num)
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    r = api.request('repos/%s/pulls/%s' % (repo, num))
    time.sleep(3.0)
    localfile.write_to_file(save_path, r)
    return r
def get_pull_commit(pull, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    commits = api.request(pull['commits_url'].replace('https://api.github.com/', ''),
                          paginate=True, state='all')
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
def get_pr_commit(repo, pr_id, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (repo, pr_id)
    commit_url = 'repos/%s/pulls/%s/commits' % (repo, pr_id)
    # The size check guards against previously cached empty files.
    if os.path.exists(save_path) and (not renew) and (os.stat(save_path).st_size > 2):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    # commits = api.request(pull['commits_url'].replace('https://api.github.com/', ''), True)
    api = GitHubAPI()
    commits = api.request(commit_url.replace('https://api.github.com/', ''),
                          paginate=True, state='all')
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
def fetch_commit(url, renew=False):
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/%s.json' % url.replace(
        'https://api.github.com/repos/', '')
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    c = api.request(url)
    time.sleep(0.7)
    file_list = []
    for f in c['files']:
        if 'patch' in f:
            file_list.append(fetch_raw_diff.parse_diff(f['filename'], f['patch']))
    localfile.write_to_file(save_path, file_list)
    return file_list
def fetch_file_list(repo, num, renew=False):
    # Redefinition with a (repo, num) signature; it shadows the earlier
    # fetch_file_list(pull) defined above.
    api = GitHubAPI()
    # repo, num = pull["base"]["repo"]["full_name"], str(pull["number"])
    outfile_prefix = init.local_pr_data_dir + repo + "/" + str(num)
    save_path = outfile_prefix + '/raw_diff.json'
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    file_list = []
    li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
    time.sleep(0.8)
    for f in li:
        if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f):
            file_list.append(fetch_raw_diff.parse_diff(f['filename'], f['patch']))
    localfile.write_to_file(save_path, file_list)
    return file_list
def get_another_pull(pull, renew=False):
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/another_pull.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    # found cites in comments, but checking events is easier.
    comments_href = pull["_links"]["comments"]["href"]
    comments = api.request(comments_href, True)
    time.sleep(0.7)
    candidates = []
    for comment in comments:
        candidates.extend(get_pr_and_issue_numbers(comment["body"]))
    candidates.extend(get_pr_and_issue_numbers(pull["body"]))
    result = list(set(candidates))
    localfile.write_to_file(save_path, result)
    return result
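# Illustrative driver, not part of the original module: it only sketches how
# the helpers above might compose. The repository name is a placeholder and
# the printed summary is arbitrary.
def _example_walk_repo(repo='owner/repo'):
    pulls = get_repo_PRlist(repo, 'pull', renew=False)  # cached pull list
    for pull in pulls:
        num = pull['number']
        files = fetch_pr_code_info(repo, num)   # parsed, code-only changed files
        linked = get_another_pull(pull)         # PR/issue numbers cited in comments/body
        print(num, len(files), linked)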