def get_repo_PRlist(repo, type, renew):
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
    # todo: could be extended to analyze forks in the future
    if type == 'fork':
        save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'
    if os.path.exists(save_path) and (not renew):
        print("read from local file and return")
        try:
            return localfile.get_file(save_path)
        except:
            pass
    print('file does not exist on local disk, start to fetch new list for', repo, type)
    if (type == 'pull') or (type == 'issue'):
        ret = api.request('repos/%s/%ss' % (repo, type), state='all', paginate=True)
    else:
        if type == 'branch':
            type = 'branche'  # the endpoint below appends 's', so this becomes 'branches'
        ret = api.request('repos/%s/%ss' % (repo, type), True)
    localfile.write_to_file(save_path, ret)
    return ret
def fetch_file_list(pull, renew=False):
    repo, num = pull["base"]["repo"]["full_name"], str(pull["number"])
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/' + num + '/raw_diff.json'
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    # Probe page 3 of the file listing: a non-empty result means the PR touches
    # too many files to analyze, so treat it as too large.
    t = api.request('repos/%s/pulls/%s/files?page=3' % (repo, num))
    file_list = []
    if len(t) > 0:
        raise Exception('too big', pull['html_url'])
    else:
        li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
        time.sleep(0.8)
        for f in li:
            # skip huge or patch-less files
            if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f):
                file_list.append(parse_diff(f['filename'], f['patch']))
    localfile.write_to_file(save_path, file_list)
    return file_list
def getFeatureVectorForModeling(data):
    renew = True
    path = data[0]  # e.g. 'data/clf/second_msr_pairs.txt'
    label = data[1]
    group = data[2]
    default_path = init.currentDIR + '/' + path.replace('.txt', '') + '_feature_vector'
    X_path, y_path = default_path + '_X.json', default_path + '_y.json'
    if os.path.exists(X_path) and os.path.exists(y_path) and (not renew):
        print('feature vector already exists, read from local file')
        X = localfile.get_file(X_path)
        y = localfile.get_file(y_path)
        return X, y

    X, y = [], []
    # group the PR pairs by repository
    repo2PRpair_map = {}
    with open(init.currentDIR + '/' + path) as f:
        all_pr = f.readlines()
    for l in tqdm(all_pr):
        repo, n1, n2 = l.strip().split()
        if repo not in repo2PRpair_map:
            repo2PRpair_map[repo] = []
        repo2PRpair_map[repo].append((n1, n2))

    out_file = open(default_path + '_X_and_Y.txt', 'w+')
    for repo in tqdm(repo2PRpair_map):
        # compute the feature vector for every pair in this repo, sequentially
        for pr_pair in tqdm(repo2PRpair_map[repo]):
            print(repo, pr_pair[0], pr_pair[1])
            featureVec = get_featureVector_ForPRpair(repo, pr_pair[0], pr_pair[1])
            X.append(featureVec)
            y.append(label)
            print(repo, pr_pair[0], pr_pair[1], featureVec, label, file=out_file)
    out_file.close()

    # save to local
    localfile.write_to_file(X_path, X)
    localfile.write_to_file(y_path, y)
    return (X, y)
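# --- Illustration (not part of the original code) ---
# A minimal sketch of how the (X, y) pairs returned by getFeatureVectorForModeling
# might be fed to a classifier. The choice of RandomForestClassifier and the
# train/test split are assumptions for illustration only; the project may use a
# different model and evaluation setup.
def example_train_classifier_sketch(dataset):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split

    X, y = [], []
    # `dataset` is assumed to be an iterable of (path, label, group) tuples,
    # matching the `data` argument of getFeatureVectorForModeling above.
    for data in dataset:
        X_part, y_part = getFeatureVectorForModeling(data)
        X.extend(X_part)
        y.extend(y_part)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test)))
    return clf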
def get_repo_info_forPR(repo, type, renew):
    filtered_result = []
    api = GitHubAPI()
    print(init.local_pr_data_dir + repo + '/pull_list.json')
    pullListfile = pathlib.Path(init.local_pr_data_dir + repo + '/pull_list.json')
    if pullListfile.exists():
        tocheck_pr = getOldOpenPRs(repo)
        print("tocheck_pr " + str(tocheck_pr))
        if tocheck_pr is None:
            tocheck_pr = 0
        save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
        if type == 'fork':
            save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'
        if os.path.exists(save_path) and (not renew):
            try:
                return localfile.get_file(save_path)
            except:
                pass
        print('start fetching new list for', repo, type)
        if (type == 'pull') or (type == 'issue'):
            page_index = 1
            # numPR is the number of PRs expected so far if every page was full;
            # initialized once here (not inside the loop) so the last-page check below works.
            numPR = init.numPRperPage
            while True:
                ret = api.requestPR('repos/%s/%ss' % (repo, type), state='all', page=page_index)
                if len(ret) > 0:
                    for pr in ret:
                        if pr['number'] >= tocheck_pr:
                            filtered_result.append(pr)
                        else:
                            # reached PRs that were already checked
                            print('get all ' + str(len(filtered_result)) + ' prs')
                            localfile.replaceWithNewPRs(save_path, filtered_result)
                            return filtered_result
                    if len(filtered_result) < numPR:
                        # the last page was not full, so there are no more pages
                        print('get all ' + str(len(filtered_result)) + ' prs -- after page ' + str(page_index))
                        localfile.replaceWithNewPRs(save_path, filtered_result)
                        return filtered_result
                    else:
                        page_index += 1
                        numPR += init.numPRperPage
                else:
                    print("get pulls failed")
                    return filtered_result
        else:
            if type == 'branch':
                type = 'branche'
            ret = api.request('repos/%s/%ss' % (repo, type), True)
            localfile.write_to_file(save_path, ret)
    else:
        print('pull list does not exist, get from scratch')
        ret = get_repo_PRlist(repo, type, renew)
    return ret
def filterNonCodeFiles(file_list, outfile_prefix):
    newFileList = []
    count = 0
    for f in file_list:
        # give up on PRs that touch more than 500 files
        if count > 500:
            localfile.write_to_file(outfile_prefix + "/toobig.txt", '500file')
            return []
        # keep only non-text (i.e., code) files
        if not language_tool.is_text(f['name']):
            newFileList.append(f)
        count += 1
    return newFileList
def preprocess_documents(repo, pulls, renew):
    for pull in tqdm(pulls):  # tqdm prints a progress bar, https://github.com/tqdm/tqdm/
        pr_id = pull['number']
        outfile_prefix = init.local_pr_data_dir + repo + "/" + str(pr_id)
        print(str(pr_id))
        if os.path.exists(outfile_prefix + '/updateAt.txt') and (not renew):
            print('skip')
            continue

        # todo: why do I care about the create date when training the model?
        # The "skip PRs older than init.comparePRs_timeWindow_inDays" check is
        # left out for now.

        # ----------- title and description -----------
        wordext.get_tokens_from_file(pull['title'], outfile_prefix, 'title')
        if pull["body"]:
            if not os.path.exists(outfile_prefix + "/body_tokens_stemmed.tsv") or renew:
                import re
                body_str = re.sub("(<.*?>)", "", pull['body'], flags=re.DOTALL)  # strip HTML tags
                wordext.get_tokens_from_file(body_str, outfile_prefix, 'body')

        # ----------- commit messages -----------
        print('check commit')
        from github.github_api import concat_commits
        from github.github_api import get_pr_commit
        all_commit_msg = concat_commits(get_pr_commit(repo, pr_id))
        wordext.get_tokens_from_file(all_commit_msg, outfile_prefix, 'commit')

        # ----------- code & files -----------
        print('check code, file')
        from github.github_api import fetch_pr_code_info
        pr_filelist_json = fetch_pr_code_info(repo, pr_id)
        if len(pr_filelist_json) == 0:
            localfile.write_to_file(outfile_prefix + "/updateAt.txt",
                                    str(datetime.datetime.now().strftime("%Y-%m-%d")))
            continue
        wordext.get_code_tokens_from_file(pr_filelist_json, outfile_prefix, 'add_code')
        wordext.get_code_tokens_from_file(pr_filelist_json, outfile_prefix, 'del_code')

        # ----------- location -----------
        pr_filelist_json = fetch_pr_code_info(repo, pr_id)
        if len(pr_filelist_json) > 0:
            getCodeLocation(pr_filelist_json, outfile_prefix)

        # ----------- version number & cross-referenced PR or issue -----------
        print('check reference')
        body_text = '' if pull["body"] is None else pull["body"]
        pull_text = str(pull["title"]) + ' ' + str(body_text) + ' ' + all_commit_msg
        getReference(repo, pull_text, outfile_prefix)

        localfile.write_to_file(outfile_prefix + "/updateAt.txt",
                                str(datetime.datetime.now().strftime("%Y-%m-%d")))
def get_pull_commit(pull, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    commits = api.request(pull['commits_url'].replace('https://api.github.com/', ''), paginate=True, state='all')
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
def get_pull(repo, num, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/api.json' % (repo, num)
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    r = api.request('repos/%s/pulls/%s' % (repo, num))
    time.sleep(3.0)
    localfile.write_to_file(save_path, r)
    return r
def get_pull_commit(pull, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    commits = api.request('GET', pull['commits_url'], True)
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
def fetch_pr_info(pull, must_in_local=False):
    global file_list_cache
    ind = (pull["base"]["repo"]["full_name"], pull["number"])
    if ind in file_list_cache:
        return file_list_cache[ind]

    path = '/DATA/luyao/pr_data/%s/%s' % (pull["base"]["repo"]["full_name"], pull["number"])
    parse_diff_path = path + '/parse_diff.json'
    raw_diff_path = path + '/raw_diff.json'
    pull_files_path = path + '/pull_files.json'
    flag_path = path + '/too_large_flag.json'

    if os.path.exists(flag_path):
        raise Exception('too big', pull['html_url'])

    if os.path.exists(parse_diff_path):
        try:
            ret = localfile.get_file(parse_diff_path)
            file_list_cache[ind] = ret
            return ret
        except:
            pass

    if os.path.exists(raw_diff_path) or os.path.exists(pull_files_path):
        if os.path.exists(raw_diff_path):
            file_list = localfile.get_file(raw_diff_path)
        elif os.path.exists(pull_files_path):
            pull_files = localfile.get_file(pull_files_path)
            file_list = [parse_diff(file["file_full_name"], file["changed_code"]) for file in pull_files]
        else:
            raise Exception('error on fetch local file %s' % path)
    else:
        if must_in_local:
            raise Exception('not found in local')
        try:
            file_list = fetch_file_list(pull)
        except:
            localfile.write_to_file(flag_path, 'flag')
            raise Exception('too big', pull['html_url'])

    # print(path, [x["name"] for x in file_list])
    localfile.write_to_file(parse_diff_path, file_list)
    file_list_cache[ind] = file_list
    return file_list
def get_pr_commit(repo, pr_id, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (repo, pr_id)
    commit_url = 'repos/%s/pulls/%s/commits' % (repo, pr_id)
    # st_size > 2 guards against an empty or "[]"-only cache file
    if os.path.exists(save_path) and (not renew) and (os.stat(save_path).st_size > 2):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    api = GitHubAPI()
    commits = api.request(commit_url, paginate=True, state='all')
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
def fetch_commit(url, renew=False):
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/%s.json' % url.replace('https://api.github.com/repos/', '')
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    c = api.request(url)
    time.sleep(0.7)
    file_list = []
    for f in c['files']:
        if 'patch' in f:
            file_list.append(fetch_raw_diff.parse_diff(f['filename'], f['patch']))
    localfile.write_to_file(save_path, file_list)
    return file_list
def fetch_file_list(repo, num, renew=False):
    api = GitHubAPI()
    outfile_prefix = init.local_pr_data_dir + repo + "/" + str(num)
    save_path = outfile_prefix + '/raw_diff.json'
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    file_list = []
    li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
    time.sleep(0.8)
    for f in li:
        # skip huge or patch-less files
        if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f):
            file_list.append(fetch_raw_diff.parse_diff(f['filename'], f['patch']))
    localfile.write_to_file(save_path, file_list)
    return file_list
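# --- Illustration (not part of the original code) ---
# The GitHubAPI wrapper and its paginate=True behavior are project-specific. As a
# rough sketch of what such a paginated call does, the same endpoint can be queried
# directly with `requests`, following the Link header for the next page. The token
# handling and the 100-per-page value are assumptions for illustration only.
import requests

def fetch_pull_files_directly(repo, num, token=None):
    """Return the raw file entries of a pull request, following pagination."""
    url = 'https://api.github.com/repos/%s/pulls/%s/files' % (repo, num)
    headers = {'Accept': 'application/vnd.github+json'}
    if token:
        headers['Authorization'] = 'token %s' % token
    results, params = [], {'per_page': 100}
    while url:
        resp = requests.get(url, headers=headers, params=params)
        resp.raise_for_status()
        results.extend(resp.json())
        # GitHub exposes the next page via the Link header; requests parses it into resp.links
        url = resp.links.get('next', {}).get('url')
        params = {}  # the 'next' URL already carries its query parameters
    return results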
def get_another_pull(pull, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/another_pull.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    comments_href = pull["_links"]["comments"]["href"]
    comments = api.request('GET', comments_href, True)
    time.sleep(0.7)
    candidates = []
    for comment in comments:
        candidates.extend(get_pr_and_issue_numbers(comment["body"]))
    candidates.extend(get_pr_and_issue_numbers(pull["body"]))
    result = list(set(candidates))
    localfile.write_to_file(save_path, result)
    return result
def get_another_pull(pull, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/another_pull.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    # look for cross-references in the comments; checking events would be easier
    comments_href = pull["_links"]["comments"]["href"]
    comments = api.request(comments_href.replace('https://api.github.com/', ''), paginate=True, state='all')
    time.sleep(0.7)
    candidates = []
    for comment in comments:
        candidates.extend(get_pr_and_issue_numbers(comment["body"]))
    candidates.extend(get_pr_and_issue_numbers(pull["body"]))
    result = list(set(candidates))
    localfile.write_to_file(save_path, result)
    return result
def get_repo_info(repo, type, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
    if type == 'fork':
        save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    print('start fetching new list for', repo, type)
    if (type == 'pull') or (type == 'issue'):
        ret = api.request('GET', 'repos/%s/%ss?state=closed' % (repo, type), True)
        ret.extend(api.request('GET', 'repos/%s/%ss?state=open' % (repo, type), True))
    else:
        if type == 'branch':
            type = 'branche'
        ret = api.request('GET', 'repos/%s/%ss' % (repo, type), True)
    localfile.write_to_file(save_path, ret)
    return ret
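# --- Illustration (not part of the original code) ---
# get_pull, get_pull_commit, get_another_pull, fetch_commit, and get_repo_info all
# repeat the same "return the cached JSON unless renew, otherwise fetch and save"
# pattern. A hypothetical refactoring could factor it into a decorator like the
# sketch below; localfile.get_file / localfile.write_to_file are the project's own
# helpers, everything else here is an assumption for illustration.
import functools

def cached_json(path_builder):
    """Cache a fetcher's result at the path computed from its arguments."""
    def decorator(fetch):
        @functools.wraps(fetch)
        def wrapper(*args, renew=False, **kwargs):
            save_path = path_builder(*args, **kwargs)
            if os.path.exists(save_path) and not renew:
                try:
                    return localfile.get_file(save_path)
                except Exception:
                    pass  # fall through and re-fetch if the cached file is unreadable
            result = fetch(*args, **kwargs)
            localfile.write_to_file(save_path, result)
            return result
        return wrapper
    return decorator

# Example usage (hypothetical):
# @cached_json(lambda repo, num: LOCAL_DATA_PATH + '/pr_data/%s/%s/api.json' % (repo, num))
# def get_pull(repo, num):
#     return api.request('repos/%s/pulls/%s' % (repo, num))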
def random_pairs():
    global select_set
    # choose = ['saltstack/salt']  # training repos
    # choose = ['mozilla-b2g/gaia', 'twbs/bootstrap', 'scikit-learn/scikit-learn', 'rust-lang/rust',
    #           'servo/servo', 'pydata/pandas', 'saltstack/salt', 'nodejs/node', 'symfony/symfony-docs',
    #           'zendframework/zf2', 'symfony/symfony', 'kubernetes/kubernetes']  # testing repos
    print("randomly pick a repo...")
    choose = [
        'cocos2d/cocos2d-x', 'dotnet/corefx', 'django/django', 'angular/angular.js',
        'JuliaLang/julia', 'ceph/ceph', 'joomla/joomla-cms', 'facebook/react',
        'hashicorp/terraform', 'rails/rails', 'docker/docker', 'elastic/elasticsearch',
        'emberjs/ember.js', 'ansible/ansible'
    ]
    find = False
    while not find:
        # pick a repo at random
        while True:
            try:
                repo = choose[random.randint(0, len(choose) - 1)]
                print("..." + repo)
                break
            except:
                continue

        ok_file = '/DATA/luyao/pr_data/%s/list_for_random_generate_c1.json' % repo
        if all_pr_flag:
            ok_file = ok_file.replace('_c1', '_all')

        if os.path.exists(ok_file):
            print(ok_file + " exists!")
            nums = localfile.get_file(ok_file)
        else:
            print(ok_file + " file does not exist ...")
            nums = os.listdir('/DATA/luyao/pr_data/%s' % repo)
            print(repo + " has " + str(len(nums)) + " PRs in total on GitHub")

            # filter out PRs that only touch config or readme files
            def like_localize(p):
                if 'confi' in p["title"].lower():
                    return True
                if 'readme' in p["title"].lower():
                    return True
                return False

            # filter out PRs whose title or description is too short
            def too_small(p):
                if len(p["title"]) <= 20:
                    return True
                if (p["body"] is not None) and (len(p["body"]) <= 20):
                    return True
                return False

            new_num = []
            cnt, tot_cnt = 0, len(nums)
            # keep only merged (unless all_pr_flag), reasonably sized, non-trivial PRs
            print("start to parse every PR...")
            for x in nums:
                cnt += 1
                if cnt % 100 == 0:  # progress
                    print(1.0 * cnt / tot_cnt)
                if x.isdigit():
                    p = get_pull(repo, x)
                    if (all_pr_flag or (p["merged_at"] is not None)) and (not check_large(p)) and \
                            (not too_small(p)) and (not like_localize(p)):
                        len_f = len(fetch_pr_info(p))
                        if (len_f > 0) and (len_f <= 10):
                            new_num.append(x)
            print("length of new_nums " + str(len(new_num)))
            nums = new_num
            print("length of nums: " + str(len(nums)))
            localfile.write_to_file(ok_file, nums)

        l = len(nums)
        if l <= 100:
            raise Exception('too small', repo)
        # sample repos with fewer PRs less often
        if l <= 1000:
            if random.randint(0, 3) > 0:
                continue

        # pick a random pair of distinct PRs from different authors
        ti = 0
        while not find:
            ti += 1
            if l > 0:
                x = nums[random.randint(0, l - 1)]
                y = nums[random.randint(0, l - 1)]
                if ((repo, x, y) in msr_d) or ((repo, y, x) in msr_d):
                    continue
                if (repo, x, y) in select_set:
                    continue
                try:
                    if (x != y) and x.isdigit() and y.isdigit():
                        p1 = get_pull(repo, x)
                        p2 = get_pull(repo, y)
                        if p1["user"]["id"] != p2["user"]["id"]:
                            select_set.add((repo, x, y))
                            select_set.add((repo, y, x))
                            find = True
                            break
                except:
                    print("PR 404")
    return [repo, x, y]
def get_feature_vector(data, label, renew=False, out=None):
    print('Model Data Input=', data)
    default_path = data.replace('.txt', '') + '_feature_vector'
    out = default_path if out is None else default_path + '_' + out
    X_path, y_path = out + '_X.json', out + '_y.json'
    if os.path.exists(X_path) and os.path.exists(y_path) and (not renew):
        print('warning: feature vector already exists!', out)
        X = localfile.get_file(X_path)
        y = localfile.get_file(y_path)
        return X, y

    X, y = [], []
    # group the labeled PR pairs by repository
    p = {}
    with open(data) as f:
        all_pr = f.readlines()
    pr_len = len(all_pr)
    count = 0
    for l in all_pr:
        print(str(count / pr_len) + ' pr:' + l)
        r, n1, n2 = l.strip().split()
        if 'msr_pairs' not in data:
            print('check whether there is too much text in the PR description, such as a template..')
            if check_large(get_pull(r, n1)) or check_large(get_pull(r, n2)):
                continue
        if r not in p:
            p[r] = []
        p[r].append((n1, n2, label))
        count = count + 1
    print('all=', len(all_pr))

    out_file = open(out + '_X_and_Y.txt', 'w+')
    for r in p:
        init_model_with_repo(r)
    for r in p:
        print('Start running on', r)
        # init NLP model
        init_model_with_repo(r)
        print('pairs num=', len(p[r]))
        # sequential run
        cnt = 0
        for z in p[r]:
            x0, y0 = get_sim(r, z[0], z[1]), z[2]
            X.append(x0)
            y.append(y0)
            print(r, z[0], z[1], x0, y0, file=out_file)
            cnt += 1
            if cnt % 100 == 0:
                print('current:', r, cnt)
        # A parallel alternative, kept from the original (commented out):
        # for label in [0, 1]:
        #     pairs = [(r, z[0], z[1]) for z in p[r] if z[2] == label]
        #     with Pool(processes=10) as pool:
        #         result = pool.map(get_sim_wrap, pairs)
        #     X.extend(result)
        #     y.extend([label] * len(result))
    out_file.close()

    # save to local
    localfile.write_to_file(X_path, X)
    localfile.write_to_file(y_path, y)
    return (X, y)
pairs = sorted(pairs, key=lambda x: x.split()[0])
last_repo = None
for pair in pairs:
    pair_s = pair.split()
    r, n1, n2 = pair_s[0], pair_s[1], pair_s[2]
    # re-initialize the per-repo model only when the repository changes
    if r != last_repo:
        clf.init_model_with_repo(r)
        last_repo = r
    status, history, history_ret, history_last, history_commit = simulate(r, n1, n2)
    for i in range(len(history)):
        history[i] = (history[i], max(history_last[i][0], history_last[i][1]))
    if status >= 0:
        with open(out_file, 'a+') as outf:
            print(r, n1, n2, ':', history, file=outf)
        all_ret.append({'repo': r, 'num1': n1, 'num2': n2, 'history': history_commit})
        localfile.write_to_file(out_file + '.all_commit', all_ret)