import sys

from docopt import docopt


def run():
    if not git_is_available():
        msg = "contribcompl requires git to be installed and accessible on the PATH"
        print(msg)
        sys.exit(1)
    arguments = docopt(__doc__, version=__version__)
    if arguments["-v"] or arguments["--verbose"]:
        toggle_verbose_output()
    path_to_repo = arguments["<repository>"]
    if not (is_git_url(path_to_repo) or is_git_dir(path_to_repo)):
        print(__doc__)
        sys.exit(1)
    if is_git_url(path_to_repo):
        path_to_repo = clone_to_tmp(path_to_repo)
    if arguments["commits"]:
        commit_shas = arguments["<commit_sha>"]
    elif arguments["issue"]:
        issue_re = arguments["<issue_regex>"]
        commit_shas = find_commits_for_issue(path_to_repo, issue_re)
    contribcompl = compute_contrib_compl(path_to_repo, commit_shas)
    print(contribcompl)
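# A minimal sketch of the docopt usage string that run() parses as __doc__.
# The commands, arguments, and options follow the keys accessed above
# ("commits", "issue", "<repository>", "<commit_sha>", "<issue_regex>",
# "-v", "--verbose"); the exact wording of the original usage text is an
# assumption.
_USAGE_SKETCH = """contribcompl

Usage:
  contribcompl commits <repository> <commit_sha>... [-v | --verbose]
  contribcompl issue <repository> <issue_regex> [-v | --verbose]
  contribcompl --version

Options:
  -v             Enable verbose output (short form).
  --verbose      Enable verbose output (long form).
  --version      Show version.
"""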
def test_compute_contrib_compl_low():
    # Assumes a clone of Apache Cassandra at /tmp/cassandra/
    path_to_repo = "/tmp/cassandra/"
    commit_shas = [
        "021df085074b761f2b3539355ecfc4c237a54a76",
        "2f1d6c7254342af98c2919bd74d37b9944c41a6b",
    ]
    result = compute_contrib_compl(path_to_repo, commit_shas)
    assert result == ContributionComplexity.LOW
import sys

import numpy as np
import pandas as pd


def main(sys_name):
    df = pd.read_csv(f"data/{sys_name}_issues.csv")
    if sys_name == "cassandra":
        closed_col = "resolved"
        issue_key_col = "key"
    elif sys_name == "gaffer":
        closed_col = "closed_at"
        issue_key_col = "number"
    # Keep only resolved issues: 2300 closed issues for Gaffer and
    # 14158 closed issues for Cassandra
    df = df[~df[closed_col].isnull()]
    df.reset_index(drop=True, inplace=True)
    path_to_repo = f"/tmp/{sys_name}"
    contribcompls = []
    commit_shas_per_contrib = []
    for issue_key in df[issue_key_col].values:
        # Match the issue key at a word boundary in commit messages
        if sys_name == "cassandra":
            issue_re = f"{issue_key}( |$)"
        elif sys_name == "gaffer":
            issue_re = f"(Gh |gh-){issue_key}( |$)"
        commit_shas = find_commits_for_issue(path_to_repo, issue_re)
        commit_shas_per_contrib.append(commit_shas)
        print(issue_re, commit_shas, flush=True)
        if commit_shas:
            try:
                contribcompl = compute_contrib_compl(path_to_repo, commit_shas)
                # Store the numeric enum value in the CSV
                contribcompl = contribcompl.value
            except Exception:
                print(
                    f"Skipping {issue_key}",
                    issue_re,
                    commit_shas,
                    type(commit_shas),
                    flush=True,
                )
                contribcompl = None
        else:
            contribcompl = None
        contribcompls.append(contribcompl)
    df["commit_shas"] = commit_shas_per_contrib
    df["contrib_complexity"] = contribcompls
    # Prevent sha lists from being truncated in the CSV export
    np.set_printoptions(threshold=sys.maxsize)
    df.to_csv(f"data/{sys_name}_contrib_compl.csv", index=False)
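# A sketch of how main() is presumably invoked; the command-line handling
# below is an assumption, not part of the original script.
if __name__ == "__main__":
    # e.g. `python script.py cassandra` or `python script.py gaffer`
    main(sys.argv[1])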
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm


def get_complete_issue_df(sys_name):
    procfname = f"data/processing/{sys_name[:3]}_issues.csv"
    if os.path.isfile(procfname):
        date_cols = ["created", "resolved", "updated"]
        df_iss = pd.read_csv(procfname, parse_dates=date_cols)
        # Parse the stringified sha lists back into Python lists
        df_iss["commit_shas"] = [
            eval(h.replace("\n", ",")) if isinstance(h, str) else ""
            for h in df_iss.commit_shas
        ]
        df_iss["t_lead"] = pd.to_timedelta(df_iss.t_lead)
        return df_iss

    # For first-time preprocessing
    fname_issues = f"data/input/{sys_name[:3]}_issues.csv"
    if sys_name == "cassandra":
        date_cols = ["created", "resolved", "updated"]
    elif sys_name == "gaffer":
        date_cols = ["created_at", "closed_at", "updated_at"]
    df_iss = pd.read_csv(fname_issues, parse_dates=date_cols)
    if sys_name == "cassandra":
        # Adjust priority values to the same format as for Gaffer
        df_iss["priority"] = df_iss.priority.str.lower()
        df_iss.rename(columns={"description": "body"}, inplace=True)
        df_iss["status"] = df_iss.status.str.lower()
        # Rename resolved to closed as for Gaffer
        df_iss.loc[df_iss.status == "resolved", "status"] = "closed"
        burl = "https://issues.apache.org/jira/browse/"
        df_iss["url"] = df_iss.key.apply(lambda k: burl + k)
    if sys_name == "gaffer":
        # Rename the columns to allow for uniform code in the following
        df_iss.rename(columns={"created_at": "created"}, inplace=True)
        df_iss.rename(columns={"closed_at": "resolved"}, inplace=True)
        df_iss.rename(columns={"updated_at": "updated"}, inplace=True)
        df_iss.rename(columns={"number": "key"}, inplace=True)
        df_iss.rename(columns={"state": "status"}, inplace=True)
        # Clean the data
        df_iss["labels"] = df_iss.labels.apply(labels_to_list)
        df_iss["issue_type"] = df_iss.labels.apply(remove_priority)
        df_iss["priority"] = df_iss.labels.apply(extract_priority)
        # The resolution field does not exist in the GitHub issue tracker
        df_iss["resolution"] = df_iss.issue_type.apply(lambda _: "")
    # Compute lead time
    df_iss["t_lead"] = df_iss.resolved - df_iss.created
    df_iss["t_lead_s"] = df_iss.t_lead.dt.total_seconds()
    # Find related commits and compute contribution complexities.
    # This takes multiple hours.
    path_to_repo = f"{os.getenv('HOME')}/case_systems/{sys_name}"
    contribcompls = []
    commit_shas_per_contrib = []
    for issue_key in tqdm(df_iss["key"].values):
        if sys_name == "cassandra":
            issue_re = f"{issue_key}( |$)"
        elif sys_name == "gaffer":
            issue_re = f"(Gh |gh-){issue_key}( |$)"
        commit_shas = find_commits_for_issue(path_to_repo, issue_re)
        commit_shas_per_contrib.append(commit_shas)
        if commit_shas:
            try:
                contribcompl = compute_contrib_compl(path_to_repo, commit_shas)
                # Store the numeric enum value in the CSV
                contribcompl = contribcompl.value
            except Exception:
                print(
                    f"Skipping {issue_key}",
                    issue_re,
                    commit_shas,
                    type(commit_shas),
                    flush=True,
                )
                contribcompl = None
        else:
            contribcompl = None
        contribcompls.append(contribcompl)
    df_iss["commit_shas"] = commit_shas_per_contrib
    df_iss["contrib_complexity"] = contribcompls
    cols = [
        "key",
        "title",
        "body",
        "created",
        "updated",
        "resolved",
        "status",
        "issue_type",
        "labels",
        "priority",
        "resolution",
        "t_lead",
        "t_lead_s",
        "url",
        "commit_shas",
        "contrib_complexity",
    ]
    # Prevent sha lists from being truncated in the CSV export
    np.set_printoptions(threshold=sys.maxsize)
    df_iss[cols].to_csv(procfname, index=False)
    return df_iss[cols]
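# get_complete_issue_df() relies on three label helpers defined elsewhere in
# the repository. The following is a hedged sketch of what they might look
# like, assuming the Gaffer export stores labels as a stringified list and
# marks priorities with labels such as "p1".."p4"; both assumptions are
# illustrative, not taken from the original code.
from ast import literal_eval


def labels_to_list(labels):
    # Parse the stringified label list from the CSV export (assumed format)
    if isinstance(labels, str) and labels.strip():
        return literal_eval(labels)
    return []


def _is_priority(label):
    # Assumed convention: priority labels look like "p1", "p2", ...
    return label.startswith("p") and label[1:].isdigit()


def extract_priority(labels):
    prios = [l for l in labels if _is_priority(l)]
    return prios[0] if prios else ""


def remove_priority(labels):
    return [l for l in labels if not _is_priority(l)]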
def test_compute_contrib_compl_high():
    # Assumes a clone of Apache Cassandra at /tmp/cassandra/
    path_to_repo = "/tmp/cassandra/"
    commit_shas = ["a991b64811f4d6adb6c7b31c0df52288eb06cf19"]
    result = compute_contrib_compl(path_to_repo, commit_shas)
    assert result == ContributionComplexity.HIGH
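# Both tests compare against members of ContributionComplexity, which is
# defined in the contribcompl package. A minimal sketch consistent with the
# usage above (enum members with a numeric .value); member names and values
# beyond LOW and HIGH are assumptions.
from enum import Enum


class ContributionComplexity(Enum):
    LOW = 1
    MODERATE = 2  # assumed intermediate level
    HIGH = 3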