def add_company_column_for_issues(org, repo):
    """Annotate the issues of ``org/repo`` with each author's employer and save as TSV.

    Fetches the repo's issues and the issue authors with their company
    affiliation, merges the company in as a column, and writes the result
    to ``<org>_<repo>_<issue_file_suffix>_with_employer``.

    :param org: GitHub organization name.
    :param repo: Repository name within the organization.
    """
    users = c.get_issue_authors_with_company(org, repo)
    issues = c.get_issues(org, repo)
    issues_with_company = merge_issues_with_company_column(issues, users)
    # Fix: the "_" separator before "with_employer" was missing, producing
    # file names like "...issueswith_employer"; the sibling exporters in this
    # module (processing/response time) all join the suffix with a leading "_".
    issues_with_company.to_csv(
        org + "_" + repo + "_" + c.issue_file_suffix + "_with_employer",
        sep='\t')
def calculate_issue_processing_time(org, repo):
    """Compute per-issue processing time (created -> closed) and export it.

    Runs the time-difference calculation twice — once based on devstats data
    and once without — and writes one TSV per variant, named
    ``<org>_<repo>_<issue_file_suffix>_with_processing_time_based_on_devstats_<flag>``.

    :param org: GitHub organization name.
    :param repo: Repository name within the organization.
    """
    all_issues = c.get_issues(org, repo)
    for use_devstats in (True, False):
        timed = p.calculate_issue_time_difference(
            org, repo, all_issues, "created_at", "closed_at", use_devstats)
        timed.rename(
            columns={'time_difference': 'processing_time'}, inplace=True)
        out_path = (org + "_" + repo + "_" + c.issue_file_suffix
                    + "_with_processing_time_based_on_devstats_"
                    + str(use_devstats))
        timed.to_csv(out_path, sep='\t')
def print_logistic_regression_for_pr_acceptance_rate(
        org, repo, based_on_devstats_data=False):
    """Fit and print a logistic regression of PR acceptance (merged vs. not).

    Builds the model frame by enriching pull requests with issue
    priority/kind, the author's company, a dummy for the merged state,
    controlling variables, and the independent company variable; then fits
    a logit model and prints the company representation plus the result.

    :param org: GitHub organization name.
    :param repo: Repository name within the organization.
    :param based_on_devstats_data: if True, use devstats-derived user data.
    """
    users = p.get_users(org, repo, based_on_devstats_data)
    issues = c.get_issues(org, repo)
    prs = c.get_pulls(org, repo)
    prs = p.merge_pulls_with_issue_priority_and_kind(prs, issues)
    prs = _merge_pulls_with_company_column(prs, users, based_on_devstats_data)
    prs = p.add_dummy_column_for_pr_merge_state(prs)
    prs = _add_controlling_variables(prs)
    prs = _prepare_independent_company_variable(prs, based_on_devstats_data)
    formula = _ols_formula("pr_is_merged", based_on_devstats_data)
    fitted = sm.logit(formula=formula, data=prs).fit()
    _print_company_representation_in_data(prs)
    _print_and_save_result(fitted)
def calculate_issue_reponse_time(org, repo):
    """Compute per-issue first-response time (created -> first comment) and export it.

    NOTE(review): the "reponse" typo in the function name is preserved on
    purpose — callers reference this public name.

    Uses the precomputed issues-with-comments join if available; otherwise
    rebuilds it from raw issues and the first comment of each issue. Writes
    one TSV per devstats flag, named
    ``<org>_<repo>_<issue_file_suffix>_with_response_time_based_on_devstats_<flag>``.

    :param org: GitHub organization name.
    :param repo: Repository name within the organization.
    """
    with_comments = c.get_issues_with_comments(org, repo)
    if with_comments.empty:
        # No cached join available: build it from raw data.
        raw_issues = c.get_issues(org, repo)
        first_comments = p.extract_first_comment_per_issue(
            c.get_issue_comments(org, repo))
        with_comments = p.merge_issues_with_issue_comments(
            raw_issues, first_comments)
    for use_devstats in (True, False):
        timed = p.calculate_issue_time_difference(
            org, repo, with_comments, "created_at", "commented_at",
            use_devstats)
        timed.rename(
            columns={'time_difference': 'response_time'}, inplace=True)
        timed.to_csv(
            org + "_" + repo + "_" + c.issue_file_suffix
            + "_with_response_time_based_on_devstats_" + str(use_devstats),
            sep='\t')
def calculate_avg_issue_response_time_by_company(org, repo):
    """Aggregate and print the average issue first-response time per company.

    Merges issues with their comments, accumulates per-employer the time
    between issue creation and the (first) comment, then prints and returns
    the per-company totals, counts, and averages.

    NOTE(review): after the merge, pandas suffixes overlapping columns —
    ``_x`` presumably refers to the issue side and ``_y`` to the comment
    side (the inline TODOs below suggest the author also found this
    fragile) — verify against p.merge_issues_with_issue_comments.

    :param org: GitHub organization name.
    :param repo: Repository name within the organization.
    :return: the companies dict, each entry extended with "response_time"
        (timedelta total), "issue_count", and "avg_response_time" (seconds).
    """
    issues = c.get_issues(org, repo)
    issue_comments = c.get_issue_comments(org, repo)
    issues_w_comments = p.merge_issues_with_issue_comments(
        issues, issue_comments)
    companies = c.get_companies(org, repo)
    # Initialize the accumulators for every known company.
    for employer in companies.keys():
        companies[employer]["response_time"] = timedelta(0)
        companies[employer]["issue_count"] = 0
    time_format = "%Y-%m-%d %H:%M:%S"
    for _, issue in issues_w_comments.iterrows():
        employer = p.get_employer(
            issue.user_login_x, org,
            repo)  # TODO make more generic / change merge behavior
        # A float NaN in created_at_y means no comment row was merged in,
        # i.e. the issue has no (first) comment yet — skip it.
        open_issue = type(issue.created_at_y) is float and math.isnan(
            issue.created_at_y)
        if employer is None or open_issue:
            continue
        # NOTE(review): arguments here are (comment created_at, issue
        # created_at), which is the reverse order of the processing-time
        # sibling's (created_at, closed_at) — confirm the expected parameter
        # order of p.determine_processing_time.
        companies[employer]["response_time"] = companies[employer][
            "response_time"] + p.determine_processing_time(
                issue.created_at_y, issue.created_at_x,
                time_format)  # TODO make more generic / change merge behavior
        companies[employer]["issue_count"] += 1
    for employer in companies.keys():
        # Guarded division: companies with zero counted issues get 0.
        companies[employer]["avg_response_time"] = companies[employer][
            "response_time"].total_seconds() / companies[employer][
                "issue_count"] if companies[employer]["issue_count"] else 0
        print(
            str(employer) + " - avg_response_time: " +
            str(companies[employer]["avg_response_time"]))
        print(
            str(employer) + " - issue_count: " +
            str(companies[employer]["issue_count"]))
    return companies
def calculate_avg_issue_processing_time_by_company(org, repo):
    """Aggregate and print the average issue processing time per company.

    Accumulates per-employer the time between issue creation and closing,
    then prints and returns the per-company totals, counts, and averages.
    Note that PRs are included here as they are a special type of an issue.

    :param org: GitHub organization name.
    :param repo: Repository name within the organization.
    :return: the companies dict, each entry extended with "processing_time"
        (timedelta total), "issue_count", and "avg_processing_time" (seconds).
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    companies = c.get_companies(org, repo)
    # Initialize the accumulators for every known company.
    for name in companies:
        companies[name]["processing_time"] = timedelta(0)
        companies[name]["issue_count"] = 0
    issues = c.get_issues(org, repo)
    print("Start iterating over issues...")
    for _, row in issues.iterrows():
        name = p.get_employer(row.user_login, org, repo)
        # A float NaN closed_at means the issue is still open — skip it.
        still_open = type(row.closed_at) is float and math.isnan(row.closed_at)
        if name is None or still_open:
            continue
        print(row.title)
        stats = companies[name]
        stats["processing_time"] = stats["processing_time"] \
            + p.determine_processing_time(
                row.created_at, row.closed_at, time_format)
        stats["issue_count"] += 1
    for name in companies:
        stats = companies[name]
        # Guarded division: companies with zero counted issues get 0.
        stats["avg_processing_time"] = (
            stats["processing_time"].total_seconds() / stats["issue_count"]
            if stats["issue_count"] else 0)
        print(
            str(name) + " - avg_processing_time: " +
            str(stats["avg_processing_time"]))
        print(str(name) + " - issue_count: " + str(stats["issue_count"]))
    return companies
def determine_company_share_of_issues_based_on_devstats_data(org, repo):
    """Count how many issues each company authored, using devstats user data.

    :param org: GitHub organization name.
    :param repo: Repository name within the organization.
    :return: Counter mapping company name to its number of issues.
    """
    repo_issues = c.get_issues(org, repo)
    devstats_users = get_formatted_devstats_user()
    annotated = merge_issues_with_company_column(repo_issues, devstats_users)
    return Counter(annotated.company.values)