def compute_repo_ccp(repo_file):
    rep = pd.read_csv(repo_file)

    # Per-year corrective-commit hit rates (corrective hits / commits).
    rep['y2016_hit_rate'] = 1.0 * rep.y2016_hits / rep.y2016_commits
    rep['y2017_hit_rate'] = 1.0 * rep.y2017_hits / rep.y2017_commits
    rep['y2018_hit_rate'] = 1.0 * rep.y2018_hits / rep.y2018_commits
    rep['y2019_hit_rate'] = 1.0 * rep.y2019_hits / rep.y2019_commits

    # Per-year CCP estimates derived from the hit rates.
    rep['y2016_ccp'] = rep.y2016_hit_rate.map(ccp_estimator.estimate_positives)
    rep['y2017_ccp'] = rep.y2017_hit_rate.map(ccp_estimator.estimate_positives)
    rep['y2018_ccp'] = rep.y2018_hit_rate.map(ccp_estimator.estimate_positives)
    rep['y2019_ccp'] = rep.y2019_hit_rate.map(ccp_estimator.estimate_positives)

    rep['y2019_hit_rate_rnd'] = rep.y2019_hit_rate.map(lambda x: round(x, 2))

    # TODO: extract to repo utils
    # Find the hit-rate threshold of the top 10% (lowest hit rate) among
    # non-fork repositories whose hit rate is in the estimator's valid range.
    trep = rep[rep.fork == False]
    trep = trep[trep.y2019_hit_rate.map(ccp_estimator.is_in_range)]
    trep = trep.sort_values(['y2019_hit_rate'], ascending=False)
    y2019_hit_rate_10p = trep.iloc[int(90 * len(trep) / 100)].y2019_hit_rate

    rep['quality_group'] = rep.apply(
        lambda x: 'Others' if x.y2019_hit_rate > y2019_hit_rate_10p else 'Top 10',
        axis=1)

    # Group repositories by number of authors, using the quartiles as cut points.
    rep['dev_num_group'] = pd.cut(
        rep.authors,
        [0, rep.authors.quantile(0.25), rep.authors.quantile(0.75), float("inf")],
        labels=["few", "intermediate", "numerous"])

    rep['start_year'] = rep.start_time.map(lambda x: int(x[:4]))
    rep['age'] = ANALYZED_YEAR - rep.start_year
    # Group repositories by start year; projects started before GitHub are 'prehistory'.
    rep['age_group'] = pd.cut(
        rep.start_year,
        [0,
         GITHUB_START_YEAR - 1,
         rep[rep.start_year >= GITHUB_START_YEAR].start_year.quantile(0.25),
         rep[rep.start_year >= GITHUB_START_YEAR].start_year.quantile(0.75),
         float("inf")],
        labels=['prehistory', "old", "medium", "young"])

    rep['y2019_ccp_in_valid_range'] = rep.y2019_hit_rate.map(
        ccp_estimator.is_in_range)

    # Updating the file with the CCP values.
    rep.to_csv(repo_file, index=False)

    return rep
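
# A minimal usage sketch (the CSV path and the follow-up aggregation are
# hypothetical; the file must contain the per-year hits/commits columns and the
# fork, authors, and start_time fields used above):
#
#     rep = compute_repo_ccp('data/repo_properties.csv')  # hypothetical path
#     print(rep.groupby('quality_group').y2019_ccp.median())
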
def onboarding_ccp_cochange(repo_file_quality_per_year,
                            repo_file_onboarding_per_year):
    """
    Co-change analysis of CCP and developer onboarding.

    Note that repo_file_quality_per_year uses the bug hit ratio and not CCP.
    For the change analysis it doesn't matter.
    :param repo_file_quality_per_year: per-year quality metrics file
    :param repo_file_onboarding_per_year: per-year onboarding metrics file
    :return:
    """
    key = 'repo_name'
    repo_file_quality_per_year_df = build_repo_per_year_df(
        repo_file_quality_per_year, key=key)
    repo_file_onboarding_per_year_df = build_repo_per_year_df(
        repo_file_onboarding_per_year, key=key)

    # Restrict to repositories with at least 10 incoming developers.
    repo_file_onboarding_per_year_df = repo_file_onboarding_per_year_df[
        repo_file_onboarding_per_year_df.comming_developers > 9]

    per_year_df = pd.merge(repo_file_quality_per_year_df,
                           repo_file_onboarding_per_year_df,
                           on=[key, 'year'])
    per_year_df['ccp'] = per_year_df.corrective_commits_ratio.map(
        ccp_estimator.estimate_positives)

    cochange_analysis(per_year_df,
                      first_metric='ccp',
                      second_metric='comming_involved_developers_ratio',
                      first_the_higher_the_better=False,
                      second_the_higher_the_better=True,
                      first_sig_threshold=0.1,
                      second_sig_threshold=0.1,
                      key=key)
def churn_ccp_cochange_by_age(repo_file_quality_per_year,
                              repo_file_churn_per_year):
    """
    Co-change analysis of CCP and the continuing-developers ratio, controlled
    by repository age group.

    Note that repo_file_quality_per_year uses the bug hit ratio and not CCP.
    For the change analysis it doesn't matter.
    :param repo_file_quality_per_year: per-year quality metrics file
    :param repo_file_churn_per_year: per-year churn metrics file
    :return:
    """
    key = 'repo_name'
    control_variables = ['age_group']
    repo_file_quality_per_year_df = build_repo_per_year_df(
        repo_file_quality_per_year, key=key,
        control_variables=control_variables)
    repo_file_churn_per_year_df = build_repo_per_year_df(
        repo_file_churn_per_year, key=key,
        control_variables=control_variables)

    # Restrict to repositories with at least 10 developers in the base year.
    repo_file_churn_per_year_df = repo_file_churn_per_year_df[
        repo_file_churn_per_year_df.base_year_developers > 9]

    per_year_df = pd.merge(repo_file_quality_per_year_df,
                           repo_file_churn_per_year_df,
                           on=[key, 'year'] + control_variables)
    per_year_df['ccp'] = per_year_df.corrective_commits_ratio.map(
        ccp_estimator.estimate_positives)

    cochange_analysis_by_value(per_year_df,
                               first_metric='ccp',
                               second_metric='continuing_developers_ratio',
                               first_the_higher_the_better=False,
                               second_the_higher_the_better=True,
                               first_sig_threshold=0.1,
                               second_sig_threshold=0.1,
                               fixed_variable='age_group',
                               fixed_values=['old', 'medium', 'young'],
                               key=key,
                               control_variables=control_variables)
def plot_duration_by_length():
    df = get_dataset()
    df['CCP'] = df['corrective_rate'].map(
        lambda x: round(ccp_estimator.estimate_positives(x), 2))

    fig = plot_deciles(df,
                       grouping_column='line_count',
                       metric_column='CCP',
                       title='CCP by Line Count Deciles',
                       xaxis_title='Number of Lines',
                       output_file=None)
    fig.show()

    print("Pearson correlation of corrective rate and line count",
          df.corr()['line_count']['corrective_rate'])
def onboarding_ccp_cochange_by_lang(repo_file_quality_per_year,
                                    repo_file_onboarding_per_year):
    """
    Co-change analysis of CCP and developer onboarding, controlled by language.

    Note that repo_file_quality_per_year uses the bug hit ratio and not CCP.
    For the change analysis it doesn't matter.
    :param repo_file_quality_per_year: per-year quality metrics file
    :param repo_file_onboarding_per_year: per-year onboarding metrics file
    :return:
    """
    key = 'repo_name'
    fixed_variable = 'language'
    control_variables = [fixed_variable]
    repo_file_quality_per_year_df = build_repo_per_year_df(
        repo_file_quality_per_year, key=key,
        control_variables=control_variables)
    repo_file_onboarding_per_year_df = build_repo_per_year_df(
        repo_file_onboarding_per_year, key=key,
        control_variables=control_variables)

    # Restrict to repositories with at least 10 incoming developers
    # and one of the analyzed languages.
    repo_file_onboarding_per_year_df = repo_file_onboarding_per_year_df[
        (repo_file_onboarding_per_year_df.comming_developers > 9)
        & (repo_file_onboarding_per_year_df.language.isin(lang_name))]

    per_year_df = pd.merge(repo_file_quality_per_year_df,
                           repo_file_onboarding_per_year_df,
                           on=[key, 'year'] + control_variables)
    per_year_df['ccp'] = per_year_df.corrective_commits_ratio.map(
        ccp_estimator.estimate_positives)

    cochange_analysis_by_value(
        per_year_df,
        first_metric='ccp',
        second_metric='comming_involved_developers_ratio',
        first_the_higher_the_better=False,
        second_the_higher_the_better=True,
        first_sig_threshold=0.1,
        second_sig_threshold=0.1,
        fixed_variable=fixed_variable,
        fixed_values=lang_name,
        key=key,
        control_variables=control_variables)
def generete_computed_values(df: pd.DataFrame):
    print()
    print("% Computed values")
    print(r"\newcommand \filesNum {" + "{:,d}".format(len(df)) + " }")
    print(r"\newcommand \reposNum {"
          + "{:,d}".format(df.repo_name.nunique()) + " }")
    print(r"\newcommand \meanCCP {" + str(round(get_average_ccp(), 2)) + " }")
    print(r"\newcommand \hotspotCCPThreshold {"
          + "{:,.2f}".format(ccp_estimator.estimate_positives(
              get_hotspots_corrective_threshold())) + " }")
    print(r"\newcommand \reducedRiskRatio {"
          + str(round(len(df[df.quality_group == 'reduced_risk']) / len(df), 2))
          + " }")
    print(r"\newcommand \shortLengthThreshold {"
          + "{:,.0f}".format(get_short_length_threshold()) + " }")
    print(r"\newcommand \longLengthThreshold {"
          + "{:,.0f}".format(get_long_length_threshold()) + " }")
    print(r"\newcommand \robustSmellNum {"
          + str(len(get_robust_smells())) + " }")
    print()
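
# Each print above emits a LaTeX \newcommand definition intended for the paper
# source, e.g. (illustrative placeholders, not computed results):
#
#   % Computed values
#   \newcommand \filesNum {<number of files> }
#   \newcommand \meanCCP {<mean CCP> }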