def compute_repo_ccp(repo_file):
    """Add hit-rate, CCP-estimate and grouping columns to a repository CSV.

    The file is read with pandas, enriched with the columns listed below,
    written back to the same path (overwriting it) and also returned.

    Columns added or overwritten:

    * ``y<year>_hit_rate`` for 2016-2019: ``y<year>_hits / y<year>_commits``.
    * ``y<year>_ccp`` for 2016-2019:
      ``ccp_estimator.estimate_positives`` applied to the hit rate.
    * ``y2019_hit_rate_rnd``: the 2019 hit rate rounded to 2 decimals.
    * ``quality_group``: ``'Others'`` when the 2019 hit rate is strictly
      above the cut-off described in the body, ``'Top 10'`` otherwise.
    * ``dev_num_group``: ``authors`` binned at its 25% / 75% quantiles.
    * ``start_year`` / ``age``: the first four characters of ``start_time``
      as an int, and ``ANALYZED_YEAR`` minus that year.
    * ``age_group``: ``start_year`` binned around ``GITHUB_START_YEAR``.
    * ``y2019_ccp_in_valid_range``: ``ccp_estimator.is_in_range`` applied to
      the 2019 hit rate.

    :param repo_file: path of the CSV file; it is read and then overwritten.
    :return: the enriched DataFrame.
    :raises IndexError: if no non-fork repository has an in-range 2019 hit
        rate, because the cut-off row cannot be selected.
    """
    rep = pd.read_csv(repo_file)
    years = (2016, 2017, 2018, 2019)

    # All hit-rate columns are created before any CCP column so that the
    # column order of the written CSV is the same as it has always been.
    for year in years:
        rep['y{}_hit_rate'.format(year)] = (
            1.0 * rep['y{}_hits'.format(year)]
            / rep['y{}_commits'.format(year)])
    for year in years:
        rep['y{}_ccp'.format(year)] = rep['y{}_hit_rate'.format(year)].map(
            ccp_estimator.estimate_positives)
    rep['y2019_hit_rate_rnd'] = rep.y2019_hit_rate.map(lambda x: round(x, 2))

    # Evaluated once; used both for the cut-off below and for the
    # 'y2019_ccp_in_valid_range' column assigned at the end.
    in_valid_range = rep.y2019_hit_rate.map(ccp_estimator.is_in_range)

    # extract to repo utils
    # The cut-off is taken from non-fork repositories with an in-range hit
    # rate: sorted from the highest hit rate down, it is the value found
    # 90% of the way through that list.
    candidates = rep[rep.fork.eq(False) & in_valid_range]
    candidates = candidates.sort_values(['y2019_hit_rate'], ascending=False)
    cutoff_position = int(90 * len(candidates) / 100)
    y2019_hit_rate_10p = candidates.iloc[cutoff_position].y2019_hit_rate

    # Vectorized equivalent of a row-by-row comparison.
    # NOTE(review): a NaN hit rate compares False and therefore lands in
    # 'Top 10', as do forks and out-of-range rows at or below the cut-off
    # - confirm this is intended.
    above_cutoff = rep.y2019_hit_rate > y2019_hit_rate_10p
    rep['quality_group'] = above_cutoff.map({True: 'Others',
                                             False: 'Top 10'})

    # pd.cut uses right-closed bins by default, so a repository with
    # 0 authors falls outside every bin and gets NaN.
    rep['dev_num_group'] = pd.cut(
        rep.authors,
        [
            0,
            rep.authors.quantile(0.25),
            rep.authors.quantile(0.75),
            float("inf"),
        ],
        labels=["few", "intermediate", "numerous"])

    rep['start_year'] = rep.start_time.map(lambda x: int(x[:4]))
    rep['age'] = ANALYZED_YEAR - rep.start_year

    # Quantiles are computed only over repositories that started in
    # GITHUB_START_YEAR or later; anything earlier is 'prehistory'.
    github_era_years = rep.start_year[rep.start_year >= GITHUB_START_YEAR]
    rep['age_group'] = pd.cut(
        rep.start_year,
        [
            0,
            GITHUB_START_YEAR - 1,
            github_era_years.quantile(0.25),
            github_era_years.quantile(0.75),
            float("inf"),
        ],
        labels=['prehistory', "old", "medium", "young"])

    rep['y2019_ccp_in_valid_range'] = in_valid_range

    # Updating the file with the CCP values
    rep.to_csv(repo_file, index=False)

    return rep
# Example #2
0
def onboarding_ccp_cochange(repo_file_quality_per_year,
                            repo_file_onboarding_per_year):
    """
        Run the co-change analysis of CCP against developer onboarding.

        Both inputs are loaded through build_repo_per_year_df keyed by
        'repo_name'. Onboarding rows are kept only when comming_developers
        is greater than 9, then the two frames are merged on repository
        name and year (pandas' default join) and handed to cochange_analysis.

        Note that repo_file_quality_per_year holds the bug hit ratio rather
        than ccp; a 'ccp' column is derived from corrective_commits_ratio
        here. For change analysis the distinction doesn't matter.
    :param repo_file_quality_per_year: quality-per-year input, passed
        unchanged to build_repo_per_year_df.
    :param repo_file_onboarding_per_year: onboarding-per-year input, passed
        unchanged to build_repo_per_year_df.
    :return: None
    """
    key = 'repo_name'

    quality_df = build_repo_per_year_df(repo_file_quality_per_year, key=key)
    onboarding_df = build_repo_per_year_df(repo_file_onboarding_per_year,
                                           key=key)

    # Keep only repository-years with more than 9 arriving developers.
    has_enough_newcomers = onboarding_df.comming_developers > 9
    onboarding_df = onboarding_df[has_enough_newcomers]

    per_year_df = quality_df.merge(onboarding_df, on=[key, 'year'])
    per_year_df['ccp'] = per_year_df.corrective_commits_ratio.map(
        ccp_estimator.estimate_positives)

    cochange_analysis(
        per_year_df,
        first_metric='ccp',
        second_metric='comming_involved_developers_ratio',
        first_the_higher_the_better=False,
        second_the_higher_the_better=True,
        first_sig_threshold=0.1,
        second_sig_threshold=0.1,
        key=key,
    )
def churn_ccp_cochange_by_age(repo_file_quality_per_year,
                              repo_file_churn_per_year):
    """
        Run the co-change analysis of CCP against developer retention,
        separately for each age group.

        Both inputs are loaded through build_repo_per_year_df keyed by
        'repo_name' with 'age_group' as a control variable. Churn rows are
        kept only when base_year_developers is greater than 9, then the two
        frames are merged on repository name, year and age group (pandas'
        default join) and handed to cochange_analysis_by_value for the
        'old', 'medium' and 'young' groups.

        Note that repo_file_quality_per_year holds the bug hit ratio rather
        than ccp; a 'ccp' column is derived from corrective_commits_ratio
        here. For change analysis the distinction doesn't matter.
    :param repo_file_quality_per_year: quality-per-year input, passed
        unchanged to build_repo_per_year_df.
    :param repo_file_churn_per_year: churn-per-year input, passed unchanged
        to build_repo_per_year_df.
    :return: None
    """
    key = 'repo_name'
    fixed_variable = 'age_group'
    control_variables = [fixed_variable]

    quality_df = build_repo_per_year_df(repo_file_quality_per_year,
                                        key=key,
                                        control_variables=control_variables)
    churn_df = build_repo_per_year_df(repo_file_churn_per_year,
                                      key=key,
                                      control_variables=control_variables)

    # Keep only repository-years that had more than 9 base-year developers.
    has_enough_developers = churn_df.base_year_developers > 9
    churn_df = churn_df[has_enough_developers]

    per_year_df = quality_df.merge(churn_df,
                                   on=[key, 'year', *control_variables])
    per_year_df['ccp'] = per_year_df.corrective_commits_ratio.map(
        ccp_estimator.estimate_positives)

    cochange_analysis_by_value(
        per_year_df,
        first_metric='ccp',
        second_metric='continuing_developers_ratio',
        first_the_higher_the_better=False,
        second_the_higher_the_better=True,
        first_sig_threshold=0.1,
        second_sig_threshold=0.1,
        fixed_variable=fixed_variable,
        fixed_values=['old', 'medium', 'young'],
        key=key,
        control_variables=control_variables,
    )
# Example #4
0
def plot_duration_by_length():
    """Plot CCP by line-count deciles and print a correlation figure.

    Loads the dataset via get_dataset(), adds a 'CCP' column holding
    ccp_estimator.estimate_positives(corrective_rate) rounded to 2 decimals,
    shows the figure returned by plot_deciles (grouped by 'line_count') and
    prints the correlation between 'line_count' and 'corrective_rate'.

    NOTE(review): despite the function name, nothing duration-related is
    computed here; the plot is CCP by line count.

    :return: None
    """
    df = get_dataset()
    df['CCP'] = df['corrective_rate'].map(
        lambda x: round(ccp_estimator.estimate_positives(x), 2))

    fig = plot_deciles(df,
                       grouping_column='line_count',
                       metric_column='CCP',
                       title='CCP by Line Count Deciles',
                       xaxis_title='Number of Lines',
                       output_file=None)
    fig.show()

    # Correlate only the two columns of interest. Building the full
    # DataFrame.corr() matrix is wasteful and, on pandas >= 2.0, raises if
    # the dataset has any non-numeric column.
    # NOTE: "Perason" is a typo for "Pearson"; the label is left unchanged
    # so the printed output stays the same.
    print("Perason corrective rate and line count",
          df['line_count'].corr(df['corrective_rate']))
# Example #5
0
def onboarding_ccp_cochange_by_lang(repo_file_quality_per_year,
                                    repo_file_onboarding_per_year):
    """
        Run the co-change analysis of CCP against developer onboarding,
        separately for each language in lang_name.

        Both inputs are loaded through build_repo_per_year_df keyed by
        'repo_name' with 'language' as a control variable. Onboarding rows
        are kept only when comming_developers is greater than 9 and the
        language is one of lang_name, then the two frames are merged on
        repository name, year and language (pandas' default join) and handed
        to cochange_analysis_by_value.

        Note that repo_file_quality_per_year holds the bug hit ratio rather
        than ccp; a 'ccp' column is derived from corrective_commits_ratio
        here. For change analysis the distinction doesn't matter.
    :param repo_file_quality_per_year: quality-per-year input, passed
        unchanged to build_repo_per_year_df.
    :param repo_file_onboarding_per_year: onboarding-per-year input, passed
        unchanged to build_repo_per_year_df.
    :return: None
    """
    key = 'repo_name'
    fixed_variable = 'language'
    control_variables = [fixed_variable]

    quality_df = build_repo_per_year_df(repo_file_quality_per_year,
                                        key=key,
                                        control_variables=control_variables)
    onboarding_df = build_repo_per_year_df(
        repo_file_onboarding_per_year,
        key=key,
        control_variables=control_variables)

    # Keep repository-years with more than 9 arriving developers whose
    # language is one of the analysed languages.
    has_enough_newcomers = onboarding_df.comming_developers > 9
    is_analysed_language = onboarding_df.language.isin(lang_name)
    onboarding_df = onboarding_df[has_enough_newcomers & is_analysed_language]

    per_year_df = quality_df.merge(onboarding_df,
                                   on=[key, 'year', *control_variables])
    per_year_df['ccp'] = per_year_df.corrective_commits_ratio.map(
        ccp_estimator.estimate_positives)

    cochange_analysis_by_value(
        per_year_df,
        first_metric='ccp',
        second_metric='comming_involved_developers_ratio',
        first_the_higher_the_better=False,
        second_the_higher_the_better=True,
        first_sig_threshold=0.1,
        second_sig_threshold=0.1,
        fixed_variable=fixed_variable,
        fixed_values=lang_name,
        key=key,
        control_variables=control_variables,
    )
# Example #6
0
def generete_computed_values(df: pd.DataFrame):
    r"""Print LaTeX ``\newcommand`` definitions for the computed values.

    The output starts with a blank line and a ``% Computed values`` comment
    and ends with a blank line. Each value is printed as
    ``\newcommand \<name> {<value> }``:

    * ``\filesNum``: number of rows in ``df``.
    * ``\reposNum``: number of distinct ``repo_name`` values.
    * ``\meanCCP``: ``get_average_ccp()`` rounded to 2 decimals.
    * ``\hotspotCCPThreshold``: ``ccp_estimator.estimate_positives`` of
      ``get_hotspots_corrective_threshold()``, with 2 decimals.
    * ``\reducedRiskRatio``: share of rows whose ``quality_group`` equals
      ``'reduced_risk'``, rounded to 2 decimals.
    * ``\shortLengthThreshold`` / ``\longLengthThreshold``: the values of
      ``get_short_length_threshold()`` / ``get_long_length_threshold()``
      with no decimals.
    * ``\robustSmellNum``: ``len(get_robust_smells())``.

    :param df: frame with at least ``repo_name`` and ``quality_group``
        columns.
    :return: None
    """

    def emit(macro, value_text):
        # Each definition is printed as soon as its value is known, so
        # earlier lines are already out if a later computation fails.
        print(r"\newcommand " + macro + " {" + value_text + " }")

    print()
    print("% Computed values")

    total_rows = len(df)
    emit(r"\filesNum", format(total_rows, ",d"))
    emit(r"\reposNum", format(df.repo_name.nunique(), ",d"))

    emit(r"\meanCCP", str(round(get_average_ccp(), 2)))
    hotspot_ccp = ccp_estimator.estimate_positives(
        get_hotspots_corrective_threshold())
    emit(r"\hotspotCCPThreshold", format(hotspot_ccp, ",.2f"))

    reduced_risk_rows = len(df[df.quality_group == 'reduced_risk'])
    emit(r"\reducedRiskRatio", str(round(reduced_risk_rows / len(df), 2)))

    emit(r"\shortLengthThreshold",
         format(get_short_length_threshold(), ",.0f"))
    emit(r"\longLengthThreshold",
         format(get_long_length_threshold(), ",.0f"))

    emit(r"\robustSmellNum", str(len(get_robust_smells())))
    print()