Example #1
def speed_ccp_cochange_by_var(commits_per_user_file, fixed_variable,
                              fixed_values):
    """
        Note that repo_file_quality_per_year uses bug hit ratio and not ccp.
        For change analysis it doesn't matter.
    :param repo_file_quality_per_year:
    :return:
    """
    key = 'repo_name'
    control_variables = [fixed_variable]

    trep = get_valid_repos()
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[
        users_per_project.year > EARLIEST_ANALYZED_YEAR]
    per_year_df = pd.merge(users_per_project, trep, on='repo_name')

    cochange_analysis_by_value(per_year_df,
                               first_metric='corrective_commits_ratio',
                               second_metric='commits_per_above11_users',
                               first_the_higher_the_better=False,
                               second_the_higher_the_better=True,
                               first_sig_threshold=0.1,
                               second_sig_threshold=10,
                               fixed_variable=fixed_variable,
                               fixed_values=fixed_values,
                               key=key,
                               control_variables=control_variables)
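
A minimal usage sketch; the file name, control variable, and its values below are illustrative placeholders, not from the source:

# Hypothetical invocation of the function above.
speed_ccp_cochange_by_var('commits_per_user_per_year.csv',
                          fixed_variable='age_group',
                          fixed_values=['young', 'old'])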
Example #2
def speed_ccp_cochange(commits_per_user_file):
    """
        Note that repo_file_quality_per_year uses bug hit ratio and not ccp.
        For change analysis it doesn't matter.
    :param repo_file_quality_per_year:
    :return:
    """
    key = 'repo_name'

    trep = get_valid_repos()
    trep = trep[['repo_name']]
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year > 2014]
    per_year_df = pd.merge(users_per_project, trep, on='repo_name')

    per_year_df = per_year_df[[
        'repo_name', 'year', 'corrective_commits_ratio',
        'commits_per_above11_users'
    ]]
    per_year_df = per_year_df.dropna()

    cochange_analysis(per_year_df,
                      first_metric='corrective_commits_ratio',
                      second_metric='commits_per_above11_users',
                      first_the_higher_the_better=False,
                      second_the_higher_the_better=True,
                      first_sig_threshold=0.1,
                      second_sig_threshold=10,
                      key=key)
Example #3
def build_repo_per_year_df(cochange_file, key, control_variables=None):
    # Avoid a mutable default argument.
    if control_variables is None:
        control_variables = []
    trep = get_valid_repos()
    trep = trep[[key] + control_variables]
    cochange_df = pd.read_csv(cochange_file)
    cochange_df = cochange_df[cochange_df.year > EARLIEST_ANALYZED_YEAR]
    df = pd.merge(cochange_df, trep, on=key)

    return df
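
A minimal usage sketch, assuming a co-change CSV with `repo_name` and `year` columns; the file name and control column are illustrative:

# Hypothetical invocation of the function above.
per_year_df = build_repo_per_year_df('repo_coupling_per_year.csv',
                                     key='repo_name',
                                     control_variables=['age_group'])
print(per_year_df.head())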
Example #4
def build_porting_pairs():
    df = get_valid_repos()
    df['user'] = df.repo_name.map(lambda x: x.split('/')[0])
    df['project'] = df.repo_name.map(lambda x: x.split('/')[1])

    lang = lang_name + [i.lower() for i in lang_name]
    lang = [re.escape(i) for i in lang]
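    # Keep projects whose name mentions a language (matched in original or
    # lowercase form), e.g. "foo-java" and "foo-Python" by the same user.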
    lang_in_name = df[df.project.str.contains('|'.join(lang))]
    lang_in_name_by_user = lang_in_name.groupby(['user'], as_index=False).agg(
        {'repo_name': 'count'})
    p = lang_in_name[lang_in_name.user.isin(
        lang_in_name_by_user[lang_in_name_by_user.repo_name > 1].user.tolist(
        ))].sort_values('user')[['repo_name', 'user', 'project', 'language']]
    p.to_csv(os.path.join(DATA_PATH, 'porting_pairs.csv'), index=False)
Example #5
def plot_ccp_pdf():
    df = get_valid_repos()

    plot_cdf_by_column(df,
                       column_name='y2019_ccp',
                       title='CDF of CCP',
                       output_file='c:/tmp/ccp_by_dev_num_group_cdf.png',
                       subsets_column='dev_num_group')

    plot_cdf_by_column(df,
                       column_name='y2019_ccp',
                       title='CDF of CCP',
                       output_file='c:/tmp/ccp_by_age_group_cdf.png',
                       subsets_column='age_group')
Example #6
def coupling_analysis(coupling_file):
    trep = get_valid_repos()

    coupling_size = pd.read_csv(coupling_file)
    coupling_size = coupling_size[coupling_size.year == ANALYZED_YEAR]

    treps = pd.merge(trep, coupling_size, on='repo_name')
    print(treps.avg_capped_files.describe())

    coupling_25_q = treps.avg_capped_files.quantile(0.25)
    print("coupling 25 quantile", coupling_25_q)
    coupling_75_q = treps.avg_capped_files.quantile(0.75)
    print("coupling 75 quantile", coupling_75_q)
 
    treps['coupling_group'] = treps.apply(
        lambda x: 'Lower 25' if x.avg_capped_files < coupling_25_q
        else 'top 25' if x.avg_capped_files > coupling_75_q else 'Middle',
        axis=1)

    print('top 10 prob', 1.0 * len(treps[treps.quality_group == 'Top 10']) / len(treps))
    top_10_in_l25 = 1.0 * len(treps[(treps.quality_group == 'Top 10')
                                    & (treps.coupling_group == 'Lower 25')]) / len(
        treps[treps.coupling_group == 'Lower 25'])
    print('top 10 prob in lower 25', top_10_in_l25)
    top_10_in_t25 = 1.0 * len(treps[(treps.quality_group == 'Top 10')
                                    & (treps.coupling_group == 'top 25')]) / len(
        treps[treps.coupling_group == 'top 25'])
    print('top 10 prob in top 25', top_10_in_t25)
    print("short files lift ", top_10_in_l25 / top_10_in_t25 - 1)

    print("CCP in 4 top deciles",
          round(treps[treps.avg_capped_files > 8.156].y2019_ccp.mean(), 2))
    group_by_size = treps.groupby(['coupling_group'], as_index=False).agg({'y2019_ccp': 'mean'})
    print(group_by_size)

    size_df = run_file_size_analysis()
    joint = pd.merge(treps, size_df, on='repo_name')

    both_l25 = len(joint[(joint.coupling_group == 'Lower 25') & (joint.size_group == 'Lower 25')])
    top_10_in_both_l25 = 1.0 * len(joint[(joint.quality_group_x == 'Top 10')
                                         & (joint.coupling_group == 'Lower 25')
                                         & (joint.size_group == 'Lower 25')
                                         ]) / both_l25

    print('top 10 prob in lower 25 in coupling and size', top_10_in_both_l25)

    print('both lower 25', both_l25, "ratio", both_l25 / len(joint))
    print("both lower 25 CCP",
          joint[(joint.coupling_group == 'Lower 25') & (joint.size_group == 'Lower 25')].y2019_ccp_x.mean())

    return treps
Example #7
def plot_dev_num():
    df = get_valid_repos()

    cutting_points = [0, 1] \
                     + [int(df.authors.quantile(i*0.1)) for i in range(1,10)] \
                     + [int(df.authors.quantile(0.99))]\
                     + [float("inf")]
    df['dev_num_sets'] = pd.cut(df.authors, cutting_points)
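    # The cuts above group projects into: single author, author-count deciles,
    # and the top 1%.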
    df = df.sort_values('dev_num_sets')
    plot_deciles(df,
                 grouping_column='dev_num_sets',
                 metric_column='y2019_ccp',
                 title="Number of Developers vs. CCP",
                 xaxis_title="Developers (single, deciles and 99%)",
                 output_file=os.path.join(FIGURES_PATH,
                                          'ccp_by_dev_num_boxplot.png'))
Example #8
def Linus_rule():
    df = get_valid_repos()
    df = check_name_redundency(df)

    selected_users = [
        'google', 'facebook', 'apache', 'angular', 'kubernetes', 'tensorflow'
    ]
    many_stars_threshold = df.stargazers_count.quantile(0.95)
    print("many_stars_threshold 95%", many_stars_threshold)
    df['many_stars'] = df.stargazers_count.map(
        lambda x: x > many_stars_threshold)

    for i in selected_users:
        print(i)
        g = df[df.user == i].groupby(['many_stars'], as_index=False).agg({
            'y2019_ccp': 'mean',
            'repo_name': 'count'
        })
        print(g)
        print(
            "Many stars lift",
            round(
                g[g.many_stars].iloc[0].y2019_ccp /
                g[~g.many_stars].iloc[0].y2019_ccp - 1.0, 2))

    df['selected_users_project'] = df.user.map(lambda x: x in selected_users)
    g = df.groupby(['selected_users_project'], as_index=False).agg({
        'y2019_ccp': 'mean',
        'repo_name': 'count',
        'age': 'mean',
        'authors': 'mean',
        'stargazers_count': 'mean'
    })
    print(g)
    for i in ['y2019_ccp', 'age', 'authors', 'stargazers_count']:
        print(
            str(i) + " users", g[g.selected_users_project][i].iloc[0],
            "others", g[~g.selected_users_project][i].iloc[0], "lift",
            g[g.selected_users_project][i].iloc[0] /
            g[~g.selected_users_project][i].iloc[0] - 1)
Example #9
def onboarding_analysis(onboarding_file):
    df = get_valid_repos()
    churn = pd.read_csv(onboarding_file)
    churn = churn[churn.year == ANALYZED_YEAR]

    df = pd.merge(df, churn, on='repo_name')
    g = df.groupby('quality_group', as_index=False).agg(
        {'comming_involved_developers_ratio': 'mean'})
    print("Onboarding by quality group")
    print(g)
    print(
        "Lift",
        round(
            g[g.quality_group == 'Top 10'].iloc[0].
            comming_involved_developers_ratio / g[g.quality_group == 'Others'].
            iloc[0].comming_involved_developers_ratio - 1.0, 2))

    return df
Example #10
def analyze_porting_pairs():
    # After manual editing and selecting only suitable pairs
    df = get_valid_repos()
    df['user'] = df.repo_name.map(lambda x: x.split('/')[0])
    df['project'] = df.repo_name.map(lambda x: x.split('/')[1])

    p = pd.read_csv(os.path.join(DATA_PATH, 'porting_pairs.csv'))
    j = pd.merge(p, df[['repo_name', 'y2019_ccp']], on='repo_name')

    pairs = pd.merge(j, j, on='user')
    pairs = pairs[(pairs.project_x != pairs.project_y)]
    pairs['y_ccp_by_x'] = 1.0 * pairs.y2019_ccp_y / pairs.y2019_ccp_x
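    # Ratio of the two ports' CCP; below 1 means the y-language port has a
    # lower (better) CCP than the x-language port.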

    g = pairs.groupby(['language_x', 'language_y']).agg({
        'project_x': 'count',
        'y_ccp_by_x': {'mean', 'std'}
    })
    print(g)
    pairs.to_csv(os.path.join(DATA_PATH, 'porting_pairs_ccp.csv'), index=False)
Example #11
def quality_and_speed(commits_per_user_file):

    trep = get_valid_repos()
    trep = trep[['repo_name', 'quality_group']]
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = pd.merge(users_per_project, trep, on='repo_name')

    users_per_project_cur = users_per_project[users_per_project.year ==
                                              ANALYZED_YEAR].copy()
    # commit_per_user
    users_per_project_cur[
        'commit_per_user'] = users_per_project_cur.commits / users_per_project_cur.users
    # users_capped_commit_per_user
    users_per_project_cur[
        'users_capped_commit_per_user'] = users_per_project_cur.users_capped_commit / users_per_project_cur.users

    # The masked aggregation functions below are assumed to be 'mean',
    # matching the other entries and the lift computations that follow.
    g = users_per_project_cur.groupby(['quality_group'], as_index=False).agg({
        'repo_name': 'count',
        'commit_per_user': 'mean',
        'users_above_11_commits_per_above11_users': 'mean',
        'users_capped_commit_per_user': 'mean',
        'users_above_11_500_cap_per_above11_users': 'mean'
    })

    print("quality and speed")
    print(g)
    print("Commit per user top 10 lift",
          (g[g.quality_group == 'Top 10'].iloc[0].commit_per_user /
           g[g.quality_group == 'Others'].iloc[0].commit_per_user) - 1)

    print("Capped commit per user above 11 top 10 lift",
          (g[g.quality_group ==
             'Top 10'].iloc[0].users_above_11_500_cap_per_above11_users /
           g[g.quality_group ==
             'Others'].iloc[0].users_above_11_500_cap_per_above11_users) - 1)

    return g
Example #12
def compute_lang_anova(major_extensions_file):
    ext = pd.read_csv(major_extensions_file)

    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]

    trep = get_valid_repos()

    major = pd.merge(trep, dominant, on='repo_name')
    print("projects with a", DOMINANT_RATE, "dominant extension",
          len(major[major.major_extension_ratio > DOMINANT_RATE]))

    # One-way ANOVA: does mean CCP differ between dominant-language groups?
    ccp_cpp = major[major.major_extension == '.cpp'].y2019_ccp.tolist()
    ccp_cs = major[major.major_extension == '.cs'].y2019_ccp.tolist()
    ccp_java = major[major.major_extension == '.java'].y2019_ccp.tolist()
    ccp_js = major[major.major_extension == '.js'].y2019_ccp.tolist()
    ccp_php = major[major.major_extension == '.php'].y2019_ccp.tolist()
    ccp_py = major[major.major_extension == '.py'].y2019_ccp.tolist()
    ccp_sh = major[major.major_extension == '.sh'].y2019_ccp.tolist()

    print(stats.f_oneway(ccp_cpp, ccp_cs, ccp_java, ccp_js, ccp_php, ccp_py,
                         ccp_sh))
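
For reference, a self-contained toy run of the same one-way ANOVA call (assuming `from scipy import stats`, as the snippet above uses):

from scipy import stats

# Toy CCP samples for three hypothetical language groups.
a = [0.10, 0.12, 0.15, 0.11]
b = [0.20, 0.22, 0.19, 0.25]
c = [0.14, 0.13, 0.16, 0.15]
f_stat, p_value = stats.f_oneway(a, b, c)
print(f_stat, p_value)  # a small p-value suggests the group means differ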
Example #13
def describe_repos(repos_file, bq_properties_file, git_properties_file):

    print("################## Describing repositories ##################")
    df = pd.read_csv(bq_properties_file)
    df = df[df.commit2019 > 199]  # at least 200 commits in 2019
    print("Large active repositories", '{:,}'.format(df.repo_name.nunique()))

    git_repos = pd.read_csv(git_properties_file)
    git_repos = pd.merge(git_repos, df, on='repo_name')

    print("BQ Large non fork repositories",
          '{:,}'.format(git_repos[~git_repos.fork].repo_name.nunique()))

    repos = pd.read_csv(repos_file)
    print("Large non redundant repositories",
          '{:,}'.format(repos[~repos.fork].repo_name.nunique()))

    trep = get_valid_repos()
    print("Valid non redundant repositories",
          '{:,}'.format(trep.repo_name.nunique()))
Example #14
def run_generate_bins():
    df = get_valid_repos()
    pair_analysis_by_bins_to_file(df,
                                  'y2019_ccp',
                                  'stargazers_count',
                                  output_file=os.path.join(
                                      DATA_PATH, 'stars_by_ccp_bins.csv'),
                                  bins=10)
    pair_analysis_by_bins_to_file(df,
                                  'y2019_ccp',
                                  'authors',
                                  output_file=os.path.join(
                                      DATA_PATH, 'authors_by_ccp_bins.csv'),
                                  bins=10)
    pair_analysis_by_bins_to_file(df,
                                  'y2019_ccp',
                                  'start_year',
                                  output_file=os.path.join(
                                      DATA_PATH, 'start_year_by_ccp_bins.csv'),
                                  bins=10)
Example #15
def developer_num_analysis():
    trep = get_valid_repos()

    print("Authors & ccp correlation", trep.corr()['authors']['y2019_ccp'])
    print("CCP for the first num of developers")
    print(trep.groupby(['authors'], as_index=False).agg(
        {'repo_name': 'count', 'y2019_ccp': 'mean'})[:20])
    print(trep.authors.describe())
    q25 = trep.authors.quantile(0.25)
    print("q25", q25)
    print(trep[(trep.authors < q25)].agg(
        {'repo_name': 'count', 'y2019_ccp': 'mean'}))

    q75 = trep.authors.quantile(0.75)
    print("q75", q75)
    print(trep[(trep.authors > q25) & (trep.authors < q75)].agg(
        {'repo_name': 'count', 'y2019_ccp': 'mean'}))

    print("above q75")
    print(trep[(trep.authors > q75)].agg(
        {'repo_name': 'count', 'y2019_ccp': 'mean'}))

    q99 = trep.authors.quantile(0.99)
    print("q99", q99)
    print(trep[(trep.authors > q99)].agg(
        {'repo_name': 'count', 'y2019_ccp': 'mean'}))
Example #16
def coupling_ccp_cochange(repo_file_quality_per_year,
                          repo_file_coupling_per_year):
    """
        Note that repo_file_quality_per_year uses bug hit ratio and not ccp.
        For change analysis it doesn't matter.
    :param repo_file_quality_per_year:
    :return:
    """
    key = 'repo_name'
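    # Presumably the hit rate determines CCP monotonically, so year-over-year
    # changes point in the same direction for both metrics (per the note above).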

    repo_file_quality_per_year_df = build_repo_per_year_df(
        repo_file_quality_per_year, key=key)
    repo_file_coupling_per_year_df = build_repo_per_year_df(
        repo_file_coupling_per_year, key=key)
    per_year_df = pd.merge(repo_file_quality_per_year_df,
                           repo_file_coupling_per_year_df,
                           on=[key, 'year'])
    repos = get_valid_repos()
    per_year_df = pd.merge(per_year_df, repos, on=[key])

    cochange_analysis(per_year_df,
                      first_metric='corrective_commits_ratio',
                      second_metric='avg_capped_files',
                      first_the_higher_the_better=False,
                      second_the_higher_the_better=False,
                      first_sig_threshold=0.1,
                      second_sig_threshold=1,
                      key=key)

    cochange_with_control(per_year_df,
                          first_metric='corrective_commits_ratio',
                          second_metric='avg_capped_files',
                          first_the_higher_the_better=False,
                          second_the_higher_the_better=False,
                          first_sig_threshold=0.1,
                          second_sig_threshold=1,
                          key=key)
Example #17
def plot_longevity(repo_properties_file, longevity_file):
    """
    Longevity is measured on 2018 projects, which are in a different file and
    therefore get a different function.
    """
    repos = pd.read_csv(repo_properties_file)
    longevity = pd.read_csv(longevity_file)

    df = pd.merge(repos, longevity, on='repo_name', how='left')
    df = df[(df.fork == False) & (df.y2018_ccp > 0) & (df.y2018_ccp < 1)]

    df['after_2019_end'] = df.days_from_2019_end.map(lambda x: 1
                                                     if x > 0 else 0)
    grouping_column = 'y2018_ccp_10bins'

    repos_2019 = get_valid_repos()
    bins = 10
    cuts = [0.0] + [
        repos_2019['y2019_ccp'].quantile((1.0 / bins) * i)
        for i in range(1, bins)
    ] + [1.0]
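    # Bin 2018 CCP using the 2019 CCP decile boundaries, so both years are on
    # the same scale.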

    df[grouping_column] = pd.cut(df['y2018_ccp'], cuts)
    """
    bin_metric_by_quantiles(df
                            , 'y2018_ccp'
                            , grouping_column
                            , bins=10
                            )
    """
    df = df.sort_values(grouping_column)
    plot_deciles(df=df,
                 grouping_column=grouping_column,
                 metric_column='after_2019_end',
                 title='Longevity by CCP',
                 xaxis_title='CCP deciles',
                 output_file=os.path.join(FIGURES_PATH, 'longevity.png'))
Example #18
def speed_consistency(commits_per_user_file):

    trep = get_valid_repos()

    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.repo_name.isin(
        trep.repo_name.unique())]

    users_per_project_cur = users_per_project[users_per_project.year ==
                                              ANALYZED_YEAR].copy()
    users_per_project_cur = users_per_project_cur.rename(
        columns={
            'users': 'cur_users',
            'commits': 'cur_commits',
            'users_above_11': 'cur_users_above_11',
            'users_above_11_commits_per_above11_users':
                'cur_users_above_11_commits_per_above11_users',
            'users_capped_commit': 'cur_users_capped_commit',
            'users_above_11_500_cap_per_above11_users':
                'cur_users_above_11_500_cap_per_above11_users'
        })
    # commit_per_user
    users_per_project_cur[
        'cur_commit_per_user'] = users_per_project_cur.cur_commits / users_per_project_cur.cur_users
    # users_capped_commit_per_user
    users_per_project_cur[
        'cur_users_capped_commit_per_user'] = users_per_project_cur.cur_users_capped_commit / users_per_project_cur.cur_users

    users_per_project_prev = users_per_project[users_per_project.year == (
        ANALYZED_YEAR - 1)].copy()
    users_per_project_prev = users_per_project_prev.rename(
        columns={
            'users': 'prev_users',
            'commits': 'prev_commits',
            'users_above_11': 'prev_users_above_11',
            'users_above_11_commits_per_above11_users':
                'prev_users_above_11_commits_per_above11_users',
            'users_capped_commit': 'prev_users_capped_commit',
            'users_above_11_500_cap_per_above11_users':
                'prev_users_above_11_500_cap_per_above11_users'
        })
    # commit_per_user
    users_per_project_prev[
        'prev_commit_per_user'] = users_per_project_prev.prev_commits / users_per_project_prev.prev_users
    # users_capped_commit_per_user
    users_per_project_prev['prev_users_capped_commit_per_user'] = (
        users_per_project_prev.prev_users_capped_commit /
        users_per_project_prev.prev_users)

    upp_adjacent = pd.merge(users_per_project_cur,
                            users_per_project_prev,
                            on='repo_name')
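    # Year-over-year Pearson correlations: how stable each activity metric is
    # between adjacent years.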

    print("Users Pearson", upp_adjacent.corr()['cur_users']['prev_users'])
    print("Users above 11 Pearson",
          upp_adjacent.corr()['cur_users_above_11']['prev_users_above_11'])
    print("Commits Pearson",
          upp_adjacent.corr()['cur_commits']['prev_commits'])
    print(
        "Capped commits Pearson",
        upp_adjacent.corr()['cur_users_capped_commit']
        ['prev_users_capped_commit'])
    print("Commits per user Pearson",
          upp_adjacent.corr()['cur_commit_per_user']['prev_commit_per_user'])
    print(
        "Commits per user above 11 Pearson",
        upp_adjacent.corr()['cur_users_above_11_commits_per_above11_users']
        ['prev_users_above_11_commits_per_above11_users'])
    print(
        "Capped commits per user Pearson",
        upp_adjacent.corr()['cur_users_capped_commit_per_user']
        ['prev_users_capped_commit_per_user'])

    print(
        "Capped commits, above 11 Pearson",
        upp_adjacent.corr()['cur_users_above_11_500_cap_per_above11_users']
        ['prev_users_above_11_500_cap_per_above11_users'])

    return upp_adjacent
Example #19
def length_per_lang_figure(major_extensions_file, image_file):

    ext = pd.read_csv(major_extensions_file)
    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]
    trep = get_valid_repos()

    main = pd.merge(trep, dominant, on='repo_name')

    agg = main.groupby(['major_extension', 'quality_group'],
                       as_index=False).agg({
                           'major_capped_avg_file': {'mean', 'std'},
                           'repo_name': 'count'
                       })

    agg.columns = [
        u'langauge', u'quality_group', u'size_std', u'size_mean', u'projects'
    ]

    print(agg)

    top_size_mean = []
    top_size_std_err = []
    other_size_mean = []
    other_size_std_err = []
    for i in language_extensions:
        top_size_mean.append(
            round(agg[(agg.langauge == i)
                      & (agg.quality_group == 'Top 10')].iloc[0].size_mean))
        top_size_std_err.append(
            round(
                agg[(agg.langauge == i)
                    & (agg.quality_group == 'Top 10')].iloc[0].size_std /
                sqrt(agg[(agg.langauge == i)
                         & (agg.quality_group == 'Top 10')].iloc[0].projects)))
        other_size_mean.append(
            round(agg[(agg.langauge == i)
                      & (agg.quality_group == 'Others')].iloc[0].size_mean))
        other_size_std_err.append(
            round(
                agg[(agg.langauge == i)
                    & (agg.quality_group == 'Others')].iloc[0].size_std /
                sqrt(agg[(agg.langauge == i)
                         & (agg.quality_group == 'Others')].iloc[0].projects)))

    trace1 = go.Bar(x=lang_name,
                    y=top_size_mean,
                    name='Top Length',
                    error_y=dict(type='data',
                                 array=top_size_std_err,
                                 visible=True))

    trace2 = go.Bar(x=lang_name,
                    y=other_size_mean,
                    name='Other Length',
                    error_y=dict(type='data',
                                 array=other_size_std_err,
                                 visible=True))

    data = [trace1, trace2]
    #layout = go.Layout(
    #    barmode='group'
    #)

    layout = go.Layout(
        barmode='group',
        title='File length per language',
        xaxis=dict(title='Language',
                   titlefont=dict(family='Courier New, monospace',
                                  size=18,
                                  color='#7f7f7f')),
        yaxis=dict(title='Average file length',
                   titlefont=dict(family='Courier New, monospace',
                                  size=18,
                                  color='#7f7f7f')))
    fig = go.Figure(data=data, layout=layout)
    plot(fig, image='png', image_filename=image_file, output_type='file')
Example #20
def file_size_analysis(major_extensions_file):

    trep = get_valid_repos()

    rep_size = pd.read_csv(major_extensions_file)
    print('avg file mean', rep_size.avg_size.mean() / KILOBYTE)
    print('std file mean', rep_size.std_size.mean() / KILOBYTE)
    print('avg capped file mean', rep_size.capped_avg_file.mean() / KILOBYTE)
    print('std capped file mean', rep_size.capped_std_file.mean() / KILOBYTE)
    print('std capped file mean/avg capped file mean',
          rep_size.capped_std_file.mean() / rep_size.capped_avg_file.mean())

    treps = pd.merge(trep, rep_size, on='repo_name')
    print(rep_size.capped_avg_file.describe())

    size_25_q = rep_size.capped_avg_file.quantile(0.25)
    print("size 25 quantile", size_25_q, "in kb", size_25_q / KILOBYTE)
    size_75_q = rep_size.capped_avg_file.quantile(0.75)
    print("size 75 quantile", size_75_q, "in kb", size_75_q / KILOBYTE)

    treps['size_group'] = treps.apply(
        lambda x: 'Lower 25' if x.capped_avg_file < size_25_q else "top 25"
        if x.capped_avg_file > size_75_q else "Middle",
        axis=1)

    print('top 10 prob',
          1.0 * len(treps[treps.quality_group == 'Top 10']) / len(treps))
    top_10_in_l25 = 1.0 * len(treps[(treps.quality_group == 'Top 10')
                                    & (treps.size_group == 'Lower 25')]) / len(
                                        treps[treps.size_group == 'Lower 25'])
    print('top 10 prob in lower 25', top_10_in_l25)
    top_10_in_t25 = 1.0 * len(treps[(treps.quality_group == 'Top 10')
                                    & (treps.size_group == 'top 25')]) / len(
                                        treps[treps.size_group == 'top 25'])
    print('top 10 prob in top 25', top_10_in_t25)
    print("short files lift ", top_10_in_l25 / top_10_in_t25 - 1)

    group_by_size = treps.groupby(['size_group'],
                                  as_index=False).agg({'y2019_ccp': 'mean'})
    print(group_by_size)

    print("all files")
    print(
        treps.groupby('quality_group').agg({
            'capped_avg_file': 'mean',
            'avg_size': 'mean',
            'files': 'sum',
            'repo_name': 'count'
        }))

    for i in lang_name:
        print(i, " files")
        print(treps[(treps.major_extension_ratio > DOMINANT_RATE)
                    & (treps.major_extension == lang_extension[i])].groupby(
                        'quality_group').agg({
                            'capped_avg_file': 'mean',
                            'avg_size': 'mean',
                            'files': 'sum',
                            'repo_name': 'count'
                        }))

    print("Size controled by developer groups")
    pretty_print(
        pair_analysis_by_dev_num_group(treps, 'size_group', 'y2019_ccp'))

    print("Size controled by project age")
    pretty_print(pair_analysis_by_age_group(treps, 'size_group', 'y2019_ccp'))

    scatter(treps,
            first_metric='y2019_ccp',
            second_metric='capped_avg_file',
            output_file=os.path.join(FIGURES_PATH,
                                     r'ccp_vs_length_scatter.html'),
            mode='markers',
            opacity=0.9)
    pair_analysis_by_bins_to_file(treps,
                                  'y2019_ccp',
                                  'capped_avg_file',
                                  output_file=os.path.join(
                                      DATA_PATH, 'ccp_vs_length_bins.csv'),
                                  bins=10)

    return treps
Example #21
def build_repo_ccp_dist():

    rep = get_non_fork_repos()
    trep = get_valid_repos()
    rep = rep.sort_values(['y2019_hit_rate'], ascending=False)

    trep = trep.sort_values(['y2019_hit_rate'], ascending=False)

    g = rep.groupby(['y2019_hit_rate_rnd']).agg({'repo_name': 'count'})

    num_of_repos = len(rep)
    num_of_repos_in_range = len(trep)
    print()
    print(r"\begin{table}\centering")
    print(r"\caption{\label{tab:CCP-distrib}")
    print(r"CCP distribution in active GitHub projects}")
    print(r"\begin {tabular}{ | c | c | c | c | c |}")
    print(r"\hline")
    print(
        r"& \multicolumn {2} {c |} {Full data set} & \multicolumn {2} {c |} {CCP $\ in [0, 1]$}\\ "
    )
    print(r"& \multicolumn {2} {c |} {(", f'{num_of_repos:,}',
          "projects)} &\multicolumn")
    print(r"{2}{c |}{(", f'{num_of_repos_in_range:,}',
          r"projects)}\\ \cline {2 - 5}")

    print(
        r"Percentile &  Hit rate & CCP est. & Hit rate & CCP est.  \\ \hline")
    vals = [1.0 * i / 10 for i in range(1, 10)]
    vals.append(0.95)
    #vals.append(0.99)
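    # Each table row reports the hit rate and CCP estimate at one percentile
    # (10..90 plus 95) for both the full and the in-range data sets.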

    for i in vals:
        print(
            str(int(100 * i)) + " & ",
            str(round(rep.iloc[int(i * len(rep))].y2019_hit_rate, 2)),
            " & " + str(round(rep.iloc[int(i * len(rep))].y2019_ccp, 2)),
            " & " +
            str(round(trep.iloc[int(i * len(trep))].y2019_hit_rate, 2)),
            " & " + str(round(trep.iloc[int(i * len(trep))].y2019_ccp, 2)) +
            r" \\ \hline")

    print(r"\end{tabular}")
    print(r"\end{table}")
    print()

    # For manual verification
    # print (rep.y2019_hit_rate_rnd.value_counts(normalize=True).sort_index().cumsum())
    # print(trep.y2019_hit_rate_rnd.value_counts(normalize=True).sort_index().cumsum())

    print("correlation between years")
    print("commits", trep.corr()[u'y2019_commits'][u'y2018_commits'])
    print("hits", trep.corr()[u'y2019_hits'][u'y2018_hits'])
    print("hit ratio", trep.corr()[u'y2019_hit_rate'][u'y2018_hit_rate'])
    print("ccp", trep.corr()[u'y2019_ccp'][u'y2018_ccp'])

    print()
    y2019_hit_rate_median = trep.iloc[int(len(trep) / 2)].y2019_hit_rate
    #print("y2018_hit_rate_median", y2018_hit_rate_median)
    trep['high_half_2019'] = trep.y2019_hit_rate.map(
        lambda x: x > y2019_hit_rate_median)
    trep['high_half_2018'] = trep.y2018_hit_rate.map(
        lambda x: x > y2019_hit_rate_median)
    g = trep.groupby(['high_half_2019', 'high_half_2018'],
                     as_index=False).agg({'repo_name': 'count'})
    g = g.rename(columns={'repo_name': 'cnt'})
    print(
        "stable half",
        1.0 * g[((g.high_half_2019 == False) & (g.high_half_2018 == False)) |
                ((g.high_half_2019 == True) &
                 (g.high_half_2018 == True))].cnt.sum() / len(trep))

    y2019_hit_rate_10p = trep.iloc[int(90 * len(trep) / 100)].y2019_hit_rate
    #print("y2018_hit_rate_10p", y2018_hit_rate_10p)
    trep['high_10_2018'] = trep.y2018_hit_rate.map(
        lambda x: x < y2019_hit_rate_10p)
    trep['high_10_2019'] = trep.y2019_hit_rate.map(
        lambda x: x < y2019_hit_rate_10p)
    g = trep.groupby(['high_10_2018', 'high_10_2019'],
                     as_index=False).agg({'repo_name': 'count'})
    g = g.rename(columns={'repo_name': 'cnt'})
    print(
        "stable 10",
        1.0 * g[((g.high_10_2018 == False) & (g.high_10_2019 == False)) |
                ((g.high_10_2018 == True) &
                 (g.high_10_2019 == True))].cnt.sum() / len(trep))

    print(
        "stay in top", 1.0 * g[((g.high_10_2018 == True) &
                                (g.high_10_2019 == True))].iloc[0].cnt /
        g[g.high_10_2018 == True].cnt.sum())
    print(
        "get to top", 1.0 * g[((g.high_10_2018 == False) &
                               (g.high_10_2019 == True))].iloc[0].cnt /
        g[g.high_10_2018 == False].cnt.sum())

    trep['ccp_abs_diff'] = trep.apply(
        lambda x: round(abs(x.y2019_ccp - x.y2018_ccp), 2), axis=1)
    # For manual verification:
    # trep.ccp_abs_diff.value_counts(normalize=True).sort_index().cumsum()
    print("abs difference mean", trep.ccp_abs_diff.mean())

    print()
    trep['ccp_diff'] = trep.apply(
        lambda x: round(x.y2019_ccp - x.y2018_ccp, 2), axis=1)
    # trep.ccp_diff.value_counts(normalize=True).sort_index().cumsum()
    print("difference mean", trep.ccp_diff.mean())
    print("2018 average ccp ratio",
          trep.y2018_hits.sum() * 1.0 / trep.y2018_commits.sum())
    print("2019 average ccp ratio",
          trep.y2019_hits.sum() * 1.0 / trep.y2019_commits.sum())

    print()
    print("CCP ratios")
    y2019_ccp_90p = trep.iloc[int(90 * len(trep) / 100)].y2019_ccp
    y2019_ccp_50p = trep.iloc[int(50 * len(trep) / 100)].y2019_ccp
    y2019_ccp_10p = trep.iloc[int(10 * len(trep) / 100)].y2019_ccp
    print("2019 top 90 CCP ", round(y2019_ccp_90p, 2))
    print("2019 top 50 CCP ", round(y2019_ccp_50p, 2))
    print("2019 top 10 CCP (worse) ", round(y2019_ccp_10p, 2))
    print("2019 top 50 CCP over top 90", round(y2019_ccp_50p / y2019_ccp_90p,
                                               2))
    print("2019 top 10 CCP over top 90", round(y2019_ccp_10p / y2019_ccp_90p,
                                               2))
Example #22
def file_length_per_language(major_extensions_file, commits_per_user_file,
                             image_file):

    ext = pd.read_csv(major_extensions_file)

    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]

    trep = get_valid_repos()

    major = pd.merge(trep, dominant, on='repo_name')

    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year == 2019]
    trepu = pd.merge(major, users_per_project, on='repo_name')

    trepu['commit_per_user'] = trepu.apply(lambda x: x.y2019_commits / x.users
                                           if x.users > 0 else None,
                                           axis=1)
    trepu['commit_per_user_above_11'] = trepu.apply(
        lambda x: x.users_above_11_commits / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)

    trepu['commit_per_user_cap'] = trepu.apply(
        lambda x: x.users_capped_commit / x.users if x.users > 0 else None,
        axis=1)
    trepu['commit_per_user_above_11_cap'] = trepu.apply(
        lambda x: x.commits_above_11_500_cap / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)

    agg_lang = trepu[trepu.major_extension.isin(language_extensions)].groupby(
        'major_extension', as_index=False).agg({
            'repo_name': 'count',
            'y2019_ccp': {'mean', 'std'},
            'commit_per_user_above_11_cap': {'mean', 'std'}
        })

    agg_lang.columns = agg_lang.columns.droplevel()
    agg_lang.columns = [
        u'langauge', u'projects', u'ccp_mean', u'ccp_std', u'speed_mean',
        u'speed_std'
    ]

    agg_lang_quality = trepu[trepu.major_extension.isin(
        language_extensions)].groupby(['major_extension', 'quality_group'],
                                      as_index=False).agg({
                                          'repo_name': 'count',
                                          'commit_per_user_above_11_cap':
                                          {'mean', 'std'}
                                      })
    agg_lang_quality.columns = agg_lang_quality.columns.droplevel()
    """
    agg_lang_quality = agg_lang_quality.rename(columns={
        'major_extension' : u'langauge'
        , 'std': u'speed_std'
        , 'mean': u'speed_mean'
        , 'count': u'projects'
    })
    """
    agg_lang_quality.columns = [
        u'langauge', u'quality_group', u'projects', u'speed_mean', u'speed_std'
    ]

    all_speed_mean = []
    all_speed_std = []

    top_speed_mean = []
    top_speed_std = []
    other_speed_mean = []
    other_speed_std = []
    ccp_mean = []
    ccp_std = []
    for i in language_extensions:
        top_speed_mean.append(
            round(
                agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_mean))
        top_speed_std.append(
            round(
                agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                           & (agg_lang_quality.quality_group ==
                                              'Top 10')].iloc[0].projects)))
        other_speed_mean.append(
            round(
                agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_mean))
        other_speed_std.append(
            round(
                agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                           & (agg_lang_quality.quality_group ==
                                              'Others')].iloc[0].projects)))
        ccp_mean.append(
            round(100 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_mean))
        ccp_std.append(100 * round(
            agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_std /
            math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects)))
        all_speed_mean.append(
            round(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_mean))
        all_speed_std.append(
            round(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_std /
                  math.sqrt(agg_lang[(agg_lang.langauge
                                      == i)].iloc[0].projects)))

    trace0 = go.Bar(x=lang_name,
                    y=all_speed_mean,
                    name='Speed',
                    error_y=dict(type='data',
                                 array=all_speed_std,
                                 visible=True))
    trace1 = go.Bar(x=lang_name,
                    y=top_speed_mean,
                    name='Top Speed',
                    error_y=dict(type='data',
                                 array=top_speed_std,
                                 visible=True))

    trace2 = go.Bar(x=lang_name,
                    y=other_speed_mean,
                    name='Other Speed',
                    error_y=dict(type='data',
                                 array=other_speed_std,
                                 visible=True))

    trace3 = go.Bar(x=lang_name,
                    y=ccp_mean,
                    name='CCP',
                    error_y=dict(type='data', array=ccp_std, visible=True))
    data = [trace0, trace1, trace2, trace3]

    layout = go.Layout(
        barmode='group',
        title='Speed and CCP per language',
        xaxis=dict(title='Language',
                   titlefont=dict(family='Courier New, monospace',
                                  size=24,
                                  color='#7f7f7f')),
        yaxis=dict(title='Commit per developer, CCP',
                   titlefont=dict(family='Courier New, monospace',
                                  size=24,
                                  color='#7f7f7f')))

    fig = go.Figure(data=data, layout=layout)
    plot(fig, image='png', image_filename=image_file, output_type='file')

    print(r"\begin{tabular}{| l| l| l| l| l| l|}")
    print(r"   \hline ")
    Title = r" Metric & Projects & CCP & Speed & Top Speed & Others Speed  \\ \hline"
    print(Title)
    for i in agg_lang.sort_values('ccp_mean').langauge.tolist():
        Line = str(lang_by_extension(i))

        Line = Line + " & " + str(agg_lang[(agg_lang.langauge
                                            == i)].iloc[0].projects)

        Line = Line + " & " + str(
            round(1 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_mean, 2))
        Line = Line + r" $\pm$ " + str(
            round(
                1 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_std /
                math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects),
                3))

        Line = Line + " & " + str(
            int(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_mean))
        Line = Line + r" $\pm$ " + str(
            int(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_std /
                math.sqrt(agg_lang[
                    (agg_lang.langauge == i)].iloc[0].projects)))

        Line = Line + " & " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_mean))
        Line = Line + r" $\pm$ " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                           & (agg_lang_quality.quality_group ==
                                              'Top 10')].iloc[0].projects)))
        Line = Line + " & " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_mean))
        Line = Line + r" $\pm$ " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                           & (agg_lang_quality.quality_group ==
                                              'Others')].iloc[0].projects)))

        Line = Line + r" \\ \hline"
        print(Line)

    scatter(trepu,
            first_metric='y2019_ccp',
            second_metric='commit_per_user_above_11_cap',
            output_file=os.path.join(FIGURES_PATH,
                                     r'ccp_vs_speed_scatter.html'),
            mode='markers',
            opacity=0.9)

    pair_analysis_by_bins_to_file(trepu,
                                  'y2019_ccp',
                                  'commit_per_user_above_11_cap',
                                  output_file=os.path.join(
                                      DATA_PATH, 'ccp_vs_speed_bins.csv'),
                                  bins=10)
Example #23
def ccp_cdf_per_language(major_extensions_file, image_file):

    ext = pd.read_csv(major_extensions_file)
    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]
    print("Number of repositories with a dominant extension above",
          DOMINANT_RATE, "is", len(dominant))

    trep = get_valid_repos()

    major = pd.merge(trep, dominant, on='repo_name')

    trepu = major

    cdfs = {}
    traces = []
    for i in lang_name:
        cdf = trepu[trepu.major_extension == lang_extension[i]] \
            .y2019_ccp.value_counts(normalize=True).sort_index().cumsum()
        cdf = pd.DataFrame(cdf)
        cdf = cdf.reset_index()
        cdf.columns = ['ccp', 'cdf']
        cdf = cdf[cdf.ccp < LIMIT]
        cdfs[i] = cdf
        traces.append(go.Scatter(
                            x=cdfs[i].ccp,
                            y=cdfs[i].cdf,
                            mode='lines',
                            name=i
                        ))

    layout = go.Layout(
        title='CDF of CCP for common languages',
        xaxis=dict(
            title='CCP',
            titlefont=dict(
                family='Courier New, monospace',
                size=24,
                color='#7f7f7f'
            )
        ),
        yaxis=dict(
            title='CDF of projects CCP',
            titlefont=dict(
                family='Courier New, monospace',
                size=24,
                color='#7f7f7f'
            )
        )
    )
    fig = go.Figure(data=traces, layout=layout)

    plot(fig,
         image='png',
         image_filename=image_file,
         output_type='file',
         image_width=800,
         image_height=400)

    #plot(fig)
    fig.write_image(image_file)
Example #24
def run_star_analysis():
    trep = get_valid_repos()
    do_stars_analysis(trep)
    Linus_rule()
Example #25
def quality_and_speed_over_years(commits_per_user_file):

    print("over the years ccp and speed change")
    trep = get_valid_repos()
    trep = trep[['repo_name']]
    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year > 2014]
    df = pd.merge(users_per_project, trep, on='repo_name')

    df = df[[
        'repo_name', 'year', 'corrective_commits_ratio',
        'commits_per_above11_users'
    ]]
    df = df.dropna()

    cur_df = df.copy()
    cur_df['prev_year'] = cur_df.year - 1
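    # Join each year with the preceding one, yielding one row per repository
    # and consecutive-year pair.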
    cur_df = cur_df.rename(
        columns={
            'year': 'cur_year',
            'corrective_commits_ratio': 'cur_corrective_commits_ratio',
            'commits_per_above11_users': 'cur_commits_per_above11_users'
        })

    prev_df = df.copy()
    prev_df = prev_df.rename(
        columns={
            'year': 'prev_year',
            'corrective_commits_ratio': 'prev_corrective_commits_ratio',
            'commits_per_above11_users': 'prev_commits_per_above11_users'
        })

    two_years = pd.merge(cur_df, prev_df, on=['repo_name', 'prev_year'])
    two_years[
        'improved_ccp'] = two_years.cur_corrective_commits_ratio < two_years.prev_corrective_commits_ratio
    two_years[
        'hurt_ccp'] = two_years.cur_corrective_commits_ratio > two_years.prev_corrective_commits_ratio
    two_years[
        'improved_speed'] = two_years.cur_commits_per_above11_users > two_years.prev_commits_per_above11_users

    g = two_years.groupby(['improved_ccp', 'improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)

    cm = ConfusionMatrix(g_df=g,
                         classifier='improved_ccp',
                         concept='improved_speed',
                         count='repo_name')

    print(cm.summarize())
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision())
    print("ccp improvement given speed improvement",
          cm.tp() / (cm.fn() + cm.tp()))

    two_years[
        'sig_improved_ccp'] = two_years.cur_corrective_commits_ratio < two_years.prev_corrective_commits_ratio - 0.1
    two_years[
        'sig_improved_speed'] = two_years.cur_commits_per_above11_users > two_years.prev_commits_per_above11_users + 10

    g = two_years.groupby(['sig_improved_ccp', 'sig_improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)

    cm = ConfusionMatrix(g_df=g,
                         classifier='sig_improved_ccp',
                         concept='sig_improved_speed',
                         count='repo_name')
    print(cm.summarize())

    g = two_years.groupby(['sig_improved_ccp', 'improved_speed'],
                          as_index=False).agg({'repo_name': 'count'})
    print(g)

    # Rebuild the matrix for this grouping (assumed intent; the original
    # reused the previous ConfusionMatrix here).
    cm = ConfusionMatrix(g_df=g,
                         classifier='sig_improved_ccp',
                         concept='improved_speed',
                         count='repo_name')
    print(cm.summarize())
    print()
    print("speed & ccp improvement match", cm.accuracy())
    print("speed improvement given ccp improvement", cm.precision(), "lift",
          cm.precision_lift())
    print("ccp improvement given speed improvement", cm.recall(), "lift",
          cm.recall() / cm.hit_rate() - 1)
    print()

    g = two_years.groupby(['sig_improved_speed', 'hurt_ccp'],
                          as_index=False).agg({'repo_name': 'count'})
    cm = ConfusionMatrix(g_df=g,
                         classifier='sig_improved_speed',
                         concept='hurt_ccp',
                         count='repo_name')

    print(cm.summarize())
    print()
    print("ccp hurt given significant speed improvement", cm.precision(),
          "lift", cm.precision_lift())
    print()
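
A rough pandas-only sketch of the quantities the ConfusionMatrix calls above appear to compute, assuming the classifier and concept columns are boolean and the count column holds group sizes (toy numbers, for illustration only):

import pandas as pd

# Toy grouped counts in the same shape as `g` above.
g = pd.DataFrame({'improved_ccp':   [False, False, True, True],
                  'improved_speed': [False, True, False, True],
                  'repo_name':      [40, 10, 15, 35]})

tp = g[g.improved_ccp & g.improved_speed].repo_name.sum()
fp = g[g.improved_ccp & ~g.improved_speed].repo_name.sum()
fn = g[~g.improved_ccp & g.improved_speed].repo_name.sum()
tn = g[~g.improved_ccp & ~g.improved_speed].repo_name.sum()
total = tp + fp + fn + tn

print("accuracy ", (tp + tn) / total)  # improvement match
print("precision", tp / (tp + fp))     # speed improvement given ccp improvement
print("recall   ", tp / (tp + fn))     # ccp improvement given speed improvement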
Example #26
import os
import pandas as pd
import plotly
import plotly.graph_objects as go

from analysis_configuration import EARLIEST_ANALYZED_YEAR
from configuration import FIGURES_PATH
from repo_utils import get_valid_repos

df = get_valid_repos()


def repos_by_lang():
    g_by_lang = df.groupby(['language'], as_index=False).agg(
        {'repo_name': 'nunique', 'commits': 'mean'})
    g_by_lang = g_by_lang[g_by_lang.repo_name > 9]
    g_by_lang = g_by_lang.sort_values(['repo_name', 'language'],
                                      ascending=[True, True])

    graphs = [
        go.Bar(x=g_by_lang['language'], y=g_by_lang['repo_name'], name='repos'),
        go.Bar(x=g_by_lang['language'], y=g_by_lang['commits'], name='commits')
    ]

    fig = go.Figure(data=graphs)
    fig.update_layout(
        title=go.layout.Title(
            text="Repositories by language",
            xref="paper",
            x=0
        ),
        xaxis=go.layout.XAxis(
            # The original snippet is truncated here; this is a minimal,
            # assumed completion that labels the axis and shows the figure.
            title=go.layout.xaxis.Title(text="Language")
        )
    )
    fig.show()
Example #27
def decile_analysis(major_extensions_file, coupling_file,
                    commits_per_user_file, churn_file, onboarding_file,
                    reuse_file, output_file):

    repos = get_valid_repos()

    repos = repos.rename(columns={'commits': 'repo_all_commits'})

    bin_metric_by_quantiles(repos, 'y2019_ccp', 'y2019_ccp_10bins', bins=10)
    # File length
    rep_size = pd.read_csv(major_extensions_file)
    df = pd.merge(repos, rep_size, on='repo_name', how='left')
    df['Capped_Length_KB'] = df.capped_avg_file / KILOBYTE

    # Coupling
    coupling_size = pd.read_csv(coupling_file)
    coupling_size = coupling_size[coupling_size.year == ANALYZED_YEAR]
    df = pd.merge(df, coupling_size, on='repo_name', how='left')
    df['Commit_Size_Capped'] = df['avg_capped_files']

    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year ==
                                          ANALYZED_YEAR]

    df = pd.merge(df, users_per_project, on='repo_name', how='left')

    df['commit_per_user'] = df.apply(lambda x: x.y2019_commits / x.users
                                     if x.users > 0 else None,
                                     axis=1)
    df['commit_per_user_above_11'] = df.apply(
        lambda x: x.users_above_11_commits / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)

    df['commit_per_user_cap'] = df.apply(
        lambda x: x.users_capped_commit / x.users if x.users > 0 else None,
        axis=1)
    df['Commit_Per_Involved_User_Cappped'] = df.apply(
        lambda x: x.commits_above_11_500_cap / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)

    # print(df.groupby(['y2019_ccp_10bins']).agg({'Commit_Per_Involved_User_Cappped' : 'mean', 'repo_name' : 'count'}).sort_index())

    df['repo_all_commits_log10'] = df.repo_all_commits.map(lambda x: log10(x)
                                                           if x > 0 else x)
    df['authors_log10'] = df.authors.map(lambda x: log10(x) if x > 0 else x)
    df['stargazers_count_log10'] = df.stargazers_count.map(lambda x: log10(x)
                                                           if x > 0 else x)

    churn = pd.read_csv(churn_file)
    churn = churn[churn.year == ANALYZED_YEAR - 1]

    df = pd.merge(df, churn, on='repo_name', how='left')
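    # Churn: the complement of the continuing-developers ratio.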
    df['churn'] = 1.0 - df['continuing_developers_ratio']

    onboarding = pd.read_csv(onboarding_file)
    onboarding = onboarding[onboarding.year == ANALYZED_YEAR]
    df = pd.merge(df, onboarding, on='repo_name', how='left')
    df['Onboarding'] = df.comming_involved_developers_ratio

    reuse = pd.read_csv(reuse_file)
    df = pd.merge(df, reuse, on='repo_name', how='left')

    aggregations = {i: 'mean' for i in metrics}
    aggregations['repo_name'] = 'count'
    g = df.groupby('y2019_ccp_10bins', as_index=False).agg(aggregations)

    g.to_csv(output_file)

    plot_all_metrics(df, grouping_column='y2019_ccp_10bins')

    plot_by_ccp_all_metrics(df,
                            grouping_columns=[
                                'Capped_Length_KB', 'Commit_Size_Capped',
                                'package_avg'
                            ])
    #plot_ccp_by_length_per_lang(df)

    return df