예제 #1
0
def validation_wrapper(project_name, sws):
    """
    Run `validation` for every key-developer category of one experiment.

    The stored results of the experiment are loaded and the per-date "union"
    entry is attached to them. Then, for each of the categories "jacks",
    "union" and "top_committers", `validation` is called with the key
    developers of that category, the top commenters and all developers of
    each date.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    sws (int):
        Sliding window size.

    Returns
    -------
    str:
        Name of the experiment.

    dict:
        Mapping from each category to the tuple (accuracy table, Monte Carlo
        average accuracy table, average improvement in percent).
    """
    exp_name = get_exp_name(project_name, sws=sws)
    date_to_results = load_results(exp_name)

    # Attach the union entry to the results of every date.
    date_to_union = generate_date_to_union(date_to_results)
    for date, results in date_to_results.items():
        results["union"] = date_to_union[date]

    # Only the names of the top commenters are needed, not their counts.
    date_to_top_commenters = {}
    for date, top_commenters in generate_date_to_top_commenters(
            project_name, sws).items():
        date_to_top_commenters[date] = list(top_commenters.keys())

    date_to_developers = {}
    for date, results in date_to_results.items():
        date_to_developers[date] = results["developers"]

    res_dict = {}
    for category in ("jacks", "union", "top_committers"):
        date_to_key_developers = {}
        for date, results in date_to_results.items():
            date_to_key_developers[date] = list(results[category].keys())

        acc_table, monte_carlo_avg_acc_table = validation(
            date_to_key_developers, date_to_top_commenters, date_to_developers)

        # Mean ratio of the accuracy to its Monte Carlo counterpart, expressed
        # as a percentage gain (e.g. a mean ratio of 1.2 gives about 20).
        ratios = [
            acc / monte_carlo_acc for acc, monte_carlo_acc in zip(
                acc_table.values(), monte_carlo_avg_acc_table.values())
        ]
        mean_ratio = sum(ratios) / len(acc_table)
        avg_improvement = mean_ratio * 100 - 100

        res_dict[category] = (acc_table, monte_carlo_avg_acc_table,
                              avg_improvement)

    return exp_name, res_dict
def run_plot_test(flags):
    """
    Plot the test results of every problem listed in `flags.problems`.

    For each problem the stored results are loaded from the experiment path
    of `flags.name` using the prefix "<problem>_<mode>" (extended with
    "_<compare_with>" when the mode is 'many') and handed to
    `plot_test_results`.

    Parameters
    ----------
    flags:
        Object providing the attributes `name`, `problems`, `mode` and, for
        mode 'many', `compare_with`.
    """
    experiment_path = paths.experiment_path(flags.name)

    for problem in flags.problems:
        prefix_parts = [problem, flags.mode]
        if flags.mode == 'many':
            prefix_parts.append(flags.compare_with)

        data = util.load_results(experiment_path,
                                 prefix='_'.join(prefix_parts))
        plot_test_results(flags, experiment_path, data)
예제 #3
0
def main():
    """
    Train or load the mushroom network, test it and store the outcome.

    The configuration is read from 'config/mushroom.yaml'. When
    cfg['nn']['train'] is truthy a new network is trained; otherwise the
    previously stored results and model are loaded and the stored training and
    validation error curves are shown. In both cases the network is then run
    on the test data, the ROC curve is plotted, and `conf_mat`, the training
    statistics and the ROC AUC are stored under "<model_name>_cm" in the model
    directory.
    """
    cfg = util.read_config('config/mushroom.yaml')

    # Load the mushroom data and shuffle both splits.
    x_train, x_test, y_train, y_test = load_data(cfg['dataset'],
                                                 cfg['test_ratio_offset'])
    x_train, y_train = util.shuffle_data(x_train, y_train)
    x_test, y_test = util.shuffle_data(x_test, y_test)

    # The configured model name is used when loading; training replaces it.
    nn_cfg = cfg['nn']
    model_name = nn_cfg['model_name']
    model_dir = nn_cfg['model_dir']

    with tf.Session() as sess:
        if not nn_cfg['train']:
            stored_path = os.path.join(model_dir, model_name + "_cm")
            loaded_results = util.load_results(stored_path)

            # Show the error curves recorded when the model was trained.
            plt.title('Error vs Epoch')
            train_stats = loaded_results['train_stats']
            curves = (
                ('train_errors', 'r', 'training'),
                ('valid_errors', 'b', 'validation'),
            )
            for stats_key, colour, legend_label in curves:
                plt.plot(train_stats[stats_key],
                         color=colour,
                         label=legend_label)
            plt.xlabel('Epoch')
            plt.ylabel('Error')
            plt.legend()
            plt.grid()
            plt.show()

            print('[ANN] Testing network {0}...'.format(model_name))
            model_path = os.path.join(model_dir, model_name + "_model")
            model = util.load_model(model_path)
        else:
            # Train network on our training data
            print('[ANN] Training new network...')
            model, model_name, train_stats = train_network(
                sess, x_train, y_train, cfg)

        # Test network on our testing data
        results = test_network(sess, model, x_test, y_test, cfg)
        conf_mat, sk_fpr, sk_tpr, roc_auc = util.analyse_results(
            y_test, results)
        print('[ANN] ROC Area Under Curve: {0:.2f}'.format(roc_auc))
        plot_roc(sk_fpr, sk_tpr, roc_auc)

        # model_name may have been replaced by training, so the output path
        # is built only now.
        output_path = os.path.join(model_dir, model_name + "_cm")
        results_to_save = {
            'conf_mat': conf_mat,
            'train_stats': train_stats,
            'roc_auc': float(roc_auc),
        }
        util.store_results(results_to_save, output_path)
예제 #4
0
def acc_figure(name):
    """
    Draw a 4x3 grid of accuracy curves and save the figure under `name`.

    Each column of the grid corresponds to one entry of DATASETS and each row
    to one entry of TARGETS. Every axis shows two curves, one for the 'delay'
    and one for the 'interval' strategy; both start at x = 0 with the value
    of the 'none' strategy, and the x values are `mean_delta` divided by 1000.

    Parameters
    ----------
    name (str):
        Name passed to `load_results` to load the data and to `save_fig` to
        save the figure.
    """
    df = load_results(name).set_index(['dataset', 'strategy'])
    fig, axes = plt.subplots(4, 3, sharey=True, squeeze=True, figsize=(6, 5))

    # Strategy key and the line style it is drawn with, in drawing order.
    strategies = (
        ('delay', {'linewidth': 1, 'label': 'Delay'}),
        ('interval', {'linewidth': 1, 'linestyle': '--',
                      'label': 'Interval'}),
    )

    for dataset, col in zip(DATASETS, axes.T):
        # The short fixed-text dataset uses a narrower x range.
        if dataset == 'short_fixed':
            x_max, x_ticks = 1, [0, 0.25, 0.5, 0.75]
        else:
            x_max, x_ticks = 2, [0, 0.5, 1, 1.5]

        for target, ax in zip(TARGETS, col):
            for strategy, line_style in strategies:
                lag = df.loc[(dataset, strategy), 'mean_delta'].values / 1000
                baseline = df.loc[(dataset, 'none'), target].iloc[0]
                scores = df.loc[(dataset, strategy), target]
                ax.plot(np.r_[0, lag], np.r_[baseline, scores], **line_style)

            ax.set_ylim(0, 1)
            ax.set_xlim(0, x_max)
            ax.set_xticks(x_ticks)

    column_titles = ('Short fixed-text', 'Long fixed-text', 'Long free-text')
    for ax, title in zip(axes[0], column_titles):
        ax.set_title(title)

    row_labels = ('Identity ACC', 'Age ACC', 'Gender ACC', 'Handedness ACC')
    for ax, label in zip(axes[:, 0], row_labels):
        ax.set_ylabel(label)

    # Only the bottom row keeps its x tick labels.
    for ax in axes[:3, :3].flat:
        ax.set_xticklabels([])

    axes[-1, -1].legend(loc='lower right')

    fig.text(0.5, 0.0, 'Lag (s)', ha='center')
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.1, hspace=0.15)

    save_fig(name)
def run_plot(flags):
    """
    Dispatch plotting according to `flags.phase`.

    Phase 'test' is delegated to `run_plot_test`. For the phases 'train' and
    'cv' the results stored under the model path of `flags.name` are loaded
    and passed to `plot_training_results` or `plot_cv_results` respectively.

    Parameters
    ----------
    flags:
        Object providing at least the attributes `phase` and `name`.

    Raises
    ------
    KeyError:
        If the phase is none of 'test', 'train' or 'cv' (raised after the
        results have been loaded).
    """
    phase = flags.phase
    if phase != 'test':
        model_path = paths.model_path(flags.name)
        data = util.load_results(model_path)

        plotters = {
            'train': plot_training_results,
            'cv': plot_cv_results,
        }
        plotters[phase](flags, model_path, data)
    else:
        run_plot_test(flags)
예제 #6
0
def acc_figure(name):
    """
    Plot accuracy curves on a 4x3 grid of axes and save the figure.

    Columns follow DATASETS and rows follow TARGETS. On every axis the
    'delay' strategy is drawn as a solid line and the 'interval' strategy as
    a dashed line. Each curve is prefixed with the point (0, value of the
    'none' strategy), and its x values are `mean_delta` divided by 1000.

    Parameters
    ----------
    name (str):
        Name handed to `load_results` for loading and to `save_fig` for
        saving.
    """
    df = load_results(name)
    df = df.set_index(['dataset', 'strategy'])
    fig, axes = plt.subplots(4, 3, sharey=True, squeeze=True, figsize=(6, 5))

    def draw_curve(ax, dataset, target, strategy, **line_kwargs):
        # One curve: x is the lag of the strategy, y its score, both
        # prefixed with the no-strategy starting point.
        xs = np.r_[0, df.loc[(dataset, strategy), 'mean_delta'].values / 1000]
        ys = np.r_[df.loc[(dataset, 'none'), target].iloc[0],
                   df.loc[(dataset, strategy), target]]
        ax.plot(xs, ys, **line_kwargs)

    for dataset, col in zip(DATASETS, axes.T):
        is_short = dataset == 'short_fixed'
        for target, ax in zip(TARGETS, col):
            draw_curve(ax, dataset, target, 'delay',
                       linewidth=1, label='Delay')
            draw_curve(ax, dataset, target, 'interval',
                       linewidth=1, linestyle='--', label='Interval')

            ax.set_ylim(0, 1)
            ax.set_xlim(0, 1 if is_short else 2)
            ax.set_xticks([0, 0.25, 0.5, 0.75] if is_short
                          else [0, 0.5, 1, 1.5])

    titles = ['Short fixed-text', 'Long fixed-text', 'Long free-text']
    for col_idx, title in enumerate(titles):
        axes[0, col_idx].set_title(title)

    y_labels = ['Identity ACC', 'Age ACC', 'Gender ACC', 'Handedness ACC']
    for row_idx, y_label in enumerate(y_labels):
        axes[row_idx, 0].set_ylabel(y_label)

    # Hide the x tick labels on the first three rows.
    for row in axes[:3]:
        for ax in row[:3]:
            ax.set_xticklabels([])

    axes[-1, -1].legend(loc='lower right')

    fig.text(0.5, 0.0, 'Lag (s)', ha='center')
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.1, hspace=0.15)

    save_fig(name)
예제 #7
0
def validation(project_name, sliding_window_size, check_days, max_k,
               random_val):
    """
    Perform validation with given parameters.

    For every date of the stored experiment results and every leaving
    developer with at least one recommended replacement, the (recommended or
    randomly drawn) replacements are checked in order: the rank of a leaving
    developer is the 1-based position of the first replacement for which
    `check_modification` succeeds on the change sets of the following
    `check_day` days, or infinity if none does.

    Parameters
    ----------
    project_name (str):
        Name of the project to read change sets.

    sliding_window_size (int):
        Number of days to include in the graph.

    check_days (list):
        List of integers (numbers of days) used to check whether the
        recommendations are true or false.

    max_k (int):
        Maximum k for topk and MRR calculations. When max_k is 3, top1, top2 and top3
        will be calculated, and the ranks in MRR calculations can be 1, 2 and 3.

    random_val (bool):
        If True, up to `max_k` replacements will be selected randomly from the
        developers of that date other than the leaving developer. Fewer than
        `max_k` are selected when there are not enough other developers.

    Returns
    -------
    list:
        First item of the list is the name of the experiment. Second and the following
        items will include accuracy and MRR for each check day. For example, returns
        [pig_sws365, (7, {top1:.5, top2:.7, mrr:.6}), (30, {top1:.6, top2:.9, mrr:.7})].
    """
    dataset_path = get_dataset_path(project_name)
    exp_name = get_exp_name(project_name, sws=sliding_window_size)

    dm = DataManager(dataset_path, None)  # No need for sliding window size
    G = HistoryGraph(dataset_path, sliding_window_size)

    check_day_to_ranks = {check_day: [] for check_day in check_days}
    date_to_results = load_results(exp_name)
    for date, results in date_to_results.items():
        if not results["replacements"]:  # No leaving developer
            continue

        G.forward_until(date)  # Update graph

        for leaving_dev, recommended_devs in results["replacements"].items():
            if not recommended_devs:  # No recommended developers
                continue

            if random_val:  # Randomly select up to "max_k" developers
                # Build a fresh candidate list rather than removing from
                # results["developers"] in place: an in-place removal would
                # shrink the pool for every later leaving developer of the
                # same date and would raise if the developer is not listed.
                other_devs = [
                    dev for dev in results["developers"] if dev != leaving_dev
                ]
                # random.sample raises ValueError when asked for more items
                # than available, so never request more than the pool holds.
                recommended_devs = random.sample(
                    other_devs, min(max_k, len(other_devs)))
            else:  # Convert dictionary keys to list and get first "max_k" items
                recommended_devs = list(recommended_devs)[:max_k]

            leaving_dev_files = set(G.find_reachable_files(leaving_dev))

            for check_day in check_days:
                # Get the change sets in the next days
                # For example, get the change sets in the next 7 days if check day is 7
                change_sets = dm.get_specific_window(
                    date + timedelta(days=1), date + timedelta(days=check_day))
                rank = float("inf")  # Not found yet
                for i, recommended_dev in enumerate(recommended_devs):
                    recommended_dev_files = set(
                        G.find_reachable_files(recommended_dev))

                    # Find the files that leaving developer can reach but recommended
                    # developer cannot reach
                    target_files = leaving_dev_files - recommended_dev_files

                    if check_modification(change_sets, recommended_dev,
                                          target_files):
                        rank = i + 1
                        break  # No need to check other developers

                check_day_to_ranks[check_day].append(rank)

    ret_items = [exp_name]

    for check_day in check_days:
        res = {}
        for k in range(1, max_k + 1):
            res["top{}".format(k)] = cal_accuracy(
                check_day_to_ranks[check_day], k)

        res["mrr"] = cal_mrr(check_day_to_ranks[check_day])

        ret_items.append((check_day, res))
    return ret_items
예제 #8
0
    intersection_dates = set(dict1.keys()).intersection(dict2.keys())
    num_matches = sum(1 for date in intersection_dates
                      if dict1[date] == dict2[date])
    accuracy = num_matches / len(intersection_dates)

    return accuracy


if __name__ == "__main__":
    for sws in sws_list:
        print("*** Sliding Window Size: {} ***\n".format(sws))
        for project_name in project_list:
            print(project_name)
            for alpha in alpha_list:
                print("Alpha: {}".format(alpha))
                our_results = load_results(get_exp_name(project_name, sws=sws))

                date_to_label_our = {
                    date:
                    our_results[date]["balanced_or_hero_{}".format(alpha)]
                    for date in our_results
                    if our_results[date]["balanced_or_hero_{}".format(
                        alpha)]  # num of devs is not less than 3
                }

                date_to_dev_to_commit_counts = generate_date_to_top_committers(
                    project_name, sws)
                date_to_label_pareto = balanced_or_hero_pareto_over_time(
                    date_to_dev_to_commit_counts)

                num_hero_our = 0