Example #1
def run_experiment(experiment_name, dataset_path):
    """
    Run experiment with default parameters and export results into a pickle file.
    First, create a graph for the initial window, then slide that window day by day.
    Find developers, mavens, connectors and jacks for each iteration.
    
    Parameters
    ----------
    experiment_name (str):
        Name of the experiment.

    dataset_path (str):
        Dataset path to read data.
    """

    G = HistoryGraph(dataset_path)

    log_path = "logs/{}.log".format(experiment_name)
    print_log(
        "Started (Total iterations: {}).\n".format(G.get_num_iterations()),
        log_path,
        mode="w",
    )

    # Start iterations
    result = {}
    i = 0
    while True:
        i += 1

        result[G.get_last_included_date()] = {
            "developers": G.get_developers(),
            "jacks": G.get_jacks(),
            "mavens": G.get_mavens(),
            "connectors": G.get_connectors(),
        }

        print_log("{} -> {} nodes\n".format(i, G.get_num_nodes()), log_path)

        if not G.forward_graph_one_day():
            break

    print_log("Ended.\n", log_path)

    with open("results/{}.pkl".format(experiment_name), "wb") as f:
        pickle.dump(result, f)

    print_log("Exported results to 'results/{}.pkl'".format(experiment_name),
              log_path)
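
A minimal usage sketch (the experiment name "pig_sws365" and the dataset path below are placeholders): run the experiment, then load the exported pickle to inspect the per-day results.

import pickle

run_experiment("pig_sws365", "data/pig")  # hypothetical name and path

with open("results/pig_sws365.pkl", "rb") as f:
    result = pickle.load(f)

# Each key is the last included date of an iteration; each value holds the
# developers, jacks, mavens and connectors found for that day.
for date, snapshot in result.items():
    print(date, snapshot["jacks"])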
Example #2
def average_number_of_developers():
    """
    Generate the average number of developers in the graph for each project.
    """

    print("\n*** Average Number of Developer ***\n")
    print("{:<15}".format("SWS"), end="")
    print(("{:<15}" * len(project_list)).format(*project_list))
    for sws in sws_list:
        print("{:<15}".format(sws), end="")
        for project_name in project_list:
            dataset_path = get_dataset_path(project_name)
            G = HistoryGraph(dataset_path, sliding_window_size=sws)
            dev_nums = []
            while True:
                devs = G.get_developers()
                dev_nums.append(len(devs))
                if not G.forward_graph_one_day():
                    break
            avg_dev_num = sum(dev_nums) / len(dev_nums)
            print("{:<15.2f}".format(avg_dev_num), end="")
        print()
    print()
Example #3
def leaving_developers_table():
    """
    Generate the number of leaving developers for each project.
    """

    print("\n*** Number of Leaving Developers ***\n")
    print("Absence Limit ",
          ("{:<15}" * len(project_list)).format(*project_list))
    for absence_limit in sws_list:
        print("{:<15}".format(absence_limit), end="")
        for project_name in project_list:
            dataset_path = get_dataset_path(project_name)
            G = HistoryGraph(dataset_path, sliding_window_size=absence_limit)
            date_to_leaving_developers = find_leaving_developers(G)
            leaving_developers = [
                dev for devs in date_to_leaving_developers.values()
                for dev in devs
            ]
            print("{:<15}".format(len(leaving_developers)), end="")
        print()
    print()
Example #4
def validation(project_name, sliding_window_size, check_days, max_k,
               random_val):
    """
    Perform validation with given parameters.

    Parameters
    ----------
    project_name (str):
        Name of the project to read change sets.

    sliding_window_size (int):
        Number of days to include in the graph.

    check_days (list):
        List of integers (in days) used to check whether recommendations are true or false.

    max_k (int):
        Maximum k for top-k and MRR calculations. When max_k is 3, top1, top2 and
        top3 will be calculated, and the ranks in MRR calculations can be 1, 2 and 3.

    random_val (bool):
        If True, `max_k` replacements will be selected randomly.

    Returns
    -------
    list:
        The first item is the name of the experiment. Each following item is a
        (check_day, scores) tuple holding the top-k accuracies and MRR for that
        check day. For example, returns
        [pig_sws365, (7, {top1:.5, top2:.7, mrr:.6}), (30, {top1:.6, top2:.9, mrr:.7})].
    """
    dataset_path = get_dataset_path(project_name)
    exp_name = get_exp_name(project_name, sws=sliding_window_size)

    dm = DataManager(dataset_path, None)  # No need for sliding window size
    G = HistoryGraph(dataset_path, sliding_window_size)

    check_day_to_ranks = {check_day: [] for check_day in check_days}
    date_to_results = load_results(exp_name)
    for date, results in date_to_results.items():
        if not results["replacements"]:  # No leaving developer
            continue

        G.forward_until(date)  # Update graph

        for leaving_dev, recommended_devs in results["replacements"].items():
            if not recommended_devs:  # No recommended developers
                continue

            if random_val:  # Randomly select "max_k" developers
                other_devs = results["developers"]
                other_devs.remove(leaving_dev)
                recommended_devs = random.sample(other_devs, max_k)
            else:  # Convert dictionary keys to list and get first "max_k" items
                recommended_devs = list(recommended_devs)[:max_k]

            leaving_dev_files = set(G.find_reachable_files(leaving_dev))

            for check_day in check_days:
                # Get the change sets in the next days
                # For example, get the change sets in the next 7 days if check day is 7
                change_sets = dm.get_specific_window(
                    date + timedelta(days=1), date + timedelta(days=check_day))
                rank = float("inf")  # Not found yet
                for i, recommended_dev in enumerate(recommended_devs):
                    recommended_dev_files = set(
                        G.find_reachable_files(recommended_dev))

                    # Find the files that the leaving developer can reach but the
                    # recommended developer cannot
                    target_files = leaving_dev_files - recommended_dev_files

                    if check_modification(change_sets, recommended_dev,
                                          target_files):
                        rank = i + 1
                        break  # No need to check other developers

                check_day_to_ranks[check_day].append(rank)

    ret_items = [exp_name]

    for check_day in check_days:
        res = {}
        for k in range(1, max_k + 1):
            res["top{}".format(k)] = cal_accuracy(
                check_day_to_ranks[check_day], k)

        res["mrr"] = cal_mrr(check_day_to_ranks[check_day])

        ret_items.append((check_day, res))
    return ret_items
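
Ranks are collected with float("inf") marking a miss, so top-k accuracy and MRR follow directly from the rank lists. A plausible sketch of the cal_accuracy and cal_mrr helpers under that convention (their actual implementations are not shown here):

def cal_accuracy(ranks, k):
    # Fraction of cases where the true replacement appeared within the
    # first k recommendations; infinite ranks count as misses.
    if not ranks:
        return 0.0
    return sum(1 for rank in ranks if rank <= k) / len(ranks)

def cal_mrr(ranks):
    # Mean reciprocal rank; 1 / float("inf") is 0.0, so misses add nothing.
    if not ranks:
        return 0.0
    return sum(1 / rank for rank in ranks) / len(ranks)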
Example #5
def run_experiment(experiment_name, dataset_path, sliding_window_size):
    """
    Run experiment with default parameters and export results into a pickle file.
    First, create a graph for the initial window, then slide that window day by day.
    Find developers, mavens, connectors, jacks and knowledge distribution labels for
    each iteration (day). Also, find replacements if any developers leave the project.

    Parameters
    ----------
    experiment_name (str):
        Name of the experiment.

    dataset_path (str):
        Dataset path to read data.

    sliding_window_size (int):
        Number of days included in the artifact graph.
    """
    G = HistoryGraph(dataset_path, sliding_window_size)

    date_to_leaving_developers = find_leaving_developers(G)

    log_path = "logs/{}.log".format(experiment_name)
    print_log(
        "Started (Total iterations: {}).\n".format(G.get_num_iterations()),
        log_path,
        mode="w",
    )

    start = datetime.now()
    # Start iterations
    date_to_results = {}
    step = 0
    while True:
        step += 1

        date = G.get_last_included_date()
        date_to_results[date] = {
            "developers": G.get_developers(),
            "top_committers": G.get_top_committers(),
            "jacks": G.get_jacks(),
            "mavens": G.get_mavens(),
            "connectors": G.get_connectors(),
            "last_jack": G.find_last_sig_jack(),
            "last_maven": G.find_last_sig_maven(),
            "last_connector": G.find_last_sig_connector(),
            "num_files": G.get_num_files_in_project(),
            "num_reachable_files": G.get_num_reachable_files(),
            "num_rare_files": G.get_num_rare_files(),
            "replacements": {
                dev: G.find_replacement(dev)
                for dev in date_to_leaving_developers.get(date, [])
            },
        }

        for alpha in alpha_list:
            date_to_results[date]["balanced_or_hero_{}".format(
                alpha)] = G.balanced_or_hero(alpha=alpha)

        print_log("{} -> {} nodes\n".format(step, G.get_num_nodes()), log_path)

        if not G.forward_graph_one_day():
            break

    end = datetime.now()
    print_log("Ended.(Time taken: {})\n".format(end - start), log_path)

    dump_results(experiment_name, date_to_results)
    print_log("Exported results to 'results/{}.pkl'".format(experiment_name),
              log_path)
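
Example #4 reads these results back via load_results(exp_name). A short sketch of inspecting one exported run (assuming load_results returns the date_to_results dict dumped above; the experiment name is a placeholder):

date_to_results = load_results("pig_sws365")
for date, results in date_to_results.items():
    if results["replacements"]:  # days on which at least one developer left
        print(date, results["replacements"])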
Example #6
def scalability_experiment_rq2(project_name):
    """
    First, find leaving developers, then create a graph (with default parameters)
    and find replacements for the leaving developers. Along the way, collect some
    statistics and return them.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    Returns
    -------
    tuple:
        Tuple of experiment name, node statistics, average number of nodes, edge
        statistics, average number of edges, average time taken and total number
        of leaving developers.
    """
    experiment_name = get_exp_name(project_name)
    dataset_path = get_dataset_path(project_name)

    G = HistoryGraph(dataset_path)
    date_to_leaving_developers = find_leaving_developers(G)

    # Start iterations
    num_leaving_developers = 0
    total_num_nodes = 0
    total_num_edges = 0
    node_stat = {"Developer": 0, "Issue": 0, "ChangeSet": 0, "File": 0}
    edge_stat = {"commit": 0, "include": 0, "link": 0}
    time_taken = 0
    for date, leaving_developers in date_to_leaving_developers.items():
        G.forward_until(date)
        for leaving_developer in leaving_developers:
            num_leaving_developers += 1

            for node_type in node_stat:
                node_stat[node_type] += len(G._filter_nodes_by_kind(node_type))

            for edge_type in edge_stat:
                edge_stat[edge_type] += len(G._filter_edges_by_kind(edge_type))

            total_num_nodes += G.get_num_nodes()
            total_num_edges += G.get_num_edges()

            t_start = perf_counter()
            G.find_replacement(leaving_developer)
            t_end = perf_counter()
            time_taken += t_end - t_start

    for node_type in node_stat:
        node_stat[node_type] = round(node_stat[node_type] /
                                     num_leaving_developers)

    for edge_type in edge_stat:
        edge_stat[edge_type] = round(edge_stat[edge_type] /
                                     num_leaving_developers)

    avg_num_nodes = round(total_num_nodes / num_leaving_developers)
    avg_num_edges = round(total_num_edges / num_leaving_developers)
    avg_time_taken = time_taken / num_leaving_developers

    return (
        experiment_name,
        node_stat,
        avg_num_nodes,
        edge_stat,
        avg_num_edges,
        avg_time_taken,
        num_leaving_developers,
    )
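
A sketch of consuming the returned tuple (the project name is a placeholder):

(exp_name, node_stat, avg_num_nodes, edge_stat, avg_num_edges,
 avg_time_taken, num_leaving_developers) = scalability_experiment_rq2("pig")
print("{}: {} leaving developers, {:.4f}s per replacement search".format(
    exp_name, num_leaving_developers, avg_time_taken))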
Example #7
def scalability_experiment_rq1_rq3(project_name, method_name):
    """
    First, create a graph (with default parameters) for the initial window, then slide
    the window day by day. While sliding, run the given method (`method_name`) for each
    day and keep some statistics, and return them.

    This method is for RQ1 and RQ3 because the given method can't have any
    parameters in this setup.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    method_name (str):
        Name of the method to run in the experiment. It has to match one of the
        methods defined in graph.HistoryGraph, and the given method cannot take
        any parameters. For example, "get_connectors".

    Returns
    -------
    tuple:
        Tuple of experiment name, node statistics, average number of nodes, edge
        statistics, average number of edges, average time taken and total number
        of iterations.
    """
    experiment_name = get_exp_name(project_name)
    dataset_path = get_dataset_path(project_name)

    G = HistoryGraph(dataset_path)

    # Start iterations
    num_iters = 0
    total_num_nodes = 0
    total_num_edges = 0
    node_stat = {"Developer": 0, "Issue": 0, "ChangeSet": 0, "File": 0}
    edge_stat = {"commit": 0, "include": 0, "link": 0}
    time_taken = 0
    while True:
        num_iters += 1
        for node_type in node_stat:
            node_stat[node_type] += len(G._filter_nodes_by_kind(node_type))

        for edge_type in edge_stat:
            edge_stat[edge_type] += len(G._filter_edges_by_kind(edge_type))

        total_num_nodes += G.get_num_nodes()
        total_num_edges += G.get_num_edges()

        t_start = perf_counter()
        eval("G.{}()".format(method_name))
        t_end = perf_counter()
        time_taken += t_end - t_start

        if not G.forward_graph_one_day():
            break

    for node_type in node_stat:
        node_stat[node_type] = round(node_stat[node_type] / num_iters)

    for edge_type in edge_stat:
        edge_stat[edge_type] = round(edge_stat[edge_type] / num_iters)

    avg_num_nodes = round(total_num_nodes / num_iters)
    avg_num_edges = round(total_num_edges / num_iters)
    avg_time_taken = time_taken / num_iters

    return (
        experiment_name,
        node_stat,
        avg_num_nodes,
        edge_stat,
        avg_num_edges,
        avg_time_taken,
        num_iters,
    )
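
The same experiment can be repeated over several parameterless HistoryGraph methods; a sketch using method names that appear in the examples above (the project name is a placeholder):

for method_name in ("get_jacks", "get_mavens", "get_connectors"):
    (exp_name, node_stat, avg_num_nodes, edge_stat, avg_num_edges,
     avg_time_taken, num_iters) = scalability_experiment_rq1_rq3(
         "pig", method_name)
    print("{} / {}: {:.4f}s per day over {} days".format(
        exp_name, method_name, avg_time_taken, num_iters))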