def run_experiment(experiment_name, dataset_path):
    """
    Run experiment with default parameters and export results into a pickle
    file. First, create a graph for the initial window, then slide that
    window day by day. Find developers, mavens, connectors and jacks for
    each iteration.

    Parameters
    ----------
    experiment_name (str):
        Name of the experiment.

    dataset_path (str):
        Dataset path to read data.
    """
    graph = HistoryGraph(dataset_path)

    log_path = "logs/{}.log".format(experiment_name)
    print_log(
        "Started (Total iterations: {}).\n".format(graph.get_num_iterations()),
        log_path,
        mode="w",
    )

    # Slide the window one day at a time, snapshotting the developer roles
    # for every last-included date until the graph runs out of days.
    result = {}
    iteration = 0
    keep_going = True
    while keep_going:
        iteration += 1
        result[graph.get_last_included_date()] = {
            "developers": graph.get_developers(),
            "jacks": graph.get_jacks(),
            "mavens": graph.get_mavens(),
            "connectors": graph.get_connectors(),
        }
        print_log(
            "{} -> {} nodes\n".format(iteration, graph.get_num_nodes()), log_path
        )
        keep_going = graph.forward_graph_one_day()

    print_log("Ended.\n", log_path)

    # Persist the per-date snapshots for later analysis.
    with open("results/{}.pkl".format(experiment_name), "wb") as f:
        pickle.dump(result, f)
    print_log(
        "Exported results to 'results/{}.pkl'".format(experiment_name), log_path
    )
def average_number_of_developers():
    """
    Print a table with the average number of developers in the graph,
    one row per sliding window size and one column per project.
    """
    print("\n*** Average Number of Developer ***\n")

    # Header row: "SWS" label followed by one column per project.
    print("{:<15}".format("SWS"), end="")
    print(("{:<15}" * len(project_list)).format(*project_list))

    for window_size in sws_list:
        print("{:<15}".format(window_size), end="")
        for project in project_list:
            graph = HistoryGraph(
                get_dataset_path(project), sliding_window_size=window_size
            )
            # Collect the developer count for every position of the window.
            counts = []
            more_days = True
            while more_days:
                counts.append(len(graph.get_developers()))
                more_days = graph.forward_graph_one_day()
            print("{:<15.2f}".format(sum(counts) / len(counts)), end="")
        print()
    print()
def leaving_developers_table():
    """
    Print a table with the number of leaving developers, one row per
    absence limit and one column per project.
    """
    print("\n*** Number of Leaving Developers ***\n")
    print("Absence Limit ", ("{:<15}" * len(project_list)).format(*project_list))
    for limit in sws_list:
        print("{:<15}".format(limit), end="")
        for project in project_list:
            graph = HistoryGraph(
                get_dataset_path(project), sliding_window_size=limit
            )
            # Total leavers across all dates (each value is a list of devs).
            num_leaving = sum(
                len(devs) for devs in find_leaving_developers(graph).values()
            )
            print("{:<15}".format(num_leaving), end="")
        print()
    print()
def validation(project_name, sliding_window_size, check_days, max_k, random_val):
    """
    Perform validation with given parameters.

    Parameters
    ----------
    project_name (str):
        Name of the project to read change sets.

    sliding_window_size (int):
        Number of days to include the graph.

    check_days (list):
        List of integers to check if recommendations are true or false.

    max_k (int):
        Maximum k for topk and MRR calculations. When max_k is 3, top1,
        top2 and top3 will be calculated, and the ranks in MRR
        calculations can be 1, 2 and 3.

    random_val (bool):
        If True, `max_k` replacements will be selected randomly.

    Returns
    -------
    list:
        First item of the list is the name of the experiment. Second and
        the following items will include accuracy and MRR for each check
        day. For example, returns [pig_sws365, (7, {top1:.5, top2:.7,
        mrr:.6}), (30, {top1:.6, top2:.9, mrr:.7})].
    """
    dataset_path = get_dataset_path(project_name)
    exp_name = get_exp_name(project_name, sws=sliding_window_size)
    dm = DataManager(dataset_path, None)  # No need for sliding window size
    G = HistoryGraph(dataset_path, sliding_window_size)

    check_day_to_ranks = {check_day: [] for check_day in check_days}
    date_to_results = load_results(exp_name)
    for date, results in date_to_results.items():
        if not results["replacements"]:  # No leaving developer
            continue

        G.forward_until(date)  # Update graph

        for leaving_dev, recommended_devs in results["replacements"].items():
            if not recommended_devs:  # No recommended developers
                continue

            if random_val:
                # Randomly select "max_k" developers.
                # BUGFIX: build a copy instead of calling list.remove on
                # results["developers"] — the old in-place removal mutated
                # the loaded results dict and shrank the candidate pool for
                # every subsequent leaving developer on the same date.
                other_devs = [
                    dev for dev in results["developers"] if dev != leaving_dev
                ]
                # Guard against fewer candidates than max_k, which would
                # make random.sample raise ValueError.
                recommended_devs = random.sample(
                    other_devs, min(max_k, len(other_devs))
                )
            else:
                # Convert dictionary keys to list and get first "max_k" items
                recommended_devs = list(recommended_devs)[:max_k]

            leaving_dev_files = set(G.find_reachable_files(leaving_dev))
            for check_day in check_days:
                # Get the change sets in the next days.
                # For example, get the change sets in the next 7 days if
                # check day is 7.
                change_sets = dm.get_specific_window(
                    date + timedelta(days=1), date + timedelta(days=check_day)
                )

                rank = float("inf")  # Not found yet
                for i, recommended_dev in enumerate(recommended_devs):
                    recommended_dev_files = set(
                        G.find_reachable_files(recommended_dev)
                    )
                    # Find the files that the leaving developer can reach
                    # but the recommended developer cannot reach
                    target_files = leaving_dev_files - recommended_dev_files
                    if check_modification(
                        change_sets, recommended_dev, target_files
                    ):
                        rank = i + 1
                        break  # No need to check other developers
                check_day_to_ranks[check_day].append(rank)

    ret_items = [exp_name]
    for check_day in check_days:
        res = {}
        for k in range(1, max_k + 1):
            res["top{}".format(k)] = cal_accuracy(
                check_day_to_ranks[check_day], k
            )
        res["mrr"] = cal_mrr(check_day_to_ranks[check_day])
        ret_items.append((check_day, res))
    return ret_items
def run_experiment(experiment_name, dataset_path, sliding_window_size):
    """
    Run experiment with default parameters and export results into a pickle
    file. First, create a graph for the initial window, then slide that
    window day by day. Find developers, mavens, connectors, jacks and
    knowledge distribution labels for each iteration (day). Also, find
    replacements if any developers leave the project.

    Parameters
    ----------
    experiment_name (str):
        Name of the experiment.

    dataset_path (str):
        Dataset path to read data.

    sliding_window_size (int):
        Number of days included to the artifact graph.
    """
    graph = HistoryGraph(dataset_path, sliding_window_size)
    date_to_leaving_developers = find_leaving_developers(graph)

    log_path = "logs/{}.log".format(experiment_name)
    print_log(
        "Started (Total iterations: {}).\n".format(graph.get_num_iterations()),
        log_path,
        mode="w",
    )
    start = datetime.now()

    # Slide the window one day at a time and snapshot every metric for the
    # last included date of each position.
    date_to_results = {}
    step = 0
    while True:
        step += 1
        date = graph.get_last_included_date()

        snapshot = {
            "developers": graph.get_developers(),
            "top_committers": graph.get_top_committers(),
            "jacks": graph.get_jacks(),
            "mavens": graph.get_mavens(),
            "connectors": graph.get_connectors(),
            "last_jack": graph.find_last_sig_jack(),
            "last_maven": graph.find_last_sig_maven(),
            "last_connector": graph.find_last_sig_connector(),
            "num_files": graph.get_num_files_in_project(),
            "num_reachable_files": graph.get_num_reachable_files(),
            "num_rare_files": graph.get_num_rare_files(),
            # Replacements only exist on dates when someone leaves.
            "replacements": {
                dev: graph.find_replacement(dev)
                for dev in date_to_leaving_developers.get(date, [])
            },
        }
        # One balanced-vs-hero label per configured alpha.
        for alpha in alpha_list:
            snapshot["balanced_or_hero_{}".format(alpha)] = graph.balanced_or_hero(
                alpha=alpha
            )
        date_to_results[date] = snapshot

        print_log(
            "{} -> {} nodes\n".format(step, graph.get_num_nodes()), log_path
        )

        if not graph.forward_graph_one_day():
            break
    end = datetime.now()

    print_log("Ended.(Time taken: {})\n".format(end - start), log_path)
    dump_results(experiment_name, date_to_results)
    print_log(
        "Exported results to 'results/{}.pkl'".format(experiment_name), log_path
    )
def scalability_experiment_rq2(project_name):
    """
    First, find leaving developers, then create a graph (with default
    parameters) and find replacements for leaving developers. At the same
    time keep some statistics, and return them.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    Returns
    -------
    tuple:
        Tuple of experiment name, node statistics, average number of nodes,
        edge statistics, average number of edges, average time taken and
        total number of recommended replacements.
    """
    experiment_name = get_exp_name(project_name)
    graph = HistoryGraph(get_dataset_path(project_name))
    date_to_leaving_developers = find_leaving_developers(graph)

    # Accumulators, sampled once per replacement recommendation.
    num_leaving_developers = 0
    total_num_nodes = 0
    total_num_edges = 0
    node_stat = dict.fromkeys(("Developer", "Issue", "ChangeSet", "File"), 0)
    edge_stat = dict.fromkeys(("commit", "include", "link"), 0)
    time_taken = 0

    for date, leaving_developers in date_to_leaving_developers.items():
        graph.forward_until(date)

        for developer in leaving_developers:
            num_leaving_developers += 1

            for kind in node_stat:
                node_stat[kind] += len(graph._filter_nodes_by_kind(kind))
            for kind in edge_stat:
                edge_stat[kind] += len(graph._filter_edges_by_kind(kind))
            total_num_nodes += graph.get_num_nodes()
            total_num_edges += graph.get_num_edges()

            # Time only the replacement search itself.
            t0 = perf_counter()
            graph.find_replacement(developer)
            time_taken += perf_counter() - t0

    # Convert sums to per-replacement averages.
    for kind in node_stat:
        node_stat[kind] = round(node_stat[kind] / num_leaving_developers)
    for kind in edge_stat:
        edge_stat[kind] = round(edge_stat[kind] / num_leaving_developers)
    avg_num_nodes = round(total_num_nodes / num_leaving_developers)
    avg_num_edges = round(total_num_edges / num_leaving_developers)
    avg_time_taken = time_taken / num_leaving_developers

    return (
        experiment_name,
        node_stat,
        avg_num_nodes,
        edge_stat,
        avg_num_edges,
        avg_time_taken,
        num_leaving_developers,
    )
def scalability_experiment_rq1_rq3(project_name, method_name):
    """
    First, create a graph (with default parameters) for the initial window,
    then slide the window day by day. While sliding, run the given method
    (`method_name`) for each day and keep some statistics, and return them.

    This method is for RQ1 and RQ3 because the given method can't have any
    parameters in this setup.

    Parameters
    ----------
    project_name (str):
        Name of the project.

    method_name (str):
        Name of the method to run in the experiment. It has to match one of
        the methods defined in graph.HistoryGraph. Also, the given method
        cannot have any parameters. For example, "get_connectors".

    Returns
    -------
    tuple:
        Tuple of experiment name, node statistics, average number of nodes,
        edge statistics, average number of edges, average time taken and
        total number of iterations.
    """
    experiment_name = get_exp_name(project_name)
    dataset_path = get_dataset_path(project_name)
    G = HistoryGraph(dataset_path)

    # FIX: resolve the method by name with getattr instead of the previous
    # eval() on a formatted string — safer (no code execution from a string),
    # fails fast with AttributeError on a misspelled name, and keeps string
    # compilation overhead out of the timed region. Resolving once is enough:
    # the bound method stays attached to G as the graph advances.
    method = getattr(G, method_name)

    # Start iterations
    num_iters = 0
    total_num_nodes = 0
    total_num_edges = 0
    node_stat = {"Developer": 0, "Issue": 0, "ChangeSet": 0, "File": 0}
    edge_stat = {"commit": 0, "include": 0, "link": 0}
    time_taken = 0
    while True:
        num_iters += 1

        for node_type in node_stat:
            node_stat[node_type] += len(G._filter_nodes_by_kind(node_type))
        for edge_type in edge_stat:
            edge_stat[edge_type] += len(G._filter_edges_by_kind(edge_type))
        total_num_nodes += G.get_num_nodes()
        total_num_edges += G.get_num_edges()

        # Time only the call to the method under test.
        t_start = perf_counter()
        method()
        t_end = perf_counter()
        time_taken += t_end - t_start

        if not G.forward_graph_one_day():
            break

    # Convert sums to per-iteration averages.
    for node_type in node_stat:
        node_stat[node_type] = round(node_stat[node_type] / num_iters)
    for edge_type in edge_stat:
        edge_stat[edge_type] = round(edge_stat[edge_type] / num_iters)
    avg_num_nodes = round(total_num_nodes / num_iters)
    avg_num_edges = round(total_num_edges / num_iters)
    avg_time_taken = time_taken / num_iters

    return (
        experiment_name,
        node_stat,
        avg_num_nodes,
        edge_stat,
        avg_num_edges,
        avg_time_taken,
        num_iters,
    )