def main():
    """End-to-end example: mine the commons-lang history, predict co-change
    links, and evaluate them against the ground-truth CSV."""
    example_path = os.path.dirname(__file__)
    output_dir = f"{example_path}/output"
    input_dir = f"{example_path}/input"

    # 0. Prepare target file and tool
    with zipfile.ZipFile(f"{input_dir}/commons_lang.zip", "r") as target_zip:
        target_zip.extractall(input_dir)

    # 1. Mining
    repository_path = f"{input_dir}/commons_lang"
    miner = DataMiner(output_dir, repository_path)
    miner.mining(start_date=datetime(2020, 10, 1))

    # 2. Predicting
    database_path = f"{output_dir}/commons_lang.db"
    TraceabilityPredictor(database_path).run(
        LinkStrategy.COCHANGE, LinkBase.FOR_COMMITS
    )

    # 3. Evaluating
    ground_truth_csv_path = f"{input_dir}/gt_commons_lang.csv"
    evaluator = LinkEvaluator(database_path, ground_truth_csv_path)
    print(
        evaluator.precision_recall_and_f1_score_of_strategy(
            "links_commits_based_cochanged"
        )
    )
def print_apriori_report() -> None:
    """Print precision, recall, and F1 for the two Apriori-based strategies."""
    evaluate_report = LinkEvaluator(path_to_db, path_to_csv)
    print(
        evaluate_report.precision_recall_and_f1_score_of_strategy(
            "apriori_for_commits"
        )
    )
    print(
        evaluate_report.precision_recall_and_f1_score_of_strategy(
            "apriori_for_weeks"
        )
    )
def print_complete_report() -> None:
    """Print precision, recall, and F1 for every link-prediction strategy."""
    evaluate_report = LinkEvaluator(path_to_db, path_to_csv)
    strategies = (
        "links_commits_based_apriori",
        "links_commits_based_cochanged",
        "links_commits_based_cocreated",
        "links_weeks_based_apriori",
        "links_weeks_based_cochanged",
        "links_weeks_based_cocreated",
    )
    for strategy in strategies:
        print(
            f"{strategy}: ",
            evaluate_report.precision_recall_and_f1_score_of_strategy(strategy),
        )
def draw_scatter_figure_for_coordinates_for_methods_and_commits_type():
    """Draw a 3D scatter of (method, commit, change type) coordinates for
    test methods and tested methods."""
    fig = plt.figure(num=1, figsize=(15, 15))
    axes = fig.add_subplot(111, projection="3d")

    def draw_3d_scatter(
        by_3d_coordinates: List[Tuple[int, int, int]],
        color: str,
        marker: str,
        size: int,
    ):
        xs, ys, zs = list(), list(), list()
        for x, y, z in by_3d_coordinates:
            xs.append(x)
            ys.append(y)
            zs.append(z)
        axes.scatter(
            np.array(xs), np.array(ys), np.array(zs), c=color, marker=marker, s=size
        )

    coordinates = LinkEvaluator(
        path_to_db, path_to_csv
    ).coordinates_for_methods_commits()
    draw_3d_scatter(coordinates.coordinates_for_test, "r", ".", 1)
    draw_3d_scatter(coordinates.coordinates_for_tested, "b", "^", 1)
    print(coordinates.package_name_x_table)
    print(coordinates.commit_hash_y_table)
    print(coordinates.change_type_z_table)
    plt.title("sale")
    plt.xlabel("method_id")
    plt.ylabel("commit_id")
    plt.show()
def draw_3D_for_co_changed():
    """Draw predicted, valid-predicted, and ground-truth co-change links in 3D."""
    evaluator = LinkEvaluator(path_to_db, path_to_csv)
    evaluation = evaluator.raw_links_for_predicated_and_ground_truth(
        "apriori_for_weeks"
    )
    num_of_links = len(evaluation.ground_truth_links)
    gt_links_id_pair = list(evaluation.ground_truth_links.keys())
    pd_links_dict = evaluation.predict_links
    valid_pd_links_dict = evaluation.valid_predict_links

    # Score each ground-truth link: 0.0 when it was not among the valid predictions.
    predicted_links: List[float] = list()
    for link in gt_links_id_pair:
        if link not in valid_pd_links_dict:
            predicted_links.append(0.0)
        else:
            predicted_links.append(valid_pd_links_dict[link])

    fig = plt.figure(num=1, figsize=(15, 15))
    ax1 = fig.add_subplot(111, projection="3d")
    __draw_scatter_from_dict(ax1, pd_links_dict, "b", ".", 1)
    __draw_scatter_from_dict(ax1, valid_pd_links_dict, "r", "^", 10)
    __draw_scatter_from_dict(ax1, evaluation.ground_truth_links, "g", "^", 20)
    plt.title("sale")
    plt.xlabel("tested_id")
    plt.ylabel("test_id")
    plt.show()
def theory_max_precision():
    """Estimate the theoretical maximum precision: greedily pick a small set of
    commits that still covers every co-changed method pair, then count the
    candidate links those commits would predict."""
    evaluator = LinkEvaluator(path_to_db, path_to_csv)
    report = evaluator.co_changed_commits()
    co_changes_commits = report.co_changes_commits

    # Group co-changed method pairs by the commit hash they occurred in.
    commit_method_pairs: Dict[str, Set[Tuple[int, int]]] = dict()
    for method_pair, commit_ids in co_changes_commits.items():
        for commit_id in commit_ids:
            commit_hash = report.from_commit_id_to_hash(commit_id)
            commit_method_pairs.setdefault(commit_hash, set())
            commit_method_pairs[commit_hash].add(method_pair)

    # Keep only commits whose ground-truth change count lies in [3, 14].
    path_to_gt_count = f"{path_to_tmp}/table_for_ground_truth_occurred_commits.csv"
    csv_data = pd.read_csv(path_to_gt_count, index_col=0)
    commits_count: Dict[str, int] = dict()
    for commits in co_changes_commits.values():
        for commit_id in commits:
            commit_hash = report.from_commit_id_to_hash(commit_id)
            commit_count = csv_data.loc[commit_hash].iloc[-2]
            if commit_hash in commits_count:
                continue
            if commit_count > 14 or commit_count < 3:
                continue
            commits_count[commit_hash] = commit_count

    # Greedy cover: walk commits by ascending change count and keep a commit
    # only if it contributes at least one method pair not yet covered.
    sorted_commits = sorted(commits_count.keys(),
                            key=lambda hash_val: commits_count[hash_val])
    min_commits = set()
    cur_methods_scope = set()
    for commit_hash in sorted_commits:
        method_pairs = commit_method_pairs[commit_hash]
        if cur_methods_scope.issuperset(method_pairs):
            continue
        cur_methods_scope.update(method_pairs)
        min_commits.add(commit_hash)

    db_connection = sqlite3.connect(path_to_db)

    def find_all_links_in_commits(commit_hash: str) -> Set[Tuple[int, int]]:
        db_cursor = db_connection.cursor()
        exe_rst = db_cursor.execute(
            """
            WITH test_methods AS (
                SELECT id FROM git_methods
                WHERE file_path LIKE 'src/test/java/org/apache/commons/lang3%'
            ), tested_functions AS (
                SELECT id FROM git_methods
                WHERE file_path LIKE 'src/main/java/org/apache/commons/lang3%'
            ), changes_test_in_commits AS (
                SELECT target_method_id AS test_id FROM git_changes
                WHERE commit_hash = :commit_hash
                  AND target_method_id IN test_methods
            ), changes_tested_in_commits AS (
                SELECT target_method_id AS tested_id FROM git_changes
                WHERE commit_hash = :commit_hash
                  AND target_method_id IN tested_functions
            )
            SELECT DISTINCT tested_id, test_id
            FROM changes_test_in_commits
            LEFT OUTER JOIN changes_tested_in_commits
            """,
            {"commit_hash": commit_hash},
        )
        return {
            (int(row[0]), int(row[1]))
            for row in exe_rst.fetchall()
            # Skip unmatched rows from the outer join (NULL tested_id or test_id).
            if row is not None and len(row) == 2
            and row[0] is not None and row[1] is not None
        }

    predicated_pairs = set()
    test_ids = set(report.test_changed_commits.keys())
    for commit_hash in min_commits:
        candidate = find_all_links_in_commits(commit_hash)
        for pair in candidate:
            if pair[1] in test_ids:
                predicated_pairs.add(pair)
    print(len(predicated_pairs))
def draw_2d_scatter_for_commits_distributions():
    """Draw 2D scatter plots of change counts per commit for files, classes,
    methods, tests, and tested code."""

    def draw_2d_scatter_for_type_z(
        axes: Axes,
        by_3d_coordinates: List[Tuple[int, int, int]],
        title: str,
        y_max: Optional[int] = None,
    ) -> None:
        added: Tuple[List[int], List[int]] = (list(), list())
        modified: Tuple[List[int], List[int]] = (list(), list())
        renamed: Tuple[List[int], List[int]] = (list(), list())
        for change_commit, change_count, change_type in by_3d_coordinates:
            if y_max is not None and change_count > y_max:
                print(
                    f"IGNORE COMMITS({change_commit}), SIZE ({change_count}), "
                    f"TYPE({change_type})."
                )
                continue
            if change_type == 1:
                added[0].append(change_commit)
                added[1].append(change_count)
            elif change_type == 2:
                modified[0].append(change_commit)
                modified[1].append(change_count)
            elif change_type == 3:
                renamed[0].append(change_commit)
                renamed[1].append(change_count)
        axes.scatter(np.array(added[0]), np.array(added[1]), c="r", marker=".", s=1)
        axes.scatter(np.array(modified[0]), np.array(modified[1]), c="b", marker=".", s=1)
        axes.scatter(np.array(renamed[0]), np.array(renamed[1]), c="g", marker=".", s=1)
        axes.set_xlabel("commits_id(chronological)")
        axes.set_ylabel("number of changes (R: ADD, B: MODIFY, G: RENAME)")
        axes.set_title(title)

    evaluate_report = LinkEvaluator(path_to_db, path_to_csv)
    files = (
        evaluate_report.coordinates_for_files_changes_distribution_of_commits()
        .commits_count_coordinates
    )
    classes = (
        evaluate_report.coordinates_for_classes_changes_distribution_of_commits()
        .commits_count_coordinates
    )
    methods = (
        evaluate_report.coordinates_for_methods_changes_distribution_of_commits()
        .commits_count_coordinates
    )
    test = (
        evaluate_report.coordinates_for_test_changes_distribution_of_commits()
        .commits_count_coordinates
    )
    tested = (
        evaluate_report.coordinates_for_tested_changes_distribution_of_commits()
        .commits_count_coordinates
    )

    fig = plt.figure(num=5, figsize=(25, 25))
    draw_2d_scatter_for_type_z(fig.add_subplot(511), files, "changes for files", 5)
    draw_2d_scatter_for_type_z(fig.add_subplot(512), classes, "changes for classes", 5)
    draw_2d_scatter_for_type_z(fig.add_subplot(513), methods, "changes for methods", 25)
    draw_2d_scatter_for_type_z(fig.add_subplot(514), test, "changes for test", 25)
    draw_2d_scatter_for_type_z(fig.add_subplot(515), tested, "changes for tested", 25)
    plt.show()
def output_to_csv() -> None:
    """Export the predicted links to a CSV file."""
    evaluate_report = LinkEvaluator(path_to_db, path_to_csv)
    evaluate_report.output_predict_to_csv()
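

# A minimal entry-point sketch (an assumption, not part of the original file):
# it only runs the end-to-end example above. Swap in print_apriori_report(),
# print_complete_report(), one of the draw_* helpers, or output_to_csv() once
# path_to_db, path_to_csv, and path_to_tmp point at an existing mining result.
if __name__ == "__main__":
    main()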