def _remove_all_result_files(ctx: click.Context, error: bool) -> None:
    """Remove all result files of the current paper_config."""
    result_folders = _find_result_dir_paths_of_projects(
        ctx.obj["case_studies"]
    )
    for folder in result_folders:
        for res_file in folder.iterdir():
            report_file = ReportFilename(res_file.name)
            if not report_file.is_result_file():
                continue
            if ctx.obj["experiment"] and not ctx.obj[
                    "experiment"].file_belongs_to_experiment(res_file.name):
                continue
            if ctx.obj["report"] and not ctx.obj[
                    "report"].is_correct_report_type(res_file.name):
                continue

            commit_hash = report_file.commit_hash
            if any(
                case_study.has_revision(commit_hash)
                for case_study in ctx.obj["case_studies"]
            ):
                if error and not (
                    report_file.has_status_compileerror() or
                    report_file.has_status_failed()
                ):
                    continue
                res_file.unlink()
def __get_result_files_dict(
    project_name: str, result_file_type: tp.Type[BaseReport]
) -> tp.Dict[ShortCommitHash, tp.List[Path]]:
    """
    Returns a dict that maps the commit_hash to a list of all result files,
    of type result_file_type, for that commit.

    Args:
        project_name: target project
        result_file_type: the type of the result file
    """
    res_dir = Path(f"{vara_cfg()['result_dir']}/{project_name}/")

    # Maps commit hash -> list of result files (success or fail).
    result_files: tp.DefaultDict[ShortCommitHash,
                                 tp.List[Path]] = defaultdict(list)
    if not res_dir.exists():
        return result_files

    for res_file in res_dir.iterdir():
        report_file = ReportFilename(res_file)
        if report_file.is_result_file(
        ) and result_file_type.is_correct_report_type(res_file.name):
            commit_hash = report_file.commit_hash
            result_files[commit_hash].append(res_file)

    return result_files
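# A minimal usage sketch for the helper above -- counting how many result
# files (successful or failed) each revision of a project has accumulated.
# ``count_result_files_per_revision`` is a hypothetical helper, the
# CommitReport import path is an assumption, and ``__get_result_files_dict``
# is only reachable from within this module.
from varats.data.reports.commit_report import CommitReport  # assumed path


def count_result_files_per_revision(project_name: str) -> tp.Dict[str, int]:
    """Map each short commit hash to its number of result files."""
    result_files = __get_result_files_dict(project_name, CommitReport)
    return {
        commit_hash.hash: len(paths)
        for commit_hash, paths in result_files.items()
    }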
def test_get_newest_result_files_for_case_study_fail(self) -> None:
    """Check that when we have two files, the newest one gets selected."""
    vara_cfg()['paper_config']['current_config'] = "test_revision_lookup"
    load_paper_config()

    bad_file = ReportFilename(
        'CRE-CR-brotli-brotli-21ac39f7c8_'
        '34d4d1b5-7212-4244-9adc-b19bff599cf1_success.yaml'
    )

    now = datetime.now().timestamp()
    file_path = Path(
        str(vara_cfg()['result_dir'])
    ) / 'brotli' / bad_file.filename
    os.utime(file_path, (now, now))

    newest_res_files = MCS.get_newest_result_files_for_case_study(
        get_paper_config().get_case_studies('brotli')[0],
        Path(vara_cfg()['result_dir'].value), CR
    )

    # Remove unrelated result files.
    filtered_newest_res_files = list(
        filter(
            lambda res_file: res_file.commit_hash == bad_file.commit_hash,
            map(ReportFilename, newest_res_files)
        )
    )

    self.assertFalse(filtered_newest_res_files[0].uuid.endswith('42'))
def test_get_commit(self):
    """Check if the correct commit hash is returned."""
    self.assertEqual(
        ReportFilename(self.success_filename).commit_hash,
        ShortCommitHash("7bb9ef5f8c")
    )
    self.assertEqual(
        ReportFilename(self.fail_filename).commit_hash,
        ShortCommitHash("7bb9ef5f8c")
    )
@classmethod
def setUpClass(cls):
    """Set up file and CommitReport."""
    cls.correct_UUID = "fdb09c5a-4cee-42d8-bbdc-4afe7a7864be"
    cls.raw_filename = (
        "CRE-CR-foo-bar-7bb9ef5f8c_"
        f"{cls.correct_UUID}_"
        "success.txt"
    )
    cls.report_filename = ReportFilename(cls.raw_filename)
    cls.broken_report_filename = ReportFilename("ThisFileIsWrong.foobar")
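def test_filename_fields_sketch(self):
    """Hedged example (not from the original suite): read back the fields
    encoded in ``cls.raw_filename``. Accessor names are taken from the other
    tests in this code base (``commit_hash``, ``uuid``, status checks); the
    field breakdown -- experiment shorthand, report shorthand, project,
    binary, commit hash, run uuid, file status -- is an assumption inferred
    from the fixture names."""
    report_filename = ReportFilename(self.raw_filename)
    # "CRE-CR-foo-bar-7bb9ef5f8c_<uuid>_success.txt"
    self.assertEqual(
        report_filename.commit_hash, ShortCommitHash("7bb9ef5f8c")
    )
    self.assertEqual(report_filename.uuid, self.correct_UUID)
    self.assertTrue(report_filename.has_status_success())
    self.assertTrue(report_filename.is_result_file())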
def test_file_status(self):
    """Check if the correct file status is returned for BaseReport names."""
    self.assertTrue(
        ReportFilename(self.success_filename).has_status_success()
    )
    self.assertFalse(
        ReportFilename(self.fail_filename).has_status_success()
    )
    self.assertTrue(ReportFilename(self.fail_filename).has_status_failed())
    self.assertFalse(
        ReportFilename(self.success_filename).has_status_failed()
    )
def test_is_result_file(self):
    """Check if the result file matcher works."""
    self.assertTrue(ReportFilename(self.success_filename).is_result_file())
    self.assertTrue(ReportFilename(self.fail_filename).is_result_file())
    self.assertFalse(
        ReportFilename(
            self.success_filename.replace("_", "")
        ).is_result_file()
    )
    self.assertFalse(
        ReportFilename(self.fail_filename.replace("-", "")).is_result_file()
    )
def __get_tag_for_revision(
    revision: ShortCommitHash,
    file_list: tp.List[Path],
    project_cls: tp.Type[Project],
    result_file_type: tp.Type[BaseReport],
    tag_blocked: bool = True
) -> FileStatusExtension:
    """
    Calculates the file status for a revision.

    Args:
        revision: the revision to get the status for
        file_list: the list of result files for the revision
        project_cls: the project class the revision belongs to
        result_file_type: the report type to be considered
        tag_blocked: whether to tag blocked revisions as
                     ``FileStatusExtension.BLOCKED``

    Returns:
        the status for the revision
    """
    if tag_blocked and is_revision_blocked(revision, project_cls):
        return FileStatusExtension.BLOCKED

    newest_res_file = max(file_list, key=lambda x: x.stat().st_mtime)
    if result_file_type.is_correct_report_type(newest_res_file.name):
        return ReportFilename(str(newest_res_file)).file_status

    return FileStatusExtension.MISSING
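# Hedged stdlib-only sketch of the "newest file wins" rule applied above:
# among several result files for one revision, only the file with the
# largest mtime determines the revision's status. The file names are only
# loosely modeled on the report-name convention; ``os.utime`` forces
# deterministic timestamps (the same trick the test suite uses).
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    older = Path(tmp) / "CRE-CR-foo-bar-7bb9ef5f8c_run1_failed.txt"
    newer = Path(tmp) / "CRE-CR-foo-bar-7bb9ef5f8c_run2_success.txt"
    older.touch()
    newer.touch()
    os.utime(older, (1_000_000, 1_000_000))  # force an older mtime
    os.utime(newer, (2_000_000, 2_000_000))  # force a newer mtime

    newest = max([older, newer], key=lambda p: p.stat().st_mtime)
    assert newest == newer  # this file's status decides the revision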
def get_newest_result_files_for_case_study(
    case_study: CaseStudy, result_dir: Path, report_type: tp.Type[BaseReport]
) -> tp.List[Path]:
    """
    Return all result files of a specific type that belong to a given case
    study. For revisions with multiple result files, only the newest file is
    selected.

    Args:
        case_study: the case study to load results for
        result_dir: the directory to load the results from
        report_type: type of report that should be loaded

    Returns:
        list of result file paths
    """
    files_to_store: tp.Dict[ShortCommitHash, Path] = {}

    result_dir /= case_study.project_name
    if not result_dir.exists():
        return []

    for opt_res_file in result_dir.iterdir():
        report_file = ReportFilename(opt_res_file.name)
        if report_type.is_correct_report_type(report_file.filename):
            commit_hash = report_file.commit_hash
            if case_study.has_revision(commit_hash):
                current_file = files_to_store.get(commit_hash, None)
                if current_file is None:
                    files_to_store[commit_hash] = opt_res_file
                elif (current_file.stat().st_mtime <
                      opt_res_file.stat().st_mtime):
                    files_to_store[commit_hash] = opt_res_file

    return list(files_to_store.values())
def filter_report_paths_binary(
    report_files: tp.List[Path], binary: ProjectBinaryWrapper
) -> tp.List[Path]:
    return list(
        filter(
            lambda x: ReportFilename(x).binary_name == binary.name,
            report_files
        )
    )
def test_is_result_file(self) -> None:
    """Check if the result file matcher works."""
    self.assertTrue(self.commit_report_success.filename.is_result_file())
    self.assertTrue(self.commit_report_fail.filename.is_result_file())
    self.assertFalse(
        ReportFilename(
            self.commit_report_success.filename.filename.replace("_", "")
        ).is_result_file()
    )
    self.assertFalse(
        ReportFilename(
            self.commit_report_success.filename.filename.replace("-", "")
        ).is_result_file()
    )
    self.assertFalse(
        ReportFilename(
            self.commit_report_success.filename.filename.replace(".", "f")
        ).is_result_file()
    )
def cs_filter(file_name: str) -> bool:
    """
    Filter files that are not in the case study.

    Returns:
        ``True`` if a case_study is set and the commit_hash of the file
        is not part of this case_study, otherwise, ``False``.
    """
    if case_study is None:
        return False

    return not case_study.has_revision(
        ReportFilename(file_name).commit_hash
    )
@classmethod
def _load_dataframe(
    cls, project_name: str, commit_map: CommitMap,
    case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
) -> pd.DataFrame:

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
        report_path: Path
    ) -> tp.Tuple[pd.DataFrame, str, str]:
        report = load_commit_report(report_path)
        cf_head_interactions_raw = report.number_of_head_cf_interactions()
        df_head_interactions_raw = report.number_of_head_df_interactions()

        data_frame = pd.DataFrame(
            {
                'revision': report.head_commit.hash,
                'time_id': commit_map.short_time_id(report.head_commit),
                'CFInteractions': report.number_of_cf_interactions(),
                'DFInteractions': report.number_of_df_interactions(),
                'HEAD CF Interactions':
                    cf_head_interactions_raw[0] +
                    cf_head_interactions_raw[1],
                'HEAD DF Interactions':
                    df_head_interactions_raw[0] + df_head_interactions_raw[1]
            },
            index=[0]
        )
        return data_frame, report.head_commit.hash, str(
            report_path.stat().st_mtime_ns
        )

    report_files = get_processed_revisions_files(
        project_name, CommitReport,
        get_case_study_file_name_filter(case_study)
    )
    failed_report_files = get_failed_revisions_files(
        project_name, CommitReport,
        get_case_study_file_name_filter(case_study)
    )

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_files, failed_report_files,
        create_dataframe_layout, create_data_frame_for_report,
        lambda path: ReportFilename(path).commit_hash.hash,
        lambda path: str(path.stat().st_mtime_ns),
        lambda a, b: int(a) > int(b)
    )

    return data_frame
def build_report_files_tuple(
    project_name: str, case_study: tp.Optional[CaseStudy]
) -> tp.Tuple[tp.Dict[ShortCommitHash, Path], tp.Dict[ShortCommitHash, Path]]:
    """
    Build the mappings from commit hash to the corresponding report file
    path: the first mapping contains the successful report files, the
    second one the failed report files.

    Args:
        project_name: the name of the project
        case_study: the selected CaseStudy

    Returns:
        the mappings from commit hash to successful and failed report files
        as tuple
    """
    report_files: tp.Dict[ShortCommitHash, Path] = {
        ReportFilename(report).commit_hash: report
        for report in get_processed_revisions_files(
            project_name, BlameReport,
            get_case_study_file_name_filter(case_study)
            if case_study else lambda x: False,
        )
    }

    failed_report_files: tp.Dict[ShortCommitHash, Path] = {
        ReportFilename(report).commit_hash: report
        for report in get_failed_revisions_files(
            project_name, BlameReport,
            get_case_study_file_name_filter(case_study)
            if case_study else lambda x: False,
        )
    }

    return report_files, failed_report_files
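# Hedged usage sketch for build_report_files_tuple: pairing successful and
# failed BlameReports per revision. The project name is made up; passing
# case_study=None disables the case-study filter, as defined above.
report_files, failed_report_files = build_report_files_tuple(
    "xz", case_study=None
)
for commit_hash, success_path in report_files.items():
    # failed_path is None unless the same revision also has a failed run.
    failed_path = failed_report_files.get(commit_hash)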
@classmethod
def file_belongs_to_experiment(cls, file_name: str) -> bool:
    """
    Checks if the file belongs to this experiment.

    Args:
        file_name: name of the file to check

    Returns:
        True, if the file belongs to this experiment type
    """
    try:
        other_short_hand = ReportFilename(file_name).experiment_shorthand
        return cls.shorthand() == other_short_hand
    except ValueError:
        return False
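# Hedged usage sketch for file_belongs_to_experiment. "CRE" as an
# experiment shorthand matches the fixture file names used in the tests;
# ``SomeExperiment`` is a hypothetical experiment class.
#
#   assert SomeExperiment.shorthand() == "CRE"
#   assert SomeExperiment.file_belongs_to_experiment(
#       "CRE-CR-foo-bar-7bb9ef5f8c_"
#       "fdb09c5a-4cee-42d8-bbdc-4afe7a7864be_success.txt"
#   )
#   # Malformed names do not raise -- the ValueError from ReportFilename is
#   # swallowed and reported as "does not belong":
#   assert not SomeExperiment.file_belongs_to_experiment("NotAReport.txt")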
def _remove_old_result_files(ctx: click.Context) -> None:
    """Remove result files for which a newer version exists."""
    result_dir = Path(str(vara_cfg()['result_dir']))
    for case_study in ctx.obj['case_studies']:
        old_files: tp.List[Path] = []
        newer_files: tp.Dict[ShortCommitHash, Path] = {}

        result_dir_cs = result_dir / case_study.project_name
        if not result_dir_cs.exists():
            continue

        for opt_res_file in result_dir_cs.iterdir():
            report_file = ReportFilename(opt_res_file.name)
            if not report_file.is_result_file():
                continue
            if ctx.obj["experiment"] and not ctx.obj[
                    "experiment"].file_belongs_to_experiment(
                        opt_res_file.name
                    ):
                continue
            if ctx.obj["report"] and not ctx.obj[
                    "report"].is_correct_report_type(opt_res_file.name):
                continue

            commit_hash = report_file.commit_hash
            if case_study.has_revision(commit_hash):
                current_file = newer_files.get(commit_hash)
                if current_file is None:
                    newer_files[commit_hash] = opt_res_file
                elif (current_file.stat().st_mtime_ns <
                      opt_res_file.stat().st_mtime_ns):
                    newer_files[commit_hash] = opt_res_file
                    old_files.append(current_file)
                else:
                    old_files.append(opt_res_file)

        for file in old_files:
            if file.exists():
                file.unlink()
def get_processed_revisions(
    project_name: str, result_file_type: tp.Type[BaseReport]
) -> tp.List[ShortCommitHash]:
    """
    Calculates a list of revisions of a project that have already been
    processed successfully.

    Args:
        project_name: target project
        result_file_type: the type of the result file

    Returns:
        list of correctly processed revisions
    """
    return [
        ReportFilename(x.name).commit_hash
        for x in get_processed_revisions_files(project_name, result_file_type)
    ]
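# Hedged usage sketch: the two revision lists classify a project's
# revisions by the status of their newest result file. Assuming
# get_processed_revisions_files keeps the default only_newest behaviour,
# no revision can appear in both sets. CommitReport and the project name
# are assumptions.
done = set(get_processed_revisions("brotli", CommitReport))
failed = set(get_failed_revisions("brotli", CommitReport))
assert done.isdisjoint(failed)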
def _get_requested_report_paths(
    project_name: str, szz_report: SZZReport
) -> tp.Dict[ShortCommitHash, Path]:
    bugs = szz_report.get_all_raw_bugs()
    requested_report_revisions: tp.Set[ShortCommitHash] = set()
    for bug in bugs:
        requested_report_revisions.add(
            bug.fixing_commit.to_short_commit_hash()
        )
        requested_report_revisions.update(
            introducer.to_short_commit_hash()
            for introducer in bug.introducing_commits
        )

    report_map: tp.Dict[ShortCommitHash, Path] = {}
    for report_path in get_processed_revisions_files(
        project_name, BlameReport
    ):
        report_revision = ReportFilename(report_path).commit_hash
        if report_revision in requested_report_revisions:
            report_map[report_revision] = report_path

    return report_map
def __get_files_with_status(
    project_name: str,
    result_file_type: tp.Type[BaseReport],
    file_statuses: tp.List[FileStatusExtension],
    file_name_filter: tp.Callable[[str], bool] = lambda x: False,
    only_newest: bool = True
) -> tp.List[Path]:
    """
    Find all file paths to revision files with given file statuses.

    Args:
        project_name: target project
        result_file_type: the type of the result file
        file_statuses: a list of statuses the files should have
        file_name_filter: optional filter to exclude certain files; returns
                          ``True`` if the file_name should not be checked
        only_newest: whether to include all result files, or only the
                     newest; if ``False``, result files for the same
                     revision are sorted descending by the file's mtime

    Returns:
        a list of file paths to matching revision files
    """
    processed_revisions_paths = []

    result_files = __get_result_files_dict(project_name, result_file_type)
    for value in result_files.values():
        sorted_res_files = sorted(
            value, key=lambda x: Path(x).stat().st_mtime, reverse=True
        )
        if only_newest:
            sorted_res_files = [sorted_res_files[0]]
        for result_file in sorted_res_files:
            if file_name_filter(result_file.name):
                continue
            if ReportFilename(result_file.name).file_status in file_statuses:
                processed_revisions_paths.append(result_file)

    return processed_revisions_paths
def get_failed_revisions(
    project_name: str, result_file_type: tp.Type[BaseReport]
) -> tp.List[ShortCommitHash]:
    """
    Calculates a list of revisions of a project that have failed.

    Args:
        project_name: target project
        result_file_type: the type of the result file

    Returns:
        list of failed revisions
    """
    failed_revisions = []

    result_files = __get_result_files_dict(project_name, result_file_type)
    for commit_hash, value in result_files.items():
        newest_res_file = max(value, key=lambda x: Path(x).stat().st_mtime)
        if ReportFilename(newest_res_file.name).has_status_failed():
            failed_revisions.append(commit_hash)

    return failed_revisions
@classmethod
def _load_dataframe(
    cls, project_name: str, commit_map: CommitMap,
    case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
) -> pd.DataFrame:  # pylint: disable=unused-argument

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
        report_path: Path
    ) -> tp.Tuple[pd.DataFrame, str, str]:
        report_file_name_match = re.search(
            BlameVerifierReportDatabase.report_file_name_pattern,
            str(report_path)
        )
        if report_file_name_match:
            report_file_name = report_file_name_match.group()
        else:
            raise RuntimeWarning(
                "report file name could not be read from report path"
            )

        report: tp.Union[BlameVerifierReportOpt, BlameVerifierReportNoOptTBAA]
        if BlameVerifierReportOpt.is_correct_report_type(report_file_name):
            report_opt = load_blame_verifier_report_opt(report_path)
            report = report_opt
            opt_level = OptLevel.OPT.value
        elif BlameVerifierReportNoOptTBAA.is_correct_report_type(
            report_file_name
        ):
            report_no_opt = load_blame_verifier_report_no_opt_tbaa(
                report_path
            )
            report = report_no_opt
            opt_level = OptLevel.NO_OPT.value
        else:
            raise RuntimeWarning("unknown report type")

        number_of_total_annotations = report.get_total_annotations()
        number_of_successful_annotations = \
            report.get_successful_annotations()
        number_of_failed_annotations = report.get_failed_annotations()
        number_of_undetermined_annotations = \
            report.get_undetermined_annotations()

        data_frame = pd.DataFrame(
            {
                'revision': report.head_commit.hash,
                'time_id': commit_map.short_time_id(report.head_commit),
                'opt_level': opt_level,
                'total': number_of_total_annotations,
                'successful': number_of_successful_annotations,
                'failed': number_of_failed_annotations,
                'undetermined': number_of_undetermined_annotations
            },
            index=[0]
        )

        # Add the prefix of the report name to head_commit to differentiate
        # between reports with and without optimization.
        return data_frame, report.head_commit.hash + report_path.name.split(
            "-", 1
        )[0], str(report_path.stat().st_mtime_ns)

    report_files_opt = get_processed_revisions_files(
        project_name, BlameVerifierReportOpt,
        get_case_study_file_name_filter(case_study)
    )
    report_files_no_opt = get_processed_revisions_files(
        project_name, BlameVerifierReportNoOptTBAA,
        get_case_study_file_name_filter(case_study)
    )
    report_files = report_files_opt + report_files_no_opt

    failed_report_files_opt = get_failed_revisions_files(
        project_name, BlameVerifierReportOpt,
        get_case_study_file_name_filter(case_study)
    )
    failed_report_files_no_opt = get_failed_revisions_files(
        project_name, BlameVerifierReportNoOptTBAA,
        get_case_study_file_name_filter(case_study)
    )
    failed_report_files = \
        failed_report_files_opt + failed_report_files_no_opt

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_files, failed_report_files,
        create_dataframe_layout, create_data_frame_for_report,
        lambda path: ReportFilename(path).commit_hash.hash +
        path.name.split("-", 1)[0],
        lambda path: str(path.stat().st_mtime_ns),
        lambda a, b: int(a) > int(b)
    )

    return data_frame
@property
def head_commit(self) -> ShortCommitHash:
    """The current HEAD commit under which this BlameVerifierReportOpt was
    created."""
    return ReportFilename(Path(self.path)).commit_hash
def result_file_to_list_entry(result_file: Path) -> str:
    file_status = ReportFilename(result_file.name).file_status
    status = file_status.get_colored_status().rjust(
        longest_file_status_extension + file_status.num_color_characters(),
        " "
    )
    return f"[{status}] {result_file.name}"
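# Hedged stdlib-only sketch of why the padding above adds
# num_color_characters(): ANSI color escapes count toward len() but render
# invisibly, so the rjust width must be widened by exactly that many
# characters to keep the visible text aligned. The color codes and the
# choice of "CompileError" as the widest status label are assumptions.
GREEN, RESET = "\x1b[32m", "\x1b[0m"
colored = f"{GREEN}Success{RESET}"      # 7 visible + 9 invisible chars
num_color_characters = len(colored) - len("Success")
longest_status = len("CompileError")    # assumed widest plain status label
print(f"[{colored.rjust(longest_status + num_color_characters)}]")
# -> "[     Success]" in green, right-aligned to 12 visible columns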
def match_revision(file_name: str) -> bool:
    return ReportFilename(
        file_name
    ).commit_hash != revision.to_short_commit_hash()
@classmethod
def _load_dataframe(
    cls, project_name: str, commit_map: CommitMap,
    case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
) -> pd.DataFrame:
    commit_lookup = create_commit_lookup_helper(project_name)

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
        report_path: Path
    ) -> tp.Tuple[pd.DataFrame, str, str]:
        report = load_blame_report(report_path)
        categorised_degree_occurrences = generate_lib_dependent_degrees(
            report
        )

        def calc_total_amounts() -> int:
            total = 0
            for _, lib_dict in categorised_degree_occurrences.items():
                for _, tuple_list in lib_dict.items():
                    for degree_amount_tuple in tuple_list:
                        total += degree_amount_tuple[1]
            return total

        total_amounts_of_all_libs = calc_total_amounts()

        list_of_author_degree_occurrences = generate_author_degree_tuples(
            report, commit_lookup
        )
        author_degrees, author_amounts = _split_tuple_values_in_lists_tuple(
            list_of_author_degree_occurrences
        )
        author_total = sum(author_amounts)

        list_of_max_time_deltas = generate_max_time_distribution_tuples(
            report, commit_lookup, MAX_TIME_BUCKET_SIZE
        )
        max_time_buckets, max_time_amounts = \
            _split_tuple_values_in_lists_tuple(list_of_max_time_deltas)
        total_max_time_amounts = sum(max_time_amounts)

        list_of_avg_time_deltas = generate_avg_time_distribution_tuples(
            report, commit_lookup, AVG_TIME_BUCKET_SIZE
        )
        avg_time_buckets, avg_time_amounts = \
            _split_tuple_values_in_lists_tuple(list_of_avg_time_deltas)
        total_avg_time_amounts = sum(avg_time_amounts)

        def build_dataframe_row(
            degree_type: DegreeType,
            degree: int,
            amount: int,
            total_amount: int,
            base_library: tp.Optional[str] = None,
            inter_library: tp.Optional[str] = None
        ) -> tp.Dict[str, tp.Any]:
            data_dict: tp.Dict[str, tp.Any] = {
                'revision': report.head_commit.hash,
                'time_id': commit_map.short_time_id(report.head_commit),
                'degree_type': degree_type.value,
                'base_lib': base_library,
                'inter_lib': inter_library,
                'degree': degree,
                'amount': amount,
                'fraction': np.divide(amount, total_amount)
            }
            return data_dict

        result_data_dicts: tp.List[tp.Dict[str, tp.Any]] = []

        # Append interaction rows
        for base_lib_name, inter_lib_dict \
                in categorised_degree_occurrences.items():
            for inter_lib_name, list_of_lib_degree_amount_tuples in \
                    inter_lib_dict.items():
                inter_degrees, inter_amounts = \
                    _split_tuple_values_in_lists_tuple(
                        list_of_lib_degree_amount_tuples
                    )
                for i, _ in enumerate(inter_degrees):
                    degree = inter_degrees[i]
                    lib_amount = inter_amounts[i]
                    interaction_data_dict = build_dataframe_row(
                        degree_type=DegreeType.INTERACTION,
                        degree=degree,
                        amount=lib_amount,
                        total_amount=total_amounts_of_all_libs,
                        base_library=base_lib_name,
                        inter_library=inter_lib_name,
                    )
                    result_data_dicts.append(interaction_data_dict)

        def append_rows_of_degree_type(
            degree_type: DegreeType,
            degrees: tp.List[int],
            amounts: tp.List[int],
            sum_amounts: int,
        ) -> None:
            for k, _ in enumerate(degrees):
                data_dict = build_dataframe_row(
                    degree_type=degree_type,
                    degree=degrees[k],
                    amount=amounts[k],
                    total_amount=sum_amounts
                )
                result_data_dicts.append(data_dict)

        # Append author rows
        append_rows_of_degree_type(
            degree_type=DegreeType.AUTHOR,
            degrees=author_degrees,
            amounts=author_amounts,
            sum_amounts=author_total
        )

        # Append max_time rows
        append_rows_of_degree_type(
            degree_type=DegreeType.MAX_TIME,
            degrees=max_time_buckets,
            amounts=max_time_amounts,
            sum_amounts=total_max_time_amounts
        )

        # Append avg_time rows
        append_rows_of_degree_type(
            degree_type=DegreeType.AVG_TIME,
            degrees=avg_time_buckets,
            amounts=avg_time_amounts,
            sum_amounts=total_avg_time_amounts
        )

        return pd.DataFrame(result_data_dicts), report.head_commit.hash, str(
            report_path.stat().st_mtime_ns
        )

    report_files = get_processed_revisions_files(
        project_name, BlameReport,
        get_case_study_file_name_filter(case_study)
    )
    failed_report_files = get_failed_revisions_files(
        project_name, BlameReport,
        get_case_study_file_name_filter(case_study)
    )

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_files, failed_report_files,
        create_dataframe_layout, create_data_frame_for_report,
        lambda path: ReportFilename(path).commit_hash.hash,
        lambda path: str(path.stat().st_mtime_ns),
        lambda a, b: int(a) > int(b)
    )

    return data_frame
def file_name_filter(file_name: str) -> bool:
    file_commit_hash = ReportFilename(file_name).commit_hash
    return file_commit_hash != commit_hash
@classmethod
def _load_dataframe(
    cls, project_name: str, commit_map: CommitMap,
    case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
) -> pd.DataFrame:

    def create_dataframe_layout() -> pd.DataFrame:
        df_layout = pd.DataFrame(columns=cls.COLUMNS)
        df_layout = df_layout.astype(cls.COLUMN_TYPES)
        return df_layout

    def create_data_frame_for_report(
        report_path: Path
    ) -> tp.Tuple[pd.DataFrame, str, str]:
        report = load_blame_report(report_path)
        base_inter_c_repo_pair_mapping = \
            gen_base_to_inter_commit_repo_pair_mapping(report)

        def build_dataframe_row(
            base_hash: FullCommitHash, base_library: str,
            inter_hash: FullCommitHash, inter_library: str, amount: int
        ) -> tp.Dict[str, tp.Any]:
            data_dict: tp.Dict[str, tp.Any] = {
                'revision': report.head_commit.hash,
                'time_id': commit_map.short_time_id(report.head_commit),
                'base_hash': base_hash.hash,
                'base_lib': base_library,
                'inter_hash': inter_hash.hash,
                'inter_lib': inter_library,
                'amount': amount
            }
            return data_dict

        result_data_dicts: tp.List[tp.Dict[str, tp.Any]] = []

        for base_pair in base_inter_c_repo_pair_mapping:
            inter_pair_amount_dict = base_inter_c_repo_pair_mapping[base_pair]
            for inter_pair in inter_pair_amount_dict:
                result_data_dicts.append(
                    build_dataframe_row(
                        base_hash=base_pair.commit.commit_hash,
                        base_library=base_pair.commit.repository_name,
                        inter_hash=inter_pair.commit.commit_hash,
                        inter_library=inter_pair.commit.repository_name,
                        amount=inter_pair_amount_dict[inter_pair]
                    )
                )

        return pd.DataFrame(result_data_dicts), report.head_commit.hash, str(
            report_path.stat().st_mtime_ns
        )

    report_files = get_processed_revisions_files(
        project_name, BlameReport,
        get_case_study_file_name_filter(case_study)
    )
    failed_report_files = get_failed_revisions_files(
        project_name, BlameReport,
        get_case_study_file_name_filter(case_study)
    )

    # cls.CACHE_ID is set by superclass
    # pylint: disable=E1101
    data_frame = build_cached_report_table(
        cls.CACHE_ID, project_name, report_files, failed_report_files,
        create_dataframe_layout, create_data_frame_for_report,
        lambda path: ReportFilename(path).commit_hash.hash,
        lambda path: str(path.stat().st_mtime_ns),
        lambda a, b: int(a) > int(b)
    )

    return data_frame