def dataframe_to_table(
    data: pd.DataFrame,
    table_format: TableFormat,
    wrap_table: bool = False,
    wrap_landscape: bool = False,
    **kwargs: tp.Any
) -> str:
    """
    Render a pandas ``DataFrame`` as a table string.

    Args:
        data: the ``DataFrame`` to render
        table_format: the target table format
        wrap_table: whether to wrap the table in a separate document
                    (only honored for latex formats)
        wrap_landscape: whether the wrapping document uses landscape mode
                        (only honored for latex formats)
        **kwargs: forwarded to ``DataFrame.to_latex`` or
                  ``DataFrame.to_html``; not used for any other format

    Returns:
        the rendered table as a string
    """
    if table_format.is_latex():
        latex_table = data.to_latex(**kwargs)
        if not wrap_table:
            return latex_table
        return wrap_table_in_latex_document(latex_table, wrap_landscape)

    if table_format.is_html():
        return data.to_html(**kwargs)

    # Every remaining format is delegated to the module-level ``tabulate``
    # callable, using the column labels as headers.
    return tabulate(data, data.columns, table_format.value)
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build a table of the commits with the highest code centrality.

    Code centrality is computed per commit as ``degree - insertions`` from
    the node data collected for the newest processed revision of the case
    study. The ``num_commits`` largest values are listed, ordered by
    descending centrality and then ascending commit.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string

    Raises:
        TableDataEmpty: if no processed revision is found for the case study
    """
    case_study = self.table_kwargs["case_study"]
    num_commits = self.table_kwargs["num_commits"]
    project_name = case_study.project_name

    revision = newest_processed_revision_for_case_study(
        case_study, BlameReport
    )
    if not revision:
        raise TableDataEmpty()

    node_frame = pd.DataFrame(_collect_cig_node_data(project_name, revision))
    node_frame["code_centrality"] = (
        node_frame["degree"] - node_frame["insertions"]
    )
    centrality = node_frame.set_index("commit_hash")["code_centrality"]
    top_commits = centrality.nlargest(num_commits)

    ranking = pd.DataFrame.from_dict({
        "commit": top_commits.index.values,
        "centrality": top_commits.values,
    }).sort_values(["centrality", "commit"], ascending=[False, True])

    kwargs: tp.Dict[str, tp.Any] = {}
    if table_format.is_latex():
        kwargs.update(
            index=False,
            multicolumn_format="c",
            multirow=True,
            caption=f"Top {num_commits} Central Code Commits",
        )

    return dataframe_to_table(
        ranking, table_format, wrap_table, wrap_landscape=True, **kwargs
    )
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build a table of fix/introducer pairs and their score.

    The data is read from the quality metrics database that belongs to the
    SZZ tool given in the table kwargs. The ``revision`` column is renamed
    to ``fix`` and the table is indexed by ``fix`` and ``introducer``.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string

    Raises:
        ValueError: if the SZZ tool is neither ``PYDRILLER_SZZ`` nor
                    ``SZZ_UNLEASHED``
    """
    project_name = self.table_kwargs["case_study"].project_name
    szz_tool: SZZTool = self.table_kwargs["szz_tool"]
    commit_map = get_commit_map(project_name)

    # database column name -> column name shown in the table
    columns = {
        "revision": "fix",
        "introducer": "introducer",
        "score": "score"
    }

    if szz_tool == SZZTool.PYDRILLER_SZZ:
        database = PyDrillerSZZQualityMetricsDatabase
    elif szz_tool == SZZTool.SZZ_UNLEASHED:
        database = SZZUnleashedQualityMetricsDatabase
    else:
        raise ValueError(f"Unknown SZZ tool '{szz_tool.tool_name}'")

    data = database.get_data_for_project(
        project_name, list(columns), commit_map
    )

    data.rename(columns=columns, inplace=True)
    data.set_index(["fix", "introducer"], inplace=True)
    # Order by score first, then regroup by the "fix" index level only.
    data.sort_values("score", inplace=True)
    data.sort_index(level="fix", sort_remaining=False, inplace=True)

    kwargs: tp.Dict[str, tp.Any] = {}
    if table_format.is_latex():
        kwargs.update(multicolumn_format="c", longtable=True)

    return dataframe_to_table(
        data, table_format, wrap_table, wrap_landscape=True, **kwargs
    )
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build a table of Pearson correlations between blame diff metrics.

    For every case study of the paper config, the metrics are loaded from
    the ``BlameDiffMetricsDatabase``, rows with a churn of zero are dropped,
    and the pairwise correlation matrix of the metrics is computed. The
    matrices are placed side by side, keyed by the unique case study names.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string
    """
    case_studies = get_paper_config().get_all_case_studies()

    variables = [
        "churn", "num_interactions", "num_interacting_commits",
        "num_interacting_authors"
    ]

    cs_data = []
    for case_study in case_studies:
        cs_data.append(
            BlameDiffMetricsDatabase.get_data_for_project(
                case_study.project_name, ["revision", *variables],
                get_commit_map(case_study.project_name), case_study
            )
        )

    for metrics in cs_data:
        metrics.set_index('revision', inplace=True)
        zero_churn_revisions = metrics[metrics['churn'] == 0].index
        metrics.drop(zero_churn_revisions, inplace=True)

    correlations = []
    for metrics in cs_data:
        correlations.append(metrics[variables].corr(method="pearson"))

    df = pd.concat(
        correlations, axis=1, keys=get_unique_cs_name(case_studies)
    )

    kwargs: tp.Dict[str, tp.Any] = {"bold_rows": True}
    if table_format.is_latex():
        kwargs["multicolumn_format"] = "c"

    return dataframe_to_table(
        df, table_format, wrap_table, wrap_landscape=False, **kwargs
    )
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build an overview table of the bugs found for the case study's project.

    Each row lists the hash, message and author name of a fixing commit
    together with the issue id of the bug, as reported by the project's
    ``BugProvider``.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string; a table without rows if no bugs were found
    """
    project_name: str = self.table_kwargs['case_study'].project_name

    bug_provider = BugProvider.get_provider_for_project(
        get_project_cls_by_name(project_name)
    )

    variables = [
        "fixing hash", "fixing message", "fixing author", "issue_number"
    ]
    pybugs = bug_provider.find_pygit_bugs()

    data_rows = [[
        pybug.fixing_commit.hex, pybug.fixing_commit.message,
        pybug.fixing_commit.author.name, pybug.issue_id
    ] for pybug in pybugs]

    if data_rows:
        bug_df = pd.DataFrame(columns=variables, data=np.array(data_rows))
    else:
        # ``np.array([])`` is one-dimensional, which the ``DataFrame``
        # constructor rejects with a ``ValueError`` when four column labels
        # are given. Build the empty frame from the labels alone instead.
        bug_df = pd.DataFrame(columns=variables)

    kwargs: tp.Dict[str, tp.Any] = {"bold_rows": True}
    if table_format.is_latex():
        kwargs["multicolumn_format"] = "c"
        kwargs["longtable"] = True

    return dataframe_to_table(
        bug_df, table_format, wrap_table, wrap_landscape=True, **kwargs
    )
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build a per-author comparison of two author interaction graphs.

    The author interaction graphs derived from the blame interaction graph
    and from the file-based interaction graph are created for the newest
    processed revision of the case study. For every author the table lists
    the commit count and node degree in each graph, plus ``author_diff``:
    the number of neighbors the author has in the blame-based graph but not
    in the file-based graph. Both data sets are outer-joined on the author.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string

    Raises:
        TableDataEmpty: if no processed revision is found for the case study
    """
    case_study: CaseStudy = self.table_kwargs["case_study"]
    project_name: str = case_study.project_name

    revision = newest_processed_revision_for_case_study(
        case_study, BlameReport
    )
    if not revision:
        raise TableDataEmpty()

    blame_aig = create_blame_interaction_graph(
        project_name, revision
    ).author_interaction_graph()
    file_aig = create_file_based_interaction_graph(
        project_name, revision
    ).author_interaction_graph()

    def neighbors(graph: tp.Any, graph_node: tp.Any) -> tp.Set[tp.Any]:
        # All nodes connected to ``graph_node`` in either edge direction.
        return set(graph.successors(graph_node)
                  ).union(graph.predecessors(graph_node))

    blame_rows: tp.List[tp.Dict[str, tp.Any]] = []
    for node in blame_aig.nodes:
        attrs = tp.cast(AIGNodeAttrs, blame_aig.nodes[node])
        in_blame = neighbors(blame_aig, node)
        in_file = neighbors(file_aig, node)
        blame_rows.append({
            "author": f"{attrs['author']}",
            "blame_num_commits": attrs['num_commits'],
            "blame_node_degree": blame_aig.degree(node),
            "author_diff": len(in_blame - in_file)
        })
    blame_data = pd.DataFrame(blame_rows).set_index("author")

    file_rows: tp.List[tp.Dict[str, tp.Any]] = []
    for node in file_aig.nodes:
        attrs = tp.cast(AIGNodeAttrs, file_aig.nodes[node])
        file_rows.append({
            "author": f"{attrs['author']}",
            "file_num_commits": attrs['num_commits'],
            "file_node_degree": file_aig.degree(node)
        })
    file_data = pd.DataFrame(file_rows).set_index("author")

    degree_data = blame_data.join(file_data, how="outer")

    kwargs: tp.Dict[str, tp.Any] = {}
    if table_format.is_latex():
        kwargs.update(index=True, multicolumn_format="c", multirow=True)

    return dataframe_to_table(
        degree_data, table_format, wrap_table, wrap_landscape=True, **kwargs
    )
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build a table summarizing the aggregated time reports of all case studies.

    For every ``TimeReportAggregate`` file of every case study in the loaded
    paper config, one row is produced with the mean and standard deviation
    of the wall clock time and context switch measurements. Rows are sorted
    and indexed by binary name and experiment shorthand.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string; a table without rows if no report files
        were found
    """
    case_studies = get_loaded_paper_config().get_all_case_studies()

    index_columns = ["Binary", "Experiment"]
    value_columns = ["Runtime Mean (Std)", "Ctx-Switches Mean (Std)"]

    # Collect the rows first and build the frame once: ``DataFrame.append``
    # copies the whole frame on every call and no longer exists in
    # pandas >= 2.0.
    rows: tp.List[tp.Dict[str, str]] = []
    for case_study in case_studies:
        project_name = case_study.project_name

        report_files = get_processed_revisions_files(
            project_name, TimeReportAggregate,
            get_case_study_file_name_filter(case_study), False
        )

        for report_file in report_files:
            time_aggregated = TimeReportAggregate(report_file)
            report_name = time_aggregated.filename

            wall_clock_times = time_aggregated.measurements_wall_clock_time
            ctx_switches = time_aggregated.measurements_ctx_switches

            mean_runtime = np.mean(wall_clock_times)
            std_runtime = np.std(wall_clock_times)
            mean_ctx = np.mean(ctx_switches)
            std_ctx = np.std(ctx_switches)

            rows.append({
                "Binary":
                    report_name.binary_name,
                "Experiment":
                    report_name.experiment_shorthand,
                "Runtime Mean (Std)":
                    f"{mean_runtime:.2f} ({std_runtime:.2f})",
                "Ctx-Switches Mean (Std)":
                    f"{mean_ctx:.2f} ({std_ctx:.2f})"
            })

    # Passing the columns explicitly keeps them available when ``rows`` is
    # empty, so sorting and indexing below do not fail with a ``KeyError``.
    df = pd.DataFrame(rows, columns=index_columns + value_columns)
    df = df.sort_values(index_columns).set_index(index_columns)

    kwargs: tp.Dict[str, tp.Any] = {}
    if table_format.is_latex():
        kwargs["column_format"] = "llrr"

    return dataframe_to_table(
        df, table_format, wrap_table, wrap_landscape=True, **kwargs
    )
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build an overview table with basic metrics for all case studies.

    For every case study in the loaded paper config the table lists the
    project's domain, the lines of code, the number of commits and the
    number of authors. The metrics are computed for the case study revision
    with the largest time id in the project's commit map; that revision's
    short hash is listed as well. If a case study has no revisions, the
    metrics are computed for ``HEAD`` and no revision is listed.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string
    """
    case_studies = get_loaded_paper_config().get_all_case_studies()

    cs_data: tp.List[pd.DataFrame] = []
    for case_study in case_studies:
        project_name = case_study.project_name
        commit_map = get_commit_map(project_name)
        project_cls = get_project_cls_by_name(project_name)
        project_repo = get_local_project_git(project_name)
        # NOTE(review): presumably strips a trailing ".git/" from the
        # repository path to get the work tree directory -- confirm.
        project_path = project_repo.path[:-5]
        project_git = git["-C", project_path]

        # ``default=None`` covers case studies without revisions; indexing
        # the sorted revision list raised an ``IndexError`` there, so the
        # "HEAD" fallback below was unreachable.
        revision = max(
            case_study.revisions, key=commit_map.time_id, default=None
        )
        rev_range = revision.hash if revision else "HEAD"

        # Upper-case only the first character and keep the rest untouched.
        domain = str(project_cls.DOMAIN)

        cs_dict = {
            project_name: {
                "Domain":
                    domain[:1].upper() + domain[1:],
                "LOC":
                    calc_repo_loc(project_repo, rev_range),
                "Commits":
                    int(project_git("rev-list", "--count", rev_range)),
                "Authors":
                    len(
                        project_git("shortlog", "-s",
                                    rev_range).splitlines()
                    )
            }
        }
        if revision:
            cs_dict[project_name]["Revision"] = revision.short_hash

        cs_data.append(pd.DataFrame.from_dict(cs_dict, orient="index"))

    df = pd.concat(cs_data).sort_index()

    kwargs: tp.Dict[str, tp.Any] = {"bold_rows": True}
    if table_format.is_latex():
        kwargs["multicolumn_format"] = "c"
        kwargs["multirow"] = True

    return dataframe_to_table(
        df, table_format, wrap_table, wrap_landscape=True, **kwargs
    )
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build a per-feature evaluation table for one case study.

    For every binary of the case study's first revision, the feature
    analysis report is evaluated against the matching ground truth file.
    Binaries without a report file or without a ground truth file are
    skipped with a warning. If no features are given in the table kwargs,
    the features of the first usable ground truth are used.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string

    Raises:
        AssertionError: if no ``FeatureAnalysisReport`` file is found for
                        the case study
    """
    case_study: CaseStudy = self.table_kwargs['case_study']

    report_files = get_processed_revisions_files(
        case_study.project_name, FeatureAnalysisReport,
        get_case_study_file_name_filter(case_study)
    )
    if len(report_files) == 0:
        raise AssertionError(
            "No FeatureAnalysisReport found for case study "
            f"{case_study.project_name}"
        )

    cs_revisions = case_study.revisions
    if len(cs_revisions) > 1:
        LOG.debug(f"revisions={cs_revisions}")
        LOG.warning(
            "This tabled is only designed for usage with one revision "
            "but more were found. All revisions expect for the first "
            "one are ignored."
        )

    # Both kwargs are comma-separated lists with optional whitespace.
    separator = re.compile(r',\s*')

    gt_files: tp.List[Path] = [
        Path(gt) for gt in separator.split(self.table_kwargs['ground_truth'])
    ]

    features: tp.List[str] = []
    if self.table_kwargs['features'] is not None:
        features = separator.split(self.table_kwargs['features'])

    insts: int = 0
    data: tp.List[pd.DataFrame] = []

    binaries = case_study.project_cls.binaries_for_revision(cs_revisions[0])
    for binary in binaries:
        # The binary name is only passed on if there are several binaries.
        name = binary.name if len(binaries) > 1 else ""

        # report
        report_files_for_binary = filter_report_paths_binary(
            report_files, binary
        )
        if not report_files_for_binary:
            LOG.warning(f"No report file given for binary {binary.name}!")
            continue
        report = load_feature_analysis_report(report_files_for_binary[0])

        # ground truth
        gt_files_for_binary = filter_ground_truth_paths_binary(
            gt_files, binary
        )
        if not gt_files_for_binary:
            LOG.warning(
                f"No ground truth file given for binary {binary.name}!"
            )
            continue
        ground_truth = FeatureAnalysisGroundTruth(gt_files_for_binary[0])

        # features
        if not features:
            features = ground_truth.get_features()
        features = sorted(features)

        evaluation: FeatureAnalysisReportEval = FeatureAnalysisReportEval(
            report, ground_truth, features.copy()
        )

        data.append(
            self.__create_eval_df(evaluation, ['Total'] + features, name)
        )
        insts += report.meta_data.num_br_switch_insts

    df = pd.concat(data)

    kwargs: tp.Dict[str, tp.Any] = {}
    if table_format.is_latex():
        # One extra leading column when several binaries are present.
        leading_columns = 'ccc|cc' if len(binaries) > 1 else 'cc|cc'
        kwargs["column_format"] = leading_columns + '|cc' * len(features)
        kwargs["longtable"] = True
        kwargs["multicolumn"] = True
        kwargs["multicolumn_format"] = "c"
        kwargs["multirow"] = True
        kwargs["caption"] = (
            f"Evaluation of project {case_study.project_name}. "
            f"In total there were {insts} br and switch instructions."
        )
        kwargs['position'] = 't'

    return dataframe_to_table(
        df, table_format, wrap_table, wrap_landscape=True, **kwargs
    )
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build an evaluation table covering the binaries of several case studies.

    The case studies are processed in order of their project name. For every
    binary of a case study's first revision, the feature analysis report is
    evaluated against the matching ground truth file, using an empty feature
    list. Binaries without a report file or without a ground truth file are
    skipped with a warning. The per-binary results are placed side by side.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string

    Raises:
        AssertionError: if no ``FeatureAnalysisReport`` file is found for
                        one of the case studies
    """
    cs_data: tp.List[pd.DataFrame] = []
    # Grows by one '|cc' column group per evaluated binary.
    col_format = 'cc'

    separator = re.compile(r',\s*')
    gt_files: tp.List[Path] = [
        Path(gt) for gt in separator.split(self.table_kwargs['ground_truth'])
    ]

    case_studies = sorted(
        tp.cast(tp.List[CaseStudy], self.table_kwargs["case_study"]),
        key=lambda cs: cs.project_name
    )
    for case_study in case_studies:
        report_files = get_processed_revisions_files(
            case_study.project_name, FeatureAnalysisReport,
            get_case_study_file_name_filter(case_study)
        )
        if len(report_files) == 0:
            raise AssertionError(
                "No FeatureAnalysisReport found for case study "
                f"{case_study.project_name}"
            )

        cs_revisions = case_study.revisions
        if len(cs_revisions) > 1:
            LOG.debug(f"revisions={cs_revisions}")
            LOG.warning(
                "This tabled is only designed for usage with one revision "
                "but more were found. All revisions expect for the first "
                "one are ignored."
            )

        binaries = case_study.project_cls.binaries_for_revision(
            cs_revisions[0]
        )
        for binary in binaries:
            # Add the binary name only if the project has several binaries.
            name = (
                case_study.project_name + "-" + binary.name
                if len(binaries) > 1 else case_study.project_name
            )

            # report
            report_files_for_binary = filter_report_paths_binary(
                report_files, binary
            )
            if not report_files_for_binary:
                LOG.warning(f"No report file given for binary {name}!")
                continue
            report = load_feature_analysis_report(
                report_files_for_binary[0]
            )

            # ground truth
            gt_files_for_binary = filter_ground_truth_paths_binary(
                gt_files, binary
            )
            if not gt_files_for_binary:
                LOG.warning(
                    f"No ground truth file given for binary {name}!"
                )
                continue
            ground_truth = FeatureAnalysisGroundTruth(
                gt_files_for_binary[0]
            )

            evaluation: FeatureAnalysisReportEval = (
                FeatureAnalysisReportEval(report, ground_truth, [])
            )

            cs_data.append(self.__create_eval_df(evaluation, name))
            col_format += '|cc'

    df = pd.concat(cs_data, axis=1)

    kwargs: tp.Dict[str, tp.Any] = {}
    if table_format.is_latex():
        kwargs["column_format"] = col_format
        kwargs["longtable"] = True
        kwargs["multicolumn"] = True
        kwargs["multicolumn_format"] = "c"
        kwargs["multirow"] = True
        kwargs['position'] = 't'

    return dataframe_to_table(
        df, table_format, wrap_table, wrap_landscape=True, **kwargs
    )
def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build a table comparing the globals reports with and without auto-Gs.

    The case studies are processed in order of their project name. For every
    binary of a case study's first revision, the ``GlobalsReportWith`` and
    ``GlobalsReportWithout`` reports are loaded, if present, and converted
    to data frames. For latex output, the caption states the Pearson
    correlation between ``#RGG`` and the ratio of the ``Time`` values
    without/with auto-Gs, plus the mean of the ``SDev %`` column.

    Args:
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string

    Raises:
        AssertionError: if more than one report file of either kind is
                        found for a case study
    """
    cs_data: tp.List[pd.DataFrame] = []

    for case_study in sorted(
        tp.cast(tp.List[CaseStudy], self.table_kwargs["case_study"]),
        key=lambda x: x.project_name
    ):
        report_files_with = get_processed_revisions_files(
            case_study.project_name, GlobalsReportWith,
            get_case_study_file_name_filter(case_study)
        )
        report_files_without = get_processed_revisions_files(
            case_study.project_name, GlobalsReportWithout,
            get_case_study_file_name_filter(case_study)
        )

        if len(report_files_with) > 1 or len(report_files_without) > 1:
            LOG.debug(f"report_files_with={report_files_with}")
            # Log the "without" files here; the previous code printed the
            # "with" files a second time under this label.
            LOG.debug(f"report_files_without={report_files_without}")
            raise AssertionError("Too many report files given!")

        if len(case_study.revisions) > 1:
            LOG.debug(
                "This tabled is only designed for usage with one revision "
                "but we found more. All revisions expect for the first "
                "one are ignored."
            )

        binaries = case_study.project_cls.binaries_for_revision(
            case_study.revisions[0]
        )
        for binary in binaries:
            # Add the binary name only if the project has several binaries.
            if len(binaries) > 1:
                unique_cs_name = case_study.project_name + "-" + binary.name
            else:
                unique_cs_name = case_study.project_name

            # With
            report_files_with_for_binary = filter_report_paths_binary(
                report_files_with, binary
            )
            if report_files_with_for_binary:
                report_with = load_globals_with_report(
                    report_files_with_for_binary[0]
                )
                cs_data.append(
                    create_df_for_report(report_with, unique_cs_name)
                )

            # Without
            report_files_without_for_binary = filter_report_paths_binary(
                report_files_without, binary
            )
            if report_files_without_for_binary:
                report_without = load_globals_without_report(
                    report_files_without_for_binary[0]
                )
                cs_data.append(
                    create_df_for_report(report_without, unique_cs_name)
                )

    df = pd.concat(cs_data)
    df = df.round(2)

    without_auto_gs = df[df['auto-Gs'] == 'No']
    with_auto_gs = df[df['auto-Gs'] == 'Yes']

    # Speedup as described in the caption: TimeWithout / TimeWith.
    div_series = without_auto_gs.Time / with_auto_gs.Time
    rggs = without_auto_gs['#RGG']
    rho_p = pearsonr(rggs, div_series)

    # Rows holding '-' in "SDev %" are left out of the mean.
    mean_stddev = df[df["SDev %"] != '-']["SDev %"].mean()

    kwargs: tp.Dict[str, tp.Any] = {"bold_rows": True}
    if table_format.is_latex():
        kwargs["multicolumn_format"] = "c"
        kwargs["multirow"] = True
        kwargs["longtable"] = True
        kwargs["caption"] = (
            "Pearson correlation coefficient between RGG and Speedup "
            "(TimeWithout / TimeWith) "
            f"is: $\\rho$ = {rho_p[0]:.3f} with a two-sided p-value of "
            f"{rho_p[1]:.3f}."
            f" In total we analyzed {len(rggs)} binaries from "
            f"{len(rggs) - 1} different projects. "
            f"Relative mean stddev {mean_stddev:.1f}$\\%$"
        )

    return dataframe_to_table(
        df, table_format, wrap_table, wrap_landscape=True, **kwargs
    )
def _generate_graph_table(
    case_studies: tp.List[CaseStudy],
    graph_generator: tp.Callable[[str, FullCommitHash], nx.DiGraph],
    table_format: TableFormat, wrap_table: bool
) -> str:
    """
    Build a table with size and node degree statistics of one graph per
    case study.

    Case studies without a processed revision are skipped.

    Args:
        case_studies: the case studies to create table rows for
        graph_generator: creates the graph for a project name and revision
        table_format: the table format used for conversion
        wrap_table: whether to wrap the table in a separate document

    Returns:
        the table as a string
    """
    rows: tp.List[pd.DataFrame] = []

    for case_study in case_studies:
        project_name = case_study.project_name
        project_git = git["-C", get_local_project_git(project_name).path]
        revision = newest_processed_revision_for_case_study(
            case_study, BlameReport
        )
        if not revision:
            continue

        graph = graph_generator(project_name, revision)

        node_stats = pd.DataFrame([{
            "node_degree": graph.degree(node),
            "node_out_degree": graph.out_degree(node),
            "node_in_degree": graph.in_degree(node),
        } for node in graph.nodes])

        # Commits are counted up to the revision, authors over all refs.
        num_commits = int(
            project_git("rev-list", "--count", revision.hash)
        )
        num_authors = len(
            project_git("shortlog", "-s", "--all").splitlines()
        )

        # The tuple keys become a two-level column header.
        row = {
            ("commits", ""): num_commits,
            ("authors", ""): num_authors,
            ("nodes", ""): len(graph.nodes),
            ("edges", ""): len(graph.edges),
            ("node degree", "mean"): node_stats["node_degree"].mean(),
            ("node degree", "median"): node_stats["node_degree"].median(),
            ("node degree", "min"): node_stats["node_degree"].min(),
            ("node degree", "max"): node_stats["node_degree"].max(),
            ("node out degree", "median"):
                node_stats["node_out_degree"].median(),
            ("node out degree", "min"): node_stats["node_out_degree"].min(),
            ("node out degree", "max"): node_stats["node_out_degree"].max(),
            ("node in degree", "median"):
                node_stats["node_in_degree"].median(),
            ("node in degree", "min"): node_stats["node_in_degree"].min(),
            ("node in degree", "max"): node_stats["node_in_degree"].max(),
        }
        rows.append(
            pd.DataFrame.from_dict({project_name: row}, orient="index")
        )

    df = pd.concat(rows).round(2)

    kwargs: tp.Dict[str, tp.Any] = {"bold_rows": True}
    if table_format.is_latex():
        kwargs.update(multicolumn_format="c", multirow=True)

    return dataframe_to_table(
        df, table_format, wrap_table, wrap_landscape=True, **kwargs
    )