Пример #1
0
def dataframe_to_table(data: pd.DataFrame,
                       table_format: TableFormat,
                       wrap_table: bool = False,
                       wrap_landscape: bool = False,
                       **kwargs: tp.Any) -> str:
    """
    Render a pandas ``DataFrame`` as a table string.

    LaTeX and HTML formats are delegated to pandas; every other format is
    rendered through ``tabulate`` using the format's ``value``.

    Args:
        data: the ``DataFrame`` to render
        table_format: the target table format
        wrap_table: whether the LaTeX table should be wrapped in a
                    stand-alone document (only used for LaTeX formats)
        wrap_landscape: whether the wrapping document should use landscape
                        mode (only used for LaTeX formats)
        **kwargs: forwarded to ``DataFrame.to_latex`` or
                  ``DataFrame.to_html``; ignored for all other formats

    Returns:
        the rendered table as a string
    """
    if table_format.is_latex():
        latex_table = data.to_latex(**kwargs)
        if not wrap_table:
            return latex_table
        return wrap_table_in_latex_document(latex_table, wrap_landscape)

    if table_format.is_html():
        return data.to_html(**kwargs)

    # Neither LaTeX nor HTML: fall back to plain tabulate rendering.
    return tabulate(data, data.columns, table_format.value)
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build a table of the commits with the highest code centrality.

        Code centrality is computed per commit as ``degree - insertions``
        on the node data of the newest processed revision. The
        ``num_commits`` largest values are kept and ordered by descending
        centrality, ties broken by ascending commit.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string

        Raises:
            TableDataEmpty: if the case study has no processed revision
        """
        case_study = self.table_kwargs["case_study"]
        num_commits = self.table_kwargs["num_commits"]

        revision = newest_processed_revision_for_case_study(
            case_study, BlameReport)
        if not revision:
            raise TableDataEmpty()

        node_frame = pd.DataFrame(
            _collect_cig_node_data(case_study.project_name, revision))
        node_frame["code_centrality"] = (
            node_frame["degree"] - node_frame["insertions"])
        node_frame = node_frame.set_index("commit_hash")

        most_central = node_frame["code_centrality"].nlargest(num_commits)
        ranking = pd.DataFrame.from_dict({
            "commit": most_central.index.values,
            "centrality": most_central.values,
        }).sort_values(["centrality", "commit"], ascending=[False, True])

        render_options: tp.Dict[str, tp.Any] = {}
        if table_format.is_latex():
            render_options.update(
                index=False,
                multicolumn_format="c",
                multirow=True,
                caption=f"Top {num_commits} Central Code Commits",
            )

        return dataframe_to_table(ranking,
                                  table_format,
                                  wrap_table,
                                  wrap_landscape=True,
                                  **render_options)
Пример #3
0
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build a table of SZZ quality scores per (fix, introducer) pair.

        The data is loaded from the quality metrics database matching the
        configured SZZ tool, indexed by ``fix`` and ``introducer``, sorted
        by ``score`` and then ordered by the ``fix`` index level only.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string

        Raises:
            ValueError: if the configured SZZ tool is not supported
        """
        project_name = self.table_kwargs["case_study"].project_name
        szz_tool: SZZTool = self.table_kwargs["szz_tool"]

        commit_map = get_commit_map(project_name)
        # Maps database column names to the names shown in the table.
        column_names = {
            "revision": "fix",
            "introducer": "introducer",
            "score": "score"
        }

        if szz_tool == SZZTool.PYDRILLER_SZZ:
            database = PyDrillerSZZQualityMetricsDatabase
        elif szz_tool == SZZTool.SZZ_UNLEASHED:
            database = SZZUnleashedQualityMetricsDatabase
        else:
            raise ValueError(f"Unknown SZZ tool '{szz_tool.tool_name}'")

        scores = database.get_data_for_project(
            project_name, list(column_names), commit_map
        )

        # The frame returned by the database is modified in place.
        scores.rename(columns=column_names, inplace=True)
        scores.set_index(["fix", "introducer"], inplace=True)
        scores.sort_values("score", inplace=True)
        # Order by fix only; sort_remaining=False leaves the introducer
        # level out of the sort.
        scores.sort_index(level="fix", sort_remaining=False, inplace=True)

        render_options: tp.Dict[str, tp.Any] = (
            {"multicolumn_format": "c", "longtable": True}
            if table_format.is_latex() else {}
        )

        return dataframe_to_table(
            scores,
            table_format,
            wrap_table,
            wrap_landscape=True,
            **render_options
        )
Пример #4
0
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build a table of Pearson correlations between blame-diff metrics.

        For every case study of the paper config, revisions with a churn of
        zero are dropped and the pairwise Pearson correlation of the metric
        columns is computed. The per-case-study correlation matrices are
        placed side by side, keyed by the unique case study names.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string
        """
        case_studies = get_paper_config().get_all_case_studies()

        metric_columns = [
            "churn", "num_interactions", "num_interacting_commits",
            "num_interacting_authors"
        ]

        frames = []
        for case_study in case_studies:
            frames.append(
                BlameDiffMetricsDatabase.get_data_for_project(
                    case_study.project_name, ["revision", *metric_columns],
                    get_commit_map(case_study.project_name), case_study))

        # The frames returned by the database are modified in place.
        for frame in frames:
            frame.set_index('revision', inplace=True)
            zero_churn_revisions = frame[frame['churn'] == 0].index
            frame.drop(zero_churn_revisions, inplace=True)

        correlation_matrices = []
        for frame in frames:
            correlation_matrices.append(
                frame[metric_columns].corr(method="pearson"))

        combined = pd.concat(correlation_matrices,
                             axis=1,
                             keys=get_unique_cs_name(case_studies))

        render_options: tp.Dict[str, tp.Any] = {"bold_rows": True}
        if table_format.is_latex():
            render_options["multicolumn_format"] = "c"

        return dataframe_to_table(combined,
                                  table_format,
                                  wrap_table,
                                  wrap_landscape=False,
                                  **render_options)
Пример #5
0
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build a table listing the bugs found by the project's bug provider.

        Every row contains the hash, message and author name of the fixing
        commit together with the bug's issue id.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string; a table without rows if the
            provider reports no bugs
        """
        project_name: str = self.table_kwargs['case_study'].project_name

        bug_provider = BugProvider.get_provider_for_project(
            get_project_cls_by_name(project_name)
        )

        variables = [
            "fixing hash", "fixing message", "fixing author", "issue_number"
        ]
        pybugs = bug_provider.find_pygit_bugs()

        data_rows = [[
            pybug.fixing_commit.hex, pybug.fixing_commit.message,
            pybug.fixing_commit.author.name, pybug.issue_id
        ] for pybug in pybugs]

        # ``np.array([])`` is one-dimensional, which makes the DataFrame
        # constructor fail with a shape mismatch when no bugs were found.
        # Forcing the (rows, columns) shape keeps non-empty input unchanged
        # and turns the empty case into an empty table.
        cells = np.array(data_rows).reshape(len(data_rows), len(variables))
        bug_df = pd.DataFrame(columns=variables, data=cells)

        kwargs: tp.Dict[str, tp.Any] = {"bold_rows": True}
        if table_format.is_latex():
            kwargs["multicolumn_format"] = "c"
            kwargs["longtable"] = True

        return dataframe_to_table(
            bug_df, table_format, wrap_table, wrap_landscape=True, **kwargs
        )
Пример #6
0
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build a per-author comparison of blame-based and file-based
        author interaction graphs.

        For each author in the blame-based graph the table lists the number
        of commits, the node degree and the number of neighbors that are
        not neighbors in the file-based graph. For each author in the
        file-based graph it lists the number of commits and the node
        degree. Both parts are combined with an outer join on the author.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string

        Raises:
            TableDataEmpty: if the case study has no processed revision
        """
        case_study: CaseStudy = self.table_kwargs["case_study"]

        project_name: str = case_study.project_name
        revision = newest_processed_revision_for_case_study(
            case_study, BlameReport)
        if not revision:
            raise TableDataEmpty()

        blame_aig = create_blame_interaction_graph(
            project_name, revision).author_interaction_graph()
        file_aig = create_file_based_interaction_graph(
            project_name, revision).author_interaction_graph()

        def neighbors_of(graph: tp.Any, author_node: tp.Any) -> tp.Set[tp.Any]:
            """Collect successors and predecessors of a node in one set."""
            return set(graph.successors(author_node)).union(
                graph.predecessors(author_node))

        blame_rows: tp.List[tp.Dict[str, tp.Any]] = []
        for node in blame_aig.nodes:
            attrs = tp.cast(AIGNodeAttrs, blame_aig.nodes[node])
            only_in_blame = neighbors_of(blame_aig, node).difference(
                neighbors_of(file_aig, node))
            blame_rows.append({
                "author": f"{attrs['author']}",
                "blame_num_commits": attrs['num_commits'],
                "blame_node_degree": blame_aig.degree(node),
                "author_diff": len(only_in_blame),
            })
        blame_frame = pd.DataFrame(blame_rows).set_index("author")

        file_rows: tp.List[tp.Dict[str, tp.Any]] = []
        for node in file_aig.nodes:
            attrs = tp.cast(AIGNodeAttrs, file_aig.nodes[node])
            file_rows.append({
                "author": f"{attrs['author']}",
                "file_num_commits": attrs['num_commits'],
                "file_node_degree": file_aig.degree(node),
            })
        file_frame = pd.DataFrame(file_rows).set_index("author")

        # Outer join keeps authors that appear in only one of the graphs.
        combined = blame_frame.join(file_frame, how="outer")

        render_options: tp.Dict[str, tp.Any] = {}
        if table_format.is_latex():
            render_options.update(
                index=True, multicolumn_format="c", multirow=True)

        return dataframe_to_table(combined,
                                  table_format,
                                  wrap_table,
                                  wrap_landscape=True,
                                  **render_options)
Пример #7
0
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build a table with runtime and context-switch statistics.

        For every aggregated time report of every loaded case study, the
        mean and standard deviation of the wall clock time measurements and
        of the context switch measurements are listed, indexed by binary
        and experiment.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string; a table without rows if no
            report files were found
        """
        case_studies = get_loaded_paper_config().get_all_case_studies()

        columns = [
            "Binary", "Experiment", "Runtime Mean (Std)",
            "Ctx-Switches Mean (Std)"
        ]

        # Rows are collected in a list and turned into a frame once:
        # ``DataFrame.append`` was removed in pandas 2.0 and copied the
        # whole frame on every call.
        rows: tp.List[tp.Dict[str, tp.Any]] = []

        for case_study in case_studies:
            project_name = case_study.project_name

            report_files = get_processed_revisions_files(
                project_name, TimeReportAggregate,
                get_case_study_file_name_filter(case_study), False)

            for report_file in report_files:
                time_aggregated = TimeReportAggregate(report_file)
                report_name = time_aggregated.filename

                mean_runtime = np.mean(
                    time_aggregated.measurements_wall_clock_time)
                std_runtime = np.std(
                    time_aggregated.measurements_wall_clock_time)
                mean_ctx = np.mean(time_aggregated.measurements_ctx_switches)
                std_ctx = np.std(time_aggregated.measurements_ctx_switches)

                rows.append({
                    "Binary": report_name.binary_name,
                    "Experiment": report_name.experiment_shorthand,
                    "Runtime Mean (Std)":
                    f"{mean_runtime:.2f} ({std_runtime:.2f})",
                    "Ctx-Switches Mean (Std)":
                    f"{mean_ctx:.2f} ({std_ctx:.2f})"
                })

        # Explicit columns keep the sort and index steps below working even
        # when no rows were collected.
        df = pd.DataFrame(rows, columns=columns)

        df.sort_values(["Binary", "Experiment"], inplace=True)
        df.set_index(
            ["Binary", "Experiment"],
            inplace=True,
        )

        kwargs: tp.Dict[str, tp.Any] = {}
        if table_format.is_latex():
            kwargs["column_format"] = "llrr"

        return dataframe_to_table(df,
                                  table_format,
                                  wrap_table,
                                  wrap_landscape=True,
                                  **kwargs)
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build an overview table with basic metrics for every case study.

        For each loaded case study the table lists the project domain, the
        lines of code, the number of commits and the number of authors.
        The metrics are computed for the case study revision with the
        highest time id, or for ``HEAD`` if there is no such revision. The
        short hash is added as ``Revision`` when a revision exists.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string
        """
        case_studies = get_loaded_paper_config().get_all_case_studies()

        cs_data: tp.List[pd.DataFrame] = []
        for case_study in case_studies:
            project_name = case_study.project_name
            commit_map = get_commit_map(project_name)
            project_cls = get_project_cls_by_name(project_name)
            project_repo = get_local_project_git(project_name)
            # NOTE(review): drops the last five characters of the repo
            # path, presumably a trailing ".git/" - confirm.
            project_path = project_repo.path[:-5]
            project_git = git["-C", project_path]

            revisions = sorted(
                case_study.revisions, key=commit_map.time_id, reverse=True
            )
            # A case study without revisions used to raise an IndexError
            # here, although the code below already falls back to "HEAD"
            # when there is no revision.
            revision = revisions[0] if revisions else None
            rev_range = revision.hash if revision else "HEAD"

            domain = str(project_cls.DOMAIN)
            cs_dict = {
                project_name: {
                    # Capitalize only the first character of the domain.
                    "Domain":
                        domain[0].upper() + domain[1:],
                    "LOC":
                        calc_repo_loc(project_repo, rev_range),
                    "Commits":
                        int(project_git("rev-list", "--count", rev_range)),
                    "Authors":
                        len(
                            project_git("shortlog", "-s",
                                        rev_range).splitlines()
                        )
                }
            }
            if revision:
                cs_dict[project_name]["Revision"] = revision.short_hash

            cs_data.append(pd.DataFrame.from_dict(cs_dict, orient="index"))

        df = pd.concat(cs_data).sort_index()

        kwargs: tp.Dict[str, tp.Any] = {"bold_rows": True}
        if table_format.is_latex():
            kwargs["multicolumn_format"] = "c"
            kwargs["multirow"] = True

        return dataframe_to_table(
            df, table_format, wrap_table, wrap_landscape=True, **kwargs
        )
Пример #9
0
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build the feature analysis evaluation table for one case study.

        For every binary of the first case study revision, the feature
        analysis report is evaluated against the matching ground truth
        file. Binaries without a report or without a ground truth file are
        skipped with a warning. If no features are configured, the features
        of the first available ground truth are used.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string

        Raises:
            AssertionError: if no report file exists for the case study
        """
        case_study: CaseStudy = self.table_kwargs['case_study']

        report_files = get_processed_revisions_files(
            case_study.project_name, FeatureAnalysisReport,
            get_case_study_file_name_filter(case_study)
        )
        if len(report_files) == 0:
            raise AssertionError(
                "No FeatureAnalysisReport found for case study "
                f"{case_study.project_name}"
            )

        cs_revisions = case_study.revisions
        if len(cs_revisions) > 1:
            LOG.debug(f"revisions={cs_revisions}")
            LOG.warning(
                "This tabled is only designed for usage with one revision "
                "but more were found. All revisions expect for the first "
                "one are ignored."
            )

        # Ground truth files and features are given as comma-separated
        # strings, optionally followed by whitespace.
        separator = re.compile(r',\s*')
        gt_files: tp.List[Path] = [
            Path(entry)
            for entry in separator.split(self.table_kwargs['ground_truth'])
        ]

        requested_features = self.table_kwargs['features']
        features: tp.List[str] = (
            [] if requested_features is None else
            separator.split(requested_features)
        )

        insts: int = 0
        eval_frames: tp.List[pd.DataFrame] = []
        binaries = case_study.project_cls.binaries_for_revision(cs_revisions[0])
        multiple_binaries = len(binaries) > 1
        for binary in binaries:
            # The binary name is only used as a label if there are several.
            name = binary.name if multiple_binaries else ""

            matching_reports = filter_report_paths_binary(report_files, binary)
            if not matching_reports:
                LOG.warning(f"No report file given for binary {binary.name}!")
                continue
            report = load_feature_analysis_report(matching_reports[0])

            matching_gt_files = filter_ground_truth_paths_binary(
                gt_files, binary
            )
            if not matching_gt_files:
                LOG.warning(
                    f"No ground truth file given for binary {binary.name}!"
                )
                continue
            ground_truth = FeatureAnalysisGroundTruth(matching_gt_files[0])

            # Without configured features, use those of the ground truth.
            # The resolved list is kept for all following binaries.
            if not features:
                features = ground_truth.get_features()
            features = sorted(features)

            evaluation: FeatureAnalysisReportEval = FeatureAnalysisReportEval(
                report, ground_truth, features.copy()
            )
            eval_frames.append(
                self.__create_eval_df(evaluation, ['Total'] + features, name)
            )

            insts += report.meta_data.num_br_switch_insts

        df = pd.concat(eval_frames)

        render_options: tp.Dict[str, tp.Any] = {}
        if table_format.is_latex():
            leading_columns = 'ccc|cc' if multiple_binaries else 'cc|cc'
            render_options["column_format"] = (
                leading_columns + '|cc' * len(features)
            )
            render_options["longtable"] = True
            render_options["multicolumn"] = True
            render_options["multicolumn_format"] = "c"
            render_options["multirow"] = True
            render_options["caption"] = (
                f"Evaluation of project {case_study.project_name}. "
                f"In total there were {insts} br and switch instructions."
            )
            render_options['position'] = 't'

        return dataframe_to_table(
            df,
            table_format,
            wrap_table,
            wrap_landscape=True,
            **render_options
        )
Пример #10
0
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build the feature analysis evaluation table for several case studies.

        Case studies are processed in order of their project name. For each
        binary of the first revision of a case study, the feature analysis
        report is evaluated against the matching ground truth file.
        Binaries without a report or without a ground truth file are
        skipped with a warning. The evaluations are placed side by side.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string

        Raises:
            AssertionError: if a case study has no report file
        """
        eval_frames: tp.List[pd.DataFrame] = []

        # Ground truth files are given as one comma-separated string.
        gt_files: tp.List[Path] = [
            Path(entry)
            for entry in re.split(r',\s*', self.table_kwargs['ground_truth'])
        ]

        ordered_case_studies = sorted(
            tp.cast(tp.List[CaseStudy], self.table_kwargs["case_study"]),
            key=lambda cs: cs.project_name
        )
        for case_study in ordered_case_studies:
            report_files = get_processed_revisions_files(
                case_study.project_name, FeatureAnalysisReport,
                get_case_study_file_name_filter(case_study)
            )
            if len(report_files) == 0:
                raise AssertionError(
                    "No FeatureAnalysisReport found for case study "
                    f"{case_study.project_name}"
                )

            cs_revisions = case_study.revisions
            if len(cs_revisions) > 1:
                LOG.debug(f"revisions={cs_revisions}")
                LOG.warning(
                    "This tabled is only designed for usage with one revision "
                    "but more were found. All revisions expect for the first "
                    "one are ignored."
                )

            binaries = case_study.project_cls.binaries_for_revision(
                cs_revisions[0]
            )
            for binary in binaries:
                # Add the binary name only if the project has several.
                name = case_study.project_name
                if len(binaries) > 1:
                    name = case_study.project_name + "-" + binary.name

                matching_reports = filter_report_paths_binary(
                    report_files, binary
                )
                if not matching_reports:
                    LOG.warning(f"No report file given for binary {name}!")
                    continue
                report = load_feature_analysis_report(matching_reports[0])

                matching_gt_files = filter_ground_truth_paths_binary(
                    gt_files, binary
                )
                if not matching_gt_files:
                    LOG.warning(
                        f"No ground truth file given for binary {name}!"
                    )
                    continue
                ground_truth = FeatureAnalysisGroundTruth(matching_gt_files[0])

                evaluation: FeatureAnalysisReportEval = (
                    FeatureAnalysisReportEval(report, ground_truth, [])
                )
                eval_frames.append(self.__create_eval_df(evaluation, name))

        df = pd.concat(eval_frames, axis=1)

        render_options: tp.Dict[str, tp.Any] = {}
        if table_format.is_latex():
            # Two leading columns plus one column pair per evaluated binary.
            render_options["column_format"] = 'cc' + '|cc' * len(eval_frames)
            render_options["longtable"] = True
            render_options["multicolumn"] = True
            render_options["multicolumn_format"] = "c"
            render_options["multirow"] = True
            render_options['position'] = 't'

        return dataframe_to_table(
            df,
            table_format,
            wrap_table,
            wrap_landscape=True,
            **render_options
        )
Пример #11
0
    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
        """
        Build a table comparing globals reports with and without auto-Gs.

        Case studies are processed in order of their project name. For each
        binary of the first revision of a case study, one row set is
        created from the "with" report and one from the "without" report;
        a missing report is passed on as ``None``. For LaTeX output the
        caption reports the Pearson correlation between ``#RGG`` and the
        time ratio (without / with) and the mean of the ``SDev %`` column.

        Args:
            table_format: the format the table is rendered in
            wrap_table: whether to wrap a LaTeX table in its own document

        Returns:
            the rendered table as a string

        Raises:
            AssertionError: if a case study has more than one report file
                            of either kind
        """
        cs_data: tp.List[pd.DataFrame] = []

        for case_study in sorted(tp.cast(tp.List[CaseStudy],
                                         self.table_kwargs["case_study"]),
                                 key=lambda x: x.project_name):
            report_files_with = get_processed_revisions_files(
                case_study.project_name, GlobalsReportWith,
                get_case_study_file_name_filter(case_study))
            report_files_without = get_processed_revisions_files(
                case_study.project_name, GlobalsReportWithout,
                get_case_study_file_name_filter(case_study))

            if len(report_files_with) > 1 or len(report_files_without) > 1:
                LOG.debug(f"report_files_with={report_files_with}")
                # Fixed: this line used to log ``report_files_with`` again.
                LOG.debug(f"report_files_without={report_files_without}")
                raise AssertionError("Too many report files given!")

            if len(case_study.revisions) > 1:
                LOG.debug(
                    "This tabled is only designed for usage with one revision "
                    "but we found more. All revisions expect for the first "
                    "one are ignored.")

            binaries = case_study.project_cls.binaries_for_revision(
                case_study.revisions[0])
            for binary in binaries:
                # Add the binary name only if the project has several.
                if len(binaries) > 1:
                    unique_cs_name = case_study.project_name + "-" + binary.name
                else:
                    unique_cs_name = case_study.project_name

                # With
                report_files_with_for_binary = filter_report_paths_binary(
                    report_files_with, binary)

                report_with: tp.Optional[GlobalsReportWith] = None
                if report_files_with_for_binary:
                    report_with = load_globals_with_report(
                        report_files_with_for_binary[0])

                cs_data.append(
                    create_df_for_report(report_with, unique_cs_name))

                # Without
                report_files_without_for_binary = filter_report_paths_binary(
                    report_files_without, binary)

                report_without: tp.Optional[GlobalsReportWithout] = None
                if report_files_without_for_binary:
                    report_without = load_globals_without_report(
                        report_files_without_for_binary[0])

                cs_data.append(
                    create_df_for_report(report_without, unique_cs_name))

        df = pd.concat(cs_data)
        df = df.round(2)

        # Time ratio of the 'No' rows over the 'Yes' rows of auto-Gs.
        div_series = df[df['auto-Gs'] == 'No'].Time / df[df['auto-Gs'] ==
                                                         'Yes'].Time
        rggs = df[df['auto-Gs'] == 'No']['#RGG']
        rho_p = pearsonr(rggs, div_series)

        # Rows whose "SDev %" is the placeholder '-' are left out of the mean.
        mean_stddev = df[df["SDev %"] != '-']["SDev %"].mean()

        kwargs: tp.Dict[str, tp.Any] = {"bold_rows": True}
        if table_format.is_latex():
            kwargs["multicolumn_format"] = "c"
            kwargs["multirow"] = True
            kwargs["longtable"] = True
            # NOTE(review): the project count is reported as
            # ``len(rggs) - 1`` - confirm this is intended.
            kwargs["caption"] = (
                "Pearson correlation coefficient between RGG and Speedup "
                "(TimeWithout / TimeWith) "
                f"is: $\\rho$ = {rho_p[0]:.3f} with a two-sided p-value of "
                f"{rho_p[1]:.3f}."
                f" In total we analyzed {len(rggs)} binaries from "
                f"{len(rggs) - 1} different projects. "
                f"Relative mean stddev {mean_stddev:.1f}$\\%$")

        return dataframe_to_table(df,
                                  table_format,
                                  wrap_table,
                                  wrap_landscape=True,
                                  **kwargs)
Пример #12
0
def _generate_graph_table(case_studies: tp.List[CaseStudy],
                          graph_generator: tp.Callable[[str, FullCommitHash],
                                                       nx.DiGraph],
                          table_format: TableFormat, wrap_table: bool) -> str:
    """
    Build a table with graph statistics for every given case study.

    For each case study with a processed revision, the graph produced by
    ``graph_generator`` is summarized by its node and edge counts and by
    statistics over the node degrees, next to the commit and author counts
    of the project. Case studies without a processed revision are skipped.

    Args:
        case_studies: the case studies to create table rows for
        graph_generator: creates the graph for a project name and revision
        table_format: the format the table is rendered in
        wrap_table: whether to wrap a LaTeX table in its own document

    Returns:
        the rendered table as a string
    """
    # Table column label -> column in the per-node degree frame. Only the
    # first entry additionally gets a "mean" column.
    degree_columns = (
        ("node degree", "node_degree"),
        ("node out degree", "node_out_degree"),
        ("node in degree", "node_in_degree"),
    )

    project_rows: tp.List[pd.DataFrame] = []
    for case_study in case_studies:
        project_name = case_study.project_name
        project_git = git["-C", get_local_project_git(project_name).path]
        revision = newest_processed_revision_for_case_study(
            case_study, BlameReport)
        if not revision:
            continue

        graph = graph_generator(project_name, revision)

        degrees = pd.DataFrame([{
            "node_degree": graph.degree(node),
            "node_out_degree": graph.out_degree(node),
            "node_in_degree": graph.in_degree(node),
        } for node in graph.nodes])

        row: tp.Dict[tp.Tuple[str, str], tp.Any] = {}
        row[("commits", "")] = int(
            project_git("rev-list", "--count", revision.hash))
        row[("authors", "")] = len(
            project_git("shortlog", "-s", "--all").splitlines())
        row[("nodes", "")] = len(graph.nodes)
        row[("edges", "")] = len(graph.edges)
        row[("node degree", "mean")] = degrees["node_degree"].mean()
        for label, column in degree_columns:
            for statistic in ("median", "min", "max"):
                row[(label, statistic)] = getattr(degrees[column],
                                                  statistic)()

        project_rows.append(
            pd.DataFrame.from_dict({project_name: row}, orient="index"))

    df = pd.concat(project_rows).round(2)

    render_options: tp.Dict[str, tp.Any] = {"bold_rows": True}
    if table_format.is_latex():
        render_options.update(multicolumn_format="c", multirow=True)

    return dataframe_to_table(df,
                              table_format,
                              wrap_table,
                              wrap_landscape=True,
                              **render_options)