def get_vnodes_number(repository: str, from_commit: str, to_commit: str,
                      context: AnalyzerContextManager, bblfsh: Optional[str]) -> int:
    """
    Calculate the expected number of vnodes for a repository.

    :param repository: URL of repository.
    :param from_commit: Hash of the base commit.
    :param to_commit: Hash of the head commit.
    :param context: AnalyzerContextManager instance to query the analyzer.
    :param bblfsh: Babelfish server address to use. Specify None to use the default value.
    :return: expected vnodes number for a repository.
    """
    expected_vnodes_number = -1

    def _convert_files_to_xy(self, parsed_files):
        nonlocal expected_vnodes_number
        if expected_vnodes_number != -1:
            raise RuntimeError("_files_to_xy should be called only one time.")
        expected_vnodes_number = sum(len(vn) for vn, _, _ in parsed_files)
        raise RuntimeError("Forced FormatAnalyser.train call stop.")

    _convert_files_to_xy_backup = FeatureExtractor._convert_files_to_xy
    try:
        FeatureExtractor._convert_files_to_xy = _convert_files_to_xy
        with tempfile.TemporaryDirectory(prefix="top-repos-quality-repos-") as tmpdirname:
            git_dir = ensure_repo(repository, tmpdirname)
            try:
                context.push(fr=from_commit, to=to_commit, git_dir=git_dir, log_level="warning",
                             bblfsh=bblfsh)
            except subprocess.CalledProcessError:
                # The forced RuntimeError raised in _convert_files_to_xy is expected here.
                pass
    finally:
        FeatureExtractor._convert_files_to_xy = _convert_files_to_xy_backup
    return expected_vnodes_number
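A minimal usage sketch, not part of the original source: it mirrors the FormatAnalyzer setup used by calc_expected_vnodes_number_entry below, while the repository URL and commit hashes are placeholders.

with tempfile.TemporaryDirectory() as tmpdir:
    db = os.path.join(tmpdir, "db.sqlite3")
    fs = os.path.join(tmpdir, "models")
    os.makedirs(fs, exist_ok=True)
    with AnalyzerContextManager(FormatAnalyzer, db=db, fs=fs, init=False) as context:
        # bblfsh=None falls back to the default Babelfish address.
        vnodes_number = get_vnodes_number("https://github.com/<org>/<repo>",
                                          from_commit="<base-sha>", to_commit="<head-sha>",
                                          context=context, bblfsh=None)
        print("expected vnodes:", vnodes_number)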
Example #2
class BaseAnalyzerIntegrationTests(unittest.TestCase):
    def setUp(self, fs=None):
        self.db = tempfile.NamedTemporaryFile(dir=self.base_dir)
        if fs is None:
            self.fs = tempfile.TemporaryDirectory(dir=self.base_dir)
        else:
            self.fs = fs

        self.context = AnalyzerContextManager(FormatAnalyzer,
                                              db=self.db.name,
                                              fs=self.fs.name).__enter__()
        self.logs = logs = []

        class ShadowHandler(logging.Handler):
            def emit(self, record):
                logs.append(logging.getLogger().handlers[0].format(record))

        self.log_handler = ShadowHandler()
        logging.getLogger().addHandler(self.log_handler)

    def tearDown(self, fs_cleanup=True):
        if fs_cleanup:
            self.fs.cleanup()
        self.context.__exit__()
        logging.getLogger().removeHandler(self.log_handler)
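A hypothetical test case built on this fixture; `base_dir` is assumed to be provided by a `setUpClass` that is not shown here, and the commit hashes and repository path are placeholders.

class FormatAnalyzerIntegrationTests(BaseAnalyzerIntegrationTests):
    def test_review(self):
        # self.context is the AnalyzerContextManager entered in setUp().
        self.context.review(fr="<base-sha>", to="<head-sha>",
                            git_dir="/path/to/test/repo", log_level="warning")
        # self.logs collects everything emitted through the root logger during the review.
        self.assertTrue(self.logs)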
Example #3
 def __enter__(self) -> "Reporter":
     self._tmpdir = tempfile.mkdtemp("reporter-") \
         if self._database is None or self._fs is None else None
     if self._database is None:
         self._database = os.path.join(self._tmpdir, "db.sqlite3")
     if self._fs is None:
         self._fs = os.path.join(self._tmpdir, "models")
     os.makedirs(self._fs, exist_ok=True)
     self._analyzer_context_manager = AnalyzerContextManager(
         self.inspected_analyzer_type,
         db=self._database,
         fs=self._fs,
         init=False)
     self._analyzer_context_manager.__enter__()
     return self
Example #4
def evaluate_smoke_entry(
    inputpath: str,
    reportdir: str,
    database: str,
    bblfsh: str,
    config: dict,
) -> None:
    """
    Entry point for the smoke dataset evaluation from the command line.
    """
    start_time = time.time()
    report_filename = os.path.join(reportdir, "report.csv")
    log = logging.getLogger("evaluate_smoke")
    if database is None:
        db = tempfile.NamedTemporaryFile(dir=inputpath,
                                         prefix="db",
                                         suffix=".sqlite3")
        database = db.name
        log.info("Database %s created" % database)
    elif os.path.exists(database):
        log.info("Found existing database %s", database)
    else:
        log.info("Database %s not found and will be created.", database)
    with tempfile.TemporaryDirectory(dir=inputpath) as fs:
        with AnalyzerContextManager(SmokeEvalFormatAnalyzer,
                                    db=database,
                                    fs=fs) as server:
            inputpath = Path(inputpath)
            index_file = inputpath / "index.csv"
            os.makedirs(reportdir, exist_ok=True)
            with open(report_filename, "w") as report:
                csv.DictWriter(
                    report,
                    fieldnames=SmokeEvalFormatAnalyzer.REPORT_COLNAMES,
                ).writeheader()
            with open(str(index_file)) as index:
                reader = csv.DictReader(index)
                for row in tqdm(reader):
                    repopath = inputpath / row["repo"]
                    config_json = {
                        SmokeEvalFormatAnalyzer.name:
                        merge_dicts(config, {
                            "style_name": row["style"],
                            "report_path": reportdir,
                        })
                    }
                    server.review(fr=row["from"],
                                  to=row["to"],
                                  git_dir=str(repopath),
                                  log_level="warning",
                                  bblfsh=bblfsh,
                                  config_json=config_json)
            log.info("Quality report saved to %s", reportdir)

    report = pandas.read_csv(report_filename)
    with pandas.option_context("display.max_columns", 10,
                               "display.expand_frame_repr", False):
        print(report.describe())
    log.info("Time spent: %.3f" % (time.time() - start_time))
Example #6
    def test_train_review_analyzer_integration(self):
        """Integration test for review event."""
        with AnalyzerContextManager(analyzer=QualityReportAnalyzer,
                                    db=self.db.name,
                                    fs=self.fs.name) as context:
            context.review(
                FROM_COMMIT,
                TO_COMMIT,
                git_dir=self.jquery_dir,
                config_json={QualityReportAnalyzer.name: get_config()})
Example #7
def calc_expected_vnodes_number_entry(input: str, output: str, runs: int) -> None:
    """
    Entry point for `python -m lookout.style.format calc-expected-support` command.

    :param input: csv file with repositories for the quality report. Should contain url, to and \
                  from columns.
    :param output: Path to the output csv file.
    :param runs: Number of times to repeat the calculation to check that the result is stable.
    """
    log = logging.getLogger("expected_vnodes_number")
    handler = logging.handlers.RotatingFileHandler(output + ".errors")
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)

    repositories = list(csv.DictReader(handle_input_arg(input)))
    try:
        bblfsh = _restart_bblfshd(first_run=True)
        for cur_run in range(runs):
            with tempfile.TemporaryDirectory() as tmpdirname:
                database = os.path.join(tmpdirname, "db.sqlite3")
                fs = os.path.join(tmpdirname, "models")
                os.makedirs(fs, exist_ok=True)
                with AnalyzerContextManager(FormatAnalyzer, db=database, fs=fs,
                                            init=False) as server:
                    for row in tqdm(repositories):
                        try:
                            vnodes_number = get_vnodes_number(
                                row["url"], to_commit=row["to"], from_commit=row["from"],
                                context=server, bblfsh=bblfsh)
                            log.info("%d/%d run. Expected vnodes number for %s is %d.",
                                     cur_run + 1, runs, row["url"], vnodes_number)
                            if row.get("vnodes_number", vnodes_number) != vnodes_number:
                                log.warning("vnodes number is different for %d/%d run. Get %d "
                                            "instead of %d. Set to nan.", cur_run + 1, runs,
                                            vnodes_number, row["vnodes_number"])
                                row["vnodes_number"] = float("nan")
                            else:
                                row["vnodes_number"] = vnodes_number
                        except Exception:
                            log.exception("-" * 20 + "\nFailed to process %s repo", row["url"])
                            continue
                        bblfsh = _restart_bblfshd()
    finally:
        _stop_bblfshd()

    fieldnames = ["url", "to", "from", "vnodes_number"]
    with open(output, "w") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in repositories:
            writer.writerow(row)
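A hypothetical invocation; repos.csv must contain the url, to and from columns described in the docstring, and the computed vnodes_number column is written to the output csv.

calc_expected_vnodes_number_entry(input="repos.csv",
                                  output="expected_vnodes_number.csv", runs=2)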
Example #8
def measure_quality(repository: str, from_commit: str, to_commit: str,
                    context: AnalyzerContextManager, config: dict, bblfsh: Optional[str],
                    vnodes_expected_number: Optional[int], restarts: int=3) -> Dict[str, str]:
    """
    Generate `QualityReport` for a repository. If it fails, empty reports are returned.

    :param repository: URL of repository.
    :param from_commit: Hash of the base commit.
    :param to_commit: Hash of the head commit.
    :param context: AnalyzerContextManager instance to query the analyzer.
    :param config: config for FormatAnalyzer.
    :param bblfsh: Babelfish server address to use. Specify None to use the default value.
    :param vnodes_expected_number: Expected number of vnodes, if known. \
                                   Report collection will be restarted if the number of \
                                   extracted vnodes does not match.
    :param restarts: Number of restarts allowed if the number of extracted vnodes does not match.
    :return: Dictionary with all QualityReport reports.
    """
    log = logging.getLogger("QualityAnalyzer")

    # This dirty hack should be removed as soon as
    # https://github.com/src-d/style-analyzer/issues/557 is resolved.
    sum_vnodes_number = 0
    call_numbers = 0

    _convert_files_to_xy_backup = FeatureExtractor._convert_files_to_xy

    def _convert_files_to_xy(self, parsed_files):
        nonlocal sum_vnodes_number, call_numbers
        call_numbers += 1
        sum_vnodes_number += sum(len(vn) for vn, _, _ in parsed_files)
        # sum_vnodes_number + 1 because, for a yet unexplained reason, extracting test and train
        # separately yields one vnode less.
        # TODO (zurk): investigate ^
        if call_numbers == 2 and sum_vnodes_number + 1 != vnodes_expected_number:
            raise RestartReport("VNodes number does not match the expected one: %d != %d" % (
                sum_vnodes_number, vnodes_expected_number))
        log.info("VNodes number matches the expected %d.", vnodes_expected_number)
        return _convert_files_to_xy_backup(self, parsed_files)

    reports = {}

    def capture_reports(func):
        @functools.wraps(func)
        def wrapped_capture_quality_reports(*args, **kwargs):
            nonlocal reports
            if reports:
                raise RuntimeError("generate_reports should be called only one time.")
            result = func(*args, **kwargs)
            reports = result
            return result
        wrapped_capture_quality_reports.original = func
        return wrapped_capture_quality_reports

    try:
        QualityReportAnalyzer.generate_reports = \
            capture_reports(QualityReportAnalyzer.generate_reports)
        if vnodes_expected_number:
            log.info("Vnodes expected number is equal to %d", vnodes_expected_number)
            FeatureExtractor._convert_files_to_xy = _convert_files_to_xy
        with tempfile.TemporaryDirectory(prefix="top-repos-quality-repos-") as tmpdirname:
            git_dir = ensure_repo(repository, tmpdirname)
            for attempt_number in range(restarts):
                sum_vnodes_number = -1
                call_numbers = 0
                try:
                    context.push(fr=from_commit, to=to_commit, git_dir=git_dir,
                                 log_level="warning", bblfsh=bblfsh, config_json=config)
                    break
                except subprocess.CalledProcessError:
                    # Assume that we failed because the VNodes number does not match the
                    # expected one.
                    log.warning("Attempt %d/%d to train the model failed.", attempt_number + 1,
                                restarts)
            else:
                raise RuntimeError("Run out of %d attempts. Failed to train proper model for %s." %
                                   (restarts, repository))
            context.review(fr=from_commit, to=to_commit, git_dir=git_dir, log_level="warning",
                           bblfsh=bblfsh, config_json=config)
    finally:
        QualityReportAnalyzer.generate_reports = QualityReportAnalyzer.generate_reports.original
        if vnodes_expected_number:
            FeatureExtractor._convert_files_to_xy = _convert_files_to_xy_backup
    return reports
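A usage sketch mirroring how generate_quality_report below calls this function; the analyzer config, repository URL and commit hashes are placeholders, not part of the original source.

config = {QualityReportAnalyzer.name: {"aggregate": True}}
with tempfile.TemporaryDirectory() as tmpdir:
    fs = os.path.join(tmpdir, "models")
    os.makedirs(fs, exist_ok=True)
    with AnalyzerContextManager(QualityReportAnalyzer, db=os.path.join(tmpdir, "db.sqlite3"),
                                fs=fs, init=False) as context:
        reports = measure_quality(
            "https://github.com/<org>/<repo>", from_commit="<base-sha>", to_commit="<head-sha>",
            context=context, config=config, bblfsh=None, vnodes_expected_number=None)
        for report_name, report_text in reports.items():
            print(report_name, len(report_text))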
Example #9
def generate_quality_report(input: str, output: str, force: bool, bblfsh: str, config: dict,
                            database: Optional[str] = None, fs: Optional[str] = None) -> None:
    """
    Generate a quality report for the given data. Entry point for the command line interface.

    :param input: csv file with repositories for the report. Should contain url, to and from \
                  columns.
    :param output: Directory where to save the results.
    :param force: Overwrite results stored in the output directory if True. \
                  Stored results are reused if False.
    :param bblfsh: bblfsh address to use.
    :param config: config for FormatAnalyzer.
    :param database: sqlite3 database path to store the models. A temporary file is used if not \
                     set.
    :param fs: Model repository file system root. A temporary directory is used if not set.
    """
    os.makedirs(output, exist_ok=True)
    assert os.path.isdir(output), "Output should be a directory"
    log = logging.getLogger("QualityAnalyzer")
    handler = logging.handlers.RotatingFileHandler(os.path.join(output, "errors.txt"))
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)
    reports = []
    config = {QualityReportAnalyzer.name: merge_dicts(config, {"aggregate": True})}
    repositories = list(csv.DictReader(handle_input_arg(input)))
    with tempfile.TemporaryDirectory() as tmpdirname:
        database = database if database else os.path.join(tmpdirname, "db.sqlite3")
        fs = fs if fs else os.path.join(tmpdirname, "models")
        os.makedirs(fs, exist_ok=True)
        with AnalyzerContextManager(QualityReportAnalyzer, db=database, fs=fs,
                                    init=False) as context:
            for row in huge_progress_bar(repositories, log, lambda row: row["url"]):
                path_tmpl = os.path.join(output, get_repo_name(row["url"])) + "-%s_report.md"
                try:
                    if force or not any(os.path.exists(path_tmpl % name)
                                        for name in QualityReportAnalyzer.get_report_names()):
                        vnodes_expected_number = int(row["vnodes_number"]) \
                            if "vnodes_number" in row else None
                        report = measure_quality(
                            row["url"], to_commit=row["to"], from_commit=row["from"],
                            context=context, config=config, bblfsh=bblfsh,
                            vnodes_expected_number=vnodes_expected_number)
                        for report_name in report:
                            with open(path_tmpl % report_name, "w", encoding="utf-8") as f:
                                f.write(report[report_name])
                        reports.append((row["url"], report))
                    else:
                        report = {}
                        log.info("Found existing reports for %s in %s", row["url"], output)
                        for report_name in QualityReportAnalyzer.get_report_names():
                            report_path = path_tmpl % report_name
                            if not os.path.exists(report_path):
                                log.warning(
                                    "skipped %s. %s report is missing", row["url"], report_name)
                                break
                            with open(path_tmpl % report_name, encoding="utf-8") as f:
                                report[report_name] = f.read()
                        else:
                            reports.append((row["url"], report))
                except Exception:
                    log.exception("-" * 20 + "\nFailed to process %s repo", row["url"])
                    continue

        for report_name in ("train", "test"):
            summary = _generate_report_summary(reports, report_name)
            log.info("\n%s\n%s", report_name, summary)
            summary_loc = os.path.join(output, "summary-%s_report.md" % report_name)
            with open(summary_loc, "w", encoding="utf-8") as f:
                f.write(summary)
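A hypothetical invocation; the csv path and the Babelfish address are placeholders, and an optional vnodes_number column in the input enables the restart check in measure_quality.

generate_quality_report(input="repos.csv", output="quality-reports", force=False,
                        bblfsh="localhost:9432", config={})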
Example #10
class Reporter:
    """
    Base class to create performance reports for the analyzer.

    To create a reporter for your Analyzer you should take two steps.
    1. Inherit a SpyAnalyzer from the Analyzer you want to evaluate. The SpyAnalyzer's `analyze` \
       function should be overridden to return all the information you need for the subsequent \
       evaluation of the `Comment`-s. Refer to `TyposAnalyzerSpy` as an example.
    2. Inherit MyReporter from this Reporter class and set the created SpyAnalyzer as its \
       `inspected_analyzer_type` attribute. You should have a dataset that you feed to \
       `Reporter.run()`. The dataset rows are passed to `_trigger_review_event` to trigger your \
       analyzer's `analyze()`. The result is passed to `_generate_reports()`. If you need to \
       summarize your reports, override the `_finalize` method.

       If you want to create several reports (e.g. separate train and test reports), you should \
       override both `get_report_names()` and `_generate_reports()`. A minimal sketch of these \
       two steps is given after the class below.
    """

    _log = logging.getLogger("Reporter")

    inspected_analyzer_type = None  # type: Type[Analyzer]

    def __init__(self,
                 config: Optional[dict] = None,
                 bblfsh: Optional[str] = None,
                 database: Optional[str] = None,
                 fs: Optional[str] = None):
        """
        Initialize a new `Reporter` instance.

        You should provide `database` and `fs` in order to re-use existing models (no training).

        :param config: Analyzer configuration for push and review events. The analyzer uses \
                       default config if not provided.
        :param bblfsh: Babelfish endpoint to use by lookout-sdk.
        :param database: Database endpoint to use to read and store information about models. \
            Sqlite3 database in a temporary file is used if not provided.
        :param fs: Model repository file system root. Temporary directory is used if not provided.
        """
        if self.inspected_analyzer_type is None or \
                not issubclass(self.inspected_analyzer_type, Analyzer):
            raise AttributeError(
                "inspected_analyzer_type attribute must be set to an Analyzer subclass in %s,"
                " got %s." % (type(self), repr(self.inspected_analyzer_type)))
        self._config = config
        self._bblfsh = bblfsh
        self._database = database
        self._fs = fs
        self._failures = {}

    def __enter__(self) -> "Reporter":
        self._tmpdir = tempfile.mkdtemp("reporter-") \
            if self._database is None or self._fs is None else None
        if self._database is None:
            self._database = os.path.join(self._tmpdir, "db.sqlite3")
        if self._fs is None:
            self._fs = os.path.join(self._tmpdir, "models")
        os.makedirs(self._fs, exist_ok=True)
        self._analyzer_context_manager = AnalyzerContextManager(
            self.inspected_analyzer_type,
            db=self._database,
            fs=self._fs,
            init=False)
        self._analyzer_context_manager.__enter__()
        return self

    def __exit__(self, exc_type=None, exc_val=None, exc_tb=None):
        self._analyzer_context_manager.__exit__()
        if self._tmpdir:
            shutil.rmtree(self._tmpdir)

    def run(self, dataset: Sequence[Dict[str, Any]]) -> Iterator[Dict[str, str]]:
        """
        Run report generation.

        :param dataset: The dataset for the report generation. The format is a list of data \
                        rows, where each row is a dictionary mapping column names to their content.

        :return: Iterator through the generated reports. Each generated report is extended with \
                 the corresponding row data from the dataset.
        """
        self._failures = {}

        def _run(dataset) -> Iterator[Dict[str, str]]:
            for index, row in enumerate(
                    huge_progress_bar(dataset, self._log, self._get_row_repr), start=1):
                self._log.info("processing %d / %d (%s)", index, len(dataset), row)
                try:
                    fixes = self._trigger_review_event(row)
                    reports = self._generate_reports(row, fixes)
                    reports.update(row)
                    yield reports
                except Exception:
                    self._log.exception(
                        "failed to generate report %d / %d (%s)", index,
                        len(dataset), row)
                    self._failures[index] = row

        yield from self._finalize(_run(dataset))

    @classmethod
    def get_report_names(cls) -> Tuple[str, ...]:
        """
        Get all available report names.

        :return: Tuple with report names.
        """
        raise NotImplementedError()

    def _generate_reports(
        self,
        dataset_row: Dict[str, Any],
        fixes: Sequence[NamedTuple],
    ) -> Dict[str, str]:
        """
        Generate reports for a dataset row.

        :param dataset_row: Dataset row which triggered the analyze method of the analyzer.
        :param fixes: List of data provided by the analyze method of spied analyzer.
        :return: Dictionary with report names as keys and report string as values.
        """
        raise NotImplementedError()

    def _trigger_review_event(
            self, dataset_row: Dict[str, Any]) -> Sequence[NamedTuple]:
        """
        Trigger review event and convert provided comments to an internal representation.

        This function is required to call `Reporter._analyzer_context_manager.review()` with \
        the arguments you need and to convert the provided comments to a Sequence of \
        NamedTuple-s for the report generation.

        :param dataset_row: Dataset row with information required to run \
                            `analyzer_context_manager.review()`.
        :return: Sequence of data extracted from comments to generate report.
        """
        raise NotImplementedError()

    def _finalize(self, reports: Iterable[Dict[str, str]]) -> Iterator[Dict[str, str]]:
        """
        Extend or summarize the generated reports.

        The function does not change the reports by default.

        :param reports: Iterable with generated reports.
        :return: New finalized reports.
        """
        yield from reports

    @staticmethod
    def _get_package_version():
        """Return lookout-style package version or "local" if it is a git repository."""
        if (Path(__file__).parents[2] / ".git").exists():
            return "local"
        else:
            return lookout.style.__version__

    @staticmethod
    def _get_commit():
        """Return current head commit hash if you run inside git repository."""
        if Reporter._get_package_version() != "local":
            return "0" * 40
        clean_status = porcelain.GitStatus(
            staged={"delete": [], "add": [], "modify": []},
            unstaged=[],
            untracked=[])
        repo_path = str(Path(__file__).parents[2])
        head = dulwich.repo.Repo(repo_path).head().decode()
        if porcelain.status(repo_path) == clean_status:
            return head
        else:
            return "%s (dirty)" % head

    @staticmethod
    def _get_row_repr(dataset_row: Dict[str, Any]) -> str:
        """Convert dataset row to its representation for logging purposes."""
        return repr(dataset_row)[:37] + "..."
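A minimal sketch of the two steps described in the class docstring, under stated assumptions: QualityReportAnalyzer stands in for a real SpyAnalyzer, the dataset columns ("repo", "from", "to"), the FixStub record and the placeholder paths are all illustrative and not part of the original source.

from typing import Any, Dict, NamedTuple, Sequence, Tuple


class FixStub(NamedTuple):
    """Hypothetical record extracted from the analyzer's comments for one dataset row."""
    repo: str
    comment_count: int


class MyReporter(Reporter):
    # Step 1 (not shown here): a real reporter would point at a SpyAnalyzer subclass whose
    # analyze() exposes the data needed for the report, as TyposAnalyzerSpy does.
    inspected_analyzer_type = QualityReportAnalyzer

    @classmethod
    def get_report_names(cls) -> Tuple[str, ...]:
        return ("quality",)

    def _trigger_review_event(self, dataset_row: Dict[str, Any]) -> Sequence[NamedTuple]:
        # Run the review event for one dataset row; how the spy's comments are captured is
        # project-specific and omitted here. self._config could be forwarded via config_json.
        self._analyzer_context_manager.review(
            fr=dataset_row["from"], to=dataset_row["to"], git_dir=dataset_row["repo"],
            log_level="warning", bblfsh=self._bblfsh)
        return [FixStub(repo=dataset_row["repo"], comment_count=0)]

    def _generate_reports(self, dataset_row: Dict[str, Any],
                          fixes: Sequence[NamedTuple]) -> Dict[str, str]:
        return {"quality": "%s: %d fixes" % (dataset_row["repo"], len(fixes))}


with MyReporter() as reporter:
    dataset = [{"repo": "/path/to/repo", "from": "<base-sha>", "to": "<head-sha>"}]
    for report in reporter.run(dataset):
        print(report["quality"])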