def get_vnodes_number(repository: str, from_commit: str, to_commit: str,
                      context: AnalyzerContextManager, bblfsh: Optional[str]) -> int:
    """
    Calculate the expected number of vnodes for a repository.

    The count is captured by temporarily monkey-patching
    `FeatureExtractor._convert_files_to_xy`: the patched version records the vnodes total and
    then raises on purpose, so no model is actually trained.

    :param repository: URL of repository.
    :param from_commit: Hash of the base commit.
    :param to_commit: Hash of the head commit.
    :param context: AnalyzerContextManager instance to query analyzer.
    :param bblfsh: Babelfish server address to use. Specify None to use the default value.
    :return: expected vnodes number for a repository.
    """
    expected_vnodes_number = -1

    def _convert_files_to_xy(self, parsed_files):
        nonlocal expected_vnodes_number
        if expected_vnodes_number != -1:
            raise RuntimeError("_files_to_xy should be called only one time.")
        expected_vnodes_number = sum(len(vn) for vn, _, _ in parsed_files)
        raise RuntimeError("Forced FormatAnalyser.train call stop.")

    # Bug fix: take the backup BEFORE entering the try block. Previously the backup was
    # assigned inside the try, so any failure before that assignment made the finally
    # clause raise NameError and mask the original error.
    _convert_files_to_xy_backup = FeatureExtractor._convert_files_to_xy
    try:
        FeatureExtractor._convert_files_to_xy = _convert_files_to_xy
        with tempfile.TemporaryDirectory(prefix="top-repos-quality-repos-") as tmpdirname:
            git_dir = ensure_repo(repository, tmpdirname)
            try:
                context.push(fr=from_commit, to=to_commit, git_dir=git_dir,
                             log_level="warning", bblfsh=bblfsh)
            except subprocess.CalledProcessError:
                # Expected: the patched _convert_files_to_xy raises deliberately to
                # abort training as soon as the count is captured.
                pass
    finally:
        # Always restore the original method, even if the push failed early.
        FeatureExtractor._convert_files_to_xy = _convert_files_to_xy_backup
    return expected_vnodes_number
class BaseAnalyzerIntegrationTests(unittest.TestCase):
    """Base TestCase that enters an AnalyzerContextManager and captures emitted log records."""

    def setUp(self, fs=None):
        """Create a temporary db/model store, enter the analyzer context, install a log spy."""
        self.db = tempfile.NamedTemporaryFile(dir=self.base_dir)
        self.fs = tempfile.TemporaryDirectory(dir=self.base_dir) if fs is None else fs
        self.context = AnalyzerContextManager(
            FormatAnalyzer, db=self.db.name, fs=self.fs.name).__enter__()
        captured = []
        self.logs = captured

        class ShadowHandler(logging.Handler):
            def emit(self, record):
                # Reuse the root logger's first handler's formatter so the captured
                # text matches what that handler would have produced.
                captured.append(logging.getLogger().handlers[0].format(record))

        self.log_handler = ShadowHandler()
        logging.getLogger().addHandler(self.log_handler)

    def tearDown(self, fs_cleanup=True):
        """Leave the analyzer context and remove the log spy; optionally wipe the fs dir."""
        if fs_cleanup:
            self.fs.cleanup()
        self.context.__exit__()
        logging.getLogger().removeHandler(self.log_handler)
def __enter__(self) -> "Reporter":
    """Allocate temporary db/fs locations when missing and enter the analyzer context."""
    needs_tmpdir = self._database is None or self._fs is None
    self._tmpdir = tempfile.mkdtemp("reporter-") if needs_tmpdir else None
    if self._database is None:
        self._database = os.path.join(self._tmpdir, "db.sqlite3")
    if self._fs is None:
        self._fs = os.path.join(self._tmpdir, "models")
    os.makedirs(self._fs, exist_ok=True)
    context = AnalyzerContextManager(self.inspected_analyzer_type,
                                     db=self._database, fs=self._fs, init=False)
    self._analyzer_context_manager = context
    context.__enter__()
    return self
def evaluate_smoke_entry(inputpath: str, reportdir: str, database: str, bblfsh: str,
                         config: dict) -> None:
    """
    CLI entry point.

    Run every repository listed in `<inputpath>/index.csv` through
    SmokeEvalFormatAnalyzer and print a statistical summary of the resulting report.

    :param inputpath: Directory with the smoke dataset; must contain index.csv.
    :param reportdir: Directory where report.csv is written.
    :param database: sqlite3 database path for analyzer models. A temporary file inside \
                     `inputpath` is created if None.
    :param bblfsh: Babelfish server address to use.
    :param config: Extra configuration merged into the analyzer config per repository.
    """
    start_time = time.time()
    report_filename = os.path.join(reportdir, "report.csv")
    log = logging.getLogger("evaluate_smoke")
    if database is None:
        # Keep a reference to the NamedTemporaryFile so it lives for the whole run.
        db = tempfile.NamedTemporaryFile(dir=inputpath, prefix="db", suffix=".sqlite3")
        database = db.name
        # Fix: pass lazy %-args to logging instead of pre-formatting with "%".
        log.info("Database %s created", database)
    else:
        if os.path.exists(database):
            log.info("Found existing database %s", database)
        else:
            log.info("Database %s not found and will be created.", database)
    with tempfile.TemporaryDirectory(dir=inputpath) as fs:
        with AnalyzerContextManager(SmokeEvalFormatAnalyzer, db=database, fs=fs) as server:
            inputpath = Path(inputpath)
            index_file = inputpath / "index.csv"
            os.makedirs(reportdir, exist_ok=True)
            # Write the CSV header once; the analyzer appends rows as it reviews.
            with open(report_filename, "w") as report:
                csv.DictWriter(
                    report, fieldnames=SmokeEvalFormatAnalyzer.REPORT_COLNAMES,
                ).writeheader()
            with open(str(index_file)) as index:
                reader = csv.DictReader(index)
                for row in tqdm(reader):
                    repopath = inputpath / row["repo"]
                    config_json = {
                        SmokeEvalFormatAnalyzer.name: merge_dicts(config, {
                            "style_name": row["style"],
                            "report_path": reportdir,
                        })}
                    server.review(fr=row["from"], to=row["to"], git_dir=str(repopath),
                                  log_level="warning", bblfsh=bblfsh,
                                  config_json=config_json)
        log.info("Quality report saved to %s", reportdir)
    report = pandas.read_csv(report_filename)
    with pandas.option_context("display.max_columns", 10,
                               "display.expand_frame_repr", False):
        print(report.describe())
    log.info("Time spent: %.3f", time.time() - start_time)
def setUp(self, fs=None):
    """Prepare temporary db/model storage, enter the analyzer context and spy on logging."""
    self.db = tempfile.NamedTemporaryFile(dir=self.base_dir)
    self.fs = tempfile.TemporaryDirectory(dir=self.base_dir) if fs is None else fs
    self.context = AnalyzerContextManager(
        FormatAnalyzer, db=self.db.name, fs=self.fs.name).__enter__()
    captured = []
    self.logs = captured

    class ShadowHandler(logging.Handler):
        def emit(self, record):
            # Reuse the root logger's first handler's formatter for the captured text.
            captured.append(logging.getLogger().handlers[0].format(record))

    self.log_handler = ShadowHandler()
    logging.getLogger().addHandler(self.log_handler)
def test_train_review_analyzer_integration(self):
    """Integration test: a review event drives QualityReportAnalyzer end to end."""
    config_json = {QualityReportAnalyzer.name: get_config()}
    with AnalyzerContextManager(analyzer=QualityReportAnalyzer, db=self.db.name,
                                fs=self.fs.name) as context:
        context.review(FROM_COMMIT, TO_COMMIT, git_dir=self.jquery_dir,
                       config_json=config_json)
def calc_expected_vnodes_number_entry(input: str, output: str, runs: int) -> None:
    """
    Entry point for `python -m lookout.style.format calc-expected-support` command.

    :param input: csv file with repositories for quality report. Should contain url, to and \
                  from columns.
    :param output: Path to a output csv file.
    :param runs: Repeat number to ensure the result correctness.
    """
    log = logging.getLogger("expected_vnodes_number")
    # Errors are mirrored to a side file next to the output for later inspection.
    handler = logging.handlers.RotatingFileHandler(output + ".errors")
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)

    repositories = list(csv.DictReader(handle_input_arg(input)))
    try:
        bblfsh = _restart_bblfshd(first_run=True)
        for cur_run in range(runs):
            with tempfile.TemporaryDirectory() as tmpdirname:
                database = os.path.join(tmpdirname, "db.sqlite3")
                fs = os.path.join(tmpdirname, "models")
                # Bug fix: was `exist_ok=fs` — a path string passed where a bool belongs;
                # it only worked because a non-empty string is truthy.
                os.makedirs(fs, exist_ok=True)
                with AnalyzerContextManager(FormatAnalyzer, db=database, fs=fs,
                                            init=False) as server:
                    for row in tqdm(repositories):
                        try:
                            vnodes_number = get_vnodes_number(
                                row["url"], to_commit=row["to"], from_commit=row["from"],
                                context=server, bblfsh=bblfsh)
                            log.info("%d/%d run. Expected vnodes number for %s is %d.",
                                     cur_run + 1, runs, row["url"], vnodes_number)
                            # On repeat runs, a differing count marks the repo unreliable.
                            if row.get("vnodes_number", vnodes_number) != vnodes_number:
                                log.warning("vnodes number is different for %d/%d run. Get %d "
                                            "instead of %d. Set to nan.", cur_run + 1, runs,
                                            vnodes_number, row["vnodes_number"])
                                row["vnodes_number"] = float("nan")
                            else:
                                row["vnodes_number"] = vnodes_number
                        except Exception:
                            log.exception("-" * 20 + "\nFailed to process %s repo",
                                          row["url"])
                            continue
            # Restart bblfshd between runs to avoid state leaking across repetitions.
            bblfsh = _restart_bblfshd()
    finally:
        _stop_bblfshd()

    fieldnames = ["url", "to", "from", "vnodes_number"]
    with open(output, "w") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for row in repositories:
            writer.writerow(row)
def measure_quality(repository: str, from_commit: str, to_commit: str,
                    context: AnalyzerContextManager, config: dict, bblfsh: Optional[str],
                    vnodes_expected_number: Optional[int], restarts: int=3) -> Dict[str, str]:
    """
    Generate `QualityReport` for a repository. If it fails it returns empty reports.

    :param repository: URL of repository.
    :param from_commit: Hash of the base commit.
    :param to_commit: Hash of the head commit.
    :param context: LookoutSDK instance to query analyzer.
    :param config: config for FormatAnalyzer.
    :param bblfsh: Babelfish server address to use. Specify None to use the default value.
    :param vnodes_expected_number: Specify number for expected number of vnodes if known. \
                                   report collection will be restarted if number of extracted \
                                   vnodes does not match.
    :param restarts: Number of restarts if number of extracted vnodes does not match.
    :return: Dictionary with all QualityReport reports.
    """
    log = logging.getLogger("QualityAnalyzer")
    # This dirty hack should be removed as soon as
    # https://github.com/src-d/style-analyzer/issues/557 resolved.
    sum_vnodes_number = 0
    call_numbers = 0
    _convert_files_to_xy_backup = FeatureExtractor._convert_files_to_xy

    def _convert_files_to_xy(self, parsed_files):
        nonlocal sum_vnodes_number, call_numbers
        call_numbers += 1
        sum_vnodes_number += sum(len(vn) for vn, _, _ in parsed_files)
        # sum_vnodes_number + 1 because of whatever reason if you extract test and train
        # separately you have -1 vnode
        # TODO (zurk): investigate ^
        if call_numbers == 2 and sum_vnodes_number + 1 != vnodes_expected_number:
            raise RestartReport("VNodes number does not match to expected: %d != %d:" % (
                sum_vnodes_number, vnodes_expected_number))
        log.info("VNodes number match to expected %d.", vnodes_expected_number)
        return _convert_files_to_xy_backup(self, parsed_files)

    reports = {}

    def capture_reports(func):
        # Wrap generate_reports so the produced reports are captured exactly once.
        @functools.wraps(func)
        def wrapped_capture_quality_reports(*args, **kwargs):
            nonlocal reports
            if reports:
                raise RuntimeError("generate_reports should be called only one time.")
            result = func(*args, **kwargs)
            reports = result
            return result
        wrapped_capture_quality_reports.original = func
        return wrapped_capture_quality_reports

    try:
        QualityReportAnalyzer.generate_reports = \
            capture_reports(QualityReportAnalyzer.generate_reports)
        if vnodes_expected_number:
            log.info("Vnodes expected number is equal to %d", vnodes_expected_number)
            FeatureExtractor._convert_files_to_xy = _convert_files_to_xy
        with tempfile.TemporaryDirectory(prefix="top-repos-quality-repos-") as tmpdirname:
            git_dir = ensure_repo(repository, tmpdirname)
            for attempt_number in range(restarts):
                # Bug fix: reset BOTH per-attempt counters. Previously only
                # sum_vnodes_number was reset, so call_numbers kept growing and the
                # `call_numbers == 2` check above could never fire again after the
                # first failed attempt, silently disabling the vnodes validation.
                sum_vnodes_number = -1
                call_numbers = 0
                try:
                    context.push(fr=from_commit, to=to_commit, git_dir=git_dir,
                                 log_level="warning", bblfsh=bblfsh, config_json=config)
                    break
                except subprocess.CalledProcessError:
                    # Assume that we failed because VNodes number does not match to expected one
                    log.warning("%d/%d try to train the model failed.",
                                attempt_number, restarts)
            else:
                raise RuntimeError("Run out of %d attempts. Failed to train proper model for %s."
                                   % (restarts, repository))
            context.review(fr=from_commit, to=to_commit, git_dir=git_dir,
                           log_level="warning", bblfsh=bblfsh, config_json=config)
    finally:
        # Always undo both monkey-patches, even on failure.
        QualityReportAnalyzer.generate_reports = \
            QualityReportAnalyzer.generate_reports.original
        if vnodes_expected_number:
            FeatureExtractor._convert_files_to_xy = _convert_files_to_xy_backup
    return reports
def generate_quality_report(input: str, output: str, force: bool, bblfsh: str, config: dict,
                            database: Optional[str] = None, fs: Optional[str] = None) -> None:
    """
    Generate quality report for the given data. Entry point for command line interface.

    :param input: csv file with repositories to make report. Should contain url, to and from \
                  columns.
    :param output: Directory where to save results.
    :param force: force to overwrite results stored in output directory if True. \
                  Stored results will be used if False.
    :param bblfsh: bblfsh address to use.
    :param config: config for FormatAnalyzer.
    :param database: sqlite3 database path to store the models. Temporary file is used if not \
                     set.
    :param fs: Model repository file system root. Temporary directory is used if not set.
    :return: None.
    """
    os.makedirs(output, exist_ok=True)
    assert os.path.isdir(output), "Output should be a directory"
    log = logging.getLogger("QualityAnalyzer")
    # Mirror ERROR-level records to a file in the output directory for post-mortem review.
    handler = logging.handlers.RotatingFileHandler(os.path.join(output, "errors.txt"))
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)
    reports = []
    config = {QualityReportAnalyzer.name: merge_dicts(config, {"aggregate": True})}
    repositories = list(csv.DictReader(handle_input_arg(input)))
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Fall back to throwaway locations when no persistent database/fs is given.
        database = database if database else os.path.join(tmpdirname, "db.sqlite3")
        fs = fs if fs else os.path.join(tmpdirname, "models")
        os.makedirs(fs, exist_ok=True)
        with AnalyzerContextManager(QualityReportAnalyzer, db=database, fs=fs,
                                    init=False) as context:
            for row in huge_progress_bar(repositories, log, lambda row: row["url"]):
                path_tmpl = os.path.join(output, get_repo_name(row["url"])) + "-%s_report.md"
                try:
                    if force or not any(os.path.exists(path_tmpl % name)
                                        for name in
                                        QualityReportAnalyzer.get_report_names()):
                        # No cached reports (or --force given): run the full measurement.
                        vnodes_expected_number = int(row["vnodes_number"]) \
                            if "vnodes_number" in row else None
                        report = measure_quality(
                            row["url"], to_commit=row["to"], from_commit=row["from"],
                            context=context, config=config, bblfsh=bblfsh,
                            vnodes_expected_number=vnodes_expected_number)
                        for report_name in report:
                            with open(path_tmpl % report_name, "w", encoding="utf-8") as f:
                                f.write(report[report_name])
                        reports.append((row["url"], report))
                    else:
                        # Reuse previously stored reports; skip the repo if any file is
                        # missing (the for/else below only appends when no `break` ran).
                        report = {}
                        log.info("Found existing reports for %s in %s", row["url"], output)
                        for report_name in QualityReportAnalyzer.get_report_names():
                            report_path = path_tmpl % report_name
                            if not os.path.exists(report_path):
                                log.warning(
                                    "skipped %s. %s report is missing", row["url"],
                                    report_name)
                                break
                            with open(path_tmpl % report_name, encoding="utf-8") as f:
                                report[report_name] = f.read()
                        else:
                            reports.append((row["url"], report))
                except Exception:
                    # Keep going on individual repo failures; details go to errors.txt.
                    log.exception("-" * 20 + "\nFailed to process %s repo", row["url"])
                    continue
    for report_name in ("train", "test"):
        summary = _generate_report_summary(reports, report_name)
        log.info("\n%s\n%s", report_name, summary)
        summary_loc = os.path.join(output, "summary-%s_report.md" % report_name)
        with open(summary_loc, "w", encoding="utf-8") as f:
            f.write(summary)
class Reporter:
    """
    Base class to create performance reports for the analyzer.

    To create a reporter for your Analyzer you should make two steps.
    1. Inherit SpyAnalyzer from an Analyzer you want to evaluate. SpyAnalyzer's `analyze` \
       function should be overridden to return all the information you need for the following \
       evaluation of the `Comment`-s. Refer to `TyposAnalyzerSpy` as an example.
    2. Inherit MyReporter from this Reporter class. Set created SpyAnalyzer to \
       `inspected_analyzer_type` attribute. You should have a dataset that you feed to \
       `Reporter.run()`. The dataset rows are passed to `_trigger_review_event` to trigger \
       your analyzer's `analyze()`. The result is passed to `_generate_reports()`. If you \
       need to summarize your reports, override `_finalize` method.

    If you want to create several reports (e.g. separate train and test reports) you should \
    override both `get_report_names()` and `_generate_reports()`.
    """

    _log = logging.getLogger("Reporter")

    # Subclasses must set this to the spied Analyzer subclass under evaluation.
    inspected_analyzer_type = None  # type: Type[Analyzer]

    def __init__(self, config: Optional[dict] = None, bblfsh: Optional[str] = None,
                 database: Optional[str] = None, fs: Optional[str] = None):
        """
        Initialize a new `Reporter` instance.

        You should provide `database` and `fs` in order to re-use existing models \
        (no training).

        :param config: Analyzer configuration for push and review events. The analyzer uses \
                       default config if not provided.
        :param bblfsh: Babelfish endpoint to use by lookout-sdk.
        :param database: Database endpoint to use to read and store information about models. \
                         Sqlite3 database in a temporary file is used if not provided.
        :param fs: Model repository file system root. Temporary directory is used if not \
                   provided.
        :raises AttributeError: If `inspected_analyzer_type` is unset or not an Analyzer \
                                subclass.
        """
        if self.inspected_analyzer_type is None or \
                not issubclass(self.inspected_analyzer_type, Analyzer):
            raise AttributeError(
                "inspected_analyzer_type attribute must be set to an Analyzer subclass in %s,"
                " got %s." % (type(self), repr(self.inspected_analyzer_type)))
        self._config = config
        self._bblfsh = bblfsh
        self._database = database
        self._fs = fs
        self._failures = {}

    def __enter__(self) -> "Reporter":
        # A temporary directory backs whichever of database/fs was not provided.
        self._tmpdir = tempfile.mkdtemp("reporter-") \
            if self._database is None or self._fs is None else None
        if self._database is None:
            self._database = os.path.join(self._tmpdir, "db.sqlite3")
        if self._fs is None:
            self._fs = os.path.join(self._tmpdir, "models")
        os.makedirs(self._fs, exist_ok=True)
        self._analyzer_context_manager = AnalyzerContextManager(
            self.inspected_analyzer_type, db=self._database, fs=self._fs, init=False)
        self._analyzer_context_manager.__enter__()
        return self

    def __exit__(self, exc_type=None, exc_val=None, exc_tb=None):
        # Leave the analyzer context first, then remove our temporary directory (if any).
        self._analyzer_context_manager.__exit__()
        if self._tmpdir:
            shutil.rmtree(self._tmpdir)

    def run(self, dataset: Sequence[Dict[str, Any]]) -> Iterator[Dict[str, str]]:
        """
        Run report generation.

        :param dataset: The dataset for the report generation. The format is a list of data \
                        rows. The row is a Dictionary with a mapping from the column name to \
                        its content.
        :return: Iterator through generated reports. Each Generated report is extended with \
                 the corresponding row data from the dataset.
        """
        self._failures = {}

        def _run(dataset) -> Iterator[Dict[str, str]]:
            for index, row in enumerate(huge_progress_bar(
                    dataset, self._log, self._get_row_repr), start=1):
                self._log.info("processing %d / %d (%s)", index, len(dataset), row)
                try:
                    fixes = self._trigger_review_event(row)
                    reports = self._generate_reports(row, fixes)
                    reports.update(row)
                    yield reports
                except Exception:
                    # A failed row is recorded in _failures and skipped; the rest still run.
                    self._log.exception(
                        "failed to generate report %d / %d (%s)", index, len(dataset), row)
                    self._failures[index] = row

        yield from self._finalize(_run(dataset))

    @classmethod
    def get_report_names(cls) -> Tuple[str, ...]:
        """
        Get all available report names.

        :return: Tuple with report names.
        """
        raise NotImplementedError()

    def _generate_reports(self, dataset_row: Dict[str, Any],
                          fixes: Sequence[NamedTuple],
                          ) -> Dict[str, str]:
        """
        Generate reports for a dataset row.

        :param dataset_row: Dataset row which triggered the analyze method of the analyzer.
        :param fixes: List of data provided by the analyze method of spied analyzer.
        :return: Dictionary with report names as keys and report string as values.
        """
        raise NotImplementedError()

    def _trigger_review_event(self, dataset_row: Dict[str, Any]) -> Sequence[NamedTuple]:
        """
        Trigger review event and convert provided comments to an internal representation.

        It is required to call `Reporter._analyzer_context_manager.review()` in this \
        function with arguments you need and convert provided comments to a Sequence of \
        NamedTuple-s for the report generation.

        :param dataset_row: Dataset row with information required to run \
                            `analyzer_context_manager.review()`.
        :return: Sequence of data extracted from comments to generate report.
        """
        raise NotImplementedError()

    def _finalize(self, reports: Iterable[Dict[str, str]]) -> Iterator[Dict[str, str]]:
        """
        Extend or summarize the generated reports.

        The function does not change the reports by default.

        :param reports: Iterable with generated reports.
        :return: New finalized reports.
        """
        yield from reports

    @staticmethod
    def _get_package_version():
        """Return lookout-style package version or "local" if it is a git repository."""
        if (Path(__file__).parents[2] / ".git").exists():
            return "local"
        else:
            return lookout.style.__version__

    @staticmethod
    def _get_commit():
        """Return current head commit hash if you run inside git repository."""
        if Reporter._get_package_version() != "local":
            return "0" * 40
        # An empty GitStatus is what a clean working tree compares equal to.
        clean_status = porcelain.GitStatus(staged={
            "delete": [], "add": [], "modify": []
        }, unstaged=[], untracked=[])
        repo_path = str(Path(__file__).parents[2])
        head = dulwich.repo.Repo(repo_path).head().decode()
        if porcelain.status(repo_path) == clean_status:
            return head
        else:
            return "%s (dirty)" % head

    @staticmethod
    def _get_row_repr(dataset_row: Dict[str, Any]) -> str:
        """Convert dataset row to its representation for logging purposes."""
        return repr(dataset_row)[:37] + "..."