def test_quality_report_noisy(self):
    slogging.setup("DEBUG", False)
    with Capturing() as output:
        try:
            quality_report_noisy(bblfsh=self.bblfsh,
                                 language=self.language,
                                 confidence_threshold=0.8,
                                 support_threshold=20,
                                 precision_threshold=0.95,
                                 dir_output=tempfile.tempdir,
                                 repos=REPOSITORIES)
        except SystemExit:
            self.skipTest("Matplotlib is required to run this test")
    pattern = re.compile(
        r"((?:prediction rate x)|(?:precision y)): \[(\d+\.\d+(, \d+\.\d+)+)\]")
    metrics = {}
    for line in output:
        match = pattern.search(line)
        if match:
            metric, scores_string = list(match.groups())[:2]
            scores_string = scores_string.split(", ")
            scores = [float(f) for f in scores_string]
            metrics[metric] = scores
    self.assertGreater(metrics["prediction rate x"][-1], 0)
    self.assertGreater(metrics["precision y"][-1], 0)
def test_config(self):
    slogging.setup("INFO", True, "XXX.yml")
    with tempfile.NamedTemporaryFile() as f:
        f.write(b"FormatAnalyzer: INFO\nRules: INFO\nTrainableRules: INFO\n")
        f.flush()
        slogging.setup("INFO", True, f.name)
def run_analyzers(args):
    """
    Launch the service with the specified analyzers. Blocks until a KeyboardInterrupt.

    :param args: Parsed command line arguments.
    :return: None
    """
    slogging.setup(args.log_level, args.log_structured, args.log_config_path)
    log = logging.getLogger("run")
    model_repository = create_model_repo_from_args(args)
    log.info("Created %s", model_repository)
    if args.request_server == "auto":
        data_request_address = "%s:10301" % args.server.split(":")[0]
    else:
        data_request_address = args.request_server
    data_service = DataService(data_request_address)
    log.info("Created %s", data_service)
    manager = AnalyzerManager(
        analyzers=[importlib.import_module(a).analyzer_class for a in args.analyzer],
        model_repository=model_repository,
        data_service=data_service,
    )
    log.info("Created %s", manager)
    listener = EventListener(address=args.server, handlers=manager, n_workers=args.workers)
    log.info("Created %s", listener)
    listener.start()
    log.info("Listening %s", args.server)
    listener.block()
    model_repository.shutdown()
    data_service.shutdown()
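# A minimal usage sketch for run_analyzers, assuming invocation outside the CLI.
# The attribute names below mirror exactly what run_analyzers reads; the concrete
# values are placeholders, and create_model_repo_from_args(args) may require
# additional attributes not shown here.
import argparse


def _example_run_analyzers():
    args = argparse.Namespace(
        log_level="INFO",
        log_structured=False,
        log_config_path=None,
        request_server="auto",              # derive the data endpoint from --server
        server="0.0.0.0:2000",              # address the EventListener binds to
        analyzer=["lookout.style.format"],  # modules that expose analyzer_class
        workers=1,
        # ... plus whatever create_model_repo_from_args(args) expects
    )
    run_analyzers(args)  # blocks until KeyboardInterrupt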
def print_reports(input_pattern: str, bblfsh: str, language: str, model_path: str,
                  config: Union[str, dict] = "{}", log_level: str = "INFO") -> None:
    """Print quality and model reports for a given model on a given dataset."""
    slogging.setup(log_level, False)
    log = logging.getLogger("quality_report")
    config = config if isinstance(config, dict) else json.loads(config)
    for report in analyze_files(QualityReportAnalyzer, config, model_path, language,
                                bblfsh, input_pattern, log):
        print(report.text)
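# A hypothetical invocation of print_reports; the glob pattern, model path and
# Babelfish endpoint are placeholders, not values from the source.
print_reports(
    input_pattern="project/**/*.js",  # files to evaluate the model on
    bblfsh="0.0.0.0:9432",            # Babelfish server address
    language="javascript",
    model_path="model.asdf",
    config="{}",                      # JSON string; parsed inside if not a dict
)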
def init_repo(args):
    """
    Initialize the model repository.

    :param args: Parsed command line arguments.
    :return: None
    """
    slogging.setup(args.log_level, False, args.log_config_path)
    repo = create_model_repo_from_args(args)
    repo.init()
def package_cmdline_entry(args: argparse.Namespace) -> Union[None, int]:  # noqa: D401
    """
    Package several analyzers to a Docker container and write a sample Docker Compose \
    config for Lookout.

    :param args: Parsed command line arguments.
    :return: None or error code.
    """
    slogging.setup(args.log_level, False, args.log_config_path)
    return package(args.yes, args.no, args.workdir, args.analyzer, args.requirements,
                   args.repo, args.user, args.token)
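# A hypothetical call to package_cmdline_entry. Every attribute below is one
# the function actually forwards to package(); the values, and the assumption
# that yes/no are confirmation flags, are placeholders.
_package_args = argparse.Namespace(
    log_level="INFO",
    log_config_path=None,
    yes=True,                # assumed: proceed without interactive confirmation
    no=False,
    workdir="package",       # directory for the Docker/Compose output
    analyzer=["lookout.style.format"],
    requirements="requirements.txt",
    repo="example/repo",
    user="user",
    token="<token>",         # supply a real API token here
)
exit_code = package_cmdline_entry(_package_args)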
def main():
    setup("DEBUG", False)
    parser = ArgumentParser()
    parser.add_argument("training_dir",
                        help="Path to the directory containing the files to train from.")
    parser.add_argument("output_path", help="Path to the model to write.")
    parser.add_argument("--bblfsh", default="0.0.0.0:9432",
                        help="Address of babelfish server.")
    parser.add_argument("--language", default="javascript", help="Language to filter on.")
    parser.add_argument("--config",
                        help="Path to a YAML file containing config to apply during training.")
    args = parser.parse_args()
    train(**vars(args))
def main():
    args = parse_args()
    slogging.setup("INFO", False)
    clients = threading.local()
    pool = ThreadPoolExecutor(max_workers=args.threads)
    log = logging.getLogger("main")
    log.info("Will parse %d files", len(args.input))
    roles = set()
    reserved = set()
    language = ""
    progress = tqdm(total=len(args.input))
    errors = False

    def analyze_file(path: str):
        nonlocal errors
        if errors:
            return
        try:
            try:
                client = clients.client
            except AttributeError:
                client = bblfsh.BblfshClient(args.bblfsh)
                clients.client = client
            response = client.parse(path)
            nonlocal language
            if not language:
                language = response.language
            elif language != response.language:
                log.warning("dropped %s - language mismatch %s != %s",
                            path, language, response.language)
                return
            analyze_uast(path, response.uast, roles, reserved)
            progress.update(1)
        except:  # noqa: E722
            log.exception("Parsing %s", path)
            errors = True

    with progress:
        for file in args.input:
            pool.submit(analyze_file, file)
        pool.shutdown()
    if errors:
        return 1
    reserved.discard("")
    log.info("Internal roles: %d", len(roles))
    log.info("Reserved: %d", len(reserved))
    generate_files(args.output, roles, reserved)
def test_structured_logging(self):
    logging.basicConfig()
    handler_backup = logging.getLogger().handlers[0]
    slogging.setup("INFO", True, "logging.yml")
    backup = sys.stdout
    sys.stdout = buffer = io.StringIO()
    try:
        logging.getLogger("test").info("hello, world!")
    finally:
        sys.stdout = backup
        logging.getLogger().handlers[0] = handler_backup
    obj = json.loads(buffer.getvalue())
    self.assertEqual(obj["level"], "info")
    self.assertEqual(obj["msg"], "hello, world!")
    self.assertEqual(obj["source"], "test_slogging.py:18")
    self.assertEqual(len(obj["thread"]), 4)
    self.assertIn("time", obj)
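# For reference, a JSON record shaped like what the assertions above expect.
# Only the field names and the asserted values are grounded in the test; the
# "thread" and "time" values are illustrative.
_example_structured_record = {
    "level": "info",
    "msg": "hello, world!",
    "source": "test_slogging.py:18",  # file:line of the logging call
    "thread": "ab12",                 # 4-character thread identifier
    "time": "2019-01-01T00:00:00Z",   # any timestamp; only presence is asserted
}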
def main():
    parser = create_parser()
    args = parser.parse_args()
    slogging.setup(args.log_level, False)
    return create_and_train_nn_prediction_from_file(**vars(args))
def main():
    """Entry point."""
    args = parse_args()
    slogging.setup(args.log_level, False)
    clients = threading.local()
    pool = ThreadPoolExecutor(max_workers=args.threads)
    log = logging.getLogger("main")
    log.info("Will parse %d files in %d threads", len(args.input), args.threads)
    internal_types = defaultdict(int)
    roles = defaultdict(int)
    reserved = set()
    language = args.parquet_language
    inputs = list(handle_input_arg(args.input))
    progress = tqdm(total=len(inputs))
    progress_lock = threading.Lock()
    errors = False

    def analyze_code_file(path: str):
        nonlocal errors
        if errors:
            return
        try:
            try:
                client = clients.client
            except AttributeError:
                client = bblfsh.BblfshClient(args.bblfsh)
                clients.client = client
            response = client.parse(path)
            nonlocal language
            if not language:
                language = response.language
            elif language != response.language:
                log.warning("dropped %s - language mismatch %s != %s",
                            path, language, response.language)
                return
            content = Path(path).read_text()
            analyze_uast(path, content, response.uast, internal_types, roles, reserved)
        except:  # noqa: E722
            log.exception("Parsing %s", path)
            errors = True
        finally:
            with progress_lock:
                progress.disable = False  # this is needed, do not remove
                progress.update(1)

    def analyze_parquet_row(row: pandas.Series, filepath: str):
        nonlocal errors
        if errors:
            return
        nonlocal language
        try:
            path = "%s:%s" % (filepath, row.path)
            analyze_uast(path, row.content.decode(errors="ignore"),
                         bblfsh.Node.FromString(row.uast),
                         internal_types, roles, reserved)
        except DecodeError as e:
            log.warning(e)
        except:  # noqa: E722
            log.exception("Parsing %s", row.path)
            errors = True
        finally:
            with progress_lock:
                progress.disable = False  # this is needed, do not remove
                progress.update(1)

    try:
        if args.parquet:
            if not language:
                raise ValueError("--parquet-language must be specified with --parquet.")
            with progress:
                for filepath in inputs:
                    try:
                        data = pandas.read_parquet(filepath)
                    except:  # noqa: E722
                        log.warning("Bad parquet file %s", filepath)
                    else:
                        analyze = partial(analyze_parquet_row, filepath=filepath)
                        for _, row in data.iterrows():
                            progress.total += 1
                            pool.submit(analyze, row)
                    progress.update(1)
        else:
            with progress:
                for filepath in inputs:
                    pool.submit(analyze_code_file, filepath)
    finally:
        pool.shutdown()
    if errors:
        return 1
    reserved.discard("")
    log.info("Internal types: %d", len(internal_types))
    log.info("UAST roles: %d", len(roles))
    log.info("Reserved: %d", len(reserved))
    roles = {bblfsh.role_name(role_id): n for role_id, n in roles.items()}
    generate_files(args.output, internal_types, roles, reserved)
def setUp(self):
    slogging.setup("DEBUG", False)
    self.bblfsh_endpoint = "0.0.0.0:9432"
def setUpClass(cls):
    slogging.setup(logging.DEBUG, False)
def main(args):
    """Entry point for quality report generation."""
    os.makedirs(args.output, exist_ok=True)
    assert os.path.isdir(args.output), "Output should be a directory"
    slogging.setup(args.log_level, False)
    log = logging.getLogger("QualityAnalyzer")
    handler = logging.handlers.RotatingFileHandler(os.path.join(args.output, "errors.txt"))
    handler.setLevel(logging.ERROR)
    log.addHandler(handler)
    if not server.exefile.exists():
        server.fetch()  # download executable
    # prepare output directory
    reports = []
    port = server.find_port()
    review_config = {QualityReportAnalyzer.name: {"aggregate": True}}
    train_config = json.loads(args.train_config)
    with tempfile.TemporaryDirectory() as tmpdirname:
        database = args.database if args.database else os.path.join(tmpdirname, "db.sqlite3")
        fs = args.fs if args.fs else os.path.join(tmpdirname, "models")
        os.makedirs(fs, exist_ok=True)
        with AnalyzerContextManager(port=port, db=database, fs=fs,
                                    analyzer="lookout.style.format.benchmarks.general_report",
                                    init=False):
            start_time = datetime.now()
            for ri, repo in enumerate(REPOSITORIES):
                repo, to_commit, from_commit = repo.split()
                now = datetime.now()
                if ri > 0:
                    left = (len(REPOSITORIES) - ri) / ri * (now - start_time)
                else:
                    left = None
                log.info("\n%s\n"
                         "= %-76s =\n"
                         "= %2d / %2d%s=\n"
                         "= Now: %-60s%s=\n"
                         "= Left: %-40s%s=\n"
                         "= Ends: %-60s%s=\n"
                         "%s",
                         "=" * 80, repo, ri + 1, len(REPOSITORIES), " " * 70,
                         now, " " * 11, left, " " * 31,
                         now + left if left is not None else None, " " * 11,
                         "=" * 80)
                report_loc = os.path.join(args.output, get_repo_name(repo))
                quality_rep_loc = report_loc + ".quality_report.md"
                model_rep_loc = report_loc + ".model_report.md"
                # generate or read report
                try:
                    if args.force or not os.path.exists(quality_rep_loc) or \
                            not os.path.exists(model_rep_loc):
                        # Skip this step if report was already generated
                        report = measure_quality(repo, to_commit=to_commit,
                                                 from_commit=from_commit, port=port,
                                                 review_config=review_config,
                                                 train_config=train_config,
                                                 bblfsh=args.bblfsh)
                        if report.quality is not None:
                            with open(quality_rep_loc, "w", encoding="utf-8") as f:
                                f.write(report.quality)
                        if report.model is not None:
                            with open(model_rep_loc, "w", encoding="utf-8") as f:
                                f.write(report.model)
                    else:
                        report = QualityReport()
                        with open(quality_rep_loc, encoding="utf-8") as f:
                            report.quality = f.read()
                        with open(model_rep_loc, encoding="utf-8") as f:
                            report.model = f.read()
                    if report.quality is not None and report.model is not None:
                        reports.append((repo, report))
                    else:
                        log.warning("skipped %s: quality %s model %s", repo,
                                    report.quality is not None, report.model is not None)
                except Exception:
                    log.exception("-" * 20 + "\nFailed to process %s repo", repo)
                    continue

    # precision, recall, f1, support, n_rules, avg_len stats
    table = []
    fields2id = OrderedDict()
    additional_fields = ("Rules Number", "Average Rule Len")
    with io.StringIO() as output:
        for repo, report in reports:
            metrics = _get_metrics(report.quality)
            if not table:
                table.append(("repo",) + metrics._fields + additional_fields)
                for i, field in enumerate(table[0]):
                    fields2id[field] = i
            n_rules, avg_len = _get_model_summary(report.model)
            table.append((get_repo_name(repo),) + metrics + (n_rules, avg_len))
        average = tuple(("%" + FLOAT_PRECISION) % calc_avg(table[1:], fields2id[field])
                        for field in metrics._fields)
        average += tuple(("%" + FLOAT_PRECISION) % calc_avg(table[1:], fields2id[field])
                         for field in additional_fields)
        fields_to_weight = (("precision", "support"), ("recall", "support"),
                            ("full_recall", "full_support"), ("f1", "support"),
                            ("full_f1", "full_support"), ("ppcr", "support"))
        weighted_average = []
        for field, weight_field in fields_to_weight:
            weighted_average.append(("%" + FLOAT_PRECISION) % calc_weighted_avg(
                table[1:], col=fields2id[field], weight_col=fields2id[weight_field]))
        table.append(("Average",) + average)
        table.append(("Weighted average",) + tuple(weighted_average))
        float_fields = ("precision", "recall", "full_recall", "f1", "full_f1", "ppcr")
        floatfmts = []
        for field in fields2id:
            if field in float_fields:
                floatfmts.append(FLOAT_PRECISION)
            else:
                floatfmts.append("g")
        print(tabulate(table, tablefmt="pipe", headers="firstrow", floatfmt=floatfmts),
              file=output)
        summary = output.getvalue()
    print(summary)
    summary_loc = os.path.join(args.output, "summary.md")
    with open(summary_loc, "w", encoding="utf-8") as f:
        f.write(summary)
def setUpClass(cls): slogging.setup("INFO", False, "")