def test_send_new_scalar(self):
    name = "a_float"
    submit_event(name, 3.1)
    self.reader.parse_data()
    # The "<name>_sum" sample accumulates every submitted value; compare
    # floats with a tolerance to avoid precision flakiness.
    self.assertAlmostEqual(self.reader.metrics["{}_sum".format(name)], 3.1)
    submit_event(name, 5.1)
    self.reader.parse_data()
    self.assertAlmostEqual(self.reader.metrics["{}_sum".format(name)], 8.2)
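# Hedged sketch of the semantics the assertions above rely on, assuming
# lookout.core.metrics is backed by prometheus_client: each scalar passed to
# submit_event() is observed by a Summary, so the exported "<name>_sum"
# sample is the running total of submitted values. The registry and metric
# below are illustrative, not the library's internals.
from prometheus_client import CollectorRegistry, Summary

registry = CollectorRegistry()
a_float = Summary("a_float", "example scalar metric", registry=registry)
a_float.observe(3.1)
a_float.observe(5.1)
# "a_float_sum" now holds 3.1 + 5.1, which is what the test reads back.
assert abs(registry.get_sample_value("a_float_sum") - 8.2) < 1e-9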
def wrapped_timeit(self, request, context: grpc.ServicerContext):
    start_time = time.perf_counter()
    context.start_time = start_time
    result = func(self, request, context)
    if not getattr(context, "error", False):
        delta = time.perf_counter() - start_time
        submit_event("request." + type(request).__name__, delta)
        self._log.info("OK %.3f", delta)
    return result
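# "func" above is a free variable: wrapped_timeit is the inner function of a
# timing decorator applied to gRPC handler methods. A minimal sketch of the
# assumed enclosing decorator (the name "timeit" is inferred from the
# wrapper's name):
import functools

import grpc


def timeit(func):
    @functools.wraps(func)
    def wrapped_timeit(self, request, context: grpc.ServicerContext):
        ...  # body as shown above
    return wrapped_timeit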
def dummy_server():
    from lookout.core.metrics import _prometheus_server as server
    if server is None:
        # The first submit_event() call lazily starts the Prometheus HTTP
        # server, so submitting a throwaway metric forces initialization;
        # an OSError here (e.g. port already in use) propagates to the caller.
        submit_event("start_server_hack", 8000)
        from lookout.core.metrics import _prometheus_server as server
        assert server is not None
    return server
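# Hedged usage sketch: tests would call dummy_server() once, e.g. in
# setUpClass, to guarantee the metrics endpoint is listening before scraping
# it. The base-class shape is an assumption, not the real test harness.
import unittest


class MetricsTestBase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.server = dummy_server()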
def get(self, model_id: str, model_type: Type[AnalyzerModel], url: str,
        ) -> Tuple[Optional[AnalyzerModel], bool]:  # noqa: D102
    cache_key = self.cache_key(model_id, model_type, url)
    submit_event("SQLAlchemyModelRepository.cache.length", len(self._cache))
    submit_event("SQLAlchemyModelRepository.cache.size", self._cache.currsize)
    with self._cache_lock:
        model = self._cache.get(cache_key)
    if model is not None:
        self._log.debug("used cache for %s with %s", model_id, url)
        submit_event("SQLAlchemyModelRepository.cache.hit", 1)
        return model, False
    submit_event("SQLAlchemyModelRepository.cache.miss", 1)
    with self._sessionmaker() as session:
        models = self._get_query(session).params(analyzer=model_id, repository=url).all()
    if len(models) == 0:
        self._log.debug("no models found for %s with %s", model_id, url)
        return None, True
    model = model_type().load(models[0].path)
    with self._cache_lock:
        self._cache[cache_key] = model
    self._log.debug("loaded %s with %s from %s", model_id, url, models[0].path)
    return model, True
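# Hedged usage sketch of the cache-aside contract above: the second element
# of the returned tuple is False on a cache hit and True when the model was
# (re)loaded from storage. The analyzer id and URL are example values, not
# constants from the project.
def fetch_format_model(repo: "SQLAlchemyModelRepository") -> Optional[AnalyzerModel]:
    model, loaded = repo.get("lookout.style.format", FormatModel,
                             "https://github.com/src-d/lookout")
    if model is None:
        return None  # no trained model for this analyzer/repository pair
    return model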
def wrapped_catch_them_all(self, request, context: grpc.ServicerContext):
    try:
        return func(self, request, context)
    except Exception as e:
        start_time = getattr(context, "start_time", None)
        if start_time is not None:
            delta = time.perf_counter() - start_time
            self._log.exception("FAIL %.3f", delta)
        else:
            self._log.exception("FAIL ?")
        context.set_code(grpc.StatusCode.INTERNAL)
        context.set_details("%s: %s" % (type(e), e))
        context.error = True
        submit_event("error", 1)
        return EventResponse()
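# As with wrapped_timeit, "func" here is closed over by an enclosing
# decorator. A minimal sketch, assuming the decorator takes no arguments
# (the name "catch_them_all" is inferred from the wrapper's name):
import functools

import grpc


def catch_them_all(func):
    @functools.wraps(func)
    def wrapped_catch_them_all(self, request, context: grpc.ServicerContext):
        ...  # body as shown above
    return wrapped_catch_them_all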
def generate_file_fixes(self, data_service: DataService, changes: Sequence[Change],
                        ) -> Iterator[FileFix]:
    """
    Generate all data required for any type of further processing.

    Next processing can be comment generation or performance report generation.

    :param data_service: Connection to the Lookout data retrieval service.
    :param changes: The list of changes in the pointed state.
    :return: Iterator with unrendered data per comment.
    """
    log = self._log
    base_files_by_lang = files_by_language(c.base for c in changes)
    head_files_by_lang = files_by_language(c.head for c in changes)
    processed_files_counter = defaultdict(int)
    processed_fixes_counter = defaultdict(int)
    for lang, head_files in head_files_by_lang.items():
        if lang not in self.model:
            log.warning("skipped %d written in %s. Rules for %s do not exist in model",
                        len(head_files), lang, lang)
            continue
        rules = self.model[lang]
        config = self.analyze_config[lang]
        rules = rules.filter_by_confidence(config["confidence_threshold"]) \
            .filter_by_support(config["support_threshold"])
        for file in filter_files(head_files, rules.origin_config["line_length_limit"],
                                 rules.origin_config["overall_size_limit"], log=log):
            processed_files_counter[lang] += 1
            try:
                prev_file = base_files_by_lang[lang][file.path]
            except KeyError:
                prev_file = None
                lines = None
            else:
                lines = sorted(chain.from_iterable((
                    find_new_lines(prev_file, file),
                    find_deleted_lines(prev_file, file),
                )))
            log.debug("%s %s", file.path, lines)
            fe = FeatureExtractor(language=lang, **rules.origin_config["feature_extractor"])
            feature_extractor_output = fe.extract_features([file], [lines])
            if feature_extractor_output is None:
                submit_event("%s.analyze.%s.parse_failures" % (self.name, lang), 1)
                if config["report_parse_failures"]:
                    log.warning("Failed to parse %s", file.path)
                    yield FileFix(error="Failed to parse", head_file=file, language=lang,
                                  feature_extractor=fe, base_file=prev_file, file_vnodes=[],
                                  line_fixes=[], y_pred_pure=None, y=None)
            else:
                fixes, file_vnodes, y_pred_pure, y = self._generate_token_fixes(
                    file, fe, feature_extractor_output, data_service.get_bblfsh(), rules)
                log.debug("%s %d fixes", file.path, len(fixes))
                processed_fixes_counter[lang] += len(fixes)
                yield FileFix(error="", head_file=file, language=lang, feature_extractor=fe,
                              base_file=prev_file, file_vnodes=file_vnodes, line_fixes=fixes,
                              y_pred_pure=y_pred_pure, y=y)
    for key, val in processed_files_counter.items():
        submit_event("%s.analyze.%s.files" % (self.name, key), val)
    for key, val in processed_fixes_counter.items():
        submit_event("%s.analyze.%s.fixes" % (self.name, key), val)
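# Hedged usage sketch: a caller (e.g. analyze() or a report generator) would
# drain the iterator above and aggregate the successful fixes. This helper is
# illustrative, not part of the analyzer.
from collections import defaultdict


def count_fixes_per_language(analyzer, data_service, changes):
    counts = defaultdict(int)
    for fix in analyzer.generate_file_fixes(data_service, changes):
        if not fix.error:
            counts[fix.language] += len(fix.line_fixes)
    return dict(counts)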
def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
          files: Iterator[File], **data) -> FormatModel:
    """
    Train a model given the files available.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data: contains "files" - the list of files in the pointed state.
    :param data_service: connection to the Lookout data retrieval service.
    :param files: iterator of File records from the data service.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    _log = logging.getLogger(cls.__name__)
    train_config = cls._load_config(config)["train"]
    _log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
              pformat(train_config, width=4096, compact=True))
    model = FormatModel().generate(cls, ptr)
    for language, files in files_by_language(files).items():
        try:
            lang_config = train_config[language]
        except KeyError:
            _log.warning("language %s is not supported, skipped", language)
            continue
        _log.info("effective train config for %s:\n%s", language,
                  pformat(lang_config, width=120, compact=True))
        random_state = lang_config["random_state"]
        files = filter_files(files, lang_config["line_length_limit"],
                             lang_config["overall_size_limit"], random_state, _log)
        submit_event("%s.train.%s.files" % (cls.name, language), len(files))
        if len(files) == 0:
            _log.info("zero files after filtering, language %s is skipped.", language)
            continue
        try:
            fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
        except ImportError:
            _log.warning("skipped %d %s files - not supported", len(files), language)
            continue
        else:
            _log.info("training on %d %s files", len(files), language)
        train_files, test_files = FormatAnalyzer.split_train_test(
            files, lang_config["test_dataset_ratio"], random_state=random_state)
        # ensure that the features are reproducible
        train_files = sorted(train_files, key=lambda x: x.path)
        test_files = sorted(test_files, key=lambda x: x.path)
        X_train, y_train, _ = fe.extract_features(train_files)
        X_train, selected_features = fe.select_features(X_train, y_train)
        if test_files:
            X_test, y_test, _ = fe.extract_features(test_files)
        if lang_config["test_dataset_ratio"]:
            _log.debug("Real test ratio is %.3f",
                       X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
                       if test_files else 0)
        lang_config["feature_extractor"]["selected_features"] = selected_features
        lang_config["feature_extractor"]["label_composites"] = fe.labels_to_class_sequences
        lower_bound_instances = lang_config["lower_bound_instances"]
        if X_train.shape[0] < lower_bound_instances:
            _log.warning("skipped %d %s files: too few samples (%d/%d)",
                         len(files), language, X_train.shape[0], lower_bound_instances)
            continue
        _log.info("extracted %d samples to train, searching for the best hyperparameters",
                  X_train.shape[0])
        optimizer = Optimizer(**lang_config["optimizer"], random_state=random_state)
        best_score, best_params = optimizer.optimize(X_train, y_train)
        if _log.isEnabledFor(logging.DEBUG):
            _log.debug("score of the best estimator found: %.6f", best_score)
            _log.debug("params of the best estimator found: %s", str(best_params))
            _log.debug("training the model with complete data")
        else:
            _log.info("finished hyperopt at %.6f, training the full model", -best_score)
        lang_config["trainable_rules"].update(best_params)
        trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                         random_state=random_state,
                                         origin_config=lang_config)
        trainable_rules.fit(X_train, y_train)
        importances = trainable_rules.feature_importances_
        _log.debug(
            "feature importances from %s:\n\t%s",
            lang_config["trainable_rules"]["base_model_name"],
            "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                        for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
        trainable_rules.prune_categorical_attributes(fe)
        _log.info("obtained %d rules, generating the classification report",
                  len(trainable_rules.rules))
        trainable_rules.rules.generate_classification_report(
            X_train, y_train, "train", fe.composite_class_representations)
        if test_files:
            trainable_rules.rules.generate_classification_report(
                X_test, y_test, "test", fe.composite_class_representations)
        submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
        if trainable_rules.rules.rules:
            model[language] = trainable_rules.rules
        else:
            _log.warning("model for %s has 0 rules. Skipped.", language)
    _log.info("trained %s", model)
    return model
def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
          **data) -> FormatModel:
    """
    Train a model given the files available.

    :param ptr: Git repository state pointer.
    :param config: configuration dict.
    :param data: contains "files" - the list of files in the pointed state.
    :param data_service: connection to the Lookout data retrieval service.
    :return: AnalyzerModel containing the learned rules, per language.
    """
    _log = logging.getLogger(cls.__name__)
    _log.info("train %s %s %s", ptr.url, ptr.commit,
              pformat(config, width=4096, compact=True))
    model = FormatModel().construct(cls, ptr)
    config = cls._load_train_config(config)
    for language, files in files_by_language(data["files"]).items():
        try:
            lang_config = config[language]
        except KeyError:
            _log.warning("language %s is not supported, skipped", language)
            continue
        files = filter_files(files, lang_config["line_length_limit"], _log)
        submit_event("%s.train.%s.files" % (cls.name, language), len(files))
        if len(files) == 0:
            _log.info("zero files after filtering, language %s is skipped.", language)
            continue
        try:
            fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
        except ImportError:
            _log.warning("skipped %d %s files - not supported", len(files), language)
            continue
        else:
            _log.info("training on %d %s files", len(files), language)
        # we sort to make the features reproducible
        X, y, _ = fe.extract_features(sorted(files, key=lambda x: x.path))
        X, selected_features = fe.select_features(X, y)
        lang_config["feature_extractor"]["selected_features"] = selected_features
        lang_config["feature_extractor"]["label_composites"] = fe.labels_to_class_sequences
        lower_bound_instances = lang_config["lower_bound_instances"]
        if X.shape[0] < lower_bound_instances:
            _log.warning("skipped %d %s files: too few samples (%d/%d)",
                         len(files), language, X.shape[0], lower_bound_instances)
            continue
        _log.debug("training the rules model")
        optimizer = Optimizer(n_jobs=lang_config["n_jobs"],
                              n_iter=lang_config["n_iter"],
                              cv=lang_config["cv"],
                              random_state=lang_config["trainable_rules"]["random_state"])
        best_score, best_params = optimizer.optimize(X, y)
        _log.debug("score of the best estimator found: %.6f", best_score)
        _log.debug("params of the best estimator found: %s", str(best_params))
        _log.debug("training the model with complete data")
        lang_config["trainable_rules"].update(best_params)
        trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                         origin_config=lang_config)
        trainable_rules.fit(X, y)
        importances = trainable_rules.feature_importances_
        _log.debug(
            "feature importances from %s:\n\t%s",
            lang_config["trainable_rules"]["base_model_name"],
            "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                        for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
        submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
        # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
        # throw away imprecise classes
        if trainable_rules.rules.rules:
            model[language] = trainable_rules.rules
        else:
            _log.warning("model for %s has 0 rules. Skipping.", language)
    _log.info("trained %s", model)
    return model