Example #1
    def test_filter_1000_files(self):
        def create_files():
            files = [File(path="one", content=b"hello"),
                     File(path="two", content=b"world" * 100)] * 1000
            # no seed needed: shuffling only reorders duplicates, the resulting dict is identical
            files = random.sample(files, k=len(files))
            return {file.path: file for file in files}

        files1 = filter_files(create_files(), line_length_limit=80, overall_size_limit=5 << 20)
        files2 = filter_files(create_files(), line_length_limit=80, overall_size_limit=5 << 20)
        self.assertEqual(files1, files2)
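The result is order-independent because the shuffled list still contains only two distinct paths, so the dict comprehension collapses it to the same two entries on every run. A quick way to see this, using a hypothetical namedtuple stand-in for the Lookout SDK File record:

    from collections import namedtuple

    File = namedtuple("File", "path content")  # hypothetical stand-in for the SDK File record
    files = [File(path="one", content=b"hello"), File(path="two", content=b"world" * 100)] * 1000
    assert len({f.path: f for f in files}) == 2  # duplicates collapse by path, order is irrelevant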
Example #2
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
              data_service: DataService, files: Iterator[File],
              **data) -> IdTyposModel:
        """
        Generate a new model on top of the specified source code.

        :param ptr: Git repository state pointer.
        :param config: Configuration of the training of unspecified structure.
        :param data_service: The channel to the data service in Lookout server to query for \
                             UASTs, file contents, etc.
        :param files: iterator of File records from the data service.
        :param data: Extra data passed into the method. Used by the decorators to simplify \
                     the data retrieval.
        :return: Instance of `AnalyzerModel` (`model_type`, to be precise).
        """
        _log = logging.getLogger(cls.__name__)
        train_config = cls._load_config(config)
        _log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
                  pformat(train_config, width=4096, compact=True))
        model = IdTyposModel()
        for _, lang_files in files_by_language(files).items():
            for file in filter_files(
                    files=lang_files,
                    line_length_limit=train_config["line_length_limit"],
                    overall_size_limit=train_config["overall_size_limit"],
                    log=_log):
                model.identifiers.update({
                    node.token
                    for node in cls._get_identifiers(file.uast, [])
                })
        return model
Example #3
    def generate_file_fixes(self, data_service: DataService, changes: Sequence[Change],
                            ) -> Iterator[FileFix]:
        """
        Generate all data required for any type of further processing.

        Next processing can be comment generation or performance report generation.

        :param data_service: Connection to the Lookout data retrieval service.
        :param changes: The list of changes in the pointed state.
        :return: Iterator with unrendered data per comment.
        """
        log = self._log
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        processed_files_counter = defaultdict(int)
        processed_fixes_counter = defaultdict(int)
        for lang, head_files in head_files_by_lang.items():
            if lang not in self.model:
                log.warning("skipped %d written in %s. Rules for %s do not exist in model",
                            len(head_files), lang, lang)
                continue
            rules = self.model[lang]
            config = self.analyze_config[lang]
            rules = rules.filter_by_confidence(config["confidence_threshold"]) \
                .filter_by_support(config["support_threshold"])
            for file in filter_files(head_files, rules.origin_config["line_length_limit"],
                                     rules.origin_config["overall_size_limit"], log=log):
                processed_files_counter[lang] += 1
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    prev_file = None
                    lines = None
                else:
                    lines = sorted(chain.from_iterable((
                        find_new_lines(prev_file, file),
                        find_deleted_lines(prev_file, file),
                    )))
                log.debug("%s %s", file.path, lines)
                fe = FeatureExtractor(language=lang, **rules.origin_config["feature_extractor"])
                feature_extractor_output = fe.extract_features([file], [lines])
                if feature_extractor_output is None:
                    submit_event("%s.analyze.%s.parse_failures" % (self.name, lang), 1)
                    if config["report_parse_failures"]:
                        log.warning("Failed to parse %s", file.path)
                        yield FileFix(error="Failed to parse", head_file=file, language=lang,
                                      feature_extractor=fe, base_file=prev_file, file_vnodes=[],
                                      line_fixes=[], y_pred_pure=None, y=None)
                else:
                    fixes, file_vnodes, y_pred_pure, y = self._generate_token_fixes(
                        file, fe, feature_extractor_output, data_service.get_bblfsh(), rules)
                    log.debug("%s %d fixes", file.path, len(fixes))
                    processed_fixes_counter[lang] += len(fixes)
                    yield FileFix(error="", head_file=file, language=lang, feature_extractor=fe,
                                  base_file=prev_file, file_vnodes=file_vnodes, line_fixes=fixes,
                                  y_pred_pure=y_pred_pure, y=y)
        for key, val in processed_files_counter.items():
            submit_event("%s.analyze.%s.files" % (self.name, key), val)
        for key, val in processed_fixes_counter.items():
            submit_event("%s.analyze.%s.fixes" % (self.name, key), val)
Example #4
    def generate_typos_fixes(self,
                             changes: Sequence[Change]) -> Iterator[TypoFix]:
        """
        Generate all data about typo fix required for any type of further processing.

        The processing can be comment generation or performance report generation.

        :param changes: The list of changes in the pointed state.
        :return: Iterator with unrendered data per comment.
        """
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        for lang, head_files in head_files_by_lang.items():
            for file in filter_files(
                    files=head_files,
                    line_length_limit=self.config["line_length_limit"],
                    overall_size_limit=self.config["overall_size_limit"],
                    log=self._log):
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    lines = []
                    old_identifiers = set()
                else:
                    lines = find_new_lines(prev_file, file)
                    old_identifiers = {
                        node.token
                        for node in uast2sequence(prev_file.uast)
                        if bblfsh.role_id("IDENTIFIER") in node.roles
                        and bblfsh.role_id("IMPORT") not in node.roles
                        and node.token
                    }
                changed_nodes = extract_changed_nodes(file.uast, lines)
                new_identifiers = [
                    node for node in changed_nodes
                    if bblfsh.role_id("IDENTIFIER") in node.roles
                    and bblfsh.role_id("IMPORT") not in node.roles
                    and node.token and node.token not in old_identifiers
                ]
                if not new_identifiers:
                    continue
                suggestions = self.check_identifiers(
                    [n.token for n in new_identifiers])
                for index, corrections in suggestions.items():
                    for token in corrections:
                        yield TypoFix(
                            head_file=file,
                            token=new_identifiers[index].token,
                            candidates=[
                                Candidate(*c[:2]) for c in corrections[token]
                            ],
                            line_number=new_identifiers[index].start_position.line,
                        )
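For reference, the shape these loops assume for the check_identifiers output, inferred from the call sites above; the concrete values and the namedtuple stand-in below are invented for illustration:

    from collections import namedtuple

    Candidate = namedtuple("Candidate", "token confidence")  # stand-in for the Candidate used above
    # index of the flagged identifier -> token -> list of (candidate, confidence, ...) tuples
    suggestions = {0: {"recieve": [("receive", 0.93), ("retrieve", 0.04)]}}
    for index, corrections in suggestions.items():
        for token in corrections:
            candidates = [Candidate(*c[:2]) for c in corrections[token]]
            # [Candidate(token='receive', confidence=0.93), Candidate(token='retrieve', confidence=0.04)]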
Example #5
    def check_training_required(
            cls, old_model: FormatModel, ptr: ReferencePointer, config: Mapping[str, Any],
            data_service: "lookout.core.data_requests.DataService", **data) -> bool:
        """
        Return True if the format model needs to be refreshed; otherwise, False.

        We calculate the ratio of the number of changed lines to the overall number of lines.
        If it exceeds lines_ratio_train_trigger, the model needs to be retrained.

        :param old_model: Current FormatModel.
        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data: contains "files" - the list of files in the pointed state.
        :param data_service: connection to the Lookout data retrieval service.
        :return: True if training is required, False otherwise.
        """
        _log = logging.getLogger(cls.__name__)
        changes = list(request_changes(
            data_service.get_data(), old_model.ptr, ptr, contents=True, uast=False))
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        config = cls._load_config(config)
        for language, head_files in head_files_by_lang.items():
            try:
                lang_config = config["train"][language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            overall_lines = changed_lines = 0
            for file in filter_files(head_files, lang_config["line_length_limit"],
                                     lang_config["overall_size_limit"], log=_log):
                head_lines = len(file.content.splitlines())
                overall_lines += head_lines
                try:
                    prev_file = base_files_by_lang[language][file.path]
                except KeyError:
                    changed_lines += head_lines
                else:
                    changed_lines += len(find_new_lines(prev_file, file))
                    changed_lines += len(find_deleted_lines(prev_file, file))
            ratio = changed_lines / (overall_lines or 1)
            _log.debug("check %s ratio: %.3f", language, ratio)
            if ratio > lang_config["lines_ratio_train_trigger"]:
                _log.info("%s triggers the training with changes ratio %.3f", language, ratio)
                return True
        return False
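A worked example of the trigger arithmetic above; the line counts and the 0.2 threshold are invented:

    overall_lines = 1000        # lines in all filtered head files of one language
    changed_lines = 150 + 50    # new lines + deleted lines against the base revision
    lines_ratio_train_trigger = 0.2
    ratio = changed_lines / (overall_lines or 1)   # 0.2
    assert not ratio > lines_ratio_train_trigger   # not strictly greater, so training is not triggered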
Example #6
    def test_filter_files(self):
        files = [File(path="one", content=b"hello"), File(path="two", content=b"world" * 100)]
        files = {file.path: file for file in files}
        logged = False

        class Log:
            def debug(self, *args, **kwargs):
                nonlocal logged
                logged = True

        # create the client before try so the finally clause cannot hit an unbound name
        bblfsh_client = BblfshClient("0.0.0.0:9432")
        try:
            filtered = filter_files(files=files, line_length_limit=80, overall_size_limit=5 << 20,
                                    log=Log())
            self.assertEqual(len(filtered), 1)
            self.assertEqual(filtered[0].content, b"hello")
            self.assertTrue(logged)
        finally:
            bblfsh_client._channel.close()
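A minimal sketch of the behaviour this test appears to rely on, not the library's actual filter_files implementation (which, per Example #8, also accepts a random_state): files containing a line longer than line_length_limit are dropped, and the remaining files are kept only while their cumulative content size stays within overall_size_limit.

    from typing import Dict, List

    def filter_files_sketch(files: Dict[str, "File"], line_length_limit: int,
                            overall_size_limit: int, log=None) -> List["File"]:
        """Hypothetical re-implementation for illustration only."""
        passed, size = [], 0
        for file in sorted(files.values(), key=lambda f: f.path):
            if max(map(len, file.content.splitlines()), default=0) > line_length_limit:
                if log is not None:
                    log.debug("skipped %s: a line is longer than %d", file.path, line_length_limit)
                continue
            if size + len(file.content) > overall_size_limit:
                if log is not None:
                    log.debug("skipped %s: overall size limit reached", file.path)
                continue
            passed.append(file)
            size += len(file.content)
        return passed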
Example #7
    def generate_typos_fixes(self,
                             changes: Sequence[Change]) -> Iterator[TypoFix]:
        """
        Generate all data about typo fix required for any type of further processing.

        The processing can be comment generation or performance report generation.

        :param changes: The list of changes in the pointed state.
        :return: Iterator with unrendered data per comment.
        """
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        for lang, head_files in head_files_by_lang.items():
            for file in filter_files(
                    files=head_files,
                    line_length_limit=self.config["line_length_limit"],
                    overall_size_limit=self.config["overall_size_limit"],
                    log=self._log):
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    lines = []
                else:
                    lines = self._find_new_lines(prev_file.content,
                                                 file.content)
                identifiers = self._get_identifiers(file.uast, lines)
                new_identifiers = [
                    node for node in identifiers
                    if node.token not in self.allowed_identifiers
                ]
                if not new_identifiers:
                    continue
                self._log.debug("found %d new identifiers" %
                                len(new_identifiers))
                suggestions = self.check_identifiers(
                    [n.token for n in new_identifiers])
                if not suggestions:
                    continue
                for index in suggestions.keys():
                    identifier = new_identifiers[index].token
                    candidates = {
                        token: [
                            Candidate(*sugg)
                            for sugg in suggestions[index][token]
                        ]
                        for token in suggestions[index]
                    }
                    sugg_identifiers, id_confidences = [], []
                    for final_sugg, conf in self.generate_identifier_suggestions(
                            candidates, identifier):
                        sugg_identifiers.append(final_sugg)
                        id_confidences.append(conf)

                    identifier_candidates = [
                        Candidate(i, c) for i, c in zip(
                            sugg_identifiers,
                            self._normalize_confidences(id_confidences),
                        ) if i != identifier
                    ]
                    if identifier_candidates:
                        yield TypoFix(
                            content=file.content.decode("utf-8", "replace"),
                            path=file.path,
                            identifier=identifier,
                            line_number=new_identifiers[index].start_position.line,
                            candidates=identifier_candidates,
                            identifiers_number=len(
                                set(n.token for n in new_identifiers)),
                        )
Example #8
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
              files: Iterator[File], **data) -> FormatModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data: contains "files" - the list of files in the pointed state.
        :param data_service: connection to the Lookout data retrieval service.
        :param files: iterator of File records from the data service.
        :return: AnalyzerModel containing the learned rules, per language.
        """
        _log = logging.getLogger(cls.__name__)
        train_config = cls._load_config(config)["train"]
        _log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
                  pformat(train_config, width=4096, compact=True))
        model = FormatModel().generate(cls, ptr)
        for language, files in files_by_language(files).items():
            try:
                lang_config = train_config[language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            _log.info("effective train config for %s:\n%s", language,
                      pformat(lang_config, width=120, compact=True))
            random_state = lang_config["random_state"]
            files = filter_files(
                files, lang_config["line_length_limit"], lang_config["overall_size_limit"],
                random_state, _log)
            submit_event("%s.train.%s.files" % (cls.name, language), len(files))
            if len(files) == 0:
                _log.info("zero files after filtering, language %s is skipped.", language)
                continue
            try:
                fe = FeatureExtractor(language=language, **lang_config["feature_extractor"])
            except ImportError:
                _log.warning("skipped %d %s files - not supported", len(files), language)
                continue
            else:
                _log.info("training on %d %s files", len(files), language)
            train_files, test_files = FormatAnalyzer.split_train_test(
                files, lang_config["test_dataset_ratio"], random_state=random_state)
            # ensure that the features are reproducible
            train_files = sorted(train_files, key=lambda x: x.path)
            test_files = sorted(test_files, key=lambda x: x.path)
            X_train, y_train, _ = fe.extract_features(train_files)
            X_train, selected_features = fe.select_features(X_train, y_train)
            if test_files:
                X_test, y_test, _ = fe.extract_features(test_files)
            if lang_config["test_dataset_ratio"]:
                _log.debug("Real test ratio is %.3f",
                           X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
                           if test_files else 0)
            lang_config["feature_extractor"]["selected_features"] = selected_features
            lang_config["feature_extractor"]["label_composites"] = fe.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            if X_train.shape[0] < lower_bound_instances:
                _log.warning("skipped %d %s files: too few samples (%d/%d)",
                             len(files), language, X_train.shape[0], lower_bound_instances)
                continue
            _log.info("extracted %d samples to train, searching for the best hyperparameters",
                      X_train.shape[0])
            optimizer = Optimizer(**lang_config["optimizer"], random_state=random_state)
            best_score, best_params = optimizer.optimize(X_train, y_train)
            if _log.isEnabledFor(logging.DEBUG):
                _log.debug("score of the best estimator found: %.6f", best_score)
                _log.debug("params of the best estimator found: %s", str(best_params))
                _log.debug("training the model with complete data")
            else:
                _log.info("finished hyperopt at %.6f, training the full model", -best_score)
            lang_config["trainable_rules"].update(best_params)
            trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                             random_state=random_state,
                                             origin_config=lang_config)
            trainable_rules.fit(X_train, y_train)
            importances = trainable_rules.feature_importances_
            _log.debug(
                "feature importances from %s:\n\t%s",
                lang_config["trainable_rules"]["base_model_name"],
                "\n\t".join("%-55s %.5E" % (fe.feature_names[i], importances[i])
                            for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5))
            trainable_rules.prune_categorical_attributes(fe)
            _log.info("obtained %d rules, generating the classification report",
                      len(trainable_rules.rules))
            trainable_rules.rules.generate_classification_report(
                X_train, y_train, "train", fe.composite_class_representations)
            if test_files:
                trainable_rules.rules.generate_classification_report(
                    X_test, y_test, "test", fe.composite_class_representations)
            submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
            if trainable_rules.rules.rules:
                model[language] = trainable_rules.rules
            else:
                _log.warning("model for %s has 0 rules. Skipped.", language)
        _log.info("trained %s", model)
        return model
Example #9
    def analyze(self, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
                data_service: DataService, **data) -> List[Comment]:
        """
        Return the list of `Comment`-s - found typo corrections.

        :param ptr_from: The Git revision of the fork point. Exists in both the original and \
                         the forked repositories.
        :param ptr_to: The Git revision to analyze. Exists only in the forked repository.
        :param data_service: The channel to the data service in Lookout server to query for \
                             UASTs, file contents, etc.
        :param data: Extra data passed into the method. Used by the decorators to simplify \
                     the data retrieval.
        :return: List of found review suggestions. Refer to \
                 lookout/core/server/sdk/service_analyzer.proto.
        """
        log = self.log
        comments = []
        changes = list(data["changes"])
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        line_length = self.config.get("line_length_limit",
                                      self.DEFAULT_LINE_LENGTH_LIMIT)
        for lang, head_files in head_files_by_lang.items():
            for file in filter_files(head_files, line_length, log):
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    lines = []
                    old_identifiers = set()
                else:
                    lines = find_new_lines(prev_file, file)
                    old_identifiers = {
                        node.token
                        for node in uast2sequence(prev_file.uast)
                        if bblfsh.role_id("IDENTIFIER") in node.roles
                        and bblfsh.role_id("IMPORT") not in node.roles
                        and node.token
                    }
                changed_nodes = extract_changed_nodes(file.uast, lines)
                new_identifiers = [
                    node for node in changed_nodes
                    if bblfsh.role_id("IDENTIFIER") in node.roles
                    and bblfsh.role_id("IMPORT") not in node.roles
                    and node.token and node.token not in old_identifiers
                ]
                if not new_identifiers:
                    continue
                suggestions = self.check_identifiers(
                    [n.token for n in new_identifiers])
                for index, corrections in suggestions.items():
                    for token in corrections:
                        comment = Comment()
                        comment.file = file.path
                        corrections_line = " " + ", ".join(
                            "%s (%d%%)" % (candidate[0], int(candidate[1] * 100))
                            for candidate in corrections[token])
                        comment.text = ('Possible typo in "%s". Suggestions:'
                                        % new_identifiers[index].token) + corrections_line
                        comment.line = new_identifiers[index].start_position.line
                        comment.confidence = int(corrections[token][0][1] * 100)
                        comments.append(comment)
        return comments
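To make the comment rendering above concrete, here is a single hypothetical corrections entry and what it produces (the tokens and probabilities are invented):

    corrections = {"colr": [("color", 0.75), ("colour", 0.25)]}   # suggestions[index] in the code above
    token = "colr"
    corrections_line = " " + ", ".join(
        "%s (%d%%)" % (candidate[0], int(candidate[1] * 100)) for candidate in corrections[token])
    text = 'Possible typo in "%s". Suggestions:' % "colr" + corrections_line
    # text == 'Possible typo in "colr". Suggestions: color (75%), colour (25%)'
    confidence = int(corrections[token][0][1] * 100)   # 75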
Example #10
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
              data_service: DataService, **data) -> FormatModel:
        """
        Train a model given the files available.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data: contains "files" - the list of files in the pointed state.
        :param data_service: connection to the Lookout data retrieval service.
        :return: AnalyzerModel containing the learned rules, per language.
        """
        _log = logging.getLogger(cls.__name__)
        _log.info("train %s %s %s", ptr.url, ptr.commit,
                  pformat(config, width=4096, compact=True))
        model = FormatModel().construct(cls, ptr)
        config = cls._load_train_config(config)
        for language, files in files_by_language(data["files"]).items():
            try:
                lang_config = config[language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            files = filter_files(files, lang_config["line_length_limit"], _log)
            submit_event("%s.train.%s.files" % (cls.name, language),
                         len(files))
            if len(files) == 0:
                _log.info(
                    "zero files after filtering, language %s is skipped.",
                    language)
                continue
            try:
                fe = FeatureExtractor(language=language,
                                      **lang_config["feature_extractor"])
            except ImportError:
                _log.warning("skipped %d %s files - not supported", len(files),
                             language)
                continue
            else:
                _log.info("training on %d %s files", len(files), language)
            # we sort to make the features reproducible
            X, y, _ = fe.extract_features(sorted(files, key=lambda x: x.path))
            X, selected_features = fe.select_features(X, y)
            lang_config["feature_extractor"][
                "selected_features"] = selected_features
            lang_config["feature_extractor"][
                "label_composites"] = fe.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            if X.shape[0] < lower_bound_instances:
                _log.warning("skipped %d %s files: too few samples (%d/%d)",
                             len(files), language, X.shape[0],
                             lower_bound_instances)
                continue
            _log.debug("training the rules model")
            optimizer = Optimizer(
                n_jobs=lang_config["n_jobs"],
                n_iter=lang_config["n_iter"],
                cv=lang_config["cv"],
                random_state=lang_config["trainable_rules"]["random_state"])
            best_score, best_params = optimizer.optimize(X, y)
            _log.debug("score of the best estimator found: %.6f", best_score)
            _log.debug("params of the best estimator found: %s",
                       str(best_params))
            _log.debug("training the model with complete data")
            lang_config["trainable_rules"].update(best_params)
            trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                             origin_config=lang_config)
            trainable_rules.fit(X, y)
            importances = trainable_rules.feature_importances_
            _log.debug(
                "feature importances from %s:\n\t%s",
                lang_config["trainable_rules"]["base_model_name"], "\n\t".join(
                    "%-55s %.5E" % (fe.feature_names[i], importances[i])
                    for i in numpy.argsort(-importances)[:25]
                    if importances[i] > 1e-5))
            submit_event("%s.train.%s.rules" % (cls.name, language),
                         len(trainable_rules.rules))
            # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
            # throw away imprecise classes
            if trainable_rules.rules.rules:
                model[language] = trainable_rules.rules
            else:
                _log.warning("model for %s has 0 rules. Skipping.", language)
        _log.info("trained %s", model)
        return model