示例#1
0
class FeaturesTests(unittest.TestCase):
    def setUp(self):
        config = FormatAnalyzer._load_config(get_config())["train"]
        self.extractor = FeatureExtractor(
            language="javascript", **config["javascript"]["feature_extractor"])

    def test_positions(self):
        test_js_code_filepath = Path(
            __file__).parent / "browser-policy-content.js"
        with open(str(test_js_code_filepath), mode="rb") as f:
            code = f.read()
        uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(filename="",
                                                         language="javascript",
                                                         contents=code).uast
        converter = BytesToUnicodeConverter(code)
        code_uni = converter.convert_content()
        uast_uni = converter.convert_uast(uast)
        file = UnicodeFile(content=code_uni,
                           uast=uast_uni,
                           language="javascript",
                           path="test.js")
        annotated_data = AnnotationManager.from_file(file)
        self.extractor._parse_file(annotated_data)
        nodes, _ = file_to_old_parse_file_format(annotated_data)
        for index, (node1, node2) in enumerate(zip(nodes, nodes[1:])):
            self.assertLessEqual(
                node1.start.line, node2.start.line,
                "Start line position decrease for %d, %d nodes" %
                (index, index + 1))
            self.assertLessEqual(
                node1.start.offset, node2.start.offset,
                "Start offset position decrease for %d, %d nodes" %
                (index, index + 1))
示例#2
0
    def generate_file_fixes(self, data_service: DataService, changes: Sequence[Change],
                            ) -> Iterator[FileFix]:
        """
        Yield one `FileFix` per analyzed head file.

        The yielded objects carry everything needed by the later stages (comment generation \
        or performance report generation).

        :param data_service: Connection to the Lookout data retrieval service.
        :param changes: The list of changes in the pointed state.
        :return: Iterator with unrendered data per comment.
        """
        log = self._log
        base_files_by_lang = files_by_language(c.base for c in changes)
        head_files_by_lang = files_by_language(c.head for c in changes)
        processed_files_counter = defaultdict(int)
        processed_fixes_counter = defaultdict(int)
        for lang, head_files in head_files_by_lang.items():
            if lang not in self.model:
                log.warning("skipped %d written in %s. Rules for %s do not exist in model",
                            len(head_files), lang, lang)
                continue
            config = self.analyze_config[lang]
            # Keep only the rules that pass both the confidence and the support thresholds.
            rules = self.model[lang]
            rules = rules.filter_by_confidence(config["confidence_threshold"])
            rules = rules.filter_by_support(config["support_threshold"])
            accepted_files = filter_files(
                head_files, rules.origin_config["line_length_limit"],
                rules.origin_config["overall_size_limit"], log=log)
            for file in accepted_files:
                processed_files_counter[lang] += 1
                try:
                    prev_file = base_files_by_lang[lang][file.path]
                except KeyError:
                    # No base version of this file: there are no line numbers to pass on.
                    prev_file, lines = None, None
                else:
                    lines = sorted(chain(find_new_lines(prev_file, file),
                                         find_deleted_lines(prev_file, file)))
                log.debug("%s %s", file.path, lines)
                fe = FeatureExtractor(language=lang, **rules.origin_config["feature_extractor"])
                feature_extractor_output = fe.extract_features([file], [lines])
                if feature_extractor_output is None:
                    submit_event("%s.analyze.%s.parse_failures" % (self.name, lang), 1)
                    if config["report_parse_failures"]:
                        log.warning("Failed to parse %s", file.path)
                        yield FileFix(error="Failed to parse", head_file=file, language=lang,
                                      feature_extractor=fe, base_file=prev_file, file_vnodes=[],
                                      line_fixes=[], y_pred_pure=None, y=None)
                    continue
                fixes, file_vnodes, y_pred_pure, y = self._generate_token_fixes(
                    file, fe, feature_extractor_output, data_service.get_bblfsh(), rules)
                log.debug("%s %d fixes", file.path, len(fixes))
                processed_fixes_counter[lang] += len(fixes)
                yield FileFix(error="", head_file=file, language=lang, feature_extractor=fe,
                              base_file=prev_file, file_vnodes=file_vnodes, line_fixes=fixes,
                              y_pred_pure=y_pred_pure, y=y)
        # Per-language counters are reported only once the whole iteration has finished.
        for counted_lang, n_files in processed_files_counter.items():
            submit_event("%s.analyze.%s.files" % (self.name, counted_lang), n_files)
        for counted_lang, n_fixes in processed_fixes_counter.items():
            submit_event("%s.analyze.%s.fixes" % (self.name, counted_lang), n_fixes)
示例#3
0
    def generate_local_test(mcs, case_name, uast, contents):
        """
        Build a test method for the case registered under `case_name` in `cases`.

        :param case_name: Key of the case in `cases`; its value unpacks into the offsets, \
               the labels to set at those offsets and the expected generated file.
        :param uast: UAST of the file under test.
        :param contents: Content of the file under test.
        :return: Function to be used as a test method.
        """
        train_config = FormatAnalyzer._load_config(get_config())["train"]
        fe_config = train_config["javascript"]
        feature_extractor = FeatureExtractor(language="javascript",
                                             label_composites=label_composites,
                                             **fe_config["feature_extractor"])
        file = UnicodeFile(content=contents, uast=uast, path="", language="")
        extracted = feature_extractor.extract_features([file])
        _, _, (vnodes_y, _, _, _) = extracted
        offsets, y_pred, result = cases[case_name]

        def _test(self):
            y_cur = deepcopy(self.y)
            for offset, new_label in zip(offsets, y_pred):
                # After the scan `position` is the index of the first labeled node starting at
                # `offset`; if there is no such node it is left at the last visited index.
                position = None
                for position, vnode in enumerate(vnodes_y):  # noqa: B007
                    if offset == vnode.start.offset:
                        break
                y_cur[position] = new_label
            code_generator = CodeGenerator(self.feature_extractor)
            all_indices = list(range(len(self.vnodes_y)))
            pred_vnodes = code_generator.apply_predicted_y(
                self.vnodes, self.vnodes_y, all_indices, FakeRules(y_cur))
            generated_file = code_generator.generate(pred_vnodes)
            self.assertEqual(generated_file, result)

        return _test
示例#4
0
 def setUp(self):
     """Prepare the JavaScript train config, the feature extractor and the annotated file."""
     config = FormatAnalyzer._load_config(get_config())
     self.final_config = config["train"]["javascript"]
     self.extractor = FeatureExtractor(
         language="javascript", **self.final_config["feature_extractor"])
     # Build the annotated file once; an earlier identical call was discarded immediately
     # because this assignment overwrote its result.
     self.annotated_file = AnnotationManager.from_file(self.file)
示例#5
0
 def get_class_sequences_from_code(code: str) -> Sequence[Tuple[int, ...]]:
     """
     Parse `code` as JavaScript and return the `y` attribute of every labeled virtual node.

     The enclosing test is failed if feature extraction returns None.
     """
     response = client.parse(filename="", language="javascript", contents=code.encode())
     uast = response.uast
     extractor = FeatureExtractor(language="javascript", **config)
     unicode_file = UnicodeFile(content=code, uast=uast, path="", language="javascript")
     result = extractor.extract_features([unicode_file])
     if result is None:
         self.fail("Could not parse test code.")
     _, _, (vnodes_y, _, _, _) = result
     return [labeled_vnode.y for labeled_vnode in vnodes_y]
示例#6
0
def return_features() -> Response:
    """
    Featurize the code given in the JSON request body and return the details as JSON.

    The request body is read for the keys "code", "babelfish_address" and "language".
    The request is aborted with status 500 when Babelfish reports a non-zero status or when
    feature extraction returns None.

    :return: JSON response with the features, predictions, rules and virtual nodes.
    :raises NotFittedError: if the loaded model has no rules for the requested language.
    """
    payload = request.get_json()
    code = payload["code"]
    babelfish_address = payload["babelfish_address"]
    language = payload["language"]
    client = BblfshClient(babelfish_address)
    parse_response = client.parse(filename="", contents=code.encode(), language=language)
    if parse_response.status != 0:
        abort(500)
    model_path = Path(__file__).parent / "models" / "model.asdf"
    model = FormatModel().load(str(model_path))
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    # NOTE(review): the file language is hard-coded while `language` comes from the request.
    file = UnicodeFile(content=code, uast=parse_response.uast, language="javascript",
                       path="path")
    # This is the dict stored in the rules' origin config, so the flag is set in place.
    fe_config = rules.origin_config["feature_extractor"]
    fe_config["return_sibling_indices"] = True
    fe = FeatureExtractor(language=language, **fe_config)
    extracted = fe.extract_features([file])
    if extracted is None:
        abort(500)
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents, sibling_indices) = extracted
    # `rules` is rebound here to the third value returned by `predict`.
    y_pred, rule_winners, rules, grouped_quote_predictions = rules.predict(
        X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=fe)
    refuse_to_predict = y_pred < 0
    checker = UASTStabilityChecker(fe)
    _, _, _, _, safe_preds = checker.check(
        y=y, y_pred=y_pred, vnodes_y=vnodes_y, vnodes=vnodes, files=[file], stub=client._stub,
        vnode_parents=vnode_parents, node_parents=node_parents, rule_winners=rule_winners,
        grouped_quote_predictions=grouped_quote_predictions)
    # Every sample index that the checker did not report as safe is flagged.
    n_samples = X.shape[0]
    unsafe_preds = set(range(n_samples)).difference(safe_preds)
    break_uast = [index in unsafe_preds for index in range(n_samples)]
    labeled_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
    app.logger.info("returning features of shape %d, %d" % X.shape)
    app.logger.info("length of rules: %d", len(rules))
    jsonable_vnodes = [_vnode_to_jsonable(vnode, labeled_indices=labeled_indices)
                       for vnode in vnodes]
    return jsonify({
        "code": code,
        "features": _input_matrix_to_descriptions(X, fe),
        "ground_truths": y.tolist(),
        "predictions": y_pred.tolist(),
        "refuse_to_predict": refuse_to_predict.tolist(),
        "sibling_indices": sibling_indices,
        "rules": _rules_to_jsonable(rules, fe),
        "winners": rule_winners.tolist(),
        "break_uast": break_uast,
        "feature_names": fe.feature_names,
        "class_representations": fe.composite_class_representations,
        "class_printables": fe.composite_class_printables,
        "vnodes": jsonable_vnodes,
        "config": _mapping_to_jsonable(rules.origin_config)})
示例#7
0
class FeaturesTests(unittest.TestCase):
    def setUp(self):
        config = FormatAnalyzer._load_config(get_config())["train"]
        self.extractor = FeatureExtractor(language="javascript",
                                          **config["javascript"]["feature_extractor"])

    def test_vnode_positions(self):
        test_js_code_filepath = Path(__file__).parent / "jquery.layout.js"
        with open(str(test_js_code_filepath), mode="rb") as f:
            code = f.read()
        uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(
            filename="", language="javascript", contents=code).uast
        nodes, parents = list(self.extractor._parse_file(code.decode("utf-8", "replace"),
                                                         uast, test_js_code_filepath))
        # Just should not fail
        list(self.extractor._classify_vnodes(nodes, "filepath"))
 def setUpClass(cls):
     """Extract features from the benchmark file twice and cache three node features."""
     # Overrides merged into the default train config before it is loaded.
     overrides = {
         "javascript": {
             "feature_extractor": {
                 "left_siblings_window": 1,
                 "right_siblings_window": 1,
                 "parents_depth": 1,
                 "node_features": ["start_line", "reserved", "roles"],
             },
         },
     }
     config = FormatAnalyzer._load_train_config(merge_dicts(get_train_config(), overrides))
     fixtures_dir = Path(__file__).parent
     with lzma.open(str(fixtures_dir / "benchmark.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(fixtures_dir / "benchmark.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     file = File(content=contents.encode("utf-8"), uast=uast)
     # The same file is deliberately passed twice.
     files = [file, file]
     extractor_kwargs = config["javascript"]["feature_extractor"]
     cls.fe = FeatureExtractor(language="javascript", **extractor_kwargs)
     cls.fe.extract_features(files)
     cls.class_representations = cls.fe.composite_class_representations
     cls.n_classes = len(cls.fe.labels_to_class_sequences)
     cls.ordinal = cls.return_node_feature(FeatureId.start_line)
     cls.categorical = cls.return_node_feature(FeatureId.reserved)
     cls.bag = cls.return_node_feature(FeatureId.roles)
示例#9
0
 def _get_composite(feature_extractor: FeatureExtractor,
                    labels: Tuple[int, ...]) -> int:
     if labels in feature_extractor.class_sequences_to_labels:
         return feature_extractor.class_sequences_to_labels[labels]
     feature_extractor.class_sequences_to_labels[labels] = \
         len(feature_extractor.class_sequences_to_labels)
     feature_extractor.labels_to_class_sequences.append(labels)
     return len(feature_extractor.labels_to_class_sequences) - 1
示例#10
0
class FeaturesTests(unittest.TestCase):
    def setUp(self):
        config = FormatAnalyzer._load_config(get_config())["train"]
        self.extractor = FeatureExtractor(language="javascript",
                                          **config["javascript"]["feature_extractor"])

    def test_vnode_positions(self):
        test_js_code_filepath = Path(__file__).parent / "jquery.layout.js"
        with open(str(test_js_code_filepath), mode="rb") as f:
            code = f.read()
        uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(
            filename="", language="javascript", contents=code).uast
        file = BytesToUnicodeConverter.convert_file(
            File(content=code, uast=uast, language="javascript", path="test.js"))
        annotated_data = AnnotationManager.from_file(file)
        self.extractor._parse_file(annotated_data)
        # Just should not fail
        self.extractor._classify_vnodes(annotated_data)
示例#11
0
 def setUpClass(cls):
     """Load the benchmark file with its UAST and create the JavaScript feature extractor."""
     fixtures_dir = Path(__file__).parent
     # str() is needed for Python 3.5
     code_path = str(fixtures_dir / "benchmark.js.xz")
     uast_path = str(fixtures_dir / "benchmark.uast.xz")
     with lzma.open(code_path, mode="rt") as fin:
         contents = fin.read()
     with lzma.open(uast_path) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     cls.files = [File(content=contents.encode("utf-8"), uast=uast)]
     train_config = FormatAnalyzer._load_config(get_config())["train"]
     extractor_kwargs = train_config["javascript"]["feature_extractor"]
     cls.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)
def files2vnodes(
    files: Iterable[str],
    feature_extractor: FeatureExtractor,
    client: str,
) -> Iterable[VirtualNode]:
    """
    Extract the labeled `VirtualNode`-s from the given files.

    :param files: Files to extract the `VirtualNode`-s from.
    :param feature_extractor: FeatureExtractor to use; its language is used to prepare the \
           files.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :return: Labeled `VirtualNode`-s reported by the feature extractor.
    """
    prepared_files = prepare_files(files, client, feature_extractor.language)
    extracted = feature_extractor.extract_features(prepared_files)
    # Only the first element of the third returned item is needed.
    _, _, (vnodes_y, _, _, _) = extracted
    return vnodes_y
示例#13
0
def files2vnodes(filepaths: Iterable[str], feature_extractor: FeatureExtractor, rules: Rules,
                 client: BblfshClient) -> Iterable[VirtualNode]:
    """
    Extract the labeled `VirtualNode`-s from the files at the given paths.

    :param filepaths: Paths of the files to extract the `VirtualNode`-s from.
    :param feature_extractor: FeatureExtractor to use; its language is used for parsing.
    :param rules: Rules whose origin config provides the line length and overall size limits.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :return: Labeled `VirtualNode`-s reported by the feature extractor.
    """
    origin_config = rules.origin_config
    parsed_files = parse_files(filepaths=filepaths,
                               line_length_limit=origin_config["line_length_limit"],
                               overall_size_limit=origin_config["overall_size_limit"],
                               client=client, language=feature_extractor.language)
    extracted = feature_extractor.extract_features(parsed_files)
    # Only the first element of the third returned item is needed.
    _, _, (vnodes_y, _, _, _) = extracted
    return vnodes_y
示例#14
0
 def setUpClass(cls):
     """Extract features from the small benchmark file and keep every output on the class."""
     cls.maxDiff = None
     fixtures_dir = Path(__file__).parent
     # str() is needed for Python 3.5
     code_path = str(fixtures_dir / "benchmark_small.js.xz")
     uast_path = str(fixtures_dir / "benchmark_small.js.uast.xz")
     with lzma.open(code_path, mode="rt") as fin:
         contents = fin.read()
     with lzma.open(uast_path) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     fe_config = FormatAnalyzer._load_train_config(get_train_config())["javascript"]
     cls.feature_extractor = FeatureExtractor(language="javascript",
                                              label_composites=label_composites,
                                              **fe_config["feature_extractor"])
     cls.file = File(content=contents.encode("utf-8"), uast=uast)
     extracted = cls.feature_extractor.extract_features([cls.file])
     cls.X, cls.y, (cls.vnodes_y, cls.vnodes, cls.vnode_parents, cls.node_parents) = extracted
示例#15
0
 def setUpClass(cls):
     """Parse jquery.layout.js through Babelfish and keep the extracted labels and nodes."""
     train_config = FormatAnalyzer._load_config(get_config())["train"]
     extractor_kwargs = train_config["javascript"]["feature_extractor"]
     cls.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)
     source_path = Path(__file__).parent / "jquery.layout.js"
     with open(str(source_path), mode="rb") as fin:
         cls.code = fin.read()
     bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
     cls.uast = bblfsh_client.parse(
         filename="", language="javascript", contents=cls.code).uast
     fake_file = FakeFile(path="test.py", content=cls.code, uast=cls.uast,
                          language="JavaScript")
     feature_extractor_output = cls.extractor.extract_features([fake_file])
     # Only the labels and the two virtual node lists are kept on the class.
     _X, cls.y, (cls.vnodes_y, cls.vnodes, _vnode_parents, _node_parents) = \
         feature_extractor_output
示例#16
0
def dump_rule(model: FormatModel, rule_hash: str):
    """
    Print every rule of the model whose hash equals `rule_hash`, preceded by its language.

    :param model: Trained model.
    :param rule_hash: 8-char rule hash.
    :return: Nothing
    """
    for lang in model.languages:
        lang_rules = model[lang]
        fe = FeatureExtractor(language=lang, **lang_rules.origin_config["feature_extractor"])
        for rule in lang_rules.rules:
            if hash_rule(rule, fe) != rule_hash:
                continue
            print(lang)
            # Tabs in the description are expanded to four spaces.
            description = describe_rule(rule, fe).replace("\t", "    ")
            print("    " + description)
示例#17
0
class FeaturesTests(unittest.TestCase):
    def setUp(self):
        config = FormatAnalyzer._load_config(get_config())["train"]
        self.extractor = FeatureExtractor(
            language="javascript", **config["javascript"]["feature_extractor"])

    def test_positions(self):
        test_js_code_filepath = Path(
            __file__).parent / "browser-policy-content.js"
        with open(str(test_js_code_filepath), mode="rt") as f:
            code = f.read()
        uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(
            filename="", language="javascript", contents=code.encode()).uast
        nodes, parents = self.extractor._parse_file(code, uast,
                                                    test_js_code_filepath)
        for index, (node1, node2) in enumerate(zip(nodes, nodes[1:])):
            self.assertLessEqual(
                node1.start.line, node2.start.line,
                "Start line position decrease for %d, %d nodes" %
                (index, index + 1))
            self.assertLessEqual(
                node1.start.offset, node2.start.offset,
                "Start offset position decrease for %d, %d nodes" %
                (index, index + 1))
示例#18
0
    def test_generate_new_line(self):
        """Check the code lines regenerated for each case listed in `expected_res`."""
        self.maxDiff = None
        expected_res = {
            "nothing changed": [],
            "remove new line in the end of 4th line":
            None,
            "indentation in the beginning":
            [" import { makeToast } from '../../common/app/Toasts/redux';"],
            "remove indentation in the 4th line till the end":
            [" return Object.keys(flash)", " }"],
            "new line between 6th and 7th regular code lines":
            ["\n      return messages.map(message => ({"],
            "new line in the middle of the 7th code line with indentation increase":
            ["      return messages\n        .map(message => ({", "  })"],
            "new line in the middle of the 7th code line with indentation decrease":
            ["      return messages\n    .map(message => ({", "      })"],
            "new line in the middle of the 7th code line without indentation increase":
            ["      return messages\n      .map(message => ({"],
            "change quotes":
            ['import { makeToast } from "../../common/app/Toasts/redux";'],
            "remove indentation decrease 11th line": ["        }));"],
            "change indentation decrease to indentation increase 11th line":
            ["          }));"],
            "change indentation decrease to indentation increase 11th line but keep the rest":
            ["          }));", "})"],
        }

        fixtures_dir = Path(__file__).parent
        # str() is needed for Python 3.5
        code_path = str(fixtures_dir / "benchmark_small.js.xz")
        uast_path = str(fixtures_dir / "benchmark_small.js.uast.xz")
        with lzma.open(code_path, mode="rt") as fin:
            contents = fin.read()
        with lzma.open(uast_path) as fin:
            uast = bblfsh.Node.FromString(fin.read())
        fe_config = FormatAnalyzer._load_config(get_config())["train"]["javascript"]

        for case, expected_lines in expected_res.items():
            offsets, y_pred, _ = cases[case]
            # A fresh extractor and file are created for every case.
            feature_extractor = FeatureExtractor(language="javascript",
                                                 label_composites=label_composites,
                                                 **fe_config["feature_extractor"])
            file = UnicodeFile(content=contents, uast=uast, path="", language="")
            extracted = feature_extractor.extract_features([file])
            _X, y, (vnodes_y, vnodes, _vnode_parents, _node_parents) = extracted
            y_cur = deepcopy(y)
            for offset, new_label in zip(offsets, y_pred):
                # After the scan `position` is the index of the first labeled node starting at
                # `offset`; if there is no such node it is left at the last visited index.
                position = None
                for position, vnode in enumerate(vnodes_y):  # noqa: B007
                    if offset == vnode.start.offset:
                        break
                y_cur[position] = new_label
            code_generator = CodeGenerator(feature_extractor)
            pred_vnodes = code_generator.apply_predicted_y(
                vnodes, vnodes_y, list(range(len(vnodes_y))), FakeRules(y_cur))
            grouped = FormatAnalyzer._group_line_nodes(
                y, y_cur, vnodes_y, pred_vnodes, [1] * len(y))
            # Only the line's virtual nodes are needed from each group.
            res = [code_generator.generate_new_line(line_vnodes)
                   for _line, (_, _, _, line_vnodes, _) in grouped]
            if expected_lines is not None:
                # None means that some lines are deleted. This is not handled properly yet.
                self.assertEqual(res, expected_lines, case)
示例#19
0
def quality_report_noisy(bblfsh: str,
                         language: str,
                         confidence_threshold: float,
                         support_threshold: int,
                         precision_threshold: float,
                         dir_output: str,
                         config: Optional[dict] = None,
                         repos: Optional[str] = None) -> None:
    """
    Generate a quality report on the artificial noisy dataset including evaluation curves.

    For every repository the model is trained on the clean clone, the noisy clone is \
    analyzed, and the prediction rate and precision are computed while adding the filtered \
    rules one by one in decreasing order of confidence.

    :param bblfsh: Value passed to `BblfshClient` and to `train`. Babelfish server should be \
           started accordingly.
    :param language: Language to consider, others will be discarded.
    :param confidence_threshold: Confidence threshold to filter relevant rules.
    :param support_threshold: Support threshold to filter relevant rules.
    :param precision_threshold: Precision threshold tolerated by the model. \
           Limit drawn as a red horizontal line on the figure.
    :param dir_output: Path to the output directory where to store the quality report in Markdown \
           and the precision-recall curve in png format.
    :param config: FormatAnalyzer config to use. Default one is used if not set.
    :param repos: Input list of urls to the repositories to analyze. \
           Should be strings separated by newlines; each line is split on "," into the \
           repository url, the clean commit and the noisy commit. If it is None, \
           we use the string defined at the beginning of the file.
    :raises ConnectionError: if one of the repositories cannot be cloned.
    :raises ValueError: if a noisy repository does not differ from the clean one or if no \
            rule reaches the precision threshold for a repository.
    """
    log = logging.getLogger("quality_report_noisy")

    # initialization
    repo_names = []
    last_accepted_rule = {}
    prediction_rates, precisions, accepted_rules = (defaultdict(list)
                                                    for _ in range(3))
    n_mistakes, prec_max_prediction_rate, confidence_threshold_exp, max_prediction_rate, \
        n_rules, n_rules_filtered = ({} for _ in range(6))
    if repos is None:
        repos = REPOSITORIES
    # The client is created before entering `try`: if its constructor raises, the `finally`
    # clause must not run, otherwise it would fail on the unbound `client` name and mask the
    # original error.
    client = BblfshClient(bblfsh)
    try:
        # fetch the original and noisy repositories
        log.info("Repositories: %s", repos)
        with tempfile.TemporaryDirectory() as tmpdirname:
            for raw in repos.splitlines():
                repo_path, clean_commit, noisy_commit = raw.split(",")
                repo = repo_path.split("/")[-1]
                log.info("Fetching %s", repo_path)
                git_dir = os.path.join(tmpdirname, repo)
                git_dir_noisy = os.path.join(tmpdirname, repo + "_noisy")
                cmd1 = "git clone --single-branch --branch master %s %s" % (
                    repo_path, git_dir)
                cmd2 = "git clone --single-branch --branch style-noise-1-per-file %s %s" \
                    % (repo_path, git_dir_noisy)
                try:
                    for cmd in (cmd1, cmd2):
                        log.debug("Running: %s", cmd)
                        subprocess.check_call(cmd.split())
                except subprocess.CalledProcessError as e:
                    raise ConnectionError("Unable to fetch repository %s" %
                                          repo_path) from e

                # train the model on the original repository
                ref = ReferencePointer(repo_path, "HEAD", clean_commit)
                model_path = os.path.join(git_dir, "model.asdf")
                format_model = train(training_dir=git_dir,
                                     ref=ref,
                                     output_path=model_path,
                                     language=language,
                                     bblfsh=bblfsh,
                                     config=config,
                                     log=log)
                rules = format_model[language]

                # extract the raw data and the diff from the repositories
                input_pattern = os.path.join(git_dir, "**", "*.js")
                input_pattern_noisy = os.path.join(git_dir_noisy, "**", "*.js")
                true_content = get_content_from_repo(input_pattern)
                noisy_content = get_content_from_repo(input_pattern_noisy)
                true_files, noisy_files, start_changes = get_difflib_changes(
                    true_content, noisy_content)
                if not true_files:
                    raise ValueError(
                        "Noisy repo should count at least one artificial mistake"
                    )
                log.info(
                    "Number of files modified by adding style noise: %d / %d",
                    len(true_files), len(true_content))
                del true_content, noisy_content

                # extract the features
                feature_extractor = FeatureExtractor(
                    language=language,
                    **rules.origin_config["feature_extractor"])
                vnodes_y_true = files2vnodes(true_files, feature_extractor,
                                             rules, client)
                mispreds_noise = files2mispreds(noisy_files, feature_extractor,
                                                rules, client, log)

                # compute the prediction rate and precision score on the artificial noisy dataset
                diff_mispreds = get_diff_mispreds(mispreds_noise,
                                                  start_changes)
                changes_count = len(start_changes)
                n_rules[repo] = len(rules.rules)
                rules_id = [(i, r.stats.conf)
                            for i, r in enumerate(rules.rules)
                            if r.stats.conf > confidence_threshold
                            and r.stats.support > support_threshold]
                rules_id = sorted(rules_id, key=lambda k: k[1], reverse=True)
                # one point of the curve per prefix of the rules sorted by confidence
                for i in range(len(rules_id)):
                    filtered_mispreds = {
                        k: m
                        for k, m in diff_mispreds.items()
                        if any(r[0] == m.rule for r in rules_id[:i + 1])
                    }
                    style_fixes = get_style_fixes(filtered_mispreds,
                                                  vnodes_y_true, true_files,
                                                  noisy_files,
                                                  feature_extractor)
                    prediction_rate, precision = compute_metrics(
                        changes_count=changes_count,
                        predictions_count=len(filtered_mispreds),
                        true_positive=len(style_fixes))
                    prediction_rates[repo].append(round(prediction_rate, 3))
                    precisions[repo].append(round(precision, 3))
                print("prediction rate x:", prediction_rates[repo])
                print("precision y:", precisions[repo])

                # compute other statistics and quality metrics for the model's evaluation
                repo_names.append(repo)
                n_mistakes[repo] = len(true_files)
                prec_max_prediction_rate[repo] = precisions[repo][-1]
                max_prediction_rate[repo] = max(prediction_rates[repo])
                n_rules_filtered[repo] = len(rules_id)

                # compute the confidence and prediction rate limit for a given precision threshold
                for i, (prediction_rate, prec) in enumerate(
                        zip(prediction_rates[repo], precisions[repo])):
                    if prec >= precision_threshold:
                        accepted_rules[repo].append(
                            (i, rules_id[i][1], prediction_rate))
                if not accepted_rules[repo]:
                    # min() below would raise the same exception type with an opaque message
                    raise ValueError(
                        "No rule reaches the precision threshold %s for repository %s" %
                        (precision_threshold, repo))
                last_accepted_rule[repo] = min(accepted_rules[repo],
                                               key=itemgetter(1))
                confidence_threshold_exp[repo] = (last_accepted_rule[repo][0],
                                                  last_accepted_rule[repo][1])
    finally:
        client._channel.close()

    # compute the index of the last accepted rule according to the maximum confidence threshold
    limit_conf_id = {}
    max_confidence_threshold_exp = max(confidence_threshold_exp.values(),
                                       key=itemgetter(1))
    for repo, rules in accepted_rules.items():
        for rule in rules:
            if rule[1] < max_confidence_threshold_exp[1]:
                break
            limit_conf_id[repo] = rule[0]

    # compile the curves showing the evolutions of the prediction rate and precision score
    path_to_figure = os.path.join(dir_output, "pr_curves.png")
    plot_curve(repo_names, prediction_rates, precisions, precision_threshold,
               limit_conf_id, path_to_figure)

    # compile the markdown template for the report through jinja2
    loader = jinja2.FileSystemLoader(
        (os.path.join(os.path.dirname(__file__), "..", "templates"), ),
        followlinks=True)
    env = jinja2.Environment(trim_blocks=True,
                             lstrip_blocks=True,
                             keep_trailing_newline=True)
    env.globals.update(range=range)
    template = loader.load(env, "noisy_quality_report.md.jinja2")
    report = template.render(repos=repo_names,
                             n_mistakes=n_mistakes,
                             prec_max_prediction_rate=prec_max_prediction_rate,
                             confidence_threshold_exp=round(
                                 max_confidence_threshold_exp[1], 2),
                             max_prediction_rate=max_prediction_rate,
                             confidence_threshold=confidence_threshold,
                             support_threshold=support_threshold,
                             n_rules=n_rules,
                             n_rules_filtered=n_rules_filtered,
                             path_to_figure=path_to_figure)

    # write the quality report
    report_path = os.path.join(dir_output, "report_noise.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
示例#20
0
class FeaturesTests(unittest.TestCase):
    """Exercise `FeatureExtractor` parsing, classification and feature extraction on JavaScript."""

    @classmethod
    def setUpClass(cls):
        """Read the xz-compressed benchmark source and its serialized UAST once per class."""
        fixtures = Path(__file__).parent
        # str() is needed for Python 3.5
        with lzma.open(str(fixtures / "benchmark.js.xz"), mode="rt") as source_in:
            cls.contents = source_in.read()
        with lzma.open(str(fixtures / "benchmark.uast.xz")) as uast_in:
            cls.uast = bblfsh.Node.FromString(uast_in.read())

    def setUp(self):
        """Create a fresh extractor from the JavaScript section of the train config."""
        self.final_config = FormatAnalyzer._load_config(get_config())["train"]["javascript"]
        self.extractor = FeatureExtractor(
            language="javascript", **self.final_config["feature_extractor"])

    @staticmethod
    def _request_uast(code):
        """Ask the Babelfish server at 0.0.0.0:9432 for the UAST of JavaScript `code`."""
        client = bblfsh.BblfshClient("0.0.0.0:9432")
        response = client.parse(filename="", language="javascript", contents=code.encode())
        return response.uast

    def _assert_contiguous_positions(self, nodes, visit=None):
        """
        Assert that each node starts exactly where the previous one ended.

        :param nodes: Iterable of nodes exposing `start`, `end` and `value`.
        :param visit: Optional callable invoked on a node right after its start was checked.
        :return: Offset and line of the end of the last node, zeros if there are no nodes.
        """
        offset = line = col = 0
        for node in nodes:
            # The expected position jumps to the first column of the next line whenever
            # the node starts one line further than where the previous node ended.
            if node.start.line - 1 == line:
                line, col = line + 1, 1
            self.assertEqual((offset, line, col), node.start, node.value)
            if visit is not None:
                visit(node)
            offset, line, col = node.end
        return offset, line

    def _classify_and_count(self, contents):
        """
        Parse and classify `contents`, check the text and positions, count the classes.

        :param contents: Source code parsed together with the benchmark UAST.
        :return: Counter of the class names found in the labels and the final line number.
        """
        parsed, _ = self.extractor._parse_file(contents, self.uast, "test_file")
        classified = list(self.extractor._classify_vnodes(parsed, "test_file"))
        self.assertEqual("".join(vnode.value for vnode in classified), contents)
        counts = Counter()

        def tally(vnode):
            if vnode.y is not None:
                counts.update(map(CLASSES.__getitem__, vnode.y))

        end_offset, end_line = self._assert_contiguous_positions(classified, tally)
        self.assertEqual(len(contents), end_offset)
        return counts, end_line

    def _assert_common_class_counts(self, counts):
        """Assert the class statistics shared by both classification tests."""
        for class_name in (CLS_SPACE_INC, CLS_SPACE, CLS_NEWLINE, CLS_SINGLE_QUOTE):
            self.assertGreater(counts[class_name], 0)
        self.assertTrue(counts[CLS_SINGLE_QUOTE] % 2 == 0)

    def test_parse_file_exact_match(self):
        """The values of the parsed nodes concatenate back to a freshly parsed source file."""
        source_path = Path(__file__).parent / "for_parse_test.js.xz"
        with lzma.open(str(source_path), mode="rt") as fin:
            code = fin.read()
        uast = self._request_uast(code)
        parsed, _ = self.extractor._parse_file(code, uast, source_path)
        self.assertEqual("".join(vnode.value for vnode in parsed), code)

    def test_extract_features_exact_match(self):
        """The values of the extracted vnodes concatenate back to the benchmark source."""
        benchmark = UnicodeFile(content=self.contents, uast=self.uast, path="test.js",
                                language="javascript")
        _, _, (_, vnodes, _, _) = self.extractor.extract_features([benchmark])
        self.assertEqual("".join(vnode.value for vnode in vnodes), self.contents)

    def test_parse_file_comment_after_regexp(self):
        """A line comment followed by a regexp literal is reproduced exactly."""
        code = "x = // comment\n/<regexp>/;"
        uast = self._request_uast(code)
        parsed, _ = self.extractor._parse_file(code, uast, "")
        self.assertEqual("".join(vnode.value for vnode in parsed), code)

    def test_parse_file(self):
        """Parsed nodes are contiguous, cover the whole file and have registered parents."""
        parsed, parents = self.extractor._parse_file(self.contents, self.uast, "test_file")
        chunks = []

        def visit(vnode):
            chunks.append(vnode.value)
            if vnode.node is not None:
                self.assertIsNotNone(parents.get(id(vnode.node)), vnode)

        end_offset, end_line = self._assert_contiguous_positions(parsed, visit)
        self.assertEqual(len(self.contents), end_offset)
        # New line ends on the next line
        self.assertEqual(len(self.contents.splitlines()) + 1, end_line)
        self.assertEqual("".join(chunks), self.contents)

    def test_parse_file_with_trailing_space(self):
        """An extra trailing space is kept and ends on the last line of the file."""
        contents = self.contents + " "
        parsed, _ = self.extractor._parse_file(contents, self.uast, "test_file")
        end_offset, end_line, _ = parsed[-1].end
        self.assertEqual(len(contents), end_offset)
        # Space token always ends on the same line
        self.assertEqual(len(contents.splitlines()), end_line)
        self.assertEqual("".join(vnode.value for vnode in parsed), contents)

    def test_classify_vnodes(self):
        """Classification keeps text and positions; indentation increases match decreases."""
        counts, end_line = self._classify_and_count(self.contents)
        # New line ends on the next line
        self.assertEqual(len(self.contents.splitlines()) + 1, end_line)
        self.assertEqual(counts[CLS_SPACE_INC], counts[CLS_SPACE_DEC])
        self._assert_common_class_counts(counts)

    def test_classify_vnodes_with_trailing_space(self):
        """With a trailing space there is exactly one more indentation increase."""
        contents = self.contents + " "
        counts, end_line = self._classify_and_count(contents)
        # Space token always ends on the same line
        self.assertEqual(len(contents.splitlines()), end_line)
        self.assertEqual(counts[CLS_SPACE_INC], counts[CLS_SPACE_DEC] + 1)
        self._assert_common_class_counts(counts)

    def test_compute_labels_mappings(self):
        """Only the class sequence seen at least `cutoff_label_support` times gets a label."""
        start, end = Position(1, 1, 1), Position(10, 2, 1)
        frequent = VirtualNode("", start, end, y=(1,))
        vnodes = [
            frequent,
            frequent,
            VirtualNode("", start, end),
            VirtualNode("", start, end, y=(2,)),
            VirtualNode("", start, end, y=(3,)),
        ]
        self.extractor.cutoff_label_support = 2
        self.extractor._compute_labels_mappings(vnodes)
        self.assertEqual(self.extractor.labels_to_class_sequences, [(1,)])
        self.assertEqual(self.extractor.class_sequences_to_labels, {(1,): 0})

    def test_extract_features(self):
        """Features extracted from the same file given twice pass the sanity checks."""
        benchmark = UnicodeFile(content=self.contents, uast=self.uast, path="test.js",
                                language="javascript")
        extracted = self.extractor.extract_features([benchmark, benchmark])
        self.assertIsNotNone(extracted, "Failed to parse files.")
        self.check_X_y(*extracted)

    def check_X_y(self, X_csr, y, secondary_features):
        """
        Check the shapes, the types and the absence of fully unset rows and columns.

        :param X_csr: Feature matrix; it must provide `toarray()`.
        :param y: Labels array.
        :param secondary_features: (vnodes_y, vnodes, vnode_parents, node_parents).
        """
        X = X_csr.toarray()
        vnodes_y, vnodes, vnode_parents, node_parents = secondary_features
        n_samples, n_features = X.shape[0], X.shape[1]
        self.assertEqual(n_samples, y.shape[0])
        self.assertEqual(n_samples, len(vnodes_y))
        self.assertEqual(len(vnodes), len(vnode_parents))
        for labeled in vnodes_y:
            self.assertIsInstance(labeled, VirtualNode)
        first_parent = vnode_parents[id(vnodes[0])]
        self.assertEqual(type(first_parent).__module__, bblfsh.Node.__module__)
        for parent in node_parents.values():
            self.assertEqual(type(parent).__module__, bblfsh.Node.__module__)
        self.assertEqual(n_features, self.extractor.count_features())
        # -1 is the value this test treats as "unset"
        unset = X == -1
        unset_rows = numpy.nonzero(unset.all(axis=1))[0]
        unset_columns = numpy.nonzero(unset.all(axis=0))[0]
        self.assertEqual(len(unset_rows), 0, "%d rows are unset" % len(unset_rows))
        self.assertEqual(len(unset_columns), 0,
                         "columns %s are unset" % ", ".join(map(str, unset_columns)))

    def test_extract_features_all_lines(self):
        """Explicitly requesting the lines of both files passes the sanity checks."""
        benchmark = UnicodeFile(content=self.contents, uast=self.uast, path="test.js",
                                language="javascript")
        all_lines = list(range(1, self.contents.count("\n") + 1))
        extracted = self.extractor.extract_features([benchmark, benchmark], [all_lines] * 2)
        self.check_X_y(*extracted)

    def test_empty_strings(self):
        """An empty string literal yields the same class sequences as a non-empty one."""
        config = deepcopy(self.final_config["feature_extractor"])
        config["cutoff_label_support"] = 0
        client = bblfsh.BblfshClient("0.0.0.0:9432")

        def get_class_sequences_from_code(code: str) -> Sequence[Tuple[int, ...]]:
            response = client.parse(filename="", language="javascript", contents=code.encode())
            extractor = FeatureExtractor(language="javascript", **config)
            snippet = UnicodeFile(content=code, uast=response.uast, path="",
                                  language="javascript")
            result = extractor.extract_features([snippet])
            if result is None:
                self.fail("Could not parse test code.")
            _, _, (vnodes_y, _, _, _) = result
            return [vnode.y for vnode in vnodes_y]

        with_empty_string = get_class_sequences_from_code("var a = '';")
        with_filled_string = get_class_sequences_from_code("var a = 'a';")
        self.assertEqual(with_empty_string, with_filled_string)

    def test_extract_features_some_lines(self):
        """Features of the first half of the lines are a prefix of the features of all lines."""
        benchmark = UnicodeFile(content=self.contents, uast=self.uast, path="test.js",
                                language="javascript")
        files = [benchmark]
        first_half = list(range(1, self.contents.count("\n") // 2 + 1))

        X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents) = self.extractor.extract_features(
            files, [first_half] * 2)
        self.check_X_y(X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents))
        X2_csr, y2, (vn2_y, _, _, _) = self.extractor.extract_features(files)
        X1 = X1_csr.toarray()
        X2 = X2_csr.toarray()
        self.assertTrue((X1 == X2[:len(X1)]).all())
        self.assertTrue((y1 == y2[:len(y1)]).all())
        self.assertTrue(vn1_y == vn2_y[:len(vn1_y)])
        self.assertLess(len(y1), len(y2))

    def test_noop_vnodes(self):
        """A vnode with a labeled neighbour must not carry the NOOP class."""
        parsed, _ = self.extractor._parse_file(self.contents, self.uast, "test_file")
        classified = self.extractor._classify_vnodes(parsed, "test_file")
        merged = self.extractor._merge_classes_to_composite_labels(
            classified, "test_file", index_labels=True)
        with_noops = self.extractor._add_noops(list(merged), "test_file", index_labels=True)
        triples = zip(with_noops, islice(with_noops, 1, None), islice(with_noops, 2, None))
        for before, middle, after in triples:
            if before.y is None and after.y is None:
                continue
            self.assertNotIn(CLASS_INDEX[CLS_NOOP], middle.y or set(),
                             "\n".join(map(repr, [before, middle, after])))
示例#21
0
def filter_uast_breaking_preds(
    y: numpy.ndarray,
    y_pred: numpy.ndarray,
    vnodes_y: Sequence[VirtualNode],
    vnodes: Sequence[VirtualNode],
    files: Mapping[str, File],
    feature_extractor: FeatureExtractor,
    stub: "bblfsh.aliases.ProtocolServiceStub",
    vnode_parents: Mapping[int, bblfsh.Node],
    node_parents: Mapping[int, bblfsh.Node],
    rule_winners: numpy.ndarray,
    grouped_quote_predictions: QuotedNodeTripleMapping,
) -> Tuple[numpy.ndarray, numpy.ndarray, Sequence[VirtualNode], numpy.ndarray,
           Sequence[int]]:
    """
    Filter the model's predictions that modify the UAST apart from changing positions.

    A prediction equal to the ground truth is always kept. Any other prediction is applied to \
    the source text, the result is parsed again and the prediction is kept only if parsing \
    reports no errors and the UAST is equal to the one obtained before the change.

    :param y: Numpy 1-dimensional array of labels.
    :param y_pred: Numpy 1-dimensional array of predicted labels by the model.
    :param vnodes_y: Sequence of the labeled `VirtualNode`-s corresponding to labeled samples.
    :param vnodes: Sequence of all the `VirtualNode`-s corresponding to the input.
    :param files: Dictionary of File-s with content, uast and path.
    :param feature_extractor: FeatureExtractor used to extract features.
    :param stub: Babelfish GRPC service stub.
    :param vnode_parents: `VirtualNode`-s' parents mapping as the LCA of the closest
                           left and right babelfish nodes.
    :param node_parents: Parents mapping of the input UASTs.
    :param rule_winners: Numpy array of the index of the winning rule for each sample.
    :param grouped_quote_predictions: Quotes predictions (handled differently from the rest).
    :return: Tuple of `y`, `y_pred`, `vnodes_y` and `rule_winners` restricted to the predictions \
             that are considered valid i.e. that are not breaking the UAST, followed by the \
             list of the indices of those predictions.
    :raises AssertionError: if a labeled node cannot be found in `vnodes`.
    """
    safe_preds = []
    current_path = None  # type: Optional[str]
    parsing_cache = {}  # type: Dict[int, Optional[Tuple[bblfsh.Node, int, int]]]
    file_content = None  # type: Optional[str]
    language = feature_extractor.language
    cur_i = 0
    for i, (gt, pred, vn_y) in enumerate(zip(y, y_pred, vnodes_y)):
        if vn_y.path != current_path:
            # a new file starts: load its text and forget the parsing results of the previous one
            parsing_cache = {}
            current_path = vn_y.path
            file_content = files[vn_y.path].content.decode("utf-8", "replace")
        # advance to the position of the labeled node inside the sequence of all the nodes
        while vn_y is not vnodes[cur_i]:
            cur_i += 1
            if cur_i >= len(vnodes):
                raise AssertionError("vnodes_y and vnodes are not consistent.")
        # quote types are special cased
        if id(vn_y) in grouped_quote_predictions:
            group = grouped_quote_predictions[id(vn_y)]
            # already handled with the previous vnode
            if group is None:
                continue
            vnode1, vnode2, vnode3 = group
            content_before = file_content[vnode1.start.offset:vnode3.end.offset]
            # NOTE(review): assumes the sample following the opening quote is its closing quote
            content_after = (feature_extractor.label_to_str(y_pred[i]) + vnode2.value +
                             feature_extractor.label_to_str(y_pred[i + 1]))
            parsed_before, errors = parse_uast(
                stub, content_before, filename="", language=language)
            if errors:
                continue
            parsed_after, errors = parse_uast(
                stub, content_after, filename="", language=language)
            # the rewritten string must parse cleanly, exactly as required in the generic case
            if not errors and check_uasts_are_equal(parsed_before, parsed_after):
                safe_preds.extend((i, i + 1))
            continue
        if gt == pred:
            safe_preds.append(i)
            continue
        pred_string = feature_extractor.label_to_str(pred)
        parsed_before = _parse_code(vnode_parents[id(vn_y)], file_content, stub, parsing_cache,
                                    language, node_parents, vn_y.path)
        if parsed_before is None:
            continue
        parent_before, start, end = parsed_before
        # when the input node value is NOOP i.e. an empty string, the replacement is restricted
        # to the first occurrence
        output_pred = "".join(n.value for n in vnodes[cur_i:cur_i + 2]).replace(
            vn_y.value, pred_string, 1)
        diff_pred_offset = len(pred_string) - len(vn_y.value)
        if cur_i + 1 < len(vnodes):
            # to handle mixed indentations, we include the `VirtualNode` following the predicted
            # one in the output predicted string, and start the rest of the sequence one
            # `VirtualNode` further to avoid its repetitions
            start_next_vnode = (vn_y.start.offset + len(vn_y.value) +
                                len(vnodes[cur_i + 1].value))
            content_after = (file_content[:vn_y.start.offset] + output_pred +
                             file_content[start_next_vnode:])
        else:
            # the prediction to check corresponds to the last `VirtualNode` of the sequence
            content_after = file_content[:vn_y.start.offset] + output_pred
        content_after = content_after[start:end + diff_pred_offset]
        parent_after, errors_after = parse_uast(
            stub, content_after, filename="", language=language)
        if not errors_after and check_uasts_are_equal(parent_before, parent_after):
            safe_preds.append(i)
    _log.info("Non UAST breaking predictions: %d selected out of %d",
              len(safe_preds), y_pred.shape[0])
    # membership is tested against a set: looking each index up in the list is quadratic
    safe_indices = set(safe_preds)
    vnodes_y = [vn for i, vn in enumerate(vnodes_y) if i in safe_indices]
    return y[safe_preds], y_pred[safe_preds], vnodes_y, rule_winners[safe_preds], safe_preds
示例#22
0
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
              files: Iterator[File], **data) -> FormatModel:
        """
        Learn the formatting rules of every configured language found in the given files.

        Languages are skipped, with a log message, when they have no train configuration, \
        no files left after filtering, no feature extractor support, too few samples or \
        no rules at the end of the training.

        :param ptr: Git repository state pointer.
        :param config: configuration dict.
        :param data_service: connection to the Lookout data retrieval service.
        :param files: iterator of File records from the data service.
        :param data: contains "files" - the list of files in the pointed state.
        :return: AnalyzerModel containing the learned rules, per language.
        """
        _log = logging.getLogger(cls.__name__)
        train_config = cls._load_config(config)["train"]
        _log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
                  pformat(train_config, width=4096, compact=True))
        model = FormatModel().generate(cls, ptr)

        def by_path(file):
            return file.path

        for language, lang_files in files_by_language(files).items():
            try:
                lang_config = train_config[language]
            except KeyError:
                _log.warning("language %s is not supported, skipped", language)
                continue
            _log.info("effective train config for %s:\n%s", language,
                      pformat(lang_config, width=120, compact=True))
            random_state = lang_config["random_state"]
            lang_files = filter_files(
                lang_files, lang_config["line_length_limit"], lang_config["overall_size_limit"],
                random_state, _log)
            submit_event("%s.train.%s.files" % (cls.name, language), len(lang_files))
            if len(lang_files) == 0:
                _log.info("zero files after filtering, language %s is skipped.", language)
                continue
            try:
                extractor = FeatureExtractor(language=language,
                                             **lang_config["feature_extractor"])
            except ImportError:
                _log.warning("skipped %d %s files - not supported", len(lang_files), language)
                continue
            _log.info("training on %d %s files", len(lang_files), language)
            train_files, test_files = FormatAnalyzer.split_train_test(
                lang_files, lang_config["test_dataset_ratio"], random_state=random_state)
            # ensure that the features are reproducible
            train_files = sorted(train_files, key=by_path)
            test_files = sorted(test_files, key=by_path)
            X_train, y_train, _ = extractor.extract_features(train_files)
            X_train, selected_features = extractor.select_features(X_train, y_train)
            if test_files:
                X_test, y_test, _ = extractor.extract_features(test_files)
            if lang_config["test_dataset_ratio"]:
                if test_files:
                    real_ratio = X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
                else:
                    real_ratio = 0
                _log.debug("Real test ratio is %.3f", real_ratio)
            # record what was learned by the extractor inside the config used as the origin
            extractor_config = lang_config["feature_extractor"]
            extractor_config["selected_features"] = selected_features
            extractor_config["label_composites"] = extractor.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            if X_train.shape[0] < lower_bound_instances:
                _log.warning("skipped %d %s files: too few samples (%d/%d)",
                             len(lang_files), language, X_train.shape[0],
                             lower_bound_instances)
                continue
            _log.info("extracted %d samples to train, searching for the best hyperparameters",
                      X_train.shape[0])
            optimizer = Optimizer(**lang_config["optimizer"], random_state=random_state)
            best_score, best_params = optimizer.optimize(X_train, y_train)
            if _log.isEnabledFor(logging.DEBUG):
                _log.debug("score of the best estimator found: %.6f", best_score)
                _log.debug("params of the best estimator found: %s", str(best_params))
                _log.debug("training the model with complete data")
            else:
                _log.info("finished hyperopt at %.6f, training the full model", -best_score)
            lang_config["trainable_rules"].update(best_params)
            trainable_rules = TrainableRules(**lang_config["trainable_rules"],
                                             random_state=random_state,
                                             origin_config=lang_config)
            trainable_rules.fit(X_train, y_train)
            importances = trainable_rules.feature_importances_
            # at most 25 features, the most important first, negligible importances dropped
            top_features = [index for index in numpy.argsort(-importances)[:25]
                            if importances[index] > 1e-5]
            _log.debug(
                "feature importances from %s:\n\t%s",
                lang_config["trainable_rules"]["base_model_name"],
                "\n\t".join("%-55s %.5E" % (extractor.feature_names[index], importances[index])
                            for index in top_features))
            trainable_rules.prune_categorical_attributes(extractor)
            _log.info("obtained %d rules, generating the classification report",
                      len(trainable_rules.rules))
            trainable_rules.rules.generate_classification_report(
                X_train, y_train, "train", extractor.composite_class_representations)
            if test_files:
                trainable_rules.rules.generate_classification_report(
                    X_test, y_test, "test", extractor.composite_class_representations)
            submit_event("%s.train.%s.rules" % (cls.name, language), len(trainable_rules.rules))
            if not trainable_rules.rules.rules:
                _log.warning("model for %s has 0 rules. Skipped.", language)
                continue
            model[language] = trainable_rules.rules
        _log.info("trained %s", model)
        return model
示例#23
0
class PostprocessingTests(unittest.TestCase):
    """Check which modified predictions `UASTStabilityChecker` reports as safe."""

    @classmethod
    def setUpClass(cls):
        """Create the Babelfish client, the fake data service and the extractor config."""
        slogging_setup("DEBUG", False)
        cls.language = "javascript"
        cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
        cls.data_service = FakeDataService(cls.bblfsh_client,
                                           files=None,
                                           changes=None)
        cls.stub = cls.data_service.get_bblfsh()
        cls.config = FormatAnalyzer._load_config({
            "train": {
                "language_defaults": {
                    "feature_extractor": {
                        "cutoff_label_support": 0
                    }
                }
            },
        })["train"][cls.language]["feature_extractor"]

    @classmethod
    def tearDownClass(cls):
        """Close the channel of the Babelfish client opened in `setUpClass`."""
        cls.bblfsh_client._channel.close()

    def setUp(self):
        """Create a fresh feature extractor for every test."""
        self.fe = FeatureExtractor(language=self.language, **self.config)

    def _to_label(self, classes: Sequence[str]) -> int:
        """
        Convert a sequence of class names to the label known by the feature extractor.

        :param classes: Class names; it must be a real sequence such as a tuple, because \
                        a bare string would be iterated character by character.
        :return: Value of `class_sequences_to_labels` for the corresponding class indices.
        """
        return self.fe.class_sequences_to_labels[tuple(CLASS_INDEX[cls]
                                                       for cls in classes)]

    @staticmethod
    def _grouped_predictions_mapping(vnodes: Sequence[VirtualNode],
                                     indices: Optional[Sequence[int]]):
        """
        Build the quote groups mapping expected by the checker.

        :param vnodes: Sequence of all the `VirtualNode`-s.
        :param indices: Indices, among the labeled nodes only, of the first node of each group.
        :return: Ordered mapping from the id of the first node of each group to the triple made \
                 of that node and the two nodes which follow it, and from the id of the third \
                 node to None. Empty if `indices` is None.
        """
        result = OrderedDict()
        if indices is None:
            return result
        # positions inside `vnodes` of the labeled nodes
        y_index = [i for i, vnode in enumerate(vnodes) if vnode.y is not None]
        for i in indices:
            y_i = y_index[i]
            result[id(vnodes[y_i])] = (vnodes[y_i], vnodes[y_i + 1],
                                       vnodes[y_i + 2])
            result[id(vnodes[y_i + 2])] = None
        return result

    def _make_file(self, code: str, path: str):
        """
        Parse `code` and wrap it into a `File`; the test fails if parsing reports errors.

        :param code: Source code to parse with the language of the test case.
        :param path: Path assigned to the returned file.
        :return: `File` with the encoded code, its UAST and the path.
        """
        uast, errors = parse_uast(self.stub,
                                  code,
                                  filename="",
                                  language=self.language)
        if errors:
            self.fail("Could not parse the testing code.")
        return File(content=code.encode(), uast=uast, path=path)

    def edit_and_test(self,
                      code: str,
                      modifs: Mapping[int, Sequence[str]],
                      *,
                      quote_indices: Optional[Tuple[int, ...]] = None,
                      bad_indices: Optional[FrozenSet[int]] = None) -> None:
        """
        Override some predictions and check which ones the checker rejects.

        :param code: Source code to extract the samples from.
        :param modifs: Mapping from the sample index to the class names to predict there.
        :param quote_indices: Indices of the labeled nodes which open a quote group, if any.
        :param bad_indices: Indices expected to be rejected; all the keys of `modifs` if None.
        """
        file = self._make_file(code, "test_file")
        X, y, (vnodes_y, vnodes, vnode_parents,
               node_parents) = self.fe.extract_features([file])
        y_pred = y.copy()
        rule_winners = numpy.zeros(y.shape)
        for index, classes in modifs.items():
            y_pred[index] = self._to_label(classes)
        checker = UASTStabilityChecker(self.fe)
        grouped_quote_predictions = self._grouped_predictions_mapping(
            vnodes, quote_indices)
        new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
            y,
            y_pred,
            vnodes_y,
            vnodes, [file],
            self.stub,
            vnode_parents,
            node_parents,
            rule_winners,
            grouped_quote_predictions=grouped_quote_predictions)
        bad_preds = set(range(y.shape[0])) - set(safe_preds)
        # plain sets on both sides so that a failure compares like with like
        bad = set(modifs if bad_indices is None else bad_indices)
        self.assertEqual(bad_preds, bad)
        self.assertEqual(len(y) - len(bad), len(new_y))
        self.assertEqual(len(y_pred) - len(bad), len(new_y_pred))
        self.assertEqual(len(vnodes_y) - len(bad), len(new_vnodes_y))
        self.assertEqual(len(rule_winners) - len(bad), len(new_rule_winners))

    def test_posprocess(self):
        """A NOOP predicted for sample 1 of `var a = 0` is rejected."""
        self.edit_and_test("var a = 0", {1: (CLS_NOOP, )})

    def test_bad_and_good_quotes(self):
        """Only the quotes around a string which contains double quotes are rejected."""
        self.edit_and_test(
            """var a = '"0"'; var c = "0";""",
            {
                4: (CLS_DOUBLE_QUOTE, ),
                5: (CLS_DOUBLE_QUOTE, ),
                10: (CLS_SINGLE_QUOTE, ),
                11: (CLS_SINGLE_QUOTE, ),
            },
            quote_indices=(4, 10),
            bad_indices=frozenset((4, 5)))

    def test_lonely_quote(self):
        """A single quote predicted outside of any quote group is rejected."""
        # the trailing comma matters: without it the value is a string, not a 1-tuple
        self.edit_and_test("var a = 0; var b = 'c';", {2: (CLS_SINGLE_QUOTE, )},
                           quote_indices=(9, ))

    def test_multiple_files(self):
        """Sample indices span all the files; the rejected predictions are the modified ones."""
        data = [
            ("var a = 0", {
                1: (CLS_NOOP, )
            }),
            ("var b = 123", {
                4: (CLS_NOOP, )
            }),
        ]
        files = [self._make_file(code, "test_file_%d" % i)
                 for i, (code, _) in enumerate(data)]
        X, y, (vnodes_y, vnodes, vnode_parents,
               node_parents) = self.fe.extract_features(files)
        y_pred = y.copy()
        rule_winners = numpy.zeros(y.shape)
        for _, modif in data:
            for index, classes in modif.items():
                y_pred[index] = self._to_label(classes)
        checker = UASTStabilityChecker(self.fe)
        new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
            y,
            y_pred,
            vnodes_y,
            vnodes,
            files,
            self.stub,
            vnode_parents,
            node_parents,
            rule_winners,
            grouped_quote_predictions={})
        self.assertEqual(list(safe_preds), [0, 2, 3, 4, 5, 6, 7, 8])
示例#24
0
 def setUp(self):
     """Create a new feature extractor from the `language` and `config` attributes."""
     extractor_options = self.config
     self.fe = FeatureExtractor(**extractor_options, language=self.language)
示例#25
0
def visualize(input_filename: str, bblfsh: str, language: str,
              model_path: str) -> None:
    """
    Visualize the errors made on a single file.

    The rules loaded from the model predict the labels of the file, the predictions are \
    compared with the ground truth, and the file content is printed with every mispredicted \
    node replaced by the expected class (GREEN) followed by the predicted class (RED). \
    If nothing is mispredicted, the file content is printed unchanged.

    :param input_filename: File to analyze, handed to `prepare_file()`.
    :param bblfsh: Address handed to `BblfshClient`.
    :param language: Key of the rules inside the model and language of the feature extractor.
    :param model_path: Where to load the FormatModel from.
    :return: None, everything is printed to stdout.
    """
    model = FormatModel().load(model_path)
    rules = model[language]
    print("Model parameters: %s" % rules.origin)
    print("Stats about rules: %s" % rules)

    client = BblfshClient(bblfsh)
    file = prepare_file(input_filename, client, language)

    fe = FeatureExtractor(language=language,
                          **rules.origin_config["feature_extractor"])
    X, y, vnodes_y, vnodes = fe.extract_features([file])

    y_pred, _, _ = rules.predict(X, vnodes_y, vnodes, fe)

    # collect lines with mispredictions - could be removed
    mispred_lines = set()
    lines = set()
    for gt, pred, node in zip(y, y_pred, vnodes_y):
        line_key = (node.path, node.start.line)
        lines.add(line_key)
        if gt != pred:
            mispred_lines.add(line_key)
    print("Number of lines with mispredictions: %s out of %s mispredicted" %
          (len(mispred_lines), len(lines)))

    # collect mispredictions and all other predictions for each line with mistake
    mispred_by_line = defaultdict(list)
    for gt, pred, node in zip(y, y_pred, vnodes_y):
        line_key = (node.path, node.start.line)
        if line_key in mispred_lines:
            mispred_by_line[line_key].append(Misprediction(gt, pred, node))

    # sort each line
    for candidates in mispred_by_line.values():
        candidates.sort(key=lambda k: k.node.start.offset)

    # final mispredictions
    final_mispred = []
    for line_key in sorted(mispred_by_line):
        candidates = mispred_by_line[line_key]
        gt = [m.y for m in candidates]
        pred = [m.pred for m in candidates]
        blocks = SequenceMatcher(None, gt, pred).get_matching_blocks()

        if blocks[0].a != 0:
            # mispredictions before the first matching block
            final_mispred.extend(candidates[:blocks[0].a])
        for block, next_block in zip(blocks, blocks[1:]):
            final_mispred.extend(candidates[block.a:next_block.a])
        # NOTE(review): difflib documents the last matching block as the dummy
        # (len(a), len(b), 0), so this branch is not expected to trigger; kept as a guard.
        if blocks[-1].a != len(candidates):
            # mispredictions after the last matching block
            final_mispred.extend(candidates[blocks[-1].a:])

    mispred = sorted([misp for misp in final_mispred if misp.y != misp.pred],
                     key=lambda r: r.node.start.offset)

    old_content = file.content.decode("utf-8")
    chunks = [ENDC]
    # offset in old_content up to which the text has already been consumed
    cursor = 0
    for wrong in mispred:
        start = wrong.node.start.offset
        end = wrong.node.end.offset
        if end == start:
            end = start + len(wrong.node.value)
        # untouched text between the previous misprediction (or the file start) and this one
        chunks.append(old_content[cursor:start])
        chunks.append(GREEN + CLASSES[wrong.y] + RED + CLASSES[wrong.pred] + ENDC)
        cursor = end
    # the remainder of the file; this is the whole file when there are no mispredictions
    chunks.append(old_content[cursor:])
    print("Visualization:\n" + "".join(chunks))
示例#26
0
    def train(cls, ptr: ReferencePointer, config: Mapping[str, Any],
              data_service: DataService, **data) -> FormatModel:
        """
        Learn the formatting rules for each language found among the given files.

        A language is skipped when it is missing from the config, when no files survive the \
        filtering, when the feature extractor cannot be created (ImportError), when there \
        are too few samples, or when no rules were learned.

        :param ptr: Git repository state pointer.
        :param config: Configuration dict, passed through `cls._load_train_config()`.
        :param data_service: Connection to the Lookout data retrieval service; \
                             it is not used directly in this method.
        :param data: Contains "files" - the list of files in the pointed state.
        :return: FormatModel with the learned rules, per language.
        """
        log = logging.getLogger(cls.__name__)
        log.info("train %s %s %s", ptr.url, ptr.commit,
                 pformat(config, width=4096, compact=True))
        model = FormatModel().construct(cls, ptr)
        config = cls._load_train_config(config)
        files_per_language = files_by_language(data["files"])
        for language, files in files_per_language.items():
            try:
                lang_config = config[language]
            except KeyError:
                log.warning("language %s is not supported, skipped", language)
                continue
            files = filter_files(files, lang_config["line_length_limit"], log)
            files_count = len(files)
            submit_event("%s.train.%s.files" % (cls.name, language),
                         files_count)
            if files_count == 0:
                log.info(
                    "zero files after filtering, language %s is skipped.",
                    language)
                continue
            try:
                extractor = FeatureExtractor(
                    language=language, **lang_config["feature_extractor"])
            except ImportError:
                log.warning("skipped %d %s files - not supported", files_count,
                            language)
                continue
            log.info("training on %d %s files", files_count, language)
            # the files are ordered by path so that the features are reproducible
            ordered_files = sorted(files, key=lambda file: file.path)
            X, y, _ = extractor.extract_features(ordered_files)
            X, selected_features = extractor.select_features(X, y)
            # remember how the features were built inside the config stored with the rules
            extractor_config = lang_config["feature_extractor"]
            extractor_config["selected_features"] = selected_features
            extractor_config["label_composites"] = extractor.labels_to_class_sequences
            lower_bound_instances = lang_config["lower_bound_instances"]
            samples_count = X.shape[0]
            if samples_count < lower_bound_instances:
                log.warning("skipped %d %s files: too few samples (%d/%d)",
                            files_count, language, samples_count,
                            lower_bound_instances)
                continue
            log.debug("training the rules model")
            optimizer = Optimizer(
                n_jobs=lang_config["n_jobs"],
                n_iter=lang_config["n_iter"],
                cv=lang_config["cv"],
                random_state=lang_config["trainable_rules"]["random_state"])
            best_score, best_params = optimizer.optimize(X, y)
            log.debug("score of the best estimator found: %.6f", best_score)
            log.debug("params of the best estimator found: %s",
                      str(best_params))
            log.debug("training the model with complete data")
            rules_config = lang_config["trainable_rules"]
            rules_config.update(best_params)
            trainable_rules = TrainableRules(**rules_config,
                                             origin_config=lang_config)
            trainable_rules.fit(X, y)
            importances = trainable_rules.feature_importances_
            base_model_name = rules_config["base_model_name"]
            # at most 25 of the most important features, ignoring the negligible ones
            importance_report = [
                "%-55s %.5E" % (extractor.feature_names[index], importances[index])
                for index in numpy.argsort(-importances)[:25]
                if importances[index] > 1e-5
            ]
            log.debug("feature importances from %s:\n\t%s", base_model_name,
                      "\n\t".join(importance_report))
            submit_event("%s.train.%s.rules" % (cls.name, language),
                         len(trainable_rules.rules))
            # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
            # throw away imprecise classes
            if not trainable_rules.rules.rules:
                log.warning("model for %s has 0 rules. Skipping.", language)
            else:
                model[language] = trainable_rules.rules
        log.info("trained %s", model)
        return model
示例#27
0
 def setUp(self):
     """Create the JavaScript feature extractor from the "train" section of the config."""
     train_config = FormatAnalyzer._load_config(get_config())["train"]
     extractor_kwargs = train_config["javascript"]["feature_extractor"]
     self.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)
 def setUp(self):
     """Load the JavaScript train config and create the feature extractor from it."""
     train_config = FormatAnalyzer._load_train_config(get_train_config())
     self.final_config = train_config["javascript"]
     extractor_kwargs = self.final_config["feature_extractor"]
     self.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)