Example #1
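 # Parse a JavaScript file with Babelfish, convert the code and UAST to Unicode, and check that node start positions never decrease.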
 def test_positions(self):
     test_js_code_filepath = Path(
         __file__).parent / "browser-policy-content.js"
     with open(str(test_js_code_filepath), mode="rb") as f:
         code = f.read()
     uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(filename="",
                                                      language="javascript",
                                                      contents=code).uast
     converter = BytesToUnicodeConverter(code)
     code_uni = converter.convert_content()
     uast_uni = converter.convert_uast(uast)
     file = UnicodeFile(content=code_uni,
                        uast=uast_uni,
                        language="javascript",
                        path="test.js")
     annotated_data = AnnotationManager.from_file(file)
     self.extractor._parse_file(annotated_data)
     nodes, _ = file_to_old_parse_file_format(annotated_data)
     for index, (node1, node2) in enumerate(zip(nodes, nodes[1:])):
         self.assertLessEqual(
             node1.start.line, node2.start.line,
             "Start line position decrease for %d, %d nodes" %
             (index, index + 1))
         self.assertLessEqual(
             node1.start.offset, node2.start.offset,
             "Start offset position decrease for %d, %d nodes" %
             (index, index + 1))
Example #2
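    # Extract features for every line of two identical files and validate the resulting X and y.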
    def test_extract_features_all_lines(self):
        file = UnicodeFile(content=self.contents, uast=self.uast, path="test.js",
                           language="javascript")
        files = [file, file]

        self.check_X_y(*self.extractor.extract_features(
            files, [list(range(1, self.contents.count("\n") + 1))] * 2))
Example #3
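    # Build a test that applies predicted labels at the given offsets and compares the regenerated code with the expected result.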
    def generate_local_test(mcs, case_name, uast, contents):
        fe_config = FormatAnalyzer._load_config(
            get_config())["train"]["javascript"]
        feature_extractor = FeatureExtractor(language="javascript",
                                             label_composites=label_composites,
                                             **fe_config["feature_extractor"])
        file = UnicodeFile(content=contents, uast=uast, path="", language="")
        _, _, (vnodes_y, _, _, _) = feature_extractor.extract_features([file])
        offsets, y_pred, result = cases[case_name]

        def _test(self):
            y_cur = deepcopy(self.y)
            for offset, yi in zip(offsets, y_pred):
                i = None
                for i, vnode in enumerate(vnodes_y):  # noqa: B007
                    if offset == vnode.start.offset:
                        break
                y_cur[i] = yi
            code_generator = CodeGenerator(self.feature_extractor)
            pred_vnodes = code_generator.apply_predicted_y(
                self.vnodes, self.vnodes_y, list(range(len(self.vnodes_y))),
                FakeRules(y_cur))
            generated_file = code_generator.generate(pred_vnodes)
            self.assertEqual(generated_file, result)

        return _test
Example #4
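 # Configure a FeatureExtractor with minimal windows and three node features over the benchmark file.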
 def setUpClass(cls):
     config = FormatAnalyzer._load_config(
         merge_dicts(
             get_config(), {
                 "train": {
                     "javascript": {
                         "feature_extractor": {
                             "left_siblings_window":
                             1,
                             "right_siblings_window":
                             1,
                             "parents_depth":
                             1,
                             "node_features":
                             ["start_line", "reserved", "roles"],
                         },
                     },
                 },
             }))["train"]
     base = Path(__file__).parent
     with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     file = UnicodeFile(content=contents, uast=uast, path="", language="")
     files = [file, file]
     cls.fe = FeatureExtractor(language="javascript",
                               **config["javascript"]["feature_extractor"])
     cls.fe.extract_features(files)
     cls.class_representations = cls.fe.composite_class_representations
     cls.n_classes = len(cls.fe.labels_to_class_sequences)
     cls.ordinal = cls.return_node_feature(FeatureId.start_line)
     cls.categorical = cls.return_node_feature(FeatureId.reserved)
     cls.bag = cls.return_node_feature(FeatureId.roles)
Example #5
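 # Modify one prediction in each of two files and check which predictions the UASTStabilityChecker keeps as safe.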
 def test_multiple_files(self):
     data = [
         ("var a = 0",
          {1: (CLS_NOOP,)}),
         ("var b = 123",
          {4: (CLS_NOOP,)}),
     ]
     files = []
     for i, (code, _) in enumerate(data):
         uast, errors = parse_uast(self.stub, code, filename="", language=self.language,
                                   unicode=True)
         if errors:
             self.fail("Could not parse the testing code.")
         files.append(UnicodeFile(content=code, uast=uast, path="test_file_%d" % i,
                                  language="javascript"))
     X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = self.fe.extract_features(files)
     y_pred = y.copy()
     rule_winners = numpy.zeros(y.shape)
     for (_, modif) in data:
         for i in modif:
             y_pred[i] = self._to_label(modif[i])
     checker = UASTStabilityChecker(self.fe)
     new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
         y, y_pred, vnodes_y, vnodes, files, self.stub, vnode_parents,
         node_parents, rule_winners, grouped_quote_predictions={})
     self.assertEqual(list(safe_preds), [0, 2, 3, 4, 5, 6, 7, 8])
Example #6
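 # Apply the given label modifications and verify which predictions the UASTStabilityChecker rejects as unsafe.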
 def edit_and_test(self, code: str, modifs: Mapping[int, Sequence[str]], *,
                   quote_indices: Optional[Tuple[int, ...]] = None,
                   bad_indices: Optional[FrozenSet[int]] = None) -> None:
     uast, errors = parse_uast(self.stub, code, filename="", language=self.language,
                               unicode=True)
     if errors:
         self.fail("Could not parse the testing code.")
     file = UnicodeFile(content=code, uast=uast, path="test_file", language="javascript")
     X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = self.fe.extract_features([file])
     y_pred = y.copy()
     rule_winners = numpy.zeros(y.shape)
     for index, classes in modifs.items():
         y_pred[index] = self._to_label(classes)
     checker = UASTStabilityChecker(self.fe)
     grouped_quote_predictions = self._grouped_predictions_mapping(vnodes, quote_indices)
     new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
         y, y_pred, vnodes_y, vnodes, [file], self.stub, vnode_parents,
         node_parents, rule_winners, grouped_quote_predictions=grouped_quote_predictions)
     bad_preds = set(range(y.shape[0])) - set(safe_preds)
     bad = modifs.keys() if bad_indices is None else bad_indices
     self.assertEqual(bad_preds, bad)
     self.assertEqual(len(y) - len(bad), len(new_y))
     self.assertEqual(len(y_pred) - len(bad), len(new_y_pred))
     self.assertEqual(len(vnodes_y) - len(bad), len(new_vnodes_y))
     self.assertEqual(len(rule_winners) - len(bad), len(new_rule_winners))
Example #7
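    # Extract features from two identical files and validate the result.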
    def test_extract_features(self):
        file = UnicodeFile(content=self.contents,
                           uast=self.uast, path="test.js", language="javascript")
        files = [file, file]

        res = self.extractor.extract_features(files)
        self.assertIsNotNone(res, "Failed to parse files.")
        self.check_X_y(*res)
Example #8
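 # Return the class tuple of every labeled virtual node that the FeatureExtractor produces for the given code.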
 def get_class_sequences_from_code(code: str) -> Sequence[Tuple[int, ...]]:
     uast = client.parse(filename="", language="javascript", contents=code.encode()).uast
     extractor = FeatureExtractor(language="javascript", **config)
     result = extractor.extract_features([UnicodeFile(content=code, uast=uast, path="",
                                                      language="javascript")])
     if result is None:
         self.fail("Could not parse test code.")
     _, _, (vnodes_y, _, _, _) = result
     return [vnode.y for vnode in vnodes_y]
Example #9
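 # Concatenating the values of all virtual nodes must reproduce the original file contents exactly.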
 def test_extract_features_exact_match(self):
     file = UnicodeFile(content=self.contents,
                        uast=self.uast,
                        path="test.js",
                        language="javascript")
     files = [file]
     X, y, (vnodes_y, vnodes, _, _) = self.extractor.extract_features(files)
     self.assertEqual("".join(vnode.value for vnode in vnodes),
                      self.contents)
Example #10
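# Web service endpoint: parse the posted code, extract features, apply the trained rules, and return everything as JSON.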
def return_features() -> Response:
    """Featurize the given code."""
    body = request.get_json()
    code = body["code"]
    babelfish_address = body["babelfish_address"]
    language = body["language"]
    client = BblfshClient(babelfish_address)
    res = client.parse(filename="", contents=code.encode(), language=language)
    if res.status != 0:
        abort(500)
    model = FormatModel().load(str(Path(__file__).parent / "models" / "model.asdf"))
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    file = UnicodeFile(content=code, uast=res.uast, language="javascript", path="path")
    config = rules.origin_config["feature_extractor"]
    config["return_sibling_indices"] = True
    fe = FeatureExtractor(language=language, **config)
    res = fe.extract_features([file])
    if res is None:
        abort(500)
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents, sibling_indices) = res
    y_pred, rule_winners, rules, grouped_quote_predictions = rules.predict(
        X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=fe)
    refuse_to_predict = y_pred < 0
    checker = UASTStabilityChecker(fe)
    _, _, _, _, safe_preds = checker.check(
        y=y, y_pred=y_pred, vnodes_y=vnodes_y, vnodes=vnodes, files=[file], stub=client._stub,
        vnode_parents=vnode_parents, node_parents=node_parents, rule_winners=rule_winners,
        grouped_quote_predictions=grouped_quote_predictions)
    break_uast = [False] * X.shape[0]
    for wrong_pred in set(range(X.shape[0])).difference(safe_preds):
        break_uast[wrong_pred] = True
    labeled_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
    app.logger.info("returning features of shape %d, %d" % X.shape)
    app.logger.info("length of rules: %d", len(rules))
    return jsonify({
        "code": code,
        "features": _input_matrix_to_descriptions(X, fe),
        "ground_truths": y.tolist(),
        "predictions": y_pred.tolist(),
        "refuse_to_predict": refuse_to_predict.tolist(),
        "sibling_indices": sibling_indices,
        "rules": _rules_to_jsonable(rules, fe),
        "winners": rule_winners.tolist(),
        "break_uast": break_uast,
        "feature_names": fe.feature_names,
        "class_representations": fe.composite_class_representations,
        "class_printables": fe.composite_class_printables,
        "vnodes": list(map(partial(_vnode_to_jsonable, labeled_indices=labeled_indices), vnodes)),
        "config": _mapping_to_jsonable(rules.origin_config)})
Example #11
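    # Features extracted for the first half of the lines must be a prefix of the features extracted for the whole file.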
    def test_extract_features_some_lines(self):
        file = UnicodeFile(content=self.contents,
                           uast=self.uast, path="test.js", language="javascript")
        files = [file]

        X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents) = self.extractor.extract_features(
            files, [list(range(1, self.contents.count("\n") // 2 + 1))] * 2)
        self.check_X_y(X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents))
        X2_csr, y2, (vn2_y, vn2, _, _) = self.extractor.extract_features(files)
        X1, X2 = X1_csr.toarray(), X2_csr.toarray()
        self.assertTrue((X1 == X2[:len(X1)]).all())
        self.assertTrue((y1 == y2[:len(y1)]).all())
        self.assertTrue(vn1_y == vn2_y[:len(vn1_y)])
        self.assertLess(len(y1), len(y2))
Example #12
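 # Load the benchmark file and its UAST and create the FeatureExtractor shared by the tests.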
 def setUpClass(cls):
     base = Path(__file__).parent
     # str() is needed for Python 3.5
     with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     file = UnicodeFile(content=contents,
                        uast=uast,
                        path="",
                        language="javascript")
     cls.files = [file]
     config = FormatAnalyzer._load_config(get_config())["train"]
     cls.extractor = FeatureExtractor(
         language="javascript", **config["javascript"]["feature_extractor"])
Example #13
    def convert_file(file: File) -> UnicodeFile:
        """
        Convert a lookout `File` to a `UnicodeFile` with converted content and UAST.

        The path and language fields are copied unchanged from the provided `File` instance.

        :param file: lookout File to convert.
        :return: New UnicodeFile instance.
        """
        converter = BytesToUnicodeConverter(file.content)
        return UnicodeFile(
            content=converter.convert_content(),
            uast=converter.convert_uast(file.uast),
            path=file.path,
            language=file.language,
        )
Example #14
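 # Extract features from the small benchmark file once and cache the results on the test class.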
 def setUpClass(cls):
     cls.maxDiff = None
     base = Path(__file__).parent
     # str() is needed for Python 3.5
     with lzma.open(str(base / "benchmark_small.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark_small.js.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     config = FormatAnalyzer._load_config(get_config())
     fe_config = config["train"]["javascript"]
     cls.feature_extractor = FeatureExtractor(
         language="javascript",
         label_composites=label_composites,
         **fe_config["feature_extractor"])
     cls.file = UnicodeFile(content=contents,
                            uast=uast,
                            path="",
                            language="")
     cls.X, cls.y, (cls.vnodes_y, cls.vnodes, cls.vnode_parents, cls.node_parents) = \
         cls.feature_extractor.extract_features([cls.file])
Example #15
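    # For each case, apply the predicted labels, regenerate the affected lines, and compare them with the expected output.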
    def test_generate_new_line(self):
        self.maxDiff = None
        expected_res = {
            "nothing changed": [],
            "remove new line in the end of 4th line":
            None,
            "indentation in the beginning":
            [" import { makeToast } from '../../common/app/Toasts/redux';"],
            "remove indentation in the 4th line till the end":
            [" return Object.keys(flash)", " }"],
            "new line between 6th and 7th regular code lines":
            ["\n      return messages.map(message => ({"],
            "new line in the middle of the 7th code line with indentation increase":
            ["      return messages\n        .map(message => ({", "  })"],
            "new line in the middle of the 7th code line with indentation decrease":
            ["      return messages\n    .map(message => ({", "      })"],
            "new line in the middle of the 7th code line without indentation increase":
            ["      return messages\n      .map(message => ({"],
            "change quotes":
            ['import { makeToast } from "../../common/app/Toasts/redux";'],
            "remove indentation decrease 11th line": ["        }));"],
            "change indentation decrease to indentation increase 11th line":
            ["          }));"],
            "change indentation decrease to indentation increase 11th line but keep the rest":
            ["          }));", "})"],
        }

        base = Path(__file__).parent
        # str() is needed for Python 3.5
        with lzma.open(str(base / "benchmark_small.js.xz"), mode="rt") as fin:
            contents = fin.read()
        with lzma.open(str(base / "benchmark_small.js.uast.xz")) as fin:
            uast = bblfsh.Node.FromString(fin.read())
        config = FormatAnalyzer._load_config(get_config())
        fe_config = config["train"]["javascript"]

        for case in expected_res:
            offsets, y_pred, _ = cases[case]
            feature_extractor = FeatureExtractor(
                language="javascript",
                label_composites=label_composites,
                **fe_config["feature_extractor"])
            file = UnicodeFile(content=contents,
                               uast=uast,
                               path="",
                               language="")
            X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = \
                feature_extractor.extract_features([file])
            y_cur = deepcopy(y)
            for offset, yi in zip(offsets, y_pred):
                i = None
                for i, vnode in enumerate(vnodes_y):  # noqa: B007
                    if offset == vnode.start.offset:
                        break
                y_cur[i] = yi
            code_generator = CodeGenerator(feature_extractor)
            pred_vnodes = code_generator.apply_predicted_y(
                vnodes, vnodes_y, list(range(len(vnodes_y))), FakeRules(y_cur))
            res = []
            for gln in FormatAnalyzer._group_line_nodes(
                    y, y_cur, vnodes_y, pred_vnodes, [1] * len(y)):
                line, (line_y, line_y_pred, line_vnodes_y, line_vnodes,
                       line_rule_winners) = gln
                new_code_line = code_generator.generate_new_line(line_vnodes)
                res.append(new_code_line)
            if expected_res[case] is not None:
                # None means that some lines were deleted; we do not handle this case properly yet.
                self.assertEqual(res, expected_res[case], case)