Example #1
 def test_file_filtering(self):
     self.data_service = FakeDataService(
         self.bblfsh_client, files=self.base_files.values(), changes=[])
     config = get_config()
     config["train"]["language_defaults"]["line_length_limit"] = 0
     model_trained = FormatAnalyzer.train(self.ptr, config, self.data_service)
     self.assertEqual(len(model_trained._rules_by_lang), 0)
     config["train"]["language_defaults"]["line_length_limit"] = 500
     model_trained = FormatAnalyzer.train(self.ptr, config, self.data_service)
     self.assertGreater(len(model_trained._rules_by_lang), 0)
Example #2
 def test_train(self):
     datastub = FakeDataStub(files=self.base_files.values(), changes=None)
     config = {"n_iter": 1}
     model1 = FormatAnalyzer.train(self.ptr, config, datastub)
     self.assertIsInstance(model1, FormatModel)
     self.assertIn("javascript", model1, str(model1))
     datastub = FakeDataStub(files=self.base_files.values(), changes=None)
     config = {"n_iter": 1}
     model2 = FormatAnalyzer.train(self.ptr, config, datastub)
     self.assertEqual(model1["javascript"].rules,
                      model2["javascript"].rules)
     self.assertGreater(len(model1["javascript"]), 10)
Example #3
 def test_analyze(self):
     common = self.base_files.keys() & self.head_files.keys()
     datastub = FakeDataStub(files=self.base_files.values(),
                             changes=[
                                 Change(base=self.base_files[k],
                                        head=self.head_files[k])
                                 for k in common
                             ])
     config = {"n_iter": 1}
     model = FormatAnalyzer.train(self.ptr, config, datastub)
     analyzer = FormatAnalyzer(model, self.ptr.url, {})
     comments = analyzer.analyze(self.ptr, self.ptr, datastub)
     self.assertGreater(len(comments), 0)
Example #4
 def test_analyze(self):
     common = self.base_files.keys() & self.head_files.keys()
     self.data_service = FakeDataService(
         self.bblfsh_client,
         files=self.base_files.values(),
         changes=[Change(base=remove_uast(self.base_files[k]), head=self.head_files[k])
                  for k in common])
     config = get_config()
     # Enable uast_break_check only for this test
     config["analyze"]["language_defaults"]["uast_break_check"] = True
     model = FormatAnalyzer.train(self.ptr, config, self.data_service)
     analyzer = FormatAnalyzer(model, self.ptr.url, config)
     comments = analyzer.analyze(self.ptr, self.ptr, self.data_service)
     self.assertGreater(len(comments), 0)
Example #5
 def test_train_cutoff_labels(self):
     self.data_service = FakeDataService(
         self.bblfsh_client, files=self.base_files.values(), changes=[])
     model1 = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
     self.assertIsInstance(model1, FormatModel)
     self.assertIn("javascript", model1, str(model1))
     model2 = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
     self.assertEqual(model1["javascript"].rules, model2["javascript"].rules)
     self.assertGreater(len(model1["javascript"]), 5)
     # Check that the model can be saved without problems and then loaded back
     with TemporaryFile(prefix="analyzer_model-", suffix=".asdf") as f:
         model2.save(f)
         f.seek(0)
         model3 = FormatModel().load(f)
         compare_models(self, model2, model3)
Example #6
 def setUpClass(cls):
     config = FormatAnalyzer._load_train_config(merge_dicts(
         get_train_config(), {
             "javascript": {
                 "feature_extractor": {
                     "left_siblings_window": 1,
                     "right_siblings_window": 1,
                     "parents_depth": 1,
                     "node_features": ["start_line", "reserved", "roles"],
                 },
             },
         }))
     base = Path(__file__).parent
     with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     file = File(content=bytes(contents, "utf-8"), uast=uast)
     files = [file, file]
     cls.fe = FeatureExtractor(language="javascript",
                               **config["javascript"]["feature_extractor"])
     cls.fe.extract_features(files)
     cls.class_representations = cls.fe.composite_class_representations
     cls.n_classes = len(cls.fe.labels_to_class_sequences)
     cls.ordinal = cls.return_node_feature(FeatureId.start_line)
     cls.categorical = cls.return_node_feature(FeatureId.reserved)
     cls.bag = cls.return_node_feature(FeatureId.roles)
Example #7
 def setUp(self):
     config = FormatAnalyzer._load_config(get_config())
     self.annotated_file = AnnotationManager.from_file(self.file)
     self.final_config = config["train"]["javascript"]
     self.extractor = FeatureExtractor(
         language="javascript", **self.final_config["feature_extractor"])
Example #8
    def generate_local_test(mcs, case_name, uast, contents):
        fe_config = FormatAnalyzer._load_config(
            get_config())["train"]["javascript"]
        feature_extractor = FeatureExtractor(language="javascript",
                                             label_composites=label_composites,
                                             **fe_config["feature_extractor"])
        file = UnicodeFile(content=contents, uast=uast, path="", language="")
        _, _, (vnodes_y, _, _, _) = feature_extractor.extract_features([file])
        offsets, y_pred, result = cases[case_name]

        def _test(self):
            y_cur = deepcopy(self.y)
            for offset, yi in zip(offsets, y_pred):
                i = None
                for i, vnode in enumerate(vnodes_y):  # noqa: B007
                    if offset == vnode.start.offset:
                        break
                y_cur[i] = yi
            code_generator = CodeGenerator(self.feature_extractor)
            pred_vnodes = code_generator.apply_predicted_y(
                self.vnodes, self.vnodes_y, list(range(len(self.vnodes_y))),
                FakeRules(y_cur))
            generated_file = code_generator.generate(pred_vnodes)
            self.assertEqual(generated_file, result)

        return _test
Example #9
 def setUpClass(cls):
     slogging_setup("DEBUG", False)
     cls.language = "javascript"
     cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
     cls.data_service = FakeDataService(cls.bblfsh_client, files=None, changes=None)
     cls.stub = cls.data_service.get_bblfsh()
     cls.config = FormatAnalyzer._load_config({
         "train": {"language_defaults": {"feature_extractor": {"cutoff_label_support": 0}}},
     })["train"][cls.language]["feature_extractor"]
Example #10
def train(training_dir: str,
          ref: ReferencePointer,
          output_path: str,
          language: str,
          bblfsh: str,
          config: Optional[Union[str, dict]],
          log: Optional[logging.Logger] = None) -> FormatModel:
    """
    Train a FormatModel for debugging purposes.

    :param training_dir: Path to the directory containing the files to train from.
    :param ref: Reference pointer to the repository used for training.
    :param output_path: Path to the model to write.
    :param language: Language to filter on.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during training or a \
                   JSON-like object with the config.
    :param log: logger used to report during training.
    :return: Trained FormatModel.
    """
    bblfsh_client = BblfshClient(bblfsh)
    if config is not None:
        if isinstance(config, str):
            with open(config) as fh:
                config = safe_load(fh)
    else:
        config = {}
    config = FormatAnalyzer._load_config(config)
    filepaths = glob.glob(os.path.join(training_dir, "**", "*.js"),
                          recursive=True)
    model = FormatAnalyzer.train(
        ref, config,
        FakeDataService(bblfsh_client=bblfsh_client,
                        files=parse_files(filepaths=filepaths,
                                          line_length_limit=config["train"]
                                          [language]["line_length_limit"],
                                          overall_size_limit=config["train"]
                                          [language]["overall_size_limit"],
                                          client=bblfsh_client,
                                          language=language,
                                          log=log),
                        changes=None))
    model.save(output_path)
    return model
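
A minimal usage sketch for the debugging helper above; the repository pointer, directory, output path, and Babelfish address below are hypothetical placeholders rather than values taken from the project.

# Hypothetical invocation of train(); every literal below is a placeholder.
ptr = ReferencePointer("someurl", "someref", "somecommit")
model = train(
    training_dir="/tmp/js-corpus",          # directory with *.js files to learn from
    ref=ptr,
    output_path="/tmp/format-model.asdf",   # where the trained model is written
    language="javascript",
    bblfsh="0.0.0.0:9432",                  # Babelfish server address
    config=None,                            # None falls back to the default config
)
print(model["javascript"].rules)            # inspect the learned rules for JavaScript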
Example #11
 def test_train_check(self):
     common = self.base_files.keys() & self.head_files.keys()
     self.data_service = FakeDataService(
         self.bblfsh_client,
         files=self.base_files.values(),
         changes=[Change(base=self.base_files[k], head=self.head_files[k])
                  for k in common])
     model = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
     required = FormatAnalyzer.check_training_required(
         model, self.ptr, get_config(), self.data_service)
     self.assertFalse(required)
     self.data_service = FakeDataService(
         self.bblfsh_client,
         files=self.base_files.values(),
         changes=[Change(base=remove_uast(self.base_files[k]), head=self.head_files[k])
                  for k in common])
     required = FormatAnalyzer.check_training_required(
         model, self.ptr, get_config(), self.data_service)
     self.assertTrue(required)
Example #12
 def setUpClass(cls):
     base = Path(__file__).parent
     # str() is needed for Python 3.5
     with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     file = File(content=bytes(contents, "utf-8"), uast=uast)
     cls.files = [file]
     config = FormatAnalyzer._load_config(get_config())["train"]
     cls.extractor = FeatureExtractor(
         language="javascript", **config["javascript"]["feature_extractor"])
Example #13
 def test_files_by_language(self):
     file_stats = {"js": 2, "Python": 5, "ruby": 7}
     files = []
     for language, n_files in file_stats.items():
         for i in range(n_files):
             files.append(
                 File(language=language, uast=self.uast, path=str(i)))
     result = FormatAnalyzer._files_by_language(files)
     self.assertEqual({
         "js": 2,
         "python": 5,
         "ruby": 7
     }, {k: len(v)
         for k, v in result.items()})
     return result
Example #14
 def test_vnode_positions(self):
     code_generator = CodeGenerator(feature_extractor=self.extractor)
     lines = self.code.decode("utf-8", "replace").splitlines()
     lines.append("\r\n")
     ok = True
     for line_number, line in FormatAnalyzer._group_line_nodes(
             self.y, self.y - 1, self.vnodes_y, self.vnodes, repeat(0)):
         line_ys, line_ys_pred, line_vnodes_y, new_line_vnodes, line_winners = line
         new_code_line = code_generator.generate_new_line(new_line_vnodes)
         if lines[line_number - 1] != new_code_line:
             print("Lines %d are different" % line_number, file=sys.stderr)
             print(repr(lines[line_number - 1]), file=sys.stderr)
             print(repr(new_code_line), file=sys.stderr)
             print()
             ok = False
     self.assertTrue(ok, "Original and restored lines are different")
Example #15
 def setUpClass(cls):
     config = FormatAnalyzer._load_config(get_config())["train"]
     cls.extractor = FeatureExtractor(
         language="javascript", **config["javascript"]["feature_extractor"])
     test_js_code_filepath = Path(__file__).parent / "jquery.layout.js"
     with open(str(test_js_code_filepath), mode="rb") as f:
         cls.code = f.read()
     cls.uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(
         filename="", language="javascript", contents=cls.code).uast
     feature_extractor_output = cls.extractor.extract_features([
         FakeFile(path="test.py",
                  content=cls.code,
                  uast=cls.uast,
                  language="JavaScript")
     ])
     X, cls.y, (cls.vnodes_y, cls.vnodes, vnode_parents, node_parents) = \
         feature_extractor_output
Example #16
 def setUpClass(cls):
     cls.maxDiff = None
     base = Path(__file__).parent
     # str() is needed for Python 3.5
     with lzma.open(str(base / "benchmark_small.js.xz"), mode="rt") as fin:
         contents = fin.read()
     with lzma.open(str(base / "benchmark_small.js.uast.xz")) as fin:
         uast = bblfsh.Node.FromString(fin.read())
     config = FormatAnalyzer._load_train_config(get_train_config())
     fe_config = config["javascript"]
     cls.feature_extractor = FeatureExtractor(
         language="javascript",
         label_composites=label_composites,
         **fe_config["feature_extractor"])
     cls.file = File(content=bytes(contents, "utf-8"), uast=uast)
     cls.X, cls.y, (cls.vnodes_y, cls.vnodes, cls.vnode_parents, cls.node_parents) = \
         cls.feature_extractor.extract_features([cls.file])
Example #17
def train(training_dir: str, output_path: str, language: str, bblfsh: str, config: str
          ) -> None:
    """
    Train a FormatModel for debugging purposes.

    :param training_dir: Path to the directory containing the files to train from.
    :param output_path: Path to the model to write.
    :param language: Language to filter on.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during the training.
    """
    bblfsh_client = BblfshClient(bblfsh)
    if config is not None:
        with open(config) as fh:
            config = safe_load(fh)
    else:
        config = {}
    filenames = glob.glob(join(training_dir, "**", "*"), recursive=True)
    model = FormatAnalyzer.train(
        ReferencePointer("someurl", "someref", "somecommit"),
        config,
        FakeDataService(bblfsh_client, prepare_files(filenames, bblfsh_client, language), None)
    )
    model.save(output_path)
Example #18
 def setUp(self):
     config = FormatAnalyzer._load_config(get_config())["train"]
     self.extractor = FeatureExtractor(language="javascript",
                                       **config["javascript"]["feature_extractor"])
Example #19
 def setUp(self):
     config = FormatAnalyzer._load_train_config(get_train_config())
     self.final_config = config["javascript"]
     self.extractor = FeatureExtractor(
         language="javascript", **self.final_config["feature_extractor"])
Example #20
    def test_generate_new_line(self):
        self.maxDiff = None
        expected_res = {
            "nothing changed": [],
            "remove new line in the end of 4th line":
            None,
            "indentation in the beginning":
            [" import { makeToast } from '../../common/app/Toasts/redux';"],
            "remove indentation in the 4th line till the end":
            [" return Object.keys(flash)", " }"],
            "new line between 6th and 7th regular code lines":
            ["\n      return messages.map(message => ({"],
            "new line in the middle of the 7th code line with indentation increase":
            ["      return messages\n        .map(message => ({", "  })"],
            "new line in the middle of the 7th code line with indentation decrease":
            ["      return messages\n    .map(message => ({", "      })"],
            "new line in the middle of the 7th code line without indentation increase":
            ["      return messages\n      .map(message => ({"],
            "change quotes":
            ['import { makeToast } from "../../common/app/Toasts/redux";'],
            "remove indentation decrease 11th line": ["        }));"],
            "change indentation decrease to indentation increase 11th line":
            ["          }));"],
            "change indentation decrease to indentation increase 11th line but keep the rest":
            ["          }));", "})"],
        }

        base = Path(__file__).parent
        # str() is needed for Python 3.5
        with lzma.open(str(base / "benchmark_small.js.xz"), mode="rt") as fin:
            contents = fin.read()
        with lzma.open(str(base / "benchmark_small.js.uast.xz")) as fin:
            uast = bblfsh.Node.FromString(fin.read())
        config = FormatAnalyzer._load_config(get_config())
        fe_config = config["train"]["javascript"]

        for case in expected_res:
            offsets, y_pred, _ = cases[case]
            feature_extractor = FeatureExtractor(
                language="javascript",
                label_composites=label_composites,
                **fe_config["feature_extractor"])
            file = UnicodeFile(content=contents,
                               uast=uast,
                               path="",
                               language="")
            X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = \
                feature_extractor.extract_features([file])
            y_cur = deepcopy(y)
            for offset, yi in zip(offsets, y_pred):
                i = None
                for i, vnode in enumerate(vnodes_y):  # noqa: B007
                    if offset == vnode.start.offset:
                        break
                y_cur[i] = yi
            code_generator = CodeGenerator(feature_extractor)
            pred_vnodes = code_generator.apply_predicted_y(
                vnodes, vnodes_y, list(range(len(vnodes_y))), FakeRules(y_cur))
            res = []
            for gln in FormatAnalyzer._group_line_nodes(
                    y, y_cur, vnodes_y, pred_vnodes, [1] * len(y)):
                line, (line_y, line_y_pred, line_vnodes_y, line_vnodes,
                       line_rule_winners) = gln
                new_code_line = code_generator.generate_new_line(line_vnodes)
                res.append(new_code_line)
            if expected_res[case] is not None:
                # None means that some lines are deleted. We do not handle this case properly yet.
                self.assertEqual(res, expected_res[case], case)