def test_file_filtering(self):
    """A zero line-length limit must filter out every file; a generous one keeps them."""
    self.data_service = FakeDataService(
        self.bblfsh_client, files=self.base_files.values(), changes=[])
    config = get_config()
    # First pass: everything filtered out, no rules learned.
    # Second pass: nothing filtered out, rules must appear.
    for limit, expect_empty in ((0, True), (500, False)):
        config["train"]["language_defaults"]["line_length_limit"] = limit
        trained = FormatAnalyzer.train(self.ptr, config, self.data_service)
        if expect_empty:
            self.assertEqual(len(trained._rules_by_lang), 0)
        else:
            self.assertGreater(len(trained._rules_by_lang), 0)
def test_train(self):
    """Training on the base files must collect exactly the known identifiers."""
    data_service = FakeDataService(
        self.bblfsh_client, files=self.base_files, changes=[])
    model = IdTyposAnalyzer.train(
        ptr=self.ptr, config={}, data_service=data_service)
    expected = {"name", "print_type", "get_length", "customidentifiertostore"}
    self.assertSetEqual(model.identifiers, expected)
def test_train_check(self):
    """Retraining is not required while UASTs are intact, but is once they are stripped."""
    shared = self.base_files.keys() & self.head_files.keys()
    self.data_service = FakeDataService(
        self.bblfsh_client,
        files=self.base_files.values(),
        changes=[Change(base=self.base_files[key], head=self.head_files[key])
                 for key in shared])
    model = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
    self.assertFalse(FormatAnalyzer.check_training_required(
        model, self.ptr, get_config(), self.data_service))
    # Same changes, but with the base UASTs removed: training becomes required.
    self.data_service = FakeDataService(
        self.bblfsh_client,
        files=self.base_files.values(),
        changes=[Change(base=remove_uast(self.base_files[key]),
                        head=self.head_files[key])
                 for key in shared])
    self.assertTrue(FormatAnalyzer.check_training_required(
        model, self.ptr, get_config(), self.data_service))
def test_analyze(self):
    """Analysis with UAST break checking enabled must emit at least one comment."""
    shared = self.base_files.keys() & self.head_files.keys()
    self.data_service = FakeDataService(
        self.bblfsh_client,
        files=self.base_files.values(),
        changes=[Change(base=remove_uast(self.base_files[key]),
                        head=self.head_files[key])
                 for key in shared])
    config = get_config()
    # Enable uast_break_check only for this test.
    config["analyze"]["language_defaults"]["uast_break_check"] = True
    model = FormatAnalyzer.train(self.ptr, config, self.data_service)
    analyzer = FormatAnalyzer(model, self.ptr.url, config)
    comments = analyzer.analyze(self.ptr, self.ptr, self.data_service)
    self.assertGreater(len(comments), 0)
def test_analyze(self):
    """Comments must flag only misspelled identifiers; the whitelist toggle widens the set."""
    data_service = FakeDataService(
        self.bblfsh_client,
        files=self.base_files,
        changes=[Change(base=self.base_files[0], head=self.head_files[0])])
    model = IdTyposAnalyzer.train(
        ptr=self.ptr, config={}, data_service=data_service)
    cases = [
        # check_all_identifiers=False: the stored identifier is trusted.
        (False,
         ["nam", "print_tipe", "gett_lenght"],
         ["name", "print_type", "get_length", "customidentifiertostore"]),
        # check_all_identifiers=True: the stored identifier is flagged as well.
        (True,
         ["nam", "print_tipe", "gett_lenght", "customidentifiertostore"],
         ["name", "print_type", "get_length"]),
    ]
    for check_all, bad_names, good_names in cases:
        analyzer = IdTyposAnalyzer(
            model=model, url=self.ptr.url,
            config=dict(model=MODEL_PATH, confidence_threshold=0.0,
                        n_candidates=3, check_all_identifiers=check_all))
        comments = analyzer.analyze(
            ptr_from=self.ptr, ptr_to=self.ptr, data_service=data_service)
        self.assertGreater(len(comments), 0)
        for comment in comments:
            # Only the part before ", fixes:" names the flagged identifier.
            flagged = comment.text.split(", fixes:")[0]
            self.assertFalse(any(name in flagged for name in good_names))
            self.assertTrue(any(name in flagged for name in bad_names))
def test_run(self):
    """run() must yield at least one typo fix, each passing the shared checks."""
    data_service = FakeDataService(
        self.bblfsh_client,
        files=self.head_files,
        changes=[Change(base=self.base_files[0], head=self.head_files[0])])
    model = IdTyposAnalyzerSpy.train(
        ptr=self.ptr, config={}, data_service=data_service)
    analyzer = IdTyposAnalyzerSpy(
        model=model, url=self.ptr.url, config=self.config)
    typo_fixes = list(analyzer.run(ptr=self.ptr, data_service=data_service))
    self.assertGreater(len(typo_fixes), 0)
    for typo_fix in typo_fixes:
        self.check_typo_fix(typo_fix)
def test_train_cutoff_labels(self):
    """Training must be deterministic and the model must round-trip through ASDF."""
    self.data_service = FakeDataService(
        self.bblfsh_client, files=self.base_files.values(), changes=[])
    first = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
    self.assertIsInstance(first, FormatModel)
    self.assertIn("javascript", first, str(first))
    second = FormatAnalyzer.train(self.ptr, get_config(), self.data_service)
    # Two trainings on identical data must learn identical rules.
    self.assertEqual(first["javascript"].rules, second["javascript"].rules)
    self.assertGreater(len(first["javascript"]), 5)
    # The model must save and load back without losing anything.
    with TemporaryFile(prefix="analyzer_model-", suffix=".asdf") as fileobj:
        second.save(fileobj)
        fileobj.seek(0)
        reloaded = FormatModel().load(fileobj)
        compare_models(self, second, reloaded)
def train(training_dir: str, ref: ReferencePointer, output_path: str, language: str,
          bblfsh: str, config: Optional[Union[str, dict]],
          log: Optional[logging.Logger] = None) -> FormatModel:
    """
    Train a FormatModel for debugging purposes.

    :param training_dir: Path to the directory containing the files to train from.
    :param ref: Reference pointer to repository for training.
    :param output_path: Path to the model to write.
    :param language: Language to filter on.
    :param bblfsh: Address of the babelfish server.
    :param config: Path to a YAML config to use during the training or \
                   json-like object with a config.
    :param log: logger used to report during training.
    :return: Trained FormatModel.
    """
    bblfsh_client = BblfshClient(bblfsh)
    # A string config is a YAML file path; a dict is used as-is; None means defaults.
    if config is not None:
        if isinstance(config, str):
            with open(config) as fh:
                config = safe_load(fh)
    else:
        config = {}
    # Merge the user-supplied config with the analyzer defaults.
    config = FormatAnalyzer._load_config(config)
    # NOTE(review): the glob is hard-coded to "*.js" even though `language` is a
    # parameter — presumably only JavaScript is supported here; confirm before
    # reusing this helper with another language.
    filepaths = glob.glob(os.path.join(training_dir, "**", "*.js"), recursive=True)
    model = FormatAnalyzer.train(
        ref, config,
        FakeDataService(bblfsh_client=bblfsh_client,
                        files=parse_files(filepaths=filepaths,
                                          line_length_limit=config["train"]
                                          [language]["line_length_limit"],
                                          overall_size_limit=config["train"]
                                          [language]["overall_size_limit"],
                                          client=bblfsh_client,
                                          language=language,
                                          log=log),
                        changes=None))
    model.save(output_path)
    return model
def test_analyze(self):
    """Every emitted comment must deserialize into a valid TypoFix."""
    data_service = FakeDataService(
        self.bblfsh_client,
        files=self.head_files,
        changes=[Change(base=self.base_files[0], head=self.head_files[0])])
    model = IdTyposAnalyzerSpy.train(
        ptr=self.ptr, config={}, data_service=data_service)
    analyzer = IdTyposAnalyzerSpy(
        model=model, url=self.ptr.url, config=self.config)
    comments = analyzer.analyze(
        ptr_from=self.ptr, ptr_to=self.ptr, data_service=data_service)
    self.assertGreater(len(comments), 0)
    for comment in comments:
        self.assertIsInstance(comment, Comment)
        payload = json.loads(comment.text)
        # Candidates are serialized as (identifier, confidence) pairs.
        payload["candidates"] = [
            Candidate(identifier, confidence)
            for identifier, confidence in payload["candidates"]
        ]
        self.check_typo_fix(TypoFix(**payload))