class FeaturesTests(unittest.TestCase):
    """Check that the virtual nodes produced for a JavaScript file never go backwards."""

    def setUp(self):
        # Only the "train" section of the analyzer config holds the extractor settings.
        train_config = FormatAnalyzer._load_config(get_config())["train"]
        extractor_kwargs = train_config["javascript"]["feature_extractor"]
        self.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)

    def test_positions(self):
        """Start lines and start offsets of consecutive nodes must be non-decreasing."""
        js_path = Path(__file__).parent / "browser-policy-content.js"
        with open(str(js_path), mode="rb") as fin:
            code = fin.read()
        client = bblfsh.BblfshClient("0.0.0.0:9432")
        response = client.parse(filename="", language="javascript", contents=code)
        uast = response.uast
        converter = BytesToUnicodeConverter(code)
        code_uni = converter.convert_content()
        uast_uni = converter.convert_uast(uast)
        file = UnicodeFile(content=code_uni, uast=uast_uni, language="javascript",
                           path="test.js")
        annotated_data = AnnotationManager.from_file(file)
        self.extractor._parse_file(annotated_data)
        nodes, _ = file_to_old_parse_file_format(annotated_data)
        for index, current in enumerate(nodes[:-1]):
            following = nodes[index + 1]
            pair = (index, index + 1)
            self.assertLessEqual(
                current.start.line, following.start.line,
                "Start line position decrease for %d, %d nodes" % pair)
            self.assertLessEqual(
                current.start.offset, following.start.offset,
                "Start offset position decrease for %d, %d nodes" % pair)
def generate_file_fixes(self, data_service: DataService, changes: Sequence[Change],
                        ) -> Iterator[FileFix]:
    """
    Generate all data required for any type of further processing.

    The next processing step can be comment generation or performance report generation.
    One `FileFix` is yielded per head file that passes `filter_files`; files that fail
    feature extraction are reported only when `report_parse_failures` is enabled.

    :param data_service: Connection to the Lookout data retrieval service.
    :param changes: The list of changes in the pointed state.
    :return: Iterator with unrendered data per comment.
    """
    logger = self._log
    base_by_lang = files_by_language(change.base for change in changes)
    head_by_lang = files_by_language(change.head for change in changes)
    files_per_lang = defaultdict(int)
    fixes_per_lang = defaultdict(int)
    for lang, head_files in head_by_lang.items():
        if lang not in self.model:
            logger.warning("skipped %d written in %s. Rules for %s do not exist in model",
                           len(head_files), lang, lang)
            continue
        lang_rules = self.model[lang]
        lang_config = self.analyze_config[lang]
        lang_rules = lang_rules.filter_by_confidence(lang_config["confidence_threshold"])
        lang_rules = lang_rules.filter_by_support(lang_config["support_threshold"])
        candidates = filter_files(head_files,
                                  lang_rules.origin_config["line_length_limit"],
                                  lang_rules.origin_config["overall_size_limit"],
                                  log=logger)
        for file in candidates:
            files_per_lang[lang] += 1
            try:
                prev_file = base_by_lang[lang][file.path]
            except KeyError:
                # No base version is known for this path: there are no changed lines.
                prev_file = None
                lines = None
            else:
                lines = sorted(chain(find_new_lines(prev_file, file),
                                     find_deleted_lines(prev_file, file)))
            logger.debug("%s %s", file.path, lines)
            fe = FeatureExtractor(language=lang,
                                  **lang_rules.origin_config["feature_extractor"])
            extracted = fe.extract_features([file], [lines])
            if extracted is not None:
                fixes, file_vnodes, y_pred_pure, y = self._generate_token_fixes(
                    file, fe, extracted, data_service.get_bblfsh(), lang_rules)
                logger.debug("%s %d fixes", file.path, len(fixes))
                fixes_per_lang[lang] += len(fixes)
                yield FileFix(error="", head_file=file, language=lang,
                              feature_extractor=fe, base_file=prev_file,
                              file_vnodes=file_vnodes, line_fixes=fixes,
                              y_pred_pure=y_pred_pure, y=y)
                continue
            # Feature extraction returned nothing: count the parse failure.
            submit_event("%s.analyze.%s.parse_failures" % (self.name, lang), 1)
            if lang_config["report_parse_failures"]:
                logger.warning("Failed to parse %s", file.path)
                yield FileFix(error="Failed to parse", head_file=file, language=lang,
                              feature_extractor=fe, base_file=prev_file, file_vnodes=[],
                              line_fixes=[], y_pred_pure=None, y=None)
    for lang, count in files_per_lang.items():
        submit_event("%s.analyze.%s.files" % (self.name, lang), count)
    for lang, count in fixes_per_lang.items():
        submit_event("%s.analyze.%s.fixes" % (self.name, lang), count)
def generate_local_test(mcs, case_name, uast, contents):
    """
    Build a test method for the named case.

    The labeled virtual nodes are extracted once here from `contents` and `uast`; the
    returned function closes over them together with the offsets, predicted labels and
    expected output stored in `cases[case_name]`.

    :param mcs: Metaclass the generated test belongs to (unused here).
    :param case_name: Key of the case in `cases`.
    :param uast: UAST of the file.
    :param contents: Text of the file.
    :return: Test function taking the test case instance as its only argument.
    """
    javascript_config = FormatAnalyzer._load_config(get_config())["train"]["javascript"]
    feature_extractor = FeatureExtractor(language="javascript",
                                         label_composites=label_composites,
                                         **javascript_config["feature_extractor"])
    file = UnicodeFile(content=contents, uast=uast, path="", language="")
    _, _, (vnodes_y, _, _, _) = feature_extractor.extract_features([file])
    offsets, y_pred, result = cases[case_name]

    def _test(self):
        patched_y = deepcopy(self.y)
        for offset, label in zip(offsets, y_pred):
            # Stop at the first labeled node that starts at `offset`; when none matches,
            # the index stays at the last enumerated node (or None for no nodes).
            position = None
            for position, vnode in enumerate(vnodes_y):  # noqa: B007
                if vnode.start.offset == offset:
                    break
            patched_y[position] = label
        generator = CodeGenerator(self.feature_extractor)
        all_indices = list(range(len(self.vnodes_y)))
        pred_vnodes = generator.apply_predicted_y(
            self.vnodes, self.vnodes_y, all_indices, FakeRules(patched_y))
        self.assertEqual(generator.generate(pred_vnodes), result)

    return _test
def setUp(self):
    """
    Prepare the per-test fixtures.

    Loads the analyzer config, keeps its JavaScript train section in `final_config`,
    builds the feature extractor from it and annotates `self.file`.
    """
    config = FormatAnalyzer._load_config(get_config())
    self.final_config = config["train"]["javascript"]
    self.extractor = FeatureExtractor(
        language="javascript", **self.final_config["feature_extractor"])
    # The annotated file used to be built twice in a row with the first result thrown
    # away; one call yields the same final attribute value.
    self.annotated_file = AnnotationManager.from_file(self.file)
def get_class_sequences_from_code(code: str) -> Sequence[Tuple[int, ...]]:
    """
    Return the `y` attribute of every labeled virtual node extracted from `code`.

    Relies on `client`, `config` and `self` from the enclosing scope; the enclosing test
    is failed when the feature extractor returns nothing.
    """
    parsed = client.parse(filename="", language="javascript", contents=code.encode())
    file = UnicodeFile(content=code, uast=parsed.uast, path="", language="javascript")
    extractor = FeatureExtractor(language="javascript", **config)
    result = extractor.extract_features([file])
    if result is None:
        self.fail("Could not parse test code.")
    _, _, (labeled_vnodes, _, _, _) = result
    return [labeled.y for labeled in labeled_vnodes]
def return_features() -> Response:
    """
    Featurize the given code.

    Reads `code`, `babelfish_address` and `language` from the JSON request body, parses
    the code, extracts the features with the model stored next to this file, predicts
    the labels and checks which predictions are safe for the UAST.

    Aborts with HTTP 500 when parsing or feature extraction fails.

    :return: JSON response with the features, predictions, rules and virtual nodes.
    :raises NotFittedError: when the model has no rules for the requested language.
    """
    body = request.get_json()
    code = body["code"]
    babelfish_address = body["babelfish_address"]
    language = body["language"]
    client = BblfshClient(babelfish_address)
    res = client.parse(filename="", contents=code.encode(), language=language)
    if res.status != 0:
        abort(500)
    model = FormatModel().load(str(Path(__file__).parent / "models" / "model.asdf"))
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    # The file must carry the requested language, the same one used for parsing, for the
    # model lookup and for the feature extractor below (it used to be hard-coded).
    file = UnicodeFile(content=code, uast=res.uast, language=language, path="path")
    # NOTE: this updates the model's own config mapping in place, so the "config" entry
    # of the response also contains "return_sibling_indices".
    config = rules.origin_config["feature_extractor"]
    config["return_sibling_indices"] = True
    fe = FeatureExtractor(language=language, **config)
    res = fe.extract_features([file])
    if res is None:
        abort(500)
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents, sibling_indices) = res
    # `rules` is rebound to the rules object returned by the prediction.
    y_pred, rule_winners, rules, grouped_quote_predictions = rules.predict(
        X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=fe)
    refuse_to_predict = y_pred < 0
    checker = UASTStabilityChecker(fe)
    _, _, _, _, safe_preds = checker.check(
        y=y, y_pred=y_pred, vnodes_y=vnodes_y, vnodes=vnodes, files=[file],
        stub=client._stub, vnode_parents=vnode_parents, node_parents=node_parents,
        rule_winners=rule_winners, grouped_quote_predictions=grouped_quote_predictions)
    # Every sample that the checker did not report as safe is flagged.
    safe_indices = set(safe_preds)
    break_uast = [index not in safe_indices for index in range(X.shape[0])]
    labeled_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
    app.logger.info("returning features of shape %d, %d", *X.shape)
    app.logger.info("length of rules: %d", len(rules))
    return jsonify({
        "code": code,
        "features": _input_matrix_to_descriptions(X, fe),
        "ground_truths": y.tolist(),
        "predictions": y_pred.tolist(),
        "refuse_to_predict": refuse_to_predict.tolist(),
        "sibling_indices": sibling_indices,
        "rules": _rules_to_jsonable(rules, fe),
        "winners": rule_winners.tolist(),
        "break_uast": break_uast,
        "feature_names": fe.feature_names,
        "class_representations": fe.composite_class_representations,
        "class_printables": fe.composite_class_printables,
        "vnodes": list(map(partial(_vnode_to_jsonable, labeled_indices=labeled_indices),
                           vnodes)),
        "config": _mapping_to_jsonable(rules.origin_config)})
class FeaturesTests(unittest.TestCase):
    """Smoke test: parsing and classifying a real JavaScript file must not raise."""

    def setUp(self):
        train_config = FormatAnalyzer._load_config(get_config())["train"]
        extractor_kwargs = train_config["javascript"]["feature_extractor"]
        self.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)

    def test_vnode_positions(self):
        js_path = Path(__file__).parent / "jquery.layout.js"
        with open(str(js_path), mode="rb") as fin:
            code = fin.read()
        client = bblfsh.BblfshClient("0.0.0.0:9432")
        uast = client.parse(filename="", language="javascript", contents=code).uast
        text = code.decode("utf-8", "replace")
        nodes, _parents = list(self.extractor._parse_file(text, uast, js_path))
        # Just should not fail
        list(self.extractor._classify_vnodes(nodes, "filepath"))
def setUpClass(cls):
    """
    Extract features from the benchmark file with a reduced feature extractor config.

    The default train config is overridden so that only one sibling on each side, one
    parent and three node features are used.
    """
    overrides = {
        "javascript": {
            "feature_extractor": {
                "left_siblings_window": 1,
                "right_siblings_window": 1,
                "parents_depth": 1,
                "node_features": ["start_line", "reserved", "roles"],
            },
        },
    }
    config = FormatAnalyzer._load_train_config(merge_dicts(get_train_config(), overrides))
    tests_dir = Path(__file__).parent
    with lzma.open(str(tests_dir / "benchmark.js.xz"), mode="rt") as fin:
        contents = fin.read()
    with lzma.open(str(tests_dir / "benchmark.uast.xz")) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    file = File(content=bytes(contents, "utf-8"), uast=uast)
    extractor_kwargs = config["javascript"]["feature_extractor"]
    cls.fe = FeatureExtractor(language="javascript", **extractor_kwargs)
    # The same file is fed twice.
    cls.fe.extract_features([file, file])
    cls.class_representations = cls.fe.composite_class_representations
    cls.n_classes = len(cls.fe.labels_to_class_sequences)
    cls.ordinal = cls.return_node_feature(FeatureId.start_line)
    cls.categorical = cls.return_node_feature(FeatureId.reserved)
    cls.bag = cls.return_node_feature(FeatureId.roles)
def _get_composite(feature_extractor: FeatureExtractor, labels: Tuple[int, ...]) -> int:
    """
    Return the composite class index of `labels`, registering the sequence if it is new.

    A new sequence is stored in `class_sequences_to_labels` under the current size of
    that mapping and appended to `labels_to_class_sequences`; the returned value is the
    position of the appended element.

    :param feature_extractor: Object holding both label mappings.
    :param labels: Sequence of classes to look up.
    :return: Index of the composite class.
    """
    known = feature_extractor.class_sequences_to_labels
    if labels not in known:
        known[labels] = len(known)
        sequences = feature_extractor.labels_to_class_sequences
        sequences.append(labels)
        return len(sequences) - 1
    return known[labels]
class FeaturesTests(unittest.TestCase):
    """Smoke test: parsing and classifying an annotated JavaScript file must not raise."""

    def setUp(self):
        train_config = FormatAnalyzer._load_config(get_config())["train"]
        extractor_kwargs = train_config["javascript"]["feature_extractor"]
        self.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)

    def test_vnode_positions(self):
        js_path = Path(__file__).parent / "jquery.layout.js"
        with open(str(js_path), mode="rb") as fin:
            code = fin.read()
        client = bblfsh.BblfshClient("0.0.0.0:9432")
        uast = client.parse(filename="", language="javascript", contents=code).uast
        bytes_file = File(content=code, uast=uast, language="javascript", path="test.js")
        unicode_file = BytesToUnicodeConverter.convert_file(bytes_file)
        annotated_data = AnnotationManager.from_file(unicode_file)
        self.extractor._parse_file(annotated_data)
        # Just should not fail
        self.extractor._classify_vnodes(annotated_data)
def setUpClass(cls):
    """Load the benchmark file with its UAST and create the JavaScript extractor."""
    tests_dir = Path(__file__).parent
    js_archive = str(tests_dir / "benchmark.js.xz")  # str() is needed for Python 3.5
    uast_archive = str(tests_dir / "benchmark.uast.xz")
    with lzma.open(js_archive, mode="rt") as fin:
        contents = fin.read()
    with lzma.open(uast_archive) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    cls.files = [File(content=bytes(contents, "utf-8"), uast=uast)]
    train_config = FormatAnalyzer._load_config(get_config())["train"]
    extractor_kwargs = train_config["javascript"]["feature_extractor"]
    cls.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)
def files2vnodes(
        files: Iterable[str],
        feature_extractor: FeatureExtractor,
        client: str,
) -> Iterable[VirtualNode]:
    """
    Extract the labeled `VirtualNode`-s from the given files.

    :param files: List of files to get `Misprediction`-s and `VirtualNode`-s from.
    :param feature_extractor: FeatureExtractor to use.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :return: List of `VirtualNode`-s extracted from a given list of files.
    """
    prepared = prepare_files(files, client, feature_extractor.language)
    extracted = feature_extractor.extract_features(prepared)
    # Only the labeled nodes of the secondary outputs are needed.
    _X, _y, (labeled_vnodes, _vnodes, _vnode_parents, _node_parents) = extracted
    return labeled_vnodes
def files2vnodes(filepaths: Iterable[str], feature_extractor: FeatureExtractor, rules: Rules,
                 client: BblfshClient) -> Iterable[VirtualNode]:
    """
    Extract the labeled `VirtualNode`-s from the files at the given paths.

    The files are parsed with the size limits stored in the origin config of `rules`.

    :param filepaths: List of files to get `Misprediction`-s and `VirtualNode`-s from.
    :param feature_extractor: FeatureExtractor to use.
    :param rules: Rules to use for prediction.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :return: List of `VirtualNode`-s extracted from a given list of files.
    """
    limits = rules.origin_config
    parsed = parse_files(filepaths=filepaths,
                         line_length_limit=limits["line_length_limit"],
                         overall_size_limit=limits["overall_size_limit"],
                         client=client,
                         language=feature_extractor.language)
    extracted = feature_extractor.extract_features(parsed)
    # Only the labeled nodes of the secondary outputs are needed.
    _X, _y, (labeled_vnodes, _vnodes, _vnode_parents, _node_parents) = extracted
    return labeled_vnodes
def setUpClass(cls):
    """Extract the features of the small benchmark file once for the whole class."""
    cls.maxDiff = None
    tests_dir = Path(__file__).parent
    js_archive = str(tests_dir / "benchmark_small.js.xz")  # str() is needed for Python 3.5
    uast_archive = str(tests_dir / "benchmark_small.js.uast.xz")
    with lzma.open(js_archive, mode="rt") as fin:
        contents = fin.read()
    with lzma.open(uast_archive) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    javascript_config = FormatAnalyzer._load_train_config(get_train_config())["javascript"]
    cls.feature_extractor = FeatureExtractor(
        language="javascript", label_composites=label_composites,
        **javascript_config["feature_extractor"])
    cls.file = File(content=bytes(contents, "utf-8"), uast=uast)
    extracted = cls.feature_extractor.extract_features([cls.file])
    cls.X, cls.y, secondary = extracted
    cls.vnodes_y, cls.vnodes, cls.vnode_parents, cls.node_parents = secondary
def setUpClass(cls):
    """Parse jquery.layout.js and keep its code, UAST, labels and virtual nodes."""
    train_config = FormatAnalyzer._load_config(get_config())["train"]
    extractor_kwargs = train_config["javascript"]["feature_extractor"]
    cls.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)
    js_path = Path(__file__).parent / "jquery.layout.js"
    with open(str(js_path), mode="rb") as fin:
        cls.code = fin.read()
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    cls.uast = client.parse(filename="", language="javascript", contents=cls.code).uast
    fake_file = FakeFile(path="test.py", content=cls.code, uast=cls.uast,
                         language="JavaScript")
    _X, cls.y, (cls.vnodes_y, cls.vnodes, _vnode_parents, _node_parents) = \
        cls.extractor.extract_features([fake_file])
def dump_rule(model: FormatModel, rule_hash: str):
    """
    Print every rule of the model whose hash equals `rule_hash`.

    For each match the language is printed first, followed by the rule description.

    :param model: Trained model.
    :param rule_hash: 8-char rule hash.
    :return: Nothing
    """
    for lang in model.languages:
        lang_rules = model[lang]
        fe = FeatureExtractor(language=lang, **lang_rules.origin_config["feature_extractor"])
        for rule in lang_rules.rules:
            if hash_rule(rule, fe) != rule_hash:
                continue
            print(lang)
            description = describe_rule(rule, fe).replace("\t", " ")
            print(" " + description)
class FeaturesTests(unittest.TestCase):
    """Check that the nodes returned by `_parse_file` never go backwards."""

    def setUp(self):
        train_config = FormatAnalyzer._load_config(get_config())["train"]
        extractor_kwargs = train_config["javascript"]["feature_extractor"]
        self.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)

    def test_positions(self):
        """Start lines and start offsets of consecutive nodes must be non-decreasing."""
        js_path = Path(__file__).parent / "browser-policy-content.js"
        with open(str(js_path), mode="rt") as fin:
            code = fin.read()
        client = bblfsh.BblfshClient("0.0.0.0:9432")
        response = client.parse(filename="", language="javascript", contents=code.encode())
        nodes, _parents = self.extractor._parse_file(code, response.uast, js_path)
        for index, current in enumerate(nodes[:-1]):
            following = nodes[index + 1]
            pair = (index, index + 1)
            self.assertLessEqual(
                current.start.line, following.start.line,
                "Start line position decrease for %d, %d nodes" % pair)
            self.assertLessEqual(
                current.start.offset, following.start.offset,
                "Start offset position decrease for %d, %d nodes" % pair)
def test_generate_new_line(self):
    """
    Compare the regenerated lines with the expected ones for each known case.

    For every case the predicted labels from `cases` overwrite the labels of the nodes
    found by start offset; the changed lines are then generated and checked.
    """
    self.maxDiff = None
    expected_res = {
        "nothing changed": [],
        "remove new line in the end of 4th line": None,
        "indentation in the beginning":
            [" import { makeToast } from '../../common/app/Toasts/redux';"],
        "remove indentation in the 4th line till the end":
            [" return Object.keys(flash)", " }"],
        "new line between 6th and 7th regular code lines":
            ["\n return messages.map(message => ({"],
        "new line in the middle of the 7th code line with indentation increase":
            [" return messages\n .map(message => ({", " })"],
        "new line in the middle of the 7th code line with indentation decrease":
            [" return messages\n .map(message => ({", " })"],
        "new line in the middle of the 7th code line without indentation increase":
            [" return messages\n .map(message => ({"],
        "change quotes":
            ['import { makeToast } from "../../common/app/Toasts/redux";'],
        "remove indentation decrease 11th line":
            [" }));"],
        "change indentation decrease to indentation increase 11th line":
            [" }));"],
        "change indentation decrease to indentation increase 11th line but keep the rest":
            [" }));", "})"],
    }

    tests_dir = Path(__file__).parent
    js_archive = str(tests_dir / "benchmark_small.js.xz")  # str() is needed for Python 3.5
    uast_archive = str(tests_dir / "benchmark_small.js.uast.xz")
    with lzma.open(js_archive, mode="rt") as fin:
        contents = fin.read()
    with lzma.open(uast_archive) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    javascript_config = FormatAnalyzer._load_config(get_config())["train"]["javascript"]

    for case, expected in expected_res.items():
        offsets, y_pred, _ = cases[case]
        # A fresh extractor is created for every case.
        feature_extractor = FeatureExtractor(
            language="javascript", label_composites=label_composites,
            **javascript_config["feature_extractor"])
        file = UnicodeFile(content=contents, uast=uast, path="", language="")
        _X, y, (vnodes_y, vnodes, _vnode_parents, _node_parents) = \
            feature_extractor.extract_features([file])
        y_cur = deepcopy(y)
        for offset, label in zip(offsets, y_pred):
            # Stop at the first labeled node that starts at `offset`; when none matches,
            # the index stays at the last enumerated node (or None for no nodes).
            position = None
            for position, vnode in enumerate(vnodes_y):  # noqa: B007
                if vnode.start.offset == offset:
                    break
            y_cur[position] = label
        code_generator = CodeGenerator(feature_extractor)
        pred_vnodes = code_generator.apply_predicted_y(
            vnodes, vnodes_y, list(range(len(vnodes_y))), FakeRules(y_cur))
        res = []
        grouped = FormatAnalyzer._group_line_nodes(
            y, y_cur, vnodes_y, pred_vnodes, [1] * len(y))
        for _line, (_line_y, _line_y_pred, _line_vnodes_y, line_vnodes, _winners) in grouped:
            res.append(code_generator.generate_new_line(line_vnodes))
        if expected is None:
            # None means that we delete some lines. We do not handle this properly now.
            continue
        self.assertEqual(res, expected, case)
def quality_report_noisy(bblfsh: str, language: str, confidence_threshold: float,
                         support_threshold: int, precision_threshold: float, dir_output: str,
                         config: Optional[dict] = None, repos: Optional[str] = None) -> None:
    """
    Generate a quality report on the artificial noisy dataset including evaluation curves.

    For every repository the model is trained on the clean clone, the mispredictions on
    the noisy clone are matched against the injected changes, and the prediction rate
    and precision are computed while the rules are added one by one in the order of
    decreasing confidence. The curves are saved to "pr_curves.png" and the report to
    "report_noise.md", both inside `dir_output`.

    :param bblfsh: Babelfish client. Babelfish server should be started accordingly.
    :param language: Language to consider, others will be discarded.
    :param confidence_threshold: Confidence threshold to filter relevant rules.
    :param support_threshold: Support threshold to filter relevant rules.
    :param precision_threshold: Precision threshold tolerated by the model. \
                                Limit drawn as a red horizontal line on the figure.
    :param dir_output: Path to the output directory where to store the quality report \
                       in Markdown and the precision-recall curve in png format.
    :param config: FormatAnalyzer config to use. Default one is used if not set.
    :param repos: Input list of urls to the repositories to analyze. \
                  Should be strings separated by newlines. If it is None, \
                  we use the string defined at the beginning of the file.
    :raises ConnectionError: when one of the repositories cannot be cloned.
    :raises ValueError: when the noisy repository does not differ from the clean one. \
                        The built-in `min()`/`max()` calls below also raise it on empty \
                        input, e.g. when no rule reaches `precision_threshold`.
    """
    log = logging.getLogger("quality_report_noisy")

    # initialization
    repo_names = []
    last_accepted_rule = {}
    prediction_rates, precisions, accepted_rules = (defaultdict(list) for _ in range(3))
    n_mistakes, prec_max_prediction_rate, confidence_threshold_exp, max_prediction_rate, \
        n_rules, n_rules_filtered = ({} for _ in range(6))
    if repos is None:
        repos = REPOSITORIES

    # The client is created before entering `try`: if the constructor raises, the
    # `finally` clause must not fail on an unbound name and hide the real error.
    client = BblfshClient(bblfsh)
    try:
        # fetch the original and noisy repositories
        log.info("Repositories: %s", repos)
        with tempfile.TemporaryDirectory() as tmpdirname:
            for raw in repos.splitlines():
                repo_path, clean_commit, noisy_commit = raw.split(",")
                repo = repo_path.split("/")[-1]
                log.info("Fetching %s", repo_path)
                git_dir = os.path.join(tmpdirname, repo)
                git_dir_noisy = os.path.join(tmpdirname, repo + "_noisy")
                # The commands are built as argument lists so that paths with
                # whitespace are passed to git intact.
                clone_commands = (
                    ["git", "clone", "--single-branch", "--branch", "master",
                     repo_path, git_dir],
                    ["git", "clone", "--single-branch", "--branch",
                     "style-noise-1-per-file", repo_path, git_dir_noisy],
                )
                try:
                    for cmd in clone_commands:
                        log.debug("Running: %s", " ".join(cmd))
                        subprocess.check_call(cmd)
                except subprocess.CalledProcessError as e:
                    raise ConnectionError("Unable to fetch repository %s" % repo_path) from e

                # train the model on the original repository
                ref = ReferencePointer(repo_path, "HEAD", clean_commit)
                model_path = os.path.join(git_dir, "model.asdf")
                format_model = train(training_dir=git_dir, ref=ref, output_path=model_path,
                                     language=language, bblfsh=bblfsh, config=config, log=log)
                rules = format_model[language]

                # extract the raw data and the diff from the repositories
                input_pattern = os.path.join(git_dir, "**", "*.js")
                input_pattern_noisy = os.path.join(git_dir_noisy, "**", "*.js")
                true_content = get_content_from_repo(input_pattern)
                noisy_content = get_content_from_repo(input_pattern_noisy)
                true_files, noisy_files, start_changes = get_difflib_changes(
                    true_content, noisy_content)
                if not true_files:
                    raise ValueError(
                        "Noisy repo should count at least one artificial mistake")
                log.info("Number of files modified by adding style noise: %d / %d",
                         len(true_files), len(true_content))
                del true_content, noisy_content

                # extract the features
                feature_extractor = FeatureExtractor(
                    language=language, **rules.origin_config["feature_extractor"])
                vnodes_y_true = files2vnodes(true_files, feature_extractor, rules, client)
                mispreds_noise = files2mispreds(noisy_files, feature_extractor, rules,
                                                client, log)

                # compute the prediction rate and precision score on the artificial
                # noisy dataset
                diff_mispreds = get_diff_mispreds(mispreds_noise, start_changes)
                changes_count = len(start_changes)
                n_rules[repo] = len(rules.rules)
                rules_id = sorted(
                    ((i, r.stats.conf) for i, r in enumerate(rules.rules)
                     if r.stats.conf > confidence_threshold
                     and r.stats.support > support_threshold),
                    key=itemgetter(1), reverse=True)
                # The rules are enabled one at a time, the most confident first, and
                # the metrics are measured for every prefix of that ordering.
                enabled_rules = []
                for rule_index, _ in rules_id:
                    enabled_rules.append(rule_index)
                    filtered_mispreds = {
                        k: m for k, m in diff_mispreds.items() if m.rule in enabled_rules
                    }
                    style_fixes = get_style_fixes(filtered_mispreds, vnodes_y_true,
                                                  true_files, noisy_files,
                                                  feature_extractor)
                    prediction_rate, precision = compute_metrics(
                        changes_count=changes_count,
                        predictions_count=len(filtered_mispreds),
                        true_positive=len(style_fixes))
                    prediction_rates[repo].append(round(prediction_rate, 3))
                    precisions[repo].append(round(precision, 3))
                print("prediction rate x:", prediction_rates[repo])
                print("precision y:", precisions[repo])

                # compute other statistics and quality metrics for the model's evaluation
                repo_names.append(repo)
                n_mistakes[repo] = len(true_files)
                prec_max_prediction_rate[repo] = precisions[repo][-1]
                max_prediction_rate[repo] = max(prediction_rates[repo])
                n_rules_filtered[repo] = len(rules_id)

                # compute the confidence and prediction rate limit for a given
                # precision threshold
                for i, (prediction_rate, prec) in enumerate(
                        zip(prediction_rates[repo], precisions[repo])):
                    if prec >= precision_threshold:
                        accepted_rules[repo].append((i, rules_id[i][1], prediction_rate))
                last_accepted_rule[repo] = min(accepted_rules[repo], key=itemgetter(1))
                confidence_threshold_exp[repo] = (last_accepted_rule[repo][0],
                                                  last_accepted_rule[repo][1])
    finally:
        client._channel.close()

    # compute the index of the last accepted rule according to the maximum confidence
    # threshold
    limit_conf_id = {}
    max_confidence_threshold_exp = max(confidence_threshold_exp.values(), key=itemgetter(1))
    for repo, rules in accepted_rules.items():
        for rule in rules:
            if rule[1] < max_confidence_threshold_exp[1]:
                break
            limit_conf_id[repo] = rule[0]

    # compile the curves showing the evolutions of the prediction rate and precision score
    path_to_figure = os.path.join(dir_output, "pr_curves.png")
    plot_curve(repo_names, prediction_rates, precisions, precision_threshold, limit_conf_id,
               path_to_figure)

    # compile the markdown template for the report through jinja2
    loader = jinja2.FileSystemLoader(
        (os.path.join(os.path.dirname(__file__), "..", "templates"),), followlinks=True)
    env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True,
                             keep_trailing_newline=True)
    env.globals.update(range=range)
    template = loader.load(env, "noisy_quality_report.md.jinja2")
    report = template.render(
        repos=repo_names, n_mistakes=n_mistakes,
        prec_max_prediction_rate=prec_max_prediction_rate,
        confidence_threshold_exp=round(max_confidence_threshold_exp[1], 2),
        max_prediction_rate=max_prediction_rate,
        confidence_threshold=confidence_threshold,
        support_threshold=support_threshold,
        n_rules=n_rules, n_rules_filtered=n_rules_filtered,
        path_to_figure=path_to_figure)

    # write the quality report
    report_path = os.path.join(dir_output, "report_noise.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
class FeaturesTests(unittest.TestCase):
    """Tests of the feature extractor on the benchmark JavaScript file."""

    @classmethod
    def setUpClass(cls):
        tests_dir = Path(__file__).parent
        js_archive = str(tests_dir / "benchmark.js.xz")  # str() is needed for Python 3.5
        uast_archive = str(tests_dir / "benchmark.uast.xz")
        with lzma.open(js_archive, mode="rt") as fin:
            cls.contents = fin.read()
        with lzma.open(uast_archive) as fin:
            cls.uast = bblfsh.Node.FromString(fin.read())

    def setUp(self):
        full_config = FormatAnalyzer._load_config(get_config())
        self.final_config = full_config["train"]["javascript"]
        extractor_kwargs = self.final_config["feature_extractor"]
        self.extractor = FeatureExtractor(language="javascript", **extractor_kwargs)

    def test_parse_file_exact_match(self):
        js_path = Path(__file__).parent / "for_parse_test.js.xz"
        with lzma.open(str(js_path), mode="rt") as fin:
            code = fin.read()
        client = bblfsh.BblfshClient("0.0.0.0:9432")
        uast = client.parse(filename="", language="javascript", contents=code.encode()).uast
        nodes, _parents = self.extractor._parse_file(code, uast, js_path)
        restored = "".join(node.value for node in nodes)
        self.assertEqual(restored, code)

    def test_extract_features_exact_match(self):
        file = UnicodeFile(content=self.contents, uast=self.uast, path="test.js",
                           language="javascript")
        _X, _y, (_vnodes_y, vnodes, _, _) = self.extractor.extract_features([file])
        restored = "".join(vnode.value for vnode in vnodes)
        self.assertEqual(restored, self.contents)

    def test_parse_file_comment_after_regexp(self):
        code = "x = // comment\n/<regexp>/;"
        client = bblfsh.BblfshClient("0.0.0.0:9432")
        uast = client.parse(filename="", language="javascript", contents=code.encode()).uast
        nodes, _parents = self.extractor._parse_file(code, uast, "")
        restored = "".join(node.value for node in nodes)
        self.assertEqual(restored, code)

    def test_parse_file(self):
        nodes, parents = self.extractor._parse_file(self.contents, self.uast, "test_file")
        values = []
        offset = line = col = 0
        for node in nodes:
            # A node that starts on the following line resets the expected column.
            if node.start.line - 1 == line:
                line += 1
                col = 1
            self.assertEqual((offset, line, col), node.start, node.value)
            values.append(node.value)
            if node.node is not None:
                self.assertIsNotNone(parents.get(id(node.node)), node)
            offset, line, col = node.end
        self.assertEqual(len(self.contents), offset)
        # New line ends on the next line
        self.assertEqual(len(self.contents.splitlines()) + 1, line)
        self.assertEqual("".join(values), self.contents)

    def test_parse_file_with_trailing_space(self):
        contents = self.contents + " "
        nodes, _parents = self.extractor._parse_file(contents, self.uast, "test_file")
        offset, line, _col = nodes[-1].end
        self.assertEqual(len(contents), offset)
        # Space token always ends on the same line
        self.assertEqual(len(contents.splitlines()), line)
        restored = "".join(node.value for node in nodes)
        self.assertEqual(restored, contents)

    def test_classify_vnodes(self):
        parsed, _ = self.extractor._parse_file(self.contents, self.uast, "test_file")
        nodes = list(self.extractor._classify_vnodes(parsed, "test_file"))
        restored = "".join(node.value for node in nodes)
        self.assertEqual(restored, self.contents)
        cls_counts = Counter()
        offset = line = col = 0
        for node in nodes:
            # A node that starts on the following line resets the expected column.
            if node.start.line - 1 == line:
                line += 1
                col = 1
            self.assertEqual((offset, line, col), node.start, node.value)
            if node.y is not None:
                cls_counts.update(map(CLASSES.__getitem__, node.y))
            offset, line, col = node.end
        self.assertEqual(len(self.contents), offset)
        # New line ends on the next line
        self.assertEqual(len(self.contents.splitlines()) + 1, line)
        self.assertEqual(cls_counts[CLS_SPACE_INC], cls_counts[CLS_SPACE_DEC])
        for cls_name in (CLS_SPACE_INC, CLS_SPACE, CLS_NEWLINE, CLS_SINGLE_QUOTE):
            self.assertGreater(cls_counts[cls_name], 0)
        single_quotes_are_paired = cls_counts[CLS_SINGLE_QUOTE] % 2 == 0
        self.assertTrue(single_quotes_are_paired)

    def test_classify_vnodes_with_trailing_space(self):
        contents = self.contents + " "
        parsed, _ = self.extractor._parse_file(contents, self.uast, "test_file")
        nodes = list(self.extractor._classify_vnodes(parsed, "test_file"))
        restored = "".join(node.value for node in nodes)
        self.assertEqual(restored, contents)
        cls_counts = Counter()
        offset = line = col = 0
        for node in nodes:
            # A node that starts on the following line resets the expected column.
            if node.start.line - 1 == line:
                line += 1
                col = 1
            self.assertEqual((offset, line, col), node.start, node.value)
            if node.y is not None:
                cls_counts.update(map(CLASSES.__getitem__, node.y))
            offset, line, col = node.end
        self.assertEqual(len(contents), offset)
        # Space token always ends on the same line
        self.assertEqual(len(contents.splitlines()), line)
        self.assertEqual(cls_counts[CLS_SPACE_INC], cls_counts[CLS_SPACE_DEC] + 1)
        for cls_name in (CLS_SPACE_INC, CLS_SPACE, CLS_NEWLINE, CLS_SINGLE_QUOTE):
            self.assertGreater(cls_counts[cls_name], 0)
        single_quotes_are_paired = cls_counts[CLS_SINGLE_QUOTE] % 2 == 0
        self.assertTrue(single_quotes_are_paired)

    def test_compute_labels_mappings(self):
        begin, end = Position(1, 1, 1), Position(10, 2, 1)
        # Label (1,) is carried by the same node object listed twice; (2,) and (3,)
        # appear once each and one node has no label.
        repeated = [VirtualNode("", begin, end, y=(1,))] * 2
        others = [VirtualNode("", begin, end),
                  VirtualNode("", begin, end, y=(2,)),
                  VirtualNode("", begin, end, y=(3,))]
        files = repeated + others
        self.extractor.cutoff_label_support = 2
        self.extractor._compute_labels_mappings(files)
        self.assertEqual(self.extractor.labels_to_class_sequences, [(1,)])
        self.assertEqual(self.extractor.class_sequences_to_labels, {(1,): 0})

    def test_extract_features(self):
        file = UnicodeFile(content=self.contents, uast=self.uast, path="test.js",
                           language="javascript")
        res = self.extractor.extract_features([file, file])
        self.assertIsNotNone(res, "Failed to parse files.")
        self.check_X_y(*res)

    def check_X_y(self, X_csr, y, secondary_features):
        """Validate the shapes and types of the feature extractor output."""
        X = X_csr.toarray()
        vnodes_y, vnodes, vnode_parents, node_parents = secondary_features
        n_samples = X.shape[0]
        self.assertEqual(n_samples, y.shape[0])
        self.assertEqual(n_samples, len(vnodes_y))
        self.assertEqual(len(vnodes), len(vnode_parents))
        for labeled in vnodes_y:
            self.assertIsInstance(labeled, VirtualNode)
        node_module = bblfsh.Node.__module__
        self.assertEqual(type(vnode_parents[id(vnodes[0])]).__module__, node_module)
        for parent in node_parents.values():
            self.assertEqual(type(parent).__module__, bblfsh.Node.__module__)
        self.assertEqual(X.shape[1], self.extractor.count_features())
        # No row and no column may consist of -1 only.
        not_set = X == -1
        unset_rows = numpy.nonzero(numpy.all(not_set, axis=1))[0]
        unset_columns = numpy.nonzero(numpy.all(not_set, axis=0))[0]
        self.assertEqual(len(unset_rows), 0, "%d rows are unset" % len(unset_rows))
        self.assertEqual(len(unset_columns), 0,
                         "columns %s are unset" % ", ".join(map(str, unset_columns)))

    def test_extract_features_all_lines(self):
        file = UnicodeFile(content=self.contents, uast=self.uast, path="test.js",
                           language="javascript")
        all_lines = list(range(1, self.contents.count("\n") + 1))
        res = self.extractor.extract_features([file, file], [all_lines] * 2)
        self.check_X_y(*res)

    def test_empty_strings(self):
        config = deepcopy(self.final_config["feature_extractor"])
        config["cutoff_label_support"] = 0
        client = bblfsh.BblfshClient("0.0.0.0:9432")

        def get_class_sequences_from_code(code: str) -> Sequence[Tuple[int, ...]]:
            parsed = client.parse(filename="", language="javascript",
                                  contents=code.encode())
            file = UnicodeFile(content=code, uast=parsed.uast, path="",
                               language="javascript")
            extractor = FeatureExtractor(language="javascript", **config)
            result = extractor.extract_features([file])
            if result is None:
                self.fail("Could not parse test code.")
            _, _, (labeled_vnodes, _, _, _) = result
            return [labeled.y for labeled in labeled_vnodes]

        self.assertEqual(get_class_sequences_from_code("var a = '';"),
                         get_class_sequences_from_code("var a = 'a';"))

    def test_extract_features_some_lines(self):
        file = UnicodeFile(content=self.contents, uast=self.uast, path="test.js",
                           language="javascript")
        files = [file]
        first_half = list(range(1, self.contents.count("\n") // 2 + 1))
        X1_csr, y1, secondary1 = self.extractor.extract_features(files, [first_half] * 2)
        vn1_y, vn1, vn1_parents, n1_parents = secondary1
        self.check_X_y(X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents))
        X2_csr, y2, (vn2_y, _vn2, _, _) = self.extractor.extract_features(files)
        X1 = X1_csr.toarray()
        X2 = X2_csr.toarray()
        # The samples of the restricted run must be a prefix of the full run.
        self.assertTrue((X1 == X2[:len(X1)]).all())
        self.assertTrue((y1 == y2[:len(y1)]).all())
        self.assertTrue(vn1_y == vn2_y[:len(vn1_y)])
        self.assertLess(len(y1), len(y2))

    def test_noop_vnodes(self):
        vnodes, _parents = self.extractor._parse_file(self.contents, self.uast, "test_file")
        vnodes = self.extractor._classify_vnodes(vnodes, "test_file")
        vnodes = self.extractor._merge_classes_to_composite_labels(
            vnodes, "test_file", index_labels=True)
        vnodes = self.extractor._add_noops(list(vnodes), "test_file", index_labels=True)
        triples = zip(vnodes, islice(vnodes, 1, None), islice(vnodes, 2, None))
        for left, middle, right in triples:
            if left.y is None and right.y is None:
                continue
            middle_labels = middle.y if middle.y else set()
            self.assertNotIn(CLASS_INDEX[CLS_NOOP], middle_labels,
                             "\n".join(map(repr, [left, middle, right])))
def filter_uast_breaking_preds(
        y: numpy.ndarray, y_pred: numpy.ndarray, vnodes_y: Sequence[VirtualNode],
        vnodes: Sequence[VirtualNode], files: Mapping[str, File],
        feature_extractor: FeatureExtractor, stub: "bblfsh.aliases.ProtocolServiceStub",
        vnode_parents: Mapping[int, bblfsh.Node], node_parents: Mapping[int, bblfsh.Node],
        rule_winners: numpy.ndarray, grouped_quote_predictions: QuotedNodeTripleMapping,
        ) -> Tuple[numpy.ndarray, numpy.ndarray, Sequence[VirtualNode], numpy.ndarray,
                   Sequence[int]]:
    """
    Filter the model's predictions that modify the UAST apart from changing positions.

    :param y: Numpy 1-dimensional array of labels.
    :param y_pred: Numpy 1-dimensional array of predicted labels by the model.
    :param vnodes_y: Sequence of the labeled `VirtualNode`-s corresponding to labeled samples.
    :param vnodes: Sequence of all the `VirtualNode`-s corresponding to the input.
    :param files: Dictionary of File-s with content, uast and path.
    :param feature_extractor: FeatureExtractor used to extract features.
    :param stub: Babelfish GRPC service stub.
    :param vnode_parents: `VirtualNode`-s' parents mapping as the LCA of the closest
                          left and right babelfish nodes.
    :param node_parents: Parents mapping of the input UASTs.
    :param rule_winners: Numpy array of the index of the winning rule for each sample.
    :param grouped_quote_predictions: Quotes predictions (handled differently from the rest).
    :return: Tuple of `y`, `y_pred`, `vnodes_y` and `rule_winners` restricted to the safe
             samples, followed by the list of the safe sample indices, i.e. the indices of
             the predictions that are considered not to break the UAST.
    :raises AssertionError: if a labeled node of `vnodes_y` cannot be found in `vnodes`.
    """
    safe_preds = []
    current_path = None  # type: Optional[str]
    parsing_cache = {}  # type: Dict[int, Optional[Tuple[bblfsh.Node, int, int]]]
    file_content = None  # type: Optional[str]
    cur_i = 0
    for i, (gt, pred, vn_y) in enumerate(zip(y, y_pred, vnodes_y)):
        if vn_y.path != current_path:
            # a new file starts: the cached parents belong to the previous one
            parsing_cache = {}
            current_path = vn_y.path
            file_content = files[vn_y.path].content.decode("utf-8", "replace")
        # advance the cursor over `vnodes` until it points at the current labeled node
        while vn_y is not vnodes[cur_i]:
            cur_i += 1
            if cur_i >= len(vnodes):
                raise AssertionError("vnodes_y and vnodes are not consistent.")
        # quote types are special cased
        if id(vn_y) in grouped_quote_predictions:
            group = grouped_quote_predictions[id(vn_y)]
            # already handled with the previous vnode
            if group is None:
                continue
            vnode1, vnode2, vnode3 = group
            content_before = file_content[vnode1.start.offset:vnode3.end.offset]
            content_after = (feature_extractor.label_to_str(y_pred[i]) + vnode2.value +
                             feature_extractor.label_to_str(y_pred[i + 1]))
            parsed_before, errors = parse_uast(stub, content_before, filename="",
                                               language=feature_extractor.language)
            if not errors:
                parsed_after, errors = parse_uast(stub, content_after, filename="",
                                                  language=feature_extractor.language)
                if check_uasts_are_equal(parsed_before, parsed_after):
                    # both quotes of the pair are accepted together
                    safe_preds.extend((i, i + 1))
            continue
        if gt == pred:
            # nothing changes, hence nothing can break
            safe_preds.append(i)
            continue
        pred_string = feature_extractor.label_to_str(pred)
        parsed_before = _parse_code(vnode_parents[id(vn_y)], file_content, stub,
                                    parsing_cache, feature_extractor.language,
                                    node_parents, vn_y.path)
        if parsed_before is None:
            continue
        parent_before, start, end = parsed_before
        # when the input node value is NOOP i.e. an empty string, the replacement is
        # restricted to the first occurrence
        output_pred = "".join(n.value for n in vnodes[cur_i:cur_i + 2]).replace(
            vn_y.value, pred_string, 1)
        diff_pred_offset = len(pred_string) - len(vn_y.value)
        try:
            # to handle mixed indentations, we include the `VirtualNode` following the
            # predicted one in the output predicted string, and start the rest of the
            # sequence one `VirtualNode` further to avoid its repetitions
            start_next_vnode = vn_y.start.offset + len(vn_y.value) + len(
                vnodes[cur_i + 1].value)
            content_after = (file_content[:vn_y.start.offset] + output_pred +
                             file_content[start_next_vnode:])
        # in case the prediction to check corresponds to the last label of a file
        except IndexError:
            content_after = file_content[:vn_y.start.offset] + output_pred
        content_after = content_after[start:end + diff_pred_offset]
        parent_after, errors_after = parse_uast(stub, content_after, filename="",
                                                language=feature_extractor.language)
        if not errors_after:
            if check_uasts_are_equal(parent_before, parent_after):
                safe_preds.append(i)
    _log.info("Non UAST breaking predictions: %d selected out of %d",
              len(safe_preds), y_pred.shape[0])
    # membership is tested once per labeled node: use a set instead of scanning the list
    safe_indices = frozenset(safe_preds)
    vnodes_y = [vn for i, vn in enumerate(vnodes_y) if i in safe_indices]
    return y[safe_preds], y_pred[safe_preds], vnodes_y, rule_winners[safe_preds], safe_preds
def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
          files: Iterator[File], **data) -> FormatModel:
    """
    Learn the formatting rules of every supported language found in the given files.

    :param ptr: Git repository state pointer.
    :param config: configuration dict; its "train" section is used after `cls._load_config`.
    :param data_service: connection to the Lookout data retrieval service \
                         (not referenced by this method).
    :param files: iterator of File records from the data service.
    :param data: extra keyword arguments (not referenced by this method).
    :return: FormatModel with the learned rules stored per language; languages which are \
             skipped or end up with zero rules are not stored.
    """
    def path_of(file):
        return file.path

    log = logging.getLogger(cls.__name__)
    train_config = cls._load_config(config)["train"]
    log.info("train %s %s %s %s", __version__, ptr.url, ptr.commit,
             pformat(train_config, width=4096, compact=True))
    model = FormatModel().generate(cls, ptr)
    for language, lang_files in files_by_language(files).items():
        try:
            cfg = train_config[language]
        except KeyError:
            log.warning("language %s is not supported, skipped", language)
            continue
        log.info("effective train config for %s:\n%s", language,
                 pformat(cfg, width=120, compact=True))
        seed = cfg["random_state"]
        lang_files = filter_files(lang_files, cfg["line_length_limit"],
                                  cfg["overall_size_limit"], seed, log)
        files_count = len(lang_files)
        submit_event("%s.train.%s.files" % (cls.name, language), files_count)
        if files_count == 0:
            log.info("zero files after filtering, language %s is skipped.", language)
            continue
        try:
            extractor = FeatureExtractor(language=language, **cfg["feature_extractor"])
        except ImportError:
            log.warning("skipped %d %s files - not supported", len(lang_files), language)
            continue
        log.info("training on %d %s files", len(lang_files), language)

        train_files, test_files = FormatAnalyzer.split_train_test(
            lang_files, cfg["test_dataset_ratio"], random_state=seed)
        # a fixed file order keeps the extracted features reproducible
        train_files = sorted(train_files, key=path_of)
        test_files = sorted(test_files, key=path_of)
        X_train, y_train, _ = extractor.extract_features(train_files)
        X_train, selected_features = extractor.select_features(X_train, y_train)
        if test_files:
            X_test, y_test, _ = extractor.extract_features(test_files)
        if cfg["test_dataset_ratio"]:
            if test_files:
                real_ratio = X_test.shape[0] / (X_test.shape[0] + X_train.shape[0])
            else:
                real_ratio = 0
            log.debug("Real test ratio is %.3f", real_ratio)
        extractor_config = cfg["feature_extractor"]
        extractor_config["selected_features"] = selected_features
        extractor_config["label_composites"] = extractor.labels_to_class_sequences

        min_samples = cfg["lower_bound_instances"]
        if X_train.shape[0] < min_samples:
            log.warning("skipped %d %s files: too few samples (%d/%d)",
                        len(lang_files), language, X_train.shape[0], min_samples)
            continue
        log.info("extracted %d samples to train, searching for the best hyperparameters",
                 X_train.shape[0])
        optimizer = Optimizer(**cfg["optimizer"], random_state=seed)
        best_score, best_params = optimizer.optimize(X_train, y_train)
        if not log.isEnabledFor(logging.DEBUG):
            log.info("finished hyperopt at %.6f, training the full model", -best_score)
        else:
            log.debug("score of the best estimator found: %.6f", best_score)
            log.debug("params of the best estimator found: %s", str(best_params))
            log.debug("training the model with complete data")

        cfg["trainable_rules"].update(best_params)
        rules_trainer = TrainableRules(**cfg["trainable_rules"], random_state=seed,
                                       origin_config=cfg)
        rules_trainer.fit(X_train, y_train)
        importances = rules_trainer.feature_importances_
        # report at most the 25 most important features, dropping the negligible ones
        importance_report = "\n\t".join(
            "%-55s %.5E" % (extractor.feature_names[i], importances[i])
            for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5)
        log.debug("feature importances from %s:\n\t%s",
                  cfg["trainable_rules"]["base_model_name"], importance_report)
        rules_trainer.prune_categorical_attributes(extractor)
        log.info("obtained %d rules, generating the classification report",
                 len(rules_trainer.rules))
        representations = extractor.composite_class_representations
        rules_trainer.rules.generate_classification_report(
            X_train, y_train, "train", representations)
        if test_files:
            rules_trainer.rules.generate_classification_report(
                X_test, y_test, "test", extractor.composite_class_representations)
        submit_event("%s.train.%s.rules" % (cls.name, language), len(rules_trainer.rules))
        if not rules_trainer.rules.rules:
            log.warning("model for %s has 0 rules. Skipped.", language)
        else:
            model[language] = rules_trainer.rules
    log.info("trained %s", model)
    return model
class PostprocessingTests(unittest.TestCase):
    """Tests of `UASTStabilityChecker.check()` on small JavaScript snippets."""

    @classmethod
    def setUpClass(cls):
        """Set up logging, the Babelfish client and the feature extractor config once."""
        slogging_setup("DEBUG", False)
        cls.language = "javascript"
        cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
        cls.data_service = FakeDataService(cls.bblfsh_client, files=None, changes=None)
        cls.stub = cls.data_service.get_bblfsh()
        # cutoff_label_support is overridden with 0 on top of the language defaults
        cls.config = FormatAnalyzer._load_config({
            "train": {
                "language_defaults": {
                    "feature_extractor": {
                        "cutoff_label_support": 0
                    }
                }
            },
        })["train"][cls.language]["feature_extractor"]

    @classmethod
    def tearDownClass(cls):
        """Close the channel of the Babelfish client opened in `setUpClass()`."""
        cls.bblfsh_client._channel.close()

    def setUp(self):
        """Create a fresh feature extractor for each test."""
        self.fe = FeatureExtractor(language=self.language, **self.config)

    def _to_label(self, classes: Sequence[str]) -> Tuple[int, ...]:
        """
        Look up the label of a sequence of class names in the feature extractor.

        :param classes: Sequence of class names, each one being a key of `CLASS_INDEX`.
        :return: The value stored in `self.fe.class_sequences_to_labels` for that sequence.
        """
        return self.fe.class_sequences_to_labels[tuple(CLASS_INDEX[cls] for cls in classes)]

    @staticmethod
    def _grouped_predictions_mapping(vnodes: Sequence[VirtualNode],
                                     indices: Optional[Sequence[int]]):
        """
        Build the quote predictions mapping which is passed to the checker.

        :param vnodes: Sequence of all the `VirtualNode`-s.
        :param indices: Indices, among the labeled nodes only, of the nodes which open \
                        a quoted group; None produces an empty mapping.
        :return: OrderedDict from `id()` of the opening node to the triple made of that \
                 node and the two nodes which follow it in `vnodes`, and from `id()` of \
                 the third node of the triple to None.
        """
        result = OrderedDict()
        if indices is None:
            return result
        # positions in `vnodes` of the nodes which carry a label
        y_index = [i for i, vnode in enumerate(vnodes) if vnode.y is not None]
        for i in indices:
            y_i = y_index[i]
            result[id(vnodes[y_i])] = (vnodes[y_i], vnodes[y_i + 1], vnodes[y_i + 2])
            result[id(vnodes[y_i + 2])] = None
        return result

    def edit_and_test(self, code: str, modifs: Mapping[int, Sequence[str]], *,
                      quote_indices: Optional[Tuple[int, ...]] = None,
                      bad_indices: Optional[FrozenSet[int]] = None) -> None:
        """
        Alter some predictions for `code` and check which ones the checker rejects.

        :param code: Source code to parse and to extract the features from.
        :param modifs: Mapping from the sample index to the class names which replace \
                       the prediction at that index.
        :param quote_indices: Labeled node indices forwarded to \
                              `_grouped_predictions_mapping()`.
        :param bad_indices: Sample indices expected to be rejected; the keys of `modifs` \
                            are expected when it is None.
        """
        uast, errors = parse_uast(self.stub, code, filename="", language=self.language)
        if errors:
            self.fail("Could not parse the testing code.")
        file = File(content=code.encode(), uast=uast, path="test_file")
        X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = \
            self.fe.extract_features([file])
        y_pred = y.copy()
        rule_winners = numpy.zeros(y.shape)
        for index, classes in modifs.items():
            y_pred[index] = self._to_label(classes)
        checker = UASTStabilityChecker(self.fe)
        grouped_quote_predictions = self._grouped_predictions_mapping(vnodes, quote_indices)
        new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
            y, y_pred, vnodes_y, vnodes, [file], self.stub, vnode_parents, node_parents,
            rule_winners, grouped_quote_predictions=grouped_quote_predictions)
        bad_preds = set(range(y.shape[0])) - set(safe_preds)
        bad = modifs.keys() if bad_indices is None else bad_indices
        self.assertEqual(bad_preds, bad)
        # every returned sequence must have lost exactly the rejected samples
        self.assertEqual(len(y) - len(bad), len(new_y))
        self.assertEqual(len(y_pred) - len(bad), len(new_y_pred))
        self.assertEqual(len(vnodes_y) - len(bad), len(new_vnodes_y))
        self.assertEqual(len(rule_winners) - len(bad), len(new_rule_winners))

    def test_posprocess(self):
        """Check that a single altered prediction is rejected."""
        self.edit_and_test("var a = 0", {1: (CLS_NOOP, )})

    def test_bad_and_good_quotes(self):
        """Check two altered quote pairs: only the first pair is expected to be rejected."""
        self.edit_and_test("""var a = '"0"'; var c = "0";""", {
            4: (CLS_DOUBLE_QUOTE, ),
            5: (CLS_DOUBLE_QUOTE, ),
            10: (CLS_SINGLE_QUOTE, ),
            11: (CLS_SINGLE_QUOTE, )
        }, quote_indices=(4, 10), bad_indices=frozenset((4, 5)))

    def test_lonely_quote(self):
        """Check a quote class predicted for a sample outside of the declared quote group."""
        # the trailing comma matters: `(CLS_SINGLE_QUOTE)` is the bare string, not a
        # one-element tuple, and `_to_label()` would iterate over its characters
        self.edit_and_test("var a = 0; var b = 'c';", {2: (CLS_SINGLE_QUOTE, )},
                           quote_indices=(9, ))

    def test_multiple_files(self):
        """Check the predictions of two files which are extracted and checked together."""
        data = [
            ("var a = 0", {
                1: (CLS_NOOP, )
            }),
            ("var b = 123", {
                4: (CLS_NOOP, )
            }),
        ]
        files = []
        for i, (code, _) in enumerate(data):
            uast, errors = parse_uast(self.stub, code, filename="", language=self.language)
            if errors:
                self.fail("Could not parse the testing code.")
            files.append(File(content=code.encode(), uast=uast, path="test_file_%d" % i))
        X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = \
            self.fe.extract_features(files)
        y_pred = y.copy()
        rule_winners = numpy.zeros(y.shape)
        # the indices of both modification mappings address the joint `y_pred` array
        for _, modif in data:
            for index, classes in modif.items():
                y_pred[index] = self._to_label(classes)
        checker = UASTStabilityChecker(self.fe)
        new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
            y, y_pred, vnodes_y, vnodes, files, self.stub, vnode_parents, node_parents,
            rule_winners, grouped_quote_predictions={})
        self.assertEqual(list(safe_preds), [0, 2, 3, 4, 5, 6, 7, 8])
def setUp(self):
    """Build a new FeatureExtractor for each test from the class language and config."""
    language, options = self.language, self.config
    self.fe = FeatureExtractor(language=language, **options)
def visualize(input_filename: str, bblfsh: str, language: str, model_path: str) -> None:
    """
    Visualize the errors made on a single file.

    The model parameters, the rule statistics, the number of lines with mispredictions
    and finally the file content are printed; in the content, each mispredicted node is
    replaced with the ground truth class in green followed by the predicted class in red.

    :param input_filename: Path to the file to analyze.
    :param bblfsh: Value passed to `BblfshClient` - NOTE(review): presumably the server \
                   address, confirm against the caller.
    :param language: Key of the rules to use inside the loaded model.
    :param model_path: Path to load the `FormatModel` from.
    """
    model = FormatModel().load(model_path)
    rules = model[language]
    # the value is wrapped into a tuple so that "%" never unpacks it as format arguments
    print("Model parameters: %s" % (rules.origin, ))
    print("Stats about rules: %s" % (rules, ))

    client = BblfshClient(bblfsh)
    file = prepare_file(input_filename, client, language)

    fe = FeatureExtractor(language=language, **rules.origin_config["feature_extractor"])
    X, y, vnodes_y, vnodes = fe.extract_features([file])
    y_pred, _, _ = rules.predict(X, vnodes_y, vnodes, fe)

    # collect lines with mispredictions - could be removed
    mispred_lines = set()
    lines = set()
    for gt, pred, node in zip(y, y_pred, vnodes_y):
        line_key = (node.path, node.start.line)
        lines.add(line_key)
        if gt != pred:
            mispred_lines.add(line_key)
    print("Number of lines with mispredictions: %s out of %s mispredicted"
          % (len(mispred_lines), len(lines)))

    # collect mispredictions and all other predictions for each line with mistake
    mispred_by_line = defaultdict(list)
    for gt, pred, node in zip(y, y_pred, vnodes_y):
        line_key = (node.path, node.start.line)
        if line_key in mispred_lines:
            mispred_by_line[line_key].append(Misprediction(gt, pred, node))

    # sort each line
    for entries in mispred_by_line.values():
        entries.sort(key=lambda k: k.node.start.offset)

    # final mispredictions
    final_mispred = []
    for line_key in sorted(mispred_by_line):
        entries = mispred_by_line[line_key]
        gt = [m.y for m in entries]
        pred = [m.pred for m in entries]
        blocks = SequenceMatcher(None, gt, pred).get_matching_blocks()
        # mispredictions before the first matching block (empty slice if there are none)
        final_mispred.extend(entries[:blocks[0].a])
        # the last block is always the (len(a), len(b), 0) sentinel, so walking the pairs
        # of consecutive blocks reaches the end of the line and no tail handling is needed
        for block, next_block in zip(blocks, blocks[1:]):
            final_mispred.extend(entries[block.a:next_block.a])

    mispredictions = sorted((misp for misp in final_mispred if misp.y != misp.pred),
                            key=lambda r: r.node.start.offset)

    old_content = file.content.decode("utf-8")
    pieces = [ENDC]
    if not mispredictions:
        # nothing to highlight: still show the file instead of an empty visualization
        pieces.append(old_content)
    for index, wrong in enumerate(mispredictions):
        start = wrong.node.start.offset
        end = wrong.node.end.offset
        if end == start:
            end = start + len(wrong.node.value)
        if index == 0:
            # untouched content which precedes the first misprediction
            pieces.append(old_content[:start])
        pieces.append(GREEN + CLASSES[wrong.y] + RED + CLASSES[wrong.pred] + ENDC)
        # untouched content up to the next misprediction, or up to the end of the file
        if index + 1 < len(mispredictions):
            next_start = mispredictions[index + 1].node.start.offset
        else:
            next_start = len(old_content)
        pieces.append(old_content[end:next_start])
    print("Visualization:\n" + "".join(pieces))
def train(cls, ptr: ReferencePointer, config: Mapping[str, Any], data_service: DataService,
          **data) -> FormatModel:
    """
    Learn the formatting rules of every supported language found in `data["files"]`.

    :param ptr: Git repository state pointer.
    :param config: configuration dict, passed through `cls._load_train_config`.
    :param data_service: connection to the Lookout data retrieval service \
                         (not referenced by this method).
    :param data: contains "files" - the list of files in the pointed state.
    :return: FormatModel with the learned rules stored per language; languages which are \
             skipped or end up with zero rules are not stored.
    """
    def path_of(file):
        return file.path

    log = logging.getLogger(cls.__name__)
    log.info("train %s %s %s", ptr.url, ptr.commit,
             pformat(config, width=4096, compact=True))
    model = FormatModel().construct(cls, ptr)
    train_config = cls._load_train_config(config)
    for language, lang_files in files_by_language(data["files"]).items():
        try:
            cfg = train_config[language]
        except KeyError:
            log.warning("language %s is not supported, skipped", language)
            continue
        lang_files = filter_files(lang_files, cfg["line_length_limit"], log)
        files_count = len(lang_files)
        submit_event("%s.train.%s.files" % (cls.name, language), files_count)
        if files_count == 0:
            log.info("zero files after filtering, language %s is skipped.", language)
            continue
        try:
            extractor = FeatureExtractor(language=language, **cfg["feature_extractor"])
        except ImportError:
            log.warning("skipped %d %s files - not supported", len(lang_files), language)
            continue
        log.info("training on %d %s files", len(lang_files), language)

        # a fixed file order keeps the extracted features reproducible
        ordered_files = sorted(lang_files, key=path_of)
        X, y, _ = extractor.extract_features(ordered_files)
        X, selected_features = extractor.select_features(X, y)
        extractor_config = cfg["feature_extractor"]
        extractor_config["selected_features"] = selected_features
        extractor_config["label_composites"] = extractor.labels_to_class_sequences

        min_samples = cfg["lower_bound_instances"]
        if X.shape[0] < min_samples:
            log.warning("skipped %d %s files: too few samples (%d/%d)",
                        len(lang_files), language, X.shape[0], min_samples)
            continue
        log.debug("training the rules model")
        optimizer = Optimizer(n_jobs=cfg["n_jobs"],
                              n_iter=cfg["n_iter"],
                              cv=cfg["cv"],
                              random_state=cfg["trainable_rules"]["random_state"])
        best_score, best_params = optimizer.optimize(X, y)
        log.debug("score of the best estimator found: %.6f", best_score)
        log.debug("params of the best estimator found: %s", str(best_params))
        log.debug("training the model with complete data")

        cfg["trainable_rules"].update(best_params)
        rules_trainer = TrainableRules(**cfg["trainable_rules"], origin_config=cfg)
        rules_trainer.fit(X, y)
        importances = rules_trainer.feature_importances_
        # report at most the 25 most important features, dropping the negligible ones
        importance_report = "\n\t".join(
            "%-55s %.5E" % (extractor.feature_names[i], importances[i])
            for i in numpy.argsort(-importances)[:25] if importances[i] > 1e-5)
        log.debug("feature importances from %s:\n\t%s",
                  cfg["trainable_rules"]["base_model_name"], importance_report)
        submit_event("%s.train.%s.rules" % (cls.name, language), len(rules_trainer.rules))
        # TODO(vmarkovtsev): save the achieved precision, recall, etc. to the model
        # throw away imprecise classes
        if not rules_trainer.rules.rules:
            log.warning("model for %s has 0 rules. Skipping.", language)
        else:
            model[language] = rules_trainer.rules
    log.info("trained %s", model)
    return model
def setUp(self):
    """Create the JavaScript feature extractor from the "train" section of the config."""
    train_config = FormatAnalyzer._load_config(get_config())["train"]
    extractor_options = train_config["javascript"]["feature_extractor"]
    self.extractor = FeatureExtractor(language="javascript", **extractor_options)
def setUp(self):
    """Keep the JavaScript train config and create the feature extractor from it."""
    train_config = FormatAnalyzer._load_train_config(get_train_config())
    javascript_config = train_config["javascript"]
    self.final_config = javascript_config
    self.extractor = FeatureExtractor(language="javascript",
                                      **javascript_config["feature_extractor"])