Example #1
    def test_analyze(self):
        datastub = FakeDataStub(files=self.base_files,
                                changes=[Change(base=self.base_files[0], head=self.head_files[0])])
        model = IdTyposAnalyzer.train(self.ptr, {}, datastub)
        analyzer = IdTyposAnalyzer(model, self.ptr.url, {})
        comments = analyzer.analyze(self.ptr, self.ptr, datastub)
        self.assertGreater(len(comments), 0)
Example #2
    def test_reconstruct_identifier_fail(self):
        tokens = [
            ("UpperCamelCase", ["upper", "camel", "case", "fail"]),
        ]

        parser = IdTyposAnalyzer.create_token_parser()

        for identifier, splitted_tokens in tokens:
            with self.assertRaises(AssertionError):
                IdTyposAnalyzer.reconstruct_identifier(
                    parser, pred_tokens=splitted_tokens, identifier=identifier)
Example #3
    def test_reconstruct_identifier(self):
        tokens = [
            ("UpperCamelCase", "UpperComelCase", ["upper", "camel", "case"]),
            ("camelCase", "comelCase", ["camel", "case"]),
            ("FRAPScase", "FRAPScase", ["frap", "scase"]),
            ("SQLThing", "SQLThing", ["sql", "thing"]),
            ("_Astra", "_Ostra", ["astra"]),
            ("CAPS_CONST", "COPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", "_something_SIILLY_", ["something",
                                                         "silly"]),
            ("blink182", "blunk182", ["blink"]),
            ("FooBar100500Bingo", "FuBar100500Bingo", ["foo", "bar", "bingo"]),
            ("Man45var", "Men45var", ["man", "var"]),
            ("method_name", "metod_name", ["method", "name"]),
            ("Method_Name", "Metod_Name", ["method", "name"]),
            ("101dalms", "101dolms", ["dalms"]),
            ("101_dalms", "101_dolms", ["dalms"]),
            ("101_DalmsBug", "101_DolmsBug", ["dalms", "bug"]),
            ("101_Dalms45Bug7", "101_Dolms45Bug7", ["dalms", "bug"]),
            ("wdSize", "pwdSize", ["wd", "size"]),
            ("Glint", "Glunt", ["glint"]),
            ("foo_BAR", "fu_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag",
             "source.ml.algorithmos.uast_ids_to_bags",
             ["sourced", "ml", "algorithms", "uast", "ids", "to", "bag"]),
            ("WORSTnameYOUcanIMAGINE", "WORSTnomeYOUcanIMGINE",
             ["wors", "tname", "yo", "ucan", "imagine"]),
            ("SmallIdsToFoOo", "SmallestIdsToFoOo",
             ["small", "ids", "to", "fo", "oo"]),
            ("SmallIdFooo", "SmallestIdFooo", ["small", "id", "fooo"]),
            ("ONE_M0re_.__badId.example", "ONE_M0ree_.__badId.exomple",
             ["one", "m", "re", "bad", "id", "example"]),
            ("never_use_Such__varsableNames",
             "never_use_Such__varsablezzNameszz",
             ["never", "use", "such", "varsable", "names"]),
            ("a.b.c.d", "a.b.ce.de", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", "A.be.Cde.Ee", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh", "looongzz_shzz_loooongzz_shzz",
             ["looong", "sh", "loooong", "sh"]),
            ("sh_sh_sh_sh", "ch_ch_ch_ch", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", "laoong_loaong_looang",
             ["loooong", "loooong", "loooong"]),
        ]

        parser = IdTyposAnalyzer.create_token_parser()

        for correct, corrupted, correct_tokens in tokens:
            self.assertEqual(
                correct,
                IdTyposAnalyzer.reconstruct_identifier(
                    parser, pred_tokens=correct_tokens, identifier=corrupted))
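
For reference, reconstruct_identifier maps corrected subtokens back onto the
casing and separators of the corrupted identifier. A minimal sketch, reusing a
pair taken from the test data above:

    parser = IdTyposAnalyzer.create_token_parser()
    fixed = IdTyposAnalyzer.reconstruct_identifier(
        parser, pred_tokens=["method", "name"], identifier="metod_name")
    assert fixed == "method_name"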
Example #4
def evaluate_typos_on_identifiers(
        dataset: str = TYPOS_DATASET,
        config: Optional[Mapping[str, Any]] = None,
        mistakes_output: Optional[str] = None) -> str:
    """
    Run IdTyposAnalyzer on the identifiers from the evaluation dataset.

    :param dataset: Dataset of misspelled identifiers.
    :param config: Configuration for the IdTyposAnalyzer.
    :param mistakes_output: Path to the file for printing the wrong corrections.
    :return: Quality report.
    """
    identifiers = pandas.read_csv(dataset,
                                  header=0,
                                  usecols=[0, 1],
                                  names=["wrong", "correct"],
                                  keep_default_na=False)
    analyzer = IdTyposAnalyzer(IdTyposModel(), "",
                               {} if config is None else config)
    suggestions = analyzer.check_identifiers(identifiers["wrong"].tolist())
    corrections = []
    for i, identifier in enumerate(identifiers["wrong"]):
        candidates = list(
            analyzer.generate_identifier_suggestions(suggestions[i],
                                                     identifier))
        corrections.append(candidates if len(candidates) > 0 else
                           [Candidate(identifier, 1.0)])

    for pos in range(analyzer.config["n_candidates"]):
        identifiers["sugg " + str(pos)] = [
            correction[pos][0] if pos < len(correction) else ""
            for correction in corrections
        ]
    if mistakes_output is not None:
        identifiers[identifiers["sugg 0"] != identifiers["correct"]][[
            "wrong", "sugg 0", "correct"
        ]].to_csv(mistakes_output)
    template = load_jinja2_template(
        os.path.join(TEMPLATE_DIR, "quality_on_identifiers.md.jinja2"))
    return template.render(
        identifiers=identifiers,
        suggestions=suggestions,
        vocabulary_tokens=analyzer.corrector.generator.tokens,
        n_candidates=analyzer.config["n_candidates"],
        IDENTIFIER_INDEX_COLUMN=IDENTIFIER_INDEX_COLUMN,
        Candidate=Candidate,
        Columns=Columns,
        tokenize=lambda x: list(analyzer.parser.split(x)),
        flatten_df_by_column=flatten_df_by_column,
        generate_report=generate_report)
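
A hypothetical invocation of the function above (the file names are
placeholders; "n_candidates" is the config key the function itself reads when
building the suggestion columns):

    report = evaluate_typos_on_identifiers(dataset="typos.csv.xz",
                                           config={"n_candidates": 3},
                                           mistakes_output="mistakes.csv")
    print(report)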
Example #5
    @classmethod
    def setUpClass(cls):
        cls.checker = IdTyposAnalyzer(model=IdTyposModel(),
                                      url="",
                                      config=dict(model=MODEL_PATH,
                                                  confidence_threshold=0.2,
                                                  n_candidates=3))
        cls.identifiers = ["get", "gpt_tokeb"]
        cls.test_df = pandas.DataFrame(
            [[0, "get", "get"], [1, "gpt tokeb", "gpt"],
             [1, "gpt tokeb", "tokeb"]],
            columns=[IDENTIFIER_INDEX_COLUMN, Columns.Split, Columns.Token])
        cls.suggestions = {
            1: [Candidate("get", 0.9),
                Candidate("gpt", 0.3)],
            2: [
                Candidate("token", 0.98),
                Candidate("taken", 0.3),
                Candidate("tokem", 0.01)
            ]
        }
        cls.filtered_suggestions = {
            1: [Candidate("get", 0.9),
                Candidate("gpt", 0.3)],
            2: [Candidate("token", 0.98),
                Candidate("taken", 0.3)]
        }
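
Note the fixture data: with confidence_threshold=0.2, the low-confidence
Candidate("tokem", 0.01) is absent from cls.filtered_suggestions, which is
presumably the filtering behavior the tests built on this fixture verify.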
Example #6
    def test_train(self):
        dataservice = FakeDataService(self.bblfsh_client,
                                      files=self.base_files,
                                      changes=[])
        model = IdTyposAnalyzer.train(ptr=self.ptr,
                                      config={},
                                      data_service=dataservice)
        self.assertSetEqual(
            model.identifiers,
            {"name", "print_type", "get_length", "customidentifiertostore"})
Example #7
    def test_analyze(self):
        dataservice = FakeDataService(
            self.bblfsh_client,
            files=self.base_files,
            changes=[Change(base=self.base_files[0], head=self.head_files[0])])
        model = IdTyposAnalyzer.train(ptr=self.ptr,
                                      config={},
                                      data_service=dataservice)
        analyzer = IdTyposAnalyzer(model=model,
                                   url=self.ptr.url,
                                   config=dict(model=MODEL_PATH,
                                               confidence_threshold=0.0,
                                               n_candidates=3,
                                               check_all_identifiers=False))
        comments = analyzer.analyze(ptr_from=self.ptr,
                                    ptr_to=self.ptr,
                                    data_service=dataservice)
        self.assertGreater(len(comments), 0)
        bad_names = ["nam", "print_tipe", "gett_lenght"]
        good_names = [
            "name", "print_type", "get_length", "customidentifiertostore"
        ]
        for c in comments:
            self.assertFalse(
                any(name in c.text.split(", fixes:")[0]
                    for name in good_names))
            self.assertTrue(
                any(name in c.text.split(", fixes:")[0] for name in bad_names))

        analyzer = IdTyposAnalyzer(model=model,
                                   url=self.ptr.url,
                                   config=dict(model=MODEL_PATH,
                                               confidence_threshold=0.0,
                                               n_candidates=3,
                                               check_all_identifiers=True))
        comments = analyzer.analyze(ptr_from=self.ptr,
                                    ptr_to=self.ptr,
                                    data_service=dataservice)
        self.assertGreater(len(comments), 0)
        bad_names = [
            "nam", "print_tipe", "gett_lenght", "customidentifiertostore"
        ]
        good_names = ["name", "print_type", "get_length"]
        for c in comments:
            self.assertFalse(
                any(name in c.text.split(", fixes:")[0]
                    for name in good_names))
            self.assertTrue(
                any(name in c.text.split(", fixes:")[0] for name in bad_names))
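
The two runs above differ only in check_all_identifiers: with False, the
identifier customidentifiertostore, already known from the trained model, is
not flagged; with True, every identifier is checked and it shows up among the
bad names as well.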
Example #8
    @classmethod
    def setUpClass(cls):
        cls.checker = IdTyposAnalyzer(
            DummyAnalyzerModel(), "", config=dict(
                model=str(Path(__file__).parent / "sample_corrector.asdf"),
                confidence_threshold=0.2, n_candidates=3))
        cls.identifiers = ["get", "gpt_tokeb"]
        cls.test_df = pandas.DataFrame(
            [[0, "get", "get"], [1, "gpt tokeb", "gpt"], [1, "gpt tokeb", "tokeb"]],
            columns=[IdTyposAnalyzer.default_config["index_column"], Columns.Split,
                     Columns.Token])
        cls.suggestions = {1: [("get", 0.9),
                               ("gpt", 0.3)],
                           2: [("token", 0.98),
                               ("taken", 0.3),
                               ("tokem", 0.01)]}
        cls.filtered_suggestions = {1: [("get", 0.9)],
                                    2: [("token", 0.98),
                                        ("taken", 0.3)]}
Example #9
# Assumed imports for this snippet; `log` is a project-local logging helper
# and IdTyposAnalyzer comes from the analyzer package (joblib as the source
# of Parallel/delayed is an assumption).
import os
from collections import defaultdict
from glob import glob

import pandas as pd
import spacy
import textdistance
import yaml
from joblib import Parallel, delayed


def pipeline(yaml_dir, n_jobs=10):
    distance = textdistance.DamerauLevenshtein()

    yaml_files = glob(os.path.join(yaml_dir, "*"))
    log("Number of YAML files", len(yaml_files))

    HERC_COLUMNS = ["repository", "hash"]
    TYPOS_COLUMNS = ["wrong", "correct", "commit", "file", "line"]

    def yaml_to_dict(yaml_loc):
        if not yaml_loc.endswith("yaml"):
            # commits.txt
            return []
        rows = []
        with open(yaml_loc, "r") as f:
            a = yaml.load(f.read(), Loader=yaml.FullLoader)

        base = {col: a["hercules"][col] for col in HERC_COLUMNS}
        for typo in a["TyposDataset"]:
            res = base.copy()
            for col in TYPOS_COLUMNS:
                res[col] = typo[col]
            rows.append(res)
        return rows

    results = Parallel(n_jobs=n_jobs)(delayed(yaml_to_dict)(loc)
                                      for loc in yaml_files)
    pandas_dict = defaultdict(list)
    for rows in results:
        for row in rows:
            for c in (HERC_COLUMNS + TYPOS_COLUMNS):
                pandas_dict[c].append(row[c])
    df = pd.DataFrame.from_dict(pandas_dict)
    initial_n_samples = df.shape[0]
    log("Number of samples in initial dataset", initial_n_samples)
    # deduplication; .copy() so the column assignments below do not trigger
    # pandas's SettingWithCopyWarning
    deduplicated_df = df.drop_duplicates(subset=["wrong", "correct"],
                                         keep="first").copy()
    log("Number of samples after deduplication", deduplicated_df.shape[0],
        ", before", initial_n_samples)

    # check that the number of subtokens stays the same
    splitter = IdTyposAnalyzer.create_token_parser()

    def check_2(line):
        wrong = line.wrong
        correct = line.correct
        wrong_tokens = list(splitter.split(wrong))
        corr_tokens = list(splitter.split(correct))
        if len(wrong_tokens) != len(corr_tokens):
            return "Number of subtokens is different"
        if not len(wrong_tokens):
            return "Identifier without alphabetic characters"
        return ""

    deduplicated_df["check2"] = deduplicated_df.apply(check_2, axis=1)

    log("Number of good samples after check2",
        deduplicated_df[deduplicated_df["check2"] == ""].shape[0], ", before",
        initial_n_samples)

    # Damerau-Levenshtein distance
    def check_3(line):
        wrong = line.wrong
        correct = line.correct
        wrong_tokens = list(splitter.split(wrong))
        corr_tokens = list(splitter.split(correct))
        res = []
        for t, ct in zip(wrong_tokens, corr_tokens):
            if distance(t, ct) > 2:
                res.append((t, ct))
        if res:
            return "big Demerau-Levenshtein distance %s" % res
        return ""

    deduplicated_df["check3"] = deduplicated_df.apply(check_3, axis=1)
    suspicious_tokens = deduplicated_df[deduplicated_df["check3"] != ""]
    log("Number of samples with big Demerau-Levenshtein distance",
        suspicious_tokens.shape[0])

    # examples where the token splits of the wrong and the correct identifiers
    # are equal (they differ only in non-alpha characters or casing)
    deduplicated_df["wrong_split"] = deduplicated_df["wrong"].apply(
        lambda x: " ".join(splitter.split(x)))
    deduplicated_df["correct_split"] = deduplicated_df["correct"].apply(
        lambda x: " ".join(splitter.split(x)))
    deduplicated_df["check4"] = ""
    deduplicated_df["check4"][deduplicated_df["wrong_split"] ==
                              deduplicated_df["correct_split"]] = "Bad split"
    log("Number of samples where tokens are the same",
        deduplicated_df[deduplicated_df["check4"] == "Bad split"].shape[0])
    # examples where the wrong and correct identifiers are equal at the lemma level
    nlp = spacy.load("en", disable=["parser", "ner"])

    # Filter examples with equal lemmas
    def _lemmatize(token):
        lemm = nlp(token)
        # keep the original token when spacy splits it, lemmatizes it to the
        # pronoun placeholder, or wrongly singularizes an "-ss" ending
        if len(lemm) > 1 or lemm[0].lemma_ == "-PRON-" or (
                token[-2:] == "ss" and lemm[0].lemma_ == token[:-1]):
            return token
        return lemm[0].lemma_

    deduplicated_df["wrong_lem"] = deduplicated_df["wrong_split"].apply(
        lambda x: " ".join(_lemmatize(token) for token in x.split()))
    deduplicated_df["correct_lem"] = deduplicated_df["correct_split"].apply(
        lambda x: " ".join(_lemmatize(token) for token in x.split()))

    deduplicated_df["check5"] = ""
    deduplicated_df["check5"][(deduplicated_df["wrong_lem"] == deduplicated_df["correct_lem"])] = \
        "Equal lemmas"
    log("Number of good samples after check5",
        deduplicated_df[deduplicated_df["check5"] == ""].shape[0], ", before",
        initial_n_samples)

    deduplicated_df["check6"] = ""
    deduplicated_df["check6"][(deduplicated_df["wrong"].str.lower() ==
                               deduplicated_df["correct"].str.lower())] = \
        "Difference in case"

    good_df = deduplicated_df[(deduplicated_df["check2"] == "")
                              & (deduplicated_df["check3"] == "") &
                              (deduplicated_df["check4"] == "") &
                              (deduplicated_df["check5"] == "") &
                              (deduplicated_df["check6"] == "")]
    good_df["repository"] = good_df["repository"].str.replace("@", "/")
    log("Number of good samples", good_df.shape[0])
    for i, row in good_df[["repository"] + TYPOS_COLUMNS].iterrows():
        print(",".join(map(str, row.values)))
Example #10
    def test_train(self):
        datastub = FakeDataStub(files=self.base_files, changes=None)
        model = IdTyposAnalyzer.train(self.ptr, {}, datastub)
        self.assertIsInstance(model, DummyAnalyzerModel)