Exemplo n.º 1
0
    def test_custom_ranker(self):
        """Fit and apply the ranker on a small handcrafted dataset and check the ranked ids."""
        data_rows = [
            [["get", "tokens", "num"], "get", "get"],
            [["gwt", "tokens"], "gwt", "get"],
            [["get", "tokem"], "tokem", "token"],
        ]
        custom_data = pandas.DataFrame(
            data_rows, columns=[SPLIT_COLUMN, TYPO_COLUMN, CORRECT_TOKEN_COLUMN])

        # One row per (id, typo, candidate) triple.
        candidate_rows = [
            [0, "get", "get"],
            [1, "gwt", "get"],
            [1, "gwt", "gpt"],
            [2, "tokem", "tokem"],
            [2, "tokem", "taken"],
            [2, "tokem", "token"],
        ]
        custom_candidates_tokens = pandas.DataFrame(
            candidate_rows, columns=[ID_COLUMN, TYPO_COLUMN, CANDIDATE_COLUMN])

        # Six feature vectors - presumably aligned row-by-row with the candidates above.
        feature_rows = [
            [10.0, 1.0, 3.5],
            [9.6, 1.3, 2.3],
            [0.23, -1.3, 156.3],
            [5.6, 0.4, 32.65],
            [-0.03, 0.2, 678.4],
            [8.9, 0.8, 5.2],
        ]
        custom_candidates_features = numpy.array(feature_rows)

        ranker = CandidatesRanker()
        correct_tokens = custom_data[CORRECT_TOKEN_COLUMN]
        ranker.fit(correct_tokens, custom_candidates_tokens, custom_candidates_features,
                   val_part=0.5)
        suggestions = ranker.rank(
            custom_candidates_tokens, custom_candidates_features,
            n_candidates=1, return_all=True)

        # With return_all=True every row of the input data must receive suggestions.
        ranked_ids = set(suggestions.keys())
        expected_ids = set(custom_data.index)
        self.assertSetEqual(ranked_ids, expected_ids)
Exemplo n.º 2
0
    def test_custom_ranker(self):
        """Fit and apply the ranker on a small handcrafted dataset and check the ranked ids."""
        data_rows = [
            [["get", "tokens", "num"], "get", "get"],
            [["gwt", "tokens"], "gwt", "get"],
            [["get", "tokem"], "tokem", "token"],
        ]
        data_columns = [Columns.Split, Columns.Token, Columns.CorrectToken]
        custom_data = pandas.DataFrame(data_rows, columns=data_columns)

        # One row per (id, typo, candidate) triple.
        candidate_rows = [
            [0, "get", "get"],
            [1, "gwt", "get"],
            [1, "gwt", "gpt"],
            [2, "tokem", "tokem"],
            [2, "tokem", "taken"],
            [2, "tokem", "token"],
        ]
        candidate_columns = [Columns.Id, Columns.Token, Columns.Candidate]
        custom_candidates_tokens = pandas.DataFrame(candidate_rows, columns=candidate_columns)

        # Six feature vectors - presumably aligned row-by-row with the candidates above.
        feature_rows = [
            [10.0, 1.0, 3.5],
            [9.6, 1.3, 2.3],
            [0.23, -1.3, 156.3],
            [5.6, 0.4, 32.65],
            [-0.03, 0.2, 678.4],
            [8.9, 0.8, 5.2],
        ]
        custom_candidates_features = numpy.array(feature_rows)

        ranker = CandidatesRanker()
        correct_tokens = custom_data[Columns.CorrectToken]
        ranker.fit(correct_tokens, custom_candidates_tokens, custom_candidates_features,
                   val_part=0.5)
        suggestions = ranker.rank(
            custom_candidates_tokens, custom_candidates_features,
            n_candidates=1, return_all=True)

        # With return_all=True every row of the input data must receive suggestions.
        ranked_ids = set(suggestions.keys())
        expected_ids = set(custom_data.index)
        self.assertSetEqual(ranked_ids, expected_ids)
Exemplo n.º 3
0
    def __init__(self, **kwargs):
        """
        Create a corrector holding a fresh candidates generator and a fresh ranker.

        :param kwargs: Keyword arguments forwarded unchanged to the parent initializer.
        """
        super().__init__(**kwargs)
        # Both components start in their default, unconfigured state.
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker()
Exemplo n.º 4
0
    def __init__(self, ranking_config: Optional[Mapping[str, Any]] = None, **kwargs):
        """
        Create a corrector holding a fresh candidates generator and a configured ranker.

        :param ranking_config: Configuration handed to the ranker. Supported options:
                                train_rounds: Number of training rounds (int).
                                early_stopping: Early stopping parameter (int).
                                boost_param: Boosting parameters (dict).
        :param kwargs: Keyword arguments forwarded unchanged to the parent initializer.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        # `ranking_config` may be None; it is passed to the ranker as is.
        self.ranker = CandidatesRanker(ranking_config)
Exemplo n.º 5
0
 def test_ranker(self):
     """Fit the ranker on the stored test data and check that every row gets suggestions."""
     data_path = join(TEST_DATA_PATH, "test_data.csv.xz")
     data = pandas.read_csv(data_path, index_col=0).infer_objects()

     candidates_path = join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz")
     candidates = pandas.read_csv(candidates_path)
     # Feature vectors are stored as text: drop the first and last character (presumably the
     # surrounding brackets) and parse the whitespace-separated numbers.
     parsed_features = candidates[Columns.Features].apply(
         lambda raw: [float(value) for value in raw[1:-1].split()])
     candidates.loc[:, Columns.Features] = parsed_features

     ranker = CandidatesRanker()
     train_metadata = get_candidates_metadata(candidates)
     train_features = get_candidates_features(candidates)
     ranker.fit(data[Columns.CorrectToken], train_metadata, train_features)

     rank_metadata = get_candidates_metadata(candidates)
     rank_features = get_candidates_features(candidates)
     suggestions = ranker.rank(rank_metadata, rank_features, n_candidates=3, return_all=True)

     ranked_ids = set(suggestions.keys())
     self.assertSetEqual(ranked_ids, set(data.index))
Exemplo n.º 6
0
 def test_ranker(self):
     """Fit the ranker on the stored test data and check that every row gets suggestions."""
     data_path = join(TEST_DATA_PATH, "test_data.csv.xz")
     data = pandas.read_csv(data_path, index_col=0).infer_objects()

     candidates_path = join(TEST_DATA_PATH, "test_data_candidates_full.pkl")
     candidates = pandas.read_pickle(candidates_path)

     ranker = CandidatesRanker()
     train_metadata = get_candidates_metadata(candidates)
     train_features = get_candidates_features(candidates)
     ranker.fit(data[CORRECT_TOKEN_COLUMN], train_metadata, train_features)

     rank_metadata = get_candidates_metadata(candidates)
     rank_features = get_candidates_features(candidates)
     suggestions = ranker.rank(rank_metadata, rank_features, n_candidates=3, return_all=True)

     ranked_ids = set(suggestions.keys())
     self.assertSetEqual(ranked_ids, set(data.index))
Exemplo n.º 7
0
 def test_eq(self):
     """Two fresh rankers compare equal; a fitted ranker differs from a fresh one."""
     fresh_pair_equal = CandidatesRanker() == CandidatesRanker()
     self.assertTrue(fresh_pair_equal)

     data_path = join(TEST_DATA_PATH, "test_data.csv.xz")
     data = pandas.read_csv(data_path, index_col=0).infer_objects()
     candidates_path = join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz")
     candidates = pandas.read_csv(candidates_path)
     # Feature vectors are stored as text: drop the first and last character (presumably the
     # surrounding brackets) and parse the whitespace-separated numbers.
     parsed_features = candidates[Columns.Features].apply(
         lambda raw: [float(value) for value in raw[1:-1].split()])
     candidates.loc[:, Columns.Features] = parsed_features

     ranker = CandidatesRanker()
     metadata = get_candidates_metadata(candidates)
     features = get_candidates_features(candidates)
     ranker.fit(data[Columns.CorrectToken], metadata, features)

     fitted_equals_fresh = ranker == CandidatesRanker()
     self.assertFalse(fitted_equals_fresh)
Exemplo n.º 8
0
 def test_eq(self):
     """Two fresh rankers compare equal; a fitted ranker differs from a fresh one."""
     fresh_pair_equal = CandidatesRanker() == CandidatesRanker()
     self.assertTrue(fresh_pair_equal)

     data_path = join(TEST_DATA_PATH, "test_data.csv.xz")
     data = pandas.read_csv(data_path, index_col=0).infer_objects()
     candidates_path = join(TEST_DATA_PATH, "test_data_candidates_full.pkl")
     candidates = pandas.read_pickle(candidates_path)

     ranker = CandidatesRanker()
     metadata = get_candidates_metadata(candidates)
     features = get_candidates_features(candidates)
     ranker.fit(data[CORRECT_TOKEN_COLUMN], metadata, features)

     fitted_equals_fresh = ranker == CandidatesRanker()
     self.assertFalse(fitted_equals_fresh)
Exemplo n.º 9
0
 def test_save_load(self):
     """A fitted ranker saved to an in-memory buffer must load back equal to the original."""
     data_path = join(TEST_DATA_PATH, "test_data.csv.xz")
     data = pandas.read_csv(data_path, index_col=0).infer_objects()
     candidates_path = join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz")
     candidates = pandas.read_csv(candidates_path)
     # Feature vectors are stored as text: drop the first and last character (presumably the
     # surrounding brackets) and parse the whitespace-separated numbers.
     parsed_features = candidates[Columns.Features].apply(
         lambda raw: [float(value) for value in raw[1:-1].split()])
     candidates.loc[:, Columns.Features] = parsed_features

     ranker = CandidatesRanker()
     metadata = get_candidates_metadata(candidates)
     features = get_candidates_features(candidates)
     ranker.fit(data[Columns.CorrectToken], metadata, features)

     with io.BytesIO() as buffer:
         ranker.save(output=buffer, series="typos-analyzer")
         # Report the number of bytes written, then rewind so that load() reads from the start.
         written = buffer.tell()
         print(written)
         buffer.seek(0)
         ranker2 = CandidatesRanker().load(buffer)
     print(ranker)
     round_trip_equal = ranker == ranker2
     self.assertTrue(round_trip_equal)
Exemplo n.º 10
0
 def test_save_load(self):
     """A fitted ranker saved to an in-memory buffer must load back equal to the original."""
     data_path = join(TEST_DATA_PATH, "test_data.csv.xz")
     data = pandas.read_csv(data_path, index_col=0).infer_objects()
     candidates_path = join(TEST_DATA_PATH, "test_data_candidates_full.pkl")
     candidates = pandas.read_pickle(candidates_path)

     ranker = CandidatesRanker()
     metadata = get_candidates_metadata(candidates)
     features = get_candidates_features(candidates)
     ranker.fit(data[CORRECT_TOKEN_COLUMN], metadata, features)

     with io.BytesIO() as buffer:
         ranker.save(buffer)
         # Report the number of bytes written, then rewind so that load() reads from the start.
         written = buffer.tell()
         print(written)
         buffer.seek(0)
         ranker2 = CandidatesRanker().load(buffer)
     print(ranker)
     round_trip_equal = ranker == ranker2
     self.assertTrue(round_trip_equal)
Exemplo n.º 11
0
class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.

    The work is split between two components: `generator` produces the correction candidates
    and `ranker` ranks them.
    """

    _log = logging.getLogger("TyposCorrector")

    NAME = "typos_correction"
    VENDOR = "source{d}"
    DESCRIPTION = "Model that suggests fixes to correct typos."
    LICENSE = DEFAULT_LICENSE

    def __init__(self, ranking_config: Optional[Mapping[str, Any]] = None, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param ranking_config: Ranking configuration, options:
                                train_rounds: Number of training rounds (int).
                                early_stopping: Early stopping parameter (int).
                                boost_param: Boosting parameters (dict).
        :param kwargs: Extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker(ranking_config)

    @property
    def processes_number(self) -> int:
        """Return the number of processes for multiprocessing used to train and to predict."""
        return self.ranker.config["boost_param"]["nthread"]

    @processes_number.setter
    def processes_number(self, processes_number: int):
        """Set the number of processes for multiprocessing used to train and to predict."""
        self.ranker.config["boost_param"]["nthread"] = processes_number

    def initialize_generator(self, vocabulary_file: str, frequencies_file: str,
                             embeddings_file: str, config: Optional[Mapping[str, Any]] = None,
                             ) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens among tokens at \
                                         equal edit distance from the typo to consider as \
                                         candidates (int).
                       max_distance: Maximum edit distance for symspell lookup for candidates \
                                    (int).
                       radius: Maximum edit distance from typo allowed for candidates (int).
                       max_corrected_length: Maximum length of prefix in which symspell lookup \
                                             for typos is conducted (int).
                       start_pool_size: Length of data, starting from which multiprocessing is \
                                        desired (int).
                       chunksize: Max size of a chunk for one process during multiprocessing (int).
        """
        self.generator.construct(vocabulary_file, frequencies_file, embeddings_file, config)
        self._log.debug("%s is initialized", repr(self.generator))

    def set_ranking_config(self, config: Mapping[str, Any]) -> None:
        """
        Update the ranking config - see XGBoost docs for details.

        :param config: Ranking configuration, options:
                       train_rounds: Number of training rounds (int).
                       early_stopping: Early stopping parameter (int).
                       boost_param: Boosting parameters (dict).
        """
        self.ranker.set_config(config)
        self._log.debug("%s is initialized", repr(self.ranker))

    def set_generation_config(self, config: Mapping[str, Any]) -> None:
        """
        Update the candidates generation config.

        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens among tokens at \
                                         equal edit distance from the typo to consider as \
                                         candidates (int).
                       max_distance: Maximum edit distance for symspell lookup for candidates \
                                    (int).
                       radius: Maximum edit distance from typo allowed for candidates (int).
                       max_corrected_length: Maximum length of prefix in which symspell lookup \
                                             for typos is conducted (int).
                       start_pool_size: Length of data, starting from which multiprocessing is \
                                        desired (int).
                       chunksize: Max size of a chunk for one process during multiprocessing (int).
        """
        self.generator.set_config(config)
        # Report the component which was actually reconfigured: the generator, not the ranker.
        self._log.debug("%s is initialized", repr(self.generator))

    def expand_vocabulary(self, additional_tokens: Iterable[str]) -> None:
        """
        Add given tokens to the model's vocabulary.

        :param additional_tokens: Tokens to add to the vocabulary.
        """
        self.generator.expand_vocabulary(additional_tokens)

    def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
              save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self._log.info("train input shape: %s", data.shape)
        if candidates is None:
            self._log.info("candidates were not provided and will be generated")
            candidates_df = self.generator.generate_candidates(
                data, self.processes_number, save_candidates_file)
        else:
            # Keep the path and the loaded table in separate names so that the log message
            # below reports the file path and not the contents of the DataFrame.
            candidates_df = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
            self._log.info("loaded candidates from %s", candidates)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates_df),
                        get_candidates_features(candidates_df))

    def train_on_file(self, data_file: str, candidates:  Optional[str] = None,
                      save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given file.

        :param data_file: A .csv dump of a dataframe which contains columns Columns.Token, \
                          Columns.CorrectToken and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self.train(pandas.read_csv(data_file, index_col=0, keep_default_na=False), candidates,
                   save_candidates_file)

    def suggest(self, data: pandas.DataFrame, candidates:  Optional[str] = None,
                save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates_df = self.generator.generate_candidates(
                data, self.processes_number, save_candidates_file)
        else:
            candidates_df = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
        return self.ranker.rank(get_candidates_metadata(candidates_df),
                                get_candidates_features(candidates_df), n_candidates, return_all)

    def suggest_on_file(self, data_file: str, candidates:  Optional[str] = None,
                        save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                        return_all: bool = True) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given file.

        :param data_file: A .csv dump of a DataFrame which contains columns Columns.Token \
                          and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        return self.suggest(pandas.read_csv(data_file, index_col=0, keep_default_na=False),
                            candidates, save_candidates_file, n_candidates, return_all)

    def suggest_by_batches(self, data: pandas.DataFrame, n_candidates: int = 3,
                           return_all: bool = True, batch_size: int = 2048,
                           ) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given dataset by batches. \
        Does not support precalculated candidates.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :param batch_size: Batch size.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        all_suggestions = []
        for i in tqdm(range(0, len(data), batch_size)):
            suggestions = self.suggest(data.iloc[i:i + batch_size, :], n_candidates=n_candidates,
                                       return_all=return_all)
            all_suggestions.append(suggestions.items())
        # Merge the per-batch results into one dictionary.
        return dict(chain.from_iterable(all_suggestions))

    def evaluate(self, test_data: pandas.DataFrame) -> Tuple[Dict[int, List[Candidate]], str]:
        """
        Evaluate the corrector on the given test dataset.

        Save the result metrics to the model metadata and log the quality report.

        :param test_data: DataFrame which contains column Columns.Token, \
                          column Columns.Split is optional, but used when present.
        :return: Suggestions for correction of tokens inside the `test_data` and the quality
                 report.
        """
        self._log.info("evaluate on test data with shape %s", test_data.shape)
        suggestions = self.suggest(test_data)
        report = generate_report(test_data, suggestions)
        self.metrics = get_scores(test_data, suggestions)
        self._log.info("evaluation report:\n%s", report)
        return suggestions, report

    def __eq__(self, other: object) -> bool:
        """
        Compare two correctors by their generators and rankers.

        Objects which are not TyposCorrector instances are not comparable: NotImplemented is
        returned so that Python falls back to its default handling instead of failing with
        AttributeError on the missing `generator` / `ranker` attributes.
        """
        if not isinstance(other, TyposCorrector):
            return NotImplemented
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" %
                (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        """Build the serialization tree from the trees of the generator and the ranker."""
        return {"generator": self.generator._generate_tree(),
                "ranker": self.ranker._generate_tree()}

    def _load_tree(self, tree: dict) -> None:
        """Restore the generator and the ranker from the "generator" and "ranker" subtrees."""
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])
Exemplo n.º 12
0
class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.

    The work is split between two components: `generator` produces the correction candidates
    and `ranker` ranks them.
    """

    NAME = "typos_correction"
    VENDOR = "source{d}"
    DESCRIPTION = "Model that suggests fixes to correct typos."
    LICENSE = DEFAULT_LICENSE
    DEFAULT_RADIUS = 3
    DEFAULT_MAX_DISTANCE = 2
    DEFAULT_NEIGHBORS_NUMBER = 0
    DEFAULT_EDIT_CANDIDATES = 20
    DEFAULT_TRAIN_ROUNDS = 4000
    DEFAULT_EARLY_STOPPING = 200
    DEFAULT_BOOST_PARAM = {"max_depth": 6,
                           "eta": 0.03,
                           "min_child_weight": 2,
                           "silent": 1,
                           "objective": "binary:logistic",
                           "subsample": 0.5,
                           "colsample_bytree": 0.5,
                           "alpha": 1,
                           "eval_metric": ["error"],
                           "nthread": 0}

    def __init__(self, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param kwargs: extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker()

    @property
    def threads_number(self) -> int:
        """Return the number of threads for multiprocessing used to train and to predict."""
        return self.ranker.boost_param["nthread"]

    @threads_number.setter
    def threads_number(self, threads_number: int):
        """Set the number of threads for multiprocessing used to train and to predict."""
        self.ranker.boost_param["nthread"] = threads_number

    def initialize_ranker(self, boost_params: Optional[dict] = None,
                          train_rounds: int = DEFAULT_TRAIN_ROUNDS,
                          early_stopping: int = DEFAULT_EARLY_STOPPING) -> None:
        """
        Apply the ranking parameters - see XGBoost docs for details.

        :param train_rounds: Number of training rounds.
        :param early_stopping: Early stopping parameter.
        :param boost_params: Boosting parameters. The defaults are DEFAULT_BOOST_PARAM.
        :return: Nothing
        """
        if not boost_params:
            # Hand a copy of the defaults to the ranker: the `threads_number` setter modifies
            # the ranker's boosting parameters in place, and that must never leak into the
            # class-level DEFAULT_BOOST_PARAM shared by all instances.
            boost_params = dict(self.DEFAULT_BOOST_PARAM)
        self.ranker.construct(boost_params, train_rounds, early_stopping)

    def initialize_generator(self, vocabulary_file: str, frequencies_file: str,
                             embeddings_file: Optional[str] = None,
                             neighbors_number: int = DEFAULT_NEIGHBORS_NUMBER,
                             edit_candidates: int = DEFAULT_EDIT_CANDIDATES,
                             max_distance: int = DEFAULT_MAX_DISTANCE,
                             radius: int = DEFAULT_RADIUS) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param neighbors_number: Number of neighbors of context and typo embeddings \
                                 to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among tokens on \
                                equal edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for symspell lookup.
        :param radius: Maximum edit distance from typo allowed for candidates.
        """
        self.generator.construct(vocabulary_file, frequencies_file, embeddings_file,
                                 neighbors_number, edit_candidates, max_distance, radius)

    def train(self, data: pandas.DataFrame, candidates:  Optional[str] = None,
              save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.threads_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self, data_file: str, candidates:  Optional[str] = None,
                      save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given file.

        :param data_file: A .csv dump of a dataframe which contains columns Columns.Token, \
                          Columns.CorrectToken and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self.train(pandas.read_csv(data_file, index_col=0), candidates, save_candidates_file)

    def suggest(self, data: pandas.DataFrame, candidates:  Optional[str] = None,
                save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.threads_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates), n_candidates, return_all)

    def suggest_on_file(self, data_file: str, candidates:  Optional[str] = None,
                        save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                        return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given file.

        :param data_file: A .csv dump of a DataFrame which contains columns Columns.Token \
                          and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        return self.suggest(pandas.read_csv(data_file, index_col=0), candidates,
                            save_candidates_file, n_candidates, return_all)

    def suggest_by_batches(self, data: pandas.DataFrame, n_candidates: int = 3,
                           return_all: bool = True, batch_size: int = 2048,
                           ) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given dataset by batches. \
        Does not support precalculated candidates.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :param batch_size: Batch size.

        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        all_suggestions = []
        for i in tqdm(range(0, len(data), batch_size)):
            suggestions = self.suggest(data.iloc[i:i + batch_size, :], n_candidates=n_candidates,
                                       return_all=return_all)
            all_suggestions.append(suggestions.items())
        # Merge the per-batch results into one dictionary.
        return dict(chain.from_iterable(all_suggestions))

    def evaluate(self, test_data: pandas.DataFrame) -> None:
        """
        Evaluate the corrector on the given test dataset.

        Save the result metrics to the model metadata and print it to the standard output.

        :param test_data: DataFrame which contains column Columns.Token, \
                          column Columns.Split is optional, but used when present
        """
        suggestions = self.suggest(test_data)
        self.metrics = get_scores(test_data, suggestions)
        print(generate_report(test_data, suggestions))

    def __eq__(self, other: object) -> bool:
        """
        Compare two correctors by their generators and rankers.

        Objects which are not TyposCorrector instances are not comparable: NotImplemented is
        returned so that Python falls back to its default handling instead of failing with
        AttributeError on the missing `generator` / `ranker` attributes.
        """
        if not isinstance(other, TyposCorrector):
            return NotImplemented
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" %
                (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        """Build the serialization tree from the trees of the generator and the ranker."""
        return {"generator": self.generator._generate_tree(),
                "ranker": self.ranker._generate_tree()}

    def _load_tree(self, tree: dict) -> None:
        """Restore the generator and the ranker from the "generator" and "ranker" subtrees."""
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])
Exemplo n.º 13
0
class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.

    The work is split between two members: `generator` (CandidatesGenerator) produces the
    correction candidates and `ranker` (CandidatesRanker) ranks them.
    """

    NAME = "typos_correction"
    VENDOR = "source{d}"

    # Defaults of `initialize_generator()`.
    DEFAULT_RADIUS = 3
    DEFAULT_MAX_DISTANCE = 2
    DEFAULT_NEIGHBORS_NUMBER = 0
    DEFAULT_EDIT_CANDIDATES = 20
    # Defaults of `initialize_ranker()`.
    DEFAULT_TRAIN_ROUNDS = 4000
    DEFAULT_EARLY_STOPPING = 200
    DEFAULT_BOOST_PARAM = {
        "max_depth": 6,
        "eta": 0.03,
        "min_child_weight": 2,
        "silent": 1,
        "objective": "binary:logistic",
        "subsample": 0.5,
        "colsample_bytree": 0.5,
        "alpha": 1,
        "eval_metric": ["error"],
        "nthread": 0
    }

    def __init__(self, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param kwargs: extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker()

    @property
    def threads_number(self):
        """Return the number of threads used to train and to predict."""
        return self.ranker.boost_param["nthread"]

    def initialize_ranker(self,
                          train_rounds: int = DEFAULT_TRAIN_ROUNDS,
                          early_stopping: int = DEFAULT_EARLY_STOPPING,
                          boost_params: dict = None) -> None:
        """
        Apply the ranking parameters - see XGBoost docs for details.

        :param train_rounds: Number of training rounds.
        :param early_stopping: Early stopping parameter.
        :param boost_params: Boosting parameters. The defaults are DEFAULT_BOOST_PARAM; they \
                             are also used when an empty dictionary is passed.
        :return: Nothing
        """
        boost_params = boost_params or self.DEFAULT_BOOST_PARAM
        self.ranker.construct(train_rounds, early_stopping, boost_params)

    def initialize_generator(self,
                             vocabulary_file: str,
                             frequencies_file: str,
                             embeddings_file: str = None,
                             neighbors_number: int = DEFAULT_NEIGHBORS_NUMBER,
                             edit_candidates: int = DEFAULT_EDIT_CANDIDATES,
                             max_distance: int = DEFAULT_MAX_DISTANCE,
                             radius: int = DEFAULT_RADIUS) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param neighbors_number: Number of neighbors of context and typo embeddings \
                                 to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among tokens on \
                                equal edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for symspell lookup.
        :param radius: Maximum edit distance from typo allowed for candidates.
        :return: Nothing
        """
        self.generator.construct(vocabulary_file, frequencies_file,
                                 embeddings_file, neighbors_number,
                                 edit_candidates, max_distance, radius)

    def train(self,
              typos: pandas.DataFrame,
              candidates: pandas.DataFrame = None,
              save_candidates_file: str = None) -> None:
        """
        Train corrector on the given dataset of typos inside identifiers.

        :param typos: DataFrame containing columns "typo" and "identifier", \
                      column "token_split" is optional, but used when present. \
                      The CORRECT_TOKEN_COLUMN column is passed to the ranker as the target.
        :param candidates: DataFrame with precalculated candidates. When None, the candidates \
                           are generated from `typos`.
        :param save_candidates_file: Path to file where to save the candidates. It is only \
                                     forwarded to the generator, so it has no effect when \
                                     `candidates` is given.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        self.ranker.fit(typos[CORRECT_TOKEN_COLUMN],
                        get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self,
                      typos_file: str,
                      candidates_file: str = None,
                      save_candidates_file: str = None) -> None:
        """
        Train corrector on the given dataset of typos inside identifiers.

        :param typos_file: CSV file with columns "typo" and "identifier", \
                           column "token_split" is optional, but used when present. \
                           The first CSV column is read as the index.
        :param candidates_file: Pickle dump of pandas.DataFrame with precalculated \
                                candidates and features
        :param save_candidates_file: Path to file where to save the candidates.
        """
        typos = pandas.read_csv(typos_file, index_col=0)
        candidates = None
        if candidates_file is not None:
            # NOTE: unpickling can execute arbitrary code - load only trusted files here.
            candidates = pandas.read_pickle(candidates_file)
        self.train(typos, candidates, save_candidates_file)

    def suggest(self,
                typos: pandas.DataFrame,
                candidates: pandas.DataFrame = None,
                save_candidates_file: str = None,
                n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for given typos.

        :param typos: DataFrame containing column "typo", \
                      column "token_split" is optional, but used when present
        :param candidates: DataFrame with precalculated candidates. When None, the candidates \
                           are generated from `typos`.
        :param save_candidates_file: Path to file to save candidates to. It is only forwarded \
                                     to the generator, so it has no effect when `candidates` \
                                     is given.
        :param n_candidates: Number of most probable candidates to return
        :param return_all: False to return suggestions only for corrected tokens
        :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates),
                                n_candidates, return_all)

    def suggest_file(
            self,
            typos_file: str,
            candidates_file: str = None,
            save_candidates_file: str = None,
            n_candidates: int = 3,
            return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for given typos.

        :param typos_file: csv file containing DataFrame with column "typo", \
                           column "token_split" is optional, but used when present. \
                           The first CSV column is read as the index.
        :param candidates_file: pickle file containing DataFrame with precalculated \
                                candidates and features
        :param save_candidates_file: Path to file to save candidates to
        :param n_candidates: Number of most probable candidates to return
        :param return_all: False to return suggestions only for corrected tokens
        :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
                 by correctness probability in a descending order.
        """
        typos = pandas.read_csv(typos_file, index_col=0)
        candidates = None
        if candidates_file is not None:
            # NOTE: unpickling can execute arbitrary code - load only trusted files here.
            candidates = pandas.read_pickle(candidates_file)
        return self.suggest(typos, candidates, save_candidates_file,
                            n_candidates, return_all)

    def suggest_by_batches(
        self,
        typos: pandas.DataFrame,
        n_candidates: int = None,
        return_all: bool = True,
        batch_size: int = 2048,
    ) -> Dict[int, List[Tuple[str, float]]]:
        """
        Correct typos from dataset by batches. Does not support precalculated candidates.

        Every batch of at most `batch_size` consecutive rows is passed to `suggest()` and the
        per-batch dictionaries are merged into one.

        :param typos: DataFrame containing column "typo", \
                      column "token_split" is optional, but used when present
        :param n_candidates: Number of most probable candidates to return. \
                             NOTE(review): the default here is None while `suggest()` \
                             defaults to 3; None is forwarded to the ranker as is - confirm \
                             that the ranker accepts it.
        :param return_all: False to return suggestions only for corrected tokens
        :param batch_size: Batch size
        :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
                 by correctness probability in a descending order.
        """
        all_suggestions = []
        for start in tqdm(range(0, len(typos), batch_size)):
            # Slice by position: label-based slicing raises or yields overlapping batches
            # when the index contains duplicate labels.
            batch = typos.iloc[start:start + batch_size, :]
            suggestions = self.suggest(batch,
                                       n_candidates=n_candidates,
                                       return_all=return_all)
            all_suggestions.append(suggestions.items())

        return dict(chain.from_iterable(all_suggestions))

    def __eq__(self, other: "TyposCorrector") -> bool:
        """
        Compare two correctors by their generators and rankers.

        :param other: Object to compare with.
        :return: Result of comparing both the generators and the rankers. NotImplemented is \
                 returned when `other` is not a TyposCorrector, so that Python falls back to \
                 its default comparison instead of raising AttributeError.
        """
        if not isinstance(other, TyposCorrector):
            return NotImplemented
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" % (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        """Return the generator and the ranker trees under "generator" and "ranker" keys."""
        return {
            "generator": self.generator._generate_tree(),
            "ranker": self.ranker._generate_tree()
        }

    def _load_tree(self, tree: dict) -> None:
        """Restore the generator and the ranker from the "generator" and "ranker" sub-trees."""
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])