Python CandidatesRanker.rank примеры использования

Язык программирования: Python

Пространство имен/Пакет: lookout.style.typos.ranking

Класс/Тип: CandidatesRanker

Метод/Функция: rank

Примеров на hotexamples.com: 7

Python CandidatesRanker.rank — найдено 7 примеров. Это лучшие примеры Python-кода для lookout.style.typos.ranking.CandidatesRanker.rank, полученные из проектов с открытым исходным кодом. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

fit(11)

CandidatesRanker(10)

rank(7)

_generate_tree(3)

_load_tree(3)

dump(3)

construct(2)

save(2)

set_config(1)

Пример #1

Показать файл

    def test_custom_ranker(self):
        """Fit the ranker on a tiny handcrafted dataset and rank the same candidates.

        The check only verifies that, with ``return_all=True``, the keys of the
        returned suggestions coincide with the index of the input data.
        """
        # Each row: split identifier, token as typed, correct token.
        typo_rows = [
            [["get", "tokens", "num"], "get", "get"],
            [["gwt", "tokens"], "gwt", "get"],
            [["get", "tokem"], "tokem", "token"],
        ]
        data = pandas.DataFrame(
            typo_rows, columns=[Columns.Split, Columns.Token, Columns.CorrectToken])

        # Each row: id of the typo row above, the typo itself, a correction candidate.
        candidate_rows = [
            [0, "get", "get"],
            [1, "gwt", "get"],
            [1, "gwt", "gpt"],
            [2, "tokem", "tokem"],
            [2, "tokem", "taken"],
            [2, "tokem", "token"],
        ]
        candidates_tokens = pandas.DataFrame(
            candidate_rows, columns=[Columns.Id, Columns.Token, Columns.Candidate])

        # Six feature rows, one per candidate row above.
        candidates_features = numpy.array([
            [10.0, 1.0, 3.5],
            [9.6, 1.3, 2.3],
            [0.23, -1.3, 156.3],
            [5.6, 0.4, 32.65],
            [-0.03, 0.2, 678.4],
            [8.9, 0.8, 5.2],
        ])

        ranker = CandidatesRanker()
        ranker.fit(data[Columns.CorrectToken], candidates_tokens, candidates_features,
                   val_part=0.5)
        suggestions = ranker.rank(candidates_tokens, candidates_features,
                                  n_candidates=1, return_all=True)

        suggested_ids = set(suggestions.keys())
        self.assertSetEqual(suggested_ids, set(data.index))

Пример #2

Показать файл

    def test_custom_ranker(self):
        """Train and apply the ranker on a three-typo toy dataset.

        Passes when the suggestions returned with ``return_all=True`` are keyed
        by exactly the index values of the input data.
        """
        data_columns = [SPLIT_COLUMN, TYPO_COLUMN, CORRECT_TOKEN_COLUMN]
        data = pandas.DataFrame(
            [
                [["get", "tokens", "num"], "get", "get"],
                [["gwt", "tokens"], "gwt", "get"],
                [["get", "tokem"], "tokem", "token"],
            ],
            columns=data_columns)

        candidate_columns = [ID_COLUMN, TYPO_COLUMN, CANDIDATE_COLUMN]
        candidates_tokens = pandas.DataFrame(
            [
                [0, "get", "get"],
                [1, "gwt", "get"],
                [1, "gwt", "gpt"],
                [2, "tokem", "tokem"],
                [2, "tokem", "taken"],
                [2, "tokem", "token"],
            ],
            columns=candidate_columns)

        # Six feature rows, one per candidate row above.
        candidates_features = numpy.array([
            [10.0, 1.0, 3.5],
            [9.6, 1.3, 2.3],
            [0.23, -1.3, 156.3],
            [5.6, 0.4, 32.65],
            [-0.03, 0.2, 678.4],
            [8.9, 0.8, 5.2],
        ])

        ranker = CandidatesRanker()
        ranker.fit(data[CORRECT_TOKEN_COLUMN], candidates_tokens, candidates_features,
                   val_part=0.5)
        suggestions = ranker.rank(candidates_tokens, candidates_features,
                                  n_candidates=1, return_all=True)

        suggested_ids = set(suggestions.keys())
        self.assertSetEqual(suggested_ids, set(data.index))

Пример #3

Показать файл

Файл: test_ranking.py Проект: warenlg/style-analyzer

 def test_ranker(self):
     """Fit and apply the ranker on the candidates stored in the test data directory.

     Passes when the suggestions returned with ``return_all=True`` are keyed by
     exactly the index values of the loaded data.
     """
     def parse_features(raw):
         # Drop the first and last characters of the stored text, then read the
         # whitespace-separated remainder as floats.
         return [float(value) for value in raw[1:-1].split()]

     data_path = join(TEST_DATA_PATH, "test_data.csv.xz")
     data = pandas.read_csv(data_path, index_col=0).infer_objects()
     candidates_path = join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz")
     candidates = pandas.read_csv(candidates_path)
     candidates.loc[:, Columns.Features] = candidates[Columns.Features].apply(parse_features)

     ranker = CandidatesRanker()
     correct_tokens = data[Columns.CorrectToken]
     train_metadata = get_candidates_metadata(candidates)
     train_features = get_candidates_features(candidates)
     ranker.fit(correct_tokens, train_metadata, train_features)

     # Metadata and features are extracted anew for ranking, as a separate call.
     rank_metadata = get_candidates_metadata(candidates)
     rank_features = get_candidates_features(candidates)
     suggestions = ranker.rank(rank_metadata, rank_features, n_candidates=3, return_all=True)

     suggested_ids = set(suggestions.keys())
     self.assertSetEqual(suggested_ids, set(data.index))

Пример #4

Показать файл

 def test_ranker(self):
     """Fit and apply the ranker on the pickled candidates from the test data directory.

     Passes when the suggestions returned with ``return_all=True`` are keyed by
     exactly the index values of the loaded data.
     """
     data_path = join(TEST_DATA_PATH, "test_data.csv.xz")
     data = pandas.read_csv(data_path, index_col=0).infer_objects()
     candidates_path = join(TEST_DATA_PATH, "test_data_candidates_full.pkl")
     candidates = pandas.read_pickle(candidates_path)

     ranker = CandidatesRanker()
     correct_tokens = data[CORRECT_TOKEN_COLUMN]
     train_metadata = get_candidates_metadata(candidates)
     train_features = get_candidates_features(candidates)
     ranker.fit(correct_tokens, train_metadata, train_features)

     # Metadata and features are extracted anew for ranking, as a separate call.
     rank_metadata = get_candidates_metadata(candidates)
     rank_features = get_candidates_features(candidates)
     suggestions = ranker.rank(rank_metadata, rank_features, n_candidates=3, return_all=True)

     suggested_ids = set(suggestions.keys())
     self.assertSetEqual(suggested_ids, set(data.index))

Пример #5

Показать файл

class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.

    Candidates are produced by `self.generator` (a CandidatesGenerator) and ordered by \
    `self.ranker` (a CandidatesRanker).
    """

    _log = logging.getLogger("TyposCorrector")

    NAME = "typos_correction"
    VENDOR = "source{d}"
    DESCRIPTION = "Model that suggests fixes to correct typos."
    LICENSE = DEFAULT_LICENSE

    def __init__(self, ranking_config: Optional[Mapping[str, Any]] = None, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param ranking_config: Ranking configuration, options:
                                train_rounds: Number of training rounds (int).
                                early_stopping: Early stopping parameter (int).
                                boost_param: Boosting parameters (dict).
        :param kwargs: Extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker(ranking_config)

    @property
    def processes_number(self) -> int:
        """Return the number of processes for multiprocessing used to train and to predict."""
        return self.ranker.config["boost_param"]["nthread"]

    @processes_number.setter
    def processes_number(self, processes_number: int):
        """Set the number of processes for multiprocessing used to train and to predict."""
        self.ranker.config["boost_param"]["nthread"] = processes_number

    def initialize_generator(self, vocabulary_file: str, frequencies_file: str,
                             embeddings_file: str, config: Optional[Mapping[str, Any]] = None,
                             ) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens among tokens at \
                                         equal edit distance from the typo to consider as \
                                         candidates (int).
                       max_distance: Maximum edit distance for symspell lookup for candidates \
                                    (int).
                       radius: Maximum edit distance from typo allowed for candidates (int).
                       max_corrected_length: Maximum length of prefix in which symspell lookup \
                                             for typos is conducted (int).
                       start_pool_size: Length of data, starting from which multiprocessing is \
                                        desired (int).
                       chunksize: Max size of a chunk for one process during multiprocessing (int).
        """
        self.generator.construct(vocabulary_file, frequencies_file, embeddings_file, config)
        self._log.debug("%s is initialized", repr(self.generator))

    def set_ranking_config(self, config: Mapping[str, Any]) -> None:
        """
        Update the ranking config - see XGBoost docs for details.

        :param config: Ranking configuration, options:
                       train_rounds: Number of training rounds (int).
                       early_stopping: Early stopping parameter (int).
                       boost_param: Boosting parameters (dict).
        """
        self.ranker.set_config(config)
        self._log.debug("%s is initialized", repr(self.ranker))

    def set_generation_config(self, config: Mapping[str, Any]) -> None:
        """
        Update the candidates generation config.

        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens among tokens at \
                                         equal edit distance from the typo to consider as \
                                         candidates (int).
                       max_distance: Maximum edit distance for symspell lookup for candidates \
                                    (int).
                       radius: Maximum edit distance from typo allowed for candidates (int).
                       max_corrected_length: Maximum length of prefix in which symspell lookup \
                                             for typos is conducted (int).
                       start_pool_size: Length of data, starting from which multiprocessing is \
                                        desired (int).
                       chunksize: Max size of a chunk for one process during multiprocessing (int).
        """
        self.generator.set_config(config)
        # Report the object that was actually reconfigured: the generator, not the ranker.
        self._log.debug("%s is initialized", repr(self.generator))

    def expand_vocabulary(self, additional_tokens: Iterable[str]) -> None:
        """
        Add given tokens to the model's vocabulary.

        :param additional_tokens: Tokens to add to the vocabulary.
        """
        self.generator.expand_vocabulary(additional_tokens)

    def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
              save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self._log.info("train input shape: %s", data.shape)
        if candidates is None:
            self._log.info("candidates were not provided and will be generated")
            candidates = self.generator.generate_candidates(
                data, self.processes_number, save_candidates_file)
        else:
            # Keep the path under its own name: `candidates` is rebound to the loaded
            # DataFrame, and the log line must mention the file, not dump the frame.
            candidates_file = candidates
            candidates = pandas.read_csv(candidates_file, index_col=0, keep_default_na=False)
            self._log.info("loaded candidates from %s", candidates_file)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self, data_file: str, candidates:  Optional[str] = None,
                      save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given file.

        :param data_file: A .csv dump of a dataframe which contains columns Columns.Token, \
                          Columns.CorrectToken and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self.train(pandas.read_csv(data_file, index_col=0, keep_default_na=False), candidates,
                   save_candidates_file)

    def suggest(self, data: pandas.DataFrame, candidates:  Optional[str] = None,
                save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.processes_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates), n_candidates, return_all)

    def suggest_on_file(self, data_file: str, candidates:  Optional[str] = None,
                        save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                        return_all: bool = True) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given file.

        :param data_file: A .csv dump of a DataFrame which contains columns Columns.Token \
                          and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        return self.suggest(pandas.read_csv(data_file, index_col=0, keep_default_na=False),
                            candidates, save_candidates_file, n_candidates, return_all)

    def suggest_by_batches(self, data: pandas.DataFrame, n_candidates: int = 3,
                           return_all: bool = True, batch_size: int = 2048,
                           ) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given dataset by batches. \
        Does not support precalculated candidates.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :param batch_size: Batch size.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        all_suggestions = {}
        for start in tqdm(range(0, len(data), batch_size)):
            batch = data.iloc[start:start + batch_size, :]
            # Merge batch results as they arrive; a key repeated in a later batch
            # overrides the earlier value.
            all_suggestions.update(
                self.suggest(batch, n_candidates=n_candidates, return_all=return_all))
        return all_suggestions

    def evaluate(self, test_data: pandas.DataFrame) -> Tuple[Dict[int, List[Candidate]], str]:
        """
        Evaluate the corrector on the given test dataset.

        Save the result metrics to the model metadata and log the quality report.
        :param test_data: DataFrame which contains column Columns.Token, \
                          column Columns.Split is optional, but used when present.
        :return: Suggestions for correction of tokens inside the `test_data` and the quality
                 report.
        """
        self._log.info("evaluate on test data with shape %s", test_data.shape)
        suggestions = self.suggest(test_data)
        report = generate_report(test_data, suggestions)
        self.metrics = get_scores(test_data, suggestions)
        self._log.info("evaluation report:\n%s", report)
        return suggestions, report

    def __eq__(self, other: "TyposCorrector") -> bool:
        """Compare two correctors by their generators and rankers."""
        if not isinstance(other, TyposCorrector):
            # Let Python try the reflected comparison instead of failing with
            # AttributeError on objects that have no generator/ranker.
            return NotImplemented
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" %
                (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        """Build a dict holding the trees of the generator and the ranker."""
        return {"generator": self.generator._generate_tree(),
                "ranker": self.ranker._generate_tree()}

    def _load_tree(self, tree: dict) -> None:
        """Load the generator and the ranker from the matching entries of `tree`."""
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])

Пример #6

Показать файл

Файл: corrector.py Проект: warenlg/style-analyzer

class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.

    Candidates are produced by `self.generator` (a CandidatesGenerator) and ordered by \
    `self.ranker` (a CandidatesRanker).
    """

    NAME = "typos_correction"
    VENDOR = "source{d}"
    DESCRIPTION = "Model that suggests fixes to correct typos."
    LICENSE = DEFAULT_LICENSE
    DEFAULT_RADIUS = 3
    DEFAULT_MAX_DISTANCE = 2
    DEFAULT_NEIGHBORS_NUMBER = 0
    DEFAULT_EDIT_CANDIDATES = 20
    DEFAULT_TRAIN_ROUNDS = 4000
    DEFAULT_EARLY_STOPPING = 200
    # Class-level defaults shared by all instances; treat as read-only.
    DEFAULT_BOOST_PARAM = {"max_depth": 6,
                           "eta": 0.03,
                           "min_child_weight": 2,
                           "silent": 1,
                           "objective": "binary:logistic",
                           "subsample": 0.5,
                           "colsample_bytree": 0.5,
                           "alpha": 1,
                           "eval_metric": ["error"],
                           "nthread": 0}

    def __init__(self, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param kwargs: extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker()

    @property
    def threads_number(self) -> int:
        """Return the number of threads for multiprocessing used to train and to predict."""
        return self.ranker.boost_param["nthread"]

    @threads_number.setter
    def threads_number(self, threads_number: int):
        """Set the number of threads for multiprocessing used to train and to predict."""
        self.ranker.boost_param["nthread"] = threads_number

    def initialize_ranker(self, boost_params: Optional[dict] = None,
                          train_rounds: int = DEFAULT_TRAIN_ROUNDS,
                          early_stopping: int = DEFAULT_EARLY_STOPPING) -> None:
        """
        Apply the ranking parameters - see XGBoost docs for details.

        :param train_rounds: Number of training rounds.
        :param early_stopping: Early stopping parameter.
        :param boost_params: Boosting parameters. A copy of DEFAULT_BOOST_PARAM is used when \
                             this is None or empty.
        :return: Nothing
        """
        if not boost_params:
            from copy import deepcopy

            # The threads_number setter writes into the ranker's boost parameters in
            # place. Passing a private copy guarantees that such writes cannot reach
            # the class-level defaults shared by every TyposCorrector.
            boost_params = deepcopy(self.DEFAULT_BOOST_PARAM)
        self.ranker.construct(boost_params, train_rounds, early_stopping)

    def initialize_generator(self, vocabulary_file: str, frequencies_file: str,
                             embeddings_file: Optional[str] = None,
                             neighbors_number: int = DEFAULT_NEIGHBORS_NUMBER,
                             edit_candidates: int = DEFAULT_EDIT_CANDIDATES,
                             max_distance: int = DEFAULT_MAX_DISTANCE,
                             radius: int = DEFAULT_RADIUS) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param neighbors_number: Number of neighbors of context and typo embeddings \
                                 to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among tokens on \
                                equal edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for symspell lookup.
        :param radius: Maximum edit distance from typo allowed for candidates.
        """
        self.generator.construct(vocabulary_file, frequencies_file, embeddings_file,
                                 neighbors_number, edit_candidates, max_distance, radius)

    def train(self, data: pandas.DataFrame, candidates:  Optional[str] = None,
              save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.threads_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self, data_file: str, candidates:  Optional[str] = None,
                      save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given file.

        :param data_file: A .csv dump of a dataframe which contains columns Columns.Token, \
                          Columns.CorrectToken and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self.train(pandas.read_csv(data_file, index_col=0), candidates, save_candidates_file)

    def suggest(self, data: pandas.DataFrame, candidates:  Optional[str] = None,
                save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.threads_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates), n_candidates, return_all)

    def suggest_on_file(self, data_file: str, candidates:  Optional[str] = None,
                        save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                        return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given file.

        :param data_file: A .csv dump of a DataFrame which contains columns Columns.Token \
                          and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        return self.suggest(pandas.read_csv(data_file, index_col=0), candidates,
                            save_candidates_file, n_candidates, return_all)

    def suggest_by_batches(self, data: pandas.DataFrame, n_candidates: int = 3,
                           return_all: bool = True, batch_size: int = 2048,
                           ) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given dataset by batches. \
        Does not support precalculated candidates.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :param batch_size: Batch size.

        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are sorted \
                 by correctness probability in a descending order.
        """
        all_suggestions = {}
        for start in tqdm(range(0, len(data), batch_size)):
            batch = data.iloc[start:start + batch_size, :]
            # Merge batch results as they arrive; a key repeated in a later batch
            # overrides the earlier value.
            all_suggestions.update(
                self.suggest(batch, n_candidates=n_candidates, return_all=return_all))
        return all_suggestions

    def evaluate(self, test_data: pandas.DataFrame) -> None:
        """
        Evaluate the corrector on the given test dataset.

        Save the result metrics to the model metadata and print it to the standard output.
        :param test_data: DataFrame which contains column Columns.Token, \
                          column Columns.Split is optional, but used when present
        """
        suggestions = self.suggest(test_data)
        self.metrics = get_scores(test_data, suggestions)
        print(generate_report(test_data, suggestions))

    def __eq__(self, other: "TyposCorrector") -> bool:
        """Compare two correctors by their generators and rankers."""
        if not isinstance(other, TyposCorrector):
            # Let Python try the reflected comparison instead of failing with
            # AttributeError on objects that have no generator/ranker.
            return NotImplemented
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" %
                (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        """Build a dict holding the trees of the generator and the ranker."""
        return {"generator": self.generator._generate_tree(),
                "ranker": self.ranker._generate_tree()}

    def _load_tree(self, tree: dict) -> None:
        """Load the generator and the ranker from the matching entries of `tree`."""
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])

Пример #7

Показать файл

class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.

    Holds a CandidatesGenerator, which produces the candidates, and a CandidatesRanker,
    which is fitted on and ranks those candidates.
    """

    NAME = "typos_correction"
    VENDOR = "source{d}"

    # Defaults of initialize_generator().
    DEFAULT_RADIUS = 3
    DEFAULT_MAX_DISTANCE = 2
    DEFAULT_NEIGHBORS_NUMBER = 0
    DEFAULT_EDIT_CANDIDATES = 20
    # Defaults of initialize_ranker().
    DEFAULT_TRAIN_ROUNDS = 4000
    DEFAULT_EARLY_STOPPING = 200
    DEFAULT_BOOST_PARAM = {
        "max_depth": 6,
        "eta": 0.03,
        "min_child_weight": 2,
        "silent": 1,
        "objective": "binary:logistic",
        "subsample": 0.5,
        "colsample_bytree": 0.5,
        "alpha": 1,
        "eval_metric": ["error"],
        "nthread": 0
    }

    def __init__(self, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param kwargs: extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker()

    @property
    def threads_number(self) -> int:
        """Return the number of threads used to train and to predict."""
        return self.ranker.boost_param["nthread"]

    def initialize_ranker(self,
                          train_rounds: int = DEFAULT_TRAIN_ROUNDS,
                          early_stopping: int = DEFAULT_EARLY_STOPPING,
                          boost_params: dict = None) -> None:
        """
        Apply the ranking parameters - see XGBoost docs for details.

        :param train_rounds: Number of training rounds.
        :param early_stopping: Early stopping parameter.
        :param boost_params: Boosting parameters. The defaults are DEFAULT_BOOST_PARAM; \
                             they are also used when an empty dict is passed.
        :return: Nothing
        """
        if not boost_params:
            # Pass a copy so that top-level edits of the ranker's parameters do not write
            # through to the class-level defaults shared by every instance.
            boost_params = dict(self.DEFAULT_BOOST_PARAM)
        self.ranker.construct(train_rounds, early_stopping, boost_params)

    def initialize_generator(self,
                             vocabulary_file: str,
                             frequencies_file: str,
                             embeddings_file: str = None,
                             neighbors_number: int = DEFAULT_NEIGHBORS_NUMBER,
                             edit_candidates: int = DEFAULT_EDIT_CANDIDATES,
                             max_distance: int = DEFAULT_MAX_DISTANCE,
                             radius: int = DEFAULT_RADIUS) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param neighbors_number: Number of neighbors of context and typo embeddings \
                                 to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among tokens on \
                                equal edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for symspell lookup.
        :param radius: Maximum edit distance from typo allowed for candidates.
        :return: Nothing
        """
        self.generator.construct(vocabulary_file, frequencies_file,
                                 embeddings_file, neighbors_number,
                                 edit_candidates, max_distance, radius)

    def train(self,
              typos: pandas.DataFrame,
              candidates: pandas.DataFrame = None,
              save_candidates_file: str = None) -> None:
        """
        Train corrector on the given dataset of typos inside identifiers.

        :param typos: DataFrame containing columns "typo" and "identifier",
                      column "token_split" is optional, but used when present.
        :param candidates: DataFrame with precalculated candidates. When omitted, \
                           the candidates are generated from `typos`.
        :param save_candidates_file: Path to file where to save the candidates.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        self.ranker.fit(typos[CORRECT_TOKEN_COLUMN],
                        get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self,
                      typos_file: str,
                      candidates_file: str = None,
                      save_candidates_file: str = None) -> None:
        """
        Train corrector on the given dataset of typos inside identifiers.

        :param typos_file: CSV file with columns "typo" and "identifier",
                           column "token_split" is optional, but used when present.
        :param candidates_file: Pickle dump of pandas.DataFrame with precalculated \
                                candidates and features
        :param save_candidates_file: Path to file where to save the candidates.
        """
        typos = pandas.read_csv(typos_file, index_col=0)
        candidates = None
        if candidates_file is not None:
            # NOTE(security): unpickling can execute arbitrary code - only load
            # candidates files which come from a trusted source.
            candidates = pandas.read_pickle(candidates_file)
        self.train(typos, candidates, save_candidates_file)

    def suggest(self,
                typos: pandas.DataFrame,
                candidates: pandas.DataFrame = None,
                save_candidates_file: str = None,
                n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for given typos.

        :param typos: DataFrame containing column "typo", \
                      column "token_split" is optional, but used when present
        :param candidates: DataFrame with precalculated candidates. When omitted, \
                           the candidates are generated from `typos`.
        :param save_candidates_file: Path to file to save candidates to
        :param n_candidates: Number of most probable candidates to return
        :param return_all: False to return suggestions only for corrected tokens
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates \
                 are sorted by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates),
                                n_candidates, return_all)

    def suggest_file(
            self,
            typos_file: str,
            candidates_file: str = None,
            save_candidates_file: str = None,
            n_candidates: int = 3,
            return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for given typos.

        :param typos_file: csv file containing DataFrame with column "typo", \
                           column "token_split" is optional, but used when present
        :param candidates_file: pickle file containing DataFrame with precalculated \
                                candidates and features
        :param save_candidates_file: Path to file to save candidates to
        :param n_candidates: Number of most probable candidates to return
        :param return_all: False to return suggestions only for corrected tokens
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates \
                 are sorted by correctness probability in a descending order.
        """
        typos = pandas.read_csv(typos_file, index_col=0)
        candidates = None
        if candidates_file is not None:
            # NOTE(security): unpickling can execute arbitrary code - only load
            # candidates files which come from a trusted source.
            candidates = pandas.read_pickle(candidates_file)
        return self.suggest(typos, candidates, save_candidates_file,
                            n_candidates, return_all)

    def suggest_by_batches(
        self,
        typos: pandas.DataFrame,
        n_candidates: int = None,
        return_all: bool = True,
        batch_size: int = 2048,
    ) -> Dict[int, List[Tuple[str, float]]]:
        """
        Correct typos from dataset by batches. Does not support precalculated candidates.

        Suggest corrections for given typos
        :param typos: DataFrame containing column "typo", \
               column "token_split" is optional, but used when present
        :param n_candidates: Number of most probable candidates to return
        :param return_all: False to return suggestions only for corrected tokens
        :param batch_size: Batch size
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates \
                 are sorted by correctness probability in a descending order.
        """
        all_suggestions = {}
        for start in tqdm(range(0, len(typos), batch_size)):
            # Slice by position: a label-based .loc slice over the index would select
            # extra rows or fail when the index contains duplicate labels.
            batch = typos.iloc[start:start + batch_size, :]
            all_suggestions.update(
                self.suggest(batch, n_candidates=n_candidates, return_all=return_all))
        return all_suggestions

    def __eq__(self, other: "TyposCorrector") -> bool:
        """
        Compare two correctors by their generator and ranker.

        :param other: Object to compare with.
        :return: True when both the generators and the rankers are equal; NotImplemented \
                 when `other` is not a TyposCorrector, so that Python can fall back to \
                 its default comparison instead of raising AttributeError.
        """
        if not isinstance(other, TyposCorrector):
            return NotImplemented
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" % (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        """Return the serialization tree: the generator's and the ranker's sub-trees."""
        return {
            "generator": self.generator._generate_tree(),
            "ranker": self.ranker._generate_tree()
        }

    def _load_tree(self, tree: dict) -> None:
        """Restore the generator and the ranker from their sub-trees of `tree`."""
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])