def test_custom_ranker(self):
    custom_data = pandas.DataFrame(
        [[["get", "tokens", "num"], "get", "get"],
         [["gwt", "tokens"], "gwt", "get"],
         [["get", "tokem"], "tokem", "token"]],
        columns=[SPLIT_COLUMN, TYPO_COLUMN, CORRECT_TOKEN_COLUMN])
    custom_candidates_tokens = pandas.DataFrame(
        [[0, "get", "get"],
         [1, "gwt", "get"],
         [1, "gwt", "gpt"],
         [2, "tokem", "tokem"],
         [2, "tokem", "taken"],
         [2, "tokem", "token"]],
        columns=[ID_COLUMN, TYPO_COLUMN, CANDIDATE_COLUMN])
    custom_candidates_features = numpy.array(
        [[10.0, 1.0, 3.5],
         [9.6, 1.3, 2.3],
         [0.23, -1.3, 156.3],
         [5.6, 0.4, 32.65],
         [-0.03, 0.2, 678.4],
         [8.9, 0.8, 5.2]])
    ranker = CandidatesRanker()
    ranker.fit(custom_data[CORRECT_TOKEN_COLUMN], custom_candidates_tokens,
               custom_candidates_features, val_part=0.5)
    suggestions = ranker.rank(custom_candidates_tokens, custom_candidates_features,
                              n_candidates=1, return_all=True)
    self.assertSetEqual(set(suggestions.keys()), set(custom_data.index))
def test_custom_ranker(self):
    custom_data = pandas.DataFrame(
        [[["get", "tokens", "num"], "get", "get"],
         [["gwt", "tokens"], "gwt", "get"],
         [["get", "tokem"], "tokem", "token"]],
        columns=[Columns.Split, Columns.Token, Columns.CorrectToken])
    custom_candidates_tokens = pandas.DataFrame(
        [[0, "get", "get"],
         [1, "gwt", "get"],
         [1, "gwt", "gpt"],
         [2, "tokem", "tokem"],
         [2, "tokem", "taken"],
         [2, "tokem", "token"]],
        columns=[Columns.Id, Columns.Token, Columns.Candidate])
    custom_candidates_features = numpy.array(
        [[10.0, 1.0, 3.5],
         [9.6, 1.3, 2.3],
         [0.23, -1.3, 156.3],
         [5.6, 0.4, 32.65],
         [-0.03, 0.2, 678.4],
         [8.9, 0.8, 5.2]])
    ranker = CandidatesRanker()
    ranker.fit(custom_data[Columns.CorrectToken], custom_candidates_tokens,
               custom_candidates_features, val_part=0.5)
    suggestions = ranker.rank(custom_candidates_tokens, custom_candidates_features,
                              n_candidates=1, return_all=True)
    self.assertSetEqual(set(suggestions.keys()), set(custom_data.index))
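
# Note on the fixtures above: the feature matrix must stay aligned row-for-row with
# the candidates frame, one feature vector per (typo, candidate) pair, which is why
# both have six rows. A minimal, self-contained sketch of that invariant follows;
# it uses plain numpy/pandas and illustrative column names only, no project code.
import numpy
import pandas

toy_candidates = pandas.DataFrame([[1, "gwt", "get"], [1, "gwt", "gpt"]],
                                  columns=["id", "typo", "candidate"])
toy_features = numpy.zeros((len(toy_candidates), 3))  # one feature row per candidate row
assert toy_features.shape[0] == len(toy_candidates)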
def test_eq(self):
    self.assertTrue(CandidatesRanker() == CandidatesRanker())
    data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                           index_col=0).infer_objects()
    candidates = pandas.read_csv(join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz"))
    candidates.loc[:, Columns.Features] = candidates[Columns.Features].apply(
        lambda x: list(map(float, x[1:-1].split())))
    ranker = CandidatesRanker()
    ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
               get_candidates_features(candidates))
    self.assertFalse(ranker == CandidatesRanker())
def test_eq(self):
    self.assertTrue(CandidatesRanker() == CandidatesRanker())
    data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                           index_col=0).infer_objects()
    candidates = pandas.read_pickle(join(TEST_DATA_PATH, "test_data_candidates_full.pkl"))
    ranker = CandidatesRanker()
    ranker.fit(data[CORRECT_TOKEN_COLUMN], get_candidates_metadata(candidates),
               get_candidates_features(candidates))
    self.assertFalse(ranker == CandidatesRanker())
def test_ranker(self):
    data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                           index_col=0).infer_objects()
    candidates = pandas.read_csv(join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz"))
    candidates.loc[:, Columns.Features] = candidates[Columns.Features].apply(
        lambda x: list(map(float, x[1:-1].split())))
    ranker = CandidatesRanker()
    ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
               get_candidates_features(candidates))
    suggestions = ranker.rank(get_candidates_metadata(candidates),
                              get_candidates_features(candidates),
                              n_candidates=3, return_all=True)
    self.assertSetEqual(set(suggestions.keys()), set(data.index))
def test_ranker(self):
    data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                           index_col=0).infer_objects()
    candidates = pandas.read_pickle(join(TEST_DATA_PATH, "test_data_candidates_full.pkl"))
    ranker = CandidatesRanker()
    ranker.fit(data[CORRECT_TOKEN_COLUMN], get_candidates_metadata(candidates),
               get_candidates_features(candidates))
    suggestions = ranker.rank(get_candidates_metadata(candidates),
                              get_candidates_features(candidates),
                              n_candidates=3, return_all=True)
    self.assertSetEqual(set(suggestions.keys()), set(data.index))
def test_save_load(self):
    data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                           index_col=0).infer_objects()
    candidates = pandas.read_csv(join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz"))
    candidates.loc[:, Columns.Features] = candidates[Columns.Features].apply(
        lambda x: list(map(float, x[1:-1].split())))
    ranker = CandidatesRanker()
    ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
               get_candidates_features(candidates))
    with io.BytesIO() as buffer:
        ranker.save(output=buffer, series="typos-analyzer")
        print(buffer.tell())
        buffer.seek(0)
        ranker2 = CandidatesRanker().load(buffer)
    print(ranker)
    self.assertTrue(ranker == ranker2)
def test_save_load(self):
    data = pandas.read_csv(join(TEST_DATA_PATH, "test_data.csv.xz"),
                           index_col=0).infer_objects()
    candidates = pandas.read_pickle(join(TEST_DATA_PATH, "test_data_candidates_full.pkl"))
    ranker = CandidatesRanker()
    ranker.fit(data[CORRECT_TOKEN_COLUMN], get_candidates_metadata(candidates),
               get_candidates_features(candidates))
    with io.BytesIO() as buffer:
        ranker.save(buffer)
        print(buffer.tell())
        buffer.seek(0)
        ranker2 = CandidatesRanker().load(buffer)
    print(ranker)
    self.assertTrue(ranker == ranker2)
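
# A hedged sketch of consuming the suggestions checked by the tests above: rank()
# returns a dict mapping each input row id to a list of (candidate, probability)
# pairs sorted by correctness probability, descending. The dict below is
# illustrative data, not real model output.
suggestions = {0: [("get", 0.93)], 1: [("get", 0.71), ("gpt", 0.12)]}
for row_id, ranked in suggestions.items():
    best, proba = ranked[0]  # the most probable correction comes first
    print("row %d: best correction %r (p=%.2f)" % (row_id, best, proba))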
class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.
    """

    _log = logging.getLogger("TyposCorrector")

    NAME = "typos_correction"
    VENDOR = "source{d}"
    DESCRIPTION = "Model that suggests fixes to correct typos."
    LICENSE = DEFAULT_LICENSE

    def __init__(self, ranking_config: Optional[Mapping[str, Any]] = None, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param ranking_config: Ranking configuration, options:
                               train_rounds: Number of training rounds (int).
                               early_stopping: Early stopping parameter (int).
                               boost_param: Boosting parameters (dict).
        :param kwargs: Extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker(ranking_config)

    @property
    def processes_number(self) -> int:
        """Return the number of processes for multiprocessing used to train and to predict."""
        return self.ranker.config["boost_param"]["nthread"]

    @processes_number.setter
    def processes_number(self, processes_number: int):
        """Set the number of processes for multiprocessing used to train and to predict."""
        self.ranker.config["boost_param"]["nthread"] = processes_number

    def initialize_generator(self, vocabulary_file: str, frequencies_file: str,
                             embeddings_file: str,
                             config: Optional[Mapping[str, Any]] = None) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens among tokens at \
                                         equal edit distance from the typo to consider as \
                                         candidates (int).
                       max_distance: Maximum edit distance for symspell lookup for candidates \
                                     (int).
                       radius: Maximum edit distance from typo allowed for candidates (int).
                       max_corrected_length: Maximum length of prefix in which symspell lookup \
                                             for typos is conducted (int).
                       start_pool_size: Length of data, starting from which multiprocessing is \
                                        desired (int).
                       chunksize: Max size of a chunk for one process during multiprocessing \
                                  (int).
        """
        self.generator.construct(vocabulary_file, frequencies_file, embeddings_file, config)
        self._log.debug("%s is initialized", repr(self.generator))

    def set_ranking_config(self, config: Mapping[str, Any]) -> None:
        """
        Update the ranking config - see XGBoost docs for details.

        :param config: Ranking configuration, options:
                       train_rounds: Number of training rounds (int).
                       early_stopping: Early stopping parameter (int).
                       boost_param: Boosting parameters (dict).
        """
        self.ranker.set_config(config)
        self._log.debug("%s is initialized", repr(self.ranker))

    def set_generation_config(self, config: Mapping[str, Any]) -> None:
        """
        Update the candidates generation config.

        :param config: Candidates generation configuration, options:
                       neighbors_number: Number of neighbors of context and typo embeddings \
                                         to consider as candidates (int).
                       edit_dist_number: Number of the most frequent tokens among tokens at \
                                         equal edit distance from the typo to consider as \
                                         candidates (int).
                       max_distance: Maximum edit distance for symspell lookup for candidates \
                                     (int).
                       radius: Maximum edit distance from typo allowed for candidates (int).
                       max_corrected_length: Maximum length of prefix in which symspell lookup \
                                             for typos is conducted (int).
                       start_pool_size: Length of data, starting from which multiprocessing is \
                                        desired (int).
                       chunksize: Max size of a chunk for one process during multiprocessing \
                                  (int).
        """
        self.generator.set_config(config)
        self._log.debug("%s is initialized", repr(self.generator))

    def expand_vocabulary(self, additional_tokens: Iterable[str]) -> None:
        """
        Add given tokens to the model's vocabulary.

        :param additional_tokens: Tokens to add to the vocabulary.
        """
        self.generator.expand_vocabulary(additional_tokens)

    def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
              save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self._log.info("train input shape: %s", data.shape)
        if candidates is None:
            self._log.info("candidates were not provided and will be generated")
            candidates = self.generator.generate_candidates(
                data, self.processes_number, save_candidates_file)
        else:
            self._log.info("loading candidates from %s", candidates)
            candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self, data_file: str, candidates: Optional[str] = None,
                      save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given file.

        :param data_file: A .csv dump of a dataframe which contains columns Columns.Token, \
                          Columns.CorrectToken and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self.train(pandas.read_csv(data_file, index_col=0, keep_default_na=False),
                   candidates, save_candidates_file)

    def suggest(self, data: pandas.DataFrame, candidates: Optional[str] = None,
                save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates \
                 are sorted by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.processes_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates), n_candidates, return_all)

    def suggest_on_file(self, data_file: str, candidates: Optional[str] = None,
                        save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                        return_all: bool = True) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given file.

        :param data_file: A .csv dump of a DataFrame which contains columns Columns.Token \
                          and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates \
                 are sorted by correctness probability in a descending order.
        """
        return self.suggest(pandas.read_csv(data_file, index_col=0, keep_default_na=False),
                            candidates, save_candidates_file, n_candidates, return_all)

    def suggest_by_batches(self, data: pandas.DataFrame, n_candidates: int = 3,
                           return_all: bool = True, batch_size: int = 2048,
                           ) -> Dict[int, List[Candidate]]:
        """
        Suggest corrections for the tokens from the given dataset by batches. \
        Does not support precalculated candidates.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :param batch_size: Batch size.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates \
                 are sorted by correctness probability in a descending order.
        """
        all_suggestions = []
        for i in tqdm(range(0, len(data), batch_size)):
            suggestions = self.suggest(data.iloc[i:i + batch_size, :],
                                       n_candidates=n_candidates, return_all=return_all)
            all_suggestions.append(suggestions.items())
        return dict(chain.from_iterable(all_suggestions))

    def evaluate(self, test_data: pandas.DataFrame) -> Tuple[Dict[int, List[Candidate]], str]:
        """
        Evaluate the corrector on the given test dataset.

        Save the result metrics to the model metadata and print it to the standard output.

        :param test_data: DataFrame which contains column Columns.Token, \
                          column Columns.Split is optional, but used when present.
        :return: Suggestions for correction of tokens inside the `test_data` and the quality \
                 report.
        """
        self._log.info("evaluate on test data with shape %s", test_data.shape)
        suggestions = self.suggest(test_data)
        report = generate_report(test_data, suggestions)
        self.metrics = get_scores(test_data, suggestions)
        self._log.info("evaluation report:\n%s", report)
        return suggestions, report

    def __eq__(self, other: "TyposCorrector") -> bool:
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" % (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        return {"generator": self.generator._generate_tree(),
                "ranker": self.ranker._generate_tree()}

    def _load_tree(self, tree: dict) -> None:
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])
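
# End-to-end usage sketch for the class above. All file paths and the partial config
# value are illustrative assumptions, not fixtures shipped with the project; only
# save(output, series=...) is confirmed by the tests earlier in this file.
corrector = TyposCorrector(ranking_config={"train_rounds": 1000})   # assumed partial config
corrector.initialize_generator(vocabulary_file="vocabulary.csv",    # assumed path
                               frequencies_file="frequencies.csv",  # assumed path
                               embeddings_file="embeddings.bin")    # assumed path
corrector.train_on_file("train_data.csv")  # candidates are generated on the fly
suggestions = corrector.suggest_on_file("typos.csv", n_candidates=3, return_all=False)
corrector.save("typos_correction.asdf", series="typos")  # assumed output path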
class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.
    """

    NAME = "typos_correction"
    VENDOR = "source{d}"
    DESCRIPTION = "Model that suggests fixes to correct typos."
    LICENSE = DEFAULT_LICENSE

    DEFAULT_RADIUS = 3
    DEFAULT_MAX_DISTANCE = 2
    DEFAULT_NEIGHBORS_NUMBER = 0
    DEFAULT_EDIT_CANDIDATES = 20
    DEFAULT_TRAIN_ROUNDS = 4000
    DEFAULT_EARLY_STOPPING = 200
    DEFAULT_BOOST_PARAM = {"max_depth": 6,
                           "eta": 0.03,
                           "min_child_weight": 2,
                           "silent": 1,
                           "objective": "binary:logistic",
                           "subsample": 0.5,
                           "colsample_bytree": 0.5,
                           "alpha": 1,
                           "eval_metric": ["error"],
                           "nthread": 0}

    def __init__(self, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param kwargs: Extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker()

    @property
    def threads_number(self) -> int:
        """Return the number of threads for multiprocessing used to train and to predict."""
        return self.ranker.boost_param["nthread"]

    @threads_number.setter
    def threads_number(self, threads_number: int):
        """Set the number of threads for multiprocessing used to train and to predict."""
        self.ranker.boost_param["nthread"] = threads_number

    def initialize_ranker(self, boost_params: Optional[dict] = None,
                          train_rounds: int = DEFAULT_TRAIN_ROUNDS,
                          early_stopping: int = DEFAULT_EARLY_STOPPING) -> None:
        """
        Apply the ranking parameters - see XGBoost docs for details.

        :param boost_params: Boosting parameters. The defaults are DEFAULT_BOOST_PARAM.
        :param train_rounds: Number of training rounds.
        :param early_stopping: Early stopping parameter.
        :return: Nothing
        """
        boost_params = boost_params or self.DEFAULT_BOOST_PARAM
        self.ranker.construct(boost_params, train_rounds, early_stopping)

    def initialize_generator(self, vocabulary_file: str, frequencies_file: str,
                             embeddings_file: Optional[str] = None,
                             neighbors_number: int = DEFAULT_NEIGHBORS_NUMBER,
                             edit_candidates: int = DEFAULT_EDIT_CANDIDATES,
                             max_distance: int = DEFAULT_MAX_DISTANCE,
                             radius: int = DEFAULT_RADIUS) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param neighbors_number: Number of neighbors of context and typo embeddings \
                                 to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among tokens on \
                                equal edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for symspell lookup.
        :param radius: Maximum edit distance from typo allowed for candidates.
        """
        self.generator.construct(vocabulary_file, frequencies_file, embeddings_file,
                                 neighbors_number, edit_candidates, max_distance, radius)

    def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
              save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                     and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.threads_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0)
        self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self, data_file: str, candidates: Optional[str] = None,
                      save_candidates_file: Optional[str] = None) -> None:
        """
        Train corrector on tokens from the given file.

        :param data_file: A .csv dump of a dataframe which contains columns Columns.Token, \
                          Columns.CorrectToken and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
        """
        self.train(pandas.read_csv(data_file, index_col=0), candidates, save_candidates_file)

    def suggest(self, data: pandas.DataFrame, candidates: Optional[str] = None,
                save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given dataset.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates \
                 are sorted by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                data, self.threads_number, save_candidates_file)
        else:
            candidates = pandas.read_csv(candidates, index_col=0)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates), n_candidates, return_all)

    def suggest_on_file(self, data_file: str, candidates: Optional[str] = None,
                        save_candidates_file: Optional[str] = None, n_candidates: int = 3,
                        return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given file.

        :param data_file: A .csv dump of a DataFrame which contains columns Columns.Token \
                          and Columns.Split.
        :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to (.csv.xz).
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates \
                 are sorted by correctness probability in a descending order.
        """
        return self.suggest(pandas.read_csv(data_file, index_col=0), candidates,
                            save_candidates_file, n_candidates, return_all)

    def suggest_by_batches(self, data: pandas.DataFrame, n_candidates: int = 3,
                           return_all: bool = True, batch_size: int = 2048,
                           ) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for the tokens from the given dataset by batches. \
        Does not support precalculated candidates.

        :param data: DataFrame which contains columns Columns.Token and Columns.Split.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :param batch_size: Batch size.
        :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates \
                 are sorted by correctness probability in a descending order.
        """
        all_suggestions = []
        for i in tqdm(range(0, len(data), batch_size)):
            suggestions = self.suggest(data.iloc[i:i + batch_size, :],
                                       n_candidates=n_candidates, return_all=return_all)
            all_suggestions.append(suggestions.items())
        return dict(chain.from_iterable(all_suggestions))

    def evaluate(self, test_data: pandas.DataFrame) -> None:
        """
        Evaluate the corrector on the given test dataset.

        Save the result metrics to the model metadata and print it to the standard output.

        :param test_data: DataFrame which contains column Columns.Token, \
                          column Columns.Split is optional, but used when present.
        """
        suggestions = self.suggest(test_data)
        self.metrics = get_scores(test_data, suggestions)
        print(generate_report(test_data, suggestions))

    def __eq__(self, other: "TyposCorrector") -> bool:
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" % (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        return {"generator": self.generator._generate_tree(),
                "ranker": self.ranker._generate_tree()}

    def _load_tree(self, tree: dict) -> None:
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])
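
# Usage sketch for this variant of the class: the XGBoost parameters are applied
# explicitly through initialize_ranker() instead of a config mapping. The override
# values below are illustrative assumptions.
corrector = TyposCorrector()
corrector.initialize_ranker(boost_params={**TyposCorrector.DEFAULT_BOOST_PARAM, "eta": 0.1},
                            train_rounds=500, early_stopping=50)
corrector.threads_number = 4  # forwarded to the ranker's boost_param["nthread"]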
class TyposCorrector(Model):
    """
    Model for correcting typos in tokens inside identifiers.
    """

    NAME = "typos_correction"
    VENDOR = "source{d}"

    DEFAULT_RADIUS = 3
    DEFAULT_MAX_DISTANCE = 2
    DEFAULT_NEIGHBORS_NUMBER = 0
    DEFAULT_EDIT_CANDIDATES = 20
    DEFAULT_TRAIN_ROUNDS = 4000
    DEFAULT_EARLY_STOPPING = 200
    DEFAULT_BOOST_PARAM = {"max_depth": 6,
                           "eta": 0.03,
                           "min_child_weight": 2,
                           "silent": 1,
                           "objective": "binary:logistic",
                           "subsample": 0.5,
                           "colsample_bytree": 0.5,
                           "alpha": 1,
                           "eval_metric": ["error"],
                           "nthread": 0}

    def __init__(self, **kwargs):
        """
        Initialize a new instance of TyposCorrector class.

        :param kwargs: Extra keyword arguments which are consumed by Model.
        """
        super().__init__(**kwargs)
        self.generator = CandidatesGenerator()
        self.ranker = CandidatesRanker()

    @property
    def threads_number(self):
        """Return the number of threads used to train and to predict."""
        return self.ranker.boost_param["nthread"]

    def initialize_ranker(self, train_rounds: int = DEFAULT_TRAIN_ROUNDS,
                          early_stopping: int = DEFAULT_EARLY_STOPPING,
                          boost_params: dict = None) -> None:
        """
        Apply the ranking parameters - see XGBoost docs for details.

        :param train_rounds: Number of training rounds.
        :param early_stopping: Early stopping parameter.
        :param boost_params: Boosting parameters. The defaults are DEFAULT_BOOST_PARAM.
        :return: Nothing
        """
        boost_params = boost_params or self.DEFAULT_BOOST_PARAM
        self.ranker.construct(train_rounds, early_stopping, boost_params)

    def initialize_generator(self, vocabulary_file: str, frequencies_file: str,
                             embeddings_file: str = None,
                             neighbors_number: int = DEFAULT_NEIGHBORS_NUMBER,
                             edit_candidates: int = DEFAULT_EDIT_CANDIDATES,
                             max_distance: int = DEFAULT_MAX_DISTANCE,
                             radius: int = DEFAULT_RADIUS) -> None:
        """
        Construct a new CandidatesGenerator.

        :param vocabulary_file: The path to the vocabulary.
        :param frequencies_file: The path to the frequencies.
        :param embeddings_file: The path to the embeddings.
        :param neighbors_number: Number of neighbors of context and typo embeddings \
                                 to consider as candidates.
        :param edit_candidates: Number of the most frequent tokens among tokens on \
                                equal edit distance from the typo to consider as candidates.
        :param max_distance: Maximum edit distance for symspell lookup.
        :param radius: Maximum edit distance from typo allowed for candidates.
        :return: Nothing
        """
        self.generator.construct(vocabulary_file, frequencies_file, embeddings_file,
                                 neighbors_number, edit_candidates, max_distance, radius)

    def train(self, typos: pandas.DataFrame, candidates: pandas.DataFrame = None,
              save_candidates_file: str = None) -> None:
        """
        Train corrector on the given dataset of typos inside identifiers.

        :param typos: DataFrame containing columns "typo" and "identifier", \
                      column "token_split" is optional, but used when present.
        :param candidates: DataFrame with precalculated candidates.
        :param save_candidates_file: Path to file where to save the candidates.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        self.ranker.fit(typos[CORRECT_TOKEN_COLUMN], get_candidates_metadata(candidates),
                        get_candidates_features(candidates))

    def train_on_file(self, typos_file: str, candidates_file: str = None,
                      save_candidates_file: str = None) -> None:
        """
        Train corrector on the given file with typos inside identifiers.

        :param typos_file: CSV file with columns "typo" and "identifier", \
                           column "token_split" is optional, but used when present.
        :param candidates_file: Pickle dump of pandas.DataFrame with precalculated \
                                candidates and features.
        :param save_candidates_file: Path to file where to save the candidates.
        """
        typos = pandas.read_csv(typos_file, index_col=0)
        candidates = None
        if candidates_file is not None:
            candidates = pandas.read_pickle(candidates_file)
        self.train(typos, candidates, save_candidates_file)

    def suggest(self, typos: pandas.DataFrame, candidates: pandas.DataFrame = None,
                save_candidates_file: str = None, n_candidates: int = 3,
                return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for given typos.

        :param typos: DataFrame containing column "typo", \
                      column "token_split" is optional, but used when present.
        :param candidates: DataFrame with precalculated candidates.
        :param save_candidates_file: Path to file to save candidates to.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
                 by correctness probability in a descending order.
        """
        if candidates is None:
            candidates = self.generator.generate_candidates(
                typos, self.threads_number, save_candidates_file)
        return self.ranker.rank(get_candidates_metadata(candidates),
                                get_candidates_features(candidates), n_candidates, return_all)

    def suggest_file(self, typos_file: str, candidates_file: str = None,
                     save_candidates_file: str = None, n_candidates: int = 3,
                     return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for typos from the given file.

        :param typos_file: CSV file containing a DataFrame with column "typo", \
                           column "token_split" is optional, but used when present.
        :param candidates_file: Pickle dump of pandas.DataFrame with precalculated \
                                candidates and features.
        :param save_candidates_file: Path to file to save candidates to.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
                 by correctness probability in a descending order.
        """
        typos = pandas.read_csv(typos_file, index_col=0)
        candidates = None
        if candidates_file is not None:
            candidates = pandas.read_pickle(candidates_file)
        return self.suggest(typos, candidates, save_candidates_file, n_candidates, return_all)

    def suggest_by_batches(self, typos: pandas.DataFrame, n_candidates: int = 3,
                           return_all: bool = True, batch_size: int = 2048,
                           ) -> Dict[int, List[Tuple[str, float]]]:
        """
        Suggest corrections for given typos, processing the dataset by batches. \
        Does not support precalculated candidates.

        :param typos: DataFrame containing column "typo", \
                      column "token_split" is optional, but used when present.
        :param n_candidates: Number of most probable candidates to return.
        :param return_all: False to return suggestions only for corrected tokens.
        :param batch_size: Batch size.
        :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
                 by correctness probability in a descending order.
        """
        all_suggestions = []
        for i in tqdm(range(0, len(typos), batch_size)):
            batch = typos.loc[typos.index[i]:
                              typos.index[min(len(typos) - 1, i + batch_size - 1)], :]
            suggestions = self.suggest(batch, n_candidates=n_candidates,
                                       return_all=return_all)
            all_suggestions.append(suggestions.items())
        return dict(chain.from_iterable(all_suggestions))

    def __eq__(self, other: "TyposCorrector") -> bool:
        return self.generator == other.generator and self.ranker == other.ranker

    def dump(self) -> str:
        """Model.__str__ to format the object."""
        return ("# Generator\n"
                "%s\n\n"
                "# Ranker\n"
                "%s" % (self.generator.dump(), self.ranker.dump()))

    def _generate_tree(self) -> dict:
        return {"generator": self.generator._generate_tree(),
                "ranker": self.ranker._generate_tree()}

    def _load_tree(self, tree: dict) -> None:
        self.generator._load_tree(tree["generator"])
        self.ranker._load_tree(tree["ranker"])
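
# Sketch of batched inference with this oldest variant. suggest_by_batches() slices
# with .loc over positional index bounds, so a monotonic index such as the default
# RangeIndex is assumed; the frame contents and the model path are illustrative.
import pandas

typos = pandas.DataFrame({"typo": ["gwt", "tokem"],
                          "identifier": ["gwtValue", "tokemCount"]})
corrector = TyposCorrector().load("typos_correction.asdf")  # assumed pretrained dump
suggestions = corrector.suggest_by_batches(typos, n_candidates=3, batch_size=1024)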