def test_ranker(self):
    """Ranking the full candidates table must yield suggestions for every data row."""
    test_data = pandas.read_csv(
        join(TEST_DATA_PATH, "test_data.csv.xz"), index_col=0).infer_objects()
    cands = pandas.read_csv(join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz"))
    # Features are stored in the CSV as a stringified array ("[0.1 0.2 ...]");
    # convert each cell back into a list of floats.
    cands.loc[:, Columns.Features] = cands[Columns.Features].apply(
        lambda raw: [float(tok) for tok in raw[1:-1].split()])
    ranker = CandidatesRanker()
    ranker.fit(test_data[Columns.CorrectToken], get_candidates_metadata(cands),
               get_candidates_features(cands))
    ranked = ranker.rank(get_candidates_metadata(cands), get_candidates_features(cands),
                         n_candidates=3, return_all=True)
    self.assertSetEqual(set(ranked.keys()), set(test_data.index))
def test_ranker(self):
    """Ranking pickled candidates must yield suggestions for every data row."""
    test_data = pandas.read_csv(
        join(TEST_DATA_PATH, "test_data.csv.xz"), index_col=0).infer_objects()
    cands = pandas.read_pickle(join(TEST_DATA_PATH, "test_data_candidates_full.pkl"))
    ranker = CandidatesRanker()
    ranker.fit(test_data[CORRECT_TOKEN_COLUMN], get_candidates_metadata(cands),
               get_candidates_features(cands))
    ranked = ranker.rank(get_candidates_metadata(cands), get_candidates_features(cands),
                         n_candidates=3, return_all=True)
    self.assertSetEqual(set(ranked.keys()), set(test_data.index))
def test_eq(self):
    """Two fresh rankers compare equal; a fitted ranker differs from a fresh one."""
    self.assertTrue(CandidatesRanker() == CandidatesRanker())
    test_data = pandas.read_csv(
        join(TEST_DATA_PATH, "test_data.csv.xz"), index_col=0).infer_objects()
    cands = pandas.read_csv(join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz"))
    # Parse the stringified feature arrays back into lists of floats.
    cands.loc[:, Columns.Features] = cands[Columns.Features].apply(
        lambda raw: [float(tok) for tok in raw[1:-1].split()])
    ranker = CandidatesRanker()
    ranker.fit(test_data[Columns.CorrectToken], get_candidates_metadata(cands),
               get_candidates_features(cands))
    self.assertFalse(ranker == CandidatesRanker())
def test_eq(self):
    """Two fresh rankers compare equal; a fitted ranker differs from a fresh one."""
    self.assertTrue(CandidatesRanker() == CandidatesRanker())
    test_data = pandas.read_csv(
        join(TEST_DATA_PATH, "test_data.csv.xz"), index_col=0).infer_objects()
    cands = pandas.read_pickle(join(TEST_DATA_PATH, "test_data_candidates_full.pkl"))
    ranker = CandidatesRanker()
    ranker.fit(test_data[CORRECT_TOKEN_COLUMN], get_candidates_metadata(cands),
               get_candidates_features(cands))
    self.assertFalse(ranker == CandidatesRanker())
def test_save_load(self):
    """A fitted ranker serialized to a buffer must deserialize to an equal ranker."""
    test_data = pandas.read_csv(
        join(TEST_DATA_PATH, "test_data.csv.xz"), index_col=0).infer_objects()
    cands = pandas.read_csv(join(TEST_DATA_PATH, "test_data_candidates_full.csv.xz"))
    # Parse the stringified feature arrays back into lists of floats.
    cands.loc[:, Columns.Features] = cands[Columns.Features].apply(
        lambda raw: [float(tok) for tok in raw[1:-1].split()])
    ranker = CandidatesRanker()
    ranker.fit(test_data[Columns.CorrectToken], get_candidates_metadata(cands),
               get_candidates_features(cands))
    with io.BytesIO() as buffer:
        ranker.save(output=buffer, series="typos-analyzer")
        print(buffer.tell())
        buffer.seek(0)
        restored = CandidatesRanker().load(buffer)
        print(ranker)
        self.assertTrue(ranker == restored)
def test_save_load(self):
    """A fitted ranker serialized to a buffer must deserialize to an equal ranker."""
    test_data = pandas.read_csv(
        join(TEST_DATA_PATH, "test_data.csv.xz"), index_col=0).infer_objects()
    cands = pandas.read_pickle(join(TEST_DATA_PATH, "test_data_candidates_full.pkl"))
    ranker = CandidatesRanker()
    ranker.fit(test_data[CORRECT_TOKEN_COLUMN], get_candidates_metadata(cands),
               get_candidates_features(cands))
    with io.BytesIO() as buffer:
        ranker.save(buffer)
        print(buffer.tell())
        buffer.seek(0)
        restored = CandidatesRanker().load(buffer)
        print(ranker)
        self.assertTrue(ranker == restored)
def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
          save_candidates_file: Optional[str] = None) -> None:
    """
    Train corrector on tokens from the given dataset.

    :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                 and Columns.Split.
    :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
    :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
    """
    if candidates is None:
        candidates = self.generator.generate_candidates(
            data, self.threads_number, save_candidates_file)
    else:
        # keep_default_na=False keeps tokens like "nan" or "null" as strings
        # instead of parsing them into float NaN, consistently with the other
        # train/suggest variants in this file.
        candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
    self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                    get_candidates_features(candidates))
def train(self, typos: pandas.DataFrame, candidates: Optional[pandas.DataFrame] = None,
          save_candidates_file: Optional[str] = None) -> None:
    """
    Train corrector on the given dataset of typos inside identifiers.

    :param typos: DataFrame containing columns "typo" and "identifier"; \
                  column "token_split" is optional, but used when present.
    :param candidates: DataFrame with precalculated candidates; generated when None.
    :param save_candidates_file: Path to file where to save the candidates.
    """
    # Implicit-Optional hints (`x: T = None`) were made explicit per PEP 484;
    # runtime behavior is unchanged.
    if candidates is None:
        candidates = self.generator.generate_candidates(
            typos, self.threads_number, save_candidates_file)
    self.ranker.fit(typos[CORRECT_TOKEN_COLUMN], get_candidates_metadata(candidates),
                    get_candidates_features(candidates))
def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
          save_candidates_file: Optional[str] = None) -> None:
    """
    Train corrector on tokens from the given dataset.

    :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                 and Columns.Split.
    :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
    :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
    """
    self._log.info("train input shape: %s", data.shape)
    if candidates is None:
        self._log.info("candidates were not provided and will be generated")
        candidates = self.generator.generate_candidates(
            data, self.processes_number, save_candidates_file)
    else:
        # Bug fix: keep the path in a separate name so the log line reports the
        # file path; the original logged after rebinding `candidates` to the
        # loaded DataFrame and thus dumped the whole DataFrame into the log.
        candidates_path = candidates
        candidates = pandas.read_csv(candidates_path, index_col=0, keep_default_na=False)
        self._log.info("loaded candidates from %s", candidates_path)
    self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                    get_candidates_features(candidates))
def suggest(self, data: pandas.DataFrame, candidates: Optional[str] = None,
            save_candidates_file: Optional[str] = None, n_candidates: int = 3,
            return_all: bool = True) -> Dict[int, List[Candidate]]:
    """
    Suggest corrections for the tokens from the given dataset.

    :param data: DataFrame which contains columns Columns.Token and Columns.Split.
    :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
    :param save_candidates_file: Path to file to save candidates to (.csv.xz).
    :param n_candidates: Number of most probable candidates to return.
    :param return_all: False to return suggestions only for corrected tokens.
    :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are \
             sorted by correctness probability in a descending order.
    """
    if candidates is not None:
        # keep_default_na=False so tokens such as "nan" stay strings.
        candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
    else:
        candidates = self.generator.generate_candidates(
            data, self.processes_number, save_candidates_file)
    metadata = get_candidates_metadata(candidates)
    features = get_candidates_features(candidates)
    return self.ranker.rank(metadata, features, n_candidates, return_all)
def suggest(self, data: pandas.DataFrame, candidates: Optional[str] = None,
            save_candidates_file: Optional[str] = None, n_candidates: int = 3,
            return_all: bool = True, start_pool_size: int = DEFAULT_START_POOL_SIZE,
            chunksize: int = DEFAULT_CHUNKSIZE) -> Dict[int, List[Tuple[str, float]]]:
    """
    Suggest corrections for the tokens from the given dataset.

    :param data: DataFrame which contains columns Columns.Token and Columns.Split.
    :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
    :param save_candidates_file: Path to file to save candidates to (.csv.xz).
    :param n_candidates: Number of most probable candidates to return.
    :param return_all: False to return suggestions only for corrected tokens.
    :param start_pool_size: Length of data, starting from which multiprocessing is desired.
    :param chunksize: Max size of a chunk for one process during multiprocessing.
    :return: Dictionary `{id : [(candidate, correctness_proba), ...]}`, candidates are \
             sorted by correctness probability in a descending order.
    """
    if candidates is not None:
        # keep_default_na=False so tokens such as "nan" stay strings.
        candidates = pandas.read_csv(candidates, index_col=0, keep_default_na=False)
    else:
        candidates = self.generator.generate_candidates(
            data, self.processes_number, start_pool_size, chunksize,
            save_candidates_file)
    metadata = get_candidates_metadata(candidates)
    features = get_candidates_features(candidates)
    return self.ranker.rank(metadata, features, n_candidates, return_all)
def train(self, data: pandas.DataFrame, candidates: Optional[str] = None,
          save_candidates_file: Optional[str] = None,
          start_pool_size: int = DEFAULT_START_POOL_SIZE,
          chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    Train corrector on tokens from the given dataset.

    :param data: DataFrame which contains columns Columns.Token, Columns.CorrectToken, \
                 and Columns.Split.
    :param candidates: A .csv.xz dump of a dataframe with precalculated candidates.
    :param save_candidates_file: Path to file where to save the candidates (.csv.xz).
    :param start_pool_size: Length of data, starting from which multiprocessing is desired.
    :param chunksize: Max size of a chunk for one process during multiprocessing.
    """
    self._log.info("train input shape: %s", data.shape)
    if candidates is None:
        self._log.info("candidates were not provided and will be generated")
        candidates = self.generator.generate_candidates(
            data, self.processes_number, start_pool_size, chunksize,
            save_candidates_file)
    else:
        # Bug fix: keep the path in a separate name so the log line reports the
        # file path; the original logged after rebinding `candidates` to the
        # loaded DataFrame and thus dumped the whole DataFrame into the log.
        candidates_path = candidates
        candidates = pandas.read_csv(candidates_path, index_col=0, keep_default_na=False)
        self._log.info("loaded candidates from %s", candidates_path)
    self.ranker.fit(data[Columns.CorrectToken], get_candidates_metadata(candidates),
                    get_candidates_features(candidates))
def suggest(self, typos: pandas.DataFrame, candidates: pandas.DataFrame = None,
            save_candidates_file: str = None, n_candidates: int = 3,
            return_all: bool = True) -> Dict[int, List[Tuple[str, float]]]:
    """
    Suggest corrections for given typos.

    :param typos: DataFrame containing column "typo"; \
                  column "token_split" is optional, but used when present.
    :param candidates: DataFrame with precalculated candidates; generated when None.
    :param save_candidates_file: Path to file to save candidates to.
    :param n_candidates: Number of most probable candidates to return.
    :param return_all: False to return suggestions only for corrected tokens.
    :return: Dictionary {id : [[candidate, correctness_proba]]}, candidates are sorted \
             by correctness probability in a descending order.
    """
    if candidates is None:
        candidates = self.generator.generate_candidates(
            typos, self.threads_number, save_candidates_file)
    metadata = get_candidates_metadata(candidates)
    features = get_candidates_features(candidates)
    return self.ranker.rank(metadata, features, n_candidates, return_all)
def test_get_candidates_features(self):
    """Feature extraction must return exactly the float32 matrix stored in the fixture."""
    expected = numpy.array([[0.1, 0.2], [0.8, 0.3], [0.1, 0.5]], dtype="float32")
    assert_array_equal(get_candidates_features(self.candidates), expected)