def test_save_load(self):
    generator = CandidatesGenerator()
    generator.construct(VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE,
                        neighbors=3, edit_candidates=3, max_distance=3, radius=3)
    # Round-trip the generator through an in-memory buffer and check equality.
    with io.BytesIO() as buffer:
        generator.save(buffer)
        buffer.seek(0)
        generator2 = CandidatesGenerator().load(buffer)
    self.assertEqual(generator, generator2)
def test_generate_candidates(self):
    generator = CandidatesGenerator()
    generator.construct(VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE,
                        neighbors=3, edit_candidates=3, max_distance=3, radius=3)
    data = pandas.read_csv(str(TEST_DATA_PATH / "test_data.csv.xz"),
                           index_col=0).infer_objects()
    custom_data = pandas.DataFrame(
        [[["get", "tokens", "num"], "tokens", "tokens"],
         [["gwt", "tokens"], "gwt", "get"],
         [["get", "tokem"], "tokem", "token"]],
        columns=[SPLIT_COLUMN, TYPO_COLUMN, CORRECT_TOKEN_COLUMN])
    for test_data in [data, custom_data]:
        candidates = generator.generate_candidates(
            test_data, threads_number=1, start_pool_size=len(test_data) + 1)
        self.assertFalse(candidates.isnull().values.any())
        self.assertSetEqual(set(candidates[ID_COLUMN].values), set(test_data.index))
        self.assertSetEqual(set(candidates[TYPO_COLUMN].values),
                            set(test_data[TYPO_COLUMN].values))
def test_generate_candidates(self):
    generator = CandidatesGenerator()
    generator.construct(VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE,
                        neighbors=3, edit_candidates=3, max_distance=3, radius=3)
    data = pandas.read_csv(str(TEST_DATA_PATH / "test_data.csv.xz"),
                           index_col=0, keep_default_na=False)
    custom_data = pandas.DataFrame(
        [["get tokens num", "tokens", "tokens"],
         ["gwt tokens", "gwt", "get"],
         ["get tokem", "tokem", "token"]],
        columns=[Columns.Split, Columns.Token, Columns.CorrectToken])
    for test_data in [data, custom_data]:
        candidates = generator.generate_candidates(
            test_data, processes_number=1, start_pool_size=len(test_data) + 1,
            chunksize=1)
        self.assertFalse(candidates.isnull().values.any())
        self.assertSetEqual(set(candidates[Columns.Id].values), set(test_data.index))
        self.assertSetEqual(set(candidates[Columns.Token].values),
                            set(test_data[Columns.Token].values))
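# A minimal standalone sketch of the same API outside the test harness,
# assuming the Columns constants and module-level file constants used above;
# the typo token and config values here are hypothetical.
def _candidates_sketch():
    generator = CandidatesGenerator()
    generator.construct(VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE,
                        neighbors=3, edit_candidates=3, max_distance=3, radius=3)
    # One row per typo: the full split, the typo-ed token, and (optionally,
    # mirroring the test data) the known correction.
    typos = pandas.DataFrame(
        [["fucntion call", "fucntion", "function"]],  # hypothetical example
        columns=[Columns.Split, Columns.Token, Columns.CorrectToken])
    candidates = generator.generate_candidates(
        typos, processes_number=1, start_pool_size=len(typos) + 1, chunksize=1)
    # Per the assertions in the test above, the returned frame carries
    # Columns.Id and Columns.Token values covering every input row.
    return candidates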
def test_save_load(self):
    # Round-trip the shared generator through an in-memory buffer.
    with io.BytesIO() as buffer:
        self.generator.save(buffer)
        buffer.seek(0)
        generator2 = CandidatesGenerator().load(buffer)
    self.assertEqual(self.generator, generator2)
def __init__(self, **kwargs):
    """
    Initialize a new instance of TyposCorrector class.

    :param kwargs: Extra keyword arguments which are consumed by Model.
    """
    super().__init__(**kwargs)
    self.generator = CandidatesGenerator()
    self.ranker = CandidatesRanker()
def test_expand_vocabulary(self):
    generator = CandidatesGenerator()
    generator.construct(VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE, self.config)
    additional_tokens = {"a", "aaa", "123", "get", "341"}
    vocabulary = generator.tokens
    generator.expand_vocabulary(additional_tokens)
    self.assertSetEqual(generator.tokens, vocabulary.union(additional_tokens))
@classmethod
def setUpClass(cls):
    cls.config = {
        "neighbors_number": 3,
        "edit_dist_number": 3,
        "max_distance": 3,
    }
    cls.generator = CandidatesGenerator()
    cls.generator.construct(VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE, cls.config)
def __init__(self, ranking_config: Optional[Mapping[str, Any]] = None, **kwargs):
    """
    Initialize a new instance of TyposCorrector class.

    :param ranking_config: Ranking configuration, options:
                           train_rounds: Number of training rounds (int).
                           early_stopping: Early stopping parameter (int).
                           boost_param: Boosting parameters (dict).
    :param kwargs: Extra keyword arguments which are consumed by Model.
    """
    super().__init__(**kwargs)
    self.generator = CandidatesGenerator()
    self.ranker = CandidatesRanker(ranking_config)
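# A hedged usage sketch of the constructor above: the ranking_config keys
# mirror the docstring; the concrete values and the contents of boost_param
# are assumptions, not documented defaults.
corrector = TyposCorrector(ranking_config={
    "train_rounds": 1000,              # number of training rounds (hypothetical value)
    "early_stopping": 100,             # early stopping parameter (hypothetical value)
    "boost_param": {"max_depth": 6},   # boosting parameters dict (assumed shape)
})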
def test_expand_vocabulary(self):
    generator = CandidatesGenerator()
    generator.construct(VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE,
                        neighbors=3, edit_candidates=3, max_distance=3, radius=3)
    additional_tokens = {"a", "aaa", "123", "get", "341"}
    vocabulary = generator.tokens
    generator.expand_vocabulary(additional_tokens)
    self.assertSetEqual(generator.tokens, vocabulary.union(additional_tokens))