예제 #1
0
    def test_save_load(self):
        """
        Check that a constructed CandidatesGenerator survives a save/load round trip.

        The generator is saved into an in-memory buffer, loaded back through a fresh
        CandidatesGenerator instance and compared with the original for equality.
        """
        generator = CandidatesGenerator()
        generator.construct(VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE,
                            neighbors=3, edit_candidates=3, max_distance=3, radius=3)
        with io.BytesIO() as buffer:
            generator.save(buffer)
            # Rewind so that load() starts reading at the beginning of the buffer.
            buffer.seek(0)
            generator2 = CandidatesGenerator().load(buffer)

        # assertEqual reports both operands on failure, unlike assertTrue(a == b).
        self.assertEqual(generator, generator2)
예제 #2
0
    def test_generate_candidates(self):
        """
        Check generate_candidates on the bundled test data and on a small custom frame.

        For each input frame the produced candidates must contain no null values, and the
        sets of ids and typo tokens in the output must match the input's index and typo
        column respectively.
        """
        generator = CandidatesGenerator()
        generator.construct(VOCABULARY_FILE,
                            VOCABULARY_FILE,
                            FASTTEXT_DUMP_FILE,
                            neighbors=3,
                            edit_candidates=3,
                            max_distance=3,
                            radius=3)

        data = pandas.read_csv(str(TEST_DATA_PATH / "test_data.csv.xz"),
                               index_col=0).infer_objects()
        # Each row supplies values for SPLIT_COLUMN, TYPO_COLUMN and CORRECT_TOKEN_COLUMN.
        custom_data = pandas.DataFrame(
            [[["get", "tokens", "num"], "tokens", "tokens"],
             [["gwt", "tokens"], "gwt", "get"],
             [["get", "tokem"], "tokem", "token"]],
            columns=[SPLIT_COLUMN, TYPO_COLUMN, CORRECT_TOKEN_COLUMN])
        for test_data in [data, custom_data]:
            candidates = generator.generate_candidates(
                test_data,
                threads_number=1,
                start_pool_size=len(test_data) + 1)
            self.assertFalse(candidates.isnull().values.any())
            # Sets are unordered and not indexable, so compare them with assertSetEqual;
            # assertSequenceEqual would produce a misleading diff on mismatch.
            self.assertSetEqual(set(candidates[ID_COLUMN].values),
                                set(test_data.index))
            self.assertSetEqual(set(candidates[TYPO_COLUMN].values),
                                set(test_data[TYPO_COLUMN].values))
예제 #3
0
    def test_generate_candidates(self):
        """
        Run generate_candidates over the bundled dataset and over a hand-made frame.

        Both results must be free of null values and must contain exactly the ids found in
        the input index and exactly the tokens found in the input token column.
        """
        candidates_generator = CandidatesGenerator()
        candidates_generator.construct(
            VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE,
            neighbors=3, edit_candidates=3, max_distance=3, radius=3)

        dataset_path = str(TEST_DATA_PATH / "test_data.csv.xz")
        file_frame = pandas.read_csv(dataset_path, index_col=0, keep_default_na=False)

        # One row per sample, values ordered as Columns.Split, Columns.Token,
        # Columns.CorrectToken.
        handmade_rows = [
            ["get tokens num", "tokens", "tokens"],
            ["gwt tokens", "gwt", "get"],
            ["get tokem", "tokem", "token"],
        ]
        handmade_frame = pandas.DataFrame(
            handmade_rows,
            columns=[Columns.Split, Columns.Token, Columns.CorrectToken])

        for frame in (file_frame, handmade_frame):
            produced = candidates_generator.generate_candidates(
                frame, processes_number=1, start_pool_size=len(frame) + 1, chunksize=1)

            contains_nulls = produced.isnull().values.any()
            self.assertFalse(contains_nulls)

            produced_ids = set(produced[Columns.Id].values)
            self.assertSetEqual(produced_ids, set(frame.index))

            produced_tokens = set(produced[Columns.Token].values)
            self.assertSetEqual(produced_tokens, set(frame[Columns.Token].values))
예제 #4
0
    def test_save_load(self):
        """
        Check that the shared generator survives a save/load round trip.

        self.generator is saved into an in-memory buffer, loaded back through a fresh
        CandidatesGenerator instance and compared with the original for equality.
        """
        with io.BytesIO() as buffer:
            self.generator.save(buffer)
            # Rewind so that load() starts reading at the beginning of the buffer.
            buffer.seek(0)
            generator2 = CandidatesGenerator().load(buffer)

        # assertEqual reports both operands on failure, unlike assertTrue(a == b).
        self.assertEqual(self.generator, generator2)
예제 #5
0
    def __init__(self, **kwargs):
        """
        Create a TyposCorrector with a fresh candidates generator and ranker.

        :param kwargs: Additional keyword arguments, forwarded unchanged to the parent \
                       initializer.
        """
        super().__init__(**kwargs)
        # Both components start out in their default-constructed state.
        self.generator, self.ranker = CandidatesGenerator(), CandidatesRanker()
예제 #6
0
 def test_expand_vocabulary(self):
     """
     Check that expand_vocabulary adds the given tokens to the generator's vocabulary.

     After the call, generator.tokens must equal the previous vocabulary united with
     the additional tokens.
     """
     generator = CandidatesGenerator()
     generator.construct(VOCABULARY_FILE, VOCABULARY_FILE,
                         FASTTEXT_DUMP_FILE, self.config)
     additional_tokens = {"a", "aaa", "123", "get", "341"}
     # Take a snapshot: if expand_vocabulary mutates generator.tokens in place, a bare
     # reference would alias the expanded set and the assertion could never fail.
     vocabulary = set(generator.tokens)
     generator.expand_vocabulary(additional_tokens)
     self.assertSetEqual(generator.tokens,
                         vocabulary.union(additional_tokens))
예제 #7
0
 def setUpClass(cls):
     """
     Build one CandidatesGenerator for the whole test class and store it on the class.

     The configuration used for construction is kept as cls.config, the constructed
     generator as cls.generator.

     NOTE(review): no @classmethod decorator is visible above this definition - confirm
     that the enclosing class applies it.
     """
     cls.config = {"neighbors_number": 3, "edit_dist_number": 3, "max_distance": 3}
     shared_generator = CandidatesGenerator()
     shared_generator.construct(
         VOCABULARY_FILE, VOCABULARY_FILE, FASTTEXT_DUMP_FILE, cls.config)
     cls.generator = shared_generator
예제 #8
0
    def __init__(self, ranking_config: Optional[Mapping[str, Any]] = None, **kwargs):
        """
        Create a TyposCorrector with a fresh candidates generator and a configured ranker.

        :param ranking_config: Configuration passed to the ranker. Supported options:
                                train_rounds: Number of training rounds (int).
                                early_stopping: Early stopping parameter (int).
                                boost_param: Boosting parameters (dict).
        :param kwargs: Additional keyword arguments, forwarded unchanged to the parent \
                       initializer.
        """
        super().__init__(**kwargs)
        candidates_generator = CandidatesGenerator()
        candidates_ranker = CandidatesRanker(ranking_config)
        self.generator = candidates_generator
        self.ranker = candidates_ranker
예제 #9
0
 def test_expand_vocabulary(self):
     """
     Check that expand_vocabulary adds the given tokens to the generator's vocabulary.

     After the call, generator.tokens must equal the previous vocabulary united with
     the additional tokens.
     """
     generator = CandidatesGenerator()
     generator.construct(VOCABULARY_FILE,
                         VOCABULARY_FILE,
                         FASTTEXT_DUMP_FILE,
                         neighbors=3,
                         edit_candidates=3,
                         max_distance=3,
                         radius=3)
     additional_tokens = {"a", "aaa", "123", "get", "341"}
     # Take a snapshot: if expand_vocabulary mutates generator.tokens in place, a bare
     # reference would alias the expanded set and the assertion could never fail.
     vocabulary = set(generator.tokens)
     generator.expand_vocabulary(additional_tokens)
     self.assertSetEqual(generator.tokens,
                         vocabulary.union(additional_tokens))