def test_prepare_data_with_load(self):
    with tempfile.TemporaryDirectory(prefix="lookout_typos_prepare_load_") as temp_dir:
        config = {
            "data_dir": temp_dir,
            "dataset_url": "https://docs.google.com/uc?export=download&"
                           "id=1htVU1UR0gSmopVbvU6_Oc-4iD0cw1ldo",
            "input_path": None,
            "raw_data_filename": "raw_test_data.csv.xz",
            "vocabulary_size": 10,
            "frequencies_size": 20,
            "vocabulary_filename": "vocabulary.csv",
            "frequencies_filename": "frequencies.csv",
        }
        data = prepare_data(config)
        vocabulary = read_vocabulary(os.path.join(temp_dir, config["vocabulary_filename"]))
        self.assertEqual(len(vocabulary), config["vocabulary_size"])
        self.assertTrue(set(vocabulary).issubset(set(data[Columns.Token])))
        frequencies = read_frequencies(os.path.join(temp_dir, config["frequencies_filename"]))
        self.assertEqual(len(frequencies), config["frequencies_size"])
        self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
        self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
def test_read_functions(self):
    vocabulary = read_vocabulary(join(TEST_DATA_PATH, "test_frequencies.csv.xz"))
    frequencies = read_frequencies(join(TEST_DATA_PATH, "test_frequencies.csv.xz"))
    self.assertEqual(len(vocabulary), 100)
    self.assertSetEqual(set(vocabulary), set(frequencies.keys()))
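# NOTE: a minimal sketch of what the two readers above consume, based on the
# construct() docstrings below: the frequencies file holds "token count" per
# line, and the first token of every line split forms the vocabulary. The lzma
# usage is an assumption driven by the ".csv.xz" test fixtures; the real
# read_vocabulary/read_frequencies implementations may differ.
import lzma
from typing import Dict, List


def read_frequencies_sketch(path: str) -> Dict[str, int]:
    # Each line is expected to hold two whitespace-separated values: "token count".
    frequencies = {}
    with lzma.open(path, "rt") as f:
        for line in f:
            token, count = line.split()
            frequencies[token] = int(count)
    return frequencies


def read_vocabulary_sketch(path: str) -> List[str]:
    # The first token of every line split is taken as a vocabulary entry.
    with lzma.open(path, "rt") as f:
        return [line.split()[0] for line in f]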
def test_prepare_data_from_file(self):
    temp_dir = mkdtemp()
    params = {
        "data_dir": temp_dir,
        "input_path": str(pathlib.Path(__file__).parent / "raw_test_data.csv.xz"),
        "vocabulary_size": 10,
        "frequencies_size": 20,
        "vocabulary_filename": "vocabulary.csv",
        "frequencies_filename": "frequencies.csv",
    }
    data = prepare_data(params)
    vocabulary = read_vocabulary(os.path.join(temp_dir, params["vocabulary_filename"]))
    self.assertEqual(len(vocabulary), params["vocabulary_size"])
    self.assertTrue(set(data[Columns.Token]).issubset(set(vocabulary)))
    frequencies = read_frequencies(os.path.join(temp_dir, params["frequencies_filename"]))
    self.assertEqual(len(frequencies), params["frequencies_size"])
    self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
    self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
    shutil.rmtree(temp_dir)
def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
              neighbors: int = DEFAULT_NEIGHBORS_NUMBER,
              edit_candidates: int = DEFAULT_EDIT_DISTANCE,
              max_distance: int = DEFAULT_MAX_DISTANCE, radius: int = DEFAULT_RADIUS,
              max_corrected_length: int = 12) -> None:
    """
    Construct correction candidates generator.

    :param vocabulary_file: Text file used to generate vocabulary of correction candidates. \
                            First token in every line split is added to the vocabulary.
    :param frequencies_file: Path to the text file with frequencies. Each line must be two \
                             values separated with a whitespace: "token count".
    :param embeddings_file: Path to the dump of the FastText model.
    :param neighbors: Number of neighbors of context and typo embeddings \
                      to consider as candidates.
    :param edit_candidates: Number of the most frequent tokens among tokens at \
                            an equal edit distance from the typo to consider as candidates.
    :param max_distance: Maximum edit distance for the symspell candidates lookup.
    :param radius: Maximum edit distance from the typo allowed for candidates.
    :param max_corrected_length: Maximum length of the prefix in which the symspell \
                                 lookup for typos is conducted.
    """
    self.checker = SymSpell(max_dictionary_edit_distance=max_distance,
                            prefix_length=max_corrected_length)
    self.checker.load_dictionary(vocabulary_file)
    self.wv = FastText.load_fasttext_format(embeddings_file).wv
    self.neighbors_number = neighbors
    self.edit_candidates_number = edit_candidates
    self.max_distance = max_distance
    self.radius = radius
    self.tokens = read_vocabulary(vocabulary_file)
    self.frequencies = read_frequencies(frequencies_file)
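# NOTE: a hypothetical call site for the signature above. The class name
# CandidatesGenerator, the file paths, and the numeric values are assumptions
# for illustration; only the parameter names come from the source.
generator = CandidatesGenerator()
generator.construct(
    vocabulary_file="vocabulary.csv",    # candidate tokens, one per line
    frequencies_file="frequencies.csv",  # "token count" per line
    embeddings_file="fasttext.bin",      # FastText model dump
    neighbors=20,
    edit_candidates=20,
    max_distance=2,
    radius=3,
    max_corrected_length=12,
)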
def test_prepare_data_from_file(self):
    with tempfile.TemporaryDirectory(prefix="lookout_typos_prepare_local_") as temp_dir:
        config = {
            "data_dir": temp_dir,
            "input_path": str(TEST_DATA_DIR / "raw_test_data.csv.xz"),
            "vocabulary_size": 10,
            "frequencies_size": 20,
            "vocabulary_filename": "vocabulary.csv",
            "frequencies_filename": "frequencies.csv",
        }
        data = prepare_data(config)
        vocabulary = read_vocabulary(os.path.join(temp_dir, config["vocabulary_filename"]))
        self.assertEqual(len(vocabulary), config["vocabulary_size"])
        self.assertTrue(set(data[Columns.Token]).issubset(set(vocabulary)))
        frequencies = read_frequencies(os.path.join(temp_dir, config["frequencies_filename"]))
        self.assertEqual(len(frequencies), config["frequencies_size"])
        self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
        self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
              config: Optional[Mapping[str, Any]] = None) -> None:
    """
    Construct correction candidates generator.

    :param vocabulary_file: Text file used to generate vocabulary of correction \
                            candidates. First token in every line split is added \
                            to the vocabulary.
    :param frequencies_file: Path to the text file with frequencies. Each line must \
                             be two values separated with a whitespace: "token count".
    :param embeddings_file: Path to the dump of the FastText model.
    :param config: Candidates generation configuration, options:
                   neighbors_number: Number of neighbors of context and typo embeddings \
                                     to consider as candidates (int).
                   edit_dist_number: Number of the most frequent tokens among tokens at \
                                     an equal edit distance from the typo to consider as \
                                     candidates (int).
                   max_distance: Maximum edit distance for the symspell candidates \
                                 lookup (int).
                   radius: Maximum edit distance from the typo allowed for candidates (int).
                   max_corrected_length: Maximum length of the prefix in which the \
                                         symspell lookup for typos is conducted (int).
                   start_pool_size: Minimum size of the data for which multiprocessing \
                                    is used (int).
                   chunksize: Maximum size of a chunk for one process during \
                              multiprocessing (int).
    """
    self.set_config(config)
    self.checker = SymSpell(max_dictionary_edit_distance=self.config["max_distance"],
                            prefix_length=self.config["max_corrected_length"])
    self.checker.load_dictionary(vocabulary_file)
    self.wv = FastText.load_fasttext_format(embeddings_file).wv
    self.tokens = set(read_vocabulary(vocabulary_file))
    self.frequencies = read_frequencies(frequencies_file)
    self.min_freq = min(self.frequencies.values())
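# NOTE: with the mapping-based signature above, the same setup collapses into a
# config dict. The keys come from the docstring; the values and the class name
# CandidatesGenerator are placeholders, not defaults from the source. Omitted
# keys presumably fall back to defaults inside set_config().
config = {
    "neighbors_number": 20,
    "edit_dist_number": 20,
    "max_distance": 2,
    "radius": 3,
    "max_corrected_length": 12,
    "start_pool_size": 64,
    "chunksize": 256,
}
generator = CandidatesGenerator()
generator.construct("vocabulary.csv", "frequencies.csv", "fasttext.bin", config=config)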