def test_read_functions(self):
    """The vocabulary and frequencies readers must agree on the same token set."""
    data_path = join(TEST_DATA_PATH, "test_frequencies.csv.xz")
    vocabulary = read_vocabulary(data_path)
    frequencies = read_frequencies(data_path)
    self.assertEqual(len(vocabulary), 100)
    self.assertSetEqual(set(vocabulary), set(frequencies.keys()))
def test_prepare_data_with_load(self):
    """Downloading the raw dataset must yield consistent vocabulary/frequencies files."""
    with tempfile.TemporaryDirectory(prefix="lookout_typos_prepare_load_") as temp_dir:
        params = {
            "data_dir": temp_dir,
            "dataset_url": "https://docs.google.com/uc?export=download&"
                           "id=1htVU1UR0gSmopVbvU6_Oc-4iD0cw1ldo",
            "input_path": None,
            "raw_data_filename": "raw_test_data.csv.xz",
            "vocabulary_size": 10,
            "frequencies_size": 20,
            "vocabulary_filename": "vocabulary.csv",
            "frequencies_filename": "frequencies.csv",
        }
        data = prepare_data(params)
        vocabulary_path = os.path.join(temp_dir, params["vocabulary_filename"])
        frequencies_path = os.path.join(temp_dir, params["frequencies_filename"])
        # Vocabulary: requested size, and every token occurs in the prepared data.
        vocabulary = read_vocabulary(vocabulary_path)
        self.assertEqual(len(vocabulary), params["vocabulary_size"])
        self.assertTrue(set(vocabulary).issubset(set(data[Columns.Token])))
        # Frequencies: requested size, and they cover the whole vocabulary.
        frequencies = read_frequencies(frequencies_path)
        self.assertEqual(len(frequencies), params["frequencies_size"])
        self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
        self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
def test_prepare_data_from_file(self):
    """prepare_data() on a local raw dataset must write vocabulary and frequencies files.

    Fix: the temporary directory was previously removed only after all assertions
    passed, so any test failure leaked it on disk.  The cleanup now runs in a
    ``finally`` clause and happens on every exit path.
    """
    temp_dir = mkdtemp()
    try:
        params = {
            "data_dir": temp_dir,
            "input_path": str(pathlib.Path(__file__).parent / "raw_test_data.csv.xz"),
            "vocabulary_size": 10,
            "frequencies_size": 20,
            "vocabulary_filename": "vocabulary.csv",
            "frequencies_filename": "frequencies.csv",
        }
        data = prepare_data(params)
        # Vocabulary: requested size, and it covers every token of the prepared data.
        vocabulary = read_vocabulary(os.path.join(temp_dir, params["vocabulary_filename"]))
        self.assertEqual(len(vocabulary), params["vocabulary_size"])
        self.assertTrue(set(data[Columns.Token]).issubset(set(vocabulary)))
        # Frequencies: requested size, and they cover the whole vocabulary.
        frequencies = read_frequencies(os.path.join(temp_dir, params["frequencies_filename"]))
        self.assertEqual(len(frequencies), params["frequencies_size"])
        self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
        self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
    finally:
        # Remove the scratch directory even when an assertion above fails.
        shutil.rmtree(temp_dir)
def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
              neighbors: int = DEFAULT_NEIGHBORS_NUMBER,
              edit_candidates: int = DEFAULT_EDIT_DISTANCE,
              max_distance: int = DEFAULT_MAX_DISTANCE,
              radius: int = DEFAULT_RADIUS,
              max_corrected_length: int = 12) -> None:
    """
    Build the correction-candidates generator from its on-disk resources.

    :param vocabulary_file: Text file used to generate vocabulary of correction \
        candidates; the first token of every line split joins the vocabulary.
    :param frequencies_file: Path to the text file with frequencies; each line must be \
        two values separated with a whitespace: "token count".
    :param embeddings_file: Path to the dump of FastText model.
    :param neighbors: Number of neighbors of context and typo embeddings to consider \
        as candidates.
    :param edit_candidates: Number of the most frequent tokens among tokens on equal \
        edit distance from the typo to consider as candidates.
    :param max_distance: Maximum edit distance for symspell lookup for candidates.
    :param radius: Maximum edit distance from typo allowed for candidates.
    :param max_corrected_length: Maximum length of prefix in which symspell lookup \
        for typos is conducted.
    """
    # Plain numeric knobs.
    self.neighbors_number = neighbors
    self.edit_candidates_number = edit_candidates
    self.max_distance = max_distance
    self.radius = radius
    # Symspell index over the vocabulary for edit-distance lookups.
    self.checker = SymSpell(max_dictionary_edit_distance=max_distance,
                            prefix_length=max_corrected_length)
    self.checker.load_dictionary(vocabulary_file)
    # Only the word vectors of the FastText dump are retained.
    self.wv = FastText.load_fasttext_format(embeddings_file).wv
    self.tokens = read_vocabulary(vocabulary_file)
    self.frequencies = read_frequencies(frequencies_file)
def test_prepare_data_from_file(self):
    """prepare_data() on a local raw dataset must produce consistent artifacts."""
    with tempfile.TemporaryDirectory(prefix="lookout_typos_prepare_local_") as temp_dir:
        params = {
            "data_dir": temp_dir,
            "input_path": str(TEST_DATA_DIR / "raw_test_data.csv.xz"),
            "vocabulary_size": 10,
            "frequencies_size": 20,
            "vocabulary_filename": "vocabulary.csv",
            "frequencies_filename": "frequencies.csv",
        }
        data = prepare_data(params)
        # Vocabulary: requested size, and it covers every token of the prepared data.
        vocabulary_path = os.path.join(temp_dir, params["vocabulary_filename"])
        vocabulary = read_vocabulary(vocabulary_path)
        self.assertEqual(len(vocabulary), params["vocabulary_size"])
        self.assertTrue(set(data[Columns.Token]).issubset(set(vocabulary)))
        # Frequencies: requested size, and they cover the whole vocabulary.
        frequencies_path = os.path.join(temp_dir, params["frequencies_filename"])
        frequencies = read_frequencies(frequencies_path)
        self.assertEqual(len(frequencies), params["frequencies_size"])
        self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
        self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
def construct(self, vocabulary_file: str, frequencies_file: str, embeddings_file: str,
              config: Optional[Mapping[str, Any]] = None) -> None:
    """
    Build the correction-candidates generator from its on-disk resources.

    :param vocabulary_file: Text file used to generate vocabulary of correction \
        candidates; the first token of every line split joins the vocabulary.
    :param frequencies_file: Path to the text file with frequencies; each line must \
        be two values separated with a whitespace: "token count".
    :param embeddings_file: Path to the dump of FastText model.
    :param config: Candidates generation configuration, options:
                   neighbors_number: Number of neighbors of context and typo embeddings \
                                     to consider as candidates (int).
                   edit_dist_number: Number of the most frequent tokens among tokens on \
                                     equal edit distance from the typo to consider as \
                                     candidates (int).
                   max_distance: Maximum edit distance for symspell lookup for \
                                 candidates (int).
                   radius: Maximum edit distance from typo allowed for candidates (int).
                   max_corrected_length: Maximum length of prefix in which symspell \
                                         lookup for typos is conducted (int).
                   start_pool_size: Length of data, starting from which multiprocessing \
                                    is desired (int).
                   chunksize: Max size of a chunk for one process during \
                              multiprocessing (int).
    """
    # Merge the supplied options into self.config before anything reads it.
    self.set_config(config)
    # Symspell index for edit-distance candidate lookups, sized from the config.
    self.checker = SymSpell(
        max_dictionary_edit_distance=self.config["max_distance"],
        prefix_length=self.config["max_corrected_length"])
    self.checker.load_dictionary(vocabulary_file)
    # Only the word vectors of the FastText dump are retained.
    self.wv = FastText.load_fasttext_format(embeddings_file).wv
    self.tokens = set(read_vocabulary(vocabulary_file))
    self.frequencies = read_frequencies(frequencies_file)
    # Cache the smallest observed frequency (frequencies are loaded just above).
    self.min_freq = min(self.frequencies.values())
def generate_vocabulary(frequencies_path: str, config: Mapping[str, Any]) -> Dict[str, int]:
    """
    Compose vocabulary from a set of tokens with known frequencies.

    Filtering of the input tokens depends on their frequencies and edit distances between
    them. All found English words and tokens that the algorithm considers word-like are
    added regardless of their frequencies.

    :param frequencies_path: Path to the .csv file with space-separated word-frequency \
                             pairs one-per-line.
    :param config: Configuration for the vocabulary creation:
                   stable: How many tokens, which don't have more frequent \
                           edit-distance-neighbors, to take into the vocabulary.
                   suspicious: How many tokens, whose more frequent \
                               edit-distance-neighbor is an English word, to take into \
                               the vocabulary.
                   non_suspicious: How many tokens, whose more frequent \
                                   edit-distance-neighbor is not an English word, to \
                                   take into the vocabulary.
    :return: Dictionary with the vocabulary tokens as keys and their corresponding \
             frequencies as values.
    """
    # Symspell index over the frequency list, used to find close neighbors of each token.
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=100)
    checker.load_dictionary(frequencies_path)
    frequencies = read_frequencies(frequencies_path)
    # Most frequent tokens first.
    sorted_frequencies = sorted(frequencies.items(), key=lambda x: -x[1])

    # For every token, find a token on edit distance 1, which has higher frequency,
    # if there is one.
    def _correct_token(token_freq):
        token, freq = token_freq
        # NOTE(review): lookup(token, 2, 1) — presumably verbosity=ALL within edit
        # distance 1, and suggestions[0] is the token itself; confirm against the
        # symspell API in use.
        suggestions = checker.lookup(token, 2, 1)
        if len(suggestions) > 1:
            correction = suggestions[1].term
            return correction, frequencies[correction]
        return token, freq

    corrections = list(tqdm(map(_correct_token, sorted_frequencies),
                            total=len(sorted_frequencies)))
    # One row per token: the token, its frequency, its best "correction"
    # (a more frequent close neighbor, or the token itself) and that correction's
    # frequency.
    all_tokens = pandas.DataFrame(columns=["token", "token_freq", "correction",
                                           "correction_freq"])
    all_tokens["token"] = [token for token, _ in sorted_frequencies]
    all_tokens["token_freq"] = [freq for _, freq in sorted_frequencies]
    # Keep the suggested correction only when it is strictly more frequent than the token;
    # otherwise the token is its own correction.
    all_tokens["correction"] = [token_freq[0] if token_freq[1] > sorted_frequencies[i][1]
                                else sorted_frequencies[i][0]
                                for i, token_freq in enumerate(corrections)]
    all_tokens["correction_freq"] = [token_freq[1]
                                     if token_freq[1] > sorted_frequencies[i][1]
                                     else sorted_frequencies[i][1]
                                     for i, token_freq in enumerate(corrections)]
    # rel == 1.0: no more frequent neighbor was kept; rel > 1: one was.
    all_tokens["rel"] = all_tokens["correction_freq"] / all_tokens["token_freq"]

    # Find all English words among all the tokens.
    eng_voc = set()
    with smart_open(str(pathlib.Path(__file__).parent / "words_alpha.txt.xz"), "r") as f:
        for line in f:
            eng_voc.add(line.strip())

    # Leave only non-english tokens for analysis.
    stable = all_tokens[(all_tokens.rel == 1.0) & ~all_tokens.token.isin(eng_voc)]
    unstable = all_tokens[(all_tokens.rel > 1) & ~all_tokens.token.isin(eng_voc)]

    # Get tokens and their corrections lemmas.
    # NOTE(review): spacy.cli.download("en") hits the network on every call — consider
    # downloading the model once at build/setup time instead.
    spacy.cli.download("en")
    nlp = spacy.load("en", disable=["parser", "ner"])

    def _lemmatize(token):
        lemm = nlp(token)
        # Keep the original token for multi-token parses, pronoun lemmas, and the
        # "-ss" ending special case (where the lemmatizer strips a final "s").
        if len(lemm) > 1 or lemm[0].lemma_ == "-PRON-" or (token[-2:] == "ss" and
                                                           lemm[0].lemma_ == token[:-1]):
            return token
        return lemm[0].lemma_

    token_lemma = list(tqdm(map(_lemmatize, list(unstable.token)), total=len(unstable)))
    correction_lemma = list(tqdm(map(_lemmatize, list(unstable.correction)),
                                 total=len(unstable)))
    # NOTE(review): `unstable` is a filtered slice of `all_tokens`; assigning new columns
    # to it raises pandas' SettingWithCopyWarning — a .copy() after filtering would make
    # the intent explicit.
    unstable["token_lemma"] = token_lemma
    unstable["cor_lemma"] = correction_lemma

    # Equal lemmas -> different forms of a morphologically changing token -> token is a
    # "word". Use some heuristics to remove noise.
    # NOTE(review): `&` binds tighter than `|`, so this expression groups as
    # A | (B & C), not (A | B) & C — confirm the grouping matches the intended heuristic.
    eq_lemmas = unstable[
        (unstable["token_lemma"] == unstable["cor_lemma"]) |
        (unstable["token_lemma"] == unstable["correction"]) &
        (~unstable["correction"].isin(eng_voc) |
         (unstable["correction"].apply(lambda x: x[-3:]) == "ing"))]
    dif_lemmas = unstable[(unstable["token_lemma"] != unstable["cor_lemma"]) &
                          (unstable["token_lemma"] != unstable["correction"])]

    # Stemming heuristics.
    def _norm(word: str) -> str:
        # Strip one trailing character for "-ed"/"-er" endings and for a plural "s"
        # (but not a double "ss").
        if word[-2:] == "ed" or word[-2:] == "er" or word[-1] == "s" and word[-2] != "s":
            return word[:-1]
        return word

    norm_eq = dif_lemmas[(dif_lemmas.token.apply(_norm) == dif_lemmas.correction)]

    # Gather all results: tokens recognized as word-like by either heuristic are "good".
    good = all_tokens[all_tokens.token.isin(set(
        list(eq_lemmas[:].token) + list(eq_lemmas[:].correction) +
        list(norm_eq.token) + list(norm_eq.correction)))]
    unstable = unstable[~unstable.token.isin(good.token)]
    stable = stable[~stable.token.isin(good.token)]

    # Suspicious - have high probability to be typo-ed English words.
    suspicious = unstable[unstable.correction.isin(eng_voc)]
    non_suspicious = unstable[~unstable.correction.isin(eng_voc)]
    # Final vocabulary: the top slice of each category (rows are ordered by descending
    # frequency, so [:n] takes the n most frequent), plus all English words and all
    # "good" tokens.
    vocabulary = all_tokens[all_tokens.token.isin(set(
        list(stable[:config["stable"]].token) +
        list(suspicious[:config["suspicious"]].token) +
        list(non_suspicious[:config["non_suspicious"]].token) +
        list(eng_voc) + list(good.token)))]
    return {token: freq for token, freq in vocabulary[["token", "token_freq"]].values}