def test_prepare_data_with_load(self):
    """Check prepare_data() when no local input file is configured.

    The config sets "input_path" to None and supplies "dataset_url" plus
    "raw_data_filename"; presumably prepare_data() then fetches the raw
    dataset itself (NOTE(review): needs network access - confirm).

    Verifies that the vocabulary and frequencies files written into the
    temporary data directory have the configured sizes, and that the
    returned frame is consistent with them.
    """
    with tempfile.TemporaryDirectory(
            prefix="lookout_typos_prepare_load_") as temp_dir:
        config = {
            "data_dir": temp_dir,
            "dataset_url": "https://docs.google.com/uc?export=download&"
                           "id=1htVU1UR0gSmopVbvU6_Oc-4iD0cw1ldo",
            "input_path": None,
            "raw_data_filename": "raw_test_data.csv.xz",
            "vocabulary_size": 10,
            "frequencies_size": 20,
            "vocabulary_filename": "vocabulary.csv",
            "frequencies_filename": "frequencies.csv",
        }
        data = prepare_data(config)

        vocabulary = read_vocabulary(
            os.path.join(temp_dir, config["vocabulary_filename"]))
        self.assertEqual(len(vocabulary), config["vocabulary_size"])
        # Every token in the prepared data must belong to the vocabulary.
        # The check used to be written the other way round (vocabulary
        # within data tokens), which disagreed with
        # test_prepare_data_from_file.
        self.assertTrue(set(data[Columns.Token]).issubset(set(vocabulary)))

        frequencies = read_frequencies(
            os.path.join(temp_dir, config["frequencies_filename"]))
        self.assertEqual(len(frequencies), config["frequencies_size"])
        # Every vocabulary entry must also have a frequencies entry.
        self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))

        self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))
def test_prepare_data_from_file(self):
    """Check prepare_data() when "input_path" points at a local raw data file.

    Verifies that the vocabulary and frequencies files written into the
    temporary data directory have the configured sizes, that every token of
    the returned frame is in the vocabulary, that every vocabulary entry has
    a frequency, and that the frame exposes the Token and Split columns.
    """
    with tempfile.TemporaryDirectory(prefix="lookout_typos_prepare_local_") as temp_dir:
        vocabulary_name = "vocabulary.csv"
        frequencies_name = "frequencies.csv"
        config = {
            "data_dir": temp_dir,
            "input_path": str(TEST_DATA_DIR / "raw_test_data.csv.xz"),
            "vocabulary_size": 10,
            "frequencies_size": 20,
            "vocabulary_filename": vocabulary_name,
            "frequencies_filename": frequencies_name,
        }
        vocabulary_path = os.path.join(temp_dir, vocabulary_name)
        frequencies_path = os.path.join(temp_dir, frequencies_name)

        prepared = prepare_data(config)

        # Vocabulary file: configured size, and it covers all data tokens.
        vocab = read_vocabulary(vocabulary_path)
        self.assertEqual(len(vocab), config["vocabulary_size"])
        vocab_tokens = set(vocab)
        data_tokens = set(prepared[Columns.Token])
        self.assertTrue(data_tokens.issubset(vocab_tokens))

        # Frequencies file: configured size, and it covers the vocabulary.
        freqs = read_frequencies(frequencies_path)
        self.assertEqual(len(freqs), config["frequencies_size"])
        freq_tokens = set(freqs.keys())
        self.assertTrue(set(vocab).issubset(freq_tokens))

        required_columns = {Columns.Token, Columns.Split}
        self.assertTrue(required_columns.issubset(prepared.columns))