示例#1
0
 def test_prepare_data_with_load(self):
     """Check prepare_data when no local input file is configured.

     The config sets ``input_path`` to None and supplies ``dataset_url``
     and ``raw_data_filename`` instead (presumably prepare_data fetches the
     raw data in that case, so this test likely needs network access --
     confirm against prepare_data).

     Verifies the sizes of the written vocabulary and frequencies files,
     that the vocabulary is contained in both the data tokens and the
     frequencies keys, and that the returned data has the Token and Split
     columns.
     """
     with tempfile.TemporaryDirectory(
             prefix="lookout_typos_prepare_load_") as workdir:
         config = {
             "data_dir": workdir,
             "dataset_url": "https://docs.google.com/uc?export=download&"
             "id=1htVU1UR0gSmopVbvU6_Oc-4iD0cw1ldo",
             "input_path": None,
             "raw_data_filename": "raw_test_data.csv.xz",
             "vocabulary_size": 10,
             "frequencies_size": 20,
             "vocabulary_filename": "vocabulary.csv",
             "frequencies_filename": "frequencies.csv",
         }
         vocabulary_path = os.path.join(
             workdir, config["vocabulary_filename"])
         frequencies_path = os.path.join(
             workdir, config["frequencies_filename"])

         prepared = prepare_data(config)

         # The vocabulary file must hold exactly the requested number of
         # entries, all of which occur among the prepared tokens.
         vocabulary = read_vocabulary(vocabulary_path)
         self.assertEqual(len(vocabulary), config["vocabulary_size"])
         vocabulary_tokens = set(vocabulary)
         self.assertTrue(
             vocabulary_tokens.issubset(set(prepared[Columns.Token])))

         # The frequencies file must have the requested size and cover
         # the whole vocabulary.
         frequencies = read_frequencies(frequencies_path)
         self.assertEqual(len(frequencies), config["frequencies_size"])
         self.assertTrue(
             vocabulary_tokens.issubset(set(frequencies.keys())))

         required_columns = {Columns.Token, Columns.Split}
         self.assertTrue(required_columns.issubset(prepared.columns))
示例#2
0
 def test_prepare_data_from_file(self):
     """Check prepare_data when the raw data comes from a local file.

     ``input_path`` points at ``raw_test_data.csv.xz`` under TEST_DATA_DIR.
     Verifies the sizes of the written vocabulary and frequencies files,
     that the vocabulary is contained in both the data tokens and the
     frequencies keys, and that the returned data has the Token and Split
     columns.
     """
     with tempfile.TemporaryDirectory(prefix="lookout_typos_prepare_local_") as temp_dir:
         config = {
             "data_dir": temp_dir,
             "input_path": str(TEST_DATA_DIR / "raw_test_data.csv.xz"),
             "vocabulary_size": 10,
             "frequencies_size": 20,
             "vocabulary_filename": "vocabulary.csv",
             "frequencies_filename": "frequencies.csv",
         }
         data = prepare_data(config)
         vocabulary = read_vocabulary(os.path.join(temp_dir, config["vocabulary_filename"]))
         self.assertEqual(len(vocabulary), config["vocabulary_size"])
         # The vocabulary is capped at vocabulary_size entries, so it must be
         # a subset of the data tokens -- not the other way around (same
         # direction as in test_prepare_data_with_load).
         self.assertTrue(set(vocabulary).issubset(set(data[Columns.Token])))
         frequencies = read_frequencies(os.path.join(temp_dir, config["frequencies_filename"]))
         self.assertEqual(len(frequencies), config["frequencies_size"])
         self.assertTrue(set(vocabulary).issubset(set(frequencies.keys())))
         self.assertTrue({Columns.Token, Columns.Split}.issubset(data.columns))