def test_breaking_loop_at_reaching_max_count(self, mocked_get_file_handle):
    """Sampling five keys with a limit of five returns exactly five files.

    The file-handle mock returns ``None`` for every key, so no real file
    content is read.
    """
    config = {}
    max_files = 5
    # NOTE(review): the list holds exactly ``max_files`` keys, so the limit
    # is reached but never exceeded -- consider adding an extra key so the
    # test would fail if the cap were ignored.
    sample_keys = [
        {"key": "a.jsonl"},
        {"key": "b.csv"},
        {"key": "c.gz"},
        {"key": "d.txt"},
        {"key": "e.jsonl"},
    ]
    mocked_get_file_handle.return_value = None

    files = s3.get_files_to_sample(config, sample_keys, max_files)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(max_files, len(files))
def test_sampling_of_zip_file(self, mocked_infer, mocked_get_file_handle):
    """Every file sampled from the mixed zip archive has an expected extension.

    The file-handle mock returns the archive's underlying file object and
    ``mocked_infer`` returns one open handle per archive member.
    """
    config = {}
    sample_key = {
        "key": "unittest_compressed_files/sample_compressed_zip_mixer_files.zip"
    }
    zip_file_path = get_resources_path(
        "sample_compressed_zip_mixer_files.zip", CSV_FOLDER_PATH)
    expected_extensions = ["csv", "gz", "jsonl"]

    with zipfile.ZipFile(zip_file_path, "r") as zip_file:
        mocked_get_file_handle.return_value = zip_file.fp
        mocked_infer.return_value = [
            zip_file.open(member) for member in zip_file.namelist()
        ]

        # Sample while the archive is still open so the member handles
        # remain usable.
        files = s3.get_files_to_sample(config, [sample_key], 5)

        # The previous check built a list containing only ``True`` for the
        # matching files, so ``all(...)`` could never fail. Assert on each
        # sampled file's extension instead.
        for sampled in files:
            extension = sampled["file_handle"].name.split(".")[-1].lower()
            self.assertIn(extension, expected_extensions)
def test_non_compress_file_csv(self, mocked_get_file_handle):
    """A plain ``.csv`` key yields a sampled entry without a ``"type"`` field."""
    config = {}
    sample_key = {"key": "unittest_compressed_files/sample.csv"}
    mocked_get_file_handle.return_value = None

    files = s3.get_files_to_sample(config, [sample_key], 5)

    # assertNotIn reports the offending container on failure, unlike
    # assertTrue on a pre-evaluated boolean.
    self.assertNotIn("type", files[0])
def test_s3_bucket_key(self, mocked_get_file_handle):
    """An entry that has no ``"key"`` field produces an empty sample list."""
    mocked_get_file_handle.return_value = None
    # The entry deliberately uses "other_key" instead of "key".
    keyless_entry = {"other_key": "unittest_compressed_files/sample.txt"}

    sampled = s3.get_files_to_sample({}, [keyless_entry], 5)

    self.assertListEqual([], sampled)
def test_get_sampling_files_with_file_without_extension(
        self, mocked_get_file_handle, mocked_logger):
    """A key with no file extension is not sampled and is reported via the logger."""
    config = {}
    sample_key = {"key": "unittest_compressed_files/sample"}
    mocked_get_file_handle.return_value = None

    files = s3.get_files_to_sample(config, [sample_key], 5)

    # assertEqual shows the actual length on failure and matches the
    # assertion style used by the sibling tests.
    self.assertEqual(0, len(files))
    mocked_logger.assert_called_with(
        '"%s" without extension will not be sampled.', sample_key["key"])
def test_get_files_for_samples_of_tar_gz_file_samples(
        self, mocked_file_handle, mocked_logger):
    """A ``.tar.gz`` key yields no samples and is reported via the logger."""
    config = {}
    sample_key = {
        "key": "unittest_compressed_files/sample_compressed.tar.gz"
    }
    mocked_file_handle.return_value = None

    actual_output = s3.get_files_to_sample(config, [sample_key], 5)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(0, len(actual_output))
    mocked_logger.assert_called_with(
        'Skipping "%s" file as .tar.gz extension is not supported',
        sample_key["key"])
def test_get_files_for_samples_of_zip_contains_tar_gz_file(
        self, mocked_file_handle, mocked_infer, mocked_logger):
    """Sampling the zip fixture that contains a ``.tar.gz`` member yields no files.

    The file-handle mock returns the archive's underlying file object and
    ``mocked_infer`` returns one open handle per archive member.
    ``mocked_logger`` is accepted but not inspected here.
    """
    config = {}
    sample_key = {"key": "unittest_compressed_files/sample_compressed.zip"}
    zip_file_path = get_resources_path(
        "sample_compressed_zip_contains_tar_gz_file.zip",
        COMPRESSION_FOLDER_PATH)

    with zipfile.ZipFile(zip_file_path, "r") as zip_file:
        # The earlier ``return_value = None`` assignment was dropped: it was
        # overwritten here before the mock was ever used.
        mocked_file_handle.return_value = zip_file.fp
        mocked_infer.return_value = [
            zip_file.open(member) for member in zip_file.namelist()
        ]

        actual_output = s3.get_files_to_sample(config, [sample_key], 5)

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(0, len(actual_output))
def test_get_sampling_files_with_unsupported_file(self, mocked_get_file_handle,
                                                  mocked_logger):
    """A key with an ``.exe`` extension is not sampled and is reported via the logger."""
    config = {}
    sample_key = {"key": "unittest_compressed_files/sample.exe"}
    # Lower-cased text after the last dot, as expected in the log call below.
    extension = sample_key["key"].split(".")[-1].lower()
    mocked_get_file_handle.return_value = None

    files = s3.get_files_to_sample(config, [sample_key], 5)

    # assertEqual shows the actual length on failure and matches the
    # assertion style used by the sibling tests.
    self.assertEqual(0, len(files))
    mocked_logger.assert_called_with(
        '"%s" having the ".%s" extension will not be sampled.',
        sample_key["key"], extension)