示例#1
0
    def test_breaking_loop_at_reaching_max_count(self, mocked_get_file_handle):
        """Check that five sample keys with ``max_files=5`` yield five files.

        ``mocked_get_file_handle`` is a mock injected by a patch decorator
        outside this view; it is configured to return ``None``.
        """
        config = {}
        max_files = 5
        sample_keys = [
            {"key": name}
            for name in ("a.jsonl", "b.csv", "c.gz", "d.txt", "e.jsonl")
        ]

        mocked_get_file_handle.return_value = None

        files = s3.get_files_to_sample(config, sample_keys, max_files)

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(max_files, len(files))
示例#2
0
    def test_sampling_of_zip_file(self, mocked_infer, mocked_get_file_handle):
        """Check that files sampled from a zip only have expected extensions.

        The members of the ``sample_compressed_zip_mixer_files.zip`` resource
        are fed to ``get_files_to_sample`` through the two mocks, and every
        returned entry's ``file_handle`` name must end in ``csv``, ``gz`` or
        ``jsonl`` (case-insensitive).

        Both mock parameters are injected by patch decorators outside this
        view.
        """
        config = {}

        sample_key = {
            "key":
            "unittest_compressed_files/sample_compressed_zip_mixer_files.zip"
        }

        zip_file_path = get_resources_path(
            "sample_compressed_zip_mixer_files.zip", CSV_FOLDER_PATH)

        expected_extensions = ["csv", "gz", "jsonl"]

        with zipfile.ZipFile(zip_file_path, "r") as zip_file:

            mocked_get_file_handle.return_value = zip_file.fp
            mocked_infer.return_value = [
                zip_file.open(file) for file in zip_file.namelist()
            ]
            files = s3.get_files_to_sample(config, [sample_key], 5)

            # The previous form, all([True for ... if cond]), only ever
            # collected True values and therefore could never fail. Check each
            # returned file explicitly so an unexpected extension is reported.
            for file in files:
                extension = file["file_handle"].name.split(".")[-1].lower()
                self.assertIn(extension, expected_extensions)
示例#3
0
    def test_non_compress_file_csv(self, mocked_get_file_handle):
        """Check that the entry returned for a plain ``.csv`` key has no
        ``"type"`` field.

        ``mocked_get_file_handle`` is a mock injected by a patch decorator
        outside this view; it is configured to return ``None``.
        """
        config = {}

        sample_key = {"key": "unittest_compressed_files/sample.csv"}

        mocked_get_file_handle.return_value = None

        files = s3.get_files_to_sample(config, [sample_key], 5)

        # assertNotIn reports the container contents on failure, unlike
        # assertTrue on a pre-evaluated boolean.
        self.assertNotIn("type", files[0])
示例#4
0
    def test_s3_bucket_key(self, mocked_get_file_handle):
        """Check that an entry carrying ``"other_key"`` instead of ``"key"``
        produces an empty list of files to sample.

        ``mocked_get_file_handle`` is a mock injected by a patch decorator
        outside this view; it is configured to return ``None``.
        """
        mocked_get_file_handle.return_value = None

        entries = [{"other_key": "unittest_compressed_files/sample.txt"}]
        sampled = s3.get_files_to_sample({}, entries, 5)

        self.assertListEqual([], sampled)
示例#5
0
    def test_get_sampling_files_with_file_without_extension(
            self, mocked_get_file_handle, mocked_logger):
        """Check that a key without a file extension is skipped and logged.

        Expects no files back and a logger call with the
        ``'"%s" without extension will not be sampled.'`` message and the key.

        Both mock parameters are injected by patch decorators outside this
        view.
        """
        config = {}

        sample_key = {"key": "unittest_compressed_files/sample"}

        mocked_get_file_handle.return_value = None

        files = s3.get_files_to_sample(config, [sample_key], 5)

        # assertEqual shows the actual length on failure, unlike
        # assertTrue(len(files) == 0), which only reports "False is not true".
        self.assertEqual(0, len(files))

        mocked_logger.assert_called_with(
            '"%s" without extension will not be sampled.', sample_key["key"])
示例#6
0
    def test_get_files_for_samples_of_tar_gz_file_samples(
            self, mocked_file_handle, mocked_logger):
        """Check that a ``.tar.gz`` key is skipped and the skip is logged.

        Expects no files back and a logger call with the
        ``'Skipping "%s" file as .tar.gz extension is not supported'``
        message and the key.

        Both mock parameters are injected by patch decorators outside this
        view.
        """
        config = {}
        sample_key = {
            "key": "unittest_compressed_files/sample_compressed.tar.gz"
        }
        mocked_file_handle.return_value = None

        actual_output = s3.get_files_to_sample(config, [sample_key], 5)

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(0, len(actual_output))

        mocked_logger.assert_called_with(
            'Skipping "%s" file as .tar.gz extension is not supported',
            sample_key["key"])
示例#7
0
    def test_get_files_for_samples_of_zip_contains_tar_gz_file(
            self, mocked_file_handle, mocked_infer, mocked_logger):
        """Check that no files are sampled from the
        ``sample_compressed_zip_contains_tar_gz_file.zip`` resource.

        The archive's members are supplied through ``mocked_infer`` and the
        result of ``get_files_to_sample`` must be empty.

        All mock parameters are injected by patch decorators outside this
        view; ``mocked_logger`` is not inspected here.
        """
        config = {}
        sample_key = {"key": "unittest_compressed_files/sample_compressed.zip"}
        # NOTE(review): this value is replaced with zip_file.fp below before
        # get_files_to_sample runs; it looks redundant -- confirm before
        # removing.
        mocked_file_handle.return_value = None

        zip_file_path = get_resources_path(
            "sample_compressed_zip_contains_tar_gz_file.zip",
            COMPRESSION_FOLDER_PATH)
        with zipfile.ZipFile(zip_file_path, "r") as zip_file:

            mocked_file_handle.return_value = zip_file.fp
            mocked_infer.return_value = [
                zip_file.open(file) for file in zip_file.namelist()
            ]
            actual_output = s3.get_files_to_sample(config, [sample_key], 5)
            # assertEqual replaces the deprecated assertEquals alias, which
            # was removed from unittest in Python 3.12.
            self.assertEqual(0, len(actual_output))
示例#8
0
    def test_get_sampling_files_with_unsupported_file(self,
                                                      mocked_get_file_handle,
                                                      mocked_logger):
        """Check that a key with an ``.exe`` extension is skipped and logged.

        Expects no files back and a logger call with the
        ``'"%s" having the ".%s" extension will not be sampled.'`` message,
        the key, and its lower-cased extension.

        Both mock parameters are injected by patch decorators outside this
        view.
        """
        config = {}

        sample_key = {"key": "unittest_compressed_files/sample.exe"}

        # Text after the last dot, lower-cased; used in the expected log call.
        extension = sample_key["key"].split(".")[-1].lower()

        mocked_get_file_handle.return_value = None

        files = s3.get_files_to_sample(config, [sample_key], 5)

        # assertEqual shows the actual length on failure, unlike
        # assertTrue(len(files) == 0), which only reports "False is not true".
        self.assertEqual(0, len(files))

        mocked_logger.assert_called_with(
            '"%s" having the ".%s" extension will not be sampled.',
            sample_key["key"], extension)