Exemplo n.º 1
0
    def test_decompress_gzip(self):
        """Check that ``utils._decompress`` restores a gzip-compressed file.

        A ``file.gz`` archive is written into a temporary directory; after
        decompression a sibling ``file`` must exist and hold the original text.
        """
        expected_text = "this is the content"

        with get_tmp_dir() as workdir:
            # Build the fixture: <workdir>/file.gz holding the encoded text.
            target = os.path.join(workdir, "file")
            archive = f"{target}.gz"
            with gzip.open(archive, "wb") as sink:
                sink.write(expected_text.encode())

            utils._decompress(archive)

            # The decompressed file is expected next to the archive, without
            # the ".gz" suffix.
            self.assertTrue(os.path.exists(target))

            with open(target, "r") as source:
                self.assertEqual(source.read(), expected_text)
Exemplo n.º 2
0
    def test_decompress(self, extension, tmpdir):
        """Check that ``utils._decompress`` restores a file compressed as ``extension``.

        The opener registered for ``extension`` in ``_COMPRESSED_FILE_OPENERS``
        is used to write the fixture; after decompression the file without the
        extension must exist and hold the original text.
        """
        expected_text = "this is the content"

        # Build the fixture: <tmpdir>/file<extension> holding the encoded text.
        target = os.path.join(tmpdir, "file")
        archive = f"{target}{extension}"
        opener = _COMPRESSED_FILE_OPENERS[extension]
        with opener(archive, "wb") as sink:
            sink.write(expected_text.encode())

        utils._decompress(archive)

        assert os.path.exists(target)

        with open(target, "r") as source:
            assert source.read() == expected_text
Exemplo n.º 3
0
 def _decompress(file: pathlib.Path) -> None:
     """Decompress ``file``, passing ``remove_finished=True`` to the underlying helper.

     NOTE(review): the inner ``_decompress`` call presumably resolves to a
     module-level helper of the same name that accepts a string path (this
     definition looks like it is nested inside a class) -- confirm against
     the enclosing file.
     """
     path_str = str(file)
     _decompress(path_str, remove_finished=True)
Exemplo n.º 4
0
 def test_decompress_no_compression(self):
     """Assert that ``utils._decompress("foo.tar")`` raises ``RuntimeError``.

     Presumably ``.tar`` is rejected because it denotes an archive without
     compression -- confirm against ``utils._decompress``.
     """
     self.assertRaises(RuntimeError, utils._decompress, "foo.tar")
Exemplo n.º 5
0
 def test_decompress_no_compression(self):
     """Assert that ``utils._decompress("foo.tar")`` raises ``RuntimeError``.

     Presumably ``.tar`` is rejected because it denotes an archive without
     compression -- confirm against ``utils._decompress``.
     """
     uncompressed_name = "foo.tar"
     with pytest.raises(RuntimeError):
         utils._decompress(uncompressed_name)
Exemplo n.º 6
0
 def _decompress(file: pathlib.Path) -> pathlib.Path:
     """Decompress ``file`` and return the helper's result wrapped in a ``pathlib.Path``.

     ``remove_finished=True`` is forwarded to the underlying helper.

     NOTE(review): the inner ``_decompress`` call presumably resolves to a
     module-level helper of the same name that accepts a string path (this
     definition looks like it is nested inside a class) -- confirm against
     the enclosing file.
     """
     decompressed = _decompress(str(file), remove_finished=True)
     return pathlib.Path(decompressed)
Exemplo n.º 7
0
class TestOnlineResource:
    """Tests for how ``OnlineResource.load()`` finds, downloads and preprocesses files."""

    class DummyResource(OnlineResource):
        """``OnlineResource`` whose download step is delegated to an injectable callable."""

        def __init__(self, download_fn=None, **kwargs):
            """Store ``download_fn`` and forward all other keyword arguments to the base class."""
            super().__init__(**kwargs)
            self._download_fn = download_fn

        def _download(self, root):
            """Call ``download_fn(self, root)``; fail loudly if no ``download_fn`` was given."""
            if self._download_fn is None:
                raise pytest.UsageError(
                    "`_download()` was called, but `DummyResource(...)` was constructed without `download_fn`."
                )

            return self._download_fn(self, root)

    def _make_file(self, root, *, content, name="file.txt"):
        """Write ``content`` to ``root / name`` and return the resulting path."""
        file = root / name
        # The tests read the data back with `bytes.decode()`, which defaults
        # to UTF-8, so write with the same encoding instead of the locale's.
        with open(file, "w", encoding="utf-8") as fh:
            fh.write(content)

        return file

    def _make_folder(self, root, *, name="folder"):
        """Create ``root / name`` with two files and a ``subfolder`` holding a third.

        Returns:
            A ``(folder, files)`` tuple where ``files`` maps each created file
            path (as ``str``) to the content written to it.
        """
        folder = root / name
        subfolder = folder / "subfolder"
        subfolder.mkdir(parents=True)

        files = {}
        # `parent` rather than `root`, so the `root` parameter is not shadowed.
        for idx, parent in enumerate([folder, folder, subfolder]):
            content = f"sentinel{idx}"
            file = self._make_file(parent,
                                   name=f"file{idx}.txt",
                                   content=content)
            files[str(file)] = content

        return folder, files

    def _make_tar(self, root, *, name="archive.tar", remove=True):
        """Create a folder named after the stem of ``name`` and pack it with ``make_tar``.

        Returns:
            An ``(archive, files)`` tuple where ``archive`` is what
            ``make_tar`` returned and ``files`` maps
            ``archive / <path relative to root>`` (as ``str``) to the
            content of each packed file.
        """
        folder, files = self._make_folder(root, name=name.split(".")[0])
        archive = make_tar(root, name, folder, remove=remove)
        # Re-key the files so that each path is rooted at the archive.
        files = {
            str(archive / pathlib.Path(file).relative_to(root)): content
            for file, content in files.items()
        }
        return archive, files

    def test_load_file(self, tmp_path):
        """An existing plain file is loaded through a ``FileOpener`` yielding exactly that file."""
        content = "sentinel"
        file = self._make_file(tmp_path, content=content)

        resource = self.DummyResource(file_name=file.name)

        dp = resource.load(tmp_path)
        assert isinstance(dp, FileOpener)

        data = list(dp)
        assert len(data) == 1

        path, buffer = data[0]
        assert path == str(file)
        assert buffer.read().decode() == content

    def test_load_folder(self, tmp_path):
        """An existing folder is loaded through a ``FileOpener`` yielding every contained file."""
        folder, files = self._make_folder(tmp_path)

        resource = self.DummyResource(file_name=folder.name)

        dp = resource.load(tmp_path)
        assert isinstance(dp, FileOpener)
        assert {path: buffer.read().decode() for path, buffer in dp} == files

    def test_load_archive(self, tmp_path):
        """An existing tar archive is loaded through a ``TarArchiveLoader``."""
        archive, files = self._make_tar(tmp_path)

        resource = self.DummyResource(file_name=archive.name)

        dp = resource.load(tmp_path)
        assert isinstance(dp, TarArchiveLoader)
        assert {path: buffer.read().decode() for path, buffer in dp} == files

    def test_priority_decompressed_gt_raw(self, tmp_path):
        """With both ``file.txt`` and ``file.txt.gz`` present, ``file.txt`` is the one loaded."""
        # We don't need to actually compress here. Adding the suffix is sufficient
        self._make_file(tmp_path, content="raw_sentinel", name="file.txt.gz")
        file = self._make_file(tmp_path,
                               content="decompressed_sentinel",
                               name="file.txt")

        resource = self.DummyResource(file_name=file.name)

        dp = resource.load(tmp_path)
        path, buffer = next(iter(dp))

        assert path == str(file)
        assert buffer.read().decode() == "decompressed_sentinel"

    def test_priority_extracted_gt_decompressed(self, tmp_path):
        """With both the archive and its source folder present, the folder is the one loaded."""
        archive, _ = self._make_tar(tmp_path, remove=False)

        resource = self.DummyResource(file_name=archive.name)

        dp = resource.load(tmp_path)
        # If the archive had been selected, this would be a `TarArchiveLoader`
        assert isinstance(dp, FileOpener)

    def test_download(self, tmp_path):
        """``load()`` invokes the download function when the file is not present yet."""
        download_fn_was_called = False

        def download_fn(resource, root):
            nonlocal download_fn_was_called
            download_fn_was_called = True

            return self._make_file(root, content="_", name=resource.file_name)

        resource = self.DummyResource(
            file_name="file.txt",
            download_fn=download_fn,
        )

        resource.load(tmp_path)

        assert download_fn_was_called, "`download_fn()` was never called"

    # This tests the `"decompress"` literal as well as a custom callable
    @pytest.mark.parametrize(
        "preprocess",
        [
            "decompress",
            lambda path: _decompress(str(path), remove_finished=True),
        ],
    )
    def test_preprocess_decompress(self, tmp_path, preprocess):
        """A downloaded ``.gz`` file is decompressed and the decompressed file is loaded."""
        file_name = "file.txt.gz"
        content = "sentinel"

        def download_fn(resource, root):
            file = root / resource.file_name
            with gzip.open(file, "wb") as fh:
                fh.write(content.encode())
            return file

        resource = self.DummyResource(file_name=file_name,
                                      preprocess=preprocess,
                                      download_fn=download_fn)

        dp = resource.load(tmp_path)
        data = list(dp)
        assert len(data) == 1

        path, buffer = data[0]
        # Strip only the trailing ".gz" suffix. A plain `str.replace` would
        # also rewrite any ".gz" that happens to occur inside `tmp_path`.
        assert path == str((tmp_path / file_name).with_suffix(""))
        assert buffer.read().decode() == content

    def test_preprocess_extract(self, tmp_path):
        """A downloaded tar archive is extracted and the extracted folder is loaded."""
        files = None

        def download_fn(resource, root):
            nonlocal files
            archive, files = self._make_tar(root, name=resource.file_name)
            return archive

        resource = self.DummyResource(file_name="folder.tar",
                                      preprocess="extract",
                                      download_fn=download_fn)

        dp = resource.load(tmp_path)
        assert files is not None, "`download_fn()` was never called"
        assert isinstance(dp, FileOpener)

        actual = {path: buffer.read().decode() for path, buffer in dp}
        # Map the archive-rooted paths onto the extraction folder, which is
        # named after the archive without its extension.
        # NOTE(review): `str.replace` substitutes every occurrence, so this
        # assumes `tmp_path` itself never contains the archive name.
        expected = {
            path.replace(resource.file_name,
                         resource.file_name.split(".")[0]): content
            for path, content in files.items()
        }
        assert actual == expected

    def test_preprocess_only_after_download(self, tmp_path):
        """``preprocess`` must not run when the requested file already exists."""
        file = self._make_file(tmp_path, content="_")

        def preprocess(path):
            raise AssertionError(
                "`preprocess` was called although the file was already present."
            )

        resource = self.DummyResource(
            file_name=file.name,
            preprocess=preprocess,
        )

        resource.load(tmp_path)