Пример #1
0
    def generate_signed_url(self, expiration=datetime.timedelta(days=7)):
        """Generate a signed URL for the dataset by uploading a uniquely named metadata file containing signed URLs
        to the dataset's files and returning a signed URL to that metadata file.

        :param datetime.datetime|datetime.timedelta expiration: the amount of time or date after which the URL should expire
        :raise CloudLocationNotSpecified: if the dataset doesn't exist in the cloud
        :return str: the signed URL for the dataset
        """
        if not self.exists_in_cloud:
            raise CloudLocationNotSpecified(
                f"{self!r} must exist in the cloud for a signed URL to be generated for it."
            )

        # Serialise the dataset, replacing each file entry with a signed URL to that file.
        metadata = self.to_primitive()
        metadata["files"] = [file.generate_signed_url(expiration=expiration) for file in self.files]

        # Upload the metadata under a random slug so separate calls don't overwrite each other.
        metadata_path = storage.path.join(self.path, SIGNED_METADATA_DIRECTORY, coolname.generate_slug())
        client = GoogleCloudStorageClient()
        client.upload_from_string(string=json.dumps(metadata, cls=OctueJSONEncoder), cloud_path=metadata_path)

        return client.generate_signed_url(cloud_path=metadata_path, expiration=expiration)
Пример #2
0
    def test_upload(self):
        """Test that a dataset can be uploaded to a cloud path, including all its files and the dataset's metadata."""
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = create_dataset_with_two_files(temporary_directory)
            dataset.tags = {"a": "b", "c": 1}

            cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_datasets", dataset.name)
            dataset.upload(cloud_path)

            storage_client = GoogleCloudStorageClient()

            # Check its files have been uploaded.
            for index in range(2):
                persisted_contents = storage_client.download_as_string(
                    storage.path.join(cloud_path, f"file_{index}.txt")
                )
                self.assertEqual(persisted_contents, str(index))

            # Check its metadata has been uploaded.
            dataset._get_cloud_metadata()
            self.assertEqual(dataset._cloud_metadata["tags"], dataset.tags.to_primitive())
Пример #3
0
    def test_instantiating_from_serialised_cloud_datasets_with_no_dataset_json_file(self):
        """Test that a Manifest can be instantiated from a serialized cloud dataset with no `dataset.json` file. This
        simulates what happens when such a cloud dataset is referred to in a manifest received by a child service.
        """
        storage_client = GoogleCloudStorageClient()

        # Create a two-file cloud dataset that has no metadata file.
        for filename, contents in (("file_0.txt", "[1, 2, 3]"), ("file_1.txt", "[4, 5, 6]")):
            storage_client.upload_from_string(
                contents,
                cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", filename),
            )

        serialised_cloud_dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/my_dataset").to_primitive()

        manifest = Manifest(datasets={"my_dataset": serialised_cloud_dataset})
        self.assertEqual(len(manifest.datasets), 1)
        self.assertEqual(manifest.datasets["my_dataset"].path, f"gs://{TEST_BUCKET_NAME}/my_dataset")
        self.assertEqual(len(manifest.datasets["my_dataset"].files), 2)
Пример #4
0
    def _create_nested_cloud_dataset(self, dataset_name=None):
        """Create a dataset in cloud storage with the given name containing a nested set of files.

        :param str|None dataset_name: the name to give the dataset; a random name is generated if none is given
        :return str: the cloud path for the dataset
        """
        client = GoogleCloudStorageClient()
        dataset_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, dataset_name or coolname.generate_slug(2))

        # Two files at the dataset root, one in a subdirectory, and one nested two levels deep.
        files = (
            ("[1, 2, 3]", ("file_0.txt",)),
            ("[4, 5, 6]", ("file_1.txt",)),
            ("['a', 'b', 'c']", ("sub-directory", "sub_file.txt")),
            ("['blah', 'b', 'c']", ("sub-directory", "sub-sub-directory", "sub_sub_file.txt")),
        )

        for contents, path_parts in files:
            client.upload_from_string(contents, cloud_path=storage.path.join(dataset_path, *path_parts))

        return dataset_path
Пример #5
0
    def test_instantiating_from_datasets_from_different_cloud_buckets(self):
        """Test instantiating a manifest from multiple datasets from different cloud buckets."""
        storage_client = GoogleCloudStorageClient()

        # Create a second bucket and put one dataset in each bucket.
        extra_bucket_name = TEST_BUCKET_NAME + "-another"
        storage_client.create_bucket(name=extra_bucket_name)

        storage_client.upload_from_string(
            "[1, 2, 3]",
            storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset_a", "file_0.txt"),
        )
        storage_client.upload_from_string(
            "[4, 5, 6]",
            storage.path.generate_gs_path(extra_bucket_name, "my_dataset_b", "the_data.txt"),
        )

        manifest = Manifest(
            datasets={
                "my_dataset_a": f"gs://{TEST_BUCKET_NAME}/my_dataset_a",
                "my_dataset_b": f"gs://{extra_bucket_name}/my_dataset_b",
            }
        )

        dataset_names = {dataset.name for dataset in manifest.datasets.values()}
        self.assertEqual(dataset_names, {"my_dataset_a", "my_dataset_b"})

        # The first file of each dataset should come from the corresponding bucket.
        first_files = [list(dataset.files)[0] for dataset in manifest.datasets.values()]
        self.assertEqual({file.bucket_name for file in first_files}, {TEST_BUCKET_NAME, extra_bucket_name})
Пример #6
0
    def to_cloud(self, cloud_path):
        """Upload a serialised version of the manifest to the given cloud storage location.

        :param str cloud_path: full path to cloud storage location to store manifest at (e.g. `gs://bucket_name/path/to/manifest.json`)
        :return None:
        """
        serialised_manifest = json.dumps(self.to_primitive())
        GoogleCloudStorageClient().upload_from_string(string=serialised_manifest, cloud_path=cloud_path)
Пример #7
0
    def test_upload_with_nested_dataset_preserves_nested_structure(self):
        """Test that uploading a dataset containing datafiles in a nested directory structure to the cloud preserves
        this structure in the cloud.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            local_paths = self._create_files_and_nested_subdirectories(temporary_directory)
            dataset = Dataset(path=temporary_directory, recursive=True)

            upload_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my-dataset")
            dataset.upload(cloud_path=upload_path)

        def is_datafile(blob):
            # Exclude metadata files and the signed-metadata directory from the comparison.
            return not blob.name.endswith(".octue") and SIGNED_METADATA_DIRECTORY not in blob.name

        cloud_datafile_relative_paths = {
            blob.name.split(dataset.name)[-1].strip("/")
            for blob in GoogleCloudStorageClient().scandir(upload_path, filter=is_datafile)
        }

        # Check that the paths relative to the dataset directory are the same in the cloud as they are locally.
        local_datafile_relative_paths = {
            path.split(temporary_directory)[-1].strip(os.path.sep).replace(os.path.sep, "/")
            for path in local_paths
        }

        self.assertEqual(cloud_datafile_relative_paths, local_datafile_relative_paths)
Пример #8
0
    def _get_cloud_metadata(self):
        """Get the cloud metadata for the given dataset if a dataset metadata file has previously been uploaded.

        The metadata is stored in `self._cloud_metadata`; if none can be retrieved, the attribute is left unchanged.

        :return None:
        """
        if storage.path.is_url(self.path):
            # This branch is best-effort: any request failure (including the added timeout, which stops the request
            # hanging indefinitely) or a non-JSON response leaves the existing metadata in place instead of raising.
            # `ValueError` covers `json.JSONDecodeError` raised by `.json()` on older versions of `requests`.
            try:
                self._cloud_metadata = requests.get(self.path, timeout=30).json()
            except (requests.exceptions.RequestException, ValueError):
                pass
            return

        storage_client = GoogleCloudStorageClient()

        if not storage_client.exists(cloud_path=self._metadata_path):
            return

        self._cloud_metadata = json.loads(storage_client.download_as_string(cloud_path=self._metadata_path)).get(
            "dataset", {}
        )
Пример #9
0
    def update_cloud_metadata(self):
        """Create or update the cloud metadata file for the dataset.

        :return None:
        """
        serialised_metadata = json.dumps({"dataset": self.to_primitive(include_files=False)}, cls=OctueJSONEncoder)
        GoogleCloudStorageClient().upload_from_string(string=serialised_metadata, cloud_path=self._metadata_path)
Пример #10
0
    def from_cloud(cls, cloud_path):
        """Instantiate a Manifest from Google Cloud storage.

        :param str cloud_path: full path to manifest in cloud storage (e.g. `gs://bucket_name/path/to/manifest.json`)
        :return Manifest: the manifest deserialised from the cloud file
        """
        raw_manifest = GoogleCloudStorageClient().download_as_string(cloud_path)
        serialised_manifest = json.loads(raw_manifest)

        datasets = {name: Dataset(path=dataset) for name, dataset in serialised_manifest["datasets"].items()}
        return Manifest(id=serialised_manifest["id"], datasets=datasets)
Пример #11
0
    def test_download(self):
        """Test that all files in a dataset can be downloaded with one command."""
        storage_client = GoogleCloudStorageClient()
        dataset_name = "another-dataset"

        # Upload two JSON files to the cloud dataset.
        file_contents = {"file_0.txt": [1, 2, 3], "file_1.txt": [4, 5, 6]}

        for filename, contents in file_contents.items():
            storage_client.upload_from_string(
                string=json.dumps(contents),
                cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, dataset_name, filename),
            )

        dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/{dataset_name}")

        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset.download(local_directory=temporary_directory)

            with open(os.path.join(temporary_directory, "file_0.txt")) as f:
                self.assertEqual(f.read(), "[1, 2, 3]")

            with open(os.path.join(temporary_directory, "file_1.txt")) as f:
                self.assertEqual(f.read(), "[4, 5, 6]")
Пример #12
0
    def test_to_cloud(self):
        """Test that a manifest can be uploaded to the cloud as a serialised JSON file of the Manifest instance."""
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = create_dataset_with_two_files(temporary_directory)
            dataset_cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my-small-dataset")
            dataset.upload(cloud_path=dataset_cloud_path)

            manifest = Manifest(datasets={"my-dataset": dataset})
            manifest_cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "manifest.json")
            manifest.to_cloud(manifest_cloud_path)

            # The persisted manifest should reference the dataset by its cloud path.
            persisted_manifest = json.loads(GoogleCloudStorageClient().download_as_string(manifest_cloud_path))
            self.assertEqual(
                persisted_manifest["datasets"]["my-dataset"],
                f"gs://{TEST_BUCKET_NAME}/my-small-dataset",
            )
Пример #13
0
    def _instantiate_from_cloud(self, path):
        """Instantiate the dataset from a cloud directory.

        :param str path: the cloud path to a directory in cloud storage
        :return None:
        """
        if not self._hypothetical:
            self._use_cloud_metadata()

        # If the metadata didn't yield any files, fall back to scanning the cloud directory for them.
        if not self.files:
            bucket_name = storage.path.split_bucket_name_from_cloud_path(path)[0]

            def is_datafile(blob):
                # Exclude dataset metadata files and the signed-metadata directory from the dataset's files.
                return not blob.name.endswith(METADATA_FILENAME) and SIGNED_METADATA_DIRECTORY not in blob.name

            blobs = GoogleCloudStorageClient().scandir(path, recursive=self._recursive, filter=is_datafile)

            self.files = FilterSet(
                Datafile(path=storage.path.generate_gs_path(bucket_name, blob.name), hypothetical=self._hypothetical)
                for blob in blobs
            )
Пример #14
0
    def test_from_cloud_with_no_metadata_file(self):
        """Test that any cloud directory can be accessed as a dataset if it has no `.octue` metadata file in it, the
        cloud dataset doesn't lose any information during serialization, and a metadata file is uploaded afterwards.
        """
        storage_client = GoogleCloudStorageClient()

        # Create a two-file cloud dataset with no metadata file.
        for filename, contents in (("file_0.txt", "[1, 2, 3]"), ("file_1.txt", "[4, 5, 6]")):
            storage_client.upload_from_string(
                contents,
                cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", filename),
            )

        cloud_dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/my_dataset")

        self.assertEqual(cloud_dataset.path, f"gs://{TEST_BUCKET_NAME}/my_dataset")
        self.assertEqual(cloud_dataset.name, "my_dataset")
        self.assertEqual({file.name for file in cloud_dataset.files}, {"file_0.txt", "file_1.txt"})

        for file in cloud_dataset:
            self.assertEqual(file.cloud_path, f"gs://{TEST_BUCKET_NAME}/my_dataset/{file.name}")

        # Test serialisation doesn't lose any information.
        deserialised_dataset = Dataset.deserialise(cloud_dataset.to_primitive())

        for attribute in ("id", "name", "path", "hash_value"):
            self.assertEqual(getattr(deserialised_dataset, attribute), getattr(cloud_dataset, attribute))