def test_from_cloud(self):
    """Test that a Manifest can be instantiated from a cloud path.

    A two-file dataset is uploaded to the test bucket, a manifest referring to it is persisted with `to_cloud`, and
    the manifest re-created by `from_cloud` is compared against the original.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset = create_dataset_with_two_files(temporary_directory)
        dataset_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_nice_dataset")
        dataset.upload(cloud_path=dataset_path)

        manifest = Manifest(datasets={"my-dataset": dataset})
        cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my-directory", "manifest.json")
        manifest.to_cloud(cloud_path)

        persisted_manifest = Manifest.from_cloud(cloud_path)

        self.assertEqual(persisted_manifest.id, manifest.id)
        self.assertEqual(persisted_manifest.hash_value, manifest.hash_value)

        self.assertEqual(
            {persisted_dataset.name for persisted_dataset in persisted_manifest.datasets.values()},
            {original_dataset.name for original_dataset in manifest.datasets.values()},
        )

        # A distinct loop variable is used so the uploaded `dataset` above is not rebound.
        for persisted_dataset in persisted_manifest.datasets.values():
            self.assertEqual(persisted_dataset.path, dataset_path)

            # `assertEqual` is required here: `assertTrue(len(...), 2)` would treat `2` as the failure message and
            # pass for any non-empty dataset.
            self.assertEqual(len(persisted_dataset.files), 2)
            self.assertTrue(all(isinstance(file, Datafile) for file in persisted_dataset.files))
def test_serialisation_and_deserialisation(self):
    """Test that manifests can be serialised and deserialised.

    The manifest is serialised with `to_primitive`, the primitive is compared against the expected dictionary, and
    the result of `Manifest.deserialise` is checked against the original manifest's name, ID, and datasets.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset_0_path = os.path.join(temporary_directory, "my_dataset_0")
        dataset_1_path = os.path.join(temporary_directory, "my_dataset_1")

        datasets = {
            "my_dataset_0": Dataset(
                path=dataset_0_path,
                files=[Datafile(path=os.path.join(dataset_0_path, "my_file_0.txt"))],
            ),
            "my_dataset_1": Dataset(
                path=dataset_1_path,
                files=[Datafile(path=os.path.join(dataset_1_path, "my_file_1.txt"))],
            ),
        }

        for local_dataset in datasets.values():
            local_dataset.update_local_metadata()

        manifest = Manifest(datasets=datasets, id="7e0025cd-bd68-4de6-b48d-2643ebd5effd", name="my-manifest")
        serialised_manifest = manifest.to_primitive()

        # Datasets are expected to be serialised as their paths only.
        expected_primitive = {
            "id": manifest.id,
            "name": "my-manifest",
            "datasets": {
                "my_dataset_0": dataset_0_path,
                "my_dataset_1": dataset_1_path,
            },
        }
        self.assertEqual(serialised_manifest, expected_primitive)

        deserialised_manifest = Manifest.deserialise(serialised_manifest)

        self.assertEqual(manifest.name, deserialised_manifest.name)
        self.assertEqual(manifest.id, deserialised_manifest.id)

        for key in manifest.datasets:
            original = manifest.datasets[key]
            restored = deserialised_manifest.datasets[key]

            self.assertEqual(original.name, restored.name)
            self.assertEqual(original.id, restored.id)
            self.assertEqual(original.path, restored.path)
def test_all_datasets_are_in_cloud(self):
    """Test whether all files of all datasets in a manifest are in the cloud or not can be determined.

    Three cases are checked: the manifest from `create_valid_manifest` (expected `False`), an empty manifest
    (expected `True`), and a manifest whose only dataset contains `gs://` datafiles (expected `True`).
    """
    valid_manifest = self.create_valid_manifest()
    self.assertFalse(valid_manifest.all_datasets_are_in_cloud)

    empty_manifest = Manifest()
    self.assertTrue(empty_manifest.all_datasets_are_in_cloud)

    cloud_files = [Datafile(path=cloud_path) for cloud_path in ("gs://hello/file.txt", "gs://goodbye/file.csv")]
    cloud_manifest = Manifest(datasets={"my_dataset": Dataset(files=cloud_files)})
    self.assertTrue(cloud_manifest.all_datasets_are_in_cloud)
def test_instantiating_from_datasets_from_different_cloud_buckets(self):
    """Test instantiating a manifest from multiple datasets from different cloud buckets.

    One file is uploaded to the test bucket and one to a newly created extra bucket; the manifest is then built from
    the two `gs://` dataset paths.
    """
    storage_client = GoogleCloudStorageClient()

    extra_bucket_name = TEST_BUCKET_NAME + "-another"
    storage_client.create_bucket(name=extra_bucket_name)

    path_in_test_bucket = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset_a", "file_0.txt")
    storage_client.upload_from_string("[1, 2, 3]", path_in_test_bucket)

    path_in_extra_bucket = storage.path.generate_gs_path(extra_bucket_name, "my_dataset_b", "the_data.txt")
    storage_client.upload_from_string("[4, 5, 6]", path_in_extra_bucket)

    dataset_paths = {
        "my_dataset_a": f"gs://{TEST_BUCKET_NAME}/my_dataset_a",
        "my_dataset_b": f"gs://{extra_bucket_name}/my_dataset_b",
    }
    manifest = Manifest(datasets=dataset_paths)

    dataset_names = {dataset.name for dataset in manifest.datasets.values()}
    self.assertEqual(dataset_names, {"my_dataset_a", "my_dataset_b"})

    # Collect the bucket of the first file of each dataset.
    bucket_names = set()

    for dataset in manifest.datasets.values():
        first_file = list(dataset.files)[0]
        bucket_names.add(first_file.bucket_name)

    self.assertEqual(bucket_names, {TEST_BUCKET_NAME, extra_bucket_name})
def test_instantiating_from_serialised_cloud_datasets_with_no_dataset_json_file(self):
    """Test that a Manifest can be instantiated from a serialized cloud dataset with no `dataset.json` file. This
    simulates what happens when such a cloud dataset is referred to in a manifest received by a child service.
    """
    uploads = (("[1, 2, 3]", "file_0.txt"), ("[4, 5, 6]", "file_1.txt"))

    # Only the raw files are uploaded; a fresh client is used for each upload.
    for contents, file_name in uploads:
        GoogleCloudStorageClient().upload_from_string(
            contents,
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", file_name),
        )

    cloud_dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/my_dataset")
    serialised_cloud_dataset = cloud_dataset.to_primitive()

    manifest = Manifest(datasets={"my_dataset": serialised_cloud_dataset})

    self.assertEqual(len(manifest.datasets), 1)

    instantiated_dataset = manifest.datasets["my_dataset"]
    self.assertEqual(instantiated_dataset.path, f"gs://{TEST_BUCKET_NAME}/my_dataset")
    self.assertEqual(len(manifest.datasets["my_dataset"].files), 2)
def test_to_cloud(self):
    """Test that a manifest can be uploaded to the cloud as a serialised JSON file of the Manifest instance.

    The uploaded JSON is downloaded again and its entry for the dataset is expected to be the dataset's cloud path.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset = create_dataset_with_two_files(temporary_directory)
        dataset_cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my-small-dataset")
        dataset.upload(cloud_path=dataset_cloud_path)

        manifest = Manifest(datasets={"my-dataset": dataset})
        cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "manifest.json")
        manifest.to_cloud(cloud_path)

        raw_manifest = GoogleCloudStorageClient().download_as_string(cloud_path)
        persisted_manifest = json.loads(raw_manifest)

        self.assertEqual(
            persisted_manifest["datasets"]["my-dataset"],
            f"gs://{TEST_BUCKET_NAME}/my-small-dataset",
        )
def run(analysis):
    """Run a mock analysis by setting fixed output values and an empty output manifest on the given analysis.

    :param octue.resources.analysis.Analysis analysis: the analysis whose outputs are set
    :return None:
    """
    analysis.output_values = {"width": 3}

    empty_manifest = Manifest()
    analysis.output_manifest = empty_manifest
def test_instantiating_from_multiple_local_datasets(self):
    """Test instantiating a manifest from multiple local datasets given as local paths."""
    expected_names = ("dataset_0", "dataset_1")

    local_dataset_paths = {name: os.path.join("path", "to", name) for name in expected_names}
    manifest = Manifest(datasets=local_dataset_paths)

    actual_names = {dataset.name for dataset in manifest.datasets.values()}
    self.assertEqual(actual_names, {"dataset_0", "dataset_1"})
def test_ask_with_input_manifest(self):
    """Test that a service can ask a question including an input manifest to another service that is serving and
    receive an answer.
    """
    child = self.make_new_child(BACKEND, run_function_returnee=MockAnalysis(), use_mock=True)
    parent = MockService(backend=BACKEND, children={child.id: child})

    dataset_path = f"gs://{TEST_BUCKET_NAME}/my-dataset"
    cloud_dataset = Dataset(
        files=[f"{dataset_path}/hello.txt", f"{dataset_path}/goodbye.csv"],
        path=dataset_path,
    )
    input_manifest = Manifest(datasets={"my-dataset": cloud_dataset})

    # The Pub/Sub classes are swapped for mocks, entered in the same order as the original nested patches.
    topic_patch = patch("octue.cloud.pub_sub.service.Topic", new=MockTopic)
    subscription_patch = patch("octue.cloud.pub_sub.service.Subscription", new=MockSubscription)
    subscriber_patch = patch("google.cloud.pubsub_v1.SubscriberClient", new=MockSubscriber)

    with topic_patch, subscription_patch, subscriber_patch:
        child.serve()

        with patch("google.cloud.storage.blob.Blob.generate_signed_url", new=mock_generate_signed_url):
            answer = self.ask_question_and_wait_for_answer(
                parent=parent,
                child=child,
                input_values={},
                input_manifest=input_manifest,
            )

    expected_answer = {
        "output_values": MockAnalysis().output_values,
        "output_manifest": MockAnalysis().output_manifest,
    }
    self.assertEqual(answer, expected_answer)
def test_finalise_with_upload(self):
    """Test that the `finalise` method can be used to upload the output manifest's datasets to a cloud location and
    that it updates the manifest with signed URLs for accessing them.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset_path = os.path.join(temporary_directory, "the_dataset")
        datafile_path = os.path.join(dataset_path, "my_file.dat")

        with Datafile(path=datafile_path, mode="w") as (datafile, f):
            f.write("hello")

        local_dataset = Dataset(
            path=dataset_path,
            files={datafile.local_path},
            labels={"one", "two", "three"},
        )
        output_manifest = Manifest(datasets={"the_dataset": local_dataset})

        twine = {
            "output_values_schema": {"type": "object", "properties": {"blah": {"type": "integer"}}},
            "output_manifest": {"datasets": {"the_dataset": {"purpose": "testing"}}},
        }
        analysis = Analysis(twine=twine, output_values={"blah": 3}, output_manifest=output_manifest)

        with patch("google.cloud.storage.blob.Blob.generate_signed_url", new=mock_generate_signed_url):
            analysis.finalise(upload_output_datasets_to=f"gs://{TEST_BUCKET_NAME}/datasets")

        # After finalising, the dataset's path is expected to be a URL pointing at the uploaded dataset.
        signed_url_for_dataset = analysis.output_manifest.datasets["the_dataset"].path
        self.assertTrue(storage.path.is_url(signed_url_for_dataset))

        expected_url_prefix = (
            f"{self.test_result_modifier.storage_emulator_host}/{TEST_BUCKET_NAME}/datasets/the_dataset"
        )
        self.assertTrue(signed_url_for_dataset.startswith(expected_url_prefix))

        # Re-instantiate the dataset from the URL and check its contents match what was written locally.
        downloaded_dataset = Dataset(path=signed_url_for_dataset)
        self.assertEqual(downloaded_dataset.name, "the_dataset")
        self.assertEqual(len(downloaded_dataset.files), 1)
        self.assertEqual(downloaded_dataset.labels, {"one", "two", "three"})

        with downloaded_dataset.files.one() as (downloaded_datafile, f):
            self.assertEqual(f.read(), "hello")
def test_ask_with_input_manifest_with_local_paths_works_if_allowed_and_child_has_access_to_the_local_paths(self):
    """Test that an input manifest referencing local files can be used if the files can be accessed by the child and
    the `allow_local_files` parameter is `True`.
    """
    # Local import so the cleanup below does not depend on the module-level imports.
    import os

    # `delete=False` keeps the file on disk after its handle is closed so it can be reopened by path; removal is
    # registered as a cleanup so the file doesn't outlive the test, and the handle is closed by the `with` block.
    temporary_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
    temporary_local_path = temporary_file.name
    self.addCleanup(os.remove, temporary_local_path)

    with temporary_file:
        temporary_file.write("This is a local file.")

    local_file = Datafile(path=temporary_local_path)
    self.assertFalse(local_file.exists_in_cloud)

    manifest = Manifest(datasets={"my-local-dataset": Dataset(name="my-local-dataset", files={local_file})})

    # Get the child to open the local file itself and return the contents as output.
    def run_function(analysis_id, input_values, input_manifest, analysis_log_handler, handle_monitor_message):
        with open(temporary_local_path) as f:
            return MockAnalysis(output_values=f.read())

    child = MockService(backend=BACKEND, run_function=run_function)
    parent = MockService(backend=BACKEND, children={child.id: child})

    with patch("octue.cloud.pub_sub.service.Topic", new=MockTopic):
        with patch("octue.cloud.pub_sub.service.Subscription", new=MockSubscription):
            with patch("google.cloud.pubsub_v1.SubscriberClient", new=MockSubscriber):
                child.serve()

                answer = self.ask_question_and_wait_for_answer(
                    parent=parent,
                    child=child,
                    input_values={},
                    input_manifest=manifest,
                    allow_local_files=True,
                )

    self.assertEqual(answer["output_values"], "This is a local file.")
def test_app(self):
    """Test that the app takes in input in the correct format and returns an analysis with the correct output
    values.
    """
    runner = Runner(app_src=REPOSITORY_ROOT, twine=TWINE_PATH)

    with patch("google.cloud.storage.blob.Blob.generate_signed_url", mock_generate_signed_url):
        analysis = runner.run(input_values={"n_iterations": 3})

    # Check the output values.
    self.assertEqual(analysis.output_values, [1, 2, 3, 4, 5])

    # Test that the signed URLs for the dataset and its files work and can be used to reinstantiate the output
    # manifest after serialisation.
    serialised_output_manifest = analysis.output_manifest.to_primitive()
    downloaded_output_manifest = Manifest.deserialise(serialised_output_manifest)

    # Check that the output dataset and its files can be accessed.
    example_dataset = downloaded_output_manifest.datasets["example_dataset"]

    with example_dataset.files.one() as (datafile, f):
        self.assertEqual(f.read(), "This is some example service output.")
def test_finalise_validates_output(self):
    """Test that the `finalise` method with no other arguments just validates the output manifest and values.

    The test passes if `finalise` completes without raising.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset_path = os.path.join(temporary_directory, "the_dataset")
        datafile_path = os.path.join(dataset_path, "my_file.dat")

        with Datafile(path=datafile_path, mode="w") as (datafile, f):
            f.write("hello")

        local_dataset = Dataset(path=dataset_path, files={datafile.local_path})
        output_manifest = Manifest(datasets={"the_dataset": local_dataset})

        twine = {
            "output_values_schema": {"type": "object", "properties": {"blah": {"type": "integer"}}},
            "output_manifest": {"datasets": {"the_dataset": {"purpose": "testing"}}},
        }
        analysis = Analysis(twine=twine, output_values={"blah": 3}, output_manifest=output_manifest)

        analysis.finalise()
def create_valid_manifest(self):
    """Create a valid manifest with two valid datasets (they're the same dataset in this case).

    :return Manifest: a manifest with a dataset from `create_valid_dataset` under each of two names
    """
    dataset_names = ("my_dataset", "another_dataset")
    return Manifest(datasets={name: self.create_valid_dataset() for name in dataset_names})
class MockAnalysisWithOutputManifest:
    """A stand-in for an analysis result that carries fixed output values and an empty output manifest."""

    # Both attributes are class-level, so the same `Manifest` instance is shared by every use of this class.
    output_values = "This is an analysis with an empty output manifest."
    output_manifest = Manifest()