Example #1
    def test_get_file_by_label(self):
        """Ensure files can be accessed by label from the dataset."""
        files = [
            Datafile(path="path-within-dataset/a_my_file.csv",
                     labels="one a b3 all"),
            Datafile(path="path-within-dataset/a_your_file.csv",
                     labels="two a2 b3 all"),
            Datafile(path="path-within-dataset/a_your_file.csv",
                     labels="three all"),
        ]

        resource = Dataset(files=files)

        # Check it works for a single result
        self.assertEqual(
            resource.get_file_by_label("three").labels, files[2].labels)

        # Check raises for too many results
        with self.assertRaises(
                exceptions.UnexpectedNumberOfResultsException) as e:
            resource.get_file_by_label("all")

        self.assertIn("More than one result found", e.exception.args[0])

        # Check raises for no result
        with self.assertRaises(
                exceptions.UnexpectedNumberOfResultsException) as e:
            resource.get_file_by_label("billyjeanisnotmylover")

        self.assertIn(
            "No results found for filters {'labels__contains': 'billyjeanisnotmylover'}",
            e.exception.args[0])
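
A minimal usage sketch, not taken from the test suite: how `get_file_by_label` might be called defensively in application code. The import paths are assumptions based on the names these tests use.

from octue import exceptions
from octue.resources import Datafile, Dataset

dataset = Dataset(files=[Datafile(path="data/readings.csv", labels="calibration")])

try:
    # Exactly one file must carry the label, otherwise an exception is raised.
    calibration_file = dataset.get_file_by_label("calibration")
except exceptions.UnexpectedNumberOfResultsException:
    calibration_file = None  # Zero or multiple matches - handle as appropriate.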
Example #2
    def test_serialisation_and_deserialisation(self):
        """Test that manifests can be serialised and deserialised."""
        with tempfile.TemporaryDirectory() as temporary_directory:
            datasets = {
                "my_dataset_0":
                Dataset(
                    path=os.path.join(temporary_directory, "my_dataset_0"),
                    files=[
                        Datafile(
                            path=os.path.join(temporary_directory,
                                              "my_dataset_0", "my_file_0.txt"))
                    ],
                ),
                "my_dataset_1":
                Dataset(
                    path=os.path.join(temporary_directory, "my_dataset_1"),
                    files=[
                        Datafile(
                            path=os.path.join(temporary_directory,
                                              "my_dataset_1", "my_file_1.txt"))
                    ],
                ),
            }

            for dataset in datasets.values():
                dataset.update_local_metadata()

            manifest = Manifest(datasets=datasets,
                                id="7e0025cd-bd68-4de6-b48d-2643ebd5effd",
                                name="my-manifest")

            serialised_manifest = manifest.to_primitive()

            self.assertEqual(
                serialised_manifest,
                {
                    "id": manifest.id,
                    "name": "my-manifest",
                    "datasets": {
                        "my_dataset_0":
                        os.path.join(temporary_directory, "my_dataset_0"),
                        "my_dataset_1":
                        os.path.join(temporary_directory, "my_dataset_1"),
                    },
                },
            )

            deserialised_manifest = Manifest.deserialise(serialised_manifest)

        self.assertEqual(manifest.name, deserialised_manifest.name)
        self.assertEqual(manifest.id, deserialised_manifest.id)

        for key in manifest.datasets.keys():
            self.assertEqual(manifest.datasets[key].name,
                             deserialised_manifest.datasets[key].name)
            self.assertEqual(manifest.datasets[key].id,
                             deserialised_manifest.datasets[key].id)
            self.assertEqual(manifest.datasets[key].path,
                             deserialised_manifest.datasets[key].path)
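
For orientation, a condensed sketch of the round trip this test exercises, assuming `manifest` is any `Manifest` instance like the one built above:

serialised = manifest.to_primitive()          # A dict of primitives, safe to JSON-encode.
recreated = Manifest.deserialise(serialised)  # Rebuilds an equivalent Manifest.
assert recreated.id == manifest.id and recreated.name == manifest.name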
Example #3
    def test_filter_name_filters_exclude_path(self):
        """Ensures that filters applied to the name will not catch terms in the rest of the path"""
        resource = Dataset(files=[
            Datafile(path="first-path-within-dataset/a_test_file.csv"),
            Datafile(path="second-path-within-dataset/a_test_file.txt"),
        ])
        files = resource.files.filter(name__icontains="second")
        self.assertEqual(0, len(files))
Example #4
    def test_filter_name_filters_include_extension(self):
        """Ensures that filters applied to the name will catch terms in the extension"""
        files = [
            Datafile(path="path-within-dataset/a_test_file.csv"),
            Datafile(path="path-within-dataset/a_test_file.txt"),
        ]

        self.assertEqual(
            Dataset(files=files).files.filter(
                name__icontains="txt").pop().path,
            FilterSet({files[1]}).pop().local_path)
Example #5
    def create_valid_dataset(self, **kwargs):
        """Create a valid dataset with two valid datafiles (they're the same file in this case)."""
        path = os.path.join(self.data_path, "basic_files", "configuration", "test-dataset")

        return Dataset(
            path=path,
            files=[
                Datafile(path=os.path.join(path, "path-within-dataset", "a_test_file.csv")),
                Datafile(path=os.path.join(path, "path-within-dataset", "another_test_file.csv")),
            ],
            **kwargs
        )
Example #6
    def test_all_datasets_are_in_cloud(self):
        """Test whether all files of all datasets in a manifest are in the cloud or not can be determined."""
        self.assertFalse(
            self.create_valid_manifest().all_datasets_are_in_cloud)
        self.assertTrue(Manifest().all_datasets_are_in_cloud)

        files = [
            Datafile(path="gs://hello/file.txt"),
            Datafile(path="gs://goodbye/file.csv")
        ]
        manifest = Manifest(datasets={"my_dataset": Dataset(files=files)})
        self.assertTrue(manifest.all_datasets_are_in_cloud)
Example #7
    def test_exists_in_cloud(self):
        """Test whether all files of a dataset are in the cloud or not can be determined."""
        self.assertFalse(self.create_valid_dataset().all_files_are_in_cloud)

        with tempfile.TemporaryDirectory() as temporary_directory:
            self.assertTrue(
                Dataset(path=temporary_directory).all_files_are_in_cloud)

        files = [
            Datafile(path="gs://hello/file.txt"),
            Datafile(path="gs://goodbye/file.csv")
        ]
        self.assertTrue(Dataset(files=files).all_files_are_in_cloud)
Example #8
    def test_filter_catches_single_underscore_mistake(self):
        """Ensure that if the filter name contains only single underscores, an error is raised."""
        resource = Dataset(files=[
            Datafile(path="path-within-dataset/A_Test_file.csv"),
            Datafile(path="path-within-dataset/a_test_file.txt"),
        ])

        with self.assertRaises(exceptions.InvalidInputException) as e:
            resource.files.filter(name_icontains="Test")

        self.assertIn(
            "Invalid filter name 'name_icontains'. Filter names should be in the form",
            e.exception.args[0])
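
The convention this test guards is that filter names take the form `<attribute>__<lookup>`, with a double underscore separating the two parts. A short sketch of valid and invalid forms, assuming `dataset` is a `Dataset` like the one above:

dataset.files.filter(name__icontains="test")    # Valid: attribute "name", lookup "icontains".
dataset.files.filter(labels__contains="all")    # Valid: attribute "labels", lookup "contains".
# dataset.files.filter(name_icontains="test")   # Invalid: a single underscore raises InvalidInputException.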
Example #9
    def test_filter_name_contains(self):
        """Ensures that filter works with the name__contains and name__icontains lookups"""
        resource = Dataset(files=[
            Datafile(path="path-within-dataset/A_Test_file.csv"),
            Datafile(path="path-within-dataset/a_test_file.txt"),
        ])
        files = resource.files.filter(name__icontains="Test")
        self.assertEqual(2, len(files))
        files = resource.files.filter(name__icontains="A")
        self.assertEqual(2, len(files))
        files = resource.files.filter(name__contains="Test")
        self.assertEqual(1, len(files))
        files = resource.files.filter(name__icontains="test")
        self.assertEqual(2, len(files))
        files = resource.files.filter(name__icontains="file")
        self.assertEqual(2, len(files))
Example #10
    def test_generating_signed_url_from_dataset_and_recreating_dataset_from_it(
            self):
        """Test that a signed URL can be generated for a dataset that can be used to recreate/get it, its metadata, and
        all its files.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset_local_path = os.path.join(temporary_directory,
                                              "my-dataset-to-sign")

            with Datafile(path=os.path.join(dataset_local_path, "my-file.dat"),
                          mode="w") as (datafile, f):
                f.write("hello")
                datafile.tags = {"my": "metadata"}

            dataset = Dataset(path=dataset_local_path, tags={"hello": "world"})
            dataset.upload(
                storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                              "my-dataset-to-sign"))

        with patch("google.cloud.storage.blob.Blob.generate_signed_url",
                   new=mock_generate_signed_url):
            signed_url = dataset.generate_signed_url()

        downloaded_dataset = Dataset(path=signed_url)
        self.assertEqual(downloaded_dataset.tags, {"hello": "world"})

        with downloaded_dataset.files.one() as (downloaded_datafile, f):
            self.assertEqual(f.read(), "hello")

        self.assertEqual(downloaded_datafile.name, "my-file.dat")
        self.assertEqual(downloaded_datafile.extension, "dat")
Example #11
    def test_metadata_hash_is_same_for_different_datasets_with_the_same_metadata(
            self):
        """Test that the metadata hash is the same for datasets with different files but the same metadata."""
        first_dataset = Dataset(labels={"a", "b", "c"})
        second_dataset = Dataset(
            files={Datafile(path="blah", hypothetical=True)},
            labels={"a", "b", "c"})
        self.assertEqual(first_dataset.metadata_hash_value,
                         second_dataset.metadata_hash_value)
Example #12
    def test_filter_by_label(self):
        """Ensures that filter works with label lookups"""
        resource = Dataset(files=[
            Datafile(path="path-within-dataset/a_my_file.csv",
                     labels="one a2 b3 all"),
            Datafile(path="path-within-dataset/a_your_file.csv",
                     labels="two a2 b3 all"),
            Datafile(path="path-within-dataset/a_your_file.csv",
                     labels="three all"),
        ])

        files = resource.files.filter(labels__contains="a")
        self.assertEqual(0, len(files))
        files = resource.files.filter(labels__contains="one")
        self.assertEqual(1, len(files))
        files = resource.files.filter(labels__contains="all")
        self.assertEqual(3, len(files))
        files = resource.files.filter(labels__any_label_starts_with="b")
        self.assertEqual(2, len(files))
        files = resource.files.filter(labels__any_label_ends_with="3")
        self.assertEqual(2, len(files))
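
Note what the first assertion implies: `labels__contains` matches whole labels rather than substrings, which is why filtering on "a" returns nothing even though "a2" and "all" contain it. A short recap, assuming `resource` is the dataset built above:

resource.files.filter(labels__contains="a")               # 0 results: no file carries the exact label "a".
resource.files.filter(labels__contains="all")             # 3 results: every file carries the label "all".
resource.files.filter(labels__any_label_starts_with="b")  # 2 results: files with a label beginning "b" (here "b3").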
Example #13
    def test_filter_name_with(self):
        """Ensures that filter works with the name__starts_with and name__ends_with lookups"""
        resource = Dataset(files=[
            Datafile(path="path-within-dataset/a_my_file.csv"),
            Datafile(path="path-within-dataset/a_your_file.csv"),
        ])
        files = resource.files.filter(name__starts_with="a_my")
        self.assertEqual(1, len(files))
        files = resource.files.filter(name__starts_with="a_your")
        self.assertEqual(1, len(files))
        files = resource.files.filter(name__starts_with="a_")
        self.assertEqual(2, len(files))
        files = resource.files.filter(name__starts_with="b")
        self.assertEqual(0, len(files))
        files = resource.files.filter(name__ends_with="_file.csv")
        self.assertEqual(2, len(files))
        files = resource.files.filter(name__ends_with="r_file.csv")
        self.assertEqual(1, len(files))
        files = resource.files.filter(name__ends_with="y_file.csv")
        self.assertEqual(1, len(files))
        files = resource.files.filter(name__ends_with="other.csv")
        self.assertEqual(0, len(files))
Example #14
    def test_finalise_with_upload(self):
        """Test that the `finalise` method can be used to upload the output manifest's datasets to a cloud location
        and that it updates the manifest with signed URLs for accessing them.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset_path = os.path.join(temporary_directory, "the_dataset")

            with Datafile(path=os.path.join(dataset_path, "my_file.dat"), mode="w") as (datafile, f):
                f.write("hello")

            output_manifest = Manifest(
                datasets={
                    "the_dataset": Dataset(
                        path=dataset_path, files={datafile.local_path}, labels={"one", "two", "three"}
                    )
                }
            )

            analysis = Analysis(
                twine={
                    "output_values_schema": {"type": "object", "properties": {"blah": {"type": "integer"}}},
                    "output_manifest": {"datasets": {"the_dataset": {"purpose": "testing"}}},
                },
                output_values={"blah": 3},
                output_manifest=output_manifest,
            )

            with patch("google.cloud.storage.blob.Blob.generate_signed_url", new=mock_generate_signed_url):
                analysis.finalise(upload_output_datasets_to=f"gs://{TEST_BUCKET_NAME}/datasets")

        signed_url_for_dataset = analysis.output_manifest.datasets["the_dataset"].path
        self.assertTrue(storage.path.is_url(signed_url_for_dataset))

        self.assertTrue(
            signed_url_for_dataset.startswith(
                f"{self.test_result_modifier.storage_emulator_host}/{TEST_BUCKET_NAME}/datasets/the_dataset"
            )
        )

        downloaded_dataset = Dataset(path=signed_url_for_dataset)
        self.assertEqual(downloaded_dataset.name, "the_dataset")
        self.assertEqual(len(downloaded_dataset.files), 1)
        self.assertEqual(downloaded_dataset.labels, {"one", "two", "three"})

        with downloaded_dataset.files.one() as (downloaded_datafile, f):
            self.assertEqual(f.read(), "hello")
Example #15
def run(analysis):
    logger.info("Started example analysis.")
    do_something()
    time.sleep(2)
    analysis.output_values = [1, 2, 3, 4, 5]

    with tempfile.TemporaryDirectory() as temporary_directory:

        with Datafile(os.path.join(temporary_directory, "output.dat"),
                      mode="w") as (datafile, f):
            f.write("This is some example service output.")

        analysis.output_manifest.datasets["example_dataset"] = Dataset(
            path=temporary_directory, files={datafile})
        analysis.finalise(upload_output_datasets_to=analysis.output_location)

    logger.info("Finished example analysis.")
Example #16
    def test_adding_cloud_datafile_to_local_dataset(self):
        """Test that when a cloud datafile is added to a local dataset, it is downloaded to the root of the dataset."""
        with Datafile(path=storage.path.generate_gs_path(
                TEST_BUCKET_NAME, "path", "to", "datafile.dat"),
                      mode="w") as (datafile, f):
            f.write("hello")

        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = Dataset(path=os.path.join(temporary_directory, "path",
                                                "to", "dataset"))
            dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.local_path,
                         os.path.join(dataset.path, "datafile.dat"))
Example #17
    def test_ask_with_input_manifest_with_local_paths_works_if_allowed_and_child_has_access_to_the_local_paths(
            self):
        """Test that an input manifest referencing local files can be used if the files can be accessed by the child and
        the `allow_local_files` parameter is `True`.
        """
        temporary_local_path = tempfile.NamedTemporaryFile(delete=False).name

        with open(temporary_local_path, "w") as f:
            f.write("This is a local file.")

        local_file = Datafile(path=temporary_local_path)
        self.assertFalse(local_file.exists_in_cloud)

        manifest = Manifest(
            datasets={
                "my-local-dataset":
                Dataset(name="my-local-dataset", files={local_file})
            })

        # Get the child to open the local file itself and return the contents as output.
        def run_function(analysis_id, input_values, input_manifest,
                         analysis_log_handler, handle_monitor_message):
            with open(temporary_local_path) as f:
                return MockAnalysis(output_values=f.read())

        child = MockService(backend=BACKEND, run_function=run_function)
        parent = MockService(backend=BACKEND, children={child.id: child})

        with patch("octue.cloud.pub_sub.service.Topic", new=MockTopic):
            with patch("octue.cloud.pub_sub.service.Subscription",
                       new=MockSubscription):
                with patch("google.cloud.pubsub_v1.SubscriberClient",
                           new=MockSubscriber):
                    child.serve()

                    answer = self.ask_question_and_wait_for_answer(
                        parent=parent,
                        child=child,
                        input_values={},
                        input_manifest=manifest,
                        allow_local_files=True,
                    )

        self.assertEqual(answer["output_values"], "This is a local file.")
Example #18
    def test_adding_cloud_datafile_to_cloud_dataset_when_file_is_already_in_dataset_directory(
            self):
        """Test that a cloud datafile's path is kept as-is when adding it to a cloud dataset if it is already in the
        dataset directory and no `path_in_dataset` is provided.
        """
        dataset = Dataset(path=storage.path.generate_gs_path(
            TEST_BUCKET_NAME, "path", "to", "dataset"))

        with Datafile(path=storage.path.join(dataset.path, "subfolder",
                                             "datafile.dat"),
                      mode="w") as (datafile, f):
            f.write("hello")

        dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(
            datafile.cloud_path,
            storage.path.join(dataset.path, "subfolder", "datafile.dat"))
Example #19
    def test_adding_cloud_datafile_to_cloud_dataset(self):
        """Test that a cloud datafile can be added to a cloud dataset and that it's copied into the dataset root if no
        `path_in_dataset` is provided.
        """
        dataset = Dataset(path=storage.path.generate_gs_path(
            TEST_BUCKET_NAME, "path", "to", "dataset"))

        with Datafile(path=storage.path.generate_gs_path(
                TEST_BUCKET_NAME, "path", "to", "datafile.dat"),
                      mode="w") as (datafile, f):
            f.write("hello")

        dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.cloud_path,
                         storage.path.join(dataset.path, "datafile.dat"))
Example #20
    def test_adding_local_datafile_to_local_dataset_when_file_is_already_in_dataset_directory(
            self):
        """Test that a local datafile's path is kept as-is when adding it to a local dataset if it is already in the
        dataset directory.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = Dataset(path=os.path.join(temporary_directory, "path",
                                                "to", "dataset"))

            with Datafile(path=os.path.join(dataset.path, "subfolder",
                                            "datafile.dat"),
                          mode="w") as (datafile, f):
                f.write("hello")

            dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(
            datafile.local_path,
            os.path.join(dataset.path, "subfolder", "datafile.dat"))
Example #21
    def test_adding_local_datafile_to_local_dataset(self):
        """Test that a local datafile can be added to a local dataset and that it is copied to the root of the dataset
        if no `path_in_dataset` is provided.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = Dataset(path=os.path.join(temporary_directory, "path",
                                                "to", "dataset"))

            with Datafile(path=os.path.join(temporary_directory, "path", "to",
                                            "datafile.dat"),
                          mode="w") as (datafile, f):
                f.write("hello")

            dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.local_path,
                         os.path.join(dataset.path, "datafile.dat"))
Example #22
    def test_finalise_validates_output(self):
        """Test that the `finalise` method with no other arguments just validates the output manifest and values."""
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset_path = os.path.join(temporary_directory, "the_dataset")

            with Datafile(path=os.path.join(dataset_path, "my_file.dat"), mode="w") as (datafile, f):
                f.write("hello")

            output_manifest = Manifest(
                datasets={"the_dataset": Dataset(path=dataset_path, files={datafile.local_path})}
            )

            analysis = Analysis(
                twine={
                    "output_values_schema": {"type": "object", "properties": {"blah": {"type": "integer"}}},
                    "output_manifest": {"datasets": {"the_dataset": {"purpose": "testing"}}},
                },
                output_values={"blah": 3},
                output_manifest=output_manifest,
            )

            analysis.finalise()
Example #23
    def test_providing_path_when_adding_cloud_datafile_to_cloud_dataset_copies_datafile_to_path(
            self):
        """Test that providing the `path_within_dataset` parameter when adding a cloud datafile to a cloud dataset
        results in the datafile being copied to that location within the dataset.
        """
        dataset = Dataset(path=storage.path.generate_gs_path(
            TEST_BUCKET_NAME, "path", "to", "dataset"))

        with Datafile(path=storage.path.generate_gs_path(
                TEST_BUCKET_NAME, "path", "to", "datafile.dat"),
                      mode="w") as (datafile, f):
            f.write("hello")

        path_in_dataset = storage.path.join("another", "path", "datafile.dat")
        dataset.add(datafile, path_in_dataset=path_in_dataset)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.cloud_path,
                         storage.path.join(dataset.path, path_in_dataset))
Example #24
    def test_adding_local_datafile_to_cloud_dataset_uploads_it_to_dataset_root(
            self):
        """Test that, when adding a local datafile to a cloud dataset and `path_in_dataset` is not provided, the
        datafile is uploaded to the root of the dataset.
        """
        dataset = Dataset(path=storage.path.generate_gs_path(
            TEST_BUCKET_NAME, "path", "to", "dataset"))

        with tempfile.TemporaryDirectory() as temporary_directory:
            with Datafile(path=os.path.join(temporary_directory, "path", "to",
                                            "datafile.dat"),
                          mode="w") as (datafile, f):
                f.write("hello")

            dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.cloud_path,
                         storage.path.join(dataset.path, "datafile.dat"))
Example #25
    def test_providing_path_when_adding_local_datafile_to_local_dataset(self):
        """Test that providing the `path_within_dataset` parameter when adding a local datafile to a local dataset
        results in the datafile being copied to that location within the dataset.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = Dataset(path=os.path.join(temporary_directory, "path",
                                                "to", "dataset"))

            with Datafile(path=os.path.join(temporary_directory, "path", "to",
                                            "datafile.dat"),
                          mode="w") as (datafile, f):
                f.write("hello")

            path_in_dataset = os.path.join("another", "path", "datafile.dat")
            dataset.add(datafile, path_in_dataset=path_in_dataset)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.local_path,
                         os.path.join(dataset.path, path_in_dataset))
Example #26
    def test_providing_path_in_dataset_when_adding_cloud_datafile_to_local_dataset(
            self):
        """Test that when a cloud datafile is added to a local dataset and the `path_in_dataset` parameter is provided,
        it is downloaded to that path within the dataset.
        """
        with Datafile(path=storage.path.generate_gs_path(
                TEST_BUCKET_NAME, "path", "to", "datafile.dat"),
                      mode="w") as (
                          datafile,
                          f,
                      ):
            f.write("hello")

        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = Dataset(path=os.path.join(temporary_directory, "path",
                                                "to", "dataset"))

            path_in_dataset = os.path.join("another", "path", "datafile.dat")
            dataset.add(datafile, path_in_dataset=path_in_dataset)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.local_path,
                         os.path.join(dataset.path, path_in_dataset))
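
Taken together, the `Dataset.add` tests above imply the following behaviour; a hedged summary sketch (the `dataset` and `datafile` objects and the paths are illustrative):

# Cloud datafile + local dataset: downloaded into the dataset. Local datafile +
# cloud dataset: uploaded into it. A datafile already inside the dataset directory
# keeps its path as-is.
dataset.add(datafile)                                           # Lands at the dataset root.
dataset.add(datafile, path_in_dataset="another/path/file.dat")  # Lands at the given path within the dataset.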
Example #27
def run(analysis):
    """Read a time series of files from a dataset, clean them, and write a new, cleaned dataset.

    See the "fractal" template for an introduction to the analysis object and the purpose of this 'run' function.

    Here, let's create an example application designed to clean up CSV data files produced by an instrument, in this
    case a meteorological mast.

    The aim of this example is to teach you how to use input and output file manifests in an app - so what we'll do is:
    - Use the input manifest to read a sequence of files
    - Perform a simple transformation on some of the data (as if we were doing a data cleaning process)
    - Create new files containing the cleaned data
    - Add them to the output manifest

    :param octue.resources.Analysis analysis:
    :return None:
    """
    # You can use a logger to record debug statements, general information, warnings or errors.
    logger.info("Starting clean up of files in %s", analysis.input_manifest)

    # Get the configuration value for our time averaging window (or if not present, use the default specified in
    # the twine).
    time_window = analysis.configuration_values.get("time_window", 600)
    logger.info("Averaging window set to %ss", time_window)

    # Get the input dataset which will be read in.
    input_dataset = analysis.input_manifest.get_dataset("raw_met_mast_data")

    # There are two types of files in the dataset: metadata files (saved daily) and measurement files (saved hourly).
    # Because a manifest has been created, we're able to get this data out easily with the dataset filtering
    # capabilities. Let's get the metadata and the timeseries files, whilst showing off a couple of the filters.
    # See the Dataset class help for more.
    metadata_file = input_dataset.get_file_by_label("meta")

    timeseries_files = input_dataset.files.filter(
        labels__contains="timeseries").order_by(
            "tags__sequence",
            check_start_value=0,
            check_constant_increment=1,
        )

    # We used these because they're special helpers - in this case ensuring that there's only one metadata file and
    # ensuring that the timeseries files come in a strictly ordered sequence.
    #
    # We could also have picked up one or more files using general filters, like so:
    #
    #    metadata_files = input_dataset.files.filter(name__icontains="meta")
    #
    # There are generally a few ways to do it. Choose one which is likely to be most consistent - for example, if your
    # filenames might be subject to change, but you have better control over the labels, rely on those.

    # At this point it's over to you, to do whatever you want with the contents of these files.
    # For this example app, we will:

    # Use a custom function to read in the strange metadata file that came with the dataset.
    metadata = read_dat_file(metadata_file)

    # Read the sequence of CSV files and concatenate into a pandas dataframe (like a table).
    data = read_csv_files(timeseries_files)

    # Clean the timeseries data up.
    data = clean(data, metadata["date"])

    # Create a temporary directory for the output dataset. This avoids any race conditions arising (if other instances
    # of this application are running at the same time) and avoids any data loss due to overwriting. The temporary
    # directory is deleted once the "with" block is exited.
    with tempfile.TemporaryDirectory() as temporary_directory:
        timeseries_datafile = Datafile(
            path=os.path.join(temporary_directory, "cleaned.csv"),
            labels=["timeseries"],
        )

        # Write the file (now we know where to write it)
        with timeseries_datafile.open("w") as fp:
            data.to_csv(path_or_buf=fp)

        # You can replace empty output datasets with datasets instantiated from a local or cloud directory.
        analysis.output_manifest.datasets["cleaned_met_mast_data"] = Dataset(
            path=temporary_directory,
            name="cleaned_met_mast_data",
        )

        # We'll add some labels, which will help to improve searchability and allow other apps, reports, users and
        # analyses to find this dataset automatically and use it.
        #
        # Labels are case-insensitive and accept a-z, 0-9, and hyphens, which can be used literally in search and are
        # also used to separate words in natural-language search.
        analysis.output_manifest.get_dataset(
            "cleaned_met_mast_data").labels = ["met", "mast", "cleaned"]

        # Finalise the analysis. This validates the output data and output manifest against the twine and optionally
        # uploads any datasets in the output manifest to the service's cloud bucket. Signed URLs are provided so that
        # the parent that asked the service for the analysis can access the data (until the signed URLs expire).
        analysis.finalise(
            upload_output_datasets_to=f"gs://{TEST_BUCKET_NAME}/output/test_using_manifests_analysis")
Example #28
    def test_instantiates_with_kwargs(self):
        """Ensures that keyword arguments can be used to construct the dataset initially"""
        files = [Datafile(path="path-within-dataset/a_test_file.csv")]
        resource = Dataset(files=files, labels="one two")
        self.assertEqual(len(resource.files), 1)