Example #1
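Opens a multi-level dataset from the local file system: depending on the descriptor's Format entry, the path is read as a NetCDF or Zarr cube and wrapped in a BaseMultiLevelDataset, or opened directly as a FileStorageMultiLevelDataset for the levels format.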
def open_ml_dataset_from_local_fs(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")

    if not os.path.isabs(path):
        path = os.path.join(ctx.base_dir, path)

    data_format = dataset_descriptor.get('Format', guess_cube_format(path))

    if data_format == FORMAT_NAME_NETCDF4:
        with measure_time(tag=f"opened local NetCDF dataset {path}"):
            ds = assert_cube(xr.open_dataset(path))
            return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_ZARR:
        with measure_time(tag=f"opened local zarr dataset {path}"):
            ds = assert_cube(xr.open_zarr(path))
            return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened local levels dataset {path}"):
            return FileStorageMultiLevelDataset(path)

    raise ServiceConfigError(
        f"Illegal data format {data_format!r} for dataset {ds_id}")
Example #2
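Unit test for ComputedMultiLevelDataset: a Python script computes a new multi-level dataset from a BaseMultiLevelDataset input, and the test checks the number of levels, the tile grid, and the dimensions of each level.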
    def test_it(self):
        ds = _get_test_dataset()

        ml_ds1 = BaseMultiLevelDataset(ds)

        def input_ml_dataset_getter(ds_id):
            if ds_id == "ml_ds1":
                return ml_ds1
            self.fail(f"unexpected ds_id={ds_id!r}")

        ml_ds2 = ComputedMultiLevelDataset(os.path.join(os.path.dirname(__file__),
                                                        "..", "webapi", "res", "test", "script.py"),
                                           "compute_dataset",
                                           ["ml_ds1"],
                                           input_ml_dataset_getter,
                                           input_parameters=dict(period='1W'),
                                           ds_id="ml_ds2")
        self.assertEqual(3, ml_ds2.num_levels)
        self.assertEqual(TileGrid(3, 2, 1, 180, 180, (-180, -90, 180, 90), inv_y=False),
                         ml_ds2.tile_grid)

        ds0 = ml_ds2.get_dataset(0)
        self.assertEqual({'time': 3, 'lat': 720, 'lon': 1440, 'bnds': 2}, ds0.dims)

        ds1 = ml_ds2.get_dataset(1)
        self.assertEqual({'time': 3, 'lat': 360, 'lon': 720}, ds1.dims)

        ds2 = ml_ds2.get_dataset(2)
        self.assertEqual({'time': 3, 'lat': 180, 'lon': 360}, ds2.dims)

        self.assertEqual([ds0, ds1, ds2], ml_ds2.datasets)

        ml_ds1.close()
        ml_ds2.close()
Example #3
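Unit test for CombinedMultiLevelDataset: three BaseMultiLevelDataset instances are combined, and each level must contain the union of their variables at the expected resolution.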
    def test_it(self):
        ml_ds_1 = BaseMultiLevelDataset(_get_test_dataset(('noise_1', 'noise_2')))
        ml_ds_2 = BaseMultiLevelDataset(_get_test_dataset(('noise_3', 'noise_4')))
        ml_ds_3 = BaseMultiLevelDataset(_get_test_dataset(('noise_5', 'noise_6')))

        ml_ds = CombinedMultiLevelDataset([ml_ds_1, ml_ds_2, ml_ds_3])

        self.assertEqual(3, ml_ds.num_levels)
        self.assertEqual(TileGrid(3, 2, 1, 180, 180, (-180, -90, 180, 90), inv_y=False),
                         ml_ds.tile_grid)

        expected_var_names = {'noise_1', 'noise_2',
                              'noise_3', 'noise_4',
                              'noise_5', 'noise_6'}

        ds0 = ml_ds.get_dataset(0)
        self.assertEqual({'time': 14, 'lat': 720, 'lon': 1440, 'bnds': 2}, ds0.dims)
        self.assertEqual(expected_var_names, set(map(str, ds0.data_vars)))
        self.assertTrue(all(v.dims == ('time', 'lat', 'lon') for v in ds0.data_vars.values()))

        ds1 = ml_ds.get_dataset(1)
        self.assertEqual({'time': 14, 'lat': 360, 'lon': 720}, ds1.dims)
        self.assertEqual(expected_var_names, set(map(str, ds1.data_vars)))
        self.assertTrue(all(v.dims == ('time', 'lat', 'lon') for v in ds1.data_vars.values()))

        ds2 = ml_ds.get_dataset(2)
        self.assertEqual({'time': 14, 'lat': 180, 'lon': 360}, ds2.dims)
        self.assertEqual(expected_var_names, set(map(str, ds2.data_vars)))
        self.assertTrue(all(v.dims == ('time', 'lat', 'lon') for v in ds2.data_vars.values()))

        self.assertEqual([ds0, ds1, ds2], ml_ds.datasets)

        ml_ds.close()
Example #4
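Checks that new_data_descriptor() returns a MultiLevelDatasetDescriptor with data type 'mldataset' for a BaseMultiLevelDataset.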
    def test_new_ml_dataset_descriptor(self):
        cube = new_cube(variables=dict(a=4.1, b=7.4))
        ml_cube = BaseMultiLevelDataset(cube)
        descriptor = new_data_descriptor('cube', ml_cube)
        self.assertExpectedDescriptor(descriptor,
                                      MultiLevelDatasetDescriptor,
                                      'mldataset')
Example #5
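Basic BaseMultiLevelDataset test: level 0 is the original dataset, higher levels are spatially downsampled copies, and num_levels and tile_grid have the expected values.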
    def test_it(self):
        ds = _get_test_dataset()

        ml_ds = BaseMultiLevelDataset(ds)

        self.assertIsInstance(ml_ds.ds_id, str)

        self.assertEqual(3, ml_ds.num_levels)
        self.assertEqual(
            TileGrid(3, 2, 1, 180, 180, (-180, -90, 180, 90), inv_y=False),
            ml_ds.tile_grid)

        ds0 = ml_ds.get_dataset(0)
        self.assertIs(ds, ds0)

        ds1 = ml_ds.get_dataset(1)
        self.assertIsNot(ds, ds1)
        self.assertEqual({'time': 14, 'lat': 360, 'lon': 720}, ds1.dims)

        ds2 = ml_ds.get_dataset(2)
        self.assertIsNot(ds, ds2)
        self.assertEqual({'time': 14, 'lat': 180, 'lon': 360}, ds2.dims)

        self.assertEqual([ds0, ds1, ds2], ml_ds.datasets)

        ml_ds.close()
Example #6
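Variant of the previous test for a tile grid of type ImageTileGrid; here level 0 is also a new dataset object rather than the input dataset itself.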
    def test_it(self):
        ds = _get_test_dataset()

        ml_ds = BaseMultiLevelDataset(ds)

        self.assertIsInstance(ml_ds.ds_id, str)

        self.assertEqual(3, ml_ds.num_levels)
        self.assertIsInstance(ml_ds.tile_grid, ImageTileGrid)
        self.assertEqual((180, 180), ml_ds.tile_grid.tile_size)
        self.assertEqual(3, ml_ds.tile_grid.num_levels)

        ds0 = ml_ds.get_dataset(0)
        self.assertIsNot(ds, ds0)
        self.assertEqual({
            'time': 14,
            'lat': 720,
            'lon': 1440,
            'bnds': 2
        }, ds0.dims)

        ds1 = ml_ds.get_dataset(1)
        self.assertIsNot(ds, ds1)
        self.assertEqual({'time': 14, 'lat': 360, 'lon': 720}, ds1.dims)

        ds2 = ml_ds.get_dataset(2)
        self.assertIsNot(ds, ds2)
        self.assertEqual({'time': 14, 'lat': 180, 'lon': 360}, ds2.dims)

        self.assertEqual([ds0, ds1, ds2], ml_ds.datasets)

        ml_ds.close()
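A minimal sketch of the pattern exercised by the two tests above. It is not part of the test code and assumes xcube's new_cube() to create a small synthetic cube:

from xcube.core.new import new_cube
from xcube.core.mldataset import BaseMultiLevelDataset

cube = new_cube(variables=dict(chl=0.5))   # small synthetic global cube
ml_ds = BaseMultiLevelDataset(cube)
print(ml_ds.num_levels, ml_ds.tile_grid)
for level in range(ml_ds.num_levels):
    # each level halves the spatial resolution of the previous one
    print(level, dict(ml_ds.get_dataset(level).dims))
ml_ds.close()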
Example #7
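Checks get_type_specifier() for a plain dict, a data cube, an empty xr.Dataset, a BaseMultiLevelDataset, and a GeoDataFrame.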
    def test_get_type_specifier(self):
        self.assertIsNone(get_type_specifier(dict()))
        self.assertEqual(get_type_specifier(new_cube()), TYPE_SPECIFIER_CUBE)
        self.assertEqual(get_type_specifier(xr.Dataset()),
                         TYPE_SPECIFIER_DATASET)
        self.assertEqual(
            get_type_specifier(BaseMultiLevelDataset(xr.Dataset())),
            TYPE_SPECIFIER_MULTILEVEL_DATASET)
        self.assertEqual(get_type_specifier(gpd.GeoDataFrame()),
                         TYPE_SPECIFIER_GEODATAFRAME)
Example #8
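Opens a multi-level dataset for a dataset configuration: either from a configured data store (injecting the chunk cache capacity as cache_size for Zarr and levels paths) or, for the 'memory' file system, from Python code; an optional Augmentation entry adds computed variables via augment_ml_dataset().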
    def _open_ml_dataset(self, dataset_config: DatasetConfigDict) \
            -> MultiLevelDataset:
        ds_id: str = dataset_config.get('Identifier')
        store_instance_id = dataset_config.get('StoreInstanceId')
        if store_instance_id:
            data_store_pool = self.get_data_store_pool()
            data_store = data_store_pool.get_store(store_instance_id)
            data_id = dataset_config.get('Path')
            open_params = dataset_config.get('StoreOpenParams') or {}
            # Inject chunk_cache_capacity into open parameters
            chunk_cache_capacity = self.get_dataset_chunk_cache_capacity(
                dataset_config)
            if chunk_cache_capacity \
                    and (data_id.endswith('.zarr')
                         or data_id.endswith('.levels')) \
                    and 'cache_size' not in open_params:
                open_params['cache_size'] = chunk_cache_capacity
            with self.measure_time(tag=f"opened dataset {ds_id!r}"
                                       f" from data store"
                                       f" {store_instance_id!r}"):
                dataset = data_store.open_data(data_id, **open_params)
            if isinstance(dataset, MultiLevelDataset):
                ml_dataset = dataset
            else:
                cube, _, _ = decode_cube(dataset,
                                         normalize=True,
                                         force_non_empty=True,
                                         force_geographic=True)
                ml_dataset = BaseMultiLevelDataset(cube, ds_id=ds_id)
        else:
            fs_type = dataset_config.get('FileSystem')
            if fs_type != 'memory':
                raise ServiceConfigError(f"Invalid FileSystem {fs_type!r}"
                                         f" in dataset configuration"
                                         f" {ds_id!r}")
            with self.measure_time(tag=f"opened dataset {ds_id!r}"
                                       f" from {fs_type!r}"):
                ml_dataset = _open_ml_dataset_from_python_code(
                    self, dataset_config)
        augmentation = dataset_config.get('Augmentation')
        if augmentation:
            script_path = self.get_config_path(
                augmentation,
                f"'Augmentation' of dataset configuration {ds_id}")
            input_parameters = augmentation.get('InputParameters')
            callable_name = augmentation.get('Function', COMPUTE_VARIABLES)
            ml_dataset = augment_ml_dataset(ml_dataset,
                                            script_path,
                                            callable_name,
                                            self.get_ml_dataset,
                                            self.set_ml_dataset,
                                            input_parameters=input_parameters,
                                            exception_type=ServiceConfigError)
        return ml_dataset
Example #9
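Tests _get_filename_ext(): '.zarr' for an xr.Dataset, '.geojson' for a GeoDataFrame, '.levels' for a BaseMultiLevelDataset, and None for unsupported objects.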
    def test_get_filename_ext(self):
        import xarray as xr
        import geopandas as gpd
        from xcube.core.mldataset import BaseMultiLevelDataset

        dataset = xr.Dataset()
        self.assertEqual('.zarr', self.store._get_filename_ext(dataset))
        frame = gpd.GeoDataFrame()
        self.assertEqual('.geojson', self.store._get_filename_ext(frame))
        mldataset = BaseMultiLevelDataset(base_dataset=dataset)
        self.assertEqual('.levels', self.store._get_filename_ext(mldataset))

        self.assertIsNone(self.store._get_filename_ext(None))
        self.assertIsNone(
            self.store._get_filename_ext(DataStoreError('A nonsense object')))
Example #10
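Writes a test cube pyramid to an S3 bucket served by a Moto endpoint: the bucket is created with s3fs, a single-variable subset of a demo Zarr cube is wrapped in a BaseMultiLevelDataset, and write_levels() stores it as a levels dataset.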
    def _write_test_cube_pyramid(cls):
        s3_kwargs = dict(key='test_fake_id', secret='test_fake_secret')
        s3_client_kwargs = dict(endpoint_url=MOTO_SERVER_ENDPOINT_URL)
        s3 = s3fs.S3FileSystem(**s3_kwargs, client_kwargs=s3_client_kwargs)
        # Create bucket 'xcube-test', so it exists before we write a test pyramid
        s3.mkdir('xcube-test')

        # Create a test cube pyramid with just one variable "conc_chl"
        zarr_path = os.path.join(os.path.dirname(__file__), '../../examples/serve/demo/cube-1-250-250.zarr')
        base_dataset = xr.open_zarr(zarr_path)
        base_dataset = cls._make_subset(base_dataset)
        ml_dataset = BaseMultiLevelDataset(base_dataset)

        # Write test cube pyramid
        write_levels(ml_dataset,
                     'xcube-test/cube-1-250-250.levels',
                     s3_kwargs=s3_kwargs,
                     s3_client_kwargs=s3_client_kwargs)
Example #11
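Same test setup, but the bucket is created with boto3; write_levels() receives the fake S3 credentials through client_kwargs and writes the pyramid to an s3.amazonaws.com URL.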
    def _write_test_cube_pyramid(cls):
        # Create bucket 'xcube-test', so it exists before we write a test pyramid
        s3_conn = boto3.client('s3')
        s3_conn.create_bucket(Bucket='xcube-test', ACL='public-read')

        # Create a test cube pyramid with just one variable "conc_chl"
        zarr_path = os.path.join(
            os.path.dirname(__file__),
            '../../examples/serve/demo/cube-1-250-250.zarr')
        base_dataset = xr.open_zarr(zarr_path)
        base_dataset = xr.Dataset(dict(conc_chl=base_dataset.conc_chl))
        ml_dataset = BaseMultiLevelDataset(base_dataset)

        # Write test cube pyramid
        write_levels(
            ml_dataset,
            'https://s3.amazonaws.com/xcube-test/cube-1-250-250.levels',
            client_kwargs=dict(provider_access_key_id='test_fake_id',
                               provider_secret_access_key='test_fake_secret'))
Example #12
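Opens a multi-level dataset from object storage via s3fs: Zarr cubes are read through an LRU-cached store (using consolidated metadata if present), while the levels format is opened as an ObjectStorageMultiLevelDataset.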
def open_ml_dataset_from_object_storage(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")

    data_format = dataset_descriptor.get('Format', FORMAT_NAME_ZARR)

    s3_client_kwargs = {}
    if 'Endpoint' in dataset_descriptor:
        s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
    if 'Region' in dataset_descriptor:
        s3_client_kwargs['region_name'] = dataset_descriptor['Region']
    obs_file_system = s3fs.S3FileSystem(anon=True,
                                        client_kwargs=s3_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2**28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_file_system.exists(f'{path}/.zmetadata')
            ds = assert_cube(
                xr.open_zarr(cached_store, consolidated=consolidated))
        return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                ds_id,
                obs_file_system,
                path,
                exception_type=ServiceConfigError)

    raise ServiceConfigError(
        f"Illegal data format {data_format!r} for dataset {ds_id}")
Example #13
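write_data() implementation for a filesystem data store: an xr.Dataset is optionally re-chunked to tile_size and wrapped in a BaseMultiLevelDataset, then each level is written as '{index}.zarr'; with base_dataset_id, level zero is replaced by a '0.link' file holding a relative path to the base dataset.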
    def write_data(self,
                   data: Union[xr.Dataset, MultiLevelDataset],
                   data_id: str,
                   replace: bool = False,
                   **write_params) -> str:
        assert_instance(data, (xr.Dataset, MultiLevelDataset), name='data')
        assert_instance(data_id, str, name='data_id')
        tile_size = write_params.pop('tile_size', None)
        if isinstance(data, MultiLevelDataset):
            ml_dataset = data
            if tile_size:
                warnings.warn('tile_size is ignored for multi-level datasets')
        else:
            base_dataset: xr.Dataset = data
            if tile_size:
                assert_instance(tile_size, int, name='tile_size')
                gm = GridMapping.from_dataset(base_dataset)
                x_name, y_name = gm.xy_dim_names
                base_dataset = base_dataset.chunk({
                    x_name: tile_size,
                    y_name: tile_size
                })
            ml_dataset = BaseMultiLevelDataset(base_dataset)
        fs, root, write_params = self.load_fs(write_params)
        consolidated = write_params.pop('consolidated', True)
        use_saved_levels = write_params.pop('use_saved_levels', False)
        base_dataset_id = write_params.pop('base_dataset_id', None)

        if use_saved_levels:
            ml_dataset = BaseMultiLevelDataset(ml_dataset.get_dataset(0),
                                               tile_grid=ml_dataset.tile_grid)

        path_class = get_fs_path_class(fs)
        data_path = path_class(data_id)
        fs.mkdirs(str(data_path), exist_ok=replace)

        for index in range(ml_dataset.num_levels):
            level_dataset = ml_dataset.get_dataset(index)
            if base_dataset_id and index == 0:
                # Write file "0.link" instead of copying
                # level zero dataset to "0.zarr".

                # Compute a relative base dataset path first
                base_dataset_path = path_class(root, base_dataset_id)
                data_parent_path = data_path.parent
                try:
                    base_dataset_path = base_dataset_path.relative_to(
                        data_parent_path)
                except ValueError as e:
                    raise DataStoreError(
                        f'invalid base_dataset_id: {base_dataset_id}') from e
                base_dataset_path = '..' / base_dataset_path

                # Then write relative base dataset path into link file
                link_path = data_path / f'{index}.link'
                with fs.open(str(link_path), mode='w') as fp:
                    fp.write(f'{base_dataset_path}')
            else:
                # Write level "{index}.zarr"
                level_path = data_path / f'{index}.zarr'
                zarr_store = fs.get_mapper(str(level_path), create=True)
                try:
                    level_dataset.to_zarr(zarr_store,
                                          mode='w' if replace else None,
                                          consolidated=consolidated,
                                          **write_params)
                except ValueError as e:
                    # TODO: remove already written data!
                    raise DataStoreError(f'Failed to write'
                                         f' dataset {data_id}: {e}') from e
                if use_saved_levels:
                    level_dataset = xr.open_zarr(zarr_store,
                                                 consolidated=consolidated)
                    ml_dataset.set_dataset(index, level_dataset)

        return data_id
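A hypothetical usage sketch (not taken from the examples above), assuming this write_data() backs the '.levels' writer of the filesystem data store registered as 'file'; the store name, root directory, variable, and tile size below are illustrative only:

from xcube.core.new import new_cube
from xcube.core.store import new_data_store

store = new_data_store('file', root='/tmp/xcube-demo')   # assumed local root
cube = new_cube(variables=dict(chl=0.2))                  # synthetic demo cube
# tile_size is picked up by write_data() above and used to re-chunk the cube
data_id = store.write_data(cube, 'demo.levels', replace=True, tile_size=180)
ml_ds = store.open_data(data_id)                          # reopen the pyramid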