Example #1
def open_ml_dataset_from_local_fs(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'Path' entry in dataset descriptor {ds_id}")

    if not os.path.isabs(path):
        path = os.path.join(ctx.base_dir, path)

    data_format = dataset_descriptor.get('Format', guess_cube_format(path))

    if data_format == FORMAT_NAME_NETCDF4:
        with measure_time(tag=f"opened local NetCDF dataset {path}"):
            ds = assert_cube(xr.open_dataset(path))
            return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_ZARR:
        with measure_time(tag=f"opened local zarr dataset {path}"):
            ds = assert_cube(xr.open_zarr(path))
            return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened local levels dataset {path}"):
            return FileStorageMultiLevelDataset(path)

    raise ServiceConfigError(
        f"Illegal data format {data_format!r} for dataset {ds_id}")
Example #2
def open_ml_dataset_from_python_code(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'Path' entry in dataset descriptor {ds_id}")

    if not os.path.isabs(path):
        path = os.path.join(ctx.base_dir, path)

    callable_name = dataset_descriptor.get('Function', COMPUTE_DATASET)
    input_dataset_ids = dataset_descriptor.get('InputDatasets', [])
    input_parameters = dataset_descriptor.get('InputParameters', {})

    for input_dataset_id in input_dataset_ids:
        if not ctx.get_dataset_descriptor(input_dataset_id):
            raise ServiceConfigError(
                f"Invalid dataset descriptor {ds_id!r}: "
                f"Input dataset {input_dataset_id!r} of callable {callable_name!r} "
                f"must reference another dataset")

    with measure_time(tag=f"opened memory dataset {path}"):
        return ComputedMultiLevelDataset(ds_id,
                                         path,
                                         callable_name,
                                         input_dataset_ids,
                                         ctx.get_ml_dataset,
                                         input_parameters,
                                         exception_type=ServiceConfigError)
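
A hypothetical descriptor for such a computed dataset, restricted to the keys the function reads; the script name, input IDs, and parameters are illustrative:

computed_descriptor = {
    'Identifier': 'derived',
    'Path': 'compute_dataset.py',        # script containing the callable
    'Function': 'compute_dataset',       # falls back to COMPUTE_DATASET when omitted
    'InputDatasets': ['demo'],           # each entry must name another configured dataset
    'InputParameters': {'factor': 2.0},
}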
Example #3
 def get_chunk_cache_capacity(cls, config: Dict[str, Any], cache_size_key: str) -> Optional[int]:
     cache_size = config.get(cache_size_key, None)
     if not cache_size:
         return None
     elif isinstance(cache_size, str):
         try:
             cache_size = parse_mem_size(cache_size)
         except ValueError:
             raise ServiceConfigError(f'Invalid {cache_size_key}')
     elif not isinstance(cache_size, int) or cache_size < 0:
         raise ServiceConfigError(f'Invalid {cache_size_key}')
     return cache_size
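
The branches above map configuration values as follows; a sketch assuming the method is a classmethod on ServiceContext and that parse_mem_size understands unit suffixes ('DataStoreCacheSize' is an illustrative key):

key = 'DataStoreCacheSize'
ServiceContext.get_chunk_cache_capacity({}, key)           # None: key absent or falsy
ServiceContext.get_chunk_cache_capacity({key: '1G'}, key)  # parse_mem_size('1G') bytes
ServiceContext.get_chunk_cache_capacity({key: 1024}, key)  # 1024, returned unchanged
ServiceContext.get_chunk_cache_capacity({key: -1}, key)    # raises ServiceConfigError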
Example #4
 def from_config(cls, config: Dict[str, Any]) -> Optional['AuthConfig']:
     authentication = config.get('Authentication')
     if not authentication:
         return None
     domain = authentication.get('Domain')
     if not domain:
         raise ServiceConfigError('Missing key "Domain" in section "Authentication"')
     audience = authentication.get('Audience')
     if not audience:
         raise ServiceConfigError('Missing key "Audience" in section "Authentication"')
     algorithms = authentication.get('Algorithms', ['RS256'])
     if not algorithms:
         raise ServiceConfigError('Value for key "Algorithms" in section "Authentication" must not be empty')
     return AuthConfig(domain, audience, algorithms)
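
A minimal 'Authentication' section that passes this validation; the domain and audience values are placeholders:

config = {
    'Authentication': {
        'Domain': 'example.eu.auth0.com',
        'Audience': 'https://api.example.org/',
        # 'Algorithms' defaults to ['RS256'] when omitted
    }
}
auth_config = AuthConfig.from_config(config)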
Example #5
    def test_same_base_type(self):

        self.assertIsInstance(ServiceError(''), HTTPError)
        self.assertEqual(500, ServiceError('').status_code)
        self.assertEqual(503, ServiceError('', status_code=503).status_code)

        self.assertIsInstance(ServiceConfigError(''), ServiceError)
        self.assertEqual(500, ServiceConfigError('').status_code)

        self.assertIsInstance(ServiceBadRequestError(''), ServiceError)
        self.assertEqual(400, ServiceBadRequestError('').status_code)

        self.assertIsInstance(ServiceResourceNotFoundError(''), ServiceError)
        self.assertEqual(404, ServiceResourceNotFoundError('').status_code)
Example #6
File: context.py Project: manzt/xcube
def _open_ml_dataset_from_python_code(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')
    path = ctx.get_descriptor_path(dataset_descriptor,
                                   f"dataset descriptor {ds_id}")
    callable_name = dataset_descriptor.get('Function', COMPUTE_DATASET)
    input_dataset_ids = dataset_descriptor.get('InputDatasets', [])
    input_parameters = dataset_descriptor.get('InputParameters', {})

    for input_dataset_id in input_dataset_ids:
        if not ctx.get_dataset_descriptor(input_dataset_id):
            raise ServiceConfigError(
                f"Invalid dataset descriptor {ds_id!r}: "
                f"Input dataset {input_dataset_id!r} of callable {callable_name!r} "
                f"must reference another dataset")

    return open_ml_dataset_from_python_code(
        path,
        callable_name=callable_name,
        input_ml_dataset_ids=input_dataset_ids,
        input_ml_dataset_getter=ctx.get_ml_dataset,
        input_parameters=input_parameters,
        ds_id=ds_id,
        exception_type=ServiceConfigError)
Example #7
File: context.py Project: manzt/xcube
 def _open_ml_dataset(
         self, dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
     ds_id = dataset_descriptor.get('Identifier')
     fs_type = dataset_descriptor.get('FileSystem', 'local')
     if self._ml_dataset_openers and fs_type in self._ml_dataset_openers:
         ml_dataset_opener = self._ml_dataset_openers[fs_type]
     elif fs_type in _DEFAULT_MULTI_LEVEL_DATASET_OPENERS:
         ml_dataset_opener = _DEFAULT_MULTI_LEVEL_DATASET_OPENERS[fs_type]
     else:
         raise ServiceConfigError(
             f"Invalid fs={fs_type!r} in dataset descriptor {ds_id!r}")
     ml_dataset = ml_dataset_opener(self, dataset_descriptor)
     augmentation = dataset_descriptor.get('Augmentation')
     if augmentation:
         script_path = self.get_descriptor_path(
             augmentation, f"'Augmentation' of dataset descriptor {ds_id}")
         input_parameters = augmentation.get('InputParameters')
         callable_name = augmentation.get('Function', COMPUTE_VARIABLES)
         ml_dataset = augment_ml_dataset(ml_dataset,
                                         script_path,
                                         callable_name,
                                         self.get_ml_dataset,
                                         self.set_ml_dataset,
                                         input_parameters=input_parameters,
                                         exception_type=ServiceConfigError)
     return ml_dataset
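
A sketch of a dataset descriptor with an 'Augmentation' entry as handled above; the file system, script path, and parameters are placeholders, and 'Path' assumes that is the entry get_descriptor_path resolves:

dataset_descriptor = {
    'Identifier': 'demo',
    'FileSystem': 'local',                # selects the opener from the registry
    'Path': 'cubes/demo.zarr',
    'Augmentation': {
        'Path': 'augment_dataset.py',     # resolved via get_descriptor_path
        'Function': 'compute_variables',  # falls back to COMPUTE_VARIABLES when omitted
        'InputParameters': {'threshold': 0.5},
    },
}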
Example #8
def _open_ml_dataset_from_python_code(
        ctx: ServiceContext,
        dataset_config: DatasetConfigDict) -> MultiLevelDataset:
    ds_id = dataset_config.get('Identifier')
    path = ctx.get_config_path(dataset_config,
                               f"dataset configuration {ds_id}")
    callable_name = dataset_config.get('Function', COMPUTE_DATASET)
    input_dataset_ids = dataset_config.get('InputDatasets', [])
    input_parameters = dataset_config.get('InputParameters', {})
    chunk_cache_capacity = ctx.get_dataset_chunk_cache_capacity(dataset_config)
    if chunk_cache_capacity:
        warnings.warn(
            'chunk cache size is not effective for datasets computed from scripts'
        )
    for input_dataset_id in input_dataset_ids:
        if not ctx.get_dataset_config(input_dataset_id):
            raise ServiceConfigError(
                f"Invalid dataset configuration {ds_id!r}: "
                f"Input dataset {input_dataset_id!r} of callable {callable_name!r} "
                f"must reference another dataset")
    return open_ml_dataset_from_python_code(
        path,
        callable_name=callable_name,
        input_ml_dataset_ids=input_dataset_ids,
        input_ml_dataset_getter=ctx.get_ml_dataset,
        input_parameters=input_parameters,
        ds_id=ds_id,
        exception_type=ServiceConfigError)
Example #9
 def get_dataset_descriptor(self, ds_id: str) -> Dict[str, Any]:
     dataset_descriptors = self.get_dataset_descriptors()
     if not dataset_descriptors:
         raise ServiceConfigError("No datasets configured")
     dataset_descriptor = self.find_dataset_descriptor(dataset_descriptors, ds_id)
     if dataset_descriptor is None:
         raise ServiceResourceNotFoundError(f'Dataset "{ds_id}" not found')
     return dataset_descriptor
Example #10
 def _open_ml_dataset(self, dataset_config: DatasetConfigDict) \
         -> MultiLevelDataset:
     ds_id: str = dataset_config.get('Identifier')
     store_instance_id = dataset_config.get('StoreInstanceId')
     if store_instance_id:
         data_store_pool = self.get_data_store_pool()
         data_store = data_store_pool.get_store(store_instance_id)
         data_id = dataset_config.get('Path')
         open_params = dataset_config.get('StoreOpenParams') or {}
         # Inject chunk_cache_capacity into open parameters
         chunk_cache_capacity = self.get_dataset_chunk_cache_capacity(
             dataset_config)
         if chunk_cache_capacity \
                 and (data_id.endswith('.zarr')
                      or data_id.endswith('.levels')) \
                 and 'cache_size' not in open_params:
             open_params['cache_size'] = chunk_cache_capacity
         with self.measure_time(tag=f"opened dataset {ds_id!r}"
                                f" from data store"
                                f" {store_instance_id!r}"):
             dataset = data_store.open_data(data_id, **open_params)
         if isinstance(dataset, MultiLevelDataset):
             ml_dataset = dataset
         else:
             cube, _, _ = decode_cube(dataset,
                                      normalize=True,
                                      force_non_empty=True,
                                      force_geographic=True)
             ml_dataset = BaseMultiLevelDataset(cube, ds_id=ds_id)
     else:
         fs_type = dataset_config.get('FileSystem')
         if fs_type != 'memory':
             raise ServiceConfigError(f"Invalid FileSystem {fs_type!r}"
                                      f" in dataset configuration"
                                      f" {ds_id!r}")
         with self.measure_time(tag=f"opened dataset {ds_id!r}"
                                f" from {fs_type!r}"):
             ml_dataset = _open_ml_dataset_from_python_code(
                 self, dataset_config)
     augmentation = dataset_config.get('Augmentation')
     if augmentation:
         script_path = self.get_config_path(
             augmentation,
             f"'Augmentation' of dataset configuration {ds_id}")
         input_parameters = augmentation.get('InputParameters')
         callable_name = augmentation.get('Function', COMPUTE_VARIABLES)
         ml_dataset = augment_ml_dataset(ml_dataset,
                                         script_path,
                                         callable_name,
                                         self.get_ml_dataset,
                                         self.set_ml_dataset,
                                         input_parameters=input_parameters,
                                         exception_type=ServiceConfigError)
     return ml_dataset
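
The two configuration shapes this method distinguishes, sketched with the keys read above; store IDs and paths are placeholders:

store_backed_config = {
    'Identifier': 'sst',
    'StoreInstanceId': 'my-store',   # looked up in the data store pool
    'Path': 'cubes/sst.zarr',        # passed to open_data as data_id
    'StoreOpenParams': {},           # 'cache_size' may be injected here
}
computed_config = {
    'Identifier': 'derived',
    'FileSystem': 'memory',          # any other value raises ServiceConfigError
    'Path': 'compute_dataset.py',
}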
Example #11
 def _open_ml_dataset(
         self, dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
     fs_type = dataset_descriptor.get('FileSystem', 'local')
     if self._ml_dataset_openers and fs_type in self._ml_dataset_openers:
         ml_dataset_opener = self._ml_dataset_openers[fs_type]
     elif fs_type in _DEFAULT_MULTI_LEVEL_DATASET_OPENERS:
         ml_dataset_opener = _DEFAULT_MULTI_LEVEL_DATASET_OPENERS[fs_type]
     else:
         ds_id = dataset_descriptor.get('Identifier')
         raise ServiceConfigError(
             f"Invalid fs={fs_type!r} in dataset descriptor {ds_id!r}")
     return ml_dataset_opener(self, dataset_descriptor)
Example #12
 def get_data_store_pool(self) -> Optional[DataStorePool]:
     data_store_configs = self._config.get('DataStores', [])
     if not data_store_configs or self._data_store_pool:
         return self._data_store_pool
     if not isinstance(data_store_configs, list):
         raise ServiceConfigError('DataStores must be a list')
     store_configs: Dict[str, DataStoreConfig] = {}
     for data_store_config_dict in data_store_configs:
         store_instance_id = data_store_config_dict.get('Identifier')
         store_id = data_store_config_dict.get('StoreId')
         store_params = data_store_config_dict.get('StoreParams', {})
         dataset_configs = data_store_config_dict.get('Datasets')
         store_config = DataStoreConfig(store_id,
                                        store_params=store_params,
                                        user_data=dataset_configs)
         store_configs[store_instance_id] = store_config
     self._data_store_pool = DataStorePool(store_configs)
     return self._data_store_pool
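
A hypothetical 'DataStores' section matching the keys this method reads; the instance ID, store ID, and parameters are placeholders:

config = {
    'DataStores': [
        {
            'Identifier': 'my-store',              # becomes the pool's instance ID
            'StoreId': 's3',
            'StoreParams': {'root': 'my-bucket'},  # passed to DataStoreConfig
            'Datasets': [                          # kept on the config as user_data
                {'Identifier': 'sst', 'Path': 'cubes/sst.zarr'},
            ],
        },
    ]
}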
Example #13
def open_ml_dataset_from_object_storage(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'Path' entry in dataset descriptor {ds_id}")

    data_format = dataset_descriptor.get('Format', FORMAT_NAME_ZARR)

    s3_client_kwargs = {}
    if 'Endpoint' in dataset_descriptor:
        s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
    if 'Region' in dataset_descriptor:
        s3_client_kwargs['region_name'] = dataset_descriptor['Region']
    obs_file_system = s3fs.S3FileSystem(anon=True,
                                        client_kwargs=s3_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2**28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_file_system.exists(f'{path}/.zmetadata')
            ds = assert_cube(
                xr.open_zarr(cached_store, consolidated=consolidated))
        return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                ds_id,
                obs_file_system,
                path,
                exception_type=ServiceConfigError)

    raise ServiceConfigError(
        f"Illegal data format {data_format!r} for dataset {ds_id}")
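
A sketch of an object-storage descriptor using the keys read above; the bucket path, endpoint, and region are placeholders:

remote_descriptor = {
    'Identifier': 'remote-demo',
    'Path': 'my-bucket/cubes/demo.zarr',   # bucket/key prefix for s3fs
    'Format': 'zarr',                      # FORMAT_NAME_ZARR is the default
    'Endpoint': 'https://s3.example.org',  # becomes client_kwargs['endpoint_url']
    'Region': 'eu-central-1',              # becomes client_kwargs['region_name']
}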
Example #14
 def get_dataset_descriptors(self):
     dataset_descriptors = self._config.get('Datasets')
     if not dataset_descriptors:
         raise ServiceConfigError("No datasets configured")
     return dataset_descriptors
Example #15
File: context.py Project: manzt/xcube
    def _load_place_group(self,
                          place_group_descriptor: Dict[str, Any],
                          base_url: str,
                          is_global: bool = False,
                          load_features: bool = False) -> Dict[str, Any]:
        place_group_id = place_group_descriptor.get("PlaceGroupRef")
        if place_group_id:
            if is_global:
                raise ServiceConfigError(
                    "'PlaceGroupRef' cannot be used in a global place group")
            if len(place_group_descriptor) > 1:
                raise ServiceConfigError(
                    "'PlaceGroupRef' if present, must be the only entry in a 'PlaceGroups' item"
                )
            return self.get_global_place_group(place_group_id,
                                               base_url,
                                               load_features=load_features)

        place_group_id = place_group_descriptor.get("Identifier")
        if not place_group_id:
            raise ServiceConfigError(
                "Missing 'Identifier' entry in a 'PlaceGroups' item")

        if place_group_id in self._place_group_cache:
            place_group = self._place_group_cache[place_group_id]
        else:
            place_group_title = place_group_descriptor.get(
                "Title", place_group_id)
            place_path_wc = self.get_descriptor_path(place_group_descriptor,
                                                     "'PlaceGroups' item")
            source_paths = glob.glob(place_path_wc)
            source_encoding = place_group_descriptor.get(
                "CharacterEncoding", "utf-8")

            join = None
            place_join = place_group_descriptor.get("Join")
            if isinstance(place_join, dict):
                join_path = self.get_descriptor_path(
                    place_join, "'Join' of a 'PlaceGroups' item")
                join_property = place_join.get("Property")
                if not join_property:
                    raise ServiceError(
                        "Missing 'Property' entry in 'Join' of a 'PlaceGroups' item"
                    )
                join_encoding = place_join.get("CharacterEncoding", "utf-8")
                join = dict(path=join_path,
                            property=join_property,
                            encoding=join_encoding)

            property_mapping = place_group_descriptor.get("PropertyMapping")
            if property_mapping:
                property_mapping = dict(property_mapping)
                for key, value in property_mapping.items():
                    if isinstance(value, str) and '${base_url}' in value:
                        property_mapping[key] = value.replace(
                            '${base_url}', base_url)

            place_group = dict(type="FeatureCollection",
                               features=None,
                               id=place_group_id,
                               title=place_group_title,
                               propertyMapping=property_mapping,
                               sourcePaths=source_paths,
                               sourceEncoding=source_encoding,
                               join=join)

            sub_place_group_configs = place_group_descriptor.get("Places")
            if sub_place_group_configs:
                raise ServiceConfigError(
                    "Invalid 'Places' entry in a 'PlaceGroups' item: not implemented yet"
                )
            # sub_place_group_descriptors = place_group_config.get("Places")
            # if sub_place_group_descriptors:
            #     sub_place_groups = self._load_place_groups(sub_place_group_descriptors)
            #     place_group["placeGroups"] = sub_place_groups

            self._place_group_cache[place_group_id] = place_group

        if load_features:
            self._load_place_group_features(place_group)

        return place_group
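
Finally, a hypothetical 'PlaceGroups' item covering the keys this loader reads; every path, property name, and URL is a placeholder, and 'Path' assumes that is the entry get_descriptor_path resolves:

place_group_descriptor = {
    'Identifier': 'lakes',
    'Title': 'Major Lakes',            # defaults to the identifier
    'Path': 'places/lakes-*.geojson',  # globbed into sourcePaths
    'CharacterEncoding': 'utf-8',
    'Join': {
        'Path': 'places/lake-info.geojson',
        'Property': 'ID',              # required; its absence raises ServiceError
        'CharacterEncoding': 'utf-8',
    },
    'PropertyMapping': {
        'image': '${base_url}/images/lakes/${ID}.png',  # ${base_url} is substituted
    },
}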