# Example no. 1
 def _get_variable_descriptors(self,
                               var_names: str,
                               var_infos: dict,
                               normalize_dims: bool=True) \
         -> Dict[str, VariableDescriptor]:
     """Build a descriptor for each name in *var_names*.

     NOTE(review): *var_names* is iterated element-wise, so it is presumably a
     sequence of names — the ``str`` annotation looks wrong; confirm and fix
     at the import site.

     :param var_names: names of the variables to describe
     :param var_infos: mapping from variable name to its metadata dict
         (expected keys: 'data_type', 'dimensions')
     :param normalize_dims: whether to pass dimensions through
         ``self._normalize_var_dims``
     :return: mapping from variable name to its ``VariableDescriptor``;
         names missing from *var_infos* get an empty descriptor
     """
     descriptors = {}
     for name in var_names:
         if name not in var_infos:
             # No metadata available: emit a placeholder descriptor.
             descriptors[name] = VariableDescriptor(
                 name, '', '')
             continue
         info = var_infos[name]
         dims = info['dimensions']
         if normalize_dims:
             dims = self._normalize_var_dims(dims)
         descriptors[name] = VariableDescriptor(
             name, info['data_type'], dims, attrs=info)
     return descriptors
# Example no. 2
    def _get_data_descriptor_from_metadata(
            self, data_id: str, metadata: dict) -> DatasetDescriptor:
        """Assemble a ``DatasetDescriptor`` for *data_id* from raw metadata.

        :param data_id: identifier of the dataset
        :param metadata: raw dataset metadata; expected keys include
            'dimensions', 'variable_infos', 'attributes' and
            'time_dimension_size' (schema inferred from usage — confirm
            against the metadata producer)
        :return: a fully populated ``DatasetDescriptor`` including an
            open-parameters schema
        """
        # Work on a copy: the pop() calls below would otherwise mutate the
        # caller's dict.
        ds_metadata = metadata.copy()
        dims = self._normalize_dims(ds_metadata.get('dimensions', {}))
        # 'time_dimension_size' presumably counts time steps over all files;
        # scale an existing per-file 'time' dim by it, else use it directly.
        # TODO(review): confirm this interpretation against the metadata source.
        if 'time' not in dims:
            dims['time'] = ds_metadata.get('time_dimension_size')
        else:
            dims['time'] *= ds_metadata.get('time_dimension_size')
        temporal_resolution = _get_temporal_resolution_from_id(data_id)
        dataset_info = self._cci_odp.get_dataset_info(data_id, ds_metadata)
        # Non-positive resolutions are treated as "unknown".
        spatial_resolution = dataset_info['lat_res']
        if spatial_resolution <= 0:
            spatial_resolution = None
        bbox = dataset_info['bbox']
        # only use date parts of times
        temporal_coverage = (
            dataset_info['temporal_coverage_start'].split('T')[0],
            dataset_info['temporal_coverage_end'].split('T')[0])
        var_infos = ds_metadata.get('variable_infos', {})
        # Data variables get normalized dims; coordinate variables keep
        # their dims as-is.
        var_descriptors = self._get_variable_descriptors(
            dataset_info['var_names'], var_infos)
        coord_descriptors = self._get_variable_descriptors(
            dataset_info['coord_names'], var_infos, normalize_dims=False)
        # Synthesize a default 'time' coordinate when the dataset declares
        # neither 'time' nor 't'.
        if 'time' not in coord_descriptors.keys(
        ) and 't' not in coord_descriptors.keys():
            time_attrs = {
                "units": "seconds since 1970-01-01T00:00:00Z",
                "calendar": "proleptic_gregorian",
                "standard_name": "time"
            }
            coord_descriptors['time'] = VariableDescriptor('time',
                                                           dtype='int64',
                                                           dims=('time', ),
                                                           attrs=time_attrs)

        # Strip keys already folded into descriptor fields, then flatten the
        # remaining metadata into the NC_GLOBAL attributes.
        # NOTE(review): 'dimensions', 'variable_infos' and 'attributes' are
        # popped without a default — a KeyError is raised if any is absent.
        if 'variables' in ds_metadata:
            ds_metadata.pop('variables')
        ds_metadata.pop('dimensions')
        ds_metadata.pop('variable_infos')
        attrs = ds_metadata.get('attributes', {}).get('NC_GLOBAL', {})
        ds_metadata.pop('attributes')
        attrs.update(ds_metadata)
        self._remove_irrelevant_metadata_attributes(attrs)
        descriptor = DatasetDescriptor(data_id,
                                       type_specifier=self._type_specifier,
                                       dims=dims,
                                       coords=coord_descriptors,
                                       data_vars=var_descriptors,
                                       attrs=attrs,
                                       bbox=bbox,
                                       spatial_res=spatial_resolution,
                                       time_range=temporal_coverage,
                                       time_period=temporal_resolution)
        # Attach the open-parameters schema derived from the descriptor itself.
        data_schema = self._get_open_data_params_schema(descriptor)
        descriptor.open_params_schema = data_schema
        return descriptor
# Example no. 3
    def _describe_data(self, data_id: str) -> DatasetDescriptor:
        """Create a ``DatasetDescriptor`` for *data_id*.

        Band names come from the Sentinel Hub API when connected, otherwise
        from the locally known band metadata. Spatial bbox and temporal range
        are taken from the collection's STAC-style 'extent', when available.

        :param data_id: identifier of the dataset to describe
        :return: descriptor with per-band variables and collection attributes
        """
        dataset_metadata, collection_metadata = self._get_dataset_and_collection_metadata(
            data_id)
        band_metadatas = dataset_metadata.get('bands', {})

        if self._sentinel_hub is not None:
            # If we are connected to the API, we return band names by API
            band_names = self._sentinel_hub.band_names(data_id)
        else:
            # Otherwise all we know about
            band_names = band_metadatas.keys()

        data_vars = []
        for band_name in band_names:
            band_metadata = band_metadatas.get(band_name,
                                               dict(sample_type='FLOAT32'))
            # NOTE(review): attrs receives a copy of the *whole* band_metadatas
            # mapping rather than this band's metadata — looks suspicious but
            # preserved here; confirm intent before changing.
            data_vars.append(
                VariableDescriptor(name=band_name,
                                   dtype=band_metadata.get(
                                       'sample_type', 'FLOAT32'),
                                   dims=('time', 'lat', 'lon'),
                                   attrs=band_metadatas.copy()))

        dataset_attrs = dataset_metadata.copy()

        bbox = None
        time_range = None
        if collection_metadata is not None:
            extent = collection_metadata.get('extent')
            if extent is not None:
                bbox = extent.get("spatial", {}).get('bbox')
                interval = extent.get("temporal", {}).get('interval')
                if isinstance(interval, list) and len(interval) == 2:
                    min_datetime, max_datetime = interval
                    # Get rid of time part
                    time_range = (min_datetime.split('T')[0]
                                  if min_datetime is not None else None,
                                  max_datetime.split('T')[0]
                                  if max_datetime is not None else None)

            if 'title' in collection_metadata:
                dataset_attrs['title'] = collection_metadata['title']
            if 'description' in collection_metadata:
                dataset_attrs['description'] = collection_metadata[
                    'description']

        return DatasetDescriptor(
            data_id=data_id,
            data_vars=data_vars,
            bbox=bbox,
            time_range=time_range,
            time_period=dataset_metadata.get('request_period'),
            # Bug fix: pass the augmented copy (which carries the collection's
            # title/description) instead of the raw dataset_metadata;
            # previously dataset_attrs was built and then never used.
            attrs=dataset_attrs)
# Example no. 4
    def _create_variable_descriptors(self, data_id: str):
        """Return a list of ``VariableDescriptor``s for the dataset that
        *data_id* refers to (the part before the first ':')."""
        dataset_id = data_id.split(':')[0]

        # dtype string format not formally defined as of 2020-06-18.
        # t2m is actually stored as a short with scale and offset in
        # the NetCDF file, but converted to float by xarray on opening:
        # see http://xarray.pydata.org/en/stable/io.html .
        descriptors = []
        for api_name, netcdf_name, units, long_name in \
                self._dataset_dicts[dataset_id]['variables']:
            descriptors.append(
                VariableDescriptor(
                    name=netcdf_name,
                    dtype='float32',
                    dims=('time', 'latitude', 'longitude'),
                    attrs=dict(units=units, long_name=long_name)))
        return descriptors
# Example no. 5
 def test_era5_describe_data(self):
     """Integration test: describe the ERA5 single-levels reanalysis dataset
     through a live ``CDSDataStore`` and spot-check the resulting descriptor
     (requires network access and CDS API credentials)."""
     store = CDSDataStore(endpoint_url=_CDS_API_URL,
                          cds_api_key=_CDS_API_KEY)
     descriptor = store.describe_data(
         'reanalysis-era5-single-levels:reanalysis')
     self.assertEqual(265, len(descriptor.data_vars))
     self.assertEqual('WGS84', descriptor.crs)
     self.assertTupleEqual((-180, -90, 180, 90), descriptor.bbox)
     # We don't exhaustively check all 265 variables, but we check one
     # fully and make sure that the rest have correct type and dimensions.
     expected_vd = VariableDescriptor(
         name='u100',
         dtype='float32',
         dims=('time', 'latitude', 'longitude'),
         attrs=dict(units='m s**-1', long_name='100 metre U wind component'))
     self.assertDictEqual(expected_vd.__dict__,
                          descriptor.data_vars['u100'].__dict__)
     for vd in descriptor.data_vars.values():
         self.assertEqual('float32', vd.dtype)
         self.assertTupleEqual(('time', 'latitude', 'longitude'),
                               vd.dims)
# Example no. 6
    def generate(self) -> CubeInfo:
        """Estimate the size and request cost of the cube described by this
        generator's request and return it as a ``CubeInfo``.

        :return: ``CubeInfo`` with the resolved dataset descriptor and a
            size-estimation dict (image/tile size, tile/request/byte counts)
        :raises CubeGeneratorError: (status 400) when the effective cube
            config cannot be computed
        """
        try:
            cube_config, resolved_crs, resolved_time_range = \
                 self._compute_effective_cube_config()
        except (TypeError, ValueError) as e:
            raise CubeGeneratorError(f'{e}', status_code=400) from e

        # Raster size in pixels from bbox and resolution, clamped to >= 2.
        x_min, y_min, x_max, y_max = cube_config.bbox
        spatial_res = cube_config.spatial_res

        width = round((x_max - x_min) / spatial_res)
        height = round((y_max - y_min) / spatial_res)
        width = 2 if width < 2 else width
        height = 2 if height < 2 else height

        # Defaults: one tile covering the whole image.
        num_tiles_x = 1
        num_tiles_y = 1
        tile_width = width
        tile_height = height

        tile_size = cube_config.tile_size
        if tile_size is None and cube_config.chunks is not None:
            # TODO: this is just an assumption, with new
            #   Resampling module, use GridMapping
            #   to identify the actual names for the
            #   spatial tile dimensions.
            tile_size_x = cube_config.chunks.get('lon',
                                                 cube_config.chunks.get('x'))
            tile_size_y = cube_config.chunks.get('lat',
                                                 cube_config.chunks.get('y'))
            if tile_size_x and tile_size_y:
                tile_size = tile_size_x, tile_size_y

        if tile_size is not None:
            tile_width, tile_height = tile_size

            # Only tile when the image clearly exceeds one tile (factor 1.5);
            # the image is then rounded to a whole number of tiles.
            # TODO: this must be made common store logic
            if width > 1.5 * tile_width:
                num_tiles_x = _idiv(width, tile_width)
                width = num_tiles_x * tile_width

            # TODO: this must be made common store logic
            if height > 1.5 * tile_height:
                num_tiles_y = _idiv(height, tile_height)
                height = num_tiles_y * tile_height

        variable_names = cube_config.variable_names

        # One request per variable, per time step, per tile.
        num_times = len(resolved_time_range)
        num_variables = len(variable_names)
        num_requests = num_variables \
                       * num_times \
                       * num_tiles_x * num_tiles_y
        # TODO: get original data types from dataset descriptors
        num_bytes_per_pixel = 4
        num_bytes = num_variables \
                    * num_times \
                    * (height * width * num_bytes_per_pixel)

        # Dimension naming follows the CRS: geographic -> lon/lat, else x/y.
        x_name, y_name = ('lon', 'lat') \
            if resolved_crs.is_geographic else ('x', 'y')

        data_id = self._request.output_config.data_id or 'unnamed'
        # TODO: get original variable descriptors from input dataset descriptors
        data_vars = {
            name: VariableDescriptor(name,
                                     dtype='float32',
                                     dims=('time', y_name, x_name))
            for name in variable_names
        }
        dims = {'time': num_times, y_name: height, x_name: width}
        dataset_descriptor = DatasetDescriptor(
            data_id,
            crs=cube_config.crs,
            bbox=cube_config.bbox,
            spatial_res=cube_config.spatial_res,
            time_range=cube_config.time_range,
            time_period=cube_config.time_period,
            dims=dims,
            data_vars=data_vars)
        size_estimation = dict(image_size=[width, height],
                               tile_size=[tile_width, tile_height],
                               num_variables=num_variables,
                               num_tiles=[num_tiles_x, num_tiles_y],
                               num_requests=num_requests,
                               num_bytes=num_bytes)

        return CubeInfo(dataset_descriptor=dataset_descriptor,
                        size_estimation=size_estimation)
# Example no. 7
    def describe_data(self, data_id: str) -> DatasetDescriptor:
        """Return a ``DatasetDescriptor`` for a soil-moisture dataset.

        :param data_id: identifier of the form
            '<product>:<variable_spec>:<aggregation>' where variable_spec is
            'saturation' or 'volumetric'
        :return: descriptor with the variable set appropriate for the
            aggregation period
        :raises KeyError: if variable_spec is not a known key
        :raises ValueError: if *data_id* does not split into three parts
        """
        _, variable_spec, aggregation = data_id.split(':')

        # (units, long_name) for the 'sm' variables, chosen by variable spec.
        sm_attrs = dict(saturation=('percent',
                                    'Percent of Saturation Soil Moisture'),
                        volumetric=('m3 m-3',
                                    'Volumetric Soil Moisture'))[variable_spec]

        # Variables present in every aggregation period.
        descriptors_common = [
            VariableDescriptor(name='sensor',
                               dtype='int16',
                               dims=('time', 'lat', 'lon'),
                               attrs={'long_name': 'Sensor'}),
            VariableDescriptor(
                name='freqbandID',
                dtype='int16',
                dims=('time', 'lat', 'lon'),
                attrs={'long_name': 'Frequency Band Identification'}),
            VariableDescriptor(name='sm',
                               dtype='float32',
                               dims=('time', 'lat', 'lon'),
                               attrs={
                                   'units': sm_attrs[0],
                                   'long_name': sm_attrs[1]
                               }),
        ]

        # Variables only present in the daily (non-aggregated) product.
        descriptors_daily = [
            VariableDescriptor(
                # The product user guide claims that sm_uncertainty is
                # available for all three aggregation periods, but in practice
                # it only seems to be present in the daily data.
                name='sm_uncertainty',
                dtype='float32',
                dims=('time', 'lat', 'lon'),
                attrs={
                    'units': sm_attrs[0],
                    'long_name': sm_attrs[1] + ' Uncertainty'
                }),
            VariableDescriptor(name='t0',
                               dtype='float64',
                               dims=('time', 'lat', 'lon'),
                               attrs={
                                   'units':
                                   'days since 1970-01-01 00:00:00 UTC',
                                   'long_name': 'Observation Timestamp'
                               }),
            VariableDescriptor(name='dnflag',
                               dtype='int8',
                               dims=('time', 'lat', 'lon'),
                               attrs={'long_name': 'Day / Night Flag'}),
            VariableDescriptor(name='flag',
                               dtype='int8',
                               dims=('time', 'lat', 'lon'),
                               attrs={'long_name': 'Flag'}),
            VariableDescriptor(
                name='mode',
                dtype='int8',
                dims=('time', 'lat', 'lon'),
                # Note: the product user guide gives the long name as
                # 'Satellite Mode' with one space, but the long name in the
                # actual NetCDF files has two spaces.
                attrs={'long_name': 'Satellite  Mode'}),
        ]
        # Variables only present in aggregated (non-daily) products.
        descriptors_aggregated = [
            VariableDescriptor(
                name='nobs',
                dtype='int16',
                dims=('time', 'lat', 'lon'),
                attrs={'long_name': 'Number of valid observation'}),
        ]

        descriptors = descriptors_common + \
            (descriptors_daily if aggregation == 'daily'
             else descriptors_aggregated)

        return DatasetDescriptor(
            data_id=data_id,
            data_vars={desc.name: desc
                       for desc in descriptors},
            crs='WGS84',
            bbox=(-180, -90, 180, 90),
            spatial_res=0.25,
            time_range=('1978-11-01', None),
            time_period=self._aggregation_map[aggregation],
            open_params_schema=self.get_open_data_params_schema(data_id))