def open_dataset(self,
                 time_range: TimeRangeLike.TYPE = None,
                 region: PolygonLike.TYPE = None,
                 var_names: VarNamesLike.TYPE = None,
                 protocol: str = None) -> Any:
    """
    Open this CCI Open Data Portal data source as a single xarray dataset
    via its OPeNDAP URLs.

    :param time_range: Optional time range constraint; only files whose
        coverage lies in this range are opened.
    :param region: Optional spatial constraint; the dataset is normalized
        and spatially subset to this polygon.
    :param var_names: Optional variable-name constraint; all other data
        variables are dropped.
    :param protocol: Unused here; data is always accessed via OPeNDAP.
    :return: An xarray dataset.
    :raises DataAccessError: If no datasets match or remote access fails.
    """
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    # Convert region up front (consistent with the sibling open_dataset
    # implementations) so subset_spatial_impl always gets a Polygon.
    region = PolygonLike.convert(region) if region else None
    var_names = VarNamesLike.convert(var_names) if var_names else None
    selected_file_list = self._find_files(time_range)
    if not selected_file_list:
        msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(self.id)
        if time_range is not None:
            msg += ' in given time range {}'.format(TimeRangeLike.format(time_range))
        raise DataAccessError(msg)
    files = self._get_urls_list(selected_file_list, _ODP_PROTOCOL_OPENDAP)
    try:
        ds = open_xarray_dataset(files)
        if region:
            ds = normalize_impl(ds)
            ds = subset_spatial_impl(ds, region)
        if var_names:
            ds = ds.drop([var_name for var_name in ds.data_vars.keys()
                          if var_name not in var_names])
        return ds
    except OSError as e:
        if time_range:
            raise DataAccessError("Cannot open remote dataset for time range {}:\n"
                                  "{}".format(TimeRangeLike.format(time_range), e),
                                  source=self) from e
        # BUG FIX: the old code formatted this message with
        # TimeRangeLike.format(time_range) (None here, i.e. ''), dropping
        # the actual OSError text. Format the error itself instead.
        raise DataAccessError("Cannot open remote dataset:\n"
                              "{}".format(e),
                              source=self) from e
def open_dataset(self,
                 time_range: TimeRangeLike.TYPE = None,
                 region: PolygonLike.TYPE = None,
                 var_names: VarNamesLike.TYPE = None,
                 protocol: str = None) -> Any:
    """
    Open this local data source as a single xarray dataset.

    :param time_range: Optional time range; only files whose time coverage
        falls inside this range are opened.
    :param region: Optional spatial constraint applied via
        ``subset_spatial_impl`` after normalization.
    :param var_names: Optional variable names; all other data variables
        are dropped from the result.
    :param protocol: Unused for local data sources.
    :return: An xarray dataset combining the matching local files.
    :raises DataAccessError: If no files match or opening them fails.
    """
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    if region:
        region = PolygonLike.convert(region)
    if var_names:
        var_names = VarNamesLike.convert(var_names)
    paths = []
    if time_range:
        # self._files maps a file path (or glob pattern) to its time
        # coverage: either a (start, end) tuple, a single datetime, or a
        # falsy placeholder (skipped below).
        time_series = list(self._files.values())
        file_paths = list(self._files.keys())
        for i in range(len(time_series)):
            if time_series[i]:
                # Tuple coverage: file is selected only if fully contained
                # in the requested range (closed interval on both ends).
                if isinstance(time_series[i], Tuple) and \
                        time_series[i][0] >= time_range[0] and \
                        time_series[i][1] <= time_range[1]:
                    paths.extend(self._resolve_file_path(file_paths[i]))
                # Single-datetime coverage: half-open containment test
                # [start, end).
                elif isinstance(
                        time_series[i],
                        datetime) and time_range[0] <= time_series[i] < time_range[1]:
                    paths.extend(self._resolve_file_path(file_paths[i]))
    else:
        # No time constraint: resolve every registered file entry.
        for file in self._files.items():
            paths.extend(self._resolve_file_path(file[0]))
    if paths:
        # De-duplicate (glob patterns may overlap) and sort for a stable,
        # chronological open order.
        paths = sorted(set(paths))
        try:
            ds = open_xarray_dataset(paths)
            if region:
                ds = normalize_impl(ds)
                ds = subset_spatial_impl(ds, region)
            if var_names:
                ds = ds.drop([
                    var_name for var_name in ds.data_vars.keys()
                    if var_name not in var_names
                ])
            return ds
        except OSError as e:
            if time_range:
                raise DataAccessError(
                    "Cannot open local dataset for time range {}:\n"
                    "{}".format(TimeRangeLike.format(time_range), e),
                    source=self) from e
            else:
                raise DataAccessError("Cannot open local dataset:\n"
                                      "{}".format(e), source=self) from e
    else:
        if time_range:
            raise DataAccessError(
                "No local datasets available for\nspecified time range {}".
                format(TimeRangeLike.format(time_range)), source=self)
        else:
            raise DataAccessError("No local datasets available", source=self)
def test_format(self):
    """TimeRangeLike.format: '' for None, date-only at midnight, full ISO otherwise."""
    cases = [
        (None, ''),
        ((datetime(2001, 1, 1), datetime(2002, 1, 1)),
         '2001-01-01, 2002-01-01'),
        ((datetime(2001, 1, 1, 12), datetime(2002, 1, 1, 9, 30, 2)),
         '2001-01-01T12:00:00, 2002-01-01T09:30:02'),
    ]
    for time_range, expected in cases:
        self.assertEqual(TimeRangeLike.format(time_range), expected)
def to_json_dict(self):
    """
    Return a JSON-serializable dictionary representation of this object.

    :return: A JSON-serializable dictionary
    """
    # Meta-data values that are unset serialize as None; set values are
    # rendered through their XxxLike formatters.
    meta_data = {
        'temporal_coverage': TimeRangeLike.format(self._temporal_coverage) if self._temporal_coverage else None,
        'spatial_coverage': PolygonLike.format(self._spatial_coverage) if self._spatial_coverage else None,
        'variables': VarNamesLike.format(self._variables) if self._variables else None,
        'reference_type': self._reference_type,
        'reference_name': self._reference_name
    }
    # Each file entry is [path, start, end] when a time coverage exists,
    # otherwise just [path].
    file_entries = []
    for path, coverage in self._files.items():
        if coverage:
            file_entries.append([path, coverage[0], coverage[1]])
        else:
            file_entries.append([path])
    return OrderedDict({
        'name': self._name,
        'meta_data': meta_data,
        'files': file_entries
    })
def test_format(self):
    """Formatting a time range and converting it back must round-trip."""
    time_range = (datetime(2001, 1, 1), datetime(2002, 1, 1))
    actual = TimeRangeLike.format(time_range)
    # BUG FIX: the old code used assertTrue(expected, actual), which treats
    # the second argument as a failure *message* and only checks that the
    # first is truthy -- the assertions could never fail. Use real equality
    # checks on the round-trip instead, which hold regardless of whether
    # midnight times are rendered date-only or with T00:00:00.
    converted = TimeRangeLike.convert(actual)
    self.assertEqual(time_range, converted)
    # Re-formatting the converted value must reproduce the same string.
    self.assertEqual(actual, TimeRangeLike.format(converted))
def generate_title(cls,
                   title: str,
                   time_range: Optional[TimeRange] = None,
                   region: Optional[shapely.geometry.Polygon] = None,
                   var_names: Optional[VarNames] = None) -> str:
    """
    Return *title* extended with a bracketed suffix for each constraint
    (time range, region, variable names) that is provided.
    """
    parts = [title]
    if time_range:
        parts.append(" [TimeRange:{}]".format(TimeRangeLike.format(time_range)))
    if region:
        parts.append(" [Region:{}]".format(PolygonLike.format(region)))
    if var_names:
        parts.append(" [Variables:{}]".format(VarNamesLike.format(var_names)))
    return "".join(parts)
def generate_uuid(cls,
                  ref_id: str,
                  time_range: Optional[TimeRange] = None,
                  region: Optional[shapely.geometry.Polygon] = None,
                  var_names: Optional[VarNames] = None) -> str:
    """
    Derive a deterministic UUID (uuid3 over _NAMESPACE) from *ref_id*
    concatenated with the formatted constraints that are provided.
    """
    key = ref_id
    if time_range:
        key += TimeRangeLike.format(time_range)
    if region:
        key += PolygonLike.format(region)
    if var_names:
        key += VarNamesLike.format(var_names)
    return str(uuid.uuid3(_NAMESPACE, key))
def generate_title(cls,
                   title: str,
                   time_range: Optional[TimeRange] = None,
                   region: Optional[shapely.geometry.Polygon] = None,
                   var_names: Optional[VarNames] = None) -> str:
    """
    Return *title* extended with one " [Label:value]" suffix per provided
    constraint, in the fixed order: time range, region, variables.
    """
    # (constraint value, label, formatter) triples, applied in order.
    decorations = (
        (time_range, "TimeRange", TimeRangeLike),
        (region, "Region", PolygonLike),
        (var_names, "Variables", VarNamesLike),
    )
    for value, label, like_type in decorations:
        if value:
            title += " [{}:{}]".format(label, like_type.format(value))
    return title
def generate_uuid(cls,
                  ref_id: str,
                  time_range: Optional[TimeRange] = None,
                  region: Optional[shapely.geometry.Polygon] = None,
                  var_names: Optional[VarNames] = None) -> str:
    """
    Build a stable UUID string for *ref_id* plus its optional constraints,
    using uuid3 in the package namespace so equal inputs map to equal ids.
    """
    key_parts = [ref_id]
    if time_range:
        key_parts.append(TimeRangeLike.format(time_range))
    if region:
        key_parts.append(PolygonLike.format(region))
    if var_names:
        key_parts.append(VarNamesLike.format(var_names))
    return str(uuid.uuid3(_NAMESPACE, ''.join(key_parts)))
def open_dataset(self,
                 time_range: TimeRangeLike.TYPE = None,
                 region: PolygonLike.TYPE = None,
                 var_names: VarNamesLike.TYPE = None,
                 protocol: str = None) -> Any:
    """
    Open this data source's files as a single xarray dataset via OPeNDAP.

    :param time_range: Optional time range constraint.
    :param region: Optional polygon; the dataset is subset to its bounding
        box via coordinate slicing.
    :param var_names: Optional variable names; all other variables are
        dropped.
    :param protocol: Unused; access is always via OPeNDAP.
    :return: An xarray dataset.
    :raises IOError: If no data files match or remote access fails.
    """
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    region = PolygonLike.convert(region) if region else None
    var_names = VarNamesLike.convert(var_names) if var_names else None
    selected_file_list = self._find_files(time_range)
    if not selected_file_list:
        msg = 'Data source \'{}\' does not seem to have any data files'.format(
            self.name)
        if time_range is not None:
            msg += ' in given time range {}'.format(
                TimeRangeLike.format(time_range))
        raise IOError(msg)
    files = self._get_urls_list(selected_file_list, _ODP_PROTOCOL_OPENDAP)
    try:
        ds = open_xarray_dataset(files)
        if region:
            # BUG FIX: shapely bounds are (minx, miny, maxx, maxy), i.e.
            # longitude first. The previous unpacking assigned lon values
            # to lat_min/lat_max and vice versa, subsetting the wrong box.
            [lon_min, lat_min, lon_max, lat_max] = region.bounds
            # NOTE(review): slice(lat_min, lat_max) assumes an ascending
            # latitude coordinate -- confirm for the datasets served here.
            ds = ds.sel(drop=False,
                        lat=slice(lat_min, lat_max),
                        lon=slice(lon_min, lon_max))
        if var_names:
            ds = ds.drop([
                var_name for var_name in ds.variables.keys()
                if var_name not in var_names
            ])
        return ds
    except OSError as e:
        # Chain the original OSError so the full cause is preserved.
        raise IOError("Files: {} caused:\nOSError({}): {}".format(
            files, e.errno, e.strerror)) from e
def _make_local(self,
                local_ds: LocalDataSource,
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):
    """
    Copy (a subset of) this remote data source into the local data store
    entry *local_ds*.

    If a region or variable subset is requested, files are fetched via
    OPeNDAP so the subsetting can happen before writing NetCDF locally;
    otherwise whole files are downloaded over HTTP. Progress is reported
    through *monitor*; the local data source's meta info and file list are
    updated as files arrive, and it is saved at the end.

    :param local_ds: The target local data source to populate.
    :param time_range: Optional time range constraint.
    :param region: Optional spatial constraint (OPeNDAP path only).
    :param var_names: Optional variable subset (OPeNDAP path only).
    :param monitor: Progress monitor.
    :raises DataAccessError: If no remote datasets match or copying fails.
    """
    local_id = local_ds.id
    time_range = TimeRangeLike.convert(time_range)
    region = PolygonLike.convert(region)
    var_names = VarNamesLike.convert(var_names)
    # Project-specific normalization of the constraints before use.
    time_range, region, var_names = self._apply_make_local_fixes(
        time_range, region, var_names)
    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                         NETCDF_COMPRESSION_LEVEL)
    compression_enabled = True if compression_level > 0 else False
    # Bookkeeping for the verified temporal coverage actually copied:
    # the start is captured once (first file), the end on every file.
    do_update_of_verified_time_coverage_start_once = True
    verified_time_coverage_start = None
    verified_time_coverage_end = None
    encoding_update = dict()
    if compression_enabled:
        encoding_update.update({
            'zlib': True,
            'complevel': compression_level
        })
    # Subsetting requires OPeNDAP access; plain copies go over HTTP.
    if region or var_names:
        protocol = _ODP_PROTOCOL_OPENDAP
    else:
        protocol = _ODP_PROTOCOL_HTTP
    local_path = os.path.join(local_ds.data_store.data_store_path,
                              local_id)
    if not os.path.exists(local_path):
        os.makedirs(local_path)
    selected_file_list = self._find_files(time_range)
    if not selected_file_list:
        msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(
            self.id)
        if time_range is not None:
            msg += ' in given time range {}'.format(
                TimeRangeLike.format(time_range))
        raise DataAccessError(msg)
    try:
        if protocol == _ODP_PROTOCOL_OPENDAP:
            # One-shot flags: variable meta info and region bbox meta info
            # are written into local_ds only for the first file processed.
            do_update_of_variables_meta_info_once = True
            do_update_of_region_meta_info_once = True
            files = self._get_urls_list(selected_file_list, protocol)
            monitor.start('Sync ' + self.id, total_work=len(files))
            for idx, dataset_uri in enumerate(files):
                child_monitor = monitor.child(work=1)
                file_name = os.path.basename(dataset_uri)
                local_filepath = os.path.join(local_path, file_name)
                # selected_file_list records carry (name, start, end, ...)
                # -- indices 1 and 2 are the file's time coverage.
                time_coverage_start = selected_file_list[idx][1]
                time_coverage_end = selected_file_list[idx][2]
                try:
                    child_monitor.start(label=file_name, total_work=1)
                    remote_dataset = xr.open_dataset(dataset_uri)
                    if var_names:
                        # Keep only the requested data variables.
                        remote_dataset = remote_dataset.drop([
                            var_name
                            for var_name in remote_dataset.data_vars.keys()
                            if var_name not in var_names
                        ])
                    if region:
                        remote_dataset = normalize_impl(remote_dataset)
                        remote_dataset = subset_spatial_impl(
                            remote_dataset, region)
                        # shapely bounds order: (minx, miny, maxx, maxy).
                        geo_lon_min, geo_lat_min, geo_lon_max, geo_lat_max = region.bounds
                        remote_dataset.attrs[
                            'geospatial_lat_min'] = geo_lat_min
                        remote_dataset.attrs[
                            'geospatial_lat_max'] = geo_lat_max
                        remote_dataset.attrs[
                            'geospatial_lon_min'] = geo_lon_min
                        remote_dataset.attrs[
                            'geospatial_lon_max'] = geo_lon_max
                        if do_update_of_region_meta_info_once:
                            local_ds.meta_info['bbox_maxx'] = geo_lon_max
                            local_ds.meta_info['bbox_minx'] = geo_lon_min
                            local_ds.meta_info['bbox_maxy'] = geo_lat_max
                            local_ds.meta_info['bbox_miny'] = geo_lat_min
                            do_update_of_region_meta_info_once = False
                    if compression_enabled:
                        # Apply zlib compression settings to every variable
                        # before writing the local NetCDF file.
                        for sel_var_name in remote_dataset.variables.keys(
                        ):
                            remote_dataset.variables.get(
                                sel_var_name).encoding.update(
                                    encoding_update)
                    remote_dataset.to_netcdf(local_filepath)
                    child_monitor.progress(work=1,
                                           msg=str(time_coverage_start))
                finally:
                    # Even on failure, record what we learned/produced so
                    # the local data source stays consistent.
                    if do_update_of_variables_meta_info_once:
                        variables_info = local_ds.meta_info.get(
                            'variables', [])
                        # Keep only meta info for variables that survived
                        # the subset and are not mere dimensions.
                        local_ds.meta_info['variables'] = [
                            var_info for var_info in variables_info
                            if var_info.get('name') in remote_dataset.
                            variables.keys() and var_info.get(
                                'name') not in remote_dataset.dims.keys()
                        ]
                        do_update_of_variables_meta_info_once = False
                    local_ds.add_dataset(
                        os.path.join(local_id, file_name),
                        (time_coverage_start, time_coverage_end))
                    if do_update_of_verified_time_coverage_start_once:
                        verified_time_coverage_start = time_coverage_start
                        do_update_of_verified_time_coverage_start_once = False
                    verified_time_coverage_end = time_coverage_end
                    child_monitor.done()
        else:
            # HTTP path: download only files that are missing locally or
            # whose size differs from the remote record.
            outdated_file_list = []
            for file_rec in selected_file_list:
                filename, _, _, file_size, url = file_rec
                dataset_file = os.path.join(local_path, filename)
                # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                # ... outdated or incomplete or corrupted.
                # JSON also includes "checksum" and "checksum_type" fields.
                if not os.path.isfile(dataset_file) or (
                        file_size
                        and os.path.getsize(dataset_file) != file_size):
                    outdated_file_list.append(file_rec)
            if outdated_file_list:
                with monitor.starting('Sync ' + self.id,
                                      len(outdated_file_list)):
                    bytes_to_download = sum(
                        [file_rec[3] for file_rec in outdated_file_list])
                    dl_stat = _DownloadStatistics(bytes_to_download)
                    file_number = 1
                    for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                        dataset_file = os.path.join(local_path, filename)
                        sub_monitor = monitor.child(work=1.0)

                        # Per-chunk callback for urlretrieve: feeds both
                        # the download statistics and the sub-monitor.
                        # noinspection PyUnusedLocal
                        def reporthook(block_number, read_size,
                                       total_file_size):
                            dl_stat.handle_chunk(read_size)
                            sub_monitor.progress(work=read_size,
                                                 msg=str(dl_stat))

                        sub_monitor_msg = "file %d of %d" % (
                            file_number, len(outdated_file_list))
                        with sub_monitor.starting(sub_monitor_msg,
                                                  file_size):
                            urllib.request.urlretrieve(
                                url[protocol],
                                filename=dataset_file,
                                reporthook=reporthook)
                        file_number += 1
                        local_ds.add_dataset(
                            os.path.join(local_id, filename),
                            (coverage_from, coverage_to))
                        if do_update_of_verified_time_coverage_start_once:
                            verified_time_coverage_start = coverage_from
                            do_update_of_verified_time_coverage_start_once = False
                        verified_time_coverage_end = coverage_to
    except OSError as e:
        raise DataAccessError(
            "Copying remote data source failed: {}".format(e),
            source=self) from e
    # Persist the temporal coverage actually copied and save the entry.
    local_ds.meta_info['temporal_coverage_start'] = TimeLike.format(
        verified_time_coverage_start)
    local_ds.meta_info['temporal_coverage_end'] = TimeLike.format(
        verified_time_coverage_end)
    local_ds.save(True)
def test_make_local_and_update(self):
    """
    End-to-end test of EsaCciOdpDataSource.make_local using local fixture
    files in place of remote ODP queries: checks plain copies, variable
    subsets, region subsets, the empty-time-range error, and title
    propagation/overriding.
    """
    soilmoisture_data_sources = self.data_store.query(
        query_expr=
        'esacci.SOILMOISTURE.day.L3S.SSMV.multi-sensor.multi-platform.COMBINED.02-1.r1'
    )
    soilmoisture_data_source = soilmoisture_data_sources[0]
    # Directory holding the three fixture NetCDF files used below.
    reference_path = os.path.join(
        os.path.dirname(__file__),
        os.path.normpath('resources/datasources/local/files/'))

    # Replacement for EsaCciOdpDataSource._find_files: serves file records
    # for the local fixtures, filtered by the requested time range.
    def find_files_mock(_, time_range):

        # Build one file record in the shape _find_files returns:
        # [name, date_from, date_to, size, {protocol: url}].
        def build_file_item(item_name: str, date_from: datetime,
                            date_to: datetime, size: int):
            return [
                item_name, date_from, date_to, size, {
                    'OPENDAP':
                    os.path.join(reference_path, item_name),
                    'HTTPServer':
                    'file:' + urllib.request.pathname2url(
                        os.path.join(reference_path, item_name))
                }
            ]

        reference_files = {
            'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781114000000-fv02.2.nc':
            {
                'date_from': datetime.datetime(1978, 11, 14, 0, 0),
                'date_to': datetime.datetime(1978, 11, 14, 23, 59),
                'size': 21511378
            },
            'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781115000000-fv02.2.nc':
            {
                'date_from': datetime.datetime(1978, 11, 15, 0, 0),
                'date_to': datetime.datetime(1978, 11, 15, 23, 59),
                'size': 21511378
            },
            'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781116000000-fv02.2.nc':
            {
                'date_from': datetime.datetime(1978, 11, 16, 0, 0),
                'date_to': datetime.datetime(1978, 11, 16, 23, 59),
                'size': 21511378
            }
        }
        reference_files_list = []
        for reference_file in reference_files.items():
            file_name = reference_file[0]
            file_date_from = reference_file[1].get('date_from')
            file_date_to = reference_file[1].get('date_to')
            file_size = reference_file[1].get('size')
            if time_range:
                # Only serve files fully contained in the range.
                if file_date_from >= time_range[
                        0] and file_date_to <= time_range[1]:
                    reference_files_list.append(
                        build_file_item(file_name, file_date_from,
                                        file_date_to, file_size))
            else:
                reference_files_list.append(
                    build_file_item(file_name, file_date_from,
                                    file_date_to, file_size))
        return reference_files_list

    with unittest.mock.patch(
            'cate.ds.esa_cci_odp.EsaCciOdpDataSource._find_files',
            find_files_mock):
        # Also stub the store query so make_local sees no pre-existing
        # local datasets.
        with unittest.mock.patch.object(EsaCciOdpDataStore, 'query',
                                        return_value=[]):
            # Case 1: plain copy over the full fixture time range.
            new_ds_title = 'local_ds_test'
            new_ds_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 16, 23, 59)))
            try:
                new_ds = soilmoisture_data_source.make_local(
                    new_ds_title, time_range=new_ds_time_range)
            except Exception:
                # Surface the fixture directory contents to ease
                # debugging when the fixtures are missing.
                raise ValueError(reference_path,
                                 os.listdir(reference_path))
            self.assertIsNotNone(new_ds)

            self.assertEqual(new_ds.id, "local.%s" % new_ds_title)
            self.assertEqual(new_ds.temporal_coverage(),
                             new_ds_time_range)

            # Case 2: variable subset -- only 'sm' plus coordinates
            # should remain in the local copy.
            new_ds_w_one_variable_title = 'local_ds_test_var'
            new_ds_w_one_variable_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 16, 23, 59)))
            new_ds_w_one_variable_var_names = VarNamesLike.convert(['sm'])

            new_ds_w_one_variable = soilmoisture_data_source.make_local(
                new_ds_w_one_variable_title,
                time_range=new_ds_w_one_variable_time_range,
                var_names=new_ds_w_one_variable_var_names)
            self.assertIsNotNone(new_ds_w_one_variable)

            self.assertEqual(new_ds_w_one_variable.id,
                             "local.%s" % new_ds_w_one_variable_title)
            ds = new_ds_w_one_variable.open_dataset()

            new_ds_w_one_variable_var_names.extend(['lat', 'lon', 'time'])

            self.assertSetEqual(set(ds.variables),
                                set(new_ds_w_one_variable_var_names))

            # Case 3: region subset only.
            new_ds_w_region_title = 'from_local_to_local_region'
            new_ds_w_region_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 16, 23, 59)))
            new_ds_w_region_spatial_coverage = PolygonLike.convert(
                "10,20,30,40")

            new_ds_w_region = soilmoisture_data_source.make_local(
                new_ds_w_region_title,
                time_range=new_ds_w_region_time_range,
                region=new_ds_w_region_spatial_coverage
            )  # type: LocalDataSource

            self.assertIsNotNone(new_ds_w_region)

            self.assertEqual(new_ds_w_region.id,
                             "local.%s" % new_ds_w_region_title)

            self.assertEqual(new_ds_w_region.spatial_coverage(),
                             new_ds_w_region_spatial_coverage)

            # Case 4: region subset combined with a single variable.
            new_ds_w_region_title = 'from_local_to_local_region_one_var'
            new_ds_w_region_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 16, 23, 59)))
            new_ds_w_region_var_names = VarNamesLike.convert(['sm'])
            new_ds_w_region_spatial_coverage = PolygonLike.convert(
                "10,20,30,40")

            new_ds_w_region = soilmoisture_data_source.make_local(
                new_ds_w_region_title,
                time_range=new_ds_w_region_time_range,
                var_names=new_ds_w_region_var_names,
                region=new_ds_w_region_spatial_coverage
            )  # type: LocalDataSource

            self.assertIsNotNone(new_ds_w_region)

            self.assertEqual(new_ds_w_region.id,
                             "local.%s" % new_ds_w_region_title)
            self.assertEqual(new_ds_w_region.spatial_coverage(),
                             new_ds_w_region_spatial_coverage)
            data_set = new_ds_w_region.open_dataset()
            new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

            self.assertSetEqual(set(data_set.variables),
                                set(new_ds_w_region_var_names))

            # Case 5: region subset combined with two variables.
            new_ds_w_region_title = 'from_local_to_local_region_two_var_sm_uncertainty'
            new_ds_w_region_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 16, 23, 59)))
            new_ds_w_region_var_names = VarNamesLike.convert(
                ['sm', 'sm_uncertainty'])
            new_ds_w_region_spatial_coverage = PolygonLike.convert(
                "10,20,30,40")

            new_ds_w_region = soilmoisture_data_source.make_local(
                new_ds_w_region_title,
                time_range=new_ds_w_region_time_range,
                var_names=new_ds_w_region_var_names,
                region=new_ds_w_region_spatial_coverage
            )  # type: LocalDataSource

            self.assertIsNotNone(new_ds_w_region)

            self.assertEqual(new_ds_w_region.id,
                             "local.%s" % new_ds_w_region_title)
            self.assertEqual(new_ds_w_region.spatial_coverage(),
                             new_ds_w_region_spatial_coverage)
            data_set = new_ds_w_region.open_dataset()
            new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

            self.assertSetEqual(set(data_set.variables),
                                set(new_ds_w_region_var_names))

            # Case 6: a time range with no fixture files must raise
            # DataAccessError with the expected message.
            empty_ds_timerange = (datetime.datetime(2017, 12, 1, 0, 0),
                                  datetime.datetime(2017, 12, 31, 23, 59))
            with self.assertRaises(DataAccessError) as cm:
                soilmoisture_data_source.make_local(
                    'empty_ds', time_range=empty_ds_timerange)

            self.assertEqual(
                'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets in given '
                'time range {}'.format(
                    soilmoisture_data_source.id,
                    TimeRangeLike.format(empty_ds_timerange)),
                str(cm.exception))

            # Case 7: title defaults to the remote source's title ...
            new_ds_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 14, 23, 59)))

            new_ds = soilmoisture_data_source.make_local(
                "title_test_copy", time_range=new_ds_time_range)
            self.assertIsNotNone(new_ds)
            self.assertEqual(new_ds.meta_info['title'],
                             soilmoisture_data_source.meta_info['title'])

            # ... and can be overridden explicitly.
            title = "Title Test!"
            new_ds = soilmoisture_data_source.make_local(
                "title_test_set", title, time_range=new_ds_time_range)
            self.assertIsNotNone(new_ds)
            self.assertEqual(new_ds.meta_info['title'], title)
def test_format(self):
    """Check TimeRangeLike.format for None, midnight-only, and intra-day ranges."""
    self.assertEqual('', TimeRangeLike.format(None))
    date_only_range = (datetime(2001, 1, 1), datetime(2002, 1, 1))
    self.assertEqual('2001-01-01, 2002-01-01',
                     TimeRangeLike.format(date_only_range))
    timed_range = (datetime(2001, 1, 1, 12), datetime(2002, 1, 1, 9, 30, 2))
    self.assertEqual('2001-01-01T12:00:00, 2002-01-01T09:30:02',
                     TimeRangeLike.format(timed_range))