def open_dataset(self,
                 time_range: TimeRangeLike.TYPE = None,
                 region: PolygonLike.TYPE = None,
                 var_names: VarNamesLike.TYPE = None,
                 protocol: str = None) -> Any:
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    var_names = VarNamesLike.convert(var_names) if var_names else None

    selected_file_list = self._find_files(time_range)
    if not selected_file_list:
        msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(self.id)
        if time_range is not None:
            msg += ' in given time range {}'.format(TimeRangeLike.format(time_range))
        raise DataAccessError(msg)

    files = self._get_urls_list(selected_file_list, _ODP_PROTOCOL_OPENDAP)
    try:
        ds = open_xarray_dataset(files)
        if region:
            ds = normalize_impl(ds)
            ds = subset_spatial_impl(ds, region)
        if var_names:
            ds = ds.drop([var_name for var_name in ds.data_vars.keys()
                          if var_name not in var_names])
        return ds
    except OSError as e:
        if time_range:
            raise DataAccessError("Cannot open remote dataset for time range {}:\n"
                                  "{}".format(TimeRangeLike.format(time_range), e),
                                  source=self) from e
        else:
            raise DataAccessError("Cannot open remote dataset:\n"
                                  "{}".format(e),
                                  source=self) from e
def open_dataset(self,
                 time_range: TimeRangeLike.TYPE = None,
                 region: PolygonLike.TYPE = None,
                 var_names: VarNamesLike.TYPE = None,
                 protocol: str = None) -> Any:
    time_range = TimeRangeLike.convert(time_range) if time_range else None
    if region:
        region = PolygonLike.convert(region)
    if var_names:
        var_names = VarNamesLike.convert(var_names)

    paths = []
    if time_range:
        time_series = list(self._files.values())
        file_paths = list(self._files.keys())
        for i in range(len(time_series)):
            if time_series[i]:
                if isinstance(time_series[i], Tuple) and \
                        time_series[i][0] >= time_range[0] and \
                        time_series[i][1] <= time_range[1]:
                    paths.extend(self._resolve_file_path(file_paths[i]))
                elif isinstance(time_series[i], datetime) and \
                        time_range[0] <= time_series[i] < time_range[1]:
                    paths.extend(self._resolve_file_path(file_paths[i]))
    else:
        for file in self._files.items():
            paths.extend(self._resolve_file_path(file[0]))

    if paths:
        paths = sorted(set(paths))
        try:
            ds = open_xarray_dataset(paths)
            if region:
                ds = normalize_impl(ds)
                ds = subset_spatial_impl(ds, region)
            if var_names:
                ds = ds.drop([var_name for var_name in ds.data_vars.keys()
                              if var_name not in var_names])
            return ds
        except OSError as e:
            if time_range:
                raise DataAccessError("Cannot open local dataset for time range {}:\n"
                                      "{}".format(TimeRangeLike.format(time_range), e),
                                      source=self) from e
            else:
                raise DataAccessError("Cannot open local dataset:\n"
                                      "{}".format(e),
                                      source=self) from e
    else:
        if time_range:
            raise DataAccessError("No local datasets available for\nspecified time range {}".format(
                TimeRangeLike.format(time_range)), source=self)
        else:
            raise DataAccessError("No local datasets available", source=self)
def _load_json_file(json_path: str):
    if os.path.isfile(json_path):
        try:
            with open(json_path) as fp:
                return json.load(fp=fp) or {}
        except json.decoder.JSONDecodeError as e:
            raise DataAccessError("Cannot load data source config from {}".format(json_path)) from e
    else:
        raise DataAccessError("Data source config does not exist: {}".format(json_path))
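# Usage sketch for _load_json_file: the file name below is illustrative, not a path used by this
# module. A missing or malformed config file surfaces as DataAccessError, which callers may catch
# and replace with an empty config.
def _example_read_config(config_path='data-source.json'):
    try:
        return _load_json_file(config_path)
    except DataAccessError:
        return {}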
def _fetch_solr_json(base_url, query_args, offset=0, limit=3500, timeout=10,
                     monitor: Monitor = Monitor.NONE):
    """
    Return JSON value read from paginated Solr web-service.
    """
    combined_json_dict = None
    num_found = -1
    # we don't know ahead of time how many requests are necessary
    with monitor.starting("Loading", 10):
        while True:
            monitor.progress(work=1)
            paging_query_args = dict(query_args or {})
            # noinspection PyArgumentList
            paging_query_args.update(offset=offset, limit=limit, format='application/solr+json')
            url = base_url + '?' + urllib.parse.urlencode(paging_query_args)
            try:
                with urllib.request.urlopen(url, timeout=timeout) as response:
                    json_text = response.read()
                    json_dict = json.loads(json_text.decode('utf-8'))
                    if num_found == -1:
                        num_found = json_dict.get('response', {}).get('numFound', 0)
                    if not combined_json_dict:
                        combined_json_dict = json_dict
                        if num_found < limit:
                            break
                    else:
                        docs = json_dict.get('response', {}).get('docs', [])
                        combined_json_dict.get('response', {}).get('docs', []).extend(docs)
                        if num_found < offset + limit:
                            break
            except (urllib.error.HTTPError, urllib.error.URLError) as e:
                raise DataAccessError("Downloading CCI Open Data Portal index failed: {}\n{}"
                                      .format(e, base_url)) from e
            except socket.timeout:
                raise DataAccessError("Downloading CCI Open Data Portal index failed: "
                                      "connection timeout\n{}".format(base_url))
            offset += limit
    return combined_json_dict
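# Usage sketch for _fetch_solr_json: a minimal call that mirrors the query arguments used by
# _load_index below. _ESGF_CEDA_URL and Monitor.NONE come from this module's context; the smaller
# page size and timeout are only illustrative, not recommended settings.
def _example_fetch_dataset_index():
    solr_dict = _fetch_solr_json(_ESGF_CEDA_URL,
                                 dict(type='Dataset', replica='false', latest='true', project='esacci'),
                                 limit=100,
                                 timeout=30)
    return solr_dict.get('response', {}).get('numFound', 0)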
def _load_index(self):
    try:
        esgf_json_dict = _load_or_fetch_json(
            _fetch_solr_json,
            fetch_json_args=[
                _ESGF_CEDA_URL,
                dict(type='Dataset', replica='false', latest='true', project='esacci')
            ],
            cache_used=self._index_cache_used,
            cache_dir=get_metadata_store_path(),
            cache_json_filename='dataset-list.json',
            cache_timestamp_filename='dataset-list-timestamp.json',
            cache_expiration_days=self._index_cache_expiration_days)

        cci_catalogue_service = EsaCciCatalogueService(_CSW_CEDA_URL)
        csw_json_dict = _load_or_fetch_json(
            cci_catalogue_service.getrecords,
            fetch_json_args=[],
            cache_used=self._index_cache_used,
            cache_dir=get_metadata_store_path(),
            cache_json_filename='catalogue.json',
            cache_timestamp_filename='catalogue-timestamp.json',
            cache_expiration_days=self._index_cache_expiration_days)
    except DataAccessError as e:
        raise DataAccessError("Cannot download CCI Open Data Portal ECV index:\n{}".format(e),
                              source=self) from e

    self._csw_data = csw_json_dict
    self._esgf_data = esgf_json_dict
def test_with_source(self):
    store = SimpleDataStore('hihi', [])
    try:
        raise DataAccessError("haha", source=store)
    except DataAccessError as e:
        self.assertEqual(str(e), 'Data store "hihi": haha')
        self.assertIs(e.source, store)
        self.assertIs(e.cause, None)

    source = SimpleDataSource('hehe')
    try:
        raise DataAccessError("haha", source=source)
    except DataAccessError as e:
        self.assertEqual(str(e), 'Data source "hehe": haha')
        self.assertIs(e.source, source)
        self.assertIs(e.cause, None)
def test_plain(self):
    try:
        raise DataAccessError("haha")
    except DataAccessError as e:
        self.assertEqual(str(e), "haha")
        self.assertEqual(e.source, None)
        self.assertEqual(e.cause, None)
def test_with_cause(self):
    e1 = ValueError("a > 5")
    try:
        raise DataAccessError("hoho") from e1
    except DataAccessError as e2:
        self.assertEqual(str(e2), "hoho")
        self.assertIs(e2.source, None)
        self.assertIs(e2.cause, e1)
def _save_data_source(self, data_source):
    json_dict = data_source.to_json_dict()
    dump_kwargs = dict(indent=' ', default=self._json_default_serializer)
    file_name = os.path.join(self._store_dir, data_source.id + '.json')
    try:
        with open(file_name, 'w') as fp:
            json.dump(json_dict, fp, **dump_kwargs)
    except EnvironmentError as e:
        raise DataAccessError("Couldn't save data source config file {}\n"
                              "{}".format(file_name, e), source=self) from e
def _make_local(self,
                local_ds: LocalDataSource,
                time_range: TimeRangeLike.TYPE = None,
                region: PolygonLike.TYPE = None,
                var_names: VarNamesLike.TYPE = None,
                monitor: Monitor = Monitor.NONE):
    local_id = local_ds.id
    time_range = TimeRangeLike.convert(time_range)
    region = PolygonLike.convert(region)
    var_names = VarNamesLike.convert(var_names)

    time_range, region, var_names = self._apply_make_local_fixes(time_range, region, var_names)

    compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
    compression_enabled = True if compression_level > 0 else False

    do_update_of_verified_time_coverage_start_once = True
    verified_time_coverage_start = None
    verified_time_coverage_end = None

    encoding_update = dict()
    if compression_enabled:
        encoding_update.update({'zlib': True, 'complevel': compression_level})

    if region or var_names:
        protocol = _ODP_PROTOCOL_OPENDAP
    else:
        protocol = _ODP_PROTOCOL_HTTP

    local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
    if not os.path.exists(local_path):
        os.makedirs(local_path)

    selected_file_list = self._find_files(time_range)
    if not selected_file_list:
        msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(self.id)
        if time_range is not None:
            msg += ' in given time range {}'.format(TimeRangeLike.format(time_range))
        raise DataAccessError(msg)

    try:
        if protocol == _ODP_PROTOCOL_OPENDAP:
            do_update_of_variables_meta_info_once = True
            do_update_of_region_meta_info_once = True

            files = self._get_urls_list(selected_file_list, protocol)
            monitor.start('Sync ' + self.id, total_work=len(files))
            for idx, dataset_uri in enumerate(files):
                child_monitor = monitor.child(work=1)

                file_name = os.path.basename(dataset_uri)
                local_filepath = os.path.join(local_path, file_name)

                time_coverage_start = selected_file_list[idx][1]
                time_coverage_end = selected_file_list[idx][2]

                try:
                    child_monitor.start(label=file_name, total_work=1)

                    remote_dataset = xr.open_dataset(dataset_uri)
                    if var_names:
                        remote_dataset = remote_dataset.drop(
                            [var_name for var_name in remote_dataset.data_vars.keys()
                             if var_name not in var_names])
                    if region:
                        remote_dataset = normalize_impl(remote_dataset)
                        remote_dataset = subset_spatial_impl(remote_dataset, region)
                        geo_lon_min, geo_lat_min, geo_lon_max, geo_lat_max = region.bounds

                        remote_dataset.attrs['geospatial_lat_min'] = geo_lat_min
                        remote_dataset.attrs['geospatial_lat_max'] = geo_lat_max
                        remote_dataset.attrs['geospatial_lon_min'] = geo_lon_min
                        remote_dataset.attrs['geospatial_lon_max'] = geo_lon_max

                        if do_update_of_region_meta_info_once:
                            local_ds.meta_info['bbox_maxx'] = geo_lon_max
                            local_ds.meta_info['bbox_minx'] = geo_lon_min
                            local_ds.meta_info['bbox_maxy'] = geo_lat_max
                            local_ds.meta_info['bbox_miny'] = geo_lat_min
                            do_update_of_region_meta_info_once = False

                    if compression_enabled:
                        for sel_var_name in remote_dataset.variables.keys():
                            remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                    remote_dataset.to_netcdf(local_filepath)
                    child_monitor.progress(work=1, msg=str(time_coverage_start))
                finally:
                    if do_update_of_variables_meta_info_once:
                        variables_info = local_ds.meta_info.get('variables', [])
                        local_ds.meta_info['variables'] = [
                            var_info for var_info in variables_info
                            if var_info.get('name') in remote_dataset.variables.keys()
                            and var_info.get('name') not in remote_dataset.dims.keys()
                        ]
                        do_update_of_variables_meta_info_once = False

                    local_ds.add_dataset(os.path.join(local_id, file_name),
                                         (time_coverage_start, time_coverage_end))

                    if do_update_of_verified_time_coverage_start_once:
                        verified_time_coverage_start = time_coverage_start
                        do_update_of_verified_time_coverage_start_once = False
                    verified_time_coverage_end = time_coverage_end

                    child_monitor.done()
        else:
            outdated_file_list = []
            for file_rec in selected_file_list:
                filename, _, _, file_size, url = file_rec
                dataset_file = os.path.join(local_path, filename)
                # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                # ... outdated or incomplete or corrupted.
                # JSON also includes "checksum" and "checksum_type" fields.
                if not os.path.isfile(dataset_file) or \
                        (file_size and os.path.getsize(dataset_file) != file_size):
                    outdated_file_list.append(file_rec)

            if outdated_file_list:
                with monitor.starting('Sync ' + self.id, len(outdated_file_list)):
                    bytes_to_download = sum([file_rec[3] for file_rec in outdated_file_list])
                    dl_stat = _DownloadStatistics(bytes_to_download)

                    file_number = 1
                    for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                        dataset_file = os.path.join(local_path, filename)
                        sub_monitor = monitor.child(work=1.0)

                        # noinspection PyUnusedLocal
                        def reporthook(block_number, read_size, total_file_size):
                            dl_stat.handle_chunk(read_size)
                            sub_monitor.progress(work=read_size, msg=str(dl_stat))

                        sub_monitor_msg = "file %d of %d" % (file_number, len(outdated_file_list))
                        with sub_monitor.starting(sub_monitor_msg, file_size):
                            urllib.request.urlretrieve(url[protocol],
                                                       filename=dataset_file,
                                                       reporthook=reporthook)
                        file_number += 1

                        local_ds.add_dataset(os.path.join(local_id, filename),
                                             (coverage_from, coverage_to))

                        if do_update_of_verified_time_coverage_start_once:
                            verified_time_coverage_start = coverage_from
                            do_update_of_verified_time_coverage_start_once = False
                        verified_time_coverage_end = coverage_to
    except OSError as e:
        raise DataAccessError("Copying remote data source failed: {}".format(e),
                              source=self) from e

    local_ds.meta_info['temporal_coverage_start'] = TimeLike.format(verified_time_coverage_start)
    local_ds.meta_info['temporal_coverage_end'] = TimeLike.format(verified_time_coverage_end)
    local_ds.save(True)
def _load_or_fetch_json(fetch_json_function,
                        fetch_json_args: list = None,
                        fetch_json_kwargs: dict = None,
                        cache_used: bool = False,
                        cache_dir: str = None,
                        cache_json_filename: str = None,
                        cache_timestamp_filename: str = None,
                        cache_expiration_days: float = 1.0) -> Sequence:
    """
    Return (JSON) value of fetch_json_function or return value of a cached JSON file.
    """
    json_obj = None
    cache_json_file = None

    if cache_used:
        if cache_dir is None:
            raise ValueError('if cache_used argument is True, cache_dir argument must not be None')
        if cache_json_filename is None:
            raise ValueError('if cache_used argument is True, cache_json_filename argument must not be None')
        if cache_timestamp_filename is None:
            raise ValueError('if cache_used argument is True, cache_timestamp_filename argument must not be None')
        if cache_expiration_days is None:
            raise ValueError('if cache_used argument is True, cache_expiration_days argument must not be None')

        cache_json_file = os.path.join(cache_dir, cache_json_filename)
        cache_timestamp_file = os.path.join(cache_dir, cache_timestamp_filename)

        timestamp = datetime(year=2000, month=1, day=1)
        if os.path.exists(cache_timestamp_file):
            with open(cache_timestamp_file) as fp:
                timestamp_text = fp.read()
                timestamp = datetime.strptime(timestamp_text, _TIMESTAMP_FORMAT)

        time_diff = datetime.now() - timestamp
        time_diff_days = time_diff.days + time_diff.seconds / 3600. / 24.
        if time_diff_days < cache_expiration_days:
            if os.path.exists(cache_json_file):
                with open(cache_json_file) as fp:
                    json_text = fp.read()
                    json_obj = json.loads(json_text)

    if json_obj is None:
        # noinspection PyArgumentList
        try:
            # noinspection PyArgumentList
            json_obj = fetch_json_function(*(fetch_json_args or []), **(fetch_json_kwargs or {}))
            if cache_used:
                os.makedirs(cache_dir, exist_ok=True)
                # noinspection PyUnboundLocalVariable
                with open(cache_json_file, 'w') as fp:
                    fp.write(json.dumps(json_obj, indent=' '))
                # noinspection PyUnboundLocalVariable
                with open(cache_timestamp_file, 'w') as fp:
                    fp.write(datetime.utcnow().strftime(_TIMESTAMP_FORMAT))
        except Exception as e:
            if cache_json_file and os.path.exists(cache_json_file):
                with open(cache_json_file) as fp:
                    json_text = fp.read()
                    json_obj = json.loads(json_text)
            else:
                if isinstance(e, DataAccessError):
                    raise DataAccessError("Cannot fetch information from CCI Open Data Portal server.") from e
                else:
                    raise e

    return json_obj
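# Usage sketch for _load_or_fetch_json with a hypothetical fetch function and illustrative cache
# file names; only the keyword arguments shown in the signature above are assumed. On a cache miss
# or expired cache the payload is fetched and written to cache_dir, otherwise the cached JSON is read.
def _example_cached_fetch(cache_dir):
    def fetch_payload():
        return {'values': [1, 2, 3]}

    return _load_or_fetch_json(fetch_payload,
                               cache_used=True,
                               cache_dir=cache_dir,
                               cache_json_filename='payload.json',
                               cache_timestamp_filename='payload-timestamp.json',
                               cache_expiration_days=1.0)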
def test_plain(self):
    try:
        raise DataAccessError("haha")
    except DataAccessError as e:
        self.assertEqual(str(e), "haha")
        self.assertIsInstance(e, Exception)
def create_data_source(self,
                       data_source_id: str,
                       region: PolygonLike.TYPE = None,
                       title: str = None,
                       time_range: TimeRangeLike.TYPE = None,
                       var_names: VarNamesLike.TYPE = None,
                       meta_info: OrderedDict = None,
                       lock_file: bool = False):
    self._init_data_sources()
    if title:
        if not meta_info:
            meta_info = OrderedDict()
        meta_info['title'] = title

    if not data_source_id.startswith('%s.' % self.id):
        data_source_id = '%s.%s' % (self.id, data_source_id)

    lock_filename = '{}.lock'.format(data_source_id)
    lock_filepath = os.path.join(self._store_dir, lock_filename)
    pid = os.getpid()
    create_time = int(psutil.Process(pid).create_time() * 1000000)

    data_source = None
    for ds in self._data_sources:
        if ds.id == data_source_id:
            if lock_file and os.path.isfile(lock_filepath):
                with open(lock_filepath, 'r') as lock_fp:
                    writer_pid = lock_fp.readline()
                    if writer_pid:
                        writer_create_time = -1
                        if ":" in writer_pid:
                            writer_pid, writer_timestamp = [int(val) for val in writer_pid.split(":")]
                        else:
                            writer_pid, writer_timestamp = int(writer_pid), writer_create_time
                        if psutil.pid_exists(writer_pid) and writer_pid != pid:
                            if writer_timestamp > writer_create_time:
                                writer_create_time = int(psutil.Process(writer_pid).create_time() * 1000000)
                            if writer_create_time == writer_timestamp:
                                raise DataAccessError('Data source "{}" is currently being created by another '
                                                      'process (pid:{})'.format(ds.id, writer_pid),
                                                      source=self)
            # ds.temporal_coverage() == time_range and
            if ds.spatial_coverage() == region \
                    and ds.variables_info == var_names:
                data_source = ds
                data_source.set_completed(False)
                break
            raise DataAccessError('Data source "{}" already exists.'.format(data_source_id),
                                  source=self)

    if not data_source:
        data_source = LocalDataSource(data_source_id,
                                      files=[],
                                      data_store=self,
                                      spatial_coverage=region,
                                      variables=var_names,
                                      temporal_coverage=time_range,
                                      meta_info=meta_info,
                                      status=DataSourceStatus.PROCESSING)
    data_source.set_completed(False)
    self._save_data_source(data_source)

    if lock_file:
        with open(lock_filepath, 'w') as lock_fp:
            lock_fp.write("{}:{}".format(pid, create_time))

    return data_source
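# Sketch of the lock-file convention used by create_data_source above: the file holds
# "<writer pid>:<writer process create time in microseconds>", which is exactly what the
# reader branch parses back. Purely illustrative helper, not an additional API of the store.
def _example_write_lock(lock_filepath):
    pid = os.getpid()
    create_time = int(psutil.Process(pid).create_time() * 1000000)
    with open(lock_filepath, 'w') as fp:
        fp.write("{}:{}".format(pid, create_time))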