def get_forecast(self, input_source_types, utc_period, t_c, geo_location_criteria=None):
    """Get shyft source vectors of time series for input_source_types.

    Parameters
    ----------
    input_source_types: list
        List of source types to retrieve (precipitation, temperature, ...).
    utc_period: api.UtcPeriod
        The utc time period that should (as a minimum) be covered.
    t_c: long
        Forecast specification; return newest forecast older than t_c.
    geo_location_criteria: object, optional
        Some type (to be decided), extent (bbox + coord.ref).

    Returns
    -------
    geo_loc_ts: dictionary
        Dictionary keyed by time series name, where values are api vectors
        of geo located timeseries.
    """
    filename = self._get_files(t_c, r"(\d{8})([T_])(\d{2})(.*\.nc)")
    with Dataset(filename) as dataset:
        if utc_period is None:
            # default to the full time span of the dataset
            time = dataset.variables.get("time", None)
            conv_time = convert_netcdf_time(time.units, time)
            utc_period = api.UtcPeriod(int(conv_time[0]), int(conv_time[-1]))
        return self._get_data_from_dataset(dataset, input_source_types, utc_period,
                                           geo_location_criteria)
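A minimal sketch of how a file-name pattern like the one above can pick the newest forecast file at or before t_c. The file names and the helper name _newest_file_before are hypothetical; only the regex mirrors get_forecast.

import calendar
import re
import time

def _newest_file_before(file_names, t_c, pattern=r"(\d{8})([T_])(\d{2})(.*\.nc)"):
    """Return the name whose embedded timestamp is the newest one <= t_c (hypothetical helper)."""
    candidates = []
    for name in file_names:
        match = re.search(pattern, name)
        if match is None:
            continue  # not a forecast file
        # group 1 is YYYYMMDD, group 3 is the forecast hour HH
        t_file = calendar.timegm(time.strptime(match.group(1) + match.group(3), "%Y%m%d%H"))
        if t_file <= t_c:
            candidates.append((t_file, name))
    if not candidates:
        raise ValueError("no forecast file found at or before t_c")
    return max(candidates)[1]

# _newest_file_before(["fc_20180101T00.nc", "fc_20180101T06.nc"], t_c=1514786400) -> "fc_20180101T06.nc"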
def get_forecast_ensemble(self, input_source_types, utc_period, t_c, geo_location_criteria=None):
    """Get an ensemble of shyft source vectors of time series for input_source_types.

    Parameters
    ----------
    input_source_types: list
        List of source types to retrieve (precipitation, temperature, ...).
    utc_period: api.UtcPeriod
        The utc time period that should (as a minimum) be covered.
    t_c: long
        Forecast specification; return newest forecast older than t_c.
    geo_location_criteria: object, optional
        Some type (to be decided), extent (bbox + coord.ref).

    Returns
    -------
    ensemble: list of geo_loc_ts dictionaries
        Dictionaries are keyed by time series type, with values being
        api vectors of geo located timeseries.
    """
    filename = self._get_files(t_c, r"_(\d{8})([T_])(\d{2})(Z)?\.nc$")
    with Dataset(filename) as dataset:
        if utc_period is None:
            # default to the full time span of the dataset
            time = dataset.variables.get("time", None)
            conv_time = convert_netcdf_time(time.units, time)
            utc_period = api.UtcPeriod(int(conv_time[0]), int(conv_time[-1]))
        return self._get_data_from_dataset(dataset, input_source_types, utc_period,
                                           geo_location_criteria, ensemble_member='all')
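A hedged sketch of consuming the ensemble result: one geo_loc_ts dictionary per member. The .ts.values.to_numpy() access below assumes the shyft api geo-source interface and is illustrative rather than exact.

import numpy as np

def ensemble_mean_at_point(ensemble, source_type, point_idx):
    """Average one geo-point's series across all ensemble members (illustrative)."""
    series = [member[source_type][point_idx].ts.values.to_numpy() for member in ensemble]
    return np.vstack(series).mean(axis=0)

# e.g. ensemble_mean_at_point(repo.get_forecast_ensemble(["temperature"], None, t_c), "temperature", 0)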
def remove_tp_data(self, period: UtcPeriod):
    """Delete the data points that fall within the given time period.

    :param period: UtcPeriod to remove from the stored time series
    :return: None
    """
    time_series_cropped = None

    with Dataset(self.file_path, 'a') as ds:
        # 1. load the time axis and the variable
        time = ds.variables.get('time', None)
        if time is None:
            raise TimeSeriesStoreError('Something is wrong with the dataset. time not found.')
        var = ds.variables.get(self.ts_meta_info.variable_name, None)
        if var is None:
            raise TimeSeriesStoreError('Something is wrong with the dataset. variable {0} not found.'
                                       .format(self.ts_meta_info.variable_name))

        if len(time):
            # 2. get indices of the data to delete
            time_utc = convert_netcdf_time(time.units, time)
            idx_min = np.searchsorted(time_utc, period.start, side='left')
            idx_max = np.searchsorted(time_utc, period.end, side='right')

            # only rebuild if some data falls outside the period
            if idx_max - idx_min != len(time):
                # 3. splice the period out of both the time axis and the values
                if idx_max < len(time):
                    time_cropped = np.append(time[0:idx_min], time[idx_max:])
                    var_cropped = np.append(var[0:idx_min], var[idx_max:])
                else:
                    time_cropped = np.append(time[0:idx_min], [])
                    var_cropped = np.append(var[0:idx_min], [])
                # extrapolate the end of the last interval from the last two points
                # (assumes at least two points remain after cropping)
                last_time_point = 2 * time_cropped[-1] - time_cropped[-2]
                ta = TimeAxis(UtcTimeVector.from_numpy(time_cropped.astype(np.int64)),
                              int(last_time_point))
                time_series_cropped = TimeSeries(ta, dv.from_numpy(var_cropped),
                                                 point_fx.POINT_INSTANT_VALUE)  # TODO: is this correct point policy?

    # 4. recreate the file and write back the remaining data
    self.create_new_file()
    if time_series_cropped:
        self.append_ts_data(time_series_cropped)
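Steps 2-3 above reduce to a searchsorted splice. Here is a self-contained, numpy-only version of that cropping; the function name crop_period is made up for illustration, and the end is inclusive, matching the side='right' choice above.

import numpy as np

def crop_period(time_utc, values, start, end):
    """Return (time, values) with every point in [start, end] removed."""
    idx_min = np.searchsorted(time_utc, start, side='left')
    idx_max = np.searchsorted(time_utc, end, side='right')
    keep = np.r_[0:idx_min, idx_max:len(time_utc)]
    return time_utc[keep], values[keep]

# crop_period(np.array([0, 3600, 7200, 10800]), np.array([1., 2., 3., 4.]), 3600, 7200)
# -> (array([    0, 10800]), array([1., 4.]))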
def test_unit_conversion(self):
    utc = api.Calendar()
    # use hours both before and after epoch to ensure the sign is handled correctly
    t_num = np.arange(-24, 24, 1, dtype=np.float64)
    t_converted = convert_netcdf_time('hours since 1970-01-01 00:00:00', t_num)
    t_axis = api.TimeAxisFixedDeltaT(utc.time(1969, 12, 31, 0, 0, 0), api.deltahours(1), 2 * 24)
    for i in range(t_axis.size()):
        self.assertEqual(t_converted[i], t_axis(i).start)
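For reference, a minimal stand-in showing what convert_netcdf_time computes for the unit string exercised by this test; the real helper parses arbitrary CF-style time units, so this covers only the hours-since-epoch case.

import numpy as np

def hours_since_epoch_to_utc_seconds(t_num):
    """Convert 'hours since 1970-01-01 00:00:00' to integer utc seconds."""
    return (np.asarray(t_num, dtype=np.float64) * 3600.0).astype(np.int64)

# hours_since_epoch_to_utc_seconds([-24, 0, 24]) -> array([-86400, 0, 86400])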
def _get_data_from_dataset(self, dataset, input_source_types, utc_period,
                           geo_location_criteria, ensemble_member=None):
    if geo_location_criteria is not None:
        self._bounding_box = geo_location_criteria

    if "wind_speed" in input_source_types:
        input_source_types = list(input_source_types)  # we change the input list, so take a copy
        input_source_types.remove("wind_speed")
        input_source_types.append("x_wind")
        input_source_types.append("y_wind")

    no_temp = "temperature" not in input_source_types
    if "relative_humidity" in input_source_types:
        if not isinstance(input_source_types, list):
            input_source_types = list(input_source_types)  # we change the input list, so take a copy
        input_source_types.remove("relative_humidity")
        input_source_types.extend(["surface_air_pressure", "dew_point_temperature_2m"])
        if no_temp:
            input_source_types.extend(["temperature"])

    unit_ok = {k: dataset.variables[k].units in self.var_units[k]
               for k in dataset.variables.keys()
               if self._arome_shyft_map.get(k, None) in input_source_types}
    if not all(unit_ok.values()):
        raise EcDataRepositoryError("The following variables have wrong unit: {}."
                                    .format(', '.join([k for k, v in unit_ok.items() if not v])))

    raw_data = {}
    x = dataset.variables.get("longitude", None)
    y = dataset.variables.get("latitude", None)
    time = dataset.variables.get("time", None)
    if x is None or y is None or time is None:
        raise EcDataRepositoryError("Something is wrong with the dataset."
                                    " x/y coords or time not found.")
    # both coordinates must use the same unit, and it must be m or km
    if not all([var.units in ['km', 'm'] for var in [x, y]]) or x.units != y.units:
        raise EcDataRepositoryError("The unit for x and y coordinates should be either m or km.")
    coord_conv = 1000. if x.units == 'km' else 1.

    time = convert_netcdf_time(time.units, time)

    data_cs = dataset.variables.get("projection_regular_ll", None)
    if data_cs is None:
        raise EcDataRepositoryError("No coordinate system information in dataset.")

    idx_min = np.searchsorted(time, utc_period.start, side='left')
    idx_max = np.searchsorted(time, utc_period.end, side='right')
    issubset = idx_max < len(time) - 1
    time_slice = slice(idx_min, idx_max)

    x, y, (m_x, m_y), _ = self._limit(x[:] * coord_conv, y[:] * coord_conv,
                                      data_cs.proj4, self.shyft_cs)

    for k in dataset.variables.keys():
        if self._arome_shyft_map.get(k, None) in input_source_types:
            if k in self._shift_fields and issubset:
                # add one to the time slice for accumulated fields that are shifted later
                data_time_slice = slice(time_slice.start, time_slice.stop + 1)
            else:
                data_time_slice = time_slice
            data = dataset.variables[k]
            dims = data.dimensions
            data_slice = len(data.dimensions) * [slice(None)]
            if isinstance(ensemble_member, int):
                data_slice[dims.index("ensemble_member")] = ensemble_member
            elif ensemble_member == 'all':
                data_slice[dims.index("ensemble_member")] = slice(0, dataset.dimensions['ensemble_member'].size, None)
            data_slice[dims.index("longitude")] = m_x
            data_slice[dims.index("latitude")] = m_y
            data_slice[dims.index("time")] = data_time_slice
            pure_arr = data[data_slice]
            if isinstance(pure_arr, np.ma.core.MaskedArray):
                pure_arr = pure_arr.filled(np.nan)
            raw_data[self._arome_shyft_map[k]] = pure_arr, k

    if self.elevation_file is not None:
        _x, _y, z = self._read_elevation_file(self.elevation_file)
        assert np.linalg.norm(x - _x) < 1.0e-10  # x/y coordinates should match
        assert np.linalg.norm(y - _y) < 1.0e-10
    elif any([nm in dataset.variables.keys() for nm in ['altitude', 'surface_geopotential']]):
        var_nm = ['altitude', 'surface_geopotential'][
            [nm in dataset.variables.keys() for nm in ['altitude', 'surface_geopotential']].index(True)]
        data = dataset.variables[var_nm]
        dims = data.dimensions
        data_slice = len(data.dimensions) * [slice(None)]
        data_slice[dims.index("longitude")] = m_x
        data_slice[dims.index("latitude")] = m_y
        z = data[data_slice]
        shp = z.shape
        z = z.reshape(shp[-2], shp[-1])
        if var_nm == 'surface_geopotential':
            z /= self._G  # convert geopotential to height above sea level
    else:
        raise EcDataRepositoryError("No elevations found in dataset, and no elevation file given.")

    pts = np.dstack((x, y, z)).reshape(*(x.shape + (3,)))

    # make sure requested fields are valid, and that the dataset contains the requested data
    if not self.allow_subset and not set(raw_data.keys()).issuperset(input_source_types):
        raise EcDataRepositoryError("Could not find all data fields")

    if set(("x_wind", "y_wind")).issubset(raw_data):
        x_wind, _ = raw_data.pop("x_wind")
        y_wind, _ = raw_data.pop("y_wind")
        raw_data["wind_speed"] = np.sqrt(np.square(x_wind) + np.square(y_wind)), "wind_speed"
    if set(("surface_air_pressure", "dew_point_temperature_2m")).issubset(raw_data):
        sfc_p, _ = raw_data.pop("surface_air_pressure")
        dpt_t, _ = raw_data.pop("dew_point_temperature_2m")
        if no_temp:
            sfc_t, _ = raw_data.pop("temperature")
        else:
            sfc_t, _ = raw_data["temperature"]
        raw_data["relative_humidity"] = self.calc_RH(sfc_t, dpt_t, sfc_p), "relative_humidity"

    if ensemble_member == 'all':
        returndata = []
        for i in range(51):  # 51 = ECMWF ensemble size; could be read from dataset.dimensions['ensemble_member']
            ensemble_raw = {k: (raw_data[k][0][:, 0, i, :, :], raw_data[k][1]) for k in raw_data.keys()}
            extracted_data = self._transform_raw(ensemble_raw, time[time_slice], issubset=issubset)
            returndata.append(self._geo_ts_to_vec(self._convert_to_timeseries(extracted_data), pts))
    else:
        extracted_data = self._transform_raw(raw_data, time[time_slice], issubset=issubset)
        returndata = self._geo_ts_to_vec(self._convert_to_timeseries(extracted_data), pts)
    return returndata
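The x_wind/y_wind derivation above is just the euclidean norm of the two horizontal components; a self-contained equivalent:

import numpy as np

def wind_speed(x_wind, y_wind):
    """Euclidean norm of the two horizontal wind components."""
    return np.sqrt(np.square(x_wind) + np.square(y_wind))

# wind_speed(np.array([3.0]), np.array([4.0])) -> array([5.])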
def append_ts_data(self, time_series: TimeSeries):
    """
    Ensure that the data-file content is equal to time_series for
    time_series.time_axis.total_period(). If needed, create and update
    the file meta-data.

    :param time_series: the TimeSeries to store
    :return: None
    """
    period = time_series.total_period()
    n_new_val = time_series.size()
    crop_data = False
    time_series_cropped = None

    with Dataset(self.file_path, 'a') as ds:
        # strategy: find the start index of the new data in the stored time
        # axis, overwrite the overlapping range, and when the new block
        # replaces a range of a different length, move the tail and, if
        # needed, crop the leftover at the end by rewriting the file
        time = ds.variables.get('time', None)
        if time is None:
            raise TimeSeriesStoreError('Something is wrong with the dataset. time not found.')
        var = ds.variables.get(self.ts_meta_info.variable_name, None)
        if var is None:
            raise TimeSeriesStoreError('Something is wrong with the dataset. variable {0} not found.'
                                       .format(self.ts_meta_info.variable_name))

        if len(time):
            time_utc = convert_netcdf_time(time.units, time)
            idx_min = np.searchsorted(time_utc, period.start, side='left')
            # use 'left' since period.end = time_point(last_value) + dt
            idx_max = np.searchsorted(time_utc, period.end, side='left')
            idx_data_end = idx_min + n_new_val

            # move data if we have an overlap, or if the new data's time is before the saved time
            if idx_min < len(time_utc) and idx_max < len(time_utc) and idx_max - idx_min != n_new_val:
                idx_last = len(time_utc)
                time[idx_data_end:] = time[idx_max:idx_last]
                var[idx_data_end:, 0] = var[idx_max:idx_last, 0]

            # insert the new data
            time[idx_min:idx_data_end] = time_series.time_axis.time_points[:-1]
            var[idx_min:idx_data_end, 0] = time_series.values.to_numpy()

            # crop any data which should no longer be there
            if idx_max - idx_min - n_new_val > 0:
                idx_del_start = len(time) - idx_max + idx_min + n_new_val
                crop_data = True
                time_cropped = time[0:idx_del_start]
                var_cropped = var[0:idx_del_start, 0]
                # extrapolate the end of the last interval from the last two points
                last_time_point = 2 * time_cropped[-1] - time_cropped[-2]
                ta = TimeAxis(UtcTimeVector.from_numpy(time_cropped.astype(np.int64)),
                              int(last_time_point))
                time_series_cropped = TimeSeries(ta, dv.from_numpy(var_cropped),
                                                 point_fx.POINT_INSTANT_VALUE)  # TODO: is this the right policy?
        else:
            time[:] = time_series.time_axis.time_points[:-1]
            var[:, 0] = time_series.values.to_numpy()

        ds.sync()

    if crop_data and time_series_cropped:
        self.create_new_file()
        self.append_ts_data(time_series_cropped)
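Conceptually, the in-place update above is a splice on a sorted time axis. A pure-numpy sketch of that idea (the name splice is made up); it ignores the netCDF in-place constraints that force the explicit move/crop branches above.

import numpy as np

def splice(time_old, val_old, time_new, val_new):
    """Replace every old point inside [time_new[0], time_new[-1]] with the new block."""
    idx_min = np.searchsorted(time_old, time_new[0], side='left')
    idx_max = np.searchsorted(time_old, time_new[-1], side='right')
    time_out = np.concatenate((time_old[:idx_min], time_new, time_old[idx_max:]))
    val_out = np.concatenate((val_old[:idx_min], val_new, val_old[idx_max:]))
    return time_out, val_out

# splice(np.array([0, 3600, 7200]), np.array([1., 2., 3.]),
#        np.array([3600]), np.array([9.]))
# -> (array([   0, 3600, 7200]), array([1., 9., 3.]))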