def test_get_data_start_end_labels_obs_longer_than_1h(site_metadata):
    """A 2h observation with a 5min run_length forecast raises ValueError."""
    run_time = pd.Timestamp('20190422T1945Z')
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('2h'))
    fx = default_forecast(
        site_metadata,
        run_length=pd.Timedelta('5min'))
    # obs interval cannot be longer than 1 hr
    with pytest.raises(ValueError) as err:
        utils.get_data_start_end(obs, fx, run_time)
    assert 'observation.interval_length <= 1h' in str(err.value)
def test_get_data_start_end_labels_obs_avg_fx_instant(site_metadata):
    """Instantaneous fx from interval-average obs raises ValueError."""
    run_time = pd.Timestamp('20190422T1945Z')
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('5min'),
        interval_label='ending')
    fx = default_forecast(
        site_metadata,
        issue_time_of_day=dt.time(hour=5),
        lead_time_to_start=pd.Timedelta('1h'),
        interval_length=pd.Timedelta('5min'),
        run_length=pd.Timedelta('1d'),
        interval_label='instant')
    with pytest.raises(ValueError) as err:
        utils.get_data_start_end(obs, fx, run_time)
    assert 'made from interval average obs' in str(err.value)
def test_get_data_start_end_labels_obs_fx_instant_mismatch(site_metadata):
    """Instant obs and instant fx with unequal interval lengths must fail."""
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('5min'),
        interval_label='instant')
    # interval_length must be equal if interval_label is also instant;
    # here the forecast uses 1h against the 5min observation
    fx = default_forecast(
        site_metadata,
        issue_time_of_day=dt.time(hour=5),
        lead_time_to_start=pd.Timedelta('1h'),
        interval_length=pd.Timedelta('1h'),
        run_length=pd.Timedelta('1d'),
        interval_label='instant')
    run_time = pd.Timestamp('20190422T1945Z')
    with pytest.raises(ValueError) as err:
        utils.get_data_start_end(obs, fx, run_time)
    assert 'with identical interval length' in str(err.value)
def test_get_data_start_end_labels_subhourly_window_limit(site_metadata):
    """A 5 minute run_length gives a 5 minute window ending at run_time."""
    run_time = pd.Timestamp('20190422T1945Z')
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('5min'),
        interval_label='beginning')
    # test subhourly limit on window
    fx = default_forecast(
        site_metadata,
        run_length=pd.Timedelta('5min'),
        interval_label='beginning')
    window_start, window_end = utils.get_data_start_end(obs, fx, run_time)
    assert window_start == pd.Timestamp('20190422T1940Z')
    assert window_end == pd.Timestamp('20190422T1945Z')
def _issue_time_generator(observation, fx, obs_mint, obs_maxt,
                          next_issue_time, max_run_time):
    """Yield the issue times whose data window is covered by observations.

    Starting at *next_issue_time* and stepping with
    ``utils.get_next_issue_time``, each issue time no later than
    *max_run_time* is checked against the data window returned by
    ``utils.get_data_start_end`` (the issue time is used as both the
    run time and the issue time).

    Iteration stops at the first issue time whose window ends after
    *obs_maxt*. An issue time is yielded only when its window starts
    strictly after *obs_mint*; otherwise it is skipped.
    """
    # nudge past the current issue time so the next lookup moves forward
    step = pd.Timedelta('1ns')
    issue_time = next_issue_time
    while issue_time <= max_run_time:
        window = utils.get_data_start_end(
            observation, fx, issue_time, issue_time)
        window_start, window_end = window
        if window_end > obs_maxt:
            # observations do not yet reach the end of this window
            return
        if window_start > obs_mint:
            yield issue_time
        issue_time = utils.get_next_issue_time(fx, issue_time + step)
def test_get_data_start_end_labels_1h_window_limit(site_metadata):
    """A 12h run_length still gives a 1 hour window ending at run_time."""
    run_time = pd.Timestamp('20190422T1945Z')
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('5min'),
        interval_label='beginning')
    # test 1 hr limit on window; ensure data no later than run time
    fx = default_forecast(
        site_metadata,
        run_length=pd.Timedelta('12h'),
        interval_label='beginning')
    window_start, window_end = utils.get_data_start_end(obs, fx, run_time)
    assert window_start == pd.Timestamp('20190422T1845Z')
    assert window_end == pd.Timestamp('20190422T1945Z')
def test_get_data_start_end_labels_obs_instant_fx_avg_intraday(site_metadata):
    """Check the data window for instant obs and a 15min 'ending' forecast."""
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('5min'),
        interval_label='instant')
    fx = default_forecast(
        site_metadata,
        issue_time_of_day=dt.time(hour=5),
        lead_time_to_start=pd.Timedelta('1h'),
        interval_length=pd.Timedelta('5min'),
        run_length=pd.Timedelta('15min'),
        interval_label='ending')
    run_time = pd.Timestamp('20190422T1945Z')
    window_start, window_end = utils.get_data_start_end(obs, fx, run_time)
    assert window_start == pd.Timestamp('20190422T193001Z')
    assert window_end == pd.Timestamp('20190422T1945Z')
def test_get_data_start_end_labels_obs_fx_instant(site_metadata):
    """Check the data window when obs and a 1d forecast are both instant."""
    run_time = pd.Timestamp('20190422T1945Z')
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('5min'),
        interval_label='instant')
    # interval_length must be equal if interval_label is also instant
    fx = default_forecast(
        site_metadata,
        issue_time_of_day=dt.time(hour=5),
        lead_time_to_start=pd.Timedelta('1h'),
        interval_length=pd.Timedelta('5min'),
        run_length=pd.Timedelta('1d'),
        interval_label='instant')
    window_start, window_end = utils.get_data_start_end(obs, fx, run_time)
    assert window_start == pd.Timestamp('20190421T0000Z')
    assert window_end == pd.Timestamp('20190421T235959Z')
def test_get_data_start_end_labels_obs_instant_fx_avg(site_metadata):
    """Check the data window for instant obs and a 1d 'beginning' forecast."""
    run_time = pd.Timestamp('20190422T1945Z')
    issue_time = pd.Timestamp('20190422T2300Z')
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('5min'),
        interval_label='instant')
    fx = default_forecast(
        site_metadata,
        issue_time_of_day=dt.time(hour=23),
        lead_time_to_start=pd.Timedelta('1h'),
        interval_length=pd.Timedelta('5min'),
        run_length=pd.Timedelta('1d'),
        interval_label='beginning')
    window_start, window_end = utils.get_data_start_end(
        obs, fx, run_time, issue_time)
    assert window_start == pd.Timestamp('20190421T0000Z')
    assert window_end == pd.Timestamp('20190421T235959Z')
def test_get_data_start_end_labels_obs_longer_than_1h_day_ahead(site_metadata):
    """A 2h observation is accepted when the forecast run_length is 1d."""
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('2h'),
        interval_label='beginning')
    fx = default_forecast(
        site_metadata,
        issue_time_of_day=dt.time(hour=5),
        lead_time_to_start=pd.Timedelta('1h'),
        interval_length=pd.Timedelta('1h'),
        run_length=pd.Timedelta('1d'),  # day ahead
        interval_label='beginning')
    run_time = pd.Timestamp('20190422T1945Z')
    # day ahead doesn't care about obs interval length
    window_start, window_end = utils.get_data_start_end(obs, fx, run_time)
    assert window_start == pd.Timestamp('20190421T0000Z')
    assert window_end == pd.Timestamp('20190422T0000Z')
def test_get_data_start_end_time_dayahead(site_metadata, rl, rt, lt,
                                          expected_start, expected_end):
    """Check the data window for a given run length, run time and lead time.

    *rl* is the forecast run_length, *rt* the run time and *lt* the
    forecast lead_time_to_start; the issue time is fixed at
    2019-04-10 23:00Z.
    """
    obs_kwargs = dict(
        interval_length=pd.Timedelta('5min'),
        interval_label='beginning')
    observation = default_observation(site_metadata, **obs_kwargs)
    run_time = pd.Timestamp(rt)
    issue_time = pd.Timestamp('20190410T2300Z')
    fx_kwargs = dict(
        issue_time_of_day=dt.time(hour=23),
        lead_time_to_start=pd.Timedelta(lt),
        interval_length=pd.Timedelta('1h'),
        run_length=pd.Timedelta(rl),
        interval_label='beginning')
    forecast = default_forecast(site_metadata, **fx_kwargs)
    window_start, window_end = utils.get_data_start_end(
        observation, forecast, run_time, issue_time)
    assert window_start == pd.Timestamp(expected_start)
    assert window_end == pd.Timestamp(expected_end)
def test_get_data_start_end_time_tz(site_metadata, variable, rl, issue, run,
                                    expected_start, expected_end):
    """Check the data window for a given variable, run length and times.

    *rl* is the forecast run_length; *issue* and *run* are converted to
    timestamps and passed as the issue time and run time.
    """
    obs_kwargs = dict(
        variable=variable,
        interval_length=pd.Timedelta('5min'),
        interval_label='ending')
    observation = default_observation(site_metadata, **obs_kwargs)
    fx_kwargs = dict(
        variable=variable,
        issue_time_of_day=dt.time(hour=23),
        lead_time_to_start=pd.Timedelta('1h'),
        interval_length=pd.Timedelta('1h'),
        run_length=pd.Timedelta(rl),
        interval_label='beginning')
    forecast = default_forecast(site_metadata, **fx_kwargs)
    window_start, window_end = utils.get_data_start_end(
        observation, forecast, pd.Timestamp(run), pd.Timestamp(issue))
    assert window_start == pd.Timestamp(expected_start)
    assert window_end == pd.Timestamp(expected_end)
def test_get_data_start_end_labels_obs_fx_instant(site_metadata, lead, issue,
                                                  it):
    """Check the data window when obs and a 1d forecast are both instant.

    *lead* is the forecast lead_time_to_start, *issue* the issue time
    and *it* the hour used for issue_time_of_day. The run time is set
    75 minutes before the issue time.
    """
    obs = default_observation(
        site_metadata,
        interval_length=pd.Timedelta('5min'),
        interval_label='instant')
    # interval length of forecast and obs must be equal if interval label
    # is instant
    fx = default_forecast(
        site_metadata,
        issue_time_of_day=dt.time(hour=it),
        lead_time_to_start=pd.Timedelta(lead),
        interval_length=pd.Timedelta('5min'),
        run_length=pd.Timedelta('1d'),
        interval_label='instant')
    issue_time = pd.Timestamp(issue)
    run_time = issue_time - pd.Timedelta('75min')
    window_start, window_end = utils.get_data_start_end(
        obs, fx, run_time, issue_time)
    assert window_start == pd.Timestamp('20190421T0000Z')
    assert window_end == pd.Timestamp('20190421T235959Z')
def test_get_forecast_start_end_time_weekahead(site_metadata, variable,
                                               expected_start, expected_end):
    """Check the data window of a 1d run_length forecast for *variable*."""
    obs_kwargs = dict(
        variable=variable,
        interval_length=pd.Timedelta('5min'),
        interval_label='beginning')
    observation = default_observation(site_metadata, **obs_kwargs)
    run_time = pd.Timestamp('20190410T0630Z')
    fx_kwargs = dict(
        variable=variable,
        issue_time_of_day=dt.time(hour=10),
        lead_time_to_start=pd.Timedelta('1h'),
        interval_length=pd.Timedelta('1h'),
        run_length=pd.Timedelta('1d'),
        interval_label='beginning')
    forecast = default_forecast(site_metadata, **fx_kwargs)
    window_start, window_end = utils.get_data_start_end(
        observation, forecast, run_time)
    assert window_start == pd.Timestamp(expected_start)
    assert window_end == pd.Timestamp(expected_end)
def test_get_data_start_end_time_weekahead_not_midnight(site_metadata):
    """Check the net_load data window for a forecast issued at 12:00."""
    variable = 'net_load'
    run_time = pd.Timestamp('20190410T1030Z')
    issue_time = pd.Timestamp('20190410T1200Z')
    obs = default_observation(
        site_metadata,
        variable=variable,
        interval_length=pd.Timedelta('5min'),
        interval_label='beginning')
    # fx from 2019-04-11 12:00
    fx = default_forecast(
        site_metadata,
        variable=variable,
        issue_time_of_day=dt.time(hour=12),
        lead_time_to_start=pd.Timedelta('1d'),
        interval_length=pd.Timedelta('1h'),
        run_length=pd.Timedelta('1d'),
        interval_label='beginning')
    window_start, window_end = utils.get_data_start_end(
        obs, fx, run_time, issue_time)
    assert window_start == pd.Timestamp('20190404T1200Z')
    assert window_end == pd.Timestamp('20190405T1200Z')
def run_persistence(session, observation, forecast, run_time, issue_time,
                    index=False):
    """
    Create a persistence *forecast* from the values of an *observation*.

    For intraday forecasts, *index* selects between persistence of the
    measured values (*index = False*) and persistence of the clear sky
    index or AC power index (*index = True*). For day ahead forecasts,
    only persistence of measured values (*index = False*) is supported.

    Forecasts may be run operationally or retrospectively. For
    operational forecasts, *run_time* is typically set to now. For
    retrospective forecasts, *run_time* is the time by which the
    forecast should be run so that it could have been delivered for the
    *issue_time*. Forecasts will only use data with timestamps before
    *run_time*.

    The persistence *window* is the time over which the persistence
    quantity (irradiance, power, clear sky index, or power index) is
    averaged. It is determined automatically from the *forecast*
    attributes:

    * Intraday persistence forecasts:
      *window = forecast.run_length*. No longer than 1 hour.
    * Day ahead forecasts (all but net load) and week ahead forecasts
      (net load only): *window = forecast.interval_length*.

    Users that would like more flexibility may use the lower-level
    functions in
    :py:mod:`solarforecastarbiter.reference_forecasts.persistence`.

    Parameters
    ----------
    session : api.Session
        The session object to use to request data from the
        SolarForecastArbiter API.
    observation : datamodel.Observation
        The metadata of the observation to be used to create the
        forecast.
    forecast : datamodel.Forecast
        The metadata of the desired forecast.
    run_time : pd.Timestamp
        Run time of the forecast.
    issue_time : pd.Timestamp
        Issue time of the forecast run.
    index : bool, default False
        If False, use persistence of observed value. If True, use
        persistence of clear sky or AC power index.

    Returns
    -------
    forecast : pd.Series
        Forecast conforms to the metadata specified by the *forecast*
        argument.

    Raises
    ------
    ValueError
        If forecast and issue_time are incompatible.
    ValueError
        If persistence window < observation.interval_length.
    ValueError
        If the forecast is not intraday and the forecast period is not
        midnight to midnight.
    ValueError
        If the forecast is not intraday and index=True.
    ValueError
        If instantaneous forecast and instantaneous observation interval
        lengths do not match.
    ValueError
        If average observations are used to make instantaneous forecast.

    Notes
    -----
    For non-intraday net load forecasts, this function will use a
    weekahead persistence due to the fact that net load exhibits
    stronger correlation week-to-week than day-to-day. For example, the
    net load on a Monday tends to look more similar to the previous
    Monday than it does to the previous day (Sunday).
    """
    utils.check_persistence_compatibility(observation, forecast, index)
    forecast_start, forecast_end = utils.get_forecast_start_end(
        forecast, issue_time, False)
    intraday = utils._is_intraday(forecast)
    if not intraday:
        # raise ValueError if not intraday and not midnight to midnight
        utils._check_midnight_to_midnight(forecast_start, forecast_end)

    data_start, data_end = utils.get_data_start_end(
        observation, forecast, run_time)

    def load_data(observation, data_start, data_end):
        # fetch the observation values through the session and return
        # the 'value' column in the site's timezone
        values = session.get_observation_values(
            observation.observation_id, data_start, data_end,
            observation.interval_label)
        return values.tz_convert(observation.site.timezone)['value']

    if intraday:
        if index:
            persist = persistence.persistence_scalar_index
        else:
            persist = persistence.persistence_scalar
        return persist(
            observation, data_start, data_end, forecast_start, forecast_end,
            forecast.interval_length, forecast.interval_label, load_data)

    if index:  # pragma: no cover
        raise ValueError(
            'index=True not supported for forecasts with run_length >= 1day')

    return persistence.persistence_interval(
        observation, data_start, data_end, forecast_start,
        forecast.interval_length, forecast.interval_label, load_data)
def run_persistence(session, observation, forecast, run_time, issue_time,
                    index=False, load_data=None):
    """
    Create a persistence *forecast* from the values of an *observation*.

    For intraday forecasts, *index* selects between persistence of the
    measured values (*index = False*) and persistence of the clear sky
    index or AC power index (*index = True*). For day ahead forecasts,
    only persistence of measured values (*index = False*) is supported.

    Forecasts may be run operationally or retrospectively. For
    operational forecasts, *run_time* is typically set to now. For
    retrospective forecasts, *run_time* is the time by which the
    forecast should be run so that it could have been delivered for the
    *issue_time*. Forecasts will only use data with timestamps before
    *run_time*.

    The persistence *window* is the time over which the persistence
    quantity (irradiance, power, clear sky index, or power index) is
    averaged. It is determined automatically from the *forecast*
    attributes:

    - Intraday persistence forecasts:

      + ``window = forecast.run_length``. No longer than 1 hour.

    - Day ahead forecasts (all but net load) and week ahead forecasts
      (net load only):

      + ``window = forecast.interval_length``.

    Users that would like more flexibility may use the lower-level
    functions in
    :py:mod:`solarforecastarbiter.reference_forecasts.persistence`.

    Parameters
    ----------
    session : api.Session
        The session object to use to request data from the
        SolarForecastArbiter API.
    observation : datamodel.Observation
        The metadata of the observation to be used to create the
        forecast.
    forecast : datamodel.Forecast
        The metadata of the desired forecast.
    run_time : pd.Timestamp
        Run time of the forecast.
    issue_time : pd.Timestamp
        Issue time of the forecast run.
    index : bool, default False
        If False, use persistence of observed value. If True, use
        persistence of clear sky or AC power index.
    load_data : function, default None
        Function to load the observation data 'value' series given
        (observation, data_start, data_end) arguments. Typically, calls
        `session.get_observation_values` and selects the 'value' column.
        May also have data preloaded to then slice from data_start to
        data_end. When None, ``_default_load_data(session)`` is used.

    Returns
    -------
    forecast : pd.Series
        Forecast conforms to the metadata specified by the *forecast*
        argument.

    Raises
    ------
    ValueError
        If forecast and issue_time are incompatible.
    ValueError
        If data is required from after run_time.
    ValueError
        If persistence window < observation.interval_length.
    ValueError
        If forecast.run_length >= 1 day and index=True.
    ValueError
        If instantaneous forecast and instantaneous observation interval
        lengths do not match.
    ValueError
        If average observations are used to make instantaneous forecast.

    Notes
    -----
    For non-intraday net load forecasts, this function will use a
    weekahead persistence due to the fact that net load exhibits
    stronger correlation week-to-week than day-to-day. For example, the
    net load on a Monday tends to look more similar to the previous
    Monday than it does to the previous day (Sunday).
    """
    utils.check_persistence_compatibility(observation, forecast, index)
    forecast_start, forecast_end = utils.get_forecast_start_end(
        forecast, issue_time, False)
    intraday = utils._is_intraday(forecast)

    if load_data is None:
        load_data = _default_load_data(session)

    data_start, data_end = utils.get_data_start_end(
        observation, forecast, run_time, issue_time)
    if data_end > run_time:
        raise ValueError(
            'Persistence forecast requires data from after run_time')

    # probabilistic forecasts take precedence over the intraday/index
    # dispatch below
    if isinstance(forecast, datamodel.ProbabilisticForecast):
        constant_values = [
            cv.constant_value for cv in forecast.constant_values]
        return persistence.persistence_probabilistic(
            observation, data_start, data_end, forecast_start, forecast_end,
            forecast.interval_length, forecast.interval_label, load_data,
            forecast.axis, constant_values)

    if intraday:
        if index:
            persist = persistence.persistence_scalar_index
        else:
            persist = persistence.persistence_scalar
        return persist(
            observation, data_start, data_end, forecast_start, forecast_end,
            forecast.interval_length, forecast.interval_label, load_data)

    if index:  # pragma: no cover
        raise ValueError(
            'index=True not supported for forecasts with run_length >= 1day')

    return persistence.persistence_interval(
        observation, data_start, data_end, forecast_start,
        forecast.interval_length, forecast.interval_label, load_data)
def generate_reference_persistence_forecast_gaps_parameters(
        session, forecasts, observations, start, end):
    """Find reference persistence forecasts with gaps in their data.

    Sort through all *forecasts* to find those with gaps in the data
    that should be generated by the Arbiter from persisting Observation
    values. The forecast must have
    ``'is_reference_persistence_forecast': true`` and an observation_id
    in Forecast.extra_parameters (formatted as a JSON string). A boolean
    value for "index_persistence" in Forecast.extra_parameters controls
    whether the persistence forecast should be made adjusting for
    clear-sky/AC power index or not.

    Parameters
    ----------
    session : solarforecastarbiter.io.api.APISession
    forecasts : list of datamodel.Forecasts
        The forecasts that should be filtered to find references.
    observations : list of datamodel.Observations
        Observations that are available to use to fetch values
        and make persistence forecasts.
    start : pandas.Timestamp
        The start of the period to search for missing forecast values.
    end : pandas.Timestamp
        The end of the period to search for missing forecast values.

    Returns
    -------
    generator of (Forecast, Observation, index, data_start, data_end, issue_times)
    """  # NOQA: E501
    user_info = session.get_user_info()
    observation_dict = {o.observation_id: o for o in observations}
    out = namedtuple(
        'PersistenceGapParameters',
        ['forecast', 'observation', 'index', 'data_start', 'data_end',
         'issue_times'])
    for fx in forecasts:
        checked = _ref_persistence_check(
            fx, observation_dict, user_info, session)
        if checked is None:
            continue
        observation, index, obs_mint, obs_maxt = checked

        # collect the issue times of every gap; the set drops duplicates
        unique_times = set()
        for gap in session.get_value_gaps(fx, start, end):
            unique_times.update(
                _issue_time_generator(
                    observation, fx, obs_mint, obs_maxt,
                    gap[0], gap[1] - pd.Timedelta('1ns')))
        issue_times = tuple(sorted(unique_times))
        if not issue_times:
            continue

        # get_data_start_end only looks for start/end of a single
        # forecast run, so need to do for first and last issue times
        # to get full range of data possibly needed
        first_issue, last_issue = issue_times[0], issue_times[-1]
        data_start, _ = utils.get_data_start_end(
            observation, fx, first_issue, first_issue)
        _, data_end = utils.get_data_start_end(
            observation, fx, last_issue, last_issue)
        yield out(fx, observation, index, data_start, data_end, issue_times)
def generate_reference_persistence_forecast_parameters(
        session, forecasts, observations, max_run_time):
    """Find the reference persistence forecasts that should be generated.

    Sort through all *forecasts* to find those that should be generated
    by the Arbiter from persisting Observation values. The forecast must
    have ``'is_reference_persistence_forecast': true`` and an
    observation_id in Forecast.extra_parameters (formatted as a JSON
    string). A boolean value for "index_persistence" in
    Forecast.extra_parameters controls whether the persistence forecast
    should be made adjusting for clear-sky/AC power index or not.

    Forecasts that fail any of these checks are logged and skipped.

    Parameters
    ----------
    session : solarforecastarbiter.io.api.APISession
    forecasts : list of datamodel.Forecasts
        The forecasts that should be filtered to find references.
    observations : list of datamodel.Observations
        Observations that are available to use to fetch values
        and make persistence forecasts.
    max_run_time : pandas.Timestamp
        The maximum run time/issue time for any forecasts. Usually now.

    Returns
    -------
    generator of (Forecast, Observation, next_issue_time, index)
    """
    user_info = session.get_user_info()
    observation_dict = {o.observation_id: o for o in observations}
    for fx in forecasts:
        if not _is_reference_persistence_forecast(fx.extra_parameters):
            logger.debug(
                'Forecast %s is not labeled as a reference '
                'persistence forecast', fx.forecast_id)
            continue

        if fx.provider != user_info['organization']:
            logger.debug("Forecast %s is not in user's organization",
                         fx.forecast_id)
            continue

        try:
            extra_parameters = json.loads(fx.extra_parameters)
        except json.JSONDecodeError:
            logger.warning(
                'Failed to decode extra_parameters for %s: %s as JSON',
                fx.name, fx.forecast_id)
            continue

        try:
            observation_id = extra_parameters['observation_id']
        except KeyError:
            logger.error(
                'Forecast, %s: %s, has no observation_id to base forecasts'
                ' off of. Cannot make persistence forecast.',
                fx.name, fx.forecast_id)
            continue

        try:
            observation = observation_dict[observation_id]
        except KeyError:
            logger.error(
                'Observation %s not in set of given observations.'
                ' Cannot generate persistence forecast for %s: %s.',
                observation_id, fx.name, fx.forecast_id)
            continue

        index = extra_parameters.get('index_persistence', False)
        obs_mint, obs_maxt = session.get_observation_time_range(
            observation_id)
        if pd.isna(obs_maxt):  # no observations to use anyway
            logger.info(
                'No observation values to use for %s: %s from observation %s',
                fx.name, fx.forecast_id, observation_id)
            continue

        # find the next issue time for the forecast based on the last value
        # in the forecast series
        _, fx_maxt = session.get_forecast_time_range(fx.forecast_id)
        if pd.isna(fx_maxt):
            # if there is no forecast yet, go back a bit from the last
            # observation. Don't use the start of observations, since it
            # could really stress the workers if we have a few years of
            # data before deciding to make a persistence fx
            next_issue_time = utils.get_next_issue_time(
                fx, obs_maxt - fx.run_length)
        else:
            next_issue_time = utils.find_next_issue_time_from_last_forecast(
                fx, fx_maxt)

        # now find all the run times that can be made based on the
        # last observation timestamp
        while next_issue_time <= max_run_time:
            window_start, window_end = utils.get_data_start_end(
                observation, fx, next_issue_time)
            if window_end > obs_maxt:
                break
            if window_start > obs_mint:
                yield (fx, observation, next_issue_time, index)
            next_issue_time = utils.get_next_issue_time(
                fx, next_issue_time + pd.Timedelta('1ns'))
def generate_reference_persistence_forecast_parameters(
        session, forecasts, observations, max_run_time):
    """Find the reference persistence forecasts that should be generated.

    Sort through all *forecasts* to find those that should be generated
    by the Arbiter from persisting Observation values. The forecast must
    have ``'is_reference_persistence_forecast': true`` and an
    observation_id in Forecast.extra_parameters (formatted as a JSON
    string). A boolean value for "index_persistence" in
    Forecast.extra_parameters controls whether the persistence forecast
    should be made adjusting for clear-sky/AC power index or not.

    Parameters
    ----------
    session : solarforecastarbiter.io.api.APISession
    forecasts : list of datamodel.Forecasts
        The forecasts that should be filtered to find references.
    observations : list of datamodel.Observations
        Observations that are available to use to fetch values
        and make persistence forecasts.
    max_run_time : pandas.Timestamp
        The maximum run time/issue time for any forecasts. Usually now.

    Returns
    -------
    generator of (Forecast, Observation, index, data_start, issue_times)
    """
    user_info = session.get_user_info()
    observation_dict = {o.observation_id: o for o in observations}
    out = namedtuple(
        'PersistenceParameters',
        ['forecast', 'observation', 'index', 'data_start', 'issue_times'])

    for fx in forecasts:
        checked = _ref_persistence_check(
            fx, observation_dict, user_info, session)
        if checked is None:
            continue
        observation, index, obs_mint, obs_maxt = checked

        # probably split this out to generate issues times for only gaps vs
        # latest
        if isinstance(fx, datamodel.ProbabilisticForecast):
            # the first constant value's time range is used for the
            # whole probabilistic forecast
            time_range = \
                session.get_probabilistic_forecast_constant_value_time_range(
                    fx.constant_values[0].forecast_id)
        else:
            time_range = session.get_forecast_time_range(fx.forecast_id)
        _, fx_maxt = time_range

        # find the next issue time for the forecast based on the last value
        # in the forecast series
        if pd.isna(fx_maxt):
            # if there is no forecast yet, go back a bit from the last
            # observation. Don't use the start of observations, since it
            # could really stress the workers if we have a few years of
            # data before deciding to make a persistence fx
            next_issue_time = utils.get_next_issue_time(
                fx, obs_maxt - fx.run_length)
        else:
            next_issue_time = utils.find_next_issue_time_from_last_forecast(
                fx, fx_maxt)

        data_start, _ = utils.get_data_start_end(
            observation, fx, next_issue_time, next_issue_time)
        issue_times = tuple(
            _issue_time_generator(
                observation, fx, obs_mint, obs_maxt, next_issue_time,
                max_run_time))
        if not issue_times:
            continue

        yield out(fx, observation, index, data_start, issue_times)