    def test_generate_forecast_datetime_range_only_null_values_target_column(
            self):
        """Test if correct forecast window is made when data only has nulls."""
        time_format = "%Y-%m-%d %H:%M:%S%z"
        forecast_start_expected = dt.strptime("2020-10-31 00:45:00+0000",
                                              time_format)
        forecast_end_expected = dt.strptime("2020-11-30 00:00:00+0000",
                                            time_format)
        forecast_data = self.data
        forecast_data.loc[:, forecast_data.columns[0]] = None

        forecast_start, forecast_end = utils.generate_forecast_datetime_range(
            forecast_data=forecast_data)

        self.assertEqual(forecast_start, forecast_start_expected)
        self.assertEqual(forecast_end, forecast_end_expected)

    def test_generate_forecast_datetime_range_single_null_values_target_column(
            self):
        """Test if correct forecast window is made with single range of nulls."""
        time_format = "%Y-%m-%d %H:%M:%S%z"
        forecast_start_expected = dt.strptime("2020-11-26 00:00:00+0000",
                                              time_format)
        forecast_end_expected = dt.strptime("2020-11-30 00:00:00+0000",
                                            time_format)
        forecast_data = self.data
        forecast_data.loc["2020-11-26":"2020-12-01",
                          forecast_data.columns[0]] = None

        forecast_start, forecast_end = utils.generate_forecast_datetime_range(
            forecast_data=forecast_data)

        self.assertEqual(forecast_start, forecast_start_expected)
        self.assertEqual(forecast_end, forecast_end_expected)
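
# The two tests above pin down the behaviour exercised in
# utils.generate_forecast_datetime_range: the forecast window starts at the first
# missing value in the target (load) column and ends at the last timestamp of the
# data. Below is a minimal, illustrative sketch of that behaviour; it is an
# assumption-based re-implementation for clarity, not the library code, and it
# assumes the target is the first column of a datetime-indexed DataFrame.
import pandas as pd


def _sketch_generate_forecast_datetime_range(forecast_data: pd.DataFrame):
    """Return (forecast_start, forecast_end) derived from missing target values."""
    target = forecast_data.iloc[:, 0]
    missing = target.isnull()
    if not missing.any():
        raise ValueError("Target column has no missing values; nothing to forecast.")
    forecast_start = target.index[missing][0]  # first timestamp with a missing target
    forecast_end = forecast_data.index[-1]     # forecast up to the end of the index
    return forecast_start, forecast_end
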
def create_forecast_pipeline_core(
    pj: PredictionJobDataClass,
    input_data: pd.DataFrame,
    model: OpenstfRegressor,
    model_specs: ModelSpecificationDataClass,
) -> pd.DataFrame:
    """Create forecast pipeline (core)

    Computes the forecasts and confidence intervals given a prediction job and input data.
    This pipeline has no database or persisitent storage dependencies.

    Expected prediction job keys: "resolution_minutes", "horizon_minutes", "id", "type",
        "name", "quantiles"

    Args:
        pj (PredictionJobDataClass): Prediction job.
        input_data (pandas.DataFrame): Input data for the prediction.
        model (OpenstfRegressor): Model to use for this prediction.
        model_specs (ModelSpecificationDataClass): Model specifications.

    Returns:
        forecast (pandas.DataFrame)
    """
    logger = structlog.get_logger(__name__)

    fallback_strategy = "extreme_day"  # this can later be expanded

    # Validate and clean data
    validated_data = validation.validate(pj["id"], input_data,
                                         pj["flatliner_treshold"])

    # Add features
    data_with_features = OperationalPredictFeatureApplicator(
        # TODO use saved feature_names (should be saved while training the model)
        horizons=[pj["resolution_minutes"] / 60.0],
        feature_names=model.feature_names,
        feature_modules=model_specs.feature_modules,
    ).add_features(validated_data)

    # Prep forecast input by selecting only the forecast datetime interval (this is much smaller than the input range)
    # Also drop the load column
    forecast_start, forecast_end = generate_forecast_datetime_range(
        data_with_features)
    forecast_input_data = data_with_features[forecast_start:forecast_end].drop(
        columns="load")

    # Check if sufficient data is left after cleaning
    if not validation.is_data_sufficient(data_with_features,
                                         pj["completeness_treshold"],
                                         pj["minimal_table_length"]):
        logger.warning(
            "Using fallback forecast",
            forecast_type="fallback",
            pid=pj["id"],
            fallback_strategy=fallback_strategy,
        )
        forecast = generate_fallback(data_with_features, input_data[["load"]])

    else:
        # Predict
        model_forecast = model.predict(forecast_input_data)
        forecast = pd.DataFrame(index=forecast_input_data.index,
                                data={"forecast": model_forecast})

    # Add confidence
    forecast = ConfidenceIntervalApplicator(
        model, forecast_input_data).add_confidence_interval(forecast, pj)

    # Prepare for output
    forecast = add_prediction_job_properties_to_forecast(
        pj,
        forecast,
        algorithm_type=str(model.path),
    )

    return forecast
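
# Illustrative usage of create_forecast_pipeline_core (a sketch, not a prescribed call
# sequence; `pj`, `input_data`, `model` and `model_specs` are assumed to be provided by
# the surrounding application, e.g. a prediction job definition, recent measurement and
# weather data, and a previously trained model with its specifications):
#
#     forecast = create_forecast_pipeline_core(
#         pj=pj,
#         input_data=input_data,
#         model=model,
#         model_specs=model_specs,
#     )
#     print(forecast.head())
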
def create_basecase_forecast_pipeline(
    pj: PredictionJobDataClass,
    input_data: pd.DataFrame,
) -> pd.DataFrame:
    """Computes the base case forecast and confidence intervals for a given prediction job and input data.

    Args:
        pj (PredictionJobDataClass): Prediction job.
        input_data (pandas.DataFrame): Data frame containing the input data necessary for the prediction.

    Returns:
        basecase_forecast (pandas.DataFrame)
    """

    logger = structlog.get_logger(__name__)

    logger.info("Preprocessing data for basecase forecast")
    # Validate data
    # Currently effectively disabled by giving None.
    # We keep this step so it later can be filled using arguments defined in PJ
    validated_data = validation.validate(pj["id"], input_data, flatliner_threshold=None)

    # Add features
    data_with_features = OperationalPredictFeatureApplicator(
        horizons=[0.25],
        feature_names=[
            "T-7d",
            "T-14d",
        ],  # Generate features for the load 7 and 14 days ago; these are the same as the basecase forecast.
    ).add_features(validated_data)

    # Similarly to the forecast pipeline, only try to make a forecast for moments in the future.
    # TODO: do we want to be this strict on the forecast time window in this place?
    # See issue https://github.com/OpenSTEF/openstef/issues/121
    forecast_start, forecast_end = generate_forecast_datetime_range(data_with_features)
    forecast_input = data_with_features[forecast_start:forecast_end]

    # Initialize model
    model = BaseCaseModel()

    # Make basecase forecast
    logger.info("Making basecase forecast")
    basecase_forecast = model.predict(forecast_input)

    # Raise an error if no basecase forecast could be made (no realised load data available)
    if len(basecase_forecast) == 0:
        raise NoRealisedLoadError(pj["id"])

    # Estimate the stdev using the hourly stdev of the historic (T-14d) load
    model.standard_deviation = generate_basecase_confidence_interval(forecast_input)
    logger.info("Postprocessing basecase forecast")
    # Apply confidence interval
    basecase_forecast = ConfidenceIntervalApplicator(
        model, forecast_input
    ).add_confidence_interval(basecase_forecast, pj, quantile_confidence_interval=False)

    # Add basecase for the component forecasts
    basecase_forecast = add_components_base_case_forecast(basecase_forecast)

    # Do further postprocessing
    basecase_forecast = add_prediction_job_properties_to_forecast(
        pj=pj,
        forecast=basecase_forecast,
        algorithm_type="basecase_lastweek",
        forecast_quality="not_renewed",
    )

    return basecase_forecast
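
# Illustrative usage of create_basecase_forecast_pipeline (a sketch; `pj` and
# `input_data` are assumed to come from the surrounding application, with `input_data`
# containing a datetime-indexed "load" column holding at least 14 days of history so
# the T-7d and T-14d features can be filled):
#
#     basecase_forecast = create_basecase_forecast_pipeline(pj=pj, input_data=input_data)
#     print(basecase_forecast.head())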