Example #1
def predict_rain_proba(config, predictor_file):
    """
    Predict probabilistic rain forecasts for 'pop' or 'categorical' types.
    :param config:
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :return:
    """
    if config['Model']['rain_forecast_type'] not in ['pop', 'categorical']:
        raise TypeError(
            "'quantity' rain forecast is not probabilistic, cannot get probabilities"
        )
    rain_tuning = config['Model'].get('Rain tuning', None)
    if rain_tuning is None:
        raise TypeError(
            'Probabilistic rain forecasts are only possible with a RainTuningEstimator'
        )

    if config['multi_stations']:  #multiple stations
        station_ids = config['station_id']
        estimator_files = config['Model']['estimator_file']
        # There has to be the same number of estimator files as station IDs
        if len(estimator_files) != len(station_ids):
            raise ValueError(
                "There must be the same number of estimator files as station IDs"
            )
    else:
        station_ids = [config['station_id']]  #just one station
        estimator_files = [config['Model']['estimator_file']]

    # Load the predictor data and estimator
    predictor_data = read_pkl(predictor_file)
    for i in range(len(station_ids)):
        station_id = station_ids[i]
        estimator_file = estimator_files[i]
        if config['verbose']:
            print('predict: loading estimator %s' % estimator_file)
        estimator = read_pkl(estimator_file)

        predictors = np.concatenate(
            (predictor_data['BUFKIT'], predictor_data['OBS']), axis=1)
        if to_bool(rain_tuning.get('use_raw_rain', False)):
            rain_proba = estimator.predict_rain_proba(
                predictors, rain_array=predictor_data.rain[i])
        else:
            rain_proba = estimator.predict_rain_proba(predictors)

    return rain_proba
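A minimal usage sketch for the function above. The config layout mirrors the keys the function reads, but the station ID, file paths, and tuning options are placeholder assumptions, and the pickled estimator is assumed to have been trained with rain tuning.

# Hypothetical usage sketch: station ID, file paths, and tuning options are placeholders.
config = {
    'multi_stations': False,
    'station_id': 'KSEA',
    'verbose': True,
    'Model': {
        'rain_forecast_type': 'pop',
        'Rain tuning': {'use_raw_rain': False},  # section must exist for probabilistic rain
        'estimator_file': 'site_data/KSEA_estimator.pkl',
    },
}

rain_proba = predict_rain_proba(config, 'site_data/KSEA_predictors.pkl')
print(rain_proba)  # rain probabilities from the tuned estimator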
Example #2
def build_train_data(config,
                     predictor_file,
                     no_obs=False,
                     no_models=False,
                     test_size=0):
    """
    Build the array of training (and optionally testing) data.
    :param config:
    :param predictor_file:
    :param no_obs:
    :param no_models:
    :param test_size:
    :return:
    """
    from sklearn.model_selection import train_test_split
    if config['multi_stations']:  #multiple stations
        station_ids = config['station_id']
    else:
        station_ids = [config['station_id']]  #just one station
    if config['verbose']:
        print('build_train_data: reading predictor file')
    rain_tuning = config['Model'].get('Rain tuning', None)
    data = read_pkl(predictor_file)

    # Select data
    if no_obs and no_models:
        no_obs = False
        no_models = False
    if no_obs:
        if config['verbose']:
            print('build_train_data: not using observations to train')
        predictors = data['BUFKIT']
    elif no_models:
        if config['verbose']:
            print('build_train_data: not using models to train')
        predictors = data['OBS']
    else:
        predictors = np.concatenate((data['BUFKIT'], data['OBS']), axis=1)

    if test_size > 0:
        pred_len = len(predictors[0])
        targets_len = len(data['VERIF'][0][0])
        if config['verbose']:
            print('build_train_data: target length: %d' % targets_len)
        targets_combined = []  # verification arrays of all stations, combined
        rain_combined = []  # rain arrays of all stations, combined
        for day in range(len(data['VERIF'][0])):  #for each day
            rain_day = []
            for i in range(len(data['VERIF'])):  #for each station
                if i == 0:
                    targets_day = data['VERIF'][i][day]
                    rain_day = data.rain[i][day]
                else:
                    targets_day = np.concatenate(
                        (targets_day, data['VERIF'][i][day]))
                    rain_day = np.concatenate((rain_day, data.rain[i][day]))
            targets_combined.append(targets_day)
            rain_combined.append(rain_day)
        targets_combined = np.array(targets_combined)
        rain_combined = np.array(rain_combined)
        rain_len = len(data.rain[0][0])
        if rain_tuning is not None and to_bool(
                rain_tuning.get('use_raw_rain', False)):
            predictors = np.concatenate((predictors, rain_combined), axis=1)

        p_train = []
        t_train = []
        r_train = []
        p_test = []
        t_test = []
        r_test = []
        p_train_raw, p_test_raw, t_train_raw, t_test_raw = train_test_split(
            predictors, targets_combined, test_size=test_size)
        for i in range(len(station_ids)):
            if rain_tuning is not None and to_bool(
                    rain_tuning.get('use_raw_rain', False)):
                p_train_one = np.array([
                    p_train_raw[j][0:pred_len] for j in range(len(p_train_raw))
                ])
                r_train_one = np.array([
                    p_train_raw[j][pred_len + i * rain_len:pred_len +
                                   (i + 1) * rain_len]
                    for j in range(len(p_train_raw))
                ])
                p_test_one = np.array([
                    p_test_raw[j][0:pred_len] for j in range(len(p_test_raw))
                ])
                r_test_one = np.array([
                    p_test_raw[j][pred_len + i * rain_len:pred_len +
                                  (i + 1) * rain_len]
                    for j in range(len(p_test_raw))
                ])
                r_train.append(r_train_one)
                r_test.append(r_test_one)
            else:
                p_train_one = np.copy(p_train_raw)
                p_test_one = np.copy(p_test_raw)
                r_train = None
                r_test = None
            t_train_one = np.array([
                t_train_raw[j][i * targets_len:(i + 1) * targets_len]
                for j in range(len(t_train_raw))
            ])
            t_test_one = np.array([
                t_test_raw[j][i * targets_len:(i + 1) * targets_len]
                for j in range(len(t_test_raw))
            ])
            p_train.append(p_train_one)
            t_train.append(t_train_one)
            p_test.append(p_test_one)
            t_test.append(t_test_one)
            if i == len(station_ids) - 1:  #last station
                return p_train, t_train, r_train, p_test, t_test, r_test
    else:
        predictors_out = []
        targets = []
        rain_data = []
        for i in range(len(station_ids)):
            targets_one = data['VERIF'][i]
            if rain_tuning is not None and to_bool(
                    rain_tuning.get('use_raw_rain', False)):
                rain_data_one = np.array([data.rain[i]]).T
                predictors_one = np.concatenate((predictors, rain_data_one),
                                                axis=1)
                rain_shape = rain_data_one.shape[-1]
                predictors_out.append(predictors_one)
                targets.append(targets_one)
                rain_data.append(rain_data_one)
                if i == len(station_ids) - 1:  #last station
                    return predictors_out, targets, data.rain
            else:
                predictors_one = np.copy(predictors)
                predictors_out.append(predictors_one)
                targets.append(targets_one)
                if i == len(station_ids) - 1:  #last station
                    return predictors_out, targets, None
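The return signature depends on test_size; a minimal sketch of both call patterns, reusing the placeholder config and predictor file assumed in the sketch after Example #1.

# Hypothetical usage sketch: file path and test fraction are placeholders.
# With test_size > 0, six per-station lists come back:
p_train, t_train, r_train, p_test, t_test, r_test = build_train_data(
    config, 'site_data/KSEA_predictors.pkl', test_size=0.2)

# With the default test_size=0, only three values are returned:
predictors_out, targets, rain_data = build_train_data(
    config, 'site_data/KSEA_predictors.pkl')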
Example #3
def predict_all(config,
                predictor_file,
                ensemble=False,
                time_series_date=None,
                naive_rain_correction=False,
                round_result=False,
                **kwargs):
    """
    Predict forecasts from the estimator in config. Also return probabilities and time series.
    :param config: MOS-X configuration (dict-like)
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :param ensemble: bool: if True, return an array of num_trees-by-4 of the predictions of each tree in the estimator
    :param time_series_date: datetime: if set, returns a time series prediction from the estimator, where the datetime
    provided is the day the forecast is for (only works for single-day runs, or assumes last day)
    :param naive_rain_correction: bool: if True, applies manual tuning to the rain forecast
    :param round_result: bool: if True, rounds the predicted estimate
    :param kwargs: passed to the estimator's 'predict' method
    :return:
    predicted: ndarray: num_samples x num_predicted_variables predictions if one station, or list of such arrays if multiple stations
    all_predicted: ndarray: num_samples x num_predicted_variables x num_ensemble_members predictions for all trees if one station, or list of such arrays if multiple stations
    predicted_timeseries: DataFrame: time series for final sample if one station, or list of such time series if multiple stations
    """
    # Load the predictor data and estimator
    predictor_data = read_pkl(predictor_file)
    rain_tuning = config['Model'].get('Rain tuning', None)
    if config['multi_stations']:  #multiple stations
        station_ids = config['station_id']
        estimator_files = config['Model']['estimator_file']
        # There has to be the same number of estimator files as station IDs
        if len(estimator_files) != len(station_ids):
            raise ValueError(
                "There must be the same number of estimator files as station IDs"
            )
    else:
        station_ids = [config['station_id']]  #just one station
        estimator_files = [config['Model']['estimator_file']]

    predictors = np.concatenate(
        (predictor_data['BUFKIT'], predictor_data['OBS']), axis=1)

    predicted = []
    all_predicted = []
    predicted_timeseries = []
    for i in range(len(station_ids)):
        estimator_file = estimator_files[i]
        estimator = read_pkl(estimator_file)
        if config['verbose']:
            print('predict: loading estimator %s' % estimator_file)
        if config['Model']['rain_forecast_type'] == 'pop' and getattr(
                estimator, 'is_classifier', False):
            predict_method = estimator.predict_proba
        else:
            predict_method = estimator.predict
        if rain_tuning is not None and to_bool(
                rain_tuning.get('use_raw_rain', False)):
            predicted_one = predict_method(predictors,
                                           rain_array=predictor_data.rain[i],
                                           **kwargs)
        else:
            predicted_one = predict_method(predictors, **kwargs)
        precip = predictor_data.rain[i]

        # Check for precipitation override
        if naive_rain_correction:
            for day in range(predicted_one.shape[0]):
                if sum(precip[day]) < 0.01:
                    if config['verbose']:
                        print(
                            'predict: warning: overriding MOS-X rain prediction of %0.2f on day %s with 0'
                            % (predicted_one[day, 3], day))
                    predicted_one[day, 3] = 0.
                elif predicted_one[day, 3] > max(
                        precip[day]) or predicted_one[day, 3] < min(
                            precip[day]):
                    if config['verbose']:
                        print(
                            'predict: warning: overriding MOS-X prediction of %0.2f on day %s with model mean'
                            % (predicted_one[day, 3], day))
                    predicted_one[day, 3] = max(
                        0., np.mean(precip[day] + [predicted_one[day, 3]]))
        else:
            # At least make sure we aren't predicting negative values...
            predicted_one[:, 3][predicted_one[:, 3] < 0] = 0.0

        # Round off daily values, if selected
        if round_result:
            predicted_one[:, :3] = np.round(predicted_one[:, :3])
            predicted_one[:, 3] = np.round(predicted_one[:, 3], 2)

        # If probabilities are requested and available, get the results from each tree
        if ensemble:
            num_samples = predictors.shape[0]
            if not hasattr(estimator, 'named_steps'):
                forest = estimator
            else:
                imputer = estimator.named_steps['imputer']
                forest = estimator.named_steps['regressor']
                predictors = imputer.transform(predictors)
            # If we generated our own ensemble by bootstrapping, it must be treated as such
            if config['Model']['train_individual'] and config['Model'].get(
                    'Bootstrapping', None) is None:
                num_trees = len(forest.estimators_[0].estimators_)
                all_predicted_one = np.zeros((num_samples, 4, num_trees))
                for v in range(4):
                    for t in range(num_trees):
                        try:
                            all_predicted_one[:, v, t] = forest.estimators_[
                                v].estimators_[t].predict(predictors)
                        except AttributeError:
                            # Work around the 2-D array of estimators for GBTrees
                            all_predicted_one[:, v, t] = forest.estimators_[
                                v].estimators_[t][0].predict(predictors)
            else:
                num_trees = len(forest.estimators_)
                all_predicted_one = np.zeros((num_samples, 4, num_trees))
                for t in range(num_trees):
                    try:
                        all_predicted_one[:, :,
                                          t] = forest.estimators_[t].predict(
                                              predictors)[:, :4]
                    except AttributeError:
                        # Work around the 2-D array of estimators for GBTrees
                        all_predicted_one[:, :, t] = forest.estimators_[t][
                            0].predict(predictors)[:, :4]
            all_predicted.append(all_predicted_one)
        else:
            all_predicted = None

        if config['Model']['predict_timeseries']:
            if time_series_date is None:
                date_now = datetime.utcnow()
                time_series_date = datetime(date_now.year, date_now.month,
                                            date_now.day) + timedelta(days=1)
                print(
                    'predict: warning: set time series start date to %s (was unspecified)'
                    % time_series_date)
            num_hours = int(24 / config['time_series_interval']) + 1
            predicted_array = predicted_one[-1, 4:].reshape((4, num_hours)).T

            # Get dewpoint
            predicted_array[:, 1] = dewpoint(predicted_array[:, 0],
                                             predicted_array[:, 1])
            times = pd.date_range(
                time_series_date.replace(hour=6),
                periods=num_hours,
                freq='%dH' % config['time_series_interval']
            ).to_pydatetime().tolist()
            variables = ['temperature', 'dewpoint', 'windSpeed', 'rain']
            round_dict = {
                'temperature': 0,
                'dewpoint': 0,
                'windSpeed': 0,
                'rain': 2
            }
            predicted_timeseries_one = pd.DataFrame(predicted_array,
                                                    index=times,
                                                    columns=variables)
            predicted_timeseries_one = predicted_timeseries_one.round(
                round_dict)
            predicted_timeseries.append(predicted_timeseries_one)
        else:
            predicted_timeseries_one = None
        predicted.append(predicted_one)

    return predicted, all_predicted, predicted_timeseries
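A usage sketch for predict_all. It reuses the placeholder config from the sketch after Example #1, extended with the additional keys this function reads; the date and option values are illustrative assumptions, and the estimator is assumed to have been trained with hourly time-series targets.

from datetime import datetime

# Hypothetical usage sketch: all keys and values added below are placeholders.
config['Model']['train_individual'] = False
config['Model']['predict_timeseries'] = True
config['time_series_interval'] = 3  # hours between time-series points

predicted, all_predicted, predicted_timeseries = predict_all(
    config,
    'site_data/KSEA_predictors.pkl',
    ensemble=False,                          # set True for per-tree predictions
    time_series_date=datetime(2019, 6, 1),   # day the forecast is valid for
    round_result=True)
print(predicted[0])             # num_samples x num_variables array (column 3 is rain)
print(predicted_timeseries[0])  # hourly temperature/dewpoint/wind/rain DataFrame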
Example #4
def upper_air(config,
              station_id,
              sounding_station_id,
              date,
              use_nan_sounding=False,
              use_existing=True,
              save=True):
    """
    Retrieves upper-air data and interpolates it to fixed pressure levels. If use_nan_sounding is True and a retrieval
    error occurs, a sounding of NaNs is returned instead of raising an error.
    :param config: MOS-X configuration (dict-like)
    :param station_id: station ID of surface station used
    :param sounding_station_id: station ID of sounding station to use
    :param date: datetime
    :param use_nan_sounding: bool: if True, use sounding of NaNs instead of raising an error
    :param use_existing: bool: preferentially use existing soundings in sounding_data_dir
    :param save: bool: if True, save processed soundings to sounding_data_dir
    :return: OrderedDict: sounding variables interpolated to the fixed pressure levels
    """
    variables = ['height', 'temperature', 'dewpoint', 'u_wind', 'v_wind']

    # Define levels for interpolation: same as model data, except omitting lowest_p_level
    plevs = [600, 750, 850, 925]
    pres_interp = np.array([p for p in plevs if p <= config['lowest_p_level']])

    # Try retrieving the sounding, first checking for existing
    if config['verbose']:
        print('upper_air: retrieving sounding for %s' %
              datetime.strftime(date, '%Y%m%d%H'))
    nan_sounding = False
    retrieve_sounding = False
    sndg_data_dir = config['Obs']['sounding_data_dir']
    if not (os.path.isdir(sndg_data_dir)):
        os.makedirs(sndg_data_dir)
    sndg_file = '%s/%s_SNDG_%s.pkl' % (sndg_data_dir, station_id,
                                       datetime.strftime(date, '%Y%m%d%H'))
    if use_existing:
        try:
            data = read_pkl(sndg_file)
            if config['verbose']:
                print('    Read from file.')
        except:
            retrieve_sounding = True
    else:
        retrieve_sounding = True
    if retrieve_sounding:
        try:
            dset = WyomingUpperAir.request_data(date, sounding_station_id)
        except:
            # Try again
            try:
                dset = WyomingUpperAir.request_data(date, sounding_station_id)
            except:
                if use_nan_sounding:
                    if config['verbose']:
                        print(
                            'upper_air: warning: unable to retrieve sounding; using nan.'
                        )
                    nan_sounding = True
                else:
                    raise ValueError('error retrieving sounding for %s' % date)

        # Retrieve pressure for interpolation to fixed levels
        if not nan_sounding:
            pressure = dset.variables['pressure']
            pres = np.array([p.magnitude
                             for p in list(pressure)])  # units are hPa

        # Get variables and interpolate; add to dictionary
        data = OrderedDict()
        for var in variables:
            if not nan_sounding:
                var_data = dset.variables[var]
                var_array = np.array([v.magnitude for v in list(var_data)])
                var_interp = interp(pres_interp, pres, var_array)
                data[var] = var_interp.tolist()
            else:
                data[var] = [np.nan] * len(pres_interp)

        # Save
        if save and not nan_sounding:
            with open(sndg_file, 'wb') as handle:
                pickle.dump(data, handle, protocol=2)

    return data
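A usage sketch for upper_air; the station IDs, sounding directory, date, and lowest_p_level value are placeholder assumptions.

from datetime import datetime

# Hypothetical usage sketch: all values below are placeholders.
config = {
    'verbose': True,
    'lowest_p_level': 925,
    'Obs': {
        'sounding_data_dir': 'site_data/soundings',
        'sounding_station_id': 'SLE',
    },
}

sounding = upper_air(config, 'KSEA', 'SLE', datetime(2019, 6, 1, 12),
                     use_nan_sounding=True)
for var, values in sounding.items():
    # with lowest_p_level = 925, values are interpolated to 600/750/850/925 hPa
    print(var, values)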