def predict_rain_proba(config, predictor_file):
    """
    Predict probabilistic rain forecasts for 'pop' or 'categorical' types.

    Every configured estimator is loaded and evaluated in station order, but a single result is
    returned: the one produced for the last station.
    NOTE(review): returning only the last station's probabilities mirrors the existing behavior;
    confirm whether multi-station callers expect one result per station instead.

    :param config: configuration mapping; reads config['Model'], config['multi_stations'],
        config['station_id'] and config['verbose']
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :return: the value returned by the last estimator's predict_rain_proba method
    :raises TypeError: if the rain forecast type is not 'pop' or 'categorical', or if no
        'Rain tuning' section is configured
    :raises ValueError: if the numbers of estimator files and station IDs differ, or if no
        estimator is configured at all
    """
    model_config = config['Model']
    if model_config['rain_forecast_type'] not in ['pop', 'categorical']:
        raise TypeError("'quantity' rain forecast is not probabilistic, cannot get probabilities")
    rain_tuning = model_config.get('Rain tuning', None)
    if rain_tuning is None:
        raise TypeError('Probabilistic rain forecasts are only possible with a RainTuningEstimator')

    if config['multi_stations']:
        # Multiple stations: one estimator file per station ID
        station_ids = config['station_id']
        estimator_files = model_config['estimator_file']
        if len(estimator_files) != len(station_ids):
            raise ValueError("There must be the same number of estimator files as station IDs")
    else:
        # Single station: wrap the scalar settings so the loop below handles both cases
        station_ids = [config['station_id']]
        estimator_files = [model_config['estimator_file']]
    if len(estimator_files) == 0:
        # Without this guard the function would fail later with an opaque UnboundLocalError
        raise ValueError('No estimator files are configured')

    # Load the predictor data; the predictor matrix and the raw-rain switch are the same for
    # every station, so build them once instead of once per loop iteration
    predictor_data = read_pkl(predictor_file)
    predictors = np.concatenate((predictor_data['BUFKIT'], predictor_data['OBS']), axis=1)
    use_raw_rain = to_bool(rain_tuning.get('use_raw_rain', False))

    rain_proba = None
    for i, estimator_file in enumerate(estimator_files):
        if config['verbose']:
            print('predict: loading estimator %s' % estimator_file)
        estimator = read_pkl(estimator_file)
        if use_raw_rain:
            # The raw model rain for this station is passed alongside the predictors
            rain_proba = estimator.predict_rain_proba(predictors,
                                                      rain_array=predictor_data.rain[i])
        else:
            rain_proba = estimator.predict_rain_proba(predictors)

    return rain_proba
def build_train_data(config, predictor_file, no_obs=False, no_models=False, test_size=0):
    """
    Build the per-station arrays of training (and optionally testing) data.

    The predictor matrix is shared by all stations; targets (and, if 'use_raw_rain' is enabled in
    the 'Rain tuning' section, raw rain values) are taken per station from the predictor file.
    When test_size > 0, the stations' targets and rain are joined column-wise before the split so
    that every station receives the same train/test partition of samples.

    NOTE(review): the two branches appear to expect different ranks for data.rain[i]. The split
    branch reads data.rain[i][day] as a sequence; the no-split raw-rain branch transposes
    np.array([data.rain[i]]) and joins it to the 2-D predictors along axis 1. Confirm the shape.

    :param config: configuration mapping; reads config['multi_stations'], config['station_id'],
        config['verbose'] and config['Model']
    :param predictor_file: str: file read with read_pkl; must provide 'BUFKIT', 'OBS' and 'VERIF'
        entries and, where rain is used, a `rain` attribute indexed by station
    :param no_obs: bool: if True, train on the 'BUFKIT' predictors only
    :param no_models: bool: if True, train on the 'OBS' predictors only (if both no_obs and
        no_models are True, both flags are ignored)
    :param test_size: if > 0, passed to sklearn's train_test_split to hold out test samples
    :return: if test_size > 0: (p_train, t_train, r_train, p_test, t_test, r_test), each a list
        with one array per station, where r_train and r_test are None unless raw rain is used;
        otherwise (predictors, targets, rain) where predictors and targets are per-station lists
        and rain is data.rain if raw rain is used, else None
    """
    from sklearn.model_selection import train_test_split

    if config['multi_stations']:
        station_ids = config['station_id']
    else:
        station_ids = [config['station_id']]  # just one station
    num_stations = len(station_ids)
    verbose = config['verbose']

    if verbose:
        print('build_train_data: reading predictor file')
    rain_tuning = config['Model'].get('Rain tuning', None)
    use_raw_rain = rain_tuning is not None and to_bool(rain_tuning.get('use_raw_rain', False))
    data = read_pkl(predictor_file)

    # Select data; excluding both sources makes no sense, so that request is ignored
    if no_obs and no_models:
        no_obs = False
        no_models = False
    if no_obs:
        if verbose:
            print('build_train_data: not using observations to train')
        predictors = data['BUFKIT']
    elif no_models:
        if verbose:
            print('build_train_data: not using models to train')
        predictors = data['OBS']
    else:
        predictors = np.concatenate((data['BUFKIT'], data['OBS']), axis=1)

    if test_size > 0:
        verif = data['VERIF']
        pred_len = len(predictors[0])
        targets_len = len(verif[0][0])
        if verbose:
            print('build_train_data: %d verification values per station' % targets_len)

        # Join every station's targets (and rain) for a given day into one row, so that a single
        # train_test_split call keeps the stations aligned on the same samples
        targets_combined = []
        rain_combined = []
        for day in range(len(verif[0])):
            targets_day = verif[0][day]
            rain_day = data.rain[0][day]
            for i in range(1, len(verif)):
                targets_day = np.concatenate((targets_day, verif[i][day]))
                rain_day = np.concatenate((rain_day, data.rain[i][day]))
            targets_combined.append(targets_day)
            rain_combined.append(rain_day)
        targets_combined = np.array(targets_combined)
        rain_combined = np.array(rain_combined)
        rain_len = len(data.rain[0][0])

        if use_raw_rain:
            # Rain rides along as extra predictor columns through the split, then is peeled off
            predictors = np.concatenate((predictors, rain_combined), axis=1)

        p_train_raw, p_test_raw, t_train_raw, t_test_raw = train_test_split(
            predictors, targets_combined, test_size=test_size)

        p_train, t_train, p_test, t_test = [], [], [], []
        r_train = [] if use_raw_rain else None
        r_test = [] if use_raw_rain else None
        for i in range(num_stations):
            if use_raw_rain:
                # Columns [0, pred_len) are the real predictors; station i's rain follows at
                # offset pred_len + i * rain_len
                rain_lo = pred_len + i * rain_len
                rain_hi = rain_lo + rain_len
                p_train.append(np.array([row[:pred_len] for row in p_train_raw]))
                p_test.append(np.array([row[:pred_len] for row in p_test_raw]))
                r_train.append(np.array([row[rain_lo:rain_hi] for row in p_train_raw]))
                r_test.append(np.array([row[rain_lo:rain_hi] for row in p_test_raw]))
            else:
                p_train.append(np.copy(p_train_raw))
                p_test.append(np.copy(p_test_raw))
            # Station i's targets occupy its own slice of the joined target row
            target_lo = i * targets_len
            target_hi = target_lo + targets_len
            t_train.append(np.array([row[target_lo:target_hi] for row in t_train_raw]))
            t_test.append(np.array([row[target_lo:target_hi] for row in t_test_raw]))
        return p_train, t_train, r_train, p_test, t_test, r_test

    predictors_out = []
    targets = []
    for i in range(num_stations):
        targets.append(data['VERIF'][i])
        if use_raw_rain:
            # NOTE(review): see the shape remark in the docstring
            rain_data_one = np.array([data.rain[i]]).T
            predictors_out.append(np.concatenate((predictors, rain_data_one), axis=1))
        else:
            predictors_out.append(np.copy(predictors))
    return predictors_out, targets, (data.rain if use_raw_rain else None)
def predict_all(config, predictor_file, ensemble=False, time_series_date=None,
                naive_rain_correction=False, round_result=False, **kwargs):
    """
    Predict forecasts from the estimator(s) in config. Also return ensemble members and time series.

    Results are always returned as lists with one entry per station (a single-station run yields
    lists of length one).

    :param config: configuration mapping; reads config['Model'], config['multi_stations'],
        config['station_id'], config['verbose'] and config['time_series_interval']
    :param predictor_file: str: file containing predictor data from mosx.model.format_predictors
    :param ensemble: bool: if True, also return the predictions of each tree in the estimator
    :param time_series_date: datetime: the day the time series forecast is for; if None while a
        time series is requested, tomorrow (UTC) is used and a warning is printed. The time series
        is built from the last sample only.
    :param naive_rain_correction: bool: if True, applies manual tuning to the rain forecast
    :param round_result: bool: if True, rounds the predicted daily values
    :param kwargs: passed to the estimator's 'predict' (or 'predict_proba') method
    :return:
        predicted: list of ndarray: per station, num_samples x num_predicted_variables predictions
        all_predicted: list of ndarray: per station, num_samples x 4 x num_trees predictions of
            the individual trees; None if ensemble is False
        predicted_timeseries: list of DataFrame: per station, time series for the final sample;
            empty if config['Model']['predict_timeseries'] is false
    :raises ValueError: if the numbers of estimator files and station IDs differ
    """
    # Load the predictor data
    predictor_data = read_pkl(predictor_file)
    model_config = config['Model']
    rain_tuning = model_config.get('Rain tuning', None)
    use_raw_rain = rain_tuning is not None and to_bool(rain_tuning.get('use_raw_rain', False))

    if config['multi_stations']:
        # Multiple stations: one estimator file per station ID
        station_ids = config['station_id']
        estimator_files = model_config['estimator_file']
        if len(estimator_files) != len(station_ids):
            raise ValueError("There must be the same number of estimator files as station IDs")
    else:
        station_ids = [config['station_id']]  # just one station
        estimator_files = [model_config['estimator_file']]

    predictors = np.concatenate((predictor_data['BUFKIT'], predictor_data['OBS']), axis=1)

    predicted = []
    all_predicted = [] if ensemble else None
    predicted_timeseries = []
    for i, estimator_file in enumerate(estimator_files):
        if config['verbose']:
            print('predict: loading estimator %s' % estimator_file)
        estimator = read_pkl(estimator_file)

        # Classifiers produce the probability of precipitation through predict_proba
        if model_config['rain_forecast_type'] == 'pop' and getattr(estimator, 'is_classifier',
                                                                   False):
            predict_method = estimator.predict_proba
        else:
            predict_method = estimator.predict

        precip = predictor_data.rain[i]
        if use_raw_rain:
            predicted_one = predict_method(predictors, rain_array=precip, **kwargs)
        else:
            predicted_one = predict_method(predictors, **kwargs)

        # Check for precipitation override
        if naive_rain_correction:
            _apply_naive_rain_correction(predicted_one, precip, config['verbose'])
        else:
            # At least make sure we aren't predicting negative values...
            predicted_one[predicted_one[:, 3] < 0, 3] = 0.0

        # Round off daily values, if selected
        if round_result:
            predicted_one[:, :3] = np.round(predicted_one[:, :3])
            predicted_one[:, 3] = np.round(predicted_one[:, 3], 2)

        # If ensemble members are requested, get the results from each tree
        if ensemble:
            all_predicted.append(_predict_ensemble_members(estimator, predictors, model_config))

        if model_config['predict_timeseries']:
            if time_series_date is None:
                date_now = datetime.utcnow()
                time_series_date = (datetime(date_now.year, date_now.month, date_now.day) +
                                    timedelta(days=1))
                print('predict: warning: set time series start date to %s (was unspecified)' %
                      time_series_date)
            predicted_timeseries.append(
                _build_timeseries(predicted_one, time_series_date, config['time_series_interval']))

        predicted.append(predicted_one)

    return predicted, all_predicted, predicted_timeseries


def _apply_naive_rain_correction(predicted_one, precip, verbose):
    """Override, in place, rain forecasts (column 3) that disagree with the raw model rain."""
    for day in range(predicted_one.shape[0]):
        model_rain = precip[day]
        forecast = predicted_one[day, 3]
        if sum(model_rain) < 0.01:
            # Essentially no rain in the raw values: force a dry forecast
            if verbose:
                print('predict: warning: overriding MOS-X rain prediction of %0.2f on day %s '
                      'with 0' % (forecast, day))
            predicted_one[day, 3] = 0.
        elif forecast > max(model_rain) or forecast < min(model_rain):
            # Forecast lies outside the range of the raw values: fall back to the mean of the
            # raw values and the forecast. np.append is used because `model_rain + [forecast]`
            # only concatenates for lists; for an ndarray it would add element-wise instead.
            if verbose:
                print('predict: warning: overriding MOS-X prediction of %0.2f on day %s '
                      'with model mean' % (forecast, day))
            predicted_one[day, 3] = max(0., np.mean(np.append(model_rain, forecast)))


def _predict_ensemble_members(estimator, predictors, model_config):
    """Return a num_samples x 4 x num_trees array holding each tree's prediction."""
    num_samples = predictors.shape[0]
    if hasattr(estimator, 'named_steps'):
        # Pipeline: the trees expect imputed input. Keep the result in a local so the shared
        # predictor matrix is not replaced for the stations processed afterwards.
        forest = estimator.named_steps['regressor']
        tree_inputs = estimator.named_steps['imputer'].transform(predictors)
    else:
        forest = estimator
        tree_inputs = predictors

    # If we generated our own ensemble by bootstrapping, it must be treated as such
    if model_config['train_individual'] and model_config.get('Bootstrapping', None) is None:
        # One sub-estimator per predicted variable, each with its own trees
        num_trees = len(forest.estimators_[0].estimators_)
        members = np.zeros((num_samples, 4, num_trees))
        for v in range(4):
            for t in range(num_trees):
                tree = forest.estimators_[v].estimators_[t]
                try:
                    members[:, v, t] = tree.predict(tree_inputs)
                except AttributeError:
                    # Work around the 2-D array of estimators for GBTrees
                    members[:, v, t] = tree[0].predict(tree_inputs)
    else:
        # Each tree predicts all variables at once; keep the first four columns
        num_trees = len(forest.estimators_)
        members = np.zeros((num_samples, 4, num_trees))
        for t in range(num_trees):
            tree = forest.estimators_[t]
            try:
                members[:, :, t] = tree.predict(tree_inputs)[:, :4]
            except AttributeError:
                # Work around the 2-D array of estimators for GBTrees
                members[:, :, t] = tree[0].predict(tree_inputs)[:, :4]
    return members


def _build_timeseries(predicted_one, time_series_date, interval):
    """Build the rounded time series DataFrame from the last sample of a station's prediction."""
    num_hours = int(24 / interval) + 1
    # Columns 4 onward of the last sample hold four variables of num_hours values each.
    # NOTE(review): reshape/transpose normally return views, so the dewpoint assignment below
    # likely writes through to predicted_one as well — this matches the previous behavior.
    predicted_array = predicted_one[-1, 4:].reshape((4, num_hours)).T
    # Get dewpoint (second column is replaced by dewpoint() of the first two columns)
    predicted_array[:, 1] = dewpoint(predicted_array[:, 0], predicted_array[:, 1])
    # The series starts at hour 6 of the forecast date
    times = pd.date_range(time_series_date.replace(hour=6), periods=num_hours,
                          freq='%dH' % interval).to_pydatetime().tolist()
    variables = ['temperature', 'dewpoint', 'windSpeed', 'rain']
    round_dict = {'temperature': 0, 'dewpoint': 0, 'windSpeed': 0, 'rain': 2}
    timeseries = pd.DataFrame(predicted_array, index=times, columns=variables)
    return timeseries.round(round_dict)
def upper_air(config, station_id, sounding_station_id, date, use_nan_sounding=False,
              use_existing=True, save=True):
    """
    Retrieves upper-air data and interpolates to pressure levels. If use_nan_sounding is True, then
    if a retrieval error occurs, a blank sounding will be returned instead of an error.

    Processed soundings are cached as pickle files in config['Obs']['sounding_data_dir'] (created
    if missing), named after the surface station ID and the date. Retrieval is attempted twice
    before giving up. Soundings filled with NaN are never written to the cache.

    :param config: configuration mapping; reads config['lowest_p_level'], config['verbose'] and
        config['Obs']
    :param station_id: station ID of surface station used (only used to name the cache file)
    :param sounding_station_id: station ID of sounding station to use; if None, falls back to
        config['Obs']['sounding_station_id']
    :param date: datetime
    :param use_nan_sounding: bool: if True, use sounding of NaNs instead of raising an error
    :param use_existing: bool: preferentially use existing soundings in sounding_data_dir
    :param save: bool: if True, save processed soundings to sounding_data_dir
    :return: the cached object read from file, or an OrderedDict mapping each of 'height',
        'temperature', 'dewpoint', 'u_wind', 'v_wind' to a list with one value per
        interpolation level
    :raises ValueError: if the sounding cannot be retrieved and use_nan_sounding is False
    """
    variables = ['height', 'temperature', 'dewpoint', 'u_wind', 'v_wind']

    # Define levels for interpolation: same as model data, except omitting lowest_p_level
    plevs = [600, 750, 850, 925]
    pres_interp = np.array([p for p in plevs if p <= config['lowest_p_level']])

    date_stamp = datetime.strftime(date, '%Y%m%d%H')
    if config['verbose']:
        print('upper_air: retrieving sounding for %s' % date_stamp)
    sndg_data_dir = config['Obs']['sounding_data_dir']
    if not os.path.isdir(sndg_data_dir):
        os.makedirs(sndg_data_dir)
    sndg_file = '%s/%s_SNDG_%s.pkl' % (sndg_data_dir, station_id, date_stamp)

    # Try the cached sounding first, if allowed
    if use_existing:
        try:
            data = read_pkl(sndg_file)
        except Exception:
            # Deliberate best effort: a missing or unreadable cache file simply means the
            # sounding has to be retrieved again below
            pass
        else:
            if config['verbose']:
                print(' Read from file.')
            return data

    # Honor the sounding station requested by the caller; only fall back to the configuration
    # when none was given
    if sounding_station_id is None:
        sounding_station_id = config['Obs']['sounding_station_id']

    # Retrieve the sounding, trying twice. Only Exception subclasses are caught so that
    # KeyboardInterrupt and SystemExit still propagate.
    nan_sounding = False
    for _ in range(2):
        try:
            dset = WyomingUpperAir.request_data(date, sounding_station_id)
        except Exception:
            continue
        break
    else:
        if not use_nan_sounding:
            raise ValueError('error retrieving sounding for %s' % date)
        if config['verbose']:
            print('upper_air: warning: unable to retrieve sounding; using nan.')
        nan_sounding = True

    data = OrderedDict()
    if nan_sounding:
        # Blank sounding: one NaN per interpolation level, not saved to the cache
        for var in variables:
            data[var] = [np.nan] * len(pres_interp)
        return data

    # Retrieve pressure for interpolation to fixed levels
    pres = np.array([p.magnitude for p in dset.variables['pressure']])  # units are hPa

    # Get variables and interpolate; add to dictionary
    for var in variables:
        var_array = np.array([v.magnitude for v in dset.variables[var]])
        data[var] = interp(pres_interp, pres, var_array).tolist()

    # Save
    if save:
        with open(sndg_file, 'wb') as handle:
            pickle.dump(data, handle, protocol=2)

    return data