class QRF:
    """ Fit a random forest (conditional quantile) to training data """
    def __init__(self,
                 quantiles,
                 min_samples_leaf=5,
                 n_estimators=100,
                 n_jobs=1,
                 random_state=0,
                 verbose=False):
        """ Initialization

        Parameters
        ----------
        quantiles : numpy array of quantile levels (q), each in the range (0,1)
        min_samples_leaf : integer, minimum number of samples per leaf
        n_estimators : integer, number of trees in the forest
        random_state : integer, seed used in quantile random forests
        """
        self.device = 'cpu'

        # Store input (sort the quantiles)
        self.quantiles = torch.from_numpy(np.sort(quantiles)).float().to(
            self.device)

        # Define RF model
        self.model = RandomForestQuantileRegressor(
            random_state=random_state,
            min_samples_leaf=min_samples_leaf,
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            verbose=verbose)

    def fit(self, X, Y, return_loss=None):
        # skgarden emits FutureWarnings on fit; silence them temporarily
        warnings.filterwarnings("ignore", category=FutureWarning)
        self.model.fit(X, Y)
        warnings.filterwarnings("default", category=FutureWarning)
        return 0

    def predict(self, X):
        """ Estimate the conditional quantiles given the features

        Parameters
        ----------
        X : numpy array of features (n X p)

        Returns
        -------
        ret_val : numpy array of estimated conditional quantiles (n X q)
        """
        quantiles = self.quantiles.cpu()
        ret_val = np.zeros((X.shape[0], len(quantiles)))
        print("Predicting RF quantiles:")
        for i in tqdm(range(len(quantiles))):
            # skgarden expects quantile levels in (0, 100); cast the torch
            # scalar to a plain float before rescaling
            ret_val[:, i] = self.model.predict(
                X, quantile=100 * float(quantiles[i]))
        return ret_val

    def get_quantiles(self):
        return self.quantiles.cpu().numpy()
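# A minimal usage sketch for the QRF wrapper above, on synthetic data; the
# quantile grid and forest size are illustrative assumptions, and the
# wrapper's own imports (numpy, torch, tqdm, skgarden) are assumed in scope.
import numpy as np

quantiles = np.array([0.05, 0.5, 0.95])   # levels in (0, 1)
qrf = QRF(quantiles, n_estimators=50, random_state=0)

X = np.random.uniform(size=(500, 3))
Y = X[:, 0] + 0.1 * np.random.randn(500)
qrf.fit(X, Y)

preds = qrf.predict(X)                    # shape (500, 3), one column per level
assert preds.shape == (X.shape[0], len(qrf.get_quantiles()))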
def fit_model(self):
    """
    Fit the random forest quantile regression model using the train dataset

    Returns
    -------
    output: RandomForestQuantileRegressor object
        the fitted random forest quantile regression model
    """
    x_train_dummy = pd.get_dummies(self.x)
    self.random_forest = RandomForestQuantileRegressor()
    self.random_forest.set_params(**self.params)
    self.random_forest = self.random_forest.fit(x_train_dummy, self.y)
    return self.random_forest
class QRF:
    @validated()
    def __init__(self, params: Optional[dict] = None):
        """
        Implements Quantile Random Forests using skgarden.
        """
        from skgarden import RandomForestQuantileRegressor

        # Guard against params=None before unpacking
        self.model = RandomForestQuantileRegressor(**(params or {}))

    def fit(self, x_train, y_train):
        self.model.fit(np.array(x_train), np.array(y_train))

    def predict(self, x_test, quantile):
        # skgarden expects quantile levels in (0, 100)
        return self.model.predict(x_test, quantile=100 * quantile)
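# Hedged usage sketch for the skgarden-backed QRF above; the parameter values
# are illustrative assumptions, and the gluonts-style @validated decorator is
# assumed to pass the params dict through unchanged.
import numpy as np

qrf = QRF(params={"n_estimators": 100, "min_samples_leaf": 5,
                  "random_state": 0})
x_train = np.random.uniform(size=(200, 4))
y_train = x_train.sum(axis=1) + 0.1 * np.random.randn(200)
qrf.fit(x_train, y_train)
median = qrf.predict(x_train, quantile=0.5)   # the wrapper rescales 0.5 -> 50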
def build_model(**kwargs):
    # kwargs are accepted for interface compatibility but unused here
    model = RandomForestQuantileRegressor(random_state=0,
                                          min_samples_split=10,
                                          n_estimators=1000,
                                          n_jobs=-1,
                                          warm_start=False)
    return model
def train_qr_algo(model_obj, theta_mat, stats_mat, algo_name, learner_kwargs,
                  pytorch_kwargs, alpha, prediction_grid):
    # Train the chosen quantile regression algorithm
    if algo_name == 'xgb':
        # (sklearn gradient boosting with the quantile loss, despite the key)
        model = GradientBoostingRegressor(loss='quantile', alpha=alpha,
                                          **learner_kwargs)
        model.fit(theta_mat.reshape(-1, model_obj.d), stats_mat.reshape(-1, ))
        pred_vec = model.predict(prediction_grid.reshape(-1, model_obj.d))
    elif algo_name == 'rf':
        model = RandomForestQuantileRegressor(**learner_kwargs)
        model.fit(theta_mat.reshape(-1, model_obj.d), stats_mat.reshape(-1, ))
        # skgarden expects quantile levels in (0, 100)
        pred_vec = model.predict(prediction_grid.reshape(-1, model_obj.d),
                                 quantile=alpha * 100)
    elif algo_name == 'lgb':
        model = lgb.LGBMRegressor(objective='quantile', alpha=alpha,
                                  **learner_kwargs)
        model.fit(theta_mat.reshape(-1, model_obj.d), stats_mat.reshape(-1, ))
        pred_vec = model.predict(prediction_grid.reshape(-1, model_obj.d))
    elif algo_name == 'pytorch':
        model = q_model([alpha], dropout=0.1, in_shape=model_obj.d,
                        **pytorch_kwargs)
        loss_func = QuantileLoss(quantiles=[alpha])
        learner = Learner(model,
                          partial(torch.optim.Adam, weight_decay=1e-6),
                          loss_func, device="cpu")
        learner.fit(theta_mat.reshape(-1, model_obj.d),
                    stats_mat.reshape(-1, ), **learner_kwargs)
        pred_vec = learner.predict(
            prediction_grid.reshape(-1, model_obj.d).astype(np.float32))
    elif algo_name == 'pytorch_3l':
        model = q_model_3l([alpha], dropout=0.1, in_shape=model_obj.d,
                           **pytorch_kwargs)
        loss_func = QuantileLoss(quantiles=[alpha])
        learner = Learner(model,
                          partial(torch.optim.Adam, weight_decay=1e-6),
                          loss_func, device="cpu")
        learner.fit(theta_mat.reshape(-1, model_obj.d),
                    stats_mat.reshape(-1, ), **learner_kwargs)
        pred_vec = learner.predict(
            prediction_grid.reshape(-1, model_obj.d).astype(np.float32))
    elif algo_name == 'linear':
        # statsmodels' QuantReg expects (endog, exog): response first, then
        # features (the original passed them in the reverse order)
        pred_vec = QuantReg(stats_mat.reshape(-1, ),
                            theta_mat.reshape(-1, model_obj.d)).fit(
            q=alpha).predict(prediction_grid.reshape(-1, model_obj.d))
    else:
        raise ValueError('Quantile regression algorithm not defined in the file.')

    return pred_vec
class QuantileForest:
    def __init__(self, quantiles=0.5, min_samples_split=10, n_estimators=100):
        self.quantiles = quantiles
        self.model = RandomForestQuantileRegressor(
            random_state=0,
            min_samples_split=min_samples_split,
            n_estimators=n_estimators)
        self.label = 'Quantile Forest'
        self.filename = 'rf'

    def fit(self, X, y):
        self.model.fit(X, y)

    def predict(self, X):
        # skgarden expects quantile levels in (0, 100)
        if np.isscalar(self.quantiles):
            return self.model.predict(X, quantile=self.quantiles * 100)
        return np.array([
            self.model.predict(X, quantile=q * 100) for q in self.quantiles
        ]).T
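# Illustrative sketch (synthetic data) of the two predict() modes above:
# a scalar quantile yields a 1-d array, a sequence yields an (n, q) matrix.
import numpy as np

X = np.random.uniform(size=(100, 2))
y = X.sum(axis=1)

qf = QuantileForest(quantiles=0.5)
qf.fit(X, y)
assert qf.predict(X).shape == (100,)

qf_multi = QuantileForest(quantiles=[0.1, 0.5, 0.9])
qf_multi.fit(X, y)
assert qf_multi.predict(X).shape == (100, 3)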
class ForestQuantileRegressor:
    def __init__(self, p, alpha, random_state=2020, verbose=True):
        # Parameters of the random forest
        # (skgarden expects quantile levels in percent, hence the factor 100)
        self.alpha = 100 * alpha
        self.model = RandomForestQuantileRegressor(random_state=random_state,
                                                   min_samples_split=3,
                                                   n_estimators=100)

    def fit(self, X, y):
        # Reshape the data
        X = np.asarray(X)
        y = np.asarray(y)
        self.model.fit(X, y)

    def predict(self, X):
        # Stack the lower (alpha) and upper (100 - alpha) conditional quantiles
        lower = self.model.predict(X, quantile=self.alpha)
        y = np.concatenate(
            (lower[:, np.newaxis],
             self.model.predict(X, quantile=100.0 - self.alpha)[:, np.newaxis]),
            1)
        return y
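# Minimal sketch of the two-column interval returned by the class above
# (synthetic data; with alpha=0.05 the columns are the 5% and 95%
# conditional quantiles, so roughly 90% of the labels should fall inside).
import numpy as np

fqr = ForestQuantileRegressor(p=1, alpha=0.05)
X = np.random.uniform(size=(300, 1))
y = X[:, 0] + 0.1 * np.random.randn(300)
fqr.fit(X, y)
band = fqr.predict(X)          # shape (300, 2): [:, 0] lower, [:, 1] upper
coverage = np.mean((y >= band[:, 0]) & (y <= band[:, 1]))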
def fit(self, X, y, sample_weight=None, eval_set=None,
        sample_weight_eval_set=None, **kwargs):
    X = dt.Frame(X)
    orig_cols = list(X.names)
    self.pre_get_model()
    from skgarden import RandomForestQuantileRegressor

    model = RandomForestQuantileRegressor(**self.params)
    X = self.basic_impute(X)
    X = X.to_numpy()
    model.fit(X, y)
    importances = np.array(model.feature_importances_)
    self.set_model_properties(
        model=model,
        features=orig_cols,
        importances=importances.tolist(),
        iterations=self.params["n_estimators"],
    )
class QuantileForest:
    """
    Estimate conditional quantiles with a quantile forest
    (fits one model for all quantiles)
    """
    def __init__(self, x, y, args):
        super(QuantileForest, self).__init__()
        self.alpha = args.alpha
        self.model_name = "QuantileForest"
        self.rfqr = RandomForestQuantileRegressor(n_estimators=args.n_learners)
        # min_samples_split=args.min_samples_split,
        # n_estimators=args.n_learners,
        # random_state=args.seed)
        # self.rfqr.set_params(max_features=x.shape[1] // args.max_features)
        self.rfqr.fit(x, y)

    def predict(self, x_te):
        preds_low = self.rfqr.predict(x_te, (self.alpha / 2) * 100)
        preds_high = self.rfqr.predict(x_te, (1 - self.alpha / 2) * 100)
        # Midpoint of the interval (the original `(high - low) / 2`
        # computed the half-width instead of the center)
        preds_mean = (preds_high + preds_low) / 2
        return torch.Tensor(preds_mean), torch.Tensor(preds_low), torch.Tensor(
            preds_high)
def __init__(self,
             dependent_var_str: str,
             len_of_lag=48,
             len_of_forecast=48,
             min_samples_split=2,
             len_of_test=48,
             n_estimators=1000,
             n_jobs=4):
    """
    Initializing the class

    :param dependent_var_str: sets the variable to be fit
    :param min_samples_split: minimum number of samples needed to split an
        internal node
    :param n_estimators: number of estimators (trees) used
    """
    self.model = RandomForestQuantileRegressor(
        min_samples_split=min_samples_split,
        n_estimators=n_estimators,
        bootstrap=True,
        # min_weight_fraction_leaf=0.01,
        max_leaf_nodes=1000,
        n_jobs=n_jobs)
    self.dependent_var = dependent_var_str
    self.length_of_lag = len_of_lag
    self.length_of_test = len_of_test
    self.length_of_forecast = len_of_forecast
def __init__(self, switch, X_train, y_train, **RF_params):
    """
    The initialization includes registration and fitting of the random forest.

    :param switch: choose between 'Classifier', 'Regressor' and
        'RegressorQuantile' via the corresponding string.
    :param X_train: features used for training, as a numpy array or
        pandas DataFrame.
    :param y_train: the target for regression or the labels for
        classification, also a numpy array or pandas DataFrame.
    :param RF_params: any keyword arguments accepted by the underlying
        scikit-learn / scikit-garden estimator.
    """
    if switch == 'Classifier':
        clf = RandomForestClassifier(**RF_params)
    elif switch == 'Regressor':
        clf = RandomForestRegressor(**RF_params)
    elif switch == 'RegressorQuantile':
        clf = RandomForestQuantileRegressor(**RF_params)
    else:
        # raising (instead of the original print + return) avoids leaving
        # self.model unset on a bad switch value
        raise ValueError("specify 'Classifier', 'Regressor' or "
                         "'RegressorQuantile' (first argument)")
    clf.fit(X_train, y_train)
    self.model = clf
def run_random_forest(self):
    x_norm2, w = self.prep_train_data()
    x_train, x_test = train_test_split(x_norm2, test_size=self.__test_size)
    x_tr = x_train.values
    x_te = x_test.values
    x_tr, y_tr, x_te, y_te = split_time_series(x_tr, x_te, self.__proportion)

    # use the last value of each target window as the label
    train_y = [y[-1] for y in y_tr]

    rfqr = RandomForestQuantileRegressor(random_state=0,
                                         min_samples_split=2,
                                         n_estimators=100,
                                         criterion='mae')
    rfqr.fit(x_tr, train_y)

    # np.float is deprecated; use the builtin float
    test_y = [float(y[-1]) for y in y_te]

    y_mean_test = rfqr.predict(x_te)
    y_high_test = rfqr.predict(x_te, 85)
    y_low_test = rfqr.predict(x_te, 15)

    test_predictions = pd.DataFrame({
        'high': y_high_test * w[-1],
        'low': y_low_test * w[-1],
        'point': y_mean_test * w[-1],
        'actual': np.array(test_y) * w[-1]
    })
    test_predictions['id'] = np.arange(0, len(test_predictions))
    self.__logging.info(test_predictions)
    self.__model = rfqr
    return rfqr, w, test_predictions
max_features = None  # default

##############################################################
##############################################################
#######              TRAIN MODELS                       ######

X = df[features]
X = np.array(X)

# def train_qrf():
# label = 'C'  # quantity to predict
etaC_model = RandomForestQuantileRegressor(min_samples_split=min_samples_split,
                                           n_estimators=n_estimators,
                                           max_features=max_features)
etaW_model = RandomForestQuantileRegressor(min_samples_split=min_samples_split,
                                           n_estimators=n_estimators,
                                           max_features=max_features)
etaN_model = RandomForestQuantileRegressor(min_samples_split=min_samples_split,
                                           n_estimators=n_estimators,
                                           max_features=max_features)

etaC_model.fit(X, df['etaC'].values)
etaW_model.fit(X, df['etaW'].values)
etaN_model.fit(X, df['etaN'].values)

lon = ncres['lon'][:]
lat = ncres['lat'][:]
nlon = np.size(lon)
def pred_error(dfr, label, features, seeds, *,
               min_samples_split=10,
               nshuffles=10,
               n_estimators=1000,
               test_size=0.33,
               min_dist=100,
               max_features=None,  # the original default was the string
                                   # 'none', which sklearn rejects
               plotmaps=True):
    ''' Predict error components using a quantile regression forest model '''
    # initialize arrays with prediction and test values:
    nX = np.size(features)
    Y_TEST = []
    D_TEST = []
    T_TEST = []
    LOWER = []
    UPPER = []
    MEDIAN = []
    EXPECTED = []
    LAT_TEST = []
    LON_TEST = []
    QRF_IMPS = np.zeros((nX, nshuffles))
    RF_IMPS = np.zeros((nX, nshuffles))

    for irs in range(nshuffles):
        # df = shuffle(df)  # first reshuffle order of rows
        df0 = dfr.copy()
        df0 = df0.sample(frac=1, replace=False,
                         random_state=seeds[irs]).reset_index(drop=True)
        df = remove_neighbours(df0, min_dist=min_dist)

        X = df[features]
        # Xnames = list(X.columns)
        X = np.array(X)
        Yname = 'eta{}'.format(label)        # error in the parameter `label`
        Y = df[Yname].values
        D = df['{}d'.format(label)].values   # downscaled values to correct
        T = df['{}g'.format(label)].values   # ground truth value
        Lat = df['clat'].values              # latitude value
        Lon = df['clon'].values              # longitude value

        X_train, X_test, Y_train, Y_test, D_train, D_test, T_train, T_test, \
            Lat_train, Lat_test, Lon_train, Lon_test = train_test_split(
                X, Y, D, T, Lat, Lon, test_size=test_size, shuffle=False)

        if irs < 3 and plotmaps:
            plt.figure()
            plt.plot(dfr.clat, dfr.clon, '.')
            plt.plot(Lat_test, Lon_test, 'or')
            plt.plot(Lat_train, Lon_train, 'ob')
            plt.savefig(os.path.join(cfun.outplot, 'stats',
                                     'qrf_gen_{}_{}'.format(irs, label)))
            plt.close()

        # fit quantile regression forest
        rfqr = RandomForestQuantileRegressor(
            min_samples_split=min_samples_split,
            n_estimators=n_estimators,
            max_features=max_features)
        rfqr.fit(X_train, Y_train)
        upper = rfqr.predict(X_test, quantile=75)
        lower = rfqr.predict(X_test, quantile=25)
        median = rfqr.predict(X_test, quantile=50)
        qrf_imps = rfqr.feature_importances_
        # print(qrf_imps)

        # Fit random forest
        rfr = RandomForestRegressor(min_samples_split=min_samples_split,
                                    n_estimators=n_estimators,
                                    max_features=max_features)
        rfr.fit(X_train, Y_train)
        # use the plain random forest for the expected value (the original
        # called rfqr.predict here, ignoring the freshly fitted rfr)
        expected = rfr.predict(X_test)
        rf_imps = rfr.feature_importances_
        # print(rf_imps)

        Y_TEST = np.concatenate((Y_TEST, Y_test))
        D_TEST = np.concatenate((D_TEST, D_test))
        T_TEST = np.concatenate((T_TEST, T_test))
        UPPER = np.concatenate((UPPER, upper))
        LOWER = np.concatenate((LOWER, lower))
        MEDIAN = np.concatenate((MEDIAN, median))
        EXPECTED = np.concatenate((EXPECTED, expected))
        LAT_TEST = np.concatenate((LAT_TEST, Lat_test))
        LON_TEST = np.concatenate((LON_TEST, Lon_test))
        QRF_IMPS[:, irs] = qrf_imps
        RF_IMPS[:, irs] = rf_imps
        # print(QRF_IMPS)

    MEAN_QRF_IMPS = np.mean(QRF_IMPS, axis=1)
    MEAN_RF_IMPS = np.mean(RF_IMPS, axis=1)
    CORR_QRF = D_TEST / (MEDIAN + 1)
    CORR_RF = D_TEST / (EXPECTED + 1)

    res = {'pred_qrf': MEDIAN,
           'pred_rf': EXPECTED,
           'upper': UPPER,
           'lower': LOWER,
           'y_test': Y_TEST,
           'd_test': D_TEST,
           't_test': T_TEST,
           'lat_test': LAT_TEST,
           'lon_test': LON_TEST,
           'corr_qrf': CORR_QRF,
           'corr_rf': CORR_RF,
           'qrf_imps': QRF_IMPS,
           'rf_imps': RF_IMPS,
           'mean_qrf_imps': MEAN_QRF_IMPS,
           'mean_rf_imps': MEAN_RF_IMPS}
    return res
class RandomForestQR:
    def __init__(self, params, quantiles, verbose=False):
        self.regressor = RF(n_estimators=params['n_estimators'],
                            max_features=params['max_features'],
                            min_samples_leaf=params['min_samples_leaf'],
                            random_state=params['random_state'],
                            n_jobs=params['n_jobs'])
        self.quantiles = quantiles
        self.cv_quantiles = quantiles
        self.verbose = verbose
        self.cv = params["cv"]

    def fit(self, X, y, cv=True):
        if self.cv and cv:
            self.tune(X, y)
        self.regressor.fit(X, y)

    def predict(self, X, quantiles=None):
        if quantiles is None:
            quantiles = self.cv_quantiles
        predictions = np.zeros((X.shape[0], len(quantiles)))
        for j in range(len(quantiles)):
            q = 100.0 * quantiles[j]
            predictions[:, j] = self.regressor.predict(X, q)
        # enforce monotonicity across the predicted quantiles
        predictions.sort(axis=1)
        return predictions

    def tune(self, X, y, test_ratio=0.2, random_state=1):
        "Tune the quantile levels using cross-validation"
        coverage_factor = 0.85
        target_coverage = round(self.quantiles[-1] - self.quantiles[0],
                                3) * coverage_factor
        range_vals = 0.3
        num_vals = 10

        print(" [CV] target coverage = %.3f" % (target_coverage))
        sys.stdout.flush()

        quantiles = np.array(self.quantiles)
        grid_q_low = np.linspace(quantiles[0], quantiles[0] + range_vals,
                                 num_vals).reshape(-1, 1)
        grid_q_median = np.repeat(0.5, num_vals).reshape(-1, 1)
        grid_q_high = np.linspace(quantiles[-1], quantiles[-1] - range_vals,
                                  num_vals).reshape(-1, 1)
        grid_q = np.concatenate((grid_q_low, grid_q_median, grid_q_high), 1)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_ratio, random_state=random_state)

        print(" [CV] Fitting random forest... ", end="")
        sys.stdout.flush()
        self.fit(X_train, y_train, cv=False)
        print("done.")
        sys.stdout.flush()

        best_avg_length = 1e10
        best_q = grid_q[0]
        for q in grid_q:
            print(" [CV] q = [%.3f,%.3f,%.3f], " % (q[0], q[1], q[-1]),
                  end="")
            sys.stdout.flush()
            y_predictions = self.predict(X_test, quantiles=q)
            lower = y_predictions[:, 0]
            upper = y_predictions[:, -1]
            coverage = np.mean((y_test >= lower) & (y_test <= upper))
            avg_length = np.mean(upper - lower)
            print("coverage = %.3f, length = %.3f" % (coverage, avg_length))
            sys.stdout.flush()
            if (coverage >= target_coverage) and \
                    (avg_length < best_avg_length):
                best_avg_length = avg_length
                best_q = q
            else:
                break

        print(" [CV] Best q = [%.3f,%.3f,%.3f]" %
              (best_q[0], best_q[1], best_q[-1]))
        sys.stdout.flush()
        self.cv_quantiles = best_q
        return best_q
class QuantileForestRegressorAdapter(RegressorAdapter):
    """ Conditional quantile estimator, defined as quantile random forests (QRF)

    References
    ----------
    .. [1] Meinshausen, Nicolai. "Quantile regression forests."
           Journal of Machine Learning Research 7.Jun (2006): 983-999.
    """
    def __init__(self, model, fit_params=None, quantiles=[5, 95], params=None):
        """ Initialization

        Parameters
        ----------
        model : None, unused parameter (for compatibility with nc class)
        fit_params : None, unused parameter (for compatibility with nc class)
        quantiles : numpy array, low and high quantile levels in range (0,100)
        params : dictionary of parameters
                params["random_state"] : integer, seed for splitting the data
                  in cross-validation. Also used as the seed in quantile
                  random forests (QRF)
                params["min_samples_leaf"] : integer, parameter of QRF
                params["n_estimators"] : integer, parameter of QRF
                params["max_features"] : integer, parameter of QRF
                params["CV"] : boolean, use cross-validation (True) or not
                  (False) to tune the two QRF quantile levels to obtain the
                  desired coverage
                params["test_ratio"] : float, ratio of held-out data, used
                  in cross-validation
                params["coverage_factor"] : float, to avoid too conservative
                  estimation of the prediction band, when tuning the two QRF
                  quantile levels in cross-validation one may ask for
                  prediction intervals with reduced average coverage, equal
                  to coverage_factor*(q_high - q_low)
                params["range_vals"] : float, width of the search range when
                  tuning the quantile levels by cross-validation: the low
                  level is swept from quantiles[0] up to
                  quantiles[0] + range_vals, and the high level from
                  quantiles[1] down to quantiles[1] - range_vals
                params["num_vals"] : integer, when tuning QRF's quantile
                  parameters, sweep over a grid of length num_vals
        """
        super(QuantileForestRegressorAdapter, self).__init__(model, fit_params)
        # Instantiate model
        self.quantiles = quantiles
        self.cv_quantiles = self.quantiles
        self.params = params
        self.rfqr = RandomForestQuantileRegressor(
            random_state=params["random_state"],
            min_samples_leaf=params["min_samples_leaf"],
            n_estimators=params["n_estimators"],
            max_features=params["max_features"])

    def fit(self, x, y):
        """ Fit the model to data

        Parameters
        ----------
        x : numpy array of training features (nXp)
        y : numpy array of training labels (n)
        """
        if self.params["CV"]:
            target_coverage = self.quantiles[1] - self.quantiles[0]
            coverage_factor = self.params["coverage_factor"]
            range_vals = self.params["range_vals"]
            num_vals = self.params["num_vals"]
            grid_q_low = np.linspace(self.quantiles[0],
                                     self.quantiles[0] + range_vals,
                                     num_vals).reshape(-1, 1)
            grid_q_high = np.linspace(self.quantiles[1],
                                      self.quantiles[1] - range_vals,
                                      num_vals).reshape(-1, 1)
            grid_q = np.concatenate((grid_q_low, grid_q_high), 1)

            self.cv_quantiles = tune_params_cv.CV_quntiles_rf(
                self.params, x, y, target_coverage, grid_q,
                self.params["test_ratio"], self.params["random_state"],
                coverage_factor)

        self.rfqr.fit(x, y)

    def predict(self, x):
        """ Estimate the conditional low and high quantiles given the features

        Parameters
        ----------
        x : numpy array of test features (nXp)

        Returns
        -------
        ret_val : numpy array of estimated conditional quantiles (nX2)
        """
        lower = self.rfqr.predict(x, quantile=self.cv_quantiles[0])
        upper = self.rfqr.predict(x, quantile=self.cv_quantiles[1])
        ret_val = np.zeros((len(lower), 2))
        ret_val[:, 0] = lower
        ret_val[:, 1] = upper
        return ret_val
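# Hedged example of wiring up the adapter above; every value in this params
# dict is an illustrative assumption, not taken from the original code, and
# RegressorAdapter / tune_params_cv are assumed importable from the package.
params = {
    "random_state": 0,
    "min_samples_leaf": 1,
    "n_estimators": 100,
    "max_features": 1,
    "CV": False,              # set True to tune the two quantile levels
    "test_ratio": 0.2,
    "coverage_factor": 0.9,
    "range_vals": 10,         # on the (0, 100) quantile scale
    "num_vals": 4,
}
adapter = QuantileForestRegressorAdapter(model=None, fit_params=None,
                                         quantiles=[5, 95], params=params)
# adapter.fit(x_train, y_train); bands = adapter.predict(x_test)  # (n, 2)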
def CV_quntiles_rf(params, X, y, target_coverage, grid_q, test_ratio,
                   random_state, coverage_factor=0.9):
    """ Tune the low and high quantile level parameters of the quantile
        random forests method, using cross-validation

    Parameters
    ----------
    params : dictionary of parameters
            params["random_state"] : integer, seed for splitting the data
              in cross-validation. Also used as the seed in quantile random
              forest (QRF)
            params["min_samples_leaf"] : integer, parameter of QRF
            params["n_estimators"] : integer, parameter of QRF
            params["max_features"] : integer, parameter of QRF
    X : numpy array, containing the training features (nXp)
    y : numpy array, containing the training labels (n)
    target_coverage : desired coverage of prediction band. The output
        coverage may be smaller if coverage_factor <= 1; in this case the
        target will be modified to target_coverage*coverage_factor
    grid_q : numpy array, of low and high quantile levels to test
    test_ratio : float, test size of the held-out data
    random_state : integer, seed for splitting the data in cross-validation.
        Also used as the seed in QRF.
    coverage_factor : float, when tuning the two QRF quantile levels one may
        ask for a prediction band with smaller average coverage, equal to
        coverage_factor*(q_high - q_low), to avoid too conservative
        estimation of the prediction band

    Returns
    -------
    best_q : numpy array of low and high quantile levels (length 2)

    References
    ----------
    .. [1] Meinshausen, Nicolai. "Quantile regression forests."
           Journal of Machine Learning Research 7.Jun (2006): 983-999.
    """
    target_coverage = coverage_factor * target_coverage
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=random_state)

    best_avg_length = 1e10
    best_q = grid_q[0]

    rf = RandomForestQuantileRegressor(
        random_state=params["random_state"],
        min_samples_leaf=params["min_samples_leaf"],
        n_estimators=params["n_estimators"],
        max_features=params["max_features"])
    rf.fit(X_train, y_train)

    for q in grid_q:
        y_lower = rf.predict(X_test, quantile=q[0])
        y_upper = rf.predict(X_test, quantile=q[1])
        coverage, avg_length = helper.compute_coverage_len(
            y_test, y_lower, y_upper)
        if (coverage >= target_coverage) and (avg_length < best_avg_length):
            best_avg_length = avg_length
            best_q = q
        else:
            break

    return best_q
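# Sketch of how the grid_q argument can be built for CV_quntiles_rf, mirroring
# the adapter's fit() above (levels on the (0, 100) scale; values assumed).
import numpy as np

quantiles = [5, 95]
range_vals, num_vals = 10, 5
grid_q_low = np.linspace(quantiles[0], quantiles[0] + range_vals,
                         num_vals).reshape(-1, 1)
grid_q_high = np.linspace(quantiles[1], quantiles[1] - range_vals,
                          num_vals).reshape(-1, 1)
grid_q = np.concatenate((grid_q_low, grid_q_high), 1)   # shape (num_vals, 2)
# best_q = CV_quntiles_rf(params, X, y, target_coverage=90, grid_q=grid_q,
#                         test_ratio=0.2, random_state=0)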
def rfqr_model(pos_dict, predict_year, sz):
    model_dict = {}
    predict_dict = {}
    outcomes = {}
    for pos in pos_dict:
        print(pos)
        target = copy.deepcopy(pos_dict[pos])

        # team dummy variables
        dum = pd.get_dummies(target.Tm)
        target = target.drop('Tm', axis=1)
        target = pd.concat([target, dum], axis=1)

        # save these values to evaluate predictions later
        outcomes[pos] = target.loc[target.Year == predict_year]\
            .reset_index(drop=True)[['Name', 'pts_next_year']]

        # set aside data to use for model prediction when the model is done
        predict_dict[pos] = target.loc[target.Year == predict_year]\
            .reset_index(drop=True)\
            .drop(['Year', 'pts_next_year', 'Name'], axis=1)

        # make sure new values aren't used in the modeling
        target = target.loc[target.Year < predict_year].reset_index(drop=True)

        # only use 'sz' years of data before prediction year
        target = target.loc[target.Year > predict_year - sz].drop(
            ['Year', 'Name'], axis=1)

        # separate labels, targets, features
        labels = np.array(target['pts_next_year'])
        target = target.drop(['pts_next_year'], axis=1)
        features = np.array(target)
        feature_list = list(target.columns)

        # run model
        rfqr = RandomForestQuantileRegressor(random_state=0,
                                             n_estimators=3000)
        rfqr.fit(features, labels)
        model_dict[pos] = rfqr

        upper = rfqr.predict(predict_dict[pos], quantile=98.5)
        lower = rfqr.predict(predict_dict[pos], quantile=2.5)
        median = rfqr.predict(predict_dict[pos], quantile=50)
        # interval = upper - lower
        # sort_ind = np.argsort(interval)
        y_true_all = outcomes[pos]['pts_next_year']  # [sort_ind]
        # mean = (upper + lower) / 2
        # Center such that the mean of the prediction interval is at 0.0
        # y_true_all -= mean
        # upper -= mean
        # lower -= mean

        plt.plot(y_true_all, "ro")
        plt.fill_between(np.arange(len(upper)), lower, upper, alpha=0.2,
                         color="r", label="Pred. interval")
        plt.plot(median)
        plt.xlabel("X variable")
        plt.ylabel("Points")
        plt.xlim([0, 100])
        plt.show()

        # Get numerical feature importances
        importances = list(rfqr.feature_importances_)
        feature_importances = [
            (feature, round(importance, 2))
            for feature, importance in zip(feature_list, importances)
        ]
        feature_importances = sorted(feature_importances,
                                     key=lambda x: x[1], reverse=True)
        for pair in feature_importances:
            print('Variable: {:20} Importance: {}'.format(*pair))

    # get outputs
    final_dict = copy.deepcopy(predict_dict)
    for pos in predict_dict:
        model = model_dict[pos]
        final_dict[pos]['prediction'] = model.predict(predict_dict[pos])
        final_dict[pos]['Names'] = outcomes[pos]['Name']
        final_dict[pos]['pts_next_year'] = outcomes[pos]['pts_next_year']

    return final_dict
label="Cross-validation score") sns.despine() plt.ylabel('R2 score') plt.xlabel('Training examples') plt.ylim((0, 1)) plt.legend(loc="best") plt.show() # =============================================================== # RF QUANTILE REGRESSOR # =============================================================== # == fit rfqr = RandomForestQuantileRegressor(**best_params) rfqr.fit(X, y) lower = rfqr.predict(X, quantile=2.5) upper = rfqr.predict(X, quantile=97.5) med = rfqr.predict(X, quantile=50) ypred = reg.predict(X) # plot confidence intervals sort_ind = np.argsort(ypred) plt.plot(np.arange(len(upper)), lower[sort_ind], label='lower') plt.plot(np.arange(len(upper)), ypred[sort_ind], label='predicted') plt.plot(np.arange(len(upper)), med[sort_ind], label='median') plt.plot(np.arange(len(upper)), upper[sort_ind], label='upper') plt.xlabel('ordered samples') plt.ylabel('dropout rate') plt.legend()
def train_RandomForestQuantileRegressor(population, plpData, train,
                                        modelOutput, seed, quiet,
                                        n_estimators, criterion, max_features,
                                        max_depth, min_samples_split,
                                        min_samples_leaf,
                                        min_weight_fraction_leaf,
                                        max_leaf_nodes, bootstrap, oob_score,
                                        warm_start):
    print("Training RandomForestQuantileRegressor ")
    y = population[:, 1]
    X = plpData[population[:, 0], :]
    trainInds = population[:, population.shape[1] - 1] > 0

    print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1]))
    print("population loaded- %s rows and %s columns" %
          (np.shape(population)[0], np.shape(population)[1]))

    ###########################################################################
    if train:
        pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0))
        print("Calculating prediction for train set of size %s" % (pred_size))
        test_pred = np.zeros(pred_size)  # one slot per cross-validation row
        for i in range(1,
                       int(np.max(population[:, population.shape[1] - 1]) + 1),
                       1):
            testInd = population[population[:, population.shape[1] - 1] > 0,
                                 population.shape[1] - 1] == i
            trainInd = (population[population[:, population.shape[1] - 1] > 0,
                                   population.shape[1] - 1] != i)
            train_x = X[trainInds, :][trainInd, :]
            train_y = y[trainInds][trainInd]
            test_x = X[trainInds, :][testInd, :]
            print("Fold %s split %s in train set and %s in test set" %
                  (i, train_x.shape[0], test_x.shape[0]))
            print("Train set contains %s outcomes " % (np.sum(train_y)))

            print("Training fold %s" % (i))
            start_time = timeit.default_timer()
            tmodel = RandomForestQuantileRegressor(
                n_estimators=n_estimators,
                criterion=criterion,
                max_features=max_features,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                max_leaf_nodes=max_leaf_nodes,
                bootstrap=bootstrap,
                oob_score=oob_score,
                warm_start=warm_start,
                random_state=seed,
                n_jobs=-1)
            tmodel = tmodel.fit(X=csr_matrix(train_x), y=train_y)
            end_time = timeit.default_timer()
            print("Training fold took: %.2f s" % (end_time - start_time))

            print("Calculating predictions on left out fold set...")
            ind = (population[:, population.shape[1] - 1] > 0)
            ind = population[ind, population.shape[1] - 1] == i
            test_pred[ind] = tmodel.predict(csr_matrix(test_x))
            print("Prediction complete: %s rows " %
                  (np.shape(test_pred[ind])[0]))
            print("Mean: %s prediction value" % (np.mean(test_pred[ind])))

        # merge pred with indexes[testInd,:]
        test_pred.shape = (
            population[population[:, population.shape[1] - 1] > 0, :].shape[0],
            1)
        prediction = np.append(
            population[population[:, population.shape[1] - 1] > 0, :],
            test_pred, axis=1)
        return prediction

    # train final:
    else:
        print("Training final RandomForestQuantileRegressor model "
              "on all train data...")
        print("X- %s rows and Y %s length" %
              (X[trainInds, :].shape[0], y[trainInds].shape[0]))
        start_time = timeit.default_timer()
        tmodel = RandomForestQuantileRegressor(
            n_estimators=n_estimators,
            criterion=criterion,
            max_features=max_features,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_leaf_nodes=max_leaf_nodes,
            bootstrap=bootstrap,
            oob_score=oob_score,
            warm_start=warm_start,
            random_state=seed,
            n_jobs=-1)
        tmodel = tmodel.fit(X=csr_matrix(X[trainInds, :]), y=y[trainInds])
        end_time = timeit.default_timer()
        print("Training final took: %.2f s" % (end_time - start_time))

        # save the model:
        if not os.path.exists(modelOutput):
            os.makedirs(modelOutput)
        print("Model saved to: %s" % (modelOutput))
        joblib.dump(tmodel, os.path.join(modelOutput, "model.pkl"),
                    compress=True)

        # a regressor's predict() returns a 1-d array (the original indexed
        # [:, 0], which only applies to 2-d classifier output)
        pred = tmodel.predict(csr_matrix(X[trainInds, :]))
        pred.shape = (
            population[population[:, population.shape[1] - 1] > 0, :].shape[0],
            1)
        prediction = np.append(
            population[population[:, population.shape[1] - 1] > 0, :],
            pred, axis=1)
        return prediction, tmodel.feature_importances_
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from skgarden import RandomForestQuantileRegressor

X = pd.read_csv(r'c:\test\2010pop-.csv',
                usecols=['slope', 'poi', 'dem', 'ndvi', 'dmsp'])
y = pd.read_csv(r'c:\test\2010pop-.csv', usecols=['log_pop'])
X = np.array(X)
y = np.array(y)
y = y.reshape(y.shape[0], )

# note: recent scikit-learn releases require shuffle=True whenever
# random_state is set on KFold
kf = KFold(n_splits=6, random_state=0, shuffle=True)
rfqr = RandomForestQuantileRegressor(random_state=0,
                                     min_samples_split=10,
                                     n_estimators=1000)

y_true_all = []
lower = []
upper = []

for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = (X[train_index], X[test_index],
                                        y[train_index], y[test_index])
    rfqr.set_params(max_features=X_train.shape[1] // 3)
    rfqr.fit(X_train, y_train)
    y_true_all = np.concatenate((y_true_all, y_test))
    upper = np.concatenate((upper, rfqr.predict(X_test, quantile=98.5)))
    lower = np.concatenate((lower, rfqr.predict(X_test, quantile=2.5)))
with the more direct "just predict the maximum" approach. """ import numpy as np import matplotlib.pyplot as plt from skgarden import RandomForestQuantileRegressor from sklearn.ensemble import RandomForestRegressor _, y = simulate_ts(T=1500) pasts, futures = windows(y) samples = window_samples(pasts, futures, np.arange(0, 1.05, 0.05)) ############################################################################### # First the quantile forest idea ############################################################################### model = RandomForestQuantileRegressor(n_estimators=1000) # fit model using all the quantiles y_p = np.array([v["x"] for v in samples]) y_f = np.array([v["y"] for v in samples]) model.fit(y_p, y_f) # make predictions only for 0.9 quantiles x_q = np.array([v["x"] for v in samples if v["quantile"] == 1]) q_hat = model.predict(x_q, quantile=1) y_q = np.array([v["y"] for v in samples if v["quantile"] == 1]) plt.scatter(y_q, q_hat) #plt.show() ###############################################################################
def CV_quntiles_rf(params, X, y, target_coverage, grid_q, test_ratio,
                   random_state, coverage_factor=1.0):
    """ Tune the low and high quantile level parameters of the quantile
        random forests method, using cross-validation

    Parameters
    ----------
    params : dictionary of parameters
            params["random_state"] : integer, seed for splitting the data
              in cross-validation. Also used as the seed in quantile random
              forest (QRF)
            params["min_samples_leaf"] : integer, parameter of QRF
            params["n_estimators"] : integer, parameter of QRF
            params["max_features"] : integer, parameter of QRF
    X : numpy array, containing the training features (nXp)
    y : numpy array, containing the training labels (n)
    target_coverage : desired coverage of prediction band. The output
        coverage may be smaller if coverage_factor <= 1; in this case the
        target will be modified to target_coverage*coverage_factor
    grid_q : numpy array, of low and high quantile levels to test
    test_ratio : float, test size of the held-out data
    random_state : integer, seed for splitting the data in cross-validation.
        Also used as the seed in QRF.
    coverage_factor : float, when tuning the two QRF quantile levels one may
        ask for a prediction band with smaller average coverage, equal to
        coverage_factor*(q_high - q_low), to avoid too conservative
        estimation of the prediction band

    Returns
    -------
    best_q : numpy array of low and high quantile levels (length 2)
    best_coverage : float, average cross-validated coverage at best_q
    best_length : float, average cross-validated interval length at best_q

    References
    ----------
    .. [1] Meinshausen, Nicolai. "Quantile regression forests."
           Journal of Machine Learning Research 7.Jun (2006): 983-999.
    """
    target_coverage = coverage_factor * target_coverage
    rf = RandomForestQuantileRegressor(
        random_state=params["random_state"],
        min_samples_leaf=params["min_samples_leaf"],
        n_estimators=params["n_estimators"],
        max_features=params["max_features"])

    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, test_size=test_ratio, random_state=random_state)
    n_folds = 10
    kf = KFold(n_splits=n_folds)
    folds = kf.split(X, y)

    coverage_values = np.zeros((len(grid_q), n_folds))
    length_values = np.zeros((len(grid_q), n_folds))

    fold_idx = 0
    for fold in folds:
        print("[CV DEBUG] fold " + str(fold_idx + 1) + " of " +
              str(n_folds) + "... ", end="")
        sys.stdout.flush()
        idx_train, idx_test = fold
        X_train = X[idx_train, :]
        y_train = y[idx_train]
        X_test = X[idx_test, :]
        y_test = y[idx_test]
        rf.fit(X_train, y_train)
        for q_idx in range(len(grid_q)):
            q = grid_q[q_idx]
            y_lower = rf.predict(X_test, quantile=q[0])
            y_upper = rf.predict(X_test, quantile=q[-1])
            coverage, avg_length = helper.compute_coverage_len(
                y_test, y_lower, y_upper)
            coverage_values[q_idx, fold_idx] = coverage
            length_values[q_idx, fold_idx] = avg_length
        fold_idx = fold_idx + 1
        print("done.")
        sys.stdout.flush()

    avg_coverage = coverage_values.mean(1)
    avg_length = length_values.mean(1)
    # pick the last grid point whose coverage does not exceed the target,
    # falling back to the first one when none qualifies
    idx_under = np.where(avg_coverage <= target_coverage)[0]
    if len(idx_under) > 0:
        best_idx = np.max(idx_under)
    else:
        best_idx = 0
    best_q = grid_q[best_idx]
    best_coverage = avg_coverage[best_idx]
    best_length = avg_length[best_idx]
    print("[CV DEBUG] best q " + str(best_q) + ", coverage " +
          str(best_coverage) + ", length " + str(best_length))
    return best_q, best_coverage, best_length