def get_regressor_model(model_config):
    """Build the regressor selected by ``model_config.model_type``.

    Args:
        model_config: Object exposing ``model_type`` (tested with the ``in``
            operator, so a string or a container of tags both work) and
            ``model_params`` (keyword arguments forwarded to the estimator
            constructor).

    Returns:
        A ``sklearn.ensemble.RandomForestRegressor`` built from
        ``model_config.model_params`` with ``n_estimators`` reset to 0.

    Raises:
        NotImplementedError: ``model_type`` matches both 'xgb' and 'tree';
            that branch has no implementation yet.
        ValueError: ``model_type`` matches none of the known model types.
    """
    model_type = model_config.model_type

    if 'rf' in model_type or 'random_forest' in model_type:
        from sklearn.ensemble import RandomForestRegressor

        regressor = RandomForestRegressor(**model_config.model_params)
        # temporarily set n_estimators to 0, will add estimators for each
        # batch in iteration
        regressor.n_estimators = 0
        return regressor

    if 'xgb' in model_type and 'tree' in model_type:
        # TODO: add other options?
        # Previously this branch fell through to ``return regressor`` with the
        # name unbound; fail with an explicit, descriptive error instead.
        raise NotImplementedError(
            'model_type %r is recognised but XGBoost tree regressors are not '
            'implemented yet.' % (model_type,)
        )

    # Unknown model type: log it (with the real value interpolated) and raise
    # rather than returning an unbound local.
    logger.error(
        'model_type %s provided in config file is not a valid model type.',
        model_type,
    )
    raise ValueError(
        'model_type %r provided in config file is not a valid model type.'
        % (model_type,)
    )
# NOTE(review): fragment of a larger routine -- the enclosing definition and
# the loop headers that bind `i`, `it`, `h`, `v`, `year`, `month`, `day` and
# `x_` are not visible here. Uses Python 2 `print` statements.
#
# What the visible statements do, in order:
#   * fetch `y_` through `mds.get_y(...)` and assert it has as many rows as `x_`;
#   * when `it == 0` start the accumulated `x`/`y` from `x_`/`y_`, otherwise
#     append the new rows with `np.vstack`;
#   * print a progress line with the tile/date and the accumulated shapes;
#   * call `set_rf_samples` with 70% of the accumulated row count;
#   * for every `ib` in `band_ids`: take column `ib` of `y` as the target, set
#     `rf_models[ib].n_estimators = (i + 1) * n_trees`, refit on the
#     accumulated data and print the training-set `score`;
#   * dump every `rf_models[ib]` to 'rf<ib>.model' with `joblib.dump`.
#
# NOTE(review): raising `n_estimators` before each refit presumably relies on
# the models having been created with warm_start=True -- confirm where
# `rf_models` is built.
y_ = mds.get_y(h, v, year, month, day, masks) assert x_.shape[0] == y_.shape[0] if it == 0: x = x_ y = y_ else: x = np.vstack((x, x_)) y = np.vstack((y, y_)) print 'loading data: %d of %d tiles, h%.2dv%.2d, year:%d, month:%d, day:%d, x:%s, y:%s' % (i+1, len(tile_hv), \ h, v, year, month, day, x.shape, y.shape) train_x = x set_rf_samples(int(train_x.shape[0] * 0.7)) for ib in band_ids: train_y = y[:, ib] rf = rf_models[ib] rf.n_estimators = (i + 1) * n_trees rf.fit(train_x, train_y) score = rf.score(train_x, train_y) print ' (%d of %d bands): h%.2dv%.2d: train r squared: %.5f' % ( ib + 1, n_bands, h, v, score) for ib in band_ids: joblib.dump(rf_models[ib], 'rf%d.model' % ib)
def _fit_and_predict(forest, n_estimators, X, y_train, train_ind, valid_ind):
    """Grow *forest* to *n_estimators* trees and predict both splits.

    Returns a ``(validation_predictions, training_predictions)`` tuple.
    """
    forest.n_estimators = n_estimators
    forest.fit(X[train_ind], y_train)
    return forest.predict(X[valid_ind]), forest.predict(X[train_ind])


def test_RF(X_tv, y_tv, dates_tv, day_test, day_valid_small, day_valid,
            m_f_opt, m_d_opt, n_e_opt):
    """Grid-search random forests per (year, month) and collect RMSLE scores.

    For every month of 2011 and 2012 a train/validation split is obtained
    from ``ss.get_train`` / ``ss.get_valid``. For every ``max_features`` /
    ``max_depth`` pair two warm-started forests are grown (one fitted on the
    'lcasual' column, one on 'lregistered') through the sizes in ``n_e_opt``,
    and validation and training errors are recorded for each size.

    Args:
        X_tv: Feature array, indexed with the masks returned by ``ss``.
        y_tv: DataFrame providing the 'lcasual', 'lregistered' and 'ltotal'
            columns.
        dates_tv: Dates aligned with ``X_tv``; only used for the split and
            for progress output.
        day_test: Forwarded to ``ss.get_train`` / ``ss.get_valid``.
        day_valid_small: Validation setting used for January-March 2011.
        day_valid: Validation setting used for every other month.
        m_f_opt: Candidate ``max_features`` values.
        m_d_opt: Candidate ``max_depth`` values.
        n_e_opt: Candidate ``n_estimators`` values. The forests use
            ``warm_start=True``, so scikit-learn requires these to be in
            non-decreasing order.

    Returns:
        DataFrame with one row per (year, month, max_features, max_depth,
        n_estimators) combination holding validation ('rmsle_*') and
        training ('train_rmsle_*') errors for total, casual and registered.
    """
    # preparing data
    years = [2011, 2012]
    months = range(1, 13)
    param_cols = ['year', 'month', 'max_features', 'max_depth', 'n_estimators']
    score_cols = [
        'rmsle_tot', 'rmsle_cas', 'rmsle_reg',
        'train_rmsle_tot', 'train_rmsle_cas', 'train_rmsle_reg',
    ]
    columns = param_cols + score_cols

    n_rows = (len(years) * len(months) * len(m_f_opt) * len(m_d_opt)
              * len(n_e_opt))
    rf_results = pd.DataFrame(np.zeros([n_rows, len(columns)]),
                              columns=columns)
    # Pre-fill the parameter columns; the product order matches the nesting
    # of the loops below, so row ``i`` always describes iteration ``i``.
    rf_results.loc[:, param_cols] = list(
        it.product(years, months, m_f_opt, m_d_opt, n_e_opt))

    i = 0
    for year in years:
        for month in months:
            if year == 2012 or month >= 4:
                day_valid_curr = day_valid
            else:
                day_valid_curr = day_valid_small

            train_ind = ss.get_train(dates_tv, year, month, day_test,
                                     day_valid_curr)
            valid_ind = ss.get_valid(dates_tv, year, month, day_test,
                                     day_valid_curr)

            print('year {}, month {}'.format(year, month), flush=True)
            print('train size: {}, validation size: {}'.format(
                train_ind.sum(), valid_ind.sum()))
            print('learning from {} to {}'.format(
                dates_tv[train_ind].min(), dates_tv[train_ind].max()))
            print('validation from {} to {}'.format(
                dates_tv[valid_ind].min(), dates_tv[valid_ind].max()))

            for m in m_f_opt:
                for md in m_d_opt:
                    # warm_start lets each forest keep its trees and only add
                    # the missing ones when n_estimators is raised.
                    rf_c = RandomForestRegressor(n_jobs=-1, max_features=m,
                                                 max_depth=md,
                                                 warm_start=True)
                    rf_r = RandomForestRegressor(n_jobs=-1, max_features=m,
                                                 max_depth=md,
                                                 warm_start=True)
                    for n in n_e_opt:
                        # ``.loc`` replaces ``.ix`` (removed in pandas 1.0);
                        # rf_results has the default integer index, so the
                        # label ``i`` addresses the same row.

                        ## casual
                        pred_cas, pred_cas_train = _fit_and_predict(
                            rf_c, n, X_tv, y_tv.loc[train_ind, 'lcasual'],
                            train_ind, valid_ind)
                        rf_results.loc[i, 'rmsle_cas'] = resf.rmsle_of_logs(
                            pred_cas, y_tv.loc[valid_ind, 'lcasual'])
                        rf_results.loc[i, 'train_rmsle_cas'] = (
                            resf.rmsle_of_logs(
                                pred_cas_train,
                                y_tv.loc[train_ind, 'lcasual']))

                        ## registered
                        pred_reg, pred_reg_train = _fit_and_predict(
                            rf_r, n, X_tv,
                            y_tv.loc[train_ind, 'lregistered'],
                            train_ind, valid_ind)
                        rf_results.loc[i, 'rmsle_reg'] = resf.rmsle_of_logs(
                            pred_reg, y_tv.loc[valid_ind, 'lregistered'])
                        rf_results.loc[i, 'train_rmsle_reg'] = (
                            resf.rmsle_of_logs(
                                pred_reg_train,
                                y_tv.loc[train_ind, 'lregistered']))

                        ## total
                        # NOTE(review): presumably equivalent to the
                        # resf.total_from_log form used for the training
                        # predictions just below -- confirm.
                        pred_total = np.log(
                            np.exp(pred_cas) + np.exp(pred_reg) - 1
                        )  #np.log(resf.total_from_log(pred_cas, pred_reg)+1)
                        pred_total_train = np.log(
                            resf.total_from_log(pred_cas_train,
                                                pred_reg_train) + 1)
                        rf_results.loc[i, 'rmsle_tot'] = resf.rmsle_of_logs(
                            pred_total, y_tv.loc[valid_ind, 'ltotal'])
                        rf_results.loc[i, 'train_rmsle_tot'] = (
                            resf.rmsle_of_logs(
                                pred_total_train,
                                y_tv.loc[train_ind, 'ltotal']))

                        print('Done: ', flush=True)
                        print(rf_results.loc[i], flush=True)
                        i += 1
    return rf_results
#plt.tight_layout()

# Hyper-parameter grid explored with out-of-bag scoring.
param_grid = dict(
    n_estimators=[10, 20, 50, 100],
    max_depth=list(range(3, 11)),
    max_features=[None],
)

# Maps (max_depth, max_features, n_estimators) -> out-of-bag score.
scores = {}
for max_depth in param_grid['max_depth']:
    print(max_depth)
    for max_features in param_grid['max_features']:
        print(max_features)
        # One warm-started forest per (max_depth, max_features) pair: raising
        # n_estimators between fits adds trees instead of starting over.
        rf = RandomForestRegressor(
            oob_score=True,
            warm_start=True,
            max_features=max_features,
            max_depth=max_depth,
        )
        for n_estimators in param_grid['n_estimators']:
            rf.n_estimators = n_estimators
            rf.fit(X, y)
            scores[(max_depth, max_features, n_estimators)] = rf.oob_score_

# Final model with fixed settings, evaluated on its out-of-bag predictions.
modelFinal = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    max_features=None,
    oob_score=True,
)
modelFinal.fit(X, y)
modelFinal.oob_score_
metrics.mean_squared_error(y, modelFinal.oob_prediction_)
metrics.mean_absolute_error(y, modelFinal.oob_prediction_)