targets = data['Sales'].values
targets = np.reshape(targets, [targets.shape[0], 1])
data = data[features].values
# put Sales in the first column so the target is easy to slice off later
data = np.concatenate((targets, data), axis=1)
np.random.shuffle(data)

from sklearn.ensemble import RandomForestRegressor

# hold out the last 10% of the shuffled rows for evaluation
offset = int(data.shape[0] * 0.9)
train = data[:offset, :]
test = data[offset:, :]

params = {'n_estimators': 50,
          'n_jobs': 1,
          'verbose': 2,
          'max_depth': 30,
          'max_features': 0.6}

forest = RandomForestRegressor(**params)
# fit on log1p(Sales); predictions are mapped back with expm1 below
forest.fit(train[:, 1:], np.log1p(train[:, 0]))

# WARNING: MAY DUMP A FEW GIGS OR MORE
#from sklearn.externals import joblib
#joblib.dump(forest, '../saved_models/rf_001.model')
#print('wrote forest model to rf_001.model')

out = np.expm1(forest.predict(test[:, 1:]))
target = test[:, 0]
err = rmspe(target, out)
print(err)
print('done')
print(forest.feature_importances_)
"colsample_bytree": 0.7, #"silent": 1 "seed": 1301 } num_boost_round = 1700 print("Train an XGBoost model") # sales in first column for easy syntax later holdout = (data.Year==2014) & ((data.Month==8)) X_train = data[['Sales'] + features][~holdout].values X_valid = data[['Sales'] + features][holdout].values #data = data[['Sales'] + features].values #X_train, X_valid = train_test_split(data, test_size=0.012, random_state=10) y_train = np.log1p(X_train[:, 0]) y_valid = np.log1p(X_valid[:, 0]) dtrain = xgb.DMatrix(X_train[:, 1:], y_train) dvalid = xgb.DMatrix(X_valid[:, 1:], y_valid) watchlist = [(dtrain, 'train'), (dvalid, 'eval')] gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \ early_stopping_rounds=120, feval=rmspe_xg, verbose_eval=True) gbm.save_model('../saved_models/xgb_001.model') print 'model saved in xgb_001.model' print("Validating") yhat = gbm.predict(xgb.DMatrix(X_valid[:, 1:])) error = rmspe(X_valid[:,0], np.expm1(yhat)) print('RMSPE: {:.6f}'.format(error))