def deserialize_random_forest_regressor(model_dict):
    """Rebuild a fitted RandomForestRegressor from its dict serialization.

    `model_dict` is the structure produced by the matching serializer:
    constructor kwargs under 'params', per-tree dicts under 'estimators_',
    plus the fitted scalar attributes copied verbatim.
    """
    model = RandomForestRegressor(**model_dict['params'])

    # Rehydrate each serialized tree and attach them as the fitted ensemble.
    trees = []
    for tree_dict in model_dict['estimators_']:
        trees.append(deserialize_decision_tree_regressor(tree_dict))
    model.estimators_ = np.array(trees)

    # Fitted/configured scalar attributes, copied straight across.
    for attr_name in ('n_features_', 'n_outputs_', 'max_depth',
                      'min_samples_split', 'min_samples_leaf',
                      'min_weight_fraction_leaf', 'max_features',
                      'max_leaf_nodes', 'min_impurity_decrease',
                      'min_impurity_split'):
        setattr(model, attr_name, model_dict[attr_name])

    # Out-of-bag statistics exist only if the forest was fit with oob_score.
    if 'oob_score_' in model_dict:
        model.oob_score_ = model_dict['oob_score_']
    if 'oob_prediction_' in model_dict:
        model.oob_prediction_ = np.array(model_dict['oob_prediction_'])

    return model
def RFR(x_train,y_train,x_test,udf_trees=100,udf_max_features='auto', udf_min_samples=1, do_CV=False,names=None): from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error from sklearn.cross_validation import cross_val_score if do_CV: ### Randomly split up training set into 80/20 split. ### ### 80 for CV, 20 for "Test" score ### from sklearn.cross_validation import train_test_split x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(x_train,y_train,test_size=0.20, random_state=42) param_grid = {'max_features': [4,5,6], 'min_samples_leaf':[50,250,1000,2500]} est=RandomForestRegressor(n_estimators=100,verbose=1, n_jobs=-1) cv_scores=list() test_scores=list() params_list=list() start = time() for mfeatures in param_grid['max_features']: for minSamples in param_grid['min_samples_leaf']: print 'Trying parameter combination with 100 trees: (MaxFeatures=%i, minSamples=%i)' % (mfeatures,minSamples) est.min_samples_leaf=minSamples est.max_features=mfeatures cv_score=cross_val_score(est,x_train_cv,y_train_cv,scoring='mean_squared_error',cv=5) cv_scores.append(np.mean(cv_score)) ### Create the labels for display purposes ### params_list.append((mfeatures,minSamples)) ### Perform 20% test set score ### est.fit(x_train_cv,y_train_cv) y_pred=est.predict(x_test_cv) test_scores.append(mean_squared_error(y_test_cv,y_pred)) print 'Took %.2f seconds for parameter tuning.' %(time()-start) print 'writing CV results to file...' results = np.array([params_list,cv_scores,test_scores]).T ## should have 48 results... print 'Parameter tuning results........' print 'Parameters (max_features, min_samples_leaf), CV_Scores' for i in range(len(results)): print results[i] else: ### Train the RFC Classifier with the optimal parameters found above ### ### RFR only takes 'MSE', need to change it to RMSEPE as per contest rules ### print 'Fitting Random Forest with optimal user-defined parameters....' 
est=RandomForestRegressor(n_estimators=udf_trees, max_features=udf_max_features,min_samples_leaf=udf_min_samples,n_jobs=-1,verbose=1) est.fit(x_train,y_train) #idx=np.where(x_test[:,1]==0) #x_test=np.delete(x_test, 1, axis=1) y_pred=est.predict(x_test) y_pred=np.exp(y_pred) #y_pred[idx] = 0 ### Plot feature importances ### #plot_feature_importance(est, names) print 'Writing submission file....' with open('RFC_Submission.csv','wb') as testfile: w=csv.writer(testfile) w.writerow(('Id','Sales')) for i in range(len(y_pred)): w.writerow(((i+1),y_pred[i])) testfile.close() print 'File written to disk...'