def run_random_forest(X_train, y_train, X_test, y_test):
    """Grid-search a RandomForestClassifier and report train/test accuracy.

    Parameters
    ----------
    X_train, X_test : DataFrame-like feature matrices; categorical columns
        are integer-encoded via ``dp.factorize_variables`` before fitting.
    y_train : pandas Series of training targets (``.values`` is passed to fit).
    y_test : test targets, used only for the final score printout.

    Returns
    -------
    The fitted ``GridSearchCV`` object (its best estimator is refit on
    ``X_train`` by GridSearchCV's default ``refit=True``).
    """
    print("Fitting Random Forest")

    # Encode categorical columns as integer codes so the forest can use them.
    X_train = dp.factorize_variables(X_train)
    X_test = dp.factorize_variables(X_test)

    # BUG FIX: 'sqrt' replaces the deprecated alias 'auto' ('auto' == 'sqrt'
    # for classifiers, so results are unchanged; 'auto' was removed in
    # scikit-learn 1.3 and would raise on current versions).
    max_features = [None, 'sqrt', 'log2']
    params = {
        'criterion': ['gini'],
        'random_state': [1234],
        'n_estimators': [100, 200],
        'max_features': max_features,
        'oob_score': [False],
        # EOSL -- just fit max depth trees. not going to overfit
        'max_depth': [None, 10],
        'n_jobs': [1],
    }

    # get_kfold_obj also returns a y value that the original code never
    # used; discard it explicitly to avoid shadowing confusion.
    cv_func, _ = dp.get_kfold_obj(y_train, k=3)

    grid = GridSearchCV(RandomForestClassifier(), params, cv=cv_func, verbose=2)
    grid.fit(X_train, y_train.values)

    print(grid.best_score_)
    print(grid.best_estimator_)
    print("Training set score {}".format(grid.score(X_train, y_train)))
    print("Test set score {}".format(grid.score(X_test, y_test)))
    return grid
# --- Prepare targets and identifiers ---------------------------------------
# Normalize the raw targets (psf.adjust_y semantics defined elsewhere) and
# strip the Id columns so they are not used as features.
y_train = pd.Series(psf.adjust_y(y_train))
y_test = pd.Series(psf.adjust_y(y_test))
X_ID = train.pop('Id')
Y_ID = test.pop('Id')

# are there any missing values?
dp.print_columns_with_missing(X_train)
dp.print_columns_with_missing(X_test)

# --- Run the model ----------------------------------------------------------
rf_model = rrf.run_random_forest(X_train, y_train, X_test, y_test)
print(rf_model.get_params())

# Refit the tuned grid on the full data set before scoring the holdout.
# NOTE(review): X_data_full / y_data_full are defined outside this chunk.
rf_final_model = rf_model.fit(X_data_full, y_data_full.values)

# --- Score the actual test set ----------------------------------------------
test = dp.read_data('{}data/test.csv'.format(file_path))
test = test.set_index('Id')
test = dp.factorize_variables(test)
test_predictions = rf_final_model.predict(test)

# Build the submission frame: one row per Id with its predicted Hazard.
test = pd.DataFrame(np.transpose([test.index, test_predictions]))
test.columns = ["Id", "Hazard"]

# --- Store results and pickle model -----------------------------------------
# BUG FIX: DataFrame.to_csv has no 'drop' keyword -- the original call
# raised TypeError. index=False achieves the evident intent of omitting
# the positional index column from the submission file.
test.to_csv('{}Output/{}.csv'.format(file_path, model_name), index=False)
#with open('/Users/Adrianna/Desktop/Kaggle/Liberty/Output/rf.pkl', 'wb') as f:
#    cPickle.dump(rf_model, f)