# Normalise age and rating (currently disabled):
# data[0:,-2] = data[0:,-2] / data[0:,-2].max()
# data[0:,-1] = data[0:,-1] / data[0:,-1].max()
# data_word_age = data[0:,0:-1]

# Every column except the last is a feature; the last column is the target,
# flattened to a 1-D vector with one entry per row of `data`.
train_x = data[:, :-1]
train_y = np.array(data[:, -1:]).reshape((data.shape[0],))

# Hold out 30% of the rows for evaluation; random_state=0 keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_x, train_y, test_size=0.3, random_state=0)

reg = RandomForestRegressor()
reg.fit(X_train, y_train)
p = reg.predict(X_test)
s = reg.score(X_test, y_test)  # scikit-learn regressors report R^2 here
print(s)

# Above mean or not: turn each prediction into 1.0 when it is strictly
# greater than the average rating, else 0.0.  The average is taken over
# `train_y`, i.e. the full target column as it was before the split.
train_rating_average = np.average(train_y)
binary_p = np.where(p > train_rating_average, 1.0, 0.0)

# Buffer for the same binarisation applied to the true test targets.
binary_ytest = np.zeros([p.shape[0]])
iteration = 0
# NOTE(review): the visible source is cut off at this point, right after the
# loop header below; its body is not available here, so the header is kept
# verbatim and nothing is invented for it.
# for i in y_test:
# Draw one strip plot per feature column on a 4x4 grid of subplots.
# `plotnumber`, `features` and `target` are set up before this fragment.
for column in features:
    if plotnumber <= len(features):
        ax = plt.subplot(4, 4, plotnumber)
        # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
        # positionally, and older releases accept the keyword form as well.
        sns.stripplot(x=target, y=features[column], ax=ax)
    plotnumber += 1
plt.show()


# In[15]:

# Import from the public package.  The private ``sklearn.ensemble.forest``
# module path was deprecated in scikit-learn 0.22 and removed afterwards.
from sklearn.ensemble import RandomForestRegressor


# In[16]:

rand_clf = RandomForestRegressor()


# In[272]:

# Hold out 30% of the rows.  No random_state is given, so the split (and the
# scores below) vary from run to run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30)


# In[273]:

rand_clf.fit(x_train, y_train)


# In[274]:

# Score on the data the forest was fitted on.
rand_clf.score(x_train, y_train)


# In[275]:

# Score on the held-out data.
rand_clf.score(x_test, y_test)
# Generate training and test set
X_train, y_train = OrganizeData(nucleus, 'train')
X_test, y_test = OrganizeData(nucleus, 'test')

# Feature scaling.  The mean/variance are learned from the training set only
# and then applied to both sets, so the test features live on the same scale
# the forest was trained on.  Scaling each set with its own statistics would
# shift the test features relative to the split thresholds learned in training.
# (assumes `preprocessing` is sklearn.preprocessing -- it already provided
# `scale`; confirm against the file's imports)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Set the parameters for the random forest estimator
estimator = RandomForestRegressor(
    n_estimators=50,
    max_features=16,
    max_depth=25,
    min_samples_split=5,
    min_samples_leaf=5,
    random_state=0,
)

# Build the random forest of regression trees from the training set
estimator = estimator.fit(X_train_scaled, y_train)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(estimator.score(X_train_scaled, y_train))
print(estimator.score(X_test_scaled, y_test))

# Correlation between targets and predictions on the training set
predicted = estimator.predict(X_train_scaled)
cc = np.corrcoef(y_train, predicted)
print(cc)
print(estimator)
#my_plotting.simple_plot_overlay(y_train,predicted)

# Correlation between targets and predictions on the test set
predicted = estimator.predict(X_test_scaled)
cc = np.corrcoef(y_test, predicted)
print(cc)
print(estimator)
#my_plotting.simple_plot_overlay(y_test,predicted)
#             if res[i][j] == 100:
#                 res[i][j] = 0
#             else:
#                 res[i][j] = -0.01 * res[i][j]
#     return res

# def normalizeY(arr):
#     arr=arr/100
#     return arr


def _fit_and_pickle(model, features, targets, path):
    """Fit *model* on the given data, pickle it to *path* and return it."""
    model.fit(features, targets)
    with open(path, 'wb') as handle:
        pickle.dump(model, handle)
    return model


if __name__ == '__main__':
    train_x, test_x, train_y, test_y, x_data, y_data = load(train_data_path)

    # NOTE(review): both models are fitted on x_data/y_data while
    # train_x/train_y are never used.  If x_data also contains the test rows,
    # the "test" scores below are optimistic -- confirm against load().

    # Random forest: fit, persist to `filename`, then report both scores.
    rf_model = _fit_and_pickle(RandomForestRegressor(), x_data, y_data, filename)
    rf_train_score = rf_model.score(x_data, y_data)
    rf_test_score = rf_model.score(test_x, test_y)
    print("RF train score:", rf_train_score)
    print("RF test score:", rf_test_score)

    # Single decision tree: same procedure, persisted to `filename2`.
    dt_model = _fit_and_pickle(DecisionTreeRegressor(), x_data, y_data, filename2)
    dt_train_score = dt_model.score(x_data, y_data)
    dt_test_score = dt_model.score(test_x, test_y)
    print("DT train score:", dt_train_score)
    print("DT test score:", dt_test_score)
# Baseline: ordinary least-squares regression with default settings.
model = linear_model.LinearRegression()
model.fit(train_X, train_Y)

# R^2 on the training data and on the test data.
model.score(train_X, train_Y)
model.score(test_X, test_Y)

# Next, try more complex models to see whether the test R^2 can be pushed
# higher than the linear baseline.

# Random forest with default hyper-parameters, fitted without any
# cross-validated parameter selection.
# Author's observation: the train score is much higher than the test score.
forest = RandomForestRegressor()
forest.fit(train_X, train_Y)
forest.score(train_X, train_Y)
forest.score(test_X, test_Y)

# Random forest whose tree depth and number of trees are chosen by a
# 10-fold cross-validated grid search.
new_forest = RandomForestRegressor()
params_grid = [
    {
        'max_depth': [3, 5, 10, None],
        'n_estimators': [5, 10, 15, 20, 50, 80],
    },
]
grid_search = GridSearchCV(estimator=new_forest, param_grid=params_grid, cv=10)
grid_search.fit(train_X, train_Y)
grid_search.score(test_X, test_Y)
grid_search.best_estimator_

# Gradient-boosted regression: the estimator and the grid to search over.
boost = GradientBoostingRegressor()
# NOTE(review): the last learning rate is 2, forty times the first value;
# it may have been meant as .2 -- confirm before relying on this grid.
params_grid = [
    {
        'learning_rate': [0.05, 0.1, 2],
        'n_estimators': [20, 50, 100, 150],
    },
]