def __init__(self, params):
    # NOTE(review): "%CLASS%" is a code-generation placeholder — this snippet
    # is not valid Python until it is substituted with the concrete node
    # class name.
    super(%CLASS%, self).__init__(params)
    # Throwaway regressor used only to enumerate the constructor parameter
    # names; one input widget is created per parameter.
    tmp = RandomForestRegressor()
    params = tmp.get_params()  # rebinds the ctor argument — TODO confirm intended
    for key in params:
        self.create_new_input(type_="data", label=key,
                              widget_name="std line edit m",
                              widget_pos="besides", pos=-1)
    del tmp
def get_feat_imps():
    """Fit a fixed-hyperparameter random forest on the grid-search split and
    report its parameters, feature importances and train/test RMSE.

    Returns (model_params, feature_importances, feature_column_names).
    """
    X_train, X_test, y_train, y_test = data_for_gridsearch()
    feature_labels = X_train.columns
    forest = RFR(max_features='auto',
                 max_depth=None,
                 bootstrap=True,
                 min_samples_leaf=5,
                 min_samples_split=10,
                 n_estimators=100).fit(X_train, y_train)
    hyper_params = forest.get_params()
    importances = forest.feature_importances_
    print('model_params', hyper_params)
    print('feat_imps', importances)
    # eval_model is a project helper defined elsewhere.
    rmse_train, rmse_test, errors_for_plot = eval_model(
        forest, X_train, y_train, X_test, y_test)
    print('RMSE train/test: ', rmse_train, rmse_test)
    return hyper_params, importances, feature_labels
def createModelRFC(self):
    """Build and return the estimator used by the RFC pipeline."""
    logger.info("DEFINITION OF THE MODEL RFC")
    # Alternatives tried previously and kept for reference:
    #   ExtraTreesClassifier(n_estimators=..., max_features=..., n_jobs=-1)
    #   RandomForestClassifier(n_jobs=-1)
    rf_model = RandomForestRegressor(n_jobs=-1)
    logger.info("MODEL PARAMS: %s", rf_model.get_params(deep=True))
    return rf_model
def __init__(self, params):
    # Node that exposes every RandomForestRegressor constructor parameter
    # as one data output, plus a combined "param dict" output.
    super(RFRGetParams_NodeInstance, self).__init__(params)
    # Throwaway estimator used only to enumerate parameter names.
    tmp = RandomForestRegressor()
    params = tmp.get_params()  # rebinds the ctor argument — TODO confirm intended
    for key in params:
        self.create_new_output(type_="data", label=key, pos=-1)
    del tmp
    # Aggregate output carrying the whole parameter dict at once.
    self.create_new_output(type_="data", label="param dict", pos=-1)
def main():
    """End-to-end pipeline: load the donations CSV, grid-search a random
    forest, then refit a chosen configuration and plot feature importances.

    Fix: the original mixed Python-2 print statements (`print "..."`) and
    `xrange` with Python-3 `print()` calls in the same body, so it parsed
    under neither interpreter; everything now uses Python-3 syntax.
    """
    # Read in csv and correct formatting that was lost in transition.
    mydf = read_data_csv()
    # Eliminate rows that have 1 or more missing values.
    mydf = mydf.dropna(axis=0)
    # Convert region to something numerical.
    numeric_regions = {
        'Africa': 1,
        'Asia': 2,
        'Central America/ Caribbean': 3,
    }
    mydf['region_num'] = mydf['region'].map(numeric_regions)

    predictor_names = ['week_day_num_posted', 'day_posted', 'maleness',
                       'region_num', 'treat_cost', 'patient_age',
                       'smile_scale']
    numfeat = len(predictor_names)
    Y = mydf.dollars_per_day  # variable to predict
    X = mydf[predictor_names]

    # Build regressor using "best" random forest found by cross-validation.
    nfolds = 3  # number of folds to use for cross-validation
    # n_estimators: number of trees; max_features: features considered
    # when looking for the best split.
    parameters = {'n_estimators': [10, 100, 1000], 'max_features': [3, 5, 7]}
    njobs = 1  # pickle problems may occur if njobs > 1
    rf_tune = grid_search.GridSearchCV(RandomForestRegressor(), parameters,
                                       n_jobs=njobs, cv=nfolds)
    rf_opt = rf_tune.fit(X, Y)

    # Results of the grid search for optimal random forest parameters.
    print("Grid of scores:\n" + str(rf_opt.grid_scores_) + "\n")
    print("Best zero-one score: " + str(rf_opt.best_score_) + "\n")
    print("Optimal Model:\n" + str(rf_opt.best_estimator_) + "\n")

    # Re-fit with the chosen parameters (feature importances are not
    # available directly from the GridSearchCV fit).
    crf = RandomForestRegressor(n_jobs=njobs, max_features=3,
                                n_estimators=1000).fit(X, Y)
    print("Parameters used in chosen RF model:\n ", crf.get_params())
    plotting_names = np.array(('Day', 'Date', 'Sex', 'Region', 'Cost',
                               'Age', 'Smile'))
    print(crf.feature_importances_)
    indices = np.argsort(crf.feature_importances_)[::-1][:numfeat]
    plt.bar(range(numfeat), crf.feature_importances_[indices],
            align='center', alpha=.5)
    plt.xticks(range(numfeat), plotting_names[indices],
               rotation='horizontal', fontsize=12)
    plt.xlim([-1, numfeat])
    plt.ylabel('Feature importances', fontsize=12)
    plt.title('Feature importances computed by Random Forest', fontsize=16)
    plt.savefig('03_feature_importance.png', dpi=150)
def search_bestparam_RandomForestRegressor(X_train, y_train, df_search_best_param):
    """Grid-search RandomForestRegressor hyperparameters via the shared
    `search_bestparam` helper, recording results in *df_search_best_param*.

    Fixes: `min_samples_split=1` is rejected by scikit-learn (the value
    must be an int >= 2 or a float fraction in (0, 1]), which made every
    fit in that grid cell fail; also replaced a placeholder-free f-string
    with a plain string literal.
    """
    print("Search best params for RandomForestRegressor ...")
    model = RandomForestRegressor()
    print("Supported params", model.get_params())
    param_grid = {
        'n_estimators': [500, 700, 1000],
        'max_depth': [None, 1, 2, 3],
        'min_samples_split': [2, 3, 4]  # 1 is invalid for this parameter
    }
    search_bestparam(model, param_grid, X_train, y_train, df_search_best_param)
def build_model(df): X = df.iloc[:, :-1] # Using all column except for the last column as X y = df.iloc[:, -1] # Selecting the last column as y st.markdown('**1.2. Data splits**') st.write('Training set') st.info(X.shape) st.write('Test set') st.info(y.shape) st.markdown('**1.3. Variable details**:') st.write('X variable') st.info(list(X.columns)) st.write('y variable') st.info(y.name) # Data splitting X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_size) rf = RandomForestRegressor(n_estimators=parameter_n_estimators, max_features=parameter_max_features, random_state=parameter_random_state, criterion=parameter_criterion, min_samples_split=parameter_min_samples_split, min_samples_leaf=parameter_min_samples_leaf, bootstrap=parameter_bootstrap, oob_score=parameter_oob_score, n_jobs=parameter_n_jobs) rf.fit(X_train, y_train) st.subheader('2. Model Performance') st.markdown('**2.1. Training set**') y_pred_train = rf.predict(X_train) st.write('Coefficient of determination ($R^2$):') st.info(r2_score(y_train, y_pred_train)) st.write('Error (MSE or MAE):') st.info(mean_squared_error(y_train, y_pred_train)) st.markdown('**2.2. Test set**') y_pred_test = rf.predict(X_test) st.write('Coefficient of determination ($R^2$):') st.info(r2_score(y_test, y_pred_test)) st.write('Error (MSE or MAE):') st.info(mean_squared_error(y_test, y_pred_test)) st.subheader('3. Model Parameters') st.write(rf.get_params())
class RandomForestModel:
    """Thin wrapper around a fixed-configuration RandomForestRegressor."""

    def __init__(self):
        # Fixed hyperparameters; random_state pins reproducibility.
        forest_config = dict(n_estimators=1000, max_depth=10, random_state=0)
        self.regressor = RandomForestRegressor(**forest_config)

    def get_params(self):
        """Expose the underlying estimator's parameter dict."""
        return self.regressor.get_params()

    def train(self, X, y):
        """Fit the forest on features X and target y."""
        self.regressor.fit(X, y)

    def predict(self, X):
        """Predict for a single sample; X is assumed to expose `.values`
        (e.g. a pandas row) and is reshaped to one observation."""
        single_sample = X.values.reshape(1, -1)
        return self.regressor.predict(single_sample)
def test_parameters(self):
    """
    Testing parameters of Model class.

    For each supported algorithm: build a Model with explicit parameters,
    build the corresponding scikit-learn estimator directly, and assert
    that every parameter *name* of the Model's estimator also exists on
    the reference estimator. Only key membership is checked, so the
    differing values passed to the reference estimators below do not
    affect the assertions.
    """
    #1.)
    #create instance of PLS model using Model class & creating instance
    # using SKlearn libary, comparing if the parameters of both instances are equal
    pls_parameters = {"n_components": 20, "scale": False, "max_iter": 200}
    model = Model(algorithm="PlsRegression", parameters=pls_parameters)
    # NOTE(review): scale="svd" is not a valid value (scale expects a bool);
    # never validated here because the estimator is not fitted — confirm intent.
    pls_model = PLSRegression(n_components=20, scale="svd", max_iter=200)
    for k, v in model.model.get_params().items():
        self.assertIn(k, list(pls_model.get_params()))
    #2.)
    rf_parameters = {"n_estimators": 200, "max_depth": 50,"min_samples_split": 10}
    model = Model(algorithm="RandomForest", parameters=rf_parameters)
    rf_model = RandomForestRegressor(n_estimators=200, max_depth=50, min_samples_split=10)
    for k, v in model.model.get_params().items():
        self.assertIn(k, list(rf_model.get_params()))
    #3.)
    knn_parameters = {"n_neighbors": 10, "weights": "distance", "algorithm": "ball_tree"}
    model = Model(algorithm="KNN", parameters=knn_parameters)
    # NOTE(review): reference uses "kd_tree" while the Model gets "ball_tree";
    # harmless for key comparison but looks unintended — confirm.
    knn_model = KNeighborsRegressor(n_neighbors=10, weights='distance', algorithm="kd_tree")
    for k, v in model.model.get_params().items():
        self.assertIn(k, list(knn_model.get_params()))
    #4.)
    svr_parameters = {"kernel": "poly", "degree": 5, "coef0": 1}
    model = Model(algorithm="SVR",parameters=svr_parameters)
    svr_model = SVR(kernel='poly', degree=5, coef0=1)
    for k, v in model.model.get_params().items():
        self.assertIn(k, list(svr_model.get_params()))
    #5.)
    ada_parameters = {"n_estimators": 150, "learning_rate": 1.2, "loss": "square"}
    model = Model(algorithm="AdaBoost", parameters=ada_parameters)
    ada_model = AdaBoostRegressor(n_estimators=150, learning_rate=1.2, loss="square")
    for k, v in model.model.get_params().items():
        self.assertIn(k, list(ada_model.get_params()))
    #6.)
    bagging_parameters = {"n_estimators": 50, "max_samples": 1.5, "max_features": 2}
    model = Model(algorithm="Bagging", parameters=bagging_parameters)
    # NOTE(review): max_samples=1.5 and max_features="square" are invalid for
    # BaggingRegressor; unvalidated until fit — confirm intent.
    bagging_model = BaggingRegressor(n_estimators=50, max_samples=1.5, max_features="square")
    for k, v in model.model.get_params().items():
        self.assertIn(k, list(bagging_model.get_params()))
    #7.)
    lasso_parameters = {"alpha": 1.5, "max_iter": 500, "tol": 0.004}
    model = Model(algorithm="lasso", parameters=lasso_parameters)
    lasso_model = Lasso(alpha=1.5, max_iter=500, tol=0.004)
    for k, v in model.model.get_params().items():
        self.assertIn(k, list(lasso_model.get_params()))
def run_random_forest(mydf):
    """Grid-search a random forest on the donation data, refit with chosen
    hyperparameters, and save a feature-importance bar chart (PDF).

    Fix: the original mixed Python-2 print statements and `xrange` with
    Python-3 `print()` calls in the same body, so it parsed under neither
    interpreter; everything now uses Python-3 syntax.
    """
    print("\n************ Random Forest Results ************\n")
    mydf = prepare_data_for_RF(mydf)
    predictor_names = ['week_day_num_posted', 'day_posted', 'maleness',
                       'region_num', 'treat_cost', 'patient_age',
                       'smile_scale']
    numfeat = len(predictor_names)
    Y = mydf.dollars_per_day  # variable to predict
    X = mydf[predictor_names]

    # Build regressor using "best" random forest found by cross-validation.
    nfolds = 3  # number of folds to use for cross-validation
    # n_estimators: trees in forest; max_features: features per split.
    parameters = {
        'n_estimators': [10, 100, 1000],
        'max_features': [3, 5, 7]
    }
    njobs = 1  # number of jobs to run in parallel
    rf_tune = grid_search.GridSearchCV(RandomForestRegressor(), parameters,
                                       n_jobs=njobs, cv=nfolds)
    rf_opt = rf_tune.fit(X, Y)

    # Results of the grid search for optimal random forest parameters.
    print("Grid of scores:\n" + str(rf_opt.grid_scores_) + "\n")
    print("Best zero-one score: " + str(rf_opt.best_score_) + "\n")
    print("Optimal Model:\n" + str(rf_opt.best_estimator_) + "\n")

    # Use the optimal model's parameters to run random forest (feature
    # importances are not available directly from the GridSearchCV fit).
    crf = RandomForestRegressor(n_jobs=njobs, max_features=3,
                                n_estimators=1000).fit(X, Y)
    print("Parameters used in chosen RF model:\n ", crf.get_params())
    plotting_names = np.array(
        ('Day', 'Date', 'Sex', 'Region', 'Cost', 'Age', 'Smile'))
    indices = np.argsort(crf.feature_importances_)[::-1][:numfeat]
    plt.bar(range(numfeat), crf.feature_importances_[indices],
            align='center', alpha=.5)
    plt.xticks(range(numfeat), plotting_names[indices],
               rotation='horizontal', fontsize=20)
    plt.xlim([-1, numfeat])
    plt.ylabel('Feature importances', fontsize=24)
    plt.title('', fontsize=28)
    plt.savefig('03_feature_importance_v2.pdf')
def build_model(df): X = df.iloc[:, :-1] # using all columns except for the last column as X y = df.iloc[:, -1] # select the last column as y st.markdown('**1.2. Data Splits**') st.write('Training Set') st.info(X.shape) st.write('Testing Set') st.info(y.shape) st.markdown('**1.3. Variable Details:**') st.write('X Variables') st.info(list(X.columns)) st.write('Y Variables') st.info(y.name) # Data splitting X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_size) rfr = RandomForestRegressor(n_estimators=parameter_n_estimator, random_state=parameter_random_state, max_features=parameter_max_feature, criterion=parameter_criterion, min_samples_split=parameter_min_sample_split, min_samples_leaf=parameter_min_sample_leaf, bootstrap=parameter_bootstrap, oob_score=parameter_oob_store, n_jobs=parameter_n_jobs) rfr.fit(X_train, y_train) st.subheader('2. Model Performance') st.markdown('**2.1. Training Set**') y_train_pred = rfr.predict(X_train) st.write('Coefficient of determination ($R^2$):') st.info(metrics.r2_score(y_train, y_train_pred)) st.write('Error (MSE or MAE):') st.write(nama_error) st.info(error(y_train, y_train_pred)) st.markdown('**2.2. Testing Set**') y_test_pred = rfr.predict(X_test) st.write(nama_error) st.info(error(y_test, y_test_pred)) st.subheader('3. Model Parameters') st.write(rfr.get_params()) # get the parameters of the model
def get_feat_imps(X, y):
    """Fit a fixed-hyperparameter random forest on (X, y) and return its
    parameters, feature importances, and the feature column names.

    Fix: `max_features='auto'` was deprecated in scikit-learn 1.1 and
    removed in 1.3; for regressors it meant "use all features", which is
    exactly `max_features=1.0`, so behavior is unchanged on older
    versions and the call no longer breaks on current ones.
    """
    column_names = X.columns
    model = RFR(max_features=1.0,   # equivalent of the removed 'auto'
                max_depth=None,
                bootstrap=True,
                min_samples_leaf=5,
                min_samples_split=10,
                n_estimators=100)
    model = model.fit(X, y)
    model_params = model.get_params()
    feat_imps = model.feature_importances_
    print('model_params', model_params)
    print('feat_imps', feat_imps)
    return model_params, feat_imps, column_names
def Grid_Search_CV_RFR(X_train, y_train, X_test, y_test):
    """Fit a default RandomForestRegressor as a baseline, then grid-search
    hyperparameters with 5-fold CV.

    Returns (best_parameters, best_cv_score, best_estimator).

    Fixes: removed a commented-out print that had been split across
    physical lines, leaving a bare syntax-breaking fragment after the
    comment marker; replaced the deprecated `max_features='auto'`
    (removed in scikit-learn 1.3; for regressors it meant all features)
    with the equivalent 1.0.
    """
    # Baseline: default forest, R^2 on train and test for reference.
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train.flatten())
    y_pred_train = rf.predict(X_train)
    y_pred_test = rf.predict(X_test)
    r2_default_train = metrics.r2_score(y_train, y_pred_train)
    r2_default_test = metrics.r2_score(y_test, y_pred_test)
    params = rf.get_params()

    # Hyperparameter grid for the search.
    param_grid = {
        'bootstrap': [True, False],   # sample with replacement or not
        'max_depth': [5, 7, 10],      # depth of each tree in the forest
        'max_features': [1.0],        # features per split ('auto' equivalent)
        'min_samples_leaf': [5, 7],   # minimum samples required at a leaf
        'min_samples_split': [6, 8],  # minimum samples to split a node
        'n_estimators': [10, 20, 30]  # number of trees in the forest
    }
    grid_search = GridSearchCV(rf, param_grid, cv=5, refit=True, verbose=0)
    grid_search.fit(X_train, y_train.flatten())
    best_parameters = grid_search.best_params_
    best_result = grid_search.best_score_
    best_estimator = grid_search.best_estimator_

    # Optimised forest: R^2 on train and test (computed for inspection).
    y_pred_grid_train = best_estimator.predict(X_train)
    y_pred_grid_test = best_estimator.predict(X_test)
    r2_grid_train = metrics.r2_score(y_train, y_pred_grid_train)
    r2_grid_test = metrics.r2_score(y_test, y_pred_grid_test)

    return best_parameters, best_result, best_estimator
def run_random_forest(mydf):
    """Grid-search a random forest on donation data, refit with chosen
    hyperparameters, and save a feature-importance bar chart.

    NOTE(review): Python-2 print statements and `xrange` appear alongside
    Python-3 `print()` calls below — this block cannot run unmodified on
    either interpreter; confirm the target Python version.
    """
    print "\n************ Random Forest Results ************\n"
    mydf = prepare_data_for_RF(mydf)
    predictor_names = ['week_day_num_posted','day_posted','maleness','region_num', \
        'treat_cost','patient_age','smile_scale']
    numfeat = len(predictor_names)
    Y = mydf.dollars_per_day  #variable to predict
    X = mydf[predictor_names]
    #Build classifier using "best" random forest
    nfolds = 3  #number of folds to use for cross-validation
    #n_estimators is number of trees in forest
    #max_features is the number of features to consider when looking for best split
    parameters = {'n_estimators':[10,100,1000], 'max_features':[3,5,7]}  # to try
    njobs = 1  #number of jobs to run in parallel
    rf_tune = grid_search.GridSearchCV(RandomForestRegressor(), parameters,
                                       n_jobs = njobs, cv = nfolds)
    rf_opt = rf_tune.fit(X,Y)
    #Results of the grid search for optimal random forest parameters.
    print("Grid of scores:\n" + str(rf_opt.grid_scores_) + "\n")
    print("Best zero-one score: " + str(rf_opt.best_score_) + "\n")
    print("Optimal Model:\n" + str(rf_opt.best_estimator_) + "\n")
    #print "Parameters of random forest:\n " , rf_opt.get_params()
    #Now use the optimal model's parameters to run random forest
    #(I couldn't get feature importances directly from the GridSearchCV fit)
    crf = RandomForestRegressor( n_jobs=njobs, max_features=3, n_estimators=1000).fit(X,Y)
    print "Parameters used in chosen RF model:\n " , crf.get_params()
    plotting_names = np.array(('Day','Date','Sex','Region','Cost','Age','Smile'))
    #print crf.feature_importances_
    indices = np.argsort(crf.feature_importances_)[::-1][:numfeat]
    plt.bar(xrange(numfeat), crf.feature_importances_[indices], \
        align='center', alpha=.5)
    plt.xticks(xrange(numfeat), plotting_names[indices], \
        rotation='horizontal', fontsize=20)
    plt.xlim([-1, numfeat])
    plt.ylabel('Feature importances', fontsize=24)
    plt.title('', fontsize=28)
    plt.savefig('03_feature_importance_v2.pdf');
def do_model(X_train, X_test, y_train, y_test):
    """Fit a fixed-configuration random forest and predict on both splits.

    Returns (model, model_params, feature_importances, test_preds, train_preds).
    Note: y_test is accepted for signature symmetry but not used here.
    """
    rf_settings = dict(max_features='sqrt',
                       max_depth=100,
                       bootstrap=False,
                       min_samples_leaf=1,
                       min_samples_split=2,
                       n_estimators=200)
    model = RFR(**rf_settings).fit(X_train, y_train)
    ypred = model.predict(X_test)
    ytrainpred = model.predict(X_train)
    return (model, model.get_params(), model.feature_importances_,
            ypred, ytrainpred)
class _RandomForestRegressorWrapper(BaseEstimator, RegressorMixin):
    """Base wrapper delegating to a RandomForestRegressor_ instance.

    Subclasses list their own extra hyperparameter names in `_params`;
    get_params/set_params route those to the wrapper itself and forward
    everything else to the wrapped forest.
    """

    _params = ()  # names of wrapper-owned parameters (set by subclasses)

    def __init__(self, **kwargs):
        """Base wrapper for a RandomForestRegressor class."""
        super().__init__()
        self._forest = RandomForestRegressor_(**kwargs)

    def fit(self, X, y, **kwargs):
        """Fit the wrapped forest; returns self (sklearn convention)."""
        self._forest.fit(X, y, **kwargs)
        return self

    def predict(self, X):
        """Predict with the wrapped forest."""
        return self._forest.predict(X)

    def __getattr__(self, attr):
        # Check if own attribute (only reached when normal lookup fails).
        if attr in self.__dict__:
            return getattr(self, attr)
        # Proxy to forest.
        return getattr(self._forest, attr)

    def get_params(self, deep=True):
        # Merge own parameters with the forest's.
        forest_params = self._forest.get_params(deep=deep)
        own_params = {p: getattr(self, p) for p in self._params}
        return toolz.merge(forest_params, own_params)

    def set_params(self, **parameters):
        """Set wrapper-owned parameters on self; forward the rest.

        Fixes: the original called the non-existent `self.setattr(...)`
        (AttributeError on any wrapper-owned parameter) and passed the
        dict positionally to `set_params`, which expects keyword
        arguments — both paths raised at runtime.
        """
        # Copy dict — it is mutated below.
        parameters = dict(parameters)
        # Extract own parameters.
        for own_param in self._params:
            if own_param in parameters:
                setattr(self, own_param, parameters.pop(own_param))
        # Pass the remainder to the forest.
        self._forest.set_params(**parameters)
        return self
def hyper_par_girdcv(x, y):
    """Grid-search `n_estimators` for a RandomForestRegressor, print the
    best parameters, pickle the best model to models/rtfr.pkl, and return
    test-set metrics as a JSON string.

    Fix: removed an unused `parameters` dict that was defined but never
    passed to GridSearchCV (dead code; if the wider grid was intended,
    supply it as `param_grid` instead).
    """
    # NOTE(review): test_size=0.7 trains on only 30% of the data — confirm intended.
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.7, random_state=42)
    model = RandomForestRegressor()
    print(model.get_params().keys())
    gs = GridSearchCV(
        model,
        param_grid={
            'n_estimators': [100, 200, 400, 600, 800, 1000,
                             1200, 1400, 1600, 1800, 2000]
        },
        cv=10,
        n_jobs=1,
        scoring='neg_mean_squared_error')
    gs.fit(X_train, y_train)
    print(gs.best_params_)
    best_grid = gs.best_estimator_
    y_pred = best_grid.predict(X_test)
    # Held-out metrics for the best estimator, rounded for display.
    text_out = {
        "R-squared": round(r2_score(y_test, y_pred), 3),
        "MAE": round(mean_absolute_error(y_test, y_pred), 3),
        "MSE": round(mean_squared_error(y_test, y_pred), 3)
    }
    json_out = json.dumps(text_out, sort_keys=False, indent=4)
    # Persist the winning model for later inference.
    with open('models/rtfr.pkl', 'wb') as output_file:
        pickle.dump(best_grid, output_file)
    return json_out
def get_warmstart_configuration(self):
    """ Determine the default hyperparameter configuration of the selected
    ML-algorithm. This configuration can be used as a warmstart
    configuration for the HPO-method.
    :return: default_params: dict
        Dictionary that contains the default HPs.
    """
    seeded = {'random_state': self.random_seed}
    # One factory per supported algorithm. SVR, LinearRegression and
    # KNeighborsRegressor take no random_state parameter.
    factories = {
        'RandomForestRegressor': lambda: RandomForestRegressor(**seeded),
        'SVR': SVR,
        'AdaBoostRegressor': lambda: AdaBoostRegressor(**seeded),
        'DecisionTreeRegressor': lambda: DecisionTreeRegressor(**seeded),
        'LinearRegression': LinearRegression,
        'KNNRegressor': KNeighborsRegressor,
        # Add remaining ML-algorithms here (e.g. XGBoost, Keras)
    }
    if self.ml_algorithm not in factories:
        raise Exception('Unknown ML-algorithm!')
    default_model = factories[self.ml_algorithm]()
    # Default HPs of the ML-algorithm
    return default_model.get_params()
y, test_size=0.1, random_state=0) clf = DecisionTreeRegressor(max_depth=6) # Train Decision Tree Classifer clf = clf.fit(X_train, y_train) #Predict the response for test dataset y_pred = clf.predict(X_test) clf.score(X, y) clf.get_depth() clf.get_n_leaves() clf.get_params() # In[18]: clf.get_depth() # In[33]: clf.score(X_test, y_test) # In[31]: # evaluate decision tree performance on train and test sets with different tree depths from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score
def surrogateRF(Xarchive, Farchive, X, file_loc, file_loc_general, toUpdate, first_iter=False, problem='LABS', index=1):
    """Fit a random-forest surrogate on the evaluated archive and return
    predicted fitness values for the candidate set X.

    :param Xarchive: archive of evaluated candidates (transposed before fitting).
    :param Farchive: fitness values corresponding to Xarchive.
    :param X: candidates to score (transposed before prediction).
    :param file_loc_general: directory where the surrogate config is logged once.
    :param problem: problem name; forwarded to the SMAC tuning path only.
    :param file_loc, toUpdate, first_iter, index: accepted but unused here —
        presumably kept for interface compatibility; confirm.
    :return: predicted fitness values for X.
    """
    Xnew = Xarchive.T
    X_pred = X.T
    SMAC = False  # developer switch: dump data, run SMAC tuning, then exit
    if SMAC:
        # Hard-coded developer paths; this branch terminates the process.
        with open("/home/naamah/Documents/CatES/result_All/smac/RF/X1.p", "wb") as fp:
            pickle.dump(Xnew, fp)
        with open("/home/naamah/Documents/CatES/result_All/smac/RF/F1.p", "wb") as fp:
            pickle.dump(Farchive, fp)
        anf = smac_RF.main_loop(problem)
        print("SMAC {}".format(anf))
        sys.exit("Error message")
    # Hyperparameters from SMAC tuning, the "RF_9111" configuration.
    # NOTE(review): criterion="mse" was renamed "squared_error" in
    # scikit-learn 1.0 and removed in 1.2; min_impurity_split and
    # max_features="auto" are likewise removed in current versions —
    # confirm the pinned scikit-learn version.
    clf = RandomForestRegressor(criterion="mse", n_estimators=49,
                                min_samples_leaf=1, min_samples_split=2,
                                min_weight_fraction_leaf=0.0001060554,
                                max_leaf_nodes=1000,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None, warm_start=False,
                                max_depth=None, max_features="auto",
                                random_state=7)  #RF_9111
    # Per-problem tuned configurations kept for reference:
    # if problem=="LABS":
    #     clf = RandomForestRegressor(criterion="mse",n_estimators=49, min_samples_leaf=1, min_samples_split=2,
    #         min_weight_fraction_leaf=0.0001060554, max_leaf_nodes=1000,
    #         min_impurity_decrease=0.0,min_impurity_split=None, warm_start=False, max_depth=None,
    #         max_features="auto",random_state=7) #RF_9111
    # elif problem=="NKL":
    #     clf = RandomForestRegressor(criterion="mse",n_estimators=43, min_samples_leaf=1, min_samples_split=4,
    #         min_weight_fraction_leaf=0.0005238657425634613, max_leaf_nodes=673, min_impurity_decrease=0.0,
    #         warm_start=True, max_depth=None, max_features="auto", random_state=8) #RF_1_sig
    # else: #problem=="QAP"
    #     clf = RandomForestRegressor(criterion="mse",n_estimators=38, min_samples_leaf=1, min_samples_split=2,
    #         min_weight_fraction_leaf=0.0002313685, max_leaf_nodes=551, min_impurity_decrease=2.86E-08,
    #         warm_start=False, max_depth=None, max_features="auto", random_state=None) #RF_9333
    # Log the surrogate configuration once per run directory.
    if not os.path.exists(file_loc_general + "/surrogate_configuration"):
        with open(file_loc_general + "/surrogate_configuration", 'a') as file:
            file.write("clf:\n{}\n\nTuning Algorithem: {} ".format(
                clf.get_params(), "smac"))
            file.close()  # redundant inside `with`; kept as in original
    clf.fit(Xnew, Farchive)
    F_pred = clf.predict(X_pred)
    return F_pred
# ### Displaying Results of the Random Forest Model # In[20]: plot_ground_truth_vs_prediction(y_valid, predictions) plot_results_as_scatter(y_valid, predictions) display_results(y_valid, predictions) apply_cross_validation(randomForest, X, y) # ### Hyperparameter Tuning # Applying Hyperparameter Tuning to see if we can improve the results of our model # In[21]: print('Parameters currently in use:\n') pprint(randomForest.get_params()) # ### Creation of all the possible features parameters # Here we create different inputs for Hyperparameter Tuning # # Hyperparameter Tuning features found here: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74 # In[22]: n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)] max_features = ['auto', 'sqrt'] max_depth = [int(x) for x in np.linspace(10, 110, num=11)] max_depth.append(None) min_samples_split = [2, 5, 10] min_samples_leaf = [1, 2, 4] bootstrap = [True, False]
# Significance threshold for all statistical tests below.
P_VALUE = 0.05
# NOTE(review): both sides of this `or` test the same pvalue — one side was
# presumably meant to test a second KS result (e.g. the real targets);
# confirm and fix.
if kstest_results_predicted_values.pvalue <= P_VALUE or kstest_results_predicted_values.pvalue <= P_VALUE:
    # Normality rejected: compare distributions with non-parametric tests.
    print(stats.kruskal(future_real_target, y_future_pred))
    print(stats.mannwhitneyu(future_real_target, y_future_pred))
else:
    # Check variance equality first to pick the right t-test variant.
    levene_results = stats.levene(future_real_target, y_future_pred)
    print("levene_results", levene_results)
    print(
        stats.ttest_ind(future_real_target,
                        y_future_pred,
                        equal_var=levene_results.pvalue > P_VALUE))

## Export results
n_trees = model.get_params()['n_estimators']
n_features = future_test.shape[1]
res = {
    'MAE': test_mae,
    'RMSE': test_rmse,
    'R2': test_r2,
    'Num Trees': n_trees,
    'Num Features': n_features,
    'Time Taken': time_taken
}
# Single-row summary table rendered as plain text.
res_df = pd.DataFrame(
    [res],
    columns=['MAE', 'RMSE', 'R2', 'Num Trees', 'Num Features', 'Time Taken'])
result = res_df.to_string()
## Inspecting RF Hyperparameters in sklearn # Import RandomForestRegressor from sklearn.ensemble import RandomForestRegressor # Set seed for reproducibility SEED = 1 # Instantiate a random forests regressor 'rf' rf = RandomForestRegressor(random_state= SEED) # Inspect rf' s hyperparameters rf.get_params() # Basic imports from sklearn.metrics import mean_squared_error as MSE from sklearn.model_selection import GridSearchCV # Define a grid of hyperparameter 'params_rf' params_rf = { 'n_estimators': [300, 400, 500], 'max_depth': [4, 6, 8], 'min_samples_leaf': [0.1, 0.2], 'max_features': ['log2','sqrt'] } # Instantiate 'grid_rf' grid_rf = GridSearchCV(estimator=rf,param_grid=params_rf, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
submission.to_csv('submission.csv', index = False) # In[69]: # Got a score of 5.35 with k-fold Linear regression # Now trying random Forest regressor to check if there is any improvement from sklearn.ensemble import RandomForestRegressor rfgModel = RandomForestRegressor() # Trying cross validation first to check if the model is givibng good results. Root mean square value is # a good approximation of the performance of a prediction model print("Random Forest Generator Parameters: ") print(rfgModel.get_params() ) rfgModel.fit(X_train, y_train) rfgModel_pred = rfgModel.predict(X_test) rmse = np.sqrt(mean_squared_error(y_test , rfgModel_pred)) print("root mean Squared error: {}".format(rmse)) rfgModel.fit(train_data, output_data) # Now running the model on actual data test data rfgModel_pred = rfgModel.predict(test_data) # In[70]: submission = pd.DataFrame( {'key': test_data_with_key.key, 'fare_amount': rfgModel_pred}, columns = ['key', 'fare_amount'])
rf = RandomForestRegressor(max_depth=20, n_estimators=50) # Train model rf.fit(X=x, y=y) # Get prediction results result = rf.predict(tX) print "Result" print "------" print result # Analyze performance print "Performance" print "-----------" print "Root Mean Squared Error", mean_squared_error(tY, np.array(result)) ** 0.5 print "Mean Absolute Error", mean_absolute_error(tY, np.array(result)) print "R2", Measures.r2(tY, np.array(result)) # Dump pickle files print df_mapper.features print rf.get_params() joblib.dump(df_mapper, mapper_pkl, compress = 3) joblib.dump(rf, estimator_pkl, compress = 3) # # Build pmml # command = "java -jar converter-executable-1.1-SNAPSHOT.jar --pkl-mapper-input mapper.pkl --pkl-estimator-input estimator.pkl --pmml-output mapper-estimator.pmml" # os.system(command)
sc_X = StandardScaler() X_train = sc_X.fit_transform(X_train) X_test = sc_X.transform(X_test) sc_y = StandardScaler() y_train = y_train.values.reshape(-1, 1) y_train = sc_y.fit_transform(y_train)""" # Fitting Random Forest Regression to the dataset from sklearn.ensemble import RandomForestRegressor regressor = RandomForestRegressor(n_estimators=900, random_state=0) regressor.fit(X_train, y_train) # Predicting a new result y_pred = regressor.predict(X_test) print(regressor.score(X=X_test, y=y_test)) regressor.get_params() parameters = [{ 'random_state': [0, 1, 2, 3, 4, 5], 'n_estimators': [900, 950, 1000] }] from sklearn.model_selection import GridSearchCV grid_search = GridSearchCV(estimator=regressor, param_grid=parameters, scoring='r2', n_jobs=-1, cv=3) grid_search.fit(X_train, y_train) grid_search.best_params_ grid_search.best_score_
'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 'n_jobs': 1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False} print('Parameters currently in use:\n') pprint(rf_r.get_params()) m_rf_hhb = rf_r m_rf_hhb.fit(tuning_x_train_hhb,y_train_hhb) m_rf_hhb.score(tuning_x_test_hhb, y_test_hhb)
class PostRankOptimization(object):
    """Interactive post-rank optimization loop for person re-identification.

    Maintains a per-probe rank list and refines it from user feedback
    (strong/weak negatives).  When `balanced` is set, missing weak negatives
    can be synthesized ("visual expansion") with a RandomForestRegressor.

    :param balanced:
    :param visual_expansion_use:
    :param re_score_alpha:
    :param re_score_method_proportional:
    :param regions: Define which of the regions to be considered upper body and
        which legs. If None, not used. Must be of length 2 if defined.
        Example: regions=[[0, 1], [2, 3, 4]]
    :return:
    """

    def __init__(self, balanced=False, visual_expansion_use=True, re_score_alpha=0.15,
                 re_score_proportional=True, regions=None, ve_estimators=20, ve_leafs=5):  # OK
        self.subject = -1  # The order of the person to be Re-identified by the user (Initially -1)
        self.probe_name = ""
        self.probe_selected = None  # Already feature extracted
        self.target_position = 0
        self.iteration = 0
        # Accumulated user feedback across iterations for the current subject.
        self.strong_negatives = []
        self.weak_negatives = []
        self.visual_expanded = []
        # Feedback collected since the last call to iterate().
        self.new_strong_negatives = []
        self.new_weak_negatives = []
        self.new_visual_expanded = []
        # Regressor used to synthesize gallery features from probe features.
        self.visual_expansion = RandomForestRegressor(n_estimators=ve_estimators,
                                                      min_samples_leaf=ve_leafs,
                                                      n_jobs=-1)  # As in POP
        # regions = [[0], [1]]
        if regions is None:
            self.regions = [[0]]
            self.regions_parts = 1
        elif len(regions) == 2:
            self.regions = regions
            self.regions_parts = sum([len(e) for e in regions])
        else:
            raise ValueError("Regions size must be 2 (body region and legs region)")
        self.size_for_each_region_in_fe = 0  # Initialized at initial iteration
        self.execution = None
        self.ranking_matrix = None
        self.rank_list = None
        self.comp_list = None
        self.balanced = balanced
        # Visual expansion only makes sense in balanced mode.
        if not balanced:
            self.use_visual_expansion = False
        else:
            self.use_visual_expansion = visual_expansion_use
        self.re_score_alpha = re_score_alpha
        self.re_score_proportional = re_score_proportional

    def set_ex(self, ex, rm):  # OK
        """Attach an execution (dataset + matching data) and its ranking matrix,
        then start the first iteration."""
        self.execution = ex
        self.ranking_matrix = rm
        self.initial_iteration()

    def new_samples(self, weak_negatives_index, strong_negatives_index, absolute_index=False):  # OK
        """Register new user feedback, discarding pairs already seen.

        Each element is a [e, idx] pair; when absolute_index is False, `e` is a
        position in the current rank list and is translated to a gallery index.
        """
        self.new_weak_negatives = [[e, idx] for [e, idx] in weak_negatives_index
                                   if [e, idx] not in self.weak_negatives]
        self.new_strong_negatives = [[e, idx] for [e, idx] in strong_negatives_index
                                     if [e, idx] not in self.strong_negatives]
        if not absolute_index:
            self.new_weak_negatives = [[self.rank_list[e], idx] for [e, idx] in self.new_weak_negatives]
            self.new_strong_negatives = [[self.rank_list[e], idx] for [e, idx] in self.new_strong_negatives]

    def _generate_visual_expansion(self):  # OK
        """Average the predictions of a random ~2/3 subset of the forest's trees
        on the current probe to synthesize one expanded feature vector."""
        n_estimators = self.visual_expansion.get_params()['n_estimators']
        selected_len = round(float(n_estimators) * (2 / 3.))
        selected = np.random.RandomState()
        selected = selected.permutation(n_estimators)
        selected = selected[:selected_len]
        expansion = np.zeros_like(self.probe_selected)
        for s in selected:
            # Indexing the fitted forest yields its s-th individual tree.
            expansion += self.visual_expansion[s].predict(self.probe_selected).flatten()
        expansion /= float(selected_len)
        return expansion

    def new_subject(self):  # OK
        """Advance to the next probe subject and reset per-subject state.

        NOTE(review): the guard allows self.subject to reach test_size, so the
        subsequent indexing may go one past the last element — confirm
        test_size semantics against the dataset class.
        """
        if self.subject < self.execution.dataset.test_size:
            self.subject += 1
            self.probe_name = self.execution.dataset.probe.files_test[self.subject]
            # Keep only the last two path components as display name.
            self.probe_name = "/".join(self.probe_name.split("/")[-2:])
            self.probe_selected = self.execution.dataset.probe.fe_test[self.subject]
            self.rank_list = self.ranking_matrix[self.subject].copy()
            self.comp_list = self.execution.matching_matrix[self.subject].copy()
            self._calc_target_position()
            self.iteration = 0
            self.strong_negatives = []
            self.weak_negatives = []
            self.visual_expanded = []
        else:
            return  # TODO Control situation

    def initial_iteration(self):  # OK
        """Pick the first subject and, if enabled, fit the visual-expansion
        regressor mapping probe features to gallery features."""
        self.new_subject()
        # NOTE(review): '/' is integer division under Python 2 here — presumably
        # intended, since this is a feature-vector element count.
        self.size_for_each_region_in_fe = self.execution.dataset.gallery.fe_test.shape[1] / self.regions_parts
        if self.use_visual_expansion:
            self.visual_expansion.fit(self.execution.dataset.probe.fe_train,
                                      self.execution.dataset.gallery.fe_train)

    def iterate(self):  # OK
        """Run one feedback iteration: balance feedback via visual expansion,
        re-rank, and fold the new feedback into the accumulated lists.
        Returns "OK" on success or an error message string."""
        self.iteration += 1
        # print("Iteration %d" % self.iteration)
        to_expand_len = len(self.new_strong_negatives) - len(self.new_weak_negatives)
        if self.balanced:
            if to_expand_len < 0:
                return "There cannot be more weak negatives than strong negatives"
            elif to_expand_len > 0 and not self.use_visual_expansion:
                return "There must be the same number of weak negatives and strong negatives"
            # Synthesize one visual expansion per missing weak negative.
            for i in range(to_expand_len):
                # Randomly select if body or legs
                if len(self.regions) == 1:
                    reg = 0
                else:  # Assumes only two body parts
                    reg = random.choice([0, 1])
                self.new_visual_expanded.append([self._generate_visual_expansion(), reg])
        self.reorder()
        self._calc_target_position()
        self.strong_negatives.extend(self.new_strong_negatives)
        self.weak_negatives.extend(self.new_weak_negatives)
        self.visual_expanded.extend(self.new_visual_expanded)
        self.new_strong_negatives = []
        self.new_weak_negatives = []
        self.new_visual_expanded = []
        return "OK"

    def collage(self, name, cols=5, size=20, min_gap_size=5):  # OK
        """ Adapted from http://answers.opencv.org/question/13876/
        read-multiple-images-from-folder-and-concat/?answer=13890#post-id-13890
        :param name: path to save collage imgf
        :param cols: num of columms for the collage
        :param size: num of images to show in collage
        :param min_gap_size: space between images
        :return:
        """
        # Add reference imgf first
        imgs = []
        img = self.execution.dataset.probe.images_test[self.subject].copy()
        img[0:10, 0:10] = [0, 255, 0]  # green marker = probe/ground truth
        cv2.putText(img, "Probe", (10, 10), cv2.FONT_HERSHEY_SIMPLEX, 0.4, 255, 1)
        imgs.append(img)
        elements = self.rank_list.copy()
        # Open imgs and save in list
        size = min(len(elements), (size - 1))
        for i, elem in zip(range(size), elements):
            # print files_order_list[elem]
            img = self.execution.dataset.gallery.images_test[elem].copy()
            if self.execution.dataset.same_individual_by_pos(self.subject, elem, "test"):
                img[0:10, 0:10] = [0, 255, 0]  # mark true matches in green
            cv2.putText(img, str(i), (5, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, [0, 0, 255], 2)
            imgs.append(img)
        # let's find out the maximum dimensions
        max_width = 0
        max_height = 0
        for img in imgs:
            max_height = max(max_height, img.shape[0])  # rows
            max_width = max(max_width, img.shape[1])  # cols
        # number of images in y direction
        rows = int(math.ceil(len(imgs) / float(cols)))
        result = np.zeros((rows * max_height + (rows - 1) * min_gap_size,
                           cols * max_width + (cols - 1) * min_gap_size, 3), np.uint8)
        current_height = current_width = i = 0
        # NOTE(review): when len(imgs) is not a multiple of cols, imgs[i] will
        # index past the end on the last row — confirm against the original.
        for y in range(rows):
            for x in range(cols):
                result[current_height:current_height + imgs[i].shape[0],
                       current_width:current_width + imgs[i].shape[1]] = imgs[i]
                i += 1
                current_width += max_width + min_gap_size
            current_width = 0
            current_height += max_height + min_gap_size
        cv2.imwrite(name, result)
        cv2.imshow("tal", result)
        cv2.waitKey(1)

    def reorder(self):  # OK
        """Re-rank the current rank list from the feedback; subclasses must
        implement the concrete re-scoring strategy."""
        raise NotImplementedError("Please Implement reorder method")

    def _calc_target_position(self):  # OK
        """Record the rank-list position of the first true match for the
        current subject in self.target_position."""
        for column, elemg in enumerate(self.rank_list):
            if self.execution.dataset.same_individual_by_pos(self.subject, elemg, selected_set="test"):
                target_position = column  # If not multiview we can exit loop here
                self.target_position = target_position
                break
for clf in [clf_A, clf_B, clf_C, clf_D, clf_E, clf_F]: train_predict(clf, X_train, y_train, X_valid, y_valid)''' # RandomForestRegressor parameters = {'n_estimators':(10,15,20), 'min_samples_split':(2,3,4), 'min_samples_leaf':(1,2,3)} rfr = RandomForestRegressor(random_state=seed, warm_start=True) score = make_scorer(mean_squared_error, greater_is_better=False) grid_obj = GridSearchCV(rfr, param_grid=parameters, scoring=score, verbose=1, n_jobs=4, cv=5) grid_obj= grid_obj.fit(X_train, y_train) rfr = grid_obj.best_estimator_ print rfr.get_params(), '\n' print "Tuned model has a training RMSE score of {:.4f}.".format(predict_labels(rfr, X_train, y_train)) print "Tuned model has a testing RMSE score of {:.4f}.".format(predict_labels(rfr, X_valid, y_valid)) # RidgeCV ridge = RidgeCV(alphas=(1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1.0, 10.0), cv=5) ridge = ridge.fit(X_train, y_train) print ridge.get_params(), '\n' print "Tuned model has a training RMSE score of {:.4f}.".format(predict_labels(ridge, X_train, y_train)) print "Tuned model has a testing RMSE score of {:.4f}.".format(predict_labels(ridge, X_valid, y_valid)) # Save regressors pickle_file = 'regressor.pickle' try: f = open(pickle_file, 'wb')
max_features = [4, 6, 8, 10] -------------------------------------------------- # Exercise_2 from sklearn.ensemble import RandomForestRegressor # Fill in rfr using your variables rfr = RandomForestRegressor( n_estimators=100, max_depth=random.choice(max_depth), min_samples_split=random.choice(min_samples_split), max_features=random.choice(max_features)) # Print out the parameters print(rfr.get_params()) -------------------------------------------------- # Exercise_3 from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import make_scorer, mean_squared_error # Finish the dictionary by adding the max_depth parameter param_dist = {"max_depth": [2, 4, 6, 8], "max_features": [2, 4, 6, 8, 10], "min_samples_split": [2, 4, 8, 16]} # Create a random forest regression model rfr = RandomForestRegressor(n_estimators=10, random_state=1111) # Create a scorer to use (use the mean squared error)
class TrainROI():
    """Train a regressor and test it on ROI loan data.

    Loads a pickled loan DataFrame, derives a return-on-investment target,
    splits into train/test, and offers several interchangeable regressors
    (linear, SVR, random forest) plus optional scaling/PCA.
    NOTE(review): Python 2 syntax (print statements); pandas/numpy/sklearn
    objects come from imports earlier in the file.
    """

    def __init__(self):
        # Full load -> target derivation -> split pipeline runs at construction.
        self.load_data()
        self.calculate_roi()
        self.convert_to_float()
        self.split_by_grade()
        self.create_targets_features()
        self.split_train_test(train_size=0.8)
        #self.balance()
        # Drop target-leaking columns before converting to raw arrays.
        self.X_train = self.X_train.drop(['loan_status', 'total_pymnt', 'roi'], 1).values
        self.y_train = self.y_train.values
        self.X_test = self.X_test.drop(['loan_status', 'total_pymnt', 'roi'], 1).values
        self.y_test = self.y_test.values

    def load_data(self):
        """Load the pickled loan DataFrame from data.pickle.
        NOTE(review): the file handle is never closed — consider `with`."""
        fileName = 'data.pickle'
        print "Loading %s" %fileName
        f = open(fileName, 'rb')
        self.loanData = pickle.load(f)

    def calculate_roi(self):
        # ROI = (total paid - funded amount) / funded amount, per loan.
        self.loanData['roi'] = (self.loanData['total_pymnt']-self.loanData['funded_amnt'])/self.loanData['funded_amnt']

    def convert_to_float(self):
        self.loanData = self.loanData.astype(float)

    def split_by_grade(self, grade='A'):
        """Keep only loans of the given grade (one-hot column == 1), then drop
        all grade indicator columns."""
        self.loans = self.loanData[self.loanData[grade] == 1]
        self.loans = self.loans.drop(['A', 'B', 'C', 'D', 'E', 'F', 'G'], 1)

    def split_train_test(self, train_size=0.8):
        """Random boolean-mask split; train fraction is approximate."""
        mask = np.random.rand(len(self.targets)) < train_size
        self.X_train = self.features[mask]
        self.y_train = self.targets[mask]
        self.X_test = self.features[~mask]
        self.y_test = self.targets[~mask]
        print "Instances in training: ", len(self.X_train)
        print "Instances in testing: ", len(self.X_test)

    def scale(self):
        # Fit the scaler on train only and apply to both (no leakage here).
        self.scalerX = StandardScaler().fit(self.X_train)
        self.X_train, self.X_test = self.scalerX.transform(self.X_train), \
            self.scalerX.transform(self.X_test)

    def standardize_samples(self):
        ##0 mean, unit variance
        self.X_train = preprocessing.scale(self.X_train)
        self.X_test = preprocessing.scale(self.X_test)

    def scale_samples_to_range(self):
        ##Samples lie in range between 0 and 1
        minMaxScaler = preprocessing.MinMaxScaler()
        self.X_train = minMaxScaler.fit_transform(self.X_train)
        # NOTE(review): fit_transform on the test set re-fits the scaler on test
        # data (leakage); should presumably be minMaxScaler.transform(...).
        self.X_test = minMaxScaler.fit_transform(self.X_test)

    def balance(self):
        """Balances the training default and non-default instances"""
        print "Total loans before balancing: ", len(self.X_train)
        print "Defaults before balancing: ", np.sum(self.X_train['loan_status'] == 0)
        defaults_added = 0
        for i in range(1, len(self.X_train)):
            loan = self.X_train[i-1:i]
            loan_roi = self.y_train[i-1:i]
            if int(loan['loan_status']) == 0:
                for n in range(10):
                    #replicate the loan multiple times
                    defaults_added += 1
                    if defaults_added%100 == 0:
                        print defaults_added
                    self.X_train = self.X_train.append(loan)
                    self.y_train = self.y_train.append(loan_roi)
        print "Total loans after balancing: ", len(self.y_train)
        print "Defaults after balancing: ", np.sum(self.X_train['loan_status'] == 0)

    def create_targets_features(self):
        # Targets = ROI column; features = full frame (leaky columns are
        # dropped later in __init__).
        self.targets = self.loans['roi']
        self.features = self.loans

    def define_linear_regressor(self):
        self.regr = LinearRegression()

    def define_SVR(self, C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, shrinking=True,
                   probability=False, tol=0.01, cache_size=200, class_weight='auto',
                   verbose=True, max_iter=-1, random_state=None):
        """Configure an SVR regressor.
        NOTE(review): class_weight and probability are accepted here but SVR
        takes no class_weight; probability is forwarded — verify against the
        sklearn version in use."""
        print "Using a Support Vector Machine Regressor ..."
        self.regr = SVR(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                        shrinking=shrinking, probability=probability, tol=tol,
                        cache_size=cache_size, verbose=verbose, max_iter=max_iter,
                        random_state=random_state)
        print self.regr.get_params()

    def define_rfr(self, n_estimators):
        self.regr = RandomForestRegressor(n_estimators=n_estimators)

    def train_regr(self):
        self.regr.fit(self.X_train, self.y_train)

    def score_regr(self, X, y):
        # score() is the regressor's default metric (R^2 for sklearn regressors).
        score = self.regr.score(X, y)
        print "Score: %0.3f" %score

    def predict(self, filename_label):
        """Predict on the test set and pickle both predictions and truth."""
        print "predicting"
        self.prediction = self.regr.predict(self.X_test)
        print "Saving prdiction as A_%s.pickle" %filename_label
        self.save_pickle(fileName="A_%s_predict.pickle" %filename_label, data=self.prediction)
        self.save_pickle(fileName="A_%s_test.pickle" %filename_label, data=self.y_test)

    def runPCA(self, n_components=None, copy=False, whiten=False):
        """Fit PCA on train features and project both train and test."""
        print "Running PCA Dimensionality Reduction with n_components = ", n_components
        self.pca = PCA(n_components=n_components, copy=copy, whiten=whiten)
        self.X_train = self.pca.fit_transform(self.X_train)
        print "Reduced data down to ", self.pca.n_components_, " dimensions: "
        print "Transforming test data ..."
        self.X_test = self.pca.transform(self.X_test)
        #self.X_cv = self.pca.transform(self.X_cv)

    def runRFRGridSearch(self):
        # Manual sweep over forest sizes; each run re-predicts and saves.
        n_estimators = [10,50,100,500]
        for n in n_estimators:
            self.define_rfr(n_estimators=n)
            self.train_regr()
            self.predict(filename_label="rfr_n_est_%i" %n)

    def runSVRGridSearch(self):
        # Manual grid over C, gamma, degree (degree only matters for poly kernel).
        C_vals = [0.1, 0.5, 1, 10, 100]
        gamma_vals = [1E-1, 1, 1E1, 1E2, 1E3]
        degrees = [3,4,5]
        for C in C_vals:
            for gamma in gamma_vals:
                for degree in degrees:
                    print "\n\n C: ", C, " gamma: ", gamma
                    self.define_SVR(C=C, gamma=gamma, degree=degree, cache_size=2000)
                    self.train_regr()
                    print "Training Scores:"
                    self.score_regr(self.X_train, self.y_train)
                    print "Testing Scores:"
                    self.score_regr(self.X_test, self.y_test)
                    self.predict(filename_label="svr_C_%s_gamma_%s" %(C, gamma))

    def plot_score(self):
        # Scatter of prediction vs truth with the y = x reference line.
        plt.scatter(self.prediction, self.y_test)
        plt.plot([0,1.3], [0,1.3])
        plt.xlabel('prediction')
        plt.ylabel('y_test')
        plt.show()

    def save_pickle(self, fileName, data):
        f = open(fileName, 'wb')
        pickle.dump(data, f)
        f.close()
def main(arg1): #print arg1 fname = '../EssentiaTrainFeatures/'+ arg1 #Liquids_To_UnvoicedPlosives.arff' fname2 = './'+ arg1 #Liquids_To_UnvoicedPlosives.arff' start = time() try: f = open(fname,'r') except: return('error') #lines = f.readlines()[:] #f.close() #floats = [] #for line in lines: # floats.append(shlex.split(line)) #array = np.asarray(floats) #for (x,y), value in np.ndenumerate(array): # if value == np.nan or value == 'NaN': # array[x][y] = 0; # elif value == np.infty: # array[x][y] = 1; array = np.loadtxt(f) f.close() array = np.nan_to_num(array) #array = array.astype(np.float) print 'Data size' print np.shape(array) #scale = StandardScaler() #array = scale.fit_transform(array) trainY = array[:,305] trainX = np.delete(array, [302,303,304,305,306,307],1) elapsed = time() - start print 'Training array loading time' print elapsed/60 f = open(fname2,'r') #lines = f.readlines()[:] #f.close() #floats = [] #for line in lines: # floats.append(shlex.split(line)) #array2 = np.asarray(floats) #for (x,y), value in np.ndenumerate(array2): # if value == np.nan or value == 'NaN': # array2[x][y] = 0; # elif value == np.infty: # array2[x][y] = 2; array2 = np.loadtxt(f) f.close() array2 = np.nan_to_num(array2) #array2 = array2.astype(np.float) print 'Test size' print np.shape(array2) #scale = StandardScaler() #array = scale.fit_transform(array) #traiY = array[:,38] #Position = array2[:,36] #Hmmboundary = array2[:,37] #Manualboundary = array2[:,38] hmm_true = array2[:,305] hmmX = np.delete(array2, [302,303,304,305,306,307],1) #trainY, realY, trainX, testX = train_test_split(traiY,traiX,test_size=0.8,random_state=42) #Cost = np.power(2,np.arange(1,12)); #g = [0.5,0.25,0.125,0.0625,0.03125,0.015625,0.0078125,0.00390625,0.001953125,0.0009765625,0.00048828125,0.00048828125] #print '\nCost values' #print Cost #print '\ngamma values' #print g #scorebest = 0 #Cbest = 0 #gammabest = 0 #model_to_set = NuSVR(C=32, cache_size=2048, coef0=0.0, degree=3, gamma=0.03125, kernel='rbf', 
# max_iter=-1, nu=0.5, probability=False, shrinking=True, tol=0.001, # verbose=True) #parameters = {'C':Cost,'gamma':g}#,'nu':[0.5],'kernel':['rbf'],'verbose':[True]} #k =[0.5,1]#2,5,7,8]; model_to_set = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=5000, min_samples_split=2000, min_samples_leaf=10,min_density=0.1, max_features='auto', bootstrap=True, compute_importances=False, oob_score=False, n_jobs=3, random_state=None, verbose=0) #parameters = {'n_estimators':[10,100,500],'max_depth':[1,5,20,100,None],'min_samples_split':[1,5,20,100],} #trainY, realY, trainX, testX = train_test_split(traiY,traiX,test_size=0,random_state=42) print '\nparams' print model_to_set.get_params() start = time() print '\ntraining start time' print strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()) model_to_set.fit(trainX,trainY) print '\ntraining end time' print strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()) elapsed = (time() - start) print elapsed/60 y_pred = model_to_set.predict(trainX) #return(y_pred,trainY) #score1 = model_to_set.score(trainX,trainY) #print 'score1' #print score1 #print 'Myscore1' #print MyScore(trainY,y_pred) #y_pred = model_to_set.predict(testX) #score2 = model_to_set.score(testX,realY) #print '\nscore2' #print score2 #print 'Myscore2' #print MyScore(realY,y_pred) '''TESTING''' hmm_pred = model_to_set.predict(hmmX) #baseName = arg1.replace('.arff','') #np.savetxt((baseName+'_hmm_pred.txt'),hmm_pred) #np.savetxt((baseName+'_hmm_true.txt'),hmm_true) #np.savetxt((baseName+'_Bhmm.txt'),array2[:,306]) #np.savetxt((baseName+'_Btrue.txt'),array2[:,307]) #np.savetxt((baseName+'_train_pred.txt'),y_pred) #np.savetxt((baseName+'_train_true.txt'),hmm_pred) #np.savetxt((baseName+'_Btrain.txt'),hmm_pred) return(hmm_pred,hmm_true,array2[:,306],array2[:,307], y_pred,trainY, array[:,307]) #cnt = 0; #print 'asdasd' #print hmm_pred '''for pred in hmm_pred:
# Hold out the tail `1 - frac` fraction of the training arrays for validation.
# NOTE(review): sz, frac, X_train, Y_train, train_X, train_Y come from earlier
# in this script (sz is presumably X_train.shape) — confirm.
valid_X = X_train[int(sz[0] * frac):, :]
valid_Y = Y_train[int(sz[0] * frac):]
####################################################################################
####################################################################################
####################################################################################
#classifier
RFmodel = RandomForestRegressor(
    n_estimators=1000, #number of trees to generate
    n_jobs=1, #single-threaded (n_jobs=-1 would use all cores)
    criterion="mse"
)
#train
RFmodel = RFmodel.fit(train_X, train_Y)
#get parameters
params=RFmodel.get_params()
#score on training set (R^2 for sklearn regressors)
acc_rate=RFmodel.score(train_X,train_Y)
print acc_rate
#feature importances, labelled with the column names from the training CSV
feat_imp=RFmodel.feature_importances_
df_train=pd.io.parsers.read_table('X_train.csv',sep=',',header=False)
col_names=list(df_train.columns)
feat_imp_dict={col_names[i]:feat_imp[i] for i in range(len(feat_imp))}
feat_imp_sort = sorted(feat_imp_dict.items(), key=operator.itemgetter(1))
# Predict on the validation slice and clip negatives to zero.
y_out=RFmodel.predict(valid_X)
pred = np.array([np.max([0.0,x]) for x in y_out])
# Report RMSE on the validation slice.
print ('prediction error=%f' % np.sqrt(sum( (pred[i]-valid_Y[i])**2 for i in range(len(valid_Y))) / float(len(valid_Y)) ))
plt.clf() ###############End: Construct Feature Importance plot for first Forest ##### ############### Start: Randomized Search CV ################################## # Look at parameters used by our current forest # from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(random_state=42) from pprint import pprint # Look at parameters used by our current forest print('Parameters currently in use:\n') pprint(rf.get_params()) from sklearn.model_selection import RandomizedSearchCV # ============================================================================= # # Randomized Search CV # # # Number of trees in random forest # n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)] # # Number of features to consider at every split # max_features = ['auto', 'sqrt'] # # Maximum number of levels in tree # max_depth = [int(x) for x in np.linspace(10, 110, num = 10)] # max_depth.append(None) # # Minimum number of samples required to split a node # min_samples_split = [2, 5, 10]
"min_samples_leaf": [1, 3, 10], "bootstrap" :[True,False]} grid_search = GridSearchCV(ld_multirf1, param_grid=param_grid) grid_search.fit(ldX_train, ldy_train) ldy_grid_search = grid_search.predict(ldX_test) scaler = MinMaxScaler(feature_range=(0, 1)) b = scaler.fit_transform(ldy2_test) inv_ldyhat = scaler.inverse_transform(ldy_grid_search) inv_ldyhat = scalef(ldy2_test,inv_ldyhat) #grid_search1 = GridSearchCV(ld_multirf1, param_grid=param_grid) #grid_search1.fit(ldX_train, ldy_train.iloc[:,1]) 3ld_multirf1.get_params().keys() regr_multimlp.get_params ld_multirf1.get_params # Predict on new data ldy_multirf = ld_multirf.predict(ldX_test) ldy_multimlp = ld_multimlp.predict(ldX_test) ldy_multiada = ld_multiada.predict(ldX_test) ldy_grid_search = grid_search.predict(ldX_test) grid_search.score(ldX_test, ldy_test) grid_search1.score(ldX_test, ldy_test.iloc[:,1]) ldysc_multirf = ld_multirf.predict(ldX_testsc) ldysc_multimlp = ld_multimlp.predict(ldX_testsc)
,{ 'fit' : svrFit, 'predict': scaledPredict, 'scaled': True, 'idx0': 3, 'idx1': 4, 'ratio': svmRatio } ,{ 'fit' : svrFit, 'predict': scaledPredict, 'scaled': True, 'idx0': 4, 'idx1': 5, 'ratio': svmRatio } ,{ 'fit' : svrFit, 'predict': scaledPredict, 'scaled': True, 'idx0': 6, 'idx1': 7, 'ratio': svmRatio } ] #models2 = combine.combineTrain(X_test, y_test, models) print "Training random forest..." forestSize = 30 print "\t# Examples: \t\t" + str(len(X_train)) print "\tForest Size: \t\t" + str(forestSize) start = time.time() clf = RandomForestRegressor(n_estimators=forestSize, n_jobs=8) clf = clf.fit(X_train, y_train) print "\tTraining Complete" print "\tTime: \t\t" + str(round(time.time() - start, 1)) + "s" #Reset n_jobs to 1 because multicore evaluation is apparently hard params = clf.get_params() clf.set_params(n_jobs = 1) print "\tRMSE: \t\t" + str(rmse(X_test, y_test, clf.predict, True)) #results = combine.combineTest(X_test, y_test, clf, models) #def subPredict(X): # return combine.combinePredict(X, clf, models) submission(clf.predict, filters, pca.transform)
class UncertainTaggerBuilder:
    """Encapsulates the machinery required to
    * discover features from the countably infinite set of possible
      motif-based features, and
    * return a function that, when called on a string, returns a
      corresponding array of numbers in [0, 1]

    Feature subsets are sampled stochastically, scored by a random forest's
    out-of-bag (OOB) score, and ranked; high performers spawn "successor"
    features.  get_X, get_y, sort_a_by_b, z_score, t_ci, PositionFeature,
    MotifFeature and UTBStatePerformance are project-local helpers.
    """

    def __init__(self, texts, tags, motifs, a=0.85, n=20,
                 initial_max_separation=1, sort_sample_size=2000):
        """Initialize an UncertainTaggerBuilder with the required
        training data.
        TEXTS - a sequence of strings
        TAGS - a sequence of sequences of numbers in [0, 1] corresponding
            to TEXTS
        MOTIFS - a sequence of substrings used to create features
        A - the first feature's probability of being chosen
        N - the expected value of the size of a set of features, if
            approximating the set of possible features as infinite
        INITIAL_MAX_SEPARATION - the initial number of instances of a motif
            that can be detected ahead of or behind the current string
            position
        """
        self._texts = list(texts)
        self._tags = list(tags)
        self._motifs = list(motifs)
        # This is the best OOB score yet observed.
        self._max_oob = -float('inf')
        self._a = a
        self._n = n
        # Count of top-ranked features always included in candidate sets.
        self._guaranteed_n = 0
        self._features = [PositionFeature(True), PositionFeature(False)] + [
            MotifFeature(motif, i)
            for motif in self._motifs
            for i in range(-initial_max_separation, initial_max_separation)
        ]
        # Past states are stored for future recovery and for prediction
        # of the performance of future states. States are mapped to
        # performances.
        self._states = dict()
        self._model = RandomForestRegressor(random_state=random.randint(0, 1000),
                                            verbose=1, n_jobs=-1,
                                            max_features='sqrt', oob_score=True)
        self._importance_sort(sort_sample_size)
        # Key invariant: The features with the highest rankings are at
        # the beginning of the list.
        self._rankings = {f: -i for i, f in enumerate(self._features)}
        self.scores = list()
        # Invariant: self._best_feature_set must be in descending order
        # by importance.
        self._best_feature_set = self._features

    def _importance_sort(self, sort_sample_size):
        """Sort the features stored in SELF by their importances."""
        # A cheaper, subsampled forest is used just for the initial ordering.
        quickmodel = RandomForestRegressor(random_state=random.randint(0, 1000),
                                           verbose=1, n_jobs=-1,
                                           max_features='sqrt', max_samples=0.2,
                                           min_samples_split=8, oob_score=True)
        sample_idx = random.sample(list(range(len(self._texts))),
                                   k=min(sort_sample_size, len(self._texts)))
        sample_texts = [self._texts[i] for i in sample_idx]
        sample_y = get_y([self._tags[i] for i in sample_idx])
        quickmodel.fit(get_X(sample_texts, self._features), sample_y)
        self._max_oob = max(self._max_oob, quickmodel.oob_score_)
        self._features = sort_a_by_b(self._features,
                                     quickmodel.feature_importances_)

    def _sort(self):
        """Sort the features in descending order by their rankings."""
        self._features.sort(reverse=True, key=lambda f: self._rankings[f])

    def _get_r(self):
        # This is just the geometric sum formula with the linearity of
        # expectation
        # NOTE(review): divides by (n - guaranteed_n); run() only loops while
        # guaranteed_n < n, but a direct call with equality would raise
        # ZeroDivisionError.
        return 1 - self._a / (self._n - self._guaranteed_n)

    def _random_candidate_feature_set(self):
        """Returns a random feature set of nonzero length. """
        # Always keep the guaranteed prefix of the current best set; admit the
        # rest with geometrically decaying probability (start p=a, ratio r).
        ret = self._best_feature_set[:self._guaranteed_n]
        p = self._a
        r = self._get_r()
        for candidate in self._features:
            if candidate not in ret:
                if random.random() < p:
                    ret.append(candidate)
                p *= r
        if len(ret) == 0:
            return self._random_candidate_feature_set()
        return frozenset(ret)

    def _new_max_oob(self, ordered_feature_set):
        """Carries out the operations that correspond to discovering a
        new high-performing feature set. (This is a private helper
        function to IMPROVE.) ORDERED_FEATURE_SET must be in the same
        order as was used to most recently train the model.
        """
        self._max_oob = self._states[frozenset(ordered_feature_set)].oob_score
        # Update the ideal length to be that of the high-performing feature
        # set.
        self._n = len(ordered_feature_set)
        # Update the ordered list of best features.
        feature_importances = self._model.feature_importances_
        assert len(feature_importances) == len(ordered_feature_set)
        self._best_feature_set = ordered_feature_set
        self._best_feature_set = sort_a_by_b(self._best_feature_set,
                                             feature_importances)
        print('DEBUG: Max OOB score updated to {}'.format(self._max_oob))
        # Update the current list of features with their successors.
        for feature in ordered_feature_set:
            successor = None
            try:
                successor = feature.successor()
            except AttributeError:
                pass  # This feature does not support successors.
            if successor and successor not in self._features:
                self._features.append(successor)
                # New successors start ranked just below their parent.
                self._rankings[successor] = self._rankings[feature] - 1
        self._sort()

    def _update_rankings(self, feature_set, oob_score):
        """Update the rankings for the features in FEATURE_SET according
        to whether the OOB score associated with them is good.
        """
        for f in feature_set:
            # Guaranteed features are exempt from re-ranking.
            if f not in self._best_feature_set[:self._guaranteed_n]:
                # Reward/penalize by the score's z-score among past scores.
                z = z_score(self.scores, oob_score)
                if np.isfinite(z):
                    self._rankings[f] += z
        self._sort()

    def _improve(self):
        """Tests a new subset of the possible features. Returns True
        iff the operation terminated successfully.
        """
        feature_set = self._random_candidate_feature_set()
        if feature_set not in self._states:
            ordered_feature_set = list(feature_set)
            X = get_X(self._texts, ordered_feature_set, cache='self._texts')
            y = get_y(self._tags)
            self._states[feature_set] = UTBStatePerformance(
                self._model.get_params())
            self._model.fit(X, y)
            oob = self._model.oob_score_
            self._states[feature_set].oob_score = oob
            if oob >= self._max_oob:
                # NOTE(review): passes the frozenset; relies on its iteration
                # order matching list(feature_set) used for training (true for
                # the same set object in CPython).
                self._new_max_oob(feature_set)
            self._update_rankings(feature_set, oob)
            self.scores.append(oob)
            return True
        # Candidate set already evaluated — nothing new tested.
        return False

    def run(self, duration):
        """Improves the feature selection for DURATION minutes."""
        t0 = time.time()
        duration_s = duration * SECONDS_PER_MINUTE
        # guaranteed_n ramps up toward n as the time budget is consumed.
        while self._guaranteed_n < self._n:
            while not self._improve():
                # If improvement attempt failed, then the set of possible
                # feature sets that could be tested is probably small;
                # therefore, more flexibility in feature set selection is
                # warranted.
                self._guaranteed_n = max(0, self._guaranteed_n - 1)
            self._guaranteed_n = int(self._n * (time.time() - t0) / duration_s)
            print('N = {}. Guaranteed N = {}.\nBest OOB = {}.'.format(
                self._n, self._guaranteed_n, self._max_oob))

    def _CV(self, features, k=5, confidence=0.95):
        """Return a confidence interval for an f-score for a K-fold CV.
        """
        f_scores = list()
        # NOTE(review): each fold trains on the 1/k partition (i % k == a) and
        # evaluates on the remaining (k-1)/k — the inverse of conventional
        # k-fold CV. Confirm this is intentional.
        for a in range(k):
            self._model.fit(
                get_X([
                    self._texts[i] for i in range(len(self._texts))
                    if i % k == a
                ], features, cache='train partition {}/{}'.format(a, k)),
                get_y([
                    self._tags[i] for i in range(len(self._texts))
                    if i % k == a
                ]))
            actual = get_y(
                [self._tags[i] for i in range(len(self._texts)) if i % k != a])
            pred = self._model.predict(
                get_X([
                    self._texts[i] for i in range(len(self._texts))
                    if i % k != a
                ], features, cache='pred partition {}/{}'.format(a, k)))
            print('DEBUG: pred: ', pred[:20])
            print('DEBUG: actual: ', actual[:20])
            # Binarize regression outputs before computing P/R/F1.
            pred = [round(p) for p in pred]
            print(
                'DEBUG: precision: ',
                sum(pred[i] and actual[i] for i in range(len(actual))) /
                max(sum(pred), 0.00001))
            print(
                'DEBUG: recall: ',
                sum(actual[i] and pred[i] for i in range(len(actual))) /
                max(sum(actual), 0.00001))
            f_scores.append(f1_score(actual, pred))
        return t_ci(f_scores, 1 - confidence)
log_regressor_summary(rfr, X_train, X_test, y_train, y_test) # tests exp = neptune.get_experiment() # check logs correct_logs_set = { 'evs_test_sklearn', 'me_test_sklearn', 'mae_test_sklearn', 'r2_test_sklearn', 'charts_sklearn' } from_exp_logs = set(exp.get_logs().keys()) assert correct_logs_set == from_exp_logs, '{} - incorrect logs'.format(exp) # check sklearn parameters assert set(exp.get_properties().keys()) == set( rfr.get_params().keys()), '{} parameters do not match'.format(exp) # check neptune parameters assert set(exp.get_parameters().keys()) == set( parameters.keys()), '{} parameters do not match'.format(exp) ## Step 5: Stop Neptune experiment after logging summary neptune.stop() ## Explore results # Scikit-learn classification ## Step 1: Create and fit gradient boosting classifier
def main_params(x_fname, y_fname, model_names, models_dir, ncores, model_type, verbose,
                cv_predictions, imbalanced, input_format):
    """Build, cross-validate and persist QSAR models for one descriptor set.

    Parameters
    ----------
    x_fname : str
        Path to the descriptors (X) file.
    y_fname : str
        Path to the activity/property (y) file.
    model_names : iterable of str
        Models to build; any of "rf", "gbm", "svm", "pls", "knn".
    models_dir : str or None
        Output directory; if None a "models" subdir next to x_fname is used.
    ncores : int
        Number of parallel jobs for grid search.
    model_type : str
        "class" for classification or "reg" for regression.
    verbose : bool or int
        Verbosity for sklearn and progress prints.
    cv_predictions : bool
        If True, save per-molecule CV predictions as a text file.
    imbalanced : bool
        If True (classification only), train one model per undersampled
        subset produced by make_subsets instead of a single model.
    input_format : str
        Descriptor file format, "txt" or "svm".
    """
    seed = 42

    # create models subdir; remove stale model files from previous runs
    if models_dir is None:
        models_dir = os.path.join(os.path.dirname(x_fname), "models")
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)
    for m in model_names:
        fpath = os.path.join(models_dir, m + ".pkl")
        if os.path.exists(fpath):
            os.remove(fpath)
    model_stat_fname = os.path.join(models_dir, "models_stat.txt")

    # load y
    y = load_y(y_fname)

    # load x
    if input_format == 'txt':
        descr_names, mol_names, x = load_sirms_txt(x_fname, names=y.keys())
    elif input_format == 'svm':
        descr_names, mol_names, x = load_sirms_svm(x_fname, names=y.keys())
    else:
        # FIX: the original format string had no %s placeholder, so the
        # % operator raised TypeError instead of printing the message.
        print("Illegal value of input format: %s" % input_format)
        exit()

    # process y: align activities with the order of loaded molecules
    y = np.asarray([y[n] for n in mol_names])

    # process x: persist applicability-domain box and descriptor names
    save_bound_box_constrains(x, os.path.join(models_dir, "bound_box.pkl"))
    save_object(descr_names, os.path.join(models_dir, "var_names.pkl"))

    # scale descriptors and persist the fitted scaler for later prediction
    scale = StandardScaler().fit(x)
    save_object(scale, os.path.join(models_dir, "scale.pkl"))
    x = scale.transform(x)

    # stratified folds for classification, plain shuffled folds for regression
    if model_type == "class":
        cv = ms.StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
    elif model_type == "reg":
        cv = ms.KFold(n_splits=5, random_state=seed, shuffle=True)

    if model_type == "class" and imbalanced:
        subsets = make_subsets(y, seed)
        if not subsets:
            warnings.warn("The data set is balanced (ratio majority:minority < 1.5)."
                          "No multiple undersampling will be done", Warning)
            subsets = [list(range(y.shape[0]))]
    else:
        subsets = [list(range(y.shape[0]))]

    # build models
    for current_model in model_names:

        if verbose:
            print(current_model.upper() + ' model building...')

        # this lst refreshes on each model name; here we store either 1 model
        # in balanced case or list of models=number of subsets in the case of
        # imbalanced
        models_lst = []

        for subset in subsets:

            if current_model == "rf":
                # choosing optimal parameters (grid search, no refit; the
                # final model is rebuilt below from best_params_)
                param_grid = {"max_features": [x.shape[1] // 10, x.shape[1] // 7,
                                               x.shape[1] // 5, x.shape[1] // 3],
                              "n_estimators": [500]}
                if model_type == "reg":
                    m = ms.GridSearchCV(RandomForestRegressor(random_state=seed), param_grid,
                                        n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                elif model_type == "class":
                    m = ms.GridSearchCV(RandomForestClassifier(random_state=seed), param_grid,
                                        n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                m.fit(x[subset], y[subset])
                # final model
                if model_type == "reg":
                    m = RandomForestRegressor(n_estimators=m.best_params_["n_estimators"],
                                              max_features=m.best_params_["max_features"],
                                              bootstrap=True, random_state=seed)
                elif model_type == "class":
                    m = RandomForestClassifier(n_estimators=m.best_params_["n_estimators"],
                                               max_features=m.best_params_["max_features"],
                                               bootstrap=True, random_state=seed)
                models_lst.append(m)

            if current_model == "gbm":
                # choosing optimal parameters
                param_grid = {"n_estimators": [100, 200, 300, 400, 500]}
                if model_type == "reg":
                    m = ms.GridSearchCV(GradientBoostingRegressor(subsample=0.5, max_features=0.5,
                                                                  random_state=seed),
                                        param_grid, n_jobs=ncores, cv=cv, refit=False,
                                        verbose=verbose)
                elif model_type == "class":
                    m = ms.GridSearchCV(GradientBoostingClassifier(subsample=0.5, max_features=0.5,
                                                                   random_state=seed),
                                        param_grid, n_jobs=ncores, cv=cv, refit=False,
                                        verbose=verbose)
                m.fit(x[subset], y[subset])
                # final model
                if model_type == "reg":
                    m = GradientBoostingRegressor(n_estimators=m.best_params_["n_estimators"],
                                                  subsample=0.5, max_features=0.5,
                                                  random_state=seed)
                elif model_type == "class":
                    m = GradientBoostingClassifier(n_estimators=m.best_params_["n_estimators"],
                                                   subsample=0.5, max_features=0.5,
                                                   random_state=seed)
                models_lst.append(m)

            if current_model == "svm":
                # choosing optimal parameters (epsilon for regression,
                # gamma for classification)
                if model_type == "reg":
                    param_grid = {"C": [10 ** i for i in range(0, 5)],
                                  "epsilon": [0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.01]}
                    m = ms.GridSearchCV(svm.SVR(kernel='rbf'), param_grid,
                                        n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                elif model_type == "class":
                    param_grid = {"C": [10 ** i for i in range(0, 5)],
                                  "gamma": [10 ** i for i in range(-6, 0)]}
                    m = ms.GridSearchCV(svm.SVC(kernel='rbf', random_state=seed), param_grid,
                                        n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                m.fit(x[subset], y[subset])
                # final model (probability=True so the classifier can be used
                # for probabilistic consensus later)
                if model_type == "reg":
                    m = svm.SVR(kernel='rbf', C=m.best_params_["C"],
                                epsilon=m.best_params_["epsilon"])
                elif model_type == "class":
                    m = svm.SVC(kernel='rbf', C=m.best_params_["C"],
                                gamma=m.best_params_["gamma"],
                                probability=True, random_state=seed)
                models_lst.append(m)

            if current_model == "pls" and model_type == "reg":
                # choosing optimal parameters (PLS is regression-only)
                param_grid = {"n_components": [i for i in range(1, 8)]}
                m = ms.GridSearchCV(PLSRegression(), param_grid,
                                    n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                m.fit(x[subset], y[subset])
                # final model
                m = PLSRegression(n_components=m.best_params_["n_components"])
                models_lst.append(m)

            if current_model == "knn":
                # choosing optimal parameters
                param_grid = {"n_neighbors": [i for i in range(3, 21)]}
                if model_type == "reg":
                    m = ms.GridSearchCV(KNeighborsRegressor(), param_grid,
                                        n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                elif model_type == "class":
                    m = ms.GridSearchCV(KNeighborsClassifier(), param_grid,
                                        n_jobs=ncores, cv=cv, refit=False, verbose=verbose)
                m.fit(x[subset], y[subset])
                # final model
                if model_type == "reg":
                    m = KNeighborsRegressor(n_neighbors=m.best_params_["n_neighbors"])
                elif model_type == "class":
                    m = KNeighborsClassifier(n_neighbors=m.best_params_["n_neighbors"])
                models_lst.append(m)

        # return cv predictions
        ncol = len(models_lst) + 1 if len(models_lst) > 1 else len(models_lst)  # +1 column if consensus
        cv_pred = np.column_stack((y, np.full((y.shape[0], ncol), np.nan)))
        for i, (m, subset) in enumerate(zip(models_lst, subsets)):
            pred = ms.cross_val_predict(estimator=m, X=x[subset], y=y[subset], cv=cv)
            if current_model == 'pls':  # reshape for pls because it returns 2d array and we need 1d
                pred = pred.reshape(len(subset))
            cv_pred[subset, i + 1] = pred
            # build final model, save it and its stat
            m.fit(x[subset], y[subset])
            add_obj_to_file(os.path.join(models_dir, current_model + '.pkl'), m)
            save_model_stat_2(current_model + '_%i' % i, model_stat_fname,
                              str(m.get_params())[1:-1],
                              y[subset],
                              cv_pred[subset, i + 1],
                              model_type,
                              verbose)

        # calc cv consensus and save stat (majority vote over subset models)
        if model_type == "class" and len(models_lst) > 1:
            cv_pred[:, -1] = np.apply_along_axis(get_major_vote, 1, cv_pred[:, 1:])
            # cv_pred[:, -1] = np.around(np.nanmean(cv_pred[:, 1:], axis=1))
            save_model_stat_2(current_model + "_consensus", model_stat_fname, "",
                              y, cv_pred[:, -1], model_type, verbose)

        # save cv predictions
        if cv_predictions:
            np.savetxt(os.path.join(models_dir, current_model + "_cv_pred.txt"),
                       np.column_stack([mol_names, np.round(cv_pred, 3)]),
                       fmt="%s", delimiter="\t", comments="",
                       header="Mol\tObs\t" +
                              "\t".join("%s_%i" % (current_model, i)
                                        for i in range(len(models_lst))) +
                              "\t" + current_model + "_consensus")

        if verbose:
            print(current_model.upper() + ' model was built\n')