def return_best_rf_regressor(df, target, num_trees_hyperparameter, num_trees_final_clf, num_iterations):
    """Randomized hyperparameter search for a RandomForestRegressor, then
    refit the best configuration on the entire dataframe.

    Parameters
    ----------
    df : pandas.DataFrame containing the feature columns and the target column.
    target : name of the target column in ``df``.
    num_trees_hyperparameter : ``n_estimators`` used for models during the search.
    num_trees_final_clf : ``n_estimators`` for the final refit model.
    num_iterations : number of parameter settings sampled by the search.

    Returns
    -------
    (clf_final, column_list_for_sampled)
        The fitted RandomForestRegressor and the list of feature column names.
    """
    print("entering return best rf regressor function")

    # Search on a subsample for speed: all rows below 10k, otherwise 70%.
    if df.shape[0] < 10000:
        num_samples = df.shape[0]
    else:
        num_samples = int(df.shape[0] * 0.7)

    print("Sample dataframe")
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(
        df, True, target, num_samples)

    # BUG FIX: min_samples_split was drawn from sp_randint(1, 15); sklearn
    # requires min_samples_split >= 2, so a draw of 1 would raise at fit time.
    param_dist = {"max_depth": [5, None],
                  "max_features": sp_randint(1, df.shape[1]),
                  "min_samples_split": sp_randint(2, 15),
                  "min_samples_leaf": sp_randint(1, 15),
                  "bootstrap": [True]}

    clf = RandomForestRegressor(n_estimators=num_trees_hyperparameter)

    print("starting hyperparameter search")
    clf_best, best_params = hyperparameter_search_random(X, y, clf, param_dist, num_iterations)

    print("sample data for fitting model")
    # Refit the winning configuration on every row of the dataframe.
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(
        df, True, target, num_samples=df.shape[0])

    clf_final = RandomForestRegressor(n_estimators=num_trees_final_clf,
                                      max_depth=best_params["max_depth"],
                                      min_samples_leaf=best_params["min_samples_leaf"],
                                      min_samples_split=best_params["min_samples_split"],
                                      bootstrap=best_params["bootstrap"],
                                      max_features=best_params["max_features"])

    print("Fitting Random Forest Regressor")
    clf_final.fit(X, y)
    return clf_final, column_list_for_sampled
def run_lasso_on_input(df, target):
    """Pick an alpha path with LassoLarsCV on a 70% sample, then fit a
    RandomizedLasso over that path on the full (scaled) dataset.

    Returns
    -------
    (clf, column_list_for_sampled)
        The fitted RandomizedLasso and the list of feature column names.
    """
    # Stage 1: cross-validated LARS on a 70% sample to obtain the alpha path.
    sample_size = int(0.7 * df.shape[0])
    X_part, y_part, _ = sample_data_frame_return_x_y_column_name(df, True, target, sample_size)
    X_part, _ = scale_input_data(X_part)

    print("#######################################")
    print("Starting LARS CV")
    print("#######################################")
    lars_cv = LassoLarsCV(cv=10).fit(X_part, y_part)
    print("#######################################")
    print("Done with LARS CV")
    print("#######################################")

    # Stage 2: stability selection with RandomizedLasso on every row.
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, df.shape[0])
    X, _ = scale_input_data(X)

    print("#######################################")
    print("Starting main lasso")
    print("#######################################")
    clf = RandomizedLasso(alpha=lars_cv.alphas_, random_state=12,
                          n_resampling=400, normalize=True).fit(X, y)
    print("#######################################")
    print("Done with main lasso")
    print("#######################################")

    return clf, column_list_for_sampled
# NOTE(review): this function is redefined verbatim immediately below; at
# import time the later definition shadows this one. Consider deleting one copy.
def orthogonal_variable_selection_cannot_query_black_box(df, target, non_linear, no_bootstrap_iter, num_samples):
    """Accumulate orthogonal variable-selection results over bootstrap-style
    resamples of ``df``.

    For each of ``no_bootstrap_iter`` iterations, draws ``num_samples`` rows,
    then updates ``master_dictionary`` via perform_orthogonal_variable_selection.

    Returns a dict mapping each non-target column name to its accumulated
    selection results (per-feature semantics depend on the helper — confirm there).
    """
    # One entry per candidate feature; the target column is excluded.
    master_dictionary = {}
    for name in list(df.columns):
        if name != target:
            master_dictionary[name] = []
    for iteration in range(no_bootstrap_iter):
        print "going through iteration " + str(iteration) + " of orthogonal feature selection"
        X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples)
        # The helper both reads and returns the dictionary; rebind each round.
        master_dictionary = perform_orthogonal_variable_selection(X, y, column_list_for_sampled, non_linear, master_dictionary)
    return master_dictionary
def orthogonal_variable_selection_cannot_query_black_box(
        df, target, non_linear, no_bootstrap_iter, num_samples):
    """Run orthogonal variable selection over repeated samples of ``df``.

    Each iteration draws ``num_samples`` rows and folds the results for every
    non-target column into a shared dictionary, which is returned at the end.
    """
    # Seed an empty result list for every candidate feature (target excluded).
    master_dictionary = {name: [] for name in df.columns if name != target}

    for iteration in range(no_bootstrap_iter):
        print("going through iteration " + str(iteration) + " of orthogonal feature selection")
        X, y, sampled_columns = sample_data_frame_return_x_y_column_name(
            df, True, target, num_samples)
        master_dictionary = perform_orthogonal_variable_selection(
            X, y, sampled_columns, non_linear, master_dictionary)

    return master_dictionary