def FI_xgb_sklearn():
    """Rank XGBoost built-in feature importances against a random-noise baseline.

    A column of Gaussian noise named 'random' is appended to the features;
    any feature ranking below it is considered uninformative. Prints every
    feature that ranks at or above the noise column and returns the full
    ranked importance table.
    """
    X, y = load_traindata(encodetype='le')
    feature_names = list(X.columns)

    # Append the noise column that serves as the significance cut-off.
    noise = np.random.randn(X.shape[0])
    X = np.column_stack((X, noise))
    feature_names.append('random')

    booster = XGBRegressor(learning_rate=0.01,
                           n_estimators=3320,
                           max_depth=3,
                           min_child_weight=4,
                           colsample_bytree=0.8,
                           subsample=0.8,
                           importance_type='total_gain',
                           objective='reg:linear',
                           n_jobs=-1,
                           random_state=0,
                           seed=27,
                           silent=True)
    booster.fit(X, y)

    ranked = sorted(zip(feature_names, booster.feature_importances_),
                    key=lambda pair: abs(pair[1]), reverse=True)
    imp = pd.DataFrame(ranked, columns=['Feature', 'Importance'])

    # Everything at or above the noise column is worth keeping.
    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
def FI_reg():
    """Rank Lasso coefficients against a random-noise baseline column.

    Adds a Gaussian-noise column named 'random' to the (robust-scaled)
    feature matrix and fits the tuned Lasso. Any feature whose absolute
    coefficient ranks below the random column has no predictive
    significance for this model. Prints the random column's coefficient
    and rank; returns None.
    """
    X, y = load_traindata()
    feature_names = list(X.columns)

    model = Lasso(max_iter=10000, alpha=0.004498433)
    X = RobustScaler().fit_transform(X)

    # Append the noise column that serves as the significance cut-off.
    noise = np.random.randn(X.shape[0])
    X = np.column_stack((X, noise))
    feature_names.append('random')

    model.fit(X, y)
    ranked = sorted(zip(feature_names, model.coef_),
                    key=lambda pair: abs(pair[1]), reverse=True)
    coefs = pd.DataFrame(ranked, columns=['Feature', 'Coef'])

    rnd_idx = np.argwhere(coefs['Feature'] == 'random')[0][0]
    print("random column coefficient is : %.4f, ranking %d"
          % (coefs.iloc[rnd_idx, 1], rnd_idx))
def initialise_game(model, budget, niter, feature_number, method):
    """Load per-model train/test splits and construct the Env game object.

    The dev split is simply aliased to the test split. Note that only the
    x-matrices are accumulated per model; the y-labels are overwritten each
    iteration, so the labels from the final model are the ones retained
    (existing behavior, preserved here).
    """
    train_x_all = []
    test_x_all = []
    dev_x_all = []
    for idx in range(len(model)):
        x_tr, y_tr = helper.load_traindata(feature_number[idx], model[idx], seed=0)
        train_x_all.append(x_tr)
        train_y_all = y_tr  # last assignment wins (labels shared across models)
        x_te, y_te = helper.load_testdata(feature_number[idx], model[idx], seed=0)
        test_x_all.append(x_te)
        test_y_all = y_te

    # Dev set mirrors the test set.
    dev_x_all = test_x_all
    dev_y_all = test_y_all

    story = [train_x_all, train_y_all]
    dev = [dev_x_all, dev_y_all]
    test = [test_x_all, test_y_all]

    # Build the game environment from module-level configuration.
    game = Env(story, test, dev, budget, MODEL_VER, model, feature_number,
               CUM, EXPNUM, 0, method)
    return game
def FI_RF_permuation_blog():
    """Permutation importances (blog variant) for the tuned random forest.

    Fits the forest with OOB scoring enabled, measures the drop in OOB MSE
    when each feature is permuted, and returns the features ranked by the
    magnitude of the (negated) drop. No noise baseline is used here.
    """
    X, y = load_traindata(encodetype='le')
    forest = ensemble.RandomForestRegressor(n_estimators=100,
                                            max_depth=14,
                                            max_features=26,
                                            random_state=42,
                                            oob_score=True)
    forest.fit(X, y)

    # Importance = negated drop in OOB MSE when a feature is shuffled.
    drop_in_mse = permutation_importances(forest, X, y, oob_regression_mse_score)
    scores = [-d for d in drop_in_mse]

    ranked = sorted(zip(X.columns, scores),
                    key=lambda pair: abs(pair[1]), reverse=True)
    return pd.DataFrame(ranked, columns=['Feature', 'Importance'])
def FI_RF_permuation(rf):
    """Permutation importances for a caller-supplied forest, with a noise baseline.

    Adds a Gaussian-noise column named 'random' to the features, fits the
    given estimator, and ranks permutation importances by magnitude. Prints
    every feature ranking at or above the noise column and returns the full
    ranked table.
    """
    X, y = load_traindata(encodetype='le')

    # Inject the noise column that serves as the significance cut-off.
    X['random'] = np.random.randn(X.shape[0])

    rf.fit(X, y)
    scores = permutation_importances(rf, X, y)

    ranked = sorted(zip(X.columns, scores),
                    key=lambda pair: abs(pair[1]), reverse=True)
    imp = pd.DataFrame(ranked, columns=['Feature', 'Importance'])

    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
def FI_RF_permuation(metric=oob_regression_r2_score):
    """Permutation importances for the tuned random forest, with a noise baseline.

    Parameters
    ----------
    metric : callable, default oob_regression_r2_score
        Scoring function forwarded to permutation_importances (e.g.
        oob_regression_mse_score). BUG FIX: previously this argument was
        accepted but ignored — the body hard-coded oob_regression_r2_score.
        The default preserves the old behavior for existing callers.

    Returns
    -------
    pd.DataFrame
        'Feature'/'Importance' table sorted by |importance|; features
        ranking below the injected 'random' noise column are not
        significant. The rows at or above 'random' are also printed.
    """
    X, y = load_traindata(encodetype='le')

    # Inject the noise column that serves as the significance cut-off.
    X['random'] = np.random.randn(X.shape[0])

    rf = ensemble.RandomForestRegressor(n_estimators=100,
                                        max_depth=14,
                                        max_features=26,
                                        random_state=42,
                                        oob_score=True)
    rf.fit(X, y)

    # Forward the caller's metric instead of hard-coding r2.
    imp = permutation_importances(rf, X, y, metric)

    imp = sorted(zip(X.columns, imp), key=lambda t: abs(t[1]), reverse=True)
    imp = pd.DataFrame(imp, columns=['Feature', 'Importance'])

    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
def FI_RF_sklearn():
    """Rank sklearn impurity-based forest importances against a noise baseline.

    Appends a Gaussian-noise column named 'random', fits the tuned forest,
    and ranks feature_importances_ by magnitude. Prints every feature that
    ranks at or above the noise column and returns the full ranked table.
    """
    X, y = load_traindata(encodetype='le')
    feature_names = list(X.columns)
    # Scaling is unnecessary for tree models, hence intentionally omitted.

    # Append the noise column that serves as the significance cut-off.
    noise = np.random.randn(X.shape[0])
    X = np.column_stack((X, noise))
    feature_names.append('random')

    forest = ensemble.RandomForestRegressor(n_estimators=100,
                                            max_depth=14,
                                            max_features=26,
                                            random_state=42)
    forest.fit(X, y)

    ranked = sorted(zip(feature_names, forest.feature_importances_),
                    key=lambda pair: abs(pair[1]), reverse=True)
    imp = pd.DataFrame(ranked, columns=['Feature', 'Importance'])

    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1, :])
    return imp
def FI_reg_blog():
    """Rank Lasso coefficients (blog variant, no noise baseline).

    Robust-scales the features, fits the tuned Lasso, and returns the
    coefficients ranked by magnitude as a 'Feature'/'Coef' DataFrame.
    """
    X, y = load_traindata()
    feature_names = list(X.columns)

    model = Lasso(max_iter=10000, alpha=0.000308884359647748)
    X = RobustScaler().fit_transform(X)
    model.fit(X, y)

    ranked = sorted(zip(feature_names, model.coef_),
                    key=lambda pair: abs(pair[1]), reverse=True)
    return pd.DataFrame(ranked, columns=['Feature', 'Coef'])
def FI_xgb_blog():
    """Rank XGBoost total-gain importances (blog variant, no noise baseline).

    Fits the tuned booster on the label-encoded features and returns the
    importances ranked by magnitude as a 'Feature'/'Importance' DataFrame.
    """
    X, y = load_traindata(encodetype='le')
    feature_names = list(X.columns)

    booster = XGBRegressor(learning_rate=0.01,
                           n_estimators=3320,
                           max_depth=3,
                           min_child_weight=4,
                           colsample_bytree=0.8,
                           subsample=0.8,
                           importance_type='total_gain',
                           objective='reg:linear',
                           n_jobs=-1,
                           random_state=0,
                           seed=27,
                           silent=True)
    booster.fit(X, y)

    ranked = sorted(zip(feature_names, booster.feature_importances_),
                    key=lambda pair: abs(pair[1]), reverse=True)
    return pd.DataFrame(ranked, columns=['Feature', 'Importance'])
def reboot(self, model):
    """Reset the environment to its initial state for a new episode.

    Persists the finished episode via helper.write_csv_game, reshuffles the
    sample order, clears per-episode counters/accumulators, reloads the
    train/test data with a seed tied to the new episode number, and zeroes
    the cross-count confusion tables.
    """
    # reboot everything to initial status
    # save in the csv file
    helper.write_csv_game(self, model)
    # Deterministic reshuffle keyed to the CV iteration.
    random.seed(self.cvit)
    random.shuffle(self.order)
    self.queried_times = 0
    self.terminal = False
    if not self.accum:
        # Non-accumulating mode: discard samples queried in earlier episodes.
        # NOTE(review): collapsed source makes the extent of this `if` body
        # ambiguous; the two queried-set resets are the natural reading —
        # confirm against the original file.
        self.queried_set_x = []
        self.queried_set_y = []
    self.current_frame = 0
    self.count_action0 = 0
    self.count_action1 = 0
    self.rAll = []
    self.episode += 1
    self.train_x_all = []
    self.train_y_all = []
    self.test_x_all = []
    self.test_y_all = []
    self.rounds = 0
    # Reload data per feature/model; the episode number seeds the split so
    # each episode sees a different shuffle. Only the last iteration's
    # y-labels are kept (same pattern as initialise_game).
    for i in range(len(self.feature)):
        train_x, train_y = helper.load_traindata(self.feature[i], model[i],
                                                 seed=self.episode)
        self.train_x_all.append(train_x)
        self.train_y_all = train_y
        test_x, test_y = helper.load_testdata(self.feature[i], model[i],
                                              seed=self.episode)
        self.test_x_all.append(test_x)
        self.test_y_all = test_y
    # Dev set mirrors the test set.
    self.dev_x_all = self.test_x_all
    self.dev_y_all = self.test_y_all
    # Zeroed (features+1) x (features+2) integer tables for correct/incorrect
    # prediction counts on train and test.
    self.cross_counts_correct_train = np.zeros(
        shape=(len(self.feature) + 1, len(self.feature) + 2)).astype(int)
    self.cross_counts_incorrect_train = np.zeros(
        shape=(len(self.feature) + 1, len(self.feature) + 2)).astype(int)
    self.cross_counts_correct_test = np.zeros(
        shape=(len(self.feature) + 1, len(self.feature) + 2)).astype(int)
    self.cross_counts_incorrect_test = np.zeros(
        shape=(len(self.feature) + 1, len(self.feature) + 2)).astype(int)
def data_generation(model, feature_number):
    """Load train/test splits for every (feature, model) pair.

    Returns ([train_x_list, train_y], [test_x_list, test_y]). As elsewhere
    in this module, the x-matrices are accumulated per model while the
    y-labels are overwritten each iteration, so the final model's labels
    are the ones returned (existing behavior, preserved).
    """
    train_x_all = []
    test_x_all = []
    for idx in range(len(model)):
        print("Loading data for feature {0}..".format(feature_number[idx]))
        x_tr, y_tr = helper.load_traindata(feature_number[idx], model[idx], seed=0)
        train_x_all.append(x_tr)
        train_y_all = y_tr  # last assignment wins
        x_te, y_te = helper.load_testdata(feature_number[idx], model[idx], seed=0)
        test_x_all.append(x_te)
        test_y_all = y_te

    train = [train_x_all, train_y_all]
    test = [test_x_all, test_y_all]
    return train, test
proj_path = 'C:\\Users\\yanqi\\Documents\\NYCDSA\\Project 3 - Machine Learning\\Housing Price Prediction\\house_price_prediction\\code\\basecase' os.chdir(proj_path) from sklearn.model_selection import GridSearchCV, KFold, train_test_split import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.preprocessing import StandardScaler, RobustScaler from sklearn.metrics import mean_squared_error import pickle from helper import plot_cv_traintestscores, make_prediction_dummy, load_traindata from sklearn import tree import time X, y = load_traindata() cols = X.columns # standardize x_train data scaler = RobustScaler() X = scaler.fit_transform(X) n_folds_i = 5 n_folds_o = 5 rs = 1 inner_cv = KFold(n_splits=n_folds_i, shuffle=True, random_state=rs) outer_cv = KFold(n_splits=n_folds_o, shuffle=True, random_state=rs) def simpleDT(X, y): tree_model = tree.DecisionTreeRegressor()
import os
# --- Script setup: load (label-encoded) training data, scale it, and
# --- define the nested-CV fold generators used by the model functions below.
proj_path = 'C:\\Users\\yanqi\\Documents\\NYCDSA\\Project 3 - Machine Learning\\Housing Price Prediction\\house_price_prediction\\code\\basecase'
os.chdir(proj_path)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from helper import plot_cv_traintestscores, make_prediction_dummy, load_traindata
from sklearn import tree
import time
# Label-encoded training data (tree models don't need one-hot encoding).
X, y = load_traindata(encodetype='le')
cols = X.columns
# standardize x_train data
scaler = RobustScaler()
X = scaler.fit_transform(X)
# Inner/outer fold counts for nested cross-validation; fixed seed for
# reproducible splits.
n_folds_i = 5
n_folds_o = 5
rs = 1
inner_cv = KFold(n_splits=n_folds_i, shuffle=True, random_state=rs)
outer_cv = KFold(n_splits=n_folds_o, shuffle=True, random_state=rs)
def simpleDT(X, y):
    # Baseline decision-tree regressor.
    # NOTE(review): this definition appears truncated in this chunk — the
    # remainder of the function body is not visible here.
    tree_model = tree.DecisionTreeRegressor()
min_child_weight=3, missing=None, n_estimators=1996, n_jobs=-1, nthread=None, objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=0.9, scale_pos_weight=1, seed=27, silent=True, subsample=0.55) # load training data X, y = load_traindata() cols = X.columns X_le, tmp = load_traindata(encodetype='le') cols_le = X_le.columns scaler1 = RobustScaler() X = scaler1.fit_transform(X) scaler2 = RobustScaler() X_le = scaler2.fit_transform(X_le) # create 5-fold CV scheme to fit base models, and make out-of-fold predictions cv = KFold(n_splits=5, shuffle=True, random_state=42) oof_pred = np.zeros((X.shape[0], 4)) for train_idx, test_idx in cv.split(X):