def pipe(stage_1, stage_2, y_1, y_2, to_pred, regr=RandomForestRegressor(max_depth=5, n_estimators=100)):
    """Select features on stage 1, train on stage 2 and predict ``to_pred``.

    Two feature selectors are fitted on ``stage_1`` against ``y_1``: a
    median-threshold ``SelectFromModel`` on the ``objs`` columns and a
    ``RandomizedLasso`` on the ``ints`` columns (``objs`` and ``ints`` are
    module-level column selections). Two clones of ``regr`` are then fitted on
    ``stage_2`` against ``y_2`` -- one on the selected columns only, one on the
    selected columns plus 12 PCA components fitted on ``stage_1``.

    Returns the element-wise average of the two clones' predictions for
    ``to_pred``.
    """
    # guess it should be totally doable on pipelines and unions but I'm lazy
    # and don't want to write transforms
    cat_selector = SelectFromModel(
        RandomForestRegressor(max_depth=5, n_estimators=100),
        threshold='median')
    bin_selector = RandomizedLasso()
    cat_selector.fit(stage_1[objs], y_1)
    bin_selector.fit(stage_1[ints], y_1)

    def _selected(frame):
        # Columns kept by both selectors, side by side.
        kept_cat = cat_selector.transform(frame[objs])
        kept_bin = bin_selector.transform(frame[ints])
        return np.hstack([kept_cat, kept_bin])

    train_sel = _selected(stage_2)
    test_sel = _selected(to_pred)

    pca = PCA(n_components=12, random_state=420).fit(stage_1)

    def _with_components(selected, frame):
        # Selected columns followed by the PCA projection of the full frame.
        return np.hstack([selected, pca.transform(frame)])

    model_plain = clone(regr)
    model_pca = clone(regr)
    train_sel_pca = _with_components(train_sel, stage_2)
    test_sel_pca = _with_components(test_sel, to_pred)

    model_plain.fit(train_sel, y_2)
    model_pca.fit(train_sel_pca, y_2)

    plain_pred = model_plain.predict(test_sel)
    pca_pred = model_pca.predict(test_sel_pca)
    return plain_pred / 2 + pca_pred / 2
def main(train_label, train_feat, modelsdir, selfeat):
    """Tune and train an Extra-Trees regressor and persist the fitted objects.

    Parameters
    ----------
    train_label : str
        Path to a space-delimited numeric file with the training targets.
    train_feat : str
        Path to a space-delimited numeric file with the training features.
    modelsdir : str
        Directory that receives ``XRT.pkl`` (regressor), ``scaler.pkl`` and,
        when feature selection is enabled, ``sel_est.pkl`` plus the
        ``<train_feat basename>.idx`` / ``.mask`` selection files.
    selfeat : int or str
        Anything ``int()`` accepts; a non-zero value enables RandomizedLasso
        feature selection.

    The module-level ``config`` mapping supplies ``n_jobs``, ``RR_Iter`` and
    ``folds``. NaNs in both input files are replaced via ``np.nan_to_num``.
    """
    n_jobs = int(config['n_jobs'])

    X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
    y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

    scaler = StandardScaler().fit(X_train)
    X_trains = scaler.transform(X_train)

    # performs feature selection
    # sel_est stays None when selection is disabled, so the save step below
    # can skip it instead of raising NameError.
    sel_est = None
    if int(selfeat):
        print("Performing feature selection ...")
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=n_jobs, random_state=42,
                                  n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join([modelsdir, os.path.basename(train_feat)])
        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)

    estimator = ExtraTreesRegressor(random_state=42, n_jobs=n_jobs)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")
    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2",
                         int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }
    search = RandomizedSearchCV(
        estimator, param_distributions,
        n_iter=int(config['RR_Iter']),
        scoring=mae_scorer, n_jobs=n_jobs, refit=True,
        cv=KFold(X_train.shape[0], int(config['folds']), shuffle=True,
                 random_state=42),
        verbose=1, random_state=42)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # Retrain a fresh estimator with the winning hyper-parameters; this is
    # the model that gets saved.
    best = search.best_params_
    estimator2 = ExtraTreesRegressor(
        bootstrap=best["bootstrap"],
        max_depth=best["max_depth"],
        max_features=best["max_features"],
        min_samples_leaf=best["min_samples_leaf"],
        min_samples_split=best["min_samples_split"],
        n_estimators=best["n_estimators"],
        verbose=1, random_state=42, n_jobs=n_jobs)
    print("Train the model with the best parameters ...")
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    joblib.dump(estimator2, modelsdir + "/XRT.pkl")
    joblib.dump(scaler, modelsdir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
class LinearAll:
    """A repertoire of linear variable-selection and prediction models.

    ``fit`` selects variables with a cross-validated LARS lasso (refined by
    either a LassoLars refit or a RandomizedLasso) and then grid-searches a
    PLS and a ridge regressor on the selected columns. ``predict`` uses
    whichever of the two reached the higher cross-validation score.

    Parameters
    ----------
    cv : int
        Forwarded as ``cv`` to LassoLarsCV and to both GridSearchCV objects.
    scoring : string
        Forwarded as ``scoring`` to GridSearchCV.
    n_jobs : int, optional
        Forwarded as ``n_jobs`` to LassoLarsCV, RandomizedLasso and
        GridSearchCV (default 1).
    refit : boolean
        Forwarded to GridSearchCV. ``predict`` asserts that it is True,
        because it needs ``best_estimator_``.
    iid : boolean, optional
        Forwarded as ``iid`` to GridSearchCV.
    pre_pred : boolean
        Stored, but currently overridden: ``fit`` forces it to False.
    param_ridge_post : list of float
        Ridge ``alpha`` grid searched after variable selection.
    rlasso_selection_threshold : float
        If 0, variables are the non-zero coefficients of a LassoLars refit;
        otherwise it is the ``selection_threshold`` of a RandomizedLasso.

    Attributes
    ----------
    lasso_cv : fitted LassoLarsCV (set by ``fit``).
    lasso_refit, active : LassoLars refit and boolean column mask (only when
        ``rlasso_selection_threshold == 0``).
    rlasso : fitted RandomizedLasso (only when the threshold is non-zero).
    pls_post, ridge_post : GridSearchCV objects fitted on selected columns.
    best_model : the grid search chosen by ``predict``.
    """

    def __init__(self, cv=20, scoring='mean_squared_error',
                 n_jobs=1, refit=False, iid=False, pre_pred=True,
                 param_ridge_post=list(np.arange(1, 3, 0.1)),
                 rlasso_selection_threshold=0.5):
        # self.__name__ = '__main__'
        # CAUTION: we changed to __main__ so that parallelization works
        self.cv = cv
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.refit = refit
        self.iid = iid
        self.pre_pred = pre_pred
        # Copy: the default list object is shared by every call, so storing
        # it directly would let one instance's edits leak into the next.
        self.param_ridge_post = list(param_ridge_post)
        self.rlasso_selection_threshold = rlasso_selection_threshold

    def run_models(self, X, y, param_ridge):
        """Grid-search a PLS and a ridge regressor on ``(X, y)``.

        PLS is searched over ``n_components`` in 1..4 and ridge over the
        ``alpha`` values in ``param_ridge``.

        Returns
        -------
        (pls_cv, ridge_cv) : tuple of fitted GridSearchCV objects.
        """
        ##################################
        ## PLS CV
        ##################################
        pls_cv = GridSearchCV(PLSRegression(),
                              [{'n_components': range(1, 5)}],
                              cv=self.cv, scoring=self.scoring,
                              n_jobs=self.n_jobs,
                              refit=self.refit, iid=self.iid)
        pls_cv.fit(X, y)

        ##################################
        ## Ridge CV
        ##################################
        ridge_cv = GridSearchCV(linear_model.Ridge(alpha=1),
                                [{'alpha': param_ridge}],
                                cv=self.cv, scoring=self.scoring,
                                n_jobs=self.n_jobs,
                                refit=self.refit, iid=self.iid)
        ridge_cv.fit(X, y)

        return (pls_cv, ridge_cv)

    def fit(self, X, y):
        """Variable selection followed by prediction-model search.

        Variable selection model: lasso (LassoLarsCV, then LassoLars or
        RandomizedLasso). Prediction models: see ``run_models``.

        Parameters
        ----------
        X : numpy array of shape [n_samples, n_features]
            Training data (indexed as ``X[:, mask]``).
        y : numpy array
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        ##################################
        ## Pre Variable Selection Predictions
        ##################################
        # NOTE(review): the pre-selection models are force-disabled here, so
        # the constructor's ``pre_pred`` argument has no effect. Kept as-is to
        # preserve behaviour/cost -- confirm whether this should be honoured.
        self.pre_pred = False
        if self.pre_pred:
            print("Computing ... ")
            param_ridge_pre = list(np.arange(1e9, 2e9, 1e8))
            self.pls_pre, self.ridge_pre = \
                self.run_models(X, y, param_ridge_pre)

        ##################################
        ## Lasso Variable Selection
        ##################################
        # normalize=True, lasso seems to be able to handle itself
        self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True,
                                    precompute='auto',
                                    max_iter=X.shape[1] + 1000,
                                    max_n_alphas=X.shape[1] + 1000,
                                    eps=2.2204460492503131e-16, copy_X=True,
                                    cv=self.cv, n_jobs=self.n_jobs)
        self.lasso_cv.fit(X, y)

        if self.rlasso_selection_threshold == 0:
            self.lasso_refit = linear_model.LassoLars(
                alpha=self.lasso_cv.alpha_,
                fit_intercept=True, normalize=True, precompute='auto',
                max_iter=X.shape[1] + 1000,
                eps=2.2204460492503131e-16, copy_X=True,
                fit_path=False)
            self.lasso_refit.fit(X, y)
            # coef_ is 1-D for a 1-D target and 2-D for a column target;
            # atleast_2d makes the first-row lookup valid in both cases.
            self.active = np.atleast_2d(self.lasso_refit.coef_ != 0)[0, :]
            X_selected = X[:, self.active]
        else:
            self.rlasso = RandomizedLasso(
                alpha=self.lasso_cv.alpha_, scaling=0.5,
                sample_fraction=0.75, n_resampling=200,
                selection_threshold=self.rlasso_selection_threshold,
                fit_intercept=True, verbose=False, normalize=True,
                precompute='auto', max_iter=500,
                eps=2.2204460492503131e-16, random_state=None,
                n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',)
            self.rlasso.fit(X, y)
            X_selected = self.rlasso.transform(X)

        ##################################
        ## Post Variable Selection Predictions
        ##################################
        self.pls_post, self.ridge_post = \
            self.run_models(X_selected, y, self.param_ridge_post)

        return self

    def predict(self, X_test):
        """Predict with the better of the post-selection PLS / ridge models.

        Requires a prior ``fit`` and ``refit=True``. Applies the same column
        selection as ``fit`` and stores the chosen grid search in
        ``self.best_model``.
        """
        assert self.refit == True, \
            "predict() needs refit=True so that best_estimator_ exists"
        if self.pls_post.best_score_ > self.ridge_post.best_score_:
            self.best_model = self.pls_post
            print("Chosen Model: pls")
        else:
            self.best_model = self.ridge_post
            print("Chosen Model: ridge")

        if self.rlasso_selection_threshold == 0:
            X_test_selected = X_test[:, self.active]
        else:
            X_test_selected = self.rlasso.transform(X_test)
        return self.best_model.best_estimator_.predict(X_test_selected)
def main(train_label, train_feat, modelsdir, selfeat):
    """Tune and train an Extra-Trees regressor and persist the fitted objects.

    Parameters
    ----------
    train_label : str
        Path to a space-delimited numeric file with the training targets.
    train_feat : str
        Path to a space-delimited numeric file with the training features.
    modelsdir : str
        Directory that receives ``XRT.pkl`` (regressor), ``scaler.pkl`` and,
        when feature selection is enabled, ``sel_est.pkl`` plus the
        ``<train_feat basename>.idx`` / ``.mask`` selection files.
    selfeat : int or str
        Anything ``int()`` accepts; a non-zero value enables RandomizedLasso
        feature selection.

    The module-level ``config`` mapping supplies ``n_jobs``, ``RR_Iter`` and
    ``folds``. NaNs in both input files are replaced via ``np.nan_to_num``.
    """
    n_jobs = int(config['n_jobs'])

    X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
    y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

    scaler = StandardScaler().fit(X_train)
    X_trains = scaler.transform(X_train)

    # performs feature selection
    # sel_est stays None when selection is disabled, so the save step below
    # can skip it instead of raising NameError.
    sel_est = None
    if int(selfeat):
        print("Performing feature selection ...")
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=n_jobs, random_state=42,
                                  n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join([modelsdir, os.path.basename(train_feat)])
        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)

    estimator = ExtraTreesRegressor(random_state=42, n_jobs=n_jobs)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")
    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2",
                         int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }
    search = RandomizedSearchCV(
        estimator, param_distributions,
        n_iter=int(config['RR_Iter']),
        scoring=mae_scorer, n_jobs=n_jobs, refit=True,
        cv=KFold(X_train.shape[0], int(config['folds']), shuffle=True,
                 random_state=42),
        verbose=1, random_state=42)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # Retrain a fresh estimator with the winning hyper-parameters; this is
    # the model that gets saved.
    best = search.best_params_
    estimator2 = ExtraTreesRegressor(
        bootstrap=best["bootstrap"],
        max_depth=best["max_depth"],
        max_features=best["max_features"],
        min_samples_leaf=best["min_samples_leaf"],
        min_samples_split=best["min_samples_split"],
        n_estimators=best["n_estimators"],
        verbose=1, random_state=42, n_jobs=n_jobs)
    print("Train the model with the best parameters ...")
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    joblib.dump(estimator2, modelsdir + "/XRT.pkl")
    joblib.dump(scaler, modelsdir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
def main():
    """Select features, train a decision tree and store quiz predictions.

    Loads ``data`` / ``test_data`` through ``get_data('data')``, fits a
    RandomizedLasso on the first ``train_size`` rows (last column is the
    label), reduces train, validation and quiz features with it, trains a
    DecisionTreeClassifier, prints train/validation accuracy and hands the
    quiz predictions to ``store_csv(..., "prediction")``.
    """
    start = time.time()

    MAX_TRAIN_SIZE = 126838
    train_size = 20000
    val_size = MAX_TRAIN_SIZE - train_size

    data, test_data = get_data('data')
    X = data[0:train_size, 0:-1]
    y = list(data[0:train_size, -1])

    print(X.shape)
    print(len(y))

    # use randomized lasso for feature selection
    phase_start = time.time()
    clfR = RandomizedLasso(
        alpha='aic', scaling=0.5, sample_fraction=0.75, n_resampling=200,
        selection_threshold=0.25, fit_intercept=True, verbose=False,
        normalize=True, precompute='auto', max_iter=500,
        eps=2.2204460492503131e-16, random_state=None, n_jobs=1,
        pre_dispatch='3*n_jobs',
    )
    clfR.fit(X, y)

    # Transform train data to the selected features.
    # The explicit copies are a little hack to fix an
    # "assignment destination is read-only" error.
    X = clfR.transform(np.array(X).copy())

    # Transform quiz dataset the same way.
    test_data = clfR.transform(np.array(test_data).copy())

    print('Dimensions after feature Reduction: ' + str(X.shape))
    # Was str(duration) with duration not yet assigned -> UnboundLocalError.
    print("Elapsed Time For Feature Reduction: "
          + str(time.time() - phase_start))

    # Training classifier
    phase_start = time.time()
    clf1 = DecisionTreeClassifier(
        criterion='gini', splitter='best', max_depth=None,
        min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
        max_leaf_nodes=None, class_weight=None, presort=False)
    clf1.fit(X, y)
    print("Elapsed Time For Classifier Training: "
          + str(time.time() - phase_start))

    # predict & report the fraction of correctly classified training rows
    # (the old counter started at 1 and used integer division on Python 2)
    y_hat = clf1.predict(X)
    train_acc = np.mean(np.asarray(y) == np.asarray(y_hat))
    print("train: " + str(train_acc))

    # validation data: every labelled row after the training slice.
    # The former guard (MAX_TRAIN_SIZE - train_size > val_size) could never be
    # true, and the rows were not reduced with clfR before prediction.
    val_start = train_size
    val_end = train_size + val_size
    y_val = list(data[val_start:val_end, -1])
    if y_val:
        print("Beginning test validation...")
        X_val = clfR.transform(np.array(data[val_start:val_end, 0:-1]).copy())
        y_val_hat = clf1.predict(X_val)
        val_acc = np.mean(np.asarray(y_val) == np.asarray(y_val_hat))
        print("val: " + str(val_acc))

    # quiz data (unlabelled): predict and store
    print("Beginning quiz validation...")
    X_test = test_data[:, :]
    print(X_test.shape)
    y_test_hat = clf1.predict(X_test)
    store_csv(y_test_hat, "prediction")

    end = time.time()
    duration = end - start
    print("Took this many seconds: " + str(duration))
def run(args):
    """Train an Extra-Trees regressor and evaluate it on every test set.

    Reads from ``args``: ``training_data``, ``training_labels``,
    ``delimiter``, ``scale``, ``select_features``, ``iterations``, ``folds``,
    ``seed``, ``output_folder``, ``models_dir``, ``test_data`` and
    ``test_labels``.

    Steps: load and (optionally) standardise the training features, optionally
    select features with RandomizedLasso, random-search the Extra-Trees
    hyper-parameters (MAE scoring), retrain with the best parameters and dump
    the model (plus scaler / selector when they were used) into
    ``args.models_dir``. Then, for each feature/label file pair found in
    ``args.test_data`` / ``args.test_labels`` (paired by sorted order), write
    MAE, RMSE, mean-baseline scores and predictions below
    ``args.output_folder``. Labels and predictions are clipped to [0, 1].
    """
    X_train = np.nan_to_num(
        np.genfromtxt(args.training_data, delimiter=args.delimiter))
    y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1)

    X_trains = X_train
    # scaler / sel_est stay None when the step is disabled so the save step
    # below can skip them instead of raising NameError.
    scaler = None
    if args.scale:
        print("Scaling features (mean removal divided by std)...")
        scaler = StandardScaler().fit(X_train)
        X_trains = scaler.transform(X_train)

    # create output folders
    train_name = os.path.basename(args.training_data)
    outF = (args.output_folder + "/" + train_name
            + "--FS_" + str(args.select_features)
            + "--i_" + str(args.iterations))
    buildDir(outF)
    maskF = outF + "/masks/"
    buildDir(maskF)
    paramF = outF + "/parameters/"
    buildDir(paramF)

    # initializes numpy random seed
    np.random.seed(args.seed)

    # performs feature selection
    featsel_str = ".all-feats"
    sel_est = None
    if args.select_features:
        print("Performing feature selection ...")
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=8, random_state=args.seed,
                                  n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join([maskF, train_name])
        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")
    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2",
                         int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }
    search = RandomizedSearchCV(
        estimator, param_distributions,
        n_iter=args.iterations,
        scoring=mae_scorer, n_jobs=8, refit=True,
        cv=KFold(X_train.shape[0], args.folds, shuffle=True,
                 random_state=args.seed),
        verbose=1, random_state=args.seed)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # Retrain a fresh estimator with the winning hyper-parameters; this is
    # the model that gets saved.
    # NOTE(review): the seed is a fixed 42 here while everything else uses
    # args.seed -- confirm whether that is intended.
    best = search.best_params_
    estimator2 = ExtraTreesRegressor(
        bootstrap=best["bootstrap"],
        max_depth=best["max_depth"],
        max_features=best["max_features"],
        min_samples_leaf=best["min_samples_leaf"],
        min_samples_split=best["min_samples_split"],
        n_estimators=best["n_estimators"],
        verbose=1, random_state=42, n_jobs=8)
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    print("koooonnn %s" % args.models_dir)
    joblib.dump(estimator2, args.models_dir + "/XRT.pkl")
    if scaler is not None:
        joblib.dump(scaler, args.models_dir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, args.models_dir + "/sel_est.pkl")

    # Same text as the former two-argument print statement (two spaces).
    print("Best parameters:  %s" % (search.best_params_,))

    # saves parameters on yaml file
    param_path = (os.sep.join([paramF, train_name])
                  + featsel_str + ".params.yaml")
    with codecs.open(param_path, "w", "utf-8") as param_file:
        yaml.dump(search.best_params_, stream=param_file)

    testF = os.sep.join([outF, "/test/"])
    buildDir(testF)

    # mean training label, used as the constant baseline prediction
    m = y_train.mean()

    # evaluates model on the different test sets
    test_features = sorted(glob.glob(args.test_data + os.sep + "*"))
    test_labels = sorted(glob.glob(args.test_labels + os.sep + "*"))
    for test_feature, test_label in zip(test_features, test_labels):
        print("Evaluating on %s" % test_label)
        X_test = np.nan_to_num(
            np.genfromtxt(test_feature, delimiter=args.delimiter))
        y_test = np.clip(np.genfromtxt(test_label), 0, 1)

        X_tests = X_test
        if args.scale:
            X_tests = scaler.transform(X_test)
        if args.select_features:
            X_tests = sel_est.transform(X_tests)

        # gets predictions on test set
        y_pred = np.clip(search.predict(X_tests), 0, 1)

        # evaluates on test set
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print("Test MAE = %2.8f" % mae)
        print("Test RMSE = %2.8f" % rmse)
        print("Prediction range: [%2.4f, %2.4f]"
              % (y_pred.min(), y_pred.max()))

        # baseline on test set
        mu = m * np.ones(y_test.shape[0])
        maeB = mean_absolute_error(y_test, mu)
        rmseB = np.sqrt(mean_squared_error(y_test, mu))
        print("Test MAE Baseline= %2.8f" % maeB)
        print("Test RMSE Baseline= %2.8f" % rmseB)

        # saves evaluation
        test_name = os.path.basename(test_label)
        testFX = testF + "/" + test_name
        buildDir(testFX)
        buildDir(testFX + "/evaluation/")
        eval_path = (os.sep.join([testFX, "evaluation", train_name])
                     + featsel_str + "--" + test_name)
        for suffix, value in ((".mae", mae), (".rmse", rmse),
                              (".mae.Base", maeB), (".rmse.Base", rmseB)):
            # 'with' closes each score file; they used to be left open.
            with codecs.open(eval_path + suffix, 'w', "utf-8") as score_file:
                score_file.write(str(value) + "\n")

        # saves predictions
        buildDir(testFX + "/predictions/")
        preds_path = (os.sep.join([testFX, "predictions", train_name])
                      + featsel_str + "--" + test_name + ".preds")
        np.savetxt(preds_path, y_pred, fmt="%2.15f")
u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD', u'GRASS', u'FLOWER', u'CHEMICAL']): attribute[idx] = attr # In[ ]: # select the best features with true values and save them features = pd.read_csv('all_features.csv',index_col=0).sort() target = pd.read_csv('targets_for_feature_selection.csv',index_col=0).sort()#replace this with targets_for_feature_selection_LB_incl.csv if LB data is included for i in range(21): print(attribute[i]) sys.stdout.flush() Y = target[attribute[i]].dropna() X = features.loc[Y.index] selector = RandomizedLasso(alpha=0.025,selection_threshold=0.025,n_resampling=200, random_state=25).fit(X,Y) selected = pd.DataFrame(selector.transform(features)) selected.index = features.index print('shape ', selected.shape) selected.to_csv('...path to features folder/selected_features/features_'+str(i)+'.csv') # In[ ]:
def do_ml(day):
    """Run the feature-selection and model-tuning experiments for ``day``.

    Loads pickled ``X`` / ``y`` from the working directory, holds out 20% as a
    test set (``random_state=1234``) and then, for AdaBoost and RandomForest
    regressors:

    * selects features with RFECV, grid-searches hyper-parameters on the
      selected columns and prints the test MSE;
    * does the same with features chosen by a RandomizedLasso.

    Selectors and tuned models are pickled below
    ``performance-<day>-days/models``. The ``USE_SAX``, ``FEATURE_REDUCTION``
    and ``DIM_REDUCTION_SEARCH`` switches at the top enable alternative input
    data and two optional searches.
    """
    ################################################################
    # Modules to use
    ################################################################
    USE_SAX = False
    FEATURE_REDUCTION = False
    DIM_REDUCTION_SEARCH = False

    # Create folders for administration. makedirs(exist_ok=True) also creates
    # 'models' when the parent already exists; the former pair of mkdir calls
    # in one try block skipped it in that case.
    os.makedirs('performance-{}-days/models'.format(day), exist_ok=True)

    def _dump(obj, path):
        # Pickle obj to path, closing the file even if pickling fails.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    ################################################################
    print('Using sax: {}'.format(USE_SAX))
    # Resolves to X_sax.p / y_sax.p or X_reg.p / y_reg.p.
    # NOTE(review): pickle.load executes arbitrary code -- only use trusted
    # files here.
    suffix = 'sax' if USE_SAX else 'reg'
    with open('X_{}.p'.format(suffix), 'rb') as handle:
        X = pickle.load(handle)
    with open('y_{}.p'.format(suffix), 'rb') as handle:
        y = pickle.load(handle)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1234)

    # Hyper-parameter grids, keyed by the short model name.
    params = {
        'RF': {
            "max_depth": [3, None],
            "max_features": [1, 3, 10, 'sqrt', 'log2', 'auto'],
            "min_samples_split": [2, 3, 10],
            "min_samples_leaf": [1, 3, 10],
            "bootstrap": [True, False],
            "criterion": ["mse"]
        },
        'AdaBoost': {
            'n_estimators': [10, 50, 100, 300, 500],
            'learning_rate': [1, 0.5, 0.1, 0.01, 0.001],
            'loss': ['linear', 'square', 'exponential']
        }
    }

    ################################################################
    # Dimensionality reduction
    ################################################################
    # Names are kept from the former helper functions because they are part
    # of the output file names.
    reducers = (('pca_reduce', PCA), ('isomap_reduce', Isomap))

    def _find_best_dim_red(dims, model, model_name, X, y, params):
        rows_list = []
        X_tr, X_te, y_tr, y_te = train_test_split(X,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=1234)
        for dim in dims:
            for reducer_name, reducer_cls in reducers:
                print(
                    'Start reducing dimensionality using {} to {} dimensions'.
                    format(reducer_name, dim))
                t0 = time.time()
                # Fit the projection on the training rows only and reuse it
                # for the test rows. Previously the data was reduced twice
                # and train/test got independently fitted projections, so the
                # model was evaluated in a different feature space.
                reducer = reducer_cls(n_components=dim)
                X_train_red = reducer.fit_transform(X_tr)
                X_test_red = reducer.transform(X_te)
                t1 = time.time()
                print('Reducing dimensions cost {} seconds'.format(t1 - t0))

                # Optimize model using grid search and cross validation
                print('Start optimizing {} model'.format(model_name))
                t0 = time.time()
                optimized_model = GridSearchCV(model,
                                               params,
                                               cv=10,
                                               refit=True)
                optimized_model.fit(X_train_red, y_tr)
                t1 = time.time()
                print('Optimizing took {} seconds'.format(t1 - t0))

                y_pred = optimized_model.predict(X_test_red)
                mse = sk.metrics.mean_squared_error(y_te, y_pred)
                print("MSE on test set: {}".format(mse))

                # administration
                rows_list.append({
                    'model': model_name,
                    'dimensions': dim,
                    'reduction technique': reducer_name,
                    'mse': mse,
                    'parameters': str(optimized_model.best_params_)
                })

                # store model
                _dump(optimized_model,
                      'performance-{}-days/models/{}-{}-{}.pickle'.format(
                          day, reducer_name, model_name, dim))
        adm_df = pd.DataFrame(rows_list)
        adm_df.to_csv('performance-{}-days/{}-dim_reduction.csv'.format(
            day, model_name))

    def dim_reduction_search(X, y):
        dims = [10, 20, 30]
        _find_best_dim_red(dims, AdaBoostRegressor(), 'AdaBoost', X, y,
                           params['AdaBoost'])
        _find_best_dim_red(dims, RandomForestRegressor(), 'RandomForest', X,
                           y, params['RF'])

    ##############################################################
    # Feature selection
    ##############################################################
    # Recursive Feature Elimination
    def ReFeEl(nr_features,
               X_train,
               y_train,
               X_test,
               y_test,
               estimator,
               nr_models=5):
        print('Start selecting features')
        t1 = time.time()
        result = []
        for nr_feature in nr_features:
            selector = RFE(estimator, nr_feature, step=1)
            selector.fit(X_train, y_train)
            y_pred = selector.predict(X_test)
            mse = sk.metrics.mean_squared_error(y_test, y_pred)
            result.append((mse, selector))
        # sort models and take nr_models best ones
        result.sort(key=lambda x: x[0])
        result = result[:nr_models]
        t2 = time.time()
        print('selecting features took {} seconds'.format(t2 - t1))
        best_mse, best_selector = result[0]
        print(best_selector.support_)
        print(best_selector.ranking_)
        print('Minimum MSE: {}, number of selected features: {}'.format(
            best_mse, int(best_selector.support_.sum())))
        return result

    if FEATURE_REDUCTION:
        if not USE_SAX:
            estimator = AdaBoostRegressor(learning_rate=0.001,
                                          loss='square',
                                          n_estimators=50)
        else:
            estimator = AdaBoostRegressor(learning_rate=0.01,
                                          loss='linear',
                                          n_estimators=50)
        _, total_nr_features = X.shape
        nr_features = range(1, total_nr_features)
        opt_features_models = ReFeEl(nr_features, X_train, y_train, X_test,
                                     y_test, estimator)
        print(opt_features_models)
        _dump(opt_features_models,
              'optimal_features_model_{}.pickle'.format(USE_SAX))

    # Feature Importance using Extra Trees (not called from this function)
    def ET_feature_selection():
        estimator = ExtraTreesRegressor()
        estimator.fit(X, y)
        importances = estimator.feature_importances_
        print(importances)
        print((len(importances[importances < 0.01]), len(importances)))
        print(np.mean(importances))
        print(np.std(importances))
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.boxplot(importances)
        fig.savefig('boxplot.png', bbox_inches='tight')
        input('hallo')

    ##############################################################
    # Dimensionality reduction search
    ##############################################################
    if DIM_REDUCTION_SEARCH:
        dim_reduction_search(X, y)

    ##############################################################
    # Grid search + CV
    ##############################################################
    def append_deep_copy(rows_list, model_name, nr_features, mse, params):
        result = {
            'model': deepcopy(model_name),
            'nr_features': deepcopy(nr_features),
            'MSE': deepcopy(mse),
            'parameters': str(deepcopy(params))
        }
        rows_list.append(result)
        return rows_list

    ##############################################################
    # Recursive Feature Elimination
    ##############################################################
    for model, name in [(AdaBoostRegressor(), 'AdaBoost'),
                        (RandomForestRegressor(), 'RF')]:
        # Get features and store feature selector
        print('Optimizing {} using CV RFE'.format(name))
        t0 = time.time()
        selector = RFECV(model, step=1, cv=10)
        selector = selector.fit(X_train, y_train)
        X_train_transformed = selector.transform(X_train)
        X_test_transformed = selector.transform(X_test)
        t1 = time.time()
        print('Optimizing features done in {} seconds, storing model..'.format(
            t1 - t0))
        _dump(selector,
              'performance-{}-days/models/RFE-{}-selector.pickle'.format(
                  day, name))

        # Optimize hyperparameters and evaluate model
        print('Start optimizing hyperparameters using determined features...')
        t0 = time.time()
        opt_model = GridSearchCV(model, params[name], cv=10, refit=True)
        opt_model.fit(X_train_transformed, y_train)
        t1 = time.time()
        print('Optimizing took {} seconds'.format((t1 - t0)))
        y_pred = opt_model.predict(X_test_transformed)
        mse = sk.metrics.mean_squared_error(y_test, y_pred)
        print("MSE on test set: {}".format(mse))
        _dump(opt_model,
              'performance-{}-days/models/RFE-{}-model.pickle'.format(
                  day, name))

    ##############################################################
    # Feature Stability Selection
    ##############################################################
    RL = RandomizedLasso(alpha='aic')
    print('Start optimizing using Randomized Lasso')
    t0 = time.time()
    # Fit on the training split only (as the RFECV selector above does);
    # fitting on all of X let the held-out rows influence feature selection.
    RL.fit(X_train, y_train)
    t1 = time.time()
    print('Optimizing done in {} seconds'.format(t1 - t0))
    _dump(RL,
          'performance-{}-days/models/RandomizedLasso-selector.pickle'.format(
              day))

    X_train_RL = RL.transform(X_train)
    X_test_RL = RL.transform(X_test)

    print('Using RL features to optimize model..')
    for model, name in [(AdaBoostRegressor(), 'AdaBoost'),
                        (RandomForestRegressor(), 'RF')]:
        # The message used to say "CV RFE" and print the estimator object.
        print('Optimizing {} using Randomized Lasso features'.format(name))
        t0 = time.time()
        opt_model = GridSearchCV(model, params[name], cv=10, refit=True)
        opt_model.fit(X_train_RL, y_train)
        t1 = time.time()
        print('Optimizing took {} seconds'.format((t1 - t0)))
        y_pred = opt_model.predict(X_test_RL)
        mse = sk.metrics.mean_squared_error(y_test, y_pred)
        print("MSE on test set: {}".format(mse))
        _dump(opt_model,
              'performance-{}-days/models/RandomizedLasso-{}-model.pickle'.
              format(day, name))
# Select the best features with true values and save their scores.
# For each of the 21 attributes: fit a RandomizedLasso on the rows that have a
# target value and write one "<feature>: <score>" line per feature to
# scores_<i>.txt.
# sort_index() replaces the removed DataFrame.sort(); with no arguments the
# old call also ordered the rows by index.
features = pd.read_csv('all_features_training_filledna.csv',
                       index_col=0).sort_index()
# targets_for_feature_selection.csv can be computed in
# opc_python/hulab/feature_selection
# replace this with targets_for_feature_selection_LB_incl.csv if LB data is
# included
target = pd.read_csv('targets_for_feature_selection.csv',
                     index_col=0).sort_index()

for i in range(21):
    print(attribute[i])
    sys.stdout.flush()

    # Fit only on rows where this attribute has a value.
    Y = target[attribute[i]].dropna()
    X = features.loc[Y.index]
    selector = RandomizedLasso(alpha=0.025, selection_threshold=0.025,
                               n_resampling=200, random_state=25).fit(X, Y)

    selected = pd.DataFrame(selector.transform(features))
    selected.index = features.index
    print('shape ', selected.shape)
    # Name the kept columns; same mask as the former .ix lookup (.ix has been
    # removed from pandas).
    selected.columns = X.columns[selector.scores_ > 0.025]

    # 'with' guarantees the score file is closed even if a write fails.
    with open('scores_'+str(i)+'.txt', 'w') as f:
        for feature_name, score in zip(features.columns.values,
                                       selector.scores_):
            f.write(feature_name+': ')
            f.write(str(score))
            f.write('\n')
def main():
    """Select features, train a decision tree and store quiz predictions.

    Loads ``data`` / ``test_data`` through ``get_data('data')``, fits a
    RandomizedLasso on the first ``train_size`` rows (last column is the
    label), reduces train, validation and quiz features with it, trains a
    DecisionTreeClassifier, prints train/validation accuracy and hands the
    quiz predictions to ``store_csv(..., "prediction")``.
    """
    start = time.time()

    MAX_TRAIN_SIZE = 126838
    train_size = 20000
    val_size = MAX_TRAIN_SIZE - train_size

    data, test_data = get_data('data')
    X = data[0:train_size, 0:-1]
    y = list(data[0:train_size, -1])

    print(X.shape)
    print(len(y))

    # use randomized lasso for feature selection
    phase_start = time.time()
    clfR = RandomizedLasso(
        alpha='aic', scaling=0.5, sample_fraction=0.75, n_resampling=200,
        selection_threshold=0.25, fit_intercept=True, verbose=False,
        normalize=True, precompute='auto', max_iter=500,
        eps=2.2204460492503131e-16, random_state=None, n_jobs=1,
        pre_dispatch='3*n_jobs',
    )
    clfR.fit(X, y)

    # Transform train data to the selected features.
    # The explicit copies are a little hack to fix an
    # "assignment destination is read-only" error.
    X = clfR.transform(np.array(X).copy())

    # Transform quiz dataset the same way.
    test_data = clfR.transform(np.array(test_data).copy())

    print('Dimensions after feature Reduction: ' + str(X.shape))
    # Was str(duration) with duration not yet assigned -> UnboundLocalError.
    print("Elapsed Time For Feature Reduction: "
          + str(time.time() - phase_start))

    # Training classifier
    phase_start = time.time()
    clf1 = DecisionTreeClassifier(
        criterion='gini', splitter='best', max_depth=None,
        min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
        max_leaf_nodes=None, class_weight=None, presort=False)
    clf1.fit(X, y)
    print("Elapsed Time For Classifier Training: "
          + str(time.time() - phase_start))

    # predict & report the fraction of correctly classified training rows
    # (the old counter started at 1 and used integer division on Python 2)
    y_hat = clf1.predict(X)
    train_acc = np.mean(np.asarray(y) == np.asarray(y_hat))
    print("train: " + str(train_acc))

    # validation data: every labelled row after the training slice.
    # The former guard (MAX_TRAIN_SIZE - train_size > val_size) could never be
    # true, and the rows were not reduced with clfR before prediction.
    val_start = train_size
    val_end = train_size + val_size
    y_val = list(data[val_start:val_end, -1])
    if y_val:
        print("Beginning test validation...")
        X_val = clfR.transform(np.array(data[val_start:val_end, 0:-1]).copy())
        y_val_hat = clf1.predict(X_val)
        val_acc = np.mean(np.asarray(y_val) == np.asarray(y_val_hat))
        print("val: " + str(val_acc))

    # quiz data (unlabelled): predict and store
    print("Beginning quiz validation...")
    X_test = test_data[:, :]
    print(X_test.shape)
    y_test_hat = clf1.predict(X_test)
    store_csv(y_test_hat, "prediction")

    end = time.time()
    duration = end - start
    print("Took this many seconds: " + str(duration))