def featureSelection(train_x, train_y):
    """Report features chosen by two selectors and plot RFECV scores.

    Fits a RandomizedLasso (stability selection) and an RFECV wrapped
    around a class-balanced LinearSVC on (train_x, train_y), prints the
    feature names each method keeps, then plots the cross-validated
    score against the number of selected features.

    NOTE(review): relies on a module-level ``feats`` sequence mapping
    column index -> feature name -- confirm it is defined by the caller.
    """
    # Stability selection: features repeatedly picked by randomized L1 fits.
    lasso = RandomizedLasso()
    lasso.fit(train_x, train_y)
    lasso_ranks = lasso.get_support()

    # Recursive feature elimination, one feature per step, 5-fold CV.
    # The "accuracy" scoring is proportional to the number of correct
    # classifications.
    svc = LinearSVC(C=1, class_weight='balanced')
    rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)
    rankings = rfecv.ranking_

    lassoFeats = []
    recursiveFeats = []
    for i in range(len(rankings)):
        if lasso_ranks[i]:
            lassoFeats.append(feats[i])
        if rankings[i] == 1:  # rank 1 == retained by RFECV
            recursiveFeats.append(feats[i])
    # BUG FIX: the original body tested ``lasso_ranks[i]`` twice, building
    # a third list identical to lassoFeats, and left a keyboard()
    # debugger breakpoint in place; both removed.
    shouldUseFeats = lassoFeats
    print('Should use ' + ', '.join(shouldUseFeats))

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
def main(train_label, train_feat, modelsdir, selfeat):
    """Train an ExtraTrees regressor with optional feature selection.

    Loads space-delimited feature/label files, standardizes the
    features, optionally runs RandomizedLasso feature selection (saving
    the selected indices and mask under ``modelsdir``), tunes an
    ExtraTreesRegressor via randomized search with MAE scoring, refits
    with the best parameters and pickles model, scaler and selector.

    NOTE(review): reads a module-level ``config`` dict for 'n_jobs',
    'RR_Iter' and 'folds' -- confirm it is populated before calling.
    """
    X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
    y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))
    # Mean removal / unit variance; the scaler is pickled below so the
    # identical transform can be applied at prediction time.
    scaler = StandardScaler().fit(X_train)
    X_trains = scaler.transform(X_train)

    # performs feature selection
    featsel_str = ".all-feats"
    sel_est = None  # stays None when feature selection is disabled
    if int(selfeat):
        print("Performing feature selection ...")
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=int(config['n_jobs']),
                                  random_state=42, n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)
        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)
        sel_feats_path = os.sep.join([modelsdir,
                                      os.path.basename(train_feat)])
        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=42,
                                    n_jobs=int(config['n_jobs']))
    # greater_is_better=False: the search maximizes, so MAE is negated.
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")
    param_distributions = \
        {"n_estimators": [5, 10, 50, 100, 200, 500],
         "max_depth": [3, 2, 1, None],
         "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
         "min_samples_split": sp_randint(1, 11),
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False]}
    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=int(config['RR_Iter']),
                                scoring=mae_scorer,
                                n_jobs=int(config['n_jobs']), refit=True,
                                cv=KFold(X_train.shape[0],
                                         int(config['folds']),
                                         shuffle=True, random_state=42),
                                verbose=1, random_state=42)
    search.fit(X_trains, y_train)

    # Refit on the whole training set with the best parameter combination.
    estimator2 = ExtraTreesRegressor(
        bootstrap=search.best_params_["bootstrap"],
        max_depth=search.best_params_["max_depth"],
        max_features=search.best_params_["max_features"],
        min_samples_leaf=search.best_params_["min_samples_leaf"],
        min_samples_split=search.best_params_["min_samples_split"],
        n_estimators=search.best_params_["n_estimators"],
        verbose=1, random_state=42, n_jobs=int(config['n_jobs']))
    print("Train the model with the best parameters ...")
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    joblib.dump(estimator2, modelsdir + "/XRT.pkl")
    joblib.dump(scaler, modelsdir + "/scaler.pkl")
    # BUG FIX: sel_est was dumped unconditionally and raised NameError
    # whenever feature selection was skipped (selfeat falsy).
    if sel_est is not None:
        joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
def main(train_label, train_feat, modelsdir, selfeat):
    """Train an ExtraTrees regressor with optional feature selection.

    Loads space-delimited feature/label files, standardizes the
    features, optionally runs RandomizedLasso feature selection (saving
    the selected indices and mask under ``modelsdir``), tunes an
    ExtraTreesRegressor via randomized search with MAE scoring, refits
    with the best parameters and pickles model, scaler and selector.

    NOTE(review): this re-definition shadows an identical ``main``
    earlier in the file -- confirm which one is intended to survive.
    NOTE(review): reads a module-level ``config`` dict for 'n_jobs',
    'RR_Iter' and 'folds' -- confirm it is populated before calling.
    """
    X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
    y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))
    # Mean removal / unit variance; pickled below for prediction time.
    scaler = StandardScaler().fit(X_train)
    X_trains = scaler.transform(X_train)

    # performs feature selection
    featsel_str = ".all-feats"
    sel_est = None  # stays None when feature selection is disabled
    if int(selfeat):
        print("Performing feature selection ...")
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=int(config['n_jobs']),
                                  random_state=42, n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)
        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)
        sel_feats_path = os.sep.join([modelsdir,
                                      os.path.basename(train_feat)])
        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=42,
                                    n_jobs=int(config['n_jobs']))
    # greater_is_better=False: the search maximizes, so MAE is negated.
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")
    param_distributions = \
        {"n_estimators": [5, 10, 50, 100, 200, 500],
         "max_depth": [3, 2, 1, None],
         "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
         "min_samples_split": sp_randint(1, 11),
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False]}
    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=int(config['RR_Iter']),
                                scoring=mae_scorer,
                                n_jobs=int(config['n_jobs']), refit=True,
                                cv=KFold(X_train.shape[0],
                                         int(config['folds']),
                                         shuffle=True, random_state=42),
                                verbose=1, random_state=42)
    search.fit(X_trains, y_train)

    # Refit on the whole training set with the best parameter combination.
    estimator2 = ExtraTreesRegressor(
        bootstrap=search.best_params_["bootstrap"],
        max_depth=search.best_params_["max_depth"],
        max_features=search.best_params_["max_features"],
        min_samples_leaf=search.best_params_["min_samples_leaf"],
        min_samples_split=search.best_params_["min_samples_split"],
        n_estimators=search.best_params_["n_estimators"],
        verbose=1, random_state=42, n_jobs=int(config['n_jobs']))
    print("Train the model with the best parameters ...")
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    joblib.dump(estimator2, modelsdir + "/XRT.pkl")
    joblib.dump(scaler, modelsdir + "/scaler.pkl")
    # BUG FIX: sel_est was dumped unconditionally and raised NameError
    # whenever feature selection was skipped (selfeat falsy).
    if sel_est is not None:
        joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
def lass_varselect(train, num_vars, target, alpha):
    """Stability-select predictors for *target* among *num_vars*.

    Runs a RandomizedLasso (5 resampling rounds, the given *alpha*) of
    ``train[target]`` on the ``train[num_vars]`` columns and returns the
    boolean mask of retained variables.
    """
    predictors = train[num_vars]
    response = train[target]
    selector = RandomizedLasso(alpha=alpha, n_resampling=5)
    selector.fit(predictors, response)
    return selector.get_support()
def run(args):
    """End-to-end train/evaluate pipeline driven by CLI arguments.

    Loads training data, optionally scales and feature-selects it, tunes
    an ExtraTreesRegressor via randomized search (MAE scoring), saves
    the fitted artifacts, then evaluates on every paired test-set file,
    writing MAE/RMSE (plus a mean-predictor baseline) and predictions.

    NOTE(review): relies on a ``buildDir`` helper defined elsewhere in
    the file.
    """
    X_train = np.nan_to_num(
        np.genfromtxt(args.training_data, delimiter=args.delimiter))
    # Labels are clipped into [0, 1]; predictions are clipped the same way.
    y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1)

    X_trains = X_train
    scaler = None  # stays None unless --scale is requested
    if args.scale:
        print("Scaling features (mean removal divided by std)...")
        scaler = StandardScaler().fit(X_train)
        X_trains = scaler.transform(X_train)

    # create output folders
    outF = args.output_folder + "/" + os.path.basename(
        args.training_data) + "--FS_" + str(
            args.select_features) + "--i_" + str(args.iterations)
    buildDir(outF)
    maskF = outF + "/masks/"
    buildDir(maskF)
    paramF = outF + "/parameters/"
    buildDir(paramF)

    # initializes numpy random seed
    np.random.seed(args.seed)

    # performs feature selection
    featsel_str = ".all-feats"
    sel_est = None  # stays None unless --select-features is requested
    if args.select_features:
        print("Performing feature selection ...")
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=8, random_state=args.seed,
                                  n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)
        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)
        sel_feats_path = os.sep.join(
            [maskF, os.path.basename(args.training_data)])
        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1)
    # greater_is_better=False: the search maximizes, so MAE is negated.
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")
    param_distributions = \
        {"n_estimators": [5, 10, 50, 100, 200, 500],
         "max_depth": [3, 2, 1, None],
         "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
         "min_samples_split": sp_randint(1, 11),
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False]}
    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=args.iterations, scoring=mae_scorer,
                                n_jobs=8, refit=True,
                                cv=KFold(X_train.shape[0], args.folds,
                                         shuffle=True,
                                         random_state=args.seed),
                                verbose=1, random_state=args.seed)
    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # Refit on the whole training set with the best parameter combination.
    estimator2 = ExtraTreesRegressor(
        bootstrap=search.best_params_["bootstrap"],
        max_depth=search.best_params_["max_depth"],
        max_features=search.best_params_["max_features"],
        min_samples_leaf=search.best_params_["min_samples_leaf"],
        min_samples_split=search.best_params_["min_samples_split"],
        n_estimators=search.best_params_["n_estimators"],
        verbose=1, random_state=42, n_jobs=8)
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    # (replaced a leftover profanity debug print with a plain message)
    print("Saving models to %s" % args.models_dir)
    joblib.dump(estimator2, args.models_dir + "/XRT.pkl")
    # BUG FIX: scaler/sel_est were dumped unconditionally and raised
    # NameError whenever --scale / --select-features were off.
    if scaler is not None:
        joblib.dump(scaler, args.models_dir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, args.models_dir + "/sel_est.pkl")

    print("Best parameters: %s" % search.best_params_)
    # saves parameters on yaml file
    param_path = os.sep.join([paramF, os.path.basename(
        args.training_data)]) + featsel_str + ".params.yaml"
    param_file = codecs.open(param_path, "w", "utf-8")
    yaml.dump(search.best_params_, stream=param_file)
    param_file.close()  # BUG FIX: handle was left open

    testF = os.sep.join([outF, "/test/"])
    buildDir(testF)

    m = y_train.mean()  # constant-mean baseline predictor

    # Evaluates model on the different test sets; feature and label
    # files are paired by sorted filename order.
    test_features = sorted(glob.glob(args.test_data + os.sep + "*"))
    test_labels = sorted(glob.glob(args.test_labels + os.sep + "*"))
    for test_feature, test_label in zip(test_features, test_labels):
        print("Evaluating on %s" % test_label)
        X_test = np.nan_to_num(
            np.genfromtxt(test_feature, delimiter=args.delimiter))
        y_test = np.clip(np.genfromtxt(test_label), 0, 1)
        X_tests = X_test
        if args.scale:
            X_tests = scaler.transform(X_test)
        if args.select_features:
            X_tests = sel_est.transform(X_tests)
        # gets predictions on test set, clipped into the label range
        y_pred = np.clip(search.predict(X_tests), 0, 1)
        # evaluates on test set
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print("Test MAE = %2.8f" % mae)
        print("Test RMSE = %2.8f" % rmse)
        print("Prediction range: [%2.4f, %2.4f]" % (y_pred.min(),
                                                    y_pred.max()))
        # saves evaluation
        testFX = testF + "/" + os.path.basename(test_label)
        buildDir(testFX)
        buildDir(testFX + "/evaluation/")
        eval_path = os.sep.join([testFX, "evaluation", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
                test_label)
        mae_eval = codecs.open(eval_path + ".mae", 'w', "utf-8")
        mae_eval.write(str(mae) + "\n")
        mae_eval.close()  # BUG FIX: handles were left open
        rmse_eval = codecs.open(eval_path + ".rmse", 'w', "utf-8")
        rmse_eval.write(str(rmse) + "\n")
        rmse_eval.close()
        # baseline on test set: predict the training-label mean everywhere
        mu = m * np.ones(y_test.shape[0])
        maeB = mean_absolute_error(y_test, mu)
        rmseB = np.sqrt(mean_squared_error(y_test, mu))
        print("Test MAE Baseline= %2.8f" % maeB)
        print("Test RMSE Baseline= %2.8f" % rmseB)
        mae_eval = codecs.open(eval_path + ".mae.Base", 'w', "utf-8")
        mae_eval.write(str(maeB) + "\n")
        mae_eval.close()
        rmse_eval = codecs.open(eval_path + ".rmse.Base", 'w', "utf-8")
        rmse_eval.write(str(rmseB) + "\n")
        rmse_eval.close()
        # saves predictions
        buildDir(testFX + "/predictions/")
        preds_path = os.sep.join([testFX, "predictions", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
                test_label) + ".preds"
        np.savetxt(preds_path, y_pred, fmt="%2.15f")
# Re-encode every column of X_ms with its WOE (weight-of-evidence) bins.
# NOTE(review): ``binn``, ``WOE_detail1``, ``X_ms`` and ``y`` come from
# earlier in this script (not visible here) -- confirm their shapes.
new_x = pd.DataFrame()
for i in X_ms.columns:
    print(i)
    # Look up this column's binning detail and apply the WOE transform.
    new = binn._applyBinwoe(X_ms[i], WOE_detail1[WOE_detail1["var_name"] == i])
    new_x = pd.concat([new_x, new], axis=1)
X_ms = new_x

''' Screen the WOE-transformed variables. '''
from sklearn.linear_model import RandomizedLasso
from sklearn.linear_model import LassoCV

#### Randomized lasso: keep variables linearly related to y
#### (stability selection, pass 2).
rla = RandomizedLasso()
rla.fit(X_ms, y)
print(X_ms.columns[rla.get_support()])
X_ms = X_ms[X_ms.columns[rla.get_support()]]

# LassoCV: LASSO is typically used here to select features for use in
# other downstream algorithms. Keep only columns with nonzero coefficients.
lassocv = LassoCV()
lassocv.fit(X_ms, y)
print(X_ms.columns[lassocv.coef_ != 0])
X_ms = X_ms[X_ms.columns[lassocv.coef_ != 0]]

### 3. Significance testing: fit a logit and inspect p-values.
import statsmodels.api as sm
logit = sm.Logit(y, X_ms)
result = logit.fit()
print(result.summary())

### Next: remove variables whose VIF (variance inflation factor, a
### multicollinearity check) exceeds 10.
from statsmodels.stats.outliers_influence import variance_inflation_factor
def main():
    # End-to-end pipeline: load the hazard train/test CSVs, one-hot
    # encode categoricals, build (or load) a dictionary of
    # feature-selection masks, then train an XGBoost regressor and a
    # linear regressor and write individual plus rank-averaged
    # submission files.
    print "read train"
    df_train = pd.read_csv('./data/train.csv')
    print "read test"
    df_test = pd.read_csv('./data/test.csv')
    sample = pd.read_csv('./data/sample_submission.csv')
    # mixed/categorical columns to one-hot encode
    cats = [
        'T1_V4', 'T1_V5', 'T1_V6', 'T1_V7', 'T1_V8', 'T1_V9', 'T1_V11',
        'T1_V12', 'T1_V15', 'T1_V16', 'T1_V17', 'T2_V3', 'T2_V5', 'T2_V11',
        'T2_V12', 'T2_V13'
    ]
    print "convert mixed columns to strings"
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)
    print "one-hot encoding"
    df_train = make_dummies(df_train, cats)  # helper defined elsewhere in file
    df_test = make_dummies(df_test, cats)
    print "set binary labels"
    # hazard_class: 1 when Hazard equals exactly 1
    df_train['hazard_class'] = (df_train.Hazard == 1).astype(int)
    classes = df_train.hazard_class.values
    # loss = df_train.target.values
    hazard = df_train.Hazard.values
    df_train = df_train.drop(['Hazard', 'Id', 'hazard_class'], axis=1)
    df_test = df_test.drop(['Id'], axis=1)
    build_features = False  #flag, determines whether features will be trained or read from file
    if build_features:
        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func=f_classif, k='all')
        selector_reg = SelectKBest(score_func=f_regression, k='all')
        selector_clf.fit(df_train.values, classes)
        selector_reg.fit(df_train.values, hazard)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        # NaN p-values are treated as "not significant"
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1
        #put feature vectors into dictionary
        feats = {}
        feats['univ_sub01'] = (pvalues_clf < 0.1) & (pvalues_reg < 0.1)
        feats['univ_sub005'] = (pvalues_clf < 0.05) & (pvalues_reg < 0.05)
        feats['univ_reg_sub005'] = (pvalues_reg < 0.05)
        feats['univ_clf_sub005'] = (pvalues_clf < 0.05)
        print "randomized lasso feature selector"
        sel_lasso = RandomizedLasso(random_state=42).fit(
            df_train.values, hazard)
        #put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()
        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(df_train.values)
        sel_svc = LinearSVC(C=0.1, penalty="l1",
                            dual=False, random_state=42).fit(X_sp, classes)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0)
        sel_log = LogisticRegression(C=0.01, random_state=42).fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_ > 0)
        # majority vote over the masks built above
        feat_sums = np.zeros(len(feats['rand_lasso']))
        for key in feats:
            feat_sums += feats[key].astype(int)
        feats[
            'ensemble'] = feat_sums >= 5  #take features which get 5 or more votes
        joblib.dump(feats, './features/feats.pkl', compress=3)
    else:
        feats = joblib.load('features/feats.pkl')
    xtrain = df_train.values
    xtest = df_test.values
    print "fitting xgb-regressor"
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.01
    params["max_depth"] = 7
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.8
    params["min_child_weight"] = 5
    params["silent"] = 1
    plst = list(params.items())
    num_rounds = 600
    #create a train and validation dmatrices
    xgtrain = xgb.DMatrix(xtrain[:, feats['ensemble']], label=hazard)
    xgtest = xgb.DMatrix(xtest[:, feats['ensemble']])
    reg_xgb = xgb.train(plst, xgtrain, num_rounds)
    xgb_preds = reg_xgb.predict(xgtest)
    sample['Hazard'] = xgb_preds
    sample.to_csv('./submissions/xgb.csv', index=False)
    # Linear model trained on standardized features with the
    # randomized-lasso mask only.
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print "fitting linear regressor"
    reg_lin.fit(xtrain[:, feats['rand_lasso']], hazard)
    lin_preds = reg_lin.predict(xtest[:, feats['rand_lasso']])
    sample['Hazard'] = lin_preds
    sample.to_csv('./submissions/lin.csv', index=False)
    xgb_order = xgb_preds.argsort().argsort(
    )  #maps smallest value to 0, second-smallest to 1 etc.
    lin_order = lin_preds.argsort().argsort()
    #averaging (of the two models' rank orders)
    mean_order = np.vstack((xgb_order, lin_order)).mean(0)
    sample['Hazard'] = mean_order
    sample.to_csv('./submissions/mean.csv', index=False)
def lass_varselect(train, num_vars, target, alpha):
    """Pick variables for *target* via RandomizedLasso.

    Fits a RandomizedLasso with 5 resampling rounds on the ``num_vars``
    columns of ``train`` against ``train[target]`` and returns the
    boolean support mask over those columns.
    """
    model = RandomizedLasso(alpha=alpha, n_resampling=5)
    model.fit(train[num_vars], train[target])
    mask = model.get_support()
    return mask
from sklearn.cross_validation import train_test_split
from scipy import io as sio
from tensorflow.python.framework import ops
from dfs2 import DeepFeatureSelectionNew
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import normalize

# Load the dataset (alternate machine paths kept for reference).
# ourdataB = sio.loadmat("/Volumes/TONY/Regeneron/Data/OriginalData/newDataB_2labels.mat")
ourdataB = sio.loadmat("/Users/xupeng.tong/Documents/Data/OriginalData/newDataB_2labels.mat")
# ourdataB = sio.loadmat("/home/REGENERON/xupeng.tong/newDataB_2labels.mat")

inputX = ourdataB['X']
inputX = normalize(inputX, axis=0)  # column-wise normalization
inputY = ourdataB['Y'][0, :]
columnNames = ourdataB['columnNames']

X_train, X_test, y_train, y_test = train_test_split(inputX, inputY,
                                                    test_size=0.2,
                                                    random_state=42)

# Stability selection: keep the features RandomizedLasso votes for.
# NOTE(review): RandomizedLasso is not imported above -- presumably
# imported earlier in the file; confirm.
randomized_lasso = RandomizedLasso()
randomized_lasso.fit(X_train, y_train)
featureMask = randomized_lasso.get_support()

X_train_lasso = X_train[:, featureMask]
# BUG FIX: the test split was previously masked from X_train.
X_test_lasso = X_test[:, featureMask]

# Names of the selected features (first 100 columns).
# NOTE(review): result is discarded; assumes featureMask spans those
# 100 columns -- confirm.
columnNames[0][:100][featureMask]

# BUG FIX: both matrices were stored under the same 'X_train_lasso'
# key, so the training matrix was silently overwritten.
sio.savemat('RandomLasso-result', {'X_train_lasso': X_train_lasso,
                                   'X_test_lasso': X_test_lasso,
                                   'featureMask': featureMask})
def main():
    # End-to-end pipeline: load claims train/test CSVs, one-hot encode
    # categoricals, mean-impute missing values, build (or load) a
    # dictionary of feature-selection masks, then train a
    # gradient-boosting and a linear regressor and write individual
    # plus rank-averaged submission files.
    print "read train"
    df_train = pd.read_csv('data/train.csv')
    print "read test"
    df_test = pd.read_csv('data/test.csv')
    sample = pd.read_csv('data/sampleSubmission.csv')
    # mixed/categorical columns to one-hot encode
    cats = ['var1', 'var2', 'var3', 'var4', 'var5', 'var6', 'var7', 'var8', 'var9', 'dummy']
    print "convert mixed columns to strings"
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)
    print "one-hot encoding"
    df_train = make_dummies(df_train, cats)  # helper defined elsewhere in file
    df_test = make_dummies(df_test, cats)
    print "fill missing values"
    # mean imputation, column by column
    df_train = df_train.fillna(df_train.mean())
    df_test = df_test.fillna(df_test.mean())
    print "set binary labels"
    # target_class: 1 when a positive loss occurred
    df_train['target_class'] = (df_train.target>0).astype(int)
    classes = df_train.target_class.values
    loss = df_train.target.values
    df_train = df_train.drop(['target', 'id', 'target_class'], axis = 1)
    df_test = df_test.drop(['id'], axis = 1)
    build_features = True #flag, determines whether features will be trained or read from file
    if build_features:
        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func = f_classif, k = 'all')
        selector_reg = SelectKBest(score_func = f_regression, k = 'all')
        selector_clf.fit(df_train.values, classes)
        selector_reg.fit(df_train.values, loss)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        # NaN p-values are treated as "not significant"
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1
        #put feature vectors into dictionary
        feats = {}
        feats['univ_sub01'] = (pvalues_clf<0.1)&(pvalues_reg<0.1)
        feats['univ_sub005'] = (pvalues_clf<0.05)&(pvalues_reg<0.05)
        feats['univ_reg_sub005'] = (pvalues_reg<0.05)
        feats['univ_clf_sub005'] = (pvalues_clf<0.05)
        print "randomized lasso feature selector"
        sel_lasso = RandomizedLasso(random_state = 42, n_jobs = 4).fit(df_train.values, loss)
        #put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()
        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(df_train.values)
        sel_svc = LinearSVC(C=0.1, penalty = "l1", dual = False, random_state = 42).fit(X_sp, classes)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_>0)
        sel_log = LogisticRegression(C=0.01, random_state = 42).fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_>0)
        # majority vote over the masks built above
        feat_sums = np.zeros(len(feats['rand_lasso']))
        for key in feats:
            feat_sums+=feats[key].astype(int)
        feats['ensemble'] = feat_sums>=5 #take features which get 5 or more votes
        joblib.dump(feats, 'features/feats.pkl', compress = 3)
    else:
        feats = joblib.load('features/feats.pkl')
    xtrain = df_train.values
    xtest = df_test.values
    print "fitting gb-regressor"
    reg_gbr = GradientBoostingRegressor(n_estimators = 3000, learning_rate = 0.001, max_depth =5, random_state = 42, verbose = 100, min_samples_leaf=5)
    reg_gbr.fit(xtrain[:, feats['ensemble']], loss)
    gbr_preds = reg_gbr.predict(xtest[:, feats['ensemble']])
    sample['target'] = gbr_preds
    sample.to_csv('submissions/gbm_sub.csv', index = False)
    # Linear model trained on standardized features with the
    # randomized-lasso mask only.
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print "fitting linear regressor"
    reg_lin.fit(xtrain[:, feats['rand_lasso']], loss)
    lin_preds = reg_lin.predict(xtest[:, feats['rand_lasso']])
    gbr_order = gbr_preds.argsort().argsort() #maps smallest value to 0, second-smallest to 1 etc.
    lin_order = lin_preds.argsort().argsort()
    #averaging (of the two models' rank orders)
    mean_order = np.vstack((gbr_order, lin_order)).mean(0)
    sample['target'] = mean_order
    sample.to_csv('submissions/mean_sub.csv', index = False)
def main():
    # End-to-end pipeline (duplicate of the earlier hazard script):
    # load train/test CSVs, one-hot encode categoricals, build (or
    # load) a dictionary of feature-selection masks, then train an
    # XGBoost regressor and a linear regressor and write individual
    # plus rank-averaged submission files.
    print "read train"
    df_train = pd.read_csv('./data/train.csv')
    print "read test"
    df_test = pd.read_csv('./data/test.csv')
    sample = pd.read_csv('./data/sample_submission.csv')
    # mixed/categorical columns to one-hot encode
    cats = ['T1_V4', 'T1_V5', 'T1_V6', 'T1_V7', 'T1_V8', 'T1_V9', 'T1_V11', 'T1_V12', 'T1_V15', 'T1_V16', 'T1_V17', 'T2_V3', 'T2_V5', 'T2_V11', 'T2_V12', 'T2_V13']
    print "convert mixed columns to strings"
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)
    print "one-hot encoding"
    df_train = make_dummies(df_train, cats)  # helper defined elsewhere in file
    df_test = make_dummies(df_test, cats)
    print "set binary labels"
    # hazard_class: 1 when Hazard equals exactly 1
    df_train['hazard_class'] = (df_train.Hazard==1).astype(int)
    classes = df_train.hazard_class.values
    # loss = df_train.target.values
    hazard = df_train.Hazard.values
    df_train = df_train.drop(['Hazard', 'Id', 'hazard_class'], axis = 1)
    df_test = df_test.drop(['Id'], axis = 1)
    build_features = False #flag, determines whether features will be trained or read from file
    if build_features:
        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func = f_classif, k = 'all')
        selector_reg = SelectKBest(score_func = f_regression, k = 'all')
        selector_clf.fit(df_train.values, classes)
        selector_reg.fit(df_train.values, hazard)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        # NaN p-values are treated as "not significant"
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1
        #put feature vectors into dictionary
        feats = {}
        feats['univ_sub01'] = (pvalues_clf<0.1)&(pvalues_reg<0.1)
        feats['univ_sub005'] = (pvalues_clf<0.05)&(pvalues_reg<0.05)
        feats['univ_reg_sub005'] = (pvalues_reg<0.05)
        feats['univ_clf_sub005'] = (pvalues_clf<0.05)
        print "randomized lasso feature selector"
        sel_lasso = RandomizedLasso(random_state = 42).fit(df_train.values, hazard)
        #put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()
        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(df_train.values)
        sel_svc = LinearSVC(C=0.1, penalty = "l1", dual = False, random_state = 42).fit(X_sp, classes)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_>0)
        sel_log = LogisticRegression(C=0.01, random_state = 42).fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_>0)
        # majority vote over the masks built above
        feat_sums = np.zeros(len(feats['rand_lasso']))
        for key in feats:
            feat_sums+=feats[key].astype(int)
        feats['ensemble'] = feat_sums>=5 #take features which get 5 or more votes
        joblib.dump(feats, './features/feats.pkl', compress = 3)
    else:
        feats = joblib.load('features/feats.pkl')
    xtrain = df_train.values
    xtest = df_test.values
    print "fitting xgb-regressor"
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.01
    params["max_depth"] = 7
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.8
    params["min_child_weight"] = 5
    params["silent"] = 1
    plst = list(params.items())
    num_rounds = 600
    #create a train and validation dmatrices
    xgtrain = xgb.DMatrix(xtrain[:,feats['ensemble']], label=hazard)
    xgtest = xgb.DMatrix(xtest[:,feats['ensemble']])
    reg_xgb = xgb.train(plst, xgtrain, num_rounds)
    xgb_preds = reg_xgb.predict(xgtest)
    sample['Hazard'] = xgb_preds
    sample.to_csv('./submissions/xgb.csv', index = False)
    # Linear model trained on standardized features with the
    # randomized-lasso mask only.
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print "fitting linear regressor"
    reg_lin.fit(xtrain[:, feats['rand_lasso']], hazard)
    lin_preds = reg_lin.predict(xtest[:, feats['rand_lasso']])
    sample['Hazard'] = lin_preds
    sample.to_csv('./submissions/lin.csv', index = False)
    xgb_order = xgb_preds.argsort().argsort() #maps smallest value to 0, second-smallest to 1 etc.
    lin_order = lin_preds.argsort().argsort()
    #averaging (of the two models' rank orders)
    mean_order = np.vstack((xgb_order, lin_order)).mean(0)
    sample['Hazard'] = mean_order
    sample.to_csv('./submissions/mean.csv', index = False)
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import normalize

# Load the dataset (alternate machine paths kept for reference).
# ourdataB = sio.loadmat("/Volumes/TONY/Regeneron/Data/OriginalData/newDataB_2labels.mat")
ourdataB = sio.loadmat(
    "/Users/xupeng.tong/Documents/Data/OriginalData/newDataB_2labels.mat")
# ourdataB = sio.loadmat("/home/REGENERON/xupeng.tong/newDataB_2labels.mat")

inputX = ourdataB['X']
inputX = normalize(inputX, axis=0)  # column-wise normalization
inputY = ourdataB['Y'][0, :]
columnNames = ourdataB['columnNames']

X_train, X_test, y_train, y_test = train_test_split(inputX, inputY,
                                                    test_size=0.2,
                                                    random_state=42)

# Stability selection: keep the features RandomizedLasso votes for.
# NOTE(review): RandomizedLasso / train_test_split / sio are not
# imported in this fragment -- presumably imported earlier; confirm.
randomized_lasso = RandomizedLasso()
randomized_lasso.fit(X_train, y_train)
featureMask = randomized_lasso.get_support()

X_train_lasso = X_train[:, featureMask]
# BUG FIX: the test split was previously masked from X_train.
X_test_lasso = X_test[:, featureMask]

# Names of the selected features (first 100 columns).
# NOTE(review): result is discarded; assumes featureMask spans those
# 100 columns -- confirm.
columnNames[0][:100][featureMask]

# BUG FIX: both matrices were stored under the same 'X_train_lasso'
# key, so the training matrix was silently overwritten.
sio.savemat('RandomLasso-result', {'X_train_lasso': X_train_lasso,
                                   'X_test_lasso': X_test_lasso,
                                   'featureMask': featureMask})