Example #1
def featureSelection(train_x, train_y):
    # Create the RFE object and compute a cross-validated score.
    svc = LinearSVC(C=1, class_weight='balanced')
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    lasso = RandomizedLasso()
    lasso.fit(train_x, train_y)
    rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)

    print("Optimal number of features : %d" % rfecv.n_features_)
    rankings = rfecv.ranking_
    lasso_ranks = lasso.get_support()
    lassoFeats = []
    recursiveFeats = []
    shouldUseFeats = []
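    # NOTE: 'feats' below is assumed to be a module-level list of feature names,
    # aligned with the columns of train_x (it is not defined in this snippet).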

    for i in range(len(rankings)):
        if lasso_ranks[i]:
            lassoFeats.append(feats[i])
        if rankings[i] == 1:
            recursiveFeats.append(feats[i])
            if lasso_ranks[i]:
                shouldUseFeats.append(feats[i])
    print('Should use ' + ', '.join(shouldUseFeats))
    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Example #2
def main(train_label, train_feat, modelsdir, selfeat):

    X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
    y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

    X_trains = X_train
    scaler = StandardScaler().fit(X_train)
    X_trains = scaler.transform(X_train)

    # performs feature selection
    featsel_str = ".all-feats"
    if int(selfeat):
        print "Performing feature selection ..."
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic",
                                  verbose=True,
                                  max_iter=1000,
                                  n_jobs=int(config['n_jobs']),
                                  random_state=42,
                                  n_resampling=1000)

        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join([modelsdir, os.path.basename(train_feat)])

        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=42,
                                    n_jobs=int(config['n_jobs']))

    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    #rmse_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print "Performing parameter optimization ... "


    param_distributions = \
      {"n_estimators": [5, 10, 50, 100, 200, 500],
       "max_depth": [3, 2, 1, None],
       "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
       "min_samples_split": sp_randint(1, 11),
       "min_samples_leaf": sp_randint(1, 11),
       "bootstrap": [True, False]}
    # "criterion": ["gini", "entropy"]}

    search = RandomizedSearchCV(estimator,
                                param_distributions,
                                n_iter=int(config['RR_Iter']),
                                scoring=mae_scorer,
                                n_jobs=int(config['n_jobs']),
                                refit=True,
                                cv=KFold(X_train.shape[0],
                                         int(config['folds']),
                                         shuffle=True,
                                         random_state=42),
                                verbose=1,
                                random_state=42)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # ................SHAHAB ........................

    models_dir = sorted(glob.glob(modelsdir + os.sep + "*"))

    estimator2 = ExtraTreesRegressor(
        bootstrap=search.best_params_["bootstrap"],
        max_depth=search.best_params_["max_depth"],
        max_features=search.best_params_["max_features"],
        min_samples_leaf=search.best_params_["min_samples_leaf"],
        min_samples_split=search.best_params_["min_samples_split"],
        n_estimators=search.best_params_["n_estimators"],
        verbose=1,
        random_state=42,
        n_jobs=int(config['n_jobs']))

    print "Train the model with the best parameters ..."
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    joblib.dump(estimator2, modelsdir + "/XRT.pkl")
    joblib.dump(scaler, modelsdir + "/scaler.pkl")
    if int(selfeat):
        joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
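# A minimal sketch of reusing the persisted artifacts at prediction time
# (X_new is an illustrative feature matrix; modelsdir and selfeat as above,
# with the selector applied only if it was fitted):
from sklearn.externals import joblib
model = joblib.load(modelsdir + "/XRT.pkl")
X_new_s = joblib.load(modelsdir + "/scaler.pkl").transform(X_new)
if int(selfeat):
    X_new_s = joblib.load(modelsdir + "/sel_est.pkl").transform(X_new_s)
y_pred_new = model.predict(X_new_s)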
Example #3
def main(train_label, train_feat, modelsdir, selfeat):

  X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
  y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

  X_trains = X_train
  scaler = StandardScaler().fit(X_train)
  X_trains = scaler.transform(X_train)


  # performs feature selection
  featsel_str = ".all-feats"
  if int(selfeat):
    print "Performing feature selection ..."
    # initializes selection estimator
    sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                              n_jobs=int(config['n_jobs']), random_state=42,
                              n_resampling=1000)
  
    sel_est.fit(X_trains, y_train)
    X_trains = sel_est.transform(X_trains)
  
    selected_mask = sel_est.get_support()
    selected_features = sel_est.get_support(indices=True)
  
    sel_feats_path = os.sep.join([modelsdir, os.path.basename(train_feat)])
  
    # saves indices
    np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
    # saves mask
    np.save(sel_feats_path + ".mask", selected_mask)
    featsel_str = ".randcv"


  estimator = ExtraTreesRegressor(random_state=42, n_jobs=int(config['n_jobs']))

  mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
  #rmse_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

  # performs parameter optimization using random search
  print "Performing parameter optimization ... "


  param_distributions = \
    {"n_estimators": [5, 10, 50, 100, 200, 500],
     "max_depth": [3, 2, 1, None],
     "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
     "min_samples_split": sp_randint(1, 11),
     "min_samples_leaf": sp_randint(1, 11),
     "bootstrap": [True, False]}
   # "criterion": ["gini", "entropy"]}

  search = RandomizedSearchCV(estimator, param_distributions,
            n_iter=int(config['RR_Iter']),
            scoring=mae_scorer, n_jobs=int(config['n_jobs']), refit=True,
            cv=KFold(X_train.shape[0], int(config['folds']), shuffle=True, random_state=42),
            verbose=1, random_state=42)
  
  # fits model using best parameters found
  search.fit(X_trains, y_train)

  # ................SHAHAB ........................ 
  
  models_dir = sorted(glob.glob(modelsdir + os.sep + "*"))
  
  estimator2 = ExtraTreesRegressor(bootstrap=search.best_params_["bootstrap"], 
       max_depth=search.best_params_["max_depth"], 
       max_features=search.best_params_["max_features"],
       min_samples_leaf=search.best_params_["min_samples_leaf"], 
       min_samples_split=search.best_params_["min_samples_split"], 
       n_estimators=search.best_params_["n_estimators"], 
       verbose=1, 
       random_state=42, 
       n_jobs=int(config['n_jobs']))

  print "Train the model with the best parameters ..."
  estimator2.fit(X_trains,y_train)

  from sklearn.externals import joblib
  joblib.dump(estimator2, modelsdir + "/XRT.pkl")
  joblib.dump(scaler, modelsdir + "/scaler.pkl")
  if int(selfeat):
    joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
Example #4
def lass_varselect(train, num_vars, target, alpha):
    lass = RandomizedLasso(alpha=alpha, n_resampling=5)
    lass.fit(train[num_vars], train[target])
    return lass.get_support()
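# A hypothetical call (the DataFrame `train`, its numeric columns `num_vars`
# and the 'target' column are illustrative): the function returns a boolean
# support mask over `num_vars`.
support = lass_varselect(train, num_vars, 'target', alpha=0.025)
selected_vars = [v for v, keep in zip(num_vars, support) if keep]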
Example #5
def run(args):
    X_train = np.nan_to_num(
        np.genfromtxt(args.training_data, delimiter=args.delimiter))
    y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1)

    X_trains = X_train
    if args.scale:
        print "Scaling features (mean removal divided by std)..."
        scaler = StandardScaler().fit(X_train)
        X_trains = scaler.transform(X_train)

    # create output folders
    outF = args.output_folder + "/" + os.path.basename(
        args.training_data) + "--FS_" + str(
        args.select_features) + "--i_" + str(args.iterations)
    buildDir(outF)
    maskF = outF + "/masks/"
    buildDir(maskF)
    #evaluation  features  first_experiments  labels  logs  masks  parameters
    #  predictions  src  suca
    paramF = outF + "/parameters/"
    buildDir(paramF)
    #featF = outF+"/features/"
    #buildDir(featF)    

    #evalF = buildDir(outF+"/evaluation")



    #os.path.basename(
    #        args.training_data)]) + featsel_str + "--" + os.path.basename(
    # test_label



    # initializes numpy random seed
    np.random.seed(args.seed)

    # performs feature selection
    featsel_str = ".all-feats"
    if args.select_features:
        print "Performing feature selection ..."
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000,
                                  n_jobs=8, random_state=args.seed,
                                  n_resampling=1000)

        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join(
            #    [".", "masks", os.path.basename(args.training_data)])
            [maskF, os.path.basename(args.training_data)])

        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1)

    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    #rmse_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print "Performing parameter optimization ... "


    param_distributions = \
        {"n_estimators": [5, 10, 50, 100, 200, 500],
         "max_depth": [3, 2, 1, None],
         "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)],
         "min_samples_split": sp_randint(1, 11),
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False]}
         # "criterion": ["gini", "entropy"]}

    search = RandomizedSearchCV(estimator, param_distributions,
                                n_iter=args.iterations,
                                scoring=mae_scorer, n_jobs=8, refit=True,
                                cv=KFold(X_train.shape[0], args.folds, shuffle=True,
                                         random_state=args.seed), verbose=1,
                                random_state=args.seed)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # ................SHAHAB ........................ 
    
    models_dir = sorted(glob.glob(args.models_dir + os.sep + "*"))
    
    estimator2 = ExtraTreesRegressor(bootstrap=search.best_params_["bootstrap"], 
                                     max_depth=search.best_params_["max_depth"], 
                                     max_features=search.best_params_["max_features"],
                                     min_samples_leaf=search.best_params_["min_samples_leaf"], 
                                     min_samples_split=search.best_params_["min_samples_split"], 
                                     n_estimators=search.best_params_["n_estimators"], 
                                     verbose=1, 
                                     random_state=42, 
                                     n_jobs=8)
   
    estimator2.fit(X_trains,y_train)
    from sklearn.externals import joblib
    print "koooonnn %s" % args.models_dir
    joblib.dump(estimator2, args.models_dir+"/XRT.pkl")
    joblib.dump(scaler, args.models_dir+"/scaler.pkl")
    joblib.dump(sel_est, args.models_dir+"/sel_est.pkl")
    
#    print "Kioonnn number of feat:\n", n_feature
    # ................SHAHAB ........................

    print "Best parameters: ", search.best_params_

    # saves parameters on yaml file
    #param_path = os.sep.join([".", "parameters", os.path.basename(
    param_path = os.sep.join([paramF, os.path.basename(
        args.training_data)]) + featsel_str + ".params.yaml"
    param_file = codecs.open(param_path, "w", "utf-8")
    yaml.dump(search.best_params_, stream=param_file)
    testF = os.sep.join([outF, "/test/"])
    buildDir(testF)

    m = y_train.mean()

    # evaluates model on the different test sets
    test_features = sorted(glob.glob(args.test_data + os.sep + "*"))
    test_labels = sorted(glob.glob(args.test_labels + os.sep + "*"))
    for test_feature, test_label in zip(test_features, test_labels):
        print "Evaluating on %s" % test_label
        X_test = np.nan_to_num(
            np.genfromtxt(test_feature, delimiter=args.delimiter))
        y_test = np.clip(np.genfromtxt(test_label), 0, 1)

        X_tests = X_test
        if args.scale:
            X_tests = scaler.transform(X_test)

        if args.select_features:
            X_tests = sel_est.transform(X_tests)

        # gets predictions on test set
        #y_pred = search.predict(X_tests)
        y_pred = np.clip(search.predict(X_tests), 0, 1)

        # evaluates on test set
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print "Test MAE = %2.8f" % mae
        print "Test RMSE = %2.8f" % rmse
        print "Prediction range: [%2.4f, %2.4f]" % (y_pred.min(), y_pred.max())
        # saves evaluation
        testFX = testF + "/" + os.path.basename(test_label)
        buildDir(testFX)
        buildDir(testFX + "/evaluation/")

        eval_path = os.sep.join([testFX, "evaluation", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
            test_label)
        mae_eval = codecs.open(eval_path + ".mae", 'w', "utf-8")
        mae_eval.write(str(mae) + "\n")
        rmse_eval = codecs.open(eval_path + ".rmse", 'w', "utf-8")
        rmse_eval.write(str(rmse) + "\n")

        mu = m * np.ones(y_test.shape[0])  # baseline on test set
        maeB = mean_absolute_error(y_test, mu)
        rmseB = np.sqrt(mean_squared_error(y_test, mu))
        print "Test MAE Baseline= %2.8f" % maeB
        print "Test RMSE Baseline= %2.8f" % rmseB
        mae_eval = codecs.open(eval_path + ".mae.Base", 'w', "utf-8")
        mae_eval.write(str(maeB) + "\n")
        rmse_eval = codecs.open(eval_path + ".rmse.Base", 'w', "utf-8")
        rmse_eval.write(str(rmseB) + "\n")

        # saves predictions
        buildDir(testFX + "/predictions/")
        preds_path = os.sep.join([testFX, "predictions", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
            test_label) + ".preds"
        np.savetxt(preds_path, y_pred, fmt="%2.15f")
Example #6
new_x = pd.DataFrame()
for i in X_ms.columns:
    print(i)
    new = binn._applyBinwoe(X_ms[i], WOE_detail1[WOE_detail1["var_name"] == i])

    new_x = pd.concat([new_x, new], axis=1)

X_ms = new_x
''' Screen the WOE-transformed variables '''
from sklearn.linear_model import RandomizedLasso
from sklearn.linear_model import LassoCV
#### Randomized lasso regression selects variables linearly related to y (stability selection 2)
rla = RandomizedLasso()
rla.fit(X_ms, y)
print(X_ms.columns[rla.get_support()])
X_ms = X_ms[X_ms.columns[rla.get_support()]]
# LassoCV: LASSO is usually used to select features for other methods, to be used within other algorithms.
lassocv = LassoCV()
lassocv.fit(X_ms, y)
print(X_ms.columns[lassocv.coef_ != 0])
X_ms = X_ms[X_ms.columns[lassocv.coef_ != 0]]

### 3. Significance testing:
import statsmodels.api as sm
logit = sm.Logit(y, X_ms)
result = logit.fit()
print(result.summary())

### Removal 2: drop variables whose VIF (variance inflation factor, used to detect multicollinearity) exceeds 10
from statsmodels.stats.outliers_influence import variance_inflation_factor
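# A minimal sketch of the VIF filtering described above (the helper name is
# illustrative; the threshold of 10 follows the comment): iteratively drop the
# column with the largest VIF until every remaining column has VIF <= 10.
def drop_high_vif(df, threshold=10.0):
    cols = list(df.columns)
    while len(cols) > 1:
        vifs = [variance_inflation_factor(df[cols].values, i)
                for i in range(len(cols))]
        worst = vifs.index(max(vifs))
        if vifs[worst] <= threshold:
            break
        cols.pop(worst)
    return df[cols]

X_ms = drop_high_vif(X_ms)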
Example #7
def main():
    print "read train"
    df_train = pd.read_csv('./data/train.csv')
    print "read test"
    df_test = pd.read_csv('./data/test.csv')
    sample = pd.read_csv('./data/sample_submission.csv')

    cats = [
        'T1_V4', 'T1_V5', 'T1_V6', 'T1_V7', 'T1_V8', 'T1_V9', 'T1_V11',
        'T1_V12', 'T1_V15', 'T1_V16', 'T1_V17', 'T2_V3', 'T2_V5', 'T2_V11',
        'T2_V12', 'T2_V13'
    ]

    print "convert mixed columns to strings"
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)

    print "one-hot encoding"
    df_train = make_dummies(df_train, cats)
    df_test = make_dummies(df_test, cats)

    print "set binary labels"
    df_train['hazard_class'] = (df_train.Hazard == 1).astype(int)

    classes = df_train.hazard_class.values
    # loss = df_train.target.values
    hazard = df_train.Hazard.values
    df_train = df_train.drop(['Hazard', 'Id', 'hazard_class'], axis=1)
    df_test = df_test.drop(['Id'], axis=1)

    build_features = False  #flag, determines whether features will be trained or read from file

    if build_features:
        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func=f_classif, k='all')
        selector_reg = SelectKBest(score_func=f_regression, k='all')
        selector_clf.fit(df_train.values, classes)
        selector_reg.fit(df_train.values, hazard)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1

        #put feature vectors into dictionary
        feats = {}
        feats['univ_sub01'] = (pvalues_clf < 0.1) & (pvalues_reg < 0.1)
        feats['univ_sub005'] = (pvalues_clf < 0.05) & (pvalues_reg < 0.05)
        feats['univ_reg_sub005'] = (pvalues_reg < 0.05)
        feats['univ_clf_sub005'] = (pvalues_clf < 0.05)

        print "randomized lasso feature selector"
        sel_lasso = RandomizedLasso(random_state=42).fit(
            df_train.values, hazard)
        #put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()

        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(df_train.values)
        sel_svc = LinearSVC(C=0.1, penalty="l1", dual=False,
                            random_state=42).fit(X_sp, classes)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0)
        sel_log = LogisticRegression(C=0.01,
                                     random_state=42).fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_ > 0)

        feat_sums = np.zeros(len(feats['rand_lasso']))
        for key in feats:
            feat_sums += feats[key].astype(int)
        feats['ensemble'] = feat_sums >= 5  #take features which get 5 or more votes
        joblib.dump(feats, './features/feats.pkl', compress=3)

    else:
        feats = joblib.load('features/feats.pkl')

    xtrain = df_train.values
    xtest = df_test.values

    print "fitting xgb-regressor"
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.01
    params["max_depth"] = 7
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.8
    params["min_child_weight"] = 5
    params["silent"] = 1
    plst = list(params.items())
    num_rounds = 600
    #create a train and validation dmatrices
    xgtrain = xgb.DMatrix(xtrain[:, feats['ensemble']], label=hazard)
    xgtest = xgb.DMatrix(xtest[:, feats['ensemble']])
    reg_xgb = xgb.train(plst, xgtrain, num_rounds)
    xgb_preds = reg_xgb.predict(xgtest)
    sample['Hazard'] = xgb_preds
    sample.to_csv('./submissions/xgb.csv', index=False)
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print "fitting linear regressor"
    reg_lin.fit(xtrain[:, feats['rand_lasso']], hazard)
    lin_preds = reg_lin.predict(xtest[:, feats['rand_lasso']])
    sample['Hazard'] = lin_preds
    sample.to_csv('./submissions/lin.csv', index=False)
    xgb_order = xgb_preds.argsort().argsort()  #maps smallest value to 0, second-smallest to 1 etc.
    lin_order = lin_preds.argsort().argsort()
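    # e.g. preds [0.3, 0.1, 0.9] -> ranks [1, 0, 2]; averaging the two models'
    # rank vectors blends their orderings without mixing raw prediction scales.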
    #averaging
    mean_order = np.vstack((xgb_order, lin_order)).mean(0)
    sample['Hazard'] = mean_order
    sample.to_csv('./submissions/mean.csv', index=False)
Example #8
def lass_varselect(train, num_vars, target, alpha):
    lass = RandomizedLasso(alpha=alpha, n_resampling=5)
    lass.fit(train[num_vars], train[target])
    return lass.get_support()
Example #9
from sklearn.cross_validation import train_test_split
from scipy import io as sio
from tensorflow.python.framework import ops
from dfs2 import DeepFeatureSelectionNew
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import normalize
from sklearn.linear_model import RandomizedLasso

# ourdataB = sio.loadmat("/Volumes/TONY/Regeneron/Data/OriginalData/newDataB_2labels.mat")
ourdataB = sio.loadmat("/Users/xupeng.tong/Documents/Data/OriginalData/newDataB_2labels.mat")
# ourdataB = sio.loadmat("/home/REGENERON/xupeng.tong/newDataB_2labels.mat")

inputX = ourdataB['X']
inputX = normalize(inputX, axis=0)
inputY = ourdataB['Y'][0,:]
columnNames = ourdataB['columnNames']

X_train, X_test, y_train, y_test = train_test_split(inputX, inputY, test_size=0.2, random_state=42)

randomized_lasso = RandomizedLasso()
randomized_lasso.fit(X_train, y_train)

featureMask = randomized_lasso.get_support()

X_train_lasso = X_train[:,featureMask]
X_test_lasso = X_test[:,featureMask]

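# names of the selected features (mask applied to the first 100 column names)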
columnNames[0][:100][featureMask]

sio.savemat('RandomLasso-result', {'X_train_lasso': X_train_lasso, \
            'X_test_lasso': X_test_lasso, 'featureMask': featureMask})
Example #10
def main():
    print "read train"
    df_train = pd.read_csv('data/train.csv')
    print "read test"
    df_test = pd.read_csv('data/test.csv')
    sample = pd.read_csv('data/sampleSubmission.csv')
    
    cats = ['var1', 'var2', 'var3', 'var4', 'var5', 
            'var6', 'var7', 'var8', 'var9', 'dummy']
            
    print "convert mixed columns to strings"
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)
    
    print "one-hot encoding"
    df_train = make_dummies(df_train, cats)
    df_test = make_dummies(df_test, cats)

    print "fill missing values"
    df_train = df_train.fillna(df_train.mean())
    df_test = df_test.fillna(df_test.mean())
    
    print "set binary labels"
    df_train['target_class'] = (df_train.target>0).astype(int)
    
    classes = df_train.target_class.values
    loss = df_train.target.values
    df_train = df_train.drop(['target', 'id', 'target_class'], axis = 1)
    df_test = df_test.drop(['id'], axis = 1)

    build_features = True #flag, determines whether features will be trained or read from file
    
    if build_features:
        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func = f_classif, k = 'all')
        selector_reg = SelectKBest(score_func = f_regression, k = 'all')
        selector_clf.fit(df_train.values, classes)
        selector_reg.fit(df_train.values, loss)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1
        
        #put feature vectors into dictionary
        feats = {}
        feats['univ_sub01'] = (pvalues_clf<0.1)&(pvalues_reg<0.1) 
        feats['univ_sub005'] = (pvalues_clf<0.05)&(pvalues_reg<0.05)
        feats['univ_reg_sub005'] = (pvalues_reg<0.05)
        feats['univ_clf_sub005'] = (pvalues_clf<0.05)
        
        print "randomized lasso feature selector"
        sel_lasso = RandomizedLasso(random_state = 42, n_jobs = 4).fit(df_train.values, loss)
        #put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()
        
        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(df_train.values)
        sel_svc = LinearSVC(C=0.1, penalty = "l1", dual = False, random_state = 42).fit(X_sp, classes)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_>0)
        sel_log = LogisticRegression(C=0.01, random_state = 42).fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_>0)
        
        feat_sums = np.zeros(len(feats['rand_lasso']))
        for key in feats:
            feat_sums+=feats[key].astype(int)
        feats['ensemble'] = feat_sums>=5 #take features which get 5 or more votes
        joblib.dump(feats, 'features/feats.pkl', compress = 3)
    
    else:
        feats = joblib.load('features/feats.pkl')
    
    xtrain = df_train.values
    xtest = df_test.values
    
    print "fitting gb-regressor"
    reg_gbr = GradientBoostingRegressor(n_estimators = 3000, learning_rate = 0.001, max_depth =5, random_state = 42, verbose = 100, min_samples_leaf=5)
    reg_gbr.fit(xtrain[:, feats['ensemble']], loss)
    gbr_preds = reg_gbr.predict(xtest[:, feats['ensemble']])
    sample['target'] = gbr_preds
    sample.to_csv('submissions/gbm_sub.csv', index = False)
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print "fitting linear regressor"
    reg_lin.fit(xtrain[:, feats['rand_lasso']], loss)
    lin_preds = reg_lin.predict(xtest[:, feats['rand_lasso']])
    gbr_order = gbr_preds.argsort().argsort() #maps smallest value to 0, second-smallest to 1 etc.
    lin_order = lin_preds.argsort().argsort()
    #averaging
    mean_order = np.vstack((gbr_order, lin_order)).mean(0)    
    sample['target'] = mean_order
    sample.to_csv('submissions/mean_sub.csv', index = False)
Example #11
def main():
    print "read train"
    df_train = pd.read_csv('./data/train.csv')
    print "read test"
    df_test = pd.read_csv('./data/test.csv')
    sample = pd.read_csv('./data/sample_submission.csv')
    
    cats = ['T1_V4', 'T1_V5', 'T1_V6', 'T1_V7', 'T1_V8', 
            'T1_V9', 'T1_V11', 'T1_V12', 'T1_V15', 'T1_V16',
            'T1_V17', 'T2_V3', 'T2_V5', 'T2_V11', 'T2_V12',
            'T2_V13']
            
    print "convert mixed columns to strings"
    df_train.loc[:, cats] = df_train[cats].applymap(str)
    df_test.loc[:, cats] = df_test[cats].applymap(str)
    
    print "one-hot encoding"
    df_train = make_dummies(df_train, cats)
    df_test = make_dummies(df_test, cats)
    
    print "set binary labels"
    df_train['hazard_class'] = (df_train.Hazard==1).astype(int)
    
    classes = df_train.hazard_class.values
    # loss = df_train.target.values
    hazard = df_train.Hazard.values
    df_train = df_train.drop(['Hazard', 'Id', 'hazard_class'], axis = 1)
    df_test = df_test.drop(['Id'], axis = 1)

    build_features = False #flag, determines whether features will be trained or read from file
    
    if build_features:
        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func = f_classif, k = 'all')
        selector_reg = SelectKBest(score_func = f_regression, k = 'all')
        selector_clf.fit(df_train.values, classes)
        selector_reg.fit(df_train.values, hazard)
        pvalues_clf = selector_clf.pvalues_
        pvalues_reg = selector_reg.pvalues_
        pvalues_clf[np.isnan(pvalues_clf)] = 1
        pvalues_reg[np.isnan(pvalues_reg)] = 1
        
        #put feature vectors into dictionary
        feats = {}
        feats['univ_sub01'] = (pvalues_clf<0.1)&(pvalues_reg<0.1) 
        feats['univ_sub005'] = (pvalues_clf<0.05)&(pvalues_reg<0.05)
        feats['univ_reg_sub005'] = (pvalues_reg<0.05)
        feats['univ_clf_sub005'] = (pvalues_clf<0.05)
        
        print "randomized lasso feature selector"
        sel_lasso = RandomizedLasso(random_state = 42).fit(df_train.values, hazard)
        #put rand_lasso feats into feature dict
        feats['rand_lasso'] = sel_lasso.get_support()
        
        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(df_train.values)
        sel_svc = LinearSVC(C=0.1, penalty = "l1", dual = False, random_state = 42).fit(X_sp, classes)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_>0)
        sel_log = LogisticRegression(C=0.01, random_state = 42).fit(X_sp, classes)
        feats['LogReg'] = np.ravel(sel_log.coef_>0)
        
        feat_sums = np.zeros(len(feats['rand_lasso']))
        for key in feats:
            feat_sums+=feats[key].astype(int)
        feats['ensemble'] = feat_sums>=5 #take features which get 5 or more votes
        joblib.dump(feats, './features/feats.pkl', compress = 3)
    
    else:
        feats = joblib.load('features/feats.pkl')

    xtrain = df_train.values
    xtest = df_test.values

    print "fitting xgb-regressor"
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.01
    params["max_depth"] = 7
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.8
    params["min_child_weight"] = 5
    params["silent"] = 1
    plst = list(params.items())
    num_rounds = 600
    #create a train and validation dmatrices 
    xgtrain = xgb.DMatrix(xtrain[:,feats['ensemble']], label=hazard)
    xgtest = xgb.DMatrix(xtest[:,feats['ensemble']])
    reg_xgb = xgb.train(plst, xgtrain, num_rounds)
    xgb_preds = reg_xgb.predict(xgtest)
    sample['Hazard'] = xgb_preds
    sample.to_csv('./submissions/xgb.csv', index = False)
    reg_lin = LinearRegression()
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)
    print "fitting linear regressor"
    reg_lin.fit(xtrain[:, feats['rand_lasso']], hazard)
    lin_preds = reg_lin.predict(xtest[:, feats['rand_lasso']])
    sample['Hazard'] = lin_preds
    sample.to_csv('./submissions/lin.csv', index = False)
    xgb_order = xgb_preds.argsort().argsort() #maps smallest value to 0, second-smallest to 1 etc.
    lin_order = lin_preds.argsort().argsort()
    #averaging
    mean_order = np.vstack((xgb_order, lin_order)).mean(0)    
    sample['Hazard'] = mean_order
    sample.to_csv('./submissions/mean.csv', index = False)
Example #12
import numpy as np
from scipy import io as sio
from sklearn.cross_validation import train_test_split
from sklearn.datasets import make_classification
from sklearn.preprocessing import normalize
from sklearn.linear_model import RandomizedLasso

# ourdataB = sio.loadmat("/Volumes/TONY/Regeneron/Data/OriginalData/newDataB_2labels.mat")
ourdataB = sio.loadmat(
    "/Users/xupeng.tong/Documents/Data/OriginalData/newDataB_2labels.mat")
# ourdataB = sio.loadmat("/home/REGENERON/xupeng.tong/newDataB_2labels.mat")

inputX = ourdataB['X']
inputX = normalize(inputX, axis=0)
inputY = ourdataB['Y'][0, :]
columnNames = ourdataB['columnNames']

X_train, X_test, y_train, y_test = train_test_split(inputX,
                                                    inputY,
                                                    test_size=0.2,
                                                    random_state=42)

randomized_lasso = RandomizedLasso()
randomized_lasso.fit(X_train, y_train)

featureMask = randomized_lasso.get_support()

X_train_lasso = X_train[:, featureMask]
X_test_lasso = X_test[:, featureMask]

columnNames[0][:100][featureMask]

sio.savemat('RandomLasso-result', {'X_train_lasso': X_train_lasso, \
   'X_test_lasso': X_test_lasso, 'featureMask': featureMask})