def test_lasso():
    """Rank features with stability selection (RandomizedLasso) per CV fold.

    Reads the preprocessing result from HDF5, fits LassoLarsCV once on the
    full data to pick an alpha range, then fits RandomizedLasso on each
    StratifiedKFold test split and writes one feature-importance CSV per
    fold.
    """
    alphaNum = 6  # number of alpha candidates in [alpha_max, 0.1 * alpha_max]
    print('*' * 80)
    inputData = pd.read_hdf(
        './rise_DM_fraud/dev1/preprocessing/preprocessing_result.h5')
    target = 'fpd'
    Y = inputData[target]
    X = inputData.drop(target, axis=1)
    # RandomizedLasso cannot handle NaNs; fill with a sentinel value.
    X.fillna(-999, inplace=True)
    # Cross-validated LARS gives the largest useful alpha for this data.
    lars_cv = linear_model.LassoLarsCV(cv=6).fit(X, Y)
    skf = cv.StratifiedKFold(y=Y, n_folds=5)
    for i, (_, test_index) in enumerate(skf):
        print('Fold', i)
        test_X = X.iloc[test_index, :]
        test_Y = Y[test_index]
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0],
                             alphaNum)
        clf = linear_model.RandomizedLasso(alphas, random_state=33,
                                           n_jobs=1).fit(test_X, test_Y)
        featureImportance = pd.DataFrame(
            sorted(zip(map(lambda x: round(x, 4), clf.scores_), X.columns),
                   reverse=True),
            columns=['importance', 'name'])
        featureImportance.to_csv(
            './rise_DM_fraud/dev1/feature_ranking/feature_importance_lasso_fold_%d.csv'
            % (i + 1),
            index=False)
def Random_Lasso_reg(X, y, alpha):
    """Fit a RandomizedLasso at the given alpha and return stability scores.

    TODO: modify this to use GridSearchCV for alpha selection; see
    https://stackoverflow.com/questions/45857274/interpreting-ridge-regression-in-gridsearchcv
    """
    model = linear_model.RandomizedLasso(alpha=alpha)
    model.fit(X, y)
    return model.scores_
def perform_randomizedLasso(df, target):
    """Run stability selection over a log-spaced alpha grid.

    Returns the per-feature selection scores from RandomizedLasso.
    """
    alpha_grid = np.logspace(-3, 3, 100)  # candidates from 1e-3 to 1e3
    selector = linear_model.RandomizedLasso(alpha=alpha_grid,
                                            sample_fraction=0.5,
                                            n_resampling=500,
                                            normalize=False,
                                            random_state=36,
                                            scaling=0.5)
    selector.fit(df, target)
    return selector.scores_  # , selector.all_scores_
def RandomizedLassoRegression(np_X, np_y):
    """Shuffle the data (dense and sparse views in lockstep) and fit a
    RandomizedLasso; return the per-feature stability scores."""
    dense = np_X
    labels = np_y
    # Keep a sparse copy shuffled identically to the dense matrix.
    sparse = coo_matrix(dense)
    dense, sparse, labels = shuffle(dense, sparse, labels, random_state=0)
    model = linear_model.RandomizedLasso(n_jobs=1, n_resampling=500)
    model.fit(dense, labels)
    return model.scores_
def rLasso(X_scaled, Y, labels, X_test):
    """Feature selection via RandomizedLasso scores averaged over alphas.

    Fits RandomizedLasso at each alpha in a small grid, averages the
    stability scores, and (when the mean score is non-negligible) keeps only
    the columns scoring above the mean in X_scaled / X_test / labels.

    NOTE(review): the target is taken as Y[:, 1] — assumes Y is 2-D with the
    label in the second column; confirm against callers.
    Returns the (possibly column-reduced) (X_scaled, Y, labels, X_test).
    """
    print("Features sorted by their score for Randomized Lasso:")
    scores = np.zeros(X_scaled.shape[1])
    alphas = [0.003, 0.002]  # , 0.001]
    for a in alphas:
        print("Trying alpha %f" % (a))
        randomized_lasso = linear_model.RandomizedLasso(
            n_jobs=1, alpha=a, sample_fraction=0.25, verbose=True)
        printSizes('rlasso', X_scaled, Y, X_test)
        randomized_lasso.fit(X_scaled, Y[:, 1])
        scores = scores + randomized_lasso.scores_
        if debug:
            for score, label in sorted(
                    zip(map(lambda x: round(x, 6), randomized_lasso.scores_),
                        labels),
                    reverse=True):
                if score > 0.015:
                    print("%s: %f" % (label, score))
    scores = scores / len(alphas)  # get mean values
    meanImportance = np.mean(scores)
    print("Average score for variable = %f" % (meanImportance))
    if meanImportance > 0.00001:
        # NOTE(review): both branches set the same threshold; the size split
        # looks like a leftover tuning knob — confirm intent before changing.
        if X_scaled.shape[1] > 100:
            thresh = 1.0
        else:
            thresh = 1.0
        keptIndices = np.where(scores > thresh * meanImportance)
        print("Top Scores for Random Lasso")
        if debug:
            # Python 2 tuple-parameter lambdas (lambda (a, b): ...) are a
            # SyntaxError in Python 3; index the pair instead.
            for score, label in sorted(zip(scores, labels),
                                       key=lambda pair: pair[0],
                                       reverse=True):
                if score > meanImportance:
                    print("%s: %f" % (label, score))
        printSizes('rlassoBeforeCut', X_scaled, Y, X_test)
        labels = labels[keptIndices]
        X_scaled = np.squeeze(X_scaled[:, keptIndices])
        X_test = np.squeeze(X_test[:, keptIndices])
        printSizes('rlassoAfterCut', X_scaled, Y, X_test)
    else:
        print("Not useful, aborting")
    print("New size of X")
    print(X_scaled.shape)
    return (X_scaled, Y, labels, X_test)
# Synthetic regression benchmark (Friedman-style): compare linear models,
# RandomizedLasso, RFE, a random forest and f_regression as feature rankers.
size = 750
X = np.random.uniform(0, 1, (size, 14))
print(X[:, 1])
# Target depends only on features 0-4; the rest are noise/duplicates.
# NOTE(review): np.random.normal(0, 1) here is a single scalar added to every
# sample, not per-sample noise — confirm this is intended.
Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5)**2 +
     10 * X[:, 3] + 5 * X[:, 4]**5 + np.random.normal(0, 1))
# Columns 10-13 become noisy copies of columns 0-3 (correlated features).
X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))
lin = linear_model.LinearRegression()
lin.fit(X, Y)
ridge = Ridge()  # alpha=0.1
ridge.fit(X, Y)
lasso = linear_model.Lasso()  # alpha=0.1
lasso.fit(X, Y)
randLasso = linear_model.RandomizedLasso()
randLasso.fit(X, Y)
rfe = feature_selection.RFE(estimator=linear_model.LinearRegression())
rfe.fit(X=X, y=Y)
rfr = RandomForestRegressor()
rfr.fit(X, Y)
freg = feature_selection.f_regression(X, Y)
# Normalize absolute coefficients to [0, 1] by each model's maximum.
ans_lin = abs(lin.coef_)
mx = [max(ans_lin)] * 14
ans_lin = ans_lin / mx
ans_ridge = abs(ridge.coef_)
mx = [max(ans_ridge)] * 14
ans_ridge = ans_ridge / mx
ans_lasso = abs(lasso.coef_)
def feature_select(feature_ranking_choice, ranking_method, X_train, Y_train,
                   targetName, featureRank_folder, featureNum, fill_missing,
                   stable_test_rf=False):
    """Rank features (lasso stability selection or random forest) and return
    X_train reduced to the top *featureNum* features.

    feature_ranking_choice: 0 = no selection, 1 = compute ranking now,
    2 = load a previously saved ranking CSV.
    ranking_method: 'lasso' or 'rf'.
    fill_missing: when True, the -999-filled matrix is what gets returned.
    """
    # lasso configuration
    alphaNum = 6
    # random forest configuration
    nTrees = 1000
    njobs = 4
    maxFeaturePercent = 0.1
    nFeaturePlot = 30
    featureRanking_la = featureRank_folder + '/feature_importance_lasso.csv'
    afterSelectData_la = featureRank_folder + '/dataAfterSelect_lasso.h5'
    featureRanking_rf = featureRank_folder + '/feature_importance_rf.csv'
    afterSelectData_rf = featureRank_folder + '/dataAfterSelect_rf.h5'
    print('*' * 80)
    print('running feature_selection.py')
    if feature_ranking_choice == 0:
        print('no feature ranking or selection!')
        return X_train
    elif feature_ranking_choice == 2:
        print('previous feature ranking and selection result is loaded!')
        if ranking_method == 'lasso':
            featureNames = pd.read_csv(featureRanking_la)['name'][:featureNum]
        elif ranking_method == 'rf':
            featureNames = pd.read_csv(featureRanking_rf)['name'][:featureNum]
        return X_train[featureNames]
    elif feature_ranking_choice == 1:
        # Work on a copy so the caller's frame is not mutated by the fill.
        X_train_temp = X_train.copy()
        if X_train_temp.isnull().sum().sum() > 0:
            X_train_temp.fillna(-999, inplace=True)
            print('missing data is temporarily filled by -999 in the feature selection process!')
        #### stability selection: L1-based feature selection
        if ranking_method == 'lasso':
            ## find best alpha through cross-valiation
            # lars_cv = linear_model.LassoLarsCV(cv=6).fit(X_train_temp,Y_train)
            ## choose the alpha candidates
            # alphas = np.linspace(lars_cv.alphas_[0], .1*lars_cv.alphas_[0], alphaNum)
            ## obtain scores of features coming with different alphas and combine
            ## them, max() used across all alphas's score
            # clf1 = linear_model.RandomizedLasso(alpha=alphas, random_state=42,n_jobs=1).fit(X_train_temp,Y_train)
            clf1 = linear_model.RandomizedLasso(alpha='aic',
                                                random_state=33,
                                                n_jobs=1,
                                                verbose=True).fit(
                                                    X_train_temp, Y_train)
            ## sort the scores of features (list() so Python 3's lazy
            ## zip/map are materialized before pandas sees them)
            featureImportance = pd.DataFrame(
                list(zip(X_train_temp.columns,
                         [round(x, 4) for x in clf1.scores_])),
                columns=['name', 'importance'])
            featureImportance.sort_values(by='importance',
                                          ascending=False,
                                          inplace=True)
            featureImportance.index = range(featureImportance.shape[0])
            featureImportance.to_csv(featureRanking_la, index=False)
            # .ix was removed from pandas; .loc performs the intended
            # label-based column selection of the top-ranked names.
            source = X_train_temp if fill_missing else X_train
            returnData = pd.concat([
                Y_train,
                source.loc[:, featureImportance.iloc[:featureNum, 0]]
            ], axis=1)
            print('Lasso feature ranking finish!')
        elif ranking_method == 'rf':
            if stable_test_rf:
                # Optional stability diagnostics for the rf ranking.
                test_rf(X_train_temp, Y_train, nTrees, njobs,
                        maxFeaturePercent, featureRank_folder, nFeaturePlot)
            featureImportanceAndName = get_feature_importance_rf(
                X_train_temp, Y_train, nTrees, njobs, maxFeaturePercent)
            featureImportanceAndName.sort_values(by='importance',
                                                 ascending=False,
                                                 inplace=True)
            featureImportanceAndName.to_csv(featureRanking_rf, index=False)
            source = X_train_temp if fill_missing else X_train
            returnData = pd.concat([
                Y_train,
                source.loc[:, featureImportanceAndName['name'][:featureNum]]
            ], axis=1)
            print('RF feature ranking finish!')
        print('feature ranking done!')
        returnX = returnData.drop(targetName, axis=1, inplace=False)
        return returnX
#features.drop('zipcode',1,inplace=True) #features.drop('lat',1,inplace=True) #features.drop('long',1,inplace=True) scalerNorm = Normalizer(norm='l2') scalerStandard = StandardScaler().fit(features) #scalerX.fit(features) #features = scalerX.transform(features) features = scalerStandard.transform(features) print(features.shape) Lars_cv = linearmodels.LarsCV(cv=6).fit(features, y) Lasso_cv = linearmodels.LassoCV(cv=6).fit(features, y) alphas = np.linspace(Lars_cv.alphas_[0], .1 * Lars_cv.alphas_[0], 6) Randomized_lasso = linearmodels.RandomizedLasso(alpha=alphas, random_state=42) linear_regression = linearmodels.LinearRegression() linear_SVR = LinearSVR(loss='squared_epsilon_insensitive') featureselector_Lars = feature_selection.SelectFromModel(Lars_cv, prefit=True) featureselector_Lasso = feature_selection.SelectFromModel(Lasso_cv, prefit=True) featureselector_RLasso = Randomized_lasso.fit(features, y) print(Lars_cv.coef_) print(Lasso_cv.coef_) print(Randomized_lasso.scores_) scoreoffeature = pd.DataFrame( [Lars_cv.coef_, Lasso_cv.coef_, Randomized_lasso.scores_],
# Normalize X row-wise (L2); the StandardScaler alternative is left
# commented out below.
scalerX = preprocessing.Normalizer(norm='l2')
standardScalerX = preprocessing.StandardScaler()
#scalerYa = preprocessing.Normalizer(norm='l2')
#scalerYg = preprocessing.Normalizer(norm='l2')
#scalerYf = preprocessing.Normalizer(norm='l2')
scalerX.fit(X)
X=scalerX.transform(X)
#X=standardScalerX.fit_transform(X)
print(X.shape)
# Cross-validated LassoLars to get the alpha scale.
# NOTE(review): `py` is presumably numpy imported under that alias —
# confirm at the top of the file.
lars_cv = linear_model.LassoLarsCV(cv=6).fit(X, yf)
alphas = py.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
treeScore = linear_model.RandomizedLasso(alpha=alphas,random_state=42)
#treeScore.fit(X, yf)
#X=selector.fit_transform(X,yf)
#trees = ensemble.ExtraTreesRegressor(100).fit(X, yf)
print(lars_cv.coef_)
#lars_cv.coef_ = py.abs(lars_cv.coef_)
# Keep features whose LassoLars coefficients pass SelectFromModel's
# default threshold.
treeSelector = feature_selection.SelectFromModel(lars_cv,prefit=True)
#treeSelector = feature_selection.SelectFromModel(treeScoreSaver,prefit=True,threshold=0.5)
print(treeSelector.get_params())
# NOTE(review): the first line below is the tail of a print( call whose
# opening lies before this chunk.
    "Features sorted by score, using {} resamplings: ".format(resamplings))
feature_list = sorted(zip(map(lambda x: round(x, 4), rlogit.scores_), cols),
                      reverse=True)
for f in feature_list[0:25]:  # Adjust this if last feature output is nonzero
    print("{}:\t\t\t{:.2f}".format(f[1], f[0]))

# ### Entire dataset, LASSO for age as interest variable.

# In[68]:

X, y = df[cols], df.AGE
import warnings  # sklearn is using a deprecated rand function here,
with warnings.catch_warnings():  # and warnings clutter output
    warnings.simplefilter("ignore")
    resamplings = 2000
    rlasso = linear_model.RandomizedLasso(n_resampling=resamplings)
    rlasso.fit(X, y)
print(
    "Features sorted by score, using {} resamplings: ".format(resamplings))
feature_list = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), cols),
                      reverse=True)
for f in feature_list[0:50]:  # Adjust this if last feature output is nonzero
    print("{}:\t\t\t{:.2f}".format(f[1], f[0]))

# In[ ]:
# Load linear-regression pipeline output and rank features for predicting
# AGE with RandomizedLasso.
data = pd.read_csv('../data/pipeline-full/ya-oa-full-linreg-02-24.csv')
cols = list(data.columns.values)
# Drop identifier / label / target columns from the feature set.
cols.remove('SUBJECT')
cols.remove('CLASS')
cols.remove('AGE')
cols.remove('SEX')
X = data[cols]
y = data.AGE
# NOTE(review): a negative alpha is not meaningful for lasso — confirm this
# value; sklearn may reject or misbehave on it.
alpha = -15.4
resamplings = 8
rlasso = linear_model.RandomizedLasso(alpha=alpha, n_resampling=resamplings)
rlasso.fit(X, y)
print("Features sorted by score, using {} resamplings: ".format(resamplings))
feature_list = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), cols),
                      reverse=True)
for f in feature_list:
    print(f)
def feature_select(feature_ranking_choice, ranking_method, inputData,
                   targetName, featureRank_folder, featureNum):
    """Rank features by lasso stability selection or a random-forest
    ensemble, persist the reduced dataset to HDF5, and return it.

    feature_ranking_choice: 0 = passthrough, 1 = compute ranking now,
    2 = load a previously saved result from HDF5.
    ranking_method: 'lasso' or 'rf'.
    """
    alphaNum = 6    # size of the alpha grid for RandomizedLasso
    nTrees = 1000   # trees per forest in the rf ranking ensemble
    featureRanking_la = featureRank_folder + '/feature_importance_lasso.csv'
    afterSelectData_la = featureRank_folder + '/dataAfterSelect_lasso.h5'
    featureRanking_rf = featureRank_folder + '/feature_importance_rf.csv'
    afterSelectData_rf = featureRank_folder + '/dataAfterSelect_rf.h5'
    print('*' * 80)
    if feature_ranking_choice == 0:
        print('no feature ranking or selection!')
        return inputData
    elif feature_ranking_choice == 2:
        print('previous feature ranking and selection result is loaded!')
        if ranking_method == 'lasso':
            return pd.read_hdf(afterSelectData_la, 'dataAfterSelect')
        elif ranking_method == 'rf':
            return pd.read_hdf(afterSelectData_rf, 'dataAfterSelect')
    elif feature_ranking_choice == 1:
        if inputData.isnull().sum().sum() > 0:
            inputData.fillna(-999, inplace=True)
            print('missing data is temporarily filled by -999 in the feature selection process!')
        Y = inputData[targetName]
        X = inputData.drop([targetName], axis=1, inplace=False)
        #### L1-based feature selection
        if ranking_method == 'lasso':
            ## find best alpha through cross-valiation
            lars_cv = linear_model.LassoLarsCV(cv=6).fit(X, Y)
            ## choose the alpha candidates
            alphas = np.linspace(lars_cv.alphas_[0],
                                 .1 * lars_cv.alphas_[0], alphaNum)
            ## obtain scores of features coming with different alphas and
            ## combine them, max() used across all alphas's score
            clf1 = linear_model.RandomizedLasso(alpha=alphas,
                                                random_state=42,
                                                n_jobs=1).fit(X, Y)
            ## sort the scores of features (column 0 = score, 1 = name)
            featureImportance = pd.DataFrame(
                sorted(zip(map(lambda x: round(x, 4), clf1.scores_),
                           X.columns),
                       reverse=True))
            featureImportance.to_csv(featureRanking_la, index=False)
            # BUG FIX: original read X.ix[:featureImportance.iloc[:featureNum, 1]]
            # (missing comma -> row slice); select the top-ranked COLUMNS,
            # as the rf branch does. .loc replaces the removed .ix.
            selected = pd.concat(
                [Y, X.loc[:, featureImportance.iloc[:featureNum, 1]]],
                axis=1)
            store = pd.HDFStore(afterSelectData_la)
            store['dataAfterSelect'] = selected
            store.close()  # flush the HDF5 file (original leaked the handle)
            print('Lasso feature ranking finish!')
            print('feature ranking done!')
            # BUG FIX: the original lasso path fell off the end and returned
            # None; return the selected data like the rf path does.
            return selected
        elif ranking_method == 'rf':
            ## six forests over {gini, entropy} x max_depth {3, 5, 7}
            forests = [
                ensemble.RandomForestClassifier(n_estimators=nTrees,
                                                criterion=criterion,
                                                max_features=0.1,
                                                max_depth=depth,
                                                n_jobs=4,
                                                verbose=1)
                for criterion in ('gini', 'entropy')
                for depth in (3, 5, 7)
            ]
            ## train each forest and average the importances
            for rf in forests:
                rf.fit(X, Y)
            featureImportanceAverage = sum(
                rf.feature_importances_ for rf in forests) / len(forests)
            sortedFeatureImportance = pd.DataFrame(
                featureImportanceAverage).sort_values(by=0, ascending=False)
            sortedFeatureNames = X.columns[sortedFeatureImportance.index]
            sortedFeatureImportance.index = range(X.shape[1])
            featureImportance = pd.concat([
                pd.DataFrame(sortedFeatureImportance),
                pd.DataFrame(sortedFeatureNames)
            ], axis=1)
            featureImportance.to_csv(featureRanking_rf, index=False)
            selected = pd.concat(
                [Y, X.loc[:, featureImportance.iloc[:featureNum, 1]]],
                axis=1)
            store = pd.HDFStore(afterSelectData_rf)
            store['dataAfterSelect'] = selected
            store.close()  # flush the HDF5 file (original leaked the handle)
            print('RF feature ranking finish!')
            print('feature ranking done!')
            return selected
def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y,
                     model_type, out_dir=None, class_weight=None):
    """Train one sklearn linear/SVM model named by *model_type*, log timing,
    save rounded test predictions, and return test-set accuracy.

    dev_x / dev_y are accepted for interface compatibility but unused here.
    class_weight is forwarded only to the classifiers that accept it.
    Raises NotImplementedError for an unrecognized model_type.
    """
    from sklearn import linear_model, svm
    totalTime = 0
    startTrainTime = time()
    logger.info("Start training...")
    # Model names that take class_weight in their constructor.
    cw_names = {
        'LogisticRegression', 'LogisticRegressionCV',
        'PassiveAggressiveClassifier', 'Perceptron', 'RidgeClassifier',
        'RidgeClassifierCV', 'SGDClassifier', 'logistic_regression_path',
    }
    # Model names constructed with no arguments. getattr keeps the lookup
    # lazy, exactly like the original elif chain (the attribute is only
    # touched for the requested model_type).
    plain_names = {
        'ARDRegression', 'BayesianRidge', 'ElasticNet', 'ElasticNetCV',
        'HuberRegressor', 'Lars', 'LarsCV', 'Lasso', 'LassoCV', 'LassoLars',
        'LassoLarsCV', 'LassoLarsIC', 'LinearRegression', 'MultiTaskLasso',
        'MultiTaskElasticNet', 'MultiTaskLassoCV', 'MultiTaskElasticNetCV',
        'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV',
        'PassiveAggressiveRegressor', 'RandomizedLasso',
        'RandomizedLogisticRegression', 'RANSACRegressor', 'Ridge',
        'RidgeCV', 'SGDRegressor', 'TheilSenRegressor', 'lars_path',
        'lasso_path', 'lasso_stability_path', 'orthogonal_mp',
        'orthogonal_mp_gram',
    }
    if model_type == 'SVC':
        model = svm.SVC(class_weight=class_weight,
                        degree=3).fit(train_x, train_y)
    elif model_type == 'LinearSVC':
        model = svm.LinearSVC(class_weight=class_weight).fit(train_x, train_y)
    elif model_type in cw_names:
        model = getattr(linear_model, model_type)(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type in plain_names:
        model = getattr(linear_model, model_type)().fit(train_x, train_y)
    else:
        raise NotImplementedError('Model not implemented')
    logger.info("Finished training.")
    endTrainTime = time()
    trainTime = endTrainTime - startTrainTime
    logger.info("Training time : %d seconds" % trainTime)
    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")
    endTestTime = time()
    testTime = endTestTime - endTrainTime
    logger.info("Testing time : %d seconds" % testTime)
    totalTime += trainTime + testTime
    # Round regression outputs so accuracy_score applies uniformly.
    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)
    # NOTE(review): out_dir defaults to None but is required here — confirm
    # callers always pass it.
    np.savetxt(out_dir + '/preds/best_test_pred' + '.txt', test_pred_y,
               fmt='%i')
    logger.info('[TRAIN] Acc: %.3f' % (accuracy_score(train_y, train_pred_y)))
    logger.info('[TEST] Acc: %.3f' % (accuracy_score(test_y, test_pred_y)))
    return accuracy_score(test_y, test_pred_y)