Exemplo n.º 1
0
# Target variable: SalePrice from the training frame (house_train is a
# DataFrame loaded outside this chunk).
y_train = house_train['SalePrice']
# Visualize the target before and after the log transform.
# FIX: sns.distplot() is deprecated since seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) draws the equivalent histogram + KDE overlay.
sns.histplot(y_train, kde=True)
# log1p compresses the right-skewed price tail so linear models fit better.
y_trans = np.log1p(y_train)
sns.histplot(y_trans, kde=True)

# Scorer wrapping the project's log_rmse; greater_is_better=False makes
# grid search negate the score so lower log-RMSE ranks higher.
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

# Feature selectors (comment above said "union of 3" — only two are
# visible in this chunk; the third is presumably below the cut).
# Selector 1: L1-regularized linear model — zeroed coefficients drop features.
lasso_estimator = linear_model.Lasso()
lasso_grid = {'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5]}
lasso_selector = get_best_model(lasso_estimator,
                                lasso_grid,
                                X_train,
                                y_trans,
                                scoring=scoring)
plot_feature_importances(lasso_selector, X_train, 50)
important_features_lasso = get_important_features(lasso_selector, X_train)

# Selector 2: random forest importances, tuned over a small grid.
rf_estimator = ensemble.RandomForestRegressor(random_state=100)
rf_grid = {
    'n_estimators': list(range(100, 501, 200)),  # 100, 300, 500
    'max_features': [14, 16, 18, 20],
    'max_depth': [3, 5, 7]
}
rf_selector = get_best_model(rf_estimator,
                             rf_grid,
                             X_train,
                             y_trans,
                             scoring=scoring)
plot_feature_importances(rf_selector, X_train, 50)
important_features_rf = get_important_features(rf_selector, X_train)
Exemplo n.º 2
0
# Features: one-hot-encoded categoricals from the preprocessed frame;
# target: Survived labels from the raw training frame.
X_train = utils.ohe(titanic_train1, cat_features)
y_train = titanic_train['Survived']

#embedded feature selectors
# Selector 1: random forest — grid-search the model, then keep features
# whose importance exceeds the mean importance (threshold='mean').
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100))  # 1, 101, 201
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
                                                   X_train, y_train)
# prefit=True wraps the already-fitted estimator instead of refitting it.
embedded_selector = feature_selection.SelectFromModel(rf_final_estimator,
                                                      prefit=True,
                                                      threshold='mean')
X_train1 = embedded_selector.transform(X_train)
utils.plot_feature_importances(rf_final_estimator, X_train)

# Selector 2: gradient boosting, same pattern.
# NOTE(review): this overwrites the X_train1 produced by the random-forest
# selector above — the two selectors are independent demos, not a pipeline.
gb_estimator = ensemble.GradientBoostingClassifier()
gb_grid = {
    'max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 300, 100)),  # 50, 150, 250
    'learning_rate': [0.001, 0.1, 1.0]
}
gb_final_estimator = cutils.grid_search_best_model(gb_estimator, gb_grid,
                                                   X_train, y_train)
embedded_selector = feature_selection.SelectFromModel(gb_final_estimator,
                                                      prefit=True,
                                                      threshold='mean')
X_train1 = embedded_selector.transform(X_train)
utils.plot_feature_importances(gb_final_estimator, X_train)
Exemplo n.º 3
0
    house1[imputable_cont_features])
# Summary of the imputed frame (house1 is built by a statement cut off
# above this chunk — its construction is not visible here).
house1.info()

# One-hot encode the imputable categorical features.
house2 = utils.ohe(house1, imputable_cat_features)

# Standardize all columns; rebuild a DataFrame so the column names
# survive the scaler's ndarray output.
scaler = utils.get_scaler(house2)
house3 = scaler.transform(house2)
house3 = pd.DataFrame(house3, columns=house2.columns)

# house3 apparently stacks train rows first, so the first
# house_train.shape[0] rows are the training portion — TODO confirm
# against how house1 was assembled. Target is the raw SalePrice.
X_train = house3[:house_train.shape[0]]
y_train = house_train['SalePrice']

# Embedded selection via L1 regularization (default alpha=1.0):
# zeroed coefficients mark discarded features.
lasso_selector = linear_model.Lasso()
lasso_selector.fit(X_train, y_train)
print(lasso_selector.coef_)
utils.plot_feature_importances(lasso_selector, X_train, 40)

X_train1 = utils.select_features(lasso_selector, X_train)

# Inspect correlation among the selected features, then reduce to the
# number of PCA components explaining 95% of the variance.
utils.corr_heatmap(X_train1)
lpca = decomposition.PCA(0.95)
lpca.fit(X_train1)
print(np.cumsum(lpca.explained_variance_ratio_))
pca_data = lpca.transform(X_train1)
print(pca_data.shape)

# 2-D t-SNE embedding of the PCA output, visualized against the
# regression target (the helper presumably plots target as a 3rd axis).
tsne = manifold.TSNE(n_components=2)
tsne_data = tsne.fit_transform(pca_data)
rutils.plot_data_3d_regression(tsne_data, y_train)

# Scorer wrapping the project's log_rmse (negated so lower is better).
# NOTE(review): unused by the classification demos below — presumably
# belongs to regression code elsewhere in the file.
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
#filter zero-variance features
# Default threshold=0.0 drops only constant columns.
variance = feature_selection.VarianceThreshold()
train2 = variance.fit_transform(train1)

#embedded feature selection
# Grid-search a random forest, then keep features whose importance
# exceeds the mean importance (threshold='mean').
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100))  # 1, 101, 201
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
                                                   train1, y)
# prefit=True wraps the already-fitted estimator instead of refitting it.
embedded_selector = feature_selection.SelectFromModel(rf_final_estimator,
                                                      prefit=True,
                                                      threshold='mean')
utils.plot_feature_importances(rf_final_estimator, train1, cutoff=50)
# NOTE(review): each technique below overwrites train2 — these are
# independent demos of selection methods, not a sequential pipeline.
train2 = embedded_selector.transform(train1)

#statistical feature selection
# ANOVA F-test between each feature and the class label; keep the top 20.
statistical_selector = feature_selection.SelectKBest(
    feature_selection.f_classif, k=20)
train2 = statistical_selector.fit_transform(train1, y)
print(statistical_selector.scores_)

#recursive feature elimination(rfe)
# Repeatedly refit, dropping the 5 weakest features per round (step=5)
# until 10 remain; ranking_ gives 1 for kept features, higher = dropped earlier.
rf_estimator = ensemble.RandomForestClassifier()
rfe_selector = feature_selection.RFE(rf_estimator,
                                     n_features_to_select=10,
                                     step=5)
train2 = rfe_selector.fit_transform(train1, y)
print(rfe_selector.ranking_)