def test_cv(boston_train):
    """Check that a leaked noise feature looks important without CV,
    but loses its importance when permutation importances are
    cross-validated (cv=10).
    """
    # noise feature can be important if no cv is used, but not if cv is used
    # X_train, y_train are almost empty; we're using test part of the dataset
    X_train, X_test, y_train, y_test, feat_names = _boston_with_leak(
        *boston_train, noise_ratio=0.99)

    # No-CV importances: the estimator is fit and permuted on the same data,
    # so the DATALEAK feature can appear important.
    reg = PermutationImportance(
        SVR(C=100),
        random_state=42,
        cv=None,
        n_iter=50,  # use the same number of experiments as with cv=10
    ).fit(X_test, y_test)
    assert reg.score(X_test, y_test) > 0
    assert reg.estimator_.score(X_test, y_test) > 0
    imp_nocv = _assert_importances_good(reg, feat_names)

    # CV feature importances: held-out folds should expose the leak.
    reg = PermutationImportance(
        SVR(C=100),
        random_state=42,
        cv=10,
    ).fit(X_test, y_test)
    imp_cv = _assert_importances_good(reg, feat_names)
    assert reg.score(X_test, y_test) > 0

    # The leaked feature must be at least 10x less important under CV.
    assert imp_cv['DATALEAK'] * 10 < imp_nocv['DATALEAK']
# NOTE(review): this chunk starts mid-expression — the opening of the
# PermutationImportance(...) call (assigned to `permuter`) lies before
# this chunk; `random_state=42` below is its final keyword argument.
    random_state=42
)
permuter.fit(X_val_transformed, y_val)

# Display permutation importances for every feature, labelled with the
# original column names of the (pre-transform) validation frame.
feature_names = X_val.columns.tolist()
eli5.show_weights(
    permuter,
    top=None,  # show permutation importances for all features
    feature_names=feature_names
)

from sklearn.metrics import mean_squared_error, r2_score

# Coefficient of determination r2 for the training set
pipeline_score = permuter.score(X_train_transformed, y_train)
print("Coefficient of determination r2 for the training set.: ", pipeline_score)

# Coefficient of determination r2 for the validation set
pipeline_score = permuter.score(X_val_transformed, y_val)
print("Coefficient of determination r2 for the validation set.: ", pipeline_score)

# The mean squared error on the validation set
y_pred = permuter.predict(X_val_transformed)
print("Mean squared error: %.2f" % mean_squared_error(y_val, y_pred))

# Thus, Density remains more important according to permutation importance
# than according to the built-in feature importance of the Random Forest.

# Use importances for feature selection
print('Shape before removing features:', X_train.shape)

# Remove features of 0 importance