# Tests for mlxtend.evaluate.feature_importance_permutation on synthetic data.
import numpy as np
from sklearn.datasets import make_regression, make_classification
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR, SVC
from mlxtend.evaluate import feature_importance_permutation


def test_regression():
    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)
    svm = SVR(kernel='rbf', gamma='auto')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='r2',
        num_rounds=1,
        seed=123)

    # With shuffle=False, the two informative features are columns 0 and 1.
    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert sum(imp_vals[3:]) <= 0.01
def test_regression_custom_mse():
    X, y = make_regression(n_samples=1000,
                           n_features=5,
                           n_informative=2,
                           n_targets=1,
                           random_state=123,
                           shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=123)
    svm = SVR(kernel='rbf', gamma='auto')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric=mean_squared_error,
        num_rounds=1,
        seed=123)
    norm_imp_vals = imp_vals / np.abs(imp_vals).max()

    # Importance is (baseline score - permuted score). Because MSE is a loss
    # (lower is better), permuting an important feature raises it, so the
    # importances come out negative and the most important one normalizes to -1.
    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert norm_imp_vals[0] == -1.
def test_classification():
    X, y = make_classification(n_samples=1000,
                               n_features=6,
                               n_informative=3,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               random_state=0,
                               shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)
    svm = SVC(C=1.0, kernel='rbf', random_state=0, gamma='auto')
    svm.fit(X_train, y_train)

    imp_vals, imp_all = feature_importance_permutation(
        predict_method=svm.predict,
        X=X_test,
        y=y_test,
        metric='accuracy',
        num_rounds=1,
        seed=1)

    assert imp_vals.shape == (X_train.shape[1], )
    assert imp_all.shape == (X_train.shape[1], 1)
    assert imp_vals[0] > 0.2
    assert imp_vals[1] > 0.2
    assert imp_vals[2] > 0.2
    assert sum(imp_vals[3:]) <= 0.02
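# The tests above exercise the permute-and-rescore idea behind
# feature_importance_permutation. Below is a minimal sketch of that idea,
# assuming the sign convention importance = baseline_score - permuted_score
# (consistent with the MSE test above); it is an illustration, not mlxtend's
# actual implementation.
import numpy as np


def permutation_importance_sketch(predict, X, y, score, num_rounds=1, seed=0):
    """Return (mean importances, per-round importances) from column shuffling."""
    rng = np.random.RandomState(seed)
    X = np.array(X, copy=True)
    baseline = score(y, predict(X))
    all_imp = np.zeros((X.shape[1], num_rounds))
    for r in range(num_rounds):
        for col in range(X.shape[1]):
            saved = X[:, col].copy()
            rng.shuffle(X[:, col])  # break the feature/target association
            all_imp[col, r] = baseline - score(y, predict(X))
            X[:, col] = saved       # restore the column before the next one
    return all_imp.mean(axis=1), all_imp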
def setUp(self):
    import Exercise9_03
    self.exercises = Exercise9_03

    self.file_url = '../Dataset/phpYYZ4Qc.csv'
    self.df = pd.read_csv(self.file_url)
    self.y = self.df.pop('rej')
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        self.df, self.y, test_size=0.3, random_state=1)

    self.rf_model = RandomForestRegressor(random_state=1,
                                          n_estimators=50,
                                          max_depth=6,
                                          min_samples_leaf=60)
    self.rf_model.fit(self.X_train, self.y_train)

    self.imp_vals, _ = feature_importance_permutation(
        predict_method=self.rf_model.predict,
        X=self.X_test.values,
        y=self.y_test.values,
        metric='r2',
        num_rounds=1,
        seed=2)
    self.varimp_df = pd.DataFrame({
        'feature': self.df.columns,
        'importance': self.imp_vals
    })
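# The exercise's actual test methods are not shown in this excerpt; a
# hypothetical check against the fixture built in setUp might look like this
# (the method name is illustrative only):
def test_varimp_df_has_one_row_per_feature(self):
    # varimp_df pairs each feature name with its permutation importance.
    self.assertEqual(len(self.varimp_df), self.df.shape[1])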
def compute_imp_score(model, metric, training_features, training_classes,
                      random_state):
    """Compute importance scores for features.

    If a `coef_` or `feature_importances_` attribute is available on the
    model, the importance scores are based on that attribute. If not,
    permutation importance scores are estimated.

    Parameters
    ----------
    model: scikit-learn Estimator
        A fitted scikit-learn model.
    metric: str, callable
        The metric for evaluating feature importance through permutation.
        The string 'accuracy' is recommended for classifiers and 'r2' for
        regressors. Optionally, a custom scoring function (e.g.,
        `metric=scoring_func`) that accepts two arguments, y_true and
        y_pred, with shapes similar to the `y` array.
    training_features: np.ndarray/pd.DataFrame
        Features in the training dataset.
    training_classes: np.ndarray/pd.DataFrame
        Target in the training dataset.
    random_state: int
        Random seed for permutation importances.

    Returns
    -------
    coefs: np.ndarray
        Feature importance scores.
    imp_score_type: string
        Importance score type.
    """
    # Prefer importance scores exported directly from model attributes.
    if hasattr(model, 'coef_'):
        coefs = model.coef_
        if coefs.ndim > 1:
            coefs = safe_sqr(coefs).sum(axis=0)
            imp_score_type = "Sum of Squares of Coefficients"
        else:
            coefs = safe_sqr(coefs)
            imp_score_type = "Squares of Coefficients"
    else:
        coefs = getattr(model, 'feature_importances_', None)
        imp_score_type = "Gini Importance"
    # Fall back to permutation importance when no usable attribute exists.
    if coefs is None or np.isnan(coefs).any():
        coefs, _ = feature_importance_permutation(
            predict_method=model.predict,
            X=training_features,
            y=training_classes,
            num_rounds=5,
            metric=metric,
            seed=random_state,
        )
        imp_score_type = "Permutation Feature Importance"
    return coefs, imp_score_type
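# A hypothetical usage sketch for compute_imp_score, assuming the function
# above (and its np/safe_sqr/feature_importance_permutation imports) is in
# scope; the data and the LogisticRegression model are illustrative only.
# A linear model takes the coef_ branch; an estimator exposing neither coef_
# nor feature_importances_ (e.g., an RBF-kernel SVC) triggers the
# permutation-importance fallback.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=200, n_features=4,
                                     random_state=0)
lr = LogisticRegression().fit(X_demo, y_demo)
scores, score_type = compute_imp_score(lr, 'accuracy', X_demo, y_demo,
                                       random_state=0)
# coef_ has shape (1, 4) for binary classification, so ndim > 1 applies:
print(score_type)  # -> "Sum of Squares of Coefficients"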
def getPermutationImportanceMLxtend(num_rounds, model, X_test, y_test,
                                    feature_names, width_perm_imp_plot,
                                    figure_path, figure_filename,
                                    top_k='All'):
    # Calculate permutation importance values (hardcoded seed for
    # reproducibility).
    imp_vals, imp_all = feature_importance_permutation(
        predict_method=model.predict,
        X=X_test,
        y=y_test,
        metric='accuracy',
        num_rounds=num_rounds,
        seed=1597)

    # Standard deviation across rounds.
    std = np.std(imp_all, axis=1)
    # Feature indices ranked by mean importance, descending.
    indices = np.argsort(imp_vals)[::-1]
    # Feature labels in ranking order.
    labels = [feature_names[i] for i in indices]

    # Number of features to report; top_k is either 'All' or an integer.
    n_features = len(feature_names) if top_k == 'All' else int(top_k)

    print("********* Most Important Features "
          "(Mean Permutation Importance with Std. Dev.): *********")
    if top_k != 'All':
        print("Note: Showing only top-" + str(top_k) + " features")
    for i in range(n_features):
        print("%d. feature %s (%f +/- %f)" %
              (i + 1, labels[i], imp_vals[indices[i]], std[indices[i]]))

    # Create the figure.
    plt.figure(figsize=(width_perm_imp_plot, 7))
    plt.title("RF Classifier Mean Permutation Importance (with Std. Dev.)")
    # Dotted horizontal line at y = 0.
    plt.hlines(0, -1, n_features, colors='k', linestyles='dotted')
    # Bars with error bars; slicing by n_features covers both the 'All'
    # and the top-k case.
    plt.bar(range(n_features),
            imp_vals[indices[:n_features]],
            yerr=std[indices[:n_features]])
    plt.xticks(range(n_features), labels[:n_features], rotation=90)
    plt.xlim([-1, n_features])
    plt.xlabel("Feature Name")
    plt.ylabel("Mean Feature Permutation Importance (MLxtend)")
    plt.savefig(figure_path / figure_filename, bbox_inches='tight')

    # Return the ranked feature names used in the figure.
    return labels[:n_features]
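# A hypothetical call to the function above, assuming it and its np/plt/
# feature_importance_permutation imports are in scope. figure_path must
# support the `/` join, so a pathlib.Path is used; all data and names here
# are illustrative only.
from pathlib import Path

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, n_features=6, random_state=7)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=7)
rf = RandomForestClassifier(random_state=7).fit(X_tr, y_tr)
ranked = getPermutationImportanceMLxtend(
    num_rounds=10, model=rf, X_test=X_te, y_test=y_te,
    feature_names=[f"f{i}" for i in range(X.shape[1])],
    width_perm_imp_plot=8, figure_path=Path("."),
    figure_filename="perm_imp.png", top_k=3)
print(ranked)  # the three top-ranked feature names shown in the figure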
def compute_imp_score(model, metric, features, target, random_state):
    """Compute permutation importance scores for features.

    Parameters
    ----------
    model: scikit-learn Estimator
        A fitted scikit-learn model.
    metric: str, callable
        The metric for evaluating feature importance through permutation.
        The string 'accuracy' is recommended for classifiers and 'r2' for
        regressors. Optionally, a custom scoring function (e.g.,
        `metric=scoring_func`) that accepts two arguments, y_true and
        y_pred, with shapes similar to the `y` array.
    features: np.ndarray/pd.DataFrame
        Features in the training dataset.
    target: np.ndarray/pd.DataFrame
        Target in the training dataset.
    random_state: int
        Random seed for permutation importances.

    Returns
    -------
    coefs: np.ndarray
        Feature importance scores.
    imp_score_type: string
        Importance score type.
    """
    coefs, _ = feature_importance_permutation(
        predict_method=model.predict,
        X=features,
        y=target,
        num_rounds=5,
        metric=metric,
        seed=random_state,
    )
    imp_score_type = "Permutation Feature Importance"
    return coefs, imp_score_type
def permuation_importance_wrapper(datatuple, model, rounds,
                                  metric=balanced_accuracy_score):
    # datatuple packs (features, labels, split name), e.g. "train" or "test".
    X, y, name = datatuple
    imp_vals, imp_all = feature_importance_permutation(
        predict_method=model.predict,
        X=X,
        y=y,
        metric=metric,
        num_rounds=rounds,
        seed=821996,
    )
    result_dict = {"set": name, "imp_vals": imp_vals, "imp_all": imp_all}
    return result_dict
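# A hypothetical usage of the wrapper above, assuming it and its imports
# (balanced_accuracy_score, feature_importance_permutation) are in scope;
# the data, model, and split names below are illustrative only. Calling it
# once per split keeps the results labeled by set.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(
    *make_classification(n_samples=300, n_features=5, random_state=3),
    random_state=3)
clf = RandomForestClassifier(random_state=3).fit(X_tr, y_tr)
for res in (permuation_importance_wrapper((X_tr, y_tr, "train"), clf, 10),
            permuation_importance_wrapper((X_te, y_te, "test"), clf, 10)):
    print(res["set"], res["imp_vals"].round(3))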
# Supervised Feature Importance
# Using mlxtend.evaluate.feature_importance_permutation
# Using sklearn.neighbors.KNeighborsClassifier
# conda install -c conda-forge mlxtend
import numpy as np
import pandas as pd
from mlxtend.evaluate import feature_importance_permutation
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

feature_importances_, _ = feature_importance_permutation(
    predict_method=knn.predict,
    X=X_test,
    y=y_test,
    metric='accuracy',
    num_rounds=100,
    seed=1)

feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': np.round(feature_importances_, 3)
})
feature_importances = feature_importances.sort_values('importance',
                                                      ascending=False)
print("Supervised brute force")
print(feature_importances)
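# With num_rounds=100, the second return value (discarded above) has shape
# (n_features, num_rounds). A short, hypothetical continuation of the snippet
# above that keeps it and reports the spread across rounds alongside the mean:
imp_mean, imp_all = feature_importance_permutation(
    predict_method=knn.predict,
    X=X_test,
    y=y_test,
    metric='accuracy',
    num_rounds=100,
    seed=1)
summary = pd.DataFrame({
    'feature': X.columns,
    'importance': np.round(imp_mean, 3),
    'std': np.round(imp_all.std(axis=1), 3),  # round-to-round variability
}).sort_values('importance', ascending=False)
print(summary)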
ax = fig.add_subplot()
viz = FeatureImportances(rf, ax=ax, absolute=True)
viz.fit(X, y)
viz.poof()

############## PERMUTATION IMPORTANCE ##############
# mlxtend
from mlxtend.evaluate import feature_importance_permutation

imp_vals, imp_all = feature_importance_permutation(
    predict_method=rf.predict,
    X=X_test,
    y=y_test,
    metric='accuracy',  # use 'r2' or another metric for regression
    num_rounds=10,
    seed=1)

std = np.std(imp_all, axis=1)
indices = np.argsort(imp_vals)[::-1]

plt.figure()
plt.title(
    "Random Forest feature importance via permutation importance w. std. dev.")
plt.bar(range(X.shape[1]), imp_vals[indices], yerr=std[indices])
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
def test_model(  # pylint:disable=too-many-arguments
    modelpath: str,
    scalerpath: str,
    Xpath: str,
    ypath: str,
    namepath: str,
    outpath: str,
    featurelabelpath: str = None,
):  # pylint:disable=too-many-locals
    """Takes a trained model, performs some tests on it and calculates statistics.

    Arguments:
        modelpath {str} -- path to sklearn model in .joblib file
        scalerpath {str} -- path to the scaler object
        Xpath {str} -- path to features in npz file
        ypath {str} -- path to labels in npz file
        namepath {str} -- path to names in a pickle file
        outpath {str} -- path to which the evaluation metrics are written

    Keyword Arguments:
        featurelabelpath {str} -- path to a pickle file with a list of the
            feature names; if not None, feature importances are also
            estimated (default {None})
    """
    lower_quantile = 2.5 / 100
    upper_quantile = 97.5 / 100

    experiment = Experiment(
        api_key=os.getenv("COMET_API_KEY", None),
        project_name="mof-oxidation-states",
    )
    experiment.add_tag("model evaluation")

    print("*** Loading data ***")
    model = load(modelpath)
    scaler = load(scalerpath)
    X = np.load(Xpath)
    X = scaler.transform(X)
    y = np.load(ypath)
    experiment.log_dataset_hash(X)
    names = read_pickle(namepath)

    print("*** Getting bootstrapped metrics, using 200 folds which takes some time ***")
    scores = bootstrapped_metrics(model, X, y, scoring_funcs=return_scoring_funcs())

    df_metrics = pd.DataFrame(scores)
    means = df_metrics.mean().values
    medians = df_metrics.median().values
    lower = df_metrics.quantile(lower_quantile).values
    upper = df_metrics.quantile(upper_quantile).values
    stds = df_metrics.std().values

    # print(
    #     " *** Running permutation test running 200 folds with 10 fold CV which takes forever ***"
    # )
    # cv = StratifiedKFold(10)
    # balanced_accuracy, balanced_acc_permutation_scores, balanced_accuracy_pvalue = permutation_test(
    #     model, X, y
    # )

    metrics_dict = {}

    # metrics_dict["balanced_accuracy_cv"] = balanced_accuracy
    # metrics_dict[
    #     "balanced_accuracy_permutation_scores"
    # ] = balanced_acc_permutation_scores
    # metrics_dict["balanced_accuracy_p_value"] = balanced_accuracy_pvalue

    prediction = model.predict(X)

    print(" *** Getting misclassified cases ***")
    misclassified = np.where(y != prediction)
    misclassified_w_prediction_true = [
        (names[i], prediction[i], y[i]) for i in list(misclassified[0])
    ]

    metrics_dict["misclassified"] = misclassified_w_prediction_true
    experiment.log_metric("misclassified", misclassified)

    if featurelabelpath is not None:
        feature_labels = read_pickle(featurelabelpath)
        print("*** Getting feature importance ***")
        imp_vals, imp_all = feature_importance_permutation(
            predict_method=model.predict,
            X=X,
            y=y,
            metric="accuracy",
            num_rounds=20,  # to get some error bars
            seed=1,
        )
        importance_error = np.std(imp_all, axis=-1)
        importance_metrics = [
            (name, value, error)
            for name, value, error in zip(feature_labels, imp_vals, importance_error)
        ]
        experiment.log_metric("feature_importances", importance_metrics)
        metrics_dict["feature_importances"] = importance_metrics

    for i, column in enumerate(df_metrics.columns.values):
        metrics_dict[column] = (means[i], medians[i], stds[i], lower[i], upper[i])
        print((column, means[i], "_".join([column, "mean"])))
        experiment.log_metric("_".join([column, "mean"]), means[i])
        experiment.log_metric("_".join([column, "median"]), medians[i])
        experiment.log_metric("_".join([column, "lower"]), lower[i])
        experiment.log_metric("_".join([column, "upper"]), upper[i])
        experiment.log_metric("_".join([column, "std"]), stds[i])

    # experiment.log_metrics("balanced_accuracy_cv", balanced_accuracy)
    # experiment.log_metrics("balanced_accuracy_p_value", balanced_accuracy_pvalue)
    # experiment.log_metrics("missclassified", misclassified_w_prediction_true)

    print(" *** Getting the calibration curve ***")
    # NOTE: sklearn's calibration_curve expects predicted probabilities
    # (e.g., predict_proba); model.predict returns hard class labels.
    cc = calibration_curve(y, model.predict(X), n_bins=10)
    metrics_dict["calibration_curve_true_probab"] = cc[0]
    metrics_dict["calibration_curve_predicted_probab"] = cc[1]

    # Now write a .json with metrics for DVC.
    with open(os.path.join(outpath, "test_metrics.json"), "w") as fp:
        json.dump(metrics_dict, fp, cls=NpEncoder)
align="center") plt.xticks(range(X.shape[1]), indices) plt.xlim([-1, 25]) plt.ylim([0, 0.15]) #plt.show() plt.savefig('./feat_imp_48.png') #Ytest =numpy.array ([1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, # 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, # 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ) # X = numpy.array(Xtest) imp_vals, imp_all = feature_importance_permutation( predict_method=model.predict, X=numpy.array(Xtest), y=numpy.array(Ytest), metric='accuracy', num_rounds=10, seed=1) std = numpy.std(imp_all, axis=1) indices = numpy.argsort(imp_vals)[::-1] plt.figure() plt.title("Random Forest feature importance via permutation importance") plt.bar(range(X.shape[1]), imp_vals[indices], yerr=std[indices]) plt.xticks(range(X.shape[1]), indices) plt.xlim([-1, 30]) #plt.show() plt.savefig('./feat_imp_dog_perm.png') #aa.to_pickle('./conf_matr.pkl')