def permutation_importance(self, test_sample, test_target, node_params=None, **kwargs):
    """
    Calculates permutation importance of features. If node_params is not None,
    then plots graph weighted by permutation importance.

    :param test_sample: test feature subsample
    :param test_target: vector of targets for test sample
    :param node_params: mapping that describes which nodes should be highlighted
        by target or source type. Node params should be represented in the
        following form:
        ```{
            'lost': 'bad_target',
            'passed': 'nice_target',
            'onboarding_welcome_screen': 'source',
        }```
        If the mapping is not given, it will be constructed from the config.
    :return: None
    """
    self.show_quality_metrics(test_sample, test_target)
    if hasattr(self.mod, 'coef_'):
        self._plot_perm_imp(__LogRegWrapper__(self.mod.coef_[0]), test_sample, node_params, **kwargs)
        return
    perm = PermutationImportance(self.mod, random_state=0).fit(test_sample, test_target)
    eli5.show_weights(perm, feature_names=[
        ' '.join(i) if isinstance(i, tuple) else i for i in test_sample.columns
    ])
    self._plot_perm_imp(perm, test_sample, node_params, **kwargs)
def feature_importance_permutation(self, random_state=123):
    '''
    Calculate and display feature importance using the permutation method.
    Requires eli5 to be installed; expects X_val and y_val to be defined in
    the enclosing scope.
    '''
    # random_state must be passed by keyword: the second positional argument
    # of PermutationImportance is `scoring`, not `random_state`.
    perm = PermutationImportance(self.model, random_state=random_state).fit(X_val, y_val)
    return eli5.show_weights(perm, feature_names=X_val.columns.tolist())
def feat_rank(clf_df):
    # X_train, y_train, X_test and y_test are expected to be defined in the
    # enclosing scope.
    for i, clf in clf_df.iterrows():
        classifier = clf['Classifier']
        fitted = classifier.fit(X_train, y_train)
        perm = PermutationImportance(fitted).fit(X_test, y_test)
        # show_weights returns an HTML object; display() (IPython/notebook) is
        # needed for it to render from inside a loop.
        display(eli5.show_weights(perm, feature_names=X_test.columns.tolist()))
def permutation_importance(self, test_sample, test_target, node_params=None, **kwargs):
    """
    Calculates permutation importance of features. If ``node_params`` is not
    ``None``, then plots graph weighted by permutation importance.

    Parameters
    ----------
    test_sample: pd.DataFrame
        Test feature subsample.
    test_target: np.array
        Vector of targets for test sample.
    node_params: dict, optional
        Event mapping describing which nodes or edges should be highlighted by
        different colors for better visualisation. Dictionary keys are
        ``event_col`` values, while dictionary values have the following
        possible options:
            - ``bad_target``: highlights node and all incoming edges with red color;
            - ``nice_target``: highlights node and all incoming edges with green color;
            - ``bad_node``: highlights node with red color;
            - ``nice_node``: highlights node with green color;
            - ``source``: highlights node and all outgoing edges with yellow color.
        Example ``node_params`` is shown below:
        ```
        {
            'lost': 'bad_target',
            'purchased': 'nice_target',
            'onboarding_welcome_screen': 'source',
            'choose_login_type': 'nice_node',
            'accept_privacy_policy': 'bad_node',
        }
        ```
        If ``node_params=None``, it will be constructed from the
        ``retention_config`` variable, so that:
        ```
        {
            'positive_target_event': 'nice_target',
            'negative_target_event': 'bad_target',
            'source_event': 'source',
        }
        ```
        Default: ``None``
    """
    self.show_quality_metrics(test_sample, test_target)
    if hasattr(self.mod, 'coef_'):
        self._plot_perm_imp(__LogRegWrapper__(self.mod.coef_[0]), test_sample, node_params, **kwargs)
        return
    perm = PermutationImportance(self.mod, random_state=0).fit(test_sample, test_target)
    eli5.show_weights(perm, feature_names=[
        ' '.join(i) if isinstance(i, tuple) else i for i in test_sample.columns
    ])
    self._plot_perm_imp(perm, test_sample, node_params, **kwargs)
def show_permutation_importance(model, val_X, val_y):
    '''
    Takes the model and dataframes for the validation set (X and y), then
    calculates and shows the permutation importance. For more on permutation
    importance, check https://www.kaggle.com/dansbecker/permutation-importance
    '''
    import eli5
    from eli5.sklearn import PermutationImportance

    perm = PermutationImportance(model, random_state=1).fit(val_X, val_y)
    # show_weights returns an HTML object; return it so it actually renders
    # when this call is the last expression in a notebook cell
    return eli5.show_weights(perm, feature_names=val_X.columns.tolist())
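# Usage sketch for show_permutation_importance above -- a minimal, hypothetical
# example on synthetic data; the model, column names and split here are
# illustrative assumptions, not part of the original snippet.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
X = pd.DataFrame(X, columns=['f0', 'f1', 'f2', 'f3'])
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
rf = RandomForestClassifier(random_state=1).fit(train_X, train_y)
show_permutation_importance(rf, val_X, val_y)  # renders a weights table in a notebook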
def p_importances(permuter, feature_names):
    permutation_importances = eli5.show_weights(
        permuter, top=None, feature_names=feature_names
    )
    return permutation_importances
def apply(self, data: 'Chapter') -> 'Chapter':
    # score_func, X, y and recipe are expected to be available in the
    # enclosing scope; show_weights comes from eli5.
    base_score, score_decreases = get_score_importances(score_func, X, y)
    # store the averaged decreases so the show_weights call below has something
    # to read (the original left self.permutation_importances unset)
    self.permutation_importances = np.mean(score_decreases, axis=0)
    # note: eli5.show_weights expects an estimator-like object, so passing a
    # raw importance array assumes a compatible wrapper
    self.permutation_weights = show_weights(
        self.permutation_importances,
        feature_names=list(recipe.dataset.columns.keys()))
    return data
def plot_score(clf, X_test, y_test, feat_to_show=30, is_normalize=False, cut_off=0.5):
    # compute the predicted probabilities once and binarize them at cut_off,
    # instead of recomputing the same series for every metric
    proba = clf.predict_proba(X_test)[:, 1]
    y_pred = pd.Series(proba).apply(lambda x: 1 if x > cut_off else 0)

    # sklearn metrics take (y_true, y_pred); the original had the arguments
    # swapped throughout
    cm = confusion_matrix(y_test, y_pred)
    # ROC AUC and log loss are computed on probabilities, not thresholded labels
    print('ROC_AUC: ', roc_auc_score(y_test, proba))
    print('Gini: ', 2 * roc_auc_score(y_test, proba) - 1)
    print('F1_score: ', f1_score(y_test, y_pred))
    print('Log_loss: ', log_loss(y_test, proba))
    print('\n')
    print('Classification_report: \n', classification_report(y_test, y_pred))
    skplt.metrics.plot_confusion_matrix(y_test, y_pred, title="Confusion Matrix",
                                        normalize=is_normalize, figsize=(8, 8),
                                        text_fontsize='large')

    imp = pd.DataFrame(list(zip(X_test.columns, clf.feature_importances_)))
    imp = imp.reindex(imp[1].abs().sort_values().index).set_index(0)
    imp = imp[-feat_to_show:]

    # feature importance chart
    ax = imp.plot.barh(width=.6, legend="", figsize=(12, 10))
    ax.set_title("Feature Importances", y=1.03, fontsize=16.)
    _ = ax.set(frame_on=False, xlabel="", xticklabels="", ylabel="")
    for i, labl in enumerate(list(imp.index)):
        score = imp.loc[labl][1]
        ax.annotate('%.2f' % score, (score + (-.12 if score < 0 else .02), i - .2),
                    fontsize=10.5)

    try:
        display(eli5.show_weights(clf, top=20, feature_names=list(X_test.columns)))
    except Exception:
        pass
def randFor(rData, lData):
    randClass = RandomForestClassifier(n_estimators=100)
    respTrain, respTest, labTrain, labTest = train_test_split(rData, lData, random_state=1)
    vect = TfidfVectorizer(min_df=1, max_df=1.0, stop_words='english')
    respTrainVec = vect.fit_transform(respTrain)  # To be commented out when pickling the vectorizer
    respTestVec = vect.transform(respTest)
    randClass.fit(respTrainVec, labTrain)  # To be commented out when pickling the random forest model
    labPredClass = randClass.predict(respTestVec)

    # display(HTML(eli5.show_weights(randClass, top=5)))
    # print(type(eli5.explain_prediction(randClass, respTest[0], vec=vect, target_names=targetList)))
    # tDF = eli5.explain_prediction_df(randClass)
    # tDF1 = eli5.show_weights(randClass, vec=vect, target_names=targetList)
    # print(type(eli5.show_prediction(randClass, respTest[0], vec=vect, target_names=targetList)))

    # Explain the weights of this estimator ----------------------------------
    # format_as_dataframes expects an Explanation object (from explain_weights),
    # not the HTML object returned by show_weights.
    print(eli5.format_as_dataframes(eli5.explain_weights(randClass)))
    print(respTest[0])
    # weights = eli5.explain_prediction(randClass, respTest[0], vec=vect, target_names=targetList, top=5)
    # print(eli5.format_as_dataframes(weights))

    # Modify to return specific class types
    return metrics.accuracy_score(labTest, labPredClass)
def publish(self, recipe):
    # score_func, X and y are expected to be available in the enclosing scope.
    base_score, score_decreases = get_score_importances(score_func, X, y)
    # store the averaged decreases so the show_weights call below has something
    # to read (the original left self.permutation_importances unset)
    self.permutation_importances = np.mean(score_decreases, axis=0)
    from eli5 import show_weights
    self.permutation_weights = show_weights(
        self.permutation_importances,
        feature_names=list(recipe.dataset.columns.keys()))
    return self
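# publish() and apply() above both lean on a pre-defined score_func plus X and
# y. A minimal, self-contained sketch of that contract, assuming accuracy as
# the score; the LogisticRegression model and synthetic data are illustrative
# assumptions, not from the original snippets.
import numpy as np
from eli5.permutation_importance import get_score_importances
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=300, n_features=5, random_state=0)
demo_model = LogisticRegression().fit(X_demo, y_demo)

def score_func(X, y):
    # higher is better; get_score_importances measures how much this score
    # drops, on average, when each column is shuffled
    return demo_model.score(X, y)

base_score, score_decreases = get_score_importances(score_func, X_demo, y_demo,
                                                    n_iter=5, random_state=0)
print(base_score, np.mean(score_decreases, axis=0))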
def permutation_importance(X_train, y_train, X_test, y_test, model, rand_state=1):
    # This function applies a technique for measuring feature importance that
    # works as follows: on the validation set (with an already trained model),
    # the features are shuffled one at a time (like a deck of cards), results
    # are re-predicted, and the drop in performance is measured. The larger
    # the drop, the more important the feature was.
    perm = PermutationImportance(model, random_state=rand_state).fit(
        X_test, y_test
    )  # create a PermutationImportance instance (applied to the test set only)
    # show_weights returns an HTML object; return it so it renders in a notebook
    return eli5.show_weights(perm, feature_names=X_test.columns.tolist())
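# A from-scratch sketch of the shuffling idea described in the comments above,
# to make the mechanics concrete. This is plain numpy/pandas, not the eli5
# implementation, and it does a single pass per feature (eli5 averages the
# drop over several shuffles).
import numpy as np

def manual_permutation_importance(model, X, y, random_state=1):
    rng = np.random.RandomState(random_state)
    baseline = model.score(X, y)  # performance with nothing shuffled
    drops = {}
    for col in X.columns:
        X_shuffled = X.copy()
        X_shuffled[col] = rng.permutation(X_shuffled[col].values)  # shuffle one column
        drops[col] = baseline - model.score(X_shuffled, y)  # performance drop
    return drops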
def permutation_importance(my_model, val_X, val_y, ret_df=False):
    import eli5
    from eli5.sklearn import PermutationImportance

    perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
    if ret_df:
        return eli5.explain_weights_df(perm, feature_names=val_X.columns.tolist())
    else:
        return eli5.show_weights(perm, feature_names=val_X.columns.tolist())
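# Usage note for permutation_importance above: with ret_df=True the result is a
# plain pandas DataFrame (via eli5.explain_weights_df, whose columns include
# 'feature' and 'weight'), which is convenient outside notebooks. my_model,
# val_X and val_y are placeholders for a fitted model and a validation set:
# imp_df = permutation_importance(my_model, val_X, val_y, ret_df=True)
# print(imp_df.sort_values('weight', ascending=False).head())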
def feature_importance(create_model):
    t_model = KerasClassifier(build_fn=create_model, epochs=EPOCHS, batch_size=5, verbose=0)
    t_model.fit(X_train, Y_train)
    perm = PermutationImportance(t_model, random_state=1).fit(X_train, Y_train)
    display(eli5.show_weights(perm, feature_names=featureNames))
def load_and_feature_analysis():
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
    import xgboost

    ################### eli5
    import eli5  # pip install eli5
    from eli5.sklearn import PermutationImportance

    # featrues_filename = 'features_3_sec.csv'               # Test Accuracy: 0.90224
    # featrues_filename = 'data_adv_3_sec_no_var_hccho.csv'  # mfcc_var was accidentally left out of this file. Test Accuracy: 0.96663
    featrues_filename = 'data_adv_3_sec_hccho.csv'  # Test Accuracy: 0.95762
    data = pd.read_csv(f'{general_path}/{featrues_filename}')
    data = data.iloc[0:, 1:]  # the first column is the file name, so drop it
    print(data.shape, data.head(5))

    y = data['label']  # genre variable
    X = data.loc[:, data.columns != 'label']  # select all columns except the labels

    #### NORMALIZE X ####
    # Normalize so everything is on the same scale.
    cols = X.columns
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)  # returns a numpy array (9990, 58)
    # new data frame with the scaled data
    X = pd.DataFrame(np_scaled, columns=cols)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)  # frame in, frame out

    ########## model load ##############
    xgb_classifier = pickle.load(open("my_xgb_model.pkl", "rb"))
    preds = xgb_classifier.predict(X_test)  # array(['hiphop', 'jazz', 'blues', ...], dtype=object)
    print('Accuracy', ':', round(accuracy_score(y_test, preds), 5), '\n')

    # Feature F-scores. Because training used a pandas DataFrame, feature names
    # are shown; had the data been a numpy array, they would not be.
    xgboost.plot_importance(xgb_classifier)
    plt.show()

    ####### eli5 PermutationImportance #################
    perm = PermutationImportance(estimator=xgb_classifier, random_state=1)
    perm.fit(X_test, y_test)  # the returned values are changes in accuracy; shuffling is simulated more than once, so there is a +/- spread
    weights = eli5.show_weights(estimator=perm, feature_names=X_test.columns.tolist())
    # weights.data is a string whose content is HTML
    with open('Permutation_Importance.htm', 'wb') as f:
        f.write(weights.data.encode("UTF-8"))
def assess_best_features(cfg, granularity, label_col, algorithm_code=ALGO_CODE_EXTRA_TREE):
    logging.debug("Assess best features for %s, on %s level, using algorithm_code %s"
                  % (granularity, label_col, algorithm_code))
    data_cfg = cfg.get_data_sample_config(label_col=label_col, max_items_per_class=None)
    # classifier_cfg = cfg.get_classifier_algorithm_config(algorithm_code)
    pipe = make_classifier_training_pipeline(
        cfg, get_classifier_algorithm_config(algorithm_code))

    # prepare the data
    X = data_cfg.df
    y = data_cfg.df[label_col]

    # fit the pipeline
    pipe.fit(X, y)

    # get the classifier instance
    forest_clf = pipe.named_steps["classifier"].named_steps[algorithm_code]
    # get the feature union (which includes the transformers)
    fe = pipe.named_steps["data_preprocessor"].named_steps["feature_union"]
    feature_names = _get_feature_names(fe)

    feature_importances = forest_clf.feature_importances_
    indices = np.argsort(feature_importances)[::-1]

    eli5_output = eli5.show_weights(forest_clf, feature_names=feature_names, top=100)
    # eli5_output = eli5.show_prediction(forest_clf, X.iloc[5], feature_names=feature_names, top=10, show_feature_values=True)
    from IPython.core.display import display
    display(eli5_output)

    # compute the standard deviation across the trees in the forest
    std = np.std([tree.feature_importances_ for tree in forest_clf.estimators_], axis=0)

    # print the feature ranking
    print("Feature ranking:")
    for i in range(100):
        print("%d. feature %d, %s, (%f, std:%f)"
              % (i + 1, indices[i], feature_names[indices[i]],
                 feature_importances[indices[i]], std[indices[i]]))
    print("\nAverage standard deviation: %s" % np.average(std))
    return indices, feature_names, feature_importances, std
def show_weights(self, **kwargs):
    """
    Call :func:`eli5.show_weights` for the locally-fit classification pipeline.
    Keyword arguments are passed to :func:`eli5.show_weights`.

    :func:`fit` must be called before using this method.
    """
    self._fix_target_names(kwargs)
    return eli5.show_weights(self.clf_, vec=self.vec_, **kwargs)
def test_show_weights():
    clf = LogisticRegression()
    X = [[0, 0], [1, 1], [0, 1]]
    y = ['a', 'b', 'a']
    clf.fit(X, y)

    html = eli5.show_weights(clf)
    # write_html(clf, html.data, '')
    assert isinstance(html, HTML)
    assert 'y=b' in html.data
    assert 'Explained as' not in html.data

    # explain_weights arguments are supported
    html = eli5.show_weights(clf, target_names=['A', 'B'])
    assert 'y=B' in html.data

    # format_as_html arguments are supported
    html = eli5.show_weights(clf, show=['method'])
    assert 'y=b' not in html.data
    assert 'Explained as' in html.data
def explain_model(model, X_train):
    return show_weights(
        model,
        feature_names=X_train.columns.tolist(),
        show=(
            "method",
            "description",
            "transition_features",
            "targets",
            "feature_importances",
        ),
    )
def print_permutation_importance(fitted_model, data_model: tuple, random_state=1):
    # elif is required here: with two independent `if` statements, a 4-tuple
    # would fall through to the else branch and raise.
    if len(data_model) == 4:
        X_train, X_test, y_train, y_test = data_model
    elif len(data_model) == 2:
        X_test, y_test = data_model
    else:
        # raising a bare string is invalid in Python 3; raise an exception instead
        raise ValueError("the length of data_model is neither 4 nor 2")
    feature_names = X_test.columns.tolist()
    perm = PermutationImportance(fitted_model, random_state=random_state).fit(X_test, y_test)
    display(eli5.show_weights(perm, feature_names=feature_names))
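# Usage sketch for print_permutation_importance above, exercising both accepted
# tuple shapes. The classifier and synthetic data are illustrative assumptions,
# and a notebook environment is assumed since the function uses display().
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
X = pd.DataFrame(X, columns=['a', 'b', 'c', 'd'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
gbc = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
print_permutation_importance(gbc, (X_train, X_test, y_train, y_test))  # 4-tuple form
print_permutation_importance(gbc, (X_test, y_test))                    # 2-tuple form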
def describe(self, model_name, model, data_dict):
    feature_names = list(data_dict['x_train'].columns)
    test_observation = data_dict['x_test'].iloc[0]
    explained_weights = eli5.show_weights(
        model,
        feature_names=feature_names,
        show=['targets', 'transition_features', 'feature_importances'])
    explained_prediction = eli5.show_prediction(model, test_observation)
    return {
        "Weights explanation": explained_weights,
        "Predictions explanation": explained_prediction
    }
def show_model_permutations(X_test=None, y_test=None, lin_model=None):
    """Appends permutation importance for the labels on one model to a
    module-level `weights` list. Displays all collected weights when called
    with no model.
    """
    if lin_model is None:
        for display_weights in weights:
            display(display_weights)
        return
    perm_importance = calc_model_perm_importance(X_test, y_test, lin_model)
    weights.append(
        eli5.show_weights(perm_importance, feature_names=X_test.columns.tolist()))
def data_dictionary():
    st.title('What data do I need for risk estimation?')
    st.write(
        "Here is the data dictionary table. You may refer to it to understand the variable meanings."
    )
    st.image(
        "https://github.com/yaliu0703/yaliu0703.github.io/blob/master/images/data%20dictionary.png?raw=true",
        use_column_width=True)
    html_object1 = eli5.show_weights(model, feature_names=list(X_train.columns), top=100)
    raw_html1 = html_object1._repr_html_()
    components.v1.html(raw_html1, height=500, scrolling=True)
def trainAndTestCRF():
    data = pd.read_csv(constants.NER_DATASET_DIR).fillna(method='ffill')
    getter = sg.SentenceGetter(data)
    sentences = getter.sentences

    X = [ft.sent2features(s) for s in sentences]
    y = [ft.sent2labels(s) for s in sentences]

    labels = ['O', 'PATH', 'DIS', 'SYMP']
    print(labels)

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=False
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space, cv=3, verbose=1, n_jobs=-1,
                            n_iter=50, scoring=f1_scorer)
    rs.fit(X, y)

    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)
    print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

    crf = rs.best_estimator_
    y_pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
    print(metrics.flat_classification_report(y, y_pred, labels=sorted(labels), digits=3))

    if not os.path.exists(constants.CRF_DIR):
        os.makedirs(constants.CRF_DIR)
    with open(os.path.join(constants.CRF_DIR, 'weights_optimized.html'), 'w+', encoding='utf-8') as f:
        f.write(eli5.show_weights(crf, top=30).data)

    testdata = pd.read_csv(constants.TESTDATA_DIR)
    getter = sg.SentenceGetter(testdata)
    sentences = getter.sentences
    X = [ft.sent2features(s) for s in sentences]
    y = [ft.sent2labels(s) for s in sentences]
    y_pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
    print(metrics.flat_classification_report(y, y_pred, labels=sorted(labels), digits=3))
def predict(algo, zscoreX, zscoreY):
    # print algo, accuracy, r-squared
    print(algo)
    print('with zScore on X: ' + str(zscoreX) + ' and Y: ' + str(zscoreY))

    # different z-score options
    if zscoreX:
        x = np.copy(scaled_x)
    else:
        x = np.copy(original_x)
    if zscoreY:
        y = np.copy(scaled_y)
    else:
        y = np.copy(original_y)

    model = algo()
    if algo == RandomForestRegressor:
        model = algo(n_estimators=150)
    model.fit(x[:dividing_line], y[:dividing_line])
    y_predict = model.predict(x[dividing_line:])

    # Alibi Explainable AI only has classifier support, so we are going to use
    # ELI5. ELI5 is typically used in a notebook, but we can export its output
    # as HTML.
    swdoc = open('swdoc_' + grade + '.html', 'w')
    swdoc.write(show_weights(model).data)
    swdoc.close()

    spdoc = open('spdoc_' + grade + '.html', 'w')
    # remember our prediction for every school while it is in the test data
    myi = 0
    for school in y_predict:
        schools_to_predict[order_of_schools[myi]] = int(1000 * float(school))
        myi += 1
        if myi % 60 == 0:  # explain one in every 60 individual predictions
            spdoc.write(show_prediction(model, x[dividing_line + myi],
                                        show_feature_values=True).data)
    spdoc.close()

    # evaluation of test data
    print(explained_variance_score(y[dividing_line:], y_predict))
    print(r2_score(y[dividing_line:], y_predict))
def eli5_perm_importance(self, **kwargs):
    """
    Returns
    -------
    Permutation importance as implemented in ELI5.
    """
    try:
        perm = PermutationImportance(self.model).fit(self.x_train, self.y_train)
        return eli5.show_weights(perm, feature_names=self.x_train.columns.tolist(), **kwargs)
    except AttributeError as err:
        err_logging(err)
        raise AttributeError(err)
def permutation_importance(self, df_inputs, df_outputs, **kwargs):
    xx = self.interpret_preproc.transform(
        df_inputs[self.categorical_inputs + self.numeric_inputs])
    yy = df_outputs
    perm = eli5.sklearn.PermutationImportance(self.full_model, **kwargs).fit(xx, yy)
    display(eli5.show_weights(
        perm, feature_names=self.categorical_inputs + self.numeric_inputs))
    return perm
def eli5_features(test, pipeline):
    clf = pipeline.named_steps["clf"]
    vec = pipeline.named_steps["vec"]
    transformer = Pipeline(pipeline.steps[:-1])
    with open("eli5_weights.html", "w") as f:
        f.write(eli5.show_weights(clf, vec=vec, top=50).data)
    with open("eli5_prediction.html", "w") as f:
        f.write(
            eli5.show_prediction(clf, transformer.transform(test),
                                 feature_names=vec.get_feature_names()).data)
def calculate_feature_importance(model, x_test, y_test):
    """
    :param model: the fitted model whose feature importance you want to compute
    :param x_test: a dataframe with the test data
    :param y_test: the test data target feature
    :return: an eli5 HTML table of features ranked by importance
    """
    start_time = datetime.now()
    model_name = type(model).__name__
    perm = PermutationImportance(model, random_state=2019).fit(x_test, y_test)
    time_elapsed = datetime.now() - start_time
    print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
    return eli5.show_weights(perm, feature_names=x_test.columns.tolist())
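# Note on calculate_feature_importance above: eli5.show_weights returns an
# IPython HTML object. Outside a notebook you might want the raw markup or a
# DataFrame instead -- a hedged sketch, reusing the same (hypothetical) fitted
# model and test frame:
# html_str = calculate_feature_importance(model, x_test, y_test).data  # raw HTML
# perm = PermutationImportance(model, random_state=2019).fit(x_test, y_test)
# imp_df = eli5.explain_weights_df(perm, feature_names=x_test.columns.tolist())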
def get_permutation_importance(self):
    # calculate permutation importance
    perm = PermutationImportance(self.model, random_state=1).fit(self.x, self.y)

    # save the permutation importance table as HTML
    html_str = eli5.show_weights(perm, feature_names=list(self.x.columns)).data
    with open(self.out + "/permutation_importance.html", "w") as html_file:
        html_file.write(html_str)

    # create a data frame of weights and features
    PI = pd.DataFrame(perm.feature_importances_, columns=["Weights"])
    PI["Features"] = list(self.x.columns)
    PI = PI.sort_values("Weights", ascending=False)
    return PI
def train_and_predict(model, X_train, y_train, X_test,
                      site_feature_names=vectorizer.get_feature_names(),
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))

    model.fit(X_train, y_train)

    if new_feature_names:
        all_feature_names = site_feature_names + new_feature_names
    else:
        all_feature_names = site_feature_names

    display_html(
        eli5.show_weights(estimator=model, feature_names=all_feature_names,
                          top=top_n_features_to_show))

    if new_feature_names:
        print('New feature weights:')
        print(pd.DataFrame({
            'feature': new_feature_names,
            'coef': model.coef_.flatten()[-len(new_feature_names):]
        }))

    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name)
    return cv_scores