def test_explain_linear_tuple_top(newsgroups_train):
    docs, y, target_names = newsgroups_train
    vec = TfidfVectorizer()
    clf = LogisticRegression(random_state=42)
    X = vec.fit_transform(docs)
    clf.fit(X, y)

    res_neg = explain_weights(clf, vec=vec, target_names=target_names,
                              top=(0, 10))
    expl_neg, _ = format_as_all(res_neg, clf)
    for target in res_neg.targets:
        assert len(target.feature_weights.pos) == 0
        assert len(target.feature_weights.neg) == 10
    assert "+0." not in expl_neg

    res_pos = explain_weights(clf, vec=vec, target_names=target_names,
                              top=(10, 2))
    format_as_all(res_pos, clf)
    for target in res_pos.targets:
        assert len(target.feature_weights.pos) == 10
        assert len(target.feature_weights.neg) == 2
def test_explain_linear_feature_filter(newsgroups_train, vec):
    clf = LogisticRegression(random_state=42)
    docs, y, target_names = newsgroups_train
    X = vec.fit_transform(docs)
    clf.fit(X, y)

    if isinstance(vec, HashingVectorizer):
        vec = InvertableHashingVectorizer(vec)
        vec.fit(docs)

    res = explain_weights(clf, vec=vec, feature_re='^ath')
    text_expl, _ = expls = format_as_all(res, clf)
    for expl in expls:
        assert 'atheists' in expl
        assert 'atheism' in expl
        assert 'space' not in expl
        assert 'BIAS' not in expl

    res = explain_weights(
        clf, vec=vec,
        feature_filter=lambda name: name.startswith('ath') or name == '<BIAS>')
    text_expl, _ = expls = format_as_all(res, clf)
    for expl in expls:
        assert 'atheists' in expl
        assert 'atheism' in expl
        assert 'space' not in expl
        assert 'BIAS' in expl
    assert '<BIAS>' in text_expl
def main(input_path=TRAIN_TEST_DIR, output_path=MODELS_DIR):
    output_path.mkdir(parents=True, exist_ok=True)
    X_train, X_test, y_train, y_test = ujob.load_multiple(
        input_path,
        ['X_train.joblib', 'X_test.joblib', 'y_train.joblib', 'y_test.joblib'])

    model = make_model()
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=17)

    # Evaluate the model via cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv,
                                scoring='roc_auc')

    # Store and print evaluation metrics
    metrics = {}
    metrics['cv_scores'] = {}
    for i, value in enumerate(cv_scores.tolist(), start=1):
        metrics['cv_scores'][f'fold{i}'] = float(value)
    metrics['score_mean'] = float(np.mean(cv_scores))
    metrics['score_std'] = float(np.std(cv_scores))
    print(f'CV scores: {list(metrics["cv_scores"].values())}')
    print(f'Mean CV score: {metrics["score_mean"]}')
    print(f'Std CV score: {metrics["score_std"]}')

    # Fit the model on the whole dataset
    model.fit(X_train, y_train)

    # Show feature importances
    formatter = eli5.formatters.text.format_as_text
    print(formatter(eli5.explain_weights(
        model,
        feature_filter=lambda x: x.split('__')[0] == 'site_vectorizer')))
    print(formatter(
        eli5.explain_weights(
            model,
            feature_filter=lambda x: x.split('__')[0] != 'site_vectorizer'),
        show=['targets']))

    # Calculate model errors
    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)
    errors_mask = (y_test_pred != y_test)
    errors = np.array([np.arange(X_test.shape[0]),
                       y_test_pred,
                       y_test]).transpose()[errors_mask]
    errors_proba = y_test_pred_proba[errors_mask]

    # Export metrics and errors
    ujob.dump_multiple(
        [model, errors, errors_proba],
        output_path,
        ['model.joblib', 'errors.joblib', 'errors_proba.joblib'])
    with open(MODELS_DIR.joinpath('metrics.yaml'), 'w') as fout:
        yaml.dump(metrics, fout, sort_keys=False)
def test_explain_pipeline(predictor, transformer, X, feature_names,
                          explain_kwargs):
    y = [1, 0]
    expected = explain_weights(
        clone(predictor).fit([[1, 0], [0, 1]], y),
        feature_names=['hello', 'world'],
        **explain_kwargs)
    pipe = make_pipeline(transformer, clone(predictor)).fit(X, y)
    actual = explain_weights(pipe, feature_names=feature_names,
                             **explain_kwargs)
    assert expected._repr_html_() == actual._repr_html_()
def test_unsupported():
    vec = CountVectorizer()
    clf = BaseEstimator()
    res = explain_weights(clf, vec=vec)
    assert 'BaseEstimator' in res.error
    for expl in format_as_all(res, clf):
        assert 'Error' in expl
        assert 'BaseEstimator' in expl
    with pytest.raises(TypeError):
        explain_weights(clf, unknown_argument=True)
def test_explain_decision_tree_regressor_multitarget():
    X, y = make_regression(n_samples=100, n_targets=3, n_features=10,
                           random_state=42)
    reg = DecisionTreeRegressor(random_state=42, max_depth=3)
    reg.fit(X, y)
    res = explain_weights(reg)
    expl_text, expl_html = format_as_all(res, reg)
    assert 'x9' in expl_text
    assert '---> [' in expl_text
    assert '---> [[' not in expl_text
    assert res == explain_weights(reg)
def test_explain_linear_regression_multitarget(reg):
    X, y = make_regression(n_samples=100, n_targets=3, n_features=10,
                           random_state=42)
    reg.fit(X, y)
    res = explain_weights(reg)
    expl, _ = format_as_all(res, reg)
    assert 'x9' in expl
    assert '<BIAS>' in expl

    pos, neg = top_pos_neg(res, 'y2')
    assert 'x9' in neg or 'x9' in pos
    assert '<BIAS>' in neg or '<BIAS>' in pos

    assert res == explain_weights(reg)
def test_explain_lightgbm_booster(boston_train):
    xs, ys, feature_names = boston_train
    booster = lightgbm.train(
        params={'objective': 'regression', 'verbose_eval': -1},
        train_set=lightgbm.Dataset(xs, label=ys),
    )
    res = explain_weights(booster)
    for expl in format_as_all(res, booster):
        assert 'Column_12' in expl

    res = explain_weights(booster, feature_names=feature_names)
    for expl in format_as_all(res, booster):
        assert 'LSTAT' in expl
def print_eli5(click_data, category):
    pred = pd.read_csv(category + '_xy.csv')
    model = joblib.load(category + ".h5")
    pred = pred.loc[(pred['grid_x'] == click_data['points'][0]['lon'])
                    & (pred['grid_y'] == click_data['points'][0]['lat']), :]
    pred_sqr = pred['eurogrid_0250_1'].values[0]
    dane_model = df.loc[df['eurogrid_0250_1'] == pred_sqr, :]

    dict_ = eli5.format_as_dataframe(eli5.explain_weights(model))
    cols = dict_['feature'].values
    # Map the generic "x0", "x1", ... feature names back to column names.
    mapping = {}
    for i in range(len(cols)):
        mapping['x' + str(i)] = cols[i]

    expl = dane_model.loc[:, cols]
    # Try column permutations until one matches the order the model expects.
    all_cols = itertools.permutations(cols)
    for cols in all_cols:
        try:
            expl = expl.loc[:, list(cols)]
            expl = eli5.formatters.format_as_dataframe(
                eli5.explain_prediction(model, expl))
            break
        except Exception:
            continue
    expl['feature'] = expl['feature'].apply(lambda x: map_x(x, mapping))
    return generate_table(expl)
def main(input_path=TRAIN_TEST_DIR, output_path=MODELS_DIR):
    output_path.mkdir(parents=True, exist_ok=True)
    X_train, X_test, y_train, y_test = ujob.load_multiple(
        input_path,
        ['X_train.joblib', 'X_test.joblib', 'y_train.joblib', 'y_test.joblib'])

    model = make_model()
    cv = StratifiedKFold(n_splits=5)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv,
                                scoring='accuracy')
    print(f'CV scores: {cv_scores}')
    print(f'Mean CV score: {np.mean(cv_scores)}')

    model.fit(X_train, y_train)
    formatter = eli5.formatters.text.format_as_text
    print(formatter(eli5.explain_weights(model)))

    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)
    errors_mask = (y_test_pred != y_test)
    errors = np.array([np.arange(X_test.shape[0]),
                       y_test_pred,
                       y_test]).transpose()[errors_mask]
    errors_proba = y_test_pred_proba[errors_mask]

    ujob.dump_multiple(
        [model, errors, errors_proba],
        output_path,
        ['model.joblib', 'errors.joblib', 'errors_proba.joblib'])
def explore_final_model():
    # https://github.com/gameofdimension/xgboost_explainer/blob/master/xgboost_explainer_demo.ipynb
    nr_labels = len(y)
    value_counts = y.value_counts()
    perc_per_label = {k: round(100 * v / float(nr_labels), 2)
                      for k, v in value_counts.items()}
    print('value counts:', y.value_counts())
    print('perc per label:', perc_per_label)

    model = pickle.load(open(filename_model, "rb"))
    model_feature_names = model.attr('feature_names').split('|')
    index_to_class = json.loads(model.attr('index_to_class'))
    print(index_to_class)
    classes = [index_to_class[k] for k in sorted(index_to_class.keys())]
    print(classes)
    print('eli5 explain weights (gain):\n',
          eli5.format_as_text(eli5.explain_weights(model, top=10)))  # gain

    df_test = pd.read_json(open(test_filename, "r"))
    df_test = df_test.head(5)
    feature_extractor = FeatureExtractor(df_test)
    X_test, X_test_featurenames = feature_extractor.get_features_pred_instances(
        df_test, model_feature_names)
    print(X)
    print(set(X.dtypes))
    print(eli5.format_as_text(eli5.explain_prediction(
        model, X_test.head(1), target_names=classes, top=10,
        feature_names=X_test_featurenames)))
def test_sklearn_crfsuite(xseq, yseq):
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])
    expl = explain_weights(crf)
    text, html = format_as_all(expl, crf)

    assert "y='sunny' top features" in text
    assert "y='rainy' top features" in text
    assert "Transition features" in text
    assert "sunny -0.130 0.696" in text
    assert u'+0.124 солнце:не светит' in text

    html_nospaces = html.replace(' ', '').replace("\n", '')
    assert u'солнце:не светит' in html
    assert '<th>rainy</th><th>sunny</th>' in html_nospaces

    try:
        from eli5 import format_as_dataframe, format_as_dataframes
    except ImportError:
        pass
    else:
        from .test_formatters_as_dataframe import check_targets_dataframe
        df_dict = format_as_dataframes(expl)
        check_targets_dataframe(df_dict['targets'], expl)
        df_transition = df_dict['transition_features']
        transition = expl.transition_features
        print(df_transition)
        assert list(transition.class_names) == ['rainy', 'sunny']
        assert np.isclose(df_transition['rainy']['rainy'], transition.coef[0, 0])
        assert np.isclose(df_transition['sunny']['rainy'], transition.coef[0, 1])
        assert np.isclose(df_transition['rainy']['sunny'], transition.coef[1, 0])
def explain(model_path):
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"
    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path,
                                                      binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)

    tfidf_count_hash_vectorModels = VectorModels()
    ner_dict_path = "./data/ner.dict"
    syn_dict_path = "./data/syn.dict"
    ner_dict, syn_dict = load_ner_dict(ner_dict_path, syn_dict_path)

    model = joblib.load(model_path)
    pd.set_option('display.max_rows', None)
    explanation = eli5.format_as_text(eli5.explain_weights(model, top=None))
    print(explanation)

    column_names = ["qid", "ql", "qr"]
    reader = pd.read_csv(sys.stdin, sep="\t", dtype="str", names=column_names,
                         chunksize=1)
    feature_extractor = lambda row: extract_features(
        wordseg, row["ql"], row["qr"], tfidf_count_hash_vectorModels,
        sent_word2vec, sent_vocab_dict, sent_model, ner_dict, syn_dict)

    for data in reader:
        data.fillna("", inplace=True)
        X = data[["ql", "qr"]].apply(feature_extractor, axis=1)
        X_features = X.apply(pd.Series)
        feature_names = X_features.columns.values.tolist()
        X_features = X_features[feature_names]
        y_preds = model.predict_proba(X_features,
                                      ntree_limit=model.best_ntree_limit)
        # keep only the probability of the positive class
        y_preds = [o[1] for o in y_preds]
        data = pd.concat([data, X_features], axis=1)
        data = data.assign(predict=y_preds)
        data.to_csv(sys.stdout, header=False, sep="\t")

        explanation = eli5.explain_prediction(model, X_features.iloc[0])
        print(eli5.format_as_text(explanation))
        print(X_features.iloc[0])
def visualise_feature_importance(model, title_variable, x_test, y_test):
    print(eli5.format_as_text(eli5.explain_weights(
        PermutationImportance(model, random_state=42).fit(x_test, y_test))))

    importances = model.feature_importances_
    # seven features assumed (X1..X7)
    labels = ["X{} - {:4.1f}%".format(i, importances[i - 1] * 100)
              for i in range(1, 8)]
    patches, texts = plt.pie(importances, wedgeprops=dict(width=0.5),
                             startangle=90, radius=1.2)
    plt.legend(patches, labels, prop={'size': 12},
               bbox_to_anchor=(0.74, 0.5), loc="center right", fontsize=8)
    plt.title("Feature Importance for {}".format(title_variable))
    plt.savefig("plots/fi{}.png".format(title_variable))
    plt.show()
def _train_model(self, df_train):
    # type: (List[List[Tuple[Text, Text, Text, Text]]]) -> None
    """Train the crf tagger based on the training data."""
    import sklearn_crfsuite

    X_train = [self._sentence_to_features(sent) for sent in df_train]
    y_train = [self._sentence_to_labels(sent) for sent in df_train]
    self.ent_tagger = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        # coefficient for L1 penalty
        c1=self.component_config["L1_c"],
        # coefficient for L2 penalty
        c2=self.component_config["L2_c"],
        # stop earlier
        max_iterations=self.component_config["max_iterations"],
        # include transitions that are possible, but not observed
        all_possible_transitions=True
    )
    self.ent_tagger.fit(X_train, y_train)

    result = eli5.explain_weights(self.ent_tagger, top=10,
                                  target_names='name')
    # Split the textual repr of the target into one chunk per FeatureWeight
    # entry, then pull out each (feature, weight) pair.
    chunks = str(result.targets[1]).split("FeatureWeight")
    print(len(chunks))
    print(chunks)
    for chunk in chunks:
        if "feature" in chunk:
            weight_dict = []
            element = chunk.split(',')
            weight_dict.append(element[0].replace('(', ""))
            weight_dict.append(element[1])
            print(weight_dict)
def explainELI5(rows, name, classes, XY):
    import eli5

    cols = XY[2]
    clf = loadModel(name + "-" + classes)
    ExplainPath_Specific = ExplainPath + name + "_" + classes

    weights = eli5.explain_weights(clf, feature_names=cols,
                                   top=(len(cols) + 1))
    predictionA = eli5.explain_prediction(clf, XY[0][0], feature_names=cols,
                                          top=(len(cols) + 1))
    predictionB = eli5.explain_prediction(clf, XY[0][1], feature_names=cols,
                                          top=(len(cols) + 1))

    html = eli5.format_as_html(weights)
    with open(uniquify(ExplainPath_Specific + "/" + classes
                       + "_ELI5_WEIGHTS.html"), 'w') as f:
        f.write(html)

    html = eli5.format_as_html(predictionA)
    with open(uniquify(ExplainPath_Specific + "/" + classes
                       + "_ELI5_PREDICTION_A.html"), 'w') as f:
        f.write(html)

    html = eli5.format_as_html(predictionB)
    with open(uniquify(ExplainPath_Specific + "/" + classes
                       + "_ELI5_PREDICTION_B.html"), 'w') as f:
        f.write(html)
def get_feature_importances(model, data_dict, trainvars, data="even"):
    """Returns the feature importance relevant for the neural network case
    using the eli5 package. Note: calculating the feature importances takes
    a while due to it calculating all the permutations.

    Parameters:
    ----------
    model : nn_model
        The NN model created by create_nn_model
    data_dict : dict
        Contains all the necessary information for the evaluation.
    trainvars : list
        Names of the training variables (columns) to use.
    data : str
        Key prefix selecting which dataset in data_dict to use.

    Returns:
    -------
    feature_importances : dict
        The feature importances equivalent for nn using the eli5 package.
    """
    perm = PermutationImportance(model, scoring=mt.roc_curve).fit(
        data_dict[data + "_data"][trainvars].values,
        data_dict[data + "_data"]['multitarget'],
        sample_weight=data_dict[data + "_data"]['totalWeight']
    )
    weights = eli5.explain_weights(perm, feature_names=trainvars)
    weights_df = format_as_dataframe(weights).sort_values(
        by='weight', ascending=False).rename(columns={'weight': 'score'})
    list_of_dicts = weights_df.to_dict('records')
    feature_importances = {}
    for single_variable_dict in list_of_dicts:
        key = single_variable_dict['feature']
        feature_importances[key] = single_variable_dict['score']
    return feature_importances
def get_feature_importances(model, data_dict):
    """Returns the feature importance relevant for the neural network case
    using the eli5 package. Note: calculating the feature importances takes
    a while due to it calculating all the permutations.

    Parameters:
    ----------
    model : nn_model
        The NN model created by create_nn_model
    data_dict : dict
        Contains all the necessary information for the evaluation.

    Returns:
    -------
    feature_importances : dict
        The feature importances equivalent for nn using the eli5 package.
    """
    perm = PermutationImportance(model).fit(data_dict['train'],
                                            data_dict['training_labels'])
    weights = eli5.explain_weights(perm, feature_names=data_dict['trainvars'])
    weights_df = format_as_dataframe(weights).sort_values(
        by='weight', ascending=False).rename(columns={'weight': 'score'})
    list_of_dicts = weights_df.to_dict('records')
    feature_importances = {}
    for single_variable_dict in list_of_dicts:
        key = single_variable_dict['feature']
        feature_importances[key] = single_variable_dict['score']
    return feature_importances
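# A minimal, self-contained sketch of how get_feature_importances above could
# be called. The RandomForestClassifier, the synthetic data, and the exact
# key names in data_dict are illustrative assumptions, not part of the
# original code. eli5's PermutationImportance defaults to cv='prefit', so
# the model is fitted before being wrapped.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X_train = rng.normal(size=(200, 3))
y_train = (X_train[:, 0] > 0).astype(int)
model = RandomForestClassifier(random_state=0).fit(X_train, y_train)
data_dict = {
    'train': X_train,                  # rows used for the permutation test
    'training_labels': y_train,        # matching labels
    'trainvars': ['f0', 'f1', 'f2'],   # one name per column of 'train'
}
feature_importances = get_feature_importances(model, data_dict)
print(sorted(feature_importances, key=feature_importances.get, reverse=True))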
def performance_measurement(crf_model, x, y, g_sentences):
    """Utilizes different functions to measure the model's performance and
    saves the results to files for review."""
    # Cross-validating the model
    cross_val_predictions = cross_val_predict(estimator=crf_model, X=x, y=y,
                                              cv=5)
    report = flat_classification_report(y_pred=cross_val_predictions,
                                        y_true=y)

    with open(f'results/performance_measurement_results_'
              f'{datetime.datetime.today().date()}.txt',
              'w', encoding='utf-8') as file:
        print2both('created on:', str(datetime.datetime.today().date()), '\n',
                   file=file)
        print2both('flat_classification_report:\n\n', report, '\n\n',
                   file=file)
        print2both('cross_val_predict:\n\n', cross_val_predictions, '\n\n',
                   file=file)
        # Showing the weights assigned to each feature
        print2both('eli5.explain_weights(crf, top=100):\n\n',
                   eli5.format_as_text(eli5.explain_weights(crf_model,
                                                            top=100)),
                   '\n\n', file=file)

    # Saving the potentially correct and the incorrect classifications in
    # separate CSV files for review
    categorize_predictions(gold_sents=g_sentences,
                           y_hat=cross_val_predictions, y_actual=y)
def test_explain_weights_feature_names_pandas(boston_train):
    pd = pytest.importorskip('pandas')
    X, y, feature_names = boston_train
    df = pd.DataFrame(X, columns=feature_names)
    reg = LGBMRegressor().fit(df, y)

    # it should pick up feature names from DataFrame columns
    res = explain_weights(reg)
    for expl in format_as_all(res, reg):
        assert 'PTRATIO' in expl

    # it is possible to override DataFrame feature names
    numeric_feature_names = ["zz%s" % idx for idx in range(len(feature_names))]
    res = explain_weights(reg, feature_names=numeric_feature_names)
    for expl in format_as_all(res, reg):
        assert 'zz12' in expl
def weight_explainer(estimator, num_features, cat_features=None,
                     remainders=None):
    """
    input: a fitted pipeline with 'transformer' and 'estimator' steps
    output: feature coefficients/weights as an eli5 explanation
    """
    import eli5

    if cat_features is not None:
        onehot_cols = list(estimator.named_steps['transformer']
                           .named_transformers_['categorical']
                           .named_steps['encoder']
                           .get_feature_names())
        num_features.extend(onehot_cols)
    elif remainders is not None:
        num_features.extend(remainders)

    return eli5.explain_weights(estimator.named_steps['estimator'],
                                top=20, feature_names=num_features)
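# A hypothetical usage sketch for weight_explainer. The pipeline layout
# (steps named 'transformer' and 'estimator', a 'categorical' branch whose
# encoder step is named 'encoder') mirrors what the helper expects; the data
# is invented. Note that OneHotEncoder.get_feature_names was renamed to
# get_feature_names_out in newer scikit-learn releases, so this assumes an
# older version.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.DataFrame({'age': [25, 32, 47, 51],
                   'city': ['a', 'b', 'a', 'c'],
                   'label': [0, 1, 0, 1]})
pipe = Pipeline([
    ('transformer', ColumnTransformer([
        ('numeric', StandardScaler(), ['age']),
        ('categorical', Pipeline([('encoder', OneHotEncoder())]), ['city']),
    ])),
    ('estimator', LogisticRegression()),
])
pipe.fit(df[['age', 'city']], df['label'])
expl = weight_explainer(pipe, num_features=['age'], cat_features=['city'])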
def explain_weights_df(estimator, **kwargs):
    # type: (...) -> pd.DataFrame
    """ Explain weights and export them to ``pandas.DataFrame``.
    All keyword arguments are passed to :func:`eli5.explain_weights`.
    Weights of all features are exported by default.
    """
    kwargs = _set_defaults(kwargs)
    return format_as_dataframe(eli5.explain_weights(estimator, **kwargs))
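# A short usage sketch for explain_weights_df, assuming the helper above is
# in scope together with eli5; the iris data and LogisticRegression are just
# illustrative.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)
df = explain_weights_df(clf)
print(df.head())  # one row per (target, feature) pair, with a weight column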
def process_xgb():
    col, train, test, test_ref = load_data()
    print(train.shape, test.shape, test_ref.shape)

    params = {
        'colsample_bytree': 0.055,
        'colsample_bylevel': 0.4,
        'gamma': 1.5,
        'learning_rate': 0.01,
        'max_depth': 5,
        'objective': 'reg:linear',
        'booster': 'gbtree',
        'min_child_weight': 10,
        'n_estimators': 1800,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'eval_metric': 'rmse',
        'subsample': 0.7,
        'silent': True,
        'seed': 7,
    }

    folds = 20
    full_score = 0.0
    xg_test = xgb.DMatrix(test[col])
    # Toggle between the sklearn wrapper and the native training API.
    use_regressor = False

    for fold in range(folds):
        x1, x2, y1, y2 = model_selection.train_test_split(
            train[col], np.log1p(train.target.values),
            test_size=0.0010, random_state=fold)

        if use_regressor:
            p = params
            model = xgb.XGBRegressor(
                colsample_bytree=p['colsample_bytree'],
                colsample_bylevel=p['colsample_bylevel'],
                gamma=p['gamma'],
                learning_rate=p['learning_rate'],
                max_depth=p['max_depth'],
                objective=p['objective'],
                booster=p['booster'],
                min_child_weight=p['min_child_weight'],
                n_estimators=p['n_estimators'],
                reg_alpha=p['reg_alpha'],
                reg_lambda=p['reg_lambda'],
                eval_metric=p['eval_metric'],
                subsample=p['subsample'],
                silent=1,
                n_jobs=-1,
                early_stopping_rounds=100,
                random_state=7,
                nthread=-1)
            model.fit(x1, y1)
            score = np.sqrt(mean_squared_error(y2, model.predict(x2)))
            test['target'] += np.expm1(model.predict(test[col]))
        else:
            xg_valid = xgb.DMatrix(x2, label=y2)
            xg_train = xgb.DMatrix(x1, label=y1)
            model = xgb.train(params, xg_train, params['n_estimators'])
            score = np.sqrt(mean_squared_error(y2, model.predict(xg_valid)))
            test['target'] += np.expm1(model.predict(xg_test))

        print('Fold', fold, 'Score', score)
        full_score += score

    full_score /= folds
    print('Full score', full_score)

    test['target'] /= folds
    test.loc[test_ref.target > 0, 'target'] = \
        test_ref[test_ref.target > 0].target.values
    test[['ID', 'target']].to_csv('subxgb.csv', index=False)

    explain = False
    if explain and not use_regressor:
        print(eli5.format_as_text(eli5.explain_weights(model, top=200)))
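# For the native-API branch above, eli5 accepts an xgboost Booster directly
# and exposes an importance_type argument ('gain' is the default; 'weight'
# and 'cover' are also supported). A self-contained sketch on synthetic data;
# the data itself is an illustrative assumption:
import numpy as np
import xgboost as xgb
import eli5

rng = np.random.RandomState(7)
X = rng.normal(size=(100, 4))
y = 2 * X[:, 0] + rng.normal(scale=0.1, size=100)
booster = xgb.train({'objective': 'reg:squarederror'},
                    xgb.DMatrix(X, label=y), num_boost_round=20)
print(eli5.format_as_text(
    eli5.explain_weights(booster, importance_type='weight', top=10)))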
def test_explain_linear_hashed_pos_neg(newsgroups_train, pass_feature_weights):
    docs, y, target_names = newsgroups_train
    # make it binary
    y = y.copy()
    y[y != 0] = 1
    target_names = [target_names[0], 'other']
    vec = HashingVectorizer(norm=None)
    ivec = InvertableHashingVectorizer(vec)
    clf = LogisticRegression(random_state=42)
    clf.fit(vec.fit_transform(docs), y)
    ivec.fit(docs)

    if pass_feature_weights:
        res = explain_weights(
            clf, top=(10, 10), target_names=target_names,
            feature_names=ivec.get_feature_names(always_signed=False),
            coef_scale=ivec.column_signs_)
    else:
        res = explain_weights(clf, ivec, top=(10, 10),
                              target_names=target_names)

    # HashingVectorizer with norm=None is "the same" as CountVectorizer,
    # so we can compare it and check that explanation is almost the same.
    count_vec = CountVectorizer()
    count_clf = LogisticRegression(random_state=42)
    count_clf.fit(count_vec.fit_transform(docs), y)
    count_res = explain_weights(count_clf, vec=count_vec, top=(10, 10),
                                target_names=target_names)

    for key in ['pos', 'neg']:
        values, count_values = [
            sorted(get_names_coefs(getattr(r.targets[0].feature_weights, key)))
            for r in [res, count_res]]
        assert len(values) == len(count_values)
        for (name, coef), (count_name, count_coef) in zip(values, count_values):
            assert name == count_name
            assert abs(coef - count_coef) < 0.05
def test_feature_importances_no_remaining(clf):
    """ Check that the number of remaining features is not shown
    if it is zero.
    """
    n = 100
    clf.fit(np.array([[i % 2 + 0.1 * np.random.random(), 0]
                      for i in range(n)]),
            np.array([i % 2 for i in range(n)]))
    res = explain_weights(clf)
    for expl in format_as_all(res, clf):
        assert 'more features' not in expl and 'more …' not in expl
def test_explain_linear_regression_feature_filter(boston_train):
    clf = ElasticNet(random_state=42)
    X, y, feature_names = boston_train
    clf.fit(X, y)
    res = explain_weights(clf, feature_names=feature_names,
                          feature_re=re.compile('ratio$', re.I))
    for expl in format_as_all(res, clf):
        assert 'PTRATIO' in expl
        assert 'LSTAT' not in expl
def explain_weights_dfs(estimator, **kwargs):
    # type: (...) -> Dict[str, pd.DataFrame]
    """ Explain weights and export them to a dict with ``pandas.DataFrame``
    values (as :func:`eli5.formatters.as_dataframe.format_as_dataframes`
    does).
    All keyword arguments are passed to :func:`eli5.explain_weights`.
    Weights of all features are exported by default.
    """
    kwargs = _set_defaults(kwargs)
    return format_as_dataframes(eli5.explain_weights(estimator, **kwargs))
def test_sklearn_crfsuite_feature_re(xseq, yseq):
    crf = CRF(c1=0.0, c2=0.1, max_iterations=50)
    crf.fit([xseq], [yseq])
    expl = explain_weights(crf, feature_re=u'(солн|clean)')
    for fmt_expl in format_as_all(expl, crf):
        assert u'солн' in fmt_expl
        assert u'clean' in fmt_expl
        assert 'walk' not in fmt_expl
def interpret_model(model: Pipeline, all_features: list):
    log.debug("All features: {}".format(all_features))
    # Explain the model
    log.info(eli5.format_as_text(eli5.explain_weights(
        model.named_steps["model"], feature_names=all_features)))
def show_feature_importance(self):
    print("\n\n\n\n+++++++++++++++++++++++++++++++")
    print('Calculating feature importance for model ', self.name, '...')
    perm = PermutationImportance(self.get_eli5_model(), random_state=1).fit(
        self.x_test, self.y_test)
    print(self.name, 'model feature importance')
    print(eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=self.feature_names)))
def explain_weights(self, **kwargs):
    """
    Call :func:`eli5.explain_weights` for the locally-fit classification
    pipeline. Keyword arguments are passed to :func:`eli5.explain_weights`.

    :func:`fit` must be called before using this method.
    """
    self._fix_target_names(kwargs)
    return eli5.explain_weights(self.clf_, vec=self.vec_, **kwargs)
def save_explanation(crf, out_path='crf-features.html'):
    print("Writing explanation to {} file".format(out_path))
    expl = eli5.explain_weights(crf, top=100)
    html = eli5.format_as_html(expl)
    Path(out_path).write_text("""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Contact Extraction Model</title>
</head>
<body>{}</body>
</html>
""".format(html))