def test_lime_explain_probabilistic(newsgroups_train):
    """Check TextExplainer on a HashingVectorizer + MultinomialNB pipeline.

    Fits the black-box pipeline on the newsgroups fixture, explains its
    prediction for the first document and asserts that the local model is
    a good enough approximation and that the word 'file' shows up in the
    text explanation.
    """
    texts, labels, class_names = newsgroups_train

    try:
        vectorizer = HashingVectorizer(alternate_sign=False)
    except TypeError:
        # sklearn < 0.19
        vectorizer = HashingVectorizer(non_negative=True)

    classifier = MultinomialNB()
    features = vectorizer.fit_transform(texts)
    classifier.fit(features, labels)
    print(classifier.score(features, labels))

    black_box = make_pipeline(vectorizer, classifier)
    first_doc = texts[0]

    explainer = TextExplainer(random_state=42)
    explainer.fit(first_doc, black_box.predict_proba)
    print(explainer.metrics_)
    assert explainer.metrics_['score'] > 0.7
    assert explainer.metrics_['mean_KL_divergence'] < 0.1

    explanation = explainer.explain_prediction(top=20,
                                               target_names=class_names)
    text_report = format_as_text(explanation)
    print(text_report)
    assert 'file' in text_report
def explain_pred(sentence):
    """Explain the prediction for ``sentence`` and append it to an HTML file.

    Fits the module-level explainer ``te`` on ``sentence`` using
    ``pipe.predict_proba``, renders the top-20 explanation as HTML and
    appends it to ``latest_prediction.html`` in the current working
    directory. The explainer's ``metrics_`` are printed afterwards.

    :param sentence: the text to explain.
    :return: None.
    """
    te.fit(sentence, pipe.predict_proba)
    t_pred = te.explain_prediction(top=20, target_names=[
        "ANB", "CAP", "ECON", "EDU", "ENV", "EX", "FED",
        "HEG", "NAT", "POL", "TOP", "ORI", "QER", "COL",
    ])
    html = format_as_html(t_pred)
    # Context manager guarantees the handle is closed even if write() fails.
    with open("latest_prediction.html", "a+") as html_file:
        html_file.write(html)
    print(te.metrics_)
def _print_feature_weights(heading, feature_weights):
    """Print ``heading`` followed by one line per feature weight."""
    print(heading)
    for feature_weight in feature_weights:
        print(str(feature_weight.feature)
              + ": weight: " + str(feature_weight.weight)
              + " actual value: " + str(feature_weight.value))


def explain_prediction_me(x, model, feature_name_list):
    """Print and return an eli5 explanation of ``model``'s prediction for ``x``.

    The explanation is computed on ``model.get_booster()`` with the top 5
    features. For every target the class, probability and score are
    printed, followed by the positive and negative feature weights.

    :param x: the sample to explain.
    :param model: object exposing ``get_booster()``.
    :param feature_name_list: feature names passed to eli5.
    :return: the explanation formatted as text, including feature values.
    """
    from eli5.explain import explain_prediction
    from eli5.formatters import format_as_text

    expl = explain_prediction(model.get_booster(), x,
                              feature_names=feature_name_list, top=5)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    for target_explanation in expl.targets:
        print("class " + str(target_explanation.target)
              + " probability: " + str(target_explanation.proba)
              + " score: " + str(target_explanation.score))
        _print_feature_weights("Positive:",
                               target_explanation.feature_weights.pos)
        _print_feature_weights("Negative:",
                               target_explanation.feature_weights.neg)

    return format_as_text(expl, show_feature_values=True)
def test_lime_explain_probabilistic(newsgroups_train):
    """Check the local LIME pipeline for a HashingVectorizer + MultinomialNB model.

    Builds a local white-box pipeline around the first document with
    ``get_local_pipeline_text``, asserts that it approximates the black-box
    classifier well enough and that 'file' appears in the explanation.
    """
    docs, y, target_names = newsgroups_train
    try:
        # Newer scikit-learn spells "no negative values" this way.
        vec = HashingVectorizer(alternate_sign=False)
    except TypeError:
        # sklearn < 0.19 does not know ``alternate_sign``.
        vec = HashingVectorizer(non_negative=True)
    clf = MultinomialNB()
    X = vec.fit_transform(docs)
    clf.fit(X, y)
    pipe = make_pipeline(vec, clf)
    doc = docs[0]

    clf_local, vec_local, metrics = get_local_pipeline_text(
        doc, pipe.predict_proba, n_samples=5000, expand_factor=10)
    print(metrics)
    assert metrics['score'] > 0.7

    res = explain_prediction_sklearn(clf_local, doc, vec_local, top=10,
                                     target_names=target_names)
    expl = format_as_text(res)
    print(expl)
    assert 'file' in expl
def assert_tree_explain_prediction_single_target(clf, X, feature_names):
    """Assert that single-target prediction explanations of ``clf`` look sane.

    The first sample is formatted in every supported way and checked for
    feature values. For each of the first five samples the text
    explanation must contain '<BIAS>', target scores are checked, and a
    feature filter excluding one feature must remove exactly that feature.
    The filter check has to run at least once, and at least one feature
    name must appear somewhere in the collected text explanations.
    """
    def get_res(_x, **kwargs):
        return explain_prediction(
            clf, _x, feature_names=feature_names, **kwargs)

    def feature_set(fw):
        return get_all_features(fw.pos) | get_all_features(fw.neg)

    first_res = get_res(X[0])
    for formatted in format_as_all(first_res, clf):
        assert_feature_values_present(formatted, feature_names, X[0])

    filter_checked = False
    text_expls = []
    for x in X[:5]:
        res = get_res(x)
        text_expl = format_as_text(res, show=fields.WEIGHTS)
        print(text_expl)
        assert '<BIAS>' in text_expl
        check_targets_scores(res)
        text_expls.append(text_expl)

        all_features = feature_set(res.targets[0].feature_weights)
        if len(all_features) <= 1:
            # Nothing besides a single feature: the filter cannot be tested.
            continue
        dropped = list(all_features - {'<BIAS>'})[0]
        flt_res = get_res(
            x, feature_filter=lambda name, _: name != dropped)
        remaining = feature_set(flt_res.targets[0].feature_weights)
        assert remaining == (all_features - {dropped})
        filter_checked = True

    assert filter_checked
    combined = ''.join(text_expls)
    assert any(name in combined for name in feature_names)
def test_format_html_options(force_weights, horizontal_layout):
    """Smoke-test HTML formatting options that no other test covers.

    Renders both a weights explanation and a prediction explanation of a
    3-target linear regression with the given options; the test passes as
    long as nothing raises. The rendered HTML is written out with a
    postfix derived from the option values.
    """
    X, y = make_regression(
        n_samples=100, n_targets=3, n_features=10, random_state=42)
    reg = LinearRegression()
    reg.fit(X, y)

    options = {
        'force_weights': force_weights,
        'horizontal_layout': horizontal_layout,
    }
    option_tags = ['{}-{}'.format(name, value)
                   for name, value in sorted(options.items())]
    postfix = '_' + '_'.join(option_tags)
    print(options, postfix)

    # Weights explanation: only verifies that formatting does not crash.
    weights_res = explain_weights_sklearn(reg)
    weights_html = format_as_html(weights_res, **options)
    write_html(reg, weights_html, format_as_text(weights_res),
               postfix=postfix)

    # Prediction explanation for the first sample, same options.
    pred_res = explain_prediction_sklearn(reg, X[0])
    pred_html = format_as_html(pred_res, **options)
    write_html(reg, pred_html, format_as_text(pred_res),
               postfix='_expl' + postfix)
def explain_prediction(self, x, column_id, feature_names):
    """Return a text explanation of the ``column_id`` model's prediction for ``x``.

    Uses ``self.model[column_id]`` as the estimator, keeps the top 5
    features and includes feature values in the formatted text.
    """
    from eli5.explain import explain_prediction as eli5_explain_prediction
    explanation = eli5_explain_prediction(
        self.model[column_id], x, feature_names=feature_names, top=5)

    from eli5.formatters import format_as_text
    return format_as_text(explanation, show_feature_values=True)
def explain_prediction(self, x, model):
    """Return a text explanation of ``model``'s prediction for ``x``.

    Feature names come from ``self.feature_name_list``; the top 5 features
    are kept and feature values are included in the formatted text.
    """
    from eli5.explain import explain_prediction as eli5_explain_prediction
    explanation = eli5_explain_prediction(
        model, x, feature_names=self.feature_name_list, top=5)

    from eli5.formatters import format_as_text
    return format_as_text(explanation, show_feature_values=True)
def format_as_all(res, clf, **kwargs):
    """ Render an explanation in every supported format.

    The dict form is pretty-printed and JSON-encoded (to prove it is
    serializable), the text form is printed, and the html form is saved
    via ``write_html``. Extra keyword arguments are forwarded to the text
    and html formatters. Returns a ``(text, html)`` tuple.
    """
    as_dict = format_as_dict(res)
    pprint(as_dict)
    json.dumps(as_dict)  # raises if the dict is not JSON-serializable

    text = format_as_text(res, **kwargs)
    html = format_as_html(res, **kwargs)
    print(text)
    write_html(clf, html, text, caller_depth=2)
    return text, html
def visualize_model(dataSet, column_id, final_gb, feature_name_list, train,
                    target_run, res):
    """Write an HTML report of the feature weights of one column's model.

    The report holds the column name, the number of training labels, the
    F-score of ``res[column_id]`` against ``target_run`` and the eli5
    weights table (importance type "gain") for ``final_gb[column_id]``.
    It is written below ``<logging.folder>/out/html/<dataSet.name>/``.

    When the model has more features than ``feature_name_list``, an
    ``error_corr_<column>`` name is appended for every other applicable
    column of ``dataSet``.

    If a ``jinja2.exceptions.UndefinedError`` is raised, the weights are
    printed as plain text instead.

    :return: None.
    """
    try:
        column_name = dataSet.clean_pd.columns[column_id]
        model = final_gb[column_id]

        feature_name_list_err_corr = list(feature_name_list)
        missing_features = len(model.feature_names) - len(feature_name_list)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("missing features: " + str(missing_features))
        if missing_features > 0:
            for err_corr_id in range(dataSet.shape[1]):
                if (dataSet.is_column_applicable(err_corr_id)
                        and err_corr_id != column_id):
                    feature_name_list_err_corr.append(
                        "error_corr_"
                        + str(dataSet.clean_pd.columns[err_corr_id]))

        directory = Config.get("logging.folder") + '/out/html/' + dataSet.name
        if not os.path.exists(directory):
            os.makedirs(directory)

        number_of_labels = train[column_id].shape[0]
        path = (directory + '/' + str(column_name) + '_'
                + str(number_of_labels) + '_' + str(time.time()) + '.html')

        table_content = show_weights(
            model,
            feature_names=feature_name_list_err_corr,
            importance_type="gain").data

        from ml.VisualizeSVD import replace_with_url
        table_content = replace_with_url(table_content, dataSet)

        html = "<h1>" + str(column_name) + "</h1>"
        html += "<h2>number of labels: " + str(number_of_labels) + "</h2>"
        html += ("<h2>F-Score: "
                 + str(f1_score(target_run, res[column_id])) + "</h2>")
        html += str(table_content)

        # The context manager closes the file; no explicit close() needed.
        with open(path, 'w') as webf:
            webf.write(html)
    except jinja2.exceptions.UndefinedError:
        print(
            format_as_text(
                explain_weights(final_gb[column_id],
                                feature_names=feature_name_list)))
def train(self, X_train, y_train):
    """Fit the feature pipeline plus an LGBMClassifier and return the pipeline.

    Logs the training dtypes, the feature pipeline's parameter names, the
    5-fold cross-validated accuracy (mean and two standard deviations) and
    the eli5 weights explanation of the fitted classifier.
    """
    logger.info("\n{}".format(X_train.dtypes))

    features = self.get_pipe()
    pipe = make_pipeline(features, LGBMClassifier())
    logger.info(features.get_params().keys())
    pipe.fit(X_train, y_train)

    cv_scores = cross_val_score(
        pipe, X_train, y_train, cv=5, scoring='accuracy')
    mean_accuracy = np.mean(cv_scores)
    spread = 2 * np.std(cv_scores)
    logger.info(
        "Validation Accuracy: {:.3f} ± {:.3f}".format(mean_accuracy, spread))

    # Pull the fitted steps back out of the pipeline by position.
    classifier = pipe.steps[1][1]
    transformer = pipe.steps[0][1]
    self.print_importances(classifier, X_train)
    weights_explanation = explain_weights_lightgbm(
        lgb=classifier, vec=transformer)
    logger.info(format_as_text(weights_explanation))
    return pipe
def get_eli5_weights(model: BaseModel, docs: List):
    """ Build a dict of eli5 feature weights annotated with HSL colors.

    ``model.explain_predictions(docs)`` is tried first; if it raises
    NotImplementedError, ``model.explain_weights()`` is used instead.
    Per-target explanations take precedence over feature importances;
    an empty dict is returned when the explanation has neither.
    """
    logging.info('explaining weights')
    try:
        expl = model.explain_predictions(docs)
    except NotImplementedError:
        expl = model.explain_weights()
    # NOTE(review): assumed to run for both explanation paths - confirm.
    weights_text = format_as_text(expl, show=fields.WEIGHTS)
    logging.info('model weights:\n{}'.format(weights_text))

    if expl.targets:
        weights = expl.targets[0].feature_weights
        weight_range = get_weight_range(weights)
        for side in (weights.pos, weights.neg):
            colored = []
            for fw in side:
                colored.append({
                    'feature': fw.feature,
                    'weight': fw.weight,
                    'hsl_color': format_hsl(
                        weight_color_hsl(fw.weight, weight_range)),
                })
            # Replace contents in place so ``weights`` sees the new items.
            side[:] = colored
        weights.neg.reverse()
        return format_as_dict(weights)

    if not expl.feature_importances:
        return {}

    importances = expl.feature_importances.importances
    weight_range = max_or_0(abs(fw.weight) for fw in importances)
    positive = [
        {
            'feature': fw.feature,
            'weight': float(fw.weight),
            'hsl_color': format_hsl(
                weight_color_hsl(fw.weight, weight_range)),
        }
        for fw in importances
    ]
    return {
        'pos': positive,
        'neg': [],
        'pos_remaining': int(expl.feature_importances.remaining),
        'neg_remaining': 0,
    }
def assert_explain_prediction_single_target(estimator, X, feature_names):
    """Assert that single-target prediction explanations of ``estimator`` look sane.

    The first sample is formatted in every supported way and checked for
    feature values. Then, for each of the first five samples:

    1. the <BIAS> feature must be present in the text explanation;
    2. target scores must pass ``check_targets_scores``;
    3. the feature filter check is attempted until it succeeds once.

    Finally at least one feature name must appear in the collected text.
    """
    def get_res(_x, **kwargs):
        return explain_prediction(
            estimator, _x, feature_names=feature_names, **kwargs)

    first_res = get_res(X[0])
    for formatted in format_as_all(first_res, estimator):
        assert_feature_values_present(formatted, feature_names, X[0])

    filter_checked = False
    text_expls = []
    for x in X[:5]:
        res = get_res(x)
        text_expl = format_as_text(res, show=fields.WEIGHTS)
        print(text_expl)
        assert '<BIAS>' in text_expl
        check_targets_scores(res)
        text_expls.append(text_expl)
        # Once the filter check has succeeded it is not repeated.
        if not filter_checked:
            filter_checked = _assert_feature_filter_works(get_res, x)

    assert filter_checked
    combined = ''.join(text_expls)
    assert any(name in combined for name in feature_names)
from eli5.formatters import format_as_text from eli5 import explain_weights import jinja2 path = '/home/felix/SequentialPatternErrorDetection/html/fpredict/model.html' url = 'file://' + path html = show_weights(final, feature_names=feature_names, importance_type="gain").data with open(path, 'w') as webf: webf.write(html) webf.close() # webbrowser.open(url) except jinja2.exceptions.UndefinedError: print format_as_text( explain_weights(final, feature_names=feature_names)) importances = final.get_score(importance_type='gain') print importances sorted_x = sorted(importances.items(), key=operator.itemgetter(1), reverse=True) print sorted_x labels = [] score = [] t = 0 for key, value in sorted_x: labels.append(key) score.append(value)
html = "<h1>" + column_name + "</h1>" html += "<h2>number of labels: " + str( train[column_id].shape[0]) + "</h2>" html += "<h2>F-Score: " + str(f1_score(target_run, res[column_id])) + "</h2>" html += show_weights(final_gb[column_id], feature_names=feature_name_list, importance_type="gain").data with open(path, 'w') as webf: webf.write(html) webf.close() #webbrowser.open(url) except jinja2.exceptions.UndefinedError: print format_as_text( explain_weights(final_gb[column_id], feature_names=feature_name_list)) print "current train shape: " + str(train[column_id].shape) print "column: " + str(column_id) print_stats(target_run, res[column_id]) print_stats_whole(dataSet.matrix_is_error[0:split_id, :], all_error_status, "run all") if all_matrix_test != None: print_stats_whole( dataSet.matrix_is_error[split_id:dataSet.shape[0], :], all_error_status_test, "test general") number_samples = 0 for key, value in train.iteritems():
def eval_clf(arg, text_features, ys, vec_filename, show_features=False,
             n_best_features=None, save=None):
    """Train and evaluate the text classifier on one cross-validation fold.

    :param arg: ``(fold_idx, (train_idx, test_idx))`` for this fold.
    :param text_features: feature matrix, indexed by row indices and,
        when feature selection is on, by column indices.
    :param ys: labels, indexed by the same row indices.
    :param vec_filename: passed to ``load_vec`` to obtain the vectorizer.
    :param show_features: if true, print the eli5 weights explanation
        (only on fold 0).
    :param n_best_features: if truthy, keep at most this many features with
        the largest absolute non-zero coefficients and retrain on them.
    :param save: if truthy, passed to ``Soft404Classifier.save_model``
        together with the assembled full pipeline.
    :return: dict of metrics collected for this fold.
    """
    fold_idx, (train_idx, test_idx) = arg
    if fold_idx == 0:
        print('{} in train, {} in test'.format(len(train_idx), len(test_idx)))
    # First fit: all text features.
    text_pipeline, text_clf = make_text_pipeline()
    text_pipeline.fit(text_features[train_idx], ys[train_idx])
    vec = load_vec(vec_filename)
    if show_features and fold_idx == 0:
        print(format_as_text(explain_weights(text_clf, vec, top=(100, 20))))
    result_metrics = {}
    test_y = ys[test_idx]
    if n_best_features:
        # Record metrics of the all-features model before selection.
        if len(test_idx):
            pred_y = text_pipeline.predict_proba(text_features[test_idx])[:, 1]
            result_metrics.update({
                'PR AUC (all text features)':
                    metrics.average_precision_score(test_y, pred_y),
                'ROC AUC (all text features)':
                    metrics.roc_auc_score(test_y, pred_y),
            })
        # Rank features by absolute coefficient (first row of coef_).
        coef = sorted(enumerate(text_clf.coef_[0]), key=lambda x: abs(x[1]),
                      reverse=True)
        best_feature_indices = [
            idx for idx, weight in coef[:n_best_features] if weight != 0
        ]
        result_metrics['selected_features'] = len(best_feature_indices)
        # From here on ``text_features`` holds only the selected columns,
        # and the pipeline is replaced by one fitted on them.
        text_features = text_features[:, best_feature_indices]
        text_pipeline, text_clf = make_text_pipeline()
        text_pipeline.fit(text_features[train_idx], ys[train_idx])
        # Remap the vectorizer vocabulary to the new column positions.
        inverse = {idx: w for w, idx in vec.vocabulary_.items()}
        vec.vocabulary_ = {
            inverse[idx]: i for i, idx in enumerate(best_feature_indices)
        }
        vec.stop_words_ = None
        if show_features and fold_idx == 0:
            print(format_as_text(explain_weights(text_clf, vec, top=(100, 20))))
    # Evaluate the current pipeline (the retrained one if selection ran).
    if len(test_idx):
        text_features_test = text_features[test_idx]
        pred_y = text_pipeline.predict_proba(text_features_test)[:, 1]
        result_metrics.update({
            'PR AUC': metrics.average_precision_score(test_y, pred_y),
            'ROC AUC': metrics.roc_auc_score(test_y, pred_y),
        })
    if save:
        # Prepend the html -> item -> text -> vectorizer steps to the
        # classifier pipeline's own steps, then save the whole thing.
        pipeline = Pipeline([
            ('html_to_item', _function_transformer(html_to_item)),
            ('item_to_text', _function_transformer(item_to_text)),
            ('vec', vec),
        ] + text_pipeline.steps)
        Soft404Classifier.save_model(save, pipeline)
    return result_metrics