import numpy as np
from xgboost import XGBClassifier

from eli5 import explain_prediction
from eli5.formatters import fields, format_as_text

# format_as_all and check_targets_scores are helpers from eli5's test suite;
# adjust the import to wherever they live in your layout.
from .utils import format_as_all, check_targets_scores


def test_explain_prediction_clf_xor():
    # Noisy XOR: the label is True when both coordinates were drawn around
    # the same integer, so neither feature is predictive on its own.
    true_xs = [[np.random.randint(2), np.random.randint(2)]
               for _ in range(100)]
    xs = np.array([[np.random.normal(x, 0.2), np.random.normal(y, 0.2)]
                   for x, y in true_xs])
    ys = np.array([x == y for x, y in true_xs])
    clf = XGBClassifier(n_estimators=100, max_depth=2)
    clf.fit(xs, ys)
    res = explain_prediction(clf, np.array([1, 1]))
    format_as_all(res, clf)
    for x in [[0, 1], [1, 0], [0, 0], [1, 1]]:
        res = explain_prediction(clf, np.array(x))
        print(x)
        print(format_as_text(res, show=fields.WEIGHTS))
        check_targets_scores(res)
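# A rough sketch of what those two test helpers could look like, assuming
# only eli5's public API (not the exact upstream implementation): the first
# pushes an explanation through every formatter, the second checks that the
# feature weights (bias included) add up to each target's score.
def format_as_all_sketch(res, clf):
    from eli5.formatters import format_as_html
    return [format_as_text(res), format_as_html(res)]


def check_targets_scores_sketch(res, atol=1e-6):
    for target in res.targets:
        total = sum(fw.weight for fw in target.feature_weights.pos)
        total += sum(fw.weight for fw in target.feature_weights.neg)
        assert abs(total - target.score) < atol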
def test_explain_prediction_clf_interval():
    # The positive class is an interval around x0 == 1; x1 is noise.
    true_xs = [[np.random.randint(3), np.random.randint(10)]
               for _ in range(1000)]
    xs = np.array([[np.random.normal(x, 0.2), np.random.normal(y, 0.2)]
                   for x, y in true_xs])
    ys = np.array([x == 1 for x, _ in true_xs])
    clf = XGBClassifier(n_estimators=100, max_depth=2)
    clf.fit(xs, ys)
    res = explain_prediction(clf, np.array([1.23, 1.45]))
    for expl in format_as_all(res, clf, show_feature_values=True):
        assert 'x0' in expl
        assert '1.23' in expl
    for x in [[0, 1], [1, 1], [2, 1], [0.8, 5], [1.2, 5]]:
        res = explain_prediction(clf, np.array(x))
        print(x)
        print(format_as_text(res, show=fields.WEIGHTS))
        check_targets_scores(res)
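# A self-contained variant of the interval setup, handy for experimenting
# outside the test suite; the seed, sizes and query point are arbitrary:
def interval_demo():
    rng = np.random.RandomState(0)
    centers = rng.randint(3, size=(500, 2)).astype(float)
    xs = rng.normal(centers, 0.2)
    ys = (centers[:, 0] == 1).astype(int)
    clf = XGBClassifier(n_estimators=50, max_depth=2).fit(xs, ys)
    expl = explain_prediction(clf, np.array([1.1, 4.0]))
    print(format_as_text(expl, show=fields.WEIGHTS))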
# Show predictions next to ground truth.
print(np.concatenate((y_pred.reshape(len(y_pred), 1),
                      y_test.reshape(len(y_test), 1)), 1))

# Making the confusion matrix and the accuracy score.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))

# Permutation importance of the fitted classifier on the test set.
import eli5
from eli5.sklearn import PermutationImportance
from eli5.formatters.text import format_as_text

perm = PermutationImportance(classifier).fit(X_test, y_test)
expl = eli5.explain_weights(
    perm,
    # All columns except the target serve as feature names.
    feature_names=[*dataset.loc[:, dataset.columns != 'RESULTSBDA']])
print(format_as_text(expl, highlight_spaces=None))

# Define the grid search parameters; classifier is assumed to accept
# batch_size and epochs (e.g. a Keras scikit-learn wrapper).
from sklearn.model_selection import GridSearchCV
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=classifier, param_grid=param_grid,
                    n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# Summarize results.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
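# The permutation importances can also be pulled into a DataFrame for
# sorting or plotting; a short sketch using eli5's format_as_dataframe
# (in some eli5 versions the feature names land in the DataFrame index,
# which the reset_index step in `train` below works around):
imp_df = eli5.format_as_dataframe(expl)
print(imp_df.sort_values('weight', ascending=False).head(10))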
from collections import defaultdict

import joblib
import numpy as np
import pandas as pd
import eli5
from eli5.formatters.text import format_as_text
from sklearn import metrics
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_predict

# utils, input_features, train_predict, get_pred_by_id, round_prediction
# and FEATURE_NAMES come from the surrounding project module.


def train(data, *regs, save_to=None, concat_features=False, explain=False):
    coords = utils.load_coords()
    concated_xs = np.concatenate(data['xs'], axis=1)
    all_rmse, all_patch_rmse, all_baselines = [], [], []
    regs_name = ', '.join(type(reg).__name__ for reg in regs)
    fitted_regs = []
    expl_by_cls = defaultdict(list)
    for cls in range(utils.N_CLASSES):
        ids = data['ids'][cls]
        scales = data['scales'][cls]
        ys = data['ys'][cls]
        xs = input_features(
            concated_xs if concat_features else data['xs'][cls])
        # indices = np.array(sorted(range(len(ids)),
        #                           key=lambda i: (scales[i], ids[i])))
        # ids, xs, ys = ids[indices], xs[indices], ys[indices]
        pred, fitted = train_predict(regs, xs, ys, ids)
        unique_ids = sorted(set(ids))
        pred_by_id = get_pred_by_id(ids, pred, unique_ids)
        # Ground truth per image: count the class's objects in the
        # coordinates table; images without coordinates count as zero.
        ys_by_id = []
        for img_id in unique_ids:
            try:
                ys_by_id.append((coords.loc[[img_id]].cls == cls).sum())
            except KeyError:
                ys_by_id.append(0)
        pred_by_id = round_prediction(pred_by_id)
        patch_rmse = np.sqrt(metrics.mean_squared_error(ys, pred))
        rmse = np.sqrt(metrics.mean_squared_error(ys_by_id, pred_by_id))
        baseline_rmse = np.sqrt(metrics.mean_squared_error(
            cross_val_predict(DummyRegressor(), [[0]] * len(ys_by_id),
                              ys_by_id, cv=5),
            ys_by_id))
        print('cls {}, patch mean {:.3f}, patch RMSE {:.3f}, '
              'image mean {:.2f}, image RMSE {:.2f}, baseline RMSE {:.2f}'
              .format(cls, np.mean(ys), patch_rmse,
                      np.mean(ys_by_id), rmse, baseline_rmse))
        all_rmse.append(rmse)
        all_patch_rmse.append(patch_rmse)
        all_baselines.append(baseline_rmse)
        if save_to:
            fitted_regs.append(fitted)
        if explain:
            for reg in fitted:
                expl = eli5.explain_weights(reg, feature_names=FEATURE_NAMES)
                expl_by_cls[cls].append(expl)
                print(type(reg).__name__, format_as_text(
                    expl, show=('method', 'targets', 'feature_importances')))
    print('{} with {} features: mean patch RMSE {:.3f}, '
          'mean image RMSE {:.2f}, mean baseline RMSE {:.2f}'
          .format(regs_name, ', '.join(FEATURE_NAMES),
                  np.mean(all_patch_rmse), np.mean(all_rmse),
                  np.mean(all_baselines)))
    if save_to:
        joblib.dump(fitted_regs, save_to)
        print('Saved to', save_to)
    if explain:
        dfs = []
        for cls, expls in expl_by_cls.items():
            for expl in expls:
                df = eli5.format_as_dataframe(expl)
                df['cls'] = cls
                df['estimator'] = expl.estimator.split('(')[0]
                dfs.append(df)
        df = pd.concat(dfs)
        # The feature names come back in the index; promote them to a column.
        df.reset_index(inplace=True)
        df['feature'] = df['index']
        del df['index']
        df = df[['feature', 'cls', 'estimator', 'std', 'weight']]
        df.to_csv('feature_importances.csv', index=False)
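# A standalone sketch of the explain=True path above: fit one regressor on
# toy data and dump its eli5 weights the same way (the feature names and
# estimator here are stand-ins, not the project's own):
def explain_sketch():
    from sklearn.ensemble import ExtraTreesRegressor
    feature_names = ['f_mean', 'f_std', 'f_max']
    xs = np.random.rand(200, 3)
    ys = xs @ np.array([3.0, 1.0, 0.0]) + 0.1 * np.random.rand(200)
    reg = ExtraTreesRegressor(n_estimators=20).fit(xs, ys)
    expl = eli5.explain_weights(reg, feature_names=feature_names)
    print(eli5.format_as_dataframe(expl))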
from flask import jsonify, request
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

import eli5
from eli5.formatters import as_dict, html, text
from eli5.lime import TextExplainer
from eli5.sklearn import explain_prediction

# load_clf_and_vocabulary, stopwords and the Flask app object come from the
# surrounding application module.


def explain_review_prediction():
    """Explain a specific prediction using the eli5 library."""
    data = request.get_json(force=True)
    # Use the original documents, not the corrected ones.
    target_names = ['negative', 'neutral', 'positive',
                    'very_negative', 'very_positive']
    clf, vocabulary = load_clf_and_vocabulary(
        data['classifier'], data['vocabModel'], data['tfIdf'], False)
    vect = CountVectorizer(vocabulary=vocabulary)
    vect._validate_vocabulary()
    # reviews = load_files(dir_path + '/../../data/reviews/not_corrected')
    # text_train, text_test, y_train, y_test = train_test_split(
    #     reviews.data, reviews.target, test_size=0.2, random_state=0)
    # Both tfIdf branches built the same vectorizer, so the old if/else
    # chain reduces to picking an ngram range:
    # ngram_range = {'unigram': (1, 1), 'bigram': (2, 2),
    #                'trigram': (3, 3)}[data['vocabModel']]
    # vect = CountVectorizer(min_df=5, stop_words=stopwords,
    #                        ngram_range=ngram_range).fit(text_train)

    if data['classifier'] == 'LR':
        # Linear model: explain its own weights directly.
        explanation = explain_prediction.explain_prediction_linear_classifier(
            clf, data['review'], vec=vect, top=10, target_names=target_names)
        div = html.format_as_html(explanation, include_styles=False)
        style = html.format_html_styles()
        txt = text.format_as_text(explanation,
                                  show=eli5.formatters.fields.ALL,
                                  highlight_spaces=True,
                                  show_feature_values=True)
        print(txt)
        return jsonify({'div': div, 'style': style})

    elif data['classifier'] in ('SVM', 'MLP'):
        # Black-box model: fit a local white-box logistic regression on
        # perturbed versions of the review (LIME-style).
        te = TextExplainer(n_samples=100,
                           clf=LogisticRegression(solver='newton-cg'),
                           vec=vect, random_state=0)
        te.fit(data['review'], clf.predict_proba)
        explanation = te.explain_prediction(top=10, target_names=target_names)
        div = html.format_as_html(explanation, include_styles=False)
        style = html.format_html_styles()

        # Record each perturbed sample together with its per-class
        # probabilities under the local surrogate model.
        distorted_texts = []
        for sample in te.samples_:
            sample_explanation = (
                explain_prediction.explain_prediction_linear_classifier(
                    te.clf_, sample, vec=te.vec_, top=10,
                    target_names=target_names))
            dict_explanation = as_dict.format_as_dict(sample_explanation)
            curr = {'text': sample}
            for c in dict_explanation['targets']:
                if c['target'] in target_names:
                    curr[c['target']] = c['proba']
            distorted_texts.append(curr)

        review_explanation = as_dict.format_as_dict(explanation)
        probabilities = {c['target']: c['proba']
                         for c in review_explanation['targets']
                         if c['target'] in target_names}
        return jsonify({
            'div': div,
            'style': style,
            'original_text': data['review'],
            'probabilities': probabilities,
            'distorted_texts': distorted_texts,
            'metrics': te.metrics_,
        })
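# A hypothetical client call to the endpoint above; the route path and port
# are assumptions (the actual @app.route registration is not shown here):
import requests

resp = requests.post('http://localhost:5000/explain_review_prediction', json={
    'classifier': 'SVM',
    'vocabModel': 'unigram',
    'tfIdf': False,
    'review': 'The plot was dull but the acting saved it.',
})
print(resp.json()['probabilities'])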