Example #1
def test_explain_prediction_clf_xor():
    # noisy XOR-style data: the label is True when the two underlying bits match
    true_xs = [[np.random.randint(2), np.random.randint(2)] for _ in range(100)]
    xs = np.array([[np.random.normal(x, 0.2), np.random.normal(y, 0.2)]
                   for x, y in true_xs])
    ys = np.array([x == y for x, y in true_xs])
    clf = XGBClassifier(n_estimators=100, max_depth=2)
    clf.fit(xs, ys)
    res = explain_prediction(clf, np.array([1, 1]))
    format_as_all(res, clf)
    for x in [[0, 1], [1, 0], [0, 0], [1, 1]]:
        res = explain_prediction(clf, np.array(x))
        print(x)
        print(format_as_text(res, show=fields.WEIGHTS))
        check_targets_scores(res)
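These test snippets ship without their imports; a minimal header that would make them runnable looks roughly like this (note that format_as_all and check_targets_scores are helpers from eli5's own test suite, not part of the public API):

import numpy as np
from xgboost import XGBClassifier
from eli5 import explain_prediction
from eli5.formatters import fields
from eli5.formatters.text import format_as_text
# format_as_all and check_targets_scores come from eli5's test utilities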
Example #2
def test_explain_prediction_clf_interval():
    # the positive class is an interval in the first feature (true x == 1)
    true_xs = [[np.random.randint(3), np.random.randint(10)] for _ in range(1000)]
    xs = np.array([[np.random.normal(x, 0.2), np.random.normal(y, 0.2)]
                   for x, y in true_xs])
    ys = np.array([x == 1 for x, _ in true_xs])
    clf = XGBClassifier(n_estimators=100, max_depth=2)
    clf.fit(xs, ys)
    res = explain_prediction(clf, np.array([1.23, 1.45]))
    for expl in format_as_all(res, clf, show_feature_values=True):
        assert 'x0' in expl
        assert '1.23' in expl
    for x in [[0, 1], [1, 1], [2, 1], [0.8, 5], [1.2, 5]]:
        res = explain_prediction(clf, np.array(x))
        print(x)
        print(format_as_text(res, show=fields.WEIGHTS))
        check_targets_scores(res)
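check_targets_scores verifies that an explanation is self-consistent. As a rough sketch of the idea (a simplified form, not the helper's actual code), each target's score should be recoverable by summing its feature weights, the bias included:

# simplified sketch of the invariant being checked
for target in res.targets:
    weights = list(target.feature_weights.pos) + list(target.feature_weights.neg)
    assert abs(target.score - sum(w.weight for w in weights)) < 1e-6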
Example #3
# show predictions side by side with the ground truth
print(np.concatenate((y_pred.reshape(len(y_pred), 1),
                      y_test.reshape(len(y_test), 1)), axis=1))

# Compute the confusion matrix and the accuracy score

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
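For a binary problem the matrix can also be unpacked directly (a quick sketch; scikit-learn puts true classes in rows and predicted classes in columns):

tn, fp, fn, tp = cm.ravel()
print((tn + tp) / cm.sum())  # same value as accuracy_score above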

import eli5
from eli5.sklearn import PermutationImportance
from eli5.formatters.text import format_as_text

# permutation feature importance, evaluated on the held-out test set
perm = PermutationImportance(classifier).fit(X_test, y_test)
expl = eli5.explain_weights(
    perm, feature_names=list(dataset.columns[dataset.columns != 'RESULTSBDA']))
print(format_as_text(expl, highlight_spaces=None))
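The snippet above relies on a fitted classifier and a dataset DataFrame defined earlier in the notebook. A self-contained sketch of the same pattern on toy data (all names below are illustrative):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, n_features=4, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
model = LogisticRegression().fit(X_tr, y_tr)
# importance = drop in score when one feature column is shuffled on held-out data
perm_toy = PermutationImportance(model, random_state=0).fit(X_te, y_te)
print(format_as_text(eli5.explain_weights(perm_toy, feature_names=['f0', 'f1', 'f2', 'f3'])))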

# define the grid search parameters

from sklearn.model_selection import GridSearchCV
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=classifier, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
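Since GridSearchCV refits the best configuration on the full training set by default (refit=True), the tuned model is available immediately; a short sketch, reusing the X_test/y_test split from above:

best_model = grid_result.best_estimator_  # already refit with best_params_
print(accuracy_score(y_test, best_model.predict(X_test)))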
Example #4
def train(data, *regs,
          save_to=None, concat_features=False, explain=False):
    coords = utils.load_coords()
    concated_xs = np.concatenate(data['xs'], axis=1)
    all_rmse, all_patch_rmse, all_baselines = [], [], []
    regs_name = ', '.join(type(reg).__name__ for reg in regs)
    fitted_regs = []
    expl_by_cls = defaultdict(list)
    for cls in range(utils.N_CLASSES):
        ids = data['ids'][cls]
        scales = data['scales'][cls]
        ys = data['ys'][cls]
        xs = input_features(concated_xs if concat_features else data['xs'][cls])
        # indices = np.array(sorted(range(len(ids)), key=lambda i: (scales[i], ids[i])))
        # ids, xs, ys = ids[indices], xs[indices], ys[indices]
        pred, fitted = train_predict(regs, xs, ys, ids)
        ys_by_id = []
        unique_ids = sorted(set(ids))
        pred_by_id = get_pred_by_id(ids, pred, unique_ids)
        for img_id in unique_ids:
            try:
                ys_by_id.append((coords.loc[[img_id]].cls == cls).sum())
            except KeyError:
                ys_by_id.append(0)
        pred_by_id = round_prediction(pred_by_id)
        patch_rmse = np.sqrt(metrics.mean_squared_error(ys, pred))
        rmse = np.sqrt(metrics.mean_squared_error(ys_by_id, pred_by_id))
        baseline_rmse = np.sqrt(metrics.mean_squared_error(
            cross_val_predict(DummyRegressor(), [[0]] * len(ys_by_id), ys_by_id, cv=5),
            ys_by_id))
        print('cls {}, patch mean {:.3f}, patch RMSE {:.3f}, '
              'image mean {:.2f}, image RMSE {:.2f}, baseline RMSE {:.2f}'
              .format(cls, np.mean(ys), patch_rmse,
                      np.mean(ys_by_id), rmse, baseline_rmse))
        all_rmse.append(rmse)
        all_patch_rmse.append(patch_rmse)
        all_baselines.append(baseline_rmse)
        if save_to:
            fitted_regs.append(fitted)
        if explain:
            for reg in fitted:
                expl = eli5.explain_weights(reg, feature_names=FEATURE_NAMES)
                expl_by_cls[cls].append(expl)
                print(type(reg).__name__, format_as_text(
                    expl, show=('method', 'targets', 'feature_importances')))
    print('{} with {} features: mean patch RMSE {:.3f}, mean image RMSE {:.2f}, '
          'mean baseline RMSE {:.2f}'
          .format(regs_name, ', '.join(FEATURE_NAMES),
                  np.mean(all_patch_rmse), np.mean(all_rmse),
                  np.mean(all_baselines)))
    if save_to:
        joblib.dump(fitted_regs, save_to)
        print('Saved to', save_to)

    if explain:
        dfs = []
        for cls, expls in expl_by_cls.items():
            for expl in expls:
                df = eli5.format_as_dataframe(expl)
                df['cls'] = cls
                df['estimator'] = expl.estimator.split('(')[0]
                dfs.append(df)
        df = pd.concat(dfs)
        df = df.reset_index().rename(columns={'index': 'feature'})
        df = df[['feature', 'cls', 'estimator', 'std', 'weight']]
        df.to_csv('feature_importances.csv', index=False)
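For reference, eli5.format_as_dataframe turns a single feature-importance explanation into a frame indexed by feature name with weight and std columns, which is why the code above resets the index and renames it. A toy sketch (model and data are illustrative):

from sklearn.ensemble import RandomForestRegressor
toy = RandomForestRegressor(n_estimators=10, random_state=0).fit(
    [[0, 1], [1, 0], [1, 1]], [0, 1, 2])
print(eli5.format_as_dataframe(eli5.explain_weights(toy, feature_names=['a', 'b'])))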
Example #5
def explain_review_prediction():
	"""
	Explain a specific prediction using the eli5 library
	"""
	data = request.get_json(force=True)

	# Use the original documents, not the corrected ones
	target_names = ['negative', 'neutral', 'positive', 'very_negative', 'very_positive']
	clf, vocabulary = load_clf_and_vocabulary(data['classifier'], data['vocabModel'], data['tfIdf'], False)
	vect = CountVectorizer(vocabulary=vocabulary)
	vect._validate_vocabulary()  # private API, but needed so the fixed vocabulary is usable without fitting

	# reviews = load_files(dir_path + '/../../data/reviews/not_corrected')
	# text_train, text_test, y_train, y_test = train_test_split(reviews.data, reviews.target, test_size=0.2, random_state=0)

	# tf-idf and plain-count variants used the same vectorizer settings:
	# if data['vocabModel'] == 'unigram':
	# 	vect = CountVectorizer(min_df=5, stop_words=stopwords, ngram_range=(1, 1)).fit(text_train)
	# elif data['vocabModel'] == 'bigram':
	# 	vect = CountVectorizer(min_df=5, stop_words=stopwords, ngram_range=(2, 2)).fit(text_train)
	# elif data['vocabModel'] == 'trigram':
	# 	vect = CountVectorizer(min_df=5, stop_words=stopwords, ngram_range=(3, 3)).fit(text_train)

	if data['classifier'] == 'LR':
		explanation = explain_prediction.explain_prediction_linear_classifier(clf, data['review'], vec=vect, top=10, target_names=target_names)
		div = html.format_as_html(explanation, include_styles=False)
		style = html.format_html_styles()

		txt = text.format_as_text(explanation, show=eli5.formatters.fields.ALL, highlight_spaces=True, show_feature_values=True)
		print(txt)

		return jsonify({
			'div': div,
			'style': style
		})

	elif data['classifier'] in ('SVM', 'MLP'):
		te = TextExplainer(n_samples=100, clf=LogisticRegression(solver='newton-cg'), vec=vect, random_state=0)
		te.fit(data['review'], clf.predict_proba)
		explanation = te.explain_prediction(top=10, target_names=target_names)
		div = html.format_as_html(explanation, include_styles=False)
		style = html.format_html_styles()

		distorted_texts = []

		for sample in te.samples_:
			sample_explanation = explain_prediction.explain_prediction_linear_classifier(te.clf_, sample, vec=te.vec_, top=10, target_names=target_names)
			dict_explanation = as_dict.format_as_dict(sample_explanation)

			curr = {
				'text': sample
			}

			for c in dict_explanation['targets']:
				if c['target'] in target_names:
					curr[c['target']] = c['proba']

			distorted_texts.append(curr)

		review_explanation = as_dict.format_as_dict(explanation)
		probabilities = {}

		for c in review_explanation['targets']:
			if c['target'] in target_names:
				probabilities[c['target']] = c['proba']

		return jsonify({
			'div': div,
			'style': style,
			'original_text': data['review'],
			'probabilities': probabilities,
			'distorted_texts': distorted_texts,
			'metrics': te.metrics_
		})
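A minimal, framework-free sketch of the TextExplainer flow used above (toy pipeline; all names are illustrative):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from eli5.lime import TextExplainer

texts = ['good great fine', 'bad awful poor', 'great acting', 'dull plot']
pipe = make_pipeline(CountVectorizer(), LogisticRegression()).fit(texts, [1, 0, 1, 0])

te = TextExplainer(random_state=0)
te.fit('dull plot but great acting', pipe.predict_proba)  # fits a white-box model on masked variants of the text
print(te.metrics_)  # how faithfully the surrogate mimics pipe near this document
expl = te.explain_prediction(target_names=['negative', 'positive'])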