def test_explain_feature_union(vec_cls):
    """Check HTML explanations for a classifier fed by a two-branch FeatureUnion.

    The ``url`` branch builds character trigrams from each document's ``url``
    field; the ``text`` branch vectorizes the ``text`` field. Both the weight
    explanation and a single-prediction explanation are rendered to HTML,
    saved through ``write_html`` and checked for the expected fragments.

    ``vec_cls`` is called as a vectorizer constructor accepting
    ``preprocessor``, ``analyzer`` and ``ngram_range`` (presumably supplied by
    test parametrization -- confirm against the test module).
    """
    def pick_url(doc):
        return doc['url']

    def pick_text(doc):
        return doc['text']

    samples = [
        ('http://a.com/blog', 'security research'),
        ('http://a.com', 'security research'),
        ('http://b.com/blog', 'health study'),
        ('http://b.com', 'health research'),
        ('http://c.com/blog', 'security'),
    ]
    data = [{'url': url, 'text': text} for url, text in samples]
    ys = [1, 0, 0, 0, 1]

    vec = FeatureUnion([
        ('url', vec_cls(preprocessor=pick_url, analyzer='char',
                        ngram_range=(3, 3))),
        ('text', vec_cls(preprocessor=pick_text)),
    ])
    features = vec.fit_transform(data)
    clf = LogisticRegression(random_state=42)
    clf.fit(features, ys)
    cls_name = vec_cls.__name__

    # Weight explanation: feature names are recovered via invert_hashing_and_fit.
    weights_html = format_as_html(
        explain_weights(clf, invert_hashing_and_fit(vec, data)))
    write_html(clf, weights_html, '', postfix='{}_weights'.format(cls_name))
    for fragment in ('text__security', 'url__log', 'BIAS'):
        assert fragment in weights_html

    # Prediction explanation for the first document.
    prediction_html = format_as_html(
        explain_prediction(clf, data[0], vec), force_weights=False)
    write_html(clf, prediction_html, '', postfix=cls_name)
    expected_fragments = (
        'text: Highlighted in text (sum)',
        'url: Highlighted in text (sum)',
        '<b>url:</b> <span',
        '<b>text:</b> <span',
        'BIAS',
    )
    for fragment in expected_fragments:
        assert fragment in prediction_html
def test_explain_feature_union_with_nontext(vec_cls):
    """Check HTML explanations when a FeatureUnion mixes text and non-text branches.

    The ``score`` branch is a ``DictVectorizer`` fed the whole document dict;
    the ``text`` branch is ``vec_cls`` reading the ``text`` field. The test
    renders the weight explanation and one prediction explanation, saves them
    through ``write_html`` and checks the HTML for the expected fragments.
    """
    def pick_text(doc):
        return doc['text']

    scores = [1, 0.1, 0.5, 0.5, 0.1]
    texts = [
        'security research',
        'security research',
        'health study',
        'health research',
        'security',
    ]
    data = [{'score': score, 'text': text}
            for score, text in zip(scores, texts)]
    ys = [1, 0, 0, 0, 1]

    vec = FeatureUnion([
        ('score', DictVectorizer()),
        ('text', vec_cls(preprocessor=pick_text)),
    ])
    features = vec.fit_transform(data)
    clf = LogisticRegression(random_state=42)
    clf.fit(features, ys)
    cls_name = vec_cls.__name__

    # Weight explanation over the inverted / refitted vectorizer.
    weights_html = format_as_html(
        explain_weights(clf, invert_hashing_and_fit(vec, data)))
    write_html(clf, weights_html, '', postfix='{}_weights'.format(cls_name))
    for fragment in ('score__score', 'text__security', 'BIAS'):
        assert fragment in weights_html

    # Prediction explanation for the first document.
    prediction_html = format_as_html(
        explain_prediction(clf, data[0], vec), force_weights=False)
    write_html(clf, prediction_html, '', postfix=cls_name)
    expected_fragments = (
        'text: Highlighted in text (sum)',
        '<b>text:</b> <span',
        'BIAS',
        'score__score',
    )
    for fragment in expected_fragments:
        assert fragment in prediction_html
def explain_pred(sentence):
    """Explain the prediction for *sentence* and append the HTML report to disk.

    Fits the module-level explainer ``te`` on ``sentence`` using
    ``pipe.predict_proba``, renders the top-20 explanation with
    ``format_as_html``, appends it to ``latest_prediction.html`` (relative to
    the current working directory) and prints ``te.metrics_``.

    Args:
        sentence: the document handed to ``te.fit``.

    Returns:
        None.
    """
    te.fit(sentence, pipe.predict_proba)
    t_pred = te.explain_prediction(top=20, target_names=[
        "ANB", "CAP", "ECON", "EDU", "ENV", "EX", "FED",
        "HEG", "NAT", "POL", "TOP", "ORI", "QER", "COL",
    ])
    html = format_as_html(t_pred)
    # The context manager closes the handle even when write() raises; the
    # previous open()/close() pair leaked it on error.
    with open("latest_prediction.html", "a+") as html_file:
        html_file.write(html)
    print(te.metrics_)
def explain_pred(input_data, model):
    """Classify each text in *input_data* and attach a base64 HTML explanation.

    Args:
        input_data: iterable of strings (``str.split`` is called on each item).
        model: pipeline-like object exposing ``steps``; the first step is
            passed to ``eli5.explain_prediction`` as the vectorizer and the
            last step as the estimator.

    Returns:
        The result of ``np.column_stack`` over three parallel columns, one row
        per input: the probability formatted as a percentage string, the
        predicted label (1 when the top target name starts with ``'N'``,
        otherwise 0) and the base64-encoded HTML explanation.
    """
    # Loop-invariant pipeline lookups.
    estimator = model.steps[-1][1]
    vectorizer = model.steps[0][1]

    y_preds = []
    y_probs = []
    encoded_htmls = []
    for text in input_data:
        expl = eli5.explain_prediction(
            estimator,
            text,
            vectorizer,
            target_names=['Compliant', 'Not Compliant'],
            top=10)
        html_explanation = format_as_html(
            expl, force_weights=False,
            show_feature_values=True).replace("\n", "").strip()
        encoded_htmls.append(
            base64.b64encode(bytes(html_explanation, encoding='utf-8')))

        top_target = format_as_dict(expl)['targets'][0]
        y_pred = 1 if top_target['target'].startswith('N') else 0
        y_prob = top_target['proba']
        if len(text.split()) < 3:
            # one or two words can't be non-compliant
            y_pred = 0
            y_prob = 1.0

        # Scale first, then round: ``round(p, 3) * 100`` leaks binary-float
        # noise into the label (e.g. 0.57 -> '56.99999999999999%').
        y_probs.append(f'{round(y_prob * 100, 1)}%')
        y_preds.append(y_pred)

    return np.column_stack((y_probs, y_preds, encoded_htmls))
def eli5visual(pData, pDesc, Idx, pAccountName, pVec, nTopKeywrd, pRootDir):
    """Render and save an eli5 HTML explanation for each selected record.

    For every id in ``Idx`` that is within range, the function loads the model
    for that record's intent via ``loadmodel``, pipelines it behind ``pVec``,
    fits a ``TextExplainer`` on the record's description and saves the
    rendered HTML with ``savehtml``.

    Args:
        pData: container indexed as ``pData['Intent'][i]`` and
            ``pData[pDesc][i]``; ``len(pData)`` is used as the id bound.
        pDesc: key of the description field in ``pData``.
        Idx: iterable of numeric record ids.
        pAccountName: forwarded to ``loadmodel``.
        pVec: vectorizer placed in front of the loaded model.
        nTopKeywrd: accepted for interface compatibility; not used here.
        pRootDir: forwarded to ``loadmodel`` and ``savehtml``.

    Returns:
        0 on completion, -1 if any exception was raised (the error and
        traceback are printed).
    """
    try:
        for record_id in Idx:
            # NOTE(review): ids are treated as zero-based positions, so an id
            # equal to len(pData) is out of range. The previous ``<=`` let it
            # through, and the resulting lookup error aborted the whole batch.
            # Confirm the id convention against the caller.
            if record_id < len(pData):
                row = int(record_id)
                pIntent = pData['Intent'][row]
                _, pModels = loadmodel(pRootDir, pAccountName, pIntent)
                pPipeModel = make_pipeline(pVec, pModels)
                pTe = TextExplainer(random_state=42).fit(
                    pData[pDesc][row], pPipeModel.predict_proba)
                pExplanation = pTe.explain_prediction()
                pHtml = format_as_html(pExplanation,
                                       force_weights=False,
                                       include_styles=False,
                                       horizontal_layout=True,
                                       show_feature_values=False)
                savehtml(pRootDir, pHtml, record_id, pIntent)
            else:
                print("Please select valid Id")
    except Exception as e:
        # Top-level boundary: report the failure and signal it via the
        # return code that callers already rely on.
        print(
            '*** ERROR[003]: Error in visualiation file of eil5visual function: ',
            sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return (-1)
    return (0)
def test_format_html_options(force_weights, horizontal_layout):
    """Smoke-test ``format_as_html`` options that are not tested elsewhere.

    A three-target linear regression is fitted on synthetic data; both its
    weight explanation and the explanation of the first sample's prediction
    are rendered with the given options and saved. The only check is that
    nothing crashes.
    """
    X, y = make_regression(
        n_samples=100, n_targets=3, n_features=10, random_state=42)
    reg = LinearRegression()
    reg.fit(X, y)
    weights_res = explain_weights_sklearn(reg)

    kwargs = {
        'force_weights': force_weights,
        'horizontal_layout': horizontal_layout,
    }
    option_tags = ['{}-{}'.format(name, value)
                   for name, value in sorted(kwargs.items())]
    postfix = '_' + '_'.join(option_tags)
    print(kwargs, postfix)

    # just check that it does not crash
    weights_html = format_as_html(weights_res, **kwargs)
    write_html(reg, weights_html, format_as_text(weights_res), postfix=postfix)

    pred_res = explain_prediction_sklearn(reg, X[0])
    pred_html = format_as_html(pred_res, **kwargs)
    write_html(reg, pred_html, format_as_text(pred_res),
               postfix='_expl' + postfix)
def format_as_all(res, clf, **kwargs):
    """Render an explanation in every supported format.

    Pretty-prints the dict form and checks it is JSON-serializable, formats
    the explanation as text and as HTML (forwarding ``kwargs`` to both
    formatters), prints the text, saves the HTML through ``write_html`` and
    returns ``(text, html)``.
    """
    as_dict = format_as_dict(res)
    pprint(as_dict)
    # Result is discarded: the call only proves the dict serializes to JSON.
    json.dumps(as_dict)

    text_out = format_as_text(res, **kwargs)
    html_out = format_as_html(res, **kwargs)
    print(text_out)
    write_html(clf, html_out, text_out, caller_depth=2)
    return text_out, html_out
def links_expls(model, le, item):
    """Explain the link model's prediction for every link found in *item*.

    Links come from ``raw_html_links(le, item['url'], item['raw_content'])``.
    Each one is explained with ``model['Q'].clf_online`` and
    ``model['link_vectorizer']``.

    Returns:
        A list of ``(score, html)`` tuples in link order, where ``score`` is
        the first target's score and ``html`` is the weights-only rendering
        without embedded styles. The item URL and the number of explanations
        are printed before returning.
    """
    def scored_html(link):
        expl = explain_prediction(
            model['Q'].clf_online, doc=link, vec=model['link_vectorizer'])
        score = expl.targets[0].score
        html = format_as_html(
            expl, include_styles=False, force_weights=False,
            show=fields.WEIGHTS)
        return score, html

    links = raw_html_links(le, item['url'], item['raw_content'])
    explanations = [scored_html(link) for link in links]
    print(item['url'], len(explanations))
    return explanations
def test_explain_feature_union(vec_cls):
    """Check the prediction explanation for a two-branch text FeatureUnion.

    The ``url`` branch uses character trigrams of the ``url`` field and the
    ``text`` branch vectorizes the ``text`` field. The explanation of the
    first document is rendered to HTML, saved through ``write_html`` and
    checked for per-branch highlighted text and the bias term.
    """
    def pick_url(doc):
        return doc['url']

    def pick_text(doc):
        return doc['text']

    samples = [
        ('http://a.com/blog', 'security research'),
        ('http://a.com', 'security research'),
        ('http://b.com/blog', 'health study'),
        ('http://b.com', 'health research'),
        ('http://c.com/blog', 'security'),
    ]
    data = [{'url': url, 'text': text} for url, text in samples]
    ys = [1, 0, 0, 0, 1]

    url_vec = vec_cls(preprocessor=pick_url, analyzer='char',
                      ngram_range=(3, 3))
    text_vec = vec_cls(preprocessor=pick_text)
    vec = FeatureUnion([('url', url_vec), ('text', text_vec)])

    features = vec.fit_transform(data)
    clf = LogisticRegression(random_state=42)
    clf.fit(features, ys)

    html_expl = format_as_html(
        explain_prediction(clf, data[0], vec), force_weights=False)
    write_html(clf, html_expl, '')

    expected_fragments = (
        'text: Highlighted in text (sum)',
        'url: Highlighted in text (sum)',
        '<b>url:</b> <span',
        '<b>text:</b> <span',
        'BIAS',
    )
    for fragment in expected_fragments:
        assert fragment in html_expl
def _logit_model_eli5_explain_weights(model_type, cur_key, cur_trait, score, clf, vectorizer, eli_5_dir, logging,
                                      cur_time):
    """Explains top K features with the highest coefficient values, per class, using eli5.

    Builds the weight explanation for ``clf`` (top 40 features, targets named
    ``'under'`` / ``'over'``), renders it as HTML, prefixes a short header
    describing the run and writes the result to
    ``<eli_5_dir>/<cur_time>/weights/<score>_<trait>_<model_type>_<key>.html``.

    Args:
        model_type: model label used in the header and the file name.
        cur_key: parameter description used in the header and the file name.
        cur_trait: trait label used in the header and the file name.
        score: numeric score; rounded to 3 decimals for the file name.
        clf: fitted linear classifier to explain.
        vectorizer: vectorizer passed to eli5 as ``vec``.
        eli_5_dir: root output directory.
        logging: logger-like object; only ``.info`` is called.
        cur_time: run identifier used as a sub-directory name.

    Returns:
        None.
    """
    import os

    from eli5 import formatters
    from eli5 import sklearn

    logging.info("_base_logit_model_eli5_explain_weights trait = {}, score = {}, model_type = {}, parameters = {}".format(
        cur_trait, score, model_type, cur_key))

    # get explanation
    eli5_ew = sklearn.explain_linear_classifier_weights(
        clf=clf, vec=vectorizer, top=40, target_names=['under', 'over'])
    # format explanation as html
    eli5_fh = formatters.format_as_html(eli5_ew)

    # create relevant sub dir for output files
    output_path = eli_5_dir + '/' + str(cur_time) + '/weights'
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    wf_path = output_path + '/{}_{}_{}_{}.html'.format(
        round(score, 3), cur_trait, model_type, cur_key)
    prefix_to_html = ('Personality trait: ' + str(cur_trait) + ', Score: ' + str(score) + '<br><br>' +
                      'Model: ' + str(model_type) + '<br><br>' +
                      'Parameters: ' + str(cur_key) + '<br><br>').encode('utf-8', 'replace')
    lines_final = prefix_to_html + eli5_fh.encode('utf8', 'replace')

    logging.info("writing weight explanation to file {}".format(wf_path))
    # The payload is already UTF-8 bytes, so write it in binary mode with a
    # single write(). writelines() on a text-mode handle raises TypeError on
    # Python 3 and emits one byte at a time on Python 2.
    with open(wf_path, 'wb') as wf:
        wf.write(lines_final)
def sklearn_predict(lines, current_app):
    """Classify each non-empty line and return prediction/explanation records.

    Stop words (module-level ``stopwords``) are removed from every line before
    it is explained with the pipeline stored in
    ``current_app.config['ESTIMATOR']``. Lines that are empty after stop-word
    removal are skipped and produce no record.

    Args:
        lines: iterable of strings.
        current_app: object whose ``config['ESTIMATOR']`` is a pipeline-like
            object exposing ``steps`` (first step = vectorizer, last step =
            estimator).

    Returns:
        A list of dicts with keys ``y_pred``, ``line``, ``clean_line``,
        ``y_prob`` (percentage string) and ``expl`` (HTML explanation), one
        per line that was actually classified.
    """
    estimator = current_app.config['ESTIMATOR']
    classifier = estimator.steps[-1][1]
    vectorizer = estimator.steps[0][1]

    results = []
    for raw_line in lines:
        cleaned = ' '.join(t for t in raw_line.split() if t not in stopwords)
        if not cleaned:
            continue

        expl = eli5.explain_prediction(
            classifier,
            cleaned,
            vectorizer,
            target_names=['Compliant', 'Not Compliant'],
            top=10)
        html_explanation = format_as_html(
            expl, force_weights=False,
            show_feature_values=True).replace("\n", "").strip()

        top_target = format_as_dict(expl)['targets'][0]
        y_pred = 1 if top_target['target'].startswith('N') else 0
        y_prob = top_target['proba']
        if len(cleaned.split()) < 3:
            # one or two words can't be non-compliant
            y_pred = 0
            y_prob = 1.0

        # Build the record inside the loop so each prediction stays paired
        # with the line it was computed from. Zipping against the unfiltered
        # ``lines`` afterwards misaligned every record following a skipped
        # line.
        results.append(dict(
            y_pred=y_pred,
            line=raw_line,
            # NOTE(review): ``clean_line`` has always carried the original
            # line, not the stop-word-stripped text; kept as-is for callers.
            clean_line=raw_line,
            # Scale before rounding to avoid float noise such as
            # '56.99999999999999%'.
            y_prob=f'{round(y_prob * 100, 1)}%',
            expl=html_explanation,
        ))
    return results
def test_explain_feature_union_with_nontext(vec_cls):
    """Check the prediction explanation for a mixed text / non-text FeatureUnion.

    The ``score`` branch is a ``DictVectorizer`` and the ``text`` branch is
    ``vec_cls`` reading the ``text`` field. The explanation of the first
    document is rendered to HTML, saved through ``write_html`` and checked
    for highlighted text, the bias term and the ``score`` feature.
    """
    def pick_text(doc):
        return doc['text']

    scores = [1, 0.1, 0.5, 0.5, 0.1]
    texts = [
        'security research',
        'security research',
        'health study',
        'health research',
        'security',
    ]
    data = [{'score': score, 'text': text}
            for score, text in zip(scores, texts)]
    ys = [1, 0, 0, 0, 1]

    score_vec = DictVectorizer()
    text_vec = vec_cls(preprocessor=pick_text)
    vec = FeatureUnion([('score', score_vec), ('text', text_vec)])

    features = vec.fit_transform(data)
    clf = LogisticRegression(random_state=42)
    clf.fit(features, ys)

    html_expl = format_as_html(
        explain_prediction(clf, data[0], vec), force_weights=False)
    write_html(clf, html_expl, '')

    expected_fragments = (
        'text: Highlighted in text (sum)',
        '<b>text:</b> <span',
        'BIAS',
        'score__score',
    )
    for fragment in expected_fragments:
        assert fragment in html_expl
def _logit_model_eli5_explain_prediction(model_type, cur_key, cur_trait, score, auc_score, clf, vectorizer, X_raw,
                                         X_vectorized, Y, eli_5_dir, logging, cur_time, k=10):
    """Explains top K predictions with the highest confidence where the model was correct, per class, using eli5.

    Positive-class probabilities are thresholded at 0.5. Every misclassified
    sample has its probability replaced by the threshold so that it does not
    rank among the extremes. The ``k`` highest ('over') and ``k`` lowest
    ('under') remaining samples are then explained and each explanation is
    written as an HTML file under
    ``<eli_5_dir>/<cur_time>/predictions/<score>_accuracy_auc_<auc>_<trait>_<model_type>_<key>/``.

    Args:
        model_type, cur_key, cur_trait: labels used in logging and in the
            output directory name (concatenated as strings).
        score, auc_score: numeric metrics, rounded to 3 decimals for the
            directory name.
        clf: fitted classifier exposing ``predict_proba``.
        vectorizer: vectorizer passed to eli5 as ``vec``.
        X_raw: raw inputs; accepted for interface compatibility, not read.
        X_vectorized: vectorized inputs, indexable by sample position.
        Y: true labels; must contain exactly two distinct values.
        eli_5_dir: root output directory.
        logging: logger-like object; only ``.info`` is called.
        cur_time: run identifier used as a sub-directory name.
        k: number of samples to explain per side.

    Raises:
        AssertionError: if ``Y`` does not contain exactly two distinct labels.
    """
    import os

    import numpy as np
    from eli5 import formatters
    from eli5 import sklearn

    logging.info("_base_logit_model_eli5_explain_prediction trait = {}, score = {}, model_type = {}, parameters = {}".format(
        cur_trait, score, model_type, cur_key))

    num_labels = len(np.unique(Y))
    assert num_labels == 2, "currently supporting evaluation for binary problems only"

    # TODO: derive the optimal threshold instead of using a fixed one.
    threshold_opt = 0.5

    # get probability predictions for input data
    Y_probs = clf.predict_proba(X_vectorized)
    # get binary scores from probabilities using threshold
    # (the ``np.int`` alias no longer exists in NumPy; builtin int is equivalent)
    Y_pred = (Y_probs[:, 1] > threshold_opt).astype(int)
    Y_wrong = np.logical_not(np.equal(Y, Y_pred))

    # place threshold value for all wrong locations and by doing that
    # make sure they will not be picked as most extreme when sorting
    Y_probs_adjusted = Y_probs[:, 1].copy()
    Y_probs_adjusted[Y_wrong] = threshold_opt

    # take top k samples where the model had the greatest confidence and it was correct
    order = np.argsort(Y_probs_adjusted)
    # ``order[-k:]`` would select *everything* for k == 0, so slice from an
    # explicit non-negative start instead.
    np_topk = order[max(len(order) - k, 0):]
    np_bottomk = order[:k]
    dict_of_lists = {'over': np_topk, 'under': np_bottomk}

    # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only.
    for position, np_list in dict_of_lists.items():
        for loc in np_list:
            # get explanation
            eli5_ep = sklearn.explain_prediction_linear_classifier(
                clf=clf, doc=X_vectorized[loc], vec=vectorizer, top=20,
                target_names=['under', 'over'], vectorized=True)
            # format explanation as html
            eli5_fh = formatters.format_as_html(explanation=eli5_ep)

            if position == 'under':
                # manually replace red and green within the html
                symbol_temp = "tempTEMPtempTEMPtemp"
                eli5_fh = eli5_fh.replace("hsl(0", symbol_temp)
                eli5_fh = eli5_fh.replace("hsl(120", "hsl(0")
                eli5_fh = eli5_fh.replace(symbol_temp, "hsl(120")

            # create relevant sub dir for output files
            output_path = eli_5_dir + '/' + str(cur_time) + '/predictions/' + str(round(score, 3)) + \
                '_accuracy' + '_auc_' + str(round(auc_score, 3)) + '_' + cur_trait + '_' + model_type \
                + '_' + cur_key
            if not os.path.exists(output_path):
                os.makedirs(output_path)

            # write explanation html to file
            wf_path = output_path + '/type_{}_loc_{}_pred_{}.html'.format(
                position, loc, round(Y_probs_adjusted[loc], 4))
            # The HTML is encoded to UTF-8 bytes, so the file must be opened
            # in binary mode; the (always empty) text prefix that used to be
            # concatenated with the bytes has been dropped.
            lines_final = eli5_fh.encode('utf8', 'replace')
            with open(wf_path, 'wb') as wf:
                wf.write(lines_final)
clf = SVC(C=150, gamma=2e-2, probability=True) pipe = make_pipeline(lsa, clf) pipe.fit(twenty_train.data, twenty_train.target) pipe.score(twenty_test.data, twenty_test.target) doc = twenty_test.data[0] print_prediction(doc) te = TextExplainer(random_state=42) te.fit(doc, pipe.predict_proba) #print(te.explain_prediction(target_names=twenty_train.target_names)) #print(eli5.format_as_image(te.explain_weights(target_names=twenty_train.target_names))) show_html = lambda html: display(HTML(html)) show_html_expl = lambda expl, **kwargs: show_html( format_as_html(expl, include_styles=False, **kwargs)) show_html(format_html_styles()) weights = eli5.show_weights(clf, vec=vec, target_names=train['target_names'], horizontal_layout=False) pred = show_html_expl(explain_prediction(clf, test['data'][2], vec, target_names=train['target_names']), force_weights=False, horizontal_layout=True) with open('weights.htm', 'wb') as f: