Example #1
def test_explain_feature_union(vec_cls):
    data = [
        {
            'url': 'http://a.com/blog',
            'text': 'security research'
        },
        {
            'url': 'http://a.com',
            'text': 'security research'
        },
        {
            'url': 'http://b.com/blog',
            'text': 'health study'
        },
        {
            'url': 'http://b.com',
            'text': 'health research'
        },
        {
            'url': 'http://c.com/blog',
            'text': 'security'
        },
    ]
    ys = [1, 0, 0, 0, 1]
    url_vec = vec_cls(preprocessor=lambda x: x['url'],
                      analyzer='char',
                      ngram_range=(3, 3))
    text_vec = vec_cls(preprocessor=lambda x: x['text'])
    vec = FeatureUnion([('url', url_vec), ('text', text_vec)])
    xs = vec.fit_transform(data)
    clf = LogisticRegression(random_state=42)
    clf.fit(xs, ys)

    ivec = invert_hashing_and_fit(vec, data)
    weights_res = explain_weights(clf, ivec)
    html_expl = format_as_html(weights_res)
    write_html(clf,
               html_expl,
               '',
               postfix='{}_weights'.format(vec_cls.__name__))
    assert 'text__security' in html_expl
    assert 'url__log' in html_expl
    assert 'BIAS' in html_expl

    pred_res = explain_prediction(clf, data[0], vec)
    html_expl = format_as_html(pred_res, force_weights=False)
    write_html(clf, html_expl, '', postfix=vec_cls.__name__)
    assert 'text: Highlighted in text (sum)' in html_expl
    assert 'url: Highlighted in text (sum)' in html_expl
    assert '<b>url:</b> <span' in html_expl
    assert '<b>text:</b> <span' in html_expl
    assert 'BIAS' in html_expl
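Note: write_html is a test helper from the eli5 test suite, not a public API; a minimal hypothetical stand-in that matches the call sites in these examples:

def write_html(clf, html, text, postfix='', caller_depth=1):
    # Hypothetical stand-in: persist the rendered HTML for manual inspection.
    # The real test helper also uses `text` and `caller_depth`; both are
    # ignored here for brevity.
    path = '{}{}.html'.format(type(clf).__name__, postfix)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)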
Example #2
def test_explain_feature_union_with_nontext(vec_cls):
    data = [
        {
            'score': 1,
            'text': 'security research'
        },
        {
            'score': 0.1,
            'text': 'security research'
        },
        {
            'score': 0.5,
            'text': 'health study'
        },
        {
            'score': 0.5,
            'text': 'health research'
        },
        {
            'score': 0.1,
            'text': 'security'
        },
    ]
    ys = [1, 0, 0, 0, 1]
    score_vec = DictVectorizer()
    text_vec = vec_cls(preprocessor=lambda x: x['text'])
    vec = FeatureUnion([('score', score_vec), ('text', text_vec)])
    xs = vec.fit_transform(data)
    clf = LogisticRegression(random_state=42)
    clf.fit(xs, ys)

    ivec = invert_hashing_and_fit(vec, data)
    weights_res = explain_weights(clf, ivec)
    html_expl = format_as_html(weights_res)
    write_html(clf,
               html_expl,
               '',
               postfix='{}_weights'.format(vec_cls.__name__))
    assert 'score__score' in html_expl
    assert 'text__security' in html_expl
    assert 'BIAS' in html_expl

    res = explain_prediction(clf, data[0], vec)
    html_expl = format_as_html(res, force_weights=False)
    write_html(clf, html_expl, '', postfix=vec_cls.__name__)
    assert 'text: Highlighted in text (sum)' in html_expl
    assert '<b>text:</b> <span' in html_expl
    assert 'BIAS' in html_expl
    assert 'score__score' in html_expl
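Both feature-union tests receive vec_cls through pytest parametrization; a plausible fixture sketch (the exact parameter set is an assumption):

import pytest
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer, HashingVectorizer)

@pytest.fixture(params=[CountVectorizer, TfidfVectorizer, HashingVectorizer])
def vec_cls(request):
    # HashingVectorizer is the case that makes invert_hashing_and_fit above
    # meaningful: it recovers readable feature names from the sample data,
    # and is a no-op for the non-hashing vectorizers.
    return request.param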
Example #3
def explain_pred(sentence):
    te.fit(sentence, pipe.predict_proba)
    #txt = format_as_text(te.explain_prediction(target_names=["green", "neutral", "red"]))
    t_pred = te.explain_prediction(top=20,
                                   target_names=[
                                       "ANB",
                                       "CAP",
                                       "ECON",
                                       "EDU",
                                       "ENV",
                                       "EX",
                                       "FED",
                                       "HEG",
                                       "NAT",
                                       "POL",
                                       "TOP",
                                       "ORI",
                                       "QER",
                                       "COL",
                                   ])
    txt = format_as_text(t_pred)
    html = format_as_html(t_pred)
    # append the latest explanation to a running HTML log
    with open("latest_prediction.html", "a+") as html_file:
        html_file.write(html)
    print(te.metrics_)
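te and pipe are module-level globals in the original script; a minimal setup sketch with placeholder names (train_texts and train_labels are assumptions):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from eli5.lime import TextExplainer

pipe = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipe.fit(train_texts, train_labels)  # placeholder training data
te = TextExplainer(random_state=42)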
Example #4
def explain_pred(input_data, model):
    y_preds = []
    y_probs = []
    encoded_htmls = []
    for i in input_data:
        expl = eli5.explain_prediction(
            model.steps[-1][1],
            i,
            model.steps[0][1],
            target_names=['Compliant', 'Not Compliant'],
            top=10)
        html_explanation = format_as_html(expl,
                                          force_weights=False,
                                          show_feature_values=True).replace(
                                              "\n", "").strip()
        encoded_html = base64.b64encode(
            bytes(html_explanation, encoding='utf-8'))
        encoded_htmls.append(encoded_html)
        expl_dict = format_as_dict(expl)
        targets = expl_dict['targets'][0]
        target = targets['target']
        y_pred = 1 if target.startswith('N') else 0
        y_prob = targets['proba']
        if len(i.split()) < 3:
            # one or two words can't be non-compliant
            y_pred = 0
            y_prob = 1.0
        y_prob = f'{y_prob * 100:.1f}%'
        # collect per-item results inside the loop so the columns passed to
        # np.column_stack below stay aligned
        y_preds.append(y_pred)
        y_probs.append(y_prob)
    inferences = np.column_stack((y_probs, y_preds, encoded_htmls))

    return inferences
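The HTML is base64-encoded so it survives being packed into a numpy array of strings; decoding one stored explanation back (assuming the inferences array returned above):

import base64
html = base64.b64decode(inferences[0, 2]).decode('utf-8')  # third column holds the HTML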
Example #5
def eli5visual(pData, pDesc, Idx, pAccountName, pVec, nTopKeywrd, pRootDir):
    try:
        for i in range(len(Idx)):
            if Idx[i] < len(pData):
                pIntent = pData['Intent'][int(Idx[i])]
                _, pModels = loadmodel(pRootDir, pAccountName, pIntent)
                pPipeModel = make_pipeline(pVec, pModels)
                pTe = TextExplainer(random_state=42).fit(
                    pData[pDesc][int(Idx[i])], pPipeModel.predict_proba)
                pExplanation = pTe.explain_prediction()
                pHtml = format_as_html(pExplanation,
                                       force_weights=False,
                                       include_styles=False,
                                       horizontal_layout=True,
                                       show_feature_values=False)
                savehtml(pRootDir, pHtml, Idx[i], pIntent)
            else:
                print("Please select valid Id")

    except Exception as e:
        print(
            '*** ERROR[003]: Error in visualization file, eli5visual function: ',
            sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return -1
    return 0
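loadmodel and savehtml are project-specific helpers that are not shown; a hypothetical savehtml matching the call site (the HTML was built with include_styles=False, so the CSS is prepended to keep the file self-contained):

import os
from eli5.formatters import format_html_styles

def savehtml(pRootDir, pHtml, pId, pIntent):
    # Hypothetical helper: one explanation file per (id, intent).
    out = os.path.join(pRootDir, 'eli5_{}_{}.html'.format(pId, pIntent))
    with open(out, 'w', encoding='utf-8') as f:
        f.write(format_html_styles() + pHtml)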
Example #6
def test_format_html_options(force_weights, horizontal_layout):
    # test options that are not tested elsewhere
    X, y = make_regression(n_samples=100, n_targets=3, n_features=10,
                           random_state=42)
    reg = LinearRegression()
    reg.fit(X, y)
    res = explain_weights_sklearn(reg)
    kwargs = dict(
        force_weights=force_weights, horizontal_layout=horizontal_layout)
    postfix = '_' + '_'.join(
        '{}-{}'.format(k, v) for k, v in sorted(kwargs.items()))
    print(kwargs, postfix)
    # just check that it does not crash
    expl = format_as_html(res, **kwargs)
    write_html(reg, expl, format_as_text(res), postfix=postfix)
    pred_res = explain_prediction_sklearn(reg, X[0])
    pred_expl = format_as_html(pred_res, **kwargs)
    write_html(reg, pred_expl, format_as_text(pred_res),
               postfix='_expl' + postfix)
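explain_weights_sklearn and explain_prediction_sklearn are the scikit-learn handlers that the generic entry points dispatch to; the following is equivalent for a fitted LinearRegression:

import eli5
res = eli5.explain_weights(reg)                # dispatches to explain_weights_sklearn
pred_res = eli5.explain_prediction(reg, X[0])  # dispatches to the sklearn handler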
Example #7
def format_as_all(res, clf, **kwargs):
    """ Format explanation as text and html, check JSON-encoding,
    print text explanation, save html, return text and html.
    """
    expl_dict = format_as_dict(res)
    pprint(expl_dict)
    json.dumps(expl_dict)  # check that it can be serialized to JSON
    expl_text = format_as_text(res, **kwargs)
    expl_html = format_as_html(res, **kwargs)
    print(expl_text)
    write_html(clf, expl_html, expl_text, caller_depth=2)
    return expl_text, expl_html
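Typical use inside a test, assuming an explanation res and a fitted linear clf as in the other examples:

expl_text, expl_html = format_as_all(res, clf)
assert 'BIAS' in expl_text and 'BIAS' in expl_html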
Example #8
def links_expls(model, le, item):
    explanations = []
    for link in raw_html_links(le, item['url'], item['raw_content']):
        expl = explain_prediction(model['Q'].clf_online,
                                  doc=link,
                                  vec=model['link_vectorizer'])
        explanations.append((expl.targets[0].score,
                             format_as_html(expl,
                                            include_styles=False,
                                            force_weights=False,
                                            show=fields.WEIGHTS)))
    print(item['url'], len(explanations))
    return explanations
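show=fields.WEIGHTS restricts the HTML to the weights table, dropping the estimator description; the fields module also exposes INFO and ALL:

from eli5.formatters import fields
html = format_as_html(expl, show=fields.ALL)  # description plus weights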
Example #9
def test_explain_feature_union(vec_cls):
    data = [
        {
            'url': 'http://a.com/blog',
            'text': 'security research'
        },
        {
            'url': 'http://a.com',
            'text': 'security research'
        },
        {
            'url': 'http://b.com/blog',
            'text': 'health study'
        },
        {
            'url': 'http://b.com',
            'text': 'health research'
        },
        {
            'url': 'http://c.com/blog',
            'text': 'security'
        },
    ]
    ys = [1, 0, 0, 0, 1]
    vec = FeatureUnion([
        ('url',
         vec_cls(preprocessor=lambda x: x['url'],
                 analyzer='char',
                 ngram_range=(3, 3))),
        ('text', vec_cls(preprocessor=lambda x: x['text'])),
    ])
    xs = vec.fit_transform(data)
    clf = LogisticRegression(random_state=42)
    clf.fit(xs, ys)
    res = explain_prediction(clf, data[0], vec)
    html_expl = format_as_html(res, force_weights=False)
    write_html(clf, html_expl, '')
    assert 'text: Highlighted in text (sum)' in html_expl
    assert 'url: Highlighted in text (sum)' in html_expl
    assert '<b>url:</b> <span' in html_expl
    assert '<b>text:</b> <span' in html_expl
    assert 'BIAS' in html_expl
Example #10
    def _logit_model_eli5_explain_weights(model_type, cur_key, cur_trait, score, clf, vectorizer, eli_5_dir, logging, cur_time):
            """Explains top K features with the highest coefficient values, per class, using eli5"""

            import os
            from eli5 import sklearn
            from eli5 import formatters

            logging.info("_base_logit_model_eli5_explain_weights trait = {}, score = {}, model_type = {}, parameters = {}".format(cur_trait, score, model_type, cur_key))

            # get explanation
            eli5_ew = sklearn.explain_linear_classifier_weights(
                clf=clf,
                vec=vectorizer,
                top=40,
                target_names=['under', 'over']
            )
            # format explanation as html
            eli5_fh = formatters.format_as_html(eli5_ew)

            # create relevant sub dir for output files
            # output_path_for_vertical = self.params['eli5_output_path_eli5'] + '/' + vertical + '/weights'
            # output_path_for_vertical = log + '/' + vertical + '/weights'
            output_path = eli_5_dir + '/' + str(cur_time) + '/weights'
            if not os.path.exists(output_path):
                os.makedirs(output_path)

            # write explanation html to file
            wf_path = output_path + '/{}_{}_{}_{}.html' \
                .format(round(score, 3), cur_trait, model_type, cur_key)

            prefix_to_html = ('Personality trait: ' + str(cur_trait) + ', Score: ' + str(score) +
                              '<br><br>' + 'Model: ' + str(model_type) + '<br><br>' + 'Parameters: ' +
                              str(cur_key) + '<br><br>').encode('utf-8', 'replace')

            lines_final = prefix_to_html + eli5_fh.encode('utf8', 'replace')

            logging.info("writing weight explanation to file {}".format(wf_path))
            with open(wf_path, 'wb') as wf:
                # lines_final is bytes, so the file must be opened in binary mode
                wf.write(lines_final)
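sklearn.explain_linear_classifier_weights is the low-level handler for linear classifiers; the generic entry point resolves to it here and also covers other estimator types:

import eli5
eli5_ew = eli5.explain_weights(clf, vec=vectorizer, top=40,
                               target_names=['under', 'over'])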
Example #11
def sklearn_predict(lines, current_app):
    estimator = current_app.config['ESTIMATOR']
    explanations = []
    y_preds = []
    y_probs = []
    kept_lines = []
    clean_lines = []
    for line in lines:
        clean_line = ' '.join(t for t in line.split() if t not in stopwords)
        if not clean_line.strip():
            continue
        # track kept originals and their cleaned forms so skipped (empty)
        # lines cannot misalign the zip() at the end
        kept_lines.append(line)
        clean_lines.append(clean_line)
        expl = eli5.explain_prediction(
            estimator.steps[-1][1],
            clean_line,
            estimator.steps[0][1],
            target_names=['Compliant', 'Not Compliant'],
            top=10)
        html_explanation = format_as_html(expl,
                                          force_weights=False,
                                          show_feature_values=True).replace(
                                              "\n", "").strip()
        explanations.append(html_explanation)
        expl_dict = format_as_dict(expl)
        targets = expl_dict['targets'][0]
        target = targets['target']
        y_pred = 1 if target.startswith('N') else 0
        y_prob = targets['proba']
        if len(clean_line.split()) < 3:
            # one or two words can't be non-compliant
            y_pred = 0
            y_prob = 1.0
        y_preds.append(y_pred)
        y_probs.append(y_prob)
    y_probs = [f'{y_prob * 100:.1f}%' for y_prob in y_probs]
    data = zip(y_preds, kept_lines, clean_lines, y_probs, explanations)
    results = [
        dict(y_pred=y, line=l, clean_line=cl, y_prob=y_prob, expl=expl)
        for y, l, cl, y_prob, expl in data
    ]
    return results
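stopwords is a module-level set in the original app; a minimal stand-in, assuming NLTK's English stop word list:

from nltk.corpus import stopwords as nltk_stopwords  # requires nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))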
Example #12
def test_explain_feature_union_with_nontext(vec_cls):
    data = [
        {
            'score': 1,
            'text': 'security research'
        },
        {
            'score': 0.1,
            'text': 'security research'
        },
        {
            'score': 0.5,
            'text': 'health study'
        },
        {
            'score': 0.5,
            'text': 'health research'
        },
        {
            'score': 0.1,
            'text': 'security'
        },
    ]
    ys = [1, 0, 0, 0, 1]
    vec = FeatureUnion([
        ('score', DictVectorizer()),
        ('text', vec_cls(preprocessor=lambda x: x['text'])),
    ])
    xs = vec.fit_transform(data)
    clf = LogisticRegression(random_state=42)
    clf.fit(xs, ys)
    res = explain_prediction(clf, data[0], vec)
    html_expl = format_as_html(res, force_weights=False)
    write_html(clf, html_expl, '')
    assert 'text: Highlighted in text (sum)' in html_expl
    assert '<b>text:</b> <span' in html_expl
    assert 'BIAS' in html_expl
    assert 'score__score' in html_expl
Example #13
    def _logit_model_eli5_explain_prediction(model_type, cur_key, cur_trait, score, auc_score, clf, vectorizer, X_raw,
                                             X_vectorized, Y, eli_5_dir, logging, cur_time, k=10):
            """Explains top K predictions with the highest confidence where the model was correct, per class, using eli5"""

            import copy
            import numpy as np
            from eli5 import sklearn
            from eli5 import formatters
            import os

            logging.info("_base_logit_model_eli5_explain_prediction trait = {}, score = {}, model_type = {}, parameters = {}".format(
                    cur_trait, score, model_type, cur_key))

            num_labels = len(np.unique(Y))
            assert num_labels == 2, "currently supporting evaluation for binary problems only"

            '''assert 'description' in self.item_data_logit_features_list

            # get optimal threshold for stest
            sdt = 'stest'
            assert 'threshold_value' in self.item_data_logit_models[model_type][vertical] \
                [price_col, category_col][sdt]['threshold_opt']'''

            if False:
                # TODO
                threshold_opt = 'calculate' # self.item_data_logit_models[model_type][vertical][price_col, category_col][sdt]['threshold_opt']['threshold_value']
            else:
                threshold_opt = 0.5

            # get probability predictions for input data
            Y_probs = clf.predict_proba(X_vectorized)

            # get binary scores from probabilities using threshold
            Y_pred = (Y_probs[:, 1] > threshold_opt).astype(int)
            Y_wrong = np.logical_not(np.equal(Y, Y_pred))
            # Y_wrong = np.logical_not(np.equal(Y, Y_probs))

            # place threshold value for all wrong locations and by doing that
            # make sure they will not be picked as most extreme when sorting
            Y_probs_adjusted = copy.deepcopy(Y_probs[:, 1])
            Y_probs_adjusted[Y_wrong] = threshold_opt

            # take top k samples where the model had the greatest confidence and it was correct
            np_topk = np.argsort(Y_probs_adjusted)[-k:]
            np_bottomk = np.argsort(Y_probs_adjusted)[:k]

            dict_of_lists = {'over': np_topk, 'under': np_bottomk}

            for position, np_list in dict_of_lists.items():

                for i, loc in enumerate(np_list):

                    # raw = X_raw.iloc[[loc]]['item_description'].values[0]
                    raw = X_raw[loc]

                    # print(raw)
                    # print("real: {}, model probability: {}".format(Y.iloc[loc], Y_probs[loc, 1]))

                    prefix_to_html = ''
                    try:
                        a = 4
                        # real_pred = "true label: {}, model probability: {} (model prediction: {})<br><br>" \
                        #    .format(Y.iloc[loc], Y_probs[loc, 1], Y_pred[loc])
                        '''real_pred = "true label: {}, model probability: {} (model prediction: {})<br><br>" \
                            .format(Y.iloc[loc], Y_probs[loc, 1], Y_probs[loc])

                        logging.info(
                            "_base_logit_model_eli5_explain_prediction = {}, score = {}, model_type = {}, parameters = {}".format(
                                cur_trait, score, model_type, cur_key))

                        prefix_to_html = ('Personality trait: ' + str(cur_trait) + '<br><br>' + 'parameters: ' +
                                          str(cur_key) + '<br><br>' + real_pred + 'item_description:<br>' + raw).encode('utf-8', 'replace')'''

                    except UnicodeDecodeError:
                        pass
                        # TODO: handle this properly

                    # get explanation
                    eli5_ep = sklearn.explain_prediction_linear_classifier(clf=clf, doc=X_vectorized[loc],
                                                                           vec=vectorizer, top=20,
                                                                           target_names=['under', 'over'], vectorized=True)

                    # format explanation as html
                    eli5_fh = formatters.format_as_html(explanation=eli5_ep)

                    if position == 'under':
                        # manually replace red and green within the html
                        symbol_temp = "tempTEMPtempTEMPtemp"
                        eli5_fh = eli5_fh.replace("hsl(0", symbol_temp)
                        eli5_fh = eli5_fh.replace("hsl(120", "hsl(0")
                        eli5_fh = eli5_fh.replace(symbol_temp, "hsl(120")

                    '''
                    output_path = eli_5_dir + '/' + str(cur_time)
                    if not os.path.exists(output_path):
                        os.makedirs(output_path)

                    # write explanation html to file
                    wf_path = output_path + '/{}_{}_{}_{}.html' \
                        .format(round(score, 3), cur_trait, model_type, cur_key)

                    prefix_to_html = ('Personality trait: ' + str(cur_trait) + ', Score: ' + str(score) +
                                      '<br><br>' + 'Model: ' + str(model_type) + '<br><br>' + 'Parameters: ' +
                                      str(cur_key) + '<br><br>').encode('utf-8', 'replace')
                                      '''

                    # create relevant sub dir for output files
                    output_path = eli_5_dir + '/' + str(cur_time) + '/predictions/' + str(round(score, 3)) + \
                                  '_accuracy' + '_auc_' + str(round(auc_score, 3)) + '_' + cur_trait + '_' + model_type \
                                  + '_' + cur_key
                    if not os.path.exists(output_path):
                        os.makedirs(output_path)

                    # write explanation html to file
                    wf_path = output_path + '/type_{}_loc_{}_pred_{}.html' \
                        .format(position, loc, round(Y_probs_adjusted[loc], 4))

                    lines_final = prefix_to_html + eli5_fh.encode('utf8', 'replace')

                    # logging.info("writing prediction explanation to file".format(wf_path))

                    with open(wf_path, 'wb') as wf:
                        # lines_final is bytes, so write in binary mode
                        wf.write(lines_final)
Example #14
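# `lsa`, `twenty_train`, `twenty_test` and `print_prediction` are defined
# earlier in the original notebook (an LSA pipeline over 20 newsgroups data).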
clf = SVC(C=150, gamma=2e-2, probability=True)
pipe = make_pipeline(lsa, clf)
pipe.fit(twenty_train.data, twenty_train.target)
pipe.score(twenty_test.data, twenty_test.target)

doc = twenty_test.data[0]
print_prediction(doc)

te = TextExplainer(random_state=42)
te.fit(doc, pipe.predict_proba)
#print(te.explain_prediction(target_names=twenty_train.target_names))
#print(eli5.format_as_image(te.explain_weights(target_names=twenty_train.target_names)))

show_html = lambda html: display(HTML(html))
show_html_expl = lambda expl, **kwargs: show_html(
    format_as_html(expl, include_styles=False, **kwargs))
show_html(format_html_styles())

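# NB: `vec` and `train` below are defined elsewhere in the original notebook;
# they are not part of this excerpt.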
weights = eli5.show_weights(clf,
                            vec=vec,
                            target_names=train['target_names'],
                            horizontal_layout=False)

pred = show_html_expl(explain_prediction(clf,
                                         test['data'][2],
                                         vec,
                                         target_names=train['target_names']),
                      force_weights=False,
                      horizontal_layout=True)

with open('weights.htm', 'wb') as f:
    # persist the weights HTML; show_weights returns an IPython HTML object
    # whose .data attribute holds the markup
    f.write(weights.data.encode('utf-8'))