예제 #1
0
def eli5visual(pData, pDesc, Idx, pAccountName, pVec, nTopKeywrd, pRootDir):
    """Build and save an eli5 ``TextExplainer`` HTML explanation per row id.

    For every entry of ``Idx`` that is a valid row position of ``pData``, the
    model for that row's intent is loaded via ``loadmodel``, chained behind
    ``pVec`` in a pipeline, explained with ``TextExplainer`` and the rendered
    HTML is passed to ``savehtml``. Invalid ids are reported on stdout and
    skipped; the remaining ids are still processed.

    Args:
        pData: Table-like object indexed as ``pData[column][row]``; it must
            provide an ``'Intent'`` column and the ``pDesc`` column.
        pDesc: Name of the column that holds the text to explain.
        Idx: Sequence of row positions to explain.
        pAccountName: Account name forwarded to ``loadmodel``.
        pVec: Vectorizer placed in front of the loaded model.
        nTopKeywrd: Unused here; kept so existing callers keep working.
        pRootDir: Root directory forwarded to ``loadmodel`` and ``savehtml``.

    Returns:
        ``0`` when the loop completed, ``-1`` if any exception was raised
        (the error and traceback are printed).
    """
    try:
        for pId in Idx:
            # Valid positions are 0 .. len(pData) - 1. The former ``<=`` test
            # accepted len(pData) itself (and negative ids), which then blew
            # up in the lookup below and aborted every remaining id.
            if not 0 <= pId < len(pData):
                print("Please select valid Id")
                continue

            pRow = int(pId)
            pIntent = pData['Intent'][pRow]
            _, pModels = loadmodel(pRootDir, pAccountName, pIntent)
            pPipeModel = make_pipeline(pVec, pModels)
            pTe = TextExplainer(random_state=42).fit(
                pData[pDesc][pRow], pPipeModel.predict_proba)
            pExplanation = pTe.explain_prediction()
            pHtml = format_as_html(pExplanation,
                                   force_weights=False,
                                   include_styles=False,
                                   horizontal_layout=True,
                                   show_feature_values=False)
            # The caller-supplied id (not the int-cast one) names the output.
            savehtml(pRootDir, pHtml, pId, pIntent)

    except Exception as e:
        print(
            '*** ERROR[003]: Error in visualiation file of eil5visual function: ',
            sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return (-1)
    return (0)
예제 #2
0
def test_lime_explain_probabilistic(newsgroups_train):
    """LIME should approximate a NaiveBayes text pipeline on one document.

    Fits a hashing-vectorizer + MultinomialNB pipeline on the fixture data,
    explains the first document with ``TextExplainer`` and checks both the
    fit-quality metrics and that the word 'file' shows up in the text report.
    """
    docs, y, target_names = newsgroups_train

    # ``alternate_sign`` is rejected with TypeError by sklearn < 0.19, where
    # the equivalent switch is ``non_negative``.
    try:
        vectorizer = HashingVectorizer(alternate_sign=False)
    except TypeError:
        vectorizer = HashingVectorizer(non_negative=True)
    classifier = MultinomialNB()

    features = vectorizer.fit_transform(docs)
    classifier.fit(features, y)
    print(classifier.score(features, y))

    pipeline = make_pipeline(vectorizer, classifier)
    first_doc = docs[0]

    explainer = TextExplainer(random_state=42)
    explainer.fit(first_doc, pipeline.predict_proba)

    metrics = explainer.metrics_
    print(metrics)
    assert metrics['score'] > 0.7
    assert metrics['mean_KL_divergence'] < 0.1

    explanation = explainer.explain_prediction(
        top=20, target_names=target_names)
    report = format_as_text(explanation)
    print(report)
    assert 'file' in report
예제 #3
0
    def _lime_analyze(self,
                      query,
                      indicies,
                      max_len,
                      max_replace,
                      top_targets=None):
        """Return one LIME explanation of ``query`` per entry of ``indicies``.

        The query is preprocessed once; for each index a prediction function
        is requested from ``ExplainerGenerator`` and a position-dependent
        ``TextExplainer`` (sharing one ``MaskingTextSampler``) is fitted on
        the space-joined tokens.

        Args:
            query: Raw input passed to ``self.preprocess``.
            indicies: Iterable of indices handed to
                ``ExplainerGenerator.get_predict_function``.
            max_len: Forwarded to ``ExplainerGenerator``.
            max_replace: Forwarded to ``MaskingTextSampler``.
            top_targets: Forwarded to ``explain_prediction``.

        Returns:
            List of explanations, in the order of ``indicies``.
        """
        net = self.model
        token_to_idx = self.vocab.word_to_idx
        label_to_idx = self.label.word_to_idx
        tokens = self.preprocess(query)

        predict_factory = ExplainerGenerator(net, token_to_idx, max_len)

        masker = MaskingTextSampler(replacement=UNK,
                                    max_replace=max_replace,
                                    token_pattern=None,
                                    bow=False)

        def explain_at(position):
            # Build and fit a fresh explainer for a single index.
            predict_fn = predict_factory.get_predict_function(position)
            explainer = TextExplainer(
                sampler=masker,
                position_dependent=True,
                random_state=RANDOM_SEED,
            )
            explainer.fit(' '.join(tokens), predict_fn)
            # The first three label entries are dropped from the display
            # names (presumably reserved/special labels -- verify).
            return explainer.explain_prediction(
                target_names=list(label_to_idx)[3:],
                top_targets=top_targets)

        return [explain_at(position) for position in indicies]
예제 #4
0
def test_text_explainer_position_dependent():
    """Check which TextExplainer setups can model a position-sensitive rule.

    The black-box only reacts to 'bar' as the second token, so a plain
    bag-of-words explainer must fit poorly while ``position_dependent=True``
    or a bigram vectorizer must fit well.
    """
    text = "foo bar baz egg spam bar baz egg spam ham"

    @_apply_to_list
    def predict_proba(doc):
        words = doc.split()
        # Only a 'bar' in second position matters; the later 'bar' does not.
        if len(words) >= 2 and words[1] == 'bar':
            return [0, 1]
        return [1, 0]

    # Unigram bag of words cannot express the positional rule.
    bow_explainer = TextExplainer(random_state=42, vec=CountVectorizer())
    bow_explainer.fit(text, predict_proba)
    print(bow_explainer.metrics_)
    assert bow_explainer.metrics_['score'] < 0.9
    assert bow_explainer.metrics_['mean_KL_divergence'] > 0.3

    # Position-dependent features are enough to capture it.
    positional = TextExplainer(position_dependent=True, random_state=42)
    positional.fit(text, predict_proba)
    print(positional.metrics_)
    assert positional.metrics_['score'] > 0.95
    assert positional.metrics_['mean_KL_divergence'] < 0.3

    positional_expl = positional.explain_prediction()
    format_as_all(positional_expl, positional.clf_)

    # A custom vectorizer with bigrams gets almost as far.
    bigram_vec = CountVectorizer(ngram_range=(1, 2))
    bigram_explainer = TextExplainer(vec=bigram_vec, random_state=42)
    bigram_explainer.fit(text, predict_proba)
    print(bigram_explainer.metrics_)
    assert bigram_explainer.metrics_['score'] > 0.95
    assert bigram_explainer.metrics_['mean_KL_divergence'] < 0.3

    bigram_expl = bigram_explainer.explain_prediction()
    format_as_all(bigram_expl, bigram_explainer.clf_)

    # Combining position_dependent=True with a custom vectorizer is rejected.
    with pytest.raises(ValueError):
        TextExplainer(position_dependent=True, vec=HashingVectorizer())
예제 #5
0
def test_text_explainer_token_pattern():
    """A custom ``token_pattern`` keeps hyphenated words as single features."""
    sample = "foo-bar baz egg-spam"
    black_box = substring_presence_predict_proba('bar')

    # Pattern that treats '-' as a word character, unlike the default one.
    explainer = TextExplainer(token_pattern=r'(?u)\b[-\w]+\b')
    explainer.fit(sample, black_box)

    metrics = explainer.metrics_
    print(metrics)
    assert metrics['score'] > 0.95
    assert metrics['mean_KL_divergence'] < 0.1

    explanation = explainer.explain_prediction()
    format_as_all(explanation, explainer.clf_)

    strongest = explanation.targets[0].feature_weights.pos[0]
    assert strongest.feature == 'foo-bar'
예제 #6
0
파일: main.py 프로젝트: mapmeld/crud-ml
def predict(model_id):
    """Predict for the JSON request body and explain each prediction.

    If ``model/<int(model_id)>.pkl`` does not exist, prints a hint and returns
    the plain string ``'no model here'``. Otherwise the classifier is taken
    from the in-memory ``clfs`` cache (or loaded from disk) and applied to
    ``request.json``:

    * text models: the classifier is chained behind ``vectorizer`` and every
      post's ``'text'`` is explained with a LIME ``TextExplainer``;
    * other models: the rows are one-hot encoded, aligned to
      ``model_columns`` and explained with ``eli5.explain_prediction``.

    Returns:
        A JSON response with ``predictions`` (each converted to ``str``) and
        ``explanations`` (one ``{'pos': [...], 'neg': [...]}`` per input), or a
        JSON response with ``error`` and ``trace`` if anything raised.
    """
    if not os.path.exists("model/" + str(int(model_id)) + ".pkl"):
        print('train first')
        return 'no model here'

    try:
        cache_key = str(model_id)
        if cache_key in clfs:
            clf = clfs[cache_key]
        else:
            # NOTE(security): joblib.load unpickles the file and can execute
            # arbitrary code; only load model files this service produced.
            clf = joblib.load(model_file_name(model_id))

        explainers = []
        if is_text_type(model_id):
            pipe = make_pipeline(vectorizer, clf)
            prediction = pipe.predict(request.json)

            for post in request.json:
                te = TextExplainer(random_state=42, n_samples=500)
                te.fit(post['text'], pipe.predict_proba)
                made = te.explain_prediction(target_names=['pos', 'neg'])
                explainers.append(
                    _feature_weights_to_lists(made.targets[0].feature_weights))
        else:
            query = pd.get_dummies(pd.DataFrame(request.json))
            # Align with the training columns; unseen dummies become 0.
            query = query.reindex(columns=model_columns, fill_value=0)
            prediction = clf.predict(query)
            for _, row in query.iterrows():
                explained = eli5.explain_prediction(clf, row)
                explainers.append(
                    _feature_weights_to_lists(
                        explained.targets[0].feature_weights))

        # Predictions are stringified so that they are JSON-serializable.
        return jsonify({
            "predictions": list(map(str, prediction)),
            "explanations": explainers
        })

    except Exception as e:
        return jsonify({'error': str(e), 'trace': traceback.format_exc()})


def _feature_weights_to_lists(feature_weights):
    """Turn eli5 feature weights into ``[feature, weight]`` pairs per sign."""
    return {
        'pos': [[fw.feature, fw.weight] for fw in feature_weights.pos],
        'neg': [[fw.feature, fw.weight] for fw in feature_weights.neg],
    }
예제 #7
0
def test_text_explainer_char_based(token_pattern):
    """A char-based explainer should single out the substring 'lo'."""
    sample = "Hello, world!"
    black_box = substring_presence_predict_proba('lo')

    explainer = TextExplainer(char_based=True, token_pattern=token_pattern)
    explainer.fit(sample, black_box)

    metrics = explainer.metrics_
    print(metrics)
    assert metrics['score'] > 0.95
    assert metrics['mean_KL_divergence'] < 0.1

    prediction_expl = explainer.explain_prediction()
    format_as_all(prediction_expl, explainer.clf_)
    check_targets_scores(prediction_expl)
    top_feature = prediction_expl.targets[0].feature_weights.pos[0]
    assert top_feature.feature == 'lo'

    # The weights view is an alternative readout (of limited use for char
    # n-grams); it must agree on the top feature.
    weights_expl = explainer.explain_weights()
    top_feature = weights_expl.targets[0].feature_weights.pos[0]
    assert top_feature.feature == 'lo'
예제 #8
0
def test_text_explainer_custom_classifier():
    """TextExplainer accepts a custom white-box classifier (a shallow tree)."""
    sample = "foo-bar baz egg-spam"
    black_box = substring_presence_predict_proba('bar')

    # Explain with a depth-2 decision tree instead of the default model.
    tree_model = DecisionTreeClassifier(max_depth=2)
    explainer = TextExplainer(clf=tree_model)
    explainer.fit(sample, black_box)

    metrics = explainer.metrics_
    print(metrics)
    assert metrics['score'] > 0.99
    assert metrics['mean_KL_divergence'] < 0.01

    prediction_expl = explainer.explain_prediction()
    format_as_all(prediction_expl, explainer.clf_)

    # explain_weights exposes the fitted tree; its root must split on "bar".
    weights_expl = explainer.explain_weights()
    root = weights_expl.decision_tree.tree
    print(root)
    assert root.feature_name == "bar"
    format_as_all(weights_expl, explainer.clf_)
예제 #9
0
def test_lime_flat_neighbourhood(newsgroups_train):
    """With ``expand_factor=None`` the explanation must still name 'file'.

    The black-box only ever returns one of two fixed distributions, chosen by
    whether the document contains 'file'.
    """
    docs, y, target_names = newsgroups_train
    first_doc = docs[0]

    @_apply_to_list
    def predict_proba(doc):
        """ Fixed probabilities; only 3 of the 4 labels are ever non-zero """
        if 'file' in doc:
            return [0, 1.0, 0, 0]
        return [0.9, 0, 0.1, 0]

    explainer = TextExplainer(expand_factor=None, random_state=42)
    explainer.fit(first_doc, predict_proba)
    print(explainer.metrics_)
    print(explainer.clf_.classes_, target_names)

    explanation = explainer.explain_prediction(
        top=20, target_names=target_names)
    for rendered in format_as_all(explanation, explainer.clf_):
        assert 'file' in rendered
        assert "comp.graphics" in rendered
예제 #10
0
파일: explain.py 프로젝트: mabu-dev/gobbli
def st_lime_explanation(
    text: str,
    predict_func: Callable[[List[str]], np.ndarray],
    unique_labels: List[str],
    n_samples: int,
    position_dependent: bool = True,
):
    """Fit a LIME explainer on ``text`` and render the result in Streamlit.

    Shows the explainer's fit metrics as JSON, then, for each target in
    descending probability order, a subheader and a dataframe of features
    sorted by absolute contribution (``weight * value``).

    Args:
        text: Document to explain.
        predict_func: Black-box function passed to ``TextExplainer.fit``.
        unique_labels: Label names, indexed by the loop position below.
        n_samples: Number of samples the explainer generates.
        position_dependent: Forwarded to ``TextExplainer``.
    """
    # TODO just use ELI5's built-in visualization when streamlit supports it:
    # https://github.com/streamlit/streamlit/issues/779
    with st.spinner("Generating LIME explanations..."):
        explainer = TextExplainer(
            random_state=1,
            n_samples=n_samples,
            position_dependent=position_dependent,
        )
        explainer.fit(text, predict_func)
    st.json(explainer.metrics_)

    explanation = explainer.explain_prediction()
    all_rows = eli5.format_as_dataframe(explanation)
    ranked_targets = sorted(explanation.targets, key=lambda t: -t.proba)

    # NOTE(review): ``rank`` is the position after sorting by probability, yet
    # it is also used as the "target" value and as the label index -- confirm
    # this matches the order in which eli5 numbers its targets.
    for rank, target in enumerate(ranked_targets):
        rows = all_rows[all_rows["target"] == rank].copy()

        rows["contribution"] = rows["weight"] * rows["value"]
        rows["abs_contribution"] = abs(rows["contribution"])

        rows = rows.drop("target", axis=1)
        rows = rows.sort_values(by="abs_contribution", ascending=False)
        rows = rows.reset_index(drop=True)

        heading = f"Target: {unique_labels[rank]} (probability {target.proba:.4f}, score {target.score:.4f})"
        st.subheader(heading)
        st.dataframe(rows)
예제 #11
0
    # Fragment: for every opcode dump in ``opcodes_dir``, fit a TextExplainer
    # on its contents and write the HTML explanation to ``output_dir``.
    # ``bin_name``, ``ops_sampler`` and ``ss`` come from outside this fragment.
    # opcodes_dir = '/home/hwangdz/coreutils/coreutils-8.28/install_m32/bin/md5funcs_ops'
    opcodes_dir = '/home/hwangdz/git/rl-select-div/only-similarity/explanation/%s_ops_info' % bin_name
    output_dir = 'explanation/%s_html' % bin_name
    # os.mkdir creates only the last path component, so the parent
    # 'explanation' directory must already exist.
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for file_name in os.listdir(opcodes_dir):
        # if file_name != 'dump.s':
        #    continue
        # 'op_distribution' is skipped (presumably a summary file rather than
        # an opcode dump -- verify).
        if file_name == 'op_distribution':
            continue
        file_path = os.path.join(opcodes_dir, file_name)
        with open(file_path, 'r') as f:
            op_codes = f.read()
            # Ignore files with fewer than 20 characters of content.
            if len(op_codes) < 20:
                continue
            # Count whitespace-separated tokens before flattening newlines.
            num_ops = len(op_codes.split())
            op_codes = op_codes.replace('\n', ' ')
            opcode_explainer = TextExplainer(random_state=59, sampler=ops_sampler, n_samples=5000)
            #repeat_times = (len(op_codes.split()) / 100) ** 2
            # The size-dependent repeat count above is disabled; fit once.
            repeat_times = 1
            for _ in range(repeat_times):
                opcode_explainer.fit(op_codes, ss.predict_proba)
            explanation = opcode_explainer.explain_prediction()._repr_html_()
            # Same location as ``output_dir``, spelled out again as a literal.
            with open('explanation/%s_html/explanation-%s.html' % (bin_name, file_name), 'w') as ef:
                # HTML explanation first, then the token count and the
                # flattened opcode text appended after it.
                ef.write(explanation)
                ef.write('num of opcodes: %d\n' % num_ops)
                ef.write('</br>\n')
                ef.write(op_codes)