예제 #1
0
    def explain(self,
                doc,
                truncate_len=512,
                all_targets=False,
                n_samples=2500):
        """
        Highlights text to explain prediction.

        Fits an eli5 ``TextExplainer`` on ``doc`` using ``self.predict_proba``
        and returns the rendered explanation.

        Args:
            doc (str): text of document
            truncate_len(int): truncate document to this many words
            all_targets(bool):  If True, show visualization for
                                each target.
            n_samples(int): number of samples to generate and train on.
                            Larger values give better results, but will take more time.
                            Lower this value if explain is taking too long.
        Returns:
            The result of ``TextExplainer.show_prediction``, or ``None``
            (after emitting a warning) when the input is a sentence pair,
            ``self.c`` is falsy, or the required eli5 fork is missing or
            carries a different ``KTRAIN_ELI5_TAG``.
        Raises:
            TypeError: if ``doc`` passes the checks above but is not a ``str``.
        """
        # Only the sentence-pair flag is needed here.
        _, is_pair = detect_text_format(doc)
        if is_pair:
            warnings.warn(
                "currently_unsupported: explain does not currently support sentence pair classification"
            )
            return None
        if not self.c:
            # A falsy ``self.c`` is reported as text regression (see message).
            warnings.warn(
                "currently_unsupported: explain does not support text regression"
            )
            return None
        try:
            import eli5
            from eli5.lime import TextExplainer
        except ImportError:
            # eli5 is an optional dependency: degrade to a warning instead of
            # failing, but do not swallow unrelated errors or interrupts.
            msg = (
                "ktrain requires a forked version of eli5 to support tf.keras. "
                +
                "Install with: pip install https://github.com/amaiya/eli5/archive/refs/heads/tfkeras_0_10_1.zip"
            )
            warnings.warn(msg)
            return None
        # The fork identifies itself through a tag attribute; a stock or stale
        # eli5 either lacks it or carries a different value.
        if getattr(eli5, "KTRAIN_ELI5_TAG", None) != KTRAIN_ELI5_TAG or not hasattr(
                eli5, "KTRAIN_ELI5_TAG"):
            msg = (
                "ktrain requires a forked version of eli5 to support tf.keras. It is either missing or not up-to-date. "
                +
                "Uninstall the current version and install/re-install the fork with: pip install https://github.com/amaiya/eli5/archive/refs/heads/tfkeras_0_10_1.zip"
            )
            warnings.warn(msg)
            return None

        if not isinstance(doc, str):
            raise TypeError("text must of type str")
        # When a single target is shown, restrict the explanation to the
        # predicted class; ``None`` lets eli5 show every target.
        prediction = None if all_targets else [self.predict(doc)]

        if self.preproc.is_nospace_lang():
            # Pre-process languages written without spaces so that the
            # whitespace split below yields usable tokens.
            doc = self.preproc.process_chinese([doc])[0]
        doc = " ".join(doc.split()[:truncate_len])
        te = TextExplainer(random_state=42, n_samples=n_samples)
        te.fit(doc, self.predict_proba)
        return te.show_prediction(target_names=self.preproc.get_classes(),
                                  targets=prediction)
예제 #2
0
    def explain(self, doc, truncate_len=512, all_targets=False):
        """
        Highlights text to explain prediction.

        Fits an eli5 ``TextExplainer`` on ``doc`` using ``self.predict_proba``
        and returns the rendered explanation.

        Args:
            doc (str): text of document
            truncate_len(int): truncate document to this many words
            all_targets(bool):  If True, show visualization for
                                each target.
        Returns:
            The result of ``TextExplainer.show_prediction``, or ``None``
            (after emitting a warning) when eli5 cannot be imported.
        Raises:
            TypeError: if ``doc`` is not a ``str``.
        """
        try:
            import eli5
            from eli5.lime import TextExplainer
        except ImportError:
            # eli5 is an optional dependency: degrade to a warning instead of
            # failing, but do not swallow unrelated errors or interrupts.
            msg = 'ktrain requires a forked version of eli5 to support tf.keras. '+\
                  'Install with: pip3 install git+https://github.com/amaiya/eli5@tfkeras_0_10_1'
            warnings.warn(msg)
            return None

        # Validate the input before running the model on it, so that a bad
        # argument fails fast with a clear error.
        if not isinstance(doc, str):
            raise TypeError('text must of type str')

        # When a single target is shown, restrict the explanation to the
        # predicted class; ``None`` lets eli5 show every target.
        prediction = None if all_targets else [self.predict(doc)]

        if self.preproc.is_nospace_lang():
            # Pre-process languages written without spaces so that the
            # whitespace split below yields usable tokens.
            doc = self.preproc.process_chinese([doc])[0]
        doc = ' '.join(doc.split()[:truncate_len])
        te = TextExplainer(random_state=42)
        te.fit(doc, self.predict_proba)
        return te.show_prediction(target_names=self.preproc.get_classes(),
                                  targets=prediction)
def limeTextClassification(
    dataset, data, pr=None
):  # example retrieved from https://eli5.readthedocs.io/en/latest/tutorials/black-box-text-classifiers.html#textexplainer
    """Fit an eli5 ``TextExplainer`` against a ``TextClassifier`` predictor.

    Args:
        dataset: passed to ``Predictor(dataset=...)``; ``dataset["text"]`` is
            the document handed to ``TextExplainer.fit``.
        data: unused; kept so existing callers keep working.
        pr: ignored; kept so existing callers keep working. A fresh
            ``Predictor`` is always built from ``dataset``. The default is
            ``None`` so that no predictor is constructed at definition time.

    Returns:
        tuple: ``(te, class_names)`` where ``te`` is the fitted explainer and
        ``class_names`` is ``pr._classes_000.tolist()``.
    """
    pr = Predictor(dataset=dataset, callingFunction="TextClassifier")

    te = TextExplainer(random_state=42)
    # Fit once: repeating the identical fit only adds cost.
    te.fit(dataset["text"], pr.predict_proba)

    # Read the class names after fitting, as the predictor has been exercised
    # by then.
    class_names = pr._classes_000.tolist()
    te.show_prediction(target_names=class_names)

    return te, class_names
    '''
예제 #4
0
def highlight_text(text):
    """Predict ``text`` and attach an eli5 highlight to the prediction dict.

    Args:
        text: document passed to ``predict`` and to ``TextExplainer.fit``.

    Returns:
        The dict returned by ``predict(text)`` with an added ``"highlight"``
        key: the result of ``TextExplainer.show_prediction``, or ``None`` when
        building the explanation failed.
    """
    predict_dict = predict(text)

    try:
        te = TextExplainer(random_state=42, n_samples=1000)
        te.fit(text, nn_model.predict_proba)
        predict_dict["highlight"] = te.show_prediction(
            target_names=list(CLASSES.values()),
            top_targets=3,
            top=200)
    except Exception:
        # The highlight is best-effort: the prediction is still returned when
        # the explanation fails. ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        predict_dict["highlight"] = None

    return predict_dict
예제 #5
0
    def explain(self,
                doc,
                truncate_len=512,
                all_targets=False,
                n_samples=2500):
        """
        Highlights text to explain prediction.

        Fits an eli5 ``TextExplainer`` on ``doc`` using ``self.predict_proba``
        and returns the rendered explanation.

        Args:
            doc (str): text of document
            truncate_len(int): truncate document to this many words
            all_targets(bool):  If True, show visualization for
                                each target.
            n_samples(int): number of samples to generate and train on.
                            Larger values give better results, but will take more time.
                            Lower this value if explain is taking too long.
        Returns:
            The result of ``TextExplainer.show_prediction``, or ``None``
            (after emitting a warning) when the input is a sentence pair,
            ``self.c`` is falsy, or eli5 cannot be imported.
        Raises:
            TypeError: if ``doc`` passes the checks above but is not a ``str``.
        """
        # Only the sentence-pair flag is needed here.
        _, is_pair = detect_text_format(doc)
        if is_pair:
            warnings.warn(
                'currently_unsupported: explain does not currently support sentence pair classification'
            )
            return None
        if not self.c:
            # A falsy ``self.c`` is reported as text regression (see message).
            warnings.warn(
                'currently_unsupported: explain does not support text regression'
            )
            return None
        try:
            import eli5
            from eli5.lime import TextExplainer
        except ImportError:
            # eli5 is an optional dependency: degrade to a warning instead of
            # failing, but do not swallow unrelated errors or interrupts.
            msg = 'ktrain requires a forked version of eli5 to support tf.keras. '+\
                  'Install with: pip3 install git+https://github.com/amaiya/eli5@tfkeras_0_10_1'
            warnings.warn(msg)
            return None

        # Validate the input before running the model on it, so that a bad
        # argument fails fast with a clear error.
        if not isinstance(doc, str):
            raise TypeError('text must of type str')

        # When a single target is shown, restrict the explanation to the
        # predicted class; ``None`` lets eli5 show every target.
        prediction = None if all_targets else [self.predict(doc)]

        if self.preproc.is_nospace_lang():
            # Pre-process languages written without spaces so that the
            # whitespace split below yields usable tokens.
            doc = self.preproc.process_chinese([doc])[0]
        doc = ' '.join(doc.split()[:truncate_len])
        te = TextExplainer(random_state=42, n_samples=n_samples)
        te.fit(doc, self.predict_proba)
        return te.show_prediction(target_names=self.preproc.get_classes(),
                                  targets=prediction)
예제 #6
0
def test_text_explainer_show_methods():
    """Both ``show_*`` helpers return IPython ``HTML`` that mentions 'lo'.

    The test is skipped when IPython is not installed.
    """
    pytest.importorskip('IPython')
    from IPython.display import HTML

    @_apply_to_list
    def predict_proba(doc):
        # Toy classifier: the second class wins whenever 'lo' is present.
        if 'lo' in doc:
            return [0.0, 1.0]
        return [1.0, 0.0]

    explainer = TextExplainer()
    explainer.fit("Hello, world!", predict_proba)

    # Same order as before: prediction view first, then the weights view.
    for render in (explainer.show_prediction, explainer.show_weights):
        rendered = render()
        assert isinstance(rendered, HTML)
        assert 'lo' in rendered.data
예제 #7
0
# Segment a sample Chinese review with jieba (cut_all=False) and materialize
# the result as a list so the tokens can be inspected.
seg_list = jieba.cut("看了快一半了才发现是mini的广告", cut_all=False)
list(seg_list)

# ### Example 1

# In[10]:

# Score the space-separated, pre-segmented review with the model wrapper
# `get_proba` (defined outside this excerpt).
get_proba(["看 了 快 一半 了 才 发现 是 mini 的 广告"])

# In[11]:

from eli5.lime import TextExplainer

# Word-level explanation: the review is segmented with jieba and re-joined with
# spaces before being handed to the explainer.
te = TextExplainer(random_state=42, n_samples=5000)
te.fit(" ".join(jieba.cut("看了快一半了才发现是mini的广告", cut_all=False)), get_proba)
te.show_prediction(target_names=["neg", "pos"])

# In[12]:

# Inspect the fitted explainer's metrics.
te.metrics_

# In[13]:

# Look at the first ten generated samples.
te.samples_[:10]

# #### Character-based Whitebox

# In[14]:

# Character-level explanation: the raw, unsegmented text is passed directly and
# `char_based=True` is set on the explainer.
te = TextExplainer(random_state=42, n_samples=5000, char_based=True)
te.fit("看了快一半了才发现是mini的广告", get_proba)
예제 #8
0
# -*- coding: utf-8 -*-
"""
__title__ = 'eli5'
__author__ = 'JieYuan'
__mtime__ = '2018/8/21'
"""
from eli5.lime import TextExplainer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Two toy documents with one label each, just enough to train the pipeline.
X = [
    "The dimension of the input documents is reduced to 100, and then a kernel SVM is used to classify the documents.",
    "This is what the pipeline returns for a document - it is pretty sure the first message in test data belongs to sci.med:"
]

y = [0, 1]

piplie = make_pipeline(TfidfVectorizer(), LogisticRegression())
# The pipeline has to be fitted before its predict_proba can serve as the
# black-box classifier; an unfitted pipeline fails on the first call.
piplie.fit(X, y)

# Explain the prediction for the first document.
te = TextExplainer(random_state=42)
te.fit(X[0], piplie.predict_proba)
te.show_prediction()
te.show_weights()

# A trailing bare `eli5.show_prediction` expression was removed here: `eli5`
# itself is never imported in this script, so the line raised NameError and
# had no effect otherwise.
            for word in words:
                index = 0
                for word_block in word:
                    if len(average_word_vector) == index:
                        average_word_vector.append(0)
                    average_word_vector[index] += float(word_block)
                    index += 1
            index = 0
            for word_block in average_word_vector:
                average_word_vector[index] /= float(len(words))
                index += 1
            xout.append(average_word_vector)
        return np.array(xout)


# One vectorizer instance (class `V`, defined outside this excerpt) is shared by
# every pipeline built below.
vectorizer = V()

# For each classifier class: train it behind the vectorizer on the first
# `testcutoff` rows, score the remaining rows, then explain one fixed sentence.
for classifier in classifiers:
    print(classifier)
    gnb = classifier()

    pipe = make_pipeline(vectorizer, gnb)
    pipe.fit(x[:testcutoff], y[:testcutoff])

    # Class probabilities for the held-out rows; only referenced by the
    # commented-out report below.
    y_predicted = pipe.predict_proba(x[testcutoff:])
    #print(classification_report(y[testcutoff:], y_predicted, target_names=['known weird', 'less weird']))

    # Explain the pipeline's prediction for a fixed example sentence.
    te = TextExplainer(random_state=101, n_samples=500)
    te.fit('Green new deal is the best bro, bring it on', pipe.predict_proba)
    te.show_prediction(target_names=['known weird', 'less weird'])
예제 #10
0
# Hold out 10% of the data, stratified on the labels.
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.1, random_state = 40, stratify=Y)



text_model.fit(x_train, y_train)



text_model.score(x_test, y_test)



from IPython.display import display, HTML

import pickle

import eli5
from eli5.lime import TextExplainer

# Explain a window of 20 held-out examples.
# NOTE(review): `idx` is an index label from `x_test.index` but is also used
# positionally with `df_corpus_final.iloc`; this assumes the frame has a
# default 0..n-1 index -- confirm.
for idx in x_test.index[190:210]:
  te = TextExplainer(random_state=42)
  te.fit(cleaner(x_test[idx]), text_model.predict_proba)
  print("Real Class:",  ["Non Toxic" if x == 0 else "Toxic" for x in [df_corpus_final.iloc[idx]['class']]])
  print("Text uncleaned tweet:", df_corpus_final.iloc[idx]['tweet'])
  print("ELI5 Predicted Class:")
  # show_prediction already returns a displayable object; display() returns
  # None, so wrapping it in HTML(...) only built an empty, discarded object.
  display(te.show_prediction(target_names=['Non Toxic', 'Toxic']))

# Persist the trained model once, after the loop, instead of rewriting the
# same file on every iteration; `with` closes the file handle.
with open('toxic.pickle', 'wb') as model_file:
  pickle.dump(text_model, model_file)
예제 #11
0
# Fit the explainer `exp` (created outside this excerpt) on the first test
# document, using the pipeline's predict_proba as the black-box classifier.
exp.fit(X_test.values[0], pipe.predict_proba)

# In[378]:

# List the distinct label values.
ylabels.unique()

# In[ ]:

# In[379]:

# Human-readable class names passed to show_prediction below.
target_names = ['Negative', 'Positive']

# In[380]:

# Explanation without explicit class names.
exp.show_prediction()

# In[381]:

# Same explanation, labelled with the class names above.
exp.show_prediction(target_names=target_names)

# In[382]:

exp.metrics_

# - 'score' is an accuracy score weighted by the cosine distance between each generated sample and the original document (i.e. texts that are closer to the example are more important). Accuracy shows how good the 'top 1' predictions are.
# - 'mean_KL_divergence' is the mean Kullback–Leibler divergence over all target classes; it is also weighted by distance. KL divergence shows how well the probabilities are approximated; 0.0 means a perfect match.

# In[46]:

exp.show_weights()
예제 #12
0
__author__ = 'xead'
# coding: utf-8

from sentiment_classifier import SentimentClassifier
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from eli5.lime import TextExplainer

#clf = SentimentClassifier()

#pred = clf.get_prediction_message("Хороший телефон")
text = 'Хороший был у меня телефон 5 лет назад'

# Load the persisted pipeline and explain its prediction for `text`.
# NOTE: joblib.load unpickles the file; only load pipelines from trusted sources.
pipe = joblib.load("./pipe6.pkl")
te = TextExplainer(random_state=42)
te.fit(text, pipe.predict_proba)
res = te.show_prediction(target_names=['negative', 'positive'], top=25)

print(res)
예제 #13
0
 def get_result(self, text):
     """Explain the prediction of ``self.pipe`` for ``text``.

     Fits a ``TextExplainer`` (seeded with 42) on ``text`` using
     ``self.pipe.predict_proba`` and returns the result of
     ``show_prediction`` with the negative/positive class names and ``top=25``.
     """
     explainer = TextExplainer(random_state=42)
     explainer.fit(text, self.pipe.predict_proba)
     return explainer.show_prediction(
         target_names=['negative', 'positive'],
         top=25,
     )
     
예제 #14
0
# And now let's use ELI5 to see how the model makes predictions!

# In[ ]:

import eli5
from eli5.lime import TextExplainer

# One explainer instance, seeded for reproducibility; it is re-fitted for each
# example in the cells that follow.
te = TextExplainer(random_state=42)


def model_predict(x):
    """Return ``logreg`` class probabilities for the raw texts in ``x``.

    The texts are first transformed with the module-level ``vectorizer``.
    """
    features = vectorizer.transform(x)
    return logreg.predict_proba(features)


# Explain the validation comment at position 2 (the one-element slice [2:3]
# followed by [0] picks that single text).
te.fit(valid_df['comment_text'].values[2:3][0], model_predict)
te.show_prediction()

# In[ ]:

# Same for the validation comment at position 12.
te.fit(valid_df['comment_text'].values[12:13][0], model_predict)
te.show_prediction()

# In[ ]:

# Score the test comments; column 1 of predict_proba is written out as the
# submission's prediction.
test_vectorized = vectorizer.transform(test['comment_text'].values)
sub['prediction'] = logreg.predict_proba(test_vectorized)[:, 1]
sub.to_csv('submission.csv', index=False)
# Drop the model and the vectorized matrices (some created outside this
# excerpt) now that the submission has been written.
del logreg, vectorizer, test_vectorized, train_vectorized, valid_vectorized

# ## Selecting number of words and sequence length
#