Example #1
    def __init__(self):
        # keep only the 20,000 most frequent words of the IMDB dataset
        (self.x_train, _), (_, _) = imdb.load_data(num_words=20000)

        # pad / truncate every review to 80 tokens, matching the pretrained model
        self.x_train = sequence.pad_sequences(self.x_train, maxlen=80)

        # TF1-style session / graph bookkeeping so the model can be reused across threads
        self.session = tf.Session()
        self.graph = tf.get_default_graph()
        set_session(self.session)
        self.model = load_model('models/pretrained/shap_imdb.h5')
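Examples #1 and #2 read like two methods of the same class; a minimal set of imports they rely on (module paths inferred from the names used, not shown in the source) might be:

# Assumed imports for the two snippets above (standalone Keras 2 on a TF1 backend).
from io import BytesIO

import numpy as np
import matplotlib.pyplot as plt
import shap
import tensorflow as tf

from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence
from keras.models import load_model
# set_session lives in keras.backend.tensorflow_backend in some Keras 2.x releases
from keras.backend import set_session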
Example #2
    def plot(self, review):
        # split the raw review text into lowercase word tokens
        result = text_to_word_sequence(review)

        words = imdb.get_word_index()

        # drop words missing from the IMDB index or beyond the 20,000-word cutoff
        result = list(filter(lambda x: x in words and int(words.get(x)) <= 20000, result))

        # map each remaining word to its index and pad to the length used in training
        preprocess = np.array([list(map(lambda x: int(words.get(x)), result))])
        preprocess = sequence.pad_sequences(preprocess, maxlen=80)
        # we use the first 100 training examples as our background dataset to integrate over

        with self.graph.as_default():
            set_session(self.session)
            explainer = shap.DeepExplainer(self.model, self.x_train[:100])

        # explain the first 10 predictions
        # explaining each prediction requires 2 * background dataset size runs
        with self.graph.as_default():
            set_session(self.session)
            shap_values = explainer.shap_values(preprocess)
        # init the JS visualization code
        shap.initjs()

        # transform the indexes to words
        words = imdb.get_word_index()
        num2word = {}
        for w in words.keys():
            num2word[words[w]] = w

        x_test_words = np.stack([np.array(list(map(lambda x: num2word.get(x, "NONE"), preprocess[0])))])

        # plot the explanation of the first prediction
        # Note the model is "multi-output" because it is rank-2 but only has one column
        shap.force_plot(explainer.expected_value[0], shap_values[0][0], x_test_words[0], matplotlib=True, show=False)

        img = BytesIO()
        plt.savefig(img, format='png', dpi=200)
        plt.clf()
        plt.cla()
        plt.close()
        img.seek(0)
        return img
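A hypothetical driver for the two methods above; the class name `ShapExplainer` is an assumption, since the source only shows `__init__` and `plot`:

# Hypothetical usage of the class sketched in Examples #1 and #2.
explainer = ShapExplainer()            # assumed class name, not given in the source
png_buffer = explainer.plot("this movie was a complete waste of time")

# plot() returns a BytesIO holding a rendered force plot, e.g. for an HTTP response or a file
with open("force_plot.png", "wb") as f:
    f.write(png_buffer.read())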
Example #3
def preprocess(data, tokenizer, maxlen=280):
    # convert raw texts to padded integer sequences with an already-fitted Tokenizer
    return pad_sequences(tokenizer.texts_to_sequences(data), maxlen=maxlen)
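A minimal usage sketch for `preprocess`; the sample texts and the fitting step are assumptions made for illustration:

# Hypothetical usage: the Tokenizer must already be fitted on the training corpus.
from keras.preprocessing.text import Tokenizer

texts = ["the movie was great", "the plot made no sense"]
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts)

X = preprocess(texts, tokenizer, maxlen=280)   # shape (2, 280), zero-padded at the front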
Example #4
    def __pad_size(self, x, length=None):
        # pad (or truncate) each sequence at the end rather than the front
        return pad_sequences(x, maxlen=length, padding='post')
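Example #4 pads at the end of each sequence; a small sketch of the difference from the default 'pre' padding (the direct `pad_sequences` import below is an assumption about where the snippet gets it from):

# 'post' padding appends zeros, 'pre' (the default) prepends them.
from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5]]
print(pad_sequences(seqs, maxlen=4, padding='post'))   # [[1 2 3 0] [4 5 0 0]]
print(pad_sequences(seqs, maxlen=4))                   # [[0 1 2 3] [0 0 4 5]]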
Example #5
text_data = pd.read_csv('../../tmp/all_data.txt', header=None)


# w2v=Word2Vec(text_data[0].apply(lambda x:x.split(' ')).tolist(),size=128, window=8, iter=30, min_count=2,
#                      sg=1, sample=0.002, workers=6 , seed=1017)

# w2v.wv.save_word2vec_format('../../tmp/w2v_128.txt')


tokenizer = Tokenizer(lower=False, char_level=False, split=' ')
tokenizer.fit_on_texts(data[1].tolist())
seq = tokenizer.texts_to_sequences(data[1].tolist())
# split into training and test sets
seq = pad_sequences(seq, maxlen=128, value=0)
train_seq = np.asarray(seq[:len(train)])
test_seq = seq[len(train):]


embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 128))
w2v = gensim.models.KeyedVectors.load_word2vec_format('../../tmp/w2v_128.txt',
                                                      binary=False)

# copy pretrained vectors into the embedding matrix; words missing from w2v keep zero vectors
for word in tokenizer.word_index:
    if word not in w2v.vocab:  # w2v is a KeyedVectors object, so query its vocab directly
        continue
    embedding_matrix[tokenizer.word_index[word]] = w2v[word]

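The snippet stops before `embedding_matrix` is used; a common next step (an assumption, not shown in the source) is to load it into a frozen Keras `Embedding` layer:

# Hypothetical continuation: use the pretrained vectors as fixed embedding weights.
from keras.layers import Embedding

embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                            output_dim=128,
                            weights=[embedding_matrix],
                            input_length=128,          # matches maxlen in pad_sequences above
                            trainable=False)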
Example #6
def train_model(X,
                y,
                mtype,
                cv,
                epochs,
                cv_models_path,
                train,
                X_test=None,
                y_test=None,
                nfolds=None,
                rs=42,
                max_features=40000,
                maxlen=400,
                dropout_rate=0.25,
                rec_units=150,
                embed_dim=50,
                batch_size=256,
                fscore=False,
                threshold=0.3):
    if cv:
        # cross-validated evaluation: the tokenizer and model are re-fit on every fold
        kf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=rs)
        auc = []
        roc = []
        fscore_ = []

        for c, (train_index, val_index) in enumerate(kf.split(X, y)):

            print(f' fold {c}')

            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            tokenizer = keras.preprocessing.text.Tokenizer(
                num_words=max_features)
            tokenizer.fit_on_texts(X_train)

            list_tokenized_train = tokenizer.texts_to_sequences(X_train)
            list_tokenized_val = tokenizer.texts_to_sequences(X_val)

            X_train = sequence.pad_sequences(list_tokenized_train,
                                             maxlen=maxlen)
            X_val = sequence.pad_sequences(list_tokenized_val, maxlen=maxlen)

            model = dl_model(model_type=mtype,
                             max_features=max_features,
                             maxlen=maxlen,
                             dropout_rate=dropout_rate,
                             embed_dim=embed_dim,
                             rec_units=rec_units,
                             max_sent_len=max_sen_len,
                             max_sent_amount=max_sent_amount)

            print('Fitting')
            if train:
                model.fit(X_train,
                          y_train,
                          batch_size=batch_size,
                          epochs=epochs,
                          shuffle=True,
                          verbose=1)
                model.save_weights(f'{cv_models_path}/{mtype}_fold_{c}.h5')
            else:
                model.load_weights(f'{cv_models_path}/{mtype}_fold_{c}.h5')

            probs = model.predict(X_val, batch_size=batch_size, verbose=1)

            if fscore:
                #for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
                probs_class = probs.copy()
                probs_class[probs_class >= threshold] = 1
                probs_class[probs_class < threshold] = 0
                precision = precision_score(y_val, probs_class)
                recall = recall_score(y_val, probs_class)
                f1 = f1_score(y_val, probs_class)  # keep the boolean `fscore` flag intact
                print(
                    f' {threshold} fold {c} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(f1, 3)}'
                )
                fscore_.append(f1)

            auc_f = average_precision_score(y_val, probs)
            auc.append(auc_f)
            roc_f = roc_auc_score(y_val, probs)
            roc.append(roc_f)

            print(
                f'fold {c} average precision {round(auc_f, 3)} ++++  roc auc {round(roc_f, 3)}'
            )

            del model
            K.clear_session()

        if fscore:
            print(
                f'PR-C {round(np.array(auc).mean(), 3)}  ++++ ROC AUC {round(np.array(roc).mean(), 3)}  ++++ FScore {round(np.array(fscore_).mean(), 3)}'
            )
            print(
                f'PR-C std {round(np.array(auc).std(), 3)}  ++++ ROC AUC std {round(np.array(roc).std(), 3)}  ++++ FScore std {round(np.array(fscore_).std(), 3)}'
            )
        else:
            print(
                f'PR-C {round(np.array(auc).mean(), 3)}  ++++ ROC AUC {round(np.array(roc).mean(), 3)}'
            )
            print(
                f'PR-C std {round(np.array(auc).std(), 3)}  ++++ ROC AUC std {round(np.array(roc).std(), 3)}'
            )

    else:
        # single hold-out evaluation: fit on X / y, score on X_test / y_test
        X_train = X
        y_train = y
        tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_features,
                                                       oov_token='unknown')
        tokenizer.fit_on_texts(X_train)

        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
        list_tokenized_test = tokenizer.texts_to_sequences(X_test)
        X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
        X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

        y_train = np.array(y_train)
        y_test = np.array(y_test)

        model = dl_model(model_type=mtype,
                         max_features=max_features,
                         maxlen=maxlen,
                         dropout_rate=dropout_rate,
                         embed_dim=embed_dim,
                         rec_units=rec_units,
                         max_sent_len=max_sen_len,
                         max_sent_amount=max_sent_amount)

        print('Fitting')

        if train:
            model.fit(X_train,
                      y_train,
                      batch_size=batch_size,
                      epochs=epochs,
                      shuffle=True,
                      verbose=1)
            model.save_weights(f'{cv_models_path}/{mtype}.h5')
        else:
            model.load_weights(f'{cv_models_path}/{mtype}.h5')
        probs = model.predict(X_test, batch_size=batch_size, verbose=1)
        auc_f = average_precision_score(y_test, probs)
        roc_f = roc_auc_score(y_test, probs)

        if fscore:
            probs_class = probs.copy()
            probs_class[probs_class >= threshold] = 1
            probs_class[probs_class < threshold] = 0
            precision = precision_score(y_test, probs_class)
            recall = recall_score(y_test, probs_class)
            f1 = f1_score(y_test, probs_class)

            print('_________________________________')
            print(
                f'PR-C is {round(auc_f, 3)}     ++++   ROC AUC is {round(roc_f, 3)}   +++++ FScore is {round(f1, 3)}'
            )
            print('_________________________________\n')
        else:
            print('_________________________________')
            print(
                f'PR-C is {round(auc_f,3)}     ++++   ROC AUC is {round(roc_f,3)}'
            )
            print('_________________________________\n')
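A hedged sketch of how `train_model` might be called; the model type string, paths, and fold count are illustrative, and `dl_model`, `max_sen_len`, and `max_sent_amount` must already exist in scope because the function refers to them:

# Hypothetical calls; every argument value below is illustrative only.
# 5-fold cross-validation, training from scratch:
train_model(X, y, mtype='lstm', cv=True, epochs=5, cv_models_path='models/cv',
            train=True, nfolds=5, fscore=True, threshold=0.3)

# Single hold-out evaluation, reloading previously saved weights:
train_model(X, y, mtype='lstm', cv=False, epochs=5, cv_models_path='models/cv',
            train=False, X_test=X_test, y_test=y_test)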
Example #7
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(data_text)

x_train_tokens = tokenizer.texts_to_sequences(x_train_text)
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

# Padding and truncating data
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

# use mean + 2 std as the sequence length; this covers the large majority of reviews
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)

pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens,
                            maxlen=max_tokens,
                            padding=pad,
                            truncating=pad)
x_test_pad = pad_sequences(x_test_tokens,
                           maxlen=max_tokens,
                           padding=pad,
                           truncating=pad)

# Tokenizer inverse map
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))


def tokens_to_string(tokens):
    # Map from tokens back to words, skipping the padding value 0.
    words = [inverse_map[token] for token in tokens if token != 0]
    return " ".join(words)
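A quick round-trip check (assumes `x_train_pad` and `x_train_text` from the snippet above):

# Decode the first padded training sequence back into text. Words outside the
# tokenizer's num_words cutoff were dropped by texts_to_sequences, so the
# reconstruction is usually shorter than the original review.
print(tokens_to_string(x_train_pad[0]))
print(x_train_text[0])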