def __init__(self):
    # Load IMDB reviews, keeping only the 20,000 most frequent words,
    # and pad every review to a fixed length of 80 tokens.
    (self.x_train, _), (_, _) = imdb.load_data(num_words=20000)
    self.x_train = sequence.pad_sequences(self.x_train, maxlen=80)

    # Keep references to the TF1 session and graph so the model can be
    # reused later from other calls/threads.
    self.session = tf.Session()
    self.graph = tf.get_default_graph()
    set_session(self.session)
    self.model = load_model('models/pretrained/shap_imdb.h5')
def plot(self, review):
    # Tokenize the review and keep only words that are in the IMDB index
    # and within the 20,000-word vocabulary used at training time.
    result = text_to_word_sequence(review)
    words = imdb.get_word_index()
    result = list(filter(lambda x: x in words and int(words.get(x)) <= 20000, result))
    preprocess = np.array([list(map(lambda x: int(words.get(x)), result))])
    preprocess = sequence.pad_sequences(preprocess, maxlen=80)

    # we use the first 100 training examples as our background dataset to integrate over
    with self.graph.as_default():
        set_session(self.session)
        explainer = shap.DeepExplainer(self.model, self.x_train[:100])

    # explain the prediction for this review
    # explaining each prediction requires 2 * background dataset size runs
    with self.graph.as_default():
        set_session(self.session)
        shap_values = explainer.shap_values(preprocess)

    # init the JS visualization code
    shap.initjs()

    # transform the indexes back to words
    words = imdb.get_word_index()
    num2word = {}
    for w in words.keys():
        num2word[words[w]] = w
    x_test_words = np.stack([np.array(list(map(lambda x: num2word.get(x, "NONE"), preprocess[0])))])

    # plot the explanation of the first prediction
    # Note the model is "multi-output" because it is rank-2 but only has one column
    shap.force_plot(explainer.expected_value[0], shap_values[0][0], x_test_words[0],
                    matplotlib=True, show=False)

    # Render the force plot to an in-memory PNG and return the buffer.
    img = BytesIO()
    plt.savefig(img, format='png', dpi=200)
    plt.clf()
    plt.cla()
    plt.close()
    img.seek(0)
    return img
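A minimal usage sketch for the two methods above. The enclosing class is not shown in this snippet, so the name ShapExplainer below is a placeholder, as is the example review text.

# Hypothetical usage; ShapExplainer stands in for the real (unnamed) class.
explainer_service = ShapExplainer()   # loads IMDB data and the pretrained model
png_buffer = explainer_service.plot("this movie was surprisingly good and well acted")
with open('force_plot.png', 'wb') as f:   # plot() returns a BytesIO holding a PNG
    f.write(png_buffer.read())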
def preprocess(data, tokenizer, maxlen=280):
    return pad_sequences(tokenizer.texts_to_sequences(data), maxlen=maxlen)
def __pad_size(self, x, length=None):
    return pad_sequences(x, maxlen=length, padding='post')
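A small sketch of how the preprocess helper above could be called. The corpus and the tokenizer settings are placeholders for illustration only.

# Sketch: fit a Tokenizer on some corpus, then pad new texts to maxlen=280.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

corpus = ["an example document", "another short text"]   # placeholder corpus
tok = Tokenizer(num_words=10000)
tok.fit_on_texts(corpus)
padded = preprocess(["another short example"], tok)       # array of shape (1, 280)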
text_data = pd.read_csv('../../tmp/all_data.txt', header=None)

# In[20]:

# w2v = Word2Vec(text_data[0].apply(lambda x: x.split(' ')).tolist(), size=128, window=8, iter=30, min_count=2,
#                sg=1, sample=0.002, workers=6, seed=1017)
# w2v.wv.save_word2vec_format('../../tmp/w2v_128.txt')

# In[78]:

tokenizer = Tokenizer(lower=False, char_level=False, split=' ')
tokenizer.fit_on_texts(data[1].tolist())
seq = tokenizer.texts_to_sequences(data[1].tolist())

# Split into training and test sets
seq = pad_sequences(seq, maxlen=128, value=0)
train_seq = np.asarray(seq[:len(train)])
test_seq = seq[len(train):]

# In[61]:

# Build the embedding matrix from the pretrained word2vec vectors;
# words missing from the word2vec vocabulary keep a zero vector.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 128))
w2v = gensim.models.KeyedVectors.load_word2vec_format('../../tmp/w2v_128.txt', binary=False)
for word in tokenizer.word_index:
    if word not in w2v.wv.vocab:
        continue
    embedding_matrix[tokenizer.word_index[word]] = w2v[word]

# In[71]:
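The embedding_matrix built above is typically passed to a Keras Embedding layer as its initial weights. The model below is only an illustrative sketch; the actual architecture used in this project is not part of this snippet.

# Sketch: wire the pretrained word2vec weights into an Embedding layer.
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1,
              output_dim=128,
              weights=[embedding_matrix],
              input_length=128,
              trainable=False),      # freeze the pretrained vectors
    LSTM(64),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])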
def train_model(X, y, mtype, cv, epochs, cv_models_path, train,
                X_test=None, y_test=None, nfolds=None, rs=42,
                max_features=40000, maxlen=400, dropout_rate=0.25,
                rec_units=150, embed_dim=50, batch_size=256,
                fscore=False, threshold=0.3):
    if cv:
        # Stratified cross-validation: tokenize inside each fold so the
        # vocabulary is fit only on that fold's training split.
        kf = StratifiedKFold(n_splits=nfolds, random_state=rs)
        auc = []
        roc = []
        fscore_ = []
        for c, (train_index, val_index) in enumerate(kf.split(X, y)):
            print(f' fold {c}')
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(X_train)
            list_tokenized_train = tokenizer.texts_to_sequences(X_train)
            list_tokenized_val = tokenizer.texts_to_sequences(X_val)
            X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
            X_val = sequence.pad_sequences(list_tokenized_val, maxlen=maxlen)

            model = dl_model(model_type=mtype, max_features=max_features, maxlen=maxlen,
                             dropout_rate=dropout_rate, embed_dim=embed_dim,
                             rec_units=rec_units, max_sent_len=max_sen_len,
                             max_sent_amount=max_sent_amount)
            print('Fitting')
            if train:
                model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                          shuffle=True, verbose=1)
                model.save_weights(f'{cv_models_path}/{mtype}_fold_{c}.h5')
            else:
                model.load_weights(f'{cv_models_path}/{mtype}_fold_{c}.h5')

            probs = model.predict(X_val, batch_size=batch_size, verbose=1)

            if fscore:
                # for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
                probs_class = probs.copy()
                probs_class[probs_class >= threshold] = 1
                probs_class[probs_class < threshold] = 0
                precision = precision_score(y_val, probs_class)
                recall = recall_score(y_val, probs_class)
                fscore = f1_score(y_val, probs_class)
                print(f' {threshold} fold {c} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore, 3)}')
                fscore_.append(fscore)

            auc_f = average_precision_score(y_val, probs)
            auc.append(auc_f)
            roc_f = roc_auc_score(y_val, probs)
            roc.append(roc_f)
            print(f'fold {c} average precision {round(auc_f, 3)} ++++ roc auc {round(roc_f, 3)}')

            del model
            K.clear_session()

        if fscore:
            print(f'PR-C {round(np.array(auc).mean(), 3)} ++++ ROC AUC {round(np.array(roc).mean(), 3)} ++++ FScore {round(np.array(fscore_).mean(), 3)}')
            print(f'PR-C std {round(np.array(auc).std(), 3)} ++++ ROC AUC std {round(np.array(roc).std(), 3)} ++++ FScore std {round(np.array(fscore_).std(), 3)}')
        else:
            print(f'PR-C {round(np.array(auc).mean(), 3)} ++++ ROC AUC {round(np.array(roc).mean(), 3)}')
            print(f'PR-C std {round(np.array(auc).std(), 3)} ++++ ROC AUC std {round(np.array(roc).std(), 3)}')
    else:
        # Single train/test split: fit the tokenizer on the full training set
        # and evaluate once on the held-out test set.
        X_train = X
        y_train = y

        tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_features,
                                                        oov_token='unknown')
        tokenizer.fit_on_texts(X_train)
        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
        list_tokenized_test = tokenizer.texts_to_sequences(X_test)
        X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
        X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
        y_train = np.array(y_train)
        y_test = np.array(y_test)

        model = dl_model(model_type=mtype, max_features=max_features, maxlen=maxlen,
                         dropout_rate=dropout_rate, embed_dim=embed_dim,
                         rec_units=rec_units, max_sent_len=max_sen_len,
                         max_sent_amount=max_sent_amount)
        print('Fitting')
        if train:
            model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                      shuffle=True, verbose=1)
            model.save_weights(f'{cv_models_path}/{mtype}.h5')
        else:
            model.load_weights(f'{cv_models_path}/{mtype}.h5')

        probs = model.predict(X_test, batch_size=batch_size, verbose=1)
        auc_f = average_precision_score(y_test, probs)
        roc_f = roc_auc_score(y_test, probs)

        if fscore:
            probs_class = probs.copy()
            probs_class[probs_class >= threshold] = 1
            probs_class[probs_class < threshold] = 0
            precision = precision_score(y_test, probs_class)
            recall = recall_score(y_test, probs_class)
            fscore = f1_score(y_test, probs_class)

        if fscore:
            print('_________________________________')
            print(f'PR-C is {round(auc_f, 3)} ++++ ROC AUC is {round(roc_f, 3)} +++++ FScore is {round(fscore, 3)}')
            print('_________________________________\n')
        else:
            print('_________________________________')
            print(f'PR-C is {round(auc_f, 3)} ++++ ROC AUC is {round(roc_f, 3)}')
            print('_________________________________\n')
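A sketch of how train_model might be invoked. It assumes dl_model, max_sen_len and max_sent_amount are defined at module level; the data variables and the 'gru' model type below are placeholders, not values from this project.

# Illustrative call only; X/y are raw texts and binary labels.
train_model(X=train_texts, y=train_labels,
            mtype='gru', cv=True, epochs=3,
            cv_models_path='models/cv', train=True,
            nfolds=5, fscore=True, threshold=0.3)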
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(data_text)
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

# Padding and Truncating Data
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

# Use the mean sequence length plus two standard deviations as the cutoff,
# which covers the bulk of the texts while keeping sequences short.
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)

pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens, padding=pad, truncating=pad)
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens, padding=pad, truncating=pad)

# Tokenizer Inverse Map
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

def tokens_to_string(tokens):
    # Map from tokens back to words, skipping the padding value 0.
    words = [inverse_map[token] for token in tokens if token != 0]
    text = ' '.join(words)
    return text
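A quick round-trip check of the inverse map, using the x_train_pad and x_train_text variables defined above.

# Decode the first padded sequence back to words and compare with the source text.
print(tokens_to_string(x_train_pad[0]))
print(x_train_text[0])   # original text for comparison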