import os
import sys
import numpy as np

# Project-local modules used throughout these functions (import paths assumed).
import load_data
import generation
from keras.utils.generic_utils import Progbar  # Progbar location in old Keras

# Assumed module-level constants; other functions in this file default to the same values.
PREM_LEN = 22
HYPO_LEN = 12


def train_model_embed(train, dev, glove, model, model_dir = 'models/curr_model',
                      nb_epochs = 20, batch_size = 64, hs=True, ci = True):
    X_dev_p, X_dev_h, y_dev = load_data.prepare_split_vec_dataset(dev, glove=glove)
    word_index = load_data.WordIndex(glove)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    for e in range(nb_epochs):
        print "Epoch ", e
        mb = load_data.get_minibatches_idx(len(train), batch_size, shuffle=True)
        p = Progbar(len(train))
        for i, train_index in mb:
            # Skip the final, smaller minibatch so every batch has a fixed size.
            if len(train_index) != batch_size:
                continue
            X_train_p, X_train_h, y_train = load_data.prepare_split_vec_dataset(
                [train[k] for k in train_index], word_index.index)
            padded_p = load_data.pad_sequences(X_train_p, maxlen = PREM_LEN, dim = -1, padding = 'pre')
            padded_h = load_data.pad_sequences(X_train_h, maxlen = HYPO_LEN, dim = -1, padding = 'post')
            data = {'premise_input': padded_p,
                    'embed_input': np.expand_dims(np.array(train_index), axis=1),
                    'output': padded_h}
            if ci:
                data['class_input'] = y_train
            if hs:
                # Feed the gold hypothesis as 'train_input'; the 'output' target becomes a dummy all-ones tensor.
                data['train_input'] = padded_h
                data['output'] = np.ones((batch_size, HYPO_LEN, 1))
            #sw = (padded_h != 0).astype(float)
            #train_loss = float(model.train_on_batch(data, sample_weight={'output':sw})[0])
            train_loss = float(model.train_on_batch(data)[0])
            p.add(len(train_index), [('train_loss', train_loss)])
        sys.stdout.write('\n')
        model.save_weights(model_dir + '/model~' + str(e))
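# Hedged usage sketch (not from the original file): kicking off training of the
# embedding-noise generative model. `train`, `dev`, `glove` and the compiled
# Keras Graph model `gen_graph_model` are assumed to be prepared elsewhere in the repo.
train_model_embed(train, dev, glove, gen_graph_model,
                  model_dir='models/curr_model', nb_epochs=20, batch_size=64)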
def generation_embded_test(train, glove, model, batch_size = 64, prem_len = 22, hypo_len = 12):
    # Single-batch sanity check: train once on the first batch_size examples using raw GloVe vectors.
    batch = np.arange(batch_size)
    X_prem, X_hypo, _ = load_data.prepare_split_vec_dataset([train[k] for k in batch], glove)
    X_p = load_data.pad_sequences(X_prem, maxlen = prem_len, dim = 50)
    X_h = load_data.pad_sequences(X_hypo, maxlen = hypo_len, dim = 50)
    data = {'premise_input': X_p,
            'embed_input': np.expand_dims(np.array(batch), axis=1),
            'output': X_h}
    return model.train_on_batch(data)
def generation_test(train, glove, model, batch_size = 64, prem_len = 22, hypo_len = 12):
    mb = load_data.get_minibatches_idx(len(train), batch_size, shuffle=True)
    p = Progbar(len(train))
    for i, train_index in mb:
        X_prem, X_hypo, _ = load_data.prepare_split_vec_dataset([train[k] for k in train_index], glove)
        X_p = load_data.pad_sequences(X_prem, maxlen = prem_len, dim = 50)
        X_h = load_data.pad_sequences(X_hypo, maxlen = hypo_len, dim = 50)
        train_loss = model.train_on_batch(X_p, X_h)[0]
        p.add(len(X_p), [('train_loss', train_loss)])
def adversarial_generator(train, gen_model, discriminator, word_index, beam_size):
    # Infer batch and sequence dimensions from the generator and discriminator models.
    batch_size, prem_len, _ = gen_model[0].inputs['premise'].input_shape
    examples = batch_size / beam_size
    hidden_size = gen_model[0].nodes['hypo_merge'].output_shape[2]
    hypo_len = discriminator.input_shape[1]
    while True:
        mb = load_data.get_minibatches_idx(len(train), examples, shuffle=True)
        for i, train_index in mb:
            if len(train_index) != examples:
                continue
            orig_batch = [train[k] for k in train_index]
            # Sample noise and random target classes, then beam-decode hypotheses.
            noise_input = np.random.normal(scale=0.11, size=(examples, 1, hidden_size))
            class_indices = np.random.random_integers(0, 2, examples)
            hypo_batch, probs = generative_predict_beam(gen_model, word_index, orig_batch,
                                                        noise_input, class_indices, True, hypo_len)
            # Score the generated hypotheses with the discriminator.
            ad_preds = discriminator.predict_on_batch(hypo_batch)[0].flatten()
            X_prem, _, _ = load_data.prepare_split_vec_dataset(orig_batch, word_index.index)
            premise_batch = load_data.pad_sequences(X_prem, maxlen = prem_len, dim = -1, padding = 'pre')
            yield {'premise': premise_batch, 'hypo': hypo_batch, 'label': class_indices,
                   'sanity': ad_preds, 'gen_probs': probs}
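# Hedged usage sketch (not from the original file): drawing one adversarial batch
# to inspect discriminator scores. `gen_model`, `discriminator`, `word_index` and
# the beam size are assumptions about objects built elsewhere in the repo.
adv_gen = adversarial_generator(train, gen_model, discriminator, word_index, beam_size=4)
adv_batch = next(adv_gen)
print adv_batch['sanity'].mean()  # mean discriminator score of the generated hypotheses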
def adverse_generate2(gen_model, ad_model, cmodel, train, word_index, glove,
                      threshold = 0.95, batch_size = 64, ci = False):
    mb = load_data.get_minibatches_idx(len(train), batch_size, shuffle=True)
    p = Progbar(len(train))
    results = []
    for i, train_index in mb:
        if len(train_index) != batch_size:
            continue
        orig_batch = [train[k] for k in train_index]
        class_indices = [load_data.LABEL_LIST.index(train[k][2]) for k in train_index]
        # Generate hypotheses conditioned on random embedding indices and the gold classes.
        probs = generation.generation_predict_embed(gen_model, word_index.index, orig_batch,
                    np.random.random_integers(0, len(train), len(orig_batch)),
                    class_indices = class_indices)
        gen_batch = generation.get_classes(probs)
        ad_preds = ad_model.predict_on_batch(gen_batch)[0].flatten()
        # Re-embed "premise -- hypothesis" pairs and score them with the classifier.
        X = []
        for j in range(len(orig_batch)):
            concat = orig_batch[j][0] + ["--"] + word_index.get_seq(gen_batch[j])
            X.append(load_data.load_word_vecs(concat, glove))
        X = np.array(X)
        X_padded = load_data.pad_sequences(X, dim = len(X[0][0]))
        cpreds = cmodel.predict_on_batch(X_padded)[0][np.arange(len(X_padded)), class_indices]
        pred_seq = [word_index.print_seq(gen) for gen in gen_batch]
        premises = [" ".join(ex[0]) for ex in orig_batch]
        classes = np.array(load_data.LABEL_LIST)[class_indices]
        zipped = zip(cpreds, ad_preds, premises, pred_seq, classes)
        # Keep examples where the product of classifier and adversarial scores clears the threshold.
        results += [el for el in zipped if el[0] * el[1] > threshold]
        p.add(len(train_index), [('added', float(len([el for el in zipped if el[0] * el[1] > threshold])))])
        if len(results) > 200:
            # Report how many training examples were processed before stopping early.
            print (i + 1) * batch_size
            return results
    return results
def generative_predict(test_model, word_index, batch, embed_indices, class_indices,
                       batch_size = 64, prem_len = 22, hypo_len = 12):
    prem, _, _ = load_data.prepare_split_vec_dataset(batch, word_index)
    padded_p = load_data.pad_sequences(prem, maxlen=prem_len, dim = -1)
    core_model, premise_func, noise_func = test_model
    premise = premise_func(padded_p)
    noise = noise_func(embed_indices, load_data.convert_to_one_hot(class_indices, 3))
    # Seed the stateful decoder with the noise vector and decode greedily, one token at a time.
    core_model.reset_states()
    core_model.nodes['attention'].set_state(noise)
    word_input = np.zeros((batch_size, 1))
    result = []
    for i in range(hypo_len):
        data = {'premise': premise, 'creative': noise, 'hypo_input': word_input,
                'train_input': np.zeros((batch_size, 1))}
        preds = core_model.predict_on_batch(data)['output']
        result.append(preds)
        word_input = np.argmax(preds, axis=2)
    result = np.transpose(np.array(result)[:, :, -1, :], (1, 0, 2))
    return result
def generative_train_generator(train, word_index, batch_size = 64, prem_len = 22, hypo_len = 12):
    while True:
        mb = load_data.get_minibatches_idx(len(train), batch_size, shuffle=True)
        for i, train_index in mb:
            if len(train_index) != batch_size:
                continue
            X_train_p, X_train_h, y_train = load_data.prepare_split_vec_dataset(
                [train[k] for k in train_index], word_index.index)
            padded_p = load_data.pad_sequences(X_train_p, maxlen = prem_len, dim = -1, padding = 'pre')
            padded_h = load_data.pad_sequences(X_train_h, maxlen = hypo_len, dim = -1, padding = 'post')
            # Shift the hypothesis by one position: 'hypo_input' starts with a zero token and
            # 'train_input' ends with one, so the decoder predicts the next word at each step.
            hypo_input = np.concatenate([np.zeros((batch_size, 1)), padded_h], axis = 1)
            train_input = np.concatenate([padded_h, np.zeros((batch_size, 1))], axis = 1)
            yield {'premise_input': padded_p, 'hypo_input': hypo_input, 'train_input': train_input,
                   'noise_input': np.expand_dims(train_index, axis=1), 'class_input': y_train,
                   'output': np.ones((batch_size, hypo_len + 1, 1))}
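# Hedged usage sketch (not from the original file): a manual training loop driven by
# the generator above. `gen_graph_model` is an assumed, already-compiled Keras Graph
# model that accepts the dict batches yielded here; the update count is arbitrary.
gen = generative_train_generator(train, word_index, batch_size=64)
for _ in range(1000):
    batch_data = next(gen)
    loss = float(gen_graph_model.train_on_batch(batch_data)[0])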
def test_model(model, dev, glove, batch_size = 100, return_probs = False):
    X_dev, y_dev = load_data.prepare_vec_dataset(dev, glove)
    dmb = load_data.get_minibatches_idx(len(X_dev), batch_size, shuffle=False)
    #dmb = load_data.get_minibatches_idx_bucketing([len(ex[0]) + len(ex[1]) for ex in dev], batch_size, shuffle=True)
    y_pred = np.zeros((len(y_dev), 3))
    for i, dev_index in dmb:
        X_padded = load_data.pad_sequences(X_dev[dev_index], dim = len(X_dev[0][0]))
        y_pred[dev_index] = model.predict_on_batch(X_padded)
    acc = np.sum(np.argmax(y_pred, axis=1) == np.argmax(y_dev, axis=1)) / float(len(y_pred))
    if return_probs:
        return acc, y_pred
    else:
        return acc
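# Hedged usage sketch (not from the original file): evaluating a trained classifier on
# the dev split while keeping per-class probabilities. `model`, `dev` and `glove` are
# assumed to be loaded elsewhere.
acc, y_probs = test_model(model, dev, glove, batch_size=100, return_probs=True)
print 'dev accuracy:', acc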
def generation_predict_embed(model, word_index, batch, embed_indices, batch_size = 64, hs = True, class_indices = None):
    prem, hypo, y = load_data.prepare_split_vec_dataset(batch, word_index)
    X_p = load_data.pad_sequences(prem, maxlen=PREM_LEN, dim = -1)
    data = {'premise_input': X_p, 'embed_input': embed_indices[:, None]}
    if class_indices is not None:
        C = load_data.convert_to_one_hot(class_indices, 3)
        data['class_input'] = C
    if hs:
        data['train_input'] = np.zeros((batch_size, HYPO_LEN))
    model_pred = model.predict_on_batch(data)
    return model_pred['output']
def update_model_once(model, glove, train_data):
    X_train, y_train = load_data.prepare_vec_dataset(train_data, glove=glove)
    X_padded = load_data.pad_sequences(X_train, dim = len(X_train[0][0]))
    model.train_on_batch(X_padded, y_train, accuracy=True)
# NOTE: this snippet resumes mid-way through a model-loading call; the custom
# attention layer is registered via custom_objects.
                       custom_objects={'AttentionWithContext': AttentionWithContext})
loaded_model.load_weights("rnn_docs_ranking.h5")
print("Loaded model from disk")
print(loaded_model.summary())

from keras import backend as K
# With a Sequential model: build a backend function that returns the output of layer 3.
get_td_layer_output = K.function([loaded_model.layers[0].input],
                                 [loaded_model.layers[3].output])

data, y = read_file('../hatespeech', with_evaluation=True)
data = preprocess_doc(data, True)
data = [s.split(" ") for s in data]
data = pad_sequences(data)
word_counts, vocabulary, vocabulary_inv = build_vocab(data)
x = build_input_data_rnn(data, vocabulary, len(data[0]))
print('Loaded data')

print("Computing predicted labels")
y_dist = loaded_model.predict(x)
y_pred = y_dist.argmax(axis=1)
print(classification_report(y, y_pred))

print("Computing 3rd layer output")
td_output = get_td_layer_output([x])[0]

W, b, u = loaded_model.layers[4].get_weights()
def graph_train_batch(train, dev, model, glove, embed_size = 300):
    P, H, y = load_data.prepare_split_vec_dataset(train[:128], glove)
    padded_P = load_data.pad_sequences(P, dim = embed_size)
    padded_H = load_data.pad_sequences(H, dim = embed_size)
    data = {'premise_input': padded_P, 'hypo_input': padded_H, 'output': y}
    return model.train_on_batch(data)
def make_train_batch(orig_batch, word_index, hypo_len):
    _, X_hypo, _ = load_data.prepare_split_vec_dataset(orig_batch, word_index.index)
    return load_data.pad_sequences(X_hypo, maxlen = hypo_len, dim = -1, padding = 'post')