def initialize(self, hidden_size, projection_size, in_vocabulary,
               out_vocabulary, batch_size, hidden_activation="Tanh",
               bptt_steps=5, use_pauses=False):
    self.hidden_size = hidden_size
    self.projection_size = projection_size
    self.bptt_steps = bptt_steps
    self.batch_size = batch_size
    self.use_pauses = use_pauses
    self.in_vocabulary = in_vocabulary
    self.out_vocabulary = out_vocabulary
    self.hidden_activation_name = hidden_activation
    self.hidden_activation = getattr(activation_functions, hidden_activation)

    self.We = self.weights(get_vocabulary_size(self.in_vocabulary), self.projection_size)
    self.Wp = self.weights(1, self.projection_size)
    self.W = self.weights(self.projection_size, self.hidden_size * 4)
    self.Wip = self.weights(1, self.hidden_size)
    self.Wfp = self.weights(1, self.hidden_size)
    self.Wop = self.weights(1, self.hidden_size)
    self.Wr = self.weights(self.hidden_size, self.hidden_size * 4)
    self.Wy = self.weights(self.hidden_size, get_vocabulary_size(self.out_vocabulary))

    # AdaGrad sum of squares of per-feature historical gradients
    for p in self.params:
        setattr(self, p + "_hg", np.zeros_like(getattr(self, p)))

    self.reset_state()
    self.initialized = True
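# The initializers in this collection rely on a `self.weights(n_in, n_out)`
# initializer and a module-level `get_vocabulary_size` helper that are not shown
# in these snippets. Below is a plausible stand-alone sketch of such helpers,
# assuming a token -> id mapping for the vocabulary and a fan-in-scaled uniform
# initialization; the actual project code may differ:
import numpy as np

def get_vocabulary_size(vocabulary):
    # Assumption: vocabulary is a token -> integer-id mapping
    return len(vocabulary)

def weights(n_in, n_out):
    # Assumption: small uniform initialization scaled by fan-in
    scale = np.sqrt(1.0 / n_in)
    return np.random.uniform(-scale, scale, size=(n_in, n_out))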
def train(model_name, p1_train_data, p1_dev_data, p2_train_data, p2_dev_data):
    ### PHASE 1 ###
    training_data = np.load(p1_train_data)
    validation_data = np.load(p1_dev_data)

    assert training_data["batch_size"] == validation_data["batch_size"]
    assert training_data["vocabulary"] == validation_data["vocabulary"]
    assert training_data["punctuations"] == validation_data["punctuations"]

    print "1st phase data loaded..."
    print "Vocabulary size is %d" % utils.get_vocabulary_size(validation_data["vocabulary"])
    print "Training set size is %d" % training_data["total_size"]
    print "Validation set size is %d" % validation_data["total_size"]

    net = models.T_LSTM()
    net.initialize(hidden_size=conf.PHASE1["HIDDEN_SIZE"],
                   projection_size=conf.PHASE1["PROJECTION_SIZE"],
                   in_vocabulary=training_data["vocabulary"],
                   out_vocabulary=training_data["punctuations"],
                   batch_size=training_data["batch_size"],
                   hidden_activation=conf.PHASE1["HIDDEN_ACTIVATION"],
                   bptt_steps=conf.PHASE1["BPTT_STEPS"],
                   use_pauses=False)

    _train(net, training_data, validation_data, model_name,
           conf.PHASE1["LEARNING_RATE"], conf.PHASE1["MAX_EPOCHS"],
           conf.PHASE1["MIN_IMPROVEMENT"])

    ### PHASE 2 ###
    if not os.path.isfile(p2_train_data) or not os.path.isfile(p2_dev_data):
        print "No second phase data."
        return

    training_data = np.load(p2_train_data)
    validation_data = np.load(p2_dev_data)

    assert training_data["batch_size"] == validation_data["batch_size"] == net.batch_size
    assert training_data["vocabulary"] == validation_data["vocabulary"] == net.in_vocabulary
    assert training_data["punctuations"] == validation_data["punctuations"] == net.out_vocabulary

    print "2nd phase data loaded..."
    print "Training set size is %d" % training_data["total_size"]
    print "Validation set size is %d" % validation_data["total_size"]
    print "Training %s pause durations." % ("with" if conf.PHASE2["USE_PAUSES"] else "without")

    t_lstm = net
    net = models.TA_LSTM()
    net.initialize(hidden_size=conf.PHASE2["HIDDEN_SIZE"],
                   t_lstm=t_lstm,
                   out_vocabulary=training_data["punctuations"],
                   batch_size=training_data["batch_size"],
                   hidden_activation=conf.PHASE2["HIDDEN_ACTIVATION"],
                   bptt_steps=conf.PHASE2["BPTT_STEPS"],
                   use_pauses=conf.PHASE2["USE_PAUSES"])

    _train(net, training_data, validation_data, model_name,
           conf.PHASE2["LEARNING_RATE"], conf.PHASE2["MAX_EPOCHS"],
           conf.PHASE2["MIN_IMPROVEMENT"])
def train(model_name, p1_train_data, p1_dev_data, p2_train_data, p2_dev_data):
    ### PHASE 1 ###
    training_data = np.load(p1_train_data)
    validation_data = np.load(p1_dev_data)

    assert training_data["batch_size"] == validation_data["batch_size"]
    assert training_data["vocabulary"] == validation_data["vocabulary"]
    assert training_data["punctuations"] == validation_data["punctuations"]

    print("1st phase data loaded...")
    print("Vocabulary size is %d" % utils.get_vocabulary_size(validation_data["vocabulary"]))
    print("Training set size is %d" % training_data["total_size"])
    print("Validation set size is %d" % validation_data["total_size"])

    net = models.T_LSTM()
    net.initialize(hidden_size=conf.PHASE1["HIDDEN_SIZE"],
                   projection_size=conf.PHASE1["PROJECTION_SIZE"],
                   in_vocabulary=training_data["vocabulary"],
                   out_vocabulary=training_data["punctuations"],
                   batch_size=training_data["batch_size"],
                   hidden_activation=conf.PHASE1["HIDDEN_ACTIVATION"],
                   gate_activation=conf.GATE_ACTIVATION,
                   bptt_steps=conf.PHASE1["BPTT_STEPS"],
                   use_pauses=False)

    _train(net, training_data, validation_data, model_name,
           conf.PHASE1["LEARNING_RATE"], conf.PHASE1["MAX_EPOCHS"],
           conf.PHASE1["MIN_IMPROVEMENT"])
def initialize(self, hidden_size, projection_size, in_vocabulary,
               out_vocabulary, batch_size, hidden_activation="Tanh",
               gate_activation="Sigmoid", bptt_steps=5, use_pauses=False):
    self.hidden_size = hidden_size
    self.projection_size = projection_size
    self.bptt_steps = bptt_steps
    self.batch_size = batch_size
    self.use_pauses = use_pauses
    self.in_vocabulary = in_vocabulary
    self.out_vocabulary = out_vocabulary
    self.hidden_activation_name = hidden_activation
    self.hidden_activation = getattr(activation_functions, hidden_activation)
    self.gate_activation_name = gate_activation
    self.gate_activation = getattr(activation_functions, gate_activation)

    self.We = self.weights(get_vocabulary_size(self.in_vocabulary), self.projection_size)
    self.Wp = self.weights(1, self.projection_size)
    self.W = self.weights(self.projection_size, self.hidden_size * 4)
    self.Wip = self.weights(1, self.hidden_size)
    self.Wfp = self.weights(1, self.hidden_size)
    self.Wop = self.weights(1, self.hidden_size)
    self.Wr = self.weights(self.hidden_size, self.hidden_size * 4)
    self.Wy = self.weights(self.hidden_size, get_vocabulary_size(self.out_vocabulary))

    # AdaGrad sum of squares of per-feature historical gradients
    for p in self.params:
        setattr(self, p + "_hg", np.zeros_like(getattr(self, p)))

    self.reset_state()
    self.initialized = True
def initialize(self, hidden_size, t_lstm, out_vocabulary, batch_size,
               hidden_activation="Tanh", bptt_steps=5, use_pauses=False):
    assert isinstance(t_lstm, T_LSTM)

    self.hidden_size = hidden_size
    self.t_lstm = t_lstm
    self.bptt_steps = bptt_steps
    self.batch_size = batch_size
    self.use_pauses = use_pauses
    self.in_vocabulary = self.t_lstm.in_vocabulary
    self.out_vocabulary = out_vocabulary
    self.hidden_activation_name = hidden_activation
    self.hidden_activation = getattr(activation_functions, hidden_activation)

    self.W = self.weights(self.t_lstm.hidden_size, self.hidden_size * 4)
    self.Wp = self.weights(1, self.hidden_size * 4)
    self.Wy = self.weights(self.hidden_size, get_vocabulary_size(self.out_vocabulary))
    self.Wip = self.weights(1, self.hidden_size)
    self.Wfp = self.weights(1, self.hidden_size)
    self.Wop = self.weights(1, self.hidden_size)
    self.Wr = self.weights(self.hidden_size, self.hidden_size * 4)

    # AdaGrad sum of squares of per-feature historical gradients
    for p in self.params:
        setattr(self, p + "_hg", np.zeros_like(getattr(self, p)))

    self.reset_state()
    self.initialized = True
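# The `*_hg` buffers created in the initializers above accumulate squared
# gradients per parameter. A minimal sketch of the AdaGrad update such buffers
# would typically feed, assuming a `gradients` dict keyed like `self.params`;
# this is an illustration, not the project's actual update code:
import numpy as np

def adagrad_update(model, gradients, learning_rate, eps=1e-8):
    for p in model.params:
        grad = gradients[p]                      # gradient for parameter p (assumed shape-compatible)
        hg = getattr(model, p + "_hg")
        hg += grad ** 2                          # accumulate squared gradients in place
        param = getattr(model, p)
        param -= learning_rate * grad / (np.sqrt(hg) + eps)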
SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100
HIDDEN_SIZE = 150
ATTENTION_SIZE = 50
KEEP_PROB = 0.8
BATCH_SIZE = 256
NUM_EPOCHS = 3  # Model easily overfits without pre-trained word embeddings, so train for only a few epochs
DELTA = 0.5
MODEL_PATH = './model'

# Load the data set
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)

# Sequence pre-processing
vocabulary_size = get_vocabulary_size(X_train)
X_test = fit_in_vocabulary(X_test, vocabulary_size)
X_train = zero_pad(X_train, SEQUENCE_LENGTH)
X_test = zero_pad(X_test, SEQUENCE_LENGTH)

# Different placeholders
with tf.name_scope('Inputs'):
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(
        tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0),
        trainable=True)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)
def read_sparse_input(lang_code, concepts_to_include, validation_set):
    '''Reads data for language lang_code from a file in the sparse matrix folder.
    All of the concepts not found in the set concepts_to_include are excluded.
    Additionally, the concepts found in validation_set are also excluded from the
    read data (even if they are found in concepts_to_include).'''
    rows = []
    columns = []
    data = []
    concept_id_2_index = {}
    intersection_with_concept_validation_set = set()  # Sanity check

    with gzip.open(utils.get_sparse_matrix_file_path(lang_code), 'r') as f:
        # Used to check for repeated assignment of the same cell (which should not happen)
        non_zero = set()
        # Used to check that word IDs conform to the vocabulary size
        vocabulary_size = utils.get_vocabulary_size(lang_code)
        documents_counter = 0

        for line in codecs.getreader("utf-8")(f):
            parts = line.split()
            if len(parts) != 3:
                raise Exception("Line formatting not as expected! Line: %s" % line)

            concept = parts[0]
            word_id = int(parts[1])

            if concept not in concepts_to_include:
                continue
            if concept in validation_set:
                intersection_with_concept_validation_set.add(concept)
                continue

            # Keeps track of which concept is described in the given document
            if concept not in concept_id_2_index:
                if documents_counter % 100000 == 0:
                    print("%d documents processed" % documents_counter)
                    sys.stdout.flush()
                concept_id_2_index[concept] = documents_counter
                documents_counter = documents_counter + 1

            r = concept_id_2_index[concept]
            if word_id > vocabulary_size:
                raise Exception('Word_ID higher than vocabulary size!')
            c = word_id
            d = int(parts[2])

            field = (r, c)
            if field in non_zero:
                # raise Exception('Repeated cell assignment!')
                print('Repeated cell assignment!')
                print(field)
            else:
                non_zero.add(field)
                rows.append(r)
                columns.append(c)
                data.append(d)

    index_2_concept_id = {
        index: concept_id for concept_id, index in concept_id_2_index.items()
    }

    # Generating the word index
    word_id_2_index = {}
    # Removes zero columns by first generating a new index for the nonzero columns,
    # then updating the document representation to comply with the new index,
    # which contains only nonzero columns
    c = 0
    non_zero_word_ids = sorted(set(columns))
    for word_id in non_zero_word_ids:
        word_id_2_index[word_id] = c
        c += 1
    for i in range(len(columns)):
        word_id = columns[i]
        columns[i] = word_id_2_index[word_id]
    index_2_word_id = {
        index: word_id for word_id, index in word_id_2_index.items()
    }

    print("Intersection with validation set in %s counts %d elements" %
          (lang_code, len(intersection_with_concept_validation_set)))  # Sanity check
    sys.stdout.flush()

    temp_processed_data = {'rows': np.array(rows),
                           'columns': np.array(columns),
                           'data': np.array(data),
                           'concept_id_2_index': concept_id_2_index,
                           'index_2_concept_id': index_2_concept_id,
                           'word_id_2_index': word_id_2_index,
                           'index_2_word_id': index_2_word_id}
    return temp_processed_data
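# A minimal sketch of how the triplets returned above could be assembled into an
# actual sparse document-term matrix. Using `scipy.sparse.coo_matrix` here is an
# assumption for illustration, not necessarily what the surrounding project does:
from scipy.sparse import coo_matrix

def to_sparse_matrix(processed):
    n_docs = len(processed['concept_id_2_index'])    # one row per concept/document
    n_words = len(processed['word_id_2_index'])      # one column per nonzero word id
    return coo_matrix(
        (processed['data'], (processed['rows'], processed['columns'])),
        shape=(n_docs, n_words)).tocsr()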
def train_RNN(SEQUENCE_LENGTH, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
              KEEP_PROB, BATCH_SIZE, NUM_EPOCHS, DELTA, LEARNING_RATE,
              ALPHA_DIVIDER, MAXIMUM_DATA_NUM):
    t1 = time.time()

    # Load preprocessed data
    with open('raw_sums', 'rb') as fp:
        raw_sums = pickle.load(fp)
    with open('raw_texts', 'rb') as fp:
        raw_texts = pickle.load(fp)
    with open('vocab_limit', 'rb') as fp:
        vocab_limit = pickle.load(fp)

    # Removing stopwords for the y_in of the ARNN
    y_trains = []
    for line in raw_sums:
        line = [word for word in line if word not in open('english.txt').read()]
        y_trains.append(line)
    print(y_trains[0])

    # Embed vocabulary (important for our int2word later to have only one embedding)
    embd_vocab = []
    max_words = len(vocab_limit)
    tokenizer = Tokenizer(num_words=max_words)
    # This builds the word index
    tokenizer.fit_on_texts(vocab_limit)
    # This turns strings into lists of integer indices.
    embd_vocab = tokenizer.texts_to_sequences(vocab_limit)
    embd_vocab = list(itertools.chain(*embd_vocab))
    # embd_vocab.append(encoded_docs)
    print('TRAIN_ARNN: vocab embedded.')
    print('Length of EMBD_VOCAB: ' + str(len(embd_vocab)))

    # Saving
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    keys = vocab_limit[:]
    values = embd_vocab[:]
    dictionary = dict(zip(keys, values))

    def word2int(texts):
        embd_texts = []
        for each_text in texts:
            embd_text = []
            for word in each_text:
                for key in dictionary.keys():
                    if key == word:
                        embd_text.append(dictionary.get(key))
            embd_texts.append(embd_text)
        return embd_texts
        print('TRAIN_ARNN: ' + str(texts) + ' embedded.')  # unreachable in the original

    embd_texts = word2int(raw_texts)
    embd_sums = word2int(raw_sums)
    embd_summaries = word2int(y_trains)

    #%% ADDED BY MKLOENNE from summarization model -- Creating train and test sets
    train_len = int((.7) * len(embd_sums))
    train_texts = embd_texts[0:train_len]
    train_summaries = embd_sums[0:train_len]
    train_sums = embd_summaries[0:train_len]

    val_len = int((.15) * len(embd_sums))
    '''
    val_texts = embd_texts[train_len:train_len+val_len]
    val_summaries = embd_sums[train_len:train_len+val_len]
    '''
    test_texts = embd_texts[train_len + val_len:len(embd_sums)]
    test_summaries = embd_sums[train_len + val_len:len(embd_sums)]
    test_sums = embd_summaries[train_len + val_len:len(embd_summaries)]

    # Load the dataset
    (X_train, y_train), (X_test, y_tests) = (train_texts, train_summaries), (test_texts, test_sums)
    y_trains = train_sums

    # Convert text lists into numpy arrays to get a shape like the imdb dataset for train and test data
    def convert2array(listtoconvert, comparing):
        converted_arr = []
        maxList = max(max(len(x) for x in listtoconvert),
                      max(len(x) for x in comparing))
        pre_array = np.asarray([np.asarray(x) for x in listtoconvert])
        for arr in pre_array:
            arr = np.lib.pad(arr, (0, maxList - len(arr)), 'constant', constant_values=0)
            text_with_zero = np.concatenate([[1], arr])
            text_with_zero = np.concatenate([text_with_zero, [0]])
            converted_arr.append(text_with_zero)
        converted_arr = np.asarray(converted_arr)
        return converted_arr

    max_y = max(max(len(x) for x in train_sums), max(len(x) for x in test_sums)) + 2
    y_trains = convert2array(train_sums, test_sums)
    y_tests = convert2array(test_sums, train_sums)
    X_test = convert2array(test_texts, train_texts)
    X_train = convert2array(train_texts, test_texts)
    y_test = convert2array(test_summaries, train_summaries)
    y_train = convert2array(train_summaries, test_summaries)
    print(str(X_train[0]))

    #%%
    def normalize(i, mini, maxi):
        norm = (np.float64(i) - mini) / (maxi - mini)
        return norm

    vocabulary_size = int(get_vocabulary_size(X_train))
    print('SIZE X_TRAIN:' + str(vocabulary_size))

    # Different placeholders
    batch_ph = tf.placeholder(tf.int32, [None, None], name="X")
    target_ph = tf.placeholder(tf.float32, [None, None], name="Y")  # adapted for y
    seq_len_ph = tf.placeholder(tf.int32, [None], name="SEQ_LEN")
    keep_prob_ph = tf.placeholder(tf.float32, name="KEEP_PROB")

    with tf.name_scope("Embedding-Layer"):
        # Embedding layer
        embeddings_var = tf.Variable(
            tf.random_uniform([vocabulary_size, EMBEDDING_DIM], 0.0, 1.0),
            trainable=True, name="embedding")
        batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

    with tf.name_scope("RNN"):
        # (Bi-)RNN layer(-s)
        rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE),
                                inputs=batch_embedded,
                                sequence_length=seq_len_ph,
                                dtype=tf.float32)
        #, _ = rnn(GRUCell(HIDDEN_SIZE), inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)

    with tf.name_scope("Attention-Layer"):
        # Attention layer
        attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)

    with tf.name_scope("Dropout"):
        # Dropout
        drop = tf.nn.dropout(attention_output, keep_prob_ph)

    with tf.name_scope("Dense"):
        # Fully connected layer
        W = tf.Variable(tf.truncated_normal(shape=[HIDDEN_SIZE * 2, max_y], stddev=0.1),
                        name="weights")  # Hidden size is multiplied by 2 for Bi-RNN
        b = tf.Variable(tf.constant(0., shape=[max_y]), name="bias")
        y_hat = tf.nn.xw_plus_b(drop, W, b)
        y_hat = tf.squeeze(y_hat)

    with tf.name_scope("loss"):
        # Cross-entropy loss and optimizer initialization
        loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_ph))
        # = tf.reduce_mean(tf.nn.nce_loss(weights=W, biases=b, labels=target_ph, inputs=y_hat))
        tf.summary.scalar('loss', loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE).minimize(loss)

    with tf.name_scope("Accuracy"):
        # Accuracy metric
        accuracy = tf.reduce_mean(
            tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph), tf.float32))
        tf.summary.scalar('Accuracy', accuracy)
        # rec, rec_op = tf.metrics.recall(labels=target_ph, predictions=y_hat)
        # pre, pre_op = tf.metrics.precision(labels=target_ph, predictions=y_hat)

    # Batch generators
    train_batch_generator = batch_generator(X_train, y_trains, BATCH_SIZE)
    test_batch_generator = batch_generator(X_test, y_tests, BATCH_SIZE)

    #%%
    print(type([x for x in X_train[0]]))

    saver = tf.train.Saver()

    # Train session
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        train_writer = tf.summary.FileWriter('./logs/rnn_logs' + '/train')
        train_writer.add_graph(sess.graph)
        test_writer = tf.summary.FileWriter('./logs/rnn_logs' + '/test')
        test_writer.add_graph(sess.graph)
        merged = tf.summary.merge_all()
        losses = []
        print("TRAIN_ARNN: Start learning...")
        for epoch in range(NUM_EPOCHS):
            loss_train = 0
            loss_test = 0
            accuracy_train = 0
            accuracy_test = 0
            print("epoch: {}\t".format(epoch), end="")

            # Training
            num_batches = X_train.shape[0] // BATCH_SIZE
            for b in range(num_batches):
                x_batch, y_batch = next(train_batch_generator)
                maxValue_y = max([max(sublist) for sublist in y_batch])
                minValue_y = min([min(sublist) for sublist in y_batch])
                j = 0
                y_batch_normed = []
                for each_y_batch in y_batch:
                    norm_y = [normalize(i, minValue_y, maxValue_y) for i in each_y_batch]
                    y_batch_normed.append(norm_y)
                    j += 1
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
                summary, loss_tr, acc, _ = sess.run(
                    [merged, loss, accuracy, optimizer],
                    feed_dict={
                        batch_ph: x_batch,
                        target_ph: y_batch_normed,
                        seq_len_ph: seq_len,
                        keep_prob_ph: KEEP_PROB
                    })
                accuracy_train += acc
                loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
                train_writer.add_summary(summary, epoch * num_batches + b)
            if accuracy_train != 0:
                accuracy_train /= num_batches

            saver.save(sess, "/home/kloe_mr/AnacondaProjects/tf-rnn-attention/Bachelorarbeit/BA/buffering-model-rnn.ckpt")

            # Testing
            print(X_test.shape)
            num_batches = X_test.shape[0] // BATCH_SIZE
            for b in range(num_batches):
                x_batch, y_batch = next(test_batch_generator)
                maxValue_y = max([max(sublist) for sublist in y_batch])
                minValue_y = min([min(sublist) for sublist in y_batch])
                j = 0
                y_batch_normed = []
                for each_y_batch in y_batch:
                    norm_y = [normalize(i, minValue_y, maxValue_y) for i in each_y_batch]
                    y_batch_normed.append(norm_y)
                    j += 1
                seq_len = np.array([list(x).index(0) + 1 for x in x_batch])  # actual lengths of sequences
                summary, loss_test_batch, acc = sess.run(
                    [merged, loss, accuracy],
                    feed_dict={
                        batch_ph: x_batch,
                        target_ph: y_batch_normed,
                        seq_len_ph: seq_len,
                        keep_prob_ph: 1.0
                    })
                accuracy_test += acc
                loss_test += loss_test_batch
                test_writer.add_summary(summary, epoch * num_batches + b)
            if loss_test == 0:
                print('loss_test is zero!')
                break
            if accuracy_train != 0:
                accuracy_test /= num_batches
                loss_test /= num_batches
            losses.append(loss_test)
            print("loss: {:.3f}, val_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}".format(
                loss_train, loss_test, accuracy_train, accuracy_test))

        # saver.save(sess, "H:/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt")  # "model")
        saver.save(sess, "/home/kloe_mr/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt")  # "model")

    #%% Restore session
    with tf.Session() as sess:
        saver.restore(sess, "/home/kloe_mr/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt")  # "model")
        # saver.restore(sess, "H:/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt")  # "model")
        max_y = max(max(len(x) for x in train_summaries), max(len(x) for x in test_summaries)) + 2
        x_batch_train, y_batch_train = X_train[:len(X_train)], y_trains[:len(y_trains)]
        seq_len_test = np.array([list(x).index(0) + 1 for x in x_batch_train])
        maxValue_y = max([max(sublist) for sublist in y_batch])
        minValue_y = min([min(sublist) for sublist in y_batch])
        j = 0
        y_batch_normed = []
        for each_y_batch in y_batch_train:
            norm_y = [normalize(i, minValue_y, maxValue_y) for i in each_y_batch]
            y_batch_normed.append(norm_y)
            j += 1
        alphas_train = sess.run(
            [alphas],
            feed_dict={
                batch_ph: x_batch_train,
                target_ph: y_batch_normed,
                seq_len_ph: seq_len_test,
                keep_prob_ph: 1.0
            })

    #%% Restore session
    with tf.Session() as sess:
        saver.restore(sess, "/home/kloe_mr/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt")  # "model")
        # saver.restore(sess, "H:/AnacondaProjects/tf-rnn-attention/tf-rnn-attention-master/model.ckpt")  # "model")
        max_y = max(max(len(x) for x in train_summaries), max(len(x) for x in test_summaries)) + 2
        x_batch_test, y_batch_test = X_test[:len(X_test)], y_tests[:len(y_tests)]
        seq_len_test = np.array([list(x).index(0) + 1 for x in x_batch_test])
        maxValue_y = max([max(sublist) for sublist in y_batch])
        minValue_y = min([min(sublist) for sublist in y_batch])
        j = 0
        y_batch_normed = []
        for each_y_batch in y_batch_test:
            norm_y = [normalize(i, minValue_y, maxValue_y) for i in each_y_batch]
            y_batch_normed.append(norm_y)
            j += 1
        alphas_test = sess.run(
            [alphas],
            feed_dict={
                batch_ph: x_batch_test,
                target_ph: y_batch_normed,
                seq_len_ph: seq_len_test,
                keep_prob_ph: 1.0
            })

    #%% Setting up the generated outputs by choosing the most heavily weighted highlights out of the text for the LSTM input
    alphas_values = alphas_train[:][0]
    y_batch = y_train[:len(y_train)]

    # Save visualization as HTML
    rnn_outs = []
    # with open("visualization.html", "w") as html_file:
    for words, alphas in zip(x_batch_train, alphas_values):
        rnn_out = []
        Largest_alphas = len(words) // ALPHA_DIVIDER
        if Largest_alphas == 0:
            continue
        min_value = min(heapq.nlargest(Largest_alphas, alphas, key=None))
        for word, alpha in zip(words, alphas):  # _values):  # / alphas_values.max()):
            if alpha > min_value:
                if word not in rnn_out:
                    # Think about whether it makes sense to remove the 2nd appearance;
                    # maybe check for frequency in the list heap
                    rnn_out.append(word)
            if word == 0:
                break
            # html_file.write('<font style="background: rgba(255, 255, 0, %f)">%s</font>\n' % (alpha, word))
        rnn_outs.append(rnn_out)
    print('RNN_OUTS: ' + str(len(rnn_outs)))

    alphas_values = alphas_test[:][0]
    # y_batch = y_test[:len(y_test)]
    # print(alphas_values[0])

    # Save visualization as HTML
    rnntest_outs = []
    # with open("visualization.html", "w") as html_file:
    for words, alphas in zip(x_batch_test, alphas_values):
        rnn_out = []
        Largest_alphas = len(words) // ALPHA_DIVIDER
        if Largest_alphas == 0:
            continue
        min_value = min(heapq.nlargest(Largest_alphas, alphas, key=None))
        for word, alpha in zip(words, alphas):  # _values):  # / alphas_values.max()):
            if alpha > min_value:
                if word not in rnn_out:
                    # Think about whether it makes sense to remove the 2nd appearance;
                    # maybe check for frequency in the list heap
                    rnn_out.append(word)
            if word == 0:
                break
            # html_file.write('<font style="background: rgba(255, 255, 0, %f)">%s</font>\n' % (alpha, word))
        rnntest_outs.append(rnn_out)

    # Creates a list of lists with re-embedded words
    def int2word(data):
        lists = []
        for paragraph in data:
            words = []
            for word in paragraph:
                if word == 1:
                    continue
                if word == 0:
                    break
                words.append(list(dictionary.keys())[list(dictionary.values()).index(word)])
            lists.append(words)
        return lists

    rnn_out = int2word(rnn_outs)
    batch_words = int2word(y_batch_train)
    y_batch = int2word(y_batch)
    print(rnn_out[0])
    print(batch_words[0])

    mean_testloss = np.mean(losses)
    avg_testloss = np.mean(losses)
    std_testloss = np.std(losses)
    ## POSSIBLY ALSO FOR THE TEST DATA (????)

    # Pack into a csv file
    t2 = time.time()
    t = t2 - t1

    r = Rouge()

    def rouge_score(rnn_outs, rnn_out, batch_words):
        d = []
        for i in range(len(rnn_outs)):
            system_generated_summary = rnn_out[i]
            manual_summmary = batch_words[i]
            try:
                [precision, recall, f_score] = r.rouge_l([system_generated_summary], [manual_summmary])
            except ZeroDivisionError:
                continue
            d.append((batch_words[i], rnn_out[i], precision, recall, f_score, t,
                      EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, KEEP_PROB,
                      BATCH_SIZE, NUM_EPOCHS, DELTA, LEARNING_RATE, ALPHA_DIVIDER,
                      MAXIMUM_DATA_NUM, mean_testloss, avg_testloss, std_testloss))
            print("Summary " + str(i) + "\nPrecision is :" + str(precision) +
                  "\nRecall is :" + str(recall) + "\nF Score is :" + str(f_score))
        return d

    d = rouge_score(rnn_outs, rnn_out, batch_words)
    df = pd.DataFrame(d, columns=('Original Sum', 'Predicted Sum', 'Precision', 'Recall',
                                  'F_Score', 'RUNTIME/sec', 'EMBEDDING_DIM', 'HIDDEN_SIZE',
                                  'ATTENTION_SIZE', 'KEEP_PROB', 'BATCH_SIZE', 'EPOCHS',
                                  'DELTA', 'LEARNING_RATE', 'ALPHA_DIVIDER',
                                  'MAXIMUM_DATA_NUM', 'MEAN_TESTLOSS', 'AVG_TESTLOSS',
                                  'STD_TESTLOSS'))
    print(df.head(2))
    df.to_csv("arnn_train_1" + str(BATCH_SIZE) + "_batches" + str(NUM_EPOCHS) + "_epochs.csv", sep=',')

    #%%
    # Saving
    with open('rnn_out.pickle', 'wb') as handle:
        pickle.dump(rnn_out, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Saving
    with open('batch_words.pickle', 'wb') as handle:
        pickle.dump(y_batch, handle, protocol=pickle.HIGHEST_PROTOCOL)

    rnntest_out = int2word(rnntest_outs)
    batch_words = int2word(y_batch_test)
    d = rouge_score(rnntest_outs, rnntest_out, batch_words)
    df = pd.DataFrame(d, columns=('Original Sum', 'Predicted Sum', 'Precision', 'Recall',
                                  'F_Score', 'RUNTIME/sec', 'EMBEDDING_DIM', 'HIDDEN_SIZE',
                                  'ATTENTION_SIZE', 'KEEP_PROB', 'BATCH_SIZE', 'EPOCHS',
                                  'DELTA', 'LEARNING_RATE', 'ALPHA_DIVIDER',
                                  'MAXIMUM_DATA_NUM', 'MEAN_TESTLOSS', 'AVG_TESTLOSS',
                                  'STD_TESTLOSS'))
    print(df.head(2))
    df.to_csv("arnn_test_1" + str(BATCH_SIZE) + "_batches" + str(NUM_EPOCHS) + "_epochs.csv", sep=',')
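# A minimal usage sketch for train_RNN above, reusing the hyperparameter values
# from the stand-alone attention script earlier in this collection.
# LEARNING_RATE, ALPHA_DIVIDER and MAXIMUM_DATA_NUM are not defined anywhere in
# these snippets, so the values below are purely illustrative assumptions:
if __name__ == '__main__':
    train_RNN(SEQUENCE_LENGTH=250, EMBEDDING_DIM=100, HIDDEN_SIZE=150,
              ATTENTION_SIZE=50, KEEP_PROB=0.8, BATCH_SIZE=256, NUM_EPOCHS=3,
              DELTA=0.5, LEARNING_RATE=1e-3, ALPHA_DIVIDER=4,
              MAXIMUM_DATA_NUM=10000)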