import numpy as np
import tensorflow as tf

# Import the helper directly: the function below shadows the module name
# `load_embeddings`, so calling load_embeddings.load_embedding() from inside
# it would fail with an AttributeError.
from load_embeddings import load_embedding


def load_embeddings(sess, model, metadata, path):
    # Map variable names to the corresponding trainable variables.
    tv_dict = {v.name: v for v in tf.trainable_variables()}

    if model.mode in (ATTENTION_MODE, ATTENTION_DIVERSITY_MODE):
        name1 = 'embedding_attention_seq2seq'
        name2 = 'embedding_attention_decoder'
    else:
        name1 = 'embedding_rnn_seq2seq'
        name2 = 'embedding_rnn_decoder'

    emb1 = tv_dict['decoder/%s/rnn/embedding_wrapper/embedding:0' % name1]
    emb2 = tv_dict['decoder/%s/%s/embedding:0' % (name1, name2)]

    # Sanity check: print the embedding sums before and after loading to
    # verify that the assignment actually changed the variables.
    print(np.sum(sess.run(emb1)), np.sum(sess.run(emb2)))

    # This project's variant of load_embedding accepts a list of variables.
    load_embedding(sess, metadata['w2idx'], [emb1, emb2], path,
                   dim_embedding=300,
                   vocab_length=len(metadata['w2idx']))

    print(np.sum(sess.run(emb1)), np.sum(sess.run(emb2)))
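# For reference: a minimal sketch of the `load_embedding` helper that the
# snippets in this section call. This is an assumption reconstructed from the
# call sites (the project's actual version may differ, e.g. above it receives
# a list of variables and a vocab_length keyword): it reads word2vec vectors
# with gensim and copies them into the graph variable through a placeholder
# plus assign op, using small random vectors for words missing from the
# pretrained file.
from gensim import models


def load_embedding(session, vocab, emb, path, dim_embedding, vocab_size):
    """Fill the TF variable `emb` with pretrained word2vec vectors."""
    print("Loading external embeddings from %s" % path)
    w2v = models.KeyedVectors.load_word2vec_format(path, binary=False)

    external_embedding = np.zeros(shape=(vocab_size, dim_embedding))
    matches = 0
    for tok, idx in vocab.items():
        if tok in w2v.vocab:  # gensim < 4.0 API
            external_embedding[idx] = w2v[tok]
            matches += 1
        else:
            # Out-of-vocabulary words get small random vectors.
            external_embedding[idx] = np.random.uniform(
                low=-0.25, high=0.25, size=dim_embedding)
    print("%d words out of %d could be loaded" % (matches, vocab_size))

    # Assign through a placeholder so the big matrix is not baked into the graph.
    pretrained = tf.placeholder(tf.float32, [vocab_size, dim_embedding])
    session.run(emb.assign(pretrained), {pretrained: external_embedding})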
def _embeddings(self, pretrained=False, scope_name=None):
    """Compute word embeddings for sentence.

    Parameters
    ----------
    pretrained: bool, default False
        Whether to use pretrained embeddings
    scope_name: str, default None
        Variable scope
    """
    if not scope_name:
        scope_name = "Embedding"

    self.sentence_ph = tf.placeholder(dtype=tf.int32,
                                      shape=[None, self.time_steps + 1],
                                      name="Sentence_placeholder")

    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
        self.embedding_matrix = tf.get_variable(
            name="embedding_matrix",
            shape=[self.len_corpus, self.embedding_size],
            initializer=xav_init()  # Xavier initializer
        )
        if pretrained:
            print("Loading pretrained embeddings...")
            load_embedding(session=self.session,
                           vocab=self.dataset.word_to_idx,
                           emb=self.embedding_matrix,
                           path=self.dataset.embedding_file,
                           vocab_size=self.len_corpus,
                           dim_embedding=self.embedding_size)

    self.word_embeddings = tf.nn.embedding_lookup(self.embedding_matrix,
                                                  self.sentence_ph)
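# A toy illustration (standalone, with assumed toy values, not part of the
# model above) of what tf.nn.embedding_lookup does with the matrix built in
# _embeddings: each word id in the int32 batch indexes a row of the matrix.
import numpy as np
import tensorflow as tf

matrix = tf.constant(np.arange(12, dtype=np.float32).reshape(4, 3))  # vocab=4, dim=3
ids = tf.constant([[0, 2], [3, 3]])                                  # batch of word ids
looked_up = tf.nn.embedding_lookup(matrix, ids)                      # shape (2, 2, 3)

with tf.Session() as sess:
    print(sess.run(looked_up))  # rows 0,2 and 3,3 of the matrix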
import sys
import time

import numpy as np
import tensorflow as tf

import load_embeddings
import model


def main():
    # Write to both logfile and stdout.
    timestamp = time.strftime('%Y-%m-%d--%H_%M_%S')
    sys.stdout = Logger(timestamp)

    # Read train data.
    train_reader = Reader(vocab_size=cfg["vocab_size"],
                          sentence_length=cfg["sentence_length"],
                          max_sentences=cfg["max_sentences"])
    train_reader.build_dict(cfg["dictionary_name"], cfg["path"]["train"])
    train_reader.read_sentences(cfg["path"]["train"])

    if cfg["use_pretrained"]:
        # Read the given embeddings into a zero-initialized variable.
        sess = tf.Session()
        embeddings_blank = tf.Variable(
            dtype=tf.float32,
            initial_value=np.zeros(shape=(cfg["vocab_size"],
                                          cfg["embeddings_size"])))
        embeddings = load_embeddings.load_embedding(
            session=sess,
            vocab=train_reader.vocab_dict,
            emb=embeddings_blank,
            path=cfg["path"]["embeddings"],
            dim_embedding=cfg["embeddings_size"])
        m = model.Model(cfg=cfg, embeddings=embeddings)
    else:
        m = model.Model(cfg=cfg)

    # Training.
    m.build_forward_prop()
    m.build_backprop()

    # Read evaluation data.
    eval_reader = Reader(vocab_size=cfg["vocab_size"],
                         sentence_length=cfg["sentence_length"],
                         vocab_dict=train_reader.vocab_dict,
                         max_sentences=cfg["max_test_sentences"])
    eval_reader.read_sentences(cfg["path"]["eval"])
    m.train(train_data=train_reader.id_data, test_data=eval_reader.id_data)

    # Read test data.
    test_reader = Reader(vocab_size=cfg["vocab_size"],
                         sentence_length=cfg["sentence_length"],
                         vocab_dict=train_reader.vocab_dict,
                         max_sentences=cfg["max_test_sentences"])
    test_reader.read_sentences(cfg["path"]["test"])

    # Invert the dictionary (id -> word) for the perplexity computation.
    reverted_dict = {idx: word for word, idx in test_reader.vocab_dict.items()}
    m.test(data=test_reader.id_data, vocab_dict=reverted_dict)
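# A minimal sketch of the Logger used above to tee stdout into a logfile.
# Only the idea that `sys.stdout = Logger(timestamp)` duplicates prints to
# disk comes from the code; the log path and directory are assumptions, and
# the log directory is assumed to exist.
import sys


class Logger(object):
    def __init__(self, timestamp, logdir="logs"):
        self.terminal = sys.stdout
        self.log = open("%s/run-%s.log" % (logdir, timestamp), "a")

    def write(self, message):
        # Mirror every write to both the console and the logfile.
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()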
import math
import time

import numpy as np
import tensorflow as tf

import load_embeddings


def train_network(operators, sentences_array, num_epochs, vocabulary,
                  configProto=None, num_steps=MAX_SENTENCE_LENGTH,
                  batch_size=BATCH_SIZE, state_size=CELL_SIZE,
                  checkpoint_filename=None, useWord2Vec=False):
    """Trains the network using a given graph.

    operators            Dictionary of graph operators to execute
    sentences_array      Input sentences (in index form)
    num_epochs           Number of epochs to train for
    vocabulary           Word2Index dictionary of our vocabulary
    configProto          Session configuration for TensorFlow
    num_steps            Number of steps of our RNN / sequence length
    batch_size           Number of sentences per batch
    state_size           Size of the hidden state in an RNN cell
    checkpoint_filename  Name of the file to save the graph.
                         Set to None if no saving is required.
    useWord2Vec          Whether to load word embeddings from the provided
                         word2vec file.
    """
    with tf.Session(config=configProto) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=1)
        global_step = 1

        # Init TensorBoard summaries. This saves TensorBoard information into
        # a different folder at each run.
        timestamp = str(math.trunc(time.time()))
        train_writer = tf.summary.FileWriter(
            "{}{}-{}-training".format(LOG_DIRECTORY, timestamp, checkpoint_filename),
            graph=tf.get_default_graph())
        validation_writer = tf.summary.FileWriter(
            "{}{}-{}-validation".format(LOG_DIRECTORY, timestamp, checkpoint_filename),
            graph=tf.get_default_graph())

        # If the flag is set, load word2vec embeddings. This is done just
        # before training because it requires the session object; the W_embed
        # tensor is fetched from the "word_embedding" variable scope.
        # Note: load_embeddings.py requires the gensim package.
        if useWord2Vec:
            with tf.variable_scope("word_embedding"):
                tf.get_variable_scope().reuse_variables()
                W_embed = tf.get_variable("W_embed")
                load_embeddings.load_embedding(
                    session=sess, vocab=vocabulary, emb=W_embed,
                    path="wordembeddings-dim100.word2vec",
                    dim_embedding=100, vocab_size=VOCABULARY_SIZE)

        # Set up the feed dictionary and fetch the operators to execute.
        # For training we always feed zero states at the beginning of each sentence.
        zero_state = np.zeros([batch_size, state_size])
        feed_dict = {
            operators['init_state_a']: zero_state,
            operators['init_state_b']: zero_state
        }
        summary_op = operators['summary_op']
        train_step = operators['train_step']
        x = operators['x']
        y = operators['y']

        for epoch in range(num_epochs):
            print("Starting epoch {}".format(epoch))
            for X, Y in shuffle_iterator(sentences_array, batch_size, num_steps):
                global_step += 1
                feed_dict[x] = X
                feed_dict[y] = Y

                if global_step % VALIDATION_SUMMARY_FREQUENCY == 0:
                    # Every VALIDATION_SUMMARY_FREQUENCY-th batch is used for
                    # validation only: weights are not updated on it, so the
                    # summary reflects data the network has not trained on.
                    summary_validation = sess.run(summary_op, feed_dict)
                    validation_writer.add_summary(summary_validation, global_step)
                elif global_step % TRAIN_SUMMARY_FREQUENCY == 0:
                    # Every TRAIN_SUMMARY_FREQUENCY steps, also evaluate the
                    # summary on the current training batch.
                    _, summary_train = sess.run([train_step, summary_op], feed_dict)
                    train_writer.add_summary(summary_train, global_step)
                else:
                    # In every other case, just train the network.
                    sess.run([train_step], feed_dict)

                # Regularly save the current model to disk; this way, no
                # matter when the script is killed on the GPU VM, the latest
                # graph state is kept.
                if global_step % CHECKPOINT_FREQUENCY == 0 and checkpoint_filename is not None:
                    saver.save(sess, "./{}-ep{}".format(checkpoint_filename, epoch),
                               global_step=global_step)

        # At the end of the whole training process, save the model to disk.
        if checkpoint_filename is not None:
            saver.save(sess, "./{}".format(checkpoint_filename),
                       global_step=global_step)
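# A minimal sketch, under assumed semantics, of the shuffle_iterator used in
# train_network above; the project's actual version is not shown here. It
# shuffles the sentence matrix once per pass and yields (input, target)
# batches where the target is the input shifted by one word. Each row is
# assumed to hold num_steps + 1 word ids.
import numpy as np


def shuffle_iterator(sentences_array, batch_size, num_steps):
    data = np.array(sentences_array)
    np.random.shuffle(data)
    num_batches = len(data) // batch_size
    for b in range(num_batches):
        batch = data[b * batch_size:(b + 1) * batch_size]
        X = batch[:, :num_steps]       # words 0 .. num_steps-1
        Y = batch[:, 1:num_steps + 1]  # next-word targets, shifted by one
        yield X, Y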
def main(): """load configs & data -> preprocessing""" max_predicted_words = 20 """ PARAMETERS INTO TENSORFLOW FLAGS -> the advantage : Variables can be accessed from a tensorflow object without explicitely passing them""" tf.flags.DEFINE_string("train_set", train_set, "Path to the training data") # Model parameters tf.flags.DEFINE_integer("embeddings_size", embeddings_size, "Dimensionality of word embeddings (default: 50)") tf.flags.DEFINE_integer("vocabulary_size", vocabulary_size, "Size of the vocabulary (default: 20k)") # tf.flags.DEFINE_integer("past_words", 3, "How many previous words are used for prediction (default: 3)") # Training parameters tf.flags.DEFINE_integer("batch_size", batch_size, "Batch Size (default: 64)") tf.flags.DEFINE_integer("num_epochs", num_epochs, "Number of training epochs (default: 200)") tf.flags.DEFINE_integer("evaluate_every", evaluate_every, "Evaluate model on dev set after this many steps (default: 100)") tf.flags.DEFINE_integer("checkpoint_every", checkpoint_every, "Save model after this many steps (default: 100)") tf.flags.DEFINE_integer("num_checkpoints", num_checkpoints, "Number of checkpoints to store (default: 5)") tf.flags.DEFINE_integer("lstm_cell_state", lstm_cell_state, "Number of units inside the lastm cell") tf.flags.DEFINE_integer("lstm_cell_state_down", lstm_cell_state_down, "Number of units inside the lstm cell") # Tensorflow Parameters tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") # for running on EULER, adapt this tf.flags.DEFINE_integer("inter_op_parallelism_threads", 4, "TF nodes that perform blocking operations are enqueued on a pool of inter_op_parallelism_threads available in each process (default 0).") tf.flags.DEFINE_integer("intra_op_parallelism_threads", 4, "The execution of an individual op (for some op types) can be parallelized on a pool of intra_op_parallelism_threads (default: 0).") """Printing model configuration to command line""" FLAGS = tf.flags.FLAGS # FLAGS._parse_flags() # add if using tensorflow version <= 1.3 print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value.value)) # print("{}={}".format(attr.upper(), value)) # change to this if using tensorflow version <= 1.3 print("") def train_step(x_batch, y_batch): """ A single training step, x_batch = y_batch Both are matrices indices of words """ feed_dict = { lstm_network.input_x: x_batch, lstm_network.input_y: y_batch, lstm_network.init_state_hidden: lstm_network.next_hidden_state, lstm_network.init_state_current: lstm_network.next_current_state } _, step, summaries, loss, accuracy, new_hidden_state, new_current_state, vocab_idx_predictions = sess.run( [train_optimizer, global_step, train_summary_op, lstm_network.loss, lstm_network.accuracy, lstm_network.init_state_hidden, lstm_network.init_state_current, lstm_network.vocab_indices_predictions], feed_dict) # print("Predictions indices w.r.t vocabulary") # print(vocab_idx_predictions) # print("Example of sentence predicted by the network by training") # print(train_utils.words_mapper_from_vocab_indices(vocab_idx_predictions, utils.vocabulary_words_list, # is_tuple=True)[0:28]) # print("Groundtruth for the sentence predicted by the network above") # print(train_utils.words_mapper_from_vocab_indices(np.reshape(x_batch, [batch_size * 29]), # utils.vocabulary_words_list)[0:28]) lstm_network.next_hidden_state = new_hidden_state 
lstm_network.next_current_state = new_current_state time_str = datetime.datetime.now().isoformat() print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) train_summary_writer.add_summary(summaries, step) def predicting_step(word, state): """The input in this case is represented by a single word""" feed_dict = { lstm_network.input_x: word, init_state_hidden: state[0], init_state_current: state[1] } word_predicted, next_final_state = sess.run( [lstm_network.vocab_indices_predictions, lstm_network.final_lstm_state], feed_dict) """Word indices in vocabulary -> charachter words""" word_predicted = np.array(word_predicted).reshape((1, 1)) # print(train_utils.words_mapper_from_vocab_indices(word_predicted, utils.vocabulary)) """Update state in the lstm, which is the contextual memory""" next_hidden_state, next_current_state = next_final_state state = (next_hidden_state, next_current_state) # print(next_hidden_state) return word_predicted, state def eval_step(current_step): """ Evaluates the model on eval_set """ eval_dataset, vocabulary_words_list = data_utilities.data_utils(model_to_load, embeddings_size, sentence_len, vocabulary_size, bos, eos, pad, unk).load_eval_data(eval_set, vocabulary_pkl) batches = train_utils.batch_iter_train(data=eval_dataset, batch_size=test_batch_size, num_epochs=1, shuffle=False) perplexities = [] # array with perplexities for each sentence for i, batch in enumerate(batches): _, y_batch = zip(*batch) y_batch = train_utils.words_mapper_to_vocab_indices(y_batch, vocabulary_words_list) feed_dict = { lstm_network.input_x: y_batch, lstm_network.init_state_hidden: np.zeros([test_batch_size, lstm_cell_state]), lstm_network.init_state_current: np.zeros([test_batch_size, lstm_cell_state]) } estimates = sess.run(lstm_network.softmax, feed_dict) estimates = np.reshape(estimates, [-1, sentence_len-1, vocabulary_size]) for j, sentence in enumerate(y_batch): sentence_perplexity = eval.perplexity(sentence, estimates[j], vocabulary_words_list) print("Sentence {} in batch {}: perplexity {}".format(test_batch_size * i + j, i, sentence_perplexity)) perplexities.append(sentence_perplexity) if eval_perpl_file: eval.write_perplexity(perplexities, eval_step=True, current_step=current_step) return if lstm_is_training: """Preprocess data""" utils = data_utilities.data_utils(model_to_load, embeddings_size, sentence_len, vocabulary_size, bos, eos, pad, unk) model_w2v, dataset = utils.load_train_data(train_set) # dataset=dataset[0:100] dataset_size = len(dataset) print("Total sentences in the dataset: ", dataset_size) print("Example of a random wrapped sentence in dataset ", dataset[(randint(0, dataset_size))]) print("Example of the first wrapped sentence in dataset ", dataset[0]) with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement, inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads, intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads) sess = tf.Session(config=session_conf) with sess.as_default(): # Initialize model lstm_network = model_lstm.lstm_model( vocab_size=FLAGS.vocabulary_size, embedding_size=FLAGS.embeddings_size, words_in_sentence=sentence_len - 1, lstm_cell_size=lstm_cell_state, lstm_cell_size_down=lstm_cell_state_down, down_project=down_project ) """Please note that the tf variables keeps updated, ready to be printed out or logged to file""" global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = 
                train_optimizer = optimizer.minimize(lstm_network.loss,
                                                     global_step=global_step)

                # Output directory for models and summaries.
                timestamp = str(int(time.time()))
                out_dir = os.path.abspath(os.path.join(os.path.curdir, runs_dir, timestamp))
                print("Writing to {}\n".format(out_dir))

                # Summaries for loss and accuracy.
                loss_summary = tf.summary.scalar("loss", lstm_network.loss)
                acc_summary = tf.summary.scalar("accuracy", lstm_network.accuracy)

                # Train summaries.
                train_summary_op = tf.summary.merge([loss_summary, acc_summary])
                train_summary_dir = os.path.join(out_dir, "summaries", "train")
                train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

                # # Dev summaries
                # dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
                # dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
                # dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

                # Checkpoint directory (TensorFlow assumes this directory
                # already exists, so we need to create it).
                checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
                checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                saver = tf.train.Saver(tf.global_variables(),
                                       max_to_keep=FLAGS.num_checkpoints)

                sess.run(tf.global_variables_initializer())
                lstm_network.next_hidden_state = np.zeros([batch_size, lstm_cell_state])
                lstm_network.next_current_state = np.zeros([batch_size, lstm_cell_state])

                if training_with_w2v:
                    total_ids = len(utils.vocabulary_words_list)
                    vocab_and_ids = dict(zip(utils.vocabulary_words_list, range(total_ids)))
                    load_embeddings.load_embedding(session=sess,
                                                   vocab=vocab_and_ids,
                                                   emb=lstm_network.W_embedding,
                                                   path=embeddings,
                                                   dim_embedding=embeddings_size,
                                                   vocab_size=total_ids)

                # `batches` is a generator (see training_utilities for more
                # information): iterating over it yields a new batch each
                # time, sequentially with respect to the original dataset.
                batches = train_utils.batch_iter_train(data=dataset,
                                                       batch_size=batch_size,
                                                       num_epochs=num_epochs,
                                                       shuffle=shuffle_training,
                                                       testing=False)
                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    x_batch = train_utils.words_mapper_to_vocab_indices(
                        x_batch, utils.vocabulary_words_list)
                    y_batch = train_utils.words_mapper_to_vocab_indices(
                        y_batch, utils.vocabulary_words_list)
                    # The train batch is used as evaluation batch as well:
                    # it is compared with the predictions.
                    train_step(x_batch=x_batch, y_batch=y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
                    if current_step % FLAGS.evaluate_every == 0:
                        print("\nEvaluation: computing perplexities for the evaluation set...")
                        eval_step(current_step)
                        print("")
                    # Compare the Python int step, not the tf.Variable
                    # global_step, which would never equal an int.
                    if current_step == max_global_steps:
                        sys.exit()
    else:
        # The network is doing predictions: restore the model.
        lstm_network = model_lstm.lstm_model(
            vocab_size=FLAGS.vocabulary_size,
            embedding_size=FLAGS.embeddings_size,
            words_in_sentence=test_sentence_len,
            lstm_cell_size=lstm_cell_state,
            lstm_cell_size_down=lstm_cell_state_down,
            down_project=down_project
        )
        out_dir = os.path.abspath(os.path.join(os.path.curdir, runs_dir))
        all_runs = [os.path.join(out_dir, o) for o in os.listdir(out_dir)
                    if os.path.isdir(os.path.join(out_dir, o))]
        latest_run = max(all_runs, key=os.path.getmtime)  # get the latest run
        checkpoint_dir = os.path.abspath(os.path.join(latest_run, "checkpoints"))
        checkpoint_prefix = tf.train.latest_checkpoint(checkpoint_dir)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver(max_to_keep=5)
            saver.restore(sess, checkpoint_prefix)
            input_x = tf.get_default_graph().get_tensor_by_name("input_x:0")
            init_state_current = tf.get_default_graph().get_tensor_by_name("init_state_current:0")
            init_state_hidden = tf.get_default_graph().get_tensor_by_name("init_state_hidden:0")
            vocab_indices_predictions = tf.get_default_graph().get_tensor_by_name(
                "vocab_indices_predictions:0")

            # Load test data.
            utils = data_utilities.data_utils(model_to_load, embeddings_size,
                                              max_predicted_words, vocabulary_size,
                                              bos, eos, pad, unk)
            dataset, _ = utils.load_test_data(path_to_file=cont_set,
                                              vocabulary_file_path=vocabulary_pkl)
            # dataset = dataset[0:50]  # uncomment for quick tests
            dataset_size = len(dataset)
            print("Total sentences in the test set: ", dataset_size)

            complete_sentences = []
            sentence_nb = 0
            # A zero state is fed initially for each sentence.
            for sentence in dataset:
                index = sentence.index(eos)
                sentence = sentence[1:index]  # strip <bos> and everything from <eos> on
                nb_initial_words = len(sentence)
                print("Sentence to continue: ", sentence)
                print("Initial sentence length: ", nb_initial_words)
                initial_lstm_state = (np.zeros((1, lstm_cell_state)),) * 2
                lstm_state = initial_lstm_state
                full_sentence = []
                # Feed the given prefix word by word to build up the LSTM state.
                for word in sentence:
                    if word == eos:
                        break
                    word = np.array(utils.vocabulary_words_list.index(word)).reshape(1, 1)
                    word_predicted, lstm_state = predicting_step(word, lstm_state)
                    mapped_word = utils.vocabulary_words_list[word_predicted[0][0]]
                    full_sentence.append(utils.vocabulary_words_list[word[0][0]])
                    print(mapped_word)
                    if mapped_word == eos:
                        break
                print("Sentence before continuation: ", full_sentence)

                # Further predictions are made from the last predicted word
                # and the current LSTM state.
                words_remaining = max_predicted_words - nb_initial_words
                if full_sentence[-1] != eos:
                    print("Words remaining: ", words_remaining)
                    for i in range(words_remaining):
                        last_word_predicted = full_sentence[-1]
                        last_word_predicted = np.array(
                            utils.vocabulary_words_list.index(last_word_predicted)).reshape(1, 1)
                        word_predicted, lstm_state = predicting_step(last_word_predicted,
                                                                     lstm_state)
                        mapped_word = utils.vocabulary_words_list[word_predicted[0][0]]
                        full_sentence.append(mapped_word)
                        if mapped_word == eos:
                            break

                print("Completed sentence length: ", len(full_sentence))
                print("Full prediction: ", full_sentence)
                sentence_nb += 1
                complete_sentences.append(full_sentence)
                print("Completed sentence number ", sentence_nb)

            # Write predictions to the submission file.
            testing_utils.write_submission_predictions(complete_sentences, bos, eos, n_group)
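# A minimal sketch, under assumed conventions, of the per-sentence perplexity
# used by eval_step above; `eval.perplexity` itself is not shown in this
# section, and the argument layout and pad symbol here are assumptions. The
# standard definition over the predicted next-word probabilities is
# perp = 2 ** (-(1/n) * sum_t log2 p(w_t | w_1..w_{t-1})), skipping padding.
import numpy as np


def perplexity(sentence, estimates, vocabulary_words_list, pad="<pad>"):
    """sentence: list of vocabulary indices; estimates: [sentence_len - 1,
    vocab_size] row-stochastic matrix of next-word probabilities."""
    log_probs = []
    for t, word_idx in enumerate(sentence[1:]):  # predictions start at the 2nd word
        if vocabulary_words_list[word_idx] == pad:
            break  # do not score padding
        log_probs.append(np.log2(estimates[t][word_idx]))
    if not log_probs:
        return float("inf")
    return 2.0 ** (-np.mean(log_probs))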