def main(_):
    """Train the word-level model, validating and checkpointing each epoch.

    All settings are read from ``FLAGS``; the positional argument is ignored.
    After every epoch the model is evaluated on the validation reader, a
    checkpoint and a summary event are written to ``FLAGS.train_dir``, and the
    learning rate is decayed when validation perplexity fails to improve by
    at least ``FLAGS.decay_when``.
    """
    train_dir = FLAGS.train_dir
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
        print('Created training directory', train_dir)

    word_vocab, word_tensors, max_doc_length, label_tensors = load_data(
        FLAGS.data_dir, FLAGS.max_doc_length, FLAGS.max_sen_length)

    # One reader per split, built in train/valid/test order.
    readers = {
        split: DataReader(word_tensors[split], label_tensors[split],
                          FLAGS.batch_size)
        for split in ('train', 'valid', 'test')
    }
    train_reader = readers['train']
    valid_reader = readers['valid']
    test_reader = readers['test']

    print('initialized all dataset readers')

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # --- training graph ---
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = build_model(word_vocab, max_doc_length, train=True)

        # create saver before creating more graph nodes, so that we do not
        # save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        # --- validation graph (shares parameters with the training graph) ---
        with tf.variable_scope("Model", reuse=True):
            valid_model = build_model(word_vocab, max_doc_length, train=False)

        if not FLAGS.load_model:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_word_embedding_padding)
            print('Created and initialized fresh model. Size:',
                  model.model_size())
        else:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model,
                  'saved at global step', train_model.global_step.eval())

        summary_writer = tf.summary.FileWriter(train_dir,
                                               graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate))

        # Everything fetched on each training step.
        train_fetches = [
            train_model.loss,
            train_model.train_op,
            train_model.global_norm,
            train_model.global_step,
            train_model.clear_word_embedding_padding,
        ]

        # --- training starts here ---
        best_valid_loss = None
        for epoch in range(FLAGS.max_epochs):

            epoch_began = time.time()
            avg_train_loss = 0.0
            for batch_no, (inputs, targets) in enumerate(
                    train_reader.iter(), 1):
                batch_began = time.time()

                batch_loss, _, grad_norm, step, _ = session.run(
                    train_fetches,
                    {
                        train_model.input: inputs,
                        train_model.targets: targets,
                    })

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (batch_loss - avg_train_loss)

                batch_seconds = time.time() - batch_began

                if batch_no % FLAGS.print_every == 0:
                    print(
                        '%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f'
                        % (step, epoch, batch_no, train_reader.length,
                           batch_loss, np.exp(batch_loss), batch_seconds,
                           grad_norm))

            print('Epoch training time:', time.time() - epoch_began)

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            for batch_no, (inputs, targets) in enumerate(
                    valid_reader.iter(), 1):
                batch_loss = session.run(
                    valid_model.loss,
                    {
                        valid_model.input: inputs,
                        valid_model.targets: targets,
                    })

                if batch_no % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (batch_loss, np.exp(batch_loss)))
                avg_valid_loss += batch_loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f" %
                  (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" %
                  (avg_valid_loss, np.exp(avg_valid_loss)))

            save_as = '%s/epoch%03d_%.4f.model' % (train_dir, epoch,
                                                   avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

            # write out summary events
            epoch_summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss",
                                 simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss",
                                 simple_value=avg_valid_loss),
            ])
            summary_writer.add_summary(epoch_summary, step)

            # decide if need to decay learning rate
            needs_decay = (
                best_valid_loss is not None and
                np.exp(avg_valid_loss) >
                np.exp(best_valid_loss) - FLAGS.decay_when)
            if not needs_decay:
                best_valid_loss = avg_valid_loss
                continue

            print(
                'validation perplexity did not improve enough, decay learning rate'
            )
            current_learning_rate = session.run(train_model.learning_rate)
            print('learning rate was:', current_learning_rate)
            current_learning_rate *= FLAGS.learning_rate_decay
            if current_learning_rate < 1.e-5:
                print('learning rate too small - stopping now')
                break

            session.run(
                train_model.learning_rate.assign(current_learning_rate))
            print('new learning rate is:', current_learning_rate)
def main(_):
    """Train the character-aware model and log progress to ``train_log.txt``.

    All settings are read from ``FLAGS``; the positional argument is ignored.
    Progress is printed and mirrored into ``train_log.txt`` (opened in the
    current working directory, truncated on start). A checkpoint is written
    whenever validation perplexity reaches a new minimum, and otherwise on
    every fourth epoch; one more checkpoint is written once the epoch loop
    ends. The learning rate is decayed when validation perplexity fails to
    improve by at least ``FLAGS.decay_when``.
    """
    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory', FLAGS.train_dir)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized all dataset readers')

    # Best validation perplexity seen so far and the epoch it occurred in.
    minimum_valid_ppl = 1000000
    minimum_vl_epoch = 0

    # The log file is managed by the `with` statement so it is flushed and
    # closed on every exit path (normal end, early stop, or exception).
    with open("train_log.txt", "w") as text_file, \
            tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # NOTE(review): eval() executes whatever the flag strings contain;
        # only safe while these flags come from a trusted command line.
        kernels = eval(FLAGS.kernels)
        kernel_features = eval(FLAGS.kernel_features)

        # --- build training graph ---
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=kernels,
                kernel_features=kernel_features,
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=FLAGS.dropout)
            train_model.update(
                model.loss_graph(train_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

            # scaling loss by FLAGS.num_unroll_steps effectively scales
            # gradients by the same factor. we need it to reproduce how the
            # original Torch code optimizes. Without this, our gradients will
            # be much smaller (i.e. 35 times smaller) and to get system to
            # learn we'd have to scale learning rate and max_grad_norm
            # appropriately. Thus, scaling gradients so that this trainer is
            # exactly compatible with the original
            train_model.update(
                model.training_graph(
                    train_model.loss * FLAGS.num_unroll_steps,
                    FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not
        # save any vars defined below
        saver = tf.train.Saver(max_to_keep=10)

        # --- build graph for validation and testing (shares parameters
        # with the training graph!) ---
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=kernels,
                kernel_features=kernel_features,
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=0.0)
            valid_model.update(
                model.loss_graph(valid_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model,
                  'saved at global step', train_model.global_step.eval())
        else:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_char_embedding_padding)
            print('Created and initialized fresh model. Size:',
                  model.model_size())

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate))

        # --- training starts here ---
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    [
                        train_model.loss,
                        train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm,
                        train_model.global_step,
                        train_model.clear_char_embedding_padding,
                    ],
                    {
                        train_model.input: x,
                        train_model.targets: y,
                        train_model.initial_rnn_state: rnn_state,
                    })

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    # Format once; the same text goes to stdout and the log.
                    progress = (
                        '%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f'
                        % (step, epoch, count, train_reader.length, loss,
                           np.exp(loss), time_elapsed, gradient_norm))
                    print(progress)
                    text_file.write(progress + ' \n')

            print('Epoch training time:', time.time() - epoch_start_time)

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state],
                    {
                        valid_model.input: x,
                        valid_model.targets: y,
                        valid_model.initial_rnn_state: rnn_state,
                    })

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            train_line = ("train loss = %6.8f, perplexity = %6.8f" %
                          (avg_train_loss, np.exp(avg_train_loss)))
            valid_line = ("validation loss = %6.8f, perplexity = %6.8f" %
                          (avg_valid_loss, np.exp(avg_valid_loss)))

            print("at the end of epoch:", epoch)
            print(train_line)
            print(valid_line)
            text_file.write("at the end of epoch:" + str(epoch) + '\n')
            text_file.write(train_line + " \n")
            text_file.write(valid_line + " \n")

            # Checkpoint on a new best validation perplexity, otherwise on
            # every fourth epoch.
            is_new_minimum = np.exp(avg_valid_loss) < minimum_valid_ppl
            if is_new_minimum:
                minimum_valid_ppl = np.exp(avg_valid_loss)
                minimum_vl_epoch = epoch
            if is_new_minimum or epoch % 4 == 0:
                save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                                       avg_valid_loss)
                saver.save(session, save_as)
                print('Saved model', save_as)

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss",
                                 simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss",
                                 simple_value=avg_valid_loss),
            ])
            summary_writer.add_summary(summary, step)

            # decide if need to decay learning rate
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(
                    best_valid_loss) - FLAGS.decay_when:
                print(
                    'validation perplexity did not improve enough, decay learning rate'
                )
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss

        # Final checkpoint for the last epoch that ran.
        # NOTE(review): placed after the epoch loop — confirm this matches the
        # intended position in the original layout.
        save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                               avg_valid_loss)
        saver.save(session, save_as)
        print('Saved model', save_as)
        print("----------------------------------------------")
        print(
            "Minimum Valid PPL is attained in epoch:%d and Validation PPL is %6.8f"
            % (minimum_vl_epoch, minimum_valid_ppl))
def main(file,
         batch_size=20,
         num_unroll_steps=35,
         char_embed_size=15,
         rnn_size=650,
         kernels="[1,2,3,4,5,6,7]",
         kernel_features="[50,100,150,200,200,200,200]",
         max_grad_norm=5.0,
         learning_rate=1.0,
         learning_rate_decay=0.5,
         decay_when=1.0,
         seed=3435,
         param_init=0.05,
         max_epochs=25,
         print_every=5):
    """Train the character-aware model from scratch, checkpointing each epoch.

    Data comes from ``load_dataset()``; checkpoints and summaries go to
    ``TRAINING_DIR``. The model is always freshly initialized (restoring from
    a checkpoint is not supported by this entry point).

    Args:
        file: Not used by the function body; kept for caller compatibility.
        batch_size: Batch size passed to the readers and graph builders.
        num_unroll_steps: Unroll length passed to the readers and graph
            builders; the training loss is multiplied by it.
        char_embed_size: Passed to ``model.inference_graph``.
        rnn_size: Passed to ``model.inference_graph``.
        kernels: String that is ``eval``-ed and passed as ``kernels``.
        kernel_features: String that is ``eval``-ed and passed as
            ``kernel_features``.
        max_grad_norm: Passed to ``model.training_graph``.
        learning_rate: Initial learning rate.
        learning_rate_decay: Factor the learning rate is multiplied by when
            validation perplexity does not improve enough.
        decay_when: Minimum perplexity improvement that avoids a decay.
        seed: Seed for TensorFlow and NumPy random generators.
        param_init: Parameters are initialized uniformly in
            ``[-param_init, param_init]``.
        max_epochs: Maximum number of training epochs.
        print_every: Print progress every this many batches.
    """
    if not os.path.exists(TRAINING_DIR):
        os.mkdir(TRAINING_DIR)
        print('Created training directory', TRAINING_DIR)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_dataset()

    print('initialized all dataset readers')

    with tf.Graph().as_default(), tf.Session() as session:
        train_reader = DataReader(word_tensors['train'],
                                  char_tensors['train'], batch_size,
                                  num_unroll_steps, char_vocab)
        valid_reader = DataReader(word_tensors['valid'],
                                  char_tensors['valid'], batch_size,
                                  num_unroll_steps, char_vocab)
        test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                                 batch_size, num_unroll_steps, char_vocab)

        # tensorflow seed must be inside graph
        tf.set_random_seed(seed)
        np.random.seed(seed=seed)

        # NOTE(review): eval() executes whatever the strings contain; only
        # safe while callers pass trusted values.
        kernel_widths = eval(kernels)
        kernel_feature_sizes = eval(kernel_features)

        # --- build training graph ---
        # The range must be symmetric around zero: with minval == maxval every
        # weight would start at the same constant value.
        initializer = tf.random_uniform_initializer(-param_init, param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(
                char_vocab_size=char_vocab.size(),
                word_vocab_size=word_vocab.size(),
                char_embed_size=char_embed_size,
                batch_size=batch_size,
                rnn_size=rnn_size,
                max_word_length=max_word_length,
                kernels=kernel_widths,
                kernel_features=kernel_feature_sizes,
                num_unroll_steps=num_unroll_steps)
            train_model.update(
                model.loss_graph(train_model.logits, batch_size,
                                 num_unroll_steps))

            # scaling loss by num_unroll_steps effectively scales gradients
            # by the same factor. we need it to reproduce how the original
            # Torch code optimizes. Without this, our gradients will be much
            # smaller (i.e. 35 times smaller) and to get system to learn we'd
            # have to scale learning rate and max_grad_norm appropriately.
            # Thus, scaling gradients so that this trainer is exactly
            # compatible with the original
            train_model.update(
                model.training_graph(train_model.loss * num_unroll_steps,
                                     learning_rate, max_grad_norm))

        # create saver before creating more graph nodes, so that we do not
        # save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        # --- build graph for validation and testing (shares parameters
        # with the training graph!) ---
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(
                char_vocab_size=char_vocab.size(),
                word_vocab_size=word_vocab.size(),
                char_embed_size=char_embed_size,
                batch_size=batch_size,
                rnn_size=rnn_size,
                max_word_length=max_word_length,
                kernels=kernel_widths,
                kernel_features=kernel_feature_sizes,
                num_unroll_steps=num_unroll_steps)
            valid_model.update(
                model.loss_graph(valid_model.logits, batch_size,
                                 num_unroll_steps))

        tf.global_variables_initializer().run()
        session.run(train_model.clear_char_embedding_padding)
        print('Created and initialized fresh model. Size:',
              model.model_size())

        summary_writer = tf.summary.FileWriter(TRAINING_DIR,
                                               graph=session.graph)

        # take learning rate from the argument, not from the graph default
        session.run(tf.assign(train_model.learning_rate, learning_rate))

        # --- training starts here ---
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(max_epochs):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    [
                        train_model.loss,
                        train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm,
                        train_model.global_step,
                        train_model.clear_char_embedding_padding,
                    ],
                    {
                        train_model.input: x,
                        train_model.targets: y,
                        train_model.initial_rnn_state: rnn_state,
                    })

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % print_every == 0:
                    print(
                        '%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f'
                        % (step, epoch, count, train_reader.length, loss,
                           np.exp(loss), time_elapsed, gradient_norm))

            print('Epoch training time:', time.time() - epoch_start_time)

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state],
                    {
                        valid_model.input: x,
                        valid_model.targets: y,
                        valid_model.initial_rnn_state: rnn_state,
                    })

                if count % print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f" %
                  (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" %
                  (avg_valid_loss, np.exp(avg_valid_loss)))

            save_as = '%s/epoch%03d_%.4f.model' % (TRAINING_DIR, epoch,
                                                   avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss",
                                 simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss",
                                 simple_value=avg_valid_loss),
            ])
            summary_writer.add_summary(summary, step)

            # decide if need to decay learning rate
            if best_valid_loss is not None and np.exp(
                    avg_valid_loss) > np.exp(best_valid_loss) - decay_when:
                print(
                    'validation perplexity did not improve enough, decay learning rate'
                )
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss
def fitness(self, word_vocab, char_vocab, word_tensors, char_tensors,
            max_word_length, evo_epoch):
    """Train this individual for one pass and return its validation loss.

    Rebuilds the graphs through ``self.update_graph``, initializes all
    variables, runs one pass over the training reader followed by one pass
    over the validation reader, and stores the average validation loss in
    ``self._fitness``.

    Args:
        word_vocab: Forwarded to ``self.update_graph``.
        char_vocab: Forwarded to ``self.update_graph``.
        word_tensors: Mapping with ``'train'`` and ``'valid'`` entries used to
            build the data readers.
        char_tensors: Mapping with ``'train'`` and ``'valid'`` entries used to
            build the data readers.
        max_word_length: Forwarded to ``self.update_graph`` and stored in
            ``self._max_word_length``.
        evo_epoch: Evolution epoch number; forwarded to ``self.update_graph``
            and printed in the progress line.

    Returns:
        The average validation loss (also stored in ``self._fitness``).
    """
    self._model, self._valid_model, self._saver = self.update_graph(
        word_vocab, char_vocab, max_word_length, evo_epoch)
    self._max_word_length = max_word_length

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)

    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=FLAGS.num_cpus)
    with tf.Session(graph=self._graph, config=session_config) as session:
        # initialize model
        tf.global_variables_initializer().run()
        session.run(self._model.clear_char_embedding_padding)
        print('[EVOLUTION] Epoch_%d Individual_%d. Size: %d' %
              (evo_epoch, self._id_number, model.model_size()))

        session.run(
            tf.assign(self._model.learning_rate, FLAGS.learning_rate))

        # --- training pass ---
        rnn_state = session.run(self._model.initial_rnn_state)
        train_start_time = time.time()
        avg_train_loss = 0.0
        for x, y in train_reader.iter():
            # Only the loss and the carried-over RNN state are needed here;
            # the remaining fetches are run for their effect on the graph.
            loss, _, rnn_state, _, _, _ = session.run(
                [
                    self._model.loss,
                    self._model.train_op,
                    self._model.final_rnn_state,
                    self._model.global_norm,
                    self._model.global_step,
                    self._model.clear_char_embedding_padding,
                ],
                {
                    self._model.input: x,
                    self._model.targets: y,
                    self._model.initial_rnn_state: rnn_state,
                })

            # exponential moving average of the batch loss
            avg_train_loss += 0.05 * (loss - avg_train_loss)

        print('training time:', time.time() - train_start_time)

        # --- validation pass ---
        avg_valid_loss = 0.0
        rnn_state = session.run(self._valid_model.initial_rnn_state)
        for x, y in valid_reader.iter():
            loss, rnn_state = session.run(
                [
                    self._valid_model.loss,
                    self._valid_model.final_rnn_state,
                ],
                {
                    self._valid_model.input: x,
                    self._valid_model.targets: y,
                    self._valid_model.initial_rnn_state: rnn_state,
                })
            avg_valid_loss += loss / valid_reader.length

        print("train loss = %6.8f, perplexity = %6.8f" %
              (avg_train_loss, np.exp(avg_train_loss)))
        print("validation loss = %6.8f, perplexity = %6.8f" %
              (avg_valid_loss, np.exp(avg_valid_loss)))

        # There is a single train/validate pass and no earlier validation
        # loss to compare against, so the learning rate is never decayed and
        # the fitness is simply this pass's validation loss.
        self._fitness = avg_valid_loss
        return self._fitness
def main(print):
    """Train the model, optionally with fastText inputs, and record results.

    All settings are read from ``FLAGS``. The flag values are written to
    ``train_parameters.csv`` and the per-epoch results to
    ``train_results.csv``, both inside ``FLAGS.train_dir``. When
    ``'fasttext'`` appears in ``FLAGS.embedding`` a fastText model is loaded
    and its batches are fed to the graph's ``input2`` placeholder alongside
    the regular batches, for training and validation alike.

    Args:
        print: Callable used for all progress output. It is always called
            with a single string argument. (The name shadows the builtin on
            purpose; it is part of this function's interface.)
    """
    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory' + FLAGS.train_dir)

    # CSV initialize
    pd.DataFrame(FLAGS.flag_values_dict(),
                 index=range(1)).to_csv(FLAGS.train_dir +
                                        '/train_parameters.csv')
    epochs_results = initialize_epoch_data_dict()

    fasttext_model_path = None
    if FLAGS.fasttext_model_path:
        fasttext_model_path = FLAGS.fasttext_model_path

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length, words_list = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    fasttext_model = None
    train_ft_reader = None
    valid_ft_reader = None
    if 'fasttext' in FLAGS.embedding:
        fasttext_model = FasttextModel(
            fasttext_path=fasttext_model_path).get_fasttext_model()

        train_ft_reader = DataReaderFastText(
            words_list=words_list,
            batch_size=FLAGS.batch_size,
            num_unroll_steps=FLAGS.num_unroll_steps,
            model=fasttext_model,
            data='train')

        valid_ft_reader = DataReaderFastText(
            words_list=words_list,
            batch_size=FLAGS.batch_size,
            num_unroll_steps=FLAGS.num_unroll_steps,
            model=fasttext_model,
            data='valid')

    # fastText batches are only used when a model was actually loaded.
    if not fasttext_model:
        train_ft_reader = None
        valid_ft_reader = None

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)

    print('initialized all dataset readers')

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # NOTE(review): eval() executes whatever the flag strings contain;
        # only safe while these flags come from a trusted command line.
        kernels = eval(FLAGS.kernels)
        kernel_features = eval(FLAGS.kernel_features)

        # --- build training graph ---
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=kernels,
                kernel_features=kernel_features,
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=FLAGS.dropout,
                embedding=FLAGS.embedding,
                fasttext_word_dim=300,
                acoustic_features_dim=4)
            train_model.update(
                model.loss_graph(train_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))
            train_model.update(
                model.training_graph(
                    train_model.loss * FLAGS.num_unroll_steps,
                    FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not
        # save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        # --- build graph for validation and testing (shares parameters
        # with the training graph!) ---
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(
                char_vocab_size=char_vocab.size,
                word_vocab_size=word_vocab.size,
                char_embed_size=FLAGS.char_embed_size,
                batch_size=FLAGS.batch_size,
                num_highway_layers=FLAGS.highway_layers,
                num_rnn_layers=FLAGS.rnn_layers,
                rnn_size=FLAGS.rnn_size,
                max_word_length=max_word_length,
                kernels=kernels,
                kernel_features=kernel_features,
                num_unroll_steps=FLAGS.num_unroll_steps,
                dropout=0.0,
                embedding=FLAGS.embedding,
                fasttext_word_dim=300,
                acoustic_features_dim=4)
            valid_model.update(
                model.loss_graph(valid_model.logits, FLAGS.batch_size,
                                 FLAGS.num_unroll_steps))

        if FLAGS.load_model_for_training:
            saver.restore(session, FLAGS.load_model_for_training)
            print('Loaded model from' + str(FLAGS.load_model_for_training) +
                  'saved at global step' +
                  str(train_model.global_step.eval()))
        else:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_char_embedding_padding)
            print('Created and initialized fresh model. Size:' +
                  str(model.model_size()))

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate))

        # Six fetches per training step; they are unpacked into exactly six
        # names below.
        train_fetches = [
            train_model.loss,
            train_model.train_op,
            train_model.final_rnn_state,
            train_model.global_norm,
            train_model.global_step,
            train_model.clear_char_embedding_padding,
        ]

        # --- training starts here ---
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for x, y, batch_ft in _paired_batches(train_reader,
                                                  train_ft_reader):
                count += 1
                start_time = time.time()

                feed = {
                    train_model.input: x,
                    train_model.targets: y,
                    train_model.initial_rnn_state: rnn_state,
                }
                if train_ft_reader is not None:
                    feed[train_model.input2] = batch_ft

                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    train_fetches, feed)

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    print(
                        '%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f'
                        % (step, epoch, count, train_reader.length, loss,
                           np.exp(loss), time_elapsed, gradient_norm))

            print('Epoch training time:' +
                  str(time.time() - epoch_start_time))
            epochs_results['epoch_training_time'].append(
                str(time.time() - epoch_start_time))

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y, batch_ft in _paired_batches(valid_reader,
                                                  valid_ft_reader):
                count += 1

                feed = {
                    valid_model.input: x,
                    valid_model.targets: y,
                    valid_model.initial_rnn_state: rnn_state,
                }
                # Only feed fastText vectors when they are in use, mirroring
                # the training loop.
                if valid_ft_reader is not None:
                    feed[valid_model.input2] = batch_ft

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state], feed)

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:" + str(epoch))
            epochs_results['epoch_number'].append(str(epoch))
            print("train loss = %6.8f, perplexity = %6.8f" %
                  (avg_train_loss, np.exp(avg_train_loss)))
            epochs_results['train_loss'].append(avg_train_loss)
            epochs_results['train_perplexity'].append(np.exp(avg_train_loss))
            print("validation loss = %6.8f, perplexity = %6.8f" %
                  (avg_valid_loss, np.exp(avg_valid_loss)))
            epochs_results['validation_loss'].append(avg_valid_loss)
            epochs_results['valid_perplexity'].append(np.exp(avg_valid_loss))

            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                                   avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model' + str(save_as))
            epochs_results['model_name'].append(str(save_as))
            epochs_results['learning_rate'].append(
                str(session.run(train_model.learning_rate)))

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss",
                                 simple_value=avg_train_loss),
                tf.Summary.Value(tag="train_perplexity",
                                 simple_value=np.exp(avg_train_loss)),
                tf.Summary.Value(tag="valid_loss",
                                 simple_value=avg_valid_loss),
                tf.Summary.Value(tag="valid_perplexity",
                                 simple_value=np.exp(avg_valid_loss)),
            ])
            summary_writer.add_summary(summary, step)

            # decide if need to decay learning rate
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(
                    best_valid_loss) - FLAGS.decay_when:
                print(
                    'validation perplexity did not improve enough, decay learning rate'
                )
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:' + str(current_learning_rate))
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-3:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:' + str(current_learning_rate))
            else:
                best_valid_loss = avg_valid_loss

        # Save model performance data
        # NOTE(review): written once after the epoch loop — confirm this
        # matches the intended position in the original layout.
        pd.DataFrame(epochs_results).to_csv(FLAGS.train_dir +
                                            '/train_results.csv')


def _paired_batches(reader, ft_reader):
    """Yield ``(x, y, batch_ft)``; ``batch_ft`` is None when ``ft_reader`` is None."""
    if ft_reader is None:
        for x, y in reader.iter():
            yield x, y, None
    else:
        for (x, y), batch_ft in zip(reader.iter(), ft_reader.iter()):
            yield x, y, batch_ft
def main(_):
    """Train the character-aware language model configured by ``FLAGS``.

    Loads the data set, builds a training graph plus validation and
    single-step test graphs that share its parameters, then for every epoch:
    trains, validates, saves a checkpoint, writes summary events and decays
    the learning rate when validation perplexity stops improving.
    ``run_test2`` is invoked once before training and twice afterwards.

    Args:
        _: Unused.
    """
    print("we in main")
    print(FLAGS)

    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory', FLAGS.train_dir)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    # NOTE(review): test_reader is built but never used below; run_test2 is
    # given train_reader instead - confirm that this is intended.
    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)
    print('initialized all dataset readers')

    # NOTE(review): eval() executes the flag text as Python code. The values
    # come from the command line, so only run this with trusted arguments
    # (ast.literal_eval would be a safer parser for list literals).
    kernels = eval(FLAGS.kernels)
    kernel_features = eval(FLAGS.kernel_features)

    def build_graph(batch_size, num_unroll_steps, dropout):
        """Build an inference graph and attach its loss sub-graph."""
        graph = model.inference_graph(
            char_vocab_size=char_vocab.size,
            word_vocab_size=word_vocab.size,
            char_embed_size=FLAGS.char_embed_size,
            batch_size=batch_size,
            num_highway_layers=FLAGS.highway_layers,
            num_rnn_layers=FLAGS.rnn_layers,
            rnn_size=FLAGS.rnn_size,
            max_word_length=max_word_length,
            kernels=kernels,
            kernel_features=kernel_features,
            num_unroll_steps=num_unroll_steps,
            dropout=dropout)
        graph.update(model.loss_graph(graph.logits, batch_size, num_unroll_steps))
        return graph

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # build training graph
        initializer = tf.random_uniform_initializer(-FLAGS.param_init, FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = build_graph(FLAGS.batch_size, FLAGS.num_unroll_steps,
                                      FLAGS.dropout)

            # scaling loss by FLAGS.num_unroll_steps effectively scales gradients by the same factor.
            # we need it to reproduce how the original Torch code optimizes. Without this, our gradients will be
            # much smaller (i.e. 35 times smaller) and to get system to learn we'd have to scale learning rate
            # and max_grad_norm appropriately.
            # Thus, scaling gradients so that this trainer is exactly compatible with the original
            train_model.update(model.training_graph(
                train_model.loss * FLAGS.num_unroll_steps,
                FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        # build graphs for validation and testing (share parameters with the training graph!)
        with tf.variable_scope("Model", reuse=True):
            valid_model = build_graph(FLAGS.batch_size, FLAGS.num_unroll_steps, 0.0)

        with tf.variable_scope("Model", reuse=True):
            test_model = build_graph(1, 1, 0.0)

        # Build the "write back the embedding" op exactly once. Calling
        # tf.assign inside the training loop would add a new node to the
        # graph on every step.
        char_embedding_value = tf.placeholder(
            train_model.char_embedding.dtype.base_dtype,
            shape=train_model.char_embedding.get_shape())
        assign_char_embedding = tf.assign(train_model.char_embedding,
                                          char_embedding_value)

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model,
                  'saved at global step', train_model.global_step.eval())
        else:
            tf.global_variables_initializer().run()
            print('Created and initialized fresh model. Size:', model.model_size())

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(
            tf.assign(train_model.learning_rate, FLAGS.learning_rate),
        )

        def clear_char_embedding_padding():
            """Zero row 0 of the character embedding matrix."""
            char_embedding = session.run(train_model.char_embedding)
            char_embedding[0, :] = 0.0
            session.run(assign_char_embedding,
                        {char_embedding_value: char_embedding})

        clear_char_embedding_padding()

        run_test2(session, test_model, train_reader)

        # training starts here
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                loss, _, rnn_state, gradient_norm, step = session.run([
                    train_model.loss,
                    train_model.train_op,
                    train_model.final_rnn_state,
                    train_model.global_norm,
                    train_model.global_step,
                ], {
                    train_model.input: x,
                    train_model.targets: y,
                    train_model.initial_rnn_state: rnn_state
                })

                clear_char_embedding_padding()

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    print('%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f' % (
                        step, epoch, count, train_reader.length,
                        loss, np.exp(loss), time_elapsed, gradient_norm))

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1

                loss, rnn_state = session.run([
                    valid_model.loss,
                    valid_model.final_rnn_state
                ], {
                    valid_model.input: x,
                    valid_model.targets: y,
                    valid_model.initial_rnn_state: rnn_state,
                })

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" % (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f" % (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" % (avg_valid_loss, np.exp(avg_valid_loss)))

            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch, avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss", simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss", simple_value=avg_valid_loss)
            ])
            summary_writer.add_summary(summary, step)

            # decide if need to decay learning rate
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(best_valid_loss) - FLAGS.decay_when:
                print('** validation perplexity did not improve enough, decay learning rate')
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss

        run_test2(session, test_model, train_reader)
        print("AGAIN")
        run_test2(session, test_model, train_reader)
def main(_):
    """Train the model into a time-stamped run directory.

    The run directory is ``FLAGS.train_dir`` formatted with the current local
    time. Character and word vocabularies are written there as one-token-per-
    line ``.tsv`` metadata files, then the training and validation models are
    built with shared parameters and the epoch loop runs: train, validate,
    checkpoint, write summaries and decay the learning rate when validation
    perplexity stops improving.

    Args:
        _: Unused.
    """
    date = time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())
    directory = FLAGS.train_dir.format(date)

    if not os.path.exists(directory):
        os.makedirs(directory)
        print('Created training directory', directory)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    def write_vocab_metadata(file_name, vocab):
        """Write 'padding' followed by tokens 1..size-1, one per line; return the path."""
        # Join with a separator so the file always lands inside the run
        # directory, whether or not `directory` ends with a slash.
        path = os.path.join(directory, file_name)
        with open(path, "w", encoding="utf-8") as metadata_file:
            metadata_file.write('padding\n')
            metadata_file.writelines(
                '%s\n' % (vocab.token(i)) for i in range(1, vocab.size))
        return path

    char_embedding_metadata = write_vocab_metadata("characters_embeddings.tsv", char_vocab)
    write_vocab_metadata("words_embeddings.tsv", word_vocab)

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps, char_vocab)
    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps, char_vocab)
    # NOTE(review): test_reader is constructed but not used in this function.
    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                             FLAGS.batch_size, FLAGS.num_unroll_steps, char_vocab)
    print('initialized all dataset readers')

    config = tf.ConfigProto()
    config.gpu_options.allocator_type = 'BFC'

    with tf.Graph().as_default(), tf.Session(config=config) as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        initializer = tf.random_uniform_initializer(-FLAGS.param_init, FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = Model(flags=FLAGS,
                                char_vocab=char_vocab,
                                word_vocab=word_vocab,
                                max_word_length=max_word_length,
                                metadata=char_embedding_metadata)

        # validation model reuses the variables created for the training model
        with tf.variable_scope("Model", reuse=True):
            valid_model = Model(FLAGS, char_vocab, word_vocab, max_word_length,
                                ModelUsage.VALIDATE, char_embedding_metadata)

        # create saver before creating more graph nodes, so that we do not save any vars defined below
        saver = tf.train.Saver(max_to_keep=50)

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model,
                  'saved at global step', train_model.global_step.eval())
        else:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_char_embedding_padding)
            print('Created and initialized fresh model. Size:', model.model_size())

        summary_writer = tf.summary.FileWriter(directory, graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate), )

        # training starts here
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    [
                        train_model.loss,
                        train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm,
                        train_model.global_step,
                        train_model.clear_char_embedding_padding
                    ], {
                        train_model.input: x,
                        train_model.targets: y,
                        train_model.initial_rnn_state: rnn_state
                    })

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    print(
                        '%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f'
                        % (step, epoch, count, train_reader.length, loss,
                           np.exp(loss), time_elapsed, gradient_norm))

            print('Epoch training time:', time.time() - epoch_start_time)

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state], {
                        valid_model.input: x,
                        valid_model.targets: y,
                        valid_model.initial_rnn_state: rnn_state,
                    })

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f" %
                  (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" %
                  (avg_valid_loss, np.exp(avg_valid_loss)))

            save_as = '%s/epoch%03d_%.4f.model' % (directory, epoch, avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss", simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss", simple_value=avg_valid_loss),
                tf.Summary.Value(tag="train_perplexity",
                                 simple_value=np.exp(avg_train_loss)),
                tf.Summary.Value(tag="valid_perplexity",
                                 simple_value=np.exp(avg_valid_loss))
            ])
            summary_writer.add_summary(summary, step)
            projector.visualize_embeddings(summary_writer,
                                           valid_model.projector_config)

            # decide if need to decay learning rate
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(
                    best_valid_loss) - FLAGS.decay_when:
                print(
                    'validation perplexity did not improve enough, decay learning rate'
                )
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss
def main(print):
    """Train the rescoring model and record per-epoch results as CSV.

    Writes the flag values to ``train_parameters.csv``, loads the training
    and validation data, builds training and validation graphs with shared
    parameters (optionally restoring all but two variables from an earlier
    checkpoint), then for every epoch: trains, computes the validation loss
    with ``get_valid_rescore_loss``, saves a checkpoint, decays the learning
    rate when the validation loss stops improving and writes summary events.
    The collected per-epoch results are written to ``train_results.csv``.

    Args:
        print: Callable used for every log message in this function. It is
            always invoked with a single, already formatted string. The name
            shadows the builtin and is kept for caller compatibility.
    """
    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory' + FLAGS.train_dir)

    # Record the flag values of this run next to the checkpoints.
    df_train_params = pd.DataFrame(FLAGS.flag_values_dict(), index=range(1))
    df_train_params['comment'] = ''
    df_train_params.to_csv(FLAGS.train_dir + '/train_parameters.csv')
    epochs_results = initialize_epoch_data_dict()

    fasttext_model_path = FLAGS.fasttext_model_path or None

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length, words_list, wers, acoustics = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length,
                  num_unroll_steps=FLAGS.num_unroll_steps,
                  eos=FLAGS.EOS,
                  batch_size=FLAGS.batch_size)

    word_vocab_valid, char_vocab_valid, word_tensors_valid, char_tensors_valid, max_word_length_valid, \
        words_list_valid, wers_valid, acoustics_valid, files_name_valid, kaldi_sents_index_valid = \
        load_test_data(FLAGS.data_dir, FLAGS.max_word_length,
                       num_unroll_steps=FLAGS.num_unroll_steps,
                       eos=FLAGS.EOS,
                       datas=['valid'])

    fasttext_model = None
    if 'fasttext' in FLAGS.embedding:
        fasttext_model = FasttextModel(
            fasttext_path=fasttext_model_path).get_fasttext_model()

    # NOTE(review): both fastText readers are built from the words/acoustics
    # returned by load_data, while valid_reader uses load_test_data output -
    # confirm the two validation streams stay aligned.
    train_ft_reader = DataReaderFastText(
        words_list=words_list,
        batch_size=FLAGS.batch_size,
        num_unroll_steps=FLAGS.num_unroll_steps,
        model=fasttext_model,
        data='train',
        acoustics=acoustics)

    valid_ft_reader = DataReaderFastText(
        words_list=words_list,
        batch_size=FLAGS.batch_size,
        num_unroll_steps=FLAGS.num_unroll_steps,
        model=fasttext_model,
        data='valid',
        acoustics=acoustics)

    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps,
                              wers['train'])

    valid_reader = TestDataReader(word_tensors_valid['valid'],
                                  char_tensors_valid['valid'],
                                  FLAGS.batch_size, FLAGS.num_unroll_steps,
                                  wers_valid['valid'],
                                  files_name_valid['valid'],
                                  kaldi_sents_index_valid['valid'])

    print('initialized all dataset readers')

    # NOTE(review): eval() executes the flag text as Python code. The values
    # come from the command line, so only run this with trusted arguments
    # (ast.literal_eval would be a safer parser for list literals).
    kernels = eval(FLAGS.kernels)
    kernel_features = eval(FLAGS.kernel_features)

    def build_graph(char_vocab_size, word_vocab_size, dropout):
        """Build an inference graph and attach its loss sub-graph."""
        graph = model.inference_graph(
            char_vocab_size=char_vocab_size,
            word_vocab_size=word_vocab_size,
            char_embed_size=FLAGS.char_embed_size,
            batch_size=FLAGS.batch_size,
            num_highway_layers=FLAGS.highway_layers,
            num_rnn_layers=FLAGS.rnn_layers,
            rnn_size=FLAGS.rnn_size,
            max_word_length=max_word_length,
            kernels=kernels,
            kernel_features=kernel_features,
            num_unroll_steps=FLAGS.num_unroll_steps,
            dropout=dropout,
            embedding=FLAGS.embedding,
            fasttext_word_dim=300,
            acoustic_features_dim=4)
        graph.update(model.loss_graph(graph.logits, FLAGS.batch_size))
        return graph

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # build training graph
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = build_graph(char_vocab.size, word_vocab.size,
                                      FLAGS.dropout)

            # scaling loss by FLAGS.num_unroll_steps effectively scales gradients by the same factor.
            # we need it to reproduce how the original Torch code optimizes. Without this, our gradients will be
            # much smaller (i.e. 35 times smaller) and to get system to learn we'd have to scale learning rate
            # and max_grad_norm appropriately.
            # Thus, scaling gradients so that this trainer is exactly compatible with the original
            train_model.update(
                model.training_graph(train_model.loss * FLAGS.num_unroll_steps,
                                     FLAGS.learning_rate, FLAGS.max_grad_norm))

        # build graph for validation and testing (shares parameters with the training graph!)
        with tf.variable_scope("Model", reuse=True):
            valid_model = build_graph(char_vocab_valid.size,
                                      word_vocab_valid.size, 0.0)

        # create saver before creating more graph nodes, so that we do not save any vars defined below
        if FLAGS.load_model_for_training:
            # Skip the last layers (softmax) - SimpleLinear/Matrix + Bias -
            # when restoring. NOTE(review): positions 29 and 30 in the global
            # variable list are hard-coded; verify them if the model changes.
            variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            fresh_variables = variables[29:31]
            restored_variables = variables[:29] + variables[31:]
            loader = tf.train.Saver(max_to_keep=50,
                                    var_list=restored_variables)

        saver = tf.train.Saver(max_to_keep=50)

        if FLAGS.load_model_for_training:
            loader.restore(session, FLAGS.load_model_for_training)
            string = str('Loaded model from' +
                         str(FLAGS.load_model_for_training) +
                         'saved at global step' +
                         str(train_model.global_step.eval()))
            print(string)
            # the variables left out of the restore still need initial values
            session.run(tf.variables_initializer(var_list=fresh_variables))
            string = str('initialized specific scope for fresh model. Size:' +
                         str(model.model_size()))
            print(string)
        else:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_char_embedding_padding)
            string = str('Created and initialized fresh model. Size:' +
                         str(model.model_size()))
            print(string)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate), )

        # training starts here
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for batch_kim, batch_ft in zip(train_reader.iter(),
                                           train_ft_reader.iter()):
                x, y = batch_kim
                count += 1
                start_time = time.time()

                feed = {
                    train_model.input: x,
                    train_model.targets: y,
                    train_model.initial_rnn_state: rnn_state,
                }
                # the fastText input is fed only when fastText is enabled
                if fasttext_model:
                    feed[train_model.input2] = batch_ft

                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    [
                        train_model.loss,
                        train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm,
                        train_model.global_step,
                        train_model.clear_char_embedding_padding,
                    ], feed)

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    string = str(
                        '%6d: %d [%5d/%5d], train_loss = %6.8f secs/batch = %.4fs'
                        % (step, epoch, count, train_reader.length, loss,
                           time_elapsed))
                    print(string)

            # measure once so the log and the CSV report the same value
            epoch_training_time = time.time() - epoch_start_time
            string = str('Epoch training time:' + str(epoch_training_time))
            print(string)
            epochs_results['epoch_training_time'].append(
                str(epoch_training_time))

            # epoch done: time to evaluate
            labels = []
            predictions = []
            files_name_list = []
            kaldi_sents_index_list = []
            count = 0
            # every validation batch is run from the initial RNN state
            rnn_state = session.run(valid_model.initial_rnn_state)
            for batch_kim, batch_ft in zip(valid_reader.iter(),
                                           valid_ft_reader.iter()):
                x, y, files_name_batch, kaldi_sents_index_batch = batch_kim
                count += 1

                feed = {
                    valid_model.input: x,
                    valid_model.targets: y,
                    valid_model.initial_rnn_state: rnn_state,
                }
                # same rule as in training: feed fastText input only if enabled
                if fasttext_model:
                    feed[valid_model.input2] = batch_ft

                loss, logits = session.run(
                    [valid_model.loss, valid_model.logits], feed)

                labels.append(y)
                predictions.append(logits)
                files_name_list.append(files_name_batch)
                kaldi_sents_index_list.append(kaldi_sents_index_batch)

                if count % FLAGS.print_every == 0:
                    string = str("\t> validation loss = %6.8f" % (loss))
                    print(string)

            avg_valid_loss = get_valid_rescore_loss(labels, predictions,
                                                    files_name_list,
                                                    kaldi_sents_index_list)

            print("at the end of epoch:" + str(epoch))
            epochs_results['epoch_number'].append(str(epoch))
            print("train loss = %6.8f" % (avg_train_loss))
            epochs_results['train_loss'].append(avg_train_loss)
            print("validation loss = %6.8f" % (avg_valid_loss))
            epochs_results['validation_loss'].append(avg_valid_loss)

            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                                   avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model' + str(save_as))
            epochs_results['model_name'].append(str(save_as))
            epochs_results['learning_rate'].append(
                str(session.run(train_model.learning_rate)))

            current_learning_rate = session.run(train_model.learning_rate)

            # decide if need to decay learning rate
            if best_valid_loss is not None and avg_valid_loss > best_valid_loss - FLAGS.decay_when:
                print(
                    'validation perplexity did not improve enough, decay learning rate'
                )
                current_learning_rate = session.run(train_model.learning_rate)
                string = str('learning rate was:' + str(current_learning_rate))
                print(string)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-6:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                string = str('new learning rate is:' +
                             str(current_learning_rate))
                print(string)
            else:
                best_valid_loss = avg_valid_loss

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss", simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss", simple_value=avg_valid_loss),
                tf.Summary.Value(tag="learning_rate",
                                 simple_value=current_learning_rate)
            ])
            summary_writer.add_summary(summary, step)

        # Save model performance data
        pd.DataFrame(epochs_results).to_csv(FLAGS.train_dir +
                                            '/train_results.csv')
def main(_):
    """Train the word-level model, then report best and test-set results.

    Loads the files named in ``FILE_NAME_LIST`` (train, valid, test in that
    order), builds training and validation graphs with shared parameters,
    prints the trainable weights and runs the epoch loop. A checkpoint is
    saved only when the validation loss improves on the best one seen so far.
    After training, the test set is evaluated with the validation graph and a
    final report is printed.

    Args:
        _: Unused.
    """
    # Statistics of the epoch with the lowest validation loss so far:
    # [train_loss, train_ppl, valid_loss, valid_ppl]. Starting from infinity
    # guarantees the first evaluated epoch is recorded.
    best_stats = [float("inf")] * 4
    total_time = 0.

    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory', FLAGS.train_dir)

    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length,
                  flist=FILE_NAME_LIST, eos=FLAGS.EOS)

    train_reader = DataReader(word_tensors[FILE_NAME_LIST[0]],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    valid_reader = DataReader(word_tensors[FILE_NAME_LIST[1]],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    test_reader = DataReader(word_tensors[FILE_NAME_LIST[2]],
                             FLAGS.batch_size, FLAGS.num_unroll_steps)
    print('initialized all dataset readers')

    def build_graph(dropout):
        """Build an inference graph and attach its loss sub-graph."""
        graph = model.inference_graph(
            word_vocab_size=word_vocab.size,
            word_embed_size=FLAGS.word_embed_size,
            batch_size=FLAGS.batch_size,
            num_highway_layers=FLAGS.highway_layers,
            num_rnn_layers=FLAGS.rnn_layers,
            rnn_size=FLAGS.rnn_size,
            num_unroll_steps=FLAGS.num_unroll_steps,
            dropout=dropout)
        graph.update(
            model.loss_graph(graph.logits, FLAGS.batch_size,
                             FLAGS.num_unroll_steps))
        return graph

    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # build training graph
        initializer = tf.random_uniform_initializer(-FLAGS.param_init,
                                                    FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = build_graph(FLAGS.dropout)

            # scaling loss by FLAGS.num_unroll_steps effectively scales gradients by the same factor.
            # we need it to reproduce how the original Torch code optimizes. Without this, our gradients will be
            # much smaller (i.e. 35 times smaller) and to get system to learn we'd have to scale learning rate
            # and max_grad_norm appropriately.
            # Thus, scaling gradients so that this trainer is exactly compatible with the original
            train_model.update(
                model.training_graph(train_model.loss * FLAGS.num_unroll_steps,
                                     FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not save any vars defined below
        saver = tf.train.Saver(max_to_keep=5)

        # build graph for validation and testing (shares parameters with the training graph!)
        with tf.variable_scope("Model", reuse=True):
            valid_model = build_graph(0.0)

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model,
                  'saved at global step', train_model.global_step.eval())
        else:
            tf.global_variables_initializer().run()
            session.run(train_model.clear_char_embedding_padding)
            print('Created and initialized fresh model. Size:',
                  model.model_size())

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir,
                                               graph=session.graph)

        # take learning rate from CLI, not from saved graph
        session.run(tf.assign(train_model.learning_rate, FLAGS.learning_rate))

        # List every trainable weight with its shape and element count.
        print("=" * 89)
        print("=" * 89)
        total_size = 0
        for position, weight in enumerate(tf.trainable_variables(), 1):
            weight_size = int(np.prod(np.array(weight.shape.as_list())))
            # name[:-2] drops the last two characters of the variable name
            print("%02d-Weight %s\tshape %s\ttsize %d" %
                  (position, weight.name[:-2].ljust(80),
                   str(weight.shape).ljust(20), weight_size))
            total_size += weight_size
        # size estimate assumes 4 bytes per element
        print("Total size %d, %.3fMiB" % (total_size,
                                          (total_size * 4) / (1024 * 1024)))
        print("-" * 89)

        # training starts here
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(1, FLAGS.max_epochs + 1):

            epoch_start_time = time.time()
            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1
                start_time = time.time()

                loss, _, rnn_state, gradient_norm, step, _ = session.run(
                    [
                        train_model.loss,
                        train_model.train_op,
                        train_model.final_rnn_state,
                        train_model.global_norm,
                        train_model.global_step,
                        train_model.clear_char_embedding_padding
                    ], {
                        train_model.input: x,
                        train_model.targets: y,
                        train_model.initial_rnn_state: rnn_state
                    })

                # exponential moving average of the batch loss
                avg_train_loss += 0.05 * (loss - avg_train_loss)

                time_elapsed = time.time() - start_time

                if count % FLAGS.print_every == 0:
                    cur_lr = session.run(train_model.learning_rate)
                    # time_elapsed covers exactly one batch, so throughput is
                    # its reciprocal (guarded against a zero-length interval).
                    batches_per_sec = 1.0 / max(time_elapsed, 1e-12)
                    print(
                        '%6d: -%d- [%5d/%5d], train_loss/ppl = %6.8f/%6.7f batch/secs = %.1fb/s, cur_lr = %2.5f, grad.norm=%6.8f'
                        % (step, epoch, count, train_reader.length, loss,
                           np.exp(loss), batches_per_sec, cur_lr,
                           gradient_norm))

            print('Epoch training time:', time.time() - epoch_start_time)
            total_time += (time.time() - epoch_start_time)

            # epoch done: time to evaluate
            avg_valid_loss = 0.0
            count = 0
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1

                loss, rnn_state = session.run(
                    [valid_model.loss, valid_model.final_rnn_state], {
                        valid_model.input: x,
                        valid_model.targets: y,
                        valid_model.initial_rnn_state: rnn_state,
                    })

                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" %
                          (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)
            print("train loss = %6.8f, perplexity = %6.8f" %
                  (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" %
                  (avg_valid_loss, np.exp(avg_valid_loss)))

            # checkpoint only when this epoch beats the best validation loss
            if best_stats[2] > avg_valid_loss:
                best_stats = [
                    avg_train_loss,
                    np.exp(avg_train_loss),
                    avg_valid_loss,
                    np.exp(avg_valid_loss),
                ]
                save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch,
                                                       avg_valid_loss)
                saver.save(session, save_as)
                print('Saved model', save_as)

            # write out summary events
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss", simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss", simple_value=avg_valid_loss)
            ])
            summary_writer.add_summary(summary, step)

            # decide if need to decay learning rate
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(
                    best_valid_loss) - FLAGS.decay_when:
                print(
                    'validation perplexity did not improve enough, decay learning rate'
                )
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(
                    train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss

        # test on the test set, using the validation graph and the weights
        # currently held by the session
        ave_test_loss = 0.
        test_rnn_state = session.run(valid_model.initial_rnn_state)
        for x, y in test_reader.iter():
            loss, test_rnn_state = session.run(
                [valid_model.loss, valid_model.final_rnn_state], {
                    valid_model.input: x,
                    valid_model.targets: y,
                    valid_model.initial_rnn_state: test_rnn_state
                })
            ave_test_loss += loss / test_reader.length

        print("=" * 89)
        print("=" * 89)
        print("Total training time(not included the valid time): %f" %
              total_time)
        print("The best result:")
        print("train loss = %.3f, ppl = %.4f" % (best_stats[0], best_stats[1]))
        print("valid loss = %.3f, ppl = %.4f" % (best_stats[2], best_stats[3]))
        print("test loss = %.3f, ppl = %.4f" %
              (ave_test_loss, np.exp(ave_test_loss)))
        print("=" * 89)