def main(unused_argv):
    """Train the VisualSentinelCell image captioner on (mini) MSCOCO.

    Builds the training graph, restores the newest checkpoint if one
    exists, then runs SGD until the input pipeline is exhausted
    (num_epochs=100), checkpointing every hour and once at the end.
    """
    vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300)
    with tf.Graph().as_default():

        image_id, spatial_features, input_seq, target_seq, indicator = (
            import_mscoco(mode="train", batch_size=BATCH_SIZE, num_epochs=100,
                          is_mini=True))
        visual_sentinel_cell = VisualSentinelCell(300)
        image_captioner = ImageCaptioner(visual_sentinel_cell, vocab,
                                         pretrained_matrix)
        logits, ids = image_captioner(
            lengths=tf.reduce_sum(indicator, axis=1),
            spatial_image_features=spatial_features,
            seq_inputs=input_seq)
        # Padding positions are masked out of the loss via `indicator`.
        tf.losses.sparse_softmax_cross_entropy(target_seq, logits,
                                               weights=indicator)
        loss = tf.losses.get_total_loss()
        learning_step = tf.train.GradientDescentOptimizer(1.0).minimize(
            loss, var_list=image_captioner.variables)

        captioner_saver = tf.train.Saver(var_list=image_captioner.variables)
        captioner_ckpt, captioner_ckpt_name = get_visual_sentinel_checkpoint()
        with tf.Session() as sess:

            # Initialize first, then overwrite from the checkpoint if any.
            sess.run(tf.variables_initializer(image_captioner.variables))
            if captioner_ckpt is not None:
                captioner_saver.restore(sess, captioner_ckpt)
            captioner_saver.save(sess, captioner_ckpt_name)
            last_save = time.time()

            for i in itertools.count():

                time_start = time.time()
                try:
                    _ids, _loss, _learning_step = sess.run(
                        [ids, loss, learning_step])
                except tf.errors.OutOfRangeError:
                    # Input pipeline exhausted after num_epochs; stop cleanly.
                    # (Was a bare `except:`, which also swallowed real errors
                    # and KeyboardInterrupt.)
                    break

                print(
                    PRINT_STRING.format(
                        i, _loss,
                        list_of_ids_to_string(_ids[0, :].tolist(), vocab),
                        BATCH_SIZE / (time.time() - time_start)))

                new_save = time.time()
                if new_save - last_save > 3600:  # save the model every hour
                    captioner_saver.save(sess, captioner_ckpt_name)
                    last_save = new_save

            captioner_saver.save(sess, captioner_ckpt_name)
            print("Finishing training.")
# NOTE(review): whitespace-mangled chunk of a Show-and-Tell beam-search
# evaluation script (beam_size=16, 1 epoch over mini MSCOCO, restore-only —
# asserts a checkpoint exists). The `for i in itertools.count():` loop body is
# truncated in this view, so the code is left byte-identical rather than
# reformatted; presumably the missing body decodes captions into `json_dump`
# keyed by unseen `image_id`s (`used_ids`) — TODO confirm against the full file.
BEAM_SIZE = 16 if __name__ == "__main__": vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300) with tf.Graph().as_default(): image_id, mean_features, input_seq, target_seq, indicator = ( import_mscoco(mode="train", batch_size=BATCH_SIZE, num_epochs=1, is_mini=True)) image_captioner = ImageCaptioner(ShowAndTellCell(300), vocab, pretrained_matrix, trainable=False, beam_size=BEAM_SIZE) logits, ids = image_captioner(mean_image_features=mean_features) captioner_saver = tf.train.Saver( var_list=remap_decoder_name_scope(image_captioner.variables)) captioner_ckpt, captioner_ckpt_name = get_show_and_tell_checkpoint() with tf.Session() as sess: assert (captioner_ckpt is not None) captioner_saver.restore(sess, captioner_ckpt) used_ids = set() json_dump = [] for i in itertools.count():
def main(unused_argv):
    """Train the UpDownCell image captioner on MSCOCO.

    Builds the training graph (Adam, tracked global step), restores the
    newest checkpoint if one exists (otherwise initializes the model),
    then trains until the input pipeline is exhausted, checkpointing
    every hour and once at the end.
    """
    vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300)
    with tf.Graph().as_default():

        (image_id, mean_features, object_features, input_seq, target_seq,
         indicator) = import_mscoco(
            mode="train", batch_size=FLAGS.batch_size,
            num_epochs=FLAGS.num_epochs, is_mini=FLAGS.is_mini)
        up_down_cell = UpDownCell(300)
        image_captioner = ImageCaptioner(up_down_cell, vocab, pretrained_matrix)
        logits, ids = image_captioner(
            lengths=tf.reduce_sum(indicator, axis=1),
            mean_image_features=mean_features,
            mean_object_features=object_features,
            seq_inputs=input_seq)
        # Padding positions are masked out of the loss via `indicator`.
        tf.losses.sparse_softmax_cross_entropy(target_seq, logits,
                                               weights=indicator)
        loss = tf.losses.get_total_loss()

        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer()
        learning_step = optimizer.minimize(
            loss, var_list=image_captioner.variables, global_step=global_step)

        captioner_saver = tf.train.Saver(
            var_list=image_captioner.variables + [global_step])
        captioner_ckpt, captioner_ckpt_name = get_up_down_checkpoint()
        with tf.Session() as sess:

            # Adam slot variables are not in the saver's var_list, so they
            # must be initialized on every run regardless of restore.
            sess.run(tf.variables_initializer(optimizer.variables()))
            if captioner_ckpt is not None:
                captioner_saver.restore(sess, captioner_ckpt)
            else:
                sess.run(tf.variables_initializer(
                    image_captioner.variables + [global_step]))
            captioner_saver.save(sess, captioner_ckpt_name,
                                 global_step=global_step)
            last_save = time.time()

            for i in itertools.count():

                time_start = time.time()
                try:
                    _target, _ids, _loss, _learning_step = sess.run(
                        [target_seq, ids, loss, learning_step])
                except tf.errors.OutOfRangeError:
                    # Input pipeline exhausted after num_epochs; stop cleanly.
                    # (Was a bare `except:`, which also swallowed real errors
                    # and KeyboardInterrupt.)
                    break

                iteration = sess.run(global_step)
                print(
                    PRINT_STRING.format(
                        iteration, _loss,
                        list_of_ids_to_string(_ids[0, :].tolist(), vocab),
                        list_of_ids_to_string(_target[0, :].tolist(), vocab),
                        FLAGS.batch_size / (time.time() - time_start)))

                new_save = time.time()
                if new_save - last_save > 3600:  # save the model every hour
                    captioner_saver.save(sess, captioner_ckpt_name,
                                         global_step=global_step)
                    last_save = new_save

            captioner_saver.save(sess, captioner_ckpt_name,
                                 global_step=global_step)
            print("Finishing training.")
# NOTE(review): whitespace-mangled chunk of a Show-Attend-and-Tell beam-search
# evaluation script (restore-only — asserts a checkpoint exists). The chunk is
# truncated after `json_dump = []`; the decode loop that presumably fills
# `json_dump` / `used_ids` is not visible here, so the code is left
# byte-identical rather than reformatted — TODO confirm against the full file.
FLAGS = tf.flags.FLAGS if __name__ == "__main__": vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300) with tf.Graph().as_default(): image_id, spatial_features, input_seq, target_seq, indicator = import_mscoco( mode=FLAGS.mode, batch_size=FLAGS.batch_size, num_epochs=1, is_mini=FLAGS.is_mini) image_captioner = ImageCaptioner(ShowAttendAndTellCell(300), vocab, pretrained_matrix, trainable=False, beam_size=FLAGS.beam_size) logits, ids = image_captioner(spatial_image_features=spatial_features) captioner_saver = tf.train.Saver( var_list=remap_decoder_name_scope(image_captioner.variables)) captioner_ckpt, captioner_ckpt_name = get_show_attend_and_tell_checkpoint( ) with tf.Session() as sess: assert (captioner_ckpt is not None) captioner_saver.restore(sess, captioner_ckpt) used_ids = set() json_dump = []
# NOTE(review): whitespace-mangled chunk of a SpatialAttentionCell beam-search
# evaluation script (restore-only — asserts a checkpoint exists). The chunk is
# truncated after `json_dump = []`; the decode loop that presumably fills
# `json_dump` / `used_ids` is not visible here, so the code is left
# byte-identical rather than reformatted — TODO confirm against the full file.
FLAGS = tf.flags.FLAGS if __name__ == "__main__": vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300) with tf.Graph().as_default(): image_id, spatial_features, input_seq, target_seq, indicator = import_mscoco( mode=FLAGS.mode, batch_size=FLAGS.batch_size, num_epochs=1, is_mini=FLAGS.is_mini) image_captioner = ImageCaptioner(SpatialAttentionCell(300), vocab, pretrained_matrix, trainable=False, beam_size=FLAGS.beam_size) logits, ids = image_captioner(spatial_image_features=spatial_features) captioner_saver = tf.train.Saver( var_list=remap_decoder_name_scope(image_captioner.variables)) captioner_ckpt, captioner_ckpt_name = get_spatial_attention_checkpoint( ) with tf.Session() as sess: assert (captioner_ckpt is not None) captioner_saver.restore(sess, captioner_ckpt) used_ids = set() json_dump = []
def main(unused_argv):
    """Train the ShowAndTellCell image captioner on (mini) MSCOCO.

    Builds the training graph (SGD with staircase exponential
    learning-rate decay, tracked global step), restores the newest
    checkpoint if one exists (otherwise initializes the model), then
    trains until the input pipeline is exhausted (num_epochs=100),
    checkpointing every hour and once at the end.
    """
    vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300)
    with tf.Graph().as_default():

        image_id, mean_features, input_seq, target_seq, indicator = (
            import_mscoco(mode="train", batch_size=BATCH_SIZE, num_epochs=100,
                          is_mini=True))
        show_and_tell_cell = ShowAndTellCell(300)
        image_captioner = ImageCaptioner(show_and_tell_cell, vocab,
                                         pretrained_matrix)
        logits, ids = image_captioner(
            lengths=tf.reduce_sum(indicator, axis=1),
            mean_image_features=mean_features,
            seq_inputs=input_seq)
        # Padding positions are masked out of the loss via `indicator`.
        tf.losses.sparse_softmax_cross_entropy(target_seq, logits,
                                               weights=indicator)
        loss = tf.losses.get_total_loss()

        global_step = tf.train.get_or_create_global_step()
        # Decay the rate once every EPOCHS_PER_DECAY epochs (staircase).
        learning_rate = tf.train.exponential_decay(
            INITIAL_LEARNING_RATE, global_step,
            (TRAINING_EXAMPLES // BATCH_SIZE) * EPOCHS_PER_DECAY,
            DECAY_RATE, staircase=True)
        learning_step = tf.train.GradientDescentOptimizer(
            learning_rate).minimize(loss, var_list=image_captioner.variables,
                                    global_step=global_step)

        captioner_saver = tf.train.Saver(
            var_list=image_captioner.variables + [global_step])
        captioner_ckpt, captioner_ckpt_name = get_show_and_tell_checkpoint()
        with tf.Session() as sess:

            if captioner_ckpt is not None:
                captioner_saver.restore(sess, captioner_ckpt)
            else:
                sess.run(tf.variables_initializer(
                    image_captioner.variables + [global_step]))
            captioner_saver.save(sess, captioner_ckpt_name,
                                 global_step=global_step)
            last_save = time.time()

            for i in itertools.count():

                time_start = time.time()
                try:
                    _ids, _loss, _learning_step = sess.run(
                        [ids, loss, learning_step])
                except tf.errors.OutOfRangeError:
                    # Input pipeline exhausted after num_epochs; stop cleanly.
                    # (Was a bare `except:`, which also swallowed real errors
                    # and KeyboardInterrupt.)
                    break

                iteration = sess.run(global_step)
                print(
                    PRINT_STRING.format(
                        iteration, _loss,
                        list_of_ids_to_string(_ids[0, :].tolist(), vocab),
                        BATCH_SIZE / (time.time() - time_start)))

                new_save = time.time()
                if new_save - last_save > 3600:  # save the model every hour
                    captioner_saver.save(sess, captioner_ckpt_name,
                                         global_step=global_step)
                    last_save = new_save

            captioner_saver.save(sess, captioner_ckpt_name,
                                 global_step=global_step)
            print("Finishing training.")
# NOTE(review): whitespace-mangled chunk of a VisualSentinelCell beam-search
# evaluation script (restore-only — asserts a checkpoint exists). The
# `for i in itertools.count():` loop body is truncated in this view, so the
# code is left byte-identical rather than reformatted; presumably the missing
# body decodes captions into `json_dump` keyed by unseen `image_id`s
# (`used_ids`) — TODO confirm against the full file.
FLAGS = tf.flags.FLAGS if __name__ == "__main__": vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300) with tf.Graph().as_default(): image_id, spatial_features, input_seq, target_seq, indicator = import_mscoco( mode=FLAGS.mode, batch_size=FLAGS.batch_size, num_epochs=1, is_mini=FLAGS.is_mini) image_captioner = ImageCaptioner(VisualSentinelCell(300), vocab, pretrained_matrix, trainable=False, beam_size=FLAGS.beam_size) logits, ids = image_captioner(spatial_image_features=spatial_features) captioner_saver = tf.train.Saver( var_list=remap_decoder_name_scope(image_captioner.variables)) captioner_ckpt, captioner_ckpt_name = get_visual_sentinel_checkpoint() with tf.Session() as sess: assert (captioner_ckpt is not None) captioner_saver.restore(sess, captioner_ckpt) used_ids = set() json_dump = [] for i in itertools.count():
# NOTE(review): whitespace-mangled chunk of an UpDownCell beam-search
# evaluation script (restore-only — asserts a checkpoint exists). The chunk is
# truncated after `json_dump = []` and runs past the end of this view; the
# decode loop that presumably fills `json_dump` / `used_ids` is not visible
# here, so the code is left byte-identical rather than reformatted — TODO
# confirm against the full file.
FLAGS = tf.flags.FLAGS if __name__ == "__main__": vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300) with tf.Graph().as_default(): image_id, mean_features, object_features, input_seq, target_seq, indicator = import_mscoco( mode=FLAGS.mode, batch_size=FLAGS.batch_size, num_epochs=1, is_mini=FLAGS.is_mini) image_captioner = ImageCaptioner(UpDownCell(300), vocab, pretrained_matrix, trainable=False, beam_size=FLAGS.beam_size) logits, ids = image_captioner(mean_image_features=mean_features, mean_object_features=object_features) captioner_saver = tf.train.Saver( var_list=remap_decoder_name_scope(image_captioner.variables)) captioner_ckpt, captioner_ckpt_name = get_up_down_checkpoint() with tf.Session() as sess: assert (captioner_ckpt is not None) captioner_saver.restore(sess, captioner_ckpt) used_ids = set() json_dump = []