Example #1
def test(model_path=None,
         video_data_path_test=video_data_path_val,
         n_test_samples=n_val_samples):
    #    test_data = val_data   # to evaluate on testing data or validation data
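    # note: with NumPy >= 1.16.3, these np.load calls may need allow_pickle=True,
    # since the vocabulary files hold pickled Python objects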
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)

    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _, train_frame_data = read_and_decode_with_frame(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _, val_frame_data = read_and_decode_with_frame(video_data_path_test)
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data = \
            tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \
            = model.build_model(train_data, train_frame_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label)
        val_v2s_tf, v2s_lstm3_vars_tf = model.build_v2s_generator(val_data)
        val_s2s_tf, s2s_lstm2_vars_tf, s2s_lstm3_vars_tf = model.build_s2s_generator(
            val_caption_id_1)
        val_s2v_tf, s2v_lstm2_vars_tf, s2v_lstm4_vars_tf = model.build_s2v_generator(
            val_caption_id_1, val_frame_data)
        val_v2v_tf, v2v_lstm4_vars_tf = model.build_v2v_generator(
            val_data, val_frame_data)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True))

    with tf.device(cpu_device):
        saver = tf.train.Saver()
        assert model_path is not None, 'test() requires a checkpoint path to restore'
        saver.restore(sess, model_path)
        print 'loaded parameters from:', model_path


#    print 'halve the dropout weights..'
#    for ind, row in enumerate(v2s_lstm3_vars_tf):
#        if ind % 4 == 0:
#                assign_op = row.assign(tf.multiply(row,1-0.5))
#                sess.run(assign_op)
#    for ind, row in enumerate(s2s_lstm2_vars_tf):
#        if ind % 4 == 0:
#                assign_op = row.assign(tf.multiply(row,1-0.5))
#                sess.run(assign_op)
#    for ind, row in enumerate(s2v_lstm4_vars_tf):
#        if ind % 4 == 0:
#                assign_op = row.assign(tf.multiply(row,1-0.5))
#                sess.run(assign_op)

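    # the queue runners must be running before any sess.run on the batched tensors above;
    # if the TFRecord readers were built with num_epochs set, a tf.local_variables_initializer()
    # run would also be needed here (as in train() below)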
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    ######### test sentence generation ##########
    print 'testing...'
    n_test_steps = int(n_test_samples / batch_size)
    print 'n_test_steps:', n_test_steps
    tstart = time.time()
    ### TODO: sometimes COCO test show exceptions in the beginning of training ####
    if test_v2s:
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         flist] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname)
        for i, key in enumerate(pred_dict.keys()):
            print 'video:', flist[i]
            for ele in gt_dict[key]:
                print "GT:  " + ele['caption']
            print "PD:  " + pred_dict[key][0]['caption']
            print '-------'
        print '############## video to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         _] = testing_all(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname)
        scorer = COCOScorer()
        total_score_1 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## video to sentence result #################'

    if test_s2s:
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
        for i, key in enumerate(pred_dict.keys()):
            print 'video:', flist[i]
            for ele in gt_dict[key]:
                print "GT:  " + ele['caption']
            print "PD:  " + pred_dict[key][0]['caption']
            print '-------'
        print '############## sentence to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         _] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname)
        scorer = COCOScorer()
        total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## sentence to sentence result #################'

    ######### test video generation #############
    if test_v2v:
        mse_v2v = test_all_videos(sess, n_test_steps, val_frame_data,
                                  val_v2v_tf, val_video_label,
                                  pixel_scale_factor)
        print 'video2video mse:', mse_v2v
    if test_s2v:
        mse_s2v = test_all_videos(sess, n_test_steps, val_frame_data,
                                  val_s2v_tf, val_video_label,
                                  pixel_scale_factor)
        print 'caption2video mse:', mse_s2v
    if save_demo_sent_v2s:
        get_demo_sentence(sess,
                          n_test_steps,
                          ixtoword,
                          val_v2s_tf,
                          val_fname,
                          result_file=home_folder + 'demo_v2s.txt')
    if save_demo_sent_s2s:
        get_demo_sentence(sess,
                          n_test_steps,
                          ixtoword,
                          val_s2s_tf,
                          val_fname,
                          result_file=home_folder + 'demo_s2s.txt')
    if save_demo_video_v2v:
        get_demo_video(sess, n_test_steps, val_frame_data, val_v2v_tf,
                       val_video_label, val_fname, home_folder + 'demo_v2v/',
                       pixel_scale_factor)
    if save_demo_video_s2v:
        get_demo_video(sess, n_test_steps, val_frame_data, val_s2v_tf,
                       val_video_label, val_fname, home_folder + 'demo_s2v/',
                       pixel_scale_factor)

    sys.stdout.flush()
    coord.request_stop()
    coord.join(threads)
    tstop = time.time()
    print "Total Time Cost:", round(tstop - tstart, 2), "s"
    sess.close()
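
A minimal usage sketch for this example, assuming the module-level configuration referenced above and a checkpoint written by the training routine in Example #2 (the epoch suffix in 'model-100' is hypothetical):

if __name__ == '__main__':
    test(model_path=os.path.join(model_path, 'model-100'))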
Example #2
def train():
    assert os.path.isfile(video_data_path_train)
    assert os.path.isfile(video_data_path_val)
    assert os.path.isdir(model_path)
    assert os.path.isfile(wordtoix_file)
    assert os.path.isfile(ixtoword_file)
    assert drop_strategy in ['block_video', 'block_sent', 'random', 'keep']
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    print 'build model and session...'
    # shared parameters on the GPU
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)
    tStart_total = time.time()
    n_epoch_steps = int(n_train_samples / batch_size)
    n_steps = n_epochs * n_epoch_steps
    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _, train_frame_data = read_and_decode_with_frame(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _, val_frame_data = read_and_decode_with_frame(video_data_path_val)
        # random batches
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data = \
            tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \
            = model.build_model(train_data, train_frame_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label)
        val_v2s_tf, _ = model.build_v2s_generator(val_data)
        val_s2s_tf, _, _ = model.build_s2s_generator(val_caption_id_1)
        val_s2v_tf, _, _ = model.build_s2v_generator(val_caption_id_1,
                                                     val_frame_data)
        val_v2v_tf, _ = model.build_v2v_generator(val_data, val_frame_data)

    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))
    # check for model file
    with tf.device(cpu_device):
        saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state(model_path)
    global_step = 0
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        #        print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True)
        global_step = get_model_step(ckpt.model_checkpoint_path)
        print 'global_step:', global_step
    else:
        print("Created model with fresh parameters.")
        sess.run(tf.global_variables_initializer())
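    # snapshot the variables that exist now, so that only the optimizer slots
    # created below get initialized later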
    temp = set(tf.global_variables())
    # train on the GPU
    with tf.device("/gpu:0"):
        ## 1. weight decay
        for var in tf.trainable_variables():
            decay_loss = tf.multiply(tf.nn.l2_loss(var),
                                     0.0004,
                                     name='weight_loss')
            tf.add_to_collection('losses', decay_loss)
        tf.add_to_collection('losses', tf_loss)
        tf_total_loss = tf.add_n(tf.get_collection('losses'),
                                 name='total_loss')
        ## 2. gradient clip
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(tf_total_loss)
        # when variable is not related to the loss, grad returned as None
        clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var)
                    for grad, var in gvs if grad is not None]
        for grad, var in gvs:
            if grad is not None:
                tf.summary.histogram(var.name + '/grad', grad)
                tf.summary.histogram(var.name + '/data', var)
        train_op = optimizer.apply_gradients(clip_gvs)

    ## initialize variables added for optimizer
    sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))
    # initialize epoch variable in queue reader
    sess.run(tf.local_variables_initializer())
    loss_epoch = 0
    loss_epoch_cap = 0
    loss_epoch_lat = 0
    loss_epoch_vid = 0
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    ##### add summaries ######
    tf.summary.histogram('video_h', tf_v_h)
    tf.summary.histogram('sent_h', tf_s_h)
    tf.summary.scalar('loss_vid', tf_loss_vid)
    tf.summary.scalar('loss_lat', tf_loss_lat)
    tf.summary.scalar('loss_caption', tf_loss_cap)
    #    for var in tf.trainable_variables():
    #        summaries.append(tf.histogram_summary(var.op.name, var))
    summary_op = tf.summary.merge_all()
    # write graph architecture to file
    summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph)
    epoch = global_step
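    # note: this dequeues one batch of labels but the result is never used below
    # (possibly intended as a warm-up of the input queue)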
    video_label = sess.run(train_video_label)
    for step in xrange(1, n_steps + 1):
        tStart = time.time()
        if drop_strategy == 'keep':
            drop_type = 0
        elif drop_strategy == 'block_sent':  # matches the values allowed by the assert above
            drop_type = 1
        elif drop_strategy == 'block_video':
            drop_type = 2
        else:
            drop_type = random.randint(0, 3)

        _, loss_val, loss_cap, loss_lat, loss_vid = sess.run(
            [train_op, tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid],
            feed_dict={tf_drop_type: drop_type})
        tStop = time.time()
        print "step:", step, " Loss:", loss_val, "loss_cap:", loss_cap * caption_weight, "loss_latent:", loss_lat * latent_weight, "loss_vid:", loss_vid * video_weight
        print "Time Cost:", round(tStop - tStart, 2), "s"
        loss_epoch += loss_val
        loss_epoch_cap += loss_cap
        loss_epoch_lat += loss_lat
        loss_epoch_vid += loss_vid

        if step % n_epoch_steps == 0:
            #        if step % 3 == 0:
            epoch += 1
            loss_epoch /= n_epoch_steps
            loss_epoch_cap /= n_epoch_steps
            loss_epoch_lat /= n_epoch_steps
            loss_epoch_vid /= n_epoch_steps
            with tf.device(cpu_device):
                saver.save(sess,
                           os.path.join(model_path, 'model'),
                           global_step=epoch)


            # print 'z:', z[0, :10]
            print 'epoch:', epoch, 'loss:', loss_epoch, "loss_cap:", loss_epoch_cap, "loss_lat:", loss_epoch_lat, "loss_vid:", loss_epoch_vid
            loss_epoch = 0
            loss_epoch_cap = 0
            loss_epoch_lat = 0
            loss_epoch_vid = 0
            ######### test sentence generation ##########
            n_val_steps = int(n_val_samples / batch_size)
            #            n_val_steps = 3
            ### TODO: sometimes COCO test show exceptions in the beginning of training ####
            if test_v2s:
                [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
                 flist] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname)
                for i, key in enumerate(pred_dict.keys()):
                    print 'video:', flist[i]
                    for ele in gt_dict[key]:
                        print "GT:  " + ele['caption']
                    print "PD:  " + pred_dict[key][0]['caption']
                    print '-------'
                print '############## video to sentence result #################'
                print 'epoch:', epoch
                [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
                 _] = testing_all(sess, n_val_steps, ixtoword, val_v2s_tf,
                                  val_fname)
                scorer = COCOScorer()
                total_score = scorer.score(gt_dict, pred_dict, id_list)
                print '############## video to sentence result #################'

            if test_s2s:
                [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
                 flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
                for i, key in enumerate(pred_dict.keys()):
                    print 'video:', flist[i]
                    for ele in gt_dict[key]:
                        print "GT:  " + ele['caption']
                    print "PD:  " + pred_dict[key][0]['caption']
                    print '-------'
                print '############## sentence to sentence result #################'
                print 'epoch:', epoch
                [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
                 _] = testing_all(sess, n_val_steps, ixtoword, val_s2s_tf,
                                  val_fname)
                scorer = COCOScorer()
                total_score = scorer.score(gt_dict, pred_dict, id_list)
                print '############## sentence to sentence result #################'

            ######### test video generation #############
            if test_v2v:
                mse_v2v = test_all_videos(sess, n_val_steps, val_frame_data,
                                          val_v2v_tf, val_video_label,
                                          pixel_scale_factor)
                print 'epoch', epoch, 'video2video mse:', mse_v2v
            if test_s2v:
                mse_s2v = test_all_videos(sess, n_val_steps, val_frame_data,
                                          val_s2v_tf, val_video_label,
                                          pixel_scale_factor)
                print 'epoch', epoch, 'caption2video mse:', mse_s2v
            sys.stdout.flush()

            ###### summary ######
            if epoch % 2 == 0:
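                # note: if tf_drop_type is a placeholder without a default value,
                # this run may also need feed_dict={tf_drop_type: drop_type}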
                summary = sess.run(summary_op)
                summary_writer.add_summary(summary, epoch)

        sys.stdout.flush()

    coord.request_stop()
    coord.join(threads)
    print "Finally, saving the model ..."
    with tf.device(cpu_device):
        saver.save(sess,
                   os.path.join(model_path, 'model'),
                   global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
    sess.close()
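
train() in Example #2 takes no arguments; every setting (data paths, batch_size, n_epochs, the loss weights, drop_strategy, and so on) comes from module-level globals, so a run is simply:

if __name__ == '__main__':
    train()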