Example #1
def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test)
    test_data = val_data
    ixtoword = pd.Series(np.load('./data'+str(gpu_id)+'/ixtoword.npy').tolist())

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(ixtoword),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            drop_out_rate=0,
            bias_init_vector=None)

    video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
    
    # Halve every 4th LSTM variable (assumed to be an input weight matrix) to
    # compensate for the train-time dropout keep probability of 0.5.
    for ind, row in enumerate(lstm3_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)
    
    [pred_sent, gt_sent] = testing_all(sess, test_data, ixtoword, video_tf, video_mask_tf, caption_tf)
    #np.savez('Att_result/'+model_path.split('/')[1],gt = gt_sent,pred=pred_sent)
    scorer = COCOScorer()
    total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
    return total_score
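The loop above rescales every 4th LSTM variable by the train-time keep probability, since the generator is built with drop_out_rate=0. A minimal NumPy sketch of that rescaling, assuming (as the snippet does) that every 4th entry of lstm3_variables_tf is a dropout-trained weight matrix:

import numpy as np

def rescale_dropout_weights(variables, keep_prob=0.5):
    # Mirrors row.assign(tf.mul(row, 1 - 0.5)) applied to every 4th variable.
    return [v * keep_prob if i % 4 == 0 else v
            for i, v in enumerate(variables)]

variables = [np.ones((3, 3)) for _ in range(8)]   # hypothetical LSTM variables
scaled = rescale_dropout_weights(variables)
assert scaled[0][0, 0] == 0.5 and scaled[1][0, 0] == 1.0

This is classic (non-inverted) dropout: activations are left unscaled during training, so weights must be scaled down by the keep probability at inference time.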
Example #2
def test(model_path='models/model-900', video_feat_path=video_feat_path):
    meta_data, train_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_test)
    #test_data = train_data
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(ixtoword),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            drop_out_rate=0,
            bias_init_vector=None)

    video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf, lstmRNN_variables_tf, lstm3_variables_tf = model.build_generator()
    sess = tf.InteractiveSession()

    saver = tf.train.Saver()
    saver.restore(sess, model_path)
    for ind, row in enumerate(lstmRNN_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)
    for ind, row in enumerate(lstm3_variables_tf):
        if ind % 4 == 0:
            assign_op = row.assign(tf.mul(row, 1 - 0.5))
            sess.run(assign_op)

    [mp, pred_sent, gt_sent, HLness] = testing_all(sess, test_data, ixtoword, video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf)
    np.savez('HS_result/' + model_path.split('/')[1], gt=gt_sent, pred=pred_sent, mp=mp, HLness=HLness)
    total_score = np.mean(mp)
    print model_path.split('/')[1] + ' mAP: ' + str(total_score)
    scorer = COCOScorer()
    total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
    return total_score
Example #3
def score_with_cocoeval(samples_valid, samples_test, engine):
    scorer = COCOScorer()
    if samples_valid:
        gts_valid = OrderedDict()
        for vidID in engine.valid_ids:
            gts_valid[vidID] = engine.CAP[vidID]
        valid_score = scorer.score(gts_valid, samples_valid, engine.valid_ids)
    else:
        valid_score = None
    if samples_test:
        gts_test = OrderedDict()
        for vidID in engine.test_ids:
            gts_test[vidID] = engine.CAP[vidID]
        test_score = scorer.score(gts_test, samples_test, engine.test_ids)
    else:
        test_score = None
    return valid_score, test_score
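Example #3 is the cleanest illustration of the COCOScorer.score(gts, samples, ids) calling convention used throughout: both mappings are keyed by video ID and hold lists of caption records. A toy sketch of inputs in that shape (the 'image_id'/'cap_id'/'caption' field names follow the usual pycocoevalcap convention and are an assumption here):

from collections import OrderedDict

gts = OrderedDict()
gts['vid1'] = [{'image_id': 'vid1', 'cap_id': 0, 'caption': 'a man is cooking'}]
samples = {'vid1': [{'image_id': 'vid1', 'caption': 'a man cooks'}]}
ids = list(gts.keys())
# scorer = COCOScorer(); scorer.score(gts, samples, ids) would then return
# a dict of metric -> value (e.g. Bleu_4, METEOR, CIDEr).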
Example #4
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

    np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword)

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            drop_out_rate=0.5,
            bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, 'models_Att_update_new/model-30')

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]

        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):

            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind,:,:] = current_batch['data'][:,ind,:]
                idx = np.where(current_batch['label'][:,ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind,:idx[-1]+1] = 1

            current_captions = current_batch['title']
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=16-1)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix ))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            _, loss_val = sess.run(
                    [train_op, tf_loss],
                    feed_dict={
                        tf_video: current_feats,
                        tf_video_mask : current_video_masks,
                        tf_caption: current_caption_matrix,
                        tf_caption_mask: current_caption_masks
                        })
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart,2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s"

        if np.mod(epoch, 10) == 0 or epoch == n_epochs - 1:
            print "Epoch ", epoch, " is done. Saving the model ..."
            with tf.device("/cpu:0"):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

            current_batch = h5py.File(val_data[np.random.randint(0,len(val_data))])
            video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data'+str(gpu_id)+'/ixtoword.npy').tolist())
            [pred_sent, gt_sent] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf)
            for idx in range(len(pred_sent)):
                print "GT:  " + gt_sent[idx][0]['caption']
                print "PD:  " + pred_sent[idx][0]['caption']
                print '-------'
            [pred_sent, gt_sent] = testing_all(sess, val_data, ixtoword, video_tf, video_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))
        sys.stdout.flush()

    print "Finally, saving the model ..."
    with tf.device("/cpu:0"):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total,2), "s"
def train():
    assert os.path.isfile(video_data_path_train)
    assert os.path.isfile(video_data_path_val)
    assert os.path.isdir(model_path)
    assert os.path.isfile(wordtoix_file)
    assert os.path.isfile(ixtoword_file)
    assert drop_strategy in ['block_video', 'block_sent', 'random', 'keep']
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    print 'build model and session...'
    # shared parameters on the GPU
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)
    tStart_total = time.time()
    n_epoch_steps = int(n_train_samples / batch_size)
    n_steps = n_epochs * n_epoch_steps
    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _, train_frame_data = read_and_decode_with_frame(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _, val_frame_data = read_and_decode_with_frame(video_data_path_val)
        # random batches
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data = \
            tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \
            = model.build_model(train_data, train_frame_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label)
        val_v2s_tf, _ = model.build_v2s_generator(val_data)
        val_s2s_tf, _, _ = model.build_s2s_generator(val_caption_id_1)
        val_s2v_tf, _, _ = model.build_s2v_generator(val_caption_id_1,
                                                     val_frame_data)
        val_v2v_tf, _ = model.build_v2v_generator(val_data, val_frame_data)

    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))
    # check for model file
    with tf.device(cpu_device):
        saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state(model_path)
    global_step = 0
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        #        print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True)
        global_step = get_model_step(ckpt.model_checkpoint_path)
        print 'global_step:', global_step
    else:
        print("Created model with fresh parameters.")
        sess.run(tf.global_variables_initializer())
    temp = set(tf.global_variables())
    # train on the GPU
    with tf.device("/gpu:0"):
        ## 1. weight decay
        for var in tf.trainable_variables():
            decay_loss = tf.multiply(tf.nn.l2_loss(var),
                                     0.0004,
                                     name='weight_loss')
            tf.add_to_collection('losses', decay_loss)
        tf.add_to_collection('losses', tf_loss)
        tf_total_loss = tf.add_n(tf.get_collection('losses'),
                                 name='total_loss')
        ## 2. gradient clip
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(tf_total_loss)
        # when variable is not related to the loss, grad returned as None
        clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var)
                    for grad, var in gvs if grad is not None]
        for grad, var in gvs:
            if grad is not None:
                tf.summary.histogram(var.name + '/grad', grad)
                tf.summary.histogram(var.name + '/data', var)
        train_op = optimizer.apply_gradients(clip_gvs)

    ## initialize variables added for optimizer
    sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))
    # initialize epoch variable in queue reader
    sess.run(tf.local_variables_initializer())
    loss_epoch = 0
    loss_epoch_cap = 0
    loss_epoch_vid = 0
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    ##### add summaries ######
    tf.summary.histogram('video_h', tf_v_h)
    tf.summary.histogram('sent_h', tf_s_h)
    tf.summary.scalar('loss_vid', tf_loss_vid)
    tf.summary.scalar('loss_lat', tf_loss_lat)
    tf.summary.scalar('loss_caption', tf_loss_cap)
    #    for var in tf.trainable_variables():
    #        summaries.append(tf.histogram_summary(var.op.name, var))
    summary_op = tf.summary.merge_all()
    # write graph architecture to file
    summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph)
    epoch = global_step
    video_label = sess.run(train_video_label)
    for step in xrange(1, n_steps + 1):
        tStart = time.time()
        if drop_strategy == 'keep':
            drop_type = 0
        elif drop_strategy == 'block_sent':  # matches the assert above
            drop_type = 1
        elif drop_strategy == 'block_video':
            drop_type = 2
        else:
            drop_type = random.randint(0, 3)

        _, loss_val, loss_cap, loss_lat, loss_vid = sess.run(
            [train_op, tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid],
            feed_dict={tf_drop_type: drop_type})
        tStop = time.time()
        print "step:", step, " Loss:", loss_val, "loss_cap:", loss_cap * caption_weight, "loss_latent:", loss_lat * latent_weight, "loss_vid:", loss_vid * video_weight
        print "Time Cost:", round(tStop - tStart, 2), "s"
        loss_epoch += loss_val
        loss_epoch_cap += loss_cap
        loss_epoch_vid += loss_vid

        if step % n_epoch_steps == 0:
            #        if step % 3 == 0:
            epoch += 1
            loss_epoch /= n_epoch_steps
            loss_epoch_cap /= n_epoch_steps
            loss_epoch_vid /= n_epoch_steps
            with tf.device(cpu_device):
                saver.save(sess,
                           os.path.join(model_path, 'model'),
                           global_step=epoch)


            # print 'z:', z[0, :10]
            print 'epoch:', epoch, 'loss:', loss_epoch, "loss_cap:", loss_epoch_cap, "loss_lat:", loss_lat, "loss_vid:", loss_epoch_vid
            loss_epoch = 0
            loss_epoch_cap = 0
            loss_epoch_vid = 0
            ######### test sentence generation ##########
            n_val_steps = int(n_val_samples / batch_size)
            #            n_val_steps = 3
            ### TODO: sometimes the COCO test throws exceptions at the beginning of training ####
            if test_v2s:
                [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
                 flist] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname)
                for i, key in enumerate(pred_dict.keys()):
                    print 'video:', flist[i]
                    for ele in gt_dict[key]:
                        print "GT:  " + ele['caption']
                    print "PD:  " + pred_dict[key][0]['caption']
                    print '-------'
                print '############## video to sentence result #################'
                print 'epoch:', epoch
                [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
                 _] = testing_all(sess, n_val_steps, ixtoword, val_v2s_tf,
                                  val_fname)
                scorer = COCOScorer()
                total_score = scorer.score(gt_dict, pred_dict, id_list)
                print '############## video to sentence result #################'

            if test_s2s:
                [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
                 flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
                for i, key in enumerate(pred_dict.keys()):
                    print 'video:', flist[i]
                    for ele in gt_dict[key]:
                        print "GT:  " + ele['caption']
                    print "PD:  " + pred_dict[key][0]['caption']
                    print '-------'
                print '############## sentence to sentence result #################'
                print 'epoch:', epoch
                [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
                 _] = testing_all(sess, n_val_steps, ixtoword, val_s2s_tf,
                                  val_fname)
                scorer = COCOScorer()
                total_score = scorer.score(gt_dict, pred_dict, id_list)
                print '############## sentence to sentence result #################'

            ######### test video generation #############
            if test_v2v:
                mse_v2v = test_all_videos(sess, n_val_steps, val_frame_data,
                                          val_v2v_tf, val_video_label,
                                          pixel_scale_factor)
                print 'epoch', epoch, 'video2video mse:', mse_v2v
            if test_s2v:
                mse_s2v = test_all_videos(sess, n_val_steps, val_frame_data,
                                          val_s2v_tf, val_video_label,
                                          pixel_scale_factor)
                print 'epoch', epoch, 'caption2video mse:', mse_s2v
            sys.stdout.flush()

            ###### summary ######
            if epoch % 2 == 0:
                summary = sess.run(summary_op)
                summary_writer.add_summary(summary, epoch)

        sys.stdout.flush()

    coord.request_stop()
    coord.join(threads)
    print "Finally, saving the model ..."
    with tf.device(cpu_device):
        saver.save(sess,
                   os.path.join(model_path, 'model'),
                   global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
    sess.close()
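The optimizer block in this train() clips each gradient tensor with tf.clip_by_norm before apply_gradients. A minimal NumPy sketch of what clip-by-norm does to a single gradient:

import numpy as np

def clip_by_norm(grad, clip_norm):
    # Matches tf.clip_by_norm: rescale only if the L2 norm exceeds clip_norm.
    norm = np.sqrt(np.sum(grad ** 2))
    return grad if norm <= clip_norm else grad * (clip_norm / norm)

g = np.array([3.0, 4.0])                       # L2 norm 5.0
assert np.allclose(clip_by_norm(g, 2.5), [1.5, 2.0])
assert np.allclose(clip_by_norm(g, 10.0), g)   # small gradients pass through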
Example #6
def test(model_path=None,
         video_data_path_test=video_data_path_val,
         n_test_samples=n_val_samples,
         video_name=None):
    #    test_data = val_data   # to evaluate on testing data or validation data
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)

    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_test)
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \
            tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss = model.build_model(train_caption_id, train_caption_id_1,
                                    train_caption_label)
        val_s2s_tf, s2s_lstm3_vars_tf = model.build_s2s_generator(
            val_caption_id_1)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True))

    with tf.device(cpu_device):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        print 'load parameters from:', model_path

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    ######### test sentence generation ##########
    print 'testing...'
    n_test_steps = int(n_test_samples / batch_size)
    print 'n_test_steps:', n_test_steps
    tstart = time.time()
    ### TODO: sometimes the COCO test throws exceptions at the beginning of training ####
    if test_s2s:
        #        [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
        #        for i, key in enumerate(pred_dict.keys()):
        #            print 'video:', flist[i]
        #            for ele in gt_dict[key]:
        #                print "GT:  " + ele['caption']
        #            print "PD:  " + pred_dict[key][0]['caption']
        #            print '-------'
        print '############## sentence to sentence result #################'
        [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
         flist] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf,
                              val_fname)
        if os.path.isfile('demo_s2s.txt.videos'):
            video_name = pickle.load(open('demo_s2s.txt.videos', "rb"))
        if video_name:
            for i, key in enumerate(pred_dict.keys()):
                if flist[i] in video_name:
                    print flist[i]
                    for ele in gt_dict[key]:
                        print "GT:  " + ele['caption']
                    print "PD:  " + pred_dict[key][0]['caption']
                    print '-----------'
        scorer = COCOScorer()
        total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
        print '############## sentence to sentence result #################'

    if save_demo_sent_s2s:
        get_demo_sentence(sess,
                          n_test_steps,
                          ixtoword,
                          val_s2s_tf,
                          val_fname,
                          result_file='demo_s2s.txt')

    sys.stdout.flush()
    coord.request_stop()
    coord.join(threads)
    tstop = time.time()
    print "Total Time Cost:", round(tstop - tstart, 2), "s"
    sess.close()
Example #7
def train():
    assert os.path.isdir(home_folder)
    assert os.path.isfile(video_data_path_train)
    assert os.path.isfile(video_data_path_val)
    assert os.path.isdir(model_path)
    print 'load meta data...'
    wordtoix = np.load(home_folder + 'data0/msvd_wordtoix.npy').tolist()
    ixtoword = pd.Series(
        np.load(home_folder + 'data0/msvd_ixtoword.npy').tolist())
    print 'build model and session...'
    # shared parameters on the GPU
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(dim_image=dim_image,
                                        n_words=len(wordtoix),
                                        dim_hidden=dim_hidden,
                                        batch_size=batch_size,
                                        n_caption_steps=n_caption_steps,
                                        n_video_steps=n_video_steps,
                                        drop_out_rate=0.5,
                                        bias_init_vector=None)
    tStart_total = time.time()
    n_epoch_steps = int(n_train_samples / batch_size)
    n_steps = n_epochs * n_epoch_steps
    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _ = read_and_decode(video_data_path_val)
        # random batches
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \
            tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss = model.build_model(train_caption_id, train_caption_id_1,
                                    train_caption_label)
        val_caption_tf, val_lstm3_variables_tf = model.build_sent_generator(
            val_caption_id_1)

    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))
    # check for model file
    with tf.device(cpu_device):
        saver = tf.train.Saver(max_to_keep=100)
    ckpt = tf.train.get_checkpoint_state(model_path)
    global_step = 0
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        #        print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True)
        global_step = get_model_step(ckpt.model_checkpoint_path)
        print 'global_step:', global_step
    else:
        print("Created model with fresh parameters.")
        sess.run(tf.global_variables_initializer())
    temp = set(tf.global_variables())
    # train on the GPU
    with tf.device("/gpu:0"):
        #        train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gvs = optimizer.compute_gradients(tf_loss)
        # when a variable is not related to the loss, its grad is returned as None
        clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var)
                    for grad, var in gvs if grad is not None]
        # apply the clipped gradients (the original snippet applied the raw gvs,
        # leaving clip_gvs unused)
        train_op = optimizer.apply_gradients(clip_gvs)

    ## initialize variables added for the optimizer
    sess.run(tf.variables_initializer(set(tf.global_variables()) - temp))
    # initialize epoch variable in queue reader
    sess.run(tf.local_variables_initializer())
    loss_epoch = 0
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # write graph architecture to file
    summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph)
    loss_summary = tf.summary.scalar('loss', tf_loss)
    epoch = global_step
    for step in xrange(1, n_steps + 1):
        tStart = time.time()
        _, loss_val = sess.run([train_op, tf_loss])
        tStop = time.time()
        print "step:", step, " Loss:", loss_val
        print "Time Cost:", round(tStop - tStart, 2), "s"
        loss_epoch += loss_val

        if step % n_epoch_steps == 0:
            epoch += 1
            loss_epoch /= n_epoch_steps
            with tf.device(cpu_device):
                saver.save(sess,
                           os.path.join(model_path, 'model'),
                           global_step=epoch)


            # print 'z:', z[0, :10]
            print 'epoch:', epoch, 'loss:', loss_epoch
            loss_epoch = 0
            n_val_steps = int(n_val_samples / batch_size)
            ######### test sentence generation ##########
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
             _] = testing_all(sess, 1, ixtoword, val_caption_tf, val_fname)
            for key in pred_dict.keys():
                for ele in gt_dict[key]:
                    print "GT:  " + ele['caption']
                print "PD:  " + pred_dict[key][0]['caption']
                print '-------'
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict,
             _] = testing_all(sess, n_val_steps, ixtoword, val_caption_tf,
                              val_fname)
            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

            #### summary #####
            summary = sess.run(loss_summary)
            summary_writer.add_summary(summary, epoch)
            sys.stdout.flush()

        sys.stdout.flush()

    coord.request_stop()
    coord.join(threads)
    print "Finally, saving the model ..."
    with tf.device(cpu_device):
        saver.save(sess,
                   os.path.join(model_path, 'model'),
                   global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
    sess.close()
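Both train() variants above snapshot tf.global_variables() into temp before building the optimizer and then initialize only the set difference, so checkpointed weights restored by the Saver are not clobbered by the optimizer's freshly created slot variables. A TF1-style sketch of the trick (assumes TensorFlow 1.x, as in these examples; the toy variable x stands in for the restored model):

import tensorflow as tf

x = tf.Variable(1.0, name='x')
loss = tf.square(x)
temp = set(tf.global_variables())              # variables that already exist
train_op = tf.train.AdamOptimizer(0.01).minimize(loss)  # adds Adam slots
init_new_vars = tf.variables_initializer(set(tf.global_variables()) - temp)

sess = tf.Session()
sess.run(x.initializer)                        # stands in for saver.restore()
sess.run(init_new_vars)                        # touches only the new slot variables
sess.run(train_op)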
Example #8
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

#    np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword)
#    np.save('./data'+str(gpu_id)+'/wordtoix', wordtoix)
#    sys.exit()
    ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
    wordtoix = pd.Series(np.load('./data_all/wordtoix.npy').tolist())

    model = Video_Caption_Generator(
            dim_image=dim_image,
            dim_tracker=dim_tracker,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            tracker_cnt=tracker_cnt,
            drop_out_rate=0.5,
            bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_tracker, tf_tracker_mask, tf_caption, tf_caption_mask = model.build_model()
    #loss_summary = tf.scalar_summary("Loss",tf_loss)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    #merged = tf.merge_all_summaries()
    #writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def)

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, 'models/model-0')

    tStart_total = time.time()
    nr_prefetch = 3  # number of batches the loader threads keep ahead
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        ## init queue
        data_queue = mp.Queue(nr_prefetch)
#        tracker_queue = mp.Queue(nr_prefetch)
        title_queue = mp.Queue(nr_prefetch)
        t1 = Thread(target=load_data_into_queue, args=(train_data, data_queue, 'data'))
#        t2 = Thread(target=load_data_into_queue, args=(train_data, tracker_queue, 'tracker'))
        t3 = Thread(target=load_data_into_queue, args=(train_data, title_queue, 'title'))
        t1.start()
#        t2.start()
        t3.start()
        for current_batch_file_idx in range(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            
            if 'tracker' in current_batch.keys():
                current_tracker = np.array(current_batch['tracker'])
            else:
                current_tracker = np.zeros((batch_size, tracker_cnt, dim_tracker))
            
            if 'tracker_mask' in current_batch.keys():
                current_tracker_mask = np.array(current_batch['tracker_mask'])
            else:
                current_tracker_mask = np.zeros((batch_size, tracker_cnt))

#            current_tracker = tracker_queue.get()
            current_batch_data = data_queue.get()
            current_batch_title = title_queue.get()
            for ind in xrange(batch_size):
                current_feats[ind,:,:] = current_batch_data[:,ind,:]
                idx = np.where(current_batch['label'][:,ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind,idx[-1]] = 1

            current_captions = current_batch_title
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=35-1)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix ))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            current_batch.close()


            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                tf_video: current_feats,
                tf_video_mask : current_video_masks,
                tf_tracker : current_tracker,
                tf_tracker_mask : current_tracker_mask,
                tf_caption: current_caption_matrix,
                tf_caption_mask: current_caption_masks
                })
            #writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart,2), "s"

        t1.join()
#       t2.join()
        t3.join()
        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s"
        sys.stdout.flush()

        if np.mod(epoch, 2) == 0:
            print "Epoch ", epoch, " is done. Saving the model ..."
            with tf.device('/cpu:0'):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
        if np.mod(epoch, 10) == 0:
            current_batch = h5py.File(val_data[np.random.randint(0,len(val_data))])
            video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
#            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, train_data[-2:], ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
#            for key in pred_dict.keys():
#                for ele in gt_dict[key]:
#                    print "GT:  " + ele['caption']
#                print "PD:  " + pred_dict[key][0]['caption']
#                print '-------'

            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, val_data, ixtoword, video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)

            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

    print "Finally, saving the model ..."
    with tf.device('/cpu:0'):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total,2), "s"
def test(model_path=None,
    video_data_path_test='/home/shenxu/data/msvd_feat_vgg_c3d_frame/test.tfrecords',
    n_test_samples=27020):
#    test_data = val_data   # to evaluate on testing data or validation data
    wordtoix = np.load(wordtoix_file).tolist()
    ixtoword = pd.Series(np.load(ixtoword_file).tolist())
    with tf.device("/gpu:0"):
        model = Video_Caption_Generator(
                dim_image=dim_image,
                n_words=len(wordtoix),
                dim_hidden=dim_hidden,
                batch_size=batch_size,
                n_caption_steps=n_caption_steps,
                n_video_steps=n_video_steps,
                drop_out_rate = 0.5,
                bias_init_vector=None)

    # preprocess on the CPU
    with tf.device('/cpu:0'):
        train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \
            _, _, _, _, train_frame_data = read_and_decode_with_frame(video_data_path_train)
        val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \
            _, _, _, _, val_frame_data = read_and_decode_with_frame(video_data_path_test)
        train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data = \
            tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data],
                batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples)
        val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data = \
            tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data],
                batch_size=batch_size, num_threads=1, capacity=2 * batch_size)
    # graph on the GPU
    with tf.device("/gpu:0"):
        tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \
            = model.build_model(train_data, train_frame_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label)
        val_v2s_tf,v2s_lstm3_vars_tf = model.build_v2s_generator(val_data)
        val_s2s_tf, s2s_lstm3_vars_tf = model.build_s2s_generator(val_caption_id_1)
        val_s2v_tf, s2v_lstm4_vars_tf = model.build_s2v_generator(val_caption_id_1, val_frame_data)
        val_v2v_tf, v2v_lstm4_vars_tf = model.build_v2v_generator(val_data, val_frame_data)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.device(cpu_device):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        print 'load parameters from:', model_path

#    print 'halve the dropout weights..'
#    for ind, row in enumerate(v2s_lstm3_vars_tf):
#        if ind % 4 == 0:
#                assign_op = row.assign(tf.multiply(row,1-0.5))
#                sess.run(assign_op)
#    for ind, row in enumerate(s2s_lstm2_vars_tf):
#        if ind % 4 == 0:
#                assign_op = row.assign(tf.multiply(row,1-0.5))
#                sess.run(assign_op)
#    for ind, row in enumerate(s2v_lstm4_vars_tf):
#        if ind % 4 == 0:
#                assign_op = row.assign(tf.multiply(row,1-0.5))
#                sess.run(assign_op)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    ######### test sentence generation ##########
    print 'testing...'
    n_test_steps = int(n_test_samples / batch_size)
    print 'n_test_steps:', n_test_steps
    tstart = time.time()
    ### TODO: sometimes the COCO test throws exceptions at the beginning of training ####
    if test_v2s:
        try:
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname)
            for i, key in enumerate(pred_dict.keys()):
                print 'video:', flist[i]
                for ele in gt_dict[key]:
                    print "GT:  " + ele['caption']
                print "PD:  " + pred_dict[key][0]['caption']
                print '-------'
            print '############## video to sentence result #################'
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname)
            scorer  = COCOScorer()
            total_score_1 = scorer.score(gt_dict, pred_dict, id_list)
            print '############## video to sentence result #################'
        except Exception, e:
            print 'v2s bleu test exception:', e

    if test_s2s:
        try:
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname)
            for i, key in enumerate(pred_dict.keys()):
                print 'video:', flist[i]
                for ele in gt_dict[key]:
                    print "GT:  " + ele['caption']
                print "PD:  " + pred_dict[key][0]['caption']
                print '-------'
            print '############## sentence to sentence result #################'
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname)
            scorer = COCOScorer()
            total_score_2 = scorer.score(gt_dict, pred_dict, id_list)
            print '############## sentence to sentence result #################'
        except Exception, e:
            print 's2s bleu test exception:', e

    ######### test video generation #############
    if test_v2v:
        mse_v2v = test_all_videos(sess, n_test_steps, val_data, val_v2v_tf, val_video_label, pixel_scale_factor)
        print 'video2video mse:', mse_v2v
    if test_s2v:
        mse_s2v = test_all_videos(sess, n_test_steps, val_data, val_s2v_tf, val_video_label, pixel_scale_factor)
        print 'caption2video mse:', mse_s2v
    if save_demo_sent_v2s:
        get_demo_sentence(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname, result_file='demo_v2s.txt')
    if save_demo_sent_s2s:
        get_demo_sentence(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname, result_file='demo_s2s.txt')
Example #11
def train():
    meta_data, train_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

    np.save('./data/ixtoword', ixtoword)

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            drop_out_rate=0.5,
            bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_video_len, tf_caption, tf_caption_mask, tf_HLness, tf_HLness_mask, tf_HLness_att_mask = model.build_model()
    loss_summary = tf.scalar_summary("Loss",tf_loss)
    sess = tf.InteractiveSession()
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def)

    saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]

        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):

            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_HLness = np.zeros((batch_size, n_frame_step))
            current_HLness_masks = np.zeros((batch_size, n_frame_step))
            current_HLness_att_masks = np.zeros((batch_size, n_frame_step))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind,:,:] = current_batch['data'][:,ind,:]
                idx = np.where(current_batch['label'][:,ind] != -1)[0]
                if len(idx) == 0:
                    continue
                idy = np.where(current_batch['label'][:,ind] == 1)[0]
                if len(idy) == 0:
                    continue
                current_HLness[ind,idx] = current_batch['label'][idx,ind]
                current_HLness_masks[ind,idx] = 1
                current_video_masks[ind,idy[-1]] = 1
                current_video_len[ind] = idx[-1] + 1
                current_HLness_att_masks[ind,idy] = 1
                # open the attention mask on up to 5 context frames before the
                # first highlight frame
                if idy[0] > 4:
                    current_HLness_att_masks[ind,idy[0]-5:idy[0]] = 1
                else:
                    current_HLness_att_masks[ind,0:idy[0]] = 1

            current_captions = current_batch['title']
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=15-1)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix ))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            _, loss_val, summary_str = sess.run(
                    [train_op, tf_loss, merged],
                    feed_dict={
                        tf_video: current_feats,
                        tf_video_mask : current_video_masks,
                        tf_caption: current_caption_matrix,
                        tf_caption_mask: current_caption_masks,
                        tf_HLness: current_HLness,
                        tf_HLness_mask: current_HLness_masks,
                        tf_HLness_att_mask: current_HLness_att_masks
                        })
            writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart,2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s"

        if np.mod(epoch, 20) == 0:
            print "Epoch ", epoch, " is done. Saving the model ..."
            saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

            current_batch = h5py.File(test_data[np.random.randint(0,len(test_data))])
            video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf, lstmRNN_variables_tf, lstm3_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())
            #[mp, pred_sent, gt_sent, HLness] = testing_one(sess, current_batch, ixtoword, video_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf)
            [mp, pred_sent, gt_sent, HLness] = testing_all(sess, test_data, ixtoword, video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf)
            #for xxx in xrange(current_batch['label'].shape[1]):
            #    print gt_sent[xxx]
            #    print pred_sent[xxx]
            total_score = np.mean(mp)
            print total_score
            scorer = COCOScorer()
            total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))

    print "Finally, saving the model ..."
    saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total,2), "s"
Example #12
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

#    np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword)
#    np.save('./data'+str(gpu_id)+'/wordtoix', wordtoix)
#    sys.exit()
    ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
    wordtoix = pd.Series(np.load('./data_all/wordtoix.npy').tolist())

    model = Video_Caption_Generator(
            dim_image=dim_image,
            dim_tracker=dim_tracker,
            n_words=len(wordtoix),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            tracker_cnt=tracker_cnt,
            drop_out_rate=0.5,
            bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_tracker, tf_tracker_mask, tf_caption, tf_caption_mask = model.build_model()
    #loss_summary = tf.scalar_summary("Loss",tf_loss)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    #merged = tf.merge_all_summaries()
    #writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def)

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    saver.restore(sess, 'models/model-0')

    tStart_total = time.time()
    nr_prefetch = 3  # number of batches the loader threads keep ahead
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]
        
        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        ## init queue
        data_queue = mp.Queue(nr_prefetch)
#        tracker_queue = mp.Queue(nr_prefetch)
        title_queue = mp.Queue(nr_prefetch)
        t1 = Thread(target=load_data_into_queue, args=(train_data, data_queue, 'data'))
#        t2 = Thread(target=load_data_into_queue, args=(train_data, tracker_queue, 'tracker'))
        t3 = Thread(target=load_data_into_queue, args=(train_data, title_queue, 'title'))
        t1.start()
#        t2.start()
        t3.start()
        for current_batch_file_idx in range(len(train_data)):
            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            
            if 'tracker' in current_batch.keys():
                current_tracker = np.array(current_batch['tracker'])
            else:
                current_tracker = np.zeros((batch_size, tracker_cnt, dim_tracker))
            
            if 'tracker_mask' in current_batch.keys():
                current_tracker_mask = np.array(current_batch['tracker_mask'])
            else:
                current_tracker_mask = np.zeros((batch_size, tracker_cnt))

#            current_tracker = tracker_queue.get()
            current_batch_data = data_queue.get()
            current_batch_title = title_queue.get()
            for ind in range(batch_size):
                current_feats[ind,:,:] = current_batch_data[:,ind,:]
                idx = np.where(current_batch['label'][:,ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind,idx[-1]] = 1

            current_captions = current_batch_title
            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=35-1)
            current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix ))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            current_batch.close()


            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                tf_video: current_feats,
                tf_video_mask : current_video_masks,
                tf_tracker : current_tracker,
                tf_tracker_mask : current_tracker_mask,
                tf_caption: current_caption_matrix,
                tf_caption_mask: current_caption_masks
                })
            #writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            #print "Time Cost:", round(tStop - tStart,2), "s"

        t1.join()
#       t2.join()
        t3.join()
        print ("Epoch:", epoch, " done. Loss:", np.mean(loss_epoch))
        tStop_epoch = time.time()
        print ("Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s")
        sys.stdout.flush()

        if np.mod(epoch, 2) == 0:
            print ("Epoch ", epoch, " is done. Saving the model ...")
            with tf.device('/cpu:0'):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
        if np.mod(epoch, 10) == 0:
            current_batch = h5py.File(val_data[np.random.randint(0,len(val_data))])
            video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
#            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, train_data[-2:], ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)
#            for key in pred_dict.keys():
#                for ele in gt_dict[key]:
#                    print "GT:  " + ele['caption']
#                print "PD:  " + pred_dict[key][0]['caption']
#                print '-------'

            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, val_data, ixtoword, video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf)

            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

    print ("Finally, saving the model ...")
    with tf.device('/cpu:0'):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print ("Total Time Cost:", round(tStop_total - tStart_total,2), "s")
Example #13
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    captions = meta_data['Description'].values
    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(
        captions, word_count_threshold=1)

    np.save('./data0/ixtoword', ixtoword)

    model = Video_Caption_Generator(dim_image=dim_image,
                                    n_words=len(wordtoix),
                                    dim_hidden=dim_hidden,
                                    batch_size=batch_size,
                                    n_lstm_steps=n_frame_step,
                                    drop_out_rate=0.5,
                                    bias_init_vector=None)

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model()
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]

        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):

            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind, :, :] = current_batch['data'][:n_frame_step,
                                                                 ind, :]
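                # Valid frames run up to the last labeled one; clips with no
                # labels (all -1) keep an all-zero mask.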
                idx = np.where(current_batch['label'][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind, :idx[-1] + 1] = 1

            current_captions = current_batch['title']
            current_caption_ind = map(
                lambda cap: [
                    wordtoix[word] for word in cap.lower().split(' ')
                    if word in wordtoix
                ], current_captions)

            current_caption_matrix = sequence.pad_sequences(
                current_caption_ind, padding='post', maxlen=n_caption_step - 1)
            current_caption_matrix = np.hstack([
                current_caption_matrix,
                np.zeros([len(current_caption_matrix), 1])
            ]).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0],
                                              current_caption_matrix.shape[1]))
            nonzeros = np.array(
                map(lambda x: (x != 0).sum() + 1, current_caption_matrix))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks
                })
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
        #print "Time Cost:", round(tStop - tStart,2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s"

        if np.mod(epoch, 10) == 0 or epoch == n_epochs - 1:
            print "Epoch ", epoch, " is done. Saving the model ..."
            with tf.device("/cpu:0"):
                saver.save(sess,
                           os.path.join(model_path, 'model'),
                           global_step=epoch)

            current_batch = h5py.File(val_data[np.random.randint(
                0, len(val_data))])
            video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load('./data0/ixtoword.npy').tolist())
            [pred_sent, gt_sent, id_list, gt_dict,
             pred_dict] = testing_all(sess, train_data[-2:], ixtoword,
                                      video_tf, video_mask_tf, caption_tf)
            for key in pred_dict.keys():
                for ele in gt_dict[key]:
                    print "GT:  " + ele['caption']
                print "PD:  " + pred_dict[key][0]['caption']
                print '-------'
            [pred_sent, gt_sent, id_list, gt_dict,
             pred_dict] = testing_all(sess, val_data, ixtoword, video_tf,
                                      video_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)
        sys.stdout.flush()

    print "Finally, saving the model ..."
    with tf.device("/cpu:0"):
        saver.save(sess,
                   os.path.join(model_path, 'model'),
                   global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
Example #14
0
def train():
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test
    )
    captions = meta_data["Description"].values
    captions = map(lambda x: x.replace(".", ""), captions)
    captions = map(lambda x: x.replace(",", ""), captions)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1)

    np.save("./data" + str(gpu_id) + "/ixtoword", ixtoword)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(wordtoix),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        drop_out_rate=0.5,
        bias_init_vector=None,
    )

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model()
    loss_summary = tf.scalar_summary("Loss", tf_loss)
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter("/tmp/tf_log", sess.graph_def)
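    # Pre-1.0 TensorFlow summary API: tf.scalar_summary, tf.merge_all_summaries
    # and tf.train.SummaryWriter became tf.summary.scalar, tf.summary.merge_all
    # and tf.summary.FileWriter in TF 1.0.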

    saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
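    # Warm start: overwrite the freshly initialized weights with an earlier
    # checkpoint and continue training from there.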
    saver.restore(sess, "models_SS_youtube_notest_dummy/model-20")

    tStart_total = time.time()
    for epoch in range(n_epochs):
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]

        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))
        for current_batch_file_idx in xrange(len(train_data)):

            tStart = time.time()
            current_batch = h5py.File(train_data[current_batch_file_idx])
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))
            current_video_len = np.zeros(batch_size)
            for ind in xrange(batch_size):
                current_feats[ind, :, :] = current_batch["data"][:, ind, :]
                idx = np.where(current_batch["label"][:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
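                # Unlike the other examples, only the last labeled frame is
                # flagged here (idx[-1]), not the whole span up to it.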
                current_video_masks[ind, idx[-1]] = 1

            current_captions = current_batch["title"]
            current_caption_ind = map(
                lambda cap: [wordtoix[word] for word in cap.lower().split(" ") if word in wordtoix], current_captions
            )

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding="post", maxlen=16 - 1)
            current_caption_matrix = np.hstack(
                [current_caption_matrix, np.zeros([len(current_caption_matrix), 1])]
            ).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(map(lambda x: (x != 0).sum() + 1, current_caption_matrix))

            for ind, row in enumerate(current_caption_masks):
                row[: nonzeros[ind]] = 1

            _, loss_val, summary_str = sess.run(
                [train_op, tf_loss, merged],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks,
                },
            )
            writer.add_summary(summary_str, epoch)
            loss_epoch[current_batch_file_idx] = loss_val
            tStop = time.time()
            # print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val
            # print "Time Cost:", round(tStop - tStart,2), "s"

        print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch)
        tStop_epoch = time.time()
        print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s"
        sys.stdout.flush()

        if np.mod(epoch, 10) == 0:
            print "Epoch ", epoch, " is done. Saving the model ..."
            saver.save(sess, os.path.join(model_path, "model"), global_step=epoch)

            current_batch = h5py.File(val_data[np.random.randint(0, len(val_data))])
            video_tf, video_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator()
            ixtoword = pd.Series(np.load("./data" + str(gpu_id) + "/ixtoword.npy").tolist())
            [pred_sent, gt_sent] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf)
            for idx in range(len(pred_sent)):
                print "GT:  " + gt_sent[idx][0]["caption"]
                print "PD:  " + pred_sent[idx][0]["caption"]
                print "-------"
            [pred_sent, gt_sent] = testing_all(sess, val_data, ixtoword, video_tf, video_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent)))

    print "Finally, saving the model ..."
    saver.save(sess, os.path.join(model_path, "model"), global_step=n_epochs)
    tStop_total = time.time()
    print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"