Example #1
def test(model_path=default_model_path):
    test_videos = open(testing_data, 'r').read().split('\n')[:-1]

    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())
    bias_init_vector = np.load('./data/bias_init_vector.npy')

    model = Video_Caption_Generator(
            dim_image=dim_image,
            n_words=len(ixtoword),
            dim_hidden=dim_hidden,
            batch_size=batch_size,
            n_lstm_steps=n_frame_step,
            n_video_lstm_step=n_video_lstm_step,
            n_caption_lstm_step=n_caption_lstm_step,
            bias_init_vector=bias_init_vector)

    video_tf, video_mask_tf, caption_tf, _, _ = model.build_generator()

    sess = tf.InteractiveSession()

    saver = tf.train.Saver()
    try:
        print '\n=== Use model', model_path, '===\n'
        saver.restore(sess, model_path)
    except:
        print '\nUse default model\n'
        saver.restore(sess, default_model_path)

    with open('output.json', 'w') as out:
        generated_sentences = []
        for idx, video in enumerate(test_videos):
            print 'video =>', video

            video_feat_path = os.path.join(video_test_feat_path, video) + '.npy'
            video_feat = np.load(video_feat_path)[None,...]
            if video_feat.shape[1] == n_frame_step:
                video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
            else:
                continue
    
            generated_word_index = sess.run(caption_tf, feed_dict={video_tf: video_feat, video_mask_tf: video_mask})
            generated_words = ixtoword[generated_word_index]
    
            punctuation = np.argmax(np.array(generated_words) == '<eos>') + 1
            generated_words = generated_words[:punctuation] 
            generated_sentence = ' '.join(generated_words)
            generated_sentence = generated_sentence.replace('<unk> ', '')
            generated_sentence = generated_sentence.replace('<bos> ', '')
            generated_sentence = generated_sentence.replace(' <eos>', '')

            print 'generated_sentence =>', generated_sentence

            generated_sentences.append({"caption": generated_sentence, "id": video})
        json.dump(generated_sentences, out, indent=4)
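
A minimal, self-contained sketch of the sentence post-processing above (the word list is hypothetical; the special tokens match the ones the function strips):

import numpy as np

# hypothetical decoder output, already mapped from indices to words
generated_words = ['<bos>', 'a', 'man', 'is', '<unk>', 'playing', 'guitar', '<eos>', 'a']
# index one past the first '<eos>'
punctuation = np.argmax(np.array(generated_words) == '<eos>') + 1
generated_sentence = ' '.join(generated_words[:punctuation])
generated_sentence = generated_sentence.replace('<unk> ', '')
generated_sentence = generated_sentence.replace('<bos> ', '')
generated_sentence = generated_sentence.replace(' <eos>', '')
print(generated_sentence)  # -> a man is playing guitar
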
Example #2
def train():
    train_data = get_video_train_data(video_train_data_path,
                                      video_train_feat_path)
    train_captions = train_data['Description'].values
    test_data = get_video_test_data(video_test_data_path, video_test_feat_path)
    test_captions = test_data['Description'].values

    captions_list = list(train_captions) + list(test_captions)
    captions = np.asarray(captions_list, dtype=np.object)

    captions = map(lambda x: x.replace('.', ''), captions)
    captions = map(lambda x: x.replace(',', ''), captions)
    captions = map(lambda x: x.replace('"', ''), captions)
    captions = map(lambda x: x.replace('\n', ''), captions)
    captions = map(lambda x: x.replace('?', ''), captions)
    captions = map(lambda x: x.replace('!', ''), captions)
    captions = map(lambda x: x.replace('\\', ''), captions)
    captions = map(lambda x: x.replace('/', ''), captions)

    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(
        captions, word_count_threshold=word_count_threshold)

    np.save("./data/wordtoix", wordtoix)
    np.save('./data/ixtoword', ixtoword)
    np.save("./data/bias_init_vector", bias_init_vector)

    model = Video_Caption_Generator(batch_size=batch_size,
                                    n_words=len(wordtoix),
                                    dim_hidden=dim_hidden,
                                    dim_image=dim_image,
                                    n_video_lstm_step=n_video_lstm_step,
                                    n_caption_lstm_step=n_caption_lstm_step,
                                    bias_init_vector=bias_init_vector)

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, tf_probs = model.build_model(
    )

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    # sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    # sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
    sess = tf.InteractiveSession()

    # saver = tf.train.Saver(max_to_keep=100, write_version=1)
    saver = tf.train.Saver()
    with tf.variable_scope(tf.get_variable_scope(), reuse=False):
        train_op = tf.train.RMSPropOptimizer(learning_rate).minimize(tf_loss)
        # train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(tf_loss)
        # train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.global_variables_initializer().run()

    for epoch in range(0, n_epochs):
        index = list(train_data.index)
        np.random.shuffle(index)
        train_data = train_data.ix[index]

        # pick one caption row at random for each video for this epoch
        # (note: DataFrame.ix and .irow were removed in later pandas releases;
        #  the modern equivalents are .loc[...] and .iloc[...])
        current_train_data = train_data.groupby('video_path').apply(
            lambda x: x.irow(np.random.choice(len(x))))
        current_train_data = current_train_data.reset_index(drop=True)

        for start, end in zip(
                range(0, len(current_train_data), batch_size),
                range(batch_size, len(current_train_data), batch_size)):

            start_time = time.time()

            current_batch = current_train_data[start:end]
            current_videos = current_batch['video_path'].values

            current_feats = np.zeros(
                (batch_size, n_video_lstm_step, dim_image))
            try:
                current_feats_vals = map(lambda vid: np.load(vid),
                                         current_videos)
            except:
                continue

            current_video_masks = np.zeros((batch_size, n_video_lstm_step))

            for ind, feat in enumerate(current_feats_vals):
                current_feats[ind][:len(current_feats_vals[ind])] = feat
                current_video_masks[ind][:len(current_feats_vals[ind])] = 1

            current_captions = current_batch['Description'].values
            current_captions = map(lambda x: '<bos> ' + x, current_captions)
            current_captions = map(lambda x: x.replace('.', ''),
                                   current_captions)
            current_captions = map(lambda x: x.replace(',', ''),
                                   current_captions)
            current_captions = map(lambda x: x.replace('"', ''),
                                   current_captions)
            current_captions = map(lambda x: x.replace('\n', ''),
                                   current_captions)
            current_captions = map(lambda x: x.replace('?', ''),
                                   current_captions)
            current_captions = map(lambda x: x.replace('!', ''),
                                   current_captions)
            current_captions = map(lambda x: x.replace('\\', ''),
                                   current_captions)
            current_captions = map(lambda x: x.replace('/', ''),
                                   current_captions)

            for idx, each_cap in enumerate(current_captions):
                word = each_cap.lower().split(' ')
                if len(word) < n_caption_lstm_step:
                    current_captions[idx] = current_captions[idx] + ' <eos>'
                else:
                    new_word = ''
                    for i in range(n_caption_lstm_step - 1):
                        new_word = new_word + word[i] + ' '
                    current_captions[idx] = new_word + '<eos>'

            current_caption_ind = []
            for cap in current_captions:
                current_word_ind = []
                for word in cap.lower().split(' '):
                    if word in wordtoix:
                        current_word_ind.append(wordtoix[word])
                    else:
                        current_word_ind.append(wordtoix['<unk>'])
                current_caption_ind.append(current_word_ind)

            current_caption_matrix = sequence.pad_sequences(
                current_caption_ind,
                padding='post',
                maxlen=n_caption_lstm_step)
            current_caption_matrix = np.hstack([
                current_caption_matrix,
                np.zeros([len(current_caption_matrix), 1])
            ]).astype(int)
            current_caption_masks = np.zeros((current_caption_matrix.shape[0],
                                              current_caption_matrix.shape[1]))
            nonzeros = np.array(
                map(lambda x: (x != 0).sum() + 1, current_caption_matrix))

            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            probs_val = sess.run(tf_probs,
                                 feed_dict={
                                     tf_video: current_feats,
                                     tf_caption: current_caption_matrix
                                 })

            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks
                })

            print('idx: ', start, " Epoch: ", epoch, " loss: ", loss_val,
                  ' Elapsed time: ', str((time.time() - start_time)))

        if np.mod(epoch, 10) == 0:
            print("Epoch ", epoch, " is done. Saving the model ...")
            saver.save(sess, os.path.join(model_path, 'model-' + str(epoch)))
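
A small standalone sketch of how the caption matrix and caption masks are built in the loop above (toy word-index sequences; assumes the same Keras sequence module the script already relies on):

import numpy as np
from keras.preprocessing import sequence

n_caption_lstm_step = 4
current_caption_ind = [[2, 5, 7], [2, 5, 7, 9]]  # toy sequences, 0 is the pad index

current_caption_matrix = sequence.pad_sequences(
    current_caption_ind, padding='post', maxlen=n_caption_lstm_step)
# append one extra all-zero column so every row has a step after its last word
current_caption_matrix = np.hstack([
    current_caption_matrix,
    np.zeros([len(current_caption_matrix), 1])
]).astype(int)
# the mask covers every real word plus one step beyond it
current_caption_masks = np.zeros(current_caption_matrix.shape)
nonzeros = np.array([(row != 0).sum() + 1 for row in current_caption_matrix])
for ind, row in enumerate(current_caption_masks):
    row[:nonzeros[ind]] = 1

print(current_caption_matrix)   # [[2 5 7 0 0] [2 5 7 9 0]]
print(current_caption_masks)    # ones over the first 4 and 5 positions respectively
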
Example #3
def test(args):

	assert os.path.isfile(os.path.join(args.init_from,"config.pkl")), "config.pkl file does not exist in path %s" % args.init_from
	# open old config and check if models are compatible
	with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
		saved_args = cPickle.load(f)

	# fill in any arguments missing from older saved configs so the versions stay compatible
	if("attention" in vars(saved_args)):
		print("attention: %d" % vars(saved_args)["attention"])
	else:
		vars(saved_args)["attention"] = 0

	if("schedule_sampling" in vars(saved_args)):
		print("schedule_sampling: %d" % vars(saved_args)["schedule_sampling"])
	else:
		vars(saved_args)["schedule_sampling"] = 0.0

	with open(os.path.join(args.init_from, 'vocab.pkl'), 'rb') as f:
		vocab = cPickle.load(f)

	vocab_inv = {v:k for k, v in vocab.items()}

	with open(args.testing_file,'r') as f:
	    test_feat_id = f.readlines()
	    for i in range(len(test_feat_id)):
	        test_feat_id[i] = test_feat_id[i].replace('\n','')

	model = Video_Caption_Generator(saved_args,n_vocab=len(vocab),infer=True)
	
	with tf.Session() as sess:
		result = []
		for i in range(len(test_feat_id)):
			tf.global_variables_initializer().run()
			saver = tf.train.Saver()
			ckpt = tf.train.get_checkpoint_state(args.init_from)

			if ckpt and ckpt.model_checkpoint_path: # args.init_from is not None:
				saver.restore(sess, ckpt.model_checkpoint_path)
				if i == 0:
					print("Model restored %s" % ckpt.model_checkpoint_path)
			sess.run(tf.global_variables())
			if i == 0:
				print("Initialized")
			
			this_test_feat_id = test_feat_id[i]

			# get video features
			# note: the second argument to get_video_feat must be an np.array
			current_feat, current_feat_mask = get_video_feat(args.testing_path, np.array([this_test_feat_id]))
			
			this_gen_idx, probs = sess.run([model.gen_caption_idx,model.pred_probs],feed_dict={
										model.video: current_feat,
										model.video_mask : current_feat_mask
										})

			this_gen_words = []

			for k in range(len(this_gen_idx)):
				this_gen_words.append(vocab_inv.get(this_gen_idx[k],'<PAD>'))


			this_gen_words = np.array(this_gen_words)

			punctuation = np.argmax(this_gen_words == '<EOS>') + 1
			
			if punctuation > 1:
				this_gen_words = this_gen_words[:punctuation]


			this_caption = ' '.join(this_gen_words)
			this_caption = this_caption.replace('<BOS> ', '')
			this_caption = this_caption.replace(' <EOS>', '')

			this_answer = {}
			this_answer['caption'] = this_caption
			this_answer['id'] = this_test_feat_id

			print('Id: %s, caption: %s' % (this_test_feat_id, this_caption))

			result.append(this_answer)

		with open(args.result_file, 'w') as fout:
			json.dump(result, fout)
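
One design note on the loop above: the Saver is recreated and the checkpoint is restored from disk once per test video, even though the weights never change during inference. Hoisting tf.train.Saver(), get_checkpoint_state and saver.restore out of the for loop would produce the same captions while reading the checkpoint only once; the per-iteration sess.run(tf.global_variables()) fetch is likewise not needed for decoding.
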
Example #4
def train(args):
    if args.init_from is not None:
        # check if all necessary files exist
        assert os.path.isfile(
            os.path.join(args.init_from, "config.pkl")
        ), "config.pkl file does not exist in path %s" % args.init_from

        # get ckpt
        ckpt = tf.train.get_checkpoint_state(args.init_from)

        # get vocab
        with open(os.path.join(args.init_from, 'vocab.pkl'), 'rb') as f:
            vocab = cPickle.load(f)
        vocab_inv = {v: k for k, v in vocab.items()}

        # read data
        _, _, train_feat_id, train_caption, test_feat_id, test_caption = data_preprocess(
            args.train_label_json, args.test_label_json)

        # open old config and check if models are compatible
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_args = cPickle.load(f)
        need_be_same = [
            "dim_image", "dim_hidden", "n_lstm_step", "n_video_step",
            "n_caption_step"
        ]
        for checkme in need_be_same:
            assert vars(saved_args)[checkme] == vars(
                args
            )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        # fill in any arguments missing from older saved configs so the versions stay compatible
        if ("schedule_sampling" in vars(saved_args)):
            print("schedule_sampling: %d" %
                  vars(saved_args)["schedule_sampling"])
        else:
            vars(saved_args)["schedule_sampling"] = 0.0

    else:
        with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
            cPickle.dump(args, f)

        vocab, vocab_inv, train_feat_id, train_caption, test_feat_id, test_caption = data_preprocess(
            args.train_label_json, args.test_label_json)

        with open(os.path.join(args.save_dir, 'vocab.pkl'), 'wb') as f:
            cPickle.dump(vocab, f)

    model = Video_Caption_Generator(args, n_vocab=len(vocab), infer=False)

    # add gpu options
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        tf.global_variables_initializer().run()
        print("Initialized")

        saver = tf.train.Saver(tf.global_variables())
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        loss_fd = open('log/loss.txt', 'w')
        loss_to_draw = []

        for epoch in range(0, args.n_epoch):
            if (model.schedule_sampling > 0.0):
                # [pseudo] prob of schedule sampling linearly increases with epochs
                model.schedule_sampling = np.min(
                    [model.schedule_sampling * (1.0 + epoch / 50), 1.0])
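                # note: the value is updated in place, so growth compounds across
                # epochs; under Python 2, epoch / 50 is integer division and stays 0
                # until epoch 50, so the probability holds its initial value before
                # that and then climbs quickly to the 1.0 cap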

            # shuffle
            index = np.array(range(len(train_feat_id)))
            np.random.shuffle(index)
            epoch_train_feat_id = train_feat_id[index]
            epoch_train_caption = train_caption[index]

            loss_to_draw_epoch = []

            for start, end in zip(
                    range(0, len(epoch_train_feat_id), model.batch_size),
                    range(model.batch_size, len(epoch_train_feat_id),
                          model.batch_size)):
                # for start,end in zip(range(0,2,2),range(2,4,2)):
                start_time = time.time()

                # get one minibatch
                batch_feat_id = epoch_train_feat_id[start:end]
                batch_caption = epoch_train_caption[start:end]

                # get video features
                current_feat, current_feat_mask = get_video_feat(
                    args.train_video_feat_path, batch_feat_id)

                # randomly select one caption for each video and get padded captions with maxlen = 20
                current_caption, current_caption_mask = get_padding_caption(
                    vocab, batch_caption, maxlen=model.n_caption_step + 1)

                # run train_op to minimize tf_loss
                _, loss_val = sess.run(
                    [model.train_op, model.tf_loss],
                    feed_dict={
                        model.video: current_feat,
                        model.video_mask: current_feat_mask,
                        model.caption: current_caption,
                        model.caption_mask: current_caption_mask
                    })
                loss_to_draw_epoch.append(loss_val)

                print('idx: ', start, " Epoch: ", epoch, " loss: ", loss_val,
                      ' Elapsed time: ', str((time.time() - start_time)))
                loss_fd.write('epoch ' + str(epoch) + ' loss ' +
                              str(loss_val) + '\n')
            if np.mod(epoch, args.save_every) == 0:
                checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=epoch)
                print("Epoch ", epoch,
                      "model saved to {}".format(checkpoint_path))
        loss_fd.close()
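
A side note on the minibatch loop used in both training scripts: pairing range(0, N, batch_size) with range(batch_size, N, batch_size) through zip silently drops the final partial batch. A tiny illustration (hypothetical sizes):

batch_size = 32
N = 100
print(list(zip(range(0, N, batch_size), range(batch_size, N, batch_size))))
# -> [(0, 32), (32, 64), (64, 96)]; the last 4 examples are skipped that epoch
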
Example #5
File: test.py  Project: pochih/Video-Cap
def test(model_path=default_model_path):
    test_videos = open(testing_data, 'r').read().split('\n')[:-1]
    with open(testing_label) as data_file:
        test_labels = json.load(data_file)

    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    bias_init_vector = np.load('./data/bias_init_vector.npy')

    model = Video_Caption_Generator(dim_image=dim_image,
                                    n_words=len(ixtoword),
                                    dim_hidden=dim_hidden,
                                    batch_size=batch_size,
                                    n_lstm_steps=n_frame_step,
                                    n_video_lstm_step=n_video_lstm_step,
                                    n_caption_lstm_step=n_caption_lstm_step,
                                    bias_init_vector=bias_init_vector)

    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator(
    )

    sess = tf.InteractiveSession()

    saver = tf.train.Saver()
    try:
        print '\n=== Use model', model_path, '===\n'
        saver.restore(sess, model_path)
    except:
        print '\nUse default model\n'
        saver.restore(sess, default_model_path)

    with open('S2VT_prediction.json', 'w') as out:
        generated_sentences = []
        bleu_score_avg = [0., 0.]
        for idx, video in enumerate(test_videos):
            print 'video =>', video

            video_feat_path = os.path.join(video_test_feat_path,
                                           video) + '.npy'
            video_feat = np.load(video_feat_path)[None, ...]
            #video_feat = np.load(video_feat_path)
            #video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
            if video_feat.shape[1] == n_frame_step:
                video_mask = np.ones(
                    (video_feat.shape[0], video_feat.shape[1]))
            else:
                continue
                #shape_templete = np.zeros(shape=(1, n_frame_step, 4096), dtype=float )
                #shape_templete[:video_feat.shape[0], :video_feat.shape[1], :video_feat.shape[2]] = video_feat
                #video_feat = shape_templete
                #video_mask = np.ones((video_feat.shape[0], n_frame_step))

            generated_word_index = sess.run(caption_tf,
                                            feed_dict={
                                                video_tf: video_feat,
                                                video_mask_tf: video_mask
                                            })
            generated_words = ixtoword[generated_word_index]

            punctuation = np.argmax(np.array(generated_words) == '<eos>') + 1
            generated_words = generated_words[:punctuation]
            generated_sentence = ' '.join(generated_words)
            generated_sentence = generated_sentence.replace('<bos> ', '')
            generated_sentence = generated_sentence.replace(' <eos>', '')

            bleu_score = 0.
            print 'generated_sentence =>', generated_sentence
            for reference_sentence in test_labels[idx]['caption']:
                bleu_score += bleu_eval.BLEU_new(generated_sentence,
                                                 reference_sentence)
            bleu_score_avg[0] += bleu_score
            bleu_score_avg[1] += len(test_labels[idx]['caption'])
            print 'bleu score', bleu_score / len(
                test_labels[idx]['caption']), '\n'

            generated_sentences.append({
                "caption": generated_sentence,
                "id": video
            })
        print 'avg bleu score', bleu_score_avg[0] / bleu_score_avg[1], '\n'
        json.dump(generated_sentences, out, indent=4)
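
The per-video number printed above is the mean of bleu_eval.BLEU_new over that video's reference captions, and the final figure is a micro-average over all (video, reference) pairs. bleu_eval is a helper shipped with the project; a rough stand-in using NLTK (an assumption, not what the repo actually uses) could look like:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def avg_bleu(generated_sentence, reference_sentences):
    # mean sentence-level BLEU of one generated caption against each reference
    smooth = SmoothingFunction().method1
    scores = [sentence_bleu([ref.split()], generated_sentence.split(),
                            smoothing_function=smooth)
              for ref in reference_sentences]
    return sum(scores) / len(scores)

print(avg_bleu('a man is playing guitar',
               ['a man plays a guitar', 'someone is playing the guitar']))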