def test(model_path=default_model_path):
    # List of test video ids, one per line (trailing empty line dropped).
    test_videos = open(testing_data, 'r').read().split('\n')[:-1]

    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())
    bias_init_vector = np.load('./data/bias_init_vector.npy')

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        n_video_lstm_step=n_video_lstm_step,
        n_caption_lstm_step=n_caption_lstm_step,
        bias_init_vector=bias_init_vector)

    video_tf, video_mask_tf, caption_tf, _, _ = model.build_generator()

    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    try:
        print('\n=== Use model', model_path, '===\n')
        saver.restore(sess, model_path)
    except Exception:
        # Fall back to the default checkpoint if the requested one cannot be restored.
        print('\nUse default model\n')
        saver.restore(sess, default_model_path)

    with open('output.json', 'w') as out:
        generated_sentences = []
        for idx, video in enumerate(test_videos):
            print('video =>', video)

            video_feat_path = os.path.join(video_test_feat_path, video) + '.npy'
            video_feat = np.load(video_feat_path)[None, ...]

            # Only videos whose feature sequence length matches the model are captioned.
            if video_feat.shape[1] == n_frame_step:
                video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
            else:
                continue

            generated_word_index = sess.run(
                caption_tf,
                feed_dict={video_tf: video_feat, video_mask_tf: video_mask})
            generated_words = ixtoword[generated_word_index]

            # Truncate at the first <eos> token.
            punctuation = np.argmax(np.array(generated_words) == '<eos>') + 1
            generated_words = generated_words[:punctuation]

            generated_sentence = ' '.join(generated_words)
            generated_sentence = generated_sentence.replace('<unk> ', '')
            generated_sentence = generated_sentence.replace('<bos> ', '')
            generated_sentence = generated_sentence.replace(' <eos>', '')
            print('generated_sentence =>', generated_sentence)

            generated_sentences.append({"caption": generated_sentence, "id": video})

        json.dump(generated_sentences, out, indent=4)
def train():
    train_data = get_video_train_data(video_train_data_path, video_train_feat_path)
    train_captions = train_data['Description'].values
    test_data = get_video_test_data(video_test_data_path, video_test_feat_path)
    test_captions = test_data['Description'].values

    captions_list = list(train_captions) + list(test_captions)
    captions = np.asarray(captions_list, dtype=object)

    # Strip punctuation before building the vocabulary.
    for ch in ['.', ',', '"', '\n', '?', '!', '\\', '/']:
        captions = [x.replace(ch, '') for x in captions]

    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(
        captions, word_count_threshold=word_count_threshold)

    np.save('./data/wordtoix', wordtoix)
    np.save('./data/ixtoword', ixtoword)
    np.save('./data/bias_init_vector', bias_init_vector)

    model = Video_Caption_Generator(
        batch_size=batch_size,
        n_words=len(wordtoix),
        dim_hidden=dim_hidden,
        dim_image=dim_image,
        n_video_lstm_step=n_video_lstm_step,
        n_caption_lstm_step=n_caption_lstm_step,
        bias_init_vector=bias_init_vector)

    tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, tf_probs = model.build_model()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    # sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    # sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
    sess = tf.InteractiveSession()
    # saver = tf.train.Saver(max_to_keep=100, write_version=1)
    saver = tf.train.Saver()

    with tf.variable_scope(tf.get_variable_scope(), reuse=False):
        train_op = tf.train.RMSPropOptimizer(learning_rate).minimize(tf_loss)
        # train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(tf_loss)
        # train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.global_variables_initializer().run()

    for epoch in range(0, n_epochs):
        # Shuffle the training data and keep one randomly chosen caption per video.
        index = list(train_data.index)
        np.random.shuffle(index)
        train_data = train_data.loc[index]

        current_train_data = train_data.groupby('video_path').apply(
            lambda x: x.iloc[np.random.choice(len(x))])
        current_train_data = current_train_data.reset_index(drop=True)

        for start, end in zip(
                range(0, len(current_train_data), batch_size),
                range(batch_size, len(current_train_data), batch_size)):

            start_time = time.time()

            current_batch = current_train_data[start:end]
            current_videos = current_batch['video_path'].values

            current_feats = np.zeros((batch_size, n_video_lstm_step, dim_image))
            try:
                current_feats_vals = [np.load(vid) for vid in current_videos]
            except Exception:
                continue

            current_video_masks = np.zeros((batch_size, n_video_lstm_step))
            for ind, feat in enumerate(current_feats_vals):
                current_feats[ind][:len(current_feats_vals[ind])] = feat
                current_video_masks[ind][:len(current_feats_vals[ind])] = 1

            # Prepend <bos> and strip punctuation from the batch captions.
            current_captions = current_batch['Description'].values
            current_captions = ['<bos> ' + x for x in current_captions]
            for ch in ['.', ',', '"', '\n', '?', '!', '\\', '/']:
                current_captions = [x.replace(ch, '') for x in current_captions]

            # Append <eos>, truncating captions longer than n_caption_lstm_step words.
            for idx, each_cap in enumerate(current_captions):
                word = each_cap.lower().split(' ')
                if len(word) < n_caption_lstm_step:
                    current_captions[idx] = current_captions[idx] + ' <eos>'
                else:
                    new_word = ''
                    for i in range(n_caption_lstm_step - 1):
                        new_word = new_word + word[i] + ' '
                    current_captions[idx] = new_word + '<eos>'

            # Map words to vocabulary indices (<unk> for out-of-vocabulary words).
            current_caption_ind = []
            for cap in current_captions:
                current_word_ind = []
                for word in cap.lower().split(' '):
                    if word in wordtoix:
                        current_word_ind.append(wordtoix[word])
                    else:
                        current_word_ind.append(wordtoix['<unk>'])
                current_caption_ind.append(current_word_ind)

            current_caption_matrix = sequence.pad_sequences(
                current_caption_ind, padding='post', maxlen=n_caption_lstm_step)
            current_caption_matrix = np.hstack([
                current_caption_matrix,
                np.zeros([len(current_caption_matrix), 1])
            ]).astype(int)

            current_caption_masks = np.zeros(
                (current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(
                [(x != 0).sum() + 1 for x in current_caption_matrix])
            for ind, row in enumerate(current_caption_masks):
                row[:nonzeros[ind]] = 1

            probs_val = sess.run(
                tf_probs,
                feed_dict={
                    tf_video: current_feats,
                    tf_caption: current_caption_matrix
                })

            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks
                })

            print('idx: ', start, ' Epoch: ', epoch, ' loss: ', loss_val,
                  ' Elapsed time: ', str(time.time() - start_time))

        if np.mod(epoch, 10) == 0:
            print('Epoch ', epoch, ' is done. Saving the model ...')
            saver.save(sess, os.path.join(model_path, 'model-' + str(epoch)))
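# --- Hedged sketch: the test()/train() pair above read these module-level
# --- hyperparameters and paths. The names come from the code above; the values
# --- and paths shown here are only illustrative assumptions, not the original
# --- settings, and should be adapted to the local dataset layout.
# dim_image = 4096                     # frame feature size, e.g. VGG fc7 (assumption)
# dim_hidden = 256                     # LSTM hidden size (assumption)
# n_video_lstm_step = 80               # number of video frame steps (assumption)
# n_caption_lstm_step = 20             # maximum caption length in words (assumption)
# n_frame_step = 80                    # must match the saved .npy feature length (assumption)
# n_epochs = 200                       # (assumption)
# batch_size = 50                      # (assumption)
# learning_rate = 0.0001               # (assumption)
# word_count_threshold = 2             # minimum word frequency kept in the vocabulary (assumption)
# model_path = './models'              # where train() saves checkpoints (assumption)
# default_model_path = './models/model-900'           # (assumption)
# video_train_data_path = './data/video_corpus.csv'   # (assumption)
# video_train_feat_path = './data/train_features'     # (assumption)
# video_test_data_path = './data/video_corpus.csv'    # (assumption)
# video_test_feat_path = './data/test_features'       # (assumption)
# testing_data = './data/testing_id.txt'              # one test video id per line (assumption)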
def test(args):
    assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
        "config.pkl file does not exist in path %s" % args.init_from

    # Open the old config and check if the models are compatible.
    with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
        saved_args = cPickle.load(f)

    # Complete arguments saved by older versions of the code.
    if "attention" in vars(saved_args):
        print("attention: %d" % vars(saved_args)["attention"])
    else:
        vars(saved_args)["attention"] = 0
    if "schedule_sampling" in vars(saved_args):
        print("schedule_sampling: %f" % vars(saved_args)["schedule_sampling"])
    else:
        vars(saved_args)["schedule_sampling"] = 0.0

    with open(os.path.join(args.init_from, 'vocab.pkl'), 'rb') as f:
        vocab = cPickle.load(f)
    vocab_inv = {v: k for k, v in vocab.items()}

    with open(args.testing_file, 'r') as f:
        test_feat_id = f.readlines()
    for i in range(len(test_feat_id)):
        test_feat_id[i] = test_feat_id[i].replace('\n', '')

    model = Video_Caption_Generator(saved_args, n_vocab=len(vocab), infer=True)

    with tf.Session() as sess:
        result = []
        for i in range(len(test_feat_id)):
            tf.global_variables_initializer().run()

            saver = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(args.init_from)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                if i == 0:
                    print("Model restored %s" % ckpt.model_checkpoint_path)
            sess.run(tf.global_variables())

            this_test_feat_id = test_feat_id[i]

            # Get video features.
            # Note: the second argument to get_video_feat must be an np.array.
            current_feat, current_feat_mask = get_video_feat(
                args.testing_path, np.array([this_test_feat_id]))

            this_gen_idx, probs = sess.run(
                [model.gen_caption_idx, model.pred_probs],
                feed_dict={
                    model.video: current_feat,
                    model.video_mask: current_feat_mask
                })

            # Map indices back to words, then truncate at the first <EOS>.
            this_gen_words = []
            for k in range(len(this_gen_idx)):
                this_gen_words.append(vocab_inv.get(this_gen_idx[k], '<PAD>'))
            this_gen_words = np.array(this_gen_words)

            punctuation = np.argmax(this_gen_words == '<EOS>') + 1
            if punctuation > 1:
                this_gen_words = this_gen_words[:punctuation]

            this_caption = ' '.join(this_gen_words)
            this_caption = this_caption.replace('<BOS> ', '')
            this_caption = this_caption.replace(' <EOS>', '')

            this_answer = {}
            this_answer['caption'] = this_caption
            this_answer['id'] = this_test_feat_id

            print('Id: %s, caption: %s' % (this_test_feat_id, this_caption))

            result.append(this_answer)

        with open(args.result_file, 'w') as fout:
            json.dump(result, fout)
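# --- Hedged sketch of the files test(args) reads and writes. The formats are
# --- inferred from the code above; the concrete ids and caption shown are
# --- placeholders, not real data.
# args.testing_file  -- plain text, one video/feature id per line, e.g.
#     klteYv1Uv9A_27_33.avi
#     5YJaS2Eswg0_22_26.avi
# args.result_file   -- JSON list written by json.dump(result, fout), e.g.
#     [{"caption": "a man is playing a guitar", "id": "klteYv1Uv9A_27_33.avi"}, ...]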
def train(args):
    if args.init_from is not None:
        # Check that all necessary files exist.
        assert os.path.isfile(os.path.join(args.init_from, "config.pkl")), \
            "config.pkl file does not exist in path %s" % args.init_from

        # Get checkpoint.
        ckpt = tf.train.get_checkpoint_state(args.init_from)

        # Get vocabulary.
        with open(os.path.join(args.init_from, 'vocab.pkl'), 'rb') as f:
            vocab = cPickle.load(f)
        vocab_inv = {v: k for k, v in vocab.items()}

        # Read data.
        _, _, train_feat_id, train_caption, test_feat_id, test_caption = data_preprocess(
            args.train_label_json, args.test_label_json)

        # Open the old config and check if the models are compatible.
        with open(os.path.join(args.init_from, 'config.pkl'), 'rb') as f:
            saved_args = cPickle.load(f)
        need_be_same = [
            "dim_image", "dim_hidden", "n_lstm_step", "n_video_step",
            "n_caption_step"
        ]
        for checkme in need_be_same:
            assert vars(saved_args)[checkme] == vars(args)[checkme], \
                "Command line argument and saved model disagree on '%s'" % checkme

        # Complete arguments saved by older versions of the code.
        if "schedule_sampling" in vars(saved_args):
            print("schedule_sampling: %f" % vars(saved_args)["schedule_sampling"])
        else:
            vars(saved_args)["schedule_sampling"] = 0.0
    else:
        with open(os.path.join(args.save_dir, 'config.pkl'), 'wb') as f:
            cPickle.dump(args, f)
        vocab, vocab_inv, train_feat_id, train_caption, test_feat_id, test_caption = data_preprocess(
            args.train_label_json, args.test_label_json)
        with open(os.path.join(args.save_dir, 'vocab.pkl'), 'wb') as f:
            cPickle.dump(vocab, f)

    model = Video_Caption_Generator(args, n_vocab=len(vocab), infer=False)

    # Limit per-process GPU memory usage.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_mem)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        tf.global_variables_initializer().run()
        print("Initialized")

        saver = tf.train.Saver(tf.global_variables())
        if args.init_from is not None:
            saver.restore(sess, ckpt.model_checkpoint_path)

        loss_fd = open('log/loss.txt', 'w')
        loss_to_draw = []

        for epoch in range(0, args.n_epoch):
            if model.schedule_sampling > 0.0:
                # [pseudo] probability of schedule sampling increases linearly with epochs.
                model.schedule_sampling = np.min(
                    [model.schedule_sampling * (1.0 + epoch / 50), 1.0])

            # Shuffle the training data.
            index = np.array(range(len(train_feat_id)))
            np.random.shuffle(index)
            epoch_train_feat_id = train_feat_id[index]
            epoch_train_caption = train_caption[index]

            loss_to_draw_epoch = []

            for start, end in zip(
                    range(0, len(epoch_train_feat_id), model.batch_size),
                    range(model.batch_size, len(epoch_train_feat_id), model.batch_size)):

                start_time = time.time()

                # Get one minibatch.
                batch_feat_id = epoch_train_feat_id[start:end]
                batch_caption = epoch_train_caption[start:end]

                # Get video features.
                current_feat, current_feat_mask = get_video_feat(
                    args.train_video_feat_path, batch_feat_id)

                # Randomly select one caption per video and pad captions to the model's
                # caption length.
                current_caption, current_caption_mask = get_padding_caption(
                    vocab, batch_caption, maxlen=model.n_caption_step + 1)

                # Run train_op to minimize tf_loss.
                _, loss_val = sess.run(
                    [model.train_op, model.tf_loss],
                    feed_dict={
                        model.video: current_feat,
                        model.video_mask: current_feat_mask,
                        model.caption: current_caption,
                        model.caption_mask: current_caption_mask
                    })
                loss_to_draw_epoch.append(loss_val)

                print('idx: ', start, ' Epoch: ', epoch, ' loss: ', loss_val,
                      ' Elapsed time: ', str(time.time() - start_time))
                loss_fd.write('epoch ' + str(epoch) + ' loss ' + str(loss_val) + '\n')

            if np.mod(epoch, args.save_every) == 0:
                checkpoint_path = os.path.join(args.save_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=epoch)
                print("Epoch ", epoch, "model saved to {}".format(checkpoint_path))

        loss_fd.close()
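# --- Hedged sketch: train(args)/test(args) above expect an argparse-style namespace.
# --- The attribute names below are the ones those functions read (plus the model
# --- hyperparameters checked in need_be_same); build_arg_parser is a hypothetical
# --- helper and every default value is an illustrative assumption, not the
# --- project's original setting.
def build_arg_parser():
    import argparse
    parser = argparse.ArgumentParser()
    # Checkpointing / bookkeeping.
    parser.add_argument('--init_from', type=str, default=None,
                        help='directory with config.pkl, vocab.pkl and checkpoints to resume from')
    parser.add_argument('--save_dir', type=str, default='save')
    parser.add_argument('--save_every', type=int, default=1)
    parser.add_argument('--result_file', type=str, default='result.json')
    # Data locations.
    parser.add_argument('--train_label_json', type=str, default='./data/training_label.json')
    parser.add_argument('--test_label_json', type=str, default='./data/testing_label.json')
    parser.add_argument('--train_video_feat_path', type=str, default='./data/training_data/feat')
    parser.add_argument('--testing_path', type=str, default='./data/testing_data/feat')
    parser.add_argument('--testing_file', type=str, default='./data/testing_id.txt')
    # Model / training hyperparameters (checked against the saved config when resuming).
    parser.add_argument('--dim_image', type=int, default=4096)
    parser.add_argument('--dim_hidden', type=int, default=256)
    parser.add_argument('--n_lstm_step', type=int, default=80)
    parser.add_argument('--n_video_step', type=int, default=80)
    parser.add_argument('--n_caption_step', type=int, default=20)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--n_epoch', type=int, default=200)
    parser.add_argument('--gpu_mem', type=float, default=0.8)
    parser.add_argument('--schedule_sampling', type=float, default=0.0)
    parser.add_argument('--attention', type=int, default=0)
    return parser
# Hypothetical usage: args = build_arg_parser().parse_args(); then train(args) or test(args).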
def test(model_path=default_model_path):
    test_videos = open(testing_data, 'r').read().split('\n')[:-1]
    with open(testing_label) as data_file:
        test_labels = json.load(data_file)

    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())
    bias_init_vector = np.load('./data/bias_init_vector.npy')

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        n_video_lstm_step=n_video_lstm_step,
        n_caption_lstm_step=n_caption_lstm_step,
        bias_init_vector=bias_init_vector)

    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()

    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    try:
        print('\n=== Use model', model_path, '===\n')
        saver.restore(sess, model_path)
    except Exception:
        print('\nUse default model\n')
        saver.restore(sess, default_model_path)

    with open('S2VT_prediction.json', 'w') as out:
        generated_sentences = []
        bleu_score_avg = [0., 0.]
        for idx, video in enumerate(test_videos):
            print('video =>', video)

            video_feat_path = os.path.join(video_test_feat_path, video) + '.npy'
            video_feat = np.load(video_feat_path)[None, ...]

            # Only videos whose feature sequence length matches the model are captioned.
            if video_feat.shape[1] == n_frame_step:
                video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
            else:
                continue

            generated_word_index = sess.run(
                caption_tf,
                feed_dict={video_tf: video_feat, video_mask_tf: video_mask})
            generated_words = ixtoword[generated_word_index]

            # Truncate at the first <eos> token.
            punctuation = np.argmax(np.array(generated_words) == '<eos>') + 1
            generated_words = generated_words[:punctuation]

            generated_sentence = ' '.join(generated_words)
            generated_sentence = generated_sentence.replace('<bos> ', '')
            generated_sentence = generated_sentence.replace(' <eos>', '')
            print('generated_sentence =>', generated_sentence)

            # Accumulate BLEU over every reference caption for this video.
            bleu_score = 0.
            for reference_sentence in test_labels[idx]['caption']:
                bleu_score += bleu_eval.BLEU_new(generated_sentence, reference_sentence)
            bleu_score_avg[0] += bleu_score
            bleu_score_avg[1] += len(test_labels[idx]['caption'])
            print('bleu score', bleu_score / len(test_labels[idx]['caption']), '\n')

            generated_sentences.append({"caption": generated_sentence, "id": video})

        print('avg bleu score', bleu_score_avg[0] / bleu_score_avg[1], '\n')
        json.dump(generated_sentences, out, indent=4)