def test(model_path='models/model-900', video_feat_path=video_feat_path):
    """Restore a checkpoint and score its captions on the validation split.

    Args:
        model_path: checkpoint path handed to ``saver.restore``.
        video_feat_path: kept for interface compatibility; not read here.

    Returns:
        The result of ``COCOScorer.score`` over the generated captions.
    """
    splits = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)
    meta_data, train_data, val_data, test_data = splits
    # The validation split stands in for the test split in this variant.
    test_data = val_data

    vocab_file = './data' + str(gpu_id) + '/ixtoword.npy'
    ixtoword = pd.Series(np.load(vocab_file).tolist())

    generator_args = dict(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        drop_out_rate=0,
        bias_init_vector=None)
    model = Video_Caption_Generator(**generator_args)

    video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator()

    session_config = tf.ConfigProto(allow_soft_placement=True)
    sess = tf.InteractiveSession(config=session_config)
    with tf.device("/cpu:0"):
        saver = tf.train.Saver()
        saver.restore(sess, model_path)

    # Halve every fourth variable (positions 0, 4, 8, ...) after restoring;
    # presumably this compensates for the 0.5 dropout used in training.
    for position, variable in enumerate(lstm3_variables_tf):
        if position % 4 != 0:
            continue
        halve_op = variable.assign(tf.mul(variable, 1 - 0.5))
        sess.run(halve_op)

    pred_sent, gt_sent = testing_all(
        sess, test_data, ixtoword, video_tf, video_mask_tf, caption_tf)

    return COCOScorer().score(gt_sent, pred_sent, range(len(pred_sent)))
def test(model_path='models/model-900', video_feat_path=video_feat_path): meta_data, train_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_test) #test_data = train_data ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist()) model = Video_Caption_Generator( dim_image=dim_image, n_words=len(ixtoword), dim_hidden=dim_hidden, batch_size=batch_size, n_lstm_steps=n_frame_step, drop_out_rate = 0, bias_init_vector=None) video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf, lstmRNN_variables_tf, lstm3_variables_tf = model.build_generator() sess = tf.InteractiveSession() saver = tf.train.Saver() saver.restore(sess, model_path) for ind, row in enumerate(lstmRNN_variables_tf): if ind % 4 == 0: assign_op = row.assign(tf.mul(row,1-0.5)) sess.run(assign_op) for ind, row in enumerate(lstm3_variables_tf): if ind % 4 == 0: assign_op = row.assign(tf.mul(row,1-0.5)) sess.run(assign_op) [mp, pred_sent, gt_sent, HLness] = testing_all(sess, test_data, ixtoword,video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf) np.savez('HS_result/'+model_path.split('/')[1],gt = gt_sent,pred=pred_sent,mp=mp,HLness=HLness) total_score = np.mean(mp) print model_path.split('/')[1]+' mAP: ' + str(total_score) scorer = COCOScorer() total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent))) return total_score
def score_with_cocoeval(samples_valid, samples_test, engine):
    """Score validation and test samples against ``engine.CAP`` references.

    Args:
        samples_valid: generated samples for ``engine.valid_ids``; a falsy
            value skips validation scoring.
        samples_test: generated samples for ``engine.test_ids``; a falsy
            value skips test scoring.
        engine: object exposing ``CAP``, ``valid_ids`` and ``test_ids``.

    Returns:
        ``(valid_score, test_score)``; an entry is ``None`` when the
        corresponding samples were falsy.
    """
    scorer = COCOScorer()

    def _score_split(samples, ids_attr):
        # The id list is only looked up when there is something to score.
        if not samples:
            return None
        ids = getattr(engine, ids_attr)
        references = OrderedDict()
        for vid in ids:
            references[vid] = engine.CAP[vid]
        return scorer.score(references, samples, ids)

    valid_score = _score_split(samples_valid, 'valid_ids')
    test_score = _score_split(samples_test, 'test_ids')
    return valid_score, test_score
def train(): meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test) captions = meta_data['Description'].values captions = map(lambda x: x.replace('.', ''), captions) captions = map(lambda x: x.replace(',', ''), captions) wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1) np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword) model = Video_Caption_Generator( dim_image=dim_image, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_lstm_steps=n_frame_step, drop_out_rate = 0.5, bias_init_vector=None) tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask= model.build_model() sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True)) with tf.device("/cpu:0"): saver = tf.train.Saver(max_to_keep=100) train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss) tf.initialize_all_variables().run() saver.restore(sess, 'models_Att_update_new/model-30') tStart_total = time.time() for epoch in range(n_epochs): index = np.arange(len(train_data)) np.random.shuffle(index) train_data = train_data[index] tStart_epoch = time.time() loss_epoch = np.zeros(len(train_data)) for current_batch_file_idx in xrange(len(train_data)): tStart = time.time() current_batch = h5py.File(train_data[current_batch_file_idx]) current_feats = np.zeros((batch_size, n_frame_step, dim_image)) current_video_masks = np.zeros((batch_size, n_frame_step)) current_video_len = np.zeros(batch_size) for ind in xrange(batch_size): current_feats[ind,:,:] = current_batch['data'][:,ind,:] idx = np.where(current_batch['label'][:,ind] != -1)[0] if len(idx) == 0: continue current_video_masks[ind,:idx[-1]+1] = 1 current_captions = current_batch['title'] current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions) current_caption_matrix = sequence.pad_sequences(current_caption_ind, 
padding='post', maxlen=16-1) current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int) current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1])) nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix )) for ind, row in enumerate(current_caption_masks): row[:nonzeros[ind]] = 1 _, loss_val = sess.run( [train_op, tf_loss], feed_dict={ tf_video: current_feats, tf_video_mask : current_video_masks, tf_caption: current_caption_matrix, tf_caption_mask: current_caption_masks }) loss_epoch[current_batch_file_idx] = loss_val tStop = time.time() #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val #print "Time Cost:", round(tStop - tStart,2), "s" print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch) tStop_epoch = time.time() print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s" if np.mod(epoch, 10) == 0 or epoch == n_epochs - 1: print "Epoch ", epoch, " is done. Saving the model ..." with tf.device("/cpu:0"): saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch) current_batch = h5py.File(val_data[np.random.randint(0,len(val_data))]) video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator() ixtoword = pd.Series(np.load('./data'+str(gpu_id)+'/ixtoword.npy').tolist()) [pred_sent, gt_sent] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf) for idx in range(len(pred_sent)): print "GT: " + gt_sent[idx][0]['caption'] print "PD: " + pred_sent[idx][0]['caption'] print '-------' [pred_sent, gt_sent] = testing_all(sess, val_data, ixtoword,video_tf, video_mask_tf, caption_tf) scorer = COCOScorer() total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent))) sys.stdout.flush() print "Finally, saving the model ..." 
with tf.device("/cpu:0"): saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs) tStop_total = time.time() print "Total Time Cost:", round(tStop_total - tStart_total,2), "s"
def train(): assert os.path.isfile(video_data_path_train) assert os.path.isfile(video_data_path_val) assert os.path.isdir(model_path) assert os.path.isfile(wordtoix_file) assert os.path.isfile(ixtoword_file) assert drop_strategy in ['block_video', 'block_sent', 'random', 'keep'] wordtoix = np.load(wordtoix_file).tolist() ixtoword = pd.Series(np.load(ixtoword_file).tolist()) print 'build model and session...' # shared parameters on the GPU with tf.device("/gpu:0"): model = Video_Caption_Generator(dim_image=dim_image, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_caption_steps=n_caption_steps, n_video_steps=n_video_steps, drop_out_rate=0.5, bias_init_vector=None) tStart_total = time.time() n_epoch_steps = int(n_train_samples / batch_size) n_steps = n_epochs * n_epoch_steps # preprocess on the CPU with tf.device('/cpu:0'): train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \ _, _, _, _, train_frame_data = read_and_decode_with_frame(video_data_path_train) val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \ _, _, _, _, val_frame_data = read_and_decode_with_frame(video_data_path_val) # random batches train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data = \ tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data], batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples) val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data = \ tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data], batch_size=batch_size, num_threads=1, capacity=2* batch_size) # graph on the GPU with tf.device("/gpu:0"): tf_loss, tf_loss_cap, 
tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \ = model.build_model(train_data, train_frame_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label) val_v2s_tf, _ = model.build_v2s_generator(val_data) val_s2s_tf, _, _ = model.build_s2s_generator(val_caption_id_1) val_s2v_tf, _, _ = model.build_s2v_generator(val_caption_id_1, val_frame_data) val_v2v_tf, _ = model.build_v2v_generator(val_data, val_frame_data) sess = tf.InteractiveSession(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False)) # check for model file with tf.device(cpu_device): saver = tf.train.Saver(max_to_keep=100) ckpt = tf.train.get_checkpoint_state(model_path) global_step = 0 if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print("Reading model parameters from %s" % ckpt.model_checkpoint_path) saver.restore(sess, ckpt.model_checkpoint_path) # print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True) global_step = get_model_step(ckpt.model_checkpoint_path) print 'global_step:', global_step else: print("Created model with fresh parameters.") sess.run(tf.global_variables_initializer()) temp = set(tf.global_variables()) # train on the GPU with tf.device("/gpu:0"): ## 1. weight decay for var in tf.trainable_variables(): decay_loss = tf.multiply(tf.nn.l2_loss(var), 0.0004, name='weight_loss') tf.add_to_collection('losses', decay_loss) tf.add_to_collection('losses', tf_loss) tf_total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss') ## 2. 
gradient clip optimizer = tf.train.AdamOptimizer(learning_rate) gvs = optimizer.compute_gradients(tf_total_loss) # when variable is not related to the loss, grad returned as None clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in gvs if grad is not None] for grad, var in gvs: if grad is not None: tf.summary.histogram(var.name + '/grad', grad) tf.summary.histogram(var.name + '/data', var) train_op = optimizer.apply_gradients(clip_gvs) ## initialize variables added for optimizer sess.run(tf.variables_initializer(set(tf.global_variables()) - temp)) # initialize epoch variable in queue reader sess.run(tf.local_variables_initializer()) loss_epoch = 0 loss_epoch_cap = 0 loss_epoch_vid = 0 coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) ##### add summaries ###### tf.summary.histogram('video_h', tf_v_h) tf.summary.histogram('sent_h', tf_s_h) tf.summary.scalar('loss_vid', tf_loss_vid) tf.summary.scalar('loss_lat', tf_loss_lat) tf.summary.scalar('loss_caption', tf_loss_cap) # for var in tf.trainable_variables(): # summaries.append(tf.histogram_summary(var.op.name, var)) summary_op = tf.summary.merge_all() # write graph architecture to file summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph) epoch = global_step video_label = sess.run(train_video_label) for step in xrange(1, n_steps + 1): tStart = time.time() if drop_strategy == 'keep': drop_type = 0 elif drop_strategy == 'block_sentence': drop_type = 1 elif drop_strategy == 'block_video': drop_type = 2 else: drop_type = random.randint(0, 3) _, loss_val, loss_cap, loss_lat, loss_vid = sess.run( [train_op, tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid], feed_dict={tf_drop_type: drop_type}) tStop = time.time() print "step:", step, " Loss:", loss_val, "loss_cap:", loss_cap * caption_weight, "loss_latent:", loss_lat * latent_weight, "loss_vid:", loss_vid * video_weight print "Time Cost:", round(tStop - tStart, 2), "s" loss_epoch += loss_val 
loss_epoch_cap += loss_cap loss_epoch_vid += loss_vid if step % n_epoch_steps == 0: # if step % 3 == 0: epoch += 1 loss_epoch /= n_epoch_steps loss_epoch_cap /= n_epoch_steps loss_epoch_vid /= n_epoch_steps with tf.device(cpu_device): saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch) # print 'z:', z[0, :10] print 'epoch:', epoch, 'loss:', loss_epoch, "loss_cap:", loss_epoch_cap, "loss_lat:", loss_lat, "loss_vid:", loss_epoch_vid loss_epoch = 0 loss_epoch_cap = 0 loss_epoch_vid = 0 ######### test sentence generation ########## n_val_steps = int(n_val_samples / batch_size) # n_val_steps = 3 ### TODO: sometimes COCO test show exceptions in the beginning of training #### if test_v2s: [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname) for i, key in enumerate(pred_dict.keys()): print 'video:', flist[i] for ele in gt_dict[key]: print "GT: " + ele['caption'] print "PD: " + pred_dict[key][0]['caption'] print '-------' print '############## video to sentence result #################' print 'epoch:', epoch [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_val_steps, ixtoword, val_v2s_tf, val_fname) scorer = COCOScorer() total_score = scorer.score(gt_dict, pred_dict, id_list) print '############## video to sentence result #################' if test_s2s: [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname) for i, key in enumerate(pred_dict.keys()): print 'video:', flist[i] for ele in gt_dict[key]: print "GT: " + ele['caption'] print "PD: " + pred_dict[key][0]['caption'] print '-------' print '############## sentence to sentence result #################' print 'epoch:', epoch [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_val_steps, ixtoword, val_s2s_tf, val_fname) scorer = COCOScorer() total_score = scorer.score(gt_dict, pred_dict, id_list) print '############## sentence to sentence 
result #################' ######### test video generation ############# if test_v2v: mse_v2v = test_all_videos(sess, n_val_steps, val_frame_data, val_v2v_tf, val_video_label, pixel_scale_factor) print 'epoch', epoch, 'video2video mse:', mse_v2v if test_s2v: mse_s2v = test_all_videos(sess, n_val_steps, val_frame_data, val_s2v_tf, val_video_label, pixel_scale_factor) print 'epoch', epoch, 'caption2video mse:', mse_s2v sys.stdout.flush() ###### summary ###### if epoch % 2 == 0: summary = sess.run(summary_op) summary_writer.add_summary(summary, epoch) sys.stdout.flush() coord.request_stop() coord.join(threads) print "Finally, saving the model ..." with tf.device(cpu_device): saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs) tStop_total = time.time() print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s" sess.close()
def test(model_path=None, video_data_path_test=video_data_path_val, n_test_samples=n_val_samples, video_name=None): # test_data = val_data # to evaluate on testing data or validation data wordtoix = np.load(wordtoix_file).tolist() ixtoword = pd.Series(np.load(ixtoword_file).tolist()) with tf.device("/gpu:0"): model = Video_Caption_Generator(dim_image=dim_image, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_caption_steps=n_caption_steps, n_video_steps=n_video_steps, drop_out_rate=0.5, bias_init_vector=None) # preprocess on the CPU with tf.device('/cpu:0'): train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \ _, _, _, _ = read_and_decode(video_data_path_train) val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \ _, _, _, _ = read_and_decode(video_data_path_test) train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \ tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1], batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples) val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \ tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1], batch_size=batch_size, num_threads=1, capacity=2* batch_size) # graph on the GPU with tf.device("/gpu:0"): tf_loss = model.build_model(train_caption_id, train_caption_id_1, train_caption_label) val_s2s_tf, s2s_lstm3_vars_tf = model.build_s2s_generator( val_caption_id_1) sess = tf.InteractiveSession(config=tf.ConfigProto( allow_soft_placement=True)) with tf.device(cpu_device): saver = tf.train.Saver() saver.restore(sess, model_path) print 'load parameters from:', model_path coord = tf.train.Coordinator() threads = 
tf.train.start_queue_runners(sess=sess, coord=coord) ######### test sentence generation ########## print 'testing...' n_test_steps = int(n_test_samples / batch_size) print 'n_test_steps:', n_test_steps tstart = time.time() ### TODO: sometimes COCO test show exceptions in the beginning of training #### if test_s2s: # [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname) # for i, key in enumerate(pred_dict.keys()): # print 'video:', flist[i] # for ele in gt_dict[key]: # print "GT: " + ele['caption'] # print "PD: " + pred_dict[key][0]['caption'] # print '-------' print '############## sentence to sentence result #################' [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname) if os.path.isfile('demo_s2s.txt.videos'): video_name = pickle.load(open('demo_s2s.txt.videos', "rb")) if video_name: for i, key in enumerate(pred_dict.keys()): if flist[i] in video_name: print flist[i] for ele in gt_dict[key]: print "GT: " + ele['caption'] print "PD: " + pred_dict[key][0]['caption'] print '-----------' scorer = COCOScorer() total_score_2 = scorer.score(gt_dict, pred_dict, id_list) print '############## sentence to sentence result #################' if save_demo_sent_s2s: get_demo_sentence(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname, result_file='demo_s2s.txt') sys.stdout.flush() coord.request_stop() coord.join(threads) tstop = time.time() print "Total Time Cost:", round(tstop - tstart, 2), "s" sess.close()
def train(): assert os.path.isdir(home_folder) assert os.path.isfile(video_data_path_train) assert os.path.isfile(video_data_path_val) assert os.path.isdir(model_path) print 'load meta data...' wordtoix = np.load(home_folder + 'data0/msvd_wordtoix.npy').tolist() ixtoword = pd.Series( np.load(home_folder + 'data0/msvd_ixtoword.npy').tolist()) print 'build model and session...' # shared parameters on the GPU with tf.device("/gpu:0"): model = Video_Caption_Generator(dim_image=dim_image, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_caption_steps=n_caption_steps, n_video_steps=n_video_steps, drop_out_rate=0.5, bias_init_vector=None) tStart_total = time.time() n_epoch_steps = int(n_train_samples / batch_size) n_steps = n_epochs * n_epoch_steps # preprocess on the CPU with tf.device('/cpu:0'): train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \ _, _, _, _ = read_and_decode(video_data_path_train) val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \ _, _, _, _ = read_and_decode(video_data_path_val) # random batches train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1 = \ tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1], batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples) val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1 = \ tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1], batch_size=batch_size, num_threads=1, capacity=2* batch_size) # graph on the GPU with tf.device("/gpu:0"): tf_loss = model.build_model(train_caption_id, train_caption_id_1, train_caption_label) val_caption_tf, val_lstm3_variables_tf = model.build_sent_generator( val_caption_id_1) sess = 
tf.InteractiveSession(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False)) # check for model file with tf.device(cpu_device): saver = tf.train.Saver(max_to_keep=100) ckpt = tf.train.get_checkpoint_state(model_path) global_step = 0 if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print("Reading model parameters from %s" % ckpt.model_checkpoint_path) saver.restore(sess, ckpt.model_checkpoint_path) # print_tensors_in_checkpoint_file(ckpt.model_checkpoint_path, "", True) global_step = get_model_step(ckpt.model_checkpoint_path) print 'global_step:', global_step else: print("Created model with fresh parameters.") sess.run(tf.global_variables_initializer()) temp = set(tf.global_variables()) # train on the GPU with tf.device("/gpu:0"): # train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss) ## initialize variables added for optimizer optimizer = tf.train.AdamOptimizer(learning_rate) gvs = optimizer.compute_gradients(tf_loss) # when variable is not related to the loss, grad returned as None clip_gvs = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in gvs if grad is not None] train_op = optimizer.apply_gradients(gvs) sess.run(tf.variables_initializer(set(tf.global_variables()) - temp)) # initialize epoch variable in queue reader sess.run(tf.local_variables_initializer()) loss_epoch = 0 coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) # write graph architecture to file summary_writer = tf.summary.FileWriter(model_path + 'summary', sess.graph) loss_summary = tf.summary.scalar('loss', tf_loss) epoch = global_step for step in xrange(1, n_steps + 1): tStart = time.time() _, loss_val = sess.run([train_op, tf_loss]) tStop = time.time() print "step:", step, " Loss:", loss_val print "Time Cost:", round(tStop - tStart, 2), "s" loss_epoch += loss_val if step % n_epoch_steps == 0: epoch += 1 loss_epoch /= n_epoch_steps with tf.device(cpu_device): saver.save(sess, 
os.path.join(model_path, 'model'), global_step=epoch) # print 'z:', z[0, :10] print 'epoch:', epoch, 'loss:', loss_epoch loss_epoch = 0 n_val_steps = int(n_val_samples / batch_size) ######### test sentence generation ########## [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, 1, ixtoword, val_caption_tf, val_fname) for key in pred_dict.keys(): for ele in gt_dict[key]: print "GT: " + ele['caption'] print "PD: " + pred_dict[key][0]['caption'] print '-------' [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_val_steps, ixtoword, val_caption_tf, val_fname) scorer = COCOScorer() total_score = scorer.score(gt_dict, pred_dict, id_list) #### summary ##### summary = sess.run(loss_summary) summary_writer.add_summary(summary, epoch) sys.stdout.flush() sys.stdout.flush() coord.request_stop() coord.join(threads) print "Finally, saving the model ..." with tf.device(cpu_device): saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs) tStop_total = time.time() print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s" sess.close()
def train(): meta_data, train_data, val_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_val, video_data_path_test) captions = meta_data['Description'].values captions = map(lambda x: x.replace('.', ''), captions) captions = map(lambda x: x.replace(',', ''), captions) wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1) # np.save('./data'+str(gpu_id)+'/ixtoword', ixtoword) # np.save('./data'+str(gpu_id)+'/wordtoix', wordtoix) # sys.exit() ixtoword=pd.Series(np.load('./data_all/ixtoword.npy').tolist()) wordtoix=pd.Series(np.load('./data_all/wordtoix.npy').tolist()) model = Video_Caption_Generator( dim_image=dim_image, dim_tracker=dim_tracker, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_lstm_steps=n_frame_step, tracker_cnt=tracker_cnt, drop_out_rate = 0.5, bias_init_vector=None) tf_loss, tf_video, tf_video_mask, tf_tracker, tf_tracker_mask, tf_caption, tf_caption_mask= model.build_model() #loss_summary = tf.scalar_summary("Loss",tf_loss) sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True)) #merged = tf.merge_all_summaries() #writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def) with tf.device("/cpu:0"): saver = tf.train.Saver(max_to_keep=100) train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss) tf.initialize_all_variables().run() saver.restore(sess, 'models/model-0') tStart_total = time.time() nr_prefetch = int(3) for epoch in range(n_epochs): index = np.arange(len(train_data)) np.random.shuffle(index) train_data = train_data[index] tStart_epoch = time.time() loss_epoch = np.zeros(len(train_data)) ## init queue data_queue = mp.Queue(nr_prefetch) # tracker_queue = mp.Queue(nr_prefetch) title_queue = mp.Queue(nr_prefetch) t1 = Thread(target=load_data_into_queue, args=(train_data, data_queue, 'data')) # t2 = Thread(target=load_data_into_queue, args=(train_data, tracker_queue, 'tracker')) t3 = 
Thread(target=load_data_into_queue, args=(train_data, title_queue, 'title')) t1.start() # t2.start() t3.start() for current_batch_file_idx in range(len(train_data)): tStart = time.time() current_batch = h5py.File(train_data[current_batch_file_idx]) current_feats = np.zeros((batch_size, n_frame_step, dim_image)) current_video_masks = np.zeros((batch_size, n_frame_step)) current_video_len = np.zeros(batch_size) if 'tracker' in current_batch.keys(): current_tracker = np.array(current_batch['tracker']) else: current_tracker = np.zeros((batch_size, tracker_cnt, dim_tracker)) if 'tracker_mask' in current_batch.keys(): current_tracker_mask = np.array(current_batch['tracker_mask']) else: current_tracker_mask = np.zeros((batch_size, tracker_cnt)) # current_tracker = tracker_queue.get() current_batch_data = data_queue.get() current_batch_title = title_queue.get() for ind in xrange(batch_size): current_feats[ind,:,:] = current_batch_data[:,ind,:] idx = np.where(current_batch['label'][:,ind] != -1)[0] if len(idx) == 0: continue current_video_masks[ind,idx[-1]] = 1 current_captions = current_batch_title current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions) current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=35-1) current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int) current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1])) nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix )) for ind, row in enumerate(current_caption_masks): row[:nonzeros[ind]] = 1 current_batch.close() _, loss_val= sess.run( [train_op, tf_loss], feed_dict={ tf_video: current_feats, tf_video_mask : current_video_masks, tf_tracker : current_tracker, tf_tracker_mask : current_tracker_mask, tf_caption: current_caption_matrix, tf_caption_mask: current_caption_masks }) 
#writer.add_summary(summary_str, epoch) loss_epoch[current_batch_file_idx] = loss_val tStop = time.time() #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val #print "Time Cost:", round(tStop - tStart,2), "s" t1.join() # t2.join() t3.join() print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch) tStop_epoch = time.time() print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s" sys.stdout.flush() if np.mod(epoch, 2) == 0: print "Epoch ", epoch, " is done. Saving the model ..." with tf.device('/cpu:0'): saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch) if np.mod(epoch, 10) == 0: current_batch = h5py.File(val_data[np.random.randint(0,len(val_data))]) video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator() ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist()) # [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, train_data[-2:], ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf) # for key in pred_dict.keys(): # for ele in gt_dict[key]: # print "GT: " + ele['caption'] # print "PD: " + pred_dict[key][0]['caption'] # print '-------' [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(sess, val_data, ixtoword,video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf) scorer = COCOScorer() total_score = scorer.score(gt_dict, pred_dict, id_list) print "Finally, saving the model ..." with tf.device('/cpu:0'): saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs) tStop_total = time.time() print "Total Time Cost:", round(tStop_total - tStart_total,2), "s"
def test(model_path=None, video_data_path_test='/home/shenxu/data/msvd_feat_vgg_c3d_frame/test.tfrecords', n_test_samples=27020): # test_data = val_data # to evaluate on testing data or validation data wordtoix = np.load(wordtoix_file).tolist() ixtoword = pd.Series(np.load(ixtoword_file).tolist()) with tf.device("/gpu:0"): model = Video_Caption_Generator( dim_image=dim_image, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_caption_steps=n_caption_steps, n_video_steps=n_video_steps, drop_out_rate = 0.5, bias_init_vector=None) # preprocess on the CPU with tf.device('/cpu:0'): train_data, train_encode_data, _, _, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, \ _, _, _, _, train_frame_data = read_and_decode_with_frame(video_data_path_train) val_data, val_encode_data, val_fname, val_title, val_video_label, val_caption_label, val_caption_id, val_caption_id_1, \ _, _, _, _, val_frame_data = read_and_decode_with_frame(video_data_path_test) train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data = \ tf.train.shuffle_batch([train_data, train_encode_data, train_video_label, train_caption_label, train_caption_id, train_caption_id_1, train_frame_data], batch_size=batch_size, num_threads=num_threads, capacity=prefetch, min_after_dequeue=min_queue_examples) val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data = \ tf.train.batch([val_data, val_video_label, val_fname, val_caption_label, val_caption_id_1, val_frame_data], batch_size=batch_size, num_threads=1, capacity=2* batch_size) # graph on the GPU with tf.device("/gpu:0"): tf_loss, tf_loss_cap, tf_loss_lat, tf_loss_vid, tf_z, tf_v_h, tf_s_h, tf_drop_type \ = model.build_model(train_data, train_frame_data, train_video_label, train_caption_id, train_caption_id_1, train_caption_label) val_v2s_tf,v2s_lstm3_vars_tf = model.build_v2s_generator(val_data) val_s2s_tf, 
s2s_lstm3_vars_tf = model.build_s2s_generator(val_caption_id_1) val_s2v_tf, s2v_lstm4_vars_tf = model.build_s2v_generator(val_caption_id_1, val_frame_data) val_v2v_tf, v2v_lstm4_vars_tf = model.build_v2v_generator(val_data, val_frame_data) sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True)) with tf.device(cpu_device): saver = tf.train.Saver() saver.restore(sess, model_path) print 'load parameters from:', model_path # print 'halve the dropout weights..' # for ind, row in enumerate(v2s_lstm3_vars_tf): # if ind % 4 == 0: # assign_op = row.assign(tf.multiply(row,1-0.5)) # sess.run(assign_op) # for ind, row in enumerate(s2s_lstm2_vars_tf): # if ind % 4 == 0: # assign_op = row.assign(tf.multiply(row,1-0.5)) # sess.run(assign_op) # for ind, row in enumerate(s2v_lstm4_vars_tf): # if ind % 4 == 0: # assign_op = row.assign(tf.multiply(row,1-0.5)) # sess.run(assign_op) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) ######### test sentence generation ########## print 'testing...' n_test_steps = int(n_test_samples / batch_size) print 'n_test_steps:', n_test_steps tstart = time.time() ### TODO: sometimes COCO test show exceptions in the beginning of training #### if test_v2s: try: [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_v2s_tf, val_fname) for i, key in enumerate(pred_dict.keys()): print 'video:', flist[i] for ele in gt_dict[key]: print "GT: " + ele['caption'] print "PD: " + pred_dict[key][0]['caption'] print '-------' print '############## video to sentence result #################' [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname) scorer = COCOScorer() total_score_1 = scorer.score(gt_dict, pred_dict, id_list) print '############## video to sentence result #################' except Exception, e: print 'v2s bleu test exception'
except Exception, e: print 'v2s bleu test exception' if test_s2s: try: [pred_sent, gt_sent, id_list, gt_dict, pred_dict, flist] = testing_all(sess, 1, ixtoword, val_s2s_tf, val_fname) for i, key in enumerate(pred_dict.keys()): print 'video:', flist[i] for ele in gt_dict[key]: print "GT: " + ele['caption'] print "PD: " + pred_dict[key][0]['caption'] print '-------' print '############## sentence to sentence result #################' [pred_sent, gt_sent, id_list, gt_dict, pred_dict, _] = testing_all(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname) scorer = COCOScorer() total_score_2 = scorer.score(gt_dict, pred_dict, id_list) print '############## sentence to sentence result #################' except Exception, e: print 'v2s bleu test exception' ######### test video generation ############# if test_v2v: mse_v2v = test_all_videos(sess, n_test_steps, val_data, val_v2v_tf, val_video_label, pixel_scale_factor) print 'video2video mse:', mse_v2v if test_s2v: mse_s2v = test_all_videos(sess, n_test_steps, val_data, val_s2v_tf, val_video_label, pixel_scale_factor) print 'caption2video mse:', mse_s2v if save_demo_sent_v2s: get_demo_sentence(sess, n_test_steps, ixtoword, val_v2s_tf, val_fname, result_file='demo_v2s.txt') if save_demo_sent_s2s: get_demo_sentence(sess, n_test_steps, ixtoword, val_s2s_tf, val_fname, result_file='demo_s2s.txt')
def train(): meta_data, train_data, test_data = get_video_data_jukin(video_data_path_train, video_data_path_test) captions = meta_data['Description'].values captions = map(lambda x: x.replace('.', ''), captions) captions = map(lambda x: x.replace(',', ''), captions) wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1) np.save('./data/ixtoword', ixtoword) model = Video_Caption_Generator( dim_image=dim_image, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_lstm_steps=n_frame_step, drop_out_rate = 0.5, bias_init_vector=None) tf_loss, tf_video, tf_video_mask, tf_video_len, tf_caption, tf_caption_mask, tf_HLness, tf_HLness_mask, tf_HLness_att_mask= model.build_model() loss_summary = tf.scalar_summary("Loss",tf_loss) sess = tf.InteractiveSession() merged = tf.merge_all_summaries() writer = tf.train.SummaryWriter('/tmp/tf_log', sess.graph_def) saver = tf.train.Saver(max_to_keep=100) train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss) tf.initialize_all_variables().run() tStart_total = time.time() for epoch in range(n_epochs): index = np.arange(len(train_data)) np.random.shuffle(index) train_data = train_data[index] tStart_epoch = time.time() loss_epoch = np.zeros(len(train_data)) for current_batch_file_idx in xrange(len(train_data)): tStart = time.time() current_batch = h5py.File(train_data[current_batch_file_idx]) current_feats = np.zeros((batch_size, n_frame_step, dim_image)) current_HLness = np.zeros((batch_size, n_frame_step)) current_HLness_masks = np.zeros((batch_size, n_frame_step)) current_HLness_att_masks = np.zeros((batch_size, n_frame_step)) current_video_masks = np.zeros((batch_size, n_frame_step)) current_video_len = np.zeros(batch_size) for ind in xrange(batch_size): current_feats[ind,:,:] = current_batch['data'][:,ind,:] idx = np.where(current_batch['label'][:,ind] != -1)[0] if len(idx) == 0: continue idy = np.where(current_batch['label'][:,ind] == 1)[0] if len(idy) == 0: continue 
current_HLness[ind,idx] = current_batch['label'][idx,ind] current_HLness_masks[ind,idx] = 1 current_video_masks[ind,idy[-1]] = 1 current_video_len[ind] = idx[-1] + 1 current_HLness_att_masks[ind,idy] = 1 if(idy[0] > 4): current_HLness_att_masks[ind,idy[0]-5:idy[0]] = 1 else: current_HLness_att_masks[ind,0:idy[0]] = 1 current_captions = current_batch['title'] current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix], current_captions) current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=15-1) current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros( [len(current_caption_matrix),1]) ] ).astype(int) current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1])) nonzeros = np.array( map(lambda x: (x != 0).sum()+1, current_caption_matrix )) for ind, row in enumerate(current_caption_masks): row[:nonzeros[ind]] = 1 _, loss_val, summary_str= sess.run( [train_op, tf_loss, merged], feed_dict={ tf_video: current_feats, tf_video_mask : current_video_masks, tf_caption: current_caption_matrix, tf_caption_mask: current_caption_masks, tf_HLness: current_HLness, tf_HLness_mask: current_HLness_masks, tf_HLness_att_mask: current_HLness_att_masks }) writer.add_summary(summary_str, epoch) loss_epoch[current_batch_file_idx] = loss_val tStop = time.time() #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val #print "Time Cost:", round(tStop - tStart,2), "s" print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch) tStop_epoch = time.time() print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s" if np.mod(epoch, 20) == 0: print "Epoch ", epoch, " is done. Saving the model ..." 
saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch) current_batch = h5py.File(test_data[np.random.randint(0,len(test_data))]) video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf, lstmRNN_variables_tf, lstm3_variables_tf = model.build_generator() ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist()) #[mp, pred_sent, gt_sent, HLness] = testing_one(sess, current_batch, ixtoword,video_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf) [mp, pred_sent, gt_sent, HLness] = testing_all(sess, test_data, ixtoword,video_tf, video_mask_tf, video_len_tf, HLness_tf, caption_tf, HLness_att_mask_tf) #for xxx in xrange(current_batch['label'].shape[1]): # print gt_sent[xxx] # print pred_sent[xxx] total_score = np.mean(mp) print total_score scorer = COCOScorer() total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent))) print "Finally, saving the model ..." saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs) tStop_total = time.time() print "Total Time Cost:", round(tStop_total - tStart_total,2), "s"
def train():
    """Fine-tune the tracker-augmented caption model with prefetched batch data.

    Takes no arguments and returns nothing.  Data paths and hyper-parameters
    (``video_data_path_train/val/test``, ``dim_image``, ``dim_tracker``,
    ``tracker_cnt``, ``dim_hidden``, ``batch_size``, ``n_frame_step``,
    ``learning_rate``, ``n_epochs``, ``model_path``) are read from
    module-level names.

    Outline:
      1. Load metadata and the train/val/test batch-file lists and build a
         vocabulary, then replace ``ixtoword``/``wordtoix`` with the copies
         stored under ./data_all.
      2. Build the training graph, Saver and Adam train op, initialise all
         variables and restore the checkpoint 'models/model-0'.
      3. Each epoch: shuffle the batch files, start two loader threads that
         fill queues with the 'data' and 'title' entries, and run one
         optimisation step per file.
      4. Save a checkpoint every 2nd epoch; every 10th epoch score the
         validation files with ``COCOScorer``.
      5. Save a final checkpoint with ``global_step=n_epochs``.
    """
    meta_data, train_data, val_data, test_data = get_video_data_jukin(
        video_data_path_train, video_data_path_val, video_data_path_test)

    # Periods and commas are stripped before the vocabulary is built.
    captions = [
        cap.replace('.', '').replace(',', '')
        for cap in meta_data['Description'].values
    ]
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(
        captions, word_count_threshold=1)

    # The freshly built vocabulary is overridden by the stored copies.
    ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
    wordtoix = pd.Series(np.load('./data_all/wordtoix.npy').tolist())

    model = Video_Caption_Generator(
        dim_image=dim_image,
        dim_tracker=dim_tracker,
        n_words=len(wordtoix),
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        n_lstm_steps=n_frame_step,
        tracker_cnt=tracker_cnt,
        drop_out_rate=0.5,
        bias_init_vector=None)

    (tf_loss, tf_video, tf_video_mask, tf_tracker, tf_tracker_mask,
     tf_caption, tf_caption_mask) = model.build_model()
    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.device("/cpu:0"):
        saver = tf.train.Saver(max_to_keep=100)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    tf.initialize_all_variables().run()
    # Restoring after initialisation overwrites the saved variables only.
    saver.restore(sess, 'models/model-0')

    tStart_total = time.time()
    nr_prefetch = int(3)
    for epoch in range(n_epochs):
        # Visit the batch files in a fresh random order every epoch.
        index = np.arange(len(train_data))
        np.random.shuffle(index)
        train_data = train_data[index]

        tStart_epoch = time.time()
        loss_epoch = np.zeros(len(train_data))

        # Loader threads receive the already-shuffled file list, so the
        # queues deliver entries in the same order the loop below visits.
        # NOTE(review): if an exception escapes the batch loop the loader
        # threads are never joined -- confirm whether that matters here.
        data_queue = mp.Queue(nr_prefetch)
        title_queue = mp.Queue(nr_prefetch)
        t1 = Thread(target=load_data_into_queue, args=(train_data, data_queue, 'data'))
        t3 = Thread(target=load_data_into_queue, args=(train_data, title_queue, 'title'))
        t1.start()
        t3.start()

        for current_batch_file_idx in range(len(train_data)):
            current_feats = np.zeros((batch_size, n_frame_step, dim_image))
            current_video_masks = np.zeros((batch_size, n_frame_step))

            # Open read-only; the context manager closes the handle even
            # when reading fails part-way through.
            with h5py.File(train_data[current_batch_file_idx], 'r') as current_batch:
                # Tracker inputs are optional per file; fall back to zeros.
                if 'tracker' in current_batch:
                    current_tracker = np.array(current_batch['tracker'])
                else:
                    current_tracker = np.zeros((batch_size, tracker_cnt, dim_tracker))
                if 'tracker_mask' in current_batch:
                    current_tracker_mask = np.array(current_batch['tracker_mask'])
                else:
                    current_tracker_mask = np.zeros((batch_size, tracker_cnt))
                labels = current_batch['label'][...]

            current_batch_data = data_queue.get()
            current_batch_title = title_queue.get()

            for ind in range(batch_size):
                current_feats[ind, :, :] = current_batch_data[:, ind, :]
                # Only the last frame whose label is not -1 is marked;
                # samples without such a frame keep an all-zero mask.
                idx = np.where(labels[:, ind] != -1)[0]
                if len(idx) == 0:
                    continue
                current_video_masks[ind, idx[-1]] = 1

            # Words missing from the vocabulary are dropped.
            current_caption_ind = [
                [wordtoix[word] for word in cap.lower().split(' ')
                 if word in wordtoix]
                for cap in current_batch_title
            ]
            # Pad to 34 ids, then append a zero column (35 columns total).
            current_caption_matrix = sequence.pad_sequences(
                current_caption_ind, padding='post', maxlen=35 - 1)
            current_caption_matrix = np.hstack([
                current_caption_matrix,
                np.zeros([len(current_caption_matrix), 1])
            ]).astype(int)

            # Mask covers every non-zero id plus one extra position.
            nonzeros = (current_caption_matrix != 0).sum(axis=1) + 1
            current_caption_masks = np.zeros(current_caption_matrix.shape)
            for row, n_valid in zip(current_caption_masks, nonzeros):
                row[:n_valid] = 1

            _, loss_val = sess.run(
                [train_op, tf_loss],
                feed_dict={
                    tf_video: current_feats,
                    tf_video_mask: current_video_masks,
                    tf_tracker: current_tracker,
                    tf_tracker_mask: current_tracker_mask,
                    tf_caption: current_caption_matrix,
                    tf_caption_mask: current_caption_masks
                })
            loss_epoch[current_batch_file_idx] = loss_val

        t1.join()
        t3.join()

        print ("Epoch:", epoch, " done. Loss:", np.mean(loss_epoch))
        tStop_epoch = time.time()
        print ("Epoch Time Cost:", round(tStop_epoch - tStart_epoch,2), "s")
        sys.stdout.flush()

        if np.mod(epoch, 2) == 0:
            print ("Epoch ", epoch, " is done. Saving the model ...")
            with tf.device('/cpu:0'):
                saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

        if np.mod(epoch, 10) == 0:
            # NOTE(review): build_generator() is re-invoked on every
            # evaluation pass; if it adds new ops to the default graph each
            # time, build it once and reuse the returned tensors -- confirm
            # against the model class.
            (video_tf, video_mask_tf, tracker_tf, tracker_mask_tf, caption_tf,
             lstm1_variables_tf, lstm2_variables_tf) = model.build_generator()
            ixtoword = pd.Series(np.load('./data_all/ixtoword.npy').tolist())
            [pred_sent, gt_sent, id_list, gt_dict, pred_dict, fnamelist] = testing_all_multi_gt(
                sess, val_data, ixtoword, video_tf, video_mask_tf,
                tracker_tf, tracker_mask_tf, caption_tf)
            scorer = COCOScorer()
            total_score = scorer.score(gt_dict, pred_dict, id_list)

    print ("Finally, saving the model ...")
    with tf.device('/cpu:0'):
        saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs)
    tStop_total = time.time()
    print ("Total Time Cost:", round(tStop_total - tStart_total,2), "s")
def train(): meta_data, train_data, val_data, test_data = get_video_data_jukin( video_data_path_train, video_data_path_val, video_data_path_test) captions = meta_data['Description'].values captions = map(lambda x: x.replace('.', ''), captions) captions = map(lambda x: x.replace(',', ''), captions) wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab( captions, word_count_threshold=1) np.save('./data0/ixtoword', ixtoword) model = Video_Caption_Generator(dim_image=dim_image, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_lstm_steps=n_frame_step, drop_out_rate=0.5, bias_init_vector=None) tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model( ) sess = tf.InteractiveSession(config=tf.ConfigProto( allow_soft_placement=True)) with tf.device("/cpu:0"): saver = tf.train.Saver(max_to_keep=100) train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss) tf.initialize_all_variables().run() tStart_total = time.time() for epoch in range(n_epochs): index = np.arange(len(train_data)) np.random.shuffle(index) train_data = train_data[index] tStart_epoch = time.time() loss_epoch = np.zeros(len(train_data)) for current_batch_file_idx in xrange(len(train_data)): tStart = time.time() current_batch = h5py.File(train_data[current_batch_file_idx]) current_feats = np.zeros((batch_size, n_frame_step, dim_image)) current_video_masks = np.zeros((batch_size, n_frame_step)) current_video_len = np.zeros(batch_size) for ind in xrange(batch_size): current_feats[ind, :, :] = current_batch['data'][:n_frame_step, ind, :] idx = np.where(current_batch['label'][:, ind] != -1)[0] if len(idx) == 0: continue current_video_masks[ind, :idx[-1] + 1] = 1 current_captions = current_batch['title'] current_caption_ind = map( lambda cap: [ wordtoix[word] for word in cap.lower().split(' ') if word in wordtoix ], current_captions) current_caption_matrix = sequence.pad_sequences( current_caption_ind, padding='post', maxlen=n_caption_step - 1) 
current_caption_matrix = np.hstack([ current_caption_matrix, np.zeros([len(current_caption_matrix), 1]) ]).astype(int) current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1])) nonzeros = np.array( map(lambda x: (x != 0).sum() + 1, current_caption_matrix)) for ind, row in enumerate(current_caption_masks): row[:nonzeros[ind]] = 1 _, loss_val = sess.run( [train_op, tf_loss], feed_dict={ tf_video: current_feats, tf_video_mask: current_video_masks, tf_caption: current_caption_matrix, tf_caption_mask: current_caption_masks }) loss_epoch[current_batch_file_idx] = loss_val tStop = time.time() #print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val #print "Time Cost:", round(tStop - tStart,2), "s" print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch) tStop_epoch = time.time() print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s" if np.mod(epoch, 10) == 0 or epoch == n_epochs - 1: print "Epoch ", epoch, " is done. Saving the model ..." with tf.device("/cpu:0"): saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch) current_batch = h5py.File(val_data[np.random.randint( 0, len(val_data))]) video_tf, video_mask_tf, caption_tf, lstm3_variables_tf = model.build_generator( ) ixtoword = pd.Series(np.load('./data0/ixtoword.npy').tolist()) [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf) for key in pred_dict.keys(): for ele in gt_dict[key]: print "GT: " + ele['caption'] print "PD: " + pred_dict[key][0]['caption'] print '-------' [pred_sent, gt_sent, id_list, gt_dict, pred_dict] = testing_all(sess, val_data, ixtoword, video_tf, video_mask_tf, caption_tf) scorer = COCOScorer() total_score = scorer.score(gt_dict, pred_dict, id_list) sys.stdout.flush() print "Finally, saving the model ..." 
with tf.device("/cpu:0"): saver.save(sess, os.path.join(model_path, 'model'), global_step=n_epochs) tStop_total = time.time() print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"
def train(): meta_data, train_data, val_data, test_data = get_video_data_jukin( video_data_path_train, video_data_path_val, video_data_path_test ) captions = meta_data["Description"].values captions = map(lambda x: x.replace(".", ""), captions) captions = map(lambda x: x.replace(",", ""), captions) wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=1) np.save("./data" + str(gpu_id) + "/ixtoword", ixtoword) model = Video_Caption_Generator( dim_image=dim_image, n_words=len(wordtoix), dim_hidden=dim_hidden, batch_size=batch_size, n_lstm_steps=n_frame_step, drop_out_rate=0.5, bias_init_vector=None, ) tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model() loss_summary = tf.scalar_summary("Loss", tf_loss) sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True)) merged = tf.merge_all_summaries() writer = tf.train.SummaryWriter("/tmp/tf_log", sess.graph_def) saver = tf.train.Saver(max_to_keep=100) train_op = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss) tf.initialize_all_variables().run() saver.restore(sess, "models_SS_youtube_notest_dummy/model-20") tStart_total = time.time() for epoch in range(n_epochs): index = np.arange(len(train_data)) np.random.shuffle(index) train_data = train_data[index] tStart_epoch = time.time() loss_epoch = np.zeros(len(train_data)) for current_batch_file_idx in xrange(len(train_data)): tStart = time.time() current_batch = h5py.File(train_data[current_batch_file_idx]) current_feats = np.zeros((batch_size, n_frame_step, dim_image)) current_video_masks = np.zeros((batch_size, n_frame_step)) current_video_len = np.zeros(batch_size) for ind in xrange(batch_size): current_feats[ind, :, :] = current_batch["data"][:, ind, :] idx = np.where(current_batch["label"][:, ind] != -1)[0] if len(idx) == 0: continue current_video_masks[ind, idx[-1]] = 1 current_captions = current_batch["title"] current_caption_ind = map( lambda cap: [wordtoix[word] for word in 
cap.lower().split(" ") if word in wordtoix], current_captions ) current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding="post", maxlen=16 - 1) current_caption_matrix = np.hstack( [current_caption_matrix, np.zeros([len(current_caption_matrix), 1])] ).astype(int) current_caption_masks = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1])) nonzeros = np.array(map(lambda x: (x != 0).sum() + 1, current_caption_matrix)) for ind, row in enumerate(current_caption_masks): row[: nonzeros[ind]] = 1 _, loss_val, summary_str = sess.run( [train_op, tf_loss, merged], feed_dict={ tf_video: current_feats, tf_video_mask: current_video_masks, tf_caption: current_caption_matrix, tf_caption_mask: current_caption_masks, }, ) writer.add_summary(summary_str, epoch) loss_epoch[current_batch_file_idx] = loss_val tStop = time.time() # print "Epoch:", epoch, " Batch:", current_batch_file_idx, " Loss:", loss_val # print "Time Cost:", round(tStop - tStart,2), "s" print "Epoch:", epoch, " done. Loss:", np.mean(loss_epoch) tStop_epoch = time.time() print "Epoch Time Cost:", round(tStop_epoch - tStart_epoch, 2), "s" sys.stdout.flush() if np.mod(epoch, 10) == 0: print "Epoch ", epoch, " is done. Saving the model ..." 
saver.save(sess, os.path.join(model_path, "model"), global_step=epoch) current_batch = h5py.File(val_data[np.random.randint(0, len(val_data))]) video_tf, video_mask_tf, caption_tf, lstm1_variables_tf, lstm2_variables_tf = model.build_generator() ixtoword = pd.Series(np.load("./data" + str(gpu_id) + "/ixtoword.npy").tolist()) [pred_sent, gt_sent] = testing_all(sess, train_data[-2:], ixtoword, video_tf, video_mask_tf, caption_tf) for idx in range(len(pred_sent)): print "GT: " + gt_sent[idx][0]["caption"] print "PD: " + pred_sent[idx][0]["caption"] print "-------" [pred_sent, gt_sent] = testing_all(sess, val_data, ixtoword, video_tf, video_mask_tf, caption_tf) scorer = COCOScorer() total_score = scorer.score(gt_sent, pred_sent, range(len(pred_sent))) print "Finally, saving the model ..." saver.save(sess, os.path.join(model_path, "model"), global_step=n_epochs) tStop_total = time.time() print "Total Time Cost:", round(tStop_total - tStart_total, 2), "s"