import os
import time

import tensorflow as tf

import MsrDataUtil
import mGRUCaptionModel


def main(hf, f_type, capl=16, d_w2v=512, output_dim=512, feature_shape=None,
         lr=0.01, batch_size=64, total_epoch=100, file=None, pretrained_model=None):
    '''
    capl: the length of caption
    '''
    # Create the vocabulary and the train/val/test splits.
    v2i, train_data, val_data, test_data = MsrDataUtil.create_vocabulary_word2vec(
        file, capl=capl, word_threshold=1,
        v2i={'': 0, 'UNK': 1, 'BOS': 2, 'EOS': 3})
    i2v = {i: v for v, i in v2i.items()}

    print('building model ...')
    voc_size = len(v2i)

    # Placeholders for video features, input captions, and target word indices.
    input_video = tf.placeholder(tf.float32, shape=(None,) + feature_shape, name='input_video')
    input_captions = tf.placeholder(tf.int32, shape=(None, capl), name='input_captions')
    y = tf.placeholder(tf.int32, shape=(None, capl))

    attentionCaptionModel = mGRUCaptionModel.mLSTMAttentionCaptionModel(
        input_video, input_captions, voc_size, d_w2v, output_dim, T_k=[1, 2, 4, 8])
    predict_score, predict_words, loss_mask = attentionCaptionModel.build_model()

    # Masked sequence cross-entropy: zero out padded time steps with loss_mask,
    # then normalize each caption's loss by its true (unpadded) length.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=predict_score)
    loss = tf.reduce_sum(loss * loss_mask, axis=-1) / tf.reduce_sum(loss_mask, axis=-1)
    loss = tf.reduce_mean(loss) + sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    # Adam with per-variable gradient clipping (each gradient's norm capped at 10).
    optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999,
                                       epsilon=1e-08, use_locking=False, name='Adam')
    gvs = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_norm(grad, 10), var) for grad, var in gvs if grad is not None]
    train = optimizer.apply_gradients(capped_gvs)

    # optimizer = tf.train.RMSPropOptimizer(lr, decay=0.9, momentum=0.0, epsilon=1e-8)
    # train = optimizer.minimize(loss)

    '''
    configure && runtime environment
    '''
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    config.log_device_placement = False
    sess = tf.Session(config=config)

    init = tf.global_variables_initializer()
    sess.run(init)

    with sess.as_default():
        saver = tf.train.Saver(sharded=True, max_to_keep=total_epoch)
        if pretrained_model is not None:
            saver.restore(sess, pretrained_model)
            print('restored pretrained model from: ' + pretrained_model)

        for epoch in range(total_epoch):
            # shuffle
            print('Epoch: %d/%d, Batch_size: %d' % (epoch + 1, total_epoch, batch_size))

            # Train phase.
            tic = time.time()
            total_loss = exe_train(sess, train_data, batch_size, v2i, hf, feature_shape,
                                   train, loss, input_video, input_captions, y, capl=capl)
            print('    --Train--, Loss: %.5f, .......Time:%.3f' % (total_loss, time.time() - tic))

            # Test phase: generate captions for the test split.
            tic = time.time()
            js = exe_test(sess, test_data, batch_size, v2i, i2v, hf, feature_shape,
                          predict_words, input_video, input_captions, y, capl=capl)
            print('    --Val--, .......Time:%.3f' % (time.time() - tic))

            # Save the model and the generated captions for this epoch.
            export_path = ('/home/xyj/usr/local/saved_model/msrvtt2017/s2s' + '_' + f_type
                           + '/' + 'lr' + str(lr) + '_f' + str(feature_shape[0])
                           + '_B' + str(batch_size))
            if not os.path.exists(export_path + '/model'):
                os.makedirs(export_path + '/model')
                print('mkdir %s' % (export_path + '/model'))
            if not os.path.exists(export_path + '/res'):
                os.makedirs(export_path + '/res')
                print('mkdir %s' % (export_path + '/res'))

            # Score this epoch's captions with the external evaluation script.
            res_path = export_path + '/res/' + f_type + '_E' + str(epoch + 1) + '.json'
            evaluate_mode_by_shell(res_path, js)

            save_path = saver.save(sess, export_path + '/model/' + 'E' + str(epoch + 1)
                                   + '_L' + str(total_loss) + '.ckpt')
            print("Model saved in file: %s" % save_path)
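
# A minimal, hypothetical entry point sketching how main() above might be
# invoked; it is not part of the original code. The HDF5 feature path, the
# feature name 'googlenet', the feature_shape, and the annotation path are
# illustrative assumptions, not values taken from this repository.
if __name__ == '__main__':
    import h5py

    feature_path = '/path/to/msrvtt_features.h5'  # hypothetical path
    hf = h5py.File(feature_path, 'r')
    main(hf, 'googlenet',
         capl=16, d_w2v=512, output_dim=512,
         feature_shape=(40, 1024),                # hypothetical (frames, dim)
         lr=0.0001, batch_size=64, total_epoch=20,
         file='/path/to/msrvtt_caption_data')     # hypothetical annotation dir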
def beam_search_exe_test(sess, data, audio_info, cate_info, batch_size, v2i, i2v,
                         hf, feature_shape, predict_words, input_video, input_captions,
                         input_categories, input_audio, y, finished_beam,
                         logprobs_finished_beams, past_logprobs, capl=16):
    caption_output = []
    total_data = len(data)
    # Ceiling division so the final partial batch is never dropped
    # (the original int(round(...)) could skip it).
    num_batch = (total_data + batch_size - 1) // batch_size

    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_data)]

        # Gather the per-batch video, caption, category, and audio inputs.
        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf, feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(batch_caption, cate_info, feature_shape)
        data_audio = MsrDataUtil.getBatchVideoAudioInfo(batch_caption, audio_info, feature_shape)

        [gw, tw, gp, gl] = sess.run(
            [finished_beam, predict_words, logprobs_finished_beams, past_logprobs],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                input_categories: data_cate,
                input_audio: data_audio,
                y: data_y
            })

        # Decode the finished beams back into word sequences.
        generated_captions = MsrDataUtil.convertCaptionI2V(batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            video_id = list(batch_caption[idx].keys())[0]
            print('%s : %s' % (video_id, sen))
            caption_output.append({'image_id': video_id, 'caption': sen})

    js = {'val_predictions': caption_output}
    return js
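
# A small sketch (not from the original repository) of persisting the
# predictions dict returned by beam_search_exe_test, so an external scorer
# such as evaluate_mode_by_shell can read it from disk; the res_path argument
# is chosen by the caller.
import json

def save_predictions(js, res_path):
    # js has the form {'val_predictions': [{'image_id': ..., 'caption': ...}, ...]}
    with open(res_path, 'w') as f:
        json.dump(js, f)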
def exe_test(sess, data, cate_info, batch_size, v2i, i2v, hf1, hf2,
             feature_shape1, feature_shape2, predict_words, input_video1, input_video2,
             input_captions, input_categories, y, capl=16):
    caption_output = []
    total_data = len(data)
    # Ceiling division; the original int(round(...)) + 1 could produce an
    # empty extra batch at the end.
    num_batch = (total_data + batch_size - 1) // batch_size

    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_data)]

        # Two feature streams: frame-level features (hf1) and C3D features (hf2).
        data_v1 = MsrDataUtil.getBatchVideoFeature(batch_caption, hf1, feature_shape1)
        data_v2 = MsrDataUtil.getBatchC3DVideoFeature(batch_caption, hf2, feature_shape2)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(batch_caption, cate_info, feature_shape1)

        [gw] = sess.run(
            [predict_words],
            feed_dict={
                input_video1: data_v1,
                input_video2: data_v2,
                input_captions: data_c,
                input_categories: data_cate,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            video_id = list(batch_caption[idx].keys())[0]
            print('%s : %s' % (video_id, sen))
            caption_output.append({'image_id': video_id, 'caption': sen})

    js = {'val_predictions': caption_output}
    return js
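
# For reference, a minimal sketch of the index-to-sentence decoding that
# MsrDataUtil.convertCaptionI2V performs above; the real implementation lives
# in MsrDataUtil and may differ. This version assumes 'EOS' terminates a
# caption and that 'BOS' and padding tokens should be skipped.
def indices_to_sentence(word_ids, i2v):
    words = []
    for wid in word_ids:
        token = i2v.get(int(wid), 'UNK')
        if token == 'EOS':
            break
        if token not in ('BOS', ''):
            words.append(token)
    return ' '.join(words)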
def exe_test(sess, data, batch_size, v2i, i2v, hf, feature_shape, predict_words,
             input_video, input_captions, y, finished_beam, logprobs_finished_beams,
             past_logprobs, beam_hidden_state, past_symbols_states,
             finished_beams_states, capl=16):
    caption_output = []
    total_data = len(data)
    # Ceiling division so the final partial batch is never dropped.
    num_batch = (total_data + batch_size - 1) // batch_size

    for batch_idx in range(num_batch):
        batch_caption = data[batch_idx * batch_size:min((batch_idx + 1) * batch_size, total_data)]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf, feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(batch_caption, v2i, capl=capl)

        # Fetch the finished beams plus the intermediate beam-search states
        # (log-probs, hidden state, past symbols) for inspection.
        [gw, tw, gp, gl, pp, pss, fbs] = sess.run(
            [finished_beam, predict_words, logprobs_finished_beams, past_logprobs,
             beam_hidden_state, past_symbols_states, finished_beams_states],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            video_id = list(batch_caption[idx].keys())[0]
            print('%s : %s' % (video_id, sen))
            caption_output.append({'image_id': video_id, 'caption': sen})

    js = {'val_predictions': caption_output}
    return js
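
# An explanatory sketch, not part of the original code: the batch counts in
# the loops above come from ceiling division, because int(round(total / bs))
# can drop the final partial batch and round(...) + 1 can add an empty one.
def num_batches(total_data, batch_size):
    # Ceiling division: every example falls into exactly one batch.
    return (total_data + batch_size - 1) // batch_size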