def beam_search_exe_test(sess, data, audio_info, batch_size, v2i, i2v, hf,
                         feature_shape, predict_words, input_video,
                         input_captions, input_audio, y, finished_beam,
                         logprobs_finished_beams, past_logprobs, capl=16):
    """Run beam-search caption generation over ``data`` and collect results.

    ``data`` is walked in consecutive slices of at most ``batch_size`` items.
    For each slice the video, caption and audio inputs are built through
    ``MsrDataUtil`` and fed to ``sess.run``; the fetched ``finished_beam``
    value is converted to sentences with ``MsrDataUtil.convertCaptionI2V``.
    Every generated sentence is printed and recorded.

    Args:
        sess: Object exposing ``run(fetches, feed_dict=...)`` (presumably a
            ``tf.Session`` -- confirm against the caller).
        data: Sliceable sequence of dicts; the first key of each dict is
            used as the ``image_id`` of the generated caption.
        audio_info: Passed through to
            ``MsrDataUtil.getBatchVideoAudioInfo``.
        batch_size: Maximum number of items per ``sess.run`` call; must be a
            positive integer.
        v2i: Passed to ``getBatchTestCaptionWithSparseLabel``.
        i2v: Passed to ``convertCaptionI2V``.
        hf: Passed to ``getBatchVideoFeature``.
        feature_shape: Passed to ``getBatchVideoFeature``.
        predict_words, finished_beam, logprobs_finished_beams, past_logprobs:
            Fetches handed to ``sess.run``; only the ``finished_beam`` result
            is used afterwards.
        input_video, input_captions, input_audio, y: Feed-dict keys.
        capl: Forwarded as ``capl=`` to
            ``getBatchTestCaptionWithSparseLabel`` (default 16).

    Returns:
        dict: ``{'val_predictions': [{'image_id': ..., 'caption': ...}, ...]}``
        with one entry per generated caption, in processing order.
    """
    caption_output = []
    total_data = len(data)
    # Ceiling division: the previous int(round(total / batch_size)) skipped
    # the trailing partial batch whenever it was smaller than half a batch,
    # so some items never received a caption.
    num_batch = (total_data + batch_size - 1) // batch_size

    for batch_idx in range(num_batch):
        start = batch_idx * batch_size
        # Slicing clamps at the end of ``data``, so the last batch may be
        # shorter than ``batch_size``.
        batch_caption = data[start:start + batch_size]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTestCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        # NOTE(review): exe_train calls this helper with an extra
        # ``feature_shape`` argument -- confirm which arity is intended.
        data_audio = MsrDataUtil.getBatchVideoAudioInfo(batch_caption,
                                                        audio_info)

        # The fetch list is kept as-is; only the finished beams are consumed.
        gw, _, _, _ = sess.run(
            [
                finished_beam, predict_words, logprobs_finished_beams,
                past_logprobs
            ],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                input_audio: data_audio,
                y: data_y
            })

        generated_captions = MsrDataUtil.convertCaptionI2V(
            batch_caption, gw, i2v)

        for idx, sen in enumerate(generated_captions):
            # First key of the item dict identifies the video.
            video_id = list(batch_caption[idx].keys())[0]
            print('%s : %s' % (video_id, sen))
            caption_output.append({'image_id': video_id, 'caption': sen})

    return {'val_predictions': caption_output}
def exe_train(sess, data, audio_info, cate_info, batch_size, v2i, hf,
              feature_shape, train, loss, input_video, input_captions,
              input_categories, input_audio, y, capl=16):
    """Run one training pass over ``data`` and return the mean batch loss.

    ``data`` is shuffled IN PLACE with ``np.random.shuffle`` and then walked
    in consecutive slices of at most ``batch_size`` items. For each slice the
    video, caption, category and audio inputs are built through
    ``MsrDataUtil`` and ``sess.run([train, loss], ...)`` is executed. The
    per-batch loss is printed.

    Args:
        sess: Object exposing ``run(fetches, feed_dict=...)`` (presumably a
            ``tf.Session`` -- confirm against the caller).
        data: Mutable, sliceable sequence of training items; reordered by
            this call.
        audio_info: Passed to ``MsrDataUtil.getBatchVideoAudioInfo``.
        cate_info: Passed to ``MsrDataUtil.getBatchVideoCategoriesInfo``.
        batch_size: Maximum number of items per ``sess.run`` call; must be a
            positive integer.
        v2i: Passed to ``getBatchTrainCaptionWithSparseLabel``.
        hf: Passed to ``getBatchVideoFeature``.
        feature_shape: Passed to the video, category and audio helpers.
        train, loss: Fetches handed to ``sess.run``; the ``loss`` result is
            accumulated.
        input_video, input_captions, input_categories, input_audio, y:
            Feed-dict keys.
        capl: Forwarded as ``capl=`` to
            ``getBatchTrainCaptionWithSparseLabel`` (default 16).

    Returns:
        The sum of the fetched batch losses divided by the number of batches,
        or ``0.0`` when ``data`` is empty and no batch was run.
    """
    np.random.shuffle(data)
    total_data = len(data)
    # Ceiling division: the previous int(round(total / batch_size)) skipped
    # the trailing partial batch whenever it was smaller than half a batch,
    # and produced zero batches (then a division by zero) for small inputs.
    num_batch = (total_data + batch_size - 1) // batch_size
    if num_batch == 0:
        # Nothing to train on; avoid dividing by zero below.
        return 0.0

    total_loss = 0.0
    for batch_idx in range(num_batch):
        start = batch_idx * batch_size
        # Slicing clamps at the end of ``data``, so the last batch may be
        # shorter than ``batch_size``.
        batch_caption = data[start:start + batch_size]

        data_v = MsrDataUtil.getBatchVideoFeature(batch_caption, hf,
                                                  feature_shape)
        data_c, data_y = MsrDataUtil.getBatchTrainCaptionWithSparseLabel(
            batch_caption, v2i, capl=capl)
        data_cate = MsrDataUtil.getBatchVideoCategoriesInfo(
            batch_caption, cate_info, feature_shape)
        # NOTE(review): beam_search_exe_test calls this helper without
        # ``feature_shape`` -- confirm which arity is intended.
        data_audio = MsrDataUtil.getBatchVideoAudioInfo(
            batch_caption, audio_info, feature_shape)

        _, batch_loss = sess.run(
            [train, loss],
            feed_dict={
                input_video: data_v,
                input_captions: data_c,
                input_categories: data_cate,
                input_audio: data_audio,
                y: data_y
            })
        total_loss += batch_loss
        print('    batch_idx:%d/%d, loss:%.5f' % (batch_idx + 1, num_batch,
                                                  batch_loss))

    return total_loss / num_batch