def main(_):
    """Decode the validation annotations with a series of checkpoints.

    For every checkpoint index in the ``FLAGS.eval_stat`` range
    ("start stop step", whitespace separated) this waits until the
    checkpoint's ``.meta`` file exists, loads it into the decoder, decodes
    every annotation listed in ``FLAGS.anno_files_path`` (colon separated),
    dumps the top caption per image as JSON and hands the file to
    ``CommonUtiler.coco_val_eval``.

    Args:
        _: Unused positional argument.
    """
    # Load model configuration
    utils = CommonUtiler()
    config = utils.load_config(
        os.path.join('./model_conf', FLAGS.model_name + '.py'))

    # Evaluate trained models on val
    decoder = mRNNDecoder(
        config,
        FLAGS.model_name,
        FLAGS.vocab_path,
        gpu_memory_fraction=FLAGS.gpu_memory_fraction)

    def _iter_annotations():
        # Annotation files are loaded lazily, one at a time, in listed order.
        for anno_file in FLAGS.anno_files_path.split(':'):
            for entry in np.load(anno_file).tolist():
                yield entry

    first, last, stride = tuple(int(tok) for tok in FLAGS.eval_stat.split())
    for ckpt_idx in range(first, last, stride):
        model_base = os.path.join(FLAGS.model_root, FLAGS.model_name)
        ckpt_path = os.path.join(
            model_base, 'variables', 'model_%d.ckpt' % ckpt_idx)

        # Block until the training job has written this checkpoint.
        while True:
            if os.path.exists(ckpt_path + ".meta"):
                break
            logger.warning('Cannot load model file, sleep 1 hour to retry')
            time.sleep(3600)

        decoder.load_model(ckpt_path)

        captions = []
        for decoded_count, entry in enumerate(_iter_annotations(), 1):
            stem = entry['file_name'].split('.')[0]
            features = np.loadtxt(
                os.path.join(FLAGS.vf_dir, entry['file_path'], stem + '.txt'))
            candidates = decoder.decode(features, FLAGS.beam_size)

            # Only the first returned candidate is kept for evaluation.
            captions.append({
                'image_id': entry['id'],
                'caption': ' '.join(candidates[0]['words']),
            })

            if decoded_count % 100 == 0:
                logger.info('%d images are decoded' % decoded_count)

        out_dir = os.path.join(model_base, 'decode_val_result')
        generated_json = os.path.join(
            out_dir, 'generated_%d.json' % ckpt_idx)
        score_txt = os.path.join(out_dir, 'result_%d.txt' % ckpt_idx)

        utils.create_dir_if_not_exists(os.path.dirname(generated_json))
        with open(generated_json, 'w') as sink:
            json.dump(captions, sink)
        utils.coco_val_eval(generated_json, score_txt)
def main(_):
    """Train the mRNN model over every configured sequence-length bucket.

    Loads ``./model_conf/<FLAGS.model_name>.py``, builds one ``mRNNModel``
    per entry of ``config.buckets`` inside a shared ``"mRNNmodel"`` variable
    scope, then either restores ``FLAGS.pre_trained_model_path`` or
    initializes all variables, runs ``config.num_epoch`` epochs through
    ``run_epoch`` and finally saves a checkpoint named after the number of
    iterations done.

    Args:
        _: Unused positional argument.
    """
    # Load model configuration
    cu = CommonUtiler()
    config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
    config = cu.load_config(config_path)

    # Start model training
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            intra_op_parallelism_threads=FLAGS.ses_threads)) as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        assert len(config.buckets) >= 1
        assert config.buckets[-1] == config.max_num_steps

        models = []
        # The first bucket creates the variables (reuse=None) and is the only
        # model built with flag_with_saver=True; its saver is used below.
        with tf.variable_scope("mRNNmodel", reuse=None,
                               initializer=initializer):
            m = mRNNModel(is_training=True,
                          num_steps=config.buckets[0],
                          config=config,
                          model_name=FLAGS.model_name,
                          flag_with_saver=True,
                          model_root=FLAGS.model_root)
            models.append(m)

        # Remaining buckets reuse the variables created above.
        with tf.variable_scope("mRNNmodel", reuse=True):
            for bucket in config.buckets[1:]:
                m = mRNNModel(is_training=True,
                              num_steps=bucket,
                              config=config,
                              model_name=FLAGS.model_name,
                              model_root=FLAGS.model_root)
                models.append(m)

        # `m` is the last model built; its directories are used for the log
        # file and the final checkpoint.
        hdlr = logging.FileHandler(os.path.join(m.model_dir, 'log.txt'))
        hdlr.setLevel(logging.INFO)
        hdlr.setFormatter(logging.Formatter(formatter_log))
        logger.addHandler(hdlr)

        if FLAGS.pre_trained_model_path:
            models[0].saver.restore(session, FLAGS.pre_trained_model_path)
            logger.info('Continue to train from %s',
                        FLAGS.pre_trained_model_path)
        else:
            # tf.global_variables_initializer() only *builds* the init op; it
            # must be executed in the session or the variables remain
            # uninitialized when training starts.
            session.run(tf.global_variables_initializer())

        iters_done = 0
        data_provider = mRNNCocoBucketDataProvider(
            FLAGS.anno_files_path.split(':'), FLAGS.vocab_path,
            config.vocab_size, FLAGS.vf_dir, config.vf_size)
        for i in range(config.num_epoch):
            train_cost, iters_done = run_epoch(
                session, iters_done, config, models, data_provider,
                verbose=True)
            logger.info("Train cost for epoch %d is %.3f", i, train_cost)

        # Save final copy of the model
        models[0].saver.save(
            session,
            os.path.join(m.variable_dir, 'model_%d.ckpt' % iters_done))
pylab.rcParams['figure.figsize'] = (6.0, 4.0) #sys.path.append('./py_lib/') from py_lib.common_utils import CommonUtiler from py_lib.tf_mrnn_decoder import mRNNDecoder from py_lib.vision import ImageFeatureExtractor # set up paths mrnn_model_path = './trained_models/coco_caption/mrnn_GRU_570K.ckpt' mrnn_config_path = './model_conf/mrnn_GRU_conf.py' mrnn_vocab_path = './trained_models/coco_caption/mscoco_mc3_vocab' img_model_path = './external/tf_cnn_models/inception_v3.pb' # initilize feature extractor and sentence decoder cu = CommonUtiler() config = cu.load_config(mrnn_config_path) ife = ImageFeatureExtractor(img_model_path) decoder = mRNNDecoder(config, 'demo', mrnn_vocab_path) demo_image_path = 'demo_image.jpg' beam_size = 3 # extract visual feature for the image visual_features = ife.extract_features(demo_image_path, flag_from_file=True) # generate sentences decoder.load_model(mrnn_model_path) sentences = decoder.decode(visual_features, beam_size) # Visualize the result print('Top generated sentences and their log-likelihood:') for (ind_s, sentence) in enumerate(sentences): print(' %d (%.2f): %s' %