Example #1
def update_output(n_clicks, value):
    vocab_data = data_loader.get_question_answer_vocab("2")
    qvocab = vocab_data['question_vocab']
    q_map = { vocab_data['question_vocab'][qw] : qw for qw in vocab_data['question_vocab']}
    print('filename::',filen)
    fc7_features = utils.extract_fc7_features(filen, 'Data/vgg16.tfmodel')
    model_options = {
		'num_lstm_layers' : 2,
		'rnn_size' : 512,
		'embedding_size' : 512,
		'word_emb_dropout' : 0.5,
		'image_dropout' : 0.5,
		'fc7_feature_length' : 4096,
		'lstm_steps' : vocab_data['max_question_length'] + 1,
		'q_vocab_size' : len(vocab_data['question_vocab']),
		'ans_vocab_size' : len(vocab_data['answer_vocab'])
	}
    
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']), dtype = 'int32')
    print('qst',value)
    question_words = re.findall(word_regex, value)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[ question_words[i] ]
        else:
            question_ids[0][base + i] = question_vocab['UNK']
    ans_map = { vocab_data['answer_vocab'][ans] : ans for ans in vocab_data['answer_vocab']}
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, 'Data/Models/modelnew99.ckpt')
    pred, answer_probab = sess.run([t_prediction, t_ans_probab], feed_dict={
        input_tensors['fc7']:fc7_features,
        input_tensors['sentence']:question_ids,
    })
    print("answerprediction",pred[0])
    #model.summary()
    #plot_model(model,to_file='predictmodel.png')
    print ("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print ("Top Answers")
    for i in range(1):
        print (ans_map[ answer_probab_tuples[0][1] ])
        #ans=(ans_map[answer_probab_tuples[i][1] ])
        lang = "en"
        text="This is a "+ans_map[ answer_probab_tuples[0][1] ]
        speech = Speech(text, lang)

        sox_effects = ("speed", "0.8")
        speech.play(sox_effects)
        
    return ans_map[answer_probab_tuples[0][1]]
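Example #1 above is a Dash callback excerpted from a larger module: data_loader, utils, vis_lstm_model and the global filen (the uploaded image path) are assumed to be defined at module level, and the Speech / sox_effects usage appears to match the google_speech package's API. A sketch of the imports such a module would need:

# imports assumed by Example #1 (a sketch; filen is expected to be set elsewhere, e.g. by an upload callback)
import re
import numpy as np
import tensorflow as tf
from google_speech import Speech  # Speech(text, lang).play(sox_effects) -- google_speech API (assumption)

import data_loader
import utils
import vis_lstm_model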
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path',
                        type=str,
                        default='Data/cat.jpeg',
                        help='Image Path')
    parser.add_argument('--model_path',
                        type=str,
                        default='Data/Models/model2.ckpt',
                        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size'),
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='Which animal is this?',
                        help='Question')

    args = parser.parse_args()

    print("Image:", args.image_path)
    print("Question:", args.question)

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    fc7_features = utils.extract_fc7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })

    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
Example #3
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--num_lstm_layers', type=int, default=2,
                       help='num_lstm_layers')
	parser.add_argument('--fc7_feature_length', type=int, default=4096,
                       help='fc7_feature_length')
	parser.add_argument('--rnn_size', type=int, default=512,
                       help='rnn_size')
	parser.add_argument('--embedding_size', type=int, default=512,
                       help='embedding_size'),
	parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                       help='word_emb_dropout')
	parser.add_argument('--image_dropout', type=float, default=0.5,
                       help='image_dropout')
	parser.add_argument('--data_dir', type=str, default='Data',
                       help='Data directory')
	parser.add_argument('--batch_size', type=int, default=200,
                       help='Batch Size')
	parser.add_argument('--learning_rate', type=float, default=0.001,
                       help='Learning Rate')
	parser.add_argument('--epochs', type=int, default=200,
                       help='Epochs')
	parser.add_argument('--debug', type=bool, default=False,
                       help='Debug')
	parser.add_argument('--model_path', type=str, default = 'Data/Models/model21.ckpt',
                       help='Model Path')

	args = parser.parse_args()
	print "Reading QA DATA"
	qa_data = data_loader.load_questions_answers(args)
	
	print "Reading fc7 features"
	fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'val')
	print "FC7 features", fc7_features.shape
	print "image_id_list", image_id_list.shape

	image_id_map = {}
	for i in xrange(len(image_id_list)):
		image_id_map[ image_id_list[i] ] = i

	ans_map = { qa_data['answer_vocab'][ans] : ans for ans in qa_data['answer_vocab']}

	model_options = {
		'num_lstm_layers' : args.num_lstm_layers,
		'rnn_size' : args.rnn_size,
		'embedding_size' : args.embedding_size,
		'word_emb_dropout' : args.word_emb_dropout,
		'image_dropout' : args.image_dropout,
		'fc7_feature_length' : args.fc7_feature_length,
		'lstm_steps' : qa_data['max_question_length'] + 1,
		'q_vocab_size' : len(qa_data['question_vocab']),
		'ans_vocab_size' : len(qa_data['answer_vocab'])
	}
	
	
	
	model = vis_lstm_model.Vis_lstm_model(model_options)
	input_tensors, t_prediction, t_ans_probab = model.build_generator()
	sess = tf.InteractiveSession()
	saver = tf.train.Saver()

	avg_accuracy = 0.0
	total = 0
	saver.restore(sess, args.model_path)
	
	batch_no = 0
	while (batch_no*args.batch_size) < len(qa_data['validation']):
		sentence, answer, fc7 = get_batch(batch_no, args.batch_size, 
			fc7_features, image_id_map, qa_data, 'val')
		
		pred, ans_prob = sess.run([t_prediction, t_ans_probab], feed_dict={
            input_tensors['fc7']:fc7,
            input_tensors['sentence']:sentence,
        })
		
		batch_no += 1
		if args.debug:
			for idx, p in enumerate(pred):
				print ans_map[p], ans_map[ np.argmax(answer[idx])]

		correct_predictions = np.equal(pred, np.argmax(answer, 1))
		correct_predictions = correct_predictions.astype('float32')
		accuracy = correct_predictions.mean()
		print "Acc", accuracy
		avg_accuracy += accuracy
		total += 1
	
	print "Acc", avg_accuracy/total
Example #4
def main():
	parser = argparse.ArgumentParser()  # argparse is Python's built-in module for parsing command-line options and arguments:
					    # declare the parameters the program needs and argparse will parse them out of sys.argv, generating help and usage messages automatically
	parser.add_argument('--num_lstm_layers', type=int, default=2,
                       help='num_lstm_layers')
	parser.add_argument('--fc7_feature_length', type=int, default=4096,
                       help='fc7_feature_length')
	parser.add_argument('--rnn_size', type=int, default=512,
                       help='rnn_size')
	parser.add_argument('--embedding_size', type=int, default=512,
                       help='embedding_size'),
	parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                       help='word_emb_dropout')
	parser.add_argument('--image_dropout', type=float, default=0.5,
                       help='image_dropout')
	parser.add_argument('--data_dir', type=str, default='Data',
                       help='Data directory')
	parser.add_argument('--batch_size', type=int, default=200,
                       help='Batch Size')
	parser.add_argument('--learning_rate', type=float, default=0.001,
                       help='Learning Rate')
	parser.add_argument('--epochs', type=int, default=200,
                       help='Epochs')
	parser.add_argument('--debug', type=bool, default=False,
                       help='Debug')
	parser.add_argument('--resume_model', type=str, default=None,
                       help='Trained Model Path')
	parser.add_argument('--version', type=int, default=2,
                       help='VQA data version')

	args = parser.parse_args() # Parse the command-line arguments defined above.
	print("Reading QA DATA") 
	# What kind of data is stored here? The structure returned by this function is exactly the structure of qa_data.
	qa_data = data_loader.load_questions_answers(args.version, args.data_dir)
	
	print("Reading fc7 features")
	
	# data_loader below extracts the features and image ids, but the loader has not gone through training yet; how were these obtained?
	# What does the image_id_list returned by data_loader look like?
	fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'train')
	print("FC7 features", fc7_features.shape)
	print("image_id_list", image_id_list.shape)

	image_id_map = {}  # maps each image_id to its index number; stored as a dict
	for i in range(len(image_id_list)):
		image_id_map[ image_id_list[i] ] = i
	
	# Why do we need a dictionary like ans_map?
	# What is ans here, and why does qa_data['answer_vocab'][ans] have this structure?
	ans_map = { qa_data['answer_vocab'][ans] : ans for ans in qa_data['answer_vocab']}
	
	# The dict below holds the parameters used to configure the TensorFlow model.
	model_options = {
		'num_lstm_layers' : args.num_lstm_layers,
		'rnn_size' : args.rnn_size,
		'embedding_size' : args.embedding_size,
		'word_emb_dropout' : args.word_emb_dropout,
		'image_dropout' : args.image_dropout,
		'fc7_feature_length' : args.fc7_feature_length,
		'lstm_steps' : qa_data['max_question_length'] + 1,
		'q_vocab_size' : len(qa_data['question_vocab']),
		'ans_vocab_size' : len(qa_data['answer_vocab'])
	}
	
	
	# The next few lines initialize and run the TensorFlow model.
	model = vis_lstm_model.Vis_lstm_model(model_options)  # construct the model
	input_tensors, t_loss, t_accuracy, t_p = model.build_model() # Build the LSTM model graph and get the loss, accuracy and prediction tensors
	train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss) # Minimize the loss with the Adam optimizer
	sess = tf.InteractiveSession() # Start an interactive TensorFlow session (it installs itself as the default session)
	tf.initialize_all_variables().run()

	# If resuming from a previous run, restore the saved model before continuing training.
	saver = tf.train.Saver()
	if args.resume_model:
		saver.restore(sess, args.resume_model)

	for i in range(args.epochs):
		batch_no = 0

		while (batch_no*args.batch_size) < len(qa_data['training']): # batch_no*args.batch_size = number of training examples processed so far
			# Get the next batch from the training set.
			sentence, answer, fc7 = get_training_batch(batch_no, args.batch_size, fc7_features, image_id_map, qa_data, 'train')
			_, loss_value, accuracy, pred = sess.run([train_op, t_loss, t_accuracy, t_p], 
				feed_dict={
					input_tensors['fc7']:fc7,
					input_tensors['sentence']:sentence,
					input_tensors['answer']:answer
				}
			)  # Run one training step and fetch the loss, accuracy and predictions.
			batch_no += 1
			if args.debug:
				for idx, p in enumerate(pred):
					print(ans_map[p], ans_map[ np.argmax(answer[idx])])

				print("Loss", loss_value, batch_no, i)
				print("Accuracy", accuracy)
				print("---------------")
			else:
				print("Loss", loss_value, batch_no, i)
				print("Training Accuracy", accuracy)
			
		save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i))
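Example #4 above (like the other training scripts here) calls get_training_batch(...), which is defined elsewhere in the project. A minimal sketch of what such a helper might look like, assuming each entry of qa_data['training'] / qa_data['validation'] carries hypothetical 'question' (padded word-id vector), 'answer' (answer-vocab index) and 'image_id' fields, and that answers come back one-hot (the debug loops apply np.argmax to them):

import numpy as np

def get_training_batch(batch_no, batch_size, fc7_features, image_id_map, qa_data, split):
    # field names 'question', 'answer', 'image_id' are assumptions; the real loader may differ
    examples = qa_data['training'] if split == 'train' else qa_data['validation']
    batch = examples[batch_no * batch_size:(batch_no + 1) * batch_size]

    sentence = np.zeros((len(batch), qa_data['max_question_length']), dtype='int32')
    answer = np.zeros((len(batch), len(qa_data['answer_vocab'])), dtype='float32')
    fc7 = np.zeros((len(batch), fc7_features.shape[1]), dtype='float32')

    for i, example in enumerate(batch):
        sentence[i, :] = example['question']                       # padded question word ids
        answer[i, example['answer']] = 1.0                         # one-hot answer vector
        fc7[i, :] = fc7_features[image_id_map[example['image_id']]]
    return sentence, answer, fc7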
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size'),
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--batch_size',
                        type=int,
                        default=200,
                        help='Batch Size')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=200, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model',
                        type=str,
                        default=None,
                        help='Trained Model Path')

    args = parser.parse_args()
    print "Reading QA DATA"
    qa_data = data_loader.load_questions_answers(args)

    print "Reading fc7 features"
    fc7_features, image_id_list = data_loader.load_fc7_features(
        args.data_dir, 'train')
    print "FC7 features", fc7_features.shape
    print "image_id_list", image_id_list.shape

    image_id_map = {}
    for i in xrange(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    ans_map = {
        qa_data['answer_vocab'][ans]: ans
        for ans in qa_data['answer_vocab']
    }

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    for i in xrange(args.epochs):
        batch_no = 0

        while (batch_no * args.batch_size) < len(qa_data['training']):
            sentence, answer, fc7 = get_training_batch(batch_no,
                                                       args.batch_size,
                                                       fc7_features,
                                                       image_id_map, qa_data,
                                                       'train')
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print ans_map[p], ans_map[np.argmax(answer[idx])]

                print "Loss", loss_value, batch_no, i
                print "Accuracy", accuracy
                print "---------------"
            else:
                print "Loss", loss_value, batch_no, i
                print "Training Accuracy", accuracy

        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i))
Example #6
def main():
    config = json.load(open('config.json'))

    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512,
                        help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512,
                        help='embedding_size'),
    parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5,
                        help='image_dropout')
    parser.add_argument('--qa_dir', type=str, default=config['qa_dir'],
                        help='QA Data directory')
    parser.add_argument('--data_dir', type=str, default=config['data_dir'],
                        help='Common Data directory')
    parser.add_argument('--batch_size', type=int, default=200,
                        help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=2,
                        help='Epochs')
    parser.add_argument('--debug', type=bool, default=False,
                        help='Debug')
    parser.add_argument('--resume_model', type=str, default=None,
                        help='Trained Model Path')
    parser.add_argument('--version', type=int, default=1,
                        help='VQA data version')

    args = parser.parse_args()
    print("Reading QA DATA")
    qa_data = data_loader.load_questions_answers(args.qa_dir)

    print("Reading fc7 features")
    fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    ans_map = {qa_data['answer_vocab'][ans]: ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)
    sess = tf.InteractiveSession()
    # tf.initialize_all_variables().run()  # tf.initialize_all_variables() is deprecated since 2017-03-02
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)

    acc_file = open('train_acc.txt', 'w', encoding='utf-8')
    acc_file.write('epoch   avg_acc\n')
    for i in range(args.epochs):
        batch_no = 0
        epochs_acc_sum = 0
        while (batch_no * args.batch_size) < len(qa_data['training']):
            sentence, answer, fc7 = get_training_batch(batch_no, args.batch_size, fc7_features, image_id_map, qa_data,
                                                       'train')
            _, loss_value, accuracy, pred = sess.run([train_op, t_loss, t_accuracy, t_p],
                                                     feed_dict={
                                                         input_tensors['fc7']: fc7,
                                                         input_tensors['sentence']: sentence,
                                                         input_tensors['answer']: answer
                                                     }
                                                     )
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])

                print("Loss", loss_value, batch_no, i)
                print("Accuracy", accuracy)
                print("---------------")
                epochs_acc_sum += accuracy
            else:
                print("Loss", loss_value, batch_no, i)
                print("Training Accuracy", accuracy)
                epochs_acc_sum += accuracy
        acc_file.write(str(i) + '   ' + str(epochs_acc_sum/batch_no) + '\n')
        print()
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i))

    acc_file.close()
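Example #6 above reads config.json before parsing its arguments, and only the qa_dir and data_dir keys are used. A sketch of a matching file; the paths are placeholders, not taken from the original project:

# write a minimal config.json with the two keys Example #6 reads (placeholder paths)
import json
with open('config.json', 'w') as f:
    json.dump({'qa_dir': 'Data', 'data_dir': 'Data'}, f)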
Example #7
def serve(img_features, question):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path',
                        type=str,
                        default='Data/model2.ckpt',
                        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size'),
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default=question,
                        help='Question')

    args = parser.parse_args()

    tf.reset_default_graph()

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()

    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: img_features,
            input_tensors['sentence']: question_ids,
        })

    sess.close()

    ans_list = []
    ans_list.append(ans_map[pred[0]])

    return ans_list
Example #8
File: train.py  Project: ssiwakot/PathVQA
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                       help='num_lstm_layers')
    parser.add_argument('--cnn7_feature_length', type=int, default=512,
                       help='cnn7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512,
                       help='rnn_size')
    parser.add_argument('--embedding_size', type=int),
    parser.add_argument('--word_emb_dropout', type=float)
    parser.add_argument('--image_dropout', type=float)
    parser.add_argument('--data_dir', type=str)
    parser.add_argument('--batch_size', type=int, default=100,
                       help='Batch Size')
    parser.add_argument('--learning_rate', type=float, default=0.1,
                       help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=400,
                       help='Epochs')
    parser.add_argument('--debug', type=bool, default=False,
                       help='Debug')
    parser.add_argument('--resume_model', type=str, default=None,
                       help='Trained Model Path')
    parser.add_argument('--version', type=int, default=2,
                       help='VQA data version')

    args = parser.parse_args()
    print("Reading QA DATA")
    qa_data = data_loader.load_questions_answers(args.version, args.data_dir)
    
    print("Reading cnn7 features")
    cnn7_features, image_id_list = data_loader.load_cnn7_features(args.data_dir, 'train')
    print("cnn7 features", cnn7_features.shape)
    print("image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[ image_id_list[i] ] = i

    ans_map = { qa_data['answer_vocab'][ans] : ans for ans in qa_data['answer_vocab']}

    model_options = {
        'num_lstm_layers' : args.num_lstm_layers,
        'rnn_size' : args.rnn_size,
        'embedding_size' : args.embedding_size,
        'word_emb_dropout' : args.word_emb_dropout,
        'image_dropout' : args.image_dropout,
        'cnn7_feature_length' : args.cnn7_feature_length,
        'lstm_steps' : qa_data['max_question_length'] + 1,
        'q_vocab_size' : len(qa_data['question_vocab']),
        'ans_vocab_size' : len(qa_data['answer_vocab'])
    }
    
    
    
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(t_loss)
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run()

    saver = tf.train.Saver()
    if args.resume_model:
        saver.restore(sess, args.resume_model)
        last_epoch = int(args.resume_model[-7:-5])
        print(f'I resume Epoch {last_epoch}')
    else:
        last_epoch = int(-1)
        
    for i in range(args.epochs):
        batch_no = 0
        batch_acc_record = []

        while batch_no < 220:
            start = time.perf_counter()  # time.clock() was removed in Python 3.8

            sentence, answer, cnn7 = get_training_batch(batch_no, args.batch_size, cnn7_features, image_id_map, qa_data, 'train')
            _, loss_value, accuracy, pred = sess.run([train_op, t_loss, t_accuracy, t_p], 
                feed_dict={
                    input_tensors['cnn7']:cnn7,
                    input_tensors['sentence']:sentence,
                    input_tensors['answer']:answer
                }
            )
            batch_acc_record.append(accuracy)
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[ np.argmax(answer[idx])])

                print("Loss", loss_value, batch_no, i + 1 + last_epoch)
                print("Accuracy", accuracy)
                print("---------------")
            else:
                print("Loss", loss_value, batch_no, i + 1 + last_epoch)
                print("Training Accuracy", accuracy)

            end = time.perf_counter()
            print("Time for one batch", end - start)
            print("Hours For one epoch" , (291 * 1.0)*(end - start)/60.0/60.0)
            
        save_path = saver.save(sess, "Data/Models/model{}.ckpt".format(i + 1 + last_epoch))

        if np.mean(batch_acc_record)>=0.9:
            break
            
    sess.close()
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path',
                        type=str,
                        default=MODEL_PATH,
                        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--feature_length',
                        type=int,
                        default=4096,
                        help='feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size'),
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=1.0,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=1.0,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default=DATA_PATH,
                        help='Data directory')
    parser.add_argument('--image_features',
                        type=str,
                        default='vgg16',
                        help='Image features')
    args = parser.parse_args()

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'feature_length': args.feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')

    print("Reading QA DATA")
    test_data = data_loader.load_test_questions()
    print(len(test_data['question_vocab']))
    features, image_id_list = data_loader.load_features(
        args.data_dir, 'test', args.image_features)

    print("Features", features.shape)
    print("Image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    stop_vocab = ['a', 'an', 'the']

    for i, now_image in enumerate(test_data['testing']):
        now_image_path = 'Data/test2015/COCO_test2015_%.12d.jpg' % (
            now_image['image_id'])
        img = Image.open(now_image_path)
        img.show()
        question_ids = np.zeros((1, vocab_data['max_question_length']),
                                dtype='int32')

        print('Question:', now_image['question'])
        question_words = re.findall(word_regex, now_image['question'])
        question_words = list(
            filter(lambda x: data_loader.vocab_handle(x) not in stop_vocab,
                   question_words))
        base = vocab_data['max_question_length'] - len(question_words)
        for j in range(0, len(question_words)):
            now_question_words = data_loader.vocab_handle(question_words[j])
            if now_question_words in question_vocab:
                question_ids[0][base + j] = question_vocab[now_question_words]
            else:
                question_ids[0][base + j] = question_vocab['UNK']

        now_index = image_id_map[test_data['testing'][i]['image_id']]

        pred, answer_probab = sess.run(
            [t_prediction, t_ans_probab],
            feed_dict={
                input_tensors['features']:
                features[now_index].reshape(1, args.feature_length),
                input_tensors['sentence']:
                question_ids,
            })

        print("Ans:", ans_map[pred[0]])
        answer_probab_tuples = [(-answer_probab[0][idx], idx)
                                for idx in range(len(answer_probab[0]))]
        answer_probab_tuples.sort()
        print("Top Answers:")
        for k in range(5):
            print(ans_map[answer_probab_tuples[k][1]])
        input()
    sess.close()
Example #10
def main():

	print("Reading QA DATA")
	qa_data = utils.load_questions_answers(FLAGS.data_dir)                                                           
	
	print("Reading image features")
	img_features, image_id_list = utils.load_image_features(FLAGS.data_dir, "train")
	print("img features", img_features.shape)
	print("image_id_list", image_id_list.shape)

	image_id_map = {}
	for i in range(len(image_id_list)):
		image_id_map[ image_id_list[i] ] = i
	
	ans_map = { qa_data['answer_vocab'][ans] : ans for ans in qa_data['answer_vocab']}

	model_options = {
		"num_lstm_layers" : FLAGS.num_lstm_layers,
		"rnn_size" : FLAGS.rnn_size,
		"embedding_size" : FLAGS.que_feat_len,
		"word_emb_dropout" : FLAGS.word_dropout,
		"image_dropout" : FLAGS.img_dropout,
		"img_feature_length" : FLAGS.img_feat_len,
		"lstm_steps" : qa_data["max_question_length"] + 1,
		"q_vocab_size" : len(qa_data["question_vocab"]),
		"ans_vocab_size" : len(qa_data["answer_vocab"])
	}
	
	
	
	model = vis_lstm_model.Vis_lstm_model(model_options)
	input_tensors, t_loss, t_accuracy, t_p = model.build_model()
	train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(t_loss)
	sess = tf.InteractiveSession()
	tf.initialize_all_variables().run()

	
	saver = tf.train.Saver()
	if FLAGS.checkpoint_path:
		saver.restore(sess, tf.train.latest_checkpoint(FLAGS.checkpoint_path))

	for i in range(FLAGS.epochs):
		batch_no = 0

		while (batch_no*FLAGS.batch_size) < len(qa_data["training"]):
			sentence, answer, img = get_training_batch(batch_no, FLAGS.batch_size, img_features, image_id_map, qa_data, "train")
			_, loss_value, accuracy, pred = sess.run([train_op, t_loss, t_accuracy, t_p], 
				feed_dict={
					input_tensors["img"]:img,
					input_tensors["sentence"]:sentence,
					input_tensors["answer"]:answer
				}
			)
			batch_no += 1
			if FLAGS.debug:
				for idx, p in enumerate(pred):
					print(ans_map[p], ans_map[ np.argmax(answer[idx])])

				print("Loss", loss_value, batch_no, i)
				print("Accuracy", accuracy)
				print("---------------")
			else:
				print("Loss", loss_value, batch_no, i)
				print("Training Accuracy", accuracy)
			
		save_path = saver.save(sess, "./data/pretrain/model/model{}.ckpt".format(i))
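Example #10 above relies on tf.app.flags definitions made elsewhere in its module. A sketch of the flags it reads; the feature-size and dropout defaults are borrowed from the commented-out block in Example #11, while the learning_rate, batch_size, epochs and debug defaults are placeholders:

import tensorflow as tf

tf.app.flags.DEFINE_string("data_dir", "./data", "directory of data")
tf.app.flags.DEFINE_string("checkpoint_path", None, "checkpoint directory to resume from")
tf.app.flags.DEFINE_integer("num_lstm_layers", 2, "number of lstm layers")
tf.app.flags.DEFINE_integer("rnn_size", 300, "size of rnn")
tf.app.flags.DEFINE_integer("que_feat_len", 300, "length of question feature vector")
tf.app.flags.DEFINE_integer("img_feat_len", 1001, "length of image feature vector")
tf.app.flags.DEFINE_float("word_dropout", 0.5, "dropout rate of word nodes")
tf.app.flags.DEFINE_float("img_dropout", 0.5, "dropout rate of image nodes")
tf.app.flags.DEFINE_float("learning_rate", 0.001, "learning rate")            # placeholder default
tf.app.flags.DEFINE_integer("batch_size", 200, "batch size")                  # placeholder default
tf.app.flags.DEFINE_integer("epochs", 200, "number of epochs")                # placeholder default
tf.app.flags.DEFINE_boolean("debug", False, "print per-example predictions")  # placeholder default
FLAGS = tf.app.flags.FLAGS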
Example #11
def main(image_path="test.jpg", question="what is in the image?"):

    slim = tf.contrib.slim
    resnet = nets.resnet_v2
	
    """
    tf.app.flags.DEFINE_string("image_path", image_path, "directory of image")
	
    tf.app.flags.DEFINE_string("question", question, "question")

    tf.app.flags.DEFINE_string("img_checkpoint_path", "./data/pretrain/resnet152/resnet_v2_152.ckpt",
                               "directory of checkpoint files for image feature extraction")

    tf.app.flags.DEFINE_string("checkpoint_path", "./data/pretrain/model",
                               "directory of checkpoint files for overall model")

    tf.app.flags.DEFINE_integer("num_lstm_layers", 2, "number of lstm layers")

    tf.app.flags.DEFINE_integer(
        "img_feat_len", 1001, "length of image feature vector")

    tf.app.flags.DEFINE_integer("rnn_size", 300, "size of rnn")

    tf.app.flags.DEFINE_integer(
        "que_feat_len", 300, "length of question feature vector")

    tf.app.flags.DEFINE_float("word_dropout", 0.5, "dropout rate of word nodes")

    tf.app.flags.DEFINE_float("img_dropout", 0.5, "dropout rate of image nodes")

    tf.app.flags.DEFINE_string("data_dir", "./data", "directory of data")
	
    FLAGS = tf.app.flags.FLAGS
    print ("Image:", FLAGS.image_path)
    print ("Question:", FLAGS.question)
    """
	
    #FLAGS = object()
    flags_image_path = image_path
    flags_question = question
    flags_img_checkpoint_path = "./data/pretrain/resnet152/resnet_v2_152.ckpt"
    flags_checkpoint_path = "./data/pretrain/model"
    flags_num_lstm_layers = 2
    flags_img_feat_len = 1001
    flags_rnn_size = 300
    flags_que_feat_len = 300
    flags_word_dropout = 0.5
    flags_img_dropout = 0.5
    flags_data_dir = "./data"
	
    vocab_data = utils.get_question_answer_vocab(flags_data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {vocab_data['question_vocab'][qw]
        : qw for qw in vocab_data['question_vocab']}

    with tf.Graph().as_default():
        images = tf.placeholder("float32", [None, 224, 224, 3])
        with slim.arg_scope(resnet.resnet_arg_scope()):
            net, _ = resnet.resnet_v2_152(images, 1001, is_training=False)
        restorer = tf.train.Saver()

        with tf.Session() as sess:#config=tf.ConfigProto(log_device_placement=True)) as sess:
            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            image_array = utils.load_image_array(flags_image_path)
            image_feed = np.ndarray((1, 224, 224, 3))
            image_feed[0:, :, :] = image_array

            # checkpoint = tf.train.latest_checkpoint(flags_img_checkpoint_path)
            checkpoint = flags_img_checkpoint_path
            restorer.restore(sess, checkpoint)
            print("Image Model loaded")
            feed_dict = {images: image_feed}
            img_feature = sess.run(net, feed_dict=feed_dict)
            img_feature = np.squeeze(img_feature)
            end = time.perf_counter()
            print("Time elapsed", end - start)
            print("Image processed")

    model_options = {
        'num_lstm_layers': flags_num_lstm_layers,
        'rnn_size': flags_rnn_size,
        'embedding_size': flags_que_feat_len,
        'word_emb_dropout': flags_word_dropout,
        'image_dropout': flags_img_dropout,
        'img_feature_length': flags_img_feat_len,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros(
        (1, vocab_data['max_question_length']), dtype='int32')
    question_words = re.findall(word_regex, flags_question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {vocab_data['answer_vocab'][ans]
        : ans for ans in vocab_data['answer_vocab']}

    with tf.Graph().as_default():
        model = vis_lstm_model.Vis_lstm_model(model_options)
        input_tensors, t_prediction, t_ans_probab = model.build_generator()
        restorer = tf.train.Saver()
        with tf.Session() as sess:#config=tf.ConfigProto(log_device_placement=True)) as sess:
            checkpoint = tf.train.latest_checkpoint(flags_checkpoint_path)
            restorer.restore(sess, checkpoint)
            pred, answer_probab = sess.run([t_prediction, t_ans_probab], feed_dict={
                input_tensors['img']: np.reshape(img_feature, [1,1001]),
                input_tensors['sentence']: question_ids,
            })

    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
    
    return (ans_map, answer_probab_tuples)
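Example #11 returns (ans_map, answer_probab_tuples), so a caller can recover the ranked answers the same way the loop above does; a short usage sketch:

# rank answers from Example #11's return value (tuples are sorted by descending probability)
ans_map, answer_probab_tuples = main("test.jpg", "what is in the image?")
top_answers = [ans_map[idx] for _, idx in answer_probab_tuples[:5]]
print(top_answers)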
Example #12
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path',
        type=str,
        default='Data/train_2014/COCO_train2014_000000581922.jpg',
        help='Image Path')
    parser.add_argument(
        '--model_path',
        type=str,
        default='Data/train2014/Tri Training 3/Models/model49.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size'),
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data/train2014/Tri Training 3/',
                        help='Data directory')
    parser.add_argument('--batch_size',
                        type=int,
                        default=200,
                        help='Batch Size')
    parser.add_argument('--question',
                        type=str,
                        default='What is this product?',
                        help='Question')

    args = parser.parse_args()
    #vizwiz_file_path = 'Data/Test'
    solution = dict()
    solution["model"] = "model_3"
    solution["predictions"] = []
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    vizwiz_questions_path = 'VizWiz_to_VQA_Questions.json'
    with open(vizwiz_questions_path, 'r') as input_file:
        vizwiz_questions = json.loads(input_file.read())
    '''with open('questions_temp.txt','w') as temp_file:
        for i in range(20000):
            #print(vizwiz_questions['questions'][i]['question']
            temp_file.write(vizwiz_questions['questions'][i]['question'])
            temp_file.write('\n')'''

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    #question_ids = np.zeros((20000, vocab_data['max_question_length']), dtype = 'int32')
    #fc7_features = np.zeros((2, args.fc7_feature_length))

    print("Reading fc7 features")
    fc7_features, image_id_list = data_loader.load_fc7_features(
        'Data/', 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)
    #print(0/0)
    '''i=0
    for file in os.listdir(vizwiz_file_path):
        if file.endswith(".jpg"):
            args.image_path = join(vizwiz_file_path,file)
            #args.question = vizwiz_questions['questions'][i]['question']
            print("Image:", args.image_path)
            print("Question:", args.question)
            fc7_features[i] = utils.extract_fc7_features(args.image_path, 'Data/vgg16-20160129.tfmodel')
            i += 1'''

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    #question_words = re.findall(word_regex, args.question)
    #base = vocab_data['max_question_length'] - len(question_words)
    '''for no_questions in range(question_ids.shape[0]):
        for i in range(0, len(question_words)):
            if question_words[i] in question_vocab:
                question_ids[no_questions][base + i] = question_vocab[ question_words[i] ]
            else:
                question_ids[no_questions][base + i] = question_vocab['UNK']'''

    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()

    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    batch_no = 0
    with open('result3.txt', 'w') as output_file:

        while (batch_no * args.batch_size) < 20000:
            question_ids = np.zeros(
                (args.batch_size, vocab_data['max_question_length']),
                dtype='int32')
            #vizwiz_questions['questions'][i]['question']
            for no_questions in range(question_ids.shape[0]):
                question_formatted = vizwiz_questions['questions'][
                    batch_no * args.batch_size + no_questions]['question']
                question_list = question_formatted.split()
                question_list = question_list[0:20]
                question_formatted = ' '.join(question_list)
                question_words = re.findall(word_regex, question_formatted)
                base = vocab_data['max_question_length'] - len(question_words)
                for i in range(0, len(question_words)):
                    if question_words[i] in question_vocab:
                        question_ids[no_questions][base + i] = question_vocab[
                            question_words[i]]
                    else:
                        question_ids[no_questions][base +
                                                   i] = question_vocab['UNK']

            fc7 = get_batch(batch_no, args.batch_size, fc7_features)
            pred, ans_prob = sess.run(
                [t_prediction, t_ans_probab],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: question_ids,
                })

            for i in range(len(pred)):
                current_prediction = dict()
                current_prediction["image_id"] = "VizWiz_train_%.12d.jpg" % (
                    batch_no * args.batch_size + i)
                current_prediction["question"] = vizwiz_questions['questions'][
                    batch_no * args.batch_size + i]['question']

                #output_file.write("Ques:" + vizwiz_questions['questions'][batch_no*args.batch_size + i]['question'])
                answer_list = []
                answer_probab_tuples = [(-ans_prob[i][idx], idx)
                                        for idx in range(len(ans_prob[0]))]
                answer_probab_tuples.sort()
                for j in range(5):
                    answer_list.append(ans_map[answer_probab_tuples[j][1]])
                #output_file.write("Ans:" + ans_map[pred[i]])
                current_prediction["predicted_answer"] = answer_list
                #output_file.write("Ans:" + str(answer_list))
                #output_file.write('\n')
                solution["predictions"].append(current_prediction)
                #print("Ans:", ans_map[pred[i]])
                #print('\n')
            batch_no += 1
        output_file.write(json.dumps(solution))
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size'),
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--batch_size',
                        type=int,
                        default=200,
                        help='Batch Size')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='Learning Rate')
    parser.add_argument('--epochs', type=int, default=100, help='Epochs')
    parser.add_argument('--debug', type=bool, default=False, help='Debug')
    parser.add_argument('--resume_model',
                        type=str,
                        default=None,
                        help='Trained Model Path')
    parser.add_argument('--version',
                        type=int,
                        default=2,
                        help='VQA data version')

    args = parser.parse_args()
    print("Creating QuestionAnswer data")
    prepare_training_data('trainquestions.json', 'trainannotations.json',
                          'valquestions.json', 'valannotations.json')
    print("Prepared given data")
    print("Reading QuestionAnswer data")

    qa_data = load_questions_answers('newqadata.pkl', 'Data')
    print(qa_data['answer_vocab'])

    print("Creating Image features")
    ################################################
    split = 'train'
    vgg_file = open('Data/vgg16.tfmodel', 'rb')
    vgg16raw = vgg_file.read()
    vgg_file.close()

    graph_def = tf.GraphDef()
    graph_def.ParseFromString(vgg16raw)

    images = tf.placeholder("float", [None, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images": images})

    graph = tf.get_default_graph()

    for opn in graph.get_operations():
        print("Name", opn.name, opn.values())

    all_data = load_questions_answers()
    if split == "train":
        qa_data = all_data['training']
    else:
        qa_data = all_data['validation']

    image_ids = {}
    for qa in qa_data:
        image_ids[qa['image_id']] = 1

    image_id_list = [img_id for img_id in image_ids]
    print("Total Images", len(image_id_list))

    sess = tf.Session()
    fc7 = np.ndarray((len(image_id_list), 4096))
    idx = 0

    while idx < len(image_id_list):
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        image_batch = np.ndarray((10, 224, 224, 3))

        count = 0
        for i in range(0, 10):  # image_batch above holds 10 images, so cap the batch at 10 (args.batch_size is not used here)
            if idx >= len(image_id_list):
                break
            image_file = join('Data',
                              '%snew/%.1d.jpg' % (split, image_id_list[idx]))
            image_batch[i, :, :, :] = utils.load_image_array(image_file)
            idx += 1
            count += 1

        feed_dict = {images: image_batch[0:count, :, :, :]}
        fc7_tensor = graph.get_tensor_by_name("import/Relu_1:0")
        fc7_batch = sess.run(fc7_tensor, feed_dict=feed_dict)
        fc7[(idx - count):idx, :] = fc7_batch[0:count, :]
        end = time.perf_counter()
        print("Time for batch 10 photos", end - start)
        print("Hours For Whole Dataset",
              (len(image_id_list) * 1.0) * (end - start) / 60.0 / 60.0 / 10.0)

        print("Images Processed", idx)

    print("Saving fc7 features")
    h5f_fc7 = h5py.File(join('Data', 'fc7new.h5'), 'w')
    h5f_fc7.create_dataset('fc7_features', data=fc7)
    h5f_fc7.close()

    print("Saving image id list")
    h5f_image_id_list = h5py.File(join('Data', 'image_id_listnew.h5'), 'w')
    h5f_image_id_list.create_dataset('image_id_list', data=image_id_list)
    h5f_image_id_list.close()
    print("Done!")

    ##################################################
    print("Reading image features")
    fc7_features, image_id_list = load_fc7_features('Data', 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)
    qa_data = load_questions_answers('newqadata.pkl', 'Data')
    print(qa_data['answer_vocab'])
    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i
    ans_map = {
        qa_data['answer_vocab'][ans]: ans
        for ans in qa_data['answer_vocab']
    }
    model_options = {
        'num_lstm_layers': 2,
        'rnn_size': 512,
        'embedding_size': 512,
        'word_emb_dropout': 0.5,
        'image_dropout': 0.5,
        'fc7_feature_length': 4096,
        'lstm_steps': qa_data['max_question_length'] + 1,
        'q_vocab_size': len(qa_data['question_vocab']),
        'ans_vocab_size': len(qa_data['answer_vocab'])
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_loss, t_accuracy, t_p = model.build_model()
    train_op = tf.train.AdamOptimizer(0.001).minimize(t_loss)
    sess = tf.InteractiveSession()
    tf.global_variables_initializer().run()
    saver = tf.train.Saver()
    #model.summary()
    #plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
    if args.resume_model:
        saver.restore(sess, args.resume_model)
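    # Train for 100 epochs with mini-batches of 10 QA pairs, checkpointing
    # once per epoch.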
    for i in range(100):
        batch_no = 0
        while (batch_no * 10) < len(qa_data['training']):
            sentence, answer, fc7 = get_training_batch(batch_no, 10,
                                                       fc7_features,
                                                       image_id_map, qa_data,
                                                       'train')
            _, loss_value, accuracy, pred = sess.run(
                [train_op, t_loss, t_accuracy, t_p],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: sentence,
                    input_tensors['answer']: answer
                })
            batch_no += 1
            if args.debug:
                for idx, p in enumerate(pred):
                    print(ans_map[p], ans_map[np.argmax(answer[idx])])
                print("Loss", loss_value, batch_no, i)
                print("Accuracy", accuracy)
                print("---------------")
                # skplt's ROC plot needs true labels and class probabilities
                # for the whole batch, so it is disabled here (see below).
                #skplt.metrics.plot_roc_curve(answer[idx], ans_map[p])
                #plt.show()
            else:
                print("Loss", loss_value, batch_no, i)
                print("Training Accuracy", accuracy)
                #skplt.metrics.plot_roc_curve(answer[0], pred[0])
                #plt.show()
        save_path = saver.save(sess, "Data/Models/modelnew{}.ckpt".format(i))
Example #14
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path',
        type=str,
        default='Data/train_2014/COCO_train2014_000000581922.jpg',
        help='Image Path')
    parser.add_argument(
        '--model_path',
        type=str,
        default='Data/train2014/Tri Training 1/Models/model11.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data/train2014/Tri Training 1/',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='What is this product?',
                        help='Question')

    args = parser.parse_args()
    vizwiz_file_path = 'Data/Images'
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    qvocab = vocab_data['question_vocab']
    #print(qvocab)
    #print(0/0)
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }
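    # VizWiz questions converted to the VQA JSON layout (a top-level
    # 'questions' list, consumed in order below).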
    vizwiz_questions_path = 'VizWiz_to_VQA_Questions.json'
    with open(vizwiz_questions_path, 'r') as input_file:
        vizwiz_questions = json.loads(input_file.read())
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    i = 0
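    # Walk the VizWiz image folder and answer the i-th question for the
    # i-th .jpg, running one forward pass per image/question pair.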
    for file in os.listdir(vizwiz_file_path):
        if file.endswith(".jpg"):
            args.image_path = join(vizwiz_file_path, file)
            args.question = vizwiz_questions['questions'][i]['question']
            i += 1
            print("Image:", args.image_path)
            print("Question:", args.question)

            #fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'val')
            fc7_features = utils.extract_fc7_features(
                args.image_path,
                'Data/train2014/Tri Training 1/vgg16-20160129.tfmodel')

            model_options = {
                'num_lstm_layers': args.num_lstm_layers,
                'rnn_size': args.rnn_size,
                'embedding_size': args.embedding_size,
                'word_emb_dropout': args.word_emb_dropout,
                'image_dropout': args.image_dropout,
                'fc7_feature_length': args.fc7_feature_length,
                'lstm_steps': vocab_data['max_question_length'] + 1,
                'q_vocab_size': len(vocab_data['question_vocab']),
                'ans_vocab_size': len(vocab_data['answer_vocab'])
            }

            question_words = re.findall(word_regex, args.question)
            base = vocab_data['max_question_length'] - len(question_words)
            # Reset the id buffer so ids from a longer previous question do
            # not leak into this one, and avoid shadowing the outer counter i.
            question_ids[:] = 0
            for w in range(0, len(question_words)):
                if question_words[w] in question_vocab:
                    question_ids[0][base + w] = question_vocab[question_words[w]]
                else:
                    question_ids[0][base + w] = question_vocab['UNK']

            ans_map = {
                vocab_data['answer_vocab'][ans]: ans
                for ans in vocab_data['answer_vocab']
            }
            model = vis_lstm_model.Vis_lstm_model(model_options)
            input_tensors, t_prediction, t_ans_probab = model.build_generator()
            sess = tf.InteractiveSession()
            saver = tf.train.Saver()
            saver.restore(sess, args.model_path)

            print(question_ids.shape)
            print(fc7_features.shape)
            pred, answer_probab = sess.run(
                [t_prediction, t_ans_probab],
                feed_dict={
                    input_tensors['fc7']: fc7_features,
                    input_tensors['sentence']: question_ids,
                })

            print("Ans:", ans_map[pred[0]])
            answer_probab_tuples = [(-answer_probab[0][idx], idx)
                                    for idx in range(len(answer_probab[0]))]
            answer_probab_tuples.sort()
            print("Top Answers")
            for k in range(5):
                print(ans_map[answer_probab_tuples[k][1]])
Example #15
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path',
        type=str,
        default='Data/val2015/abstract_v002_val2015_000000022100.png',
        help='Image Path')
    parser.add_argument('--model_path',
                        type=str,
                        default='Data/Models/model19.ckpt',
                        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--cnn7_feature_length',
                        type=int,
                        default=512,
                        help='cnn7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='What is the man sitting on?',
                        help='Question')
    parser.add_argument('--lstm_direc',
                        type=str,
                        default='uni',
                        help='LSTM Direction')

    args = parser.parse_args()
    #Extract vocabulary of question and answer
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)

    #Build q_map but seems useless, ans_map useful
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }
    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
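    # Extract the 512-d 'cnn7' image features for the input image
    # (this variant uses cnn7 features in place of the 4096-d fc7 vector).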
    cnn7_features = utils.extract_cnn7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    #Word Splitting
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_words = re.findall(word_regex, args.question)
    #Find question's word in ques_vocab,record it in ques_id
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']  # 'UNK' has the biggest index in the question vocab

    #preparing model
    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'cnn7_feature_length': args.cnn7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab']),
    }
    #resume model
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator(batch=1)
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    #predict
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['cnn7']: cnn7_features,
            input_tensors['sentence']: question_ids,
        })

    #showing image/question/answer
    print("Image:", args.image_path)
    print("Question:", args.question)
    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    sess.close()
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
Example #16
0
def main():

	tf.app.flags.DEFINE_integer("num_lstm_layers", 2, "number of lstm layers")
	tf.app.flags.DEFINE_integer("img_feat_len", 1001, "length of image feature vector")
	tf.app.flags.DEFINE_integer("rnn_size", 300, "size of rnn")
	tf.app.flags.DEFINE_integer("que_feat_len", 300, "length of question feature vector")
	tf.app.flags.DEFINE_float("word_dropout", 0.5, "dropout rate of word nodes")
	tf.app.flags.DEFINE_float("img_dropout", 0.5, "dropout rate of image nodes")
	tf.app.flags.DEFINE_string("data_dir", "./data", "directory of data")
	tf.app.flags.DEFINE_integer("batch_size", 200, "size of batches")
	tf.app.flags.DEFINE_float("learning_rate", 0.001, "learning rate")
	tf.app.flags.DEFINE_integer("epochs", 200, "number of epochs")
	tf.app.flags.DEFINE_string("checkpoint_path", './data/pretrain/model', "directory of checkpoint files")
	tf.app.flags.DEFINE_bool("debug", True, "debug subroutine")
	FLAGS = tf.app.flags.FLAGS
	
	print("Reading QA DATA")
	qa_data = utils.load_questions_answers(FLAGS.data_dir)                                                           
	vocab_data = utils.get_question_answer_vocab(FLAGS.data_dir)
	print("Reading image features")
	img_features, image_id_list = utils.load_image_features(FLAGS.data_dir, "val")
	print("img features", img_features.shape)
	print("image_id_list", image_id_list.shape)

	image_id_map = {}
	for i in range(len(image_id_list)):
		image_id_map[ image_id_list[i] ] = i
	
	ans_map = { qa_data['answer_vocab'][ans] : ans for ans in qa_data['answer_vocab']}

	model_options = {
		'num_lstm_layers': FLAGS.num_lstm_layers,
		'rnn_size': FLAGS.rnn_size,
		'embedding_size': FLAGS.que_feat_len,
		'word_emb_dropout': FLAGS.word_dropout,
		'image_dropout': FLAGS.img_dropout,
		'img_feature_length': FLAGS.img_feat_len,
		'lstm_steps': vocab_data['max_question_length'] + 1,
		'q_vocab_size': len(vocab_data['question_vocab']),
		'ans_vocab_size': len(vocab_data['answer_vocab'])
	}
	
	model = vis_lstm_model.Vis_lstm_model(model_options)
	input_tensors, t_prediction, t_ans_probab = model.build_generator()
	with tf.Session() as sess:
		restorer = tf.train.Saver()

		avg_accuracy = 0.0
		total = 0
		checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
		restorer.restore(sess, checkpoint)
		
		batch_no = 0
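		# Evaluate on the validation split in batches, accumulating per-batch
		# accuracy and printing the running average at the end.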
		while (batch_no*FLAGS.batch_size) < len(qa_data['validation']):
			sentence, answer, img = get_batch(batch_no, FLAGS.batch_size, 
				img_features, image_id_map, qa_data)
			
			pred, ans_prob = sess.run([t_prediction, t_ans_probab], feed_dict={
				input_tensors['img']: img,
				input_tensors['sentence']: sentence,
			})
			
			batch_no += 1
			if FLAGS.debug:
				for idx, p in enumerate(pred):
					print(ans_map[p], ans_map[ np.argmax(answer[idx])])

			correct_predictions = np.equal(pred, np.argmax(answer, 1))
			correct_predictions = correct_predictions.astype('float32')
			accuracy = correct_predictions.mean()
			print("Acc", accuracy)
			avg_accuracy += accuracy
			total += 1
		
		print("Acc", avg_accuracy/total)