Example #1
def update_output(n_clicks, value):
    vocab_data = data_loader.get_question_answer_vocab("2")
    qvocab = vocab_data['question_vocab']
    q_map = { vocab_data['question_vocab'][qw] : qw for qw in vocab_data['question_vocab']}
    print('filename:', filen)  # `filen` is assumed to come from the enclosing scope (the image path)
    fc7_features = utils.extract_fc7_features(filen, 'Data/vgg16.tfmodel')
    model_options = {
        'num_lstm_layers': 2,
        'rnn_size': 512,
        'embedding_size': 512,
        'word_emb_dropout': 0.5,
        'image_dropout': 0.5,
        'fc7_feature_length': 4096,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']), dtype = 'int32')
    print('question:', value)
    question_words = re.findall(word_regex, value)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[ question_words[i] ]
        else:
            question_ids[0][base + i] = question_vocab['UNK']
    ans_map = { vocab_data['answer_vocab'][ans] : ans for ans in vocab_data['answer_vocab']}
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, 'Data/Models/modelnew99.ckpt')
    pred, answer_probab = sess.run([t_prediction, t_ans_probab], feed_dict={
        input_tensors['fc7']:fc7_features,
        input_tensors['sentence']:question_ids,
    })
    print("answerprediction",pred[0])
    #model.summary()
    #plot_model(model,to_file='predictmodel.png')
    print ("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print ("Top Answers")
    for i in range(1):
        print (ans_map[ answer_probab_tuples[0][1] ])
        #ans=(ans_map[answer_probab_tuples[i][1] ])
        lang = "en"
        text="This is a "+ans_map[ answer_probab_tuples[0][1] ]
        speech = Speech(text, lang)

        sox_effects = ("speed", "0.8")
        speech.play(sox_effects)
        
    return ans_map[answer_probab_tuples[0][1]]
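This callback is only a fragment. A minimal sketch of how it might be wired into a Dash app follows; the app layout, component ids, and the global filen are assumptions for illustration, not part of the original snippet (assumes Dash 2.x):

import dash
from dash import dcc, html
from dash.dependencies import Input, Output, State

app = dash.Dash(__name__)
filen = 'Data/cat.jpeg'  # assumed global: path of the image to ask about

app.layout = html.Div([
    dcc.Input(id='question', type='text', value='Which animal is this?'),
    html.Button('Ask', id='ask'),
    html.Div(id='answer'),
])

@app.callback(Output('answer', 'children'),
              [Input('ask', 'n_clicks')],
              [State('question', 'value')])
def ask(n_clicks, value):
    # Delegate to the callback body defined above.
    return update_output(n_clicks, value)

if __name__ == '__main__':
    app.run_server(debug=True)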
Example #2
def calcFeatures(image_path):
    data_dir = 'Data'
    vocab_data = data_loader.get_question_answer_vocab(data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    fc7_features = utils.extract_fc7_features(image_path,
                                              join(data_dir, 'vgg16.tfmodel'))
    return fc7_features
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path',
                        type=str,
                        default='Data/cat.jpeg',
                        help='Image Path')
    parser.add_argument('--model_path',
                        type=str,
                        default='Data/Models/model2.ckpt',
                        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='Which animal is this?',
                        help='Question')

    args = parser.parse_args()

    print("Image:", args.image_path)
    print("Question:", args.question)

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    fc7_features = utils.extract_fc7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })

    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
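Every example here repeats the same question-encoding step: words are right-aligned into a fixed-length vector of vocabulary ids, with out-of-vocabulary words mapped to 'UNK'. A self-contained sketch of just that step (the toy vocabulary and length are illustrative, not from the original):

import re
import numpy as np

def encode_question(question, question_vocab, max_question_length):
    # Right-align word ids; unknown words fall back to the 'UNK' id.
    words = re.findall(r'\w+', question)[:max_question_length]  # guard against overlong questions
    question_ids = np.zeros((1, max_question_length), dtype='int32')
    base = max_question_length - len(words)
    for i, word in enumerate(words):
        question_ids[0][base + i] = question_vocab.get(word, question_vocab['UNK'])
    return question_ids

# Toy usage:
vocab = {'UNK': 1, 'which': 2, 'animal': 3, 'is': 4, 'this': 5}
print(encode_question('which animal is this', vocab, 8))
# [[0 0 0 0 2 3 4 5]]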
Example #4
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--image_path', type=str, default = 'Data/cat.jpeg',
                       help='Image Path')
	parser.add_argument('--model_path', type=str, default = 'Data/Models/model2.ckpt',
                       help='Model Path')
	parser.add_argument('--num_lstm_layers', type=int, default=2,
                       help='num_lstm_layers')
	parser.add_argument('--fc7_feature_length', type=int, default=4096,
                       help='fc7_feature_length')
	parser.add_argument('--rnn_size', type=int, default=512,
                       help='rnn_size')
	parser.add_argument('--embedding_size', type=int, default=512,
                       help='embedding_size')
	parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                       help='word_emb_dropout')
	parser.add_argument('--image_dropout', type=float, default=0.5,
                       help='image_dropout')
	parser.add_argument('--data_dir', type=str, default='Data',
                       help='Data directory')
	parser.add_argument('--question', type=str, default='Which animal is this?',
                       help='Question')

	args = parser.parse_args()

	print "Image:", args.image_path
	print "Question:", args.question

	vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
	qvocab = vocab_data['question_vocab']
	q_map = { vocab_data['question_vocab'][qw] : qw for qw in vocab_data['question_vocab']}
	
	fc7_features = utils.extract_fc7_features(args.image_path, join(args.data_dir, 'vgg16.tfmodel'))
	
	model_options = {
		'num_lstm_layers' : args.num_lstm_layers,
		'rnn_size' : args.rnn_size,
		'embedding_size' : args.embedding_size,
		'word_emb_dropout' : args.word_emb_dropout,
		'image_dropout' : args.image_dropout,
		'fc7_feature_length' : args.fc7_feature_length,
		'lstm_steps' : vocab_data['max_question_length'] + 1,
		'q_vocab_size' : len(vocab_data['question_vocab']),
		'ans_vocab_size' : len(vocab_data['answer_vocab'])
	}
	
	question_vocab = vocab_data['question_vocab']
	word_regex = re.compile(r'\w+')
	question_ids = np.zeros((1, vocab_data['max_question_length']), dtype = 'int32')
	question_words = re.findall(word_regex, args.question)
	base = vocab_data['max_question_length'] - len(question_words)
	for i in range(0, len(question_words)):
		if question_words[i] in question_vocab:
			question_ids[0][base + i] = question_vocab[ question_words[i] ]
		else:
			question_ids[0][base + i] = question_vocab['UNK']

	ans_map = { vocab_data['answer_vocab'][ans] : ans for ans in vocab_data['answer_vocab']}
	model = vis_lstm_model.Vis_lstm_model(model_options)
	input_tensors, t_prediction, t_ans_probab = model.build_generator()
	sess = tf.InteractiveSession()
	saver = tf.train.Saver()
	saver.restore(sess, args.model_path)
	
	pred, answer_probab = sess.run([t_prediction, t_ans_probab], feed_dict={
        input_tensors['fc7']:fc7_features,
        input_tensors['sentence']:question_ids,
    })

	
	print "Ans:", ans_map[pred[0]]
	answer_probab_tuples = [(-answer_probab[0][idx], idx) for idx in range(len(answer_probab[0]))]
	answer_probab_tuples.sort()
	print "Top Answers"
	for i in range(5):
		print ans_map[ answer_probab_tuples[i][1] ]
Example #5
def serve(img_features, question):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path',
                        type=str,
                        default='Data/model2.ckpt',
                        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default=question,
                        help='Question')

    args = parser.parse_args()

    tf.reset_default_graph()

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()

    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: img_features,
            input_tensors['sentence']: question_ids,
        })

    sess.close()

    ans_list = [ans_map[pred[0]]]

    return ans_list
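One caveat, not from the original: parse_args() inside serve() reads the host process's sys.argv, so a web server's own command-line flags can crash it. A hedged workaround sketch (safe_serve is a hypothetical wrapper, not part of the snippet):

import sys

def safe_serve(img_features, question):
    # Temporarily hide the server's own flags so argparse inside serve()
    # sees only its defaults (and the question passed in).
    saved_argv = sys.argv
    sys.argv = [saved_argv[0]]
    try:
        return serve(img_features, question)
    finally:
        sys.argv = saved_argv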
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path',
                        type=str,
                        default=MODEL_PATH,
                        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--feature_length',
                        type=int,
                        default=4096,
                        help='feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=1.0,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=1.0,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default=DATA_PATH,
                        help='Data directory')
    parser.add_argument('--image_features',
                        type=str,
                        default='vgg16',
                        help='Image features')
    args = parser.parse_args()

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'feature_length': args.feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')

    print("Reading QA DATA")
    test_data = data_loader.load_test_questions()
    print(len(test_data['question_vocab']))
    features, image_id_list = data_loader.load_features(
        args.data_dir, 'test', args.image_features)

    print("Features", features.shape)
    print("Image_id_list", image_id_list.shape)

    image_id_map = {}
    for i in range(len(image_id_list)):
        image_id_map[image_id_list[i]] = i

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    stop_vocab = ['a', 'an', 'the']

    for i, now_image in enumerate(test_data['testing']):
        now_image_path = 'Data/test2015/COCO_test2015_%.12d.jpg' % (
            now_image['image_id'])
        img = Image.open(now_image_path)
        img.show()
        question_ids = np.zeros((1, vocab_data['max_question_length']),
                                dtype='int32')

        print('Question:', now_image['question'])
        question_words = re.findall(word_regex, now_image['question'])
        question_words = list(
            filter(lambda x: data_loader.vocab_handle(x) not in stop_vocab,
                   question_words))
        base = vocab_data['max_question_length'] - len(question_words)
        for j in range(0, len(question_words)):
            now_question_words = data_loader.vocab_handle(question_words[j])
            if now_question_words in question_vocab:
                question_ids[0][base + j] = question_vocab[now_question_words]
            else:
                question_ids[0][base + j] = question_vocab['UNK']

        now_index = image_id_map[test_data['testing'][i]['image_id']]

        pred, answer_probab = sess.run(
            [t_prediction, t_ans_probab],
            feed_dict={
                input_tensors['features']:
                features[now_index].reshape(1, args.feature_length),
                input_tensors['sentence']:
                question_ids,
            })

        print("Ans:", ans_map[pred[0]])
        answer_probab_tuples = [(-answer_probab[0][idx], idx)
                                for idx in range(len(answer_probab[0]))]
        answer_probab_tuples.sort()
        print("Top Answers:")
        for j in range(5):
            print(ans_map[answer_probab_tuples[j][1]])
        input()  # pause until Enter before moving to the next image
    sess.close()
Example #7
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path',
        type=str,
        default='Data/train_2014/COCO_train2014_000000581922.jpg',
        help='Image Path')
    parser.add_argument(
        '--model_path',
        type=str,
        default='Data/train2014/Tri Training 3/Models/model49.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data/train2014/Tri Training 3/',
                        help='Data directory')
    parser.add_argument('--batch_size',
                        type=int,
                        default=200,
                        help='Batch Size')
    parser.add_argument('--question',
                        type=str,
                        default='What is this product?',
                        help='Question')

    args = parser.parse_args()
    #vizwiz_file_path = 'Data/Test'
    solution = dict()
    solution["model"] = "model_3"
    solution["predictions"] = []
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    vizwiz_questions_path = 'VizWiz_to_VQA_Questions.json'
    with open(vizwiz_questions_path, 'r') as input_file:
        vizwiz_questions = json.loads(input_file.read())
    '''with open('questions_temp.txt','w') as temp_file:
        for i in range(20000):
            #print(vizwiz_questions['questions'][i]['question']
            temp_file.write(vizwiz_questions['questions'][i]['question'])
            temp_file.write('\n')'''

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    #question_ids = np.zeros((20000, vocab_data['max_question_length']), dtype = 'int32')
    #fc7_features = np.zeros((2, args.fc7_feature_length))

    print("Reading fc7 features")
    fc7_features, image_id_list = data_loader.load_fc7_features(
        'Data/', 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)
    #print(0/0)
    '''i=0
    for file in os.listdir(vizwiz_file_path):
        if file.endswith(".jpg"):
            args.image_path = join(vizwiz_file_path,file)
            #args.question = vizwiz_questions['questions'][i]['question']
            print("Image:", args.image_path)
            print("Question:", args.question)
            fc7_features[i] = utils.extract_fc7_features(args.image_path, 'Data/vgg16-20160129.tfmodel')
            i += 1'''

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    #question_words = re.findall(word_regex, args.question)
    #base = vocab_data['max_question_length'] - len(question_words)
    '''for no_questions in range(question_ids.shape[0]):
        for i in range(0, len(question_words)):
            if question_words[i] in question_vocab:
                question_ids[no_questions][base + i] = question_vocab[ question_words[i] ]
            else:
                question_ids[no_questions][base + i] = question_vocab['UNK']'''

    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()

    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    batch_no = 0
    with open('result3.txt', 'w') as output_file:

        while (batch_no * args.batch_size) < 20000:
            question_ids = np.zeros(
                (args.batch_size, vocab_data['max_question_length']),
                dtype='int32')
            #vizwiz_questions['questions'][i]['question']
            for no_questions in range(question_ids.shape[0]):
                question_formatted = vizwiz_questions['questions'][
                    batch_no * args.batch_size + no_questions]['question']
                question_list = question_formatted.split()
                question_list = question_list[0:20]
                question_formatted = ' '.join(question_list)
                question_words = re.findall(word_regex, question_formatted)
                base = vocab_data['max_question_length'] - len(question_words)
                for i in range(0, len(question_words)):
                    if question_words[i] in question_vocab:
                        question_ids[no_questions][base + i] = question_vocab[
                            question_words[i]]
                    else:
                        question_ids[no_questions][base +
                                                   i] = question_vocab['UNK']

            fc7 = get_batch(batch_no, args.batch_size, fc7_features)
            pred, ans_prob = sess.run(
                [t_prediction, t_ans_probab],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: question_ids,
                })

            for i in range(len(pred)):
                current_prediction = dict()
                current_prediction["image_id"] = "VizWiz_train_%.12d.jpg" % (
                    batch_no * args.batch_size + i)
                current_prediction["question"] = vizwiz_questions['questions'][
                    batch_no * args.batch_size + i]['question']

                #output_file.write("Ques:" + vizwiz_questions['questions'][batch_no*args.batch_size + i]['question'])
                answer_list = []
                answer_probab_tuples = [(-ans_prob[i][idx], idx)
                                        for idx in range(len(ans_prob[0]))]
                answer_probab_tuples.sort()
                for j in range(5):
                    answer_list.append(ans_map[answer_probab_tuples[j][1]])
                #output_file.write("Ans:" + ans_map[pred[i]])
                current_prediction["predicted_answer"] = answer_list
                #output_file.write("Ans:" + str(answer_list))
                #output_file.write('\n')
                solution["predictions"].append(current_prediction)
                #print("Ans:", ans_map[pred[i]])
                #print('\n')
            batch_no += 1
        output_file.write(json.dumps(solution))
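get_batch is called above but never defined in the snippet. Given how it is used (batch number, batch size, fc7 feature matrix), a plausible implementation is a plain slice; this is an assumption, not the original helper:

def get_batch(batch_no, batch_size, features):
    # Assumed helper: slice one batch of fc7 features along the first axis.
    # The final batch may be shorter than batch_size.
    start = batch_no * batch_size
    return features[start:start + batch_size]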
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path',
                        type=str,
                        default='/home/vmhatre/vqa_supervised/Data/cat.jpeg',
                        help='Image Path')
    parser.add_argument(
        '--model_path',
        type=str,
        default='/home/vmhatre/vqa_supervised/Data/Models/model2.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='----Question to ask?',
                        help='Question')

    args = parser.parse_args()
    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    # Load the question and answer vocabulary
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }
    # Extract fc7 features from the VGG model
    fc7_features = utils.extract_fc7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    # Encode the question words as vocabulary ids
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    # Map answer ids back to answer strings
    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    # Restore the trained model from its checkpoint
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    # Run the model on the fc7 features and the encoded question.
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })
    # Check the most probable answers for the image and question
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path',
        type=str,
        default='Data/train_2014/COCO_train2014_000000581922.jpg',
        help='Image Path')
    parser.add_argument(
        '--model_path',
        type=str,
        default='Data/train2014/Tri Training 1/Models/model11.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length',
                        type=int,
                        default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data/train2014/Tri Training 1/',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='What is this product?',
                        help='Question')

    args = parser.parse_args()
    vizwiz_file_path = 'Data/Images'
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    qvocab = vocab_data['question_vocab']
    #print(qvocab)
    #print(0/0)
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }
    vizwiz_questions_path = 'VizWiz_to_VQA_Questions.json'
    with open(vizwiz_questions_path, 'r') as input_file:
        vizwiz_questions = json.loads(input_file.read())
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    i = 0
    for file in os.listdir(vizwiz_file_path):
        if file.endswith(".jpg"):
            args.image_path = join(vizwiz_file_path, file)
            args.question = vizwiz_questions['questions'][i]['question']
            i += 1
            print("Image:", args.image_path)
            print("Question:", args.question)

            #fc7_features, image_id_list = data_loader.load_fc7_features(args.data_dir, 'val')
            fc7_features = utils.extract_fc7_features(
                args.image_path,
                'Data/train2014/Tri Training 1/vgg16-20160129.tfmodel')

            model_options = {
                'num_lstm_layers': args.num_lstm_layers,
                'rnn_size': args.rnn_size,
                'embedding_size': args.embedding_size,
                'word_emb_dropout': args.word_emb_dropout,
                'image_dropout': args.image_dropout,
                'fc7_feature_length': args.fc7_feature_length,
                'lstm_steps': vocab_data['max_question_length'] + 1,
                'q_vocab_size': len(vocab_data['question_vocab']),
                'ans_vocab_size': len(vocab_data['answer_vocab'])
            }

            question_words = re.findall(word_regex, args.question)
            base = vocab_data['max_question_length'] - len(question_words)
            # Use a separate index so the file counter `i` is not clobbered.
            for j in range(0, len(question_words)):
                if question_words[j] in question_vocab:
                    question_ids[0][base + j] = question_vocab[question_words[j]]
                else:
                    question_ids[0][base + j] = question_vocab['UNK']

            ans_map = {
                vocab_data['answer_vocab'][ans]: ans
                for ans in vocab_data['answer_vocab']
            }
            model = vis_lstm_model.Vis_lstm_model(model_options)
            input_tensors, t_prediction, t_ans_probab = model.build_generator()
            sess = tf.InteractiveSession()
            saver = tf.train.Saver()
            saver.restore(sess, args.model_path)

            print(question_ids.shape)
            print(fc7_features.shape)
            pred, answer_probab = sess.run(
                [t_prediction, t_ans_probab],
                feed_dict={
                    input_tensors['fc7']: fc7_features,
                    input_tensors['sentence']: question_ids,
                })

            print("Ans:", ans_map[pred[0]])
            answer_probab_tuples = [(-answer_probab[0][idx], idx)
                                    for idx in range(len(answer_probab[0]))]
            answer_probab_tuples.sort()
            print("Top Answers")
            for j in range(5):
                print(ans_map[answer_probab_tuples[j][1]])
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path',
        type=str,
        default='Data/val2015/abstract_v002_val2015_000000022100.png',
        help='Image Path')
    parser.add_argument('--model_path',
                        type=str,
                        default='Data/Models/model19.ckpt',
                        help='Model Path')
    parser.add_argument('--num_lstm_layers',
                        type=int,
                        default=2,
                        help='num_lstm_layers')
    parser.add_argument('--cnn7_feature_length',
                        type=int,
                        default=512,
                        help='cnn7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size',
                        type=int,
                        default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout',
                        type=float,
                        default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout',
                        type=float,
                        default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir',
                        type=str,
                        default='Data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='What is the man sitting on?',
                        help='Question')
    parser.add_argument('--lstm_direc',
                        type=str,
                        default='uni',
                        help='LSTM Direction')

    args = parser.parse_args()
    #Extract vocabulary of question and answer
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)

    # q_map is unused here; ans_map is what decodes the predictions
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }
    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }
    cnn7_features = utils.extract_cnn7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    #Word Splitting
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_words = re.findall(word_regex, args.question)
    # Look up each question word in question_vocab and record its id in question_ids
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab[
                'UNK']  # out-of-vocabulary word

    #preparing model
    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'cnn7_feature_length': args.cnn7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab']),
    }
    #resume model
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator(batch=1)
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    #predict
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['cnn7']: cnn7_features,
            input_tensors['sentence']: question_ids,
        })

    #showing image/question/answer
    print("Image:", args.image_path)
    print("Question:", args.question)
    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    sess.close()
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
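Note that word_emb_dropout and image_dropout default to 0.5 here even though the script only runs inference; Example #6's defaults of 1.0 suggest these options are keep-probabilities. If so, an inference run would want to keep all units; a sketch under that assumption:

# Assuming the dropout options are keep-probabilities (as Example #6 suggests),
# inference should disable dropout by keeping everything.
inference_options = dict(model_options)
inference_options['word_emb_dropout'] = 1.0
inference_options['image_dropout'] = 1.0
model = vis_lstm_model.Vis_lstm_model(inference_options)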
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path',
                        type=str,
                        default='data/cat.jpg',
                        help='Image Path')
    parser.add_argument('--model_path',
                        type=str,
                        default='att1_hard.pth',
                        help='Model Path')
    parser.add_argument('--data_dir',
                        type=str,
                        default='data',
                        help='Data directory')
    parser.add_argument('--question',
                        type=str,
                        default='Which animal is this?',
                        help='Question')

    args = parser.parse_args()

    print("Image:", args.image_path)
    print("Question:", args.question)

    # build up vgg image feature extractor
    Vgg19 = models.vgg19(pretrained=True)
    extract_list = [27, 36]
    extractor = FeatureExtractor(Vgg19.features, extract_list)
    extractor.eval()

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    preprocess = transforms.Compose(
        [transforms.Resize((224, 224)),
         transforms.ToTensor(), normalize])

    img = preprocess(Image.open(args.image_path))
    img = torch.unsqueeze(img, 0)
    fc7 = extractor(img)
    fc7 = fc7.permute(0, 2, 3, 1)
    fc7 = fc7.view(1, -1, fc7.shape[3])

    # get vocabulary to encode questions
    vocab_data = data_loader.get_question_answer_vocab(version=2,
                                                       data_dir=args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {
        vocab_data['question_vocab'][qw]: qw
        for qw in vocab_data['question_vocab']
    }

    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(0, len(question_words)):
        if question_words[i] in question_vocab:
            question_ids[0][base + i] = question_vocab[question_words[i]]
        else:
            question_ids[0][base + i] = question_vocab['UNK']

    ans_map = {
        vocab_data['answer_vocab'][ans]: ans
        for ans in vocab_data['answer_vocab']
    }

    model = Attention_net()
    state_dict = torch.load(args.model_path)
    # create new OrderedDict that does not contain `module.`
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:]  # remove `module.`
        new_state_dict[name] = v
    # load params
    model.load_state_dict(new_state_dict)
    model.eval()

    q_ids = torch.tensor(question_ids, dtype=torch.long)
    pred, _, _ = model(fc7, q_ids)

    print("Ans:", ans_map[pred.data.max(1)[1].numpy()[0]])

    # Negate the scores so the ascending sort puts the best answers first.
    answer_probab_tuples = [(-pred[0][idx].item(), idx)
                            for idx in range(len(pred[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
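With the sign fix above the tuple sort works, but torch.topk is the idiomatic way to take the five highest-scoring answers; a short equivalent sketch:

# Top-5 answers with torch.topk instead of building and sorting tuples.
values, indices = torch.topk(pred[0], 5)
for idx in indices.tolist():
    print(ans_map[idx])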