def update_output(n_clicks, value):
    """Dash callback: answer the question `value` about the uploaded image.

    Restores the trained Vis-LSTM model, runs one forward pass over the
    image's VGG-16 fc7 features and the encoded question, speaks the top
    answer aloud, and returns it as the callback output.
    """
    vocab_data = data_loader.get_question_answer_vocab("2")
    question_vocab = vocab_data['question_vocab']
    # NOTE(review): `filen` is not defined in this function — presumably a
    # module-level global set by the file-upload callback; verify.
    print('filename::', filen)
    fc7_features = utils.extract_fc7_features(filen, 'Data/vgg16.tfmodel')
    model_options = {
        'num_lstm_layers': 2,
        'rnn_size': 512,
        'embedding_size': 512,
        'word_emb_dropout': 0.5,
        'image_dropout': 0.5,
        'fc7_feature_length': 4096,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    # Encode the question: word ids are right-aligned in a fixed-length
    # buffer (padding first); unknown words map to 'UNK'.
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    print('qst', value)
    question_words = re.findall(word_regex, value)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(len(question_words)):
        question_ids[0][base + i] = question_vocab.get(
            question_words[i], question_vocab['UNK'])
    ans_map = {vocab_data['answer_vocab'][ans]: ans
               for ans in vocab_data['answer_vocab']}
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, 'Data/Models/modelnew99.ckpt')
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })
    sess.close()  # fix: the session was previously leaked on every callback
    print("answerprediction", pred[0])
    print("Ans:", ans_map[pred[0]])
    # Negated probabilities so an ascending sort ranks the best answer first.
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(1):
        print(ans_map[answer_probab_tuples[0][1]])
    # Speak the top answer aloud, slightly slowed down.
    lang = "en"
    text = "This is a " + ans_map[answer_probab_tuples[0][1]]
    speech = Speech(text, lang)
    sox_effects = ("speed", "0.8")
    speech.play(sox_effects)
    return ans_map[answer_probab_tuples[0][1]]
def calcFeatures(image_path):
    """Extract and return VGG-16 fc7 features for the image at `image_path`.

    The QA vocabulary is loaded from the default 'Data' directory as in the
    original flow; only the fc7 feature array is returned.
    """
    data_dir = 'Data'
    vocab_data = data_loader.get_question_answer_vocab(data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {index: word for word, index in qvocab.items()}
    model_file = join(data_dir, 'vgg16.tfmodel')
    fc7_features = utils.extract_fc7_features(image_path, model_file)
    return fc7_features
def main():
    """CLI entry point: answer one question about one image.

    Pipeline: extract VGG-16 fc7 features for the image, encode the question
    into right-aligned vocabulary ids, restore the trained Vis-LSTM model,
    and print the predicted answer plus the top-5 candidates.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path', type=str, default='Data/cat.jpeg',
                        help='Image Path')
    parser.add_argument('--model_path', type=str,
                        default='Data/Models/model2.ckpt', help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data directory')
    parser.add_argument('--question', type=str,
                        default='Which animal is this?', help='Question')
    args = parser.parse_args()

    print("Image:", args.image_path)
    print("Question:", args.question)

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    question_vocab = vocab_data['question_vocab']
    fc7_features = utils.extract_fc7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    # Encode the question: ids right-aligned in a fixed-length buffer
    # (padding first); unknown words map to 'UNK'.
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(len(question_words)):
        question_ids[0][base + i] = question_vocab.get(
            question_words[i], question_vocab['UNK'])

    ans_map = {vocab_data['answer_vocab'][ans]: ans
               for ans in vocab_data['answer_vocab']}

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })
    sess.close()  # fix: the session was previously never released

    print("Ans:", ans_map[pred[0]])
    # Negated probabilities so an ascending sort ranks the best answer first.
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
def main():
    """CLI entry point: answer one question about one image (legacy copy).

    Same pipeline as the sibling `main` above; this copy previously used
    Python-2 `print` statements, which are syntax errors under Python 3 —
    converted to `print()` calls.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path', type=str, default='Data/cat.jpeg',
                        help='Image Path')
    parser.add_argument('--model_path', type=str,
                        default='Data/Models/model2.ckpt', help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data directory')
    parser.add_argument('--question', type=str,
                        default='Which animal is this?', help='Question')
    args = parser.parse_args()

    # fix: Python-2 print statements converted to Python-3 print() calls.
    print("Image:", args.image_path)
    print("Question:", args.question)

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    question_vocab = vocab_data['question_vocab']
    fc7_features = utils.extract_fc7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    # Encode the question: ids right-aligned, unknown words map to 'UNK'.
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(len(question_words)):
        question_ids[0][base + i] = question_vocab.get(
            question_words[i], question_vocab['UNK'])

    ans_map = {vocab_data['answer_vocab'][ans]: ans
               for ans in vocab_data['answer_vocab']}

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })
    sess.close()  # fix: the session was previously never released

    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
def serve(img_features, question):
    """Serve one VQA request: predict the answer for precomputed features.

    `img_features` are fc7 features already extracted elsewhere; `question`
    is the raw question string. Returns a single-element list containing the
    predicted answer string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='Data/model2.ckpt',
                        help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data directory')
    parser.add_argument('--question', type=str, default=question,
                        help='Question')
    args = parser.parse_args()

    # Fresh graph per request so repeated calls do not accumulate nodes.
    tf.reset_default_graph()
    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    qvocab = vocab_data['question_vocab']
    q_map = {index: word for word, index in qvocab.items()}

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    # Encode the question into right-aligned vocabulary ids ('UNK' fallback).
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    tokens = re.findall(word_regex, args.question)
    offset = vocab_data['max_question_length'] - len(tokens)
    for pos, token in enumerate(tokens):
        if token in question_vocab:
            question_ids[0][offset + pos] = question_vocab[token]
        else:
            question_ids[0][offset + pos] = question_vocab['UNK']

    ans_map = {index: word
               for word, index in vocab_data['answer_vocab'].items()}

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: img_features,
            input_tensors['sentence']: question_ids,
        })
    sess.close()

    ans_list = []
    ans_list.append(ans_map[pred[0]])
    return ans_list
def main():
    """Interactively evaluate the model on the COCO test2015 question set.

    Shows each test image, encodes its question (dropping articles),
    predicts an answer with the restored Vis-LSTM model and prints the
    top-5 candidates, then waits for <Enter> before the next image.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default=MODEL_PATH,
                        help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                        help='num_lstm_layers')
    parser.add_argument('--feature_length', type=int, default=4096,
                        help='feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=1.0,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=1.0,
                        help='image_dropout')
    parser.add_argument('--data_dir', type=str, default=DATA_PATH,
                        help='Data directory')
    parser.add_argument('--image_features', type=str, default='vgg16',
                        help='Image features')
    args = parser.parse_args()

    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'feature_length': args.feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    ans_map = {vocab_data['answer_vocab'][ans]: ans
               for ans in vocab_data['answer_vocab']}
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')

    print("Reading QA DATA")
    test_data = data_loader.load_test_questions()
    print(len(test_data['question_vocab']))
    features, image_id_list = data_loader.load_features(
        args.data_dir, 'test', args.image_features)
    print("Features", features.shape)
    print("Image_id_list", image_id_list.shape)
    # Map image id -> row index into the precomputed feature matrix.
    image_id_map = {image_id_list[i]: i for i in range(len(image_id_list))}

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    stop_vocab = ['a', 'an', 'the']
    for i, now_image in enumerate(test_data['testing']):
        now_image_path = 'Data/test2015/COCO_test2015_%.12d.jpg' % (
            now_image['image_id'])
        img = Image.open(now_image_path)
        img.show()
        question_ids = np.zeros((1, vocab_data['max_question_length']),
                                dtype='int32')
        print('Question:', now_image['question'])
        question_words = re.findall(word_regex, now_image['question'])
        # Drop articles before encoding.
        question_words = list(
            filter(lambda x: data_loader.vocab_handle(x) not in stop_vocab,
                   question_words))
        base = vocab_data['max_question_length'] - len(question_words)
        for j in range(len(question_words)):
            now_word = data_loader.vocab_handle(question_words[j])
            if now_word in question_vocab:
                question_ids[0][base + j] = question_vocab[now_word]
            else:
                question_ids[0][base + j] = question_vocab['UNK']
        now_index = image_id_map[test_data['testing'][i]['image_id']]
        pred, answer_probab = sess.run(
            [t_prediction, t_ans_probab],
            feed_dict={
                input_tensors['features']:
                    features[now_index].reshape(1, args.feature_length),
                input_tensors['sentence']: question_ids,
            })
        print("Ans:", ans_map[pred[0]])
        answer_probab_tuples = [(-answer_probab[0][idx], idx)
                                for idx in range(len(answer_probab[0]))]
        answer_probab_tuples.sort()
        print("Top Answers:")
        # fix: the rank loop previously reused `i`, clobbering the outer
        # image index for the rest of the iteration.
        for rank in range(5):
            print(ans_map[answer_probab_tuples[rank][1]])
        input()  # wait for the user before advancing to the next image
    sess.close()
def main():
    """Batch-predict answers for 20000 VizWiz questions (tri-training model 3).

    Loads precomputed fc7 features and the converted VizWiz questions, runs
    the restored Vis-LSTM model batch by batch, and writes a JSON document
    with the top-5 predicted answers per question to 'result3.txt'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path', type=str,
        default='Data/train_2014/COCO_train2014_000000581922.jpg',
        help='Image Path')
    parser.add_argument(
        '--model_path', type=str,
        default='Data/train2014/Tri Training 3/Models/model49.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir', type=str,
                        default='Data/train2014/Tri Training 3/',
                        help='Data directory')
    parser.add_argument('--batch_size', type=int, default=200,
                        help='Batch Size')
    parser.add_argument('--question', type=str,
                        default='What is this product?', help='Question')
    args = parser.parse_args()

    solution = dict()
    solution["model"] = "model_3"
    solution["predictions"] = []

    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    question_vocab = vocab_data['question_vocab']
    vizwiz_questions_path = 'VizWiz_to_VQA_Questions.json'
    with open(vizwiz_questions_path, 'r') as input_file:
        vizwiz_questions = json.loads(input_file.read())
    word_regex = re.compile(r'\w+')

    print("Reading fc7 features")
    fc7_features, image_id_list = data_loader.load_fc7_features(
        'Data/', 'train')
    print("FC7 features", fc7_features.shape)
    print("image_id_list", image_id_list.shape)

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }
    ans_map = {vocab_data['answer_vocab'][ans]: ans
               for ans in vocab_data['answer_vocab']}

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    batch_no = 0
    with open('result3.txt', 'w') as output_file:
        # 20000 questions total, processed in fixed-size batches.
        while (batch_no * args.batch_size) < 20000:
            question_ids = np.zeros(
                (args.batch_size, vocab_data['max_question_length']),
                dtype='int32')
            for no_questions in range(question_ids.shape[0]):
                question_formatted = vizwiz_questions['questions'][
                    batch_no * args.batch_size + no_questions]['question']
                # Truncate to 20 words so the encoding fits the LSTM steps.
                question_list = question_formatted.split()[0:20]
                question_formatted = ' '.join(question_list)
                question_words = re.findall(word_regex, question_formatted)
                base = vocab_data['max_question_length'] - len(question_words)
                for i in range(len(question_words)):
                    if question_words[i] in question_vocab:
                        question_ids[no_questions][base + i] = question_vocab[
                            question_words[i]]
                    else:
                        question_ids[no_questions][base + i] = question_vocab[
                            'UNK']
            fc7 = get_batch(batch_no, args.batch_size, fc7_features)
            pred, ans_prob = sess.run(
                [t_prediction, t_ans_probab],
                feed_dict={
                    input_tensors['fc7']: fc7,
                    input_tensors['sentence']: question_ids,
                })
            for i in range(len(pred)):
                current_prediction = dict()
                current_prediction["image_id"] = "VizWiz_train_%.12d.jpg" % (
                    batch_no * args.batch_size + i)
                current_prediction["question"] = vizwiz_questions['questions'][
                    batch_no * args.batch_size + i]['question']
                # Top-5 answers: negate so ascending sort is best-first.
                answer_list = []
                answer_probab_tuples = [(-ans_prob[i][idx], idx)
                                        for idx in range(len(ans_prob[0]))]
                answer_probab_tuples.sort()
                for j in range(5):
                    answer_list.append(ans_map[answer_probab_tuples[j][1]])
                current_prediction["predicted_answer"] = answer_list
                solution["predictions"].append(current_prediction)
            batch_no += 1
        output_file.write(json.dumps(solution))
def main():
    """CLI entry point: print the top-5 answers for one image/question pair.

    Extracts VGG-16 fc7 features, encodes the question, restores the trained
    Vis-LSTM model and prints the five most probable answers. The trailing
    Python-2 `print` statement (a SyntaxError under Python 3) is converted
    to a `print()` call.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path', type=str,
                        default='/home/vmhatre/vqa_supervised/Data/cat.jpeg',
                        help='Image Path')
    parser.add_argument(
        '--model_path', type=str,
        default='/home/vmhatre/vqa_supervised/Data/Models/model2.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data directory')
    parser.add_argument('--question', type=str,
                        default='----Question to ask?', help='Question')
    args = parser.parse_args()

    # Load the question/answer vocabulary.
    vocab_data = data_loader.get_question_answer_vocab(args.data_dir)
    question_vocab = vocab_data['question_vocab']

    # Extract fc7 features from the VGG model.
    fc7_features = utils.extract_fc7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'fc7_feature_length': args.fc7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab'])
    }

    # Encode the question: ids right-aligned, unknown words map to 'UNK'.
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(len(question_words)):
        question_ids[0][base + i] = question_vocab.get(
            question_words[i], question_vocab['UNK'])

    # Map answer ids back to answer strings.
    ans_map = {vocab_data['answer_vocab'][ans]: ans
               for ans in vocab_data['answer_vocab']}

    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator()
    # Restore the trained model from its checkpoint.
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['fc7']: fc7_features,
            input_tensors['sentence']: question_ids,
        })
    sess.close()  # fix: the session was previously never released

    # Rank answers best-first via negated probabilities.
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
def main():
    """Predict answers for every VizWiz image (tri-training model 1).

    Iterates the .jpg files in 'Data/Images', pairs each with the next
    VizWiz question, extracts fc7 features and prints the top-5 answers.

    Fixes applied:
    - removed `print(0 / 0)` — a deliberate ZeroDivisionError left in as a
      debugging stop that aborted the run after the first image;
    - the per-file question counter was `i`, which the word-encoding loop
      below also used, clobbering it and selecting wrong questions after
      the first image; it is now a dedicated `question_no`;
    - `question_ids` is re-zeroed per image so ids from a longer previous
      question no longer leak into a shorter one.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path', type=str,
        default='Data/train_2014/COCO_train2014_000000581922.jpg',
        help='Image Path')
    parser.add_argument(
        '--model_path', type=str,
        default='Data/train2014/Tri Training 1/Models/model11.ckpt',
        help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                        help='num_lstm_layers')
    parser.add_argument('--fc7_feature_length', type=int, default=4096,
                        help='fc7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir', type=str,
                        default='Data/train2014/Tri Training 1/',
                        help='Data directory')
    parser.add_argument('--question', type=str,
                        default='What is this product?', help='Question')
    args = parser.parse_args()

    vizwiz_file_path = 'Data/Images'
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    question_vocab = vocab_data['question_vocab']
    vizwiz_questions_path = 'VizWiz_to_VQA_Questions.json'
    with open(vizwiz_questions_path, 'r') as input_file:
        vizwiz_questions = json.loads(input_file.read())
    word_regex = re.compile(r'\w+')

    question_no = 0
    for file_name in os.listdir(vizwiz_file_path):
        if not file_name.endswith(".jpg"):
            continue
        args.image_path = join(vizwiz_file_path, file_name)
        args.question = vizwiz_questions['questions'][
            question_no]['question']
        question_no += 1
        print("Image:", args.image_path)
        print("Question:", args.question)
        fc7_features = utils.extract_fc7_features(
            args.image_path,
            'Data/train2014/Tri Training 1/vgg16-20160129.tfmodel')

        model_options = {
            'num_lstm_layers': args.num_lstm_layers,
            'rnn_size': args.rnn_size,
            'embedding_size': args.embedding_size,
            'word_emb_dropout': args.word_emb_dropout,
            'image_dropout': args.image_dropout,
            'fc7_feature_length': args.fc7_feature_length,
            'lstm_steps': vocab_data['max_question_length'] + 1,
            'q_vocab_size': len(vocab_data['question_vocab']),
            'ans_vocab_size': len(vocab_data['answer_vocab'])
        }

        # Fresh id buffer per image; ids right-aligned, 'UNK' fallback.
        question_ids = np.zeros((1, vocab_data['max_question_length']),
                                dtype='int32')
        question_words = re.findall(word_regex, args.question)
        base = vocab_data['max_question_length'] - len(question_words)
        for i in range(len(question_words)):
            if question_words[i] in question_vocab:
                question_ids[0][base + i] = question_vocab[question_words[i]]
            else:
                question_ids[0][base + i] = question_vocab['UNK']

        ans_map = {vocab_data['answer_vocab'][ans]: ans
                   for ans in vocab_data['answer_vocab']}

        # NOTE(review): the model/session is rebuilt per image as in the
        # original; under TF1 variable scoping this may fail on the second
        # iteration — consider hoisting the build/restore out of the loop.
        model = vis_lstm_model.Vis_lstm_model(model_options)
        input_tensors, t_prediction, t_ans_probab = model.build_generator()
        sess = tf.InteractiveSession()
        saver = tf.train.Saver()
        saver.restore(sess, args.model_path)
        print(question_ids.shape)
        print(fc7_features.shape)
        pred, answer_probab = sess.run(
            [t_prediction, t_ans_probab],
            feed_dict={
                input_tensors['fc7']: fc7_features,
                input_tensors['sentence']: question_ids,
            })
        sess.close()

        print("Ans:", ans_map[pred[0]])
        answer_probab_tuples = [(-answer_probab[0][idx], idx)
                                for idx in range(len(answer_probab[0]))]
        answer_probab_tuples.sort()
        print("Top Answers")
        for rank in range(5):
            print(ans_map[answer_probab_tuples[rank][1]])
def main():
    """Answer one question about one image with the cnn7-feature Vis-LSTM.

    Extracts conv features (cnn7) with VGG-16, encodes the question into
    right-aligned vocabulary ids, restores the trained model and prints the
    predicted answer plus the top-5 candidates.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image_path', type=str,
        default='Data/val2015/abstract_v002_val2015_000000022100.png',
        help='Image Path')
    parser.add_argument('--model_path', type=str,
                        default='Data/Models/model19.ckpt', help='Model Path')
    parser.add_argument('--num_lstm_layers', type=int, default=2,
                        help='num_lstm_layers')
    parser.add_argument('--cnn7_feature_length', type=int, default=512,
                        help='cnn7_feature_length')
    parser.add_argument('--rnn_size', type=int, default=512, help='rnn_size')
    parser.add_argument('--embedding_size', type=int, default=512,
                        help='embedding_size')
    parser.add_argument('--word_emb_dropout', type=float, default=0.5,
                        help='word_emb_dropout')
    parser.add_argument('--image_dropout', type=float, default=0.5,
                        help='image_dropout')
    parser.add_argument('--data_dir', type=str, default='Data',
                        help='Data directory')
    parser.add_argument('--question', type=str,
                        default='What is the man sitting on?',
                        help='Question')
    parser.add_argument('--lstm_direc', type=str, default='uni',
                        help='LSTM Direction')
    args = parser.parse_args()

    # Vocabulary for questions and answers; ans_map converts ids -> strings.
    vocab_data = data_loader.get_question_answer_vocab(data_dir=args.data_dir)
    q_map = {index: word
             for word, index in vocab_data['question_vocab'].items()}
    ans_map = {index: word
               for word, index in vocab_data['answer_vocab'].items()}

    cnn7_features = utils.extract_cnn7_features(
        args.image_path, join(args.data_dir, 'vgg16.tfmodel'))

    # Tokenize the question and encode it right-aligned into id buffer;
    # out-of-vocabulary words fall back to the 'UNK' id.
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    tokens = re.findall(word_regex, args.question)
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    offset = vocab_data['max_question_length'] - len(tokens)
    for pos, token in enumerate(tokens):
        question_ids[0][offset + pos] = question_vocab.get(
            token, question_vocab['UNK'])

    # Model hyperparameters for rebuilding the trained network.
    model_options = {
        'num_lstm_layers': args.num_lstm_layers,
        'rnn_size': args.rnn_size,
        'embedding_size': args.embedding_size,
        'word_emb_dropout': args.word_emb_dropout,
        'image_dropout': args.image_dropout,
        'cnn7_feature_length': args.cnn7_feature_length,
        'lstm_steps': vocab_data['max_question_length'] + 1,
        'q_vocab_size': len(vocab_data['question_vocab']),
        'ans_vocab_size': len(vocab_data['answer_vocab']),
    }

    # Rebuild the graph and restore trained weights from the checkpoint.
    model = vis_lstm_model.Vis_lstm_model(model_options)
    input_tensors, t_prediction, t_ans_probab = model.build_generator(batch=1)
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, args.model_path)

    # Single forward pass over the image features and encoded question.
    pred, answer_probab = sess.run(
        [t_prediction, t_ans_probab],
        feed_dict={
            input_tensors['cnn7']: cnn7_features,
            input_tensors['sentence']: question_ids,
        })

    # Report the prediction and the five most probable answers.
    print("Image:", args.image_path)
    print("Question:", args.question)
    print("Ans:", ans_map[pred[0]])
    answer_probab_tuples = [(-answer_probab[0][idx], idx)
                            for idx in range(len(answer_probab[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    sess.close()
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])
def main():
    """Answer one question about one image with the PyTorch attention model.

    Extracts VGG-19 conv features (layers 27 and 36), encodes the question
    into right-aligned vocabulary ids, loads the attention network from a
    DataParallel checkpoint, and prints the predicted answer plus the top-5
    candidates.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_path', type=str, default='data/cat.jpg',
                        help='Image Path')
    parser.add_argument('--model_path', type=str, default='att1_hard.pth',
                        help='Model Path')
    parser.add_argument('--data_dir', type=str, default='data',
                        help='Data directory')
    parser.add_argument('--question', type=str,
                        default='Which animal is this?', help='Question')
    args = parser.parse_args()

    print("Image:", args.image_path)
    print("Question:", args.question)

    # VGG-19 image feature extractor over the chosen conv layers.
    Vgg19 = models.vgg19(pretrained=True)
    extract_list = [27, 36]
    extractor = FeatureExtractor(Vgg19.features, extract_list)
    extractor.eval()
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    preprocess = transforms.Compose(
        [transforms.Resize((224, 224)), transforms.ToTensor(), normalize])
    img = preprocess(Image.open(args.image_path))
    img = torch.unsqueeze(img, 0)
    fc7 = extractor(img)
    # Flatten the spatial grid: (1, H, W, C) -> (1, H*W, C).
    fc7 = fc7.permute(0, 2, 3, 1)
    fc7 = fc7.view(1, -1, fc7.shape[3])

    # Encode the question: ids right-aligned, unknown words map to 'UNK'.
    vocab_data = data_loader.get_question_answer_vocab(version=2,
                                                       data_dir=args.data_dir)
    question_vocab = vocab_data['question_vocab']
    word_regex = re.compile(r'\w+')
    question_ids = np.zeros((1, vocab_data['max_question_length']),
                            dtype='int32')
    question_words = re.findall(word_regex, args.question)
    base = vocab_data['max_question_length'] - len(question_words)
    for i in range(len(question_words)):
        question_ids[0][base + i] = question_vocab.get(
            question_words[i], question_vocab['UNK'])

    ans_map = {vocab_data['answer_vocab'][ans]: ans
               for ans in vocab_data['answer_vocab']}

    model = Attention_net()
    state_dict = torch.load(args.model_path)
    # The checkpoint was saved from nn.DataParallel: strip the 'module.'
    # prefix from every parameter name before loading.
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k[7:]  # remove `module.`
        new_state_dict[name] = v
    model.load_state_dict(new_state_dict)
    model.eval()

    q_ids = torch.tensor(question_ids, dtype=torch.long)
    pred, _, _ = model(fc7, q_ids)
    print("Ans:", ans_map[pred.data.max(1)[1].numpy()[0]])
    # fix: scores were sorted ascending without negation, so "Top Answers"
    # printed the five LOWEST-scoring answers; negate (as the sibling
    # scripts do) so the ascending sort ranks best-first.
    answer_probab_tuples = [(-float(pred[0][idx]), idx)
                            for idx in range(len(pred[0]))]
    answer_probab_tuples.sort()
    print("Top Answers")
    for i in range(5):
        print(ans_map[answer_probab_tuples[i][1]])