def main(args):
    """Evaluate a trained encoder/decoder pair and optionally dump COCO scores.

    Fix: pickle requires a binary file handle; the original opened the
    results file in text mode ('w+'), which raises TypeError on Python 3.
    """
    # Load vocabulary wrapper.
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Build models; BN in the encoder must use running stats at test time.
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    measurement_score = test(encoder, decoder, vocab, args.num_samples,
                             args.num_hints, args.debug, args.c_step,
                             args.no_avg)
    if args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)
        # Fix: 'wb+' (binary) — pickle cannot write to a text-mode handle.
        with open(args.filepath, 'wb+') as f:
            pickle.dump((scores, scores_u), f)
def main(image):
    """Generate, print, and return a caption for a single image file path."""
    # Hyper-parameters and test-time transform come from the shared config.
    config = Config()
    transform = config.test_transform

    # Vocabulary wrapper.
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build models; BN in eval mode for inference.
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    # Load the trained (teacher) model parameters.
    encoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_cnn_path, config.trained_encoder)))
    decoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_lstm_path, config.trained_decoder)))

    # Image tensor with a leading batch dimension.
    pil_image = Image.open(image)
    image_tensor = Variable(transform(pil_image).unsqueeze(0))

    # Zero-initialised LSTM (h, c) states.
    state = (Variable(torch.zeros(config.num_layers, 1, config.hidden_size)),
             Variable(torch.zeros(config.num_layers, 1, config.hidden_size)))

    # Move everything to the GPU when one is available.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Sample a word-id sequence from the image feature.
    feature = encoder(image_tensor)
    word_ids = decoder.sample(feature, state)
    word_ids = word_ids.cpu().data.numpy()

    # Map ids back to words; id 96 is treated as an end marker here
    # (presumably the '<end>' index in this vocab — TODO confirm).
    words = []
    for wid in word_ids:
        words.append(vocab.idx2word[wid])
        if wid == 96:
            words.append('<end>')
            break
        if words[-1] == '<end>':
            break
    sentence = ' '.join(words)

    # Print out image and generated caption.
    print(sentence)
    return sentence
def main(args):
    """Caption every image in the resized val2014 folder and print results.

    Fixes: the failure counter was reset inside the loop (it could never
    exceed 1); models were moved to the GPU on every iteration; the bare
    `except:` also swallowed KeyboardInterrupt/SystemExit.
    """
    # Val images folder
    filepath = '/scratch/ys2542/pytorch-tutorial/tutorials/03-advanced/image_captioning/data/resizedval2014'
    onlyfiles = [fl for fl in listdir(filepath) if isfile(join(filepath, fl))]

    # image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # load vocabulary wrapper pickle file
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder

    # load the trained CNN and RNN parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Fix: move the models to the GPU once, not on every loop iteration.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Fix: count failed images across the whole run.
    badsize = 0
    for i in onlyfiles:
        args_image = join(filepath, i)  # val folder path with image name
        image = load_image(args_image, transform)
        image_tensor = to_var(image, volatile=True)

        # generate caption from image
        try:
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids.cpu().data.numpy()

            # decode word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            # print out image and generated caption without start and end
            print('beam_size_1' + '\t' + i + '\t' + sentence[8:-8])
        except Exception:
            badsize = badsize + 1  # count some wrong images
def main(args):
    """Caption `args.image` using an AlexNet-fc7-fed encoder/decoder."""
    # Preprocessing: scale + centre-crop to the model's input size.
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # Vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Pretrained AlexNet wrapped so its fc7 activations are captured.
    alexnet = models.alexnet(pretrained=True)
    alexnet2 = AlexNet2(alexnet)

    # Encoder consumes the 4096-d fc7 vector; BN runs in eval mode.
    encoder = EncoderCNN(4096, args.embed_size)
    encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Image tensor with a leading batch dimension.
    pil_image = Image.open(args.image)
    image_tensor = Variable(transform(pil_image).unsqueeze(0))

    # Zero-initialised (h, c) LSTM state.
    state = (Variable(torch.zeros(args.num_layers, 1, args.hidden_size)),
             Variable(torch.zeros(args.num_layers, 1, args.hidden_size)))

    # Move everything to the GPU when one is available.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        alexnet2.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Forward pass: fc7 features -> embedding -> sampled word ids.
    alexnet2(image_tensor)
    feature = encoder(alexnet2.fc7_value)
    word_ids = decoder.sample(feature, state).cpu().data.numpy()

    # Convert ids to words, stopping at the end token.
    words = []
    for wid in word_ids:
        token = vocab.idx2word[wid]
        words.append(token)
        if token == '<end>':
            break
    sentence = ' '.join(words)

    # Print out image and generated caption.
    print(sentence)
def inference_coco(encoder_file: str, decoder_file: str, embed_size: int,
                   hidden_size: int, from_cpu: bool) -> None:
    """
    Displays an original image from coco test dataset and prints its
    associated caption.

    encoder_file: Name of the encoder to load.
    decoder_file: Name of the decoder to load.
    embed_size:   Word embedding size for the encoder.
    hidden_size:  Hidden layer of the LSTM size.
    from_cpu:     Whether the model has been saved on CPU.
    """
    # Test-time transform: resize, random 224x224 crop, ImageNet stats.
    transform_test = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Device to use for inference.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data loader in test mode; grab one sample image.
    data_loader = get_loader(transform=transform_test, mode='test')
    _, image = next(iter(data_loader))

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Build both networks in inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load trained weights, remapping to CPU when they were saved there.
    load_kwargs = {'map_location': 'cpu'} if from_cpu else {}
    encoder.load_state_dict(
        torch.load(os.path.join('./models', encoder_file), **load_kwargs))
    decoder.load_state_dict(
        torch.load(os.path.join('./models', decoder_file), **load_kwargs))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    get_prediction(encoder, decoder, data_loader, device)
def main(args):
    """Caption an image, translate the caption to Indonesian, and speak it."""
    # ImageNet-normalised preprocessing.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Models in inference mode (BN uses running statistics).
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Input image as a Variable.
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Sample a caption and turn ids into words.
    feature = encoder(image_tensor)
    word_ids = decoder.sample(feature).cpu().data.numpy()
    words = []
    for wid in word_ids:
        token = vocab.idx2word[wid]
        words.append(token)
        if token == '<end>':
            break
    sentence = ' '.join(words)

    # Strip markers/punctuation, translate to Indonesian, and speak it.
    sentence = sentence.replace('<start> ', '').replace(' <end>', '').replace('.', '').strip()
    translator = Translator()
    sentence_indo = translator.translate(sentence, dest='id').text
    print('This is an image of: ' + sentence_indo)
    tts = gTTS(sentence_indo, 'id')
    tts.save('result.mp3')
    playsound('result.mp3')

    # Show the original image.
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    plt.show()
def main(args):
    """Caption a single image and record the result in `data`.

    Fix: the bare `except:` also caught KeyboardInterrupt/SystemExit;
    narrowed to `except Exception`.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models; BN layers use running stats in eval mode.
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    data = []
    img_path = args.image
    try:
        # Prepare image and sample a caption from its feature.
        image = load_image(img_path, transform)
        image_tensor = to_var(image, volatile=True)
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().data.numpy()

        # Decode word ids, skipping <start> and stopping at <end>.
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)

        # Print out image and generated caption.
        print(sentence)
        data.append({'key': img_path.split('/')[-1], 'sentence': sentence})
    except Exception:
        # Best-effort: report which image failed instead of crashing.
        print(img_path)
def main(args):
    """Caption every .jpg in directory `args.image`, printing id + caption.

    Fix: the bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    narrowed to `except Exception`.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models; BN layers use running stats in eval mode.
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Caption every jpg image in the directory.
    image_dir = args.image
    images = os.listdir(image_dir)
    for image_id in images:
        if not image_id.endswith('.jpg'):
            continue
        image = os.path.join(image_dir, image_id)
        image = load_image(image, transform)
        image_tensor = to_var(image, volatile=True)

        # Generate caption from image; skip images the model cannot encode.
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except Exception:
            continue

        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # Print out image and generated caption.
        print(image_id + '\t' + sentence)
def main(args):
    """Interactive caption-correction loop driven by BLEU feedback.

    Fix: the original mixed Python-2 `print` statements and `raw_input`
    into an otherwise Python-3 file (a SyntaxError under Python 3);
    converted to `print()` / `input()`. The ground-truth file is now
    closed via a context manager.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sentence = decode(feature, [], decoder, vocab)
    print(sentence)

    user_input = input("Does it make sense to you?(y/n)\n")
    if str(user_input) == "n":
        with open('data/step_1/caption_1.txt', 'r') as f:
            ground_true = f.read()
        # Teacher-forced prefix, seeded with the start token.
        teach_wordid = []
        teach_wordid.append(vocab.word2idx["<start>"])
        while True:
            print("This is the ground true:\n" + ground_true + "\n" +
                  "###################################################\n")
            reference = ground_true.split()
            hypothesis = sentence.split()
            BLEUscore = nltk.translate.bleu_score.sentence_bleu(
                [reference], hypothesis)
            print("Current BLEU score is " + str(BLEUscore))
            # Ask the user for the next ground-truth word and re-decode
            # with the extended teacher-forced prefix.
            word = input("next word:\n")
            word_idx = vocab.word2idx[word]
            teach_wordid.append(word_idx)
            sentence = decode(feature, teach_wordid, decoder, vocab)
            print("###################################################\n")
            print("Current Translated sentence is: \n" + sentence + "\n")
def main():
    """Streamlit app: upload an image and display a generated caption."""
    st.title('Image Captioning App')
    st.markdown(STYLE, unsafe_allow_html=True)

    file = st.file_uploader("Upload file", type=["png", "jpg", "jpeg"])
    show_file = st.empty()
    if not file:
        show_file.info("Please upload a file of type: " +
                       ", ".join(["png", "jpg", "jpeg"]))
        return
    content = file.getvalue()
    show_file.image(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Trained weights and the hyper-parameters they were trained with.
    encoder_file = 'encoder-5-batch-128-hidden-256-epochs-5.pkl'
    decoder_file = 'decoder-5-batch-128-hidden-256-epochs-5.pkl'
    embed_size = 300
    hidden_size = 256
    vocab_size, word2idx, idx2word = get_vocab()

    # Build the networks in inference mode and load their weights.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()
    encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
    decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))
    encoder.to(device)
    decoder.to(device)

    # Preprocessing matching training (ImageNet statistics).
    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Caption the uploaded image.
    PIL_image = Image.open(file).convert('RGB')
    orig_image = np.array(PIL_image)
    image = transform_test(PIL_image).to(device).unsqueeze(0)
    features = encoder(image).unsqueeze(1)
    output = decoder.sample(features)
    sentence = clean_sentence(output, idx2word)

    st.info("Generated caption --> " + sentence)
    file.close()
def main(args):
    """Caption `args.image`, showing a progress bar and then the image."""
    # ImageNet-normalised preprocessing.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Models in eval mode (BN uses running statistics).
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Input image.
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Cosmetic progress bar — advanced up-front, not tied to real work.
    bar = Bar('Processing', max=100)
    for _ in range(100):
        bar.next()

    # Sample a caption and decode the word ids.
    feature = encoder(image_tensor)
    word_ids = decoder.sample(feature).cpu().data.numpy()
    caption = []
    for wid in word_ids:
        token = vocab.idx2word[wid]
        caption.append(token)
        if token == '<end>':
            break
    sentence = ' '.join(caption)
    bar.finish()

    # Print out image and generated caption.
    print("\n")
    print(sentence)
    image = Image.open(args.image)
    imgplot = plt.imshow(np.asarray(image))
    plt.show()
def load_coco_encoder(self):
    """Build the pretrained COCO EncoderCNN on CUDA and set `self.transform`."""
    # Embedding size is hard-coded to match the pretrained weights below.
    weights = torch.load('/data/rxdh/conventions_data/encoder-5-3000.pkl')
    encoder = EncoderCNN(256).to('cuda')
    encoder.eval()
    encoder.load_state_dict(weights)
    # Preprocessing used by this encoder (ImageNet statistics).
    self.transform = transforms.Compose([
        transforms.CenterCrop(self.imsize),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
    return encoder
def main(args):
    """Caption `args.image` and print it without separators between tokens."""
    # Scale-only preprocessing (centre crop left disabled by the author).
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        #transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # Vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Models in inference mode (BN uses running statistics).
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Input image.
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        image_tensor = image_tensor.cuda()

    # Sample up to args.length word ids from the image feature.
    feature = encoder(image_tensor)
    word_ids = decoder.sample(feature, args.length).cpu().data.numpy()

    # Keep everything between the markers; stop at <end>.
    tokens = []
    for wid in word_ids:
        token = vocab.idx2word[wid]
        if token == '<end>':
            break
        if token != '<start>':
            tokens.append(token)
    # Joined without spaces — presumably for a script written without word
    # separators; TODO confirm this is intentional.
    sentence = ''.join(tokens)

    # Print out image and generated caption.
    print(sentence)
def main(args):
    """Caption `args.image`, print the caption, and display the image."""
    # ImageNet-normalised preprocessing.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Models in inference mode (BN uses running statistics).
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Input image.
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Sample and decode a caption.
    feature = encoder(image_tensor)
    word_ids = decoder.sample(feature).cpu().data.numpy()
    caption = []
    for wid in word_ids:
        token = vocab.idx2word[wid]
        caption.append(token)
        if token == '<end>':
            break
    sentence = ' '.join(caption)

    # Print out image and generated caption.
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def encode(img, vocab):
    """Return the CNN feature vector for the image at path `img`.

    Note: `vocab` is accepted for interface compatibility but unused here.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Encoder in eval mode (BN uses moving mean/variance).
    encoder = EncoderCNN(256)
    encoder.eval()
    encoder.load_state_dict(torch.load('../models/encoder-4-3000.pkl'))

    image_tensor = to_var(load_image(img, preprocess), volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()

    return encoder(image_tensor)
def main(args):
    """Caption `args.image` on the module-level `device` and print it."""
    # Preprocessing normalised to [-1, 1].
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # Vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build the networks and move them to the target device.
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load trained weights; the encoder's batchnorm must use running stats.
    encoder.load_state_dict(torch.load(args.encoder_path))
    encoder.eval()
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare the image tensor.
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Sample word ids: (1, max_seq_length) -> (max_seq_length,).
    feature = encoder(image_tensor)
    word_ids = decoder.sample(feature)[0].cpu().numpy()

    # Convert ids to words, stopping at the end token.
    caption = []
    for wid in word_ids:
        token = vocab.idx2word[wid]
        caption.append(token)
        if token == '<end>':
            break
    sentence = ' '.join(caption)

    # Print out the image and the generated caption.
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def generatecaption(image):
    """Generate a caption for `image` and write it into a Tk Entry widget.

    NOTE(review): this function references `self` (`self.sentence`,
    `self.Entry1`) but is not declared with a `self` parameter — as
    written it raises NameError when the caption is assigned. It looks
    like a method that lost its enclosing class; confirm and re-attach.
    """
    # Image preprocessing
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open('/root/ImageCaptioning/data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(256).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters; map_location='cpu' lets the
    # GPU-saved weights load on CPU-only hosts as well.
    encoder.load_state_dict(torch.load('models/encoder-5-3000.pkl', map_location='cpu'))
    decoder.load_state_dict(torch.load('models/decoder-5-3000.pkl', map_location='cpu'))
    encoder.eval()
    decoder.eval()

    # Prepare an image
    image = load_image(image, transform)
    image_tensor = image.to(device)

    # Generate an caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words, stopping at the end marker
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    self.sentence = ' '.join(sampled_caption)

    # Write the caption into the Entry widget;
    # [7:-5] strips the '<start> ' prefix and ' <end>' suffix
    self.Entry1.delete(0, END)
    self.Entry1.insert(0,self.sentence[7:-5])
def get_text_caption(image):
    """Return a cleaned caption string for an already-transformed image tensor."""
    # Vocabulary wrapper (paths and sizes come from the module-level `args`).
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build both networks on `device`.
    encoder = EncoderCNN(args.embed_size, args.model_type, args.mode)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters; eval mode so batchnorm uses
    # its moving mean/variance.
    model_dir = args.model_path + "_" + args.model_type
    encoder.load_state_dict(torch.load(model_dir + "/encoder.pt"))
    encoder.eval()
    decoder.load_state_dict(torch.load(model_dir + "/decoder.pt"))
    decoder.eval()

    # Sample word ids: (1, max_seq_length) -> (max_seq_length,).
    image_tensor = image.to(device)
    feature = encoder(image_tensor)
    word_ids = decoder.sample(feature)[0].cpu().numpy()
    print(word_ids)

    # Ids -> words, stopping at the end marker.
    tokens = []
    for wid in word_ids:
        token = vocab.idx2word[wid]
        tokens.append(token)
        if token == '<end>':
            break
    sentence = ' '.join(tokens)

    # Strip '<start> '/' <end>', drop the trailing two characters
    # (' .'), capitalise, and tighten comma spacing.
    return (sentence.split("<start> ")[1].split(" <end>")[0]
            [:-2].capitalize().replace(" , ", ", "))
class Annotator():
    """Loads a trained encoder/decoder pair and captions PIL images."""

    def __init__(self):
        # Same preprocessing the model was trained with (ImageNet stats).
        self.transform = transforms.Compose([
            transforms.Resize(256),       # smaller edge of image -> 256
            transforms.CenterCrop(224),   # central 224x224 crop
            transforms.ToTensor(),        # PIL Image -> tensor
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))])

        # Checkpoint with the best model, loaded onto the CPU.
        self.checkpoint = torch.load(os.path.join('./models', 'best-model.pkl'),
                                     'cpu')

        # Must match the values used during the training step.
        self.embed_size = 512
        self.hidden_size = 512

        # Vocabulary and its size.
        self.vocab = Vocabulary(None, './vocab.pkl', "<start>", "<end>",
                                "<unk>", "<pad>", "", "", True)
        self.vocab_size = len(self.vocab)

        # Encoder/decoder in inference mode with the trained weights.
        self.encoder = EncoderCNN(self.embed_size)
        self.encoder.eval()
        self.decoder = DecoderRNN(self.embed_size, self.hidden_size,
                                  self.vocab_size)
        self.decoder.eval()
        self.encoder.load_state_dict(self.checkpoint['encoder'])
        self.decoder.load_state_dict(self.checkpoint['decoder'])

        # Move models to GPU if CUDA is available.
        #if torch.cuda.is_available():
        #    encoder.cuda()
        #    decoder.cuda()

    def annotate(self, image):
        """Return a cleaned caption for a PIL `image`."""
        transformed = self.transform(image).unsqueeze(0)
        features = self.encoder(transformed).unsqueeze(1)
        # Beam search over the embedded image features.
        output = self.decoder.sample_beam_search(features)
        print('example output:', output)
        sentence = clean_sentence(output[0], self.vocab)
        print('example sentence:', sentence)
        return sentence
def get_caption(self, img_tensor):
    """Caption a preprocessed image tensor using saved encoder/decoder weights."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    print("running")

    # Saved weights and the dimensions they were trained with.
    encoder_file = 'legit_model/encoder_1.pkl'
    decoder_file = 'legit_model/decoder_1.pkl'
    embed_size = 512
    hidden_size = 512
    vocab_size = 8856  # size of the training vocabulary

    # Build both networks in inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    encoder.load_state_dict(
        torch.load(os.path.join('./models', encoder_file)))
    decoder.load_state_dict(
        torch.load(os.path.join('./models', decoder_file)))

    # Move models (and the input) to the chosen device.
    encoder.to(device)
    decoder.to(device)
    img_d = img_tensor.to(device)

    # Embed the image, sample a caption, and clean it up.
    features = encoder(img_d).unsqueeze(1)
    img_output = decoder.sample(features)
    return self.clean_sentence(img_output)
def initialize():
    """Load the best checkpoint plus vocabulary; return (encoder, decoder, vocab)."""
    # Checkpoint saved by training, remapped onto the CPU.
    checkpoint = torch.load(os.path.join('./models', 'best-model.pkl'),
                            map_location=torch.device('cpu'))

    # Dimensions used at training time.
    embed_size = 256
    hidden_size = 512

    # Vocabulary wrapper.
    with open('./vocab.pkl', "rb") as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)

    # Networks in inference mode with the checkpointed weights.
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    encoder.eval()
    decoder.eval()
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])

    return encoder, decoder, vocab
def get_model(device, vocab_size):
    """Return (encoder, decoder) loaded from disk and moved to `device`."""
    # model weights files
    encoder_file = "models/encoder-3.pkl"
    decoder_file = "models/decoder-3.pkl"

    # Dimensions used at training time.
    embed_size = 512
    hidden_size = 512

    # Build both networks in inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    encoder.load_state_dict(torch.load(encoder_file))
    decoder.load_state_dict(torch.load(decoder_file))

    # Move models to the requested device.
    encoder.to(device)
    decoder.to(device)
    return encoder, decoder
# The size of the vocabulary.
vocab_size = len(vocab)

# Build encoder/decoder and load the pretrained weights on the CPU.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

device = torch.device("cpu")
# encoder.to(device)
# decoder.to(device)

encoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('encoder')))
decoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('decoder')))
encoder.eval()
decoder.eval()

# Caption one batch: encode, sample ids, skip <start>, stop at <end>.
images, conv_images = next(iter(data_loader))
features = encoder(conv_images).unsqueeze(1)
output = decoder.sample(features, max_len=max_len)

caption_words = []
for idx in output:
    if idx == vocab.word2idx[vocab.start_word]:
        continue
    if idx == vocab.word2idx[vocab.end_word]:
        break
    caption_words.append(vocab.idx2word[idx])
print(' '.join(caption_words))
def main():
    """Train a student CNN+LSTM against a frozen teacher CNN (distillation).

    Fixes: the combined loss called an undefined name `criterion`
    (NameError at the first training step) — it now uses the MSE loss
    `criterion_cnn` built for exactly that feature-matching term; the
    hidden size / layer count are halved with integer division (`//`)
    so the constructors receive ints under Python 3.
    """
    # Configuration for hyper-parameters
    config = Config()

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    image_path = os.path.join(config.image_path, 'train2014')
    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
    train_loader = get_data_loader(image_path, json_path, vocab, transform,
                                   config.batch_size, shuffle=True,
                                   num_workers=config.num_threads)
    total_step = len(train_loader)

    # Build models; the teacher runs in eval mode and is never updated.
    teachercnn = EncoderCNN(config.embed_size)
    teachercnn.eval()
    studentcnn = StudentCNN_Model1(config.embed_size)
    # Load the best teacher model
    teachercnn.load_state_dict(
        torch.load(os.path.join('../TrainedModels/TeacherCNN',
                                config.trained_encoder)))
    studentlstm = DecoderRNN(config.embed_size, config.hidden_size // 2,
                             len(vocab), config.num_layers // 2)

    if torch.cuda.is_available():
        teachercnn.cuda()
        studentcnn.cuda()
        studentlstm.cuda()

    # Loss and Optimizer
    criterion_lstm = nn.CrossEntropyLoss()
    criterion_cnn = nn.MSELoss()
    params = list(studentlstm.parameters()) + list(studentcnn.parameters())
    optimizer_lstm = torch.optim.Adam(params, lr=config.learning_rate)
    optimizer_cnn = torch.optim.Adam(studentcnn.parameters(),
                                     lr=config.cnn_learningrate)

    print('entering in to training loop')

    # Train the Models
    for epoch in range(config.num_epochs):
        for i, (images, captions, lengths, img_ids) in enumerate(train_loader):
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            optimizer_lstm.zero_grad()
            optimizer_cnn.zero_grad()
            features_tr = teachercnn(images)
            features_st = studentcnn(images)
            outputs = studentlstm(features_st, captions, lengths)
            # Distillation loss: student features match the (detached)
            # teacher features, plus the usual caption cross-entropy.
            loss = (criterion_cnn(features_st, features_tr.detach()) +
                    criterion_lstm(outputs, targets))
            loss.backward()
            optimizer_cnn.step()
            optimizer_lstm.step()

            # Print log info
            if i % config.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, config.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the Model
            if (i + 1) % config.save_step == 0:
                torch.save(studentlstm.state_dict(),
                           os.path.join(config.student_lstm_path,
                                        'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(studentcnn.state_dict(),
                           os.path.join(config.student_cnn_path,
                                        'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
# Resume from a matching checkpoint if one exists: checkpoint files are
# named '<model_name>_<index>.tar'; the first one whose name and index
# match the requested model is loaded into encoder/decoder.
if checkpoints:
    for cp in checkpoints:
        name, num = cp[:-4].split('_')  # strip the '.tar' extension
        num = int(num)
        if name == model_name and model_idx == num:
            state_dict = torch.load(
                'checkpoint/{}_{}.tar'.format(model_name, num))
            encoder.load_state_dict(state_dict['encoder_state_dict'])
            decoder.load_state_dict(state_dict['decoder_state_dict'])
            #optimizer.load_state_dict(state_dict['optimizer_state_dict'])
            print('model_{}_{} is being used'.format(name,state_dict['epoch']))
            break

# test
# Evaluation pass: collect reference captions and beam-search predictions
# for the first 40 batches of the BLEU-4 loader, without tracking grads.
decoder.eval()
encoder.eval()
with torch.no_grad():
    all_ref = []
    all_pred = []
    #print('to device finish')
    for i, (images, batch_captions) in enumerate(BLEU4loader):
        # NOTE(review): only batches with i < 40 are used, but `continue`
        # means the loader is still iterated to the end — confirm whether
        # a `break` was intended.
        if i >= 40:
            continue
        all_ref.extend(batch_captions)
        images = images.to(device)
        #all_ref.extend(batch_captions)
        # Generate an caption from the image
        feature = encoder(images)
        all_pred.extend(decoder.beam_search(feature))
def main(args):
    """Train the caption decoder (and encoder head) on vanilla cross-entropy.

    Loads the vocabulary, builds encoder/decoder (optionally resuming from
    named checkpoint states), trains for ``args.num_epochs`` epochs, saves
    checkpoints every ``args.save_step`` steps, and logs losses to pickle
    files plus a text logfile under ``args.model_path``.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # NOTE(review): val_loader is built but never consumed in this function;
    # kept for interface stability with the commented-out validation logging.
    val_loader = get_loader('./data/val_resized2014/',
                            './data/annotations/captions_val2014.json',
                            vocab, transform, 1, False, 1)

    start_epoch = 0

    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Resolve checkpoint names; decoder checkpoints encode the epoch as the
    # second dash-separated field ("decoder-<epoch>-<step>.pkl").
    if args.restart:
        encoder_state, decoder_state = 'new', 'new'
    if encoder_state == '':
        encoder_state = 'new'
    if decoder_state == '':
        decoder_state = 'new'
    if decoder_state != 'new':
        start_epoch = int(decoder_state.split('-')[1])

    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)
    """ Make logfile and log output """
    with open(args.model_path + args.logfile, 'a+') as f:
        f.write("Training on vanilla loss (using new model). Started {} .\n".
                format(str(datetime.now())))
        f.write("Using encoder: new\nUsing decoder: new\n\n")

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer: only the decoder and the encoder's linear/BN head
    # are optimized; the CNN backbone stays frozen.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    batch_acc = []

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):
            # Set mini-batch dataset
            # BUG FIX: images were wrapped with volatile=True, which marks
            # the whole graph as inference-only and breaks loss.backward().
            images = to_var(images)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            out = decoder(features, captions, lengths)
            loss = criterion(out, targets)
            batch_loss.append(loss.data[0])

            loss.backward()
            optimizer.step()

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                # BUG FIX: pickle writes bytes; the files must be opened in
                # binary mode ('w+' text mode fails under Python 3).
                with open(args.model_path + 'training_loss.pkl', 'wb') as f:
                    pickle.dump(batch_loss, f)
                with open(args.model_path + 'training_val.pkl', 'wb') as f:
                    pickle.dump(batch_acc, f)
    with open(args.model_path + args.logfile, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
def _main():
    """CLI entry point: caption a photo, or serve captions over HTTP.

    Parses command-line arguments, loads the vocabulary and the pre-trained
    encoder/decoder, then either captions ``filename`` directly or starts a
    Flask REST endpoint at ``/v1/caption``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="(optional) path to photograph, for which a caption will be generated", nargs = "?")
    parser.add_argument("--host", help="(optional) host to start a webserver on. Default: 0.0.0.0", nargs = "?", default = "0.0.0.0")
    parser.add_argument("--port", help="(optional) port to start a webserver on. http://hostname:port/query", nargs = "?", type = int, default = 1985)
    parser.add_argument("--verbose", "-v", help="print verbose query information", action="store_true")
    global _args
    _args = parser.parse_args()
    if not _args.filename and not _args.port:
        parser.print_help()
        sys.exit(-1)

    global _device
    _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("PyTorch device = ", _device)

    # Load the vocabulary dictionary
    # BUG FIX: a stray trailing comma made vocab_threshold the tuple
    # (None,) instead of None, which was then passed to Vocabulary.
    vocab_threshold = None
    vocab_file = "./vocab.pkl"
    start_word = "<start>"
    end_word = "<end>"
    unk_word = "<unk>"
    load_existing_vocab = True
    #annotations_file = "/opt/cocoapi/annotations/captions_train2014.json"
    annotations_file = None
    print("Loading vocabulary...")
    global _vocab
    _vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word,
                        unk_word, annotations_file, load_existing_vocab)
    vocab_size = len(_vocab)
    print("Vocabulary contains %d words" % vocab_size)

    # Load pre-trained models:
    #   encoder (Resnet + embedding layers)
    #   decoder (LSTM)
    global _encoder
    global _decoder
    encoder_path = os.path.join("./models/", _encoder_file)
    decoder_path = os.path.join("./models/", _decoder_file)

    print("Loading ", encoder_path)
    _encoder = EncoderCNN(_embed_size)
    _encoder.load_state_dict(torch.load(encoder_path))
    _encoder.eval()  # inference only: BN/dropout in eval mode
    _encoder.to(_device)

    print("Loading ", decoder_path)
    _decoder = DecoderRNN(_embed_size, _hidden_size, vocab_size, _num_layers)
    _decoder.load_state_dict(torch.load(decoder_path))
    _decoder.eval()
    _decoder.to(_device)

    # Caption the photo, or start a web server if no photo specified
    if _args.filename:
        _get_prediction_from_file(_args.filename)
    else:
        global _app
        global _api
        _app = Flask(__name__)
        _api = Api(_app)
        _api.add_resource(ImageCaptionResource,
                          "/v1/caption",
                          "/v1/caption/")
        _app.run(host = _args.host, port = _args.port)
def extract(args):
    """Extract and visualize top unit activations from the encoder CNN.

    Runs the (frozen) encoder over the caption dataset, records the retained
    'final_layer' activations, and for each image writes the index of the
    strongest unit to results/samples.txt plus a masked RGBA visualization to
    results/activations/<i>.png.  Stops after ``max_batch`` batches.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Resize(SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build model (decoder unused in this extraction pass)
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    # Retain the final ReLU feature map so it can be read back after forward.
    dissection.retain_layers(encoder, [
        ('resnet.7.2.relu', 'final_layer'),
    ])
    encoder = encoder.to(device)

    # Load the trained model parameters.
    # NOTE: the original code repeated load/eval/to(device) twice verbatim;
    # the duplicate block was removed — behavior is unchanged.
    encoder.load_state_dict(torch.load(args.encoder_path))

    # Load data
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    # Run the model
    with torch.no_grad():
        total_step = len(data_loader)
        os.makedirs(os.path.join(PARENT_DIR, 'results', 'activations'),
                    exist_ok=True)
        path = os.path.join(PARENT_DIR, 'results', 'samples.txt')
        with open(path, 'w') as results_file:
            start = time.time()
            for batch, (images, captions, lengths) in enumerate(data_loader):
                # Set mini-batch dataset
                images = images.to(device)

                # Forward pass; retained activations are captured as a
                # side effect of dissection.retain_layers above.
                features = encoder(images)

                activations = encoder.retained['final_layer']
                # Undo the input normalization so images are displayable.
                images = dissection.ReverseNormalize(
                    (0.485, 0.456, 0.406), (0.229, 0.224, 0.225))(images)
                images = images.cpu().numpy().transpose([0, 2, 3, 1])
                activations = activations.cpu().numpy()
                # Strongest unit per image = argmax of per-unit peak response.
                scores = np.max(activations, axis=(-1, -2))
                samples = np.argmax(scores, axis=-1)
                gathered = activations[np.arange(len(samples)),
                                       samples].transpose([1, 2, 0])
                mask = cv2.resize(gathered, SIZE).transpose([2, 0, 1])
                # Keep the top 20% of activation values as the mask.
                k = int(0.8 * mask.size)
                threshold = np.partition(mask, k, axis=None)[k]
                mask = mask >= threshold
                mask = np.expand_dims(mask, axis=-1)
                # RGBA output: alpha channel is 0.5 off-mask, 1.0 on-mask.
                outimg = np.concatenate((images, (1 + mask) / 2.), axis=-1)
                # outimg = outimg * mask
                activations = outimg
                for i, sample in enumerate(samples):
                    i += args.batch_size * batch
                    results_file.write('{} {}\n'.format(i, sample))
                for i, activation in enumerate(activations):
                    i += args.batch_size * batch
                    path = os.path.join(PARENT_DIR, 'results', 'activations',
                                        '{}.png'.format(i))
                    outactivation = skimage.img_as_ubyte(activation)
                    imageio.imwrite(path, outactivation)

                clock = time.time()
                delay = clock - start
                start = clock
                max_batch = 100
                print('Step {}/{}: Time = {:.2f}'.format(
                    batch, max_batch, delay))
                if batch == max_batch:
                    break
def main():
    """Caption every .jpg in the image directory and write results to a file.

    Uses the global ``args`` dict for all paths and model hyper-parameters;
    writes "<image_id>\\t<caption>" lines to caption_ncrt_class5.txt.
    """
    #write predicted caption
    if not os.path.exists(args['generate_caption_path']):
        os.makedirs(args['generate_caption_path'])
    caption_string = os.path.join(args['generate_caption_path'],
                                  "caption_ncrt_class5.txt")
    #mode = "a" if os.path.exists(caption_string) else "w"

    # Image preprocessing (dataset-specific normalization constants)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.9638, 0.9638, 0.9638),
                             (0.1861, 0.1861, 0.1861))])

    # Load vocabulary wrapper
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args['embed_size'])
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args['embed_size'], args['hidden_size'],
                         len(vocab), args['num_layers'], max_seq_length=50)
    decoder.eval()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args['encoder_path']))
    decoder.load_state_dict(torch.load(args['decoder_path']))

    # If use gpu
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        encoder.cuda()
        decoder.cuda()

    # Prepare Image
    image_dir = args['image_path']
    images = os.listdir(image_dir)
    i = 1
    with open(caption_string, "w+") as fp:
        for image_id in images:
            if not image_id.endswith('.jpg'):
                continue
            image = os.path.join(image_dir, image_id)
            image = load_image(image, transform)
            # BUG FIX: .cuda() was called unconditionally, crashing on
            # CPU-only machines even though the models were guarded above.
            image_tensor = image.cuda() if use_cuda else image

            # Generate caption from image; skip images the model chokes on.
            try:
                feature, cnn_features = encoder(image_tensor)
                sampled_ids = decoder.sample(feature, cnn_features)
                sampled_ids = sampled_ids.cpu().data.numpy()
            except Exception:
                # best-effort: a failed image should not abort the batch
                continue

            # Decode word_ids to words, stopping at the <end> token.
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)
            print ('i->', i, image_id + '\t' + sentence)
            fp.write(image_id)
            fp.write('\t')
            fp.write(sentence)
            # No trailing newline after the final (398th) record.
            if i < 398:
                fp.write("\n")
            i = i + 1
def main(args):
    """Beam-search hashtag/object prediction for a single image.

    Encodes the image, then runs a width-K beam search over the object
    vocabulary with the RNN encoder + scoring model, printing the top-K
    decoded object sequences (name, supercategory).
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load the object-id mapping.
    # BUG FIX: this block was commented out, leaving `num_objects` and
    # `inverse_object_id_mapping` undefined (NameError at runtime).
    with open(args.inverse_object_id_mapping, 'rb') as f:
        inverse_object_id_mapping = pickle.load(f)
    num_objects = len(inverse_object_id_mapping.keys())

    # Build models
    encoderCNN = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    encoderRNN = EncoderRNN(num_objects, args.embed_size, args.hidden_size)
    model = Model(num_objects, args.embed_size)
    encoderCNN = encoderCNN.to(device)
    encoderRNN = encoderRNN.to(device)
    model = model.to(device)
    encoderCNN.eval()
    encoderRNN.eval()
    model.eval()

    # Load the trained model parameters
    encoderCNN.load_state_dict(torch.load(args.encoderCNN_path))
    encoderRNN.load_state_dict(torch.load(args.encoderRNN_path))
    model.load_state_dict(torch.load(args.model_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate an caption from the image
    image_features = encoderCNN(image_tensor)
    input = torch.LongTensor([[[1]]]).to(device)  # id 1 = start token
    h0 = torch.zeros((1, 1, args.hidden_size)).to(device)
    c0 = torch.zeros((1, 1, args.hidden_size)).to(device)
    max_seqlen = 10
    result = []
    K = 3
    # Each candidate: (ids-so-far, cumulative score, hidden state, cell state)
    all_candidates = [([1], 1.0, h0, c0) for i in range(K)]
    for i in range(max_seqlen):
        Q = []
        for _k in range(K):
            if i == 0 and _k == 1:  # first word: all beams identical, expand one
                break
            hashtag_features, (h0, c0), Ul = encoderRNN(
                input[_k], all_candidates[_k][2], all_candidates[_k][3])
            outputs = model(image_features, hashtag_features, Ul)
            prob, topk = torch.topk(outputs, 20, dim=1)
            tup = list(zip(topk[0].cpu().tolist(), prob[0].cpu().tolist()))
            # Drop ids already used on this beam.
            topk = [a for a in tup if a[0] not in all_candidates[_k][0]]
            # BUG FIX: the original called topk.remove(1)/remove(0) on a list
            # of (id, prob) tuples — that always raised ValueError and was
            # swallowed by a bare except.  Filter the special ids explicitly.
            topk = [a for a in topk if a[0] not in (0, 1)]
            for _k_ in range(K):
                Q.append((all_candidates[_k][0] + [topk[_k_][0]],
                          abs(all_candidates[_k][1] * topk[_k_][1]),
                          h0, c0))
        # Keep the K best-scoring extended beams.
        all_candidates = sorted(Q, key=lambda x: x[1], reverse=True)[:K]
        input = []
        for _k in range(K):
            input.append([[all_candidates[_k][0][-1]]])
        input = torch.LongTensor(input).to(device)
        #result.append(top1.cpu().numpy()[0][0])

    result = sorted(all_candidates, key=lambda x: x[1], reverse=True)
    result = [i[0] for i in result]
    print(result)
    for i in range(K):
        tmp = [inverse_object_id_mapping[j] for j in result[i]]
        final = zip([j['name'] for j in tmp],
                    [j['supercategory'] for j in tmp])
        for j in final:
            print(j)
        print("-" * 50)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def main(args):
    """Evaluate a trained captioner under one of several measurement modes.

    Modes (``args.msm``): "ps" ground-truth prediction score, "ce" cross
    entropy, "co" COCO metrics.  Results are printed and/or pickled to
    ``args.filepath`` depending on ``args.no_avg``.
    """
    # Load vocabulary wrapper
    with open('./data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Build models (sizes match the trained checkpoints)
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    if args.test_prop0:
        decoder.test_h_from_c()
        return
    if args.test_c_step:
        data_points = test(encoder, decoder, vocab, args.num_samples,
                           args.num_hints)
        # BUG FIX: pickle writes bytes — open in binary mode, not 'w+'.
        with open(args.filepath, 'wb') as f:
            pickle.dump(data_points, f)
        print("Done sampling for c_step evaluation. Data saved to {}".format(
            args.filepath))
        return

    measurement_score = test(encoder, decoder, vocab, args.num_samples,
                             args.num_hints, args.debug, args.c_step,
                             args.no_avg)
    # BUG FIX: the Python-2 `print "..."` statements below were syntax
    # errors under Python 3; converted to print() calls (same output).
    if args.msm == "ps":
        if not args.no_avg:
            print("ground truth prediction score without update\n" + str(
                measurement_score[0]))
            print("ground truth prediction score with update\n" + str(
                measurement_score[1]))
            print("Difference\n" + str(measurement_score[1] -
                                       measurement_score[0]))
        else:
            with open(args.filepath, 'wb') as f:
                pickle.dump(measurement_score, f)
            print("Done. Data saved to {}".format(args.filepath))
    elif args.msm == "ce":
        if not args.no_avg:
            print("Cross Entropy Loss without update\n" + str(
                measurement_score[0]))
            print("Cross Entropy Loss with update\n" + str(
                measurement_score[1]))
            print("Difference\n" + str(measurement_score[1] -
                                       measurement_score[0]))
        else:
            with open(args.filepath, 'wb') as f:
                pickle.dump(measurement_score, f)
            print("Done. Data saved to {}".format(args.filepath))
    elif args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)
def main():
    """Train the teacher encoder/decoder on COCO 2017 and log validation loss.

    Trains for 25 epochs, checkpointing every ``config.save_step`` steps and
    appending "<avg val loss>,<last train loss>" lines to train1_log.txt.
    """
    # Configuration for hyper-parameters
    torch.cuda.set_device(0)
    config = Config()

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loaders for train2017 and val2017
    train_image_path = os.path.join(config.image_path, 'train2017')
    json_path = os.path.join(config.caption_path, 'captions_train2017.json')
    train_loader = get_data_loader(train_image_path, json_path, vocab,
                                   transform, config.batch_size,
                                   shuffle=False,
                                   num_workers=config.num_threads)
    val_image_path = os.path.join(config.image_path, 'val2017')
    json_path = os.path.join(config.caption_path, 'captions_val2017.json')
    val_loader = get_data_loader(val_image_path, json_path, vocab,
                                 transform, config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_threads)
    total_step = len(train_loader)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    # NOTE(review): encoder is kept in eval mode even though its fc layer is
    # optimized below — presumably to freeze BatchNorm statistics; confirm.
    encoder.eval()
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer: only the decoder and the encoder's final fc layer
    # are trained; the rest of the ResNet backbone is frozen.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=config.learning_rate)

    print('entering in to training loop')
    # Train the Models
    with open('train1_log.txt', 'w') as logfile:
        logfile.write('Validation Error,Training Error')
        # NOTE(review): epoch count is hard-coded to 25 here, ignoring
        # config.num_epochs (which is still used in the log message).
        for epoch in range(0, 25):
            for i, (images, captions, lengths,
                    img_ids) in enumerate(train_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]

                # Forward, Backward and Optimize
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % config.log_step == 0:
                    print(
                        'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                        % (epoch, config.num_epochs, i, total_step,
                           loss.data[0], np.exp(loss.data[0])))

                # Save the Model
                if (i + 1) % config.save_step == 0:
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(config.teacher_cnn_path,
                                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(config.teacher_lstm_path,
                                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            print('Just Completed an Epoch, Initite Validation Error Test')
            avgvalloss = 0
            for j, (images, captions, lengths,
                    img_ids) in enumerate(val_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                valloss = criterion(outputs, targets)
                if j == 0:
                    avgvalloss = valloss.data[0]
                # NOTE(review): running "average" halves the accumulator each
                # step (exponential smoothing), not a true mean — confirm
                # this is intended before relying on the logged value.
                avgvalloss = (avgvalloss + valloss.data[0]) / 2
                if ((j + 1) % 1000 == 0):
                    print('Average Validation Loss: %.4f' % (avgvalloss))
                    logfile.write(
                        str(avgvalloss) + ',' + str(loss.data[0]) + str('\n'))
                    # Validation stops after the first 1000 batches.
                    break