def get_prediction(image):
    """Generate a caption for a single image with the trained encoder/decoder.

    The image is pre-processed, the vocabulary and the checkpoint
    ``train-model-1-9900.pkl`` are loaded, and the decoder's greedy search
    output is converted back into words.

    Args:
        image: The picture to caption. The torchvision transforms below are
            applied to it directly, so it is expected to be a PIL image
            (presumably 3-channel RGB, given the 3-value normalization
            statistics -- TODO confirm with callers).

    Returns:
        str: The predicted words joined by single spaces, with the start
        token skipped and everything from the end token onwards dropped.
    """
    transform_test = transforms.Compose([
        transforms.Resize(256),        # smaller edge of image resized to 256
        transforms.CenterCrop(224),    # get 224x224 crop from the center
        transforms.ToTensor(),         # convert the PIL Image to a tensor
        transforms.Normalize(
            (0.485, 0.456, 0.406),     # normalize image for pre-trained model
            (0.229, 0.224, 0.225)),
    ])
    test_img = transform_test(image)

    sample_vocab = Vocabulary(threshold=5,
                              load_vocab=True,
                              anns_file="captions_train2014.json")
    vocab_size = len(sample_vocab)

    # Pick the device once so the models and the input always agree on it.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # map_location lets a checkpoint that was saved on a GPU be restored on a
    # CPU-only host as well.
    checkpoint = torch.load('train-model-1-9900.pkl', map_location=device)

    # Specify values for embed_size and hidden_size - we use the same values
    # as in the training step.
    embed_size = 256
    hidden_size = 512

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = ResNetEncoder(embed_size)
    encoder.eval()
    decoder = RNNDecoder(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the pre-trained weights.
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])

    # Move the models to the GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    # It is the transformed tensor -- not the input image object -- that has
    # to live on the same device as the models. A leading batch dimension of
    # size 1 is added first.
    test_img = test_img.unsqueeze(0).to(device)

    # Inference only: there is no reason to record the autograd graph.
    with torch.no_grad():
        features = encoder(test_img).unsqueeze(1)
        output = decoder.greedy_search(features)

    cleaned_pred = []
    for vocab_id in output:
        word = sample_vocab.id2word[vocab_id]
        if word == sample_vocab.end_seq:
            break
        if word != sample_vocab.start_seq:
            cleaned_pred.append(word)

    caption = " ".join(cleaned_pred)
    return caption
mode='val', batch_size=batch_size, threshold=vocab_threshold, load_vocab=load_vocab) # The size of the vocabulary vocab_size = len(train_loader.dataset.vocab) # Initialize the encoder and decoder encoder = ResNetEncoder(embedding_size) decoder = RNNDecoder(embedding_size, hidden_size, vocab_size) # Move models to GPU if CUDA is available if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # Define the loss function criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available( ) else nn.CrossEntropyLoss() # Specify the learnable parameters of the model params = list(decoder.parameters()) + list(encoder.embed.parameters()) + list( encoder.bn.parameters()) # Define the optimizer optimizer = torch.optim.Adam(params=params, lr=0.001) total_train_step = math.ceil( len(train_loader.dataset.caption_lengths) / train_loader.batch_sampler.batch_size)