def __init__(self):
    """Prepare preprocessing, vocabulary and pre-trained captioning models.

    Loads ``./models/best-model.pkl`` with its tensors mapped to the CPU,
    instantiates ``Vocabulary`` with ``./vocab.pkl`` and restores the
    encoder/decoder weights from the checkpoint's ``'encoder'`` and
    ``'decoder'`` entries. Both networks are switched to inference mode.
    """
    # Per-channel statistics used to normalize images for the pre-trained model.
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)
    preprocessing_steps = [
        transforms.Resize(256),       # smaller edge of the image becomes 256
        transforms.CenterCrop(224),   # 224x224 crop from the center
        transforms.ToTensor(),        # PIL image -> tensor
        transforms.Normalize(channel_mean, channel_std),
    ]
    self.transform = transforms.Compose(preprocessing_steps)

    # Checkpoint holding the best model.
    checkpoint_path = os.path.join('./models', 'best-model.pkl')
    self.checkpoint = torch.load(checkpoint_path, 'cpu')

    # Same sizes as the ones used during training.
    self.embed_size = 512
    self.hidden_size = 512

    # Vocabulary and its size.
    self.vocab = Vocabulary(None, './vocab.pkl', "<start>", "<end>",
                            "<unk>", "<pad>", "", "", True)
    self.vocab_size = len(self.vocab)

    # Encoder and decoder, each put into inference mode right after creation.
    self.encoder = EncoderCNN(self.embed_size)
    self.encoder.eval()
    self.decoder = DecoderRNN(self.embed_size, self.hidden_size,
                              self.vocab_size)
    self.decoder.eval()

    # Restore the pre-trained weights.
    encoder_weights = self.checkpoint['encoder']
    self.encoder.load_state_dict(encoder_weights)
    decoder_weights = self.checkpoint['decoder']
    self.decoder.load_state_dict(decoder_weights)
def __init__(self, args, vocab_len):
    """Build the CPU encoder/decoder pair and restore their saved weights.

    Args:
        args: object providing ``embed_size``, ``hidden_size`` and
            ``num_layers``.
        vocab_len: vocabulary size passed to ``DecoderRNN``.

    Weights are read from ``encoder.ckpt`` and ``decoder.ckpt`` in the
    current working directory, with tensors mapped onto the CPU.
    """
    super(BFM, self).__init__()
    cpu = torch.device('cpu')

    encoder = EncoderCNN(args.embed_size).eval().cpu()
    self.encoder = encoder
    encoder.load_state_dict(torch.load('encoder.ckpt', map_location=cpu))

    decoder = DecoderRNN(args.embed_size, args.hidden_size, vocab_len,
                         args.num_layers).eval().cpu()
    self.decoder = decoder
    # Calling the decoder runs its sampling routine instead of its forward.
    decoder.forward = decoder.sample
    decoder.load_state_dict(torch.load('decoder.ckpt', map_location=cpu))
def test(args):
    """Caption the image at ``args.img_path``, print it and display the image.

    Args:
        args: object providing ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path`` and ``img_path``.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])

    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, len(vocab), args.hidden_size,
                         args.num_layers)

    # Load the trained model parameters (tensors mapped onto the CPU).
    encoder_state = torch.load(args.encoder_path, map_location='cpu')
    encoder.load_state_dict(encoder_state)
    decoder_state = torch.load(args.decoder_path, map_location='cpu')
    decoder.load_state_dict(decoder_state)

    features = encoder(load_img(args.img_path, to_tensor))
    token_ids = decoder.sample(features)[0].cpu().numpy()

    # Translate ids to words, keeping everything up to and including '<end>'.
    words = []
    for token_id in token_ids:
        current = vocab.idx2word[token_id]
        words.append(current)
        if current == '<end>':
            break
    print(' '.join(words))

    picture = Image.open(args.img_path)
    plt.imshow(np.asarray(picture))
    plt.show()
def __init__(self):
    """Load the vocabulary and the trained encoder/decoder onto the CPU.

    Reads ``data/vocab.pkl`` and the two checkpoints under ``models/``.
    The encoder is put in eval mode; the decoder's mode is left unchanged.
    """
    print("Defining I.A")
    # Device configuration
    self.device = torch.device('cpu')

    # Model hyper-parameters and file locations.
    embed_size = 256
    hidden_size = 512
    num_layers = 1
    encoder_path = 'models/encoder-5-3000.pkl'
    decoder_path = 'models/decoder-5-3000.pkl'
    vocab_path = 'data/vocab.pkl'

    # Image preprocessing
    self.transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # NOTE: pickle executes code on load; only use trusted vocab files.
    with open(vocab_path, 'rb') as f:
        self.vocab = pickle.load(f)

    print("Building Model")
    # eval mode (batchnorm uses moving mean/variance)
    self.encoder = EncoderCNN(embed_size).eval()
    self.decoder = DecoderRNN(embed_size, hidden_size, len(self.vocab),
                              num_layers)
    self.encoder = self.encoder.to(self.device)
    self.decoder = self.decoder.to(self.device)

    print("loading checkpoint")
    # map_location makes checkpoints that were saved from a GPU loadable on
    # the CPU device selected above; without it torch.load tries to restore
    # the tensors onto the device they were saved from.
    self.encoder.load_state_dict(
        torch.load(encoder_path, map_location=self.device))
    self.decoder.load_state_dict(
        torch.load(decoder_path, map_location=self.device))
def inference_coco(encoder_file: str, decoder_file: str, embed_size: int,
                   hidden_size: int, from_cpu: bool) -> None:
    """Display an image from the COCO test set and print its caption.

    Args:
        encoder_file: Name of the encoder checkpoint inside ``./models``.
        decoder_file: Name of the decoder checkpoint inside ``./models``.
        embed_size: Word embedding size for the encoder.
        hidden_size: Size of the LSTM hidden layer.
        from_cpu: When true the checkpoints are mapped onto the CPU while
            loading; otherwise they are mapped onto the inference device.
    """
    # Define transform
    transform_test = transforms.Compose([
        transforms.Resize(256),      # smaller edge of image resized to 256
        transforms.RandomCrop(224),  # 224x224 crop from a random location
        transforms.ToTensor(),       # convert the PIL Image to a tensor
        transforms.Normalize(        # normalize image for pre-trained model
            (0.485, 0.456, 0.406),
            (0.229, 0.224, 0.225)),
    ])

    # Device to use for inference.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the data loader.
    data_loader = get_loader(transform=transform_test, mode='test')

    # Obtain a sample image (not used below; kept so the loader is consumed
    # exactly as before).
    _, image = next(iter(data_loader))

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights. Always passing map_location lets a
    # GPU-saved checkpoint load on a CPU-only machine even when
    # ``from_cpu`` is false.
    map_location = 'cpu' if from_cpu else device
    encoder.load_state_dict(
        torch.load(os.path.join('./models', encoder_file),
                   map_location=map_location))
    decoder.load_state_dict(
        torch.load(os.path.join('./models', decoder_file),
                   map_location=map_location))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    get_prediction(encoder, decoder, data_loader, device)
class Neuraltalk2:
    """Image captioner wrapping a trained EncoderCNN / DecoderRNN pair on CPU."""

    def __init__(self):
        """Load the vocabulary and the trained encoder/decoder onto the CPU.

        Reads ``data/vocab.pkl`` and the two checkpoints under ``models/``.
        The encoder is put in eval mode; the decoder's mode is left unchanged.
        """
        print("Defining I.A")
        # Device configuration
        self.device = torch.device('cpu')

        # Model hyper-parameters and file locations.
        embed_size = 256
        hidden_size = 512
        num_layers = 1
        encoder_path = 'models/encoder-5-3000.pkl'
        decoder_path = 'models/decoder-5-3000.pkl'
        vocab_path = 'data/vocab.pkl'

        # Image preprocessing
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225)),
        ])

        # NOTE: pickle executes code on load; only use trusted vocab files.
        with open(vocab_path, 'rb') as f:
            self.vocab = pickle.load(f)

        print("Building Model")
        # eval mode (batchnorm uses moving mean/variance)
        self.encoder = EncoderCNN(embed_size).eval()
        self.decoder = DecoderRNN(embed_size, hidden_size, len(self.vocab),
                                  num_layers)
        self.encoder = self.encoder.to(self.device)
        self.decoder = self.decoder.to(self.device)

        print("loading checkpoint")
        # map_location makes checkpoints that were saved from a GPU loadable
        # on the CPU device selected above.
        self.encoder.load_state_dict(
            torch.load(encoder_path, map_location=self.device))
        self.decoder.load_state_dict(
            torch.load(decoder_path, map_location=self.device))

    def eval_image(self, image_path):
        """Return the generated caption for the image at ``image_path``.

        The ``<start>`` token is dropped and the caption stops before the
        first ``<end>`` token.
        """
        # Prepare an image
        image = load_image(image_path, self.transform)
        image_tensor = image.to(self.device)

        # Generate a caption from the image
        feature = self.encoder(image_tensor)
        sampled_ids = self.decoder.sample(feature)
        # Only the first row of the sampled ids is decoded.
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = self.vocab.idx2word[word_id]
            if word == '<end>':
                break
            if word == '<start>':
                continue
            sampled_caption.append(word)
        return ' '.join(sampled_caption)
def epoch_training(train_iter, val_iter, num_epoch=100, learning_rate=1e-4,
                   hidden_size=100, early_stop=False, patience=2,
                   epsilon=1e-4):
    """Train an encoder/decoder pair and return the best-BLEU snapshot.

    Args:
        train_iter: iterator handed to ``train``.
        val_iter: iterator handed to ``evaluate``.
        num_epoch: maximum number of epochs.
        learning_rate: learning rate of both Adam optimizers.
        hidden_size: hidden size of encoder and decoder.
        early_stop: accepted for compatibility; not read by this function.
        patience: training stops once this many consecutive epochs fail to
            reach the best validation BLEU seen so far.
        epsilon: accepted for compatibility; not read by this function.

    Returns:
        ``(res_loss, res_encoder, res_decoder, base_bleu)``: validation loss,
        encoder and decoder snapshots and BLEU of the best epoch. The models
        are ``None`` when no epoch ran.
    """
    import copy  # local import: only needed to snapshot the best models

    # define model
    encoder = EncoderRNN(input_size=len(EN.vocab), hidden_size=hidden_size)
    decoder = DecoderRNN(hidden_size=hidden_size, output_size=len(DE.vocab))

    # define loss criterion
    criterion = nn.NLLLoss(ignore_index=PAD_token)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    res_loss = 13
    res_encoder = None
    res_decoder = None
    res_epoch = 0
    base_bleu = 0
    not_updated = 0

    for epoch in range(num_epoch):
        tl = train(train_iter, encoder, decoder, encoder_optimizer,
                   decoder_optimizer, criterion)
        loss, val_bleu = evaluate(val_iter, encoder, decoder, criterion)
        logging.warning('******Epoch: ' + str(epoch) + ' Training Loss: ' + str(tl) + ' Validation Loss: ' + str(loss) + ' Validation Bleu: ' + str(val_bleu) + '*********')

        # Keep the model with the highest validation BLEU (ties favour the
        # later epoch).
        if base_bleu <= val_bleu:
            base_bleu = val_bleu
            res_loss = loss
            # Snapshot the weights: the live models keep being updated by
            # later epochs, so storing a plain reference would always hand
            # back the last epoch's weights rather than the best ones.
            res_encoder = copy.deepcopy(encoder)
            res_decoder = copy.deepcopy(decoder)
            res_epoch = epoch
            not_updated = 0
            logging.warning('Updated validation loss as ' + str(res_loss) + 'With validation Bleu as ' + str(base_bleu) + ' at epoch ' + str(res_epoch))
        else:
            not_updated += 1

        if not_updated == patience:
            break

    summary = ('Stop at Epoch: ' + str(res_epoch) + ", With Validation Loss: " + str(res_loss) + ", Validation Bleu: " + str(base_bleu))
    print(summary)
    logging.warning(summary)
    return res_loss, res_encoder, res_decoder, base_bleu
class BFM(nn.Module):
    """Encoder + sampling decoder bundled into a single CPU module."""

    def __init__(self, args, vocab_len):
        """Build both sub-models in eval mode on the CPU and load weights.

        Args:
            args: object providing ``embed_size``, ``hidden_size`` and
                ``num_layers``.
            vocab_len: vocabulary size passed to ``DecoderRNN``.

        Weights are read from ``encoder.ckpt`` and ``decoder.ckpt`` in the
        current working directory, with tensors mapped onto the CPU.
        """
        super(BFM, self).__init__()
        cpu = torch.device('cpu')

        encoder = EncoderCNN(args.embed_size).eval().cpu()
        self.encoder = encoder
        encoder.load_state_dict(torch.load('encoder.ckpt', map_location=cpu))

        decoder = DecoderRNN(args.embed_size, args.hidden_size, vocab_len,
                             args.num_layers).eval().cpu()
        self.decoder = decoder
        # Calling the decoder runs its sampling routine instead of its forward.
        decoder.forward = decoder.sample
        decoder.load_state_dict(torch.load('decoder.ckpt', map_location=cpu))

    def forward(self, image):
        """Encode ``image`` and return the ids sampled by the decoder."""
        return self.decoder(self.encoder(image))
def main():
    """Train the caption encoder/decoder on pre-extracted ``fc7`` features.

    Reads the module-level ``vocab_path``, ``image_data_file`` and
    ``image_feature_file``, trains for 10 epochs, writes ``decoder.pkl`` /
    ``encoder.pkl`` and finally dumps the sampled losses to ``losses.txt``.
    """
    # Load vocabulary wrapper. Pickle files are opened in binary mode so the
    # code behaves the same on Python 2 and Python 3.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(4096, embed_dim)
    decoder = DecoderRNN(embed_dim, hidden_size, len(vocab), num_layers_rnn)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer: only the decoder and the encoder's linear layer
    # are optimized.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    # Load data
    with open(image_data_file, 'rb') as f:
        image_data = pickle.load(f)
    image_features = si.loadmat(image_feature_file)
    img_features = np.concatenate(image_features['fc7'][0])

    print('here')
    iteration = 0
    save_loss = []
    for i in range(10):  # epoch
        use_caption = i % 5
        print('Epoch {}'.format(i))
        for x, y in make_mini_batch(img_features, image_data,
                                    use_caption=use_caption):
            word_padding, lengths = make_word_padding(y, vocab)

            x = torch.from_numpy(x)
            word_index = torch.from_numpy(word_padding)
            # Inputs follow the models: they go to the GPU only when the
            # models did, instead of unconditionally calling .cuda().
            if use_cuda:
                x = x.cuda()
                word_index = word_index.cuda()
            x = Variable(x)
            word_index = Variable(word_index)

            encoder.zero_grad()
            decoder.zero_grad()
            features = encoder(x)
            targets = pack_padded_sequence(word_index, lengths,
                                           batch_first=True)[0]
            outputs = decoder(features, word_index, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if iteration % 100 == 0:
                # ``loss.data[0]`` only works on old PyTorch releases; newer
                # ones expose the scalar through ``item()``.
                if hasattr(loss, 'item'):
                    loss_value = loss.item()
                else:
                    loss_value = loss.data[0]
                print('loss {}'.format(loss_value))
                save_loss.append(loss_value)
            iteration += 1

        # NOTE(review): the nesting of these saves was lost in the source
        # formatting; saving once per epoch leaves the same final files as
        # saving once after training.
        torch.save(decoder.state_dict(), 'decoder.pkl')
        torch.save(encoder.state_dict(), 'encoder.pkl')

    # The original wrote the undefined name ``losses`` here.
    with open('losses.txt', 'w') as f:
        f.write('{}\n'.format(save_loss))
def main():
    """Train an abstract-to-title encoder/decoder on the academic papers set.

    Loads the pickled word embedding from the module-level
    ``embedding_path``, converts every (abstract, title) pair with
    ``variablesFromPair`` and runs ``trainIters`` for 20000 iterations.
    """
    ### load word embedding
    # ``with`` closes the file even when unpickling raises.
    with open(embedding_path, "rb") as pickle_file:
        word_embedding = pickle.load(pickle_file)
    word_index = word_embedding[0]
    embedding_map = word_embedding[1]
    output_size = len(word_index)

    ### initialize model
    hidden_size = 100
    encoder = EncoderRNN(hidden_size)
    decoder = DecoderRNN(hidden_size, output_size)

    ### load train data
    parser = AcademicParser("../train_data/Academic_papers/docs.json")
    abstracts = parser.get_paperAbstract()
    titles = parser.get_title()
    assert (len(abstracts) == len(titles))

    ### prepare train data
    train_set = []
    for abstract, title in zip(abstracts, titles):
        new_pair = variablesFromPair((abstract, title), word_index,
                                     embedding_map)
        # Pairs whose second element is empty are skipped.
        if len(new_pair[1]) > 0:
            train_set.append(new_pair)

    trainIters(encoder, decoder, 20000, train_set)
def main():
    """Streamlit page: upload an image and show its generated caption."""
    allowed_types = ["png", "jpg", "jpeg"]

    st.title('Image Captioning App')
    st.markdown(STYLE, unsafe_allow_html=True)
    file = st.file_uploader("Upload file", type=allowed_types)
    show_file = st.empty()
    if not file:
        show_file.info("Please upload a file of type: "
                       + ", ".join(allowed_types))
        return
    show_file.image(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder_file = 'encoder-5-batch-128-hidden-256-epochs-5.pkl'
    decoder_file = 'decoder-5-batch-128-hidden-256-epochs-5.pkl'
    embed_size = 300
    hidden_size = 256
    vocab_size, word2idx, idx2word = get_vocab()

    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # map_location lets GPU-saved checkpoints load when the app runs on a
    # CPU-only host (``device`` falls back to "cpu" above).
    encoder.load_state_dict(
        torch.load(os.path.join('./models', encoder_file),
                   map_location=device))
    decoder.load_state_dict(
        torch.load(os.path.join('./models', decoder_file),
                   map_location=device))
    encoder.to(device)
    decoder.to(device)

    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    PIL_image = Image.open(file).convert('RGB')
    image = transform_test(PIL_image)
    # Add the batch dimension expected by the encoder.
    image = image.to(device).unsqueeze(0)
    features = encoder(image).unsqueeze(1)
    output = decoder.sample(features)
    sentence = clean_sentence(output, idx2word)

    st.info("Generated caption --> " + sentence)
    file.close()
def main(args):
    """Caption ``args.image`` with the attention encoder/decoder and print it.

    Args:
        args: object providing ``vocab_path``, ``hidden_size``,
            ``num_layers``, ``feature_size``, ``encoder_path``,
            ``decoder_path`` and ``image``.
    """
    # Image preprocessing
    normalize = transforms.Normalize((0.033, 0.032, 0.033),
                                     (0.027, 0.027, 0.027))
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    len_vocab = vocab.idx

    # NOTE(review): this ResNet / DecoderRNN pair is built but never loaded
    # or called below; only the attention models produce the caption.
    encoder = ResNet(ResidualBlock, [3, 3, 3], len_vocab)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(len_vocab, args.hidden_size, len(vocab),
                         args.num_layers)

    attn_encoder = AttnEncoder(ResidualBlock, [3, 3, 3])
    attn_encoder.eval()
    attn_decoder = SANDecoder(args.feature_size, args.hidden_size,
                              len(vocab), args.num_layers)

    # Load the trained model parameters
    attn_encoder.load_state_dict(torch.load(args.encoder_path))
    attn_decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image_tensor = to_var(load_image(args.image, transform), volatile=True)

    # The attention models go to GPU index 1 when CUDA is available.
    if torch.cuda.is_available():
        attn_encoder.cuda(1)
        attn_decoder.cuda(1)

    # Generate caption from image
    feature = attn_encoder(image_tensor)
    sampled_ids = attn_decoder.sample(feature)
    ids_arr = [int(element.cpu().data.numpy()) for element in sampled_ids]

    # Decode word ids to words, stopping after '<end>'.
    tokens = []
    for token_id in ids_arr:
        token = vocab.idx2word[token_id]
        tokens.append(token)
        if token == '<end>':
            break

    # Print out the generated caption.
    print(' '.join(tokens))
def main(args):
    """Trace the encoder and decoder with TorchScript and save both traces.

    Writes ``./encoder.pt`` and ``./decoder.pt``. Weights come from
    ``encoder.ckpt`` / ``decoder.ckpt`` in the current working directory.

    Args:
        args: object providing ``vocab_path``, ``embed_size``,
            ``hidden_size`` and ``num_layers``.
    """
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Both models live on the CPU, so the checkpoints are mapped there too;
    # otherwise a checkpoint saved from a GPU fails on a CPU-only machine.
    example = torch.rand(1, 3, 224, 224)
    encoder = EncoderCNN(args.embed_size).eval().cpu()
    encoder.load_state_dict(torch.load('encoder.ckpt', map_location='cpu'))
    traced_script_module = torch.jit.trace(encoder, example)
    traced_script_module.save("./encoder.pt")

    # NOTE(review): the example width 256 presumably has to match
    # args.embed_size -- confirm against the decoder's forward.
    example = torch.rand(1, 256)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).eval().cpu()
    decoder.load_state_dict(torch.load('decoder.ckpt', map_location='cpu'))
    traced_script_module = torch.jit.trace(decoder, example)
    traced_script_module.save("./decoder.pt")
def main(image):
    """Generate, print and return the caption for the image file ``image``.

    All hyper-parameters, paths and the test transform are taken from a
    fresh ``Config()`` instance.
    """
    # Configuration for hyper-parameters
    config = Config()

    # Image Preprocessing
    transform = config.test_transform

    # Load vocabulary
    vocab_file_path = os.path.join(config.vocab_path, 'vocab.pkl')
    with open(vocab_file_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    # Load the trained model parameters
    encoder_weights = os.path.join(config.teacher_cnn_path,
                                   config.trained_encoder)
    encoder.load_state_dict(torch.load(encoder_weights))
    decoder_weights = os.path.join(config.teacher_lstm_path,
                                   config.trained_decoder)
    decoder.load_state_dict(torch.load(decoder_weights))

    # Prepare Image
    pil_image = Image.open(image)
    image_tensor = Variable(transform(pil_image).unsqueeze(0))

    # Set initial states
    hidden_state = Variable(
        torch.zeros(config.num_layers, 1, config.hidden_size))
    cell_state = Variable(
        torch.zeros(config.num_layers, 1, config.hidden_size))
    state = (hidden_state, cell_state)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, state).cpu().data.numpy()

    # Decode word ids to words
    tokens = []
    for token_id in sampled_ids:
        token = vocab.idx2word[token_id]
        tokens.append(token)
        # Id 96 is treated as a sentence terminator: an explicit '<end>' is
        # appended after it and decoding stops.
        if token_id == 96:
            tokens.append('<end>')
            break
        if token == '<end>':
            break
    sentence = ' '.join(tokens)

    # Print out the generated caption.
    print(sentence)
    return sentence
def main(args):
    """Caption every file in the validation folder, one output line per image.

    Each line has the form ``beam_size_1<TAB>file name<TAB>caption``.
    Images whose captioning raises are skipped and counted; the count is
    reported on stderr at the end.

    Args:
        args: object providing ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path`` and
            ``decoder_path``.
    """
    import sys  # local import: only needed for the failure summary

    # Val images folder
    filepath = '/scratch/ys2542/pytorch-tutorial/tutorials/03-advanced/image_captioning/data/resizedval2014'
    onlyfiles = [fl for fl in listdir(filepath) if isfile(join(filepath, fl))]

    # image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # load vocabulary wrapper pickle file
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder

    # load the trained CNN and RNN parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Move the models once instead of on every loop iteration.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Counts images that could not be captioned. It previously lived inside
    # the loop and was reset for every image, so it never accumulated.
    badsize = 0

    for i in onlyfiles:
        args_image = filepath + '/' + i

        # transform image and wrap it to tensor
        image = load_image(args_image, transform)
        image_tensor = to_var(image, volatile=True)

        # generate caption from image
        try:
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids.cpu().data.numpy()

            # decode word ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            # The fixed [8:-8] slice drops the leading '<start> ' and the
            # last 8 characters (e.g. ' . <end>').
            print('beam_size_1' + '\t' + i + '\t' + sentence[8:-8])
        except Exception:
            # Best effort: skip this image, but no longer swallow
            # KeyboardInterrupt / SystemExit as the bare ``except`` did.
            badsize = badsize + 1

    if badsize:
        sys.stderr.write(
            'skipped {} image(s) that could not be captioned\n'.format(badsize))
def main(args):
    """Decode a caption from a precomputed feature vector and print it.

    The CNN encoder is bypassed: row 29 of the ``q_2<embed_size>.npy``
    matrix under ``/root/server/best_model/`` is fed to the decoder.

    Args:
        args: object providing ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers`` and ``decoder_path``.
    """
    vectore_dir = '/root/server/best_model/'

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Precomputed feature vectors that stand in for the encoder output.
    qvecs_pca = np.load(
        os.path.join(vectore_dir, "q_2{}.npy".format(args.embed_size)))

    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    decoder.load_state_dict(torch.load(args.decoder_path))

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        decoder.cuda()

    # Slice with num:num + 1 so the leading (row) dimension is kept.
    num = 29
    feature = torch.from_numpy(qvecs_pca[num:num + 1, :])
    # The feature follows the decoder: it moves to the GPU only when the
    # decoder did, instead of unconditionally calling .cuda().
    if use_cuda:
        feature = feature.cuda()

    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word ids to words, dropping '<start>' and stopping at '<end>'.
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word == '<start>':
            continue
        if word == '<end>':
            break
        sampled_caption.append(word)
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption.
    print(sentence)
class Annotator():
    """Produces captions for images with a pre-trained encoder/decoder."""

    def __init__(self):
        """Prepare preprocessing, vocabulary and pre-trained models.

        Loads ``./models/best-model.pkl`` with its tensors mapped to the
        CPU, instantiates ``Vocabulary`` with ``./vocab.pkl`` and restores
        the encoder/decoder weights from the checkpoint's ``'encoder'`` and
        ``'decoder'`` entries. No GPU transfer is performed here.
        """
        # Per-channel statistics used to normalize images for the
        # pre-trained model.
        channel_mean = (0.485, 0.456, 0.406)
        channel_std = (0.229, 0.224, 0.225)
        preprocessing_steps = [
            transforms.Resize(256),       # smaller edge becomes 256
            transforms.CenterCrop(224),   # 224x224 crop from the center
            transforms.ToTensor(),        # PIL image -> tensor
            transforms.Normalize(channel_mean, channel_std),
        ]
        self.transform = transforms.Compose(preprocessing_steps)

        # Checkpoint holding the best model.
        checkpoint_path = os.path.join('./models', 'best-model.pkl')
        self.checkpoint = torch.load(checkpoint_path, 'cpu')

        # Same sizes as the ones used during training.
        self.embed_size = 512
        self.hidden_size = 512

        # Vocabulary and its size.
        self.vocab = Vocabulary(None, './vocab.pkl', "<start>", "<end>",
                                "<unk>", "<pad>", "", "", True)
        self.vocab_size = len(self.vocab)

        # Encoder and decoder, each put into inference mode.
        self.encoder = EncoderCNN(self.embed_size)
        self.encoder.eval()
        self.decoder = DecoderRNN(self.embed_size, self.hidden_size,
                                  self.vocab_size)
        self.decoder.eval()

        # Restore the pre-trained weights.
        encoder_weights = self.checkpoint['encoder']
        self.encoder.load_state_dict(encoder_weights)
        decoder_weights = self.checkpoint['decoder']
        self.decoder.load_state_dict(decoder_weights)

    def annotate(self, image):
        """Return the caption for ``image``, printing intermediate results.

        ``image`` is passed through ``self.transform`` and given a leading
        batch dimension before encoding. Only the first beam-search result
        is turned into the returned sentence.
        """
        batch = self.transform(image).unsqueeze(0)
        embedded = self.encoder(batch).unsqueeze(1)

        # Pass the embedded image features through the decoder to get the
        # predicted captions.
        candidates = self.decoder.sample_beam_search(embedded)
        print('example output:', candidates)

        caption = clean_sentence(candidates[0], self.vocab)
        print('example sentence:', caption)
        return caption
def main(args):
    """Caption ``args.image`` using AlexNet fc7 features and print the result.

    Args:
        args: object providing ``crop_size``, ``vocab_path``,
            ``embed_size``, ``hidden_size``, ``num_layers``,
            ``encoder_path``, ``decoder_path`` and ``image``.
    """
    # Image preprocessing
    preprocessing_steps = [
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
    transform = transforms.Compose(preprocessing_steps)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # NOTE(review): alexnet2 is never switched to eval mode here -- confirm
    # whether AlexNet2 handles that itself.
    alexnet = models.alexnet(pretrained=True)
    alexnet2 = AlexNet2(alexnet)

    # Build Models
    encoder = EncoderCNN(4096, args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder_weights = torch.load(args.encoder_path)
    encoder.load_state_dict(encoder_weights)
    decoder_weights = torch.load(args.decoder_path)
    decoder.load_state_dict(decoder_weights)

    # Prepare Image
    picture = Image.open(args.image)
    image_tensor = Variable(transform(picture).unsqueeze(0))

    # Set initial states
    hidden_state = Variable(torch.zeros(args.num_layers, 1, args.hidden_size))
    cell_state = Variable(torch.zeros(args.num_layers, 1, args.hidden_size))
    state = (hidden_state, cell_state)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        alexnet2.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image: the forward pass is run for its side
    # effect of filling ``alexnet2.fc7_value``, which feeds the encoder.
    alexnet2(image_tensor)
    feature = encoder(alexnet2.fc7_value)
    sampled_ids = decoder.sample(feature, state).cpu().data.numpy()

    # Decode word ids to words, stopping after '<end>'.
    tokens = []
    for token_id in sampled_ids:
        token = vocab.idx2word[token_id]
        tokens.append(token)
        if token == '<end>':
            break

    # Print out the generated caption.
    print(' '.join(tokens))
def main(args):
    """Caption ``args.image``, print the caption and draw the image.

    Uses the module-level ``device`` for the models and the image tensor.

    Args:
        args: object providing ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path`` and ``image``.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters. map_location keeps GPU-saved
    # checkpoints loadable when ``device`` is the CPU.
    encoder.load_state_dict(
        torch.load(args.encoder_path, map_location=device))
    decoder.load_state_dict(
        torch.load(args.decoder_path, map_location=device))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    # (1, max_seq_length) -> (max_seq_length)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word ids to words, stopping after '<end>'.
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption and draw the image.
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def main(args):
    """Caption ``args.image``, translate the caption and read it aloud.

    The caption is translated with destination language ``'id'``, printed,
    saved as speech to ``result.mp3``, played back, and the image is shown.

    Args:
        args: object providing ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path`` and ``image``.
    """
    # Image preprocessing
    normalize = transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder_weights = torch.load(args.encoder_path)
    encoder.load_state_dict(encoder_weights)
    decoder_weights = torch.load(args.decoder_path)
    decoder.load_state_dict(decoder_weights)

    # Prepare Image
    image_tensor = to_var(load_image(args.image, transform), volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature).cpu().data.numpy()

    # Decode word ids to words, stopping after '<end>'.
    tokens = []
    for token_id in sampled_ids:
        token = vocab.idx2word[token_id]
        tokens.append(token)
        if token == '<end>':
            break
    sentence = ' '.join(tokens)

    # Strip the sentence markers and full stops before translating.
    sentence = sentence.replace('<start> ', '')
    sentence = sentence.replace(' <end>', '')
    sentence = sentence.replace('.', '')
    sentence = sentence.strip()

    translator = Translator()
    sentence_indo = translator.translate(sentence, dest='id').text
    print('This is an image of: ' + sentence_indo)

    # Speak the translated caption.
    tts = gTTS(sentence_indo, 'id')
    tts.save('result.mp3')
    playsound('result.mp3')

    picture = Image.open(args.image)
    plt.imshow(np.asarray(picture))
    plt.show()
def setUpClass(cls):
    """Build and train the model fixture stored on the test class.

    Sets ``pre_processing``, ``dataset``, ``word_embedding`` and ``model``
    on ``cls``; the model is trained on the processed dataset.
    """
    pre_processing = PreProcessing(sentences)
    cls.pre_processing = pre_processing

    dataset = ds.process(cls.pre_processing)
    cls.dataset = dataset

    cls.word_embedding = WordEmbedding(source=cls.dataset.pairs)

    encoder = EncoderRNN(cls.word_embedding, 300, 1)
    encoder = encoder.to(settings.device)
    decoder = DecoderRNN(300, cls.word_embedding, 0.0, 1)
    decoder = decoder.to(settings.device)

    cls.model = Model(encoder, decoder)
    cls.model.train(cls.dataset)
def main(args):
    """Caption ``args.image`` and print the caption.

    When captioning fails, the image path is printed instead.

    Args:
        args: object providing ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path`` and ``image``.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    data = []
    # Bound before the ``try`` so the handler can always report it.
    img_path = args.image
    try:
        # Prepare Image
        image = load_image(img_path, transform)
        image_tensor = to_var(image, volatile=True)

        # Generate caption from image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().data.numpy()

        # Decode word ids to words, dropping '<start>' and stopping at
        # '<end>'.
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)

        # Print out the generated caption.
        print(sentence)
        data.append({'key': img_path.split('/')[-1], 'sentence': sentence})
    except Exception:
        # Best effort: report the image that failed. ``Exception`` instead
        # of a bare ``except`` so Ctrl-C / SystemExit still propagate.
        print(img_path)
def main(args):
    """Caption every ``.jpg`` file in the directory ``args.image``.

    Prints one ``file name<TAB>caption`` line per image. Images for which
    the forward pass raises are skipped.

    Args:
        args: object providing ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path`` and ``image`` (a directory path).
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Prepare Image
    image_dir = args.image
    images = os.listdir(image_dir)
    for image_id in images:
        if not image_id.endswith('.jpg'):
            continue
        image = os.path.join(image_dir, image_id)
        image = load_image(image, transform)
        image_tensor = to_var(image, volatile=True)

        # Generate caption from image; the encoder returns the embedded
        # feature together with the CNN features the decoder also consumes.
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except Exception:
            # Best effort: skip images the model cannot process.
            # ``Exception`` instead of a bare ``except`` so Ctrl-C and
            # SystemExit still stop the loop.
            continue

        # Decode word ids to words, stopping after '<end>'.
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # Print out image name and generated caption.
        print(image_id + '\t' + sentence)
def main(args):
    """Run the layout encoder + decoder and dump the output as JSON.

    The result of ``save_output`` is written to ``bsl_output.txt``.

    Args:
        args: object providing ``vocab_path``, ``layout_embed_size``,
            ``embed_size``, ``hidden_size``, ``num_layers``,
            ``layout_encoder_path``, ``decoder_path`` and ``batch_size``.
    """
    # Image preprocessing
    normalize = transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Build Models: a layout encoder is used here instead of the CNN encoder.
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size,
                                   100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    layout_weights = torch.load(args.layout_encoder_path)
    layout_encoder.load_state_dict(layout_weights)
    decoder_weights = torch.load(args.decoder_path)
    decoder.load_state_dict(decoder_weights)

    # If use gpu
    if torch.cuda.is_available():
        layout_encoder.cuda()
        decoder.cuda()

    results = save_output(layout_encoder, decoder, args, vocab, transform,
                          args.batch_size)
    with open('bsl_output.txt', 'w') as outfile:
        json.dump(results, outfile)
def main(args):
    """Score generated captions with BLEU over the data loader.

    For every mini-batch the first image is captioned and compared with its
    first reference caption. Scores are stored in ``test_results.npy``.

    Args:
        args: object providing ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path``, ``image_dir``, ``caption_path``,
            ``batch_size``, ``num_workers``, ``bleu_weights`` and
            ``log_step``.
    """
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters. map_location keeps GPU-saved
    # checkpoints loadable when ``device`` is the CPU.
    encoder.load_state_dict(
        torch.load(args.encoder_path, map_location=device))
    decoder.load_state_dict(
        torch.load(args.decoder_path, map_location=device))

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    total_step = len(data_loader)

    # List to store the BLEU scores
    bleu_scores = []
    for i, (images, captions, lengths) in enumerate(data_loader):
        # Set mini-batch dataset
        images = images.to(device)

        # Generate a caption from the image (only the first sample of the
        # batch is decoded).
        feature = encoder(images)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word ids to words, stopping after '<end>'.
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break

        # sentence_bleu expects a list of reference token lists and a
        # hypothesis token list. The previous call passed the raw id tensor
        # and a joined string, which compares characters against tensor rows.
        # assumes captions is (batch, max_len) of vocabulary ids padded past
        # lengths[k] -- TODO confirm against get_loader.
        reference = [vocab.idx2word[int(idx)]
                     for idx in captions[0][:int(lengths[0])]]
        score = sentence_bleu([reference], sampled_caption,
                              args.bleu_weights)
        bleu_scores.append(score)

        # Print log info
        if i % args.log_step == 0:
            print('Finish [{}/{}], Current BLEU Score: {:.4f}'
                  .format(i, total_step, np.mean(bleu_scores)))

    # [list of scores, mean] is ragged; build the object array explicitly,
    # because recent NumPy refuses to create it implicitly.
    # NOTE(review): the nesting of this save was lost in the source
    # formatting; it is placed after the loop here.
    results = np.array([bleu_scores, np.mean(bleu_scores)], dtype=object)
    np.save('test_results.npy', results)
def getCaption(self, imgs, output_path='', vocab_path='data/vocab.pkl',
               decoder_path='models/decoder-5-3000.pkl',
               encoder_path='models/encoder-5-3000.pkl', embed_size=256,
               hidden_size=512, num_layers=1):
    """Caption every image in ``imgs`` and write the captions as JSON.

    Args:
        imgs: iterable of images accepted by ``self.load_image``.
        output_path: destination passed to ``self.writeJSON``; an empty
            string selects ``self.DEFAULT_OUTPUT_PATH``.
        vocab_path: pickled vocabulary wrapper.
        decoder_path: decoder checkpoint.
        encoder_path: encoder checkpoint.
        embed_size: embedding size of the models.
        hidden_size: hidden size of the decoder.
        num_layers: number of decoder layers.

    Returns:
        Whatever ``self.writeJSON`` returns for the images and captions.
    """
    if output_path == '':
        output_path = self.DEFAULT_OUTPUT_PATH
    device = self.device

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters. map_location keeps GPU-saved
    # checkpoints loadable when ``self.device`` is the CPU.
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))

    CAPTIONS = []
    for img in imgs:
        # Prepare an image
        image = self.load_image(img, transform=transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        # (1, max_seq_length) -> (max_seq_length)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word ids to words, stopping after '<end>'.
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        CAPTIONS.append(self.prune_caption(sentence))

    json_captions = self.writeJSON(imgs, CAPTIONS, output_path=output_path)
    return json_captions
def main(args):
    """Run the test routine on a trained model and optionally store COCO scores.

    Reads from ``args``: ``encoder``, ``decoder`` (checkpoint paths),
    ``num_samples``, ``num_hints``, ``debug``, ``c_step``, ``no_avg``,
    ``msm`` and ``filepath``.

    When ``args.msm == "co"`` the COCO evaluation is run twice (default
    results and ``data/captions_val2014_results_u.json``), both results are
    printed, and the pair is pickled to ``args.filepath``.
    """
    # NOTE(security): pickle.load runs arbitrary code; only load trusted files.
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters. Mapping storages to CPU first lets
    # GPU-saved checkpoints load on CPU-only hosts; load_state_dict then
    # copies the values onto whatever device the model lives on.
    def to_cpu(storage, loc):
        return storage

    encoder.load_state_dict(torch.load(args.encoder, map_location=to_cpu))
    decoder.load_state_dict(torch.load(args.decoder, map_location=to_cpu))

    # The return value is not used below.
    test(encoder, decoder, vocab, args.num_samples, args.num_hints,
         args.debug, args.c_step, args.no_avg)

    if args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)

        # The scores only exist in this branch, so they are saved here.
        # pickle writes bytes, so the file must be opened in binary mode.
        with open(args.filepath, 'wb') as f:
            pickle.dump((scores, scores_u), f)
def main():
    """Train a text VAE on IMDB and snapshot it whenever validation improves.

    Command-line options come from ``parse_arguments()``; this function reads
    ``batch_size``, ``lr``, ``epochs`` and ``grad_clip`` from them. Model
    sizes and the KLD weight / temperature are fixed constants below.
    Snapshots are written to ``./snapshot/vae_<epoch>.pt``.
    """
    args = parse_arguments()
    hidden_size = 300
    embed_size = 50
    kld_weight = 0.05
    temperature = 0.9
    use_cuda = torch.cuda.is_available()

    print("[!] preparing dataset...")
    TEXT = data.Field(lower=True, fix_length=30)
    LABEL = data.Field(sequential=False)
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, max_size=250000)
    LABEL.build_vocab(train_data)
    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data), batch_size=args.batch_size, repeat=False)
    # NOTE(review): the +2 presumably reserves ids for extra special
    # tokens -- confirm against the model code.
    vocab_size = len(TEXT.vocab) + 2

    print("[!] Instantiating models...")
    encoder = EncoderRNN(vocab_size, hidden_size, embed_size,
                         n_layers=2, dropout=0.5, use_cuda=use_cuda)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size,
                         n_layers=2, dropout=0.5, use_cuda=use_cuda)
    vae = VAE(encoder, decoder)
    optimizer = optim.Adam(vae.parameters(), lr=args.lr)
    if use_cuda:
        print("[!] Using CUDA...")
        vae.cuda()

    best_val_loss = None
    for e in range(1, args.epochs + 1):
        train(e, vae, optimizer, train_iter, vocab_size, kld_weight,
              temperature, args.grad_clip, use_cuda, TEXT)
        val_loss = evaluate(vae, test_iter, vocab_size, kld_weight, use_cuda)
        print("[Epoch: %d] val_loss:%5.3f | val_pp:%5.2fS"
              % (e, val_loss, math.exp(val_loss)))

        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly: a truthiness test would treat a
        # best loss of 0.0 as "nothing recorded yet".
        if best_val_loss is None or val_loss < best_val_loss:
            print("[!] saving model...")
            if not os.path.isdir("snapshot"):
                os.makedirs("snapshot")
            torch.save(vae.state_dict(), './snapshot/vae_{}.pt'.format(e))
            best_val_loss = val_loss
def run_inference(image_path,
                  encoder_path,
                  decoder_path,
                  vocab_path,
                  embed_size=256,
                  hidden_size=512,
                  num_layers=1):
    """Generate a caption for a single image.

    Args:
        image_path: image handed to ``load_image``.
        encoder_path / decoder_path: state-dict checkpoints.
        vocab_path: pickled vocabulary exposing ``idx2word``.
        embed_size / hidden_size / num_layers: model hyper-parameters; they
            must match the checkpoints being loaded.

    Returns:
        The caption with the '<start>'/'<end>' markers removed, underscores
        turned into spaces, surrounding whitespace stripped, and capitalized
        via ``str.capitalize``.
    """
    print(f'sample.py running ... ')

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    # NOTE(security): pickle.load runs arbitrary code; only load trusted files.
    with open(vocab_path, 'rb') as f:
        print("using " + vocab_path)
        vocab = pickle.load(f)

    # Build models in eval mode (batchnorm uses moving mean/variance).
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers).eval()
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Checkpoints are read onto CPU; load_state_dict copies the values onto
    # the device the models already live on.
    encoder.load_state_dict(
        torch.load(encoder_path, map_location=torch.device('cpu')))
    decoder.load_state_dict(
        torch.load(decoder_path, map_location=torch.device('cpu')))

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image; no gradients are needed.
    with torch.no_grad():
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
    # (1, max_seq_length) -> (max_seq_length)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word ids to words, stopping after the end marker.
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        print(word)
        sampled_caption.append(word)
        if word == '<end>':
            break

    sentence = ' '.join(sampled_caption).replace('<start>', '')
    sentence = sentence.replace('<end>', '')
    sentence = sentence.replace('_', ' ')

    # Print out the generated caption
    print(sentence)
    print(f'debug: chay xong roi ne')
    return sentence.strip().capitalize()
def main(args):
    """Caption every file in an image directory and write a text report.

    ``args`` is a dict. Keys read: ``data_dir``, ``image_dir``,
    ``model_dir``, ``encoder_name``, ``decoder_name``. The key
    ``vocab_size`` is written before the models are built.

    Output: ``<data_dir>/test_caption.txt`` with one
    ``image_name:... ---- generated_caption:...`` line per image, each
    followed by a blank line.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    vocab = Vocabulary.load_vocab(args['data_dir'])
    args['vocab_size'] = len(vocab)

    # Both models in eval mode for deterministic inference.
    encoder = EncoderCNN(args).eval()
    decoder = DecoderRNN(args).eval()
    encoder.to(device)
    decoder.to(device)

    # map_location lets checkpoints saved on a GPU load on a CPU-only host.
    encoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['encoder_name']),
                   map_location=device))
    decoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['decoder_name']),
                   map_location=device))

    image_dir = os.path.join(args['data_dir'], args['image_dir'])
    test_caption_list = []

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        for file_name in os.listdir(image_dir):
            image_path = os.path.join(image_dir, file_name)
            # Skip sub-directories and other non-file entries.
            if not os.path.isfile(image_path):
                continue
            image_tensor = load_image(image_path, transform).to(device)

            feature = encoder(image_tensor)
            sample_ids = decoder.sample(feature)
            sample_ids = sample_ids[0].cpu().numpy()

            # Convert word ids to words, stopping after the end marker.
            sample_caption = []
            for word_id in sample_ids:
                word = vocab.idx2word[word_id]
                sample_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sample_caption)
            print(sentence)
            test_caption_list.append((file_name, sentence))

    with open(os.path.join(args['data_dir'], 'test_caption.txt'), 'w') as f:
        for name, caption in test_caption_list:
            f.write('image_name:{} ---- generated_caption:{}\n'.format(
                name, caption))
            f.write('\n')
def get_caption(self, img_tensor):
    """Return a cleaned caption for an already pre-processed image tensor.

    The encoder and decoder are rebuilt and their weights re-read from
    ``./models/legit_model/`` on every call.

    Args:
        img_tensor: image tensor accepted by ``EncoderCNN``.
            NOTE(review): presumably a normalized (1, 3, H, W) batch --
            confirm against the caller.

    Returns:
        The result of ``self.clean_sentence`` applied to the decoder output.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    print("running")

    # Models
    encoder_file = 'legit_model/encoder_1.pkl'
    decoder_file = 'legit_model/decoder_1.pkl'

    # Embed and hidden sizes; must match the checkpoints.
    embed_size = 512
    hidden_size = 512

    # The size of the vocabulary (hard-coded; must match the checkpoint).
    vocab_size = 8856

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights. The models are still on CPU here, so map the
    # checkpoint to CPU; otherwise a GPU-saved file fails on CPU-only hosts.
    encoder.load_state_dict(
        torch.load(os.path.join('./models', encoder_file), map_location='cpu'))
    decoder.load_state_dict(
        torch.load(os.path.join('./models', decoder_file), map_location='cpu'))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    img_d = img_tensor.to(device)

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        # Obtain the embedded image features.
        features = encoder(img_d).unsqueeze(1)
        # Pass the features through the decoder to get a predicted caption.
        img_output = decoder.sample(features)

    sentence = self.clean_sentence(img_output)
    return sentence
def main(args):
    """Caption the image at ``args.image`` and display it.

    Reads from ``args``: ``vocab_path``, ``embed_size``, ``hidden_size``,
    ``num_layers``, ``encoder_path``, ``decoder_path`` and ``image``.
    The caption is printed and the image is drawn with ``plt.imshow``.
    """
    # Preprocessing pipeline: tensor conversion followed by normalization.
    normalize = transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    # Vocabulary wrapper (pickled).
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Models; the encoder runs in evaluation mode
    # (BN uses moving mean/variance).
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)

    # Restore trained parameters.
    encoder_state = torch.load(args.encoder_path)
    encoder.load_state_dict(encoder_state)
    decoder_state = torch.load(args.decoder_path)
    decoder.load_state_dict(decoder_state)

    # Input image wrapped for inference.
    image_tensor = to_var(load_image(args.image, transform), volatile=True)

    # Use the GPU when one is present.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Encode the image, then sample word ids from the decoder.
    word_ids = decoder.sample(encoder(image_tensor)).cpu().data.numpy()

    # Translate ids to words up to and including the end marker.
    words = []
    for idx in word_ids:
        words.append(vocab.idx2word[idx])
        if words[-1] == '<end>':
            break
    sentence = ' '.join(words)

    # Show the caption and the picture.
    print(sentence)
    picture = Image.open(args.image)
    plt.imshow(np.asarray(picture))
def main(args):
    """Train the captioning encoder/decoder and save periodic checkpoints.

    Reads from ``args``: ``model_path``, ``crop_size``, ``vocab_path``,
    ``image_dir``, ``caption_path``, ``batch_size``, ``num_workers``,
    ``embed_size``, ``hidden_size``, ``num_layers``, ``learning_rate``,
    ``num_epochs``, ``log_step`` and ``save_step``.

    Only the decoder plus the encoder's ``linear`` and ``bn`` layers are
    handed to the optimizer.
    """
    # Make sure the checkpoint directory exists.
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Augmentation + normalization for the pretrained resnet.
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Vocabulary wrapper (pickled).
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Training batches.
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Models.
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers).to(device)

    # Loss and optimizer over the trainable subset of parameters.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters())
    params = params + list(encoder.linear.parameters())
    params = params + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(data_loader)
    log_template = ('Epoch [{}/{}], Step [{}/{}], '
                    'Loss: {:.4f}, Perplexity: {:5.4f}')

    for epoch in range(args.num_epochs):
        for i, batch in enumerate(data_loader):
            images, captions, lengths = batch

            # Move the mini-batch to the training device.
            images = images.to(device)
            captions = captions.to(device)
            packed = pack_padded_sequence(captions, lengths, batch_first=True)
            targets = packed[0]

            # Forward pass.
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            # Backward pass and parameter update.
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Periodic progress line.
            if i % args.log_step == 0:
                print(log_template.format(epoch, args.num_epochs, i,
                                          total_step, loss.item(),
                                          np.exp(loss.item())))

            # Periodic checkpoints, decoder first.
            if (i + 1) % args.save_step == 0:
                decoder_file = 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, decoder_file))
                encoder_file = 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, encoder_file))
(0.229, 0.224, 0.225))])
# NOTE(review): the line above is the tail of a statement whose beginning is
# not part of this fragment (presumably the `transform_train` definition).

# Build the training data loader; all arguments are names defined outside
# this fragment.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc=COCOPATH)

# The size of the vocabulary, taken from the loader's dataset.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available, otherwise keep them on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function, moved to the GPU when CUDA is available.
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# Learnable parameters: all decoder parameters plus the encoder's `embed`
# layer only.
params = list(decoder.parameters()) +\
    list(encoder.embed.parameters())  # We don't want to retrain the resnet

# Optimizer over the learnable parameters (RMSprop with its default settings).
optimizer = torch.optim.RMSprop(params)
# Build the test data loader; all arguments are names defined outside this
# fragment.
data_loader = get_loader(transform=transform_test,
                         mode='test_small',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc=COCOPATH)
vocab = data_loader.dataset.vocab

# The size of the vocabulary.
vocab_size = len(vocab)

# Initialize the encoder and decoder. They are never moved to another
# device, so inference runs wherever the constructors place them.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
device = torch.device("cpu")

# Load the pretrained model. map_location keeps GPU-saved checkpoints
# loadable, since this run is pinned to CPU.
encoder.load_state_dict(
    torch.load(PRETRAINED_MODEL_PATH.format('encoder'), map_location=device))
decoder.load_state_dict(
    torch.load(PRETRAINED_MODEL_PATH.format('decoder'), map_location=device))

# Inference mode for both models.
encoder.eval()
decoder.eval()

# Take one batch: the first element is kept as `images`, the second is what
# the encoder consumes.
images, conv_images = next(iter(data_loader))

# Pure inference: skip autograd bookkeeping.
with torch.no_grad():
    features = encoder(conv_images).unsqueeze(1)
    output = decoder.sample(features, max_len=max_len)
def main(args):
    """Train the captioning encoder/decoder (legacy Variable-based API).

    Reads from ``args``: ``model_path``, ``crop_size``, ``vocab_path``,
    ``image_dir``, ``caption_path``, ``batch_size``, ``num_workers``,
    ``embed_size``, ``hidden_size``, ``num_layers``, ``learning_rate``,
    ``num_epochs``, ``log_step`` and ``save_step``.

    Only the decoder plus the encoder's ``linear`` and ``bn`` layers are
    handed to the optimizer. Checkpoints are written as ``.pkl`` files under
    ``args.model_path``.
    """
    # Make sure the checkpoint directory exists.
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Augmentation + normalization.
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Vocabulary wrapper (pickled).
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Training batches.
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Models, moved to the GPU when one is present.
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer over the trainable subset of parameters.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters())
    params = params + list(encoder.linear.parameters())
    params = params + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(data_loader)
    log_template = ('Epoch [%d/%d], Step [%d/%d], '
                    'Loss: %.4f, Perplexity: %5.4f')

    for epoch in range(args.num_epochs):
        for i, batch in enumerate(data_loader):
            images, captions, lengths = batch

            # Wrap the mini-batch; the images are wrapped with volatile=True.
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            packed = pack_padded_sequence(captions, lengths, batch_first=True)
            targets = packed[0]

            # Clear old gradients, then forward / backward / update.
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Periodic progress line.
            if i % args.log_step == 0:
                print(log_template % (epoch, args.num_epochs, i, total_step,
                                      loss.data[0], np.exp(loss.data[0])))

            # Periodic checkpoints, decoder first.
            if (i + 1) % args.save_step == 0:
                decoder_file = 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, decoder_file))
                encoder_file = 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, encoder_file))