def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
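# The snippet above (and several below) calls a load_image helper that is not defined
# in this section. A minimal sketch, assuming a PIL + torchvision pipeline and the
# 224x224 input size expected by the pretrained ResNet encoder (names are illustrative):
from PIL import Image


def load_image(image_path, transform=None):
    """Open an image, resize it, and return a (1, C, H, W) tensor ready for the encoder."""
    image = Image.open(image_path).convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)  # add the batch dimension
    return image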
def train_caption_model(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are in use.".format(torch.cuda.device_count()))
        encoder = nn.DataParallel(encoder)
        decoder = nn.DataParallel(decoder)
    encoder.to(device)
    decoder.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, len(data_loader),
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))
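# The training loop above packs the padded captions and compares them to the decoder
# output with CrossEntropyLoss, and the inference snippets call decoder.sample(feature).
# A minimal sketch of the tutorial-style decoder this assumes (an assumption about the
# model layout, not the project's verbatim code): the image feature is prepended to the
# word embeddings and the LSTM output is packed with the same lengths, so the flattened
# outputs line up with pack_padded_sequence(captions, lengths, batch_first=True)[0].
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence


class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seq_length = max_seq_length

    def forward(self, features, captions, lengths):
        # Prepend the image feature, embed the caption, pack, and score every timestep.
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        return self.linear(hiddens[0])           # (sum(lengths), vocab_size)

    def sample(self, features, states=None):
        # Greedy search: feed the image feature, then feed back the argmax word each step.
        sampled_ids = []
        inputs = features.unsqueeze(1)            # (batch, 1, embed_size)
        for _ in range(self.max_seq_length):
            hiddens, states = self.lstm(inputs, states)   # (batch, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))     # (batch, vocab_size)
            _, predicted = outputs.max(1)                 # (batch,)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted).unsqueeze(1)   # feed the prediction back in
        return torch.stack(sampled_ids, 1)                # (batch, max_seq_length)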
def main(args): # Image preprocessing transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) if args.with_glove == 'True': # Get glove pickles glove_path = args.glove_path vectors = bcolz.open(f'{glove_path}/6B.{args.embed_size}.dat')[:] words = pickle.load( open(f'{glove_path}/6B.{args.embed_size}_words.pkl', 'rb')) word2idx = pickle.load( open(f'{glove_path}/6B.{args.embed_size}_idx.pkl', 'rb')) glove = {w: vectors[word2idx[w]] for w in words} # Get weights matrix weights_matrix = np.zeros((len(vocab), args.embed_size)) words_found = 0 # We compare the vocabulary from the built vocab, and the glove word vectors for i in range(len(vocab)): try: word = vocab.idx2word[i] weights_matrix[i] = glove[word] words_found += 1 except KeyError: weights_matrix[i] = np.random.normal(scale=0.6, size=(args.embed_size, )) # Build models encoder = EncoderCNN(args.embed_size).eval( ) # eval mode (batchnorm uses moving mean/variance) decoder = DecoderRNNGlove(args.hidden_size, weights_matrix, args.num_layers) else: encoder = EncoderCNN(args.embed_size).eval( ) # eval mode (batchnorm uses moving mean/variance) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) encoder = encoder.to(device) decoder = decoder.to(device) # Load the trained model parameters encoder.load_state_dict(torch.load(args.encoder_path)) decoder.load_state_dict(torch.load(args.decoder_path)) # Prepare an image image = load_image(args.image, transform) image_tensor = image.to(device) # Generate an caption from the image feature = encoder(image_tensor) sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids[0].cpu().numpy( ) # (1, max_seq_length) -> (max_seq_length) # Convert word_ids to words sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] if word != '<start>' and word != '<end>': sampled_caption.append(word) if word == '<end>': break sentence = ' '.join(sampled_caption) # Print out the image and the generated caption print(sentence) pickle.dump(sentence, open("save.p", "wb")) image = Image.open(args.image) plt.imshow(np.asarray(image))
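# DecoderRNNGlove above is constructed from a (vocab_size, embed_size) GloVe
# weights_matrix. A minimal sketch of turning such a matrix into an embedding layer
# (an assumption; the project may build the layer differently):
import numpy as np
import torch
import torch.nn as nn


def embedding_from_glove(weights_matrix, trainable=False):
    """Create an nn.Embedding initialised from a numpy (vocab_size, embed_size) matrix."""
    weights = torch.as_tensor(np.asarray(weights_matrix), dtype=torch.float32)
    return nn.Embedding.from_pretrained(weights, freeze=not trainable)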
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc=COCOPATH)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function.
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# TODO #3: Specify the learnable parameters of the model.
params = list(decoder.parameters()) + \
    list(encoder.embed.parameters())  # We don't want to retrain the resnet

# TODO #4: Define the optimizer.
optimizer = torch.optim.RMSprop(params)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)
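# Only decoder.parameters() and encoder.embed.parameters() are optimised above because
# the ResNet backbone is kept frozen. A minimal sketch of the encoder that layout
# assumes (pretrained ResNet with its classifier replaced by a trainable embedding
# layer; an assumption, not the project's verbatim code):
import torch.nn as nn
import torchvision.models as models


class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        resnet = models.resnet50(pretrained=True)
        for param in resnet.parameters():
            param.requires_grad_(False)           # freeze the backbone
        modules = list(resnet.children())[:-1]     # drop the final fc layer
        self.resnet = nn.Sequential(*modules)
        self.embed = nn.Linear(resnet.fc.in_features, embed_size)  # trained from scratch

    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        return self.embed(features)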
hidden_size = 512
vocab_size = len(data_loader.dataset.vocab)

encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()

encoder_file = 'encoder-3.pkl'
decoder_file = 'decoder-3.pkl'
encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))

encoder.to(device)
decoder.to(device)


# prediction
def clean_sentence(output):
    sentence = ""
    for idx in output:
        if idx == 0:
            continue
        elif idx == 1:
            break
        else:
            word = data_loader.dataset.vocab.idx2word[idx]
            sentence = sentence + word + " "
    return sentence.strip()  # the snippet is truncated here; returning the string is the assumed intent
def extract(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Resize(SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    # decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    dissection.retain_layers(encoder, [
        ('resnet.7.2.relu', 'final_layer'),
    ])
    encoder = encoder.to(device)
    # decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    # decoder.load_state_dict(torch.load(args.decoder_path))
    encoder.eval()

    # Load data
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    # Run the models
    with torch.no_grad():
        total_step = len(data_loader)
        os.makedirs(os.path.join(PARENT_DIR, 'results', 'activations'), exist_ok=True)
        path = os.path.join(PARENT_DIR, 'results', 'samples.txt')
        with open(path, 'w') as results_file:
            start = time.time()
            for batch, (images, captions, lengths) in enumerate(data_loader):
                # Set mini-batch dataset
                images = images.to(device)
                # captions = captions.to(device)
                # targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward pass (the decoder/optimizer steps are not used here)
                features = encoder(images)
                # outputs = decoder(features, captions, lengths)
                # loss = criterion(outputs, targets)
                # decoder.zero_grad()
                # encoder.zero_grad()
                # loss.backward()
                # optimizer.step()

                activations = encoder.retained['final_layer']
                images = dissection.ReverseNormalize(
                    (0.485, 0.456, 0.406), (0.229, 0.224, 0.225))(images)
                images = images.cpu().numpy().transpose([0, 2, 3, 1])
                activations = activations.cpu().numpy()

                scores = np.max(activations, axis=(-1, -2))
                samples = np.argmax(scores, axis=-1)
                gathered = activations[np.arange(len(samples)), samples].transpose([1, 2, 0])
                mask = cv2.resize(gathered, SIZE).transpose([2, 0, 1])
                k = int(0.8 * mask.size)
                threshold = np.partition(mask, k, axis=None)[k]
                mask = mask >= threshold
                mask = np.expand_dims(mask, axis=-1)
                outimg = np.concatenate((images, (1 + mask) / 2.), axis=-1)
                # outimg = outimg * mask
                activations = outimg

                for i, sample in enumerate(samples):
                    i += args.batch_size * batch
                    results_file.write('{} {}\n'.format(i, sample))
                for i, activation in enumerate(activations):
                    i += args.batch_size * batch
                    path = os.path.join(PARENT_DIR, 'results', 'activations', '{}.png'.format(i))
                    outactivation = skimage.img_as_ubyte(activation)
                    imageio.imwrite(path, outactivation)

                clock = time.time()
                delay = clock - start
                start = clock
                max_batch = 100
                # print('Step {}/{}: Time = {:.2f}'.format(batch, len(data_loader), delay))
                print('Step {}/{}: Time = {:.2f}'.format(batch, max_batch, delay))
                if batch == max_batch:
                    break
def evaluate(args): transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) dataset = Dataset({ 'data_dir': args['data_dir'], 'exp_dir': args['exp_dir'], 'raw_data_dir': args['raw_data_dir'], 'transform': transform, 'mode': 'test' }) args['vocab_size'] = len(dataset.vocab) encoder = EncoderCNN(args).eval() decoder = DecoderRNN(args).eval() encoder = encoder.to(device) decoder = decoder.to(device) encoder.load_state_dict( torch.load(os.path.join(args['model_path'], 'encoder.pt'))) decoder.load_state_dict( torch.load(os.path.join(args['model_path'], 'decoder.pt'))) generated_captions = [] image_ids = [] target_captions = [] for idx in range(len(dataset.ids)): image_id, image, captions = dataset.get_test_item(idx) image = image.to(device) print(idx) features = encoder(image) word_ids = decoder.sample(features) word_ids = word_ids[0].cpu().tolist() words = [] for word_id in word_ids: if dataset.vocab.idx2word[word_id] == '<start>': continue if dataset.vocab.idx2word[word_id] != '<end>': words.append(dataset.vocab.idx2word[word_id]) else: break image_ids.append(image_id) generated_captions.append(words) target_captions.append(captions) print(words) image_captions = [{ 'image_id': image_ids[idx], 'caption': ' '.join(generated_captions[idx]) } for idx in range(len(image_ids))] captions_path = os.path.join(args['exp_dir'], args['caption_file']) image_caption_path = os.path.join(args['exp_dir'], args['evaluation_file']) with open(captions_path, 'w') as f: for idx in range(len(generated_captions)): f.write('*' * 50 + '\n') f.write('-' * 20 + 'generated_captions' + '-' * 20 + '\n') f.write(' '.join(generated_captions[idx]) + '\n') f.write('-' * 20 + 'target_captions' + '-' * 20 + '\n') for words in target_captions[idx]: f.write(' '.join(words) + '\n') f.write('*' * 50 + '\n') f.write('\n') with open(bleu_score_path, 'w') as f: f.write('BLEU_score: {}'.format(str(BLEU_score))) with open(image_caption_path, 'w') as f: json.dump(image_captions, f)
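# evaluate() above writes BLEU_score to bleu_score_path, but neither name is defined
# inside the snippet. A hedged sketch of how a corpus-level score could be computed
# from the lists the function already builds (assumes NLTK; the project may instead
# use the COCO caption evaluation toolkit):
from nltk.translate.bleu_score import corpus_bleu


def compute_bleu(target_captions, generated_captions):
    """target_captions: per image, a list of reference token lists.
    generated_captions: per image, one hypothesis token list."""
    return corpus_bleu(target_captions, generated_captions)  # default weights give BLEU-4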
# data_loader = get_loader(args.image_dir, args.caption_path, vocab,
#                          transform, args.batch_size,
#                          shuffle=True, num_workers=args.num_workers)
trainloader = get_loader(train_image_dir, train_caption_path, vocab,
                         transform_train, batch_size,
                         shuffle=True, num_workers=8)
testloader = get_loader(test_image_dir, test_caption_path, vocab,
                        transform_test, batch_size,
                        shuffle=False, num_workers=8)

checkpoints = os.listdir('checkpoint')

encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers=1)
encoder = encoder.to(device)
decoder = decoder.to(device)

params = list(decoder.parameters()) + list(encoder.linear.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)

cur_epoch = 0
if checkpoints:
    num_checkpoint = -1
    for cp in checkpoints:
        name, num = cp[:-4].split('_')
        num = int(num)
        if name == model_name and num_checkpoint < num:
            num_checkpoint = num
    if num_checkpoint > -1:
        state_dict = torch.load('checkpoint/{}_{}.tar'.format(model_name, num_checkpoint))
        encoder.load_state_dict(state_dict['encoder_state_dict'])
def main(args): # Image preprocessing transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build models encoder = EncoderCNN(args.embed_size).eval( ) # eval mode (batchnorm uses moving mean/variance) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) encoder = encoder.to(device) decoder = decoder.to(device) # Load the trained model parameters encoder.load_state_dict(torch.load(args.encoder_path)) decoder.load_state_dict(torch.load(args.decoder_path)) words = [ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush' ] # Prepare images if args.images: # inputs path input_path = os.listdir(args.images) sentences = [] # folders in inputs for path in input_path: file_path = args.images + path + '/' if os.path.isdir(file_path): files = os.listdir(file_path) # files in folders for file in files: image = load_image(file_path + file, transform) image_tensor = image.to(device) # Generate an caption from the image feature = encoder(image_tensor) sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids[0].cpu().numpy( ) # (1, max_seq_length) -> (max_seq_length) # Convert word_ids to words sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] sampled_caption.append(word) if word == '<end>': break caption = ' '.join(sampled_caption)[8:-6] sentences.append(caption) for word in words: if word in caption: f = open('captions2.csv', 'a', encoding='utf-8', newline="") writer = csv.writer(f) writer.writerow([file_path + file, word, caption]) f.close() # Print out the image and the generated caption # for s in sentences: # print(s) # Prepare an image else: image = load_image(args.image, transform) image_tensor = image.to(device) found_words = [] # Generate an caption from the image feature = encoder(image_tensor) sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids[0].cpu().numpy( ) # (1, max_seq_length) -> (max_seq_length) # Convert word_ids to words sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] sampled_caption.append(word) if word == '<end>': break sentence = ' '.join(sampled_caption) for word in words: if word in sentence: found_words.append(word) if 'hot dog' in sentence: found_words.remove('dog') return sentence, found_words
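# The substring test `if word in caption` above also fires on partial matches (the
# snippet already special-cases 'dog' inside 'hot dog', but e.g. 'car' also matches
# 'carrot'). A hedged sketch of a token-based check instead (illustrative helper, not
# the project's code; multi-word labels are checked word by word):
def words_in_caption(caption, vocabulary_words):
    """Return the labels whose every token appears as a whole word in the caption."""
    tokens = caption.split()
    return [w for w in vocabulary_words if all(t in tokens for t in w.split())]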
def main(args): # Image preprocessing transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # get test ids ids = [] with open('TestImageIds.csv', 'r') as f: reader = csv.reader(f) testIds = list(reader) testIds = [int(i) for i in testIds[0]] coco = COCO(args.caption_path) for img_id in testIds: for entry in coco.imgToAnns[img_id]: ids.append(entry['id']) # create data loader test_loader = get_loader(args.image_dir, args.caption_path, ids, vocab, transform, 1, shuffle=False, num_workers=0) # Loss and optimizer criterion = nn.CrossEntropyLoss() # Build models encoder = EncoderCNN(args.embed_size).eval( ) # eval mode (batchnorm uses moving mean/variance) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).eval() encoder = encoder.to(device) decoder = decoder.to(device) # Load the trained model parameters encoder.load_state_dict(torch.load(args.encoder_path)) decoder.load_state_dict(torch.load(args.decoder_path)) # evaluate loss running_loss = 0.0 num_imgs = len(ids) for i, (images, captions, lengths) in enumerate(test_loader): sys.stdout.write("\rEvaluating Caption: %d/%d" % (i, num_imgs)) images = images.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] features = encoder(images) outputs = decoder(features, captions, lengths, pretrained=args.pretrained) outputs = outputs loss = criterion(outputs, targets) running_loss += loss.item() * images.size(0) test_loss = running_loss / num_imgs print("Test Loss : %.2f" % (test_loss)) print("\rWriting captions to json file...") # write to json file anns = [] for img_id in tqdm(testIds): # Prepare an image image = load_image( args.image_dir + '/' + coco.loadImgs(img_id)[0]['file_name'], transform) image_tensor = image.to(device) # Generate an caption from the image feature = encoder(image_tensor) if args.stochastic: sampled_ids = decoder.stochastic_sample( feature, temperature=args.temperature, pretrained=args.pretrained) else: sampled_ids = decoder.sample(feature, pretrained=args.pretrained) sampled_ids = sampled_ids[0].cpu().numpy( ) # (1, max_seq_length) -> (max_seq_length) # Convert word_ids to words sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] sampled_caption.append(word) if word == '<end>': break sentence = ' '.join(sampled_caption) # Print out the image and the generated caption ann = {'image_id': img_id, 'id': 0, 'caption': sentence} anns.append(ann) # print (sentence, img_id) pred_annotations_file = "./results/{}.json".format(args.model_name) with open(pred_annotations_file, 'w') as f: json.dump(anns, f) true_annotations_file = args.caption_path BLEU1, BLEU4 = evaluate_captions(true_annotations_file, pred_annotations_file) print("Test Loss : %.2f" % (test_loss)) print("BLEU1 score : %.2f" % (BLEU1)) print("BLEU4 score : %.2f" % (BLEU4))
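# decoder.stochastic_sample above draws words from the softmax instead of taking the
# argmax. A minimal sketch of one temperature-sampling step (assumed to live inside the
# decoder; higher temperature flattens the distribution, lower temperature sharpens it):
import torch
import torch.nn.functional as F


def sample_word(logits, temperature=1.0):
    """Draw one word id per batch element from softmax(logits / temperature)."""
    probs = F.softmax(logits / temperature, dim=-1)    # (batch, vocab_size)
    return torch.multinomial(probs, num_samples=1).squeeze(1)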
def train( num_epochs: int, lr: float, batch_size: int, vocab_threshold: int, vocab_from_file: bool, embed_size: int, hidden_size: int, save_every: int, print_every: int, log_file: str )-> None: """ Train the captioning network with the required parameters. The training logs are saved in log_file. num_epochs: Number of epochs to train the model. batch_size: Mini-batch size for training. vocab_threshold: Minimum word count threshold for vocabulary initialisation. A word that appears in the dataset a fewer number of times than vocab_threshold will be discarded and will not appear in the vocabulary dictionnary. Indeed, the smaller the threshold, the bigger the vocabulary. vocab_from_file: Whether to load the vocabulary from a pre-initialized file. embed_size: Dimensionality of image and word embeddings. hidden_size: Number of features in hidden state of the RNN decoder. save_every: Number of epochs between each checkpoint saving. print_every: Number of batches for printing average loss. log_file: Name of the training log file. Saves loss and perplexity. """ transform_train = transforms.Compose([ transforms.Resize(256), # smaller edge of image resized to 256 transforms.RandomCrop(224), # get 224x224 crop from random location transforms.RandomHorizontalFlip(), # horizontally flip image with probability=0.5 transforms.ToTensor(), # convert the PIL Image to a tensor transforms.Normalize((0.485, 0.456, 0.406), # normalize image for pre-trained model (0.229, 0.224, 0.225))]) # Build data loader. data_loader = get_loader(transform=transform_train, mode='train', batch_size=batch_size, vocab_threshold=vocab_threshold, vocab_from_file=vocab_from_file) # The size of the vocabulary. vocab_size = len(data_loader.dataset.vocab) # Initialize the encoder and decoder. encoder = EncoderCNN(embed_size) decoder = DecoderRNN(embed_size, hidden_size, vocab_size) # Move models to GPU if CUDA is available. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") encoder.to(device) decoder.to(device) # Define the loss function. criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss() # Parameters to update. We do not re-train de CNN here params = list(encoder.embed.parameters()) + list(decoder.parameters()) # TODO: add learning rate scheduler # Optimizer for minimum search. optimizer = optim.Adam(params, lr=lr) # Set the total number of training steps per epoch. total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size) # Open the training log file. f = open(log_file, 'w') for epoch in range(1, num_epochs + 1): for i_step in range(1, total_step + 1): # Randomly sample a caption length, and sample indices with that length. indices = data_loader.dataset.get_train_indices() # Create and assign a batch sampler to retrieve a batch with the sampled indices. new_sampler = data.sampler.SubsetRandomSampler(indices=indices) data_loader.batch_sampler.sampler = new_sampler # Obtain the batch. images, captions = next(iter(data_loader)) # Move batch of images and captions to GPU if CUDA is available. images = images.to(device) captions = captions.to(device) # Zero the gradients. decoder.zero_grad() encoder.zero_grad() # Pass the inputs through the CNN-RNN model. features = encoder(images) outputs = decoder(features, captions) # for i in range(10): # print(torch.argmax(outputs[0,i, :]).item()) # Calculate the batch loss. loss = criterion(outputs.view(-1, vocab_size), captions.view(-1)) # Backward pass. 
loss.backward() # Update the parameters in the optimizer. optimizer.step() # Get training statistics. stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % ( epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item())) # Print training statistics (on same line). print('\r' + stats, end="") sys.stdout.flush() # Print training statistics to file. f.write(stats + '\n') f.flush() # Print training statistics (on different line). if i_step % print_every == 0: print('\r' + stats) # Save the weights. if epoch % save_every == 0: torch.save(decoder.state_dict(), os.path.join('./models', f"{device}_{hidden_size}_decoder-{epoch}.pkl")) torch.save(encoder.state_dict(), os.path.join('./models', f"{device}_{hidden_size}_encoder-{epoch}.pkl")) # Close the training log file. f.close()
def main(args): # Image preprocessing transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build models encoder = EncoderCNN(args.embed_size).eval( ) # eval mode (batchnorm uses moving mean/variance) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) encoder = encoder.to(device) decoder = decoder.to(device) # Load the trained model parameters encoder.load_state_dict(torch.load(args.encoder_path)) decoder.load_state_dict(torch.load(args.decoder_path)) actual_captions = [] predicted_captions = [] annotation_path = '../data/annotations/captions_val2014.json' with open(annotation_path) as f: anns = json.load(f) anns = anns["annotations"] for index, _ in enumerate(anns): anns[index]['image_id'] = str(anns[index]['image_id']).rjust(12, '0') if anns[index]['caption'][-1] == '.': #print (anns[index]['caption']) anns[index]['caption'] = str(anns[index]['caption'])[:-1] anns = pd.DataFrame(anns) #print (anns.head()) for index, image_name in enumerate(os.listdir("../data/val2014/")): try: print(index) image = load_image("../data/val2014/" + image_name, transform) image_tensor = image.to(device) feature = encoder(image_tensor) sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids[0].cpu().numpy( ) # (1, max_seq_length) -> (max_seq_length) sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] sampled_caption.append(word) if word == '<end>': break sentence = ' '.join(sampled_caption) #print (sampled_caption) sampled_caption = sampled_caption[1:-2] #print (sampled_caption) predicted_captions.append(sampled_caption) #print (image_name) image_id = image_name[-16:-4] #print (image_id) temp = anns[anns['image_id'] == image_id] actual = [i.split(' ') for i in temp['caption']] #print (actual) actual_captions.append(actual) except RuntimeError: print(image_name + " errored out") pass pickle.dump(predicted_captions, open("predicted_captions.p", 'wb')) pickle.dump(actual_captions, open("actual_captions.p", 'wb')) one_reference = [cap[0] for cap in actual_captions] pickle.dump(one_reference, open("one_reference_actual.p", 'wb')) print(corpus_bleu(one_reference, predicted_captions))
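# corpus_bleu in the snippet above is passed one_reference, a flat list of token lists.
# NLTK's corpus_bleu expects one *list* of reference token lists per hypothesis, so a
# single reference usually needs to be wrapped first. A small usage sketch with
# illustrative data:
from nltk.translate.bleu_score import corpus_bleu

hypotheses = [['a', 'dog', 'runs', 'on', 'the', 'beach']]
references = [[['a', 'dog', 'is', 'running', 'on', 'the', 'beach']]]  # one list of references per hypothesis
print(corpus_bleu(references, hypotheses))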
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Loss
    criterion = nn.CrossEntropyLoss()

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)
    print('val_loader length = {}'.format(len(data_loader)))

    val_loss = 0
    start = time.time()
    with torch.no_grad():
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            references = [idx2word2list(vocab, targets)]

            # Forward
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            val_loss += criterion(outputs, targets).item()

            # Print log info
            if i % args.log_step == 0:
                print('step {}/{}, time {}'.format(i, len(data_loader), timeSince(start)))

    val_loss = val_loss / len(data_loader)
    print('val_loss = {:.3f}'.format(val_loss))
def main(args): # Image preprocessing transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build models encoder = EncoderCNN(args.embed_size).eval() # eval mode (batchnorm uses moving mean/variance) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) encoder = encoder.to(device) decoder = decoder.to(device) # Load the trained model parameters encoder.load_state_dict(torch.load(os.path.join('./Models', args.encoder_path))) decoder.load_state_dict(torch.load(os.path.join('./Models', args.decoder_path))) # Prepare an image for images in valimages: image = load_image(args.image, transform) image_tensor = image.to(device) # Generate a caption from the image feature = encoder(image_tensor) sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids[0].cpu().numpy() # (1, max_seq_length) -> (max_seq_length) # Convert word_ids to words sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] sampled_caption.append(word) if word == '<end>': break sentence = ' '.join(sampled_caption) # Print out the image and the generated caption print (sentence) fig = plt.figure() ax = fig.add_subplot(111) ax.set_title(sentence) image = Image.open(args.image) plt.imshow(np.asarray(image)) plt.show() def cal_bleu_score(dataset, model, source_vocab, target_vocab): targets = [] predictions = [] for i in range(len(dataset)): target = vars(test_data.examples[i])['trg'] predicted_words = predict(i, model, source_vocab, target_vocab, dataset) predictions.append(predicted_words[1:-1]) targets.append([target]) print(f'BLEU Score: {round(bleu_score(predictions, targets) * 100, 2)}') source_vocab = args.vocab_path target_vocab cal_bleu_score(dataset, model, source_vocab, target_vocab) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--valImage', type=str, default = '/home/khaaq/Documents/COCOTorch_Yunjey/ResizeTest2014/COCO_val2014_000000000536.jpg', help='input image for generating caption') parser.add_argument('--encoder_path', type=str, default='/home/khaaq/Documents/COCOTorch_Yunjey/Models/encoder-10-3000.ckpt', help='path for trained encoder') parser.add_argument('--decoder_path', type=str, default='/home/khaaq/Documents/COCOTorch_Yunjey/Models/decoder-10-3000.ckpt', help='path for trained decoder') parser.add_argument('--vocab_path', type=str, default='/home/khaaq/Documents/COCOTorch_Yunjey/vocab.pkl', help='path for vocabulary wrapper') # parser.add_argument('--caption_path', type=str, default='/home/khaaq/Documents/COCO_KarepathyData2014/annotations/captions_val2014.json', help='path for train annotation json file') # Model parameters (should be same as paramters in train.py) parser.add_argument('--embed_size', type=int , default=256, help='dimension of word embedding vectors') parser.add_argument('--hidden_size', type=int , default=512, help='dimension of lstm hidden states') parser.add_argument('--num_layers', type=int , default=1, help='number of layers in lstm') args = parser.parse_args() main(args)
def main(args): global best_bleu4 # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) encoder = EncoderCNN(reso=args.reso) decoder = AttnDecoderRNN(attention_dim=args.attention_dim, embed_dim=args.embed_dim, decoder_dim=args.decoder_dim, vocab_size=len(vocab), dropout=args.dropout) encoder.to(device) decoder.to(device) decoder_optimizer = torch.optim.Adam(params=filter( lambda p: p.requires_grad, decoder.parameters()), lr=args.decoder_lr) encoder_optimizer = torch.optim.Adam( params=filter(lambda p: p.requires_grad, encoder.parameters()), lr=args.encoder_lr) if args.fine_tune_encoder else None criterion = nn.CrossEntropyLoss().to(device) # Image preprocessing, normalization for the pretrained resnet transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Build data loader train_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) val_loader = get_loader(args.image_dir_val, args.caption_path_val, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) TrainStdout = Logger('train.txt') ValStdout = Logger('val.txt') for epoch in range(args.start_epoch, args.epochs): if args.epochs_since_improvement == 20: break if args.epochs_since_improvement > 0 and args.epochs_since_improvement % 8 == 0: adjust_learning_rate(decoder_optimizer, 0.8) if args.fine_tune_encoder: adjust_learning_rate(encoder_optimizer, 0.8) train(train_loader=train_loader, encoder=encoder, decoder=decoder, criterion=criterion, encoder_optimizer=encoder_optimizer, decoder_optimizer=decoder_optimizer, epoch=epoch, stdout=TrainStdout) recent_bleu4 = validate(val_loader=val_loader, encoder=encoder, decoder=decoder, criterion=criterion, word_map=vocab.word2idx, stdout=ValStdout) is_best = recent_bleu4 > best_bleu4 best_bleu4 = max(recent_bleu4, best_bleu4) if not is_best: args.epochs_since_improvement += 1 print("\nEpoch since last improvement: %d\n" % (args.epochs_since_improvement, )) else: args.epochs_since_improvement = 0 save_checkpoint(args.data_name, epoch, args.epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer, recent_bleu4, is_best)
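# adjust_learning_rate, called above when validation BLEU-4 stops improving, is not
# defined in this section. A minimal sketch, assuming it shrinks the learning rate of
# every parameter group by the given factor (an assumption, not the project's code):
def adjust_learning_rate(optimizer, shrink_factor):
    """Multiply the learning rate of every parameter group by shrink_factor."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("New learning rate: {}".format(optimizer.param_groups[0]['lr']))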
def main(args): configure(os.path.join(args['exp_dir'], 'log_dir')) transform = transforms.Compose([ transforms.RandomCrop(args['crop_size']), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) data_loader = get_loader({ 'data_dir': args['data_dir'], 'exp_dir': args['exp_dir'], 'raw_data_dir': args['raw_data_dir'], 'batch_size': args['batch_size'], 'transform': transform, 'num_workers': args['num_workers'], 'shuffle': args['shuffle'], 'mode': 'train' }) # valid_data_loader=get_loader({'data_dir' : args['data_dir'], # 'raw_data_dir' : args['raw_data_dir'], # 'batch_size' : int(args['batch_size']/4), # 'transform' : transform, # 'num_workers' : args['num_workers'], # 'shuffle' : args['shuffle'], # 'mode':'validate'}) args['vocab_size'] = len(Vocabulary.load_vocab(args['exp_dir'])) encoder = EncoderCNN(args).train() decoder = DecoderRNN(args).train() if args['pretrained']: checkpoint_path = Checkpoint.get_latest_checkpoint(args['exp_dir']) checkpoint = Checkpoint.load(checkpoint_path) encoder.load_state_dict(checkpoint.encoder) decoder.load_state_dict(checkpoint.decoder) step = checkpoint.step epoch = checkpoint.epoch omit = True else: step = 0 epoch = 0 omit = False encoder.to(device) decoder.to(device) criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.bn.parameters()) # params=list(decoder.parameters()) + list(encoder.parameters()) optimizer = torch.optim.Adam(params, lr=args['lr']) scheduler = StepLR(optimizer, step_size=40, gamma=0.1) # optimizer=YFOptimizer(params) total_step = len(data_loader) min_valid_loss = float('inf') for epoch in range(epoch, args['num_epochs']): scheduler.step() for idx, (images, captions, leng) in enumerate(data_loader): if omit: if idx < (step - total_step * epoch): logger.info( 'idx:{},step:{}, epoch:{}, total_step:{}, diss:{}'. format(idx, step, epoch, total_step, step - total_step * epoch)) continue else: omit = False images = images.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, leng, batch_first=True)[0] features = encoder(images) outputs = decoder(features, captions, leng) loss = criterion(outputs, targets) decoder.zero_grad() encoder.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(decoder.parameters(), 5) optimizer.step() log_value('loss', loss.item(), step) step += 1 if step % args['log_step'] == 0: logger.info( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}' .format(epoch, args['num_epochs'], idx, total_step, loss.item(), np.exp(loss.item()))) if step % args['valid_step'] == 0: # valid_loss=validate(encoder.eval(),decoder,criterion,valid_data_loader) # if valid_loss<min_valid_loss: # min_valid_loss=valid_loss Checkpoint(encoder, decoder, optimizer, epoch, step).save(args['exp_dir'])
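# The loop above calls scheduler.step() at the top of each epoch; since PyTorch 1.1 the
# recommended order is to step the scheduler *after* the optimizer has stepped for that
# epoch. A minimal, self-contained sketch of that ordering (stand-in model and data,
# illustrative only):
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR

model = nn.Linear(10, 2)                      # stand-in for encoder/decoder
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=40, gamma=0.1)

for epoch in range(3):
    for batch in range(5):                    # stand-in for the data loader
        optimizer.zero_grad()
        loss = model(torch.randn(4, 10)).sum()
        loss.backward()
        optimizer.step()
    scheduler.step()                          # step once per epoch, after training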
def uploaded_file(filename): print("####Entry File Name", filename) PATH_TO_TEST_IMAGES_DIR = app.config['UPLOAD_FOLDER'] TEST_IMAGE_PATHS = [ os.path.join(PATH_TO_TEST_IMAGES_DIR, filename.format(i)) for i in range(1, 2) ] print("*******PRINT******", TEST_IMAGE_PATHS) transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) '''Load vocabulary wrapper''' with open('data/vocab.pkl', 'rb') as f: vocab = pickle.load(f) '''created instance to build models''' encoder = EncoderCNN( 256).eval() # eval mode (batchnorm uses moving mean/variance) decoder = DecoderRNN(256, 512, len(vocab), 1) encoder = encoder.to(device) decoder = decoder.to(device) ''' Load the trained model parameters EncoderCNN pickle- objects detection Decoder RNN pickle pretrained- sequence prediction ''' encoder.load_state_dict(torch.load('models/encoder-5-3000.pkl')) decoder.load_state_dict(torch.load('models/decoder-5-3000.pkl')) for img in TEST_IMAGE_PATHS: '''Prepare an image''' image = load_image(img, transform) image_tensor = image.to(device) '''Generate an caption from the image''' feature = encoder(image_tensor) sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids[0].cpu().numpy() '''Convert word_ids to words''' sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] if word == '<start>': #or word == '<end>':# or word == '.': continue if word == '<end>': break sampled_caption.append(word) sentence = ' '.join(sampled_caption) ''' Print the sentence in the console read the image and overlay the predicted text on the image save the result image route/return the saved image result location as output ''' print(sentence) print("FileName", img) image = Image.open(img) draw = ImageDraw.Draw(image) font = ImageFont.truetype( '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', size=15) (x, y) = (10, 10) color = 'rgb(244,208,63)' draw.text((x, y), sentence, fill=color, font=font) image.save('uploads/' + filename) return send_from_directory(app.config['UPLOAD_FOLDER'], filename)
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper (needed below for num_objects and the id -> name lookup)
    with open(args.inverse_object_id_mapping, 'rb') as f:
        inverse_object_id_mapping = pickle.load(f)
    num_objects = len(inverse_object_id_mapping.keys())

    # Build models
    encoderCNN = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    encoderRNN = EncoderRNN(num_objects, args.embed_size, args.hidden_size)
    model = Model(num_objects, args.embed_size)
    encoderCNN = encoderCNN.to(device)
    encoderRNN = encoderRNN.to(device)
    model = model.to(device)
    encoderCNN.eval()
    encoderRNN.eval()
    model.eval()

    # Load the trained model parameters
    encoderCNN.load_state_dict(torch.load(args.encoderCNN_path))
    encoderRNN.load_state_dict(torch.load(args.encoderRNN_path))
    model.load_state_dict(torch.load(args.model_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    image_features = encoderCNN(image_tensor)
    input = torch.LongTensor([[[1]]]).to(device)
    h0 = torch.zeros((1, 1, args.hidden_size)).to(device)
    c0 = torch.zeros((1, 1, args.hidden_size)).to(device)
    max_seqlen = 10
    result = []
    K = 3
    all_candidates = [([1], 1.0, h0, c0) for i in range(K)]
    for i in range(max_seqlen):
        Q = []
        for _k in range(K):
            if i == 0 and _k == 1:  # first word
                break
            hashtag_features, (h0, c0), Ul = encoderRNN(input[_k],
                                                        all_candidates[_k][2],
                                                        all_candidates[_k][3])
            outputs = model(image_features, hashtag_features, Ul)
            prob, topk = torch.topk(outputs, 20, dim=1)
            tup = list(zip(topk[0].cpu().tolist(), prob[0].cpu().tolist()))
            topk = [a for a in tup if a[0] not in all_candidates[_k][0]]
            try:
                topk.remove(1)
                topk.remove(0)
            except:
                pass
            for _k_ in range(K):
                Q.append((all_candidates[_k][0] + [topk[_k_][0]],
                          abs(all_candidates[_k][1] * topk[_k_][1]),
                          h0, c0))
        all_candidates = sorted(Q, key=lambda x: x[1], reverse=True)[:K]
        input = []
        for _k in range(K):
            input.append([[all_candidates[_k][0][-1]]])
        input = torch.LongTensor(input).to(device)
        # result.append(top1.cpu().numpy()[0][0])

    result = sorted(all_candidates, key=lambda x: x[1], reverse=True)
    result = [i[0] for i in result]
    print(result)
    for i in range(K):
        tmp = [inverse_object_id_mapping[j] for j in result[i]]
        final = zip([j['name'] for j in tmp], [j['supercategory'] for j in tmp])
        for j in final:
            print(j)
        print("-" * 50)

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def main(args): # Image preprocessing transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build models encoder = EncoderCNN(args.embed_size).eval( ) # eval mode (batchnorm uses moving mean/variance) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) encoder = encoder.to(device) decoder = decoder.to(device) # Load the trained model parameters encoder.load_state_dict(torch.load(args.encoder_path)) decoder.load_state_dict(torch.load(args.decoder_path)) # Prepare an image image = load_image(args.image, transform) image_tensor = image.to(device) # Generate an caption from the image feature = encoder(image_tensor) sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids[0].cpu().numpy( ) # (1, max_seq_length) -> (max_seq_length) print(sampled_ids) # Convert word_ids to words sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] #print(word_id) sampled_caption.append(word) if word == '<end>': break sentence = ' '.join(sampled_caption) # Print out the image and the generated caption print(sentence) #add #message = raw_input("Enter message to encode: ") #print("Decoded string (in ASCII):") #for ch in sentence: # print(ord(ch)) # print("\t") sen = list(sentence.split(" ")) # print(sen) sen1 = sen[1:-1] #print([sen1]) #end image = Image.open(args.image) plt.imshow(np.asarray(image)) #print(args.image) #add2 if args.image == "png/ex1.jpg": caption = [ 'a', 'picture', 'of', 'an', 'elephant', 'on', 'a', 'road', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex4.jpg": caption = [ 'a', 'man', 'is', 'sitting', 'at', 'a', 'table', 'with', 'a', 'laptop', 'on', 'it' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex2.jpg": caption = [ 'a', 'man', 'holding', 'tennis', 'racket', 'in', 'a', 'tennis', 'court' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex3.jpg": caption = [ 'a', 'man', 'and', 'woman', 'are', 'standing', 'near', 'a', 'beach', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex5.jpg": caption = [ 'a', 'group', 'of', 'people', 'sitting', 'in', 'a', 'room', 'working' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex6.jpg": caption = ['a', 'man', 'playing', 'tennis', 'in', 'a', 'court'] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex7.jpg": caption = [ 'a', 'fire', 'hydrant', 'is', 'on', 'a', 'snowy', 'streets', 'with', 'trees', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex8.jpg": caption = [ 'an', 'indoor', 'court', 'with', 'table', 'tennis', 'tables' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex9.jpg": caption = [ 'a', 'man', 'sitting', 'at', 'a', 'table', 'talking', 'to', 'another', 'man', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex10.jpg": caption = [ 'a', 'cat', 'is', 'sitting', 'on', 'floor', 'with', 'a', 'man', 'standing', 'behind', 'it' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/img5.jpg": caption = [ 'a', 'vase', 'filled', 'with', 'flowers', 'on', 'a', 'table', '.' 
] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/img10.jpg": caption = [ 'a', 'woman', 'is', 'sitting', 'at', 'a', 'table', 'with', 'a', 'cake', 'on', 'it', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/img18.jpg": caption = ['a', 'person', 'holding', 'a', 'coconut', '.'] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex12.jpg": caption = ['motocycles', 'parked', 'in', 'a', 'parking', 'lot', '.'] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex13.jpg": caption = [ 'a', 'zebra', 'standing', 'next', 'to', 'a', 'zebra', 'on', 'an', 'ice', 'road', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex14.jpg": caption = [ 'a', 'black', 'dog', 'and', 'two', 'cats', 'laying', 'on', 'a', 'bed', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex16.jpg": caption = [ 'a', 'woman', 'is', 'cutting', 'apples', 'at', 'a', 'table', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex16.jpg": caption = [ 'a', 'woman', 'is', 'cutting', 'apples', 'at', 'a', 'table', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex19.jpg": caption = [ 'a', 'black', 'bear', 'is', 'walking', 'through', 'a', 'stony', 'road', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex21.jpg": caption = ['a', 'table', 'with', 'many', 'plates', 'of', 'food', '.'] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex22.jpg": caption = [ 'a', 'brown', 'bear', 'is', 'sitting', 'in', 'the', 'graph', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex24.jpg": caption = [ 'a', 'group', 'of', 'people', 'playing', 'in', 'a', 'field', 'with', 'a', 'frisbee', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/ex25.jpg": caption = [ 'a', 'group', 'of', 'sheep', 'standing', 'in', 'a', 'field', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/example2.jpg": caption = [ 'a', 'truck', 'and', 'a', 'car', 'parked', 'in', 'a', 'parking', 'lot', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1) elif args.image == "png/puppy.jpg": caption = [ 'a', 'dog', 'is', 'laying', 'on', 'the', 'floor', 'with', 'a', 'pillow', 'at', 'its', 'side', '.' ] print(caption) score1 = bluescore([sen1], caption) print(score1)
def evaluate_with_beam_search(args): transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) dataset = Dataset({ 'data_dir': args['data_dir'], 'exp_dir': args['exp_dir'], 'raw_data_dir': args['raw_data_dir'], 'transform': transform, 'mode': 'test' }) args['vocab_size'] = len(dataset.vocab) encoder = EncoderCNN(args).eval() decoder = DecoderRNN(args).eval() encoder = encoder.to(device) decoder = decoder.to(device) encoder.load_state_dict( torch.load(os.path.join(args['model_path'], 'encoder.pt'))) decoder.load_state_dict( torch.load(os.path.join(args['model_path'], 'decoder.pt'))) generated_captions = [] image_ids = [] target_captions = [] for idx in range(len(dataset.ids)): image_id, image, captions = dataset.get_test_item(idx) image = image.to(device) print(idx) features = encoder(image) generated_sents = decoder.decode_with_beam_search(features) # print(generated_sents) sents = [] for sent_id in generated_sents: words = [] for word_id in sent_id[0]: if dataset.vocab.idx2word[word_id] == '<start>': continue elif dataset.vocab.idx2word[word_id] != '<end>': words.append(dataset.vocab.idx2word[word_id]) else: break sents.append((' '.join(words), sent_id[1] / len(sent_id[0]))) sents = sorted(sents, key=lambda x: x[1], reverse=True) generated_captions.append(sents) image_ids.append(image_id) target_captions.append(captions) image_captions = [{ 'image_id': image_ids[idx], 'caption': generated_captions[idx][0][0] } for idx in range(len(image_ids))] captions_path = os.path.join(args['exp_dir'], args['model_dir'], args['caption_fils']) image_caption_path = os.path.join(args['exp_dir'], args['model_dir'], args['evaluation_file']) with open(captions_path, 'w') as f: for idx in range(len(generated_captions)): f.write('*' * 50 + '\n') f.write('-' * 20 + 'generated_captions' + '-' * 20 + '\n') for sent in generated_captions[idx]: f.write(sent[0] + '\n') f.write('-' * 20 + 'target_captions' + '-' * 20 + '\n') for words in target_captions[idx]: f.write(' '.join(words) + '\n') f.write('*' * 50 + '\n') f.write('\n') with open(image_caption_path, 'w') as f: json.dump(image_captions, f)
def main(args): threshold = 20 captions_dict = load_captions(train_dir) vocab = Vocabulary(captions_dict, threshold) vocab_size = vocab.index # Image preprocessing, normalization for the pretrained resnet transform = transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ]) dataloader = DataLoader(val_dir, vocab, transform) imagenumbers, captiontotal, imagetotal = dataloader.gen_data() # Build data loader data_loader = get_loader(imagenumbers, captiontotal, imagetotal, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build models encoder = EncoderCNN(args.embed_size).eval() decoder = DecoderRNN(args.embed_size, args.hidden_size, vocab_size, args.num_layers) encoder = encoder.to(device) decoder = decoder.to(device) # Load the trained model parameters encoder.load_state_dict(torch.load(encoder_path)) decoder.load_state_dict(torch.load(decoder_path)) # Build data loader total_step = len(data_loader) # List to score the BLEU scores bleu_scores = [] for i, (images, captions, lengths) in enumerate(data_loader): # Set mini-batch dataset images = images.to(device) # captions = captions.to(device) # Generate an caption from the image feature = encoder(images) sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids[0].cpu().numpy() # Convert word_ids to words sampled_caption = [] for word_id in sampled_ids: word = vocab.get_word(word_id) sampled_caption.append(word) if word == '<end>': break sentence = ' '.join(sampled_caption) score = sentence_bleu([captions], sentence, args.bleu_weights) bleu_scores.append(score) # Print log info if i % args.log_step == 0: print('Finish [{}/{}], Current BLEU Score: {:.4f}'.format( i, total_step, np.mean(bleu_scores))) print(sentence) print(captions) np.save('test_results.npy', [bleu_scores, np.mean(bleu_scores)])
def _main():
    parser = argparse.ArgumentParser()
    parser.add_argument("filename",
                        help="(optional) path to photograph, for which a caption will be generated",
                        nargs="?")
    parser.add_argument("--host",
                        help="(optional) host to start a webserver on. Default: 0.0.0.0",
                        nargs="?", default="0.0.0.0")
    parser.add_argument("--port",
                        help="(optional) port to start a webserver on. http://hostname:port/query",
                        nargs="?", type=int, default=1985)
    parser.add_argument("--verbose", "-v",
                        help="print verbose query information",
                        action="store_true")
    global _args
    _args = parser.parse_args()
    if not _args.filename and not _args.port:
        parser.print_help()
        sys.exit(-1)

    global _device
    _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("PyTorch device = ", _device)

    # Load the vocabulary dictionary
    vocab_threshold = None  # no trailing comma: a trailing comma would turn this into a tuple
    vocab_file = "./vocab.pkl"
    start_word = "<start>"
    end_word = "<end>"
    unk_word = "<unk>"
    load_existing_vocab = True
    # annotations_file = "/opt/cocoapi/annotations/captions_train2014.json"
    annotations_file = None

    print("Loading vocabulary...")
    global _vocab
    _vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word,
                        unk_word, annotations_file, load_existing_vocab)
    vocab_size = len(_vocab)
    print("Vocabulary contains %d words" % vocab_size)

    # Load pre-trained models:
    #   encoder (Resnet + embedding layers)
    #   decoder (LSTM)
    global _encoder
    global _decoder
    encoder_path = os.path.join("./models/", _encoder_file)
    decoder_path = os.path.join("./models/", _decoder_file)

    print("Loading ", encoder_path)
    _encoder = EncoderCNN(_embed_size)
    _encoder.load_state_dict(torch.load(encoder_path))
    _encoder.eval()
    _encoder.to(_device)

    print("Loading ", decoder_path)
    _decoder = DecoderRNN(_embed_size, _hidden_size, vocab_size, _num_layers)
    _decoder.load_state_dict(torch.load(decoder_path))
    _decoder.eval()
    _decoder.to(_device)

    # Caption the photo, or start a web server if no photo specified
    if _args.filename:
        _get_prediction_from_file(_args.filename)
    else:
        global _app
        global _api
        _app = Flask(__name__)
        _api = Api(_app)
        _api.add_resource(ImageCaptionResource, "/v1/caption", "/v1/caption/")
        _app.run(host=_args.host, port=_args.port)
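# ImageCaptionResource, registered at /v1/caption above, is not shown in this section.
# A minimal sketch of such a flask_restful resource (the 'image' form field, the
# temporary path, and the reuse of _get_prediction_from_file as a caption-returning
# helper are assumptions, not the project's code):
import tempfile

from flask import request
from flask_restful import Resource


class ImageCaptionResource(Resource):
    def post(self):
        # Expect the photo as a multipart/form-data file field named 'image'.
        uploaded = request.files["image"]
        with tempfile.NamedTemporaryFile(suffix=".jpg") as tmp:
            uploaded.save(tmp.name)
            caption = _get_prediction_from_file(tmp.name)  # assumed to return the caption string
        return {"caption": caption}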
def main():
    ####################################################
    # config
    ####################################################
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    config = {}
    config['dataset'] = 'COCO'
    config['vocab_word2idx_path'] = './vocab/save/' + 'COCO' + '/vocab/' + 'thre5_word2idx.pkl'
    config['vocab_idx2word_path'] = './vocab/save/' + 'COCO' + '/vocab/' + 'thre5_idx2word.pkl'
    config['vocab_idx_path'] = './vocab/save/' + 'COCO' + '/vocab/' + 'thre5_idx.pkl'
    config['crop_size'] = 224
    config['images_root'] = './data/COCO/train2014_resized'
    config['json_file_path_train'] = './data/COCO/annotations/captions_mini100.json'
    config['json_file_path_val'] = './data/COCO/annotations/captions_val2014.json'
    config['batch_size'] = 128
    config['embed_size'] = 256
    config['hidden_size'] = 512
    config['learning_rate'] = 1e-4
    config['epoch_num'] = 20
    config['save_step'] = 10
    config['model_save_root'] = './save/'
    # These should point at trained checkpoint files, not just the save directory.
    config['encoder_path'] = './save/'
    config['decoder_path'] = './save/'

    ####################################################
    # load vocabulary
    ####################################################
    vocab = Vocabulary()
    with open(config['vocab_word2idx_path'], 'rb') as f:
        vocab.word2idx = pickle.load(f)
    with open(config['vocab_idx2word_path'], 'rb') as f:
        vocab.idx2word = pickle.load(f)
    with open(config['vocab_idx_path'], 'rb') as f:
        vocab.idx = pickle.load(f)

    ####################################################
    # create data_loader
    ####################################################
    normalize = {
        'Flickr8k': [(0.4580, 0.4464, 0.4032), (0.2318, 0.2229, 0.2269)],
        'Flickr30k': None,
        'COCO': [(0.485, 0.456, 0.406), (0.229, 0.224, 0.225)]
    }
    transform = transforms.Compose([
        transforms.RandomCrop(config['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(normalize[config['dataset']][0],
                             normalize[config['dataset']][1])
    ])
    loader_train = get_loader(dataset_name=config['dataset'],
                              images_root=config['images_root'],
                              json_file_path=config['json_file_path_train'],
                              vocab=vocab,
                              transform=transform,
                              batch_size=config['batch_size'],
                              shuffle=True,
                              is_train=True)
    loader_val = get_loader(dataset_name=config['dataset'],
                            images_root=config['images_root'],
                            json_file_path=config['json_file_path_val'],
                            vocab=vocab,
                            transform=transform,
                            batch_size=1,
                            shuffle=False,
                            is_val=True)

    ####################################################
    # create model
    ####################################################
    encoder = EncoderCNN(config['embed_size'])
    decoder = DecoderRNN(config['embed_size'], config['hidden_size'], len(vocab), 1)
    encoder.load_state_dict(torch.load(config['encoder_path']))
    decoder.load_state_dict(torch.load(config['decoder_path']))
    encoder.to(device)
    decoder.to(device)

    ####################################################
    # evaluate on the validation set
    ####################################################
    raw_captions = []
    sampled_captions = []
    encoder.eval()
    decoder.eval()
    for i, (image, caption, length) in enumerate(tqdm(loader_val)):
        image = image.to(device)
        feature = encoder(image)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<END>':
                break
        raw_caption = [[vocab(int(token)) for token in list(caption[0])]]

        sampled_caption = sampled_caption[1:-1]  # delete <START> and <END>
        # if sampled_caption[-1] != '.':
        #     sampled_caption.append('.')
        raw_caption[0] = raw_caption[0][1:-1]  # delete <START> and <END>
        raw_captions.append(raw_caption)
        sampled_captions.append(sampled_caption)

    # Compute corpus BLEU over the sampled vs. reference captions
    hypo = {}
    for i, caption in enumerate(sampled_captions):
        hypo[i] = [' '.join(caption)]
    ref = {}
    for i, caption in enumerate(raw_captions):
        ref[i] = [' '.join(caption[0])]
    final_scores = Bleu().compute_score(ref, hypo)
    print(final_scores[0])
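# A minimal standalone sketch of the Bleu interface used above (assuming a
# pycocoevalcap-style scorer): both dicts map an example id to a list of
# caption strings, and compute_score returns (corpus_scores, per_example_scores),
# where corpus_scores is [BLEU-1, BLEU-2, BLEU-3, BLEU-4]. The example data
# below is illustrative only.
def _bleu_usage_sketch():
    ref = {0: ['a man rides a wave on a surfboard']}
    hypo = {0: ['a man riding a wave on a surfboard']}
    corpus_scores, per_example_scores = Bleu(4).compute_score(ref, hypo)
    print(corpus_scores)  # four values: BLEU-1 through BLEU-4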
test_dataset = CategoryDataset(
    transform=transform,
    data_file="test_no_dup_with_category_3more_name.json",
    use_mean_img=False,
    neg_samples=False)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    collate_fn=lstm_collate_fn,
)
###############################################################################
encoder_cnn = EncoderCNN(emb_size)
encoder_cnn = encoder_cnn.to(device)
if model == "lstm":
    f_rnn = LSTMModel(emb_size, emb_size, emb_size, device, bidirectional=False)
    b_rnn = LSTMModel(emb_size, emb_size, emb_size, device, bidirectional=False)
    f_rnn = f_rnn.to(device)
    b_rnn = b_rnn.to(device)
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    ###### Prepare a batch of test images #########
    data_loader = get_loader(args.image_path, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    actual = []
    predicted = []
    count = 0
    pdict = {}
    adict = {}
    for i, (images, captions, lengths) in enumerate(data_loader):
        # print(captions.shape, lengths)
        images = images.to(device)
        captions = captions.to(device)
        features = encoder(images)
        outputs = decoder.sample(features)

        for bnum in range(len(outputs)):
            # Decode the sampled word ids into a caption string
            output = outputs[bnum].cpu().numpy()
            predicted_array = []
            for wid in output:
                word = vocab.idx2word[wid]
                if word == '<end>':
                    break
                predicted_array.append(word)
            predicted.append(' '.join(predicted_array))

            # Decode the ground-truth caption for the same image
            actual_caption = captions[bnum].cpu().numpy()
            actual_arr = []
            for wid in actual_caption:
                word = vocab.idx2word[wid]
                actual_arr.append(word)
            actual.append(' '.join(actual_arr))

        # Periodically flush the accumulated results to disk
        if count % 128 == 0:
            pdict['output'] = predicted
            adict['output'] = actual
            with open('test_set_results/test_prediction.json', 'w') as outfile:
                json.dump(pdict, outfile)
            with open('test_set_results/test_actual.json', 'w') as outfile:
                json.dump(adict, outfile)
        count = count + 1

    # Write the final results
    pdict['output'] = predicted
    adict['output'] = actual
    with open('test_set_results/test_prediction.json', 'w') as outfile:
        json.dump(pdict, outfile)
    with open('test_set_results/test_actual.json', 'w') as outfile:
        json.dump(adict, outfile)
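# Illustrative follow-up (not part of the original script): score the dumped
# JSON files with NLTK's corpus BLEU. Assumes nltk is installed and that the
# stored strings are whitespace-tokenizable; <start>/<end>/<pad>/<unk> tokens
# are stripped before scoring.
import json

from nltk.translate.bleu_score import corpus_bleu


def score_dumped_captions(pred_path='test_set_results/test_prediction.json',
                          actual_path='test_set_results/test_actual.json'):
    with open(pred_path) as f:
        predicted = json.load(f)['output']
    with open(actual_path) as f:
        actual = json.load(f)['output']

    specials = {'<start>', '<end>', '<pad>', '<unk>'}
    hypotheses = [[w for w in p.split() if w not in specials] for p in predicted]
    references = [[[w for w in a.split() if w not in specials]] for a in actual]
    return corpus_bleu(references, hypotheses)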
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    print('Loading vocab ')
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    print('Building models')
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers,
                         use_inference=args.use_inference).eval()
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    print('Loading models')
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    frame_rate = 5.0  # frame rate
    im_size = 500
    w, h = 1000, 800
    size = (w, h)  # video frame size
    fmt = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')  # file format (mp4 here)
    writer = cv2.VideoWriter(args.out_file, fmt, frame_rate, size)  # create the video writer

    results_dict = {}
    accounts = [
        p.split('/')[-1].split('.')[0] for p in glob.glob('data/annos/*')
    ]
    # accounts = args.use_account
    for user in accounts:
        print(user)
        ann_path = os.path.join(f"data/annos/{user}.pickle")
        annos = loadPickle(ann_path)
        if args.split == 'train':
            annos = annos[:-20]
        elif args.split == 'val':
            annos = annos[-20:]
        m = min(len(annos), 100)  # number of images
        for i in range(m):
            ann = annos[i]
            image_path = f'data/images/{ann["filename"]}'
            if args.split == 'val' and image_path in duplicated_images_path:
                print('is duplicated image.')
            else:
                orig_text = ann['text']
                image = load_image(image_path, transform)
                # image = load_image(args.image, transform)
                image_tensor = image.to(device)

                # Generate a caption from the image
                feature = encoder(image_tensor)
                sampled_ids = decoder.sample(feature)
                sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

                # Convert word_ids to words
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption)

                # Print out the generated caption
                # print(type(sentence))
                print(sentence)
                # print(sentence.encode('utf_8'))
                # image = Image.open(args.image)
                # plt.imshow(np.asarray(image))

                # Compose the video frame: input image plus GT and generated captions
                img = cv2.imread(image_path)
                img = resize_square_pad(img)
                frame = np.zeros((h, w, 3)).astype('uint8')
                frame[h - im_size:, :im_size, :] = img
                frame = cv2.putText(frame, image_path, (10, h - 20),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                    (0, 255, 0), 1, cv2.LINE_AA)
                n = 20
                s = 'GT: \n'
                for i in range(len(orig_text) // n + 1):
                    s += orig_text[n * i:n * (i + 1)] + '\n'
                frame = puttext(frame, s, point=(15, 20), color=(255, 255, 255))

                s = 'Result: \n'
                res = sentence.replace('<start>', '').replace('<end>', '').replace(' ', '')
                for i in range(len(res) // n + 1):
                    s += res[n * i:n * (i + 1)] + '\n'
                frame = puttext(frame, s,
                                point=(15, (h - im_size) / 2 + 20),
                                color=(0, 255, 0))

                # Look for an image whose caption is similar to the generated result
                sim_image_file = None
                for text in text2file:
                    if (res[:8] in text) or (res[-8:] in text):
                        sim_image_file = text2file[text]
                        break
                if sim_image_file is not None:
                    img = cv2.imread(sim_image_file)
                    img = resize_square_pad(img)
                    frame[h - im_size:, im_size:, :] = img
                    frame = cv2.putText(frame, sim_image_file,
                                        (10 + im_size, h - 20),
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                        (0, 255, 0), 1, cv2.LINE_AA)
                    s = 'Similar data: \n'
                    for i in range(len(text) // n + 1):
                        s += text[n * i:n * (i + 1)] + '\n'
                    frame = puttext(frame, s,
                                    point=(15 + im_size, (h - im_size) / 2 + 20),
                                    color=(0, 0, 255))
                sim_image_file = None

                writer.write(frame)
                # Record the result
                results_dict[image_path] = res

    writer.release()
    # Save the results
    with open(args.result_out_file, 'wb') as f:
        pickle.dump(results_dict, f)
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    ###### Prepare a batch of test images #########
    data_loader = get_loader(args.image_path, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    arr_predicted = []
    arr_actual = []
    count = 0
    f_a = open('actual_label_coco.txt', 'w+')
    f_p = open('predict_label_coco.txt', 'w+')
    for i, (images, captions, lengths) in enumerate(data_loader):
        # print(captions.shape, lengths)
        images = images.to(device)
        captions = captions.to(device)
        features = encoder(images)
        outputs = decoder.sample(features)

        # Iterate over the actual batch size rather than a hard-coded 128,
        # so the final (smaller) batch does not raise an IndexError.
        for bnum in range(len(outputs)):
            output = outputs[bnum].cpu().numpy()
            predicted_array = []
            for wid in output:
                word = vocab.idx2word[wid]
                if word == '<end>':
                    break
                predicted_array.append(word)
            predicted = ' '.join(predicted_array)
            f_p.write(predicted)
            f_p.write('\n')

            actual_caption = captions[bnum].cpu().numpy()
            actual_arr = []
            for wid in actual_caption:
                word = vocab.idx2word[wid]
                actual_arr.append(word)
            actual = ' '.join(actual_arr)
            f_a.write(actual)
            f_a.write('\n')

            print(predicted)
            print(actual)
            print(images[bnum].cpu().numpy().transpose(1, 2, 0).shape)
            # Note: scipy.misc.imsave was removed in SciPy 1.2+; see the sketch
            # below for one way to save these (still normalized) images instead.
            scipy.misc.imsave('temp_dir/{}.jpg'.format(bnum),
                              images[bnum].cpu().numpy().transpose(1, 2, 0))
        count = count + 1
    f_a.close()
    f_p.close()
    return
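# Illustrative alternative to the deprecated scipy.misc.imsave call above
# (removed in SciPy 1.2): un-normalize with the same ImageNet statistics used
# in the transform, then save with PIL. This is a sketch, not part of the
# original script; the output directory is assumed to exist.
import numpy as np
from PIL import Image


def save_normalized_tensor(image_tensor, path):
    """Save a (3, H, W) tensor normalized with ImageNet mean/std as an image file."""
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img = image_tensor.cpu().numpy().transpose(1, 2, 0)  # (H, W, 3)
    img = (img * std + mean).clip(0.0, 1.0)              # undo normalization
    Image.fromarray((img * 255).astype(np.uint8)).save(path)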