def main(args):
    """Generate a caption for a single image using a trained checkpoint.

    Loads the vocabulary and encoder/decoder weights from
    ``args.checkpoint_file``, preprocesses the image at ``args.image_path``,
    samples a caption from the decoder and prints it.
    """
    vocab = load_vocab()
    encoder = CNNEncoder()
    decoder = DecoderRNN(512, 512, len(vocab))

    encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
        args.checkpoint_file, False)
    encoder.load_state_dict(encoder_state_dict)
    decoder.load_state_dict(decoder_state_dict)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Deterministic inference preprocessing: no RandomHorizontalFlip here —
    # random augmentation belongs in training and would make the produced
    # caption depend on a coin flip.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    inp = cv2.imread(args.image_path)
    if inp is None:
        # cv2.imread silently returns None on a missing/unreadable file.
        raise FileNotFoundError('Could not read image: %s' % args.image_path)
    # cv2.imread yields BGR; the normalization constants above are the
    # ImageNet RGB statistics, so convert before building the PIL image.
    inp = cv2.cvtColor(inp, cv2.COLOR_BGR2RGB)
    inp = transform(Image.fromarray(inp)).unsqueeze(0)
    inp = utils.to_var(inp, volatile=True)  # inference only: no gradients

    features = encoder(inp)
    sampled_ids = decoder.sample(features)
    sampled_ids = sampled_ids.cpu().data.numpy()[0]
    sentence = utils.convert_back_to_text(sampled_ids, vocab)
    print('Caption:', sentence)
def main(args):
    """Build an encoder/decoder pair, a training data loader and an SGD
    optimizer over all trainable parameters."""
    print("Process %s, running on %s: starting (%s)" % (
        os.getpid(), os.name, time.asctime()))

    encoder = EncoderCNN()
    decoder = DecoderRNN()
    if torch.cuda.is_available() and args.gpu:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # Hand only gradient-requiring parameters to the optimizer, encoder
    # parameters first, then decoder parameters.
    trainable = [p
                 for model in (encoder, decoder)
                 for p in model.parameters()
                 if p.requires_grad]

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    data_loader = trainloader(transform=transform)
    optimizer = torch.optim.SGD(params=trainable, lr=args.lr, momentum=0.9)
def main(args):
    """Train the CNN encoder + attention RNN decoder on COCO captions.

    Optionally resumes from ``args.checkpoint_file``; with ``args.sample``
    it only runs sampling over the validation set.  Checkpoints and loss
    histories are written to ``args.checkpoint_dir``.
    """
    # hyperparameters
    batch_size = args.batch_size
    num_workers = 1

    # Image Preprocessing (the random flip is a training-time augmentation)
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # load COCOs dataset
    IMAGES_PATH = 'data/train2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_train2014.json'
    vocab = load_vocab()
    train_loader = get_coco_data_loader(path=IMAGES_PATH,
                                        json=CAPTION_FILE_PATH,
                                        vocab=vocab,
                                        transform=transform,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        num_workers=num_workers)

    IMAGES_PATH = 'data/val2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_val2014.json'
    val_loader = get_coco_data_loader(path=IMAGES_PATH,
                                      json=CAPTION_FILE_PATH,
                                      vocab=vocab,
                                      transform=transform,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_workers=num_workers)

    losses_val = []
    losses_train = []

    # Build the models
    ngpu = 1
    initial_step = initial_epoch = 0
    embed_size = args.embed_size
    num_hiddens = args.num_hidden
    learning_rate = 5e-4
    num_epochs = 2
    log_step = args.log_step
    save_step = 500
    checkpoint_dir = args.checkpoint_dir

    encoder = CNNEncoder()
    decoder = DecoderRNN(embed_size, num_hiddens, len(vocab))

    # Loss
    criterion = nn.CrossEntropyLoss()

    if args.checkpoint_file:
        encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
            args.checkpoint_file, args.sample)
        initial_step, initial_epoch, losses_train, losses_val = meta
        encoder.load_state_dict(encoder_state_dict)
        decoder.load_state_dict(decoder_state_dict)
    else:
        # Only the decoder and the encoder's batchnorm are trained; the CNN
        # backbone stays frozen.
        params = list(decoder.parameters()) + list(
            encoder.batchnorm.parameters())
        optimizer = torch.optim.Adam(params, lr=learning_rate)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    if args.sample:
        return utils.sample(encoder, decoder, vocab, val_loader)

    # Train the Models
    total_step = len(train_loader)
    # Pre-seed step/epoch so the `finally` save below cannot hit an
    # UnboundLocalError if we fail before the first iteration.
    step, epoch = initial_step, initial_epoch
    try:
        for epoch in range(initial_epoch, num_epochs):
            for step, (images, captions, lengths) in enumerate(
                    train_loader, start=initial_step):
                # Set mini-batch dataset.  FIX: training inputs must NOT be
                # volatile — volatile=True disables gradients and would break
                # optimization of encoder.batchnorm.
                images = utils.to_var(images)
                captions = utils.to_var(captions)
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]

                # Forward, Backward and Optimize
                decoder.zero_grad()
                encoder.zero_grad()

                if ngpu > 1:
                    # run on multiple GPU
                    features = nn.parallel.data_parallel(
                        encoder, images, range(ngpu))
                    outputs, alphas = nn.parallel.data_parallel(
                        decoder, features, range(ngpu))
                else:
                    # run on single GPU
                    features = encoder(images)
                    outputs, alphas = decoder(features, captions, lengths)

                # FIX: keep targets on the same device as outputs; the old
                # targets.cpu() raised a device mismatch under CUDA.
                train_loss = criterion(outputs, targets)
                # Doubly-stochastic attention regularizer: encourage the
                # attention weights over each location to sum to 1.
                train_loss += ((1. - alphas.sum(dim=1)) ** 2).mean()
                losses_train.append(train_loss.data)

                train_loss.backward()
                optimizer.step()

                print('Epoch: {} - Step: {} - Train Loss: {}'.format(
                    epoch, step, losses_train[-1]))

                # Run validation set and predict every `log_step` steps.
                # FIX: the original `step % log_step == 404` made this branch
                # effectively unreachable.
                if step % log_step == 0:
                    encoder.batchnorm.eval()
                    # run validation set
                    batch_loss_val = []
                    for val_step, (images, captions, lengths) in enumerate(val_loader):
                        # Validation is inference-only: volatile is correct here.
                        images = utils.to_var(images, volatile=True)
                        captions = utils.to_var(captions, volatile=True)
                        targets = pack_padded_sequence(captions, lengths,
                                                       batch_first=True)[0]
                        features = encoder(images)
                        outputs, alphas = decoder(features, captions, lengths)
                        val_loss = criterion(outputs, targets)
                        val_loss += ((1. - alphas.sum(dim=1)) ** 2).mean()
                        batch_loss_val.append(val_loss.data)

                        if val_step % 50 == 0:
                            print('Epoch: {} - Step: {} - Mini Eval Loss: {}'.format(
                                epoch, val_step, val_loss))
                            sampled_ids = decoder.sample(features)
                            sampled_ids = sampled_ids.cpu().data.numpy()[0]
                            sentence = utils.convert_back_to_text(
                                sampled_ids, vocab)
                            print('Sample:', sentence)

                            true_ids = captions.cpu().data.numpy()[0]
                            sentence = utils.convert_back_to_text(
                                true_ids, vocab)
                            print('Target:', sentence)

                    losses_val.append(np.mean(batch_loss_val))

                    # predict
                    print('Epoch: {} - Step: {} - Eval Loss: {}'.format(
                        epoch, step, losses_val[-1]))
                    encoder.batchnorm.train()

                # Save the models
                if (step + 1) % save_step == 0:
                    utils.save_models(encoder, decoder, optimizer, step,
                                      epoch, losses_train, losses_val,
                                      checkpoint_dir)
                    utils.dump_losses(
                        losses_train, losses_val,
                        os.path.join(checkpoint_dir, 'losses.pkl'))

    except KeyboardInterrupt:
        pass
    finally:
        # Do final save
        utils.save_models(encoder, decoder, optimizer, step, epoch,
                          losses_train, losses_val, checkpoint_dir)
        utils.dump_losses(losses_train, losses_val,
                          os.path.join(checkpoint_dir, 'losses.pkl'))
# Tail of an evaluate() helper whose beginning lies outside this chunk:
# move the next decoder input to the GPU when available and return the
# words decoded so far.  (Indent level of the fragment is assumed — the
# enclosing def is not visible here; TODO confirm against the full file.)
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    return decoded_words  #, decoder_attentions[:di + 1]


def evaluateRandomly(encoder, decoder, n=10):
    # Qualitative spot-check: sample `n` random sentence pairs and print the
    # source ('>'), the reference translation ('='), and the model output ('<').
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')


# Script entry: prepare the eng->fra corpus, build encoder/decoder,
# train, then run the qualitative evaluation above.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
# Tensor of all output-vocabulary indices; purpose not evident from this
# chunk — presumably used elsewhere (e.g. for sampling/noise). TODO confirm.
noise = torch.Tensor(list(range(output_lang.n_words)))
print(random.choice(pairs))

hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words, 1)
if use_cuda:
    encoder1 = encoder1.cuda()
    decoder1 = decoder1.cuda()

trainIters(encoder1, decoder1, 25000, print_every=50)
evaluateRandomly(encoder1, decoder1, 20)
# Build Dataset Loader train_loader = get_loader(train_image_path, train_json_path, vocab, transform, batch_size=batch_size, shuffle=True, num_workers=2) total_step = len(train_loader) # Build Models encoder = EncoderCNN(embed_size) decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers) encoder.cuda() decoder.cuda() # Loss and Optimizer criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate) # Train the Decoder for epoch in range(num_epochs): for i, (images, captions, lengths) in enumerate(train_loader): # Set mini-batch dataset images = Variable(images).cuda() captions = Variable(captions).cuda() targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, Backward and Optimize decoder.zero_grad()