def main(hparams: HParams):
    """Generate a caption for every image in ``hparams.img_dir`` and print it.

    Loads the text preprocessor and the encoder/decoder weights from the paths
    given in ``hparams``, decodes each image with greedy or beam search
    (selected by ``hparams.decoder``), and prints the image name followed by
    the space-joined caption tokens.

    Args:
        hparams: Run configuration. Fields read here: ``gpus``,
            ``text_preprocessor_path``, ``crop_size``, ``hidden_dim``,
            ``embed_dim``, ``style_dim``, ``num_layers``, ``checkpoint_path``,
            ``img_dir``, ``decoder``, ``beam_width``, ``gen_max_len`` and
            ``mode``.
    """
    device = torch.device(hparams.gpus if torch.cuda.is_available() else 'cpu')
    text_preprocessor = TextPreprocessor.load(hparams.text_preprocessor_path)

    # Deterministic preprocessing: resize straight to the crop size, no
    # random augmentation at generation time.
    transform = transforms.Compose([
        transforms.Resize([hparams.crop_size, hparams.crop_size]),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # build model
    encoder = EncoderCNN(hparams.hidden_dim).eval()
    decoder = FactoredLSTM(hparams.embed_dim,
                           text_preprocessor.vocab_size,
                           hparams.hidden_dim,
                           hparams.style_dim,
                           hparams.num_layers,
                           train=False,
                           device=device)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
    checkpoints = torch.load(hparams.checkpoint_path, map_location=device)
    encoder.load_state_dict(checkpoints['encoder'])
    decoder.load_state_dict(checkpoints['decoder'])

    img_names, img_list = load_images(hparams.img_dir, transform)
    sos_id = text_preprocessor.SOS_ID
    eos_id = text_preprocessor.EOS_ID

    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for img_name, img in zip(img_names, img_list):
            img = img.to(device)
            features = encoder(img)

            if hparams.decoder == 'greedy':
                output = decoder.sample_greedy(features, hparams.gen_max_len,
                                               hparams.mode, sos_id, eos_id)
                output = output[0].cpu().tolist()
            else:
                output = decoder.sample_beam(features, hparams.beam_width,
                                             hparams.gen_max_len, hparams.mode,
                                             sos_id, eos_id)

            # Delete SOS and EOS. If decoding stopped at gen_max_len without
            # ever emitting EOS, keep everything after SOS instead of crashing.
            try:
                end = output.index(eos_id)
            except ValueError:
                end = len(output)
            output = output[1:end]

            caption = text_preprocessor.indice2tokens(output)
            print(img_name)
            print(' '.join(caption))
def main(hparams: HParams):
    """Set up and run training of the caption encoder/decoder.

    Each epoch makes one pass over the image-caption loader (updating the
    encoder and the decoder's default parameters with ``mode='default'``),
    then one pass over the caption-only style loader (updating the decoder's
    style parameters with ``mode='style'``). At the end of every epoch the
    average loss/perplexity is printed and a checkpoint holding the model and
    optimizer state is written under ``hparams.model_path``.

    When ``hparams.train_from`` is set, model and optimizer state are restored
    from that checkpoint before training starts.

    NOTE(review): the checkpoint stores ``'epoch'`` but the epoch counter
    still restarts from 0 on resume; confirm whether that is intended.

    Args:
        hparams: Run configuration (device, data paths, batch sizes, optimizer
            settings, model dimensions, logging steps, ``num_epoch``,
            ``model_path`` and optional ``train_from``).
    """
    import os  # local import: only needed to create the checkpoint directory

    if torch.cuda.is_available() and not hparams.gpus:
        warnings.warn(
            'WARNING: you have a CUDA device, so you should probably run with -gpus 0'
        )
    device = torch.device(hparams.gpus if torch.cuda.is_available() else 'cpu')

    # data setup
    print("Loading vocabulary...")
    text_preprocessor = TextPreprocessor.load(hparams.preprocessor_path)

    # Training-time augmentation: resize, random crop, random horizontal flip.
    transform = transforms.Compose([
        transforms.Resize([hparams.img_size, hparams.img_size]),
        transforms.RandomCrop([hparams.crop_size, hparams.crop_size]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # create dataloader
    print('Creating DataLoader...')
    normal_data_loader = get_image_caption_loader(
        hparams.img_dir,
        hparams.normal_caption_path,
        text_preprocessor,
        hparams.normal_batch_size,
        transform,
        shuffle=True,
        num_workers=hparams.num_workers,
    )
    style_data_loader = get_caption_loader(
        hparams.style_caption_path,
        text_preprocessor,
        batch_size=hparams.style_batch_size,
        shuffle=True,
        num_workers=hparams.num_workers,
    )

    checkpoint = None
    if hparams.train_from:
        # loading checkpoint
        print('Loading checkpoint...')
        # map_location lets a checkpoint saved on GPU be resumed on CPU.
        checkpoint = torch.load(hparams.train_from, map_location=device)

    # The optimizers are always needed (set_parameters/step are called below),
    # so build them whether or not we resume; previously they were created
    # only when not resuming, which made resuming fail with a NameError.
    normal_opt = Optim(
        hparams.optimizer,
        hparams.normal_lr,
        hparams.max_grad_norm,
        hparams.lr_decay,
        hparams.start_decay_at,
    )
    style_opt = Optim(
        hparams.optimizer,
        hparams.style_lr,
        hparams.max_grad_norm,
        hparams.lr_decay,
        hparams.start_decay_at,
    )

    print('Building model...')
    encoder = EncoderCNN(hparams.hidden_dim)
    decoder = FactoredLSTM(hparams.embed_dim,
                           text_preprocessor.vocab_size,
                           hparams.hidden_dim,
                           hparams.style_dim,
                           hparams.num_layers,
                           hparams.random_init,
                           hparams.dropout_ratio,
                           train=True,
                           device=device)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # loss and optimizer; padding positions do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index=text_preprocessor.PAD_ID)
    normal_params = list(encoder.parameters()) + list(
        decoder.default_parameters())
    style_params = list(decoder.style_parameters())
    normal_opt.set_parameters(normal_params)
    style_opt.set_parameters(style_params)

    if checkpoint is not None:
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])
        # Mirror the save path below, which stores
        # ``<opt>.optimizer.state_dict()``.
        normal_opt.optimizer.load_state_dict(checkpoint['normal_opt'])
        style_opt.optimizer.load_state_dict(checkpoint['style_opt'])

    # Make sure the checkpoint directory exists before spending an epoch.
    if hparams.model_path:
        os.makedirs(hparams.model_path, exist_ok=True)

    # training loop
    print('Start training...')
    for epoch in range(hparams.num_epoch):
        # per-epoch running sums
        sum_normal_loss, sum_style_loss, sum_normal_ppl, sum_style_ppl = 0, 0, 0, 0

        # normal caption
        for i, (images, in_captions, out_captions,
                _) in enumerate(normal_data_loader):
            images = images.to(device)
            in_captions = in_captions.to(device)
            # Flatten targets to line up with the flattened logits below.
            out_captions = out_captions.contiguous().view(-1).to(device)

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(in_captions, features, mode='default')
            loss = criterion(outputs.view(-1, outputs.size(-1)), out_captions)
            encoder.zero_grad()
            decoder.zero_grad()
            loss.backward()
            normal_opt.step()

            # Read the scalar once instead of calling .item() repeatedly.
            loss_value = loss.item()
            ppl_value = np.exp(loss_value)
            sum_normal_loss += loss_value
            sum_normal_ppl += ppl_value

            # print log
            if i % hparams.normal_log_step == 0:
                print(
                    f'Epoch [{epoch}/{hparams.num_epoch}], Normal Step: [{i}/{len(normal_data_loader)}] '
                    f'Normal Loss: {loss_value:.4f}, Perplexity: {ppl_value:5.4f}'
                )

        # style caption
        for i, (in_captions, out_captions, _) in enumerate(style_data_loader):
            in_captions = in_captions.to(device)
            out_captions = out_captions.contiguous().view(-1).to(device)

            # Forward, backward and optimize (no image features in style mode)
            outputs = decoder(in_captions, None, mode='style')
            loss = criterion(outputs.view(-1, outputs.size(-1)), out_captions)
            decoder.zero_grad()
            loss.backward()
            style_opt.step()

            loss_value = loss.item()
            ppl_value = np.exp(loss_value)
            sum_style_loss += loss_value
            sum_style_ppl += ppl_value

            # print log
            if i % hparams.style_log_step == 0:
                print(
                    f'Epoch [{epoch}/{hparams.num_epoch}], Style Step: [{i}/{len(style_data_loader)}] '
                    f'Style Loss: {loss_value:.4f}, Perplexity: {ppl_value:5.4f}'
                )

        model_params = {
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'epoch': epoch,
            'normal_opt': normal_opt.optimizer.state_dict(),
            'style_opt': style_opt.optimizer.state_dict(),
        }

        avg_normal_loss = sum_normal_loss / len(normal_data_loader)
        avg_style_loss = sum_style_loss / len(style_data_loader)
        avg_normal_ppl = sum_normal_ppl / len(normal_data_loader)
        avg_style_ppl = sum_style_ppl / len(style_data_loader)
        print(f'Epoch [{epoch}/{hparams.num_epoch}] statistics')
        print(
            f'Normal Loss: {avg_normal_loss:.4f} Normal ppl: {avg_normal_ppl:5.4f} '
            f'Style Loss: {avg_style_loss:.4f} Style ppl: {avg_style_ppl:5.4f}'
        )

        # One checkpoint per epoch; the metrics are encoded in the file name.
        torch.save(
            model_params,
            f'{hparams.model_path}/n-loss_{avg_normal_loss:.4f}_s-loss_{avg_style_loss:.4f}_'
            f'n-ppl_{avg_normal_ppl:5.4f}_s-ppl_{avg_style_ppl:5.4f}_epoch_{epoch}.pt'
        )