Example #1
import torch
from torchvision import transforms

# HParams, TextPreprocessor, EncoderCNN, FactoredLSTM and load_images are
# assumed to come from this project's own modules.


def main(hparams: HParams):
    '''
    Generate captions from images.
    '''
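    # hparams.gpus is expected to hold a device string such as 'cuda:0'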
    device = torch.device(hparams.gpus if torch.cuda.is_available() else 'cpu')
    text_preprocessor = TextPreprocessor.load(hparams.text_preprocessor_path)

    transform = transforms.Compose([
        transforms.Resize([hparams.crop_size, hparams.crop_size]),
        transforms.ToTensor(),
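        # ImageNet channel means/stds, the standard statistics for a
        # pretrained CNN backbone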
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # build model
    encoder = EncoderCNN(hparams.hidden_dim).eval()
    decoder = FactoredLSTM(hparams.embed_dim, text_preprocessor.vocab_size, hparams.hidden_dim,
                           hparams.style_dim, hparams.num_layers, train=False, device=device)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    checkpoints = torch.load(hparams.checkpoint_path, map_location=device)
    encoder.load_state_dict(checkpoints['encoder'])
    decoder.load_state_dict(checkpoints['decoder'])

    img_names, img_list = load_images(hparams.img_dir, transform)
    for img_name, img in zip(img_names, img_list):
        img = img.to(device)
        features = encoder(img)

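        # decode with greedy argmax or beam search, per hparams.decoder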
        if hparams.decoder == 'greedy':
            output = decoder.sample_greedy(features, hparams.gen_max_len, hparams.mode,
                                           text_preprocessor.SOS_ID, text_preprocessor.EOS_ID)
            output = output[0].cpu().tolist()
        else:
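            # sample_beam is assumed to return the best hypothesis already as
            # a plain list of token ids (no tensor conversion needed)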
            output = decoder.sample_beam(features, hparams.beam_width, hparams.gen_max_len, hparams.mode,
                                         text_preprocessor.SOS_ID, text_preprocessor.EOS_ID)

        if text_preprocessor.EOS_ID in output:
            output = output[1:output.index(text_preprocessor.EOS_ID)]  # drop SOS and EOS
        else:
            output = output[1:]  # no EOS within gen_max_len; drop SOS only
        caption = text_preprocessor.indice2tokens(output)

        print(img_name)
        print(' '.join(caption))
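A minimal sketch of invoking the example above, using types.SimpleNamespace
as a stand-in for HParams; every value below is illustrative, not a project
default:

import types

gen_hparams = types.SimpleNamespace(
    gpus='cuda:0',                # device string; falls back to CPU if CUDA is unavailable
    text_preprocessor_path='preprocessor.pkl',
    checkpoint_path='checkpoints/model.pt',
    img_dir='images/',
    crop_size=224,
    embed_dim=300,
    hidden_dim=512,
    style_dim=512,
    num_layers=1,
    decoder='greedy',             # anything else selects beam search
    beam_width=5,
    gen_max_len=30,
    mode='default',               # factored-LSTM language mode
)

main(gen_hparams)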
Example #2
import warnings

import numpy as np
import torch
import torch.nn as nn
from torchvision import transforms

# HParams, TextPreprocessor, EncoderCNN, FactoredLSTM, Optim,
# get_image_caption_loader and get_caption_loader are assumed to come from
# this project's own modules.


def main(hparams: HParams):
    '''
    Set up and run training.
    '''
    if torch.cuda.is_available() and not hparams.gpus:
        warnings.warn(
            'You have a CUDA device, so you should probably run with -gpus 0'
        )

    device = torch.device(hparams.gpus if torch.cuda.is_available() else 'cpu')

    # data setup
    print('Loading vocabulary...')
    text_preprocessor = TextPreprocessor.load(hparams.preprocessor_path)

    transform = transforms.Compose([
        transforms.Resize([hparams.img_size, hparams.img_size]),
        transforms.RandomCrop([hparams.crop_size, hparams.crop_size]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # create dataloader
    print('Creating DataLoader...')
    normal_data_loader = get_image_caption_loader(
        hparams.img_dir,
        hparams.normal_caption_path,
        text_preprocessor,
        hparams.normal_batch_size,
        transform,
        shuffle=True,
        num_workers=hparams.num_workers,
    )

    style_data_loader = get_caption_loader(
        hparams.style_caption_path,
        text_preprocessor,
        batch_size=hparams.style_batch_size,
        shuffle=True,
        num_workers=hparams.num_workers,
    )

    # build the optimizers unconditionally; their state is restored further
    # below when resuming from a checkpoint
    normal_opt = Optim(
        hparams.optimizer,
        hparams.normal_lr,
        hparams.max_grad_norm,
        hparams.lr_decay,
        hparams.start_decay_at,
    )
    style_opt = Optim(
        hparams.optimizer,
        hparams.style_lr,
        hparams.max_grad_norm,
        hparams.lr_decay,
        hparams.start_decay_at,
    )

    if hparams.train_from:
        print('Loading checkpoint...')
        checkpoint = torch.load(hparams.train_from, map_location=device)

    print('Building model...')
    encoder = EncoderCNN(hparams.hidden_dim)
    decoder = FactoredLSTM(hparams.embed_dim,
                           text_preprocessor.vocab_size,
                           hparams.hidden_dim,
                           hparams.style_dim,
                           hparams.num_layers,
                           hparams.random_init,
                           hparams.dropout_ratio,
                           train=True,
                           device=device)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=text_preprocessor.PAD_ID)
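    # split the decoder's parameters: the shared (default) weights train
    # together with the encoder on paired data, while the style-factor
    # weights get their own optimizer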
    normal_params = list(encoder.parameters()) + list(
        decoder.default_parameters())
    style_params = list(decoder.style_parameters())
    normal_opt.set_parameters(normal_params)
    style_opt.set_parameters(style_params)

    if hparams.train_from:
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])
        # the checkpoint stores the inner torch optimizers' state dicts
        # (see the torch.save call below), so restore them there
        normal_opt.optimizer.load_state_dict(checkpoint['normal_opt'])
        style_opt.optimizer.load_state_dict(checkpoint['style_opt'])

    # training loop
    print('Start training...')
    for epoch in range(hparams.num_epoch):

        # result
        sum_normal_loss, sum_style_loss, sum_normal_ppl, sum_style_ppl = 0, 0, 0, 0

        # normal captions (phase 1): paired images and captions train the
        # encoder and the decoder's default weights
        for i, (images, in_captions, out_captions,
                lengths) in enumerate(normal_data_loader):
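            # in_captions / out_captions appear to be the same caption shifted
            # by one token (teacher forcing: inputs vs. next-token targets)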
            images = images.to(device)
            in_captions = in_captions.to(device)
            out_captions = out_captions.contiguous().view(-1).to(device)

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(in_captions, features, mode='default')
            loss = criterion(outputs.view(-1, outputs.size(-1)), out_captions)
            encoder.zero_grad()
            decoder.zero_grad()
            loss.backward()
            normal_opt.step()

            # print log
            sum_normal_loss += loss.item()
            sum_normal_ppl += np.exp(loss.item())
            if i % hparams.normal_log_step == 0:
                print(
                    f'Epoch [{epoch}/{hparams.num_epoch}], Normal Step: [{i}/{len(normal_data_loader)}] '
                    f'Normal Loss: {loss.item():.4f}, Perplexity: {np.exp(loss.item()):5.4f}'
                )

        # styled captions (phase 2): text-only batches train the decoder's
        # style-specific weights
        for i, (in_captions, out_captions,
                lengths) in enumerate(style_data_loader):
            in_captions = in_captions.to(device)
            out_captions = out_captions.contiguous().view(-1).to(device)

            # Forward, backward and optimize
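            # features=None: the style pass conditions on text only, no image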
            outputs = decoder(in_captions, None, mode='style')
            loss = criterion(outputs.view(-1, outputs.size(-1)), out_captions)

            decoder.zero_grad()
            loss.backward()
            style_opt.step()

            sum_style_loss += loss.item()
            sum_style_ppl += np.exp(loss.item())
            # print log
            if i % hparams.style_log_step == 0:
                print(
                    f'Epoch [{epoch}/{hparams.num_epoch}], Style Step: [{i}/{len(style_data_loader)}] '
                    f'Style Loss: {loss.item():.4f}, Perplexity: {np.exp(loss.item()):5.4f}'
                )

        model_params = {
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'epoch': epoch,
            'normal_opt': normal_opt.optimizer.state_dict(),
            'style_opt': style_opt.optimizer.state_dict(),
        }

        avg_normal_loss = sum_normal_loss / len(normal_data_loader)
        avg_style_loss = sum_style_loss / len(style_data_loader)
        avg_normal_ppl = sum_normal_ppl / len(normal_data_loader)
        avg_style_ppl = sum_style_ppl / len(style_data_loader)
        print(f'Epoch [{epoch}/{hparams.num_epoch}] statistics')
        print(
            f'Normal Loss: {avg_normal_loss:.4f} Normal ppl: {avg_normal_ppl:5.4f} '
            f'Style Loss: {avg_style_loss:.4f} Style ppl: {avg_style_ppl:5.4f}'
        )

        torch.save(
            model_params,
            f'{hparams.model_path}/n-loss_{avg_normal_loss:.4f}_s-loss_{avg_style_loss:.4f}_'
            f'n-ppl_{avg_normal_ppl:5.4f}_s-ppl_{avg_style_ppl:5.4f}_epoch_{epoch}.pt'
        )
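Each epoch ends in a torch.save of the dict assembled above, so training can
be resumed by pointing hparams.train_from at any saved .pt file. A quick
sketch of inspecting such a checkpoint (the file name is illustrative):

import torch

ckpt = torch.load('models/n-loss_2.1234_s-loss_3.5678_'
                  'n-ppl_8.3456_s-ppl_35.2345_epoch_4.pt',
                  map_location='cpu')
print(sorted(ckpt.keys()))  # ['decoder', 'encoder', 'epoch', 'normal_opt', 'style_opt']
print(ckpt['epoch'])        # epoch at which the checkpoint was written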