Example #1
    def __init__(self, encoded_size, projected_size, hidden_size, max_words):
        """

        Args:
            encoded_size: Encoder hidden_size.
            projected_size: Global projected_size.
            hidden_size: Decoder hidden_size.
            max_words: Maximum length of the word sequence.
        """
        super(Decoder, self).__init__()
        self.encoded_size = encoded_size
        self.projected_size = projected_size
        self.hidden_size = hidden_size
        self.max_words = max_words

        self.word_embed = nn.Embedding(len(vocab()), projected_size)
        self.word_drop = nn.Dropout(p=0.5)

        # REVIEW josephz: ??? Understand this.
        # The decoder GRU in the paper effectively takes three inputs: besides the
        # previous hidden state, it consumes both the video feature and the word
        # feature. However, a standard GRU cell accepts only a single input tensor,
        # so we use two fully connected layers to project the two features to a
        # common size and merge them (by addition) outside the GRU; see the sketch
        # after this example.
        self.v2m = nn.Linear(encoded_size, projected_size)
        self.w2m = nn.Linear(projected_size, projected_size)
        self.gru_cell = nn.GRUCell(projected_size, hidden_size)
        self.gru_drop = nn.Dropout(p=0.5)
        self.word_restore = nn.Linear(hidden_size, len(vocab()))
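
The comment in the constructor explains why the two feature streams are merged outside the GRU: nn.GRUCell takes a single input tensor plus the previous hidden state, so the video and word features are each projected to projected_size and summed. Below is a minimal standalone sketch of that fusion step, with made-up sizes (encoded_size=512, projected_size=500, hidden_size=1024, batch_size=4) standing in for the real configuration.

import torch
import torch.nn as nn

# Hypothetical sizes; the real values come from the constructor arguments above.
encoded_size, projected_size, hidden_size, batch_size = 512, 500, 1024, 4

v2m = nn.Linear(encoded_size, projected_size)    # video feature -> merged space
w2m = nn.Linear(projected_size, projected_size)  # word embedding -> merged space
gru_cell = nn.GRUCell(projected_size, hidden_size)

video_feat = torch.randn(batch_size, encoded_size)    # stand-in encoder output
word_embed = torch.randn(batch_size, projected_size)  # stand-in embedded previous word
gru_h = torch.zeros(batch_size, hidden_size)          # previous hidden state

# Project both modalities to the same size, fuse by element-wise addition,
# and feed the fused vector to the GRU cell as its single input.
m = v2m(video_feat) + w2m(word_embed)
gru_h = gru_cell(m, gru_h)
print(gru_h.shape)  # torch.Size([4, 1024])
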
Example #2
def eval_step(eval_loader, banet, prediction_txt_path, reference, use_cuda=False):
    result = {}
    for i, (videos, video_ids) in enumerate(eval_loader):
        if use_cuda:
            videos = videos.cuda()

        outputs, _ = banet(videos, None)
        for (tokens, vid) in zip(outputs, video_ids):
            s = vocab().decode(tokens.data)
            result[vid] = s

    with open(prediction_txt_path, 'w') as prediction_txt:
        for vid, s in result.items():
            prediction_txt.write('{}\t{}\n'.format(vid[5:], s))

    metrics = measure(prediction_txt_path, reference)
    return metrics
Example #3
    def forward(self,
                video_encoded,
                captions,
                use_cuda=False,
                teacher_forcing_ratio=0.5,
                use_argmax=False):
        """

        Args:
            video_encoded (torch.FloatTensor [N, hidden_size]): Encoded hidden state from encoder.
            captions (torch.LongTensor [max_vid_len, max_cap_len]): Caption indices.
            teacher_forcing_ratio:
            use_cuda: Flag whether to use the GPU.
            use_argmax: Flag whether to decode using greedy or multinomial sampling.

        Returns:
            outputs (torch.Tensor[]):

        """
        batch_size = len(video_encoded)

        # During inference time, caption-labels are not available.
        infer = True if captions is None else False
        if not infer:
            # captions[captions >= len(vocab())] = vocab()[Token.UNK]
            # REVIEW josephz: Fix this afterwards, with comment on obscure bug
            assert captions.max() <= len(vocab())

        # Initialize GRU state.
        # video_encoded: [N, encoded_size]
        # gru_h: [N, hidden_size]
        gru_h = self._init_gru_state(video_encoded)

        # outputs: [max_words, N] during inference time, represents word_idx.
        # outputs: [max_words, N, vocab_size] else, represents logits.
        if infer:
            if use_cuda:
                outputs = torch.cuda.FloatTensor(self.max_words,
                                                 batch_size).fill_(0)
            else:
                outputs = torch.FloatTensor(self.max_words,
                                            batch_size).fill_(0)
            outputs[0] = vocab()[Token.START]
        else:
            if use_cuda:
                outputs = torch.cuda.FloatTensor(self.max_words, batch_size,
                                                 len(vocab())).fill_(0)
            else:
                outputs = torch.FloatTensor(self.max_words, batch_size,
                                            len(vocab())).fill_(0)
            outputs[0, :, vocab()[Token.START]] = 1.0
        assert (captions is None
                or captions[:, 0].max() == captions[:, 0].min() == vocab()[Token.START])

        # Seed the sentence with the START token.
        word_id = vocab()[Token.START]

        # word: [N, 1], filled with word_id=START.
        # This represents, for each batch, the START token.
        word = video_encoded.data.new(batch_size, 1).long().fill_(word_id)

        # word: [N, projected_size]
        word = self.word_embed(word).squeeze(1)
        word = self.word_drop(word)

        # video_encoded: [N, encoded_size]
        # vm: [N, encoded_size] → [N, projected_size]
        vm = self.v2m(video_encoded)
        for i in range(1, self.max_words):
            # If every word id at this position is Token.PAD, every caption in the
            # batch has ended, so we can stop decoding early.
            if not infer and all(x == vocab()[Token.PAD] for x in captions[:, i]):
                break

            # Push word to decoder.
            # word: [N, projected_size] →
            #   wm: [N, projected_size]
            wm = self.w2m(word)

            # Fuse the video and word encodings by element-wise addition.
            m = vm + wm
            gru_h = self.gru_cell(m, gru_h)
            gru_h = self.gru_drop(gru_h)

            # Finally decode the word_{i+1}.
            word_logits = self.word_restore(gru_h)
            use_teacher_forcing = not infer and (random.random() <
                                                 teacher_forcing_ratio)
            if use_teacher_forcing:
                word_id = captions[:, i]
            else:
                if use_argmax:
                    word_id = word_logits.max(1)[1]
                else:
                    posterior = F.softmax(word_logits, dim=1)
                    word_id = torch.multinomial(posterior, 1).squeeze(1)

            if infer:
                # At inference time, store the predicted word index.
                outputs[i] = word_id
            else:
                # During training, store the logits for the loss computation.
                outputs[i] = word_logits
            # Compute word representation.
            word = self.word_embed(word_id).squeeze(1)
            word = self.word_drop(word)

        # Each element of `outputs` holds the whole batch at one time step.
        # unsqueeze(1) turns each [N]-shaped (or [N, vocab_size]-shaped) element into
        # a column along dim=1, and torch.cat stacks the time steps so the result is
        # [N, max_words] (or [N, max_words, vocab_size]).
        assert len(outputs) > 0
        outputs = torch.cat([o.unsqueeze(1) for o in outputs], 1).contiguous()
        return outputs
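
The use_argmax flag above switches between greedy decoding and sampling from the softmax posterior at each step. Here is a small self-contained sketch of the two strategies on dummy logits (the batch and vocabulary sizes are made up).

import torch
import torch.nn.functional as F

batch_size, vocab_size = 4, 10
word_logits = torch.randn(batch_size, vocab_size)  # one decoding step's logits

# Greedy: pick the highest-scoring word for each batch element.
greedy_ids = word_logits.max(1)[1]                        # shape [batch_size]

# Multinomial: sample one word per batch element from the softmax distribution.
posterior = F.softmax(word_logits, dim=1)                 # shape [batch_size, vocab_size]
sampled_ids = torch.multinomial(posterior, 1).squeeze(1)  # shape [batch_size]

print(greedy_ids.shape, sampled_ids.shape)
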
Example #4
def train(
    # General training hyperparameters.
    dataset: str,
    num_epochs: int = 100,
    batch_size: int = 128,

    # Learning rate schedulers.
    learning_rate: float = 3e-4,
    ss_factor: int = 24,
    min_ss: float = 0.6,

    # Representation hyperparameters.
    projected_size: int = 500,
    hidden_size: int = 1024,  # Hidden size of the recurrent cells.
    mid_size: int = 128,  # Dimension of the boundary detection layer.

    # REVIEW josephz: Remove this?
    # frame_shape: tuple=(3, 224, 224),  # Video frame shape.
    a_feature_size: int = 2048,  # Appearance model feature-dimension size.
    # REVIEW josephz: Remove this?
    # m_feature_size=4096,  # Motion model feature-dimension size.

    # Maximum-size hyperparameters.
    # frame_sample_rate: int=10,  # Sample rate of video frames.
    max_frames: int = 30,  # Maximum length of the video-frame sequence.
    max_words: int = 30,  # Maximum length of the caption-word sequence.

    # Misc hyperparameters.
    ckpt_freq: int = 3,
    use_cuda: bool = False,
    use_ckpt: bool = False,
    use_argmax: bool = False,
    seed: int = 0,
):
    """

    Args:
        dataset (str): Dataset to train on.
        num_epochs (int): Number of epochs to train for.
        batch_size (int): Batch size to train with.

        learning_rate (float): Learning rate.
        ss_factor (int): Scheduled Sampling factor, to compute a teacher-forcing ratio.
        min_ss (float): Minimum teacher-forcing ratio.

        projected_size (int): Projection size for the Encoder-Decoder model.
        hidden_size (int): Hidden state size for the recurrent network in the encoder.
        mid_size (int): Hidden state size for the Boundary Detector network in the encoder.
        a_feature_size: Input feature size for the Encoder network.

        max_frames (int): Maximum length of the video-frame sequence.
        max_words (int): Maximum length of the caption-word sequence.

        ckpt_freq (int): Frequency to compute evaluation metrics and save checkpoint.
        use_cuda (bool): Flag whether to use CUDA devices.
        use_ckpt (bool): Flag on whether to load checkpoint if possible.
        use_argmax (bool): Flag on whether to use greedy or multinomial sampling during decoding.
        seed (int): Random seed.

    Effects:
        Writes several outputs under the checkpoint directory:
            - Checkpoints (model weights)
            - Logs (TensorBoard logs)
    """
    # Set seeds.
    torch.random.manual_seed(seed)
    np.random.seed(seed)

    # Prepare output paths.
    # REVIEW josephz: This is unbelievably hacky, but we want an easy way to allow the user to set and track
    #   hyperparameters using the cmd_line interface? This should probably be abstracted in utility.py.
    hparams = locals()
    params = {
        arg_name: hparams[arg_name]
        for arg_name in inspect.signature(train).parameters.keys()
    }

    ckpt_path = _util.get_weights_path_by_param(reuse=False, **params)
    print(
        "Saving checkpoints to '{ckpt_path}', you may visualize in tensorboard with the following: \n\n\t`tensorboard --logdir={ckpt_path}`\n"
        .format(ckpt_path=ckpt_path))

    # Setup logging paths.
    log_path = os.path.join(ckpt_path, 'logs')
    _util.mkdir(log_path)
    _tb_logger.configure(log_path, flush_secs=10)

    # REVIEW josephz: Todo, clean this up.
    banet_pth_path_fmt = os.path.join(ckpt_path, '{:04d}_{:04d}.pth')
    best_banet_pth_path = os.path.join(ckpt_path, 'weights.pth')
    optimizer_pth_path = os.path.join(ckpt_path, 'optimizer.pth')
    best_optimizer_pth_path = os.path.join(ckpt_path, 'best_optimizer.pth')

    # Load Vocabulary.
    vocab_size = len(vocab())

    # Load Reference for COCO.
    # val_dir = _util.get_dataset_by_name(dataset, mode='val')
    # val_reference_txt_path = os.path.join(val_dir, 'reference.json')
    # val_prediction_txt_path = os.path.join(val_dir, 'prediction.txt')
    # reference = COCO(val_reference_txt_path)

    eval_mode = 'val'
    eval_dir = _util.get_dataset_by_name(dataset, mode=eval_mode)
    test_reference_txt_path = os.path.join(eval_dir, 'reference.json')
    test_prediction_txt_path = os.path.join(eval_dir, 'prediction.txt')
    reference = COCO(test_reference_txt_path)
    print("Evaluating on '{}'".format(eval_dir))

    # Initialize the model.
    banet = _models.BANet(a_feature_size,
                          projected_size,
                          mid_size,
                          hidden_size,
                          max_frames,
                          max_words,
                          use_cuda=use_cuda)

    # Load model weights if possible.
    if use_ckpt:
        pretrained_path = os.path.join(_util.get_raw_dataset_by_name('MSRVTT'),
                                       'pretrained_weights.pth')
        weights = torch.load(pretrained_path)

        # REVIEW josephz: Figure out how to do the decoder weights partially:
        #   https://discuss.pytorch.org/t/how-to-load-part-of-pre-trained-model/1113/6
        del weights['decoder.word_embed.weight']
        del weights['decoder.word_restore.bias']
        del weights['decoder.word_restore.weight']
        banet.load_state_dict(weights, strict=False)
    if use_cuda:
        banet.cuda()

    # Initialize loss and optimizer.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(banet.parameters(), lr=learning_rate)
    if os.path.exists(optimizer_pth_path) and use_ckpt:
        optimizer.load_state_dict(torch.load(optimizer_pth_path))

    # Initialize Dataloaders.
    train_loader = _data.get_train_dataloader(dataset, batch_size=batch_size)
    eval_loader = _data.get_eval_dataloader(dataset,
                                            eval_mode,
                                            batch_size=batch_size)

    num_train_steps = len(train_loader)
    num_eval_steps = len(eval_loader)

    # Begin Training Loop.
    print("Training Configuration:")
    print("\tLearning Rate: '{0:.4f}'".format(learning_rate))
    print("\tScheduled Sampling:")
    print("\t\tMax Teacher Forcing Rate: '{0:.4f}'".format(min_ss))
    print("\t\tScheduled Factor: '{0:.4f}'".format(ss_factor))
    print("\tBatch Size: '{}'".format(batch_size))
    print("\tEpochs: '{}'".format(num_epochs))
    print("\tDataset: '{}'".format(dataset))
    print("\tCheckpoint Path: '{}'".format(ckpt_path))

    best_meteor = 0
    loss_count = 0
    for epoch in range(num_epochs):
        epsilon = max(min_ss,
                      ss_factor / (ss_factor + np.exp(epoch / ss_factor)))
        print('epoch:%d\tepsilon:%.8f' % (epoch, epsilon))
        _tb_logger.log_value('epsilon', epsilon, epoch)

        for i, (videos, captions, cap_lens,
                video_ids) in tqdm.tqdm(enumerate(train_loader, start=1),
                                        total=num_train_steps):
            if use_cuda:
                videos = videos.cuda()
                targets = captions.cuda()
            else:
                targets = captions

            # Zero the gradients and run the encoder-decoder model.
            optimizer.zero_grad()
            outputs, video_encoded = banet(videos,
                                           targets,
                                           teacher_forcing_ratio=epsilon,
                                           use_argmax=use_argmax)

            # NOTE: Usually the last batch is less than the selected batch_size, so we dynamically
            #       compute the correct batch_size to use here, rather than throwing away the last
            #       training batch.
            bsz = len(targets)

            # Un-pad and flatten the outputs and labels.
            outputs = torch.cat([outputs[j][:cap_lens[j]] for j in range(bsz)],
                                dim=0)
            targets = torch.cat([targets[j][:cap_lens[j]] for j in range(bsz)],
                                dim=0)

            outputs = outputs.view(-1, vocab_size)
            targets = targets.view(-1)

            # Compute loss for back-propagation.
            # assert all(targets > 0) and all(outputs > 0)
            loss = criterion(outputs, targets)
            loss_val = loss.item()
            _tb_logger.log_value('loss', loss_val, epoch * num_train_steps + i)
            loss_count += loss_val
            # REVIEW josephz: Is there grad_norm?
            loss.backward()
            optimizer.step()

            eval_steps = 25
            if i % eval_steps == 0 or bsz < batch_size:
                # Avoid dividing by zero when the smaller final batch lands exactly on a reporting step.
                loss_count /= eval_steps if bsz == batch_size else (i % eval_steps or eval_steps)
                perplexity = np.exp(loss_count)
                print(
                    'Epoch [%d/%d]:\n\tStep [%d/%d]\n\tLoss: %.4f\n\tPerplexity: %5.4f'
                    % (epoch, num_epochs, i, num_train_steps, loss_count,
                       perplexity))
                _tb_logger.log_value('perplexity', perplexity,
                                     epoch * num_train_steps + i)
                loss_count = 0
                tokens = banet.decoder.sample(video_encoded)
                for j in range(5):
                    we = vocab().decode(tokens.data[j].squeeze())
                    gt = vocab().decode(captions[j].squeeze())
                    print('\t\t[vid_id={}]'.format(video_ids[j]))
                    print('\t\t\tWE: %s\n\t\t\tGT: %s' % (we, gt))

        # Finally, compute evaluation metrics and save the best models.
        if epoch % ckpt_freq == 0:
            # Save epoch checkpoint.
            banet_pth_path = banet_pth_path_fmt.format(epoch, num_epochs)
            print("Saving checkpoints to '{}'".format(banet_pth_path))
            torch.save(banet.state_dict(), banet_pth_path)
            torch.save(optimizer.state_dict(), optimizer_pth_path)

            # Compute evaluation.
            banet.eval()
            print("Computing Metrics:...")
            metrics = _train.eval_step(eval_loader,
                                       banet,
                                       test_prediction_txt_path,
                                       reference,
                                       use_cuda=use_cuda)
            for k, v in metrics.items():
                _tb_logger.log_value(k, v, epoch)
                if k == 'METEOR' and v > best_meteor:
                    # Save the best model based on the METEOR metric.
                    # For reference, see https://www.cs.cmu.edu/~alavie/papers/BanerjeeLavie2005-final.pdf
                    print("Saving best checkpoint of metric: '{}'".format(
                        best_meteor))
                    shutil.copy2(banet_pth_path, best_banet_pth_path)
                    shutil.copy2(optimizer_pth_path, best_optimizer_pth_path)
                    best_meteor = v
            banet.train()
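
The scheduled-sampling ratio epsilon computed at the top of the epoch loop follows an inverse-sigmoid-style decay, epsilon = max(min_ss, ss_factor / (ss_factor + exp(epoch / ss_factor))), falling from roughly ss_factor / (ss_factor + 1) toward the floor min_ss. A quick sketch of the curve with the default hyperparameters from the signature above:

import numpy as np

ss_factor, min_ss = 24, 0.6  # defaults from train()

def teacher_forcing_ratio(epoch: int) -> float:
    # Decays from ~ss_factor / (ss_factor + 1) at epoch 0 toward the floor min_ss.
    return max(min_ss, ss_factor / (ss_factor + np.exp(epoch / ss_factor)))

for epoch in (0, 10, 25, 50, 100):
    print(epoch, round(teacher_forcing_ratio(epoch), 4))
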
Example #5
def evaluate(raw: str,
             dataset: str,
             mode: str,
             weights_path: str,
             batch_size: int = 64,
             use_cuda: bool = False) -> None:
    dataset_dir = _util.get_dataset_by_name(dataset, mode)
    raw_dir = _util.get_raw_dataset_by_name(raw, mode)

    model, run, args, weights_path = _util.get_params_by_weights_path(
        weights_path)

    a_feature_size = int(args["a_feature_size"])
    projected_size = int(args["projected_size"])
    mid_size = int(args["mid_size"])
    hidden_size = int(args["hidden_size"])
    max_frames = int(args["max_frames"])
    max_words = int(args["max_words"])
    banet = _models.BANet(a_feature_size,
                          projected_size,
                          mid_size,
                          hidden_size,
                          max_frames,
                          max_words,
                          use_cuda=use_cuda)

    pretrained_path = os.path.join(weights_path, "weights.pth")
    weights = torch.load(pretrained_path)
    banet.load_state_dict(weights)
    if use_cuda:
        banet.cuda()

    print("Computing metrics...")
    eval_loader = _data.get_eval_dataloader(dataset,
                                            mode,
                                            batch_size=batch_size)
    test_reference_txt_path = os.path.join(dataset_dir, 'reference.json')
    test_prediction_txt_path = os.path.join(dataset_dir, 'prediction.txt')
    reference = COCO(test_reference_txt_path)

    _train.eval_step(eval_loader,
                     banet,
                     test_prediction_txt_path,
                     reference,
                     use_cuda=use_cuda)

    # Must switch to a new loader which provides captions.
    eval_loader = _data.get_dataloader(dataset, mode, batch_size=batch_size)
    for i, (videos, captions, cap_lens,
            video_ids) in tqdm(enumerate(eval_loader, start=1),
                               total=len(eval_loader)):
        if use_cuda:
            videos = videos.cuda()

        video_encoded = banet.encoder(videos)
        tokens = banet.decoder.sample(video_encoded)

        # vid_paths = [os.path.join(raw_dir, "{}.mp4".format(video_id)) for video_id in video_ids]

        for j in range(len(tokens)):
            # vid = imageio.get_reader(vid_paths[j]).iter_data()

            print('[vid_id={}]'.format(video_ids[j]))
            print("gt  :", vocab().decode(captions[j]))
            print("pred:", vocab().decode(tokens.data[j].squeeze()))
            print()
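
Both the training and evaluation examples restore model weights with torch.load and load_state_dict; the training example additionally deletes the vocabulary-dependent decoder keys and passes strict=False so the remaining weights load partially. A minimal sketch of that partial-loading pattern on a toy module (the module and its layer names are made up, not part of BANet):

import torch
import torch.nn as nn

# Toy stand-in model; '1.weight'/'1.bias' play the role of the vocabulary-dependent layers.
model = nn.Sequential(nn.Linear(8, 16), nn.Linear(16, 4))
checkpoint = {k: v.clone() for k, v in model.state_dict().items()}

# Drop the keys whose shapes depend on the vocabulary before loading the rest non-strictly.
del checkpoint['1.weight']
del checkpoint['1.bias']

missing, unexpected = model.load_state_dict(checkpoint, strict=False)
print(missing)     # ['1.weight', '1.bias'] -- these layers keep their fresh initialization
print(unexpected)  # []
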