def __init__(self, encoded_size, projected_size, hidden_size, max_words):
    """
    Args:
        encoded_size: Encoder hidden_size.
        projected_size: Global projected_size.
        hidden_size: Decoder hidden_size.
        max_words: Maximum length of the word sequence.
    """
    super(Decoder, self).__init__()
    self.encoded_size = encoded_size
    self.projected_size = projected_size
    self.hidden_size = hidden_size
    self.max_words = max_words

    self.word_embed = nn.Embedding(len(vocab()), projected_size)
    self.word_drop = nn.Dropout(p=0.5)

    # The decoder GRU described in the paper takes three inputs: the previous hidden state,
    # the encoded video feature, and the previous word embedding. A standard GRUCell accepts
    # only a single input tensor plus the hidden state, so we project both the video feature
    # and the word embedding to `projected_size` with two fully-connected layers and sum them
    # into a single GRU input outside of the cell.
    self.v2m = nn.Linear(encoded_size, projected_size)
    self.w2m = nn.Linear(projected_size, projected_size)
    self.gru_cell = nn.GRUCell(projected_size, hidden_size)
    self.gru_drop = nn.Dropout(p=0.5)
    self.word_restore = nn.Linear(hidden_size, len(vocab()))
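# A minimal sketch of constructing the decoder, assuming the encoder produces 1024-dimensional
# encodings and using the default hyperparameter values from `train` below (500-dimensional
# projection, captions capped at 30 tokens). The `vocab()` singleton is assumed to be populated.
decoder = Decoder(encoded_size=1024, projected_size=500, hidden_size=1024, max_words=30)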
def eval_step(eval_loader, banet, prediction_txt_path, reference, use_cuda=False):
    # Generate a caption for every video in the evaluation set.
    result = {}
    for i, (videos, video_ids) in enumerate(eval_loader):
        if use_cuda:
            videos = videos.cuda()
        outputs, _ = banet(videos, None)
        for tokens, vid in zip(outputs, video_ids):
            s = vocab().decode(tokens.data)
            result[vid] = s

    # Write the predictions to disk, one `<video_id>\t<caption>` pair per line.
    with open(prediction_txt_path, 'w') as prediction_txt:
        for vid, s in result.items():
            prediction_txt.write('{}\t{}\n'.format(vid[5:], s))

    # Score the predictions against the COCO-style reference annotations.
    metrics = measure(prediction_txt_path, reference)
    return metrics
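# A minimal sketch of running a standalone evaluation with `eval_step`, assuming a trained
# `banet` is in scope and that a dataset named 'MSRVTT' with a `reference.json` in its
# evaluation directory has been prepared with this repository's helpers (`_data`, `_util`, COCO).
eval_loader = _data.get_eval_dataloader('MSRVTT', 'val', batch_size=64)
eval_dir = _util.get_dataset_by_name('MSRVTT', mode='val')
reference = COCO(os.path.join(eval_dir, 'reference.json'))
metrics = eval_step(eval_loader, banet, os.path.join(eval_dir, 'prediction.txt'), reference)
print(metrics)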
def forward(self, video_encoded, captions, use_cuda=False, teacher_forcing_ratio=0.5, use_argmax=False):
    """
    Args:
        video_encoded (torch.FloatTensor [N, encoded_size]): Encoded hidden state from the encoder.
        captions (torch.LongTensor [N, max_cap_len]): Caption indices, or None at inference time.
        use_cuda: Flag whether to use the GPU.
        teacher_forcing_ratio: Probability of feeding the ground-truth word back into the decoder.
        use_argmax: Flag whether to decode with greedy (argmax) or multinomial sampling.

    Returns:
        outputs (torch.Tensor): [N, max_words] word indices at inference time,
            otherwise [N, max_words, vocab_size] logits.
    """
    batch_size = len(video_encoded)

    # At inference time, caption labels are not available.
    infer = captions is None
    if not infer:
        # Caption indices must be valid rows of the embedding table.
        # REVIEW josephz: Consider mapping out-of-vocabulary indices to UNK instead:
        #   captions[captions >= len(vocab())] = vocab()[Token.UNK]
        assert captions.max() < len(vocab())

    # Initialize the GRU state from the encoded video.
    # video_encoded: [N, encoded_size]
    # gru_h: [N, hidden_size]
    gru_h = self._init_gru_state(video_encoded)

    # outputs: [max_words, N] at inference time, holding word indices.
    # outputs: [max_words, N, vocab_size] otherwise, holding logits.
    if infer:
        if use_cuda:
            outputs = torch.cuda.FloatTensor(self.max_words, batch_size).fill_(0)
        else:
            outputs = torch.FloatTensor(self.max_words, batch_size).fill_(0)
        outputs[0] = vocab()[Token.START]
    else:
        if use_cuda:
            outputs = torch.cuda.FloatTensor(self.max_words, batch_size, len(vocab())).fill_(0)
        else:
            outputs = torch.FloatTensor(self.max_words, batch_size, len(vocab())).fill_(0)
        outputs[0, :, vocab()[Token.START]] = 1.0
        assert captions[:, 0].max() == captions[:, 0].min() == vocab()[Token.START]

    # Seed the decoder with the START token.
    word_id = vocab()[Token.START]
    # word: [N, 1], filled with the START token for each batch element.
    word = video_encoded.data.new(batch_size, 1).long().fill_(word_id)
    # word: [N, projected_size]
    word = self.word_embed(word).squeeze(1)
    word = self.word_drop(word)

    # Project the video encoding once; it is reused at every decoding step.
    # vm: [N, encoded_size] -> [N, projected_size]
    vm = self.v2m(video_encoded)
    for i in range(1, self.max_words):
        # If every caption in the batch is padding at step i, we have hit the end of the
        # longest sentence and can stop decoding early.
        if not infer and all(x == vocab()[Token.PAD] for x in captions[:, i]):
            break

        # Project the previous word embedding.
        # wm: [N, projected_size]
        wm = self.w2m(word)
        # Fuse the video and word features into the single GRU input.
        m = vm + wm
        gru_h = self.gru_cell(m, gru_h)
        gru_h = self.gru_drop(gru_h)

        # Decode logits for the next word.
        word_logits = self.word_restore(gru_h)
        use_teacher_forcing = not infer and (random.random() < teacher_forcing_ratio)
        if use_teacher_forcing:
            # Feed the ground-truth word back into the decoder.
            word_id = captions[:, i]
        else:
            if use_argmax:
                # Greedy decoding.
                word_id = word_logits.max(1)[1]
            else:
                # Sample from the softmax posterior.
                posterior = F.softmax(word_logits, dim=1)
                word_id = torch.multinomial(posterior, 1).squeeze(1)

        if infer:
            # At inference time, store the decoded word index.
            outputs[i] = word_id
        else:
            # Otherwise, store the logits for the loss computation.
            outputs[i] = word_logits

        # Compute the embedding of the chosen word for the next step.
        word = self.word_embed(word_id).squeeze(1)
        word = self.word_drop(word)

    # Each element of `outputs` holds the whole batch at a single time step. unsqueeze(1) turns
    # each step into a column so the concatenation yields a batch-major tensor over all time steps.
    assert len(outputs) > 0
    outputs = torch.cat([o.unsqueeze(1) for o in outputs], 1).contiguous()
    return outputs
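# A minimal sketch of the two decoding modes, assuming `decoder`, `video_encoded`
# ([N, encoded_size] FloatTensor) and `captions` ([N, max_words] LongTensor) are already
# available. With labels, the decoder returns per-step logits for the loss; with
# `captions=None` it returns decoded word indices.
train_logits = decoder(video_encoded, captions, teacher_forcing_ratio=0.6)  # [N, T, vocab_size]
infer_ids = decoder(video_encoded, None, use_argmax=True)                   # [N, T]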
def train(
        # General training hyperparameters.
        dataset: str,
        num_epochs: int = 100,
        batch_size: int = 128,
        # Learning-rate schedulers.
        learning_rate: float = 3e-4,
        ss_factor: int = 24,
        min_ss: float = 0.6,

        # Representation hyperparameters.
        projected_size: int = 500,
        hidden_size: int = 1024,  # Hidden size of the recurrent cells.
        mid_size: int = 128,      # Dimension of the boundary detection layer.

        # REVIEW josephz: Remove this?
        # frame_shape: tuple=(3, 224, 224),  # Video frame shape.
        a_feature_size: int = 2048,  # Appearance model feature-dimension size.
        # REVIEW josephz: Remove this?
        # m_feature_size=4096,  # Motion model feature-dimension size.

        # Maximum-size hyperparameters.
        # frame_sample_rate: int=10,  # Sample rate of video frames.
        max_frames: int = 30,  # Maximum length of the video-frame sequence.
        max_words: int = 30,   # Maximum length of the caption-word sequence.

        # Misc hyperparameters.
        ckpt_freq: int = 3,
        use_cuda: bool = False,
        use_ckpt: bool = False,
        use_argmax: bool = False,
        seed: int = 0,
):
    """
    Args:
        dataset (str): Dataset to train on.
        num_epochs (int): Number of epochs to train for.
        batch_size (int): Batch size to train with.
        learning_rate (float): Learning rate.
        ss_factor (int): Scheduled-sampling factor used to compute the teacher-forcing ratio.
        min_ss (float): Minimum teacher-forcing ratio.
        projected_size (int): Projection size for the Encoder-Decoder model.
        hidden_size (int): Hidden state size for the recurrent network in the encoder.
        mid_size (int): Hidden state size for the Boundary Detector network in the encoder.
        a_feature_size (int): Input feature size for the Encoder network.
        max_frames (int): Maximum length of the video-frame sequence.
        max_words (int): Maximum length of the caption-word sequence.
        ckpt_freq (int): Frequency at which to compute evaluation metrics and save checkpoints.
        use_cuda (bool): Flag whether to use CUDA devices.
        use_ckpt (bool): Flag whether to load a checkpoint if possible.
        use_argmax (bool): Flag whether to use greedy or multinomial sampling during decoding.
        seed (int): Random seed.

    Effects:
        Produces several outputs:
            - Checkpoints (model weights).
            - Logs (tensorboard logs).
    """
    # Set seeds.
    torch.random.manual_seed(seed)
    np.random.seed(seed)

    # Prepare output paths.
    # REVIEW josephz: This is unbelievably hacky, but we want an easy way to allow the user to set
    # and track hyperparameters from the command-line interface. This should probably be abstracted
    # into utility.py.
    hparams = locals()
    params = {
        arg_name: hparams[arg_name]
        for arg_name in inspect.signature(train).parameters.keys()
    }
    ckpt_path = _util.get_weights_path_by_param(reuse=False, **params)
    print(
        "Saving checkpoints to '{ckpt_path}', you may visualize in tensorboard with the following:"
        "\n\n\t`tensorboard --logdir={ckpt_path}`\n".format(ckpt_path=ckpt_path))

    # Setup logging paths.
    log_path = os.path.join(ckpt_path, 'logs')
    _util.mkdir(log_path)
    _tb_logger.configure(log_path, flush_secs=10)

    # REVIEW josephz: Todo, clean this up.
    banet_pth_path_fmt = os.path.join(ckpt_path, '{:04d}_{:04d}.pth')
    best_banet_pth_path = os.path.join(ckpt_path, 'weights.pth')
    optimizer_pth_path = os.path.join(ckpt_path, 'optimizer.pth')
    best_optimizer_pth_path = os.path.join(ckpt_path, 'best_optimizer.pth')

    # Load the vocabulary.
    vocab_size = len(vocab())

    # Load the COCO-style reference annotations.
    # val_dir = _util.get_dataset_by_name(dataset, mode='val')
    # val_reference_txt_path = os.path.join(val_dir, 'reference.json')
    # val_prediction_txt_path = os.path.join(val_dir, 'prediction.txt')
    # reference = COCO(val_reference_txt_path)
    eval_mode = 'val'
    eval_dir = _util.get_dataset_by_name(dataset, mode=eval_mode)
    test_reference_txt_path = os.path.join(eval_dir, 'reference.json')
    test_prediction_txt_path = os.path.join(eval_dir, 'prediction.txt')
    reference = COCO(test_reference_txt_path)
    print("Evaluating on '{}'".format(eval_dir))

    # Initialize the model.
    banet = _models.BANet(
        a_feature_size, projected_size, mid_size, hidden_size,
        max_frames, max_words, use_cuda=use_cuda)

    # Load model weights if possible.
    if use_ckpt:
        pretrained_path = os.path.join(_util.get_raw_dataset_by_name('MSRVTT'), 'pretrained_weights.pth')
        weights = torch.load(pretrained_path)
        # REVIEW josephz: Figure out how to load the decoder weights partially:
        #   https://discuss.pytorch.org/t/how-to-load-part-of-pre-trained-model/1113/6
        del weights['decoder.word_embed.weight']
        del weights['decoder.word_restore.bias']
        del weights['decoder.word_restore.weight']
        banet.load_state_dict(weights, strict=False)
    if use_cuda:
        banet.cuda()

    # Initialize loss and optimizer.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(banet.parameters(), lr=learning_rate)
    if os.path.exists(optimizer_pth_path) and use_ckpt:
        optimizer.load_state_dict(torch.load(optimizer_pth_path))

    # Initialize the dataloaders.
    train_loader = _data.get_train_dataloader(dataset, batch_size=batch_size)
    eval_loader = _data.get_eval_dataloader(dataset, eval_mode, batch_size=batch_size)
    num_train_steps = len(train_loader)
    num_eval_steps = len(eval_loader)

    # Begin the training loop.
    print("Training Configuration:")
    print("\tLearning Rate: '{0:.4f}'".format(learning_rate))
    print("\tScheduled Sampling:")
    print("\t\tMin Teacher Forcing Rate: '{0:.4f}'".format(min_ss))
    print("\t\tScheduled Factor: '{0:.4f}'".format(ss_factor))
    print("\tBatch Size: '{}'".format(batch_size))
    print("\tEpochs: '{}'".format(num_epochs))
    print("\tDataset: '{}'".format(dataset))
    print("\tCheckpoint Path: '{}'".format(ckpt_path))
    best_meteor = 0
    loss_count = 0
    for epoch in range(num_epochs):
        # Anneal the teacher-forcing ratio with an inverse-sigmoid schedule, floored at min_ss.
        epsilon = max(min_ss, ss_factor / (ss_factor + np.exp(epoch / ss_factor)))
        print('epoch:%d\tepsilon:%.8f' % (epoch, epsilon))
        _tb_logger.log_value('epsilon', epsilon, epoch)

        for i, (videos, captions, cap_lens, video_ids) in tqdm.tqdm(
                enumerate(train_loader, start=1), total=num_train_steps):
            if use_cuda:
                videos = videos.cuda()
                targets = captions.cuda()
            else:
                targets = captions

            # Zero the gradients and run the encoder-decoder model.
            optimizer.zero_grad()
            outputs, video_encoded = banet(
                videos, targets, teacher_forcing_ratio=epsilon, use_argmax=use_argmax)

            # NOTE: The last batch is usually smaller than the selected batch_size, so we
            # dynamically compute the correct batch size here rather than throwing the last
            # training batch away.
            bsz = len(targets)

            # Un-pad and flatten the outputs and labels.
            outputs = torch.cat([outputs[j][:cap_lens[j]] for j in range(bsz)], dim=0)
            targets = torch.cat([targets[j][:cap_lens[j]] for j in range(bsz)], dim=0)
            outputs = outputs.view(-1, vocab_size)
            targets = targets.view(-1)

            # Compute the loss for back-propagation.
            # assert all(targets > 0) and all(outputs > 0)
            loss = criterion(outputs, targets)
            loss_val = loss.item()
            _tb_logger.log_value('loss', loss_val, epoch * num_train_steps + i)
            loss_count += loss_val
            # REVIEW josephz: Is there grad_norm?
            loss.backward()
            optimizer.step()

            eval_steps = 25
            if i % eval_steps == 0 or bsz < batch_size:
                loss_count /= eval_steps if bsz == batch_size else i % eval_steps
                perplexity = np.exp(loss_count)
                print('Epoch [%d/%d]:\n\tStep [%d/%d]\n\tLoss: %.4f\n\tPerplexity: %5.4f' %
                      (epoch, num_epochs, i, num_train_steps, loss_count, perplexity))
                _tb_logger.log_value('perplexity', perplexity, epoch * num_train_steps + i)
                loss_count = 0

                # Print a few sampled captions next to their ground truths as a quick sanity check.
                tokens = banet.decoder.sample(video_encoded)
                for j in range(5):
                    we = vocab().decode(tokens.data[j].squeeze())
                    gt = vocab().decode(captions[j].squeeze())
                    print('\t\t[vid_id={}]'.format(video_ids[j]))
                    print('\t\t\tWE: %s\n\t\t\tGT: %s' % (we, gt))

        # Finally, compute evaluation metrics and save the best models.
        if epoch % ckpt_freq == 0:
            # Save the epoch checkpoint.
            banet_pth_path = banet_pth_path_fmt.format(epoch, num_epochs)
            print("Saving checkpoints to '{}'".format(banet_pth_path))
            torch.save(banet.state_dict(), banet_pth_path)
            torch.save(optimizer.state_dict(), optimizer_pth_path)

            # Compute evaluation metrics.
            banet.eval()
            print("Computing Metrics:...")
            metrics = _train.eval_step(
                eval_loader, banet, test_prediction_txt_path, reference, use_cuda=use_cuda)
            for k, v in metrics.items():
                _tb_logger.log_value(k, v, epoch)
                if k == 'METEOR' and v > best_meteor:
                    # Save the best model based on the METEOR metric.
                    # For reference, see https://www.cs.cmu.edu/~alavie/papers/BanerjeeLavie2005-final.pdf
                    print("Saving best checkpoint of metric: '{}'".format(v))
                    shutil.copy2(banet_pth_path, best_banet_pth_path)
                    shutil.copy2(optimizer_pth_path, best_optimizer_pth_path)
                    best_meteor = v
            banet.train()
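# A minimal sketch of kicking off training, assuming the 'MSRVTT' dataset has been prepared with
# this repository's data-preparation scripts. The hyperparameter values shown are the defaults
# from the signature above; only `dataset` is required.
if __name__ == '__main__':
    train(dataset='MSRVTT', num_epochs=100, batch_size=128, learning_rate=3e-4, use_cuda=True)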
def evaluate(raw: str, dataset: str, mode: str, weights_path: str,
             batch_size: int = 64, use_cuda: bool = False) -> None:
    dataset_dir = _util.get_dataset_by_name(dataset, mode)
    raw_dir = _util.get_raw_dataset_by_name(raw, mode)

    # Re-construct the model from the hyperparameters stored alongside the checkpoint.
    model, run, args, weights_path = _util.get_params_by_weights_path(weights_path)
    a_feature_size = int(args["a_feature_size"])
    projected_size = int(args["projected_size"])
    mid_size = int(args["mid_size"])
    hidden_size = int(args["hidden_size"])
    max_frames = int(args["max_frames"])
    max_words = int(args["max_words"])
    banet = _models.BANet(
        a_feature_size, projected_size, mid_size, hidden_size,
        max_frames, max_words, use_cuda=use_cuda)

    # Load the trained weights.
    pretrained_path = os.path.join(weights_path, "weights.pth")
    weights = torch.load(pretrained_path)
    banet.load_state_dict(weights)
    if use_cuda:
        banet.cuda()

    # Compute the captioning metrics against the reference annotations.
    print("Computing metrics...")
    eval_loader = _data.get_eval_dataloader(dataset, mode, batch_size=batch_size)
    test_reference_txt_path = os.path.join(dataset_dir, 'reference.json')
    test_prediction_txt_path = os.path.join(dataset_dir, 'prediction.txt')
    reference = COCO(test_reference_txt_path)
    _train.eval_step(eval_loader, banet, test_prediction_txt_path, reference, use_cuda=use_cuda)

    # Must switch to a new loader which provides captions.
    eval_loader = _data.get_dataloader(dataset, mode, batch_size=batch_size)
    for i, (videos, captions, cap_lens, video_ids) in tqdm(
            enumerate(eval_loader, start=1), total=len(eval_loader)):
        if use_cuda:
            videos = videos.cuda()

        # Sample captions and print them next to the ground truths.
        video_encoded = banet.encoder(videos)
        tokens = banet.decoder.sample(video_encoded)
        # vid_paths = [os.path.join(raw_dir, "{}.mp4".format(video_id)) for video_id in video_ids]
        for j in range(len(tokens)):
            # vid = imageio.get_reader(vid_paths[j]).iter_data()
            print('[vid_id={}]'.format(video_ids[j]))
            print("gt  :", vocab().decode(captions[j]))
            print("pred:", vocab().decode(tokens.data[j].squeeze()))
            print()
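# A minimal sketch of evaluating a trained checkpoint, assuming the raw and extracted datasets are
# both named 'MSRVTT' and that `weights_path` points at a checkpoint directory produced by `train`
# (i.e. one containing `weights.pth` and the serialized hyperparameters); the path below is a
# placeholder to fill in.
evaluate(raw='MSRVTT', dataset='MSRVTT', mode='test', weights_path='<path/to/ckpt>', use_cuda=True)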