def __init__(
    self,
    data_root: str,
    split: str,
    tokenizer: SentencePieceBPETokenizer,
    image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
    mask_proportion: float = 0.15,
    mask_probability: float = 0.80,
    replace_probability: float = 0.10,
    max_caption_length: int = 30,
    use_single_caption: bool = False,
    percentage: float = 100.0,
):
    """
    Dataset for masked language modeling over captions serialized in LMDB.

    Reads image-caption pairs from ``{data_root}/virtex/serialized_{split}.lmdb``
    and prepares captions through normalize -> tokenize -> truncate.
    """
    self.reader = LmdbReader(
        os.path.join(data_root, 'virtex', f"serialized_{split}.lmdb"),
        percentage=percentage,
    )
    self.image_transform = image_transform

    # Caption pipeline: lowercase/normalize, tokenize with the provided
    # tokenizer, then clip to at most `max_caption_length` tokens.
    self.caption_transform = alb.Compose(
        [
            T.NormalizeCaption(),
            T.TokenizeCaption(tokenizer),
            T.TruncateCaptionTokens(max_caption_length),
        ]
    )
    self.use_single_caption = use_single_caption
    # NOTE(review): "<unk>" appears to double as the padding token in this
    # vocabulary — confirm against the tokenizer's training config.
    self.padding_idx = tokenizer.token_to_id("<unk>")

    # Tokenizer-derived values cached once; used repeatedly by word masking.
    self._vocab_size = tokenizer.get_vocab_size()
    self._mask_index = tokenizer.token_to_id("[MASK]")
    self._mask_proportion = mask_proportion
    self._mask_prob = mask_probability
    self._repl_prob = replace_probability
def __init__(
    self,
    data_root: str,
    split: str,
    tokenizer: SentencePieceBPETokenizer,
    image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
    max_caption_length: int = 30,
    mask_proportion: float = 0.15,
    mask_probability: float = 0.80,
    replace_probability: float = 0.10,
):
    """
    COCO Captions dataset wrapper for masked language modeling.

    Holds the tokenizer and masking hyperparameters, and resolves the
    special-token ids once up front so per-item code can use them directly.
    """
    self._dset = CocoCaptionsDataset(data_root, split)
    self.tokenizer = tokenizer
    self.image_transform = image_transform
    self.max_caption_length = max_caption_length

    # Resolve commonly used special-token ids in one pass.
    # NOTE(review): "<unk>" is used as the padding id here — verify this
    # matches the tokenizer's vocabulary layout.
    for attr, token in (
        ("padding_idx", "<unk>"),
        ("sos_id", "[SOS]"),
        ("eos_id", "[EOS]"),
        ("mask_id", "[MASK]"),
    ):
        setattr(self, attr, tokenizer.token_to_id(token))

    # Word-masking hyperparameters (BERT-style proportions by default).
    self._vocab_size = tokenizer.get_vocab_size()
    self._mask_proportion = mask_proportion
    self._mask_prob = mask_probability
    self._repl_prob = replace_probability
def __init__(
    self,
    data_root: str,
    csv: str,
    split: str,
    tokenizer: SentencePieceBPETokenizer,
    image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
    padded_length: int = 256,
    max_caption_length: int = 50,
    use_single_caption: bool = False,
    percentage: float = 100.0,
):
    """
    Video-caption dataset driven by a pipe-delimited annotation CSV.

    Args:
        data_root: Root directory containing the annotation CSV.
        csv: CSV filename (relative to ``data_root``); pipe-delimited,
            with at least "name" and "orth" columns.
        split: Dataset split name. NOTE(review): currently unused here;
            kept for signature compatibility.
        tokenizer: Tokenizer used by the caption transform pipeline.
        image_transform: Transform applied to video frames.
        padded_length: Target padded length for video frame sequences.
        max_caption_length: Captions are truncated to this many tokens.
        use_single_caption: Whether to use a single caption per video.
        percentage: NOTE(review): unused in this constructor.
    """
    self.data_root = data_root
    self.padded_length = padded_length

    info_df = pd.read_csv(os.path.join(data_root, csv), delimiter="|")
    # Build (row index, video name, [gloss annotation]) triples.
    # Vectorized column access via zip instead of DataFrame.iterrows(),
    # which constructs a Series object per row and is much slower.
    self.video_list = [
        (index, name, [orth])
        for index, name, orth in zip(
            info_df.index, info_df['name'], info_df['orth']
        )
    ]

    self.image_transform = image_transform
    # Caption pipeline: normalize -> tokenize -> truncate.
    self.caption_transform = alb.Compose(
        [
            T.NormalizeCaption(),
            T.TokenizeCaption(tokenizer),
            T.TruncateCaptionTokens(max_caption_length),
        ]
    )
    self.use_single_caption = use_single_caption
    # NOTE(review): "<unk>" doubles as padding id — confirm vocabulary layout.
    self.padding_idx = tokenizer.token_to_id("<unk>")
def __init__(
    self,
    data_root: str,
    split: str,
    tokenizer: SentencePieceBPETokenizer,
    image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
    max_caption_length: int = 30,
):
    """
    COCO Captions dataset for caption generation.

    Wraps :class:`CocoCaptionsDataset` and caches the special-token ids
    needed to build model inputs.
    """
    self._dset = CocoCaptionsDataset(data_root, split)
    self.image_transform = image_transform
    self.max_caption_length = max_caption_length

    # Look up special-token ids once; per-item code uses the cached values.
    token_id = tokenizer.token_to_id
    # NOTE(review): "<unk>" serves as the padding id here — verify against
    # the tokenizer's vocabulary.
    self.padding_idx = token_id("<unk>")
    self.sos_id = token_id("[SOS]")
    self.eos_id = token_id("[EOS]")
def __init__(
    self,
    data_root: str,
    split: str,
    tokenizer: SentencePieceBPETokenizer,
    image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
    max_caption_length: int = 30,
):
    """
    Captioning dataset backed by a serialized LMDB file.

    Reads from ``{data_root}/serialized_{split}.lmdb`` and prepares
    captions through normalize -> tokenize -> truncate.
    """
    path = os.path.join(data_root, f"serialized_{split}.lmdb")
    self.reader = LmdbReader(path)

    self.image_transform = image_transform
    # Caption pipeline applied to every raw caption string.
    pipeline = [
        T.NormalizeCaption(),
        T.TokenizeCaption(tokenizer),
        T.TruncateCaptionTokens(max_caption_length),
    ]
    self.caption_transform = alb.Compose(pipeline)
    # NOTE(review): "<unk>" doubles as the padding id — confirm vocabulary.
    self.padding_idx = tokenizer.token_to_id("<unk>")
def log_predictions(
    self, batch: Batch, tokenizer: SentencePieceBPETokenizer
) -> str:
    """
    Run a forward pass on ``batch`` and format ground-truth captions next
    to the model's token predictions, for human-readable logging.

    Args:
        batch: Batch containing at least "caption_tokens".
        tokenizer: Tokenizer used to decode ids back to text.

    Returns:
        A string with one caption/prediction pair per example.
    """
    # Bug fix: the previous version called `self.train()` unconditionally,
    # forcing the module into training mode even if it was in eval mode
    # before logging. Save the current mode and restore it afterwards.
    was_training = self.training
    self.eval()
    with torch.no_grad():
        predictions = self.forward(batch)["predictions"]
    self.train(was_training)

    predictions_str = ""
    for tokens, preds in zip(batch["caption_tokens"], predictions):
        # Predictions here are individual tokens, and do not have any order
        # like captions, so decode them separately so we don't strip off
        # metaspace character and special tokens if any.
        preds = [tokenizer.id_to_token(p) for p in preds.tolist()]
        predictions_str += f"""
                Caption tokens : {tokenizer.decode(tokens.tolist())}
                Predictions (f): {" ".join(preds)}
                """
    return predictions_str