Example #1
0
    def __init__(
        self,
        data_root: str,
        split: str,
        tokenizer: SentencePieceBPETokenizer,
        image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
        mask_proportion: float = 0.15,
        mask_probability: float = 0.80,
        replace_probability: float = 0.10,
        max_caption_length: int = 30,
        use_single_caption: bool = False,
        percentage: float = 100.0,
    ):
        """
        Open the serialized LMDB for ``split`` and set up the image and
        caption transforms plus word-masking hyperparameters.
        """
        # Serialized split lives under ``<data_root>/virtex``.
        self.reader = LmdbReader(
            os.path.join(data_root, 'virtex', f"serialized_{split}.lmdb"),
            percentage=percentage,
        )

        self.image_transform = image_transform
        self.use_single_caption = use_single_caption

        # Captions are normalized, tokenized, then truncated in order.
        self.caption_transform = alb.Compose(
            [
                T.NormalizeCaption(),
                T.TokenizeCaption(tokenizer),
                T.TruncateCaptionTokens(max_caption_length),
            ]
        )
        # The "<unk>" token id is used as the padding index.
        self.padding_idx = tokenizer.token_to_id("<unk>")

        # Cache tokenizer-derived values and hyperparameters used by the
        # word-masking step.
        self._vocab_size = tokenizer.get_vocab_size()
        self._mask_index = tokenizer.token_to_id("[MASK]")
        self._mask_proportion = mask_proportion
        self._mask_prob = mask_probability
        self._repl_prob = replace_probability
Example #2
0
    def __init__(
        self,
        data_root: str,
        split: str,
        tokenizer: SentencePieceBPETokenizer,
        image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
        max_caption_length: int = 30,
        mask_proportion: float = 0.15,
        mask_probability: float = 0.80,
        replace_probability: float = 0.10,
    ):
        """
        Wrap a COCO Captions split and cache the tokenizer handles and
        masking hyperparameters needed at iteration time.
        """
        self._dset = CocoCaptionsDataset(data_root, split)
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.max_caption_length = max_caption_length

        # Cache ids of frequently used special tokens up front; the
        # "<unk>" token id doubles as the padding index.
        self.padding_idx = tokenizer.token_to_id("<unk>")
        self.mask_id = tokenizer.token_to_id("[MASK]")
        self.sos_id = tokenizer.token_to_id("[SOS]")
        self.eos_id = tokenizer.token_to_id("[EOS]")

        # Vocabulary size and word-masking hyperparameters.
        self._vocab_size = tokenizer.get_vocab_size()
        self._mask_proportion = mask_proportion
        self._mask_prob = mask_probability
        self._repl_prob = replace_probability
Example #3
0
 def __init__(
     self,
     data_root: str,
     csv: str,
     split: str,
     tokenizer: SentencePieceBPETokenizer,
     image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
     padded_length: int = 256,
     max_caption_length: int = 50,
     use_single_caption: bool = False,
     percentage: float = 100.0,
 ):
     """
     Read a pipe-delimited annotation CSV and build the video list plus
     image/caption transforms.

     NOTE(review): ``split`` and ``percentage`` are accepted but not used
     in this constructor's visible body — presumably kept for signature
     parity with the sibling datasets; confirm against callers.
     """
     self.data_root = data_root
     self.padded_length = padded_length

     # Annotation file is pipe-delimited, one row per video.
     info_df = pd.read_csv(os.path.join(data_root, csv), delimiter="|")
     # Each entry: (row index, video name, single-element caption list).
     # Comprehension replaces the original append loop (same order/content).
     self.video_list = [
         (index, row['name'], [row['orth']])
         for index, row in info_df.iterrows()
     ]

     self.image_transform = image_transform
     # Captions are normalized, tokenized, then truncated in order.
     self.caption_transform = alb.Compose([
         T.NormalizeCaption(),
         T.TokenizeCaption(tokenizer),
         T.TruncateCaptionTokens(max_caption_length),
     ])
     self.use_single_caption = use_single_caption
     # The "<unk>" token id is used as the padding index.
     self.padding_idx = tokenizer.token_to_id("<unk>")
Example #4
0
    def __init__(
        self,
        data_root: str,
        split: str,
        tokenizer: SentencePieceBPETokenizer,
        image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
        max_caption_length: int = 30,
    ):
        """
        Hold a COCO Captions split together with its image transform,
        caption length limit, and commonly used special-token ids.
        """
        self._dset = CocoCaptionsDataset(data_root, split)
        self.image_transform = image_transform
        self.max_caption_length = max_caption_length

        # Cache special-token ids; "<unk>" doubles as the padding index.
        self.sos_id = tokenizer.token_to_id("[SOS]")
        self.eos_id = tokenizer.token_to_id("[EOS]")
        self.padding_idx = tokenizer.token_to_id("<unk>")
Example #5
0
    def __init__(
        self,
        data_root: str,
        split: str,
        tokenizer: SentencePieceBPETokenizer,
        image_transform: Callable = T.DEFAULT_IMAGE_TRANSFORM,
        max_caption_length: int = 30,
    ):
        """
        Open the serialized LMDB for ``split`` and prepare the image and
        caption transforms.
        """
        self.reader = LmdbReader(
            os.path.join(data_root, f"serialized_{split}.lmdb")
        )

        self.image_transform = image_transform
        # Captions are normalized, tokenized, then truncated in order.
        self.caption_transform = alb.Compose(
            [
                T.NormalizeCaption(),
                T.TokenizeCaption(tokenizer),
                T.TruncateCaptionTokens(max_caption_length),
            ]
        )
        # The "<unk>" token id is used as the padding index.
        self.padding_idx = tokenizer.token_to_id("<unk>")
Example #6
0
    def log_predictions(self, batch: Batch,
                        tokenizer: SentencePieceBPETokenizer) -> str:
        """
        Run one forward pass without gradients and return a formatted
        string pairing each ground-truth caption with its predictions.
        """
        # Temporarily switch to eval mode for the inference pass, then
        # restore training mode.
        self.eval()
        with torch.no_grad():
            predictions = self.forward(batch)["predictions"]
        self.train()

        pieces = []
        for caption, predicted in zip(batch["caption_tokens"], predictions):
            # Predicted ids are independent tokens with no caption-like
            # ordering, so decode them one at a time — this keeps the
            # metaspace character and any special tokens visible.
            token_strs = [tokenizer.id_to_token(t) for t in predicted.tolist()]
            pieces.append(f"""
                Caption tokens : {tokenizer.decode(caption.tolist())}
                Predictions (f): {" ".join(token_strs)}

                """)
        return "".join(pieces)