def __init__(
    self,
    visual_feature_size: int,
    vocab_size: int,
    hidden_size: int,
    num_layers: int,
    attention_heads: int,
    feedforward_size: int,
    dropout: float = 0.1,
    norm_first: bool = False,
    mask_future_positions: bool = True,
    max_caption_length: int = 30,
    padding_idx: int = 0,
):
    super().__init__(visual_feature_size, vocab_size, hidden_size)
    self.num_layers = num_layers
    self.attention_heads = attention_heads
    self.feedforward_size = feedforward_size
    self.dropout = dropout
    self.mask_future_positions = mask_future_positions
    self.padding_idx = padding_idx

    # Project visual features to the textual feature size so they can serve
    # as memory for the decoder.
    self.visual_projection = nn.Linear(
        visual_feature_size, self.textual_feature_size
    )
    self.embedding = WordAndPositionalEmbedding(
        self.vocab_size,
        self.textual_feature_size,
        dropout=dropout,
        max_caption_length=max_caption_length,
        padding_idx=padding_idx,
    )
    # Initialize the transformer decoder from ``nn.TransformerDecoderLayer``
    # blocks (pre-norm or post-norm depending on ``norm_first``).
    self.transformer = nn.TransformerDecoder(
        nn.TransformerDecoderLayer(
            self.textual_feature_size,
            self.attention_heads,
            dim_feedforward=self.feedforward_size,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=norm_first,
        ),
        num_layers=self.num_layers,
        # Add a final layer norm for pre-norm transformers.
        norm=nn.LayerNorm(self.hidden_size) if norm_first else None,
    )
    self.apply(self._init_weights)

    # Create an output linear layer and tie the input and output word
    # embeddings to reduce parameters.
    self.output = nn.Linear(self.textual_feature_size, vocab_size)
    self.output.weight = self.embedding.words.weight
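
# ``WordAndPositionalEmbedding`` is referenced above but not defined in this
# excerpt. The block below is a minimal sketch of one plausible implementation,
# assuming learned positional embeddings added to the word embeddings, followed
# by layer norm and dropout, with padded positions zeroed out. The ``words``
# sub-module is an ``nn.Embedding`` so that its weight can be tied to the
# output layer as done above. This sketch is an assumption for illustration,
# not the original definition.

import torch
from torch import nn


class WordAndPositionalEmbedding(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        hidden_size: int,
        dropout: float = 0.1,
        max_caption_length: int = 30,
        padding_idx: int = 0,
    ):
        super().__init__()
        self.words = nn.Embedding(vocab_size, hidden_size, padding_idx=padding_idx)
        self.positions = nn.Embedding(max_caption_length, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(p=dropout)
        self.padding_idx = padding_idx

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        # tokens: (batch_size, caption_length) integer token indices.
        positions = torch.arange(tokens.size(1), device=tokens.device)
        embeddings = self.words(tokens) + self.positions(positions).unsqueeze(0)
        embeddings = self.dropout(self.layer_norm(embeddings))

        # Zero out embeddings at padded positions so they contribute nothing
        # downstream.
        token_mask = (tokens != self.padding_idx).unsqueeze(-1)
        return embeddings * token_mask.type(embeddings.dtype)
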

def __init__(
    self,
    visual_feature_size: int,
    vocab_size: int,
    hidden_size: int,
    num_layers: int,
    attention_heads: int,
    feedforward_size: int,
    dropout: float = 0.1,
    norm_type: str = "post",
    mask_future_positions: bool = True,
    max_caption_length: int = 30,
    padding_idx: int = 0,
):
    super().__init__(visual_feature_size, vocab_size, hidden_size)
    self.num_layers = num_layers
    self.attention_heads = attention_heads
    self.feedforward_size = feedforward_size
    self.dropout = dropout
    self.mask_future_positions = mask_future_positions
    self.padding_idx = padding_idx

    self.visual_projection = nn.Linear(
        visual_feature_size, self.textual_feature_size
    )
    self.embedding = WordAndPositionalEmbedding(
        self.vocab_size,
        self.textual_feature_size,
        dropout=dropout,
        max_caption_length=max_caption_length,
        padding_idx=padding_idx,
    )
    # Make decoder layer depending on whether it's a Pre-Norm or Post-Norm.
    LayerClass = (
        nn.TransformerDecoderLayer
        if norm_type == "post"
        else PreNormTransformerDecoderLayer
    )
    _layer = LayerClass(
        self.textual_feature_size,
        self.attention_heads,
        dim_feedforward=self.feedforward_size,
        dropout=dropout,
        activation="gelu",
    )
    self.transformer = nn.TransformerDecoder(_layer, self.num_layers)
    self.apply(self._init_weights)

    # Create an output linear layer and tie the input and output word
    # embeddings to reduce parameters.
    self.output = nn.Linear(self.textual_feature_size, vocab_size)
    self.output.weight = self.embedding.words.weight
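
# ``PreNormTransformerDecoderLayer`` is referenced above but not defined in
# this excerpt. Below is a minimal sketch of one way to implement it, assuming
# it subclasses ``nn.TransformerDecoderLayer`` and reuses its sub-modules while
# applying each LayerNorm *before* its sub-layer (pre-norm) rather than after.
# It is an illustrative assumption, not the original definition.

from torch import nn


class PreNormTransformerDecoderLayer(nn.TransformerDecoderLayer):
    def forward(
        self,
        tgt,
        memory,
        tgt_mask=None,
        memory_mask=None,
        tgt_key_padding_mask=None,
        memory_key_padding_mask=None,
        **kwargs,  # absorb newer kwargs (e.g. ``tgt_is_causal``) for compatibility
    ):
        # Self-attention block: LayerNorm first, then attention, then residual.
        tgt2 = self.norm1(tgt)
        tgt2, _ = self.self_attn(
            tgt2, tgt2, tgt2, attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask,
        )
        tgt = tgt + self.dropout1(tgt2)

        # Cross-attention block over the (visual) memory, same pre-norm order.
        tgt2 = self.norm2(tgt)
        tgt2, _ = self.multihead_attn(
            tgt2, memory, memory, attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )
        tgt = tgt + self.dropout2(tgt2)

        # Feedforward block with pre-norm and a residual connection.
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt
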

def __init__(
    self,
    vocab_size: int,
    hidden_size: int,
    num_layers: int,
    attention_heads: int,
    feedforward_size: int,
    dropout: float = 0.1,
    norm_type: str = "pre",
    padding_idx: int = 0,
    max_caption_length: int = 30,
):
    super().__init__(vocab_size, hidden_size)
    self.num_layers = num_layers
    self.attention_heads = attention_heads
    self.feedforward_size = feedforward_size
    self.dropout = dropout
    self.padding_idx = padding_idx

    self.embedding = WordAndPositionalEmbedding(
        self.vocab_size,
        self.textual_feature_size,
        max_caption_length=max_caption_length,
        dropout=dropout,
        padding_idx=padding_idx,
    )
    # Make the decoder layer depending on whether it's a Pre-Norm or Post-Norm.
    LayerClass = (
        nn.TransformerDecoderLayer
        if norm_type == "post"
        else PreNormTransformerDecoderLayer
    )
    _layer = LayerClass(
        self.textual_feature_size,
        self.attention_heads,
        dim_feedforward=self.feedforward_size,
        dropout=dropout,
        activation="gelu",
    )
    # We call this member "encoder" for consistent naming, and because it
    # still "encodes" the caption for us.
    self.encoder = nn.TransformerDecoder(_layer, self.num_layers)
    self.apply(self._init_weights)

    # Create an output linear layer and tie the input and output word
    # embeddings to reduce parameters.
    self.output = nn.Linear(self.textual_feature_size, vocab_size)
    self.output.weight = self.embedding.words.weight
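
# The ``mask_future_positions`` flag in the first two constructors implies an
# additive causal mask over caption tokens at decode time, so that each
# position can only attend to itself and earlier positions. A hedged sketch of
# how such a mask is commonly built for ``nn.TransformerDecoder`` follows;
# ``_generate_future_mask`` is a hypothetical helper name, not part of the
# original code.

import torch


def _generate_future_mask(
    size: int, dtype: torch.dtype, device: torch.device
) -> torch.Tensor:
    # Start from an upper-triangular matrix with ones strictly above the
    # diagonal (the "future" positions).
    mask = torch.triu(
        torch.ones(size, size, device=device, dtype=dtype), diagonal=1
    )
    # Convert to additive form: ``-inf`` for masked (future) positions and 0
    # elsewhere, matching the ``tgt_mask`` convention of ``nn.TransformerDecoder``.
    return mask.masked_fill(mask == 1, float("-inf"))


# Example usage (hypothetical variable names):
# tgt_mask = _generate_future_mask(tokens.size(1), embeddings.dtype, embeddings.device)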