def __init__(self, input_size, hidden_size, dropout):
    """Build the sentinel gate's linear projections and its dropout layer.

    Args:
        input_size: feature size of the LSTM input x_t.
        hidden_size: feature size of the LSTM hidden state.
        dropout: dropout probability applied before the affine maps.
    """
    super(Sentinel, self).__init__()
    # Projection of the LSTM input x_t into the hidden space.
    x_proj = nn.Linear(input_size, hidden_size)
    init_weights(x_proj, 'linear')
    self.affine_x = x_proj
    # Projection of the previous hidden state h_{t-1}.
    h_proj = nn.Linear(hidden_size, hidden_size)
    init_weights(h_proj, 'linear')
    self.affine_h = h_proj
    # Dropout applied before the affine transformations.
    self.dropout = nn.Dropout(dropout)
def __init__(self, args):
    """Transformer encoder layer: multi-head self-attention + position-wise FFN.

    Reads from ``args``: ``embed_size``, ``encoder_attention_heads``,
    ``attention_dropout``, ``dropout``, ``relu_dropout``,
    ``encoder_normalize_before``, ``encoder_ffn_embed_dim``.
    """
    super(TransformerEncoderLayer, self).__init__()
    self.embed_dim = args.embed_size
    # Multi-head self-attention over the encoder input.
    self.self_attn = MultiheadAttention(
        self.embed_dim, args.encoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    # Whether LayerNorm runs before (pre-norm) or after (post-norm) each sub-block.
    self.normalize_before = args.encoder_normalize_before
    # Two-layer position-wise feed-forward network.
    self.fc1 = nn.Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    utils.init_weights(self.fc1, 'relu')
    self.fc2 = nn.Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    utils.init_weights(self.fc2, 'relu')
    # One LayerNorm for the attention sub-block, one for the FFN sub-block.
    # Fix: loop variable was an unused `i`; idiomatic `_` makes that explicit.
    self.layer_norms = nn.ModuleList(
        [nn.LayerNorm(self.embed_dim) for _ in range(2)]
    )
def __init__(self, embed_size, hidden_size, decoder_attn_embed_size, vocab_size, dropout):
    """Assemble the adaptive-attention block: sentinel gate, spatial
    attention, and the final vocabulary projection.

    Args:
        embed_size: word-embedding size fed to the sentinel.
        hidden_size: LSTM hidden size shared across sub-modules.
        decoder_attn_embed_size: internal size of the attention projections.
        vocab_size: number of output classes for caption scores.
        dropout: dropout probability used throughout.
    """
    super(AdaptiveBlock, self).__init__()
    # Visual sentinel gate.
    self.sentinel = Sentinel(embed_size, hidden_size, dropout)
    # Spatial attention over image features.
    self.atten = Atten(hidden_size, decoder_attn_embed_size, dropout)
    # Maps the hidden representation to vocabulary logits.
    vocab_head = nn.Linear(hidden_size, vocab_size)
    init_weights(vocab_head, 'linear')
    self.mlp = vocab_head
    # Dropout applied inside the affine transformation.
    self.dropout = nn.Dropout(dropout)
    self.hidden_size = hidden_size
def __init__(self, hidden_size, decoder_attn_embed_size, dropout):
    """Create the four linear projections (W_v, W_g, W_s, w_h) of the
    spatial-attention module, each followed by linear initialization.

    Args:
        hidden_size: size of the visual/hidden/sentinel inputs.
        decoder_attn_embed_size: shared attention embedding size.
        dropout: dropout probability for the module.
    """
    super(Atten, self).__init__()
    # W_v — projects visual features.
    w_v = nn.Linear(hidden_size, decoder_attn_embed_size)
    init_weights(w_v, 'linear')
    self.affine_v = w_v
    # W_g — projects the decoder hidden state.
    w_g = nn.Linear(hidden_size, decoder_attn_embed_size)
    init_weights(w_g, 'linear')
    self.affine_g = w_g
    # W_s — projects the sentinel vector.
    w_s = nn.Linear(hidden_size, decoder_attn_embed_size)
    init_weights(w_s, 'linear')
    self.affine_s = w_s
    # w_h — collapses the attention embedding to a scalar score.
    w_h = nn.Linear(decoder_attn_embed_size, 1)
    init_weights(w_h, 'linear')
    self.affine_h = w_h
    self.dropout = nn.Dropout(dropout)
def __init__(self, args, word_emb): super(AdaptiveDecoder, self).__init__() # word embedding self.relu_dropout = args.relu_dropout self.embed = self.from_pretrained(word_emb, freeze=True) self.w_to_h = nn.Linear(args.embed_size, args.hidden_size) init_weights(self.w_to_h, 'relu') self.w_to_c = nn.Linear(args.embed_size, args.hidden_size) init_weights(self.w_to_c, 'relu') # LSTM decoder: input = [ w_t; v_g ] => 2 x word_embed_size; self.LSTM = nn.LSTM(args.embed_size, args.hidden_size, 1, batch_first=True) init_weights(self.LSTM) # Save hidden_size for hidden and cell variable self.hidden_size = args.hidden_size # Adaptive Attention Block: # Sentinel + C_hat + Final scores for caption sampling self.adaptive = AdaptiveBlock(args.embed_size, args.hidden_size, args.decoder_attn_embed_size, args.vocab_size, args.dropout)