    def __init__(self, d_model, d_inner_hid, n_head, dim_per_head, dropout=0.1,
                 dim_capsule=100, num_capsules=0, null_capsule=False):
        super(DecoderBlock, self).__init__()

        # Masked self-attention over the decoder inputs.
        self.slf_attn = MultiHeadedAttention(head_count=n_head, model_dim=d_model,
                                             dropout=dropout, dim_per_head=dim_per_head)
        # self.ctx_attn = MultiHeadedAttention(head_count=n_head, model_dim=d_model, dropout=dropout,
        #                                      dim_per_head=dim_per_head)
        self.pos_ffn = PositionwiseFeedForward(size=d_model, hidden_size=d_inner_hid)
        self.layer_norm_1 = nn.LayerNorm(d_model)
        self.layer_norm_2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Contextual capsule layer replacing the standard encoder-decoder attention.
        self.apply_capsule = True
        # self.pre_capsule_layer_norm = nn.LayerNorm(d_model)

        # The total capsule width must divide evenly among the capsules; guard
        # against the default num_capsules=0, which would otherwise raise a
        # ZeroDivisionError on the modulo below.
        assert num_capsules > 0, "num_capsules must be positive"
        assert dim_capsule % num_capsules == 0
        self.dim_per_cap = dim_capsule // num_capsules

        # The concatenated capsule output is later split into three equal parts
        # for the multi-input feed-forward layer, so dim_capsule should also be
        # divisible by 3.
        dim_per_part = dim_capsule // 3

        total_num_capsules = num_capsules
        self.null_caps = null_capsule
        if null_capsule:
            # Append extra "null" capsules that act as sinks for routing weight
            # that does not belong to any content capsule.
            INFO("Using Null Capsules to attract irrelevant routing.")
            total_num_capsules += num_capsules // 3

        # Routing-by-agreement over the context, run for 3 iterations.
        self.capsule_layer = ContextualCapsuleLayer(
            num_out_caps=total_num_capsules, num_in_caps=None,
            dim_in_caps=d_model,
            dim_out_caps=self.dim_per_cap,
            dim_context=d_model,
            num_iterations=3,
            share_route_weights_for_in_caps=True)

        # Fuses the block output with the three capsule parts.
        self.out_and_cap_ffn = MultiInputPositionwiseFeedForward(
            size=d_model, hidden_size=d_inner_hid, dropout=dropout,
            inp_sizes=[dim_per_part, dim_per_part, dim_per_part])
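    # Usage sketch (an assumption for illustration, not part of the original
    # file): constructing a block with a capsule width divisible by both
    # `num_capsules` and 3, e.g.:
    #
    #     block = DecoderBlock(d_model=512, d_inner_hid=2048, n_head=8,
    #                          dim_per_head=64, dropout=0.1,
    #                          dim_capsule=96, num_capsules=6, null_capsule=True)
    #
    # With these values each capsule is 96 // 6 = 16 dims wide, the three-way
    # split feeds 96 // 3 = 32 dims per part into the multi-input FFN, and
    # 6 // 3 = 2 extra null capsules are appended to absorb irrelevant routing.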