Example #1
    def __init__(self,
                 opt,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 dict_size=None,
                 label_emb=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions,
                dict_size=dict_size,
                label_emb=label_emb,
                opt=opt)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
Example #2
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 full_context_alignment=False,
                 alignment_heads=None):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads
Example #3
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 layer_index,
                 max_relative_positions=0):
        super(ConvTransformerEncoderLayer, self).__init__()

        if layer_index <= 2:
            self.self_attn = ConvMultiHeadedAttention(
                heads,
                d_model,
                13,
                3,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        else:
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #4
    def __init__(self, size, dropout,
                 head_count=8, hidden_size=2048, self_attn_type="scaled-dot"):
        super(TransformerDecoderLayer, self).__init__()

        self.self_attn_type = self_attn_type

        if self_attn_type == "scaled-dot":
            self.self_attn = onmt.modules.MultiHeadedAttention(
                head_count, size, dropout=dropout)
        elif self_attn_type == "average":
            self.self_attn = onmt.modules.AverageAttention(
                size, dropout=dropout)

        self.context_attn = onmt.modules.MultiHeadedAttention(
            head_count, size, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(size,
                                                    hidden_size,
                                                    dropout)
        self.layer_norm_1 = onmt.modules.LayerNorm(size)
        self.layer_norm_2 = onmt.modules.LayerNorm(size)
        self.dropout = dropout
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)
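The register_buffer call above is what gives the mask the module's device behavior: buffers are stored in state_dict and moved by .to()/.cuda(), but never updated by the optimizer. A minimal standalone sketch of that mechanism (independent of OpenNMT; the class and names below are illustrative):

import torch
import torch.nn as nn

class MaskedLayer(nn.Module):
    def __init__(self, max_size=8):
        super().__init__()
        # upper-triangular "subsequent" mask, kept as a buffer, not a parameter
        mask = torch.triu(
            torch.ones(1, max_size, max_size, dtype=torch.uint8), diagonal=1)
        self.register_buffer('mask', mask)

layer = MaskedLayer()
print(layer.mask.device)      # cpu
if torch.cuda.is_available():
    layer = layer.cuda()
    print(layer.mask.device)  # cuda:0 -- the buffer moved with the module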
Example #5
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 max_relative_positions=0,
                 strided_attn=False,
                 conv_k_v=False):
        super(TransformerEncoderLayer, self).__init__()

        self.strided_attn = strided_attn
        self.conv_k_v = conv_k_v
        if self.strided_attn:
            self.self_attn = MultiHeadedStridedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        else:
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.conv1d_k_v = nn.Conv1d(d_model, d_model, kernel_size=3, stride=3)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #6
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
Example #7
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 self_attn_type="scaled-dot",
                 max_relative_positions=0,
                 aan_useffn=False,
                 tgt_concept_words_type=-1):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                max_relative_positions=max_relative_positions)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=attention_dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)

        self.tgt_concept_words_type = tgt_concept_words_type
        if tgt_concept_words_type in [2]:
            self.tgt_concept_mlp = nn.Linear(d_model * 2, d_model)
Example #8
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                   torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            self.model_dim = 1024
            self.d_ff = 4096

            torch.set_grad_enabled(False)
            onmt_ffn = PositionwiseFeedForward(self.model_dim, self.d_ff)
            onmt_ffn.eval()
            if use_cuda:
                onmt_ffn.to(self.test_device)

            turbo_ffn_trans = turbo_transformers.PositionwiseFeedForward.from_onmt(
                onmt_ffn, is_trans_weight=True)
            turbo_ffn_notrans = turbo_transformers.PositionwiseFeedForward.from_onmt(
                onmt_ffn, is_trans_weight=False)
            # (batch_size, input_len, model_dim); batch_size and input_len
            # come from the enclosing test-creation scope, not from this method
            inputs = torch.rand(size=(batch_size, input_len, self.model_dim),
                                dtype=torch.float32,
                                device=self.test_device)
            return onmt_ffn, turbo_ffn_trans, turbo_ffn_notrans, inputs
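For reference, a minimal self-contained forward pass through ONMT's PositionwiseFeedForward with the same shape convention as the test above; this is a sketch that assumes the usual OpenNMT-py layout (onmt.modules.position_ffn) and does not touch turbo_transformers:

import torch
from onmt.modules.position_ffn import PositionwiseFeedForward

model_dim, d_ff = 1024, 4096
ffn = PositionwiseFeedForward(model_dim, d_ff, dropout=0.1).eval()

with torch.no_grad():
    # (batch_size, input_len, model_dim)
    x = torch.rand(2, 7, model_dim)
    y = ffn(x)
print(y.shape)  # torch.Size([2, 7, 1024]) -- the FFN preserves the input shape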
Example #9
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 max_relative_positions=0):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.video_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm1 = LayerNorm(d_model)
        self.layer_norm2 = LayerNorm(d_model)

        self.drop = nn.Dropout(dropout)
        self.sublayer = nn.ModuleList(
            [SublayerConnection(d_model, dropout) for _ in range(3)])
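SublayerConnection is not defined in this snippet; it is presumably the conventional pre-norm residual wrapper (as popularized by the Annotated Transformer). A sketch under that assumption, which may differ from the exact class used above:

import torch.nn as nn

class SublayerConnection(nn.Module):
    # residual connection around a sub-layer, applied after layer norm:
    # x + dropout(sublayer(norm(x)))
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))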
Example #10
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #11
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 self_attn_type="scaled-dot"):
        super(TransformerDecoderLayer, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(heads,
                                                  d_model,
                                                  dropout=dropout)
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model, dropout=dropout)

        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)
Example #12
    def __init__(self, model_dim, dropout=0.1, aan_useffn=False):
        self.model_dim = model_dim
        self.aan_useffn = aan_useffn
        super(AverageAttention, self).__init__()
        if aan_useffn:
            self.average_layer = PositionwiseFeedForward(
                model_dim, model_dim, dropout)
        self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)
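The gating_layer's output size of model_dim * 2 lets a single Linear produce both AAN gates at once. A sketch of that gating step, following the Average Attention Network formulation (tensor names and shapes below are illustrative, not taken from the snippet above):

import torch
import torch.nn as nn

model_dim = 512
gating_layer = nn.Linear(model_dim * 2, model_dim * 2)

# decoder inputs and their cumulative-average context, both [batch, len, dim]
inputs = torch.rand(2, 5, model_dim)
average_outputs = torch.rand(2, 5, model_dim)

# one projection yields both gates; chunk into an input gate and a forget gate
gates = gating_layer(torch.cat((inputs, average_outputs), dim=-1))
input_gate, forget_gate = torch.chunk(gates, 2, dim=-1)
gated = (torch.sigmoid(input_gate) * inputs
         + torch.sigmoid(forget_gate) * average_outputs)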
Example #13
    def __init__(self, model_dim, dropout=0.1):
        self.model_dim = model_dim

        super(AverageAttention, self).__init__()

        self.average_layer = PositionwiseFeedForward(model_dim, model_dim,
                                                     dropout)
        self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)
Example #14
    def __init__(self, size, dropout, head_count=8, hidden_size=2048):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(head_count,
                                                           size,
                                                           dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(size, hidden_size, dropout)
        self.layer_norm = onmt.modules.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)
Example #15
    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0, gnn=None):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
        self.ffn_layer_norm = nn.LayerNorm(d_model, eps=1e-6)
Example #16
    def __init__(self, d_model, heads, d_ff, dropout, feat_vec_size):  # feat_vec_size added so the feature vector size is configurable  #latt
        super(LatticeEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
  #latt
        self.latt_attn = onmt.modules.GlobalAttention(d_model)
        self.feat_vec_size = feat_vec_size
  #latt
        self.layer_norm = onmt.modules.LayerNorm(d_model) 
        self.dropout = nn.Dropout(dropout)
Example #17
    def __init__(self,
                 model_dim,
                 dropout=0.1,
                 aan_useffn=False,
                 pos_ffn_activation_fn=ActivationFunction.relu):
        self.model_dim = model_dim
        self.aan_useffn = aan_useffn
        super(AverageAttention, self).__init__()
        if aan_useffn:
            self.average_layer = PositionwiseFeedForward(
                model_dim, model_dim, dropout, pos_ffn_activation_fn)
        self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)
Example #18
    def __init__(self, d_model, heads, d_ff, dropout, feat_vec_size):  # feat_vec_size added so the feature vector size is configurable  #latt
        super(LatticeEncoderLayer, self).__init__()

        # self-attention is not used in the RBA layer:
        # self.self_attn = onmt.modules.MultiHeadedAttention(
        #     heads, d_model, dropout=dropout)

        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
  #latt
        self.latt_attn = onmt.modules.GlobalAttention(d_model)
        self.feat_vec_size = feat_vec_size
        self.linear_context_score = nn.Linear(feat_vec_size, 1)  # Layer for calculating context gate score
  #latt
        self.layer_norm = onmt.modules.LayerNorm(d_model) 
        self.dropout = nn.Dropout(dropout)
Example #19
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerEncoderLMLayer, self).__init__()

        # we no longer have context attention, only self attention
        self.self_attn = onmt.modules.MultiHeadedAttention(heads,
                                                           d_model,
                                                           dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = onmt.modules.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerEncoderLMLayer, so
        # it gets TransformerEncoderLMLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)
Example #20
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        self_attn_type="scaled-dot",
        self_attn_func="softmax",
        self_attn_alpha=None,
        self_attn_bisect_iter=0,
        context_attn_func="softmax",
        context_attn_alpha=None,
        context_attn_bisect_iter=0,
    ):
        super(TransformerDecoderLayer, self).__init__()

        self.self_attn_type = self_attn_type

        if self_attn_type == "scaled-dot":
            self.self_attn = onmt.modules.MultiHeadedAttention(
                heads,
                d_model,
                dropout=dropout,
                attn_func=self_attn_func,
                attn_alpha=self_attn_alpha,
                attn_bisect_iter=self_attn_bisect_iter,
            )
        elif self_attn_type == "average":
            self.self_attn = onmt.modules.AverageAttention(d_model,
                                                           dropout=dropout)

        self.context_attn = onmt.modules.MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            attn_func=context_attn_func,
            attn_alpha=context_attn_alpha,
            attn_bisect_iter=context_attn_bisect_iter,
        )
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = dropout
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer("mask", mask)
Example #21
    def __init__(self, d_model, heads, d_ff, dropout):
        super(ATransformerEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(heads,
                                                           d_model,
                                                           dropout=dropout)
        self.knowledge_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.context_attn = onmt.modules.MultiHeadedAttention(heads,
                                                              d_model,
                                                              dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #22
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 max_relative_positions=0,
                 downsampling=1):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.ds_layer = nn.Linear(d_model, int(
            d_model / downsampling)) if downsampling > 1 else None
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #23
    def __init__(self,
                 num_layers,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 embeddings,
                 selected_ctx=0,
                 fields=None):
        super(SimpleContextTransformerEncoder, self).__init__()
        self.selected_ctx = selected_ctx
        self.fields = fields

        self.num_layers = num_layers
        self.embeddings = embeddings
        self.layer_norm_shared = onmt.modules.LayerNorm(d_model)
        self.layer_norm_ctx = onmt.modules.LayerNorm(d_model)
        self.layer_norm_src_final = onmt.modules.LayerNorm(d_model)
        self.layer_norm_ctx_final = onmt.modules.LayerNorm(d_model)

        self.shared_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, heads, d_ff, dropout)
            for _ in range(num_layers - 1)
        ])

        self.extra_ctx_layer = TransformerEncoderLayer(d_model, heads, d_ff,
                                                       dropout)
        self.ctx_src_self_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.ctx_src_layer_norm = onmt.modules.LayerNorm(d_model)

        self.src_self_attn = onmt.modules.MultiHeadedAttention(heads,
                                                               d_model,
                                                               dropout=dropout)
        self.src_layer_norm = onmt.modules.LayerNorm(d_model)

        # TODO dim
        self.gate = nn.Linear(d_model * 2, 1)
        self.gate_sigmoid = nn.Sigmoid()

        self.final_feed_forward = PositionwiseFeedForward(
            d_model, d_ff, dropout)
        self.final_layer_norm = onmt.modules.LayerNorm(d_model)
Example #24
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 attention_dropout,
                 max_relative_positions=0,
                 activation='relu',
                 is_bert=False):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    activation)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-12 if is_bert else 1e-6)
        self.dropout = nn.Dropout(dropout)
Example #25
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        self_attn_func,
        self_attn_alpha,
        self_attn_bisect_iter,
    ):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads,
            d_model,
            dropout=dropout,
            attn_func=self_attn_func,
            attn_alpha=self_attn_alpha,
            attn_bisect_iter=self_attn_bisect_iter,
        )
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #26
    def __init__(self, d_model, d_ff, n_head, dropout):
        super(VideoBlock, self).__init__()
        self.self_attn = MultiHeadedAttention(n_head, d_model)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.sublayer = nn.ModuleList([SublayerConnection(d_model, dropout)])
        self.drop = nn.Dropout(dropout)
Example #27
    def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
                 max_relative_positions=0, num_boost=4, learnable_weights=True, 
                 boost_type='continuous', main_stream=False, boost_drop_rate=0.1, 
                 boost_dropout_diff=0.0,boost_with_ffn=False, boost_str='', 
                 boost_gating=False, mask_pos_type=[], self_att_merge_layer=False,
                 adv_bias_step=0.0, shuffle_merge=False, shuffle_merge_type="sum",
                 adv_gradient_boost=False,
                 adv_gradient_boost_step=0.01, adv_gradient_boost_func='mse',
                 adv_gradient_boost_no_ce=False, gradient_boost_scale=1.0,
                 boost_adv_method_list=[], boost_sample_rate=1.0,
                 shuffle_fix=0, boost_single_att=False, boost_single_ffn=False,
                 shuffle_stop_gradient=False):
                 
        super(TransformerEncoderBoostLayer, self).__init__()

        self.num_boost = num_boost
        self.boost_type = boost_type
        self.main_stream = main_stream
        self.boost_drop_rate = boost_drop_rate
        self.boost_with_ffn = boost_with_ffn
        self.use_adv = True if self.boost_type == 'adv' else False
        self.a_num = num_boost
        # self.use_dropout_diff = True if boost_dropout_diff != 0.0 else False
        self.use_dropout_diff = False
        self.d_num = num_boost
        self.use_mask = True if self.boost_type in {'continuous', 'continuous_comp', 'random', 'pos'} else False
        # overwrite params based on boost_str
        self.boost_gating = boost_gating
        self.mask_pos_type = mask_pos_type
        # init postag params
        self.use_postag = False
        self.p_num = 0
        self._parse_boost_str(boost_str)
        # whether to use self-att to merge each path's output
        self.use_self_att_merge_layer = self_att_merge_layer
        
        self.adv_bias_step = adv_bias_step
        self.shuffle_merge = shuffle_merge
        self.shuffle_merge_type = shuffle_merge_type

        self.adv_gradient_boost = adv_gradient_boost
        self.adv_gradient_boost_step = adv_gradient_boost_step
        self.adv_gradient_boost_func = adv_gradient_boost_func
        self.adv_gradient_boost_no_ce = adv_gradient_boost_no_ce

        self.gradient_boost_scale = gradient_boost_scale 
        self.boost_sample_rate = boost_sample_rate

        self.boost_single_att = boost_single_att
        self.boost_single_ffn = boost_single_ffn

        self.shuffle_stop_gradient = shuffle_stop_gradient

        # compute dropout list
        if not self.use_dropout_diff:
            dropout_list = [dropout for i in range(self.num_boost)]
        else:
            dropout_diffs = [boost_dropout_diff * i - float(self.d_num)/2 * boost_dropout_diff for i in range(self.d_num)]
            dropout_list = [dropout + dropout_diffs[i] for i in range(self.d_num)] + [dropout for i in range(self.num_boost - self.d_num)]
        self.dropout_list = dropout_list
        print("Boost dropout list: {}".format(dropout_list))
        assert max(dropout_list) <= 1.0 and min(dropout_list) >= 0.0
        
        # list of self-attention module
        if not self.boost_single_att:
            self.self_attn_list = [ MultiHeadedAttention(
                                        heads, d_model, dropout=attention_dropout,
                                        max_relative_positions=max_relative_positions)
                                    for n in range(self.num_boost) ]
            self.self_attn_list = nn.ModuleList(self.self_attn_list)
        else:
            self.self_attn_list = MultiHeadedAttention(
                                        heads, d_model, dropout=attention_dropout,
                                        max_relative_positions=max_relative_positions)
        # assert self.d_num == self.num_boost

        if self.main_stream:
            # main stream for self-attention
            self.main_self_attn = MultiHeadedAttention(heads, d_model, dropout=attention_dropout,
                                                       max_relative_positions=max_relative_positions)
        
        if self.use_self_att_merge_layer:
            # keep the default setting for self-attention layer.
            self.att_merge_layer = MultiHeadedAttention(
                heads, d_model, dropout=attention_dropout,
                max_relative_positions=max_relative_positions)
            self.merge_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        # convert all ones to 1/N
        weights_init = torch.ones(self.num_boost, dtype=torch.float32) / self.num_boost
        self.weights = nn.Parameter(weights_init, requires_grad=learnable_weights)
        
        if self.boost_with_ffn:
            if not self.boost_single_ffn:
                feed_forward_list = [ PositionwiseFeedForward(d_model, d_ff, dropout_list[i]) for i in range(self.num_boost) ]
                self.feed_forward = nn.ModuleList(feed_forward_list)
            else:
                self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_list[0])
        else:
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
            
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
        # TODO: Functions for drop_rate is not implemented yet.

        self.shuffle_fix = shuffle_fix

        if self.shuffle_merge:
            if shuffle_fix == 0:            
                shuffle_matrix = torch.abs(torch.randn(self.num_boost, self.num_boost))
            else:
                shuffle_matrix = torch.ones(self.num_boost, self.num_boost) / self.num_boost
            self.shuffle_matrix = nn.Parameter(shuffle_matrix)
            self.merge_weights = torch.ones((self.num_boost-1,), dtype=torch.float32, requires_grad=False)

        if self.use_adv or self.use_postag is True:
            # permutation of max position range.
            self.max_perm = 3
            self.max_exchange = 3
            if not boost_adv_method_list:
                all_adv_methods = ['swap', 'reorder', 'delete', 'mask']
            else:
                all_adv_methods = boost_adv_method_list
            assert self.a_num <= len(all_adv_methods)
            self.activate_methods = all_adv_methods[:self.a_num]
            # create mask tensor
            if "mask" in self.activate_methods or self.use_postag is True:
                mask_tensor = torch.empty(d_model)
                torch.nn.init.normal_(mask_tensor, std=1.0/math.sqrt(d_model))
                self.mask_tensor = nn.Parameter(mask_tensor)

            print('Activated adversarial methods: {}'.format(self.activate_methods))
        
        if self.use_postag:
            assert len(self.mask_pos_type) == self.p_num

        if self.adv_gradient_boost is True:
            if adv_gradient_boost_func == 'mse':
                self.mse = nn.MSELoss(reduction='none')
            elif adv_gradient_boost_func == 'cos':
                self.cos_sim = nn.CosineSimilarity(dim=2)
            elif adv_gradient_boost_func == 'l1':
                self.l1 = nn.L1Loss(reduction='none')
            else:
                raise ValueError()
        
        self.keep_adv_gradient = False
        self.adv_gradient_value = 'moving_average'
        if self.keep_adv_gradient:
            # unreachable while keep_adv_gradient stays False; None keeps the call valid
            self.register_buffer('gradient_moving_average', None)

        self.keep_ffn_dist = []
        self.keep_attn_dist = []
        self.keep_attn_out_dist = []
        self.keep_attn_score = []
        
        return
Example #28
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            d_model (int): the dimension of keys/values/queries in
                :class:`MultiHeadedAttention`, also the input size of
                the first-layer of the :class:`PositionwiseFeedForward`.
            heads (int): the number of heads for MultiHeadedAttention.
            d_ff (int): the second-layer of the
                :class:`PositionwiseFeedForward`.
            dropout (float): dropout in residual, self-attn(dot) and
                feed-forward
            attention_dropout (float): dropout in context_attn  (and
                self-attn(avg))
            self_attn_type (string): type of self-attention scaled-dot,
                average
            max_relative_positions (int):
                Max distance between inputs in relative positions
                representations
            aan_useffn (bool): Turn on the FFN layer in the AAN decoder
            full_context_alignment (bool):
                whether enable an extra full context decoder forward for
                alignment
            alignment_heads (int):
                N. of cross attention heads to use for alignment guiding
            pos_ffn_activation_fn (ActivationFunction):
                activation function choice for PositionwiseFeedForward layer

        """
        super(TransformerDecoderLayerBase, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=attention_dropout,
                max_relative_positions=max_relative_positions,
            )
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(d_model,
                                              dropout=attention_dropout,
                                              aan_useffn=aan_useffn)

        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    pos_ffn_activation_fn)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads
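The docstring above maps one-to-one onto the constructor arguments; a hypothetical instantiation with typical base-transformer values (the import paths assume a recent OpenNMT-py, and the concrete subclass name may differ between versions):

from onmt.decoders.transformer import TransformerDecoderLayer
from onmt.modules.position_ffn import ActivationFunction

layer = TransformerDecoderLayer(
    d_model=512,                  # key/query/value size, FFN input size
    heads=8,                      # number of attention heads
    d_ff=2048,                    # FFN hidden size
    dropout=0.1,                  # residual / self-attn(dot) / FFN dropout
    attention_dropout=0.1,        # context-attn (and self-attn(avg)) dropout
    self_attn_type="scaled-dot",  # or "average" for an AAN decoder
    max_relative_positions=0,
    aan_useffn=False,
    full_context_alignment=False,
    alignment_heads=0,
    pos_ffn_activation_fn=ActivationFunction.relu,
)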
Example #29
    def __init__(self, model_mode, model_mode2, model_ffn_mode, d_model, heads,
                 d_ff, dropout):
        super(ATransformerEncoderLayer, self).__init__()

        self.model_mode = model_mode
        self.model_mode2 = model_mode2
        self.model_ffn_mode = model_ffn_mode

        if self.model_mode2 in ['default']:
            # attention
            self.self_attn = onmt.modules.MultiHeadedAttention(heads,
                                                               d_model,
                                                               dropout=dropout)
            self.knowledge_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            self.context_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            # feed forward
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
            # layer normalization
            self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
            # debug
            print(
                'init encoder to default: model_mode={}, model_mode2={}, model_ffn_mode={}'
                .format(self.model_mode, self.model_mode2,
                        self.model_ffn_mode))

        elif self.model_mode2 in ['ffn']:
            # attention
            self.self_attn = onmt.modules.MultiHeadedAttention(heads,
                                                               d_model,
                                                               dropout=dropout)
            self.knowledge_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            self.context_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            # feed forward
            if self.model_mode in ['top_act']:
                d_act = 1
            elif self.model_mode in ['all_acts']:
                d_act = 4
            else:
                print('choose valid option -model_mode')
                exit()
            if self.model_ffn_mode in ['additional']:
                self.feed_forward = PositionwiseFeedForward(
                    d_model + d_act, d_ff, dropout)
                self.feed_forward2 = nn.Linear(d_model + d_act, d_model)
            elif self.model_ffn_mode in ['resnet_nLN', 'resnet_LN']:
                self.feed_forward = PositionwiseFeedForward2(
                    d_model + d_act, d_model, d_ff, dropout,
                    self.model_ffn_mode)
            else:
                print('choose valid option -model_ffn_mode')
                exit()
            # layer normalization
            self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
            # debug
            print(
                'init encoder to ffn: model_mode={}, model_mode2={}, model_ffn_mode={}'
                .format(self.model_mode, self.model_mode2,
                        self.model_ffn_mode))

        elif self.model_mode2 in ['utt_emb']:
            # TODO: branch test
            if self.model_mode in ['top_act']:
                d_act = 1
            elif self.model_mode in ['all_acts']:
                d_act = 4
            else:
                print('choose valid option -model_mode')
                exit()
            # align dimension
            self.align_feed_forward = nn.Linear(d_model + d_act, d_model)
            # attention
            self.self_attn = onmt.modules.MultiHeadedAttention(heads,
                                                               d_model,
                                                               dropout=dropout)
            self.knowledge_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            self.context_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            # feed forward
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
            # layer normalization
            self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
            # debug
            print(
                'init encoder to utt_emb: model_mode={}, model_mode2={}, model_ffn_mode={}'
                .format(self.model_mode, self.model_mode2,
                        self.model_ffn_mode))

        else:
            print('choose valid option -model_mode2')
            exit()

        self.dropout = nn.Dropout(dropout)
Example #30
    def __init__(self,
                 model_mode,
                 model_mode2,
                 model_ffn_mode,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 self_attn_type="scaled-dot"):
        super(TransformerDecoderLayer, self).__init__()

        self.model_mode = model_mode
        self.model_mode2 = model_mode2
        self.model_ffn_mode = model_ffn_mode
        self.self_attn_type = self_attn_type

        if self.model_mode2 in ['default']:
            # self attention
            if self_attn_type == "scaled-dot":
                self.self_attn = onmt.modules.MultiHeadedAttention(
                    heads, d_model, dropout=dropout)
            elif self_attn_type == "average":
                self.self_attn = onmt.modules.AverageAttention(d_model,
                                                               dropout=dropout)
            # other attention
            self.knowledge_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            self.history_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            # feed forward
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
            # layer normalization
            self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
            # debug
            print(
                'init decoder to default: model_mode={}, model_mode2={}, model_ffn_mode={}'
                .format(self.model_mode, self.model_mode2,
                        self.model_ffn_mode))

        elif self.model_mode2 in ['ffn']:
            # self attention
            if self_attn_type == "scaled-dot":
                self.self_attn = onmt.modules.MultiHeadedAttention(
                    heads, d_model, dropout=dropout)
            elif self_attn_type == "average":
                self.self_attn = onmt.modules.AverageAttention(d_model,
                                                               dropout=dropout)
            # other attention
            self.knowledge_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            self.history_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            # feed forward
            if self.model_mode in ['top_act']:
                d_act = 1
            elif self.model_mode in ['all_acts']:
                d_act = 4
            else:
                print('choose valid option -model_mode')
                exit()
            if self.model_ffn_mode in ['additional']:
                self.feed_forward = PositionwiseFeedForward(
                    d_model + d_act, d_ff, dropout)
                self.feed_forward2 = nn.Linear(d_model + d_act, d_model)
            elif self.model_ffn_mode in ['resnet_nLN', 'resnet_LN']:
                self.feed_forward = PositionwiseFeedForward2(
                    d_model + d_act, d_model, d_ff, dropout,
                    self.model_ffn_mode)
            else:
                print('choose valid option -model_ffn_mode')
                exit()
            # layer normalization
            self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
            # debug
            print(
                'init decoder to ffn: model_mode={}, model_mode2={}, model_ffn_mode={}'
                .format(self.model_mode, self.model_mode2,
                        self.model_ffn_mode))

        elif self.model_mode2 in ['utt_emb']:
            # TODO: branch test
            if self.model_mode in ['top_act']:
                d_act = 1
            elif self.model_mode in ['all_acts']:
                d_act = 4
            else:
                print('choose valid option -model_mode')
                exit()
            # align dimension
            self.align_feed_forward = nn.Linear(d_model + d_act, d_model)
            # self attention
            if self_attn_type == "scaled-dot":
                self.self_attn = onmt.modules.MultiHeadedAttention(
                    heads, d_model, dropout=dropout)
            elif self_attn_type == "average":
                self.self_attn = onmt.modules.AverageAttention(d_model,
                                                               dropout=dropout)
            # other attention
            self.knowledge_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            self.history_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
            # feed forward
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
            # layer normalization
            self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
            self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
            # debug
            print(
                'init decoder to utt_emb: model_mode={}, model_mode2={}, model_ffn_mode={}'
                .format(self.model_mode, self.model_mode2,
                        self.model_ffn_mode))

        else:
            print('choose valid option -model_mode2')
            exit()

        self.dropout = dropout
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)