def __init__(self, opt, d_model, heads, d_ff, dropout, attention_dropout,
             self_attn_type="scaled-dot", max_relative_positions=0,
             aan_useffn=False, dict_size=None, label_emb=None):
    super(TransformerDecoderLayer, self).__init__()

    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions,
            dict_size=dict_size, label_emb=label_emb, opt=opt)
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model,
                                          dropout=attention_dropout,
                                          aan_useffn=aan_useffn)

    self.context_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout, attention_dropout, self_attn_type="scaled-dot", max_relative_positions=0, aan_useffn=False, full_context_alignment=False, alignment_heads=None): super(TransformerDecoderLayer, self).__init__() if self_attn_type == "scaled-dot": self.self_attn = MultiHeadedAttention( heads, d_model, dropout=dropout, max_relative_positions=max_relative_positions) elif self_attn_type == "average": self.self_attn = AverageAttention(d_model, dropout=attention_dropout, aan_useffn=aan_useffn) self.context_attn = MultiHeadedAttention(heads, d_model, dropout=attention_dropout) self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) self.drop = nn.Dropout(dropout) self.full_context_alignment = full_context_alignment self.alignment_heads = alignment_heads
def __init__(self, d_model, heads, d_ff, dropout, layer_index,
             max_relative_positions=0):
    super(ConvTransformerEncoderLayer, self).__init__()

    # Lower layers use convolutional attention; higher layers fall back to
    # standard multi-headed attention.
    if layer_index <= 2:
        self.self_attn = ConvMultiHeadedAttention(
            heads, d_model, 13, 3, dropout=dropout,
            max_relative_positions=max_relative_positions)
    else:
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, size, dropout, head_count=8, hidden_size=2048,
             self_attn_type="scaled-dot"):
    super(TransformerDecoderLayer, self).__init__()

    self.self_attn_type = self_attn_type
    if self_attn_type == "scaled-dot":
        self.self_attn = onmt.modules.MultiHeadedAttention(
            head_count, size, dropout=dropout)
    elif self_attn_type == "average":
        self.self_attn = onmt.modules.AverageAttention(
            size, dropout=dropout)

    self.context_attn = onmt.modules.MultiHeadedAttention(
        head_count, size, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(size, hidden_size, dropout)
    self.layer_norm_1 = onmt.modules.LayerNorm(size)
    self.layer_norm_2 = onmt.modules.LayerNorm(size)
    self.dropout = dropout
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer('mask', mask)

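# Several decoder layers in this collection precompute a causal mask up to
# MAX_SIZE via self._get_attn_subsequent_mask, which is not shown here.
# A minimal, self-contained sketch of what such a helper typically returns;
# this is an illustrative assumption, not the repository's implementation.
import torch

def subsequent_mask_sketch(size):
    # Shape (1, size, size); entry [0, i, j] == 1 marks a future position
    # j > i that position i must not attend to.
    return torch.triu(torch.ones(1, size, size, dtype=torch.uint8),
                      diagonal=1)
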
def __init__(self, d_model, heads, d_ff, dropout, max_relative_positions=0,
             strided_attn=False, conv_k_v=False):
    super(TransformerEncoderLayer, self).__init__()

    self.strided_attn = strided_attn
    self.conv_k_v = conv_k_v
    if self.strided_attn:
        self.self_attn = MultiHeadedStridedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    else:
        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout,
            max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.conv1d_k_v = nn.Conv1d(d_model, d_model, kernel_size=3, stride=3)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout, self_attn_type="scaled-dot", max_relative_positions=0): super(TransformerDecoderLayer, self).__init__() if self_attn_type == "scaled-dot": self.self_attn = MultiHeadedAttention( heads, d_model, dropout=dropout, max_relative_positions=max_relative_positions) elif self_attn_type == "average": self.self_attn = AverageAttention(d_model, dropout=dropout) self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) self.drop = nn.Dropout(dropout)
def __init__(self, d_model, heads, d_ff, dropout, attention_dropout, self_attn_type="scaled-dot", max_relative_positions=0, aan_useffn=False, tgt_concept_words_type=-1): super(TransformerDecoderLayer, self).__init__() if self_attn_type == "scaled-dot": self.self_attn = MultiHeadedAttention( heads, d_model, dropout=dropout, max_relative_positions=max_relative_positions) elif self_attn_type == "average": self.self_attn = AverageAttention(d_model, dropout=attention_dropout, aan_useffn=aan_useffn) self.context_attn = MultiHeadedAttention(heads, d_model, dropout=attention_dropout) self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) self.drop = nn.Dropout(dropout) self.tgt_concept_words_type = tgt_concept_words_type if tgt_concept_words_type in [2]: self.tgt_concept_mlp = nn.Linear(d_model * 2, d_model)
def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    self.model_dim = 1024
    self.d_ff = 4096

    torch.set_grad_enabled(False)
    onmt_ffn = PositionwiseFeedForward(self.model_dim, self.d_ff)
    onmt_ffn.eval()
    if use_cuda:
        onmt_ffn.to(self.test_device)

    turbo_ffn_trans = turbo_transformers.PositionwiseFeedForward.from_onmt(
        onmt_ffn, is_trans_weight=True)
    turbo_ffn_notrans = turbo_transformers.PositionwiseFeedForward.from_onmt(
        onmt_ffn, is_trans_weight=False)

    # (batch_size, input_len, model_dim); batch_size and input_len are
    # expected to be defined in the enclosing test scope.
    inputs = torch.rand(size=(batch_size, input_len, self.model_dim),
                        dtype=torch.float32,
                        device=self.test_device)
    return onmt_ffn, turbo_ffn_trans, turbo_ffn_notrans, inputs

def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             max_relative_positions=0):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.video_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm1 = LayerNorm(d_model)
    self.layer_norm2 = LayerNorm(d_model)
    self.drop = nn.Dropout(dropout)
    self.sublayer = nn.ModuleList(
        [SublayerConnection(d_model, dropout) for _ in range(3)])

def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

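# The encoder-layer variants in this collection mostly differ only in which
# attention module they construct; the pre-norm residual wiring around
# self-attention and PositionwiseFeedForward is shared. Below is a minimal,
# self-contained sketch of that wiring using torch.nn primitives in place of
# the project-specific modules; the class name, mask handling and FFN
# internals are illustrative assumptions, not the repository's forward().
import torch
import torch.nn as nn


class PreNormEncoderLayerSketch(nn.Module):
    def __init__(self, d_model, heads, d_ff, dropout):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, heads,
                                               dropout=dropout,
                                               batch_first=True)
        # Stand-in for PositionwiseFeedForward: pre-norm two-layer MLP.
        self.feed_forward = nn.Sequential(
            nn.LayerNorm(d_model, eps=1e-6),
            nn.Linear(d_model, d_ff), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(d_ff, d_model), nn.Dropout(dropout))
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, key_padding_mask=None):
        # Pre-norm self-attention with a residual connection.
        x_norm = self.layer_norm(x)
        ctx, _ = self.self_attn(x_norm, x_norm, x_norm,
                                key_padding_mask=key_padding_mask)
        x = x + self.dropout(ctx)
        # Position-wise feed-forward with its own residual.
        return x + self.feed_forward(x)


# Example: one layer over a (batch, length, d_model) tensor.
layer = PreNormEncoderLayerSketch(d_model=512, heads=8, d_ff=2048,
                                  dropout=0.1)
out = layer(torch.rand(2, 7, 512))
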
def __init__(self, d_model, heads, d_ff, dropout, self_attn_type="scaled-dot"): super(TransformerDecoderLayer, self).__init__() if self_attn_type == "scaled-dot": self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) elif self_attn_type == "average": self.self_attn = AverageAttention(d_model, dropout=dropout) self.context_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) self.drop = nn.Dropout(dropout) mask = self._get_attn_subsequent_mask(MAX_SIZE) # Register self.mask as a buffer in TransformerDecoderLayer, so # it gets TransformerDecoderLayer's cuda behavior automatically. self.register_buffer('mask', mask)
def __init__(self, model_dim, dropout=0.1, aan_useffn=False):
    self.model_dim = model_dim
    self.aan_useffn = aan_useffn
    super(AverageAttention, self).__init__()
    if aan_useffn:
        self.average_layer = PositionwiseFeedForward(
            model_dim, model_dim, dropout)
    self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)

def __init__(self, model_dim, dropout=0.1):
    self.model_dim = model_dim
    super(AverageAttention, self).__init__()
    self.average_layer = PositionwiseFeedForward(model_dim, model_dim,
                                                 dropout)
    self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)

def __init__(self, size, dropout, head_count=8, hidden_size=2048):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(head_count, size,
                                                       dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(size, hidden_size, dropout)
    self.layer_norm = onmt.modules.LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             max_relative_positions=0, gnn=None):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
    self.ffn_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

def __init__(self, d_model, heads, d_ff, dropout, feat_vec_size):
    # feat_vec_size added for adaptable feat_vec_size  #latt
    super(LatticeEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    # latt
    self.latt_attn = onmt.modules.GlobalAttention(d_model)
    self.feat_vec_size = feat_vec_size
    # latt
    self.layer_norm = onmt.modules.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

def __init__(self, model_dim, dropout=0.1, aan_useffn=False,
             pos_ffn_activation_fn=ActivationFunction.relu):
    self.model_dim = model_dim
    self.aan_useffn = aan_useffn
    super(AverageAttention, self).__init__()
    if aan_useffn:
        self.average_layer = PositionwiseFeedForward(
            model_dim, model_dim, dropout, pos_ffn_activation_fn)
    self.gating_layer = nn.Linear(model_dim * 2, model_dim * 2)

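# The AverageAttention variants above replace decoder self-attention with a
# cumulative average over previous positions followed by a gating layer, as
# in Average Attention Networks (Zhang et al., 2018). A minimal sketch of
# that computation, assuming the published formulation; the exact tensor
# layout and method names in the repository's forward() are assumptions.
import torch
import torch.nn as nn


def average_attention_sketch(inputs, gating_layer, average_layer=None):
    # inputs: (batch, length, model_dim).
    batch, length, _ = inputs.shape
    # Position t sees the mean of inputs[0..t] (inclusive).
    steps = torch.arange(1, length + 1, device=inputs.device,
                         dtype=inputs.dtype).view(1, length, 1)
    cum_avg = torch.cumsum(inputs, dim=1) / steps
    if average_layer is not None:  # the aan_useffn=True path
        cum_avg = average_layer(cum_avg)
    # Gating: mix each raw input with its running average.
    gates = gating_layer(torch.cat((inputs, cum_avg), dim=-1))
    input_gate, forget_gate = torch.chunk(gates, 2, dim=-1)
    return (torch.sigmoid(input_gate) * inputs
            + torch.sigmoid(forget_gate) * cum_avg)


# Example with model_dim = 8, mirroring gating_layer = nn.Linear(2d, 2d).
x = torch.rand(2, 5, 8)
out = average_attention_sketch(x, nn.Linear(16, 16))
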
def __init__(self, d_model, heads, d_ff, dropout, feat_vec_size):
    # feat_vec_size added for adaptable feat_vec_size  #latt
    super(LatticeEncoderLayer, self).__init__()

    # self.self_attn = onmt.modules.MultiHeadedAttention(
    #     heads, d_model, dropout=dropout)  # not used for RBA layer
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    # latt
    self.latt_attn = onmt.modules.GlobalAttention(d_model)
    self.feat_vec_size = feat_vec_size
    # Layer for calculating the context gate score
    self.linear_context_score = nn.Linear(feat_vec_size, 1)
    # latt
    self.layer_norm = onmt.modules.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderLMLayer, self).__init__()

    # We no longer have context attention, only self-attention.
    self.self_attn = onmt.modules.MultiHeadedAttention(heads, d_model,
                                                       dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = onmt.modules.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerEncoderLMLayer, so
    # it gets this module's cuda behavior automatically.
    self.register_buffer('mask', mask)

def __init__( self, d_model, heads, d_ff, dropout, self_attn_type="scaled-dot", self_attn_func="softmax", self_attn_alpha=None, self_attn_bisect_iter=0, context_attn_func="softmax", context_attn_alpha=None, context_attn_bisect_iter=0, ): super(TransformerDecoderLayer, self).__init__() self.self_attn_type = self_attn_type if self_attn_type == "scaled-dot": self.self_attn = onmt.modules.MultiHeadedAttention( heads, d_model, dropout=dropout, attn_func=self_attn_func, attn_alpha=self_attn_alpha, attn_bisect_iter=self_attn_bisect_iter, ) elif self_attn_type == "average": self.self_attn = onmt.modules.AverageAttention(d_model, dropout=dropout) self.context_attn = onmt.modules.MultiHeadedAttention( heads, d_model, dropout=dropout, attn_func=context_attn_func, attn_alpha=context_attn_alpha, attn_bisect_iter=context_attn_bisect_iter, ) self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6) self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6) self.dropout = dropout self.drop = nn.Dropout(dropout) mask = self._get_attn_subsequent_mask(MAX_SIZE) # Register self.mask as a buffer in TransformerDecoderLayer, so # it gets TransformerDecoderLayer's cuda behavior automatically. self.register_buffer("mask", mask)
def __init__(self, d_model, heads, d_ff, dropout):
    super(ATransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(heads, d_model,
                                                       dropout=dropout)
    self.knowledge_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.context_attn = onmt.modules.MultiHeadedAttention(heads, d_model,
                                                          dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout, max_relative_positions=0,
             downsampling=1):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.ds_layer = nn.Linear(d_model, int(d_model / downsampling)) \
        if downsampling > 1 else None
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings,
             selected_ctx=0, fields=None):
    super(SimpleContextTransformerEncoder, self).__init__()

    self.selected_ctx = selected_ctx
    self.fields = fields
    self.num_layers = num_layers
    self.embeddings = embeddings
    self.layer_norm_shared = onmt.modules.LayerNorm(d_model)
    self.layer_norm_ctx = onmt.modules.LayerNorm(d_model)
    self.layer_norm_src_final = onmt.modules.LayerNorm(d_model)
    self.layer_norm_ctx_final = onmt.modules.LayerNorm(d_model)

    self.shared_layers = nn.ModuleList([
        TransformerEncoderLayer(d_model, heads, d_ff, dropout)
        for _ in range(num_layers - 1)
    ])
    self.extra_ctx_layer = TransformerEncoderLayer(d_model, heads, d_ff,
                                                   dropout)

    self.ctx_src_self_attn = onmt.modules.MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.ctx_src_layer_norm = onmt.modules.LayerNorm(d_model)
    self.src_self_attn = onmt.modules.MultiHeadedAttention(heads, d_model,
                                                           dropout=dropout)
    self.src_layer_norm = onmt.modules.LayerNorm(d_model)

    # TODO dim
    self.gate = nn.Linear(d_model * 2, 1)
    self.gate_sigmoid = nn.Sigmoid()

    self.final_feed_forward = PositionwiseFeedForward(
        d_model, d_ff, dropout)
    self.final_layer_norm = onmt.modules.LayerNorm(d_model)

def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             max_relative_positions=0, activation='relu', is_bert=False):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=attention_dropout,
        max_relative_positions=max_relative_positions)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                activation)
    # BERT uses a larger LayerNorm epsilon (1e-12) than the usual 1e-6.
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-12 if is_bert else 1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(
    self,
    d_model,
    heads,
    d_ff,
    dropout,
    self_attn_func,
    self_attn_alpha,
    self_attn_bisect_iter,
):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = onmt.modules.MultiHeadedAttention(
        heads,
        d_model,
        dropout=dropout,
        attn_func=self_attn_func,
        attn_alpha=self_attn_alpha,
        attn_bisect_iter=self_attn_bisect_iter,
    )
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)

def __init__(self, d_model, d_ff, n_head, dropout):
    super(VideoBlock, self).__init__()

    self.self_attn = MultiHeadedAttention(n_head, d_model)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.sublayer = nn.ModuleList([SublayerConnection(d_model, dropout)])
    self.drop = nn.Dropout(dropout)

def __init__(self, d_model, heads, d_ff, dropout, attention_dropout,
             max_relative_positions=0, num_boost=4, learnable_weights=True,
             boost_type='continuous', main_stream=False, boost_drop_rate=0.1,
             boost_dropout_diff=0.0, boost_with_ffn=False, boost_str='',
             boost_gating=False, mask_pos_type=[],
             self_att_merge_layer=False, adv_bias_step=0.0,
             shuffle_merge=False, shuffle_merge_type="sum",
             adv_gradient_boost=False, adv_gradient_boost_step=0.01,
             adv_gradient_boost_func='mse', adv_gradient_boost_no_ce=False,
             gradient_boost_scale=1.0, boost_adv_method_list=[],
             boost_sample_rate=1.0, shuffle_fix=0, boost_single_att=False,
             boost_single_ffn=False, shuffle_stop_gradient=False):
    super(TransformerEncoderBoostLayer, self).__init__()

    self.num_boost = num_boost
    self.boost_type = boost_type
    self.main_stream = main_stream
    self.boost_drop_rate = boost_drop_rate
    self.boost_with_ffn = boost_with_ffn
    self.use_adv = True if self.boost_type == 'adv' else False
    self.a_num = num_boost
    # self.use_dropout_diff = True if boost_dropout_diff != 0.0 else False
    self.use_dropout_diff = False
    self.d_num = num_boost
    self.use_mask = True if self.boost_type in {
        'continuous', 'continuous_comp', 'random', 'pos'} else False
    # overwrite params based on boost_str
    self.boost_gating = boost_gating
    self.mask_pos_type = mask_pos_type
    # init postag params
    self.use_postag = False
    self.p_num = 0
    self._parse_boost_str(boost_str)
    # whether to use self-attention to merge each path's output
    self.use_self_att_merge_layer = self_att_merge_layer
    self.adv_bias_step = adv_bias_step
    self.shuffle_merge = shuffle_merge
    self.shuffle_merge_type = shuffle_merge_type
    self.adv_gradient_boost = adv_gradient_boost
    self.adv_gradient_boost_step = adv_gradient_boost_step
    self.adv_gradient_boost_func = adv_gradient_boost_func
    self.adv_gradient_boost_no_ce = adv_gradient_boost_no_ce
    self.gradient_boost_scale = gradient_boost_scale
    self.boost_sample_rate = boost_sample_rate
    self.boost_single_att = boost_single_att
    self.boost_single_ffn = boost_single_ffn
    self.shuffle_stop_gradient = shuffle_stop_gradient

    # compute dropout list
    if not self.use_dropout_diff:
        dropout_list = [dropout for i in range(self.num_boost)]
    else:
        dropout_diffs = [boost_dropout_diff * i
                         - float(self.d_num) / 2 * boost_dropout_diff
                         for i in range(self.d_num)]
        dropout_list = ([dropout + dropout_diffs[i]
                         for i in range(self.d_num)]
                        + [dropout
                           for i in range(self.num_boost - self.d_num)])
    self.dropout_list = dropout_list
    print("Boost dropout list: {}".format(dropout_list))
    assert max(dropout_list) <= 1.0 and min(dropout_list) >= 0.0

    # list of self-attention modules
    if not self.boost_single_att:
        self.self_attn_list = [
            MultiHeadedAttention(
                heads, d_model, dropout=attention_dropout,
                max_relative_positions=max_relative_positions)
            for n in range(self.num_boost)
        ]
        self.self_attn_list = nn.ModuleList(self.self_attn_list)
    else:
        self.self_attn_list = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
    # assert self.d_num == self.num_boost

    if self.main_stream:
        # main stream for self-attention
        self.main_self_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)

    if self.use_self_att_merge_layer:
        # keep the default setting for the self-attention layer
        self.att_merge_layer = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout,
            max_relative_positions=max_relative_positions)
        self.merge_layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    # initialise all path weights to 1/N
    weights_init = torch.ones(self.num_boost,
                              dtype=torch.float32) / self.num_boost
    self.weights = nn.Parameter(weights_init,
                                requires_grad=learnable_weights)

    if self.boost_with_ffn:
        if not self.boost_single_ffn:
            feed_forward_list = [
                PositionwiseFeedForward(d_model, d_ff, dropout_list[i])
                for i in range(self.num_boost)
            ]
            self.feed_forward = nn.ModuleList(feed_forward_list)
        else:
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff,
                                                        dropout_list[0])
    else:
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
    # TODO: Functions for drop_rate are not implemented yet.

    self.shuffle_fix = shuffle_fix
    if self.shuffle_merge:
        if shuffle_fix == 0:
            shuffle_matrix = torch.abs(torch.randn(self.num_boost,
                                                   self.num_boost))
        else:
            shuffle_matrix = torch.ones(self.num_boost,
                                        self.num_boost) / self.num_boost
        self.shuffle_matrix = nn.Parameter(shuffle_matrix)
        self.merge_weights = torch.ones((self.num_boost - 1,),
                                        dtype=torch.float32,
                                        requires_grad=False)

    if self.use_adv or self.use_postag is True:
        # permutation of max position range
        self.max_perm = 3
        self.max_exchange = 3
        if not boost_adv_method_list:
            all_adv_methods = ['swap', 'reorder', 'delete', 'mask']
        else:
            all_adv_methods = boost_adv_method_list
        assert self.a_num <= len(all_adv_methods)
        self.activate_methods = all_adv_methods[:self.a_num]
        # create mask tensor
        if "mask" in self.activate_methods or self.use_postag is True:
            mask_tensor = torch.empty(d_model)
            torch.nn.init.normal_(mask_tensor,
                                  std=1.0 / math.sqrt(d_model))
            self.mask_tensor = nn.Parameter(mask_tensor)
        print('Activated adversarial methods: {}'.format(
            self.activate_methods))

    if self.use_postag:
        assert len(self.mask_pos_type) == self.p_num

    if self.adv_gradient_boost is True:
        if adv_gradient_boost_func == 'mse':
            self.mse = nn.MSELoss(reduction='none')
        elif adv_gradient_boost_func == 'cos':
            self.cos_sim = nn.CosineSimilarity(dim=2)
        elif adv_gradient_boost_func == 'l1':
            self.l1 = nn.L1Loss(reduction='none')
        else:
            raise ValueError()

    self.keep_adv_gradient = False
    self.adv_gradient_value = 'moving_average'
    if self.keep_adv_gradient:
        # NOTE: the original code left the buffer value unspecified;
        # registering None is used here as a neutral placeholder.
        self.register_buffer('gradient_moving_average', None)
    self.keep_ffn_dist = []
    self.keep_attn_dist = []
    self.keep_attn_out_dist = []
    self.keep_attn_score = []
    return

def __init__(
    self,
    d_model,
    heads,
    d_ff,
    dropout,
    attention_dropout,
    self_attn_type="scaled-dot",
    max_relative_positions=0,
    aan_useffn=False,
    full_context_alignment=False,
    alignment_heads=0,
    pos_ffn_activation_fn=ActivationFunction.relu,
):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            :class:`MultiHeadedAttention`, also the input size of
            the first layer of the :class:`PositionwiseFeedForward`.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            :class:`PositionwiseFeedForward`.
        dropout (float): dropout in residual, self-attn(dot) and
            feed-forward.
        attention_dropout (float): dropout in context_attn (and
            self-attn(avg)).
        self_attn_type (string): type of self-attention,
            "scaled-dot" or "average".
        max_relative_positions (int): max distance between inputs in
            relative position representations.
        aan_useffn (bool): turn on the FFN layer in the AAN decoder.
        full_context_alignment (bool): whether to enable an extra
            full-context decoder forward for alignment.
        alignment_heads (int): number of cross-attention heads to use
            for alignment guiding.
        pos_ffn_activation_fn (ActivationFunction): activation function
            choice for the PositionwiseFeedForward layer.
    """
    super(TransformerDecoderLayerBase, self).__init__()

    if self_attn_type == "scaled-dot":
        self.self_attn = MultiHeadedAttention(
            heads,
            d_model,
            dropout=attention_dropout,
            max_relative_positions=max_relative_positions,
        )
    elif self_attn_type == "average":
        self.self_attn = AverageAttention(d_model,
                                          dropout=attention_dropout,
                                          aan_useffn=aan_useffn)

    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                pos_ffn_activation_fn)
    self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
    self.drop = nn.Dropout(dropout)
    self.full_context_alignment = full_context_alignment
    self.alignment_heads = alignment_heads

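# Hypothetical instantiation of the layer documented above, using common
# Transformer-base hyperparameters; the values are illustrative and not
# taken from any configuration in the repository.
layer = TransformerDecoderLayerBase(
    d_model=512, heads=8, d_ff=2048,
    dropout=0.1, attention_dropout=0.1,
    self_attn_type="scaled-dot",
    max_relative_positions=0,
    aan_useffn=False,
    full_context_alignment=False,
    alignment_heads=0,
)
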
def __init__(self, model_mode, model_mode2, model_ffn_mode, d_model, heads,
             d_ff, dropout):
    super(ATransformerEncoderLayer, self).__init__()

    self.model_mode = model_mode
    self.model_mode2 = model_mode2
    self.model_ffn_mode = model_ffn_mode

    if self.model_mode2 in ['default']:
        # attention
        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.knowledge_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.context_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        # feed forward
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        # layer normalization
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
        # debug
        print('init encoder to default: model_mode={}, model_mode2={}, '
              'model_ffn_mode={}'.format(self.model_mode, self.model_mode2,
                                         self.model_ffn_mode))
    elif self.model_mode2 in ['ffn']:
        # attention
        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.knowledge_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.context_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        # feed forward
        if self.model_mode in ['top_act']:
            d_act = 1
        elif self.model_mode in ['all_acts']:
            d_act = 4
        else:
            print('choose valid option -model_mode')
            exit()
        if self.model_ffn_mode in ['additional']:
            self.feed_forward = PositionwiseFeedForward(
                d_model + d_act, d_ff, dropout)
            self.feed_forward2 = nn.Linear(d_model + d_act, d_model)
        elif self.model_ffn_mode in ['resnet_nLN', 'resnet_LN']:
            self.feed_forward = PositionwiseFeedForward2(
                d_model + d_act, d_model, d_ff, dropout,
                self.model_ffn_mode)
        else:
            print('choose valid option -model_ffn_mode')
            exit()
        # layer normalization
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
        # debug
        print('init encoder to ffn: model_mode={}, model_mode2={}, '
              'model_ffn_mode={}'.format(self.model_mode, self.model_mode2,
                                         self.model_ffn_mode))
    elif self.model_mode2 in ['utt_emb']:
        # TODO: branch test
        if self.model_mode in ['top_act']:
            d_act = 1
        elif self.model_mode in ['all_acts']:
            d_act = 4
        else:
            print('choose valid option -model_mode')
            exit()
        # align dimension
        self.align_feed_forward = nn.Linear(d_model + d_act, d_model)
        # attention
        self.self_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.knowledge_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.context_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        # feed forward
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        # layer normalization
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
        # debug
        print('init encoder to utt_emb: model_mode={}, model_mode2={}, '
              'model_ffn_mode={}'.format(self.model_mode, self.model_mode2,
                                         self.model_ffn_mode))
    else:
        print('choose valid option -model_mode2')
        exit()

    self.dropout = nn.Dropout(dropout)

def __init__(self, model_mode, model_mode2, model_ffn_mode, d_model, heads,
             d_ff, dropout, self_attn_type="scaled-dot"):
    super(TransformerDecoderLayer, self).__init__()

    self.model_mode = model_mode
    self.model_mode2 = model_mode2
    self.model_ffn_mode = model_ffn_mode
    self.self_attn_type = self_attn_type

    if self.model_mode2 in ['default']:
        # self attention
        if self_attn_type == "scaled-dot":
            self.self_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
        elif self_attn_type == "average":
            self.self_attn = onmt.modules.AverageAttention(
                d_model, dropout=dropout)
        # other attention
        self.knowledge_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.history_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        # feed forward
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        # layer normalization
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
        # debug
        print('init decoder to default: model_mode={}, model_mode2={}, '
              'model_ffn_mode={}'.format(self.model_mode, self.model_mode2,
                                         self.model_ffn_mode))
    elif self.model_mode2 in ['ffn']:
        # self attention
        if self_attn_type == "scaled-dot":
            self.self_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
        elif self_attn_type == "average":
            self.self_attn = onmt.modules.AverageAttention(
                d_model, dropout=dropout)
        # other attention
        self.knowledge_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.history_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        # feed forward
        if self.model_mode in ['top_act']:
            d_act = 1
        elif self.model_mode in ['all_acts']:
            d_act = 4
        else:
            print('choose valid option -model_mode')
            exit()
        if self.model_ffn_mode in ['additional']:
            self.feed_forward = PositionwiseFeedForward(
                d_model + d_act, d_ff, dropout)
            self.feed_forward2 = nn.Linear(d_model + d_act, d_model)
        elif self.model_ffn_mode in ['resnet_nLN', 'resnet_LN']:
            self.feed_forward = PositionwiseFeedForward2(
                d_model + d_act, d_model, d_ff, dropout,
                self.model_ffn_mode)
        else:
            print('choose valid option -model_ffn_mode')
            exit()
        # layer normalization
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
        # debug
        print('init decoder to ffn: model_mode={}, model_mode2={}, '
              'model_ffn_mode={}'.format(self.model_mode, self.model_mode2,
                                         self.model_ffn_mode))
    elif self.model_mode2 in ['utt_emb']:
        # TODO: branch test
        if self.model_mode in ['top_act']:
            d_act = 1
        elif self.model_mode in ['all_acts']:
            d_act = 4
        else:
            print('choose valid option -model_mode')
            exit()
        # align dimension
        self.align_feed_forward = nn.Linear(d_model + d_act, d_model)
        # self attention
        if self_attn_type == "scaled-dot":
            self.self_attn = onmt.modules.MultiHeadedAttention(
                heads, d_model, dropout=dropout)
        elif self_attn_type == "average":
            self.self_attn = onmt.modules.AverageAttention(
                d_model, dropout=dropout)
        # other attention
        self.knowledge_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.history_attn = onmt.modules.MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        # feed forward
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        # layer normalization
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_3 = nn.LayerNorm(d_model, eps=1e-6)
        # debug
        print('init decoder to utt_emb: model_mode={}, model_mode2={}, '
              'model_ffn_mode={}'.format(self.model_mode, self.model_mode2,
                                         self.model_ffn_mode))
    else:
        print('choose valid option -model_mode2')
        exit()

    self.dropout = dropout
    self.drop = nn.Dropout(dropout)
    mask = self._get_attn_subsequent_mask(MAX_SIZE)
    # Register self.mask as a buffer in TransformerDecoderLayer, so
    # it gets TransformerDecoderLayer's cuda behavior automatically.
    self.register_buffer('mask', mask)