def __init__(self, opt, death_rate=0.0, **kwargs):
    """Build a relative-position Transformer encoder layer.

    Args:
        opt: parsed option namespace; the fields read below (model_size,
            dropout, n_heads, inner_size, ...) select layer size, dropout
            rates and which multilingual weight scheme is used.
        death_rate (float): stochastic-depth drop probability stored for
            later use by the forward pass.
        **kwargs: ignored; accepted for constructor-signature compatibility.
    """
    super(RelativeTransformerEncoderLayer, self).__init__()
    # Cache the option flags that steer module construction below.
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention
    # NOTE(review): this boolean flag is overwritten further down with the
    # actual conv module (or None) — the flag and the module share a name.
    self.depthwise_conv = opt.depthwise_conv
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.no_ffn = opt.no_ffn
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size

    # Factorized (mfw) and partitioned (mpw) multilingual weights are
    # mutually exclusive schemes.
    if self.mfw:
        assert not self.mpw, "[ERROR] factorized and partitioned weights cannot be used at the same time."

    # Pre/post processing around self-attention: 'n' = layer norm before,
    # 'da' = dropout + residual add after.
    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                             multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.no_ffn:
        self.preprocess_ffn = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='n', multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_ffn = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='da', variational=self.variational)

    # NOTE(review): d_head is computed but never used in this constructor.
    d_head = opt.model_size // opt.n_heads

    # Pick the FFN / self-attention implementations matching the
    # multilingual weight scheme (factorized / partitioned / plain).
    if self.mfw:
        if not self.no_ffn:
            self.feedforward = MFWPositionWiseFeedForward(
                opt.model_size, opt.inner_size, opt.dropout,
                variational=self.variational, n_languages=opt.n_languages,
                rank=opt.mfw_rank, use_multiplicative=opt.mfw_multiplicative,
                weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
        self.multihead = MFWRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative,
            weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
    elif self.mpw:
        if not self.no_ffn:
            self.feedforward = MPPositionWiseFeedForward(
                opt.model_size, opt.inner_size, opt.dropout,
                variational=self.variational, factor_size=opt.mpw_factor_size)
        self.multihead = MPRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            factor_size=opt.mpw_factor_size)
    else:
        if not self.no_ffn:
            self.feedforward = PositionWiseFeedForward(
                opt.model_size, opt.inner_size, opt.dropout,
                variational=self.variational)
        self.multihead = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)

    # Optional Conformer-style depthwise convolution sub-block.
    if self.depthwise_conv:
        self.preprocess_conv = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='n', multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_conv = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='da', variational=self.variational)
        # The boolean flag is replaced by the module itself here.
        self.depthwise_conv = ConformerConvBlock(opt.model_size, opt.conv_kernel, bias=True)
    else:
        self.depthwise_conv = None

    # Optional per-language bottleneck adapters (imported lazily to avoid
    # a hard dependency when the feature is disabled).
    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)
def __init__(self, opt, death_rate=0.0, lid_net=None):
    """Build a relative-position Transformer decoder layer.

    Args:
        opt: parsed option namespace; fields read below choose layer size,
            dropout rates and the multilingual weight scheme.
        death_rate (float): stochastic-depth drop probability stored for
            later use by the forward pass.
        lid_net: language-ID network, kept only when
            ``opt.lfv_multilingual`` is set; otherwise ignored.
    """
    super(RelativeTransformerDecoderLayer, self).__init__()
    # Cache option flags that steer module construction below.
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size

    # Pre/post processing around decoder self-attention: 'n' = layer norm
    # before, 'da' = dropout + residual add after.
    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                             multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    # Encoder-decoder (source) attention is skipped entirely for
    # decoder-only / language-model style training.
    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='n', multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_src_attn = PrePostProcessing(
            opt.model_size, opt.dropout, sequence='da', variational=self.variational)

        # Choose the enc-dec attention implementation matching the
        # multilingual weight scheme (factorized / partitioned / plain).
        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.multihead_src = MPEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                factor_size=opt.mpw_factor_size)
        else:
            self.multihead_src = EncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                            multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    # NOTE(review): d_head is computed but never used in this constructor.
    d_head = opt.model_size // opt.n_heads

    # FFN and decoder self-attention, again selected by weight scheme.
    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational, n_languages=opt.n_languages,
            rank=opt.mfw_rank, use_multiplicative=opt.mfw_multiplicative,
            weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative,
            weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
    elif self.mpw:
        self.feedforward = MPPositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational, factor_size=opt.mpw_factor_size)
        self.multihead_tgt = MPRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            factor_size=opt.mpw_factor_size)
    else:
        self.multihead_tgt = RelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout)
        self.feedforward = PositionWiseFeedForward(
            opt.model_size, opt.inner_size, opt.dropout,
            variational=self.variational)

    # Optional language-feature-vector conditioning: keep the provided
    # LID network and a projection from its bottleneck to model size.
    self.lfv_multilingual = opt.lfv_multilingual
    if opt.lfv_multilingual:
        self.lid_net = lid_net
        self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
    else:
        self.lid_net = None
        self.lfv_mapper = None

    # Optional per-language bottleneck adapters (imported lazily to avoid
    # a hard dependency when the feature is disabled).
    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)
def __init__(self, opt, death_rate=0.0, lid_net=None):
    """Build a relative-position Transformer decoder layer.

    Supports macaron-style dual FFN, ReZero residual scaling, learnable
    relative position encodings and the multilingual weight schemes
    (factorized / partitioned), all selected through ``opt``.

    Args:
        opt: parsed option namespace; fields read below choose layer size,
            dropout rates, macaron/rezero variants and the multilingual
            weight scheme.
        death_rate (float): stochastic-depth drop probability stored for
            later use by the forward pass.
        lid_net: accepted for signature compatibility; unused here (the
            lfv_multilingual branch was disabled).
    """
    super(RelativeTransformerDecoderLayer, self).__init__()
    # Cache option flags that steer module construction below.
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size
    self.macaron = opt.macaron
    # Macaron layers halve each FFN's contribution (two FFNs per layer).
    self.ffn_scale = 0.5 if self.macaron else 1
    self.rezero = opt.rezero
    self.learnable_pos = opt.learnable_position_encoding
    # A negative option value means "fall back to the global dropout".
    self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
    self.ffn_dropout = opt.ffn_dropout if opt.ffn_dropout >= 0 else opt.dropout

    # Pre/post processing around decoder self-attention; 'dz' is the
    # ReZero variant of the 'da' (dropout + residual add) sequence.
    self.preprocess_attn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                         multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(
        opt.model_size, self.residual_dropout,
        sequence='dz' if self.rezero else 'da', variational=self.variational)

    # Macaron variant: an extra feed-forward sub-block before attention.
    if self.macaron:
        self.preprocess_mcr_ffn = preprocessing(
            self.rezero, opt.model_size, 0.0,
            sequence='n', multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_mcr_ffn = PrePostProcessing(
            opt.model_size, self.residual_dropout,
            sequence='dz' if self.rezero else 'da', variational=self.variational)

        if self.mfw:
            # BUG FIX: the original read ``self.ffn_dropoutt`` (typo, never
            # assigned), raising AttributeError whenever macaron and
            # factorized weights were enabled together.
            self.mcr_feedforward = MFWPositionWiseFeedForward(
                opt.model_size, opt.inner_size, self.ffn_dropout,
                variational=self.variational, n_languages=opt.n_languages,
                rank=opt.mfw_rank, use_multiplicative=opt.mfw_multiplicative,
                activation=opt.ffn_activation, glu=opt.ffn_glu)
        else:
            self.mcr_feedforward = PositionWiseFeedForward(
                opt.model_size, opt.inner_size, self.ffn_dropout,
                variational=self.variational,
                activation=opt.ffn_activation, glu=opt.ffn_glu)

    # Encoder-decoder (source) attention is skipped entirely for
    # decoder-only / language-model style training.
    if not self.ignore_source:
        self.preprocess_src_attn = preprocessing(
            self.rezero, opt.model_size, 0.0,
            sequence='n', multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_src_attn = PrePostProcessing(
            opt.model_size, self.residual_dropout,
            sequence='dz' if self.rezero else 'da', variational=self.variational)

        # Enc-dec attention implementation selected by weight scheme
        # (factorized / partitioned / plain).
        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                n_languages=opt.n_languages, rank=opt.mfw_rank,
                use_multiplicative=opt.mfw_multiplicative,
                weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.multihead_src = MPEncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout,
                factor_size=opt.mpw_factor_size)
        else:
            self.multihead_src = EncdecMultiheadAttn(
                opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                        multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_ffn = PrePostProcessing(
        opt.model_size, self.residual_dropout,
        sequence='dz' if self.rezero else 'da', variational=self.variational)

    # NOTE(review): d_head is computed but never used in this constructor;
    # kept to avoid touching anything beyond the named fix.
    d_head = opt.model_size // opt.n_heads

    # Main FFN and decoder self-attention, selected by weight scheme.
    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(
            opt.model_size, opt.inner_size, self.ffn_dropout,
            variational=self.variational, n_languages=opt.n_languages,
            rank=opt.mfw_rank, use_multiplicative=opt.mfw_multiplicative,
            weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation,
            activation=opt.ffn_activation, glu=opt.ffn_glu)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            n_languages=opt.n_languages, rank=opt.mfw_rank,
            use_multiplicative=opt.mfw_multiplicative,
            weight_drop=self.weight_drop, mfw_activation=opt.mfw_activation)
    elif self.mpw:
        self.feedforward = MPPositionWiseFeedForward(
            opt.model_size, opt.inner_size, self.ffn_dropout,
            variational=self.variational, factor_size=opt.mpw_factor_size)
        self.multihead_tgt = MPRelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            factor_size=opt.mpw_factor_size)
    else:
        # Only the plain variant supports learnable position encodings.
        self.multihead_tgt = RelativeSelfMultiheadAttn(
            opt.model_size, opt.n_heads, opt.attn_dropout,
            learnable_pos=self.learnable_pos, max_pos=opt.max_pos_length)
        self.feedforward = PositionWiseFeedForward(
            opt.model_size, opt.inner_size, self.ffn_dropout,
            variational=self.variational,
            activation=opt.ffn_activation, glu=opt.ffn_glu)

    # Optional per-language bottleneck adapters (imported lazily to avoid
    # a hard dependency when the feature is disabled).
    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)