def __init__(self, opt, dicts, positional_encoder, language_embeddings=None, ignore_source=False):
    self.death_rate = opt.death_rate
    self.max_memory_size = opt.max_memory_size
    self.stream_context = opt.stream_context
    self.extra_context_size = opt.extra_context_size
    self.n_heads = opt.n_heads
    self.fast_self_attn = opt.fast_self_attention
    self.mpw = opt.multilingual_partitioned_weights
    self.learnable_position_encoding = opt.learnable_position_encoding
    self.max_pos_length = opt.max_pos_length

    # build_modules will be called from the inherited constructor
    super().__init__(opt, dicts, positional_encoder, language_embeddings,
                     ignore_source, allocate_positions=False)

    if self.learnable_position_encoding:
        self.positional_encoder = None
    else:
        self.positional_encoder = SinusoidalPositionalEmbedding(opt.model_size)

    self.d_head = self.model_size // self.n_heads

    # Parameters for the position biases: deprecated, kept for backward compatibility.
    # self.r_w_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_head))
    # self.r_r_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_head))

    self.mln = opt.multilingual_layer_norm

    if not opt.rezero:
        self.postprocess_layer = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                                   multilingual=self.mln, n_languages=opt.n_languages)
    else:
        self.postprocess_layer = Identity()
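# SinusoidalPositionalEmbedding is assumed to produce the fixed encoding of
# Vaswani et al. (2017); a minimal self-contained sketch for an even
# model_size (the repo's module may differ in caching and signature):
import math
import torch


def sinusoidal_positions(length, model_size):
    position = torch.arange(length, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, model_size, 2, dtype=torch.float)
                         * (-math.log(10000.0) / model_size))
    pe = torch.zeros(length, model_size)
    pe[:, 0::2] = torch.sin(position * div_term)   # even dimensions
    pe[:, 1::2] = torch.cos(position * div_term)   # odd dimensions
    return pe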
def __init__(self, opt, death_rate=0.0, **kwargs):
    super(EncoderLayer, self).__init__()
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1

    if self.macaron:
        self.preprocess_mcr_ffn = preprocessing(opt.rezero, opt.model_size, opt.dropout, sequence='n')
        self.postprocess_mcr_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                     variational=self.variational)
        self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational,
                                                       activation=opt.ffn_activation, glu=opt.ffn_glu)

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)
    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    if opt.fast_self_attention:
        self.multihead = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
    else:
        self.multihead = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=1)

    if not opt.fast_feed_forward:
        feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                  variational=self.variational)
        self.feedforward = Bottle(feedforward)
    else:
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational,
                                                   activation=opt.ffn_activation, glu=opt.ffn_glu)
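# ffn_scale = 0.5 signals the macaron ordering of Lu et al. (2019): the layer
# is bracketed by two half-weighted feed-forward sub-layers around attention.
# A self-contained sketch of that ordering (illustrative modules, not the
# repo's PrePostProcessing/PositionWiseFeedForward):
import torch
import torch.nn as nn


class MacaronLayerSketch(nn.Module):
    def __init__(self, model_size, inner_size, n_heads):
        super().__init__()
        self.ffn_scale = 0.5

        def ffn():
            return nn.Sequential(nn.LayerNorm(model_size),
                                 nn.Linear(model_size, inner_size), nn.ReLU(),
                                 nn.Linear(inner_size, model_size))

        self.mcr_feedforward = ffn()
        self.feedforward = ffn()
        self.attn_norm = nn.LayerNorm(model_size)
        self.attn = nn.MultiheadAttention(model_size, n_heads)

    def forward(self, x):                                  # x: [time, batch, model_size]
        x = x + self.ffn_scale * self.mcr_feedforward(x)   # first half-FFN
        a = self.attn_norm(x)
        x = x + self.attn(a, a, a, need_weights=False)[0]
        x = x + self.ffn_scale * self.feedforward(x)       # second half-FFN
        return x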
def __init__(self, opt, dicts, positional_encoder, encoder_type='text', language_embeddings=None):
    self.death_rate = opt.death_rate
    self.learnable_position_encoding = opt.learnable_position_encoding
    self.layer_modules = list()
    self.asynchronous = opt.asynchronous
    self.max_memory_size = opt.max_memory_size
    self.extra_context_size = opt.extra_context_size
    self.experimental = opt.experimental
    self.unidirectional = opt.unidirectional
    self.reversible = opt.src_reversible
    self.n_heads = opt.n_heads
    self.fast_self_attn = opt.fast_self_attention
    self.checkpointing = opt.checkpointing
    self.mpw = opt.multilingual_partitioned_weights
    self.multilingual_linear_projection = opt.multilingual_linear_projection
    self.mln = opt.multilingual_layer_norm
    self.no_input_scale = opt.no_input_scale
    self.max_pos_length = opt.max_pos_length

    # TODO: multilingual linear transformation

    # build_modules will be called from the inherited constructor
    super().__init__(opt, dicts, positional_encoder, encoder_type, language_embeddings)

    # learnable position encoding ...
    if self.learnable_position_encoding:
        self.positional_encoder = None
    else:
        # ... or the pre-set sinusoidal encoding
        self.positional_encoder = SinusoidalPositionalEmbedding(opt.model_size)

    self.d_head = self.model_size // self.n_heads

    if self.multilingual_linear_projection:
        self.linear_proj = nn.Parameter(torch.Tensor(opt.n_languages, self.model_size, self.model_size))
        # Xavier-style initialization: std = sqrt(2 / (fan_in + fan_out))
        std_ = math.sqrt(2.0 / (self.model_size + self.model_size))
        torch.nn.init.normal_(self.linear_proj, 0.0, std_)

    if not opt.rezero:
        self.postprocess_layer = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                                   multilingual=self.mln, n_languages=opt.n_languages)
    else:
        self.postprocess_layer = Identity()
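# The per-language projection above is a [n_languages, model_size, model_size]
# tensor; presumably the encoder output is projected with the matrix of the
# current language id. The indexing and einsum below are an illustrative
# assumption, not the repo's actual application code.
import torch

n_languages, model_size = 4, 8
linear_proj = torch.randn(n_languages, model_size, model_size)

x = torch.randn(10, 3, model_size)                     # [time, batch, model_size]
lang = 2                                               # language id of this batch
x = torch.einsum('tbd,de->tbe', x, linear_proj[lang])  # -> [10, 3, 8]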
def __init__(self, opt, death_rate=0.0, lid_net=None):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                             multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                                     multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                      variational=self.variational)

        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                        n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                        use_multiplicative=opt.mfw_multiplicative,
                                                        weight_drop=self.weight_drop,
                                                        mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.multihead_src = MPEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                       factor_size=opt.mpw_factor_size)
        else:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                            multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                      variational=self.variational,
                                                      n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                      use_multiplicative=opt.mfw_multiplicative,
                                                      weight_drop=self.weight_drop,
                                                      mfw_activation=opt.mfw_activation)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                          n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                          use_multiplicative=opt.mfw_multiplicative,
                                                          weight_drop=self.weight_drop,
                                                          mfw_activation=opt.mfw_activation)
    elif self.mpw:
        self.feedforward = MPPositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                     variational=self.variational,
                                                     factor_size=opt.mpw_factor_size)
        self.multihead_tgt = MPRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                         factor_size=opt.mpw_factor_size)
    else:
        self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational)

    self.lfv_multilingual = opt.lfv_multilingual

    if opt.lfv_multilingual:
        self.lid_net = lid_net
        self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
    else:
        self.lid_net = None
        self.lfv_mapper = None

    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)
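# death_rate implements stochastic depth (Huang et al., 2016): during training
# each residual layer is dropped entirely with probability death_rate, which is
# why every layer constructor here stores it. A minimal sketch of the pattern,
# under the assumption that this repo rescales the surviving residual branch
# during training (the alternative convention rescales at inference instead):
import torch


def stochastic_residual(x, sublayer, death_rate, training):
    if training and torch.rand(1).item() < death_rate:
        return x                                   # layer skipped this step
    out = sublayer(x)
    if training:
        out = out / (1.0 - death_rate)             # compensate for dropped steps
    return x + out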
def __init__(self, opt, death_rate=0.0, **kwargs):
    super(RelativeTransformerEncoderLayer, self).__init__()
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention
    self.depthwise_conv = opt.depthwise_conv
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.no_ffn = opt.no_ffn
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size

    if self.mfw:
        assert not self.mpw, "[ERROR] factorized and partitioned weights cannot be used at the same time."

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                             multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.no_ffn:
        self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                                multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                 variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        if not self.no_ffn:
            self.feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                          variational=self.variational,
                                                          n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                          use_multiplicative=opt.mfw_multiplicative,
                                                          weight_drop=self.weight_drop,
                                                          mfw_activation=opt.mfw_activation)
        self.multihead = MFWRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                      n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                      use_multiplicative=opt.mfw_multiplicative,
                                                      weight_drop=self.weight_drop,
                                                      mfw_activation=opt.mfw_activation)
    elif self.mpw:
        if not self.no_ffn:
            self.feedforward = MPPositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                         variational=self.variational,
                                                         factor_size=opt.mpw_factor_size)
        self.multihead = MPRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                     factor_size=opt.mpw_factor_size)
    else:
        if not self.no_ffn:
            self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational)
        self.multihead = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)

    if self.depthwise_conv:
        self.preprocess_conv = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                                 multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_conv = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                  variational=self.variational)
        # note: the boolean flag is overwritten here with the actual conv module
        self.depthwise_conv = ConformerConvBlock(opt.model_size, opt.conv_kernel, bias=True)
    else:
        self.depthwise_conv = None

    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)
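# ConformerConvBlock is assumed to follow the convolution module of Gulati
# et al. (2020): pointwise conv + GLU gating, depthwise conv, normalization,
# Swish, and a final pointwise conv. A self-contained sketch under that
# assumption (odd kernel sizes preserve the sequence length):
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConformerConvSketch(nn.Module):
    def __init__(self, model_size, kernel_size, bias=True):
        super().__init__()
        self.pointwise1 = nn.Conv1d(model_size, 2 * model_size, 1, bias=bias)
        self.depthwise = nn.Conv1d(model_size, model_size, kernel_size,
                                   padding=kernel_size // 2,
                                   groups=model_size, bias=bias)
        self.norm = nn.BatchNorm1d(model_size)
        self.pointwise2 = nn.Conv1d(model_size, model_size, 1, bias=bias)

    def forward(self, x):                          # x: [batch, model_size, time]
        x = F.glu(self.pointwise1(x), dim=1)       # gate halves the channels
        x = F.silu(self.norm(self.depthwise(x)))   # Swish == SiLU
        return self.pointwise2(x)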
def preprocessing(rezero, *args, **kwargs):
    if rezero:
        return Identity()
    else:
        return PrePostProcessing(*args, **kwargs)
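# Usage sketch for the factory above: under ReZero (Bachlechner et al., 2020)
# the pre-sublayer normalization is dropped, since the learned residual gate
# takes its place; otherwise a standard pre-norm module is built. sequence='n'
# is assumed to denote layer normalization, consistent with the 'da'
# (dropout + add) and 'dz' (dropout + rezero-add) post-processing sequences
# used throughout these constructors.
model_size = 512                                             # illustrative value
norm = preprocessing(True, model_size, 0.0, sequence='n')    # -> Identity()
norm = preprocessing(False, model_size, 0.0, sequence='n')   # -> PrePostProcessing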
def __init__(self, opt, death_rate=0.0, lid_net=None):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1
    self.rezero = opt.rezero
    self.learnable_pos = opt.learnable_position_encoding
    # fall back to the global dropout rate when the specific rates are unset (< 0)
    self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
    self.ffn_dropout = opt.ffn_dropout if opt.ffn_dropout >= 0 else opt.dropout

    self.preprocess_attn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                         multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(opt.model_size, self.residual_dropout,
                                              sequence='dz' if self.rezero else 'da',
                                              variational=self.variational)

    if self.macaron:
        self.preprocess_mcr_ffn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                                multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_mcr_ffn = PrePostProcessing(opt.model_size, self.residual_dropout,
                                                     sequence='dz' if self.rezero else 'da',
                                                     variational=self.variational)

        if self.mfw:
            # fixed typo: was self.ffn_dropoutt
            self.mcr_feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size,
                                                              self.ffn_dropout,
                                                              variational=self.variational,
                                                              n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                              use_multiplicative=opt.mfw_multiplicative,
                                                              activation=opt.ffn_activation, glu=opt.ffn_glu)
        else:
            self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                           variational=self.variational,
                                                           activation=opt.ffn_activation, glu=opt.ffn_glu)

    if not self.ignore_source:
        self.preprocess_src_attn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                                 multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, self.residual_dropout,
                                                      sequence='dz' if self.rezero else 'da',
                                                      variational=self.variational)

        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                        n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                        use_multiplicative=opt.mfw_multiplicative,
                                                        weight_drop=self.weight_drop,
                                                        mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.multihead_src = MPEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                       factor_size=opt.mpw_factor_size)
        else:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                        multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_ffn = PrePostProcessing(opt.model_size, self.residual_dropout,
                                             sequence='dz' if self.rezero else 'da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                      variational=self.variational,
                                                      n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                      use_multiplicative=opt.mfw_multiplicative,
                                                      weight_drop=self.weight_drop,
                                                      mfw_activation=opt.mfw_activation,
                                                      activation=opt.ffn_activation, glu=opt.ffn_glu)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                          n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                          use_multiplicative=opt.mfw_multiplicative,
                                                          weight_drop=self.weight_drop,
                                                          mfw_activation=opt.mfw_activation)
    elif self.mpw:
        self.feedforward = MPPositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                     variational=self.variational,
                                                     factor_size=opt.mpw_factor_size)
        self.multihead_tgt = MPRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                         factor_size=opt.mpw_factor_size)
    else:
        self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                       learnable_pos=self.learnable_pos,
                                                       max_pos=opt.max_pos_length)
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                   variational=self.variational,
                                                   activation=opt.ffn_activation, glu=opt.ffn_glu)

    # self.lfv_multilingual = opt.lfv_multilingual
    #
    # if opt.lfv_multilingual:
    #     self.lid_net = lid_net
    #     self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
    # else:
    #     self.lid_net = None
    #     self.lfv_mapper = None

    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)
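# MultilingualAdapter is assumed to be a per-language bottleneck adapter in
# the style of Bapna & Firat (2019): down-project, non-linearity, up-project,
# residual, with one adapter per language. A minimal sketch under that
# assumption:
import torch
import torch.nn as nn


class MultilingualAdapterSketch(nn.Module):
    def __init__(self, model_size, bottleneck_size, n_languages, dropout=0.0):
        super().__init__()
        self.adapters = nn.ModuleList([
            nn.Sequential(nn.LayerNorm(model_size),
                          nn.Linear(model_size, bottleneck_size), nn.ReLU(),
                          nn.Linear(bottleneck_size, model_size),
                          nn.Dropout(dropout))
            for _ in range(n_languages)])

    def forward(self, x, lang):
        return x + self.adapters[lang](x)          # residual around the bottleneck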