def __init__(self, opt, embedding, language_embeddings=None, **kwargs):
    super(SpeechLSTMDecoder, self).__init__()

    # Keep for reference
    # Define layers
    self.model_size = opt.model_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.variational_dropout = opt.variational_dropout
    self.encoder_type = opt.encoder_type

    self.lstm = nn.LSTM(self.model_size, self.model_size, self.layers, dropout=self.dropout, batch_first=True)

    self.fast_xattention = opt.fast_xattention
    self.n_head = 1  # fixed
    # also fix attention dropout to 0.0
    if opt.fast_xattention:
        self.multihead_tgt = EncdecMultiheadAttn(self.n_head, opt.model_size, 0.0)
    else:
        self.multihead_tgt = MultiHeadAttention(self.n_head, opt.model_size, attn_p=0.0, share=3)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                              variational=self.variational_dropout)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
    self.preprocess_attn = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = embedding
    self.encoder_cnn_downsampling = opt.cnn_downsampling
    self.language_embeddings = language_embeddings
    self.use_language_embedding = opt.use_language_embedding
    self.language_embedding_type = opt.language_embedding_type

    if self.language_embedding_type == 'concat':
        self.projector = nn.Linear(opt.model_size * 2, opt.model_size)

    print("* Create LSTM Decoder with %d layers." % self.layers)
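# The constructor above routes `variational=self.variational_dropout` into PrePostProcessing.
# A minimal, illustrative sketch of variational ("locked") dropout as it is commonly implemented:
# one Bernoulli mask is sampled per sequence and reused across all time steps, instead of being
# resampled per position. The class below is a hypothetical stand-in, not the repo's
# PrePostProcessing implementation.
import torch
import torch.nn as nn


class LockedDropoutSketch(nn.Module):
    def __init__(self, p=0.1):
        super().__init__()
        self.p = p

    def forward(self, x):
        # x: (batch, time, hidden); the same mask is broadcast over the time dimension
        if not self.training or self.p == 0.0:
            return x
        mask = x.new_empty(x.size(0), 1, x.size(2)).bernoulli_(1 - self.p) / (1 - self.p)
        return x * mask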
def __init__(self, opt, embedding, language_embeddings=None, **kwargs):
    super(SpeechLSTMDecoder, self).__init__()

    # Keep for reference
    # Define layers
    self.model_size = opt.model_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.variational_dropout = opt.variational_dropout
    self.multilingual_factorized_weights = opt.multilingual_factorized_weights
    self.mfw_rank = opt.mfw_rank
    self.encoder_type = opt.encoder_type
    self.n_languages = opt.n_languages

    self.lstm = nn.LSTM(self.model_size, self.model_size, self.layers, dropout=self.dropout, batch_first=True)

    if self.multilingual_factorized_weights:
        from onmt.modules.weight_control_lstm import WeightFactoredLSTM
        self.lstm = WeightFactoredLSTM(self.lstm, dropout=opt.weight_drop,
                                       n_languages=opt.n_languages, rank=self.mfw_rank)

    self.fast_xattention = opt.fast_xattention
    self.n_head = 1  # fixed to always use 1 head
    # also fix attention dropout to 0.0
    if self.multilingual_factorized_weights:
        self.fast_xattention = True
        from onmt.modules.multilingual_factorized.encdec_attention import MFWEncdecMultiheadAttn
        self.multihead_tgt = MFWEncdecMultiheadAttn(self.n_head, opt.model_size, 0.0,
                                                    n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                    weight_drop=0.0)
    else:
        if opt.fast_xattention:
            self.multihead_tgt = EncdecMultiheadAttn(self.n_head, opt.model_size, 0.0)
        else:
            self.multihead_tgt = MultiHeadAttention(self.n_head, opt.model_size, attn_p=0.0, share=3)

    self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
                                              variational=self.variational_dropout)
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
    self.preprocess_attn = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = embedding
    self.encoder_cnn_downsampling = opt.cnn_downsampling
    self.language_embeddings = language_embeddings
    self.use_language_embedding = opt.use_language_embedding
    self.language_embedding_type = opt.language_embedding_type

    if self.language_embedding_type == 'concat':
        self.projector = nn.Linear(opt.model_size * 2, opt.model_size)

    print("* Create LSTM Decoder with %d layers." % self.layers)
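# A minimal sketch of the per-language low-rank weight factorization suggested by the
# `n_languages` / `rank` / `use_multiplicative` arguments of the MFW* modules and
# WeightFactoredLSTM above: each language l gets a cheap correction of a shared weight,
# W_l = W + U[l] @ V[l]. Names below are illustrative, not the actual
# onmt.modules.multilingual_factorized API.
import torch
import torch.nn as nn
import torch.nn.functional as F


class FactorizedLinearSketch(nn.Module):
    def __init__(self, in_features, out_features, n_languages, rank=1):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.02)  # shared weight
        self.u = nn.Parameter(torch.zeros(n_languages, out_features, rank))        # per-language factors
        self.v = nn.Parameter(torch.zeros(n_languages, rank, in_features))
        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self, x, lang):
        # lang: integer index of the current language
        w = self.weight + self.u[lang] @ self.v[lang]  # rank-`rank` per-language correction
        return F.linear(x, w, self.bias)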
def __init__(self, opt):
    super().__init__()
    self.layer_norm = nn.LayerNorm((opt.model_size,), elementwise_affine=True)
    self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
    self.attn = EncdecMultiheadAttn(opt.n_heads, opt.model_size, attn_drop=opt.attn_dropout)
    self.dropout = opt.attn_dropout
    self.variational = opt.variational_dropout
def __init__(self, opt):
    super(TacotronDecoder, self).__init__()
    self.n_mel_channels = opt.n_mel_channels
    self.n_frames_per_step = opt.n_frames_per_step
    self.encoder_embedding_dim = opt.model_size
    self.attention_rnn_dim = opt.model_size
    self.decoder_rnn_dim = opt.model_size
    self.prenet_dim = opt.prenet_dim
    self.max_decoder_steps = opt.max_decoder_steps
    self.gate_threshold = 0.5
    self.p_attention_dropout = opt.attn_dropout
    self.p_decoder_dropout = opt.dropout
    self.encoder_type = opt.encoder_type

    self.lstm = nn.LSTM(opt.prenet_dim, opt.model_size, 2, dropout=opt.dropout, batch_first=True)

    self.linear_trans = nn.Linear(opt.n_mel_channels * opt.n_frames_per_step, opt.model_size)
    torch.nn.init.xavier_uniform_(self.linear_trans.weight)

    if opt.fast_xattention:
        self.multihead_tgt = EncdecMultiheadAttn(1, opt.model_size, opt.attn_dropout)
    else:
        self.multihead_tgt = MultiHeadAttention(1, opt.model_size, attn_p=opt.attn_dropout, share=3)

    self.preprocess_layer = PrePostProcessing(opt.model_size, 0, sequence='n')

    self.prenet = Prenet(opt.n_mel_channels * opt.n_frames_per_step,
                         [opt.prenet_dim, opt.prenet_dim])

    self.attention_rnn = nn.LSTMCell(opt.prenet_dim + opt.model_size, opt.model_size)

    self.attention_layer = Attention(opt.model_size, opt.model_size, opt.attention_dim,
                                     opt.attention_location_n_filters,
                                     opt.attention_location_kernel_size)

    self.postprocess_layer = PrePostProcessing(opt.model_size, 0, sequence='n')

    self.decoder_rnn = nn.LSTMCell(opt.model_size + opt.model_size, opt.model_size, 1)

    self.linear_projection = LinearNorm(opt.model_size, opt.n_mel_channels * opt.n_frames_per_step)

    self.gate_layer = LinearNorm(opt.model_size, 1, bias=True, w_init_gain='sigmoid')
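# The gate_layer / gate_threshold / max_decoder_steps fields above support the usual Tacotron 2
# stopping criterion: a sigmoid "stop token" is predicted for every frame and decoding ends once
# it crosses the threshold or the step budget runs out. A hedged sketch of that loop shape, for a
# single utterance; `decode_one_step` is a placeholder, not a method of this class.
import torch


def tacotron_decode_sketch(decode_one_step, first_input, gate_threshold=0.5, max_decoder_steps=1000):
    mel_frames, decoder_input = [], first_input
    for _ in range(max_decoder_steps):
        mel_out, gate_logit, decoder_input = decode_one_step(decoder_input)
        mel_frames.append(mel_out)
        if torch.sigmoid(gate_logit).item() > gate_threshold:
            break  # stop token fired
    return mel_frames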
def __init__(self, opt, death_rate=0.0):
    super(DecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1

    if self.macaron:
        self.preprocess_mcr_ffn = preprocessing(opt.rezero, opt.model_size, opt.dropout, sequence='n')
        self.postprocess_mcr_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                     variational=self.variational)
        self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                       variational=self.variational,
                                                       activation=opt.ffn_activation, glu=opt.ffn_glu)

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if opt.fast_self_attention:
        self.multihead_tgt = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
    else:
        self.multihead_tgt = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=1)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                      variational=self.variational)

        if not opt.fast_xattention:
            self.multihead_src = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=2)
        else:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    if not opt.fast_feed_forward:
        feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout, variational=self.variational)
        self.feedforward = Bottle(feedforward)
    else:
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational,
                                                   activation=opt.ffn_activation, glu=opt.ffn_glu)
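# The `macaron` flag above adds a second feed-forward block (mcr_feedforward) and sets
# ffn_scale = 0.5. A hedged sketch of the usual macaron-style residual arithmetic
# (as in Macaron Net / Conformer): two feed-forward sublayers sandwich the attention, each
# contributing half of a normal FFN residual. `mcr_ffn`, `self_attn` and `ffn` stand in for
# the modules built in the constructor; pre/post-processing is omitted for brevity.
def macaron_layer_sketch(x, mcr_ffn, self_attn, ffn, ffn_scale=0.5):
    x = x + ffn_scale * mcr_ffn(x)   # first (macaron) half-FFN residual
    x = x + self_attn(x)             # self-attention residual
    x = x + ffn_scale * ffn(x)       # second half-FFN residual
    return x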
def __init__(self, opt, embedding, language_embeddings=None, ignore_source=False, allocate_positions=True):
    super(SpeechLSTMDecoder, self).__init__()

    # Keep for reference
    # Define layers
    self.model_size = opt.model_size
    self.layers = opt.layers
    self.dropout = opt.dropout
    self.word_dropout = opt.word_dropout
    self.attn_dropout = opt.attn_dropout
    self.emb_dropout = opt.emb_dropout
    self.variational_dropout = opt.variational_dropout
    self.encoder_type = opt.encoder_type

    self.lstm = nn.LSTM(self.model_size, self.model_size, self.layers, dropout=self.dropout, batch_first=True)

    self.fast_self_attention = opt.fast_self_attention

    if opt.fast_xattention:
        self.multihead_tgt = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)
    else:
        self.multihead_tgt = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=3)

    # self.preprocess_layer = PrePostProcessing(self.model_size, self.emb_dropout, sequence='d',
    #                                           variational=self.variational_dropout)
    self.preprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')
    self.postprocess_layer = PrePostProcessing(self.model_size, 0, sequence='n')

    self.word_lut = embedding
    self.encoder_cnn_downsampling = opt.cnn_downsampling
    self.language_embeddings = language_embeddings
    self.use_language_embedding = opt.use_language_embedding
    self.gumbel_embedding = opt.gumbel_embedding
    self.bottleneck = opt.bottleneck
    self.language_embedding_type = opt.language_embedding_type

    if self.language_embedding_type == 'concat':
        self.projector = nn.Linear(opt.model_size * 2, opt.model_size)
def __init__(self, opt, death_rate=0.0):
    super().__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention
    self.factor_size = opt.layers
    self.adaptive_type = opt.adaptive

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                      variational=self.variational)

        if self.adaptive_type == 'universal':
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)
        else:
            self.multihead_src = AdaptiveEncDecAttn(opt.n_heads, opt.model_size, self.factor_size, opt.attn_dropout)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    if self.adaptive_type == 'universal':
        self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational)
    else:
        self.multihead_tgt = AdaptiveRelativeAttn(opt.model_size, opt.n_heads, self.factor_size, opt.attn_dropout)
        self.feedforward = AdaptiveFeedForward(opt.model_size, opt.inner_size, self.factor_size, opt.dropout,
                                               variational=self.variational)
def __init__(self, opt, death_rate=0.0):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.fast_self_attention = opt.fast_self_attention
    # self.lfv_multilingual = opt.lfv_multilingual

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                      variational=self.variational)

        if opt.fast_xattention:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)
        else:
            self.multihead_src = MultiHeadAttention(opt.n_heads, opt.model_size, attn_p=opt.attn_dropout, share=2)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if not self.fast_self_attention:
        self.multihead_tgt = RelPartialLearnableMultiHeadAttn(opt.n_heads, opt.model_size, d_head,
                                                              dropatt=opt.attn_dropout)
    else:
        self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)

    if not opt.fast_feed_forward:
        feedforward = FeedForward(opt.model_size, opt.inner_size, opt.dropout, variational=self.variational)
        self.feedforward = Bottle(feedforward)
    else:
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational)
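# Every decoder layer in this file takes a `death_rate`. A hedged sketch of stochastic depth
# (layer drop) as it is typically applied with such a parameter: during training the whole
# residual branch is skipped with probability `death_rate`, and when it survives its output is
# rescaled so the expectation matches inference, where the branch always runs unscaled. This is
# an illustration of the technique, not the layer's actual forward pass.
import torch


def residual_with_layer_drop(x, sublayer, death_rate, training):
    if training:
        if torch.rand(1).item() < death_rate:
            return x                                  # branch "dies": identity only
        return x + sublayer(x) / (1.0 - death_rate)   # rescale the surviving branch
    return x + sublayer(x)                            # inference: always run, no scaling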
def __init__(self, opt, death_rate=0.0, lid_net=None):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                             multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                                     multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                      variational=self.variational)

        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                        n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                        use_multiplicative=opt.mfw_multiplicative,
                                                        weight_drop=self.weight_drop,
                                                        mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.multihead_src = MPEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                       factor_size=opt.mpw_factor_size)
        else:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n',
                                            multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                      variational=self.variational,
                                                      n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                      use_multiplicative=opt.mfw_multiplicative,
                                                      weight_drop=self.weight_drop,
                                                      mfw_activation=opt.mfw_activation)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                          n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                          use_multiplicative=opt.mfw_multiplicative,
                                                          weight_drop=self.weight_drop,
                                                          mfw_activation=opt.mfw_activation)
    elif self.mpw:
        self.feedforward = MPPositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                     variational=self.variational,
                                                     factor_size=opt.mpw_factor_size)
        self.multihead_tgt = MPRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                         factor_size=opt.mpw_factor_size)
    else:
        self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational)

    self.lfv_multilingual = opt.lfv_multilingual

    if opt.lfv_multilingual:
        self.lid_net = lid_net
        self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
    else:
        self.lid_net = None
        self.lfv_mapper = None

    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)
def __init__(self, opt, death_rate=0.0):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.batch_ensemble = opt.batch_ensemble
    self.mfw = opt.multilingual_factorized_weights
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1
    self.dropout = opt.dropout

    if self.macaron:
        self.preprocess_mcr_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_mcr_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                     variational=self.variational)

        if self.mfw:
            self.mcr_feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                              variational=self.variational,
                                                              n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                              use_multiplicative=opt.mfw_multiplicative)
        else:
            self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                           variational=self.variational)

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                      variational=self.variational)

        # if self.batch_ensemble > 0:
        #     self.multihead_src = BEEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
        #                                                ensemble=self.batch_ensemble)
        # else:
        if not self.mfw:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)
        else:
            self.multihead_src = MFWEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                        n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                        use_multiplicative=opt.mfw_multiplicative)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                      variational=self.variational,
                                                      n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                      use_multiplicative=opt.mfw_multiplicative)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                          n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                          use_multiplicative=opt.mfw_multiplicative)
    else:
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational)
        self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
def __init__(self, opt, death_rate=0.0):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.batch_ensemble = opt.batch_ensemble
    self.mfw = opt.multilingual_factorized_weights
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1
    self.dropout = opt.dropout
    self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
    self.ffn_dropout = opt.ffn_dropout if opt.ffn_dropout >= 0 else opt.dropout
    self.rezero = opt.rezero
    self.n_heads = opt.n_heads
    self.absolute_position_encoding = opt.absolute_position_encoding
    self.learnable_pos = opt.learnable_position_encoding
    self.stochastic_sublayer = opt.stochastic_sublayer
    self.post_norm = opt.post_norm

    if self.macaron:
        self.preprocess_mcr_ffn = preprocessing(opt.rezero, opt.model_size, self.post_norm)
        self.postprocess_mcr_ffn = postprocessing(opt.rezero, opt.model_size, self.residual_dropout,
                                                  self.variational, self.post_norm)

        if self.mfw:
            self.mcr_feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                              variational=self.variational,
                                                              n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                              use_multiplicative=opt.mfw_multiplicative,
                                                              no_bias=opt.mfw_no_bias,
                                                              activation=opt.ffn_activation, glu=opt.ffn_glu)
        else:
            self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                           variational=self.variational,
                                                           activation=opt.ffn_activation, glu=opt.ffn_glu)

    self.preprocess_attn = preprocessing(opt.rezero, opt.model_size, self.post_norm)
    self.postprocess_attn = postprocessing(opt.rezero, opt.model_size, self.residual_dropout,
                                           self.variational, self.post_norm)

    if not self.ignore_source:
        self.preprocess_src_attn = preprocessing(opt.rezero, opt.model_size, self.post_norm)
        self.postprocess_src_attn = postprocessing(opt.rezero, opt.model_size, self.residual_dropout,
                                                   self.variational, self.post_norm)

        if not self.mfw:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)
        else:
            self.multihead_src = MFWEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                        n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                        use_multiplicative=opt.mfw_multiplicative,
                                                        no_bias=opt.mfw_no_bias)

    self.preprocess_ffn = preprocessing(opt.rezero, opt.model_size, self.post_norm)
    self.postprocess_ffn = postprocessing(opt.rezero, opt.model_size, self.residual_dropout,
                                          self.variational, self.post_norm)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                      variational=self.variational,
                                                      n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                      use_multiplicative=opt.mfw_multiplicative,
                                                      no_bias=opt.mfw_no_bias,
                                                      activation=opt.ffn_activation, glu=opt.ffn_glu)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                          n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                          use_multiplicative=opt.mfw_multiplicative,
                                                          no_bias=opt.mfw_no_bias)
    else:
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                   variational=self.variational,
                                                   activation=opt.ffn_activation, glu=opt.ffn_glu)

        if not self.absolute_position_encoding:
            self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                           learnable_pos=self.learnable_pos,
                                                           max_pos=opt.max_pos_length)
        else:
            self.multihead_tgt = SelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
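# The `opt.rezero` switch above selects different pre/post-processing helpers (and, in the
# constructor near the end of this file, a 'dz' residual sequence). A hedged sketch of the
# ReZero residual that such a switch typically enables: each residual branch is gated by a
# learnable scalar alpha initialised to zero, x + alpha * F(x), so the layer starts as an
# identity. Illustrative only, not the repo's preprocessing/postprocessing helpers.
import torch
import torch.nn as nn


class ReZeroResidualSketch(nn.Module):
    def __init__(self):
        super().__init__()
        self.alpha = nn.Parameter(torch.zeros(1))  # starts at 0: the branch contributes nothing at init

    def forward(self, x, sublayer_out):
        return x + self.alpha * sublayer_out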
def __init__(self, opt, death_rate=0.0, lid_net=None):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1

    self.preprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                              variational=self.variational)

    if not self.ignore_source:
        self.preprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                                      variational=self.variational)

        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                        n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                        use_multiplicative=opt.mfw_multiplicative)
        else:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='n')
    self.postprocess_ffn = PrePostProcessing(opt.model_size, opt.dropout, sequence='da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if not self.mfw:
        self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout)
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                   variational=self.variational,
                                                   activation=opt.activation)
    else:
        self.feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, opt.dropout,
                                                      variational=self.variational,
                                                      n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                      use_multiplicative=opt.mfw_multiplicative)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                          n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                          use_multiplicative=opt.mfw_multiplicative)

    self.lfv_multilingual = opt.lfv_multilingual

    if opt.lfv_multilingual:
        self.lid_net = lid_net
        self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
    else:
        self.lid_net = None
        self.lfv_mapper = None
def __init__(self, opt, death_rate=0.0, lid_net=None):
    super(RelativeTransformerDecoderLayer, self).__init__()
    self.ignore_source = opt.ignore_source
    self.variational = opt.variational_dropout
    self.death_rate = death_rate
    self.mfw = opt.multilingual_factorized_weights
    self.mpw = opt.multilingual_partitioned_weights
    self.mln = opt.multilingual_layer_norm
    self.weight_drop = opt.weight_drop
    self.multilingual_adapter = opt.multilingual_adapter
    self.adapter_bottleneck_size = opt.adapter_bottleneck_size
    self.macaron = opt.macaron
    self.ffn_scale = 0.5 if self.macaron else 1
    self.rezero = opt.rezero
    self.learnable_pos = opt.learnable_position_encoding
    self.residual_dropout = opt.residual_dropout if opt.residual_dropout >= 0 else opt.dropout
    self.ffn_dropout = opt.ffn_dropout if opt.ffn_dropout >= 0 else opt.dropout

    self.preprocess_attn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                         multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_attn = PrePostProcessing(opt.model_size, self.residual_dropout,
                                              sequence='dz' if self.rezero else 'da',
                                              variational=self.variational)

    if self.macaron:
        self.preprocess_mcr_ffn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                                multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_mcr_ffn = PrePostProcessing(opt.model_size, self.residual_dropout,
                                                     sequence='dz' if self.rezero else 'da',
                                                     variational=self.variational)

        if self.mfw:
            self.mcr_feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                              variational=self.variational,
                                                              n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                              use_multiplicative=opt.mfw_multiplicative,
                                                              activation=opt.ffn_activation, glu=opt.ffn_glu)
        else:
            self.mcr_feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                           variational=self.variational,
                                                           activation=opt.ffn_activation, glu=opt.ffn_glu)

    if not self.ignore_source:
        self.preprocess_src_attn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                                 multilingual=self.mln, n_languages=opt.n_languages)
        self.postprocess_src_attn = PrePostProcessing(opt.model_size, self.residual_dropout,
                                                      sequence='dz' if self.rezero else 'da',
                                                      variational=self.variational)

        if self.mfw:
            self.multihead_src = MFWEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                        n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                        use_multiplicative=opt.mfw_multiplicative,
                                                        weight_drop=self.weight_drop,
                                                        mfw_activation=opt.mfw_activation)
        elif self.mpw:
            self.multihead_src = MPEncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout,
                                                       factor_size=opt.mpw_factor_size)
        else:
            self.multihead_src = EncdecMultiheadAttn(opt.n_heads, opt.model_size, opt.attn_dropout)

    self.preprocess_ffn = preprocessing(self.rezero, opt.model_size, 0.0, sequence='n',
                                        multilingual=self.mln, n_languages=opt.n_languages)
    self.postprocess_ffn = PrePostProcessing(opt.model_size, self.residual_dropout,
                                             sequence='dz' if self.rezero else 'da',
                                             variational=self.variational)

    d_head = opt.model_size // opt.n_heads

    if self.mfw:
        self.feedforward = MFWPositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                      variational=self.variational,
                                                      n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                      use_multiplicative=opt.mfw_multiplicative,
                                                      weight_drop=self.weight_drop,
                                                      mfw_activation=opt.mfw_activation,
                                                      activation=opt.ffn_activation, glu=opt.ffn_glu)
        self.multihead_tgt = MFWRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                          n_languages=opt.n_languages, rank=opt.mfw_rank,
                                                          use_multiplicative=opt.mfw_multiplicative,
                                                          weight_drop=self.weight_drop,
                                                          mfw_activation=opt.mfw_activation)
    elif self.mpw:
        self.feedforward = MPPositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                     variational=self.variational,
                                                     factor_size=opt.mpw_factor_size)
        self.multihead_tgt = MPRelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                         factor_size=opt.mpw_factor_size)
    else:
        self.multihead_tgt = RelativeSelfMultiheadAttn(opt.model_size, opt.n_heads, opt.attn_dropout,
                                                       learnable_pos=self.learnable_pos,
                                                       max_pos=opt.max_pos_length)
        self.feedforward = PositionWiseFeedForward(opt.model_size, opt.inner_size, self.ffn_dropout,
                                                   variational=self.variational,
                                                   activation=opt.ffn_activation, glu=opt.ffn_glu)

    # self.lfv_multilingual = opt.lfv_multilingual
    #
    # if opt.lfv_multilingual:
    #     self.lid_net = lid_net
    #     self.lfv_mapper = nn.Linear(opt.bottleneck_size, opt.model_size)
    # else:
    #     self.lid_net = None
    #     self.lfv_mapper = None

    if self.multilingual_adapter:
        from onmt.modules.multilingual_factorized.multilingual_adapters import MultilingualAdapter
        self.adapters = MultilingualAdapter(opt.model_size, opt.adapter_bottleneck_size,
                                            n_languages=opt.n_languages, dropout=opt.dropout)
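# Both adapter-enabled constructors in this file build a
# MultilingualAdapter(model_size, adapter_bottleneck_size, n_languages=..., dropout=...).
# A hedged sketch of the standard residual bottleneck adapter such a module usually wraps,
# with one small down/up projection pair per language; the names below are illustrative,
# not the repo's multilingual_adapters implementation.
import torch
import torch.nn as nn


class BottleneckAdapterSketch(nn.Module):
    def __init__(self, model_size, bottleneck_size, n_languages, dropout=0.0):
        super().__init__()
        self.adapters = nn.ModuleList([
            nn.Sequential(
                nn.LayerNorm(model_size),
                nn.Linear(model_size, bottleneck_size),  # down-project to the bottleneck
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(bottleneck_size, model_size),  # up-project back to the model size
            )
            for _ in range(n_languages)
        ])

    def forward(self, x, lang):
        return x + self.adapters[lang](x)  # residual connection around the per-language adapter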