def __init__(
    self,
    size,
    self_attn,
    mixed_attn,
    feed_forward_enc,
    feed_forward_dec,
    dropout_rate,
    normalize_before=True,
    concat_after=False,
):
    """Construct a DecoderLayer object."""
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.mixed_attn = mixed_attn
    # two feed-forward modules are needed: one for the encoder side and one for the decoder side
    self.feed_forward_enc = feed_forward_enc
    self.feed_forward_dec = feed_forward_dec
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.norm3 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)
def __init__(self, size, self_attn, cn_src_attn, en_src_attn, feed_forward,
             dropout_rate, moe_att_mode='linear',
             normalize_before=True, concat_after=False):
    """Construct a HANDecoderLayer object."""
    super(HANDecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.norm3 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)
    # Hierarchical attention
    self.cn_src_attn = cn_src_attn  # declare attn here for initialization
    self.en_src_attn = en_src_attn
    self.src_attn = MoEAttn(size, cn_src_attn, en_src_attn, moe_att_mode)
def __init__(
    self,
    size,
    self_attn,
    feed_forward,
    dropout_rate,
    normalize_before=True,
    concat_after=False,
):
    """Construct a DecoderLayer object."""
    super(DecoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.size = size
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat = nn.Linear((size + size), size)
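# Hedged usage sketch (not from the class above): when concat_after is True, the attention
# output is typically concatenated with the layer input and projected back to `size` by the
# linear created in __init__. The shapes and residual form shown here are assumptions.
import torch

size = 256
concat = torch.nn.Linear(size + size, size)
x = torch.randn(4, 30, size)            # layer input (batch, length, size)
attn_out = torch.randn(4, 30, size)     # stand-in for self_attn(x, x, x, mask)
x = x + concat(torch.cat((x, attn_out), dim=-1))  # residual over the projected concatenation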
def __init__(
    self,
    size,
    self_attn,
    src_attn,
    feed_forward_1,  # fix
    feed_forward_2,  # fix
    dropout_rate,
    normalize_before=True,
    concat_after=False,
):
    """Construct a DecoderLayer object."""
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward_1 = feed_forward_1  # fix
    self.feed_forward_2 = feed_forward_2  # fix
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.norm3 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)
def __init__(self, size, self_attn, feed_forward, dropout_rate,
             normalize_before=True, concat_after=False, time_window=15):
    super(EncoderLayerTimeRestricted, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.size = size
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear = nn.Linear(size + size, size)
    self.window_size = time_window
    pad_front = int(self.window_size / 2)
    pad_end = self.window_size - pad_front - 1
    self.pad = (
        0,
        0,
        pad_front,
        pad_end,
    )  # pad the second to last dimension by (pad_front, pad_end)
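# Minimal sketch of how the padding tuple computed above behaves; the tensor shapes are
# illustrative assumptions, not values taken from the class.
import torch
import torch.nn.functional as F

window_size = 15
pad_front = window_size // 2              # 7
pad_end = window_size - pad_front - 1     # 7
x = torch.randn(4, 100, 256)              # (batch, time, feature)
padded = F.pad(x, (0, 0, pad_front, pad_end))  # pads only the time (second-to-last) axis
print(padded.shape)                       # torch.Size([4, 114, 256])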
def __init__(
    self,
    size,
    self_attn,
    feed_forward,
    feed_forward_macaron,
    conv_module,
    dropout_rate,
    normalize_before=True,
    concat_after=False,
):
    """Construct an EncoderLayer object."""
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.feed_forward_macaron = feed_forward_macaron
    self.conv_module = conv_module
    self.norm_ff = LayerNorm(size)  # for the FFN module
    self.norm_mha = LayerNorm(size)  # for the MHA module
    if feed_forward_macaron is not None:
        self.norm_ff_macaron = LayerNorm(size)
        self.ff_scale = 0.5
    else:
        self.ff_scale = 1.0
    if self.conv_module is not None:
        self.norm_conv = LayerNorm(size)  # for the CNN module
        self.norm_final = LayerNorm(size)  # for the final output of the block
    self.dropout = nn.Dropout(dropout_rate)
    self.size = size
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear = nn.Linear(size + size, size)
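# Illustrative sketch of the half-step residual that ff_scale = 0.5 enables when a macaron
# feed-forward module is present (Macaron-Net style). The Linear stand-in and shapes are
# assumptions; the real module would be a positionwise feed-forward block.
import torch

ff_macaron = torch.nn.Linear(256, 256)    # placeholder for feed_forward_macaron
x = torch.randn(4, 50, 256)
ff_scale = 0.5                            # value set when feed_forward_macaron is not None
x = x + ff_scale * ff_macaron(x)          # half-step residual applied before the attention block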
def __init__(self, size, self_attn, feed_forward, dropout_rate):
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.size = size
def __init__(self, size, self_attn, src_attn, feed_forward, dropout_rate,
             normalize_before=True, concat_after=False,
             cross_self_attn=None, cross_src_attn=None, cross_operator=None,
             cross_shared=False, cross_weight_learnable=False, cross_weight=0.0):
    """Construct a DecoderLayer object."""
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward

    if not cross_shared and cross_self_attn is not None and cross_src_attn is not None:
        self.cross_self_attn = cross_self_attn
        self.cross_src_attn = cross_src_attn
        self.cross_shared = False
    else:
        self.cross_self_attn = None
        self.cross_src_attn = None
        if cross_self_attn is not None:
            self.cross_attn = cross_self_attn
        if cross_src_attn is not None:
            self.cross_attn = cross_src_attn
        if cross_self_attn is None and cross_src_attn is None:
            self.cross_attn = None
        self.cross_shared = True
    self.cross_operator = cross_operator

    if cross_self_attn is not None or cross_src_attn is not None:
        if cross_operator == "concat":
            self.cross_concat_linear1 = nn.Linear(size + size, size)
            self.cross_concat_linear2 = nn.Linear(size + size, size)
        elif cross_operator == "sum":
            if cross_weight_learnable:
                assert float(cross_weight) > 0
                self.cross_weight = torch.nn.Parameter(torch.tensor(cross_weight))
            else:
                self.cross_weight = cross_weight

    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.norm3 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)
def __init__(
    self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0, hparams=None
):
    """Initialize duration predictor module.

    Args:
        idim (int): Input dimension.
        n_layers (int, optional): Number of convolutional layers.
        n_chans (int, optional): Number of channels of convolutional layers.
        kernel_size (int, optional): Kernel size of convolutional layers.
        dropout_rate (float, optional): Dropout rate.
        offset (float, optional): Offset value to avoid nan in log domain.
        hparams (optional): Hyperparameter container; ``hparams.is_spk_layer_norm``
            selects speaker-dependent layer normalization.

    """
    super(DurationPredictor, self).__init__()
    self.hparams = hparams
    self.offset = offset
    self.conv = torch.nn.ModuleList()
    self.norm = torch.nn.ModuleList() if hparams.is_spk_layer_norm else None
    self.dropout = torch.nn.ModuleList() if hparams.is_spk_layer_norm else None
    for idx in range(n_layers):
        in_chans = idim if idx == 0 else n_chans
        if hparams.is_spk_layer_norm:
            # LayerNorm and Dropout are kept in separate ModuleLists in this branch
            self.conv += [
                torch.nn.Sequential(
                    torch.nn.Conv1d(
                        in_chans,
                        n_chans,
                        kernel_size,
                        stride=1,
                        padding=(kernel_size - 1) // 2,
                    ),
                    torch.nn.ReLU(),
                )
            ]
        else:
            self.conv += [
                torch.nn.Sequential(
                    torch.nn.Conv1d(
                        in_chans,
                        n_chans,
                        kernel_size,
                        stride=1,
                        padding=(kernel_size - 1) // 2,
                    ),
                    torch.nn.ReLU(),
                    LayerNorm(n_chans, hparams=hparams, dim=1),
                    torch.nn.Dropout(dropout_rate),
                )
            ]
        if hparams.is_spk_layer_norm:
            self.norm.append(LayerNorm(n_chans, hparams=hparams, dim=1))
            self.dropout.append(torch.nn.Dropout(dropout_rate))
    self.linear = torch.nn.Linear(n_chans, 1)
def __init__(self, size, self_attn, feed_forward, dropout_rate): """Construct an DecoderLayer object.""" super(DecoderLayer, self).__init__() self.self_attn = self_attn self.feed_forward = feed_forward self.norm1 = LayerNorm(size) self.norm2 = LayerNorm(size) self.dropout = nn.Dropout(dropout_rate) self.size = size
def __init__(self, size, lstm, src_attn, feed_forward, dropout_rate,
             normalize_before=True, concat_after=False):
    super(DecoderLayer, self).__init__()
    self.size = size
    self.lstm = lstm
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.norm3 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)
def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
             tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False):
    super(RelMultiHeadAttn, self).__init__()
    self.n_head = n_head
    self.d_model = d_model
    self.d_head = d_head
    self.dropout = dropout
    self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False)
    self.drop = nn.Dropout(dropout)
    self.dropatt = nn.Dropout(dropatt)
    self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
    self.layer_norm = LayerNorm(d_model)
    self.scale = 1 / (d_head ** 0.5)
    self.pre_lnorm = pre_lnorm
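# Rough sketch of how a fused qkv_net projection is commonly split and how the
# 1/sqrt(d_head) scale is applied to attention logits; the chunking, shapes, and the
# head-agnostic score computation below are assumptions about usage, not code from
# RelMultiHeadAttn itself.
import torch

n_head, d_head, d_model = 4, 64, 256
qkv_net = torch.nn.Linear(d_model, 3 * n_head * d_head, bias=False)
x = torch.randn(10, 2, d_model)                   # (length, batch, d_model)
q, k, v = torch.chunk(qkv_net(x), 3, dim=-1)      # each (length, batch, n_head * d_head)
scale = 1 / (d_head ** 0.5)
logits = scale * torch.einsum("ibd,jbd->ijb", q, k)  # scaled scores, ignoring the head split for brevity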
def __init__(self, idim, args): super(Encoder, self).__init__() if args.transformer_input_layer == "linear": self.input_layer = torch.nn.Sequential( torch.nn.Linear(idim, args.adim), torch.nn.LayerNorm(args.adim), torch.nn.Dropout(args.dropout_rate), torch.nn.ReLU(), PositionalEncoding(args.adim, args.dropout_rate)) elif args.transformer_input_layer == "conv2d": self.input_layer = Conv2dSubsampling(idim, args.adim, args.dropout_rate) elif args.transformer_input_layer == "embed": self.input_layer = torch.nn.Sequential( torch.nn.Embedding(idim, args.adim), PositionalEncoding(args.adim, args.dropout_rate)) else: raise ValueError("unknown input_layer: " + args.transformer_input_layer) self.encoders = repeat( args.elayers, lambda: EncoderLayer( args.adim, MultiHeadedAttention(args.aheads, args.adim, args. transformer_attn_dropout_rate), PositionwiseFeedForward(args.adim, args.eunits, args. dropout_rate), args.dropout_rate)) self.norm = LayerNorm(args.adim)
def __init__(
    self,
    odim,
    jdim,
    attention_dim=512,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.0,
    attention_dropout_rate=0.0,
    input_layer="embed",
    pos_enc_class=PositionalEncoding,
    blank=0,
):
    """Construct a Decoder object for transformer-transducer models."""
    torch.nn.Module.__init__(self)

    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
        )
    else:
        raise NotImplementedError(
            "only `embed`, `linear`, or torch.nn.Module is supported."
        )

    self.decoders = repeat(
        num_blocks,
        lambda lnum: DecoderLayer(
            attention_dim,
            MultiHeadedAttention(
                attention_heads, attention_dim, attention_dropout_rate
            ),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
        ),
    )

    self.after_norm = LayerNorm(attention_dim)

    self.lin_enc = torch.nn.Linear(attention_dim, jdim)
    self.lin_dec = torch.nn.Linear(attention_dim, jdim, bias=False)
    self.lin_out = torch.nn.Linear(jdim, odim)

    self.attention_dim = attention_dim
    self.odim = odim
    self.blank = blank
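# Hedged sketch of the joint-network arithmetic the lin_enc / lin_dec / lin_out layers
# support in a transducer model; the broadcasting and tanh combination are assumptions.
import torch

attention_dim, jdim, odim = 512, 128, 500
lin_enc = torch.nn.Linear(attention_dim, jdim)
lin_dec = torch.nn.Linear(attention_dim, jdim, bias=False)
lin_out = torch.nn.Linear(jdim, odim)
h_enc = torch.randn(2, 100, 1, attention_dim)   # (batch, T, 1, dim) encoder states
h_dec = torch.randn(2, 1, 20, attention_dim)    # (batch, 1, U, dim) prediction states
joint = lin_out(torch.tanh(lin_enc(h_enc) + lin_dec(h_dec)))  # (batch, T, U, odim) logits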
def __init__( self, idim, enc_arch, input_layer="linear", repeat_block=0, self_attn_type="selfattn", positional_encoding_type="abs_pos", positionwise_layer_type="linear", positionwise_activation_type="relu", conv_mod_activation_type="relu", normalize_before=True, padding_idx=-1, ): """Construct an Transformer encoder object.""" super(Encoder, self).__init__() self.embed, self.encoders, self.enc_out = build_blocks( "encoder", idim, input_layer, enc_arch, repeat_block=repeat_block, self_attn_type=self_attn_type, positional_encoding_type=positional_encoding_type, positionwise_layer_type=positionwise_layer_type, positionwise_activation_type=positionwise_activation_type, conv_mod_activation_type=conv_mod_activation_type, padding_idx=padding_idx, ) self.normalize_before = normalize_before if self.normalize_before: self.after_norm = LayerNorm(self.enc_out)
def __init__(
    self,
    size: int,
    kernel_size: int,
    dropout_rate: float,
    use_linear_after_conv: bool,
    gate_activation: str,
):
    super().__init__()

    n_channels = size // 2  # split input channels
    self.norm = LayerNorm(n_channels)
    self.conv = torch.nn.Conv1d(
        n_channels,
        n_channels,
        kernel_size,
        1,
        (kernel_size - 1) // 2,
        groups=n_channels,
    )
    if use_linear_after_conv:
        self.linear = torch.nn.Linear(n_channels, n_channels)
    else:
        self.linear = None

    if gate_activation == "identity":
        self.act = torch.nn.Identity()
    else:
        self.act = get_activation(gate_activation)

    self.dropout = torch.nn.Dropout(dropout_rate)
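# Minimal sketch of the gating pattern this unit is built around: the input is split into
# two halves along the channel axis and one half (after a depthwise convolution over time)
# gates the other. The shapes and transpose layout are assumptions.
import torch

size = 512
n_channels = size // 2
x = torch.randn(4, 50, size)                      # (batch, time, size)
x_r, x_g = x.chunk(2, dim=-1)                     # two halves of n_channels each
conv = torch.nn.Conv1d(n_channels, n_channels, 3, 1, 1, groups=n_channels)
gate = conv(x_g.transpose(1, 2)).transpose(1, 2)  # depthwise conv over the time axis
out = x_r * gate                                  # gated output, (batch, time, n_channels)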
def __init__(self, idim,
             time_len=8,
             mem_len=0,
             ext_len=0,
             future_len=0,
             attention_type="memory",
             attention_dim=256,
             attention_heads=4,
             linear_units=2048,
             num_blocks=6,
             dropout_rate=0.1,
             positional_dropout_rate=0.1,
             attention_dropout_rate=0.0,
             input_layer="conv2d",
             pos_enc_class=PositionalEncoding,
             normalize_before=True,
             concat_after=False):
    super(Encoder, self).__init__()
    self.idim = idim
    self.time_len = time_len
    self.future_len = future_len
    self.attention_dim = attention_dim
    self.attention_heads = attention_heads
    self.linear_units = linear_units
    self.dropout_rate = dropout_rate
    self.input_layer = input_layer
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    self.attention_type = attention_type
    self.positional_dropout_rate = positional_dropout_rate
    self.pos_enc_class = pos_enc_class
    self._generateInputLayer()
    if attention_type == "memory":
        self.encoders = repeat(
            num_blocks,
            lambda: EncoderLayerXL(n_head=attention_heads,
                                   d_model=attention_dim,
                                   d_head=attention_dim // attention_heads,
                                   ext_len=ext_len,
                                   mem_len=mem_len,
                                   future_len=future_len,
                                   dropout=dropout_rate,
                                   dropatt=attention_dropout_rate,
                                   pre_lnorm=normalize_before,
                                   pos_ff=PositionwiseFeedForward(
                                       attention_dim, linear_units, dropout_rate)))
    elif attention_type == "traditional":
        self.encoders = repeat(
            num_blocks,
            lambda: EncoderLayerTD(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after))
    else:
        raise ValueError("only memory or traditional can be used")
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
):
    assert check_argument_types()
    super().__init__()
    attention_dim = encoder_output_size

    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(vocab_size, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(vocab_size, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    else:
        raise ValueError(f"only 'embed' or 'linear' is supported: {input_layer}")

    self.normalize_before = normalize_before
    self.decoders = repeat(
        num_blocks,
        lambda: DecoderLayer(
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim,
                                 self_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
    else:
        self.output_layer = None
def __init__( self, input_size: int, w2v_url: str, w2v_dir_path: str = "./", output_size: int = 256, normalize_before: bool = False, freeze_finetune_updates: int = 0, ): assert check_argument_types() super().__init__() if w2v_url != "": try: import fairseq from fairseq.models.wav2vec.wav2vec2 import Wav2Vec2Model except Exception as e: print("Error: FairSeq is not properly installed.") print( "Please install FairSeq: cd ${MAIN_ROOT}/tools && make fairseq.done" ) raise e self.w2v_model_path = download_w2v(w2v_url, w2v_dir_path) self._output_size = output_size models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( [self.w2v_model_path], arg_overrides={"data": w2v_dir_path}, ) model = models[0] if not isinstance(model, Wav2Vec2Model): try: model = model.w2v_encoder.w2v_model except Exception as e: print("Error: pretrained models should be within: " "'Wav2Vec2Model, Wav2VecCTC' classes, etc.") raise e self.encoders = model self.pretrained_params = copy.deepcopy(model.state_dict()) self.normalize_before = normalize_before if self.normalize_before: self.after_norm = LayerNorm(output_size) if model.cfg.encoder_embed_dim != output_size: # TODO(xkc09): try LSTM self.output_layer = torch.nn.Sequential( torch.nn.Linear(model.cfg.encoder_embed_dim, output_size), ) else: self.output_layer = None self.freeze_finetune_updates = freeze_finetune_updates self.register_buffer("num_updates", torch.LongTensor([0]))
def __init__(self, odim,
             attention_dim=256,
             attention_heads=4,
             linear_units=2048,
             num_blocks=6,
             dropout_rate=0.1,
             positional_dropout_rate=0.1,
             self_attention_dropout_rate=0.0,
             src_attention_dropout_rate=0.0,
             input_layer="embed",
             use_output_layer=True,
             pos_enc_class=PositionalEncoding,
             normalize_before=True,
             concat_after=False,
             moe_att_mode='linear'):
    """Construct a Decoder object."""
    torch.nn.Module.__init__(self)
    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate))
    else:
        raise NotImplementedError(
            "only `embed`, `linear`, or torch.nn.Module is supported.")
    self.normalize_before = normalize_before
    self.decoders = repeat(
        num_blocks,
        lambda: HANDecoderLayer(
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim,
                                 self_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            MultiHeadedAttention(attention_heads, attention_dim,
                                 src_attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate=dropout_rate,
            moe_att_mode=moe_att_mode,
            normalize_before=normalize_before,
            concat_after=concat_after,
        ))
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__(self, idim,
             center_len=8,
             left_len=0,
             hop_len=0,
             right_len=0,
             abs_pos=1,
             rel_pos=0,
             use_mem=1,
             att_type="mta",
             subpos=None,
             subtype="normal",
             attention_dim=256,
             attention_heads=4,
             linear_units=2048,
             num_blocks=6,
             dropout_rate=0.1,
             positional_dropout_rate=0.1,
             attention_dropout_rate=0.0,
             input_layer="conv2d",
             pos_enc_class=PositionalEncoding,
             normalize_before=True,
             concat_after=False):
    super(Encoder, self).__init__()
    if subpos is None:
        subpos = [0, 0]
    self.idim = idim
    self.center_len = center_len
    self.use_mem = use_mem != 0
    self.left_len = left_len
    if self.use_mem:
        self.mem_len = left_len
    else:
        self.mem_len = 0
    self.hop_len = hop_len
    self.right_len = right_len
    self.abs_pos = abs_pos != 0
    self.rel_pos = rel_pos != 0
    self.attention_dim = attention_dim
    self.attention_heads = attention_heads
    self.linear_units = linear_units
    self.dropout_rate = dropout_rate
    self.input_layer = input_layer
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    self.positional_dropout_rate = positional_dropout_rate
    self.pos_enc_class = pos_enc_class
    self.subpos = subpos
    self.subtype = subtype
    self.num_blocks = num_blocks
    self.attention_dropout_rate = attention_dropout_rate
    self.att_type = att_type
    self.encoders = torch.nn.ModuleList()
    self._generateInputLayer()
    self._generateEncoderLayer()
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
def __init__(
    self,
    hdim: int,
    self_attention: MultiHeadedAttention,
    feed_forward: PositionwiseFeedForward,
    dropout_rate: float,
):
    """Construct a DecoderLayer object."""
    super().__init__()
    self.self_attention = self_attention
    self.feed_forward = feed_forward
    self.norm1 = LayerNorm(hdim)
    self.norm2 = LayerNorm(hdim)
    self.dropout = torch.nn.Dropout(dropout_rate)
    self.hdim = hdim
def __init__(
    self,
    size,
    self_attn,
    feed_forward,
    dropout_rate,
    total_layer_num,
    normalize_before=True,
    concat_after=False,
):
    """Construct an EncoderLayer object."""
    super(ContextualBlockEncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.norm1 = LayerNorm(size)
    self.norm2 = LayerNorm(size)
    self.dropout = nn.Dropout(dropout_rate)
    self.size = size
    self.normalize_before = normalize_before
    self.concat_after = concat_after
    self.total_layer_num = total_layer_num
    if self.concat_after:
        self.concat_linear = nn.Linear(size + size, size)
def __init__( self, idim: int, enc_arch: List, input_layer: str = "linear", repeat_block: int = 1, self_attn_type: str = "selfattn", positional_encoding_type: str = "abs_pos", positionwise_layer_type: str = "linear", positionwise_activation_type: str = "relu", conv_mod_activation_type: str = "relu", aux_enc_output_layers: List = [], input_layer_dropout_rate: float = 0.0, input_layer_pos_enc_dropout_rate: float = 0.0, padding_idx: int = -1, ): """Construct an CustomEncoder object.""" super().__init__() ( self.embed, self.encoders, self.enc_out, self.conv_subsampling_factor, ) = build_blocks( "encoder", idim, input_layer, enc_arch, repeat_block=repeat_block, self_attn_type=self_attn_type, positional_encoding_type=positional_encoding_type, positionwise_layer_type=positionwise_layer_type, positionwise_activation_type=positionwise_activation_type, conv_mod_activation_type=conv_mod_activation_type, input_layer_dropout_rate=input_layer_dropout_rate, input_layer_pos_enc_dropout_rate=input_layer_pos_enc_dropout_rate, padding_idx=padding_idx, ) self.after_norm = LayerNorm(self.enc_out) self.n_blocks = len(enc_arch) * repeat_block self.aux_enc_output_layers = aux_enc_output_layers
def __init__( self, idim, enc_arch, input_layer="linear", repeat_block=0, self_attn_type="selfattn", positional_encoding_type="abs_pos", positionwise_layer_type="linear", positionwise_activation_type="relu", conv_mod_activation_type="relu", normalize_before=True, aux_task_layer_list=[], padding_idx=-1, ): """Construct an CustomEncoder object.""" super().__init__() ( self.embed, self.encoders, self.enc_out, self.conv_subsampling_factor, ) = build_blocks( "encoder", idim, input_layer, enc_arch, repeat_block=repeat_block, self_attn_type=self_attn_type, positional_encoding_type=positional_encoding_type, positionwise_layer_type=positionwise_layer_type, positionwise_activation_type=positionwise_activation_type, conv_mod_activation_type=conv_mod_activation_type, padding_idx=padding_idx, ) self.normalize_before = normalize_before if self.normalize_before: self.after_norm = LayerNorm(self.enc_out) self.n_blocks = len(enc_arch) * repeat_block self.aux_task_layer_list = aux_task_layer_list
def __init__(self, odim, args):
    super(Decoder, self).__init__()
    self.embed = torch.nn.Sequential(
        torch.nn.Embedding(odim, args.adim),
        PositionalEncoding(args.adim, args.dropout_rate)
    )
    self.decoders = repeat(
        args.dlayers,
        lambda: DecoderLayer(
            args.adim,
            MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate),
            MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate),
            PositionwiseFeedForward(args.adim, args.dunits, args.dropout_rate),
            args.dropout_rate
        )
    )
    self.output_norm = LayerNorm(args.adim)
    self.output_layer = torch.nn.Linear(args.adim, odim)
def __init__(self, idim,
             attention_dim=256,
             attention_heads=4,
             linear_units=2048,
             num_blocks=6,
             dropout_rate=0.1,
             positional_dropout_rate=0.1,
             attention_dropout_rate=0.0,
             input_layer="conv2d",
             pos_enc_class=PositionalEncoding,
             normalize_before=True,
             concat_after=False):
    super(Encoder, self).__init__()
    if input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
    elif input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(idim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate))
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer,
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    else:
        raise ValueError("unknown input_layer: " + input_layer)
    self.normalize_before = normalize_before
    self.encoders = repeat(
        num_blocks,
        lambda: EncoderLayer(
            attention_dim,
            MultiHeadedAttention(attention_heads, attention_dim,
                                 attention_dropout_rate),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after))
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
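# The `repeat` helper above is assumed to build num_blocks independent copies of the layer
# returned by the lambda; a rough stand-in using plain PyTorch modules could look like this
# (torch.nn.TransformerEncoderLayer is only a placeholder for the project's EncoderLayer).
import torch

num_blocks, attention_dim, attention_heads = 6, 256, 4
encoders = torch.nn.ModuleList(
    [torch.nn.TransformerEncoderLayer(attention_dim, attention_heads) for _ in range(num_blocks)]
)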
def __init__( self, odim, edim, jdim, dec_arch, input_layer="embed", repeat_block=0, joint_activation_type="tanh", positional_encoding_type="abs_pos", positionwise_layer_type="linear", positionwise_activation_type="relu", dropout_rate_embed=0.0, blank=0, ): """Construct a Decoder object for transformer-transducer models.""" torch.nn.Module.__init__(self) self.embed, self.decoders, ddim = build_blocks( "decoder", odim, input_layer, dec_arch, repeat_block=repeat_block, positional_encoding_type=positional_encoding_type, positionwise_layer_type=positionwise_layer_type, positionwise_activation_type=positionwise_activation_type, dropout_rate_embed=dropout_rate_embed, padding_idx=blank, ) self.after_norm = LayerNorm(ddim) self.lin_enc = torch.nn.Linear(edim, jdim) self.lin_dec = torch.nn.Linear(ddim, jdim, bias=False) self.lin_out = torch.nn.Linear(jdim, odim) self.joint_activation = get_activation(joint_activation_type) self.dunits = ddim self.odim = odim self.blank = blank
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
):
    assert check_argument_types()
    super().__init__()
    attention_dim = encoder_output_size

    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(vocab_size, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(vocab_size, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    else:
        raise ValueError(f"only 'embed' or 'linear' is supported: {input_layer}")

    self.normalize_before = normalize_before
    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
    else:
        self.output_layer = None

    # Must be set by the inheriting class
    self.decoders = None
def __init__(
    self,
    idim: int,
    n_layers: int = 2,
    n_chans: int = 384,
    kernel_size: int = 3,
    bias: bool = True,
    dropout_rate: float = 0.5,
    output_dim=1,
):
    """Initialize duration predictor module.

    Args:
        idim (int): Input dimension.
        n_layers (int, optional): Number of convolutional layers.
        n_chans (int, optional): Number of channels of convolutional layers.
        kernel_size (int, optional): Kernel size of convolutional layers.
        bias (bool, optional): Whether to use bias in the convolutional layers.
        dropout_rate (float, optional): Dropout rate.
        output_dim (int, optional): Output dimension of the final linear layer.

    """
    # assert check_argument_types()
    super().__init__()
    self.conv = torch.nn.ModuleList()
    for idx in range(n_layers):
        in_chans = idim if idx == 0 else n_chans
        self.conv += [
            torch.nn.Sequential(
                torch.nn.Conv1d(
                    in_chans,
                    n_chans,
                    kernel_size,
                    stride=1,
                    padding=(kernel_size - 1) // 2,
                    bias=bias,
                ),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate),
            )
        ]
    self.linear = torch.nn.Linear(n_chans, output_dim)
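# Illustrative sketch of how a predictor built from these Conv1d blocks consumes its input;
# the transpose/squeeze pattern is an assumption about the forward pass, shown with a single
# convolutional layer for brevity.
import torch

idim, n_chans, output_dim = 384, 384, 1
conv = torch.nn.Conv1d(idim, n_chans, 3, stride=1, padding=1)
linear = torch.nn.Linear(n_chans, output_dim)
xs = torch.randn(2, 120, idim)                    # (batch, Tmax, idim)
hs = torch.relu(conv(xs.transpose(1, -1)))        # Conv1d expects (batch, channels, Tmax)
out = linear(hs.transpose(1, -1)).squeeze(-1)     # (batch, Tmax) per-frame predictions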