Example #1
    def define_name(dir_name, args):
        if 'conv' in args.enc_type:
            dir_name = ConvEncoder.define_name(dir_name, args)

        dir_name += str(args.transformer_enc_d_model) + 'dmodel'
        dir_name += str(args.transformer_enc_d_ff) + 'dff'
        if args.transformer_ffn_bottleneck_dim > 0:
            dir_name += str(args.transformer_ffn_bottleneck_dim) + 'bn'
        dir_name += str(args.enc_n_layers) + 'L'
        dir_name += str(args.transformer_enc_n_heads) + 'H'
        dir_name += 'pe' + str(args.transformer_enc_pe_type)
        if args.transformer_enc_clamp_len > 0:
            dir_name += '_clamp' + str(args.transformer_enc_clamp_len)
        if args.dropout_enc_layer > 0:
            dir_name += '_LD' + str(args.dropout_enc_layer)
        if int(str(args.lc_chunk_size_left).split('_')[-1]) > 0 or \
                int(str(args.lc_chunk_size_current).split('_')[-1]) > 0 or \
                int(str(args.lc_chunk_size_right).split('_')[-1]) > 0:
            dir_name += '_chunkL' + str(args.lc_chunk_size_left) + 'C' + \
                str(args.lc_chunk_size_current) + 'R' + str(args.lc_chunk_size_right)
            dir_name += '_' + args.lc_type
        elif sum(list(map(int,
                          args.transformer_enc_lookaheads.split('_')))) > 0:
            dir_name += '_LA' + str(
                sum(list(map(int,
                             args.transformer_enc_lookaheads.split('_')))))
        return dir_name
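The snippet below is a minimal usage sketch, not code from the project: it builds an illustrative argparse.Namespace and calls the define_name shown above (assumed to be accessible, e.g. as a static method on the Transformer encoder class) to show how the directory-name suffix is composed.

    # Minimal sketch (illustrative values only): how define_name() above turns
    # parsed arguments into an experiment-directory suffix.
    from argparse import Namespace

    args = Namespace(
        enc_type='transformer',                 # no 'conv' -> ConvEncoder.define_name is skipped
        transformer_enc_d_model=256,
        transformer_enc_d_ff=2048,
        transformer_ffn_bottleneck_dim=0,
        enc_n_layers=12,
        transformer_enc_n_heads=4,
        transformer_enc_pe_type='add',
        transformer_enc_clamp_len=-1,
        dropout_enc_layer=0.0,
        lc_chunk_size_left="0",
        lc_chunk_size_current="0",
        lc_chunk_size_right="0",
        transformer_enc_lookaheads="0_0_0_0_0_0_0_0_0_0_0_0",
        lc_type='reshape')

    print(define_name('', args))  # -> '256dmodel2048dff12L4Hpeadd'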
Example #2
File: rnn.py  Project: pradipcyb/neural_sp
    def add_args(parser, args):
        group = parser.add_argument_group("RNN encoder")
        parser = ConvEncoder.add_args(parser, args)
        group.add_argument('--enc_n_units', type=int, default=512,
                           help='number of units in each encoder RNN layer')
        group.add_argument('--enc_n_projs', type=int, default=0,
                           help='number of units in the projection layer after each encoder RNN layer')
        group.add_argument('--bidirectional_sum_fwd_bwd', type=strtobool, default=False,
                           help='sum forward and backward RNN outputs for dimension reduction')
        # streaming
        group.add_argument('--lc_chunk_size_left', type=str, default="0",
                           help='left chunk size for latency-controlled RNN encoder')
        group.add_argument('--lc_chunk_size_right', type=str, default="0",
                           help='right chunk size for latency-controlled RNN encoder')
        return parser
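A hypothetical wiring sketch follows (it is not the project's training script): a first pass parses the already-registered options, then the encoder-specific options are added based on that partially parsed Namespace. The import path is assumed by analogy with the imports shown in Example #16.

    # Hypothetical two-pass argument registration (illustrative only).
    import argparse
    from neural_sp.models.seq2seq.encoders.rnn import RNNEncoder  # assumed module path

    parser = argparse.ArgumentParser()
    parser.add_argument('--enc_type', type=str, default='blstm')
    args, _ = parser.parse_known_args([])        # first pass: only --enc_type exists
    parser = RNNEncoder.add_args(parser, args)   # second pass: register encoder options

    args = parser.parse_args(['--enc_n_units', '320', '--lc_chunk_size_left', '40'])
    print(args.enc_n_units, args.lc_chunk_size_left)  # -> 320 40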
Example #3
    def define_name(dir_name, args):
        if 'conv' in args.enc_type:
            dir_name = ConvEncoder.define_name(dir_name, args)

        dir_name += str(args.enc_n_units) + 'H'
        if args.enc_n_projs > 0:
            dir_name += str(args.enc_n_projs) + 'P'
        dir_name += str(args.enc_n_layers) + 'L'
        if args.bidirectional_sum_fwd_bwd:
            dir_name += '_sumfwdbwd'
        if int(args.lc_chunk_size_left.split('_')[0]) > 0 or int(args.lc_chunk_size_right.split('_')[0]) > 0:
            dir_name += '_chunkL' + args.lc_chunk_size_left + 'R' + args.lc_chunk_size_right
        return dir_name
Example #4
    def add_args(parser, args):
        """Add arguments."""
        group = parser.add_argument_group("Transformer encoder")
        if 'conv' in args.enc_type:
            parser = ConvEncoder.add_args(parser, args)
        # Transformer common
        if not hasattr(args, 'transformer_d_model'):
            group.add_argument('--transformer_d_model', type=int, default=256,
                               help='number of units in the MHA layer')
        if not hasattr(args, 'transformer_d_ff'):
            group.add_argument('--transformer_d_ff', type=int, default=2048,
                               help='number of units in the FFN layer')
        if not hasattr(args, 'transformer_d_ff_bottleneck_dim'):
            group.add_argument('--transformer_d_ff_bottleneck_dim', type=int, default=0,
                               help='bottleneck dimension in the FFN layer')
        if not hasattr(args, 'transformer_n_heads'):
            group.add_argument('--transformer_n_heads', type=int, default=4,
                               help='number of heads in the MHA layer')
        if not hasattr(args, 'transformer_layer_norm_eps'):
            group.add_argument('--transformer_layer_norm_eps', type=float, default=1e-12,
                               help='epsilon value for layer normalization')
        if not hasattr(args, 'transformer_ffn_activation'):
            group.add_argument('--transformer_ffn_activation', type=str, default='relu',
                               choices=['relu', 'gelu', 'gelu_accurate', 'glu', 'swish'],
                               help='nonlinear activation for the FFN layer')
        if not hasattr(args, 'transformer_param_init'):
            group.add_argument('--transformer_param_init', type=str, default='xavier_uniform',
                               choices=['xavier_uniform', 'pytorch'],
                               help='parameter initialization')
        # NOTE: These checks are important to avoid conflict with args in Transformer decoder

        # Conformer encoder specific
        group.add_argument('--transformer_enc_pe_type', type=str, default='relative',
                           choices=['relative'],
                           help='type of positional encoding for the Transformer encoder')
        group.add_argument('--conformer_kernel_size', type=int, default=32,
                           help='kernel size for depthwise convolution in convolution module for Conformer encoder layers')
        group.add_argument('--dropout_enc_layer', type=float, default=0.0,
                           help='LayerDrop probability for Conformer encoder layers')
        # streaming
        group.add_argument('--lc_chunk_size_left', type=int, default=0,
                           help='left chunk size for latency-controlled Conformer encoder')
        group.add_argument('--lc_chunk_size_current', type=int, default=0,
                           help='current chunk size (and hop size) for latency-controlled Conformer encoder')
        group.add_argument('--lc_chunk_size_right', type=int, default=0,
                           help='right chunk size for latency-controlled Conformer encoder')
        return parser
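The hasattr() guards above (see the NOTE) matter because argparse refuses to register the same option string twice; a shared option such as --transformer_d_model may also be added by the Transformer decoder. The sketch below only illustrates that behaviour with plain argparse and is not project code.

    # Sketch: duplicate registration fails, so shared options are guarded.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--transformer_d_model', type=int, default=256)
    try:
        parser.add_argument('--transformer_d_model', type=int, default=256)
    except argparse.ArgumentError as e:
        print('duplicate registration rejected:', e)

    # Guarding on a first-pass Namespace skips the second registration instead:
    args, _ = parser.parse_known_args([])
    if not hasattr(args, 'transformer_d_model'):
        parser.add_argument('--transformer_d_model', type=int, default=256)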
Example #5
    def add_args(parser, args):
        """Add arguments."""
        group = parser.add_argument_group("Transformer encoder")
        if 'conv' in args.enc_type:
            parser = ConvEncoder.add_args(parser, args)
        # Transformer common
        if not hasattr(args, 'transformer_layer_norm_eps'):
            group.add_argument('--transformer_ffn_bottleneck_dim', type=int, default=0,
                               help='bottleneck dimension in the FFN layer')
            group.add_argument('--transformer_input_bottleneck_dim', type=int, default=0,
                               help='bottleneck dimension in the input embedding layer')
            group.add_argument('--transformer_layer_norm_eps', type=float, default=1e-12,
                               help='epsilon value for layer normalization')
            group.add_argument('--transformer_ffn_activation', type=str, default='relu',
                               choices=['relu', 'gelu', 'gelu_accurate', 'glu', 'swish'],
                               help='nonlinear activation for the FFN layer')
            group.add_argument('--transformer_param_init', type=str, default='xavier_uniform',
                               choices=['xavier_uniform', 'pytorch'],
                               help='parameter initialization')

        # Transformer encoder specific
        group.add_argument('--transformer_enc_d_model', type=int, default=256,
                           help='number of units in the MHA layer for Transformer encoder')
        group.add_argument('--transformer_enc_d_ff', type=int, default=2048,
                           help='number of units in the FFN layer for Transformer encoder')
        group.add_argument('--transformer_enc_n_heads', type=int, default=4,
                           help='number of heads in the MHA layer for Transformer encoder')
        group.add_argument('--transformer_enc_pe_type', type=str, default='add',
                           choices=['add', 'none', 'relative', 'relative_xl'],
                           help='type of positional encoding for Transformer encoder')
        group.add_argument('--dropout_enc_layer', type=float, default=0.0,
                           help='LayerDrop probability for Transformer encoder layers')
        group.add_argument('--transformer_enc_clamp_len', type=int, default=-1,
                           help='maximum length for relative positional encoding. -1 means infinite length.')
        # streaming
        group.add_argument('--transformer_enc_lookaheads', type=str, default="0_0_0_0_0_0_0_0_0_0_0_0",
                           help='lookahead frames per layer for unidirectional Transformer encoder')
        group.add_argument('--lc_chunk_size_left', type=str, default="0",
                           help='left chunk size for latency-controlled Transformer encoder')
        group.add_argument('--lc_chunk_size_current', type=str, default="0",
                           help='current chunk size (and hop size) for latency-controlled Transformer encoder')
        group.add_argument('--lc_chunk_size_right', type=str, default="0",
                           help='right chunk size for latency-controlled Transformer encoder')
        group.add_argument('--lc_type', type=str, default='reshape',
                           choices=['reshape', 'mask'],
                           help='implementation methods of latency-controlled Transformer encoder')
        return parser
Example #6
    def define_name(dir_name, args):
        if 'conv' in args.enc_type:
            dir_name = ConvEncoder.define_name(dir_name, args)

        dir_name += str(args.transformer_d_model) + 'dmodel'
        dir_name += str(args.transformer_d_ff) + 'dff'
        if args.transformer_ffn_bottleneck_dim > 0:
            dir_name += str(args.transformer_ffn_bottleneck_dim) + 'bn'
        dir_name += str(args.enc_n_layers) + 'L'
        dir_name += str(args.transformer_n_heads) + 'H'
        dir_name += 'pe' + str(args.transformer_enc_pe_type)
        if args.dropout_enc_layer > 0:
            dir_name += 'droplayer' + str(args.dropout_enc_layer)
        if args.lc_chunk_size_left > 0 or getattr(args, 'lc_chunk_size_current', 0) > 0 or args.lc_chunk_size_right > 0:
            dir_name += '_chunkL' + str(args.lc_chunk_size_left) + 'C' + \
                str(args.lc_chunk_size_current) + 'R' + str(args.lc_chunk_size_right)
            dir_name += '_' + args.lc_type
        return dir_name
Example #7
    def define_name(dir_name, args):
        if 'conv' in args.enc_type:
            dir_name = ConvEncoder.define_name(dir_name, args)

        dir_name += str(args.enc_n_units) + 'H'
        if args.enc_n_projs > 0:
            dir_name += str(args.enc_n_projs) + 'P'
        dir_name += str(args.enc_n_layers) + 'L'
        if args.bidirectional_sum_fwd_bwd:
            dir_name += '_sumfwdbwd'
        if int(args.lc_chunk_size_left.split('_')[0]) > 0 or int(
                args.lc_chunk_size_right.split('_')[0]) > 0:
            dir_name += '_chunkL' + args.lc_chunk_size_left + 'R' + args.lc_chunk_size_right
            if not args.cnn_lookahead:
                dir_name += '_blockwise'
        if args.rsp_prob_enc > 0:
            dir_name += '_RSP' + str(args.rsp_prob_enc)
        return dir_name
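Both define_name variants above read the chunk sizes as underscore-delimited strings (Example #7 takes the first field, Example #1 the last). A small stand-alone parse with made-up values is shown below.

    # Illustrative parsing of the underscore-delimited chunk-size strings.
    lc_chunk_size_left = "40_400"   # made-up value, e.g. one field per stage
    lc_chunk_size_right = "0"

    left_first = int(lc_chunk_size_left.split('_')[0])    # 40  (Example #7)
    left_last = int(lc_chunk_size_left.split('_')[-1])     # 400 (Example #1)
    streaming = left_first > 0 or int(lc_chunk_size_right.split('_')[0]) > 0
    print(left_first, left_last, streaming)  # -> 40 400 True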
Example #8
File: rnn.py  Project: mbencherif/neural_sp
    def add_args(parser, args):
        group = parser.add_argument_group("RNN encoder")
        parser = ConvEncoder.add_args(parser, args)
        group.add_argument('--enc_n_units', type=int, default=512,
                           help='number of units in each encoder RNN layer')
        group.add_argument('--enc_n_projs', type=int, default=0,
                           help='number of units in the projection layer after each encoder RNN layer')
        group.add_argument('--bidirectional_sum_fwd_bwd', type=strtobool, default=False,
                           help='sum forward and backward RNN outputs for dimension reduction')
        # streaming
        group.add_argument('--lc_chunk_size_left', type=str, default="-1",
                           help='left chunk size for latency-controlled RNN encoder')
        group.add_argument('--lc_chunk_size_right', type=str, default="0",
                           help='right chunk size for latency-controlled RNN encoder')
        group.add_argument('--cnn_lookahead', type=strtobool, default=True,
                           help='use lookahead frames in CNN layers (set to False to disable)')
        group.add_argument('--rsp_prob_enc', type=float, default=0.0,
                           help='probability for Random State Passing (RSP)')
        return parser
Example #9
    def add_args(parser, args):
        # group = parser.add_argument_group("TDS encoder")
        parser = ConvEncoder.add_args(parser, args)
        return parser
Example #10
    def __init__(self, input_dim, attn_type, n_heads, n_layers, d_model, d_ff,
                 last_proj_dim, pe_type, layer_norm_eps, ffn_activation,
                 dropout_in, dropout, dropout_att, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes,
                 conv_strides, conv_poolings, conv_batch_norm, conv_layer_norm,
                 conv_bottleneck_dim, conv_param_init, param_init,
                 chunk_size_left, chunk_size_current, chunk_size_right):

        super(TransformerEncoder, self).__init__()

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type
        self.chunk_size_left = chunk_size_left
        self.chunk_size_current = chunk_size_current
        self.chunk_size_right = chunk_size_right

        # Setting for CNNs before RNNs
        if conv_channels:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)
        self.layers = repeat(
            TransformerEncoderBlock(d_model, d_ff, attn_type, n_heads, dropout,
                                    dropout_att, layer_norm_eps,
                                    ffn_activation, param_init), n_layers)
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        if last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim
        else:
            self.bridge = None
            self._odim = d_model

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor()

        if param_init == 'xavier_uniform':
            self.reset_parameters()
Example #11
    def __init__(self, input_dim, enc_type, n_heads, n_layers, n_layers_sub1,
                 n_layers_sub2, d_model, d_ff, ffn_bottleneck_dim,
                 ffn_activation, pe_type, layer_norm_eps, last_proj_dim,
                 dropout_in, dropout, dropout_att, dropout_layer, subsample,
                 subsample_type, n_stacks, n_splices, conv_in_channel,
                 conv_channels, conv_kernel_sizes, conv_strides, conv_poolings,
                 conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 conv_param_init, task_specific_layer, param_init, clamp_len,
                 lookahead, chunk_size_left, chunk_size_current,
                 chunk_size_right, streaming_type):

        super(TransformerEncoder, self).__init__()

        # parse subsample
        subsamples = [1] * n_layers
        for lth, s in enumerate(list(map(int,
                                         subsample.split('_')[:n_layers]))):
            subsamples[lth] = s
        # parse lookahead
        lookaheads = [0] * n_layers
        for lth, s in enumerate(list(map(int,
                                         lookahead.split('_')[:n_layers]))):
            lookaheads[lth] = s

        if len(subsamples) > 0 and len(subsamples) != n_layers:
            raise ValueError(
                'subsample must be the same size as n_layers. n_layers: %d, subsample: %s'
                % (n_layers, subsamples))
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise Warning(
                'Set n_layers_sub1 between 1 to n_layers. n_layers: %d, n_layers_sub1: %d'
                % (n_layers, n_layers_sub1))
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise Warning(
                'Set n_layers_sub2 between 1 to n_layers_sub1. n_layers_sub1: %d, n_layers_sub2: %d'
                % (n_layers_sub1, n_layers_sub2))

        self.enc_type = enc_type
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type
        self.scale = math.sqrt(d_model)

        # for compatibility
        chunk_size_left = str(chunk_size_left)
        chunk_size_current = str(chunk_size_current)
        chunk_size_right = str(chunk_size_right)

        # for streaming encoder
        self.unidir = 'uni' in enc_type
        self.lookaheads = lookaheads
        if sum(lookaheads) > 0:
            assert self.unidir
        self.chunk_size_left = int(chunk_size_left.split('_')[-1]) // n_stacks
        self.chunk_size_current = int(
            chunk_size_current.split('_')[-1]) // n_stacks
        self.chunk_size_right = int(
            chunk_size_right.split('_')[-1]) // n_stacks
        self.lc_bidir = self.chunk_size_current > 0 and enc_type != 'conv' and 'uni' not in enc_type
        self.cnn_lookahead = self.unidir or enc_type == 'conv'
        self.streaming_type = streaming_type if self.lc_bidir else ''
        # -: past context
        # *: current context
        # +: future context
        # reshape) overlapping windowing. Additional redundant computation is introduced.
        # During inference, caching is not applied. However, since (N_l+N_c+N_r) is very short
        # and independent of layer depth, the overhead is negligible.
        # chunk1: |**|++
        # chunk2:  --|**|++
        # chunk3:     --|**|++
        # chunk4:        --|**|++
        # chunk5:           --|**|++
        # mask) chunkwise masking. Future context is restricted to the current chunk
        # to avoid accumulation of future context growing with layer depth.
        # chunk1: |**|
        # chunk2:  --|**|
        # chunk3:  -- --|**|
        # chunk4:     -- --|**|
        # chunk5:        -- --|**|
        if self.unidir:
            assert self.chunk_size_left == self.chunk_size_current == self.chunk_size_right == 0
        if self.streaming_type == 'mask':
            assert self.chunk_size_right == 0
            assert self.chunk_size_left == self.chunk_size_current
            # NOTE: this is important to cache CNN output at each chunk
        if self.lc_bidir:
            assert n_layers_sub1 == 0
            assert n_layers_sub2 == 0
            assert not self.unidir

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # for attention plot
        self.aws_dict = {}
        self.data_dict = {}

        # Setting for CNNs
        if 'conv' in enc_type:
            assert conv_channels
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor
        self.subsample = None
        if np.prod(subsamples) > 1:
            self._factor *= np.prod(subsamples)
            if subsample_type == 'max_pool':
                self.subsample = nn.ModuleList(
                    [MaxpoolSubsampler(factor) for factor in subsamples])
            elif subsample_type == 'concat':
                self.subsample = nn.ModuleList([
                    ConcatSubsampler(factor, self._odim)
                    for factor in subsamples
                ])
            elif subsample_type == 'drop':
                self.subsample = nn.ModuleList(
                    [DropSubsampler(factor) for factor in subsamples])
            elif subsample_type == '1dconv':
                self.subsample = nn.ModuleList([
                    Conv1dSubsampler(factor, self._odim)
                    for factor in subsamples
                ])
            elif subsample_type == 'add':
                self.subsample = nn.ModuleList(
                    [AddSubsampler(factor) for factor in subsamples])

        if self.chunk_size_left > 0:
            assert self.chunk_size_left % self._factor == 0
        if self.chunk_size_current > 0:
            assert self.chunk_size_current % self._factor == 0
        if self.chunk_size_right > 0:
            assert self.chunk_size_right % self._factor == 0

        self.pos_enc, self.pos_emb = None, None
        self.u_bias, self.v_bias = None, None
        if pe_type in ['relative', 'relative_xl']:
            self.pos_emb = XLPositionalEmbedding(d_model, dropout)
            if pe_type == 'relative_xl':
                self.u_bias = nn.Parameter(
                    torch.Tensor(n_heads, d_model // n_heads))
                self.v_bias = nn.Parameter(
                    torch.Tensor(n_heads, d_model // n_heads))
                # NOTE: u_bias and v_bias are global parameters shared in the whole model
        else:
            self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type,
                                              param_init)

        self.layers = nn.ModuleList([
            copy.deepcopy(
                TransformerEncoderBlock(d_model, d_ff, n_heads, dropout,
                                        dropout_att, dropout_layer,
                                        layer_norm_eps, ffn_activation,
                                        param_init, pe_type, clamp_len,
                                        ffn_bottleneck_dim))
            for _ in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self._odim = d_model

        if n_layers_sub1 > 0:
            if task_specific_layer:
                self.layer_sub1 = TransformerEncoderBlock(
                    d_model, d_ff, n_heads, dropout, dropout_att,
                    dropout_layer, layer_norm_eps, ffn_activation, param_init,
                    pe_type, clamp_len, ffn_bottleneck_dim)
            odim_sub1 = d_model
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)
                odim_sub1 = last_proj_dim
            if n_layers_sub1 == n_layers:
                self.norm_out_sub1 = None
            else:
                self.norm_out_sub1 = nn.LayerNorm(odim_sub1,
                                                  eps=layer_norm_eps)

        if n_layers_sub2 > 0:
            if task_specific_layer:
                self.layer_sub2 = TransformerEncoderBlock(
                    d_model, d_ff, n_heads, dropout, dropout_att,
                    dropout_layer, layer_norm_eps, ffn_activation, param_init,
                    pe_type, clamp_len, ffn_bottleneck_dim)
            odim_sub2 = d_model
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)
                odim_sub2 = last_proj_dim
            if n_layers_sub2 == n_layers:
                self.norm_out_sub2 = None
            else:
                self.norm_out_sub2 = nn.LayerNorm(odim_sub2,
                                                  eps=layer_norm_eps)

        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        self.reset_parameters(param_init)

        # for streaming inference
        self.reset_cache()
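As a rough sketch of the chunk-size bookkeeping in this constructor: the last underscore-delimited field is taken, divided by the frame-stacking factor, and then required to be a multiple of the total subsampling factor. The numbers below are illustrative, not project defaults.

    # Illustrative chunk-size normalization (made-up numbers).
    n_stacks = 1
    subsampling_factor = 4                    # e.g. two conv layers with stride 2

    chunk_size_current = "0_64"               # the constructor keeps the last field -> 64
    N_c = int(chunk_size_current.split('_')[-1]) // n_stacks

    assert N_c % subsampling_factor == 0, \
        'chunk size must be divisible by the total subsampling factor'
    print(N_c, N_c // subsampling_factor)     # 64 input frames -> 16 encoder positions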
Example #12
    def __init__(self, input_dim, enc_type, n_heads, kernel_size, n_layers,
                 n_layers_sub1, n_layers_sub2, d_model, d_ff,
                 ffn_bottleneck_dim, last_proj_dim, pe_type, layer_norm_eps,
                 ffn_activation, dropout_in, dropout, dropout_att,
                 dropout_layer, n_stacks, n_splices, conv_in_channel,
                 conv_channels, conv_kernel_sizes, conv_strides, conv_poolings,
                 conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 conv_param_init, task_specific_layer, param_init,
                 chunk_size_left, chunk_size_current, chunk_size_right):

        super(ConformerEncoder, self).__init__()

        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type
        self.scale = math.sqrt(d_model)

        # for streaming encoder
        self.chunk_size_left = chunk_size_left
        self.chunk_size_current = chunk_size_current
        self.chunk_size_right = chunk_size_right
        self.latency_controlled = chunk_size_left > 0 or chunk_size_current > 0 or chunk_size_right > 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # for attention plot
        self.aws_dict = {}
        self.data_dict = {}

        # Setting for CNNs
        if 'conv' in enc_type:
            assert conv_channels
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor

        if self.chunk_size_left > 0:
            assert self.chunk_size_left % self._factor == 0
        if self.chunk_size_current > 0:
            assert self.chunk_size_current % self._factor == 0
        if self.chunk_size_right > 0:
            assert self.chunk_size_right % self._factor == 0

        self.pos_emb = XLPositionalEmbedding(d_model, dropout)
        assert pe_type == 'relative'
        # TODO(hirofumi0810): try other positional encodings

        self.layers = nn.ModuleList([
            copy.deepcopy(
                ConformerEncoderBlock(d_model,
                                      d_ff,
                                      n_heads,
                                      kernel_size,
                                      dropout,
                                      dropout_att,
                                      dropout_layer,
                                      layer_norm_eps,
                                      ffn_activation,
                                      param_init,
                                      ffn_bottleneck_dim=ffn_bottleneck_dim))
            for _ in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self._odim = d_model

        if n_layers_sub1 > 0:
            if task_specific_layer:
                self.layer_sub1 = ConformerEncoderBlock(
                    d_model,
                    d_ff,
                    n_heads,
                    kernel_size,
                    dropout,
                    dropout_att,
                    dropout_layer,
                    layer_norm_eps,
                    ffn_activation,
                    param_init,
                    ffn_bottleneck_dim=ffn_bottleneck_dim)
            self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

        if n_layers_sub2 > 0:
            if task_specific_layer:
                self.layer_sub2 = ConformerEncoderBlock(
                    d_model,
                    d_ff,
                    n_heads,
                    kernel_size,
                    dropout,
                    dropout_att,
                    dropout_layer,
                    layer_norm_eps,
                    ffn_activation,
                    param_init,
                    ffn_bottleneck_dim=ffn_bottleneck_dim)
            self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        self.reset_parameters(param_init)
Example #13
    def __init__(self, input_dim, enc_type, attn_type, n_heads, n_layers,
                 n_layers_sub1, n_layers_sub2, d_model, d_ff, last_proj_dim,
                 pe_type, layer_norm_eps, ffn_activation, dropout_in, dropout,
                 dropout_att, dropout_layer, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes,
                 conv_strides, conv_poolings, conv_batch_norm, conv_layer_norm,
                 conv_bottleneck_dim, conv_param_init, task_specific_layer,
                 param_init, chunk_size_left, chunk_size_current,
                 chunk_size_right):

        super(TransformerEncoder, self).__init__()

        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type

        # for streaming TransformerXL encoder
        self.N_l = chunk_size_left
        self.N_c = chunk_size_current
        self.N_r = chunk_size_right
        self.latency_controlled = chunk_size_left > 0 or chunk_size_current > 0 or chunk_size_right > 0
        self.memory_transformer = ('transformer_xl' in enc_type)
        self.mem_len = chunk_size_left
        self.scale = math.sqrt(d_model)
        if self.memory_transformer:
            assert pe_type == 'none'
            assert chunk_size_left > 0
            assert chunk_size_current > 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # for attention plot
        self.aws_dict = {}
        self.data_dict = {}

        # Setting for CNNs
        if conv_channels:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor

        if self.memory_transformer:
            self.pos_emb = XLPositionalEmbedding(d_model, dropout)
            self.u = nn.Parameter(
                torch.Tensor(self.n_heads, self.d_model // self.n_heads))
            self.v = nn.Parameter(
                torch.Tensor(self.n_heads, self.d_model // self.n_heads))
            # NOTE: u and v are global parameters
        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type,
                                          param_init)

        self.layers = nn.ModuleList([
            copy.deepcopy(
                TransformerEncoderBlock(
                    d_model,
                    d_ff,
                    attn_type,
                    n_heads,
                    dropout,
                    dropout_att,
                    dropout_layer,
                    layer_norm_eps,
                    ffn_activation,
                    param_init,
                    memory_transformer=self.memory_transformer))
            for _ in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self._odim = d_model

        if n_layers_sub1 > 0:
            if task_specific_layer:
                self.layer_sub1 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_layer, layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

        if n_layers_sub2 > 0:
            if task_specific_layer:
                self.layer_sub2 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_layer, layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        self.reset_parameters(param_init)
Example #14
File: rnn.py  Project: rosrad/neural_sp
    def __init__(self,
                 input_dim,
                 rnn_type,
                 n_units,
                 n_projs,
                 n_layers,
                 dropout_in,
                 dropout,
                 subsample,
                 subsample_type='drop',
                 n_stacks=1,
                 n_splices=1,
                 last_proj_dim=0,
                 conv_in_channel=1,
                 conv_channels=0,
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 conv_poolings=[],
                 conv_batch_norm=False,
                 conv_bottleneck_dim=0,
                 n_layers_sub1=0,
                 n_layers_sub2=0,
                 nin=False,
                 task_specific_layer=False,
                 param_init=0.1):

        super(RNNEncoder, self).__init__()
        logger = logging.getLogger("training")

        if len(subsample) > 0 and len(subsample) != n_layers:
            raise ValueError('subsample must be the same size as n_layers.')
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.rnn_type = rnn_type
        self.bidirectional = True if rnn_type in [
            'blstm', 'bgru', 'conv_blstm', 'conv_bgru'
        ] else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers

        # Setting for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # Setting for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        # Setting for CNNs before RNNs
        if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
            channels = [int(c) for c in conv_channels.split('_')
                        ] if len(conv_channels) > 0 else []
            kernel_sizes = [[
                int(c.split(',')[0].replace('(', '')),
                int(c.split(',')[1].replace(')', ''))
            ] for c in conv_kernel_sizes.split('_')
                            ] if len(conv_kernel_sizes) > 0 else []
            if rnn_type in ['tds', 'gated_conv']:
                strides = []
                poolings = []
            else:
                strides = [[
                    int(c.split(',')[0].replace('(', '')),
                    int(c.split(',')[1].replace(')', ''))
                ] for c in conv_strides.split('_')
                           ] if len(conv_strides) > 0 else []
                poolings = [[
                    int(c.split(',')[0].replace('(', '')),
                    int(c.split(',')[1].replace(')', ''))
                ] for c in conv_poolings.split('_')
                            ] if len(conv_poolings) > 0 else []
            if 'conv_' in rnn_type:
                subsample = [1] * self.n_layers
                logger.warning(
                    'Subsampling is automatically ignored because CNN layers are used before RNN layers.'
                )
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []

        if len(channels) > 0:
            if rnn_type == 'tds':
                self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                       in_channel=conv_in_channel,
                                       channels=channels,
                                       kernel_sizes=kernel_sizes,
                                       dropout=dropout,
                                       bottleneck_dim=last_proj_dim)
            elif rnn_type == 'gated_conv':
                self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                             in_channel=conv_in_channel,
                                             channels=channels,
                                             kernel_sizes=kernel_sizes,
                                             dropout=dropout,
                                             bottleneck_dim=last_proj_dim,
                                             param_init=param_init)
            else:
                assert n_stacks == 1 and n_splices == 1
                self.conv = ConvEncoder(input_dim,
                                        in_channel=conv_in_channel,
                                        channels=channels,
                                        kernel_sizes=kernel_sizes,
                                        strides=strides,
                                        poolings=poolings,
                                        dropout=0,
                                        batch_norm=conv_batch_norm,
                                        bottleneck_dim=conv_bottleneck_dim,
                                        param_init=param_init)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

        self.padding = Padding()

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            self.rnn = nn.ModuleList()
            self.dropout = nn.ModuleList()
            self.proj = None
            if n_projs > 0:
                self.proj = nn.ModuleList()

            # subsample
            self.subsample = None
            if subsample_type == 'max_pool' and np.prod(subsample) > 1:
                self.subsample = nn.ModuleList(
                    [MaxpoolSubsampler(subsample[l]) for l in range(n_layers)])
            elif subsample_type == 'concat' and np.prod(subsample) > 1:
                self.subsample = nn.ModuleList([
                    ConcatSubsampler(subsample[l], n_units, self.n_dirs)
                    for l in range(n_layers)
                ])
            elif subsample_type == 'drop' and np.prod(subsample) > 1:
                self.subsample = nn.ModuleList(
                    [DropSubsampler(subsample[l]) for l in range(n_layers)])

            # NiN
            self.nin = None
            if nin:
                self.nin = nn.ModuleList()

            for l in range(n_layers):
                if 'lstm' in rnn_type:
                    rnn_i = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError(
                        'rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".'
                    )

                self.rnn += [
                    rnn_i(self._output_dim,
                          n_units,
                          1,
                          bias=True,
                          batch_first=True,
                          dropout=0,
                          bidirectional=self.bidirectional)
                ]
                self.dropout += [nn.Dropout(p=dropout)]
                self._output_dim = n_units * self.n_dirs

                # Projection layer
                if self.proj is not None:
                    if l != n_layers - 1:
                        self.proj += [Linear(n_units * self.n_dirs, n_projs)]
                        self._output_dim = n_projs

                # Task specific layer
                if l == n_layers_sub1 - 1 and task_specific_layer:
                    self.rnn_sub1 = rnn_i(self._output_dim,
                                          n_units,
                                          1,
                                          bias=True,
                                          batch_first=True,
                                          dropout=0,
                                          bidirectional=self.bidirectional)
                    self.dropout_sub1 = nn.Dropout(p=dropout)
                    if last_proj_dim != self.output_dim:
                        self.bridge_sub1 = Linear(n_units, last_proj_dim)
                if l == n_layers_sub2 - 1 and task_specific_layer:
                    self.rnn_sub2 = rnn_i(self._output_dim,
                                          n_units,
                                          1,
                                          bias=True,
                                          batch_first=True,
                                          dropout=0,
                                          bidirectional=self.bidirectional)
                    self.dropout_sub2 = nn.Dropout(p=dropout)
                    if last_proj_dim != self.output_dim:
                        self.bridge_sub2 = Linear(n_units, last_proj_dim)

                # Network in network
                if self.nin is not None:
                    if l != n_layers - 1:
                        self.nin += [NiN(self._output_dim)]
                    # if n_layers_sub1 > 0 or n_layers_sub2 > 0:
                    #     assert task_specific_layer

            if last_proj_dim != self.output_dim:
                self.bridge = Linear(self._output_dim, last_proj_dim)
                self._output_dim = last_proj_dim

        # Initialize parameters
        self.reset_parameters(param_init)
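The CNN front-end hyperparameters above arrive as strings and are unpacked into lists of ints and int pairs; a small stand-alone sketch of that parsing, with illustrative values, follows.

    # Illustrative parsing of the CNN hyperparameter strings (made-up values).
    conv_channels = "32_32"
    conv_kernel_sizes = "(3,3)_(3,3)"

    channels = [int(c) for c in conv_channels.split('_')] if conv_channels else []
    kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                     int(c.split(',')[1].replace(')', ''))]
                    for c in conv_kernel_sizes.split('_')] if conv_kernel_sizes else []

    print(channels)      # [32, 32]
    print(kernel_sizes)  # [[3, 3], [3, 3]]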
Example #15
    def __init__(self, input_dim, enc_type, attn_type, n_heads, n_layers,
                 n_layers_sub1, n_layers_sub2, d_model, d_ff, last_proj_dim,
                 pe_type, layer_norm_eps, ffn_activation, dropout_in, dropout,
                 dropout_att, dropout_residual, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes,
                 conv_strides, conv_poolings, conv_batch_norm, conv_layer_norm,
                 conv_bottleneck_dim, conv_param_init, task_specific_layer,
                 param_init, chunk_size_left, chunk_size_current,
                 chunk_size_right, n_layers_rnn):

        super(TransformerEncoder, self).__init__()

        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type

        # for streaming TransformerXL encoder
        self.chunk_size_left = chunk_size_left
        self.chunk_size_cur = chunk_size_current
        self.chunk_size_right = chunk_size_right
        self.latency_controlled = chunk_size_left > 0 or chunk_size_current > 0 or chunk_size_right > 0
        self.memory_transformer = ('transformer_xl' in enc_type)
        self.mem_len = chunk_size_left
        self.scale = math.sqrt(d_model)
        if self.memory_transformer:
            assert pe_type == 'none'
            assert chunk_size_left > 0
            assert chunk_size_current > 0
        if self.latency_controlled:
            assert pe_type == 'none'

        # for hybrid RNN-Transformer encoder
        self.hybrid_rnn = n_layers_rnn > 0
        self.n_layers_rnn = n_layers_rnn
        self.proj = None

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # for attention plot
        self.aws_dict = {}
        self.data_dict = {}

        # Setting for CNNs before RNNs
        if conv_channels:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        # Hybrid RNN-Transformer
        if self.hybrid_rnn:
            assert pe_type == 'none'
            self.rnn = nn.ModuleList()
            self.rnn_bwd = nn.ModuleList()
            self.dropout_rnn = nn.Dropout(p=dropout)
            assert ('blstm' in enc_type or 'bgru' in enc_type)
            # NOTE: support bidirectional only
            self.bidir_sum = True

            for _ in range(n_layers_rnn):
                if 'blstm' in enc_type:
                    rnn_i = nn.LSTM
                elif 'bgru' in enc_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError(
                        'rnn_type must be "(conv_)blstm_transformer(_xl)" or "(conv_)bgru_transformer(_xl)".'
                    )

                self.rnn += [rnn_i(self._odim, d_model, 1, batch_first=True)]
                self.rnn_bwd += [
                    rnn_i(self._odim, d_model, 1, batch_first=True)
                ]
                self._odim = d_model if self.bidir_sum else d_model * self.n_dirs

            if self._odim != d_model:
                self.proj = nn.Linear(self._odim, d_model)

        if self.memory_transformer:
            self.pos_emb = XLPositionalEmbedding(d_model, dropout)
            self.u = nn.Parameter(
                torch.Tensor(self.n_heads, self.d_model // self.n_heads))
            self.v = nn.Parameter(
                torch.Tensor(self.n_heads, self.d_model // self.n_heads))
            # NOTE: u and v are global parameters
        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type,
                                          param_init)
        # TODO: replace dropout_in with dropout

        self.layers = nn.ModuleList([
            copy.deepcopy(
                TransformerEncoderBlock(
                    d_model,
                    d_ff,
                    attn_type,
                    n_heads,
                    dropout,
                    dropout_att,
                    dropout_residual * (lth + 1) / n_layers,
                    layer_norm_eps,
                    ffn_activation,
                    param_init,
                    memory_transformer=self.memory_transformer))
            for lth in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self._odim = d_model

        if n_layers_sub1 > 0:
            if task_specific_layer:
                self.layer_sub1 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub1 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim != self.output_dim:
                self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

        if n_layers_sub2 > 0:
            if task_specific_layer:
                self.layer_sub2 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub2 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim != self.output_dim:
                self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

        if last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor

        self.reset_parameters(param_init)
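Note the depth-dependent residual dropout passed to each block above, dropout_residual * (lth + 1) / n_layers: deeper layers get a higher drop probability, a stochastic-depth-style schedule. The sketch below simply evaluates that schedule with illustrative numbers.

    # Illustrative layer-wise residual-dropout schedule (made-up numbers).
    n_layers = 12
    dropout_residual = 0.3

    rates = [dropout_residual * (lth + 1) / n_layers for lth in range(n_layers)]
    print(rates[0], rates[-1])  # 0.025 for the first block, 0.3 for the last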
Example #16
def build_encoder(args):

    if 'conv' in args.enc_type:
        assert args.n_stacks == 1 and args.n_splices == 1
        from neural_sp.models.seq2seq.encoders.conv import ConvEncoder
        conv = ConvEncoder(
            args.input_dim,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            strides=args.conv_strides,
            poolings=args.conv_poolings,
            dropout=0.,
            normalization=args.conv_normalization,
            residual=False,
            bottleneck_dim=args.transformer_enc_d_model
            if 'former' in args.enc_type else args.conv_bottleneck_dim,
            param_init=args.param_init)
    else:
        conv = None

    # safeguard
    if not hasattr(args, 'transformer_enc_d_model') and hasattr(
            args, 'transformer_d_model'):
        args.transformer_enc_d_model = args.transformer_d_model
        args.transformer_dec_d_model = args.transformer_d_model
    if not hasattr(args, 'transformer_enc_d_ff') and hasattr(
            args, 'transformer_d_ff'):
        args.transformer_enc_d_ff = args.transformer_d_ff
    if not hasattr(args, 'transformer_enc_n_heads') and hasattr(
            args, 'transformer_n_heads'):
        args.transformer_enc_n_heads = args.transformer_n_heads

    if args.enc_type == 'tds':
        from neural_sp.models.seq2seq.encoders.tds import TDSEncoder
        encoder = TDSEncoder(
            input_dim=args.input_dim * args.n_stacks,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            dropout=args.dropout_enc,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else args.dec_n_units)

    elif args.enc_type == 'gated_conv':
        from neural_sp.models.seq2seq.encoders.gated_conv import GatedConvEncoder
        encoder = GatedConvEncoder(
            input_dim=args.input_dim * args.n_stacks,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            dropout=args.dropout_enc,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else args.dec_n_units,
            param_init=args.param_init)

    elif 'transformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.transformer import TransformerEncoder
        encoder = TransformerEncoder(
            input_dim=args.input_dim
            if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_heads=args.transformer_enc_n_heads,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_enc_d_model,
            d_ff=args.transformer_enc_d_ff,
            ffn_bottleneck_dim=args.transformer_ffn_bottleneck_dim,
            ffn_activation=args.transformer_ffn_activation,
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else 0,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            frontend_conv=conv,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            clamp_len=args.transformer_enc_clamp_len,
            lookahead=args.transformer_enc_lookaheads,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right,
            streaming_type=args.lc_type)

    elif 'conformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.conformer import ConformerEncoder
        encoder = ConformerEncoder(
            input_dim=args.input_dim
            if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_heads=args.transformer_enc_n_heads,
            kernel_size=args.conformer_kernel_size,
            normalization=args.conformer_normalization,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_enc_d_model,
            d_ff=args.transformer_enc_d_ff,
            ffn_bottleneck_dim=args.transformer_ffn_bottleneck_dim,
            ffn_activation='swish',
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else 0,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            frontend_conv=conv,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            clamp_len=args.transformer_enc_clamp_len,
            lookahead=args.transformer_enc_lookaheads,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right,
            streaming_type=args.lc_type)

    else:
        from neural_sp.models.seq2seq.encoders.rnn import RNNEncoder
        encoder = RNNEncoder(
            input_dim=args.input_dim
            if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_units=args.enc_n_units,
            n_projs=args.enc_n_projs,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else 0,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            frontend_conv=conv,
            bidir_sum_fwd_bwd=args.bidirectional_sum_fwd_bwd,
            task_specific_layer=args.task_specific_layer,
            param_init=args.param_init,
            chunk_size_current=args.lc_chunk_size_left,  # for compatibility
            chunk_size_right=args.lc_chunk_size_right,
            cnn_lookahead=args.cnn_lookahead,
            rsp_prob=args.rsp_prob_enc)

    return encoder
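A usage note on the safeguard block near the top of build_encoder: older configs carry a single transformer_d_model / transformer_d_ff / transformer_n_heads, while newer ones split encoder and decoder values. The sketch below illustrates that aliasing with a SimpleNamespace standing in for the parsed arguments; it is an illustration only, not repository code.

from types import SimpleNamespace

args = SimpleNamespace(transformer_d_model=256, transformer_d_ff=2048,
                       transformer_n_heads=4)

# Copy the legacy shared values onto the encoder/decoder-specific names when
# the latter are missing, mirroring the safeguard in build_encoder above.
if not hasattr(args, 'transformer_enc_d_model'):
    args.transformer_enc_d_model = args.transformer_d_model
    args.transformer_dec_d_model = args.transformer_d_model
if not hasattr(args, 'transformer_enc_d_ff'):
    args.transformer_enc_d_ff = args.transformer_d_ff

print(args.transformer_enc_d_model, args.transformer_enc_d_ff)  # 256 2048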
Example #17
    def __init__(self, input_dim, enc_type, n_heads, n_layers, n_layers_sub1,
                 n_layers_sub2, d_model, d_ff, ffn_bottleneck_dim,
                 last_proj_dim, pe_type, layer_norm_eps, ffn_activation,
                 dropout_in, dropout, dropout_att, dropout_layer, subsample,
                 subsample_type, n_stacks, n_splices, conv_in_channel,
                 conv_channels, conv_kernel_sizes, conv_strides, conv_poolings,
                 conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 conv_param_init, task_specific_layer, param_init,
                 chunk_size_left, chunk_size_current, chunk_size_right,
                 latency_control_type):

        super(TransformerEncoder, self).__init__()

        # parse subsample
        subsamples = [1] * n_layers
        for lth, s in enumerate(list(map(int,
                                         subsample.split('_')[:n_layers]))):
            subsamples[lth] = s

        if len(subsamples) > 0 and len(subsamples) != n_layers:
            raise ValueError(
                'subsample must be the same size as n_layers. n_layers: %d, subsample: %s'
                % (n_layers, subsamples))
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 and n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 and n_layers_sub1.')
        assert enc_type in ['transformer', 'conv_transformer']

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type
        self.scale = math.sqrt(d_model)

        # for streaming encoder
        self.chunk_size_left = chunk_size_left
        self.chunk_size_current = chunk_size_current
        self.chunk_size_right = chunk_size_right
        self.latency_controlled = chunk_size_left > 0 or chunk_size_current > 0 or chunk_size_right > 0
        self.lc_type = latency_control_type
        # reshape: no lookahead frames in the CNN layers, but some additional computation is required
        # mask: lookahead frames are allowed in the CNN layers, with no additional computation

        # TransformerXL like streaming encoder
        self.memory_transformer = ('transformer_xl' in enc_type)
        self.mem_len = chunk_size_left
        if self.memory_transformer:
            assert pe_type == 'relative'
            assert chunk_size_left > 0
            assert chunk_size_current > 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # for attention plot
        self.aws_dict = {}
        self.data_dict = {}

        # Setting for CNNs
        if 'conv' in enc_type:
            assert conv_channels
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor
        self.subsample = None
        if np.prod(subsamples) > 1:
            self._factor *= np.prod(subsamples)
            if subsample_type == 'max_pool':
                self.subsample = nn.ModuleList(
                    [MaxpoolSubsampler(factor) for factor in subsamples])
            elif subsample_type == 'concat':
                self.subsample = nn.ModuleList([
                    ConcatSubsampler(factor, self._odim)
                    for factor in subsamples
                ])
            elif subsample_type == 'drop':
                self.subsample = nn.ModuleList(
                    [DropSubsampler(factor) for factor in subsamples])
            elif subsample_type == '1dconv':
                self.subsample = nn.ModuleList([
                    Conv1dSubsampler(factor, self._odim)
                    for factor in subsamples
                ])

        if self.chunk_size_left > 0:
            assert self.chunk_size_left % self._factor == 0
        if self.chunk_size_current > 0:
            assert self.chunk_size_current % self._factor == 0
        if self.chunk_size_right > 0:
            assert self.chunk_size_right % self._factor == 0

        self.pos_emb = None
        self.u = None
        self.v = None
        if self.memory_transformer:
            self.pos_emb = XLPositionalEmbedding(d_model, dropout)
            self.u = nn.Parameter(torch.Tensor(n_heads, d_model // n_heads))
            self.v = nn.Parameter(torch.Tensor(n_heads, d_model // n_heads))
            # NOTE: u and v are global parameters
        elif pe_type == 'relative':
            self.pos_emb = XLPositionalEmbedding(d_model, dropout)
        else:
            self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type,
                                              param_init)

        self.layers = nn.ModuleList([
            copy.deepcopy(
                TransformerEncoderBlock(d_model,
                                        d_ff,
                                        n_heads,
                                        dropout,
                                        dropout_att,
                                        dropout_layer,
                                        layer_norm_eps,
                                        ffn_activation,
                                        param_init,
                                        relative_attention=self.pos_emb
                                        is not None,
                                        ffn_bottleneck_dim=ffn_bottleneck_dim))
            for _ in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self._odim = d_model

        if n_layers_sub1 > 0:
            if task_specific_layer:
                self.layer_sub1 = TransformerEncoderBlock(
                    d_model,
                    d_ff,
                    n_heads,
                    dropout,
                    dropout_att,
                    dropout_layer,
                    layer_norm_eps,
                    ffn_activation,
                    param_init,
                    ffn_bottleneck_dim=ffn_bottleneck_dim)
            self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

        if n_layers_sub2 > 0:
            if task_specific_layer:
                self.layer_sub2 = TransformerEncoderBlock(
                    d_model,
                    d_ff,
                    n_heads,
                    dropout,
                    dropout_att,
                    dropout_layer,
                    layer_norm_eps,
                    ffn_activation,
                    param_init,
                    ffn_bottleneck_dim=ffn_bottleneck_dim)
            self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        self.reset_parameters(param_init)
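The subsample handling at the top of this constructor expands an '_'-separated spec into one factor per layer and later folds those factors into the overall subsampling factor. A standalone sketch of that bookkeeping (illustration only; the helper is hypothetical):

import numpy as np

def parse_subsample(spec, n_layers):
    """Expand a spec like '1_2_2_1' into per-layer factors plus the total factor."""
    factors = [1] * n_layers
    for lth, s in enumerate(list(map(int, spec.split('_')[:n_layers]))):
        factors[lth] = s
    return factors, int(np.prod(factors))

print(parse_subsample('1_2_2_1', n_layers=4))  # ([1, 2, 2, 1], 4)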
Example #18
    def __init__(self,
                 input_dim,
                 attn_type,
                 attn_n_heads,
                 n_layers,
                 d_model,
                 d_ff,
                 pe_type,
                 dropout_in=0,
                 dropout=0,
                 dropout_att=0,
                 layer_norm_eps=1e-6,
                 n_stacks=1,
                 n_splices=1,
                 conv_in_channel=1,
                 conv_channels=0,
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 conv_poolings=[],
                 conv_batch_norm=False,
                 conv_residual=False,
                 conv_bottleneck_dim=0):

        super(TransformerEncoder, self).__init__()

        self.d_model = d_model
        self.pe_type = pe_type

        # Setting for CNNs before self-attention layers
        if conv_channels:
            channels = [int(c) for c in conv_channels.split('_')] if len(conv_channels) > 0 else []
            kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                             int(c.split(',')[1].replace(')', ''))]
                            for c in conv_kernel_sizes.split('_')] if len(conv_kernel_sizes) > 0 else []
            strides = [[int(c.split(',')[0].replace('(', '')),
                        int(c.split(',')[1].replace(')', ''))]
                       for c in conv_strides.split('_')] if len(conv_strides) > 0 else []
            poolings = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_poolings.split('_')] if len(conv_poolings) > 0 else []
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []

        if len(channels) > 0:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim * n_stacks,
                                    in_channel=conv_in_channel,
                                    channels=channels,
                                    kernel_sizes=kernel_sizes,
                                    strides=strides,
                                    poolings=poolings,
                                    dropout=0,
                                    batch_norm=conv_batch_norm,
                                    residual=conv_residual,
                                    bottleneck_dim=d_model)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

            self.embed_in = LinearND(
                self._output_dim, d_model,
                dropout=0)  # NOTE: do not apply dropout here

        if pe_type:
            self.pos_emb_in = PositionalEncoding(d_model, dropout_in, pe_type)
        self.layer_norm_in = nn.LayerNorm(d_model, eps=layer_norm_eps)

        # Self-attention layers
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                    dropout, dropout_att, layer_norm_eps)
            for l in range(n_layers)
        ])
        self.layer_norm_top = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self._output_dim = d_model
Example #19
File: rnn.py Project: thanhkm/neural_sp
class RNNEncoder(EncoderBase):
    """RNN encoder.

    Args:
        input_dim (int): dimension of input features (freq * channel)
        rnn_type (str): type of encoder (including pure CNN layers)
        n_units (int): number of units in each layer
        n_projs (int): number of units in each projection layer
        last_proj_dim (int): dimension of the last projection layer
        n_layers (int): number of layers
        n_layers_sub1 (int): number of layers in the 1st auxiliary task
        n_layers_sub2 (int): number of layers in the 2nd auxiliary task
        dropout_in (float): dropout probability for input-hidden connection
        dropout (float): dropout probability for hidden-hidden connection
        subsample (list): whether to subsample the output of each RNN layer,
            e.g., [False, True, True, False] means that subsampling is performed after the 2nd and 3rd layers.
        subsample_type (str): drop/concat/max_pool
        n_stacks (int): number of frames to stack
        n_splices (int): number of frames to splice
        conv_in_channel (int): number of channels of input features
        conv_channels (int): number of channels in the CNN blocks
        conv_kernel_sizes (list): size of kernels in the CNN blocks
        conv_strides (list): number of strides in the CNN blocks
        conv_poolings (list): size of poolings in the CNN blocks
        conv_batch_norm (bool): apply batch normalization only in the CNN blocks
        conv_layer_norm (bool): apply layer normalization only in the CNN blocks
        conv_bottleneck_dim (int): dimension of the bottleneck layer between CNN and RNN layers
        nin (bool): insert 1*1 conv + batch normalization + ReLU
        bidirectional_sum_fwd_bwd (bool): sum forward and backward RNN outputs instead of concatenating them
        task_specific_layer (bool): add an extra RNN layer for each sub task
        param_init (float): range of the uniform distribution used for parameter initialization
        lc_chunk_size_left (int): left chunk size for latency-controlled bidirectional encoder
        lc_chunk_size_right (int): right chunk size for latency-controlled bidirectional encoder
        lc_state_reset_prob (float): probability to reset states for latency-controlled bidirectional encoder

    """

    def __init__(self, input_dim, rnn_type, n_units, n_projs, last_proj_dim,
                 n_layers, n_layers_sub1, n_layers_sub2,
                 dropout_in, dropout,
                 subsample, subsample_type, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides, conv_poolings,
                 conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 nin, bidirectional_sum_fwd_bwd,
                 task_specific_layer, param_init,
                 lc_chunk_size_left, lc_chunk_size_right, lc_state_reset_prob):

        super(RNNEncoder, self).__init__()

        if len(subsample) > 0 and len(subsample) != n_layers:
            raise ValueError('subsample must be the same size as n_layers.')
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 and n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 and n_layers_sub1.')

        self.rnn_type = rnn_type
        self.bidirectional = True if ('blstm' in rnn_type or 'bgru' in rnn_type) else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers

        # for latency-controlled
        self.latency_controlled = lc_chunk_size_left > 0 or lc_chunk_size_right > 0
        self.lc_chunk_size_left = lc_chunk_size_left
        self.lc_chunk_size_right = lc_chunk_size_right
        self.lc_state_reset_prob = lc_state_reset_prob
        if self.latency_controlled:
            assert n_layers_sub1 == 0
            assert n_layers_sub2 == 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        if rnn_type == 'tds':
            self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                   in_channel=conv_in_channel,
                                   channels=conv_channels,
                                   kernel_sizes=conv_kernel_sizes,
                                   dropout=dropout,
                                   bottleneck_dim=last_proj_dim)
        elif rnn_type == 'gated_conv':
            self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                         in_channel=conv_in_channel,
                                         channels=conv_channels,
                                         kernel_sizes=conv_kernel_sizes,
                                         dropout=dropout,
                                         bottleneck_dim=last_proj_dim,
                                         param_init=param_init)

        elif 'conv' in rnn_type:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    residual=False,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
        else:
            self.conv = None

        if self.conv is None:
            self._odim = input_dim * n_splices * n_stacks
        else:
            self._odim = self.conv.output_dim
            subsample = [1] * self.n_layers
            logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')

        self.padding = Padding(bidirectional_sum_fwd_bwd=bidirectional_sum_fwd_bwd)

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            self.rnn = nn.ModuleList()
            if self.latency_controlled:
                self.rnn_bwd = nn.ModuleList()
            self.dropout = nn.Dropout(p=dropout)
            self.proj = None
            if n_projs > 0:
                self.proj = nn.ModuleList()

            # subsample
            self.subsample_layer = None
            if subsample_type == 'max_pool' and np.prod(subsample) > 1:
                self.subsample_layer = nn.ModuleList([MaxpoolSubsampler(subsample[l])
                                                      for l in range(n_layers)])
            elif subsample_type == 'concat' and np.prod(subsample) > 1:
                self.subsample_layer = nn.ModuleList([ConcatSubsampler(subsample[l], n_units * self.n_dirs)
                                                      for l in range(n_layers)])
            elif subsample_type == 'drop' and np.prod(subsample) > 1:
                self.subsample_layer = nn.ModuleList([DropSubsampler(subsample[l])
                                                      for l in range(n_layers)])
            elif subsample_type == '1dconv' and np.prod(subsample) > 1:
                self.subsample_layer = nn.ModuleList([Conv1dSubsampler(subsample[l], n_units * self.n_dirs)
                                                      for l in range(n_layers)])

            # NiN
            self.nin = nn.ModuleList() if nin else None

            for l in range(n_layers):
                if 'lstm' in rnn_type:
                    rnn_i = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError('rnn_type must be "(conv_)(b/lcb)lstm" or "(conv_)(b/lcb)gru".')

                if self.latency_controlled:
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                    self.rnn_bwd += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                else:
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True,
                                       bidirectional=self.bidirectional)]
                self._odim = n_units if bidirectional_sum_fwd_bwd else n_units * self.n_dirs
                self.bidirectional_sum_fwd_bwd = bidirectional_sum_fwd_bwd

                # Projection layer
                if self.proj is not None:
                    if l != n_layers - 1:
                        self.proj += [nn.Linear(n_units * self.n_dirs, n_projs)]
                        self._odim = n_projs

                # Task specific layer
                if l == n_layers_sub1 - 1 and task_specific_layer:
                    self.rnn_sub1 = rnn_i(self._odim, n_units, 1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub1 = nn.Linear(n_units, last_proj_dim)
                if l == n_layers_sub2 - 1 and task_specific_layer:
                    self.rnn_sub2 = rnn_i(self._odim, n_units, 1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub2 = nn.Linear(n_units, last_proj_dim)

                # Network in network
                if self.nin is not None:
                    if l != n_layers - 1:
                        self.nin += [NiN(self._odim)]
                    # if n_layers_sub1 > 0 or n_layers_sub2 > 0:
                    #     assert task_specific_layer

            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge = nn.Linear(self._odim, last_proj_dim)
                self._odim = last_proj_dim

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor()
        self._factor *= np.prod(subsample)

        self.reset_parameters(param_init)

        # for streaming inference
        self.reset_cache()

    def reset_parameters(self, param_init):
        """Initialize parameters with uniform distribution."""
        logger.info('===== Initialize %s =====' % self.__class__.__name__)
        for n, p in self.named_parameters():
            if 'conv' in n or 'tds' in n or 'gated_conv' in n:
                continue  # for CNN layers before RNN layers
            if p.dim() == 1:
                nn.init.constant_(p, 0.)  # bias
                logger.info('Initialize %s with %s / %.3f' % (n, 'constant', 0.))
            elif p.dim() in [2, 4]:
                nn.init.uniform_(p, a=-param_init, b=param_init)
                logger.info('Initialize %s with %s / %.3f' % (n, 'uniform', param_init))
            else:
                raise ValueError(n)

    def reset_cache(self):
        self.fwd_states = [None] * self.n_layers
        logger.debug('Reset cache.')

    def forward(self, xs, xlens, task, use_cache=False, streaming=False):
        """Forward computation.

        Args:
            xs (FloatTensor): `[B, T, input_dim]`
            xlens (list): A list of length `[B]`
            task (str): all or ys or ys_sub1 or ys_sub2
            use_cache (bool): use the cached forward encoder state in the previous chunk as the initial state
            streaming (bool): streaming encoding
        Returns:
            eouts (dict):
                xs (FloatTensor): `[B, T // prod(subsample), n_units (*2)]`
                xlens (IntTensor): `[B]`
                xs_sub1 (FloatTensor): `[B, T // prod(subsample), n_units (*2)]`
                xlens_sub1 (IntTensor): `[B]`
                xs_sub2 (FloatTensor): `[B, T // prod(subsample), n_units (*2)]`
                xlens_sub2 (IntTensor): `[B]`

        """
        eouts = {'ys': {'xs': None, 'xlens': None},
                 'ys_sub1': {'xs': None, 'xlens': None},
                 'ys_sub2': {'xs': None, 'xlens': None}}

        # Sort by lengths in descending order for pack_padded_sequence
        xlens, perm_ids = torch.IntTensor(xlens).sort(0, descending=True)
        xs = xs[perm_ids]
        _, perm_ids_unsort = perm_ids.sort()

        # Dropout for input-hidden connection
        xs = self.dropout_in(xs)

        # Path through CNN blocks before RNN layers
        if self.conv is not None:
            xs, xlens = self.conv(xs, xlens)
            if self.rnn_type in ['conv', 'tds', 'gated_conv']:
                eouts['ys']['xs'] = xs
                eouts['ys']['xlens'] = xlens
                return eouts

        if not use_cache:
            self.reset_cache()

        if self.latency_controlled:
            # Flip the layer and time loop
            xs, xlens = self._forward_streaming(xs, xlens, streaming)
        else:
            for l in range(self.n_layers):
                self.rnn[l].flatten_parameters()  # for multi-GPUs
                xs, self.fwd_states[l] = self.padding(xs, xlens, self.rnn[l],
                                                      prev_state=self.fwd_states[l])
                xs = self.dropout(xs)

                # Pick up outputs in the sub task before the projection layer
                if l == self.n_layers_sub1 - 1:
                    xs_sub1, xlens_sub1 = self.sub_module(xs, xlens, perm_ids_unsort, 'sub1')
                    if task == 'ys_sub1':
                        eouts[task]['xs'], eouts[task]['xlens'] = xs_sub1, xlens_sub1
                        return eouts
                if l == self.n_layers_sub2 - 1:
                    xs_sub2, xlens_sub2 = self.sub_module(xs, xlens, perm_ids_unsort, 'sub2')
                    if task == 'ys_sub2':
                        eouts[task]['xs'], eouts[task]['xlens'] = xs_sub2, xlens_sub2
                        return eouts

                # NOTE: Exclude the last layer
                if l != self.n_layers - 1:
                    # Projection layer -> Subsampling -> NiN
                    if self.proj is not None:
                        xs = torch.tanh(self.proj[l](xs))
                    if self.subsample_layer is not None:
                        xs, xlens = self.subsample_layer[l](xs, xlens)
                    if self.nin is not None:
                        xs = self.nin[l](xs)

        # Bridge layer
        if self.bridge is not None:
            xs = self.bridge(xs)

        # Unsort
        xs = xs[perm_ids_unsort]
        xlens = xlens[perm_ids_unsort]

        if task in ['all', 'ys']:
            eouts['ys']['xs'], eouts['ys']['xlens'] = xs, xlens
        if self.n_layers_sub1 >= 1 and task == 'all':
            eouts['ys_sub1']['xs'], eouts['ys_sub1']['xlens'] = xs_sub1, xlens_sub1
        if self.n_layers_sub2 >= 1 and task == 'all':
            eouts['ys_sub2']['xs'], eouts['ys_sub2']['xlens'] = xs_sub2, xlens_sub2
        return eouts

    def _forward_streaming(self, xs, xlens, streaming):
        """Streaming encoding for the latency-controlled bidirectional encoder.

        Args:
            xs (FloatTensor): `[B, T, n_units]`
        Returns:
            xs (FloatTensor): `[B, T, n_units]`

        """
        cs_l = self.lc_chunk_size_left // self.subsampling_factor()
        cs_r = self.lc_chunk_size_right // self.subsampling_factor()

        # full context BPTT
        if cs_l < 0:
            for l in range(self.n_layers):
                self.rnn[l].flatten_parameters()  # for multi-GPUs
                self.rnn_bwd[l].flatten_parameters()  # for multi-GPUs
                # bwd
                xs_bwd = torch.flip(xs, dims=[1])
                xs_bwd, _ = self.rnn_bwd[l](xs_bwd, hx=None)
                xs_bwd = torch.flip(xs_bwd, dims=[1])
                # fwd
                xs_fwd, _ = self.rnn[l](xs, hx=None)
                if self.bidirectional_sum_fwd_bwd:
                    xs = xs_fwd + xs_bwd
                else:
                    xs = torch.cat([xs_fwd, xs_bwd], dim=-1)
                xs = self.dropout(xs)

                # Projection layer
                if self.proj is not None and l != self.n_layers - 1:
                    xs = torch.tanh(self.proj[l](xs))
            return xs, xlens

        bs, xmax, input_dim = xs.size()
        n_chunks = 1 if streaming else math.ceil(xmax / cs_l)
        xlens = torch.IntTensor(bs).fill_(cs_l if streaming else xmax)

        xs_chunks = []
        for t in range(0, cs_l * n_chunks, cs_l):
            xs_chunk = xs[:, t:t + (cs_l + cs_r)]
            for l in range(self.n_layers):
                self.rnn[l].flatten_parameters()  # for multi-GPUs
                self.rnn_bwd[l].flatten_parameters()  # for multi-GPUs
                # bwd
                xs_chunk_bwd = torch.flip(xs_chunk, dims=[1])
                xs_chunk_bwd, _ = self.rnn_bwd[l](xs_chunk_bwd, hx=None)
                xs_chunk_bwd = torch.flip(xs_chunk_bwd, dims=[1])  # `[B, cs_l+cs_r, n_units]`
                # fwd
                if xs_chunk.size(1) <= cs_l:
                    xs_chunk_fwd, self.fwd_states[l] = self.rnn[l](xs_chunk, hx=self.fwd_states[l])
                    if self.training and self.lc_state_reset_prob > 0 and random.random() < self.lc_state_reset_prob:
                        self.fwd_states[l] = None
                else:
                    xs_chunk_fwd1, self.fwd_states[l] = self.rnn[l](xs_chunk[:, :cs_l], hx=self.fwd_states[l])
                    if self.training and self.lc_state_reset_prob > 0 and random.random() < self.lc_state_reset_prob:
                        self.fwd_states[l] = None
                    xs_chunk_fwd2, _ = self.rnn[l](xs_chunk[:, cs_l:], hx=self.fwd_states[l])
                    xs_chunk_fwd = torch.cat([xs_chunk_fwd1, xs_chunk_fwd2], dim=1)  # `[B, cs_l+cs_r, n_units]`
                    # NOTE: xs_chunk_fwd2 is for xs_chunk_bwd in the next layer
                if self.bidirectional_sum_fwd_bwd:
                    xs_chunk = xs_chunk_fwd + xs_chunk_bwd
                else:
                    xs_chunk = torch.cat([xs_chunk_fwd, xs_chunk_bwd], dim=-1)
                xs_chunk = self.dropout(xs_chunk)

                # Projection layer
                if self.proj is not None and l != self.n_layers - 1:
                    xs_chunk = torch.tanh(self.proj[l](xs_chunk))
            xs_chunks.append(xs_chunk[:, :cs_l])
        xs = torch.cat(xs_chunks, dim=1)

        return xs, xlens

    def sub_module(self, xs, xlens, perm_ids_unsort, module='sub1'):
        if self.task_specific_layer:
            getattr(self, 'rnn_' + module).flatten_parameters()  # for multi-GPUs
            xs_sub, _ = self.padding(xs, xlens, getattr(self, 'rnn_' + module))
            xs_sub = self.dropout(xs_sub)
        else:
            xs_sub = xs.clone()[perm_ids_unsort]
        if getattr(self, 'bridge_' + module) is not None:
            xs_sub = getattr(self, 'bridge_' + module)(xs_sub)
        xlens_sub = xlens[perm_ids_unsort]
        return xs_sub, xlens_sub
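To make the chunking in _forward_streaming above easier to follow, here is a minimal sketch that only computes frame ranges: each step feeds the RNN layers a window of cs_l current frames plus cs_r lookahead frames, but keeps only the current part in the output. The helper below is hypothetical and for illustration only.

import math

def chunk_ranges(xmax, cs_l, cs_r):
    """(processed, kept) frame ranges per chunk, mirroring _forward_streaming."""
    n_chunks = math.ceil(xmax / cs_l)
    ranges = []
    for t in range(0, cs_l * n_chunks, cs_l):
        processed = (t, min(t + cs_l + cs_r, xmax))  # frames fed to the RNN layers
        kept = (t, min(t + cs_l, xmax))              # frames kept in the encoder output
        ranges.append((processed, kept))
    return ranges

for processed, kept in chunk_ranges(xmax=10, cs_l=4, cs_r=2):
    print('process', processed, '-> keep', kept)
# process (0, 6) -> keep (0, 4)
# process (4, 10) -> keep (4, 8)
# process (8, 10) -> keep (8, 10)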
Example #20
class TransformerEncoder(EncoderBase):
    """Transformer encoder.

    Args:
        input_dim (int): dimension of input features (freq * channel)
        enc_type (str): type of encoder
        attn_type (str): type of attention
        n_heads (int): number of heads for multi-head attention
        n_layers (int): number of blocks
        n_layers_sub1 (int): number of layers in the 1st auxiliary task
        n_layers_sub2 (int): number of layers in the 2nd auxiliary task
        d_model (int): dimension of MultiheadAttentionMechanism
        d_ff (int): dimension of PositionwiseFeedForward
        last_proj_dim (int): dimension of the last projection layer
        pe_type (str): type of positional encoding
        layer_norm_eps (float): epsilon value for layer normalization
        ffn_activation (str): nonlinear function for PositionwiseFeedForward
        dropout_in (float): dropout probability for input-hidden connection
        dropout (float): dropout probabilities for linear layers
        dropout_att (float): dropout probabilities for attention distributions
        dropout_residual (float): dropout probability for stochastic residual connections
        n_stacks (int): number of frames to stack
        n_splices (int): frames to splice. Default is 1 frame.
        conv_in_channel (int): number of channels of input features
        conv_channels (int): number of channels in the CNN blocks
        conv_kernel_sizes (list): size of kernels in the CNN blocks
        conv_strides (list): number of strides in the CNN blocks
        conv_poolings (list): size of poolings in the CNN blocks
        conv_batch_norm (bool): apply batch normalization only in the CNN blocks
        conv_layer_norm (bool): apply layer normalization only in the CNN blocks
        conv_bottleneck_dim (int): dimension of the bottleneck layer between CNN and self-attention layers
        conv_param_init (float): only for CNN layers before Transformer layers
        chunk_size_left (int): left chunk size for time-restricted Transformer encoder
        chunk_size_current (int): current chunk size for time-restricted Transformer encoder
        chunk_size_right (int): right chunk size for time-restricted Transformer encoder
        task_specific_layer (bool): add a task specific layer for each sub task
        param_init (str): parameter initialization method

    """
    def __init__(self, input_dim, enc_type, attn_type, n_heads, n_layers,
                 n_layers_sub1, n_layers_sub2, d_model, d_ff, last_proj_dim,
                 pe_type, layer_norm_eps, ffn_activation, dropout_in, dropout,
                 dropout_att, dropout_residual, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes,
                 conv_strides, conv_poolings, conv_batch_norm, conv_layer_norm,
                 conv_bottleneck_dim, conv_param_init, task_specific_layer,
                 param_init, chunk_size_left, chunk_size_current,
                 chunk_size_right):

        super(TransformerEncoder, self).__init__()

        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 and n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 and n_layers_sub1.')

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type

        # for latency-controlled
        self.chunk_size_left = chunk_size_left
        self.chunk_size_cur = chunk_size_current
        self.chunk_size_right = chunk_size_right

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # for attention plot
        self.aws_dict = {}
        self.data_dict = {}

        # Setting for CNNs before self-attention layers
        if conv_channels:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)
        self.layers = nn.ModuleList([
            copy.deepcopy(
                TransformerEncoderBlock(d_model, d_ff, attn_type, n_heads,
                                        dropout, dropout_att,
                                        dropout_residual * (l + 1) / n_layers,
                                        layer_norm_eps, ffn_activation,
                                        param_init)) for l in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self._odim = d_model

        if n_layers_sub1 > 0:
            if task_specific_layer:
                self.layer_sub1 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub1 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

        if n_layers_sub2 > 0:
            if task_specific_layer:
                self.layer_sub2 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub2 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor()

        if param_init == 'xavier_uniform':
            self.reset_parameters()

    def reset_parameters(self):
        """Initialize parameters with Xavier uniform distribution."""
        logger.info(
            '===== Initialize %s with Xavier uniform distribution =====' %
            self.__class__.__name__)
        if self.conv is None:
            nn.init.xavier_uniform_(self.embed.weight)
            nn.init.constant_(self.embed.bias, 0.)
        if self.bridge is not None:
            nn.init.xavier_uniform_(self.bridge.weight)
            nn.init.constant_(self.bridge.bias, 0.)

    def forward(self, xs, xlens, task, use_cache=False, streaming=False):
        """Forward computation.

        Args:
            xs (FloatTensor): `[B, T, input_dim]`
            xlens (list): `[B]`
            task (str): not supported now
            use_cache (bool):
            streaming (bool): streaming encoding
        Returns:
            eouts (dict):
                xs (FloatTensor): `[B, T, d_model]`
                xlens (list): `[B]`

        """
        eouts = {
            'ys': {
                'xs': None,
                'xlens': None
            },
            'ys_sub1': {
                'xs': None,
                'xlens': None
            },
            'ys_sub2': {
                'xs': None,
                'xlens': None
            }
        }

        if self.conv is None:
            xs = self.embed(xs)
        else:
            # Path through CNN blocks before self-attention layers
            xs, xlens = self.conv(xs, xlens)
        if not self.training:
            self.data_dict['elens'] = tensor2np(xlens)

        bs, xmax, idim = xs.size()
        xs = self.pos_enc(xs)
        if self.chunk_size_left > 0:
            # Time-restricted self-attention for streaming models
            cs_l = self.chunk_size_left
            cs_c = self.chunk_size_cur
            cs_r = self.chunk_size_right
            xs_chunks = []
            xx_aws = [[] for l in range(self.n_layers)]
            xs_pad = torch.cat([
                xs.new_zeros(bs, cs_l, idim), xs,
                xs.new_zeros(bs, cs_r, idim)
            ],
                               dim=1)
            # TODO: remove right padding
            for t in range(cs_l, cs_l + xmax, self.chunk_size_cur):
                xs_chunk = xs_pad[:, t - cs_l:t + cs_c + cs_r]
                for l, layer in enumerate(self.layers):
                    xs_chunk, xx_aws_chunk = layer(xs_chunk, None)  # no mask
                    xx_aws[l].append(xx_aws_chunk[:, :, cs_l:cs_l + cs_c,
                                                  cs_l:cs_l + cs_c])
                xs_chunks.append(xs_chunk[:, cs_l:cs_l + cs_c])
            xs = torch.cat(xs_chunks, dim=1)[:, :xmax]
            if not self.training:
                for l in range(self.n_layers):
                    self.aws_dict['xx_aws_layer%d' % l] = tensor2np(
                        torch.cat(xx_aws[l], dim=3)[:, :, :xmax, :xmax])
        else:
            # Create the self-attention mask
            xx_mask = make_pad_mask(xlens, self.device_id).unsqueeze(2).repeat(
                [1, 1, xmax])

            for l, layer in enumerate(self.layers):
                xs, xx_aws = layer(xs, xx_mask)
                if not self.training:
                    self.aws_dict['xx_aws_layer%d' % l] = tensor2np(xx_aws)

                # Pick up outputs in the sub task before the projection layer
                if l == self.n_layers_sub1 - 1:
                    xs_sub1 = self.layer_sub1(
                        xs, xx_mask
                    )[0] if self.task_specific_layer else xs.clone()
                    xs_sub1 = self.norm_out_sub1(xs_sub1)
                    if self.bridge_sub1 is not None:
                        xs_sub1 = self.bridge_sub1(xs_sub1)
                    if task == 'ys_sub1':
                        eouts[task]['xs'], eouts[task][
                            'xlens'] = xs_sub1, xlens
                        return eouts
                if l == self.n_layers_sub2 - 1:
                    xs_sub2 = self.layer_sub2(
                        xs, xx_mask
                    )[0] if self.task_specific_layer else xs.clone()
                    xs_sub2 = self.norm_out_sub2(xs_sub2)
                    if self.bridge_sub2 is not None:
                        xs_sub2 = self.bridge_sub2(xs_sub2)
                    if task == 'ys_sub2':
                        eouts[task]['xs'], eouts[task][
                            'xlens'] = xs_sub2, xlens
                        return eouts

        xs = self.norm_out(xs)

        # Bridge layer
        if self.bridge is not None:
            xs = self.bridge(xs)

        if task in ['all', 'ys']:
            eouts['ys']['xs'], eouts['ys']['xlens'] = xs, xlens
        if self.n_layers_sub1 >= 1 and task == 'all':
            eouts['ys_sub1']['xs'], eouts['ys_sub1']['xlens'] = xs_sub1, xlens
        if self.n_layers_sub2 >= 1 and task == 'all':
            eouts['ys_sub2']['xs'], eouts['ys_sub2']['xlens'] = xs_sub2, xlens
        return eouts

    def _plot_attention(self, save_path, n_cols=2):
        """Plot attention for each head in all layers."""
        from matplotlib import pyplot as plt
        from matplotlib.ticker import MaxNLocator

        _save_path = mkdir_join(save_path, 'enc_att_weights')

        # Clean directory
        if _save_path is not None and os.path.isdir(_save_path):
            shutil.rmtree(_save_path)
            os.mkdir(_save_path)

        for k, aw in self.aws_dict.items():
            elens = self.data_dict['elens']

            plt.clf()
            n_heads = aw.shape[1]
            n_cols_tmp = 1 if n_heads == 1 else n_cols
            fig, axes = plt.subplots(max(1, n_heads // n_cols_tmp),
                                     n_cols_tmp,
                                     figsize=(20, 8),
                                     squeeze=False)
            for h in range(n_heads):
                ax = axes[h // n_cols_tmp, h % n_cols_tmp]
                ax.imshow(aw[-1, h, :elens[-1], :elens[-1]], aspect="auto")
                ax.grid(False)
                ax.set_xlabel("Input (head%d)" % h)
                ax.set_ylabel("Output (head%d)" % h)
                ax.xaxis.set_major_locator(MaxNLocator(integer=True))
                ax.yaxis.set_major_locator(MaxNLocator(integer=True))

            fig.tight_layout()
            fig.savefig(os.path.join(_save_path, '%s.png' % k), dpi=500)
            plt.close()
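For the non-streaming branch of forward above, the self-attention mask is derived from the frame lengths. Below is a minimal stand-in for that construction, assuming (as the code suggests) that make_pad_mask yields a `[B, T]` boolean tensor with True at valid frames; pad_mask_3d is a local, hypothetical helper, not neural_sp's make_pad_mask.

import torch

def pad_mask_3d(xlens, xmax):
    """`[B, T, T]` mask where entry (b, i, j) is True iff frame i of utterance b is real."""
    mask = torch.zeros(len(xlens), xmax, dtype=torch.bool)
    for b, l in enumerate(xlens):
        mask[b, :l] = True                       # `[B, T]`
    return mask.unsqueeze(2).repeat(1, 1, xmax)  # same expansion as in forward() above

print(pad_mask_3d([3, 2], xmax=4)[1])  # the last two rows (padded frames) are all False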
Example #21
File: rnn.py Project: pradipcyb/neural_sp
    def __init__(self, input_dim, enc_type, n_units, n_projs, last_proj_dim,
                 n_layers, n_layers_sub1, n_layers_sub2, dropout_in, dropout,
                 subsample, subsample_type, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes,
                 conv_strides, conv_poolings, conv_batch_norm, conv_layer_norm,
                 conv_bottleneck_dim, bidir_sum_fwd_bwd, task_specific_layer,
                 param_init, chunk_size_left, chunk_size_right):

        super(RNNEncoder, self).__init__()

        # parse subsample
        subsamples = [1] * n_layers
        for lth, s in enumerate(list(map(int,
                                         subsample.split('_')[:n_layers]))):
            subsamples[lth] = s

        if len(subsamples) > 0 and len(subsamples) != n_layers:
            raise ValueError(
                'subsample must be the same size as n_layers. n_layers: %d, subsample: %s'
                % (n_layers, subsamples))
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError(
                'Set n_layers_sub1 between 1 and n_layers. n_layers: %d, n_layers_sub1: %d'
                % (n_layers, n_layers_sub1))
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError(
                'Set n_layers_sub2 between 1 and n_layers_sub1. n_layers_sub1: %d, n_layers_sub2: %d'
                % (n_layers_sub1, n_layers_sub2))

        self.enc_type = enc_type
        self.bidirectional = True if ('blstm' in enc_type
                                      or 'bgru' in enc_type) else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers
        self.bidir_sum = bidir_sum_fwd_bwd

        # for latency-controlled
        self.chunk_size_left = int(chunk_size_left.split('_')[0]) // n_stacks
        self.chunk_size_right = int(chunk_size_right.split('_')[0]) // n_stacks
        self.lc_bidir = self.chunk_size_left > 0 or self.chunk_size_right > 0
        if self.lc_bidir:
            assert enc_type not in ['lstm', 'gru', 'conv_lstm', 'conv_gru']
            assert n_layers_sub2 == 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        if 'conv' in enc_type:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    residual=False,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks

        if enc_type != 'conv':
            self.rnn = nn.ModuleList()
            if self.lc_bidir:
                self.rnn_bwd = nn.ModuleList()
            self.dropout = nn.Dropout(p=dropout)
            self.proj = nn.ModuleList() if n_projs > 0 else None
            self.subsample = nn.ModuleList(
            ) if np.prod(subsamples) > 1 else None
            self.padding = Padding(bidir_sum_fwd_bwd=bidir_sum_fwd_bwd
                                   if not self.lc_bidir else False)

            for lth in range(n_layers):
                if 'lstm' in enc_type:
                    rnn_i = nn.LSTM
                elif 'gru' in enc_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError(
                        'enc_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".'
                    )

                if self.lc_bidir:
                    self.rnn += [
                        rnn_i(self._odim, n_units, 1, batch_first=True)
                    ]
                    self.rnn_bwd += [
                        rnn_i(self._odim, n_units, 1, batch_first=True)
                    ]
                else:
                    self.rnn += [
                        rnn_i(self._odim,
                              n_units,
                              1,
                              batch_first=True,
                              bidirectional=self.bidirectional)
                    ]
                self._odim = n_units if bidir_sum_fwd_bwd else n_units * self.n_dirs

                # Projection layer
                if self.proj is not None:
                    if lth != n_layers - 1:
                        self.proj += [nn.Linear(self._odim, n_projs)]
                        self._odim = n_projs

                # subsample
                if np.prod(subsamples) > 1:
                    if subsample_type == 'max_pool':
                        self.subsample += [MaxpoolSubsampler(subsamples[lth])]
                    elif subsample_type == 'concat':
                        self.subsample += [
                            ConcatSubsampler(subsamples[lth], self._odim)
                        ]
                    elif subsample_type == 'drop':
                        self.subsample += [DropSubsampler(subsamples[lth])]
                    elif subsample_type == '1dconv':
                        self.subsample += [
                            Conv1dSubsampler(subsamples[lth], self._odim)
                        ]
                    elif subsample_type == 'add':
                        self.subsample += [AddSubsampler(subsamples[lth])]

                # Task specific layer
                if lth == n_layers_sub1 - 1 and task_specific_layer:
                    self.rnn_sub1 = rnn_i(self._odim,
                                          n_units,
                                          1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub1 = nn.Linear(n_units, last_proj_dim)
                if lth == n_layers_sub2 - 1 and task_specific_layer:
                    assert not self.lc_bidir
                    self.rnn_sub2 = rnn_i(self._odim,
                                          n_units,
                                          1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub2 = nn.Linear(n_units, last_proj_dim)

            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge = nn.Linear(self._odim, last_proj_dim)
                self._odim = last_proj_dim

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor
        elif np.prod(subsamples) > 1:
            self._factor *= np.prod(subsamples)
        # NOTE: subsampling factor for frame stacking should not be included here
        if self.chunk_size_left > 0:
            assert self.chunk_size_left % self._factor == 0
        if self.chunk_size_right > 0:
            assert self.chunk_size_right % self._factor == 0

        self.reset_parameters(param_init)

        # for streaming inference
        self.reset_cache()
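For streaming, the latency-controlled chunk sizes have to be multiples of the total subsampling factor computed above (the conv front-end factor takes precedence over per-layer subsampling). A minimal sketch of that constraint, not from the repository, assuming a hypothetical 4x conv front-end and 40/16-frame chunks:

import numpy as np

conv_factor = 4          # hypothetical 4x reduction from the conv front-end
subsamples = [1, 2, 1]   # hypothetical per-layer subsampling (ignored when conv is present)
has_conv = True
factor = conv_factor if has_conv else int(np.prod(subsamples))

chunk_size_left, chunk_size_right = 40, 16   # hypothetical chunk sizes in frames
assert chunk_size_left % factor == 0
assert chunk_size_right % factor == 0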
Example #22
0
    def __init__(self, input_dim, enc_type, attn_type, n_heads, n_layers,
                 n_layers_sub1, n_layers_sub2, d_model, d_ff, last_proj_dim,
                 pe_type, layer_norm_eps, ffn_activation, dropout_in, dropout,
                 dropout_att, dropout_residual, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes,
                 conv_strides, conv_poolings, conv_batch_norm, conv_layer_norm,
                 conv_bottleneck_dim, conv_param_init, task_specific_layer,
                 param_init, chunk_size_left, chunk_size_current,
                 chunk_size_right):

        super(TransformerEncoder, self).__init__()

        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 and n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 and n_layers_sub1.')

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type

        # for latency-controlled
        self.chunk_size_left = chunk_size_left
        self.chunk_size_cur = chunk_size_current
        self.chunk_size_right = chunk_size_right

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # for attention plot
        self.aws_dict = {}
        self.data_dict = {}

        # Setting for CNNs before the self-attention layers
        if conv_channels:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)
        self.layers = nn.ModuleList([
            copy.deepcopy(
                TransformerEncoderBlock(d_model, d_ff, attn_type, n_heads,
                                        dropout, dropout_att,
                                        dropout_residual * (l + 1) / n_layers,
                                        layer_norm_eps, ffn_activation,
                                        param_init)) for l in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        self._odim = d_model

        if n_layers_sub1 > 0:
            if task_specific_layer:
                self.layer_sub1 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub1 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim != self.output_dim:
                self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

        if n_layers_sub2 > 0:
            if task_specific_layer:
                self.layer_sub2 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub2 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim != self.output_dim:
                self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

        if last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor()

        if param_init == 'xavier_uniform':
            self.reset_parameters()
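The per-block residual dropout in the constructor above grows linearly with depth, so deeper blocks are dropped more often (a stochastic-depth style schedule). A minimal sketch of the schedule with hypothetical settings (dropout_residual=0.3, n_layers=12):

n_layers = 12
dropout_residual = 0.3   # hypothetical maximum rate, applied to the last block
rates = [dropout_residual * (l + 1) / n_layers for l in range(n_layers)]
print(rates)             # 0.025 for the first block up to 0.3 for the last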
Example #23
0
    def __init__(self,
                 input_dim,
                 attn_type,
                 attn_n_heads,
                 n_layers,
                 d_model,
                 d_ff,
                 pe_type='add',
                 layer_norm_eps=1e-6,
                 dropout_in=0,
                 dropout=0,
                 dropout_att=0,
                 last_proj_dim=0,
                 n_stacks=1,
                 n_splices=1,
                 conv_in_channel=1,
                 conv_channels=0,
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 conv_poolings=[],
                 conv_batch_norm=False,
                 conv_residual=False,
                 conv_bottleneck_dim=0,
                 param_init=0.1):

        super(TransformerEncoder, self).__init__()
        logger = logging.getLogger("training")

        self.d_model = d_model
        self.n_layers = n_layers
        self.attn_n_heads = attn_n_heads
        self.pe_type = pe_type

        # Setting for CNNs before the self-attention layers
        if conv_channels:
            channels = ([int(c) for c in conv_channels.split('_')]
                        if len(conv_channels) > 0 else [])
            kernel_sizes = ([[int(c.split(',')[0].replace('(', '')),
                              int(c.split(',')[1].replace(')', ''))]
                             for c in conv_kernel_sizes.split('_')]
                            if len(conv_kernel_sizes) > 0 else [])
            strides = ([[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_strides.split('_')]
                       if len(conv_strides) > 0 else [])
            poolings = ([[int(c.split(',')[0].replace('(', '')),
                          int(c.split(',')[1].replace(')', ''))]
                         for c in conv_poolings.split('_')]
                        if len(conv_poolings) > 0 else [])
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []

        if len(channels) > 0:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=channels,
                                    kernel_sizes=kernel_sizes,
                                    strides=strides,
                                    poolings=poolings,
                                    dropout=0,
                                    batch_norm=conv_batch_norm,
                                    residual=conv_residual,
                                    bottleneck_dim=d_model,
                                    param_init=param_init)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

            self.embed = LinearND(self._output_dim, d_model,
                                  dropout=0)  # NOTE: do not apply dropout here

        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                    dropout, dropout_att, layer_norm_eps)
            for l in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        if last_proj_dim != self.output_dim:
            self.bridge = LinearND(self._output_dim,
                                   last_proj_dim,
                                   dropout=dropout)
            self._output_dim = last_proj_dim
        else:
            self.bridge = None
            self._output_dim = d_model

        # Initialize parameters
        self.reset_parameters()
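The conv hyperparameters in the constructor above arrive as underscore-separated strings from the command line. A minimal sketch of the parsing, assuming hypothetical values such as "32_32" and "(3,3)_(3,3)":

conv_channels = "32_32"             # hypothetical CLI value
conv_kernel_sizes = "(3,3)_(3,3)"   # hypothetical CLI value

channels = ([int(c) for c in conv_channels.split('_')]
            if len(conv_channels) > 0 else [])
kernel_sizes = ([[int(c.split(',')[0].replace('(', '')),
                  int(c.split(',')[1].replace(')', ''))]
                 for c in conv_kernel_sizes.split('_')]
                if len(conv_kernel_sizes) > 0 else [])

print(channels)       # [32, 32]
print(kernel_sizes)   # [[3, 3], [3, 3]]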
Example #24
0
    def __init__(self,
                 input_dim,
                 rnn_type,
                 n_units,
                 n_projs,
                 n_layers,
                 dropout_in,
                 dropout,
                 subsample,
                 subsample_type='drop',
                 n_stacks=1,
                 n_splices=1,
                 last_proj_dim=0,
                 conv_in_channel=1,
                 conv_channels=0,
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 conv_poolings=[],
                 conv_batch_norm=False,
                 conv_residual=False,
                 conv_bottleneck_dim=0,
                 residual=False,
                 n_layers_sub1=0,
                 n_layers_sub2=0,
                 nin=False,
                 task_specific_layer=False,
                 param_init=0.1):

        super(RNNEncoder, self).__init__()

        logger = logging.getLogger("training")

        if len(subsample) > 0 and len(subsample) != n_layers:
            raise ValueError('subsample must be the same size as n_layers.')
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 and n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 and n_layers_sub1.')

        self.rnn_type = rnn_type
        self.bidirectional = True if rnn_type in [
            'blstm', 'bgru', 'conv_blstm', 'conv_bgru'
        ] else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_projs = n_projs
        self.n_layers = n_layers

        # Setting for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # Setting for subsampling
        self.subsample = subsample
        self.subsample_type = subsample_type

        # Setting for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Setting for residual connections
        self.residual = residual
        if residual:
            assert np.prod(subsample) == 1

        # Setting for the NiN (Network in Network)
        self.nin = nin

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        # Setting for CNNs before RNNs
        if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
            channels = ([int(c) for c in conv_channels.split('_')]
                        if len(conv_channels) > 0 else [])
            kernel_sizes = ([[int(c.split(',')[0].replace('(', '')),
                              int(c.split(',')[1].replace(')', ''))]
                             for c in conv_kernel_sizes.split('_')]
                            if len(conv_kernel_sizes) > 0 else [])
            if rnn_type in ['tds', 'gated_conv']:
                strides = []
                poolings = []
            else:
                strides = ([[int(c.split(',')[0].replace('(', '')),
                             int(c.split(',')[1].replace(')', ''))]
                            for c in conv_strides.split('_')]
                           if len(conv_strides) > 0 else [])
                poolings = ([[int(c.split(',')[0].replace('(', '')),
                              int(c.split(',')[1].replace(')', ''))]
                             for c in conv_poolings.split('_')]
                            if len(conv_poolings) > 0 else [])
            if 'conv_' in rnn_type:
                self.subsample = [1] * self.n_layers
                logger.warning(
                    'Subsampling is automatically ignored because CNN layers are used before RNN layers.'
                )
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []

        if len(channels) > 0:
            if rnn_type == 'tds':
                self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                       in_channel=conv_in_channel,
                                       channels=channels,
                                       kernel_sizes=kernel_sizes,
                                       dropout=dropout,
                                       bottleneck_dim=last_proj_dim)
            elif rnn_type == 'gated_conv':
                self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                             in_channel=conv_in_channel,
                                             channels=channels,
                                             kernel_sizes=kernel_sizes,
                                             dropout=dropout,
                                             bottleneck_dim=last_proj_dim,
                                             param_init=param_init)
            else:
                assert n_stacks == 1 and n_splices == 1
                self.conv = ConvEncoder(input_dim,
                                        in_channel=conv_in_channel,
                                        channels=channels,
                                        kernel_sizes=kernel_sizes,
                                        strides=strides,
                                        poolings=poolings,
                                        dropout=0,
                                        batch_norm=conv_batch_norm,
                                        residual=conv_residual,
                                        bottleneck_dim=conv_bottleneck_dim,
                                        param_init=param_init)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            # Fast implementation with no per-layer processing between RNN layers
            self.fast_impl = False
            if (np.prod(self.subsample) == 1 and self.n_projs == 0
                    and not residual and n_layers_sub1 == 0 and not nin):
                self.fast_impl = True
                if 'lstm' in rnn_type:
                    rnn = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn = nn.GRU
                else:
                    raise ValueError(
                        'rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".'
                    )

                self.rnn = rnn(self._output_dim,
                               n_units,
                               n_layers,
                               bias=True,
                               batch_first=True,
                               dropout=dropout,
                               bidirectional=self.bidirectional)
                # NOTE: pytorch introduces a dropout layer on the outputs of each layer EXCEPT the last layer
                self._output_dim = n_units * self.n_dirs
                self.dropout_top = nn.Dropout(p=dropout)
            else:
                self.rnn = nn.ModuleList()
                self.dropout = nn.ModuleList()
                if self.n_projs > 0:
                    self.proj = nn.ModuleList()
                if subsample_type == 'max_pool' and np.prod(
                        self.subsample) > 1:
                    self.max_pool = nn.ModuleList()
                    for l in range(n_layers):
                        if self.subsample[l] > 1:
                            self.max_pool += [
                                nn.MaxPool2d((1, 1),
                                             stride=(self.subsample[l], 1),
                                             ceil_mode=True)
                            ]
                        else:
                            self.max_pool += [None]
                if subsample_type == 'concat' and np.prod(self.subsample) > 1:
                    self.concat_proj = nn.ModuleList()
                    self.concat_bn = nn.ModuleList()
                    for l in range(n_layers):
                        if self.subsample[l] > 1:
                            self.concat_proj += [
                                LinearND(
                                    n_units * self.n_dirs * self.subsample[l],
                                    n_units * self.n_dirs)
                            ]
                            self.concat_bn += [
                                nn.BatchNorm1d(n_units * self.n_dirs)
                            ]
                        else:
                            self.concat_proj += [None]
                            self.concat_bn += [None]
                if nin:
                    self.nin_conv = nn.ModuleList()
                    self.nin_bn = nn.ModuleList()

                for l in range(n_layers):
                    if 'lstm' in rnn_type:
                        rnn_i = nn.LSTM
                    elif 'gru' in rnn_type:
                        rnn_i = nn.GRU
                    else:
                        raise ValueError(
                            'rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".'
                        )

                    self.rnn += [
                        rnn_i(self._output_dim,
                              n_units,
                              1,
                              bias=True,
                              batch_first=True,
                              dropout=0,
                              bidirectional=self.bidirectional)
                    ]
                    self.dropout += [nn.Dropout(p=dropout)]
                    self._output_dim = n_units * self.n_dirs

                    # Projection layer
                    if n_projs > 0 and l != n_layers - 1:
                        self.proj += [LinearND(n_units * self.n_dirs, n_projs)]
                        self._output_dim = n_projs

                    # Task specific layer
                    if l == n_layers_sub1 - 1 and task_specific_layer:
                        self.rnn_sub1 = rnn_i(self._output_dim,
                                              n_units,
                                              1,
                                              bias=True,
                                              batch_first=True,
                                              dropout=0,
                                              bidirectional=self.bidirectional)
                        self.dropout_sub1 = nn.Dropout(p=dropout)
                        if last_proj_dim != self.output_dim:
                            self.bridge_sub1 = LinearND(n_units,
                                                        last_proj_dim,
                                                        dropout=dropout)
                    if l == n_layers_sub2 - 1 and task_specific_layer:
                        self.rnn_sub2 = rnn_i(self._output_dim,
                                              n_units,
                                              1,
                                              bias=True,
                                              batch_first=True,
                                              dropout=0,
                                              bidirectional=self.bidirectional)
                        self.dropout_sub2 = nn.Dropout(p=dropout)
                        if last_proj_dim != self.output_dim:
                            self.bridge_sub2 = LinearND(n_units,
                                                        last_proj_dim,
                                                        dropout=dropout)

                    # Network in Network (1x1 conv + batch normalization + ReLU)
                    # NOTE: exclude the last layer
                    if nin and l != n_layers - 1:
                        self.nin_conv += [
                            nn.Conv2d(in_channels=self._output_dim,
                                      out_channels=self._output_dim,
                                      kernel_size=1,
                                      stride=1,
                                      padding=0)
                        ]
                        self.nin_bn += [nn.BatchNorm2d(self._output_dim)]
                        if n_layers_sub1 > 0 or n_layers_sub2 > 0:
                            assert task_specific_layer

                if last_proj_dim != self.output_dim:
                    self.bridge = LinearND(self._output_dim,
                                           last_proj_dim,
                                           dropout=dropout)
                    self._output_dim = last_proj_dim

        # Initialize parameters
        self.reset_parameters(param_init)
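The fast path above replaces the per-layer nn.ModuleList with a single multi-layer nn.LSTM/nn.GRU, which is only valid when nothing has to happen between layers. A minimal sketch of that decision with hypothetical configurations:

import numpy as np

def can_use_fast_impl(subsample, n_projs, residual, n_layers_sub1, nin):
    # Mirrors the fast_impl condition above: no subsampling, projections,
    # residual connections, sub-task taps, or NiN blocks between layers.
    return (np.prod(subsample) == 1 and n_projs == 0
            and not residual and n_layers_sub1 == 0 and not nin)

print(can_use_fast_impl([1, 1, 1, 1], 0, False, 0, False))  # True
print(can_use_fast_impl([1, 2, 2, 1], 0, False, 0, False))  # False: per-layer subsampling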
Example #25
0
File: rnn.py  Project: many-hats/neural_sp
    def __init__(self, input_dim, rnn_type, n_units, n_projs, last_proj_dim,
                 n_layers, n_layers_sub1, n_layers_sub2,
                 dropout_in, dropout,
                 subsample, subsample_type, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides, conv_poolings,
                 conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 bidirectional_sum_fwd_bwd, task_specific_layer, param_init,
                 chunk_size_left, chunk_size_right):

        super(RNNEncoder, self).__init__()

        # parse subsample
        subsample_list = [1] * n_layers
        for lth, s in enumerate(list(map(int, subsample.split('_')[:n_layers]))):
            subsample_list[lth] = s

        if len(subsample_list) > 0 and len(subsample_list) != n_layers:
            raise ValueError('subsample must be the same size as n_layers. n_layers: %d, subsample: %s' %
                             (n_layers, subsample_list))
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 and n_layers. n_layers: %d, n_layers_sub1: %d' %
                             (n_layers, n_layers_sub1))
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 and n_layers_sub1. n_layers_sub1: %d, n_layers_sub2: %d' %
                             (n_layers_sub1, n_layers_sub2))

        self.rnn_type = rnn_type
        self.bidirectional = True if ('blstm' in rnn_type or 'bgru' in rnn_type) else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers
        self.bidir_sum = bidirectional_sum_fwd_bwd

        # for latency-controlled
        self.latency_controlled = chunk_size_left > 0 or chunk_size_right > 0
        self.chunk_size_left = chunk_size_left
        self.chunk_size_right = chunk_size_right
        if self.latency_controlled:
            assert n_layers_sub2 == 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        if rnn_type == 'tds':
            self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                   in_channel=conv_in_channel,
                                   channels=conv_channels,
                                   kernel_sizes=conv_kernel_sizes,
                                   dropout=dropout,
                                   bottleneck_dim=last_proj_dim)
        elif rnn_type == 'gated_conv':
            self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                         in_channel=conv_in_channel,
                                         channels=conv_channels,
                                         kernel_sizes=conv_kernel_sizes,
                                         dropout=dropout,
                                         bottleneck_dim=last_proj_dim,
                                         param_init=param_init)

        elif 'conv' in rnn_type:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    residual=False,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
        else:
            self.conv = None

        if self.conv is None:
            self._odim = input_dim * n_splices * n_stacks
        else:
            self._odim = self.conv.output_dim
            subsample_list = [1] * self.n_layers
            logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')

        self.padding = Padding(bidirectional_sum_fwd_bwd=bidirectional_sum_fwd_bwd)

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            self.rnn = nn.ModuleList()
            if self.latency_controlled:
                self.rnn_bwd = nn.ModuleList()
            self.dropout = nn.Dropout(p=dropout)
            self.proj = None
            if n_projs > 0:
                self.proj = nn.ModuleList()

            # subsample
            self.subsample_layer = None
            if subsample_type == 'max_pool' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([MaxpoolSubsampler(subsample_list[lth])
                                                      for lth in range(n_layers)])
            elif subsample_type == 'concat' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([ConcatSubsampler(subsample_list[lth], n_units * self.n_dirs)
                                                      for lth in range(n_layers)])
            elif subsample_type == 'drop' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([DropSubsampler(subsample_list[lth])
                                                      for lth in range(n_layers)])
            elif subsample_type == '1dconv' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([Conv1dSubsampler(subsample_list[lth], n_units * self.n_dirs)
                                                      for lth in range(n_layers)])

            for lth in range(n_layers):
                if 'lstm' in rnn_type:
                    rnn_i = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                if self.latency_controlled:
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                    self.rnn_bwd += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                else:
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True,
                                       bidirectional=self.bidirectional)]
                self._odim = n_units if bidirectional_sum_fwd_bwd else n_units * self.n_dirs

                # Projection layer
                if self.proj is not None:
                    if lth != n_layers - 1:
                        self.proj += [nn.Linear(n_units * self.n_dirs, n_projs)]
                        self._odim = n_projs

                # Task specific layer
                if lth == n_layers_sub1 - 1 and task_specific_layer:
                    assert not self.latency_controlled
                    self.rnn_sub1 = rnn_i(self._odim, n_units, 1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub1 = nn.Linear(n_units, last_proj_dim)
                if lth == n_layers_sub2 - 1 and task_specific_layer:
                    assert not self.latency_controlled
                    self.rnn_sub2 = rnn_i(self._odim, n_units, 1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub2 = nn.Linear(n_units, last_proj_dim)

            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge = nn.Linear(self._odim, last_proj_dim)
                self._odim = last_proj_dim

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor
        self._factor *= np.prod(subsample_list)

        self.reset_parameters(param_init)

        # for streaming inference
        self.reset_cache()
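The subsample argument above is also an underscore-separated string; entries beyond the spec default to 1, so a short spec only subsamples the first layers. A minimal sketch with hypothetical values (subsample="1_2_2", n_layers=5):

n_layers = 5
subsample = "1_2_2"      # hypothetical CLI value

subsample_list = [1] * n_layers
for lth, s in enumerate(list(map(int, subsample.split('_')[:n_layers]))):
    subsample_list[lth] = s

print(subsample_list)    # [1, 2, 2, 1, 1] -> 4x time reduction overall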
Example #26
0
class TransformerEncoder(EncoderBase):
    """Transformer encoder.

    Args:
        input_dim (int): dimension of input features (freq * channel)
        attn_type (str): type of attention
        n_heads (int): number of heads for multi-head attention
        n_layers (int): number of blocks
        d_model (int): dimension of MultiheadAttentionMechanism
        d_ff (int): dimension of PositionwiseFeedForward
        last_proj_dim (int): dimension of the last projection layer
        pe_type (str): type of positional encoding
        layer_norm_eps (float): epsilon value for layer normalization
        ffn_activation (str): nonlinear function for PositionwiseFeedForward
        dropout_in (float): dropout probability for input-hidden connection
        dropout (float): dropout probabilities for linear layers
        dropout_att (float): dropout probabilities for attention distributions
        n_stacks (int): number of frames to stack
        n_splices (int): frames to splice. Default is 1 frame.
        conv_in_channel (int): number of channels of input features
        conv_channels (int): number of channels in the CNN blocks
        conv_kernel_sizes (list): size of kernels in the CNN blocks
        conv_strides (list): number of strides in the CNN blocks
        conv_poolings (list): size of poolings in the CNN blocks
        conv_batch_norm (bool): apply batch normalization only in the CNN blocks
        conv_layer_norm (bool): apply layer normalization only in the CNN blocks
        conv_bottleneck_dim (int): dimension of the bottleneck layer between CNN and self-attention layers
        conv_param_init (float): only for CNN layers before Transformer layers
        chunk_size_left (int): left chunk size for time-restricted Transformer encoder
        chunk_size_current (int): current chunk size for time-restricted Transformer encoder
        chunk_size_right (int): right chunk size for time-restricted Transformer encoder
        param_init (str): parameter initialization method

    """
    def __init__(self, input_dim, attn_type, n_heads, n_layers, d_model, d_ff,
                 last_proj_dim, pe_type, layer_norm_eps, ffn_activation,
                 dropout_in, dropout, dropout_att, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes,
                 conv_strides, conv_poolings, conv_batch_norm, conv_layer_norm,
                 conv_bottleneck_dim, conv_param_init, param_init,
                 chunk_size_left, chunk_size_current, chunk_size_right):

        super(TransformerEncoder, self).__init__()

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type
        self.chunk_size_left = chunk_size_left
        self.chunk_size_current = chunk_size_current
        self.chunk_size_right = chunk_size_right

        # Setting for CNNs before the self-attention layers
        if conv_channels:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)
        self.layers = repeat(
            TransformerEncoderBlock(d_model, d_ff, attn_type, n_heads, dropout,
                                    dropout_att, layer_norm_eps,
                                    ffn_activation, param_init), n_layers)
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        if last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim
        else:
            self.bridge = None
            self._odim = d_model

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor()

        if param_init == 'xavier_uniform':
            self.reset_parameters()

    def reset_parameters(self):
        """Initialize parameters with Xavier uniform distribution."""
        logger.info(
            '===== Initialize %s with Xavier uniform distribution =====' %
            self.__class__.__name__)
        if self.conv is None:
            nn.init.xavier_uniform_(self.embed.weight)
            nn.init.constant_(self.embed.bias, 0.)
        if self.bridge is not None:
            nn.init.xavier_uniform_(self.bridge.weight)
            nn.init.constant_(self.bridge.bias, 0.)

    def forward(self, xs, xlens, task, use_cache=False, streaming=False):
        """Forward computation.

        Args:
            xs (FloatTensor): `[B, T, input_dim]`
            xlens (list): `[B]`
            task (str): not supported now
            use_cache (bool):
            streaming (bool): streaming encoding
        Returns:
            eouts (dict):
                xs (FloatTensor): `[B, T, d_model]`
                xlens (list): `[B]`

        """
        eouts = {
            'ys': {
                'xs': None,
                'xlens': None
            },
            'ys_sub1': {
                'xs': None,
                'xlens': None
            },
            'ys_sub2': {
                'xs': None,
                'xlens': None
            }
        }

        if self.conv is None:
            xs = self.embed(xs)
        else:
            # Pass through CNN blocks before the self-attention layers
            xs, xlens = self.conv(xs, xlens)

        bs, xmax, idim = xs.size()
        xs = self.pos_enc(xs)
        if self.chunk_size_left > 0:
            # Time-restricted self-attention for streaming models
            cs_l = self.chunk_size_left
            cs_c = self.chunk_size_current
            cs_r = self.chunk_size_right
            hop_size = self.chunk_size_current
            xs_chunks = []
            xx_aws = [[] for l in range(self.n_layers)]
            xs_pad = torch.cat([xs.new_zeros(bs, cs_l, idim), xs,
                                xs.new_zeros(bs, cs_r, idim)], dim=1)
            # TODO: remove right padding
            for t in range(cs_l, cs_l + xmax, hop_size):
                xs_chunk = xs_pad[:, t - cs_l:t + cs_c + cs_r]
                for l in range(self.n_layers):
                    xs_chunk, xx_aws_chunk = self.layers[l](xs_chunk,
                                                            None)  # no mask
                    xx_aws[l].append(xx_aws_chunk[:, :, cs_l:cs_l + cs_c,
                                                  cs_l:cs_l + cs_c])
                xs_chunks.append(xs_chunk[:, cs_l:cs_l + cs_c])
            xs = torch.cat(xs_chunks, dim=1)[:, :xmax]
            if not self.training:
                for l in range(self.n_layers):
                    setattr(
                        self, 'xx_aws_layer%d' % l,
                        tensor2np(
                            torch.cat(xx_aws[l], dim=3)[:, :, :xmax, :xmax]))
        else:
            # Create the self-attention mask
            xx_mask = make_pad_mask(xlens, self.device_id).unsqueeze(2).repeat(
                [1, 1, xmax])

            for l in range(self.n_layers):
                xs, xx_aws = self.layers[l](xs, xx_mask)
                if not self.training:
                    setattr(self, 'xx_aws_layer%d' % l, tensor2np(xx_aws))
        xs = self.norm_out(xs)

        # Bridge layer
        if self.bridge is not None:
            xs = self.bridge(xs)

        eouts['ys']['xs'] = xs
        eouts['ys']['xlens'] = xlens
        return eouts

    def _plot_attention(self, save_path, n_cols=2):
        """Plot attention for each head in all layers."""
        from matplotlib import pyplot as plt
        from matplotlib.ticker import MaxNLocator

        save_path = mkdir_join(save_path, 'enc_xx_att_weights')

        # Clean directory
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)

        for l in range(self.n_layers):
            if not hasattr(self, 'xx_aws_layer%d' % l):
                continue

            xx_aws = getattr(self, 'xx_aws_layer%d' % l)

            plt.clf()
            fig, axes = plt.subplots(self.n_heads // n_cols,
                                     n_cols,
                                     figsize=(20, 8))
            for h in range(self.n_heads):
                if self.n_heads > n_cols:
                    ax = axes[h // n_cols, h % n_cols]
                else:
                    ax = axes[h]
                ax.imshow(xx_aws[-1, h, :, :], aspect="auto")
                ax.grid(False)
                ax.set_xlabel("Input (head%d)" % h)
                ax.set_ylabel("Output (head%d)" % h)
                ax.xaxis.set_major_locator(MaxNLocator(integer=True))
                ax.yaxis.set_major_locator(MaxNLocator(integer=True))

            fig.tight_layout()
            fig.savefig(os.path.join(save_path, 'layer%d.png' % l), dpi=500)
            plt.close()
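In the time-restricted branch of forward() above, the padded input is sliced into overlapping windows of [left context | current frames | right look-ahead], and only the current frames of each window are written to the output. A minimal sketch of the index arithmetic with hypothetical sizes (xmax=10, chunk sizes 4/4/2):

xmax = 10                    # hypothetical frame count after the conv front-end
cs_l, cs_c, cs_r = 4, 4, 2   # hypothetical left / current / right chunk sizes
hop_size = cs_c

for t in range(cs_l, cs_l + xmax, hop_size):
    start, end = t - cs_l, t + cs_c + cs_r         # slice of the zero-padded input
    kept = (t - cs_l, min(t - cs_l + cs_c, xmax))  # frames kept for the output
    print('padded[%d:%d] -> output[%d:%d]' % (start, end, kept[0], kept[1]))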