def __init__(self, input_dim, rnn_type, n_units, n_projs, last_proj_dim,
             n_layers, n_layers_sub1, n_layers_sub2,
             dropout_in, dropout,
             subsample, subsample_type, n_stacks, n_splices,
             conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
             conv_poolings, conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
             bidirectional_sum_fwd_bwd, task_specific_layer, param_init,
             chunk_size_left, chunk_size_right):

    super(RNNEncoder, self).__init__()

    # parse subsample
    subsample_list = [1] * n_layers
    for lth, s in enumerate(list(map(int, subsample.split('_')[:n_layers]))):
        subsample_list[lth] = s

    if len(subsample_list) > 0 and len(subsample_list) != n_layers:
        raise ValueError('subsample must be the same size as n_layers. n_layers: %d, subsample: %s' %
                         (n_layers, subsample_list))
    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise ValueError('Set n_layers_sub1 between 1 to n_layers. n_layers: %d, n_layers_sub1: %d' %
                         (n_layers, n_layers_sub1))
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1. n_layers_sub1: %d, n_layers_sub2: %d' %
                         (n_layers_sub1, n_layers_sub2))

    self.rnn_type = rnn_type
    self.bidirectional = True if ('blstm' in rnn_type or 'bgru' in rnn_type) else False
    self.n_units = n_units
    self.n_dirs = 2 if self.bidirectional else 1
    self.n_layers = n_layers
    self.bidir_sum = bidirectional_sum_fwd_bwd

    # for latency-controlled
    self.latency_controlled = chunk_size_left > 0 or chunk_size_right > 0
    self.chunk_size_left = chunk_size_left
    self.chunk_size_right = chunk_size_right
    if self.latency_controlled:
        assert n_layers_sub2 == 0

    # for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # Dropout for input-hidden connection
    self.dropout_in = nn.Dropout(p=dropout_in)

    if rnn_type == 'tds':
        self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                               in_channel=conv_in_channel,
                               channels=conv_channels,
                               kernel_sizes=conv_kernel_sizes,
                               dropout=dropout,
                               bottleneck_dim=last_proj_dim)
    elif rnn_type == 'gated_conv':
        self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                     in_channel=conv_in_channel,
                                     channels=conv_channels,
                                     kernel_sizes=conv_kernel_sizes,
                                     dropout=dropout,
                                     bottleneck_dim=last_proj_dim,
                                     param_init=param_init)
    elif 'conv' in rnn_type:
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim,
                                in_channel=conv_in_channel,
                                channels=conv_channels,
                                kernel_sizes=conv_kernel_sizes,
                                strides=conv_strides,
                                poolings=conv_poolings,
                                dropout=0.,
                                batch_norm=conv_batch_norm,
                                layer_norm=conv_layer_norm,
                                residual=False,
                                bottleneck_dim=conv_bottleneck_dim,
                                param_init=param_init)
    else:
        self.conv = None

    if self.conv is None:
        self._odim = input_dim * n_splices * n_stacks
    else:
        self._odim = self.conv.output_dim
        subsample_list = [1] * self.n_layers
        logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')

    self.padding = Padding(bidirectional_sum_fwd_bwd=bidirectional_sum_fwd_bwd)

    if rnn_type not in ['conv', 'tds', 'gated_conv']:
        self.rnn = nn.ModuleList()
        if self.latency_controlled:
            self.rnn_bwd = nn.ModuleList()
        self.dropout = nn.Dropout(p=dropout)
        self.proj = None
        if n_projs > 0:
            self.proj = nn.ModuleList()

        # subsample
        self.subsample_layer = None
        if subsample_type == 'max_pool' and np.prod(subsample_list) > 1:
            self.subsample_layer = nn.ModuleList([MaxpoolSubsampler(subsample_list[lth])
                                                  for lth in range(n_layers)])
        elif subsample_type == 'concat' and np.prod(subsample_list) > 1:
            self.subsample_layer = nn.ModuleList([ConcatSubsampler(subsample_list[lth], n_units * self.n_dirs)
                                                  for lth in range(n_layers)])
        elif subsample_type == 'drop' and np.prod(subsample_list) > 1:
            self.subsample_layer = nn.ModuleList([DropSubsampler(subsample_list[lth])
                                                  for lth in range(n_layers)])
        elif subsample_type == '1dconv' and np.prod(subsample_list) > 1:
            self.subsample_layer = nn.ModuleList([Conv1dSubsampler(subsample_list[lth], n_units * self.n_dirs)
                                                  for lth in range(n_layers)])

        for lth in range(n_layers):
            if 'lstm' in rnn_type:
                rnn_i = nn.LSTM
            elif 'gru' in rnn_type:
                rnn_i = nn.GRU
            else:
                raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

            if self.latency_controlled:
                self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                self.rnn_bwd += [rnn_i(self._odim, n_units, 1, batch_first=True)]
            else:
                self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True,
                                   bidirectional=self.bidirectional)]
            self._odim = n_units if bidirectional_sum_fwd_bwd else n_units * self.n_dirs

            # Projection layer
            if self.proj is not None:
                if lth != n_layers - 1:
                    self.proj += [nn.Linear(n_units * self.n_dirs, n_projs)]
                    self._odim = n_projs

            # Task specific layer
            if lth == n_layers_sub1 - 1 and task_specific_layer:
                assert not self.latency_controlled
                self.rnn_sub1 = rnn_i(self._odim, n_units, 1, batch_first=True,
                                      bidirectional=self.bidirectional)
                if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                    self.bridge_sub1 = nn.Linear(n_units, last_proj_dim)
            if lth == n_layers_sub2 - 1 and task_specific_layer:
                assert not self.latency_controlled
                self.rnn_sub2 = rnn_i(self._odim, n_units, 1, batch_first=True,
                                      bidirectional=self.bidirectional)
                if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                    self.bridge_sub2 = nn.Linear(n_units, last_proj_dim)

        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

    # calculate subsampling factor
    self._factor = 1
    if self.conv is not None:
        self._factor *= self.conv.subsampling_factor
    self._factor *= np.prod(subsample_list)

    self.reset_parameters(param_init)

    # for streaming inference
    self.reset_cache()
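# --- Hedged usage sketch (not from the original source) --------------------
# Shows how the `subsample` string argument above is parsed into per-layer
# factors; the product of the list is the encoder's overall time reduction.
# `parse_subsample` is a hypothetical helper that mirrors the loop in __init__.
import numpy as np

def parse_subsample(subsample, n_layers):
    subsample_list = [1] * n_layers
    for lth, s in enumerate(list(map(int, subsample.split('_')[:n_layers]))):
        subsample_list[lth] = s
    return subsample_list

factors = parse_subsample('1_2_2_1', n_layers=4)
print(factors)           # [1, 2, 2, 1]
print(np.prod(factors))  # 4 -> e.g. 100 input frames become ~25 encoder states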
def build_encoder(args):
    # safeguard: backfill per-module Transformer dimensions from older flat config names
    if not hasattr(args, 'transformer_enc_d_model') and hasattr(args, 'transformer_d_model'):
        args.transformer_enc_d_model = args.transformer_d_model
        args.transformer_dec_d_model = args.transformer_d_model
    if not hasattr(args, 'transformer_enc_d_ff') and hasattr(args, 'transformer_d_ff'):
        args.transformer_enc_d_ff = args.transformer_d_ff
    if not hasattr(args, 'transformer_enc_n_heads') and hasattr(args, 'transformer_n_heads'):
        args.transformer_enc_n_heads = args.transformer_n_heads

    if args.enc_type == 'tds':
        from neural_sp.models.seq2seq.encoders.tds import TDSEncoder
        encoder = TDSEncoder(
            input_dim=args.input_dim * args.n_stacks,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            dropout=args.dropout_enc,
            last_proj_dim=args.transformer_dec_d_model if 'transformer' in args.dec_type else args.dec_n_units)
    elif args.enc_type == 'gated_conv':
        from neural_sp.models.seq2seq.encoders.gated_conv import GatedConvEncoder
        raise ValueError  # NOTE: this encoder type is currently disabled; the construction below is unreachable
        encoder = GatedConvEncoder(
            input_dim=args.input_dim * args.n_stacks,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            dropout=args.dropout_enc,
            last_proj_dim=args.transformer_dec_d_model if 'transformer' in args.dec_type else args.dec_n_units,
            param_init=args.param_init)
    elif 'transformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.transformer import TransformerEncoder
        encoder = TransformerEncoder(
            input_dim=args.input_dim if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_heads=args.transformer_enc_n_heads,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_enc_d_model,
            d_ff=args.transformer_enc_d_ff,
            ffn_bottleneck_dim=args.transformer_ffn_bottleneck_dim,
            ffn_activation=args.transformer_ffn_activation,
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            last_proj_dim=args.transformer_dec_d_model if 'transformer' in args.dec_type else 0,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            conv_param_init=args.param_init,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            clamp_len=args.transformer_enc_clamp_len,
            lookahead=args.transformer_enc_lookaheads,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right,
            streaming_type=args.lc_type)
    elif 'conformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.conformer import ConformerEncoder
        encoder = ConformerEncoder(
            input_dim=args.input_dim if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_heads=args.transformer_enc_n_heads,
            kernel_size=args.conformer_kernel_size,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_enc_d_model,
            d_ff=args.transformer_enc_d_ff,
            ffn_bottleneck_dim=args.transformer_ffn_bottleneck_dim,
            ffn_activation='swish',
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            last_proj_dim=args.transformer_dec_d_model if 'transformer' in args.dec_type else 0,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            conv_param_init=args.param_init,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            clamp_len=args.transformer_enc_clamp_len,
            lookahead=args.transformer_enc_lookaheads,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right,
            streaming_type=args.lc_type)
    else:
        from neural_sp.models.seq2seq.encoders.rnn import RNNEncoder
        encoder = RNNEncoder(
            input_dim=args.input_dim if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_units=args.enc_n_units,
            n_projs=args.enc_n_projs,
            last_proj_dim=args.transformer_dec_d_model if 'transformer' in args.dec_type else 0,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            bidir_sum_fwd_bwd=args.bidirectional_sum_fwd_bwd,
            task_specific_layer=args.task_specific_layer,
            param_init=args.param_init,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_right=args.lc_chunk_size_right,
            rsp_prob=args.rsp_prob_enc)

    return encoder
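# --- Hedged usage sketch (values are illustrative, not from the source) ----
# The "safeguard" block above backfills per-module Transformer dimensions
# from older flat config names, presumably so configs saved with the old
# argument set keep working. SimpleNamespace stands in for argparse.Namespace.
from types import SimpleNamespace

args = SimpleNamespace(transformer_d_model=256)  # old-style config
if not hasattr(args, 'transformer_enc_d_model') and hasattr(args, 'transformer_d_model'):
    args.transformer_enc_d_model = args.transformer_d_model
    args.transformer_dec_d_model = args.transformer_d_model
print(args.transformer_enc_d_model, args.transformer_dec_d_model)  # 256 256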
def __init__(self, input_dim, rnn_type, n_units, n_projs, n_layers,
             dropout_in, dropout, subsample, subsample_type='drop',
             n_stacks=1, n_splices=1, last_proj_dim=0,
             conv_in_channel=1, conv_channels=0, conv_kernel_sizes=[],
             conv_strides=[], conv_poolings=[], conv_batch_norm=False,
             conv_residual=False, conv_bottleneck_dim=0, residual=False,
             n_layers_sub1=0, n_layers_sub2=0, nin=False,
             task_specific_layer=False, param_init=0.1):

    super(RNNEncoder, self).__init__()
    logger = logging.getLogger("training")

    if len(subsample) > 0 and len(subsample) != n_layers:
        raise ValueError('subsample must be the same size as n_layers.')
    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

    self.rnn_type = rnn_type
    self.bidirectional = True if rnn_type in ['blstm', 'bgru', 'conv_blstm', 'conv_bgru'] else False
    self.n_units = n_units
    self.n_dirs = 2 if self.bidirectional else 1
    self.n_projs = n_projs
    self.n_layers = n_layers

    # Setting for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # Setting for subsampling
    self.subsample = subsample
    self.subsample_type = subsample_type

    # Setting for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # Setting for residual connections
    self.residual = residual
    if residual:
        assert np.prod(subsample) == 1

    # Setting for the NiN (Network in Network)
    self.nin = nin

    # Dropout for input-hidden connection
    self.dropout_in = nn.Dropout(p=dropout_in)

    # Setting for CNNs before RNNs
    if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
        channels = [int(c) for c in conv_channels.split('_')] if len(conv_channels) > 0 else []
        kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_kernel_sizes.split('_')] if len(conv_kernel_sizes) > 0 else []
        if rnn_type in ['tds', 'gated_conv']:
            strides = []
            poolings = []
        else:
            strides = [[int(c.split(',')[0].replace('(', '')),
                        int(c.split(',')[1].replace(')', ''))]
                       for c in conv_strides.split('_')] if len(conv_strides) > 0 else []
            poolings = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_poolings.split('_')] if len(conv_poolings) > 0 else []
        if 'conv_' in rnn_type:
            self.subsample = [1] * self.n_layers
            logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')
    else:
        channels = []
        kernel_sizes = []
        strides = []
        poolings = []

    if len(channels) > 0:
        if rnn_type == 'tds':
            self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                   in_channel=conv_in_channel,
                                   channels=channels,
                                   kernel_sizes=kernel_sizes,
                                   dropout=dropout,
                                   bottleneck_dim=last_proj_dim)
        elif rnn_type == 'gated_conv':
            self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                         in_channel=conv_in_channel,
                                         channels=channels,
                                         kernel_sizes=kernel_sizes,
                                         dropout=dropout,
                                         bottleneck_dim=last_proj_dim,
                                         param_init=param_init)
        else:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=channels,
                                    kernel_sizes=kernel_sizes,
                                    strides=strides,
                                    poolings=poolings,
                                    dropout=0,
                                    batch_norm=conv_batch_norm,
                                    residual=conv_residual,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
        self._output_dim = self.conv.output_dim
    else:
        self._output_dim = input_dim * n_splices * n_stacks
        self.conv = None

    if rnn_type not in ['conv', 'tds', 'gated_conv']:
        # Fast implementation without processes between each layer
        self.fast_impl = False
        if np.prod(self.subsample) == 1 and self.n_projs == 0 and not residual \
                and n_layers_sub1 == 0 and not nin:
            self.fast_impl = True
            if 'lstm' in rnn_type:
                rnn = nn.LSTM
            elif 'gru' in rnn_type:
                rnn = nn.GRU
            else:
                raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

            self.rnn = rnn(self._output_dim, n_units, n_layers,
                           bias=True, batch_first=True, dropout=dropout,
                           bidirectional=self.bidirectional)
            # NOTE: pytorch introduces a dropout layer on the outputs of each layer EXCEPT the last layer
            self._output_dim = n_units * self.n_dirs
            self.dropout_top = nn.Dropout(p=dropout)
        else:
            self.rnn = nn.ModuleList()
            self.dropout = nn.ModuleList()
            if self.n_projs > 0:
                self.proj = nn.ModuleList()
            if subsample_type == 'max_pool' and np.prod(self.subsample) > 1:
                self.max_pool = nn.ModuleList()
                for l in range(n_layers):
                    if self.subsample[l] > 1:
                        self.max_pool += [nn.MaxPool2d((1, 1),
                                                       stride=(self.subsample[l], 1),
                                                       ceil_mode=True)]
                    else:
                        self.max_pool += [None]
            if subsample_type == 'concat' and np.prod(self.subsample) > 1:
                self.concat_proj = nn.ModuleList()
                self.concat_bn = nn.ModuleList()
                for l in range(n_layers):
                    if self.subsample[l] > 1:
                        self.concat_proj += [LinearND(n_units * self.n_dirs * self.subsample[l],
                                                      n_units * self.n_dirs)]
                        self.concat_bn += [nn.BatchNorm1d(n_units * self.n_dirs)]
                    else:
                        self.concat_proj += [None]
                        self.concat_bn += [None]
            if nin:
                self.nin_conv = nn.ModuleList()
                self.nin_bn = nn.ModuleList()

            for l in range(n_layers):
                if 'lstm' in rnn_type:
                    rnn_i = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                self.rnn += [rnn_i(self._output_dim, n_units, 1,
                                   bias=True, batch_first=True, dropout=0,
                                   bidirectional=self.bidirectional)]
                self.dropout += [nn.Dropout(p=dropout)]
                self._output_dim = n_units * self.n_dirs

                # Projection layer
                if n_projs > 0 and l != n_layers - 1:
                    self.proj += [LinearND(n_units * self.n_dirs, n_projs)]
                    self._output_dim = n_projs

                # Task specific layer
                if l == n_layers_sub1 - 1 and task_specific_layer:
                    self.rnn_sub1 = rnn_i(self._output_dim, n_units, 1,
                                          bias=True, batch_first=True, dropout=0,
                                          bidirectional=self.bidirectional)
                    self.dropout_sub1 = nn.Dropout(p=dropout)
                    # project only when a bridge dimension is requested
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub1 = LinearND(n_units, last_proj_dim, dropout=dropout)
                if l == n_layers_sub2 - 1 and task_specific_layer:
                    self.rnn_sub2 = rnn_i(self._output_dim, n_units, 1,
                                          bias=True, batch_first=True, dropout=0,
                                          bidirectional=self.bidirectional)
                    self.dropout_sub2 = nn.Dropout(p=dropout)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub2 = LinearND(n_units, last_proj_dim, dropout=dropout)

                # Network in network (1*1 conv + batch normalization + ReLU)
                # NOTE: exclude the last layer
                if nin and l != n_layers - 1:
                    self.nin_conv += [nn.Conv2d(in_channels=self._output_dim,
                                                out_channels=self._output_dim,
                                                kernel_size=1, stride=1, padding=0)]
                    self.nin_bn += [nn.BatchNorm2d(self._output_dim)]

            if n_layers_sub1 > 0 or n_layers_sub2 > 0:
                assert task_specific_layer

    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
        self.bridge = LinearND(self._output_dim, last_proj_dim, dropout=dropout)
        self._output_dim = last_proj_dim

    # Initialize parameters
    self.reset_parameters(param_init)
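# --- Standalone sketch of the 'max_pool' subsampling above -----------------
# Assumptions: a (batch, channel, time, feature) layout chosen purely for
# illustration. A (1, 1) kernel with stride (s, 1) keeps every feature
# dimension intact but samples the time axis at rate s.
import torch
import torch.nn as nn

pool = nn.MaxPool2d((1, 1), stride=(3, 1), ceil_mode=True)
x = torch.randn(2, 1, 10, 8)  # (batch, channel, T=10, feature)
y = pool(x)
print(y.shape)  # torch.Size([2, 1, 4, 8]); ceil(10 / 3) = 4 time steps remain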
def build_encoder(args):
    if args.enc_type == 'tds':
        from neural_sp.models.seq2seq.encoders.tds import TDSEncoder
        raise ValueError  # NOTE: this encoder type is currently disabled; the construction below is unreachable
        encoder = TDSEncoder(
            input_dim=args.input_dim * args.n_stacks,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            dropout=args.dropout_enc,
            bottleneck_dim=args.transformer_d_model if 'transformer' in args.dec_type else args.dec_n_units)
    elif args.enc_type == 'gated_conv':
        from neural_sp.models.seq2seq.encoders.gated_conv import GatedConvEncoder
        raise ValueError  # NOTE: also disabled; unreachable
        encoder = GatedConvEncoder(
            input_dim=args.input_dim * args.n_stacks,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            dropout=args.dropout_enc,
            bottleneck_dim=args.transformer_d_model if 'transformer' in args.dec_type else args.dec_n_units,
            param_init=args.param_init)
    elif 'transformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.transformer import TransformerEncoder
        encoder = TransformerEncoder(
            input_dim=args.input_dim if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            attn_type=args.transformer_attn_type,
            n_heads=args.transformer_n_heads,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_d_model,
            d_ff=args.transformer_d_ff,
            last_proj_dim=args.transformer_d_model if 'transformer' in args.dec_type else 0,
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            ffn_activation=args.transformer_ffn_activation,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            conv_param_init=args.param_init,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right)
    else:
        subsample = [1] * args.enc_n_layers
        for l, s in enumerate(list(map(int, args.subsample.split('_')[:args.enc_n_layers]))):
            subsample[l] = s

        from neural_sp.models.seq2seq.encoders.rnn import RNNEncoder
        encoder = RNNEncoder(
            input_dim=args.input_dim if args.input_type == 'speech' else args.emb_dim,
            rnn_type=args.enc_type,
            n_units=args.enc_n_units,
            n_projs=args.enc_n_projs,
            last_proj_dim=args.transformer_d_model if 'transformer' in args.dec_type else 0,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            subsample=subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            bidirectional_sum_fwd_bwd=args.bidirectional_sum_fwd_bwd,
            task_specific_layer=args.task_specific_layer,
            param_init=args.param_init,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_right=args.lc_chunk_size_right)
        # NOTE: pure Conv/TDS/GatedConv encoders are also included

    return encoder
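# --- Hedged sketch of the bridge-dimension rule used above -----------------
# When the decoder is a Transformer, `last_proj_dim` is set to d_model so the
# encoder output matches the decoder's attention width; otherwise it stays 0
# (no extra projection). The helper and values below are illustrative only.
def last_proj_dim_for(dec_type, transformer_d_model):
    return transformer_d_model if 'transformer' in dec_type else 0

print(last_proj_dim_for('transformer', 256))  # 256
print(last_proj_dim_for('lstm', 256))         # 0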
def __init__(self, input_dim, rnn_type, n_units, n_projs, n_layers,
             dropout_in, dropout, subsample, subsample_type='drop',
             n_stacks=1, n_splices=1, last_proj_dim=0,
             conv_in_channel=1, conv_channels=0, conv_kernel_sizes=[],
             conv_strides=[], conv_poolings=[], conv_batch_norm=False,
             conv_bottleneck_dim=0, n_layers_sub1=0, n_layers_sub2=0,
             nin=False, task_specific_layer=False, param_init=0.1):

    super(RNNEncoder, self).__init__()
    logger = logging.getLogger("training")

    if len(subsample) > 0 and len(subsample) != n_layers:
        raise ValueError('subsample must be the same size as n_layers.')
    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

    self.rnn_type = rnn_type
    self.bidirectional = True if rnn_type in ['blstm', 'bgru', 'conv_blstm', 'conv_bgru'] else False
    self.n_units = n_units
    self.n_dirs = 2 if self.bidirectional else 1
    self.n_layers = n_layers

    # Setting for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # Setting for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # Dropout for input-hidden connection
    self.dropout_in = nn.Dropout(p=dropout_in)

    # Setting for CNNs before RNNs
    if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
        channels = [int(c) for c in conv_channels.split('_')] if len(conv_channels) > 0 else []
        kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_kernel_sizes.split('_')] if len(conv_kernel_sizes) > 0 else []
        if rnn_type in ['tds', 'gated_conv']:
            strides = []
            poolings = []
        else:
            strides = [[int(c.split(',')[0].replace('(', '')),
                        int(c.split(',')[1].replace(')', ''))]
                       for c in conv_strides.split('_')] if len(conv_strides) > 0 else []
            poolings = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_poolings.split('_')] if len(conv_poolings) > 0 else []
        if 'conv_' in rnn_type:
            subsample = [1] * self.n_layers
            logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')
    else:
        channels = []
        kernel_sizes = []
        strides = []
        poolings = []

    if len(channels) > 0:
        if rnn_type == 'tds':
            self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                   in_channel=conv_in_channel,
                                   channels=channels,
                                   kernel_sizes=kernel_sizes,
                                   dropout=dropout,
                                   bottleneck_dim=last_proj_dim)
        elif rnn_type == 'gated_conv':
            self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                         in_channel=conv_in_channel,
                                         channels=channels,
                                         kernel_sizes=kernel_sizes,
                                         dropout=dropout,
                                         bottleneck_dim=last_proj_dim,
                                         param_init=param_init)
        else:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=channels,
                                    kernel_sizes=kernel_sizes,
                                    strides=strides,
                                    poolings=poolings,
                                    dropout=0,
                                    batch_norm=conv_batch_norm,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
        self._output_dim = self.conv.output_dim
    else:
        self._output_dim = input_dim * n_splices * n_stacks
        self.conv = None

    self.padding = Padding()

    if rnn_type not in ['conv', 'tds', 'gated_conv']:
        self.rnn = nn.ModuleList()
        self.dropout = nn.ModuleList()
        self.proj = None
        if n_projs > 0:
            self.proj = nn.ModuleList()

        # subsample
        self.subsample = None
        if subsample_type == 'max_pool' and np.prod(subsample) > 1:
            self.subsample = nn.ModuleList([MaxpoolSubsampler(subsample[l])
                                            for l in range(n_layers)])
        elif subsample_type == 'concat' and np.prod(subsample) > 1:
            self.subsample = nn.ModuleList([ConcatSubsampler(subsample[l], n_units, self.n_dirs)
                                            for l in range(n_layers)])
        elif subsample_type == 'drop' and np.prod(subsample) > 1:
            self.subsample = nn.ModuleList([DropSubsampler(subsample[l])
                                            for l in range(n_layers)])

        # NiN
        self.nin = None
        if nin:
            self.nin = nn.ModuleList()

        for l in range(n_layers):
            if 'lstm' in rnn_type:
                rnn_i = nn.LSTM
            elif 'gru' in rnn_type:
                rnn_i = nn.GRU
            else:
                raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

            self.rnn += [rnn_i(self._output_dim, n_units, 1,
                               bias=True, batch_first=True, dropout=0,
                               bidirectional=self.bidirectional)]
            self.dropout += [nn.Dropout(p=dropout)]
            self._output_dim = n_units * self.n_dirs

            # Projection layer
            if self.proj is not None:
                if l != n_layers - 1:
                    self.proj += [Linear(n_units * self.n_dirs, n_projs)]
                    self._output_dim = n_projs

            # Task specific layer
            if l == n_layers_sub1 - 1 and task_specific_layer:
                self.rnn_sub1 = rnn_i(self._output_dim, n_units, 1,
                                      bias=True, batch_first=True, dropout=0,
                                      bidirectional=self.bidirectional)
                self.dropout_sub1 = nn.Dropout(p=dropout)
                # project only when a bridge dimension is requested
                if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                    self.bridge_sub1 = Linear(n_units, last_proj_dim)
            if l == n_layers_sub2 - 1 and task_specific_layer:
                self.rnn_sub2 = rnn_i(self._output_dim, n_units, 1,
                                      bias=True, batch_first=True, dropout=0,
                                      bidirectional=self.bidirectional)
                self.dropout_sub2 = nn.Dropout(p=dropout)
                if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                    self.bridge_sub2 = Linear(n_units, last_proj_dim)

            # Network in network
            if self.nin is not None:
                if l != n_layers - 1:
                    self.nin += [NiN(self._output_dim)]

        # if n_layers_sub1 > 0 or n_layers_sub2 > 0:
        #     assert task_specific_layer

    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
        self.bridge = Linear(self._output_dim, last_proj_dim)
        self._output_dim = last_proj_dim

    # Initialize parameters
    self.reset_parameters(param_init)
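# --- Minimal forward-pass sketch (not the repository's actual forward) ------
# Shows how a per-layer ModuleList like the one built above is typically
# consumed: each single-layer bidirectional LSTM runs in turn, with dropout
# applied between layers. All sizes below are illustrative.
import torch
import torch.nn as nn

n_layers, n_units, input_dim = 2, 16, 8
rnns = nn.ModuleList([nn.LSTM(input_dim if l == 0 else n_units * 2, n_units, 1,
                              batch_first=True, bidirectional=True)
                      for l in range(n_layers)])
dropouts = nn.ModuleList([nn.Dropout(p=0.1) for _ in range(n_layers)])

x = torch.randn(4, 20, input_dim)  # (batch, time, feature)
for rnn, drop in zip(rnns, dropouts):
    x, _ = rnn(x)   # (batch, time, 2 * n_units)
    x = drop(x)
print(x.shape)  # torch.Size([4, 20, 32])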