def define_name(dir_name, args):
    """Append Transformer-encoder hyperparameters to the result directory name."""
    # prepend the CNN front-end settings when a convolutional block is used
    if 'conv' in args.enc_type:
        dir_name = ConvEncoder.define_name(dir_name, args)

    tags = [
        str(args.transformer_enc_d_model) + 'dmodel',
        str(args.transformer_enc_d_ff) + 'dff',
    ]
    if args.transformer_ffn_bottleneck_dim > 0:
        tags.append(str(args.transformer_ffn_bottleneck_dim) + 'bn')
    tags.append(str(args.enc_n_layers) + 'L')
    tags.append(str(args.transformer_enc_n_heads) + 'H')
    tags.append('pe' + str(args.transformer_enc_pe_type))
    if args.transformer_enc_clamp_len > 0:
        tags.append('_clamp' + str(args.transformer_enc_clamp_len))
    if args.dropout_enc_layer > 0:
        tags.append('_LD' + str(args.dropout_enc_layer))
    dir_name += ''.join(tags)

    # chunk sizes may be underscore-separated specs; the last field decides
    def _last_field(spec):
        return int(str(spec).split('_')[-1])

    if (_last_field(args.lc_chunk_size_left) > 0
            or _last_field(args.lc_chunk_size_current) > 0
            or _last_field(args.lc_chunk_size_right) > 0):
        dir_name += ('_chunkL' + str(args.lc_chunk_size_left)
                     + 'C' + str(args.lc_chunk_size_current)
                     + 'R' + str(args.lc_chunk_size_right))
        dir_name += '_' + args.lc_type
    else:
        n_lookahead = sum(int(v) for v in args.transformer_enc_lookaheads.split('_'))
        if n_lookahead > 0:
            dir_name += '_LA' + str(n_lookahead)
    return dir_name
def add_args(parser, args):
    """Register RNN-encoder command-line options and return the parser."""
    group = parser.add_argument_group("RNN encoder")
    # the convolutional front-end registers its own options first
    parser = ConvEncoder.add_args(parser, args)
    add = group.add_argument
    add('--enc_n_units', type=int, default=512,
        help='number of units in each encoder RNN layer')
    add('--enc_n_projs', type=int, default=0,
        help='number of units in the projection layer after each encoder RNN layer')
    add('--bidirectional_sum_fwd_bwd', type=strtobool, default=False,
        help='sum forward and backward RNN outputs for dimension reduction')
    # streaming
    add('--lc_chunk_size_left', type=str, default="0",
        help='left chunk size for latency-controlled RNN encoder')
    add('--lc_chunk_size_right', type=str, default="0",
        help='right chunk size for latency-controlled RNN encoder')
    return parser
def define_name(dir_name, args):
    """Append RNN-encoder hyperparameters to the result directory name."""
    if 'conv' in args.enc_type:
        dir_name = ConvEncoder.define_name(dir_name, args)

    suffix = str(args.enc_n_units) + 'H'
    if args.enc_n_projs > 0:
        suffix += str(args.enc_n_projs) + 'P'
    suffix += str(args.enc_n_layers) + 'L'
    if args.bidirectional_sum_fwd_bwd:
        suffix += '_sumfwdbwd'
    # chunk sizes are underscore-separated specs; the first field decides
    left, right = args.lc_chunk_size_left, args.lc_chunk_size_right
    if int(left.split('_')[0]) > 0 or int(right.split('_')[0]) > 0:
        suffix += '_chunkL' + left + 'R' + right
    return dir_name + suffix
def add_args(parser, args):
    """Add Conformer-encoder arguments.

    Args:
        parser (argparse.ArgumentParser): command-line argument parser
        args (argparse.Namespace): arguments registered so far (used to skip
            options another module has already added)
    Returns:
        argparse.ArgumentParser: parser with Conformer-encoder options added

    Fix: corrected the misspelled help text 'parameter initializatin'.
    """
    group = parser.add_argument_group("Transformer encoder")
    if 'conv' in args.enc_type:
        parser = ConvEncoder.add_args(parser, args)
    # Transformer common
    # NOTE: the hasattr checks are important to avoid conflict with args in
    # the Transformer decoder, which registers the same common options
    if not hasattr(args, 'transformer_d_model'):
        group.add_argument('--transformer_d_model', type=int, default=256,
                           help='number of units in the MHA layer')
    if not hasattr(args, 'transformer_d_ff'):
        group.add_argument('--transformer_d_ff', type=int, default=2048,
                           help='number of units in the FFN layer')
    if not hasattr(args, 'transformer_d_ff_bottleneck_dim'):
        group.add_argument('--transformer_d_ff_bottleneck_dim', type=int, default=0,
                           help='bottleneck dimension in the FFN layer')
    if not hasattr(args, 'transformer_n_heads'):
        group.add_argument('--transformer_n_heads', type=int, default=4,
                           help='number of heads in the MHA layer')
    if not hasattr(args, 'transformer_layer_norm_eps'):
        group.add_argument('--transformer_layer_norm_eps', type=float, default=1e-12,
                           help='epsilon value for layer normalization')
    if not hasattr(args, 'transformer_ffn_activation'):
        group.add_argument('--transformer_ffn_activation', type=str, default='relu',
                           choices=['relu', 'gelu', 'gelu_accurate', 'glu', 'swish'],
                           help='nonlinear activation for the FFN layer')
    if not hasattr(args, 'transformer_param_init'):
        group.add_argument('--transformer_param_init', type=str, default='xavier_uniform',
                           choices=['xavier_uniform', 'pytorch'],
                           help='parameter initialization')
    # Conformer encoder specific
    group.add_argument('--transformer_enc_pe_type', type=str, default='relative',
                       choices=['relative'],
                       help='type of positional encoding for the Transformer encoder')
    group.add_argument('--conformer_kernel_size', type=int, default=32,
                       help='kernel size for depthwise convolution in convolution module for Conformer encoder layers')
    group.add_argument('--dropout_enc_layer', type=float, default=0.0,
                       help='LayerDrop probability for Conformer encoder layers')
    # streaming
    group.add_argument('--lc_chunk_size_left', type=int, default=0,
                       help='left chunk size for latency-controlled Conformer encoder')
    group.add_argument('--lc_chunk_size_current', type=int, default=0,
                       help='current chunk size (and hop size) for latency-controlled Conformer encoder')
    group.add_argument('--lc_chunk_size_right', type=int, default=0,
                       help='right chunk size for latency-controlled Conformer encoder')
    return parser
def add_args(parser, args):
    """Add Transformer-encoder arguments.

    Args:
        parser (argparse.ArgumentParser): command-line argument parser
        args (argparse.Namespace): arguments registered so far (used to skip
            options another module has already added)
    Returns:
        argparse.ArgumentParser: parser with Transformer-encoder options added

    Fix: the help text for --transformer_input_bottleneck_dim was copy-pasted
    from the FFN option; it now describes the input bottleneck.
    """
    group = parser.add_argument_group("Transformer encoder")
    if 'conv' in args.enc_type:
        parser = ConvEncoder.add_args(parser, args)
    # Transformer common
    # NOTE: guarded by hasattr to avoid conflict with the Transformer decoder,
    # which registers the same common options
    if not hasattr(args, 'transformer_layer_norm_eps'):
        group.add_argument('--transformer_ffn_bottleneck_dim', type=int, default=0,
                           help='bottleneck dimension in the FFN layer')
        group.add_argument('--transformer_input_bottleneck_dim', type=int, default=0,
                           help='bottleneck dimension in the input embedding layer')
        group.add_argument('--transformer_layer_norm_eps', type=float, default=1e-12,
                           help='epsilon value for layer normalization')
        group.add_argument('--transformer_ffn_activation', type=str, default='relu',
                           choices=['relu', 'gelu', 'gelu_accurate', 'glu', 'swish'],
                           help='nonlinear activation for the FFN layer')
        group.add_argument('--transformer_param_init', type=str, default='xavier_uniform',
                           choices=['xavier_uniform', 'pytorch'],
                           help='parameter initialization')
    # Transformer encoder specific
    group.add_argument('--transformer_enc_d_model', type=int, default=256,
                       help='number of units in the MHA layer for Transformer encoder')
    group.add_argument('--transformer_enc_d_ff', type=int, default=2048,
                       help='number of units in the FFN layer for Transformer encoder')
    group.add_argument('--transformer_enc_n_heads', type=int, default=4,
                       help='number of heads in the MHA layer for Transformer encoder')
    group.add_argument('--transformer_enc_pe_type', type=str, default='add',
                       choices=['add', 'none', 'relative', 'relative_xl'],
                       help='type of positional encoding for Transformer encoder')
    group.add_argument('--dropout_enc_layer', type=float, default=0.0,
                       help='LayerDrop probability for Transformer encoder layers')
    group.add_argument('--transformer_enc_clamp_len', type=int, default=-1,
                       help='maximum length for relative positional encoding. -1 means infinite length.')
    # streaming
    group.add_argument('--transformer_enc_lookaheads', type=str,
                       default="0_0_0_0_0_0_0_0_0_0_0_0",
                       help='lookahead frames per layer for unidirectional Transformer encoder')
    group.add_argument('--lc_chunk_size_left', type=str, default="0",
                       help='left chunk size for latency-controlled Transformer encoder')
    group.add_argument('--lc_chunk_size_current', type=str, default="0",
                       help='current chunk size (and hop size) for latency-controlled Transformer encoder')
    group.add_argument('--lc_chunk_size_right', type=str, default="0",
                       help='right chunk size for latency-controlled Transformer encoder')
    group.add_argument('--lc_type', type=str, default='reshape',
                       choices=['reshape', 'mask'],
                       help='implementation methods of latency-controlled Transformer encoder')
    return parser
def define_name(dir_name, args):
    """Append Transformer-encoder hyperparameters to the result directory name."""
    if 'conv' in args.enc_type:
        dir_name = ConvEncoder.define_name(dir_name, args)

    parts = [str(args.transformer_d_model) + 'dmodel',
             str(args.transformer_d_ff) + 'dff']
    if args.transformer_ffn_bottleneck_dim > 0:
        parts.append(str(args.transformer_ffn_bottleneck_dim) + 'bn')
    parts.append(str(args.enc_n_layers) + 'L')
    parts.append(str(args.transformer_n_heads) + 'H')
    parts.append('pe' + str(args.transformer_enc_pe_type))
    if args.dropout_enc_layer > 0:
        parts.append('droplayer' + str(args.dropout_enc_layer))

    # lc_chunk_size_current may be missing on older configs, hence getattr
    streaming = (args.lc_chunk_size_left > 0
                 or getattr(args, 'lc_chunk_size_current', 0) > 0
                 or args.lc_chunk_size_right > 0)
    if streaming:
        parts.append('_chunkL' + str(args.lc_chunk_size_left)
                     + 'C' + str(args.lc_chunk_size_current)
                     + 'R' + str(args.lc_chunk_size_right))
        parts.append('_' + args.lc_type)
    return dir_name + ''.join(parts)
def define_name(dir_name, args):
    """Append RNN-encoder hyperparameters to the result directory name."""
    if 'conv' in args.enc_type:
        dir_name = ConvEncoder.define_name(dir_name, args)

    name = str(args.enc_n_units) + 'H'
    if args.enc_n_projs > 0:
        name += str(args.enc_n_projs) + 'P'
    name += str(args.enc_n_layers) + 'L'
    if args.bidirectional_sum_fwd_bwd:
        name += '_sumfwdbwd'
    # chunk sizes are underscore-separated specs; the first field decides
    left, right = args.lc_chunk_size_left, args.lc_chunk_size_right
    if int(left.split('_')[0]) > 0 or int(right.split('_')[0]) > 0:
        name += '_chunkL' + left + 'R' + right
    if not args.cnn_lookahead:
        name += '_blockwise'
    if args.rsp_prob_enc > 0:
        name += '_RSP' + str(args.rsp_prob_enc)
    return dir_name + name
def add_args(parser, args):
    """Add RNN-encoder arguments (streaming + RSP variant).

    Args:
        parser (argparse.ArgumentParser): command-line argument parser
        args (argparse.Namespace): arguments registered so far
    Returns:
        argparse.ArgumentParser: parser with RNN-encoder options added

    Fix: the help text for --lc_chunk_size_left wrongly said
    'current chunk size' (copy-paste from another option); it is the LEFT
    chunk size, consistent with the other RNN add_args in this file.
    """
    group = parser.add_argument_group("RNN encoder")
    parser = ConvEncoder.add_args(parser, args)
    group.add_argument('--enc_n_units', type=int, default=512,
                       help='number of units in each encoder RNN layer')
    group.add_argument('--enc_n_projs', type=int, default=0,
                       help='number of units in the projection layer after each encoder RNN layer')
    group.add_argument('--bidirectional_sum_fwd_bwd', type=strtobool, default=False,
                       help='sum forward and backward RNN outputs for dimension reduction')
    # streaming
    group.add_argument('--lc_chunk_size_left', type=str, default="-1",
                       help='left chunk size for latency-controlled RNN encoder')
    group.add_argument('--lc_chunk_size_right', type=str, default="0",
                       help='right chunk size for latency-controlled RNN encoder')
    group.add_argument('--cnn_lookahead', type=strtobool, default=True,
                       help='disable lookahead frames in CNN layers')
    group.add_argument('--rsp_prob_enc', type=float, default=0.0,
                       help='probability for Random State Passing (RSP)')
    return parser
def add_args(parser, args):
    """Register TDS-encoder options; everything comes from the CNN front-end."""
    # group = parser.add_argument_group("TDS encoder")
    return ConvEncoder.add_args(parser, args)
def __init__(self, input_dim, attn_type, n_heads, n_layers, d_model, d_ff,
             last_proj_dim, pe_type, layer_norm_eps, ffn_activation,
             dropout_in, dropout, dropout_att, n_stacks, n_splices,
             conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
             conv_poolings, conv_batch_norm, conv_layer_norm,
             conv_bottleneck_dim, conv_param_init, param_init,
             chunk_size_left, chunk_size_current, chunk_size_right):
    """Build a Transformer encoder: optional CNN front-end, linear embedding,
    positional encoding, a stack of Transformer blocks, and an optional
    output bridge projection."""
    super(TransformerEncoder, self).__init__()

    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type
    # streaming chunk sizes (frames); stored only, not validated here
    self.chunk_size_left = chunk_size_left
    self.chunk_size_current = chunk_size_current
    self.chunk_size_right = chunk_size_right

    # Setting for CNNs before RNNs
    if conv_channels:
        # frame stacking/splicing is incompatible with the CNN front-end
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim,
                                in_channel=conv_in_channel,
                                channels=conv_channels,
                                kernel_sizes=conv_kernel_sizes,
                                strides=conv_strides,
                                poolings=conv_poolings,
                                dropout=0.,
                                batch_norm=conv_batch_norm,
                                layer_norm=conv_layer_norm,
                                layer_norm_eps=layer_norm_eps,
                                residual=False,
                                bottleneck_dim=d_model,
                                param_init=conv_param_init)
        self._odim = self.conv.output_dim
    else:
        self.conv = None
        self._odim = input_dim * n_splices * n_stacks

    # project (possibly CNN-reduced) features to the model dimension
    self.embed = nn.Linear(self._odim, d_model)
    self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)

    # n_layers identical blocks (repeat presumably deep-copies the prototype
    # per layer — confirm against its definition)
    self.layers = repeat(
        TransformerEncoderBlock(d_model, d_ff, attn_type, n_heads,
                                dropout, dropout_att,
                                layer_norm_eps, ffn_activation, param_init),
        n_layers)
    self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

    # NOTE(review): unlike other encoders in this file there is no
    # `last_proj_dim > 0` guard here, so last_proj_dim=0 would create a
    # Linear(_odim, 0) bridge — confirm callers always pass a valid value
    if last_proj_dim != self.output_dim:
        self.bridge = nn.Linear(self._odim, last_proj_dim)
        self._odim = last_proj_dim
    else:
        self.bridge = None
        self._odim = d_model

    # calculate subsampling factor
    self._factor = 1
    if self.conv is not None:
        # NOTE(review): called as a method here, but other encoders in this
        # file access `subsampling_factor` as an attribute — confirm which
        # ConvEncoder version this pairs with
        self._factor *= self.conv.subsampling_factor()

    if param_init == 'xavier_uniform':
        self.reset_parameters()
def __init__(self, input_dim, enc_type, n_heads, n_layers,
             n_layers_sub1, n_layers_sub2,
             d_model, d_ff, ffn_bottleneck_dim, ffn_activation,
             pe_type, layer_norm_eps, last_proj_dim,
             dropout_in, dropout, dropout_att, dropout_layer,
             subsample, subsample_type, n_stacks, n_splices,
             conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
             conv_poolings, conv_batch_norm, conv_layer_norm,
             conv_bottleneck_dim, conv_param_init,
             task_specific_layer, param_init, clamp_len,
             lookahead, chunk_size_left, chunk_size_current, chunk_size_right,
             streaming_type):
    """Build a (streaming-capable) Transformer encoder.

    Supports an optional CNN front-end, per-layer subsampling, per-layer
    lookahead for the unidirectional case, latency-controlled bidirectional
    chunking ('reshape' or 'mask'), and up to two auxiliary sub-task exits.
    """
    super(TransformerEncoder, self).__init__()

    # parse per-layer subsampling factors from an underscore-separated spec
    subsamples = [1] * n_layers
    for lth, s in enumerate(list(map(int, subsample.split('_')[:n_layers]))):
        subsamples[lth] = s
    # parse per-layer lookahead frames from an underscore-separated spec
    lookaheads = [0] * n_layers
    for lth, s in enumerate(list(map(int, lookahead.split('_')[:n_layers]))):
        lookaheads[lth] = s

    if len(subsamples) > 0 and len(subsamples) != n_layers:
        raise ValueError('subsample must be the same size as n_layers. n_layers: %d, subsample: %s' % (n_layers, subsamples))
    # NOTE(review): raising Warning (instead of warnings.warn / ValueError)
    # is unusual but kept — callers may catch this exact type
    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise Warning('Set n_layers_sub1 between 1 to n_layers. n_layers: %d, n_layers_sub1: %d' % (n_layers, n_layers_sub1))
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise Warning('Set n_layers_sub2 between 1 to n_layers_sub1. n_layers_sub1: %d, n_layers_sub2: %d' % (n_layers_sub1, n_layers_sub2))

    self.enc_type = enc_type
    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type
    self.scale = math.sqrt(d_model)  # input scaling before positional encoding

    # for compatibility: chunk sizes may arrive as int or as "a_b" strings
    chunk_size_left = str(chunk_size_left)
    chunk_size_current = str(chunk_size_current)
    chunk_size_right = str(chunk_size_right)

    # for streaming encoder
    self.unidir = 'uni' in enc_type
    self.lookaheads = lookaheads
    if sum(lookaheads) > 0:
        assert self.unidir  # lookahead only makes sense for the causal encoder
    # last field of the underscore spec, normalized by frame stacking
    self.chunk_size_left = int(chunk_size_left.split('_')[-1]) // n_stacks
    self.chunk_size_current = int(chunk_size_current.split('_')[-1]) // n_stacks
    self.chunk_size_right = int(chunk_size_right.split('_')[-1]) // n_stacks
    # latency-controlled bidirectional mode
    self.lc_bidir = self.chunk_size_current > 0 and enc_type != 'conv' and 'uni' not in enc_type
    self.cnn_lookahead = self.unidir or enc_type == 'conv'
    self.streaming_type = streaming_type if self.lc_bidir else ''
    # -: past context
    # *: current context
    # +: future context
    # reshape) overlapped windowing. additional redundant computation is introduced.
    # During inference, caching is not applied. However, considering (N_l+N_c+N_r) is very short
    # and independent on layer depth, the overhead is negligible.
    # chunk1:    |**|++
    # chunk2: --|**|++
    # chunk3:    --|**|++
    # chunk4:       --|**|++
    # chunk5:          --|**|++
    # mask) chunkwise masking. future context is restricted within the current chunk
    # to avoid accumuration of future context depending on the layer depth.
    # chunk1:    |**|
    # chunk2: --|**|
    # chunk3: -- --|**|
    # chunk4:    -- --|**|
    # chunk5:       -- --|**|
    if self.unidir:
        assert self.chunk_size_left == self.chunk_size_current == self.chunk_size_right == 0
    if self.streaming_type == 'mask':
        assert self.chunk_size_right == 0
        assert self.chunk_size_left == self.chunk_size_current
        # NOTE: this is important to cache CNN output at each chunk
    if self.lc_bidir:
        assert n_layers_sub1 == 0
        assert n_layers_sub2 == 0
        assert not self.unidir

    # for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # for attention plot
    self.aws_dict = {}
    self.data_dict = {}

    # Setting for CNNs
    if 'conv' in enc_type:
        assert conv_channels
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim,
                                in_channel=conv_in_channel,
                                channels=conv_channels,
                                kernel_sizes=conv_kernel_sizes,
                                strides=conv_strides,
                                poolings=conv_poolings,
                                dropout=0.,
                                batch_norm=conv_batch_norm,
                                layer_norm=conv_layer_norm,
                                layer_norm_eps=layer_norm_eps,
                                residual=False,
                                bottleneck_dim=d_model,
                                param_init=conv_param_init)
        self._odim = self.conv.output_dim
    else:
        self.conv = None
        self._odim = input_dim * n_splices * n_stacks
    self.embed = nn.Linear(self._odim, d_model)

    # calculate subsampling factor
    self._factor = 1
    if self.conv is not None:
        self._factor *= self.conv.subsampling_factor
    self.subsample = None
    if np.prod(subsamples) > 1:
        self._factor *= np.prod(subsamples)
        # one subsampler per layer (factor 1 means no-op at that layer)
        if subsample_type == 'max_pool':
            self.subsample = nn.ModuleList(
                [MaxpoolSubsampler(factor) for factor in subsamples])
        elif subsample_type == 'concat':
            self.subsample = nn.ModuleList([
                ConcatSubsampler(factor, self._odim) for factor in subsamples
            ])
        elif subsample_type == 'drop':
            self.subsample = nn.ModuleList(
                [DropSubsampler(factor) for factor in subsamples])
        elif subsample_type == '1dconv':
            self.subsample = nn.ModuleList([
                Conv1dSubsampler(factor, self._odim) for factor in subsamples
            ])
        elif subsample_type == 'add':
            self.subsample = nn.ModuleList(
                [AddSubsampler(factor) for factor in subsamples])

    # chunk sizes must be compatible with the total subsampling factor
    if self.chunk_size_left > 0:
        assert self.chunk_size_left % self._factor == 0
    if self.chunk_size_current > 0:
        assert self.chunk_size_current % self._factor == 0
    if self.chunk_size_right > 0:
        assert self.chunk_size_right % self._factor == 0

    # positional encoding: relative variants use XL-style embeddings instead
    # of the additive encoding
    self.pos_enc, self.pos_emb = None, None
    self.u_bias, self.v_bias = None, None
    if pe_type in ['relative', 'relative_xl']:
        self.pos_emb = XLPositionalEmbedding(d_model, dropout)
        if pe_type == 'relative_xl':
            self.u_bias = nn.Parameter(
                torch.Tensor(n_heads, d_model // n_heads))
            self.v_bias = nn.Parameter(
                torch.Tensor(n_heads, d_model // n_heads))
            # NOTE: u_bias and v_bias are global parameters shared in the whole model
    else:
        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type, param_init)

    self.layers = nn.ModuleList([
        copy.deepcopy(
            TransformerEncoderBlock(d_model, d_ff, n_heads,
                                    dropout, dropout_att, dropout_layer,
                                    layer_norm_eps, ffn_activation, param_init,
                                    pe_type, clamp_len, ffn_bottleneck_dim))
        for _ in range(n_layers)
    ])
    self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self._odim = d_model

    # auxiliary exit after n_layers_sub1 layers (e.g. CTC sub-task)
    if n_layers_sub1 > 0:
        if task_specific_layer:
            self.layer_sub1 = TransformerEncoderBlock(
                d_model, d_ff, n_heads,
                dropout, dropout_att, dropout_layer,
                layer_norm_eps, ffn_activation, param_init,
                pe_type, clamp_len, ffn_bottleneck_dim)
        odim_sub1 = d_model
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)
            odim_sub1 = last_proj_dim
        if n_layers_sub1 == n_layers:
            self.norm_out_sub1 = None  # shares norm_out with the main exit
        else:
            self.norm_out_sub1 = nn.LayerNorm(odim_sub1, eps=layer_norm_eps)

    # second auxiliary exit after n_layers_sub2 layers
    if n_layers_sub2 > 0:
        if task_specific_layer:
            self.layer_sub2 = TransformerEncoderBlock(
                d_model, d_ff, n_heads,
                dropout, dropout_att, dropout_layer,
                layer_norm_eps, ffn_activation, param_init,
                pe_type, clamp_len, ffn_bottleneck_dim)
        odim_sub2 = d_model
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)
            odim_sub2 = last_proj_dim
        if n_layers_sub2 == n_layers:
            self.norm_out_sub2 = None
        else:
            self.norm_out_sub2 = nn.LayerNorm(odim_sub2, eps=layer_norm_eps)

    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
        self.bridge = nn.Linear(self._odim, last_proj_dim)
        self._odim = last_proj_dim

    self.reset_parameters(param_init)

    # for streaming inference
    self.reset_cache()
def __init__(self, input_dim, enc_type, n_heads, kernel_size,
             n_layers, n_layers_sub1, n_layers_sub2,
             d_model, d_ff, ffn_bottleneck_dim, last_proj_dim,
             pe_type, layer_norm_eps, ffn_activation,
             dropout_in, dropout, dropout_att, dropout_layer,
             n_stacks, n_splices,
             conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
             conv_poolings, conv_batch_norm, conv_layer_norm,
             conv_bottleneck_dim, conv_param_init,
             task_specific_layer, param_init,
             chunk_size_left, chunk_size_current, chunk_size_right):
    """Build a Conformer encoder: optional CNN front-end, linear embedding,
    XL-style relative positional embedding, a stack of Conformer blocks,
    optional sub-task exits, and an optional output bridge projection."""
    super(ConformerEncoder, self).__init__()

    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type
    self.scale = math.sqrt(d_model)  # input scaling before the encoder stack

    # for streaming encoder (chunk sizes in input frames)
    self.chunk_size_left = chunk_size_left
    self.chunk_size_current = chunk_size_current
    self.chunk_size_right = chunk_size_right
    self.latency_controlled = chunk_size_left > 0 or chunk_size_current > 0 or chunk_size_right > 0

    # for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # for attention plot
    self.aws_dict = {}
    self.data_dict = {}

    # Setting for CNNs
    if 'conv' in enc_type:
        assert conv_channels
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim,
                                in_channel=conv_in_channel,
                                channels=conv_channels,
                                kernel_sizes=conv_kernel_sizes,
                                strides=conv_strides,
                                poolings=conv_poolings,
                                dropout=0.,
                                batch_norm=conv_batch_norm,
                                layer_norm=conv_layer_norm,
                                layer_norm_eps=layer_norm_eps,
                                residual=False,
                                bottleneck_dim=d_model,
                                param_init=conv_param_init)
        self._odim = self.conv.output_dim
    else:
        self.conv = None
        self._odim = input_dim * n_splices * n_stacks
    self.embed = nn.Linear(self._odim, d_model)

    # calculate subsampling factor
    self._factor = 1
    if self.conv is not None:
        self._factor *= self.conv.subsampling_factor
    # chunk sizes must be compatible with the total subsampling factor
    if self.chunk_size_left > 0:
        assert self.chunk_size_left % self._factor == 0
    if self.chunk_size_current > 0:
        assert self.chunk_size_current % self._factor == 0
    if self.chunk_size_right > 0:
        assert self.chunk_size_right % self._factor == 0

    self.pos_emb = XLPositionalEmbedding(d_model, dropout)
    assert pe_type == 'relative'  # Conformer here supports relative PE only
    # TODO(hirofumi0810): try other positional encodings

    self.layers = nn.ModuleList([
        copy.deepcopy(
            ConformerEncoderBlock(d_model, d_ff, n_heads, kernel_size,
                                  dropout, dropout_att, dropout_layer,
                                  layer_norm_eps, ffn_activation, param_init,
                                  ffn_bottleneck_dim=ffn_bottleneck_dim))
        for _ in range(n_layers)
    ])
    self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self._odim = d_model

    # auxiliary exit after n_layers_sub1 layers
    if n_layers_sub1 > 0:
        if task_specific_layer:
            self.layer_sub1 = ConformerEncoderBlock(
                d_model, d_ff, n_heads, kernel_size,
                dropout, dropout_att, dropout_layer,
                layer_norm_eps, ffn_activation, param_init,
                ffn_bottleneck_dim=ffn_bottleneck_dim)
        self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

    # second auxiliary exit after n_layers_sub2 layers
    if n_layers_sub2 > 0:
        if task_specific_layer:
            self.layer_sub2 = ConformerEncoderBlock(
                d_model, d_ff, n_heads, kernel_size,
                dropout, dropout_att, dropout_layer,
                layer_norm_eps, ffn_activation, param_init,
                ffn_bottleneck_dim=ffn_bottleneck_dim)
        self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
        self.bridge = nn.Linear(self._odim, last_proj_dim)
        self._odim = last_proj_dim

    self.reset_parameters(param_init)
def __init__(self, input_dim, enc_type, attn_type, n_heads,
             n_layers, n_layers_sub1, n_layers_sub2,
             d_model, d_ff, last_proj_dim,
             pe_type, layer_norm_eps, ffn_activation,
             dropout_in, dropout, dropout_att, dropout_layer,
             n_stacks, n_splices,
             conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
             conv_poolings, conv_batch_norm, conv_layer_norm,
             conv_bottleneck_dim, conv_param_init,
             task_specific_layer, param_init,
             chunk_size_left, chunk_size_current, chunk_size_right):
    """Build a Transformer encoder with optional Transformer-XL style memory
    (enc_type containing 'transformer_xl') for streaming recognition."""
    super(TransformerEncoder, self).__init__()

    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type

    # for streaming TransformerXL encoder
    self.N_l = chunk_size_left      # left (history) chunk size
    self.N_c = chunk_size_current   # current chunk (hop) size
    self.N_r = chunk_size_right     # right (future) chunk size
    self.latency_controlled = chunk_size_left > 0 or chunk_size_current > 0 or chunk_size_right > 0
    self.memory_transformer = ('transformer_xl' in enc_type)
    self.mem_len = chunk_size_left  # XL memory length equals the left context
    self.scale = math.sqrt(d_model)
    if self.memory_transformer:
        # XL memory uses its own relative embedding, so the additive PE is off
        assert pe_type == 'none'
        assert chunk_size_left > 0
        assert chunk_size_current > 0

    # for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # for attention plot
    self.aws_dict = {}
    self.data_dict = {}

    # Setting for CNNs
    if conv_channels:
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim,
                                in_channel=conv_in_channel,
                                channels=conv_channels,
                                kernel_sizes=conv_kernel_sizes,
                                strides=conv_strides,
                                poolings=conv_poolings,
                                dropout=0.,
                                batch_norm=conv_batch_norm,
                                layer_norm=conv_layer_norm,
                                layer_norm_eps=layer_norm_eps,
                                residual=False,
                                bottleneck_dim=d_model,
                                param_init=conv_param_init)
        self._odim = self.conv.output_dim
    else:
        self.conv = None
        self._odim = input_dim * n_splices * n_stacks
    self.embed = nn.Linear(self._odim, d_model)

    # calculate subsampling factor
    self._factor = 1
    if self.conv is not None:
        self._factor *= self.conv.subsampling_factor

    if self.memory_transformer:
        self.pos_emb = XLPositionalEmbedding(d_model, dropout)
        self.u = nn.Parameter(
            torch.Tensor(self.n_heads, self.d_model // self.n_heads))
        self.v = nn.Parameter(
            torch.Tensor(self.n_heads, self.d_model // self.n_heads))
        # NOTE: u and v are global parameters
    self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type, param_init)

    self.layers = nn.ModuleList([
        copy.deepcopy(
            TransformerEncoderBlock(
                d_model, d_ff, attn_type, n_heads,
                dropout, dropout_att, dropout_layer,
                layer_norm_eps, ffn_activation, param_init,
                memory_transformer=self.memory_transformer))
        for _ in range(n_layers)
    ])
    self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self._odim = d_model

    # auxiliary exit after n_layers_sub1 layers
    if n_layers_sub1 > 0:
        if task_specific_layer:
            self.layer_sub1 = TransformerEncoderBlock(
                d_model, d_ff, attn_type, n_heads,
                dropout, dropout_att, dropout_layer,
                layer_norm_eps, ffn_activation, param_init)
        self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

    # second auxiliary exit after n_layers_sub2 layers
    if n_layers_sub2 > 0:
        if task_specific_layer:
            self.layer_sub2 = TransformerEncoderBlock(
                d_model, d_ff, attn_type, n_heads,
                dropout, dropout_att, dropout_layer,
                layer_norm_eps, ffn_activation, param_init)
        self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
        self.bridge = nn.Linear(self._odim, last_proj_dim)
        self._odim = last_proj_dim

    self.reset_parameters(param_init)
def __init__(self, input_dim, rnn_type, n_units, n_projs, n_layers,
             dropout_in, dropout, subsample, subsample_type='drop',
             n_stacks=1, n_splices=1, last_proj_dim=0,
             conv_in_channel=1, conv_channels=0, conv_kernel_sizes=[],
             conv_strides=[], conv_poolings=[], conv_batch_norm=False,
             conv_bottleneck_dim=0, n_layers_sub1=0, n_layers_sub2=0,
             nin=False, task_specific_layer=False, param_init=0.1):
    """Build an RNN encoder (LSTM/GRU, uni/bidirectional) with an optional
    CNN / TDS / GatedConv front-end, per-layer subsampling, optional
    projection and NiN layers, and up to two sub-task exits.

    NOTE(review): conv_kernel_sizes/conv_strides/conv_poolings use mutable
    list defaults; they are never mutated here (in practice they arrive as
    underscore-separated strings and are only .split()), but replacing the
    defaults with None would be safer — confirm before changing the API.
    """
    super(RNNEncoder, self).__init__()
    logger = logging.getLogger("training")

    if len(subsample) > 0 and len(subsample) != n_layers:
        raise ValueError('subsample must be the same size as n_layers.')
    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

    self.rnn_type = rnn_type
    # 'b' prefix marks the bidirectional variants
    self.bidirectional = True if rnn_type in [
        'blstm', 'bgru', 'conv_blstm', 'conv_bgru'
    ] else False
    self.n_units = n_units
    self.n_dirs = 2 if self.bidirectional else 1
    self.n_layers = n_layers

    # Setting for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # Setting for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # Dropout for input-hidden connection
    self.dropout_in = nn.Dropout(p=dropout_in)

    # Setting for CNNs before RNNs
    # conv specs are underscore-separated strings, e.g. "32_32" and "(3,3)_(3,3)"
    if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
        channels = [int(c) for c in conv_channels.split('_')
                    ] if len(conv_channels) > 0 else []
        kernel_sizes = [[
            int(c.split(',')[0].replace('(', '')),
            int(c.split(',')[1].replace(')', ''))
        ] for c in conv_kernel_sizes.split('_')
        ] if len(conv_kernel_sizes) > 0 else []
        if rnn_type in ['tds', 'gated_conv']:
            # these front-ends manage their own time reduction
            strides = []
            poolings = []
        else:
            strides = [[
                int(c.split(',')[0].replace('(', '')),
                int(c.split(',')[1].replace(')', ''))
            ] for c in conv_strides.split('_')
            ] if len(conv_strides) > 0 else []
            poolings = [[
                int(c.split(',')[0].replace('(', '')),
                int(c.split(',')[1].replace(')', ''))
            ] for c in conv_poolings.split('_')
            ] if len(conv_poolings) > 0 else []
        if 'conv_' in rnn_type:
            # the CNN already subsamples, so disable RNN-level subsampling
            subsample = [1] * self.n_layers
            logger.warning(
                'Subsampling is automatically ignored because CNN layers are used before RNN layers.'
            )
    else:
        channels = []
        kernel_sizes = []
        strides = []
        poolings = []

    # instantiate the selected front-end
    if len(channels) > 0:
        if rnn_type == 'tds':
            self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                   in_channel=conv_in_channel,
                                   channels=channels,
                                   kernel_sizes=kernel_sizes,
                                   dropout=dropout,
                                   bottleneck_dim=last_proj_dim)
        elif rnn_type == 'gated_conv':
            self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                         in_channel=conv_in_channel,
                                         channels=channels,
                                         kernel_sizes=kernel_sizes,
                                         dropout=dropout,
                                         bottleneck_dim=last_proj_dim,
                                         param_init=param_init)
        else:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=channels,
                                    kernel_sizes=kernel_sizes,
                                    strides=strides,
                                    poolings=poolings,
                                    dropout=0,
                                    batch_norm=conv_batch_norm,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
        self._output_dim = self.conv.output_dim
    else:
        self._output_dim = input_dim * n_splices * n_stacks
        self.conv = None

    self.padding = Padding()

    # RNN stack (skipped for the purely convolutional encoder types)
    if rnn_type not in ['conv', 'tds', 'gated_conv']:
        self.rnn = nn.ModuleList()
        self.dropout = nn.ModuleList()
        self.proj = None
        if n_projs > 0:
            self.proj = nn.ModuleList()

        # subsample
        self.subsample = None
        if subsample_type == 'max_pool' and np.prod(subsample) > 1:
            self.subsample = nn.ModuleList(
                [MaxpoolSubsampler(subsample[l]) for l in range(n_layers)])
        elif subsample_type == 'concat' and np.prod(subsample) > 1:
            self.subsample = nn.ModuleList([
                ConcatSubsampler(subsample[l], n_units, self.n_dirs)
                for l in range(n_layers)
            ])
        elif subsample_type == 'drop' and np.prod(subsample) > 1:
            self.subsample = nn.ModuleList(
                [DropSubsampler(subsample[l]) for l in range(n_layers)])

        # NiN
        self.nin = None
        if nin:
            self.nin = nn.ModuleList()

        for l in range(n_layers):
            if 'lstm' in rnn_type:
                rnn_i = nn.LSTM
            elif 'gru' in rnn_type:
                rnn_i = nn.GRU
            else:
                raise ValueError(
                    'rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".'
                )
            self.rnn += [
                rnn_i(self._output_dim, n_units, 1,
                      bias=True,
                      batch_first=True,
                      dropout=0,
                      bidirectional=self.bidirectional)
            ]
            self.dropout += [nn.Dropout(p=dropout)]
            self._output_dim = n_units * self.n_dirs

            # Projection layer (not after the last layer)
            if self.proj is not None:
                if l != n_layers - 1:
                    self.proj += [Linear(n_units * self.n_dirs, n_projs)]
                    self._output_dim = n_projs

            # Task specific layer
            if l == n_layers_sub1 - 1 and task_specific_layer:
                self.rnn_sub1 = rnn_i(self._output_dim, n_units, 1,
                                      bias=True,
                                      batch_first=True,
                                      dropout=0,
                                      bidirectional=self.bidirectional)
                self.dropout_sub1 = nn.Dropout(p=dropout)
                if last_proj_dim != self.output_dim:
                    self.bridge_sub1 = Linear(n_units, last_proj_dim)
            if l == n_layers_sub2 - 1 and task_specific_layer:
                self.rnn_sub2 = rnn_i(self._output_dim, n_units, 1,
                                      bias=True,
                                      batch_first=True,
                                      dropout=0,
                                      bidirectional=self.bidirectional)
                self.dropout_sub2 = nn.Dropout(p=dropout)
                if last_proj_dim != self.output_dim:
                    self.bridge_sub2 = Linear(n_units, last_proj_dim)

            # Network in network (not after the last layer)
            if self.nin is not None:
                if l != n_layers - 1:
                    self.nin += [NiN(self._output_dim)]
        # if n_layers_sub1 > 0 or n_layers_sub2 > 0:
        #     assert task_specific_layer

    # NOTE(review): no `last_proj_dim > 0` guard here (last_proj_dim defaults
    # to 0, and 0 != output_dim) — presumably the front-end/bridge classes
    # tolerate this; confirm against callers
    if last_proj_dim != self.output_dim:
        self.bridge = Linear(self._output_dim, last_proj_dim)
        self._output_dim = last_proj_dim

    # Initialize parameters
    self.reset_parameters(param_init)
def __init__(self, input_dim, enc_type, attn_type, n_heads,
             n_layers, n_layers_sub1, n_layers_sub2,
             d_model, d_ff, last_proj_dim,
             pe_type, layer_norm_eps, ffn_activation,
             dropout_in, dropout, dropout_att, dropout_residual,
             n_stacks, n_splices,
             conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
             conv_poolings, conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
             conv_param_init, task_specific_layer, param_init,
             chunk_size_left, chunk_size_current, chunk_size_right,
             n_layers_rnn):
    """Build a Transformer encoder.

    Supports an optional CNN frontend (``conv_channels``), an optional hybrid
    bidirectional-RNN stack below the Transformer blocks (``n_layers_rnn``),
    and TransformerXL-style streaming memory (``'transformer_xl' in enc_type``
    with chunked processing controlled by ``chunk_size_*``).
    Hyperparameters mirror the constructor arguments one-to-one.
    """
    super(TransformerEncoder, self).__init__()

    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type

    # for streaming TransformerXL encoder
    self.chunk_size_left = chunk_size_left
    self.chunk_size_cur = chunk_size_current
    self.chunk_size_right = chunk_size_right
    # any non-zero chunk size activates latency-controlled (chunked) encoding
    self.latency_controlled = chunk_size_left > 0 or chunk_size_current > 0 or chunk_size_right > 0
    self.memory_transformer = ('transformer_xl' in enc_type)
    self.mem_len = chunk_size_left
    self.scale = math.sqrt(d_model)  # input embedding scale, sqrt(d_model)
    if self.memory_transformer:
        # XL-style relative attention supplies its own positions
        assert pe_type == 'none'
        assert chunk_size_left > 0
        assert chunk_size_current > 0
    if self.latency_controlled:
        assert pe_type == 'none'

    # for hybrid RNN-Transformer encoder
    self.hybrid_rnn = n_layers_rnn > 0
    self.n_layers_rnn = n_layers_rnn
    self.proj = None

    # for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # for attention plot
    self.aws_dict = {}
    self.data_dict = {}

    # Setting for CNNs before RNNs
    if conv_channels:
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim,
                                in_channel=conv_in_channel,
                                channels=conv_channels,
                                kernel_sizes=conv_kernel_sizes,
                                strides=conv_strides,
                                poolings=conv_poolings,
                                dropout=0.,
                                batch_norm=conv_batch_norm,
                                layer_norm=conv_layer_norm,
                                layer_norm_eps=layer_norm_eps,
                                residual=False,
                                bottleneck_dim=d_model,
                                param_init=conv_param_init)
        self._odim = self.conv.output_dim
    else:
        self.conv = None
        self._odim = input_dim * n_splices * n_stacks
        # linear projection to d_model is only needed without the CNN frontend
        # (ConvEncoder already bottlenecks to d_model above)
        self.embed = nn.Linear(self._odim, d_model)

    # Hybrid RNN-Transformer
    if self.hybrid_rnn:
        assert pe_type == 'none'
        self.rnn = nn.ModuleList()
        self.rnn_bwd = nn.ModuleList()
        self.dropout_rnn = nn.Dropout(p=dropout)
        assert ('blstm' in enc_type or 'bgru' in enc_type)
        # NOTE: support bidirectional only
        self.bidir_sum = True
        for _ in range(n_layers_rnn):
            if 'blstm' in enc_type:
                rnn_i = nn.LSTM
            elif 'bgru' in enc_type:
                rnn_i = nn.GRU
            else:
                raise ValueError(
                    'rnn_type must be "(conv_)blstm_transformer(_xl)" or "(conv_)bgru_transformer(_xl)".'
                )
            # separate fwd/bwd unidirectional RNNs (latency-controlled style)
            self.rnn += [rnn_i(self._odim, d_model, 1, batch_first=True)]
            self.rnn_bwd += [rnn_i(self._odim, d_model, 1, batch_first=True)]
            # NOTE(review): self.n_dirs is presumably provided by a base class;
            # the else branch is dead here since bidir_sum is always True — confirm
            self._odim = d_model if self.bidir_sum else d_model * self.n_dirs
        if self._odim != d_model:
            self.proj = nn.Linear(self._odim, d_model)

    if self.memory_transformer:
        self.pos_emb = XLPositionalEmbedding(d_model, dropout)
        self.u = nn.Parameter(
            torch.Tensor(self.n_heads, self.d_model // self.n_heads))
        self.v = nn.Parameter(
            torch.Tensor(self.n_heads, self.d_model // self.n_heads))
        # NOTE: u and v are global parameters
    self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type, param_init)
    # TODO: replace dropout_in with dropout

    self.layers = nn.ModuleList([
        copy.deepcopy(
            TransformerEncoderBlock(
                d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                # stochastic-depth rate grows linearly with depth
                dropout_residual * (lth + 1) / n_layers,
                layer_norm_eps, ffn_activation, param_init,
                memory_transformer=self.memory_transformer))
        for lth in range(n_layers)
    ])
    self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self._odim = d_model

    if n_layers_sub1 > 0:
        if task_specific_layer:
            self.layer_sub1 = TransformerEncoderBlock(
                d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                dropout_residual * n_layers_sub1 / n_layers,
                layer_norm_eps, ffn_activation, param_init)
        self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        # guard last_proj_dim > 0 to avoid creating a 0-dim bridge Linear
        # (consistent with the other encoder variants in this file)
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)
    if n_layers_sub2 > 0:
        if task_specific_layer:
            self.layer_sub2 = TransformerEncoderBlock(
                d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                dropout_residual * n_layers_sub2 / n_layers,
                layer_norm_eps, ffn_activation, param_init)
        self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
        self.bridge = nn.Linear(self._odim, last_proj_dim)
        self._odim = last_proj_dim

    # calculate subsampling factor
    self._factor = 1
    if self.conv is not None:
        self._factor *= self.conv.subsampling_factor

    self.reset_parameters(param_init)
def build_encoder(args):
    """Instantiate the encoder network selected by ``args.enc_type``.

    Args:
        args (argparse.Namespace): training configuration; which attributes
            are read depends on the selected encoder type
    Returns:
        encoder: one of TDSEncoder, GatedConvEncoder, TransformerEncoder,
            ConformerEncoder, or RNNEncoder

    """
    # Optional CNN frontend shared by the Transformer/Conformer/RNN encoders
    if 'conv' in args.enc_type:
        assert args.n_stacks == 1 and args.n_splices == 1
        from neural_sp.models.seq2seq.encoders.conv import ConvEncoder
        conv = ConvEncoder(
            args.input_dim,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            strides=args.conv_strides,
            poolings=args.conv_poolings,
            dropout=0.,
            normalization=args.conv_normalization,
            residual=False,
            # (con/trans)former encoders expect d_model-sized CNN output
            bottleneck_dim=args.transformer_enc_d_model
            if 'former' in args.enc_type else args.conv_bottleneck_dim,
            param_init=args.param_init)
    else:
        conv = None

    # safeguard: older configs only define the shared transformer_* names;
    # copy them over to the encoder/decoder-specific attributes
    if not hasattr(args, 'transformer_enc_d_model') and hasattr(
            args, 'transformer_d_model'):
        args.transformer_enc_d_model = args.transformer_d_model
        args.transformer_dec_d_model = args.transformer_d_model
    if not hasattr(args, 'transformer_enc_d_ff') and hasattr(
            args, 'transformer_d_ff'):
        args.transformer_enc_d_ff = args.transformer_d_ff
    if not hasattr(args, 'transformer_enc_n_heads') and hasattr(
            args, 'transformer_n_heads'):
        args.transformer_enc_n_heads = args.transformer_n_heads

    if args.enc_type == 'tds':
        from neural_sp.models.seq2seq.encoders.tds import TDSEncoder
        encoder = TDSEncoder(
            input_dim=args.input_dim * args.n_stacks,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            dropout=args.dropout_enc,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else args.dec_n_units)
    elif args.enc_type == 'gated_conv':
        from neural_sp.models.seq2seq.encoders.gated_conv import GatedConvEncoder
        encoder = GatedConvEncoder(
            input_dim=args.input_dim * args.n_stacks,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            dropout=args.dropout_enc,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else args.dec_n_units,
            param_init=args.param_init)
    elif 'transformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.transformer import TransformerEncoder
        encoder = TransformerEncoder(
            input_dim=args.input_dim
            if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_heads=args.transformer_enc_n_heads,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_enc_d_model,
            d_ff=args.transformer_enc_d_ff,
            ffn_bottleneck_dim=args.transformer_ffn_bottleneck_dim,
            ffn_activation=args.transformer_ffn_activation,
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else 0,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            frontend_conv=conv,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            clamp_len=args.transformer_enc_clamp_len,
            lookahead=args.transformer_enc_lookaheads,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right,
            streaming_type=args.lc_type)
    elif 'conformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.conformer import ConformerEncoder
        encoder = ConformerEncoder(
            input_dim=args.input_dim
            if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_heads=args.transformer_enc_n_heads,
            kernel_size=args.conformer_kernel_size,
            normalization=args.conformer_normalization,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_enc_d_model,
            d_ff=args.transformer_enc_d_ff,
            ffn_bottleneck_dim=args.transformer_ffn_bottleneck_dim,
            ffn_activation='swish',  # Conformer always uses swish
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else 0,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            frontend_conv=conv,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            clamp_len=args.transformer_enc_clamp_len,
            lookahead=args.transformer_enc_lookaheads,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right,
            streaming_type=args.lc_type)
    else:
        from neural_sp.models.seq2seq.encoders.rnn import RNNEncoder
        encoder = RNNEncoder(
            input_dim=args.input_dim
            if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_units=args.enc_n_units,
            n_projs=args.enc_n_projs,
            last_proj_dim=args.transformer_dec_d_model
            if 'transformer' in args.dec_type else 0,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            frontend_conv=conv,
            bidir_sum_fwd_bwd=args.bidirectional_sum_fwd_bwd,
            task_specific_layer=args.task_specific_layer,
            param_init=args.param_init,
            chunk_size_current=args.lc_chunk_size_left,  # for compatibility
            chunk_size_right=args.lc_chunk_size_right,
            cnn_lookahead=args.cnn_lookahead,
            rsp_prob=args.rsp_prob_enc)
    return encoder
def __init__(self, input_dim, enc_type, n_heads,
             n_layers, n_layers_sub1, n_layers_sub2,
             d_model, d_ff, ffn_bottleneck_dim, last_proj_dim,
             pe_type, layer_norm_eps, ffn_activation,
             dropout_in, dropout, dropout_att, dropout_layer,
             subsample, subsample_type, n_stacks, n_splices,
             conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
             conv_poolings, conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
             conv_param_init, task_specific_layer, param_init,
             chunk_size_left, chunk_size_current, chunk_size_right,
             latency_control_type):
    """Build a (streaming) Transformer encoder.

    Supports an optional CNN frontend (``'conv' in enc_type``), per-layer
    subsampling (``subsample``, an underscore-separated string of integer
    factors, one per layer), relative positional embeddings, and
    latency-controlled chunked encoding via ``chunk_size_*``.
    Hyperparameters mirror the constructor arguments one-to-one.
    """
    super(TransformerEncoder, self).__init__()

    # parse subsample: e.g. '1_2_2_1' -> one integer factor per layer.
    # Fix: validate the parsed list BEFORE padding — the previous check
    # compared the already-padded list against n_layers and could never fire.
    factors = list(map(int, subsample.split('_'))) if subsample else []
    if len(factors) > 0 and len(factors) != n_layers:
        raise ValueError(
            'subsample must be the same size as n_layers. n_layers: %d, subsample: %s'
            % (n_layers, factors))
    subsamples = [1] * n_layers
    for lth, s in enumerate(factors):
        subsamples[lth] = s

    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')
    assert enc_type in ['transformer', 'conv_transformer']

    self.d_model = d_model
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.pe_type = pe_type
    self.scale = math.sqrt(d_model)  # input embedding scale

    # for streaming encoder
    self.chunk_size_left = chunk_size_left
    self.chunk_size_current = chunk_size_current
    self.chunk_size_right = chunk_size_right
    self.latency_controlled = chunk_size_left > 0 or chunk_size_current > 0 or chunk_size_right > 0
    self.lc_type = latency_control_type
    # reshape) not lookahead frames in CNN layers, but requires some additional computations
    # mask) there are some lookahead frames in CNN layers, no additional computations

    # TransformerXL like streaming encoder
    # NOTE(review): the enc_type assert above excludes 'transformer_xl', so
    # this flag appears to be always False here — confirm intended enc_types
    self.memory_transformer = ('transformer_xl' in enc_type)
    self.mem_len = chunk_size_left
    if self.memory_transformer:
        assert pe_type == 'relative'
        assert chunk_size_left > 0
        assert chunk_size_current > 0

    # for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # for attention plot
    self.aws_dict = {}
    self.data_dict = {}

    # Setting for CNNs
    if 'conv' in enc_type:
        assert conv_channels
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim,
                                in_channel=conv_in_channel,
                                channels=conv_channels,
                                kernel_sizes=conv_kernel_sizes,
                                strides=conv_strides,
                                poolings=conv_poolings,
                                dropout=0.,
                                batch_norm=conv_batch_norm,
                                layer_norm=conv_layer_norm,
                                layer_norm_eps=layer_norm_eps,
                                residual=False,
                                bottleneck_dim=d_model,
                                param_init=conv_param_init)
        self._odim = self.conv.output_dim
    else:
        self.conv = None
        self._odim = input_dim * n_splices * n_stacks
        # only needed without the CNN frontend (conv already outputs d_model)
        self.embed = nn.Linear(self._odim, d_model)

    # calculate subsampling factor
    self._factor = 1
    if self.conv is not None:
        self._factor *= self.conv.subsampling_factor

    self.subsample = None
    if np.prod(subsamples) > 1:
        self._factor *= np.prod(subsamples)
        if subsample_type == 'max_pool':
            self.subsample = nn.ModuleList(
                [MaxpoolSubsampler(factor) for factor in subsamples])
        elif subsample_type == 'concat':
            self.subsample = nn.ModuleList(
                [ConcatSubsampler(factor, self._odim) for factor in subsamples])
        elif subsample_type == 'drop':
            self.subsample = nn.ModuleList(
                [DropSubsampler(factor) for factor in subsamples])
        elif subsample_type == '1dconv':
            self.subsample = nn.ModuleList(
                [Conv1dSubsampler(factor, self._odim) for factor in subsamples])

    # chunk sizes are specified in input frames and must be divisible by the
    # total subsampling factor
    if self.chunk_size_left > 0:
        assert self.chunk_size_left % self._factor == 0
    if self.chunk_size_current > 0:
        assert self.chunk_size_current % self._factor == 0
    if self.chunk_size_right > 0:
        assert self.chunk_size_right % self._factor == 0

    self.pos_emb = None
    self.u = None
    self.v = None
    if self.memory_transformer:
        self.pos_emb = XLPositionalEmbedding(d_model, dropout)
        self.u = nn.Parameter(torch.Tensor(n_heads, d_model // n_heads))
        self.v = nn.Parameter(torch.Tensor(n_heads, d_model // n_heads))
        # NOTE: u and v are global parameters
    elif pe_type == 'relative':
        self.pos_emb = XLPositionalEmbedding(d_model, dropout)
    else:
        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type, param_init)

    self.layers = nn.ModuleList([
        copy.deepcopy(
            TransformerEncoderBlock(d_model, d_ff, n_heads,
                                    dropout, dropout_att, dropout_layer,
                                    layer_norm_eps, ffn_activation, param_init,
                                    relative_attention=self.pos_emb is not None,
                                    ffn_bottleneck_dim=ffn_bottleneck_dim))
        for _ in range(n_layers)
    ])
    self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self._odim = d_model

    if n_layers_sub1 > 0:
        if task_specific_layer:
            self.layer_sub1 = TransformerEncoderBlock(
                d_model, d_ff, n_heads,
                dropout, dropout_att, dropout_layer,
                layer_norm_eps, ffn_activation, param_init,
                ffn_bottleneck_dim=ffn_bottleneck_dim)
        self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)
    if n_layers_sub2 > 0:
        if task_specific_layer:
            self.layer_sub2 = TransformerEncoderBlock(
                d_model, d_ff, n_heads,
                dropout, dropout_att, dropout_layer,
                layer_norm_eps, ffn_activation, param_init,
                ffn_bottleneck_dim=ffn_bottleneck_dim)
        self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
        self.bridge = nn.Linear(self._odim, last_proj_dim)
        self._odim = last_proj_dim

    self.reset_parameters(param_init)
def __init__(self, input_dim, attn_type, attn_n_heads, n_layers, d_model, d_ff,
             pe_type, dropout_in=0, dropout=0, dropout_att=0,
             layer_norm_eps=1e-6, n_stacks=1, n_splices=1,
             conv_in_channel=1, conv_channels=0, conv_kernel_sizes=[],
             conv_strides=[], conv_poolings=[], conv_batch_norm=False,
             conv_residual=False, conv_bottleneck_dim=0):
    """Build a Transformer encoder with an optional CNN frontend.

    The conv_* geometry arguments are underscore-separated strings,
    e.g. ``conv_channels='32_32'`` and ``conv_kernel_sizes='(3,3)_(3,3)'``.
    NOTE(review): the list defaults in the signature are kept for interface
    compatibility; they are only reachable when ``conv_channels`` is falsy,
    in which case they are never read (mutable defaults are harmless here
    but would be better as None).
    """
    super(TransformerEncoder, self).__init__()

    self.d_model = d_model
    self.pe_type = pe_type

    def _parse_pairs(spec):
        """Parse a '(h,w)_(h,w)_...' string into a list of [h, w] int pairs."""
        if len(spec) == 0:
            return []
        return [[int(p.split(',')[0].replace('(', '')),
                 int(p.split(',')[1].replace(')', ''))]
                for p in spec.split('_')]

    # Setting for CNNs before RNNs
    if conv_channels:
        channels = [int(c) for c in conv_channels.split('_')
                    ] if len(conv_channels) > 0 else []
        kernel_sizes = _parse_pairs(conv_kernel_sizes)
        strides = _parse_pairs(conv_strides)
        poolings = _parse_pairs(conv_poolings)
    else:
        channels = []
        kernel_sizes = []
        strides = []
        poolings = []

    if len(channels) > 0:
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim * n_stacks,
                                in_channel=conv_in_channel,
                                channels=channels,
                                kernel_sizes=kernel_sizes,
                                strides=strides,
                                poolings=poolings,
                                dropout=0,
                                batch_norm=conv_batch_norm,
                                residual=conv_residual,
                                bottleneck_dim=d_model)
        self._output_dim = self.conv.output_dim
    else:
        self._output_dim = input_dim * n_splices * n_stacks
        self.conv = None

    self.embed_in = LinearND(self._output_dim, d_model,
                             dropout=0)  # NOTE: do not apply dropout here
    if pe_type:
        self.pos_emb_in = PositionalEncoding(d_model, dropout_in, pe_type)
    self.layer_norm_in = nn.LayerNorm(d_model, eps=layer_norm_eps)

    # Self-attention layers
    self.layers = nn.ModuleList([
        TransformerEncoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                dropout, dropout_att, layer_norm_eps)
        for _ in range(n_layers)
    ])
    self.layer_norm_top = nn.LayerNorm(d_model, eps=layer_norm_eps)
    self._output_dim = d_model
class RNNEncoder(EncoderBase):
    """RNN encoder.

    Args:
        input_dim (int): dimension of input features (freq * channel)
        rnn_type (str): type of encoder (including pure CNN layers)
        n_units (int): number of units in each layer
        n_projs (int): number of units in each projection layer
        last_proj_dim (int): dimension of the last projection layer
        n_layers (int): number of layers
        n_layers_sub1 (int): number of layers in the 1st auxiliary task
        n_layers_sub2 (int): number of layers in the 2nd auxiliary task
        dropout_in (float): dropout probability for input-hidden connection
        dropout (float): dropout probability for hidden-hidden connection
        subsample (list): subsample in the corresponding RNN layers
            ex.) [False, True, True, False] means that subsample is conducted
            in the 2nd and 3rd layers.
        subsample_type (str): drop/concat/max_pool
        n_stacks (int): number of frames to stack
        n_splices (int): number of frames to splice
        conv_in_channel (int): number of channels of input features
        conv_channels (int): number of channels in the CNN blocks
        conv_kernel_sizes (list): size of kernels in the CNN blocks
        conv_strides (list): number of strides in the CNN blocks
        conv_poolings (list): size of poolings in the CNN blocks
        conv_batch_norm (bool): apply batch normalization only in the CNN blocks
        conv_layer_norm (bool): apply layer normalization only in the CNN blocks
        conv_bottleneck_dim (int): dimension of the bottleneck layer between CNN and RNN layers
        nin (bool): insert 1*1 conv + batch normalization + ReLU
        bidirectional_sum_fwd_bwd (bool): sum forward and backward RNN outputs
            instead of concatenating them (halves the output dimension)
        task_specific_layer (bool): add a dedicated RNN layer per auxiliary task
        param_init (float): bound of the uniform parameter initialization
        lc_chunk_size_left (int): left chunk size for latency-controlled bidirectional encoder
        lc_chunk_size_right (int): right chunk size for latency-controlled bidirectional encoder
        lc_state_reset_prob (float): probability to reset states for latency-controlled bidirectional encoder

    """

    def __init__(self, input_dim, rnn_type, n_units, n_projs, last_proj_dim,
                 n_layers, n_layers_sub1, n_layers_sub2,
                 dropout_in, dropout,
                 subsample, subsample_type, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes,
                 conv_strides, conv_poolings, conv_batch_norm, conv_layer_norm,
                 conv_bottleneck_dim, nin,
                 bidirectional_sum_fwd_bwd, task_specific_layer, param_init,
                 lc_chunk_size_left, lc_chunk_size_right, lc_state_reset_prob):

        super(RNNEncoder, self).__init__()

        if len(subsample) > 0 and len(subsample) != n_layers:
            raise ValueError('subsample must be the same size as n_layers.')
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.rnn_type = rnn_type
        self.bidirectional = True if ('blstm' in rnn_type or 'bgru' in rnn_type) else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers

        # for latency-controlled (chunked bidirectional) encoding
        self.latency_controlled = lc_chunk_size_left > 0 or lc_chunk_size_right > 0
        self.lc_chunk_size_left = lc_chunk_size_left
        self.lc_chunk_size_right = lc_chunk_size_right
        self.lc_state_reset_prob = lc_state_reset_prob
        if self.latency_controlled:
            # auxiliary tasks are not supported in streaming mode
            assert n_layers_sub1 == 0
            assert n_layers_sub2 == 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        # Optional CNN frontend (or a pure-CNN encoder for tds/gated_conv)
        if rnn_type == 'tds':
            self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                   in_channel=conv_in_channel,
                                   channels=conv_channels,
                                   kernel_sizes=conv_kernel_sizes,
                                   dropout=dropout,
                                   bottleneck_dim=last_proj_dim)
        elif rnn_type == 'gated_conv':
            self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                         in_channel=conv_in_channel,
                                         channels=conv_channels,
                                         kernel_sizes=conv_kernel_sizes,
                                         dropout=dropout,
                                         bottleneck_dim=last_proj_dim,
                                         param_init=param_init)
        elif 'conv' in rnn_type:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    residual=False,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
        else:
            self.conv = None

        if self.conv is None:
            self._odim = input_dim * n_splices * n_stacks
        else:
            self._odim = self.conv.output_dim
            subsample = [1] * self.n_layers
            logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')

        self.padding = Padding(bidirectional_sum_fwd_bwd=bidirectional_sum_fwd_bwd)

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            self.rnn = nn.ModuleList()
            if self.latency_controlled:
                # separate backward RNNs (fwd/bwd run independently per chunk)
                self.rnn_bwd = nn.ModuleList()
            self.dropout = nn.Dropout(p=dropout)
            self.proj = None
            if n_projs > 0:
                self.proj = nn.ModuleList()

            # subsample
            self.subsample_layer = None
            if subsample_type == 'max_pool' and np.prod(subsample) > 1:
                self.subsample_layer = nn.ModuleList([MaxpoolSubsampler(subsample[l])
                                                      for l in range(n_layers)])
            elif subsample_type == 'concat' and np.prod(subsample) > 1:
                self.subsample_layer = nn.ModuleList([ConcatSubsampler(subsample[l], n_units * self.n_dirs)
                                                      for l in range(n_layers)])
            elif subsample_type == 'drop' and np.prod(subsample) > 1:
                self.subsample_layer = nn.ModuleList([DropSubsampler(subsample[l])
                                                      for l in range(n_layers)])
            elif subsample_type == '1dconv' and np.prod(subsample) > 1:
                self.subsample_layer = nn.ModuleList([Conv1dSubsampler(subsample[l], n_units * self.n_dirs)
                                                      for l in range(n_layers)])

            # NiN
            self.nin = nn.ModuleList() if nin else None

            for l in range(n_layers):
                if 'lstm' in rnn_type:
                    rnn_i = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError('rnn_type must be "(conv_)(b/lcb)lstm" or "(conv_)(b/lcb)gru".')

                if self.latency_controlled:
                    # two unidirectional RNNs instead of one bidirectional RNN
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                    self.rnn_bwd += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                else:
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True,
                                       bidirectional=self.bidirectional)]
                self._odim = n_units if bidirectional_sum_fwd_bwd else n_units * self.n_dirs
                self.bidirectional_sum_fwd_bwd = bidirectional_sum_fwd_bwd

                # Projection layer
                if self.proj is not None:
                    if l != n_layers - 1:
                        self.proj += [nn.Linear(n_units * self.n_dirs, n_projs)]
                        self._odim = n_projs

                # Task specific layer
                if l == n_layers_sub1 - 1 and task_specific_layer:
                    self.rnn_sub1 = rnn_i(self._odim, n_units, 1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub1 = nn.Linear(n_units, last_proj_dim)
                if l == n_layers_sub2 - 1 and task_specific_layer:
                    self.rnn_sub2 = rnn_i(self._odim, n_units, 1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub2 = nn.Linear(n_units, last_proj_dim)

                # Network in network
                if self.nin is not None:
                    if l != n_layers - 1:
                        self.nin += [NiN(self._odim)]
                # if n_layers_sub1 > 0 or n_layers_sub2 > 0:
                #     assert task_specific_layer

        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            # NOTE(review): called as a method here, while the Transformer
            # encoder uses it as a property — confirm ConvEncoder's API
            self._factor *= self.conv.subsampling_factor()
        self._factor *= np.prod(subsample)

        self.reset_parameters(param_init)

        # for streaming inference
        self.reset_cache()

    def reset_parameters(self, param_init):
        """Initialize parameters with uniform distribution."""
        logger.info('===== Initialize %s =====' % self.__class__.__name__)
        for n, p in self.named_parameters():
            if 'conv' in n or 'tds' in n or 'gated_conv' in n:
                continue  # for CNN layers before RNN layers
            if p.dim() == 1:
                nn.init.constant_(p, 0.)  # bias
                logger.info('Initialize %s with %s / %.3f' % (n, 'constant', 0.))
            elif p.dim() in [2, 4]:
                nn.init.uniform_(p, a=-param_init, b=param_init)
                logger.info('Initialize %s with %s / %.3f' % (n, 'uniform', param_init))
            else:
                raise ValueError(n)

    def reset_cache(self):
        """Discard the cached forward RNN states kept for streaming inference."""
        self.fwd_states = [None] * self.n_layers
        logger.debug('Reset cache.')

    def forward(self, xs, xlens, task, use_cache=False, streaming=False):
        """Forward computation.

        Args:
            xs (FloatTensor): `[B, T, input_dim]`
            xlens (list): A list of length `[B]`
            task (str): all or ys or ys_sub1 or ys_sub2
            use_cache (bool): use the cached forward encoder state in the previous chunk as the initial state
            streaming (bool): streaming encoding
        Returns:
            eouts (dict):
                xs (FloatTensor): `[B, T // prod(subsample), n_units (*2)]`
                xlens (IntTensor): `[B]`
                xs_sub1 (FloatTensor): `[B, T // prod(subsample), n_units (*2)]`
                xlens_sub1 (IntTensor): `[B]`
                xs_sub2 (FloatTensor): `[B, T // prod(subsample), n_units (*2)]`
                xlens_sub2 (IntTensor): `[B]`

        """
        eouts = {'ys': {'xs': None, 'xlens': None},
                 'ys_sub1': {'xs': None, 'xlens': None},
                 'ys_sub2': {'xs': None, 'xlens': None}}

        # Sort by lengths in the descending order for pack_padded_sequence
        xlens, perm_ids = torch.IntTensor(xlens).sort(0, descending=True)
        xs = xs[perm_ids]
        _, perm_ids_unsort = perm_ids.sort()

        # Dropout for inputs-hidden connection
        xs = self.dropout_in(xs)

        # Path through CNN blocks before RNN layers
        if self.conv is not None:
            xs, xlens = self.conv(xs, xlens)
            if self.rnn_type in ['conv', 'tds', 'gated_conv']:
                # pure-CNN encoder: no RNN layers to run
                eouts['ys']['xs'] = xs
                eouts['ys']['xlens'] = xlens
                return eouts

        if not use_cache:
            self.reset_cache()

        if self.latency_controlled:
            # Flip the layer and time loop
            xs, xlens = self._forward_streaming(xs, xlens, streaming)
        else:
            for l in range(self.n_layers):
                self.rnn[l].flatten_parameters()  # for multi-GPUs
                xs, self.fwd_states[l] = self.padding(xs, xlens, self.rnn[l],
                                                      prev_state=self.fwd_states[l])
                xs = self.dropout(xs)

                # Pick up outputs in the sub task before the projection layer
                if l == self.n_layers_sub1 - 1:
                    xs_sub1, xlens_sub1 = self.sub_module(xs, xlens, perm_ids_unsort, 'sub1')
                    if task == 'ys_sub1':
                        eouts[task]['xs'], eouts[task]['xlens'] = xs_sub1, xlens_sub1
                        return eouts
                if l == self.n_layers_sub2 - 1:
                    xs_sub2, xlens_sub2 = self.sub_module(xs, xlens, perm_ids_unsort, 'sub2')
                    if task == 'ys_sub2':
                        eouts[task]['xs'], eouts[task]['xlens'] = xs_sub2, xlens_sub2
                        return eouts

                # NOTE: Exclude the last layer
                if l != self.n_layers - 1:
                    # Projection layer -> Subsampling -> NiN
                    if self.proj is not None:
                        xs = torch.tanh(self.proj[l](xs))
                    if self.subsample_layer is not None:
                        xs, xlens = self.subsample_layer[l](xs, xlens)
                    if self.nin is not None:
                        xs = self.nin[l](xs)

        # Bridge layer
        if self.bridge is not None:
            xs = self.bridge(xs)

        # Unsort
        xs = xs[perm_ids_unsort]
        xlens = xlens[perm_ids_unsort]

        if task in ['all', 'ys']:
            eouts['ys']['xs'], eouts['ys']['xlens'] = xs, xlens
        if self.n_layers_sub1 >= 1 and task == 'all':
            eouts['ys_sub1']['xs'], eouts['ys_sub1']['xlens'] = xs_sub1, xlens_sub1
        if self.n_layers_sub2 >= 1 and task == 'all':
            eouts['ys_sub2']['xs'], eouts['ys_sub2']['xlens'] = xs_sub2, xlens_sub2
        return eouts

    def _forward_streaming(self, xs, xlens, streaming):
        """Streaming encoding for the latency-controlled bidirectional encoder.

        Args:
            xs (FloatTensor): `[B, T, n_units]`
            xlens (IntTensor): `[B]`
            streaming (bool): encode a single chunk only (streaming inference)
        Returns:
            xs (FloatTensor): `[B, T, n_units]`
            xlens (IntTensor): `[B]`

        """
        # chunk sizes in frames after CNN subsampling
        cs_l = self.lc_chunk_size_left // self.subsampling_factor()
        cs_r = self.lc_chunk_size_right // self.subsampling_factor()

        # full context BPTT (negative left chunk size means no chunking)
        if cs_l < 0:
            for l in range(self.n_layers):
                self.rnn[l].flatten_parameters()  # for multi-GPUs
                self.rnn_bwd[l].flatten_parameters()  # for multi-GPUs
                # bwd: run the backward RNN on the time-reversed sequence
                xs_bwd = torch.flip(xs, dims=[1])
                xs_bwd, _ = self.rnn_bwd[l](xs_bwd, hx=None)
                xs_bwd = torch.flip(xs_bwd, dims=[1])
                # fwd
                xs_fwd, _ = self.rnn[l](xs, hx=None)
                if self.bidirectional_sum_fwd_bwd:
                    xs = xs_fwd + xs_bwd
                else:
                    xs = torch.cat([xs_fwd, xs_bwd], dim=-1)
                xs = self.dropout(xs)
                # Projection layer
                if self.proj is not None and l != self.n_layers - 1:
                    xs = torch.tanh(self.proj[l](xs))
            return xs, xlens

        bs, xmax, input_dim = xs.size()
        n_chunks = 1 if streaming else math.ceil(xmax / cs_l)
        xlens = torch.IntTensor(bs).fill_(cs_l if streaming else xmax)

        xs_chunks = []
        for t in range(0, cs_l * n_chunks, cs_l):
            # each chunk sees cs_r frames of right (future) context
            xs_chunk = xs[:, t:t + (cs_l + cs_r)]
            for l in range(self.n_layers):
                self.rnn[l].flatten_parameters()  # for multi-GPUs
                self.rnn_bwd[l].flatten_parameters()  # for multi-GPUs
                # bwd: backward context is limited to the current chunk
                xs_chunk_bwd = torch.flip(xs_chunk, dims=[1])
                xs_chunk_bwd, _ = self.rnn_bwd[l](xs_chunk_bwd, hx=None)
                xs_chunk_bwd = torch.flip(xs_chunk_bwd, dims=[1])  # `[B, cs_l+cs_r, n_units]`
                # fwd: carries state across chunks via self.fwd_states
                if xs_chunk.size(1) <= cs_l:
                    xs_chunk_fwd, self.fwd_states[l] = self.rnn[l](xs_chunk,
                                                                   hx=self.fwd_states[l])
                    if self.training and self.lc_state_reset_prob > 0 and random.random() < self.lc_state_reset_prob:
                        # randomly drop carried state (regularization for streaming)
                        self.fwd_states[l] = None
                else:
                    xs_chunk_fwd1, self.fwd_states[l] = self.rnn[l](xs_chunk[:, :cs_l],
                                                                    hx=self.fwd_states[l])
                    if self.training and self.lc_state_reset_prob > 0 and random.random() < self.lc_state_reset_prob:
                        self.fwd_states[l] = None
                    # the right-context frames do not update the carried state
                    xs_chunk_fwd2, _ = self.rnn[l](xs_chunk[:, cs_l:],
                                                   hx=self.fwd_states[l])
                    xs_chunk_fwd = torch.cat([xs_chunk_fwd1, xs_chunk_fwd2], dim=1)  # `[B, cs_l+cs_r, n_units]`
                    # NOTE: xs_chunk_fwd2 is for xs_chunk_bwd in the next layer
                if self.bidirectional_sum_fwd_bwd:
                    xs_chunk = xs_chunk_fwd + xs_chunk_bwd
                else:
                    xs_chunk = torch.cat([xs_chunk_fwd, xs_chunk_bwd], dim=-1)
                xs_chunk = self.dropout(xs_chunk)
                # Projection layer
                if self.proj is not None and l != self.n_layers - 1:
                    xs_chunk = torch.tanh(self.proj[l](xs_chunk))
            # keep only the left (current) part; right context is lookahead only
            xs_chunks.append(xs_chunk[:, :cs_l])
        xs = torch.cat(xs_chunks, dim=1)

        return xs, xlens

    def sub_module(self, xs, xlens, perm_ids_unsort, module='sub1'):
        """Compute the auxiliary-task output ('sub1' or 'sub2') from layer activations.

        Runs the task-specific RNN (if configured) and the per-task bridge
        layer, and unsorts the batch back to its original order.
        """
        if self.task_specific_layer:
            getattr(self, 'rnn_' + module).flatten_parameters()  # for multi-GPUs
            xs_sub, _ = self.padding(xs, xlens, getattr(self, 'rnn_' + module))
            xs_sub = self.dropout(xs_sub)
        else:
            xs_sub = xs.clone()[perm_ids_unsort]
        if getattr(self, 'bridge_' + module) is not None:
            xs_sub = getattr(self, 'bridge_' + module)(xs_sub)
        xlens_sub = xlens[perm_ids_unsort]
        return xs_sub, xlens_sub
class TransformerEncoder(EncoderBase):
    """Transformer encoder.

    Args:
        input_dim (int): dimension of input features (freq * channel)
        enc_type (str): type of encoder
        attn_type (str): type of attention
        n_heads (int): number of heads for multi-head attention
        n_layers (int): number of blocks
        n_layers_sub1 (int): number of layers in the 1st auxiliary task
        n_layers_sub2 (int): number of layers in the 2nd auxiliary task
        d_model (int): dimension of MultiheadAttentionMechanism
        d_ff (int): dimension of PositionwiseFeedForward
        last_proj_dim (int): dimension of the last projection layer (0: disabled)
        pe_type (str): type of positional encoding
        layer_norm_eps (float): epsilon value for layer normalization
        ffn_activation (str): nonlinear function for PositionwiseFeedForward
        dropout_in (float): dropout probability for input-hidden connection
        dropout (float): dropout probabilities for linear layers
        dropout_att (float): dropout probabilities for attention distributions
        dropout_residual (float): dropout probability for stochastic residual connections
        n_stacks (int): number of frames to stack
        n_splices (int): frames to splice. Default is 1 frame.
        conv_in_channel (int): number of channels of input features
        conv_channels (int): number of channels in the CNN blocks
        conv_kernel_sizes (list): size of kernels in the CNN blocks
        conv_strides (list): number of strides in the CNN blocks
        conv_poolings (list): size of poolings in the CNN blocks
        conv_batch_norm (bool): apply batch normalization only in the CNN blocks
        conv_layer_norm (bool): apply layer normalization only in the CNN blocks
        conv_bottleneck_dim (int): dimension of the bottleneck layer between CNN and self-attention layers
        conv_param_init (float): only for CNN layers before Transformer layers
        chunk_size_left (int): left chunk size for time-restricted Transformer encoder
        chunk_size_current (int): current chunk size for time-restricted Transformer encoder
        chunk_size_right (int): right chunk size for time-restricted Transformer encoder
        task_specific_layer (bool): add a task specific layer for each sub task
        param_init (str): parameter initialization method

    """

    def __init__(self, input_dim, enc_type, attn_type, n_heads,
                 n_layers, n_layers_sub1, n_layers_sub2,
                 d_model, d_ff, last_proj_dim,
                 pe_type, layer_norm_eps, ffn_activation,
                 dropout_in, dropout, dropout_att, dropout_residual,
                 n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
                 conv_poolings, conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 conv_param_init, task_specific_layer, param_init,
                 chunk_size_left, chunk_size_current, chunk_size_right):

        super(TransformerEncoder, self).__init__()

        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type

        # for latency-controlled (time-restricted) encoding
        self.chunk_size_left = chunk_size_left
        self.chunk_size_cur = chunk_size_current
        self.chunk_size_right = chunk_size_right

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # for attention plot
        self.aws_dict = {}
        self.data_dict = {}

        # Setting for CNNs before self-attention layers
        if conv_channels:
            assert n_stacks == 1 and n_splices == 1
            # The CNN front-end projects directly to d_model via its bottleneck,
            # so no separate embedding layer is needed on this path.
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)

        # Stochastic-depth style residual dropout grows linearly with depth.
        self.layers = nn.ModuleList([
            copy.deepcopy(
                TransformerEncoderBlock(d_model, d_ff, attn_type, n_heads,
                                        dropout, dropout_att,
                                        dropout_residual * (l + 1) / n_layers,
                                        layer_norm_eps, ffn_activation,
                                        param_init)) for l in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self._odim = d_model

        if n_layers_sub1 > 0:
            if task_specific_layer:
                self.layer_sub1 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub1 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            # Fix: guard with last_proj_dim > 0 (consistent with the RNN
            # encoders in this file); 0 means "no projection" and previously
            # produced a degenerate nn.Linear(d_model, 0).
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

        if n_layers_sub2 > 0:
            if task_specific_layer:
                self.layer_sub2 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub2 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        # calculate subsampling factor (only the CNN front-end subsamples)
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor()

        if param_init == 'xavier_uniform':
            self.reset_parameters()

    def reset_parameters(self):
        """Initialize parameters with Xavier uniform distribution."""
        logger.info('===== Initialize %s with Xavier uniform distribution =====' %
                    self.__class__.__name__)
        # Only the layers created directly by this class are initialized here;
        # ConvEncoder and TransformerEncoderBlock initialize themselves.
        if self.conv is None:
            nn.init.xavier_uniform_(self.embed.weight)
            nn.init.constant_(self.embed.bias, 0.)
        if self.bridge is not None:
            nn.init.xavier_uniform_(self.bridge.weight)
            nn.init.constant_(self.bridge.bias, 0.)

    def forward(self, xs, xlens, task, use_cache=False, streaming=False):
        """Forward computation.

        Args:
            xs (FloatTensor): `[B, T, input_dim]`
            xlens (list): `[B]`
            task (str): 'all', 'ys', 'ys_sub1', or 'ys_sub2'
            use_cache (bool): not supported now
            streaming (bool): streaming encoding (not used here)
        Returns:
            eouts (dict):
                xs (FloatTensor): `[B, T, d_model]`
                xlens (list): `[B]`

        """
        eouts = {'ys': {'xs': None, 'xlens': None},
                 'ys_sub1': {'xs': None, 'xlens': None},
                 'ys_sub2': {'xs': None, 'xlens': None}}

        if self.conv is None:
            xs = self.embed(xs)
        else:
            # Path through CNN blocks before self-attention layers
            xs, xlens = self.conv(xs, xlens)

        if not self.training:
            self.data_dict['elens'] = tensor2np(xlens)

        bs, xmax, idim = xs.size()
        xs = self.pos_enc(xs)

        if self.chunk_size_left > 0:
            # Time-restricted self-attention for streaming models:
            # pad left/right, run each chunk through all layers with full
            # attention inside the chunk, then keep the "current" slice.
            cs_l = self.chunk_size_left
            cs_c = self.chunk_size_cur
            cs_r = self.chunk_size_right
            xs_chunks = []
            xx_aws = [[] for l in range(self.n_layers)]
            xs_pad = torch.cat([xs.new_zeros(bs, cs_l, idim),
                                xs,
                                xs.new_zeros(bs, cs_r, idim)], dim=1)
            # TODO: remove right padding
            for t in range(cs_l, cs_l + xmax, cs_c):
                xs_chunk = xs_pad[:, t - cs_l:t + cs_c + cs_r]
                for l, layer in enumerate(self.layers):
                    xs_chunk, xx_aws_chunk = layer(xs_chunk, None)  # no mask
                    xx_aws[l].append(xx_aws_chunk[:, :, cs_l:cs_l + cs_c,
                                                  cs_l:cs_l + cs_c])
                xs_chunks.append(xs_chunk[:, cs_l:cs_l + cs_c])
            xs = torch.cat(xs_chunks, dim=1)[:, :xmax]
            if not self.training:
                for l in range(self.n_layers):
                    self.aws_dict['xx_aws_layer%d' % l] = tensor2np(
                        torch.cat(xx_aws[l], dim=3)[:, :, :xmax, :xmax])
        else:
            # Create the self-attention padding mask
            xx_mask = make_pad_mask(xlens, self.device_id).unsqueeze(2).repeat(
                [1, 1, xmax])

            for l, layer in enumerate(self.layers):
                xs, xx_aws = layer(xs, xx_mask)
                if not self.training:
                    self.aws_dict['xx_aws_layer%d' % l] = tensor2np(xx_aws)

                # Pick up outputs in the sub task before the projection layer
                if l == self.n_layers_sub1 - 1:
                    xs_sub1 = self.layer_sub1(xs, xx_mask)[0] if self.task_specific_layer else xs.clone()
                    xs_sub1 = self.norm_out_sub1(xs_sub1)
                    if self.bridge_sub1 is not None:
                        xs_sub1 = self.bridge_sub1(xs_sub1)
                    if task == 'ys_sub1':
                        eouts[task]['xs'], eouts[task]['xlens'] = xs_sub1, xlens
                        return eouts
                if l == self.n_layers_sub2 - 1:
                    xs_sub2 = self.layer_sub2(xs, xx_mask)[0] if self.task_specific_layer else xs.clone()
                    xs_sub2 = self.norm_out_sub2(xs_sub2)
                    if self.bridge_sub2 is not None:
                        xs_sub2 = self.bridge_sub2(xs_sub2)
                    if task == 'ys_sub2':
                        eouts[task]['xs'], eouts[task]['xlens'] = xs_sub2, xlens
                        return eouts

        xs = self.norm_out(xs)

        # Bridge layer
        if self.bridge is not None:
            xs = self.bridge(xs)

        if task in ['all', 'ys']:
            eouts['ys']['xs'], eouts['ys']['xlens'] = xs, xlens
        if self.n_layers_sub1 >= 1 and task == 'all':
            eouts['ys_sub1']['xs'], eouts['ys_sub1']['xlens'] = xs_sub1, xlens
        if self.n_layers_sub2 >= 1 and task == 'all':
            eouts['ys_sub2']['xs'], eouts['ys_sub2']['xlens'] = xs_sub2, xlens
        return eouts

    def _plot_attention(self, save_path, n_cols=2):
        """Plot attention for each head in all layers."""
        from matplotlib import pyplot as plt
        from matplotlib.ticker import MaxNLocator

        _save_path = mkdir_join(save_path, 'enc_att_weights')

        # Clean directory
        if _save_path is not None and os.path.isdir(_save_path):
            shutil.rmtree(_save_path)
            os.mkdir(_save_path)

        for k, aw in self.aws_dict.items():
            elens = self.data_dict['elens']

            plt.clf()
            n_heads = aw.shape[1]
            n_cols_tmp = 1 if n_heads == 1 else n_cols
            fig, axes = plt.subplots(max(1, n_heads // n_cols_tmp), n_cols_tmp,
                                     figsize=(20, 8), squeeze=False)
            for h in range(n_heads):
                ax = axes[h // n_cols_tmp, h % n_cols_tmp]
                # Plot the last utterance in the batch, valid frames only.
                ax.imshow(aw[-1, h, :elens[-1], :elens[-1]], aspect="auto")
                ax.grid(False)
                ax.set_xlabel("Input (head%d)" % h)
                ax.set_ylabel("Output (head%d)" % h)
                ax.xaxis.set_major_locator(MaxNLocator(integer=True))
                ax.yaxis.set_major_locator(MaxNLocator(integer=True))
            fig.tight_layout()
            # Fix: 'dvi' is not a valid savefig keyword; 'dpi' was intended.
            fig.savefig(os.path.join(_save_path, '%s.png' % k), dpi=500)
            plt.close()
    def __init__(self, input_dim, enc_type, n_units, n_projs, last_proj_dim,
                 n_layers, n_layers_sub1, n_layers_sub2,
                 dropout_in, dropout,
                 subsample, subsample_type, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
                 conv_poolings, conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 bidir_sum_fwd_bwd, task_specific_layer, param_init,
                 chunk_size_left, chunk_size_right):
        """Build the (optionally latency-controlled) RNN encoder.

        `subsample` and the chunk sizes arrive as '_'-separated strings;
        `chunk_size_left/right > 0` enables latency-controlled bidirectional
        encoding (separate fwd/bwd unidirectional RNNs per layer).
        """
        super(RNNEncoder, self).__init__()

        # parse subsample: '_'-separated per-layer factors, default 1
        subsamples = [1] * n_layers
        for lth, s in enumerate(list(map(int, subsample.split('_')[:n_layers]))):
            subsamples[lth] = s

        if len(subsamples) > 0 and len(subsamples) != n_layers:
            raise ValueError('subsample must be the same size as n_layers. n_layers: %d, subsample: %s' % (n_layers, subsamples))
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers. n_layers: %d, n_layers_sub1: %d' % (n_layers, n_layers_sub1))
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1. n_layers_sub1: %d, n_layers_sub2: %d' % (n_layers_sub1, n_layers_sub2))

        self.enc_type = enc_type
        self.bidirectional = True if ('blstm' in enc_type or 'bgru' in enc_type) else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers
        self.bidir_sum = bidir_sum_fwd_bwd

        # for latency-controlled encoding; chunk sizes are given in input
        # frames, hence the division by the frame-stacking factor
        self.chunk_size_left = int(chunk_size_left.split('_')[0]) // n_stacks
        self.chunk_size_right = int(chunk_size_right.split('_')[0]) // n_stacks
        self.lc_bidir = self.chunk_size_left > 0 or self.chunk_size_right > 0
        if self.lc_bidir:
            # latency-controlled mode requires a bidirectional encoder type
            assert enc_type not in ['lstm', 'gru', 'conv_lstm', 'conv_gru']
            assert n_layers_sub2 == 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        if 'conv' in enc_type:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    residual=False,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks

        if enc_type != 'conv':
            self.rnn = nn.ModuleList()
            if self.lc_bidir:
                # separate backward RNNs for latency-controlled encoding
                self.rnn_bwd = nn.ModuleList()
            self.dropout = nn.Dropout(p=dropout)
            self.proj = nn.ModuleList() if n_projs > 0 else None
            self.subsample = nn.ModuleList() if np.prod(subsamples) > 1 else None
            self.padding = Padding(bidir_sum_fwd_bwd=bidir_sum_fwd_bwd if not self.lc_bidir else False)

            for lth in range(n_layers):
                if 'lstm' in enc_type:
                    rnn_i = nn.LSTM
                elif 'gru' in enc_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError('enc_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                if self.lc_bidir:
                    # two unidirectional RNNs (fwd + bwd) per layer
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                    self.rnn_bwd += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                else:
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True,
                                       bidirectional=self.bidirectional)]
                self._odim = n_units if bidir_sum_fwd_bwd else n_units * self.n_dirs

                # Projection layer (not after the last layer)
                if self.proj is not None:
                    if lth != n_layers - 1:
                        self.proj += [nn.Linear(self._odim, n_projs)]
                        self._odim = n_projs

                # subsample
                if np.prod(subsamples) > 1:
                    if subsample_type == 'max_pool':
                        self.subsample += [MaxpoolSubsampler(subsamples[lth])]
                    elif subsample_type == 'concat':
                        self.subsample += [ConcatSubsampler(subsamples[lth], self._odim)]
                    elif subsample_type == 'drop':
                        self.subsample += [DropSubsampler(subsamples[lth])]
                    elif subsample_type == '1dconv':
                        self.subsample += [Conv1dSubsampler(subsamples[lth], self._odim)]
                    elif subsample_type == 'add':
                        self.subsample += [AddSubsampler(subsamples[lth])]

                # Task specific layer
                if lth == n_layers_sub1 - 1 and task_specific_layer:
                    self.rnn_sub1 = rnn_i(self._odim, n_units, 1, batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub1 = nn.Linear(n_units, last_proj_dim)
                if lth == n_layers_sub2 - 1 and task_specific_layer:
                    assert not self.lc_bidir
                    self.rnn_sub2 = rnn_i(self._odim, n_units, 1, batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub2 = nn.Linear(n_units, last_proj_dim)

            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge = nn.Linear(self._odim, last_proj_dim)
                self._odim = last_proj_dim

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            # NOTE(review): accessed as an attribute here but called as a
            # method in the Transformer encoder — confirm whether
            # ConvEncoder.subsampling_factor is a property or a method.
            self._factor *= self.conv.subsampling_factor
        elif np.prod(subsamples) > 1:
            self._factor *= np.prod(subsamples)
        # NOTE: subsampling factor for frame stacking should not be included here

        # chunk boundaries must align with the subsampled frame rate
        if self.chunk_size_left > 0:
            assert self.chunk_size_left % self._factor == 0
        if self.chunk_size_right > 0:
            assert self.chunk_size_right % self._factor == 0

        self.reset_parameters(param_init)

        # for streaming inference
        self.reset_cache()
    def __init__(self, input_dim, enc_type, attn_type, n_heads,
                 n_layers, n_layers_sub1, n_layers_sub2,
                 d_model, d_ff, last_proj_dim,
                 pe_type, layer_norm_eps, ffn_activation,
                 dropout_in, dropout, dropout_att, dropout_residual,
                 n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
                 conv_poolings, conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 conv_param_init, task_specific_layer, param_init,
                 chunk_size_left, chunk_size_current, chunk_size_right):
        """Build the Transformer encoder (optional CNN front-end, optional
        time-restricted/streaming chunking, optional hierarchical sub-tasks)."""
        super(TransformerEncoder, self).__init__()

        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type

        # for latency-controlled
        self.chunk_size_left = chunk_size_left
        self.chunk_size_cur = chunk_size_current
        self.chunk_size_right = chunk_size_right

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # for attention plot
        self.aws_dict = {}
        self.data_dict = {}

        # Setting for CNNs before RNNs
        if conv_channels:
            assert n_stacks == 1 and n_splices == 1
            # CNN front-end projects to d_model via its bottleneck layer,
            # so the linear embedding below is only built when conv is None.
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks
            self.embed = nn.Linear(self._odim, d_model)

        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)

        # residual dropout rate grows linearly with depth (stochastic depth)
        self.layers = nn.ModuleList([
            copy.deepcopy(
                TransformerEncoderBlock(d_model, d_ff, attn_type, n_heads,
                                        dropout, dropout_att,
                                        dropout_residual * (l + 1) / n_layers,
                                        layer_norm_eps, ffn_activation,
                                        param_init)) for l in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self._odim = d_model

        if n_layers_sub1 > 0:
            if task_specific_layer:
                self.layer_sub1 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub1 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            # NOTE(review): unlike the RNN encoders in this file, there is no
            # `last_proj_dim > 0` guard, so last_proj_dim == 0 would create a
            # zero-dimensional Linear — confirm intended behavior.
            if last_proj_dim != self.output_dim:
                self.bridge_sub1 = nn.Linear(self._odim, last_proj_dim)

        if n_layers_sub2 > 0:
            if task_specific_layer:
                self.layer_sub2 = TransformerEncoderBlock(
                    d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                    dropout_residual * n_layers_sub2 / n_layers,
                    layer_norm_eps, ffn_activation, param_init)
            self.norm_out_sub2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if last_proj_dim != self.output_dim:
                self.bridge_sub2 = nn.Linear(self._odim, last_proj_dim)

        if last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim

        # calculate subsampling factor (only the CNN front-end subsamples)
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor()

        if param_init == 'xavier_uniform':
            self.reset_parameters()
    def __init__(self, input_dim, attn_type, attn_n_heads, n_layers, d_model, d_ff,
                 pe_type='add', layer_norm_eps=1e-6,
                 dropout_in=0, dropout=0, dropout_att=0,
                 last_proj_dim=0, n_stacks=1, n_splices=1,
                 conv_in_channel=1, conv_channels=0,
                 conv_kernel_sizes=[], conv_strides=[], conv_poolings=[],
                 conv_batch_norm=False, conv_residual=False,
                 conv_bottleneck_dim=0, param_init=0.1):
        """Build the (legacy) Transformer encoder with an optional CNN front-end.

        CNN hyperparameters arrive as '_'-separated strings, e.g.
        conv_kernel_sizes='(3,3)_(3,3)', and are parsed into int pairs here.
        NOTE(review): the list-typed defaults ([]) are mutable default
        arguments; they are only read here, but confirm callers never mutate.
        """
        super(TransformerEncoder, self).__init__()
        logger = logging.getLogger("training")

        self.d_model = d_model
        self.n_layers = n_layers
        self.attn_n_heads = attn_n_heads
        self.pe_type = pe_type

        # Setting for CNNs before RNNs
        if conv_channels:
            # parse '_'-separated "(h,w)" pairs into [[h, w], ...]
            channels = [int(c) for c in conv_channels.split('_')
                        ] if len(conv_channels) > 0 else []
            kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                             int(c.split(',')[1].replace(')', ''))]
                            for c in conv_kernel_sizes.split('_')
                            ] if len(conv_kernel_sizes) > 0 else []
            strides = [[int(c.split(',')[0].replace('(', '')),
                        int(c.split(',')[1].replace(')', ''))]
                       for c in conv_strides.split('_')
                       ] if len(conv_strides) > 0 else []
            poolings = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_poolings.split('_')
                        ] if len(conv_poolings) > 0 else []
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []
            # NOTE(review): this warning fires on the *no-CNN* branch, which
            # contradicts its message — confirm whether it belongs in the
            # branch above.
            logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')

        if len(channels) > 0:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=channels,
                                    kernel_sizes=kernel_sizes,
                                    strides=strides,
                                    poolings=poolings,
                                    dropout=0,
                                    batch_norm=conv_batch_norm,
                                    residual=conv_residual,
                                    bottleneck_dim=d_model,
                                    param_init=param_init)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

        self.embed = LinearND(self._output_dim, d_model,
                              dropout=0)  # NOTE: do not apply dropout here

        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                    dropout, dropout_att, layer_norm_eps)
            for l in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        if last_proj_dim != self.output_dim:
            # NOTE(review): at this point self._output_dim still holds the
            # pre-embedding input dim, while the Transformer layers emit
            # d_model — confirm the bridge input dim should not be d_model.
            self.bridge = LinearND(self._output_dim, last_proj_dim,
                                   dropout=dropout)
            self._output_dim = last_proj_dim
        else:
            self.bridge = None
            self._output_dim = d_model

        # Initialize parameters
        self.reset_parameters()
    def __init__(self, input_dim, rnn_type, n_units, n_projs, n_layers,
                 dropout_in, dropout, subsample, subsample_type='drop',
                 n_stacks=1, n_splices=1, last_proj_dim=0,
                 conv_in_channel=1, conv_channels=0,
                 conv_kernel_sizes=[], conv_strides=[], conv_poolings=[],
                 conv_batch_norm=False, conv_residual=False,
                 conv_bottleneck_dim=0, residual=False,
                 n_layers_sub1=0, n_layers_sub2=0, nin=False,
                 task_specific_layer=False, param_init=0.1):
        """Build the (legacy) RNN encoder with optional CNN/TDS/GatedConv
        front-end, per-layer subsampling, NiN blocks and sub-task layers.

        NOTE(review): the list-typed defaults ([]) are mutable default
        arguments; they are only read here, but confirm callers never mutate.
        """
        super(RNNEncoder, self).__init__()
        logger = logging.getLogger("training")

        if len(subsample) > 0 and len(subsample) != n_layers:
            raise ValueError('subsample must be the same size as n_layers.')
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.rnn_type = rnn_type
        self.bidirectional = True if rnn_type in [
            'blstm', 'bgru', 'conv_blstm', 'conv_bgru'
        ] else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_projs = n_projs
        self.n_layers = n_layers

        # Setting for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # Setting for subsampling
        self.subsample = subsample
        self.subsample_type = subsample_type

        # Setting for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Setting for residual connections
        self.residual = residual
        if residual:
            # residual connections require a constant frame rate
            assert np.prod(subsample) == 1

        # Setting for the NiN (Network in Network)
        self.nin = nin

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        # Setting for CNNs before RNNs
        if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
            # parse '_'-separated "(h,w)" pairs into [[h, w], ...]
            channels = [int(c) for c in conv_channels.split('_')
                        ] if len(conv_channels) > 0 else []
            kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                             int(c.split(',')[1].replace(')', ''))]
                            for c in conv_kernel_sizes.split('_')
                            ] if len(conv_kernel_sizes) > 0 else []
            if rnn_type in ['tds', 'gated_conv']:
                # TDS / GatedConv front-ends do their own downsampling
                strides = []
                poolings = []
            else:
                strides = [[int(c.split(',')[0].replace('(', '')),
                            int(c.split(',')[1].replace(')', ''))]
                           for c in conv_strides.split('_')
                           ] if len(conv_strides) > 0 else []
                poolings = [[int(c.split(',')[0].replace('(', '')),
                             int(c.split(',')[1].replace(')', ''))]
                            for c in conv_poolings.split('_')
                            ] if len(conv_poolings) > 0 else []
            if 'conv_' in rnn_type:
                # the CNN front-end handles subsampling instead of the RNNs
                self.subsample = [1] * self.n_layers
                logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []

        if len(channels) > 0:
            if rnn_type == 'tds':
                self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                       in_channel=conv_in_channel,
                                       channels=channels,
                                       kernel_sizes=kernel_sizes,
                                       dropout=dropout,
                                       bottleneck_dim=last_proj_dim)
            elif rnn_type == 'gated_conv':
                self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                             in_channel=conv_in_channel,
                                             channels=channels,
                                             kernel_sizes=kernel_sizes,
                                             dropout=dropout,
                                             bottleneck_dim=last_proj_dim,
                                             param_init=param_init)
            else:
                assert n_stacks == 1 and n_splices == 1
                self.conv = ConvEncoder(input_dim,
                                        in_channel=conv_in_channel,
                                        channels=channels,
                                        kernel_sizes=kernel_sizes,
                                        strides=strides,
                                        poolings=poolings,
                                        dropout=0,
                                        batch_norm=conv_batch_norm,
                                        residual=conv_residual,
                                        bottleneck_dim=conv_bottleneck_dim,
                                        param_init=param_init)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            # Fast implementation without processes between each layer:
            # a single multi-layer cuDNN RNN, usable only when no per-layer
            # processing (subsampling/projection/residual/sub-task/NiN) is needed.
            self.fast_impl = False
            if np.prod(self.subsample) == 1 and self.n_projs == 0 and not residual and n_layers_sub1 == 0 and not nin:
                self.fast_impl = True
                if 'lstm' in rnn_type:
                    rnn = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn = nn.GRU
                else:
                    raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                self.rnn = rnn(self._output_dim, n_units, n_layers,
                               bias=True,
                               batch_first=True,
                               dropout=dropout,
                               bidirectional=self.bidirectional)
                # NOTE: pytorch introduces a dropout layer on the outputs of each layer EXCEPT the last layer
                self._output_dim = n_units * self.n_dirs
                self.dropout_top = nn.Dropout(p=dropout)
            else:
                self.rnn = nn.ModuleList()
                self.dropout = nn.ModuleList()
                if self.n_projs > 0:
                    self.proj = nn.ModuleList()
                # per-layer subsampling modules (None where factor == 1)
                if subsample_type == 'max_pool' and np.prod(self.subsample) > 1:
                    self.max_pool = nn.ModuleList()
                    for l in range(n_layers):
                        if self.subsample[l] > 1:
                            self.max_pool += [nn.MaxPool2d((1, 1),
                                                           stride=(self.subsample[l], 1),
                                                           ceil_mode=True)]
                        else:
                            self.max_pool += [None]
                if subsample_type == 'concat' and np.prod(self.subsample) > 1:
                    self.concat_proj = nn.ModuleList()
                    self.concat_bn = nn.ModuleList()
                    for l in range(n_layers):
                        if self.subsample[l] > 1:
                            self.concat_proj += [LinearND(n_units * self.n_dirs * self.subsample[l],
                                                          n_units * self.n_dirs)]
                            self.concat_bn += [nn.BatchNorm1d(n_units * self.n_dirs)]
                        else:
                            self.concat_proj += [None]
                            self.concat_bn += [None]
                if nin:
                    self.nin_conv = nn.ModuleList()
                    self.nin_bn = nn.ModuleList()

                for l in range(n_layers):
                    if 'lstm' in rnn_type:
                        rnn_i = nn.LSTM
                    elif 'gru' in rnn_type:
                        rnn_i = nn.GRU
                    else:
                        raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                    self.rnn += [rnn_i(self._output_dim, n_units, 1,
                                       bias=True,
                                       batch_first=True,
                                       dropout=0,
                                       bidirectional=self.bidirectional)]
                    self.dropout += [nn.Dropout(p=dropout)]
                    self._output_dim = n_units * self.n_dirs

                    # Projection layer (not after the last layer)
                    if n_projs > 0 and l != n_layers - 1:
                        self.proj += [LinearND(n_units * self.n_dirs, n_projs)]
                        self._output_dim = n_projs

                    # Task specific layer
                    if l == n_layers_sub1 - 1 and task_specific_layer:
                        self.rnn_sub1 = rnn_i(self._output_dim, n_units, 1,
                                              bias=True,
                                              batch_first=True,
                                              dropout=0,
                                              bidirectional=self.bidirectional)
                        self.dropout_sub1 = nn.Dropout(p=dropout)
                        if last_proj_dim != self.output_dim:
                            self.bridge_sub1 = LinearND(n_units, last_proj_dim,
                                                        dropout=dropout)
                    if l == n_layers_sub2 - 1 and task_specific_layer:
                        self.rnn_sub2 = rnn_i(self._output_dim, n_units, 1,
                                              bias=True,
                                              batch_first=True,
                                              dropout=0,
                                              bidirectional=self.bidirectional)
                        self.dropout_sub2 = nn.Dropout(p=dropout)
                        if last_proj_dim != self.output_dim:
                            self.bridge_sub2 = LinearND(n_units, last_proj_dim,
                                                        dropout=dropout)

                    # Network in network (1*1 conv + batch normalization + ReLU)
                    # NOTE: exclude the last layer
                    if nin and l != n_layers - 1:
                        self.nin_conv += [nn.Conv2d(in_channels=self._output_dim,
                                                    out_channels=self._output_dim,
                                                    kernel_size=1,
                                                    stride=1,
                                                    padding=0)]
                        self.nin_bn += [nn.BatchNorm2d(self._output_dim)]

                if n_layers_sub1 > 0 or n_layers_sub2 > 0:
                    assert task_specific_layer

        if last_proj_dim != self.output_dim:
            self.bridge = LinearND(self._output_dim, last_proj_dim,
                                   dropout=dropout)
            self._output_dim = last_proj_dim

        # Initialize parameters
        self.reset_parameters(param_init)
    def __init__(self, input_dim, rnn_type, n_units, n_projs, last_proj_dim,
                 n_layers, n_layers_sub1, n_layers_sub2,
                 dropout_in, dropout,
                 subsample, subsample_type, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
                 conv_poolings, conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 bidirectional_sum_fwd_bwd, task_specific_layer, param_init,
                 chunk_size_left, chunk_size_right):
        """Build the RNN encoder with optional TDS/GatedConv/CNN front-end,
        per-layer subsampling and latency-controlled bidirectional mode."""
        super(RNNEncoder, self).__init__()

        # parse subsample: '_'-separated per-layer factors, default 1
        subsample_list = [1] * n_layers
        for lth, s in enumerate(list(map(int, subsample.split('_')[:n_layers]))):
            subsample_list[lth] = s

        if len(subsample_list) > 0 and len(subsample_list) != n_layers:
            raise ValueError('subsample must be the same size as n_layers. n_layers: %d, subsample: %s' % (n_layers, subsample_list))
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers. n_layers: %d, n_layers_sub1: %d' % (n_layers, n_layers_sub1))
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1. n_layers_sub1: %d, n_layers_sub2: %d' % (n_layers_sub1, n_layers_sub2))

        self.rnn_type = rnn_type
        self.bidirectional = True if ('blstm' in rnn_type or 'bgru' in rnn_type) else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers
        self.bidir_sum = bidirectional_sum_fwd_bwd

        # for latency-controlled
        self.latency_controlled = chunk_size_left > 0 or chunk_size_right > 0
        self.chunk_size_left = chunk_size_left
        self.chunk_size_right = chunk_size_right
        if self.latency_controlled:
            assert n_layers_sub2 == 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        if rnn_type == 'tds':
            self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                   in_channel=conv_in_channel,
                                   channels=conv_channels,
                                   kernel_sizes=conv_kernel_sizes,
                                   dropout=dropout,
                                   bottleneck_dim=last_proj_dim)
        elif rnn_type == 'gated_conv':
            self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                         in_channel=conv_in_channel,
                                         channels=conv_channels,
                                         kernel_sizes=conv_kernel_sizes,
                                         dropout=dropout,
                                         bottleneck_dim=last_proj_dim,
                                         param_init=param_init)
        elif 'conv' in rnn_type:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    residual=False,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
        else:
            self.conv = None

        if self.conv is None:
            self._odim = input_dim * n_splices * n_stacks
        else:
            self._odim = self.conv.output_dim
            # the CNN front-end handles subsampling instead of the RNNs
            subsample_list = [1] * self.n_layers
            # NOTE(review): relies on a module-level `logger`; confirm it is
            # defined at the top of this file.
            logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')

        self.padding = Padding(bidirectional_sum_fwd_bwd=bidirectional_sum_fwd_bwd)

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            self.rnn = nn.ModuleList()
            if self.latency_controlled:
                # separate backward RNNs for latency-controlled encoding
                self.rnn_bwd = nn.ModuleList()
            self.dropout = nn.Dropout(p=dropout)
            self.proj = None
            if n_projs > 0:
                self.proj = nn.ModuleList()

            # subsample
            self.subsample_layer = None
            if subsample_type == 'max_pool' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([MaxpoolSubsampler(subsample_list[lth])
                                                      for lth in range(n_layers)])
            elif subsample_type == 'concat' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([ConcatSubsampler(subsample_list[lth], n_units * self.n_dirs)
                                                      for lth in range(n_layers)])
            elif subsample_type == 'drop' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([DropSubsampler(subsample_list[lth])
                                                      for lth in range(n_layers)])
            elif subsample_type == '1dconv' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([Conv1dSubsampler(subsample_list[lth], n_units * self.n_dirs)
                                                      for lth in range(n_layers)])

            for lth in range(n_layers):
                if 'lstm' in rnn_type:
                    rnn_i = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                if self.latency_controlled:
                    # two unidirectional RNNs (fwd + bwd) per layer
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                    self.rnn_bwd += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                else:
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True,
                                       bidirectional=self.bidirectional)]
                self._odim = n_units if bidirectional_sum_fwd_bwd else n_units * self.n_dirs

                # Projection layer (not after the last layer)
                if self.proj is not None:
                    if lth != n_layers - 1:
                        self.proj += [nn.Linear(n_units * self.n_dirs, n_projs)]
                        self._odim = n_projs

                # Task specific layer
                if lth == n_layers_sub1 - 1 and task_specific_layer:
                    assert not self.latency_controlled
                    self.rnn_sub1 = rnn_i(self._odim, n_units, 1, batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub1 = nn.Linear(n_units, last_proj_dim)
                if lth == n_layers_sub2 - 1 and task_specific_layer:
                    assert not self.latency_controlled
                    self.rnn_sub2 = rnn_i(self._odim, n_units, 1, batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub2 = nn.Linear(n_units, last_proj_dim)

            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge = nn.Linear(self._odim, last_proj_dim)
                self._odim = last_proj_dim

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            # NOTE(review): accessed as an attribute here but called as a
            # method in the Transformer encoder — confirm whether
            # ConvEncoder.subsampling_factor is a property or a method.
            self._factor *= self.conv.subsampling_factor
        self._factor *= np.prod(subsample_list)

        self.reset_parameters(param_init)

        # for streaming inference
        self.reset_cache()
class TransformerEncoder(EncoderBase):
    """Transformer encoder.

    Args:
        input_dim (int): dimension of input features (freq * channel)
        attn_type (str): type of attention mechanism
        n_heads (int): number of heads for multi-head attention
        n_layers (int): number of blocks
        d_model (int): dimension of MultiheadAttentionMechanism
        d_ff (int): dimension of PositionwiseFeedForward
        last_proj_dim (int): dimension of the last projection layer (0 to disable)
        pe_type (str): type of positional encoding
        layer_norm_eps (float): epsilon value for layer normalization
        ffn_activation (str): nonlinear function for PositionwiseFeedForward
        dropout_in (float): dropout probability for input-hidden connection
        dropout (float): dropout probabilities for linear layers
        dropout_att (float): dropout probabilities for attention distributions
        n_stacks (int): number of frames to stack
        n_splices (int): frames to splice. Default is 1 frame.
        conv_in_channel (int): number of channels of input features
        conv_channels (int): number of channels in the CNN blocks
        conv_kernel_sizes (list): size of kernels in the CNN blocks
        conv_strides (list): number of strides in the CNN blocks
        conv_poolings (list): size of poolings in the CNN blocks
        conv_batch_norm (bool): apply batch normalization only in the CNN blocks
        conv_layer_norm (bool): apply layer normalization only in the CNN blocks
        conv_bottleneck_dim (int): dimension of the bottleneck layer between CNN and self-attention layers
        conv_param_init (float): only for CNN layers before Transformer layers
        param_init (str): parameter initialization method
        chunk_size_left (int): left chunk size for time-restricted Transformer encoder
        chunk_size_current (int): current chunk size for time-restricted Transformer encoder
        chunk_size_right (int): right chunk size for time-restricted Transformer encoder

    """

    def __init__(self, input_dim, attn_type, n_heads, n_layers, d_model, d_ff,
                 last_proj_dim, pe_type, layer_norm_eps, ffn_activation,
                 dropout_in, dropout, dropout_att, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides,
                 conv_poolings, conv_batch_norm, conv_layer_norm,
                 conv_bottleneck_dim, conv_param_init, param_init,
                 chunk_size_left, chunk_size_current, chunk_size_right):

        super(TransformerEncoder, self).__init__()

        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pe_type = pe_type

        # streaming (time-restricted self-attention) setting
        self.chunk_size_left = chunk_size_left
        self.chunk_size_current = chunk_size_current
        self.chunk_size_right = chunk_size_right

        # Setting for CNNs before self-attention layers
        if conv_channels:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    layer_norm_eps=layer_norm_eps,
                                    residual=False,
                                    bottleneck_dim=d_model,
                                    param_init=conv_param_init)
            self._odim = self.conv.output_dim
        else:
            self.conv = None
            self._odim = input_dim * n_splices * n_stacks

        self.embed = nn.Linear(self._odim, d_model)
        self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)
        self.layers = repeat(TransformerEncoderBlock(
            d_model, d_ff, attn_type, n_heads,
            dropout, dropout_att,
            layer_norm_eps, ffn_activation, param_init), n_layers)
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)
        # All Transformer blocks emit d_model-dimensional features, so the
        # encoder output dimension is d_model from here on.
        self._odim = d_model

        # BUGFIX: (1) guard on last_proj_dim > 0, matching the RNN encoder
        # (the original built nn.Linear(_odim, 0) when last_proj_dim == 0);
        # (2) the bridge input must be d_model — `forward` applies it to the
        # output of norm_out, not to the raw/conv feature dimension.
        if last_proj_dim > 0 and last_proj_dim != self.output_dim:
            self.bridge = nn.Linear(self._odim, last_proj_dim)
            self._odim = last_proj_dim
        else:
            self.bridge = None

        # calculate subsampling factor
        self._factor = 1
        if self.conv is not None:
            # NOTE(review): the RNN encoder in this file reads
            # `subsampling_factor` as an attribute; confirm ConvEncoder
            # exposes it as a method here.
            self._factor *= self.conv.subsampling_factor()

        if param_init == 'xavier_uniform':
            self.reset_parameters()

    def reset_parameters(self):
        """Initialize parameters with Xavier uniform distribution."""
        logger.info('===== Initialize %s with Xavier uniform distribution =====' %
                    self.__class__.__name__)
        # The CNN front-end (if any) initializes its own parameters, and it
        # already projects to d_model, so `embed` is only initialized when
        # the input goes straight into the linear embedding.
        if self.conv is None:
            nn.init.xavier_uniform_(self.embed.weight)
            nn.init.constant_(self.embed.bias, 0.)
        if self.bridge is not None:
            nn.init.xavier_uniform_(self.bridge.weight)
            nn.init.constant_(self.bridge.bias, 0.)

    def forward(self, xs, xlens, task, use_cache=False, streaming=False):
        """Forward computation.

        Args:
            xs (FloatTensor): `[B, T, input_dim]`
            xlens (list): `[B]`
            task (str): not supported now
            use_cache (bool):
            streaming (bool): streaming encoding
        Returns:
            eouts (dict):
                xs (FloatTensor): `[B, T, d_model]`
                xlens (list): `[B]`

        """
        eouts = {'ys': {'xs': None, 'xlens': None},
                 'ys_sub1': {'xs': None, 'xlens': None},
                 'ys_sub2': {'xs': None, 'xlens': None}}

        if self.conv is None:
            xs = self.embed(xs)
        else:
            # Path through CNN blocks before self-attention layers
            xs, xlens = self.conv(xs, xlens)

        bs, xmax, idim = xs.size()
        xs = self.pos_enc(xs)

        if self.chunk_size_left > 0:
            # Time-restricted self-attention for streaming models:
            # process the utterance chunk by chunk, each chunk seeing
            # cs_l past frames and cs_r future frames as context.
            cs_l = self.chunk_size_left
            cs_c = self.chunk_size_current
            cs_r = self.chunk_size_right
            hop_size = self.chunk_size_current
            xs_chunks = []
            xx_aws = [[] for lth in range(self.n_layers)]
            # Zero-pad both ends so every chunk has full context.
            xs_pad = torch.cat([xs.new_zeros(bs, cs_l, idim),
                                xs,
                                xs.new_zeros(bs, cs_r, idim)], dim=1)
            # TODO: remove right padding
            for t in range(cs_l, cs_l + xmax, hop_size):
                xs_chunk = xs_pad[:, t - cs_l:t + cs_c + cs_r]
                for lth in range(self.n_layers):
                    xs_chunk, xx_aws_chunk = self.layers[lth](xs_chunk, None)  # no mask
                    # keep only attention over the current-chunk frames
                    xx_aws[lth].append(xx_aws_chunk[:, :, cs_l:cs_l + cs_c,
                                                    cs_l:cs_l + cs_c])
                xs_chunks.append(xs_chunk[:, cs_l:cs_l + cs_c])
            xs = torch.cat(xs_chunks, dim=1)[:, :xmax]
            if not self.training:
                for lth in range(self.n_layers):
                    setattr(self, 'xx_aws_layer%d' % lth,
                            tensor2np(torch.cat(xx_aws[lth], dim=3)[:, :, :xmax, :xmax]))
        else:
            # Create the self-attention mask
            xx_mask = make_pad_mask(xlens, self.device_id).unsqueeze(2).repeat([1, 1, xmax])

            for lth in range(self.n_layers):
                xs, xx_aws = self.layers[lth](xs, xx_mask)
                if not self.training:
                    setattr(self, 'xx_aws_layer%d' % lth, tensor2np(xx_aws))
        xs = self.norm_out(xs)

        # Bridge layer
        if self.bridge is not None:
            xs = self.bridge(xs)

        eouts['ys']['xs'] = xs
        eouts['ys']['xlens'] = xlens
        return eouts

    def _plot_attention(self, save_path, n_cols=2):
        """Plot attention for each head in all layers."""
        from matplotlib import pyplot as plt
        from matplotlib.ticker import MaxNLocator

        save_path = mkdir_join(save_path, 'enc_xx_att_weights')

        # Clean directory
        if save_path is not None and os.path.isdir(save_path):
            shutil.rmtree(save_path)
            os.mkdir(save_path)

        for lth in range(self.n_layers):
            if not hasattr(self, 'xx_aws_layer%d' % lth):
                continue
            xx_aws = getattr(self, 'xx_aws_layer%d' % lth)

            plt.clf()
            # NOTE(review): assumes n_heads is a positive multiple of n_cols
            # (or <= n_cols with a 1-D axes array) — confirm for other configs.
            fig, axes = plt.subplots(self.n_heads // n_cols, n_cols, figsize=(20, 8))
            for h in range(self.n_heads):
                if self.n_heads > n_cols:
                    ax = axes[h // n_cols, h % n_cols]
                else:
                    ax = axes[h]
                ax.imshow(xx_aws[-1, h, :, :], aspect="auto")
                ax.grid(False)
                ax.set_xlabel("Input (head%d)" % h)
                ax.set_ylabel("Output (head%d)" % h)
                ax.xaxis.set_major_locator(MaxNLocator(integer=True))
                ax.yaxis.set_major_locator(MaxNLocator(integer=True))

            fig.tight_layout()
            # BUGFIX: the kwarg is `dpi`, not `dvi`
            fig.savefig(os.path.join(save_path, 'layer%d.png' % (lth)), dpi=500)
            plt.close()