def check(self,
          encoder_type,
          bidirectional=False,
          conv=False,
          merge_bidirectional=False,
          projection=False,
          residual=False,
          dense_residual=False):
        """Build one encoder configuration and verify its output shape.

        A small batch is generated, the encoder class returned by ``load``
        is instantiated and applied to the batch, and the shape of the
        result is compared against
        ``(batch_size, max_time, encoder.num_units * num_directions)``.

        Args:
            encoder_type: encoder name; passed to ``load`` and forwarded to
                the encoder as ``rnn_type``
            bidirectional: forwarded to the encoder; the expected feature
                size is doubled when this is set and ``merge_bidirectional``
                is not
            conv: if True, two convolutional layers are configured
                (64 channels, 3x3 kernels, stride 1, 2x2 pooling);
                otherwise all convolution settings are empty lists
            merge_bidirectional: forwarded to the encoder
            projection: if True ``num_proj`` is 256, otherwise 0
            residual: forwarded to the encoder
            dense_residual: forwarded to the encoder
        """
        # Report the configuration under test
        rule = '=================================================='
        print(rule)
        settings = (
            ('  encoder_type: %s', encoder_type),
            ('  bidirectional: %s', str(bidirectional)),
            ('  conv: %s', str(conv)),
            ('  merge_bidirectional: %s', str(merge_bidirectional)),
            ('  projection: %s', str(projection)),
            ('  residual: %s', str(residual)),
            ('  dense_residual: %s', str(dense_residual)),
        )
        for template, value in settings:
            print(template % value)
        print(rule)

        # Convolutional front-end settings
        if not conv:
            conv_channels, conv_kernel_sizes = [], []
            conv_strides, poolings = [], []
        else:
            # pattern 2 (VGG like)
            conv_channels = [64, 64]
            conv_kernel_sizes = [[3, 3], [3, 3]]
            conv_strides = [[1, 1], [1, 1]]
            poolings = [[2, 2], [2, 2]]
            # pattern 1 (disabled alternative):
            #   conv_channels = [32, 32]
            #   conv_kernel_sizes = [[41, 11], [21, 11]]
            #   conv_strides = [[2, 2], [2, 1]]
            #   poolings = [[], []]

        # Generate a mini-batch and wrap every utterance in a Variable
        batch_size, splice, num_stack = 4, 1, 1
        inputs, _, input_lens, _ = generate_data(batch_size=batch_size,
                                                 num_stack=num_stack,
                                                 splice=splice,
                                                 backend='chainer')
        inputs = [chainer.Variable(feat, requires_grad=False)
                  for feat in inputs]

        # Look up the encoder class, then instantiate it
        encoder_cls = load(encoder_type=encoder_type)
        feature_dim = inputs[0].shape[-1] // splice // num_stack
        encoder_kwargs = dict(
            input_size=feature_dim,
            rnn_type=encoder_type,
            bidirectional=bidirectional,
            num_units=256,
            num_proj=256 if projection else 0,
            num_layers=5,
            dropout_input=0.2,
            dropout_hidden=0.2,
            subsample_list=[],
            merge_bidirectional=merge_bidirectional,
            splice=splice,
            num_stack=num_stack,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            batch_norm=True,
            residual=residual,
            dense_residual=dense_residual)
        encoder = encoder_cls(**encoder_kwargs)

        # Expected number of frames, taken from the first utterance and
        # adjusted by the convolutional layers when they are enabled
        max_time = inputs[0].shape[0]
        if conv:
            max_time = encoder.conv.get_conv_out_size(max_time, 1)

        outputs, _ = encoder(inputs, input_lens)

        print('----- outputs -----')
        actual_shape = (len(outputs),
                        outputs[0].shape[0],
                        outputs[0].shape[1])
        print(actual_shape)

        if bidirectional and not merge_bidirectional:
            num_directions = 2
        else:
            num_directions = 1
        expected_shape = (batch_size,
                          max_time,
                          encoder.num_units * num_directions)
        self.assertEqual(expected_shape, actual_shape)
예제 #2
0
    def __init__(self,
                 input_size,
                 encoder_type,
                 encoder_bidirectional,
                 encoder_num_units,
                 encoder_num_proj,
                 encoder_num_layers,
                 fc_list,
                 dropout_input,
                 dropout_encoder,
                 num_classes,
                 parameter_init_distribution='uniform',
                 parameter_init=0.1,
                 recurrent_weight_orthogonal=False,
                 init_forget_gate_bias_with_one=True,
                 subsample_list=[],
                 subsample_type='drop',
                 logits_temperature=1,
                 num_stack=1,
                 splice=1,
                 input_channel=1,
                 conv_channels=[],
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 poolings=[],
                 activation='relu',
                 batch_norm=False,
                 label_smoothing_prob=0,
                 weight_noise_std=0,
                 encoder_residual=False,
                 encoder_dense_residual=False):
        """Build the CTC model: encoder, fully-connected stack, output layer.

        Args:
            input_size: stored and forwarded to the encoder
            encoder_type: 'lstm', 'gru', 'rnn' or 'cnn'
            encoder_bidirectional: forwarded to the recurrent encoder; when
                true, ``self.encoder_num_units`` is doubled
            encoder_num_units: forwarded to the recurrent encoder
            encoder_num_proj: forwarded to the recurrent encoder
            encoder_num_layers: forwarded to the recurrent encoder
            fc_list: output sizes of the fully-connected layers placed
                between the encoder and the output layer (may be empty)
            dropout_input: forwarded to the encoder
            dropout_encoder: forwarded to the encoder as ``dropout_hidden``
                and used as the dropout of the fully-connected layers
            num_classes: number of classes without the blank class
                (one is added for the blank class)
            parameter_init_distribution: passed to ``self.init_weights``
            parameter_init: passed to ``self.init_weights``
            recurrent_weight_orthogonal: if True and the encoder is not a
                CNN, ``self.init_weights`` is called again with the
                'orthogonal' distribution
            init_forget_gate_bias_with_one: if True,
                ``self.init_forget_gate_bias_with_one()`` is called
            subsample_list: stored and forwarded to the recurrent encoder
            subsample_type: forwarded to the recurrent encoder
            logits_temperature: stored on the model
            num_stack: stored and forwarded to the recurrent encoder
            splice: forwarded to the recurrent encoder
            input_channel: forwarded to the encoder
            conv_channels: forwarded to the encoder
            conv_kernel_sizes: forwarded to the encoder
            conv_strides: forwarded to the encoder
            poolings: forwarded to the encoder
            activation: forwarded to the encoder
            batch_norm: stored and forwarded to the encoder
            label_smoothing_prob: stored as ``self.ls_prob``
            weight_noise_std: stored as a float
            encoder_residual: forwarded to the recurrent encoder
            encoder_dense_residual: forwarded to the recurrent encoder

        Raises:
            NotImplementedError: if ``encoder_type`` is not supported
            AssertionError: if a CNN encoder is requested with
                ``num_stack`` or ``splice`` different from 1

        NOTE: the list-valued defaults are shared between calls. This
        constructor only reads them (``subsample_list`` is stored by
        reference), so they should not be mutated in place.
        """
        # NOTE: super(ModelBase, self) starts the method lookup *after*
        # ModelBase in the MRO.
        super(ModelBase, self).__init__()
        self.model_type = 'ctc'

        # Setting for the encoder
        self.input_size = input_size
        self.num_stack = num_stack
        self.encoder_type = encoder_type
        self.encoder_num_units = encoder_num_units
        if encoder_bidirectional:
            self.encoder_num_units *= 2
        self.fc_list = fc_list
        self.subsample_list = subsample_list
        self.batch_norm = batch_norm

        # Setting for CTC
        self.num_classes = num_classes + 1  # Add the blank class
        self.logits_temperature = logits_temperature

        # Setting for regularization
        self.weight_noise_injection = False
        self.weight_noise_std = float(weight_noise_std)
        self.ls_prob = label_smoothing_prob

        with self.init_scope():
            ##################################################
            # Encoder
            ##################################################
            if encoder_type in ['lstm', 'gru', 'rnn']:
                self.encoder = load(encoder_type=encoder_type)(
                    input_size=input_size,
                    rnn_type=encoder_type,
                    bidirectional=encoder_bidirectional,
                    num_units=encoder_num_units,
                    num_proj=encoder_num_proj,
                    num_layers=encoder_num_layers,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    subsample_list=subsample_list,
                    subsample_type=subsample_type,
                    use_cuda=self.use_cuda,
                    merge_bidirectional=False,
                    num_stack=num_stack,
                    splice=splice,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    activation=activation,
                    batch_norm=batch_norm,
                    residual=encoder_residual,
                    dense_residual=encoder_dense_residual)
                encoder_out_size = self.encoder_num_units
            elif encoder_type == 'cnn':
                assert num_stack == 1 and splice == 1
                self.encoder = load(encoder_type='cnn')(
                    input_size=input_size,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    use_cuda=self.use_cuda,
                    activation=activation,
                    batch_norm=batch_norm)
                # The CNN encoder reports its own output width; it is used
                # for whichever layer directly follows the encoder, also
                # when ``fc_list`` is empty.
                encoder_out_size = self.encoder.output_size
            else:
                raise NotImplementedError

            ##################################################
            # Fully-connected layers
            ##################################################
            # TODO: add batch normalization layers in front of the
            # fully-connected layers and remove the bias term in that case
            fc_input_size = encoder_out_size
            for i, fc_size in enumerate(fc_list):
                setattr(
                    self, 'fc_' + str(i),
                    LinearND(fc_input_size,
                             fc_size,
                             dropout=dropout_encoder,
                             use_cuda=self.use_cuda))
                fc_input_size = fc_size

            # Output layer: fed by the last fully-connected layer, or
            # directly by the encoder when ``fc_list`` is empty
            self.fc_out = LinearND(fc_input_size,
                                   self.num_classes,
                                   use_cuda=self.use_cuda)

            ##################################################
            # Initialize parameters
            ##################################################
            self.init_weights(parameter_init,
                              distribution=parameter_init_distribution,
                              ignore_keys=['bias'])

            # Initialize all biases with 0
            self.init_weights(0, distribution='constant', keys=['bias'])

            # Recurrent weights are orthogonalized
            if recurrent_weight_orthogonal and encoder_type != 'cnn':
                self.init_weights(parameter_init,
                                  distribution='orthogonal',
                                  keys=[encoder_type, 'weight'],
                                  ignore_keys=['bias'])

            # Initialize bias in forget gate with 1
            if init_forget_gate_bias_with_one:
                self.init_forget_gate_bias_with_one()

        # Set CTC decoders
        self._decode_greedy_np = GreedyDecoder(blank_index=0)
        self._decode_beam_np = BeamSearchDecoder(blank_index=0)
예제 #3
0
    def __init__(
            self,
            input_size,
            encoder_type,
            encoder_bidirectional,
            encoder_num_units,
            encoder_num_proj,
            encoder_num_layers,
            encoder_num_layers_sub,  # ***
            fc_list,
            fc_list_sub,
            dropout_input,
            dropout_encoder,
            main_loss_weight,  # ***
            sub_loss_weight,  # ***
            num_classes,
            num_classes_sub,  # ***
            parameter_init_distribution='uniform',
            parameter_init=0.1,
            recurrent_weight_orthogonal=False,
            init_forget_gate_bias_with_one=True,
            subsample_list=[],
            subsample_type='drop',
            logits_temperature=1,
            num_stack=1,
            splice=1,
            input_channel=1,
            conv_channels=[],
            conv_kernel_sizes=[],
            conv_strides=[],
            poolings=[],
            activation='relu',
            batch_norm=False,
            label_smoothing_prob=0,
            weight_noise_std=0,
            encoder_residual=False,
            encoder_dense_residual=False):
        """Build the hierarchical CTC model with a main and a sub task.

        The parent constructor is called first. The encoder, the
        fully-connected layers and the output layer it registered are then
        deleted and rebuilt here, and separate fully-connected and output
        layers are added for the sub task. Finally all parameters are
        re-initialized.

        Args:
            input_size: forwarded to the encoder
            encoder_type: 'lstm', 'gru', 'rnn' or 'cnn'
            encoder_bidirectional: forwarded to the recurrent encoder
            encoder_num_units: forwarded to the recurrent encoder
            encoder_num_proj: forwarded to the recurrent encoder
            encoder_num_layers: forwarded to the recurrent encoder
            encoder_num_layers_sub: stored and forwarded to the recurrent
                encoder as ``num_layers_sub``
            fc_list: output sizes of the fully-connected layers of the
                main task (may be empty)
            fc_list_sub: output sizes of the fully-connected layers of the
                sub task (may be empty)
            dropout_input: forwarded to the encoder
            dropout_encoder: forwarded to the encoder as ``dropout_hidden``
                and used as the dropout of the fully-connected layers
            main_loss_weight: stored on the model
            sub_loss_weight: stored on the model
            num_classes: number of classes of the main task without the
                blank class
            num_classes_sub: number of classes of the sub task without the
                blank class (one is added for the blank class)
            parameter_init_distribution: passed to ``self.init_weights``
            parameter_init: passed to ``self.init_weights``
            recurrent_weight_orthogonal: if True and the encoder is not a
                CNN, ``self.init_weights`` is called again with the
                'orthogonal' distribution
            init_forget_gate_bias_with_one: if True,
                ``self.init_forget_gate_bias_with_one()`` is called
            subsample_list: forwarded to the recurrent encoder
            subsample_type: forwarded to the recurrent encoder
            logits_temperature: forwarded to the parent constructor
            num_stack: forwarded to the recurrent encoder
            splice: forwarded to the recurrent encoder
            input_channel: forwarded to the encoder
            conv_channels: forwarded to the encoder
            conv_kernel_sizes: forwarded to the encoder
            conv_strides: forwarded to the encoder
            poolings: forwarded to the encoder
            activation: forwarded to the encoder
            batch_norm: forwarded to the encoder
            label_smoothing_prob: forwarded to the parent constructor
            weight_noise_std: forwarded to the parent constructor
            encoder_residual: forwarded to the recurrent encoder
            encoder_dense_residual: forwarded to the recurrent encoder

        Raises:
            NotImplementedError: if ``encoder_type`` is not supported
            AssertionError: if a CNN encoder is requested with
                ``num_stack`` or ``splice`` different from 1
        """
        super(HierarchicalCTC, self).__init__(
            input_size=input_size,
            encoder_type=encoder_type,
            encoder_bidirectional=encoder_bidirectional,
            encoder_num_units=encoder_num_units,
            encoder_num_proj=encoder_num_proj,
            encoder_num_layers=encoder_num_layers,
            dropout_input=dropout_input,
            dropout_encoder=dropout_encoder,
            num_classes=num_classes,
            parameter_init=parameter_init,
            subsample_list=subsample_list,
            subsample_type=subsample_type,
            fc_list=fc_list,
            num_stack=num_stack,
            splice=splice,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            logits_temperature=logits_temperature,
            batch_norm=batch_norm,
            label_smoothing_prob=label_smoothing_prob,
            weight_noise_std=weight_noise_std)
        self.model_type = 'hierarchical_ctc'

        # Setting for the encoder
        self.encoder_num_layers_sub = encoder_num_layers_sub
        self.fc_list_sub = fc_list_sub

        # Setting for CTC
        self.num_classes_sub = num_classes_sub + 1  # Add the blank class

        # Setting for MTL
        self.main_loss_weight = main_loss_weight
        self.sub_loss_weight = sub_loss_weight

        with self.init_scope():
            ##################################################
            # Encoder (replaces the one built by the parent)
            ##################################################
            delattr(self, 'encoder')

            if encoder_type in ['lstm', 'gru', 'rnn']:
                self.encoder = load(encoder_type=encoder_type)(
                    input_size=input_size,
                    rnn_type=encoder_type,
                    bidirectional=encoder_bidirectional,
                    num_units=encoder_num_units,
                    num_proj=encoder_num_proj,
                    num_layers=encoder_num_layers,
                    num_layers_sub=encoder_num_layers_sub,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    subsample_list=subsample_list,
                    subsample_type=subsample_type,
                    use_cuda=self.use_cuda,
                    merge_bidirectional=False,
                    num_stack=num_stack,
                    splice=splice,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    activation=activation,
                    batch_norm=batch_norm,
                    residual=encoder_residual,
                    dense_residual=encoder_dense_residual)
                encoder_out_size = self.encoder_num_units
            elif encoder_type == 'cnn':
                assert num_stack == 1 and splice == 1
                self.encoder = load(encoder_type='cnn')(
                    input_size=input_size,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    use_cuda=self.use_cuda,
                    activation=activation,
                    batch_norm=batch_norm)
                # The CNN encoder reports its own output width; it is used
                # for whichever layer directly follows the encoder, also
                # when a fully-connected list is empty.
                encoder_out_size = self.encoder.output_size
            else:
                raise NotImplementedError

            ##################################################
            # Fully-connected layers in the main task
            ##################################################
            # TODO: add batch norm layers and remove the bias term in the
            # case of batch normalization

            # Override the layers registered by the parent constructor
            delattr(self, 'fc_out')
            fc_input_size = encoder_out_size
            for i, fc_size in enumerate(fc_list):
                layer_name = 'fc_' + str(i)
                delattr(self, layer_name)
                setattr(
                    self, layer_name,
                    LinearND(fc_input_size,
                             fc_size,
                             dropout=dropout_encoder,
                             use_cuda=self.use_cuda))
                fc_input_size = fc_size

            self.fc_out = LinearND(fc_input_size,
                                   self.num_classes,
                                   use_cuda=self.use_cuda)

            ##################################################
            # Fully-connected layers in the sub task
            ##################################################
            # TODO: add batch norm layers and remove the bias term in the
            # case of batch normalization
            fc_input_size_sub = encoder_out_size
            for i, fc_size in enumerate(fc_list_sub):
                setattr(
                    self, 'fc_sub_' + str(i),
                    LinearND(fc_input_size_sub,
                             fc_size,
                             dropout=dropout_encoder,
                             use_cuda=self.use_cuda))
                fc_input_size_sub = fc_size

            self.fc_out_sub = LinearND(fc_input_size_sub,
                                       self.num_classes_sub,
                                       use_cuda=self.use_cuda)

            ##################################################
            # Initialize parameters
            ##################################################
            self.init_weights(parameter_init,
                              distribution=parameter_init_distribution,
                              ignore_keys=['bias'])

            # Initialize all biases with 0
            self.init_weights(0, distribution='constant', keys=['bias'])

            # Recurrent weights are orthogonalized. The keys follow the
            # configured encoder type instead of a hard-coded 'lstm', and
            # the step is skipped for the CNN encoder.
            if recurrent_weight_orthogonal and encoder_type != 'cnn':
                self.init_weights(parameter_init,
                                  distribution='orthogonal',
                                  keys=[encoder_type, 'weight'],
                                  ignore_keys=['bias'])

            # Initialize bias in forget gate with 1
            if init_forget_gate_bias_with_one:
                self.init_forget_gate_bias_with_one()
예제 #4
0
    def __init__(
            self,
            input_size,
            encoder_type,
            encoder_bidirectional,
            encoder_num_units,
            encoder_num_proj,
            encoder_num_layers,
            encoder_num_layers_sub,  # ***
            attention_type,
            attention_dim,
            decoder_type,
            decoder_num_units,
            decoder_num_layers,
            decoder_num_units_sub,  # ***
            decoder_num_layers_sub,  # ***
            embedding_dim,
            embedding_dim_sub,  # ***
            dropout_input,
            dropout_encoder,
            dropout_decoder,
            dropout_embedding,
            main_loss_weight,  # ***
            sub_loss_weight,  # ***
            num_classes,
            num_classes_sub,  # ***
            parameter_init_distribution='uniform',
            parameter_init=0.1,
            recurrent_weight_orthogonal=False,
            init_forget_gate_bias_with_one=True,
            subsample_list=[],
            subsample_type='drop',
            bridge_layer=False,
            init_dec_state='first',
            sharpening_factor=1,
            logits_temperature=1,
            sigmoid_smoothing=False,
            coverage_weight=0,
            ctc_loss_weight_sub=0,  # ***
            attention_conv_num_channels=10,
            attention_conv_width=201,
            num_stack=1,
            splice=1,
            input_channel=1,
            conv_channels=[],
            conv_kernel_sizes=[],
            conv_strides=[],
            poolings=[],
            activation='relu',
            batch_norm=False,
            scheduled_sampling_prob=0,
            scheduled_sampling_max_step=0,
            label_smoothing_prob=0,
            weight_noise_std=0,
            encoder_residual=False,
            encoder_dense_residual=False,
            decoder_residual=False,
            decoder_dense_residual=False,
            decoding_order='attend_generate_update',
            bottleneck_dim=256,
            bottleneck_dim_sub=256,  # ***
            backward_sub=False,  # ***
            num_heads=1,
            num_heads_sub=1):  # ***

        super(HierarchicalAttentionSeq2seq, self).__init__(
            input_size=input_size,
            encoder_type=encoder_type,
            encoder_bidirectional=encoder_bidirectional,
            encoder_num_units=encoder_num_units,
            encoder_num_proj=encoder_num_proj,
            encoder_num_layers=encoder_num_layers,
            attention_type=attention_type,
            attention_dim=attention_dim,
            decoder_type=decoder_type,
            decoder_num_units=decoder_num_units,
            decoder_num_layers=decoder_num_layers,
            embedding_dim=embedding_dim,
            dropout_input=dropout_input,
            dropout_encoder=dropout_encoder,
            dropout_decoder=dropout_decoder,
            dropout_embedding=dropout_embedding,
            num_classes=num_classes,
            parameter_init=parameter_init,
            subsample_list=subsample_list,
            subsample_type=subsample_type,
            bridge_layer=bridge_layer,
            init_dec_state=init_dec_state,
            sharpening_factor=sharpening_factor,
            logits_temperature=logits_temperature,
            sigmoid_smoothing=sigmoid_smoothing,
            coverage_weight=coverage_weight,
            ctc_loss_weight=0,
            attention_conv_num_channels=attention_conv_num_channels,
            attention_conv_width=attention_conv_width,
            num_stack=num_stack,
            splice=splice,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            scheduled_sampling_prob=scheduled_sampling_prob,
            scheduled_sampling_max_step=scheduled_sampling_max_step,
            label_smoothing_prob=label_smoothing_prob,
            weight_noise_std=weight_noise_std,
            encoder_residual=encoder_residual,
            encoder_dense_residual=encoder_dense_residual,
            decoder_residual=decoder_residual,
            decoder_dense_residual=decoder_dense_residual,
            decoding_order=decoding_order,
            bottleneck_dim=bottleneck_dim,
            backward_loss_weight=0,
            num_heads=num_heads)
        self.model_type = 'hierarchical_attention'

        # Setting for the encoder
        self.encoder_num_units_sub = encoder_num_units
        if encoder_bidirectional:
            self.encoder_num_units_sub *= 2

        # Setting for the decoder in the sub task
        self.decoder_num_units_1 = decoder_num_units_sub
        self.decoder_num_layers_1 = decoder_num_layers_sub
        self.num_classes_sub = num_classes_sub + 1  # Add <EOS> class
        self.sos_1 = num_classes_sub
        self.eos_1 = num_classes_sub
        # NOTE: <SOS> and <EOS> have the same index
        self.backward_1 = backward_sub

        # Setting for the decoder initialization in the sub task
        if backward_sub:
            if init_dec_state == 'first':
                self.init_dec_state_1_bwd = 'final'
            elif init_dec_state == 'final':
                self.init_dec_state_1_bwd = 'first'
            else:
                self.init_dec_state_1_bwd = init_dec_state
            if encoder_type != decoder_type:
                self.init_dec_state_1_bwd = 'zero'
        else:
            self.init_dec_state_1_fwd = init_dec_state
            if encoder_type != decoder_type:
                self.init_dec_state_1_fwd = 'zero'

        # Setting for the attention in the sub task
        self.num_heads_1 = num_heads_sub

        # Setting for MTL
        self.main_loss_weight = main_loss_weight
        self.sub_loss_weight = sub_loss_weight
        self.ctc_loss_weight_sub = ctc_loss_weight_sub
        if backward_sub:
            self.bwd_weight_1 = sub_loss_weight

        with self.init_scope():
            # Overide encoder
            delattr(self, 'encoder')

            ##############################
            # Encoder
            ##############################
            if encoder_type in ['lstm', 'gru', 'rnn']:
                self.encoder = load(encoder_type=encoder_type)(
                    input_size=input_size,
                    rnn_type=encoder_type,
                    bidirectional=encoder_bidirectional,
                    num_units=encoder_num_units,
                    num_proj=encoder_num_proj,
                    num_layers=encoder_num_layers,
                    num_layers_sub=encoder_num_layers_sub,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    subsample_list=subsample_list,
                    subsample_type=subsample_type,
                    use_cuda=self.use_cuda,
                    merge_bidirectional=False,
                    num_stack=num_stack,
                    splice=splice,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    activation=activation,
                    batch_norm=batch_norm,
                    residual=encoder_residual,
                    dense_residual=encoder_dense_residual)
            elif encoder_type == 'cnn':
                assert num_stack == 1 and splice == 1
                self.encoder = load(encoder_type='cnn')(
                    input_size=input_size,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    use_cuda=self.use_cuda,
                    activation=activation,
                    batch_norm=batch_norm)
                self.init_dec_state_0 = 'zero'
                self.init_dec_state_1 = 'zero'
            else:
                raise NotImplementedError

            dir = 'bwd' if backward_sub else 'fwd'
            self.is_bridge_sub = False
            if self.sub_loss_weight > 0:

                ##################################################
                # Bridge layer between the encoder and decoder
                ##################################################
                if encoder_type == 'cnn':
                    self.bridge_1 = LinearND(self.encoder.output_size,
                                             decoder_num_units_sub,
                                             dropout=dropout_encoder,
                                             use_cuda=self.use_cuda)
                    self.encoder_num_units_sub = decoder_num_units_sub
                    self.is_bridge_sub = True
                elif bridge_layer:
                    self.bridge_1 = LinearND(self.encoder_num_units_sub,
                                             decoder_num_units_sub,
                                             dropout=dropout_encoder,
                                             use_cuda=self.use_cuda)
                    self.encoder_num_units_sub = decoder_num_units_sub
                    self.is_bridge_sub = True
                else:
                    self.is_bridge_sub = False

                ##################################################
                # Initialization of the decoder
                ##################################################
                if getattr(self, 'init_dec_state_1_' + dir) != 'zero':
                    setattr(
                        self, 'W_dec_init_1_' + dir,
                        LinearND(self.encoder_num_units_sub,
                                 decoder_num_units_sub,
                                 use_cuda=self.use_cuda))

                ##############################
                # Decoder (sub)
                ##############################
                if decoding_order == 'conditional':
                    setattr(
                        self, 'decoder_first_1_' + dir,
                        RNNDecoder(input_size=embedding_dim_sub,
                                   rnn_type=decoder_type,
                                   num_units=decoder_num_units_sub,
                                   num_layers=1,
                                   dropout=dropout_decoder,
                                   use_cuda=self.use_cuda,
                                   residual=False,
                                   dense_residual=False))
                    setattr(
                        self, 'decoder_second_1_' + dir,
                        RNNDecoder(input_size=self.encoder_num_units_sub,
                                   rnn_type=decoder_type,
                                   num_units=decoder_num_units_sub,
                                   num_layers=1,
                                   dropout=dropout_decoder,
                                   use_cuda=self.use_cuda,
                                   residual=False,
                                   dense_residual=False))
                    # NOTE; the conditional decoder only supports the 1 layer
                else:
                    setattr(
                        self, 'decoder_1_' + dir,
                        RNNDecoder(input_size=self.encoder_num_units_sub +
                                   embedding_dim_sub,
                                   rnn_type=decoder_type,
                                   num_units=decoder_num_units_sub,
                                   num_layers=decoder_num_layers_sub,
                                   dropout=dropout_decoder,
                                   use_cuda=self.use_cuda,
                                   residual=decoder_residual,
                                   dense_residual=decoder_dense_residual))

                ###################################
                # Attention layer (sub)
                ###################################
                setattr(
                    self, 'attend_1_' + dir,
                    AttentionMechanism(
                        encoder_num_units=self.encoder_num_units_sub,
                        decoder_num_units=decoder_num_units_sub,
                        attention_type=attention_type,
                        attention_dim=attention_dim,
                        use_cuda=self.use_cuda,
                        sharpening_factor=sharpening_factor,
                        sigmoid_smoothing=sigmoid_smoothing,
                        out_channels=attention_conv_num_channels,
                        kernel_size=attention_conv_width,
                        num_heads=num_heads_sub))

                ##############################
                # Output layer (sub)
                ##############################
                setattr(
                    self, 'W_d_1_' + dir,
                    LinearND(decoder_num_units_sub,
                             bottleneck_dim_sub,
                             dropout=dropout_decoder,
                             use_cuda=self.use_cuda))
                setattr(
                    self, 'W_c_1_' + dir,
                    LinearND(self.encoder_num_units_sub,
                             bottleneck_dim_sub,
                             dropout=dropout_decoder,
                             use_cuda=self.use_cuda))
                setattr(
                    self, 'fc_1_' + dir,
                    LinearND(bottleneck_dim_sub,
                             self.num_classes_sub,
                             use_cuda=self.use_cuda))

                ##############################
                # Embedding (sub)
                ##############################
                if label_smoothing_prob > 0:
                    self.embed_1 = Embedding_LS(
                        num_classes=self.num_classes_sub,
                        embedding_dim=embedding_dim_sub,
                        dropout=dropout_embedding,
                        label_smoothing_prob=label_smoothing_prob,
                        use_cuda=self.use_cuda)
                else:
                    self.embed_1 = Embedding(num_classes=self.num_classes_sub,
                                             embedding_dim=embedding_dim_sub,
                                             dropout=dropout_embedding,
                                             ignore_index=self.sos_sub,
                                             use_cuda=self.use_cuda)

            ##############################
            # CTC (sub)
            ##############################
            if ctc_loss_weight_sub > 0:
                self.fc_ctc_1 = LinearND(self.encoder_num_units_sub,
                                         num_classes_sub + 1,
                                         use_cuda=self.use_cuda)

                # Set CTC decoders
                self._decode_ctc_greedy_np = GreedyDecoder(blank_index=0)
                self._decode_ctc_beam_np = BeamSearchDecoder(blank_index=0)

            ##################################################
            # Initialize parameters
            ##################################################
            self.init_weights(parameter_init,
                              distribution=parameter_init_distribution,
                              ignore_keys=['bias'])

            # Initialize all biases with 0
            self.init_weights(0, distribution='constant', keys=['bias'])

            # Recurrent weights are orthogonalized
            if recurrent_weight_orthogonal:
                self.init_weights(parameter_init,
                                  distribution='orthogonal',
                                  keys=[encoder_type, 'weight'],
                                  ignore_keys=['bias'])
                self.init_weights(parameter_init,
                                  distribution='orthogonal',
                                  keys=[decoder_type, 'weight'],
                                  ignore_keys=['bias'])

            # Initialize bias in forget gate with 1
            if init_forget_gate_bias_with_one:
                self.init_forget_gate_bias_with_one()