def __init__(self,
             input_size,
             encoder_type,
             encoder_bidirectional,
             encoder_num_units,
             encoder_num_proj,
             encoder_num_layers,
             fc_list,
             dropout_input,
             dropout_encoder,
             num_classes,
             parameter_init_distribution='uniform',
             parameter_init=0.1,
             recurrent_weight_orthogonal=False,
             init_forget_gate_bias_with_one=True,
             subsample_list=[],
             subsample_type='drop',
             logits_temperature=1,
             num_stack=1,
             splice=1,
             input_channel=1,
             conv_channels=[],
             conv_kernel_sizes=[],
             conv_strides=[],
             poolings=[],
             activation='relu',
             batch_norm=False,
             label_smoothing_prob=0,
             weight_noise_std=0,
             encoder_residual=False,
             encoder_dense_residual=False):

    super(ModelBase, self).__init__()
    self.model_type = 'ctc'

    # Setting for the encoder
    self.input_size = input_size
    self.num_stack = num_stack
    self.encoder_type = encoder_type
    self.encoder_num_units = encoder_num_units
    if encoder_bidirectional:
        self.encoder_num_units *= 2
    self.fc_list = fc_list
    self.subsample_list = subsample_list
    self.batch_norm = batch_norm

    # Setting for CTC
    self.num_classes = num_classes + 1  # Add the blank class
    self.logits_temperature = logits_temperature

    # Setting for regularization
    self.weight_noise_injection = False
    self.weight_noise_std = float(weight_noise_std)
    self.ls_prob = label_smoothing_prob

    with self.init_scope():
        # Load the encoder
        if encoder_type in ['lstm', 'gru', 'rnn']:
            self.encoder = load(encoder_type=encoder_type)(
                input_size=input_size,
                rnn_type=encoder_type,
                bidirectional=encoder_bidirectional,
                num_units=encoder_num_units,
                num_proj=encoder_num_proj,
                num_layers=encoder_num_layers,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                subsample_list=subsample_list,
                subsample_type=subsample_type,
                use_cuda=self.use_cuda,
                merge_bidirectional=False,
                num_stack=num_stack,
                splice=splice,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                activation=activation,
                batch_norm=batch_norm,
                residual=encoder_residual,
                dense_residual=encoder_dense_residual)
        elif encoder_type == 'cnn':
            assert num_stack == 1 and splice == 1
            self.encoder = load(encoder_type='cnn')(
                input_size=input_size,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                use_cuda=self.use_cuda,
                activation=activation,
                batch_norm=batch_norm)
        else:
            raise NotImplementedError

        ##################################################
        # Fully-connected layers
        ##################################################
        if len(fc_list) > 0:
            for i in range(len(fc_list)):
                if i == 0:
                    if encoder_type == 'cnn':
                        bottle_input_size = self.encoder.output_size
                    else:
                        bottle_input_size = self.encoder_num_units

                    # if batch_norm:
                    #     setattr(self, 'bn_fc_0',
                    #             L.BatchNormalization(bottle_input_size))

                    setattr(self, 'fc_0',
                            LinearND(bottle_input_size, fc_list[i],
                                     dropout=dropout_encoder,
                                     use_cuda=self.use_cuda))
                else:
                    # if batch_norm:
                    #     setattr(self, 'bn_fc_' + str(i),
                    #             L.BatchNormalization(fc_list[i - 1]))

                    setattr(self, 'fc_' + str(i),
                            LinearND(fc_list[i - 1], fc_list[i],
                                     dropout=dropout_encoder,
                                     use_cuda=self.use_cuda))
                # TODO: remove a bias term in the case of batch normalization

            self.fc_out = LinearND(fc_list[-1], self.num_classes,
                                   use_cuda=self.use_cuda)
        else:
            self.fc_out = LinearND(self.encoder_num_units, self.num_classes,
                                   use_cuda=self.use_cuda)

    ##################################################
    # Initialize parameters
    ##################################################
    self.init_weights(parameter_init,
                      distribution=parameter_init_distribution,
                      ignore_keys=['bias'])

    # Initialize all biases with 0
    self.init_weights(0, distribution='constant', keys=['bias'])

    # Recurrent weights are orthogonalized
    if recurrent_weight_orthogonal and encoder_type != 'cnn':
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=[encoder_type, 'weight'],
                          ignore_keys=['bias'])

    # Initialize bias in forget gate with 1
    if init_forget_gate_bias_with_one:
        self.init_forget_gate_bias_with_one()

    # Set CTC decoders
    self._decode_greedy_np = GreedyDecoder(blank_index=0)
    self._decode_beam_np = BeamSearchDecoder(blank_index=0)
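
# A minimal usage sketch (assumption, not taken from this repository's
# training scripts): it only shows how the constructor arguments above fit
# together. The class name `CTC` and every hyperparameter value below are
# hypothetical placeholders; the blank class is appended internally, so
# `num_classes` counts only the real output labels.
#
#   model = CTC(input_size=80,            # e.g. 80-dim log-mel features
#               encoder_type='lstm',
#               encoder_bidirectional=True,
#               encoder_num_units=320,
#               encoder_num_proj=0,
#               encoder_num_layers=5,
#               fc_list=[],
#               dropout_input=0.2,
#               dropout_encoder=0.2,
#               num_classes=28)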
def __init__(self,
             input_size,
             encoder_type,
             encoder_bidirectional,
             encoder_num_units,
             encoder_num_proj,
             encoder_num_layers,
             encoder_num_layers_sub,  # ***
             attention_type,
             attention_dim,
             decoder_type,
             decoder_num_units,
             decoder_num_units_sub,  # ***
             decoder_num_layers,
             decoder_num_layers_sub,  # ***
             embedding_dim,
             embedding_dim_sub,  # ***
             dropout_input,
             dropout_encoder,
             dropout_decoder,
             dropout_embedding,
             main_loss_weight,  # ***
             sub_loss_weight,  # ***
             num_classes,
             num_classes_sub,  # ***
             parameter_init_distribution='uniform',
             parameter_init=0.1,
             recurrent_weight_orthogonal=False,
             init_forget_gate_bias_with_one=True,
             subsample_list=[],
             subsample_type='drop',
             bridge_layer=False,
             init_dec_state='first',
             sharpening_factor=1,
             logits_temperature=1,
             sigmoid_smoothing=False,
             coverage_weight=0,
             ctc_loss_weight_sub=0,  # ***
             attention_conv_num_channels=10,
             attention_conv_width=201,
             num_stack=1,
             splice=1,
             input_channel=1,
             conv_channels=[],
             conv_kernel_sizes=[],
             conv_strides=[],
             poolings=[],
             activation='relu',
             batch_norm=False,
             scheduled_sampling_prob=0,
             scheduled_sampling_max_step=0,
             label_smoothing_prob=0,
             weight_noise_std=0,
             encoder_residual=False,
             encoder_dense_residual=False,
             decoder_residual=False,
             decoder_dense_residual=False,
             decoding_order='attend_generate_update',
             bottleneck_dim=256,
             bottleneck_dim_sub=256,  # ***
             backward_sub=False,  # ***
             num_heads=1,
             num_heads_sub=1):  # ***

    super(HierarchicalAttentionSeq2seq, self).__init__(
        input_size=input_size,
        encoder_type=encoder_type,
        encoder_bidirectional=encoder_bidirectional,
        encoder_num_units=encoder_num_units,
        encoder_num_proj=encoder_num_proj,
        encoder_num_layers=encoder_num_layers,
        attention_type=attention_type,
        attention_dim=attention_dim,
        decoder_type=decoder_type,
        decoder_num_units=decoder_num_units,
        decoder_num_layers=decoder_num_layers,
        embedding_dim=embedding_dim,
        dropout_input=dropout_input,
        dropout_encoder=dropout_encoder,
        dropout_decoder=dropout_decoder,
        dropout_embedding=dropout_embedding,
        num_classes=num_classes,
        parameter_init=parameter_init,
        subsample_list=subsample_list,
        subsample_type=subsample_type,
        bridge_layer=bridge_layer,
        init_dec_state=init_dec_state,
        sharpening_factor=sharpening_factor,
        logits_temperature=logits_temperature,
        sigmoid_smoothing=sigmoid_smoothing,
        coverage_weight=coverage_weight,
        ctc_loss_weight=0,
        attention_conv_num_channels=attention_conv_num_channels,
        attention_conv_width=attention_conv_width,
        num_stack=num_stack,
        splice=splice,
        input_channel=input_channel,
        conv_channels=conv_channels,
        conv_kernel_sizes=conv_kernel_sizes,
        conv_strides=conv_strides,
        poolings=poolings,
        scheduled_sampling_prob=scheduled_sampling_prob,
        scheduled_sampling_max_step=scheduled_sampling_max_step,
        label_smoothing_prob=label_smoothing_prob,
        weight_noise_std=weight_noise_std,
        encoder_residual=encoder_residual,
        encoder_dense_residual=encoder_dense_residual,
        decoder_residual=decoder_residual,
        decoder_dense_residual=decoder_dense_residual,
        decoding_order=decoding_order,
        bottleneck_dim=bottleneck_dim,
        backward_loss_weight=0,
        num_heads=num_heads)
    self.model_type = 'hierarchical_attention'

    # Setting for the encoder
    self.encoder_num_units_sub = encoder_num_units
    if encoder_bidirectional:
        self.encoder_num_units_sub *= 2

    # Setting for the decoder in the sub task
    self.decoder_num_units_1 = decoder_num_units_sub
    self.decoder_num_layers_1 = decoder_num_layers_sub
    self.num_classes_sub = num_classes_sub + 1  # Add <EOS> class
    self.sos_1 = num_classes_sub
    self.eos_1 = num_classes_sub
    # NOTE: <SOS> and <EOS> have the same index
    self.backward_1 = backward_sub

    # Setting for the decoder initialization in the sub task
    if backward_sub:
        if init_dec_state == 'first':
            self.init_dec_state_1_bwd = 'final'
        elif init_dec_state == 'final':
            self.init_dec_state_1_bwd = 'first'
        else:
            self.init_dec_state_1_bwd = init_dec_state
        if encoder_type != decoder_type:
            self.init_dec_state_1_bwd = 'zero'
    else:
        self.init_dec_state_1_fwd = init_dec_state
        if encoder_type != decoder_type:
            self.init_dec_state_1_fwd = 'zero'

    # Setting for the attention in the sub task
    self.num_heads_1 = num_heads_sub

    # Setting for MTL
    self.main_loss_weight = main_loss_weight
    self.sub_loss_weight = sub_loss_weight
    self.ctc_loss_weight_sub = ctc_loss_weight_sub
    if backward_sub:
        self.bwd_weight_1 = sub_loss_weight

    ##############################
    # Encoder
    # NOTE: override the encoder
    ##############################
    if encoder_type in ['lstm', 'gru', 'rnn']:
        self.encoder = load(encoder_type=encoder_type)(
            input_size=input_size,
            rnn_type=encoder_type,
            bidirectional=encoder_bidirectional,
            num_units=encoder_num_units,
            num_proj=encoder_num_proj,
            num_layers=encoder_num_layers,
            num_layers_sub=encoder_num_layers_sub,
            dropout_input=dropout_input,
            dropout_hidden=dropout_encoder,
            subsample_list=subsample_list,
            subsample_type=subsample_type,
            batch_first=True,
            merge_bidirectional=False,
            pack_sequence=True,
            num_stack=num_stack,
            splice=splice,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            activation=activation,
            batch_norm=batch_norm,
            residual=encoder_residual,
            dense_residual=encoder_dense_residual)
    elif encoder_type == 'cnn':
        assert num_stack == 1 and splice == 1
        self.encoder = load(encoder_type='cnn')(
            input_size=input_size,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            dropout_input=dropout_input,
            dropout_hidden=dropout_encoder,
            activation=activation,
            batch_norm=batch_norm)
        self.init_dec_state_0 = 'zero'
        self.init_dec_state_1 = 'zero'
    else:
        raise NotImplementedError

    dir = 'bwd' if backward_sub else 'fwd'

    self.is_bridge_sub = False
    if self.sub_loss_weight > 0:
        ##################################################
        # Bridge layer between the encoder and decoder
        ##################################################
        if encoder_type == 'cnn':
            self.bridge_1 = LinearND(self.encoder.output_size,
                                     decoder_num_units_sub,
                                     dropout=dropout_encoder)
            self.encoder_num_units_sub = decoder_num_units_sub
            self.is_bridge_sub = True
        elif bridge_layer:
            self.bridge_1 = LinearND(self.encoder_num_units_sub,
                                     decoder_num_units_sub,
                                     dropout=dropout_encoder)
            self.encoder_num_units_sub = decoder_num_units_sub
            self.is_bridge_sub = True
        else:
            self.is_bridge_sub = False

        ##################################################
        # Initialization of the decoder
        ##################################################
        if getattr(self, 'init_dec_state_1_' + dir) != 'zero':
            setattr(self, 'W_dec_init_1_' + dir,
                    LinearND(self.encoder_num_units_sub,
                             decoder_num_units_sub))

        ##############################
        # Decoder (sub)
        ##############################
        if decoding_order == 'conditional':
            setattr(self, 'decoder_first_1_' + dir,
                    RNNDecoder(input_size=embedding_dim_sub,
                               rnn_type=decoder_type,
                               num_units=decoder_num_units_sub,
                               num_layers=1,
                               dropout=dropout_decoder,
                               residual=False,
                               dense_residual=False))
            setattr(self, 'decoder_second_1_' + dir,
                    RNNDecoder(input_size=self.encoder_num_units_sub,
                               rnn_type=decoder_type,
                               num_units=decoder_num_units_sub,
                               num_layers=1,
                               dropout=dropout_decoder,
                               residual=False,
                               dense_residual=False))
            # NOTE: the conditional decoder supports only a single layer
        else:
            setattr(self, 'decoder_1_' + dir,
                    RNNDecoder(input_size=self.encoder_num_units_sub +
                               embedding_dim_sub,
                               rnn_type=decoder_type,
                               num_units=decoder_num_units_sub,
                               num_layers=decoder_num_layers_sub,
                               dropout=dropout_decoder,
                               residual=decoder_residual,
                               dense_residual=decoder_dense_residual))

        ###################################
        # Attention layer (sub)
        ###################################
        setattr(self, 'attend_1_' + dir,
                AttentionMechanism(
                    encoder_num_units=self.encoder_num_units_sub,
                    decoder_num_units=decoder_num_units_sub,
                    attention_type=attention_type,
                    attention_dim=attention_dim,
                    sharpening_factor=sharpening_factor,
                    sigmoid_smoothing=sigmoid_smoothing,
                    out_channels=attention_conv_num_channels,
                    kernel_size=attention_conv_width,
                    num_heads=num_heads_sub))

        ##############################
        # Output layer (sub)
        ##############################
        setattr(self, 'W_d_1_' + dir,
                LinearND(decoder_num_units_sub, bottleneck_dim_sub,
                         dropout=dropout_decoder))
        setattr(self, 'W_c_1_' + dir,
                LinearND(self.encoder_num_units_sub, bottleneck_dim_sub,
                         dropout=dropout_decoder))
        setattr(self, 'fc_1_' + dir,
                LinearND(bottleneck_dim_sub, self.num_classes_sub))

        ##############################
        # Embedding (sub)
        ##############################
        if label_smoothing_prob > 0:
            self.embed_1 = Embedding_LS(
                num_classes=self.num_classes_sub,
                embedding_dim=embedding_dim_sub,
                dropout=dropout_embedding,
                label_smoothing_prob=label_smoothing_prob)
        else:
            self.embed_1 = Embedding(num_classes=self.num_classes_sub,
                                     embedding_dim=embedding_dim_sub,
                                     dropout=dropout_embedding,
                                     ignore_index=-1)

    ##############################
    # CTC (sub)
    ##############################
    if ctc_loss_weight_sub > 0:
        self.fc_ctc_1 = LinearND(self.encoder_num_units_sub,
                                 num_classes_sub + 1)

        # Set CTC decoders
        self._decode_ctc_greedy_np = GreedyDecoder(blank_index=0)
        self._decode_ctc_beam_np = BeamSearchDecoder(blank_index=0)
        # NOTE: index 0 is reserved for the blank class

    ##################################################
    # Initialize parameters
    ##################################################
    self.init_weights(parameter_init,
                      distribution=parameter_init_distribution,
                      ignore_keys=['bias'])

    # Initialize all biases with 0
    self.init_weights(0, distribution='constant', keys=['bias'])

    # Recurrent weights are orthogonalized
    if recurrent_weight_orthogonal:
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=[encoder_type, 'weight'],
                          ignore_keys=['bias'])
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=[decoder_type, 'weight'],
                          ignore_keys=['bias'])

    # Initialize bias in forget gate with 1
    if init_forget_gate_bias_with_one:
        self.init_forget_gate_bias_with_one()
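
# A minimal usage sketch (assumption, not taken from this repository's
# training scripts): constructing the hierarchical model above for a typical
# multi-task setup, e.g. a main task on the full encoder and a sub task
# (attention and/or CTC) on an intermediate encoder layer. Every value below
# is an illustrative placeholder; only the keyword names come from the
# constructor signature, and `attention_type='location'` is a guess at one
# supported option, not a documented default.
#
#   model = HierarchicalAttentionSeq2seq(
#       input_size=80,
#       encoder_type='lstm', encoder_bidirectional=True,
#       encoder_num_units=320, encoder_num_proj=0,
#       encoder_num_layers=5, encoder_num_layers_sub=3,
#       attention_type='location', attention_dim=128,
#       decoder_type='lstm',
#       decoder_num_units=320, decoder_num_units_sub=320,
#       decoder_num_layers=1, decoder_num_layers_sub=1,
#       embedding_dim=64, embedding_dim_sub=32,
#       dropout_input=0.2, dropout_encoder=0.2,
#       dropout_decoder=0.2, dropout_embedding=0.2,
#       main_loss_weight=0.8, sub_loss_weight=0.2,
#       num_classes=10000, num_classes_sub=28)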