def ce_loss(self, logits, labels, len_logits, len_labels):
    """
    Compute the cross-entropy optimization loss.
    Inputs are batch-major.
    """
    l = tf.reduce_min([tf.shape(logits)[1], tf.shape(labels)[1]])
    with tf.name_scope('CE_loss'):
        crossent = smoothing_cross_entropy(
            logits=logits[:, :l, :],
            labels=labels[:, :l],
            vocab_size=self.args.dim_output,
            confidence=1.0)

        mask = tf.sequence_mask(len_labels, maxlen=l, dtype=logits.dtype)
        mask2 = tf.sequence_mask(len_logits, maxlen=l, dtype=logits.dtype)
        mask *= mask2
        # use reduce_sum rather than reduce_mean: only the masked (valid) tokens should count
        loss = tf.reduce_sum(crossent * mask, -1)

        if self.args.model.decoder2.confidence_penalty > 0:  # utt-level
            cp_loss = self.args.model.decoder2.confidence_penalty * \
                confidence_penalty(logits, len_logits)
            loss += cp_loss

        if self.args.model.token_level_ocd:  # token-level
            loss /= tf.reduce_sum(mask, -1)

    return loss
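
# NOTE: `smoothing_cross_entropy` is a helper defined elsewhere in this repo; its exact
# implementation is not shown in this section. The function below is only a minimal sketch
# of the usual label-smoothed cross-entropy, with the name and signature mirroring the call
# in `ce_loss` above (that correspondence is an assumption): it returns per-token
# cross-entropy of shape [batch, time], and with confidence=1.0 it reduces to plain CE.
def smoothing_cross_entropy_sketch(logits, labels, vocab_size, confidence):
    """Per-token cross-entropy against a label-smoothed target distribution."""
    low_confidence = (1.0 - confidence) / float(vocab_size - 1)
    soft_targets = tf.one_hot(
        labels, depth=vocab_size,
        on_value=confidence, off_value=low_confidence)
    # shape [batch, time]; masking and reduction are left to the caller, as in ce_loss
    return tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=soft_targets, logits=logits)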
def ocd_loss(self, logits, len_logits, labels, decoded, len_decoded):
    """
    The logits length is the sample_id length.
    Returns a loss of batch shape.
    If `len_logits` is all zeros, the output is 0.
    """
    from tfModels.OptimalDistill import OCD

    optimal_distributions, optimal_targets = OCD(
        hyp=decoded,
        ref=labels,
        vocab_size=self.args.dim_output)

    crossent = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=optimal_distributions,
        logits=logits)

    pad_mask = tf.sequence_mask(
        len_logits,
        maxlen=tf.shape(logits)[1],
        dtype=logits.dtype)

    loss = tf.reduce_sum(crossent * pad_mask, -1)  # utt-level

    if self.args.model.decoder2.confidence_penalty > 0:  # utt-level
        cp_loss = self.args.model.decoder2.confidence_penalty * \
            confidence_penalty(logits, len_decoded)
        loss += cp_loss

    if self.args.model.token_level_ocd:  # token-level
        loss /= tf.reduce_sum(pad_mask, -1)

    return loss
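
# NOTE: `confidence_penalty` is imported from this repo's utilities and its definition is
# not included in this section. The sketch below is an assumed implementation of the
# entropy-based confidence penalty (Pereyra et al., 2017), consistent with how it is used
# above: it returns one value per utterance (shape [batch]), the negative entropy of the
# output distribution summed over the valid time steps.
def confidence_penalty_sketch(logits, len_logits):
    """Negative entropy of the per-step output distribution, masked and summed over time."""
    probs = tf.nn.softmax(logits)                                        # [batch, time, vocab]
    neg_entropy = tf.reduce_sum(probs * tf.log(probs + 1e-10), axis=-1)  # [batch, time]
    mask = tf.sequence_mask(len_logits, maxlen=tf.shape(logits)[1], dtype=logits.dtype)
    return tf.reduce_sum(neg_entropy * mask, axis=-1)                    # [batch]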
def build_single_graph(self, id_gpu, name_gpu, tensors_input):
    with tf.device(lambda op: choose_device(op, name_gpu, self.center_device)):
        inputs = tensors_input.feature_splits[id_gpu]
        len_inputs = tensors_input.len_fea_splits[id_gpu]
        inputs.set_shape([None, None, self.size_embedding])

        if self.type == 'LSTM':
            from tfSeq2SeqModels.decoders.lm_decoder import LM_Decoder
            self.decoder = LM_Decoder(self.args, self.is_train, self.embed_table_decoder)
            logits = self.decoder(inputs, len_inputs)
        elif self.type == 'SelfAttention':
            from tfSeq2SeqModels.decoders.self_attention_lm_decoder import SelfAttentionDecoder
            self.decoder = SelfAttentionDecoder(self.args, self.is_train, self.embed_table_decoder)
            # from tfSeq2SeqModels.decoders.self_attention_lm_decoder_lh import SelfAttentionDecoder_lh
            # decoder = SelfAttentionDecoder_lh(self.args, self.is_train, self.embed_table_decoder)
            logits = self.decoder(inputs, len_inputs)

        len_logits = tensors_input.len_label_splits[id_gpu]
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tensors_input.label_splits[id_gpu],
            logits=logits)
        loss *= tf.sequence_mask(
            tensors_input.len_label_splits[id_gpu],
            maxlen=tf.shape(logits)[1],
            dtype=logits.dtype)

        if self.args.model.confidence_penalty:
            ls_loss = self.args.model.confidence_penalty * confidence_penalty(logits, len_logits)
            ls_loss = tf.reduce_mean(ls_loss)
            loss += ls_loss

        # from tfModels.tensor2tensor.common_layers import padded_cross_entropy, weights_nonzero
        #
        # mask = tf.sequence_mask(
        #     tensors_input.len_label_splits[id_gpu],
        #     maxlen=tf.shape(logits)[1],
        #     dtype=logits.dtype)
        # batch_mask = tf.tile(tf.expand_dims(mask, -1), [1, 1, tf.shape(logits)[-1]])
        # loss, _ = padded_cross_entropy(
        #     logits * batch_mask,
        #     tensors_input.label_splits[id_gpu],
        #     0.0,
        #     weights_fn=weights_nonzero,
        #     reduce_sum=False)
        # loss = tf.Print(loss, [weight_sum], message='weight_sum', summarize=1000)

        if self.is_train:
            with tf.name_scope("gradients"):
                gradients = self.optimizer.compute_gradients(loss)

    self.__class__.num_Model += 1
    logging.info('\tbuild {} on {} successfully! total model number: {}'.format(
        self.__class__.__name__, name_gpu, self.__class__.num_Model))

    if self.is_train:
        return loss, gradients
    else:
        return loss
def rna_loss(self, logits, len_logits, labels, len_labels, encoded=None, len_encoded=None):
    """
    CTC loss with repeated-label merging disabled (RNA-style alignment).
    Returns a loss of batch shape.
    """
    with tf.name_scope("ctc_loss"):
        labels_sparse = dense_sequence_to_sparse(labels, len_labels)
        loss = tf.nn.ctc_loss(
            labels_sparse,
            logits,
            sequence_length=len_logits,
            ctc_merge_repeated=False,
            ignore_longer_outputs_than_inputs=True,
            time_major=False)

    if self.args.model.decoder.confidence_penalty:
        ls_loss = self.args.model.decoder.confidence_penalty * \
            confidence_penalty(logits, len_logits)
        loss += ls_loss

    return loss
def ctc_loss(self, logits, len_logits, labels, len_labels):
    """
    "No valid path found" can occur when the activations for the target labels are zero.
    Returns a loss of batch shape.
    """
    with tf.name_scope("ctc_loss"):
        labels_sparse = dense_sequence_to_sparse(labels, len_labels)
        loss = tf.nn.ctc_loss(
            labels_sparse,
            logits,
            sequence_length=len_logits,
            ctc_merge_repeated=self.args.model.avg_repeated,
            ignore_longer_outputs_than_inputs=True,
            time_major=False)

    if self.args.model.decoder.confidence_penalty:
        ls_loss = self.args.model.decoder.confidence_penalty * \
            confidence_penalty(logits, len_logits)
        loss += ls_loss

    return loss
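
# NOTE: `dense_sequence_to_sparse` is a repo utility whose definition is not included in
# this section. The sketch below shows the assumed behavior: it turns a padded dense label
# batch [batch, time] plus per-utterance lengths into the tf.SparseTensor that
# tf.nn.ctc_loss expects.
def dense_sequence_to_sparse_sketch(seq, len_seq):
    """Convert a padded dense label batch into a SparseTensor holding only the valid entries."""
    mask = tf.sequence_mask(len_seq, maxlen=tf.shape(seq)[1])
    indices = tf.where(mask)                 # int64 [n_valid, 2] coordinates
    values = tf.boolean_mask(seq, mask)      # the labels at those coordinates
    dense_shape = tf.to_int64(tf.shape(seq))
    return tf.SparseTensor(indices, values, dense_shape)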
def build_single_graph(self, id_gpu, name_gpu, tensors_input):
    tf.get_variable_scope().set_initializer(tf.variance_scaling_initializer(
        1.0, mode="fan_avg", distribution="uniform"))
    with tf.device(lambda op: choose_device(op, name_gpu, self.center_device)):
        # create the encoder and decoder objects
        encoder = self.gen_encoder(
            is_train=self.is_train,
            args=self.args)
        decoder = self.gen_decoder(
            is_train=self.is_train,
            embed_table=None,
            global_step=self.global_step,
            args=self.args)

        features = tensors_input.feature_splits[id_gpu]

        # use the encoder to encode the input sequence
        hidden_output, len_hidden_output = encoder(
            features=features,
            len_feas=tensors_input.len_fea_splits[id_gpu])
        logits, align, len_logits = decoder(hidden_output, len_hidden_output)

        if self.is_train:
            loss = self.ctc_loss(
                logits=logits,
                len_logits=len_logits,
                labels=tensors_input.label_splits[id_gpu],
                len_labels=tensors_input.len_label_splits[id_gpu])

            if self.args.model.balance_training:
                # keep only utterances whose per-token loss exceeds the threshold
                token_loss = loss / tf.to_float(len_logits)
                mask = tf.to_float(
                    tf.greater(token_loss, self.args.model.balance_training))
                loss *= mask

            if self.args.model.confidence_penalty:
                cp_loss = self.args.model.decoder.confidence_penalty * confidence_penalty(
                    logits, len_logits)
                assert cp_loss.get_shape().ndims == 1
                loss += cp_loss

            if self.args.model.constrain_repeated:
                from tfModels.CTCShrink import repeated_constrain_loss

                loss_constrain = repeated_constrain_loss(
                    distribution_acoustic=logits,
                    hidden=hidden_output,
                    len_acoustic=len_hidden_output,
                    blank_id=self.args.dim_output - 1)
                loss += self.args.model.constrain_repeated * loss_constrain

            with tf.name_scope("gradients"):
                assert loss.get_shape().ndims == 1
                loss = tf.reduce_mean(loss)
                gradients = self.optimizer.compute_gradients(loss)

    self.__class__.num_Model += 1
    logging.info('\tbuild {} on {} successfully! total model number: {}'.format(
        self.__class__.__name__, name_gpu, self.__class__.num_Model))

    if self.is_train:
        return loss, gradients, [align, tensors_input.label_splits[id_gpu]]
    else:
        return logits, len_logits
def build_single_graph(self, id_gpu, name_gpu, tensors_input):
    tf.get_variable_scope().set_initializer(tf.variance_scaling_initializer(
        1.0, mode="fan_avg", distribution="uniform"))
    with tf.device(lambda op: choose_device(op, name_gpu, self.center_device)):
        self.encoder = self.gen_encoder(
            is_train=self.is_train,
            args=self.args)
        self.fc_decoder = self.gen_decoder(
            is_train=self.is_train,
            embed_table=None,
            global_step=self.global_step,
            args=self.args,
            name='decoder')
        self.decoder = decoder = self.gen_decoder2(
            is_train=self.is_train,
            embed_table=self.embedding_tabel,
            global_step=self.global_step,
            args=self.args,
            name='decoder2')

        hidden_output, len_hidden_output = self.encoder(
            features=tensors_input.feature_splits[id_gpu],
            len_feas=tensors_input.len_fea_splits[id_gpu])
        logits_acoustic, alignment, len_acoustic = self.fc_decoder(
            hidden_output, len_hidden_output)
        logits_acoustic = tf.stop_gradient(logits_acoustic)
        len_acoustic = tf.stop_gradient(len_acoustic)

        distribution_acoustic = tf.nn.softmax(logits_acoustic)

        # whether to shrink the hidden states or the acoustic distribution
        if not self.args.model.shrink_hidden:
            hidden_output = distribution_acoustic

        blank_id = self.args.dim_ctc_output - 1 if self.args.dim_ctc_output else self.args.dim_output - 1
        hidden_shrunk, len_no_blank = acoustic_hidden_shrink_tf(
            distribution_acoustic=distribution_acoustic,
            hidden=hidden_output,
            len_acoustic=len_acoustic,
            blank_id=blank_id,
            num_post=self.args.model.num_post,
            frame_expand=self.args.model.frame_expand)

        if (not self.is_train) and (self.args.beam_size > 1):
            # infer phase
            with tf.variable_scope(decoder.name or 'decoder'):
                logits, decoded, len_decoded = decoder.beam_decode_rerank(
                    hidden_shrunk,
                    len_no_blank)
        else:
            # train phase
            logits, decoded, len_decoded = decoder(hidden_shrunk, len_no_blank)

        if self.is_train:
            if self.args.model.use_ce_loss:
                loss = self.ce_loss(
                    logits=logits,
                    labels=tensors_input.label_splits[id_gpu],
                    len_logits=len_acoustic,
                    len_labels=tensors_input.len_label_splits[id_gpu])
            else:
                loss = self.ocd_loss(
                    logits=logits,
                    len_logits=len_decoded,
                    labels=tensors_input.label_splits[id_gpu],
                    decoded=decoded,
                    len_decoded=len_decoded)

            if self.args.model.confidence_penalty > 0:  # utt-level
                cp_loss = self.args.model.confidence_penalty * \
                    confidence_penalty(logits, len_decoded) / tf.to_float(len_decoded)
                loss += cp_loss

            if self.args.model.musk_update:
                self.idx_update = self.deserve_idx(
                    decoded,
                    len_decoded,
                    tensors_input.label_splits[id_gpu],
                    tensors_input.len_label_splits[id_gpu])
                loss = tf.reshape(tf.gather(loss, self.idx_update), [-1])

            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.decoder.params])

            with tf.name_scope("gradients"):
                loss = tf.reduce_mean(loss)
                gradients = self.optimizer.compute_gradients(loss)

    self.__class__.num_Model += 1
    logging.info('\tbuild {} on {} successfully! total model number: {}'.format(
        self.__class__.__name__, name_gpu, self.__class__.num_Model))

    if self.is_train:
        return loss, gradients, \
            [decoded, tensors_input.label_splits[id_gpu], l2_loss]
        # return loss, gradients, tf.no_op()
    else:
        return logits, len_decoded, decoded
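
# NOTE: `acoustic_hidden_shrink_tf` lives elsewhere in the repo; its `num_post` and
# `frame_expand` options are not reproduced here. The function below is only a simplified
# sketch of the core idea assumed above: drop the frames whose most probable CTC label is
# blank, so the second decoder runs over a much shorter, roughly label-synchronous sequence.
def acoustic_hidden_shrink_sketch(distribution_acoustic, hidden, len_acoustic, blank_id):
    """Keep non-blank frames of `hidden`, re-padded to the original max length."""
    max_time = tf.shape(hidden)[1]
    not_blank = tf.not_equal(tf.argmax(distribution_acoustic, -1), blank_id)  # [batch, time]
    valid = tf.sequence_mask(len_acoustic, maxlen=max_time)
    keep = tf.logical_and(not_blank, valid)
    len_no_blank = tf.reduce_sum(tf.to_int32(keep), axis=-1)

    def _gather(args):
        h, k = args
        kept = tf.boolean_mask(h, k)               # [n_kept, dim]
        pad = max_time - tf.shape(kept)[0]
        return tf.pad(kept, [[0, pad], [0, 0]])    # re-pad to max_time

    hidden_shrunk = tf.map_fn(_gather, (hidden, keep), dtype=hidden.dtype)
    return hidden_shrunk, len_no_blank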