def ctc_find_eos(y_true, y_pred):
    # From SO : TODO : var init, predlength objective
    # Note: eos_index, max_length and self.beta are assumed to be defined in the enclosing scope.

    # Convert y_pred from one-hot to label indices
    y_pred_ind = K.argmax(y_pred, axis=-1)

    # Make sure y_pred has one end_of_sentence (to avoid errors)
    y_pred_end = K.concatenate(
        [y_pred_ind[:, :-1], eos_index * K.ones_like(y_pred_ind[:, -1:])], axis=1)

    # Make sure the first occurrence of the char is more important than subsequent ones
    # (descending weights; step=-1 so the range is not empty)
    occurrence_weights = K.arange(start=max_length, stop=0, step=-1, dtype=K.floatx())
    is_eos_true = K.cast_to_floatx(K.equal(y_true, eos_index))
    is_eos_pred = K.cast_to_floatx(K.equal(y_pred_end, eos_index))

    # Lengths
    true_lengths = 1 + K.argmax(occurrence_weights * is_eos_true, axis=1)
    pred_lengths = 1 + K.argmax(occurrence_weights * is_eos_pred, axis=1)

    # Reshape
    true_lengths = K.reshape(true_lengths, (-1, 1))
    pred_lengths = K.reshape(pred_lengths, (-1, 1))

    return K.ctc_batch_cost(y_true, y_pred, pred_lengths, true_lengths) + self.beta(pred_lengths)  # Maybe a temp fix
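# Hedged numeric check (mine, not from the source) of the first-occurrence trick used in
# ctc_find_eos above: multiplying the EOS indicator by descending weights makes argmax
# pick the FIRST EOS position, and adding 1 turns that index into a length.
import tensorflow as tf

eos_index_demo = 9
labels_demo = tf.constant([[3, 1, 9, 9, 9]], dtype=tf.int32)    # EOS first appears at index 2
weights_demo = tf.range(5, 0, -1, dtype=tf.float32)             # [5, 4, 3, 2, 1]
is_eos_demo = tf.cast(tf.equal(labels_demo, eos_index_demo), tf.float32)
length_demo = 1 + tf.argmax(weights_demo * is_eos_demo, axis=1)
print(length_demo.numpy())                                      # [3]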
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14, conv_dim=128, lstm_dim=256):
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f'Window width/stride need to generate at least {output_length} windows (currently {num_windows})')

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 1
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect an input of shape (num_batch_size, num_timesteps, feature_length).

    ##### Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)  # (image_height, image_width, 1)
    conv = Conv2D(conv_dim, (image_height, window_width), (1, window_stride), activation='relu')(image_reshaped)
    conv_squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv)  # (num_windows, conv_dim)

    lstm_output1 = lstm_fn(lstm_dim, return_sequences=True)(conv_squeezed)  # (num_windows, lstm_dim)
    lstm_output2 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output1)
    lstm_output3 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output2 + lstm_output1)  # residual connection
    lstm_output4 = lstm_fn(lstm_dim, return_sequences=True)(lstm_output3 + lstm_output2)  # residual connection

    softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output4)  # (num_windows, num_classes)
    ##### Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows}
    )(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss'
    )([y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded'
    )([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output]
    )
    return model
def ctc_loss(y_true, y_pred):
    """Function for computing the CTC loss"""
    if len(y_true.shape) > 2:
        y_true = tf.squeeze(y_true)

    # y_pred.shape = (batch_size, string_length, alphabet_size_1_hot_encoded)
    # The output layer of the model is softmax, so summing across
    # alphabet_size_1_hot_encoded gives 1 per time step; summing those over
    # time gives the string_length (number of time steps) for each sample.
    input_length = tf.math.reduce_sum(y_pred, axis=-1, keepdims=False)
    input_length = tf.math.reduce_sum(input_length, axis=-1, keepdims=True)

    # y_true strings are padded with 0, so the count of non-zero entries gives
    # the number of characters in each string.
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    # About K.ctc_batch_cost:
    # https://docs.w3cub.com/tensorflow~python/tf/keras/backend/ctc_batch_cost
    # https://stackoverflow.com/questions/60782077/how-do-you-use-tensorflow-ctc-batch-cost-function-with-keras
    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)

    # Average loss across all entries in the batch
    loss = tf.reduce_mean(loss)
    return loss
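# Hedged sketch (mine, not from the source): a tiny eager check of the length trick used in
# ctc_loss above. A softmax output sums to 1 at every time step, so summing twice (over the
# class axis, then over time) recovers the number of time steps; count_nonzero on 0-padded
# labels recovers the label length (this assumes label index 0 is reserved for padding).
import tensorflow as tf

y_pred_demo = tf.fill([2, 5, 4], 0.25)   # (batch=2, time=5, classes=4); each time step sums to 1
y_true_demo = tf.constant([[3, 1, 2, 0, 0], [1, 1, 0, 0, 0]], dtype=tf.int64)  # 0-padded labels

input_length_demo = tf.reduce_sum(tf.reduce_sum(y_pred_demo, axis=-1), axis=-1, keepdims=True)
label_length_demo = tf.math.count_nonzero(y_true_demo, axis=-1, keepdims=True)

print(input_length_demo.numpy())  # [[5.] [5.]]
print(label_length_demo.numpy())  # [[3] [2]]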
def ctc_loss_lambda_func(y_true, y_pred):
    """Function for computing the CTC loss"""
    if len(y_true.shape) > 2:
        y_true = tf.squeeze(y_true)

    # y_pred.shape = (batch_size, string_length, alphabet_size_1_hot_encoded)
    # The output of the model is softmax, so summing across
    # alphabet_size_1_hot_encoded gives 1 per time step and summing those over
    # time gives the string length.
    input_length = tf.math.reduce_sum(y_pred, axis=-1, keepdims=False)
    input_length = tf.math.reduce_sum(input_length, axis=-1, keepdims=True)

    # y_true strings are padded with 0, so the count of non-zero entries gives
    # the number of characters in each string.
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)

    # Average loss across all entries in the batch
    loss = tf.reduce_mean(loss)
    return loss
def ctc_lambda_loss(logits, labels, input_length, label_length, smoothing=0.0):
    '''CTC loss function

    param: logits, (B, T, D)
    param: input_length, (B, 1), input length of encoder
    param: labels, (B, T)
    param: label_length, (B, 1), label length used to convert the dense labels to sparse
    returns: loss, scalar
    '''
    del smoothing

    ilen = tf.cond(
        pred=tf.equal(tf.rank(input_length), 1),
        true_fn=lambda: tf.expand_dims(input_length, axis=-1),
        false_fn=lambda: input_length,
    )
    olen = tf.cond(
        pred=tf.equal(tf.rank(label_length), 1),
        true_fn=lambda: tf.expand_dims(label_length, axis=-1),
        false_fn=lambda: label_length,
    )
    deps = [
        tf.assert_rank(labels, 2),
        tf.assert_rank(logits, 3),
        tf.assert_rank(ilen, 2),   # input_length
        tf.assert_rank(olen, 2),   # output_length
    ]
    with tf.control_dependencies(deps):
        # (B, 1)
        batch_loss = K.ctc_batch_cost(labels, logits, ilen, olen)
        loss = tf.reduce_mean(batch_loss)
    return loss
def validate(model, x, y_true, input_len, label_len, y_strings, test=False, save_file=None):
    input_len = np.expand_dims(input_len, axis=1)
    label_len = np.expand_dims(label_len, axis=1)

    y_pred = model(x)
    loss = ctc_batch_cost(y_true, y_pred, input_len, label_len)

    input_len = np.squeeze(input_len)
    y_decode = ctc_decode(y_pred, input_len)[0][0]

    accuracy = 0.0
    for i in range(len(y_strings)):
        predicted_sentence = indices_to_string(y_decode[i].numpy())
        accuracy += wer(predicted_sentence, y_strings[i])
        if test:
            save_file.write("Correct Sentence:" + str(y_strings[i]) + "\n")
            save_file.write("Predicted Sentence:" + predicted_sentence + "\n")

    return tf.reduce_mean(loss), accuracy / len(y_strings)
def ctc_loss(y_true, y_pred):
    """
    Runs the CTC loss algorithm on each batch element.

    :param y_true: tensor (samples, max_string_length) containing the truth labels.
    :param y_pred: tensor (samples, time_steps, num_categories) containing the prediction,
        or output of the softmax.

    Caution:
        input_length: tensor (samples, 1) containing the sequence length for each batch item in y_pred.
        label_length: tensor (samples, 1) containing the sequence length for each batch item in y_true.

    y_true looks like [3, 7, 12, 1, 2, -1, -1, -1, -1], where -1 means blank.
    The index of the first -1 equals the sequence length of y_true.
    The total width (time_steps) of y_pred equals input_length.
    """
    # Get the length of the prediction
    shape = tf.shape(y_pred)
    batch_size = shape[0]
    max_length = shape[1, None, None]
    input_length = tf.tile(max_length, [batch_size, 1])

    # Get the length of the labels (index of the first -1 in each row)
    label_length = tf.argmin(y_true, axis=-1)[:, None]

    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
def ctc_loss_lambda_func(y_true, y_pred):
    """Function for computing the CTC loss"""
    if len(y_true.shape) > 2:
        y_true = tf.squeeze(y_true)

    # y_pred.shape = (batch_size, string_length, alphabet_size_1_hot_encoded)
    # The output of the model is softmax, so summing across
    # alphabet_size_1_hot_encoded gives 1 per time step and summing those over
    # time gives the string length.
    input_length = tf.math.reduce_sum(y_pred, axis=-1, keepdims=False)
    input_length = tf.math.reduce_sum(input_length, axis=-1, keepdims=True)

    # y_true strings are padded with 0, so the count of non-zero entries gives
    # the number of characters in each string.
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    # If you get an error while training, go to the definition of the ctc_batch_cost
    # function and add 'ignore_longer_outputs_than_inputs=True' to the parameters of
    # the ctc.ctc_loss() call (line 5764).
    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)

    # Average loss across all entries in the batch
    loss = tf.reduce_mean(loss)
    return loss
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    y_pred = y_pred[:, 2:, :]
    res = backend.ctc_batch_cost(labels, y_pred, input_length, label_length)
    return res
def ctc_loss(self, y_true, y_pred):
    if len(y_true.shape) > 2:
        y_true = tf.squeeze(y_true)

    input_length = tf.math.reduce_sum(y_pred, axis=-1, keepdims=False)
    input_length = tf.math.reduce_sum(input_length, axis=-1, keepdims=True)
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    loss = tf.reduce_mean(loss)
    return loss
def ctc_lambda_func(args):
    """
    labels: tensor (number of samples, max_string_length) containing the truth labels.
    y_pred: tensor (number of samples, time_steps, num_character_labels) containing the prediction,
        or output of the softmax.
    input_length: tensor (number of samples, 1) containing the sequence length for each batch item in y_pred.
    label_length: tensor (number of samples, 1) containing the sequence length for each batch item in y_true.
    """
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):  # pylint: disable=too-many-locals
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(f"Window width/stride need to generate >= {output_length} windows (currently {num_windows})")

    image_input = Input(shape=input_shape, name="image")
    y_true = Input(shape=(output_length,), name="y_true")
    input_length = Input(shape=(1,), name="input_length")
    label_length = Input(shape=(1,), name="label_length")

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect an input of shape (num_batch_size, num_timesteps, feature_length).

    # Your code below (Lab 3)
    image_reshaped = Reshape((image_height, image_width, 1))(image_input)  # (image_height, image_width, 1)
    image_patches = Lambda(slide_window, arguments={"window_width": window_width, "window_stride": window_stride})(
        image_reshaped
    )  # (num_windows, image_height, window_width, 1)

    # Make a LeNet and get rid of the last two layers (softmax and dropout)
    convnet = lenet((image_height, window_width, 1), (num_classes,))
    convnet = KerasModel(inputs=convnet.inputs, outputs=convnet.layers[-2].output)
    convnet_outputs = TimeDistributed(convnet)(image_patches)  # (num_windows, 128)

    lstm_output = LSTM(128, return_sequences=True)(convnet_outputs)  # (num_windows, 128)
    softmax_output = Dense(num_classes, activation="softmax", name="softmax_output")(lstm_output)  # (num_windows, num_classes)
    # Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows, arguments={"num_windows": num_windows}
    )(input_length)

    ctc_loss_output = Lambda(lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]), name="ctc_loss")(
        [y_true, softmax_output, input_length_processed, label_length]
    )

    ctc_decoded_output = Lambda(lambda x: ctc_decode(x[0], x[1], output_length), name="ctc_decoded")(
        [softmax_output, input_length_processed]
    )

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output],
    )
    return model
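# Hedged usage sketch (mine, not from the source): a model built by line_lstm_ctc above already
# emits the CTC loss as the named output 'ctc_loss', so it is typically compiled with a dummy
# loss that just passes that output through; the 'ctc_decoded' output gets no loss and is used
# for inference only. The shapes below are illustrative and this assumes the imports/helpers
# that line_lstm_ctc itself relies on (Input, lenet, slide_window, ctc_decode, KerasModel, ...).
model = line_lstm_ctc(input_shape=(28, 952), output_shape=(34, 64))
model.compile(
    optimizer='adam',
    loss={'ctc_loss': lambda y_true, y_pred: y_pred},  # the loss tensor is already computed in the graph
)
# When fitting, feed the four named inputs ('image', 'y_true', 'input_length', 'label_length')
# and a dummy target for the 'ctc_loss' output.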
def ctc_loss_lambda_func(args):
    """
    Function for computing the CTC loss (can be put in a Lambda layer)

    :param args: y_pred, labels, input_length, label_length
    :return: CTC loss
    """
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
def ctc_lambda_func(args):
    """Lambda implementation of CTC loss, using ctc_batch_cost from the TensorFlow backend.

    CTC implementation from the Keras example found at
    https://github.com/keras-team/keras/blob/master/examples/image_ocr.py"""
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    # print "y_pred_shape: ", y_pred.shape
    # y_pred = y_pred[:, 2:, :]
    # print "y_pred_shape: ", y_pred.shape
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
def ctcLoss(yTrue, yPred):
    # Reshape the ground truth tensor into the shape required by ctc_batch_cost().
    yTrueShape = K.shape(yTrue)
    yTrue = K.reshape(yTrue, shape=(yTrueShape[0], yTrueShape[1]))

    # Get the input sequence and label sequence length for each sample in the batch.
    hasTrueLabels = K.clip(yTrue + 1, 0, 1)
    labelLength = K.sum(hasTrueLabels, axis=1, keepdims=True)
    hasPredLabels = K.sum(yPred, axis=2)
    inputLength = K.sum(hasPredLabels, axis=1, keepdims=True)

    return K.ctc_batch_cost(yTrue, yPred, inputLength, labelLength)
def ctc_lambda_func(args):
    '''
    y_true = numeric translation of the text
    y_pred = output of the softmax layer
    input_length = output sequence length
    label_length = length of the true sequence
    '''
    y_true, y_pred, input_length, label_length = args
    print(y_true.shape)
    print(y_pred.shape)
    print(input_length)
    print(label_length)
    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
def ctc_lambda_func(y_true, y_pred, model_config, **kwargs):
    # Without **kwargs this does not build under TF 2.0
    outputstep = y_pred.get_shape()[1]  # Get the sequence length of the input data

    # Specify the sequence length individually for each sample in the batch
    input_length = np.asarray([[outputstep]] * model_config['batchsize'], dtype=np.int32)  # (np.int was removed in newer NumPy)
    label_length = np.asarray([[model_config['label_len']]] * model_config['batchsize'])

    # input_length must be greater than label_length, otherwise CTC reports an invalid sequence
    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
def ctc_loss(y_true, y_pred):
    """
    input_length = np.array(([61]*y_true.shape[1])).reshape((1,-1))
    label_length = np.array(([61]*y_pred.shape[1])).reshape((1,-1))
    input_length = tf.convert_to_tensor(input_length, dtype='int64')
    label_length = tf.convert_to_tensor(label_length, dtype='int64')
    """
    # Note: the labels and lengths here are fresh symbolic Input placeholders,
    # so y_true is not actually used by this loss.
    labels = Input(name='the_labels', shape=[None], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
def ctc_loss_lambda_func(y_true, y_pred):
    """Function for computing the CTC loss"""
    if len(y_true.shape) > 2:
        y_true = tf.squeeze(y_true)

    input_length = tf.math.reduce_sum(y_pred, axis=-1, keepdims=False)
    input_length = tf.math.reduce_sum(input_length, axis=-1, keepdims=True)
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    loss = tf.reduce_mean(loss)
    return loss
def ctc_loss_lambda_func(y_true, y_pred):
    """Function for computing the CTC loss"""
    input_length = tf.ones(BATCH_SIZE) * MAX_LABEL_LENGTH
    input_length = tf.expand_dims(input_length, axis=-1)
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    loss = tf.reduce_mean(loss)
    return loss
def ctc_loss(args):
    '''
    More info on CTC:
    https://towardsdatascience.com/intuitively-understanding-connectionist-temporal-classification-3797e43a86c

    Creates the CTC (Connectionist Temporal Classification) loss for a speech-to-text model.

    :params: args - list of params: predictions, labels, input_len and labels_len
    :returns: calculated CTC loss based on args.
    '''
    predictions, labels, input_len, labels_len = args
    return K.ctc_batch_cost(labels, predictions, input_len, labels_len)
def ctc_loss(self, labels, logits):
    print(labels.shape, 'loss')
    if labels.shape[1] == None:
        labels = k.placeholder(shape=(self.batch_size, self.max_len + 1), dtype=tf.int32)
        # tf.dtypes.cast(labels, tf.int32)

    y_true, length = tf.split(labels, [(labels.shape[1] - 1), 1], 1)
    logit_length = tf.expand_dims(tf.convert_to_tensor(
        [self.frames - self.cutoff] * self.batch_size, dtype=tf.int32), axis=1)
    # logits = logits[:, self.cutoff:, :]
    # length = tf.squeeze(length, axis=1)
    print(y_true.shape, logits.shape, length.shape, logit_length.shape, 'ctcloss')
    return k.ctc_batch_cost(y_true, logits, logit_length, length)
def call(self, y_true, y_pred):
    # Compute the CTC loss and add it directly to the layer; return the predictions.
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    self.add_loss(loss)

    # At run time, just return the computed predictions
    return y_pred
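# Hedged sketch (mine, not from the source): a call() like the one above typically lives in a
# small Keras layer so the CTC loss is attached with add_loss() and compile() needs no loss
# argument. The names below (CTCLayer, the toy feature extractor) are illustrative only.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

class CTCLayer(keras.layers.Layer):
    def call(self, y_true, y_pred):
        # Same length bookkeeping as the call() above
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") * tf.ones((batch_len, 1), dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") * tf.ones((batch_len, 1), dtype="int64")
        self.add_loss(K.ctc_batch_cost(y_true, y_pred, input_length, label_length))
        return y_pred

# Wiring: the layer consumes both the label input and the softmax output, so the loss is part
# of the training graph and compile() takes no loss argument.
labels = keras.Input(name="label", shape=(None,), dtype="float32")
images = keras.Input(name="image", shape=(32, 128, 1))
x = keras.layers.Reshape((32, 128))(images)           # stand-in for a real feature extractor
x = keras.layers.Dense(64, activation="softmax")(x)   # (time_steps=32, num_classes=64)
output = CTCLayer()(labels, x)
model = keras.Model(inputs=[images, labels], outputs=output)
model.compile(optimizer="adam")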
def line_lstm_ctc(input_shape, output_shape, window_width=28, window_stride=14):  # pylint: disable=too-many-locals
    image_height, image_width = input_shape
    output_length, num_classes = output_shape

    num_windows = int((image_width - window_width) / window_stride) + 1
    if num_windows < output_length:
        raise ValueError(
            f'Window width/stride need to generate >= {output_length} windows (currently {num_windows})'
        )

    image_input = Input(shape=input_shape, name='image')
    y_true = Input(shape=(output_length,), name='y_true')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')

    gpu_present = len(device_lib.list_local_devices()) > 2
    lstm_fn = CuDNNLSTM if gpu_present else LSTM

    # Your code should use slide_window and extract image patches from image_input.
    # Pass a convolutional model over each image patch to generate a feature vector per window.
    # Pass these features through one or more LSTM layers.
    # Convert the lstm outputs to softmax outputs.
    # Note that lstms expect an input of shape (num_batch_size, num_timesteps, feature_length).

    # Your code below (Lab 3)
    # (The code added here must define softmax_output, used below.)

    # Your code above (Lab 3)

    input_length_processed = Lambda(
        lambda x, num_windows=None: x * num_windows,
        arguments={'num_windows': num_windows})(input_length)

    ctc_loss_output = Lambda(
        lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
        name='ctc_loss')(
            [y_true, softmax_output, input_length_processed, label_length])

    ctc_decoded_output = Lambda(
        lambda x: ctc_decode(x[0], x[1], output_length),
        name='ctc_decoded')([softmax_output, input_length_processed])

    model = KerasModel(
        inputs=[image_input, y_true, input_length, label_length],
        outputs=[ctc_loss_output, ctc_decoded_output])
    return model
def ctc_criterion_backend(self, labels, output_mid, label_length, input_length):
    # This assumes blank_index = n_class - 1
    if not self.blank_index == output_mid.shape[-1] - 1:
        raise AssertionError(
            "keras.backend.ctc requires blank_index = nclass-1")

    if self.from_logits:
        output_mid = tf.nn.softmax(output_mid)

    if tf.is_tensor(input_length):
        input_length = tf.reshape(input_length, (-1, 1))
    else:
        input_length = input_length.reshape(-1, 1)

    if tf.is_tensor(label_length):
        label_length = tf.reshape(label_length, (-1, 1))
    else:
        label_length = label_length.reshape(-1, 1)

    return K.ctc_batch_cost(labels, output_mid, input_length, label_length)
def loss(y_true, y_pred):
    """Why so complicated?

    The prediction from the model is (batch, timedistdim, tot_num_uniq_chars) and the
    true target is the labels of shape (batch_size, 1), but the CTC loss needs
    additional length information of different sizes. Because Keras requires y_true
    and y_pred passed to a loss to have matching dimensions, the needed information
    is packed inside y_true so that it matches y_pred."""
    batch_labels = y_true[:, :, 0]
    label_length = y_true[:, 0, 1]
    input_length = y_true[:, 0, 2]

    # Reshape for the loss: add the extra trailing dimension it expects
    label_length = tf.expand_dims(label_length, -1)
    input_length = tf.expand_dims(input_length, -1)

    return ctc_batch_cost(batch_labels, y_pred, input_length, label_length)
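# Hedged sketch (mine, not from the source): one way to build the packed y_true that the loss
# above unpacks. Per sample, channel 0 holds the 0-padded label indices, channel 1 the label
# length and channel 2 the input (time-step) length, repeated along the label axis so the
# packed array lines up with what y_true[:, :, 0] / y_true[:, 0, 1] / y_true[:, 0, 2] expect.
import numpy as np

def pack_ctc_targets(labels_padded, label_lengths, input_lengths):
    # labels_padded: (batch, max_label_len); label_lengths, input_lengths: (batch,)
    batch, max_label_len = labels_padded.shape
    packed = np.zeros((batch, max_label_len, 3), dtype=np.float32)
    packed[:, :, 0] = labels_padded
    packed[:, :, 1] = label_lengths[:, None]   # broadcast each length across the label axis
    packed[:, :, 2] = input_lengths[:, None]
    return packed

y_true_packed = pack_ctc_targets(
    np.array([[3, 1, 2, 0], [1, 1, 0, 0]]),   # 0-padded labels
    np.array([3, 2]),                          # label lengths
    np.array([50, 50]),                        # time steps produced by the model
)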
def train_one_step(model, optimizer, x, y_true, input_len, label_len, y_strings):
    input_len = np.expand_dims(input_len, axis=1)
    label_len = np.expand_dims(label_len, axis=1)

    with tf.GradientTape() as tape:
        y_pred = model(x)
        loss = ctc_batch_cost(y_true, y_pred, input_len, label_len)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    input_len = np.squeeze(input_len)
    y_decode = ctc_decode(y_pred, input_len)[0][0]

    accuracy = 0.0
    for i in range(len(y_strings)):
        predicted_sentence = indices_to_string(y_decode[i].numpy())
        accuracy += wer(predicted_sentence, y_strings[i])

    return tf.reduce_mean(loss), accuracy / len(y_strings)
def _ctc_lambda_func(args):
    '''
    Set up the CTC loss as a function.

    y_pred: The logits output from the model. Shape [batch_sz, time_steps, number_characters]
    labels: The tokenized transcription. Shape [batch_sz, label_length]
    label_length: The length of the transcription. Shape [batch_sz, 1]
    '''
    y_pred, labels, label_length = args

    def _get_length(tensor):
        '''
        Returns the length of a tensor.
        Reference: "Automatic-Speech-Recognition"
        (https://github.com/rolczynski/Automatic-Speech-Recognition/blob/master/automatic_speech_recognition/pipeline/ctc_pipeline.py)
        '''
        lengths = tf.math.reduce_sum(tf.ones_like(tensor), 1)
        lengths = tf.expand_dims(lengths, -1)
        return tf.cast(lengths, tf.int32)

    # Extracts the number of time steps for the batch
    input_length = _get_length(tf.math.reduce_max(y_pred, 2))

    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
# lstm_output = Bidirectional(lstm_fn(256, return_sequences=True))(lstm_output)
# lstm_output = Dropout(0.5)(lstm_output)
lstm_output = BatchNormalization()(lstm_output)
lstm_output = Conv1D(256, 3, activation='relu', padding='SAME')(lstm_output)
lstm_output = Dropout(0.5)(lstm_output)

softmax_output = Dense(num_classes, activation='softmax', name='softmax_output')(lstm_output)  # (num_windows, num_classes)
##### Your code above (Lab 3)

input_length_processed = Lambda(
    lambda x, num_windows=None: x * num_windows,
    arguments={'num_windows': num_windows}
)(input_length)

ctc_loss_output = Lambda(
    lambda x: K.ctc_batch_cost(x[0], x[1], x[2], x[3]),
    name='ctc_loss'
)([y_true, softmax_output, input_length_processed, label_length])

ctc_decoded_output = Lambda(
    lambda x: ctc_decode(x[0], x[1], output_length),
    name='ctc_decoded'
)([softmax_output, input_length_processed])

model = KerasModel(
    inputs=[image_input, y_true, input_length, label_length],
    outputs=[ctc_loss_output, ctc_decoded_output]
)
return model
def __ctc_lambda_func(self, args):
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)