def _train_step(self, batch):
    train_loss = 0.
    # Index 0 is the main task; the remaining entries are loss-weighted
    # regularization tasks.
    names = ['main_loss'] + ['reg_loss_%d' % i for i in range(len(batch))]
    with tf.GradientTape() as tape:
        for i, (alpha, r_batch) in enumerate(batch):
            features, input_length, labels, label_length, prediction, prediction_length = r_batch
            logits = self.model(
                [features, input_length, prediction, prediction_length],
                training=True)
            per_train_loss = rnnt_loss(
                logits=logits, labels=labels, label_length=label_length,
                logit_length=get_reduced_length(input_length, self.model.time_reduction_factor),
                blank=self.text_featurizer.blank)
            self.train_metrics[names[i]].update_state(per_train_loss)
            per_train_loss *= alpha
            train_loss += per_train_loss
        self.train_metrics['transducer_loss'].update_state(train_loss)
        # Average the per-example losses across the (possibly distributed) batch.
        train_loss = tf.nn.compute_average_loss(
            train_loss, global_batch_size=self.global_batch_size)
    gradients = tape.gradient(train_loss, self.model.trainable_variables)
    self.optimizer.apply_gradients(
        zip(gradients, self.model.trainable_variables))
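# A minimal sketch (an assumption, not taken from the original pipeline) of the
# batch structure _train_step above expects: a list of (alpha, sub_batch)
# pairs, where entry 0 is the main task and later entries are loss-weighted
# regularization tasks.
#
# batch = [
#     (1.0, (features, input_length, labels, label_length,
#            prediction, prediction_length)),        # -> 'main_loss'
#     (0.5, (aug_features, input_length, labels, label_length,
#            prediction, prediction_length)),        # -> 'reg_loss_0'
# ]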
def create_padding_mask(features, input_length, time_reduction_factor):
    """Create a mask with 1.0 for non-padding and 0.0 for padding positions.

    Args:
        features (tf.Tensor): audio features with shape [B, T, F, C]
        input_length (tf.Tensor): audio feature lengths with shape [B]
        time_reduction_factor (int): factor by which the encoder reduces T

    Returns:
        tf.Tensor: mask with shape [B, T_query, T_key]
    """
    _, padded_time, _, _ = shape_list(features)
    reduced_padded_time = get_reduced_length(padded_time, time_reduction_factor)

    def create_mask(length):
        reduced_length = get_reduced_length(length, time_reduction_factor)
        # Ones over the valid reduced frames, zero-padded out to the full
        # reduced (padded) time on both the query and key axes.
        mask = tf.ones([reduced_length, reduced_length], dtype=tf.float32)
        return tf.pad(
            mask,
            [[0, reduced_padded_time - reduced_length],
             [0, reduced_padded_time - reduced_length]],
            mode="CONSTANT", constant_values=0.0)

    return tf.map_fn(
        create_mask, input_length,
        fn_output_signature=tf.TensorSpec([None, None], dtype=tf.float32))
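# Usage sketch for create_padding_mask (shapes are illustrative assumptions,
# and get_reduced_length is assumed to do ceil-division by the factor):
#
#   features = tf.zeros([2, 160, 80, 1])                  # [B, T, F, C]
#   input_length = tf.constant([160, 120], dtype=tf.int32)
#   mask = create_padding_mask(features, input_length, time_reduction_factor=4)
#   # mask: [2, 40, 40]; 1.0 over valid reduced frames, 0.0 over padding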
def _train_step(self, batch):
    _, features, input_length, labels, label_length, pred_inp = batch
    # The attention mask has no trainable dependencies, so it can be built
    # outside the gradient tape.
    mask = create_padding_mask(features, input_length, self.model.time_reduction_factor)
    with tf.GradientTape() as tape:
        logits = self.model(
            [features, input_length, pred_inp, label_length + 1],
            training=True, mask=mask)
        per_train_loss = rnnt_loss(
            logits=logits, labels=labels, label_length=label_length,
            logit_length=get_reduced_length(input_length, self.model.time_reduction_factor),
            blank=self.text_featurizer.blank)
        train_loss = tf.nn.compute_average_loss(
            per_train_loss, global_batch_size=self.global_batch_size)
    gradients = tape.gradient(train_loss, self.model.trainable_variables)
    # Accumulate instead of applying directly; the optimizer update happens
    # after several accumulation steps.
    self.accumulation.accumulate(gradients)
    self.train_metrics["transducer_loss"].update_state(per_train_loss)
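# `self.accumulation` is defined elsewhere in the trainer. Below is a minimal
# sketch of such an accumulator (assumed interface: accumulate / gradients /
# reset; not the repo's actual implementation), for readers of the step above:
import tensorflow as tf


class GradientAccumulator:
    """Sums gradients over several steps before one optimizer update."""

    def __init__(self, trainable_variables):
        self._accums = [tf.Variable(tf.zeros_like(v), trainable=False)
                        for v in trainable_variables]

    def accumulate(self, gradients):
        # Add this step's gradients into the running sums.
        for accum, grad in zip(self._accums, gradients):
            if grad is not None:
                accum.assign_add(grad)

    @property
    def gradients(self):
        return [accum.value() for accum in self._accums]

    def reset(self):
        # Zero the sums after the optimizer has consumed them.
        for accum in self._accums:
            accum.assign(tf.zeros_like(accum))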
def _eval_step(self, batch):
    eval_loss = 0.
    # Mirrors _train_step's multi-task loss, but without gradient computation.
    names = ['main_loss'] + ['reg_loss_%d' % i for i in range(len(batch))]
    for i, (alpha, r_batch) in enumerate(batch):
        features, input_length, labels, label_length, prediction, prediction_length = r_batch
        logits = self.model(
            [features, input_length, prediction, prediction_length],
            training=False)
        per_eval_loss = rnnt_loss(
            logits=logits, labels=labels, label_length=label_length,
            logit_length=get_reduced_length(input_length, self.model.time_reduction_factor),
            blank=self.text_featurizer.blank)
        self.eval_metrics[names[i]].update_state(per_eval_loss)
        per_eval_loss *= alpha
        eval_loss += per_eval_loss
    self.eval_metrics["transducer_loss"].update_state(eval_loss)
import numpy as np

conformer.load_weights(args.saved, by_name=True)
conformer.summary(line_length=120)
conformer.add_featurizers(speech_featurizer, text_featurizer)

np.random.seed(0)
tf.random.set_seed(0)

if args.filename.endswith('.wav'):
    signal = read_raw_audio(args.filename)
    # features = speech_featurizer.tf_extract(signal)
    features = tf.constant(speech_featurizer.extract(signal))
else:
    # Pre-extracted features stored as a numpy array, reshaped to [T, 80, 1].
    features = tf.constant(np.load(args.filename).reshape([-1, 80, 1]))

input_length = get_reduced_length(tf.shape(features)[0], conformer.time_reduction_factor)

if args.beam_width:
    transcript = conformer.recognize_beam(features[None, ...], input_length[None, ...])
    print("Transcript:", transcript[0].numpy().decode("UTF-8"))
elif args.timestamp:
    # This path decodes from the raw waveform, so it requires .wav input
    # (`signal` is only defined in that branch above).
    transcript, stime, etime, _, _ = conformer.recognize_tflite_with_timestamp(
        signal, tf.constant(text_featurizer.blank, dtype=tf.int32),
        conformer.predict_net.get_initial_state())
    print("Transcript:", transcript)
    print("Start time:", stime)
    print("End time:", etime)
else:
    if args.filename.endswith('.wav'):
        transcript, _, _ = conformer.recognize_tflite(
            signal, tf.constant(text_featurizer.blank, dtype=tf.int32),
            conformer.predict_net.get_initial_state())
        print("Transcript:", tf.strings.unicode_encode(transcript, "UTF-8").numpy().decode("UTF-8"))
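# Hypothetical invocation (the flag names assume an argparse setup that is not
# shown in this snippet):
#
#   python test_conformer.py --saved conformer.h5 --filename sample.wav --beam_width 5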