    def _train_step(self, batch):
        train_loss = 0.
        names = ['main_loss'] + ['reg_loss_%d' % i for i in range(len(batch))]
        with tf.GradientTape() as tape:
            for i, (alpha, r_batch) in enumerate(batch):
                features, input_length, labels, label_length, prediction, prediction_length = r_batch
                logits = self.model(
                    [features, input_length, prediction, prediction_length],
                    training=True)
                per_train_loss = rnnt_loss(logits=logits,
                                           labels=labels,
                                           label_length=label_length,
                                           logit_length=get_reduced_length(
                                               input_length,
                                               self.model.time_reduction_factor),
                                           blank=self.text_featurizer.blank)
                self.train_metrics[names[i]].update_state(per_train_loss)
                per_train_loss *= alpha  # weight each reader's loss by its alpha
                train_loss += per_train_loss

            self.train_metrics['transducer_loss'].update_state(train_loss)
            # Scale by the global batch size for correct gradients under tf.distribute.
            train_loss = tf.nn.compute_average_loss(
                train_loss, global_batch_size=self.global_batch_size)

        gradients = tape.gradient(train_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))
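
Every example here leans on get_reduced_length, which maps a raw frame count to the encoder's downsampled length. A minimal sketch, assuming the usual ceiling division by the time reduction factor (the real helper may differ in detail):

import tensorflow as tf

def get_reduced_length(length, reduction_factor):
    # Ceiling division: a partially filled reduction window still yields one frame.
    return tf.cast(tf.math.ceil(length / reduction_factor), tf.int32)
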
Example #2
def create_padding_mask(features, input_length, time_reduction_factor):
    """
    Create masking with 0 for paddings and 1 for non-paddings
    Args:
        features ([tf.Tensor]): audio features with shape [B, T, F, C]
        input_length ([tf.Tensor]): audio features length with shape [B]
        time_reduction_factor ([int])

    Returns:
        [tf.Tensor]: with shape [B, Tquery, Tkey]
    """
    batch_size, padded_time, _, _ = shape_list(features)
    reduced_padded_time = get_reduced_length(padded_time,
                                             time_reduction_factor)

    def create_mask(length):
        reduced_length = get_reduced_length(length, time_reduction_factor)
        mask = tf.ones([reduced_length, reduced_length], dtype=tf.float32)
        return tf.pad(mask, [[0, reduced_padded_time - reduced_length],
                             [0, reduced_padded_time - reduced_length]],
                      mode="CONSTANT",
                      constant_values=0.0)

    return tf.map_fn(create_mask,
                     input_length,
                     fn_output_signature=tf.TensorSpec([None, None],
                                                       dtype=tf.float32))
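
A quick usage sketch for create_padding_mask. shape_list is an assumed helper (static dimensions where known, dynamic otherwise), not shown in the source:

import tensorflow as tf

def shape_list(x):
    # Assumed helper: prefer static dimensions, fall back to dynamic ones.
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]

features = tf.zeros([2, 8, 80, 1])                  # [B, T, F, C]
input_length = tf.constant([8, 4], dtype=tf.int32)  # true lengths before padding
mask = create_padding_mask(features, input_length, time_reduction_factor=4)
print(mask.shape)  # (2, 2, 2): ones over valid frames, zeros elsewhere
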
Example #4
    def _train_step(self, batch):
        _, features, input_length, labels, label_length, pred_inp = batch

        mask = create_padding_mask(features, input_length,
                                   self.model.time_reduction_factor)

        with tf.GradientTape() as tape:
            # pred_inp is the label sequence with blank prepended, hence length + 1
            logits = self.model(
                [features, input_length, pred_inp, label_length + 1],
                training=True,
                mask=mask)
            per_train_loss = rnnt_loss(logits=logits,
                                       labels=labels,
                                       label_length=label_length,
                                       logit_length=get_reduced_length(
                                           input_length,
                                           self.model.time_reduction_factor),
                                       blank=self.text_featurizer.blank)
            train_loss = tf.nn.compute_average_loss(
                per_train_loss, global_batch_size=self.global_batch_size)

        gradients = tape.gradient(train_loss, self.model.trainable_variables)
        self.accumulation.accumulate(gradients)
        self.train_metrics["transducer_loss"].update_state(per_train_loss)
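
Example #4 hands its gradients to an accumulation object instead of applying them immediately. The source doesn't show that class, but a minimal sketch of the interface used above could look like the following; the name and the averaging choice are assumptions:

class GradientAccumulator:
    """Hypothetical accumulator matching the accumulate() call above."""

    def __init__(self, trainable_variables):
        self._sums = [tf.Variable(tf.zeros_like(v), trainable=False)
                      for v in trainable_variables]
        self._steps = tf.Variable(0, trainable=False, dtype=tf.int64)

    def accumulate(self, gradients):
        # Add this micro-batch's gradients to the running sums.
        for acc, grad in zip(self._sums, gradients):
            if grad is not None:
                acc.assign_add(grad)
        self._steps.assign_add(1)

    @property
    def gradients(self):
        # Average over the accumulated micro-batches.
        return [acc / tf.cast(self._steps, acc.dtype) for acc in self._sums]

    def reset(self):
        for acc in self._sums:
            acc.assign(tf.zeros_like(acc))
        self._steps.assign(0)

Every N steps the trainer would call optimizer.apply_gradients(zip(self.accumulation.gradients, self.model.trainable_variables)) and then reset(), emulating an N-times-larger batch.
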
Example #5
    def _eval_step(self, batch):
        eval_loss = 0.
        names = ['main_loss'] + ['reg_loss_%d' % i for i in range(len(batch))]
        for i, (alpha, r_batch) in enumerate(batch):
            features, input_length, labels, label_length, prediction, prediction_length = r_batch

            logits = self.model(
                [features, input_length, prediction, prediction_length],
                training=False)
            per_eval_loss = rnnt_loss(logits=logits,
                                      labels=labels,
                                      label_length=label_length,
                                      logit_length=get_reduced_length(
                                          input_length,
                                          self.model.time_reduction_factor),
                                      blank=self.text_featurizer.blank)
            self.eval_metrics[names[i]].update_state(per_eval_loss)
            per_eval_loss *= alpha
            eval_loss += per_eval_loss

        self.eval_metrics["transducer_loss"].update_state(eval_loss)
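
Both step methods scale their loss with tf.nn.compute_average_loss, which assumes the step runs once per replica under a tf.distribute strategy. A minimal sketch of such a driver (the strategy attribute is an assumption, not shown in the source):

    @tf.function
    def _distributed_train_step(self, batch):
        # Run the per-replica step on every device; the strategy aggregates
        # gradients inside optimizer.apply_gradients.
        self.strategy.run(self._train_step, args=(batch,))
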
Example #6
conformer.load_weights(args.saved, by_name=True)
conformer.summary(line_length=120)
conformer.add_featurizers(speech_featurizer, text_featurizer)

import numpy as np
np.random.seed(0)
tf.random.set_seed(0)
if args.filename.endswith('.wav'):
  signal = read_raw_audio(args.filename)
  # features = speech_featurizer.tf_extract(signal)
  features = speech_featurizer.extract(signal)
  features = tf.constant(features)
else:
  features = np.load(args.filename).reshape([-1, 80, 1])
  features = tf.constant(features)
input_length = get_reduced_length(tf.shape(features)[0], conformer.time_reduction_factor)

if args.beam_width:
  transcript = conformer.recognize_beam(features[None, ...], input_length[None, ...])
  print("Transcript:", transcript[0].numpy().decode("UTF-8"))
elif args.timestamp:
  # NOTE: this path decodes the raw signal, so it assumes a .wav input
  # (signal is only defined in the .wav branch above)
  transcript, stime, etime, _, _ = conformer.recognize_tflite_with_timestamp(
    signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
  print("Transcript:", transcript)
  print("Start time:", stime)
  print("End time:", etime)
else:
  if args.filename.endswith('.wav'):
    transcript, _, _ = conformer.recognize_tflite(
      signal, tf.constant(text_featurizer.blank, dtype=tf.int32), conformer.predict_net.get_initial_state())
    print("Transcript:", tf.strings.unicode_encode(transcript, "UTF-8").numpy().decode("UTF-8"))
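
The script above depends on read_raw_audio to load the waveform. A minimal sketch using the soundfile package; the real helper's resampling behavior is not shown, so this version only checks the rate:

import numpy as np
import soundfile as sf

def read_raw_audio(path, sample_rate=16000):
    # Assumed helper: return a mono float32 waveform at the expected rate.
    signal, sr = sf.read(path, dtype="float32")
    if signal.ndim > 1:
        signal = signal.mean(axis=1)  # downmix to mono
    assert sr == sample_rate, f"expected {sample_rate} Hz, got {sr} Hz"
    return signal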