Example #1
    def __call__(self, audio_batch: List[Union[str, BytesIO]]) -> List[str]:
        """Transcripts audio batch to text.

        Args:
            audio_batch: Batch to be transcribed. Elements could be either paths to audio files or Binary I/O objects.

        Returns:
            text_batch: Batch of transcripts.

        """
        data_layer = AudioInferDataLayer(
            audio_batch=audio_batch,
            **self.nemo_params['AudioToTextDataLayer'])
        audio_signal, audio_signal_len = data_layer()
        processed_signal, processed_signal_len = self.data_preprocessor(
            input_signal=audio_signal, length=audio_signal_len)
        encoded, encoded_len = self.jasper_encoder(
            audio_signal=processed_signal, length=processed_signal_len)
        log_probs = self.jasper_decoder(encoder_output=encoded)
        predictions = self.greedy_decoder(log_probs=log_probs)
        eval_tensors = [predictions]
        tensors = self.neural_factory.infer(tensors=eval_tensors)
        text_batch = post_process_predictions(tensors[0], self.labels)

        return text_batch
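A minimal usage sketch for the method above. The `Transcriber` wrapper name and its constructor argument are assumptions for illustration, not part of the example:

from io import BytesIO

# Hypothetical wrapper class exposing the __call__ defined above.
transcriber = Transcriber(nemo_params_path='jasper_config.yaml')

# Elements may be file paths or in-memory binary objects.
with open('utt2.wav', 'rb') as f:
    in_memory = BytesIO(f.read())

for text in transcriber(['utt1.wav', in_memory]):
    print(text)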
Example #2
    def wav_to_text(self, manifest, greedy=True):

        data_layer = nemo_asr.AudioToTextDataLayer(shuffle=False,
                                                   manifest_filepath=manifest,
                                                   labels=self.labels,
                                                   batch_size=1)
        audio_signal, audio_signal_len, transcript, transcript_len = data_layer(
        )
        log_probs, encoded_len = self.asr_model(input_signal=audio_signal,
                                                length=audio_signal_len)
        predictions = self.greedy_decoder(log_probs=log_probs)
        eval_tensors = [predictions]

        if self.ENABLE_NGRAM:
            print('Running with beam search')
            beam_predictions = self.beam_search_with_lm(
                log_probs=log_probs, log_probs_length=encoded_len)
            eval_tensors.append(beam_predictions)

        tensors = self.neural_factory.infer(tensors=eval_tensors)
        if greedy:
            prediction = post_process_predictions(tensors[0], self.labels)
        else:
            # Beam-search results are the last inferred tensor; take the
            # top hypothesis of the first (and only) sample in the batch.
            prediction = tensors[-1][0][0][0][1]
        del data_layer
        del eval_tensors
        if self.ENABLE_NGRAM:
            del beam_predictions
        del predictions
        del tensors
        del audio_signal, audio_signal_len, transcript, transcript_len
        del log_probs, encoded_len
        return prediction
Example #3
def wav_to_text(manifest, greedy=True):
    from ruamel.yaml import YAML

    yaml = YAML(typ="safe")
    with open(MODEL_YAML) as f:
        jasper_model_definition = yaml.load(f)
    labels = jasper_model_definition['labels']

    # Instantiate necessary neural modules
    data_layer = nemo_asr.AudioToTextDataLayer(shuffle=False,
                                               manifest_filepath=manifest,
                                               labels=labels,
                                               batch_size=1)

    # Define inference DAG
    audio_signal, audio_signal_len, _, _ = data_layer()
    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=processed_signal_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)

    # Fall back to greedy decoding when n-gram beam search is disabled,
    # so eval_tensors is always defined.
    if greedy or not ENABLE_NGRAM:
        greedy = True
        eval_tensors = [predictions]
    else:
        logging.info('Running with beam search')
        beam_predictions = beam_search_with_lm(log_probs=log_probs,
                                               log_probs_length=encoded_len)
        eval_tensors = [beam_predictions]

    tensors = neural_factory.infer(tensors=eval_tensors)
    if greedy:
        from nemo.collections.asr.helpers import post_process_predictions

        prediction = post_process_predictions(tensors[0], labels)
    else:
        prediction = tensors[0][0][0][0][1]
    return prediction
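Both `wav_to_text` variants above take the path to a NeMo manifest: a JSON-lines file with one utterance per line, using the same fields Example #9 builds programmatically (`audio_filepath`, `duration`, `text`). A sketch of writing one; the paths and durations are placeholders:

import json

entries = [
    {"audio_filepath": "/data/utt1.wav", "duration": 3.2, "text": "hello world"},
    # 'text' can be a dummy value when only running inference.
    {"audio_filepath": "/data/utt2.wav", "duration": 1.7, "text": "todo"},
]
with open("manifest.json", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")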
Example #4
def main():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description='AN4 ASR',
        conflict_handler='resolve',
    )

    # Overwrite default args
    parser.add_argument("--train_dataset",
                        type=str,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        help="validation dataset path")

    # Create new args
    # parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str)
    parser.add_argument("--batch_size",
                        default=48,
                        type=int,
                        help="size of the training batch")
    parser.add_argument("--lm", default=None, type=str)
    parser.add_argument("--test_after_training", action='store_true')
    parser.add_argument("--momentum", type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.add_argument("--do_not_eval_at_start", action='store_true')
    parser.set_defaults(
        model_config="./configs/jasper_an4.yaml",
        train_dataset="~/TestData/an4_dataset/an4_train.json",
        eval_datasets="~/TestData/an4_dataset/an4_val.json",
        work_dir="./tmp",
        optimizer="novograd",
        num_epochs=50,
        lr=0.02,
        weight_decay=0.005,
        checkpoint_save_freq=1000,
        eval_freq=100,
        amp_opt_level="O1",
    )

    args = parser.parse_args()
    betas = (args.beta1, args.beta2)

    wer_thr = 0.20
    beam_wer_thr = 0.15

    nf = nemo.core.NeuralModuleFactory(
        local_rank=args.local_rank,
        files_to_copy=[__file__],
        optimization_level=args.amp_opt_level,
        random_seed=0,
        log_dir=args.work_dir,
        create_tb_writer=True,
        cudnn_benchmark=args.cudnn_benchmark,
    )
    tb_writer = nf.tb_writer
    checkpoint_dir = nf.checkpoint_dir

    # Load model definition
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    # Get vocabulary.
    vocab = jasper_params['labels']

    (
        loss,
        eval_tensors,
        callbacks,
        total_steps,
        log_probs_e,
        encoded_len_e,
    ) = create_dags(args.model_config, vocab, args, nf)

    nf.train(
        tensors_to_optimize=[loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        lr_policy=CosineAnnealing(total_steps=total_steps,
                                  min_lr=args.lr / 100),
        optimization_params={
            "num_epochs": args.num_epochs,
            "max_steps": args.max_steps,
            "lr": args.lr,
            "momentum": args.momentum,
            "betas": betas,
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None,
        },
        batches_per_step=args.iter_per_step,
        amp_max_loss_scale=256.0,
        # synced_batchnorm=(nf.global_rank is not None),
    )

    if args.test_after_training:
        logging.info("Testing greedy and beam search with LM WER.")
        # Create BeamSearch NM
        if nf.world_size > 1 or args.lm is None:
            logging.warning(
                "Skipping beam search WER: it requires an LM path and does not work with distributed training."
            )
        else:
            beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                vocab=vocab,
                beam_width=64,
                alpha=2.0,
                beta=1.5,
                lm_path=args.lm,
                num_cpus=os.cpu_count() or 1,  # cpu_count() may return None
            )
            beam_predictions = beam_search_with_lm(
                log_probs=log_probs_e, log_probs_length=encoded_len_e)
            eval_tensors.append(beam_predictions)

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3], vocab)
            wer = word_error_rate(hypotheses=greedy_hypotheses,
                                  references=references)
            logging.info("Greedy WER: {:.2f}%".format(wer * 100))
            if wer > wer_thr:
                nf.sync_all_processes(False)
                raise ValueError(f"Final eval greedy WER {wer * 100:.2f}% > :"
                                 f"than {wer_thr * 100:.2f}%")
        nf.sync_all_processes()

        if nf.world_size == 1 and args.lm is not None:
            beam_hypotheses = []
            # Over mini-batch
            for i in evaluated_tensors[-1]:
                # Over samples
                for j in i:
                    beam_hypotheses.append(j[0][1])

            beam_wer = word_error_rate(hypotheses=beam_hypotheses,
                                       references=references)
            logging.info("Beam WER {:.2f}%".format(beam_wer * 100))
            assert beam_wer <= beam_wer_thr, "Final eval beam WER {:.2f}% is higher than {:.2f}%".format(
                beam_wer * 100, beam_wer_thr * 100)
            assert beam_wer <= wer, "Final eval beam WER is higher than the greedy WER."

        # Reload model weights and train for extra 10 epochs
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=checkpoint_dir,
            step_freq=args.checkpoint_save_freq,
            force_load=True,
        )

        # Distributed Data Parallel changes the underlying class so we need
        # to reinstantiate Encoder and Decoder
        args.num_epochs += 10
        previous_step_count = total_steps
        loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(
            args.model_config, vocab, args, nf)

        nf.reset_trainer()
        nf.train(
            tensors_to_optimize=[loss],
            callbacks=callbacks,
            optimizer=args.optimizer,
            lr_policy=CosineAnnealing(warmup_steps=previous_step_count,
                                      total_steps=total_steps),
            optimization_params={
                "num_epochs": args.num_epochs,
                "lr": args.lr / 100,
                "momentum": args.momentum,
                "betas": betas,
                "weight_decay": args.weight_decay,
                "grad_norm_clip": None,
            },
            reset=True,
            amp_max_loss_scale=256.0,
            # synced_batchnorm=(nf.global_rank is not None),
        )

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3], vocab)
            wer_new = word_error_rate(hypotheses=greedy_hypotheses,
                                      references=references)
            logging.info("New greedy WER: {:.2f}%".format(wer_new * 100))
            if wer_new > wer * 1.1:
                nf.sync_all_processes(False)
                raise ValueError(
                    f"Fine tuning: new WER {wer_new * 100:.2f}% > than the "
                    f"previous WER {wer * 100:.2f}%")
        nf.sync_all_processes()

        # Open the log file and ensure that epochs is strictly increasing
        if nf._exp_manager.log_file:
            epochs = []
            with open(nf._exp_manager.log_file, "r") as log_file:
                line = log_file.readline()
                while line:
                    index = line.find("Starting epoch")
                    if index != -1:
                        epochs.append(int(line[index +
                                               len("Starting epoch"):]))
                    line = log_file.readline()
            for i, e in enumerate(epochs):
                if i != e:
                    raise ValueError("Epochs from logfile was not understood")
Example #5
def main():
    parser = argparse.ArgumentParser(description='Jasper')
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    parser.add_argument("--vocab_file", type=str, required=True)
    parser.add_argument("--save_logprob", default=None, type=str)
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument("--beam_width", default=50, type=int)
    parser.add_argument("--alpha", default=2.0, type=float)
    parser.add_argument("--beta", default=1.0, type=float)
    parser.add_argument("--cutoff_prob", default=0.99, type=float)
    parser.add_argument("--cutoff_top_n", default=40, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=nemo.core.Optimization.mxprO1,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    vocab = load_vocab(args.vocab_file)

    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    n = len(data_layer)
    logging.info('Evaluating {0} examples'.format(n))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    if args.lm_path:
        beam_width = args.beam_width
        alpha = args.alpha
        beta = args.beta
        cutoff_prob = args.cutoff_prob
        cutoff_top_n = args.cutoff_top_n
        beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=vocab,
            beam_width=beam_width,
            alpha=alpha,
            beta=beta,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n,
            lm_path=args.lm_path,
            num_cpus=os.cpu_count() or 1,  # cpu_count() may return None
        )

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    (
        audio_signal_e1,
        a_sig_length_e1,
        transcript_e1,
        transcript_len_e1,
    ) = data_layer()
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1,
        predictions_e1,
        transcript_e1,
        transcript_len_e1,
        encoded_len_e1,
    ]

    if args.lm_path:
        beam_predictions_e1 = beam_search_with_lm(
            log_probs=log_probs_e1, log_probs_length=encoded_len_e1)
        eval_tensors.append(beam_predictions_e1)

    evaluated_tensors = neural_factory.infer(
        tensors=eval_tensors,
        checkpoint_dir=load_dir,
    )

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)
    cer = word_error_rate(hypotheses=greedy_hypotheses,
                          references=references,
                          use_cer=True)
    logging.info("Greedy CER {:.2f}%".format(cer * 100))

    if args.lm_path:
        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples
            for j in i:
                beam_hypotheses.append(j[0][1])

        cer = word_error_rate(hypotheses=beam_hypotheses,
                              references=references,
                              use_cer=True)
        logging.info("Beam CER {:.2f}".format(cer * 100))

    if args.save_logprob:
        # Convert logits to list of numpy arrays
        logprob = []
        for i, batch in enumerate(evaluated_tensors[0]):
            for j in range(batch.shape[0]):
                logprob.append(
                    batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)
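The `--save_logprob` dump above can be reloaded later to tune a decoder offline without re-running the acoustic model. A sketch, assuming the pickle layout written by this script; the 'logprobs.pkl' path is a placeholder:

import pickle

import numpy as np

# List of per-utterance [time, vocab] log-probability arrays.
with open('logprobs.pkl', 'rb') as f:
    logprob = pickle.load(f)

# BeamSearchDecoderWithLM called with input_tensor=False expects
# probabilities, so exponentiate first (Example #7 does exactly this).
probs = [np.exp(p) for p in logprob]
print(f'{len(probs)} utterances; first shape: {probs[0].shape}')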
Example #6
def main():
    # Usage and Command line arguments
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5-En",
        required=True,
        help="Pass: 'QuartzNet15x5-En', 'QuartzNet15x5-Zh', or 'JasperNet10x5-En'",
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=1,
                        help="batch size to use for evaluation")
    parser.add_argument("--wer_target",
                        type=float,
                        default=None,
                        help="used by test")
    parser.add_argument("--wer_tolerance",
                        type=float,
                        default=1.0,
                        help="used by test")
    parser.add_argument("--trim_silence",
                        default=True,
                        type=bool,
                        help="trim audio from silence or not")
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    args = parser.parse_args()

    # Setup NeuralModuleFactory to control training
    # instantiate Neural Factory with supported backend
    nf = nemo.core.NeuralModuleFactory()

    # Instantiate the model which we'll train
    logging.info(f"Speech2Text: Will fine-tune from {args.asr_model}")
    asr_model = nemo_asr.models.ASRConvCTCModel.from_pretrained(
        model_info=args.asr_model)
    asr_model.eval()

    logging.info("\n\n")
    logging.info(f"Evaluation using {type(asr_model)} model.")
    logging.info(f"Evaluation using alphabet {asr_model.vocabulary}.")
    logging.info(f"The model has {asr_model.num_weights} weights.\n\n")

    eval_data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.dataset,
        labels=asr_model.vocabulary,
        batch_size=args.eval_batch_size,
        trim_silence=args.trim_silence,
        shuffle=False,
        normalize_transcripts=args.normalize_text,
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    audio_signal, audio_signal_len, transcript, transcript_len = eval_data_layer(
    )
    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    predictions = greedy_decoder(log_probs=log_probs)

    # inference
    eval_tensors = [
        log_probs, predictions, transcript, transcript_len, encoded_len
    ]
    evaluated_tensors = nf.infer(tensors=eval_tensors)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                 asr_model.vocabulary)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3],
                                          asr_model.vocabulary)

    if args.asr_model.strip().endswith('-Zh'):
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=True)
        metric = 'CER'
    else:
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=False)
        metric = 'WER'
    logging.info(f"Greedy {metric} = {val}")
    if args.wer_target is not None:
        if args.wer_target * args.wer_tolerance < val:
            raise ValueError(
                f"Resulting {metric} {val} is higher than the target {args.wer_target}"
            )
Example #7
def main():
    parser = argparse.ArgumentParser(description='Jasper')
    # model params
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    # run params
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=64, type=int)
    parser.add_argument("--amp_opt_level", default="O1", type=str)
    # store results
    parser.add_argument("--save_logprob", default=None, type=str)

    # lm inference parameters
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument('--alpha',
                        default=2.0,
                        type=float,
                        help='value of LM weight',
                        required=False)
    parser.add_argument(
        '--alpha_max',
        type=float,
        help='maximum value of LM weight (for a grid search in \'eval\' mode)',
        required=False,
    )
    parser.add_argument('--alpha_step',
                        type=float,
                        help='step for LM weight\'s tuning in \'eval\' mode',
                        required=False,
                        default=0.1)
    parser.add_argument('--beta',
                        default=1.5,
                        type=float,
                        help='value of word count weight',
                        required=False)
    parser.add_argument(
        '--beta_max',
        type=float,
        help='maximum value of word count weight (for a grid search in '
        '\'eval\' mode)',
        required=False,
    )
    parser.add_argument(
        '--beta_step',
        type=float,
        help='step for word count weight\'s tuning in \'eval\' mode',
        required=False,
        default=0.1,
    )
    parser.add_argument("--beam_width", default=128, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    N = len(data_layer)
    logging.info('Evaluating {0} examples'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"])
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Define inference DAG
    audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1 = data_layer(
    )
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1, predictions_e1, transcript_e1, transcript_len_e1,
        encoded_len_e1
    ]

    # inference
    evaluated_tensors = neural_factory.infer(tensors=eval_tensors,
                                             checkpoint_dir=load_dir)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)

    wer = word_error_rate(hypotheses=greedy_hypotheses, references=references)
    logging.info("Greedy WER {:.2f}%".format(wer * 100))

    # Convert logits to list of numpy arrays
    logprob = []
    for i, batch in enumerate(evaluated_tensors[0]):
        for j in range(batch.shape[0]):
            logprob.append(
                batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
    if args.save_logprob:
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)

    # language model
    if args.lm_path:
        if args.alpha_max is None:
            args.alpha_max = args.alpha
        # include alpha_max in tuning range
        args.alpha_max += args.alpha_step / 10.0

        if args.beta_max is None:
            args.beta_max = args.beta
        # include beta_max in tuning range
        args.beta_max += args.beta_step / 10.0

        beam_wers = []

        logprobexp = [np.exp(p) for p in logprob]
        for alpha in np.arange(args.alpha, args.alpha_max, args.alpha_step):
            for beta in np.arange(args.beta, args.beta_max, args.beta_step):
                logging.info('================================')
                logging.info(f'Inferring with (alpha, beta): ({alpha}, {beta})')
                beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                    vocab=vocab,
                    beam_width=args.beam_width,
                    alpha=alpha,
                    beta=beta,
                    lm_path=args.lm_path,
                    num_cpus=os.cpu_count() or 1,  # cpu_count() may return None
                    input_tensor=False,
                )

                beam_predictions = beam_search_with_lm(log_probs=logprobexp,
                                                       log_probs_length=None,
                                                       force_pt=True)

                beam_predictions = [b[0][1] for b in beam_predictions[0]]
                lm_wer = word_error_rate(hypotheses=beam_predictions,
                                         references=references)
                logging.info("Beam WER {:.2f}%".format(lm_wer * 100))
                beam_wers.append(((alpha, beta), lm_wer * 100))

        logging.info('Beam WER for (alpha, beta)')
        logging.info('================================')
        logging.info('\n' + '\n'.join([str(e) for e in beam_wers]))
        logging.info('================================')
        best_beam_wer = min(beam_wers, key=lambda x: x[1])
        logging.info('Best (alpha, beta): '
                     f'{best_beam_wer[0]}, '
                     f'WER: {best_beam_wer[1]:.2f}%')
Example #8
print('Start Training!')
neural_factory.train(tensors_to_optimize=[loss],
                     callbacks=callbacks,
                     optimizer='novograd',
                     optimization_params=optimization_params)

print('Inference Only')
# We've already built the inference DAG above, so all we need is to call infer().
evaluated_tensors = neural_factory.infer(
    # These are the tensors we want to get from the model.
    tensors=[loss_test, preds_test, transcript_test, transcript_len_test],
    # checkpoint_dir specifies where the model params are loaded from.
    checkpoint_dir=(data_dir + '/an4_checkpoints'))

# Process the results to get WER
greedy_hypotheses = helpers.post_process_predictions(evaluated_tensors[1],
                                                     labels)

references = helpers.post_process_transcripts(evaluated_tensors[2],
                                              evaluated_tensors[3], labels)

wer = helpers.word_error_rate(hypotheses=greedy_hypotheses,
                              references=references)
print("*** Greedy WER: {:.2f} ***".format(wer * 100))
"""And that's it!

## Model Improvements

You already have all you need to create your own ASR model in NeMo, but there are a few more tricks that you can employ if you so desire. In this section, we'll briefly cover a few possibilities for improving an ASR model.

### Data Augmentation
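The excerpt ends here, but as a sketch of where augmentation plugs in: NeMo 0.x ships a `SpectrogramAugmentation` module that goes between the preprocessor and the encoder in the training DAG. The parameter values below are illustrative assumptions, and the `processed_signal`/`jasper_encoder` names are taken from the training DAG built earlier in this tutorial:

# Cutout-style masking on the mel spectrogram (parameters are examples).
spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5,
                                                rect_time=120,
                                                rect_freq=50)

# Training DAG: preprocessor -> augmentation -> encoder.
aug_signal = spec_augment(input_spec=processed_signal)
encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                      length=processed_signal_len)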
Example #9
    def transcribe(self, audio_data, greedy=True):
        audio_file = tempfile.NamedTemporaryFile(dir=WORK_DIR,
                                                 prefix="jasper_audio.",
                                                 delete=False)
        # audio_file.write(audio_data)
        audio_file.close()
        audio_file_path = audio_file.name
        wf = wave.open(audio_file_path, "w")
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(24000)
        wf.writeframesraw(audio_data)
        wf.close()
        manifest = {
            "audio_filepath": audio_file_path,
            "duration": 60,
            "text": "todo"
        }
        manifest_file = tempfile.NamedTemporaryFile(dir=WORK_DIR,
                                                    prefix="jasper_manifest.",
                                                    delete=False,
                                                    mode="w")
        manifest_file.write(json.dumps(manifest))
        manifest_file.close()
        manifest_file_path = manifest_file.name
        data_layer = nemo_asr.AudioToTextDataLayer(
            shuffle=False,
            manifest_filepath=manifest_file_path,
            labels=self.labels,
            batch_size=1,
        )

        # Define inference DAG
        audio_signal, audio_signal_len, _, _ = data_layer()
        processed_signal, processed_signal_len = self.data_preprocessor(
            input_signal=audio_signal, length=audio_signal_len)
        encoded, encoded_len = self.jasper_encoder(
            audio_signal=processed_signal, length=processed_signal_len)
        log_probs = self.jasper_decoder(encoder_output=encoded)
        predictions = self.greedy_decoder(log_probs=log_probs)

        if greedy:
            eval_tensors = [predictions]
        else:
            if self.beam_search_with_lm:
                logging.info("Running with beam search")
                beam_predictions = self.beam_search_with_lm(
                    log_probs=log_probs, log_probs_length=encoded_len)
                eval_tensors = [beam_predictions]
            else:
                logging.info(
                    "Language model not specified; falling back to greedy decoding."
                )
                eval_tensors = [predictions]

        tensors = self.neural_factory.infer(tensors=eval_tensors)
        prediction = post_process_predictions(tensors[0], self.labels)
        prediction_text = ". ".join(prediction)
        os.unlink(manifest_file.name)
        os.unlink(audio_file.name)
        return prediction_text
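A usage sketch for `transcribe` above: it writes the incoming bytes as mono, 16-bit, 24 kHz PCM, so the caller should pass matching raw frames. `JasperService` is a hypothetical name for the enclosing class:

import wave

service = JasperService()  # hypothetical owner of transcribe()

with wave.open('query.wav', 'rb') as wf:
    # Must match the header transcribe() writes: mono, 16-bit, 24 kHz.
    assert wf.getnchannels() == 1
    assert wf.getsampwidth() == 2
    assert wf.getframerate() == 24000
    pcm_bytes = wf.readframes(wf.getnframes())

print(service.transcribe(pcm_bytes, greedy=True))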