Example #1
def main():
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description='AN4 ASR',
        conflict_handler='resolve',
    )

    # Overwrite default args
    parser.add_argument("--train_dataset",
                        type=str,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        help="validation dataset path")

    # Create new args
    # parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str)
    parser.add_argument("--batch_size",
                        default=48,
                        type=int,
                        help="size of the training batch")
    parser.add_argument("--lm", default=None, type=str)
    parser.add_argument("--test_after_training", action='store_true')
    parser.add_argument("--momentum", type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.add_argument("--do_not_eval_at_start", action='store_true')
    parser.set_defaults(
        model_config="./configs/jasper_an4.yaml",
        train_dataset="~/TestData/an4_dataset/an4_train.json",
        eval_datasets="~/TestData/an4_dataset/an4_val.json",
        work_dir="./tmp",
        optimizer="novograd",
        num_epochs=50,
        lr=0.02,
        weight_decay=0.005,
        checkpoint_save_freq=1000,
        eval_freq=100,
        amp_opt_level="O1",
    )

    args = parser.parse_args()
    betas = (args.beta1, args.beta2)

    wer_thr = 0.20
    beam_wer_thr = 0.15

    nf = nemo.core.NeuralModuleFactory(
        local_rank=args.local_rank,
        files_to_copy=[__file__],
        optimization_level=args.amp_opt_level,
        random_seed=0,
        log_dir=args.work_dir,
        create_tb_writer=True,
        cudnn_benchmark=args.cudnn_benchmark,
    )
    tb_writer = nf.tb_writer
    checkpoint_dir = nf.checkpoint_dir

    # Load model definition
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    # Get vocabulary.
    vocab = jasper_params['labels']

    (
        loss,
        eval_tensors,
        callbacks,
        total_steps,
        log_probs_e,
        encoded_len_e,
    ) = create_dags(args.model_config, vocab, args, nf)

    nf.train(
        tensors_to_optimize=[loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        lr_policy=CosineAnnealing(total_steps=total_steps,
                                  min_lr=args.lr / 100),
        optimization_params={
            "num_epochs": args.num_epochs,
            "max_steps": args.max_steps,
            "lr": args.lr,
            "momentum": args.momentum,
            "betas": betas,
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None,
        },
        batches_per_step=args.iter_per_step,
        amp_max_loss_scale=256.0,
        # synced_batchnorm=(nf.global_rank is not None),
    )

    if args.test_after_training:
        logging.info("Testing greedy and beam search with LM WER.")
        # Create BeamSearch NM
        if nf.world_size > 1 or args.lm is None:
            logging.warning(
                "Skipping beam search WER: it requires a language model "
                "(--lm) and does not work with distributed training."
            )
        else:
            beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                vocab=vocab,
                beam_width=64,
                alpha=2.0,
                beta=1.5,
                lm_path=args.lm,
                num_cpus=max(os.cpu_count() or 1, 1),  # cpu_count() may return None
            )
            beam_predictions = beam_search_with_lm(
                log_probs=log_probs_e, log_probs_length=encoded_len_e)
            eval_tensors.append(beam_predictions)

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3], vocab)
            wer = word_error_rate(hypotheses=greedy_hypotheses,
                                  references=references)
            logging.info("Greedy WER: {:.2f}%".format(wer * 100))
            if wer > wer_thr:
                nf.sync_all_processes(False)
                raise ValueError(f"Final eval greedy WER {wer * 100:.2f}% > :"
                                 f"than {wer_thr * 100:.2f}%")
        nf.sync_all_processes()

        if nf.world_size == 1 and args.lm is not None:
            beam_hypotheses = []
            # Over mini-batch
            for i in evaluated_tensors[-1]:
                # Over samples
                for j in i:
                    beam_hypotheses.append(j[0][1])

            beam_wer = word_error_rate(hypotheses=beam_hypotheses,
                                       references=references)
            logging.info("Beam WER {:.2f}%".format(beam_wer * 100))
            assert beam_wer <= beam_wer_thr, "Final eval beam WER {:.2f}% > {:.2f}%".format(
                beam_wer * 100, beam_wer_thr * 100)
            assert beam_wer <= wer, "Final eval beam WER is higher than the greedy WER."

        # Reload model weights and train for 10 extra epochs
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=checkpoint_dir,
            step_freq=args.checkpoint_save_freq,
            force_load=True,
        )

        # Distributed Data Parallel changes the underlying class so we need
        # to reinstantiate Encoder and Decoder
        args.num_epochs += 10
        previous_step_count = total_steps
        loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(
            args.model_config, vocab, args, nf)

        nf.reset_trainer()
        nf.train(
            tensors_to_optimize=[loss],
            callbacks=callbacks,
            optimizer=args.optimizer,
            lr_policy=CosineAnnealing(warmup_steps=previous_step_count,
                                      total_steps=total_steps),
            optimization_params={
                "num_epochs": args.num_epochs,
                "lr": args.lr / 100,
                "momentum": args.momentum,
                "betas": betas,
                "weight_decay": args.weight_decay,
                "grad_norm_clip": None,
            },
            reset=True,
            amp_max_loss_scale=256.0,
            # synced_batchnorm=(nf.global_rank is not None),
        )

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3], vocab)
            wer_new = word_error_rate(hypotheses=greedy_hypotheses,
                                      references=references)
            logging.info("New greedy WER: {:.2f}%".format(wer_new * 100))
            if wer_new > wer * 1.1:
                nf.sync_all_processes(False)
                raise ValueError(
                    f"Fine-tuning: new WER {wer_new * 100:.2f}% is more than "
                    f"10% above the previous WER {wer * 100:.2f}%")
        nf.sync_all_processes()

        # Open the log file and ensure that epoch numbers are strictly increasing
        if nf._exp_manager.log_file:
            epochs = []
            with open(nf._exp_manager.log_file, "r") as log_file:
                line = log_file.readline()
                while line:
                    index = line.find("Starting epoch")
                    if index != -1:
                        epochs.append(int(line[index +
                                               len("Starting epoch"):]))
                    line = log_file.readline()
            for i, e in enumerate(epochs):
                if i != e:
                    raise ValueError(
                        "Epoch numbers in the log file do not increase "
                        "contiguously from 0")
Example #2
def main():
    parser = argparse.ArgumentParser(description='Jasper')
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    parser.add_argument("--vocab_file", type=str, required=True)
    parser.add_argument("--save_logprob", default=None, type=str)
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument("--beam_width", default=50, type=int)
    parser.add_argument("--alpha", default=2.0, type=float)
    parser.add_argument("--beta", default=1.0, type=float)
    parser.add_argument("--cutoff_prob", default=0.99, type=float)
    parser.add_argument("--cutoff_top_n", default=40, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoding with an LM does not currently support "
                "multi-GPU evaluation."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=nemo.core.Optimization.mxprO1,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Running on all GPUs')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    vocab = load_vocab(args.vocab_file)

    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    n = len(data_layer)
    logging.info('Evaluating {0} examples'.format(n))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    if args.lm_path:
        beam_width = args.beam_width
        alpha = args.alpha
        beta = args.beta
        cutoff_prob = args.cutoff_prob
        cutoff_top_n = args.cutoff_top_n
        beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=vocab,
            beam_width=beam_width,
            alpha=alpha,
            beta=beta,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n,
            lm_path=args.lm_path,
            num_cpus=max(os.cpu_count() or 1, 1),  # cpu_count() may return None
        )

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    (
        audio_signal_e1,
        a_sig_length_e1,
        transcript_e1,
        transcript_len_e1,
    ) = data_layer()
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1,
        predictions_e1,
        transcript_e1,
        transcript_len_e1,
        encoded_len_e1,
    ]

    if args.lm_path:
        beam_predictions_e1 = beam_search_with_lm(
            log_probs=log_probs_e1, log_probs_length=encoded_len_e1)
        eval_tensors.append(beam_predictions_e1)

    evaluated_tensors = neural_factory.infer(
        tensors=eval_tensors,
        checkpoint_dir=load_dir,
    )

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)
    cer = word_error_rate(hypotheses=greedy_hypotheses,
                          references=references,
                          use_cer=True)
    logging.info("Greedy CER {:.2f}%".format(cer * 100))

    if args.lm_path:
        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples
            for j in i:
                beam_hypotheses.append(j[0][1])

        cer = word_error_rate(hypotheses=beam_hypotheses,
                              references=references,
                              use_cer=True)
        logging.info("Beam CER {:.2f}".format(cer * 100))

    if args.save_logprob:
        # Convert logits to list of numpy arrays
        logprob = []
        for i, batch in enumerate(evaluated_tensors[0]):
            for j in range(batch.shape[0]):
                logprob.append(
                    batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)
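The pickled log-probabilities can be reloaded later, for example for the offline LM tuning shown in the next example. A minimal sketch (the file path below is hypothetical; use whatever was passed to --save_logprob):

import pickle

# Each entry is a (time, num_classes) numpy array of per-frame log-probs.
with open("an4_logprobs.pkl", "rb") as f:  # hypothetical path
    logprob = pickle.load(f)
print(f"{len(logprob)} utterances; first has shape {logprob[0].shape}")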
Example #3
def main():
    parser = argparse.ArgumentParser(description='Jasper')
    # model params
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    # run params
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=64, type=int)
    parser.add_argument("--amp_opt_level", default="O1", type=str)
    # store results
    parser.add_argument("--save_logprob", default=None, type=str)

    # lm inference parameters
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument('--alpha',
                        default=2.0,
                        type=float,
                        help='value of LM weight',
                        required=False)
    parser.add_argument(
        '--alpha_max',
        type=float,
        help='maximum value of LM weight (for a grid search in \'eval\' mode)',
        required=False,
    )
    parser.add_argument('--alpha_step',
                        type=float,
                        help='step for LM weight\'s tuning in \'eval\' mode',
                        required=False,
                        default=0.1)
    parser.add_argument('--beta',
                        default=1.5,
                        type=float,
                        help='value of word count weight',
                        required=False)
    parser.add_argument(
        '--beta_max',
        type=float,
        help='maximum value of word count weight '
        '(for a grid search in \'eval\' mode)',
        required=False,
    )
    parser.add_argument(
        '--beta_step',
        type=float,
        help='step for word count weight\'s tuning in \'eval\' mode',
        required=False,
        default=0.1,
    )
    parser.add_argument("--beam_width", default=128, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoding with an LM does not currently support "
                "multi-GPU evaluation."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Running on all GPUs')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    N = len(data_layer)
    logging.info('Evaluating {0} examples'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"])
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Define inference DAG
    (audio_signal_e1, a_sig_length_e1, transcript_e1,
     transcript_len_e1) = data_layer()
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1, predictions_e1, transcript_e1, transcript_len_e1,
        encoded_len_e1
    ]

    # inference
    evaluated_tensors = neural_factory.infer(tensors=eval_tensors,
                                             checkpoint_dir=load_dir)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)

    wer = word_error_rate(hypotheses=greedy_hypotheses, references=references)
    logging.info("Greedy WER {:.2f}%".format(wer * 100))

    # Convert logits to list of numpy arrays
    logprob = []
    for i, batch in enumerate(evaluated_tensors[0]):
        for j in range(batch.shape[0]):
            logprob.append(
                batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
    if args.save_logprob:
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)

    # language model
    if args.lm_path:
        if args.alpha_max is None:
            args.alpha_max = args.alpha
        # include alpha_max in tuning range
        args.alpha_max += args.alpha_step / 10.0

        if args.beta_max is None:
            args.beta_max = args.beta
        # include beta_max in tuning range
        args.beta_max += args.beta_step / 10.0

        beam_wers = []

        logprobexp = [np.exp(p) for p in logprob]
        for alpha in np.arange(args.alpha, args.alpha_max, args.alpha_step):
            for beta in np.arange(args.beta, args.beta_max, args.beta_step):
                logging.info('================================')
                logging.info(f'Inferring with (alpha, beta): ({alpha}, {beta})')
                beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                    vocab=vocab,
                    beam_width=args.beam_width,
                    alpha=alpha,
                    beta=beta,
                    lm_path=args.lm_path,
                    num_cpus=max(os.cpu_count() or 1, 1),  # cpu_count() may return None
                    input_tensor=False,
                )

                beam_predictions = beam_search_with_lm(log_probs=logprobexp,
                                                       log_probs_length=None,
                                                       force_pt=True)

                beam_predictions = [b[0][1] for b in beam_predictions[0]]
                lm_wer = word_error_rate(hypotheses=beam_predictions,
                                         references=references)
                logging.info("Beam WER {:.2f}%".format(lm_wer * 100))
                beam_wers.append(((alpha, beta), lm_wer * 100))

        logging.info('Beam WER for (alpha, beta)')
        logging.info('================================')
        logging.info('\n' + '\n'.join([str(e) for e in beam_wers]))
        logging.info('================================')
        best_beam_wer = min(beam_wers, key=lambda x: x[1])
        logging.info('Best (alpha, beta): '
                     f'{best_beam_wer[0]}, '
                     f'WER: {best_beam_wer[1]:.2f}%')
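The alpha_step / 10.0 (and beta_step / 10.0) nudge above exists because np.arange excludes its stop value; a quick self-contained illustration:

import numpy as np

# np.arange stops short of the upper bound, so alpha_max would be skipped:
print(np.arange(2.0, 3.0, 0.5))         # [2.  2.5]
# Nudging the bound up by a fraction of the step keeps it in the sweep:
print(np.arange(2.0, 3.0 + 0.05, 0.5))  # [2.  2.5 3. ]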
Example #4
def main():
    # Usage and Command line arguments
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5-En",
        required=True,
        help="Pass: 'QuartzNet15x5-En', 'QuartzNet15x5-Zh', "
        "or 'JasperNet10x5-En'",
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=1,
                        help="batch size to use for evaluation")
    parser.add_argument("--wer_target",
                        type=float,
                        default=None,
                        help="used by test")
    parser.add_argument("--wer_tolerance",
                        type=float,
                        default=1.0,
                        help="used by test")
    parser.add_argument("--trim_silence",
                        default=True,
                        type=bool,
                        help="trim audio from silence or not")
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    args = parser.parse_args()

    # Setup NeuralModuleFactory to control training
    # instantiate Neural Factory with supported backend
    nf = nemo.core.NeuralModuleFactory()

    # Instantiate the model which we'll train
    logging.info(f"Speech2Text: Will fine-tune from {args.asr_model}")
    asr_model = nemo_asr.models.ASRConvCTCModel.from_pretrained(
        model_info=args.asr_model)
    asr_model.eval()

    logging.info("\n\n")
    logging.info(f"Evaluation using {type(asr_model)} model.")
    logging.info(f"Evaluation using alphabet {asr_model.vocabulary}.")
    logging.info(f"The model has {asr_model.num_weights} weights.\n\n")

    eval_data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.dataset,
        labels=asr_model.vocabulary,
        batch_size=args.eval_batch_size,
        trim_silence=args.trim_silence,
        shuffle=False,
        normalize_transcripts=args.normalize_text,
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    (audio_signal, audio_signal_len, transcript,
     transcript_len) = eval_data_layer()
    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    predictions = greedy_decoder(log_probs=log_probs)

    # inference
    eval_tensors = [
        log_probs, predictions, transcript, transcript_len, encoded_len
    ]
    evaluated_tensors = nf.infer(tensors=eval_tensors)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                 asr_model.vocabulary)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3],
                                          asr_model.vocabulary)

    if args.asr_model.strip().endswith('-Zh'):
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=True)
        metric = 'CER'
    else:
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=False)
        metric = 'WER'
    logging.info(f"Greedy {metric} = {val}")
    if args.wer_target is not None:
        if args.wer_target * args.wer_tolerance < val:
            raise ValueError(
                f"Resulting {metric} {val} is higher than the target "
                f"{args.wer_target}"
            )
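Note that argparse's type=bool converts any non-empty string to True, so "--trim_silence False" would still enable trimming. If explicit true/false flags are wanted, a small converter is a common workaround; a sketch (the str2bool helper is ours, not part of the script above):

import argparse

def str2bool(v: str) -> bool:
    """Parse textual booleans instead of relying on bool('False') == True."""
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {v!r}")

parser = argparse.ArgumentParser()
parser.add_argument("--trim_silence", type=str2bool, default=True)
print(parser.parse_args(["--trim_silence", "false"]).trim_silence)  # False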
Example #5
# (tail of a train(...) call; the opening lines are truncated in this snippet)
                     optimizer='novograd',
                     optimization_params=optimization_params)

print('Inference Only')
# We've already built the inference DAG above, so all we need is to call infer().
evaluated_tensors = neural_factory.infer(
    # These are the tensors we want to get from the model.
    tensors=[loss_test, preds_test, transcript_test, transcript_len_test],
    # checkpoint_dir specifies where the model params are loaded from.
    checkpoint_dir=(data_dir + '/an4_checkpoints'))

# Process the results to get WER
greedy_hypotheses = helpers.post_process_predictions(evaluated_tensors[1],
                                                     labels)

references = helpers.post_process_transcripts(evaluated_tensors[2],
                                              evaluated_tensors[3], labels)

wer = helpers.word_error_rate(hypotheses=greedy_hypotheses,
                              references=references)
print("*** Greedy WER: {:.2f} ***".format(wer * 100))
"""And that's it!

## Model Improvements

You already have all you need to create your own ASR model in NeMo, but there are a few more tricks that you can employ if you so desire. In this section, we'll briefly cover a few possibilities for improving an ASR model.

### Data Augmentation

There are several ASR data augmentation methods that can effectively increase the size and diversity of the training set.

For example, we can perform augmentation on the spectrograms by zeroing out specific frequency segments ("frequency masking") or time segments ("time masking") as described by [SpecAugment](https://arxiv.org/abs/1904.08779), or zero out rectangles on the spectrogram as in [Cutout](https://arxiv.org/pdf/1708.04552.pdf). In NeMo, we can do all three of these by simply adding in a `SpectrogramAugmentation` neural module. (As of now, it does not perform the time warping from the SpecAugment paper.)
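A minimal sketch of wiring this into the training DAG, reusing the preprocessor and encoder names from the examples above (the mask counts and widths are illustrative, not tuned):

```python
# Hypothetical augmentation settings; tune these per dataset.
spec_augment = nemo_asr.SpectrogramAugmentation(
    freq_masks=2,    # SpecAugment frequency masking
    time_masks=2,    # SpecAugment time masking
    freq_width=15,
    time_width=25,
    rect_masks=5,    # Cutout-style rectangular masks
    rect_time=60,
    rect_freq=25,
)

# Insert between the preprocessor and the encoder:
processed_signal, p_length = data_preprocessor(
    input_signal=audio_signal, length=audio_signal_len)
augmented_signal = spec_augment(input_spec=processed_signal)
encoded, encoded_len = jasper_encoder(
    audio_signal=augmented_signal, length=p_length)
```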