Example #1
def create_all_dags(args, neural_factory):
    logger = neural_factory.logger
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count() or 1  # os.cpu_count() can return None
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer)
    steps_per_epoch = int(
        N / (args.batch_size * args.iter_per_step * args.num_gpus))
    logger.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    multiply_batch_config = jasper_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        neural_factory.logger.info("No evaluation datasets were passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"])

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
        factory=neural_factory)

    ctc_loss = nemo_asr.CTCLossNM(
        num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    logger.info('================================')
    logger.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logger.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logger.info(
        f"Total number of parameters in encoder and decoder: "
        f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logger.info('================================')

    # Train DAG
    audio_signal_t, a_sig_length_t, \
        transcript_t, transcript_len_t = data_layer()
    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t,
        length=a_sig_length_t)

    if multiply_batch_config:
        processed_signal_t, p_length_t, transcript_t, transcript_len_t = \
            multiply_batch(
                in_x=processed_signal_t, in_x_len=p_length_t,
                in_y=transcript_t,
                in_y_len=transcript_len_t)

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(
            input_spec=processed_signal_t)

    encoded_t, encoded_len_t = jasper_encoder(
        audio_signal=processed_signal_t,
        length=p_length_t)
    log_probs_t = jasper_decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t)

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(
            monitor_asr_train_progress,
            labels=vocab,
            logger=logger),
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        step_freq=args.checkpoint_save_freq)

    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e = \
            eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e,
            length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(
            audio_signal=processed_signal_e,
            length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e)

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e,
                          transcript_e, transcript_len_e],
            user_iter_callback=partial(
                process_evaluation_batch,
                labels=vocab),
            user_epochs_done_callback=partial(
                process_evaluation_epoch,
                tag=tagname,
                logger=logger),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer)

        callbacks.append(eval_callback)
    return loss_t, callbacks, steps_per_epoch
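
Note: create_all_dags only assembles the graphs; a driver still has to hand its outputs to the factory's training loop. A minimal driver sketch, assuming the NeMo 0.x API used above and the same argparse arguments and CosineAnnealing policy that Example #2 below uses:

train_loss, callbacks, steps_per_epoch = create_all_dags(args, neural_factory)

neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    optimizer=args.optimizer,
    lr_policy=CosineAnnealing(
        total_steps=steps_per_epoch * args.num_epochs),
    optimization_params={
        "num_epochs": args.num_epochs,
        "lr": args.lr,
        "weight_decay": args.weight_decay,
    },
    batches_per_step=args.iter_per_step)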
Example #2
File: jasper_an4.py  Project: yarenty/NeMo
def main():
    parser = argparse.ArgumentParser(parents=[nm_argparse.NemoArgParser()],
                                     description='AN4 ASR',
                                     conflict_handler='resolve')

    # Overwrite default args
    parser.add_argument("--train_dataset",
                        type=str,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        nargs=1,
                        help="validation dataset path")

    # Create new args
    parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str)
    parser.add_argument("--test_after_training", action='store_true')
    parser.add_argument("--momentum", type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.set_defaults(
        model_config="./configs/jasper_an4.yaml",
        train_dataset="/home/mrjenkins/TestData/an4_dataset/an4_train.json",
        eval_datasets="/home/mrjenkins/TestData/an4_dataset/an4_val.json",
        work_dir="./tmp",
        checkpoint_dir="./tmp",
        optimizer="novograd",
        num_epochs=50,
        batch_size=32,
        eval_batch_size=16,
        lr=0.02,
        weight_decay=0.005,
        checkpoint_save_freq=1000,
        eval_freq=100,
        amp_opt_level="O1")

    args = parser.parse_args()
    betas = (args.beta1, args.beta2)

    wer_thr = 0.20
    beam_wer_thr = 0.15

    nf = nemo.core.NeuralModuleFactory(local_rank=args.local_rank,
                                       optimization_level=args.amp_opt_level,
                                       random_seed=0,
                                       log_dir=args.work_dir,
                                       checkpoint_dir=args.checkpoint_dir,
                                       create_tb_writer=True,
                                       cudnn_benchmark=args.cudnn_benchmark)
    tb_writer = nf.tb_writer
    checkpoint_dir = nf.checkpoint_dir
    args.checkpoint_dir = nf.checkpoint_dir

    # Load model definition
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    # build train and eval model
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        **train_dl_params)

    num_samples = len(data_layer)
    total_steps = int(num_samples * args.num_epochs / args.batch_size)
    print("Train samples=", num_samples, "num_steps=", total_steps)

    data_preprocessor = nemo_asr.AudioPreprocessing(
        sample_rate=sample_rate, **jasper_params["AudioPreprocessing"])

    # data_augmentation = nemo_asr.SpectrogramAugmentation(
    #     **jasper_params['SpectrogramAugmentation']
    # )

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.eval_batch_size,
        **eval_dl_params)

    num_samples = len(data_layer_eval)
    nf.logger.info(f"Eval samples={num_samples}")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioPreprocessing"]["features"],
        **jasper_params["JasperEncoder"])

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training model
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio,
                                                 length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed,
                                          length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(log_probs=log_probs,
                    targets=transcript,
                    input_length=encoded_len,
                    target_length=transcript_len)

    # Evaluation model
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                     length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                              length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(log_probs=log_probs_e,
                      targets=transcript_e,
                      input_length=encoded_len_e,
                      target_length=transcript_len_e)
    nf.logger.info("Num of params in encoder: {0}".format(
        jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=lambda x: monitor_asr_train_progress(x, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=tb_writer,
    )

    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=checkpoint_dir, step_freq=args.checkpoint_save_freq)

    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=lambda x, y: process_evaluation_batch(
            x, y, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=tb_writer)

    nf.train(tensors_to_optimize=[loss],
             callbacks=[train_callback, eval_callback, checkpointer_callback],
             optimizer=args.optimizer,
             lr_policy=CosineAnnealing(total_steps=total_steps),
             optimization_params={
                 "num_epochs": args.num_epochs,
                 "max_steps": args.max_steps,
                 "lr": args.lr,
                 "momentum": args.momentum,
                 "betas": betas,
                 "weight_decay": args.weight_decay,
                 "grad_norm_clip": None
             },
             batches_per_step=args.iter_per_step)

    if args.test_after_training:
        # Create BeamSearch NM
        beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=vocab,
            beam_width=64,
            alpha=2.,
            beta=1.5,
            lm_path=args.lm,
            num_cpus=max(os.cpu_count() or 1, 1))  # cpu_count() can be None
        beam_predictions = beam_search_with_lm(log_probs=log_probs_e,
                                               log_probs_length=encoded_len_e)
        eval_tensors.append(beam_predictions)

        evaluated_tensors = nf.infer(eval_tensors)
        greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                     vocab)
        references = post_process_transcripts(evaluated_tensors[2],
                                              evaluated_tensors[3], vocab)
        wer = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references)
        nf.logger.info("Greedy WER: {:.2f}".format(wer * 100))
        assert wer <= wer_thr, (
            "Final eval greedy WER {:.2f}% greater than {:.2f}%".format(
                wer * 100, wer_thr * 100))

        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples
            for j in i:
                beam_hypotheses.append(j[0][1])

        beam_wer = word_error_rate(hypotheses=beam_hypotheses,
                                   references=references)
        nf.logger.info("Beam WER {:.2f}%".format(beam_wer * 100))
        assert beam_wer <= beam_wer_thr, (
            "Final eval beam WER {:.2f}% greater than {:.2f}%".format(
                beam_wer * 100, beam_wer_thr * 100))
        assert beam_wer <= wer, (
            "Final eval beam WER greater than the greedy WER.")

        # Reload model weights and train for extra 10 epochs
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=checkpoint_dir,
            step_freq=args.checkpoint_save_freq,
            force_load=True)

        nf.reset_trainer()
        nf.train(tensors_to_optimize=[loss],
                 callbacks=[train_callback, checkpointer_callback],
                 optimizer=args.optimizer,
                 optimization_params={
                     "num_epochs": args.num_epochs + 10,
                     "lr": args.lr,
                     "momentum": args.momentum,
                     "betas": betas,
                     "weight_decay": args.weight_decay,
                     "grad_norm_clip": None
                 },
                 reset=True)

        evaluated_tensors = nf.infer(eval_tensors[:-1])
        greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                     vocab)
        references = post_process_transcripts(evaluated_tensors[2],
                                              evaluated_tensors[3], vocab)
        wer_new = word_error_rate(hypotheses=greedy_hypotheses,
                                  references=references)
        nf.logger.info("New greedy WER: {:.2f}%".format(wer_new * 100))
        assert wer_new <= wer * 1.1, (
            f"Fine tuning: new WER {wer_new * 100:.2f}% greater than the "
            f"previous WER {wer * 100:.2f}%")
Example #3
File: jasper_an4.py  Project: mritu301/NeMo
def create_dags(jasper_params, args, nf):
    vocab = jasper_params['labels']

    # build train and eval model
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=vocab,
        batch_size=args.batch_size,
        **train_dl_params)

    num_samples = len(data_layer)
    steps_per_epoch = math.ceil(
        num_samples / (args.batch_size * args.iter_per_step * nf.world_size))
    total_steps = steps_per_epoch * args.num_epochs
    print("Train samples=", num_samples, "num_steps=", total_steps)

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    # data_augmentation = nemo_asr.SpectrogramAugmentation(
    #     **jasper_params['SpectrogramAugmentation']
    # )

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=vocab,
        batch_size=args.eval_batch_size,
        **eval_dl_params)

    num_samples = len(data_layer_eval)
    nemo.logging.info(f"Eval samples={num_samples}")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"])

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        num_classes=len(vocab), **jasper_params["JasperDecoderForCTC"])

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training model
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio,
                                                 length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed,
                                          length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(log_probs=log_probs,
                    targets=transcript,
                    input_length=encoded_len,
                    target_length=transcript_len)

    # Evaluation model
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                     length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                              length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(log_probs=log_probs_e,
                      targets=transcript_e,
                      input_length=encoded_len_e,
                      target_length=transcript_len_e)
    nemo.logging.info("Num of params in encoder: {0}".format(
        jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=nf.tb_writer,
    )

    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq)

    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=partial(process_evaluation_batch, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=nf.tb_writer)
    callbacks = [train_callback, checkpointer_callback, eval_callback]
    return (loss, eval_tensors, callbacks, total_steps, vocab, log_probs_e,
            encoded_len_e)
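
Note: unlike Example #2, the step count here uses math.ceil and divides by nf.world_size, so multi-GPU runs schedule proportionally fewer optimizer steps per epoch. A quick worked computation with hypothetical values (num_samples=1000, batch_size=32, iter_per_step=1, world_size=2, num_epochs=50):

import math
# Hypothetical values, for illustration only
steps_per_epoch = math.ceil(1000 / (32 * 1 * 2))  # ceil(15.625) = 16
total_steps = steps_per_epoch * 50                # 16 * 50 = 800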
Example #4
data_layer = nemo_asr.AudioToTextDataLayer(manifest_filepath=train_dataset,
                                           labels=labels,
                                           batch_size=32)
data_layer_val = nemo_asr.AudioToTextDataLayer(manifest_filepath=eval_datasets,
                                               labels=labels,
                                               batch_size=32,
                                               shuffle=False)

data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor()
spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)

jasper_encoder = nemo_asr.JasperEncoder(
    feat_in=64, **jasper_model_definition['JasperEncoder'])
jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                              num_classes=len(labels))
ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))
greedy_decoder = nemo_asr.GreedyCTCDecoder()

# Training DAG (Model)
audio_signal, audio_signal_len, transcript, transcript_len = data_layer()
processed_signal, processed_signal_len = data_preprocessor(
    input_signal=audio_signal, length=audio_signal_len)
aug_signal = spec_augment(input_spec=processed_signal)
encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                      length=processed_signal_len)
log_probs = jasper_decoder(encoder_output=encoded)
predictions = greedy_decoder(log_probs=log_probs)
loss = ctc_loss(log_probs=log_probs,
                targets=transcript,
                input_length=encoded_len,
                target_length=transcript_len)
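
Note: this snippet builds only the training DAG. An evaluation DAG would reuse the same module instances on data_layer_val and skip the augmentation, as Examples #2 and #3 do; a sketch using the modules instantiated above:

# Evaluation DAG sketch (reuses the modules above; no SpecAugment)
audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_val()
processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                 length=audio_len_e)
encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                          length=processed_len_e)
log_probs_e = jasper_decoder(encoder_output=encoded_e)
loss_e = ctc_loss(log_probs=log_probs_e,
                  targets=transcript_e,
                  input_length=encoded_len_e,
                  target_length=transcript_len_e)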
Example #5
def create_all_dags(args, neural_factory):
    '''
    Creates the train and eval DAGs as well as their callbacks.
    Returns the train loss tensor, the callbacks, and steps per epoch.
    '''

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        quartz_params = yaml.load(f)

    vocab = quartz_params['labels']
    sample_rate = quartz_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count() or 1  # os.cpu_count() can return None
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create data layer for training
    train_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    train_dl_params.update(quartz_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    data_layer_train = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer_train)
    steps_per_epoch = int(
        N / (args.batch_size * args.iter_per_step * args.num_gpus))

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared

    eval_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    eval_dl_params.update(quartz_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        nemo.logging.warning("No evaluation datasets were passed")

    # create shared modules

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **quartz_params["AudioToMelSpectrogramPreprocessor"])

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(
        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **quartz_params["JasperEncoder"])

    decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # create augmentation modules (only used for training) if their configs
    # are present

    multiply_batch_config = quartz_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = quartz_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    # assemble train DAG

    audio_signal_t, a_sig_length_t, \
        transcript_t, transcript_len_t = data_layer_train()

    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        processed_signal_t, p_length_t, transcript_t, transcript_len_t = \
            multiply_batch(
                in_x=processed_signal_t, in_x_len=p_length_t,
                in_y=transcript_t,
                in_y_len=transcript_len_t)

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(
            input_spec=processed_signal_t)

    encoded_t, encoded_len_t = encoder(audio_signal=processed_signal_t,
                                       length=p_length_t)
    log_probs_t = decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(log_probs=log_probs_t,
                      targets=transcript_t,
                      input_length=encoded_len_t,
                      target_length=transcript_len_t)

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=neural_factory.tb_writer)

    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir,
            load_from_folder=args.load_dir,
            step_freq=args.checkpoint_save_freq)

        callbacks.append(chpt_callback)

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):

        audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e = \
            eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = encoder(audio_signal=processed_signal_e,
                                           length=p_length_e)
        log_probs_e = decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(log_probs=log_probs_e,
                          targets=transcript_e,
                          input_length=encoded_len_e,
                          target_length=transcript_len_e)

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]

        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[
                loss_e, predictions_e, transcript_e, transcript_len_e
            ],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch,
                                              tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer)

        callbacks.append(eval_callback)

    return loss_t, callbacks, steps_per_epoch
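
Note: each eval dataset gets its own DAG and EvaluatorCallback, tagged with the manifest's base filename so multiple validation curves stay distinguishable in TensorBoard. For example, with the AN4 manifest path used in Example #2:

import os
path = "/home/mrjenkins/TestData/an4_dataset/an4_val.json"
tagname = os.path.basename(path).split(".")[0]  # "an4_val"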
Example #6
    def test_simple_dags(self):
        # module instantiation
        with open("tests/data/jasper_smaller.yaml") as file:
            jasper_model_definition = self.yaml.load(file)
        labels = jasper_model_definition['labels']

        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=labels,
            batch_size=4)
        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **jasper_model_definition['AudioToMelSpectrogramPreprocessor'])
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'])
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))
        greedy_decoder = nemo_asr.GreedyCTCDecoder()

        # DAG definition
        audio_signal, audio_signal_len, transcript, transcript_len = \
            data_layer()
        processed_signal, processed_signal_len = data_preprocessor(
            input_signal=audio_signal, length=audio_signal_len)

        spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
        aug_signal = spec_augment(input_spec=processed_signal)

        encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                              length=processed_signal_len)
        log_probs = jasper_decoder(encoder_output=encoded)
        predictions = greedy_decoder(log_probs=log_probs)
        loss = ctc_loss(log_probs=log_probs,
                        targets=transcript,
                        input_length=encoded_len,
                        target_length=transcript_len)

        def wrong():
            with open("tests/data/jasper_smaller.yaml") as file:
                jasper_config = self.yaml.load(file)
            labels = jasper_config['labels']

            data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=self.manifest_filepath,
                labels=labels,
                batch_size=4)
            data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
                **jasper_config['AudioToMelSpectrogramPreprocessor'])
            jasper_encoder = nemo_asr.JasperEncoder(
                feat_in=jasper_config['AudioToMelSpectrogramPreprocessor']
                ['features'],
                **jasper_config['JasperEncoder'])
            jasper_decoder = nemo_asr.JasperDecoderForCTC(
                feat_in=1024, num_classes=len(labels))
            # DAG definition
            audio_signal, audio_signal_len, transcript, transcript_len = \
                data_layer()
            processed_signal, processed_signal_len = data_preprocessor(
                input_signal=audio_signal, length=audio_signal_len)

            spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
            aug_signal = spec_augment(input_spec=processed_signal)

            encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                                  length=processed_signal_len)
            # Intentional bug: the decoder is fed the preprocessor output
            # instead of the encoder output, mismatching the port types
            log_probs = jasper_decoder(encoder_output=processed_signal)

        self.assertRaises(NeuralPortNmTensorMismatchError, wrong)
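
Note: wrong() fails because the decoder's encoder_output input is wired to the preprocessor's spectrogram rather than the encoder's output, so NeMo's neural-type checking raises NeuralPortNmTensorMismatchError while the DAG is being built. The correct wiring is the one used in the main DAG:

# Correct: feed the encoder output, not the spectrogram, to the decoder
log_probs = jasper_decoder(encoder_output=encoded)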