def test_quartznet_model_training(self):
        """Integtaion test that instantiates a small Jasper model and tests training with the sample asr data.
        test_stft_conv_training tests the torch_stft path while test_jasper_training tests the torch.stft path inside
        of AudioToMelSpectrogramPreprocessor.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        Note: Training is done with batch gradient descent as opposed to stochastic gradient descent due to CTC loss
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../../examples/asr/configs/jasper_an4.yaml"))
        ) as file:
            model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=30)
        model = nemo_asr.models.ASRConvCTCModel(
            preprocessor_params=model_definition[
                'AudioToMelSpectrogramPreprocessor'],
            encoder_params=model_definition['JasperEncoder'],
            decoder_params=model_definition['JasperDecoderForCTC'],
        )
        model.train()
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        log_probs, encoded_len = model(input_signal=audio_signal,
                                       length=a_sig_length)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        loss_list = []
        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=partial(self.print_and_log_loss,
                               loss_log_list=loss_list),
            step_freq=1)

        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 3,
                "lr": 0.001
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
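The assertion above relies on a print_and_log_loss helper defined on the test class but not shown on this page. A minimal sketch of what such a helper plausibly looks like (name and signature are assumptions inferred from the partial() call):

import logging

def print_and_log_loss(tensors, loss_log_list=None):
    # Hypothetical helper (a method on the test class in reality): log the
    # scalar loss and record it so the test can compare first and last steps.
    loss = tensors[0].item()
    logging.info(f'Train Loss: {loss}')
    if loss_log_list is not None:
        loss_log_list.append(loss)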
Example #2
    def test_freeze_unfreeze_TrainableNM(self):
        path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
        with open(path) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            #'int_values': False,
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
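        # At sample_rate=16000, window_size=0.02 s is a 320-sample window and
        # window_stride=0.01 s a 160-sample hop; each frame is zero-padded to
        # n_fft=512 before the FFT, yielding features=64 mel bins.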
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
        jasper_encoder.freeze()
        jasper_encoder.unfreeze(set(['encoder.4.mconv.0.conv.weight']))
        frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.detach().cpu().numpy()
        unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.detach().cpu().numpy()
        # jasper_decoder.unfreeze()
        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
        )
        optimizer = self.nf.get_trainer()
        optimizer.train(
            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 5, "lr": 0.0003},
        )
        new_frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.data
        new_unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.data
        self.assertTrue(np.array_equal(frozen_weight, new_frozen_weight.detach().cpu().numpy()))
        self.assertFalse(np.array_equal(unfrozen_weight, new_unfrozen_weight.detach().cpu().numpy()))
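The freeze/unfreeze mechanics this test exercises boil down to toggling requires_grad on parameters. A minimal sketch in plain PyTorch terms (an assumption about how TrainableNM behaves, not a copy of its implementation):

import torch.nn as nn

def freeze(module: nn.Module):
    # Disable gradient updates for every parameter.
    for p in module.parameters():
        p.requires_grad = False

def unfreeze(module: nn.Module, names):
    # Re-enable training for the named parameters only.
    for n, p in module.named_parameters():
        if n in names:
            p.requires_grad = True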
Example #3
    def test_asr_with_zero_ds(self):
        logging.info("Testing ASR NMs with ZeroDS and without pre-processing")
        path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
        with open(path) as file:
            jasper_model_definition = self.yaml.load(file)

        dl = nemo.backends.pytorch.common.ZerosDataLayer(
            size=100,
            dtype=torch.FloatTensor,
            batch_size=4,
            output_ports={
                # "processed_signal": NeuralType(
                #    {
                #        0: AxisType(BatchTag),
                #        1: AxisType(SpectrogramSignalTag, dim=64),
                #        2: AxisType(ProcessedTimeTag, dim=64),
                #    }
                # ),
                # "processed_length": NeuralType({0: AxisType(BatchTag)}),
                # "transcript": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag, dim=64)}),
                # "transcript_length": NeuralType({0: AxisType(BatchTag)}),
                "processed_signal": NeuralType(
                    (AxisType(AxisKind.Batch), AxisType(AxisKind.Dimension, 64), AxisType(AxisKind.Time, 64)),
                    SpectrogramType(),
                ),
                "processed_length": NeuralType(tuple('B'), LengthsType()),
                "transcript": NeuralType((AxisType(AxisKind.Batch), AxisType(AxisKind.Time, 64)), LabelsType()),
                "transcript_length": NeuralType(tuple('B'), LengthsType()),
            },
        )
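        # ZerosDataLayer emits all-zero tensors shaped by the output_ports
        # declared above, so the DAG can be exercised without real audio data.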

        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition["JasperEncoder"],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        processed_signal, p_length, transcript, transcript_len = dl()
        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
        )
        # Instantiate an optimizer to perform `train` action
        self.nf.train(
            [loss], callbacks=[callback], optimization_params={"num_epochs": 2, "lr": 0.0003}, optimizer="sgd",
        )
Example #4
    def test_freeze_unfreeze_TrainableNM(self):
        with open("tests/data/jasper_smaller.yaml") as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'int_values': False,
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
        jasper_encoder.freeze()
        jasper_encoder.unfreeze(set(['encoder.4.conv.1.weight']))
        jasper_decoder.unfreeze()
        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
        # print(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
        )
        # Instantiate an optimizer to perform `train` action
        neural_factory = nemo.core.NeuralModuleFactory(
            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
        )
        optimizer = neural_factory.get_trainer()
        optimizer.train(
            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"num_epochs": 2, "lr": 0.0003},
        )
Example #5
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = load_vocab(args.vocab_file)
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    train_dl_params["normalize_transcripts"] = False
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer)
    steps_per_epoch = int(N / (args.batch_size * args.num_gpus))
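    # Note: int() floors here, dropping any final partial batch; later
    # examples on this page use math.ceil for the same computation.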
    nemo.logging.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )

    multiply_batch_config = jasper_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        nemo.logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    nemo.logging.info('================================')
    nemo.logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    nemo.logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    nemo.logging.info(
        f"Total number of parameters in model: "
        f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    nemo.logging.info('================================')

    # Train DAG
    (
        audio_signal_t,
        a_sig_length_t,
        transcript_t,
        transcript_len_t,
    ) = data_layer()
    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        (
            processed_signal_t,
            p_length_t,
            transcript_t,
            transcript_len_t,
        ) = multiply_batch(
            in_x=processed_signal_t,
            in_x_len=p_length_t,
            in_y=transcript_t,
            in_y_len=transcript_len_t,
        )

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(
            input_spec=processed_signal_t)

    encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t,
                                              length=p_length_t)
    log_probs_t = jasper_decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t,
    )

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress,
                           labels=vocab,
                           eval_metric='CER'),
        step_freq=args.train_eval_freq,
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        step_freq=args.checkpoint_save_freq,
    )

    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (
            audio_signal_e,
            a_sig_length_e,
            transcript_e,
            transcript_len_e,
        ) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(
            audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[
                loss_e,
                predictions_e,
                transcript_e,
                transcript_len_e,
            ],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch,
                                              eval_metric='CER',
                                              tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return loss_t, callbacks, steps_per_epoch
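The tuple returned above is typically consumed by a small driver; a minimal sketch (the optimizer name and args fields are assumptions modeled on the other examples on this page):

train_loss, callbacks, steps_per_epoch = create_all_dags(args, neural_factory)

neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    optimizer="novograd",
    optimization_params={
        "num_epochs": args.num_epochs,
        "lr": args.lr,
        "weight_decay": args.weight_decay,
    },
)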
Example #6
File: test_asr.py Project: benhoff/NeMo
    def test_jasper_eval(self):
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(
                                                          self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
        greedy_decoder = nemo_asr.GreedyCTCDecoder()
        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )
        predictions = greedy_decoder(log_probs=log_probs)

        from nemo.collections.asr.helpers import (
            process_evaluation_batch,
            process_evaluation_epoch,
        )

        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss, predictions, transcript, transcript_len],
            user_iter_callback=lambda x, y: process_evaluation_batch(
                x, y, labels=self.labels),
            user_epochs_done_callback=process_evaluation_epoch,
        )
        # Instantiate an optimizer to perform `train` action
        self.nf.eval(callbacks=[eval_callback])
Example #7
File: test_asr.py Project: benhoff/NeMo
    def test_stft_conv(self):
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(
                                                          self.labels))

        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=lambda x: logging.info(str(x[0].item())))
        # Instantiate an optimizer to perform `train` action
        optimizer = self.nf.get_trainer()
        optimizer.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )
Example #8
def create_all_dags(args, neural_factory):
    '''
    Creates the train and eval DAGs as well as their callbacks.
    Returns the train loss tensor, the callbacks, and steps per epoch.'''

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        quartz_params = yaml.load(f)

    try:
        vocab = quartz_params['labels']
        sample_rate = quartz_params['sample_rate']
    except KeyError:
        logging.error("Please make sure you are using older config format (the ones with -old suffix)")
        exit(1)

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create data layer for training
    train_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    train_dl_params.update(quartz_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    data_layer_train = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer_train)
    steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus))

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared

    eval_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    eval_dl_params.update(quartz_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        logging.warning("There were no val datasets passed")

    # create shared modules

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **quartz_params["AudioToMelSpectrogramPreprocessor"],
    )

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(
        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"], **quartz_params["JasperEncoder"],
    )

    decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab),
    )

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # create augmentation modules (only used for training) if their configs
    # are present

    multiply_batch_config = quartz_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = quartz_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    # assemble train DAG

    (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer_train()

    processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch(
            in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t,
        )

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)

    encoded_t, encoded_len_t = encoder(audio_signal=processed_signal_t, length=p_length_t)
    log_probs_t = decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t,
    )

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=neural_factory.tb_writer,
    )

    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir, load_from_folder=args.load_dir, step_freq=args.checkpoint_save_freq,
        )

        callbacks.append(chpt_callback)

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = encoder(audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]

        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch, tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)

    return loss_t, callbacks, steps_per_epoch
Example #9
def create_dags(model_config_file, vocab, args, nf):

    # Create a data_layer for training.
    data_layer = nemo_asr.AudioToTextDataLayer.import_from_config(
        model_config_file,
        "AudioToTextDataLayer_train",
        overwrite_params={
            "manifest_filepath": args.train_dataset,
            "batch_size": args.batch_size
        },
    )

    num_samples = len(data_layer)
    steps_per_epoch = math.ceil(
        num_samples /
        (data_layer.batch_size * args.iter_per_step * nf.world_size))
    total_steps = steps_per_epoch * args.num_epochs
    logging.info("Train samples=", num_samples, "num_steps=", total_steps)

    # Create a data_layer for evaluation.
    data_layer_eval = nemo_asr.AudioToTextDataLayer.import_from_config(
        model_config_file,
        "AudioToTextDataLayer_eval",
        overwrite_params={"manifest_filepath": args.eval_datasets},
    )

    num_samples = len(data_layer_eval)
    logging.info(f"Eval samples={num_samples}")

    # Instantiate data processor.
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config(
        model_config_file, "AudioToMelSpectrogramPreprocessor")

    # Instantiate JASPER encoder-decoder modules.
    jasper_encoder = nemo_asr.JasperEncoder.import_from_config(
        model_config_file, "JasperEncoder")
    jasper_decoder = nemo_asr.JasperDecoderForCTC.import_from_config(
        model_config_file,
        "JasperDecoderForCTC",
        overwrite_params={"num_classes": len(vocab)})

    # Instantiate losses.
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Create a training graph.
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio,
                                                 length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed,
                                          length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    # Create an evaluation graph.
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                     length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                              length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(
        log_probs=log_probs_e,
        targets=transcript_e,
        input_length=encoded_len_e,
        target_length=transcript_len_e,
    )
    logging.info("Num of params in encoder: {0}".format(
        jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard.
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=nf.tb_writer,
    )

    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq)

    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=partial(process_evaluation_batch, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=nf.tb_writer,
        eval_at_start=not args.do_not_eval_at_start,
    )
    callbacks = [train_callback, checkpointer_callback, eval_callback]

    # Return entities required by the actual training.
    return (
        loss,
        eval_tensors,
        callbacks,
        total_steps,
        log_probs_e,
        encoded_len_e,
    )
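A sketch of how these return values feed the actual training call (hypothetical driver; the CosineAnnealing import path and flag names are taken from the later examples on this page):

from nemo.utils.lr_policies import CosineAnnealing

(loss, eval_tensors, callbacks, total_steps,
 log_probs_e, encoded_len_e) = create_dags(model_config_file, vocab, args, nf)

nf.train(
    tensors_to_optimize=[loss],
    callbacks=callbacks,
    optimizer=args.optimizer,
    optimization_params={"num_epochs": args.num_epochs, "lr": args.lr},
    lr_policy=CosineAnnealing(total_steps=total_steps,
                              warmup_ratio=args.warmup_ratio),
    batches_per_step=args.iter_per_step,
)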
Example #10
    def test_simple_dags(self):
        # module instantiation
        with open("tests/data/jasper_smaller.yaml") as file:
            jasper_model_definition = self.yaml.load(file)
        labels = jasper_model_definition['labels']

        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=labels,
            batch_size=4,
        )
        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **jasper_model_definition['AudioToMelSpectrogramPreprocessor'])
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))
        greedy_decoder = nemo_asr.GreedyCTCDecoder()

        # DAG definition
        (
            audio_signal,
            audio_signal_len,
            transcript,
            transcript_len,
        ) = data_layer()
        processed_signal, processed_signal_len = data_preprocessor(
            input_signal=audio_signal, length=audio_signal_len)

        spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
        aug_signal = spec_augment(input_spec=processed_signal)

        encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                              length=processed_signal_len)
        log_probs = jasper_decoder(encoder_output=encoded)
        predictions = greedy_decoder(log_probs=log_probs)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        def wrong():
            with open("tests/data/jasper_smaller.yaml") as file:
                jasper_config = self.yaml.load(file)
            labels = jasper_config['labels']

            data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=self.manifest_filepath,
                labels=labels,
                batch_size=4,
            )
            data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
                **jasper_config['AudioToMelSpectrogramPreprocessor'])
            jasper_encoder = nemo_asr.JasperEncoder(
                feat_in=jasper_config['AudioToMelSpectrogramPreprocessor']
                ['features'],
                **jasper_config['JasperEncoder'],
            )
            jasper_decoder = nemo_asr.JasperDecoderForCTC(
                feat_in=1024, num_classes=len(labels))
            # DAG definition
            (
                audio_signal,
                audio_signal_len,
                transcript,
                transcript_len,
            ) = data_layer()
            processed_signal, processed_signal_len = data_preprocessor(
                input_signal=audio_signal, length=audio_signal_len)

            spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
            aug_signal = spec_augment(input_spec=processed_signal)

            encoded, encoded_len = jasper_encoder(audio_signal=aug_signal,
                                                  length=processed_signal_len)
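            # Deliberate wiring error: the decoder is fed the raw spectrogram
            # instead of the encoder output, so the neural port types mismatch.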
            log_probs = jasper_decoder(encoder_output=processed_signal)

        self.assertRaises(NeuralPortNmTensorMismatchError, wrong)
Example #11
    def test_stft_conv_training(self):
        """Integtaion test that instantiates a small Jasper model and tests training with the sample asr data.
        test_stft_conv_training tests the torch_stft path while test_jasper_training tests the torch.stft path inside
        of AudioToMelSpectrogramPreprocessor.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        Note: Training is done with batch gradient descent as opposed to stochastic gradient descent due to CTC loss
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=30)
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                      num_classes=len(
                                                          self.labels))

        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        loss_list = []
        callback = SimpleLossLoggerCallback(tensors=[loss],
                                            print_func=partial(
                                                self.print_and_log_loss,
                                                loss_log_list=loss_list),
                                            step_freq=1)

        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 3,
                "lr": 0.001
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
Example #12
    def test_contextnet_ctc_training(self):
        """Integtaion test that instantiates a small ContextNet model and tests training with the sample asr data.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        Note: Training is done with batch gradient descent as opposed to stochastic gradient descent due to CTC loss
        Checks SE-block with fixed context size and global context, residual_mode='stride_add' and 'stride_last' flags
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/contextnet_32.yaml"))) as f:
            contextnet_model_definition = self.yaml.load(f)
        dl = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=30)
        pre_process_params = {
            'frame_splicing': 1,
            'features': 80,
            'window_size': 0.025,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)

        spec_aug = nemo_asr.SpectrogramAugmentation(
            **contextnet_model_definition['SpectrogramAugmentation'])

        contextnet_encoder = nemo_asr.ContextNetEncoder(
            feat_in=contextnet_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **contextnet_model_definition['ContextNetEncoder'],
        )
        contextnet_decoder = nemo_asr.ContextNetDecoderForCTC(feat_in=32,
                                                              hidden_size=16,
                                                              num_classes=len(
                                                                  self.labels))
        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        processed_signal = spec_aug(input_spec=processed_signal)

        encoded, encoded_len = contextnet_encoder(
            audio_signal=processed_signal, length=p_length)
        log_probs = contextnet_decoder(encoder_output=encoded)
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        loss_list = []
        callback = SimpleLossLoggerCallback(tensors=[loss],
                                            print_func=partial(
                                                self.print_and_log_loss,
                                                loss_log_list=loss_list),
                                            step_freq=1)

        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 3,
                "lr": 0.001
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
Example #13
def create_dags(jasper_params, args, nf):
    vocab = jasper_params['labels']

    # build train and eval model
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=vocab,
        batch_size=args.batch_size,
        **train_dl_params,
    )

    num_samples = len(data_layer)
    steps_per_epoch = math.ceil(
        num_samples / (args.batch_size * args.iter_per_step * nf.world_size))
    total_steps = steps_per_epoch * args.num_epochs
    logging.info("Train samples=", num_samples, "num_steps=", total_steps)

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    # data_augmentation = nemo_asr.SpectrogramAugmentation(
    #     **jasper_params['SpectrogramAugmentation']
    # )

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=vocab,
        batch_size=args.eval_batch_size,
        **eval_dl_params,
    )

    num_samples = len(data_layer_eval)
    logging.info(f"Eval samples={num_samples}")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"])

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        num_classes=len(vocab), **jasper_params["JasperDecoderForCTC"])

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training model
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio,
                                                 length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed,
                                          length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    # Evaluation model
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                     length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                              length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(
        log_probs=log_probs_e,
        targets=transcript_e,
        input_length=encoded_len_e,
        target_length=transcript_len_e,
    )
    logging.info("Num of params in encoder: {0}".format(
        jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=nf.tb_writer,
    )

    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq)

    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=partial(process_evaluation_batch, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=nf.tb_writer,
    )
    callbacks = [train_callback, checkpointer_callback, eval_callback]
    return (
        loss,
        eval_tensors,
        callbacks,
        total_steps,
        vocab,
        log_probs_e,
        encoded_len_e,
    )
Example #14
def main():
    # Usage and Command line arguments
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5-En",
        required=True,
        help="Pass 'QuartzNet15x5-En', 'QuartzNet15x5-Zh', or 'JasperNet10x5-En' to train from pre-trained "
        "models. To train from scratch, pass a path to a model config file ending with .yaml.",
    )
    parser.add_argument(
        "--amp_opt_level",
        default="O0",
        type=str,
        choices=["O0", "O1", "O2", "O3"],
        help="See: https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--train_dataset",
                        type=str,
                        required=True,
                        default=None,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        nargs="*",
                        help="evaluation datasets paths")
    parser.add_argument("--eval_freq",
                        default=1000,
                        type=int,
                        help="Evaluation frequency")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=8,
                        help="batch size to use for evaluation")
    parser.add_argument("--local_rank",
                        default=None,
                        type=int,
                        help="node rank for distributed training")
    parser.add_argument("--stats_freq",
                        default=25,
                        type=int,
                        help="frequency with which to update train stats")
    parser.add_argument("--checkpoint_dir",
                        default=None,
                        type=str,
                        help="Folder where to save checkpoints")
    parser.add_argument("--checkpoint_save_freq",
                        required=False,
                        type=int,
                        help="how often to checkpoint")
    parser.add_argument("--optimizer", default="novograd", type=str)
    parser.add_argument("--warmup_ratio",
                        default=0.02,
                        type=float,
                        help="learning rate warmup ratio")
    parser.add_argument("--batch_size",
                        required=True,
                        type=int,
                        help="train batch size per GPU")
    parser.add_argument("--num_epochs",
                        default=5,
                        type=int,
                        help="number of epochs to train")
    parser.add_argument("--lr", default=0.01, type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.5, type=float)
    parser.add_argument("--weight_decay", default=0.001, type=float)
    parser.add_argument("--iter_per_step",
                        default=1,
                        type=int,
                        help="number of grad accumulations per batch")
    parser.add_argument("--wandb_exp_name", default=None, type=str)
    parser.add_argument("--wandb_project", default=None, type=str)
    parser.add_argument("--max_train_audio_len",
                        default=16.7,
                        type=float,
                        help="max audio length")
    parser.add_argument("--do_not_trim_silence",
                        action="store_false",
                        help="Add this flag to disable silence trimming")
    parser.add_argument("--do_not_normalize_text",
                        action="store_false",
                        help="Add this flag to set to False for non-English.")
    args = parser.parse_args()

    # Setup NeuralModuleFactory to control training
    # instantiate Neural Factory with supported backend
    nf = nemo.core.NeuralModuleFactory(
        local_rank=args.local_rank,  # This is necessary for distributed training
        optimization_level=args.amp_opt_level,  # This is necessary for mixed precision optimization
        cudnn_benchmark=True,
    )

    # Instantiate the model which we'll train
    if args.asr_model.endswith('.yaml'):
        logging.info(
            f"Speech2Text: Will train from scratch using config from {args.asr_model}"
        )
        asr_model = nemo_asr.models.ASRConvCTCModel.import_from_config(
            args.asr_model)
    else:
        logging.info(f"Speech2Text: Will fine-tune from {args.asr_model}")
        asr_model = nemo_asr.models.ASRConvCTCModel.from_pretrained(
            model_info=args.asr_model, local_rank=args.local_rank)

    if args.asr_model.strip().endswith('-Zh'):
        logging.info('USING CER')
        eval_metric = 'CER'
    else:
        eval_metric = 'WER'

    logging.info("\n\n")
    logging.info(f"Speech2Text: Training on {nf.world_size} GPUs.")
    logging.info(f"Training {type(asr_model)} model.")
    logging.info(f"Training CTC model with alphabet {asr_model.vocabulary}.")
    logging.info(
        f"Training CTC model with {asr_model.num_weights} weights.\n\n")

    train_data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=asr_model.vocabulary,
        batch_size=args.batch_size,
        trim_silence=args.do_not_trim_silence,
        max_duration=args.max_train_audio_len,
        shuffle=True,
        normalize_transcripts=args.do_not_normalize_text,
    )
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(asr_model.vocabulary))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    audio_signal, audio_signal_len, transcript, transcript_len = train_data_layer()
    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(log_probs=log_probs,
                    targets=transcript,
                    input_length=encoded_len,
                    target_length=transcript_len)

    # Callbacks which we'll be using:
    callbacks = []
    # SimpleLossLogger prints basic training stats (e.g. loss) to console
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        step_freq=args.stats_freq,
        print_func=partial(monitor_asr_train_progress,
                           labels=asr_model.vocabulary,
                           eval_metric=eval_metric),
    )
    callbacks.append(train_callback)
    if args.checkpoint_dir is not None and args.checkpoint_save_freq is not None:
        # Checkpoint callback saves checkpoints periodically
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir, step_freq=args.checkpoint_save_freq)
        callbacks.append(checkpointer_callback)

    if args.wandb_exp_name is not None and args.wandb_project is not None:
        # WandbCallback saves stats to Weights&Biases
        wandb_callback = nemo.core.WandBLogger(
            step_freq=args.stats_freq,
            wandb_name=args.wandb_exp_name,
            wandb_project=args.wandb_project,
            args=args)
        callbacks.append(wandb_callback)

    # Evaluation
    if args.eval_datasets is not None and args.eval_freq is not None:
        asr_model.eval()  # switch model to evaluation mode
        logging.info(f"Will perform evaluation every {args.eval_freq} steps.")
        for ind, eval_dataset in enumerate(args.eval_datasets):
            eval_data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                labels=asr_model.vocabulary,
                batch_size=args.eval_batch_size,
                normalize_transcripts=args.do_not_normalize_text,
            )
            audio_signal, audio_signal_len, transcript, transcript_len = eval_data_layer()
            log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                               length=audio_signal_len)
            eval_predictions = greedy_decoder(log_probs=log_probs)
            eval_loss = ctc_loss(log_probs=log_probs,
                                 targets=transcript,
                                 input_length=encoded_len,
                                 target_length=transcript_len)
            tag_name = os.path.basename(eval_dataset).split(".")[0]
            eval_callback = nemo.core.EvaluatorCallback(
                eval_tensors=[
                    eval_loss, eval_predictions, transcript, transcript_len
                ],
                user_iter_callback=partial(process_evaluation_batch,
                                           labels=asr_model.vocabulary),
                user_epochs_done_callback=partial(process_evaluation_epoch,
                                                  tag=tag_name,
                                                  eval_metric=eval_metric),
                eval_step=args.eval_freq,
                wandb_name=args.wandb_exp_name,
                wandb_project=args.wandb_project,
            )
            callbacks.append(eval_callback)

    # Use integer division so the step count handed to the LR policy is whole
    steps_in_epoch = len(train_data_layer) // (
        args.batch_size * args.iter_per_step * nf.world_size)
    lr_policy = CosineAnnealing(total_steps=args.num_epochs * steps_in_epoch,
                                warmup_ratio=args.warmup_ratio)
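    # As commonly implemented (an assumption about this CosineAnnealing
    # policy, not verified against its source), the schedule warms the LR up
    # linearly over the first `warmup_ratio * total_steps` steps and then
    # follows a half-cosine decay toward zero:
    #
    #   warmup_steps = int(args.warmup_ratio * total_steps)
    #   if step < warmup_steps:
    #       scale = step / warmup_steps                       # linear warmup
    #   else:
    #       progress = (step - warmup_steps) / (total_steps - warmup_steps)
    #       scale = 0.5 * (1 + math.cos(math.pi * progress))  # cosine decay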

    nf.train(
        tensors_to_optimize=[loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "lr": args.lr,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
        },
        batches_per_step=args.iter_per_step,
        lr_policy=lr_policy,
    )
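    # Note: `batches_per_step` (args.iter_per_step) accumulates gradients over
    # that many forward/backward passes before each optimizer step, which is
    # why it also appears in the steps_in_epoch calculation above.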
Example #15
data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor.import_from_config(
    config_path, "AudioToMelSpectrogramPreprocessor"
)

# Create the Jasper_4x1 encoder as specified, and a CTC decoder
encoder = nemo_asr.JasperEncoder.import_from_config(
    config_path, "JasperEncoder"
)

decoder = nemo_asr.JasperDecoderForCTC.import_from_config(
    config_path, "JasperDecoderForCTC",
    overwrite_params={"num_classes": len(labels)}
)

ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))
greedy_decoder = nemo_asr.GreedyCTCDecoder()

# --- Assemble Training DAG --- #
audio_signal, audio_signal_len, transcript, transcript_len = data_layer_train()

processed_signal, processed_signal_len = data_preprocessor(
    input_signal=audio_signal,
    length=audio_signal_len)

encoded, encoded_len = encoder(
    audio_signal=processed_signal,
    length=processed_signal_len)

log_probs = decoder(encoder_output=encoded)
preds = greedy_decoder(log_probs=log_probs)  # Training predictions
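# The snippet stops at the decoder outputs. A minimal, hypothetical
# continuation (assuming `labels`, `data_layer_train`, and a NeuralModuleFactory
# `nf` were created earlier in the script) would attach the CTC loss and train:
#
#   loss = ctc_loss(log_probs=log_probs, targets=transcript,
#                   input_length=encoded_len, target_length=transcript_len)
#   nf.train(tensors_to_optimize=[loss], optimizer="novograd",
#            optimization_params={"num_epochs": 50, "lr": 0.02})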
Example #16
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params["labels"]
    sample_rate = jasper_params["sample_rate"]

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)
    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    if args.dataset:
        d_path = Path(args.dataset)
        if not args.train_dataset:
            args.train_dataset = str(d_path / Path("train_manifest.json"))
        if not args.eval_datasets:
            args.eval_datasets = [str(d_path / Path("test_manifest.json"))]

    data_loader_layer = nemo_asr.AudioToTextDataLayer

    if args.remote_data:
        train_dl_params["rpyc_host"] = args.remote_data
        data_loader_layer = RpycAudioToTextDataLayer

    # NOTE: the training data layer (and the steps_per_epoch bookkeeping that
    # depended on it) is disabled in this example; only evaluation DAGs are
    # assembled below.
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    # Optional MultiplyBatch and SpectrogramAugmentation modules from the
    # config are likewise disabled here.
    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    if args.remote_data:
        eval_dl_params["rpyc_host"] = args.remote_data
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    for eval_dataset in args.eval_datasets:
        data_layer_eval = data_loader_layer(
            manifest_filepath=eval_dataset,
            sample_rate=sample_rate,
            labels=vocab,
            batch_size=args.eval_batch_size,
            num_workers=cpu_per_traindl,
            **eval_dl_params,
        )

        data_layers_eval.append(data_layer_eval)

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_encoder.restore_from(args.encoder_checkpoint, local_rank=0)

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    jasper_decoder.restore_from(args.decoder_checkpoint, local_rank=0)

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # logging.info("================================")
    # logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    # logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    # logging.info(
    #     f"Total number of parameters in model: "
    #     f"{jasper_decoder.num_weights + jasper_encoder.num_weights}"
    # )
    # logging.info("================================")
    #
    # # Train DAG
    # (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t) = data_layer()
    # processed_signal_t, p_length_t = data_preprocessor(
    #     input_signal=audio_signal_t, length=a_sig_length_t
    # )
    #
    # if multiply_batch_config:
    #     (
    #         processed_signal_t,
    #         p_length_t,
    #         transcript_t,
    #         transcript_len_t,
    #     ) = multiply_batch(
    #         in_x=processed_signal_t,
    #         in_x_len=p_length_t,
    #         in_y=transcript_t,
    #         in_y_len=transcript_len_t,
    #     )
    #
    # if spectr_augment_config:
    #     processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)
    #
    # encoded_t, encoded_len_t = jasper_encoder(
    #     audio_signal=processed_signal_t, length=p_length_t
    # )
    # log_probs_t = jasper_decoder(encoder_output=encoded_t)
    # predictions_t = greedy_decoder(log_probs=log_probs_t)
    # loss_t = ctc_loss(
    #     log_probs=log_probs_t,
    #     targets=transcript_t,
    #     input_length=encoded_len_t,
    #     target_length=transcript_len_t,
    # )
    #
    # # Callbacks needed to print info to console and Tensorboard
    # train_callback = nemo.core.SimpleLossLoggerCallback(
    #     tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
    #     print_func=partial(monitor_asr_train_progress, labels=vocab),
    #     get_tb_values=lambda x: [("loss", x[0])],
    #     tb_writer=neural_factory.tb_writer,
    # )
    #
    # chpt_callback = nemo.core.CheckpointCallback(
    #     folder=neural_factory.checkpoint_dir,
    #     load_from_folder=args.load_dir,
    #     step_freq=args.checkpoint_save_freq,
    #     checkpoints_to_keep=30,
    # )
    #
    # callbacks = [train_callback, chpt_callback]
    callbacks = []
    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (audio_signal_e, a_sig_length_e, transcript_e,
         transcript_len_e) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(
            audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[
                loss_e, predictions_e, transcript_e, transcript_len_e
            ],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch,
                                              tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return callbacks
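
# A hypothetical driver for create_all_dags (the argument parsing and factory
# construction are assumptions, not part of the original snippet):
#
#   args = parse_args()
#   neural_factory = nemo.core.NeuralModuleFactory(
#       backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank)
#   callbacks = create_all_dags(args, neural_factory)
#   neural_factory.eval(callbacks=callbacks)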
Example #17
    def test_double_jasper_training(self):
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/jasper_smaller.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToTextDataLayer(
            featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=4,
        )
        pre_process_params = {
            'int_values': False,
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder1 = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        jasper_encoder2 = nemo_asr.JasperEncoder(
            feat_in=jasper_model_definition[
                'AudioToMelSpectrogramPreprocessor']['features'],
            **jasper_model_definition['JasperEncoder'],
        )
        mx_max1 = nemo.backends.pytorch.common.SimpleCombiner(mode="max")
        mx_max2 = nemo.backends.pytorch.common.SimpleCombiner(mode="max")
        jasper_decoder1 = nemo_asr.JasperDecoderForCTC(
            feat_in=1024, num_classes=len(self.labels))
        jasper_decoder2 = nemo_asr.JasperDecoderForCTC(
            feat_in=1024, num_classes=len(self.labels))

        ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

        # DAG
        audio_signal, a_sig_length, transcript, transcript_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded1, encoded_len1 = jasper_encoder1(audio_signal=processed_signal,
                                                 length=p_length)
        encoded2, encoded_len2 = jasper_encoder2(audio_signal=processed_signal,
                                                 length=p_length)
        log_probs1 = jasper_decoder1(encoder_output=encoded1)
        log_probs2 = jasper_decoder2(encoder_output=encoded2)
        log_probs = mx_max1(x1=log_probs1, x2=log_probs2)
        encoded_len = mx_max2(x1=encoded_len1, x2=encoded_len2)
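        # SimpleCombiner(mode="max") takes the element-wise maximum of its two
        # inputs, so the two encoder/decoder branches are trained jointly as a
        # max-ensemble through the single CTC loss below.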
        loss = ctc_loss(
            log_probs=log_probs,
            targets=transcript,
            input_length=encoded_len,
            target_length=transcript_len,
        )

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=lambda x: logging.info(str(x[0].item())))
        # Instantiate an optimizer to perform `train` action
        neural_factory = nemo.core.NeuralModuleFactory(
            backend=nemo.core.Backend.PyTorch,
            local_rank=None,
            create_tb_writer=False,
        )
        optimizer = neural_factory.get_trainer()
        optimizer.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )