Example No. 1
    def test_trim_silence(self):
        batch_size = 2
        normal_dl = nemo_asr.AudioToSpeechLabelDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=batch_size,
            # placement=DeviceType.GPU,
            drop_last=False,
            shuffle=False,
        )
        trimmed_dl = nemo_asr.AudioToSpeechLabelDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            trim_silence=True,
            labels=self.labels,
            batch_size=batch_size,
            # placement=DeviceType.GPU,
            drop_last=False,
            shuffle=False,
        )
        for norm, trim in zip(normal_dl.data_iterator,
                              trimmed_dl.data_iterator):
            for point in range(batch_size):
                # Index 1 of each batch tuple holds the sequence lengths;
                # trimming silence can only shorten a signal, never lengthen it.
                self.assertTrue(norm[1][point].data >= trim[1][point].data)
Example No. 2
    def test_jasper_eval(self):
        with open(
                os.path.abspath(
                    os.path.join(
                        os.path.dirname(__file__),
                        "../data/quartznet_speech_recognition.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToSpeechLabelDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=2,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
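        # For reference: at sample_rate=16000, window_size=0.02 s is a
        # 320-sample analysis window and window_stride=0.01 s a 160-sample hop;
        # n_fft=512 gives 257 linear-frequency bins that are folded down to the
        # 64 mel features requested above.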
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            **jasper_model_definition['JasperEncoder'], )
        jasper_decoder = nemo_asr.JasperDecoderForClassification(
            feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]
            ['filters'],
            num_classes=len(self.labels))
        ce_loss = nemo_asr.CrossEntropyLossNM()

        # DAG
        audio_signal, a_sig_length, targets, targets_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        logits = jasper_decoder(encoder_output=encoded)
        loss = ce_loss(
            logits=logits,
            labels=targets,
        )

        from nemo.collections.asr.helpers import (
            process_classification_evaluation_batch,
            process_classification_evaluation_epoch,
        )

        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss, logits, targets],
            user_iter_callback=lambda x, y:
            process_classification_evaluation_batch(x, y, top_k=[1]),
            user_epochs_done_callback=process_classification_evaluation_epoch,
        )
        # Instantiate an optimizer to perform `train` action
        self.nf.eval(callbacks=[eval_callback])
Example No. 3
    def test_stft_conv(self):
        with open(
                os.path.abspath(
                    os.path.join(
                        os.path.dirname(__file__),
                        "../data/quartznet_speech_recognition.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToSpeechLabelDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=2,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
            'stft_conv': True,
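            # 'stft_conv': True is the only change from the previous test: it
            # swaps torch.stft for a convolution-based STFT inside the
            # preprocessor (in this NeMo version, chiefly for exportability).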
        }
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(
            **jasper_model_definition['JasperEncoder'], )
        jasper_decoder = nemo_asr.JasperDecoderForClassification(
            feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]
            ['filters'],
            num_classes=len(self.labels))

        ce_loss = nemo_asr.CrossEntropyLossNM()

        # DAG
        audio_signal, a_sig_length, targets, targets_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        logits = jasper_decoder(encoder_output=encoded)
        loss = ce_loss(logits=logits, labels=targets)

        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=lambda x: logging.info(str(x[0].item())))
        # Instantiate an optimizer to perform `train` action
        optimizer = nemo.backends.pytorch.actions.PtActions()
        optimizer.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "num_epochs": 10,
                "lr": 0.0003
            },
        )
Example No. 4
    def test_quartznet_vad_training(self):
        """Integtaion test that instantiates a small QuartzNet model for vad and tests training with the
        sample vad data.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        """
        with open(os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/quartznet_vad.yaml"))) as file:
            jasper_model_definition = self.yaml.load(file)
        dl = nemo_asr.AudioToSpeechLabelDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=6,
        )
        pre_process_params = {
            'frame_splicing': 1,
            'features': 64,
            'window_size': 0.02,
            'n_fft': 512,
            'dither': 1e-05,
            'window': 'hann',
            'sample_rate': 16000,
            'normalize': 'per_feature',
            'window_stride': 0.01,
        }
        # preprocessing = nemo_asr.AudioToMFCCPreprocessor(**pre_process_params)
        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
        jasper_encoder = nemo_asr.JasperEncoder(**jasper_model_definition['JasperEncoder'])
        jasper_decoder = nemo_asr.JasperDecoderForClassification(
            feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]['filters'], num_classes=len(self.labels)
        )
        ce_loss = nemo_asr.CrossEntropyLossNM()

        # DAG
        audio_signal, a_sig_length, targets, targets_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
        # logging.info(jasper_encoder)
        log_probs = jasper_decoder(encoder_output=encoded)
        loss = ce_loss(logits=log_probs, labels=targets)

        loss_list = []
        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
        )

        self.nf.train(
            [loss], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.003},
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
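
The test relies on a print_and_log_loss helper on the test class. A minimal sketch of such a helper, assuming SimpleLossLoggerCallback calls print_func with the list of evaluated tensors (the name and signature are inferred from the partial() call above, not taken from the source):

    def print_and_log_loss(self, tensors, loss_log_list=None):
        # tensors[0] is the evaluated loss tensor handed over by the callback.
        loss_value = tensors[0].item()
        logging.info("Loss: %f", loss_value)
        if loss_log_list is not None:
            loss_log_list.append(loss_value)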
Example No. 5
    def test_audio_preprocessors(self):
        batch_size = 2
        dl = nemo_asr.AudioToSpeechLabelDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=batch_size,
            # placement=DeviceType.GPU,
            drop_last=False,
            shuffle=False,
        )

        installed_torchaudio = True
        try:
            import torchaudio
        except ModuleNotFoundError:
            installed_torchaudio = False
            with self.assertRaises(ModuleNotFoundError):
                to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
            with self.assertRaises(ModuleNotFoundError):
                to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)

        if installed_torchaudio:
            to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
            to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)
            time_stretch_augment = nemo_asr.TimeStretchAugmentation(
                self.featurizer_config['sample_rate'], probability=1.0, min_speed_rate=0.9, max_speed_rate=1.1
            )

        to_melspec = nemo_asr.AudioToMelSpectrogramPreprocessor(features=50)

        for batch in dl.data_iterator:
            input_signals, seq_lengths, _, _ = batch
            input_signals = input_signals.to(to_melspec._device)
            seq_lengths = seq_lengths.to(to_melspec._device)

            melspec = to_melspec.forward(input_signals, seq_lengths)

            if installed_torchaudio:
                spec = to_spectrogram.forward(input_signals, seq_lengths)
                mfcc = to_mfcc.forward(input_signals, seq_lengths)
                ts_input_signals = time_stretch_augment.forward(input_signals, seq_lengths)

            # Check that number of features is what we expect
            self.assertTrue(melspec[0].shape[1] == 50)

            if installed_torchaudio:
                self.assertTrue(spec[0].shape[1] == 201)  # n_fft // 2 + 1 bins
                self.assertTrue(mfcc[0].shape[1] == 15)

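                # Bounds rationale: speed rates in [0.9, 1.1] map a ~1 s clip
                # to roughly [sr/1.1, sr/0.9] samples, so the 0.85*sr and
                # 1.15*sr checks below leave a small margin on either side
                # (assumes ~1 s test clips).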
                timesteps = ts_input_signals[0].shape[1]
                self.assertTrue(timesteps <= int(1.15 * self.featurizer_config['sample_rate']))
                self.assertTrue(timesteps >= int(0.85 * self.featurizer_config['sample_rate']))
Example No. 6
    def test_dataloader(self):
        batch_size = 2
        dl = nemo_asr.AudioToSpeechLabelDataLayer(
            # featurizer_config=self.featurizer_config,
            manifest_filepath=self.manifest_filepath,
            labels=self.labels,
            batch_size=batch_size,
            # placement=DeviceType.GPU,
            sample_rate=16000,
        )
        for ind, data in enumerate(dl.data_iterator):
            # With the num_workers update, data is no longer guaranteed to be
            # on GPU here; moving to GPU is handled by AudioPreprocessor.
            # self.assertTrue(data[0].is_cuda)
            # self.assertTrue(data[1].is_cuda)
            # self.assertTrue(data[2].is_cuda)
            # self.assertTrue(data[3].is_cuda)
            # first dimension is batch
            self.assertTrue(data[0].size(0) == batch_size)
            self.assertTrue(data[1].size(0) == batch_size)
            self.assertTrue(data[2].size(0) == batch_size)
            self.assertTrue(data[3].size(0) == batch_size)
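
For context, each batch here is the same 4-tuple that the DAG examples unpack as (audio_signal, signal_lengths, labels, label_lengths). A minimal sketch using those names inside the loop above (illustrative; the names follow the other examples, not this test):

        audio_signal, signal_lengths, labels, label_lengths = data
        assert audio_signal.size(0) == labels.size(0)  # shared batch dimension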
Example No. 7
def create_all_dags(args, neural_factory):
    """
    creates train and eval dags as well as their callbacks
    returns train loss tensor and callbacks"""

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        spkr_params = yaml.load(f)

    sample_rate = spkr_params["sample_rate"]
    time_length = spkr_params.get("time_length", 8)
    logging.info("max time length considered is {} sec".format(time_length))

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1) // 2

    # create data layer for training
    train_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    train_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    audio_augmentor = spkr_params.get("AudioAugmentor", None)
    # del train_dl_params["normalize_transcripts"]

    data_layer_train = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.train_dataset,
        labels=None,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        augmentor=audio_augmentor,
        time_length=time_length,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer_train)
    steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus))
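    # One optimizer step consumes batch_size * iter_per_step * num_gpus
    # examples in total (semantics assumed from the argument names), and int()
    # truncates, silently dropping any final partial step.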

    logging.info("Number of steps per epoch {}".format(steps_per_epoch))
    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared

    eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_test = []
    for test_set in args.eval_datasets:

        data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(
            manifest_filepath=test_set,
            labels=data_layer_train.labels,
            batch_size=args.batch_size,
            num_workers=cpu_per_traindl,
            time_length=time_length,
            **eval_dl_params,
            # normalize_transcripts=False
        )
        data_layers_test.append(data_layer_test)
    # create shared modules

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **spkr_params["AudioToMelSpectrogramPreprocessor"],
    )

    spectr_augment_config = spkr_params.get("SpectrogramAugmentation", None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)
    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"],)

    decoder = nemo_asr.JasperDecoderForSpkrClass(
        feat_in=spkr_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=data_layer_train.num_classes,
        pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'],
        emb_sizes=spkr_params["JasperDecoderForSpkrClass"]["emb_sizes"].split(","),
    )
    if os.path.exists(args.checkpoint_dir + "/JasperEncoder-STEP-100.pt"):
        encoder.restore_from(args.checkpoint_dir + "/JasperEncoder-STEP-100.pt")
        logging.info("Pretrained Encoder loaded")

    weight = None
    xent_loss = nemo_asr.CrossEntropyLossNM(weight=weight)

    # assemble train DAG

    audio_signal, audio_signal_len, label, label_len = data_layer_train()

    processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len)

    if spectr_augment_config:
        processed_signal = data_spectr_augmentation(input_spec=processed_signal)

    encoded, encoded_len = encoder(audio_signal=processed_signal, length=processed_signal_len)

    logits, _ = decoder(encoder_output=encoded)
    loss = xent_loss(logits=logits, labels=label)

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, logits, label],
        print_func=partial(monitor_classification_training_progress, eval_metric=[1]),
        step_freq=args.print_freq,
        get_tb_values=lambda x: [("train_loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir,
            load_from_folder=args.checkpoint_dir,  # load dir
            step_freq=args.checkpoint_save_freq,
            checkpoints_to_keep=125,
        )

        callbacks.append(chpt_callback)

    # --- Assemble Validation DAG --- #

    for i, eval_layer in enumerate(data_layers_test):

        audio_signal_test, audio_len_test, label_test, _ = eval_layer()
        processed_signal_test, processed_len_test = data_preprocessor(
            input_signal=audio_signal_test, length=audio_len_test
        )
        encoded_test, encoded_len_test = encoder(audio_signal=processed_signal_test, length=processed_len_test)
        logits_test, _ = decoder(encoder_output=encoded_test)
        loss_test = xent_loss(logits=logits_test, labels=label_test)

        tagname = os.path.dirname(args.eval_datasets[i]).split("/")[-1] + "_" + str(i)
        logging.info(tagname)
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_test, logits_test, label_test],
            user_iter_callback=partial(process_classification_evaluation_batch, top_k=1),
            user_epochs_done_callback=partial(process_classification_evaluation_epoch, tag=tagname),
            eval_step=args.eval_freq,  # How often we evaluate the model on the test set
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)

    return loss, callbacks, steps_per_epoch, loss_test, logits_test, label_test
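
A minimal sketch of how this function might be driven; the argparse fields, optimizer choice, and hyperparameters below are illustrative assumptions, not taken from the source:

neural_factory = nemo.core.NeuralModuleFactory(
    log_dir=args.work_dir, create_tb_writer=True)  # args.work_dir is assumed
train_loss, callbacks, steps_per_epoch, *eval_tensors = create_all_dags(
    args, neural_factory)
neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    optimizer="sgd",
    optimization_params={"num_epochs": args.num_epochs, "lr": args.lr},
)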
Example No. 8
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    labels = jasper_params['labels']  # Vocab of tokens
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(
        jasper_params["AudioToSpeechLabelDataLayer"])
    train_dl_params.update(
        jasper_params["AudioToSpeechLabelDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    # Look for augmentations
    audio_augmentor = jasper_params.get('AudioAugmentor', None)

    data_layer = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.train_dataset,
        labels=labels,
        sample_rate=sample_rate,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        augmentor=audio_augmentor,
        **train_dl_params,
    )

    crop_pad_augmentation = nemo_asr.CropOrPadSpectrogramAugmentation(
        audio_length=128)
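    # Crops or pads each spectrogram along the time axis to exactly 128 frames
    # so the classification decoder always sees fixed-size inputs.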

    N = len(data_layer)
    steps_per_epoch = math.ceil(
        N / (args.batch_size * args.iter_per_step * args.num_gpus))
    logging.info('Steps per epoch : {0}'.format(steps_per_epoch))
    logging.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMFCCPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMFCCPreprocessor"],
    )

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    eval_dl_params = copy.deepcopy(
        jasper_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(jasper_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToSpeechLabelDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=labels,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )

            data_layers_eval.append(data_layer_eval)
    else:
        logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"], )

    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(labels),
        **jasper_params['JasperDecoderForClassification'],
    )

    ce_loss = nemo_asr.CrossEntropyLossNM()

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Train DAG
    # --- Assemble Training DAG --- #
    audio_signal, audio_signal_len, commands, command_len = data_layer()

    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)

    processed_signal, processed_signal_len = crop_pad_augmentation(
        input_signal=processed_signal, length=processed_signal_len)

    if spectr_augment_config:
        processed_signal = data_spectr_augmentation(
            input_spec=processed_signal)

    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=processed_signal_len)

    decoded = jasper_decoder(encoder_output=encoded)

    loss = ce_loss(logits=decoded, labels=commands)

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        # Notice that we pass in loss, predictions, and the labels (commands).
        # Of course we would like to see our training loss, but we need the
        # other arguments to calculate the accuracy.
        tensors=[loss, decoded, commands],
        # The print_func defines what gets printed.
        print_func=partial(monitor_classification_training_progress,
                           eval_metric=None),
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        load_from_folder=args.load_dir,
        step_freq=args.checkpoint_save_freq,
    )

    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        # --- Assemble Training DAG --- #
        test_audio_signal, test_audio_signal_len, test_commands, test_command_len = eval_dl()

        test_processed_signal, test_processed_signal_len = data_preprocessor(
            input_signal=test_audio_signal, length=test_audio_signal_len)

        test_processed_signal, test_processed_signal_len = crop_pad_augmentation(
            input_signal=test_processed_signal,
            length=test_processed_signal_len)

        test_encoded, test_encoded_len = jasper_encoder(
            audio_signal=test_processed_signal,
            length=test_processed_signal_len)

        test_decoded = jasper_decoder(encoder_output=test_encoded)

        test_loss = ce_loss(logits=test_decoded, labels=test_commands)

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[test_loss, test_decoded, test_commands],
            user_iter_callback=partial(process_classification_evaluation_batch,
                                       top_k=1),
            user_epochs_done_callback=partial(
                process_classification_evaluation_epoch,
                eval_metric=1,
                tag=tagname),
            eval_step=args.eval_freq,  # How often we evaluate the model on the test set
            tb_writer=neural_factory.tb_writer,
        )

        callbacks.append(eval_callback)
    return loss, callbacks, steps_per_epoch
Example No. 9
def create_all_dags(args, neural_factory):
    '''
    creates train and eval dags as well as their callbacks
    returns train loss tensor and callbacks'''

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        spkr_params = yaml.load(f)

    sample_rate = spkr_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared

    eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    eval_dl_params['shuffle'] = False  # to grab the file names without changing data_layer

    data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.eval_datasets[0],
        labels=None,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **eval_dl_params,
        # normalize_transcripts=False
    )
    # create shared modules

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **spkr_params["AudioToMelSpectrogramPreprocessor"],
    )

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"], )

    decoder = nemo_asr.JasperDecoderForSpkrClass(
        feat_in=spkr_params['JasperEncoder']['jasper'][-1]['filters'],
        num_classes=254,
        emb_sizes=spkr_params['JasperDecoderForSpkrClass']['emb_sizes'].split(
            ','),
        pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'],
    )

    # --- Assemble Validation DAG --- #
    audio_signal_test, audio_len_test, label_test, _ = data_layer_test()

    processed_signal_test, processed_len_test = data_preprocessor(
        input_signal=audio_signal_test, length=audio_len_test)

    encoded_test, _ = encoder(audio_signal=processed_signal_test,
                              length=processed_len_test)

    _, embeddings = decoder(encoder_output=encoded_test)

    return embeddings, label_test
Example No. 10
    def test_quartznet_speaker_reco_training(self):
        """Integtaion test that instantiates a small QuartzNet model for speaker recognition and tests training with the
        sample an4 data.
        Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller than the loss
        at the first step.
        """
        with open(
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__),
                                 "../data/quartznet_spkr_test.yaml"))) as file:
            spkr_params = self.yaml.load(file)
        dl = nemo_asr.AudioToSpeechLabelDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=None,
            batch_size=10,
        )
        sample_rate = 16000

        preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
            sample_rate=sample_rate,
            **spkr_params["AudioToMelSpectrogramPreprocessor"],
        )
        jasper_encoder = nemo_asr.JasperEncoder(**spkr_params['JasperEncoder'])
        jasper_decoder = nemo_asr.JasperDecoderForSpkrClass(
            feat_in=spkr_params['JasperEncoder']['jasper'][-1]['filters'],
            num_classes=dl.num_classes,
            pool_mode=spkr_params['JasperDecoderForSpkrClass']['pool_mode'],
            emb_sizes=spkr_params["JasperDecoderForSpkrClass"]
            ["emb_sizes"].split(","),
        )
        ce_loss = nemo_asr.CrossEntropyLossNM()

        # DAG
        audio_signal, a_sig_length, targets, targets_len = dl()
        processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                                   length=a_sig_length)

        encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                              length=p_length)
        # logging.info(jasper_encoder)
        log_probs, _ = jasper_decoder(encoder_output=encoded)
        loss = ce_loss(logits=log_probs, labels=targets)

        loss_list = []
        callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[loss],
            print_func=partial(self.print_and_log_loss,
                               loss_log_list=loss_list),
            step_freq=1)
        self.nf.random_seed = 42
        self.nf.train(
            [loss],
            callbacks=[callback],
            optimizer="sgd",
            optimization_params={
                "max_steps": 4,
                "lr": 0.002
            },
        )
        self.nf.reset_trainer()

        # Assert that training loss went down
        assert loss_list[-1] < loss_list[0]
Example No. 11
tmp_labels = labels
sample_rate = jasper_params['sample_rate']

batch_size = 128
num_classes = len(labels)

logdir = data_dir + '/runs/' + args.name

neural_factory = nemo.core.NeuralModuleFactory(log_dir=logdir,
                                               create_tb_writer=True)
tb_writer = neural_factory.tb_writer

train_data_layer = nemo_asr.AudioToSpeechLabelDataLayer(
    manifest_filepath=train_dataset,
    labels=labels,
    sample_rate=sample_rate,
    batch_size=batch_size,
    num_workers=0,
    augmentor=None,
    shuffle=True)
eval_data_layer = nemo_asr.AudioToSpeechLabelDataLayer(
    manifest_filepath=test_dataset,
    sample_rate=sample_rate,
    labels=labels,
    batch_size=batch_size,
    num_workers=0,
    shuffle=True,
)

data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
    sample_rate=sample_rate,
    **jasper_params["AudioToMelSpectrogramPreprocessor"],
Example No. 12
    labels=labels,
    sample_rate=sample_rate,
    batch_size=args.batch_classes * args.per_class,
    num_workers=0,
    augmentor=audio_augmentor,
    shuffle=True,
    num_classes=args.batch_classes,
    class_dists=dists,
    class_probs=probs,
    probs_num=args.data_probs
)

eval_data_layer = nemo_asr.AudioToSpeechLabelDataLayer(
    manifest_filepath=val_dataset,
    sample_rate=sample_rate,
    labels=labels,
    batch_size=batch_size,
    num_workers=0,
    shuffle=True,
)

data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
    sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"],
)

N = len(train_data_layer)
steps_per_epoch = math.ceil(N / float(batch_size) + 1)

logging.info("Steps per epoch : {0}".format(steps_per_epoch))
logging.info('Have {0} examples to train on.'.format(N))

spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)