def test_trim_silence(self):
    """Silence trimming must never make an utterance longer.

    Builds two otherwise-identical data layers, one with trim_silence
    enabled, and compares the per-sample lengths batch by batch.
    """
    n_per_batch = 2
    shared_kwargs = dict(
        # featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=n_per_batch,
        # placement=DeviceType.GPU,
        drop_last=False,
        shuffle=False,
    )
    untrimmed_layer = nemo_asr.AudioToSpeechLabelDataLayer(**shared_kwargs)
    trimmed_layer = nemo_asr.AudioToSpeechLabelDataLayer(trim_silence=True, **shared_kwargs)

    # Element [1] of each batch holds the per-sample sequence lengths;
    # trimming silence can only shrink (or keep) them.
    for full_batch, trimmed_batch in zip(untrimmed_layer.data_iterator, trimmed_layer.data_iterator):
        for sample_idx in range(n_per_batch):
            self.assertTrue(full_batch[1][sample_idx].data >= trimmed_batch[1][sample_idx].data)
def test_jasper_eval(self):
    """Assemble a small classification DAG (data layer -> mel-spectrogram
    preprocessor -> Jasper encoder -> classification decoder -> cross-entropy
    loss) and run the neural factory's `eval` action over it with a
    classification-accuracy evaluator callback.
    """
    with open(
        os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "../data/quartznet_speech_recognition.yaml"))) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=2,
    )
    # Mel-spectrogram extraction settings used for this test.
    pre_process_params = {
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        **jasper_model_definition['JasperEncoder'],
    )
    # Decoder input width must match the last encoder block's filter count.
    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]
        ['filters'],
        num_classes=len(self.labels))
    ce_loss = nemo_asr.CrossEntropyLossNM()

    # DAG: wire the modules together into an evaluation graph.
    audio_signal, a_sig_length, targets, targets_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                               length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=p_length)
    # logging.info(jasper_encoder)
    logits = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(
        logits=logits,
        labels=targets,
    )

    from nemo.collections.asr.helpers import (
        process_classification_evaluation_batch,
        process_classification_evaluation_epoch,
    )

    # Aggregate per-batch top-1 accuracy across the whole eval epoch.
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[loss, logits, targets],
        user_iter_callback=lambda x, y:
        process_classification_evaluation_batch(x, y, top_k=[1]),
        user_epochs_done_callback=process_classification_evaluation_epoch,
    )
    # Instantiate an optimizer to perform `train` action
    self.nf.eval(callbacks=[eval_callback])
def test_stft_conv(self):
    """Train a small classification DAG for a few epochs with the
    preprocessor's convolution-based STFT implementation enabled
    (`stft_conv=True`), verifying that the alternative STFT path runs
    end to end.
    """
    with open(
        os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "../data/quartznet_speech_recognition.yaml"))) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=2,
    )
    pre_process_params = {
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
        # Exercise the conv-based STFT code path instead of torch.stft.
        'stft_conv': True,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        **jasper_model_definition['JasperEncoder'],
    )
    # Decoder input width must match the last encoder block's filter count.
    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]
        ['filters'],
        num_classes=len(self.labels))

    ce_loss = nemo_asr.CrossEntropyLossNM()

    # DAG: wire the modules together into a training graph.
    audio_signal, a_sig_length, targets, targets_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                               length=a_sig_length)

    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=p_length)
    # logging.info(jasper_encoder)
    logits = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(logits=logits, labels=targets)

    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: logging.info(str(x[0].item())))
    # Instantiate an optimizer to perform `train` action
    optimizer = nemo.backends.pytorch.actions.PtActions()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={
            "num_epochs": 10,
            "lr": 0.0003
        },
    )
def test_quartznet_vad_training(self):
    """Integration test that instantiates a small QuartzNet model for VAD
    and tests training with the sample VAD data.

    Training is run for 3 forward and backward steps and asserts that loss
    after 3 steps is smaller than the loss at the first step.
    """
    with open(os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/quartznet_vad.yaml"))) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=6,
    )
    # Mel-spectrogram extraction settings used for this test.
    # (Fixed: the original had a duplicated `pre_process_params = pre_process_params = {...}` assignment.)
    pre_process_params = {
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(**jasper_model_definition['JasperEncoder'])
    # Decoder input width must match the last encoder block's filter count.
    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]['filters'], num_classes=len(self.labels)
    )
    ce_loss = nemo_asr.CrossEntropyLossNM()

    # DAG: wire the modules together into a training graph.
    audio_signal, a_sig_length, targets, targets_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)

    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(logits=log_probs, labels=targets)

    # Collect per-step losses so we can assert the training trend below.
    loss_list = []
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
    )
    self.nf.train(
        [loss], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.003},
    )
    self.nf.reset_trainer()

    # Assert that training loss went down
    assert loss_list[-1] < loss_list[0]
def test_audio_preprocessors(self):
    """Exercise the audio preprocessors (mel-spectrogram, and — when
    torchaudio is installed — spectrogram, MFCC and time-stretch
    augmentation) and check their output feature dimensions.
    """
    batch_size = 2
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        # featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=batch_size,
        # placement=DeviceType.GPU,
        drop_last=False,
        shuffle=False,
    )

    installed_torchaudio = True
    try:
        import torchaudio
    except ModuleNotFoundError:
        installed_torchaudio = False
        # Without torchaudio these preprocessors must fail loudly rather
        # than silently degrade.
        with self.assertRaises(ModuleNotFoundError):
            to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
        with self.assertRaises(ModuleNotFoundError):
            to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)

    if installed_torchaudio:
        to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
        to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)
        # probability=1.0 makes the stretch deterministic in occurrence (rate
        # is still sampled from [0.9, 1.1]).
        time_stretch_augment = nemo_asr.TimeStretchAugmentation(
            self.featurizer_config['sample_rate'], probability=1.0, min_speed_rate=0.9, max_speed_rate=1.1
        )

    to_melspec = nemo_asr.AudioToMelSpectrogramPreprocessor(features=50)

    for batch in dl.data_iterator:
        input_signals, seq_lengths, _, _ = batch
        input_signals = input_signals.to(to_melspec._device)
        seq_lengths = seq_lengths.to(to_melspec._device)

        melspec = to_melspec.forward(input_signals, seq_lengths)

        if installed_torchaudio:
            spec = to_spectrogram.forward(input_signals, seq_lengths)
            mfcc = to_mfcc.forward(input_signals, seq_lengths)
            ts_input_signals = time_stretch_augment.forward(input_signals, seq_lengths)

        # Check that number of features is what we expect
        self.assertTrue(melspec[0].shape[1] == 50)

        if installed_torchaudio:
            self.assertTrue(spec[0].shape[1] == 201)  # n_fft // 2 + 1 bins
            self.assertTrue(mfcc[0].shape[1] == 15)
            # NOTE(review): bounds below assume ~1 s clips at
            # featurizer_config['sample_rate'] samples/sec — confirm against
            # the sample manifest.
            timesteps = ts_input_signals[0].shape[1]
            self.assertTrue(timesteps <= int(1.15 * self.featurizer_config['sample_rate']))
            self.assertTrue(timesteps >= int(0.85 * self.featurizer_config['sample_rate']))
def test_dataloader(self):
    """Every tensor in every batch must carry the configured batch size
    as its leading dimension.
    """
    expected_bs = 2
    layer = nemo_asr.AudioToSpeechLabelDataLayer(
        # featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=expected_bs,
        # placement=DeviceType.GPU,
        sample_rate=16000,
    )
    # With the num_workers update, tensors are no longer on GPU here;
    # device placement is handled later by AudioPreprocessor, so the old
    # `.is_cuda` checks were dropped.
    for batch in layer.data_iterator:
        # batch is (signal, signal_len, label, label_len): all four share
        # the batch dimension.
        for position in range(4):
            self.assertTrue(batch[position].size(0) == expected_bs)
def create_all_dags(args, neural_factory):
    """Create train and eval DAGs for speaker-recognition training, plus
    their callbacks.

    Returns:
        (train loss tensor, callbacks, steps_per_epoch,
         loss_test, logits_test, label_test) — the last three are the
        tensors of the final eval DAG built in the loop below.
    """
    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        spkr_params = yaml.load(f)

    sample_rate = spkr_params["sample_rate"]
    time_length = spkr_params.get("time_length", 8)
    logging.info("max time length considered is {} sec".format(time_length))

    # Calculate num_workers for dataloader (half the CPUs available per rank).
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1) // 2

    # create data layer for training: shared section + "train" overrides.
    train_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    train_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    audio_augmentor = spkr_params.get("AudioAugmentor", None)

    # labels=None: the data layer builds the label set from the manifest.
    data_layer_train = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.train_dataset,
        labels=None,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        augmentor=audio_augmentor,
        time_length=time_length,
        **train_dl_params,
    )

    N = len(data_layer_train)
    steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus))
    logging.info("Number of steps per epoch {}".format(steps_per_epoch))

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared
    eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_test = []
    for test_set in args.eval_datasets:
        # Eval layers reuse the label set discovered by the training layer.
        data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(
            manifest_filepath=test_set,
            labels=data_layer_train.labels,
            batch_size=args.batch_size,
            num_workers=cpu_per_traindl,
            time_length=time_length,
            **eval_dl_params,
        )
        data_layers_test.append(data_layer_test)

    # create shared modules
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **spkr_params["AudioToMelSpectrogramPreprocessor"],
    )
    spectr_augment_config = spkr_params.get("SpectrogramAugmentation", None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"],)
    decoder = nemo_asr.JasperDecoderForSpkrClass(
        feat_in=spkr_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=data_layer_train.num_classes,
        pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'],
        emb_sizes=spkr_params["JasperDecoderForSpkrClass"]["emb_sizes"].split(","),
    )

    # NOTE(review): hard-coded checkpoint name — only restores a checkpoint
    # saved at exactly step 100; confirm this is intentional.
    if os.path.exists(args.checkpoint_dir + "/JasperEncoder-STEP-100.pt"):
        encoder.restore_from(args.checkpoint_dir + "/JasperEncoder-STEP-100.pt")
        logging.info("Pretrained Encoder loaded")

    weight = None  # unweighted cross-entropy
    xent_loss = nemo_asr.CrossEntropyLossNM(weight=weight)

    # assemble train DAG
    audio_signal, audio_signal_len, label, label_len = data_layer_train()
    processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len)

    if spectr_augment_config:
        processed_signal = data_spectr_augmentation(input_spec=processed_signal)

    encoded, encoded_len = encoder(audio_signal=processed_signal, length=processed_signal_len)
    logits, _ = decoder(encoder_output=encoded)
    loss = xent_loss(logits=logits, labels=label)

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, logits, label],
        print_func=partial(monitor_classification_training_progress, eval_metric=[1]),
        step_freq=args.print_freq,
        get_tb_values=lambda x: [("train_loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir,
            load_from_folder=args.checkpoint_dir,  # load dir
            step_freq=args.checkpoint_save_freq,
            checkpoints_to_keep=125,
        )
        callbacks.append(chpt_callback)

    # --- Assemble Validation DAG --- #
    for i, eval_layer in enumerate(data_layers_test):
        audio_signal_test, audio_len_test, label_test, _ = eval_layer()
        processed_signal_test, processed_len_test = data_preprocessor(
            input_signal=audio_signal_test, length=audio_len_test
        )
        encoded_test, encoded_len_test = encoder(audio_signal=processed_signal_test, length=processed_len_test)
        logits_test, _ = decoder(encoder_output=encoded_test)
        loss_test = xent_loss(logits=logits_test, labels=label_test)

        # Tag each eval callback with the dataset's parent directory name.
        tagname = os.path.dirname(args.eval_datasets[i]).split("/")[-1] + "_" + str(i)
        # Fixed: use the module-wide logging convention instead of bare print().
        logging.info(tagname)
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_test, logits_test, label_test],
            user_iter_callback=partial(process_classification_evaluation_batch, top_k=1),
            user_epochs_done_callback=partial(process_classification_evaluation_epoch, tag=tagname),
            eval_step=args.eval_freq,  # How often we evaluate the model on the test set
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)

    return loss, callbacks, steps_per_epoch, loss_test, logits_test, label_test
def create_all_dags(args, neural_factory):
    """Build the training and evaluation DAGs for speech-command
    classification.

    Parses the model config, creates train/eval data layers, the MFCC
    preprocessor, fixed-width crop/pad augmentation, Jasper encoder /
    classification decoder and cross-entropy loss, wires them together,
    and attaches logging, checkpoint and evaluator callbacks.

    Returns:
        (train loss tensor, list of callbacks, steps_per_epoch)
    """
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    labels = jasper_params['labels']  # Vocab of tokens
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    # Train data layer params: shared section + "train" overrides.
    train_dl_params = copy.deepcopy(jasper_params["AudioToSpeechLabelDataLayer"])
    train_dl_params.update(jasper_params["AudioToSpeechLabelDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    # Look for augmentations
    audio_augmentor = jasper_params.get('AudioAugmentor', None)

    data_layer = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.train_dataset,
        labels=labels,
        sample_rate=sample_rate,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        augmentor=audio_augmentor,
        **train_dl_params,
    )

    # Force every spectrogram to a fixed 128-frame width for the classifier.
    crop_pad_augmentation = nemo_asr.CropOrPadSpectrogramAugmentation(audio_length=128)

    N = len(data_layer)
    steps_per_epoch = math.ceil(N / (args.batch_size * args.iter_per_step * args.num_gpus))
    logging.info('Steps per epoch : {0}'.format(steps_per_epoch))
    logging.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMFCCPreprocessor(
        sample_rate=sample_rate, **jasper_params["AudioToMFCCPreprocessor"],
    )

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    # Eval data layer params: shared section + "eval" overrides.
    eval_dl_params = copy.deepcopy(jasper_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(jasper_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToSpeechLabelDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=labels,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )
            data_layers_eval.append(data_layer_eval)
    else:
        logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"],)
    # Decoder input width must match the last encoder block's filter count.
    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(labels),
        **jasper_params['JasperDecoderForClassification'],
    )

    ce_loss = nemo_asr.CrossEntropyLossNM()

    logging.info('================================')
    logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # --- Assemble Training DAG --- #
    audio_signal, audio_signal_len, commands, command_len = data_layer()

    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)

    # Fixed: crop/pad operates on the preprocessed spectrogram, so it must be
    # given the spectrogram length (processed_signal_len), not the raw audio
    # length — this now matches the eval DAG below.
    processed_signal, processed_signal_len = crop_pad_augmentation(
        input_signal=processed_signal, length=processed_signal_len)

    if spectr_augment_config:
        processed_signal = data_spectr_augmentation(input_spec=processed_signal)

    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=processed_signal_len)
    decoded = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(logits=decoded, labels=commands)

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        # Notice that we pass in loss, predictions, and the labels (commands).
        # Of course we would like to see our training loss, but we need the
        # other arguments to calculate the accuracy.
        tensors=[loss, decoded, commands],
        # The print_func defines what gets printed.
        print_func=partial(monitor_classification_training_progress, eval_metric=None),
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        load_from_folder=args.load_dir,
        step_freq=args.checkpoint_save_freq,
    )

    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        test_audio_signal, test_audio_signal_len, test_commands, test_command_len = eval_dl()
        test_processed_signal, test_processed_signal_len = data_preprocessor(
            input_signal=test_audio_signal, length=test_audio_signal_len)
        test_processed_signal, test_processed_signal_len = crop_pad_augmentation(
            input_signal=test_processed_signal, length=test_processed_signal_len)
        test_encoded, test_encoded_len = jasper_encoder(
            audio_signal=test_processed_signal, length=test_processed_signal_len)
        test_decoded = jasper_decoder(encoder_output=test_encoded)
        test_loss = ce_loss(logits=test_decoded, labels=test_commands)

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[test_loss, test_decoded, test_commands],
            user_iter_callback=partial(process_classification_evaluation_batch, top_k=1),
            user_epochs_done_callback=partial(
                process_classification_evaluation_epoch, eval_metric=1, tag=tagname),
            eval_step=args.eval_freq,  # How often we evaluate the model on the test set
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)

    return loss, callbacks, steps_per_epoch
def create_all_dags(args, neural_factory):
    '''Build an inference/eval DAG that extracts speaker embeddings.

    Despite the generic name, this variant builds only an evaluation DAG
    over the first eval dataset and returns the decoder's embedding tensor
    together with the label tensor (no training DAG, no callbacks).
    '''
    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        spkr_params = yaml.load(f)

    sample_rate = spkr_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared
    eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    eval_dl_params[
        'shuffle'] = False  # To grab the file names without changing data_layer

    # Only the first eval dataset is used; labels are discovered from it.
    data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.eval_datasets[0],
        labels=None,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **eval_dl_params,
        # normalize_transcripts=False
    )

    # create shared modules
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **spkr_params["AudioToMelSpectrogramPreprocessor"],
    )

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"], )
    # NOTE(review): num_classes is hard-coded to 254 — presumably the size of
    # the speaker set the checkpoint was trained on; confirm it matches the
    # restored weights.
    decoder = nemo_asr.JasperDecoderForSpkrClass(
        feat_in=spkr_params['JasperEncoder']['jasper'][-1]['filters'],
        num_classes=254,
        emb_sizes=spkr_params['JasperDecoderForSpkrClass']['emb_sizes'].split(
            ','),
        pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'],
    )

    # --- Assemble Validation DAG --- #
    audio_signal_test, audio_len_test, label_test, _ = data_layer_test()

    processed_signal_test, processed_len_test = data_preprocessor(
        input_signal=audio_signal_test, length=audio_len_test)

    encoded_test, _ = encoder(audio_signal=processed_signal_test,
                              length=processed_len_test)

    # The decoder returns (logits, embeddings); only the embeddings are kept.
    _, embeddings = decoder(encoder_output=encoded_test)

    return embeddings, label_test
def test_quartznet_speaker_reco_training(self):
    """Integtaion test that instantiates a small QuartzNet model for
    speaker recognition and tests training with the sample an4 data.

    Training is run for 3 forward and backward steps and asserts that loss
    after 3 steps is smaller than the loss at the first step.
    """
    with open(
        os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../data/quartznet_spkr_test.yaml"))) as file:
        spkr_params = self.yaml.load(file)
    # labels=None: the data layer builds the speaker label set from the manifest.
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=self.manifest_filepath, labels=None, batch_size=10,
    )
    sample_rate = 16000

    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **spkr_params["AudioToMelSpectrogramPreprocessor"],
    )
    jasper_encoder = nemo_asr.JasperEncoder(**spkr_params['JasperEncoder'])
    # Decoder input width must match the last encoder block's filter count;
    # number of classes comes from the labels discovered by the data layer.
    jasper_decoder = nemo_asr.JasperDecoderForSpkrClass(
        feat_in=spkr_params['JasperEncoder']['jasper'][-1]['filters'],
        num_classes=dl.num_classes,
        pool_mode=spkr_params['JasperDecoderForSpkrClass']['pool_mode'],
        emb_sizes=spkr_params["JasperDecoderForSpkrClass"]
        ["emb_sizes"].split(","),
    )
    ce_loss = nemo_asr.CrossEntropyLossNM()

    # DAG: wire the modules together into a training graph.
    audio_signal, a_sig_length, targets, targets_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                               length=a_sig_length)

    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=p_length)
    # logging.info(jasper_encoder)
    log_probs, _ = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(logits=log_probs, labels=targets)

    # Collect per-step losses so we can assert the training trend below.
    loss_list = []
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=partial(self.print_and_log_loss,
                           loss_log_list=loss_list),
        step_freq=1)
    self.nf.random_seed = 42
    self.nf.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={
            "max_steps": 4,
            "lr": 0.002
        },
    )
    self.nf.reset_trainer()

    # Assert that training loss went down
    assert loss_list[-1] < loss_list[0]
tmp_labels = labels sample_rate = jasper_params['sample_rate'] batch_size = 128 num_classes = len(labels) logdir = data_dir + '/runs/' + args.name neural_factory = nemo.core.NeuralModuleFactory(log_dir=logdir, create_tb_writer=True) tb_writer = neural_factory.tb_writer train_data_layer = nemo_asr.AudioToSpeechLabelDataLayer( manifest_filepath=train_dataset, labels=labels, sample_rate=sample_rate, batch_size=batch_size, num_workers=0, augmentor=None, shuffle=True) eval_data_layer = nemo_asr.AudioToSpeechLabelDataLayer( manifest_filepath=test_dataset, sample_rate=sample_rate, labels=labels, batch_size=batch_size, num_workers=0, shuffle=True, ) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"],
labels=labels, sample_rate=sample_rate, batch_size=args.batch_classes * args.per_class, num_workers=0, augmentor=audio_augmentor, shuffle=True, num_classes=args.batch_classes, class_dists=dists, class_probs=probs, probs_num=args.data_probs ) eval_data_layer = nemo_asr.AudioToSpeechLabelDataLayer( manifest_filepath=val_dataset, sample_rate=sample_rate, labels=labels, batch_size=batch_size, num_workers=0, shuffle=True, ) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"], ) N = len(train_data_layer) steps_per_epoch = math.ceil(N / float(batch_size) + 1) logging.info("Steps per epoch : {0}".format(steps_per_epoch)) logging.info('Have {0} examples to train on.'.format(N)) spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)