def __call__(self, audio_batch: List[Union[str, BytesIO]]) -> List[str]:
    """Transcribe a batch of audio inputs to text.

    Args:
        audio_batch: Items to transcribe; each element is either a path
            to an audio file or a binary I/O object.

    Returns:
        A batch of transcripts, one string per input element.
    """
    # Build the inference DAG: data layer -> mel preprocessor -> encoder
    # -> decoder -> greedy CTC decoding.
    layer = AudioInferDataLayer(
        audio_batch=audio_batch,
        **self.nemo_params['AudioToTextDataLayer'])
    signal, signal_len = layer()
    features, features_len = self.data_preprocessor(
        input_signal=signal, length=signal_len)
    encoder_out, encoder_out_len = self.jasper_encoder(
        audio_signal=features, length=features_len)
    logits = self.jasper_decoder(encoder_output=encoder_out)
    greedy = self.greedy_decoder(log_probs=logits)
    # Execute the DAG and convert predicted label indices into strings.
    results = self.neural_factory.infer(tensors=[greedy])
    return post_process_predictions(results[0], self.labels)
def wav_to_text(self, manifest, greedy=True):
    """Transcribe the audio referenced by a manifest file.

    Args:
        manifest: Path to a NeMo manifest (JSON lines) describing the audio.
        greedy: If True, return greedy CTC transcripts; otherwise return
            the best beam-search-with-LM hypothesis (requires
            ``self.ENABLE_NGRAM`` to be enabled).

    Returns:
        The decoded transcript(s).
    """
    data_layer = nemo_asr.AudioToTextDataLayer(shuffle=False,
                                               manifest_filepath=manifest,
                                               labels=self.labels,
                                               batch_size=1)
    audio_signal, audio_signal_len, transcript, transcript_len = data_layer(
    )
    log_probs, encoded_len = self.asr_model(input_signal=audio_signal,
                                            length=audio_signal_len)
    predictions = self.greedy_decoder(log_probs=log_probs)
    eval_tensors = [predictions]
    # Bind up-front so the cleanup below never raises NameError.
    # (Previously `del beam_predictions` crashed whenever ENABLE_NGRAM
    # was disabled.)
    beam_predictions = None
    if self.ENABLE_NGRAM:
        print('Running with beam search')
        beam_predictions = self.beam_search_with_lm(
            log_probs=log_probs, log_probs_length=encoded_len)
        eval_tensors.append(beam_predictions)
    tensors = self.neural_factory.infer(tensors=eval_tensors)
    if greedy:
        prediction = post_process_predictions(tensors[0], self.labels)
    else:
        # Beam results were appended after the greedy predictions, so they
        # live at tensors[-1]; each entry is a (score, text) pair.
        # (Previously this indexed tensors[0], i.e. the greedy tensor.)
        prediction = tensors[-1][0][0][0][1]
    # Drop references explicitly so large tensors can be freed promptly.
    del data_layer
    del eval_tensors
    del beam_predictions
    del predictions
    del tensors
    del audio_signal, audio_signal_len, transcript, transcript_len
    del log_probs, encoded_len
    return prediction
def wav_to_text(manifest, greedy=True):
    """Transcribe the audio referenced by *manifest*.

    Builds a single-file inference DAG from the module-level neural
    modules and decodes either greedily or with beam search + n-gram LM.

    Args:
        manifest: Path to a NeMo manifest (JSON lines) describing the audio.
        greedy: If True use greedy CTC decoding; otherwise use beam search
            with an LM (requires module-level ``ENABLE_NGRAM``).

    Returns:
        The decoded transcript(s).
    """
    from ruamel.yaml import YAML
    yaml = YAML(typ="safe")
    with open(MODEL_YAML) as f:
        jasper_model_definition = yaml.load(f)
    labels = jasper_model_definition['labels']

    # Instantiate necessary neural modules
    data_layer = nemo_asr.AudioToTextDataLayer(shuffle=False,
                                               manifest_filepath=manifest,
                                               labels=labels,
                                               batch_size=1)

    # Define inference DAG
    audio_signal, audio_signal_len, _, _ = data_layer()
    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=processed_signal_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)

    if not greedy and not ENABLE_NGRAM:
        # Previously this combination left `eval_tensors` unbound and
        # raised NameError; fall back to greedy decoding instead.
        logging.warning('Beam search requested but the n-gram LM is '
                        'disabled; falling back to greedy decoding.')
        greedy = True

    if ENABLE_NGRAM:
        logging.info('Running with beam search')
        beam_predictions = beam_search_with_lm(log_probs=log_probs,
                                               log_probs_length=encoded_len)
        eval_tensors = [beam_predictions]
    if greedy:
        eval_tensors = [predictions]
    tensors = neural_factory.infer(tensors=eval_tensors)
    if greedy:
        from nemo.collections.asr.helpers import post_process_predictions
        prediction = post_process_predictions(tensors[0], labels)
    else:
        # Beam output entries are (score, text) pairs; take the best text.
        prediction = tensors[0][0][0][0][1]
    return prediction
def main():
    """Train Jasper on AN4, optionally verify WER, then fine-tune.

    End-to-end integration script: builds the training DAG, trains with
    the configured optimizer + cosine annealing, optionally checks greedy
    (and beam-search) WER against thresholds, then reloads the checkpoint
    and fine-tunes for 10 more epochs, asserting WER did not regress.
    """
    parser = argparse.ArgumentParser(
        parents=[nm_argparse.NemoArgParser()],
        description='AN4 ASR',
        conflict_handler='resolve',
    )

    # Overwrite default args
    parser.add_argument("--train_dataset",
                        type=str,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        help="validation dataset path")

    # Create new args
    # parser.add_argument("--lm", default="./an4-lm.3gram.binary", type=str)
    parser.add_argument("--batch_size",
                        default=48,
                        type=int,
                        help="size of the training batch")
    parser.add_argument("--lm", default=None, type=str)
    parser.add_argument("--test_after_training", action='store_true')
    parser.add_argument("--momentum", type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.25, type=float)
    parser.add_argument("--do_not_eval_at_start", action='store_true')
    parser.set_defaults(
        model_config="./configs/jasper_an4.yaml",
        train_dataset="~/TestData/an4_dataset/an4_train.json",
        eval_datasets="~/TestData/an4_dataset/an4_val.json",
        work_dir="./tmp",
        optimizer="novograd",
        num_epochs=50,
        lr=0.02,
        weight_decay=0.005,
        checkpoint_save_freq=1000,
        eval_freq=100,
        amp_opt_level="O1",
    )

    args = parser.parse_args()
    betas = (args.beta1, args.beta2)

    # WER thresholds the trained model must beat for the test to pass.
    wer_thr = 0.20
    beam_wer_thr = 0.15

    nf = nemo.core.NeuralModuleFactory(
        local_rank=args.local_rank,
        files_to_copy=[__file__],
        optimization_level=args.amp_opt_level,
        random_seed=0,
        log_dir=args.work_dir,
        create_tb_writer=True,
        cudnn_benchmark=args.cudnn_benchmark,
    )
    tb_writer = nf.tb_writer
    checkpoint_dir = nf.checkpoint_dir

    # Load model definition
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    # Get vocabulary.
    vocab = jasper_params['labels']

    (
        loss,
        eval_tensors,
        callbacks,
        total_steps,
        log_probs_e,
        encoded_len_e,
    ) = create_dags(args.model_config, vocab, args, nf)

    nf.train(
        tensors_to_optimize=[loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        lr_policy=CosineAnnealing(total_steps=total_steps,
                                  min_lr=args.lr / 100),
        optimization_params={
            "num_epochs": args.num_epochs,
            "max_steps": args.max_steps,
            "lr": args.lr,
            "momentum": args.momentum,
            "betas": betas,
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None,
        },
        batches_per_step=args.iter_per_step,
        amp_max_loss_scale=256.0,
        # synced_batchnorm=(nf.global_rank is not None),
    )

    if args.test_after_training:
        logging.info("Testing greedy and beam search with LM WER.")
        # Create BeamSearch NM
        if nf.world_size > 1 or args.lm is None:
            logging.warning(
                "Skipping beam search WER as it does not work if doing distributed training."
            )
        else:
            beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                vocab=vocab,
                beam_width=64,
                alpha=2.0,
                beta=1.5,
                lm_path=args.lm,
                # NOTE(review): os.cpu_count() can return None, which would
                # make max() raise TypeError — consider `os.cpu_count() or 1`.
                num_cpus=max(os.cpu_count(), 1),
            )
            beam_predictions = beam_search_with_lm(
                log_probs=log_probs_e, log_probs_length=encoded_len_e)
            eval_tensors.append(beam_predictions)

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3],
                                                  vocab)
            wer = word_error_rate(hypotheses=greedy_hypotheses,
                                  references=references)
            logging.info("Greedy WER: {:.2f}%".format(wer * 100))
            if wer > wer_thr:
                # Signal failure to the other ranks before raising.
                nf.sync_all_processes(False)
                raise ValueError(f"Final eval greedy WER {wer * 100:.2f}% > :"
                                 f"than {wer_thr * 100:.2f}%")
        nf.sync_all_processes()

        if nf.world_size == 1 and args.lm is not None:
            beam_hypotheses = []
            # Over mini-batch
            for i in evaluated_tensors[-1]:
                # Over samples; each entry is a (score, text) pair — keep
                # the top hypothesis text.
                for j in i:
                    beam_hypotheses.append(j[0][1])

            beam_wer = word_error_rate(hypotheses=beam_hypotheses,
                                       references=references)
            logging.info("Beam WER {:.2f}%".format(beam_wer * 100))
            assert beam_wer <= beam_wer_thr, "Final eval beam WER {:.2f}% > than {:.2f}%".format(
                beam_wer * 100, beam_wer_thr * 100)
            assert beam_wer <= wer, "Final eval beam WER > than the greedy WER."

        # Reload model weights and train for extra 10 epochs
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=checkpoint_dir,
            step_freq=args.checkpoint_save_freq,
            force_load=True,
        )

        # Distributed Data Parallel changes the underlying class so we need
        # to reinstantiate Encoder and Decoder
        args.num_epochs += 10
        previous_step_count = total_steps
        loss, eval_tensors, callbacks, total_steps, _, _ = create_dags(
            args.model_config, vocab, args, nf)

        nf.reset_trainer()
        nf.train(
            tensors_to_optimize=[loss],
            callbacks=callbacks,
            optimizer=args.optimizer,
            # Warm up over the steps already taken so the LR schedule
            # continues smoothly from the first training run.
            lr_policy=CosineAnnealing(warmup_steps=previous_step_count,
                                      total_steps=total_steps),
            optimization_params={
                "num_epochs": args.num_epochs,
                "lr": args.lr / 100,
                "momentum": args.momentum,
                "betas": betas,
                "weight_decay": args.weight_decay,
                "grad_norm_clip": None,
            },
            reset=True,
            amp_max_loss_scale=256.0,
            # synced_batchnorm=(nf.global_rank is not None),
        )

        evaluated_tensors = nf.infer(eval_tensors)
        if nf.global_rank in [0, None]:
            greedy_hypotheses = post_process_predictions(
                evaluated_tensors[1], vocab)
            references = post_process_transcripts(evaluated_tensors[2],
                                                  evaluated_tensors[3],
                                                  vocab)
            wer_new = word_error_rate(hypotheses=greedy_hypotheses,
                                      references=references)
            logging.info("New greedy WER: {:.2f}%".format(wer_new * 100))
            # Allow up to 10% relative regression after fine-tuning.
            if wer_new > wer * 1.1:
                nf.sync_all_processes(False)
                raise ValueError(
                    f"Fine tuning: new WER {wer_new * 100:.2f}% > than the "
                    f"previous WER {wer * 100:.2f}%")
        nf.sync_all_processes()

        # Open the log file and ensure that epochs is strictly increasing
        if nf._exp_manager.log_file:
            epochs = []
            with open(nf._exp_manager.log_file, "r") as log_file:
                line = log_file.readline()
                while line:
                    index = line.find("Starting epoch")
                    if index != -1:
                        epochs.append(
                            int(line[index + len("Starting epoch"):]))
                    line = log_file.readline()
            for i, e in enumerate(epochs):
                if i != e:
                    raise ValueError("Epochs from logfile was not understood")
def main():
    """Evaluate a Jasper checkpoint on a dataset and report greedy CER.

    Optionally also decodes with a beam-search-with-LM decoder (single-GPU
    only) and can dump per-utterance log-probabilities to a pickle file.
    """
    parser = argparse.ArgumentParser(description='Jasper')
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    parser.add_argument("--vocab_file", type=str, required=True)
    parser.add_argument("--save_logprob", default=None, type=str)
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument("--beam_width", default=50, type=int)
    parser.add_argument("--alpha", default=2.0, type=float)
    parser.add_argument("--beta", default=1.0, type=float)
    parser.add_argument("--cutoff_prob", default=0.99, type=float)
    parser.add_argument("--cutoff_top_n", default=40, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=nemo.core.Optimization.mxprO1,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    vocab = load_vocab(args.vocab_file)
    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets
    # Build eval data-layer params from the config: start from the shared
    # section, overlay the "eval" overrides, then drop split-specific keys.
    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    # Transcripts are deliberately left unnormalized here.
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    n = len(data_layer)
    logging.info('Evaluating {0} examples'.format(n))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    if args.lm_path:
        beam_width = args.beam_width
        alpha = args.alpha
        beta = args.beta
        cutoff_prob = args.cutoff_prob
        cutoff_top_n = args.cutoff_top_n
        beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=vocab,
            beam_width=beam_width,
            alpha=alpha,
            beta=beta,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n,
            lm_path=args.lm_path,
            # NOTE(review): os.cpu_count() can return None, which would
            # make max() raise TypeError — consider `os.cpu_count() or 1`.
            num_cpus=max(os.cpu_count(), 1),
        )

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Define the inference DAG.
    (
        audio_signal_e1,
        a_sig_length_e1,
        transcript_e1,
        transcript_len_e1,
    ) = data_layer()
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1,
        predictions_e1,
        transcript_e1,
        transcript_len_e1,
        encoded_len_e1,
    ]

    if args.lm_path:
        beam_predictions_e1 = beam_search_with_lm(
            log_probs=log_probs_e1, log_probs_length=encoded_len_e1)
        eval_tensors.append(beam_predictions_e1)

    evaluated_tensors = neural_factory.infer(
        tensors=eval_tensors,
        checkpoint_dir=load_dir,
    )

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)
    # Scored with character error rate (use_cer=True).
    cer = word_error_rate(hypotheses=greedy_hypotheses,
                          references=references,
                          use_cer=True)
    logging.info("Greedy CER {:.2f}%".format(cer * 100))

    if args.lm_path:
        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples; each entry is a (score, text) pair — keep the
            # top hypothesis text.
            for j in i:
                beam_hypotheses.append(j[0][1])

        cer = word_error_rate(hypotheses=beam_hypotheses,
                              references=references,
                              use_cer=True)
        # NOTE(review): message lacks the '%' suffix the greedy log uses.
        logging.info("Beam CER {:.2f}".format(cer * 100))

    if args.save_logprob:
        # Convert logits to list of numpy arrays
        logprob = []
        for i, batch in enumerate(evaluated_tensors[0]):
            for j in range(batch.shape[0]):
                # Trim each utterance to its true encoded length.
                logprob.append(
                    batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)
def main():
    """Evaluate a pretrained ASR model on a dataset and check WER/CER.

    Loads a pretrained convolutional CTC model, runs greedy decoding over
    the evaluation dataset, logs WER (or CER for Chinese models), and
    optionally raises if the result exceeds a target threshold.
    """

    def str_to_bool(value):
        # Fix: `type=bool` treats any non-empty string (including "False")
        # as True; parse common boolean spellings explicitly instead.
        if isinstance(value, bool):
            return value
        if value.lower() in ('true', 't', 'yes', 'y', '1'):
            return True
        if value.lower() in ('false', 'f', 'no', 'n', '0'):
            return False
        raise ValueError(f'Invalid boolean value: {value!r}')

    # Usage and Command line arguments
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5-En",
        required=True,
        help=
        "Pass: '******', 'QuartzNet15x5-Zh', or 'JasperNet10x5-En'",
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=1,
                        help="batch size to use for evaluation")
    parser.add_argument("--wer_target",
                        type=float,
                        default=None,
                        help="used by test")
    parser.add_argument("--wer_tolerance",
                        type=float,
                        default=1.0,
                        help="used by test")
    parser.add_argument("--trim_silence",
                        default=True,
                        type=str_to_bool,
                        help="trim audio from silence or not")
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=str_to_bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    args = parser.parse_args()

    # Setup NeuralModuleFactory to control training
    # instantiate Neural Factory with supported backend
    nf = nemo.core.NeuralModuleFactory()

    # Instantiate the model which we'll train
    logging.info(f"Speech2Text: Will fine-tune from {args.asr_model}")
    asr_model = nemo_asr.models.ASRConvCTCModel.from_pretrained(
        model_info=args.asr_model)
    asr_model.eval()

    logging.info("\n\n")
    logging.info(f"Evaluation using {type(asr_model)} model.")
    logging.info(f"Evaluation using alphabet {asr_model.vocabulary}.")
    logging.info(f"The model has {asr_model.num_weights} weights.\n\n")

    eval_data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.dataset,
        labels=asr_model.vocabulary,
        batch_size=args.eval_batch_size,
        trim_silence=args.trim_silence,
        shuffle=False,
        normalize_transcripts=args.normalize_text,
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Wire the inference DAG.
    audio_signal, audio_signal_len, transcript, transcript_len = eval_data_layer(
    )
    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    predictions = greedy_decoder(log_probs=log_probs)

    # inference
    eval_tensors = [
        log_probs, predictions, transcript, transcript_len, encoded_len
    ]
    evaluated_tensors = nf.infer(tensors=eval_tensors)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                 asr_model.vocabulary)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3],
                                          asr_model.vocabulary)
    # Chinese models are scored with character error rate instead of WER.
    if args.asr_model.strip().endswith('-Zh'):
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=True)
        metric = 'CER'
    else:
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=False)
        metric = 'WER'
    logging.info(f"Greedy {metric} = {val}")
    if args.wer_target is not None:
        # Fix: this check previously referenced an undefined name `wer`
        # (NameError at runtime); the computed metric is stored in `val`.
        if args.wer_target * args.wer_tolerance < val:
            raise ValueError(
                f"Resulting WER {val} is higher than the target {args.wer_target}"
            )
def main():
    """Evaluate a Jasper checkpoint, then grid-search LM decoding weights.

    Computes greedy WER, optionally saves per-utterance log-probabilities,
    and — when an n-gram LM is supplied — sweeps (alpha, beta) over
    configurable ranges to find the best beam-search WER.
    """
    parser = argparse.ArgumentParser(description='Jasper')
    # model params
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)

    # run params
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=64, type=int)
    parser.add_argument("--amp_opt_level", default="O1", type=str)

    # store results
    parser.add_argument("--save_logprob", default=None, type=str)

    # lm inference parameters
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument('--alpha',
                        default=2.0,
                        type=float,
                        help='value of LM weight',
                        required=False)
    parser.add_argument(
        '--alpha_max',
        type=float,
        help='maximum value of LM weight (for a grid search in \'eval\' mode)',
        required=False,
    )
    parser.add_argument('--alpha_step',
                        type=float,
                        help='step for LM weight\'s tuning in \'eval\' mode',
                        required=False,
                        default=0.1)
    parser.add_argument('--beta',
                        default=1.5,
                        type=float,
                        help='value of word count weight',
                        required=False)
    parser.add_argument(
        '--beta_max',
        type=float,
        help='maximum value of word count weight (for a grid search in \
\'eval\' mode',
        required=False,
    )
    parser.add_argument(
        '--beta_step',
        type=float,
        help='step for word count weight\'s tuning in \'eval\' mode',
        required=False,
        default=0.1,
    )
    parser.add_argument("--beam_width", default=128, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        placement=device,
    )

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    eval_datasets = args.eval_datasets
    # Start from the shared data-layer config, overlay the "eval"
    # overrides, then drop the split-specific sub-dicts.
    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    N = len(data_layer)
    logging.info('Evaluating {0} examples'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"])
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Define inference DAG
    audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1 = data_layer(
    )
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1, predictions_e1, transcript_e1, transcript_len_e1,
        encoded_len_e1
    ]

    # inference
    evaluated_tensors = neural_factory.infer(tensors=eval_tensors,
                                             checkpoint_dir=load_dir)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)
    wer = word_error_rate(hypotheses=greedy_hypotheses, references=references)
    logging.info("Greedy WER {:.2f}%".format(wer * 100))

    # Convert logits to list of numpy arrays
    # (each utterance trimmed to its true encoded length).
    logprob = []
    for i, batch in enumerate(evaluated_tensors[0]):
        for j in range(batch.shape[0]):
            logprob.append(
                batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
    if args.save_logprob:
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)

    # language model
    if args.lm_path:
        # Default the sweep upper bounds to the starting values so a
        # single (alpha, beta) point is evaluated when no range is given.
        if args.alpha_max is None:
            args.alpha_max = args.alpha
        # include alpha_max in tuning range
        args.alpha_max += args.alpha_step / 10.0

        if args.beta_max is None:
            args.beta_max = args.beta
        # include beta_max in tuning range
        args.beta_max += args.beta_step / 10.0

        beam_wers = []
        # exp() the log-probs before handing them to the CPU beam decoder
        # (input_tensor=False path below).
        logprobexp = [np.exp(p) for p in logprob]
        for alpha in np.arange(args.alpha, args.alpha_max, args.alpha_step):
            for beta in np.arange(args.beta, args.beta_max, args.beta_step):
                logging.info('================================')
                logging.info(f'Infering with (alpha, beta): ({alpha}, {beta})')
                beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                    vocab=vocab,
                    beam_width=args.beam_width,
                    alpha=alpha,
                    beta=beta,
                    lm_path=args.lm_path,
                    # NOTE(review): os.cpu_count() can return None, which
                    # would make max() raise TypeError.
                    num_cpus=max(os.cpu_count(), 1),
                    input_tensor=False,
                )

                beam_predictions = beam_search_with_lm(log_probs=logprobexp,
                                                       log_probs_length=None,
                                                       force_pt=True)

                # Keep the best (top-scoring) text for each utterance.
                beam_predictions = [b[0][1] for b in beam_predictions[0]]
                lm_wer = word_error_rate(hypotheses=beam_predictions,
                                         references=references)
                logging.info("Beam WER {:.2f}%".format(lm_wer * 100))
                beam_wers.append(((alpha, beta), lm_wer * 100))

        logging.info('Beam WER for (alpha, beta)')
        logging.info('================================')
        logging.info('\n' + '\n'.join([str(e) for e in beam_wers]))
        logging.info('================================')
        best_beam_wer = min(beam_wers, key=lambda x: x[1])
        logging.info('Best (alpha, beta): '
                     f'{best_beam_wer[0]}, '
                     f'WER: {best_beam_wer[1]:.2f}%')
# Train the model on the DAG assembled earlier in this script.
print('Start Training!')
neural_factory.train(tensors_to_optimize=[loss],
                     callbacks=callbacks,
                     optimizer='novograd',
                     optimization_params=optimization_params)

print('Inference Only')
# We've already built the inference DAG above, so all we need is to call infer().
evaluated_tensors = neural_factory.infer(
    # These are the tensors we want to get from the model.
    tensors=[loss_test, preds_test, transcript_test, transcript_len_test],
    # checkpoint_dir specifies where the model params are loaded from.
    checkpoint_dir=(data_dir + '/an4_checkpoints'))

# Process the results to get WER
greedy_hypotheses = helpers.post_process_predictions(evaluated_tensors[1],
                                                     labels)
references = helpers.post_process_transcripts(evaluated_tensors[2],
                                              evaluated_tensors[3], labels)
wer = helpers.word_error_rate(hypotheses=greedy_hypotheses,
                              references=references)
print("*** Greedy WER: {:.2f} ***".format(wer * 100))
"""And that's it!

## Model Improvements

You already have all you need to create your own ASR model in NeMo, but there are a few more tricks that you can employ if you so desire. In this section, we'll briefly cover a few possibilities for improving an ASR model.

### Data Augmentation
def transcribe(self, audio_data, greedy=True):
    """Transcribe raw PCM audio bytes to text.

    Writes the audio and a one-entry manifest to temporary files, runs
    the inference DAG over them, and joins the decoded utterances into a
    single string.

    Args:
        audio_data: Raw mono PCM frames; written as 16-bit, 24 kHz WAV.
            NOTE(review): the sample width/rate are hard-coded below —
            confirm they match the upstream audio producer.
        greedy: If True use greedy CTC decoding; otherwise use beam search
            with an LM when ``self.beam_search_with_lm`` is configured.

    Returns:
        The transcript, with utterances joined by ". ".
    """
    audio_file = tempfile.NamedTemporaryFile(dir=WORK_DIR,
                                             prefix="jasper_audio.",
                                             delete=False)
    audio_file.close()
    audio_file_path = audio_file.name
    manifest_file_path = None
    try:
        # Wrap the raw frames in a WAV container.
        wf = wave.open(audio_file_path, "w")
        try:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(24000)
            wf.writeframesraw(audio_data)
        finally:
            wf.close()

        manifest = {
            "audio_filepath": audio_file_path,
            "duration": 60,
            "text": "todo"
        }
        manifest_file = tempfile.NamedTemporaryFile(dir=WORK_DIR,
                                                    prefix="jasper_manifest.",
                                                    delete=False,
                                                    mode="w")
        with manifest_file:
            manifest_file.write(json.dumps(manifest))
        manifest_file_path = manifest_file.name

        data_layer = nemo_asr.AudioToTextDataLayer(
            shuffle=False,
            manifest_filepath=manifest_file_path,
            labels=self.labels,
            batch_size=1,
        )
        # Define inference DAG
        audio_signal, audio_signal_len, _, _ = data_layer()
        processed_signal, processed_signal_len = self.data_preprocessor(
            input_signal=audio_signal, length=audio_signal_len)
        encoded, encoded_len = self.jasper_encoder(
            audio_signal=processed_signal, length=processed_signal_len)
        log_probs = self.jasper_decoder(encoder_output=encoded)
        predictions = self.greedy_decoder(log_probs=log_probs)

        if greedy:
            eval_tensors = [predictions]
        else:
            if self.beam_search_with_lm:
                logging.info("Running with beam search")
                beam_predictions = self.beam_search_with_lm(
                    log_probs=log_probs, log_probs_length=encoded_len)
                eval_tensors = [beam_predictions]
            else:
                logging.info(
                    "language_model not specified. falling back to greedy decoding."
                )
                eval_tensors = [predictions]

        tensors = self.neural_factory.infer(tensors=eval_tensors)
        prediction = post_process_predictions(tensors[0], self.labels)
        prediction_text = ". ".join(prediction)
        return prediction_text
    finally:
        # Always remove the temp files, even when inference raises
        # (previously they leaked on any exception).
        if manifest_file_path is not None:
            os.unlink(manifest_file_path)
        os.unlink(audio_file_path)