def test_eff_save_restore_from_nemo_file_encrypted(self, asr_model):
    """Check that an encrypted save/restore round trip keeps the weight count.

    The model is saved while an encryption key is set. Restoring without the
    key is expected to raise ``PermissionError``; restoring with the key must
    yield a model whose ``num_weights`` equals the original's.

    Args:
        asr_model: the model under test.
    """
    with tempfile.NamedTemporaryFile() as fp:
        filename = fp.name

        try:
            # Set key - use checkpoint encryption.
            NeMoArchive.set_encryption_key("test_key")

            # Save model (with random artifact).
            with tempfile.NamedTemporaryFile() as artifact:
                asr_model.register_artifact(config_path=None, src=artifact.name)
                asr_model.save_to(save_path=filename)

            # Try to restore the encrypted archive (weights) without the encryption key.
            NeMoArchive.set_encryption_key(None)
            with pytest.raises(PermissionError):
                EncDecCTCModel.restore_from(restore_path=filename)

            # Restore the model with the proper key.
            NeMoArchive.set_encryption_key("test_key")
            asr_model3 = EncDecCTCModel.restore_from(restore_path=filename)
        finally:
            # The key is global state on NeMoArchive: always reset it, even when
            # a step above fails, so it won't mess up other save/restore calls.
            NeMoArchive.set_encryption_key(None)

        assert asr_model.num_weights == asr_model3.num_weights
def oth_quartznet15x5_en_nr(pretrained=False, num_classes=29, **kwargs):
    """Wrap the restored ``QuartzNet15x5NR-En`` checkpoint in a ``QuartzNet``.

    Args:
        pretrained: accepted but not read by this function.
        num_classes: forwarded to ``QuartzNet`` as ``num_classes``.
        **kwargs: accepted but not read by this function.

    Returns:
        The result of calling ``.cpu()`` on the constructed ``QuartzNet``.
    """
    # Function-scope import: nemo is only imported when this builder is called.
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_file = path_pref + "QuartzNet15x5NR-En_b05e34f3.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_file)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
def oth_quartznet15x5_ru(pretrained=False, num_classes=35, **kwargs):
    """Wrap the restored ``stt_ru_quartznet15x5`` checkpoint in a ``QuartzNet``.

    Args:
        pretrained: accepted but not read by this function.
        num_classes: forwarded to ``QuartzNet`` as ``num_classes``.
        **kwargs: accepted but not read by this function.

    Returns:
        The result of calling ``.cpu()`` on the constructed ``QuartzNet``.
    """
    # Function-scope import: nemo is only imported when this builder is called.
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_file = path_pref + "stt_ru_quartznet15x5_88a3e5aa.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_file)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
def oth_quartznet15x5_ru34(pretrained=False, num_classes=34, **kwargs):
    """Wrap the restored ``QuartzNet15x5_golos`` checkpoint in a ``QuartzNet``.

    Args:
        pretrained: accepted but not read by this function.
        num_classes: forwarded to ``QuartzNet`` as ``num_classes``.
        **kwargs: accepted but not read by this function.

    Returns:
        The result of calling ``.cpu()`` on the constructed ``QuartzNet``.
    """
    # Function-scope import: nemo is only imported when this builder is called.
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_file = path_pref + "QuartzNet15x5_golos_1a63a2d8.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_file)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
def test_save_restore_from_nemo_file(self, asr_model):
    """Check that a model restored from a saved archive matches the original.

    Compares vocabulary length, ``num_weights`` and the weights of the first
    encoder convolution between ``asr_model`` and its restored copy.

    Args:
        asr_model: the model under test.
    """

    def first_conv_weights(model):
        # Weights of the first conv of the first encoder block, as a numpy array.
        return model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()

    with tempfile.NamedTemporaryFile() as archive_fp:
        archive_path = archive_fp.name

        # Save the model together with a throw-away artifact.
        with tempfile.NamedTemporaryFile() as artifact_fp:
            asr_model.register_artifact(config_path=None, src=artifact_fp.name)
            asr_model.save_to(save_path=archive_path)

        # Restore a second instance from the archive.
        restored = EncDecCTCModel.restore_from(restore_path=archive_path)

        assert len(asr_model.decoder.vocabulary) == len(restored.decoder.vocabulary)
        assert asr_model.num_weights == restored.num_weights

        original_w = first_conv_weights(asr_model)
        restored_w = first_conv_weights(restored)
        assert np.array_equal(original_w, restored_w)
def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type="asr",
):
    """Export a ``.nemo`` model to ONNX and package it into an archive.

    The model is restored from ``nemo_file`` with the class selected by
    ``model_type`` and exported to ``onnx_file``. ``./model_config.yaml`` is
    then extracted from ``nemo_file`` into the current working directory and
    written, together with the ONNX file (stored as ``model_graph.onnx``),
    into a new tar archive at ``enemo_file``.

    Args:
        nemo_file: path of the source ``.nemo`` archive.
        enemo_file: path of the tar archive to create.
        onnx_file: path where the ONNX graph is written.
        model_type: one of ``"asr"``, ``"speech_label"`` or ``"speaker"``.

    Raises:
        NameError: if ``model_type`` is not one of the supported names.
    """
    if model_type == "asr":
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == "speech_label":
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == "speaker":
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError(
            "Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    with tarfile.open(nemo_file, "r") as archive:
        # NOTE: the config is extracted into the current working directory and
        # is left there afterwards (unchanged from the previous behavior).
        archive.extract("./model_config.yaml")
        with tarfile.open(enemo_file, "w") as enemo_archive:
            enemo_archive.add("./model_config.yaml")
            # Store the ONNX file under a fixed member name directly. The former
            # copy-to-cwd-then-delete approach could overwrite and remove an
            # unrelated "model_graph.onnx", or fail when onnx_file was that path.
            enemo_archive.add(onnx_file, arcname="model_graph.onnx")
def oth_jasperdr10x5_en_nr(pretrained=False, num_classes=29, **kwargs):
    """Wrap the restored ``stt_en_jasper10x5dr`` checkpoint in a ``QuartzNet``.

    Args:
        pretrained: accepted but not read by this function.
        num_classes: forwarded to ``QuartzNet`` as ``num_classes``.
        **kwargs: accepted but not read by this function.

    Returns:
        The result of calling ``.cpu()`` on the constructed ``QuartzNet``.
    """
    # Function-scope import: nemo is only imported when this builder is called.
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_file = path_pref + "stt_en_jasper10x5dr_0d5ebc6c.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_file)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type='asr',
):
    """Export a ``.nemo`` model to ONNX and package it into an archive.

    The model is restored from ``nemo_file`` with the class selected by
    ``model_type`` and exported to ``onnx_file``. ``./model_config.yaml`` is
    then extracted from ``nemo_file`` into the current working directory and
    written, together with the ONNX file (stored as ``model_graph.onnx``),
    into a new tar archive at ``enemo_file``.

    Args:
        nemo_file: path of the source ``.nemo`` archive.
        enemo_file: path of the tar archive to create.
        onnx_file: path where the ONNX graph is written.
        model_type: one of ``'asr'``, ``'speech_label'`` or ``'speaker'``.

    Raises:
        NameError: if ``model_type`` is not one of the supported names.
    """
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError(
            "Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    with tarfile.open(nemo_file, 'r') as archive:
        archive.extract('./model_config.yaml')
        with tarfile.open(enemo_file, 'w') as enemo_archive:
            enemo_archive.add('./model_config.yaml')
            # A bare TarInfo has size 0, so addfile() with it stored an empty
            # member (and the text-mode handle was never closed). add() reads
            # the size from the file and copies its bytes.
            enemo_archive.add(onnx_file, arcname="model_graph.onnx")
def test_save_model_level_pt_ckpt(self, asr_model):
    """Check that a state dict extracted from a ``.nemo`` file restores weights.

    Saves the model, extracts ``model_weights.ckpt`` from the archive, restores
    a second model, perturbs one of its weight tensors and verifies that
    loading the extracted checkpoint brings the weights back.

    Args:
        asr_model: the model under test.
    """

    def first_conv_weights(model):
        # Weights of the first conv of the first encoder block, as a numpy array.
        return model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()

    with tempfile.TemporaryDirectory() as ckpt_dir:
        nemo_file = os.path.join(ckpt_dir, 'asr.nemo')
        asr_model.save_to(nemo_file)

        # Extract the model-level PT checkpoint next to the archive.
        asr_model.extract_state_dict_from(nemo_file, ckpt_dir)
        ckpt_path = os.path.join(ckpt_dir, 'model_weights.ckpt')
        assert os.path.exists(ckpt_path)

        # Restore a second instance from the archive.
        restored = EncDecCTCModel.restore_from(restore_path=nemo_file)
        assert len(asr_model.decoder.vocabulary) == len(restored.decoder.vocabulary)
        assert asr_model.num_weights == restored.num_weights

        # Perturb the restored weights; the two models must now differ.
        restored.encoder.encoder[0].mconv[0].conv.weight.data += 1.0
        before_a = first_conv_weights(asr_model)
        before_b = first_conv_weights(restored)
        assert not np.array_equal(before_a, before_b)

        # Loading the extracted checkpoint must undo the perturbation.
        restored.load_state_dict(torch.load(ckpt_path))
        after_a = first_conv_weights(asr_model)
        after_b = first_conv_weights(restored)
        assert np.array_equal(after_a, after_b)
def oth_jasperdr10x5_en(pretrained=False, num_classes=29, **kwargs):
    """Wrap the restored ``Jasper10x5Dr-En`` checkpoint in a ``QuartzNet``.

    Args:
        pretrained: accepted but not read by this function.
        num_classes: forwarded to ``QuartzNet`` as ``num_classes``.
        **kwargs: accepted but not read by this function.

    Returns:
        The result of calling ``.cpu()`` on the constructed ``QuartzNet``.
    """
    # Function-scope import: nemo is only imported when this builder is called.
    from nemo.collections.asr.models import EncDecCTCModel

    checkpoint_file = path_pref + "Jasper10x5Dr-En_2b94c9d1.nemo"
    restored = EncDecCTCModel.restore_from(checkpoint_file)
    return QuartzNet(raw_net=restored, num_classes=num_classes).cpu()
def main():
    """Evaluate a CTC ASR model on a manifest and check WER against a tolerance.

    Parses command-line arguments, loads the model (from a local ``.nemo``
    file or by pretrained name), runs greedy decoding over the test data and
    computes the word error rate.

    Raises:
        ValueError: if the computed WER is greater than ``--wer_tolerance``.
    """

    def _parse_bool(text):
        # argparse's ``type=bool`` maps every non-empty string (even "False")
        # to True; only explicit negative spellings are treated as False here.
        return text.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: '******'",
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--wer_tolerance", type=float, default=1.0, help="used by test")
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=_parse_bool,
        help="Normalize transcripts or not. Set to False for non-English.",
    )
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)

    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
        }
    )
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    # Maps a label index to its vocabulary entry, used to rebuild reference text.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))
    wer = WER(vocabulary=asr_model.decoder.vocabulary)

    hypotheses = []
    references = []
    for test_batch in asr_model.test_dataloader():
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            log_probs, encoded_len, greedy_predictions = asr_model(
                input_signal=test_batch[0], input_signal_length=test_batch[1]
            )
        hypotheses += wer.ctc_decoder_predictions_tensor(greedy_predictions)
        for batch_ind in range(greedy_predictions.shape[0]):
            reference = ''.join([labels_map[c] for c in test_batch[2][batch_ind].cpu().detach().numpy()])
            references.append(reference)
        del test_batch

    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    if wer_value > args.wer_tolerance:
        raise ValueError(f"Got WER of {wer_value}. It was higher than {args.wer_tolerance}")
    logging.info(f'Got WER of {wer_value}. Tolerance was {args.wer_tolerance}')
def test_save_restore_from_nemo_file_with_override(self, asr_model, tmpdir):
    """Check restoring from an archive with an overriding config file.

    The model is saved to ``eff.nemo`` inside ``tmpdir``, its config is
    modified (encoder activation set to ``'swish'``) and written to a
    temporary YAML file, and a second instance is restored with that file as
    ``override_config_path``. The restored model must have the same
    vocabulary size, weight count and first-conv weights, and must carry the
    overridden activation.

    Args:
        asr_model: the model under test.
        tmpdir: fixture providing a temporary directory unique to the test invocation.
    """
    # Name of the archive in tmp folder.
    filename = os.path.join(tmpdir, "eff.nemo")

    with tempfile.NamedTemporaryFile(mode='a+') as conf_fp:
        # Create a "random artifact".
        # NOTE(review): delete=False means this file is not removed by the test;
        # presumably it must outlive the `with` block for save_to() - confirm.
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as artifact:
            artifact.write("magic content 42")

        # Add artifact to model.
        asr_model.register_artifact(config_path=None, src=artifact.name)
        # Save model (with "random artifact").
        asr_model.save_to(save_path=filename)

        # Modify config slightly and dump it to the override file.
        cfg = asr_model.cfg
        cfg.encoder.params.activation = 'swish'
        yaml_cfg = OmegaConf.to_yaml(cfg)
        conf_fp.write(yaml_cfg)
        conf_fp.seek(0)

        # Restore the model.
        asr_model2 = EncDecCTCModel.restore_from(restore_path=filename, override_config_path=conf_fp.name)

        assert len(asr_model.decoder.vocabulary) == len(asr_model2.decoder.vocabulary)
        assert asr_model.num_weights == asr_model2.num_weights

        w1 = asr_model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()
        w2 = asr_model2.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()
        assert np.array_equal(w1, w2)

        assert asr_model2.cfg.encoder.params.activation == 'swish'
def infer(model, audiofiles, batch_size=4):
    """Transcribe audio files with a restored CTC model.

    Args:
        model: path passed to ``EncDecCTCModel.restore_from``.
        audiofiles: iterable of audio file paths.
        batch_size: batch size placed in the transcribe dataloader config.

    Returns:
        A tuple ``(characters, log_probs, vocab)``: the decoded predictions,
        one list of log-probability rows per utterance (cut to that
        utterance's encoded length), and ``train_ds.labels`` from the model
        config.
    """
    asr_model = EncDecCTCModel.restore_from(model)
    was_training = asr_model.training
    device = next(asr_model.parameters()).device
    asr_model.eval()
    vocab = asr_model._cfg.train_ds.labels

    characters = []
    log_probs = []
    with tempfile.TemporaryDirectory() as tmpdir:
        # One JSON line per audio file.
        manifest_path = os.path.join(tmpdir, 'manifest.json')
        with open(manifest_path, 'w') as manifest:
            for audio_path in audiofiles:
                record = {'audio_filepath': audio_path, 'duration': 100000, 'text': 'nothing'}
                manifest.write(json.dumps(record) + '\n')

        loader_config = {
            'paths2audio_files': audiofiles,
            'batch_size': batch_size,
            'temp_dir': tmpdir,
        }
        for test_batch in asr_model._setup_transcribe_dataloader(loader_config):
            signal = test_batch[0].to(device)
            signal_len = test_batch[1].to(device)
            log_prob, encoded_len, greedy_predictions = asr_model.forward(
                input_signal=signal, input_signal_length=signal_len)
            characters += asr_model._wer.ctc_decoder_predictions_tensor(greedy_predictions)

            lengths = encoded_len.long().cpu()
            probs = log_prob.float().cpu()
            for idx in range(0, lengths.shape[0]):
                valid = lengths[idx].detach().numpy().tolist()
                rows = probs[idx].detach().numpy().tolist()
                # Keep only the rows within this utterance's encoded length.
                log_probs.append(rows[0:valid])
            del test_batch

    # Put the model back into the mode it was in right after restoring.
    asr_model.train(was_training)
    return characters, log_probs, vocab
def main(
    nemo_file,
    onnx_file,
    model_type='asr',
):
    """Restore a ``.nemo`` model and export it to ONNX.

    Args:
        nemo_file: path of the source ``.nemo`` archive.
        onnx_file: path where the ONNX graph is written.
        model_type: one of ``'asr'``, ``'speech_label'`` or ``'speaker'``.

    Raises:
        NameError: if ``model_type`` is not one of the supported names.
    """
    # Pick the announcement and the model class for the requested type.
    if model_type == 'asr':
        announcement, model_cls = "Preparing ASR model", EncDecCTCModel
    elif model_type == 'speech_label':
        announcement, model_cls = "Preparing Speech Label Classification model", EncDecClassificationModel
    elif model_type == 'speaker':
        announcement, model_cls = "Preparing Speaker Recognition model", EncDecSpeakerLabelModel
    else:
        raise NameError("Available model names are asr, speech_label and speaker")

    logging.info(announcement)
    model = model_cls.restore_from(nemo_file)

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")
def batch_inference(args: argparse.Namespace):
    """Run batch ASR inference over a corpus and write references, hypotheses and stats.

    Args:
        args: parsed arguments. This function reads ``asr_model``,
            ``corpora_dir``, ``limit``, ``batch_size``, ``normalize_text``,
            ``search``, ``arpa`` and ``results_dir``.

    Returns:
        A dict with the computed ``"wer"`` and the argument dict under ``"args"``.
    """
    torch.set_grad_enabled(False)

    model_ref = args.asr_model
    if model_ref.endswith(".nemo"):
        print(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=model_ref)
    else:
        print(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=model_ref)

    manifest = prepare_manifest(args.corpora_dir, args.limit)
    test_config = {
        "sample_rate": 16000,
        "manifest_filepath": manifest,
        "labels": asr_model.decoder.vocabulary,
        "batch_size": args.batch_size,
        "normalize_transcripts": args.normalize_text,
    }
    asr_model.setup_test_data(test_data_config=test_config)

    # Collect all (reference, hypothesis) pairs, then split them into two lists.
    pairs = list(tqdm(generate_ref_hyps(asr_model, args.search, args.arpa)))
    references, hypotheses = map(list, zip(*pairs))

    out_dir = args.results_dir
    os.makedirs(out_dir, exist_ok=True)
    data_io.write_lines(f"{out_dir}/refs.txt.gz", references)
    data_io.write_lines(f"{out_dir}/hyps.txt.gz", hypotheses)

    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    sys.stdout.flush()

    stats = {"wer": wer_value, "args": vars(args)}
    data_io.write_json(f"{out_dir}/stats.txt", stats)
    print(f"Got WER of {wer_value}")
    return stats
def test_save_restore_from_nemo_file_with_override(self, asr_model):
    """Check restoring from an archive with an overriding config file.

    The model is saved to a temporary file, its config is modified (encoder
    activation set to ``'swish'``) and written to a second temporary file,
    and another instance is restored with that file as
    ``override_config_path``. The restored model must match the original in
    vocabulary size, weight count and first-conv weights, and must carry the
    overridden activation.

    Args:
        asr_model: the model under test.
    """

    def first_conv_weights(model):
        # Weights of the first conv of the first encoder block, as a numpy array.
        return model.encoder.encoder[0].mconv[0].conv.weight.data.detach().cpu().numpy()

    with tempfile.NamedTemporaryFile() as archive_fp, tempfile.NamedTemporaryFile(mode='a+') as override_fp:
        archive_path = archive_fp.name

        # Save the model together with a throw-away artifact.
        with tempfile.NamedTemporaryFile() as artifact_fp:
            asr_model.register_artifact(config_path=None, src=artifact_fp.name)
            asr_model.save_to(save_path=archive_path)

        # Tweak the config and dump it as YAML into the override file.
        cfg = asr_model.cfg
        cfg.encoder.params.activation = 'swish'
        override_fp.write(OmegaConf.to_yaml(cfg))
        override_fp.seek(0)

        # Restore a second instance using the override config.
        restored = EncDecCTCModel.restore_from(
            restore_path=archive_path, override_config_path=override_fp.name)

        assert len(asr_model.decoder.vocabulary) == len(restored.decoder.vocabulary)
        assert asr_model.num_weights == restored.num_weights

        original_w = first_conv_weights(asr_model)
        restored_w = first_conv_weights(restored)
        assert np.array_equal(original_w, restored_w)

        assert restored.cfg.encoder.params.activation == 'swish'
def ASR_Grade(dataset, id, key):
    """Grade a recording against a reference and write the result to a file.

    Restores the model from the module-level ``model_Path``, decodes the data
    listed in ``dataset``, and compares every hypothesis against ``key`` using
    ``word_error_rate``. The score is ``100 - 100 * (error / tolerance)``,
    clamped at 0, and is written with the last hypothesis/reference pair to
    ``datasetPath + id + '_graded.txt'``.

    Args:
        dataset: manifest path passed as ``--dataset`` / ``manifest_filepath``.
        id: identifier used to build the result file name.
        key: reference string used for every decoded utterance.

    Returns:
        The computed score as a float (never below 0.0).
    """
    try:
        from torch.cuda.amp import autocast
    except ImportError:
        from contextlib import contextmanager

        # Fallback no-op context manager when AMP autocast is unavailable.
        @contextmanager
        def autocast(enabled=None):
            yield

    can_gpu = torch.cuda.is_available()

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default=model_Selected,
        required=True,
        help=f'Pass: {model_Selected}',
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--wer_tolerance", type=float, default=1.0, help="used by test")
    parser.add_argument(
        "--normalize_text",
        default=False,  # False <- we're using phonetic references
        type=bool,
        help="Normalize transcripts or not. Set to False for non-English.",
    )
    # Arguments are supplied programmatically; the remaining options use defaults.
    args = parser.parse_args(["--dataset", dataset, "--asr_model", model_Selected])
    torch.set_grad_enabled(False)

    # Instantiate Jasper/QuartzNet models with the EncDecCTCModel class.
    asr_model = EncDecCTCModel.restore_from(model_Path)
    asr_model.setup_test_data(
        test_data_config={
            "sample_rate": 16000,
            "manifest_filepath": args.dataset,
            "labels": asr_model.decoder.vocabulary,
            "batch_size": args.batch_size,
            "normalize_transcripts": args.normalize_text,
        })
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    wer = WER(vocabulary=asr_model.decoder.vocabulary)
    hypotheses = []
    references = []
    for test_batch in asr_model.test_dataloader():
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            log_probs, encoded_len, greedy_predictions = asr_model(
                input_signal=test_batch[0], input_signal_length=test_batch[1])
        # Accumulate across batches; plain assignment kept only the last batch
        # while `references` kept growing, so the two lists could diverge.
        hypotheses += wer.ctc_decoder_predictions_tensor(greedy_predictions)
        # Every utterance in the batch is graded against the same key.
        references.extend([key] * greedy_predictions.shape[0])
        del test_batch

    wer_value = word_error_rate(hypotheses=hypotheses, references=references)

    REC = '.'
    REF = '.'
    for h, r in zip(hypotheses, references):
        print("Recognized:\t{}\nReference:\t{}\n".format(h, r))
        REC = h
        REF = r
    logging.info(f"Got PER of {wer_value}. Tolerance was {args.wer_tolerance}")

    # Score calculation:
    # divide the error by the tolerance to get the ratio of incorrectness
    # (rounded), multiply by 100 to get "% wrong", and subtract from 100 to get
    # "% correct". Negative results are clamped to 0.
    score = 100.00 - (round((wer_value / args.wer_tolerance), 4) * 100)
    if score < 0.0:
        score = 0.0
    print(score)

    # Result file creation, to be accessed by JS via 'app.py'.
    with open(datasetPath + id + '_graded.txt', 'w') as results:
        results.write(REC + '\n' + REF + '\n' + str(score))
    return score
def main():
    """Decode a dataset with a CTC ASR model and write sclite ``trn`` files.

    Parses command-line arguments, loads the model (from a local ``.nemo``
    file or by pretrained name), runs greedy decoding over the test data and
    writes ``hyp.trn`` and ``ref.trn`` into ``--out_dir``. When ``--sctk_dir``
    exists, ``score_with_sctk`` is invoked on the two files.
    """

    def _parse_bool(text):
        # argparse's ``type=bool`` maps every non-empty string (even "False")
        # to True; only explicit negative spellings are treated as False here.
        return text.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=False, help="Pass: '******'",
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=_parse_bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    parser.add_argument(
        "--sclite_fmt", default="trn", type=str, help="sclite output format. Only trn and ctm are supported")
    parser.add_argument("--out_dir", type=str, required=True, help="Destination dir for output files")
    parser.add_argument("--sctk_dir", type=str, required=False, default="", help="Path to sctk root dir")
    parser.add_argument("--glm", type=str, required=False, default="", help="Path to glm file")
    parser.add_argument("--ref_stm", type=str, required=False, default="", help="Path to stm file")
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    os.makedirs(args.out_dir, exist_ok=True)

    # Scoring is only attempted when the sctk directory actually exists.
    use_sctk = os.path.exists(args.sctk_dir)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)

    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
        })
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    # Maps a label index to its vocabulary entry, used to rebuild reference text.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))
    wer = WER(vocabulary=asr_model.decoder.vocabulary)

    hypotheses = []
    references = []
    for test_batch in asr_model.test_dataloader():
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            log_probs, encoded_len, greedy_predictions = asr_model(
                input_signal=test_batch[0], input_signal_length=test_batch[1])
        hypotheses += wer.ctc_decoder_predictions_tensor(greedy_predictions)
        for batch_ind in range(greedy_predictions.shape[0]):
            reference = ''.join([labels_map[c] for c in test_batch[2][batch_ind].cpu().detach().numpy()])
            references.append(reference)
        del test_batch

    info_list = get_utt_info(args.dataset)
    hypfile = os.path.join(args.out_dir, "hyp.trn")
    reffile = os.path.join(args.out_dir, "ref.trn")
    with open(hypfile, "w") as hyp_f, open(reffile, "w") as ref_f:
        for i, hypothesis in enumerate(hypotheses):
            # Utterance id is the audio file name without directory or extension.
            utt_id = os.path.splitext(os.path.basename(info_list[i]['audio_filepath']))[0]
            # rfilter in sctk likes each transcript to have a space at the beginning
            hyp_f.write(" " + hypothesis + " (" + utt_id + ")" + "\n")
            ref_f.write(" " + references[i] + " (" + utt_id + ")" + "\n")

    if use_sctk:
        score_with_sctk(args.sctk_dir, reffile, hypfile, args.out_dir, glm=args.glm, fmt="trn")
def main():
    """Generate synthetic data from a teacher ASR model and pickle it.

    Parses command-line arguments, loads the teacher model (from a local
    ``.nemo`` file or by pretrained name), calls ``get_synthetic_data`` with
    the teacher's encoder and decoder, and dumps the resulting items (moved
    to CPU) into a pickle file whose name encodes the prefix, number of
    batches, iterations and learning rate.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: '******'")
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--num_batch", type=int, default=50, help="number of batches of the synthetic data")
    parser.add_argument("--batch_size", type=int, default=8, help="batch size of the synthetic data")
    parser.add_argument("--seqlen", type=int, default=500, help="sequence length of the synthetic data")
    parser.add_argument(
        "--train_iter", type=int, default=200, help="training iterations for the synthetic data generation")
    parser.add_argument("--dump_path", type=str, default=None, help="path to dump the synthetic data")
    parser.add_argument(
        "--dump_prefix", type=str, default='syn', help="prefix for the filename of the dumped synthetic data")
    parser.add_argument("--lr", type=float, default=0.01, help="Learning rate for the synthetic data generation")
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    model_ref = args.asr_model
    if model_ref.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        teacher_model = EncDecCTCModel.restore_from(restore_path=model_ref)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        teacher_model = EncDecCTCModel.from_pretrained(model_name=model_ref)

    test_config = {
        'sample_rate': 16000,
        'manifest_filepath': args.dataset,
        'labels': teacher_model.decoder.vocabulary,
        'batch_size': 8,
        'normalize_transcripts': True,
        'shuffle': True,
    }
    teacher_model.setup_test_data(test_data_config=test_config)

    # ---------------------------- Distillation ----------------------------
    # Disable quantization mode for the teacher model.
    teacher_model.set_quant_mode('none')
    # Enable backward graph generation.
    torch.set_grad_enabled(True)

    run_summary = (args.num_batch, args.batch_size, args.train_iter, args.lr)
    print("Num batches: %d, Batch size: %d, Training iterations: %d, Learning rate: %.3f " % run_summary)
    print('Synthesizing...')
    synthetic_data = get_synthetic_data(
        teacher_model.encoder,
        teacher_model.decoder,
        batch_size=args.batch_size,
        dim=64,
        seqlen=args.seqlen,
        num_batch=args.num_batch,
        train_iter=args.train_iter,
        lr=args.lr,
    )

    name_parts = (args.dump_prefix, args.num_batch, args.train_iter, args.lr)
    file_name = '%s_nb%d_iter%d_lr%.3f.pkl' % name_parts
    dump_dir = args.dump_path
    if dump_dir is not None:
        # Create the dump directory when it does not exist yet.
        if not os.path.exists(dump_dir):
            os.makedirs(dump_dir)
        file_name = os.path.join(dump_dir, file_name)

    print('Synthetic data dumped as ', file_name)
    cpu_items = [item.cpu() for item in synthetic_data]
    with open(file_name, 'wb') as out_file:
        pickle.dump(cpu_items, out_file)
def main():
    """Evaluate an ONNX export of a quantized CTC ASR model.

    Parses command-line arguments, loads the model config (from a local
    ``.nemo`` file or by pretrained name), sets ``encoder.quantize = True`` in
    it, re-instantiates the model with that config, prepares the test data
    and logs the result of ``evaluate``.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: '******'",
    )
    parser.add_argument(
        "--asr_onnx", type=str, default="./QuartzNet15x5Base-En-max-32.onnx", help="Pass: '******'",
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=4)
    # ``store_false`` with ``default=False`` could never produce True, so the
    # flag had no effect and normalization was always off. It is now a regular
    # opt-out switch: normalization is on unless the flag is given.
    parser.add_argument(
        "--dont_normalize_text",
        default=False,
        action='store_true',
        help="Turn off trasnscript normalization. Recommended for non-English.",
    )
    parser.add_argument(
        "--use_cer", default=False, action='store_true', help="Use Character Error Rate as the evaluation metric")
    parser.add_argument('--qat', action="store_true", help="Use onnx file exported from QAT tools")
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        # First fetch only the config, enable quantization in it, then restore
        # the model with the modified config.
        asr_model_cfg = EncDecCTCModel.restore_from(restore_path=args.asr_model, return_config=True)
        with open_dict(asr_model_cfg):
            asr_model_cfg.encoder.quantize = True
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model, override_config_path=asr_model_cfg)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model_cfg = EncDecCTCModel.from_pretrained(model_name=args.asr_model, return_config=True)
        with open_dict(asr_model_cfg):
            asr_model_cfg.encoder.quantize = True
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model, override_config_path=asr_model_cfg)

    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': not args.dont_normalize_text,
        })
    asr_model.preprocessor.featurizer.dither = 0.0
    asr_model.preprocessor.featurizer.pad_to = 0
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    # Maps a label index to its vocabulary entry.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))
    wer = WER(vocabulary=asr_model.decoder.vocabulary, use_cer=args.use_cer)
    wer_result = evaluate(asr_model, args.asr_onnx, labels_map, wer, args.qat)
    logging.info(f'Got WER of {wer_result}.')
def main():
    """Evaluate a quantized CTC ASR model, with optional sensitivity analysis and ONNX export.

    Parses command-line arguments, initializes ``quant_modules``, loads the
    model (from a local ``.nemo`` file or by pretrained name) and logs the
    WER returned by ``evaluate``.

    With ``--sensitivity``, each quantized layer is enabled on its own to
    build a sensitivity profile, then layers are disabled one by one (in
    order of the profile) until the WER tolerance is met; the function
    returns at that point. With ``--onnx``, the model is exported to ONNX.

    Raises:
        ValueError: if, during sensitivity analysis, the tolerance is not met
            even after all profiled layers were disabled.
    """

    def _parse_bool(text):
        # argparse's ``type=bool`` maps every non-empty string (even "False")
        # to True; only explicit negative spellings are treated as False here.
        return text.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: '******'",
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--wer_target", type=float, default=None, help="used by test")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--wer_tolerance", type=float, default=1.0, help="used by test")
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=_parse_bool,
        help="Normalize transcripts or not. Set to False for non-English.",
    )
    parser.add_argument('--sensitivity', action="store_true", help="Perform sensitivity analysis")
    parser.add_argument('--onnx', action="store_true", help="Export to ONNX")
    args = parser.parse_args()
    torch.set_grad_enabled(False)
    quant_modules.initialize()

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)

    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
        }
    )
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    # Maps a label index to its vocabulary entry.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))
    wer = WER(vocabulary=asr_model.decoder.vocabulary)
    wer_quant = evaluate(asr_model, labels_map, wer)
    logging.info(f'Got WER of {wer_quant}. Tolerance was {args.wer_tolerance}')

    if args.sensitivity:
        if wer_quant < args.wer_tolerance:
            logging.info("Tolerance is already met. Skip sensitivity analyasis.")
            return

        # Disable every quantizer and collect the distinct layer names they belong to.
        quant_layer_names = []
        for name, module in asr_model.named_modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                module.disable()
                layer_name = name.replace("._input_quantizer", "").replace("._weight_quantizer", "")
                if layer_name not in quant_layer_names:
                    quant_layer_names.append(layer_name)
        logging.info(F"{len(quant_layer_names)} quantized layers found.")

        # Build sensitivity profile: enable one layer at a time and evaluate.
        quant_layer_sensitivity = {}
        for quant_layer in quant_layer_names:
            logging.info(F"Enable {quant_layer}")
            for name, module in asr_model.named_modules():
                if isinstance(module, quant_nn.TensorQuantizer) and quant_layer in name:
                    module.enable()
                    logging.info(F"{name:40}: {module}")

            # Eval the model
            wer_value = evaluate(asr_model, labels_map, wer)
            logging.info(F"WER: {wer_value}")
            quant_layer_sensitivity[quant_layer] = args.wer_tolerance - wer_value

            for name, module in asr_model.named_modules():
                if isinstance(module, quant_nn.TensorQuantizer) and quant_layer in name:
                    module.disable()
                    logging.info(F"{name:40}: {module}")

        # Skip most sensitive layers until WER target is met
        for name, module in asr_model.named_modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                module.enable()
        # Smallest margin (tolerance - WER) first.
        quant_layer_sensitivity = collections.OrderedDict(sorted(quant_layer_sensitivity.items(), key=lambda x: x[1]))
        pprint(quant_layer_sensitivity)
        skipped_layers = []
        for quant_layer, _ in quant_layer_sensitivity.items():
            for name, module in asr_model.named_modules():
                if isinstance(module, quant_nn.TensorQuantizer):
                    if quant_layer in name:
                        logging.info(F"Disable {name}")
                        if quant_layer not in skipped_layers:
                            skipped_layers.append(quant_layer)
                        module.disable()
            wer_value = evaluate(asr_model, labels_map, wer)
            if wer_value <= args.wer_tolerance:
                logging.info(
                    F"WER tolerance {args.wer_tolerance} is met by skipping {len(skipped_layers)} sensitive layers."
                )
                print(skipped_layers)
                return
        raise ValueError(f"WER tolerance {args.wer_tolerance} can not be met with any layer quantized!")

    if args.onnx:
        if args.asr_model.endswith("nemo"):
            onnx_name = args.asr_model.replace(".nemo", ".onnx")
        else:
            onnx_name = args.asr_model
        # The target name used to be passed as a stray positional argument with
        # no placeholder in the message, so it was never rendered.
        logging.info(f"Export to {onnx_name}")
        # The fake-quant flag is switched on only for the duration of the export.
        quant_nn.TensorQuantizer.use_fb_fake_quant = True
        asr_model.export(onnx_name, onnx_opset_version=13)
        quant_nn.TensorQuantizer.use_fb_fake_quant = False
def main():
    """Calibrate a quantized CTC ASR model and save the calibrated model(s).

    Parses command-line arguments, configures the default input quantization
    descriptor, loads the model with ``encoder.quantize = True``, feeds
    ``--num_calib_batch`` batches through it with calibrators enabled, then
    calls ``compute_amax`` and saves one ``.nemo`` file per calibration
    method.
    """

    def _parse_bool(text):
        # argparse's ``type=bool`` maps every non-empty string (even "False")
        # to True; only explicit negative spellings are treated as False here.
        return text.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: '******'",
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=_parse_bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    parser.add_argument('--num_calib_batch', default=1, type=int, help="Number of batches for calibration.")
    parser.add_argument('--calibrator', type=str, choices=["max", "histogram"], default="max")
    parser.add_argument('--percentile', nargs='+', type=float, default=[99.9, 99.99, 99.999, 99.9999])
    parser.add_argument("--amp", action="store_true", help="Use AMP in calibration.")
    parser.set_defaults(amp=False)
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    # Initialize quantization
    quant_desc_input = QuantDescriptor(calib_method=args.calibrator)
    quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
    quant_nn.QuantConvTranspose2d.set_default_quant_desc_input(quant_desc_input)
    quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        # Fetch only the config, enable quantization in it, then restore the
        # model with the modified config.
        asr_model_cfg = EncDecCTCModel.restore_from(restore_path=args.asr_model, return_config=True)
        with open_dict(asr_model_cfg):
            asr_model_cfg.encoder.quantize = True
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model, override_config_path=asr_model_cfg)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model_cfg = EncDecCTCModel.from_pretrained(model_name=args.asr_model, return_config=True)
        with open_dict(asr_model_cfg):
            asr_model_cfg.encoder.quantize = True
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model, override_config_path=asr_model_cfg)

    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
            'shuffle': True,
        })
    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    # Enable calibrators
    for name, module in asr_model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()

    for i, test_batch in enumerate(asr_model.test_dataloader()):
        # Stop before processing batch number `num_calib_batch`. Checking after
        # the forward pass consumed one batch more than requested, which also
        # disagreed with the sample count encoded in the output file names.
        if i >= args.num_calib_batch:
            break
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        if args.amp:
            with autocast():
                _ = asr_model(input_signal=test_batch[0], input_signal_length=test_batch[1])
        else:
            _ = asr_model(input_signal=test_batch[0], input_signal_length=test_batch[1])

    # Save calibrated model(s)
    model_name = args.asr_model.replace(".nemo", "") if args.asr_model.endswith(".nemo") else args.asr_model
    if not args.calibrator == "histogram":
        compute_amax(asr_model, method="max")
        asr_model.save_to(F"{model_name}-max-{args.num_calib_batch*args.batch_size}.nemo")
    else:
        for percentile in args.percentile:
            print(F"{percentile} percentile calibration")
            # NOTE(review): the loop value is not passed to compute_amax, so each
            # iteration appears to use the same setting - confirm whether
            # compute_amax accepts a `percentile` argument.
            compute_amax(asr_model, method="percentile")
            asr_model.save_to(F"{model_name}-percentile-{percentile}-{args.num_calib_batch*args.batch_size}.nemo")

        for method in ["mse", "entropy"]:
            print(F"{method} calibration")
            compute_amax(asr_model, method=method)
            asr_model.save_to(F"{model_name}-{method}-{args.num_calib_batch*args.batch_size}.nemo")
def main():
    """Calibrate an ASR model on synthetic data (optional) and report its WER.

    The model is loaded from a local ``.nemo`` file or by pretrained model
    name, and its weight and activation bit widths are set from the command
    line. Unless ``--dynamic`` is given, activation ranges are calibrated by
    running the pickled synthetic batches from ``--load`` through the encoder
    and decoder. The model is then evaluated on ``--dataset`` and the word
    error rate is printed.
    """

    def _parse_bool(value):
        """Parse a command-line boolean.

        ``type=bool`` would turn every non-empty string (including "False")
        into True. The empty string stays False, as it was with ``bool``.
        """
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        # argparse turns a ValueError from a type callable into a usage error.
        raise ValueError(f"expected a boolean, got {value!r}")

    parser = ArgumentParser()

    # Training arguments
    parser.add_argument(
        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: '******'"
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=_parse_bool,
        help="Normalize transcripts or not. Set to False for non-English.",
    )
    parser.add_argument("--shuffle", action='store_true', help="Shuffle test data.")

    # Calibration arguments
    parser.add_argument("--load", type=str, default=None, help="load path for the synthetic data")
    parser.add_argument(
        "--percentile", type=float, default=None, help="Max/min percentile for outlier handling. e.g., 99.9"
    )

    # Quantization arguments
    parser.add_argument("--weight_bit", type=int, default=8, help="quantization bit for weights")
    parser.add_argument("--act_bit", type=int, default=8, help="quantization bit for activations")
    parser.add_argument("--dynamic", action='store_true', help="Dynamic quantization mode.")
    parser.add_argument("--no_quant", action='store_true', help="No quantization mode.")

    # Debugging arguments
    parser.add_argument("--eval_early_stop", type=int, default=None, help="early stop for debugging")
    parser.add_argument("--calib_early_stop", type=int, default=None, help="early stop calibration")

    args = parser.parse_args()

    # Validate the argument combination up front, before the expensive model
    # load. An explicit check (rather than `assert`) still runs under `-O`.
    if args.load is None and not args.dynamic:
        parser.error("synthetic data must be loaded unless running with the dynamic quantization mode")

    torch.set_grad_enabled(False)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)

    asr_model.setup_test_data(
        test_data_config={
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': asr_model.decoder.vocabulary,
            'batch_size': args.batch_size,
            'normalize_transcripts': args.normalize_text,
            'shuffle': args.shuffle,
        }
    )

    distilled_data = None
    if args.load is not None:
        print(f'Data loaded from {args.load}')
        # SECURITY: pickle.load can execute arbitrary code; only pass --load
        # files that come from a trusted source.
        with open(args.load, 'rb') as f:
            distilled_data = pickle.load(f)
        # Each synthetic batch is indexed as (batch, <unused>, sequence length).
        synthetic_batch_size, _, synthetic_seqlen = distilled_data[0].shape

    ############################## Calibration #####################################

    torch.set_grad_enabled(False)  # disable backward graph generation
    asr_model.eval()  # evaluation mode
    asr_model.set_quant_bit(args.weight_bit, mode='weight')
    asr_model.set_quant_bit(args.act_bit, mode='act')

    # set percentile
    if args.percentile is not None:
        qm.set_percentile(asr_model, args.percentile)

    if args.no_quant:
        asr_model.set_quant_mode('none')
    else:
        asr_model.encoder.bn_folding()  # BN folding

    # if not dynamic quantization, calibrate min/max/range for the activations using synthetic data
    # if dynamic, we can skip calibration
    if not args.dynamic:
        print('Calibrating...')
        qm.calibrate(asr_model)
        # Every synthetic sample in a batch has the same length.
        length = torch.tensor([synthetic_seqlen] * synthetic_batch_size).cuda()
        for batch_idx, inputs in enumerate(distilled_data):
            if args.calib_early_stop is not None and batch_idx == args.calib_early_stop:
                break
            inputs = inputs.cuda()
            encoded, _, encoded_scaling_factor = asr_model.encoder(audio_signal=inputs, length=length)
            # The decoder output is discarded; the forward pass is run only
            # so that the decoder sees calibration data as well.
            asr_model.decoder(encoder_output=encoded, encoder_output_scaling_factor=encoded_scaling_factor)

    ############################## Evaluation #####################################

    print('Evaluating...')
    qm.evaluate(asr_model)
    qm.set_dynamic(asr_model, args.dynamic)  # if dynamic quantization, this will be enabled

    # Token id -> label lookup used to turn reference transcripts into text.
    labels_map = dict(enumerate(asr_model.decoder.vocabulary))
    wer = WER(vocabulary=asr_model.decoder.vocabulary)
    hypotheses = []
    references = []
    progress_bar = tqdm(asr_model.test_dataloader())
    for i, test_batch in enumerate(progress_bar):
        # eval_early_stop defaults to None, which never equals a batch index.
        if i == args.eval_early_stop:
            break
        test_batch = [x.cuda().float() for x in test_batch]
        with autocast():
            _, _, greedy_predictions = asr_model(
                input_signal=test_batch[0], input_signal_length=test_batch[1]
            )
        hypotheses += wer.ctc_decoder_predictions_tensor(greedy_predictions)
        for batch_ind in range(greedy_predictions.shape[0]):
            token_ids = test_batch[2][batch_ind].cpu().detach().numpy()
            # The batch was cast to float above, so convert each id back to
            # an int before using it as a dictionary key.
            # NOTE(review): every position of the transcript tensor is
            # decoded, including any padding -- confirm that this is intended.
            references.append(''.join(labels_map[int(c)] for c in token_ids))
        del test_batch

    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    print('WER:', wer_value)