def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecClassificationModel(cfg=cfg.model, trainer=trainer)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        # Run test on a fresh single-device trainer: one GPU if any were
        # requested for training, otherwise CPU
        gpu = 1 if cfg.trainer.gpus != 0 else 0
        trainer = pl.Trainer(gpus=gpu)
        if asr_model.prepare_test(trainer):
            trainer.test(asr_model)
def init_vad_model(model_path: str):
    """Initialize a VAD model from a local .nemo file, a .ckpt checkpoint, or an NGC pretrained model name."""
    if model_path.endswith('.nemo'):
        logging.info(f"Using local VAD model from {model_path}")
        vad_model = EncDecClassificationModel.restore_from(restore_path=model_path)
    elif model_path.endswith('.ckpt'):
        vad_model = EncDecClassificationModel.load_from_checkpoint(checkpoint_path=model_path)
    else:
        logging.info(f"Using NGC cloud VAD model {model_path}")
        vad_model = EncDecClassificationModel.from_pretrained(model_name=model_path)
    return vad_model
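# A minimal usage sketch for init_vad_model. The imports are assumed to come
# from NeMo's ASR collection and utils (not shown in this file); the local
# paths are placeholders, while "MatchboxNet-VAD-3x2" is the NGC model name
# used as a default elsewhere in these scripts.
from nemo.collections.asr.models import EncDecClassificationModel  # assumed import
from nemo.utils import logging  # assumed import

vad_model = init_vad_model("checkpoints/vad.nemo")    # local .nemo archive (placeholder path)
# vad_model = init_vad_model("checkpoints/vad.ckpt")  # PyTorch Lightning checkpoint (placeholder path)
# vad_model = init_vad_model("MatchboxNet-VAD-3x2")   # download pretrained model from NGC by name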
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecClassificationModel(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    asr_model.maybe_init_from_pretrained_checkpoint(cfg)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        if asr_model.prepare_test(trainer):
            trainer.test(asr_model)
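# A sketch of how a Hydra-driven entry point like main(cfg) above is typically
# invoked in NeMo training scripts. hydra_runner is NeMo's Hydra decorator; the
# config_path/config_name values here are placeholders, not taken from this file.
from nemo.core.config import hydra_runner


@hydra_runner(config_path="conf", config_name="matchboxnet_3x1x64_v1")  # placeholder config
def main(cfg):
    ...  # body as in the function above


if __name__ == '__main__':
    main()  # Hydra builds cfg from the YAML config plus any CLI overrides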
def main(
    nemo_file, enemo_file, onnx_file, model_type="asr",
):
    if model_type == "asr":
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == "speech_label":
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == "speaker":
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError("Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("Successfully ported onnx file")

    # Bundle the model config from the .nemo archive together with the exported
    # ONNX graph into the .enemo tar
    with tarfile.open(nemo_file, "r") as archive:
        archive.extract("./model_config.yaml")

    with tarfile.open(enemo_file, "w") as enemo_archive:
        enemo_archive.add("./model_config.yaml")
        copyfile(onnx_file, "model_graph.onnx")
        enemo_archive.add("model_graph.onnx")
    os.remove("model_graph.onnx")  # cleanup extra file
def main(
    nemo_file, enemo_file, onnx_file, model_type='asr',
):
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError("Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("Successfully ported onnx file")

    with tarfile.open(nemo_file, 'r') as archive:
        archive.extract('./model_config.yaml')

    with tarfile.open(enemo_file, 'w') as enemo_archive:
        enemo_archive.add('./model_config.yaml')
        # Add the exported ONNX graph under the expected member name; add()
        # records the file's real size in the tar header
        enemo_archive.add(onnx_file, arcname="model_graph.onnx")
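# A minimal usage sketch for the packaging entry point above; the file names are
# placeholders. The result is an .enemo tar bundling the model config from the
# .nemo archive with the exported ONNX graph as "model_graph.onnx".
if __name__ == "__main__":
    main(
        nemo_file="vad_marblenet.nemo",    # placeholder: trained .nemo archive
        enemo_file="vad_marblenet.enemo",  # placeholder: output bundle
        onnx_file="vad_marblenet.onnx",    # placeholder: exported ONNX graph
        model_type="speech_label",
    )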
def speech_classification_model():
    preprocessor = {
        'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor',
        'params': dict({}),
    }
    encoder = {
        'cls': 'nemo.collections.asr.modules.ConvASREncoder',
        'params': {
            'feat_in': 64,
            'activation': 'relu',
            'conv_mask': True,
            'jasper': [
                {
                    'filters': 32,
                    'repeat': 1,
                    'kernel': [1],
                    'stride': [1],
                    'dilation': [1],
                    'dropout': 0.0,
                    'residual': False,
                    'separable': True,
                    'se': True,
                    'se_context_size': -1,
                }
            ],
        },
    }
    decoder = {
        'cls': 'nemo.collections.asr.modules.ConvASRDecoderClassification',
        'params': {
            'feat_in': 32,
            'num_classes': 30,
        },
    }

    modelConfig = DictConfig(
        {
            'preprocessor': DictConfig(preprocessor),
            'encoder': DictConfig(encoder),
            'decoder': DictConfig(decoder),
            'labels': ListConfig(["dummy_cls_{}".format(i + 1) for i in range(30)]),
        }
    )
    model = EncDecClassificationModel(cfg=modelConfig)
    return model
def test_constructor(self, speech_classification_model):
    asr_model = speech_classification_model.train()

    conv_cnt = (64 * 32 * 1 + 32) + (64 * 1 * 1 + 32)  # separable kernel + bias + pointwise kernel + bias
    bn_cnt = (4 * 32) * 2  # 2 * moving averages
    dec_cnt = 32 * 30 + 30  # fc + bias

    param_count = conv_cnt + bn_cnt + dec_cnt
    assert asr_model.num_weights == param_count

    # Check to/from config_dict:
    confdict = asr_model.to_config_dict()
    instance2 = EncDecClassificationModel.from_config_dict(confdict)
    assert isinstance(instance2, EncDecClassificationModel)
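# The expected parameter count in test_constructor, expanded step by step for
# the single separable Jasper block built by the fixture above (feat_in=64,
# filters=32, kernel=1). The mapping of terms to layers follows the test's own
# inline comments and is an interpretation, not stated elsewhere in this file:
#
#   conv_cnt = (64 * 32 * 1 + 32) + (64 * 1 * 1 + 32) = 2080 + 96 = 2176
#   bn_cnt   = (4 * 32) * 2                           = 256
#   dec_cnt  = 32 * 30 + 30                           = 990
#
#   param_count = 2176 + 256 + 990 = 3422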
def main(
    nemo_file, onnx_file, model_type='asr',
):
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError("Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("Successfully ported onnx file")
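# A quick sanity check for a graph exported by the function above, assuming the
# onnxruntime package is installed; the file name is a placeholder and the input
# names/shapes depend on the exported model.
import onnxruntime as ort  # assumed dependency

sess = ort.InferenceSession("vad_marblenet.onnx")  # placeholder path
print([(i.name, i.shape) for i in sess.get_inputs()])  # inspect expected inputs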
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--vad_model", type=str, default="MatchboxNet-VAD-3x2", required=False, help="Pass: '******'"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Path of json file of evaluation data. Audio files should have unique names.",
    )
    parser.add_argument("--out_dir", type=str, default="vad_frame", help="Dir of your vad outputs")
    parser.add_argument("--time_length", type=float, default=0.63)
    parser.add_argument("--shift_length", type=float, default=0.01)
    parser.add_argument("--normalize_audio", type=bool, default=False)
    parser.add_argument("--num_workers", type=int, default=20)
    parser.add_argument("--split_duration", type=float, default=400)
    parser.add_argument(
        "--dont_auto_split",
        default=False,
        action='store_true',
        help="Disable automatic splitting of manifest entries by split_duration (auto-splitting helps avoid potential CUDA out-of-memory issues).",
    )
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    if args.vad_model.endswith('.nemo'):
        logging.info(f"Using local VAD model from {args.vad_model}")
        vad_model = EncDecClassificationModel.restore_from(restore_path=args.vad_model)
    else:
        logging.info(f"Using NGC cloud VAD model {args.vad_model}")
        vad_model = EncDecClassificationModel.from_pretrained(model_name=args.vad_model)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    # Prepare manifest for streaming VAD
    manifest_vad_input = args.dataset
    if not args.dont_auto_split:
        logging.info("Split long audio file to avoid CUDA memory issue")
        logging.debug("Try smaller split_duration if you still have CUDA memory issue")
        config = {
            'manifest_filepath': manifest_vad_input,
            'time_length': args.time_length,
            'split_duration': args.split_duration,
            'num_workers': args.num_workers,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
        logging.warning(
            "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it."
        )

    # setup_test_data
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': manifest_vad_input,
            'labels': ['infer'],
            'num_workers': args.num_workers,
            'shuffle': False,
            'time_length': args.time_length,
            'shift_length': args.shift_length,
            'trim_silence': False,
            'normalize_audio': args.normalize_audio,
        }
    )

    vad_model = vad_model.to(device)
    vad_model.eval()

    time_unit = int(args.time_length / args.shift_length)
    trunc = int(time_unit / 2)
    trunc_l = time_unit - trunc
    all_len = 0

    data = []
    for line in open(args.dataset, 'r'):
        file = json.loads(line)['audio_filepath'].split("/")[-1]
        data.append(file.split(".wav")[0])
    logging.info(f"Inference on {len(data)} audio files/json lines!")

    status = get_vad_stream_status(data)
    for i, test_batch in enumerate(vad_model.test_dataloader()):
        test_batch = [x.to(device) for x in test_batch]
        with autocast():
            log_probs = vad_model(input_signal=test_batch[0], input_signal_length=test_batch[1])
            probs = torch.softmax(log_probs, dim=-1)
            pred = probs[:, 1]
            # Trim overlapping context at split boundaries so the concatenated
            # frame predictions line up with the original audio
            if status[i] == 'start':
                to_save = pred[:-trunc]
            elif status[i] == 'next':
                to_save = pred[trunc:-trunc_l]
            elif status[i] == 'end':
                to_save = pred[trunc_l:]
            else:
                to_save = pred

            all_len += len(to_save)
            outpath = os.path.join(args.out_dir, data[i] + ".frame")
            with open(outpath, "a") as fout:
                for f in range(len(to_save)):
                    fout.write('{0:0.4f}\n'.format(to_save[f]))
        del test_batch
        if status[i] == 'end' or status[i] == 'single':
            logging.debug(f"Overall length of prediction of {data[i]} is {all_len}!")
            all_len = 0
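# A sketch of the get_vad_stream_status helper used above, presumably imported
# from NeMo's VAD utilities (an assumption; this file does not show it). The
# logic is reconstructed from the inline per-batch version in the variant below:
# each dataloader batch carries one split of an audio file, and the status marks
# whether that split starts, continues, ends, or fully covers its file (split
# file names repeat across consecutive entries).
def get_vad_stream_status(data):
    if len(data) == 1:
        return ['single']
    status = []
    for i in range(len(data)):
        if i == 0:
            status.append('start' if data[i] == data[i + 1] else 'single')
        elif i == len(data) - 1:
            status.append('end' if data[i] == data[i - 1] else 'single')
        else:
            if data[i] != data[i - 1] and data[i] == data[i + 1]:
                status.append('start')
            elif data[i] == data[i - 1] and data[i] == data[i + 1]:
                status.append('next')
            elif data[i] == data[i - 1] and data[i] != data[i + 1]:
                status.append('end')
            else:
                status.append('single')
    return status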
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--vad_model", type=str, default="MatchboxNet-VAD-3x2", required=False, help="Pass: '******'"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Path of json file of evaluation data. Audio files should have unique names.",
    )
    parser.add_argument("--out_dir", type=str, default="vad_frame", help="Dir of your vad outputs")
    parser.add_argument("--time_length", type=float, default=0.63)
    parser.add_argument("--shift_length", type=float, default=0.01)
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    if args.vad_model.endswith('.nemo'):
        logging.info(f"Using local VAD model from {args.vad_model}")
        vad_model = EncDecClassificationModel.restore_from(restore_path=args.vad_model)
    else:
        logging.info(f"Using NGC cloud VAD model {args.vad_model}")
        vad_model = EncDecClassificationModel.from_pretrained(model_name=args.vad_model)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    # setup_test_data
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': ['infer'],
            'num_workers': 20,
            'shuffle': False,
            'time_length': args.time_length,
            'shift_length': args.shift_length,
            'trim_silence': False,
        }
    )

    vad_model = vad_model.to(device)
    vad_model.eval()

    data = []
    for line in open(args.dataset, 'r'):
        file = json.loads(line)['audio_filepath'].split("/")[-1]
        data.append(file.split(".wav")[0])
    print(f"Inference on {len(data)} audio files/json lines!")

    time_unit = int(args.time_length / args.shift_length)
    trunc = int(time_unit / 2)
    trunc_l = time_unit - trunc
    all_len = 0

    for i, test_batch in enumerate(vad_model.test_dataloader()):
        # Mark whether this batch starts, continues, ends, or fully covers its
        # audio file (split file names repeat across consecutive batches)
        if len(data) == 1:
            status = 'single'
        elif i == 0:
            status = 'start' if data[i] == data[i + 1] else 'single'
        elif i == len(data) - 1:
            status = 'end' if data[i] == data[i - 1] else 'single'
        else:
            if data[i] != data[i - 1] and data[i] == data[i + 1]:
                status = 'start'
            elif data[i] == data[i - 1] and data[i] == data[i + 1]:
                status = 'next'
            elif data[i] == data[i - 1] and data[i] != data[i + 1]:
                status = 'end'
            else:
                status = 'single'
        print(data[i], status)

        test_batch = [x.to(device) for x in test_batch]
        with autocast():
            log_probs = vad_model(input_signal=test_batch[0], input_signal_length=test_batch[1])
            probs = torch.softmax(log_probs, dim=-1)
            pred = probs[:, 1]
            # Trim overlapping context at split boundaries so the concatenated
            # frame predictions line up with the original audio
            if status == 'start':
                to_save = pred[:-trunc]
            elif status == 'next':
                to_save = pred[trunc:-trunc_l]
            elif status == 'end':
                to_save = pred[trunc_l:]
            else:
                to_save = pred

            all_len += len(to_save)
            outpath = os.path.join(args.out_dir, data[i] + ".frame")
            with open(outpath, "a") as fout:
                for f in range(len(to_save)):
                    fout.write('{0:0.4f}\n'.format(to_save[f]))
        del test_batch
        if status == 'end' or status == 'single':
            print(f"Overall length of prediction of {data[i]} is {all_len}!")
            all_len = 0
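# A small follow-on sketch: turning one of the per-file ".frame" outputs written
# above (one speech probability per line) into speech segments with a fixed
# threshold. The 0.01 s frame shift matches the scripts' default shift_length;
# the threshold value and the helper itself are illustrative assumptions, not
# taken from this file.
def frames_to_segments(frame_path, threshold=0.5, shift=0.01):
    probs = [float(line) for line in open(frame_path)]
    segments, start = [], None
    for i, p in enumerate(probs):
        if p >= threshold and start is None:
            start = i * shift  # speech onset
        elif p < threshold and start is not None:
            segments.append((start, i * shift))  # speech offset
            start = None
    if start is not None:
        segments.append((start, len(probs) * shift))  # file ends mid-speech
    return segments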