def main():
    """Fine-tune a pretrained speaker recognition model on new data.

    Loads a speaker model (local .nemo / .ckpt or NGC name), applies a
    fine-tuning YAML config, optionally freezes the encoder, and trains.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--pretrained_model",
        type=str,
        default="speakerrecognition_speakernet",
        required=False,
        help="Pass your trained .nemo model",
    )
    parser.add_argument(
        "--finetune_config_file",
        type=str,
        required=True,
        help="path to speakernet config yaml file to load train, validation dataset and also for trainer parameters",
    )
    parser.add_argument(
        "--freeze_encoder",
        # BUG FIX: argparse `type=bool` converts any non-empty string —
        # including "False" — to True. Parse the string value explicitly so
        # `--freeze_encoder False` actually disables freezing.
        type=lambda s: str(s).strip().lower() in ("true", "t", "1", "yes"),
        required=False,
        default=True,
        help="True if speakernet encoder parameters needs to be frozen while finetuning",
    )
    args = parser.parse_args()

    # Resolve the model source: local .nemo archive, raw Lightning
    # checkpoint, or a pretrained model name served from NGC.
    if args.pretrained_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=args.pretrained_model)
    elif args.pretrained_model.endswith('.ckpt'):
        logging.info(f"Using local speaker model from checkpoint {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.load_from_checkpoint(checkpoint_path=args.pretrained_model)
    else:
        logging.info("Using pretrained speaker recognition model from NGC")
        speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name=args.pretrained_model)

    finetune_config = OmegaConf.load(args.finetune_config_file)
    # Fine-tuning only uses train/validation sets; drop any configured test set.
    if 'test_ds' in finetune_config.model and finetune_config.model.test_ds is not None:
        finetune_config.model.test_ds = None
        logging.warning("Removing test ds")

    speaker_model.setup_finetune_model(finetune_config.model)
    finetune_trainer = pl.Trainer(**finetune_config.trainer)
    speaker_model.set_trainer(finetune_trainer)
    _ = exp_manager(finetune_trainer, finetune_config.get('exp_manager', None))
    speaker_model.setup_optimization(finetune_config.optim)

    # Optionally freeze the encoder so only the decoder/classifier head
    # is updated during fine-tuning.
    if args.freeze_encoder:
        for param in speaker_model.encoder.parameters():
            param.requires_grad = False

    finetune_trainer.fit(speaker_model)
def main(cfg):
    """Train a speaker label model from a Hydra config, save the result,
    then run an optional test pass on a fresh single-device trainer.
    """
    # NOTE(review): `cfg.pretty()` and the `gpus` Trainer argument are from
    # older OmegaConf/Lightning APIs — confirm the pinned versions support them.
    logging.info(f'Hydra config: {cfg.pretty()}')

    fit_trainer = pl.Trainer(**cfg.trainer)
    log_dir = exp_manager(fit_trainer, cfg.get("exp_manager", None))
    speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=fit_trainer)
    fit_trainer.fit(speaker_model)

    # Persist the trained weights one level above the experiment log dir.
    speaker_model.save_to(os.path.join(log_dir, '..', 'spkr.nemo'))

    has_test_manifest = hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None
    if has_test_manifest:
        # Evaluate on a dedicated trainer: one GPU if training used any, else CPU.
        eval_trainer = pl.Trainer(gpus=0 if cfg.trainer.gpus == 0 else 1)
        if speaker_model.prepare_test(eval_trainer):
            eval_trainer.test(speaker_model)
def speaker_label_model():
    """Build a minimal EncDecSpeakerLabelModel for testing: one Jasper-style
    conv block (512 filters) feeding an x-vector decoder with 2 classes.
    """
    pre_cfg = {
        'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor',
        'params': {},
    }
    enc_cfg = {
        'cls': 'nemo.collections.asr.modules.ConvASREncoder',
        'params': {
            'feat_in': 64,
            'activation': 'relu',
            'conv_mask': True,
            'jasper': [
                {
                    'filters': 512,
                    'repeat': 1,
                    'kernel': [1],
                    'stride': [1],
                    'dilation': [1],
                    'dropout': 0.0,
                    'residual': False,
                    'separable': False,
                }
            ],
        },
    }
    dec_cfg = {
        'cls': 'nemo.collections.asr.modules.SpeakerDecoder',
        'params': {'feat_in': 512, 'num_classes': 2, 'pool_mode': 'xvector', 'emb_sizes': [1024]},
    }
    model_cfg = DictConfig(
        {
            'preprocessor': DictConfig(pre_cfg),
            'encoder': DictConfig(enc_cfg),
            'decoder': DictConfig(dec_cfg),
        }
    )
    return EncDecSpeakerLabelModel(cfg=model_cfg)
def main(
    nemo_file, enemo_file, onnx_file, model_type="asr",
):
    """Restore a NeMo model, export it to ONNX, and bundle the model config
    plus the ONNX graph into an .enemo tar archive.
    """
    # Dispatch table: model_type -> (log message, restorer class).
    loaders = {
        "asr": ("Preparing ASR model", EncDecCTCModel),
        "speech_label": ("Preparing Speech Label Classification model", EncDecClassificationModel),
        "speaker": ("Preparing Speaker Recognition model", EncDecSpeakerLabelModel),
    }
    if model_type not in loaders:
        raise NameError(
            "Available model names are asr, speech_label and speaker")
    message, model_cls = loaders[model_type]
    logging.info(message)
    model = model_cls.restore_from(nemo_file)

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    # Pull the config out of the original .nemo tar ...
    with tarfile.open(nemo_file, "r") as archive:
        archive.extract("./model_config.yaml")

    # ... then repack it together with the exported graph.
    with tarfile.open(enemo_file, "w") as enemo_archive:
        enemo_archive.add("./model_config.yaml")
        copyfile(onnx_file, "model_graph.onnx")
        enemo_archive.add("model_graph.onnx")
        os.remove("model_graph.onnx")  # cleanup extra file
def main(cfg):
    """Train a speaker label model from a Hydra config; save the weights
    (skipped under fast_dev_run) and run an optional test pass.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    fit_trainer = pl.Trainer(**cfg.trainer)
    log_dir = exp_manager(fit_trainer, cfg.get("exp_manager", None))
    speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=fit_trainer)
    fit_trainer.fit(speaker_model)

    # fast_dev_run produces no meaningful checkpoint, so only persist
    # the model after a real training run.
    if not fit_trainer.fast_dev_run:
        speaker_model.save_to(os.path.join(log_dir, '..', 'spkr.nemo'))

    has_test_manifest = hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None
    if has_test_manifest:
        # Evaluate on a fresh single-device trainer matching the training accelerator.
        eval_trainer = pl.Trainer(devices=1, accelerator=cfg.trainer.accelerator)
        if speaker_model.prepare_test(eval_trainer):
            eval_trainer.test(speaker_model)
def main(
    nemo_file, enemo_file, onnx_file, model_type='asr',
):
    """Restore a NeMo model, export it to ONNX, and bundle the model config
    plus the ONNX graph into an .enemo tar archive.

    Args:
        nemo_file: path to the trained .nemo checkpoint (a tar archive).
        enemo_file: output path for the bundled .enemo tar.
        onnx_file: output path for the exported ONNX graph.
        model_type: one of 'asr', 'speech_label', 'speaker'.

    Raises:
        NameError: if model_type is not a supported model name.
    """
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError(
            "Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    # Pull the config out of the original .nemo tar ...
    with tarfile.open(nemo_file, 'r') as archive:
        archive.extract('./model_config.yaml')

    # ... then repack it together with the exported graph.
    with tarfile.open(enemo_file, 'w') as enemo_archive:
        enemo_archive.add('./model_config.yaml')
        # BUG FIX: the previous addfile(TarInfo(...), open(onnx_file)) wrote a
        # zero-byte member (TarInfo.size defaults to 0), read the ONNX file in
        # text mode, and leaked the file handle. TarFile.add() stats the file
        # and streams its bytes correctly.
        enemo_archive.add(onnx_file, arcname="model_graph.onnx")
def test_ecapa_enc_dec(self):
    """Smoke-test construction and config round-trip of an ECAPA-encoder
    EncDecSpeakerLabelModel with an attention-pooling decoder.
    """
    cfg = DictConfig(
        {
            'preprocessor': DictConfig(
                {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': {}}
            ),
            'encoder': DictConfig(
                {
                    'cls': 'nemo.collections.asr.modules.ECAPAEncoder',
                    'params': {
                        'feat_in': 80,
                        'filters': [4, 4, 4, 4, 3],
                        'kernel_sizes': [5, 3, 3, 3, 1],
                        'dilations': [1, 1, 1, 1, 1],
                        'scale': 2,
                    },
                }
            ),
            'decoder': DictConfig(
                {
                    'cls': 'nemo.collections.asr.modules.SpeakerDecoder',
                    'params': {'feat_in': 3, 'num_classes': 2, 'pool_mode': 'attention', 'emb_sizes': 192},
                }
            ),
        }
    )
    model = EncDecSpeakerLabelModel(cfg=cfg)
    model.train()
    # TODO: make proper config and assert correct number of weights

    # Round-trip through the config dict and rebuild the model.
    rebuilt = EncDecSpeakerLabelModel.from_config_dict(model.to_config_dict())
    self.assertTrue(isinstance(rebuilt, EncDecSpeakerLabelModel))
def main(cfg):
    """Demo training entry point: patches the Hydra config in place with
    hard-coded local manifest paths and reduced trainer settings, then
    trains a speaker model and saves it.
    """
    # NOTE(review): absolute /Users/... paths tie this script to one
    # developer's machine — consider moving them into the YAML config.
    # add paths to manifests to config
    cfg.model.train_ds.manifest_filepath = '/Users/xujinghua/speaker-verification-with-NeMo/data/train.json'
    # Validation reuses the training manifest (an4 test speakers differ; see below).
    cfg.model.validation_ds.manifest_filepath = '/Users/xujinghua/speaker-verification-with-NeMo/data/train.json'
    # an4 test files have a different set of speakers
    # cfg.model.test_ds.manifest_filepath = '/Users/xujinghua/NeMo/data/an4/wav/an4_clstk/dev.json'
    # Decoder output size must match the number of speakers in the manifest.
    cfg.model.decoder.num_classes = 74
    # Limit intra-op CPU threading for the demo.
    os.environ["OMP_NUM_THREADS"] = '1'
    # tutorial default setting: flags
    # modify some trainer configs for this demo
    # Checks if we have GPU available and uses it
    cuda = 1 if torch.cuda.is_available() else 0
    cfg.trainer.gpus = cuda
    # Reduces maximum number of epochs to 5 for quick demonstration
    cfg.trainer.max_epochs = 5
    # Remove distributed training flags
    cfg.trainer.accelerator = None
    # NOTE(review): cfg.pretty() is a deprecated OmegaConf API — confirm the
    # pinned omegaconf version still provides it.
    logging.info(f'Hydra config: {cfg.pretty()}')
    trainer = pl.Trainer(**cfg.trainer)
    log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
    speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer)
    trainer.fit(speaker_model)
    # Only save after a real run; fast_dev_run leaves nothing worth keeping.
    if not trainer.fast_dev_run:
        model_path = os.path.join(log_dir, '..', 'spkr.nemo')
        speaker_model.save_to(model_path)
    # no need for testing
    # NOTE(review): the triple-quote below appears to open a commented-out
    # block that continues beyond this excerpt — verify it is terminated.
    '''
def conformer_model():
    """Build an EncDecSpeakerLabelModel wired with a 2-layer ConformerEncoder
    and a 1024-class ConvASRDecoder, for testing.
    """
    pre_cfg = {
        'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor',
        'params': {},
    }
    enc_cfg = {
        'cls': 'nemo.collections.asr.modules.ConformerEncoder',
        'params': {
            'feat_in': 80,
            'feat_out': -1,
            'n_layers': 2,
            'd_model': 256,
            'subsampling': 'striding',
            'subsampling_factor': 4,
            'subsampling_conv_channels': 512,
            'ff_expansion_factor': 4,
            'self_attention_model': 'rel_pos',
            'n_heads': 8,
            'att_context_size': [-1, -1],
            'xscaling': True,
            'untie_biases': True,
            'pos_emb_max_len': 500,
            'conv_kernel_size': 31,
            'dropout': 0.1,
            'dropout_emb': 0.0,
            'dropout_att': 0.1,
        },
    }
    dec_cfg = {
        'cls': 'nemo.collections.asr.modules.ConvASRDecoder',
        'params': {
            'feat_in': 256,
            'num_classes': 1024,
            # 1024 pseudo-vocabulary entries cycling through 28 characters.
            'vocabulary': [chr(i % 28) for i in range(1024)],
        },
    }
    model_cfg = DictConfig(
        {
            'preprocessor': DictConfig(pre_cfg),
            'encoder': DictConfig(enc_cfg),
            'decoder': DictConfig(dec_cfg),
        }
    )
    return EncDecSpeakerLabelModel(cfg=model_cfg)
def main(
    nemo_file, onnx_file, model_type='asr',
):
    """Restore a NeMo model of the requested type and export it to ONNX."""
    # Dispatch table: model_type -> (log message, restorer class).
    restorers = {
        'asr': ("Preparing ASR model", EncDecCTCModel),
        'speech_label': ("Preparing Speech Label Classification model", EncDecClassificationModel),
        'speaker': ("Preparing Speaker Recognition model", EncDecSpeakerLabelModel),
    }
    if model_type not in restorers:
        raise NameError("Available model names are asr, speech_label and speaker")
    message, model_cls = restorers[model_type]
    logging.info(message)
    model = model_cls.restore_from(nemo_file)

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")
def test_constructor(self):
    """Smoke-test construction and config round-trip of a ConvASR-encoder
    EncDecSpeakerLabelModel with an x-vector decoder.
    """
    jasper_block = {
        'filters': 512,
        'repeat': 1,
        'kernel': [1],
        'stride': [1],
        'dilation': [1],
        'dropout': 0.0,
        'residual': False,
        'separable': False,
    }
    cfg = DictConfig(
        {
            'preprocessor': DictConfig(
                {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': {}}
            ),
            'encoder': DictConfig(
                {
                    'cls': 'nemo.collections.asr.modules.ConvASREncoder',
                    'params': {'feat_in': 64, 'activation': 'relu', 'conv_mask': True, 'jasper': [jasper_block]},
                }
            ),
            'decoder': DictConfig(
                {
                    'cls': 'nemo.collections.asr.modules.SpeakerDecoder',
                    'params': {'feat_in': 512, 'num_classes': 2, 'pool_mode': 'xvector', 'emb_sizes': [1024]},
                }
            ),
        }
    )
    model = EncDecSpeakerLabelModel(cfg=cfg)
    model.train()
    # TODO: make proper config and assert correct number of weights

    # Round-trip through the config dict and rebuild the model.
    rebuilt = EncDecSpeakerLabelModel.from_config_dict(model.to_config_dict())
    self.assertTrue(isinstance(rebuilt, EncDecSpeakerLabelModel))
def citrinet_model():
    """Build a Citrinet-style EncDecSpeakerLabelModel for testing: four
    SE-separable conv blocks feeding a 1024-class ConvASRDecoder.
    """
    stem_block = {
        'filters': 512, 'repeat': 1, 'kernel': [5], 'stride': [1], 'dilation': [1],
        'dropout': 0.0, 'residual': False, 'separable': True, 'se': True, 'se_context_size': -1,
    }
    downsample_block = {
        'filters': 512, 'repeat': 5, 'kernel': [11], 'stride': [2], 'dilation': [1],
        'dropout': 0.1, 'residual': True, 'separable': True, 'se': True, 'se_context_size': -1,
        'stride_last': True, 'residual_mode': 'stride_add',
    }
    body_block = {
        'filters': 512, 'repeat': 5, 'kernel': [13], 'stride': [1], 'dilation': [1],
        'dropout': 0.1, 'residual': True, 'separable': True, 'se': True, 'se_context_size': -1,
    }
    head_block = {
        'filters': 640, 'repeat': 1, 'kernel': [41], 'stride': [1], 'dilation': [1],
        'dropout': 0.0, 'residual': True, 'separable': True, 'se': True, 'se_context_size': -1,
    }
    pre_cfg = {
        'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor',
        'params': {},
    }
    enc_cfg = {
        'cls': 'nemo.collections.asr.modules.ConvASREncoder',
        'params': {
            'feat_in': 80,
            'activation': 'relu',
            'conv_mask': True,
            'jasper': [stem_block, downsample_block, body_block, head_block],
        },
    }
    dec_cfg = {
        'cls': 'nemo.collections.asr.modules.ConvASRDecoder',
        'params': {
            'feat_in': 640,
            'num_classes': 1024,
            # 1024 pseudo-vocabulary entries cycling through 28 characters.
            'vocabulary': [chr(i % 28) for i in range(1024)],
        },
    }
    model_cfg = DictConfig(
        {'preprocessor': DictConfig(pre_cfg), 'encoder': DictConfig(enc_cfg), 'decoder': DictConfig(dec_cfg)}
    )
    return EncDecSpeakerLabelModel(cfg=model_cfg)
def main(cfg):
    """Speaker identification inference.

    Labels each utterance in the test manifest with the most likely enrolled
    speaker — either by cosine similarity against per-speaker reference
    embeddings or with a neural classifier head — and writes the predictions
    into the output manifest as an 'infer' field.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    enrollment_manifest = cfg.data.enrollment_manifest
    test_manifest = cfg.data.test_manifest
    out_manifest = cfg.data.out_manifest
    sample_rate = cfg.data.sample_rate

    backend = cfg.backend.backend_model.lower()

    if backend == 'cosine_similarity':
        model_path = cfg.backend.cosine_similarity.model_path
        batch_size = cfg.backend.cosine_similarity.batch_size
        if model_path.endswith('.nemo'):
            speaker_model = EncDecSpeakerLabelModel.restore_from(model_path)
        else:
            speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_path)

        enroll_embs, _, enroll_truelabels, enroll_id2label = EncDecSpeakerLabelModel.get_batch_embeddings(
            speaker_model, enrollment_manifest, batch_size, sample_rate, device=device,
        )
        test_embs, _, _, _ = EncDecSpeakerLabelModel.get_batch_embeddings(
            speaker_model, test_manifest, batch_size, sample_rate, device=device,
        )

        # length normalize
        enroll_embs = enroll_embs / (np.linalg.norm(enroll_embs, ord=2, axis=-1, keepdims=True))
        test_embs = test_embs / (np.linalg.norm(test_embs, ord=2, axis=-1, keepdims=True))

        # Reference embedding: mean of each enrolled speaker's normalized embeddings.
        reference_embs = []
        keyslist = list(enroll_id2label.keys())
        for label_id in keyslist:
            # BUG FIX: np.where returns a TUPLE of index arrays, so the old
            # `/ len(indices)` always divided by 1 (the number of dimensions),
            # turning the intended per-speaker mean into a sum and skewing the
            # cosine scores toward speakers with more enrollment utterances.
            indices = np.where(enroll_truelabels == label_id)[0]
            embedding = enroll_embs[indices].sum(axis=0).squeeze() / len(indices)
            reference_embs.append(embedding)

        reference_embs = np.asarray(reference_embs)

        # Cosine scores (embeddings are unit-normalized) -> best speaker per test utterance.
        scores = np.matmul(test_embs, reference_embs.T)
        matched_labels = scores.argmax(axis=-1)

    elif backend == 'neural_classifier':
        model_path = cfg.backend.neural_classifier.model_path
        batch_size = cfg.backend.neural_classifier.batch_size
        if model_path.endswith('.nemo'):
            speaker_model = EncDecSpeakerLabelModel.restore_from(model_path)
        else:
            speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_path)

        # Recover the label ordering used at training time from the enrollment manifest.
        featurizer = WaveformFeaturizer(sample_rate=sample_rate)
        dataset = AudioToSpeechLabelDataset(
            manifest_filepath=enrollment_manifest, labels=None, featurizer=featurizer)
        enroll_id2label = dataset.id2label

        if speaker_model.decoder.final.out_features != len(enroll_id2label):
            raise ValueError(
                "number of labels mis match. Make sure you trained or finetuned neural classifier with labels from enrollement manifest_filepath"
            )

        _, test_logits, _, _ = EncDecSpeakerLabelModel.get_batch_embeddings(
            speaker_model, test_manifest, batch_size, sample_rate, device=device,
        )
        matched_labels = test_logits.argmax(axis=-1)

    # Write the input manifest back out with the inferred speaker label attached.
    with open(test_manifest, 'rb') as f1, open(out_manifest, 'w', encoding='utf-8') as f2:
        lines = f1.readlines()
        for idx, line in enumerate(lines):
            line = line.strip()
            item = json.loads(line)
            item['infer'] = enroll_id2label[matched_labels[idx]]
            json.dump(item, f2)
            f2.write('\n')

    logging.info(
        "Inference labels have been written to {} manifest file".format(
            out_manifest))
def main():
    """Run speaker-label inference with a trained .nemo model over a test
    manifest, writing predictions to a sibling *_infer.json manifest.

    The label set is reconstructed from the training manifest so predicted
    class indices map back to the original label strings.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--spkr_model",
        type=str,
        default="titanet_large",
        required=True,
        help="Pass your trained .nemo model",
    )
    parser.add_argument(
        "--train_manifest", type=str, required=True, help="path to train manifest file to match labels"
    )
    parser.add_argument(
        "--test_manifest", type=str, required=True, help="path to test manifest file to perform inference"
    )
    parser.add_argument("--batch_size", type=int, default=32)
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.spkr_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.spkr_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=args.spkr_model)
    else:
        logging.error("Please pass a trained .nemo file")
        sys.exit()

    # Build label <-> id maps from the training manifest.
    # (Renamed from `labels`, which the original reused for batch tensors.)
    train_labels = []
    with open(args.train_manifest, 'rb') as f:
        for line in f.readlines():
            item = json.loads(line.strip())
            train_labels.append(item['label'])
    labels_map = sorted(set(train_labels))
    label2id, id2label = {}, {}
    for label_id, label in enumerate(labels_map):
        label2id[label] = label_id
        id2label[label_id] = label

    speaker_model.setup_test_data(
        test_data_layer_params={
            'sample_rate': 16000,
            'manifest_filepath': args.test_manifest,
            'labels': labels_map,
            'batch_size': args.batch_size,
            'trim_silence': False,
            'shuffle': False,
        }
    )
    if can_gpu:
        speaker_model = speaker_model.cuda()
    speaker_model.eval()
    # NOTE: removed a stray `speaker_model.test_dataloader()` call whose
    # result was discarded — the loop below creates the dataloader it uses.

    all_labels = []
    all_logits = []
    for test_batch in tqdm(speaker_model.test_dataloader()):
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            audio_signal, audio_signal_len, batch_labels, _ = test_batch
            logits, _ = speaker_model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)
            all_logits.extend(logits.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    # (Dropped the unused `true_labels` array the original also built here.)
    all_logits = np.asarray(all_logits)
    infer_labels = all_logits.argmax(axis=1)

    # Write predictions next to the test manifest as <name>_infer.json.
    out_manifest = os.path.basename(args.test_manifest).split('.')[0] + '_infer.json'
    out_manifest = os.path.join(os.path.dirname(args.test_manifest), out_manifest)
    with open(args.test_manifest, 'rb') as f1, open(out_manifest, 'w', encoding='utf-8') as f2:
        for idx, line in enumerate(f1.readlines()):
            item = json.loads(line.strip())
            item['infer'] = id2label[infer_labels[idx]]
            json.dump(item, f2)
            f2.write('\n')

    logging.info("Inference labels have been written to {} manifest file".format(out_manifest))