def main(args):
    """Build the evaluation data pipeline and pass it to ``eval``.

    Args:
        args: parsed command-line namespace. This function reads ``seed``,
            ``fp16``, ``model_toml``, ``val_manifest``, ``max_duration``,
            ``pad_to``, ``dataset_dir`` and ``batch_size``; the namespace is
            also forwarded unchanged to ``eval``.
    """
    # Seed every random number generator used here.
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(args.seed)

    precision = Optimization.mxprO3 if args.fp16 else Optimization.mxprO0

    config = toml.load(args.model_toml)
    vocab = config['labels']['labels']
    # The extended vocabulary is not used further down in this function;
    # the call is retained so behaviour stays exactly as before.
    ctc_vocab = add_blank_label(vocab)

    # Command-line values override the feature settings from the TOML file.
    feat_cfg = config['input_eval']
    feat_cfg["optimization_level"] = precision
    if args.max_duration is not None:
        feat_cfg['max_duration'] = args.max_duration
    if args.pad_to is not None:
        # A negative --pad_to selects the "max" padding mode.
        feat_cfg['pad_to'] = "max" if args.pad_to < 0 else args.pad_to

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=feat_cfg,
        manifest_filepath=args.val_manifest,
        labels=vocab,
        batch_size=args.batch_size,
        pad_to_max=feat_cfg['pad_to'] == "max",
        shuffle=False,
        multi_gpu=False)

    audio_preprocessor = AudioPreprocessing(**feat_cfg)
    audio_preprocessor.eval()

    def featurize(batch):
        # Preprocess the first two batch items, pass the rest through.
        return [*audio_preprocessor(batch[0:2]), *batch[2:]]

    def move_last_axis_first(batch):
        # Reorder the axes of the first item: (0, 1, 2) -> (2, 0, 1).
        return [batch[0].permute(2, 0, 1), *batch[1:]]

    eval_transforms = torchvision.transforms.Compose(
        [featurize, move_last_axis_first])

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        args=args)
def main(args):
    """Assemble the RNN-T model, data pipeline and decoder, then run ``eval``.

    Args:
        args: parsed command-line namespace. Reads ``seed``,
            ``cudnn_benchmark``, ``cuda``, ``model_toml``, ``val_manifest``,
            ``pad_to``, ``dataset_dir``, ``batch_size``, ``ckpt`` and
            ``mode``; the namespace is also forwarded to ``eval``.
    """
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    use_cuda = args.cuda
    if use_cuda:
        assert(torch.cuda.is_available())

    config = toml.load(args.model_toml)
    vocab = config['labels']['labels']
    ctc_vocab = add_blank_label(vocab)

    feat_cfg = config['input_eval']
    if args.pad_to is not None:
        # A negative --pad_to selects the "max" padding mode.
        feat_cfg['pad_to'] = "max" if args.pad_to < 0 else args.pad_to

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=feat_cfg,
        manifest_filepath=args.val_manifest,
        labels=vocab,
        batch_size=args.batch_size,
        pad_to_max=feat_cfg['pad_to'] == "max",
        shuffle=False,
        sampler='bucket',  # sort by duration
    )
    audio_preprocessor = AudioPreprocessing(**feat_cfg)

    model = RNNT(
        feature_config=feat_cfg,
        rnnt=config['rnnt'],
        num_classes=len(ctc_vocab),
    )

    # Weights are only restored when a checkpoint is given AND mode is 3.
    if args.ckpt is not None and args.mode == 3:
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    audio_preprocessor.featurizer.normalize = "per_feature"

    if use_cuda:
        audio_preprocessor.cuda()
    audio_preprocessor.eval()

    def to_gpu(batch):
        # Only the first two batch items are moved to the GPU.
        return [batch[0].cuda(), batch[1].cuda(), *batch[2:]]

    def featurize(batch):
        return [*audio_preprocessor(batch[0:2]), *batch[2:]]

    def to_time_major(batch):
        # These are just some very confusing transposes, that's all.
        # BxFxT -> TxBxF
        return [batch[0].permute(2, 0, 1), *batch[1:]]

    pipeline = [to_gpu] if use_cuda else []
    pipeline += [featurize, to_time_major]
    eval_transforms = torchvision.transforms.Compose(pipeline)

    if use_cuda:
        model.cuda()

    # Ideally, I would jit this as well... But this is just the constructor...
    greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model)

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args)
def get_pytorch_components_and_onnx(args):
    '''Returns PyTorch components used for inference

    Args:
        args: parsed command-line namespace. Reads ``model_toml``,
            ``pyt_fp16``, ``max_duration``, ``dataset_dir``, ``val_manifest``,
            ``batch_size``, ``wav``, ``seq_len``, ``transpose``,
            ``ckpt_path``, ``engine_path``, ``make_onnx`` and ``onnx_path``.
            ``batch_size`` and ``seq_len`` may be overwritten when ``wav``
            is given.

    Returns:
        A ``(components, onnx_path)`` tuple. ``components`` is a dict with
        the keys ``'data_layer'``, ``'audio_preprocessor'``,
        ``'acoustic_model'`` and ``'input_wav'`` (a ``(wav, seq_len)``
        pair); ``onnx_path`` is ``None`` unless ``--make_onnx`` was used.

    Raises:
        Exception: if ``--make_onnx`` is set without both ``--onnx_path``
            and ``--ckpt_path``.
    '''
    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']

    # Set up global labels for future vocab calls
    global _global_ctc_labels
    _global_ctc_labels = add_ctc_labels(dataset_vocab)

    featurizer_config = model_definition['input_eval']
    optim_level = 3 if args.pyt_fp16 else 0
    featurizer_config["optimization_level"] = optim_level

    onnx_path = None
    data_layer = None
    wav = None
    seq_len = None

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration

    if args.dataset_dir is not None:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=args.val_manifest,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            shuffle=False)

    if args.wav is not None:
        # A single wav file is always processed as a batch of one.
        args.batch_size = 1
        wav, seq_len = audio_from_file(args.wav)
        if args.seq_len is None or args.seq_len == 0:
            # NOTE(review): this division yields a float, not an int --
            # confirm that get_onnx and downstream users accept that.
            args.seq_len = seq_len / (featurizer_config['sample_rate'] / 100)

    if args.transpose:
        featurizer_config["transpose_out"] = True
        model_definition["transpose_in"] = True

    model = JasperEncoderDecoder(
        jasper_model_definition=model_definition,
        feat_in=1024,
        num_classes=len(get_vocab()),
        transpose_in=args.transpose)
    model = model.cuda()
    model.eval()

    audio_preprocessor = AudioPreprocessing(**featurizer_config)
    audio_preprocessor = audio_preprocessor.cuda()
    audio_preprocessor.eval()

    if args.ckpt_path is not None:
        if os.path.isdir(args.ckpt_path):
            # A directory holds separate encoder / decoder state dicts.
            d_checkpoint = torch.load(
                os.path.join(args.ckpt_path, "decoder.pt"), map_location="cpu")
            e_checkpoint = torch.load(
                os.path.join(args.ckpt_path, "encoder.pt"), map_location="cpu")
            model.jasper_encoder.load_state_dict(e_checkpoint, strict=False)
            model.jasper_decoder.load_state_dict(d_checkpoint, strict=False)
        else:
            checkpoint = torch.load(args.ckpt_path, map_location="cpu")
            model.load_state_dict(checkpoint['state_dict'], strict=False)

    # if we are to produce engine, not run/create ONNX, postpone AMP
    # initialization (ONNX parser cannot handle mixed FP16 ONNX yet)
    if args.pyt_fp16 and args.engine_path is None:
        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])

    if args.make_onnx:
        if args.onnx_path is None or args.ckpt_path is None:
            raise Exception(
                "--ckpt_path, --onnx_path must be provided when using --make_onnx"
            )
        onnx_path = get_onnx(args.onnx_path, model, args)

    if args.pyt_fp16 and args.engine_path is not None:
        amp.initialize(models=model, opt_level=AmpOptimizations[optim_level])

    return {
        'data_layer': data_layer,
        'audio_preprocessor': audio_preprocessor,
        'acoustic_model': model,
        'input_wav': (wav, seq_len)
    }, onnx_path
def main(args):
    """Prepare data, preprocessing and a JasperEncoderDecoder, then call ``eval``.

    A CUDA device is required. When ``args.steps`` is given it must be
    greater than 5.

    Args:
        args: parsed command-line namespace. Reads ``seed``,
            ``cudnn_benchmark``, ``steps``, ``fp16``, ``batch_size``,
            ``model_toml``, ``val_manifest``, ``max_duration``, ``pad_to``,
            ``dataset_dir`` and ``ckpt``; also forwarded to ``eval``.
    """
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    assert (args.steps is None or args.steps > 5)
    print("CUDNN BENCHMARK ", args.cudnn_benchmark)
    assert (torch.cuda.is_available())

    optim_level = Optimization.mxprO3 if args.fp16 else Optimization.mxprO0

    model_cfg = toml.load(args.model_toml)
    vocab = model_cfg['labels']['labels']
    ctc_vocab = add_ctc_labels(vocab)

    # Command-line values override the feature settings from the TOML file.
    feat_cfg = model_cfg['input_eval']
    feat_cfg["optimization_level"] = optim_level
    if args.max_duration is not None:
        feat_cfg['max_duration'] = args.max_duration
    if args.pad_to is not None:
        # A negative --pad_to selects the "max" padding mode.
        feat_cfg['pad_to'] = "max" if args.pad_to < 0 else args.pad_to

    print('model_config')
    print_dict(model_cfg)
    print('feature_config')
    print_dict(feat_cfg)

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=feat_cfg,
        manifest_filepath=args.val_manifest,
        labels=vocab,
        batch_size=args.batch_size,
        pad_to_max=feat_cfg['pad_to'] == "max",
        shuffle=False,
        multi_gpu=False)
    audio_preprocessor = AudioPreprocessing(**feat_cfg)
    encoderdecoder = JasperEncoderDecoder(
        jasper_model_definition=model_cfg,
        feat_in=1024,
        num_classes=len(ctc_vocab))

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        state = checkpoint['state_dict']
        # Strip the "audio_preprocessor." prefix so the preprocessor can
        # load its own entries from the shared state dict.
        for key in audio_preprocessor.state_dict().keys():
            state[key] = state.pop("audio_preprocessor." + key)
        audio_preprocessor.load_state_dict(state, strict=False)
        encoderdecoder.load_state_dict(state, strict=False)

    greedy_decoder = GreedyCTCDecoder()

    N = len(data_layer)
    step_per_epoch = math.ceil(N / args.batch_size)
    if args.steps is None:
        n_examples, n_steps = N, step_per_epoch
    else:
        n_examples, n_steps = args.steps * args.batch_size, args.steps

    print('-----------------')
    print('Have {0} examples to eval on.'.format(n_examples))
    print('Have {0} steps / (gpu * epoch).'.format(n_steps))
    print('-----------------')

    audio_preprocessor.cuda()
    encoderdecoder.cuda()
    if args.fp16:
        encoderdecoder = amp.initialize(
            models=encoderdecoder,
            opt_level=AmpOptimizations[optim_level])

    eval(data_layer=data_layer,
         audio_processor=audio_preprocessor,
         encoderdecoder=encoderdecoder,
         greedy_decoder=greedy_decoder,
         labels=ctc_vocab,
         args=args)
def main(args):
    """Set up Jasper training (data, model, loss, optimizer) and call ``train``.

    Args:
        args: parsed command-line namespace. Reads ``seed``, ``cudnn``,
            ``local_rank``, ``fp16``, ``model_toml``, ``train_manifest``,
            ``val_manifest``, ``pad_to_max``, ``max_duration``,
            ``gradient_accumulation_steps``, ``batch_size``, ``dataset_dir``,
            ``ckpt``, ``lr``, ``num_epochs``, ``optimizer_kind``,
            ``weight_decay`` and ``lr_decay``. Sets ``args.start_epoch`` and
            ``args.step_per_epoch`` before forwarding the namespace to
            ``train``.

    Raises:
        ValueError: for an invalid gradient accumulation setting, an
            unsupported sampler type in the feature config, or an unknown
            ``optimizer_kind``.
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    assert(torch.cuda.is_available())
    torch.backends.cudnn.benchmark = args.cudnn

    # set up distributed training
    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    multi_gpu = torch.distributed.is_initialized()
    if multi_gpu:
        print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size()))

    # define amp optimization level
    optim_level = Optimization.mxprO1 if args.fp16 else Optimization.mxprO0

    jasper_model_definition = toml.load(args.model_toml)
    dataset_vocab = jasper_model_definition['labels']['labels']
    ctc_vocab = add_ctc_labels(dataset_vocab)

    featurizer_config = jasper_model_definition['input']
    featurizer_config_eval = jasper_model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config_eval["optimization_level"] = optim_level

    sampler_type = featurizer_config.get("sampler", 'default')
    perturb_config = jasper_model_definition.get('perturb', None)
    if args.pad_to_max:
        assert(args.max_duration > 0)
        featurizer_config['max_duration'] = args.max_duration
        featurizer_config_eval['max_duration'] = args.max_duration
        featurizer_config['pad_to'] = "max"
        featurizer_config_eval['pad_to'] = "max"

    print_once('model_config')
    print_dict(jasper_model_definition)

    if args.gradient_accumulation_steps < 1:
        raise ValueError('Invalid gradient accumulation steps parameter {}'.format(args.gradient_accumulation_steps))
    if args.batch_size % args.gradient_accumulation_steps != 0:
        # The batch is split into equal micro-batches, so the batch size
        # must be a multiple of the number of accumulation steps.
        raise ValueError('batch size {} is not divisible by gradient accumulation steps {}'.format(args.batch_size, args.gradient_accumulation_steps))

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        perturb_config=perturb_config,
        manifest_filepath=args.train_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size // args.gradient_accumulation_steps,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max,
        sampler=sampler_type)

    data_layer_eval = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config_eval,
        manifest_filepath=args.val_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max
    )

    model = Jasper(feature_config=featurizer_config,
                   jasper_model_definition=jasper_model_definition,
                   feat_in=1024,
                   num_classes=len(ctc_vocab))

    if args.ckpt is not None:
        print_once("loading model from {}".format(args.ckpt))
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        args.start_epoch = checkpoint['epoch']
    else:
        args.start_epoch = 0

    ctc_loss = CTCLossNM(num_classes=len(ctc_vocab))
    greedy_decoder = GreedyCTCDecoder()

    print_once("Number of parameters in encoder: {0}".format(model.jasper_encoder.num_weights()))
    print_once("Number of parameters in decode: {0}".format(model.jasper_decoder.num_weights()))

    N = len(data_layer)
    if sampler_type == 'default':
        world_size = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
        args.step_per_epoch = math.ceil(N / (args.batch_size * world_size))
    elif sampler_type == 'bucket':
        args.step_per_epoch = int(len(data_layer.sampler) / args.batch_size)
    else:
        # Fail here with a clear message instead of hitting an
        # AttributeError on args.step_per_epoch further down.
        raise ValueError("unsupported sampler type: {}".format(sampler_type))

    print_once('-----------------')
    print_once('Have {0} examples to train on.'.format(N))
    print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch))
    print_once('-----------------')

    fn_lr_policy = lambda s: lr_policy(args.lr, s, args.num_epochs * args.step_per_epoch)

    model.cuda()

    if args.optimizer_kind == "novograd":
        optimizer = Novograd(model.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    elif args.optimizer_kind == "adam":
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    else:
        raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind))

    if optim_level in AmpOptimizations:
        model, optimizer = amp.initialize(
            #lnw block for error
            #min_loss_scale=1.0,
            models=model,
            optimizers=optimizer,
            opt_level=AmpOptimizations[optim_level])

    # Optimizer state is restored after the (optional) amp wrapping above.
    if args.ckpt is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])

    model = model_multi_gpu(model, multi_gpu)

    train(
        data_layer=data_layer,
        data_layer_eval=data_layer_eval,
        model=model,
        ctc_loss=ctc_loss,
        greedy_decoder=greedy_decoder,
        optimizer=optimizer,
        labels=ctc_vocab,
        optim_level=optim_level,
        multi_gpu=multi_gpu,
        fn_lr_policy=fn_lr_policy if args.lr_decay else None,
        args=args)
def main(args):
    """Prepare an RNN-T model for CPU inference (optionally via IPEX) and run ``eval``.

    Args:
        args: parsed command-line namespace. Reads ``seed``,
            ``cudnn_benchmark``, ``local_rank``, ``fp16``, ``model_toml``,
            ``val_manifest``, ``max_duration``, ``pad_to``, ``wav``,
            ``dataset_dir``, ``sort_by_duration``, ``batch_size``, ``ckpt``,
            ``ipex``, ``mix_precision``, ``jit``, ``steps`` and ``warm_up``;
            also forwarded to ``eval``.
    """
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    multi_gpu = args.local_rank is not None
    if multi_gpu:
        print("DISTRIBUTED with ", torch.distributed.get_world_size())

    optim_level = Optimization.mxprO3 if args.fp16 else Optimization.mxprO0

    model_cfg = toml.load(args.model_toml)
    vocab = model_cfg['labels']['labels']
    ctc_vocab = add_blank_label(vocab)

    # Command-line values override the feature settings from the TOML file.
    feat_cfg = model_cfg['input_eval']
    feat_cfg["optimization_level"] = optim_level
    if args.max_duration is not None:
        feat_cfg['max_duration'] = args.max_duration
    if args.pad_to is not None:
        # A negative --pad_to selects the "max" padding mode.
        feat_cfg['pad_to'] = "max" if args.pad_to < 0 else args.pad_to

    print('model_config')
    print_dict(model_cfg)
    print('feature_config')
    print_dict(feat_cfg)

    # No dataset is built when a single wav file is given.
    evaluating_dataset = args.wav is None
    data_layer = None
    if evaluating_dataset:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=feat_cfg,
            manifest_filepath=args.val_manifest,
            # sampler='bucket',
            sort_by_duration=args.sort_by_duration,
            labels=vocab,
            batch_size=args.batch_size,
            pad_to_max=feat_cfg['pad_to'] == "max",
            shuffle=False,
            multi_gpu=multi_gpu)
    audio_preprocessor = AudioPreprocessing(**feat_cfg)

    model = RNNT(
        feature_config=feat_cfg,
        rnnt=model_cfg['rnnt'],
        num_classes=len(ctc_vocab),
    )

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    if args.ipex:
        import intel_extension_for_pytorch as ipex
        from rnn import IPEXStackTime

        model.joint_net.eval()
        data_type = torch.bfloat16 if args.mix_precision else torch.float32
        stack_factor = model.encoder["stack_time"].factor
        if stack_factor == 2:
            model.encoder["stack_time"] = IPEXStackTime(stack_factor)
        model.joint_net = ipex.optimize(
            model.joint_net, dtype=data_type, auto_kernel_selection=True)
        model.prediction["embed"] = model.prediction["embed"].to(data_type)

        if args.jit:
            print("running jit path")
            model.joint_net.eval()
            rnnt_cfg = model_cfg['rnnt']

            def trace_joint_net():
                # The example input is created inside the active
                # no_grad/autocast context, as in the traced call itself.
                example = torch.randn(
                    args.batch_size, 1, 1,
                    rnnt_cfg['encoder_n_hidden'] + rnnt_cfg['pred_n_hidden'])
                return torch.jit.trace(
                    model.joint_net, example, check_trace=False)

            if args.mix_precision:
                with torch.cpu.amp.autocast(), torch.no_grad():
                    model.joint_net = trace_joint_net()
            else:
                with torch.no_grad():
                    model.joint_net = trace_joint_net()
            model.joint_net = torch.jit.freeze(model.joint_net)
    else:
        model = model.to("cpu")

    if evaluating_dataset:
        N = len(data_layer)
        world_size = (torch.distributed.get_world_size()
                      if torch.distributed.is_initialized() else 1)
        step_per_epoch = math.ceil(N / (args.batch_size * world_size))
        if args.steps is not None:
            n_examples = args.steps * args.batch_size * world_size
            n_measure = args.steps
        else:
            n_examples = N
            n_measure = step_per_epoch
        print('-----------------')
        print('Have {0} examples to eval on.'.format(n_examples))
        print('Have {0} warm up steps / (gpu * epoch).'.format(args.warm_up))
        print('Have {0} measure steps / (gpu * epoch).'.format(n_measure))
        print('-----------------')
    else:
        audio_preprocessor.featurizer.normalize = "per_feature"

    print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)
    audio_preprocessor.eval()

    def to_cpu(batch):
        return [item.cpu() for item in batch]

    def featurize(batch):
        # Preprocess the first two batch items, pass the rest through.
        return [*audio_preprocessor(batch[0:2]), *batch[2:]]

    def move_last_axis_first(batch):
        # Reorder the axes of the first item: (0, 1, 2) -> (2, 0, 1).
        return [batch[0].permute(2, 0, 1), *batch[1:]]

    eval_transforms = torchvision.transforms.Compose(
        [to_cpu, featurize, move_last_axis_first])

    model.eval()
    if args.ipex:
        ipex.nn.utils._model_convert.replace_lstm_with_ipex_lstm(model)

    greedy_decoder = RNNTGreedyDecoder(
        len(ctc_vocab) - 1, model.module if multi_gpu else model)

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        encoderdecoder=model,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        multi_gpu=multi_gpu)
def main(args):
    """Set up Jasper inference (GPU by default, CPU with ``cpu_run``) and call ``eval``.

    Args:
        args: parsed command-line namespace. Reads ``seed``,
            ``cudnn_benchmark``, ``cpu_run``, ``local_rank``, ``fp16``,
            ``model_toml``, ``val_manifest``, ``masked_fill``,
            ``max_duration``, ``pad_to``, ``wav``, ``dataset_dir``,
            ``batch_size``, ``ckpt`` and ``steps``. Sets
            ``args.use_conv_mask`` and forwards the namespace to ``eval``.

    Raises:
        SystemExit: with a non-zero status if ``args.ckpt`` is a directory,
            which this function cannot load.
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    print("CUDNN BENCHMARK ", args.cudnn_benchmark)

    if not args.cpu_run:
        assert(torch.cuda.is_available())

    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    multi_gpu = args.local_rank is not None
    if multi_gpu:
        print("DISTRIBUTED with ", torch.distributed.get_world_size())

    optim_level = 3 if args.fp16 else 0

    jasper_model_definition = toml.load(args.model_toml)
    dataset_vocab = jasper_model_definition['labels']['labels']
    ctc_vocab = add_ctc_labels(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = jasper_model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config["fp16"] = args.fp16
    args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)

    if args.masked_fill is not None:
        print("{} masked_fill".format("Enabling" if args.masked_fill else "Disabling"))
        # NOTE(review): this writes "conv_mask" while the read above uses
        # "convmask" -- confirm which key the encoder actually consumes.
        jasper_model_definition["encoder"]["conv_mask"] = args.masked_fill

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to
    # "max" padding is represented by -1 from here on.
    if featurizer_config['pad_to'] == "max":
        featurizer_config['pad_to'] = -1

    print('=== model_config ===')
    print_dict(jasper_model_definition)
    print()
    print('=== feature_config ===')
    print_dict(featurizer_config)
    print()

    # No dataset is built when a single wav file is given.
    data_layer = None
    if args.wav is None:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            pad_to_max=featurizer_config['pad_to'] == -1,
            shuffle=False,
            multi_gpu=multi_gpu)
    audio_preprocessor = AudioPreprocessing(**featurizer_config)
    encoderdecoder = JasperEncoderDecoder(
        jasper_model_definition=jasper_model_definition,
        feat_in=1024,
        num_classes=len(ctc_vocab))

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        if os.path.isdir(args.ckpt):
            # Directory checkpoints cannot be loaded here. Stop with an
            # explanation and a failing status rather than a silent
            # "successful" exit that evaluated nothing.
            raise SystemExit(
                "checkpoint directories are not supported: {}".format(args.ckpt))

        checkpoint = torch.load(args.ckpt, map_location="cpu")
        state_dict = checkpoint['state_dict']
        # Strip the "audio_preprocessor." prefix so the preprocessor can
        # load its own entries from the shared state dict.
        for k in audio_preprocessor.state_dict().keys():
            state_dict[k] = state_dict.pop("audio_preprocessor." + k)
        audio_preprocessor.load_state_dict(state_dict, strict=False)
        encoderdecoder.load_state_dict(state_dict, strict=False)

    greedy_decoder = GreedyCTCDecoder()

    if args.wav is None:
        N = len(data_layer)
        world_size = (torch.distributed.get_world_size()
                      if torch.distributed.is_initialized() else 1)
        step_per_epoch = math.ceil(N / (args.batch_size * world_size))

        print('-----------------')
        if args.steps is not None:
            print('Have {0} examples to eval on.'.format(args.steps * args.batch_size * world_size))
            print('Have {0} steps / (gpu * epoch).'.format(args.steps))
        else:
            print('Have {0} examples to eval on.'.format(N))
            print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch))
        print('-----------------')

    print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)

    if not args.cpu_run:
        audio_preprocessor.cuda()
        encoderdecoder.cuda()
    if args.fp16:
        encoderdecoder = amp.initialize(
            models=encoderdecoder,
            opt_level=AmpOptimizations[optim_level])

    encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)
    audio_preprocessor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    eval(
        data_layer=data_layer,
        audio_processor=audio_preprocessor,
        encoderdecoder=encoderdecoder,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        multi_gpu=multi_gpu)
def get_pytorch_components_and_onnx(args):
    '''Returns PyTorch components used for inference

    Args:
        args: parsed command-line namespace. Reads ``model_toml``,
            ``pyt_fp16``, ``max_duration``, ``dataset_dir``, ``val_manifest``,
            ``batch_size``, ``wav``, ``seq_len``, ``ckpt_path``,
            ``make_onnx``, ``onnx_path`` and ``engine_batch_size``.
            ``batch_size``, ``engine_batch_size`` and ``seq_len`` may be
            overwritten when ``wav`` is given.

    Returns:
        A ``(components, onnx_path)`` tuple. ``components`` is a dict with
        the keys ``'data_layer'``, ``'audio_preprocessor'``,
        ``'acoustic_model'`` and ``'input_wav'`` (a ``(wav, seq_len)``
        pair); ``onnx_path`` is ``None`` unless ``--make_onnx`` was used.

    Raises:
        Exception: if ``--make_onnx`` is set without both ``--onnx_path``
            and ``--ckpt_path``.
    '''
    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']

    # Set up global labels for future vocab calls
    global _global_ctc_labels
    _global_ctc_labels = add_ctc_labels(dataset_vocab)

    featurizer_config = model_definition['input_eval']
    optim_level = Optimization.mxprO3 if args.pyt_fp16 else Optimization.mxprO0
    featurizer_config["optimization_level"] = optim_level

    onnx_path = None
    data_layer = None
    wav = None
    seq_len = None

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration

    if args.dataset_dir is not None:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=args.val_manifest,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            shuffle=False)

    if args.wav is not None:
        # A single wav file is always processed as a batch of one.
        args.batch_size = 1
        args.engine_batch_size = 1
        wav, seq_len = audio_from_file(args.wav)
        if args.seq_len is None or args.seq_len == 0:
            # The value is used as a shape dimension below, so it has to be
            # an integer; plain division would leave a float here.
            args.seq_len = int(seq_len / (featurizer_config['sample_rate'] / 100))

    model = Jasper(feature_config=featurizer_config,
                   jasper_model_definition=model_definition,
                   feat_in=1024,
                   num_classes=len(get_vocab()))
    model.cuda()
    model.eval()
    acoustic_model = model.acoustic_model
    audio_preprocessor = model.audio_preprocessor

    if args.ckpt_path is not None:
        checkpoint = torch.load(args.ckpt_path, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    if args.make_onnx:
        # Enforce what the message states: both paths are required.
        if args.onnx_path is None or args.ckpt_path is None:
            raise Exception(
                "--ckpt_path, --onnx_path must be provided when using --make_onnx"
            )
        onnx_path = get_onnx(
            args.onnx_path,
            acoustic_model,
            signal_shape=(args.engine_batch_size, 64, args.seq_len),
            dtype=torch.float)

    # AMP is initialised only after the (FP32) ONNX export above.
    if args.pyt_fp16:
        amp.initialize(models=acoustic_model,
                       opt_level=AmpOptimizations[optim_level])

    return {
        'data_layer': data_layer,
        'audio_preprocessor': audio_preprocessor,
        'acoustic_model': acoustic_model,
        'input_wav': (wav, seq_len)
    }, onnx_path
def main(args):
    """Prepare an RNN-T model for GPU inference and run ``eval``.

    A CUDA device is required.

    Args:
        args: parsed command-line namespace. Reads ``seed``,
            ``cudnn_benchmark``, ``local_rank``, ``fp16``, ``model_toml``,
            ``val_manifest``, ``max_duration``, ``pad_to``, ``wav``,
            ``dataset_dir``, ``batch_size``, ``ckpt`` and ``steps``; also
            forwarded to ``eval``.
    """
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    print("CUDNN BENCHMARK ", args.cudnn_benchmark)
    assert(torch.cuda.is_available())

    multi_gpu = args.local_rank is not None
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        print("DISTRIBUTED with ", torch.distributed.get_world_size())

    optim_level = Optimization.mxprO3 if args.fp16 else Optimization.mxprO0

    model_cfg = toml.load(args.model_toml)
    vocab = model_cfg['labels']['labels']
    ctc_vocab = add_blank_label(vocab)

    # Command-line values override the feature settings from the TOML file.
    feat_cfg = model_cfg['input_eval']
    feat_cfg["optimization_level"] = optim_level
    if args.max_duration is not None:
        feat_cfg['max_duration'] = args.max_duration
    if args.pad_to is not None:
        # A negative --pad_to selects the "max" padding mode.
        feat_cfg['pad_to'] = "max" if args.pad_to < 0 else args.pad_to

    print('model_config')
    print_dict(model_cfg)
    print('feature_config')
    print_dict(feat_cfg)

    # No dataset is built when a single wav file is given.
    evaluating_dataset = args.wav is None
    data_layer = None
    if evaluating_dataset:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=feat_cfg,
            manifest_filepath=args.val_manifest,
            labels=vocab,
            batch_size=args.batch_size,
            pad_to_max=feat_cfg['pad_to'] == "max",
            shuffle=False,
            multi_gpu=multi_gpu)
    audio_preprocessor = AudioPreprocessing(**feat_cfg)

    model = RNNT(
        feature_config=feat_cfg,
        rnnt=model_cfg['rnnt'],
        num_classes=len(ctc_vocab),
    )

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    if evaluating_dataset:
        N = len(data_layer)
        world_size = (torch.distributed.get_world_size()
                      if torch.distributed.is_initialized() else 1)
        step_per_epoch = math.ceil(N / (args.batch_size * world_size))
        if args.steps is not None:
            n_examples = args.steps * args.batch_size * world_size
            n_steps = args.steps
        else:
            n_examples = N
            n_steps = step_per_epoch
        print('-----------------')
        print('Have {0} examples to eval on.'.format(n_examples))
        print('Have {0} steps / (gpu * epoch).'.format(n_steps))
        print('-----------------')
    else:
        audio_preprocessor.featurizer.normalize = "per_feature"

    print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)
    audio_preprocessor.cuda()
    audio_preprocessor.eval()

    def to_gpu(batch):
        return [item.cuda() for item in batch]

    def featurize(batch):
        # Preprocess the first two batch items, pass the rest through.
        return [*audio_preprocessor(batch[0:2]), *batch[2:]]

    def move_last_axis_first(batch):
        # Reorder the axes of the first item: (0, 1, 2) -> (2, 0, 1).
        return [batch[0].permute(2, 0, 1), *batch[1:]]

    eval_transforms = torchvision.transforms.Compose(
        [to_gpu, featurize, move_last_axis_first])

    model.cuda()
    if args.fp16:
        model = amp.initialize(
            models=model,
            opt_level=AmpOptimizations[optim_level])

    model = model_multi_gpu(model, multi_gpu)

    greedy_decoder = RNNTGreedyDecoder(
        len(ctc_vocab) - 1, model.module if multi_gpu else model)

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        encoderdecoder=model,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        multi_gpu=multi_gpu)
def main(args):
    """Set up RNN-T training (optionally distributed on CPU / IPEX) and call ``train``.

    Args:
        args: parsed command-line namespace. Reads ``seed``, ``local_rank``,
            ``master_addr``, ``port``, ``backend``, ``model_toml``,
            ``train_manifest``, ``val_manifest``, ``tst_manifest``,
            ``pad_to_max``, ``max_duration``,
            ``gradient_accumulation_steps``, ``batch_size``,
            ``eval_batch_size``, ``eval_frequency``, ``test_frequency``,
            ``cuda``, ``dataset_dir``, ``ckpt``, ``lr``, ``lr_decay``,
            ``lr_warmup``, ``num_epochs``, ``optimizer_kind``,
            ``weight_decay``, ``ipex``, ``bf16``, ``tb_path`` and
            ``histogram``. Sets ``args.local_rank``, ``args.rank``,
            ``args.world_size``, ``args.start_epoch`` and
            ``args.step_per_epoch`` before forwarding the namespace to
            ``train``.

    Raises:
        ValueError: for an invalid gradient accumulation setting, an
            unsupported sampler type in the feature config, or an unknown
            ``optimizer_kind``.
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Environment variables are strings; convert to int so that the
    # `args.local_rank == 0` check further down can actually match.
    local_rank = os.environ.get('LOCAL_RANK', args.local_rank)
    args.local_rank = int(local_rank) if local_rank is not None else None

    # set up distributed training
    cpu_distributed_training = False
    if torch.distributed.is_available() and int(os.environ.get('PMI_SIZE', '0')) > 1:
        print('Distributed training with DDP')
        os.environ['RANK'] = os.environ.get('PMI_RANK', '0')
        os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1')
        if 'MASTER_ADDR' not in os.environ:
            os.environ['MASTER_ADDR'] = args.master_addr
        if 'MASTER_PORT' not in os.environ:
            # os.environ only accepts strings; the port may be an int.
            os.environ['MASTER_PORT'] = str(args.port)

        # Initialize the process group with ccl backend
        if args.backend == 'ccl':
            # Presumably imported for its side effect of registering the
            # 'ccl' backend -- the name itself is not used here.
            import torch_ccl  # noqa: F401
        dist.init_process_group(backend=args.backend)
        cpu_distributed_training = True

    # Always define rank / world_size, whether or not a process group
    # was created above.
    if torch.distributed.is_initialized():
        print("Torch distributed is initialized.")
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()
    else:
        print("Torch distributed is not initialized.")
        args.rank = 0
        args.world_size = 1

    multi_gpu = False
    if multi_gpu:
        print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size()))

    optim_level = Optimization.mxprO0

    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    train_manifest = args.train_manifest
    val_manifest = args.val_manifest
    tst_manifest = args.tst_manifest
    featurizer_config = model_definition['input']
    featurizer_config_eval = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config_eval["optimization_level"] = optim_level

    sampler_type = featurizer_config.get("sampler", 'default')
    perturb_config = model_definition.get('perturb', None)
    if args.pad_to_max:
        assert(args.max_duration > 0)
        featurizer_config['max_duration'] = args.max_duration
        featurizer_config_eval['max_duration'] = args.max_duration
        featurizer_config['pad_to'] = "max"
        featurizer_config_eval['pad_to'] = "max"

    print_once('model_config')
    print_dict(model_definition)

    if args.gradient_accumulation_steps < 1:
        raise ValueError('Invalid gradient accumulation steps parameter {}'.format(args.gradient_accumulation_steps))
    if args.batch_size % args.gradient_accumulation_steps != 0:
        # The batch is split into equal micro-batches, so the batch size
        # must be a multiple of the number of accumulation steps.
        raise ValueError('batch size {} is not divisible by gradient accumulation steps {}'.format(args.batch_size, args.gradient_accumulation_steps))

    preprocessor = preprocessing.AudioPreprocessing(**featurizer_config)
    if args.cuda:
        preprocessor.cuda()
    else:
        preprocessor.cpu()

    augmentations = preprocessing.SpectrogramAugmentation(**featurizer_config)
    if args.cuda:
        augmentations.cuda()
    else:
        augmentations.cpu()

    # The evaluation pipeline equals the training one minus augmentation.
    train_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cpu() for x in xs],
        lambda xs: [*preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [augmentations(xs[0]), *xs[1:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    eval_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cpu() for x in xs],
        lambda xs: [*preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        perturb_config=perturb_config,
        manifest_filepath=train_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size // args.gradient_accumulation_steps,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max,
        sampler=sampler_type,
        cpu_distributed_training=cpu_distributed_training)

    # Each entry is (data layer, evaluation frequency, display name).
    eval_datasets = [(
        AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config_eval,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.eval_batch_size,
            multi_gpu=multi_gpu,
            pad_to_max=args.pad_to_max
        ),
        args.eval_frequency,
        'Eval clean',
    )]

    if tst_manifest:
        eval_datasets.append((
            AudioToTextDataLayer(
                dataset_dir=args.dataset_dir,
                featurizer_config=featurizer_config_eval,
                manifest_filepath=tst_manifest,
                labels=dataset_vocab,
                batch_size=args.eval_batch_size,
                multi_gpu=multi_gpu,
                pad_to_max=args.pad_to_max
            ),
            args.test_frequency,
            'Test other',
        ))

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab)
    )

    if args.ckpt is not None:
        print_once("loading model from {}".format(args.ckpt))
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        args.start_epoch = checkpoint['epoch']
    else:
        args.start_epoch = 0

    loss_fn = RNNTLoss(blank=len(ctc_vocab) - 1)

    N = len(data_layer)
    if sampler_type == 'default':
        world_size = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
        args.step_per_epoch = math.ceil(N / (args.batch_size * world_size))
    elif sampler_type == 'bucket':
        args.step_per_epoch = int(len(data_layer.sampler) / args.batch_size)
    else:
        # Fail here with a clear message instead of hitting an
        # AttributeError on args.step_per_epoch further down.
        raise ValueError("unsupported sampler type: {}".format(sampler_type))

    print_once('-----------------')
    print_once('Have {0} examples to train on.'.format(N))
    print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch))
    print_once('-----------------')

    # The learning-rate policy is layered: constant -> decay -> warmup.
    # Each layer captures the previous policy under its own name so the
    # lambdas do not end up calling themselves.
    constant_lr_policy = lambda _: args.lr
    fn_lr_policy = constant_lr_policy
    if args.lr_decay:
        pre_decay_policy = fn_lr_policy
        fn_lr_policy = lambda s: lr_decay(args.num_epochs * args.step_per_epoch, s, pre_decay_policy(s))
    if args.lr_warmup:
        pre_warmup_policy = fn_lr_policy
        fn_lr_policy = lambda s: lr_warmup(args.lr_warmup, s, pre_warmup_policy(s))

    if args.optimizer_kind == "novograd":
        optimizer = Novograd(model.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    elif args.optimizer_kind == "adam":
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    else:
        raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind))

    if args.cuda and optim_level in AmpOptimizations:
        assert False, "not supported in ipex"

    if args.ckpt is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])

    if args.ipex:
        # Both precisions follow the same steps; only the dtype differs.
        ipex_dtype = torch.bfloat16 if args.bf16 else torch.float32
        model, optimizer = ipex.optimize(model, dtype=ipex_dtype, optimizer=optimizer)
        ipex.nn.utils._model_convert.replace_lstm_with_ipex_lstm(model)

    if args.world_size > 1:
        device_ids = None
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=device_ids)

    print_once(model)
    print_once("# parameters: {}".format(sum(p.numel() for p in model.parameters())))

    greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model.module if multi_gpu else model)

    if args.tb_path and args.local_rank == 0:
        logger = TensorBoardLogger(args.tb_path, model.module if multi_gpu else model, args.histogram)
    else:
        logger = DummyLogger()

    train(
        data_layer=data_layer,
        model=model,
        loss_fn=loss_fn,
        greedy_decoder=greedy_decoder,
        optimizer=optimizer,
        data_transforms=train_transforms,
        labels=ctc_vocab,
        optim_level=optim_level,
        multi_gpu=multi_gpu,
        fn_lr_policy=fn_lr_policy,
        # The keyword spelling matches the parameter name used by `train`.
        evalutaion=evaluator(model, eval_transforms, loss_fn, greedy_decoder, ctc_vocab, eval_datasets, logger),
        logger=logger,
        args=args)
def main(args):
    """Set up and launch Jasper evaluation from parsed command-line options.

    Seeds the Python, NumPy and torch RNGs, selects the device, reads the
    model TOML, builds the data layer, audio preprocessor and
    encoder-decoder, optionally restores weights from ``args.ckpt``, and
    finally hands everything to ``eval``.

    Args:
        args: options object. Attributes read here: ``seed``, ``steps``,
            ``cpu``, ``cudnn_benchmark``, ``amp``, ``batch_size``,
            ``model_toml``, ``val_manifest``, ``max_duration``, ``pad_to``,
            ``torch_script``, ``dataset_dir`` and ``ckpt``.
            ``args.use_conv_mask`` is written by this function.
    """
    # Seed every RNG touched by the pipeline with the same value.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    assert (args.steps is None or args.steps > 5)

    # Device selection; cuDNN autotuning is only configured on the CUDA path.
    if not args.cpu:
        assert (torch.cuda.is_available())
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark
        print("CUDNN BENCHMARK ", args.cudnn_benchmark)
    else:
        device = torch.device('cpu')

    # Optimization level 3 when AMP is requested, 0 otherwise.
    if args.amp:
        optim_level = 3
    else:
        optim_level = 0
    eval_batch_size = args.batch_size

    jasper_model_definition = toml.load(args.model_toml)
    base_labels = jasper_model_definition['labels']['labels']
    ctc_vocab = add_ctc_labels(base_labels)

    manifest = args.val_manifest
    feat_cfg = jasper_model_definition['input_eval']
    feat_cfg["optimization_level"] = optim_level

    if args.max_duration is not None:
        feat_cfg['max_duration'] = args.max_duration

    # TorchScript cannot handle a setting that is sometimes an int and
    # sometimes a string, so "pad to max" is represented as -1 everywhere.
    if args.pad_to is not None:
        if args.pad_to >= 0:
            feat_cfg['pad_to'] = args.pad_to
        else:
            feat_cfg['pad_to'] = -1
    if feat_cfg['pad_to'] == "max":
        feat_cfg['pad_to'] = -1

    encoder_cfg = jasper_model_definition['encoder']
    args.use_conv_mask = encoder_cfg.get('convmask', True)
    if args.use_conv_mask and args.torch_script:
        # Only the model definition entry is switched off here;
        # args.use_conv_mask keeps the value read from the config.
        print(
            'WARNING: Masked convs currently not supported for TorchScript. Disabling.'
        )
        encoder_cfg['convmask'] = False

    print('model_config')
    print_dict(jasper_model_definition)
    print('feature_config')
    print_dict(feat_cfg)

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=feat_cfg,
        manifest_filepath=manifest,
        labels=base_labels,
        batch_size=eval_batch_size,
        pad_to_max=feat_cfg['pad_to'] == -1,
        shuffle=False,
        multi_gpu=False)
    audio_preprocessor = AudioPreprocessing(**feat_cfg)
    encoderdecoder = JasperEncoderDecoder(
        jasper_model_definition=jasper_model_definition,
        feat_in=1024,
        num_classes=len(ctc_vocab))

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        weights = checkpoint['state_dict']
        # Re-key the preprocessor entries: drop the "audio_preprocessor."
        # prefix so the same dict can be loaded into both modules below.
        for key in audio_preprocessor.state_dict():
            weights[key] = weights.pop("audio_preprocessor." + key)
        audio_preprocessor.load_state_dict(weights, strict=False)
        encoderdecoder.load_state_dict(weights, strict=False)

    greedy_decoder = GreedyCTCDecoder()

    # Report the amount of work: the whole dataset, or the capped step count.
    N = len(data_layer)
    step_per_epoch = math.ceil(N / args.batch_size)
    banner = '-----------------'
    print(banner)
    if args.steps is not None:
        shown_examples = args.steps * args.batch_size
        shown_steps = args.steps
    else:
        shown_examples = N
        shown_steps = step_per_epoch
    print('Have {0} examples to eval on.'.format(shown_examples))
    print('Have {0} steps / (epoch).'.format(shown_steps))
    print(banner)

    audio_preprocessor.to(device)
    encoderdecoder.to(device)

    if args.amp:
        amp_level = 'O' + str(optim_level)
        encoderdecoder = amp.initialize(models=encoderdecoder,
                                        opt_level=amp_level)

    eval(data_layer=data_layer,
         audio_processor=audio_preprocessor,
         encoderdecoder=encoderdecoder,
         greedy_decoder=greedy_decoder,
         labels=ctc_vocab,
         device=device,
         args=args)