def main(cfg: ServerConfig):
    global model, spect_parser, decoder, config, device, model2, model3
    global commo_model, dict_data, word_dict, char_dict
    commo_model, dict_data, word_dict, char_dict = transcribe_comma.loadModel()
    config = cfg
    model1Path = '/work/Source/deepspeech.pytorch/models/deepspeech_50_1600_gru_fpt.pth'
    logging.info('Setting up server...')
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device,
                       model_path=model1Path,
                       use_half=cfg.model.use_half)
    logging.info('Loaded model 1')
    model2Path = '/work/Source/deepspeech.pytorch/models/deepspeech_1600_lstm_16_50_vin.pth'
    model2 = load_model(device=device,
                        model_path=model2Path,
                        use_half=cfg.model.use_half)
    logging.info('Loaded model 2')
    model3Path = '/work/Source/deepspeech.pytorch/models/deepspeech_1600_vinfpt_25_50.pth'
    model3 = load_model(device=device,
                        model_path=model3Path,
                        use_half=cfg.model.use_half)
    logging.info('Loaded model 3')
    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    spect_parser = SpectrogramParser(audio_conf=model.audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=cfg.host, port=cfg.port, debug=False, use_reloader=False)
def __init__(self,
             model_path,
             gpus=None,
             batch_size=1,
             lr_stage1=100,
             lr_stage2=0.1,
             num_iter_stage1=1000,
             num_iter_stage2=4000,
             labels_path='labels.json'):
    # Handle the attacked model
    self.device = torch.device("cuda" if gpus is None else gpus)
    self.model = load_model(device=self.device, model_path=model_path, use_half=False)
    self.model.eval()

    # Handle training parameters
    self.num_iter_stage1 = num_iter_stage1
    self.num_iter_stage2 = num_iter_stage2
    self.batch_size = batch_size
    self.lr_stage1 = lr_stage1
    self.lr_stage2 = lr_stage2  # was accepted but never stored; kept for the stage-2 loop

    with open(labels_path) as label_file:
        label = json.load(label_file)
    self.text2label = {x: idx for idx, x in enumerate(label)}
    self.label2text = {idx: x for idx, x in enumerate(label)}
    self.ctc_loss = torch.nn.CTCLoss(blank=len(self.text2label) - 1)
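A minimal sketch of how the label maps built above can feed the CTC loss: the helper name `encode_target` and the attack-instance argument are assumptions for illustration, not part of this snippet.

import torch

def encode_target(attack, sentence: str) -> torch.Tensor:
    # Character-to-index encoding via the text2label map built in __init__;
    # out-of-vocabulary characters are simply dropped in this illustration
    # (a real pipeline would normalize the text first).
    indices = [attack.text2label[ch] for ch in sentence if ch in attack.text2label]
    return torch.tensor(indices, dtype=torch.long)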
def transcribe(cfg: TranscribeConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)
    decoder = load_decoder(
        "beam" if cfg.lm.decoder_type == DecoderType.beam else "greedy",
        model.labels,
        cfg.lm.lm_path,
        cfg.lm.alpha,
        cfg.lm.beta,
        cfg.lm.cutoff_top_n,
        cfg.lm.cutoff_prob,
        cfg.lm.beam_width,
        cfg.lm.lm_workers)
    spect_parser = SpectrogramParser(audio_conf=model.audio_conf, normalize=True)

    start = time.time()
    decoded_output, decoded_offsets = run_transcribe(
        audio_path=cfg.audio_path,
        spect_parser=spect_parser,
        model=model,
        decoder=decoder,
        device=device,
        use_half=cfg.model.use_half)
    results = decode_results(decoded_output=decoded_output,
                             decoded_offsets=decoded_offsets,
                             cfg=cfg)
    end = time.time()
    print("Time taken: {}".format(end - start))
    print(json.dumps(results, ensure_ascii=False))
def main(cfg: ServerConfig):
    global model, spect_parser, decoder, config, device
    config = cfg
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)
    decoder = load_decoder(
        "beam" if cfg.lm.decoder_type == DecoderType.beam else "greedy",
        model.labels,
        cfg.lm.lm_path,
        cfg.lm.alpha,
        cfg.lm.beta,
        cfg.lm.cutoff_top_n,
        cfg.lm.cutoff_prob,
        cfg.lm.beam_width,
        cfg.lm.lm_workers)
    spect_parser = SpectrogramParser(audio_conf=model.audio_conf, normalize=True)
    logging.info('Server initialised')
    serve(app, host=cfg.host, port=cfg.port)
def __init__(self, cfg):
    self.cfg = cfg
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.model = load_model(self.device,
                            hydra.utils.to_absolute_path(self.cfg.model_path))
    self.ckpt = torch.load(hydra.utils.to_absolute_path(self.cfg.model_path),
                           map_location=self.device)
    self.labels = self.ckpt['hyper_parameters']['labels']
    self.decoder = BeamCTCDecoder(labels=self.labels,
                                  lm_path=hydra.utils.to_absolute_path(self.cfg.lm_path),
                                  beam_width=self.cfg.beam_width,
                                  num_processes=self.cfg.num_workers,
                                  blank_index=self.labels.index('_'))
    self.target_decoder = GreedyDecoder(labels=self.labels,
                                        blank_index=self.labels.index('_'))
    test_dataset = SpectrogramDataset(
        audio_conf=self.cfg.spect_cfg,
        input_path=hydra.utils.to_absolute_path(cfg.test_path),
        labels=self.labels,
        normalize=True)
    self.test_loader = AudioDataLoader(test_dataset,
                                       batch_size=self.cfg.batch_size,
                                       num_workers=self.cfg.num_workers)
def evaluate(cfg: EvalConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)
    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    target_decoder = GreedyDecoder(model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(
        audio_conf=model.audio_conf,
        manifest_filepath=hydra.utils.to_absolute_path(cfg.test_manifest),
        labels=model.labels,
        normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=cfg.batch_size,
                                  num_workers=cfg.num_workers)
    wer, cer, output_data = run_evaluation(test_loader=test_loader,
                                           device=device,
                                           model=model,
                                           decoder=decoder,
                                           target_decoder=target_decoder,
                                           save_output=cfg.save_output,
                                           verbose=cfg.verbose,
                                           use_half=cfg.model.use_half)
    print('Test Summary \t'
          'Average WER {wer:.3f}\t'
          'Average CER {cer:.3f}\t'.format(wer=wer, cer=cer))
    if cfg.save_output:
        torch.save(output_data, hydra.utils.to_absolute_path(cfg.save_output))
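For reference, the WER printed above is conventionally the word-level edit distance divided by the number of reference words. A minimal self-contained sketch of that metric (not the repo's run_evaluation implementation):

def word_error_rate(reference: str, hypothesis: str) -> float:
    """Levenshtein distance between word sequences / number of reference words."""
    ref, hyp = reference.split(), hypothesis.split()
    # dp[i][j] = edits needed to turn ref[:i] into hyp[:j]
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i
    for j in range(len(hyp) + 1):
        dp[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # substitution
    return dp[len(ref)][len(hyp)] / max(1, len(ref))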
def evaluate(cfg: EvalConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device, model_path=cfg.model.model_path)
    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    target_decoder = GreedyDecoder(labels=model.labels,
                                   blank_index=model.labels.index('_'))
    test_dataset = SpectrogramDataset(
        audio_conf=model.spect_cfg,
        input_path=hydra.utils.to_absolute_path(cfg.test_path),
        labels=model.labels,
        normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=cfg.batch_size,
                                  num_workers=cfg.num_workers)
    wer, cer = run_evaluation_print(test_loader=test_loader,
                                    device=device,
                                    model=model,
                                    decoder=decoder,
                                    target_decoder=target_decoder,
                                    precision=cfg.model.precision)
    print('Test Summary \t'
          'Average WER {wer:.3f}\t'
          'Average CER {cer:.3f}\t'.format(wer=wer, cer=cer))
def transcribe(cfg: TranscribeConfig):
    commo_model, dict_data, word_dict, char_dict = transcribe_comma.loadModel()
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)
    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    spect_parser = SpectrogramParser(audio_conf=model.audio_conf, normalize=True)

    # With beam search, decoded_output is a (1 x beam_width) array whose elements
    # are candidate sentences, e.g. [["toi đi hộc", "tôi di hoc", "tôi đi ho", ...]]
    # with 512 elements (beam_width=512).
    tim1 = time.time()
    decoded_output, decoded_outputGreedy, decoded_offsets, decoded_offsets2 = run_transcribe(
        audio_path=cfg.audio_path,
        spect_parser=spect_parser,
        model=model,
        decoder=decoder,
        device=device,
        use_half=cfg.model.use_half)
    results = decode_results(decoded_output=decoded_output,
                             decoded_offsets=decoded_offsets,
                             cfg=cfg)
    results2 = decode_results(decoded_output=decoded_outputGreedy,
                              decoded_offsets=decoded_offsets2,
                              cfg=cfg)
    resp = json.dumps(results, ensure_ascii=False)
    tim2 = time.time()
    print("Audio transcribe cost : " + str(tim2 - tim1))
    results['output'][0]['transcription'] = transcribe_comma.runTranscribe(
        commo_model, dict_data, word_dict, char_dict,
        results['output'][0]['transcription'])
    results2['output'][0]['transcription'] = transcribe_comma.runTranscribe(
        commo_model, dict_data, word_dict, char_dict,
        results2['output'][0]['transcription'])
    # print("DEBUG : ", resp)
    return results['output'][0]['transcription'], \
        results2['output'][0]['transcription'], \
        results['_meta']
def main(cfg: ServerConfig):
    global model, spect_parser, decoder, config, device
    config = cfg
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device, model_path=cfg.model.model_path)
    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    # This version of the checkpoint carries its spectrogram config as spect_cfg,
    # so build the parser from that (the redundant audio_conf parser was dropped).
    spect_parser = SpectrogramParser(audio_conf=model.spect_cfg, normalize=True)
    logging.info('Server initialised')
    app.run(host=cfg.host, port=cfg.port, debug=True, use_reloader=False)
def main():
    import argparse
    global model, spect_parser, decoder, args, device
    parser = argparse.ArgumentParser(description='DeepSpeech transcription server')
    parser.add_argument('--host',
                        type=str,
                        default='0.0.0.0',
                        help='Host to be used by the server')
    parser.add_argument('--port',
                        type=int,
                        default=8888,
                        help='Port to be used by the server')
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        from deepspeech_pytorch.decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(model.labels,
                                 lm_path=args.lm_path,
                                 alpha=args.alpha,
                                 beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
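The GreedyDecoder branch above performs the standard CTC collapse: take the per-frame argmax, merge repeats, then drop blanks. A minimal sketch of that algorithm, assuming `_` is the blank symbol as in the snippet (this is an illustration, not the library's decoder):

import torch

def greedy_ctc_decode(log_probs: torch.Tensor, labels: str, blank_index: int) -> str:
    # log_probs: (time, num_labels) network output for a single utterance
    best_path = torch.argmax(log_probs, dim=1).tolist()
    decoded = []
    prev = None
    for idx in best_path:
        # Merge repeated labels, then drop the blank symbol
        if idx != prev and idx != blank_index:
            decoded.append(labels[idx])
        prev = idx
    return ''.join(decoded)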
def transcribe(cfg: TranscribeConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device, model_path=cfg.model.model_path)
    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    spect_parser = SpectrogramParser(audio_conf=model.spect_cfg, normalize=True)
    decoded_output, decoded_offsets = run_transcribe(
        audio_path=hydra.utils.to_absolute_path(cfg.audio_path),
        spect_parser=spect_parser,
        model=model,
        decoder=decoder,
        device=device,
        precision=cfg.model.precision)
    results = decode_results(decoded_output=decoded_output,
                             decoded_offsets=decoded_offsets,
                             cfg=cfg)
    print(json.dumps(results))
def __init__(self,
             *args,
             voting_kwargs,
             niters_forward=1,
             niters_backward=1,
             batch_backward=0,
             batch_forward=0,
             load_weights_file=None,
             use_half=False,
             random_init=False,
             **kwargs):
    filename = load_weights_file if load_weights_file else "librispeech_pretrained_v2.pth"
    saved_model_dir = paths.runtime_paths().saved_model_dir
    model_path = os.path.join(saved_model_dir, filename)
    model = load_model(device="cpu", model_path=model_path, use_half=use_half)
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=1e-4,
                                  weight_decay=1e-5,
                                  amsgrad=False)
    super(SmoothedDeepSpeech, self).__init__(model, *args, optimizer=optimizer, **kwargs)
    self.model_path = model_path
    self.use_half = use_half
    self.niters_forward = niters_forward
    self.niters_backward = niters_backward
    if random_init:
        for p in self._model.parameters():
            if p.dim() > 1:
                # in-place initialiser; torch.nn.init.xavier_uniform is deprecated
                torch.nn.init.xavier_uniform_(p)
            else:
                torch.nn.init.zeros_(p)
    self.decoder = load_decoder_with_scores(self.decoder)
    self.set_voting_module(**voting_kwargs, **kwargs)
    self.batch_backward = batch_backward
    self.batch_forward = batch_forward
def transcribe(cfg: TranscribeConfig):
    device = torch.device("cuda" if cfg.model.cuda else "cpu")
    model = load_model(device=device,
                       model_path=cfg.model.model_path,
                       use_half=cfg.model.use_half)
    decoder = load_decoder(labels=model.labels, cfg=cfg.lm)
    spect_parser = SpectrogramParser(audio_conf=model.audio_conf, normalize=True)
    decoded_output, decoded_offsets = run_transcribe(audio_path=cfg.audio_path,
                                                     spect_parser=spect_parser,
                                                     model=model,
                                                     decoder=decoder,
                                                     device=device,
                                                     use_half=cfg.model.use_half)
    results = decode_results(decoded_output=decoded_output,
                             decoded_offsets=decoded_offsets,
                             cfg=cfg)
    print(json.dumps(results))
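A hedged sketch of consuming the `results` dictionary downstream, mirroring the key access seen in the comma-restoration variant above; the exact schema may differ between versions:

# Pull the top transcription and metadata out of decode_results' output.
best_transcription = results['output'][0]['transcription']
metadata = results.get('_meta', {})
print(best_transcription)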
                    default=8,
                    type=int,
                    help='Number of workers used in dataloading')
parser.add_argument('--verbose',
                    action="store_true",
                    help="print out decoded output and error of each sample")
parser.add_argument('--save-output',
                    default=None,
                    help="Saves output of model from test to this file_path")
parser = add_decoder_args(parser)

if __name__ == '__main__':
    args = parser.parse_args()
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path)

    with open('labels.json') as label_file:
        labels = json.load(label_file)

    decoder = load_decoder(decoder_type=args.decoder,
                           labels=labels,
                           lm_path=args.lm_path,
                           alpha=args.alpha,
                           beta=args.beta,
                           cutoff_top_n=args.cutoff_top_n,
                           cutoff_prob=args.cutoff_prob,
                           beam_width=args.beam_width,
                           lm_workers=args.lm_workers)
    target_decoder = GreedyDecoder(labels)
def run_quantsim_evaluation(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    import deepspeech_pytorch.model

    def wrapped_forward_function(self, x, lengths=None):
        if lengths is None:
            lengths = torch.IntTensor([_x.shape[0] for _x in x])
        return self.infer(x, lengths)

    # Monkey-patch the model so forward() can be called with inputs only;
    # the original forward (which also needs lengths) is kept as infer().
    deepspeech_pytorch.model.DeepSpeech.infer = deepspeech_pytorch.model.DeepSpeech.forward
    deepspeech_pytorch.model.DeepSpeech.forward = wrapped_forward_function

    model = load_model(device=device, model_path=args.model_path, use_half=False)
    decoder = load_decoder(labels=model.labels, cfg=LMConfig)
    target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))

    def eval_func(model, iterations=None, device=device):
        test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                          manifest_filepath=args.test_manifest,
                                          labels=model.labels,
                                          normalize=True)
        if iterations is not None:
            test_dataset.size = iterations
        test_loader = AudioDataLoader(test_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers)
        wer, cer, output_data = run_evaluation(test_loader=test_loader,
                                               device=device,
                                               model=model,
                                               decoder=decoder,
                                               target_decoder=target_decoder,
                                               save_output=False,
                                               verbose=True,
                                               use_half=False)
        return wer, cer, output_data

    quant_scheme = QuantScheme.post_training_tf_enhanced
    sim = QuantizationSimModel(model.cpu(),
                               input_shapes=tuple([1, 1, 161, 500]),
                               quant_scheme=quant_scheme,
                               default_param_bw=args.default_param_bw,
                               default_output_bw=args.default_output_bw,
                               config_file=args.quantsim_config_file)
    manually_configure_quant_ops(sim)
    sim.model.to(device)
    sim.compute_encodings(eval_func, forward_pass_callback_args=args.encodings_iterations)

    wer, cer, output_data = eval_func(sim.model, None)
    print('Average WER {:.4f}'.format(wer))
parser.add_argument('--lm-alpha-to',
                    default=3.0,
                    type=float,
                    help='Language model weight end tuning')
parser.add_argument('--lm-beta-from',
                    default=0.0,
                    type=float,
                    help='Language model word bonus (all words) start tuning')
parser.add_argument('--lm-beta-to',
                    default=0.5,
                    type=float,
                    help='Language model word bonus (all words) end tuning')
parser.add_argument('--lm-num-alphas',
                    default=45,
                    type=float,
                    help='Number of alpha candidates for tuning')
parser.add_argument('--lm-num-betas',
                    default=8,
                    type=float,
                    help='Number of beta candidates for tuning')
parser = add_decoder_args(parser)
args = parser.parse_args()

if args.lm_path is None:
    print("error: LM must be provided for tuning")
    sys.exit(1)

model = load_model(model_path=args.model_path, device='cpu', use_half=False)
saved_output = torch.load(args.saved_output)


def init(beam_width, blank_index, lm_path):
    global decoder
    decoder = BeamCTCDecoder(model.labels,
                             lm_path=lm_path,
                             beam_width=beam_width,
                             num_processes=args.lm_workers,
                             blank_index=blank_index)


def decode_dataset(params):
    lm_alpha, lm_beta = params
    global decoder
    decoder._decoder.reset_params(lm_alpha, lm_beta)
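The flags above define an alpha/beta search grid for LM tuning. A hedged sketch of how such a candidate list is typically built before being fanned out to the worker processes (the helper name is an assumption, not from this snippet):

import numpy as np

def make_search_grid(args):
    # Cartesian product of alpha and beta candidates for the tuning sweep.
    alphas = np.linspace(args.lm_alpha_from, args.lm_alpha_to, int(args.lm_num_alphas))
    betas = np.linspace(args.lm_beta_from, args.lm_beta_to, int(args.lm_num_betas))
    return [(alpha, beta) for alpha in alphas for beta in betas]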
                    default=45,
                    type=float,
                    help='Number of alpha candidates for tuning')
parser.add_argument('--lm-num-betas',
                    default=8,
                    type=float,
                    help='Number of beta candidates for tuning')
parser = add_decoder_args(parser)
args = parser.parse_args()

if args.lm_path is None:
    print("error: LM must be provided for tuning")
    sys.exit(1)

model = load_model(model_path=args.model_path, device='cpu')
saved_output = torch.load(args.saved_output)

with open('labels.json') as label_file:
    labels = json.load(label_file)


def init(beam_width, blank_index, lm_path):
    global decoder
    decoder = BeamCTCDecoder(labels,
                             lm_path=lm_path,
                             beam_width=beam_width,
                             num_processes=args.lm_workers,
                             blank_index=blank_index)
def __init__(
    self,
    model: Optional["DeepSpeech"] = None,
    pretrained_model: Optional[str] = None,
    filename: Optional[str] = None,
    url: Optional[str] = None,
    use_half: bool = False,
    optimizer: Optional["torch.optim.Optimizer"] = None,  # type: ignore
    use_amp: bool = False,
    opt_level: str = "O1",
    decoder_type: str = "greedy",
    lm_path: str = "",
    top_paths: int = 1,
    alpha: float = 0.0,
    beta: float = 0.0,
    cutoff_top_n: int = 40,
    cutoff_prob: float = 1.0,
    beam_width: int = 10,
    lm_workers: int = 4,
    clip_values: Optional["CLIP_VALUES_TYPE"] = None,
    preprocessing_defences: Union["Preprocessor", List["Preprocessor"], None] = None,
    postprocessing_defences: Union["Postprocessor", List["Postprocessor"], None] = None,
    preprocessing: "PREPROCESSING_TYPE" = None,
    device_type: str = "gpu",
    verbose: bool = True,
):
    """
    Initialization of an instance of PyTorchDeepSpeech.

    :param model: DeepSpeech model.
    :param pretrained_model: The choice of pretrained model if a pretrained model is required. Currently this
                             estimator supports 3 different pretrained models consisting of `an4`, `librispeech`
                             and `tedlium`.
    :param filename: Name of the file.
    :param url: Download URL.
    :param use_half: Whether to use FP16 for the pretrained model.
    :param optimizer: The optimizer used to train the estimator.
    :param use_amp: Whether to use the automatic mixed precision tool to enable mixed precision training or
                    gradient computation, e.g. with loss gradient computation. When set to True, this option is
                    only triggered if there are GPUs available.
    :param opt_level: Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                      values are `O0`, `O1`, `O2`, and `O3`.
    :param decoder_type: Decoder type. Either `greedy` or `beam`. This parameter is only used when users want
                         transcription outputs.
    :param lm_path: Path to an (optional) kenlm language model for use with beam search. This parameter is only
                    used when users want transcription outputs.
    :param top_paths: Number of beams to be returned. This parameter is only used when users want transcription
                      outputs.
    :param alpha: The weight used for the language model. This parameter is only used when users want
                  transcription outputs.
    :param beta: Language model word bonus (all words). This parameter is only used when users want
                 transcription outputs.
    :param cutoff_top_n: Cutoff_top_n characters with highest probs in vocabulary will be used in beam search.
                         This parameter is only used when users want transcription outputs.
    :param cutoff_prob: Cutoff probability in pruning. This parameter is only used when users want transcription
                        outputs.
    :param beam_width: The width of beam to be used. This parameter is only used when users want transcription
                       outputs.
    :param lm_workers: Number of language model processes to use. This parameter is only used when users want
                       transcription outputs.
    :param clip_values: Tuple of the form `(min, max)` of floats or `np.ndarray` representing the minimum and
                        maximum values allowed for features. If floats are provided, these will be used as the
                        range of all features. If arrays are provided, each value will be considered the bound
                        for a feature, thus the shape of clip values needs to match the total number of
                        features.
    :param preprocessing_defences: Preprocessing defence(s) to be applied by the estimator.
    :param postprocessing_defences: Postprocessing defence(s) to be applied by the estimator.
    :param preprocessing: Tuple of the form `(subtrahend, divisor)` of floats or `np.ndarray` of values to be
                          used for data preprocessing. The first value will be subtracted from the input. The
                          input will then be divided by the second one.
    :param device_type: Type of device to be used for model and tensors, if `cpu` run on CPU, if `gpu` run on
                        GPU if available otherwise run on CPU.
    """
    import torch  # lgtm [py/repeated-import]

    from deepspeech_pytorch.configs.inference_config import LMConfig
    from deepspeech_pytorch.enums import DecoderType
    from deepspeech_pytorch.utils import load_decoder, load_model

    # Super initialization
    super().__init__(
        model=None,
        clip_values=clip_values,
        channels_first=None,
        preprocessing_defences=preprocessing_defences,
        postprocessing_defences=postprocessing_defences,
        preprocessing=preprocessing,
    )

    self.verbose = verbose

    # Check clip values
    if self.clip_values is not None:
        if not np.all(self.clip_values[0] == -1):
            raise ValueError("This estimator requires normalized input audios with clip_values=(-1, 1).")
        if not np.all(self.clip_values[1] == 1):
            raise ValueError("This estimator requires normalized input audios with clip_values=(-1, 1).")

    # Check postprocessing defences
    if self.postprocessing_defences is not None:
        raise ValueError("This estimator does not support `postprocessing_defences`.")

    # Set cpu/gpu device
    self._device: torch.device
    if device_type == "cpu" or not torch.cuda.is_available():
        self._device = torch.device("cpu")
    else:
        cuda_idx = torch.cuda.current_device()
        self._device = torch.device("cuda:{}".format(cuda_idx))

    self._input_shape = None

    # Load model
    if model is None:
        if pretrained_model == "an4":
            filename, url = (
                "an4_pretrained_v2.pth",
                "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/an4_pretrained_v2.pth",
            )
        elif pretrained_model == "librispeech":
            filename, url = (
                "librispeech_pretrained_v2.pth",
                "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/"
                "librispeech_pretrained_v2.pth",
            )
        elif pretrained_model == "tedlium":
            filename, url = (
                "ted_pretrained_v2.pth",
                "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/ted_pretrained_v2.pth",
            )
        elif pretrained_model is None:
            # If model is None and no pretrained model is selected, then we need to have parameters filename
            # and url to download, extract and load the automatic speech recognition model
            if filename is None or url is None:
                filename, url = (
                    "librispeech_pretrained_v2.pth",
                    "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/"
                    "librispeech_pretrained_v2.pth",
                )
        else:
            raise ValueError("The input pretrained model %s is not supported." % pretrained_model)

        # Download model
        model_path = get_file(filename=filename,
                              path=config.ART_DATA_PATH,
                              url=url,
                              extract=False,
                              verbose=self.verbose)

        # Then load model
        self._model = load_model(device=self._device, model_path=model_path, use_half=use_half)
    else:
        self._model = model

    # Push model to the corresponding device
    self._model.to(self._device)

    # Save first version of the optimizer
    self._optimizer = optimizer
    self._use_amp = use_amp

    # Now create a decoder
    # Create the language model config first
    lm_config = LMConfig()

    # Then setup the config
    if decoder_type == "greedy":
        lm_config.decoder_type = DecoderType.greedy
    elif decoder_type == "beam":
        lm_config.decoder_type = DecoderType.beam
    else:
        raise ValueError("Decoder type %s currently not supported." % decoder_type)

    lm_config.lm_path = lm_path
    lm_config.top_paths = top_paths
    lm_config.alpha = alpha
    lm_config.beta = beta
    lm_config.cutoff_top_n = cutoff_top_n
    lm_config.cutoff_prob = cutoff_prob
    lm_config.beam_width = beam_width
    lm_config.lm_workers = lm_workers

    # Create the decoder with the lm config
    self.decoder = load_decoder(labels=self._model.labels, cfg=lm_config)

    # Setup for AMP use
    if self._use_amp:
        from apex import amp

        if self._optimizer is None:
            logger.warning(
                "An optimizer is needed to use the automatic mixed precision tool, but none was provided. "
                "A default optimizer is used."
            )

            # Create the optimizers
            parameters = self._model.parameters()
            self._optimizer = torch.optim.SGD(parameters, lr=0.01)

        if self._device.type == "cpu":
            enabled = False
        else:
            enabled = True

        self._model, self._optimizer = amp.initialize(
            models=self._model,
            optimizers=self._optimizer,
            enabled=enabled,
            opt_level=opt_level,
            loss_scale=1.0,
        )
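Given the signature and docstring above, constructing the estimator with a pretrained checkpoint might look like this; a hedged usage sketch, not taken from the source:

# Load the librispeech checkpoint with a beam-search decoder.
# Class and parameter names follow the __init__ signature above.
estimator = PyTorchDeepSpeech(
    pretrained_model="librispeech",
    decoder_type="beam",
    beam_width=10,
    device_type="gpu",
)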
                    help='PGD iteration times')

# plot parameters
parser.add_argument('--plot_ori_spec',
                    type=str,
                    default="None",
                    help='Path to save the original spectrogram')
parser.add_argument('--plot_adv_spec',
                    type=str,
                    default="None",
                    help='Path to save the adversarial spectrogram')
args = parser.parse_args()

cfg = TranscribeConfig()  # instantiate the config rather than passing the class itself
model = load_model(device="cpu", model_path=args.model_path, use_half=False)
decoder = load_decoder(labels=model.labels, cfg=cfg.lm)

sound, sample_rate = torchaudio.load(args.input_wav)
target_sentence = args.target_sentence.upper()
if args.output_wav == "None":
    args.output_wav = None
attacker = Attacker(model=model,
                    sound=sound,
                    target=target_sentence,
                    decoder=decoder,
                    device=args.device,
                    save=args.output_wav)  # fixed: bare `output_wav` was undefined
attacker.attack(epsilon=args.epsilon,
print(transcription)


if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description='DeepSpeech transcription')
    arg_parser = add_inference_args(arg_parser)
    arg_parser.add_argument('--offsets',
                            dest='offsets',
                            action='store_true',
                            help='Returns time offset information')
    arg_parser = add_decoder_args(arg_parser)
    args = arg_parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)
    decoder = load_decoder(decoder_type=args.decoder,
                           labels=model.labels,
                           lm_path=args.lm_path,
                           alpha=args.alpha,
                           beta=args.beta,
                           cutoff_top_n=args.cutoff_top_n,
                           cutoff_prob=args.cutoff_prob,
                           beam_width=args.beam_width,
                           lm_workers=args.lm_workers)
    spect_parser = SpectrogramParser(audio_conf=model.audio_conf, normalize=True)
    vad = webrtcvad.Vad()
def reload_model(self):
    # Reload the checkpoint on CPU, then move it back to the estimator's device.
    model = load_model(device="cpu",
                       model_path=self.model_path,
                       use_half=self.use_half)
    self._model = model.to(self._device)