def test_pretrained_eval_inference(self):
    # Disabled GPU due to using TravisCI
    cuda, use_half = False, False
    train_manifest, val_manifest, test_manifest = self.download_data(
        DatasetConfig(target_dir=self.target_dir,
                      manifest_dir=self.manifest_dir))
    wget.download(lm_path)
    for pretrained_url in pretrained_urls:
        print("Running Pre-trained Smoke test for: ", pretrained_url)
        wget.download(pretrained_url)
        file_path = os.path.basename(pretrained_url)
        pretrained_path = os.path.abspath(file_path)
        lm_configs = [
            LMConfig(),  # Greedy
            LMConfig(decoder_type=DecoderType.beam),  # Test Beam Decoder
            LMConfig(decoder_type=DecoderType.beam,
                     lm_path=os.path.basename(lm_path),
                     alpha=1,
                     beta=1)  # Test Beam Decoder with LM
        ]
        for lm_config in lm_configs:
            self.eval_model(model_path=pretrained_path,
                            test_manifest=test_manifest,
                            cuda=cuda,
                            use_half=use_half,
                            lm_config=lm_config)
            self.inference(test_manifest=test_manifest,
                           model_path=pretrained_path,
                           cuda=cuda,
                           lm_config=lm_config,
                           use_half=use_half)

def build_train_evaluate_model(self,
                               limit_train_batches: int,
                               limit_val_batches: int,
                               epoch: int,
                               batch_size: int,
                               model_config: BiDirectionalConfig,
                               precision: int,
                               gpus: int,
                               folders: bool):
    cuda = gpus > 0

    train_path, val_path, test_path = self.download_data(
        DatasetConfig(
            target_dir=self.target_dir,
            manifest_dir=self.manifest_dir
        ),
        folders=folders
    )

    train_cfg = self.create_training_config(
        limit_train_batches=limit_train_batches,
        limit_val_batches=limit_val_batches,
        max_epochs=epoch,
        batch_size=batch_size,
        train_path=train_path,
        val_path=val_path,
        model_config=model_config,
        precision=precision,
        gpus=gpus
    )
    print("Running Training DeepSpeech Model Smoke Test")
    train(train_cfg)

    # Expected final model path after training
    print(os.listdir(self.model_dir))
    model_path = self.model_dir + '/last.ckpt'
    assert os.path.exists(model_path)

    lm_configs = [
        LMConfig(),  # Test Greedy
        LMConfig(
            decoder_type=DecoderType.beam
        )  # Test Beam Decoder
    ]

    print("Running Inference Smoke Tests")
    for lm_config in lm_configs:
        self.eval_model(
            model_path=model_path,
            test_path=test_path,
            cuda=cuda,
            precision=precision,
            lm_config=lm_config
        )
        self.inference(test_path=test_path,
                       model_path=model_path,
                       cuda=cuda,
                       precision=precision,
                       lm_config=lm_config)

def build_train_evaluate_model(self,
                               epoch: int,
                               batch_size: int,
                               model_config: BiDirectionalConfig,
                               use_half: bool,
                               cuda: bool):
    train_manifest, val_manifest, test_manifest = self.download_data(
        DatasetConfig(target_dir=self.target_dir,
                      manifest_dir=self.manifest_dir))
    train_cfg = self.create_training_config(epoch=epoch,
                                            batch_size=batch_size,
                                            train_manifest=train_manifest,
                                            val_manifest=val_manifest,
                                            model_config=model_config,
                                            cuda=cuda)
    print("Running Training DeepSpeech Model Smoke Test")
    train(train_cfg)

    # Expected final model path after training
    model_path = self.model_dir + '/deepspeech_final.pth'
    assert os.path.exists(model_path)

    lm_configs = [
        LMConfig(),  # Test Greedy
        LMConfig(decoder_type=DecoderType.beam)  # Test Beam Decoder
    ]
    print("Running Inference Smoke Tests")
    for lm_config in lm_configs:
        self.eval_model(model_path=model_path,
                        test_manifest=test_manifest,
                        cuda=cuda,
                        use_half=use_half,
                        lm_config=lm_config)
        self.inference(test_manifest=test_manifest,
                       model_path=model_path,
                       cuda=cuda,
                       use_half=use_half,
                       lm_config=lm_config)

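# A hypothetical driver for the helper above (illustrative, not from the original
# source): how a smoke test might invoke build_train_evaluate_model. The test name
# and the BiDirectionalConfig field values are assumed placeholders, kept small so
# the run stays cheap on CPU-only CI.
def test_train_eval_inference(self):
    model_cfg = BiDirectionalConfig(
        hidden_size=10,   # tiny RNN keeps the smoke test fast
        hidden_layers=1
    )
    self.build_train_evaluate_model(epoch=1,
                                    batch_size=10,
                                    model_config=model_cfg,
                                    use_half=False,  # FP32; no GPU on CI
                                    cuda=False)
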
def load_decoder(labels, cfg: LMConfig):
    if cfg.decoder_type == DecoderType.beam:
        from deepspeech_pytorch.decoder import BeamCTCDecoder
        if cfg.lm_path:
            cfg.lm_path = hydra.utils.to_absolute_path(cfg.lm_path)
        decoder = BeamCTCDecoder(labels=labels,
                                 lm_path=cfg.lm_path,
                                 alpha=cfg.alpha,
                                 beta=cfg.beta,
                                 cutoff_top_n=cfg.cutoff_top_n,
                                 cutoff_prob=cfg.cutoff_prob,
                                 beam_width=cfg.beam_width,
                                 num_processes=cfg.lm_workers)
    else:
        decoder = GreedyDecoder(labels=labels,
                                blank_index=labels.index('_'))
    return decoder

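# A minimal usage sketch for load_decoder (illustrative, not from the original
# source). It assumes deepspeech.pytorch's LMConfig/DecoderType imports and a toy
# label set; 'lm.binary' is a hypothetical KenLM path.
from deepspeech_pytorch.configs.inference_config import LMConfig
from deepspeech_pytorch.enums import DecoderType

labels = ["_", "'", "A", "B", "C"]  # toy labels; real models carry the full alphabet

# Greedy decoding needs no language model and no extra dependencies.
greedy_decoder = load_decoder(labels=labels, cfg=LMConfig())

# Beam search rescored by a KenLM model, weighted by alpha (LM) and beta (word bonus).
beam_decoder = load_decoder(labels=labels,
                            cfg=LMConfig(decoder_type=DecoderType.beam,
                                         lm_path="lm.binary",
                                         alpha=1.0,
                                         beta=1.0))
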
def __init__(
    self,
    model: Optional["DeepSpeech"] = None,
    pretrained_model: Optional[str] = None,
    filename: Optional[str] = None,
    url: Optional[str] = None,
    use_half: bool = False,
    optimizer: Optional["torch.optim.Optimizer"] = None,  # type: ignore
    use_amp: bool = False,
    opt_level: str = "O1",
    decoder_type: str = "greedy",
    lm_path: str = "",
    top_paths: int = 1,
    alpha: float = 0.0,
    beta: float = 0.0,
    cutoff_top_n: int = 40,
    cutoff_prob: float = 1.0,
    beam_width: int = 10,
    lm_workers: int = 4,
    clip_values: Optional["CLIP_VALUES_TYPE"] = None,
    preprocessing_defences: Union["Preprocessor", List["Preprocessor"], None] = None,
    postprocessing_defences: Union["Postprocessor", List["Postprocessor"], None] = None,
    preprocessing: "PREPROCESSING_TYPE" = None,
    device_type: str = "gpu",
    verbose: bool = True,
):
    """
    Initialization of an instance of PyTorchDeepSpeech.

    :param model: DeepSpeech model.
    :param pretrained_model: The choice of pretrained model if a pretrained model is required. Currently this
                             estimator supports 3 different pretrained models consisting of `an4`, `librispeech`
                             and `tedlium`.
    :param filename: Name of the file.
    :param url: Download URL.
    :param use_half: Whether to use FP16 for the pretrained model.
    :param optimizer: The optimizer used to train the estimator.
    :param use_amp: Whether to use the automatic mixed precision tool to enable mixed precision training or
                    gradient computation, e.g. with loss gradient computation. When set to True, this option is
                    only triggered if there are GPUs available.
    :param opt_level: Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                      values are `O0`, `O1`, `O2`, and `O3`.
    :param decoder_type: Decoder type. Either `greedy` or `beam`. This parameter is only used when users want
                         transcription outputs.
    :param lm_path: Path to an (optional) kenlm language model for use with beam search. This parameter is only
                    used when users want transcription outputs.
    :param top_paths: Number of beams to be returned. This parameter is only used when users want transcription
                      outputs.
    :param alpha: The weight used for the language model. This parameter is only used when users want
                  transcription outputs.
    :param beta: Language model word bonus (all words). This parameter is only used when users want
                 transcription outputs.
    :param cutoff_top_n: Cutoff_top_n characters with highest probs in vocabulary will be used in beam search.
                         This parameter is only used when users want transcription outputs.
    :param cutoff_prob: Cutoff probability in pruning. This parameter is only used when users want transcription
                        outputs.
    :param beam_width: The width of beam to be used. This parameter is only used when users want transcription
                       outputs.
    :param lm_workers: Number of language model processes to use. This parameter is only used when users want
                       transcription outputs.
    :param clip_values: Tuple of the form `(min, max)` of floats or `np.ndarray` representing the minimum and
                        maximum values allowed for features. If floats are provided, these will be used as the
                        range of all features. If arrays are provided, each value will be considered the bound
                        for a feature, thus the shape of clip values needs to match the total number of features.
    :param preprocessing_defences: Preprocessing defence(s) to be applied by the estimator.
    :param postprocessing_defences: Postprocessing defence(s) to be applied by the estimator.
    :param preprocessing: Tuple of the form `(subtrahend, divisor)` of floats or `np.ndarray` of values to be
                          used for data preprocessing. The first value will be subtracted from the input. The
                          input will then be divided by the second one.
    :param device_type: Type of device to be used for model and tensors, if `cpu` run on CPU, if `gpu` run on
                        GPU if available otherwise run on CPU.
    """
    import torch  # lgtm [py/repeated-import]

    from deepspeech_pytorch.configs.inference_config import LMConfig
    from deepspeech_pytorch.enums import DecoderType
    from deepspeech_pytorch.utils import load_decoder, load_model

    # Super initialization
    super().__init__(
        model=None,
        clip_values=clip_values,
        channels_first=None,
        preprocessing_defences=preprocessing_defences,
        postprocessing_defences=postprocessing_defences,
        preprocessing=preprocessing,
    )

    self.verbose = verbose

    # Check clip values
    if self.clip_values is not None:
        if not np.all(self.clip_values[0] == -1):
            raise ValueError("This estimator requires normalized input audios with clip_values=(-1, 1).")
        if not np.all(self.clip_values[1] == 1):
            raise ValueError("This estimator requires normalized input audios with clip_values=(-1, 1).")

    # Check postprocessing defences
    if self.postprocessing_defences is not None:
        raise ValueError("This estimator does not support `postprocessing_defences`.")

    # Set cpu/gpu device
    self._device: torch.device
    if device_type == "cpu" or not torch.cuda.is_available():
        self._device = torch.device("cpu")
    else:
        cuda_idx = torch.cuda.current_device()
        self._device = torch.device("cuda:{}".format(cuda_idx))

    self._input_shape = None

    # Load model
    if model is None:
        if pretrained_model == "an4":
            filename, url = (
                "an4_pretrained_v2.pth",
                "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/an4_pretrained_v2.pth",
            )

        elif pretrained_model == "librispeech":
            filename, url = (
                "librispeech_pretrained_v2.pth",
                "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/"
                "librispeech_pretrained_v2.pth",
            )

        elif pretrained_model == "tedlium":
            filename, url = (
                "ted_pretrained_v2.pth",
                "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/ted_pretrained_v2.pth",
            )

        elif pretrained_model is None:
            # If model is None and no pretrained model is selected, then we need to have parameters filename
            # and url to download, extract and load the automatic speech recognition model
            if filename is None or url is None:
                filename, url = (
                    "librispeech_pretrained_v2.pth",
                    "https://github.com/SeanNaren/deepspeech.pytorch/releases/download/v2.0/"
                    "librispeech_pretrained_v2.pth",
                )

        else:
            raise ValueError("The input pretrained model %s is not supported." % pretrained_model)

        # Download model
        model_path = get_file(
            filename=filename, path=config.ART_DATA_PATH, url=url, extract=False, verbose=self.verbose
        )

        # Then load model
        self._model = load_model(device=self._device, model_path=model_path, use_half=use_half)

    else:
        self._model = model

    # Push model to the corresponding device
    self._model.to(self._device)

    # Save first version of the optimizer
    self._optimizer = optimizer
    self._use_amp = use_amp

    # Now create a decoder
    # Create the language model config first
    lm_config = LMConfig()

    # Then setup the config
    if decoder_type == "greedy":
        lm_config.decoder_type = DecoderType.greedy
    elif decoder_type == "beam":
        lm_config.decoder_type = DecoderType.beam
    else:
        raise ValueError("Decoder type %s currently not supported." % decoder_type)

    lm_config.lm_path = lm_path
    lm_config.top_paths = top_paths
    lm_config.alpha = alpha
    lm_config.beta = beta
    lm_config.cutoff_top_n = cutoff_top_n
    lm_config.cutoff_prob = cutoff_prob
    lm_config.beam_width = beam_width
    lm_config.lm_workers = lm_workers

    # Create the decoder with the lm config
    self.decoder = load_decoder(labels=self._model.labels, cfg=lm_config)

    # Setup for AMP use
    if self._use_amp:
        from apex import amp

        if self._optimizer is None:
            logger.warning(
                "An optimizer is needed to use the automatic mixed precision tool, but none was provided. "
                "A default optimizer is used."
            )

            # Create the optimizers
            parameters = self._model.parameters()
            self._optimizer = torch.optim.SGD(parameters, lr=0.01)

        if self._device.type == "cpu":
            enabled = False
        else:
            enabled = True

        self._model, self._optimizer = amp.initialize(
            models=self._model,
            optimizers=self._optimizer,
            enabled=enabled,
            opt_level=opt_level,
            loss_scale=1.0,
        )

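# A minimal construction sketch for the estimator above (illustrative, not from
# the original source). It assumes ART's public import path and that predict()
# accepts a `transcription_output` keyword; the `an4` weights are downloaded on
# first use via get_file.
import numpy as np
from art.estimators.speech_recognition import PyTorchDeepSpeech

# Greedy decoding on CPU with the small `an4` pretrained model.
estimator = PyTorchDeepSpeech(pretrained_model="an4",
                              decoder_type="greedy",
                              device_type="cpu")

# Transcribe one second of dummy audio, normalized to [-1, 1] as the estimator requires.
audio = np.random.uniform(-1, 1, size=(1, 16000)).astype(np.float32)
transcription = estimator.predict(audio, transcription_output=True)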