def main(config_path, audio_path, plot):
    config = parse_yaml(config_path)["infer"]
    device = torch.device("cuda" if CUDA_ENABLED else "cpu")

    loader = Deserializer()
    model = loader.deserialize(TDSSeq2Seq, config["model_checkpoint_path"])
    model.to(device)
    model.eval()

    decoder = GreedyDecoder(model.decoder.labels)
    processor = AudioDataProcessor(**config)
    spect, lens = processor.parse_audio_for_inference(audio_path)
    spect = spect.to(device)

    # Watch out: lens is modified by this call!
    # Afterwards it equals the number of encoded states.
    with torch.no_grad():
        out, output_lens, attention = model(spect, lens)

    decoded_output, decoded_offsets = decoder.decode(out, output_lens)
    LOGGER.info(decoded_output)

    if plot:
        import matplotlib.pyplot as plt
        plt.matshow(attention[0].cpu().numpy())  # move to CPU before converting to numpy
        plt.show()
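# Usage sketch (not part of the original script); the config and audio paths below
# are hypothetical placeholders.
main("configs/infer_tds.yaml", "samples/example.wav", plot=True)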
def main(model_path, cuda, audio_path, **kwargs):
    device = torch.device("cuda" if cuda else "cpu")

    model = DeepSpeech2.load_model(model_path)
    model.to(device)  # move the model to the selected device before running inference
    model.eval()

    decoder = BeamCTCDecoder(model.labels, blank_index=model.labels.index('_'))
    processor = AudioDataProcessor(**model.audio_conf)

    spect = processor.parse_audio_from_file(audio_path)
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)
    input_sizes = torch.IntTensor([spect.size(3)]).int()

    with torch.no_grad():
        out, output_sizes = model(spect, input_sizes)

    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    print(decoded_output)
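# Usage sketch (not part of the original script): running the beam-search inference
# directly. The checkpoint and audio paths below are hypothetical placeholders; pass
# cuda=True only when a GPU is available.
main("pretrained/deepspeech_final.pth", cuda=False, audio_path="samples/example.wav")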
def evaluate_deepspeech():
    setup_logging(LOGGER)
    path_to_model_checkpoint = '/Users/florianlay/roboy/sonosco/pretrained/deepspeech_final.pth'
    config_path = "bootstrap_deepspeech.yaml"
    config = parse_yaml(config_path)
    experiment = Experiment.create(config, LOGGER)

    model = DeepSpeech2.load_model(path_to_model_checkpoint)
    model.eval()

    processor = AudioDataProcessor(**config)
    test_dataset = AudioDataset(processor, manifest_filepath=config["test_manifest"])
    sampler = RandomSampler(data_source=test_dataset, replacement=True, num_samples=2)
    test_loader = AudioDataLoader(dataset=test_dataset, num_workers=config["num_data_workers"],
                                  sampler=sampler)

    device = torch.device("cuda" if CUDA_ENABLED else "cpu")
    metrics = [word_error_rate, character_error_rate]
    evaluator = ModelEvaluator(model, test_loader, config['bootstrap_size'], config['num_bootstraps'],
                               decoder=GreedyDecoder(config['labels']), device=device, metrics=metrics)
    evaluator.start_evaluation(log_path=experiment.logs)
class DeepSpeech2Inference(SonoscoASR):

    def __init__(self, model_path):
        """
        DeepSpeech inference implementation.

        Args:
            model_path: path to DeepSpeech model
        """
        super().__init__(model_path)
        self.model = DeepSpeech2.load_model(model_path)
        self.model.eval()
        self.processor = AudioDataProcessor(**self.model.audio_conf)
        self.decoder = GreedyDecoder(self.model.labels,
                                     blank_index=self.model.labels.index('_'))

    def infer_from_path(self, path: str) -> str:
        """
        Infer speech from audio under path.

        Args:
            path: path to audio

        Returns:
            inferred text
        """
        loaded, sr = librosa.load(path, sr=self.processor.sample_rate)
        spect = self.processor.parse_audio(sound=loaded, sample_rate=sr)
        spect = spect.view(1, 1, spect.size(0), spect.size(1))
        spect = spect.to(DEVICE)
        input_sizes = torch.IntTensor([spect.size(3)]).int()

        with torch.no_grad():
            out, output_sizes = self.model(spect, input_sizes)

        decoded_output, decoded_offsets = self.decoder.decode(out, output_sizes,
                                                              remove_repetitions=True)
        return decoded_output[0]
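# Usage sketch (not part of the original source): the class hides preprocessing and
# decoding behind a single call. The checkpoint and audio paths below are hypothetical
# placeholders.
asr = DeepSpeech2Inference("pretrained/deepspeech_final.pth")
print(asr.infer_from_path("samples/example.wav"))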
def main(config_path, audio_path):
    config = parse_yaml(config_path)["infer"]

    loader = Deserializer()
    model: Seq2Seq = loader.deserialize(Seq2Seq, config["model_checkpoint_path"])
    model.to(DEVICE)
    model.eval()

    decoder = GreedyDecoder(config["labels"])
    processor = AudioDataProcessor(**config)
    spect, lens = processor.parse_audio_for_inference(audio_path)
    spect = spect.to(DEVICE)

    with torch.no_grad():
        output = model.recognize(spect[0], lens, config["labels"], config["recognizer"])[0]

    transcription = decoder.convert_to_strings(torch.tensor([output['yseq']]))
    LOGGER.info(transcription)
class LasInference(SonoscoASR):

    def __init__(self, model_path):
        """
        LAS inference implementation.

        Args:
            model_path: path to LAS model
        """
        super().__init__(model_path)
        self.model, self.config = self.loader.deserialize(Seq2Seq, model_path, with_config=True)
        self.model.eval()
        self.processor = AudioDataProcessor(**self.config)
        self.decoder = GreedyDecoder(self.config["labels"])

    def infer_from_path(self, path: str) -> str:
        """
        Infer speech from audio under path.

        Args:
            path: path to audio

        Returns:
            inferred text
        """
        spect, lens = self.processor.parse_audio_for_inference(path)
        spect = spect.to(DEVICE)

        with torch.no_grad():
            output = self.model.recognize(spect[0], lens, self.config["labels"],
                                          self.config["recognizer"])[0]

        transcription = self.decoder.convert_to_strings(torch.tensor([output['yseq']]))[0]
        return transcription
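# Usage sketch (not part of the original source): same interface as DeepSpeech2Inference,
# backed by the LAS model. The checkpoint and audio paths below are hypothetical placeholders.
asr = LasInference("pretrained/las_final.pth")
print(asr.infer_from_path("samples/example.wav"))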