Example #1
File: infer.py Project: battyone/sonosco
def main(config_path, audio_path, plot):
    config = parse_yaml(config_path)["infer"]
    device = torch.device("cuda" if CUDA_ENABLED else "cpu")

    loader = Deserializer()
    model = loader.deserialize(TDSSeq2Seq, config["model_checkpoint_path"])
    model.to(device)
    model.eval()

    decoder = GreedyDecoder(model.decoder.labels)

    processor = AudioDataProcessor(**config)
    spect, lens = processor.parse_audio_for_inference(audio_path)
    spect = spect.to(device)

    # Watch out: the model call below modifies lens in place!
    # Afterwards it equals the number of encoded states.
    with torch.no_grad():
        out, output_lens, attention = model(spect, lens)
        decoded_output, decoded_offsets = decoder.decode(out, output_lens)
        LOGGER.info(decoded_output)
        if plot:
            import matplotlib.pyplot as plt
            plt.matshow(attention[0].cpu().numpy())  # move off the GPU before converting to NumPy
            plt.show()
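A minimal invocation sketch, assuming main is called directly (both paths are hypothetical; the YAML needs an "infer" section containing "model_checkpoint_path" plus the keyword arguments AudioDataProcessor expects):

main("config/infer.yaml", "samples/test.wav", plot=True)  # hypothetical paths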
Example #2
    def __init__(self, model_path):
        """
        DeepSpeech inference implementation.
        Args:
            model_path: path to the DeepSpeech model checkpoint
        """
        super().__init__(model_path)
        self.model = DeepSpeech2.load_model(model_path)
        self.model.eval()
        self.processor = AudioDataProcessor(**self.model.audio_conf)
        self.decoder = GreedyDecoder(self.model.labels,
                                     blank_index=self.model.labels.index('_'))
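A note on blank_index: the greedy CTC decoder must know which label is the blank symbol so it can collapse repeats, and the constructor looks it up by position. A hypothetical illustration:

labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ "  # hypothetical label string; '_' is the CTC blank
blank_index = labels.index('_')           # 0 for this label string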
Example #3
    def __init__(self, model_path):
        """
        LAS inference implementation.
        Args:
            model_path: path to the LAS model checkpoint
        """
        super().__init__(model_path)
        self.model, self.config = self.loader.deserialize(Seq2Seq,
                                                          model_path,
                                                          with_config=True)
        self.model.eval()
        self.processor = AudioDataProcessor(**self.config)
        self.decoder = GreedyDecoder(self.config["labels"])
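The with_config=True flag makes the Deserializer return the saved configuration alongside the model, which is what lets this constructor build the processor and decoder without extra arguments. A hedged sketch of that contract (the checkpoint path is hypothetical):

loader = Deserializer()
model, config = loader.deserialize(Seq2Seq, "checkpoints/las.pth", with_config=True)
# config carries the audio-processing kwargs and the label set the model was trained with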
Example #4
File: test.py Project: battyone/sonosco
def main(model_path, cuda, audio_path, **kwargs):
    device = torch.device("cuda" if cuda else "cpu")
    model = DeepSpeech2.load_model(model_path)
    model.to(device)  # keep the model on the same device as the input tensor
    model.eval()
    decoder = BeamCTCDecoder(model.labels, blank_index=model.labels.index('_'))
    processor = AudioDataProcessor(**model.audio_conf)

    spect = processor.parse_audio_from_file(audio_path)
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    spect = spect.to(device)
    input_sizes = torch.IntTensor([spect.size(3)])  # number of time steps in the spectrogram
    with torch.no_grad():  # inference only, no gradients needed
        out, output_sizes = model(spect, input_sizes)
    decoded_output, decoded_offsets = decoder.decode(out, output_sizes)
    print(decoded_output)
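A minimal sketch of calling this entry point directly (hypothetical paths; cuda=False keeps model and input on the CPU):

main("pretrained/deepspeech_final.pth", cuda=False, audio_path="samples/test.wav")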
Example #5
def evaluate_deepspeech():
    setup_logging(LOGGER)
    path_to_model_checkpoint = '/Users/florianlay/roboy/sonosco/pretrained/deepspeech_final.pth'
    config_path = "bootstrap_deepspeech.yaml"

    config = parse_yaml(config_path)

    experiment = Experiment.create(config, LOGGER)

    model = DeepSpeech2.load_model(path_to_model_checkpoint)
    model.eval()

    processor = AudioDataProcessor(**config)
    test_dataset = AudioDataset(processor, manifest_filepath=config["test_manifest"])
    sampler = RandomSampler(data_source=test_dataset, replacement=True,
                            num_samples=2)
    test_loader = AudioDataLoader(dataset=test_dataset,
                                  num_workers=config["num_data_workers"],
                                  sampler=sampler)

    device = torch.device("cuda" if CUDA_ENABLED else "cpu")

    metrics = [word_error_rate, character_error_rate]
    evaluator = ModelEvaluator(model,
                               test_loader,
                               config['bootstrap_size'],
                               config['num_bootstraps'],
                               decoder=GreedyDecoder(config['labels']),
                               device=device,
                               metrics=metrics)

    evaluator.start_evaluation(log_path=experiment.logs)
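The metrics passed to ModelEvaluator are sonosco's word_error_rate and character_error_rate. As an illustration of what such a metric computes (a hypothetical reimplementation, not sonosco's code), WER is the word-level Levenshtein distance normalized by the reference length:

def wer_sketch(reference: str, hypothesis: str) -> float:
    # Dynamic-programming edit distance over words.
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i  # delete all reference words up to i
    for j in range(len(hyp) + 1):
        d[0][j] = j  # insert all hypothesis words up to j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,        # deletion
                          d[i][j - 1] + 1,        # insertion
                          d[i - 1][j - 1] + sub)  # substitution or match
    return d[len(ref)][len(hyp)] / max(len(ref), 1)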
Example #6
class DeepSpeech2Inference(SonoscoASR):
    def __init__(self, model_path):
        """
        DeepSpeech inference implementation.
        Args:
            model_path: path to the DeepSpeech model checkpoint
        """
        super().__init__(model_path)
        self.model = DeepSpeech2.load_model(model_path)
        self.model.eval()
        self.processor = AudioDataProcessor(**self.model.audio_conf)
        self.decoder = GreedyDecoder(self.model.labels,
                                     blank_index=self.model.labels.index('_'))

    def infer_from_path(self, path: str) -> str:
        """
        Infer speech from audio under path
        Args:
            path: path to audio

        Returns: inferred text

        """
        loaded, sr = librosa.load(path, sr=self.processor.sample_rate)
        spect = self.processor.parse_audio(sound=loaded, sample_rate=sr)
        spect = spect.view(1, 1, spect.size(0), spect.size(1))
        spect = spect.to(DEVICE)
        input_sizes = torch.IntTensor([spect.size(3)])  # number of time steps in the spectrogram
        with torch.no_grad():
            out, output_sizes = self.model(spect, input_sizes)
        decoded_output, decoded_offsets = self.decoder.decode(
            out, output_sizes, remove_repetitions=True)
        return decoded_output[0]
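End-to-end usage of the class above (hypothetical paths):

asr = DeepSpeech2Inference("pretrained/deepspeech_final.pth")
print(asr.infer_from_path("samples/test.wav"))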
Example #7
def main(config_path, audio_path):
    config = parse_yaml(config_path)["infer"]

    loader = Deserializer()
    model: Seq2Seq = loader.deserialize(Seq2Seq,
                                        config["model_checkpoint_path"])
    model.to(DEVICE)
    model.eval()

    decoder = GreedyDecoder(config["labels"])

    processor = AudioDataProcessor(**config)
    spect, lens = processor.parse_audio_for_inference(audio_path)
    spect = spect.to(DEVICE)

    with torch.no_grad():
        output = model.recognize(spect[0], lens, config["labels"],
                                 config["recognizer"])[0]
        transcription = decoder.convert_to_strings(
            torch.tensor([output['yseq']]))
        LOGGER.info(transcription)
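A minimal invocation sketch (hypothetical paths; beyond the keys used in Example #1, the "infer" section must also provide "labels" and a "recognizer" sub-config, both read by this entry point):

main("config/infer.yaml", "samples/test.wav")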
Example #8
class LasInference(SonoscoASR):
    def __init__(self, model_path):
        """
        LAS inference implementation.
        Args:
            model_path: path to LAs model
        """
        super().__init__(model_path)
        self.model, self.config = self.loader.deserialize(Seq2Seq,
                                                          model_path,
                                                          with_config=True)
        self.model.eval()
        self.processor = AudioDataProcessor(**self.config)
        self.decoder = GreedyDecoder(self.config["labels"])

    def infer_from_path(self, path: str) -> str:
        """
        Infer speech from audio under path
        Args:
            path: path to audio

        Returns: inferred text

        """
        spect, lens = self.processor.parse_audio_for_inference(path)
        spect = spect.to(DEVICE)

        with torch.no_grad():
            output = self.model.recognize(spect[0], lens,
                                          self.config["labels"],
                                          self.config["recognizer"])[0]

        transcription = self.decoder.convert_to_strings(
            torch.tensor([output['yseq']]))[0]

        return transcription
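End-to-end usage (hypothetical paths):

asr = LasInference("checkpoints/las.pth")
print(asr.infer_from_path("samples/test.wav"))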