Пример #1
0
 def __init__(
     self,
     model: EncDecCTCModel,
     sample_rate: int,
     batch_size: int = 1,
     device: str = "cuda",
 ) -> None:
     """Set up the online audio dataset, its loader and the inference model.

     Args:
         model: CTC model to run; it is put into eval mode and moved to
             ``device``.
         sample_rate: Forwarded to ``ASROnlineAudioData``.
         batch_size: Batch size used by the ``DataLoader``.
         device: Device string handed to ``torch.device``.
     """
     super().__init__()

     # The dataset supplies both the samples and the collate function.
     audio_source = ASROnlineAudioData(sample_rate)
     self.online_audio = audio_source
     self.data_loader = DataLoader(
         dataset=audio_source,
         batch_size=batch_size,
         collate_fn=audio_source.collate_fn,
     )

     # Inference only: switch to eval mode before placing the model.
     model.eval()
     target_device = torch.device(device)
     self.device = target_device
     self.model = model.to(target_device)
Пример #2
0
def generate_ref_hyps(asr_model: EncDecCTCModel, search: str, arpa: str):
    """Yield ``(reference, hypothesis)`` pairs for the model's test set.

    Runs ``asr_model`` over every batch of ``asr_model.test_dataloader()`` and
    decodes the output either greedily or with
    ``nemo_asr.modules.BeamSearchDecoderWithLM``.

    Args:
        asr_model: CTC model to evaluate. It is switched to eval mode and,
            when the module-level ``can_gpu`` flag is truthy, moved to the GPU.
        search: Decoding strategy: ``"greedy"``, ``"beamsearch"`` (beam
            search with ``lm_path=None``) or ``"kenlm"`` (beam search using
            the prepared ARPA file as language model).
        arpa: Passed to ``prepare_arpa_file``; only used by the two
            beam-search strategies.

    Yields:
        ``(reference, hyp)`` where ``reference`` is the ground-truth
        transcript rebuilt from the label ids via the decoder vocabulary and
        ``hyp`` is the corresponding decoded entry.

    Raises:
        ValueError: If ``search`` is not a supported strategy. Because this
            is a generator, the error surfaces on the first iteration.
    """
    supported_searches = ("greedy", "beamsearch", "kenlm")
    if search not in supported_searches:
        # Fail fast instead of hitting an unbound ``beamsearcher`` mid-loop.
        raise ValueError(
            f"Unsupported search {search!r}; expected one of {supported_searches}"
        )

    if can_gpu:
        asr_model = asr_model.cuda()
        print("USING GPU!")

    asr_model.eval()
    vocabulary = asr_model.decoder.vocabulary
    labels_map = dict(enumerate(vocabulary))
    wer = WER(vocabulary=vocabulary)

    beamsearcher = None
    if search != "greedy":
        arpa_file = prepare_arpa_file(arpa)
        lm_path = arpa_file if search == "kenlm" else None

        beamsearcher = nemo_asr.modules.BeamSearchDecoderWithLM(
            vocab=list(vocabulary),
            beam_width=16,
            alpha=2,
            beta=1.5,
            lm_path=lm_path,
            # os.cpu_count() may return None; fall back to a single worker.
            num_cpus=os.cpu_count() or 1,
            input_tensor=True,
        )

    for batch in asr_model.test_dataloader():
        # TODO(tilo): test_loader should return dict or some typed object not tuple of tensors!!
        if can_gpu:
            batch = [x.cuda() for x in batch]
        input_signal, inpsig_len, transcript, transc_len = batch
        with autocast():
            log_probs, encoded_len, greedy_predictions = asr_model(
                input_signal=input_signal, input_signal_length=inpsig_len)
        if beamsearcher is None:
            decoded = wer.ctc_decoder_predictions_tensor(greedy_predictions)
        else:
            decoded = beamsearch_forward(beamsearcher,
                                         log_probs=log_probs,
                                         log_probs_length=encoded_len)

        # Copy the label ids to host memory once per batch, not per sample.
        label_ids = transcript.detach().cpu().numpy()
        for i, hyp in enumerate(decoded):
            # Keep only the valid (unpadded) prefix of each transcript.
            valid_ids = label_ids[i][:int(transc_len[i])]
            reference = "".join(labels_map[c] for c in valid_ids)
            yield reference, hyp