Python CTCBeamDecoder 예제들, ctcdecode.CTCBeamDecoder Python 예제들

예제 #1

0

파일 보기

파일: decoder.py 프로젝트: maggieezzat/deepspeech.pytorch

 def __init__(
     self,
     labels,
     lm_path=None,
     alpha=0,
     beta=0,
     cutoff_top_n=40,
     cutoff_prob=1.0,
     beam_width=100,
     num_processes=4,
     blank_index=0,
 ):
     """Create a beam-search CTC decoder backed by ``ctcdecode``.

     ``labels`` is handed to the parent initializer, and then every
     argument is forwarded positionally, in declaration order, to
     ``ctcdecode.CTCBeamDecoder``.

     Raises:
         ImportError: if the ``ctcdecode`` package cannot be imported.
     """
     super(BeamCTCDecoder, self).__init__(labels)
     try:
         # Imported lazily so the module can be loaded without ctcdecode.
         from ctcdecode import CTCBeamDecoder
     except ImportError as err:
         # Name the package that actually failed to import and keep the
         # original failure attached as the cause.
         raise ImportError(
             "BeamCTCDecoder requires ctcdecode package."
         ) from err
     self._decoder = CTCBeamDecoder(
         labels,
         lm_path,
         alpha,
         beta,
         cutoff_top_n,
         cutoff_prob,
         beam_width,
         num_processes,
         blank_index,
     )

예제 #2

0

파일 보기

파일: eval.py 프로젝트: YIWANG3/phoneme_rnn

def validate(model, dev_loader):
    """Print the running and final average Levenshtein distance on ``dev_loader``.

    The model is switched to eval mode and moved to CUDA. Each batch is
    decoded with a beam-search CTC decoder, the first beam of every
    utterance is converted to phonemes, and it is compared with the
    ground-truth phoneme string.
    """
    decoder = CTCBeamDecoder(['$'] * 47, beam_width=100, log_probs_input=True)
    with torch.no_grad():
        model.eval()
        model.cuda()
        count = 0
        dist_sum = 0
        for batch_idx, lst in enumerate(dev_loader):
            X, X_lens, Y, Y_lens = process_train_lst(lst)
            out, out_lens = model(X, X_lens)
            # The decoder is given the model output with its first two
            # axes swapped.
            decoded = decoder.decode(out.transpose(0, 1), out_lens)
            val_Y, val_Y_lens = decoded[0], decoded[3]
            this_batch_size = val_Y.shape[0]
            rows = range(this_batch_size)

            # First beam only, trimmed to its decoded length.
            predicted_list = [val_Y[i, 0, :val_Y_lens[i, 0]] for i in rows]
            ground_truth_list = [Y[i, 0:Y_lens[i]] for i in rows]
            ground_truth_phoneme_list = convert_to_phoneme(ground_truth_list)
            predicted_phoneme_list = convert_to_phoneme(predicted_list)

            for i in rows:
                hypothesis = "".join(predicted_phoneme_list[i])
                reference = "".join(ground_truth_phoneme_list[i])
                count += 1
                dist_sum += Levenshtein.distance(hypothesis, reference)
            print(f"Batch: {batch_idx} | Avg Distance: {dist_sum / count}")
        print("Dev Avg Distance: {:.4f}".format(dist_sum / count))

예제 #3

0

파일 보기

 def __init__(self,
              labels,
              lm_path=None,
              alpha=0,
              beta=0,
              cutoff_top_n=40,
              cutoff_prob=1.0,
              beam_width=100,
              num_processes=16,
              blank_index=0):
     """Create a beam-search CTC decoder that takes log-probability input.

     ``labels`` is passed unchanged to the parent initializer, then
     converted to a ``list`` before being handed to ``CTCBeamDecoder``.
     The remaining arguments are forwarded positionally in declaration
     order; ``log_probs_input`` is always ``True``.

     NOTE(review): ``CTCBeamDecoder`` is not imported in this method, so
     it is presumably imported at module level -- confirm.
     """
     super(BeamCTCDecoder, self).__init__(labels)
     labels = list(labels)
     self._decoder = CTCBeamDecoder(labels,
                                    lm_path,
                                    alpha,
                                    beta,
                                    cutoff_top_n,
                                    cutoff_prob,
                                    beam_width,
                                    num_processes,
                                    blank_index,
                                    log_probs_input=True)

예제 #4

0

파일 보기

    def __init__(self,
                 labels,
                 lm_path=None,
                 alpha=1.5,
                 beta=0.8,
                 cutoff_top_n=15,
                 cutoff_prob=1.0,
                 beam_width=256,
                 num_processes=4,
                 blank_id=31,
                 log_probs_input=False):
        """Build the beam-search decoder and the label lookup table.

        ``lm_path`` is passed to ``CTCBeamDecoder`` as ``model_path``; all
        other options are forwarded under their own names. A progress
        message is printed before and after construction.
        """
        print("Initializing Decoder")

        # Collect the keyword options first so the constructor call stays short.
        decoder_options = {
            "model_path": lm_path,
            "alpha": alpha,
            "beta": beta,
            "cutoff_top_n": cutoff_top_n,
            "cutoff_prob": cutoff_prob,
            "beam_width": beam_width,
            "num_processes": num_processes,
            "blank_id": blank_id,
            "log_probs_input": log_probs_input,
        }
        self.decoder = CTCBeamDecoder(labels, **decoder_options)
        self.decode_dict = self._dict_from_labels(labels)

        print("Decoder ready")

예제 #5

0

파일 보기

파일: training.py 프로젝트: catapulta/speech-recognition

    def __init__(self, model, loader, val_loader, test_loader, max_epochs=1, run_id='exp'):
        """
            Hold everything needed to train ``model``: the data loaders,
            bookkeeping lists, the Adam optimizer, CTC loss, LR scheduler,
            Levenshtein scorer and beam-search decoder.
        """
        on_gpu = torch.cuda.is_available()

        # Model and data sources.
        self.model = model.cuda() if on_gpu else model
        self.loader, self.val_loader, self.test_loader = loader, val_loader, test_loader

        # Run bookkeeping.
        self.run_id = run_id
        self.epochs = 0
        self.max_epochs = max_epochs
        self.best_rate = 1e10

        # Per-run histories and outputs, all starting empty.
        self.train_losses, self.val_losses = [], []
        self.predictions, self.predictions_test = [], []
        self.generated_logits, self.generated = [], []
        self.generated_logits_test, self.generated_test = [], []

        # Optimisation: the scheduler cuts the LR by 10x after 2 stalled steps.
        self.optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
        loss_fn = CTCLoss()
        self.criterion = loss_fn.cuda() if on_gpu else loss_fn
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, factor=0.1, patience=2)

        # Scoring and decoding; the blank label is prepended at index 0.
        self.LD = Levenshtein(phoneme_list.PHONEME_MAP)
        self.decoder = CTCBeamDecoder(
            labels=[' '] + phoneme_list.PHONEME_MAP,
            blank_id=0,
            beam_width=150,
        )

예제 #6

0

파일 보기

파일: training.py 프로젝트: catapulta/speech-recognition

 def __init__(self, charmap):
     """Store ``charmap`` with a blank prepended and build a width-100 beam decoder."""
     # Index 0 is the blank symbol, matching ``blank_id=0`` below.
     blank_first = [' '] + charmap
     self.label_map = blank_first
     self.decoder = CTCBeamDecoder(labels=blank_first, beam_width=100, blank_id=0)

예제 #7

0

파일 보기

 def __init__(self):
     """Initialise the parent and build a beam decoder over ``PHONEME_MAP``."""
     super().__init__()
     # A blank symbol is prepended so that it sits at index 0 (``blank_id=0``).
     blank_first = [' '] + PHONEME_MAP
     self.labels = blank_first
     self.decoder = CTCBeamDecoder(
         labels=blank_first,
         blank_id=0,
         beam_width=100,
         num_processes=32,
     )

예제 #8

0

파일 보기

파일: ctc_beam_decoder.py 프로젝트: yannidd/aitrainer

    def __init__(
            self,
            labels: list = LABELS,
            beam_width: int = 100,
            model_path: str = None,
            alpha: float = 0.0,
            beta: float = 0.0,
            cutoff_top_n: int = 40,
            cutoff_prob: float = 1.0,
            blank_id: int = LABELS.index('_'),
            log_probs_input: bool = False,
    ):
        """Store the decoding options and build the ``CTCBeamDecoder``.

        Every argument is kept as an attribute of the same name and
        forwarded to ``CTCBeamDecoder`` as a keyword argument. The decoder
        is given one process per CPU reported by ``os.cpu_count()``.

        Note that the default ``blank_id`` is computed once, at definition
        time, from the module-level ``LABELS``; pass ``blank_id`` explicitly
        when supplying a different ``labels`` list.
        """
        self.labels = labels
        self.beam_width = beam_width
        self.model_path = model_path
        self.alpha = alpha
        self.beta = beta
        self.cutoff_top_n = cutoff_top_n
        self.cutoff_prob = cutoff_prob
        self.blank_id = blank_id
        self.log_probs_input = log_probs_input

        # os.cpu_count() may return None when the count cannot be determined;
        # max(None, 1) would raise TypeError, so fall back to one process.
        num_processes = os.cpu_count() or 1

        self.decoder = CTCBeamDecoder(labels=labels,
                                      beam_width=beam_width,
                                      model_path=model_path,
                                      alpha=alpha,
                                      beta=beta,
                                      cutoff_top_n=cutoff_top_n,
                                      cutoff_prob=cutoff_prob,
                                      num_processes=num_processes,
                                      blank_id=blank_id,
                                      log_probs_input=log_probs_input)

예제 #9

0

파일 보기

파일: ctc_decoder.py 프로젝트: toshiks/number_recognizer

 def __init__(self,
              blank_id: int,
              alphabet: List[str],
              count_prediction=10):
     """Build a beam decoder over ``alphabet``.

     ``count_prediction`` is used as the beam width and ``blank_id`` as
     the index of the CTC blank symbol.
     """
     decoder_options = {"beam_width": count_prediction, "blank_id": blank_id}
     self.decoder = CTCBeamDecoder(alphabet, **decoder_options)

예제 #10

0

파일 보기

파일: pred.py 프로젝트: ziqian98/Deep-Learning-Projects

def pred_model(model, test_loader):
    """Decode every batch of ``test_loader`` and return the predicted strings.

    For each utterance the first beam returned by the decoder is trimmed
    to its reported length and mapped through the label list (a blank
    ``" "`` followed by ``PHONEME_MAP``).

    Returns:
        list of str: one decoded string per utterance, in loader order.
    """
    # The label list and decoder do not depend on the batch, so build them once
    # instead of once per iteration.
    phonemes = [" "] + PHONEME_MAP
    decoder = CTCBeamDecoder(phonemes,
                             beam_width=10,
                             log_probs_input=True)

    predLabel = []
    with torch.no_grad():
        model.eval()

        for padinp, xlens in test_loader:
            padinp = padinp.to(device)
            out, out_lens = model(padinp, xlens)
            out_lens = torch.LongTensor(out_lens)

            # The decoder is given the model output with its first two axes
            # swapped.
            pred, _, _, pred_lens = decoder.decode(out.transpose(0, 1),
                                                   out_lens)

            # join/extend avoid the quadratic cost of repeated `+=` and `+`.
            predLabel.extend(
                "".join(phonemes[int(pred[i, 0, j])]
                        for j in range(int(pred_lens[i, 0])))
                for i in range(len(pred))
            )

    return predLabel

예제 #11

0

파일 보기

파일: decoder.py 프로젝트: unixnme/deepspeech.pytorch

 def __init__(self,
              labels,
              lm_path=None,
              alpha=0,
              beta=0,
              cutoff_top_n=40,
              cutoff_prob=1.0,
              beam_width=100,
              num_processes=4,
              blank_index=0,
              wfst: bool = False):
     """Create a beam-search CTC decoder, optionally in WFST mode.

     All arguments except ``wfst`` are forwarded positionally, in
     declaration order, to ``CTCBeamDecoder``; ``wfst`` is passed by
     keyword and also stored on the instance. When ``wfst`` is true an
     integer-to-integer ``self.mapping`` table is built as well.

     Raises:
         ImportError: if the ``ctcdecode`` package cannot be imported.
     """
     super(BeamCTCDecoder, self).__init__(labels)
     try:
         from ctcdecode import CTCBeamDecoder
     except ImportError as err:
         # Name the package that actually failed to import and keep the
         # original failure attached as the cause.
         raise ImportError(
             "BeamCTCDecoder requires ctcdecode package."
         ) from err
     self._decoder = CTCBeamDecoder(labels,
                                    lm_path,
                                    alpha,
                                    beta,
                                    cutoff_top_n,
                                    cutoff_prob,
                                    beam_width,
                                    num_processes,
                                    blank_index,
                                    wfst=wfst)
     self.wfst = wfst
     if wfst:
         # Keys 65..90 map to 2..27; the keys look like ASCII codes of
         # 'A'..'Z' -- TODO confirm against the WFST symbol table.
         self.mapping = {65 + i: 2 + i for i in range(26)}
         self.mapping[39] = 1  # '
         self.mapping[32] = 28  # space
         self.mapping[0] = 0

예제 #12

0

파일 보기

파일: beam_decoder.py 프로젝트: battyone/sonosco

 def __init__(self,
              labels: str,
              lm_path: str = None,
              alpha: int = 0,
              beta: int = 0,
              cutoff_top_n: int = 40,
              cutoff_prob: float = 1.0,
              beam_width: int = 100,
              num_processes: int = 4,
              blank_index: int = 0):
     """
     CTC beam-search decoder backed by ``ctcdecode.CTCBeamDecoder``.

     Every argument is forwarded positionally, in declaration order, to
     ``CTCBeamDecoder``; ``labels`` is also passed to the parent
     initializer.

     Args:
         labels: labels
         lm_path: language model path
         alpha: ctc param
         beta: ctc param
         cutoff_top_n: ctc param
         cutoff_prob: ctc param
         beam_width: ctc param
         num_processes: ctc param
         blank_index: ctc param

     Raises:
         ImportError: if the ``ctcdecode`` package cannot be imported.
     """
     super(BeamCTCDecoder, self).__init__(labels)
     try:
         from ctcdecode import CTCBeamDecoder
     except ImportError as err:
         # Name the package that actually failed to import and keep the
         # original failure attached as the cause.
         raise ImportError(
             "BeamCTCDecoder requires ctcdecode package."
         ) from err
     self._decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width,
                                    num_processes, blank_index)

예제 #13

0

파일 보기

파일: beam.py 프로젝트: vilmara/inference

    def __init__(self,
                 alphabet,
                 blank_symbol,
                 model_path=None,
                 alpha=1.0,
                 beta=1.0,
                 cutoff_prob=1.0,
                 cutoff_top_n=None,
                 beam_width=128,
                 num_processes=4):
        """Build a beam-search decoder, warning when no language model is in effect.

        A falsy ``cutoff_top_n`` is replaced by ``len(alphabet)``. The blank
        index is looked up with ``alphabet.get_index(blank_symbol)``. A
        warning is logged when ``model_path`` is ``None``, or when a model
        path is given but ``alpha`` equals zero.
        """
        super().__init__(alphabet, blank_symbol)

        if not cutoff_top_n:
            cutoff_top_n = len(alphabet)
        blank_id = alphabet.get_index(blank_symbol)

        # The two warnings are mutually exclusive: either there is no model,
        # or there is one whose weight is zero.
        if model_path is None:
            self._logger.warning('language model will not be used as '
                                 '`model_path` is None')
        elif alpha == 0.0:
            self._logger.warning("language model will not be used as it's "
                                 "weighting `alpha` is zero")

        decoder_options = dict(
            labels=alphabet,
            model_path=model_path,
            alpha=alpha,
            beta=beta,
            cutoff_top_n=cutoff_top_n,
            cutoff_prob=cutoff_prob,
            beam_width=beam_width,
            num_processes=num_processes,
            blank_id=blank_id,
        )
        self._decoder = CTCBeamDecoder(**decoder_options)

예제 #14

0

파일 보기

파일: decoder_ronit.py 프로젝트: ronitd/asr_for_low_resource_languages

 def __init__(self,
              labels,
              ctc_labels=None,
              lm_path=None,
              alpha=0,
              beta=0,
              cutoff_top_n=25,
              cutoff_prob=-2.1,
              beam_width=100,
              num_processes=4,
              blank_index=0,
              log_probs_input=True,
              phoneme_vocab=None,
              trie=None):
     """Create a beam-search CTC decoder over ``ctc_labels``.

     ``labels`` goes to the parent initializer; the decoder itself is
     built from ``ctc_labels``.

     NOTE(review): ``blank_index``, ``log_probs_input``, ``phoneme_vocab``
     and ``trie`` are accepted but not used in this method, and the
     decoder is always created with ``log_probs_input=False`` regardless
     of the argument -- confirm this is intended.

     Raises:
         ImportError: if ``ctcdecode`` or the local
             ``CTCBeamSearchCustom`` module cannot be imported.
     """
     super(BeamCTCDecoder, self).__init__(labels)
     self.ctc_labels = ctc_labels
     # Imported but not referenced below; kept so a missing module still
     # fails here.
     from .CTCBeamSearchCustom import CTCBeamSearchCustom
     try:
         from ctcdecode import CTCBeamDecoder
     except ImportError:
         raise ImportError("BeamCTCDecoder requires ctcdecoder package")
     keyword_options = {
         "beam_width": beam_width,
         "num_processes": num_processes,
         "log_probs_input": False,
     }
     self._decoder = CTCBeamDecoder(
         ctc_labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob,
         **keyword_options)

예제 #15

0

파일 보기

    def __init__(
        self,
        labels,
        lm_path=None,
        alpha=0,
        beta=0,
        cutoff_top_n=40,
        cutoff_prob=1.0,
        beam_width=100,
        num_processes=4,
        blank_index=0,
    ):
        """Create a beam-search CTC decoder backed by ``ctcdecode``.

        ``labels`` is handed to the parent initializer, and then every
        argument is forwarded positionally, in declaration order, to
        ``ctcdecode.CTCBeamDecoder``.

        Raises:
            ImportError: if the ``ctcdecode`` package cannot be imported.
        """
        super(BeamCTCDecoder, self).__init__(labels)
        try:
            from ctcdecode import CTCBeamDecoder
        except ImportError as err:
            # An empty stand-in class cannot accept the nine arguments passed
            # below, so falling back to one only swaps the real cause for an
            # opaque TypeError. Report the missing dependency directly.
            raise ImportError(
                "BeamCTCDecoder requires ctcdecode package."
            ) from err

        self._decoder = CTCBeamDecoder(
            labels,
            lm_path,
            alpha,
            beta,
            cutoff_top_n,
            cutoff_prob,
            beam_width,
            num_processes,
            blank_index,
        )

예제 #16

0

파일 보기

파일: decoders.py 프로젝트: lfelipesv/speech2text

    def __init__(self,
                 alphabet,
                 lm_path=None,
                 alpha=0,
                 beta=0,
                 cutoff_top_n=40,
                 cutoff_prob=1.0,
                 beam_width=100,
                 num_processes=4):
        """Create a beam-search CTC decoder over ``alphabet.tokens``.

        The blank index is read from ``alphabet.blank_index`` and the
        decoder is configured for log-probability input.

        Raises:
            ImportError: if the ``ctcdecode`` package cannot be imported.
        """
        super().__init__(alphabet)

        try:
            from ctcdecode import CTCBeamDecoder
        except ImportError:
            raise ImportError("BeamCTCDecoder requires ctcdecode package.")

        # Positional order matters here; only ``log_probs_input`` is passed
        # by keyword.
        positional = (
            alphabet.tokens,
            lm_path,
            alpha,
            beta,
            cutoff_top_n,
            cutoff_prob,
            beam_width,
            num_processes,
            alphabet.blank_index,
        )
        self._decoder = CTCBeamDecoder(*positional, log_probs_input=True)

예제 #17

0

파일 보기

파일: utils.py 프로젝트: nightfuryyy/deep-text-recognition-benchmark

 def decode_beamsearch(self, preds):
     """Beam-search decode ``preds`` and return one string per batch item.

     ``preds`` is soft-maxed over axis 2 and decoded with a freshly built
     ``CTCBeamDecoder`` (beam width 4, blank index 0). For each item the
     first beam is trimmed to its reported length and its indices are
     looked up in ``self.character``.
     """
     probs = preds.softmax(2)
     decoder = CTCBeamDecoder(
         self.character,
         model_path=None,
         alpha=0,
         beta=0,
         cutoff_top_n=10,
         cutoff_prob=1.0,
         beam_width=4,
         num_processes=16,
         blank_id=0,
         log_probs_input=False
     )
     beam_results, beam_scores, timesteps, out_lens = decoder.decode(probs)
     return [
         "".join(self.character[n]
                 for n in beam_results[i][0][:out_lens[i][0]])
         for i in range(probs.shape[0])
     ]

예제 #18

0

파일 보기

파일: dynamic_cnn_transformer_ctc_regulated.py 프로젝트: tongjinle123/speech_recognition

 def init_beam_decoder(self, alpha=0.8, beta=0.3, cutoff_top_n=40, cutoff_prob=1.0, beam_width=32, num_processes=4, use_lm=True):
     """Create ``self.beam_decoder`` over the vocabulary's id-to-token list.

     When ``use_lm`` is true a fixed language-model path is passed to the
     decoder; otherwise no model is used. The blank index is fixed at 1
     and the decoder expects log-probability input.
     """
     if use_lm:
         lm_path = "lm/zh_giga.no_cna_cmn.prune01244.klm"
     else:
         lm_path = None
     blank_index = 1
     self.beam_decoder = CTCBeamDecoder(
         labels=self.vocab._id2token,
         model_path=lm_path,
         alpha=alpha,
         beta=beta,
         cutoff_top_n=cutoff_top_n,
         cutoff_prob=cutoff_prob,
         beam_width=beam_width,
         num_processes=num_processes,
         blank_id=blank_index,
         log_probs_input=True,
     )

예제 #19

0

파일 보기

파일: test.py 프로젝트: JLuisRojas/pytorch-reconocimiento-voz

    def __init__(self, params):
        """Set up the DeepSpeech2 model, CTC loss, vocabulary and beam decoder.

        ``params`` is accepted but not used in this method. The vocabulary
        is printed once during construction.
        """
        super().__init__()
        self.model = DeepSpeech2()
        # Unreduced loss: one value per batch element.
        self.ctc_loss = nn.CTCLoss(reduction='none')

        vocabulary = list('_abcdefghijklmnñopqrstuvwxyz ')
        self.vocab_str = vocabulary
        print(vocabulary)

        self.ctc_decoder = CTCBeamDecoder(vocabulary, log_probs_input=True)

예제 #20

0

파일 보기

 def __init__(self, device='cpu'):
     """Start with empty prediction/ground-truth lists and a width-1 beam decoder."""
     self.device = device
     self.preds, self.gts = [], []
     decoder_options = {
         "beam_width": 1,
         "num_processes": 16,
         "blank_id": 0,
         "log_probs_input": False,
     }
     self.decoder = CTCBeamDecoder(CHAR_LIST, **decoder_options)

예제 #21

0

파일 보기

 def __init__(self):
     """Build beam and greedy decoders over ``PHONEME_MAP`` plus a trailing blank.

     Both decoders use ``phonemes_len`` as the blank index.
     """
     # The blank symbol is appended at the end of both label lists.
     blank_last = PHONEME_MAP + [' ']
     self.label_map = blank_last
     self.phoneme_list = PHONEME_LIST + [' ']
     self.decoder = CTCBeamDecoder(
         labels=blank_last,
         blank_id=phonemes_len,
         log_probs_input=True,
         beam_width=200,
     )
     self.greedy_decoder = GreedyDecoder(
         labels=blank_last,
         blank_index=phonemes_len,
     )

예제 #22

0

파일 보기

파일: decoders.py 프로젝트: lfelipesv/speech2text

class BeamCTCDecoder(Decoder):
    """Beam-search CTC decoder that wraps ``ctcdecode.CTCBeamDecoder``."""

    def __init__(self,
                 alphabet,
                 lm_path=None,
                 alpha=0,
                 beta=0,
                 cutoff_top_n=40,
                 cutoff_prob=1.0,
                 beam_width=100,
                 num_processes=4):
        """Create the wrapped decoder over ``alphabet.tokens``.

        The blank index is read from ``alphabet.blank_index`` and the
        decoder is configured for log-probability input.

        Raises:
            ImportError: if the ``ctcdecode`` package cannot be imported.
        """
        super().__init__(alphabet)

        try:
            from ctcdecode import CTCBeamDecoder
        except ImportError:
            raise ImportError("BeamCTCDecoder requires ctcdecode package.")

        # Positional order matters here; only ``log_probs_input`` is passed
        # by keyword.
        positional = (
            alphabet.tokens,
            lm_path,
            alpha,
            beta,
            cutoff_top_n,
            cutoff_prob,
            beam_width,
            num_processes,
            alphabet.blank_index,
        )
        self._decoder = CTCBeamDecoder(*positional, log_probs_input=True)

    def decode(self, log_probs, sizes=None):
        """
        Given a matrix of character log probabilities, return the decoder's
        best guess of the transcription.

        Arguments:
            log_probs (tensor): Tensor of log probabilities with shape (B, T, L),
                where `log_probs[b, t, l]` is the log probability of label `l` at
                time `t` in batch `b`
            sizes (optional): Size of each sequence in the batch
        Returns:
            decoded (list of string): the first beam of each batch item,
                converted with `tensor2str`
            scores (tensor): the first beam's score for each batch item, as
                returned by the underlying decoder
            offsets (tensor): the first beam's time-step output for each
                batch item
        """
        # The tensor is moved to the CPU before it is handed to the decoder.
        cpu_log_probs = log_probs.cpu()
        out, scores, offsets, seq_lens = self._decoder.decode(cpu_log_probs, sizes)

        # Index 0 on the second axis selects the first beam.
        best_paths = out[:, 0, :]
        best_lengths = seq_lens[:, 0]
        strings = self.tensor2str(best_paths, best_lengths)

        return strings, scores[:, 0], offsets[:, 0]

    def reset_params(self, alpha, beta):
        """Forward new ``alpha`` and ``beta`` values to the wrapped decoder."""
        self._decoder.reset_params(alpha, beta)

예제 #23

0

파일 보기

파일: util.py 프로젝트: SiriusKY/pytorch-ocr

def recognize(image_path, model, label_dict, device):
    """Recognise the text in the image at ``image_path`` and return it as a string.

    The image is converted to RGB, resized to a height of 64 with the width
    scaled in proportion, normalised, and passed through ``model``. The
    output is beam-search decoded and the first beam is mapped back to
    characters with the index-to-label table from
    ``get_label_dict(label_dict)``. ``UNK``/``SOS``/``EOS``/``BLANK`` labels
    are dropped and ``SPACE`` becomes a literal space.
    """
    tgt_height = 64

    # Close the file handle as soon as the pixel data has been converted.
    with Image.open(image_path) as source:
        img = source.convert("RGB")

    width, height = img.size
    reshape_width = tgt_height * (width / height)
    img = img.resize([int(reshape_width), int(tgt_height)])
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    img = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(img)

    _, ind2ch = get_label_dict(label_dict)

    # Special multi-character labels are collapsed to a single placeholder
    # character before the labels are joined into one string.
    replace_label = {
        'UNK': '_',
        'SOS': '_',
        'EOS': '_',
        'SPACE': ' ',
        'BLANK': '_'
    }
    labels = ''.join(replace_label.get(l, l) for l in ind2ch.values())
    decoder = CTCBeamDecoder(labels,
                             model_path=None,
                             alpha=0,
                             beta=0,
                             cutoff_top_n=40,
                             cutoff_prob=1.0,
                             beam_width=20,
                             num_processes=8,
                             blank_id=98,
                             log_probs_input=True)
    # The first two axes of the model output are swapped before decoding.
    output = output.permute(1, 0, 2)
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(output)
    # First beam of the first batch item, trimmed to its reported length.
    results = beam_results[0][0][:out_lens[0][0]].cpu().tolist()

    skipped = ('UNK', 'SOS', 'EOS', 'BLANK')
    pieces = []
    for index in results:
        ch = ind2ch[index]
        if ch in skipped:
            continue
        pieces.append(' ' if ch == 'SPACE' else ch)
    return ''.join(pieces)

예제 #24

0

파일 보기

 def __init__(self, PHONEME_MAP, blank_index=0, beam_width=100):
     """Build the index-to-label table and a log-prob beam decoder.

     If the entry at ``blank_index`` is not ``' '``, a blank is inserted
     at position 0 of ``PHONEME_MAP``. The insertion is done in place, so
     the caller's list is modified.
     """
     blank_present = PHONEME_MAP[blank_index] == ' '
     if not blank_present:
         PHONEME_MAP.insert(0, ' ')
     # Position -> label lookup for turning decoder output back into text.
     self.int_to_char = dict(enumerate(PHONEME_MAP))
     self._decoder = CTCBeamDecoder(
         PHONEME_MAP,
         blank_id=blank_index,
         beam_width=beam_width,
         log_probs_input=True,
     )

예제 #25

0

파일 보기

파일: ctc_decoder.py 프로젝트: zjc6666/wav2vec

 def __init__(self, args, tgt_dict):
     """Configure the decoder from ``args`` and the target dictionary.

     The blank index is that of ``"<ctc_blank>"`` when the dictionary has
     it, otherwise ``tgt_dict.bos()``. ``args.beam`` is used as the beam
     width.
     """
     self.tgt_dict = tgt_dict
     self.vocab_size = len(tgt_dict)
     self.nbest = args.nbest
     self.beam = args.beam
     if "<ctc_blank>" in tgt_dict.indices:
         self.blank = tgt_dict.index("<ctc_blank>")
     else:
         self.blank = tgt_dict.bos()
     self.decode_fn = CTCBeamDecoder(
         tgt_dict.symbols,
         beam_width=self.beam,
         blank_id=self.blank,
         num_processes=10,
     )

예제 #26

0

파일 보기

def fast_beam_search_decode(logprobs,
                            logprobs_lens,
                            vocab,
                            beam_size,
                            cutoff_top_n,
                            cutoff_prob,
                            ext_scoring_func,
                            alpha,
                            beta,
                            num_processes,
                            rescorer=None):
    """Beam-search decode ``logprobs`` and return every beam for every item.

    Returns a list with one entry per batch item; each entry is a list of
    ``(hypothesis, score)`` pairs, one per beam. Without a ``rescorer``
    the score is the negated decoder score. With a ``rescorer`` the score
    is replaced by a softmax, taken across each item's beams, of the mean
    ``'positional_scores'`` the rescorer reports for each hypothesis.
    """
    blank_index = vocab['<blank>']

    # The label string is the concatenated vocabulary with '<blank>' turned
    # into '_' and '<unk>' removed.
    joined_tokens = ''.join(vocab.indices2tokens())
    labels = joined_tokens.replace('<blank>', '_').replace('<unk>', '')

    decoder = CTCBeamDecoder(labels=labels,
                             blank_id=blank_index,
                             cutoff_top_n=cutoff_top_n,
                             cutoff_prob=cutoff_prob,
                             beam_width=beam_size,
                             model_path=ext_scoring_func,
                             alpha=alpha,
                             beta=beta,
                             num_processes=num_processes,
                             log_probs_input=True)
    # The first two axes of ``logprobs`` are swapped before decoding.
    swapped = torch.transpose(logprobs, 0, 1)
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(
        swapped, logprobs_lens)

    predictions = [
        [
            (
                ''.join(
                    vocab.lookup_tokens(
                        beam_results[idx, jdx, :out_lens[idx, jdx]].tolist())),
                -beam_scores[idx, jdx],
            )
            for jdx in range(beam_results.shape[1])
        ]
        for idx in range(beam_results.shape[0])
    ]

    if rescorer is None:
        return predictions

    # Flatten the hypotheses, score them in one call, then reshape the
    # scores back to the layout of ``beam_scores``.
    flat_hypos = [text for beam in predictions for text, _ in beam]
    lm_means = [
        result['positional_scores'].mean().item()
        for result in rescorer.score(flat_hypos)
    ]
    lm_scores = torch.tensor(lm_means).reshape(beam_scores.shape)
    lm_scores = torch.softmax(lm_scores, dim=1)

    rescored = []
    for idx in range(beam_results.shape[0]):
        rescored.append([
            (predictions[idx][jdx][0], lm_scores[idx, jdx])
            for jdx in range(beam_results.shape[1])
        ])
    return rescored

예제 #27

0

파일 보기

파일: test.py 프로젝트: abhishek-96/testHW3P2

def val():
    """Decode the test loader and print the running mean edit distance.

    Uses the module-level ``model``, ``test_loader`` and ``device``. For
    each batch the first beam of every utterance is trimmed to its decoded
    length, converted with ``label_to_short_phoneme``, and compared with
    the converted target sequence.
    """
    model.eval()
    # The decoder does not depend on the batch, so build it once.
    decoder = CTCBeamDecoder(PHONEME_LIST, beam_width=3)
    distances = []
    for batch_idx, (data, target, in_lens, target_lens) in enumerate(test_loader):
        data, in_lens = data.to(device), in_lens.to(device)
        out, out_lens = model(data, in_lens)
        decoded_out, _, _, decoded_lens = decoder.decode(
            out.transpose(0, 1).cpu(), out_lens.cpu())
        # ``decoded_lens`` holds one length per beam; use the first beam's
        # length, matching the ``[i, 0, ...]`` beam taken from ``decoded_out``.
        decoded_strings = [
            label_to_short_phoneme(decoded_out[i, 0, :decoded_lens[i, 0]])
            for i in range(decoded_out.shape[0])
        ]
        # The ground truth for this batch is ``target``; the previous name
        # ``label_pad`` was never defined in this function.
        decoded_labels = [
            label_to_short_phoneme(target[i, :target_lens[i]])
            for i in range(target.shape[0])
        ]
        batch_distances = [distance(o, l) for o, l in zip(decoded_strings, decoded_labels)]
        distances.extend(batch_distances)
        print('Distance = ', np.mean(distances))

예제 #28

0

파일 보기

파일: benchmark.py 프로젝트: tadas-subonis/fast-ctc-decode

 def cpp_beam_search(predictions, labels, beam_width=5, beam_cut_threshold=0.1):
     """
     Decode one utterance with the C++ beam search CTC decoder
     (https://github.com/parlance/ctcdecode) and return the first beam as a
     string of ``labels``.

     ``beam_cut_threshold`` is passed to the decoder as ``cutoff_prob``.
     """
     # CTCBeamDecoder expects a leading batch dimension.
     batched = torch.FloatTensor(np.expand_dims(predictions, 0))
     decoder = CTCBeamDecoder(
         labels, beam_width=beam_width, cutoff_prob=beam_cut_threshold
     )
     decoded = decoder.decode(batched)
     beam_result, out_seq_len = decoded[0], decoded[3]
     # First beam of the single batch item, trimmed to its reported length.
     best_length = out_seq_len[0][0]
     best_path = beam_result[0][0][0:best_length]
     return ''.join([labels[x] for x in best_path])

예제 #29

0

파일 보기

파일: test.py 프로젝트: pmuilu/ocr_crnn

def test(model, test_loader, ocr_dataset):
    """Decode ``test_loader`` with beam search and print two similarity summaries.

    For each batch the first beam of every item is decoded to text with
    ``ocr_dataset.get_decoded_label`` (an empty string when the decoded
    length is zero). The flat target tensor ``y`` is cut into per-item
    pieces using ``target_lengths``. Each prediction/target pair is scored
    with ``SequenceMatcher.quick_ratio`` and ``char_err_rate``, and the
    mean and standard deviation of both are printed at the end.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ratios = []
    lv_ratios = []

    # The blank symbol is the last class index.
    BLANK = ocr_dataset.get_num_classes()-1

    # The decoder does not depend on the batch, so build it once instead of
    # once per iteration.
    decoder = CTCBeamDecoder(ocr_dataset.char_vec,
                             blank_id=BLANK,
                             log_probs_input=True)

    with torch.no_grad():
        for ((x, input_lengths), (y, target_lengths)) in test_loader:
            print("Run eval")
            x = x.to(device)

            # The first two axes of the model output are swapped before
            # decoding.
            outputs = model.forward(x)
            outputs = outputs.permute(1, 0, 2)

            output, scores, ts, out_seq_len = decoder.decode(
                outputs.data, torch.IntTensor(input_lengths))

            results = []
            for b, batch in enumerate(output):
                size = out_seq_len[b][0]
                dec = batch[0]

                text = ''
                if size > 0:
                    text = ocr_dataset.get_decoded_label(dec[0:size])

                results.append(text)

            # ``y`` is consumed sequentially: item i owns the next
            # ``target_lengths[i]`` entries.
            ptr = 0
            for i, p in enumerate(target_lengths):
                yi = y[ptr:ptr+p]

                s1 = results[i]
                s2 = ocr_dataset.get_decoded_label(yi)

                ratios.append(SequenceMatcher(None, s1, s2).quick_ratio())
                lv_ratios.append(char_err_rate(s1, s2))

                ptr += p

    print("SequenceMatcher acc:", np.mean(ratios), np.std(ratios))
    print("Levenshtein acc:", np.mean(lv_ratios), np.std(lv_ratios))

예제 #30

0

파일 보기

파일: main.py 프로젝트: sherryzyy/GPU-Accelerated-Speech-Recognition

def run(config):
    """Benchmark model forward and CTC beam-search decode time on random input.

    Reads ``batch_size``, ``seg_len``, ``epoch``, ``input_size``, ``device``,
    ``vocab_size``, ``beam_width`` and ``num_threads`` from ``config``. For
    ``epoch`` iterations it generates a random batch, runs the model, decodes
    the output, and accumulates wall-clock time for each stage. The
    per-iteration averages are printed at the end.

    NOTE(review): ``n_context`` is not defined in this function; it is
    presumably a module-level name -- confirm.
    """
    batch_size = config["batch_size"]
    seq_len = config["seg_len"]
    n_iter = config["epoch"]
    input_size = config["input_size"]
    device = config["device"]
    vocab_size = config["vocab_size"]
    # num_processes = config["num_processes"]

    beam_width = config["beam_width"]
    # print("num_processes_cpu: ", os.cpu_count())
    num_threads = config["num_threads"]
    # The torch thread count is only overridden when running on the CPU.
    if device == "cpu":
        torch.set_num_threads(num_threads)
        print("num_threads: ", torch.get_num_threads())
    model = DeepSpeech(config)
    # Placeholder labels: vocab_size + 1 identical symbols, blank at index 0.
    decoder = CTCBeamDecoder(['$'] * (vocab_size + 1),
                             beam_width=beam_width,
                             blank_id=0,
                             num_processes=num_threads,
                             log_probs_input=True)

    # inp = torch.ones((batch_size, seq_len, input_size+2*input_size*n_context))

    model = model.to(device)

    # Accumulated wall-clock seconds across all iterations.
    forward_time = 0
    decode_time = 0
    overall_time = 0
    for i in range(n_iter):
        # Forward stage: input generation, device transfer and model call.
        start_time = time.perf_counter()
        inp = torch.rand(
            (batch_size, seq_len, input_size + 2 * input_size * n_context))
        inp = inp.to(device)
        out = model(inp)
        end_time1 = time.perf_counter()
        # Decode stage: axis swap, length tensor construction and decode call.
        start_time1 = time.perf_counter()
        out = out.transpose(0, 1)
        out_lens = torch.tensor([seq_len for _ in range(batch_size)])
        output, scores, timesteps, out_seq_len = decoder.decode(
            out,
            out_lens)  # [b, seq_len, vocab_size] -> [b, beam_width, seq_len]
        end_time2 = time.perf_counter()

        forward_time += end_time1 - start_time
        decode_time += end_time2 - start_time1
        overall_time += end_time2 - start_time

    # Report the mean time per iteration for each stage.
    print("Forward: %f s" % (forward_time / n_iter))
    print("CTC Decode %f s" % (decode_time / n_iter))
    print("Overall %f s" % (overall_time / n_iter))