def __init__(
    self,
    labels,
    lm_path=None,
    alpha=0,
    beta=0,
    cutoff_top_n=40,
    cutoff_prob=1.0,
    beam_width=100,
    num_processes=4,
    blank_index=0,
):
    """Create a beam-search CTC decoder backed by ``ctcdecode``.

    ``labels`` is handed to the parent initializer; it and every other
    argument are then forwarded positionally, in declaration order, to
    ``ctcdecode.CTCBeamDecoder``.

    Raises:
        ImportError: if the ``ctcdecode`` package cannot be imported.
    """
    super(BeamCTCDecoder, self).__init__(labels)
    try:
        from ctcdecode import CTCBeamDecoder
    except ImportError as err:
        # Name the module that is actually imported above and keep the
        # original failure attached as the cause.
        raise ImportError("BeamCTCDecoder requires ctcdecode package.") from err
    self._decoder = CTCBeamDecoder(
        labels,
        lm_path,
        alpha,
        beta,
        cutoff_top_n,
        cutoff_prob,
        beam_width,
        num_processes,
        blank_index,
    )
def validate(model, dev_loader):
    """Print the running and final average edit distance on ``dev_loader``.

    Each batch is unpacked with ``process_train_lst``, run through
    ``model`` and beam-decoded; the top beam of every utterance and its
    label are converted with ``convert_to_phoneme``, joined into strings
    and compared with ``Levenshtein.distance``.
    """
    beam_decoder = CTCBeamDecoder(['$'] * 47, beam_width=100, log_probs_input=True)
    n_utterances = 0
    total_distance = 0
    with torch.no_grad():
        model.eval()
        model.cuda()
        for batch_idx, batch in enumerate(dev_loader):
            X, X_lens, Y, Y_lens = process_train_lst(batch)
            out, out_lens = model(X, X_lens)
            beams, _, _, beam_lens = beam_decoder.decode(out.transpose(0, 1), out_lens)

            batch_size = beams.shape[0]
            # Index 0 on the second axis selects the first returned beam.
            hypotheses = [beams[b, 0, :beam_lens[b, 0]] for b in range(batch_size)]
            references = [Y[b, 0:Y_lens[b]] for b in range(batch_size)]

            reference_phonemes = convert_to_phoneme(references)
            hypothesis_phonemes = convert_to_phoneme(hypotheses)

            batch_distances = [
                Levenshtein.distance(
                    "".join(hypothesis_phonemes[k]),
                    "".join(reference_phonemes[k]),
                )
                for k in range(len(hypotheses))
            ]
            n_utterances += len(batch_distances)
            total_distance += sum(batch_distances)
            print(f"Batch: {batch_idx} | Avg Distance: {total_distance / n_utterances}")
        print("Dev Avg Distance: {:.4f}".format(total_distance / n_utterances))
def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40,
             cutoff_prob=1.0, beam_width=100, num_processes=16, blank_index=0):
    """Create a beam-search CTC decoder that always takes log-probabilities.

    ``labels`` is passed unchanged to the parent initializer and as a
    ``list`` to ``CTCBeamDecoder``; the remaining arguments are forwarded
    positionally in declaration order.
    """
    super(BeamCTCDecoder, self).__init__(labels)
    # The decoder is always built with log_probs_input switched on.
    log_probs_input = True
    label_list = list(labels)
    self._decoder = CTCBeamDecoder(
        label_list,
        lm_path,
        alpha,
        beta,
        cutoff_top_n,
        cutoff_prob,
        beam_width,
        num_processes,
        blank_index,
        log_probs_input=log_probs_input,
    )
def __init__(self, labels, lm_path=None, alpha=1.5, beta=0.8, cutoff_top_n=15,
             cutoff_prob=1.0, beam_width=256, num_processes=4, blank_id=31,
             log_probs_input=False):
    """Build the beam decoder and the label lookup table.

    All arguments are forwarded to ``CTCBeamDecoder`` (``lm_path`` as its
    ``model_path``); ``self.decode_dict`` is derived from ``labels`` via
    ``self._dict_from_labels``. Progress is reported with two prints.
    """
    print("Initializing Decoder")
    decoder_options = {
        "model_path": lm_path,
        "alpha": alpha,
        "beta": beta,
        "cutoff_top_n": cutoff_top_n,
        "cutoff_prob": cutoff_prob,
        "beam_width": beam_width,
        "num_processes": num_processes,
        "blank_id": blank_id,
        "log_probs_input": log_probs_input,
    }
    self.decoder = CTCBeamDecoder(labels, **decoder_options)
    self.decode_dict = self._dict_from_labels(labels)
    print("Decoder ready")
def __init__(self, model, loader, val_loader, test_loader, max_epochs=1, run_id='exp'):
    """Set up the training state: model, loaders, optimizer, loss, decoder.

    The model and the CTC loss are moved to CUDA when
    ``torch.cuda.is_available()`` is true. History/prediction containers
    start as empty lists.
    """
    use_cuda = torch.cuda.is_available()

    self.model = model.cuda() if use_cuda else model
    self.loader = loader
    self.val_loader = val_loader
    self.test_loader = test_loader

    # Every bookkeeping attribute starts as its own empty list.
    for attr_name in (
        "train_losses",
        "val_losses",
        "predictions",
        "predictions_test",
        "generated_logits",
        "generated",
        "generated_logits_test",
        "generated_test",
    ):
        setattr(self, attr_name, [])

    self.epochs = 0
    self.max_epochs = max_epochs
    self.run_id = run_id

    self.optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)

    loss_fn = CTCLoss()
    self.criterion = loss_fn.cuda() if use_cuda else loss_fn

    self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.optimizer, factor=0.1, patience=2
    )
    self.LD = Levenshtein(phoneme_list.PHONEME_MAP)
    self.best_rate = 1e10
    # A ' ' entry is prepended to the phoneme map and used as the blank (id 0).
    self.decoder = CTCBeamDecoder(
        labels=[' '] + phoneme_list.PHONEME_MAP,
        blank_id=0,
        beam_width=150,
    )
def __init__(self, charmap):
    """Store the label map (blank first) and build a width-100 beam decoder."""
    # Index 0 is the ' ' entry used as the CTC blank.
    blank_first_labels = [' '] + charmap
    self.label_map = blank_first_labels
    self.decoder = CTCBeamDecoder(labels=self.label_map, blank_id=0, beam_width=100)
def __init__(self):
    """Initialise the parent and build a beam decoder over ``PHONEME_MAP``.

    A ' ' entry is prepended to ``PHONEME_MAP`` and used as the blank
    (id 0).
    """
    super().__init__()
    self.labels = [' '] + PHONEME_MAP
    decoder_options = {"beam_width": 100, "blank_id": 0, "num_processes": 32}
    self.decoder = CTCBeamDecoder(labels=self.labels, **decoder_options)
def __init__(
        self,
        labels: list = LABELS,
        beam_width: int = 100,
        model_path: str = None,
        alpha: float = 0.0,
        beta: float = 0.0,
        cutoff_top_n: int = 40,
        cutoff_prob: float = 1.0,
        blank_id: int = LABELS.index('_'),
        log_probs_input: bool = False,
):
    """Record the decoding settings and build the ``CTCBeamDecoder``.

    Every argument is stored on the instance under the same name and
    forwarded to ``CTCBeamDecoder``. The decoder's ``num_processes`` is
    set to the CPU count reported by ``os.cpu_count()``, or 1 when that
    count is unavailable.
    """
    self.labels = labels
    self.beam_width = beam_width
    self.model_path = model_path
    self.alpha = alpha
    self.beta = beta
    self.cutoff_top_n = cutoff_top_n
    self.cutoff_prob = cutoff_prob
    self.blank_id = blank_id
    self.log_probs_input = log_probs_input

    # os.cpu_count() may return None; max(None, 1) would raise TypeError.
    worker_count = os.cpu_count() or 1

    self.decoder = CTCBeamDecoder(
        labels=labels,
        beam_width=beam_width,
        model_path=model_path,
        alpha=alpha,
        beta=beta,
        cutoff_top_n=cutoff_top_n,
        cutoff_prob=cutoff_prob,
        num_processes=worker_count,
        blank_id=blank_id,
        log_probs_input=log_probs_input,
    )
def __init__(self, blank_id: int, alphabet: List[str], count_prediction=10):
    """Build a beam decoder over ``alphabet``.

    ``count_prediction`` is used as the decoder's ``beam_width``.
    """
    decoder_options = {"beam_width": count_prediction, "blank_id": blank_id}
    self.decoder = CTCBeamDecoder(alphabet, **decoder_options)
def pred_model(model, test_loader):
    """Decode every batch of ``test_loader`` and return the predicted strings.

    Args:
        model: callable returning ``(out, out_lens)`` for
            ``(padded_input, input_lengths)``.
        test_loader: iterable yielding ``(padded_input, input_lengths)``.

    Returns:
        list[str]: one string per utterance, in loader order, built from
        the first returned beam of the CTC beam search.
    """
    # The label set and the decoder do not depend on the batch, so they
    # are built once rather than on every iteration.
    phonemes = [" "] + PHONEME_MAP
    decoder = CTCBeamDecoder(phonemes, beam_width=10, log_probs_input=True)

    pred_labels = []
    with torch.no_grad():
        model.eval()
        for padinp, xlens in test_loader:
            padinp = padinp.to(device)
            out, out_lens = model(padinp, xlens)
            out_lens = torch.LongTensor(out_lens)
            pred, _, _, pred_lens = decoder.decode(out.transpose(0, 1), out_lens)
            for i in range(len(pred)):
                # Beam index 0, truncated to that beam's decoded length.
                best_beam = pred[i, 0, :pred_lens[i, 0]]
                pred_labels.append(
                    "".join(phonemes[int(label_id)] for label_id in best_beam)
                )
    return pred_labels
def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=40,
             cutoff_prob=1.0, beam_width=100, num_processes=4, blank_index=0,
             wfst: bool = False):
    """Create a beam-search CTC decoder, optionally in WFST mode.

    ``labels`` is handed to the parent initializer; it and the other
    arguments are forwarded to ``CTCBeamDecoder`` (``wfst`` by keyword).
    When ``wfst`` is truthy, ``self.mapping`` is additionally populated
    with an integer-to-integer lookup table.

    Raises:
        ImportError: if the ``ctcdecode`` package cannot be imported.
    """
    super(BeamCTCDecoder, self).__init__(labels)
    try:
        from ctcdecode import CTCBeamDecoder
    except ImportError as err:
        # Name the module that is actually imported above and keep the
        # original failure attached as the cause.
        raise ImportError("BeamCTCDecoder requires ctcdecode package.") from err
    self._decoder = CTCBeamDecoder(
        labels,
        lm_path,
        alpha,
        beta,
        cutoff_top_n,
        cutoff_prob,
        beam_width,
        num_processes,
        blank_index,
        wfst=wfst,
    )
    self.wfst = wfst
    if wfst:
        # Keys 65..90 (the ASCII codes of 'A'..'Z') map to 2..27.
        self.mapping = {65 + i: 2 + i for i in range(26)}
        self.mapping[39] = 1  # '
        self.mapping[32] = 28  # space
        self.mapping[0] = 0
def __init__(self, labels: str, lm_path: str = None, alpha: int = 0, beta: int = 0,
             cutoff_top_n: int = 40, cutoff_prob: float = 1.0, beam_width: int = 100,
             num_processes: int = 4, blank_index: int = 0):
    """
    CTC beam-search decoder backed by ``ctcdecode``.

    Args:
        labels: labels; passed to the parent initializer and to the decoder
        lm_path: language model path
        alpha: forwarded to ``CTCBeamDecoder``
        beta: forwarded to ``CTCBeamDecoder``
        cutoff_top_n: forwarded to ``CTCBeamDecoder``
        cutoff_prob: forwarded to ``CTCBeamDecoder``
        beam_width: forwarded to ``CTCBeamDecoder``
        num_processes: forwarded to ``CTCBeamDecoder``
        blank_index: forwarded to ``CTCBeamDecoder``

    Raises:
        ImportError: if the ``ctcdecode`` package cannot be imported.
    """
    super(BeamCTCDecoder, self).__init__(labels)
    try:
        from ctcdecode import CTCBeamDecoder
    except ImportError as err:
        # Name the module that is actually imported above and keep the
        # original failure attached as the cause.
        raise ImportError("BeamCTCDecoder requires ctcdecode package.") from err
    # Arguments are positional, in the order declared in the signature.
    self._decoder = CTCBeamDecoder(
        labels,
        lm_path,
        alpha,
        beta,
        cutoff_top_n,
        cutoff_prob,
        beam_width,
        num_processes,
        blank_index,
    )
def __init__(self, alphabet, blank_symbol, model_path=None, alpha=1.0, beta=1.0,
             cutoff_prob=1.0, cutoff_top_n=None, beam_width=128, num_processes=4):
    """Build a beam decoder over ``alphabet`` with an optional language model.

    A falsy ``cutoff_top_n`` falls back to ``len(alphabet)``. The blank id
    is looked up with ``alphabet.get_index(blank_symbol)``. A warning is
    logged when no model path is given, or when one is given but ``alpha``
    equals zero.
    """
    super().__init__(alphabet, blank_symbol)

    if not cutoff_top_n:
        cutoff_top_n = len(alphabet)
    blank_id = alphabet.get_index(blank_symbol)

    if model_path is None:
        self._logger.warning('language model will not be used as '
                             '`model_path` is None')
    elif alpha == 0.0:
        self._logger.warning("language model will not be used as it's "
                             "weighting `alpha` is zero")

    decoder_options = dict(
        labels=alphabet,
        model_path=model_path,
        alpha=alpha,
        beta=beta,
        cutoff_top_n=cutoff_top_n,
        cutoff_prob=cutoff_prob,
        beam_width=beam_width,
        num_processes=num_processes,
        blank_id=blank_id,
    )
    self._decoder = CTCBeamDecoder(**decoder_options)
def __init__(self, labels, ctc_labels=None, lm_path=None, alpha=0, beta=0,
             cutoff_top_n=25, cutoff_prob=-2.1, beam_width=100, num_processes=4,
             blank_index=0, log_probs_input=True, phoneme_vocab=None, trie=None):
    """Create a beam-search CTC decoder over ``ctc_labels``.

    ``labels`` goes to the parent initializer only; the decoder itself is
    built from ``ctc_labels``.

    NOTE(review): ``blank_index``, ``log_probs_input``, ``phoneme_vocab``
    and ``trie`` are accepted but not forwarded; the decoder is always
    created with ``log_probs_input=False``. Confirm this is intended.

    Raises:
        ImportError: if the ``ctcdecode`` package cannot be imported.
    """
    super(BeamCTCDecoder, self).__init__(labels)
    self.ctc_labels = ctc_labels

    # Imported but not referenced below; kept because the import itself
    # still runs (and can fail) as in the original.
    from .CTCBeamSearchCustom import CTCBeamSearchCustom

    try:
        from ctcdecode import CTCBeamDecoder
    except ImportError:
        raise ImportError("BeamCTCDecoder requires ctcdecoder package")

    positional = (ctc_labels, lm_path, alpha, beta, cutoff_top_n, cutoff_prob)
    self._decoder = CTCBeamDecoder(
        *positional,
        beam_width=beam_width,
        num_processes=num_processes,
        log_probs_input=False,
    )
def __init__(
    self,
    labels,
    lm_path=None,
    alpha=0,
    beta=0,
    cutoff_top_n=40,
    cutoff_prob=1.0,
    beam_width=100,
    num_processes=4,
    blank_index=0,
):
    """Create a beam-search CTC decoder, tolerating a missing ``ctcdecode``.

    All arguments are forwarded positionally to ``CTCBeamDecoder``.

    NOTE(review): when ``ctcdecode`` cannot be imported, an empty
    placeholder class is substituted. That placeholder defines no
    ``__init__``, so the constructor call below then fails with
    ``TypeError`` rather than ``ImportError``.
    """
    super(BeamCTCDecoder, self).__init__(labels)

    try:
        from ctcdecode import CTCBeamDecoder
    except ImportError:

        class CTCBeamDecoder:
            ...

    decoder_args = (
        labels,
        lm_path,
        alpha,
        beta,
        cutoff_top_n,
        cutoff_prob,
        beam_width,
        num_processes,
        blank_index,
    )
    self._decoder = CTCBeamDecoder(*decoder_args)
def __init__(self, alphabet, lm_path=None, alpha=0, beta=0, cutoff_top_n=40,
             cutoff_prob=1.0, beam_width=100, num_processes=4):
    """Create a log-probability beam decoder from an alphabet object.

    The decoder labels come from ``alphabet.tokens`` and the blank id from
    ``alphabet.blank_index``; the other arguments are forwarded
    positionally in declaration order.

    Raises:
        ImportError: if the ``ctcdecode`` package cannot be imported.
    """
    super().__init__(alphabet)
    try:
        from ctcdecode import CTCBeamDecoder
    except ImportError:
        raise ImportError("BeamCTCDecoder requires ctcdecode package.")

    decoder_args = (
        alphabet.tokens,
        lm_path,
        alpha,
        beta,
        cutoff_top_n,
        cutoff_prob,
        beam_width,
        num_processes,
        alphabet.blank_index,
    )
    self._decoder = CTCBeamDecoder(*decoder_args, log_probs_input=True)
def decode_beamsearch(self, preds):
    """Beam-decode ``preds`` and return one string per batch entry.

    ``preds`` is soft-maxed over dimension 2 and decoded with a freshly
    built ``CTCBeamDecoder`` over ``self.character`` (beam width 4,
    blank id 0). Only the first returned beam of each entry is kept.
    """
    probs = preds.softmax(2)
    beam_decoder = CTCBeamDecoder(
        self.character,
        model_path=None,
        alpha=0,
        beta=0,
        cutoff_top_n=10,
        cutoff_prob=1.0,
        beam_width=4,
        num_processes=16,
        blank_id=0,
        log_probs_input=False,
    )
    beam_results, _scores, _timesteps, out_lens = beam_decoder.decode(probs)
    return [
        "".join(self.character[n] for n in beam_results[i][0][:out_lens[i][0]])
        for i in range(probs.shape[0])
    ]
def init_beam_decoder(self, alpha=0.8, beta=0.3, cutoff_top_n=40, cutoff_prob=1.0,
                      beam_width=32, num_processes=4, use_lm=True):
    """Create ``self.beam_decoder`` over ``self.vocab._id2token``.

    With ``use_lm`` truthy the fixed language-model file below is passed
    as ``model_path``; otherwise no model path is given. The blank id is
    fixed at 1 and log-probability input is enabled.
    """
    if use_lm:
        lm_path = "lm/zh_giga.no_cna_cmn.prune01244.klm"
    else:
        lm_path = None
    blank_index = 1

    decoder_options = dict(
        labels=self.vocab._id2token,
        model_path=lm_path,
        alpha=alpha,
        beta=beta,
        cutoff_top_n=cutoff_top_n,
        cutoff_prob=cutoff_prob,
        beam_width=beam_width,
        num_processes=num_processes,
        blank_id=blank_index,
        log_probs_input=True,
    )
    self.beam_decoder = CTCBeamDecoder(**decoder_options)
def __init__(self, params):
    """Create the DeepSpeech2 model, the CTC loss and the beam decoder.

    ``params`` is accepted but not read here. The vocabulary is printed
    once during construction.
    """
    super().__init__()
    self.model = DeepSpeech2()
    self.ctc_loss = nn.CTCLoss(reduction='none')

    # One label per character of this string.
    vocabulary = '_abcdefghijklmnñopqrstuvwxyz '
    self.vocab_str = list(vocabulary)
    print(self.vocab_str)

    self.ctc_decoder = CTCBeamDecoder(self.vocab_str, log_probs_input=True)
def __init__(self, device='cpu'):
    """Remember the device, reset the prediction/ground-truth buffers and
    build a width-1 beam decoder over ``CHAR_LIST`` (blank id 0)."""
    self.device = device
    self.preds, self.gts = [], []
    decoder_options = {
        "beam_width": 1,
        "num_processes": 16,
        "blank_id": 0,
        "log_probs_input": False,
    }
    self.decoder = CTCBeamDecoder(CHAR_LIST, **decoder_options)
def __init__(self):
    """Build the label tables plus a beam decoder and a greedy decoder.

    A ' ' entry is appended to both ``PHONEME_MAP`` and ``PHONEME_LIST``;
    both decoders use ``phonemes_len`` as the blank index.
    """
    blank_entry = [' ']
    self.label_map = PHONEME_MAP + blank_entry
    self.phoneme_list = PHONEME_LIST + blank_entry

    self.decoder = CTCBeamDecoder(
        labels=self.label_map,
        blank_id=phonemes_len,
        log_probs_input=True,
        beam_width=200,
    )
    self.greedy_decoder = GreedyDecoder(
        labels=self.label_map,
        blank_index=phonemes_len,
    )
class BeamCTCDecoder(Decoder):
    """Beam-search CTC decoder that delegates to ``ctcdecode.CTCBeamDecoder``."""

    def __init__(self, alphabet, lm_path=None, alpha=0, beta=0, cutoff_top_n=40,
                 cutoff_prob=1.0, beam_width=100, num_processes=4):
        """Build the underlying decoder from an alphabet object.

        Labels come from ``alphabet.tokens`` and the blank id from
        ``alphabet.blank_index``; the decoder is created with
        ``log_probs_input=True``.

        Raises:
            ImportError: if the ``ctcdecode`` package cannot be imported.
        """
        super().__init__(alphabet)
        try:
            from ctcdecode import CTCBeamDecoder
        except ImportError:
            raise ImportError("BeamCTCDecoder requires ctcdecode package.")

        decoder_args = (
            alphabet.tokens,
            lm_path,
            alpha,
            beta,
            cutoff_top_n,
            cutoff_prob,
            beam_width,
            num_processes,
            alphabet.blank_index,
        )
        self._decoder = CTCBeamDecoder(*decoder_args, log_probs_input=True)

    def decode(self, log_probs, sizes=None):
        """
        Return the decoder's best transcription guess for each batch entry.

        Arguments:
            log_probs (tensor): Tensor of log probabilities with shape (B, T, L),
                where `log_probs[b, t, l]` is the log probability of character
                `l` at time `t` in batch `b`
            sizes (optional): Size of each sequence in the batch

        Returns:
            decoded (list of string): the model's best guess for each transcription
            scores (tensor): tensor of size B with the negative log probability
            offsets (tensor): time-step per predicted character
        """
        cpu_log_probs = log_probs.cpu()
        beams, beam_scores, beam_offsets, beam_lens = self._decoder.decode(
            cpu_log_probs, sizes
        )
        # Index 0 on the beam axis keeps only the first returned hypothesis.
        strings = self.tensor2str(beams[:, 0, :], beam_lens[:, 0])
        return strings, beam_scores[:, 0], beam_offsets[:, 0]

    def reset_params(self, alpha, beta):
        """Forward new ``alpha`` / ``beta`` values to the wrapped decoder."""
        self._decoder.reset_params(alpha, beta)
def recognize(image_path, model, label_dict, device):
    """Run ``model`` on one image file and return the beam-decoded text.

    The image is converted to RGB, resized to height 64 with its aspect
    ratio kept, normalised, and fed to ``model`` as a batch of one. The
    output is beam-decoded (width 20, blank id 98) and the first beam is
    turned into text with the index-to-character table from
    ``get_label_dict(label_dict)``, dropping the special tokens.
    """
    target_height = 64
    image = Image.open(image_path).convert("RGB")
    width, height = image.size
    target_width = target_height * (width / height)
    image = image.resize([int(target_width), int(target_height)])

    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ])
    batch = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(batch)

    _, ind2ch = get_label_dict(label_dict)

    # The decoder receives one character per label, so the special tokens
    # are swapped for single-character stand-ins.
    replace_label = {
        'UNK': '_',
        'SOS': '_',
        'EOS': '_',
        'SPACE': ' ',
        'BLANK': '_',
    }
    labels = ''.join(replace_label.get(token, token) for token in ind2ch.values())

    decoder = CTCBeamDecoder(labels,
                             model_path=None,
                             alpha=0,
                             beta=0,
                             cutoff_top_n=40,
                             cutoff_prob=1.0,
                             beam_width=20,
                             num_processes=8,
                             blank_id=98,
                             log_probs_input=True)
    output = output.permute(1, 0, 2)
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(output)
    best_ids = beam_results[0][0][:out_lens[0][0]].cpu().tolist()

    pieces = []
    for label_id in best_ids:
        token = ind2ch[label_id]
        if token in ['UNK', 'SOS', 'EOS', 'BLANK']:
            continue
        pieces.append(' ' if token == 'SPACE' else token)
    return ''.join(pieces)
def __init__(self, PHONEME_MAP, blank_index=0, beam_width=100):
    """Build the index-to-label table and the beam decoder.

    Args:
        PHONEME_MAP: sequence of labels. If the entry at ``blank_index``
            is not ``' '``, a ``' '`` blank is inserted at position 0.
            The caller's sequence is left untouched.
        blank_index: id of the blank label, forwarded as ``blank_id``.
        beam_width: forwarded to ``CTCBeamDecoder``.
    """
    # Work on a copy: inserting into the argument would silently change
    # the caller's (possibly module-level) list.
    labels = list(PHONEME_MAP)
    if labels[blank_index] != ' ':
        labels.insert(0, ' ')

    self.int_to_char = dict(enumerate(labels))
    self._decoder = CTCBeamDecoder(
        labels,
        blank_id=blank_index,
        beam_width=beam_width,
        log_probs_input=True,
    )
def __init__(self, args, tgt_dict):
    """Configure the beam decoder from ``args`` and the target dictionary.

    The blank id is the index of ``"<ctc_blank>"`` when that symbol is in
    ``tgt_dict.indices``; otherwise ``tgt_dict.bos()`` is used.
    """
    self.tgt_dict = tgt_dict
    self.vocab_size = len(tgt_dict)
    self.nbest = args.nbest
    self.beam = args.beam

    if "<ctc_blank>" in tgt_dict.indices:
        self.blank = tgt_dict.index("<ctc_blank>")
    else:
        self.blank = tgt_dict.bos()

    self.decode_fn = CTCBeamDecoder(
        tgt_dict.symbols,
        beam_width=self.beam,
        blank_id=self.blank,
        num_processes=10,
    )
def fast_beam_search_decode(logprobs, logprobs_lens, vocab, beam_size, cutoff_top_n,
                            cutoff_prob, ext_scoring_func, alpha, beta, num_processes,
                            rescorer=None):
    """Beam-decode ``logprobs`` and return scored hypotheses per batch entry.

    Returns a list (one item per batch entry) of lists (one item per beam)
    of ``(hypothesis, score)`` pairs. Without a ``rescorer`` the score is
    the negated decoder score. With one, every hypothesis is scored by
    ``rescorer.score`` and the score becomes the softmax, across the beams
    of that entry, of the mean ``'positional_scores'``.

    NOTE(review): ``'<unk>'`` is removed from the label string rather than
    replaced, which shortens it — confirm the label count still matches
    the model's output size.
    """
    blank_index = vocab['<blank>']
    joined_tokens = ''.join(vocab.indices2tokens())
    labels = joined_tokens.replace('<blank>', '_').replace('<unk>', '')

    decoder = CTCBeamDecoder(labels=labels,
                             blank_id=blank_index,
                             cutoff_top_n=cutoff_top_n,
                             cutoff_prob=cutoff_prob,
                             beam_width=beam_size,
                             model_path=ext_scoring_func,
                             alpha=alpha,
                             beta=beta,
                             num_processes=num_processes,
                             log_probs_input=True)
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(
        torch.transpose(logprobs, 0, 1), logprobs_lens)

    n_entries, n_beams = beam_results.shape[0], beam_results.shape[1]

    predictions = [
        [
            (
                ''.join(vocab.lookup_tokens(
                    beam_results[b, k, :out_lens[b, k]].tolist())),
                -beam_scores[b, k],
            )
            for k in range(n_beams)
        ]
        for b in range(n_entries)
    ]

    if rescorer is None:
        return predictions

    flat_hypos = [text for beam in predictions for text, _ in beam]
    lm_scores = torch.tensor([
        result['positional_scores'].mean().item()
        for result in rescorer.score(flat_hypos)
    ]).reshape(beam_scores.shape)
    lm_scores = torch.softmax(lm_scores, dim=1)

    return [
        [(predictions[b][k][0], lm_scores[b, k]) for k in range(n_beams)]
        for b in range(n_entries)
    ]
def val():
    """Print the mean edit distance between decoded output and labels.

    Relies on the module-level ``model``, ``test_loader``, ``device``,
    ``PHONEME_LIST``, ``label_to_short_phoneme`` and ``distance``. For
    every batch the first returned beam is compared with the batch's own
    target labels.
    """
    model.eval()
    # The decoder does not depend on the batch, so it is built once.
    decoder = CTCBeamDecoder(PHONEME_LIST, beam_width=3)
    distances = []
    for data, target, in_lens, target_lens in test_loader:
        data, in_lens = data.to(device), in_lens.to(device)
        out, out_lens = model(data, in_lens)
        decoded_out, _, _, decoded_lens = decoder.decode(
            out.transpose(0, 1).cpu(), out_lens.cpu())

        # decoded_lens holds one length per beam; the slice bound must be
        # the single length of beam 0, not the whole per-beam row.
        decoded_strings = [
            label_to_short_phoneme(decoded_out[i, 0, :decoded_lens[i, 0]])
            for i in range(decoded_out.shape[0])
        ]
        # Use this batch's targets (the loop variable), truncated to their
        # true lengths; no `label_pad` is defined in this function.
        decoded_labels = [
            label_to_short_phoneme(target[i, :target_lens[i]])
            for i in range(target.shape[0])
        ]
        distances.extend(
            distance(o, l) for o, l in zip(decoded_strings, decoded_labels)
        )
    print('Distance = ', np.mean(distances))
def cpp_beam_search(predictions, labels, beam_width=5, beam_cut_threshold=0.1):
    """
    Decode a single prediction matrix with the C++ CTC beam search.

    https://github.com/parlance/ctcdecode

    ``beam_cut_threshold`` is passed to the decoder as ``cutoff_prob``.
    Returns the labels of the first returned beam joined into one string.
    """
    # CTCBeamDecoder expects a leading batch dimension.
    batch = torch.FloatTensor(np.expand_dims(predictions, 0))

    decoder = CTCBeamDecoder(
        labels,
        beam_width=beam_width,
        cutoff_prob=beam_cut_threshold,
    )
    beams, _, _, beam_lens = decoder.decode(batch)

    best_beam = beams[0][0][0:beam_lens[0][0]]
    return ''.join(labels[x] for x in best_beam)
def test(model, test_loader, ocr_dataset):
    """Evaluate ``model`` on ``test_loader`` and print two similarity stats.

    For each batch the model output is beam-decoded (blank id is the last
    class index) and the first beam is turned into text with
    ``ocr_dataset.get_decoded_label``. Each prediction is compared with
    its label using ``SequenceMatcher.quick_ratio`` and ``char_err_rate``;
    the mean and standard deviation of both are printed at the end.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ratios = []
    lv_ratios = []
    BLANK = ocr_dataset.get_num_classes() - 1
    # The decoder depends only on the dataset, so it is built once rather
    # than for every batch.
    decoder = CTCBeamDecoder(ocr_dataset.char_vec, blank_id=BLANK, log_probs_input=True)

    with torch.no_grad():
        for (x, input_lengths), (y, target_lengths) in test_loader:
            print("Run eval")
            x = x.to(device)
            outputs = model.forward(x)
            outputs = outputs.permute(1, 0, 2)
            output, _scores, _timesteps, out_seq_len = decoder.decode(
                outputs.data, torch.IntTensor(input_lengths))

            results = []
            for b, beams in enumerate(output):
                size = out_seq_len[b][0]
                text = ''
                if size > 0:
                    text = ocr_dataset.get_decoded_label(beams[0][0:size])
                results.append(text)

            # Labels of the batch are stored back to back in `y`;
            # `ptr` walks through them using the per-sample lengths.
            ptr = 0
            for i, p in enumerate(target_lengths):
                s1 = results[i]
                s2 = ocr_dataset.get_decoded_label(y[ptr:ptr + p])
                ratios.append(SequenceMatcher(None, s1, s2).quick_ratio())
                lv_ratios.append(char_err_rate(s1, s2))
                ptr += p

    print("SequenceMatcher acc:", np.mean(ratios), np.std(ratios))
    print("Levenshtein acc:", np.mean(lv_ratios), np.std(lv_ratios))
def run(config):
    """Time the model forward pass and CTC beam decoding on random input.

    Settings are read from ``config``. For ``config["epoch"]`` iterations
    a random batch is generated, run through a ``DeepSpeech`` model and
    beam-decoded; the average forward, decode and overall wall-clock times
    are printed at the end.

    NOTE(review): ``n_context`` is not defined in this function and must
    exist at module level.
    """
    batch_size = config["batch_size"]
    seq_len = config["seg_len"]
    n_iter = config["epoch"]
    input_size = config["input_size"]
    device = config["device"]
    vocab_size = config["vocab_size"]
    beam_width = config["beam_width"]
    num_threads = config["num_threads"]

    if device == "cpu":
        torch.set_num_threads(num_threads)
    print("num_threads: ", torch.get_num_threads())

    model = DeepSpeech(config)
    # Placeholder labels: vocab_size + 1 identical entries, blank id 0.
    decoder = CTCBeamDecoder(['$'] * (vocab_size + 1),
                             beam_width=beam_width,
                             blank_id=0,
                             num_processes=num_threads,
                             log_probs_input=True)
    model = model.to(device)

    forward_time = 0
    decode_time = 0
    overall_time = 0
    for _ in range(n_iter):
        # --- forward pass (includes input generation and device transfer)
        t_begin = time.perf_counter()
        inp = torch.rand(
            (batch_size, seq_len, input_size + 2 * input_size * n_context))
        inp = inp.to(device)
        out = model(inp)
        t_forward_done = time.perf_counter()

        # --- beam-search decoding
        t_decode_begin = time.perf_counter()
        out = out.transpose(0, 1)
        out_lens = torch.tensor([seq_len] * batch_size)
        output, scores, timesteps, out_seq_len = decoder.decode(out, out_lens)
        t_decode_done = time.perf_counter()

        forward_time += t_forward_done - t_begin
        decode_time += t_decode_done - t_decode_begin
        overall_time += t_decode_done - t_begin

    print("Forward: %f s" % (forward_time / n_iter))
    print("CTC Decode %f s" % (decode_time / n_iter))
    print("Overall %f s" % (overall_time / n_iter))