def __init__(self, opt):
    # NOTE: label_file and self.alphabet are assumed to be set up earlier in the
    # original source; the loop skips the out-of-range label id 4800.
    for l in open(label_file, 'r').readlines():
        l = l.strip().split(' ')
        if l[0] == '4800':
            continue
        self.alphabet[int(l[0])] = l[1]
    opt.imgH = 32
    opt.imgW = 800
    opt.Transformation = 'None'
    opt.FeatureExtraction = 'ResNet'
    opt.input_channel = 1
    opt.num_class = 4787
    opt.output_channel = 512
    opt.hidden_size = 512
    opt.dropout = 0.5
    opt.rnnlayers = 1
    opt.rnndropout = 0
    opt.batch_max_length = 40
    self.opt = opt
    if opt.lm is not None:
        self.lm_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), opt.lm)
        self.bm_decoder = ctcdecode.CTCBeamDecoder(self.alphabet,
                                                   beam_width=opt.beam_width,
                                                   num_processes=16,
                                                   blank_id=0,
                                                   model_path=self.lm_path,
                                                   alpha=opt.alpha,
                                                   beta=opt.beta)
    else:
        self.bm_decoder = ctcdecode.CTCBeamDecoder(self.alphabet,
                                                   beam_width=opt.beam_width,
                                                   num_processes=16,
                                                   blank_id=0,
                                                   alpha=opt.alpha,
                                                   beta=opt.beta)
    self.net = Model(opt)

    # weight initialization
    for name, param in self.net.named_parameters():
        if 'localization_fc2' in name:
            print(f'Skip {name} as it is already initialized')
            continue
        try:
            if 'bias' in name:
                init.constant_(param, 0.0)
            elif 'weight' in name:
                init.kaiming_normal_(param)
        except Exception:  # for batchnorm.
            if 'weight' in name:
                param.data.fill_(1)
            continue

    # data parallel for multi-GPU
    self.net = torch.nn.DataParallel(self.net).cuda()
    self.net.load_state_dict(torch.load(opt.m))
    self.net.eval()
    self.trans = ResizeAug(800, 32, rand_scale=False)
    self.toT = transforms.ToTensor()
def decode(log_prob, input_len, catted_target=None, target_len=None):
    decoder = ctcdecode.CTCBeamDecoder(PHONEME_MAP, beam_width=100, blank_id=0,
                                       log_probs_input=True, num_processes=16)
    if catted_target is not None:
        # calculate Levenshtein distance against the concatenated targets
        output, scores, timesteps, out_seq_len = decoder.decode(log_prob, input_len)
        y_start = 0
        running_dist = []
        for i in range(output.size(0)):
            pred_str = "".join(PHONEME_MAP[f] for f in output[i, 0, :out_seq_len[i, 0]])
            label_str = "".join(PHONEME_MAP[f + 1]
                                for f in catted_target[y_start:y_start + target_len[i]])
            running_dist.append(L.distance(pred_str, label_str))
            y_start += target_len[i]
            if i % 50 == 0:
                print("%s -> %s" % (label_str, pred_str))
                # note: as written, this stops after printing the first example
                break
        return running_dist
    else:
        # only calculate the decoded result
        output, scores, timesteps, out_seq_len = decoder.decode(log_prob, input_len)
        pred_str = []
        for i in range(output.size(0)):
            pred_str.append("".join(PHONEME_MAP[f] for f in output[i, 0, :out_seq_len[i, 0]]))
        return pred_str
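# Hedged usage sketch for decode() above (not from the original source): calls it
# with random log-probabilities. The PHONEME_MAP contents, shapes, and inputs are
# illustrative assumptions; only blank_id=0 / log_probs_input=True follow the
# decoder construction inside decode().
import torch

PHONEME_MAP = ["_", "a", "b", "c"]  # index 0 is the CTC blank, as decode() assumes

batch_size, time_steps = 2, 50
log_prob = torch.randn(batch_size, time_steps, len(PHONEME_MAP)).log_softmax(dim=2)
input_len = torch.full((batch_size,), time_steps, dtype=torch.int32)

transcripts = decode(log_prob, input_len)  # no targets -> list of decoded strings
print(transcripts)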
def __init__(self, opts, vocab_size, blank_id):
    self.opts = opts
    self.vocab_size = vocab_size
    self.blank_id = blank_id
    self.network = SLRNetwork(self.opts, vocab_size, num_blocks=5, dilations=[1, 2, 4])
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.criterion = nn.CTCLoss(blank=self.blank_id, reduction='none')
    params_all = [{'params': self.network.parameters()}]
    self.optimizer = create_optimizer('adam', params_all,
                                      lr=self.opts.learning_rate,
                                      momentum=self.opts.momentum,
                                      weight_decay=self.opts.weight_decay)
    # The decoder only needs a vocabulary of unique single characters; beam
    # results are consumed as integer class indices, so arbitrary code points
    # starting at chr(20000) serve as a dummy alphabet.
    self.ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + self.vocab_size)]
    self.ctc_decoder = ctcdecode.CTCBeamDecoder(self.ctc_decoder_vocab,
                                                beam_width=self.opts.beam_width,
                                                blank_id=self.blank_id,
                                                num_processes=10)
    self.decoded_dict = {}
def __init__(self, beam_size=100, blank_id=labels.index('_'), kenlm_path=None):
    print("loading beam search with lm...")
    self.decoder = ctcdecode.CTCBeamDecoder(labels,
                                            alpha=0.522729216841,
                                            beta=0.96506699808,
                                            beam_width=beam_size,
                                            blank_id=labels.index('_'),
                                            model_path=kenlm_path)
    print("finished loading beam search")
def test_beam_search_decoder_2(self):
    probs_seq = torch.from_numpy(np.log(np.array([self.probs_seq2], dtype=np.float32)))
    # The vocabulary list must be passed to the decoder, and log-probabilities
    # require log_probs_input=True; both were missing in the original snippet.
    decoder = ctcdecode.CTCBeamDecoder(self.vocab_list,
                                       beam_width=self.beam_size,
                                       blank_id=self.vocab_list.index('_'),
                                       log_probs_input=True)
    results = decoder.decode(probs_seq)
    output_str = self.convert_to_string(results[0][0][0])
    self.assertEqual(output_str, self.beam_search_result[1])
def create_test_decoder_with_language_model(handwriting_recognition_root_dir: str,
                                            use_non_zero_language_model_weight: bool):
    vocab_list = TestCTCDecodeWithLanguageModel.create_test_vocab_list()
    language_model_binary_file = create_test_language_model(handwriting_recognition_root_dir)
    # alpha: language model weight
    # beta: word insertion weight
    # See: https://github.com/PaddlePaddle/models/issues/218
    if use_non_zero_language_model_weight:
        language_model_weight = TestCTCDecodeWithLanguageModel.NONZERO_LANGUAGE_MODEL_WEIGHT
        language_model_path = language_model_binary_file
    else:
        language_model_weight = 0
        language_model_path = None

    decoder = ctcdecode.CTCBeamDecoder(
        vocab_list,
        model_path=language_model_path,
        beam_width=TestCTCDecodeWithLanguageModel.BEAM_SIZE,
        alpha=language_model_weight,
        beta=TestCTCDecodeWithLanguageModel.WORD_INSERTION_WEIGHT,
        blank_id=vocab_list.index(TestCTCDecodeWithLanguageModel.BLANK_SYMBOL),
        num_processes=16)
    return decoder, vocab_list
def test_beam_search_decoder_2(self):
    probs_seq = torch.FloatTensor([self.probs_seq2])
    decoder = ctcdecode.CTCBeamDecoder(self.vocab_list,
                                       beam_width=self.beam_size,
                                       blank_id=self.vocab_list.index('_'))
    beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(probs_seq)
    output_str = self.convert_to_string(beam_result[0][0], self.vocab_list, out_seq_len[0][0])
    self.assertEqual(output_str, self.beam_search_result[1])
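# The tests above rely on a convert_to_string() helper that is not shown in this
# section. A minimal sketch of what it plausibly does, given how it is called
# (beam tensor, vocabulary list, decoded length); an assumption, not necessarily
# the original implementation:
def convert_to_string(tokens, vocab, seq_len):
    # Map the first seq_len label indices back to vocabulary symbols.
    return "".join(vocab[int(t)] for t in tokens[:seq_len])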
def test_ctc_output_probability(self):
    seq_len_0 = 2
    classes = 3
    input_prob_matrix_0 = np.asarray([[0.4, 0.00000001, 0.6],
                                      [0.4, 0.00000001, 0.6]], dtype=np.float32)
    input_log_prob_matrix_0 = np.log(input_prob_matrix_0)
    inputs = np.array([input_log_prob_matrix_0[t, :][np.newaxis, :] for t in range(seq_len_0)])
    seq_lens = np.array([seq_len_0], dtype=np.int32)

    th_input = torch.from_numpy(inputs)
    th_seq_len = torch.IntTensor(seq_lens)

    labels = "AB_"
    scorer = ctcdecode.Scorer()
    decoder = ctcdecode.CTCBeamDecoder(scorer, labels, blank_index=2,
                                       space_index=-1, top_paths=1, beam_width=3)
    decode_result, scores, decode_len, alignments, char_probs = decoder.decode(th_input, th_seq_len)

    self.assertEqual(decode_len[0][0], 1)
    self.assertEqual(decode_result.numpy()[0, 0, :decode_len[0][0]].tolist(), [0])
    self.assertEqual(alignments.numpy()[0, 0, :decode_len[0][0]].tolist(), [1])
    np.testing.assert_almost_equal(scores.numpy(), np.log(np.array([[0.64]])), 5)
def __init__(self,
             labels,
             lm_path,
             beam_width,
             beam_alpha=0,
             beam_beta=0,
             cutoff_top_n=40,
             cutoff_prob=1.0,
             num_workers=1,
             topk=1):
    import ctcdecode
    self.topk = topk
    # positional argument order: labels, model_path, alpha, beta, cutoff_top_n,
    # cutoff_prob, beam_width, num_processes, blank_id
    self.beam_search_decoder = ctcdecode.CTCBeamDecoder(
        list(str(labels).lower()),
        lm_path,
        beam_alpha,
        beam_beta,
        cutoff_top_n if cutoff_top_n is not None else len(labels),
        cutoff_prob,
        beam_width,
        num_workers,
        labels.blank_idx,
        log_probs_input=True)
def __init__(self, beam_size=100, blank_id=labels.index('_'), kenlm_path=None):
    print("loading beam search with lm...")
    print("kenlm path: " + kenlm_path)
    self.decoder = ctcdecode.CTCBeamDecoder(labels,
                                            alpha=1.51289039105002,
                                            beta=0.86506699808,
                                            beam_width=beam_size,
                                            blank_id=labels.index('_'),
                                            model_path=kenlm_path)
    print("finished loading beam search")
def test_beam_search_decoder_3(self):
    lm_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test.arpa')
    probs_seq = torch.FloatTensor([self.probs_seq2])
    decoder = ctcdecode.CTCBeamDecoder(self.vocab_list,
                                       beam_width=self.beam_size,
                                       blank_id=self.vocab_list.index('_'),
                                       model_path=lm_path)
    beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(probs_seq)
    output_str = self.convert_to_string(beam_result[0][0], self.vocab_list, out_seq_len[0][0])
    self.assertEqual(output_str, self.beam_search_result[2])
def test_beam_search_decoder_batch_log(self):
    probs_seq = torch.FloatTensor([self.probs_seq1, self.probs_seq2]).log()
    decoder = ctcdecode.CTCBeamDecoder(self.vocab_list,
                                       beam_width=self.beam_size,
                                       blank_id=self.vocab_list.index('_'),
                                       log_probs_input=True,
                                       num_processes=24)
    beam_results, beam_scores, timesteps, out_seq_len = decoder.decode(probs_seq)
    output_str1 = self.convert_to_string(beam_results[0][0], self.vocab_list, out_seq_len[0][0])
    output_str2 = self.convert_to_string(beam_results[1][0], self.vocab_list, out_seq_len[1][0])
    self.assertEqual(output_str1, self.beam_search_result[0])
    self.assertEqual(output_str2, self.beam_search_result[1])
def create_decoder(self, alpha, beta):
    self.decoder = ctcdecode.CTCBeamDecoder(
        self.vocab_list,
        model_path=self.hparams.lm_path,
        alpha=alpha,
        beta=beta,
        cutoff_top_n=50,
        cutoff_prob=0.99,
        beam_width=100,
        blank_id=self.vocab_list.index("_"),
    )
def __init__(self, vocabulary_size, batch_ordering):
    super().__init__()
    # WARNING: don't use chr(0)
    vocabulary_size += 1  # TODO unify blank label stuff
    self.vocabulary = [
        chr(c) for c in list(range(65, 65 + 58)) +
        list(range(65 + 58 + 69, 65 + 58 + 69 + 500))
    ][:vocabulary_size]
    self.decoder = ctcdecode.CTCBeamDecoder(self.vocabulary,
                                            log_probs_input=True,
                                            beam_width=1)
    self.batch_ordering = batch_ordering
def __init__(self, opts, model, criterion, vocabulary, vocab_size, blank_id):
    self.opts = opts
    self.model = model
    self.criterion = criterion
    self.vocab_size = vocab_size
    self.blank_id = blank_id
    self.pad = vocabulary.pad()
    self.unk = vocabulary.unk()
    self.eos = vocabulary.eos()
    self.bos = vocabulary.bos()
    self.cuda = torch.cuda.is_available()
    if self.cuda:
        self.criterion = self.criterion.cuda()
        self.model = self.model.cuda()
    self._num_updates = 0

    pretrain_params, attn_params = self.cnn_freeze(opts)
    # params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
    if not opts.freeze_cnn:
        self.optimizer = torch.optim.Adam(
            [{"params": pretrain_params, "lr": self.opts.learning_rate},
             {"params": attn_params, "lr": self.opts.learning_rate}],
            weight_decay=self.opts.weight_decay)
    else:
        self.optimizer = torch.optim.Adam(
            [{"params": pretrain_params, "lr": 0.0},
             {"params": attn_params, "lr": self.opts.learning_rate}],
            weight_decay=self.opts.weight_decay)
    # self._build_optimizer(params, self.opts.optimizer, lr=self.opts.learning_rate,
    #                       momentum=self.opts.momentum, weight_decay=self.opts.weight_decay)

    self.decoder_vocab = [chr(x) for x in range(20000, 20000 + self.vocab_size)]
    self.decoder = ctcdecode.CTCBeamDecoder(self.decoder_vocab,
                                            beam_width=self.opts.beam_width,
                                            blank_id=self.blank_id,
                                            num_processes=10)
def test_subword_beam_search_decoder_batch(self):
    probs_seq = torch.FloatTensor([self.bigram_probs_seq1])  # , self.bigram_probs_seq2])
    decoder = ctcdecode.CTCBeamDecoder(self.subword_vocab_list,
                                       beam_width=5,
                                       subword=True,
                                       num_processes=24)
    beam_results, beam_scores, timesteps, out_seq_len = decoder.decode(probs_seq)
    output_str1 = self.convert_to_string(beam_results[0][0], self.subword_vocab_list, out_seq_len[0][0])
    # output_str2 = self.convert_to_string(beam_results[1][0], self.vocab_list, out_seq_len[1][0])
    print(beam_results)
    print(beam_scores)
    print(timesteps)
    print(out_seq_len)
    print(output_str1)
    self.assertEqual(output_str1, self.bigram_beam_search_result[0])
def __init__(self, config):
    super(TransducerModel, self).__init__()
    self.encoder = Encoder(config)
    self.decoder = AutoregressiveDecoder(config)
    self.joiner = Joiner(config)
    self.blank_index = self.joiner.blank_index
    self.num_outputs = self.joiner.num_outputs
    # self.transducer_loss = Transducer(blank_label=self.blank_index)
    # Placeholder vocabulary: beam results are consumed as integer class
    # indices, so the label characters themselves never matter here.
    self.ctc_decoder = ctcdecode.CTCBeamDecoder(
        ["a" for _ in range(self.num_outputs)],
        blank_id=self.blank_index,
        beam_width=config.beam_width)
    self.beam_width = config.beam_width
def beam_decode(self, preds):
    preds = preds.transpose(0, 1)
    preds = F.softmax(preds, dim=2)
    batch_size = preds.size(0)
    decoder = ctcdecode.CTCBeamDecoder(self.alphabet,
                                       beam_width=self.beam_size,
                                       blank_id=self.alphabet.index('-'),
                                       num_processes=24)
    beam_results, beam_scores, timesteps, out_seq_len = decoder.decode(preds)
    texts = []
    for i in range(batch_size):
        output_str = self.beam_to_string(beam_results[i][0], self.alphabet, out_seq_len[i][0])
        texts.append(output_str)
    # print(texts)
    return texts
def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=20,
             cutoff_prob=1.0, beam_width=4, num_processes=4):
    self.labels = labels
    self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)])
    self.blank_index = self.labels.index("_")
    self.decoder = ctcdecode.CTCBeamDecoder(labels=self.labels,
                                            model_path=lm_path,
                                            alpha=alpha,
                                            beta=beta,
                                            # note: the cutoff_top_n argument is ignored as written
                                            cutoff_top_n=len(self.labels),
                                            cutoff_prob=cutoff_prob,
                                            beam_width=beam_width,
                                            num_processes=num_processes,
                                            blank_id=self.blank_index,
                                            log_probs_input=False)
    self.text_transform = TextTransform()
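# Hedged usage sketch for the wrapper above. The class name `BeamDecoder`, the
# label set, and the input tensor are illustrative assumptions; note that
# log_probs_input=False in the constructor, so the decoder expects probabilities
# (e.g. softmax output), not log-probabilities.
import torch

labels = ["_", " ", "a", "b", "c"]                      # "_" is the blank, as indexed above
wrapper = BeamDecoder(labels)                           # hypothetical name for the class defined above
probs = torch.randn(1, 100, len(labels)).softmax(dim=2)
beam_results, beam_scores, timesteps, out_lens = wrapper.decoder.decode(probs)
best = "".join(labels[int(i)] for i in beam_results[0][0][:out_lens[0][0]])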
def __init__(self, opts, model, criterion, vocabulary, vocab_size, blank_id):
    self.opts = opts
    self.model = model
    self.criterion = criterion
    self.vocab_size = vocab_size
    self.blank_id = blank_id
    self.pad = vocabulary.pad()
    self.unk = vocabulary.unk()
    self.eos = vocabulary.eos()
    self.bos = vocabulary.bos()
    self.cuda = torch.cuda.is_available()
    if self.cuda:
        self.criterion = self.criterion.cuda()
        self.model = self.model.cuda()
    self._num_updates = 0

    # params = []
    # for params in self.model.parameters():
    #     if params not in self.model.decoder.parameters():
    #         params.append(params)
    params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
    self.optimizer = torch.optim.Adam(params,
                                      lr=self.opts.learning_rate,
                                      weight_decay=self.opts.weight_decay)
    logging.info('| num. module params: {} (num. trained: {})'.format(
        sum(p.numel() for p in params),
        sum(p.numel() for p in params if p.requires_grad),
    ))
    self.dec_generator = IterativeGenerate(vocabulary, model)
    # self._build_optimizer(params, self.opts.optimizer, lr=self.opts.learning_rate,
    #                       momentum=self.opts.momentum, weight_decay=self.opts.weight_decay)

    self.decoder_vocab = [chr(x) for x in range(20000, 20000 + self.vocab_size)]
    self.decoder = ctcdecode.CTCBeamDecoder(self.decoder_vocab,
                                            beam_width=self.opts.beam_width,
                                            blank_id=self.blank_id,
                                            num_processes=10)
def __init__(self, opts, device, vocab_size, vocabulary, dilated_channels=512,
             num_blocks=1, dilations=[1, 2, 4], dropout=0.0):
    super(DilatedSLRNet, self).__init__()
    self.opts = opts
    self.device = device
    self.vocab_size = vocab_size
    self.in_channels = self.opts.feature_dim
    self.out_channels = dilated_channels
    self.vocab = vocabulary
    self.pad = self.vocab.pad()
    self.eos = self.vocab.eos()
    self.bos = self.vocab.bos()
    self.unk = self.vocab.unk()
    self.blank_id = self.vocab.blank()
    self.num_blocks = num_blocks
    self.dilations = dilations
    self.kernel_size = 3
    self.block_list = nn.ModuleList()
    for i in range(self.num_blocks):
        self.block_list.append(
            DilatedBlock(self.in_channels, self.out_channels,
                         self.kernel_size, self.dilations))
    self.out_conv = nn.Conv1d(self.out_channels, self.out_channels,
                              self.kernel_size,
                              padding=(self.kernel_size - 1) // 2)
    self.act_tanh = nn.Tanh()
    self.fc = nn.Linear(self.out_channels, self.vocab_size)
    self.decoder = LevenshteinTransformerDecoder(opts, vocabulary)

    ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
    self.ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                                beam_width=opts.beam_width,
                                                blank_id=self.blank_id,
                                                num_processes=10)
def __init__(self):
    self._train_loader = None
    self._valid_loader = None
    self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self._loss = torch.nn.CTCLoss(blank=0, reduction="none")
    self._label_dict = {
        "_": 0, " ": 1, "'": 2, "A": 3, "B": 4, "C": 5, "D": 6, "E": 7,
        "F": 8, "G": 9, "H": 10, "I": 11, "J": 12, "K": 13, "L": 14,
        "M": 15, "N": 16, "O": 17, "P": 18, "Q": 19, "R": 20, "S": 21,
        "T": 22, "U": 23, "V": 24, "W": 25, "X": 26, "Y": 27, "Z": 28,
    }
    self._rev_label_dict = {v: k for k, v in self._label_dict.items()}
    self._decoder = ctcdecode.CTCBeamDecoder(
        labels=[str(c) for c in self._rev_label_dict], beam_width=1)
def test_simple_decode_different_blank_idx(self):
    aa = torch.FloatTensor(
        np.array([[[0.0, 1.0]], [[0.0, 1.0]], [[1.0, 0.0]],
                  [[0.0, 1.0]], [[0.0, 1.0]]], dtype=np.float32)).log()
    seq_len = torch.IntTensor(np.array([5], dtype=np.int32))

    labels = "_A"
    scorer = ctcdecode.Scorer()
    decoder_nomerge = ctcdecode.CTCBeamDecoder(scorer, labels, blank_index=0,
                                               space_index=-1, top_paths=1,
                                               beam_width=1)
    result_nomerge, _, result_nomerge_len, nomerge_alignments, _ = decoder_nomerge.decode(aa, seq_len)

    self.assertEqual(result_nomerge_len[0][0], 2)
    self.assertEqual(result_nomerge.numpy()[0, 0, :result_nomerge_len[0][0]].tolist(), [1, 1])
def __init__(self):
    self._param_shapes = None
    self._param_types = None
    self._eval_iters = {}
    self._loss = torch.nn.CTCLoss(blank=0, reduction="none")
    self._label_dict = {
        "_": 0, " ": 1, "'": 2, "A": 3, "B": 4, "C": 5, "D": 6, "E": 7,
        "F": 8, "G": 9, "H": 10, "I": 11, "J": 12, "K": 13, "L": 14,
        "M": 15, "N": 16, "O": 17, "P": 18, "Q": 19, "R": 20, "S": 21,
        "T": 22, "U": 23, "V": 24, "W": 25, "X": 26, "Y": 27, "Z": 28,
    }
    self._rev_label_dict = {v: k for k, v in self._label_dict.items()}
    self._decoder = ctcdecode.CTCBeamDecoder(
        labels=[str(c) for c in self._rev_label_dict], beam_width=1)
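# Hedged note on the two constructors above: with beam_width=1 the beam decoder
# degenerates to a greedy search over the 29-symbol vocabulary. A minimal sketch
# of mapping its output back to text via _rev_label_dict (the logits tensor and
# softmax step are assumptions; the decoder defaults to log_probs_input=False):
def example_decode(decoder, rev_label_dict, logits):
    # logits: (batch, time, num_labels) raw network outputs.
    probs = torch.softmax(logits, dim=2)
    beam_results, _, _, out_lens = decoder.decode(probs)
    return "".join(rev_label_dict[int(i)] for i in beam_results[0][0][:out_lens[0][0]])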
def testModel(model, test_loader, device):
    model.to(device)
    model.eval()
    with open('submission_1.txt', 'w') as file:
        with torch.no_grad():
            i = 1
            for batch_idx, (data, data_lengths, label, label_length) in enumerate(test_loader):
                label = torch.tensor(label)
                label_length = torch.tensor(label_length)
                data, data_lengths, label, label_length = \
                    data.to(device), data_lengths.to(device), label.to(device), label_length.to(device)
                outputs, hidden = model(data, data_lengths, label, label_length)

                # decode
                outputs_soft = outputs.permute(1, 0, 2)
                m = nn.Softmax(dim=2)
                outputs_soft = m(outputs_soft)
                probs_seq = outputs_soft
                decoder = ctcdecode.CTCBeamDecoder(PHONEME_MAP,
                                                   beam_width=100,
                                                   blank_id=PHONEME_MAP.index(' '))
                beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(probs_seq)
                output_str = convert_to_string(beam_result[0][0], PHONEME_MAP, out_seq_len[0][0])
                file.write('\n' + '{}'.format(output_str))
                print(i)
                print('{}'.format(output_str))
                i += 1
def create_decoder(vocab_list: list, cutoff_top_n: int, beam_size: int,
                   blank_symbol, language_model_parameters: LanguageModelParameters):
    """
    :param vocab_list: list of output vocabulary symbols.
    :param cutoff_top_n: A parameter that limits the number of vocabulary
                         candidates that are kept by the decoder.
    :param beam_size: the beam width used by the beam search decoder.
    :param blank_symbol: the symbol in vocab_list used as the CTC blank.
    :param language_model_parameters: language model path and weights, or None
                                      to decode without a language model.
    :return: the decoder.
    """
    if language_model_parameters is not None:
        print("Creating decoder with language model loaded from " +
              str(language_model_parameters.language_model_file_path))
        decoder = ctcdecode.CTCBeamDecoder(
            vocab_list,
            model_path=language_model_parameters.language_model_file_path,
            cutoff_top_n=cutoff_top_n,
            beam_width=beam_size,
            alpha=language_model_parameters.language_model_weight,
            beta=language_model_parameters.word_insertion_penalty,
            blank_id=vocab_list.index(blank_symbol),
            space_symbol=Evaluator.WORD_SEPARATOR_SYMBOL,
            num_processes=16)
    else:
        decoder = ctcdecode.CTCBeamDecoder(
            vocab_list,
            cutoff_top_n=cutoff_top_n,
            beam_width=beam_size,
            blank_id=vocab_list.index(blank_symbol),
            space_symbol=Evaluator.WORD_SEPARATOR_SYMBOL,
            num_processes=16)
    return decoder
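# Hedged usage sketch for create_decoder() above (vocabulary and beam settings are
# illustrative; passing language_model_parameters=None takes the LM-free branch).
# Note the space_symbol keyword used above is specific to the ctcdecode fork this
# repository builds against.
vocab = ["_", " ", "a", "b", "c"]
decoder = create_decoder(vocab_list=vocab,
                         cutoff_top_n=40,
                         beam_size=25,
                         blank_symbol="_",
                         language_model_parameters=None)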
def main():
    opts = parse_args()
    init_logging(os.path.join(opts.log_dir, '{:s}_win0_win4_log_test.txt'.format(opts.task)))
    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"
    logging.info(opts)

    test_datasets = PhoenixVideo(opts.vocab_file, opts.corpus_dir, opts.video_path,
                                 phase=opts.task, DEBUG=opts.DEBUG)
    vocab_size = test_datasets.vocab.num_words
    blank_id = test_datasets.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)
    # model = DilatedSLRNet(opts, device, vocab_size, vocabulary,
    #                       dilated_channels=512, num_blocks=5, dilations=[1, 2, 4], dropout=0.0)
    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    # ctcdecode
    ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
    ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                           beam_width=opts.beam_width,
                                           blank_id=blank_id,
                                           num_processes=10)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file found in {}".format(opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    test_iter = trainer.get_batch_iterator(test_datasets, batch_size=opts.batch_size, shuffle=False)
    decoded_dict = {}
    val_err, val_correct, val_count = np.zeros([4]), 0, 0
    with open("Data/output/hypo_ctc.txt", "w") as f, open("Data/output/ref_ctc.txt", "w") as f2:
        with torch.no_grad():
            model.eval()
            criterion.eval()
            for samples in tqdm(test_iter):
                samples = trainer._prepare_sample(samples)
                video = samples["data"]
                len_video = samples["len_data"]
                label = samples["label"]
                len_label = samples["len_label"]
                video_id = samples['id']

                logits, _ = model(video, len_video)
                len_video /= 4
                logits = F.softmax(logits, dim=-1)
                pred_seq, _, _, out_seq_len = ctc_decoder.decode(logits, len_video)

                start = 0
                for i, length in enumerate(len_label):
                    end = start + length
                    ref = label[start:end].tolist()
                    hyp = [x[0] for x in groupby(pred_seq[i][0][:out_seq_len[i][0]].tolist())]
                    ref_sent = " ".join([vocabulary.index2word[r] for r in ref])
                    hyp_sent = " ".join([vocabulary.index2word[r] for r in hyp])
                    f.write(hyp_sent + "\n")
                    f2.write(ref_sent + "\n")
                    decoded_dict[video_id[i]] = hyp
                    val_correct += int(ref == hyp)
                    err = get_wer_delsubins(ref, hyp)
                    val_err += np.array(err)
                    val_count += 1
                    start = end
                assert end == label.size(0)

    logging.info('-' * 50)
    logging.info('Epoch: {:d}, DEV ACC: {:.5f}, {:d}/{:d}'.format(
        epoch, val_correct / val_count, val_correct, val_count))
    logging.info('Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'.format(
        epoch, val_err[0] / val_count, val_err[1] / val_count,
        val_err[2] / val_count, val_err[3] / val_count))

    list_str_for_test = []
    for k, v in decoded_dict.items():
        start_time = 0
        for wi in v:
            tl = np.random.random() * 0.1
            list_str_for_test.append('{} 1 {:.3f} {:.3f} {}\n'.format(
                k, start_time, start_time + tl, test_datasets.vocab.index2word[wi]))
            start_time += tl

    tmp_prefix = str(uuid.uuid1())
    txt_file = '{:s}.txt'.format(tmp_prefix)
    result_file = os.path.join('evaluation_relaxation', txt_file)
    with open(result_file, 'w') as fid:
        fid.writelines(list_str_for_test)
    phoenix_eval_err = get_phoenix_wer(txt_file, opts.task, tmp_prefix)
    logging.info('[Relaxation Evaluation] Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
                 .format(epoch, phoenix_eval_err[0], phoenix_eval_err[1],
                         phoenix_eval_err[2], phoenix_eval_err[3]))
    return phoenix_eval_err
net = HTRNet(cnn_cfg, rnn_cfg, len(classes))
if load_model_name is not None:
    my_torch_load(net, load_model_name)
net.cuda(args.gpu_id)

loss = warp_ctc.CTCLoss()
net_parameters = net.parameters()
nlr = args.learning_rate
optimizer = torch.optim.Adam(net_parameters, nlr, weight_decay=0.00005)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, [int(.5 * max_epochs), int(.75 * max_epochs)])

decoder = ctcdecode.CTCBeamDecoder([c for c in classes], beam_width=100)
# decoder = ctcdecode.


def train(epoch):
    optimizer.zero_grad()
    closs = []
    for iter_idx, (img, transcr) in enumerate(train_loader):
        img = Variable(img.cuda(gpu_id))
        # cuda augm - alternatively for cpu use it on dataloader
        img = torch_augm(img)
        output = net(img)
        act_lens = torch.IntTensor(img.size(0) * [output.size(0)])
def is_keyword_batch(self, input_features, sensitivity, tmp_out_dir=None):
    if tmp_out_dir is None:
        tmp_out_dir = self.out_dir
    # https://stackoverflow.com/questions/15638612/calculating-mean-and-standard-deviation-of-the-data-which-does-not-fit-in-memory
    #
    # _, feat = next(iter(input_features.items()))
    # _dim = feat.shape[-1]
    #
    # n = 0
    # mean = np.zeros((_dim))
    # M2 = np.zeros((_dim))
    #
    # for sample_name, feat in tqdm(input_features.items()):
    #     # for i in range(10):
    #     for i in range(feat.shape[0]):
    #         n += 1
    #         delta = feat[i, :] - mean
    #         mean = mean + (delta / n)
    #         M2 = M2 + (delta ** 2)
    #
    # std = np.sqrt(M2 / (n - 1))
    # mean = torch.from_numpy(mean).to(dtype=torch.float32).unsqueeze(-1)
    # std = torch.from_numpy(std).to(dtype=torch.float32).unsqueeze(-1)

    # test_output = self.test_decoder()

    # plot_phns = metadata_dict is None
    plot_phns = False
    # if plot_phns:
    #     lab_dict = {"lab_mono": {
    #         "label_folder": "/mnt/data/libs/kaldi/egs/librispeech/s5/exp/tri4b_ali_dev_clean_100/",
    #         "label_opts": "ali-to-phones --per-frame=true",
    #         "lab_data_folder": "/mnt/data/libs/kaldi/egs/librispeech/s5/data/dev_clean/",
    #         "lab_graph": "/mnt/data/libs/kaldi/egs/librispeech/s5/exp/tri4b/graph_tgsmall/"
    #     }}
    #     label_index_from = 1
    #     _labels = _load_labels(lab_dict, label_index_from, max_label_length=None, phoneme_dict=self.phoneme_dict)
    #
    #     lab_dict = {"lab_mono": {
    #         "label_folder": "/mnt/data/libs/kaldi/egs/librispeech/s5/exp/tri4b_ali_dev_clean_100/",
    #         "label_opts": "ali-to-phones",
    #         "lab_data_folder": "/mnt/data/libs/kaldi/egs/librispeech/s5/data/dev_clean/",
    #         "lab_graph": "/mnt/data/libs/kaldi/egs/librispeech/s5/exp/tri4b/graph_tgsmall/"
    #     }}
    #     label_index_from = 1
    #     _labels_no_ali = _load_labels(lab_dict, label_index_from, max_label_length=None,
    #                                   phoneme_dict=self.phoneme_dict)

    vocabulary_size = 42
    vocabulary = [
        chr(c) for c in list(range(65, 65 + 58)) +
        list(range(65 + 58 + 69, 65 + 58 + 69 + 500))
    ][:vocabulary_size]
    decoder = ctcdecode.CTCBeamDecoder(vocabulary, log_probs_input=True, beam_width=1)

    all_samples_concat = None
    for sample_name, feat in tqdm(input_features.items()):
        if all_samples_concat is None:
            all_samples_concat = feat
        else:
            all_samples_concat = np.concatenate((all_samples_concat, feat))

    mean = torch.from_numpy(np.mean(all_samples_concat, axis=0)).to(dtype=torch.float32).unsqueeze(-1)
    std = torch.from_numpy(np.std(all_samples_concat, axis=0)).to(dtype=torch.float32).unsqueeze(-1)
    post_files = []

    plot_num = 0

    # len = 88
    # input_batch = []
    # sample_names = []
    # for sample_name in tqdm(input_features, desc="computing acoustic features:"):
    #     input_feature = self.preprocess_feat(input_features[sample_name])
    #     # Normalize over whole chunk instead of only over a single file, which is done by applying the kaldi cmvn
    #     _input_feature = ((input_feature - mean) / std).unsqueeze(1)
    #     if _input_feature.shape[0] < len:
    #         _zeros = torch.zeros((88, 1, 40, 11))
    #         _zeros[-_input_feature.shape[0]:, :, :, :] = _input_feature
    #         _input_feature = _zeros
    #     input_batch.append(_input_feature)
    #     sample_names.append(sample_name)
    # input_batch = {'fbank': torch.cat(input_batch, dim=1)}

    beam_results = {}
    output_label = 'out_phn'
    assert output_label in self.model.out_names
    with KaldiOutputWriter(tmp_out_dir, "keyword", [output_label], self.epoch) as writer:
        post_files.append(writer.post_file[output_label].name)
        for sample_name in tqdm(input_features, desc="computing acoustic features:", position=1):
            # input_feature = {"fbank": self.preprocess_feat(input_features[sample_name])}
            input_feature = {"fbank": torch.from_numpy(input_features[sample_name].T).unsqueeze(0)}
            # Normalize over whole chunk instead of only over a single file, which is done by applying the kaldi cmvn
            input_feature["fbank"] = ((input_feature["fbank"] - mean) / std)
            # assert input_feature["fbank"].shape[2] > self.model.context_left + self.model.context_right + 50
            if input_feature["fbank"].shape[2] < self.model.context_left + self.model.context_right + 100:
                padd = torch.zeros((input_feature["fbank"].shape[0],
                                    input_feature["fbank"].shape[1],
                                    self.model.context_left + self.model.context_right),
                                   device=input_feature["fbank"].device,
                                   dtype=input_feature["fbank"].dtype)
                input_feature["fbank"] = torch.cat((padd, input_feature["fbank"]), dim=2)
            output = self.model(input_feature)
            assert output_label in output
            output = output[output_label]

            _logits = output.detach().permute(0, 2, 1)
            output = output.detach().squeeze(0).numpy().T
            # output = test_output

            # if self.config['test'][output_label]['normalize_posteriors']:
            #     counts = self.config['dataset']['dataset_definition']['data_info']['labels']['lab_phn']['lab_count']
            #     counts = np.array(counts)
            #     blank_count = sum(counts)  # heuristic sil * 2 for the moment
            #     counts = counts * 0.5
            #     counts = np.concatenate((np.array([np.e]), counts))
            #     blank_scale = 1.0
            #     # TODO try different blank_scales 4.0 5.0 6.0 7.0
            #     counts[0] /= blank_scale
            #     # for i in range(1, 8):
            #     #     counts[i] /= noise_scale  # TODO try noise_scale for SIL SPN etc I guess
            #     prior = counts / np.sum(counts)
            #     output[:, 1:] = output[:, 1:] - np.log(prior)
            # assert _logits.shape[0] == batch_size
            # output = np.exp(output)

            beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(_logits)
            beam_result = beam_result[0, 0, :out_seq_len[0, 0]]
            result_decoded = [self.phoneme_dict.reducedIdx2phoneme[l.item() - 1] for l in beam_result]
            result_decoded = " ".join(result_decoded)
            beam_results[sample_name] = result_decoded
            if plot_num < 20 and plot_phns:
                # logger.debug(sample_name)
                # logger.debug(result_decoded)
                # if plot_phns:
                #     label_decoded = " ".join(
                #         [self.phoneme_dict.idx2phoneme[l.item()] for l in _labels_no_ali['lab_mono'][sample_name]])
                #     logger.debug(label_decoded)
                # if plot_phns:
                #     plot_alignment_spectrogram(sample_name, input_feature["fbank"],
                #                                (np.exp(output).T / np.exp(output).sum(axis=1)).T,
                #                                self.phoneme_dict, _labels, result_decoded=result_decoded)
                # else:
                plot_alignment_spectrogram(sample_name, input_feature["fbank"],
                                           (np.exp(output).T / np.exp(output).sum(axis=1)).T,
                                           self.phoneme_dict, result_decoded=result_decoded)
                plot_num += 1

            # else:
            #     beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(_logits)
            #     beam_result = beam_result[0, 0, :out_seq_len[0, 0]]
            #     # logger.debug(sample_name)
            #     result_decoded = [self.phoneme_dict.reducedIdx2phoneme[l.item() - 1] for l in beam_result]
            #     result_decoded = " ".join(result_decoded)
            #     # logger.debug(result_decoded)
            #     plot_alignment_spectrogram(sample_name, input_feature["fbank"],
            #                                (np.exp(output).T / np.exp(output).sum(axis=1)).T,
            #                                self.phoneme_dict, metadata_dict[sample_name],
            #                                result_decoded=result_decoded)
            #     plot_num += 1

            assert len(output.shape) == 2
            assert np.sum(np.isnan(output)) == 0, "NaN in output"
            assert output.shape[1] == len(self.phoneme_dict.reducedIdx2phoneme) + 1
            writer.write_mat(output_label, output.squeeze(), sample_name)

    # self.config['decoding']['scoring_type'] = 'just_transcript'
    #### DECODING ####
    logger.debug("Decoding...")
    result = decode_ctc(**self.config['dataset']['dataset_definition']['decoding'],
                        words_path=self.words_path,
                        graph_path=self.graph_path,
                        out_folder=tmp_out_dir,
                        featstrings=post_files)

    # TODO filter result
    return result
def __init__(self, lexicon=None, backend='resnet18', base_model_dir=None,
             rnn_hidden_size=128, rnn_num_layers=2, rnn_dropout=0,
             seq_proj=[0, 0], do_beam_search=False,
             dropout_conv=False, dropout_rnn=False, dropout_output=False,
             cuda=True, do_ema=False, ada_after_rnn=False, ada_before_rnn=False):
    super().__init__()

    self.lexicon = lexicon
    print(lexicon)
    self.do_beam_search = do_beam_search
    self.num_classes = len(self.lexicon)
    self.ada_after_rnn = ada_after_rnn
    self.ada_before_rnn = ada_before_rnn

    self.feature_extractor = getattr(my_models, backend)(pretrained=True, model_dir=base_model_dir)
    self.cnn = nn.Sequential(
        self.feature_extractor.conv1,
        self.feature_extractor.bn1,
        self.feature_extractor.relu,
        nn.MaxPool2d(kernel_size=(3, 1), stride=(2, 1), padding=(1, 0)),
        self.feature_extractor.layer1,
        self.feature_extractor.layer2,
        nn.MaxPool2d(kernel_size=(3, 1), stride=(2, 1), padding=(1, 0)),
        self.feature_extractor.layer3,
        # self.feature_extractor.layer4,
        nn.MaxPool2d(kernel_size=(3, 1), stride=(2, 1), padding=(1, 0)))

    self.dropout_conv = dropout_conv
    self.dropout_rnn = dropout_rnn
    self.dropout_output = dropout_output
    self.dropout2d = nn.Dropout2d(p=0.5)
    self.dropout1d = nn.Dropout(p=0.5)

    self.fully_conv = True  # seq_proj[0] == 0
    if not self.fully_conv:
        self.proj = nn.Conv2d(seq_proj[0], seq_proj[1], kernel_size=1)

    self.rnn_hidden_size = rnn_hidden_size
    self.rnn_num_layers = rnn_num_layers
    # NOTE: both branches passed dropout=0.5 in the original, which made the
    # dropout_rnn flag a no-op; the else branch presumably meant dropout=0.
    if self.dropout_rnn:
        self.rnn = nn.GRU(self.get_block_size(self.cnn),
                          rnn_hidden_size, rnn_num_layers,
                          batch_first=False, bidirectional=True, dropout=0.5)
    else:
        self.rnn = nn.GRU(self.get_block_size(self.cnn),
                          rnn_hidden_size, rnn_num_layers,
                          batch_first=False, bidirectional=True, dropout=0)

    self.linear = nn.Linear(rnn_hidden_size * 2, self.num_classes + 1)
    self.softmax = nn.Softmax(dim=2)

    for i in range(20):
        length = random.randint(50, 300)
        height1, width1 = self._get_output_ratio(length)
        width2 = calc_im_seq_len(length)
        if width2 != width1:
            raise Exception(
                "error, orig width is: {} ; width through network is: {} ; calculated width is: {} ."
                .format(length, width1, width2))
        if height1 != 1:
            raise Exception("height after network should be one, but is: {}".format(height1))

    if self.do_beam_search:
        sorted_letters = [item[1] for item in sorted(lexicon.items(), key=operator.itemgetter(0))]
        sorted_keys = [item[0] for item in sorted(lexicon.items(), key=operator.itemgetter(0))]
        # print(sorted_keys)
        # print(sorted_letters)
        self.label_str = ['_'] + sorted_letters
        # print(label_str)
        print('vocab size is: {}'.format(len(self.label_str)))
        self.beam_decode = ctcdecode.CTCBeamDecoder(self.label_str, blank_id=0, beam_width=20)

    if cuda:
        self.cuda()
    if do_ema:
        self.avg_param = self.copy_model_params()  # initialize
        if cuda:
            for i in range(len(self.avg_param)):
                self.avg_param[i].cuda()

    if ada_after_rnn:
        self.domain_classifier_rnn = nn.Sequential()
        self.domain_classifier_rnn.add_module('d_fc1', nn.Linear(rnn_hidden_size * 2, 100))
        self.domain_classifier_rnn.add_module('d_bn1', nn.BatchNorm1d(100))
        self.domain_classifier_rnn.add_module('d_relu1', nn.ReLU(True))
        self.domain_classifier_rnn.add_module('d_fc2', nn.Linear(100, 2))
        self.domain_classifier_rnn.add_module('d_softmax', nn.LogSoftmax())

    if ada_before_rnn:
        self.domain_classifier_cnn = nn.Sequential()
        self.domain_classifier_cnn.add_module('d_fc1', nn.Linear(self.get_block_size(self.cnn), 100))
        self.domain_classifier_cnn.add_module('d_bn1', nn.BatchNorm1d(100))
        self.domain_classifier_cnn.add_module('d_relu1', nn.ReLU(True))
        self.domain_classifier_cnn.add_module('d_fc2', nn.Linear(100, 2))
        self.domain_classifier_cnn.add_module('d_softmax', nn.LogSoftmax())