Example No. 1
    def __init__(self, opt):
        self.alphabet = {}  # index -> character, filled from the label file
        for l in open(label_file, 'r').readlines():
            l = l.strip().split(' ')
            if l[0] == '4800':
                continue
            self.alphabet[int(l[0])] = l[1]


        opt.imgH = 32
        opt.imgW = 800
        opt.Transformation = 'None'
        opt.FeatureExtraction = 'ResNet'
        opt.input_channel = 1
        opt.num_class = 4787
        opt.output_channel = 512
        opt.hidden_size = 512
        opt.dropout = 0.5
        opt.rnnlayers = 1
        opt.rnndropout = 0
        opt.batch_max_length = 40
        
        self.opt = opt
        if opt.lm is not None:
            self.lm_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), opt.lm)
            self.bm_decoder = ctcdecode.CTCBeamDecoder(self.alphabet, beam_width=opt.beam_width, num_processes = 16,
                                                       blank_id = 0, model_path=self.lm_path, alpha = opt.alpha, beta = opt.beta)
        else:
            self.bm_decoder = ctcdecode.CTCBeamDecoder(self.alphabet, beam_width=opt.beam_width, num_processes = 16,
                                                       blank_id = 0, alpha = opt.alpha, beta = opt.beta)
            
        
        self.net = Model(opt)
        # weight initialization
        for name, param in self.net.named_parameters():
            if 'localization_fc2' in name:
                print(f'Skip {name} as it is already initialized')
                continue
            try:
                if 'bias' in name:
                    init.constant_(param, 0.0)
                elif 'weight' in name:
                    init.kaiming_normal_(param)
            except Exception as e:  # for batchnorm.
                if 'weight' in name:
                    param.data.fill_(1)
                    continue

        # data parallel for multi-GPU
        self.net = torch.nn.DataParallel(self.net).cuda()
    
        self.net.load_state_dict(torch.load(opt.m))
        self.net.eval()
        self.trans = ResizeAug(800,32,rand_scale=False)
        self.toT = transforms.ToTensor()
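
For orientation, a hedged inference sketch for a wrapper like the one above; the output shape of Model, the preprocessing, and the index-to-character mapping are assumptions rather than details taken from the source:

    # Hypothetical usage sketch -- assumes self.net returns per-timestep class
    # scores of shape (batch, T, num_class).
    def recognize(self, img):
        import torch.nn.functional as F
        x = self.toT(self.trans(img)).unsqueeze(0).cuda()
        with torch.no_grad():
            preds = self.net(x)                      # assumed (1, T, num_class)
        probs = F.softmax(preds, dim=2)
        beams, scores, timesteps, out_lens = self.bm_decoder.decode(probs)
        best = beams[0, 0, :out_lens[0, 0]]
        return ''.join(self.alphabet.get(int(i), '') for i in best)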
Example No. 2
def decode(log_prob, input_len, catted_target=None, target_len=None):
    decoder = ctcdecode.CTCBeamDecoder(PHONEME_MAP,
                                       beam_width=100,
                                       blank_id=0,
                                       log_probs_input=True,
                                       num_processes=16)
    if catted_target is not None:  # calculate levenshtein distance
        output, scores, timesteps, out_seq_len = decoder.decode(
            log_prob, input_len)
        y_start = 0
        running_dist = []
        for i in range(output.size(0)):
            pred_str = "".join(PHONEME_MAP[f]
                               for f in output[i, 0, :out_seq_len[i, 0]])
            label_str = "".join(PHONEME_MAP[f + 1]
                                for f in catted_target[y_start:y_start +
                                                       target_len[i]])
            running_dist.append(L.distance(pred_str, label_str))
            y_start += target_len[i]
            if i % 50 == 0:
                print("%s -> %s" % (label_str, pred_str))
        return running_dist
    else:  # only calculate decoded result
        output, scores, timesteps, out_seq_len = decoder.decode(
            log_prob, input_len)
        pred_str = []
        for i in range(output.size(0)):
            pred_str.append("".join(PHONEME_MAP[f]
                                    for f in output[i, 0, :out_seq_len[i, 0]]))
        return pred_str
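
A hedged call-site sketch for the helper above: CTCBeamDecoder.decode expects a (batch, time, vocab) tensor, and log_probs_input=True means log_prob should come straight from log_softmax. The names model, x, and x_lens are assumptions.

# Hypothetical usage -- assumes the network emits (time, batch, vocab) scores.
log_prob = model(x).log_softmax(dim=2).transpose(0, 1)  # -> (batch, time, vocab)
pred_strs = decode(log_prob.cpu(), x_lens.cpu())        # beam-decoded strings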
Example No. 3
    def __init__(self, opts, vocab_size, blank_id):
        self.opts = opts
        self.vocab_size = vocab_size
        self.blank_id = blank_id
        self.network = SLRNetwork(self.opts,
                                  vocab_size,
                                  num_blocks=5,
                                  dilations=[1, 2, 4])

        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        self.criterion = nn.CTCLoss(blank=self.blank_id, reduction='none')
        params_all = [{'params': self.network.parameters()}]
        self.optimizer = create_optimizer('adam',
                                          params_all,
                                          lr=self.opts.learning_rate,
                                          momentum=self.opts.momentum,
                                          weight_decay=self.opts.weight_decay)
        self.ctc_decoder_vocab = [
            chr(x) for x in range(20000, 20000 + self.vocab_size)
        ]
        self.ctc_decoder = ctcdecode.CTCBeamDecoder(
            self.ctc_decoder_vocab,
            beam_width=self.opts.beam_width,
            blank_id=self.blank_id,
            num_processes=10)
        self.decoded_dict = {}
Example No. 4
 def __init__(self, beam_size=100, blank_id=labels.index('_'), kenlm_path=None):
     print("loading beam search with lm...")
     self.decoder = ctcdecode.CTCBeamDecoder(
         labels, alpha=0.522729216841, beta=0.96506699808,
         beam_width=beam_size, blank_id=labels.index('_'),
         model_path=kenlm_path)
     print("finished loading beam search")
Example No. 5
 def test_beam_search_decoder_2(self):
     probs_seq = np.log(np.array([self.probs_seq2], dtype=np.float32))
     decoder = ctcdecode.CTCBeamDecoder(beam_width=self.beam_size,
                                        blank_id=self.vocab_list.index('_'))
     results = decoder.decode(probs_seq)
     output_str = self.convert_to_string(results[0][0][0])
     self.assertEqual(output_str, self.beam_search_result[1])
Example No. 6
    def create_test_decoder_with_language_model(
            handwriting_recognition_root_dir: str,
            use_non_zero_language_model_weight: bool):
        vocab_list = TestCTCDecodeWithLanguageModel.create_test_vocab_list()
        language_model_binary_file = create_test_language_model(
            handwriting_recognition_root_dir)
        # alpha: language model weight
        # beta: word insertion weight
        # See: https://github.com/PaddlePaddle/models/issues/218

        if use_non_zero_language_model_weight:
            language_model_weight = TestCTCDecodeWithLanguageModel.NONZERO_LANGUAGE_MODEL_WEIGHT
            language_model_path = language_model_binary_file
        else:
            language_model_weight = 0
            language_model_path = None

        decoder = ctcdecode.CTCBeamDecoder(
            vocab_list,
            model_path=language_model_path,
            beam_width=TestCTCDecodeWithLanguageModel.BEAM_SIZE,
            alpha=language_model_weight,
            beta=TestCTCDecodeWithLanguageModel.WORD_INSERTION_WEIGHT,
            blank_id=vocab_list.index(
                TestCTCDecodeWithLanguageModel.BLANK_SYMBOL),
            num_processes=16)
        return decoder, vocab_list
Example No. 7
 def test_beam_search_decoder_2(self):
     probs_seq = torch.FloatTensor([self.probs_seq2])
     decoder = ctcdecode.CTCBeamDecoder(self.vocab_list, beam_width=self.beam_size,
                                        blank_id=self.vocab_list.index('_'))
     beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(probs_seq)
     output_str = self.convert_to_string(beam_result[0][0], self.vocab_list, out_seq_len[0][0])
     self.assertEqual(output_str, self.beam_search_result[1])
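
convert_to_string is referenced by several of these tests but never shown; a typical definition, offered here as an assumption, maps the first seq_len entries of a beam back through the vocabulary:

def convert_to_string(tokens, vocab, seq_len):
    # Beam results are padded; only the first seq_len entries are valid.
    return "".join(vocab[x] for x in tokens[:seq_len])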
Example No. 8
    def test_ctc_output_probability(self):
        seq_len_0 = 2
        classes = 3
        input_prob_matrix_0 = np.asarray(
            [[0.4, 0.00000001, 0.6], [0.4, 0.00000001, 0.6]], dtype=np.float32)
        input_log_prob_matrix_0 = np.log(input_prob_matrix_0)
        inputs = np.array([
            input_log_prob_matrix_0[t, :][np.newaxis, :]
            for t in range(seq_len_0)
        ])
        seq_lens = np.array([seq_len_0], dtype=np.int32)

        th_input = torch.from_numpy(inputs)
        th_seq_len = torch.IntTensor(seq_lens)

        labels = "AB_"
        scorer = ctcdecode.Scorer()
        decoder = ctcdecode.CTCBeamDecoder(scorer,
                                           labels,
                                           blank_index=2,
                                           space_index=-1,
                                           top_paths=1,
                                           beam_width=3)

        decode_result, scores, decode_len, alignments, char_probs = decoder.decode(
            th_input, th_seq_len)
        self.assertEqual(decode_len[0][0], 1)
        self.assertEqual(
            decode_result.numpy()[0, 0, :decode_len[0][0]].tolist(), [0])
        self.assertEqual(alignments.numpy()[0, 0, :decode_len[0][0]].tolist(),
                         [1])
        np.testing.assert_almost_equal(scores.numpy(),
                                       np.log(np.array([[0.64]])), 5)
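
The asserted score follows from CTC's many-to-one alignments: with per-step probabilities p(A) = 0.4 and p(blank) = 0.6 (class 2 is the blank here), the three two-step paths that collapse to "A" are A·blank, blank·A, and A·A. A quick check, independent of the older Scorer-based ctcdecode variant this snippet targets:

import math
# P("A") = P(A,_) + P(_,A) + P(A,A) = 0.4*0.6 + 0.6*0.4 + 0.4*0.4 = 0.64
p_total = 0.4 * 0.6 + 0.6 * 0.4 + 0.4 * 0.4
assert abs(math.log(p_total) - math.log(0.64)) < 1e-9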
Example No. 9
	def __init__(
		self,
		labels,
		lm_path,
		beam_width,
		beam_alpha = 0,
		beam_beta = 0,
		cutoff_top_n = 40,
		cutoff_prob = 1.0,
		num_workers = 1,
		topk = 1
	):
		import ctcdecode
		self.topk = topk
		self.beam_search_decoder = ctcdecode.CTCBeamDecoder(
			list(str(labels).lower()),
			lm_path,
			beam_alpha,
			beam_beta,
			cutoff_top_n if cutoff_top_n is not None else len(labels),
			cutoff_prob,
			beam_width,
			num_workers,
			labels.blank_idx,
			log_probs_input = True
		)
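
For readability, here is the positional call above rewritten with keyword arguments, assuming the parlance/ctcdecode constructor order (labels, model_path, alpha, beta, cutoff_top_n, cutoff_prob, beam_width, num_processes, blank_id):

# Keyword-argument equivalent of the positional construction (assumed signature).
beam_search_decoder = ctcdecode.CTCBeamDecoder(
    list(str(labels).lower()),
    model_path=lm_path,
    alpha=beam_alpha,
    beta=beam_beta,
    cutoff_top_n=cutoff_top_n if cutoff_top_n is not None else len(labels),
    cutoff_prob=cutoff_prob,
    beam_width=beam_width,
    num_processes=num_workers,
    blank_id=labels.blank_idx,
    log_probs_input=True,
)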
Example No. 10
 def __init__(self, beam_size=100, blank_id=labels.index('_'), kenlm_path=None):
     print("loading beam search with lm...")
     print("kenlm path: "+ kenlm_path)
     self.decoder = ctcdecode.CTCBeamDecoder(
         labels, alpha=1.51289039105002, beta=0.86506699808,
         beam_width=beam_size, blank_id=labels.index('_'),
         model_path=kenlm_path)
     print("finished loading beam search")
Example No. 11
    def test_beam_search_decoder_3(self):
        lm_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test.arpa')
        probs_seq = torch.FloatTensor([self.probs_seq2])

        decoder = ctcdecode.CTCBeamDecoder(self.vocab_list, beam_width=self.beam_size,
                                           blank_id=self.vocab_list.index('_'),
                                           model_path=lm_path)
        beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(probs_seq)
        output_str = self.convert_to_string(beam_result[0][0], self.vocab_list, out_seq_len[0][0])
        self.assertEqual(output_str, self.beam_search_result[2])
Example No. 12
 def test_beam_search_decoder_batch_log(self):
     probs_seq = torch.FloatTensor([self.probs_seq1, self.probs_seq2]).log()
     decoder = ctcdecode.CTCBeamDecoder(self.vocab_list, beam_width=self.beam_size,
                                        blank_id=self.vocab_list.index('_'), log_probs_input=True,
                                        num_processes=24)
     beam_results, beam_scores, timesteps, out_seq_len = decoder.decode(probs_seq)
     output_str1 = self.convert_to_string(beam_results[0][0], self.vocab_list, out_seq_len[0][0])
     output_str2 = self.convert_to_string(beam_results[1][0], self.vocab_list, out_seq_len[1][0])
     self.assertEqual(output_str1, self.beam_search_result[0])
     self.assertEqual(output_str2, self.beam_search_result[1])
Example No. 13
 def create_decoder(self, alpha, beta):
     self.decoder = ctcdecode.CTCBeamDecoder(
         self.vocab_list,
         model_path=self.hparams.lm_path,
         alpha=alpha,
         beta=beta,
         cutoff_top_n=50,
         cutoff_prob=0.99,
         beam_width=100,
         blank_id=self.vocab_list.index("_"),
     )
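
The create_decoder(alpha, beta) pattern above suits language-model weight tuning. A hedged grid-search sketch; model, evaluate_wer, and the grid values are assumptions:

# Hypothetical tuning loop over LM weight (alpha) and word bonus (beta).
for alpha in (0.0, 0.5, 1.0, 1.5):
    for beta in (0.0, 0.5, 1.0):
        model.create_decoder(alpha, beta)
        print(alpha, beta, evaluate_wer(model.decoder))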
Example No. 14
 def __init__(self, vocabulary_size, batch_ordering):
     super().__init__()
     # WARNING: don't use chr(0)
     vocabulary_size += 1  # TODO unify blank label stuff
     self.vocabulary = [
         chr(c) for c in list(range(65, 65 + 58)) +
         list(range(65 + 58 + 69, 65 + 58 + 69 + 500))
     ][:vocabulary_size]
     self.decoder = ctcdecode.CTCBeamDecoder(self.vocabulary,
                                             log_probs_input=True,
                                             beam_width=1)
     self.batch_ordering = batch_ordering
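
The chr ranges above simply mint distinct placeholder symbols: codepoints 65-122 run from 'A' through 'z', and the jump to 192 skips the remaining ASCII and Latin-1 punctuation/control blocks, so every class index gets a unique character. A quick sanity check:

vocab = [chr(c) for c in list(range(65, 65 + 58)) +
         list(range(65 + 58 + 69, 65 + 58 + 69 + 500))]
assert len(vocab) == len(set(vocab))  # all placeholder symbols are distinct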
Example No. 15
    def __init__(self, opts, model, criterion, vocabulary, vocab_size,
                 blank_id):
        self.opts = opts
        self.model = model
        self.criterion = criterion
        self.vocab_size = vocab_size
        self.blank_id = blank_id
        self.pad = vocabulary.pad()
        self.unk = vocabulary.unk()
        self.eos = vocabulary.eos()
        self.bos = vocabulary.bos()

        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.criterion = self.criterion.cuda()
            self.model = self.model.cuda()

        self._num_updates = 0

        pretrain_params, attn_params = self.cnn_freeze(opts)
        # params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
        if not opts.freeze_cnn:
            self.optimizer = torch.optim.Adam(
                [{
                    "params": pretrain_params,
                    "lr": self.opts.learning_rate
                }, {
                    "params": attn_params,
                    "lr": self.opts.learning_rate
                }],
                weight_decay=self.opts.weight_decay)
        else:
            self.optimizer = torch.optim.Adam(
                [{
                    "params": pretrain_params,
                    "lr": 0.0
                }, {
                    "params": attn_params,
                    "lr": self.opts.learning_rate
                }],
                weight_decay=self.opts.weight_decay)

        # self._build_optimizer(params, self.opts.optimizer, lr=self.opts.learning_rate,
        #                       momentum=self.opts.momentum, weight_decay=self.opts.weight_decay)
        self.decoder_vocab = [
            chr(x) for x in range(20000, 20000 + self.vocab_size)
        ]
        self.decoder = ctcdecode.CTCBeamDecoder(
            self.decoder_vocab,
            beam_width=self.opts.beam_width,
            blank_id=self.blank_id,
            num_processes=10)
Example No. 16
 def test_subword_beam_search_decoder_batch(self):
     probs_seq = torch.FloatTensor([self.bigram_probs_seq1]) #, self.bigram_probs_seq2])
     decoder = ctcdecode.CTCBeamDecoder(self.subword_vocab_list, beam_width=5,
                                        subword=True, num_processes=24)
     beam_results, beam_scores, timesteps, out_seq_len = decoder.decode(probs_seq)
     output_str1 = self.convert_to_string(beam_results[0][0], self.subword_vocab_list, out_seq_len[0][0])
     #output_str2 = self.convert_to_string(beam_results[1][0], self.vocab_list, out_seq_len[1][0])
     print(beam_results)
     print(beam_scores)
     print(timesteps)
     print(out_seq_len)
     print(output_str1)
     self.assertEqual(output_str1, self.bigram_beam_search_result[0])
Example No. 17
 def __init__(self, config):
     super(TransducerModel, self).__init__()
     self.encoder = Encoder(config)
     self.decoder = AutoregressiveDecoder(config)
     self.joiner = Joiner(config)
     self.blank_index = self.joiner.blank_index
     self.num_outputs = self.joiner.num_outputs
     #self.transducer_loss = Transducer(blank_label=self.blank_index)
     self.ctc_decoder = ctcdecode.CTCBeamDecoder(
         ["a" for _ in range(self.num_outputs)],
         blank_id=self.blank_index,
         beam_width=config.beam_width)
     self.beam_width = config.beam_width
Example No. 18
    def beam_decode(self, preds):
        preds = preds.transpose(0, 1)
        preds = F.softmax(preds, dim=2)
        batch_size = preds.size(0)
        decoder = ctcdecode.CTCBeamDecoder(self.alphabet, beam_width=self.beam_size, blank_id=self.alphabet.index('-'), num_processes=24)
        beam_results, beam_scores, timesteps, out_seq_len = decoder.decode(preds)

        texts = []
        for i in range(batch_size):
            output_str = self.beam_to_string(beam_results[i][0], self.alphabet, out_seq_len[i][0])
            texts.append(output_str)

        # print(texts)
        return texts
Example No. 19
 def __init__(self, labels, lm_path=None, alpha=0, beta=0, cutoff_top_n=20,
              cutoff_prob=1.0, beam_width=4, num_processes=4):
     self.labels = labels
     self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)])
     self.blank_index = self.labels.index("_")
     self.decoder = ctcdecode.CTCBeamDecoder(labels=self.labels,
                                             model_path=lm_path,
                                             alpha=alpha,
                                             beta=beta,
                                             cutoff_top_n=len(self.labels),
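                                             # note: len(self.labels) here overrides the cutoff_top_n argument of __init__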
                                             cutoff_prob=cutoff_prob,
                                             beam_width=beam_width,
                                             num_processes=num_processes,
                                             blank_id=self.blank_index,
                                             log_probs_input=False)
     self.text_transform = TextTransform()
Example No. 20
    def __init__(self, opts, model, criterion, vocabulary, vocab_size,
                 blank_id):
        self.opts = opts
        self.model = model
        self.criterion = criterion
        self.vocab_size = vocab_size
        self.blank_id = blank_id
        self.pad = vocabulary.pad()
        self.unk = vocabulary.unk()
        self.eos = vocabulary.eos()
        self.bos = vocabulary.bos()

        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.criterion = self.criterion.cuda()
            self.model = self.model.cuda()

        self._num_updates = 0

        # params = []
        # for params in self.model.parameters():
        #     if params not in self.model.decoder.parameters():
        #         params.append(params)
        params = list(
            filter(lambda p: p.requires_grad, self.model.parameters()))
        self.optimizer = torch.optim.Adam(params,
                                          lr=self.opts.learning_rate,
                                          weight_decay=self.opts.weight_decay)

        logging.info('| num. module params: {} (num. trained: {})'.format(
            sum(p.numel() for p in params),
            sum(p.numel() for p in params if p.requires_grad),
        ))

        self.dec_generator = IterativeGenerate(vocabulary, model)

        # self._build_optimizer(params, self.opts.optimizer, lr=self.opts.learning_rate,
        #                       momentum=self.opts.momentum, weight_decay=self.opts.weight_decay)
        self.decoder_vocab = [
            chr(x) for x in range(20000, 20000 + self.vocab_size)
        ]
        self.decoder = ctcdecode.CTCBeamDecoder(
            self.decoder_vocab,
            beam_width=self.opts.beam_width,
            blank_id=self.blank_id,
            num_processes=10)
Example No. 21
    def __init__(self,
                 opts,
                 device,
                 vocab_size,
                 vocabulary,
                 dilated_channels=512,
                 num_blocks=1,
                 dilations=[1, 2, 4],
                 dropout=0.0):
        super(DilatedSLRNet, self).__init__()
        self.opts = opts
        self.device = device
        self.vocab_size = vocab_size
        self.in_channels = self.opts.feature_dim
        self.out_channels = dilated_channels
        self.vocab = vocabulary
        self.pad = self.vocab.pad()
        self.eos = self.vocab.eos()
        self.bos = self.vocab.bos()
        self.unk = self.vocab.unk()
        self.blank_id = self.vocab.blank()

        self.num_blocks = num_blocks
        self.dilations = dilations
        self.kernel_size = 3

        self.block_list = nn.ModuleList()
        for i in range(self.num_blocks):
            self.block_list.append(
                DilatedBlock(self.in_channels, self.out_channels,
                             self.kernel_size, self.dilations))
        self.out_conv = nn.Conv1d(self.out_channels,
                                  self.out_channels,
                                  self.kernel_size,
                                  padding=(self.kernel_size - 1) // 2)
        self.act_tanh = nn.Tanh()
        self.fc = nn.Linear(self.out_channels, self.vocab_size)

        self.decoder = LevenshteinTransformerDecoder(opts, vocabulary)
        ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
        self.ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                                    beam_width=opts.beam_width,
                                                    blank_id=self.blank_id,
                                                    num_processes=10)
Example No. 22
 def __init__(self):
     self._train_loader = None
     self._valid_loader = None
     self._device = torch.device(
         "cuda:0" if torch.cuda.is_available() else "cpu")
     self._loss = torch.nn.CTCLoss(blank=0, reduction="none")
     self._label_dict = {
         "_": 0,
         " ": 1,
         "'": 2,
         "A": 3,
         "B": 4,
         "C": 5,
         "D": 6,
         "E": 7,
         "F": 8,
         "G": 9,
         "H": 10,
         "I": 11,
         "J": 12,
         "K": 13,
         "L": 14,
         "M": 15,
         "N": 16,
         "O": 17,
         "P": 18,
         "Q": 19,
         "R": 20,
         "S": 21,
         "T": 22,
         "U": 23,
         "V": 24,
         "W": 25,
         "X": 26,
         "Y": 27,
         "Z": 28,
     }
     self._rev_label_dict = {v: k for k, v in self._label_dict.items()}
     self._decoder = ctcdecode.CTCBeamDecoder(
         labels=[str(c) for c in self._rev_label_dict], beam_width=1)
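
With beam_width=1 the beam search degenerates to greedy (best-path) decoding. A hedged, ctcdecode-free equivalent, assuming log_probs of shape (batch, time, vocab) and blank index 0 as in _label_dict:

import itertools

def greedy_decode(log_probs, rev_label_dict, blank=0):
    out = []
    for seq in log_probs.argmax(dim=2).tolist():  # best class per frame
        # Collapse repeats, then drop blanks -- the standard CTC best path.
        collapsed = [k for k, _ in itertools.groupby(seq) if k != blank]
        out.append("".join(rev_label_dict[i] for i in collapsed))
    return out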
Example No. 23
    def test_simple_decode_different_blank_idx(self):
        aa = torch.FloatTensor(
            np.array([[[0.0, 1.0]], [[0.0, 1.0]], [[1.0, 0.0]], [[0.0, 1.0]],
                      [[0.0, 1.0]]],
                     dtype=np.float32)).log()
        seq_len = torch.IntTensor(np.array([5], dtype=np.int32))

        labels = "_A"
        scorer = ctcdecode.Scorer()
        decoder_nomerge = ctcdecode.CTCBeamDecoder(scorer,
                                                   labels,
                                                   blank_index=0,
                                                   space_index=-1,
                                                   top_paths=1,
                                                   beam_width=1)

        result_nomerge, _, result_nomerge_len, nomerge_alignments, _ = decoder_nomerge.decode(
            aa, seq_len)
        self.assertEqual(result_nomerge_len[0][0], 2)
        self.assertEqual(
            result_nomerge.numpy()[0, 0, :result_nomerge_len[0][0]].tolist(),
            [1, 1])
Example No. 24
 def __init__(self):
   self._param_shapes = None
   self._param_types = None
   self._eval_iters = {}
   self._loss = torch.nn.CTCLoss(blank=0, reduction="none")
   self._label_dict = {
       "_": 0,
       " ": 1,
       "'": 2,
       "A": 3,
       "B": 4,
       "C": 5,
       "D": 6,
       "E": 7,
       "F": 8,
       "G": 9,
       "H": 10,
       "I": 11,
       "J": 12,
       "K": 13,
       "L": 14,
       "M": 15,
       "N": 16,
       "O": 17,
       "P": 18,
       "Q": 19,
       "R": 20,
       "S": 21,
       "T": 22,
       "U": 23,
       "V": 24,
       "W": 25,
       "X": 26,
       "Y": 27,
       "Z": 28,
   }
   self._rev_label_dict = {v: k for k, v in self._label_dict.items()}
   self._decoder = ctcdecode.CTCBeamDecoder(
       labels=[str(c) for c in self._rev_label_dict], beam_width=1)
Example No. 25
def testModel(model, test_loader, device):
    model.to(device)
    model.eval()

    with open('submission_1.txt', 'w') as file:
        with torch.no_grad():
            i = 1
            for batch_idx, (data, data_lengths, label,
                            label_length) in enumerate(test_loader):
                label = torch.tensor(label)
                label_length = torch.tensor(label_length)

                data, data_lengths, label, label_length = \
                    data.to(device), data_lengths.to(device), label.to(device), label_length.to(device)

                outputs, hidden = model(data, data_lengths, label,
                                        label_length)

                # decode
                outputs_soft = outputs.permute(1, 0, 2)

                m = nn.Softmax(dim=2)
                outputs_soft = m(outputs_soft)

                probs_seq = outputs_soft
                decoder = ctcdecode.CTCBeamDecoder(
                    PHONEME_MAP,
                    beam_width=100,
                    blank_id=PHONEME_MAP.index(' '))
                beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(
                    probs_seq)

                output_str = convert_to_string(beam_result[0][0], PHONEME_MAP,
                                               out_seq_len[0][0])

                file.write('\n' + '{}'.format(output_str))
                print(i)
                print('{}'.format(output_str))
                i += 1
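
One caveat: the loop above constructs a fresh CTCBeamDecoder on every batch. Hoisting the construction out of the loop (sketch below) avoids repeating that cost; the decode call inside the loop is unchanged.

# Hypothetical refactor: build the decoder once, before the batch loop.
decoder = ctcdecode.CTCBeamDecoder(PHONEME_MAP,
                                   beam_width=100,
                                   blank_id=PHONEME_MAP.index(' '))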
Example No. 26
    def create_decoder(vocab_list: list, cutoff_top_n: int,
                       beam_size: int,
                       blank_symbol,
                       language_model_parameters: LanguageModelParameters):
        """

        :param vocab_list:
        :param beam_size:
        :param cutoff_top_n:  A parameter that limits the number of vocabulary
                              candidates that are kept by the decoder.
        :param blank_symbol:
        :param language_model_parameters:
        :return:
        """
        if language_model_parameters is not None:

            print("Creating decoder with language model loaded from " +
                  str(language_model_parameters.language_model_file_path))

            decoder = ctcdecode.\
                CTCBeamDecoder(
                    vocab_list, model_path=language_model_parameters.language_model_file_path,
                    cutoff_top_n=cutoff_top_n,
                    beam_width=beam_size, alpha=language_model_parameters.language_model_weight,
                    beta=language_model_parameters.word_insertion_penalty,
                    blank_id=vocab_list.index(blank_symbol),
                    space_symbol=Evaluator.WORD_SEPARATOR_SYMBOL,
                    num_processes=16)
        else:

            decoder = ctcdecode.CTCBeamDecoder(vocab_list, cutoff_top_n=cutoff_top_n,
                                               beam_width=beam_size,
                                               blank_id=vocab_list.index(blank_symbol),
                                               space_symbol=Evaluator.WORD_SEPARATOR_SYMBOL,
                                               num_processes=16)
        return decoder
Example No. 27
def main():
    opts = parse_args()
    init_logging(
        os.path.join(opts.log_dir,
                     '{:s}_win0_win4_log_test.txt'.format(opts.task)))

    if torch.cuda.is_available():
        torch.cuda.set_device(opts.gpu)
        logging.info("Using GPU!")
        device = "cuda"
    else:
        logging.info("Using CPU!")
        device = "cpu"

    logging.info(opts)

    test_datasets = PhoenixVideo(opts.vocab_file,
                                 opts.corpus_dir,
                                 opts.video_path,
                                 phase=opts.task,
                                 DEBUG=opts.DEBUG)
    vocab_size = test_datasets.vocab.num_words
    blank_id = test_datasets.vocab.word2index['<BLANK>']
    vocabulary = Vocabulary(opts.vocab_file)
    #     model = DilatedSLRNet(opts, device, vocab_size, vocabulary,
    #                           dilated_channels=512, num_blocks=5, dilations=[1, 2, 4], dropout=0.0)
    model = MainStream(vocab_size)
    criterion = CtcLoss(opts, blank_id, device, reduction="none")
    trainer = Trainer(opts, model, criterion, vocabulary, vocab_size, blank_id)

    # ctcdecode
    ctc_decoder_vocab = [chr(x) for x in range(20000, 20000 + vocab_size)]
    ctc_decoder = ctcdecode.CTCBeamDecoder(ctc_decoder_vocab,
                                           beam_width=opts.beam_width,
                                           blank_id=blank_id,
                                           num_processes=10)

    if os.path.exists(opts.check_point):
        logging.info("Loading checkpoint file from {}".format(
            opts.check_point))
        epoch, num_updates, loss = trainer.load_checkpoint(opts.check_point)
    else:
        logging.info("No checkpoint file in found in {}".format(
            opts.check_point))
        epoch, num_updates, loss = 0, 0, 0.0

    test_iter = trainer.get_batch_iterator(test_datasets,
                                           batch_size=opts.batch_size,
                                           shuffle=False)
    decoded_dict = {}
    val_err, val_correct, val_count = np.zeros([4]), 0, 0

    with open("Data/output/hypo_ctc.txt",
              "w") as f, open("Data/output/ref_ctc.txt", "w") as f2:
        with torch.no_grad():
            model.eval()
            criterion.eval()
            for samples in tqdm(test_iter):
                samples = trainer._prepare_sample(samples)
                video = samples["data"]
                len_video = samples["len_data"]
                label = samples["label"]
                len_label = samples["len_label"]
                video_id = samples['id']

                logits, _ = model(video, len_video)
                len_video /= 4
                logits = F.softmax(logits, dim=-1)
                pred_seq, _, _, out_seq_len = ctc_decoder.decode(
                    logits, len_video)
                start = 0
                for i, length in enumerate(len_label):
                    end = start + length
                    ref = label[start:end].tolist()
                    hyp = [
                        x[0] for x in groupby(pred_seq[i][0]
                                              [:out_seq_len[i][0]].tolist())
                    ]
                    ref_sent = " ".join(
                        [vocabulary.index2word[r] for r in ref])
                    hyp_sent = " ".join(
                        [vocabulary.index2word[r] for r in hyp])
                    f.write(hyp_sent + "\n")
                    f2.write(ref_sent + "\n")

                    decoded_dict[video_id[i]] = hyp
                    val_correct += int(ref == hyp)
                    err = get_wer_delsubins(ref, hyp)
                    val_err += np.array(err)
                    val_count += 1
                    start = end
                assert end == label.size(0)
            logging.info('-' * 50)
            logging.info('Epoch: {:d}, DEV ACC: {:.5f}, {:d}/{:d}'.format(
                epoch, val_correct / val_count, val_correct, val_count))
            logging.info(
                'Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
                .format(epoch, val_err[0] / val_count, val_err[1] / val_count,
                        val_err[2] / val_count, val_err[3] / val_count))

            list_str_for_test = []
            for k, v in decoded_dict.items():
                start_time = 0
                for wi in v:
                    tl = np.random.random() * 0.1
                    list_str_for_test.append('{} 1 {:.3f} {:.3f} {}\n'.format(
                        k, start_time, start_time + tl,
                        test_datasets.vocab.index2word[wi]))
                    start_time += tl
            tmp_prefix = str(uuid.uuid1())
            txt_file = '{:s}.txt'.format(tmp_prefix)
            result_file = os.path.join('evaluation_relaxation', txt_file)
            with open(result_file, 'w') as fid:
                fid.writelines(list_str_for_test)
            phoenix_eval_err = get_phoenix_wer(txt_file, opts.task, tmp_prefix)
            logging.info(
                '[Relaxation Evaluation] Epoch: {:d}, DEV WER: {:.5f}, SUB: {:.5f}, INS: {:.5f}, DEL: {:.5f}'
                .format(epoch, phoenix_eval_err[0], phoenix_eval_err[1],
                        phoenix_eval_err[2], phoenix_eval_err[3]))
            return phoenix_eval_err
Example No. 28
net = HTRNet(cnn_cfg, rnn_cfg, len(classes))

if load_model_name is not None:
    my_torch_load(net, load_model_name)
net.cuda(args.gpu_id)

loss = warp_ctc.CTCLoss()
net_parameters = net.parameters()
nlr = args.learning_rate
optimizer = torch.optim.Adam(net_parameters, nlr, weight_decay=0.00005)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    [int(.5 * max_epochs), int(.75 * max_epochs)])

decoder = ctcdecode.CTCBeamDecoder([c for c in classes], beam_width=100)
# decoder = ctcdecode.


def train(epoch):
    optimizer.zero_grad()

    closs = []
    for iter_idx, (img, transcr) in enumerate(train_loader):

        img = Variable(img.cuda(gpu_id))
        # cuda augm - alternatively for cpu use it on dataloader
        img = torch_augm(img)
        output = net(img)

        act_lens = torch.IntTensor(img.size(0) * [output.size(0)])
Example No. 29
    def is_keyword_batch(self, input_features, sensitivity, tmp_out_dir=None):
        if tmp_out_dir is None:
            tmp_out_dir = self.out_dir

        # https://stackoverflow.com/questions/15638612/calculating-mean-and-standard-deviation-of-the-data-which-does-not-fit-in-memory
        #
        # _, feat = next(iter(input_features.items()))
        # _dim = feat.shape[-1]
        #
        # n = 0
        # mean = np.zeros((_dim))
        # M2 = np.zeros((_dim))
        #
        # for sample_name, feat in tqdm(input_features.items()):
        #     # for i in range(10):
        #     for i in range(feat.shape[0]):
        #         n += 1
        #         delta = feat[i, :] - mean
        #         mean = mean + (delta / n)
        #         M2 = M2 + (delta ** 2)
        #
        # std = np.sqrt(M2 / (n - 1))
        # mean = torch.from_numpy(mean).to(dtype=torch.float32).unsqueeze(-1)
        # std = torch.from_numpy(std).to(dtype=torch.float32).unsqueeze(-1)

        # test_output = self.test_decoder()

        # plot_phns = metadata_dict is None
        plot_phns = False
        # if plot_phns:
        #     lab_dict = {"lab_mono": {
        #         "label_folder": "/mnt/data/libs/kaldi/egs/librispeech/s5/exp/tri4b_ali_dev_clean_100/",
        #         "label_opts": "ali-to-phones --per-frame=true",
        #         "lab_data_folder": "/mnt/data/libs/kaldi/egs/librispeech/s5/data/dev_clean/",
        #         "lab_graph": "/mnt/data/libs/kaldi/egs/librispeech/s5/exp/tri4b/graph_tgsmall/"
        #     }}
        #     label_index_from = 1
        #     _labels = _load_labels(lab_dict, label_index_from, max_label_length=None, phoneme_dict=self.phoneme_dict)
        #
        #     lab_dict = {"lab_mono": {
        #         "label_folder": "/mnt/data/libs/kaldi/egs/librispeech/s5/exp/tri4b_ali_dev_clean_100/",
        #         "label_opts": "ali-to-phones",
        #         "lab_data_folder": "/mnt/data/libs/kaldi/egs/librispeech/s5/data/dev_clean/",
        #         "lab_graph": "/mnt/data/libs/kaldi/egs/librispeech/s5/exp/tri4b/graph_tgsmall/"
        #     }}
        #     label_index_from = 1
        #     _labels_no_ali = _load_labels(lab_dict, label_index_from, max_label_length=None,
        #                                   phoneme_dict=self.phoneme_dict)

        vocabulary_size = 42
        vocabulary = [
            chr(c) for c in list(range(65, 65 + 58)) +
            list(range(65 + 58 + 69, 65 + 58 + 69 + 500))
        ][:vocabulary_size]
        decoder = ctcdecode.CTCBeamDecoder(vocabulary,
                                           log_probs_input=True,
                                           beam_width=1)

        all_samples_concat = None
        for sample_name, feat in tqdm(input_features.items()):
            if all_samples_concat is None:
                all_samples_concat = feat
            else:
                all_samples_concat = np.concatenate((all_samples_concat, feat))

        mean = torch.from_numpy(np.mean(
            all_samples_concat, axis=0)).to(dtype=torch.float32).unsqueeze(-1)
        std = torch.from_numpy(np.std(
            all_samples_concat, axis=0)).to(dtype=torch.float32).unsqueeze(-1)
        post_files = []

        plot_num = 0

        # len = 88

        # input_batch = []
        # sample_names = []
        # for sample_name in tqdm(input_features, desc="computing acoustic features:"):
        #     input_feature = self.preprocess_feat(input_features[sample_name])
        #     # Normalize over whole chunk instead of only over a single file, which is done by applying the kaldi cmvn
        #     _input_feature = ((input_feature - mean) / std).unsqueeze(1)
        #     if _input_feature.shape[0] < len:
        #         _zeros = torch.zeros((88, 1, 40, 11))
        #         _zeros[-_input_feature.shape[0]:, :, :, :] = _input_feature
        #         _input_feature = _zeros
        #     input_batch.append(_input_feature)
        #     sample_names.append(sample_name)

        # input_batch = {'fbank': torch.cat(input_batch, dim=1)}

        beam_results = {}
        output_label = 'out_phn'
        assert output_label in self.model.out_names
        with KaldiOutputWriter(tmp_out_dir, "keyword", [output_label],
                               self.epoch) as writer:
            post_files.append(writer.post_file[output_label].name)
            for sample_name in tqdm(input_features,
                                    desc="computing acoustic features:",
                                    position=1):
                # input_feature = {"fbank": self.preprocess_feat(input_features[sample_name])}
                input_feature = {
                    "fbank":
                    torch.from_numpy(
                        input_features[sample_name].T).unsqueeze(0)
                }
                # Normalize over whole chunk instead of only over a single file, which is done by applying the kaldi cmvn
                input_feature["fbank"] = ((input_feature["fbank"] - mean) /
                                          std)

                # assert input_feature["fbank"].shape[2] > self.model.context_left + self.model.context_right + 50
                if input_feature["fbank"].shape[
                        2] < self.model.context_left + self.model.context_right + 100:
                    padd = torch.zeros(
                        (input_feature["fbank"].shape[0],
                         input_feature["fbank"].shape[1],
                         self.model.context_left + self.model.context_right),
                        device=input_feature["fbank"].device,
                        dtype=input_feature["fbank"].dtype)
                    input_feature["fbank"] = torch.cat(
                        (padd, input_feature["fbank"]), dim=2)

                output = self.model(input_feature)
                assert output_label in output
                output = output[output_label]

                _logits = output.detach().permute(0, 2, 1)

                output = output.detach().squeeze(0).numpy().T
                # output = test_output

                # if self.config['test'][output_label]['normalize_posteriors']:
                # counts = self.config['dataset']['dataset_definition']['data_info']['labels']['lab_phn']['lab_count']
                # counts = np.array(counts)
                # blank_count = sum(counts)  # heuristic sil * 2 for the moment
                # counts = counts * 0.5
                # counts = np.concatenate((np.array([np.e]), counts))
                # blank_scale = 1.0
                # TODO try different blank_scales 4.0 5.0 6.0 7.0
                # counts[0] /= blank_scale
                # for i in range(1, 8):
                #     counts[i] /= noise_scale #TODO try noise_scale for SIL SPN etc I guess

                # prior = counts / np.sum(counts)

                # output[:, 1:] = output[:, 1:] - np.log(prior)
                # assert _logits.shape[0] == batch_size
                # output = np.exp(output)

                beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(
                    _logits)
                beam_result = beam_result[0, 0, :out_seq_len[0, 0]]
                result_decoded = [
                    self.phoneme_dict.reducedIdx2phoneme[l.item() - 1]
                    for l in beam_result
                ]
                result_decoded = " ".join(result_decoded)

                beam_results[sample_name] = result_decoded

                if plot_num < 20 and plot_phns:
                    # logger.debug(sample_name)

                    # logger.debug(result_decoded)
                    # if plot_phns:
                    #     label_decoded = " ".join(
                    #         [self.phoneme_dict.idx2phoneme[l.item()] for l in _labels_no_ali['lab_mono'][sample_name]])
                    #     logger.debug(label_decoded)

                    # if plot_phns:
                    #     plot_alignment_spectrogram(sample_name, input_feature["fbank"],
                    #                                (np.exp(output).T / np.exp(output).sum(axis=1)).T,
                    #                                self.phoneme_dict, _labels, result_decoded=result_decoded)
                    # else:
                    plot_alignment_spectrogram(sample_name,
                                               input_feature["fbank"],
                                               (np.exp(output).T /
                                                np.exp(output).sum(axis=1)).T,
                                               self.phoneme_dict,
                                               result_decoded=result_decoded)

                    plot_num += 1
                # else:
                #     beam_result, beam_scores, timesteps, out_seq_len = decoder.decode(_logits)
                #     beam_result = beam_result[0, 0, :out_seq_len[0, 0]]
                #     # logger.debug(sample_name)
                #     result_decoded = [self.phoneme_dict.reducedIdx2phoneme[l.item() - 1] for l in beam_result]
                #     result_decoded = " ".join(result_decoded)
                #     # logger.debug(result_decoded)
                #     plot_alignment_spectrogram(sample_name, input_feature["fbank"],
                #                                (np.exp(output).T / np.exp(output).sum(axis=1)).T,
                #                                self.phoneme_dict, metadata_dict[sample_name], result_decoded=result_decoded)
                #
                #     plot_num += 1

                assert len(output.shape) == 2
                assert np.sum(np.isnan(output)) == 0, "NaN in output"
                assert output.shape[1] == len(
                    self.phoneme_dict.reducedIdx2phoneme) + 1
                writer.write_mat(output_label, output.squeeze(), sample_name)

        # self.config['decoding']['scoring_type'] = 'just_transcript'
        #### DECODING ####
        logger.debug("Decoding...")
        result = decode_ctc(**self.config['dataset']['dataset_definition']
                            ['decoding'],
                            words_path=self.words_path,
                            graph_path=self.graph_path,
                            out_folder=tmp_out_dir,
                            featstrings=post_files)

        # TODO filter result

        return result
Example No. 30
    def __init__(self,
                 lexicon=None,
                 backend='resnet18',
                 base_model_dir=None,
                 rnn_hidden_size=128,
                 rnn_num_layers=2,
                 rnn_dropout=0,
                 seq_proj=[0, 0],
                 do_beam_search=False,
                 dropout_conv=False,
                 dropout_rnn=False,
                 dropout_output=False,
                 cuda=True,
                 do_ema=False,
                 ada_after_rnn=False,
                 ada_before_rnn=False):
        super().__init__()
        self.lexicon = lexicon
        print(lexicon)
        self.do_beam_search = do_beam_search
        self.num_classes = len(self.lexicon)
        self.ada_after_rnn = ada_after_rnn
        self.ada_before_rnn = ada_before_rnn

        self.feature_extractor = getattr(my_models,
                                         backend)(pretrained=True,
                                                  model_dir=base_model_dir)
        self.cnn = nn.Sequential(
            self.feature_extractor.conv1,
            self.feature_extractor.bn1,
            self.feature_extractor.relu,
            nn.MaxPool2d(kernel_size=(3, 1), stride=(2, 1), padding=(1, 0)),
            self.feature_extractor.layer1,
            self.feature_extractor.layer2,
            nn.MaxPool2d(kernel_size=(3, 1), stride=(2, 1), padding=(1, 0)),
            self.feature_extractor.layer3,
            #self.feature_extractor.layer4,
            nn.MaxPool2d(kernel_size=(3, 1), stride=(2, 1), padding=(1, 0)))

        self.dropout_conv = dropout_conv
        self.dropout_rnn = dropout_rnn
        self.dropout_output = dropout_output
        self.dropout2d = nn.Dropout2d(p=0.5)
        self.dropout1d = nn.Dropout(p=0.5)

        self.fully_conv = True  #seq_proj[0] == 0
        if not self.fully_conv:
            self.proj = nn.Conv2d(seq_proj[0], seq_proj[1], kernel_size=1)

        self.rnn_hidden_size = rnn_hidden_size
        self.rnn_num_layers = rnn_num_layers
        if self.dropout_rnn:
            self.rnn = nn.GRU(self.get_block_size(self.cnn),
                              rnn_hidden_size,
                              rnn_num_layers,
                              batch_first=False,
                              bidirectional=True,
                              dropout=0.5)
        else:
            # without dropout_rnn, disable inter-layer dropout
            self.rnn = nn.GRU(self.get_block_size(self.cnn),
                              rnn_hidden_size,
                              rnn_num_layers,
                              batch_first=False,
                              bidirectional=True,
                              dropout=0)

        self.linear = nn.Linear(rnn_hidden_size * 2, self.num_classes + 1)
        self.softmax = nn.Softmax(dim=2)

        for i in range(20):
            length = random.randint(50, 300)
            height1, width1 = self._get_output_ratio(length)

            width2 = calc_im_seq_len(length)
            if (width2 != width1):
                raise Exception(
                    "error, orig width is: {} ; width through network is: {} ; calculated width is: {} ."
                    .format(length, width1, width2))
            if height1 != 1:
                raise Exception(
                    "hight after network should be one, but is: {}".format(
                        height1))

        if self.do_beam_search:
            sorted_letters = [
                item[1]
                for item in sorted(lexicon.items(), key=operator.itemgetter(0))
            ]
            sorted_keys = [
                item[0]
                for item in sorted(lexicon.items(), key=operator.itemgetter(0))
            ]
            #print(sorted_keys)
            #print(sorted_letters)
            self.label_str = ['_'] + sorted_letters

            #print(label_str)

            print('vocab size is: {}'.format(len(self.label_str)))
            self.beam_decode = ctcdecode.CTCBeamDecoder(self.label_str,
                                                        blank_id=0,
                                                        beam_width=20)

        if cuda:
            self.cuda()
        if do_ema:
            self.avg_param = self.copy_model_params()  # initialize
            if cuda:
                for i in range(len(self.avg_param)):
                    self.avg_param[i].cuda()

        if ada_after_rnn:
            self.domain_classifier_rnn = nn.Sequential()
            self.domain_classifier_rnn.add_module(
                'd_fc1', nn.Linear(rnn_hidden_size * 2, 100))
            self.domain_classifier_rnn.add_module('d_bn1', nn.BatchNorm1d(100))
            self.domain_classifier_rnn.add_module('d_relu1', nn.ReLU(True))
            self.domain_classifier_rnn.add_module('d_fc2', nn.Linear(100, 2))
            self.domain_classifier_rnn.add_module('d_softmax', nn.LogSoftmax())

        if ada_before_rnn:
            self.domain_classifier_cnn = nn.Sequential()
            self.domain_classifier_cnn.add_module(
                'd_fc1', nn.Linear(self.get_block_size(self.cnn), 100))
            self.domain_classifier_cnn.add_module('d_bn1', nn.BatchNorm1d(100))
            self.domain_classifier_cnn.add_module('d_relu1', nn.ReLU(True))
            self.domain_classifier_cnn.add_module('d_fc2', nn.Linear(100, 2))
            self.domain_classifier_cnn.add_module('d_softmax', nn.LogSoftmax())
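
Finally, a hedged sketch of how the beam_decode handle above might be used at inference; the (time, batch, classes) logits layout is an assumption:

# Hypothetical usage -- index 0 of label_str is the '_' blank added above.
probs = self.softmax(logits).permute(1, 0, 2)   # -> (batch, time, classes)
beams, scores, timesteps, out_lens = self.beam_decode.decode(probs)
text = "".join(self.label_str[i] for i in beams[0, 0, :out_lens[0, 0]].tolist())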