def test_score_sequence(self):
    """Check ``TextScorer.score_sequence()`` under three ``<unk>`` settings.

    Each scenario scores the six-element sequence ``0..5`` and compares the
    returned log probability with ``sum(log(word_id / 5))`` over the
    positions that are expected to contribute.

    NOTE(review): the ``word_id / 5`` expectation presumably mirrors how
    the ``self.dummy_network`` fixture assigns probabilities -- confirm
    against the fixture set-up.
    """
    # Scenario 1: default scorer. Positions 1..5 all contribute; the
    # expectation does not include position 0.
    scorer = TextScorer(self.dummy_network)
    ids = numpy.arange(6)
    classes = numpy.arange(6)
    memberships = numpy.ones(6, dtype='float32')
    actual = scorer.score_sequence(ids, classes, memberships)
    expected = numpy.log(ids[1:].astype('float32') / 5).sum()
    self.assertAlmostEqual(actual, expected, places=5)

    # Scenario 2: ignore_unk=True. Position 3 is overwritten with the
    # <unk> id and is expected to be left out of the sum.
    scorer = TextScorer(self.dummy_network, ignore_unk=True)
    ids = numpy.arange(6)
    ids[3] = self.vocabulary.word_to_id['<unk>']
    classes = numpy.arange(6)
    memberships = numpy.ones(6, dtype='float32')
    actual = scorer.score_sequence(ids, classes, memberships)
    known_positions = [1, 2, 4, 5]
    expected = numpy.log(ids[known_positions].astype('float32') / 5).sum()
    self.assertAlmostEqual(actual, expected, places=5)

    # Scenario 3: unk_penalty=-5. The <unk> position is expected to add
    # the constant -5 on top of the log probabilities of the other words.
    scorer = TextScorer(self.dummy_network,
                        ignore_unk=False,
                        unk_penalty=-5)
    ids = numpy.arange(6)
    ids[3] = self.vocabulary.word_to_id['<unk>']
    classes = numpy.arange(6)
    memberships = numpy.ones(6, dtype='float32')
    actual = scorer.score_sequence(ids, classes, memberships)
    known_positions = [1, 2, 4, 5]
    expected = numpy.log(
        ids[known_positions].astype('float32') / 5).sum() - 5
    self.assertAlmostEqual(actual, expected, places=5)
def test_score_sequence(self):
    """Check ``TextScorer.score_sequence()`` with and without a shortlist.

    Each scenario scores the 15-element sequence ``0..14``, with class IDs
    taken from ``self.vocabulary.get_class_memberships()`` and all
    membership probabilities set to one. The expected log probability is
    built from ``word_id / 100`` with scenario-specific overrides.

    NOTE(review): the ``word_id / 100`` values, and the treatment of word
    IDs 12-14 as ``<unk>`` / out-of-shortlist words, presumably mirror the
    ``self.dummy_network`` and ``self.vocabulary`` fixtures -- confirm
    against the fixture set-up.
    """
    # Scenario 1: shortlist disabled. The network predicts the <unk>
    # probability; the expectation uses 12/100 for word IDs 13 and 14
    # (indices 12 and 13 after dropping position 0).
    scorer = TextScorer(self.dummy_network, use_shortlist=False)
    ids = numpy.arange(15)
    classes, _ = self.vocabulary.get_class_memberships(ids)
    memberships = numpy.ones(ids.shape, dtype='float32')
    actual = scorer.score_sequence(ids, classes, memberships)
    probs = ids[1:].astype('float32') / 100.0
    probs[12:14] = 12.0 / 100.0
    expected = numpy.log(probs).sum()
    self.assertAlmostEqual(actual, expected, places=4)

    # Scenario 2: shortlist enabled. The <unk> probability is distributed
    # among out-of-shortlist words according to word frequency (weights
    # 0.3 and 0.7 in the expectation).
    scorer = TextScorer(self.dummy_network, use_shortlist=True)
    ids = numpy.arange(15)
    classes, _ = self.vocabulary.get_class_memberships(ids)
    memberships = numpy.ones(ids.shape, dtype='float32')
    actual = scorer.score_sequence(ids, classes, memberships)
    probs = ids[1:].astype('float32') / 100.0
    # <unk> is ignored: a probability of one adds log(1) = 0 to the sum.
    probs[11] = 1.0
    probs[12] = 12.0 / 100.0 * 0.3
    probs[13] = 12.0 / 100.0 * 0.7
    expected = numpy.log(probs).sum()
    self.assertAlmostEqual(actual, expected, places=5)

    # Scenario 3: exclude_unk=True. OOV and OOS words are excluded from
    # the result, so only word IDs 1..11 enter the expectation.
    scorer = TextScorer(self.dummy_network,
                        use_shortlist=False,
                        exclude_unk=True)
    ids = numpy.arange(15)
    classes, _ = self.vocabulary.get_class_memberships(ids)
    memberships = numpy.ones(ids.shape, dtype='float32')
    actual = scorer.score_sequence(ids, classes, memberships)
    expected = numpy.log(ids[1:12].astype('float32') / 100.0).sum()
    self.assertAlmostEqual(actual, expected, places=5)