示例#1
0
    def test_evaluate(self):
        """Compare the evaluated and the actual log-likelihood change.

        Word 'd' is moved to class 1 (or to class 0 if it is already in
        class 1). The value returned by ``_evaluate()`` is expected to equal
        the change in ``log_likelihood()`` caused by ``_move()``, and the
        NumPy and Theano optimizers are expected to agree at every step.
        """
        np_opt = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        th_opt = TheanoBigramOptimizer(self.statistics, self.vocabulary)
        word_id = np_opt.get_word_id('d')
        source_class = np_opt.get_word_class(word_id)
        target_class = 0 if source_class == 1 else 1

        # Both implementations must start from the same log likelihood.
        ll_before = np_opt.log_likelihood()
        th_ll_before = th_opt.log_likelihood()
        self.assertTrue(numpy.isclose(ll_before, th_ll_before))

        # Both implementations must predict the same change for the move.
        predicted_diff = np_opt._evaluate(word_id, target_class)
        th_predicted_diff = th_opt._evaluate(word_id, target_class)
        self.assertTrue(numpy.isclose(predicted_diff, th_predicted_diff))

        # Performing the move must change the log likelihood by exactly the
        # predicted amount.
        np_opt._move(word_id, target_class)
        ll_after = np_opt.log_likelihood()
        self.assertFalse(numpy.isclose(ll_before, ll_after))
        self.assertTrue(numpy.isclose(ll_before + predicted_diff, ll_after))

        # The Theano optimizer must end up at the same log likelihood.
        th_opt._move(word_id, target_class)
        th_ll_after = th_opt.log_likelihood()
        self.assertTrue(numpy.isclose(ll_after, th_ll_after))
示例#2
0
    def test_move_and_back(self):
        """Move word 'd' to another class and back, checking the counts.

        After the first move the NumPy and Theano optimizers must be equal,
        exactly two class counts must differ from the original values, at
        least one entry of each of the other count arrays must differ, and
        the sum of every count array must be unchanged. After moving the
        word back, all count arrays must equal their original values.
        """
        numpy_optimizer = NumpyBigramOptimizer(self.statistics,
                                               self.vocabulary)
        theano_optimizer = TheanoBigramOptimizer(self.statistics,
                                                 self.vocabulary)

        # Snapshot the NumPy optimizer's count arrays before any move.
        tracked = ('_class_counts', '_cc_counts', '_cw_counts', '_wc_counts')
        snapshot = {attr: numpy.copy(getattr(numpy_optimizer, attr))
                    for attr in tracked}

        word_id = self.vocabulary.word_to_id['d']
        orig_class_id = numpy_optimizer.get_word_class(word_id)
        new_class_id = 4 if orig_class_id == 3 else 3
        for optimizer in (numpy_optimizer, theano_optimizer):
            optimizer._move(word_id, new_class_id)

        self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)
        for attr in tracked:
            before = snapshot[attr]
            after = getattr(numpy_optimizer, attr)
            num_changed = numpy.count_nonzero(after != before)
            if attr == '_class_counts':
                # Only the source and the target class counts change.
                self.assertEqual(num_changed, 2)
            else:
                self.assertGreater(num_changed, 0)
            # Moving a word redistributes counts without changing the total.
            self.assertEqual(numpy.sum(after), numpy.sum(before))

        for optimizer in (numpy_optimizer, theano_optimizer):
            optimizer._move(word_id, orig_class_id)

        self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)
        for attr in tracked:
            restored = getattr(numpy_optimizer, attr)
            self.assertTrue(numpy.array_equal(restored, snapshot[attr]))
示例#3
0
    def test_move_and_back(self):
        """Verify that moving word 'd' away and back restores all counts.

        The NumPy and Theano optimizers perform the same two moves and are
        compared after each one. The NumPy optimizer's count arrays are
        additionally compared against copies taken before the first move.
        """
        numpy_optimizer = NumpyBigramOptimizer(self.statistics,
                                               self.vocabulary)
        theano_optimizer = TheanoBigramOptimizer(self.statistics,
                                                 self.vocabulary)

        def current(name):
            # Read the named count array from the NumPy optimizer.
            return getattr(numpy_optimizer, name)

        # (attribute name, copy of the original array), in checking order.
        originals = [
            (name, numpy.copy(current(name)))
            for name in ('_class_counts', '_cc_counts',
                         '_cw_counts', '_wc_counts')
        ]

        word_id = self.vocabulary.word_to_id['d']
        orig_class_id = numpy_optimizer.get_word_class(word_id)
        if orig_class_id != 3:
            new_class_id = 3
        else:
            new_class_id = 4

        numpy_optimizer._move(word_id, new_class_id)
        theano_optimizer._move(word_id, new_class_id)
        self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)

        for name, original in originals:
            moved = current(name)
            differing = numpy.count_nonzero(moved != original)
            if name == '_class_counts':
                # Exactly two class counts are expected to differ.
                self.assertEqual(differing, 2)
            else:
                self.assertGreater(differing, 0)
            # The total count must be preserved by the move.
            self.assertEqual(numpy.sum(moved), numpy.sum(original))

        numpy_optimizer._move(word_id, orig_class_id)
        theano_optimizer._move(word_id, orig_class_id)
        self.assert_optimizers_equal(numpy_optimizer, theano_optimizer)

        for name, original in originals:
            self.assertTrue(numpy.array_equal(current(name), original))
示例#4
0
    def test_move_and_recompute(self):
        """Check that ``_move()`` agrees with recomputing the statistics.

        The class of word 'd' is changed in three ways:

        1. By writing the new class directly into ``_word_to_class`` of a
           NumPy optimizer and recomputing the counts with
           ``_compute_class_statistics()``. The result is also verified
           against counts accumulated with plain loops in this test.
        2. By calling ``_move()`` on a second NumPy optimizer.
        3. By calling ``_move()`` on a Theano optimizer.

        The count arrays from 1 and 2 must be equal, and the optimizers
        from 2 and 3 must be equal.
        """
        optimizer1 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        word_id = self.vocabulary.word_to_id['d']
        orig_class_id = optimizer1.get_word_class(word_id)
        new_class_id = 3 if orig_class_id != 3 else 4

        # Reassign the word by hand and recompute all class statistics.
        optimizer1._word_to_class[word_id] = new_class_id
        counts = optimizer1._compute_class_statistics(
            optimizer1._word_counts, optimizer1._ww_counts,
            optimizer1._word_to_class)

        # Accumulate the expected counts with straightforward loops, as a
        # reference for the values returned above.
        num_classes = optimizer1.num_classes
        vocabulary_size = optimizer1.vocabulary_size
        class_counts = numpy.zeros(num_classes, 'int32')
        cc_counts = numpy.zeros((num_classes, num_classes), dtype='int32')
        cw_counts = numpy.zeros((num_classes, vocabulary_size),
                                dtype='int32')
        wc_counts = numpy.zeros((vocabulary_size, num_classes),
                                dtype='int32')
        for wid, cid in enumerate(optimizer1._word_to_class):
            class_counts[cid] += optimizer1._word_counts[wid]
        for left_wid, right_wid in zip(*optimizer1._ww_counts.nonzero()):
            count = optimizer1._ww_counts[left_wid, right_wid]
            left_cid = optimizer1._word_to_class[left_wid]
            right_cid = optimizer1._word_to_class[right_wid]
            cc_counts[left_cid, right_cid] += count
            cw_counts[left_cid, right_wid] += count
            wc_counts[left_wid, right_cid] += count
        self.assertTrue(numpy.array_equal(class_counts, counts[0]))
        self.assertTrue(numpy.array_equal(cc_counts, counts[1]))
        self.assertTrue(numpy.array_equal(cw_counts, counts[2]))
        self.assertTrue(numpy.array_equal(wc_counts, counts[3]))
        optimizer1._class_counts = counts[0]
        optimizer1._cc_counts = counts[1]
        optimizer1._cw_counts = counts[2]
        optimizer1._wc_counts = counts[3]

        # The incremental update must give the same counts as recomputing.
        optimizer2 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        optimizer2._move(word_id, new_class_id)

        self.assertTrue(numpy.array_equal(optimizer1._class_counts,
                                          optimizer2._class_counts))
        self.assertTrue(numpy.array_equal(optimizer1._cc_counts,
                                          optimizer2._cc_counts))
        self.assertTrue(numpy.array_equal(optimizer1._cw_counts,
                                          optimizer2._cw_counts))
        self.assertTrue(numpy.array_equal(optimizer1._wc_counts,
                                          optimizer2._wc_counts))

        # The Theano implementation must match the NumPy implementation.
        optimizer3 = TheanoBigramOptimizer(self.statistics, self.vocabulary)
        optimizer3._move(word_id, new_class_id)

        self.assert_optimizers_equal(optimizer2, optimizer3)
示例#5
0
    def test_move_and_recompute(self):
        """Check that ``_move()`` agrees with recomputing the statistics.

        Word 'd' is assigned to a new class by writing directly into
        ``_word_to_class`` of one NumPy optimizer, after which the counts
        are recomputed with ``_compute_class_statistics()`` and verified
        against counts accumulated with plain loops in this test. The same
        class change is then made with ``_move()`` on a second NumPy
        optimizer and on a Theano optimizer. The recomputed counts must
        equal those of the second NumPy optimizer, which in turn must equal
        the Theano optimizer.
        """
        optimizer1 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        word_id = self.vocabulary.word_to_id['d']
        orig_class_id = optimizer1.get_word_class(word_id)
        new_class_id = 3 if orig_class_id != 3 else 4

        # Change the class by hand and recompute the statistics from scratch.
        optimizer1._word_to_class[word_id] = new_class_id
        counts = optimizer1._compute_class_statistics(
            optimizer1._word_counts,
            optimizer1._ww_counts,
            optimizer1._word_to_class)

        # Reference counts, accumulated one word / word pair at a time.
        class_counts = numpy.zeros(optimizer1.num_classes, 'int32')
        cc_counts = numpy.zeros(
            (optimizer1.num_classes, optimizer1.num_classes), dtype='int32')
        cw_counts = numpy.zeros(
            (optimizer1.num_classes, optimizer1.vocabulary_size),
            dtype='int32')
        wc_counts = numpy.zeros(
            (optimizer1.vocabulary_size, optimizer1.num_classes),
            dtype='int32')
        for wid, cid in enumerate(optimizer1._word_to_class):
            class_counts[cid] += optimizer1._word_counts[wid]
        for left_wid, right_wid in zip(*optimizer1._ww_counts.nonzero()):
            count = optimizer1._ww_counts[left_wid, right_wid]
            left_cid = optimizer1._word_to_class[left_wid]
            right_cid = optimizer1._word_to_class[right_wid]
            cc_counts[left_cid, right_cid] += count
            cw_counts[left_cid, right_wid] += count
            wc_counts[left_wid, right_cid] += count

        expected = (class_counts, cc_counts, cw_counts, wc_counts)
        for reference, recomputed in zip(expected, counts):
            self.assertTrue(numpy.array_equal(reference, recomputed))
        optimizer1._class_counts = counts[0]
        optimizer1._cc_counts = counts[1]
        optimizer1._cw_counts = counts[2]
        optimizer1._wc_counts = counts[3]

        # The incremental update must give the same counts as recomputing.
        optimizer2 = NumpyBigramOptimizer(self.statistics, self.vocabulary)
        optimizer2._move(word_id, new_class_id)

        self.assertTrue(numpy.array_equal(optimizer1._class_counts,
                                          optimizer2._class_counts))
        self.assertTrue(numpy.array_equal(optimizer1._cc_counts,
                                          optimizer2._cc_counts))
        self.assertTrue(numpy.array_equal(optimizer1._cw_counts,
                                          optimizer2._cw_counts))
        self.assertTrue(numpy.array_equal(optimizer1._wc_counts,
                                          optimizer2._wc_counts))

        # The Theano implementation must match the NumPy implementation.
        optimizer3 = TheanoBigramOptimizer(self.statistics, self.vocabulary)
        optimizer3._move(word_id, new_class_id)

        self.assert_optimizers_equal(optimizer2, optimizer3)
示例#6
0
    def test_evaluate(self):
        """Check ``_evaluate()`` against the real effect of ``_move()``.

        For word 'd' and a class different from its current one, the
        log-likelihood difference returned by ``_evaluate()`` must match
        between the NumPy and Theano optimizers, and adding it to the
        original log likelihood must give the log likelihood observed after
        the move.
        """
        numpy_optimizer = NumpyBigramOptimizer(self.statistics,
                                               self.vocabulary)
        theano_optimizer = TheanoBigramOptimizer(self.statistics,
                                                 self.vocabulary)
        word_id = numpy_optimizer.get_word_id('d')
        orig_class_id = numpy_optimizer.get_word_class(word_id)
        if orig_class_id != 1:
            new_class_id = 1
        else:
            new_class_id = 0

        def assert_close(left, right):
            # Both values are expected to be numerically close.
            self.assertTrue(numpy.isclose(left, right))

        orig_ll = numpy_optimizer.log_likelihood()
        assert_close(orig_ll, theano_optimizer.log_likelihood())

        ll_diff = numpy_optimizer._evaluate(word_id, new_class_id)
        assert_close(ll_diff,
                     theano_optimizer._evaluate(word_id, new_class_id))

        numpy_optimizer._move(word_id, new_class_id)
        new_ll = numpy_optimizer.log_likelihood()
        # The move must actually change the log likelihood ...
        self.assertFalse(numpy.isclose(orig_ll, new_ll))
        # ... by the amount that was predicted.
        assert_close(orig_ll + ll_diff, new_ll)

        theano_optimizer._move(word_id, new_class_id)
        assert_close(new_ll, theano_optimizer.log_likelihood())