Exemplo n.º 1
0
 def test_as_training_data_error(self):
     """Check that ``as_training_data`` raises ``ValueError`` for bad input.

     Two cases are covered: a feature built with ``None`` as its label, and
     the fixture feature called with the mode string ``"words+character"``.
     """
     # Construct the unlabeled feature outside the ``assertRaises`` block so
     # that only the ``as_training_data`` call can satisfy the expectation;
     # a ``ValueError`` raised by the constructor must not pass this test.
     unlabeled_feature = IndexedPairFeature(
         [IndexedFeatureWord(1, [1, 2]),
          IndexedFeatureWord(4, [1, 2, 6])],
         [IndexedFeatureWord(1, [1, 2]),
          IndexedFeatureWord(3, [5])],
         None)
     with self.assertRaises(ValueError):
         unlabeled_feature.as_training_data()

     with self.assertRaises(ValueError):
         self.feature.as_training_data(mode="words+character")
Exemplo n.º 2
0
 def setUp(self):
     """Build the pair feature shared by the tests in this case.

     The first word list has five entries, the second has four, and the
     label handed to the constructor is ``[0, 1]``.
     """
     super(TestIndexedPairFeature, self).setUp()
     first_words = [
         IndexedFeatureWord(1, [1, 2]),
         IndexedFeatureWord(2, [3, 4]),
         IndexedFeatureWord(3, [5]),
         IndexedFeatureWord(5, [1, 4, 1]),
         IndexedFeatureWord(4, [1, 2, 6]),
     ]
     second_words = [
         IndexedFeatureWord(1, [1, 2]),
         IndexedFeatureWord(8, [3, 1, 2, 1]),
         IndexedFeatureWord(2, [3, 4]),
         IndexedFeatureWord(3, [5]),
     ]
     pair_label = [0, 1]
     self.feature = IndexedPairFeature(first_words, second_words, pair_label)
Exemplo n.º 3
0
 def setUp(self):
     """Create two pair features and wrap them in an ``IndexedDataset``."""
     super(TestIndexedDataset, self).setUp()

     # Factories rather than shared objects: every position in every word
     # list receives its own ``IndexedFeatureWord`` instance, exactly as a
     # fully spelled-out literal would.
     def word_one():
         return IndexedFeatureWord(1, [1, 5])

     def word_two():
         return IndexedFeatureWord(2, [2, 1])

     def word_three():
         return IndexedFeatureWord(3, [1, 4, 1])

     pair_a = IndexedPairFeature(
         [word_one(), word_two(), word_three()],
         [word_two(), word_three()],
         [0, 1])
     pair_b = IndexedPairFeature(
         [word_three(), word_one()],
         [word_three(), word_one(), word_three(), word_two()],
         [1, 0])

     self.features = [pair_a, pair_b]
     self.indexed_dataset = IndexedDataset(self.features)
Exemplo n.º 4
0
 def test_sort(self):
     """Check that ``IndexedDataset.sort`` reorders ``features`` in place.

     The expected list is compared against ``self.indexed_dataset.features``
     both before sorting (must differ) and after sorting (must match).
     """
     # Expected post-sort order: the pair labelled [1, 0] first, then the
     # pair labelled [0, 1].
     sorted_features = [
         IndexedPairFeature(
             [IndexedFeatureWord(3, [1, 4, 1]),
              IndexedFeatureWord(1, [1, 5])],
             [IndexedFeatureWord(3, [1, 4, 1]),
              IndexedFeatureWord(1, [1, 5]),
              IndexedFeatureWord(3, [1, 4, 1]),
              IndexedFeatureWord(2, [2, 1])],
             [1, 0]),
         IndexedPairFeature(
             [IndexedFeatureWord(1, [1, 5]),
              IndexedFeatureWord(2, [2, 1]),
              IndexedFeatureWord(3, [1, 4, 1])],
             [IndexedFeatureWord(2, [2, 1]),
              IndexedFeatureWord(3, [1, 4, 1])],
             [0, 1]),
     ]
     # Guard against a vacuous pass: the dataset must not already be sorted.
     self.assertNotEqual(sorted_features, self.indexed_dataset.features)
     self.indexed_dataset.sort()
     # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
     # no longer exists in Python 3.12+.
     self.assertEqual(sorted_features, self.indexed_dataset.features)
Exemplo n.º 5
0
    def test_less_than(self):
        """Exercise ``IndexedPairFeature.__lt__`` on three unlabeled pairs."""
        base_pair = IndexedPairFeature(
            [
                IndexedFeatureWord(1, [1, 2]),
                IndexedFeatureWord(4, [1, 2, 6]),
            ],
            [
                IndexedFeatureWord(1, [1, 2]),
                IndexedFeatureWord(3, [5]),
            ],
            None,
        )
        # Same first word list as ``base_pair``; only the leading word of the
        # second list differs.
        other_pair = IndexedPairFeature(
            [
                IndexedFeatureWord(1, [1, 2]),
                IndexedFeatureWord(4, [1, 2, 6]),
            ],
            [
                IndexedFeatureWord(2, [2, 2]),
                IndexedFeatureWord(3, [5]),
            ],
            None,
        )
        # One word per list.
        short_pair = IndexedPairFeature(
            [IndexedFeatureWord(1, [1, 2])],
            [IndexedFeatureWord(1, [2, 2])],
            None,
        )

        # Direct dunder calls: comparing against a non-feature value and the
        # reversed comparison must both come back falsy.
        self.assertFalse(base_pair.__lt__(0))
        self.assertFalse(other_pair.__lt__(base_pair))

        self.assertLess(base_pair, other_pair)
        self.assertLess(short_pair, other_pair)
Exemplo n.º 6
0
    def test_equals(self):
        """Exercise ``IndexedPairFeature`` equality on matching and differing pairs.

        ``feature_1`` and ``feature_2`` are built from identical arguments;
        ``feature_3`` differs in one word of the first list and ``feature_4``
        has a single word per list.
        """
        feature_1 = IndexedPairFeature(
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(4, [1, 2, 6])],
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(3, [5])], None)

        feature_2 = IndexedPairFeature(
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(4, [1, 2, 6])],
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(3, [5])], None)

        feature_3 = IndexedPairFeature(
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(1, [2, 2])],
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(3, [5])], None)
        feature_4 = IndexedPairFeature([IndexedFeatureWord(1, [1, 2])],
                                       [IndexedFeatureWord(1, [2, 2])], None)

        # ``assertEqual`` / ``assertNotEqual`` replace the deprecated
        # ``assertEquals`` / ``assertNotEquals`` aliases, which no longer
        # exist in Python 3.12+.
        self.assertNotEqual(feature_1, feature_4)
        self.assertNotEqual(feature_1, feature_3)
        # Direct dunder call: comparing with a non-feature value is falsy.
        self.assertFalse(feature_1.__eq__(0))
        self.assertEqual(feature_1, feature_2)
Exemplo n.º 7
0
    def test_as_testing_data(self):
        """Check ``IndexedDataset.as_testing_data`` in every accepted mode.

        Two unlabeled pair features are padded to the dataset's own maximum
        lengths, then the default, ``"character"`` and ``"word+character"``
        outputs are compared against hand-written arrays. Labels must be
        empty each time, and the mode ``"char"`` must raise ``ValueError``.
        """
        # Factories so that every position holds a distinct word object.
        def word_a():
            return IndexedFeatureWord(1, [1, 4, 4])

        def word_b():
            return IndexedFeatureWord(2, [2, 3])

        def word_c():
            return IndexedFeatureWord(3, [5, 1])

        unlabeled_pairs = [
            IndexedPairFeature([word_a(), word_b(), word_c()],
                               [word_b(), word_c()],
                               None),
            IndexedPairFeature([word_c(), word_a()],
                               [word_c(), word_a(), word_c(), word_b()],
                               None),
        ]
        dataset = IndexedDataset(unlabeled_pairs)
        dataset.pad_features(dataset.max_lengths())

        # Expected (first, second) arrays per feature, in dataset order.
        expected_words = [
            (np.array([1, 2, 3, 0]), np.array([2, 3, 0, 0])),
            (np.array([3, 1, 0, 0]), np.array([3, 1, 3, 2])),
        ]
        expected_chars = [
            (np.array([[1, 4, 4], [2, 3, 0], [5, 1, 0], [0, 0, 0]]),
             np.array([[2, 3, 0], [5, 1, 0], [0, 0, 0], [0, 0, 0]])),
            (np.array([[5, 1, 0], [1, 4, 4], [0, 0, 0], [0, 0, 0]]),
             np.array([[5, 1, 0], [1, 4, 4], [5, 1, 0], [2, 3, 0]])),
        ]

        # Default mode: one word-index array per sentence.
        inputs, labels = dataset.as_testing_data()
        assert len(labels) == 0
        for idx, (want_first, want_second) in enumerate(expected_words):
            got_first, got_second = inputs[idx]
            assert_allclose(got_first, want_first)
            assert_allclose(got_second, want_second)

        # Character mode: one character-index matrix per sentence.
        inputs, labels = dataset.as_testing_data(mode="character")
        assert len(labels) == 0
        for idx, (want_first, want_second) in enumerate(expected_chars):
            got_first, got_second = inputs[idx]
            assert_allclose(got_first, want_first)
            assert_allclose(got_second, want_second)

        # Combined mode: (words, characters) for the first sentence followed
        # by (words, characters) for the second.
        inputs, labels = dataset.as_testing_data(mode="word+character")
        assert len(labels) == 0
        for idx in range(len(expected_words)):
            (got_first_words, got_first_chars,
             got_second_words, got_second_chars) = inputs[idx]
            want_first_words, want_second_words = expected_words[idx]
            want_first_chars, want_second_chars = expected_chars[idx]
            assert_allclose(got_first_words, want_first_words)
            assert_allclose(got_second_words, want_second_words)
            assert_allclose(got_first_chars, want_first_chars)
            assert_allclose(got_second_chars, want_second_chars)

        with self.assertRaises(ValueError):
            dataset.as_testing_data(mode="char")
Exemplo n.º 8
0
class TestIndexedPairFeature(DuplicateTestCase):
    """Tests for ``IndexedPairFeature``.

    Covers length reporting, padding and truncation, conversion to training
    and testing arrays, and the equality / less-than comparisons.
    """

    def setUp(self):
        """Build the shared fixture: a five-word and a four-word list with label ``[0, 1]``."""
        super(TestIndexedPairFeature, self).setUp()
        self.feature = IndexedPairFeature([
            IndexedFeatureWord(1, [1, 2]),
            IndexedFeatureWord(2, [3, 4]),
            IndexedFeatureWord(3, [5]),
            IndexedFeatureWord(5, [1, 4, 1]),
            IndexedFeatureWord(4, [1, 2, 6])
        ], [
            IndexedFeatureWord(1, [1, 2]),
            IndexedFeatureWord(8, [3, 1, 2, 1]),
            IndexedFeatureWord(2, [3, 4]),
            IndexedFeatureWord(3, [5])
        ], [0, 1])

    def test_get_lengths(self):
        """``get_lengths`` reports 5 sentence words and 4 word characters for the fixture."""
        assert self.feature.get_lengths() == {
            "num_sentence_words": 5,
            'num_word_characters': 4
        }

    def test_pad_adds_padding_words(self):
        """Padding beyond the fixture's lengths appends zeros to words and characters."""
        self.feature.pad({"num_sentence_words": 6, 'num_word_characters': 5})
        first_sent_word_idxs, second_sent_word_idxs = (
            self.feature.get_int_word_indices())
        first_sent_char_idxs, second_sent_char_idxs = (
            self.feature.get_int_char_indices())

        assert first_sent_word_idxs == [1, 2, 3, 5, 4, 0]
        assert second_sent_word_idxs == [1, 8, 2, 3, 0, 0]
        assert first_sent_char_idxs == [[1, 2, 0, 0, 0], [3, 4, 0, 0, 0],
                                        [5, 0, 0, 0, 0], [1, 4, 1, 0, 0],
                                        [1, 2, 6, 0, 0], [0, 0, 0, 0, 0]]
        assert second_sent_char_idxs == [[1, 2, 0, 0, 0], [3, 1, 2, 1, 0],
                                         [3, 4, 0, 0, 0], [5, 0, 0, 0, 0],
                                         [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]
        # Padding must leave the label untouched.
        assert self.feature.label == [0, 1]

    def test_pad_truncates(self):
        """Padding below the fixture's lengths truncates words and characters."""
        self.feature.pad({"num_sentence_words": 2, 'num_word_characters': 3})
        first_sent_word_idxs, second_sent_word_idxs = (
            self.feature.get_int_word_indices())
        first_sent_char_idxs, second_sent_char_idxs = (
            self.feature.get_int_char_indices())

        assert first_sent_word_idxs == [1, 2]
        assert second_sent_word_idxs == [1, 8]
        assert first_sent_char_idxs == [[1, 2, 0], [3, 4, 0]]
        assert second_sent_char_idxs == [[1, 2, 0], [3, 1, 2]]
        # Truncation must leave the label untouched.
        assert self.feature.label == [0, 1]

    def test_pad_general(self):
        """Padding to the feature's own ``get_lengths`` fills only the shorter parts."""
        self.feature.pad(self.feature.get_lengths())
        first_sent_word_idxs, second_sent_word_idxs = (
            self.feature.get_int_word_indices())
        first_sent_char_idxs, second_sent_char_idxs = (
            self.feature.get_int_char_indices())

        assert first_sent_word_idxs == [1, 2, 3, 5, 4]
        assert second_sent_word_idxs == [1, 8, 2, 3, 0]
        assert first_sent_char_idxs == [[1, 2, 0, 0], [3, 4, 0, 0],
                                        [5, 0, 0, 0], [1, 4, 1, 0],
                                        [1, 2, 6, 0]]
        assert second_sent_char_idxs == [[1, 2, 0, 0], [3, 1, 2, 1],
                                         [3, 4, 0, 0], [5, 0, 0, 0],
                                         [0, 0, 0, 0]]
        assert self.feature.label == [0, 1]

    def test_as_training_data_produces_correct_numpy_arrays(self):
        """``as_training_data`` returns the expected arrays in each mode.

        The default mode yields two word arrays, ``"character"`` yields two
        character matrices, and ``"word+character"`` yields four arrays in
        the order (first words, first characters, second words, second
        characters). The label is ``[0, 1]`` every time.
        """
        self.feature.pad({'num_sentence_words': 3, 'num_word_characters': 2})
        inputs, label = self.feature.as_training_data()
        assert_allclose(label[0], np.asarray([0, 1]))
        assert len(inputs) == 2
        assert_allclose(inputs[0], np.asarray([1, 2, 3]))
        assert_allclose(inputs[1], np.asarray([1, 8, 2]))

        inputs, label = self.feature.as_training_data(mode="character")
        assert_allclose(label[0], np.asarray([0, 1]))
        assert len(inputs) == 2
        assert_allclose(inputs[0], np.asarray([[1, 2], [3, 4], [5, 0]]))
        assert_allclose(inputs[1], np.asarray([[1, 2], [3, 1], [3, 4]]))

        inputs, label = self.feature.as_training_data(mode="word+character")
        assert_allclose(label[0], np.asarray([0, 1]))
        assert len(inputs) == 4
        assert_allclose(inputs[0], np.asarray([1, 2, 3]))
        assert_allclose(inputs[1], np.asarray([[1, 2], [3, 4], [5, 0]]))
        assert_allclose(inputs[2], np.asarray([1, 8, 2]))
        assert_allclose(inputs[3], np.asarray([[1, 2], [3, 1], [3, 4]]))

    def test_as_training_data_error(self):
        """``as_training_data`` raises ``ValueError`` for a ``None`` label or an unknown mode."""
        # Construct the unlabeled feature outside the ``assertRaises`` block
        # so that only the ``as_training_data`` call can satisfy the
        # expectation; the same constructor call is used without error in
        # ``test_equals``.
        unlabeled_feature = IndexedPairFeature([
            IndexedFeatureWord(1, [1, 2]),
            IndexedFeatureWord(4, [1, 2, 6])
        ], [IndexedFeatureWord(1, [1, 2]),
            IndexedFeatureWord(3, [5])], None)
        with self.assertRaises(ValueError):
            unlabeled_feature.as_training_data()

        with self.assertRaises(ValueError):
            self.feature.as_training_data(mode="words+character")

    def test_as_testing_data_produces_correct_numpy_arrays(self):
        """``as_testing_data`` returns the expected arrays and no labels in each mode."""
        self.feature.pad({'num_sentence_words': 4, 'num_word_characters': 2})
        inputs, labels = self.feature.as_testing_data()
        assert len(labels) == 0
        assert len(inputs) == 2
        assert_allclose(inputs[0], np.asarray([1, 2, 3, 5]))
        assert_allclose(inputs[1], np.asarray([1, 8, 2, 3]))

        # The non-default modes previously called ``as_training_data`` and
        # re-checked the stale ``labels`` from the first call; they now
        # exercise ``as_testing_data`` and check the labels it returns.
        inputs, labels = self.feature.as_testing_data(mode="character")
        assert len(labels) == 0
        assert len(inputs) == 2
        assert_allclose(inputs[0],
                        np.asarray([[1, 2], [3, 4], [5, 0], [1, 4]]))
        assert_allclose(inputs[1],
                        np.asarray([[1, 2], [3, 1], [3, 4], [5, 0]]))

        inputs, labels = self.feature.as_testing_data(mode="word+character")
        assert len(labels) == 0
        assert len(inputs) == 4
        assert_allclose(inputs[0], np.asarray([1, 2, 3, 5]))
        assert_allclose(inputs[1],
                        np.asarray([[1, 2], [3, 4], [5, 0], [1, 4]]))
        assert_allclose(inputs[2], np.asarray([1, 8, 2, 3]))
        assert_allclose(inputs[3],
                        np.asarray([[1, 2], [3, 1], [3, 4], [5, 0]]))

    def test_as_testing_data_error(self):
        """``as_testing_data`` raises ``ValueError`` for the mode ``"words+character"``."""
        with self.assertRaises(ValueError):
            self.feature.as_testing_data(mode="words+character")

    def test_equals(self):
        """Equality holds for identically built pairs and fails for differing ones."""
        feature_1 = IndexedPairFeature(
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(4, [1, 2, 6])],
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(3, [5])], None)

        feature_2 = IndexedPairFeature(
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(4, [1, 2, 6])],
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(3, [5])], None)

        feature_3 = IndexedPairFeature(
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(1, [2, 2])],
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(3, [5])], None)
        feature_4 = IndexedPairFeature([IndexedFeatureWord(1, [1, 2])],
                                       [IndexedFeatureWord(1, [2, 2])], None)

        # ``assertEqual`` / ``assertNotEqual`` replace the deprecated
        # ``assertEquals`` / ``assertNotEquals`` aliases, which no longer
        # exist in Python 3.12+.
        self.assertNotEqual(feature_1, feature_4)
        self.assertNotEqual(feature_1, feature_3)
        # Direct dunder call: comparing with a non-feature value is falsy.
        self.assertFalse(feature_1.__eq__(0))
        self.assertEqual(feature_1, feature_2)

    def test_less_than(self):
        """``__lt__`` is falsy for a non-feature and for the reversed comparison, truthy otherwise."""
        feature_1 = IndexedPairFeature(
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(4, [1, 2, 6])],
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(3, [5])], None)

        feature_2 = IndexedPairFeature(
            [IndexedFeatureWord(1, [1, 2]),
             IndexedFeatureWord(4, [1, 2, 6])],
            [IndexedFeatureWord(2, [2, 2]),
             IndexedFeatureWord(3, [5])], None)

        feature_3 = IndexedPairFeature([IndexedFeatureWord(1, [1, 2])],
                                       [IndexedFeatureWord(1, [2, 2])], None)
        self.assertFalse(feature_1.__lt__(0))
        self.assertFalse(feature_2.__lt__(feature_1))
        self.assertLess(feature_1, feature_2)
        self.assertLess(feature_3, feature_2)