예제 #1
0
class TestDataManagerValidation(DuplicateTestCase):
    """Tests for ``DataManager.get_validation_data_from_file``.

    Each test fits the data manager on the training file in ``setUp`` and
    then checks the instances (or batches) produced for the validation file
    under a particular combination of options.
    """

    @overrides
    def setUp(self):
        """Write the fixture files and fit a ``DataManager`` on the train file."""
        super().setUp()
        self.write_duplicate_questions_train_file()
        self.write_duplicate_questions_validation_file()
        self.data_manager = DataManager(PairFeature)
        self.data_manager.get_train_data_from_file([self.TRAIN_FILE])

    def _assert_exhausted(self, generator):
        """Assert that advancing ``generator`` raises ``StopIteration``."""
        with self.assertRaises(StopIteration):
            next(generator)

    def test_get_validation_data_default(self):
        """Default options yield three instances; generators are re-creatable."""
        get_val_gen, val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE])
        assert val_size == 3
        val_gen = get_val_gen()
        inputs1, labels1 = next(val_gen)
        assert_allclose(inputs1[0], np.array([2, 0]))
        assert_allclose(inputs1[1], np.array([3, 1]))
        assert_allclose(labels1[0], np.array([1, 0]))

        inputs2, labels2 = next(val_gen)
        assert_allclose(inputs2[0], np.array([1, 0]))
        assert_allclose(inputs2[1], np.array([1, 0]))
        assert_allclose(labels2[0], np.array([0, 1]))

        inputs3, labels3 = next(val_gen)
        assert_allclose(inputs3[0], np.array([7, 0]))
        assert_allclose(inputs3[1], np.array([8, 1]))
        assert_allclose(labels3[0], np.array([1, 0]))

        # Should raise a StopIteration
        self._assert_exhausted(val_gen)

        # Test that we can make a new val generator
        new_val_gen = get_val_gen()
        # Verify that the new and old generator are not the same object
        assert new_val_gen is not val_gen
        new_inputs1, new_labels1 = next(new_val_gen)
        assert_allclose(new_inputs1, inputs1)
        assert_allclose(new_labels1, labels1)
        new_inputs2, new_labels2 = next(new_val_gen)
        assert_allclose(new_inputs2, inputs2)
        assert_allclose(new_labels2, labels2)
        new_inputs3, new_labels3 = next(new_val_gen)
        assert_allclose(new_inputs3, inputs3)
        assert_allclose(new_labels3, labels3)

        # Should raise a StopIteration
        self._assert_exhausted(new_val_gen)

    def test_get_validation_data_default_character(self):
        """``mode="character"`` yields character-level index matrices."""
        get_val_gen, val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE], mode="character")
        assert val_size == 3
        val_gen = get_val_gen()
        inputs1, labels1 = next(val_gen)
        assert_allclose(
            inputs1[0],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(
            inputs1[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 11, 0, 0, 0],
                      [6, 9, 2, 7, 8, 3, 5, 4, 1, 0, 0, 0]]))
        assert_allclose(labels1[0], np.array([1, 0]))

        inputs2, labels2 = next(val_gen)
        assert_allclose(
            inputs2[0],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 1, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(
            inputs2[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 1, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(labels2[0], np.array([0, 1]))

        inputs3, labels3 = next(val_gen)
        assert_allclose(
            inputs3[0],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 15, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(
            inputs3[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 16, 0, 0, 0],
                      [6, 9, 2, 7, 8, 3, 5, 4, 10, 10, 0, 0]]))
        assert_allclose(labels3[0], np.array([1, 0]))

        # Should raise a StopIteration
        self._assert_exhausted(val_gen)

    def test_get_validation_data_default_word_and_character(self):
        """``mode="word+character"`` yields word and character inputs per sentence."""
        get_val_gen, val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE], mode="word+character")
        val_gen = get_val_gen()
        assert val_size == 3
        inputs1, labels1 = next(val_gen)
        assert_allclose(inputs1[0], np.array([2, 0]))
        assert_allclose(
            inputs1[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs1[2], np.array([3, 1]))
        assert_allclose(
            inputs1[3],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 11, 0, 0, 0],
                      [6, 9, 2, 7, 8, 3, 5, 4, 1, 0, 0, 0]]))
        assert_allclose(labels1[0], np.array([1, 0]))

        inputs2, labels2 = next(val_gen)
        assert_allclose(inputs2[0], np.array([1, 0]))
        assert_allclose(
            inputs2[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 1, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs2[2], np.array([1, 0]))
        assert_allclose(
            inputs2[3],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 10, 1, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(labels2[0], np.array([0, 1]))

        inputs3, labels3 = next(val_gen)
        assert_allclose(inputs3[0], np.array([7, 0]))
        assert_allclose(
            inputs3[1],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 15, 0, 0, 0],
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
        assert_allclose(inputs3[2], np.array([8, 1]))
        assert_allclose(
            inputs3[3],
            np.array([[6, 9, 2, 7, 8, 3, 5, 4, 16, 0, 0, 0],
                      [6, 9, 2, 7, 8, 3, 5, 4, 10, 10, 0, 0]]))
        assert_allclose(labels3[0], np.array([1, 0]))

        # Should raise a StopIteration
        self._assert_exhausted(val_gen)

    def test_get_validation_data_pad_with_max_lens(self):
        """``max_lengths`` of one word yields single-element word arrays."""
        get_val_gen, val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE], max_lengths={"num_sentence_words": 1})
        val_gen = get_val_gen()
        assert val_size == 3
        inputs1, labels1 = next(val_gen)
        assert_allclose(inputs1[0], np.array([2]))
        assert_allclose(inputs1[1], np.array([3]))
        assert_allclose(labels1[0], np.array([1, 0]))

        inputs2, labels2 = next(val_gen)
        assert_allclose(inputs2[0], np.array([1]))
        assert_allclose(inputs2[1], np.array([1]))
        assert_allclose(labels2[0], np.array([0, 1]))

        inputs3, labels3 = next(val_gen)
        assert_allclose(inputs3[0], np.array([7]))
        assert_allclose(inputs3[1], np.array([8]))
        assert_allclose(labels3[0], np.array([1, 0]))

        # Should raise a StopIteration
        self._assert_exhausted(val_gen)

    def test_get_validation_data_with_max_features(self):
        """``max_features=2`` limits the validation data to two instances."""
        get_val_gen, val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE], max_features=2)
        # This was previously a bare comparison whose result was discarded;
        # it must be asserted for the size check to have any effect.
        assert val_size == 2
        val_gen = get_val_gen()
        inputs1, labels1 = next(val_gen)
        assert_allclose(inputs1[0], np.array([2, 0]))
        assert_allclose(inputs1[1], np.array([3, 1]))
        assert_allclose(labels1[0], np.array([1, 0]))

        inputs2, labels2 = next(val_gen)
        assert_allclose(inputs2[0], np.array([1, 0]))
        assert_allclose(inputs2[1], np.array([1, 0]))
        assert_allclose(labels2[0], np.array([0, 1]))

        # Should raise a StopIteration
        self._assert_exhausted(val_gen)

    def test_get_validation_data_errors(self):
        """Invalid option combinations raise ``ValueError``."""
        # Passing max_lengths while disabling padding is rejected.
        with self.assertRaises(ValueError):
            self.data_manager.get_validation_data_from_file(
                [self.VALIDATION_FILE],
                max_lengths={"num_sentence_words": 1},
                pad=False)
        # An unrecognised max_lengths key is rejected.
        with self.assertRaises(ValueError):
            self.data_manager.get_validation_data_from_file(
                [self.VALIDATION_FILE], max_lengths={"some wrong key": 1})

    def test_get_validation_data_no_pad(self):
        """``pad=False`` yields arrays at each sentence's own length."""
        get_val_gen, val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE], pad=False)
        assert val_size == 3
        val_gen = get_val_gen()
        inputs1, labels1 = next(val_gen)
        assert_allclose(inputs1[0], np.array([2]))
        assert_allclose(inputs1[1], np.array([3, 1]))
        assert_allclose(labels1[0], np.array([1, 0]))

        inputs2, labels2 = next(val_gen)
        assert_allclose(inputs2[0], np.array([1]))
        assert_allclose(inputs2[1], np.array([1]))
        assert_allclose(labels2[0], np.array([0, 1]))

        inputs3, labels3 = next(val_gen)
        assert_allclose(inputs3[0], np.array([7]))
        assert_allclose(inputs3[1], np.array([8, 1, 1]))
        assert_allclose(labels3[0], np.array([1, 0]))

        # Should raise a StopIteration
        self._assert_exhausted(val_gen)

    def test_generate_validation_batches(self):
        """Batch generators of size 2 yield a full batch, then a remainder."""
        get_val_gen, val_size = self.data_manager.get_validation_data_from_file(
            [self.VALIDATION_FILE])
        # Build one generator through the instance and one through the class
        # to check that both call styles behave the same.
        batch_gen = self.data_manager.get_batch_generator(get_val_gen, 2)
        new_batch_gen = DataManager.get_batch_generator(get_val_gen, 2)
        assert val_size == 3

        # Assert that the new generator is a different object
        # than the old generator.
        assert new_batch_gen is not batch_gen

        first_batch = next(batch_gen)
        new_first_batch = next(new_batch_gen)
        inputs, labels = first_batch
        new_inputs, new_labels = new_first_batch
        assert len(inputs) == len(new_inputs) == 2
        assert len(labels) == len(new_labels) == 1

        # Ensure output matches ground truth.
        assert_allclose(inputs[0], np.array([[2, 0], [1, 0]]))
        assert_allclose(inputs[1], np.array([[3, 1], [1, 0]]))
        assert_allclose(labels[0], np.array([[1, 0], [0, 1]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])
        assert_allclose(labels[0], new_labels[0])

        second_batch = next(batch_gen)
        new_second_batch = next(new_batch_gen)
        inputs, labels = second_batch
        new_inputs, new_labels = new_second_batch
        assert len(inputs) == len(new_inputs) == 2
        assert len(labels) == len(new_labels) == 1

        # Ensure output matches ground truth.
        assert_allclose(inputs[0], np.array([[7, 0]]))
        assert_allclose(inputs[1], np.array([[8, 1]]))
        assert_allclose(labels[0], np.array([[1, 0]]))
        # Ensure both generators produce same results.
        assert_allclose(inputs[0], new_inputs[0])
        assert_allclose(inputs[1], new_inputs[1])
        assert_allclose(labels[0], new_labels[0])

        # Each generator is checked separately: inside a single assertRaises
        # block the first StopIteration would prevent the second call from
        # ever running.
        self._assert_exhausted(batch_gen)
        self._assert_exhausted(new_batch_gen)
예제 #2
0
# In[ ]:


# Build the data manager around `quora_dataset` (presumably created in an
# earlier cell that is not shown here).
data_manager = DataManager(dataset=quora_dataset)


# In[ ]:


# The call returns a pair; judging by the names, a callable that creates the
# training-data generator and the training set size -- TODO confirm against
# DataManager.
get_train_data_gen, train_data_size = data_manager.get_train_data_from_file()


# In[ ]:


# Same unpacking for the validation split (semantics assumed to mirror the
# training call above -- TODO confirm).
get_val_data_gen, val_data_size = data_manager.get_validation_data_from_file()


# In[ ]:


# Disabled alternative: build the embedding matrix from a GloVe text file
# through an EmbeddingManager instance instead of spaCy.
# embedding_manager = EmbeddingManager(quora_dataset.data_indexer, pickle_dir='../models/')
# embedding_matrix = embedding_manager.get_embedding_matrix(300,'../data/quora/external/glove.6B.300d.txt')
# Load the spaCy 'en_core_web_md' pipeline and derive the embedding matrix
# from it.
nlp = spacy.load('en_core_web_md')
embedding_matrix = EmbeddingManager.get_spacy_embedding_matrix(nlp)


# In[ ]:


from pympler import asizeof