Example #1
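The snippets on this page are test methods for `dataset_utils.split_dataset`, taken from a Keras test suite. They are shown without their surrounding module, so a minimal sketch of the assumed scaffolding follows; the import path and the class name are assumptions and may differ across Keras/TensorFlow versions.

import numpy as np
import tensorflow as tf

# Assumed location of split_dataset inside Keras; adjust for your version.
from keras.utils import dataset_utils


class DatasetUtilsTest(tf.test.TestCase):
    # Each example below is a method of a class like this one; tf.test.TestCase
    # provides assertAllEqual and the other assertions used in the tests.
    ...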
    def test_list_of_numpy_arrays(self):
        # test with list of np arrays with same shapes
        dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))]
        res = dataset_utils.split_dataset(dataset, left_size=4)
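        # The two (200, 32) arrays are sliced together into 200 elements, each
        # a pair of 32-vectors, so left_size=4 keeps the first four such pairs.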

        self.assertLen(res, 2)
        left_split, right_split = res

        self.assertIsInstance(left_split, tf.data.Dataset)
        self.assertIsInstance(right_split, tf.data.Dataset)

        self.assertEqual(np.array(list(left_split)).shape, (4, 2, 32))
        self.assertEqual(np.array(list(right_split)).shape, (196, 2, 32))

        # test with different shapes
        dataset = [np.ones(shape=(5, 3)), np.ones(shape=(5, ))]
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=0.3)

        self.assertEqual(np.array(list(left_split)).shape, (2, 2))
        self.assertEqual(np.array(list(right_split)).shape, (3, 2))

        self.assertEqual(np.array(list(left_split)[0]).shape, (2, ))
        self.assertEqual(np.array(list(left_split)[0][0]).shape, (3, ))
        self.assertEqual(np.array(list(left_split)[0][1]).shape, ())

        self.assertEqual(np.array(list(right_split)[0]).shape, (2, ))
        self.assertEqual(np.array(list(right_split)[0][0]).shape, (3, ))
        self.assertEqual(np.array(list(right_split)[0][1]).shape, ())
Example #2
    def test_invalid_dataset(self):
        with self.assertRaisesRegex(
                TypeError,
                "The `dataset` argument must be either a `tf.data.Dataset` "
                "object or a list/tuple of arrays.",
        ):
            dataset_utils.split_dataset(dataset=None, left_size=5)
        with self.assertRaisesRegex(
                TypeError,
                "The `dataset` argument must be either a `tf.data.Dataset` "
                "object or a list/tuple of arrays.",
        ):
            dataset_utils.split_dataset(dataset=1, left_size=5)
        with self.assertRaisesRegex(
                TypeError,
                "The `dataset` argument must be either a `tf.data.Dataset` "
                "object or a list/tuple of arrays.",
        ):
            dataset_utils.split_dataset(dataset=float(1.2), left_size=5)
        with self.assertRaisesRegex(
                TypeError,
                "The `dataset` argument must be either a `tf.data.Dataset` "
                "object or a list/tuple of arrays.",
        ):
            dataset_utils.split_dataset(dataset=dict({}), left_size=5)
        with self.assertRaisesRegex(
                TypeError,
                "The `dataset` argument must be either a `tf.data.Dataset` "
                "object or a list/tuple of arrays.",
        ):
            dataset_utils.split_dataset(dataset=float("INF"), left_size=5)
Example #3
    def test_invalid_left_and_right_size_types(self):
        expected_regex = (r"^.*?(\bInvalid `left_size` and `right_size` Types"
                          r"\b).*?(\bExpected: integer or float or None\b)")
        with self.assertRaisesRegex(TypeError, expected_regex):
            dataset_utils.split_dataset(np.array([1, 2, 3]),
                                        left_size="1",
                                        right_size="1")

        expected_regex = r"^.*?(\bInvalid `right_size` Type\b)"
        with self.assertRaisesRegex(TypeError, expected_regex):
            dataset_utils.split_dataset(np.array([1, 2, 3]),
                                        left_size=0,
                                        right_size="1")

        expected_regex = r"^.*?(\bInvalid `left_size` Type\b)"
        with self.assertRaisesRegex(TypeError, expected_regex):
            dataset_utils.split_dataset(np.array([1, 2, 3]),
                                        left_size="100",
                                        right_size=None)

        expected_regex = r"^.*?(\bInvalid `right_size` Type\b)"
        with self.assertRaisesRegex(TypeError, expected_regex):
            dataset_utils.split_dataset(np.array([1, 2, 3]), right_size="1")

        expected_regex = r"^.*?(\bInvalid `right_size` Type\b)"
        with self.assertRaisesRegex(TypeError, expected_regex):
            dataset_utils.split_dataset(np.array([1, 2, 3]),
                                        left_size=0.5,
                                        right_size="1")
Example #4
    def test_unbatched_tf_dataset_of_tuple_of_vectors(self):
        # test with tuple of np arrays with same shape
        X, Y = (np.random.rand(10, 32, 32, 1), np.random.rand(10, 32, 32, 1))
        dataset = tf.data.Dataset.from_tensor_slices((X, Y))

        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=5)

        self.assertEqual(len(list(left_split)), 5)
        self.assertEqual(len(list(right_split)), 5)
        self.assertAllEqual(list(dataset),
                            list(left_split) + list(right_split))

        # test with tuple of np arrays with different shapes
        X, Y = (
            np.random.rand(5, 3, 3),
            np.random.rand(5, ),
        )
        dataset = tf.data.Dataset.from_tensor_slices((X, Y))
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=0.5)

        self.assertEqual(len(list(left_split)), 2)
        self.assertEqual(len(list(right_split)), 3)
        self.assertEqual(np.array(list(left_split)[0][0]).shape, (3, 3))
        self.assertEqual(np.array(list(left_split)[0][1]).shape, ())
Example #5
    def test_dataset_with_invalid_shape(self):
        with self.assertRaisesRegex(
                ValueError, 'Received a list of NumPy arrays '
                'with different lengths'):
            dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))]
            dataset_utils.split_dataset(dataset, left_size=4)

        with self.assertRaisesRegex(
                ValueError, 'Received a tuple of NumPy arrays '
                'with different lengths'):
            dataset = (np.ones(shape=(200, 32)), np.zeros(shape=(201, 32)))
            dataset_utils.split_dataset(dataset, left_size=4)
Example #6
    def test_tuple_of_numpy_arrays(self):
        dataset = (np.random.rand(4, 3), np.random.rand(4, 3))
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=2)

        self.assertIsInstance(left_split, tf.data.Dataset)
        self.assertIsInstance(right_split, tf.data.Dataset)

        self.assertEqual(len(left_split), 2)
        self.assertEqual(len(right_split), 2)

        self.assertEqual(np.array(list(left_split)[0]).shape, (2, 3))
        self.assertEqual(np.array(list(left_split)[1]).shape, (2, 3))

        # test with fractional size
        dataset = (np.random.rand(5, 32, 32), np.random.rand(5, 32, 32))
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              right_size=0.4)
        self.assertIsInstance(left_split, tf.data.Dataset)
        self.assertIsInstance(right_split, tf.data.Dataset)

        self.assertEqual(np.array(list(left_split)).shape, (3, 2, 32, 32))
        self.assertEqual(np.array(list(right_split)).shape, (2, 2, 32, 32))

        self.assertEqual(np.array(list(left_split))[0].shape, (2, 32, 32))
        self.assertEqual(np.array(list(left_split))[1].shape, (2, 32, 32))

        self.assertEqual(np.array(list(right_split))[0].shape, (2, 32, 32))
        self.assertEqual(np.array(list(right_split))[1].shape, (2, 32, 32))

        # test with tuple of np arrays with different shapes
        dataset = (
            np.random.rand(5, 32, 32),
            np.random.rand(5, ),
        )
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=2,
                                                              right_size=3)
        self.assertIsInstance(left_split, tf.data.Dataset)
        self.assertIsInstance(right_split, tf.data.Dataset)

        self.assertEqual(np.array(list(left_split)).shape, (2, 2))
        self.assertEqual(np.array(list(right_split)).shape, (3, 2))

        self.assertEqual(np.array(list(left_split)[0]).shape, (2, ))
        self.assertEqual(np.array(list(left_split)[0][0]).shape, (32, 32))
        self.assertEqual(np.array(list(left_split)[0][1]).shape, ())

        self.assertEqual(np.array(list(right_split)[0]).shape, (2, ))
        self.assertEqual(np.array(list(right_split)[0][0]).shape, (32, 32))
        self.assertEqual(np.array(list(right_split)[0][1]).shape, ())
Example #7
    def test_list_dataset(self):
        dataset = [np.ones(shape=(10, 10, 10)) for _ in range(10)]
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=5,
                                                              right_size=5)
        self.assertEqual(len(left_split), len(right_split))
        self.assertIsInstance(left_split, tf.data.Dataset)
        self.assertIsInstance(right_split, tf.data.Dataset)

        dataset = [np.ones(shape=(10, 10, 10)) for _ in range(10)]
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=0.6,
                                                              right_size=0.4)
        self.assertEqual(len(left_split), 6)
        self.assertEqual(len(right_split), 4)
Example #8
    def test_unbatched_tf_dataset_of_dict_of_vectors(self):
        # test with dict of np arrays of same shape
        dict_samples = {'X': np.random.rand(10, 2), 'Y': np.random.rand(10, 2)}
        dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=2)
        self.assertEqual(len(list(left_split)), 2)
        self.assertEqual(len(list(right_split)), 8)
        for i in range(10):
            if i < 2:
                self.assertEqual(list(left_split)[i], list(dataset)[i])
            else:
                self.assertEqual(list(right_split)[i - 2], list(dataset)[i])

        # test with dict of np arrays with different shapes
        dict_samples = {
            'images': np.random.rand(10, 16, 16, 3),
            'labels': np.random.rand(10, )
        }
        dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=0.3)
        self.assertEqual(len(list(left_split)), 3)
        self.assertEqual(len(list(right_split)), 7)
        for i in range(10):
            if i < 3:
                self.assertEqual(list(left_split)[i], list(dataset)[i])
            else:
                self.assertEqual(list(right_split)[i - 3], list(dataset)[i])

        # test with dict of text arrays
        txt_feature = ['abb', 'bb', 'cc', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
        dict_samples = {
            'txt_feature': txt_feature,
            'label': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        }
        dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=0.45,
                                                              right_size=0.55)
        self.assertEqual(len(list(left_split)), 4)
        self.assertEqual(len(list(right_split)), 6)
        for i in range(10):
            if i < 4:
                self.assertEqual(list(left_split)[i], list(dataset)[i])
            else:
                self.assertEqual(list(right_split)[i - 4], list(dataset)[i])
Example #9
    def test_invalid_float_left_and_right_sizes(self):
        expected_regex = (r"^(.*?(\bleft_size\b).*?(\bshould be\b)"
                          r".*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))")
        with self.assertRaisesRegex(ValueError, expected_regex):
            dataset = [
                np.ones(shape=(200, 32, 32)),
                np.zeros(shape=(200, 32, 32)),
            ]
            dataset_utils.split_dataset(dataset, left_size=1.5, right_size=0.2)

        expected_regex = (r"^(.*?(\bright_size\b).*?(\bshould be\b)"
                          r".*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))")
        with self.assertRaisesRegex(ValueError, expected_regex):
            dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))]
            dataset_utils.split_dataset(dataset,
                                        left_size=0.8,
                                        right_size=-0.8)
Example #10
    def test_float_left_and_right_sizes(self):
        X = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
        dataset = tf.data.Dataset.from_tensor_slices(X)
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=0.8,
                                                              right_size=0.2)
        self.assertEqual(len(left_split), 2)
        self.assertEqual(len(right_split), 1)
Example #11
    def test_batched_tf_dataset_of_dict_of_vectors(self):
        dict_samples = {"X": np.random.rand(10, 3), "Y": np.random.rand(10, 3)}
        dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
        dataset = dataset.batch(2)
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=2)
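        # With a batched dataset, left_size counts samples rather than batches:
        # 2 samples come back as one batch of 2 and the remaining 8 samples as
        # 4 batches.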

        self.assertAllEqual(np.array(list(left_split)).shape, (1, ))
        self.assertAllEqual(np.array(list(right_split)).shape, (4, ))

        left_split, right_split = left_split.unbatch(), right_split.unbatch()
        self.assertEqual(len(list(left_split)), 2)
        self.assertEqual(len(list(right_split)), 8)
        for i in range(10):
            if i < 2:
                self.assertEqual(
                    list(left_split)[i],
                    list(dataset.unbatch())[i])
            else:
                self.assertEqual(
                    list(right_split)[i - 2],
                    list(dataset.unbatch())[i])

        # test with dict of np arrays with different shapes
        dict_samples = {
            "images": np.random.rand(10, 16, 16, 3),
            "labels": np.random.rand(10, ),
        }
        dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
        dataset = dataset.batch(1)
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              right_size=0.3)

        self.assertAllEqual(np.array(list(left_split)).shape, (7, ))
        self.assertAllEqual(np.array(list(right_split)).shape, (3, ))

        dataset = dataset.unbatch()
        left_split, right_split = left_split.unbatch(), right_split.unbatch()
        self.assertEqual(len(list(left_split)), 7)
        self.assertEqual(len(list(right_split)), 3)
        for i in range(10):
            if i < 7:
                self.assertEqual(list(left_split)[i], list(dataset)[i])
            else:
                self.assertEqual(list(right_split)[i - 7], list(dataset)[i])
Example #12
    def test_unbatched_tf_dataset_of_vectors(self):
        vectors = np.ones(shape=(100, 16, 16, 3))
        dataset = tf.data.Dataset.from_tensor_slices(vectors)

        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=0.25)

        self.assertAllEqual(np.array(list(left_split)).shape, (25, 16, 16, 3))
        self.assertAllEqual(np.array(list(right_split)).shape, (75, 16, 16, 3))

        self.assertAllEqual(list(dataset),
                            list(left_split) + list(right_split))

        dataset = [np.random.rand(10, 3, 3) for _ in range(5)]
        dataset = tf.data.Dataset.from_tensor_slices(dataset)

        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=2)
        self.assertAllEqual(list(dataset),
                            list(left_split) + list(right_split))
Example #13
    def test_end_to_end(self):
        x_train = np.random.random((10000, 28, 28))
        y_train = np.random.randint(0, 10, size=(10000, ))

        left_split, right_split = dataset_utils.split_dataset(
            (x_train, y_train), left_size=0.8)

        self.assertIsInstance(left_split, tf.data.Dataset)
        self.assertIsInstance(right_split, tf.data.Dataset)

        self.assertEqual(len(left_split), 8000)
        self.assertEqual(len(right_split), 2000)
Example #14
    def test_valid_left_and_right_sizes(self):
        dataset = np.array([1, 2, 3])
        splitted_dataset = dataset_utils.split_dataset(dataset, 1, 2)
        self.assertLen(splitted_dataset, 2)
        left_split, right_split = splitted_dataset
        self.assertEqual(len(left_split), 1)
        self.assertEqual(len(right_split), 2)
        self.assertEqual(list(left_split), [1])
        self.assertEqual(list(right_split), [2, 3])

        dataset = np.ones(shape=(200, 32))
        res = dataset_utils.split_dataset(dataset,
                                          left_size=150,
                                          right_size=50)
        self.assertLen(res, 2)
        self.assertIsInstance(res[0], tf.data.Dataset)
        self.assertIsInstance(res[1], tf.data.Dataset)

        self.assertLen(res[0], 150)
        self.assertLen(res[1], 50)

        dataset = np.ones(shape=(200, 32))
        res = dataset_utils.split_dataset(dataset, left_size=120)
        self.assertLen(res, 2)
        self.assertIsInstance(res[0], tf.data.Dataset)
        self.assertIsInstance(res[1], tf.data.Dataset)

        self.assertLen(res[0], 120)
        self.assertLen(res[1], 80)

        dataset = np.ones(shape=(10000, 16))
        res = dataset_utils.split_dataset(dataset, right_size=20)
        self.assertLen(res, 2)
        self.assertIsInstance(res[0], tf.data.Dataset)
        self.assertIsInstance(res[1], tf.data.Dataset)

        self.assertLen(res[0], 9980)
        self.assertLen(res[1], 20)

        dataset = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        splitted_dataset = dataset_utils.split_dataset(dataset,
                                                       left_size=0.1,
                                                       right_size=0.9)
        self.assertLen(splitted_dataset, 2)
        left_split, right_split = splitted_dataset
        self.assertEqual(len(left_split), 1)
        self.assertEqual(len(right_split), 9)
        self.assertEqual(list(left_split), [1])
        self.assertEqual(list(right_split), [2, 3, 4, 5, 6, 7, 8, 9, 10])

        dataset = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
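        # When the integer sizes do not add up to the dataset length, the
        # samples in between are dropped: 3, 4 and 5 end up in neither split.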
        splitted_dataset = dataset_utils.split_dataset(dataset,
                                                       left_size=2,
                                                       right_size=5)
        self.assertLen(splitted_dataset, 2)
        left_split, right_split = splitted_dataset
        self.assertEqual(len(left_split), 2)
        self.assertEqual(len(right_split), 5)
        self.assertEqual(list(left_split), [1, 2])
        self.assertEqual(list(right_split), [6, 7, 8, 9, 10])
Example #15
    def test_batched_tf_dataset_of_vectors(self):
        vectors = np.ones(shape=(100, 32, 32, 1))
        dataset = tf.data.Dataset.from_tensor_slices(vectors)
        dataset = dataset.batch(10)
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=2)

        # Ensure that the splits are batched
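        # The right split keeps the remaining 98 samples in 10 batches (nine
        # full batches of 10 plus one partial batch).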
        self.assertEqual(len(list(right_split)), 10)

        left_split, right_split = left_split.unbatch(), right_split.unbatch()
        self.assertAllEqual(np.array(list(left_split)).shape, (2, 32, 32, 1))
        self.assertAllEqual(np.array(list(right_split)).shape, (98, 32, 32, 1))
        dataset = dataset.unbatch()
        self.assertAllEqual(list(dataset),
                            list(left_split) + list(right_split))
Example #16
    def test_numpy_array(self):
        dataset = np.ones(shape=(200, 32))
        res = dataset_utils.split_dataset(dataset,
                                          left_size=0.8,
                                          right_size=0.2)

        self.assertLen(res, 2)
        left_split, right_split = res

        self.assertIsInstance(left_split, tf.data.Dataset)
        self.assertIsInstance(right_split, tf.data.Dataset)

        self.assertLen(left_split, 160)
        self.assertLen(right_split, 40)

        self.assertAllEqual(dataset[:160], list(left_split))
        self.assertAllEqual(dataset[-40:], list(right_split))
Example #17
    def test_batched_tf_dataset_of_tuple_of_vectors(self):
        tuple_of_vectors = (np.random.rand(10, 32, 32),
                            np.random.rand(10, 32, 32))
        dataset = tf.data.Dataset.from_tensor_slices(tuple_of_vectors)
        dataset = dataset.batch(2)
        left_split, right_split = dataset_utils.split_dataset(dataset,
                                                              left_size=4)

        # Ensure that the splits are batched
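        # The 4 left samples come back as 2 batches of 2 and the 6 right
        # samples as 3 batches of 2.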
        self.assertEqual(np.array(list(right_split)).shape, (3, 2, 2, 32, 32))
        self.assertEqual(np.array(list(left_split)).shape, (2, 2, 2, 32, 32))

        left_split, right_split = left_split.unbatch(), right_split.unbatch()
        self.assertAllEqual(np.array(list(left_split)).shape, (4, 2, 32, 32))
        self.assertAllEqual(np.array(list(right_split)).shape, (6, 2, 32, 32))

        dataset = dataset.unbatch()
        self.assertAllEqual(list(dataset),
                            list(left_split) + list(right_split))
Example #18
    def test_None_and_zero_left_and_right_size(self):
        expected_regex = (r"^.*?(\bleft_size\b).*?(\bright_size\b).*?(\bmust "
                          r"be specified\b).*?(\bReceived: left_size=None and"
                          r" right_size=None\b)")

        with self.assertRaisesRegex(ValueError, expected_regex):
            dataset_utils.split_dataset(dataset=np.array([1, 2, 3]),
                                        left_size=None)
        with self.assertRaisesRegex(ValueError, expected_regex):
            dataset_utils.split_dataset(np.array([1, 2, 3]),
                                        left_size=None,
                                        right_size=None)

        expected_regex = (r"^.*?(\bleft_size\b).*?(\bshould be\b)"
                          r".*?(\bpositive\b).*?(\bsmaller than 3\b)")
        with self.assertRaisesRegex(ValueError, expected_regex):
            dataset_utils.split_dataset(np.array([1, 2, 3]), left_size=3)

        expected_regex = ("Both `left_size` and `right_size` are zero. "
                          "At least one of the split sizes must be non-zero.")
        with self.assertRaisesRegex(ValueError, expected_regex):
            dataset_utils.split_dataset(np.array([1, 2, 3]),
                                        left_size=0,
                                        right_size=0)
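For reference, the same behavior is exposed through the public Keras API. A minimal usage sketch, assuming TensorFlow >= 2.10, where `tf.keras.utils.split_dataset` is available:

import numpy as np
import tensorflow as tf

x = np.random.random((1000, 28, 28))
y = np.random.randint(0, 10, size=(1000,))

# 80/20 split; both return values are tf.data.Dataset objects.
train_ds, val_ds = tf.keras.utils.split_dataset(
    (x, y), left_size=0.8, shuffle=True, seed=42)

train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)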