def test_numpy_projection_logic(self):
        """
        Check index-based projection of a numpy-backed split.

        The mocked dataset returns 100 rows whose values equal their row
        number; projecting with indices 10..29 must yield exactly those rows
        from both the explicit `apply_projection` call and the implicit
        `projected_split` accessor.
        """
        full_split = Split(
            X=np.ones((100, 10)) * np.array(range(100)).reshape(-1, 1),
            y=np.array(range(100)),
        )
        mock_dataset = MagicMock()
        mock_dataset.get_split.return_value = full_split

        projected_split = self.mock_cls(
            dataset=mock_dataset, split="ddefg", indices=range(10, 30)
        )
        expected_split = Split(
            X=np.ones((20, 10)) * np.array(range(10, 30)).reshape(-1, 1),
            y=np.array(range(10, 30)),
        )

        split = projected_split.dataset_split
        explicit_output = projected_split.apply_projection(split)
        implicit_output = projected_split.projected_split

        def assert_same_arrays(left, right):
            # Same sections, and each section equal element-wise
            self.assertEqual(left.keys(), right.keys())
            for section, values in left.items():
                np.testing.assert_equal(values, right[section])

        # The unprojected split must differ from either projected output
        for projected in (explicit_output, implicit_output):
            with self.assertRaises(AssertionError):
                assert_same_arrays(split, projected)

        # Both outputs match the expectation and each other
        assert_same_arrays(expected_split, explicit_output)
        assert_same_arrays(expected_split, implicit_output)
        assert_same_arrays(implicit_output, explicit_output)
示例#2
0
    def __getitem__(self, index) -> Split:
        """Gets batch at position `index`.
        # Arguments
            index: position of the batch in the Sequence.
        # Returns
            A batch - a Split, or an ordered tuple when `self.return_tuple`
            is set
        """
        # Slice of the index list covering this batch (clipped to dataset size)
        start = index * self.batch_size
        stop = min(start + self.batch_size, self.dataset_size)
        batch = self.indices[start:stop]

        X = self.validated_split(self.split.X)
        y = self.validated_split(self.split.y)
        is_pandas = isinstance(X, (pd.DataFrame, pd.Series))

        # pandas containers are selected by label, anything else positionally
        contents = {"X": X.loc[batch] if is_pandas else X[batch]}
        if y is not None:  # Supervised - attach the matching labels
            if is_pandas:
                contents["y"] = np.stack(y.loc[batch].squeeze().values)
            else:
                contents["y"] = y[batch]
        split = Split(**contents)

        return split_to_ordered_tuple(split) if self.return_tuple else split
示例#3
0
 def test_squeeze_behavior(self):
     """
     A split holding only null-like values compares equal to an empty
     dict after `squeeze` is called on it
     """
     null_sections = {
         "a": None,
         "b": pd.DataFrame(),
         "c": pd.Series(),
         "d": [],
         "e": tuple(),
         "f": {},
     }
     split = Split(**null_sections)
     split.squeeze()
     self.assertEqual(split, {})
示例#4
0
 def apply_projection(self, dataset_split: Split) -> Split:
     """
     Index subset return

     Runs `self.indexing_method` with `self.indices` over every section of
     the split, then squeezes the resulting Split
     """
     subset = {}
     for section, values in dataset_split.items():
         subset[section] = self.indexing_method(values, self.indices)
     return Split(**subset).squeeze()
示例#5
0
    def __getitem__(self, *args, **kwargs) -> Union[Split, Tuple]:
        """
        Pass-through to dataset sequence - applies transform on data and returns transformed batch

        # Arguments
            *args, **kwargs: forwarded unchanged to `self.dataset_sequence`
        # Returns
            A tuple (transformed X, y, other...) when the underlying batch is
            a tuple, otherwise a Split with X replaced by the transformed X
        """
        # NOTE(review): the sequence is invoked as a callable here rather than
        # indexed - confirm `dataset_sequence` supports being called
        batch: Union[Tuple, Split] = self.dataset_sequence(*args, **kwargs)

        return_tuple = isinstance(batch, tuple)
        X = batch[0] if return_tuple else batch.X

        output = self.pipeline.transform(X)

        if return_tuple:
            # Return input with X replaced by output (transformed X)
            # Contains y or other named inputs to propagate downstream
            # Explicitly order for *args input -- X, y, other...
            return (output, *batch[1:])

        return Split(X=output,
                     **{k: v
                        for k, v in batch.items() if k != "X"})
示例#6
0
    def __next__(self) -> Union[Split, Tuple]:
        """
        Pull the next batch from `self.data_iterator`, run the pipeline
        transform on its X and return the batch with the transformed X

        NOTE: Some downstream objects expect to consume a generator with a tuple of
        X, y, other... not a Split object, so an ordered tuple will be returned
        if the dataset iterator returns a tuple

        # Returns
            A tuple (transformed X, y, other...) when the iterator yields a
            tuple, otherwise a Split with X replaced by the transformed X
        """
        batch = next(self.data_iterator)

        return_tuple = isinstance(batch, tuple)
        X = batch[0] if return_tuple else batch.X

        output = self.pipeline.transform(X)

        if return_tuple:
            # Return input with X replaced by output (transformed X)
            # Contains y or other named inputs to propagate downstream
            # Explicitly order for *args input -- X, y, other...
            return (output, *batch[1:])

        return Split(X=output,
                     **{k: v
                        for k, v in batch.items() if k != "X"})
示例#7
0
    def __next__(self) -> Union[Split, Tuple]:
        """
        Turn a dataset split into a generator

        Each call returns the next `self.batch_size` entries of `self.indices`
        applied to the split. When the end is reached the iterator either
        restarts (optionally reshuffled) if `self.infinite_loop` is set, or
        stops.

        # Returns
            A Split, or an ordered tuple when `self.return_tuple` is set
        # Raises
            StopIteration: when the dataset is empty, or when it is exhausted
            and `self.infinite_loop` is not set
        """
        X = self.split.X
        y = self.split.y

        if self.dataset_size == 0:  # Nothing to iterate over
            raise StopIteration

        # Loop so that infinite batches can be generated
        if self.current_index >= self.dataset_size:
            if self.infinite_loop:
                self.current_index = 0
                self.first_run = False
            else:
                raise StopIteration

        # shuffle on new loops
        if self.current_index == 0 and self.shuffle and not self.first_run:
            # np.random.shuffle works in place and returns None, so assigning
            # its result would wipe the indices; permutation returns a
            # shuffled copy instead
            self.indices = np.random.permutation(self.indices)

        # next batch indices
        batch_end = min(self.current_index + self.batch_size,
                        self.dataset_size)
        batch = self.indices[self.current_index:batch_end]
        self.current_index += self.batch_size

        # NOTE(review): only a non-empty pandas `y` is treated as supervised;
        # any other `y` (e.g. a numpy array) takes the unsupervised path and
        # is dropped from the batch - confirm this is intended
        if y is not None and (isinstance(y, (pd.DataFrame, pd.Series))
                              and not y.empty):  # Supervised
            if isinstance(X, (pd.DataFrame, pd.Series)):
                split = Split(X=X.loc[batch],
                              y=np.stack(y.loc[batch].squeeze().values))
            else:
                split = Split(X=X[batch], y=y[batch])
        else:  # Unsupervised
            if isinstance(X, (pd.DataFrame, pd.Series)):
                split = Split(X=X.loc[batch])
            else:
                split = Split(X=X[batch])

        if self.return_tuple:
            return split_to_ordered_tuple(split)
        return split
    def test_projection_logic(self):
        """
        Without indices, the dataset split, the explicit projection and the
        implicit projection must all compare equal
        """
        mock_dataset = MagicMock()
        mock_dataset.get_split.return_value = Split(X="a", y="b")

        projected_split = self.mock_cls(dataset=mock_dataset, split="ddefg")
        split = projected_split.dataset_split
        explicit_output = projected_split.apply_projection(split)
        implicit_output = projected_split.projected_split

        comparisons = (
            (split, explicit_output),
            (split, implicit_output),
            (implicit_output, explicit_output),
        )
        for first, second in comparisons:
            self.assertEqual(first, second)
    def test_projection_logic(self):
        """
        Check index-based projection of a pandas-backed split.

        The mocked dataset returns a 100-row frame/series; projecting with
        indices 10..29 must yield exactly those rows (index labels included)
        from both the explicit and the implicit accessor.
        """
        full_split = Split(
            X=pd.DataFrame(range(100)), y=pd.Series(range(100))
        )
        mock_dataset = MagicMock()
        mock_dataset.get_split.return_value = full_split

        projected_split = self.mock_cls(
            dataset=mock_dataset, split="ddefg", indices=range(10, 30)
        )
        expected_split = Split(
            X=pd.DataFrame(range(10, 30), index=range(10, 30)),
            y=pd.Series(range(10, 30), index=range(10, 30)),
        )

        split = projected_split.dataset_split
        explicit_output = projected_split.apply_projection(split)
        implicit_output = projected_split.projected_split

        def assert_same_pandas(left, right):
            # Same sections, each compared with the matching pandas checker
            self.assertEqual(left.keys(), right.keys())
            for section, values in left.items():
                if isinstance(values, pd.DataFrame):
                    checker = pd.testing.assert_frame_equal
                else:
                    checker = pd.testing.assert_series_equal
                checker(values, right[section])

        # The unprojected split must differ from either projected output
        for projected in (explicit_output, implicit_output):
            with self.assertRaises(AssertionError):
                assert_same_pandas(split, projected)

        # Both outputs match the expectation and each other
        assert_same_pandas(expected_split, explicit_output)
        assert_same_pandas(expected_split, implicit_output)
        assert_same_pandas(implicit_output, explicit_output)
示例#10
0
    def test_itemgetter_behavior(self):
        """
        Item access and ** unpacking on the projected split object must
        return the contents of the patched `projected_split`
        """
        mock_split = Split(X="abc", y="def", other="xyz")
        prop_mock = PropertyMock(return_value=mock_split)
        expected_contents = {"X": "abc", "y": "def", "other": "xyz"}

        with patch.object(self.test_cls, "projected_split", new_callable=prop_mock):
            projected_split = self.mock_cls()

            # [] access, section by section
            for section in mock_split:
                self.assertEqual(mock_split[section], projected_split[section])

            # **behavior
            unpacked = {**projected_split}
            self.assertEqual({**mock_split}, unpacked)
            self.assertEqual(expected_contents, {**projected_split})
示例#11
0
def split_to_ordered_tuple(split: Split) -> Tuple:
    """
    Helper to convert a split object into an ordered tuple of
    X, y, other

    X comes first and y second when they are not None; every remaining
    non-None section follows in the split's own iteration order.

    Returns an actual tuple (not a list) so that it matches the annotation
    and can be recognised with `isinstance(batch, tuple)`.
    """
    # X and y take the leading positions, skipping whichever is missing
    ordered = [section for section in (split.X, split.y) if section is not None]

    # Any other named sections are appended afterwards
    ordered.extend(
        v for k, v in split.items() if k not in ("X", "y") and v is not None
    )
    return tuple(ordered)
示例#12
0
 def test_default_value(self):
     """
     Looking up a key that was never set on a SplitContainer gives a value
     equal to an empty Split
     """
     missing_entry = SplitContainer()["nonexistent_key"]
     self.assertEqual(missing_entry, Split())
示例#13
0
 def test_getattr_behavior(self):
     """
     Attribute access returns the stored section, and None for a section
     that was never set
     """
     split = Split(a="ab")
     for attribute, expected in (("a", "ab"), ("b", None)):
         self.assertEqual(getattr(split, attribute), expected)
示例#14
0
 def test_null_type_check(self):
     """
     None and empty pandas/builtin containers are all reported as null
     types by `Split.is_null_type`
     """
     null_candidates = (None, pd.DataFrame(), pd.Series(), [], tuple(), {})
     for candidate in null_candidates:
         self.assertTrue(Split.is_null_type(candidate))
示例#15
0
 def apply_projection(self, dataset_split: Split) -> Split:
     """
     Identity return

     No indexing is applied; the result is whatever `squeeze` returns for
     the given split
     """
     squeezed = dataset_split.squeeze()
     return squeezed