Example #1
    def test_predefined_split(self, cv_split_cls, data):
        from sklearn.model_selection import PredefinedSplit
        indices = (data.y > 0).astype(int)
        split = PredefinedSplit(indices)

        dataset_train, dataset_valid = cv_split_cls(split)(data)
        y_train = data_from_dataset(dataset_train)[1]
        y_valid = data_from_dataset(dataset_valid)[1]

        assert (y_train > 0).all()
        assert (y_valid == 0).all()
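
A note on why the assertions in Example #1 hold: scikit-learn's PredefinedSplit takes a test_fold array whose entry for each sample names the fold in which that sample appears as test data, so taking the first split puts exactly the samples with test_fold == 0 (here, those with y == 0) into the validation set. A minimal standalone sketch, independent of the test fixtures above:

    import numpy as np
    from sklearn.model_selection import PredefinedSplit

    y = np.array([0, 2, 1, 0, 3])
    split = PredefinedSplit(test_fold=(y > 0).astype(int))
    train_idx, valid_idx = next(split.split())
    print(train_idx)  # [1 2 4] -> samples with y > 0
    print(valid_idx)  # [0 3]  -> samples with y == 0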
Example #2
    def test_with_list_of_arrays(self, cv_split_cls, data):
        data.X = [data.X, data.X]
        m = self.num_samples // 5
        n = self.num_samples - m

        dataset_train, dataset_valid = cv_split_cls(5)(data)
        X_train, y_train = data_from_dataset(dataset_train)
        X_valid, y_valid = data_from_dataset(dataset_valid)

        assert len(X_train[0]) == len(X_train[1]) == len(y_train) == n
        assert len(X_valid[0]) == len(X_valid[1]) == len(y_valid) == m
Example #3
    def test_with_dict(self, cv_split_cls, data):
        data.X = {'1': data.X, '2': data.X}
        dataset_train, dataset_valid = cv_split_cls(5)(data)

        m = self.num_samples // 5
        n = self.num_samples - m

        X_train, y_train = data_from_dataset(dataset_train)
        X_valid, y_valid = data_from_dataset(dataset_valid)

        assert len(X_train['1']) == len(X_train['2']) == len(y_train) == n
        assert len(X_valid['1']) == len(X_valid['2']) == len(y_valid) == m
Example #4
    def test_not_stratified(self, cv_split_cls, data, cv):
        num_expected = self.num_samples // 4
        y = np.hstack([np.repeat([0, 0, 0], num_expected),
                       np.repeat([1], num_expected)])
        data.y = y

        dataset_train, dataset_valid = cv_split_cls(
            cv, stratified=False)(data, y)
        y_train = data_from_dataset(dataset_train)[1]
        y_valid = data_from_dataset(dataset_valid)[1]

        # when not stratified, we cannot know the distribution of targets
        assert y_train.sum() + y_valid.sum() == num_expected
Example #5
    def test_stratified(self, cv_split_cls, data, cv):
        num_expected = self.num_samples // 4
        y = np.hstack([np.repeat([0, 0, 0], num_expected),
                       np.repeat([1], num_expected)])
        data.y = y

        dataset_train, dataset_valid = cv_split_cls(
            cv, stratified=True)(data, y)
        y_train = data_from_dataset(dataset_train)[1]
        y_valid = data_from_dataset(dataset_valid)[1]

        assert y_train.sum() == 0.8 * num_expected
        assert y_valid.sum() == 0.2 * num_expected
Example #6
    def test_y_str_val_stratified(self, cv_split_cls, data):
        y = np.array(['a', 'a', 'a', 'b'] * (self.num_samples // 4))
        if len(data.X) != len(y):
            raise ValueError
        data.y = y

        dataset_train, dataset_valid = cv_split_cls(
            5, stratified=True)(data, y)
        y_train = data_from_dataset(dataset_train)[1]
        y_valid = data_from_dataset(dataset_valid)[1]

        assert np.isclose(np.mean(y_train == 'b'), 0.25)
        assert np.isclose(np.mean(y_valid == 'b'), 0.25)
Example #7
    def test_with_torch_tensors_and_stratified(self, cv_split_cls, data):
        num_expected = self.num_samples // 4
        data.X = to_tensor(data.X, device='cpu')
        y = np.hstack([np.repeat([0, 0, 0], num_expected),
                       np.repeat([1], num_expected)])
        data.y = to_tensor(y, device='cpu')

        dataset_train, dataset_valid = cv_split_cls(5, stratified=True)(data, y)
        y_train = data_from_dataset(dataset_train)[1]
        y_valid = data_from_dataset(dataset_valid)[1]

        assert y_train.sum() == 0.8 * num_expected
        assert y_valid.sum() == 0.2 * num_expected
Example #8
    def test_with_y_none(self, cv_split_cls, data):
        data.y = None
        m = self.num_samples // 5
        n = self.num_samples - m
        dataset_train, dataset_valid = cv_split_cls(5)(data)

        assert len(dataset_train) == n
        assert len(dataset_valid) == m

        y_train = data_from_dataset(dataset_train)[1]
        y_valid = data_from_dataset(dataset_valid)[1]

        assert y_train is None
        assert y_valid is None
Example #9
    def test_with_pandas(self, cv_split_cls, data):
        import pandas as pd

        data.X = pd.DataFrame(
            data.X,
            columns=[str(i) for i in range(data.X.shape[1])],
        )
        dataset_train, dataset_valid = cv_split_cls(5)(data)

        m = self.num_samples // 5
        X_train, y_train = data_from_dataset(dataset_train)
        X_valid, y_valid = data_from_dataset(dataset_valid)

        assert len(X_train) + len(X_valid) == self.num_samples
        assert len(y_train) + len(y_valid) == self.num_samples
        assert len(X_valid) == len(y_valid) == m
Example #10
    def on_epoch_end(self, net, dataset_train, dataset_valid, **kwargs):
        EpochScoring.on_epoch_end(
            self, net, dataset_train, dataset_valid, **kwargs)

        # compute a confusion matrix on the validation data and record
        # it in the net's history
        X_test, y_test = data_from_dataset(dataset_valid)
        y_pred = net.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        net.history.record("confusion_matrix", cm)
Example #11
    def test_group_kfold(self, cv_split_cls, data):
        from sklearn.model_selection import GroupKFold

        X, y = data.X, data.y
        n = self.num_samples // 2
        groups = np.asarray([0 for _ in range(n)] +
                            [1 for _ in range(self.num_samples - n)])

        dataset_train, dataset_valid = cv_split_cls(GroupKFold(n_splits=2))(
            data, groups=groups)
        X_train, y_train = data_from_dataset(dataset_train)
        X_valid, y_valid = data_from_dataset(dataset_valid)

        assert np.allclose(X[:n], X_train)
        assert np.allclose(y[:n], y_train)
        assert np.allclose(X[n:], X_valid)
        assert np.allclose(y[n:], y_valid)
Example #12
    def test_shuffle_split_not_reproducible_without_random_state(
            self, cv_split_cls, dataset_cls):
        n = self.num_samples
        X, y = np.random.random((n, 10)), np.random.randint(0, 10, size=n)
        cv = cv_split_cls(0.2, stratified=False)

        dst0, dsv0 = cv(dataset_cls(X, y))
        dst1, dsv1 = cv(dataset_cls(X, y))

        Xt0, yt0 = data_from_dataset(dst0)
        Xv0, yv0 = data_from_dataset(dsv0)
        Xt1, yt1 = data_from_dataset(dst1)
        Xv1, yv1 = data_from_dataset(dsv1)

        assert not np.allclose(Xt0, Xt1)
        assert not np.allclose(Xv0, Xv1)
        assert not np.allclose(yt0, yt1)
        assert not np.allclose(yv0, yv1)
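
Example #12 shows that two shuffle splits of the same data differ when no random_state is given. For contrast, a minimal sketch of the reproducible counterpart, assuming skorch's ValidSplit (named CVSplit in older releases) and data_from_dataset from skorch.utils:

    import numpy as np
    from skorch.dataset import Dataset, ValidSplit
    from skorch.utils import data_from_dataset

    X = np.random.random((100, 10)).astype('float32')
    y = np.random.randint(0, 10, size=100)
    cv = ValidSplit(0.2, stratified=False, random_state=0)

    dst0, _ = cv(Dataset(X, y))
    dst1, _ = cv(Dataset(X, y))
    Xt0, yt0 = data_from_dataset(dst0)
    Xt1, yt1 = data_from_dataset(dst1)
    # a fixed random_state makes the split repeatable
    assert np.allclose(Xt0, Xt1) and np.allclose(yt0, yt1)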
Example #13
    def get_test_data(self, dataset_train, dataset_valid):
        """Return data needed to perform scoring.

        This is a convenience method that handles picking of
        train/valid, different types of input data, use of cache,
        etc. for you.

        Parameters
        ----------
        dataset_train
          Incoming training data or dataset.

        dataset_valid
          Incoming validation data or dataset.

        Returns
        -------
        X_test
          Input data used for making the prediction.

        y_test
          Target ground truth. If caching was enabled, return cached
          y_test.

        y_pred : list
          The predicted targets. If caching was disabled, the list is
          empty. If caching was enabled, the list contains the batches
          of the predictions. It may thus be necessary to concatenate
          the output before working with it:
          ``y_pred = np.concatenate(y_pred)``

        """
        dataset = dataset_train if self.on_train else dataset_valid

        if self.use_caching:
            X_test = dataset
            y_pred = self.y_preds_
            y_test = [self.target_extractor(y) for y in self.y_trues_]
            # In case of y=None we will not have gathered any samples.
            # We expect the scoring function to deal with y_test=None.
            y_test = np.concatenate(y_test) if y_test else None
            return X_test, y_test, y_pred

        if is_skorch_dataset(dataset):
            X_test, y_test = data_from_dataset(
                dataset,
                X_indexing=self.X_indexing_,
                y_indexing=self.y_indexing_,
            )
        else:
            X_test, y_test = dataset, None

        if y_test is not None:
            # We allow y_test to be None but the scoring function has
            # to be able to deal with it (i.e. called without y_test).
            y_test = self.target_extractor(y_test)
        return X_test, y_test, []
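
Because cached predictions come back as a list of batches, a caller has to concatenate them before scoring, as the docstring above notes. A minimal sketch of a callback consuming get_test_data; the subclass name and recorded metric are illustrative, while EpochScoring, to_numpy, and get_test_data are the skorch names used above:

    import numpy as np
    from sklearn.metrics import accuracy_score
    from skorch.callbacks import EpochScoring
    from skorch.utils import to_numpy

    class AccuracyFromCache(EpochScoring):
        def on_epoch_end(self, net, dataset_train, dataset_valid, **kwargs):
            X_test, y_test, y_pred = self.get_test_data(
                dataset_train, dataset_valid)
            if y_test is None or not y_pred:
                return
            # cached predictions arrive batch-wise; concatenate first
            y_pred = np.concatenate([to_numpy(p) for p in y_pred])
            acc = accuracy_score(y_test, y_pred.argmax(-1))
            net.history.record('acc_from_cache', acc)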
Example #14
    def on_epoch_end(self, net, dataset_train, dataset_valid, **kwargs):
        EpochScoring.on_epoch_end(
            self, net, dataset_train, dataset_valid, **kwargs)

        X_test, y_test = data_from_dataset(dataset_valid)
        y_pred = net.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        # normalize each row by the number of true samples of that class
        # and record the result as integer percentages
        cm = cm / cm.sum(axis=1, keepdims=True)
        cm = np.round(cm * 100).astype(np.int8)
        net.history.record("confusion_matrix", cm)
Example #15
    def on_epoch_end(self, net, dataset_train, dataset_valid, **kwargs):

        dataset = dataset_train if self.on_train else dataset_valid

        if self.use_caching:
            X_test = dataset
            y_pred = self.y_preds_
            y_test = [self.target_extractor(y) for y in self.y_trues_]
            # In case of y=None we will not have gathered any samples.
            # We expect the scoring function to deal with y_test=None.
            y_test = np.concatenate(y_test) if y_test else None
        else:
            if is_skorch_dataset(dataset):
                X_test, y_test = data_from_dataset(dataset)
            else:
                X_test, y_test = dataset, None
            y_pred = []
            if y_test is not None:
                # We allow y_test to be None but the scoring function has
                # to be able to deal with it (i.e. called without y_test).
                y_test = self.target_extractor(y_test)

        if X_test is None:
            return

        with cache_net_infer(net, self.use_caching, y_pred) as cached_net:
            current_score = self._scoring(cached_net, X_test, y_test)

            cached_net.history.record(self.name_, current_score)

            is_best = self._is_best_score(current_score)
            if is_best is None:
                return

            cached_net.history.record(self.name_ + '_best', is_best)
            if is_best:
                self.best_score_ = current_score
Example #16
    def on_epoch_end(self, net, dataset_train, dataset_valid, **kwargs):

        dataset = dataset_train if self.on_train else dataset_valid

        if self.use_caching:
            X_test = dataset
            y_pred = self.y_preds_
            y_test = [self.target_extractor(y) for y in self.y_trues_]
            # In case of y=None we will not have gathered any samples.
            # We expect the scoring function to deal with y_test=None.
            y_test = np.concatenate(y_test) if y_test else None
        else:
            if is_skorch_dataset(dataset):
                X_test, y_test = data_from_dataset(
                    dataset,
                    X_indexing=self.X_indexing_,
                    y_indexing=self.y_indexing_,
                )
            else:
                X_test, y_test = dataset, None
            y_pred = []
            if y_test is not None:
                # We allow y_test to be None but the scoring function has
                # to be able to deal with it (i.e. called without y_test).
                y_test = self.target_extractor(y_test)

        if X_test is None:
            return

        with cache_net_infer(net, self.use_caching, y_pred) as cached_net:
            current_score = self._scoring(cached_net, X_test, y_test)

            cached_net.history.record(self.name_, current_score)

            is_best = self._is_best_score(current_score)
            if is_best is None:
                return

            cached_net.history.record(self.name_ + '_best', bool(is_best))
            if is_best:
                self.best_score_ = current_score
Example #17
    def test_with_skorch_ds(self, data_from_dataset, data, skorch_ds):
        X, y = data_from_dataset(skorch_ds)
        assert (X == data[0]).all()
        assert (y == data[1]).all()
Example #18
    def test_with_other_ds(self, data_from_dataset, other_ds):
        with pytest.raises(AttributeError):
            data_from_dataset(other_ds)
Example #19
    def test_with_dict_data(self, data_from_dataset, data, subset):
        subset.dataset.X = {'X': subset.dataset.X}
        X, y = data_from_dataset(subset)
        assert (X['X'] == data[0][[1, 3]]).all()
        assert (y == data[1][[1, 3]]).all()
Example #20
    def test_subset_with_y_none(self, data_from_dataset, data, subset):
        subset.dataset.y = None
        X, y = data_from_dataset(subset)
        assert (X == data[0][[1, 3]]).all()
        assert y is None
Example #21
    def test_with_subset_subset(self, data_from_dataset, data, subset_subset):
        X, y = data_from_dataset(subset_subset)
        assert (X == data[0][1]).all()
        assert (y == data[1][1]).all()
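
Finally, outside of the pytest fixtures used above, a minimal standalone use of data_from_dataset, assuming it lives in skorch.utils as in current skorch:

    import numpy as np
    from torch.utils.data import Subset
    from skorch.dataset import Dataset
    from skorch.utils import data_from_dataset

    X = np.random.random((6, 3)).astype('float32')
    y = np.arange(6)

    # a skorch Dataset yields back exactly the data it was built from
    X_out, y_out = data_from_dataset(Dataset(X, y))
    assert (X_out == X).all() and (y_out == y).all()

    # torch Subsets of a skorch Dataset resolve to the indexed data
    X_sub, y_sub = data_from_dataset(Subset(Dataset(X, y), [1, 3]))
    assert (X_sub == X[[1, 3]]).all() and (y_sub == y[[1, 3]]).all()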