Exemplo n.º 1
0
    def _iter_test_indices(self, X, y=None, groups=None):
        """Yield the full sample index range as the test set, once per split.

        Every one of the ``n_splits`` iterations yields the same array
        containing all sample indices (y and groups are ignored).
        """
        all_indices = np.arange(_num_samples(X))
        for _ in range(self.n_splits):
            yield all_indices
Exemplo n.º 2
0
    def _iter_test_indices(self, X, y=None, groups=None):
        """Yield the test indices of one random train/test partition.

        Partitions ``np.arange(n_samples)`` a single time with
        ``train_test_split`` (controlled by ``self.test_size`` and
        ``self.random_state``) and yields only the test half.
        """
        all_indices = np.arange(_num_samples(X))
        _, test_part = train_test_split(
            all_indices,
            test_size=self.test_size,
            random_state=self.random_state,
        )
        yield test_part
Exemplo n.º 3
0
    def split(self, X, y=None, groups=None):
        """Generates indices to split data into training and test set.

        The last ``n_splits * int(n_samples * test_percentage)`` samples are
        cut into consecutive test windows taken from the end of the data;
        each window's train set is everything before it (walk-forward CV).

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like, with shape (n_samples,), optional
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        all_idx = np.arange(n_samples)
        window = int(n_samples * self.test_percentage)
        # k = 1 is the most recent window; larger k moves further back.
        for k in range(1, self.n_splits + 1):
            start = n_samples - window * k
            stop = n_samples - window * (k - 1)
            yield all_idx[0:start], all_idx[start:stop]
Exemplo n.º 4
0
    def split(self, X, y=None, groups=None, window_length=4):
        """Yield (train, test) index pairs of groups ``window_length`` apart.

        For each group position ``pos`` (groups ordered as ``np.unique``
        returns them), the train indices are the samples of group ``pos``
        and the test indices are the samples of group
        ``pos + window_length``.
        """
        X, y, groups = indexable(X, y, groups)
        sample_idx = np.arange(_num_samples(X))
        unique_groups = np.unique(groups)
        for pos in range(len(unique_groups) - window_length):
            train_mask = groups == unique_groups[pos]
            test_mask = groups == unique_groups[pos + window_length]
            yield sample_idx[train_mask], sample_idx[test_mask]
Exemplo n.º 5
0
    def split(self, X, y=None, groups=None):
        """Yield (train, test) index pairs over shuffled group chunks.

        The unique group labels are shuffled in place (using NumPy's global
        RNG — seed ``np.random`` for reproducibility) and cut into
        ``n_splits`` consecutive chunks of ``n_groups // n_splits`` labels;
        remainder labels are dropped.

        NOTE(review): the original yielded the same index expression for
        both train and test, so each fold's train and test sets are
        identical — unusual for a CV splitter; confirm with callers.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        group_lst = np.unique(groups)
        n_groups = len(group_lst)

        indices = np.arange(n_samples)

        cutoff_eras = n_groups // self.n_splits
        np.random.shuffle(group_lst)

        for i in range(self.n_splits):
            # Hoisted: the original evaluated the identical `groups.isin`
            # expression twice per fold; compute the mask once instead.
            mask = groups.isin(
                group_lst[i * cutoff_eras:(i + 1) * cutoff_eras])
            yield (indices[mask], indices[mask])
Exemplo n.º 6
0
 def split(self, X, y=None, groups=None):
     """Group-wise walk-forward splitter.

     Groups (sorted by ``np.unique``) are cut into ``n_splits + 1`` folds;
     each yielded pair trains on all samples whose group precedes
     ``test_start`` and tests on the next ``test_size`` groups, iterating
     from the latest window backwards.
     """
     X, y, groups = indexable(X, y, groups)
     ordered_groups = np.unique(groups)
     n_groups = len(ordered_groups)
     fold_count = self.n_splits + 1
     if fold_count > n_groups:
         raise ValueError(("Cannot have number of folds ={0} greater"
                           " than the number of samples: {1}.").format(
                               fold_count, n_groups))
     sample_idx = np.arange(_num_samples(X))
     test_size = n_groups // fold_count
     first_start = test_size + n_groups % fold_count
     for test_start in reversed(range(first_start, n_groups, test_size)):
         train_sel = groups.isin(ordered_groups[:test_start])
         test_sel = groups.isin(
             ordered_groups[test_start:test_start + test_size])
         yield sample_idx[train_sel], sample_idx[test_sel]
Exemplo n.º 7
0
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Walk-forward splitter over groups ordered by first occurrence:
        each fold trains on up to ``max_train_group_size`` groups ending
        ``group_gap`` groups before the test window and tests on up to
        ``max_test_group_size`` groups.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : list of int
            The training set indices for that split.
        test : list of int
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        # np.unique sorts; re-order by first occurrence so groups keep
        # their original (e.g. chronological) order.
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_groups = _num_samples(unique_groups)
        # Validate before doing the O(n_samples) bucketing below
        # (the original built group_dict first, then raised).
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds, n_groups))

        # Map each group label to the sample indices belonging to it.
        group_dict = {}
        for idx in np.arange(n_samples):
            group_dict.setdefault(groups[idx], []).append(idx)

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0,
                           group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start -
                                                           group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(
                    np.unique(
                        np.concatenate((train_array, train_array_tmp)),
                        axis=None,
                    ),
                    axis=None,
                )
            # Removed unused `train_end = train_array.size`: when the train
            # window was empty, train_array was still a plain list and
            # `.size` raised AttributeError.

            for test_group_idx in unique_groups[
                    group_test_start:group_test_start + group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(
                    np.unique(np.concatenate((test_array, test_array_tmp)),
                              axis=None),
                    axis=None,
                )

            # NOTE(review): this drops the first `group_gap` test SAMPLES
            # (not groups) — looks intentional in the original; confirm.
            test_array = test_array[group_gap:]

            yield [int(i) for i in train_array], [int(i) for i in test_array]
Exemplo n.º 8
0
 def split(self, X, y=None, groups=None):
     """Generate indices to split data into training and test set.

     Walk-forward splitter over groups ordered by first occurrence, with
     ``group_gap`` groups skipped between the train and test windows.

     Parameters
     ----------
     X : array-like of shape (n_samples, n_features)
         Training data, where n_samples is the number of samples
         and n_features is the number of features.
     y : array-like of shape (n_samples,)
         Always ignored, exists for compatibility.
     groups : array-like of shape (n_samples,)
         Group labels for the samples used while splitting the dataset into
         train/test set.

     Yields
     ------
     train : list of int
         The training set indices for that split.
     test : list of int
         The testing set indices for that split.
     """
     if groups is None:
         raise ValueError("The 'groups' parameter should not be None")
     X, y, groups = indexable(X, y, groups)
     n_folds = self.n_splits + 1
     # np.unique returns sorted labels; undo the sort so groups keep
     # their first-occurrence order.
     u, ind = np.unique(groups, return_index=True)
     unique_groups = u[np.argsort(ind)]
     log.debug(f"u={u}, unique_groups={unique_groups}")
     n_samples = _num_samples(X)
     n_groups = _num_samples(unique_groups)
     if n_folds > n_groups:
         raise ValueError(
             ("Cannot have number of folds={0} greater than"
              " the number of groups={1}").format(n_folds, n_groups))
     # Bucket sample indices by their group label.
     group_dict: Dict[int, List[int]] = {}
     for sample_pos in np.arange(n_samples):
         group_dict.setdefault(groups[sample_pos], []).append(sample_pos)
     group_test_size = min(n_groups // n_folds, self.max_test_group_size)
     group_test_starts = range(n_groups - self.n_splits * group_test_size,
                               n_groups, group_test_size)
     for group_test_start in group_test_starts:
         train_array = []
         test_array = []
         group_st = max(
             0,
             group_test_start - self.group_gap - self.max_train_group_size)
         log.debug(
             f"group_st={group_st}, group_test_size={group_test_size}, group_test_starts={group_test_starts}"
         )
         train_window = unique_groups[group_st:(group_test_start -
                                                self.group_gap)]
         for train_group_idx in train_window:
             train_array = np.sort(
                 np.unique(
                     np.concatenate(
                         (train_array, group_dict[train_group_idx])),
                     axis=None),
                 axis=None,
             )
         test_window = unique_groups[group_test_start:group_test_start +
                                     group_test_size]
         for test_group_idx in test_window:
             test_array = np.sort(
                 np.unique(
                     np.concatenate(
                         (test_array, group_dict[test_group_idx])),
                     axis=None),
                 axis=None,
             )
         yield [int(i) for i in train_array], [int(i) for i in test_array]