def _iter_test_indices(self, X, y=None, groups=None):
    """Yield the full sample-index range as the test set for every split.

    Each of the ``self.n_splits`` folds tests on *all* samples;
    ``y`` and ``groups`` are ignored and exist for API compatibility.
    """
    all_idx = np.arange(_num_samples(X))
    for _ in range(self.n_splits):
        yield all_idx
def _iter_test_indices(self, X, y=None, groups=None):
    """Yield a single random test fold.

    The fold is the hold-out side of one ``train_test_split`` draw over
    all sample indices, sized by ``self.test_size`` and seeded by
    ``self.random_state``. ``y`` and ``groups`` are ignored.
    """
    candidates = np.arange(_num_samples(X))
    _, held_out = train_test_split(
        candidates,
        test_size=self.test_size,
        random_state=self.random_state,
    )
    yield held_out
def split(self, X, y=None, groups=None):
    """Generate train/test indices for a percentage-based time-series split.

    The last ``int(n_samples * self.test_percentage)`` samples form the
    first test window, the window before it the second, and so on for
    ``self.n_splits`` folds; each fold trains on everything strictly
    before its test window.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.
    groups : array-like, with shape (n_samples,), optional
        Always ignored, exists for compatibility.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    all_idx = np.arange(n_samples)
    fold_size = int(n_samples * self.test_percentage)
    # Fold k tests on the k-th window counted back from the end of the data.
    for k in range(1, self.n_splits + 1):
        start = n_samples - fold_size * k
        end = n_samples - fold_size * (k - 1)
        yield all_idx[:start], all_idx[start:end]
def split(self, X, y=None, groups=None, window_length=4):
    """Pair each group with the group ``window_length`` positions later.

    Groups are taken in sorted (``np.unique``) order; fold *i* trains on
    the samples of group *i* and tests on the samples of group
    ``i + window_length``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data.
    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.
    groups : array-like, shape (n_samples,)
        Group labels used to form the train/test pairs.
    window_length : int, default=4
        Offset, in group positions, between the train group and its
        paired test group.

    Yields
    ------
    train, test : ndarray
        Sample indices belonging to the paired groups.
    """
    X, y, groups = indexable(X, y, groups)
    sample_idx = np.arange(_num_samples(X))
    unique_grps = np.unique(groups)
    for pos in range(len(unique_grps) - window_length):
        train_mask = groups == unique_grps[pos]
        test_mask = groups == unique_grps[pos + window_length]
        yield sample_idx[train_mask], sample_idx[test_mask]
def split(self, X, y=None, groups=None):
    """Yield ``self.n_splits`` era-based index pairs over shuffled groups.

    The unique group labels are shuffled, partitioned into
    ``n_groups // n_splits``-sized slices, and each fold selects the
    sample indices whose group falls in its slice.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    group_lst = np.unique(groups)
    n_groups = len(group_lst)
    indices = np.arange(n_samples)
    # Whole groups per fold; remainder groups (n_groups % n_splits) are
    # never selected by any fold.
    cutoff_eras = n_groups // self.n_splits
    # NOTE(review): shuffles via the *global* NumPy RNG — results are not
    # reproducible through a random_state attribute; confirm whether
    # seeding is expected here.
    np.random.shuffle(group_lst)
    for i in range(self.n_splits):
        # NOTE(review): both elements of the yielded pair select the SAME
        # group slice, so train == test for every fold. If a disjoint
        # train set was intended (e.g. the complement of the slice), this
        # is a bug — confirm against callers.
        # `groups.isin(...)` presumably requires `groups` to be a pandas
        # Series — TODO confirm.
        yield (indices[groups.isin(
            group_lst[i * cutoff_eras:i * cutoff_eras + cutoff_eras])],
               indices[groups.isin(
                   group_lst[i * cutoff_eras:i * cutoff_eras + cutoff_eras])])
def split(self, X, y=None, groups=None):
    """Time-series split over whole groups (eras).

    Groups are taken in sorted (``np.unique``) order. Each fold tests on
    one contiguous block of ``test_size`` groups and trains on every
    group that sorts before the block; any remainder groups
    (``n_groups % n_folds``) are absorbed into the earliest training
    span. Folds are yielded most-recent test block first.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data.
    y : array-like, shape (n_samples,)
        Always ignored, exists for compatibility.
    groups : array-like, shape (n_samples,)
        Group (era) labels for the samples. `groups.isin(...)`
        presumably requires a pandas Series — TODO confirm.

    Yields
    ------
    train, test : ndarray
        Sample indices for each split.

    Raises
    ------
    ValueError
        If ``self.n_splits + 1`` exceeds the number of distinct groups.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_folds = self.n_splits + 1
    group_list = np.unique(groups)
    n_groups = len(group_list)
    if n_folds > n_groups:
        # Fixed message: the limit being exceeded is the number of
        # distinct *groups*, not the number of samples.
        raise ValueError(("Cannot have number of folds ={0} greater"
                          " than the number of groups: {1}.").format(
                              n_folds, n_groups))
    indices = np.arange(n_samples)
    test_size = (n_groups // n_folds)
    # Ascending start positions; reversed so the latest test block comes
    # first. The first start is offset by the remainder groups, which
    # therefore only ever appear in training spans.
    test_starts = list(range(test_size + n_groups % n_folds,
                             n_groups, test_size))[::-1]
    for test_start in test_starts:
        yield (indices[groups.isin(group_list[:test_start])],
               indices[groups.isin(
                   group_list[test_start:test_start + test_size])])
def split(self, X, y=None, groups=None):
    """Generate purged, group-aware time-series train/test indices.

    Groups are taken in order of first occurrence. Each fold tests on a
    block of up to ``max_test_group_size`` groups; the training window
    ends ``group_gap`` groups before the test block and is capped at
    ``max_train_group_size`` groups. The first ``group_gap`` test
    *samples* are additionally dropped as a purge.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like of shape (n_samples,)
        Always ignored, exists for compatibility.
    groups : array-like of shape (n_samples,)
        Group labels for the samples used while splitting the dataset
        into train/test set. Must not be None.

    Yields
    ------
    train : list of int
        The training set indices for that split.
    test : list of int
        The testing set indices for that split.

    Raises
    ------
    ValueError
        If ``groups`` is None, or if ``n_splits + 1`` exceeds the
        number of distinct groups.
    """
    if groups is None:
        raise ValueError("The 'groups' parameter should not be None")
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    group_gap = self.group_gap
    max_test_group_size = self.max_test_group_size
    max_train_group_size = self.max_train_group_size
    n_folds = n_splits + 1
    # np.unique sorts its output, so re-order the unique labels by the
    # index of their first occurrence to preserve temporal order.
    u, ind = np.unique(groups, return_index=True)
    unique_groups = u[np.argsort(ind)]
    n_groups = _num_samples(unique_groups)
    if n_folds > n_groups:
        raise ValueError(
            ("Cannot have number of folds={0} greater than"
             " the number of groups={1}").format(n_folds, n_groups))
    # Map each group label to the sample indices belonging to it.
    group_dict = {}
    for idx in np.arange(n_samples):
        group_dict.setdefault(groups[idx], []).append(idx)
    group_test_size = min(n_groups // n_folds, max_test_group_size)
    group_test_starts = range(n_groups - n_splits * group_test_size,
                              n_groups, group_test_size)
    for group_test_start in group_test_starts:
        train_array = []
        test_array = []
        # Training window: ends group_gap groups before the test block,
        # capped at max_train_group_size groups.
        group_st = max(0, group_test_start - group_gap -
                       max_train_group_size)
        for train_group_idx in unique_groups[group_st:(group_test_start -
                                                       group_gap)]:
            train_array = np.sort(
                np.unique(
                    np.concatenate(
                        (train_array, group_dict[train_group_idx])),
                    axis=None,
                ),
                axis=None,
            )
        for test_group_idx in unique_groups[
                group_test_start:group_test_start + group_test_size]:
            test_array = np.sort(
                np.unique(
                    np.concatenate(
                        (test_array, group_dict[test_group_idx])),
                    axis=None,
                ),
                axis=None,
            )
        # Purge: drop the first group_gap test samples. NOTE(review):
        # this trims *samples*, not whole groups — confirm intended.
        test_array = test_array[group_gap:]
        yield [int(i) for i in train_array], [int(i) for i in test_array]
def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like of shape (n_samples,)
        Always ignored, exists for compatibility.
    groups : array-like of shape (n_samples,)
        Group labels for the samples used while splitting the dataset
        into train/test set.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.
    """
    if groups is None:
        raise ValueError("The 'groups' parameter should not be None")
    X, y, groups = indexable(X, y, groups)
    n_folds = self.n_splits + 1
    # np.unique returns sorted groups
    u, ind = np.unique(groups, return_index=True)
    # re-sort unique groups in order of first occurrence
    unique_groups = u[np.argsort(ind)]
    log.debug(f"u={u}, unique_groups={unique_groups}")
    n_samples = _num_samples(X)
    n_groups = _num_samples(unique_groups)
    if n_folds > n_groups:
        raise ValueError(
            ("Cannot have number of folds={0} greater than"
             " the number of groups={1}").format(n_folds, n_groups))
    # Map each group label to the list of sample indices that carry it.
    group_dict: Dict[int, List[int]] = {}
    for idx in np.arange(n_samples):
        if groups[idx] in group_dict:
            group_dict[groups[idx]].append(idx)
        else:
            group_dict[groups[idx]] = [idx]
    # Test block size: an equal share of the groups, capped by
    # max_test_group_size.
    group_test_size = min(n_groups // n_folds, self.max_test_group_size)
    group_test_starts = range(n_groups - self.n_splits * group_test_size,
                              n_groups, group_test_size)
    for group_test_start in group_test_starts:
        train_array = []
        test_array = []
        # Training window starts max_train_group_size groups before the
        # gap that precedes the test block (clamped at 0).
        group_st = max(
            0,
            group_test_start - self.group_gap - self.max_train_group_size)
        log.debug(
            f"group_st={group_st}, group_test_size={group_test_size}, group_test_starts={group_test_starts}"
        )
        # Accumulate sorted, de-duplicated train indices group by group.
        for train_group_idx in unique_groups[group_st:(group_test_start -
                                                       self.group_gap)]:
            tmp = group_dict[train_group_idx]
            train_array = np.sort(
                np.unique(np.concatenate((train_array, tmp)), axis=None),
                axis=None,
            )
        # Accumulate sorted, de-duplicated test indices for the test block.
        # NOTE(review): group_gap only shortens the training window here;
        # no samples are purged from the test block — confirm intended.
        for test_group_idx in unique_groups[
                group_test_start:group_test_start + group_test_size]:
            tmp = group_dict[test_group_idx]
            test_array = np.sort(
                np.unique(np.concatenate((test_array, tmp)), axis=None),
                axis=None,
            )
        yield [int(i) for i in train_array], [int(i) for i in test_array]