-
Notifications
You must be signed in to change notification settings - Fork 0
/
resampling.py
executable file
·64 lines (54 loc) · 2.22 KB
/
resampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from sklearn.cross_validation import StratifiedKFold,bincount,check_random_state
import numpy as np
class ResampledKFold(StratifiedKFold):
    """Resampled Stratified K-Folds cross validation iterator.

    Provides train/test indices to split data in train test sets.

    This cross-validation object is a variation of KFold that returns
    resampled stratified folds. The folds are made by preserving the
    percentage of samples for each class; then, inside every train and
    test partition, each class is resampled (indices are repeated) until
    it has as many entries as the largest class of that partition, so
    all classes end up with equal shares.

    Parameters
    ----------
    y : array-like, [n_samples]
        Samples to split in K folds.
    n_folds : int, default=3
        Number of folds. Must be at least 2.
    shuffle : boolean, optional
        Whether to shuffle each stratification of the data before
        splitting into batches.
    random_state : None, int or RandomState
        When shuffle=True, pseudo-random number generator state used for
        shuffling. If None, use default numpy RNG for shuffling.

    Notes
    -----
    The resampling step shuffles the indices of every class with a
    generator derived from ``random_state`` regardless of ``shuffle``,
    and the returned index arrays may contain the same index several
    times.
    """

    def __init__(self, y, n_folds=3, shuffle=False, random_state=None):
        super(ResampledKFold, self).__init__(y, n_folds, shuffle, random_state)
        # _resample_partition indexes self.y with an integer index array,
        # which plain lists/tuples do not support, so keep an ndarray.
        self.y = np.asarray(y)

    # We need to override the _PartitionIterator version of this, which
    # only lets each index appear once.
    def __iter__(self):
        """Yield ``(train_index, test_index)`` pairs of resampled indices.

        Each boolean test mask produced by ``self._iter_test_masks()`` is
        turned into train and test index arrays, and both are balanced
        with ``_resample_partition`` before being yielded.
        """
        all_indices = np.arange(self.n)
        for test_mask in self._iter_test_masks():
            train_mask = np.logical_not(test_mask)
            # Train is resampled before test; the order matters when both
            # draw from the same shared random generator.
            train_index = self._resample_partition(all_indices[train_mask])
            test_index = self._resample_partition(all_indices[test_mask])
            yield train_index, test_index

    def _resample_partition(self, partition):
        """Oversample ``partition`` so every class has the same size.

        Parameters
        ----------
        partition : array of int
            Sample indices (positions in ``self.y``) of one partition.

        Returns
        -------
        resampled_partition : ndarray of int
            ``class_share * n_classes`` indices, where ``class_share`` is
            the size of the largest class in ``partition``. The indices
            are grouped by class, in the sorted label order returned by
            ``np.unique``. An empty partition yields an empty array.
        """
        partition = np.asarray(partition)
        if partition.size == 0:
            # No labels means no class to balance; max() of an empty
            # count array would raise.
            return np.empty(0, dtype=np.int_)

        rng = check_random_state(self.random_state)
        y = self.y[partition]
        unique_labels, y_inversed = np.unique(y, return_inverse=True)
        label_counts = np.bincount(y_inversed)
        class_share = label_counts.max()
        resampled_partition = np.empty(class_share * len(unique_labels),
                                       dtype=np.int_)
        for i in range(len(unique_labels)):
            # Select members through the inverse codes instead of
            # ``y == label``: equivalent for ordinary labels, and still
            # non-empty for labels that do not compare equal to
            # themselves (NaN), which would otherwise loop forever below.
            # Boolean indexing returns a copy, so the in-place shuffle
            # never touches ``partition``.
            indices = partition[y_inversed == i]
            class_size = len(indices)
            offset = class_share * i
            added = 0
            # Fill this class's slot in rounds: every round reshuffles
            # the class and copies as many indices as still fit.
            while added < class_share:
                rng.shuffle(indices)
                to_add = min(class_share - added, class_size)
                start = offset + added
                resampled_partition[start:start + to_add] = indices[:to_add]
                added += to_add
        return resampled_partition