Example #1
from collections.abc import Iterable

import numpy as np
from sklearn.model_selection import StratifiedKFold


class ThresholdStratifiedKFold(object):
    """StratifiedKFold for continuous targets: bins ``y`` at the given
    thresholds and stratifies the folds on the resulting bin index."""
    def __init__(self, thresholds, *args, **kwargs):
        if isinstance(thresholds, Iterable):
            self.thresholds = list(thresholds)
        else:
            self.thresholds = [thresholds]
        self.stratified = StratifiedKFold(*args, **kwargs)
    
    def get_n_splits(self, *args,  **kwargs):
        return self.stratified.get_n_splits(*args, **kwargs)
    
    def split(self, X, y):
        y_thresh = np.zeros(y.shape)
        for thresh in self.thresholds:
            y_thresh += y >= thresh
        for train, test in self.stratified.split(X, y_thresh):
            yield train, test
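A minimal usage sketch (toy data assumed): the wrapper bins a continuous target at the given thresholds and stratifies the folds on the resulting bin index.

import numpy as np

X = np.random.rand(90, 4)       # hypothetical feature matrix
y = np.linspace(0.0, 1.0, 90)   # continuous target

cv = ThresholdStratifiedKFold([0.33, 0.66], n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in cv.split(X, y):
    print(len(train_idx), len(test_idx))   # ~72 train / ~18 test per fold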
import numbers
from collections.abc import Iterable

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection._split import _CVIterableWrapper
from sklearn.utils.multiclass import type_of_target


def check_cv2(cv=3, y=None, classifier=False, random_state=None):
    """Input checker utility for building a cross-validator

    NOTE: this is the same as sklearn.model_selection._split.check_cv, but with
    an added ``random_state`` parameter so that nested CV splits are reproducible.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True and ``y`` is either
        binary or multiclass, :class:`StratifiedKFold` is used. In all other
        cases, :class:`KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    y : array-like, optional
        The target variable for supervised learning problems.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    random_state : None, int or RandomState
        When shuffle=True, pseudo-random number generator state used for
        shuffling. If None, use default numpy RNG for shuffling.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the train/test
        splits via the ``split`` method.
    """
    if cv is None:
        cv = 3

    if isinstance(cv, numbers.Integral):
        if (classifier and (y is not None)
                and (type_of_target(y) in ('binary', 'multiclass'))):
            return StratifiedKFold(cv, random_state=random_state)
        else:
            return KFold(cv, random_state=random_state)

    if not hasattr(cv, 'split') or isinstance(cv, str):
        if not isinstance(cv, Iterable) or isinstance(cv, str):
            raise ValueError("Expected cv as an integer, cross-validation "
                             "object (from sklearn.model_selection) "
                             "or an iterable. Got %s." % cv)
        return _CVIterableWrapper(cv)

    return cv  # New style cv objects are passed without any modification
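A brief usage sketch with toy data (assumed): for a binary target and classifier=True the helper returns a StratifiedKFold. Recent scikit-learn releases only accept a non-None random_state together with shuffle=True, so this sketch leaves random_state at its default.

import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.array([0, 1] * 5)

cv = check_cv2(cv=5, y=y, classifier=True)   # -> StratifiedKFold with 5 splits
print(cv.get_n_splits(X, y))                 # 5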
Example #3
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import StratifiedKFold


def MLKFoldCrossValid(epoch):
    seed = 7
    np.random.seed(seed)
    dataset = np.loadtxt("pima-indians-diabetes.csv", delimiter=",")
    X = dataset[:, 0:8]
    Y = dataset[:, 8]
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    cvscores = []

    for train, test in kfold.split(X, Y):
        layers = [Dense(12, input_dim=8, activation='relu', kernel_initializer='uniform'),
                  Dense(8, activation='relu', kernel_initializer='uniform'),
                  Dense(1, activation='sigmoid', kernel_initializer='uniform')]
        model = Sequential(layers)
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        model.fit(X[train], Y[train], epochs=epoch, batch_size=10, verbose=0)
        scores = model.evaluate(X[test], Y[test], verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
        cvscores.append(scores[1] * 100)
    print("%.2f%%(+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
import numbers

from dask import delayed
from dask.base import Base  # collection base class in older dask releases
from sklearn import model_selection
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils.multiclass import type_of_target


def check_cv(cv=3, y=None, classifier=False):
    """Dask aware version of ``sklearn.model_selection.check_cv``

    Same as the scikit-learn version, but works if ``y`` is a dask object.
    """
    if cv is None:
        cv = 3

    # If ``cv`` is not an integer, the scikit-learn implementation doesn't
    # touch the ``y`` object, so passing on a dask object is fine
    if not isinstance(y, Base) or not isinstance(cv, numbers.Integral):
        return model_selection.check_cv(cv, y, classifier)

    if classifier:
        # ``y`` is a dask object. We need to compute the target type
        target_type = delayed(type_of_target, pure=True)(y).compute()
        if target_type in ('binary', 'multiclass'):
            return StratifiedKFold(cv)
    return KFold(cv)
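A hedged usage sketch, assuming an older dask release in which array collections derive from dask.base.Base (the class the isinstance check above relies on).

import dask.array as da
import numpy as np

y = da.from_array(np.array([0, 1, 0, 1, 0, 1]), chunks=3)
cv = check_cv(cv=3, y=y, classifier=True)   # target type computed lazily -> StratifiedKFold(3)
print(cv)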
import csv
import json
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# ``parameters`` is assumed to be the project's configuration dict
# (data paths, data length, model/result file paths) defined elsewhere.

data_csv = pd.read_csv('dataset.csv')
data_csv = data_csv.sort_values(['icustay_id'])
data = np.array([
    itemid for itemid in list(data_csv['icustay_id'])
    if os.path.exists(parameters['dataPath'] + '{}.csv'.format(itemid))
])
data_csv = data_csv[data_csv['icustay_id'].isin(data)]
data = np.array(
    [parameters['dataPath'] + '{}.csv'.format(itemid) for itemid in data])
classes = np.array([0 if c == 'S' else 1 for c in list(data_csv['class'])])
classes_for_stratified = np.array(
    [0 if c == 'S' else 1 for c in list(data_csv['class'])])
print('S', len([c for c in classes if c == 0]))
print('R', len([c for c in classes if c == 1]))
# Using a seed always will get the same data split even if the training stops
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=15)

# Get input shape
aux = pd.read_csv(data[0])
inputShape = (parameters['dataLength'], len(aux.columns))

config = None
if os.path.exists(parameters['modelConfigPath']):
    with open(parameters['modelConfigPath'], 'r') as configHandler:
        config = json.load(configHandler)

i = 0
# ====================== Script that starts training new models
# where the results for each fold are appended
with open(parameters['resultFilePath'], 'a+') as cvsFileHandler:
    dictWriter = None
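    # The original script is cut off here. What follows is a hypothetical
    # continuation sketch (not the project's actual code): iterate the
    # stratified folds and append one summary row per fold via csv.DictWriter.
    for train_idx, test_idx in kf.split(data, classes_for_stratified):
        i += 1
        row = {'fold': i, 'train_size': len(train_idx), 'test_size': len(test_idx)}
        if dictWriter is None:
            dictWriter = csv.DictWriter(cvsFileHandler, fieldnames=list(row.keys()))
            dictWriter.writeheader()
        dictWriter.writerow(row)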
Example #6
    def kfold_stratified(self,
                         n_splits=5,
                         n_repeats=0,
                         shuffle=False,
                         random_state=None):
        '''
        Uses sklearn's StratifiedKFold and RepeatedStratifiedKFold facility.
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
        See also method kfold() for folding without balancing.
        
        The 'X' in this context are sample ids that are eventually 
        used by the dataloader to retrieve spectrograms. Recall
        that each sample id stands for one spectrogram snippet. The
        'y' vector in the KFold page is the 'label' column in the
        Sqlite db, which is 1 or 0 for each column (i.e. time bin)
        of the spectrogram.
        
        All methods on the sklearn [Repeated]StratifiedKFold facility
        are available in this class by the same name.
        
        After calling this method, calls to next() will
        return train samples. I.e. the current queue is set
        to self.train_queue
        
        
        @param n_splits: number of folds to create 
        @type n_splits: int
        @param n_repeats: number of times the fold splitting should
            be repeated (n-times k-fold cross-validation).
            Set to zero, the method uses sklearn's StratifiedKFold class;
            otherwise it uses RepeatedStratifiedKFold.
        @type n_repeats: int
        @param shuffle: whether or not to shuffle the 
            data before splitting. Once split, the 
            data in the folds are not shuffled 
        @type shuffle: bool
        @param random_state: if shuffle is set to True,
            this argument allows for repeatability over
            multiple runs
        @type random_state: int
        '''
        if n_repeats == 0:
            self.cross_validator = StratifiedKFold(n_splits=n_splits,
                                                   shuffle=shuffle,
                                                   random_state=random_state)
        else:
            self.cross_validator = RepeatedStratifiedKFold(
                n_splits=n_splits,
                n_repeats=n_repeats,
                random_state=random_state)

        # The following retrieves *indices* into
        # our list of sample_ids. However, since
        # our sample_ids are just numbers from 0 to n,
        # the indices are equivalent to the sample ids
        # themselves

        # The split method returns a generator
        # object. Each item in this generator is
        # a 2-tuple: an array of train indices and an
        # array of validation (test) indices. There
        # will be n_splits such tuples.

        # We grab the first pair:

        all_labels = self.labels_from_db(self.sample_ids)
        self.folds_iter = self.cross_validator.split(self.sample_ids,
                                                     all_labels)
        (self.train_sample_ids, self.validate_sample_ids) = \
            next(self.folds_iter)

        self.train_queue = deque(self.train_sample_ids)
        self.val_queue = deque(self.validate_sample_ids)
        self.train_labels = self.labels_from_db(self.train_sample_ids)
        self.validate_labels = self.labels_from_db(self.validate_sample_ids)
        self.switch_to_split('train')
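A small self-contained sketch (independent of the class above) of the n_repeats behaviour described in the docstring: RepeatedStratifiedKFold yields n_splits * n_repeats train/test index pairs, while StratifiedKFold yields n_splits.

import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold

sample_ids = np.arange(12)        # stand-in for the sample ids mentioned above
labels = np.array([0, 1] * 6)     # stand-in for the per-sample labels

plain = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
repeated = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)

print(sum(1 for _ in plain.split(sample_ids, labels)))     # 3 folds
print(sum(1 for _ in repeated.split(sample_ids, labels)))  # 6 = 3 splits x 2 repeats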