Example #1
    def IrisData(self, args):
        # init splitting size
        if (len(args) > 1):
            self.test_size = float(args[1])
        if (len(args) > 2):
            self.train_unlabeled_size = float(args[2])

        # read data
        data_load = np.loadtxt(self.iris_data_file, dtype='str', delimiter=',')
        map_load = np.genfromtxt(self.iris_map_file,
                                 dtype='str',
                                 delimiter=',')

        # re-index class to number 0, 1, ..., c
        index_map = {}
        for i in range(len(map_load)):
            index_map[map_load[i]] = i

        for i, d in enumerate(data_load):
            d[-1] = index_map.get(d[-1])

        # split the data into 3 parts with nearly the same class proportions:
        # first split train/test, then split the train part into labeled and unlabeled
        sss1 = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                      test_size=self.test_size,
                                                      random_state=0)
        train_indices = []  # just in case
        for train_indices, test_indices in sss1.split(data_load,
                                                      data_load.T[-1]):
            np.savetxt(self.iris_output_test,
                       data_load[test_indices],
                       fmt="%s",
                       delimiter=',')  # test first
        #
        if self.train_unlabeled_size == 0:
            np.savetxt(self.iris_output_train[0],
                       data_load[train_indices],
                       fmt='%s',
                       delimiter=',')  # only train
        else:
            sss2 = model_selection.StratifiedShuffleSplit(
                n_splits=1,
                test_size=self.train_unlabeled_size,
                random_state=0)
            for train_label_indices, train_unlabel_indices in sss2.split(
                    data_load[train_indices], data_load[train_indices].T[-1]):
                np.savetxt(self.iris_output_train[0],
                           data_load[train_indices][train_label_indices],
                           fmt='%s',
                           delimiter=',')
                np.savetxt(self.iris_output_train[1],
                           data_load[train_indices][train_unlabel_indices],
                           fmt='%s',
                           delimiter=',')

        # map file generate
        np.savetxt(self.iris_output_map,
                   np.mat(map_load)[0],
                   fmt="%s",
                   delimiter=',')
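A standalone sketch of the same two-stage pattern (hold out a stratified test set first, then split the remaining train rows into labeled and unlabeled parts), using sklearn's bundled iris data instead of the class's file paths; the 0.2/0.5 sizes are placeholders:

from sklearn import datasets, model_selection

X, y = datasets.load_iris(return_X_y=True)

# Stage 1: hold out a stratified test set.
sss1 = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_idx, test_idx = next(sss1.split(X, y))

# Stage 2: split the remaining train rows into labeled and unlabeled parts.
sss2 = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
label_idx, unlabel_idx = next(sss2.split(X[train_idx], y[train_idx]))

X_labeled, y_labeled = X[train_idx][label_idx], y[train_idx][label_idx]
X_unlabeled = X[train_idx][unlabel_idx]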
Example #2
def train_test_split(X,
                     Y,
                     test_size=.2,
                     use_examples_num=None,
                     random_state=None):
    # First - use only required number of examples
    if use_examples_num:
        if use_examples_num > X.shape[0]:
            raise ValueError('use_examples_num exceeds the number of available examples')

        cv = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                    test_size=use_examples_num,
                                                    random_state=random_state)
        _, index = next(cv.split(X, np.argmax(Y, axis=1)))
        X, Y = X[index], Y[index]

    if not test_size:
        return X, np.array([]), Y, np.array([])

    # Second - split train/test
    cv = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                test_size=test_size,
                                                random_state=random_state)
    train_idx, test_idx = next(cv.split(X, np.argmax(Y, axis=1)))

    return X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]
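A minimal usage sketch for the helper above on synthetic data; it assumes one-hot labels, since the helper stratifies on np.argmax(Y, axis=1):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
Y = np.eye(2)[rng.randint(0, 2, size=100)]  # one-hot labels

# Keep a stratified subset of 50 examples, then split it 80/20.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.2, use_examples_num=50, random_state=0)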
Example #3
 def __call__(self, table):
     if self.replace:
         # pylint: disable=no-member
         rgen = np.random.RandomState(self.random_state)
         sample = rgen.randint(0, len(table), self.n)
         o = np.ones(len(table))
         o[sample] = 0
         others = np.nonzero(o)[0]
         return others, sample
     if self.n == len(table):
         rgen = np.random.RandomState(self.random_state)
         sample = np.arange(self.n)
         rgen.shuffle(sample)
         return np.array([], dtype=int), sample
     elif self.stratified and table.domain.has_discrete_class:
         test_size = max(len(table.domain.class_var.values), self.n)
         splitter = skl.StratifiedShuffleSplit(
             n_splits=1,
             test_size=test_size,
             train_size=len(table) - test_size,
             random_state=self.random_state)
         splitter.get_n_splits(table.X, table.Y)
         ind = splitter.split(table.X, table.Y)
     else:
         splitter = skl.ShuffleSplit(n_splits=1,
                                     test_size=self.n,
                                     random_state=self.random_state)
         splitter.get_n_splits(table)
         ind = splitter.split(table)
     return next(iter(ind))
Example #4
    def fit(self, X, y):
        print("Fitting an SGD ElasticNet classification model...")
        t_start = time.time()
        n_iter = np.ceil(10**6 / float(len(y)))

        self.standardizer = preprocessing.StandardScaler()
        X = self.standardizer.fit_transform(X)

        alpha_range = 10.0**-np.arange(1, 7)
        param_grid = []
        param_grid.append(
            dict(loss=['log', 'modified_huber'],
                 alpha=alpha_range,
                 n_iter=[n_iter],
                 penalty=['elasticnet'],
                 l1_ratio=[.1, .5, .7, .9, .95, .99, 1.]))
        print("Using param grid " + str(param_grid))
        self.clf = linear_model.SGDClassifier(random_state=1337)
        cv = model_selection.StratifiedShuffleSplit(n_splits=5,
                                                    test_size=0.2,
                                                    random_state=0)
        self.clf = model_selection.GridSearchCV(self.clf,
                                                param_grid=param_grid,
                                                cv=cv,
                                                n_jobs=7)
        self.clf.fit(X, y)
        print("Best params: " + str(self.clf.best_params_) +
              " and corresponding score is " + str(self.clf.best_score_))

        utime = time.time() - t_start
        print("Done fitting. Took time " + str(utime))
Example #5
 def fit(self, X, y):
     # Split into categorical,numerical categories:
     self.cat_clf = pipeline.Pipeline((('cat-tf', CategoricalTransformer()),
                                       ('bnb', naive_bayes.BernoulliNB())))
     self.num_clf = pipeline.Pipeline(
         (('num-tf', NumericalTransformer()), ('gnb',
                                               naive_bayes.GaussianNB())))
     weights_range = [[
         a, 1.0 - a
     ] for a in [0., .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0]]
     voting_range = ['soft']
     param_grid = dict(voting=voting_range, weights=weights_range)
      print("Using param grid " + str(param_grid))
     cv = model_selection.StratifiedShuffleSplit(n_splits=5,
                                                 test_size=0.2,
                                                 random_state=0)
     self.clf = ensemble.VotingClassifier(
         estimators=[('num-clf', self.num_clf), ('cat-clf', self.cat_clf)])
     self.clf = model_selection.GridSearchCV(self.clf,
                                             param_grid=param_grid,
                                             cv=cv,
                                             n_jobs=7)
     self.clf.fit(X, y)
      print("Best params: " + str(self.clf.best_params_) +
            " and corresponding score is " + str(self.clf.best_score_))
Example #6
def _split(data, test_size, random_state):
    """Splits the data into train and valid sets in a stratified
    manner. Also puts similar examples in the same set (based
    on _/input/similar_examples_hashXXX.npy)."""

    data['fold'] = -1

    classes = (
        np.where(data.data_provider == 'karolinska', 6, 0)
        + data.isup_grade.values)

    skf = model_selection.StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state)

    skf_iterator = skf.split(
        X=data.image_id,
        y=classes)

    train_idx, valid_idx = next(skf_iterator)
    data.loc[valid_idx, 'fold'] = 0
    data.loc[train_idx, 'fold'] = 1

    return data[data.fold != 0], data[data.fold == 0]
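A hedged sketch with a synthetic frame shaped like the metadata this helper expects (image_id, data_provider and isup_grade columns, with a default RangeIndex so data.loc works on the positional indices):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 200
data = pd.DataFrame({
    'image_id': np.arange(n),
    'data_provider': rng.choice(['karolinska', 'radboud'], size=n),
    'isup_grade': rng.randint(0, 6, size=n),
})
train_df, valid_df = _split(data, test_size=0.2, random_state=0)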
Example #7
    def test_StratifiedShuffleSplit(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)
        sf1 = df.model_selection.StratifiedShuffleSplit(
            random_state=self.random_state)
        sf2 = ms.StratifiedShuffleSplit(random_state=self.random_state)

        # consume generator
        ind1 = [x for x in sf1.split(df.data.values, df.target.values)]
        ind2 = [x for x in sf2.split(iris.data, iris.target)]

        for i1, i2 in zip(ind1, ind2):
            self.assertIsInstance(i1, tuple)
            self.assertEqual(len(i1), 2)
            self.assertIsInstance(i2, tuple)
            self.assertEqual(len(i2), 2)
            self.assert_numpy_array_equal(i1[0], i2[0])
            self.assert_numpy_array_equal(i1[1], i2[1])

        sf1 = df.model_selection.StratifiedShuffleSplit(
            random_state=self.random_state)
        with tm.assert_produces_warning(FutureWarning):
            gen = df.model_selection.iterate(sf1)

        # StratifiedShuffleSplit is not a subclass of BaseCrossValidator
        for train_df, test_df in gen:
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            self.assert_index_equal(df.columns, train_df.columns)
            self.assert_index_equal(df.columns, test_df.columns)

            self.assertEqual(df.shape[0], train_df.shape[0] + test_df.shape[0])
Example #8
def classifier_tester(classifier, x, y):
    sss = model_selection.StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
    scores = model_selection.cross_validate(classifier, x, y, scoring='accuracy', cv=sss)
    acc = scores['test_score']
    print('accuracies =', acc * 100)
    print('total acc =', round(acc.mean() * 100, 2), round(acc.std() * 100, 2))
    print('test time =', scores['score_time'])
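For instance (a hedged sketch; the iris data and SVC settings are arbitrary placeholders):

from sklearn import datasets, svm

iris = datasets.load_iris()
classifier_tester(svm.SVC(kernel='rbf', gamma='scale'), iris.data, iris.target)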
Example #9
 def fit(self,
         x,
         y,
         sample_weight=None,
         check_input=True,
         x_idx_sorted=None):
     if self._alpha is None:
         return self._learner.fit(x,
                                  y,
                                  sample_weight=sample_weight,
                                  check_input=check_input,
                                  X_idx_sorted=x_idx_sorted)
     if sample_weight is None:
         sample_weight = np.ones(x.shape[0])
     self.training_x = x.copy()
     self.training_y = y.copy()
     self.training_weights = sample_weight.copy()
     # TODO: Make this tunable? at least random_state?
     sss = ms.StratifiedShuffleSplit(n_splits=1,
                                     test_size=0.2,
                                     random_state=123)
     for train_index, test_index in sss.split(self.training_x,
                                              self.training_y):
         self.value_x = self.training_x[test_index]
         self.value_y = self.training_y[test_index]
         self.training_x = self.training_x[train_index]
         self.training_y = self.training_y[train_index]
         self.value_weights = sample_weight[test_index]
         self.training_weights = sample_weight[train_index]
     self._learner.fit(self.training_x, self.training_y,
                       self.training_weights, check_input, x_idx_sorted)
     self.prune()
     return self
Example #10
    def eval_classification(self, session, labels, train_size):
        sk_graph = self._skipgram_graph
        node_embeddings = session.run(sk_graph["normalized_embeddings"])

        # Classifier choice
        classifier = linear_model.LogisticRegression(C=10)
        #classifier = svm.SVC(C=1)

        scoring = ['accuracy', 'f1_macro', 'f1_micro']

        shuffle = model_selection.StratifiedShuffleSplit(n_splits=5,
                                                         test_size=0.8)

        cv_scores = model_selection.cross_validate(classifier,
                                                   node_embeddings,
                                                   labels,
                                                   scoring=scoring,
                                                   cv=shuffle,
                                                   return_train_score=True)
        train_acc = cv_scores['train_accuracy'].mean()
        train_f1 = cv_scores['train_f1_macro'].mean()
        test_acc = cv_scores['test_accuracy'].mean()
        test_f1 = cv_scores['test_f1_macro'].mean()

        print("Train acc: {:0.3f}, f1: {:0.3f}".format(train_acc, train_f1))
        print("Test acc: {:0.3f}, f1: {:0.3f}".format(test_acc, test_f1))

        return {
            'train_acc': train_acc,
            'test_acc': test_acc,
            'train_f1': train_f1,
            'test_f1': test_f1
        }
Example #11
def abide1_subtype_stability_core(n_cpu):
    # Hardcoded variables
    scale = 20
    state = 1
    n_boot = 1000
    dist_thr = 0.99
    part_thr = 20
    regressors = 'AGE_AT_SCAN+fd_scrubbed+SITE_ID'
    # Paths
    root_p = pal.Path(__file__).resolve().parents[2] / 'data'
    pheno_p = root_p / 'pheno/ABIDE1_Pheno_PSM_matched_minimum_10.tsv'
    # Data
    sca_p = root_p / f'preprocessed/seed_maps/abide_1/MIST_{scale}'
    sca_t = f'sub_{{}}_ses_{{}}_run{{}}_mist_{scale}_nocereb.npy'
    # Output
    out_d = root_p / 'processed/stability/abide_1'
    out_p = out_d / f'abide_1_subtype_stability_mist_{scale}_core_{part_thr:d}_within_{dist_thr*100:.0f}.npz'
    if not out_d.is_dir():
        out_d.mkdir()

    pheno = pd.read_csv(pheno_p, sep='\t')
    seed_paths = [
        sca_p / sca_t.format(row['SUB_ID'], row['session'], row['run'])
        for rid, row in pheno.iterrows()
    ]
    subject_stack = np.array([np.load(p) for p in seed_paths])
    n_sub, n_vox, n_roi = subject_stack.shape
    splitter = skm.StratifiedShuffleSplit(n_splits=n_boot,
                                          test_size=0.5,
                                          random_state=state)
    asd_label = (pheno.DX_GROUP == 'Autism').values.astype(int)
    n_samples = len(asd_label)
    if n_samples != n_sub:
        raise Exception(
            f'got {n_sub} subjects in residual but {n_samples} in the pheno file. This does not work.'
        )
    # data_stack, mode='classic', n_subtypes=3, dist_thr=0.7, part_thr=20
    job_arg_list = [{
        'data_stack': subject_stack,
        'sbt_idx': train,
        'dist_thr': dist_thr,
        'part_thr': part_thr,
        'regressors': regressors,
        'pheno': pheno
    } for train, test in splitter.split(X=np.zeros(n_samples), y=asd_label)]
    train_indices_list, _ = zip(
        *list(splitter.split(X=np.zeros(n_samples), y=asd_label)))
    # decorate the subtype function
    ex = futures.ThreadPoolExecutor(max_workers=n_cpu)
    results = {
        run_id: res
        for run_id, res in zip(
            range(len(job_arg_list)),
            list(
                tqdm(ex.map(wrap_subtype_stability, job_arg_list),
                     total=len(job_arg_list))))
    }

    # Store the results
    np.savez(out_p, train_idx=train_indices_list, partitions=results)
Example #12
def SVM_hyper(X_train_pca, y_train, X_test_pca, y_test):
    k_list = list(range(1, 50, 1))
    all_train = []
    all_test = []

    sss = model_selection.StratifiedShuffleSplit(n_splits=20,
                                                 test_size=0.5,
                                                 random_state=0)
    for train_index, test_index in sss.split(X_train_pca, y_train):
        train_scores = []
        val_scores = []

        split_X_train = X_train_pca[train_index]
        split_y_train = y_train[train_index]
        split_X_val = X_train_pca[test_index]
        split_y_val = y_train[test_index]

        for k in k_list:
            clf_SVM = SVC(kernel='poly', degree=3, gamma='scale', coef0=k, C=1)

            #clf_knn = neighbors.KNeighborsClassifier(n_neighbors=k)
            clf_SVM.fit(split_X_train, split_y_train)

            # Score the classifier on the training and validation splits
            score_train = clf_SVM.score(split_X_train, split_y_train)
            score_val = clf_SVM.score(split_X_val, split_y_val)

            train_scores.append(score_train)
            val_scores.append(score_val)

        all_train.append(train_scores)
        all_test.append(val_scores)

    # Create numpy array of scores and calculate the mean and std
    all_train = np.array(all_train)
    all_test = np.array(all_test)

    train_scores_mean = all_train.mean(axis=0)
    train_scores_std = all_train.std(axis=0)

    test_scores_mean = all_test.mean(axis=0)
    test_scores_std = all_test.std(axis=0)

    # Plot the mean scores and the std as shading
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.grid()
    ax.fill_between(k_list,
                    train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std,
                    alpha=0.1,
                    color="r")
    ax.fill_between(k_list,
                    test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std,
                    alpha=0.1,
                    color="g")
    ax.plot(k_list, train_scores_mean, 'o-', color="r", label="Training score")
    ax.plot(k_list, test_scores_mean, 'o-', color="g", label="Validation score")
    ax.legend(loc="best")
    plt.show()
Example #13
def sample(table, n=0.7, stratified=False, replace=False, random_state=None):
    """
    Samples data instances from a data table. Returns the sample and
    a dataset of the instances from the input table that are not in
    the sample. Uses sampling utilities from
    `scikit-learn <http://scikit-learn.org>`_.

    table : data table
        A data table from which to sample.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents
        the proportion of data instances in the resulting sample. If
        int, n is the number of data instances in the resulting sample.

    stratified : bool, optional (default = False)
        If true, sampling will try to consider class values and
        match distribution of class values
        in train and test subsets.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """

    if isinstance(n, float):
        n = int(n * len(table))

    if replace:
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.mtrand.RandomState(random_state)
        sample = rgen.randint(0, len(table), n)
        o = np.ones(len(table))
        o[sample] = 0
        others = np.nonzero(o)[0]
        return table[sample], table[others]

    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), n)
        splitter = skl.StratifiedShuffleSplit(
            n_splits=1,
            test_size=test_size,
            train_size=len(table) - test_size,
            random_state=random_state,
        )
        splitter.get_n_splits(table.X, table.Y)
        ind = splitter.split(table.X, table.Y)
    else:
        splitter = skl.ShuffleSplit(n_splits=1,
                                    test_size=n,
                                    random_state=random_state)
        splitter.get_n_splits(table)
        ind = splitter.split(table)
    ind = next(ind)
    return table[ind[0]], table[ind[1]]
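A usage sketch, assuming the Orange data-mining library (the table.domain / table.X / table.Y attributes suggest an Orange Table):

from Orange.data import Table

iris = Table('iris')
subset, rest = sample(iris, n=0.7, stratified=True, random_state=42)
print(len(subset), len(rest))  # 105 and 45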
Example #14
 def fit(self,
         X,
         Y,
         sample_weight=None,
         check_input=True,
         X_idx_sorted=None):
     if sample_weight is None:
         sample_weight = np.ones(X.shape[0])
     self.trgX = X.copy()
     self.trgY = Y.copy()
     self.trgWts = sample_weight.copy()
     sss = ms.StratifiedShuffleSplit(n_splits=1,
                                     test_size=0.2,
                                     random_state=123)
     for train_index, test_index in sss.split(self.trgX, self.trgY):
         self.valX = self.trgX[test_index]
         self.valY = self.trgY[test_index]
         self.trgX = self.trgX[train_index]
         self.trgY = self.trgY[train_index]
         self.valWts = sample_weight[test_index]
         self.trgWts = sample_weight[train_index]
     super().fit(self.trgX, self.trgY, self.trgWts, check_input,
                 X_idx_sorted)
     self.prune()
     return self
Example #15
def print_data(train_file_addr, val_file_addr, x, y):
    def p_data(addr, arr):
        with open(addr, 'w') as f:
            for i in arr:
                f.write(str(i) + '\n')  # one record per line

    cv = model_selection.StratifiedShuffleSplit(2)
    for train, val in cv.split(x, y):
        p_data(train_file_addr, x[train])
        p_data(val_file_addr, x[val])
Example #16
def split_data(X, y):
    """
    Splits training data into train and test sets in a stratified fashion preserving class distribution in the data
    :param X: the features
    :param y: the labels
    :return: a generator that returns exactly one train/test split in a stratified fashion based on labels
    """
    splitter = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                      test_size=.05,
                                                      random_state=10)
    return splitter.split(X=X, y=y)
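Since the generator yields exactly one (train, test) pair, it can be consumed with next(); a small sketch on iris:

from sklearn import datasets

X, y = datasets.load_iris(return_X_y=True)
train_idx, test_idx = next(split_data(X, y))
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]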
Example #17
 def paramSearch(self, xMat, yMat):
     C_range = np.logspace(-3, 3, 6)
     gamma_range = np.logspace(-3, 2, 6)
     param_grid = dict(gamma=gamma_range, C=C_range)
     cv = ms.StratifiedShuffleSplit(n_splits=3,
                                    test_size=0.33,
                                    random_state=42)
     grid = ms.GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
     grid.fit(xMat, yMat)
     print("The best parameters are %s with a score of %0.2f" %
           (grid.best_params_, grid.best_score_))
Example #18
    def split_train_test(self, folds, test_size, seed, remove_duplicates,
                         kfold_stratified_shuffle_splits):
        data = self.data_tp if not self.use_fp else self.data_all

        if folds < 0:
            return [data], [[]]

        grouped = data[[self.k_recording_id, self.classes_column_name
                        ]].groupby(self.k_recording_id).min()
        classes = grouped[self.classes_column_name].tolist()

        train = []
        test = []

        if folds < 2:
            res = model_selection.train_test_split(grouped,
                                                   test_size=test_size,
                                                   random_state=seed,
                                                   stratify=classes)
            train.append(res[0].index)
            test.append(res[1].index)
        else:
            if not kfold_stratified_shuffle_splits:
                folds_generator = model_selection.StratifiedKFold(
                    folds, shuffle=True, random_state=seed)
            else:
                folds_generator = model_selection.StratifiedShuffleSplit(
                    folds, random_state=seed, test_size=test_size)

            for train_part, test_part in folds_generator.split(
                    grouped, classes):
                train.append(grouped.iloc[train_part, :].index)
                test.append(grouped.iloc[test_part, :].index)
                # train.append(self.data.iloc[train_part, :])
                # test.append(self.data.iloc[test_part, :])

        for split_id in range(len(train)):
            train_part, test_part = train[split_id], test[split_id]
            train_part = data.loc[data[self.k_recording_id].isin(train_part)]
            test_part = data.loc[data[self.k_recording_id].isin(test_part)]

            if self.sample_val_fp:
                train_part, test_part = self.perform_sampling_for_val_fp(
                    train_part, test_part)

            # if remove_duplicates:
            #     duplicated_records = train_part[self.k_recording_id].duplicated(keep=False)
            #     test_part = pd.concat([test_part, train_part.loc[duplicated_records, :]])
            #     train_part = train_part.drop_duplicates(self.k_recording_id, keep=False)

            train[split_id], test[split_id] = train_part, test_part

        return train, test
Example #19
    def data_from_scaling(self, size):
        # Split labeled and unlabeled data by the scaling factors size = [labeled, unlabeled]

        split_data = Dataset()

        # split the data, keeping the per-class sample proportions nearly equal

        # labeled data splitting
        if size[0] == 1:
            split_data.train_xl = self.train_xl[:]
            split_data.train_yl = self.train_yl[:]
        else:
            sss1 = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=size[0], random_state=0)
            # note: random_state is fixed so that each larger scale is a superset of the previous sample
            for _, labeled_indices in sss1.split(self.train_xl, self.train_yl.T):
                split_data.train_xl = self.train_xl[labeled_indices]
                split_data.train_yl = self.train_yl[0, labeled_indices]

        # unlabeled data splitting
        if size[1] == 1:
            split_data.train_xu = self.train_xu
        else:
            sss2 = model_selection.StratifiedShuffleSplit(n_splits=1, test_size=size[1], random_state=0)
            for _, unlabeled_indices in sss2.split(self.train_xu, self.train_yu.T):
                split_data.train_xu = self.train_xu[unlabeled_indices]

        # update parameters
        split_data.problem_type = self.problem_type
        split_data.test_x = self.test_x
        split_data.test_y = self.test_y
        split_data.class_name = self.class_name
        split_data.class_number = self.class_number
        split_data.feature_number = self.feature_number
        split_data.instance_label_number = len(split_data.train_xl)
        split_data.instance_unlabel_number = len(split_data.train_xu)
        split_data.instance_test_number = self.instance_test_number

        return split_data
Example #20
def split_train_test_strat(data, split_category, n_splits, test_ratio, seed):
    strat_train_set = pd.DataFrame(data=None,
                                   columns=data.columns,
                                   index=data.index)
    strat_test_set = pd.DataFrame(data=None,
                                  columns=data.columns,
                                  index=data.index)
    sd = ms.StratifiedShuffleSplit(n_splits=n_splits,
                                   test_size=test_ratio,
                                   random_state=seed)
    for train_idx, test_idx in sd.split(data, data[split_category]):
        strat_train_set = data.loc[train_idx]
        strat_test_set = data.loc[test_idx]
    return strat_train_set, strat_test_set
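A toy usage sketch; it assumes data has a default RangeIndex, since the function indexes with data.loc on the positional indices returned by the splitter:

import pandas as pd

df = pd.DataFrame({'value': range(100), 'label': [0, 1] * 50})
train_set, test_set = split_train_test_strat(df, 'label',
                                             n_splits=1,
                                             test_ratio=0.2,
                                             seed=42)
print(len(train_set), len(test_set))  # 80 20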
Example #21
    def get_indices(self, data):
        if self.stratified and data.domain.has_discrete_class:
            splitter = skl.StratifiedShuffleSplit(
                n_splits=self.n_resamples, train_size=self.train_size,
                test_size=self.test_size, random_state=self.random_state
            )
            splitter.get_n_splits(data.X, data.Y)
            return list(splitter.split(data.X, data.Y))

        splitter = skl.ShuffleSplit(
            n_splits=self.n_resamples, train_size=self.train_size,
            test_size=self.test_size, random_state=self.random_state
        )
        splitter.get_n_splits(data)
        return list(splitter.split(data))
Example #22
def split(ds, testSplit, testSplitSeed, stratified=False, groupFunc=None):

    rn = list(range(0, len(ds)))
    if stratified:
        data_classes = dataset_classes(ds, groupFunc)
        vals = ms.StratifiedShuffleSplit(n_splits=4,
                                         test_size=testSplit,
                                         random_state=testSplitSeed).split(
                                             rn, data_classes)
        for v in vals:
            return SubDataSet(ds, v[0]), SubDataSet(ds, v[1])

    random.seed(testSplitSeed)
    random.shuffle(rn)
    dm = round(len(ds) - len(ds) * testSplit)
    return SubDataSet(ds, rn[:dm]), SubDataSet(ds, rn[dm:])
Example #23
    def setup_indices(self, train_data, test_data):
        if self.stratified and test_data.domain.has_discrete_class:
            splitter = skl.StratifiedShuffleSplit(
                n_splits=self.n_resamples, train_size=self.train_size,
                test_size=self.test_size, random_state=self.random_state
            )
            splitter.get_n_splits(test_data.X, test_data.Y)
            self.indices = list(splitter.split(test_data.X, test_data.Y))

        else:
            splitter = skl.ShuffleSplit(
                n_splits=self.n_resamples, train_size=self.train_size,
                test_size=self.test_size, random_state=self.random_state
            )
            splitter.get_n_splits(test_data)
            self.indices = list(splitter.split(test_data))
Example #24
def stratified_split(data, cat, bins, test_size):
    lab = list(range(len(bins) - 1))
    temp_cat = "temp"
    data[temp_cat] = pd.cut(data[cat], bins=bins, labels=lab)

    split = model_selection.StratifiedShuffleSplit(n_splits=1,
                                                   test_size=test_size,
                                                   random_state=42)

    for train_index, test_index in split.split(data, data[temp_cat]):
        strat_train_set = data.loc[train_index]
        strat_test_set = data.loc[test_index]

    for set_ in (strat_train_set, strat_test_set):
        set_.drop(temp_cat, axis=1, inplace=True)

    return strat_train_set, strat_test_set
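A usage sketch on synthetic data, mirroring the continuous-column binning pattern this helper implements; the bin edges are placeholders:

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
df = pd.DataFrame({'income': rng.lognormal(mean=1.5, sigma=0.5, size=1000),
                   'id': np.arange(1000)})
bins = [0., 1.5, 3.0, 4.5, 6.0, np.inf]
train_set, test_set = stratified_split(df, 'income', bins, test_size=0.2)
print(len(train_set), len(test_set))  # 800 200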
Example #25
def evaluate(model, which="dev"):
    X, y = [], []

    for (s1, s2), label in three_class_data_iter(which):
        d = model.distance(s1, s2)
        X.append([d])  # scikit-learn estimators expect a 2-D feature matrix
        y.append(label)

    scores = sel.cross_validate(
        lm.LogisticRegression(),
        X,
        y=y,
        scoring=metrics.make_scorer(metrics.accuracy_score),
        cv=sel.StratifiedShuffleSplit(n_splits=5),
    )

    return scores
Example #26
def split_example():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if False:
        # The entry test_fold[i] represents the index of the test set that sample i belongs to.
        # It is possible to exclude sample i from any test set (i.e. include sample i in every training set) by setting test_fold[i] equal to -1.
        test_fold = [0, 1, -1, 1]
        split = PredefinedSplit(test_fold)
        print('#splits =', split.get_n_splits(X, y))
    elif False:
        # The stratified folds are made by preserving the percentage of samples for each class.
        split = model_selection.StratifiedShuffleSplit(n_splits=3,
                                                       test_size=0.25,
                                                       random_state=None)
        print('#splits =', split.get_n_splits(X, y))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds.
        split = model_selection.GroupShuffleSplit(n_splits=3,
                                                  test_size=0.25,
                                                  random_state=None)
        #print('#splits =', split.get_n_splits(X, y, groups))
        print('#splits =', split.get_n_splits(groups=groups))
    elif False:
        split = model_selection.TimeSeriesSplit(n_splits=3,
                                                max_train_size=None)
        print('#splits =', split.get_n_splits())
    else:
        split = model_selection.ShuffleSplit(n_splits=3,
                                             test_size=0.25,
                                             random_state=None)
        print('#splits =', split.get_n_splits(X))
    print('Split:', split)

    #for train_indices, test_indices in split.split():
    #for train_indices, test_indices in split.split(X, y):
    #for train_indices, test_indices in split.split(X, y, groups):
    for train_indices, test_indices in split.split(X):
        #print('TRAIN:', train_indices.shape, 'TEST:', test_indices.shape)
        print('TRAIN:', train_indices, 'TEST:', test_indices)

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
Example #27
    def evaluateModel(self, model, features, classes, train_size=0.7):
        XT, XF, YT, YF = model_selection.train_test_split(
            features, classes, train_size=train_size)

        # https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
        # https://chrisalbon.com/machine_learning/model_evaluation/cross_validation_parameter_tuning_grid_search/

        # Splits so that each element lands in the test set exactly once, in order
        kf1 = model_selection.KFold(n_splits=5, shuffle=False)

        # Splits so that each element lands in the test set exactly once, in random order
        kf2 = model_selection.KFold(n_splits=5,
                                    shuffle=True,
                                    random_state=12345)

        # Splits so that all test sets contain roughly equal numbers of elements from each class
        kf3 = model_selection.StratifiedKFold(n_splits=5, shuffle=False)

        # Splits in random order; elements may repeat across splits
        kf4 = model_selection.ShuffleSplit(n_splits=10, random_state=12345)

        # Splits in random order; elements may repeat across splits, and test sets contain roughly equal numbers of elements from each class
        kf5 = model_selection.StratifiedShuffleSplit(n_splits=10,
                                                     random_state=12345)

        # Makes N test sets, each containing one element in turn
        kf6 = model_selection.LeaveOneOut()

        self.trainModel(model, XT, YT)
        YP = self.predictModel(model, XF)

        acc = metrics.accuracy_score(YF, YP)
        prec = metrics.precision_score(YF, YP)
        rec = metrics.recall_score(YF, YP)
        f1 = metrics.f1_score(YF, YP)

        return f1, prec, rec, acc
Example #28
def load_data_set(file_name):
    data = pandas.read_csv(file_name)
    goal = data['drowsy']
    data = data.drop('drowsy', axis=1)

    model = model_selection.StratifiedShuffleSplit(n_splits=2,
                                                   test_size=0.2,
                                                   random_state=1)
    gen = model.split(data, goal)

    xTrain, yTrain, xTest, yTest = [], [], [], []
    for train_idx, test_idx in gen:
        xTrain = data.loc[train_idx]
        yTrain = goal.loc[train_idx]
        xTest = data.loc[test_idx]
        yTest = goal.loc[test_idx]

    return xTrain, yTrain, xTest, yTest
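A hedged usage sketch that writes a small synthetic CSV with a binary 'drowsy' column and loads it back:

import numpy as np
import pandas

rng = np.random.RandomState(1)
demo = pandas.DataFrame(rng.rand(50, 3), columns=['f1', 'f2', 'f3'])
demo['drowsy'] = rng.randint(0, 2, size=50)
demo.to_csv('drowsy_demo.csv', index=False)

xTrain, yTrain, xTest, yTest = load_data_set('drowsy_demo.csv')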
Example #29
 def build_sklearn(self, splitter_id, splitter_params):
     """Build splitters wrapping sklearn"""
     if splitter_id == 'mangoml_sklearn_KFold':
         return SplitterWrapper(model_selection.KFold(**splitter_params))
     elif splitter_id == 'mangoml_sklearn_StratifiedKFold':
         return SplitterWrapper(
             model_selection.StratifiedKFold(**splitter_params))
     elif splitter_id == 'mangoml_sklearn_ShuffleSplit':
         return SplitterWrapper(
             model_selection.ShuffleSplit(**splitter_params))
     elif splitter_id == 'mangoml_sklearn_StratifiedShuffleSplit':
         return SplitterWrapper(
             model_selection.StratifiedShuffleSplit(**splitter_params))
     elif splitter_id == 'mangoml_sklearn_GroupKFold':
         group_column = splitter_params.pop('group_column')
         return SplitterWrapper(
             model_selection.GroupKFold(**splitter_params), group_column)
     return None
Example #30
def load_data(file_name, training=False):
    data = pandas.read_csv(file_name)
    target = data['answer']
    data = data.drop('answer', axis=1)

    if training:
        model = model_selection.StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=1)
        gen = model.split(data, target)

        train_x, train_y, test_x, test_y = [], [], [], []
        for train_index, test_index in gen:
            train_x = data.loc[train_index]
            train_y = target.loc[train_index]
            test_x = data.loc[test_index]
            test_y = target.loc[test_index]

        return train_x, train_y, test_x, test_y

    return data, target