Example #1
from sklearn.model_selection import LeavePGroupsOut


class DKULeavePGroupsOut(object):
    def __init__(self, column_name, p):
        self.column_name = column_name
        self.splitter = LeavePGroupsOut(p)

    def set_column_labels(self, column_labels):
        self.column_labels = column_labels

    def get_n_splits(self, X, y, groups=None):
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError as e:
            raise Exception("Column %s not found among %s" %
                            (self.column_name, self.column_labels))

        groups_array = X[:, column_idx]

        ret = self.splitter.get_n_splits(X, y, groups_array)
        print("Will use %s splits" % ret)
        return ret

    def split(self, X, y, groups=None):
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError as e:
            raise Exception("Column %s not found among %s" %
                            (self.column_name, self.column_labels))

        groups_array = X[:, column_idx]

        return self.splitter.split(X, y, groups_array)
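
A minimal usage sketch (added for illustration, not part of the original snippet), assuming X is a NumPy array whose columns line up with the labels passed to set_column_labels:

import numpy as np

# Hypothetical data: 6 rows; the first column holds the group id ("site").
X_demo = np.array([[1, 0.2], [1, 0.4], [2, 0.1], [2, 0.9], [3, 0.5], [3, 0.7]])
y_demo = np.array([0, 1, 0, 1, 0, 1])

cv = DKULeavePGroupsOut(column_name="site", p=1)
cv.set_column_labels(["site", "feature_a"])   # must be called before split()
cv.get_n_splits(X_demo, y_demo)               # -> 3, one split per left-out group
for train_idx, test_idx in cv.split(X_demo, y_demo):
    print(train_idx, test_idx)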
Example #2
import numpy as np
from sklearn.model_selection import LeavePGroupsOut


def cross_validation(X, y, pre_x, groups, model='LGB', test_days=1):
    # Collapse pairs of consecutive group ids into a single group.
    groups = np.floor((groups + 1) / 2)

    logo = LeavePGroupsOut(n_groups=test_days)
    i = 0
    pre_sum = np.zeros(pre_x.shape[0])
    pre_ = []
    print(np.isnan(groups).astype(int).sum())
    print(np.unique(groups))
    ll_ = []
    for train, test in logo.split(X, y, groups=groups):
        i = i + 1
        print('times:', i)
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
        if model == 'LGB':
            pre, ll = LGB(X_train, X_test, y_train, y_test, pre_x)
        else:
            pre, ll = LR(X_train, X_test, y_train, y_test, pre_x)
        ll_ += [ll]
        pre_ += [pre]
    weight = []
    weight_sum = 0
    for l in ll_:
        weight_sum += 1.0 / l
        weight += [1.0 / l]
    for i in range(len(pre_)):
        pre_sum += pre_[i] * weight[i] / weight_sum

    print('weight', weight)
    print('loss', ll_)

    return pre_sum
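
For reference, the returned prediction is a blend of the per-fold predictions weighted by inverse validation loss: with two folds whose losses were, say, 0.2 and 0.4, the weights would be (1/0.2)/(1/0.2 + 1/0.4) = 2/3 and (1/0.4)/(1/0.2 + 1/0.4) = 1/3 (illustrative numbers, not from the source).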
Example #3

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeavePGroupsOut


def tmpFUN(dataset, group_label="groups", n_groups=2, y_label="groups", rf_n_estimators=2000, n_jobs=-1):
    lpgo = LeavePGroupsOut(n_groups=n_groups)

    for train_index, validate_index in lpgo.split(X=dataset, y=dataset.loc[:, y_label], groups=dataset.loc[:, group_label]):
        trainset = dataset.iloc[train_index, :]
        validateset = dataset.iloc[validate_index, :]
        X_train = trainset.drop(y_label, axis=1)
        y_train = trainset.loc[:, y_label]
        # The original predicted on an undefined X_test; use the held-out fold instead.
        X_validate = validateset.drop(y_label, axis=1)

        RF_mod = RandomForestClassifier(n_estimators=rf_n_estimators, n_jobs=n_jobs, class_weight="balanced")
        RF_mod.fit(X_train, y_train)
        RF_pred = RF_mod.predict(X_validate)
Example #4
File: utils.py  Project: sidiatig/ot_ISPA
from sklearn.model_selection import LeavePGroupsOut


def create_cv(x, y, subjects, P):
    """
    Build a list of (train_index, test_index) splits, leaving P subject
    groups out in each fold.

    :param x: feature array
    :param y: target array
    :param subjects: group label (subject id) for each sample
    :param P: number of subject groups to leave out per split
    :return: list of (train_index, test_index) tuples
    """
    cv = []
    lpgo = LeavePGroupsOut(n_groups=P)
    for train_index, test_index in lpgo.split(x, y, subjects):
        cv.append((train_index, test_index))
    return cv
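
Because create_cv returns a plain list of (train_index, test_index) tuples, it can be passed directly as the cv argument of scikit-learn utilities; a small sketch with made-up data:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

x_demo = np.random.rand(12, 3)
y_demo = np.array([0, 1] * 6)
subjects_demo = np.repeat([1, 2, 3, 4], 3)            # 4 subjects, 3 samples each

cv = create_cv(x_demo, y_demo, subjects_demo, P=1)    # hold out one subject per fold
scores = cross_val_score(LogisticRegression(), x_demo, y_demo, cv=cv)
print(scores.shape)                                   # (4,) -- one score per held-out subject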
Example #5
from sklearn.model_selection import LeavePGroupsOut


class LeavePSubjectsOut():
    def __init__(self, subjects_indexes, p=1):
        self.subjects_indexes = subjects_indexes
        # LeavePGroupsOut expects the number of groups to leave out (an int),
        # not the group ids themselves, so expose it as `p`.
        self.splitter = LeavePGroupsOut(n_groups=p)

    def split(self, X=None, y=None, groups=None):
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.split(X, y, groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.get_n_splits(X, y, groups)
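
A short usage sketch (added for illustration), with made-up subject indexes:

import numpy as np

subjects = np.array([1, 1, 2, 2, 3, 3])
X_demo = np.random.rand(6, 4)
y_demo = np.array([0, 1, 0, 1, 0, 1])

cv = LeavePSubjectsOut(subjects, p=1)
print(cv.get_n_splits(X_demo, y_demo))        # 3: each subject is held out once
for train_idx, test_idx in cv.split(X_demo, y_demo):
    print(subjects[test_idx])                 # rows of the held-out subject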
Example #6
import numpy as np
from sklearn.model_selection import (ShuffleSplit, GroupKFold,
                                     LeaveOneGroupOut, LeavePGroupsOut)


class Splits():
    def __init__(self, sub_indexes, train_size=0.33, n_splits=10, mode='loso'):
        # bootstrap or loso
        self.si = sub_indexes
        self.train_size = train_size
        self.n_splits = n_splits
        self.mode = mode
        self.create_splits()

    def create_splits(self, splits=None):

        if self.mode == 'bootstrap':
            unique = np.unique(self.si)

            rs = ShuffleSplit(n_splits=self.n_splits, test_size=1-self.train_size)
            splits = []
            for train, test in rs.split(unique):
                train = unique[train]
                test = unique[test]
                train_ = np.nonzero([x in train for x in self.si])
                test_ = np.nonzero([x in test for x in self.si])
                splits.append((train_, test_))
            self.splits = splits
            self.splitter = None
        elif self.mode == 'groupkfold':
            self.splitter = GroupKFold(n_splits=self.n_splits)
        elif self.mode == 'loso':
            self.splitter = LeaveOneGroupOut()
        elif self.mode == 'lpso':
            self.splitter = LeavePGroupsOut(n_groups=len(np.unique(self.si))//self.n_splits)

    def get_n_splits(self, X=None, y=None, groups=None):
        if self.splitter:
            return self.splitter.get_n_splits(X, y, groups)
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        if self.splitter:
            for i, j in self.splitter.split(X, y, groups):
                yield i, j
        else:
            for tt in self.splits:
                yield tt
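
A quick sketch (added, not part of the original) showing how the wrapper can be driven, with a made-up subject-index vector:

import numpy as np

si = np.repeat(np.arange(6), 4)                       # 6 subjects, 4 samples each
X_demo = np.random.rand(len(si), 2)
y_demo = np.tile([0, 1], len(si) // 2)

cv = Splits(si, mode='loso')                          # LeaveOneGroupOut under the hood
print(cv.get_n_splits(X_demo, y_demo, groups=si))     # 6 folds, one per subject
for train_idx, test_idx in cv.split(X_demo, y_demo, groups=si):
    print(np.unique(si[test_idx]))                    # the single held-out subject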
Example #7
    def _cv_split_hold_out_by_subject_using_sklearn(self, tcrrep=None):
        """
        returns a generator with train and test set indices based on hold on
        subject out cross-validation. This is based on the LeavePGroupsOut


        Parameters
        ----------
        tcrrep : TCRrep class instance
            TCRrep class instance, with TCRrep.clone_df.subject and TCRrep.clone_df.epitope fields

        Returns
        -------
        partitions : generator object BaseCrossValidator.split from sklearn

        """
        if tcrrep is None:
            tcrrep = self.tcrrep
        # unique epitope mapped to unique numbers
        encoder_epitope = preprocessing.LabelEncoder()
        encoder_epitope.fit(list(tcrrep.clone_df.epitope.unique()))

        # `y` target vector
        y = encoder_epitope.transform(tcrrep.clone_df.epitope)

        # `X` distance matrix (metric = 'precomputed')
        X = tcrrep.paired_tcrregex

        # Cross Validation Split
        # unique subjects mapped to unique numbers
        encoder_subjects = preprocessing.LabelEncoder()
        encoder_subjects = encoder_subjects.fit(
            list(tcrrep.clone_df.subject.unique()))

        # define groups based on subject
        groups = list(encoder_subjects.transform(tcrrep.clone_df.subject))

        # Leave P Groups Out
        lpgo = LeavePGroupsOut(n_groups=1)
        lpgo.get_n_splits(X, y, groups)
        partitions = lpgo.split(X, y, groups)
        return partitions
Example #8
    def split_groups(filenames, labels, groups, size):
        # Randomly pick one of the LeavePGroupsOut splits: each candidate split is
        # accepted with 5% probability; if none is accepted, the last split is used.
        filenames, labels, groups = np.array(filenames), np.array(labels), np.array(groups)
        lpgo = LeavePGroupsOut(n_groups=size)
        for i, (train, test) in enumerate(lpgo.split(filenames, labels, groups=groups)):
            if random() > 0.95:
                break
        train_filenames, train_labels, train_groups = filenames[train], labels[train], groups[train]
        test_filenames, test_labels, test_groups = filenames[test], labels[test], groups[test]

        return train_filenames, test_filenames, train_groups, train_labels
Example #9
from sklearn.model_selection import LeavePGroupsOut


def construct_exp_splits(feature_frame, leave_n_out=1):
    """ Constructs a list of (train, test) splits for a feature_frame
        representing a set of experiments. The splits use integer-based
        (as opposed to label-based) indexing of feature_frame.
        Input:
            feature_frame : DataFrame
                A pandas dataframe returned by extract_features_targets
                representing multiple experiments
            leave_n_out : int
                The number of experiments to leave out in each cross-validation
                fold
        Returns: [(Array, Array)]
            A list of (train index, test index) splits
    """
    groups = feature_frame.index.get_level_values(0)
    logo = LeavePGroupsOut(n_groups=leave_n_out)
    df_mat = feature_frame.values
    cv_splits = [
        (train_index, test_index)
        for train_index, test_index in logo.split(df_mat, groups=groups)
    ]
    return cv_splits
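
Since the splits come back as a plain list of index-array pairs, they can be fed straight to the cv argument of scikit-learn estimators; a hedged sketch with a hypothetical two-level-indexed frame (all names below are made up):

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Hypothetical frame: 3 experiments x 4 samples each, 2 feature columns.
idx = pd.MultiIndex.from_product([["exp1", "exp2", "exp3"], range(4)])
feature_frame = pd.DataFrame(np.random.rand(12, 2), index=idx, columns=["f0", "f1"])
targets = np.random.rand(12)

cv_splits = construct_exp_splits(feature_frame, leave_n_out=1)
scores = cross_val_score(Ridge(), feature_frame.values, targets, cv=cv_splits)
print(len(cv_splits), scores)    # 3 folds, one per held-out experiment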
Example #10

import numpy as np
from sklearn.model_selection import LeavePGroupsOut


def def_get_n_psplits(X, y, groups, p, n):
    splitter = LeavePGroupsOut(n_groups=p)
    splits = list(splitter.split(X, y, groups))
    # Randomly pick n of the splits (drawn with replacement, so duplicates are possible).
    ids = np.random.choice(len(splits), n).tolist()
    list_random_selected = [splits[i] for i in ids]
    return list_random_selected
Example #11

from itertools import islice

import numpy as np
from sklearn.model_selection import LeavePGroupsOut

N_GROUPS = 1
N_EPOCH = 1
N_TRAIN = 2
START_SPEAKER = 0
loop = []
for i in range(N_TRAIN):
    print('======== START TRAINING {} OUT OF {} TIMES ========'.format(
        i + 1, N_TRAIN))

    lpgo = LeavePGroupsOut(
        n_groups=N_GROUPS)  #Number of groups to leave out in the test split.
    groups = np.array(labels_df_filtered['speakers'])

    cvscores = []
    gen = lpgo.split(X_reshape, y, groups)
    for num, indices in islice(
            enumerate(gen, 1),  # index from 1 instead of 0
            START_SPEAKER,
            None):  # skip the first START_SPEAKER folds
        train_idx = indices[0]
        val_idx = indices[1]

        print('   ===== Fitting CV {} out of {} ====='.format(
            num, lpgo.get_n_splits(groups=groups)))
        print("     TRAIN:", np.unique(groups[train_idx]))
        print("       VAL:", np.unique(groups[val_idx]))

        train_X = X_reshape[train_idx]
        train_y = y.iloc[train_idx]
Example #12
RF = False
if RF:
    seed(0)
    predict = False  # True, False
    save_model_path = './models/RF/'
    check_dirs.check_dir(save_model_path)
    #kfold = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
    lppo = LeavePGroupsOut(n_groups=1)
    fold_no = 1
    f1_per_fold, acc_per_fold, pre_per_fold, rec_per_fold = [], [], [], []
    features = np.reshape(features, (features.shape[0], features.shape[1]))
    # training
    if not predict:
        #for train, test in kfold.split(features, labels):
        for train, test in lppo.split(features_new,
                                      labels_new,
                                      groups=groups_new):
            feat_train, labels_train = features_new[train], labels_new[train].ravel()
            feat_test, labels_test = features_new[test], labels_new[test].ravel()
            clf = RandomForestClassifier(n_estimators=100,
                                         random_state=0,
                                         n_jobs=-1)
            # Fit data to model, then save models
            clf.fit(feat_train, labels_train)
            para = clf.get_params()
            # name: RandomForestClassifier_fold%d_estimators_%d
            filename = 'fold%d_' % fold_no + str(clf).split(
                '(')[0] + '_estimators_%d' % para['n_estimators']
            joblib.dump(clf, save_model_path + filename)
Example #13
                   param_grid=p_grid,
                   cv=inner_cv,
                   scoring="neg_mean_squared_error",
                   verbose=True,
                   return_train_score=False,
                   n_jobs=-1)

all_models = []
best_params = []
predicted = np.zeros(len(y))
nested_scores_train = np.zeros(outer_cv.get_n_splits(X, groups=df.study))
nested_scores_test = np.zeros(outer_cv.get_n_splits(X, groups=df.study))

print("model\tinner_cv mean score\touter vc score")
i = 0
for train, test in outer_cv.split(X, y, groups=df.study):
    print(test)
    group_train = groups[train]
    clf.fit(X[train], y[train], groups=group_train)

    print(
        str(clf.best_params_) + " " + str(clf.best_score_) + " " +
        str(clf.score(X[test], y[test])))

    all_models.append(clf.best_estimator_)
    best_params.append(clf.best_params_)

    predicted[test] += clf.predict(
        X[test])  # added, to later construct average

    nested_scores_train[i] = clf.best_score_
Example #14
# .. note::
#    split() is called to generate each fold

for tr, vl in LPSGO.split(X, y, g):
    print(tr.shape, vl.shape)

print('y label with number of samples')
print(np.unique(y[tr], return_counts=True))
##############################################################################
# Differences with scikit-learn
# -------------------------------------------
from sklearn.model_selection import LeavePGroupsOut
# You need to specify the number of groups

LPGO = LeavePGroupsOut(n_groups=2)
for tr, vl in LPGO.split(X, y, g):
    print(tr.shape, vl.shape)

##############################################################################
# GroupShuffleSplit won't keep the per-subgroup percentage,
# which generates unbalanced classes.

from sklearn.model_selection import GroupShuffleSplit
GSS = GroupShuffleSplit(test_size=0.5, n_splits=2)
for tr, vl in GSS.split(X, y, g):
    print(tr.shape, vl.shape)

print('y label with number of samples')
print(np.unique(y[tr], return_counts=True))

###############################################################################
Example #15
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import LeavePGroupsOut


N_GROUPS = 1
N_EPOCH = 10
N_TRAIN = 1
loop = []
for i in range(N_TRAIN):
    print('======== START TRAINING {} OUT OF {} TIMES ========'.format(i + 1, N_TRAIN))

    lpgo = LeavePGroupsOut(n_groups=N_GROUPS) #Number of groups to leave out in the test split.
    groups = np.array(labels_df_filtered['speakers'])
    
    cvscores = []
    for num, indices in enumerate(lpgo.split(X, y, groups), 1):  # count folds from 1
        train_idx = indices[0]
        val_idx = indices[1]
        
        print('   ===== Fitting CV {} out of {} ====='.format(num, lpgo.get_n_splits(groups=groups)))
        print("     TRAIN:", np.unique(groups[train_idx]))
        print("       VAL:", np.unique(groups[val_idx]))
    
        
        train_X = X[train_idx]
        train_y = y.iloc[train_idx]
        
        val_X = X[val_idx]
        val_y = y.iloc[val_idx]
        
        train_X = train_X.reshape((train_X.shape[0],) + (1,) + train_X.shape[1:])  
Example #16
# ================================== Group k-fold CV, Leave-One-Group-Out, Leave-P-Groups-Out, GroupShuffleSplit ==========================================
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]

# Group k-fold
gkf = GroupKFold(n_splits=3)  # train and test sets contain different groups
for train, test in gkf.split(X, y, groups=groups):
    print("Group k-fold split: %s %s" % (train, test))

# Leave-One-Group-Out
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print("Leave-One-Group-Out split: %s %s" % (train, test))

# Leave-P-Groups-Out
lpgo = LeavePGroupsOut(n_groups=2)
for train, test in lpgo.split(X, y, groups=groups):
    print("Leave-P-Groups-Out split: %s %s" % (train, test))

# Group shuffle split
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    print("GroupShuffleSplit: %s %s" % (train, test))


# ================================== Time series split ==========================================
tscv = TimeSeriesSplit(n_splits=3)
TimeSeriesSplit(max_train_size=None, n_splits=3)
for train, test in tscv.split(iris.data):
    print("Time series split: %s %s" % (train, test))
# (truncated printout of X, a two-column feature array, and its label vector y)
# y
# [1 0 2 0 0 1 1 2 0 2 2 1]

# --------------------------------------------------------------------------------
# Suppose the example data is collected from the following group distribution

groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

# --------------------------------------------------------------------------------
# Configure model

leave_p_group_out = LeavePGroupsOut(n_groups=2)

leave_p_group_out_get_n_splits = leave_p_group_out.get_n_splits(X, y, groups)
# print('leave_p_group_out_get_n_splits',leave_p_group_out_get_n_splits)
# leave_p_group_out_get_n_splits 6
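# Added note: with 4 unique groups {0, 1, 2, 3} and n_groups=2, the number of
# splits is the number of ways to choose the 2 held-out groups:
#     C(4, 2) = 4! / (2! * 2!) = 6
# which matches the value shown in the comment above.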

# --------------------------------------------------------------------------------
for train, test in leave_p_group_out.split(X, y, groups):

    print('train', train.shape)
    print('test', test.shape)

    print('X[test]\n', X[test])
    print('y[test]\n', y[test])
    print('train groups', np.array(groups)[train])
    print('test groups', np.array(groups)[test])
    print('')