Example #1
def test_sklearn_cv_with_groups(tmp_dir):
    tuner = kt.SklearnTuner(
        oracle=kt.oracles.BayesianOptimization(
            objective=kt.Objective("score", "max"),
            max_trials=10,
        ),
        hypermodel=build_model,
        cv=model_selection.GroupKFold(5),
        directory=tmp_dir,
    )

    x = np.random.uniform(size=(50, 10))
    y = np.random.randint(0, 2, size=(50, ))
    groups = np.random.randint(0, 5, size=(50, ))
    tuner.search(x, y, groups=groups)

    assert len(tuner.oracle.trials) == 10

    best_trial = tuner.oracle.get_best_trials()[0]
    assert best_trial.status == "COMPLETED"
    assert best_trial.score is not None
    assert best_trial.best_step == 0
    assert best_trial.metrics.exists("score")

    # Make sure best model can be reloaded.
    best_model = tuner.get_best_models()[0]
    best_model.score(x, y)
Example #2
    def test_split(self):
        X = np.array([0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10])
        y = np.array(["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"])
        groups = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 3])

        fold1 = model_selection.GroupKFold(n_splits=3).split(X, y, groups)
        fold2 = sklearn_model_selection.GroupKFold(n_splits=3).split(X, y, groups)

        # The two implementations should yield identical splits.
        for (train1, test1), (train2, test2) in zip(fold1, fold2):
            np.testing.assert_array_equal(train1, train2)
            np.testing.assert_array_equal(test1, test2)
Example #3
def k_fold_example():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if False:
        # The stratified folds are made by preserving the percentage of samples for each class.
        # NOTE: with this toy data (two samples per class), n_splits=4 would raise a
        # ValueError, since n_splits cannot exceed the number of members in each class.
        kf = model_selection.StratifiedKFold(n_splits=4,
                                             shuffle=True,
                                             random_state=None)
        #kf = model_selection.RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=None)
        print('#splits =', kf.get_n_splits(X, y))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds,
        # so n_splits=4 would raise a ValueError with only two distinct groups here.
        kf = model_selection.GroupKFold(n_splits=4)
        print('#splits =', kf.get_n_splits(X, y, groups))
    else:
        kf = model_selection.KFold(n_splits=4, shuffle=True, random_state=None)
        #kf = model_selection.RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
        print('#splits =', kf.get_n_splits(X))
    print('K-fold:', kf)

    #for train_indices, test_indices in kf.split(X, y):
    #for train_indices, test_indices in kf.split(X, y, groups):
    for train_indices, test_indices in kf.split(X):
        #print('TRAIN:', train_indices.shape, 'TEST:', test_indices.shape)
        print('TRAIN:', train_indices, 'TEST:', test_indices)

        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
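
# Because both grouped branches above are disabled, the following minimal,
# self-contained sketch (using the same toy arrays) shows the split GroupKFold
# would produce; n_splits is lowered to 2 to match the two distinct groups.
import numpy as np
from sklearn import model_selection

X_demo = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y_demo = np.array([1, 2, 1, 2])
groups_demo = np.array([0, 0, 2, 2])

gkf = model_selection.GroupKFold(n_splits=2)
for train_indices, test_indices in gkf.split(X_demo, y_demo, groups_demo):
    # Each test fold holds exactly one group; the other group trains.
    print('TRAIN:', train_indices, 'TEST:', test_indices)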
Example #4
def create_folds(datapath, output_path, nb_folds, method="KF", target=None):
    '''
    Create folds for cross-validation.
    method must be one of KF, GKF, SKF.
    For SKF, target is the stratification variable whose distribution must stay constant across folds.
    For GKF, target is the group variable whose groups must not overlap across folds.
    '''
    df = pd.read_csv(datapath)
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)
    y = None
    if method == "KF":
        kf = model_selection.KFold(n_splits=nb_folds)
    elif method == "GKF":
        kf = model_selection.GroupKFold(n_splits=nb_folds)
        y = df[target]
    elif method == "SKF":
        kf = model_selection.StratifiedKFold(n_splits=nb_folds)
        y = df[target]

    if method == "KF":
        for fold, (t_, v_) in enumerate(kf.split(X=df)):
            df.loc[v_, 'kfold'] = fold
    elif method == "GKF":
        # GroupKFold needs the group labels via the ``groups`` argument.
        for fold, (t_, v_) in enumerate(kf.split(X=df, groups=y)):
            df.loc[v_, 'kfold'] = fold
    else:
        for fold, (t_, v_) in enumerate(kf.split(X=df, y=y)):
            df.loc[v_, 'kfold'] = fold

    df.to_csv(output_path, index=False)
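
# Hypothetical usage sketch (the CSV paths and the "patient_id" column are
# placeholders, not from the original project):
#     create_folds("train.csv", "train_folds.csv", nb_folds=5,
#                  method="GKF", target="patient_id")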
Example #5
def run_full_cv(train_file, user_file):
	lgb_train = lgb.Dataset(train_file)
	lgb_train.construct()
	users = pd.read_pickle(user_file)

	assert lgb_train.num_data() == users.shape[0]

	# Only ``groups`` shapes the folds; ``users`` is reused as X and y merely
	# to satisfy the split() signature.
	kf = model_selection.GroupKFold(n_splits=5).split(users, users, users)

	params = dict(
		task='train',
		boosting_type='gbdt',
		objective='binary',
		num_leaves=170,
		max_depth=8,
		learning_rate=0.03,
		feature_fraction=0.6,
		bagging_fraction=0.85,
		bagging_freq=3,
		verbose=3,
	)

	lgb.cv(params, lgb_train,
		metrics=['binary_logloss', 'auc'],
		num_boost_round=2000,
		early_stopping_rounds=30,
		folds=kf,
		verbose_eval=1
	)
Example #6
def build_splits(data_dir, args, config, parser, snapshots_dir, snapshot_name):
    """
    Splits the images from the given directory into training and validation folds.

    IMPORTANT! Check that the subject ID is set up correctly (args.ID_split),
    i.e. at which point the file name separates the ID and image name.

    :param data_dir: Path to input and target data
    :param args: Experiment arguments
    :param config: Configuration file (more arguments)
    :param parser: Function that loads the images
    :param snapshots_dir: Path to experiment logs and models
    :param snapshot_name: Name of the experiment
    :return: Metadata including the training and validation splits, as well as mean and std
    """
    # Metadata
    metadata = build_meta_from_files(data_dir)
    # Group_ID
    metadata['subj_id'] = metadata.fname.apply(lambda x: '_'.join(x.stem.split(args.ID_char, args.ID_split)[:-1]), 0)

    # Mean and std
    crop = config['training']['crop_size']
    mean_std_path = snapshots_dir / f"mean_std_{crop[0]}x{crop[1]}.pth"
    if mean_std_path.is_file() and not config['training']['calc_meanstd']:  # Load
        print('==> Loading mean and std from cache')
        tmp = torch.load(mean_std_path)
        mean, std = tmp['mean'], tmp['std']
    else:  # Calculate
        print('==> Estimating mean and std')
        mean, std = estimate_mean_std(config, metadata, parser, args.num_threads, config['training']['bs'])
        torch.save({'mean': mean, 'std': std}, mean_std_path)

    print('==> Mean:', mean)
    print('==> STD:', std)

    # Group K-Fold by rabbit ID
    gkf = model_selection.GroupKFold(n_splits=config['training']['n_folds'])
    # K-fold by random shuffle (not recommended if ID is known)
    # gkf = model_selection.KFold(n_splits=config['training']['n_folds'], shuffle=True, random_state=args.seed)

    # Create splits for all folds
    splits_metadata = dict()
    iterator = gkf.split(metadata.fname.values, groups=metadata.subj_id.values)  # Split by subject ID
    for fold in range(config['training']['n_folds']):
        train_idx, val_idx = next(iterator)
        splits_metadata[f'fold_{fold}'] = {'train': metadata.iloc[train_idx],
                                           'val': metadata.iloc[val_idx]}

    # Add mean and std to metadata
    splits_metadata['mean'] = mean
    splits_metadata['std'] = std

    # Save splits, mean and std
    with open(snapshots_dir / snapshot_name / 'split_config.dill', 'wb') as f:
        dill.dump(splits_metadata, f)

    return splits_metadata
Example #7
File: utils.py  Project: dumjax/kag_siim
def folds_generator_group(nr_folds):
    df = pd.read_csv("../../data/raw/train.csv")
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)
    y = df.target.values
    kf = model_selection.GroupKFold(n_splits=nr_folds)

    for f, (t_, v_) in enumerate(kf.split(X=df, y=y, groups=df['patient_id'])):
        df.loc[v_, 'kfold'] = f
    df.to_csv('train_folds_group_{}.csv'.format(nr_folds), index=False)
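
# Example call: writes train_folds_group_5.csv with a "kfold" column while
# keeping each patient_id inside a single fold.
#     folds_generator_group(5)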
Example #8
def build_splits(data_dir, args, config, parser, snapshots_dir, snapshot_name):
    # TODO correct splits
    # Metadata
    # metadata = build_meta_from_files(data_dir)
    metadata = build_meta_from_files(base_path=data_dir)
    # Group_ID
    if config['training']['uCT']:
        metadata['subj_id'] = metadata.fname.apply(
            lambda x: x.stem.rsplit('_', 2)[0], 0)
    else:
        metadata['subj_id'] = metadata.fname.apply(
            lambda x: x.stem.rsplit('_', 3)[0], 0)

    # Mean and std
    crop = config['training']['crop_size']
    mean_std_path = snapshots_dir / f"mean_std_{crop[0]}x{crop[1]}.pth"
    if mean_std_path.is_file() and not config['training']['calc_meanstd']:  # Load
        print('==> Loading mean and std from cache')
        tmp = torch.load(mean_std_path)
        mean, std = tmp['mean'], tmp['std']
    else:  # Calculate
        print('==> Estimating mean and std')
        mean, std = estimate_mean_std(config, metadata, parser,
                                      args.num_threads, args.bs)
        torch.save({'mean': mean, 'std': std}, mean_std_path)

    print('==> Mean:', mean)
    print('==> STD:', std)

    # Group K-Fold by rabbit ID
    gkf = model_selection.GroupKFold(n_splits=config['training']['n_folds'])
    # K-fold by random shuffle
    # gkf = model_selection.KFold(n_splits=config['training']['n_folds'], shuffle=True, random_state=args.seed)

    # Create splits for all folds
    splits_metadata = dict()
    iterator = gkf.split(metadata, groups=metadata.subj_id)
    for fold in range(config['training']['n_folds']):
        train_idx, val_idx = next(iterator)
        splits_metadata[f'fold_{fold}'] = {
            'train': metadata.iloc[train_idx],
            'val': metadata.iloc[val_idx]
        }

    # Add mean and std to metadata
    splits_metadata['mean'] = mean
    splits_metadata['std'] = std

    with open(snapshots_dir / snapshot_name / 'split_config.dill', 'wb') as f:
        dill.dump(splits_metadata, f)

    return splits_metadata
Example #9
def group_kfold():
    lr = linear_model.LogisticRegression()
    x, y = datasets.make_blobs(n_samples=12, random_state=0)

    print(x)
    print(y)

    # groups!? ambiguous
    groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
    scores = model_selection.cross_val_score(
        lr, x, y, groups, cv=model_selection.GroupKFold(n_splits=3))
    print(scores)
    print('-' * 50)

    sp1 = model_selection.GroupKFold(n_splits=3)  # the default n_splits=5 would exceed the 4 distinct groups
    for train_index, test_index in sp1.split(x, y, groups):
        print(train_index, test_index)
    print()

    sp2 = model_selection.GroupKFold(n_splits=4)
    for train_index, test_index in sp2.split(x, y, groups):
        print(train_index, test_index)
    print()
Example #10
def randomsearch_fitparams_epsSVR(X,
                                  y,
                                  groups=None,
                                  n_iter=20,
                                  n_jobs=1,
                                  **kws):
    '''
        Function for randomized search per sklearn's RandomizedSearchCV
        
        X,y - Training input/output pairs following sklearn .fit method conventions
        groups - (optional) list of group assignments following sklearn convention; if non-None, GroupKFold is used.
        n_iter, n_jobs - parameters for # of iterations and threads for the RandomizedSearchCV to use
        **kws - Additional kws used for explicitly fixing one or more parameters during parameter searching
    '''

    param_distributions = {
        'C': stats.expon(scale=50),
        'gamma': stats.expon(scale=1),
        'epsilon': stats.expon(scale=10)
    }
    regressor = svm.SVR(kernel='rbf')

    for param in param_distributions.keys():
        if kws.get(param) is not None:
            # Assume we're using this to fix a parameter.
            setattr(regressor, param, kws[param])

    param_distributions = {
        k: v for k, v in param_distributions.items() if kws.get(k) is None
    }

    if groups is None:
        cv = model_selection.KFold(n_splits=4, shuffle=True)
    else:
        print(
            'Using GroupKFold to separate measurements from different animals')
        cv = model_selection.GroupKFold(n_splits=4)
    cv_splits = list(cv.split(X, y, groups=groups))

    random_search = model_selection.RandomizedSearchCV(
        regressor,
        param_distributions=param_distributions,
        n_iter=n_iter,
        verbose=True,
        cv=cv_splits,
        n_jobs=n_jobs)
    random_search.fit(X, y)
    return random_search
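
# Hypothetical call (X, y, and animal_ids are placeholder arrays): passing
# epsilon as a keyword pins it, while C and gamma are still sampled.
#     search = randomsearch_fitparams_epsSVR(X, y, groups=animal_ids,
#                                            n_iter=50, n_jobs=4, epsilon=0.1)
#     print(search.best_params_, search.best_score_)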
Example #11
def optimize(trial, df):

    n_estimators = trial.suggest_int("n_estimators", 50, 1000)
    num_leaves = trial.suggest_int("num_leaves", 10, 500)
    learning_rate = trial.suggest_uniform("learning_rate", 0.01, 1.0)

    modely = lgb.LGBMRegressor(n_estimators=n_estimators,
                               num_leaves=num_leaves,
                               learning_rate=learning_rate)

    modelx = lgb.LGBMRegressor(n_estimators=n_estimators,
                               num_leaves=num_leaves,
                               learning_rate=learning_rate)

    modelf = lgb.LGBMClassifier(n_estimators=n_estimators,
                                num_leaves=num_leaves,
                                learning_rate=learning_rate)

    # GroupKFold accepts no ``groups`` in its constructor; the group labels
    # (here the last dataframe column) are passed to split() instead.
    kf = model_selection.GroupKFold(n_splits=5)
    comp_metric = []
    for fold, (train_idx, val_idx) in enumerate(
            kf.split(df, groups=df.iloc[:, -1])):

        df_train = df.loc[train_idx]
        df_val = df.loc[val_idx]

        x_train = df_train.iloc[:, :-4]
        y_trainx = df_train.iloc[:, -4]
        y_trainy = df_train.iloc[:, -3]
        y_trainf = df_train.iloc[:, -2]

        x_val = df_val.iloc[:, :-4]
        y_valx = df_val.iloc[:, -4]
        y_valy = df_val.iloc[:, -3]
        y_valf = df_val.iloc[:, -2]

        modelx.fit(x_train, y_trainx)
        modely.fit(x_train, y_trainy)
        modelf.fit(x_train, y_trainf)

        test_predsx = modelx.predict(x_val)
        test_predsy = modely.predict(x_val)
        test_predsf = modelf.predict(x_val)

        fold_metric = regression_metrics.iln_comp_metric(
            test_predsx, test_predsy, test_predsf, y_valx, y_valy, y_valf)
        comp_metric.append(fold_metric)
    return np.mean(comp_metric)
Example #12
 def build_sklearn(self, splitter_id, splitter_params):
     """Build splitters wrapping sklearn"""
     if splitter_id == 'mangoml_sklearn_KFold':
         return SplitterWrapper(model_selection.KFold(**splitter_params))
     elif splitter_id == 'mangoml_sklearn_StratifiedKFold':
         return SplitterWrapper(
             model_selection.StratifiedKFold(**splitter_params))
     elif splitter_id == 'mangoml_sklearn_ShuffleSplit':
         return SplitterWrapper(
             model_selection.ShuffleSplit(**splitter_params))
     elif splitter_id == 'mangoml_sklearn_StratifiedShuffleSplit':
         return SplitterWrapper(
             model_selection.StratifiedShuffleSplit(**splitter_params))
     elif splitter_id == 'mangoml_sklearn_GroupKFold':
         group_column = splitter_params.pop('group_column')
         return SplitterWrapper(
             model_selection.GroupKFold(**splitter_params), group_column)
     return None
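
# A hypothetical config-driven call (the builder instance and the column
# name are assumptions for illustration):
#     splitter = builder.build_sklearn('mangoml_sklearn_GroupKFold',
#                                      {'n_splits': 5, 'group_column': 'patient_id'})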
Example #13
def temp(samples):
    from sklearn import model_selection
    from wbia.algo.verif import sklearn_utils

    def check_balance(idxs):
        # np.bincount replaces the removed sklearn.utils.fixes.bincount
        logger.info('-------')
        for count, (test, train) in enumerate(idxs):
            logger.info('split %r' % (count))
            groups_train = set(groups.take(train))
            groups_test = set(groups.take(test))
            n_group_isect = len(groups_train.intersection(groups_test))
            y_train_freq = np.bincount(y.take(train))
            y_test_freq = np.bincount(y.take(test))
            y_test_ratio = y_test_freq / y_test_freq.sum()
            y_train_ratio = y_train_freq / y_train_freq.sum()
            balance_error = np.sum((y_test_ratio - y_train_ratio) ** 2)
            logger.info('n_group_isect = %r' % (n_group_isect,))
            logger.info('y_test_ratio = %r' % (y_test_ratio,))
            logger.info('y_train_ratio = %r' % (y_train_ratio,))
            logger.info('balance_error = %r' % (balance_error,))

    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    n_splits = 3

    splitter = model_selection.GroupShuffleSplit(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.GroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = model_selection.StratifiedKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)

    splitter = sklearn_utils.StratifiedGroupKFold(n_splits=n_splits)
    idxs = list(splitter.split(X=X, y=y, groups=groups))
    check_balance(idxs)
Example #14
 def split_data(self):
     try:
         from sklearn import model_selection
         if self.split_by is None and self.split_parent is None:
             # random_state only takes effect when shuffle=True
             self.kfold = model_selection.KFold(n_splits=self.nbins, shuffle=True, random_state=self.seed)
             self.folds = self.kfold.split(self.x)
         elif self.split_by is None and self.split_parent is not None:
             # GroupKFold is deterministic and accepts no random_state argument
             self.kfold = model_selection.GroupKFold(n_splits=self.nbins)
             self.folds = self.kfold.split(self.x, groups=self.split_parent)
         elif self.split_by is not None and self.split_parent is None:
             self.kfold = model_selection.StratifiedKFold(n_splits=self.nbins, shuffle=True, random_state=self.seed)
             self.folds = self.kfold.split(self.x, self.split_by)
         i = 0
         for train, test in self.folds:
             self.x.loc[test, "TestInd"] = i
             i += 1
     except Exception:
         raise ValueError("X data is missing for splitting")
Example #15
    def __init__(self,
                 ds: pd.DataFrame,
                 n_folds: int = 5,
                 target_col: str = 'target',
                 group_col: str or None = None,
                 random_state: int or None = None):
        super().__init__()
        if group_col is None:
            splitter = model_selection.StratifiedKFold(
                n_splits=n_folds, shuffle=random_state is not None,
                random_state=random_state)  # random_state requires shuffle=True
            split_iter = splitter.split(ds, ds[target_col])
        else:
            splitter = model_selection.GroupKFold(n_splits=n_folds)
            split_iter = splitter.split(ds,
                                        ds[target_col],
                                        groups=ds[group_col])

        self.__cv_folds_idx = [(train_idx, val_idx)
                               for (train_idx, val_idx) in split_iter]
        self.__ds_chunks = [(ds.iloc[split[0]], ds.iloc[split[1]])
                            for split in self.__cv_folds_idx]
        self.__folds_iter = iter(self.__ds_chunks)
Example #16
    def split(
        self,
        cross_validation: Union[int, model_selection.BaseCrossValidator,
                                model_selection.ShuffleSplit,
                                model_selection.StratifiedShuffleSplit],
        groups: Optional[np.ndarray] = None,
    ) -> Iterator[Tuple['InferenceData', 'InferenceData']]:
        """Splits the data using the indicated cross validator.

    Args:
      cross_validation: Cross validation to be applied. If an int is passed
        and groups is None, a sklearn KFold is used with cross_validation as
        the number of splits. If an int is passed and groups is not None,
        sklearn GroupKFold will be used. When a cross validator is passed, it
        is used directly.
      groups: If cross validating over non-overlapping groups, this array
        indicates which group each row belongs to.

    Yields:
      A tuple with train and test InferenceDatas.
    """
        if isinstance(cross_validation, int):
            if groups is not None:
                cross_validation = model_selection.GroupKFold(
                    n_splits=cross_validation)
            else:
                cross_validation = model_selection.KFold(
                    n_splits=cross_validation)

        for train_index, test_index in cross_validation.split(self.data,
                                                              groups=groups):
            train_inference_data = self._copy_and_index_inference_data(
                train_index)
            test_inference_data = self._copy_and_index_inference_data(
                test_index)
            yield train_inference_data, test_inference_data
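
# Hypothetical usage (``inference_data`` and ``subject_ids`` are assumed
# names): an int together with groups dispatches to GroupKFold internally.
#     for train_data, test_data in inference_data.split(5, groups=subject_ids):
#         ...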
Example #17
# loop through each tournament and print the input for train and validation
for index in range(0, len(tournaments)):
    # get the tournament name
    tournament = tournaments[index]

    print("*********** TOURNAMENT " + tournament + " ***********")

    # set the target name for the tournament
    target = "target_" + tournament

    # set the y train with the target variable
    y = train_data.iloc[:, train_data.columns == target].values.reshape(-1, )

    # use GroupKFold for splitting the era
    group_kfold = model_selection.GroupKFold(n_splits=kfold_split)

    counter = 1

    print(">> group eras using kfold split\n")

    for train_index, test_index in group_kfold.split(X,
                                                     y,
                                                     groups=train_data['era']):
        # X_train takes the 50 features only for training and leave the other columns
        X_train = X[train_index][:, 3:]
        # y_train remains the same
        y_train = y[train_index]

        print(">> running split #", counter)
Example #18
from sklearn import ensemble
print(
    cross_val_score(ensemble.GradientBoostingRegressor(), X, y, cv=10).mean())

###############################################################
# We can predict!
#
# But there is one caveat: are we simply learning to recognize students
# across the years? There is a lot of implicit information about students:
# notably in the school ID and the class ID.
#
# **Grouping** To test for this, we can make sure that we have
# different students in the train and the test set.
from sklearn import model_selection
cv = model_selection.GroupKFold(10)

print(
    cross_val_score(ensemble.GradientBoostingRegressor(),
                    X,
                    y,
                    cv=cv,
                    groups=exams['StudentID']).mean())

###############################################################
# It works better!
#
# The classifier learns to generalize better, probably by picking up
# stronger invariances from the repeated measures on the students
#
# Summary
Example #19
def process(args, n_splits=5):
    np.random.seed(2299)
    ## Read data
    # Paths
    out_path = Path(args.out_path)
    out_path.mkdir(parents=True, exist_ok=True)
    ds_path = Path("../data/drug_screens/")
    cm_path = Path("../data/cellular_models/")
    # L1000 genes
    genes = pd.read_csv(
        cm_path.joinpath("GSE70138_Broad_LINCS_gene_info_2017-03-06.txt.gz"),
        sep="\t")
    l1000_cols = genes[genes['pr_is_lm'] == True]['pr_gene_id'].values.astype(
        str)
    # CTRP
    cp_ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.meta.per_compound.txt"),
                          sep="\t",
                          index_col=0)
    cl_ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.meta.per_cell_line.txt"),
                          sep="\t",
                          index_col=0)
    exp_ctrp = pd.read_csv(
        ds_path.joinpath("CTRP/v20.meta.per_experiment.txt"),
        sep="\t",
        index_col=0)
    ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.data.per_cpd_post_qc.txt"),
                       sep='\t',
                       index_col=0)

    ## Create meta data
    meta_ccle = pd.read_csv(cm_path.joinpath("sample_info.csv"))
    meta = ctrp.join(exp_ctrp['master_ccl_id']).drop_duplicates()
    meta = meta.merge(cl_ctrp['ccl_name'],
                      left_on='master_ccl_id',
                      right_index=True)
    meta = meta.merge(cp_ctrp[['broad_cpd_id', 'cpd_smiles']],
                      left_on='master_cpd_id',
                      right_index=True)
    meta = meta.merge(meta_ccle[['stripped_cell_line_name', 'DepMap_ID']],
                      left_on='ccl_name',
                      right_on='stripped_cell_line_name')
    meta['cpd_pred_pv'] = np.clip(meta['cpd_pred_pv'], a_min=0., a_max=None)
    meta = meta.set_index('DepMap_ID')

    # CCLE
    ccle = pd.read_csv(cm_path.joinpath("CCLE_expression.csv"),
                       index_col=0).rename_axis('DepMap_ID').astype(np.float32)
    ccle = ccle[ccle.index.isin(meta.index)]

    ## Generate Fingerprints
    nBits = 512
    cpd_data = meta[['broad_cpd_id', 'cpd_smiles']].drop_duplicates()
    fp = smiles_to_bits(cpd_data['cpd_smiles'], nBits=nBits)
    fp.index = cpd_data['broad_cpd_id']
    fp.columns = ["FP_{}".format(i) for i in range(len(fp.columns))]
    fp = fp.astype(int)  # np.int was removed in NumPy 1.24

    ## Subset L1000 gene features by HGNC
    ccle_cols = np.array([[gene[0], gene[1].strip("()")]
                          for gene in ccle.columns.str.split(" ")])
    ccle.columns = ccle_cols[:, 1]

    ## Get feature columns
    gene_cols = ccle.columns.to_numpy()
    fp_cols = fp.columns.to_numpy()

    ## Generate folds
    ccle["fold"] = -1
    ccle = ccle.sample(frac=1).reset_index()
    gkf = model_selection.GroupKFold(n_splits=n_splits)
    # After reset_index() the cell-line ID is a plain column, so group on it
    # explicitly to keep each cell line within a single fold.
    for fold, (train_idx, val_idx) in enumerate(
            gkf.split(X=ccle, y=None, groups=ccle['DepMap_ID'])):
        ccle.loc[val_idx, 'fold'] = fold
    ccle = ccle.set_index('DepMap_ID')

    ## Standardize folds & write
    for fold in range(0, n_splits):
        train = ccle[ccle['fold'] != fold].drop(columns='fold').copy()
        val = ccle[ccle['fold'] == fold].drop(columns='fold').copy()
        # Standardize GEX values
        train, val = stdscale_update(train, val, gene_cols)
        # Write
        train.to_csv(out_path.joinpath(f"train_fold_{fold}.csv.gz"))
        val.to_csv(out_path.joinpath(f"val_fold_{fold}.csv.gz"))
        # joblib.dump(scaler, out_path.joinpath(f"stdscaler_full_fold_{fold}.pkl"))

    ## Write out metadata
    meta.to_csv(out_path.joinpath("meta.csv.gz"))
    fp.to_csv(out_path.joinpath("fingerprints.csv.gz"))
    joblib.dump(gene_cols, out_path.joinpath("protcode_gene_cols.pkl"))
    joblib.dump(l1000_cols, out_path.joinpath("l1000_gene_cols.pkl"))
    joblib.dump(fp_cols, out_path.joinpath("fp_cols.pkl"))

    return "Complete"
Example #20
RUN_ID = 'bb_lstm_20steps_2017-06-14_19.57.18'
MODEL_FILE = 'best.hdf5'

print('Loading data')
df = pd.read_csv('bb/models/' + RUN_ID + '/' +
                 MODEL_FILE.replace('.hdf5', '-predict-aligned.csv'))
features = [f for f in df if re.fullmatch('^z[0-9]+$', f)]
print('Found ' + str(len(features)) + ' embedding features: ' + str(features))
print(df.behavior.value_counts())

X = df[features].values
y = np.array([1 if l == 'UDEF1' else 0 for l in df.behavior])
cm = []
aucs = []
kappas = []
for train_i, test_i in model_selection.GroupKFold(4).split(
        X, y, df.participant_id.values):
    model = tree.DecisionTreeClassifier()
    imbalance = sum(y[train_i]) / len(train_i)
    weights = np.array(
        [imbalance if l == 0 else 1 - imbalance for l in y[train_i]])
    model.fit(X[train_i], y[train_i], sample_weight=weights)
    preds = model.predict(X[test_i])
    probs = model.predict_proba(X[test_i])
    aucs.append(metrics.roc_auc_score(y[test_i], probs[:, 1]))
    kappas.append(metrics.cohen_kappa_score(preds, y[test_i]))
    cm.append(metrics.confusion_matrix(y[test_i], preds))
cm = np.sum(cm, axis=0)
print(cm)
print(kappas)
print('mean kappa: %.3f' % (sum(kappas) / len(kappas)))
print('mean AUC:   %.3f' % (sum(aucs) / len(aucs)))
Example #21
        elif np.sum(target_data_idx) == 1:
            Y = y_sr[0]
        else:
            raise ValueError('Wrong target number')

        Y.columns.rename('target', level=0, inplace=True)
        assert all(X.index == Y.index)  # check index consistency


        """  Construct CV iterator  """
        from sklearn import model_selection
        odor_set = set(X.index.unique(level='stim1').str[:-2])
        odor_group_dict = dict(zip(odor_set,range(len(odor_set))))
        stim_group = X.index.get_level_values(level='stim1').str[:-2]
        stim_group = stim_group.map(lambda x: odor_group_dict[x]).tolist()
        group_kfold = model_selection.GroupKFold(n_splits=len(odor_set))
        # stim_kfold = group_kfold.split(X, Y, groups=stim_group)

        """  Build regression model with CV  """
        reg_param = dict(
            max_iter=10000,
            selection='random'
        )

        from sklearn import linear_model
        mtlcv = linear_model.MultiTaskLassoCV(copy_X=True,
                                              alphas=10**np.linspace(-4,0,num=20),
                                              cv=group_kfold.split(X, Y, groups=stim_group), n_jobs=12,#group_kfold.n_splits,
                                              verbose=1,
                                              **reg_param)

Example #22
if __name__ == "__main__":
    args = get_args()
    # create folds
    df = pd.read_csv(args.source)
    df["kfold"] = -1
    df = df.sample(frac=1).reset_index(drop=True)
    y = df.target.values

    assert args.type in [
        "group_k_fold", "stratified_k_fold", "stratified_group_k_fold"
    ]
    if args.type == "group_k_fold":
        print("Using groupKFold")
        kf = model_selection.GroupKFold(n_splits=5)
        for f, (t_, v_) in enumerate(
                kf.split(X=df, y=y, groups=df["patient_id"].tolist())):
            df.loc[v_, "kfold"] = f
        df.to_csv(args.target, index=False)
    elif args.type == "stratified_k_fold":
        print("Using StratifiedKFold")
        kf = model_selection.StratifiedKFold(n_splits=5)
        for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
            df.loc[v_, "kfold"] = f
        df.to_csv(args.target, index=False)

    elif args.type == "stratified_group_k_fold":
        print("Using stratified_group_k_fold")
        groups = np.array(df["patient_id"].values)
        for f, (t_, v_) in enumerate(
Example #23
        print(assignment_ami)
        print(
            "Cluster assignment accuracy over {} runs: mean={}, std={}".format(
                args.cluster, np.mean(assignment_ami), np.std(assignment_ami)))

    classifier_models = list()
    if args.knn > 0:
        classifier_models.append(neighbors.KNeighborsClassifier(args.knn))
    if args.svm:
        classifier_models.append(sklearn.svm.SVC())

    for model in classifier_models:
        print("Evaluating ", str(model))
        if args.predict_performance is not None:
            split = model_selection.GroupKFold(n_splits=5).split(
                t_vectors, targets, groups=person_ids)
        else:
            split = model_selection.StratifiedKFold(n_splits=5).split(
                t_vectors, targets)

        # Group by person_ids so the same person never appears in both splits
        score = list()
        target_wise = list()
        saved = [list(), list()]
        for train, test in split:
            model.fit(t_vectors[train, :], targets[train].squeeze())
            score.append(
                model.score(t_vectors[test, :], targets[test].squeeze()))
            print("Overall Accuracy: ", score[-1])
            predictions = model.predict(t_vectors[test, :])
            cmat = metrics.confusion_matrix(targets[test].squeeze(),
Example #24
def process(out_path):
    np.random.seed(2299)
    ## Read data
    # Paths
    out_path = Path(out_path)
    out_path.mkdir(parents=True, exist_ok=True)
    ds_path = Path("../../film-gex-data/drug_screens/")
    cm_path = Path("../../film-gex-data/cellular_models/")
    # CCLE
    meta_ccle = pd.read_csv(cm_path.joinpath("sample_info.csv"))
    ccle = pd.read_csv(cm_path.joinpath("CCLE_expression.csv"), index_col=0)
    # L1000 genes
    genes = pd.read_csv(
        cm_path.joinpath("GSE70138_Broad_LINCS_gene_info_2017-03-06.txt.gz"),
        sep="\t",
        index_col=0)
    # CTRP
    cp_ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.meta.per_compound.txt"),
                          sep="\t",
                          index_col=0)
    cl_ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.meta.per_cell_line.txt"),
                          sep="\t",
                          index_col=0)
    exp_ctrp = pd.read_csv(
        ds_path.joinpath("CTRP/v20.meta.per_experiment.txt"),
        sep="\t",
        index_col=0)
    ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.data.per_cpd_post_qc.txt"),
                       sep='\t',
                       index_col=0)

    ## Merge data
    data = ctrp.join(exp_ctrp['master_ccl_id']).drop_duplicates()
    data = data.merge(cl_ctrp['ccl_name'],
                      left_on='master_ccl_id',
                      right_index=True)
    data = data.merge(cp_ctrp[['broad_cpd_id', 'cpd_smiles']],
                      left_on='master_cpd_id',
                      right_index=True)
    data = data.merge(meta_ccle[['stripped_cell_line_name', 'DepMap_ID']],
                      left_on='ccl_name',
                      right_on='stripped_cell_line_name')

    ## Generate Fingerprints
    nBits = 512
    cpd_data = data[['broad_cpd_id', 'cpd_smiles']].drop_duplicates()
    bits = smiles_to_bits(cpd_data['cpd_smiles'], nBits=nBits)
    bits.index = cpd_data['broad_cpd_id']
    bits.columns = ["FP_{}".format(i) for i in range(len(bits.columns))]

    ## Subset L1000 gene features by Entrez ID
    genes_lm = genes[genes['pr_is_lm'] == 1]
    ccle_cols = np.array([[gene[0], gene[1].strip("()")]
                          for gene in ccle.columns.str.split(" ")])
    ccle.columns = ccle_cols[:, 1]
    ccle = ccle[genes_lm.index.astype(str)]

    ## Merge bits and GEX
    data = data.merge(bits, left_on="broad_cpd_id", right_index=True)
    data = data.merge(ccle, left_on="DepMap_ID", right_index=True)
    print(
        "{} unique compounds and {} unique cell lines comprising {} data points"
        .format(data['broad_cpd_id'].nunique(),
                data['stripped_cell_line_name'].nunique(), data.shape[0]))

    ## Generate folds
    target = "cpd_avg_pv"
    group = "stripped_cell_line_name"
    n_splits = 5
    gene_cols = ccle.columns.to_numpy()
    fp_cols = bits.columns.to_numpy()

    data["fold"] = -1
    data = data.sample(frac=1).reset_index(drop=True)
    gkf = model_selection.GroupKFold(n_splits=n_splits)
    for fold, (train_idx, val_idx) in enumerate(
            gkf.split(X=data,
                      y=data[target].to_numpy(),
                      groups=data[group].to_numpy())):
        print(len(train_idx), len(val_idx))
        data.loc[val_idx, 'fold'] = fold

    ## Generate transforms & write
    for fold in range(0, n_splits):
        # Copy to avoid SettingWithCopyWarning when scaling in place below.
        train = data[data['fold'] != fold].copy()
        val = data[data['fold'] == fold].copy()
        # Transform
        scaler = StandardScaler()
        train.loc[:, gene_cols] = scaler.fit_transform(train.loc[:, gene_cols])
        val.loc[:, gene_cols] = scaler.transform(val.loc[:, gene_cols])
        # Write
        train.reset_index(drop=True).to_feather(
            out_path.joinpath("train_fold_{}.feather".format(fold)))
        val.reset_index(drop=True).to_feather(
            out_path.joinpath("val_fold_{}.feather".format(fold)))
        # Testing set
        train.sample(frac=0.05).reset_index(drop=True).to_feather(
            out_path.joinpath("sub_train_fold_{}.feather".format(fold)))
        val.sample(frac=0.05).reset_index(drop=True).to_feather(
            out_path.joinpath("sub_val_fold_{}.feather".format(fold)))

    ## Write out
    joblib.dump(gene_cols, out_path.joinpath("gene_cols.pkl"))
    joblib.dump(fp_cols, out_path.joinpath("fp_cols.pkl"))
    data.sample(frac=0.05).reset_index(drop=True).to_feather(
        out_path.joinpath("data_sub.feather"))
    data.reset_index(drop=True).to_feather(out_path.joinpath("data.feather"))

    return "Complete"
Example #25
 def __init__(self, cfg):
     super(GroupKFold, self).__init__(cfg)
     self.fold = model_selection.GroupKFold(**cfg.params)
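
# A hypothetical cfg (attribute names assumed from the wrapper above):
#     cfg.params = {'n_splits': 5}
#     wrapped = GroupKFold(cfg)   # wrapped.fold is a ready sklearn splitter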
Example #26
    def __init__(self,
                 ds: pd.DataFrame,
                 n_ss_folds: int = 3,
                 n_folds: int = 5,
                 target_col: str = 'target',
                 random_state: int or None = None,
                 unlabeled_target_col: str = '5means_classes',
                 test_ratio: int = 0.25,
                 labeled_train_size_per_class: int = None,
                 unlabeled_train_size_per_class: int = None,
                 labeled_train_size: int = None,
                 unlabeled_train_size: int = None,
                 group_col: str or None = None,
                 equal_target: bool = True,
                 equal_unlabeled_target: bool = True,
                 shuffle: bool = True):
        super().__init__()

        self._test_ratio = test_ratio

        if equal_target and labeled_train_size_per_class is None:
            raise ValueError(
                "labeled_train_size_per_class must be determined when \
            equal_target is True, but found None")

        if not equal_target and labeled_train_size is None:
            raise ValueError("labeled_train_size must be determined when \
            equal_target is False, but found None")

        # Master split into Label/Unlabel
        if group_col is None:
            master_splitter = model_selection.StratifiedKFold(
                n_splits=n_ss_folds, shuffle=True,
                random_state=random_state)  # random_state requires shuffle=True
            unlabeled_idx, labeled_idx = next(
                master_splitter.split(ds, ds[target_col]))
        else:
            master_splitter = model_selection.GroupKFold(n_splits=n_ss_folds)
            unlabeled_idx, labeled_idx = next(
                master_splitter.split(ds, ds[target_col],
                                      groups=ds[group_col]))
        unlabeled_ds = ds.iloc[unlabeled_idx]
        # u_groups = ds[unlabeled_target_col].iloc[unlabeled_idx]
        labeled_ds = ds.iloc[labeled_idx]
        l_groups = ds[target_col].iloc[labeled_idx]

        if not equal_target and labeled_train_size is not None and labeled_train_size > len(
                labeled_idx):
            raise ValueError(
                'Input labeled train size {} is larger than actual labeled train size {}'
                .format(labeled_train_size, len(labeled_idx)))

        if unlabeled_train_size is not None and unlabeled_train_size > len(
                unlabeled_idx):
            unlabeled_train_size = len(unlabeled_idx)
            # raise ValueError('Input unlabeled train size {} is larger than actual unlabeled train size {}'.format(unlabeled_train_size, len(unlabeled_idx)))

        # Split labeled data using GroupKFold
        # Split unlabeled data using GroupKFold
        self.__cv_folds_idx = []
        self.__ds_chunks = []

        # split of train/val data
        if group_col is None:
            unlabeled_splitter = model_selection.StratifiedKFold(
                n_splits=n_folds, shuffle=True,
                random_state=None if random_state is None else random_state + 1)
            unlabeled_spl_iter = unlabeled_splitter.split(
                unlabeled_ds, unlabeled_ds[target_col])
        else:
            unlabeled_splitter = model_selection.GroupKFold(n_splits=n_folds)
            unlabeled_spl_iter = unlabeled_splitter.split(
                unlabeled_ds,
                unlabeled_ds[target_col],
                groups=unlabeled_ds[group_col])

        if group_col is None:
            labeled_splitter = model_selection.StratifiedKFold(
                n_splits=n_folds, shuffle=True,
                random_state=None if random_state is None else random_state + 2)
            labeled_spl_iter = labeled_splitter.split(labeled_ds,
                                                      labeled_ds[target_col])
        else:
            labeled_splitter = model_selection.GroupKFold(n_splits=n_folds)
            labeled_spl_iter = labeled_splitter.split(
                labeled_ds,
                labeled_ds[target_col],
                groups=labeled_ds[group_col])

        for i in range(n_folds):
            u_train, u_test = next(unlabeled_spl_iter)
            l_train, l_test = next(labeled_spl_iter)

            l_train_target = labeled_ds.iloc[l_train][target_col]
            l_train_data = labeled_ds.iloc[l_train]

            l_test_target = labeled_ds.iloc[l_test][target_col]
            l_test_data = labeled_ds.iloc[l_test]

            # Sample labeled_train_size of labeled data
            if equal_target:
                filtered_l_train_idx, chosen_l_train = self._sample_labeled_data(
                    l_train_data, l_train_target, target_col,
                    labeled_train_size_per_class, random_state)

                filtered_l_test_idx, chosen_l_test = self._sample_labeled_data(
                    l_test_data, l_test_target, target_col,
                    int(labeled_train_size_per_class * self._test_ratio),
                    random_state)
            else:
                if labeled_train_size is not None:
                    chosen_l_train, _ = model_selection.train_test_split(
                        l_train,
                        train_size=labeled_train_size,
                        random_state=random_state,
                        shuffle=shuffle,
                        stratify=l_train_target)
                    chosen_l_test, _ = model_selection.train_test_split(
                        l_test,
                        train_size=int(labeled_train_size * self._test_ratio),
                        random_state=random_state,
                        shuffle=shuffle,
                        stratify=l_test_target)  # stratify by the test targets, not the train targets
                else:
                    chosen_l_train = l_train
                    chosen_l_test = l_test
                filtered_l_train_idx = labeled_ds.iloc[chosen_l_train]
                filtered_l_test_idx = labeled_ds.iloc[chosen_l_test]

            # Sample unlabeled_train_size of labeled data
            if equal_unlabeled_target:
                u_train_target = unlabeled_ds.iloc[u_train][
                    unlabeled_target_col]
                u_test_target = unlabeled_ds.iloc[u_test][unlabeled_target_col]

                filtered_u_train_idx, chosen_u_train = self._sample_unlabeled_data(
                    unlabeled_ds, u_train, unlabeled_target_col,
                    u_train_target, unlabeled_train_size_per_class,
                    random_state)

                filtered_u_test_idx, chosen_u_test = self._sample_unlabeled_data(
                    unlabeled_ds, u_test, unlabeled_target_col, u_test_target,
                    int(unlabeled_train_size_per_class * self._test_ratio),
                    random_state)
            else:
                if unlabeled_train_size is not None:
                    # chosen_u_train, _ = model_selection.train_test_split(u_train, train_size=unlabeled_train_size,
                    #                                                      random_state=random_state, shuffle=shuffle)
                    is_replace = unlabeled_train_size > len(u_train)
                    chosen_u_train = resample(u_train,
                                              n_samples=unlabeled_train_size,
                                              replace=is_replace,
                                              random_state=random_state)
                    unlabeled_test_size = int(unlabeled_train_size *
                                              self._test_ratio)
                    is_replace = unlabeled_test_size > len(u_test)
                    chosen_u_test = resample(u_test,
                                             n_samples=unlabeled_test_size,
                                             replace=is_replace,
                                             random_state=random_state)
                else:
                    chosen_u_train = u_train
                    chosen_u_test = u_test

                filtered_u_train_idx = unlabeled_ds.iloc[chosen_u_train]
                filtered_u_test_idx = unlabeled_ds.iloc[chosen_u_test]

            self.__cv_folds_idx.append(
                (chosen_l_train, chosen_l_test, chosen_u_train, chosen_u_test))

            self.__ds_chunks.append(
                (filtered_l_train_idx, filtered_l_test_idx,
                 filtered_u_train_idx, filtered_u_test_idx))

        self.__folds_iter = iter(self.__ds_chunks)
Example #27
    def split(self):
        if self.problem_type in [
                'binary_classification', 'multi-class classification'
        ]:
            if self.num_target != 1:
                raise Exception(
                    'Invalid number of targets for this problem type')
            target = self.target_cols[0]
            unique_values = self.dataframe[target].nunique()
            if unique_values == 1:
                raise Exception('Only one unique value found!')
            elif unique_values > 1:
                kf = model_selection.StratifiedKFold(n_splits=self.num_folds,
                                                     shuffle=False)

                for fold, (train_idx, val_idx) in enumerate(
                        kf.split(X=self.dataframe,
                                 y=self.dataframe[target].values)):
                    self.dataframe.loc[val_idx, 'kfold'] = fold

        elif self.problem_type in [
                'single_col_regression', 'multi_col_regression'
        ]:
            if self.num_target != 1 and self.problem_type == 'single_col_regression':
                raise Exception(
                    'Invalid number of targets for this problem type')
            if self.num_target < 2 and self.problem_type == 'multi_col_regression':
                raise Exception(
                    'Invalid number of targets for this problem type')
            target = self.target_cols[0]
            kf = model_selection.KFold(n_splits=self.num_folds)
            for fold, (train_idx,
                       val_idx) in enumerate(kf.split(X=self.dataframe)):
                self.dataframe.loc[val_idx, 'kfold'] = fold

        elif self.problem_type.startswith('holdout_'):
            holdout_percentage = int(self.problem_type.split('_')[1])
            num_holdout_samples = int(
                len(self.dataframe) * holdout_percentage / 100)
            self.dataframe.loc[:len(self.dataframe) - num_holdout_samples,
                               'kfold'] = 0
            self.dataframe.loc[len(self.dataframe) - num_holdout_samples:,
                               'kfold'] = 1

        elif self.problem_type == 'multilabel_classification':
            if self.num_target != 1:
                raise Exception(
                    'Invalid number of targets for this problem type ')
            targets = self.dataframe[self.target_cols[0]].apply(
                lambda x: len(str(x).split(self.multilabel_delimeter)))
            kf = model_selection.StratifiedKFold(n_splits=self.num_folds)
            for fold, (train_idx, val_idx) in enumerate(
                    kf.split(X=self.dataframe, y=targets)):
                self.dataframe.loc[val_idx, 'kfold'] = fold

        elif self.problem_type in [
                'groupfold_regression', 'groupfold_classification'
        ]:
            if self.num_target != 1:
                raise Exception(
                    'Invalid number of targets for this problem type')
            target = self.target_cols[0]
            kf = model_selection.GroupKFold(n_splits=self.num_folds)
            for fold, (train_idx, val_idx) in enumerate(
                    kf.split(X=self.dataframe, groups=self.groupfold)):
                self.dataframe.loc[val_idx, 'kfold'] = fold
        else:
            raise Exception('Problem type not understood')

        return self.dataframe
Example #28
    def classify_rf(self,
                    max_depth=64,
                    n_estimators=1000,
                    max_features="sqrt",
                    roc_flag=False,
                    rand_flag=False,
                    save="",
                    compare_flag=True,
                    group_classes=True):
        """
        This uses LogisticRegressionCV to find the maximum mean f1 score using by adjusting the C parameter

        :param C_flag: A boolian indicating what to output from the function. (if False output the max mean f1, if True output the C value used to find the maximum mean f1 score)
        """

        # seeds random state from time
        random_state = np.random.RandomState(int(time.time()))
        np.random.seed(int(time.time() / 100))
        if group_classes:
            rng_idx = np.arange(len(self.class_list))
            np.random.shuffle(rng_idx)
        # Uncomment if you want to seed random state from iteger instead (to be able to repeat exact results)
        #random_state = np.random.RandomState(11235813)
        #np.random.seed(112358)

        # Sets and fits Random ForestModel
        model2 = ensemble.RandomForestClassifier(class_weight='balanced',
                                                 max_depth=max_depth,
                                                 max_leaf_nodes=None,
                                                 n_estimators=n_estimators,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 max_features=max_features,
                                                 n_jobs=-1)
        fitModel = model2.fit(self.X, self.y)

        # saves the model
        if len(save) > 0:
            joblib.dump(fitModel, save)

        if rand_flag:
            # Generate random drug-disease pairs
            rand_n = 10000
            self.rand_rate(rand_n, self.drugs_path, self.diseases_path)

            # Get random pairs cutoff rates
            probas_rand = fitModel.predict_proba(self.X2)

            self.data["treat_prob"] = [pr[1] for pr in probas_rand]
            rand_df_sort = self.data.sort_values(
                "treat_prob", ascending=False).reset_index(drop=True)
            rand_df_sort.to_csv(self.output + "random_pairs_names.csv",
                                index=False)
            #print(self.data.sort_values("treat_prob", ascending = False).reset_index(drop=True))

            # Get true positive cutoff rates
            probas_tp = fitModel.predict_proba(self.Xtp)

            # Get true negative cutoff rates
            probas_tn = fitModel.predict_proba(self.Xtn)

            # Plot the cutoff rates together
            self.plot_cutoff([
                pd.DataFrame({"treat_prob": [pr[1] for pr in probas_rand]}),
                pd.DataFrame({"treat_prob": [pr[1] for pr in probas_tp]}),
                pd.DataFrame({"treat_prob": [pr[1] for pr in probas_tn]})
            ], ["Random Pairs", "True Positives", "True Negatives"])

        if roc_flag:
            model = ensemble.RandomForestClassifier(class_weight='balanced',
                                                    max_depth=max_depth,
                                                    max_leaf_nodes=None,
                                                    n_estimators=n_estimators,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    max_features=max_features,
                                                    n_jobs=-1)

            # Sets up 10-fold cross validation set
            cv = ms.StratifiedKFold(n_splits=10,
                                    random_state=random_state,
                                    shuffle=True)
            if group_classes:
                cv = ms.GroupKFold(n_splits=10)
            tprs = []
            aucs = []
            f1s = []
            mean_fpr = np.linspace(0, 1, 100)

            i = 0

            # Creates a shuffled index for X and y
            shuffled_idx = np.arange(len(self.y))
            np.random.shuffle(shuffled_idx)

            # Uncomment if you want it to find and print the mean f1 score
            #test_f1_mean = np.mean(ms.cross_val_score(model, self.X[shuffled_idx], self.y[shuffled_idx], cv=10, n_jobs=-1, scoring='f1'))
            #print('using cross val score F1 = %0.4f' % (test_f1_mean))

            prob_list = []

            if group_classes:
                cv_params = {
                    "X": self.X[rng_idx],
                    "y": self.y[rng_idx],
                    "groups": list(self.class_list[rng_idx])
                }
            else:
                cv_params = {"X": self.X, "y": self.y}
            # Calculates and plots the roc curve for each fold of 10-fold cross validation
            for train, test in cv.split(**cv_params):
                model_i = model.fit(self.X[train], self.y[train])
                probas_ = model_i.predict_proba(self.X[test])
                pred = model_i.predict(self.X[test])
                f1 = met.f1_score(self.y[test], pred, average='binary')
                f1s.append(f1)
                # Compute ROC curve and area the curve
                #prob_list += [pd.DataFrame({"treat_prob":[pr[1] for pr in probas_]})]
                fpr, tpr, thresholds = met.roc_curve(self.y[test],
                                                     probas_[:, 1])
                # np.interp replaces scipy.interp, which SciPy >= 1.6 removed
                tprs.append(np.interp(mean_fpr, fpr, tpr))
                tprs[-1][0] = 0.0
                roc_auc = met.auc(fpr, tpr)
                aucs.append(roc_auc)
                plt.plot(fpr,
                         tpr,
                         lw=1,
                         alpha=0.3,
                         label='ROC fold %d (AUC = %0.4f, F1 = %0.4f)' %
                         (i, roc_auc, f1))

                i += 1

            # Plots the 50/50 line
            plt.plot([0, 1], [0, 1],
                     linestyle='--',
                     lw=2,
                     color='r',
                     label='Coin Flip',
                     alpha=.8)

            # Finds and plots the mean roc curve and mean f1 score
            mean_tpr = np.mean(tprs, axis=0)
            mean_f1 = np.mean(f1s)
            mean_tpr[-1] = 1.0
            mean_auc = met.auc(mean_fpr, mean_tpr)
            std_auc = np.std(aucs)
            plt.plot(mean_fpr,
                     mean_tpr,
                     color='b',
                     label=u'Mean ROC (AUC = %0.4f \u00B1 %0.4f, \n        \
                        Mean F1 = %0.4f)' % (mean_auc, std_auc, mean_f1),
                     lw=2,
                     alpha=.8)

            # Finds and plots the +- standard deviation for roc curve
            std_tpr = np.std(tprs, axis=0)
            tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
            tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
            plt.fill_between(mean_fpr,
                             tprs_lower,
                             tprs_upper,
                             color='grey',
                             alpha=.2,
                             label=r'$\pm$ 1 std. dev.')

            # Sets legend, limits, labels, and displays plot
            plt.xlim([-0.05, 1.05])
            plt.ylim([-0.05, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic')
            plt.legend(loc="lower right")
            outloc = self.output + '/Figure3.png'
            plt.savefig(outloc)
            #plt.show()
            plt.close()
Example #29
# The following are sampling methods based on stratification or grouping,
# for when the data distribution is uneven
print('{0:-^70}'.format('Stratified K-Fold'))
y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
skf = sm.StratifiedKFold(n_splits=4)
print('Stratified K-Fold class: ', skf)
print('splits of skf: ', skf.get_n_splits(X, y))  # one extra argument: stratify according to y
for train_indices, test_indices in skf.split(X, y):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Group K-Fold: besides X and y, an extra argument gives the group each sample belongs to.
# The sampling guarantees that the groups in the test set never overlap with those in the training set.
print('{0:-^70}'.format('Group K-Fold'))
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = sm.GroupKFold(n_splits=3)  # the number of distinct groups must be at least n_splits
print('X: \n', X)
print('y: ', y)
print('groups: ', groups)
print('Group K-Fold class: ', gkf)
print('splits of gkf: ', gkf.get_n_splits(X, y, groups))  # one more argument: the groups
for train_indices, test_indices in gkf.split(X, y, groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Leave One Group out
print('{0:-^70}'.format('Leave One Group out'))
logo = sm.LeaveOneGroupOut()
print('Leave One Group out class: ', logo)
print('splits of logo: ', logo.get_n_splits(X, y,
                                            groups=groups))  # equals the number of distinct groups
for train_indices, test_indices in logo.split(X, y, groups=groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)
Example #30
 def create(self, X, y=None):
     self.cv_iterator = model_selection.GroupKFold(n_splits=self.n_splits)
     return self