def test_sklearn_cv_with_groups(tmp_dir):
    """Run a ``SklearnTuner`` search with ``GroupKFold`` and check the outcome.

    The search is given random features, binary labels and group ids, and the
    test then checks the number of trials, the best trial's bookkeeping, and
    that the best model can be reloaded and scored.
    """
    n_trials = 10
    n_samples = 50

    oracle = kt.oracles.BayesianOptimization(
        objective=kt.Objective("score", "max"),
        max_trials=n_trials,
    )
    tuner = kt.SklearnTuner(
        oracle=oracle,
        hypermodel=build_model,
        cv=model_selection.GroupKFold(5),
        directory=tmp_dir,
    )

    # Random toy data; group ids fall in [0, 5) to match the 5 group folds.
    features = np.random.uniform(size=(n_samples, 10))
    labels = np.random.randint(0, 2, size=(n_samples, ))
    group_ids = np.random.randint(0, 5, size=(n_samples, ))

    tuner.search(features, labels, groups=group_ids)

    assert len(tuner.oracle.trials) == n_trials

    winner = tuner.oracle.get_best_trials()[0]
    assert winner.status == "COMPLETED"
    assert winner.score is not None
    assert winner.best_step == 0
    assert winner.metrics.exists("score")

    # Make sure best model can be reloaded.
    reloaded = tuner.get_best_models()[0]
    reloaded.score(features, labels)
def test_split(self):
    """Create 3-fold group splits of the same data with both implementations.

    ``fold1`` comes from ``model_selection`` and ``fold2`` from
    ``sklearn_model_selection``; both are lazy generators.
    """
    samples = np.array([0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10])
    labels = np.array(["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"])
    group_ids = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3, 3])
    split_args = (samples, labels, group_ids)

    # NOTE(review): nothing visible here consumes or compares the two
    # generators - confirm whether the assertions were lost.
    fold1 = model_selection.GroupKFold(n_splits=3).split(*split_args)
    fold2 = sklearn_model_selection.GroupKFold(n_splits=3).split(*split_args)
def k_fold_example():
    """Print the train/test indices produced by a k-fold splitter on toy data.

    The ``if False`` / ``elif False`` branches are manual switches: as written
    only the final ``else`` branch (plain shuffled ``KFold``) runs. To try the
    stratified or grouped splitter, flip the corresponding condition and swap
    in the matching commented-out ``kf.split(...)`` call below.
    """
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 1, 2])
    groups = np.array([0, 0, 2, 2])

    if False:
        # The stratified folds are made by preserving the percentage of samples for each class.
        kf = model_selection.StratifiedKFold(n_splits=4, shuffle=True, random_state=None)
        #kf = model_selection.RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=None)
        print('#splits =', kf.get_n_splits(X, y))
    elif False:
        # The same group will not appear in two different folds.
        # The number of distinct groups has to be at least equal to the number of folds.
        # NOTE(review): `groups` above has only 2 distinct values, so n_splits=4
        # would be rejected if this branch were enabled - confirm before use.
        kf = model_selection.GroupKFold(n_splits=4)
        print('#splits =', kf.get_n_splits(X, y, groups))
    else:
        kf = model_selection.KFold(n_splits=4, shuffle=True, random_state=None)
        #kf = model_selection.RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
        print('#splits =', kf.get_n_splits(X))
    print('K-fold:', kf)

    # Alternative split calls for the stratified / grouped splitters:
    #for train_indices, test_indices in kf.split(X, y):
    #for train_indices, test_indices in kf.split(X, y, groups):
    for train_indices, test_indices in kf.split(X):
        #print('TRAIN:', train_indices.shape, 'TEST:', test_indices.shape)
        print('TRAIN:', train_indices, 'TEST:', test_indices)
        # The sliced arrays are not used further; they only illustrate how to
        # index the data with the returned positions.
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]
def create_folds(datapath, output_path, nb_folds, method="KF", target=None):
    '''
    Create cross-validation folds and write them to ``output_path``.

    The CSV at ``datapath`` is read, shuffled, given a ``kfold`` column holding
    the validation-fold number of every row, and written back as CSV.

    method must be one of KF, GKF, SKF
    target if SKF is the stratify variable whose distribution must remain
        constant across folds
    target if GKF is the group column; groups never overlap between folds

    Raises ValueError if ``method`` is unknown, or if ``target`` is missing
    for GKF / SKF.
    '''
    # Validate before touching the filesystem so bad calls fail fast.
    if method not in ("KF", "GKF", "SKF"):
        raise ValueError(
            "method must be one of 'KF', 'GKF', 'SKF', got {!r}".format(method))
    if method != "KF" and target is None:
        raise ValueError(
            "target is required when method is {!r}".format(method))

    df = pd.read_csv(datapath)
    df["kfold"] = -1
    # Shuffle rows, then reset to a 0..n-1 index so the positional indices
    # returned by the splitters can be used as labels with .loc below.
    df = df.sample(frac=1).reset_index(drop=True)

    if method == "KF":
        splits = model_selection.KFold(n_splits=nb_folds).split(X=df)
    elif method == "GKF":
        # GroupKFold needs the group labels via `groups`; passing them as `y`
        # leaves groups=None, which GroupKFold rejects.
        splits = model_selection.GroupKFold(n_splits=nb_folds).split(
            X=df, groups=df[target])
    else:
        splits = model_selection.StratifiedKFold(n_splits=nb_folds).split(
            X=df, y=df[target])

    for fold, (_, valid_idx) in enumerate(splits):
        df.loc[valid_idx, 'kfold'] = fold

    df.to_csv(output_path, index=False)
def run_full_cv(train_file, user_file):
    """Run 5-fold grouped LightGBM cross-validation.

    ``train_file`` is loaded as a LightGBM dataset and ``user_file`` is a
    pickle read with pandas; it must have one row per training row. The
    pickled object is used as data, target and group labels for the splitter,
    so only its role as group labels affects the folds.
    """
    dataset = lgb.Dataset(train_file)
    dataset.construct()

    users = pd.read_pickle(user_file)
    assert dataset.num_data() == users.shape[0]

    group_folds = model_selection.GroupKFold(n_splits=5).split(
        users, users, users)

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'num_leaves': 170,
        'max_depth': 8,
        'learning_rate': 0.03,
        'feature_fraction': 0.6,
        'bagging_fraction': 0.85,
        'bagging_freq': 3,
        'verbose': 3,
    }

    lgb.cv(
        params,
        dataset,
        metrics=['binary_logloss', 'auc'],
        num_boost_round=2000,
        early_stopping_rounds=30,
        folds=group_folds,
        verbose_eval=1,
    )
def build_splits(data_dir, args, config, parser, snapshots_dir, snapshot_name):
    """
    Split the images found in ``data_dir`` into subject-wise training and
    validation folds and store them next to the experiment snapshot.

    IMPORTANT! The subject ID is taken from the file name stem: the stem is
    split on ``args.ID_char`` at most ``args.ID_split`` times and everything
    but the last piece is kept. Check that this matches the naming scheme.

    :param data_dir: Path to input and target data
    :param args: Experiment arguments (ID_char, ID_split, num_threads)
    :param config: Configuration dictionary (the 'training' section is used)
    :param parser: Function that loads the images
    :param snapshots_dir: Path to experiment logs and models
    :param snapshot_name: Name of the experiment
    :return: Dict with one 'fold_<i>' entry per fold (train/val metadata)
        plus 'mean' and 'std'
    """

    def subject_of(path):
        pieces = path.stem.split(args.ID_char, args.ID_split)
        return '_'.join(pieces[:-1])

    # Metadata and the grouping key
    metadata = build_meta_from_files(data_dir)
    metadata['subj_id'] = metadata.fname.apply(subject_of, 0)

    train_cfg = config['training']

    # Mean and std, cached per crop size
    crop = train_cfg['crop_size']
    cache_file = snapshots_dir / f"mean_std_{crop[0]}x{crop[1]}.pth"
    if cache_file.is_file() and not train_cfg['calc_meanstd']:
        print('==> Loading mean and std from cache')
        cached = torch.load(cache_file)
        mean, std = cached['mean'], cached['std']
    else:
        print('==> Estimating mean and std')
        mean, std = estimate_mean_std(config, metadata, parser,
                                      args.num_threads, train_cfg['bs'])
        torch.save({'mean': mean, 'std': std}, cache_file)
    print('==> Mean:', mean)
    print('==> STD:', std)

    # Group K-Fold by subject ID
    n_folds = train_cfg['n_folds']
    splitter = model_selection.GroupKFold(n_splits=n_folds)
    # K-fold by random shuffle (not recommended if ID is known)
    # splitter = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=args.seed)

    # One (train, val) pair of metadata frames per fold
    fold_indices = splitter.split(metadata.fname.values,
                                  groups=metadata.subj_id.values)
    splits_metadata = dict()
    for fold in range(n_folds):
        train_idx, val_idx = next(fold_indices)
        splits_metadata[f'fold_{fold}'] = {
            'train': metadata.iloc[train_idx],
            'val': metadata.iloc[val_idx],
        }

    # Normalisation statistics travel with the splits
    splits_metadata['mean'] = mean
    splits_metadata['std'] = std

    # Save splits, mean and std
    with open(snapshots_dir / snapshot_name / 'split_config.dill', 'wb') as f:
        dill.dump(splits_metadata, f)

    return splits_metadata
def folds_generator_group(nr_folds):
    """Write ``train_folds_group_<nr_folds>.csv`` with patient-wise folds.

    The raw training CSV is shuffled and every row gets, in column ``kfold``,
    the number of the fold in which it is validation data. Rows sharing a
    ``patient_id`` always land in the same fold.
    """
    frame = pd.read_csv("../../data/raw/train.csv")
    frame["kfold"] = -1
    # Shuffle, then restore a 0..n-1 index so split positions work with .loc.
    frame = frame.sample(frac=1).reset_index(drop=True)

    splitter = model_selection.GroupKFold(n_splits=nr_folds)
    fold_iter = splitter.split(X=frame,
                               y=frame.target.values,
                               groups=frame['patient_id'])
    for fold_no, (_, valid_rows) in enumerate(fold_iter):
        frame.loc[valid_rows, 'kfold'] = fold_no

    frame.to_csv('train_folds_group_{}.csv'.format(nr_folds), index=False)
def build_splits(data_dir, args, config, parser, snapshots_dir, snapshot_name):
    """Create subject-wise train/validation folds and save them to disk.

    The subject ID is the file name stem with its trailing ``_``-separated
    fields removed: two fields when ``config['training']['uCT']`` is truthy,
    three otherwise. Mean and std are loaded from a per-crop-size cache file
    when present (unless ``calc_meanstd`` forces a recalculation), and are
    stored in the returned dict together with the folds.

    :return: Dict with 'fold_<i>' entries (train/val metadata), 'mean', 'std'
    """
    # TODO correct splits

    # Metadata
    # metadata = build_meta_from_files(data_dir)
    metadata = build_meta_from_files(base_path=data_dir)

    # Group_ID: how many trailing fields to strip depends on the modality
    if config['training']['uCT']:
        trailing_fields = 2
    else:
        trailing_fields = 3
    metadata['subj_id'] = metadata.fname.apply(
        lambda x: x.stem.rsplit('_', trailing_fields)[0], 0)

    train_cfg = config['training']

    # Mean and std
    crop = train_cfg['crop_size']
    cache_file = snapshots_dir / f"mean_std_{crop[0]}x{crop[1]}.pth"
    if cache_file.is_file() and not train_cfg['calc_meanstd']:
        print('==> Loading mean and std from cache')
        cached = torch.load(cache_file)
        mean, std = cached['mean'], cached['std']
    else:
        print('==> Estimating mean and std')
        mean, std = estimate_mean_std(config, metadata, parser,
                                      args.num_threads, args.bs)
        torch.save({'mean': mean, 'std': std}, cache_file)
    print('==> Mean:', mean)
    print('==> STD:', std)

    # Group K-Fold by subject ID
    n_folds = train_cfg['n_folds']
    splitter = model_selection.GroupKFold(n_splits=n_folds)
    # K-fold by random shuffle
    # splitter = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=args.seed)

    # Create splits for all folds
    fold_indices = splitter.split(metadata, groups=metadata.subj_id)
    splits_metadata = dict()
    for fold in range(n_folds):
        train_idx, val_idx = next(fold_indices)
        splits_metadata[f'fold_{fold}'] = {
            'train': metadata.iloc[train_idx],
            'val': metadata.iloc[val_idx],
        }

    # Add mean and std to metadata
    splits_metadata['mean'] = mean
    splits_metadata['std'] = std

    with open(snapshots_dir / snapshot_name / 'split_config.dill', 'wb') as f:
        dill.dump(splits_metadata, f)

    return splits_metadata
def group_kfold():
    """Demonstrate ``GroupKFold`` on a 12-sample blob dataset.

    Prints the data, the cross-validation scores of a logistic regression
    with 3 group folds, and the train/test indices produced with 3 and with
    4 group folds.
    """
    lr = linear_model.LogisticRegression()
    x, y = datasets.make_blobs(n_samples=12, random_state=0)
    print(x)
    print(y)

    # groups!? ambiguous: one label per sample saying which samples belong
    # together. There are 4 distinct groups here.
    groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

    # `groups` must be passed by keyword: it is keyword-only in current
    # scikit-learn, and the keyword form also works on older versions.
    scores = model_selection.cross_val_score(
        lr, x, y, groups=groups, cv=model_selection.GroupKFold(n_splits=3))
    print(scores)
    print('-' * 50)

    # n_splits is spelled out: the library default is now 5, which exceeds
    # the 4 groups available and would be rejected.
    sp1 = model_selection.GroupKFold(n_splits=3)
    for train_index, test_index in sp1.split(x, y, groups):
        print(train_index, test_index)
    print()

    sp2 = model_selection.GroupKFold(n_splits=4)
    for train_index, test_index in sp2.split(x, y, groups):
        print(train_index, test_index)
    print()
def randomsearch_fitparams_epsSVR(X, y, groups=None, n_iter=20, n_jobs=1, **kws):
    '''
    Randomized hyper-parameter search for an RBF epsilon-SVR using sklearn's
    RandomizedSearchCV.

    X,y - Training input/output pairs following sklearn .fit method conventions
    groups - (optional) list of group assignments following sklearn convention;
        if non-None, GroupKFold is used, otherwise a shuffled KFold.
    n_iter, n_jobs - number of iterations and threads for RandomizedSearchCV
    **kws - 'C', 'gamma' or 'epsilon' given here (and not None) are fixed on
        the regressor and removed from the search space

    Returns the fitted RandomizedSearchCV object.
    '''
    candidates = {
        'C': stats.expon(scale=50),
        'gamma': stats.expon(scale=1),
        'epsilon': stats.expon(scale=10)
    }
    regressor = svm.SVR(kernel='rbf')

    # Split the candidates into parameters to search and parameters to fix.
    param_distributions = {}
    for name, distribution in candidates.items():
        fixed_value = kws.get(name)
        if fixed_value is None:
            param_distributions[name] = distribution
        else:
            # Assume we're using this to fix a parameter.
            setattr(regressor, name, fixed_value)

    if groups is not None:
        print(
            'Using GroupKFold to separate measurements from different animals')
        cv = model_selection.GroupKFold(n_splits=4)
    else:
        cv = model_selection.KFold(n_splits=4, shuffle=True)

    # Materialise the folds once so every candidate sees the same splits.
    cv_splits = list(cv.split(X, y, groups=groups))

    random_search = model_selection.RandomizedSearchCV(
        regressor,
        param_distributions=param_distributions,
        n_iter=n_iter,
        verbose=True,
        cv=cv_splits,
        n_jobs=n_jobs)
    random_search.fit(X, y)
    return random_search
def optimize(trial, df):
    """Objective for a hyper-parameter ``trial``: mean grouped-CV metric.

    ``df`` is read purely by column position: the last column holds the group
    labels, the three columns before it are the x, y and f targets (in that
    order), and everything before those is used as features.

    Three LightGBM models (regressors for x and y, a classifier for f) share
    the sampled hyper-parameters and are refit on each of 5 group folds.

    Returns the mean of the per-fold competition metric.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 1000)
    num_leaves = trial.suggest_int("num_leaves", 10, 500)
    learning_rate = trial.suggest_uniform("learning_rate", 0.01, 1.0)

    model_kwargs = dict(n_estimators=n_estimators,
                        num_leaves=num_leaves,
                        learning_rate=learning_rate)
    modely = lgb.LGBMRegressor(**model_kwargs)
    modelx = lgb.LGBMRegressor(**model_kwargs)
    modelf = lgb.LGBMClassifier(**model_kwargs)

    # GroupKFold takes the group labels in split(), not in the constructor.
    kf = model_selection.GroupKFold(n_splits=5)
    groups = df.iloc[:, -1]

    comp_metric = []
    for train_idx, val_idx in kf.split(df, groups=groups):
        # split() yields positions, so index with iloc; loc would only be
        # right for a default 0..n-1 index.
        df_train = df.iloc[train_idx]
        df_val = df.iloc[val_idx]

        x_train = df_train.iloc[:, :-4]
        y_trainx = df_train.iloc[:, -4]
        y_trainy = df_train.iloc[:, -3]
        y_trainf = df_train.iloc[:, -2]

        x_val = df_val.iloc[:, :-4]
        y_valx = df_val.iloc[:, -4]
        y_valy = df_val.iloc[:, -3]
        y_valf = df_val.iloc[:, -2]

        modelx.fit(x_train, y_trainx)
        modely.fit(x_train, y_trainy)
        modelf.fit(x_train, y_trainf)

        test_predsx = modelx.predict(x_val)
        test_predsy = modely.predict(x_val)
        test_predsf = modelf.predict(x_val)

        fold_metric = regression_metrics.iln_comp_metric(
            test_predsx, test_predsy, test_predsf, y_valx, y_valy, y_valf)
        comp_metric.append(fold_metric)

    return np.mean(comp_metric)
def build_sklearn(self, splitter_id, splitter_params):
    """Build splitters wrapping sklearn.

    ``splitter_id`` selects the sklearn splitter class and ``splitter_params``
    are its constructor keyword arguments. For the GroupKFold splitter,
    ``splitter_params`` must also contain 'group_column', which is handed to
    the wrapper instead of sklearn. ``splitter_params`` is never modified.

    Returns a ``SplitterWrapper``, or None when ``splitter_id`` is unknown.
    """
    # Splitters that need nothing beyond their constructor arguments.
    plain_splitters = {
        'mangoml_sklearn_KFold': 'KFold',
        'mangoml_sklearn_StratifiedKFold': 'StratifiedKFold',
        'mangoml_sklearn_ShuffleSplit': 'ShuffleSplit',
        'mangoml_sklearn_StratifiedShuffleSplit': 'StratifiedShuffleSplit',
    }
    if splitter_id in plain_splitters:
        splitter_cls = getattr(model_selection, plain_splitters[splitter_id])
        return SplitterWrapper(splitter_cls(**splitter_params))

    if splitter_id == 'mangoml_sklearn_GroupKFold':
        # Work on a copy: popping from the caller's dict would make a second
        # call with the same configuration fail on the missing key.
        params = dict(splitter_params)
        group_column = params.pop('group_column')
        return SplitterWrapper(model_selection.GroupKFold(**params),
                               group_column)

    return None
def temp(samples):
    """Log group overlap and class balance of several splitters on ``samples``.

    For each of GroupShuffleSplit, GroupKFold, StratifiedKFold and
    StratifiedGroupKFold (3 splits each), every split is logged with the
    number of groups shared by train and test, the class ratios of each side,
    and the squared difference between those ratios.
    """
    from sklearn import model_selection
    from wbia.algo.verif import sklearn_utils

    def check_balance(idxs):
        # NOTE(review): assumes y holds non-negative integer class codes,
        # which np.bincount requires - confirm against encoded_1d().
        # minlength keeps both frequency vectors the same length even when a
        # class is absent from one side of a split.
        n_classes = int(np.max(y)) + 1
        logger.info('-------')
        # Splitters yield (train, test), in that order.
        for count, (train, test) in enumerate(idxs):
            logger.info('split %r' % (count))
            groups_train = set(groups.take(train))
            groups_test = set(groups.take(test))
            n_group_isect = len(groups_train.intersection(groups_test))
            y_train_freq = np.bincount(y.take(train), minlength=n_classes)
            y_test_freq = np.bincount(y.take(test), minlength=n_classes)
            y_test_ratio = y_test_freq / y_test_freq.sum()
            y_train_ratio = y_train_freq / y_train_freq.sum()
            balance_error = np.sum((y_test_ratio - y_train_ratio) ** 2)
            logger.info('n_group_isect = %r' % (n_group_isect,))
            logger.info('y_test_ratio = %r' % (y_test_ratio,))
            logger.info('y_train_ratio = %r' % (y_train_ratio,))
            logger.info('balance_error = %r' % (balance_error,))

    # The splitters only need the number of rows, so X has zero columns.
    X = np.empty((len(samples), 0))
    y = samples.encoded_1d().values
    groups = samples.group_ids

    n_splits = 3
    splitters = [
        model_selection.GroupShuffleSplit(n_splits=n_splits),
        model_selection.GroupKFold(n_splits=n_splits),
        model_selection.StratifiedKFold(n_splits=n_splits),
        sklearn_utils.StratifiedGroupKFold(n_splits=n_splits),
    ]
    for splitter in splitters:
        idxs = list(splitter.split(X=X, y=y, groups=groups))
        check_balance(idxs)
def split_data(self):
    """Assign every row of ``self.x`` to a test fold, stored in column "TestInd".

    The splitter depends on which of ``self.split_by`` / ``self.split_parent``
    is set:

    * neither: plain ``KFold``;
    * only ``split_parent``: ``GroupKFold`` with ``split_parent`` as groups;
    * only ``split_by``: ``StratifiedKFold`` stratified on ``split_by``.

    ``self.nbins`` is the number of folds. When ``self.seed`` is not None the
    KFold / StratifiedKFold folds are shuffled with that seed, because
    scikit-learn rejects a ``random_state`` given without shuffling.
    GroupKFold is left unseeded, since ``random_state`` has not always been a
    valid argument for it.

    Sets ``self.kfold`` and ``self.folds``; ``self.folds`` is a generator and
    is consumed by this method.

    Raises:
        ValueError: if ``self.x`` is missing, or if both ``split_by`` and
            ``split_parent`` are set.
    """
    x = getattr(self, "x", None)
    if x is None:
        raise ValueError("X data is missing for splitting")
    if self.split_by is not None and self.split_parent is not None:
        raise ValueError(
            "split_by and split_parent cannot both be set for splitting")

    from sklearn import model_selection

    shuffle = self.seed is not None
    if self.split_by is None and self.split_parent is None:
        self.kfold = model_selection.KFold(
            n_splits=self.nbins, shuffle=shuffle, random_state=self.seed)
        self.folds = self.kfold.split(x)
    elif self.split_by is None:
        self.kfold = model_selection.GroupKFold(n_splits=self.nbins)
        self.folds = self.kfold.split(x, groups=self.split_parent)
    else:
        self.kfold = model_selection.StratifiedKFold(
            n_splits=self.nbins, shuffle=shuffle, random_state=self.seed)
        self.folds = self.kfold.split(x, self.split_by)

    for fold_no, (_, test) in enumerate(self.folds):
        # `test` holds positions; translate them to index labels so the
        # assignment is also right when the index is not 0..n-1.
        x.loc[x.index[test], "TestInd"] = fold_no
def __init__(self,
             ds: pd.DataFrame,
             n_folds: int = 5,
             target_col: str = 'target',
             group_col: str or None = None,
             random_state: int or None = None):
    """Precompute the cross-validation folds of ``ds``.

    Args:
        ds: Data to split.
        n_folds: Number of folds.
        target_col: Column holding the target. It is the stratification
            variable when ``group_col`` is None.
        group_col: Column holding group labels. When given (not None), folds
            are built with ``GroupKFold`` so no group spans two folds;
            otherwise ``StratifiedKFold`` is used.
        random_state: Seed for the stratified split. Folds are shuffled only
            when a seed is given, because scikit-learn rejects a
            ``random_state`` without shuffling; with None the folds are the
            deterministic, unshuffled ones. Ignored for the grouped split.

    Stores the (train, val) index pairs, the matching dataframe slices, and an
    iterator over those slices.
    """
    super().__init__()
    if group_col is None:
        splitter = model_selection.StratifiedKFold(
            n_splits=n_folds,
            shuffle=random_state is not None,
            random_state=random_state)
        split_iter = splitter.split(ds, ds[target_col])
    else:
        splitter = model_selection.GroupKFold(n_splits=n_folds)
        split_iter = splitter.split(ds, ds[target_col], groups=ds[group_col])

    self.__cv_folds_idx = list(split_iter)
    self.__ds_chunks = [(ds.iloc[train_idx], ds.iloc[val_idx])
                        for train_idx, val_idx in self.__cv_folds_idx]
    self.__folds_iter = iter(self.__ds_chunks)
def split(
    self,
    cross_validation: Union[int, model_selection.BaseCrossValidator,
                            model_selection.ShuffleSplit,
                            model_selection.StratifiedShuffleSplit],
    groups: Optional[np.ndarray] = None,
) -> Iterator[Tuple['InferenceData', 'InferenceData']]:
    """Splits the data using the indicated cross validator.

    Args:
      cross_validation: Cross validation to be applied. An int is taken as the
        number of splits: sklearn KFold is used when groups is None and
        sklearn GroupKFold otherwise. A cross validator object is used
        directly.
      groups: If cross validating for non-overlapping groups, this array
        indicates to which group each row belongs.

    Yields:
      A tuple with train and test InferenceDatas.
    """
    splitter = cross_validation
    if isinstance(cross_validation, int):
        splitter_cls = (model_selection.KFold
                        if groups is None else model_selection.GroupKFold)
        splitter = splitter_cls(n_splits=cross_validation)

    for train_rows, test_rows in splitter.split(self.data, groups=groups):
        yield (self._copy_and_index_inference_data(train_rows),
               self._copy_and_index_inference_data(test_rows))
# loop through each tournament and print the input for train and validation for index in range(0, len(tournaments)): # get the tournament name tournament = tournaments[index] print("*********** TOURNAMENT " + tournament + " ***********") # set the target name for the tournament target = "target_" + tournament # set the y train with the target variable y = train_data.iloc[:, train_data.columns == target].values.reshape(-1, ) # use GroupKFold for splitting the era group_kfold = model_selection.GroupKFold(n_splits=kfold_split) counter = 1 print(">> group eras using kfold split\n") for train_index, test_index in group_kfold.split(X, y, groups=train_data['era']): # X_train takes the 50 features only for training and leave the other columns X_train = X[train_index][:, 3:] # y_train remains the same y_train = y[train_index] print(">> running split #", counter)
from sklearn import ensemble
print(
    cross_val_score(ensemble.GradientBoostingRegressor(), X, y,
                    cv=10).mean())

###############################################################
# We can predict!
#
# But there is one caveat: are we simply learning to recognize students
# across the years? There is a lot of implicit information about students,
# notably in the school ID and the class ID.
#
# **Grouped cross-validation** To test for this, we can make sure that we
# have different students in the train and the test set.
from sklearn import model_selection
cv = model_selection.GroupKFold(10)
print(
    cross_val_score(ensemble.GradientBoostingRegressor(), X, y, cv=cv,
                    groups=exams['StudentID']).mean())

###############################################################
# It works better!
#
# The classifier learns better to generalize, probably by learning
# stronger invariances from the repeated measures on the students
#
# Summary
def process(args, n_splits=5):
    """Build per-fold expression tables plus compound metadata and write them.

    Reads the L1000 gene table, the CTRP drug-screen tables and the CCLE
    expression matrix from fixed relative paths, then writes to
    ``args.out_path``:

    * ``train_fold_<k>.csv.gz`` / ``val_fold_<k>.csv.gz`` - expression rows
      split into ``n_splits`` folds and standardised by ``stdscale_update``;
    * ``meta.csv.gz`` - merged screen metadata indexed by DepMap_ID;
    * ``fingerprints.csv.gz`` - compound fingerprints from ``smiles_to_bits``;
    * pickles with the gene, L1000 and fingerprint column names.

    Args:
        args: Object with an ``out_path`` attribute (output directory).
        n_splits: Number of folds.

    Returns:
        The string "Complete".
    """
    np.random.seed(2299)

    ## Read data
    # Paths
    out_path = Path(args.out_path)
    out_path.mkdir(parents=True, exist_ok=True)
    ds_path = Path("../data/drug_screens/")
    cm_path = Path("../data/cellular_models/")

    # L1000 genes
    genes = pd.read_csv(
        cm_path.joinpath("GSE70138_Broad_LINCS_gene_info_2017-03-06.txt.gz"),
        sep="\t")
    l1000_cols = genes[genes['pr_is_lm'] == True]['pr_gene_id'].values.astype(
        str)

    # CTRP
    cp_ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.meta.per_compound.txt"),
                          sep="\t",
                          index_col=0)
    cl_ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.meta.per_cell_line.txt"),
                          sep="\t",
                          index_col=0)
    exp_ctrp = pd.read_csv(
        ds_path.joinpath("CTRP/v20.meta.per_experiment.txt"),
        sep="\t",
        index_col=0)
    ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.data.per_cpd_post_qc.txt"),
                       sep='\t',
                       index_col=0)

    ## Create meta data
    meta_ccle = pd.read_csv(cm_path.joinpath("sample_info.csv"))
    meta = ctrp.join(exp_ctrp['master_ccl_id']).drop_duplicates()
    meta = meta.merge(cl_ctrp['ccl_name'],
                      left_on='master_ccl_id',
                      right_index=True)
    meta = meta.merge(cp_ctrp[['broad_cpd_id', 'cpd_smiles']],
                      left_on='master_cpd_id',
                      right_index=True)
    meta = meta.merge(meta_ccle[['stripped_cell_line_name', 'DepMap_ID']],
                      left_on='ccl_name',
                      right_on='stripped_cell_line_name')
    # Negative predicted values are clipped to zero.
    meta['cpd_pred_pv'] = np.clip(meta['cpd_pred_pv'], a_min=0., a_max=None)
    meta = meta.set_index('DepMap_ID')

    # CCLE, restricted to cell lines that appear in the screen metadata
    ccle = pd.read_csv(cm_path.joinpath("CCLE_expression.csv"),
                       index_col=0).rename_axis('DepMap_ID').astype(np.float32)
    ccle = ccle[ccle.index.isin(meta.index)]

    ## Generate Fingerprints
    nBits = 512
    cpd_data = meta[['broad_cpd_id', 'cpd_smiles']].drop_duplicates()
    fp = smiles_to_bits(cpd_data['cpd_smiles'], nBits=nBits)
    fp.index = cpd_data['broad_cpd_id']
    fp.columns = ["FP_{}".format(i) for i in range(len(fp.columns))]
    # `np.int` was only an alias of the builtin and has been removed from
    # NumPy; the builtin gives the same dtype.
    fp = fp.astype(int)

    ## Subset L1000 gene features by HGNC
    # Column names are split on the space; the second part, with its
    # parentheses stripped, becomes the new column name.
    ccle_cols = np.array([[gene[0], gene[1].strip("()")]
                          for gene in ccle.columns.str.split(" ")])
    ccle.columns = ccle_cols[:, 1]

    ## Get feature columns (taken before the helper 'fold' column is added)
    gene_cols = ccle.columns.to_numpy()
    fp_cols = fp.columns.to_numpy()

    ## Generate folds
    ccle["fold"] = -1
    ccle = ccle.sample(frac=1).reset_index()
    gkf = model_selection.GroupKFold(n_splits=n_splits)
    # After reset_index the index is unique per row, so each row is its own
    # group.
    for fold, (train_idx, val_idx) in enumerate(
            gkf.split(X=ccle, y=None, groups=ccle.index)):
        ccle.loc[val_idx, 'fold'] = fold
    ccle = ccle.set_index('DepMap_ID')

    ## Standardize folds & write
    for fold in range(0, n_splits):
        train = ccle[ccle['fold'] != fold].drop(columns='fold').copy()
        val = ccle[ccle['fold'] == fold].drop(columns='fold').copy()
        # Standardize GEX values
        train, val = stdscale_update(train, val, gene_cols)
        # Write
        train.to_csv(out_path.joinpath(f"train_fold_{fold}.csv.gz"))
        val.to_csv(out_path.joinpath(f"val_fold_{fold}.csv.gz"))
        # joblib.dump(scaler, out_path.joinpath(f"stdscaler_full_fold_{fold}.pkl"))

    ## Write out metadata
    meta.to_csv(out_path.joinpath("meta.csv.gz"))
    fp.to_csv(out_path.joinpath("fingerprints.csv.gz"))
    joblib.dump(gene_cols, out_path.joinpath("protcode_gene_cols.pkl"))
    joblib.dump(l1000_cols, out_path.joinpath("l1000_gene_cols.pkl"))
    joblib.dump(fp_cols, out_path.joinpath("fp_cols.pkl"))

    return "Complete"
RUN_ID = 'bb_lstm_20steps_2017-06-14_19.57.18'
MODEL_FILE = 'best.hdf5'

# Load the aligned predictions exported for this run/model.
print('Loading data')
df = pd.read_csv('bb/models/' + RUN_ID + '/' +
                 MODEL_FILE.replace('.hdf5', '-predict-aligned.csv'))

# Embedding features are the columns named z<number>.
features = [col for col in df.columns if re.fullmatch('^z[0-9]+$', col)]
print(f'Found {len(features)} embedding features: {features}')
print(df.behavior.value_counts())

X = df[features].values
# Binary target: 1 for behavior 'UDEF1', 0 for anything else.
y = np.array([int(label == 'UDEF1') for label in df.behavior])

cm = []
aucs = []
kappas = []
# 4-fold CV in which one participant's rows never span train and test.
splitter = model_selection.GroupKFold(4)
for train_i, test_i in splitter.split(X, y, df.participant_id.values):
    model = tree.DecisionTreeClassifier()

    # Weight each class by the frequency of the other one.
    y_train = y[train_i]
    imbalance = sum(y_train) / len(train_i)
    weights = np.where(y_train == 0, imbalance, 1 - imbalance)
    model.fit(X[train_i], y_train, sample_weight=weights)

    y_test = y[test_i]
    preds = model.predict(X[test_i])
    probs = model.predict_proba(X[test_i])
    aucs.append(metrics.roc_auc_score(y_test, probs[:, 1]))
    kappas.append(metrics.cohen_kappa_score(preds, y_test))
    cm.append(metrics.confusion_matrix(y_test, preds))

# Confusion matrix summed over the folds.
cm = np.sum(cm, axis=0)
print(cm)
print(kappas)
print('mean kappa: %.3f' % (sum(kappas) / len(kappas)))
print('mean AUC: %.3f' % (sum(aucs) / len(aucs)))
elif np.sum(target_data_idx) == 1:
    # Exactly one target selected: use the first entry of y_sr directly.
    Y = y_sr[0]
else:
    raise ValueError('Wrong target number')
# Name the top level of Y's column index 'target'.
Y.columns.rename('target', level=0, inplace=True)
assert all(X.index == Y.index) # check index consistency
""" Construct CV iterator """
from sklearn import model_selection
# An "odor" is the 'stim1' index value with its last two characters removed;
# each distinct odor gets an integer group id.
odor_set = set(X.index.unique(level='stim1').str[:-2])
odor_group_dict = dict(zip(odor_set,range(len(odor_set))))
stim_group = X.index.get_level_values(level='stim1').str[:-2]
stim_group = stim_group.map(lambda x: odor_group_dict[x]).tolist()
# One fold per odor group, i.e. each fold holds out exactly one odor.
group_kfold = model_selection.GroupKFold(n_splits=len(odor_set))
# stim_kfold = group_kfold.split(X, Y, groups=stim_group)
""" Build regression model with CV """
reg_param = dict( max_iter=10000, selection='random' )
from sklearn import linear_model
# NOTE(review): `cv` receives a one-shot generator; it can only be consumed
# once, so refitting this estimator would need a fresh split - confirm.
mtlcv = linear_model.MultiTaskLassoCV(copy_X=True,
                                      alphas=10**np.linspace(-4,0,num=20),
                                      cv=group_kfold.split(X, Y, groups=stim_group),
                                      n_jobs=12,#group_kfold.n_splits,
                                      verbose=1,
                                      **reg_param)
if __name__ == "__main__": args = get_args() # create folds df = pd.read_csv(args.source) df["kfold"] = -1 df = df.sample(frac=1).reset_index(drop=True) y = df.target.values assert args.type in [ "group_k_fold", "stratified_k_fold", "stratified_group_k_fold" ] if args.type == "group_k_fold": print("Using groupKFold") kf = model_selection.GroupKFold(n_splits=5) for f, (t_, v_) in enumerate( kf.split(X=df, y=y, groups=df["patient_id"].tolist())): df.loc[v_, "kfold"] = f df.to_csv(args.target, index=False) elif args.type == "stratified_k_fold": print("Using StratifiedKFold") kf = model_selection.StratifiedKFold(n_splits=5) for f, (t_, v_) in enumerate(kf.split(X=df, y=y)): df.loc[v_, "kfold"] = f df.to_csv(args.target, index=False) elif args.type == "stratified_group_k_fold": print("Using stratified_group_k_fold") groups = np.array(df["patient_id"].values) for f, (t_, v_) in enumerate(
print(assignment_ami)
print(
    "Cluster assignment accuracy over {} runs: mean={}, std={}".format(
        args.cluster, np.mean(assignment_ami), np.std(assignment_ami)))
# Classifiers to evaluate, selected by the command-line options.
classifier_models = list()
if args.knn > 0:
    classifier_models.append(neighbors.KNeighborsClassifier(args.knn))
if args.svm:
    classifier_models.append(sklearn.svm.SVC())
for model in classifier_models:
    print("Evaluating ", str(model))
    if args.predict_performance is not None:
        # Grouped folds: a person's samples never span train and test.
        split = model_selection.GroupKFold(n_splits=5).split(
            t_vectors, targets, groups=person_ids)
    else:
        split = model_selection.StratifiedKFold(n_splits=5).split(
            t_vectors, targets)  # Stratified on targets; person_ids are not used here
    score = list()
    target_wise = list()
    saved = [list(), list()]
    for train, test in split:
        model.fit(t_vectors[train, :], targets[train].squeeze())
        score.append(
            model.score(t_vectors[test, :], targets[test].squeeze()))
        print("Overall Accuracy: ", score[-1])
        predictions = model.predict(t_vectors[test, :])
        # (this statement continues outside the visible source)
        cmat = metrics.confusion_matrix(targets[test].squeeze(),
def process(out_path):
    """Merge drug-screen, fingerprint and expression data; write CV folds.

    Reads the CCLE, L1000 and CTRP tables from fixed relative paths, joins
    them into one table of data points, assigns 5 folds grouped by cell line
    name, and writes to ``out_path``:

    * ``train_fold_<k>.feather`` / ``val_fold_<k>.feather`` with the gene
      columns standardised by a scaler fitted on the training part;
    * 5% samples of each as ``sub_train_fold_<k>`` / ``sub_val_fold_<k>``;
    * ``data.feather`` and a 5% sample ``data_sub.feather``;
    * pickles with the gene and fingerprint column names.

    Args:
        out_path: Output directory (created if missing).

    Returns:
        The string "Complete".
    """
    np.random.seed(2299)

    ## Read data
    # Paths
    out_path = Path(out_path)
    out_path.mkdir(parents=True, exist_ok=True)
    ds_path = Path("../../film-gex-data/drug_screens/")
    cm_path = Path("../../film-gex-data/cellular_models/")

    # CCLE
    meta_ccle = pd.read_csv(cm_path.joinpath("sample_info.csv"))
    ccle = pd.read_csv(cm_path.joinpath("CCLE_expression.csv"), index_col=0)

    # L1000 genes
    genes = pd.read_csv(
        cm_path.joinpath("GSE70138_Broad_LINCS_gene_info_2017-03-06.txt.gz"),
        sep="\t",
        index_col=0)

    # CTRP
    cp_ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.meta.per_compound.txt"),
                          sep="\t",
                          index_col=0)
    cl_ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.meta.per_cell_line.txt"),
                          sep="\t",
                          index_col=0)
    exp_ctrp = pd.read_csv(
        ds_path.joinpath("CTRP/v20.meta.per_experiment.txt"),
        sep="\t",
        index_col=0)
    ctrp = pd.read_csv(ds_path.joinpath("CTRP/v20.data.per_cpd_post_qc.txt"),
                       sep='\t',
                       index_col=0)

    ## Merge data
    data = ctrp.join(exp_ctrp['master_ccl_id']).drop_duplicates()
    data = data.merge(cl_ctrp['ccl_name'],
                      left_on='master_ccl_id',
                      right_index=True)
    data = data.merge(cp_ctrp[['broad_cpd_id', 'cpd_smiles']],
                      left_on='master_cpd_id',
                      right_index=True)
    data = data.merge(meta_ccle[['stripped_cell_line_name', 'DepMap_ID']],
                      left_on='ccl_name',
                      right_on='stripped_cell_line_name')

    ## Generate Fingerprints
    nBits = 512
    cpd_data = data[['broad_cpd_id', 'cpd_smiles']].drop_duplicates()
    bits = smiles_to_bits(cpd_data['cpd_smiles'], nBits=nBits)
    bits.index = cpd_data['broad_cpd_id']
    bits.columns = ["FP_{}".format(i) for i in range(len(bits.columns))]

    ## Subset L1000 gene features by Entrez ID
    genes_lm = genes[genes['pr_is_lm'] == 1]
    # Column names are split on the space; the second part, with its
    # parentheses stripped, becomes the new column name.
    ccle_cols = np.array([[gene[0], gene[1].strip("()")]
                          for gene in ccle.columns.str.split(" ")])
    ccle.columns = ccle_cols[:, 1]
    ccle = ccle[genes_lm.index.astype(str)]

    ## Merge bits and GEX
    data = data.merge(bits, left_on="broad_cpd_id", right_index=True)
    data = data.merge(ccle, left_on="DepMap_ID", right_index=True)
    print(
        "{} unique compounds and {} unique cell lines comprising {} data points"
        .format(data['broad_cpd_id'].nunique(),
                data['stripped_cell_line_name'].nunique(), data.shape[0]))

    ## Generate folds
    target = "cpd_avg_pv"
    group = "stripped_cell_line_name"
    n_splits = 5
    gene_cols = ccle.columns.to_numpy()
    fp_cols = bits.columns.to_numpy()

    data["fold"] = -1
    # Shuffle, then reset to a 0..n-1 index so split positions work with .loc.
    data = data.sample(frac=1).reset_index(drop=True)
    gkf = model_selection.GroupKFold(n_splits=n_splits)
    for fold, (train_idx, val_idx) in enumerate(
            gkf.split(X=data,
                      y=data[target].to_numpy(),
                      groups=data[group].to_numpy())):
        print(len(train_idx), len(val_idx))
        data.loc[val_idx, 'fold'] = fold

    ## Generate transforms & write
    for fold in range(0, n_splits):
        # Explicit copies: the gene columns are overwritten below, and
        # assigning into a filtered selection of `data` is ambiguous to
        # pandas (SettingWithCopyWarning).
        train = data[data['fold'] != fold].copy()
        val = data[data['fold'] == fold].copy()

        # Transform: the scaler is fitted on the training rows only.
        scaler = StandardScaler()
        train.loc[:, gene_cols] = scaler.fit_transform(train.loc[:, gene_cols])
        val.loc[:, gene_cols] = scaler.transform(val.loc[:, gene_cols])

        # Write
        train.reset_index(drop=True).to_feather(
            out_path.joinpath("train_fold_{}.feather".format(fold)))
        val.reset_index(drop=True).to_feather(
            out_path.joinpath("val_fold_{}.feather".format(fold)))

        # Testing set
        train.sample(frac=0.05).reset_index(drop=True).to_feather(
            out_path.joinpath("sub_train_fold_{}.feather".format(fold)))
        val.sample(frac=0.05).reset_index(drop=True).to_feather(
            out_path.joinpath("sub_val_fold_{}.feather".format(fold)))

    ## Write out
    joblib.dump(gene_cols, out_path.joinpath("gene_cols.pkl"))
    joblib.dump(fp_cols, out_path.joinpath("fp_cols.pkl"))
    data.sample(frac=0.05).reset_index(drop=True).to_feather(
        out_path.joinpath("data_sub.feather"))
    data.reset_index(drop=True).to_feather(out_path.joinpath("data.feather"))

    return "Complete"
def __init__(self, cfg):
    """Initialise the parent with ``cfg`` and build the sklearn splitter.

    ``cfg.params`` is expanded as keyword arguments of sklearn's
    ``GroupKFold``; the resulting splitter is stored in ``self.fold``.
    """
    super().__init__(cfg)
    sklearn_splitter = model_selection.GroupKFold(**cfg.params)
    self.fold = sklearn_splitter
def __init__(self,
             ds: pd.DataFrame,
             n_ss_folds: int = 3,
             n_folds: int = 5,
             target_col: str = 'target',
             random_state: int or None = None,
             unlabeled_target_col: str = '5means_classes',
             test_ratio: float = 0.25,
             labeled_train_size_per_class: int = None,
             unlabeled_train_size_per_class: int = None,
             labeled_train_size: int = None,
             unlabeled_train_size: int = None,
             group_col: str or None = None,
             equal_target: bool = True,
             equal_unlabeled_target: bool = True,
             shuffle: bool = True):
    """Precompute semi-supervised cross-validation folds of ``ds``.

    ``ds`` is first cut into an unlabeled part (the training side of the
    first of ``n_ss_folds`` master folds) and a labeled part (its test side).
    Both parts are then split into ``n_folds`` train/test folds, and each
    side of each fold is subsampled.

    Args:
        ds: Data to split.
        n_ss_folds: Number of master folds for the labeled/unlabeled cut.
        n_folds: Number of cross-validation folds.
        target_col: Target column; the stratification variable when
            ``group_col`` is None.
        random_state: Seed. The stratified splitters shuffle only when a
            seed is given (scikit-learn rejects ``random_state`` without
            shuffling) and the two inner splitters use seed+1 / seed+2.
            With None all stratified splits are the unshuffled ones.
        unlabeled_target_col: Column used to balance the unlabeled samples.
        test_ratio: Test sample size as a fraction of the train sample size.
        labeled_train_size_per_class: Labeled samples per class; required
            when ``equal_target`` is True.
        unlabeled_train_size_per_class: Unlabeled samples per class;
            required when ``equal_unlabeled_target`` is True.
        labeled_train_size: Total labeled samples; required when
            ``equal_target`` is False.
        unlabeled_train_size: Total unlabeled samples; capped at the size of
            the unlabeled part. None keeps every row of each fold.
        group_col: Group column. When given, ``GroupKFold`` replaces
            ``StratifiedKFold`` for every split.
        equal_target: Sample the labeled data per class.
        equal_unlabeled_target: Sample the unlabeled data per class.
        shuffle: Forwarded to ``train_test_split`` for the labeled sampling
            when ``equal_target`` is False.

    Raises:
        ValueError: if a size required by the chosen sampling mode is None,
            or if ``labeled_train_size`` exceeds the labeled part.
    """
    super().__init__()

    self._test_ratio = test_ratio

    if equal_target and labeled_train_size_per_class is None:
        raise ValueError(
            "labeled_train_size_per_class must be determined when "
            "equal_target is True, but found None")
    if not equal_target and labeled_train_size is None:
        raise ValueError(
            "labeled_train_size must be determined when "
            "equal_target is False, but found None")
    # Without this check the per-class size would reach an arithmetic
    # expression as None and fail with an opaque TypeError.
    if equal_unlabeled_target and unlabeled_train_size_per_class is None:
        raise ValueError(
            "unlabeled_train_size_per_class must be determined when "
            "equal_unlabeled_target is True, but found None")

    shuffle_folds = random_state is not None

    def _seed(offset):
        # Distinct seed per splitter, or None when no seed was given.
        return None if random_state is None else random_state + offset

    def _fold_iter(frame, n_splits, offset):
        # Grouped split when a group column is given, stratified otherwise.
        if group_col is None:
            splitter = model_selection.StratifiedKFold(
                n_splits=n_splits,
                shuffle=shuffle_folds,
                random_state=_seed(offset))
            return splitter.split(frame, frame[target_col])
        splitter = model_selection.GroupKFold(n_splits=n_splits)
        return splitter.split(frame, frame[target_col],
                              groups=frame[group_col])

    # Master split into Label/Unlabel
    unlabeled_idx, labeled_idx = next(_fold_iter(ds, n_ss_folds, 0))
    unlabeled_ds = ds.iloc[unlabeled_idx]
    labeled_ds = ds.iloc[labeled_idx]

    if (not equal_target and labeled_train_size is not None
            and labeled_train_size > len(labeled_idx)):
        raise ValueError(
            'Input labeled train size {} is larger than actual labeled train size {}'
            .format(labeled_train_size, len(labeled_idx)))

    if (unlabeled_train_size is not None
            and unlabeled_train_size > len(unlabeled_idx)):
        # Too large a request is capped rather than rejected.
        unlabeled_train_size = len(unlabeled_idx)

    self.__cv_folds_idx = []
    self.__ds_chunks = []

    # split of train/val data
    unlabeled_spl_iter = _fold_iter(unlabeled_ds, n_folds, 1)
    labeled_spl_iter = _fold_iter(labeled_ds, n_folds, 2)

    for _ in range(n_folds):
        u_train, u_test = next(unlabeled_spl_iter)
        l_train, l_test = next(labeled_spl_iter)

        l_train_data = labeled_ds.iloc[l_train]
        l_train_target = l_train_data[target_col]
        l_test_data = labeled_ds.iloc[l_test]
        l_test_target = l_test_data[target_col]

        # Sample labeled_train_size of labeled data
        if equal_target:
            filtered_l_train_idx, chosen_l_train = self._sample_labeled_data(
                l_train_data, l_train_target, target_col,
                labeled_train_size_per_class, random_state)
            filtered_l_test_idx, chosen_l_test = self._sample_labeled_data(
                l_test_data, l_test_target, target_col,
                int(labeled_train_size_per_class * self._test_ratio),
                random_state)
        else:
            if labeled_train_size is not None:
                chosen_l_train, _ = model_selection.train_test_split(
                    l_train,
                    train_size=labeled_train_size,
                    random_state=random_state,
                    shuffle=shuffle,
                    stratify=l_train_target)
                # The test indices are stratified by the test labels; the
                # train labels have a different length and do not match.
                chosen_l_test, _ = model_selection.train_test_split(
                    l_test,
                    train_size=int(labeled_train_size * self._test_ratio),
                    random_state=random_state,
                    shuffle=shuffle,
                    stratify=l_test_target)
            else:
                chosen_l_train = l_train
                chosen_l_test = l_test
            filtered_l_train_idx = labeled_ds.iloc[chosen_l_train]
            filtered_l_test_idx = labeled_ds.iloc[chosen_l_test]

        # Sample unlabeled_train_size of unlabeled data
        if equal_unlabeled_target:
            u_train_target = unlabeled_ds.iloc[u_train][unlabeled_target_col]
            u_test_target = unlabeled_ds.iloc[u_test][unlabeled_target_col]

            filtered_u_train_idx, chosen_u_train = self._sample_unlabeled_data(
                unlabeled_ds, u_train, unlabeled_target_col, u_train_target,
                unlabeled_train_size_per_class, random_state)
            filtered_u_test_idx, chosen_u_test = self._sample_unlabeled_data(
                unlabeled_ds, u_test, unlabeled_target_col, u_test_target,
                int(unlabeled_train_size_per_class * self._test_ratio),
                random_state)
        else:
            if unlabeled_train_size is not None:
                # Sample with replacement only when more rows are requested
                # than the fold holds.
                is_replace = unlabeled_train_size > len(u_train)
                chosen_u_train = resample(u_train,
                                          n_samples=unlabeled_train_size,
                                          replace=is_replace,
                                          random_state=random_state)
                unlabeled_test_size = int(unlabeled_train_size *
                                          self._test_ratio)
                is_replace = unlabeled_test_size > len(u_test)
                chosen_u_test = resample(u_test,
                                         n_samples=unlabeled_test_size,
                                         replace=is_replace,
                                         random_state=random_state)
            else:
                chosen_u_train = u_train
                chosen_u_test = u_test

            filtered_u_train_idx = unlabeled_ds.iloc[chosen_u_train]
            filtered_u_test_idx = unlabeled_ds.iloc[chosen_u_test]

        self.__cv_folds_idx.append(
            (chosen_l_train, chosen_l_test, chosen_u_train, chosen_u_test))
        self.__ds_chunks.append(
            (filtered_l_train_idx, filtered_l_test_idx, filtered_u_train_idx,
             filtered_u_test_idx))

    self.__folds_iter = iter(self.__ds_chunks)
def split(self):
    """Write a fold number into the ``kfold`` column of ``self.dataframe``.

    The strategy is selected by ``self.problem_type``:

    * ``'binary_classification'`` / ``'multi-class classification'``:
      ``StratifiedKFold`` on the single target column.
    * ``'single_col_regression'`` / ``'multi_col_regression'``: plain
      ``KFold``.
    * ``'holdout_<pct>'``: the last ``<pct>`` percent of rows get fold 1,
      every other row gets fold 0.
    * ``'multilabel_classification'``: ``StratifiedKFold`` on the number of
      delimiter-separated labels in each target cell.
    * ``'groupfold_regression'`` / ``'groupfold_classification'``:
      ``GroupKFold`` with ``self.groupfold`` passed as the groups.

    Returns:
        ``self.dataframe`` (modified in place).

    Raises:
        Exception: if the number of targets does not fit the problem type,
            if a classification target has a single unique value, or if the
            problem type is not recognised.
    """
    # NOTE(review): fold indices produced by the splitters are positional but
    # are applied through ``.loc`` (label-based); this presumably relies on
    # the dataframe having a default 0..n-1 index -- confirm with the caller.
    if self.problem_type in [
            'binary_classification', 'multi-class classification'
    ]:
        if self.num_target != 1:
            raise Exception(
                'Invalid number of targets for this problem type')
        target = self.target_cols[0]
        unique_values = self.dataframe[target].nunique()
        if unique_values == 1:
            raise Exception('Only one unique value found!')
        elif unique_values > 1:
            kf = model_selection.StratifiedKFold(n_splits=self.num_folds,
                                                 shuffle=False)
            for fold, (_, val_idx) in enumerate(
                    kf.split(X=self.dataframe,
                             y=self.dataframe[target].values)):
                self.dataframe.loc[val_idx, 'kfold'] = fold

    elif self.problem_type in [
            'single_col_regression', 'multi_col_regression'
    ]:
        if self.num_target != 1 and self.problem_type == 'single_col_regression':
            raise Exception(
                'Invalid number of targets for this problem type')
        if self.num_target < 2 and self.problem_type == 'multi_col_regression':
            raise Exception(
                'Invalid number of targets for this problem type')
        kf = model_selection.KFold(n_splits=self.num_folds)
        for fold, (_, val_idx) in enumerate(kf.split(X=self.dataframe)):
            self.dataframe.loc[val_idx, 'kfold'] = fold

    elif self.problem_type.startswith('holdout_'):
        holdout_percentage = int(self.problem_type.split('_')[1])
        num_holdout_samples = int(
            len(self.dataframe) * holdout_percentage / 100)
        boundary = len(self.dataframe) - num_holdout_samples
        # ``.loc`` slices include both end labels, so the boundary row is
        # first marked 0 and then overwritten with 1 by the second statement.
        self.dataframe.loc[:boundary, 'kfold'] = 0
        self.dataframe.loc[boundary:, 'kfold'] = 1

    elif self.problem_type == 'multilabel_classification':
        if self.num_target != 1:
            raise Exception(
                'Invalid number of targets for this problem type ')
        # Stratify on how many labels each row carries.
        targets = self.dataframe[self.target_cols[0]].apply(
            lambda x: len(str(x).split(self.multilabel_delimeter)))
        kf = model_selection.StratifiedKFold(n_splits=self.num_folds)
        for fold, (_, val_idx) in enumerate(
                kf.split(X=self.dataframe, y=targets)):
            self.dataframe.loc[val_idx, 'kfold'] = fold

    elif self.problem_type in [
            'groupfold_regression', 'groupfold_classification'
    ]:
        if self.num_target != 1:
            raise Exception(
                'INvalid number of targets for this problem type')
        kf = model_selection.GroupKFold(n_splits=self.num_folds)
        # The keyword is ``groups`` (not ``group``), and the validation
        # indices must be bound to the same name that is used below.
        for fold, (_, val_idx) in enumerate(
                kf.split(X=self.dataframe, groups=self.groupfold)):
            self.dataframe.loc[val_idx, 'kfold'] = fold

    else:
        raise Exception('Problem type not understood')

    return self.dataframe
def classify_rf(self,
                max_depth=64,
                n_estimators=1000,
                max_features="sqrt",
                roc_flag=False,
                rand_flag=False,
                save="",
                compare_flag=True,
                group_classes=True):
    """
    Fit a class-balanced random forest on self.X / self.y and optionally evaluate it.

    :param max_depth: Passed to RandomForestClassifier as max_depth.
    :param n_estimators: Passed to RandomForestClassifier as n_estimators.
    :param max_features: Passed to RandomForestClassifier as max_features.
    :param roc_flag: If True, run 10-fold cross validation, plot one ROC curve per
        fold plus the mean curve (with AUC and F1 in the legend) and save the
        figure to self.output + '/Figure3.png'.
    :param rand_flag: If True, score random pairs (self.X2), true positives
        (self.Xtp) and true negatives (self.Xtn) with the fitted model, write the
        sorted random pairs to self.output + "random_pairs_names.csv" and pass the
        three probability tables to self.plot_cutoff.
    :param save: If non-empty, path the fitted model is dumped to with joblib.
    :param compare_flag: Not used by this method.
    :param group_classes: If True, cross validation uses GroupKFold with
        self.class_list as the groups; otherwise a shuffled StratifiedKFold.
    :return: None
    """
    # seeds random state from time
    random_state = np.random.RandomState(int(time.time()))
    np.random.seed(int(time.time() / 100))

    if group_classes:
        # Random permutation of the sample order used for the grouped CV below
        rng_idx = np.arange(len(self.class_list))
        np.random.shuffle(rng_idx)

    # Uncomment if you want to seed random state from iteger instead (to be able to repeat exact results)
    #random_state = np.random.RandomState(11235813)
    #np.random.seed(112358)

    # Sets and fits Random ForestModel
    model2 = ensemble.RandomForestClassifier(class_weight='balanced',
                                             max_depth=max_depth,
                                             max_leaf_nodes=None,
                                             n_estimators=n_estimators,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             max_features=max_features,
                                             n_jobs=-1)
    fitModel = model2.fit(self.X, self.y)

    # saves the model
    if len(save) > 0:
        joblib.dump(fitModel, save)

    if rand_flag:
        # Generate random drug-disease pairs
        rand_n = 10000
        self.rand_rate(rand_n, self.drugs_path, self.diseases_path)

        # Get random pairs cutoff rates
        probas_rand = fitModel.predict_proba(self.X2)
        self.data["treat_prob"] = [pr[1] for pr in probas_rand]
        rand_df_sort = self.data.sort_values(
            "treat_prob", ascending=False).reset_index(drop=True)
        rand_df_sort.to_csv(self.output + "random_pairs_names.csv",
                            index=False)

        # Get true positive cutoff rates
        probas_tp = fitModel.predict_proba(self.Xtp)

        # Get true negative cutoff rates
        probas_tn = fitModel.predict_proba(self.Xtn)

        # Plot the cutoff rates together
        self.plot_cutoff([
            pd.DataFrame({"treat_prob": [pr[1] for pr in probas_rand]}),
            pd.DataFrame({"treat_prob": [pr[1] for pr in probas_tp]}),
            pd.DataFrame({"treat_prob": [pr[1] for pr in probas_tn]})
        ], ["Random Pairs", "True Positives", "True Negatives"])

    if roc_flag:
        model = ensemble.RandomForestClassifier(class_weight='balanced',
                                                max_depth=max_depth,
                                                max_leaf_nodes=None,
                                                n_estimators=n_estimators,
                                                min_samples_leaf=1,
                                                min_samples_split=2,
                                                max_features=max_features,
                                                n_jobs=-1)

        # Sets up 10-fold cross validation set
        cv = ms.StratifiedKFold(n_splits=10,
                                random_state=random_state,
                                shuffle=True)
        if group_classes:
            cv = ms.GroupKFold(n_splits=10)
        tprs = []
        aucs = []
        f1s = []
        mean_fpr = np.linspace(0, 1, 100)

        # Creates a shuffled index for X and y. Only the commented-out
        # cross_val_score check below reads it; the shuffle call is kept so the
        # global NumPy random stream is consumed exactly as before.
        shuffled_idx = np.arange(len(self.y))
        np.random.shuffle(shuffled_idx)

        # Uncomment if you want it to find and print the mean f1 score
        #test_f1_mean = np.mean(ms.cross_val_score(model, self.X[shuffled_idx], self.y[shuffled_idx], cv=10, n_jobs=-1, scoring='f1'))
        #print('using cross val score F1 = %0.4f' % (test_f1_mean))

        # The indices yielded by cv.split refer to the arrays handed to split,
        # so the very same (possibly permuted) arrays must be indexed inside
        # the loop; indexing the unpermuted self.X / self.y would put samples
        # of one group on both sides of a fold.
        if group_classes:
            X_cv = self.X[rng_idx]
            y_cv = self.y[rng_idx]
            cv_params = {
                "X": X_cv,
                "y": y_cv,
                "groups": list(self.class_list[rng_idx])
            }
        else:
            X_cv = self.X
            y_cv = self.y
            cv_params = {"X": X_cv, "y": y_cv}

        # Calculates and plots the roc cureve for each set in 10-fold cross validation
        for i, (train, test) in enumerate(cv.split(**cv_params)):
            model_i = model.fit(X_cv[train], y_cv[train])
            probas_ = model_i.predict_proba(X_cv[test])
            pred = model_i.predict(X_cv[test])
            f1 = met.f1_score(y_cv[test], pred, average='binary')
            f1s.append(f1)

            # Compute ROC curve and area the curve
            fpr, tpr, _ = met.roc_curve(y_cv[test], probas_[:, 1])
            # np.interp stands in for sci.interp -- assumes `sci` is SciPy,
            # whose top-level `interp` was a deprecated alias of numpy.interp.
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
            roc_auc = met.auc(fpr, tpr)
            aucs.append(roc_auc)
            plt.plot(fpr,
                     tpr,
                     lw=1,
                     alpha=0.3,
                     label='ROC fold %d (AUC = %0.4f, F1 = %0.4f)' %
                     (i, roc_auc, f1))

        # Plots the 50/50 line
        plt.plot([0, 1], [0, 1],
                 linestyle='--',
                 lw=2,
                 color='r',
                 label='Coin Flip',
                 alpha=.8)

        # Finds and plots the mean roc curve and mean f1 score
        mean_tpr = np.mean(tprs, axis=0)
        mean_f1 = np.mean(f1s)
        mean_tpr[-1] = 1.0
        mean_auc = met.auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        # Written as a single literal: a backslash line-continuation inside the
        # string would leak source indentation into the legend text.
        plt.plot(mean_fpr,
                 mean_tpr,
                 color='b',
                 label=u'Mean ROC (AUC = %0.4f \u00B1 %0.4f, \n Mean F1 = %0.4f)' %
                 (mean_auc, std_auc, mean_f1),
                 lw=2,
                 alpha=.8)

        # Finds and plots the +- standard deviation for roc curve
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr,
                         tprs_lower,
                         tprs_upper,
                         color='grey',
                         alpha=.2,
                         label=r'$\pm$ 1 std. dev.')

        # Sets legend, limits, labels, and displays plot
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        outloc = self.output + '/Figure3.png'
        plt.savefig(outloc)
        #plt.show()
        plt.close()
# 以下是数据分布不均匀的时候,基于分层或分组的抽样方法 print('{0:-^70}'.format('Stratified K-Fold')) y = np.array([0, 0, 0, 0, 1, 1, 1, 1]) skf = sm.StratifiedKFold(n_splits=4) print('Stratified K-Fold class: ', skf) print('splits of skf: ', skf.get_n_splits(X, y)) # 增加一个参数,根据y来分层 for train_indices, test_indices in skf.split(X, y): print('Train Indices: ', train_indices, 'Test Indices: ', test_indices) # Group K-Fold, 除了X和y之外,还有个额外的参数是每个样本所属的组 # 抽样要保证测试集里面的数据所属的组与训练集里面的样本所属的组是完全不一样的 print('{0:-^70}'.format('Group K-Fold')) X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10] y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"] groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] gkf = sm.GroupKFold(n_splits=3) # groups的数量必须要大于n_splits print('X: \n', X) print('y: ', y) print('groups: ', groups) print('Group K-Fold class: ', gkf) print('splits of gkf: ', gkf.get_n_splits(X, y, groups)) # 再增加一个分组的参数 for train_indices, test_indices in gkf.split(X, y, groups): print('Train Indices: ', train_indices, 'Test Indices: ', test_indices) # Leave One Group out print('{0:-^70}'.format('Leave One Group out')) logo = sm.LeaveOneGroupOut() print('Leave One Group out class: ', logo) print('splits of logo: ', logo.get_n_splits(X, y, groups=groups)) # 等于groups的数量 for train_indices, test_indices in logo.split(X, y, groups=groups):
def create(self, X, y=None):
    """Build a ``GroupKFold`` splitter and store it on ``self.cv_iterator``.

    The splitter is configured with ``self.n_splits`` folds. ``X`` and ``y``
    are accepted but not read by this method.

    Returns:
        ``self``, so calls can be chained.
    """
    group_splitter = model_selection.GroupKFold(n_splits=self.n_splits)
    self.cv_iterator = group_splitter
    return self