def run(args):
    """Split a master drug-response dataset into train/val/test CV folds.

    Reads the first parquet file found under ``args['dirpath']``, separates
    feature columns from metadata, dumps both to ``<dirpath>_splits/``, and
    generates k-fold CV splits (train/val/test index lists written as CSVs)
    for several values of k. When ``args['split_on']`` is given, splitting is
    done group-wise on that meta column (hard split), so the same group never
    appears in more than one partition.

    Args:
        args : dict of run parameters (dirpath, vl_size, cell_fea, drug_fea,
               n_jobs, split_on, ...).
    """
    dirpath = Path(args['dirpath'])

    # Data splits
    vl_size = split_size(args['vl_size'])

    # Features
    cell_fea = args['cell_fea']
    drug_fea = args['drug_fea']
    fea_list = cell_fea + drug_fea

    # Other params
    n_jobs = args['n_jobs']

    # Hard split: use group-wise splitting iff a grouping column is specified
    split_on = args['split_on'] if args['split_on'] is None else args['split_on'].upper()
    cv_method = 'simple' if split_on is None else 'group'
    te_method = cv_method

    # TODO: this needs to be improved
    mltype = 'reg'  # required for the splits (stratify in case of classification)

    # -----------------------------------------------
    # Create outdir and logger
    # -----------------------------------------------
    outdir = Path(str(dirpath) + '_splits')
    os.makedirs(outdir, exist_ok=True)
    lg = Logger(outdir/'splitter.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=outdir/'args.txt')

    # -----------------------------------------------
    # Load and break data
    # -----------------------------------------------
    lg.logger.info('\nLoad master dataset.')
    files = list(dirpath.glob('**/*.parquet'))
    # TODO: assumes that there is only one data file
    if len(files) == 0:
        # Fail fast with a clear message instead of a NameError on `data` below.
        raise FileNotFoundError(f'No parquet files found in {dirpath}.')
    data = pd.read_parquet(files[0], engine='auto', columns=None)
    lg.logger.info('data.shape {}'.format(data.shape))

    # Split features and target, and dump to file
    lg.logger.info('\nSplit features and meta.')
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    xdata.to_parquet(outdir/'xdata.parquet')
    meta.to_parquet(outdir/'meta.parquet')

    lg.logger.info('Total DD: {}'.format(len([c for c in xdata.columns if 'DD_' in c])))
    lg.logger.info('Total GE: {}'.format(len([c for c in xdata.columns if 'GE_' in c])))
    lg.logger.info('Unique cells: {}'.format(meta['CELL'].nunique()))
    lg.logger.info('Unique drugs: {}'.format(meta['DRUG'].nunique()))

    plot_hist(meta['AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_all.png')

    # -----------------------------------------------
    # Generate CV splits
    # -----------------------------------------------
    np.random.seed(SEED)
    idx_vec = np.random.permutation(xdata.shape[0])

    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info(f'\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=vl_size,
                         mltype=mltype, shuffle=False, random_state=SEED)

        # Group labels re-ordered to match the permuted index vector
        cv_grp = meta[split_on].values[idx_vec] if split_on is not None else None
        if is_string_dtype(cv_grp):
            cv_grp = LabelEncoder().fit_transform(cv_grp)

        tr_folds = {}
        vl_folds = {}
        te_folds = {}

        # Start CV iters (this for loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold}')
            tr_id = idx_vec[tr_id]  # adjust the indices!
            vl_id = idx_vec[vl_id]  # adjust the indices!

            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Update te_size to the new full size of available samples,
            # so the carved-out test set matches the val set in size
            if cv_folds == 1:
                te_size_ = vl_size / (1 - vl_size)
            else:
                te_size_ = len(vl_id)/len(idx_vec) / (1 - len(vl_id)/len(idx_vec))

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method, cv_folds=1, test_size=te_size_,
                                      mltype=mltype, shuffle=False, random_state=SEED)

            # Update the index array
            idx_vec_ = tr_id
            del tr_id

            te_grp = meta[split_on].values[idx_vec_] if split_on is not None else None
            if is_string_dtype(te_grp):
                te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id]  # adjust the indices!
            te_id = idx_vec_[te_id]  # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()

            lg.logger.info('Train samples {} ({:.2f}%)'.format(len(tr_id), 100*len(tr_id)/xdata.shape[0]))
            lg.logger.info('Val samples {} ({:.2f}%)'.format(len(vl_id), 100*len(vl_id)/xdata.shape[0]))
            lg.logger.info('Test samples {} ({:.2f}%)'.format(len(te_id), 100*len(te_id)/xdata.shape[0]))

            # Confirm that group splits are correct (no group leakage)
            if split_on is not None:
                tr_grp_unq = set(meta.loc[tr_id, split_on])
                vl_grp_unq = set(meta.loc[vl_id, split_on])
                te_grp_unq = set(meta.loc[te_id, split_on])
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tTotal group ({split_on}) intersec btw vl and te: {len(vl_grp_unq.intersection(te_grp_unq))}.')
                lg.logger.info(f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')
                lg.logger.info(f'\tUnique cell lines in te: {len(te_grp_unq)}.')

        # Convert to df
        # from_dict takes too long --> faster described here: stackoverflow.com/questions/19736080/
        tr_folds = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in tr_folds.items()]))
        vl_folds = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in vl_folds.items()]))
        te_folds = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in te_folds.items()]))

        # Dump
        tr_folds.to_csv(outdir/f'{cv_folds}fold_tr_id.csv', index=False)
        vl_folds.to_csv(outdir/f'{cv_folds}fold_vl_id.csv', index=False)
        te_folds.to_csv(outdir/f'{cv_folds}fold_te_id.csv', index=False)

        # Plot target dist only for the 1-fold case
        # TODO: consider to plot dist for all k-fold where k>1
        if cv_folds == 1 and fold == 0:
            plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_train.png')
            plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_val.png')
            plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir/'AUC_hist_test.png')

            plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                              title='ytr_yvl_dist', outpath=outdir/'ytr_yvl_dist.png')

    lg.kill_logger()
    print('Done.')
def run(args):
    """Train a drug-response model on pre-generated CV splits.

    Loads the features/metadata produced by the splitter, optionally scales
    the features, then for each CV fold: trains the requested model, restores
    the best checkpoint by validation MAE, and dumps predictions and scores
    for the train/val/test partitions.

    Args:
        args : dict of run parameters (dirpath, target_name, cv_folds,
               model_name, scaler, NN/optimizer settings, ...).
    """
    dirpath = Path(args['dirpath'])
    target_name = args['target_name']
    cv_folds = args['cv_folds']

    # NN params
    epochs = args['epochs']
    batch_size = args['batch_size']
    dr_rate = args['dr_rate']

    # Optimizer
    opt_name = args['opt']
    clr_keras_kwargs = {'mode': args['clr_mode'], 'base_lr': args['clr_base_lr'],
                        'max_lr': args['clr_max_lr'], 'gamma': args['clr_gamma']}

    # Other params
    model_name = args['model_name']
    skp_ep = args['skp_ep']
    n_jobs = args['n_jobs']

    # ML type ('reg' or 'cls')
    if 'reg' in model_name:
        mltype = 'reg'
    elif 'cls' in model_name:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    src = dirpath.name.split('_')[0]

    # -----------------------------------------------
    # Create outdir and logger
    # -----------------------------------------------
    outdir = Path(str(dirpath).split('_')[0] + '_trn')
    run_outdir = create_outdir(outdir, args, src)
    lg = Logger(run_outdir / 'logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=run_outdir / 'args.txt')

    # -----------------------------------------------
    # Load data and pre-proc
    # -----------------------------------------------
    def get_file(fpath):
        # Return the file content as a 1-D array, or None when missing.
        return pd.read_csv(fpath, header=None).squeeze().values if fpath.is_file() else None

    def read_data_file(fpath, file_format='csv'):
        # Read a csv/parquet file. Returns None when the file is missing or
        # the format is unrecognized (`df` could previously be unbound here).
        fpath = Path(fpath)
        df = None
        if fpath.is_file():
            if file_format == 'csv':
                df = pd.read_csv(fpath)
            elif file_format == 'parquet':
                df = pd.read_parquet(fpath)
        return df

    # Data splits: one column per fold; rows are sample indices (NaN-padded)
    tr_id = pd.read_csv(dirpath / f'{cv_folds}fold_tr_id.csv')
    vl_id = pd.read_csv(dirpath / f'{cv_folds}fold_vl_id.csv')
    te_id = pd.read_csv(dirpath / 'te_id.csv')

    tr_dct = {}
    vl_dct = {}
    for fold in range(tr_id.shape[1]):
        tr_dct[fold] = tr_id.iloc[:, fold].dropna().values.astype(int).tolist()
        vl_dct[fold] = vl_id.iloc[:, fold].dropna().values.astype(int).tolist()
    te_id = te_id.iloc[:, 0].dropna().values.astype(int).tolist()

    # Load data
    lg.logger.info(f'\nLoading data ...')
    xdata = read_data_file(dirpath / 'xdata.parquet', 'parquet')
    meta = read_data_file(dirpath / 'meta.parquet', 'parquet')
    ydata = meta[[target_name]]

    # Scale
    lg.logger.info(f'\nScaling data ...')
    scaler = args['scaler']
    if scaler is not None:
        if scaler == 'stnd':
            scaler = StandardScaler()
        elif scaler == 'minmax':
            scaler = MinMaxScaler()
        elif scaler == 'rbst':
            scaler = RobustScaler()
        cols = xdata.columns
        xdata = pd.DataFrame(scaler.fit_transform(xdata), columns=cols, dtype=np.float32)

    # Test set (fixed across folds)
    xte = xdata.iloc[te_id, :]
    yte = np.squeeze(ydata.iloc[te_id, :]).values

    # -----------------------------------------------
    # ML model configs
    # -----------------------------------------------
    if model_name == 'lgb_reg':
        framework = 'lightgbm'
        init_kwargs = {'n_jobs': n_jobs, 'random_state': SEED, 'logger': lg.logger}
        fit_kwargs = {'verbose': False}
    elif model_name == 'nn_reg':
        framework = 'keras'
        # NOTE(review): `attn` is not defined anywhere in this scope — confirm
        # it is supposed to come from args (e.g. args['attn']).
        init_kwargs = {'input_dim': xdata.shape[1], 'dr_rate': dr_rate,
                       'opt_name': opt_name, 'attn': attn, 'logger': lg.logger}
        fit_kwargs = {'batch_size': batch_size, 'epochs': epochs, 'verbose': 1}
    elif model_name in ('nn_reg0', 'nn_reg1', 'nn_reg2'):
        # BUG FIX: the original condition `model_name == 'nn_reg0' or 'nn_reg1'
        # or 'nn_reg2'` was always truthy (non-empty string literals), so this
        # branch swallowed every model name that reached it.
        framework = 'keras'
        init_kwargs = {'input_dim': xdata.shape[1], 'dr_rate': dr_rate,
                       'opt_name': opt_name, 'logger': lg.logger}
        fit_kwargs = {'batch_size': batch_size, 'epochs': epochs, 'verbose': 1}  # 'validation_split': 0.1
    elif model_name in ('nn_reg3', 'nn_reg4'):
        # BUG FIX: same always-true `or` defect as above.
        framework = 'keras'
        init_kwargs = {'in_dim_rna': None, 'in_dim_dsc': None, 'dr_rate': dr_rate,
                       'opt_name': opt_name, 'logger': lg.logger}
        fit_kwargs = {'batch_size': batch_size, 'epochs': epochs, 'verbose': 1}  # 'validation_split': 0.1

    # -----------------------------------------------
    # Train
    # -----------------------------------------------
    lg.logger.info('\n\n{}'.format('=' * 50))
    lg.logger.info(f'Train {src} ...')
    lg.logger.info('=' * 50)

    # CV loop
    for fold, (tr_k, vl_k) in enumerate(zip(tr_dct.keys(), vl_dct.keys())):
        if lg.logger is not None:
            lg.logger.info(f'Fold {fold+1}/{cv_folds}')

        tr_id = tr_dct[tr_k]
        vl_id = vl_dct[vl_k]

        # Samples from this dataset are randomly sampled for training
        xtr = xdata.iloc[tr_id, :]
        ytr = np.squeeze(ydata.iloc[tr_id, :]).values

        # A fixed set of validation samples for the current CV split
        xvl = xdata.iloc[vl_id, :]
        yvl = np.squeeze(ydata.iloc[vl_id, :]).values

        # Get the estimator
        estimator = ml_models.get_model(model_name, init_kwargs=init_kwargs)
        model = estimator.model
        # NOTE(review): the remainder of this loop assumes a keras model
        # (plot_model, callbacks, .save); the lightgbm branch above would
        # fail here — confirm lgb_reg is handled elsewhere.
        keras.utils.plot_model(model, to_file=run_outdir / 'nn_model.png')

        # Callbacks
        model_checkpoint_dir = run_outdir / 'models'
        os.makedirs(model_checkpoint_dir, exist_ok=True)
        checkpointer = ModelCheckpoint(str(model_checkpoint_dir / 'model.ep_{epoch:d}-val_loss_{val_loss:.4f}-val_mae_{val_mean_absolute_error:.4f}.h5'),
                                       save_best_only=False)
        csv_logger = CSVLogger(run_outdir / 'training.log')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.75, patience=20,
                                      verbose=1, mode='auto', min_delta=0.0001,
                                      cooldown=3, min_lr=0.000000001)
        early_stop = EarlyStopping(monitor='val_loss', patience=60, verbose=1)
        keras_callbacks = [checkpointer, csv_logger, early_stop, reduce_lr]
        if clr_keras_kwargs['mode'] is not None:
            keras_callbacks.append(ml_models.clr_keras_callback(**clr_keras_kwargs))

        # Fit params
        fit_kwargs['validation_data'] = (xvl, yvl)
        fit_kwargs['callbacks'] = keras_callbacks

        # Train
        t0 = time()
        history = model.fit(xtr, ytr, **fit_kwargs)
        lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))

        # Dump model, history, plots
        model.save(str(run_outdir / 'model_final.h5'))
        hh = ml_models.save_krs_history(history, outdir=run_outdir)
        ml_models.plot_prfrm_metrics(history, title=f'Training', skp_ep=skp_ep,
                                     add_lr=True, outdir=run_outdir)

        # Load the best model (lowest val MAE epoch) to make preds
        eval_metric = 'val_mean_absolute_error'
        ep_best = hh.loc[hh[eval_metric] == hh[eval_metric].min(), 'epoch'].values[0]
        mpath = glob(str(model_checkpoint_dir / f'model.ep_{ep_best}-val_loss*.h5'))[0]
        model = load_model(mpath)

        # Calc preds and scores
        # ... training set
        y_pred, y_true = calc_preds(model, x=xtr, y=ytr, mltype=mltype)
        tr_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None)
        dump_dict(tr_scores, outpath=run_outdir / 'tr_scores.txt')
        pd.DataFrame({'y_true': y_true.reshape(-1),
                      'y_pred': y_pred.reshape(-1, )}).to_csv(run_outdir / 'tr_preds.csv', index=False)

        # ... val set
        y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=mltype)
        vl_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None)
        dump_dict(vl_scores, outpath=run_outdir / 'vl_scores.txt')
        pd.DataFrame({'y_true': y_true.reshape(-1, ),
                      'y_pred': y_pred.reshape(-1, )}).to_csv(run_outdir / 'vl_preds.csv', index=False)

        # ... test set
        y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=mltype)
        te_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None)
        dump_dict(te_scores, outpath=run_outdir / 'te_scores.txt')
        pd.DataFrame({'y_true': y_true.reshape(-1),
                      'y_pred': y_pred.reshape(-1, )}).to_csv(run_outdir / 'te_preds.csv', index=False)

    lg.kill_logger()
    del xdata, ydata
    print('Done.')
def run(args): # Global outdir gout = Path(args['global_outdir']) os.makedirs(gout, exist_ok=True) # dirpath = verify_dirpath(args['dirpath']) data = read_data_file(filepath / args['filepath'], 'parquet') print('data.shape', data.shape) # Get features (x), target (y), and meta fea_list = args['cell_fea'] + args['drug_fea'] xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_') meta = data.drop(columns=xdata.columns) ydata = meta[[args['target_name']]] del data # ML type ('reg' or 'cls') if 'reg' in args['model_name']: mltype = 'reg' elif 'cls' in args['model_name']: mltype = 'cls' else: raise ValueError("model_name must contain 'reg' or 'cls'.") # Create logger lg = Logger(gout / 'logfile.log') lg.logger.info(f'File path: {filepath}') lg.logger.info(f'\n{pformat(args)}') def get_unq_split_ids(all_splits_path): """ List containing the full path of each split. """ unq = [ all_splits_path[i].split(os.sep)[-1].split('_')[1] for i, p in enumerate(all_splits_path) ] # unq = [] # for i, p in enumerate(all_splits_path): # sp = all_splits_path[i].split(os.sep)[-1].split('_')[1] # unq.append(sp) unq = np.unique(unq) return unq all_splits_path = glob(str(Path(args['splitpath']) / '1fold_*_id.csv')) unq_split_ids = get_unq_split_ids(all_splits_path) run_times = [] # Append scores (dicts) tr_scores_all = [] vl_scores_all = [] te_scores_all = [] # Sample size at each run smp_sz = [] file_smp_sz = open(gout / 'sample_sz', 'w') file_smp_sz.write('run\ttr_sz\tvl_sz\tte_sz\n') # Iterate over splits n_splits = None if args['n_splits'] is None else (args['n_splits'] + 1) for i, split_id in enumerate(unq_split_ids[:n_splits]): # print(f'Split {split_id}') # Get indices for the split aa = [p for p in all_splits_path if f'1fold_{split_id}' in p] if len(aa) < 2: print(f'The split {s} contains only one file.') continue for id_file in aa: if 'tr_id' in id_file: tr_id = read_data_file(id_file) # elif 'vl_id' in id_file: # # vl_id = read_data_file( id_file ) # te_id = 
read_data_file( id_file ) elif 'vl_id' in id_file: vl_id = read_data_file(id_file) elif 'te_id' in id_file: te_id = read_data_file(id_file) # Define run outdir rout = gout / f'run_{split_id}' os.makedirs(rout, exist_ok=True) # Scaling # xdata = scale_fea(xdata=xdata, scaler_name=args['scaler']) # scale features # Get training and val data # Extract Train set T, Validation set V, and Test set E tr_id = tr_id.iloc[:, 0].values.astype(int).tolist() vl_id = vl_id.iloc[:, 0].values.astype(int).tolist() te_id = te_id.iloc[:, 0].values.astype(int).tolist() xtr, ytr, mtr = get_data_by_id( tr_id, xdata, ydata, meta) # samples from xtr are sequentially sampled for TRAIN xvl, yvl, mvl = get_data_by_id( vl_id, xdata, ydata, meta) # fixed set of VAL samples for the current CV split xte, yte, mte = get_data_by_id( te_id, xdata, ydata, meta) # fixed set of TEST samples for the current CV split # Extract val data # from sklearn.model_selection import train_test_split # id_arr = np.arange(len(xtr)) # tr_, vl_ = train_test_split(id_arr, test_size=0.1) # xvl = xtr.iloc[vl_,:].reset_index(drop=True) # xtr = xtr.iloc[tr_,:].reset_index(drop=True) # mvl = mtr.iloc[vl_,:].reset_index(drop=True) # mtr = mtr.iloc[tr_,:].reset_index(drop=True) # yvl = ytr.iloc[vl_].reset_index(drop=True) # ytr = ytr.iloc[tr_].reset_index(drop=True) # Remove AUC gap min_gap = args['min_gap'] max_gap = args['max_gap'] if (min_gap is not None) & (max_gap is not None): idx = (ytr.values > min_gap) & (ytr.values < max_gap) xtr = xtr[~idx] mtr = mtr[~idx] ytr = ytr[~idx] def drop_samples(x_df, y_df, m_df, items_to_drop, drop_by: str): """ Args: drop_by : col in df ('CELL', 'DRUG', 'CTYPE') """ id_drop = m_df[drop_by].isin(items_to_drop) x_df = x_df[~id_drop].reset_index(drop=True) y_df = y_df[~id_drop].reset_index(drop=True) m_df = m_df[~id_drop].reset_index(drop=True) return x_df, y_df, m_df # Dump cell lines # if args['cell_list_drop'] is not None: # cell_to_drop_fpath = Path(args['cell_list_drop']) # 
cell_to_drop_fname = 'cell_list_tmp' # cell_to_drop_fpath = filepath / cell_to_drop_fname if args['cell_list_drop'] is not None: cell_to_drop_fpath = Path(args['cell_list_drop']) if cell_to_drop_fpath.exists(): # with open(cell_to_drop_fpath, 'r') as f: with open(cell_to_path_fpath, 'r') as f: cells_to_drop = [line.rstrip() for line in f] xtr, ytr, mtr = drop_samples(x_df=xtr, y_df=ytr, m_df=mtr, items_to_drop=cells_to_drop) xvl, yvl, mvl = drop_samples(x_df=xvl, y_df=yvl, m_df=mvl, items_to_drop=cells_to_drop) xte, yte, mte = drop_samples(x_df=xte, y_df=yte, m_df=mte, items_to_drop=cells_to_drop) line = 's{}\t{}\t{}\t{}\n'.format(split_id, xtr.shape[0], xvl.shape[0], xte.shape[0]) file_smp_sz.write(line) # Adjust the responses if mltype == 'cls': ytr = bin_rsp(ytr, resp_thres=0.5) yvl = bin_rsp(yvl, resp_thres=0.5) yte = bin_rsp(yte, resp_thres=0.5) # Define ML model if 'lgb' in args['model_name']: args['framework'] = 'lightgbm' elif args['model_name'] == 'rf_reg': args['framework'] = 'sklearn' elif 'nn_' in args['model_name']: args['framework'] = 'keras' model_init_kwargs, model_fit_kwargs = get_model_kwargs(args) # Get the estimator estimator = ml_models.get_model(args['model_name'], init_kwargs=model_init_kwargs) model = estimator.model # Train eval_set = (xvl, yvl) # eval_set = None if args['framework'] == 'lightgbm': model, runtime = trn_lgbm_model(model=model, xtr=xtr, ytr=ytr, eval_set=eval_set, fit_kwargs=model_fit_kwargs) elif args['framework'] == 'sklearn': model, runtime = trn_sklearn_model(model=model, xtr_sub=xtr, ytr_sub=ytr, eval_set=None, fit_kwargs=model_fit_kwargs) elif args['framework'] == 'keras': model, runtime = trn_keras_model(model=model, xtr_sub=xtr, ytr_sub=ytr, eval_set=eval_set) elif args['framework'] == 'pytorch': pass else: raise ValueError(f'Framework {framework} is not yet supported.') if model is None: continue # sometimes keras fails to train a model (evaluates to nan) # Append runtime run_times.append(runtime) # Dump model if 
args['save_model']: joblib.dump(model, filename=rout / ('model.' + args['model_name'] + '.pkl')) # Calc preds and scores # ... training set y_pred, y_true = calc_preds(model, x=xtr, y=ytr, mltype=mltype) tr_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None) dump_preds(y_true, y_pred, meta=mtr, outpath=rout / 'preds_tr.csv') # ... val set y_pred, y_true = calc_preds(model, x=xvl, y=yvl, mltype=mltype) vl_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None) dump_preds(y_true, y_pred, meta=mvl, outpath=rout / 'preds_vl.csv') # ... test set y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype=mltype) te_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype=mltype, metrics=None) dump_preds(y_true, y_pred, meta=mte, outpath=rout / 'preds_te.csv') # Add metadata tr_scores['run'] = split_id vl_scores['run'] = split_id te_scores['run'] = split_id # Append scores (dicts) tr_scores_all.append(tr_scores) vl_scores_all.append(vl_scores) te_scores_all.append(te_scores) # Free space # del xtr, ytr, mtr, xvl, yvl, mvl, xte, yte, mte, tr_, vl_ del xtr, ytr, mtr, xvl, yvl, mvl, xte, yte, mte, eval_set, model, estimator if i % 10 == 0: print(f'Finished {split_id}') file_smp_sz.close() # Scores to df tr_scores_df = scores_to_df(tr_scores_all) vl_scores_df = scores_to_df(vl_scores_all) te_scores_df = scores_to_df(te_scores_all) tr_scores_df.to_csv(gout / 'tr_scores.csv', index=False) vl_scores_df.to_csv(gout / 'vl_scores.csv', index=False) te_scores_df.to_csv(gout / 'te_scores.csv', index=False) if (time() - t0) // 3600 > 0: lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600)) else: lg.logger.info('Runtime: {:.1f} min'.format((time() - t0) / 60)) del tr_scores_df, vl_scores_df, te_scores_df # -------------------------------------------------------- # Calc stats def reorg_cols(df, col_first: str): """ Args: col_first : col name to put first """ cols = df.columns.tolist() cols.remove(col_first) return 
df[[col_first] + cols] def agg_preds_from_cls_runs(runs_dirs, phase='_te.csv', verbose=False): """ Aggregate predictions bootstraped ML trainings. """ prd = [] for i, dir_name in enumerate(runs_dirs): if '_tr.csv' in phase: prd_ = pd.read_csv(dir_name / 'preds_tr.csv') elif '_vl.csv' in phase: prd_ = pd.read_csv(dir_name / 'preds_vl.csv') elif '_te.csv' in phase: prd_ = pd.read_csv(dir_name / 'preds_te.csv') # prd_te_['err'] = abs(prd_te_['y_true'] - prd_te_['y_pred']) # add col 'err' prd_['run'] = str(dir_name).split( os.sep)[-1].split('_')[-1] # add col 'run' identifier prd.append(prd_) # append run data if verbose: if i % 20 == 0: print(f'Processing {dir_name}') # Aggregate to df prd = pd.concat(prd, axis=0) # Reorganize cols prd = reorg_cols(prd, col_first='run').sort_values('run').reset_index( drop=True).reset_index().rename(columns={'index': 'idx'}) return prd # Concat preds from all runs runs_dirs = [Path(p) for p in glob(str(gout / 'run_*'))] prd_te_all = agg_preds_from_cls_runs(runs_dirs, phase='_te.csv') if 'source' not in [str(i).lower() for i in prd_te_all.columns.to_list()]: prd_te_all.insert( loc=2, column='SOURCE', value=[s.split('.')[0].lower() for s in prd_te_all['CELL']]) # Cancer types cancer_types = pd.read_csv(filepath / 'data/combined_cancer_types', sep='\t', names=['CELL', 'CTYPE']) # Add CTYPE columns prd_te_all = pd.merge(prd_te_all, cancer_types, on='CELL') prd_te_all = reorg_cols(prd_te_all, col_first='CTYPE') # Rename prd_te_all = prd_te_all.rename(columns={ 'y_true': 'y_true_cls', 'y_pred': 'y_pred_prob' }) # Retain specific columns cols = [ 'idx', 'run', 'SOURCE', 'CTYPE', 'CELL', 'DRUG', 'R2fit', 'AUC', 'y_true_cls', 'y_pred_prob' ] prd_te_all = prd_te_all[cols] # Add col of pred labels prd_te_all['y_pred_cls'] = prd_te_all.y_pred_prob.map(lambda x: 0 if x < 0.5 else 1) # The highest error is 0.5 while the lowest is 0. # This value is proportional to the square root of Brier score. 
prd_te_all['prob_err'] = abs(prd_te_all.y_true_cls - prd_te_all.y_pred_prob) # Bin AUC values bins = np.arange(0, 1.1, 0.1).tolist() prd_te_all['AUC_bin'] = pd.cut(prd_te_all.AUC, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise') # Add col that cetegorizes the preds prd_te_all['prd_cat'] = None prd_te_all.prd_cat[(prd_te_all.y_true_cls == 1) & (prd_te_all.y_pred_cls == 1)] = 'TP' prd_te_all.prd_cat[(prd_te_all.y_true_cls == 0) & (prd_te_all.y_pred_cls == 0)] = 'TN' prd_te_all.prd_cat[(prd_te_all.y_true_cls == 1) & (prd_te_all.y_pred_cls == 0)] = 'FN' prd_te_all.prd_cat[(prd_te_all.y_true_cls == 0) & (prd_te_all.y_pred_cls == 1)] = 'FP' # Add cols prd_te_all['TP'] = prd_te_all.prd_cat == 'TP' prd_te_all['TN'] = prd_te_all.prd_cat == 'TN' prd_te_all['FP'] = prd_te_all.prd_cat == 'FP' prd_te_all['FN'] = prd_te_all.prd_cat == 'FN' # Save aggregated master table prd_te_all.to_csv('prd_te_all.csv', index=False) # Plot confusion matrix from sklearn.metrics import confusion_matrix # y_true_cls = prd_te_all.y_true_cls # y_pred_cls = prd_te_all.y_pred.map(lambda x: 0 if x<0.5 else 1) y_true_cls = prd_te_all.y_true_cls y_pred_cls = prd_te_all.y_pred_cls np_conf = confusion_matrix(y_true_cls, y_pred_cls) tn, fp, fn, tp = confusion_matrix(y_true_cls, y_pred_cls).ravel() mcc = sklearn.metrics.matthews_corrcoef(y_true_cls, y_pred_cls, sample_weight=None) print('TN:', tn) print('FP:', fp) print('FN:', fn) print('TP:', tp) print('FPR:', fp / (fp + tn)) print('FNR:', fn / (fn + tp)) print('MCC:', mcc) with open(gout / 'scores.txt', 'w') as f: f.write('TN: {:d}\n'.format(tn)) f.write('TN: {:d}\n'.format(tn)) f.write('FP: {:d}\n'.format(fp)) f.write('FN: {:d}\n'.format(fn)) f.write('TP: {:d}\n'.format(tp)) f.write('FPR: {:.5f}\n'.format(fp / (fp + tn))) f.write('FNR: {:.5f}\n'.format(fn / (fn + tp))) f.write('MCC: {:.5f}\n'.format(mcc)) # Confusion Matrix conf = confusion_matrix(y_true_cls, y_pred_cls, normalize=None) conf_plot = 
ConfusionMatrixDisplay(conf, display_labels=['NoResp', 'Resp']) conf_plot.plot(include_values=True, cmap=plt.cm.Blues, ax=None, xticks_rotation=None, values_format='d') plt.savefig(gout / 'conf_mat.png', dpi=100) # Confusion Matrix (normalized) conf = confusion_matrix(y_true_cls, y_pred_cls, normalize='all') conf_plot = ConfusionMatrixDisplay(conf, display_labels=['NoResp', 'Resp']) conf_plot.plot(include_values=True, cmap=plt.cm.Blues, ax=None, xticks_rotation=None, values_format='.2f') conf_plot.ax_.set_title('Normalized') plt.savefig(gout / 'conf_mat_norm.png', dpi=100) def add_conf_data(data): """ Add columns are used to calc confusion matrix TP, TN, FN, FP. """ data['TP'] = data.apply(lambda row: row.y_pred_cls_1 if row.y_true == 1 else False, axis=1) # tp data['TN'] = data.apply(lambda row: row.y_pred_cls_0 if row.y_true == 0 else False, axis=1) # tn data['FN'] = data.apply(lambda row: row.y_pred_cls_0 if row.y_true == 1 else False, axis=1) # fn data['FP'] = data.apply(lambda row: row.y_pred_cls_1 if row.y_true == 0 else False, axis=1) # fp data['TPR'] = data.apply( lambda row: np.nan if (row.TP == 0) & (row.FN == 0) else row.TP / (row.TP + row.FN), axis=1) # sensitivity, recall: TP/P = TP/(TP+FN) data['TNR'] = data.apply(lambda row: np.nan if (row.TN == 0) & (row.FP == 0) else row.TN / (row.TN + row.FP), axis=1) # specificity: TN/N = TN/(TN+FP) data['FPR'] = data.apply(lambda row: np.nan if (row.TN == 0) & (row.FP == 0) else row.FP / (row.TN + row.FP), axis=1) # fall-out: FP/N = FP/(FP+TN) data['FNR'] = data.apply(lambda row: np.nan if (row.TP == 0) & (row.FN == 0) else row.FN / (row.TP + row.FN), axis=1) # miss-rate: FN/NP = FN/(FN+TP) return data # Summary table prd_te_to_grp = prd_te_all.copy() prd_te_to_grp['y_pred_prob_median'] = prd_te_to_grp.y_pred_prob prd_te_to_grp['y_pred_prob_std'] = prd_te_to_grp.y_pred_prob prd_te_to_grp['y_pred_tot'] = prd_te_to_grp.idx prd_te_to_grp['y_pred_cls_0'] = prd_te_to_grp.y_pred.map( lambda x: True if x < 0.5 else 
False) prd_te_to_grp['y_pred_cls_1'] = prd_te_to_grp.y_pred.map( lambda x: True if x >= 0.5 else False) prd_te_to_grp['y_true_unq_vals'] = prd_te_to_grp.y_true_cls # ----------------------- # Groupby Cell # ----------------------- by = 'CELL' sm_cell = prd_te_to_grp.groupby([by, 'y_true']).agg({ 'DRUG': 'unique', 'CTYPE': 'unique', 'y_true_unq_vals': 'unique', 'y_pred_prob_median': np.median, 'y_pred_prob_std': np.std, 'y_pred_cls_0': lambda x: int(sum(x)), 'y_pred_cls_1': lambda x: int(sum(x)), 'y_pred_tot': lambda x: len(np.unique(x)), }).reset_index().sort_values(by, ascending=True) sm_cell['y_true_unq_vals'] = sm_cell.y_true_unq_vals.map( lambda x: len(x) if type(x) == np.ndarray else 1) sm_cell = add_conf_data(sm_cell) sm_cell.to_csv(gout / 'sm_by_cell.csv', index=False) # ----------------------- # Groupby Cancer Type # ----------------------- by = 'CTYPE' sm_ctype = prd_te_to_grp.groupby([by, 'y_true']).agg({ 'DRUG': 'unique', 'CELL': 'unique', 'y_true_unq_vals': 'unique', 'y_pred_prob_median': np.median, 'y_pred_prob_std': np.std, 'y_pred_cls_0': lambda x: int(sum(x)), 'y_pred_cls_1': lambda x: int(sum(x)), 'y_pred_tot': lambda x: len(np.unique(x)), }).reset_index().sort_values(by, ascending=True) sm_ctype['y_true_unq_vals'] = sm_ctype.y_true_unq_vals.map( lambda x: len(x) if type(x) == np.ndarray else 1) sm_ctype = add_conf_data(sm_ctype) sm_ctype.to_csv(gout / 'sm_by_ctype.csv', index=False) # ----------------------- # Groupby Drug # ----------------------- by = 'DRUG' sm_drug = prd_te_to_grp.groupby([by, 'y_true']).agg({ 'CTYPE': 'unique', 'CELL': 'unique', 'y_true_unq_vals': 'unique', 'y_pred_prob_median': np.median, 'y_pred_prob_std': np.std, 'y_pred_cls_0': lambda x: int(sum(x)), 'y_pred_cls_1': lambda x: int(sum(x)), 'y_pred_tot': lambda x: len(np.unique(x)), }).reset_index().sort_values(by, ascending=True) sm_drug['y_true_unq_vals'] = sm_drug.y_true_unq_vals.map( lambda x: len(x) if type(x) == np.ndarray else 1) sm_drug = add_conf_data(sm_drug) 
sm_drug.to_csv(gout / 'sm_by_drug.csv', index=False) # -------------------------------------------------------- lg.kill_logger()
} prfx_dtypes = { 'rna': np.float32, 'cnv': np.int8, 'snp': np.int8, 'dsc': np.float32, 'fng': np.int8 } # ----------------------------------------------- # Create outdir and logger # ----------------------------------------------- outdir = create_outdir(OUTDIR, args) args['outdir'] = str(outdir) lg = Logger(outdir / 'create_tidy_logfile.log') lg.logger.info(f'File path: {filepath}') lg.logger.info(f'\n{pformat(args)}') dump_dict(args, outpath=outdir / 'create_tidy_args.txt') # dump args # ----------------------------------------------- # Load response data, and features # ----------------------------------------------- rsp = load_rsp(RSP_FILENAME, logger=lg.logger, args=args) rna = load_rna(rna_norm=args['rna_norm'], logger=lg.logger, float_type=prfx_dtypes['rna']) dsc = load_dsc(DSC_FILENAME, logger=lg.logger, float_type=prfx_dtypes['dsc']) # ----------------------------------------------- # Load cell and drug meta
def run(args):
    """Train learning curves for one (or more) drug-response data sources.

    Reads ``xdata.parquet``/``meta.parquet`` and pre-computed CV split index
    files from ``args['dirpath']``, optionally scales the features, builds the
    framework-specific model configs and runs
    ``LearningCurve.trn_learning_curve`` for each source.

    Args:
        args (dict): parsed CLI arguments (dirpath, target_name, cv_folds,
            feature prefixes, NN/optimizer hyper-params, model_name, n_jobs,
            scaler, CLR settings, etc.).
    """
    dirpath = Path(args['dirpath'])
    # dname = args['dname']
    # src_names = args['src_names']

    # Target
    target_name = args['target_name']

    # Data split
    cv_folds = args['cv_folds']

    # Features
    cell_fea = args['cell_features']
    drug_fea = args['drug_features']
    other_fea = args['other_features']
    fea_list = cell_fea + drug_fea + other_fea

    # NN params
    epochs = args['epochs']
    batch_size = args['batch_size']
    dr_rate = args['dr_rate']

    # Optimizer
    opt_name = args['opt']
    clr_keras_kwargs = {
        'mode': args['clr_mode'],
        'base_lr': args['clr_base_lr'],
        'max_lr': args['clr_max_lr'],
        'gamma': args['clr_gamma']
    }

    # Learning curve
    n_shards = args['n_shards']

    # Other params
    # framework = args['framework']
    model_name = args['model_name']
    n_jobs = args['n_jobs']

    # ML type ('reg' or 'cls') is inferred from the model name.
    if 'reg' in model_name:
        mltype = 'reg'
    elif 'cls' in model_name:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # ========================================================================
    # Load data and pre-proc
    # ========================================================================
    dfs = {}

    def get_file(fpath):
        # Return file content as a flat numpy array, or None if missing.
        return pd.read_csv(
            fpath, header=None).squeeze().values if fpath.is_file() else None

    def read_data_file(fpath, file_format='csv'):
        # Load a csv/parquet file; return None if the file is missing or the
        # format is unknown.
        # BUG FIX: `df` was left unbound on one branch (existing file with an
        # unrecognized format, or missing file, depending on how the original
        # `else` attached), raising UnboundLocalError. Initialize it up front.
        fpath = Path(fpath)
        df = None
        if fpath.is_file():
            if file_format == 'csv':
                df = pd.read_csv(fpath)
            elif file_format == 'parquet':
                df = pd.read_parquet(fpath)
        return df

    if dirpath is not None:
        xdata = read_data_file(dirpath / 'xdata.parquet', 'parquet')
        meta = read_data_file(dirpath / 'meta.parquet', 'parquet')
        ydata = meta[[target_name]]

        tr_id = pd.read_csv(dirpath / f'{cv_folds}fold_tr_id.csv')
        vl_id = pd.read_csv(dirpath / f'{cv_folds}fold_vl_id.csv')
        # tr_ids_list = get_file( dirpath/f'{cv_folds}fold_tr_id.csv' )
        # vl_ids_list = get_file( dirpath/f'{cv_folds}fold_vl_id.csv' )
        # te_ids_list = get_file( dirpath/f'{cv_folds}fold_te_id.csv' )

        src = dirpath.name.split('_')[0]
        dfs[src] = (ydata, xdata, tr_id, vl_id)

    # BUG FIX: the original `elif dname == 'combined':` referenced `dname`,
    # which is never assigned (its assignment above is commented out), so this
    # branch raised NameError whenever dirpath was None.
    elif args.get('dname') == 'combined':
        # TODO: this is not used anymore (probably won't work)
        # NOTE(review): `src_names` and `cv_method` are also undefined here —
        # this dead branch needs a rewrite before being re-enabled.
        DATADIR = file_path / '../../data/processed/data_splits'
        DATAFILENAME = 'data.parquet'
        dirs = glob(str(DATADIR / '*'))

        for src in src_names:
            print(f'\n{src} ...')
            subdir = f'{src}_cv_{cv_method}'
            if str(DATADIR / subdir) in dirs:
                # Get the CV indexes
                tr_id = pd.read_csv(DATADIR / subdir / f'{cv_folds}fold_tr_id.csv')
                vl_id = pd.read_csv(DATADIR / subdir / f'{cv_folds}fold_vl_id.csv')

                # Get the data
                datapath = DATADIR / subdir / DATAFILENAME
                data = pd.read_parquet(datapath)
                xdata, _, meta, _ = break_src_data(
                    data, target=None, scaler=None)  # logger=lg.logger
                ydata = meta[[target_name]]

                dfs[src] = (ydata, xdata, tr_id, vl_id)
                del data, xdata, ydata, tr_id, vl_id, src

    for src, data in dfs.items():
        ydata, xdata, tr_id, vl_id = data[0], data[1], data[2], data[3]

        # Scale features (scaler name -> sklearn scaler instance)
        scaler = args['scaler']
        if scaler is not None:
            if scaler == 'stnd':
                scaler = StandardScaler()
            elif scaler == 'minmax':
                scaler = MinMaxScaler()
            elif scaler == 'rbst':
                scaler = RobustScaler()
            cols = xdata.columns
            xdata = pd.DataFrame(scaler.fit_transform(xdata),
                                 columns=cols,
                                 dtype=np.float32)

        # -----------------------------------------------
        # Create outdir and logger
        # -----------------------------------------------
        run_outdir = create_outdir(OUTDIR, args, src)
        lg = Logger(run_outdir / 'logfile.log')
        lg.logger.info(f'File path: {file_path}')
        lg.logger.info(f'\n{pformat(args)}')

        # Dump args to file
        utils.dump_dict(args, outpath=run_outdir / 'args.txt')

        # -----------------------------------------------
        # ML model configs
        # -----------------------------------------------
        if model_name == 'lgb_reg':
            framework = 'lightgbm'
            init_kwargs = {
                'n_jobs': n_jobs,
                'random_state': SEED,
                'logger': lg.logger
            }
            fit_kwargs = {'verbose': False}
        elif model_name == 'nn_reg':
            framework = 'keras'
            # NOTE(review): `attn` is not defined in this function; presumably
            # a module-level global or it should come from args — confirm.
            init_kwargs = {
                'input_dim': xdata.shape[1],
                'dr_rate': dr_rate,
                'opt_name': opt_name,
                'attn': attn,
                'logger': lg.logger
            }
            fit_kwargs = {
                'batch_size': batch_size,
                'epochs': epochs,
                'verbose': 1
            }
        # BUG FIX: `model_name == 'nn_reg0' or 'nn_reg1' or 'nn_reg2'` was
        # always truthy (non-empty string literals), so this branch swallowed
        # every model name that reached it. Use a membership test.
        elif model_name in ('nn_reg0', 'nn_reg1', 'nn_reg2'):
            framework = 'keras'
            init_kwargs = {
                'input_dim': xdata.shape[1],
                'dr_rate': dr_rate,
                'opt_name': opt_name,
                'logger': lg.logger
            }
            fit_kwargs = {
                'batch_size': batch_size,
                'epochs': epochs,
                'verbose': 1
            }  # 'validation_split': 0.1
        # BUG FIX: same always-truthy `or` chain as above.
        elif model_name in ('nn_reg3', 'nn_reg4'):
            framework = 'keras'
            init_kwargs = {
                'in_dim_rna': None,
                'in_dim_dsc': None,
                'dr_rate': dr_rate,
                'opt_name': opt_name,
                'logger': lg.logger
            }
            fit_kwargs = {
                'batch_size': batch_size,
                'epochs': epochs,
                'verbose': 1
            }  # 'validation_split': 0.1
        else:
            # Previously unknown names fell into the nn_reg0 branch due to the
            # truthy bug; fail loudly instead of training the wrong model.
            raise ValueError(f"Unknown model_name: {model_name}")

        # -----------------------------------------------
        # Learning curve
        # -----------------------------------------------
        lg.logger.info('\n\n{}'.format('=' * 50))
        lg.logger.info(f'Learning curves {src} ...')
        lg.logger.info('=' * 50)

        t0 = time()
        lc = LearningCurve(X=xdata,
                           Y=ydata,
                           cv=None,
                           cv_lists=(tr_id, vl_id),
                           n_shards=n_shards,
                           shard_step_scale='log10',
                           args=args,
                           logger=lg.logger,
                           outdir=run_outdir)

        lrn_crv_scores = lc.trn_learning_curve(
            framework=framework,
            mltype=mltype,
            model_name=model_name,
            init_kwargs=init_kwargs,
            fit_kwargs=fit_kwargs,
            clr_keras_kwargs=clr_keras_kwargs,
            n_jobs=n_jobs,
            random_state=SEED)

        # BUG FIX: hours conversion used 360 instead of 3600 seconds/hour,
        # over-reporting the runtime by a factor of 10.
        lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))

        # (Removed a large commented-out/dead string block describing the
        # sklearn `learning_curve` approach; it could not log multiple metrics
        # and was superseded by LearningCurve above.)

        lg.kill_logger()
        del xdata, ydata

    print('Done.')
def run(args):
    """Train learning curves for a single pre-split data directory.

    Loads ``xdata.parquet``/``meta.parquet`` and the k-fold tr/vl/te index
    files from ``args['dirpath']``, scales the features, builds the model
    config for ``args['model_name']`` and runs
    ``LearningCurve.trn_learning_curve``.

    Args:
        args (dict): parsed CLI arguments (dirpath, target_name, cv_folds,
            feature prefixes, NN/optimizer/GBM hyper-params, learning-curve
            shard settings, model_name, n_jobs, seed, scaler, etc.).
    """
    dirpath = Path(args['dirpath'])
    assert dirpath.exists(), 'You must specify the dirpath.'

    target_name = args['target_name']
    cv_folds = args['cv_folds']
    cv_folds_arr = args['cv_folds_arr']

    # Features
    cell_fea = args['cell_fea']
    drug_fea = args['drug_fea']
    # other_fea = args['other_fea']
    # fea_list = cell_fea + drug_fea + other_fea
    fea_list = cell_fea + drug_fea

    # NN params
    epochs = args['epochs']
    batch_size = args['batch_size']
    dr_rate = args['dr_rate']
    batchnorm = args['batchnorm']

    # Optimizer
    opt_name = args['opt']
    lr = args['lr']
    clr_keras_kwargs = {
        'mode': args['clr_mode'],
        'base_lr': args['clr_base_lr'],
        'max_lr': args['clr_max_lr'],
        'gamma': args['clr_gamma']
    }

    # Learning curve
    shard_step_scale = args['shard_step_scale']
    min_shard = args['min_shard']
    max_shard = args['max_shard']
    n_shards = args['n_shards']
    shards_arr = args['shards_arr']

    # Other params
    # framework = args['framework']
    model_name = args['model_name']
    n_jobs = args['n_jobs']

    # ML type ('reg' or 'cls') is inferred from the model name.
    if 'reg' in model_name:
        mltype = 'reg'
    elif 'cls' in model_name:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # -----------------------------------------------
    # Load data and pre-proc
    # -----------------------------------------------
    xdata = read_data_file(dirpath / 'xdata.parquet', 'parquet')
    meta = read_data_file(dirpath / 'meta.parquet', 'parquet')
    ydata = meta[[target_name]]

    tr_id = read_data_file(dirpath / f'{cv_folds}fold_tr_id.csv')
    vl_id = read_data_file(dirpath / f'{cv_folds}fold_vl_id.csv')
    te_id = read_data_file(dirpath / f'{cv_folds}fold_te_id.csv')

    # Source name is the prefix of the dir name (e.g. 'gdsc_...' -> 'gdsc').
    src = dirpath.name.split('_')[0]

    # -----------------------------------------------
    # Create outdir and logger
    # -----------------------------------------------
    # NOTE(review): splitting the full path string on '_' truncates at the
    # first underscore anywhere in the path — verify against expected layout.
    outdir = Path(str(dirpath).split('_')[0] + '_trn')
    run_outdir = create_outdir(outdir, args, src)
    lg = Logger(run_outdir / 'logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=run_outdir / 'args.txt')

    # -----------------------------------------------
    # Data preprocessing
    # -----------------------------------------------
    xdata = scale_fea(xdata=xdata, scaler_name=args['scaler'])  # scale features

    # -----------------------------------------------
    # ML model configs
    # -----------------------------------------------
    if model_name == 'lgb_reg':
        framework = 'lightgbm'
        init_kwargs = {
            'n_estimators': args['gbm_trees'],
            'max_depth': args['gbm_max_depth'],
            'learning_rate': args['gbm_lr'],
            'num_leaves': args['gbm_leaves'],
            'n_jobs': args['n_jobs'],
            'random_state': args['seed']
        }
        fit_kwargs = {'verbose': False}
    elif model_name == 'rf_reg':
        framework = 'sklearn'
        init_kwargs = {'n_jobs': args['n_jobs'], 'random_state': args['seed']}
        fit_kwargs = {}
    # BUG FIX: the original condition was
    # `model_name == 'nn_reg0' or 'nn_reg1' or ...`, which is always truthy
    # (non-empty string literals), so any model name reaching this elif was
    # treated as a keras model. Use a membership test instead.
    elif model_name in ('nn_reg0', 'nn_reg1', 'nn_reg_layer_less',
                        'nn_reg_layer_more', 'nn_reg_neuron_less',
                        'nn_reg_neuron_more'):
        framework = 'keras'
        init_kwargs = {
            'input_dim': xdata.shape[1],
            'dr_rate': dr_rate,
            'opt_name': opt_name,
            'lr': lr,
            'batchnorm': batchnorm,
            'logger': lg.logger
        }
        fit_kwargs = {
            'batch_size': batch_size,
            'epochs': epochs,
            'verbose': 1
        }  # 'validation_split': 0.1
    else:
        # Previously unknown names fell into the keras branch due to the
        # truthy bug; fail loudly instead of training the wrong model.
        raise ValueError(f"Unknown model_name: {model_name}")

    # -----------------------------------------------
    # Learning curve
    # -----------------------------------------------
    lg.logger.info('\n\n{}'.format('-' * 50))
    lg.logger.info(f'Learning curves {src} ...')
    lg.logger.info('-' * 50)

    lrn_crv_init_kwargs = {
        'cv': None,
        'cv_lists': (tr_id, vl_id, te_id),
        'cv_folds_arr': cv_folds_arr,
        'shard_step_scale': shard_step_scale,
        'n_shards': n_shards,
        'min_shard': min_shard,
        'max_shard': max_shard,
        'shards_arr': shards_arr,
        'args': args,
        'logger': lg.logger,
        'outdir': run_outdir
    }

    lrn_crv_trn_kwargs = {
        'framework': framework,
        'mltype': mltype,
        'model_name': model_name,
        'init_kwargs': init_kwargs,
        'fit_kwargs': fit_kwargs,
        'clr_keras_kwargs': clr_keras_kwargs,
        'n_jobs': n_jobs,
        'random_state': args['seed']
    }

    t0 = time()
    lc = LearningCurve(X=xdata, Y=ydata, **lrn_crv_init_kwargs)
    lrn_crv_scores = lc.trn_learning_curve(**lrn_crv_trn_kwargs)
    lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))

    # (Removed a large dead string block describing the sklearn
    # `learning_curve` approach; it could not log multiple metrics and was
    # superseded by LearningCurve above.)

    lg.kill_logger()
    del xdata, ydata
    print('Done.')
def make_split(xdata, meta, outdir, args):
    """Generate train/test and k-fold CV index files for a dataset.

    Writes ``tr_id.csv``/``te_id.csv`` (hold-out split) and, for several fold
    counts, ``{k}fold_tr_id.csv``/``{k}fold_vl_id.csv`` into ``outdir``.
    If ``outdir/xdata.parquet`` exists, xdata/meta are reloaded from disk and
    the passed-in arguments are shadowed.

    Args:
        xdata: feature dataframe (rows are samples).
        meta: metadata dataframe aligned with xdata (same row order).
        outdir: output directory (pathlib.Path); also used for the logger.
        args (dict): split settings (te_method, cv_method, te_size, vl_size,
            feature prefixes, n_jobs).
    """
    # Data splits
    te_method = args['te_method']
    cv_method = args['cv_method']
    te_size = split_size(args['te_size'])
    vl_size = split_size(args['vl_size'])

    # Features
    cell_fea = args['cell_fea']
    drug_fea = args['drug_fea']
    # fea_list = cell_fea + drug_fea

    # Other params
    n_jobs = args['n_jobs']

    # Hard split.
    # NOTE(review): grp_by_col is hard-coded to None, so the 'group' branches
    # below would raise (meta[None]) if te_method/cv_method == 'group'.
    # Confirm the intended group column before enabling group splits.
    grp_by_col = None
    # cv_method = 'simple'

    # TODO: this need to be improved
    mltype = 'reg'  # required for the splits (stratify in case of classification)

    # -----------------------------------------------
    # Outdir and Logger
    # -----------------------------------------------
    # Logger
    lg = Logger(outdir / 'splitter.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')

    # Dump args to file
    dump_dict(args, outpath=outdir / 'args.txt')

    # -----------------------------------------------
    # Load data and pre-proc
    # -----------------------------------------------
    # If pre-broken feature/meta files already exist in outdir, prefer them
    # over the passed-in frames.
    if (outdir / 'xdata.parquet').is_file():
        xdata = pd.read_parquet(outdir / 'xdata.parquet')
        meta = pd.read_parquet(outdir / 'meta.parquet')

    # -----------------------------------------------
    # Train-test split
    # -----------------------------------------------
    # The dataset itself is never shuffled; instead a permuted index vector is
    # split, and sub-indices are mapped back through it below.
    np.random.seed(SEED)
    idx_vec = np.random.permutation(xdata.shape[0])

    if te_method is not None:
        lg.logger.info('\nSplit train/test.')
        te_splitter = cv_splitter(cv_method=te_method,
                                  cv_folds=1,
                                  test_size=te_size,
                                  mltype=mltype,
                                  shuffle=False,
                                  random_state=SEED)

        # Group labels in permuted order (None for non-group splits).
        te_grp = meta[grp_by_col].values[
            idx_vec] if te_method == 'group' else None
        if is_string_dtype(te_grp):
            te_grp = LabelEncoder().fit_transform(te_grp)

        # Split train/test
        tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
        tr_id = idx_vec[tr_id]  # adjust the indices!
        te_id = idx_vec[te_id]  # adjust the indices!

        pd.Series(tr_id).to_csv(outdir / f'tr_id.csv', index=False, header=[0])
        pd.Series(te_id).to_csv(outdir / f'te_id.csv', index=False, header=[0])

        # Log the realized split fractions.
        lg.logger.info('Train: {:.1f}'.format(len(tr_id) / xdata.shape[0]))
        lg.logger.info('Test: {:.1f}'.format(len(te_id) / xdata.shape[0]))

        # Update the master idx vector for the CV splits
        idx_vec = tr_id

        # Plot dist of responses (TODO: this can be done to all response metrics)
        # plot_ytr_yvl_dist(ytr=tr_ydata.values, yvl=te_ydata.values,
        #                   title='tr and te', outpath=run_outdir/'tr_te_resp_dist.png')

        # Confirm that group splits are correct (no group leaks across tr/te)
        if te_method == 'group' and grp_by_col is not None:
            tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
            te_grp_unq = set(meta.loc[te_id, grp_by_col])
            lg.logger.info(
                f'\tTotal group ({grp_by_col}) intersections btw tr and te: {len(tr_grp_unq.intersection(te_grp_unq))}.'
            )
            lg.logger.info(
                f'\tA few intersections : {list(tr_grp_unq.intersection(te_grp_unq))[:3]}.'
            )

        # Update vl_size to effective vl_size (vl fraction of the remaining
        # train set, so it still corresponds to the requested global fraction)
        vl_size = vl_size * xdata.shape[0] / len(tr_id)

        del tr_id, te_id

    # -----------------------------------------------
    # Generate CV splits
    # -----------------------------------------------
    cv_folds_list = [1, 5, 7, 10, 15, 20]
    lg.logger.info(f'\nStart CV splits ...')

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\nCV folds: {cv_folds}')

        cv = cv_splitter(cv_method=cv_method,
                         cv_folds=cv_folds,
                         test_size=vl_size,
                         mltype=mltype,
                         shuffle=False,
                         random_state=SEED)

        cv_grp = meta[grp_by_col].values[
            idx_vec] if cv_method == 'group' else None
        if is_string_dtype(cv_grp):
            cv_grp = LabelEncoder().fit_transform(cv_grp)

        # fold index -> list of sample indices
        tr_folds = {}
        vl_folds = {}

        # Start CV iters
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec,
                                                       groups=cv_grp)):
            tr_id = idx_vec[tr_id]  # adjust the indices!
            vl_id = idx_vec[vl_id]  # adjust the indices!

            tr_folds[fold] = tr_id.tolist()
            vl_folds[fold] = vl_id.tolist()

            # Confirm that group splits are correct (no group leaks tr/vl)
            if cv_method == 'group' and grp_by_col is not None:
                tr_grp_unq = set(meta.loc[tr_id, grp_by_col])
                vl_grp_unq = set(meta.loc[vl_id, grp_by_col])
                lg.logger.info(
                    f'\tTotal group ({grp_by_col}) intersections btw tr and vl: {len(tr_grp_unq.intersection(vl_grp_unq))}.'
                )
                lg.logger.info(
                    f'\tUnique cell lines in tr: {len(tr_grp_unq)}.')
                lg.logger.info(
                    f'\tUnique cell lines in vl: {len(vl_grp_unq)}.')

        # Convet to df (one column per fold; folds may have unequal lengths,
        # so build from pd.Series to pad with NaN)
        # from_dict takes too long --> faster described here: stackoverflow.com/questions/19736080/
        # tr_folds = pd.DataFrame.from_dict(tr_folds, orient='index').T
        # vl_folds = pd.DataFrame.from_dict(vl_folds, orient='index').T
        tr_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in tr_folds.items()]))
        vl_folds = pd.DataFrame(
            dict([(k, pd.Series(v)) for k, v in vl_folds.items()]))

        # Dump
        tr_folds.to_csv(outdir / f'{cv_folds}fold_tr_id.csv', index=False)
        vl_folds.to_csv(outdir / f'{cv_folds}fold_vl_id.csv', index=False)

    lg.kill_logger()
def build_dataframe(args):
    """Build the tidy response + RNA-Seq + drug-descriptor dataframe.

    Identifies the top-N cancer types, filters the response data to the
    associated cell lines and drugs, merges in gene-expression (GE_*) and
    drug-descriptor features, saves the result in the requested format, and
    plots a per-cancer-type response histogram.

    Args:
        args: argparse.Namespace with (at least) top_n, src, target,
            response_type, labels, format, seed.
    """
    na_values = ['na', '-', '']  # (ap)

    # (ap) Create outdir and logger
    import os
    # outdir = Path('top' + str(args.top_n) + sffx + '_data')
    sffx = '' if args.src is None else '_'.join(args.src)
    # Only encode top_n in the dir name when it is an actual subset.
    if args.top_n < 200:
        outdir = Path('top' + str(args.top_n) + sffx)
    else:
        outdir = Path(sffx)
    os.makedirs(outdir, exist_ok=True)
    lg = Logger(outdir / 'logfile.log')
    lg.logger.info(f'File path: {filepath}')
    lg.logger.info(f'\n{pformat(args)}')
    dump_dict(vars(args), outpath=outdir / 'args.txt')

    # Identify Top N cancer types
    df_response = pd.read_csv(response_path,
                              sep='\t',
                              engine='c',
                              low_memory=False,
                              na_values=na_values,
                              warn_bad_lines=True)
    lg.logger.info(
        df_response.groupby('SOURCE').agg({
            'CELL': 'nunique',
            'DRUG': 'nunique'
        }).reset_index())  # (ap)

    # (ap) Extract specific data sources
    df_response['SOURCE'] = df_response['SOURCE'].apply(
        lambda x: x.lower())  # (ap)
    if args.src is not None:
        df_response = df_response[df_response['SOURCE'].isin(
            args.src)].reset_index(drop=True)

    # Unique (CELL, DRUG) pairs present in the response data.
    df_uniq_cl_drugs = df_response[['CELL', 'DRUG'
                                    ]].drop_duplicates().reset_index(drop=True)

    df_cl_cancer_map = pd.read_csv(cell_cancer_types_map_path,
                                   sep='\t',
                                   header=None,
                                   names=['CELL', 'CANCER_TYPE'])
    # NOTE(review): set_index is not in-place and the result is discarded —
    # this line has no effect; confirm whether it can be removed.
    df_cl_cancer_map.set_index('CELL')
    df_cl_cancer_drug = df_cl_cancer_map.merge(df_uniq_cl_drugs,
                                               on='CELL',
                                               how='left',
                                               sort='true')

    # Per-pair key used only to count responses per cancer type below.
    df_cl_cancer_drug['CELL_DRUG'] = df_cl_cancer_drug.CELL.astype(
        str) + '.' + df_cl_cancer_drug.DRUG.astype(str)
    top_n = df_cl_cancer_drug.groupby(['CANCER_TYPE']).count().sort_values(
        'CELL_DRUG', ascending=False).head(args.top_n)
    top_n_cancer_types = top_n.index.to_list()
    lg.logger.info("Identified {} cancer types: {}".format(
        args.top_n, top_n_cancer_types))

    # Indentify cell lines associated with the target cancer types
    df_cl = df_cl_cancer_drug[df_cl_cancer_drug['CANCER_TYPE'].isin(
        top_n_cancer_types)][['CELL']].drop_duplicates().reset_index(drop=True)

    # Identify drugs associated with the target cancer type & filtered by drug_list
    df_drugs = df_cl_cancer_drug[df_cl_cancer_drug['CANCER_TYPE'].isin(
        top_n_cancer_types)][['DRUG']].drop_duplicates().reset_index(drop=True)
    drug_list = pd.read_csv(drug_list_path)['DRUG'].to_list()
    df_drugs = df_drugs[df_drugs['DRUG'].isin(drug_list)].reset_index(
        drop=True)

    # Filter response by cell lines (4882) and drugs (1779)
    cl_filter = df_cl.CELL.to_list()
    dr_filter = df_drugs.DRUG.to_list()
    target = args.target
    # df_response = df_response[df_response.CELL.isin(cl_filter) & df_response.DRUG.isin(dr_filter)][['CELL', 'DRUG', target]].drop_duplicates().reset_index(drop=True) # (ap) commented
    idx = df_response.CELL.isin(cl_filter) & df_response.DRUG.isin(dr_filter)
    df_response = df_response[idx].drop_duplicates().reset_index(
        drop=True)  # (ap) keep all targets

    # (ap) Drop bad points (these identified by Yitan)
    # TODO: confirm this with Yitan!
    """
    lg.logger.info('\nDrop bad samples ...')
    id_drop = (df_response['AUC'] == 0) & (df_response['EC50se'] == 0) & (df_response['R2fit'] == 0)
    df_response = df_response.loc[~id_drop,:]
    lg.logger.info(f'Dropped {sum(id_drop)} rsp data points.')
    lg.logger.info(f'df_response.shape {df_response.shape}')
    """

    # (ap) Drop points with bad fit
    # TODO: check this (may require a more rigorous analysis)
    lg.logger.info('\nDrop samples with bad fit (R2fit) ...')
    lg.logger.info(f'df_response.shape {df_response.shape}')
    id_drop = df_response['R2fit'] <= 0
    df_response = df_response.loc[~id_drop, :]
    lg.logger.info(f'Dropped {sum(id_drop)} rsp data points.')
    lg.logger.info(f'df_response.shape {df_response.shape}')

    # Binarize the target (threshold 0.5) and rename it to 'Response' when a
    # classification dataset was requested.
    if args.response_type == 'bin':
        df_response[target] = df_response[target].apply(lambda x: 0
                                                        if x < 0.5 else 1)
        df_response.rename(columns={target: 'Response'}, inplace=True)

    # ----------------
    # Load RNA-Seq
    # ----------------
    # Join response data with Drug descriptor & RNASeq
    df_rnaseq = pd.read_csv(get_cell_feature_path(args),
                            sep='\t',
                            low_memory=False,
                            na_values=na_values,
                            warn_bad_lines=True)
    df_rnaseq = df_rnaseq[df_rnaseq['Sample'].isin(cl_filter)].reset_index(
        drop=True)
    df_rnaseq.rename(columns={'Sample': 'CELL'}, inplace=True)
    # Prefix every feature column (all but the first, 'CELL') with 'GE_'.
    df_rnaseq.columns = [
        'GE_' + x if i > 0 else x
        for i, x in enumerate(df_rnaseq.columns.to_list())
    ]
    df_rnaseq = df_rnaseq.set_index(['CELL'])

    # ----------------
    # Load descriptors
    # ----------------
    df_descriptor = pd.read_csv(get_drug_descriptor_path(args),
                                sep='\t',
                                low_memory=False,
                                na_values=na_values,
                                warn_bad_lines=True)
    # df_descriptor = df_descriptor[df_descriptor.DRUG.isin(dr_filter)].set_index(['DRUG']).fillna(0) # (ap) commented --> bad imputation!
    df_descriptor = df_descriptor[df_descriptor.DRUG.isin(
        dr_filter)].set_index(['DRUG'])  # (ap) added --> drop data imputation!

    # (ap) Some features have too many NA values (drop these)
    lg.logger.info('\nDrop cols with too many NA values ...')
    lg.logger.info(f'df_descriptor.shape {df_descriptor.shape}')
    df_descriptor = dropna(df=df_descriptor, axis=1, th=0.5)
    lg.logger.info(f'df_descriptor.shape {df_descriptor.shape}')

    # (ap) Impute missing values
    # There are descriptors for which there is a single unique value excluding NA (drop these)
    lg.logger.info(
        '\nDrop cols that have a single unique value (excluding NAs) ...')
    lg.logger.info(f'df_descriptor.shape {df_descriptor.shape}')
    col_idx = df_descriptor.nunique(dropna=True).values == 1
    df_descriptor = df_descriptor.iloc[:, ~col_idx]
    lg.logger.info(f'df_descriptor.shape {df_descriptor.shape}')

    # (ap) Impute missing values (drug descriptors)
    lg.logger.info('\nImpute NA values ...')
    df_descriptor = impute_values(df_descriptor, logger=None)  # (ap)

    # There are still lots of descriptors which have only a few unique values.
    # We can categorize those values. e.g.: 564 descriptors have only 2 unique vals,
    # and 154 descriptors have only 3 unique vals, etc.
    # todo: use utility code from p1h_alex/utils/data_preproc.py that transform those
    # features into categorical and also applies an appropriate imputation.
    # df_descriptor.nunique(dropna=True).value_counts()[:10]
    # df_descriptor.nunique(dropna=True).value_counts().sort_index()[:10]

    # Merge response + RNA-Seq, then + descriptors.
    df = df_response.merge(df_rnaseq, on='CELL', how='left', sort='true')
    df.set_index(
        ['DRUG'])  # TODO: this doesn't take effect unless performed 'inplace'
    df_final = df.merge(df_descriptor, on='DRUG', how='left', sort='true')

    # Deduplicate rows; when labels are requested, drop CELL/DRUG for the
    # dedup and re-attach them via an index -> label mapping.
    if args.labels:
        df_cell_map = df_final['CELL'].to_dict()
        df_drug_map = df_final['DRUG'].to_dict()
        df_final.drop(columns=['CELL', 'DRUG'], inplace=True)
        df_final.drop_duplicates(inplace=True)
        df_final.insert(0, 'DRUG', df_final.index.map(df_drug_map))
        df_final.insert(0, 'CELL', df_final.index.map(df_cell_map))
        df_final.reset_index(drop=True, inplace=True)
    else:
        df_final.drop(columns=['CELL', 'DRUG'], inplace=True)
        df_final.drop_duplicates(inplace=True)

    lg.logger.info("\nDataframe is built with total {} rows.".format(
        len(df_final)))

    # (ap) Shuffle
    # lg.logger.info("Shuffle final df.")
    # df_final = df_final.sample(frac=1.0, random_state=args.seed).reset_index(drop=True)

    lg.logger.info(
        df_final.groupby('SOURCE').agg({
            'CELL': 'nunique',
            'DRUG': 'nunique'
        }).reset_index())  # (ap)

    save_filename = build_filename(args)
    # print("Saving to {}".format(save_filename)) # (ap) remove
    save_filename = outdir / save_filename  # (ap) added

    # Persist in the requested on-disk format.
    if args.format == 'feather':
        df_final.to_feather(save_filename)
    elif args.format == 'csv':
        df_final.to_csv(str(save_filename) + '.csv',
                        float_format='%g',
                        index=False)
    elif args.format == 'tsv':
        df_final.to_csv(save_filename,
                        sep='\t',
                        float_format='%g',
                        index=False)
    elif args.format == 'parquet':
        df_final.to_parquet(str(save_filename) + '.parquet', index=False)
    elif args.format == 'hdf5':
        # hdf5 also dumps the cell-line and drug lists as side files.
        df_cl.to_csv(build_file_basename(args) + '_cellline.txt',
                     header=False,
                     index=False)
        df_drugs.to_csv(build_file_basename(args) + '_drug.txt',
                        header=False,
                        index=False)
        df_final.to_hdf(save_filename,
                        key='df',
                        mode='w',
                        complib='blosc:snappy',
                        complevel=9)

    # --------------------------------------------------
    # (ap) tissue type histogram
    # --------------------------------------------------
    def plot_tissue_hist(top_n):
        # Plot a horizontal bar chart of AUC response counts per cancer type
        # and save it to outdir; returns the counts dataframe.
        dd = df_cl_cancer_drug[['CELL', 'DRUG', 'CANCER_TYPE'
                                ]].merge(df_final[['CELL', 'DRUG', 'AUC']],
                                         on=['CELL', 'DRUG'],
                                         how='inner')
        dd = pd.DataFrame(dd['CANCER_TYPE'].value_counts())
        dd = dd.reset_index().rename(columns={
            'index': 'ctype',
            'CANCER_TYPE': 'count'
        })
        # Human-readable labels: underscores -> spaces.
        dd['ctype'] = dd['ctype'].map(lambda x: ' '.join(x.split('_')))

        x = dd['ctype']
        y = dd['count']
        ax = dd.plot.barh(x='ctype',
                          y='count',
                          xlim=[0, y.max() * 1.15],
                          legend=False,
                          figsize=(9, 7),
                          fontsize=12)
        ax.set_ylabel(None, fontsize=14)
        ax.set_xlabel('Total responses', fontsize=14)
        ax.set_title(
            'Number of AUC responses per cancer type ({})'.format(top_n),
            fontsize=14)
        ax.invert_yaxis()
        # Annotate each bar with its count in thousands ('k').
        for p in ax.patches:
            val = int(p.get_width() / 1000)
            x = p.get_x() + p.get_width() + 1000
            y = p.get_y() + p.get_height() / 2
            ax.annotate(str(val) + 'k', (x, y), fontsize=10)

        # OR
        # fig, ax = plt.subplots(figsize=(7, 5))
        # plt.barh(dd['CANCER_TYPE'], dd['CELL_DRUG'], color='b', align='center', alpha=0.7)
        # plt.xlabel('Total responses', fontsize=14);

        plt.savefig(outdir / 'Top{}_histogram.png'.format(top_n),
                    dpi=100,
                    bbox_inches='tight')
        return dd

    dd = plot_tissue_hist(top_n=args.top_n)
    # --------------------------------------------------

    # --------------------------------------------------
    # (ap) break data
    # --------------------------------------------------
    # Split features and traget
    # print('\nSplit features and target.')
    # meta = df_final[['AUC', 'CELL', 'DRUG']]
    # xdata = df_final.drop(columns=['AUC', 'CELL', 'DRUG'])
    # xdata.to_parquet( outdir/'xdata.parquet' )
    # meta.to_parquet( outdir/'meta.parquet' )
    # print('Totoal DD: {}'.format( len([c for c in xdata.columns if 'DD' in c]) ))
    # print('Totoal GE: {}'.format( len([c for c in xdata.columns if 'GE' in c]) ))
    # --------------------------------------------------

    # --------------------------------------------------
    # (ap) generate train/val/test splits
    # --------------------------------------------------
    # from data_split import make_split
    # print('\nSplit train/val/test.')
    # args['cell_fea'] = 'GE'
    # args['drug_fea'] = 'DD'
    # args['te_method'] = 'simple'
    # args['cv_method'] = 'simple'
    # args['te_size'] = 0.1
    # args['vl_size'] = 0.1
    # args['n_jobs'] = 4
    # make_split(xdata=xdata, meta=meta, outdir=outdir, args=args)
    # --------------------------------------------------

    lg.kill_logger()
    print('Done.')
def run(args):
    """Generate hold-out (train/val/test) and k-fold CV splits for a master dataset.

    Reads a single parquet file from args['dirpath'], separates feature columns
    from metadata, and writes to a new output dir:
      * xdata.parquet / meta.parquet (features / metadata)
      * 1fold_{tr,vl,te}_id.csv (hold-out split indices)
      * {k}fold_{tr,vl,te}_id.csv for several k (CV split indices)
      * AUC histograms and a log file

    Args:
        args (dict): requires keys 'dirpath', 'te_size', 'cell_fea', 'drug_fea',
            'split_on' and 'seed'. 'split_on' (e.g. CELL or DRUG) triggers hard
            (group-wise) splitting; None gives simple random splits.

    Raises:
        FileNotFoundError: if no parquet file is found in args['dirpath'].
    """
    dirpath = verify_dirpath(args['dirpath'])
    te_size = split_size(args['te_size'])
    fea_list = args['cell_fea'] + args['drug_fea']

    # Hard split: group-wise splitting prevents the same group (cell or drug)
    # from leaking across train/val/test.
    split_on = None if args['split_on'] is None else args['split_on'].upper()
    cv_method = 'simple' if split_on is None else 'group'
    te_method = cv_method  # TODO: this needs to be improved

    mltype = 'reg'  # required for the splits (stratify in case of classification)

    # -----------------------------------------------
    #     Create (outdir and) logger
    # -----------------------------------------------
    outdir = create_outdir(dirpath, args)
    args['outdir'] = str(outdir)
    lg = Logger(outdir / 'data_splitter_logfile.log')
    lg.logger.info(f'File path: {filepath}')  # NOTE(review): assumes module-level `filepath` global — confirm
    lg.logger.info(f'\n{pformat(args)}')
    dump_dict(args, outpath=outdir / 'data_splitter_args.txt')  # dump args

    # -----------------------------------------------
    #     Load and break data
    # -----------------------------------------------
    lg.logger.info('\nLoad master dataset.')
    files = list(dirpath.glob('./*.parquet'))
    if len(files) == 0:
        # Fail fast; the original code left `data` undefined and crashed later
        # with a NameError when no parquet file was present.
        raise FileNotFoundError(f'No parquet file found in {dirpath}.')
    data = pd.read_parquet(files[0])  # TODO: assumes that there is only one data file
    lg.logger.info('data.shape {}'.format(data.shape))

    # Split features and meta, and dump each to file
    lg.logger.info('\nSplit features and meta.')
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep='_')
    meta = data.drop(columns=xdata.columns)
    xdata.to_parquet(outdir / 'xdata.parquet')
    meta.to_parquet(outdir / 'meta.parquet')

    lg.logger.info('Total DD: {}'.format(len([c for c in xdata.columns if 'DD_' in c])))
    lg.logger.info('Total GE: {}'.format(len([c for c in xdata.columns if 'GE_' in c])))
    lg.logger.info('Unique cells: {}'.format(meta['CELL'].nunique()))
    lg.logger.info('Unique drugs: {}'.format(meta['DRUG'].nunique()))

    plot_hist(meta['AUC'], var_name='AUC', fit=None, bins=100,
              path=outdir / 'AUC_hist_all.png')

    # -----------------------------------------------
    #     Generate Hold-Out split (train/val/test)
    # -----------------------------------------------
    # First, split the data into train and test. The remaining train set is
    # further split into train and validation.
    lg.logger.info('\n{}'.format('-' * 50))
    lg.logger.info('Split into hold-out train/val/test')
    lg.logger.info('{}'.format('-' * 50))

    # We don't shuffle the original dataset; instead we permute an index vector
    # and map every split's positions back through it.
    np.random.seed(args['seed'])
    idx_vec = np.random.permutation(data.shape[0])

    # Create splitter that splits the full dataset into tr and te.
    # k-fold with k = int(1/te_size) approximates a test fraction of te_size.
    te_n_folds = int(1 / te_size)
    te_splitter = cv_splitter(cv_method=te_method, cv_folds=te_n_folds, test_size=None,
                              mltype=mltype, shuffle=False, random_state=args['seed'])

    te_grp = None if split_on is None else meta[split_on].values[idx_vec]
    if is_string_dtype(te_grp):
        te_grp = LabelEncoder().fit_transform(te_grp)

    # Split into tr and te
    tr_id, te_id = next(te_splitter.split(idx_vec, groups=te_grp))
    tr_id = idx_vec[tr_id]  # adjust the indices! we'll split the remaining tr into tr and vl
    te_id = idx_vec[te_id]  # adjust the indices!

    # Index vector that excludes the test indices
    idx_vec_ = tr_id
    del tr_id

    # Define vl_size w.r.t. the reduced number of available (non-test) samples
    vl_size = te_size / (1 - te_size)
    cv_folds = int(1 / vl_size)

    # Create splitter that splits tr into tr and vl
    cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=None,
                     mltype=mltype, shuffle=False, random_state=args['seed'])

    cv_grp = None if split_on is None else meta[split_on].values[idx_vec_]
    if is_string_dtype(cv_grp):
        cv_grp = LabelEncoder().fit_transform(cv_grp)

    # Split tr into tr and vl
    tr_id, vl_id = next(cv.split(idx_vec_, groups=cv_grp))
    tr_id = idx_vec_[tr_id]  # adjust the indices!
    vl_id = idx_vec_[vl_id]  # adjust the indices!

    # Dump tr, vl, te indices
    np.savetxt(outdir / '1fold_tr_id.csv', tr_id.reshape(-1, 1), fmt='%d', delimiter='', newline='\n')
    np.savetxt(outdir / '1fold_vl_id.csv', vl_id.reshape(-1, 1), fmt='%d', delimiter='', newline='\n')
    np.savetxt(outdir / '1fold_te_id.csv', te_id.reshape(-1, 1), fmt='%d', delimiter='', newline='\n')

    lg.logger.info('Train samples {} ({:.2f}%)'.format(len(tr_id), 100 * len(tr_id) / xdata.shape[0]))
    lg.logger.info('Val samples {} ({:.2f}%)'.format(len(vl_id), 100 * len(vl_id) / xdata.shape[0]))
    lg.logger.info('Test samples {} ({:.2f}%)'.format(len(te_id), 100 * len(te_id) / xdata.shape[0]))

    # Confirm that group splits are correct (no intersection)
    grp_col = 'CELL' if split_on is None else split_on
    print_intersection_on_var(meta, tr_id=tr_id, vl_id=vl_id, te_id=te_id,
                              grp_col=grp_col, logger=lg.logger)

    plot_hist(meta.loc[tr_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir / 'AUC_hist_train.png')
    plot_hist(meta.loc[vl_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir / 'AUC_hist_val.png')
    plot_hist(meta.loc[te_id, 'AUC'], var_name='AUC', fit=None, bins=100, path=outdir / 'AUC_hist_test.png')

    plot_ytr_yvl_dist(ytr=meta.loc[tr_id, 'AUC'], yvl=meta.loc[vl_id, 'AUC'],
                      title='ytr_yvl_dist', outpath=outdir / 'ytr_yvl_dist.png')

    # -----------------------------------------------
    #     Generate CV splits
    # -----------------------------------------------
    # K-fold CV split with multiple values of k. Each of the k folds yields
    # train and val samples; the train samples are then further divided into a
    # smaller train subset and a test subset of the same relative size as val.
    lg.logger.info('\n{}'.format('-' * 50))
    lg.logger.info("Split into multiple sets k-fold splits (multiple k's)")
    lg.logger.info('{}'.format('-' * 50))
    cv_folds_list = [5, 7, 10, 15, 20]

    for cv_folds in cv_folds_list:
        lg.logger.info(f'\n----- {cv_folds}-fold splits -----')

        # Create CV splitter
        cv = cv_splitter(cv_method=cv_method, cv_folds=cv_folds, test_size=None,
                         mltype=mltype, shuffle=False, random_state=args['seed'])

        cv_grp = None if split_on is None else meta[split_on].values[idx_vec]
        if is_string_dtype(cv_grp):
            cv_grp = LabelEncoder().fit_transform(cv_grp)

        tr_folds, vl_folds, te_folds = {}, {}, {}

        # Start CV iters (this loop generates the tr and vl splits)
        for fold, (tr_id, vl_id) in enumerate(cv.split(idx_vec, groups=cv_grp)):
            lg.logger.info(f'\nFold {fold+1}')
            tr_id = idx_vec[tr_id]  # adjust the indices!
            vl_id = idx_vec[vl_id]  # adjust the indices!

            # Store vl ids
            vl_folds[fold] = vl_id.tolist()

            # Re-express the fold's val fraction w.r.t. the remaining (train)
            # samples, so te ends up the same size as vl.
            te_size_ = len(vl_id) / len(idx_vec) / (1 - len(vl_id) / len(idx_vec))
            te_folds_split = int(1 / te_size_)

            # Create splitter that splits tr into tr and te
            te_splitter = cv_splitter(cv_method=te_method, cv_folds=te_folds_split, test_size=None,
                                      mltype=mltype, shuffle=False, random_state=args['seed'])

            # Update the index array
            idx_vec_ = tr_id
            del tr_id

            te_grp = None if split_on is None else meta[split_on].values[idx_vec_]
            if is_string_dtype(te_grp):
                te_grp = LabelEncoder().fit_transform(te_grp)

            # Split tr into tr and te
            tr_id, te_id = next(te_splitter.split(idx_vec_, groups=te_grp))
            tr_id = idx_vec_[tr_id]  # adjust the indices!
            te_id = idx_vec_[te_id]  # adjust the indices!

            # Store tr and te ids
            tr_folds[fold] = tr_id.tolist()
            te_folds[fold] = te_id.tolist()

            lg.logger.info('Train samples {} ({:.2f}%)'.format(len(tr_id), 100 * len(tr_id) / xdata.shape[0]))
            lg.logger.info('Val samples {} ({:.2f}%)'.format(len(vl_id), 100 * len(vl_id) / xdata.shape[0]))
            lg.logger.info('Test samples {} ({:.2f}%)'.format(len(te_id), 100 * len(te_id) / xdata.shape[0]))

            # Confirm that group splits are correct (no intersection)
            grp_col = 'CELL' if split_on is None else split_on
            print_intersection_on_var(meta, tr_id=tr_id, vl_id=vl_id, te_id=te_id,
                                      grp_col=grp_col, logger=lg.logger)

        # Convert the folds dicts (columns of unequal length) to dataframes;
        # faster than DataFrame.from_dict (stackoverflow.com/questions/19736080/)
        tr_folds = pd.DataFrame({k: pd.Series(v) for k, v in tr_folds.items()})
        vl_folds = pd.DataFrame({k: pd.Series(v) for k, v in vl_folds.items()})
        te_folds = pd.DataFrame({k: pd.Series(v) for k, v in te_folds.items()})

        # Dump
        tr_folds.to_csv(outdir / f'{cv_folds}fold_tr_id.csv', index=False)
        vl_folds.to_csv(outdir / f'{cv_folds}fold_vl_id.csv', index=False)
        te_folds.to_csv(outdir / f'{cv_folds}fold_te_id.csv', index=False)

    lg.kill_logger()
    print('Done.')
def run(args):
    """Run learning-curve training for a pre-split dataset.

    Loads features (xdata), metadata and pre-computed tr/vl/te split indices
    from args['dirpath'], scales the features, builds the model configuration
    for the requested framework, and runs LearningCurve.trn_learning_curve.

    Args:
        args (dict): requires keys 'dirpath', 'model_name', 'target_name',
            'cv_folds', 'scaler', 'seed', 'n_jobs', CLR settings ('clr_mode',
            'clr_base_lr', 'clr_max_lr', 'clr_gamma') and the per-model and
            learning-curve hyperparameters referenced below.

    Raises:
        ValueError: if args['model_name'] does not contain 'reg' or 'cls',
            or names an unknown model.
    """
    dirpath = verify_dirpath(args['dirpath'])
    clr_keras_kwargs = {'mode': args['clr_mode'], 'base_lr': args['clr_base_lr'],
                        'max_lr': args['clr_max_lr'], 'gamma': args['clr_gamma']}

    # ML type ('reg' or 'cls')
    if 'reg' in args['model_name']:
        mltype = 'reg'
    elif 'cls' in args['model_name']:
        mltype = 'cls'
    else:
        raise ValueError("model_name must contain 'reg' or 'cls'.")

    # Find out which metadata field was used for hard split (cell, drug, or none)
    # (renamed locals: the original bound the path, the loop var, and the file
    # handle all to `f`, shadowing each other)
    args_path = [f for f in dirpath.glob('*args.txt')][0]
    with open(args_path) as fh:
        lines = fh.readlines()
    split_on = [l.split(':')[-1].strip() for l in lines if 'split_on' in l][0]
    args['split_on'] = split_on.lower()

    # -----------------------------------------------
    #     Load data and pre-proc
    # -----------------------------------------------
    xdata = read_data_file(dirpath / 'xdata.parquet', 'parquet')
    meta = read_data_file(dirpath / 'meta.parquet', 'parquet')
    ydata = meta[[args['target_name']]]

    tr_id = read_data_file(dirpath / '{}fold_tr_id.csv'.format(args['cv_folds']))
    vl_id = read_data_file(dirpath / '{}fold_vl_id.csv'.format(args['cv_folds']))
    te_id = read_data_file(dirpath / '{}fold_te_id.csv'.format(args['cv_folds']))

    # Data source name, e.g. '<src>.some_suffix/parent' -> '<src>'
    src = str(dirpath.parent).split('/')[-1].split('.')[0]

    # -----------------------------------------------
    #     Create outdir and logger
    # -----------------------------------------------
    outdir = create_outdir(OUTDIR, args, src)
    args['outdir'] = str(outdir)
    lg = Logger(outdir / 'logfile.log')
    lg.logger.info(f'File path: {filepath}')  # NOTE(review): assumes module-level `filepath` global — confirm
    lg.logger.info(f'\n{pformat(args)}')
    dump_dict(args, outpath=outdir / 'args.txt')  # dump args

    # -----------------------------------------------
    #     Data preprocessing
    # -----------------------------------------------
    xdata = scale_fea(xdata=xdata, scaler_name=args['scaler'])  # scale features

    # -----------------------------------------------
    #     ML model configs
    # -----------------------------------------------
    nn_model_names = ('nn_reg0', 'nn_reg1', 'nn_reg_layer_less', 'nn_reg_layer_more',
                      'nn_reg_neuron_less', 'nn_reg_neuron_more')
    if args['model_name'] == 'lgb_reg':
        framework = 'lightgbm'
        init_kwargs = {'n_estimators': args['gbm_trees'], 'max_depth': args['gbm_max_depth'],
                       'learning_rate': args['gbm_lr'], 'num_leaves': args['gbm_leaves'],
                       'n_jobs': args['n_jobs'], 'random_state': args['seed']}
        fit_kwargs = {'verbose': False}
    elif args['model_name'] == 'rf_reg':
        framework = 'sklearn'
        init_kwargs = {'n_estimators': args['rf_trees'], 'n_jobs': args['n_jobs'],
                       'random_state': args['seed']}
        fit_kwargs = {}
    elif args['model_name'] in nn_model_names:
        # BUG FIX: the original condition `== 'nn_reg0' or 'nn_reg1' or ...`
        # was always truthy ('nn_reg1' is a non-empty string), so ANY model
        # name fell into the keras branch. Membership test restores intent.
        framework = 'keras'
        init_kwargs = {'input_dim': xdata.shape[1], 'dr_rate': args['dr_rate'],
                       'opt_name': args['opt'], 'lr': args['lr'],
                       'batchnorm': args['batchnorm'], 'logger': lg.logger}
        fit_kwargs = {'batch_size': args['batch_size'], 'epochs': args['epochs'],
                      'verbose': 1}  # 'validation_split': 0.1
    else:
        raise ValueError("Unknown model_name: {}".format(args['model_name']))

    # -----------------------------------------------
    #     Learning curve
    # -----------------------------------------------
    lg.logger.info('\n\n{}'.format('-' * 50))
    lg.logger.info(f'Learning curves {src} ...')
    lg.logger.info('-' * 50)

    lrn_crv_init_kwargs = {'cv': None, 'cv_lists': (tr_id, vl_id, te_id),
                           'cv_folds_arr': args['cv_folds_arr'],
                           'shard_step_scale': args['shard_step_scale'],
                           'n_shards': args['n_shards'], 'min_shard': args['min_shard'],
                           'max_shard': args['max_shard'], 'shards_arr': args['shards_arr'],
                           'args': args, 'logger': lg.logger, 'outdir': outdir}

    lrn_crv_trn_kwargs = {'framework': framework, 'mltype': mltype,
                          'model_name': args['model_name'], 'init_kwargs': init_kwargs,
                          'fit_kwargs': fit_kwargs, 'clr_keras_kwargs': clr_keras_kwargs,
                          'n_jobs': args['n_jobs'], 'random_state': args['seed']}

    t0 = time()
    lc = LearningCurve(X=xdata, Y=ydata, meta=meta, **lrn_crv_init_kwargs)
    lrn_crv_scores = lc.trn_learning_curve(**lrn_crv_trn_kwargs)
    lg.logger.info('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))

    lg.kill_logger()
    del xdata, ydata
    print('Done.')