def extract_and_save_fea(df, fea, to_csv=False):
    """Keep one feature type (plus all non-feature columns) and save it.

    Columns belonging to every feature prefix other than ``fea`` are dropped
    from ``df``; the remaining frame is written as a parquet file and,
    optionally, as a CSV file next to it.

    NOTE(review): ``fea_list``, ``fea_sep``, ``outdir`` and ``fname`` are not
    parameters of this function; they must be resolvable from an enclosing or
    module scope when it is called.

    Args:
        df: dataframe holding metadata columns and feature columns.
        fea: the feature prefix to keep.
        to_csv: when true, also write a ``.csv`` copy (without the index).

    Returns:
        The dataframe with the columns of the other feature types removed.
    """
    # Every prefix except the requested one is scheduled for removal.
    other_prefixes = [prfx for prfx in fea_list if prfx != fea]
    drop_cols = extract_subset_fea_col_names(
        df, fea_list=other_prefixes, fea_sep=fea_sep)
    subset = df.drop(columns=drop_cols)

    # Common path stem shared by the parquet and csv outputs.
    stem = str(outdir / (fname + f'.{fea}'))
    subset.to_parquet(stem + '.parquet')
    if to_csv:
        subset.to_csv(stem + '.csv', index=False)
    return subset
def merge_dock_and_fea(dock, fea_df, fea_prfx, fea_sep, merger='TITLE',
                       fea_name=None, baseline=False):
    """Inner-join docking data with one feature table and save the result.

    The merged frame is reduced to a fixed set of metadata columns (those
    that are actually present) followed by the feature columns carrying the
    ``fea_prfx`` prefix, then written to a parquet file. Nothing is returned.

    NOTE(review): ``print_fn``, ``trg_outdir``, ``trg_name``, ``res`` and
    ``trn_baseline`` are not parameters of this function; they must be
    resolvable from an enclosing or module scope when it is called.

    Args:
        dock: dataframe with the docking data; must contain ``merger``.
        fea_df: dataframe with the features; must contain ``merger``.
        fea_prfx: prefix identifying the feature columns to keep.
        fea_sep: separator between the feature prefix and the feature name.
        merger: name of the column to join on.
        fea_name: label used in the log message and in the output file name.
        baseline: when true, run ``trn_baseline`` on the merged frame and
            store its 'r2' and 'median_absolute_error' scores in ``res``.
    """
    merged = pd.merge(dock, fea_df, how='inner', on=merger)
    merged = merged.reset_index(drop=True)
    del fea_df  # drop the local reference as soon as the merge is done

    # Column order: metadata first, then the selected features.
    fea_cols = extract_subset_fea_col_names(
        merged, fea_list=[fea_prfx], fea_sep=fea_sep)
    meta_cols = ['Inchi-key', 'SMILES', 'TITLE', 'CAT', 'reg', 'cls']
    ordered = meta_cols + fea_cols
    # Not every metadata column is guaranteed to exist, so filter by presence.
    present = [col for col in ordered if col in merged.columns]
    merged = merged[present]
    print_fn('{}: {}'.format(fea_name, merged.shape))

    # Persist the merged frame.
    target_path = trg_outdir / f'ml.{trg_name}.{fea_name}'
    merged.to_parquet(str(target_path) + '.parquet')

    # Optional baseline model scores.
    if baseline:
        scores = trn_baseline(merged, fea_list=[fea_prfx], fea_sep=fea_sep)
        res[f'{fea_prfx}_r2'] = scores['r2']
        res[f'{fea_prfx}_mae'] = scores['median_absolute_error']
def gen_ml_df(dd, trg_name, meta_cols=['TITLE', 'SMILES'], fea_list=['dsc'],
              score_name='reg', q_cls=0.025, bin_th=2.0, print_fn=print,
              outdir=Path('out'), outfigs=Path('outfigs')):
    """Generate and save the ML data for the target column ``trg_name``.

    Steps performed:
      1. Select the target, metadata and feature columns and drop rows whose
         target is NaN.
      2. Rename the target to ``score_name``, clip it at 0 from above and
         take the absolute value, so all scores are >= 0.
      3. Add a 'binner' column (score >= ``bin_th``) and a 'cls' column
         (score beyond the quantile threshold).
      4. Save a histogram of the scores with the quantile threshold marked.
      5. Save one dataframe per feature type (parquet; csv as well for
         non-descriptor features).
      6. If descriptors are requested, fit and save a StandardScaler on them
         and save the descriptor dataframe to csv with the 'dsc' prefix
         stripped from the column names (the raw, unscaled values are saved).
      7. Best effort: train a LightGBM regressor and record its test scores.

    Files written (``fname`` is ``'ml.' + trg_name``):
      outfigs/dock.score.bin.<fname>.png
      outdir/<fname>.<fea>.parquet       for every fea in fea_list
      outdir/<fname>.<fea>.csv           for every fea not containing 'dsc'
      outdir/<fname>.dsc.scaler.pkl      if a 'dsc' feature is requested
      outdir/<fname>.dsc.csv             if a 'dsc' feature is requested

    Args:
        dd : dataframe with (molecules x targets) where the first col is TITLE
        trg_name : a column in dd representing the target
        meta_cols : metadata columns to include in the dataframe; names that
            are missing from dd are ignored
        fea_list : feature prefixes to extract from dd
        score_name : rename the trg_name with score_name
        q_cls : quantile value to compute along the docking scores to
            generate the 'cls' col
        bin_th : threshold value of docking score to generate the 'binner' col
        print_fn : callable used for log messages
        outdir : pathlib.Path of the directory for data files
        outfigs : pathlib.Path of the directory for figures

    Returns:
        res : dict with the keys 'target', 'min', 'max', 'cls_th' and, when
            the baseline model could be trained, 'r2' and 'mae'.
    """
    # NOTE: the list defaults in the signature are never mutated below;
    # meta_cols is only rebound to a new list.
    print_fn(f'Processing {trg_name} ...')
    res = {'target': trg_name}

    # Keep the requested metadata columns that exist in dd. The caller's
    # order is preserved (a set intersection would make the column order
    # vary between runs because of string hash randomization).
    available = set(dd.columns.tolist())
    meta_cols = [c for c in dict.fromkeys(meta_cols) if c in available]

    fea_sep = '.'
    fea_cols = extract_subset_fea_col_names(dd, fea_list=fea_list, fea_sep=fea_sep)
    cols = [trg_name] + meta_cols + fea_cols
    dd_trg = dd[cols]

    # Drop NaN scores
    dd_trg = dd_trg[~dd_trg[trg_name].isna()].reset_index(drop=True)

    # Rename the scores col
    dd_trg = dd_trg.rename(columns={trg_name: score_name})

    # File name
    fname = 'ml.' + trg_name

    # Transform scores to positive: values above 0 become 0, the rest are
    # replaced by their magnitude.
    dd_trg[score_name] = dd_trg[score_name].clip(upper=0).abs()
    res['min'], res['max'] = dd_trg[score_name].min(), dd_trg[score_name].max()
    bins = 50

    # -----------------------------------------
    # Create binner
    # -----------------------------------------
    binner = (dd_trg[score_name] >= bin_th).astype(int)
    dd_trg.insert(loc=1, column='binner', value=binner)

    # -----------------------------------------
    # Create cls
    # -----------------------------------------
    # When the scores are non-negative the hits are the largest values, so
    # the quantile is mirrored and the comparison direction is flipped.
    scores_nonneg = dd_trg[score_name].min() >= 0
    if scores_nonneg:
        q_cls = 1.0 - q_cls
    cls_th = dd_trg[score_name].quantile(q=q_cls)
    res['cls_th'] = cls_th
    print_fn('Quantile score (q_cls={:.3f}): {:.3f}'.format(q_cls, cls_th))

    # Generate a classification target col
    if scores_nonneg:
        value = (dd_trg[score_name] >= cls_th).astype(int)
    else:
        value = (dd_trg[score_name] <= cls_th).astype(int)
    # Inserted at loc=1 after 'binner', so the order is: score, cls, binner.
    dd_trg.insert(loc=1, column='cls', value=value)

    # -----------------------------------------
    # Plot
    # -----------------------------------------
    hist, _ = np.histogram(dd_trg[score_name], bins=bins)
    # Vertical dashed line at the threshold, as tall as the highest bar.
    x = np.ones((10,)) * cls_th
    y = np.linspace(0, hist.max(), len(x))

    fig, ax = plt.subplots()
    ax.hist(dd_trg[score_name], bins=bins, density=False, facecolor='b', alpha=0.5)
    ax.set_title(f'Scores Clipped to 0: {fname}')
    ax.set_ylabel('Count')
    ax.set_xlabel('Docking Score')
    ax.plot(x, y, 'r--', alpha=0.7, label=f'{q_cls}-th quantile')
    ax.grid(True)
    fig.savefig(outfigs/f'dock.score.bin.{fname}.png')
    # Release the figure; otherwise figures pile up when this function is
    # called once per target.
    plt.close(fig)

    # -----------------------------------------
    # Separate the features
    # -----------------------------------------
    def extract_and_save_fea(df, fea, to_csv=False):
        """ Extract specific feature type (including metadata) and save to file. """
        fea_prfx_drop = [i for i in fea_list if i != fea]
        fea_cols_drop = extract_subset_fea_col_names(df, fea_list=fea_prfx_drop, fea_sep=fea_sep)
        data = df.drop(columns=fea_cols_drop)

        outpath_name = outdir/(fname + f'.{fea}')
        data.to_parquet(str(outpath_name) + '.parquet')
        if to_csv:
            data.to_csv(str(outpath_name) + '.csv', index=False)
        return data

    print_fn(f'Create and save dataframes ...')
    dsc_df = None
    for fea in fea_list:
        is_dsc = 'dsc' in fea
        # The descriptor csv is written further down, after the prefix is
        # stripped, so it is skipped here.
        fea_df = extract_and_save_fea(dd_trg, fea=fea, to_csv=not is_dsc)
        if is_dsc:
            # Remember the descriptor frame explicitly; the loop variable
            # alone would end up holding the frame of the last feature.
            dsc_df = fea_df

    # Scale descriptors and save scaler (save raw features rather than scaled)
    if dsc_df is not None:
        dsc_prfx = 'dsc' + fea_sep

        from sklearn.preprocessing import StandardScaler
        import joblib

        # fea_list is passed as a list, like every other call of the helper.
        xdata = extract_subset_fea(dsc_df, fea_list=['dsc'], fea_sep=fea_sep)
        sc = StandardScaler(with_mean=True, with_std=True)
        sc.fit(xdata)
        sc_outpath = outdir/(fname + f'.dsc.scaler.pkl')
        joblib.dump(sc, sc_outpath)

        # We decided to remove the feature-specific prefixes for descriptors
        dsc_df = dsc_df.rename(
            columns={c: c.split(dsc_prfx)[-1] if dsc_prfx in c else c
                     for c in dsc_df.columns})
        dsc_df.to_csv(outdir/(fname + '.dsc.csv'), index=False)

    # -----------------------------------------
    # Baseline model (best effort)
    # -----------------------------------------
    try:
        import lightgbm as lgb
        from sklearn.model_selection import train_test_split
        from ml.evals import calc_preds, calc_scores
    except ImportError as err:
        print_fn(f'Skipping baseline model; could not import a dependency: {err}')
        return res

    try:
        ml_init_args = {'n_jobs': 8}
        ml_fit_args = {'verbose': False, 'early_stopping_rounds': 10}
        model = lgb.LGBMRegressor(**ml_init_args)

        # The target column was renamed to score_name above.
        ydata = dd_trg[score_name]
        xdata = extract_subset_fea(dd_trg, fea_list=fea_list, fea_sep=fea_sep)

        # 80/20 split into (train+val)/test, then 80/20 into train/val.
        x_, xte, y_, yte = train_test_split(xdata, ydata, test_size=0.2)
        xtr, xvl, ytr, yvl = train_test_split(x_, y_, test_size=0.2)
        ml_fit_args['eval_set'] = (xvl, yvl)
        model.fit(xtr, ytr, **ml_fit_args)

        y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype='reg')
        te_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype='reg', metrics=None)
        res['r2'] = te_scores['r2']
        res['mae'] = te_scores['median_absolute_error']
    except Exception as err:
        # The baseline is optional: report the real cause and carry on
        # instead of hiding it behind an import message.
        print_fn(f'Baseline model training failed: {err}')

    return res