def extract_and_save_fea(df, fea, to_csv=False):
    """ Keep a single feature type (plus all non-feature columns) and save it.

    Columns belonging to every prefix in `fea_list` other than `fea` are
    dropped from `df`; whatever remains is written to a parquet file and,
    optionally, to a csv file.

    NOTE: `fea_list`, `fea_sep`, `outdir` and `fname` are not parameters;
    they are resolved from the surrounding scope at call time.

    Args:
        df : dataframe holding metadata and feature columns
        fea : the feature prefix to keep
        to_csv : if True, also write a csv next to the parquet file

    Returns:
        the dataframe that was saved
    """
    # Prefixes of all the feature types that should NOT be kept
    other_prefixes = [prfx for prfx in fea_list if prfx != fea]
    unwanted_cols = extract_subset_fea_col_names(
        df, fea_list=other_prefixes, fea_sep=fea_sep)
    subset = df.drop(columns=unwanted_cols)

    # Common path stem shared by both output files
    stem = str(outdir / (fname + f'.{fea}'))
    subset.to_parquet(stem + '.parquet')
    if to_csv:
        subset.to_csv(stem + '.csv', index=False)
    return subset
示例#2
0
    def merge_dock_and_fea(dock,
                           fea_df,
                           fea_prfx,
                           fea_sep,
                           merger='TITLE',
                           fea_name=None,
                           baseline=False):
        """ Join docking data with one feature set, save it, optionally score it.

        The two inputs are inner-joined on `merger`, the columns are
        re-ordered (known metadata columns first, then the features that
        carry the `fea_prfx` prefix) and the result is written to a parquet
        file under `trg_outdir`. Nothing is returned.

        NOTE: `print_fn`, `trg_outdir`, `trg_name`, `res` and `trn_baseline`
        are not parameters; they come from the enclosing scope.

        Args:
            dock : left side of the merge (docking data)
            fea_df : right side of the merge (features)
            fea_prfx : feature prefix used to select the feature columns
            fea_sep : separator between the prefix and the feature name
            merger : column name to join on
            fea_name : label used in the log line and in the output file name
            baseline : if True, run `trn_baseline` on the merged data and
                store its 'r2' and 'median_absolute_error' values in `res`
        """
        joined = pd.merge(dock, fea_df, on=merger, how='inner')
        del fea_df  # drop the local reference; only the merged frame is used below
        joined = joined.reset_index(drop=True)

        # Re-org cols: metadata first, then the selected features
        feature_names = extract_subset_fea_col_names(joined,
                                                     fea_list=[fea_prfx],
                                                     fea_sep=fea_sep)
        metadata_names = ['Inchi-key', 'SMILES', 'TITLE', 'CAT', 'reg', 'cls']
        ordered = metadata_names + feature_names
        # Some metadata columns may be absent, so keep only those present
        present = [col for col in ordered if col in joined.columns]
        joined = joined[present]
        print_fn('{}: {}'.format(fea_name, joined.shape))

        # Save
        parquet_stem = trg_outdir / f'ml.{trg_name}.{fea_name}'
        joined.to_parquet(str(parquet_stem) + '.parquet')

        # Compute baseline if specified
        if baseline:
            scores = trn_baseline(joined, fea_list=[fea_prfx], fea_sep=fea_sep)
            res[f'{fea_prfx}_r2'] = scores['r2']
            res[f'{fea_prfx}_mae'] = scores['median_absolute_error']
            del scores

        del joined
def gen_ml_df(dd, trg_name, meta_cols=['TITLE', 'SMILES'], fea_list=['dsc'],
              score_name='reg', q_cls=0.025, bin_th=2.0, print_fn=print,
              outdir=Path('out'), outfigs=Path('outfigs')):
    """ Generate and save the ML dataframes for the specified target column trg_name.

    Steps performed:
        1. Keep the target column, the available metadata columns and the
           feature columns selected by `fea_list`; drop rows with NaN score.
        2. Rename the target column to `score_name`, clip the scores to
           <= 0 and take the absolute value (scores become non-negative).
        3. Add the 'binner' column (score >= bin_th) and the 'cls' column
           (score beyond the quantile threshold).
        4. Save a histogram of the scores with the threshold marked.
        5. Save one dataframe per feature type (parquet; csv as well for
           the non-descriptor features).
        6. If descriptors are present: fit and save a StandardScaler and
           save the raw (unscaled) descriptors to csv with the feature
           prefix removed from the column names.
        7. Best effort: train a LightGBM baseline regressor and record its
           test scores.

    Args:
        dd : dataframe with (molecules x targets) where the first col is TITLE
        trg_name : a column in dd representing the target
        meta_cols : metadata columns to include in the dataframe (those that
            are missing from dd are ignored). Never mutated here.
        fea_list : feature prefixes to include. Never mutated here.
        score_name : rename the trg_name with score_name
        q_cls : quantile value to compute along the docking scores to generate the 'cls' col
        bin_th : threshold value of docking score to generate the 'binner' col
        print_fn : callable used for all progress/log messages
        outdir : directory (Path) for the saved dataframes and the scaler
        outfigs : directory (Path) for the saved figure

    Returns:
        res : dict with the keys 'target', 'min', 'max', 'cls_th' and, when
            the baseline model ran successfully, 'r2' and 'mae'
    """
    print_fn( f'Processing {trg_name} ...' )
    res = {}
    res['target'] = trg_name

    # Keep the metadata columns that exist in dd. Order follows the caller's
    # list (a set intersection would make the column order arbitrary).
    meta_cols = [c for c in dict.fromkeys(meta_cols) if c in dd.columns]

    fea_sep = '.'
    fea_cols = extract_subset_fea_col_names(dd, fea_list=fea_list, fea_sep=fea_sep)
    cols = [trg_name] + meta_cols + fea_cols
    dd_trg = dd[ cols ]
    del dd  # drop the local reference to the full frame

    # Drop NaN scores
    dd_trg = dd_trg[ ~dd_trg[trg_name].isna() ].reset_index(drop=True)

    # Rename the scores col
    dd_trg = dd_trg.rename( columns={trg_name: score_name} )

    # File name
    fname = 'ml.' + trg_name

    # Transform scores to positive (positive raw scores are clipped to 0)
    dd_trg[score_name] = abs( np.clip(dd_trg[score_name], a_min=None, a_max=0) )
    res['min'], res['max'] = dd_trg[score_name].min(), dd_trg[score_name].max()
    bins = 50

    # -----------------------------------------
    # Create binner
    # -----------------------------------------
    binner = (dd_trg[score_name] >= bin_th).astype(int)
    dd_trg.insert(loc=1, column='binner', value=binner)

    # -----------------------------------------
    # Create cls
    # -----------------------------------------
    # The scores do not change between the two checks below, so the test is
    # evaluated once.
    scores_nonneg = dd_trg[score_name].min() >= 0  # if scores were transformed to >=0

    # Find quantile value
    if scores_nonneg:
        q_cls = 1.0 - q_cls
    cls_th = dd_trg[score_name].quantile(q=q_cls)
    res['cls_th'] = cls_th
    print_fn('Quantile score (q_cls={:.3f}): {:.3f}'.format( q_cls, cls_th ))

    # Generate a classification target col
    if scores_nonneg:
        value = (dd_trg[score_name] >= cls_th).astype(int)
    else:
        value = (dd_trg[score_name] <= cls_th).astype(int)
    dd_trg.insert(loc=1, column='cls', value=value)

    # Plot the score histogram with a vertical line at the cls threshold
    hist, _ = np.histogram(dd_trg[score_name], bins=bins)
    x = np.ones((10,)) * cls_th
    y = np.linspace(0, hist.max(), len(x))

    fig, ax = plt.subplots()
    try:
        ax.hist(dd_trg[score_name], bins=bins, density=False, facecolor='b', alpha=0.5)
        ax.set_title(f'Scores Clipped to 0: {fname}')
        ax.set_ylabel('Count')
        ax.set_xlabel('Docking Score')
        ax.plot(x, y, 'r--', alpha=0.7, label=f'{q_cls}-th quantile')
        ax.legend()  # the threshold line carries a label; show it
        ax.grid(True)
        fig.savefig(outfigs/f'dock.score.bin.{fname}.png')
    finally:
        # Release the figure; otherwise figures pile up when this function
        # is called once per target.
        plt.close(fig)

    # Separate the features
    def extract_and_save_fea( df, fea, to_csv=False ):
        """ Extract specific feature type (including metadata) and
        save to file.
        """
        fea_prfx_drop = [i for i in fea_list if i != fea]
        fea_cols_drop = extract_subset_fea_col_names(df, fea_list=fea_prfx_drop, fea_sep=fea_sep)
        data = df.drop( columns=fea_cols_drop )
        outpath_name = outdir/(fname+f'.{fea}')
        data.to_parquet( str(outpath_name)+'.parquet' )
        if to_csv:
            data.to_csv( str(outpath_name)+'.csv', index=False )
        return data

    print_fn( 'Create and save dataframes ...' )
    # Remember the descriptor frame explicitly. Relying on the loop variable
    # would pick up whichever feature type happens to be last in fea_list.
    dsc_df = None
    for fea in fea_list:
        is_dsc = 'dsc' in fea
        # The descriptor csv is written further below (with renamed columns)
        fea_df = extract_and_save_fea( dd_trg, fea=fea, to_csv=not is_dsc )
        if is_dsc:
            dsc_df = fea_df

    # Scale desciptors and save scaler (save raw features rather the scaled)
    if dsc_df is not None:
        from sklearn.preprocessing import StandardScaler
        import joblib
        dsc_prfx = 'dsc' + fea_sep
        # Pass the prefix as a list, like every other call site does
        xdata = extract_subset_fea(dsc_df, fea_list=['dsc'], fea_sep=fea_sep)
        sc = StandardScaler( with_mean=True, with_std=True )
        sc.fit( xdata )
        sc_outpath = outdir/(fname+'.dsc.scaler.pkl')
        joblib.dump(sc, sc_outpath)

        # We decided to remove the feature-specific prefixes for descriptors
        dsc_df = dsc_df.rename(columns={c: c.split(dsc_prfx)[-1] if dsc_prfx in c else c for c in dsc_df.columns})
        dsc_df.to_csv( outdir/(fname+'.dsc.csv'), index=False)

    # -----------------------------------------
    # Baseline model (best effort)
    # -----------------------------------------
    try:
        import lightgbm as lgb
        from sklearn.model_selection import train_test_split
        from ml.evals import calc_preds, calc_scores
    except ImportError as err:
        print_fn(f'Could not import the baseline model dependencies: {err}')
        return res

    try:
        ml_model_def = lgb.LGBMRegressor
        ml_init_args = {'n_jobs': 8}
        # NOTE(review): recent LightGBM releases no longer accept 'verbose'
        # and 'early_stopping_rounds' in fit() -- confirm against the
        # installed version. A failure here is reported below.
        ml_fit_args = {'verbose': False, 'early_stopping_rounds': 10}
        model = ml_model_def( **ml_init_args )
        # The score column was renamed to score_name above
        ydata = dd_trg[score_name]
        xdata = extract_subset_fea(dd_trg, fea_list=fea_list, fea_sep=fea_sep)
        x_, xte, y_, yte = train_test_split(xdata, ydata, test_size=0.2)
        xtr, xvl, ytr, yvl = train_test_split(x_, y_, test_size=0.2)
        ml_fit_args['eval_set'] = (xvl, yvl)
        model.fit(xtr, ytr, **ml_fit_args)
        y_pred, y_true = calc_preds(model, x=xte, y=yte, mltype='reg')
        te_scores = calc_scores(y_true=y_true, y_pred=y_pred, mltype='reg', metrics=None)
        res['r2'] = te_scores['r2']
        res['mae'] = te_scores['median_absolute_error']
    except Exception as err:
        # The baseline is optional: report the real cause and carry on
        # instead of hiding every failure behind an import message.
        print_fn(f'Baseline model failed: {err!r}')

    return res