Example #1
def run(args):
    # import ipdb; ipdb.set_trace()
    t0 = time()
    ID = 'TITLE'
    fea_sep = '_'

    assert args.par_jobs > 0, f"The arg 'par_jobs' must be an int > 0 (got {args.par_jobs})"

    if args.outdir is not None:
        outdir = Path(args.outdir)
    else:
        outdir = Path(filepath, '../out', FEA_DIR.name, args.drg_set).resolve()
    os.makedirs(outdir, exist_ok=True)

    # Logger
    lg = Logger(outdir / 'gen.fea.dfs.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')

    # ========================================================
    # Aggregate features from files
    # -----------------------------
    drug_names = None
    # N = 20
    N = None

    for fea_name in args.fea_type:
        if 'descriptors' == fea_name:

            dd_fea = load_mordred_descriptors(drg_set=args.drg_set,
                                              fea_name=fea_name,
                                              col_name=ID,
                                              drug_names=drug_names,
                                              fea_sep=fea_sep,
                                              n_jobs=args.par_jobs,
                                              N=N)

            print_fn('dd_fea.shape {}'.format(dd_fea.shape))
            dd_fea.to_parquet(outdir / 'descriptors.mordred.parquet')
            del dd_fea
        else:
            dd_fea = None

        if 'fps' == fea_name:
            fps_fea = load_fps(drg_set=args.drg_set,
                               fea_name=fea_name,
                               col_name=ID,
                               drug_names=drug_names,
                               fea_sep=fea_sep,
                               n_jobs=args.par_jobs,
                               N=N)

            print_fn('fps_fea.shape {}'.format(fps_fea.shape))
            fps_fea.to_parquet(outdir / 'fps.ecfp2.parquet')
            del fps_fea
        else:
            fps_fea = None

        if 'images' == fea_name:
            pass
        else:
            img_fea = None

    # --------------------------
    # Generate images
    # --------------------------
    # if 'images' in fea_type:
    #     files_path = Path(FEA_DIR, drg_set, 'images').resolve()
    #     fea_files = sorted( files_path.glob(f'{drg_set}-*.pkl') )

    #     if len(fea_files) > 0:
    #         fea_outpath = outdir/'images'
    #         os.makedirs(fea_outpath, exist_ok=True)

    #         dfs = []
    #         for i, f in enumerate(fea_files[:N]):
    #             if (i+1) % 100 == 0:
    #                 print(f'Load {i+1} ... {f.name}')
    #             imgs = pickle.load(open(fea_files[i], 'rb'))

    #             # That's from get_image_dct(mol)
    #             # image = (255 * transforms.ToTensor()(Invert()(generateFeatures.smiles_to_image(mol))).numpy()).astype(np.uint8)
    #             image = Invert()(image)
    #             image = transforms.ToTensor()(image)
    #             image = image.numpy()
    #             image = 255 * image
    #             image = image.astype(np.uint8)

    #             # To dict
    #             def img_data_to_dict( aa ):
    #                 dct = {}
    #                 dct['drg_set'] = aa[0]
    #                 dct['TITLE'] = aa[1]
    #                 dct['SMILES'] = aa[2]
    #                 dct['img'] = aa[3]

    # ========================================================
    print_fn('\nRuntime {:.1f} mins'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
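A minimal invocation sketch for the example above (the argument values are hypothetical; run(), FEA_DIR, filepath and the loader functions are assumed to come from the project module this snippet belongs to):

from argparse import Namespace

args = Namespace(
    drg_set='OZD',                    # hypothetical drug-set name
    fea_type=['descriptors', 'fps'],  # feature types to aggregate
    par_jobs=8,                       # parallel workers passed to the loaders
    outdir='./out/fea.dfs',           # hypothetical output directory
)
run(args)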
Example #2
def run(args):
    # import pdb; pdb.set_trace()
    t0 = time()
    datapath = Path(args['datapath']).resolve()

    if args['max_size'] is not None:
        assert args['min_size'] < args['max_size'], f"min train size (min_size={args['min_size']}) "\
                                                    f"must be smaller than max train size "\
                                                    f"(max_size={args['max_size']})."

    if args['splitdir'] is None:
        splitdir = None
    else:
        splitdir = Path(args['splitdir']).resolve()
    split_id = args['split_id']

    # -----------------------------------------------
    #       Global outdir
    # -----------------------------------------------
    if args['gout'] is not None:
        gout = Path(args['gout']).resolve()
    else:
        gout = fdir.parent / 'lc.trn'
        gout = gout / datapath.with_suffix('.lc').name
    args['gout'] = str(gout)
    os.makedirs(gout, exist_ok=True)

    # -----------------------------------------------
    #       Run (single split) outdir
    # -----------------------------------------------
    if args['rout'] is not None:
        rout = gout / args['rout']
    else:
        if splitdir is None:
            rout = gout / 'run_0'
        else:
            rout = gout / f'split_{split_id}'
    args['rout'] = str(rout)
    os.makedirs(rout, exist_ok=True)

    # -----------------------------------------------
    #       Logger
    # -----------------------------------------------
    lg = Logger(rout / 'lc.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {fdir}')
    print_fn(f'\n{pformat(args)}')
    dump_dict(args, outpath=rout / 'trn.args.txt')

    # -----------------------------------------------
    #       Load data
    # -----------------------------------------------
    print_fn('\nLoad master dataset.')
    data = load_data(datapath)
    print_fn('data.shape {}'.format(data.shape))

    # Get features (x), target (y), and meta
    fea_list = args['fea_prfx']
    fea_sep = args['fea_sep']
    xdata = extract_subset_fea(data, fea_list=fea_list, fea_sep=fea_sep)
    meta = data.drop(columns=xdata.columns)
    ydata = meta[[args['trg_name']]]
    del data

    # -----------------------------------------------
    #       Scale features
    # -----------------------------------------------
    xdata = scale_fea(xdata=xdata, scaler_name=args['scaler'])

    # -----------------------------------------------
    #       Data splits
    # -----------------------------------------------
    if splitdir is None:
        cv_lists = None
    else:
        split_pattern = f'1fold_s{split_id}_*_id.csv'
        single_split_files = glob(str(splitdir / split_pattern))

        # Get indices for the split
        for id_file in single_split_files:
            if 'tr_id' in id_file:
                tr_id = load_data(id_file).values.reshape(-1, )
            elif 'vl_id' in id_file:
                vl_id = load_data(id_file).values.reshape(-1, )
            elif 'te_id' in id_file:
                te_id = load_data(id_file).values.reshape(-1, )

        cv_lists = (tr_id, vl_id, te_id)

    # -----------------------------------------------
    #      ML model configs
    # -----------------------------------------------
    if args['ml'] == 'lgb':
        # LGBM regressor model definition
        import lightgbm as lgb
        framework = 'lightgbm'
        ml_model_def = lgb.LGBMRegressor
        mltype = 'reg'

        ml_init_kwargs = {
            'n_estimators': args['n_estimators'],
            'max_depth': args['max_depth'],
            'learning_rate': args['learning_rate'],
            'num_leaves': args['num_leaves'],
            'n_jobs': args['n_jobs'],
            'random_state': None
        }
        ml_fit_kwargs = {'verbose': False, 'early_stopping_rounds': 10}
        data_prep_def = None
        keras_callbacks_def = None
        keras_clr_kwargs = None

    elif args['ml'] == 'nn_reg0':
        # Keras model def
        from models.keras_model import (nn_reg0_model_def, data_prep_nn0_def,
                                        model_callback_def)
        framework = 'keras'
        mltype = 'reg'
        keras_callbacks_def = model_callback_def
        data_prep_def = data_prep_nn0_def

        ml_model_def = nn_reg0_model_def

        ml_init_kwargs = {
            'input_dim': xdata.shape[1],
            'dr_rate': args['dr_rate'],
            'opt_name': args['opt'],
            'lr': args['lr'],
            'batchnorm': args['batchnorm']
        }
        ml_fit_kwargs = {
            'epochs': args['epoch'],
            'batch_size': args['batch_size'],
            'verbose': 1
        }
        keras_clr_kwargs = {}

    elif args['ml'] == 'nn_reg1':
        from models.keras_model import (nn_reg1_model_def, data_prep_nn1_def,
                                        model_callback_def)
        framework = 'keras'
        mltype = 'reg'
        keras_callbacks_def = model_callback_def
        data_prep_def = data_prep_nn1_def

        ml_model_def = nn_reg1_model_def

        x_ge = extract_subset_fea(xdata, fea_list=['ge'], fea_sep='_')
        x_dd = extract_subset_fea(xdata, fea_list=['dd'], fea_sep='_')

        ml_init_kwargs = {
            'in_dim_ge': x_ge.shape[1],
            'in_dim_dd': x_dd.shape[1],
            'dr_rate': args['dr_rate'],
            'opt_name': args['opt'],
            'lr': args['lr'],
            'batchnorm': args['batchnorm']
        }
        ml_fit_kwargs = {
            'epochs': args['epoch'],
            'batch_size': args['batch_size'],
            'verbose': 1
        }
        keras_clr_kwargs = {}
        del x_ge, x_dd

    # Print NN
    if len(ml_init_kwargs) and ('nn' in args['ml']):
        model = ml_model_def(**ml_init_kwargs)
        model.summary(print_fn=lg.logger.info)
        del model

    # -----------------------------------------------
    #      Learning curve
    # -----------------------------------------------
    # LC args
    lc_init_args = {
        'cv_lists': cv_lists,
        'n_splits': args['n_splits'],
        'mltype': mltype,
        'lc_step_scale': args['lc_step_scale'],
        'lc_sizes': args['lc_sizes'],
        'min_size': args['min_size'],
        'max_size': args['max_size'],
        'lc_sizes_arr': args['lc_sizes_arr'],
        'outdir': rout,
        'print_fn': print_fn
    }

    lc_trn_args = {
        'framework': framework,
        'n_jobs': args['n_jobs'],
        'ml_model_def': ml_model_def,
        'ml_init_args': ml_init_kwargs,
        'ml_fit_args': ml_fit_kwargs,
        'data_prep_def': data_prep_def,
        'keras_callbacks_def': keras_callbacks_def,
        'keras_clr_args': keras_clr_kwargs
    }

    # LC object
    lc_obj = LearningCurve(X=xdata, Y=ydata, meta=meta, **lc_init_args)
    lc_scores = lc_obj.trn_learning_curve(**lc_trn_args)

    # Dump all scores
    lc_scores.to_csv(rout / 'lc_scores.csv', index=False)

    # Dump args
    dump_dict(args, outpath=rout / 'args.txt')

    # ------------------------------------------------------
    if (time() - t0) // 3600 > 0:
        print_fn('Runtime: {:.1f} hrs'.format((time() - t0) / 3600))
    else:
        print_fn('Runtime: {:.1f} mins'.format((time() - t0) / 60))

    print_fn('Done.')
    lg.close_logger()
    del xdata, ydata

    return None
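The split-file loop above leaves tr_id/vl_id/te_id undefined if an id file is missing from splitdir. A small defensive sketch, assuming the files follow the 1fold_s{split_id}_tr|vl|te_id.csv naming implied by the glob pattern (pd.read_csv stands in for the project's load_data):

from pathlib import Path
import pandas as pd

def load_split_ids(splitdir, split_id):
    """Return (tr_id, vl_id, te_id) index arrays for one split."""
    ids = {}
    for subset in ('tr', 'vl', 'te'):
        f = Path(splitdir) / f'1fold_s{split_id}_{subset}_id.csv'
        if not f.exists():
            raise FileNotFoundError(f'Missing split index file: {f}')
        ids[subset] = pd.read_csv(f).values.reshape(-1,)
    return ids['tr'], ids['vl'], ids['te']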
Example #3
def run(args):

    te_size = verify_size(args.te_size)

    # Path
    appdir = MAIN_APPDIR / args.appname

    # import ipdb; ipdb.set_trace(context=11)

    # Hard split
    split_on = None if args.split_on is None else args.split_on.upper()
    te_method = args.cv_method

    # Specify ML task (regression or classification)
    if args.cv_method == 'strat':
        mltype = 'cls'  # cast mltype to cls in case of stratification
    else:
        mltype = args.ml_task

    # -----------------------------------------------
    #       Create appdir
    # -----------------------------------------------
    gout = appdir / 'splits'
    outfigs = gout / 'outfigs'
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    #       Create logger
    # -----------------------------------------------
    lg = Logger(gout / 'data.splitter.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File dir: {fdir}')
    print_fn(f'\n{pformat(vars(args))}')
    dump_dict(vars(args), outpath=gout / 'data.splitter.args.txt')  # dump args

    # -----------------------------------------------
    #       Load data
    # -----------------------------------------------
    print_fn('\nLoad master dataset.')
    data = load_data(appdir / 'annotations.csv')
    print_fn('data.shape {}'.format(data.shape))

    GE_LEN = sum([1 for c in data.columns if c.startswith('ge_')])
    DD_LEN = sum([1 for c in data.columns if c.startswith('dd_')])

    # import ipdb; ipdb.set_trace(context=11)

    # -----------------------------------------------
    #       Determine the dataset
    # -----------------------------------------------
    ydata = data[args.trg_name] if args.trg_name in data.columns else None
    if (ydata is None) and (args.cv_method == 'strat'):
        raise ValueError(
            'Y data must be specified if splits need to be stratified.')
    if ydata is not None:
        plot_hist(ydata,
                  title=f'{args.trg_name}',
                  fit=None,
                  bins=100,
                  path=outfigs / f'{args.trg_name}_hist_all.png')

    # -----------------------------------------------
    #       Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn('\n{}'.format('-' * 50))
    print_fn('Split into hold-out train/val/test')
    print_fn('{}'.format('-' * 50))

    kwargs = {
        'data': data,
        'cv_method': args.cv_method,
        'te_method': te_method,
        'te_size': te_size,
        'mltype': mltype,
        'split_on': split_on
    }

    data_splitter(n_splits=args.n_splits,
                  gout=gout,
                  outfigs=outfigs,
                  ydata=ydata,
                  print_fn=print_fn,
                  **kwargs)

    lg.kill_logger()
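data_splitter() itself is not shown on this page; as a rough illustration only, the kind of stratified hold-out split implied by cv_method='strat' (with a continuous target binned so it can be stratified) can be sketched with scikit-learn:

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

def strat_holdout(ydata: pd.Series, te_size=0.1, n_bins=5, seed=0):
    """Bin a continuous target and draw one stratified train/test split."""
    y_binned = pd.qcut(ydata, q=n_bins, labels=False, duplicates='drop')
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=te_size,
                                      random_state=seed)
    tr_id, te_id = next(splitter.split(np.zeros((len(ydata), 1)), y_binned))
    return tr_id, te_id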
Example #4
def run(args):
    # import pdb; pdb.set_trace()
    t0 = time()
    rsp_cols = [
        'AUC', 'AUC1', 'EC50', 'EC50se', 'R2fit', 'Einf', 'IC50', 'HS', 'AAC1',
        'DSS1'
    ]
    outdir = create_outdir(args.gout, args)

    # -----------------------------------------------
    #     Logger
    # -----------------------------------------------
    lg = Logger(outdir / 'gen.df.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')
    dump_dict(vars(args), outpath=outdir / 'gen.df.args')

    # -----------------------------------------------
    #     Load response data and features
    # -----------------------------------------------
    rsp = load_rsp(args.rsp_path,
                   src=args.src,
                   r2fit_th=args.r2fit_th,
                   print_fn=print_fn)
    ge = load_ge(args.cell_path, print_fn=print_fn, float_type=np.float32)
    dd = load_dd(args.drug_path,
                 dropna_th=args.dropna_th,
                 print_fn=print_fn,
                 float_type=np.float32,
                 src=args.src)

    # -----------------------------------------------
    #     Merge data
    # -----------------------------------------------
    print_fn('\n{}'.format('-' * 40))
    print_fn('Start merging response with other dfs.')
    print_fn('-' * 40)
    data = rsp

    # Merge with ge
    print_fn('\nMerge with expression (ge).')
    data = pd.merge(data, ge, on='CELL', how='inner')
    groupby_src_and_print(data, print_fn=print_fn)
    del ge

    # Merge with dd
    print_fn('\nMerge with descriptors (dd).')
    data = pd.merge(data, dd, on='DRUG', how='inner')
    groupby_src_and_print(data, print_fn=print_fn)
    del dd

    # Sample
    # if (args.n_samples is not None):
    #     print_fn('\nSample the final dataset.')
    #     if args.flatten:
    #         data = flatten_dist(df=data, n=args.n_samples, score_name=args.trg_name)
    #     else:
    #         if args.n_samples <= data.shape[0]:
    #             data = data.sample(n=args.n_samples, replace=False, random_state=0)
    #     print_fn(f'data.shape {data.shape}\n')

    # Memory usage
    print_fn('\nTidy dataframe: {:.1f} GB'.format(sys.getsizeof(data) / 1e9))
    for fea_name, fea_prfx in fea_prfx_dct.items():
        cols = [c for c in data.columns if fea_prfx in c]
        aa = data[cols]
        mem = 0 if aa.shape[1] == 0 else sys.getsizeof(aa) / 1e9
        print_fn('Memory occupied by {} features: {} ({:.1f} GB)'.format(
            fea_name, len(cols), mem))

    print_fn(f"\nData final: {data.shape}")

    # Plot histograms of target variables
    plot_rsp_dists(data, rsp_cols=rsp_cols, savepath=outdir / 'rsp_dists.png')

    # -----------------------------------------------
    #   Save data
    # -----------------------------------------------
    # Save data
    print_fn('\nSave dataframe.')
    fname = create_basename(args)
    fpath = outdir / (fname + '.parquet')
    data.to_parquet(fpath)

    # Load data
    print_fn('Load dataframe.')
    data_fromfile = pd.read_parquet(fpath)

    # Check that the saved data is the same as original one
    print_fn(f'Loaded df is same as original: {data.equals(data_fromfile)}')

    print_fn('\n{}'.format('-' * 70))
    print_fn(f'Dataframe filepath:\n{fpath.resolve()}')
    print_fn('-' * 70)

    # -------------------------------------------------------
    print_fn('\nRuntime: {:.1f} mins'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.close_logger()
    return None
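A toy illustration of the inner-merge chain above (the column names follow the code, the values are made up):

import pandas as pd

rsp = pd.DataFrame({'CELL': ['c1', 'c2'], 'DRUG': ['d1', 'd2'], 'AUC': [0.5, 0.7]})
ge = pd.DataFrame({'CELL': ['c1', 'c2'], 'ge_g1': [1.2, 0.3]})
dd = pd.DataFrame({'DRUG': ['d1', 'd2'], 'dd_m1': [10.0, 12.5]})

data = rsp.merge(ge, on='CELL', how='inner').merge(dd, on='DRUG', how='inner')
print(data.shape)  # (2, 5)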
Example #5

def run(args):
    t0 = time()
    scores_path = Path( args['scores_path'] ).resolve()
    fea_path = Path( args['fea_path'] ).resolve()
    par_jobs = int( args['par_jobs'] )
    fea_list = args['fea_list']
    assert par_jobs > 0, f"The arg 'par_jobs' must be at least 1 (got {par_jobs})"

    if args['outdir'] is not None:
        outdir = Path( args['outdir'] ).resolve()
    else:
        batch_name = scores_path.parent.name
        outdir = Path( filepath/'../out'/batch_name ).resolve()

    outfigs = outdir/'figs'
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)
    args['outdir'] = outdir
    
    # Logger
    lg = Logger( outdir/'gen.ml.data.log' )
    print_fn = get_print_func( lg.logger )
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(args)}')
    
    print_fn('\nDocking scores path {}'.format( scores_path ))
    print_fn('Features path       {}'.format( fea_path ))
    print_fn('Outdir path         {}'.format( outdir ))

    # -----------------------------------------
    # Load data (features and docking scores)
    # -----------------------------------------    
    # Features (with SMILES)
    print_fn('\nLoad features ...')
    fea = load_data( fea_path )
    print_fn('Features {}'.format( fea.shape ))
    fea = drop_dup_rows(fea, print_fn=print_fn)

    # Docking scores
    print_fn('\nLoad docking scores ...')
    rsp = load_data( args['scores_path'] )
    print_fn('Docking {}'.format( rsp.shape ))
    rsp = drop_dup_rows(rsp, print_fn=print_fn)

    # # Check that 'SMILES' col exists
    # if 'SMILES' in rsp.columns:
    #     rsp = rsp.rename(columns={'SMILES': 'SMILES'})
    # assert 'SMILES' in rsp.columns, "Column 'SMILES' must exists in the docking scores file."

    # print_fn('\nCanonicalize SMILES ...')
    # can_smi_vec = canon_SMILES( rsp['SMILES'], par_jobs=args['par_jobs'] )
    # can_smi_vec = pd.Series(can_smi_vec)

    # # Save to file bad SMILES (that were not canonicalized)
    # nan_ids = can_smi_vec.isna()
    # bad_smi = rsp[ nan_ids ]
    # if len(bad_smi)>0:
    #     bad_smi.to_csv(outdir/'smi_canon_err.csv', index=False)

    # # Keep the good (canonicalized) SMILES
    # rsp['SMILES'] = can_smi_vec
    # rsp = rsp[ ~nan_ids ].reset_index(drop=True)

    print_fn( '\n{}'.format( rsp.columns.tolist() ))
    print_fn( '\n{}\n'.format( rsp.iloc[:3,:4] ))

    # -----------------------------------------    
    # Merge features with dock scores
    # -----------------------------------------    
    # merger = 'SMILES'
    merger = 'TITLE'
    assert merger in rsp.columns, f"Column '{merger}' must exist in the docking scores file."
    unq_smiles = set( rsp[merger] ).intersection( set(fea[merger]) )
    print_fn( 'Unique {} in rsp: {}'.format( merger, rsp[merger].nunique() ))
    print_fn( 'Unique {} in fea: {}'.format( merger, fea[merger].nunique() ))
    print_fn( 'Intersect on {}:  {}'.format( merger, len(unq_smiles) ))

    print_fn(f'\nMerge features with docking scores on {merger} ...')
    dd = pd.merge(rsp, fea, on=merger, how='inner')
    print_fn('Merged {}'.format( dd.shape ))
    print_fn('Unique {} in final df: {}'.format( merger, dd[merger].nunique() ))
    trg_names = rsp.columns[1:].tolist()
    del rsp, fea

    score_name = 'reg' # unified name for docking scores column in all output dfs
    bin_th = 2.0 # threshold value for the binner column (classifier)
    kwargs = { 'dd': dd, 'meta_cols': meta_cols, 'fea_list': fea_list,
               'score_name': score_name, 'q_cls': args['q_bins'], 'bin_th': bin_th,
               'print_fn': print_fn, 'outdir': outdir, 'outfigs': outfigs }

    if par_jobs > 1:
        # https://joblib.readthedocs.io/en/latest/parallel.html
        results = Parallel(n_jobs=par_jobs, verbose=20)(
                delayed(gen_ml_df)(trg_name=trg, **kwargs) for trg in trg_names )
    else:
        results = [] # docking summary including ML baseline scores
        for trg in trg_names:
            res = gen_ml_df(trg_name=trg, **kwargs)
            results.append( res )

    results = np.round(pd.DataFrame(results), decimals=3)
    results.to_csv( outdir/'dock.ml.baseline.csv', index=False )

    # --------------------------------------------------------
    print_fn('\nRuntime {:.2f} mins'.format( (time()-t0)/60 ))
    print_fn('Done.')
    lg.kill_logger()
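The Parallel/delayed pattern used above, shown with a stand-in worker so the snippet runs on its own (gen_ml_df itself is project-specific):

from joblib import Parallel, delayed

def worker(trg_name, scale=1.0):
    # Stand-in for gen_ml_df: returns one summary row per docking target.
    return {'target': trg_name, 'score': scale * len(trg_name)}

trg_names = ['ADRP', 'NSP15', '3CLPro']
results = Parallel(n_jobs=2, verbose=0)(
    delayed(worker)(trg_name=trg, scale=0.1) for trg in trg_names)
print(results)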
Example #6
def run(args):
    t0 = time()
    n_splits = int(args.n_splits)
    te_size = verify_size(args.te_size)
    # te_size = args['te_size']
    datapath = Path(args.datapath).resolve()

    # Hard split
    split_on = None if args.split_on is None else args.split_on.upper()
    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification)
    if cv_method == 'strat':
        mltype = 'cls'  # cast mltype to cls in case of stratification
    else:
        mltype = args.ml_task

    # Target column name
    trg_name = str(args.trg_name)

    # -----------------------------------------------
    #       Create outdir
    # -----------------------------------------------
    if args.gout is not None:
        gout = Path(args.gout).resolve()
        gout = gout / datapath.with_suffix('.splits').name
    else:
        # Note! useful for drug response
        # sufx = 'none' if split_on is None else split_on
        # gout = gout / f'split_on_{sufx}'
        gout = datapath.with_suffix('.splits')

    outfigs = gout / 'outfigs'
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    #       Create logger
    # -----------------------------------------------
    lg = Logger(gout / 'data.splitter.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')
    dump_dict(vars(args), outpath=gout / 'data.splitter.args.txt')  # dump args

    # -----------------------------------------------
    #       Load data
    # -----------------------------------------------
    print_fn('\nLoad master dataset.')
    data = load_data(datapath)
    print_fn('data.shape {}'.format(data.shape))

    ydata = data[trg_name] if trg_name in data.columns else None
    if (ydata is None) and (cv_method == 'strat'):
        raise ValueError(
            'Y data must be available if splits are required to be stratified.')
    if ydata is not None:
        plot_hist(ydata,
                  title=f'{trg_name}',
                  fit=None,
                  bins=100,
                  path=outfigs / f'{trg_name}_hist_all.png')

    # -----------------------------------------------
    #       Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn('\n{}'.format('-' * 50))
    print_fn('Split into hold-out train/val/test')
    print_fn('{}'.format('-' * 50))

    kwargs = {
        'data': data,
        'cv_method': cv_method,
        'te_method': te_method,
        'te_size': te_size,
        'mltype': mltype,
        'split_on': split_on
    }

    data_splitter(n_splits=n_splits,
                  gout=gout,
                  outfigs=outfigs,
                  ydata=ydata,
                  print_fn=print_fn,
                  **kwargs)

    print_fn('Runtime: {:.1f} min'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
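Path.with_suffix(), used above to derive the default output directory, simply swaps the last extension, so the splits folder lands next to the dataset (the path below is hypothetical):

from pathlib import Path

datapath = Path('/data/ml.df.parquet')
print(datapath.with_suffix('.splits'))       # /data/ml.df.splits
print(datapath.with_suffix('.splits').name)  # ml.df.splits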
Example #7

def run(args):
    # import ipdb; ipdb.set_trace()
    t0 = time()
    scores_path = Path(args['scores_path']).resolve()
    fea_path = Path(args['fea_path']).resolve()
    img_path = None if args['img_path'] is None else Path(
        args['img_path']).resolve()
    par_jobs = int(args['par_jobs'])
    fea_list = args['fea_list']
    assert par_jobs > 0, f"The arg 'par_jobs' must be at least 1 (got {par_jobs})"

    if args['outdir'] is not None:
        outdir = Path(args['outdir']).resolve()
    else:
        batch_name = scores_path.parent.name
        outdir = Path(GOUT / batch_name).resolve()

    outfigs = outdir / 'figs'
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)
    args['outdir'] = outdir

    # Logger
    lg = Logger(outdir / 'gen.ml.data.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(args)}')

    print_fn('\nDocking scores {}'.format(scores_path))
    print_fn('Features       {}'.format(fea_path))
    print_fn('Images         {}'.format(img_path))
    print_fn('Outdir         {}'.format(outdir))

    # -----------------------------------------
    # Load data (features and docking scores)
    # -----------------------------------------
    # Docking scores
    print_fn('\nLoad docking scores ...')
    rsp = load_data(args['scores_path'])
    print_fn('Docking {}'.format(rsp.shape))
    rsp = drop_dup_rows(rsp, print_fn=print_fn)

    # Get target names
    trg_names = rsp.columns[1:].tolist()[:2]

    # -----------------------------------------
    # Dump docks of each trg to separate file
    # -----------------------------------------
    score_name = 'reg'  # unified name for docking scores column in all output dfs
    bin_th = 2.0  # threshold value for the binner column (classifier)
    kwargs = {
        'rsp': rsp,
        'meta_cols': meta_cols,
        'score_name': score_name,
        'q_cls': args['q_bins'],
        'print_fn': print_fn,
        'outdir': outdir
    }

    # import pdb; pdb.set_trace()
    if par_jobs > 1:
        results = Parallel(n_jobs=par_jobs, verbose=20)(
            delayed(dump_single_trg)(trg_name=trg, **kwargs)
            for trg in trg_names)
    else:
        for trg in trg_names:
            dump_single_trg(trg_name=trg, **kwargs)
    # -----------------------------------------------------

    # -----------------------------------------
    # Process Images
    # -----------------------------------------
    # Load images
    # import pdb; pdb.set_trace()
    if img_path is not None:
        print_fn('\nLoad images ...')
        images = load_data(img_path)
        print_fn('Images {} {}'.format(type(images), len(images)))

        # Keep intersect on samples (TITLE)
        kwargs = {
            'images': images,
            'rsp': rsp,
            'print_fn': print_fn,
            'outdir': outdir
        }

        if par_jobs > 1:
            Parallel(n_jobs=par_jobs,
                     verbose=20)(delayed(gen_ml_images)(trg_name=trg, **kwargs)
                                 for trg in trg_names)
        else:
            for trg in trg_names:
                gen_ml_images(trg_name=trg, **kwargs)
    # -----------------------------------------------------

    # Features (with SMILES)
    print_fn('\nLoad features ...')
    fea = load_data(fea_path)
    print_fn('Features {}'.format(fea.shape))
    fea = drop_dup_rows(fea, print_fn=print_fn)

    print_fn('\n{}'.format(rsp.columns.tolist()))
    print_fn('\n{}\n'.format(rsp.iloc[:3, :4]))

    # -----------------------------------------
    # Merge features with dock scores
    # -----------------------------------------
    merger = 'TITLE'  # we used 'SMILES' before
    assert merger in rsp.columns, f"Column '{merger}' must exist in the docking scores file."
    unq_smiles = set(rsp[merger]).intersection(set(fea[merger]))
    print_fn('Unique {} in rsp: {}'.format(merger, rsp[merger].nunique()))
    print_fn('Unique {} in fea: {}'.format(merger, fea[merger].nunique()))
    print_fn('Intersect on {}:  {}'.format(merger, len(unq_smiles)))

    print_fn(f'\nMerge features with docking scores on {merger} ...')
    dd = pd.merge(rsp, fea, on=merger, how='inner')
    print_fn('Merged {}'.format(dd.shape))
    print_fn('Unique {} in final df: {}'.format(merger, dd[merger].nunique()))
    del rsp, fea

    score_name = 'reg'  # unified name for docking scores column in all output dfs
    bin_th = 2.0  # threshold value for the binner column (classifier)
    kwargs = {
        'dd': dd,
        'meta_cols': meta_cols,
        'fea_list': fea_list,
        'score_name': score_name,
        'q_cls': args['q_bins'],
        'bin_th': bin_th,
        'print_fn': print_fn,
        'outdir': outdir,
        'outfigs': outfigs
    }

    # import pdb; pdb.set_trace()
    if par_jobs > 1:
        results = Parallel(n_jobs=par_jobs, verbose=20)(
            delayed(gen_ml_df)(trg_name=trg, **kwargs) for trg in trg_names)
    else:
        results = []  # docking summary including ML baseline scores
        for trg in trg_names:
            res = gen_ml_df(trg_name=trg, **kwargs)
            results.append(res)

    # TODO consider to generate baselines using ecfp features as well
    results = np.round(pd.DataFrame(results), decimals=3)
    results.to_csv(outdir / 'dock.ml.dsc.baseline.csv', index=False)

    # --------------------------------------------------------
    print_fn('\nRuntime {:.2f} mins'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
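A sketch of the "keep intersect on samples (TITLE)" step for images stored as a list of dicts with a 'TITLE' key (matching the pickled image dumps produced earlier); the data below is made up:

import pandas as pd

images = [{'TITLE': 't1', 'img': None}, {'TITLE': 't2', 'img': None}]
rsp = pd.DataFrame({'TITLE': ['t2', 't3'], 'reg': [1.5, 2.0]})

common = set(rsp['TITLE']).intersection(d['TITLE'] for d in images)
images_common = [d for d in images if d['TITLE'] in common]
rsp_common = rsp[rsp['TITLE'].isin(common)].reset_index(drop=True)
print(len(images_common), rsp_common.shape)  # 1 (1, 2)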
Example #8
def run(args):
    # import pdb; pdb.set_trace()
    t0 = time()

    drg_set = Path(args.drg_set)
    scr_dir = Path(args.scr_dir).resolve()
    # fea_type = args.fea_type

    ID = 'TITLE'
    # ID = 'SMILES'

    par_jobs = int(args.par_jobs)
    assert par_jobs > 0, f"The arg 'par_jobs' must be int >0 (got {par_jobs})"

    if args.outdir is not None:
        outdir = Path(args.outdir).resolve()
    else:
        batch_name = scr_dir.parent.name
        outdir = Path(GOUT, batch_name).resolve()

    outfigs = outdir / 'figs'
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # Logger
    lg = Logger(outdir / 'gen.ml.data.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')

    print_fn(f'\nDocking files  {scr_dir}')
    # print_fn(f'Features dir   {FEA_DIR}')
    print_fn(f'Outdir         {outdir}')

    # ========================================================
    # Glob the docking files
    # ----------------------
    scr_dir = Path(scr_dir, drg_set).resolve()
    ## scr_file_pattern = '*4col.csv'  # V5.1
    ## scr_file_pattern = '*sorted*csv'  # V7.0
    scr_file_pattern = 'rec_*3col.csv'  # Tom's receptors
    scr_files = sorted(scr_dir.glob(scr_file_pattern))

    # ss = ['ADRP_6W02_A_1_H',
    #       'NSP10-16_6W61_AB_1_F',
    #       'NSP10-16_6W61_AB_2_F']

    # def fnc(f):
    #     for s in ss:
    #         if s in str(f):
    #             return True
    #     return False

    # scr_files = [f for f in scr_files if fnc(f)]

    # ========================================================
    # Load features
    # ------------------------------
    dd_names = None
    fps_names = None
    img_names = None

    if (args.dd_fpath is not None) and (args.dd_fpath.lower() != 'none'):
        dd_fea = load_data(args.dd_fpath)
        dd_names = dd_fea[ID].tolist()
        dd_fea = dd_fea.drop(columns='SMILES')
        # tmp = dd_fea.isna().sum(axis=0).sort_values(ascending=False)
        dd_fea = dd_fea.fillna(0)
    else:
        dd_fea = None
        dd_names = None

    if (args.fps_fpath is not None) and (args.fps_fpath.lower() != 'none'):
        fps_fea = load_data(args.fps_fpath)
        fps_names = fps_fea[ID].tolist()
        fps_fea = fps_fea.drop(columns='SMILES')
        # tmp = fps_fea.isna().sum(axis=0).sort_values(ascending=False)
        fps_fea = fps_fea.fillna(0)
    else:
        fps_fea = None
        fps_names = None

    if (args.img_fpath is not None) and (args.img_fpath.lower() != 'none'):
        # TODO
        pass
    else:
        img_fea = None
        img_names = None

    # ========================================================
    # Get the common samples (by ID)
    # ------------------------------
    """
    For each feature type (descriptors, fps, images), obtain the list
    of drug names for which the features are available. Also, get the
    intersect of drug names across the feature types. This is required
    for multimodal learning (we want to make sure that we have all the
    feature types for a compound).
    """
    # Union of TITLE names across all features types
    all_names = []
    for ii in [dd_names, fps_names, img_names]:
        if ii is not None:
            all_names.extend(list(ii))
    print_fn(
        f'Union of titles across all feature types: {len(set(all_names))}')

    # Intersect of TITLE names across all features types
    common_names = None
    for ii in [dd_names, fps_names, img_names]:
        if (common_names is not None) and (ii is not None):
            common_names = set(common_names).intersection(set(ii))
        elif (common_names is None) and (ii is not None):
            common_names = ii
    print_fn(
        f'Intersect of titles across all feature types: {len(set(common_names))}'
    )

    # Get TITLEs that are not available across all feature types
    bb_names = list(set(all_names).difference(set(common_names)))
    if len(bb_names) > 0:
        # TODO consider to dump these titles!
        print_fn(
            f'Difference of titles across all feature types: {len(set(bb_names))}'
        )

    # Retain the common samples in fea dfs
    if dd_fea is not None:
        dd_fea = dd_fea[dd_fea[ID].isin(
            common_names)]  # .reset_index(drop=True)
    if fps_fea is not None:
        fps_fea = fps_fea[fps_fea[ID].isin(
            common_names)]  # .reset_index(drop=True)

    # ========================================================
    kwargs = {
        'common_samples': common_names,
        # 'fea_type': fea_type,
        # 'drg_set': drg_set,
        'dd_fea': dd_fea,
        'fps_fea': fps_fea,
        'img_fea': img_fea,
        'ID': ID,
        'print_fn': print_fn,
        'outdir': outdir,
        'outfigs': outfigs,
        'baseline': args.baseline,
        'n_samples': args.n_samples,
        'n_top': args.n_top,
        # 'flatten': args.flatten,
        'sampling': args.sampling,
    }

    if par_jobs > 1:
        results = Parallel(n_jobs=par_jobs,
                           verbose=20)(delayed(gen_ml_data)(fpath=f, **kwargs)
                                       for f in scr_files)
    else:
        results = []  # dock summary including ML baseline scores
        for f in scr_files:
            res = gen_ml_data(fpath=f, **kwargs)
            results.append(res)

    results = [r for r in results if r is not None]
    results = np.round(pd.DataFrame(results), decimals=3)
    results = results.sort_values('target').reset_index(drop=True)
    results.to_csv(outdir / 'dock.ml.baseline.csv', index=False)

    # ========================================================
    if (time() - t0) // 3600 > 0:
        print_fn('\nRuntime: {:.1f} hrs'.format((time() - t0) / 3600))
    else:
        print_fn('\nRuntime: {:.1f} min'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
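A standalone version of the union/intersection bookkeeping described in the docstring above (the name lists are made up):

dd_names = ['a', 'b', 'c']
fps_names = ['b', 'c', 'd']
img_names = None

available = [set(s) for s in (dd_names, fps_names, img_names) if s is not None]
union_names = set().union(*available)
common_names = set.intersection(*available)
missing = union_names - common_names
print(len(union_names), len(common_names), sorted(missing))  # 4 2 ['a', 'd']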
Example #9
def run(args):
    # import ipdb; ipdb.set_trace(context=5)
    t0 = time()
    smiles_path = args.smiles_path
    id_name = args.id_name
    par_jobs = args.par_jobs
    fea_type = args.fea_type

    print('\nLoad SMILES.')
    smiles_path = Path(args.smiles_path)
    smi = pd.read_csv(smiles_path, sep='\t')

    smi = smi.astype({'SMILES': str, id_name: str})
    smi['SMILES'] = smi['SMILES'].map(lambda x: x.strip())
    smi[id_name] = smi[id_name].map(lambda x: x.strip())
    # n_smiles = smi.shape[0]
    fea_id0 = smi.shape[1]  # index of the first feature

    # Create Outdir
    # i1, i2 = args.i1, args.i2
    # ids_dir = 'smi.ids.{}-{}'.format(i1, i2)
    # if i2 is None:
    #     i2 = n_smiles
    # gout = Path(args.gout, ids_dir)
    gout = Path(args.gout)
    os.makedirs(gout, exist_ok=True)

    # Logger
    lg = Logger(gout / 'gen.fea.dfs.log')
    print_fn = get_print_func(lg.logger)
    print_fn(f'File path: {filepath}')
    print_fn(f'\n{pformat(vars(args))}')

    print_fn('\nInput data path  {}'.format(smiles_path))
    print_fn('Output data dir  {}'.format(gout))

    # Duplicates
    # dup = smi[ smi.duplicated(subset=['smiles'], keep=False) ].reset_index(drop=True)
    # print(dup['smiles'].value_counts())

    # Drop duplicates
    smi = drop_dup_rows(smi, print_fn)

    # Extract subset SMILES
    # smi = smi.iloc[i1:i2+1, :].reset_index(drop=True)

    print_fn('\nCanonicalize SMILES.')
    can_smi_vec = canon_smiles(smi['SMILES'], par_jobs=par_jobs)
    can_smi_vec = pd.Series(can_smi_vec)

    # Save bad SMILES to file (that were not canonicalized)
    nan_ids = can_smi_vec.isna()
    bad_smi = smi[nan_ids]
    if len(bad_smi) > 0:
        bad_smi.to_csv(gout / 'smi_canon_err.csv', index=False)

    # Keep the good (canonicalized) SMILES
    smi['SMILES'] = can_smi_vec
    smi = smi[~nan_ids].reset_index(drop=True)

    # ========================================================
    # Generate images
    # ---------------
    if 'images' in fea_type:
        images = smiles_to_images(smi,
                                  smi_col_name='SMILES',
                                  title_col_name=id_name,
                                  molSize=(128, 128),
                                  kekulize=True,
                                  par_jobs=par_jobs)
        # print(images[0].keys())
        # img_outpath = gout/f'images.ids.{i1}-{i2}.pkl'
        img_outpath = gout / 'images.pkl'

        # Dump images to file (list of dicts)
        pickle.dump(images, open(img_outpath, 'wb'))
        # Load pkl
        # aa = pickle.load(open(img_outpath, 'rb'))
        # sum(images[0]['img'].reshape(-1,)-aa[0]['img'].reshape(-1,))

    # ========================================================
    # Generate fingerprints
    # ---------------------
    if 'fps' in fea_type:

        def gen_fps_and_save(smi, radius=1, par_jobs=par_jobs):
            ecfp = smiles_to_fps(smi,
                                 smi_name='SMILES',
                                 radius=radius,
                                 par_jobs=par_jobs)
            ecfp = add_fea_prfx(ecfp, prfx=f'ecfp{2*radius}.', id0=fea_id0)
            # ecfp.to_parquet(gout/f'ecfp{2*radius}.ids.{i1}-{i2}.{file_format}')
            ecfp.to_parquet(gout / f'ecfp{2*radius}.parquet')
            ecfp.to_csv(gout / f'ecfp{2*radius}.csv', sep='\t', index=False)
            del ecfp

        gen_fps_and_save(smi, radius=1, par_jobs=par_jobs)
        gen_fps_and_save(smi, radius=2, par_jobs=par_jobs)
        gen_fps_and_save(smi, radius=3, par_jobs=par_jobs)

    # ========================================================
    # Generate descriptors
    # --------------------
    if 'descriptors' in fea_type:
        dd = smiles_to_mordred(smi,
                               smi_name='SMILES',
                               ignore_3D=args.ignore_3D,
                               par_jobs=par_jobs)
        dd = add_fea_prfx(dd, prfx='dd_', id0=fea_id0)

        # Filter NaNs (step 1)
        # Drop rows where all values are NaNs
        print_fn('\nDrop rows where all values are NaN.')
        print_fn('Shape: {}'.format(dd.shape))
        idx = (dd.isna().sum(axis=1) == dd.shape[1]).values
        dd = dd.iloc[~idx, :].reset_index(drop=True)
        # Drop cols where all values are NaNs
        # idx = ( dd.isna().sum(axis=0) == dd.shape[0] ).values
        # dd = dd.iloc[:, ~idx].reset_index(drop=True)
        print_fn('Shape: {}'.format(dd.shape))

        # Filter NaNs (step 2)
        # Drop rows based on a threshold of NaN values.
        # print(dd.isna().sum(axis=1).sort_values(ascending=False))
        # p=dd.isna().sum(axis=1).sort_values(ascending=False).hist(bins=100);
        th = 0.2
        print_fn('\nDrop rows with at least {:.0%} NaN values '
                 '(at least {} out of {} columns).'.format(
                     th, int(th * dd.shape[1]), dd.shape[1]))
        print_fn('Shape: {}'.format(dd.shape))
        dd = dropna(dd, axis=0, th=th)
        print_fn('Shape: {}'.format(dd.shape))

        # Cast features (descriptors)
        print_fn('\nCast descriptors to float.')
        dd = dd.astype({c: np.float32 for c in dd.columns[fea_id0:]})

        # Dump the count of NANs in each column
        aa = dd.isna().sum(axis=0).reset_index()
        aa = aa.rename(columns={'index': 'col', 0: 'count'})
        aa = aa.sort_values('count', ascending=False).reset_index(drop=True)
        aa.to_csv(gout / 'nan_count_per_col.csv', index=False)

        # Impute missing values
        if args.impute:
            print_fn('\nImpute NaNs.')
            print_fn('Total NaNs: {}'.format(dd.isna().values.flatten().sum()))
            dd = dd.fillna(0.0)
            print_fn('Total NaNs: {}'.format(dd.isna().values.flatten().sum()))

        # Save
        print_fn('\nSave.')
        dd = dd.reset_index(drop=True)
        fname = 'dd.mordred' + ('' if args.impute else '.with.nans')
        dd.to_parquet(gout / (fname + '.parquet'))
        dd.to_csv(gout / (fname + '.csv'), sep='\t', index=False)
        # dd.to_csv( gout/'dd.ids.{}-{}.{}'.format(i1, i2, file_format), index=False )

    # ======================================================
    print_fn('\nRuntime {:.1f} mins'.format((time() - t0) / 60))
    print_fn('Done.')
    lg.kill_logger()
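The project's dropna(dd, axis=0, th=...) helper is not shown on this page; a plain-pandas approximation that drops rows whose fraction of NaN values is at least th:

import numpy as np
import pandas as pd

def drop_rows_by_nan_fraction(df: pd.DataFrame, th: float = 0.2) -> pd.DataFrame:
    """Drop rows whose fraction of NaN values is >= th."""
    nan_frac = df.isna().sum(axis=1) / df.shape[1]
    return df.loc[nan_frac < th].reset_index(drop=True)

toy = pd.DataFrame({'a': [1, np.nan], 'b': [2, np.nan], 'c': [3, 4]})
print(drop_rows_by_nan_fraction(toy, th=0.5).shape)  # (1, 3)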
Example #10
def run(args):

    print("\nInput args:")
    pprint(vars(args))

    t0 = time()
    te_size = verify_size(args.te_size)
    datapath = Path(args.datapath).resolve()

    # Hard split
    # split_on = None if args.split_on is None else args.split_on.upper()
    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification)
    if cv_method == "strat":
        mltask = "cls"  # cast mltask to cls in case of stratification
    else:
        mltask = args.ml_task

    # Target column name
    trg_name = str(args.trg_name)
    # assert args.trg_name in data.columns, f'The prediction target ({args.name}) \
    #     was not found in the dataset.'

    # import ipdb; ipdb.set_trace()

    # -----------------------------------------------
    #       Create outdir
    # -----------------------------------------------
    if args.gout is not None:
        gout = Path(args.gout).resolve()
        sufx = "none" if args.split_on is None else args.split_on
        gout = gout / datapath.with_suffix(".splits").name
        gout = gout / f"split_on_{sufx}"
    else:
        # Note! useful for drug response
        sufx = "none" if args.split_on is None else args.split_on
        gout = datapath.with_suffix(".splits")

    outfigs = gout / "outfigs"
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    #       Create logger
    # -----------------------------------------------
    lg = Logger(gout / "data.splitter.log")
    print_fn = get_print_func(lg.logger)
    print_fn(f"File path: {fdir}")
    print_fn(f"\n{pformat(vars(args))}")
    dump_dict(vars(args), outpath=gout / "data.splitter.args.txt")

    # -----------------------------------------------
    #       Load data
    # -----------------------------------------------
    print_fn("\nLoad master dataset.")
    data = load_data(datapath)
    print_fn("data.shape {}".format(data.shape))

    # ydata = data[trg_name] if trg_name in data.columns else None
    # if (cv_method == "strat") and (ydata is None):
    #     raise ValueError("Prediction target column must be available if splits need to be stratified.")

    if (cv_method == "strat") and (trg_name not in data.columns):
        raise ValueError(
            "Prediction target column must be available if splits need to be stratified."
        )

    # if ydata is not None:
    #     plot_hist(ydata, title=f"{trg_name}", fit=None, bins=100,
    #               path=outfigs/f"{trg_name}_hist_all.png")

    if trg_name in data.columns:
        plot_hist(data[trg_name],
                  title=f"{trg_name}",
                  fit=None,
                  bins=100,
                  path=outfigs / f"{trg_name}_hist_all.png")

    # -----------------------------------------------
    #       Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn("\n{}".format("-" * 50))
    print_fn("Split data into hold-out train/val/test")
    print_fn("{}".format("-" * 50))

    kwargs = {
        "cv_method": cv_method,
        "te_method": te_method,
        "te_size": te_size,
        "mltask": mltask,
        "split_on": args.split_on
    }

    data_splitter(
        data=data,
        n_splits=args.n_splits,
        gout=gout,
        outfigs=outfigs,
        # ydata = ydata,
        target_name=trg_name,
        print_fn=print_fn,
        seed=seed,
        **kwargs)

    print_fn("Runtime: {:.1f} min".format((time() - t0) / 60))
    print_fn("Done.")
    lg.close_logger()
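A minimal invocation sketch for this splitter (the attribute names follow what run() reads; the values and dataset path are hypothetical, and the module-level seed used above is assumed to be defined):

from argparse import Namespace

args = Namespace(
    datapath='data/ml.df.parquet',  # hypothetical dataset path
    te_size=0.1,
    cv_method='simple',
    ml_task='reg',
    trg_name='reg',
    split_on=None,
    n_splits=10,
    gout=None,                      # default: create <datapath>.splits next to the data
)
run(args)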