Пример #1
0
    def test_Oracle(self):
        """Smoke-test the 'SA' and 'Hop' oracles on small batches of SMILES.

        Each oracle is constructed and called once; the returned values are
        not inspected, so the test only fails if construction or the call
        raises.
        """
        from tdc import Oracle

        sa_batch = [
            'CC(C)(C)[C@H]1CCc2c(sc(NC(=O)COc3ccc(Cl)cc3)c2C(N)=O)C1',
            'CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1',
            'C[C@@H]1CCN(C(=O)CCCc2ccccc2)C[C@@H]1O',
        ]
        hop_batch = ['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O']

        # Same order as before: build 'SA', call it, then build 'Hop', call it.
        for oracle_name, batch in (('SA', sa_batch), ('Hop', hop_batch)):
            scorer = Oracle(name=oracle_name)
            scorer(batch)
Пример #2
0
    def get(self, benchmark, num_max_call=5000):
        """Return the data (or docking oracle) for one benchmark of this group.

        Args:
            benchmark: Benchmark name; resolved against ``self.dataset_names``
                with ``fuzzy_search``.
            num_max_call: Forwarded to ``Oracle`` as ``num_max_call`` when this
                group is the docking group; unused otherwise.

        Returns:
            For ``self.name == 'docking_group'``:
            ``{'oracle': <Oracle>, 'name': <dataset>}``.
            Otherwise: ``{'train_val': <DataFrame>, 'test': <DataFrame>,
            'name': <dataset>}`` loaded from ``<self.path>/<dataset>/``.

        Raises:
            ValueError: If a non-docking group has a ``file_format`` other
                than ``'csv'`` or ``'pkl'`` (previously this surfaced as an
                UnboundLocalError on ``train``).
        """
        dataset = fuzzy_search(benchmark, self.dataset_names)
        data_path = os.path.join(self.path, dataset)

        if self.name == 'docking_group':
            # The receptor file lives next to the dataset folders; deriving it
            # here means the docking path no longer depends on file_format.
            target_pdb_file = os.path.join(self.path, dataset + '.pdb')
            oracle = Oracle(name="Docking_Score",
                            software="vina",
                            pyscreener_path=self.pyscreener_path,
                            receptors=[target_pdb_file],
                            center=docking_target_info[dataset]['center'],
                            size=docking_target_info[dataset]['size'],
                            buffer=10,
                            path=data_path,
                            num_worker=self.num_workers,
                            ncpu=self.num_cpus,
                            num_max_call=num_max_call)
            return {'oracle': oracle, 'name': dataset}

        if self.file_format == 'csv':
            train = pd.read_csv(os.path.join(data_path, 'train_val.csv'))
            test = pd.read_csv(os.path.join(data_path, 'test.csv'))
        elif self.file_format == 'pkl':
            train = pd.read_pickle(os.path.join(data_path, 'train_val.pkl'))
            test = pd.read_pickle(os.path.join(data_path, 'test.pkl'))
        else:
            raise ValueError(
                "Unsupported file format '%s' for benchmark group '%s'." %
                (self.file_format, self.name))

        return {'train_val': train, 'test': test, 'name': dataset}
Пример #3
0
    def __init__(self, name):
        """Create the TDC oracle selected by ``name`` and reset the docking counter.

        Names that do not contain "docking" (case-insensitive) are passed
        straight to ``Oracle``. ``docking_5wiu`` and ``docking_drd3`` build a
        'Docking_Score' oracle from hard-coded, machine-specific pyscreener
        paths. Any other name containing "docking" leaves ``self.oracle``
        unset. In every case the docking counter file is overwritten with 0.
        """
        ## DRD2  GSK3B  JNK3  cyp3a4_benchmark
        from tdc import Oracle
        self.name = name
        super().__init__(score_modifier=None)

        lowered = self.name.lower()
        if 'docking' not in lowered:  ### drd2 gsk3 JNK3
            self.oracle = Oracle(name=self.name)
        elif lowered == 'docking_5wiu':
            # The search box is derived from a docked ligand file here.
            settings_5wiu = dict(
                name='Docking_Score',
                software='vina',
                pyscreener_path='/project/molecular_data/graphnn/pyscreener',
                receptors=[
                    '/project/molecular_data/graphnn/pyscreener/testing_inputs/5WIU.pdb'
                ],
                docked_ligand_file=
                '/project/molecular_data/graphnn/pyscreener/testing_inputs/5WIU_with_ligand.pdb',
                buffer=10,
                path='/project/molecular_data/graphnn/pyscreener/my_test/',
                num_worker=1,
                ncpu=4,
            )
            self.oracle = Oracle(**settings_5wiu)
        elif lowered == 'docking_drd3':
            # The search box is given explicitly via center/size here.
            settings_drd3 = dict(
                name='Docking_Score',
                software='vina',
                pyscreener_path='/project/molecular_data/graphnn/pyscreener',
                receptors=[
                    '/project/molecular_data/graphnn/pyscreener/testing_inputs/DRD3.pdb'
                ],
                center=(9, 22.5, 26),
                size=(15, 15, 15),
                buffer=10,
                path='/project/molecular_data/graphnn/pyscreener/my_test/',
                num_worker=1,
                ncpu=10,
            )
            self.oracle = Oracle(**settings_drd3)

        counter_file = "/project/molecular_data/graphnn/pyscreener/docking_num.txt"
        self.docking_num_file = counter_file
        write_num(counter_file, 0)
        print('----------initialize docking_num_file-------------')
Пример #4
0
    def __next__(self):
        """Return the next dataset of the group, advancing ``self.index``.

        Returns:
            For ``self.name == 'docking_group'``:
            ``{'oracle': <Oracle>, 'name': <dataset>}``.
            Otherwise: ``{'train_val': <DataFrame>, 'test': <DataFrame>,
            'name': <dataset>}`` loaded from ``<self.path>/<dataset>/``.

        Raises:
            StopIteration: Once every name in ``self.dataset_names`` has been
                visited.
            ValueError: If a non-docking group has a ``file_format`` other
                than ``'csv'`` or ``'pkl'`` (previously this surfaced as an
                UnboundLocalError on ``train``).
        """
        if self.index >= self.num_datasets:
            raise StopIteration

        dataset = self.dataset_names[self.index]
        print_sys('--- ' + dataset + ' ---')

        data_path = os.path.join(self.path, dataset)
        if not os.path.exists(data_path):
            os.mkdir(data_path)

        if self.name == 'docking_group':
            # The receptor file lives next to the dataset folders; deriving it
            # here means the docking path no longer depends on file_format.
            target_pdb_file = os.path.join(self.path, dataset + '.pdb')
            self.index += 1
            oracle = Oracle(name="Docking_Score",
                            software="vina",
                            pyscreener_path=self.pyscreener_path,
                            receptors=[target_pdb_file],
                            center=docking_target_info[dataset]['center'],
                            size=docking_target_info[dataset]['size'],
                            buffer=10,
                            path=data_path,
                            num_worker=self.num_workers,
                            ncpu=self.num_cpus,
                            num_max_call=self.num_max_call)
            return {'oracle': oracle, 'name': dataset}

        if self.file_format == 'csv':
            train = pd.read_csv(os.path.join(data_path, 'train_val.csv'))
            test = pd.read_csv(os.path.join(data_path, 'test.csv'))
        elif self.file_format == 'pkl':
            train = pd.read_pickle(os.path.join(data_path, 'train_val.pkl'))
            test = pd.read_pickle(os.path.join(data_path, 'test.pkl'))
        else:
            raise ValueError(
                "Unsupported file format '%s' for benchmark group '%s'." %
                (self.file_format, self.name))

        # Only advance once the files for this dataset were read successfully.
        self.index += 1
        return {'train_val': train, 'test': test, 'name': dataset}
Пример #5
0
from tdc import Oracle
# Build the TDC oracle registered under the name 'isomers_c7h8n2o2' and print
# what it returns for a batch of two SMILES strings.
oracle = Oracle(name='isomers_c7h8n2o2')
print(oracle(['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O']))
Пример #6
0
from absl import app
from absl import flags
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from tensorflow.compat.v1 import gfile

from dqn import deep_q_networks
from dqn import molecules as molecules_mdp
from dqn import run_dqn
from dqn.py import molecules
from dqn.tensorflow_core import core


from tdc import Oracle
# Module-level oracle instances, created once at import time and shared by the
# scoring helpers in this module.
qed = Oracle(name = 'qed')
logp = Oracle(name = 'logp')
jnk = Oracle(name = 'JNK3')
gsk = Oracle(name = 'GSK3B')



from scipy.stats import gmean

def logp_modifier(logp_score):
    """Map a logP value linearly to ``(logp_score + 10) / 14``, clamped to [0, 1]."""
    rescaled = 1/14*(logp_score+10)
    capped = min(1.0, rescaled)
    return max(0.0, capped)


def qed_logp_jnk_gsk_fusion(qed_score, logp_score, jsn_score, gsk_score):
    # Fuse four property scores: the logP score is first rescaled with
    # logp_modifier, then the geometric mean of all four values is taken.
    # NOTE(review): `jsn_score` is presumably the JNK3 score (name looks like a
    # typo) -- confirm against callers before renaming.
    # NOTE(review): no return statement is visible; the snippet looks truncated.
    # As shown, the function computes gmean_score and returns None.
    logp_score = logp_modifier(logp_score)
    gmean_score = gmean([qed_score, logp_score, jsn_score, gsk_score])
Пример #7
0
 def test_Oracle(self):
     """Print the output of the 'Hop' oracle for two sample SMILES strings."""
     from tdc import Oracle
     hop_oracle = Oracle(name='Hop')
     smiles_batch = ['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O']
     scores = hop_oracle(smiles_batch)
     print(scores)
Пример #8
0
 def test_Oracle(self):
     """Print the output of the 'celecoxib rediscovery' oracle for two SMILES strings."""
     from tdc import Oracle
     rediscovery_oracle = Oracle(name='celecoxib rediscovery')
     smiles_batch = ['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O']
     scores = rediscovery_oracle(smiles_batch)
     print(scores)
Пример #9
0
from tdc import Oracle
# Build the TDC oracle registered under the name 'aripiprazole_similarity' and
# print what it returns for a batch of two SMILES strings.
oracle = Oracle(name='aripiprazole_similarity')
print(oracle(['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O']))
Пример #10
0
def main():
    """Parse CLI options, build the selected scoring function and run the optimizer.

    Command-line flags:
        --oracle_num: integer passed to ``optimization`` as ``oracle_num``.
        --oracle_name: which objective to optimise. 'jnkgsk' and 'qedsajnkgsk'
            are arithmetic means of the listed component scores; the other
            choices use a single oracle. 'logp' is now an accepted choice --
            it already had a scoring branch, but argparse rejected it.
        --generations, --population_size: passed through to ``optimization``.

    The GNN is loaded from a fixed checkpoint path and moved to CPU, and the
    result pickle path is "result/<oracle_name>.pkl".
    """
    # Keep this tuple and the `scorers` table below in sync.
    oracle_names = ('jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk', 'logp')

    parser = argparse.ArgumentParser()
    parser.add_argument('--oracle_num', type=int, default=1500)
    parser.add_argument('--oracle_name',
                        type=str,
                        default="qed",
                        choices=list(oracle_names))
    parser.add_argument('--generations', type=int, default=50)
    parser.add_argument('--population_size', type=int, default=20)
    args = parser.parse_args()

    oracle_num = args.oracle_num
    oracle_name = args.oracle_name
    generations = args.generations
    population_size = args.population_size

    start_smiles_lst = ['C1(N)=NC=CC=N1']  ## 'C1=CC=CC=C1NC2=NC=CC=N2'
    qed = Oracle('qed')
    sa = Oracle('sa')
    jnk = Oracle('JNK3')
    gsk = Oracle('GSK3B')
    logp = Oracle('logp')
    mu = 2.230044
    sigma = 0.6526308

    def normalize_sa(smiles):
        # SA scores at or below mu map to 1.0; larger scores decay along a
        # Gaussian curve with width sigma.
        sa_score = sa(smiles)
        mod_score = np.maximum(sa_score, mu)
        return np.exp(-0.5 * np.power((mod_score - mu) / sigma, 2.))

    # Dispatch table: objective name -> callable taking SMILES input.
    scorers = {
        'jnkgsk':
        lambda smiles: np.mean((jnk(smiles), gsk(smiles))),
        'qedsajnkgsk':
        lambda smiles: np.mean(
            (qed(smiles), normalize_sa(smiles), jnk(smiles), gsk(smiles))),
        'qed':
        lambda smiles: qed(smiles),
        'jnk':
        lambda smiles: jnk(smiles),
        'gsk':
        lambda smiles: gsk(smiles),
        'logp':
        lambda smiles: logp(smiles),
    }
    oracle = scorers[oracle_name]

    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = 'cpu'  ## cpu is better
    model_ckpt = "save_model/GNN_epoch_0_validloss_1.61160.ckpt"
    gnn = torch.load(model_ckpt)
    gnn.switch_device(device)

    result_pkl = "result/" + oracle_name + ".pkl"
    optimization(start_smiles_lst,
                 gnn,
                 oracle,
                 oracle_num,
                 oracle_name,
                 generations=generations,
                 population_size=population_size,
                 lamb=2,
                 topk=5,
                 epsilon=0.7,
                 result_pkl=result_pkl)
Пример #11
0
    def test_Oracle(self):
        """Smoke-test a series of TDC oracles on fixed SMILES batches.

        Every oracle is constructed and called in a fixed order; return
        values are discarded, so the test only fails if something raises.
        """
        # Molecule Generation Oracles

        from tdc import Oracle

        def four_molecules():
            # A fresh list per call, mirroring the separate literals used
            # for each oracle before.
            return [
                'CC(C)(C)[C@H]1CCc2c(sc(NC(=O)COc3ccc(Cl)cc3)c2C(N)=O)C1',
                'C[C@@H]1CCc2c(sc(NC(=O)c3ccco3)c2C(N)=O)C1',
                'CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1',
                'C[C@@H]1CCN(C(=O)CCCc2ccccc2)C[C@@H]1O',
            ]

        def two_molecules():
            return ['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O']

        def run(oracle_name, make_batch):
            scorer = Oracle(name=oracle_name)
            scorer(make_batch())

        for oracle_name in ('GSK3B', 'DRD2'):
            run(oracle_name, four_molecules)

        for oracle_name in ('Hop', 'Valsartan_SMARTS', 'Rediscovery'):
            run(oracle_name, two_molecules)

        for oracle_name in ('SA', 'Uniqueness'):
            run(oracle_name, four_molecules)

        # Novelty takes two lists; the same list object is passed for both.
        novelty = Oracle(name='Novelty')
        batch = four_molecules()
        novelty(batch, batch)

        run('Diversity', four_molecules)
        run('Scaffold Hop', two_molecules)
Пример #12
0
 def test_Oracle(self):
     """Print the output of the 'isomers_c7h8n2o2' oracle for two SMILES strings."""
     from tdc import Oracle
     isomer_oracle = Oracle(name='isomers_c7h8n2o2')
     smiles_batch = ['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O']
     scores = isomer_oracle(smiles_batch)
     print(scores)
from tdc import Oracle
# Build the TDC oracle registered under the name 'celecoxib rediscovery' and
# print what it returns for a batch of two SMILES strings.
oracle = Oracle(name = 'celecoxib rediscovery')
print(oracle(['CC(=O)OC1=CC=CC=C1C(=O)O',
       'C1=CC=C(C=C1)C=O']))

Пример #14
0
from tdc import Oracle
# Build the TDC oracle registered under the name 'Hop' and print what it
# returns for a batch of two SMILES strings.
oracle = Oracle(name = 'Hop')
print(oracle(['CC(=O)OC1=CC=CC=C1C(=O)O',
       'C1=CC=C(C=C1)C=O']))

Пример #15
0
 def test_Oracle(self):
     """Print the output of the 'aripiprazole_similarity' oracle for two SMILES strings."""
     from tdc import Oracle
     similarity_oracle = Oracle(name='aripiprazole_similarity')
     smiles_batch = ['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O']
     scores = similarity_oracle(smiles_batch)
     print(scores)
Пример #16
0
from guacamol.goal_directed_generator import GoalDirectedGenerator
# from guacamol.scoring_function import ScoringFunction
from guacamol.utils.chemistry import canonicalize_list, canonicalize
from joblib import delayed

from smiles_lstm_hc.rnn_generator import SmilesRnnMoleculeGenerator
from smiles_lstm_hc.rnn_utils import load_rnn_model

from tdc import Oracle
# 'Docking_Score' oracle for the DRD3 receptor file, created once at import
# time. All paths are absolute and machine-specific; the search box is given
# explicitly through `center` and `size`.
drd3_oracle = Oracle(
    name='Docking_Score',
    software='vina',
    pyscreener_path='/project/molecular_data/graphnn/pyscreener',
    receptors=[
        '/project/molecular_data/graphnn/pyscreener/testing_inputs/DRD3.pdb'
    ],
    center=(9, 22.5, 26),
    size=(15, 15, 15),
    buffer=10,
    path='/project/molecular_data/graphnn/pyscreener/my_test/',
    num_worker=1,
    ncpu=10)

# NOTE: a `global` statement at module scope has no effect; `oracle_num` is an
# ordinary module-level variable initialised to zero.
global oracle_num
oracle_num = 0


def drd3_docking_oracle(smiles):
    """Return the negated DRD3 docking score divided by 15, clamped to [0, 1]."""
    scaled = -drd3_oracle(smiles) / 15.0
    floored = max(scaled, 0)
    return min(floored, 1)
Пример #17
0
    def evaluate(self,
                 pred,
                 true=None,
                 benchmark=None,
                 criteria='all',
                 m1_api=None):
        """Score predictions for this benchmark group.

        The mode is chosen from the arguments:

        * ``self.name == 'docking_group'``: ``pred`` maps dataset names to
          lists of exactly 100 SMILES strings. Each list is docked and, per
          ``criteria``, additionally scored for synthesizability ('m1'),
          filter pass rate, diversity, validity and uniqueness.
        * ``true is None``: test-set evaluation. ``pred`` maps dataset names
          to predictions, which are compared with the ``Y`` column of the
          stored test split.
        * otherwise: validation-set evaluation of ``pred`` against ``true``
          with the standard metric of ``benchmark``.

        Args:
            pred: Dict of predictions keyed by dataset name (docking and
                test-set modes), or the raw predictions (validation mode).
            true: Ground-truth values for validation mode; ``None`` selects
                test-set mode.
            benchmark: Benchmark name; required in validation mode only.
            criteria: Docking mode only. 'all', 'none', or an iterable drawn
                from 'm1', 'filters', 'diversity', 'validity', 'uniqueness'.
            m1_api: API token for the 'Molecule One Synthesis' oracle;
                required when 'm1' is among the criteria.

        Returns:
            Docking mode: ``{dataset: {result name: value}}``.
            Test-set mode: ``{dataset: {metric name: rounded score}}``, plus
            ``<dataset>_<class>`` entries when the test split has a
            ``target_class`` column.
            Validation mode: ``{metric name: rounded score}``.

        Raises:
            ValueError: Wrong number of molecules, unsupported criteria,
                missing ``m1_api``, missing ``benchmark`` in validation mode,
                or an unsupported ``file_format`` in test-set mode.
        """

        if self.name == 'docking_group':
            results_all = {}
            all_criteria = [
                'm1', 'filters', 'diversity', 'validity', 'uniqueness'
            ]

            for data_name, pred_ in pred.items():

                results = {}

                ## pred is a list of smiles strings
                if len(pred_) != 100:
                    raise ValueError(
                        "The expected output is a list of top 100 molecules!")

                # Validate the cheap arguments before the expensive docking
                # run. Normalising `criteria` is idempotent, so repeating it on
                # later iterations is harmless.
                if criteria == 'all':
                    criteria = all_criteria
                elif criteria == 'none':
                    criteria = []
                elif any(c not in all_criteria for c in criteria):
                    # there is at least one criteria does not match the supported evaluation
                    raise ValueError(
                        "Please select the criteria from a list of 'm1', 'filters', 'diversity', 'validity', 'uniqueness'!"
                    )

                if 'm1' in criteria and m1_api is None:
                    raise ValueError(
                        "Please input the m1_api token in the evaluate function call! You can obtain it via: https://tdcommons.ai/functions/oracles/#moleculeone"
                    )

                # Resolve each entry by its own key (as the test-set branch
                # does) rather than by the single `benchmark` argument.
                dataset = fuzzy_search(data_name, self.dataset_names)
                # Previously undefined here (NameError); same layout as used
                # when the oracle is handed out by get()/__next__.
                data_path = os.path.join(self.path, dataset)

                # docking scores for the top K smiles (K <= 100)
                target_pdb_file = os.path.join(self.path, dataset + '.pdb')

                oracle = Oracle(name="Docking_Score",
                                software="vina",
                                pyscreener_path=self.pyscreener_path,
                                receptors=[target_pdb_file],
                                center=docking_target_info[dataset]['center'],
                                size=docking_target_info[dataset]['size'],
                                buffer=10,
                                path=data_path,
                                num_worker=self.num_workers,
                                ncpu=self.num_cpus,
                                num_max_call=10000)

                docking_scores = oracle(pred_)
                results['docking_scores_dict'] = docking_scores
                values = np.array(list(docking_scores.values()))
                results['AVG_Top100'] = np.mean(values)
                results['AVG_Top10'] = np.mean(sorted(values)[:10])
                # NOTE(review): AVG_Top10 averages the 10 smallest scores while
                # Top1 takes the largest -- confirm which direction is "best".
                results['Top1'] = max(values)

                if 'm1' in criteria:
                    m1 = Oracle(name='Molecule One Synthesis',
                                api_token=m1_api)
                    m1_scores = m1(pred_)
                    scores_array = list(m1_scores.values())
                    results['m1_scores_dict'] = m1_scores
                    results['AVG_m1_scores'] = np.mean(scores_array)
                    ## TODO: how good is the m1 score? ask stan; 0.5 placeholder
                    results['AVG_docking_scores_synthesizable'] = np.mean([
                        docking_scores[i] for i, j in m1_scores.items()
                        if j > 0.5
                    ])

                if 'filters' in criteria:
                    from tdc.chem_utils import MolFilter
                    ## TODO: select an optimal set of filters. test a bit.
                    filters = MolFilter(filters=['PAINS'], HBD=[0, 6])
                    pred_filter = filters(pred_)
                    results['pass_filter_smiles_list'] = pred_filter
                    results['unfiltered_fractions'] = float(
                        len(pred_filter)) / 100
                    results['AVG_docking_scores_unfiltered'] = np.mean(
                        [docking_scores[i] for i in pred_filter])

                # The three set-level metrics share one shape: result key ->
                # Evaluator name.
                for key, metric_name in (('diversity', 'Diversity'),
                                         ('validity', 'Validity'),
                                         ('uniqueness', 'Uniqueness')):
                    if key in criteria:
                        from tdc import Evaluator
                        evaluator = Evaluator(name=metric_name)
                        results[key] = evaluator(pred_)

                # Previously keyed by the undefined name `dataset_name`.
                results_all[dataset] = results
            return results_all

        # NOTE(review): eval() is kept in the branches below on purpose. The
        # function-level `from tdc import Evaluator` above makes `Evaluator` a
        # local name for this whole function, so a direct call here would hit
        # an unbound local; eval() falls back to the module globals instead
        # (presumably a module-level Evaluator import -- confirm). The
        # evaluated text comes from bm_metric_names, not from user input.
        if true is None:
            # test set evaluation
            metric_dict = bm_metric_names[self.name]
            out = {}
            for data_name, pred_ in pred.items():
                data_name = fuzzy_search(data_name, self.dataset_names)
                data_path = os.path.join(self.path, data_name)
                if self.file_format == 'csv':
                    test = pd.read_csv(os.path.join(data_path, 'test.csv'))
                elif self.file_format == 'pkl':
                    test = pd.read_pickle(os.path.join(data_path, 'test.pkl'))
                else:
                    raise ValueError(
                        "Unsupported file format '%s' for benchmark group '%s'."
                        % (self.file_format, self.name))
                y = test.Y.values
                evaluator = eval('Evaluator(name = \'' +
                                 metric_dict[data_name] + '\')')
                out[data_name] = {
                    metric_dict[data_name]: round(evaluator(y, pred_), 3)
                }

                # If reporting accuracy across target classes
                if 'target_class' in test.columns:
                    test['pred'] = pred_
                    for c in test['target_class'].unique():
                        data_name_subset = data_name + '_' + c
                        test_subset = test[test['target_class'] == c]
                        y_subset = test_subset.Y.values
                        pred_subset = test_subset.pred.values

                        evaluator = eval('Evaluator(name = \'' +
                                         metric_dict[data_name_subset] + '\')')
                        out[data_name_subset] = {
                            metric_dict[data_name_subset]:
                            round(evaluator(y_subset, pred_subset), 3)
                        }
            return out
        else:
            # validation set evaluation
            if benchmark is None:
                raise ValueError(
                    'Please specify the benchmark name for us to retrieve the standard metric!'
                )
            data_name = fuzzy_search(benchmark, self.dataset_names)
            metric_dict = bm_metric_names[self.name]
            evaluator = eval('Evaluator(name = \'' + metric_dict[data_name] +
                             '\')')
            return {metric_dict[data_name]: round(evaluator(true, pred), 3)}
Пример #18
0
sys.path.append("..")
import os
import json
import numpy as np
import pandas as pd
import functools
from dqn import molecules
from dqn import deep_q_networks
from dqn.py.SA_Score import sascorer
from chemutil import similarity

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors, QED

from tdc import Oracle
# Module-level TDC oracle registered under the name 'qed', created at import time.
qed_oracle = Oracle(name='qed')

# import matplotlib.pyplot as plt
import tensorflow as tf
from pathlib import Path


def latest_ckpt(path):
    """Return the highest checkpoint number among the entries of ``path``.

    An entry counts when its stem starts with 'ckpt'; its number is the
    integer between the first and second '-' of the stem (e.g. 'ckpt-200').
    Raises ValueError if no entry qualifies.
    """
    numbers = []
    for entry in path.iterdir():
        stem = entry.stem
        if stem.startswith('ckpt'):
            numbers.append(int(stem.split('-')[1]))
    return max(numbers)


# basepath = '/Users/odin/sherlock_scratch/moldqn2/target_sas/mol%i_target_%.1f'
# Relative directory name "save_qed", resolved against the working directory.
path = Path("save_qed")
Пример #19
0
# Look up the sequence for the identifier 'P49122' via uniprot2seq.
seq = uniprot2seq('P49122')

# data split

# Load the 'Caco2_Wang' ADME dataset and request a scaffold split.
from tdc.single_pred import ADME
data = ADME(name='Caco2_Wang')
split = data.get_split(method='scaffold')

# Load the 'DAVIS' DTI dataset and request a cold split on the 'Drug' column.
# Note that `data` and `split` are rebound here, replacing the ADME objects.
from tdc.multi_pred import DTI
data = DTI(name='DAVIS')
split = data.get_split(method='cold_split', column_name='Drug')

# Molecule Generation Oracles

# Each oracle below is built by name and called on a list of SMILES strings;
# the return values are discarded.
from tdc import Oracle
oracle = Oracle(name='GSK3B')
smiles_lst = ['CC(C)(C)[C@H]1CCc2c(sc(NC(=O)COc3ccc(Cl)cc3)c2C(N)=O)C1', \
     'C[C@@H]1CCc2c(sc(NC(=O)c3ccco3)c2C(N)=O)C1', \
     'CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1', \
     'C[C@@H]1CCN(C(=O)CCCc2ccccc2)C[C@@H]1O']
oracle(smiles_lst)

oracle = Oracle(name='DRD2')
smiles_lst = ['CC(C)(C)[C@H]1CCc2c(sc(NC(=O)COc3ccc(Cl)cc3)c2C(N)=O)C1', \
     'C[C@@H]1CCc2c(sc(NC(=O)c3ccco3)c2C(N)=O)C1', \
     'CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1', \
     'C[C@@H]1CCN(C(=O)CCCc2ccccc2)C[C@@H]1O']
oracle(smiles_lst)

oracle = Oracle(name='Hop')
oracle(['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O'])