예제 #1
0
 def __init__(
     self,
     name: str,
     split="train",
     path="./data",
     mode="cnn_cnn",
     y_log=True,
     drug_transform=None,
     protein_transform=None,
 ):
     """Load one split of a TDC DTI dataset and remember the encoding options.

     Args:
         name (str): TDC dataset name passed to ``DTI``.
         split (str): Key used to pick one entry from ``get_split()``.
         path (str): Dataset download/local load path handed to ``DTI``.
         mode (str): Encoding mode; stored lower-cased.
         y_log (bool): When true, ``convert_to_log()`` is called before splitting.
         drug_transform: Optional transform stored for later use on drugs.
         protein_transform: Optional transform stored for later use on proteins.
     """
     loader = DTI(name=name, path=path)
     self.mode = mode.lower()
     if y_log:
         # Conversion happens on the full dataset, before a split is selected.
         loader.convert_to_log()
     self.data = loader.get_split()[split]
     self.drug_transform, self.protein_transform = (
         drug_transform,
         protein_transform,
     )
예제 #2
0
class BindingDBDataset(data.Dataset):
    """
    A custom dataset for loading and processing original TDC data, which is used as input data in DeepDTA model.

    Args:
         name (str): TDC dataset name.
         split (str): Data split type (train, valid or test).
         path (str): dataset download/local load path (default: "./data")
         mode (str): encoding mode in the form "<drug>_<protein>" (default: cnn_cnn)
         drug_transform: Transform operation applied to each drug (default: None)
         protein_transform: Transform operation applied to each protein (default: None)
         y_log (bool): Whether convert y values to log space. (default: True)
    """
    def __init__(
        self,
        name: str,
        split="train",
        path="./data",
        mode="cnn_cnn",
        y_log=True,
        drug_transform=None,
        protein_transform=None,
    ):
        self.data = DTI(name=name, path=path)
        self.mode = mode.lower()
        if y_log:
            # Convert labels on the full dataset before a split is selected.
            self.data.convert_to_log()
        self.data = self.data.get_split()[split]
        self.drug_transform = drug_transform
        self.protein_transform = protein_transform

    def __len__(self):
        """Return the number of entries in the selected split."""
        return len(self.data)

    @staticmethod
    def _apply_transform(transform, value):
        """Return ``transform(value)``, or ``value`` when no transform is set.

        A transform returning ``None`` is treated as having modified ``value``
        in place, so in-place transforms keep working as before.
        """
        if transform is None:
            return value
        transformed = transform(value)
        return value if transformed is None else transformed

    def __getitem__(self, idx):
        """Return the ``(drug, protein, label)`` triple stored at ``idx``.

        When the drug/protein part of ``mode`` is "cnn", the corresponding
        entry is integer-encoded and wrapped in a ``torch.LongTensor``;
        otherwise it is passed through unchanged. The label is returned as a
        one-element ``torch.Tensor``. The optional transforms are applied
        last and their results are what gets returned.
        """
        drug = self.data["Drug"][idx]
        protein = self.data["Target"][idx]
        label = self.data["Y"][idx]

        mode_drug, mode_protein = self.mode.split("_")
        if mode_drug == "cnn":
            drug = torch.LongTensor(integer_label_smiles(drug))
        if mode_protein == "cnn":
            protein = torch.LongTensor(integer_label_protein(protein))
        label = torch.Tensor([label])

        # The transform results used to be discarded, which made any
        # non-in-place transform a silent no-op.
        drug = self._apply_transform(self.drug_transform, drug)
        protein = self._apply_transform(self.protein_transform, protein)
        return drug, protein, label
예제 #3
0
def main():
    """Train and evaluate a model on a cluster-based split of a TDC DTI dataset.

    The run is driven by the config file given on the command line: the
    dataset name, clustering settings, batch sizes and epoch count are all
    read from ``cfg``. The best checkpoint is tracked on minimum ``val_loss``.
    """
    print("Start..")
    args = arg_parse()

    # ---- set configs, logger and device ----
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.cfg)
    cfg.freeze()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Fetch the bindingDB dataset based on the name defined in config file
    bindingdb_dataset = DTI(name=cfg.DATASET.NAME)

    # Apply clustering; cluster_type can be k_means, agglomerative or dbscan
    bindingdb_dataset_cluster = apply_clustering(
        bindingdb_dataset,
        num_of_clusters=cfg.SOLVER.NUM_OF_CLUSTERS,
        cluster_type=cfg.MODEL.CLUSTER_TYPE)

    # Split the data based on the clusters formed
    train_dataset, val_dataset, test_dataset = get_split_by_clusters(
        bindingdb_dataset_cluster, num_of_clusters=cfg.SOLVER.NUM_OF_CLUSTERS)

    train_dataset = DTADataset(ds=train_dataset)
    val_dataset = DTADataset(ds=val_dataset)
    test_dataset = DTADataset(ds=test_dataset)

    # Only the training loader is shuffled: evaluation loaders are kept in a
    # fixed order so validation/test runs are reproducible and Lightning does
    # not warn about shuffled evaluation data.
    train_loader = DataLoader(dataset=train_dataset,
                              shuffle=True,
                              batch_size=cfg.SOLVER.TRAIN_BATCH_SIZE)
    val_loader = DataLoader(dataset=val_dataset,
                            shuffle=False,
                            batch_size=cfg.SOLVER.TEST_BATCH_SIZE)
    test_loader = DataLoader(dataset=test_dataset,
                             shuffle=False,
                             batch_size=cfg.SOLVER.TEST_BATCH_SIZE)

    # ---- set model ----
    model = get_model(cfg)

    # ---- training and evaluation ----
    gpus = 1 if device == "cuda" else 0
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode="min")
    trainer = pl.Trainer(max_epochs=cfg.SOLVER.MAX_EPOCHS,
                         gpus=gpus,
                         callbacks=[checkpoint_callback])
    # NOTE(review): the keyword names below (train_dataloader,
    # test_dataloaders, gpus) match older pytorch-lightning releases; confirm
    # against the pinned version before upgrading.
    trainer.fit(model,
                train_dataloader=train_loader,
                val_dataloaders=val_loader)
    trainer.test(test_dataloaders=test_loader)

    print("Done!!!")
예제 #4
0
    def test_to_graph(self):
        """Call ``DTI.to_graph`` on DAVIS once per output format (no assertions).

        Per the notes kept from the original example:
          * 'edge_list' -> {'edge_list': array of shape (X, 2),
            'neg_edges': array of shape (X, 2),
            'split': {'train': df, 'valid': df, 'test': df}}
          * 'dgl' -> {'dgl_graph': the DGL graph object,
            'index_to_entities': a dict map from ID in the data to node ID in
            the DGL object, 'split': {'train': df, 'valid': df, 'test': df}}
        """
        from tdc.multi_pred import DTI

        davis = DTI(name='DAVIS')
        for graph_format in ('edge_list', 'dgl', 'pyg'):
            # Arguments are rebuilt on every pass so each call receives its
            # own fresh ``frac`` list, exactly as in three separate calls.
            davis.to_graph(threshold=30,
                           format=graph_format,
                           split=True,
                           frac=[0.7, 0.1, 0.2],
                           seed=42,
                           order='descending')
예제 #5
0
    def test_multi_pred(self):
        """Request a cold split of DAVIS keyed on the 'Drug' column (no assertions)."""
        from tdc.multi_pred import DTI

        davis = DTI(name='DAVIS')
        # The returned split is not inspected; the call itself is the test.
        davis.get_split(method='cold_split', column_name='Drug')
예제 #6
0
 def test_DTI(self):
     """Load the DAVIS DTI dataset and request its default split (no assertions)."""
     from tdc.multi_pred import DTI

     davis = DTI(name='DAVIS')
     # The returned split is not inspected; the call itself is the test.
     davis.get_split()
예제 #7
0
from tdc.multi_pred import DTI
import numpy as np
from rdkit.Chem import AllChem
from rdkit import Chem

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt
import math

# Dataset used by the rest of this script. To run on a different TDC DTI
# dataset, swap this line for one of the commented alternatives below.
data = DTI(name = 'BindingDB_Kd')
# data = DTI(name = 'DAVIS')
# data = DTI(name = 'KIBA')

def drugTarget2vec(data):
    """Attach a Morgan-fingerprint bit vector to each row of a DTI dataset.

    ``data`` must expose ``get_data()`` returning a table with the columns
    'Drug_ID', 'Drug', 'Target_ID', 'Target' and 'Y'. The 'Drug' values are
    parsed as SMILES strings.
    """
    data = data.get_data()
    
    # Keep only the columns used below and add two placeholder columns.
    # NOTE(review): this is a column selection of ``data``, not an explicit
    # copy - confirm that writes to it behave as intended with the pandas
    # version in use.
    data_selected = data[['Drug_ID','Drug','Target_ID','Target','Y']] 
    data_selected['Drug2vector'] = ''
    data_selected['TargetId'] = ''
    
    # use rdkit calculate ECFPs
    for ind, drug in enumerate(data_selected['Drug']):
        # NOTE(review): presumably MolFromSmiles can fail on an unparsable
        # SMILES; no check is made here - confirm the input is clean.
        mol = Chem.MolFromSmiles(drug)
        Morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2) 
        # ExplicitBitVects, which record whether or not a bit exists, are usually faster than SparseBitVects, but take up more memory, similar to fixed-length bit strings.
        # Turn the fingerprint's bit string into one int per character.
        intmap = map(int, Morgan_fp.ToBitString())
        # NOTE(review): chained indexing (column, then row label) - pandas may
        # apply this to a temporary; verify the value is actually stored.
        # ``ind`` is the enumerate position, so this assumes a 0..n-1 index.
        data_selected['Drug2vector'][ind] = np.array(list(intmap))
    
    # create a dict to record the mapping between Target and TargetID
예제 #8
0
# import modules
import numpy as np
from tdc.multi_pred import DTI
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

# Load BindingDB Kd, convert the labels to log space and draw a seeded
# random train/valid/test split.
data_Kd = DTI(name='BindingDB_Kd')
data_Kd.convert_to_log(form='binding')
split = data_Kd.get_split(method='random', seed=42, frac=[0.6, 0.05, 0.35])

train, test = split['train'], split['test']
print('Data loaded')

# Drop incomplete training rows before building the drug vocabulary.
train = train.dropna()

# Two-way lookup between a dense integer ID and each distinct training drug,
# numbered in order of first appearance.
ID_to_Drug = {
    drug_id: drug
    for drug_id, drug in enumerate(dict.fromkeys(train['Drug']))
}
Drug_to_ID = {drug: drug_id for drug_id, drug in ID_to_Drug.items()}
print('Drug dictionaries completed')

# Square drug-by-drug similarity matrix, zero-initialised.
num_drugs = len(Drug_to_ID)
drug_sim = np.zeros(shape=(num_drugs, num_drugs))
for i in range(num_drugs):
    if i % 1000 == 0:
        print('\n500 drug similarities calculated')
    drug1 = ID_to_Drug[i]
    m1 = Chem.MolFromSmiles(drug1)
    fp1 = AllChem.GetMorganFingerprint(m1, 2)
    for j in range(num_drugs):
예제 #9
0
 def test_convert_to_log(self):
     """Call ``convert_to_log()`` on the DAVIS DTI dataset (no assertions)."""
     from tdc.multi_pred import DTI

     davis = DTI(name='DAVIS')
     davis.convert_to_log()
예제 #10
0
 def test_binarize(self):
     """Call ``binarize`` on DAVIS with threshold 30, descending order (no assertions)."""
     from tdc.multi_pred import DTI

     davis = DTI(name='DAVIS')
     binarize_options = {'threshold': 30, 'order': 'descending'}
     davis.binarize(**binarize_options)
예제 #11
0
# ---- Evaluators ----
# Build a ROC-AUC evaluator and print its score for two labels ([0, 1])
# against two predicted scores ([0.5, 0.6]).

from tdc import Evaluator
evaluator = Evaluator(name='ROC-AUC')
print(evaluator([0, 1], [0.5, 0.6]))

# ---- Processing helpers ----
# Each snippet below is self-contained: it re-imports what it needs and
# rebinds ``data`` to a freshly loaded dataset before calling one helper.

# Label distribution of an ADME dataset.
from tdc.single_pred import ADME
data = ADME(name='Caco2_Wang')
data.label_distribution()

# Binarize DAVIS labels using threshold 30 in descending order.
from tdc.multi_pred import DTI
data = DTI(name='DAVIS')
data.binarize(threshold=30, order='descending')

# Convert DAVIS labels to log space (on a fresh, un-binarized copy).
from tdc.multi_pred import DTI
data = DTI(name='DAVIS')
data.convert_to_log()

# Split DrugBank DDI data and fetch the label map for the DDI task.
from tdc.multi_pred import DDI
from tdc.utils import get_label_map
data = DDI(name='DrugBank')
split = data.get_split()
get_label_map(name='DrugBank', task='DDI')

# Print statistics for the DisGeNET GDA dataset.
from tdc.multi_pred import GDA
data = GDA(name='DisGeNET')
data.print_stats()

from tdc.single_pred import HTS
예제 #12
0
 def time_split(self):
     """Request a time-based split of BindingDB_Patent on the 'Year' column (no assertions)."""
     from tdc.multi_pred import DTI

     patents = DTI(name='BindingDB_Patent')
     # The returned split is not inspected; the call itself is the check.
     patents.get_split(method='time', time_column='Year')