Example #1
 def __init__(self,
              get_atomic_attributes,
              node_attributes,
              filename,
              cols_to_read,
              delimiter=',',
              get_bond_attributes=None,
              edge_attributes=None):
     super(GraphDataset, self).__init__()
     assert (get_bond_attributes is None) == (edge_attributes is None)
     data_set = read_smiles_property_file(filename, cols_to_read, delimiter)
     data = data_set[0]
     target = data_set[1:]
     clean_smiles, clean_idx = sanitize_smiles(data)
     target = np.array(target).T
     max_size = 0
     for sm in clean_smiles:
         mol = Chem.MolFromSmiles(sm)
         if mol.GetNumAtoms() > max_size:
             max_size = mol.GetNumAtoms()
     self.target = target[clean_idx, :]
     self.graphs = []
     self.node_feature_matrix = []
     self.adj_matrix = []
     for sm in clean_smiles:
         graph = Graph(sm, max_size, get_atomic_attributes,
                       get_bond_attributes)
         self.node_feature_matrix.append(
             graph.get_node_feature_matrix(node_attributes, max_size))
         if get_bond_attributes is None:
             self.adj_matrix.append(graph.adj_matrix)
         else:
             self.adj_matrix.append(
                 graph.get_edge_attr_adj_matrix(edge_attributes, max_size))
     self.num_features = self.node_feature_matrix[0].shape[1]
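# A minimal usage sketch (not from the original source; the path is
# hypothetical, and get_atomic_attributes/node_attributes follow the pattern
# shown in Example #11): SMILES in column 0, one label in column 1.
train_dataset = GraphDataset(get_atomic_attributes, node_attributes,
                             './data/train.smi', cols_to_read=[0, 1])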
Example #2
 def __init__(self,
              filename,
              cols_to_read,
              delimiter=',',
              tokens=None,
              pad=True,
              tokenize=True,
              augment=False,
              flip=True):
     super(SmilesDataset, self).__init__()
     self.tokenize = tokenize
     data = read_smiles_property_file(filename, cols_to_read, delimiter)
     smiles = data[0]
     clean_smiles, clean_idx = sanitize_smiles(smiles)
     if len(data) > 1:
         target = np.array(data[1:], dtype='float').T
         self.target = target[clean_idx]
     else:
         self.target = None
     if augment:
         clean_smiles, self.target = augment_smiles(clean_smiles,
                                                    self.target)
     if pad:
         clean_smiles, self.length = pad_sequences(clean_smiles)
     tokens, self.token2idx, self.num_tokens = get_tokens(
         clean_smiles, tokens)
     if tokenize:
         clean_smiles, self.tokens = seq2tensor(clean_smiles, tokens, flip)
     self.data = clean_smiles
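# A minimal usage sketch (hypothetical path): SMILES in column 0, a single
# target in column 1; `tokens` may be precomputed with get_tokens as in the
# later script examples, or left as None to infer them from the data.
train_dataset = SmilesDataset('./data/train.smi', cols_to_read=[0, 1])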
Example #3
 def __init__(self,
              filename,
              cols_to_read,
              get_features,
              delimiter=',',
              return_smiles=False,
              get_features_args=None):
     super(FeatureDataset, self).__init__()
     self.return_smiles = return_smiles
     self.get_features = get_features
     data = read_smiles_property_file(filename, cols_to_read, delimiter)
     if len(cols_to_read) > 1:
         assert len(cols_to_read) == len(data)
         smiles = data[0]
         target = np.array(data[1:], dtype='float').T
         num_targets = len(cols_to_read) - 1
         target = target.reshape((-1, num_targets))
     else:
         smiles = data[0]
         target = None
     self.target = target
     if get_features_args is None:
         get_features_args = {}
     features, valid_idx, invalid_idx = get_features(
         smiles, **get_features_args)
     self.objects = [smiles[i] for i in valid_idx]
     length = [len(sm) for sm in self.objects]
     self.max_len = max(length)
     self.data = features
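# A usage sketch (assumption, not from the original source): `get_features`
# must return a tuple of (features, valid_idx, invalid_idx); the toy
# featurizer and path below are hypothetical stand-ins.
import numpy as np

def toy_features(smiles_list):
    # one feature per molecule: the SMILES string length
    feats = np.array([[len(sm)] for sm in smiles_list], dtype='float')
    return feats, list(range(len(smiles_list))), []

dataset = FeatureDataset('./data/train.smi', cols_to_read=[0, 1],
                         get_features=toy_features)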
Example #4
 def __init__(self, filename, cols_to_read, features, delimiter=',', tokens=None):
     super(VanillaDataset, self).__init__()
     data = read_smiles_property_file(filename, cols_to_read, delimiter)
     smiles = data[0]
     target = np.array(data[1], dtype='float')
     clean_smiles, clean_idx = sanitize_smiles(smiles)
     self.target = target[clean_idx]
Example #5
    @classmethod
    def from_smiles_file(cls, get_atomic_attributes, node_attributes, filename,
                         cols_to_read, delimiter=',', get_bond_attributes=None,
                         edge_attributes=None):
        data_set = read_smiles_property_file(filename, cols_to_read, delimiter)
        data = data_set[0]
        target = np.array(data_set[1:]).squeeze()

        clean_smiles, clean_idx = sanitize_smiles(data)
        clean_mols = [Chem.MolFromSmiles(smiles) for smiles in clean_smiles]

        clean_target = target[clean_idx]

        return cls(get_atomic_attributes, node_attributes, clean_mols,
                   clean_target, get_bond_attributes, edge_attributes)
Example #6
 def __init__(self,
              filename,
              tokenized=False,
              cols_to_read=None,
              delimiter=',',
              mol_tokens=None,
              prot_tokens=None,
              pad=True):
     super(SmilesProteinDataset, self).__init__()
     if not tokenized:
         data = read_smiles_property_file(filename, cols_to_read, delimiter)
         smiles = data[0]
         proteins = np.array(data[1])
         target = np.array(data[2], dtype='float')
         clean_smiles, clean_idx = sanitize_smiles(smiles)
         self.target = target[clean_idx]
         proteins = list(proteins[clean_idx])
         if pad:
             clean_smiles, self.mol_lengths = pad_sequences(clean_smiles)
             proteins, self.prot_lengths = pad_sequences(proteins)
         self.mol_tokens, self.mol_token2idx, self.mol_num_tokens = \
             get_tokens(clean_smiles, mol_tokens)
         self.prot_tokens, self.prot_token2idx, self.prot_num_tokens = \
             get_tokens(proteins, prot_tokens)
         clean_smiles, _ = seq2tensor(clean_smiles, self.mol_tokens)
         proteins, _ = seq2tensor(proteins, self.prot_tokens)
         self.molecules = clean_smiles
         self.proteins = proteins
     else:
         with open(filename, 'rb') as f:
             data = pickle.load(f)
         self.mol_tokens = data['smiles_tokens']
         self.prot_tokens = data['proteins_tokens']
         self.mol_num_tokens = len(data['smiles_tokens'])
         self.prot_num_tokens = len(data['proteins_tokens'])
         self.molecules = data['smiles']
         self.proteins = data['proteins']
         self.target = data['labels']
     assert len(self.molecules) == len(self.proteins)
     assert len(self.molecules) == len(self.target)
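# A minimal usage sketch (hypothetical path): column 0 holds SMILES, column 1
# protein sequences, and column 2 binding labels.
dataset = SmilesProteinDataset('./data/interactions.csv',
                               cols_to_read=[0, 1, 2])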
Example #7
from openchem.data.siamese_data_layer import SiameseDataset
from openchem.utils.utils import identity

import torch
import torch.nn as nn

import numpy as np

from torch.optim import RMSprop, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score, f1_score

from openchem.data.utils import read_smiles_property_file
data = read_smiles_property_file(
    './benchmark_datasets/reactions/4_11_with_y2.csv',
    cols_to_read=[11, 12, 14],
    keep_header=False)
reactant1 = data[0]
reactant2 = data[1]
labels = np.array(data[2], dtype="float").reshape(-1, 1)

reactants = [reactant1[i] + " " + reactant2[i] for i in range(len(reactant2))]

from openchem.data.utils import get_tokens
tokens, _, _ = get_tokens(reactants)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reactants,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=42)
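# A plausible continuation (assumption, mirroring Examples #10 and #11):
# persist the splits so a data layer can read them back from disk.
from openchem.data.utils import save_smiles_property_file
save_smiles_property_file('./benchmark_datasets/reactions/train.smi',
                          X_train, y_train)
save_smiles_property_file('./benchmark_datasets/reactions/test.smi',
                          X_test, y_test)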
Example #8
    def __init__(self,
                 get_atomic_attributes,
                 node_attributes,
                 filename,
                 cols_to_read,
                 delimiter=',',
                 get_bond_attributes=None,
                 edge_attributes=None,
                 restrict_min_atoms=-1,
                 restrict_max_atoms=-1,
                 kekulize=True,
                 file_format="smi",
                 addHs=False,
                 has_3D=False,
                 allowed_atoms=None,
                 return_smiles=False,
                 **kwargs):
        super(GraphDataset, self).__init__()
        assert (get_bond_attributes is None) == (edge_attributes is None)
        self.return_smiles = return_smiles
        self.restrict_min_atoms = restrict_min_atoms
        self.restrict_max_atoms = restrict_max_atoms
        self.kekulize = kekulize
        self.addHs = addHs
        self.has_3D = has_3D

        if file_format == "pickled":
            with open(filename, "rb") as f:
                data = pickle.load(f)

            # this cleanup must be consistent with sanitize_smiles
            mn, mx = restrict_min_atoms, restrict_max_atoms
            indices = [
                i for i, n in enumerate(data["num_atoms_all"])
                if (n >= mn or mn < 0) and (n <= mx or mx < 0)
            ]
            data = {
                key: value[indices] if isinstance(value, np.ndarray) else
                [value[i] for i in indices]
                for key, value in data.items()
            }

            self.num_atoms_all = data["num_atoms_all"]
            self.target = data["target"]
            self.smiles = data["smiles"]
        elif file_format == "smi":
            data_set = read_smiles_property_file(filename, cols_to_read,
                                                 delimiter)
            data = data_set[0]
            if len(cols_to_read) == 1:
                target = None
            else:
                target = data_set[1:]
            clean_smiles, clean_idx, num_atoms, max_len = sanitize_smiles(
                data,
                min_atoms=restrict_min_atoms,
                max_atoms=restrict_max_atoms,
                return_num_atoms=True,
                return_max_len=True)
            self.max_len = max_len
            if target is not None:
                target = np.asarray(target, dtype=float).T
            clean_smiles = [clean_smiles[i] for i in clean_idx]
            num_atoms = [num_atoms[i] for i in clean_idx]
            self.clean_idx = clean_idx
            if target is not None:
                self.target = target[clean_idx, :]
            else:
                self.target = None
            self.smiles = clean_smiles
            self.num_atoms_all = num_atoms
        else:
            raise NotImplementedError()

        self.max_size = max(self.num_atoms_all)
        self.node_attributes = node_attributes
        self.edge_attributes = edge_attributes
        self.get_atomic_attributes = get_atomic_attributes
        self.get_bond_attributes = get_bond_attributes
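# A sketch (inferred from the "pickled" branch above, not from the original
# source) of the dict layout that file_format="pickled" expects: parallel
# entries keyed by "num_atoms_all", "target", and "smiles".
import pickle
import numpy as np

example = {
    "num_atoms_all": np.array([3, 2]),     # heavy-atom count per molecule
    "target": np.array([[0.5], [1.2]]),    # one target column per molecule
    "smiles": ["CCO", "CO"],
}
with open("graphs.pkl", "wb") as f:        # hypothetical path
    pickle.dump(example, f)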
Example #9
from openchem.criterion.multitask_loss import MultitaskLoss
from sklearn.metrics import r2_score, mean_squared_error
from openchem.utils.utils import identity
import torch
import torch.nn as nn

import numpy as np

from torch.optim import RMSprop, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F

from openchem.data.utils import read_smiles_property_file
data = read_smiles_property_file(
    'benchmark_datasets/melt_temp/melting_data.txt',
    cols_to_read=[0, 1],
    delimiter='\t',
    keep_header=False)
smiles = data[0][1:]
labels = np.array(data[1][1:], dtype='float').reshape(-1)

from openchem.data.utils import get_tokens
tokens, _, _ = get_tokens(smiles)
tokens = tokens + ' '

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(smiles,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=42)
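# A plausible continuation (assumption, mirroring Examples #10 and #11);
# labels are reshaped to 2-D to match Example #11's usage.
from openchem.data.utils import save_smiles_property_file
save_smiles_property_file('./benchmark_datasets/melt_temp/train.smi',
                          X_train, y_train.reshape(-1, 1))
save_smiles_property_file('./benchmark_datasets/melt_temp/test.smi',
                          X_test, y_test.reshape(-1, 1))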
Example #10
from openchem.modules.mlp.openchem_mlp import OpenChemMLP
from openchem.data.smiles_data_layer import SmilesDataset
from openchem.criterion.multitask_loss import MultitaskLoss

import torch
import torch.nn as nn

import numpy as np

from torch.optim import RMSprop, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, mean_squared_error

from openchem.data.utils import read_smiles_property_file
data = read_smiles_property_file('./benchmark_datasets/tox21/tox21.csv',
                                 cols_to_read=[13] + list(range(0,12)))
smiles = data[0]
labels = np.array(data[1:])

labels[np.where(labels=='')] = '999'
labels = labels.T

from openchem.data.utils import get_tokens
tokens, _, _ = get_tokens(smiles)
tokens = tokens + ' '

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(smiles, labels, test_size=0.2,
                                                    random_state=42)

from openchem.data.utils import save_smiles_property_file
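# A plausible use of the import above (assumption, mirroring Example #11):
# save the train/test splits to disk.
save_smiles_property_file('./benchmark_datasets/tox21/train.smi',
                          X_train, y_train)
save_smiles_property_file('./benchmark_datasets/tox21/test.smi',
                          X_test, y_test)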
Example #11

import numpy as np
from openchem.utils.graph import Attribute  # import location assumed

node_attributes = {}
node_attributes['valence'] = Attribute('node', 'valence', one_hot=True, values=[1, 2, 3, 4, 5, 6])
node_attributes['charge'] = Attribute('node', 'charge', one_hot=True, values=[-1, 0, 1, 2, 3, 4])
node_attributes['hybridization'] = Attribute('node', 'hybridization',
                                             one_hot=True, values=[0, 1, 2, 3, 4, 5, 6, 7])
node_attributes['aromatic'] = Attribute('node', 'aromatic', one_hot=True,
                                        values=[0, 1])
node_attributes['atom_element'] = Attribute('node', 'atom_element',
                                            one_hot=True,
                                            values=list(range(11)))


from openchem.data.utils import read_smiles_property_file
data = read_smiles_property_file('./benchmark_datasets/logp_dataset/logP_labels.csv',
                                 cols_to_read=[1, 2], keep_header=False)

smiles = data[0]
labels = np.array(data[1]).reshape(-1, 1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(smiles, labels, test_size=0.2,
                                                    random_state=42)

from openchem.data.utils import save_smiles_property_file
save_smiles_property_file('./benchmark_datasets/logp_dataset/train.smi', X_train, y_train)
save_smiles_property_file('./benchmark_datasets/logp_dataset/test.smi', X_test, y_test)


train_dataset = GraphDataset(get_atomic_attributes, node_attributes,
                             './benchmark_datasets/logp_dataset/train.smi',
                             delimiter=',', cols_to_read=[0, 1])

# GraphDataset.__init__ as invoked above (extended variant with an "sdf" branch):
    def __init__(self,
                 get_atomic_attributes,
                 node_attributes,
                 filename,
                 cols_to_read,
                 delimiter=',',
                 get_bond_attributes=None,
                 edge_attributes=None,
                 restrict_min_atoms=-1,
                 restrict_max_atoms=-1,
                 kekulize=True,
                 file_format="smi",
                 addHs=False,
                 has_3D=False,
                 allowed_atoms=None,
                 return_smiles=False,
                 **kwargs):
        super(GraphDataset, self).__init__()
        assert (get_bond_attributes is None) == (edge_attributes is None)
        self.return_smiles = return_smiles
        self.restrict_min_atoms = restrict_min_atoms
        self.restrict_max_atoms = restrict_max_atoms
        self.kekulize = kekulize
        self.addHs = addHs
        self.has_3D = has_3D

        if file_format == "pickled":
            with open(kwargs["pickled"], "rb") as f:
                data = pickle.load(f)

            # this cleanup must be consistent with sanitize_smiles
            mn, mx = restrict_min_atoms, restrict_max_atoms
            indices = [
                i for i, n in enumerate(data["num_atoms_all"])
                if (n >= mn or mn < 0) and (n <= mx or mx < 0)
            ]
            data = {
                key: value[indices] if isinstance(value, np.ndarray) else
                [value[i] for i in indices]
                for key, value in data.items()
            }

            self.num_atoms_all = data["num_atoms_all"]
            self.target = data["target"]
            self.smiles = data["smiles"]
        elif file_format == "smi":
            data_set = read_smiles_property_file(filename, cols_to_read,
                                                 delimiter)
            data = data_set[0]
            if len(cols_to_read) == 1:
                target = None
            else:
                target = data_set[1:]
            clean_smiles, clean_idx, num_atoms, max_len = sanitize_smiles(
                data,
                min_atoms=restrict_min_atoms,
                max_atoms=restrict_max_atoms,
                return_num_atoms=True,
                return_max_len=True)
            self.max_len = max_len
            if target is not None:
                target = np.asarray(target, dtype=float).T
            clean_smiles = [clean_smiles[i] for i in clean_idx]
            num_atoms = [num_atoms[i] for i in clean_idx]
            self.clean_idx = clean_idx
            if target is not None:
                self.target = target[clean_idx, :]
            else:
                self.target = None
            self.smiles = clean_smiles
            self.num_atoms_all = num_atoms
        elif file_format == "sdf":
            filenames = []
            os.chdir("/home/Work/data/enamine_hll-500/")
            for file in glob.glob("*.sdf"):
                filenames.append(file)
            self.num_atoms_all = []
            smiles = []
            rd_mols = []
            # process a single .sdf file from the directory
            for f in [filenames[10]]:
                print(f)
                supplier = Chem.SDMolSupplier(f, False, False)
                n = len(supplier)
                for i in range(n):
                    mol = supplier[i]
                    if mol is None:
                        continue
                    # keep molecules whose atoms are all in allowed_atoms
                    allowed = [a.GetAtomicNum() in allowed_atoms
                               for a in mol.GetAtoms()]
                    if all(allowed):
                        num_atoms = mol.GetNumAtoms()
                        z_coord = []
                        for k in range(num_atoms):
                            pos = mol.GetConformer().GetAtomPosition(k)
                            z_coord.append(pos.z)
                        # require genuinely 3D conformers (nonzero z-norm)
                        if np.linalg.norm(z_coord, ord=2) > 1.0:
                            rd_mols.append(mol)
                            smiles.append(Chem.MolToSmiles(mol))
                            self.num_atoms_all.append(num_atoms)
            self.smiles = smiles
            self.rd_mols = rd_mols
            self.target = np.ones(len(self.smiles))
        else:
            raise NotImplementedError()

        self.max_size = max(self.num_atoms_all)
        self.node_attributes = node_attributes
        self.edge_attributes = edge_attributes
        self.get_atomic_attributes = get_atomic_attributes
        self.get_bond_attributes = get_bond_attributes