def __init__(self, get_atomic_attributes, node_attributes, filename,
             cols_to_read, delimiter=',', get_bond_attributes=None,
             edge_attributes=None):
    """Graph dataset built from a SMILES property file.

    Reads SMILES plus target columns, keeps only molecules that pass
    sanitization, and precomputes per-molecule node-feature and
    adjacency matrices padded to the largest molecule.

    ``get_bond_attributes`` and ``edge_attributes`` must be given
    together or not at all.
    """
    super(GraphDataset, self).__init__()
    assert (get_bond_attributes is None) == (edge_attributes is None)
    raw = read_smiles_property_file(filename, cols_to_read, delimiter)
    smiles_column = raw[0]
    clean_smiles, clean_idx = sanitize_smiles(smiles_column)
    labels = np.array(raw[1:]).T
    # The largest molecule fixes the common (padded) graph size.
    max_size = max(
        (Chem.MolFromSmiles(sm).GetNumAtoms() for sm in clean_smiles),
        default=0)
    self.target = labels[clean_idx, :]
    self.graphs = []
    self.node_feature_matrix = []
    self.adj_matrix = []
    for sm in clean_smiles:
        mol_graph = Graph(sm, max_size, get_atomic_attributes,
                          get_bond_attributes)
        self.node_feature_matrix.append(
            mol_graph.get_node_feature_matrix(node_attributes, max_size))
        if get_bond_attributes is None:
            self.adj_matrix.append(mol_graph.adj_matrix)
        else:
            self.adj_matrix.append(
                mol_graph.get_edge_attr_adj_matrix(edge_attributes,
                                                   max_size))
    self.num_features = self.node_feature_matrix[0].shape[1]
def __init__(self, filename, cols_to_read, delimiter=',', tokens=None,
             pad=True, tokenize=True, augment=False, flip=True):
    """SMILES dataset read from a delimited property file.

    Sanitizes the SMILES column, aligns any target columns with the
    surviving rows, then optionally augments, pads, and tokenizes the
    sequences.
    """
    super(SmilesDataset, self).__init__()
    self.tokenize = tokenize
    file_data = read_smiles_property_file(filename, cols_to_read, delimiter)
    raw_smiles = file_data[0]
    clean_smiles, clean_idx = sanitize_smiles(raw_smiles)
    if len(file_data) > 1:
        # Property columns follow the SMILES column; keep only rows
        # whose SMILES parsed cleanly.
        labels = np.array(file_data[1:], dtype='float').T
        self.target = labels[clean_idx]
    else:
        self.target = None
    if augment:
        clean_smiles, self.target = augment_smiles(clean_smiles,
                                                   self.target)
    if pad:
        clean_smiles, self.length = pad_sequences(clean_smiles)
    tokens, self.token2idx, self.num_tokens = get_tokens(clean_smiles,
                                                         tokens)
    if tokenize:
        clean_smiles, self.tokens = seq2tensor(clean_smiles, tokens, flip)
    self.data = clean_smiles
def __init__(self, filename, cols_to_read, get_features, delimiter=',',
             return_smiles=False, get_features_args=None):
    """Dataset whose inputs are features computed from SMILES strings.

    Args:
        filename: delimited property file to read.
        cols_to_read: column indices; the first is the SMILES column,
            any remaining ones are targets.
        get_features: callable ``(smiles, **kwargs)`` returning
            ``(features, valid_idx, invalid_idx)``.
        delimiter: field separator in the input file.
        return_smiles: stored flag, read by consumers of this dataset.
        get_features_args: optional dict of extra kwargs for
            ``get_features``.
    """
    super(FeatureDataset, self).__init__()
    self.return_smiles = return_smiles
    self.get_features = get_features
    # BUG FIX: the original splatted **get_features_args unconditionally,
    # raising TypeError whenever the argument was left at its None default.
    if get_features_args is None:
        get_features_args = {}
    data = read_smiles_property_file(filename, cols_to_read, delimiter)
    if len(cols_to_read) > 1:
        assert len(cols_to_read) == len(data)
        smiles = data[0]
        target = np.array(data[1:], dtype='float').T
        num_targets = len(cols_to_read) - 1
        target = target.reshape((-1, num_targets))
    else:
        smiles = data[0]
        target = None
    self.target = target
    features, valid_idx, invalid_idx = get_features(
        smiles, **get_features_args)
    # Keep only the SMILES that featurized successfully.
    self.objects = [smiles[i] for i in valid_idx]
    self.max_len = max(len(sm) for sm in self.objects)
    self.data = features
def __init__(self, filename, cols_to_read, features, delimiter=',',
             tokens=None):
    """Minimal dataset: a SMILES column plus a single target column."""
    super(VanillaDataset, self).__init__()
    file_data = read_smiles_property_file(filename, cols_to_read, delimiter)
    all_smiles = file_data[0]
    raw_target = np.array(file_data[1], dtype='float')
    clean_smiles, clean_idx = sanitize_smiles(all_smiles)
    # Keep targets only for rows whose SMILES parsed cleanly.
    self.target = np.array(raw_target)[clean_idx]
def from_smiles_file(cls, get_atomic_attributes, node_attributes, filename,
                     cols_to_read, delimiter=',', get_bond_attributes=None,
                     edge_attributes=None):
    """Alternate constructor: build the dataset from a SMILES file.

    Reads the file, sanitizes the SMILES column, converts the surviving
    strings to RDKit molecules, and forwards everything to ``cls``.
    """
    raw = read_smiles_property_file(filename, cols_to_read, delimiter)
    smiles_column = raw[0]
    labels = np.array(raw[1:]).squeeze()
    clean_smiles, clean_idx = sanitize_smiles(smiles_column)
    mols = [Chem.MolFromSmiles(s) for s in clean_smiles]
    return cls(get_atomic_attributes, node_attributes, mols,
               labels[clean_idx], get_bond_attributes, edge_attributes)
def __init__(self, filename, tokenized=False, cols_to_read=None,
             delimiter=',', mol_tokens=None, prot_tokens=None, pad=True):
    """Dataset of (SMILES, protein sequence, label) triples.

    With ``tokenized=False`` the file is parsed as delimited text whose
    read columns are SMILES, protein, and a float label; with
    ``tokenized=True`` it is loaded as a pickled dict of precomputed
    tokens/tensors.
    """
    super(SmilesProteinDataset, self).__init__()
    if not tokenized:
        data = read_smiles_property_file(filename, cols_to_read, delimiter)
        smiles = data[0]
        proteins = np.array(data[1])
        target = np.array(data[2], dtype='float')
        # Keep only rows whose SMILES pass sanitization; targets and
        # proteins are filtered with the same indices to stay aligned.
        clean_smiles, clean_idx = sanitize_smiles(smiles)
        self.target = target[clean_idx]
        proteins = list(proteins[clean_idx])
        if pad:
            clean_smiles, self.mol_lengths = pad_sequences(clean_smiles)
            proteins, self.prot_lengths = pad_sequences(proteins)
        # Separate vocabularies for molecules and proteins.
        self.mol_tokens, self.mol_token2idx, self.mol_num_tokens = \
            get_tokens(clean_smiles, mol_tokens)
        self.prot_tokens, self.prot_token2idx, self.prot_num_tokens = \
            get_tokens(proteins, prot_tokens)
        clean_smiles = seq2tensor(clean_smiles, self.mol_tokens)
        proteins = seq2tensor(proteins, self.prot_tokens)
        self.molecules = clean_smiles
        self.proteins = proteins
    else:
        # NOTE(review): pickle.load executes arbitrary code -- only load
        # trusted files. The handle is also never closed here.
        f = open(filename, 'rb')
        data = pickle.load(f)
        self.mol_tokens = data['smiles_tokens']
        self.prot_tokens = data['proteins_tokens']
        self.mol_num_tokens = len(data['smiles_tokens'])
        self.prot_num_tokens = len(data['proteins_tokens'])
        self.molecules = data['smiles']
        self.proteins = data['proteins']
        self.target = data['labels']
    # Both branches must produce aligned molecule/protein/label arrays.
    assert len(self.molecules) == len(self.proteins)
    assert len(self.molecules) == len(self.target)
# Script-level setup for a reaction-yield experiment on a Siamese model.
from openchem.data.siamese_data_layer import SiameseDataset
from openchem.utils.utils import identity
import torch
import torch.nn as nn
import numpy as np
from torch.optim import RMSprop, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score, f1_score
from openchem.data.utils import read_smiles_property_file

# Columns 11 and 12 hold the two reactant SMILES, column 14 the label.
data = read_smiles_property_file(
    './benchmark_datasets/reactions/4_11_with_y2.csv',
    cols_to_read=[11, 12, 14],
    keep_header=False)
reactant1 = data[0]
reactant2 = data[1]
labels = np.array(data[2], dtype="float").reshape(-1, 1)
# Join each reactant pair into a single space-separated sequence.
reactants = [reactant1[i] + " " + reactant2[i] for i in range(len(reactant2))]
from openchem.data.utils import get_tokens
tokens, _, _ = get_tokens(reactants)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reactants, labels,
                                                    test_size=0.2,
                                                    random_state=42)
def __init__(self, get_atomic_attributes, node_attributes, filename,
             cols_to_read, delimiter=',', get_bond_attributes=None,
             edge_attributes=None, restrict_min_atoms=-1,
             restrict_max_atoms=-1, kekulize=True, file_format="smi",
             addHs=False, has_3D=False, allowed_atoms=None,
             return_smiles=False, **kwargs):
    """Graph dataset loaded from a SMILES file or a pickled cache.

    Args:
        filename: input file (``.smi``-style text or a pickled dict,
            depending on ``file_format``).
        cols_to_read: column indices; the first is SMILES, the rest
            are targets (no targets when only one column is read).
        restrict_min_atoms / restrict_max_atoms: inclusive atom-count
            bounds; a negative value disables that bound.
        file_format: ``"smi"`` or ``"pickled"``.

    Raises:
        NotImplementedError: for any other ``file_format``.
    """
    super(GraphDataset, self).__init__()
    assert (get_bond_attributes is None) == (edge_attributes is None)
    self.return_smiles = return_smiles
    self.restrict_min_atoms = restrict_min_atoms
    self.restrict_max_atoms = restrict_max_atoms
    self.kekulize = kekulize
    self.addHs = addHs
    self.has_3D = has_3D
    if file_format == "pickled":
        # SECURITY: pickle.load runs arbitrary code -- only load trusted
        # files. `with` fixes the file-handle leak of the original
        # pickle.load(open(...)) form.
        with open(filename, "rb") as f:
            data = pickle.load(f)
        # this cleanup must be consistent with sanitize_smiles
        mn, mx = restrict_min_atoms, restrict_max_atoms
        indices = [
            i for i, n in enumerate(data["num_atoms_all"])
            if (n >= mn or mn < 0) and (n <= mx or mx < 0)
        ]
        data = {
            key: value[indices] if isinstance(value, np.ndarray) else
            [value[i] for i in indices]
            for key, value in data.items()
        }
        self.num_atoms_all = data["num_atoms_all"]
        self.target = data["target"]
        self.smiles = data["smiles"]
    elif file_format == "smi":
        data_set = read_smiles_property_file(filename, cols_to_read,
                                             delimiter)
        data = data_set[0]
        # A single column means there are no target values.
        target = None if len(cols_to_read) == 1 else data_set[1:]
        clean_smiles, clean_idx, num_atoms, max_len = sanitize_smiles(
            data,
            min_atoms=restrict_min_atoms,
            max_atoms=restrict_max_atoms,
            return_num_atoms=True,
            return_max_len=True)
        self.max_len = max_len
        if target is not None:
            # BUG FIX: np.float was removed in NumPy 1.20+; use the
            # builtin float (same dtype: float64).
            target = np.asarray(target, dtype=float).T
        clean_smiles = [clean_smiles[i] for i in clean_idx]
        num_atoms = [num_atoms[i] for i in clean_idx]
        self.clean_idx = clean_idx
        self.target = target[clean_idx, :] if target is not None else None
        self.smiles = clean_smiles
        self.num_atoms_all = num_atoms
    else:
        raise NotImplementedError()
    self.max_size = max(self.num_atoms_all)
    self.node_attributes = node_attributes
    self.edge_attributes = edge_attributes
    self.get_atomic_attributes = get_atomic_attributes
    self.get_bond_attributes = get_bond_attributes
# Script-level setup for a melting-temperature regression experiment.
from openchem.criterion.multitask_loss import MultitaskLoss
from sklearn.metrics import r2_score, mean_squared_error
from openchem.utils.utils import identity
import torch
import torch.nn as nn
import numpy as np
from torch.optim import RMSprop, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F
from openchem.data.utils import read_smiles_property_file

# Tab-delimited file: column 0 = SMILES, column 1 = melting temperature.
data = read_smiles_property_file(
    'benchmark_datasets/melt_temp/melting_data.txt',
    cols_to_read=[0, 1],
    delimiter='\t',
    keep_header=False)
# NOTE(review): keep_header=False should already strip the header, yet
# [1:] drops one more row here -- confirm the first data row is not
# being discarded by mistake.
smiles = data[0][1:]
labels = np.array(data[1][1:], dtype='float').reshape(-1)
from openchem.data.utils import get_tokens
tokens, _, _ = get_tokens(smiles)
# Append the padding character to the vocabulary.
tokens = tokens + ' '
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(smiles, labels,
                                                    test_size=0.2,
                                                    random_state=42)
# Script-level setup for the Tox21 multitask classification experiment.
from openchem.modules.mlp.openchem_mlp import OpenChemMLP
from openchem.data.smiles_data_layer import SmilesDataset
from openchem.criterion.multitask_loss import MultitaskLoss
import torch
import torch.nn as nn
import numpy as np
from torch.optim import RMSprop, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, mean_squared_error
from openchem.data.utils import read_smiles_property_file

# Column 13 is the SMILES column; columns 0-11 are the 12 Tox21 tasks.
data = read_smiles_property_file('./benchmark_datasets/tox21/tox21.csv',
                                 cols_to_read=[13] + list(range(0,12)))
smiles = data[0]
labels = np.array(data[1:])
# Missing task labels are encoded as the sentinel '999' (labels stay
# strings here; downstream code is expected to cast/mask them).
labels[np.where(labels=='')] = '999'
labels = labels.T
from openchem.data.utils import get_tokens
tokens, _, _ = get_tokens(smiles)
# Append the padding character to the vocabulary.
tokens = tokens + ' '
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(smiles, labels,
                                                    test_size=0.2,
                                                    random_state=42)
from openchem.data.utils import save_smiles_property_file
# Script-level setup for a logP regression experiment on graphs.
# Per-node (atom) attributes; each is one-hot encoded over the listed
# value set.
node_attributes = {}
node_attributes['valence'] = Attribute('node', 'valence', one_hot=True,
                                       values=[1, 2, 3, 4, 5, 6])
node_attributes['charge'] = Attribute('node', 'charge', one_hot=True,
                                      values=[-1, 0, 1, 2, 3, 4])
node_attributes['hybridization'] = Attribute('node', 'hybridization',
                                             one_hot=True,
                                             values=[0, 1, 2, 3, 4, 5, 6, 7])
node_attributes['aromatic'] = Attribute('node', 'aromatic', one_hot=True,
                                        values=[0, 1])
# presumably element indices map to a fixed 11-element palette -- TODO
# confirm against get_atomic_attributes.
node_attributes['atom_element'] = Attribute('node', 'atom_element',
                                            one_hot=True,
                                            values=list(range(11)))
from openchem.data.utils import read_smiles_property_file
# Column 1 = SMILES, column 2 = logP label.
data = read_smiles_property_file('./benchmark_datasets/logp_dataset/logP_labels.csv',
                                 cols_to_read=[1,2], keep_header=False)
smiles = data[0]
labels = np.array(data[1]).reshape(-1, 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(smiles, labels,
                                                    test_size=0.2,
                                                    random_state=42)
from openchem.data.utils import save_smiles_property_file
# Persist the split so the dataset layers can re-read it from disk.
save_smiles_property_file('./benchmark_datasets/logp_dataset/train.smi',
                          X_train, y_train)
save_smiles_property_file('./benchmark_datasets/logp_dataset/test.smi',
                          X_test, y_test)
train_dataset = GraphDataset(get_atomic_attributes, node_attributes,
                             './benchmark_datasets/logp_dataset/train.smi',
def __init__(self, get_atomic_attributes, node_attributes, filename,
             cols_to_read, delimiter=',', get_bond_attributes=None,
             edge_attributes=None, restrict_min_atoms=-1,
             restrict_max_atoms=-1, kekulize=True, file_format="smi",
             addHs=False, has_3D=False, allowed_atoms=None,
             return_smiles=False, **kwargs):
    """Graph dataset loaded from SMILES text, a pickled cache, or SDF.

    Args:
        cols_to_read: column indices; the first is SMILES, the rest
            are targets (no targets when only one column is read).
        restrict_min_atoms / restrict_max_atoms: inclusive atom-count
            bounds; a negative value disables that bound.
        file_format: ``"smi"``, ``"pickled"`` (path taken from
            ``kwargs["pickled"]``), or ``"sdf"``.
        allowed_atoms: dict keyed by allowed atomic numbers; used to
            filter molecules in the SDF branch.

    Raises:
        NotImplementedError: for any other ``file_format``.
    """
    super(GraphDataset, self).__init__()
    assert (get_bond_attributes is None) == (edge_attributes is None)
    self.return_smiles = return_smiles
    self.restrict_min_atoms = restrict_min_atoms
    self.restrict_max_atoms = restrict_max_atoms
    self.kekulize = kekulize
    self.addHs = addHs
    self.has_3D = has_3D
    if file_format == "pickled":
        # SECURITY: pickle.load runs arbitrary code -- only load trusted
        # files. `with` fixes the file-handle leak of the original
        # pickle.load(open(...)) form. Path comes from kwargs, not
        # `filename`, in this variant.
        with open(kwargs["pickled"], "rb") as f:
            data = pickle.load(f)
        # this cleanup must be consistent with sanitize_smiles
        mn, mx = restrict_min_atoms, restrict_max_atoms
        indices = [
            i for i, n in enumerate(data["num_atoms_all"])
            if (n >= mn or mn < 0) and (n <= mx or mx < 0)
        ]
        data = {
            key: value[indices] if isinstance(value, np.ndarray) else
            [value[i] for i in indices]
            for key, value in data.items()
        }
        self.num_atoms_all = data["num_atoms_all"]
        self.target = data["target"]
        self.smiles = data["smiles"]
    elif file_format == "smi":
        data_set = read_smiles_property_file(filename, cols_to_read,
                                             delimiter)
        data = data_set[0]
        # A single column means there are no target values.
        target = None if len(cols_to_read) == 1 else data_set[1:]
        clean_smiles, clean_idx, num_atoms, max_len = sanitize_smiles(
            data,
            min_atoms=restrict_min_atoms,
            max_atoms=restrict_max_atoms,
            return_num_atoms=True,
            return_max_len=True)
        self.max_len = max_len
        if target is not None:
            # BUG FIX: np.float was removed in NumPy 1.20+; use the
            # builtin float (same dtype: float64).
            target = np.asarray(target, dtype=float).T
        clean_smiles = [clean_smiles[i] for i in clean_idx]
        num_atoms = [num_atoms[i] for i in clean_idx]
        self.clean_idx = clean_idx
        self.target = target[clean_idx, :] if target is not None else None
        self.smiles = clean_smiles
        self.num_atoms_all = num_atoms
    elif file_format == "sdf":
        # HACK: hard-coded data directory and the [10] file selection
        # look like leftover experiment code -- parameterize eventually.
        os.chdir("/home/Work/data/enamine_hll-500/")
        filenames = list(glob.glob("*.sdf"))
        self.num_atoms_all = []
        smiles = []
        rd_mols = []
        for sdf_path in [filenames[10]]:
            print(sdf_path)
            supplier = Chem.SDMolSupplier(sdf_path, False, False)
            for i in range(len(supplier)):
                mol = supplier[i]
                if mol is None:
                    # ROBUSTNESS: SDMolSupplier yields None for records
                    # it cannot parse; the original crashed here.
                    continue
                # Keep only molecules made entirely of allowed elements.
                if not all(a.GetAtomicNum() in allowed_atoms
                           for a in mol.GetAtoms()):
                    continue
                num_mol_atoms = mol.GetNumAtoms()
                z_coord = []
                for k in range(num_mol_atoms):
                    pos = mol.GetConformer().GetAtomPosition(k)
                    z_coord.append(pos.z)
                # Keep only genuinely 3D conformers (non-flat z axis).
                if np.linalg.norm(z_coord, ord=2) > 1.0:
                    rd_mols.append(mol)
                    smiles.append(Chem.MolToSmiles(mol))
                    self.num_atoms_all.append(num_mol_atoms)
        self.smiles = smiles
        self.rd_mols = rd_mols
        # No labels in SDF mode; use dummy all-ones targets.
        self.target = np.ones(len(self.smiles))
    else:
        raise NotImplementedError()
    self.max_size = max(self.num_atoms_all)
    self.node_attributes = node_attributes
    self.edge_attributes = edge_attributes
    self.get_atomic_attributes = get_atomic_attributes
    self.get_bond_attributes = get_bond_attributes