def structure_standardization(smi):
    """
    Standardize a SMILES string with RDKit and the MolVS standardizer.

    The input is parsed into an RDKit mol object, its atom count is compared
    with the configured ``max_num_atoms`` limit, and the molecule is then run
    through the MolVS pipeline (charge parent, isotope parent, optionally
    stereo parent, tautomer parent, standardize) before being written back as
    a SMILES string. Every failure is reported through ``logging.error``.

    :param smi: Input SMILES from the given structure data file T4
    :return: smi_clean: Standardized SMILES of the input, or ``np.nan`` when
        the SMILES cannot be read, the molecule is larger than the configured
        limit, or standardization raises ValueError/AttributeError.
    """
    # Install the tautomer rules and reload the standardizer module on every call.
    tautomer.TAUTOMER_TRANSFORMS = update_tautomer_rules()
    importlib.reload(MolVS_standardizer)

    settings = ReadConfig().get_conf_dict(
        parameters.get_parameters())['standardization']
    max_num_atoms = settings['max_num_atoms']
    max_num_tautomers = settings['max_num_tautomers']
    include_stereoinfo = settings['include_stereoinfo']
    standardizer = MolVS_standardizer.Standardizer(
        max_tautomers=max_num_tautomers)

    # Read SMILES and convert it to an RDKit mol object.
    mol = MolFromSmiles(smi)
    if mol is None:
        logging.error('Reading Error, ' + smi)
        return np.nan

    # Size check on the atom count of the parsed molecule.
    if mol.GetNumAtoms() > max_num_atoms:
        logging.error('Molecule too large, ' + smi)
        return np.nan

    try:
        mol = standardizer.charge_parent(mol)
        mol = standardizer.isotope_parent(mol)
        # Stereo information is only stripped when explicitly disabled.
        if include_stereoinfo is False:
            mol = standardizer.stereo_parent(mol)
        mol = standardizer.tautomer_parent(mol)
        cleaned = standardizer.standardize(mol)
        # Convert the mol object back to SMILES.
        return MolToSmiles(cleaned)
    except (ValueError, AttributeError) as e:
        # Write molecules that failed standardization to the log file.
        logging.error(
            'Standardization error, ' + smi + ', Error Type: ' + str(e))
        return np.nan
def test_custom_kekulize():
    """Display a molecule and print its bond types before and after custom kekulization.

    This is a visual check: nothing is asserted. The molecule is drawn with
    ``display`` and every bond type is printed, then the same is repeated on
    the molecule returned by ``custom_kekulize``.
    """

    def print_bond_types(molecule):
        # Bonds are listed per atom, so each bond is printed once for each
        # of its two atoms.
        for atom in molecule.GetAtoms():
            for bond in atom.GetBonds():
                print(bond.GetBondType())

    smiles = 'N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#N)C12CCCCC2'
    mol = MolFromSmiles(smiles)
    display(mol)
    print_bond_types(mol)

    non_aromatic_atoms = find_custom_Kekulize_set(mol, 60, 5)
    mol = custom_kekulize(mol, non_aromatic_atoms)
    display(mol)
    print_bond_types(mol)
def _one_random(smls, n, iso, q):
    """Generate randomized (non-canonical) SMILES for a batch and put them on a queue.

    For every parsable SMILES in ``smls`` the atoms are renumbered with a
    random permutation and the molecule is written back without
    canonicalization. This repeats until ``n`` distinct strings were produced
    for that molecule or 10 seconds have elapsed. Unparsable SMILES contribute
    nothing.

    :param smls: iterable of SMILES strings
    :param n: {int} number of distinct SMILES to aim for per molecule
    :param iso: {bool} passed to ``MolToSmiles`` as ``isomericSmiles``
    :param q: queue-like object; the combined result list is handed over with
        a single ``q.put`` call
    :return: None. The list put on ``q`` holds every generated string,
        duplicates included, so a molecule may contribute more than ``n``.
    """
    res = []
    for s in smls:
        m = MolFromSmiles(s)
        if m is None:
            continue
        generated = []
        # Track distinct strings incrementally instead of rebuilding a set
        # from the whole list on every loop iteration.
        distinct = set()
        start = time()
        while len(distinct) < n and (time() - start) < 10:
            ans = list(range(m.GetNumAtoms()))
            np.random.shuffle(ans)
            nm = RenumberAtoms(m, ans)
            smi = MolToSmiles(nm, canonical=False, isomericSmiles=iso)
            generated.append(smi)
            distinct.add(smi)
        res.extend(generated)
    q.put(res)
def process(self, smiles):
    """Convert a SMILES string into a ``GCNPoint`` (node count, dense adjacency, features).

    The graph has one node more than the molecule has atoms: node 0 does not
    correspond to any atom, and atom ``i`` is stored at node ``i + 1``.

    :param smiles: SMILES string of the molecule
    :return: ``GCNPoint(n, adj, vec)`` where ``n`` is the node count, ``adj``
        the dense adjacency matrix and ``vec`` the feature rows moved to
        ``self.device``
    """
    # Build the graph
    mol = MolFromSmiles(smiles)
    n = mol.GetNumAtoms() + 1
    graph = DGLGraph()
    graph.add_nodes(n)
    # Self-loop on every node, then an edge from every atom node to node 0.
    graph.add_edges(graph.nodes(), graph.nodes())
    graph.add_edges(range(1, n), 0)
    # Each bond becomes two directed edges (shifted by one for node 0).
    for bond in mol.GetBonds():
        src = bond.GetBeginAtomIdx() + 1
        dst = bond.GetEndAtomIdx() + 1
        graph.add_edge(src, dst)
        graph.add_edge(dst, src)
    adj = graph.adjacency_matrix(transpose=False).to_dense()

    # One feature row per atom, preceded by an all-zero row for node 0.
    atom_rows = [atom_feature(atom)[0][None, :] for atom in mol.GetAtoms()]
    atom_matrix = torch.cat(atom_rows)
    zero_row = torch.zeros((1, FEATURE_DIM))
    vec = torch.cat([zero_row, atom_matrix]).to(self.device)
    return GCNPoint(n, adj, vec)
def calculate_single(self, smiles) -> Tuple:
    """Standardize one SMILES string.

    Args:
        smiles: input SMILES string; a float NaN marks a missing entry.

    Returns:
        Tuple: ``(standardized_smiles, True, None)`` on success, otherwise
        ``(None, False, error_message)``.
    """
    # NaN is the only float that differs from itself. Unlike an identity test
    # against one particular NaN object, this catches every NaN instance.
    if isinstance(smiles, float) and smiles != smiles:
        return None, False, "No smiles entry."

    try:
        # Read SMILES and convert it to RDKit mol object.
        mol = MolFromSmiles(smiles)
    except (TypeError, ValueError, AttributeError) as e:
        return None, False, str(e)

    # Check, if the input SMILES has been converted into a mol object.
    if mol is None:
        return None, False, "failed to parse smiles {}".format(smiles)

    # Check the size of the molecule based on the non-hydrogen atom count.
    # A molecule with exactly max_num_atoms atoms does not exceed the limit.
    n_atoms = mol.GetNumAtoms()
    if n_atoms > self.max_num_atoms:
        return (
            None,
            False,
            "number of non-H atoms {0} exceeds limit of {1} for smiles {2}"
            .format(n_atoms, self.max_num_atoms, smiles),
        )

    try:
        mol = rdMolStandardize.ChargeParent(mol)
        mol = self.isotope_parent(mol)
        if self.include_stereoinfo is False:
            Chem.RemoveStereochemistry(mol)
        mol = self.tautomerizer.Canonicalize(mol)
        mol_clean_tmp = self.my_standardizer(mol)
        # Convert mol object back to SMILES.
        smi_clean_tmp = MolToSmiles(mol_clean_tmp)
        # Double check that the standardized SMILES is a valid mol object:
        # if it does not parse, MolToSmiles fails and the error is reported.
        mol_clean = MolFromSmiles(smi_clean_tmp)
        smi_clean = MolToSmiles(mol_clean)
    except (TypeError, ValueError, AttributeError) as e:
        return None, False, str(e)

    return smi_clean, True, None
def randomize_smiles(smiles, num=10, isomeric=True, timeout=10.0):
    """
    Generate different SMILES representations for the same molecule

    Atoms are renumbered with a random permutation and the molecule is written
    back without canonicalization until ``num`` distinct strings were seen.
    Small or symmetric molecules can have fewer than ``num`` distinct SMILES
    (e.g. ``'C'`` has only one), so the search is bounded by ``timeout``.

    :param smiles: {str} SMILES string
    :param num: {int} number of different SMILES strings to generate
    :param isomeric: {bool} whether to consider stereo centers
    :param timeout: {float or None} maximum number of seconds to keep trying;
        ``None`` disables the limit
    :return: {list} all generated SMILES strings in generation order; the list
        may contain duplicates, and holds fewer than ``num`` distinct strings
        if the timeout was hit
    :raises AttributeError: if ``smiles`` cannot be parsed (``MolFromSmiles``
        returned ``None``) and ``num`` is positive
    """
    from time import monotonic

    # Nothing requested: the loop below would not run either.
    if num <= 0:
        return []

    m = MolFromSmiles(smiles)
    res = []
    distinct = set()
    deadline = None if timeout is None else monotonic() + timeout
    while len(distinct) < num and (deadline is None or monotonic() < deadline):
        ans = list(range(m.GetNumAtoms()))
        np.random.shuffle(ans)
        nm = RenumberAtoms(m, ans)
        smi = MolToSmiles(nm, canonical=False, isomericSmiles=isomeric)
        res.append(smi)
        distinct.add(smi)
    return res
def test_12_convertToRdkit(self):
    """Check that ``SmallMol.toRdkitMol`` yields an RDKit Mol with the expected atom count.

    The reference atom count comes from parsing the same SMILES directly with
    ``MolFromSmiles``.
    """
    smimol = SMILE_SMI
    small_mol = SmallMol(smimol, removeHs=True, fixHs=False)

    expected_natoms = MolFromSmiles(smimol).GetNumAtoms()

    converted = small_mol.toRdkitMol(includeConformer=True)
    converted_natoms = converted.GetNumAtoms()

    type_msg = ("The conversion of the SmallMol object into the rdkit"
                "Mol one get wrong")
    count_msg = ("NUmber of atoms different. The handle and convertion of the "
                 "SmallMol object into the rdkit Mol one probably get wrong")

    self.assertIsInstance(converted, rdkit.Chem.rdchem.Mol, msg=type_msg)
    self.assertEqual(converted_natoms, expected_natoms, msg=count_msg)
def construct_RGCN_bigraph_from_smiles(smiles):
    """Build a bidirected DGL graph for a molecule with atom and edge-type data.

    Node data ``"atom"`` holds the per-atom feature lists. Every bond is stored
    as two directed edges. Edge data ``"etype"`` holds the edge type of each
    directed edge, and ``"normal"`` holds the share of directed edges having
    that same type, rounded to one decimal.

    :param smiles: SMILES string of the molecule
    :return: the populated ``DGLGraph``
    """
    mol = MolFromSmiles(smiles)
    g = DGLGraph()

    # Add nodes
    g.add_nodes(mol.GetNumAtoms())
    g.ndata["atom"] = torch.tensor(
        [atom_features(atom).tolist() for atom in mol.GetAtoms()])

    # Add edges: both directions per bond, each carrying the bond's edge type.
    src_list, dst_list, etype_feature_all = [], [], []
    for bond in mol.GetBonds():
        etype = etype_features(bond)
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        src_list += [begin, end]
        dst_list += [end, begin]
        etype_feature_all += [etype, etype]
    g.add_edges(src_list, dst_list)

    # Relative frequency of each edge's type among all directed edges.
    n_directed = len(etype_feature_all)
    normal_all = [
        round(etype_feature_all.count(etype) / n_directed, 1)
        for etype in etype_feature_all
    ]

    g.edata["etype"] = torch.tensor(etype_feature_all)
    g.edata["normal"] = torch.tensor(normal_all)
    return g
def processline(t, step, line):
    """Run one benchmark step on one input line and add a size to the global ``lensum``.

    ``step`` selects the operation: 0 only measures the line, every other
    value first parses ``line`` with ``MolFromSmiles`` and then performs the
    step-specific work below. Unknown steps raise ``ValueError``.

    Returns 1 when ``t.incr()`` is truthy (the line is not processed),
    otherwise 0.
    """
    global lensum
    # NOTE(review): presumably t is a counter/limit object and a truthy
    # incr() means "stop" — confirm against t's class.
    if t.incr():
        return 1
    if step == 0:
        # Baseline: no parsing at all.
        lensum += len(line)
    else:
        m = MolFromSmiles(line)
        if step == 100:
            # Parsing only.
            lensum += len(line)
        elif step == 105:
            # NOTE(review): sha256() needs a bytes-like argument, so `line`
            # is presumably a byte string — confirm.
            lensum += len(sha256(line).hexdigest())
        elif step in (110, 120):
            # Write the line to the temporary file; step 120 additionally
            # forces it to disk with fsync.
            with open(tmpname, 'wb+') as f:
                print(line, file=f)
                if step == 120:
                    os.fsync(f.fileno())
            # NOTE(review): the original layout was lost; the size is assumed
            # to be read after the file is closed — confirm.
            lensum += os.stat(tmpname).st_size
        elif step == 210:
            lensum += m.GetNumAtoms()
        elif step == 220:
            lensum += m.GetNumBonds()
        elif step == 300:
            lensum += len(MolToSmiles(m))
        elif step == 400:
            lensum += len(MolToMolBlock(m))
        elif step == 420:
            # Add hydrogens, embed coordinates with a fixed seed, strip the
            # hydrogens again and measure the resulting mol block.
            m2 = AddHs(m)
            EmbedMolecule(m2, randomSeed=2020)
            m2 = RemoveHs(m2)
            m2.SetProp("_Name", "test")
            lensum += len(MolToMolBlock(m2))
        elif step == 600:
            lensum += mol2file(m, 'svg')
        elif step == 610:
            lensum += mol2file(m, 'png')
        else:
            raise ValueError("Not implemented step " + str(step))
    return 0
def process(self, smiles):
    """Convert a SMILES string into a ``GNNPoint`` (node count, graph, embedded features).

    One graph node is created per atom. Nine integer per-atom descriptors are
    stored as node data; their element-wise sum is passed through
    ``self.embed`` to obtain the node feature vectors.

    :param smiles: SMILES string of the molecule
    :return: ``GNNPoint(n, graph, vec)``
    """
    # Build the graph
    mol = MolFromSmiles(smiles)
    n = mol.GetNumAtoms()
    graph = DGLGraph()
    graph.add_nodes(n)
    # Self-loop on every node, then an edge from every node 1..n-1 to node 0.
    graph.add_edges(graph.nodes(), graph.nodes())
    graph.add_edges(range(1, n), 0)

    atoms = list(mol.GetAtoms())
    descriptors = (
        ("element", lambda a: ATOM[a.GetAtomicNum()]),
        ("explicit", lambda a: a.GetExplicitValence()),
        ("implicit", lambda a: a.GetImplicitValence()),
        ("hybrid", lambda a: HYBRID[a.GetHybridization()]),
        ("hcount", lambda a: a.GetTotalNumHs()),
        ("degree", lambda a: a.GetDegree()),
        # Formal charge is shifted by +2 before being stored.
        ("charge", lambda a: a.GetFormalCharge() + 2),
        ("ring", lambda a: int(a.IsInRing())),
        ("aromatic", lambda a: int(a.GetIsAromatic())),
    )
    for key, extract in descriptors:
        graph.ndata[key] = torch.tensor([extract(a) for a in atoms])

    # Each bond becomes two directed edges.
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        graph.add_edge(begin, end)
        graph.add_edge(end, begin)

    # Sum all descriptors element-wise, in the order they were stored.
    summed = graph.ndata["element"]
    for key in ("explicit", "implicit", "hybrid", "hcount", "degree",
                "charge", "ring", "aromatic"):
        summed = summed + graph.ndata[key]
    vec = self.embed(summed)
    return GNNPoint(n, graph, vec)
def construct_feature_matrices(self, smiles, train=True):
    """ Tokenize the atoms and bonds of the molecule given by ``smiles``.

    Every bond is listed once from each of its two atoms, so the per-bond
    arrays have ``2 * (number of bonds)`` rows; a molecule without bonds gets
    a single all-zero row instead. With ``train=True`` the largest atom and
    bond counts seen so far are tracked in ``self.max_atoms`` and
    ``self.max_bonds``.

    Returns dict with entries
    'n_atom' : number of atoms in the molecule
    'n_bond' : number of bonds in the molecule (each bond counted once)
    'bond_indices' : for every row, the index of the bond it came from
    'atom' : (n_atom,) array of atom classes
    'bond' : array of bond classes, one per row
    'connectivity' : (rows, 2) array of source atom, target atom pairs.

    """
    self.atom_tokenizer.train = train
    self.bond_tokenizer.train = train
    logger = logging.getLogger(__name__)

    mol = MolFromSmiles(smiles)
    if self.explicit_hs:
        mol = AddHs(mol)

    n_atom = mol.GetNumAtoms()
    n_rows = 2 * mol.GetNumBonds()

    # If its an isolated atom, add a self-link
    if n_rows == 0:
        n_rows = 1
        logger.warning(f'Found molecule {smiles} with zero bonds')

    atom_feature_matrix = np.zeros(n_atom, dtype='int')
    bond_feature_matrix = np.zeros(n_rows, dtype='int')
    bond_indices = np.zeros(n_rows, dtype='int')
    connectivity = np.zeros((n_rows, 2), dtype='int')

    row = 0
    for position, atom in enumerate(mol.GetAtoms()):
        # Atom classes
        atom_feature_matrix[position] = self.atom_tokenizer(
            self.atom_features(atom))

        this_atom = atom.GetIdx()
        for bond in atom.GetBonds():
            begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            # The bond is "flipped" when it does not start at this atom.
            rev = begin != this_atom

            # Bond classes
            bond_feature_matrix[row] = self.bond_tokenizer(
                self.bond_features(bond, flipped=rev))

            # Connect edges to original bonds
            bond_indices[row] = bond.GetIdx()

            # Connectivity always runs from this atom to its neighbour.
            connectivity[row] = (end, begin) if rev else (begin, end)
            row += 1

    # Track the largest atom and bonds seen
    if train:
        self.max_atoms = max(self.max_atoms, n_atom)
        self.max_bonds = max(self.max_bonds, mol.GetNumBonds())

    return {
        'n_atom': n_atom,
        'n_bond': mol.GetNumBonds(),  # the real number of bonds
        'bond_indices': bond_indices,
        'atom': atom_feature_matrix,
        'bond': bond_feature_matrix,
        'connectivity': connectivity,
    }
def parse_smiles_str(self, smiles_str, id, target=None):
    """Parse a SMILES string into a dictionary describing the molecular graph.

    Args:
        smiles_str: SMILES string of the molecule.
        id: identifier of the molecule; converted to an ``np.int64`` array
            unless it already is a NumPy array.
        target: optional target value(s); scalars, sequences and arrays of
            any rank are accepted and stored as a float32 row vector.

    Returns:
        ``None`` if the SMILES string cannot be parsed, otherwise a dict with
        keys ``node_features``, ``edge_features``, ``adj_mat``, ``inc_mat``,
        ``target``, ``id`` and ``shape``.
    """
    # Use RDKit to parse SMILES string
    mol = MolFromSmiles(smiles_str)
    if mol is None:
        return None

    # Represent Hydrogen atoms explicitly (if necessary)
    if self.config['explicit_Hs']:
        mol = Chem.AddHs(mol)

    # Compute number of nodes (atoms) and edges (bonds)
    n_nodes, n_edges = mol.GetNumAtoms(), mol.GetNumBonds()

    # Allocate space for NumPy arrays representing the molecular graph
    node_features = np.zeros((n_nodes, self.num_node_features), dtype=np.float32)
    edge_features = np.zeros((n_edges, self.num_edge_features), dtype=np.float32)
    adj_mat = np.zeros((2 * n_edges, 2), dtype=np.int64)  # Adjacency matrix (sparse representation)
    inc_mat = np.zeros((2 * n_edges, 2), dtype=np.int64)  # Incidence matrix (sparse representation)

    # Retrieve node (atom) features, if needed
    if self.num_node_features > 0:
        for i, atom in enumerate(mol.GetAtoms()):
            node_features[i] = self.get_node_features(atom)

    # Retrieve edges (bonds)
    for i, bond in enumerate(mol.GetBonds()):
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        # The two pairs of indices this bond contributes to the adjacency matrix
        adj_mat[2 * i] = [begin, end]
        adj_mat[2 * i + 1] = [end, begin]
        # The two pairs of indices this bond contributes to the incidence matrix
        inc_mat[2 * i] = [begin, i]
        inc_mat[2 * i + 1] = [end, i]
        # Retrieve edge (bond) features, if needed
        if self.num_edge_features > 0:
            edge_features[i] = self.get_edge_features(bond)

    # Sort the adjacency and incidence matrices lexicographically
    adj_mat = adj_mat[np.lexsort((adj_mat[:, 1], adj_mat[:, 0]))]
    inc_mat = inc_mat[np.lexsort((inc_mat[:, 1], inc_mat[:, 0]))]

    # Represent molecular graph as a dictionary
    g = {
        'node_features': node_features,
        'edge_features': edge_features,
        'adj_mat': adj_mat,
        'inc_mat': inc_mat,
    }

    # Add target(s) (if any), making sure they are a NumPy array object with method tobytes()
    if target is not None:
        # Convert scalars to NumPy array
        if not isinstance(target, np.ndarray):
            target = np.array(target, np.float32)
        # Ensure target is of type np.float32
        target = target.astype(np.float32)
        # Flatten targets of rank >= 2
        if target.ndim > 1:
            target = target.flatten()
        # Store target as a (row) 2D NumPy array (for compatibility)
        g['target'] = np.reshape(target, (1, -1))
        n_targets = g['target'].shape[1]
    else:
        # If there are no targets, add an empty NumPy array (for compatibility)
        g['target'] = np.zeros((1, 0), dtype=np.float32)
        n_targets = 0

    # Add ID, making sure it is a NumPy array object with method tobytes().
    # The check must look at `id` itself: testing `target` here would leave
    # the ID unconverted whenever a target was supplied.
    if not isinstance(id, np.ndarray):
        id = np.array(id, np.int64)
    g['id'] = id

    # Finally, add shape information. The last element refers to the number of graphs, and is
    # included for compatibility with batched graphs
    g['shape'] = np.array(
        (n_nodes, n_edges, self.num_node_features, self.num_edge_features, n_targets, 1),
        np.int64)

    return g
def process(self):
    """Build the multi-label drug-pair dataset and save it to ``self.processed_paths[0]``.

    Returns immediately if ``Decagon-<datatype>-multi.pt`` already exists in
    ``self.processed_dir``. Otherwise each ``(smiles1, smiles2)`` key of
    ``self.json_load[self.datatype]`` is turned into one graph holding the
    atoms and bonds of both molecules (indices of the second molecule are
    shifted by the size of the first), with the key's value as label. Pairs
    with an unparsable or empty molecule, or without any bond, are skipped.
    """
    processed_file = os.path.join(
        self.processed_dir, 'Decagon-{}-multi.pt'.format(self.datatype))
    if osp.exists(processed_file):
        return

    data_list = []

    # The JSON keys are string representations of (smiles1, smiles2) tuples.
    json_dict = {
        literal_eval(k): v
        for k, v in self.json_load[self.datatype].items()
    }
    total = len(json_dict)

    for idx, ((smiles1, smiles2), raw_label) in enumerate(json_dict.items()):
        printProgress(idx + 1, total,
                      '{} dataset preparation: '.format(self.datatype),
                      ' ', 2, 50)
        mol1 = MolFromSmiles(smiles1)
        mol2 = MolFromSmiles(smiles2)
        label = np.array(raw_label)

        # Report the SMILES of the pair: the parsed objects would only show
        # up as None or an opaque object representation.
        if mol1 is None or mol2 is None:
            print("There is a missing drug from the pair (%s,%s)" %
                  (smiles1, smiles2))
            continue

        # >>> Get pairwise graph G1, G2
        c1_size = mol1.GetNumAtoms()
        c2_size = mol2.GetNumAtoms()
        if c1_size == 0 or c2_size == 0:
            print("There is a size error from pair (%s,%s)" %
                  (smiles1, smiles2))
            continue

        # Node features of both molecules, each normalized by its sum.
        features = []
        for mol in (mol1, mol2):
            for atom in mol.GetAtoms():
                feature = atom_features(atom)
                features.append(feature / sum(feature))

        # Bonds of the second molecule are shifted past the first molecule.
        edges = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()]
                 for bond in mol1.GetBonds()]
        edges.extend(
            [bond.GetBeginAtomIdx() + c1_size,
             bond.GetEndAtomIdx() + c1_size]
            for bond in mol2.GetBonds())
        if len(edges) == 0:
            continue

        # Going through networkx yields every edge in both directions.
        G = nx.Graph(edges).to_directed()
        edge_index = [[e1, e2] for e1, e2 in G.edges]

        GraphSiameseData = DATA.Data(
            x=torch.Tensor(features),
            edge_index=torch.LongTensor(edge_index).transpose(1, 0),
            y=torch.Tensor(label).view(1, -1))
        GraphSiameseData.__setitem__('c1_size', torch.LongTensor([c1_size]))
        GraphSiameseData.__setitem__('c2_size', torch.LongTensor([c2_size]))
        data_list.append(GraphSiameseData)

    if self.pre_filter is not None:
        data_list = [data for data in data_list if self.pre_filter(data)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(data) for data in data_list]

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def structure_standardization(smi: str) -> str:
    """
    Standardize a SMILES string with RDKit's rdMolStandardize tools.

    The input is parsed into an RDKit mol object and its atom count is
    compared with the configured ``max_num_atoms`` limit. The molecule is then
    reduced to its charge parent, isotopes are reset, stereo information is
    removed if configured, the canonical tautomer is chosen and the result is
    standardized and written back as SMILES. Every failure is reported
    through ``logging.error``.

    Args:
        smi (str): Non-standardized smiles string

    Returns:
        str: standardized smiles string; ``np.nan`` if the SMILES cannot be
        read, the molecule is larger than the configured limit, or
        standardization raises ValueError/AttributeError.
    """
    settings = ConfigDict.get_parameters()["standardization"]
    max_num_atoms = settings["max_num_atoms"]
    max_num_tautomers = settings["max_num_tautomers"]
    include_stereoinfo = settings["include_stereoinfo"]

    # Tautomer enumerator/canonicalizer
    tautomerizer = rdMolStandardize.TautomerEnumerator()
    tautomerizer.SetMaxTautomers(max_num_tautomers)
    # Keep stereo information of keto/enol tautomerization
    tautomerizer.SetRemoveSp3Stereo(False)

    def strip_isotopes(molecule: Chem.Mol) -> Chem.Mol:
        """
        Return a copy of the molecule with the isotope of every atom reset to 0
        (isotope parent as in MolVS).

        Args:
            molecule (Chem.Mol): input rdkit mol object

        Returns:
            Chem.Mol: isotope parent rdkit mol object
        """
        parent = copy.deepcopy(molecule)
        for atom in parent.GetAtoms():
            atom.SetIsotope(0)
        return parent

    def standardize(molecule: Chem.Mol) -> Chem.Mol:
        """
        Standardization sequence modelled on MolVS: sanitize, remove
        hydrogens, disconnect metals, normalize, reionize and reassign
        stereochemistry. Works on a copy of the input.

        Args:
            molecule (Chem.Mol): non-standardized rdkit mol object

        Returns:
            Chem.Mol: standardized rdkit mol object
        """
        result = copy.deepcopy(molecule)
        Chem.SanitizeMol(result)
        result = Chem.RemoveHs(result)
        result = rdMolStandardize.MetalDisconnector().Disconnect(result)
        result = rdMolStandardize.Normalizer().normalize(result)
        result = rdMolStandardize.Reionizer().reionize(result)
        # TODO: Check this removes symmetric stereocenters
        Chem.AssignStereochemistry(result, force=True, cleanIt=True)
        return result

    # Read SMILES and convert it to RDKit mol object.
    mol = MolFromSmiles(smi)
    if mol is None:
        logging.error("Reading Error, " + smi)
        return np.nan

    # Size check on the atom count of the parsed molecule.
    if mol.GetNumAtoms() > max_num_atoms:
        logging.error("Molecule too large, " + smi)
        return np.nan

    try:
        mol = rdMolStandardize.ChargeParent(mol)
        mol = strip_isotopes(mol)
        # Stereo information is only stripped when explicitly disabled.
        if include_stereoinfo is False:
            Chem.RemoveStereochemistry(mol)
        mol = tautomerizer.Canonicalize(mol)
        # Convert the standardized mol object back to SMILES.
        return MolToSmiles(standardize(mol))
    except (ValueError, AttributeError) as e:
        # Write molecules that failed standardization to the log file.
        logging.error(
            "Standardization error, " + smi + ", Error Type: " + str(e))
        return np.nan
def extract_graph(data_path, out_file_path, max_atom_num, label_name=None):
    """Convert the SMILES column of a CSV file into padded graph matrices and save them.

    For every molecule with at most ``max_atom_num`` atoms, an adjacency
    matrix, a distance matrix (from RDKit 2D coordinates) and a node attribute
    matrix are built, each padded to ``max_atom_num``. Larger molecules and
    unparsable SMILES are reported and skipped. All matrices are written to
    ``out_file_path`` with ``np.savez_compressed``.

    :param data_path: path of a CSV file with a ``SMILES`` column
    :param out_file_path: destination of the compressed ``.npz`` archive
    :param max_atom_num: maximum number of atoms per molecule; also the padded
        matrix size
    :param label_name: optional column holding the labels; the labels of the
        kept rows are stored in the archive under the key ``label_name``
    :return: None
    """
    import os
    from rdkit import RDConfig
    from rdkit.Chem import ChemicalFeatures

    fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdefName)

    data_pd = pd.read_csv(data_path)
    smiles_list = data_pd['SMILES'].tolist()

    symbol_candidates = set()
    atom_attribute_dim = num_atom_features()

    node_attribute_matrix_list = []
    # Never filled; kept so the archive retains its bond attribute entry.
    bond_attribute_matrix_list = []
    adjacent_matrix_list = []
    distance_matrix_list = []
    valid_index = []

    # Value ranges seen over the whole data set (printed at the end).
    degree_set = set()
    h_num_set = set()
    implicit_valence_set = set()
    charge_set = set()

    for line_idx, smiles in enumerate(smiles_list):
        smiles = smiles.strip()
        mol = MolFromSmiles(smiles)
        if mol is None:
            print('Unparsable SMILES at row {}: {}'.format(line_idx, smiles))
            continue

        # Filter by size before doing any expensive work on the molecule.
        n_atoms = mol.GetNumAtoms()
        if n_atoms > max_atom_num:
            print('Outlier {} has {} atoms'.format(line_idx, n_atoms))
            continue
        valid_index.append(line_idx)

        AllChem.Compute2DCoords(mol)
        conformer = mol.GetConformers()[0]

        # Sets allow repeated membership tests: a map/filter object is a
        # one-shot iterator on Python 3 and gives wrong answers once consumed.
        feats = factory.GetFeaturesForMol(mol)
        acceptor_atom_ids = {
            feat.GetAtomIds()[0]
            for feat in feats if feat.GetFamily() == 'Acceptor'
        }
        donor_atom_ids = {
            feat.GetAtomIds()[0]
            for feat in feats if feat.GetFamily() == 'Donor'
        }

        adjacent_matrix = np.zeros((max_atom_num, max_atom_num), dtype=int)
        distance_matrix = np.zeros((max_atom_num, max_atom_num))
        node_attribute_matrix = np.zeros(
            (max_atom_num, atom_attribute_dim), dtype=int)

        for atom in mol.GetAtoms():
            atom_idx = atom.GetIdx()
            symbol_candidates.add(atom.GetSymbol())
            degree_set.add(atom.GetDegree())
            h_num_set.add(atom.GetTotalNumHs())
            implicit_valence_set.add(atom.GetImplicitValence())
            charge_set.add(atom.GetFormalCharge())
            node_attribute_matrix[atom_idx] = extract_atom_features(
                atom,
                is_acceptor=atom_idx in acceptor_atom_ids,
                is_donor=atom_idx in donor_atom_ids)
        node_attribute_matrix_list.append(node_attribute_matrix)

        # Symmetric pairwise distances between the 2D atom positions.
        positions = [conformer.GetAtomPosition(i) for i in range(n_atoms)]
        for idx_i in range(n_atoms):
            for idx_j in range(idx_i + 1, n_atoms):
                distance = get_atom_distance(positions[idx_i],
                                             positions[idx_j])
                distance_matrix[idx_i, idx_j] = distance
                distance_matrix[idx_j, idx_i] = distance
        distance_matrix_list.append(distance_matrix)

        for bond in mol.GetBonds():
            begin_index = bond.GetBeginAtomIdx()
            end_index = bond.GetEndAtomIdx()
            adjacent_matrix[begin_index, end_index] = 1
            adjacent_matrix[end_index, begin_index] = 1
        adjacent_matrix_list.append(adjacent_matrix)

    adjacent_matrix_list = np.asarray(adjacent_matrix_list)
    distance_matrix_list = np.asarray(distance_matrix_list)
    node_attribute_matrix_list = np.asarray(node_attribute_matrix_list)
    bond_attribute_matrix_list = np.asarray(bond_attribute_matrix_list)
    print('adjacent matrix shape\t', adjacent_matrix_list.shape)
    print('distance matrix shape\t', distance_matrix_list.shape)
    print('node attr matrix shape\t', node_attribute_matrix_list.shape)
    print('bond attr matrix shape\t', bond_attribute_matrix_list.shape)
    print(symbol_candidates)
    print('{} valid out of {}'.format(len(valid_index), len(smiles_list)))
    print('degree set:\t', degree_set)
    print('h num set: \t', h_num_set)
    print('implicit valence set: \t', implicit_valence_set)
    print('charge set:\t', charge_set)

    if label_name is None:
        np.savez_compressed(
            out_file_path,
            adjacent_matrix_list=adjacent_matrix_list,
            distance_matrix_list=distance_matrix_list,
            node_attribute_matrix_list=node_attribute_matrix_list,
            bond_attribute_matrix_list=bond_attribute_matrix_list)
    else:
        # Keep only the labels of the rows that made it into the matrices.
        true_labels = np.array(data_pd[label_name].tolist())
        valid_index = np.array(valid_index)
        true_labels = true_labels[valid_index]
        np.savez_compressed(
            out_file_path,
            adjacent_matrix_list=adjacent_matrix_list,
            distance_matrix_list=distance_matrix_list,
            node_attribute_matrix_list=node_attribute_matrix_list,
            bond_attribute_matrix_list=bond_attribute_matrix_list,
            label_name=true_labels)
    print()
    return
def process(self):
    """Build the binary drug-pair dataset and save it to ``self.processed_paths[0]``.

    Returns immediately if ``Decagon-<datatype>.pt`` already exists in
    ``self.processed_dir``. Side-effect names (last CSV column of
    ``self.total_data_dir``) are label-encoded. Then the ``negative`` (label 0)
    and ``positive`` (label 1) CSV files are read; every row holds a SMILES
    pair followed by a side-effect name. Each pair becomes one graph holding
    the atoms and bonds of both molecules (indices of the second molecule are
    shifted by the size of the first), plus a one-hot side-effect vector.
    Pairs with an unparsable or empty molecule, or without any bond, are
    skipped.
    """
    processed_file = os.path.join(
        self.processed_dir, 'Decagon-{}.pt'.format(self.datatype))
    if osp.exists(processed_file):
        return

    data_list = []

    # >>> Obtain One-Hot Encoding for Side-Effects
    with open(self.total_data_dir, 'r', encoding='utf-8') as f:
        target_list = [line[-1] for line in csv.reader(f)]

    label_encoder = LabelEncoder()
    label_encoder.fit(target_list)
    label_list = label_encoder.transform(target_list)
    num_classes = len(label_encoder.classes_)
    # Side-effect name -> encoded class index.
    target_dict = dict(zip(target_list, label_list))

    # negative will be 0, positive will be 1
    for label_idx, mode in enumerate(['negative', 'positive']):
        pair_list, se_list = [], []
        mode_file = osp.join(
            self.dataset_dir,
            'Decagon-{}-{}.csv'.format(mode, self.datatype))
        with open(mode_file, 'r', encoding='utf-8') as f:
            for line in csv.reader(f):
                se_list.append(line[-1])
                pair_list.append(line[:-1])

        one_hot = [0] * num_classes
        total = len(pair_list)
        for idx, ((smiles1, smiles2), se) in enumerate(
                zip(pair_list, se_list)):
            side_effect = one_hot.copy()
            side_effect[target_dict[se]] = 1
            printProgress(idx + 1, total,
                          '{} dataset preparation: '.format(self.datatype),
                          ' ', 2, 50)
            mol1 = MolFromSmiles(smiles1)
            mol2 = MolFromSmiles(smiles2)
            label = [int(label_idx)]

            # Report the SMILES of the pair: the parsed objects would only
            # show up as None or an opaque object representation.
            if mol1 is None or mol2 is None:
                print("There is a missing drug from the pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            # >>> Get pairwise graph G1, G2
            c1_size = mol1.GetNumAtoms()
            c2_size = mol2.GetNumAtoms()
            if c1_size == 0 or c2_size == 0:
                print("There is a size error from pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            # Node features of both molecules, each normalized by its sum.
            features = []
            for mol in (mol1, mol2):
                for atom in mol.GetAtoms():
                    feature = atom_features(atom)
                    features.append(feature / sum(feature))

            # Bonds of the second molecule are shifted past the first one.
            edges = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()]
                     for bond in mol1.GetBonds()]
            edges.extend(
                [bond.GetBeginAtomIdx() + c1_size,
                 bond.GetEndAtomIdx() + c1_size]
                for bond in mol2.GetBonds())
            if len(edges) == 0:
                continue

            # Going through networkx yields every edge in both directions.
            G = nx.Graph(edges).to_directed()
            edge_index = [[e1, e2] for e1, e2 in G.edges]

            GraphSiameseData = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.Tensor(label).view(-1, 1))
            GraphSiameseData.__setitem__('c1_size',
                                         torch.LongTensor([c1_size]))
            GraphSiameseData.__setitem__('c2_size',
                                         torch.LongTensor([c2_size]))
            GraphSiameseData.__setitem__(
                'side_effect', torch.Tensor(side_effect).view(1, -1))
            data_list.append(GraphSiameseData)

    if self.pre_filter is not None:
        data_list = [data for data in data_list if self.pre_filter(data)]
    if self.pre_transform is not None:
        data_list = [self.pre_transform(data) for data in data_list]

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])