def construct_RGCN_bigraph_from_smiles(smiles):
    """Build a bidirected DGL graph for an RGCN from a SMILES string.

    Every atom becomes a node and every bond contributes two directed edges
    (``u -> v`` and ``v -> u``) that share the same edge-type feature.

    Args:
        smiles: SMILES string handed to ``MolFromSmiles``.

    Returns:
        A ``DGLGraph`` with

        * ``ndata["atom"]``: one ``atom_features`` row per atom,
        * ``edata["etype"]``: the ``etype_features`` value of each directed
          edge,
        * ``edata["normal"]``: for each directed edge, the fraction of all
          directed edges that carry an equal etype value, rounded to one
          decimal place.

    Note:
        The result of ``MolFromSmiles`` is used without a validity check, so
        an unparsable SMILES string fails on the first molecule access.
    """
    # Function-scope import: keeps this block independent of the file header.
    from collections import Counter

    g = DGLGraph()

    # Add nodes
    mol = MolFromSmiles(smiles)
    g.add_nodes(mol.GetNumAtoms())
    g.ndata["atom"] = torch.tensor(
        [atom_features(atom).tolist() for atom in mol.GetAtoms()]
    )

    # Add edges: each bond is stored once per direction.
    src_list = []
    dst_list = []
    etype_feature_all = []
    for bond_idx in range(mol.GetNumBonds()):
        bond = mol.GetBondWithIdx(bond_idx)
        etype_feature = etype_features(bond)
        u = bond.GetBeginAtomIdx()
        v = bond.GetEndAtomIdx()
        src_list.extend([u, v])
        dst_list.extend([v, u])
        etype_feature_all.extend([etype_feature, etype_feature])
    g.add_edges(src_list, dst_list)

    # Relative frequency of every edge type. Counting once up front avoids
    # rescanning the whole list for each edge.
    total = len(etype_feature_all)
    try:
        counts = Counter(etype_feature_all)
    except TypeError:
        # Unhashable etype values (e.g. lists): fall back to equality scans.
        normal_all = [
            round(etype_feature_all.count(etype) / total, 1)
            for etype in etype_feature_all
        ]
    else:
        normal_all = [
            round(counts[etype] / total, 1) for etype in etype_feature_all
        ]

    g.edata["etype"] = torch.tensor(etype_feature_all)
    g.edata["normal"] = torch.tensor(normal_all)
    return g
def processline(t, step, line):
    """Run one benchmark step on ``line`` and accumulate into ``lensum``.

    ``t.incr()`` is consulted first; when it returns a truthy value the line
    is not processed and 1 is returned. Otherwise the work selected by
    ``step`` is performed, a size measure of its result is added to the
    module-level ``lensum``, and 0 is returned.

    Steps:
        0    length of the raw line (no molecule is parsed)
        100  parse the molecule, add the length of the raw line
        105  length of the SHA-256 hex digest of the line
        110  write the line to ``tmpname``, add the file size
        120  same as 110 with an ``os.fsync`` before the file is closed
        210  number of atoms
        220  number of bonds
        300  length of ``MolToSmiles`` output
        400  length of ``MolToMolBlock`` output
        420  add Hs, embed (seed 2020), remove Hs, length of the mol block
        600  ``mol2file(m, 'svg')``
        610  ``mol2file(m, 'png')``

    Raises:
        ValueError: for any other ``step`` (raised after the line has
            already been parsed).
    """
    global lensum

    if t.incr():
        return 1

    # Step 0 is the baseline: nothing is parsed.
    if step == 0:
        lensum += len(line)
        return 0

    mol = MolFromSmiles(line)

    if step == 100:
        lensum += len(line)
    elif step == 105:
        digest = sha256(line).hexdigest()
        lensum += len(digest)
    elif step == 110 or step == 120:
        with open(tmpname, 'wb+') as f:
            print(line, file=f)
            if step == 120:
                os.fsync(f.fileno())
        # The size is read once the with-block has closed the file.
        lensum += os.stat(tmpname).st_size
    elif step == 210:
        lensum += mol.GetNumAtoms()
    elif step == 220:
        lensum += mol.GetNumBonds()
    elif step == 300:
        lensum += len(MolToSmiles(mol))
    elif step == 400:
        lensum += len(MolToMolBlock(mol))
    elif step == 420:
        embedded = AddHs(mol)
        EmbedMolecule(embedded, randomSeed=2020)
        embedded = RemoveHs(embedded)
        embedded.SetProp("_Name", "test")
        lensum += len(MolToMolBlock(embedded))
    elif step in (600, 610):
        lensum += mol2file(mol, 'svg' if step == 600 else 'png')
    else:
        raise ValueError("Not implemented step " + str(step))

    return 0
def construct_feature_matrices(self, smiles, train=True):
    """Tokenize the atoms and bonds of the molecule given by ``smiles``.

    Each bond is emitted once per incident atom, i.e. twice in total, with
    the atom currently being visited as the source of the edge. A molecule
    without bonds still gets arrays of length one (left at zero) and a
    warning is logged.

    When ``train`` is true, ``self.max_atoms`` and ``self.max_bonds`` are
    raised to this molecule's atom and bond counts if those are larger.

    Returns dict with entries
    'n_atom'       : number of atoms in the molecule
    'n_bond'       : number of bonds in the molecule (undirected count)
    'bond_indices' : per directed edge, index of the originating bond
    'atom'         : (n_atom,) array of atom classes
    'bond'         : per directed edge, the bond class
    'connectivity' : per directed edge, (source atom, target atom) pair
    """
    self.atom_tokenizer.train = train
    self.bond_tokenizer.train = train

    logger = logging.getLogger(__name__)

    mol = MolFromSmiles(smiles)
    if self.explicit_hs:
        mol = AddHs(mol)

    n_atom = mol.GetNumAtoms()
    n_real_bonds = mol.GetNumBonds()

    # Two directed edges per bond; an isolated atom gets a single self-link.
    n_edges = 2 * n_real_bonds
    if n_edges == 0:
        n_edges = 1
        logger.warning(f'Found molecule {smiles} with zero bonds')

    atom_feature_matrix = np.zeros(n_atom, dtype='int')
    bond_feature_matrix = np.zeros(n_edges, dtype='int')
    bond_indices = np.zeros(n_edges, dtype='int')
    connectivity = np.zeros((n_edges, 2), dtype='int')

    edge = 0
    for atom_pos, atom in enumerate(mol.GetAtoms()):
        # Atom classes
        atom_feature_matrix[atom_pos] = self.atom_tokenizer(
            self.atom_features(atom))

        origin = atom.GetIdx()
        for bond in atom.GetBonds():
            begin = bond.GetBeginAtomIdx()
            end = bond.GetEndAtomIdx()

            # The bond is "flipped" when it does not start at this atom.
            flipped = begin != origin

            # Bond classes
            bond_feature_matrix[edge] = self.bond_tokenizer(
                self.bond_features(bond, flipped=flipped))

            # Remember which original bond this directed edge came from.
            bond_indices[edge] = bond.GetIdx()

            # Connectivity: original direction, or reversed when flipped.
            connectivity[edge] = (end, begin) if flipped else (begin, end)

            edge += 1

    # Track the largest atom and bond counts seen during training.
    if train:
        if n_atom > self.max_atoms:
            self.max_atoms = n_atom
        if n_real_bonds > self.max_bonds:
            self.max_bonds = n_real_bonds

    return {
        'n_atom': n_atom,
        'n_bond': n_real_bonds,  # the real number of bonds
        'bond_indices': bond_indices,
        'atom': atom_feature_matrix,
        'bond': bond_feature_matrix,
        'connectivity': connectivity,
    }
def generate_graph(smiles, label=None):
    """Convert a SMILES string into a ``Graph`` of atom and bond features.

    Args:
        smiles: SMILES string to parse with ``MolFromSmiles``.
        label: optional target value; stored as a ``(1, 1)`` array.

    Returns:
        ``Graph(smiles, node_features, bond_feat, bond_index, label)`` where
        ``node_features`` is a list of 18 per-atom arrays (symbol,
        hybridization, degree, num_h, chirality, aromatic, formal_charge,
        radical_electrons, AtomicWeight, AtomicNumber, Rvdw, RCovalent,
        DefaultValence, valence, NOuterElecs, ring, acceptor, donor),
        ``bond_feat`` holds 10 features per directed edge and ``bond_index``
        holds the ``[i, j]`` atom pair of each directed edge.

    Raises:
        ValueError: if the SMILES string cannot be parsed.
    """
    mol = MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse SMILES string:", smiles)

    SYMBOL = [
        'B', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'As', 'Se', 'Br',
        'Te', 'I', 'At', 'other'
    ]
    HYBRIDIZATION = [
        Chem.rdchem.HybridizationType.SP,
        Chem.rdchem.HybridizationType.SP2,
        Chem.rdchem.HybridizationType.SP3,
        Chem.rdchem.HybridizationType.SP3D,
        Chem.rdchem.HybridizationType.SP3D2,
        'other',
    ]

    num_atom = Chem.RemoveHs(mol).GetNumAtoms()

    symbol = np.zeros((num_atom, 16), np.uint8)
    hybridization = np.zeros((num_atom, 6), np.uint8)
    degree = np.zeros((num_atom, 6), np.uint8)
    num_h = np.zeros((num_atom, 5), np.uint8)
    chirality = np.zeros((num_atom, 3), np.uint8)
    aromatic = np.zeros((num_atom, 1), np.uint8)
    formal_charge = np.zeros((num_atom, 1), np.float32)
    radical_electrons = np.zeros((num_atom, 1), np.float32)

    for i in range(num_atom):
        atom = mol.GetAtomWithIdx(i)
        symbol[i] = one_of_k_encoding_unk(atom.GetSymbol(), SYMBOL)
        hybridization[i] = one_of_k_encoding_unk(atom.GetHybridization(),
                                                 HYBRIDIZATION)
        degree[i] = one_of_k_encoding_unk(atom.GetDegree(),
                                          [0, 1, 2, 3, 4, 5])
        num_h[i] = one_of_k_encoding_unk(
            atom.GetTotalNumHs(includeNeighbors=True), [0, 1, 2, 3, 4])
        try:
            chirality[i] = one_of_k_encoding_unk(atom.GetProp('_CIPCode'),
                                                 ['R', 'S', 'unknown'])
        except KeyError:
            # Atoms without an assigned CIP code have no '_CIPCode' property;
            # only that case is treated as "no chirality".
            chirality[i] = [0, 0, 0]
        aromatic[i] = atom.GetIsAromatic()
        formal_charge[i] = atom.GetFormalCharge()
        radical_electrons[i] = atom.GetNumRadicalElectrons()

    # Abundant features.
    # These won't bring substantial change to predictive performance,
    # sometimes they even make it worse.
    AtomicWeight = np.zeros((num_atom, 1), np.float32)
    AtomicNumber = np.zeros((num_atom, 1), np.float32)
    Rvdw = np.zeros((num_atom, 1), np.float32)
    RCovalent = np.zeros((num_atom, 1), np.float32)
    DefaultValence = np.zeros((num_atom, 1), np.float32)
    valence = np.zeros((num_atom, 1), np.float32)
    NOuterElecs = np.zeros((num_atom, 1), np.float32)
    ring = np.zeros((num_atom, 7), np.uint8)
    acceptor = np.zeros((num_atom, 1), np.uint8)
    donor = np.zeros((num_atom, 1), np.uint8)

    # Look the periodic table up once instead of once per property per atom.
    periodic_table = Chem.GetPeriodicTable()
    for i in range(num_atom):
        atom = mol.GetAtomWithIdx(i)
        AtomicNum = atom.GetAtomicNum()
        AtomicNumber[i] = AtomicNum
        AtomicWeight[i] = periodic_table.GetAtomicWeight(AtomicNum)
        Rvdw[i] = periodic_table.GetRvdw(AtomicNum)  # van der Waals radius
        RCovalent[i] = periodic_table.GetRcovalent(AtomicNum)  # covalent radius
        DefaultValence[i] = periodic_table.GetDefaultValence(AtomicNum)
        valence[i] = atom.GetExplicitValence()
        NOuterElecs[i] = periodic_table.GetNOuterElecs(AtomicNum)
        # Ring membership flag followed by membership in rings of size 3..8.
        ring[i] = [int(atom.IsInRing())] + [
            int(atom.IsInRingSize(size)) for size in (3, 4, 5, 6, 7, 8)
        ]

    factory = ChemicalFeatures.BuildFeatureFactory(
        os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
    for feat in factory.GetFeaturesForMol(mol):
        family = feat.GetFamily()
        if family == 'Donor':
            for i in feat.GetAtomIds():
                donor[i] = 1
        elif family == 'Acceptor':
            for i in feat.GetAtomIds():
                acceptor[i] = 1

    num_bond = mol.GetNumBonds()
    if num_bond == 0:
        # Keep the bond arrays non-empty for molecules without bonds
        # (e.g. CH4, NH3); the rows stay zero-filled.
        num_bond = 1

    bond_feat = np.zeros((num_bond * 2, 10), np.int16)
    bond_index = np.zeros((num_bond * 2, 2), np.int16)

    BOND_TYPE = [
        Chem.rdchem.BondType.SINGLE,
        Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE,
        Chem.rdchem.BondType.AROMATIC,
    ]
    BOND_STEREO = ["STEREONONE", "STEREOANY", "STEREOZ", "STEREOE"]

    # Visit ordered atom pairs so every bond yields both (i, j) and (j, i),
    # in ascending (i, j) order.
    ij = 0
    for i in range(num_atom):
        for j in range(num_atom):
            if i == j:
                continue
            bond = mol.GetBondBetweenAtoms(i, j)
            if bond is None:
                continue
            bond_index[ij] = [i, j]
            bond_type = one_of_k_encoding(bond.GetBondType(), BOND_TYPE)
            bond_ring = [bond.GetIsConjugated(), bond.IsInRing()]
            bond_stereo = one_of_k_encoding(str(bond.GetStereo()),
                                            BOND_STEREO)
            bond_feat[ij] = bond_type + bond_ring + bond_stereo
            ij += 1

    graph = Graph(
        smiles,
        [symbol, hybridization, degree, num_h, chirality, aromatic,
         formal_charge, radical_electrons,
         AtomicWeight, AtomicNumber, Rvdw, RCovalent, DefaultValence,
         valence, NOuterElecs, ring, acceptor, donor],
        bond_feat,
        bond_index,
        np.array(label).reshape((1, 1)),
    )
    return graph
def parse_smiles_str(self, smiles_str, id, target=None):
    """Parse a SMILES string into a dictionary describing a molecular graph.

    Args:
        smiles_str: SMILES string to parse with RDKit.
        id: identifier of the molecule; converted to an ``np.int64`` array
            unless it already is a NumPy array.
        target: optional target value(s); scalars, sequences and arrays are
            accepted and stored as a ``(1, n_targets)`` float32 array.

    Returns:
        ``None`` if the SMILES string cannot be parsed, otherwise a dict with

        * ``'node_features'``: ``(n_nodes, num_node_features)`` float32 array
        * ``'edge_features'``: ``(n_edges, num_edge_features)`` float32 array
        * ``'adj_mat'``: ``(2 * n_edges, 2)`` sorted ``[atom, atom]`` pairs
        * ``'inc_mat'``: ``(2 * n_edges, 2)`` sorted ``[atom, bond]`` pairs
        * ``'target'``: ``(1, n_targets)`` float32 array (``(1, 0)`` if none)
        * ``'id'``: the identifier as a NumPy array
        * ``'shape'``: int64 array ``(n_nodes, n_edges, num_node_features,
          num_edge_features, n_targets, 1)``; the trailing 1 is the number
          of graphs, kept for compatibility with batched graphs.
    """
    # Use RDKit to parse the SMILES string
    mol = MolFromSmiles(smiles_str)
    if not mol:
        return None

    # Represent hydrogen atoms explicitly (if necessary)
    if self.config['explicit_Hs']:
        mol = Chem.AddHs(mol)

    # Number of nodes (atoms) and edges (bonds)
    n_nodes, n_edges = mol.GetNumAtoms(), mol.GetNumBonds()

    # Allocate the arrays representing the molecular graph
    node_features = np.zeros((n_nodes, self.num_node_features),
                             dtype=np.float32)
    edge_features = np.zeros((n_edges, self.num_edge_features),
                             dtype=np.float32)
    # Sparse adjacency and incidence matrices: each bond adds two rows.
    adj_mat = np.zeros((2 * n_edges, 2), dtype=np.int64)
    inc_mat = np.zeros((2 * n_edges, 2), dtype=np.int64)

    # Retrieve node (atom) features, if needed
    if self.num_node_features > 0:
        for i, atom in enumerate(mol.GetAtoms()):
            node_features[i] = self.get_node_features(atom)

    # Retrieve edges (bonds)
    for i, bond in enumerate(mol.GetBonds()):
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()

        # The two index pairs this bond contributes to the adjacency matrix
        adj_mat[2 * i] = [begin, end]
        adj_mat[2 * i + 1] = [end, begin]

        # The two index pairs this bond contributes to the incidence matrix
        inc_mat[2 * i] = [begin, i]
        inc_mat[2 * i + 1] = [end, i]

        # Retrieve edge (bond) features, if needed
        if self.num_edge_features > 0:
            edge_features[i] = self.get_edge_features(bond)

    # Sort the adjacency and incidence matrices lexicographically
    # (first column is the primary key, second column breaks ties).
    adj_mat = adj_mat[np.lexsort((adj_mat[:, 1], adj_mat[:, 0]))]
    inc_mat = inc_mat[np.lexsort((inc_mat[:, 1], inc_mat[:, 0]))]

    # Represent the molecular graph as a dictionary
    g = {
        'node_features': node_features,
        'edge_features': edge_features,
        'adj_mat': adj_mat,
        'inc_mat': inc_mat,
    }

    # Add target(s) (if any) as a NumPy array (which provides tobytes())
    if target is not None:
        # Convert scalars and sequences to a NumPy array
        if not isinstance(target, np.ndarray):
            target = np.array(target, np.float32)
        # Ensure the target is of type np.float32
        target = target.astype(np.float32)
        # Flatten targets of rank >= 2
        if target.ndim > 1:
            target = target.flatten()
        # Store the target as a (row) 2D array (for compatibility)
        g['target'] = np.reshape(target, (1, -1))
        n_targets = g['target'].shape[1]
    else:
        # No targets: add an empty array (for compatibility)
        g['target'] = np.zeros((1, 0), dtype=np.float32)
        n_targets = 0

    # Add the ID as a NumPy array (which provides tobytes()). The check must
    # look at `id` itself: testing `target` here skipped the conversion
    # whenever a target was supplied.
    if not isinstance(id, np.ndarray):
        id = np.array(id, np.int64)
    g['id'] = id

    # Finally, add shape information. The last element is the number of
    # graphs and is included for compatibility with batched graphs.
    g['shape'] = np.array(
        (n_nodes, n_edges, self.num_node_features, self.num_edge_features,
         n_targets, 1),
        np.int64)

    return g