def __get_context_env(mol, radius):
    """
    Trim a context molecule to the neighbourhood of its attachment point(s).

    INPUT:
        mol - Mol object containing chain(s) of molecular context
        radius - integer, number of bonds to cut context
    OUTPUT:
        Mol containing only atoms within the specified radius from the
        attachment point(s). All explicit Hs will be stripped. Each atom that
        is cut away but was bonded to a kept atom is replaced by a newly added
        dummy atom (atomic number 0) carrying the original bond type(s).
    """
    # mol is context consisting of one or more groups with single attachment point
    rw_mol = Chem.RWMol(Chem.RemoveHs(mol))

    # Bonds of the environment around every attachment point ("*").
    env_bonds = set()
    for atom in rw_mol.GetAtoms():
        if atom.GetSymbol() != "*":
            continue
        root = atom.GetIdx()
        current = radius
        found = Chem.FindAtomEnvironmentOfRadiusN(rw_mol, current, root)
        # Shrink the radius until an environment comes back (or 0 is reached).
        while not found and current > 0:
            current -= 1
            found = Chem.FindAtomEnvironmentOfRadiusN(rw_mol, current, root)
        env_bonds.update(found)

    kept = set(__bonds_to_atoms(rw_mol, env_bonds))

    # First pass only records the caps; the molecule is edited afterwards so
    # that atom iteration is not disturbed by the additions.
    caps = []
    for atom in rw_mol.GetAtoms():
        idx = atom.GetIdx()
        if idx in kept:
            continue
        inside = set(nbr.GetIdx() for nbr in atom.GetNeighbors()) & kept
        if inside:
            caps.append([
                (j, rw_mol.GetBondBetweenAtoms(idx, j).GetBondType())
                for j in inside
            ])

    for cap in caps:
        dummy = rw_mol.AddAtom(Chem.Atom(0))
        for j, bond_type in cap:
            rw_mol.AddBond(dummy, j, bond_type)
        kept.add(dummy)

    return __get_submol(rw_mol, kept)
def bit2atom_mapping(self, mol_obj) -> Dict[int, List[AtomEnvironment]]:
    """Map every set bit of ``mol_obj`` to the atom environments behind it.

    ``self.explain_rdmol`` supplies ``{hash: [(central_atom, radius), ...]}``
    and ``self.bit_mapping`` translates each hash into the key used in the
    result (presumably the bit position -- verify against the class).

    Parameters
    ----------
    mol_obj
        Molecule passed to ``explain_rdmol`` and to the RDKit environment
        lookup.

    Returns
    -------
    Dict[int, List[AtomEnvironment]]
        A plain dict; each value holds one ``AtomEnvironment`` per match,
        built from the central atom, the radius and the set of atom indices
        in the environment.
    """
    bit_matches = {
        self.bit_mapping[hash_val]: matches
        for hash_val, matches in self.explain_rdmol(mol_obj).items()
    }

    environments = defaultdict(list)
    for bit, matches in bit_matches.items():
        for central_atom, radius in matches:
            if radius == 0:
                # A radius-0 feature is the central atom on its own.
                atoms = {central_atom}
            else:
                bond_path = Chem.FindAtomEnvironmentOfRadiusN(
                    mol_obj, radius, central_atom)
                atom_map = {}
                # Only the atom map filled in by PathToSubmol is needed.
                Chem.PathToSubmol(mol_obj, bond_path, atomMap=atom_map)
                assert central_atom in atom_map
                atoms = set(atom_map)
            environments[bit].append(
                AtomEnvironment(central_atom, radius, atoms))

    # Hand back a regular dict rather than the defaultdict.
    return dict(environments)
def getMorganEnvironment(mol, bitInfo, fp=None, minRad=0):
    """
    Collect the bond environments behind each Morgan fingerprint bit.

    For every ``(atomID, radius)`` entry of ``bitInfo`` whose radius is at
    least ``minRad``, the bond indices returned by
    ``Chem.FindAtomEnvironmentOfRadiusN`` are appended (as a list) under that
    bit. Entries below ``minRad`` are skipped; when ``fp`` is given the
    corresponding bit is additionally set to 0 in ``fp`` (in place).

    :param mol: molecule the fingerprint was generated from
    :param bitInfo: dict mapping bit -> iterable of (atomID, radius) pairs
    :param fp: optional indexable fingerprint to prune in place
    :param minRad: smallest radius that is kept
    :return: defaultdict(list) mapping bit -> list of bond-index lists

    >>> m = Chem.MolFromSmiles('CC(O)C')
    >>> bi = {}
    >>> fp = AllChem.GetMorganFingerprintAsBitVect(m,2,2048,bitInfo=bi)
    >>> getMorganEnvironment(m,bi)
    defaultdict(<class 'list'>, {1057: [[], []], 227: [[1]], 709: [[0, 1, 2]], 1: [[]], 283: [[0], [2]], 807: [[]]})
    >>> getMorganEnvironment(m,bi,minRad=1)
    defaultdict(<class 'list'>, {283: [[0], [2]], 227: [[1]], 709: [[0, 1, 2]]})
    >>> list(fp.GetOnBits())
    [1, 227, 283, 709, 807, 1057]
    >>> getMorganEnvironment(m,bi,minRad=1,fp=fp)
    defaultdict(<class 'list'>, {283: [[0], [2]], 227: [[1]], 709: [[0, 1, 2]]})
    >>> list(fp.GetOnBits())
    [227, 283, 709]
    """
    bitPaths = defaultdict(list)
    for bit, info in bitInfo.items():
        for atomID, radius in info:
            if radius < minRad:
                # Identity test on purpose: `fp != None` is evaluated
                # element-wise by array-like fingerprints and then cannot be
                # used as a condition.
                if fp is not None:
                    fp[bit] = 0
                continue
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomID)
            bitPaths[bit].append(list(env))
    return bitPaths
def find_feature_fragments(self, feature_num, mols, radius=3, nBits=1024):
    """Return the fragments that switch on Morgan bit ``feature_num``.

    Every molecule in ``mols`` is fingerprinted with
    ``GetMorganFingerprintAsBitVect(radius=radius, nBits=nBits)``; for each
    atom environment recorded under ``feature_num`` the sub-molecule is
    extracted and kept once per distinct, non-empty SMILES.

    Parameters
    ----------
    feature_num : int
        Bit whose fragments are requested.
    mols : iterable of rdkit Mol
        Molecules to scan.
    radius : int
        Morgan radius.
    nBits : int
        Length of the bit vector.

    Returns
    -------
    (list of str, list of Mol)
        Parallel lists: fragment SMILES in first-seen order and the matching
        sub-molecules. Both are empty when no molecule sets the bit.

    Raises
    ------
    ValueError
        If RDKit fails to build a sub-molecule for an environment (the
        original error is attached as the cause).
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem

    fragment_smiles = []
    fragment_mols = []
    seen = set()  # O(1) duplicate check; the lists keep first-seen order
    for mol in mols:
        bit_info = {}
        AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=radius, nBits=nBits, bitInfo=bit_info)
        # Only the requested bit is returned, so only its environments are
        # expanded.
        for atm_idx, rad in bit_info.get(feature_num, ()):
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, rad, atm_idx)
            try:
                submol = Chem.PathToSubmol(mol, env, atomMap={})
            except Exception as err:
                raise ValueError('feature does not turn on any bits') from err
            smi = Chem.MolToSmiles(submol)
            if smi != '' and smi not in seen:
                seen.add(smi)
                fragment_smiles.append(smi)
                fragment_mols.append(submol)
    return fragment_smiles, fragment_mols
def _featurize(self, mol):
    """
    Calculate circular fingerprint.

    With ``self.sparse`` unset a Morgan bit vector of ``self.size`` bits is
    returned. With ``self.sparse`` set the result is a dict of the non-zero
    fingerprint elements, mapping fragment id to count -- or, when
    ``self.smiles`` is also set, to ``{'smiles': ..., 'count': ...}`` where
    the SMILES describes the first environment recorded for that fragment.

    Parameters
    ----------
    mol : RDKit Mol
        Molecule.
    """
    if not self.sparse:
        return rdMolDescriptors.GetMorganFingerprintAsBitVect(
            mol,
            self.radius,
            nBits=self.size,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features)

    info = {}
    sparse_fp = rdMolDescriptors.GetMorganFingerprint(
        mol,
        self.radius,
        useChirality=self.chiral,
        useBondTypes=self.bonds,
        useFeatures=self.features,
        bitInfo=info)
    counts = sparse_fp.GetNonzeroElements()  # convert to a dict
    if not self.smiles:
        return counts

    # generate SMILES for fragments
    annotated = {}
    for fragment_id, count in counts.items():
        root, radius = info[fragment_id][0]
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
        frag = Chem.PathToSubmol(mol, env)
        annotated[fragment_id] = {
            'smiles': Chem.MolToSmiles(frag),
            'count': count,
        }
    return annotated
def getSubstructDepiction(mol, atomID, radius, molSize=(450, 200)):
    """
    Depict ``mol`` with one atom environment highlighted.

    The atoms of the environment are highlighted normally and the central
    atom is highlighted in blue.

    :param mol: molecule to draw
    :param atomID: index of the central atom
    :param radius: environment radius in bonds; 0 (or less) highlights only
        the central atom
    :param molSize: size forwarded to ``moltosvg``
    :return: whatever ``moltosvg`` returns
    """
    if radius > 0:
        touched = set()
        for bond_idx in Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomID):
            bond = mol.GetBondWithIdx(bond_idx)
            touched.add(bond.GetBeginAtomIdx())
            touched.add(bond.GetEndAtomIdx())
        atomsToUse = list(touched)
    else:
        atomsToUse = [atomID]
    return moltosvg(mol,
                    molSize=molSize,
                    highlightAtoms=atomsToUse,
                    highlightAtomColors={atomID: (0.3, 0.3, 1)})
def depict_atoms(mol, atom_ids, radii, molSize=(300, 300),
                 atm_color=(0, 1, 0), oth_color=(0.8, 1, 0)):
    """Get a depiction of molecular substructure.

    Useful for depicting bits in fingerprints. Inspired by:
    http://rdkit.blogspot.ch/2016/02/morgan-fingerprint-bit-statistics.html

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
    atom_ids : list
        Central atoms to depict.
    radii : list
        One radius (in bonds) per entry of ``atom_ids``; the environment of
        that radius around the atom is highlighted. A radius of 0 highlights
        the atom alone.
    molSize : tuple
    atm_color, oth_color : tuple
        Colors of the central atoms, and of the surrounding atoms and bonds.

    Returns
    -------
    The result of ``mol_to_svg`` (an IPython.display.SVG according to the
    original documentation).
    """
    atoms_to_use = []
    bonds = []
    for center, rad in zip(atom_ids, radii):
        if rad > 0:
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, rad, center)
            bonds += [b for b in env if b not in bonds]
            for b in env:
                bond = mol.GetBondWithIdx(b)
                atoms_to_use.append(bond.GetBeginAtomIdx())
                atoms_to_use.append(bond.GetEndAtomIdx())
            # Collapse duplicates gathered so far.
            atoms_to_use = list(set(atoms_to_use))
        else:
            atoms_to_use.append(center)

    colors = {x: atm_color for x in atom_ids}
    if sum(radii) == 0:
        # Only bare atoms were requested: no bond highlighting at all.
        return mol_to_svg(mol,
                          molSize=molSize,
                          highlightBonds=False,
                          highlightAtoms=atoms_to_use,
                          highlightAtomColors=colors)

    # Surrounding (non-central) atoms and all environment bonds share
    # oth_color.
    for x in atoms_to_use:
        if x not in atom_ids:
            colors[x] = oth_color
    bond_colors = {b: oth_color for b in bonds}
    return mol_to_svg(mol,
                      molSize=molSize,
                      highlightAtoms=atoms_to_use,
                      highlightAtomColors=colors,
                      highlightBonds=bonds,
                      highlightBondColors=bond_colors)
def create_histogram():
    """
    Uses the given structure and generates its substructure and the
    substructures frequency. This data is visualized with a histogram
    (png file).

    The reference structure is the first entry of the built-in SMILES list.
    Atom environments are enumerated for every SMILES in that list, but the
    per-molecule substructure list is reset on each iteration, so only the
    substructures of the LAST molecule are counted against the reference
    structure and plotted.
    NOTE(review): accumulating over all molecules may have been intended --
    confirm before changing, as it alters the plot.
    """
    first_smiles_list = ['C=C(C)C1CCC(C)=CCCc2coc(c2)CC2(C)OC2C1',
                         'c1nccc2n1ccc2', 'OCC=CC(=O)O', 'OC1C2C1CC2']
    structure = Chem.MolFromSmiles(first_smiles_list[0])  # Select structure

    smile_list = []
    for smile in first_smiles_list:
        m = Chem.MolFromSmiles(smile)
        nr_of_atoms = m.GetNumAtoms()

        # Generate all possible mol environments per structure
        # (every radius 0..n-1 around every atom).
        substructures_list = [
            Chem.FindAtomEnvironmentOfRadiusN(m, i, j)
            for i in range(nr_of_atoms)
            for j in range(nr_of_atoms)
        ]

        # Generate all possible substructures based on the mol envs.
        smile_list = []
        seen = set()  # O(1) duplicate check; smile_list keeps the order
        for env in substructures_list:
            submol = Chem.PathToSubmol(m, env, atomMap={})
            sub_smiles = Chem.MolToSmiles(submol, canonical=True)
            if sub_smiles != '' and sub_smiles not in seen:
                seen.add(sub_smiles)
                smile_list.append(sub_smiles)

    # Add the substructure to the 'all substructures list'
    # (fragments whose SMILES cannot be parsed back are dropped).
    sub_list = []
    for sub_smiles in smile_list:
        x = Chem.MolFromSmiles(sub_smiles)
        if x is not None:
            sub_list.append(x)

    # Number of matches of each substructure in the reference structure.
    sub_dict = {}
    for substructure in sub_list:
        match = structure.GetSubstructMatches(substructure)
        sub_dict[Chem.MolToSmiles(substructure)] = len(match)

    # Create and save histogram
    fig, ax = plt.subplots(figsize=(10, 5))
    plt.bar(list(sub_dict.keys()), sub_dict.values(), color='b')
    plt.xticks(fontsize=7, rotation=90)
    xlabel = plt.xlabel('Substructure smile')
    plt.ylabel('Substructure frequency')
    plt.title("Substructure frequency for structure XXX")
    fig.savefig('/path/to/histogram.png',
                bbox_extra_artists=[xlabel],
                bbox_inches='tight')
def get_substruct(mol, atom_idx, radius=1):
    """Return a sub-molecule around ``atom_idx``.

    Radii ``radius - 1`` down to ``0`` are tried in that order and the first
    sub-molecule with a non-empty SMILES is returned; if none is non-empty,
    the sub-molecule of the last radius tried is returned.
    NOTE(review): ``radius`` itself is never tried (``range(radius)`` stops
    one short) -- confirm whether that is intended.

    :param mol: RDKit molecule
    :param atom_idx: index of the central atom
    :param radius: exclusive upper bound of the radii tried; must be >= 1
    :return: sub-molecule created with ``Chem.PathToSubmol``
    :raises ValueError: if ``radius`` is smaller than 1 (previously this
        surfaced as an UnboundLocalError because no radius was tried)
    """
    if radius < 1:
        raise ValueError("radius must be >= 1, got %r" % (radius,))

    submol = None
    # this function creates submolecules
    for r in reversed(range(radius)):
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, r, atom_idx)
        submol = Chem.PathToSubmol(mol, env, atomMap={})
        if Chem.MolToSmiles(submol) != "":
            break
    return submol
def _featurize(self, datapoint: RDKitMol, **kwargs) -> np.ndarray:
    """Calculate circular fingerprint.

    Parameters
    ----------
    datapoint: rdkit.Chem.rdchem.Mol
        RDKit Mol object
    **kwargs
        Only the deprecated ``mol`` keyword is read; when present it replaces
        ``datapoint`` and a ``DeprecationWarning`` is issued.

    Returns
    -------
    np.ndarray
        A numpy array of circular fingerprint. When ``self.sparse`` is set a
        dict is returned instead: fragment id -> count, or fragment id ->
        ``{'smiles': ..., 'count': ...}`` when ``self.smiles`` is also set.

    Raises
    ------
    ImportError
        If RDKit is not installed.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdMolDescriptors
    except ModuleNotFoundError:
        raise ImportError("This class requires RDKit to be installed.")

    if 'mol' in kwargs:
        import warnings

        datapoint = kwargs.get("mol")
        # Warn instead of raising: raising made the assignment above dead
        # code and turned a deprecation notice into a hard failure.
        warnings.warn(
            'Mol is being phased out as a parameter, please pass "datapoint" instead.',
            DeprecationWarning,
            stacklevel=2)

    if self.sparse:
        info: Dict = {}
        fp = rdMolDescriptors.GetMorganFingerprint(
            datapoint,
            self.radius,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features,
            bitInfo=info)
        fp = fp.GetNonzeroElements()  # convert to a dict

        # generate SMILES for fragments
        if self.smiles:
            fp_smiles = {}
            for fragment_id, count in fp.items():
                # Only the first environment recorded for a fragment is used.
                root, radius = info[fragment_id][0]
                env = Chem.FindAtomEnvironmentOfRadiusN(
                    datapoint, radius, root)
                frag = Chem.PathToSubmol(datapoint, env)
                smiles = Chem.MolToSmiles(frag)
                fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
            fp = fp_smiles
    else:
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            datapoint,
            self.radius,
            nBits=self.size,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features)
        fp = np.asarray(fp, dtype=float)
    return fp
def use_rdkit3():
    """Print a short RDKit tour.

    Covers fingerprint similarities (topological, MACCS, Morgan), the
    explanation of Morgan bits, two descriptors and one reaction. Everything
    is written to stdout; nothing is returned.
    """
    # Fingerprinting and Molecular Similarity, default; Tanimoto similarity
    m_list2 = [Chem.MolFromSmiles(smi) for smi in ('CCOC', 'CCO', 'COC')]
    compared = ((0, 1), (0, 0), (2, 1))  # index pairs reported below

    fps = [FingerprintMols.FingerprintMol(x) for x in m_list2]
    for i, j in compared:
        print('Fingerprint Similarity -->',
              DataStructs.FingerprintSimilarity(fps[i], fps[j]))

    # MACCS keys
    fps = [MACCSkeys.GenMACCSKeys(x) for x in m_list2]
    for i, j in compared:
        print('Fingerprint MACCS keys -->',
              DataStructs.FingerprintSimilarity(fps[i], fps[j]))

    # Morgan/ circular fingerprints
    m7 = Chem.MolFromSmiles('CCOC')
    fp1 = AllChem.GetMorganFingerprint(m7, 2)
    m8 = Chem.MolFromSmiles('CCO')
    fp2 = AllChem.GetMorganFingerprint(m8, 2)
    print('Fingerprint Morgan similarity -->',
          DataStructs.DiceSimilarity(fp1, fp2))

    fp1 = AllChem.GetMorganFingerprintAsBitVect(m7, 2, nBits=1024)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(m8, 2, nBits=1024)
    print(DataStructs.DiceSimilarity(fp1, fp2))

    ffp1 = AllChem.GetMorganFingerprint(m7, 2, useFeatures=True)
    ffp2 = AllChem.GetMorganFingerprint(m8, 2, useFeatures=True)
    print(DataStructs.DiceSimilarity(ffp1, ffp2))

    # explaining bit (Morgan)
    m9 = Chem.MolFromSmiles('c1cccnc1C')
    info = {}
    AllChem.GetMorganFingerprint(m9, 2, bitInfo=info)
    print(info)
    # The two feature ids below are hard-coded and looked up directly.
    print(info[98513984])
    print(info[4048591891])

    env = Chem.FindAtomEnvironmentOfRadiusN(m9, 2, 5)
    amap = {}
    submol = Chem.PathToSubmol(m9, env, atomMap=amap)
    print(submol.GetNumAtoms())
    print(amap)
    # bit to smile
    print(Chem.MolToSmiles(submol))

    # Descriptor Calculation; used in papers or coding languages
    m6 = Chem.MolFromSmiles('c1ccccc1O')
    print('Descriptor TPSA -->', Descriptors.TPSA(m6))
    print('Descriptor MolLogP -->', Descriptors.MolLogP(m6))

    # Chemical reactions
    # Alternative reaction kept for reference:
    # rxn = AllChem.ReactionFromSmarts('[C:1]=[C:2].[C:3]=[*:4][*:5]=[C:6]>>[C:1]1[C:2][C:3][*:4]=[*:5][C:6]1')
    rxn = AllChem.ReactionFromSmarts(
        '[C:1](=[O:2])-[OD1].[N!H0:3]>>[C:1](=[O:2])[N:3]')
    reactants = (Chem.MolFromSmiles('CC(=O)O'), Chem.MolFromSmiles('NC=C'))
    ps = rxn.RunReactants(reactants)
    print('Reaction product -->', Chem.MolToSmiles(ps[0][0]))
def find_center_Environment(centers, radius):
    """Return the dot-separated SMILES of the environments of all centers.

    For every index in ``centers.atom_ids`` the environment of ``radius``
    bonds in ``centers.mol`` is converted to SMILES. Environments whose
    SMILES is empty are left out, so the result never contains leading,
    trailing or doubled "." separators.

    :param centers: object exposing ``mol`` (RDKit molecule) and
        ``atom_ids`` (iterable of atom indices)
    :param radius: environment radius in bonds
    :return: "smi1.smi2..." or "NA" when no environment yields a SMILES
    """
    fragments = []
    for atom_idx in centers.atom_ids:
        env = Chem.FindAtomEnvironmentOfRadiusN(centers.mol, radius, atom_idx)
        submol = Chem.PathToSubmol(centers.mol, env, atomMap={})
        env_smi = Chem.MolToSmiles(submol)
        if env_smi:
            fragments.append(env_smi)
    return ".".join(fragments) if fragments else "NA"
def __get_context_env(mol, radius):
    """
    Cut a context molecule down to the surroundings of its attachment points.

    INPUT:
        mol - Mol object containing chain(s) of molecular context
        radius - integer, number of bonds to cut context
    OUTPUT:
        Mol containing only atoms within the specified radius from the
        attachment point(s). All explicit Hs will be stripped.
    """
    # mol is context consisting of one or more groups with single attachment point
    env_bonds = set()
    for atom in mol.GetAtoms():
        if atom.GetSymbol() != "*":
            continue
        root = atom.GetIdx()
        current = radius
        found = Chem.FindAtomEnvironmentOfRadiusN(mol, current, root)
        # Fall back to smaller radii while nothing is returned.
        while not found and current > 0:
            current -= 1
            found = Chem.FindAtomEnvironmentOfRadiusN(mol, current, root)
        env_bonds.update(found)

    submol = Chem.PathToSubmol(mol, list(env_bonds))
    # remove Hs, otherwise terminal atoms will produce smiles with H ([CH2]C[*:1])
    for atom in submol.GetAtoms():
        atom.SetNumExplicitHs(0)
    return submol
def lads_score_v2(actives, decoys):
    """Score decoys by their feature overlap with the actives.

    Similar to DEKOIS (v2). Lower is better (less like actives), higher is
    worse (more like actives).

    Feature-Morgan count fingerprints of radius 3 (roughly FCFP_6) are built
    for every SMILES. Each feature seen in the actives gets the weight
    ``(fraction of actives containing it) * (heavy atoms in its first
    environment)``, where a radius-0 feature counts as one atom. A decoy's
    score is the mean weight over its own non-zero features; features that
    never occur in the actives weigh 0.

    :param actives: iterable of SMILES strings of active molecules
    :param decoys: iterable of SMILES strings of decoy molecules
    :return: list with one score per decoy, in input order; a decoy without
        any feature scores 0.0 (previously a ZeroDivisionError)
    """
    active_fps = []
    seen_bits = set()
    atoms_per_bit = defaultdict(int)
    for smi in actives:
        m = Chem.MolFromSmiles(smi)
        info = {}  # fresh per molecule so entries never refer to another mol
        active_fps.append(
            AllChem.GetMorganFingerprint(m, 3, useFeatures=True, bitInfo=info))
        for key, envs in info.items():
            if key in seen_bits:
                continue  # size is taken from the first molecule showing it
            seen_bits.add(key)
            root, rad = envs[0]
            if rad == 0:
                atoms_per_bit[key] = 1
            else:
                env = Chem.FindAtomEnvironmentOfRadiusN(m, rad, root)
                submol = Chem.PathToSubmol(m, env, atomMap={})
                atoms_per_bit[key] = submol.GetNumHeavyAtoms()

    # Roughly FCFP_6
    decoys_fps = [
        AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smi), 3,
                                     useFeatures=True)
        for smi in decoys
    ]

    # Number of actives containing each feature.
    master_active_fp_freq = defaultdict(int)
    for fp in active_fps:
        for k in fp.GetNonzeroElements():
            master_active_fp_freq[k] += 1

    # Reweight: normalise by the number of actives, then weight by the size
    # of the bit.
    n_actives = len(active_fps)
    weights = {
        k: hits / n_actives * atoms_per_bit[k]
        for k, hits in master_active_fp_freq.items()
    }

    decoys_lads_avoid_scores = []
    for decoy_fp in decoys_fps:
        bits = decoy_fp.GetNonzeroElements()
        if not bits:
            # Nothing in common with anything; avoid dividing by zero.
            decoys_lads_avoid_scores.append(0.0)
            continue
        total = sum([weights.get(k, 0) for k in bits])
        decoys_lads_avoid_scores.append(total / len(bits))
    return decoys_lads_avoid_scores
def GetMorganFingerprint(mol, atomId=-1, radius=2, fpType='bv', nBits=2048,
                         useFeatures=False):
    """
    Calculates the Morgan fingerprint with the counts of atomId removed.

    On the first call for a molecule, the fingerprint and a per-atom map of
    the bits each atom takes part in are computed and cached on the molecule
    as ``mol._fpInfo = (fingerprint, bitmap)``. Later calls reuse that cache
    as is, so ``radius``, ``fpType``, ``nBits`` and ``useFeatures`` only take
    effect on the first call for a given molecule object.

    Parameters:
      mol -- the molecule of interest
      radius -- the maximum radius
      fpType -- the type of Morgan fingerprint: 'count' or 'bv'
      atomId -- the atom to remove the counts for (if -1, no count is removed)
      nBits -- the size of the bit vector (only for fpType = 'bv')
      useFeatures -- if false: ConnectivityMorgan, if true: FeatureMorgan

    Returns:
      the cached fingerprint when atomId < 0; otherwise a new fingerprint in
      which, for 'bv', the atom's bit vector is XOR-ed out and, for 'count',
      every bit the atom takes part in is decremented by one.

    Raises:
      ValueError -- unknown fpType, atomId >= number of atoms, or a cached
                    ``_fpInfo`` that is not a 2-tuple
    """
    if fpType not in ['bv', 'count']:
        raise ValueError("Unknown Morgan fingerprint type")

    if not hasattr(mol, '_fpInfo'):
        info = {}
        # get the fingerprint
        if fpType == 'bv':
            molFp = rdMD.GetMorganFingerprintAsBitVect(
                mol, radius, nBits=nBits, useFeatures=useFeatures,
                bitInfo=info)
        else:
            molFp = rdMD.GetMorganFingerprint(
                mol, radius, useFeatures=useFeatures, bitInfo=info)

        # construct the bit map: one container per atom
        if fpType == 'bv':
            bitmap = [DataStructs.ExplicitBitVect(nBits)
                      for _ in range(mol.GetNumAtoms())]
        else:
            bitmap = [[] for _ in range(mol.GetNumAtoms())]

        # dict.items() works on Python 2 and 3 (iteritems() is Python 2 only)
        for bit, es in info.items():
            for at1, rad in es:
                if rad == 0:  # for radius 0 only the central atom is involved
                    if fpType == 'bv':
                        bitmap[at1][bit] = 1
                    else:
                        bitmap[at1].append(bit)
                else:  # for radii > 0 every atom of the environment is involved
                    env = Chem.FindAtomEnvironmentOfRadiusN(mol, rad, at1)
                    amap = {}
                    Chem.PathToSubmol(mol, env, atomMap=amap)
                    for at2 in amap.keys():
                        if fpType == 'bv':
                            bitmap[at2][bit] = 1
                        else:
                            bitmap[at2].append(bit)
        mol._fpInfo = (molFp, bitmap)

    if atomId < 0:
        return mol._fpInfo[0]

    # remove the bits of atomId
    if atomId >= mol.GetNumAtoms():
        raise ValueError("atom index greater than number of atoms")
    if len(mol._fpInfo) != 2:
        raise ValueError("_fpInfo not set")
    if fpType == 'bv':
        molFp = mol._fpInfo[0] ^ mol._fpInfo[1][atomId]  # xor
    else:  # count
        # Work on a copy so the cached fingerprint stays intact.
        molFp = copy.deepcopy(mol._fpInfo[0])
        # delete the bits with atomId
        for bit in mol._fpInfo[1][atomId]:
            molFp[bit] -= 1
    return molFp
def _featurize(self, mol: RDKitMol) -> np.ndarray:
    """Calculate circular fingerprint.

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
        RDKit Mol object

    Returns
    -------
    np.ndarray
        A numpy array of circular fingerprint. When ``self.sparse`` is set a
        dict is returned instead: fragment id -> count, or fragment id ->
        ``{'smiles': ..., 'count': ...}`` when ``self.smiles`` is also set.
    """
    from rdkit import Chem
    from rdkit.Chem import rdMolDescriptors

    if self.sparse:
        info: Dict = {}
        fp = rdMolDescriptors.GetMorganFingerprint(
            mol,
            self.radius,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features,
            bitInfo=info)
        fp = fp.GetNonzeroElements()  # convert to a dict

        # generate SMILES for fragments
        if self.smiles:
            fp_smiles = {}
            for fragment_id, count in fp.items():
                # Only the first environment recorded for a fragment is used.
                root, radius = info[fragment_id][0]
                env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, root)
                frag = Chem.PathToSubmol(mol, env)
                smiles = Chem.MolToSmiles(frag)
                fp_smiles[fragment_id] = {'smiles': smiles, 'count': count}
            fp = fp_smiles
    else:
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            mol,
            self.radius,
            nBits=self.size,
            useChirality=self.chiral,
            useBondTypes=self.bonds,
            useFeatures=self.features)
        # The builtin float replaces the `np.float` alias, which no longer
        # exists in current NumPy releases; the resulting dtype is the same.
        fp = np.asarray(fp, dtype=float)
    return fp
def get_substruct(mol, atom_idx_list, radius=3):
    """Collect the sub-molecules around several atoms.

    Radii ``radius - 1`` down to ``1`` are visited (largest first) and, for
    each radius, every atom of ``atom_idx_list``. Each environment with a
    non-empty SMILES is recorded; a later hit with the same SMILES replaces
    the earlier entry.

    :param mol: RDKit molecule
    :param atom_idx_list: indices of the central atoms
    :param radius: exclusive upper bound of the radii tried
    :return: ``(subsmiDic, orimol_atomI)`` where ``subsmiDic`` maps fragment
        SMILES to the list of keys of the atom map filled in by
        ``Chem.PathToSubmol``, and ``orimol_atomI`` is ``(mol, keys)`` for the
        last fragment recorded, or ``()`` when nothing was recorded
    """
    subsmiDic = {}
    orimol_atomI = ()
    for r in reversed(range(1, radius)):
        for atom_idx in atom_idx_list:
            # bonds within r bonds of atom_idx
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, r, atom_idx)
            amap = {}
            submol = Chem.PathToSubmol(mol, env, atomMap=amap)
            subsmi = Chem.MolToSmiles(submol)
            if subsmi == "":
                continue  # no submolecule for this atom/radius
            atom_keys = list(amap)
            subsmiDic[subsmi] = atom_keys
            orimol_atomI = (mol, atom_keys)
    return subsmiDic, orimol_atomI
def compute_all_ecfp(mol, indices=None, degree=2):
    """
    Describe the circular fragment around each atom.

    For each atom:
      Obtain molecular fragment for all atoms emanating outward to given
      degree (hydrogens included in the environment search).
      For each fragment, compute SMILES string (for now).
    Return a dictionary mapping atom index to the string
    "<atomic number>,<fragment SMILES>". Atoms not listed in ``indices``
    are skipped when ``indices`` is given.
    """
    ecfp_dict = {}
    for atom_idx in range(mol.GetNumAtoms()):
        wanted = indices is None or atom_idx in indices
        if not wanted:
            continue
        env = Chem.FindAtomEnvironmentOfRadiusN(
            mol, degree, atom_idx, useHs=True)
        fragment = Chem.PathToSubmol(mol, env)
        fragment_smiles = Chem.MolToSmiles(fragment)
        atomic_num = mol.GetAtoms()[atom_idx].GetAtomicNum()
        ecfp_dict[atom_idx] = "%s,%s" % (atomic_num, fragment_smiles)
    return ecfp_dict
def gradient2atom(smi, gradient, pos_cut=3, neg_cut=-3, nBits=2048):
    """
    map the gradient of Morgan fingerprint bit on the molecule

    Each on-bit of the radius-2 Morgan bit vector adds its gradient value to
    every atom of the FIRST environment recorded for that bit.

    Input:
        smi - the smiles of the molecule (a string)
        gradient - per-bit coefficients, indexed by bit id
        pos_cut - atoms whose integrated weight is bigger than this are
                  reported as positive
        neg_cut - atoms whose integrated weight is smaller than this (and
                  not bigger than pos_cut) are reported as negative
        nBits - length of the Morgan bit vector

    Output:
        (mol, highlit_pos, highlit_neg, atomsToUse): the parsed molecule, the
        two lists of atom ids, and the (n_atoms, 1) array of integrated
        weights
    """
    # generate mol
    mol = Chem.MolFromSmiles(smi)

    # get the bit info of the Morgan fingerprint
    bi = {}
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, radius=2, bitInfo=bi, nBits=nBits)

    # calculate the integrated weight
    atomsToUse = np.zeros((len(mol.GetAtoms()), 1))
    for bitId in list(fp.GetOnBits()):
        atomID, radius = bi[bitId][0]
        members = set()
        if radius > 0:
            for b in Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomID):
                bond = mol.GetBondWithIdx(b)
                members.add(bond.GetBeginAtomIdx())
                members.add(bond.GetEndAtomIdx())
        else:
            members.add(atomID)
        atomsToUse[list(members)] += gradient[bitId]

    # get the postively/negatively contributed atom ids
    highlit_pos = []
    highlit_neg = []
    for i, weight in enumerate(atomsToUse):
        if weight > pos_cut:
            highlit_pos.append(i)
        elif weight < neg_cut:
            highlit_neg.append(i)
    return mol, highlit_pos, highlit_neg, atomsToUse
def get_environment_smarts(carbon, mol):
    """Describe the radius-1 environment of an atom as SMARTS.

    The bonds within one bond of the atom (hydrogens included) are passed to
    ``bond_list_to_smarts``; ' | (Ring)' is appended when the atom is in a
    ring.

    carbon: rdkit.Chem.Atom
        The desired carbon atom
    mol: rdkit.Chem.Mol
        The molecule the atom is present in
    """
    env = Chem.FindAtomEnvironmentOfRadiusN(
        mol, 1, carbon.GetIdx(), useHs=True)
    bond_smarts = bond_list_to_smarts(mol, list(env))
    if carbon.IsInRing():
        return bond_smarts + ' | (Ring)'
    return bond_smarts
def count_substructures(radius, molecule):
    """Count the circular substructures of a molecule.

    Helper for the molecular signature of a metabolite: for every atom the
    environment of ``radius`` bonds is written as a canonical fragment SMILES
    and the occurrences of each SMILES are counted.

    Parameters
    ----------
    radius : int
        the radius is bond-distance that defines how many neighbor atoms
        should be considered in a reaction center.
    molecule : Molecule
        a molecule object created by RDKit

    Returns
    -------
    dict
        {fragment smiles: number of atoms whose environment gives that smiles}
    """
    n_atoms = len([atom for atom in molecule.GetAtoms()])
    smi_count = {}
    for center in range(n_atoms):
        env = Chem.FindAtomEnvironmentOfRadiusN(molecule, radius, center)
        atoms = set()
        for bidx in env:
            bond = molecule.GetBondWithIdx(bidx)
            atoms.add(bond.GetBeginAtomIdx())
            atoms.add(bond.GetEndAtomIdx())

        # only one atom is in this environment, such as O in H2O
        if not atoms:
            atoms = {center}

        smi = Chem.MolFragmentToSmiles(molecule,
                                       atomsToUse=list(atoms),
                                       bondsToUse=env,
                                       canonical=True)
        smi_count[smi] = smi_count.get(smi, 0) + 1
    return smi_count
def getSubstructSmi(mol, atomID, radius):
    """Return SMILES of the environment of ``radius`` bonds around ``atomID``.

    The fragment SMILES is rooted at ``atomID`` and written with all
    hydrogens and all bonds explicit. A second string is produced by
    ``writePropsToSmiles`` from that SMILES and the atom output order which
    RDKit stores on the molecule as ``_smilesAtomOutputOrder``.

    :param mol: RDKit molecule
    :param atomID: index of the central (root) atom
    :param radius: environment radius in bonds; 0 (or less) uses the central
        atom only
    :return: tuple ``(smi, smi2)``
    """
    import ast

    if radius > 0:
        env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atomID)
        touched = set()
        for b in env:
            bond = mol.GetBondWithIdx(b)
            touched.add(bond.GetBeginAtomIdx())
            touched.add(bond.GetEndAtomIdx())
        atomsToUse = list(touched)
    else:
        atomsToUse = [atomID]
        env = None

    smi = Chem.MolFragmentToSmiles(mol,
                                   atomsToUse,
                                   bondsToUse=env,
                                   allHsExplicit=True,
                                   allBondsExplicit=True,
                                   rootedAtAtom=atomID)
    # The property holds a list literal; literal_eval parses it without
    # executing arbitrary code the way eval() would.
    order = ast.literal_eval(mol.GetProp('_smilesAtomOutputOrder'))
    smi2 = writePropsToSmiles(mol, smi, order)
    return smi, smi2
def select_atoms(mol, selected_bits, vis_dir=None):
    """Return the atoms that take part in any of the selected fingerprint bits.

    ``get_fingerprint(mol)`` supplies ``{bit: [(center, radius), ...]}``. For
    each environment of a bit listed in ``selected_bits`` the atoms of the
    environment are collected. A radius-0 environment has no bonds, so it
    contributes its centre atom (previously it contributed nothing).

    :param mol: RDKit molecule
    :param selected_bits: container of bit ids of interest
    :param vis_dir: optional directory; when given, one PNG per environment
        (``bit<bit>_center<center>_radius<radius>.png``) is written with the
        environment atoms highlighted
    :return: set of atom indices
    """
    features_vec, info = get_fingerprint(mol)
    selected_atoms = set()
    for onbit, subgraphs in info.items():
        if onbit not in selected_bits:
            continue
        for center, radius in subgraphs:
            env = Chem.FindAtomEnvironmentOfRadiusN(mol,
                                                    radius=radius,
                                                    rootedAtAtom=center)
            amap = {}
            Chem.PathToSubmol(mol, env, atomMap=amap)
            # An empty atom map means the environment has no bonds; the
            # feature then consists of the centre atom alone.
            atoms = list(amap.keys()) or [center]
            selected_atoms.update(atoms)
            if vis_dir is not None:
                png_f = f'bit{onbit}_center{center}_radius{radius}.png'
                Draw.MolToFile(mol,
                               filename=os.path.join(vis_dir, png_f),
                               highlightAtoms=atoms)
    return selected_atoms
def eliminate(mol):
    """Keep only the surroundings of the attachment atoms.

    Attachment atoms are those whose ``molAtomRadius`` property equals '0'.
    Every atom that is an end point of a bond in the radius-3 environment of
    an attachment atom is kept; all other atoms are removed.

    :param mol: RDKit molecule whose atoms all carry ``molAtomRadius``
    :return: new molecule containing only the kept atoms
    """
    # tag atoms within 3 bonds of attachment
    keep = set()
    for atom in mol.GetAtoms():
        if atom.GetProp('molAtomRadius') != '0':
            continue
        for bond_idx in Chem.FindAtomEnvironmentOfRadiusN(
                mol, 3, atom.GetIdx()):
            bond = mol.GetBondWithIdx(bond_idx)
            keep.add(bond.GetBeginAtom().GetIdx())
            keep.add(bond.GetEndAtom().GetIdx())

    # remove everything else; highest index first so that the indices still
    # to be removed stay valid
    doomed = set(range(mol.GetNumAtoms())) - keep
    frag = Chem.EditableMol(mol)
    for atom_idx in sorted(doomed, reverse=True):
        frag.RemoveAtom(atom_idx)
    return frag.GetMol()
def explain_fingerprint_bit(mol, vis_dir=None):
    """Print (and optionally draw) the atoms behind every on-bit.

    ``get_fingerprint(mol)`` supplies ``{bit: [(center, radius), ...]}``.
    Every entry is printed; for environments with radius > 0 the atoms of the
    environment are printed as well and, when ``vis_dir`` is given, a PNG
    named ``bit<bit>_center<center>_radius<radius>.png`` is written there
    with those atoms highlighted. Radius-0 entries are only printed.

    :param mol: RDKit molecule
    :param vis_dir: optional output directory for the drawings
    """
    features_vec, info = get_fingerprint(mol)
    print('on bits:', info)
    for onbit, subgraphs in info.items():
        for center, radius in subgraphs:
            print(f'on bit = {onbit}, center = {center}, radius = {radius}')
            if not radius > 0:
                continue

            env = Chem.FindAtomEnvironmentOfRadiusN(mol,
                                                    radius=radius,
                                                    rootedAtAtom=center)
            amap = {}
            Chem.PathToSubmol(mol, env, atomMap=amap)
            atoms = list(amap.keys())
            print(atoms)

            if vis_dir is None:
                continue
            png_f = f'bit{onbit}_center{center}_radius{radius}.png'
            Draw.MolToFile(mol,
                           filename=os.path.join(vis_dir, png_f),
                           highlightAtoms=atoms)
def explain_circular_substructure(mol, center, radius, use_hs=False,
                                  canonical=True, isomeric=False,
                                  kekule=False, all_bonds_explicit=False):
    """Returns a SMILES description of the circular structure defined by a
    center and a topological radius.

    The center atom is always part of the fragment, even when the
    environment contains no bonds. The remaining flags are forwarded to
    ``Chem.MolFragmentToSmiles``; ``use_hs`` is forwarded to the environment
    search.
    """
    env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, center,
                                            useHs=use_hs)
    atoms = {center}
    for bond in (mol.GetBondWithIdx(bidx) for bidx in env):
        atoms.add(bond.GetBeginAtomIdx())
        atoms.add(bond.GetEndAtomIdx())

    smiles_options = dict(
        atomsToUse=list(atoms),
        bondsToUse=env,
        rootedAtAtom=center,
        isomericSmiles=isomeric,
        kekuleSmiles=kekule,
        canonical=canonical,
        allBondsExplicit=all_bonds_explicit,
    )
    return Chem.MolFragmentToSmiles(mol, **smiles_options)
def compute_all_ecfp(mol: RDKitMol,
                     indices: Optional[Set[int]] = None,
                     degree: int = 2) -> Dict[int, str]:
    """Obtain molecular fragment for all atoms emanating outward to given degree.

    For each fragment, compute the SMILES string (for now) and store it as
    "<atomic number>,<fragment SMILES>" under the atom index.

    Parameters
    ----------
    mol: rdkit Molecule
        Molecule to compute ecfp fragments on
    indices: Optional[Set[int]]
        Atom indices to process. Default is all indices. If specified, only
        fragments for the specified atoms are computed.
    degree: int
        Graph degree to use when computing ECFP fingerprints

    Returns
    -------
    dict
        Dictionary mapping atom index to "<atomic number>,<fragment SMILES>".
    """
    from rdkit import Chem

    ecfp_dict = {}
    for atom_idx in range(mol.GetNumAtoms()):
        wanted = indices is None or atom_idx in indices
        if not wanted:
            continue
        env = Chem.FindAtomEnvironmentOfRadiusN(
            mol, degree, atom_idx, useHs=True)
        fragment = Chem.PathToSubmol(mol, env)
        fragment_smiles = Chem.MolToSmiles(fragment)
        atomic_num = mol.GetAtoms()[atom_idx].GetAtomicNum()
        ecfp_dict[atom_idx] = "%s,%s" % (atomic_num, fragment_smiles)
    return ecfp_dict
def FingerprintToSmiles(m, s):
    """Pair every Morgan feature of ``m`` with a SMILES-like description.

    A Morgan fingerprint of radius ``s`` is computed and, for each feature,
    the FIRST recorded (atom, radius) environment is described: as the SMILES
    of the sub-molecule for radius > 0, otherwise as the element symbol of
    the atom (lower-cased when the atom is aromatic).

    :param m: RDKit molecule
    :param s: Morgan radius
    :return: list of ``(feature id, description)`` tuples
    """
    bi = {}
    AllChem.GetMorganFingerprint(m, s, bitInfo=bi)

    fp_sm = []
    for feature, envs in bi.items():
        atom_idx, rad = envs[0][0], envs[0][1]
        if rad > 0:
            env = Chem.FindAtomEnvironmentOfRadiusN(m, rad, atom_idx)
            submol = Chem.PathToSubmol(m, env, atomMap={})
            description = Chem.MolToSmiles(submol)
        else:
            atom = m.GetAtomWithIdx(atom_idx)
            description = atom.GetSymbol()
            if atom.GetIsAromatic():
                description = description.lower()
        fp_sm.append((feature, description))
    return fp_sm
def generate_substructures():
    """
    Uses the structure table from the NP SQLite database and generates
    substructures data which will be stored in a text file.

    Every row of class 'XXX' is read; the SMILES is taken from the fourth
    column (index 3). For each structure all atom environments (every radius
    0..n-1 around every atom) are turned into canonical SMILES, and the
    distinct SMILES over all structures are parsed back into molecules.
    Finally one line "<structure>\\t<substructure>" is written for every
    structure/substructure pair with a substructure match.

    Rows whose SMILES RDKit cannot parse are skipped. The database connection
    is closed even when reading fails.
    """
    # Connect
    conn = sqlite3.connect(
        "/path/to/SQLiteDatabase/Natural_Product_Structure.sqlite")
    try:
        c = conn.cursor()
        # Select all data for class XXX
        c.execute("SELECT * FROM structure WHERE Class = 'XXX';")
        rows = c.fetchall()
    finally:
        # Close the connection
        conn.close()

    str_mol_list = []
    sub_smiles_list = []
    seen_smiles = set()  # O(1) duplicate check; the list keeps the order
    for row in rows:
        m = Chem.MolFromSmiles(row[3])
        if m is None:
            continue  # unparsable SMILES
        str_mol_list.append(m)
        nr_of_atoms = m.GetNumAtoms()
        for i in range(nr_of_atoms):
            for j in range(nr_of_atoms):
                env = Chem.FindAtomEnvironmentOfRadiusN(m, i, j)
                submol = Chem.PathToSubmol(m, env, atomMap={})
                p = Chem.MolToSmiles(submol, canonical=True)
                # prevent overlapping substructures (per structure and for
                # all structures)
                if p != '' and p not in seen_smiles:
                    seen_smiles.add(p)
                    sub_smiles_list.append(p)

    # Parse the fragments once and keep their output SMILES next to them.
    sub_mol_list = []
    for sub_struc in sub_smiles_list:
        sm = Chem.MolFromSmiles(sub_struc)
        if sm is not None:
            sub_mol_list.append((sm, Chem.MolToSmiles(sm)))

    with open("/path/to/store/substructures_from_class_XXX.txt",
              'w') as db_file:
        db_file.write("Structure has Substructure" + '\n\n')
        for structure in str_mol_list:
            struc = Chem.MolToSmiles(structure)
            for substructure, substruc in sub_mol_list:
                if structure.HasSubstructMatch(substructure):
                    db_file.write(struc + '\t' + substruc + '\n')
def calculate_p_values(self,
                       mols,
                       substructure_dictionary,
                       bioactivities,
                       mols_ids,
                       threshold_frequency,
                       threshold_nb_substructures=5,
                       threshold_pvalue=0.05,
                       active_label=1,
                       inactive_label=0,
                       Bonferroni=True):
    """Find substructures of ``mols`` that are enriched in one activity class.

    For every molecule the Morgan features up to ``self.max_radius`` are
    visited. A feature is evaluated when it is a key of
    ``substructure_dictionary`` and neither the feature nor its root atom has
    been discarded before. If more than ``threshold_nb_substructures``
    reference compounds contain it, the active and inactive frequencies are
    compared with ``threshold_frequency`` and a p-value is accumulated from
    binomial terms. Each significant hit is appended to ``self.output``.

    Parameters
    ----------
    mols : list (other containers are converted, see the note in the body)
        Molecules to analyse.
    substructure_dictionary : dict
        Feature id -> ids of the reference compounds containing it.
    bioactivities : array-like
        Compared element-wise with the labels; presumably a numpy array
        aligned with ``mols_ids`` -- confirm with the caller.
    mols_ids : sequence
        Compound ids; ``mols_ids[m]`` is reported for the m-th molecule.
    threshold_frequency : float
        Minimum fraction of compounds with the substructure that must carry
        the label.
    threshold_nb_substructures : int
        The substructure must occur in more than this many compounds.
    threshold_pvalue : float
        p-values below this are reported.
    active_label, inactive_label
        Values identifying the two classes in ``bioactivities``.
    Bonferroni : bool
        When true, p-values are multiplied by the number of reported rows and
        rows with a corrected p-value of 0.05 or more are dropped.

    Side effects: sets ``self.Bonferroni``, extends ``self.output`` and
    prints two summary lines.
    """
    self.Bonferroni = Bonferroni
    # n
    nb_mols = float(
        len(
            set([
                item for sublist in substructure_dictionary.values()
                for item in sublist
            ])))
    # m
    nb_active_mols = float(np.sum(bioactivities == active_label))
    # (m - n)
    nb_inactive_mols = float(np.sum(bioactivities == inactive_label))
    nb_substructures_processed = 0
    if type(mols) != list:
        # NOTE(review): `ext` is not defined in this function; it must exist
        # at module level for this branch to work -- confirm.
        mols = [ext.mols[i] for i in np.arange(0, len(mols))]
    # substructure that have been identified in other molecules.
    subs_discarded = []
    for m, mol in enumerate(mols):
        root_atoms_discarded = []  # center (or root) atoms discarded..
        info = {}
        AllChem.GetMorganFingerprint(mol, self.max_radius, bitInfo=info)
        # sort info to make sure the substructures are read from the smallest
        # to the biggest. In case a substructure with low radius is removed,
        # we make sure all containing it will not be considered either in the
        # following steps.
        # get keys sorted; items() works on Python 2 and 3 (iteritems() is
        # Python 2 only).
        ff = sorted(info.items(), key=operator.itemgetter(1))
        substructure_ids = [ff[x][0] for x in range(0, len(info))]
        substructures_sub_dict = substructure_dictionary.keys()
        for substructure_id in substructure_ids:
            atom_radius = info[substructure_id]
            nb_substructures_processed += 1
            # check is the substructure is in the database (i.e. training
            # data)
            if (substructure_id in substructures_sub_dict
                    and substructure_id not in subs_discarded
                    and atom_radius[0][0] not in root_atoms_discarded):
                mols_with_current_substructure = substructure_dictionary[
                    substructure_id]
                nb_comp_with_substructure = float(
                    len(mols_with_current_substructure))
                active_comp = (bioactivities == active_label)
                comp_with_substructure = np.in1d(
                    np.asarray(mols_ids),
                    np.asarray(mols_with_current_substructure))
                # i.e. m_{S act}
                nb_comp_with_substructure_active = np.sum(
                    active_comp * comp_with_substructure)
                inactive_comp = (bioactivities == inactive_label)
                nb_comp_with_substructure_inactive = np.sum(
                    inactive_comp * comp_with_substructure)

                ## ACTIVE
                #########
                # filter threshold of compounds with the substructure
                filter_a = nb_comp_with_substructure > threshold_nb_substructures
                if filter_a:
                    # filter threshold
                    filter_b = (float(nb_comp_with_substructure_active) /
                                float(np.sum(comp_with_substructure))
                                ) > threshold_frequency
                    if filter_b:
                        p_value = 0
                        # Sum of binomial terms from the observed count up to
                        # (but excluding) the number of compounds with the
                        # substructure.
                        for count in np.arange(
                                nb_comp_with_substructure_active,
                                nb_comp_with_substructure):
                            numerator = Decimal(
                                sc.math.factorial(
                                    float(nb_comp_with_substructure)))
                            denominatorA = Decimal(
                                sc.math.factorial(float(count))) * Decimal(
                                    sc.math.factorial(
                                        float(nb_comp_with_substructure -
                                              count)))
                            denominatorB = (nb_active_mols /
                                            nb_mols)**float(count)
                            denominatorC = (1.0 - (nb_active_mols / nb_mols))**(
                                nb_comp_with_substructure - count)
                            out = float(
                                numerator /
                                denominatorA) * denominatorB * denominatorC
                            p_value += out
                        if p_value < threshold_pvalue:
                            # Drawing
                            env = Chem.FindAtomEnvironmentOfRadiusN(
                                mol, atom_radius[0][1], atom_radius[0][0])
                            amap = {}
                            submol = Chem.PathToSubmol(mol, env, atomMap=amap)
                            m1 = mol
                            m1.GetSubstructMatch(submol)
                            self.output = self.output.append(
                                {
                                    'Compound ID': mols_ids[m],
                                    'Compounds with substr.':
                                    nb_comp_with_substructure,
                                    'p_value': p_value,
                                    'Activity label': active_label,
                                    'Substructure in Molecule': m1,
                                    'Substructure': submol,
                                    'Comp. with substr. active':
                                    nb_comp_with_substructure_active,
                                    'Comp. with substr. inactive':
                                    nb_comp_with_substructure_inactive
                                },
                                ignore_index=True)
                            root_atoms_discarded.append(atom_radius[0][0])
                            subs_discarded.append(substructure_id)

                    ## INACTIVE
                    #########
                    # filter threshold
                    filter_b = (float(nb_comp_with_substructure_inactive) /
                                float(np.sum(comp_with_substructure))
                                ) > threshold_frequency
                    if filter_b:
                        p_value = 0
                        for count in np.arange(
                                nb_comp_with_substructure_inactive,
                                nb_comp_with_substructure):
                            numerator = Decimal(
                                sc.math.factorial(
                                    float(nb_comp_with_substructure)))
                            denominatorA = Decimal(
                                sc.math.factorial(float(count))) * Decimal(
                                    sc.math.factorial(
                                        float(nb_comp_with_substructure -
                                              count)))
                            denominatorB = (nb_inactive_mols /
                                            nb_mols)**float(count)
                            denominatorC = (
                                1.0 - (nb_inactive_mols / nb_mols))**(
                                    nb_comp_with_substructure - count)
                            out = float(
                                numerator /
                                denominatorA) * denominatorB * denominatorC
                            p_value += out
                        if p_value < threshold_pvalue:
                            # Drawing
                            env = Chem.FindAtomEnvironmentOfRadiusN(
                                mol, atom_radius[0][1], atom_radius[0][0])
                            amap = {}
                            submol = Chem.PathToSubmol(mol, env, atomMap=amap)
                            m1 = mol
                            m1.GetSubstructMatch(submol)
                            self.output = self.output.append(
                                {
                                    'Compound ID': mols_ids[m],
                                    'Compounds with substr.':
                                    nb_comp_with_substructure,
                                    'p_value': p_value,
                                    'Activity label': inactive_label,
                                    'Substructure in Molecule': m1,
                                    'Substructure': submol,
                                    'Comp. with substr. active':
                                    nb_comp_with_substructure_active,
                                    'Comp. with substr. inactive':
                                    nb_comp_with_substructure_inactive
                                },
                                ignore_index=True)
                            root_atoms_discarded.append(atom_radius[0][0])
                            subs_discarded.append(substructure_id)
                else:
                    # Too few compounds contain the substructure: discard it
                    # and its root atom.
                    subs_discarded.append(substructure_id)
                    root_atoms_discarded.append(atom_radius[0][0])

    if self.Bonferroni == True:
        self.output[
            'p_value'] = self.output['p_value'] * self.output.shape[0]
        # NOTE(review): fixed 0.05 here rather than threshold_pvalue.
        self.output = self.output[self.output.p_value < 0.05]

    # Single pre-formatted strings give the same text on Python 2 and 3.
    print('Number of substructures processed:  %s' %
          nb_substructures_processed)
    print('Significant substructures:  %s substructures' %
          self.output.shape[0])