def testGithub1973(self):
    """Check TPSA values with and without the S and P contributions.

    For every SMILES in the table, the default ``CalcTPSA`` result is
    compared with the first expected value, and the result obtained with
    ``force=True, includeSandP=True`` with the second one. Both
    comparisons use two decimal places.
    """
    cases = (
        # (SMILES, expected default TPSA, expected TPSA with S and P)
        ("c1ccccc1S", 0, 38.8),
        ("c1cscc1", 0, 28.24),
        ("CC(=S)C", 0, 32.09),
        ("CSC", 0, 25.30),
        ("CS(=O)C", 17.07, 36.28),
        ("CP(C)C", 0.0, 13.59),
        ("CP=O", 17.07, 51.21),
        ("CP(C)(C)=O", 17.07, 26.88),
        ("C[PH](C)=O", 17.07, 40.54),
    )
    for smi, expected_default, expected_with_s_p in cases:
        mol = Chem.MolFromSmiles(smi)
        default_tpsa = rdMD.CalcTPSA(mol)
        self.assertAlmostEqual(default_tpsa, expected_default, 2)
        extended_tpsa = rdMD.CalcTPSA(mol, force=True, includeSandP=True)
        self.assertAlmostEqual(extended_tpsa, expected_with_s_p, 2)
def get_filter_values(mol):
    """
    Compute, for one molecule, every value the filters look at.

    Returns a dictionary keyed by property name. Besides the computed
    descriptors it carries two bookkeeping entries: ``is_good`` (starts
    as True) and ``rejections`` (starts as an empty list).
    """
    assert isinstance(mol, Chem.Mol)

    values = {
        "MW": desc.CalcExactMolWt(mol),
        "logP": crip.MolLogP(mol),
        "HBA": lip.NumHAcceptors(mol),
        "HBD": lip.NumHDonors(mol),
        "tPSA": desc.CalcTPSA(mol),
        "rot_bonds": lip.NumRotatableBonds(mol),
    }
    # every bond that is not rotatable is counted as rigid
    values["rigid_bonds"] = mol.GetNumBonds() - values["rot_bonds"]
    values["num_rings"] = lip.RingCount(mol)
    values["num_hetero_atoms"] = lip.NumHeteroatoms(mol)
    values["charge"] = rdmolops.GetFormalCharge(mol)

    carbons, charges, largest_ring = get_atom_props(mol)
    values["num_carbons"] = carbons
    values["num_charges"] = charges
    values["max_ring_size"] = largest_ring

    try:
        ratio = float(values["num_hetero_atoms"]) / float(values["num_carbons"])
    except ZeroDivisionError:
        # no carbons at all: use a very large placeholder ratio
        ratio = 100000000
    values["hc_ratio"] = ratio

    # number of BRICS bonds, used as a complexity indicator
    values["fc"] = len(list(Brics.FindBRICSBonds(mol)))
    values["is_good"] = True

    # unique element symbols present in the molecule
    symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
    values["atoms"] = list(set(symbols))

    chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
    values["num_chiral_centers"] = len(chiral_centers)
    # reasons for rejection get appended here later
    values["rejections"] = []
    return values
def calc(smi, name):
    """Compute a tuple of descriptors for one molecule given as SMILES.

    Returns ``(name, hba, hbd, hba + hbd, nrings, rtb, psa, logp, mr, mw,
    csp3, fmf, qed, hac, nrings_fused)`` with the float values rounded, or
    ``None`` when the SMILES cannot be parsed or any descriptor
    calculation fails. Both failure cases write one line to stderr.
    """
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # newline added so consecutive messages do not run together
        sys.stderr.write('smiles %s cannot be parsed (%s)\n' % (smi, name))
        return None

    try:
        hba = rdMolDescriptors.CalcNumHBA(m)
        hbd = rdMolDescriptors.CalcNumHBD(m)
        nrings = rdMolDescriptors.CalcNumRings(m)
        rtb = rdMolDescriptors.CalcNumRotatableBonds(m)
        psa = rdMolDescriptors.CalcTPSA(m)
        logp, mr = rdMolDescriptors.CalcCrippenDescriptors(m)
        mw = rdMolDescriptors._CalcMolWt(m)
        csp3 = rdMolDescriptors.CalcFractionCSP3(m)
        hac = m.GetNumHeavyAtoms()
        # fraction of heavy atoms that belong to the scaffold; 0 for an
        # empty molecule to avoid dividing by zero
        fmf = GetScaffoldForMol(m).GetNumHeavyAtoms() / hac if hac else 0
        qed = QED.qed(m)
        nrings_fused = fused_ring_count(m)
        return (name, hba, hbd, hba + hbd, nrings, rtb, round(psa, 2),
                round(logp, 2), round(mr, 2), round(mw, 2), round(csp3, 3),
                round(fmf, 3), round(qed, 3), hac, nrings_fused)
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate while a bad molecule is skipped.
        sys.stderr.write(
            f'molecule {name} was omitted due to an error in calculation of some descriptors\n'
        )
        return None
def calc(smi, name):
    """Compute a tuple of descriptors for one molecule given as SMILES.

    Returns ``(name, hba, hbd, hba + hbd, nrings, rtb, psa, logp, mr, mw,
    csp3, fmf, qed, hac, nrings_fused, n_unique_hba_hbd_atoms,
    max_ring_size, n_chiral_centers, fcsp3_bm)`` with the float values
    rounded, or ``None`` when the SMILES cannot be parsed or any
    descriptor calculation fails. Both failure cases write one line to
    stderr.
    """
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # newline added so consecutive messages do not run together
        sys.stderr.write('smiles %s cannot be parsed (%s)\n' % (smi, name))
        return None

    try:
        hba = rdMolDescriptors.CalcNumHBA(m)
        hbd = rdMolDescriptors.CalcNumHBD(m)
        nrings = rdMolDescriptors.CalcNumRings(m)
        rtb = rdMolDescriptors.CalcNumRotatableBonds(m)
        psa = rdMolDescriptors.CalcTPSA(m)
        logp, mr = rdMolDescriptors.CalcCrippenDescriptors(m)
        mw = rdMolDescriptors._CalcMolWt(m)
        csp3 = rdMolDescriptors.CalcFractionCSP3(m)
        hac = m.GetNumHeavyAtoms()
        # the scaffold feeds two descriptors, so it is built only once
        scaffold = GetScaffoldForMol(m)
        # 0 for an empty molecule to avoid dividing by zero
        fmf = scaffold.GetNumHeavyAtoms() / hac if hac else 0
        qed = QED.qed(m)
        nrings_fused = fused_ring_count(m)
        n_unique_hba_hbd_atoms = count_hbd_hba_atoms(m)
        # size of the largest ring, 0 when the molecule has no rings
        max_ring_size = len(max(m.GetRingInfo().AtomRings(), key=len, default=()))
        n_chiral_centers = len(FindMolChiralCenters(m, includeUnassigned=True))
        fcsp3_bm = rdMolDescriptors.CalcFractionCSP3(scaffold)
        return (name, hba, hbd, hba + hbd, nrings, rtb, round(psa, 2),
                round(logp, 2), round(mr, 2), round(mw, 2), round(csp3, 3),
                round(fmf, 3), round(qed, 3), hac, nrings_fused,
                n_unique_hba_hbd_atoms, max_ring_size, n_chiral_centers,
                round(fcsp3_bm, 3))
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate while a bad molecule is skipped.
        sys.stderr.write(f'molecule {name} was omitted due to an error in calculation of some descriptors\n')
        return None
def main():
    """Add Lipinski descriptors and TPSA to the submissions table.

    Reads ``submissions_final_result.csv``, computes the descriptors for
    the SMILES in each row, merges them back onto the input table by
    ``submission_id`` and ``smiles_string`` and writes the result to
    ``lipinski_psa_result.csv``.
    """
    submissions = pd.read_csv("submissions_final_result.csv")

    records = []
    for _, entry in submissions.iterrows():
        smiles = entry['smiles_string']
        mol = Chem.MolFromSmiles(smiles)
        records.append({
            'submission_id': entry['submission_id'],
            'smiles_string': smiles,
            # Lipinski's rule
            'h_bond_donor': rd.CalcNumLipinskiHBD(mol),
            'h_bond_acceptor': rd.CalcNumLipinskiHBA(mol),
            'moluclar_mass': rd._CalcMolWt(mol),
            # first Crippen descriptor is the partition coefficient
            'log_p': rd.CalcCrippenDescriptors(mol)[0],
            # topological polar surface area
            'topological_polar_surface_area': rd.CalcTPSA(mol),
        })

    descriptors = pd.DataFrame(records)
    merged = pd.merge(submissions, descriptors,
                      on=['submission_id', 'smiles_string'])
    merged.to_csv("lipinski_psa_result.csv", index=False, encoding='utf-8')
def get_mol_props(mol: AllChem.Mol):
    """
    Return ``[logP, TPSA, Hall-Kier alpha, MR, Labute ASA]`` for a molecule.
    """
    return [
        Descriptors.MolLogP(mol),
        rdMolDescriptors.CalcTPSA(mol),
        rdMolDescriptors.CalcHallKierAlpha(mol),
        Descriptors.MolMR(mol),
        rdMolDescriptors.CalcLabuteASA(mol),
    ]
def calculate_properties(self, mol):
    """Calculate the basic properties of a molecule.

    returns : list of int or float, in this order: number of atoms,
    first Crippen descriptor, TPSA, number of rotatable bonds and
    fraction of sp3 carbons."""
    return [
        mol.GetNumAtoms(),
        desc.CalcCrippenDescriptors(mol)[0],
        desc.CalcTPSA(mol),
        desc.CalcNumRotatableBonds(mol),
        desc.CalcFractionCSP3(mol),
    ]
def calculate(self):
    """Return the TPSA of ``self.mol``.

    When ``self._no_only`` is set, the plain ``CalcTPSA`` value is
    returned. Otherwise the per-atom phosphorus (atomic number 15) and
    sulfur (atomic number 16) contributions are added on top of it.
    """
    total = rdMolDescriptors.CalcTPSA(self.mol)
    if not self._no_only:
        for atom in self.mol.GetAtoms():
            element = atom.GetAtomicNum()
            if element == 15:
                total += self._get_phosphorus_contrib(atom)
            elif element == 16:
                total += self._get_sulfur_contrib(atom)
    return total
def main(in_file, output):
    """Tabulate basic descriptors of the molecules in ``in_file``.

    The molecules are loaded with ``rdkit_open``; one row per molecule
    (keyed by the first word of its ``_Name`` property) is written to
    ``<output>.csv`` and ``<output>.xlsx``. Molecules sharing a name
    overwrite each other, so the last one wins.
    """
    molecules = rdkit_open([in_file])
    print('\n # Number of input molecule: {0}'.format(len(molecules)))

    compounds = {}
    for mol in molecules:
        name = mol.GetProp('_Name').split()[0]
        # Crippen descriptors give logP and molar refractivity together,
        # so they are computed once instead of once per value.
        log_p, mol_ref = rd.CalcCrippenDescriptors(mol)
        compounds[name] = {
            'Formula': rd.CalcMolFormula(mol),
            'MW': rd._CalcMolWt(mol),                                 # Molecular Weight
            'logP': log_p,                                            # Partition coefficient
            'HDon': rd.CalcNumLipinskiHBD(mol),                       # Lipinski Hbond donor
            'HAcc': rd.CalcNumLipinskiHBA(mol),                       # Lipinski Hbond acceptor
            'TPSA': rd.CalcTPSA(mol),                                 # Topological polar surface area
            'Rotat': rd.CalcNumRotatableBonds(mol, strict=True),      # Rotatable bond
            'MolRef': mol_ref,                                        # Molar refractivity
            'AliRing': rd.CalcNumAliphaticRings(mol),                 # Aliphatic ring number
            'AroRing': rd.CalcNumAromaticRings(mol),                  # Aromatic ring number
            'SMILES': Chem.MolToSmiles(mol, isomericSmiles=True,
                                       allHsExplicit=False),
        }
        # Stereo-centre counts (CalcNumAtomStereoCenters /
        # CalcNumUnspecifiedAtomStereoCenters) are currently not exported.

    df = pd.DataFrame.from_dict(compounds, orient='index')
    df.index.name = 'Name'

    # Columns of data to print out, in output order
    columns = [
        'Formula', 'MW', 'logP', 'HDon', 'HAcc', 'TPSA',
        'Rotat', 'MolRef', 'AliRing', 'AroRing', 'SMILES',
    ]
    # reindex (rather than df[columns]) so that an input without any
    # molecule yields header-only files instead of a KeyError
    table = df.reindex(columns=columns)

    # Output to CSV
    table.to_csv(output + '.csv', sep=',', na_rep='NA', encoding='utf-8',
                 float_format='%.5f', header=True)

    # Output to Excel
    table.to_excel(output + '.xlsx', header=True, na_rep='NA')
def calc(smi, name):
    """Compute a tuple of descriptors for one molecule given as SMILES.

    Returns ``(name, hba, hbd, hba + hbd, nrings, rtb, psa, logp, mr, mw,
    csp3, fmf)`` with the float values rounded, or ``None`` (after
    writing a message to stderr) when the SMILES cannot be parsed.
    """
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # newline added so consecutive messages do not run together
        sys.stderr.write('smiles %s cannot be parsed (%s)\n' % (smi, name))
        return None

    hba = rdMolDescriptors.CalcNumHBA(m)
    hbd = rdMolDescriptors.CalcNumHBD(m)
    nrings = rdMolDescriptors.CalcNumRings(m)
    rtb = rdMolDescriptors.CalcNumRotatableBonds(m)
    psa = rdMolDescriptors.CalcTPSA(m)
    logp, mr = rdMolDescriptors.CalcCrippenDescriptors(m)
    mw = rdMolDescriptors._CalcMolWt(m)
    csp3 = rdMolDescriptors.CalcFractionCSP3(m)
    hac = m.GetNumAtoms(onlyHeavy=True)
    # fraction of heavy atoms that belong to the scaffold; a molecule
    # without heavy atoms gets 0 instead of raising ZeroDivisionError
    if hac == 0:
        fmf = 0
    else:
        fmf = GetScaffoldForMol(m).GetNumAtoms(onlyHeavy=True) / hac
    return (name, hba, hbd, hba + hbd, nrings, rtb, round(psa, 2),
            round(logp, 2), round(mr, 2), round(mw, 2), round(csp3, 3),
            round(fmf, 3))
def calculate_properties(self, smiles=None, mol=None, props=None):
    """Calculate the requested basic properties and store them in ``self.data``.

    smiles : SMILES string, parsed only when ``mol`` is not given
    mol    : molecule object; takes precedence over ``smiles``
    props  : collection of property keys to calculate (``'py_formula'``,
             ``'py_em'``, ``'py_n_Cl_Br'``, ``'py_na'``, ``'py_mw'``,
             ``'py_fsp3'``, ``'py_rb'``, ``'py_tpsa'``, ``'py_clogp'``,
             ``'py_nar'``, ``'py_nhba'``, ``'py_nhbd'``)

    returns : error (bool) -- True when nothing was calculated (no
    properties requested, no input given, or the SMILES could not be
    parsed), False otherwise."""
    # None replaces the former mutable ``[]`` default; both mean "nothing"
    if props is None or len(props) == 0:
        return True
    if mol is None:
        if smiles is None:
            # neither a molecule nor a SMILES: report an error instead of
            # handing None to the parser
            return True
        mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return True

    if 'py_formula' in props:
        self.data['py_formula'] = desc.CalcMolFormula(mol)
    if 'py_em' in props:
        self.data['py_em'] = round(desc.CalcExactMolWt(mol), 5)
    if 'py_n_Cl_Br' in props:
        # combined number of chlorine and bromine atoms
        self.data['py_n_Cl_Br'] = sum(
            1 for atom in mol.GetAtoms() if atom.GetSymbol() in ('Cl', 'Br')
        )
    if 'py_na' in props:
        self.data['py_na'] = mol.GetNumAtoms()
    if 'py_mw' in props:
        self.data['py_mw'] = desc._CalcMolWt(mol)
    if 'py_fsp3' in props:
        self.data['py_fsp3'] = desc.CalcFractionCSP3(mol)
    if 'py_rb' in props:
        self.data['py_rb'] = desc.CalcNumRotatableBonds(mol)
    if 'py_tpsa' in props:
        self.data['py_tpsa'] = desc.CalcTPSA(mol)
    if 'py_clogp' in props:
        self.data['py_clogp'] = desc.CalcCrippenDescriptors(mol)[0]
    if 'py_nar' in props:
        self.data['py_nar'] = desc.CalcNumAromaticRings(mol)
    if 'py_nhba' in props:
        self.data['py_nhba'] = desc.CalcNumHBA(mol)
    if 'py_nhbd' in props:
        self.data['py_nhbd'] = desc.CalcNumHBD(mol)
    return False
def feature_fp(smiles):
    """Return the MQN values of a SMILES followed by extra descriptors.

    The result is a numpy array: the list returned by ``MQNs_`` with the
    descriptors below appended in order.
    """
    mol = Chem.MolFromSmiles(smiles)
    mqns = rdMolDescriptors.MQNs_(mol)
    extras = [
        rdMolDescriptors.CalcNumRotatableBonds(mol),
        rdMolDescriptors.CalcExactMolWt(mol),
        # NOTE(review): the rotatable-bond count appears a second time
        # here; kept as is so the vector length does not change.
        rdMolDescriptors.CalcNumRotatableBonds(mol),
        rdMolDescriptors.CalcFractionCSP3(mol),
        rdMolDescriptors.CalcNumAliphaticCarbocycles(mol),
        rdMolDescriptors.CalcNumAliphaticHeterocycles(mol),
        rdMolDescriptors.CalcNumAliphaticRings(mol),
        rdMolDescriptors.CalcNumAromaticCarbocycles(mol),
        rdMolDescriptors.CalcNumAromaticHeterocycles(mol),
        rdMolDescriptors.CalcNumAromaticRings(mol),
        rdMolDescriptors.CalcNumBridgeheadAtoms(mol),
        rdMolDescriptors.CalcNumRings(mol),
        rdMolDescriptors.CalcNumAmideBonds(mol),
        rdMolDescriptors.CalcNumHeterocycles(mol),
        rdMolDescriptors.CalcNumSpiroAtoms(mol),
        rdMolDescriptors.CalcTPSA(mol),
    ]
    mqns.extend(extras)
    return np.array(mqns)
def generateCompoundPropertiesTask(structure, debug=False):
    """Compute and save the compound properties of ``structure``'s molecule.

    The existing ``compoundProperty`` record of the molecule is reused
    when there is one; otherwise a new ``CompoundProperties`` is created.
    Descriptors are calculated on the full molfile; the free-base weight
    is calculated on the salt-stripped molecule when it still has atoms.
    With ``debug`` set, a pydevd trace session is opened and an
    ``IntegrityError`` on save is printed instead of re-raised.
    """
    if debug:
        pydevd.settrace('localhost', port=6901, stdoutToServer=True, stderrToServer=True)

    molecule = structure.molecule
    if molecule.compoundProperty:
        prop = molecule.compoundProperty
    else:
        prop = CompoundProperties(molecule=molecule)

    remover = SaltRemover()
    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = remover.StripMol(mol)

    prop.hbd = Descriptors.CalcNumHBD(mol)
    prop.hba = Descriptors.CalcNumHBA(mol)
    prop.rtb = Descriptors.CalcNumRotatableBonds(mol)
    prop.alogp = Crippen.MolLogP(mol)
    prop.psa = Descriptors.CalcTPSA(mol)
    prop.full_mwt = NewDescriptors.MolWt(mol)

    # only set the free-base weight when stripping left something behind
    if base.GetNumAtoms():
        prop.mw_freebase = NewDescriptors.MolWt(base)

    prop.full_molformula = Descriptors.CalcMolFormula(mol)

    try:
        prop.save()
    except IntegrityError as e:
        if not debug:
            raise e
        print(e.message)
def computeFeatures(mol):
    """Build the flat feature list for one molecule.

    The list holds, in order: ring count, the N, O, C, B, P, S, F and I
    atom counts, the double-bond count, Labute ASA, exact molecular
    weight and TPSA, followed by every value of ``recurseMolHCount(mol)``
    in its iteration order.

    NOTE(review): the rotatable-bond count used to be computed here but
    was never added to the output; the dead computation was removed and
    the output layout is unchanged.
    """
    output = [
        rdMolDescriptors.CalcNumRings(mol),
        countNitrogens(mol),
        countOxygens(mol),
        countCarbons(mol),
        countBorons(mol),
        countPhos(mol),
        countSulfurs(mol),
        countFluorine(mol),
        countIodine(mol),
        countDoubleBonds(mol),
        rdMolDescriptors.CalcLabuteASA(mol),
        rdMolDescriptors.CalcExactMolWt(mol),
        rdMolDescriptors.CalcTPSA(mol),
    ]
    dist_hs = recurseMolHCount(mol)
    output.extend(dist_hs[d] for d in dist_hs)
    return output
def get_fingerprint(SMILES=None, E_BIND=None):
    """
    PRE: Takes in a MOLECULE as a SMILES
    POST: Returns its fingerprint as a list of values ordered by sorted
          feature name. When SMILES is None, returns the sorted list of
          feature names instead, so both calls line up index by index.

    E_BIND is accepted but not used by this function.
    Raises ValueError when no 3D structure can be built for the molecule.
    """

    def get_atom_types(mol):
        """
        PRE: Takes in the mol
        POST: Returns a dictionary mapping each atom symbol to its count
        """
        atom_types = {}
        for atom in mol.GetAtoms():
            symbol = atom.GetSymbol()
            atom_types[symbol] = atom_types.get(symbol, 0) + 1
        return atom_types

    def AreRingFused(mol):
        """
        PRE : Takes in a mol rdkit
        POST : Returns the maximum number of rings any single atom belongs
               to, or 0 when the molecule has no rings
        """
        ring_dic = {}
        for ring in Chem.GetSymmSSSR(mol):
            for atom in list(ring):
                ring_dic[atom] = ring_dic.get(atom, 0) + 1
        # An emptiness test instead of ``values() == []``: on Python 3 a
        # dict view never equals a list, so max() raised on ring-free
        # molecules.
        if not ring_dic:
            return 0
        return max(ring_dic.values())

    def isRingAromatic(mol, ring):
        """
        PRE: Takes in a mol and a ring given as a tuple of atom id
        POST: Returns TRUE if all the atoms inside the ring are aromatic and FALSE otherwise
        """
        return all(mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring)

    def getVolume(mol):
        """
        PRE: Takes in a mol with HYDROGENS ADDED
        POST: Returns (volume, aromatic ring count, non aromatic ring count,
              largest aromatic ring size, largest non aromatic ring size).
              The volume comes from ComputeMolVolume on an embedded and
              MMFF-optimized conformer; note that the conformer is added to
              the mol that is passed in.
        """
        ra = 0
        largest_ra = 0
        rna = 0
        largest_rna = 0
        for ring in Chem.GetSymmSSSR(mol):
            ring_size = len(ring)
            if isRingAromatic(mol, tuple(ring)):
                ra += 1
                largest_ra = max(largest_ra, ring_size)
            else:
                rna += 1
                largest_rna = max(largest_rna, ring_size)
        try:
            AllChem.EmbedMolecule(mol)
            AllChem.MMFFOptimizeMolecule(mol)
            volume = AllChem.ComputeMolVolume(mol)
        except Exception:
            # narrowed from a bare except so Ctrl-C is not turned into a
            # ValueError
            raise ValueError("Can't build the molecule")
        return volume, ra, rna, largest_ra, largest_rna

    features = [
        'atomNbr',
        'Volume',
        'NAtom',
        'OAtom',
        'SAtom',
        'PAtom',
        'ClAtom',
        'BrAtom',
        'FAtom',
        'IAtom',
        'AromaticRingNumber',
        'LargestAromaticRingAtomNbr',
        'NonAromaticRingNumber',
        'LargestNonAromaticRingAtomNbr',
        'MaxNbrFusedRings',
        'SurfaceArea',
        'Charge',
        'NitroNbr',
        'AlcoholNbr',
        'KetoneNbr',
        'NitrileNbr',
        'ThiolNbr',
        'Phenol_likeNbr',
        'EsterNbr',
        'SulfideNbr',
        'CarboxilicAcidNbr',
        'EtherNbr',
        'AmideNbr',
        'AnilineNbr',
        'PrimaryAmineNbr',
        'SecondaryAmineNbr',
        'RotableBondNum',
        'HBondDonor',
        'HBondAcceptor',
        'MolLogP',
        'MolMR',
    ]
    for order in range(1, 7):
        features.append('Chi{}v'.format(order))
        features.append('Chi{}n'.format(order))
        if order <= 3:
            features.append('Kappa{}'.format(order))
    feature_dic = dict.fromkeys(features)

    if SMILES is None:
        return sorted(feature_dic.keys())

    mol = Chem.MolFromSmiles(SMILES)
    mol = Chem.AddHs(mol)

    feature_dic['RotableBondNum'] = descriptors.CalcNumRotatableBonds(mol)
    for order in range(1, 7):
        feature_dic['Chi{}v'.format(order)] = descriptors.CalcChiNv(mol, order)
        feature_dic['Chi{}n'.format(order)] = descriptors.CalcChiNn(mol, order)
    feature_dic['Kappa1'] = descriptors.CalcKappa1(mol)
    feature_dic['Kappa2'] = descriptors.CalcKappa2(mol)
    feature_dic['Kappa3'] = descriptors.CalcKappa3(mol)
    feature_dic['HBondAcceptor'] = descriptors.CalcNumHBA(mol)
    feature_dic['HBondDonor'] = descriptors.CalcNumHBD(mol)
    CrippenDescriptors = descriptors.CalcCrippenDescriptors(mol)
    feature_dic['MolLogP'] = CrippenDescriptors[0]
    feature_dic['MolMR'] = CrippenDescriptors[1]

    atom_types = get_atom_types(mol)
    for feat, symbol in zip(['NAtom', 'OAtom', 'SAtom', 'PAtom', 'ClAtom', 'BrAtom', 'FAtom', 'IAtom'],
                            ['N', 'O', 'S', 'P', 'Cl', 'Br', 'F', 'I']):
        feature_dic[feat] = atom_types.get(symbol, 0)
    feature_dic['atomNbr'] = mol.GetNumHeavyAtoms()

    (feature_dic['Volume'],
     feature_dic['AromaticRingNumber'],
     feature_dic['NonAromaticRingNumber'],
     feature_dic['LargestAromaticRingAtomNbr'],
     feature_dic['LargestNonAromaticRingAtomNbr']) = getVolume(mol)
    feature_dic['MaxNbrFusedRings'] = AreRingFused(mol)
    # the 'SurfaceArea' entry holds the topological polar surface area
    feature_dic['SurfaceArea'] = descriptors.CalcTPSA(mol)
    feature_dic['Charge'] = Chem.GetFormalCharge(mol)

    # SMARTS pattern -> feature counting its substructure matches
    funct_dic = {
        '[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]': 'NitroNbr',
        '[#6][OX2H]': 'AlcoholNbr',
        '[NX1]#[CX2]': 'NitrileNbr',
        '[#6][CX3](=O)[#6]': 'KetoneNbr',
        '[#16X2H]': 'ThiolNbr',
        "[OX2H][cX3][c]": 'Phenol_likeNbr',
        '[#6][CX3](=O)[OX2H0][#6]': 'EsterNbr',
        '[#16X2H0]': 'SulfideNbr',
        '[CX3](=O)[OX2H1]': 'CarboxilicAcidNbr',
        '[OD2]([#6])[#6]': 'EtherNbr',
        '[#7X3][#6X3](=[OX1])[#6]': 'AmideNbr',
        '[NX3][cc]': 'AnilineNbr',
        '[NX3H2;!$(NC=O)]': 'PrimaryAmineNbr',
        '[NX3H1;!$(NC=O)]': 'SecondaryAmineNbr'}
    for funct, feature_name in funct_dic.items():
        patt = Chem.MolFromSmarts(funct)
        feature_dic[feature_name] = len(mol.GetSubstructMatches(patt))

    return [feature_dic[key] for key in sorted(feature_dic.keys())]
Algorithm in: P. Ertl, B. Rohde, P. Selzer Fast Calculation of Molecular Polar Surface Area as a Sum of Fragment-based Contributions and Its Application to the Prediction of Drug Transport Properties, J.Med.Chem. 43, 3714-3717, 2000 Implementation based on the Daylight contrib program tpsa.c """ contribs = _pyTPSAContribs(mol,verbose=verbose) res = 0.0 for contrib in contribs: res += contrib return res _pyTPSA.version="1.0.1" TPSA=lambda *x,**y:rdMolDescriptors.CalcTPSA(*x,**y) TPSA.version=rdMolDescriptors._CalcTPSA_version if __name__ == '__main__': smis = ['C','CC','CCC','CCCC','CO','CCO','COC'] smis = ['C(=O)O','c1ccccc1'] for smi in smis: m = Chem.MolFromSmiles(smi) #print(smi, LabuteASA(m)) print('-----------\n',smi) #print('M:',['% 4.2f'%x for x in SMR_VSA_(m)]) #print('L:',['% 4.2f'%x for x in SlogP_VSA_(m)]) print('P:',['% 4.2f'%x for x in PEOE_VSA_(m)]) print('P:',['% 4.2f'%x for x in PEOE_VSA_(m)]) print()
# "anhydride": Chem.MolFromSmarts('[#6]-[#6](=O)-[#8]-[#6](-[#6])=O'), # CC(=O)OC(=O)C # "peroxide": Chem.MolFromSmarts('[#8]-[#8]'), # R-O-O-R' # "ab_unsaturated_ketone": Chem.MolFromSmarts('[#6]=[#6]-[#6]=O'), # R=CC=O #} DESCRIPTORS = { # classical molecular descriptors "num_heavy_atoms": lambda x: x.GetNumAtoms(), "molecular_weight": lambda x: round(Desc.ExactMolWt(x), 4), "num_rings": lambda x: rdMolDesc.CalcNumRings(x), "num_rings_arom": lambda x: rdMolDesc.CalcNumAromaticRings(x), "num_rings_ali": lambda x: rdMolDesc.CalcNumAliphaticRings(x), "num_hbd": lambda x: rdMolDesc.CalcNumLipinskiHBD(x), "num_hba": lambda x: rdMolDesc.CalcNumLipinskiHBA(x), "slogp": lambda x: round(Crippen.MolLogP(x), 4), "tpsa": lambda x: round(rdMolDesc.CalcTPSA(x), 4), "num_rotatable_bond": lambda x: rdMolDesc.CalcNumRotatableBonds(x), "num_atoms_oxygen": lambda x: len( [a for a in x.GetAtoms() if a.GetAtomicNum() == 8] ), "num_atoms_nitrogen": lambda x: len( [a for a in x.GetAtoms() if a.GetAtomicNum() == 7] ), "num_atoms_halogen": Fragments.fr_halogen, "num_atoms_bridgehead": rdMolDesc.CalcNumBridgeheadAtoms, # custom molecular descriptors #"ring_size_min": get_min_ring_size, #"ring_size_max": get_max_ring_size, "frac_sp3": lambda x: rdMolDesc.CalcFractionCSP3(x), # HTS filters 1/2 - present in the RDKit Fragments #"num_aldehyde": Fragments.fr_aldehyde,
'HBD', 'jIndex' ] for name in prop_names: d[f'{name}'] = [] for i, s in enumerate(smiles): if (i % 10000 == 0): print(i) m = Chem.MolFromSmiles(s) if (m == None or 'i' in s or '.' in s): DUD = DUD.drop(i) print(s, i) else: d['QED'].append(QED.default(m)) d['logP'].append(Crippen.MolLogP(m)) d['molWt'].append(Descriptors.MolWt(m)) d['maxCharge'].append(Descriptors.MaxPartialCharge(m)) d['minCharge'].append(Descriptors.MinPartialCharge(m)) d['valence'].append(Descriptors.NumValenceElectrons(m)) d['TPSA'].append(rdMolDescriptors.CalcTPSA(m)) d['HBA'].append(rdMolDescriptors.CalcNumHBA(m)) d['HBD'].append(rdMolDescriptors.CalcNumHBD(m)) d['jIndex'].append(GraphDescriptors.BalabanJ(m)) df = pd.DataFrame.from_dict(d) df_merge = pd.merge(df, DUD, on=df.index) #df_merge.to_csv('/home/mcb/jboitr/data/DUD_full.csv') df_merge.to_csv('C:/Users/jacqu/Documents/data/DUD_full.csv')
def extract(x, from_smiles):
    """Return the descriptor vector for ``x``.

    ``x`` is parsed as a SMILES string when ``from_smiles`` is true and
    used directly as a molecule otherwise. The vector has 24 entries, or
    29 when ``include_3D`` is set (five 3D shape descriptors, computed on
    an embedded and MMFF-optimized copy with hydrogens, are appended).
    A missing or empty molecule yields a vector of zeros of the same
    length.
    """
    mol = Chem.MolFromSmiles(x) if from_smiles else x

    if mol is None or len(mol.GetAtoms()) == 0:
        return [0] * (29 if include_3D else 24)

    features = [
        Crippen.MolLogP(mol),                       # logP
        Crippen.MolMR(mol),                         # refractivity
        Descriptors.MolWt(mol),                     # weight
        Descriptors.ExactMolWt(mol),                # exact weight
        Descriptors.HeavyAtomMolWt(mol),            # heavy-atom weight
        Lipinski.HeavyAtomCount(mol),
        Lipinski.NHOHCount(mol),
        Lipinski.NOCount(mol),
        Lipinski.NumHAcceptors(mol),
        Lipinski.NumHDonors(mol),
        Lipinski.NumHeteroatoms(mol),
        Lipinski.NumRotatableBonds(mol),
        Descriptors.NumValenceElectrons(mol),
        rdMolDescriptors.CalcNumAmideBonds(mol),
        Lipinski.NumAliphaticRings(mol),
        Lipinski.NumAromaticRings(mol),
        Lipinski.NumSaturatedRings(mol),
        Lipinski.NumAliphaticCarbocycles(mol),
        Lipinski.NumAliphaticHeterocycles(mol),
        Lipinski.NumAromaticCarbocycles(mol),
        Lipinski.NumAromaticHeterocycles(mol),
        Lipinski.NumSaturatedCarbocycles(mol),
        Lipinski.NumSaturatedHeterocycles(mol),
        rdMolDescriptors.CalcTPSA(mol),
    ]
    if not include_3D:
        return features

    # 3D descriptors need a conformer: add hydrogens, embed, optimize
    mol_3D = Chem.AddHs(mol)
    AllChem.EmbedMolecule(mol_3D)
    AllChem.MMFFOptimizeMolecule(mol_3D)
    features.extend([
        rdMolDescriptors.CalcEccentricity(mol_3D),
        rdMolDescriptors.CalcAsphericity(mol_3D),
        rdMolDescriptors.CalcSpherocityIndex(mol_3D),
        rdMolDescriptors.CalcInertialShapeFactor(mol_3D),
        rdMolDescriptors.CalcRadiusOfGyration(mol_3D),
    ])
    return features
def get_global_features(self, mol):
    """Return the molecule-level feature vector of ``mol`` as a numpy array.

    Layout: atom count, bond count, exact weight, heavy-atom weight,
    valence-electron count, six chemical-feature family counts
    (Acceptor, Donor, Hydrophobe, LumpedHydrophobe, PosIonizable,
    NegIonizable), then the rdMolDescriptors values listed below.
    """
    # Chemical features from RDKit's base feature definitions
    fdef_path = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    feature_factory = ChemicalFeatures.BuildFeatureFactory(fdef_path)
    mol_feats = feature_factory.GetFeaturesForMol(mol)

    # Basic size and weight features.
    # The four partial-charge descriptors (MaxAbsPartialCharge,
    # MaxPartialCharge, MinAbsPartialCharge, MinPartialCharge) are left
    # out: they produced infinity for refcode_csd = YOLJUF
    # (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC).
    basic = [
        mol.GetNumAtoms(),
        mol.GetNumBonds(),
        Descriptors.ExactMolWt(mol),
        Descriptors.HeavyAtomMolWt(mol),
        Descriptors.NumValenceElectrons(mol),
    ]

    # Count features per family; families not listed here are ignored
    families = ('Acceptor', 'Donor', 'Hydrophobe', 'LumpedHydrophobe',
                'PosIonizable', 'NegIonizable')
    family_counts = dict.fromkeys(families, 0)
    for feat in mol_feats:
        family = feat.GetFamily()
        if family in family_counts:
            family_counts[family] += 1

    # rdMolDescriptors functions, in output order
    descriptor_names = (
        'CalcNumRotatableBonds', 'CalcChi0n', 'CalcChi0v',
        'CalcChi1n', 'CalcChi1v', 'CalcChi2n', 'CalcChi2v',
        'CalcChi3n', 'CalcChi4n', 'CalcChi4v',
        'CalcFractionCSP3', 'CalcHallKierAlpha', 'CalcKappa1',
        'CalcKappa2', 'CalcLabuteASA',
        'CalcNumAliphaticCarbocycles', 'CalcNumAliphaticHeterocycles',
        'CalcNumAliphaticRings', 'CalcNumAmideBonds',
        'CalcNumAromaticCarbocycles', 'CalcNumAromaticHeterocycles',
        'CalcNumAromaticRings', 'CalcNumBridgeheadAtoms', 'CalcNumHBA',
        'CalcNumHBD', 'CalcNumHeteroatoms', 'CalcNumHeterocycles',
        'CalcNumLipinskiHBA', 'CalcNumLipinskiHBD', 'CalcNumRings',
        'CalcNumSaturatedCarbocycles', 'CalcNumSaturatedHeterocycles',
        'CalcNumSaturatedRings', 'CalcNumSpiroAtoms', 'CalcTPSA',
    )
    more_global_features = [getattr(rdm, fn)(mol) for fn in descriptor_names]

    u = basic + [family_counts[family] for family in families] + more_global_features
    # Some descriptors can produce NaN; they are passed through unchanged.
    u = np.array(u).T
    return u
def loadSDF(sdfPath): # Create images #generateImages(sdfPath) # Create a molecule supplier suppl = Chem.SDMolSupplier(sdfPath) # Filter empty entries sdf = [x for x in suppl if x is not None] # For each molecule in supplier for mol in sdf: data = {} try: data['fCharge'] = mol.GetProp('Charge') except: data['fCharge'] = Chem.GetFormalCharge(mol) try: data['name'] = mol.GetProp('DATABASE_ID') except: data['name'] = 'unkown' try: data['molMass'] = mol.GetProp('Total Molweight') except: data['molMass'] = Descriptors.ExactMolWt(mol) try: data['cLogP'] = mol.GetProp('cLogP') except: data['cLogP'] = Crippen.MolLogP(mol) # não sei se ta certo try: data['cLogS'] = mol.GetProp('cLogS') except: data['cLogS'] = 0.0 try: data['tpsa'] = mol.GetProp('Polar Surface Area') except: data['tpsa'] = rdMolDescriptors.CalcTPSA(mol) try: data['totalSurfaceArea'] = mol.GetProp('Total Surface Area') except: data['totalSurfaceArea'] = rdMolDescriptors.CalcTPSA(mol) try: data['hbondAcceptors'] = mol.GetProp('H-Acceptors') except: data['hbondAcceptors'] = rdMolDescriptors.CalcNumHBA(mol) try: data['hbondDonnors'] = mol.GetProp('H-Donors') except: data['hbondDonnors'] = rdMolDescriptors.CalcNumHBD(mol) try: data['rotable'] = mol.GetProp('Rotatable Bonds') except: data['rotable'] = rdMolDescriptors.CalcNumRotatableBonds(mol) try: data['mutagenic'] = mol.GetProp('Mutagenic') except: data['mutagenic'] = 'Unknown' try: data['tumorigenic'] = mol.GetProp('Tumorigenic') except: data['tumorigenic'] = 'Unknown' try: data['irritant'] = mol.GetProp('Irritant') except: data['irritant'] = 'Unkown' try: data['smiles'] = mol.GetProp('SMILES') except: data['smiles'] = Chem.MolToSmiles(mol) try: data['InChI'] = mol.GetProp('INCHI_IDENTIFIER') except: data['InChI'] = inchi.MolToInchi(mol) try: data['inchiKey'] = mol.GetProp('INCHI_KEY') except: data['inchiKey'] = inchi.MolToInchiKey(mol) try: data['nonHAtoms'] = mol.GetProp('Non-H Atoms') except: data['nonHAtoms'] = -1 # Não sei calcular try: data['numAtoms'] = 
mol.GetProp('numAtoms') except: data['numAtoms'] = mol.GetNumAtoms() try: data['stereoCenters'] = mol.GetProp('Stereo Centers') except: data['stereoCenters'] = mol.GetNumAtoms() try: data['provider'] = mol.GetProp('DATABASE_NAME') except: print("Nenhum fornecedor encontrado, o campo é obrigatório!") continue tmp = AllChem.Compute2DCoords(mol) # Compute its coordinates Draw.MolToFile(mol, os.path.join(settings.FILES_DIR, f'molImages/' + data["inchiKey"] + '.png'), size=(300,300), kekulize=True, wedgeBonds=True, fitImage=True) # Save it Draw.MolToFile(mol, os.path.join(settings.FILES_DIR, f'molThumbs/' + data["inchiKey"] + '.png'), size=(150,150), kekulize=True, wedgeBonds=True, fitImage=True) feedDatabase(data) if Compounds.objects.filter(inChIKey=data['inchiKey']).exists(): if not Compounds.objects.filter(provider=['provider']).exists(): feedDatabase(data) print("feed1") # append no sdf da base de dados a = 1 else: print("continue123") continue else: a = 1 feedDatabase(data) print("feed2") '''except:
def get_tpsa_(mol: Mol) -> float:
    """Return the TPSA of ``mol`` rounded to ``round_digs`` digits."""
    tpsa = rdMolDescriptors.CalcTPSA(mol)
    return round(tpsa, round_digs)
def get_molecular_features(dataframe, mol_list):
    """Fill ``dataframe`` with molecular features, one row per molecule.

    Row label ``i`` of the dataframe receives the features of
    ``mol_list[i]``. The dataframe is modified in place and also
    returned. Prints a progress line per molecule.
    """
    df = dataframe
    if len(mol_list) == 0:
        return df

    # The feature factory only depends on RDKit's base feature
    # definitions, so it is built once instead of once per molecule.
    fdef_name = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
    factory = ChemicalFeatures.BuildFeatureFactory(fdef_name)

    # Chemical-feature families that are counted; others are ignored
    families = ("Acceptor", "Donor", "Hydrophobe", "LumpedHydrophobe",
                "PosIonizable", "NegIonizable")

    # rdMolDescriptors functions whose column carries the function name
    descriptor_columns = (
        "CalcChi0n", "CalcChi0v", "CalcChi1n", "CalcChi1v",
        "CalcChi2n", "CalcChi2v", "CalcChi3n", "CalcChi3v",
        "CalcChi4n", "CalcChi4v", "CalcFractionCSP3",
        "CalcHallKierAlpha", "CalcKappa1", "CalcKappa2",
        "CalcLabuteASA", "CalcNumAliphaticCarbocycles",
        "CalcNumAliphaticHeterocycles", "CalcNumAliphaticRings",
        "CalcNumAmideBonds", "CalcNumAromaticCarbocycles",
        "CalcNumAromaticHeterocycles", "CalcNumAromaticRings",
        "CalcNumBridgeheadAtoms", "CalcNumHBA", "CalcNumHBD",
        "CalcNumHeteroatoms", "CalcNumHeterocycles",
        "CalcNumLipinskiHBA", "CalcNumLipinskiHBD", "CalcNumRings",
        "CalcNumSaturatedCarbocycles", "CalcNumSaturatedHeterocycles",
        "CalcNumSaturatedRings", "CalcNumSpiroAtoms", "CalcTPSA",
    )

    # Index-based access is kept so any container supporting len() and
    # mol_list[i] behaves exactly as before.
    for i in range(len(mol_list)):
        print("Getting molecular features for molecule: ", i)
        mol = mol_list[i]

        df.at[i, "NbrAtoms"] = mol.GetNumAtoms()
        df.at[i, "NbrBonds"] = mol.GetNumBonds()
        df.at[i, "mw"] = Descriptors.ExactMolWt(mol)
        df.at[i, 'HeavyAtomMolWt'] = Chem.Descriptors.HeavyAtomMolWt(mol)
        df.at[i, 'NumValenceElectrons'] = Chem.Descriptors.NumValenceElectrons(mol)
        # The four partial-charge descriptors (MaxAbsPartialCharge,
        # MaxPartialCharge, MinAbsPartialCharge, MinPartialCharge) are
        # left out: they produced infinity for refcode_csd = YOLJUF
        # (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC).
        df.at[i, 'FpDensityMorgan1'] = Chem.Descriptors.FpDensityMorgan1(mol)
        df.at[i, 'FpDensityMorgan2'] = Chem.Descriptors.FpDensityMorgan2(mol)
        df.at[i, 'FpDensityMorgan3'] = Chem.Descriptors.FpDensityMorgan3(mol)

        # Count the chemical features of each tracked family
        family_counts = dict.fromkeys(families, 0)
        for feat in factory.GetFeaturesForMol(mol):
            family = feat.GetFamily()
            if family in family_counts:
                family_counts[family] += 1
        for family in families:
            df.at[i, family] = family_counts[family]

        # More molecular features from rdMolDescriptors
        df.at[i, "NumRotatableBonds"] = rdMolDescriptors.CalcNumRotatableBonds(mol)
        for column in descriptor_columns:
            df.at[i, column] = getattr(rdMolDescriptors, column)(mol)

    return df