def test1(self):
    """testing first 200 mols from NCI

    Each readable record of ``self.inFileName`` stores reference descriptor
    values as SD properties; every descriptor is recomputed and compared
    with the stored value.
    """
    # A probe molecule reveals which rotatable-bond definition is active and
    # therefore which stored property NumRotatableBonds() has to agree with.
    probe = Chem.MolFromSmiles("CC(C)(C)c1cc(O)c(cc1O)C(C)(C)C")
    rot_prop = NonStrict if Lipinski.NumRotatableBonds(probe) == 2 else Strict

    # (calculator, reference property, failure message template)
    checks = (
        (Lipinski.NHOHCount, 'NUM_LIPINSKIHDONORS',
         'bad num h donors for mol %d (%s): %d != %d'),
        (Lipinski.NOCount, 'NUM_LIPINSKIHACCEPTORS',
         'bad num h acceptors for mol %d (%s): %d != %d'),
        (Lipinski.NumHDonors, 'NUM_HDONORS',
         'bad num h donors for mol %d (%s): %d != %d'),
        (Lipinski.NumHAcceptors, 'NUM_HACCEPTORS',
         'bad num h acceptors for mol %d (%s): %d != %d'),
        (Lipinski.NumHeteroatoms, 'NUM_HETEROATOMS',
         'bad num heteroatoms for mol %d (%s): %d != %d'),
        (Lipinski.NumRotatableBonds, rot_prop,
         'bad num rotors for mol %d (%s): %d != %d'),
        # the underlying rotatable-bond calculator, once per explicit definition
        (lambda mol: rdMolDescriptors.CalcNumRotatableBonds(
            mol, rdMolDescriptors.NumRotatableBondsOptions.NonStrict),
         NonStrict,
         'bad num rotors for mol %d (%s): %d != %d'),
        (lambda mol: rdMolDescriptors.CalcNumRotatableBonds(
            mol, rdMolDescriptors.NumRotatableBondsOptions.Strict),
         Strict,
         'bad num rotors for mol %d (%s): %d != %d'),
    )

    # idx is the 1-based position of the record in the file; unreadable
    # records are skipped but still counted.
    for idx, m in enumerate(Chem.SDMolSupplier(self.inFileName), 1):
        if not m:
            continue
        for compute, prop_name, template in checks:
            calc = compute(m)
            orig = int(m.GetProp(prop_name))
            assert calc == orig, template % (idx, m.GetProp('SMILES'), calc, orig)
def calculate_property(m):
    """Return ``(MolWt, NumRotatableBonds, MolLogP)`` for molecule *m*.

    A synthetic-accessibility score once led the tuple; it is currently
    disabled, so only three values are returned.
    """
    return (
        Descriptors.MolWt(m),
        Lipinski.NumRotatableBonds(m),
        Descriptors.MolLogP(m),
    )
def veber_infraction(molecule: Chem.Mol) -> bool:
    """Return True when *molecule* breaks either limit applied by this filter.

    A molecule fails when it has more than 10 rotatable bonds, or when its
    hydrogen-bond acceptors and donors together number more than 10.
    """
    rotatable_bonds = Lipinski.NumRotatableBonds(molecule)
    hydrogen_bonders = (Lipinski.NumHAcceptors(molecule)
                        + Lipinski.NumHDonors(molecule))
    # NOTE(review): Veber's criterion is often quoted as at most 12 donors
    # plus acceptors -- confirm that 10 is the intended cut-off here.
    return rotatable_bonds > 10 or hydrogen_bonders > 10
def score_molecule(smiles):
    """Score a SMILES string against the decorator's limits and compute its QED.

    One point is awarded for each of logP, molecular weight, H-bond donors,
    H-bond acceptors and rotatable bonds that lies strictly below its limit
    on ``LipinskiRuleOfFiveDecorator``.

    Returns:
        ``(lipinski_score, qed)``. When anything raises, the exception is
        logged, the score is 0 and ``qed`` stays at ``MAX_QED + 1``.
    """
    limits = LipinskiRuleOfFiveDecorator
    lipinski_score = 0
    qed = limits.MAX_QED + 1
    try:
        m = Chem.MolFromSmiles(smiles)
        # Acceptors used to be checked against the donor limit. Prefer a
        # dedicated acceptor limit and keep the donor limit only as a
        # fallback for classes that do not define one.
        max_h_acceptors = getattr(limits, 'MAX_H_ACCEPTORS', limits.MAX_H_DONORS)
        checks = (
            (Descriptors.MolLogP, limits.MAX_LOGP),
            (Descriptors.MolWt, limits.MAX_MOL_WT),
            (Lipinski.NumHDonors, limits.MAX_H_DONORS),
            (Lipinski.NumHAcceptors, max_h_acceptors),
            (Lipinski.NumRotatableBonds, limits.MAX_ROTATABLE_BONDS),
        )
        lipinski_score = sum(1 for compute, limit in checks if compute(m) < limit)
        qed = QED.qed(m)
    except Exception as ex:
        # Best effort: an unparsable SMILES or a failing descriptor yields a
        # zero score instead of propagating.
        lipinski_score = 0
        logger.exception(ex)
    return lipinski_score, qed
def ProcessMol(mol, typeConversions, globalProps, nDone, nameProp='_Name',
               nameCol='compound_id', redraw=False, keepHs=False,
               skipProps=False, addComputedProps=False, skipSmiles=False,
               uniqNames=None, namesSeen=None):
    """Turn one molecule into a database row.

    Args:
        mol: the molecule; a falsy value raises ``ValueError``.
        typeConversions: indexable by type code; element ``[code][1]`` is
            called with the stripped property string to convert it.
        globalProps: mapping of property name to type code, updated in place
            with the code that finally converted each property.
        nDone: counter used to build a fallback name ``Mol_<nDone>``.
        nameProp: molecule property holding the compound name.
        nameCol: name column; a property with this name (case-insensitive)
            is not copied into the property dict.
        redraw: recompute 2D coordinates before serialising.
        keepHs: sanitize the molecule first.
        skipProps: return an empty property dict.
        addComputedProps: store donor/acceptor/rotor counts, weight and logP
            on the molecule as properties before collecting properties.
        skipSmiles: leave the SMILES column out of the row.
        uniqNames: when truthy, a name already in ``namesSeen`` is rejected.
        namesSeen: optional set of names seen so far, updated in place.

    Returns:
        ``[name, (smiles,) binary, props]`` or ``None`` for a rejected
        duplicate.
    """
    if not mol:
        raise ValueError('no molecule')
    if keepHs:
        Chem.SanitizeMol(mol)
    try:
        nm = mol.GetProp(nameProp)
    except KeyError:
        nm = None
    if not nm:
        nm = 'Mol_%d' % nDone

    # namesSeen defaults to None; only consult and update it when supplied.
    if namesSeen is not None:
        if uniqNames and nm in namesSeen:
            logger.error('duplicate compound id (%s) encountered. second instance skipped.'%nm)
            return None
        namesSeen.add(nm)

    row = [nm]
    pD = {}
    if not skipProps:
        if addComputedProps:
            mol.SetProp('DonorCount', str(Lipinski.NumHDonors(mol)))
            mol.SetProp('AcceptorCount', str(Lipinski.NumHAcceptors(mol)))
            mol.SetProp('RotatableBondCount', str(Lipinski.NumRotatableBonds(mol)))
            mol.SetProp('AMW', str(Descriptors.MolWt(mol)))
            mol.SetProp('MolLogP', str(Crippen.MolLogP(mol)))

        for pn in list(mol.GetPropNames()):
            if pn.lower() == nameCol.lower():
                continue
            pv = mol.GetProp(pn).strip()
            if pv.find('>') < 0 and pv.find('<') < 0:
                # Start from the type already recorded for this property
                # (2 by default) and step down until a converter accepts
                # the value; type 0 is used without a trial conversion.
                colTyp = globalProps.get(pn, 2)
                while colTyp > 0:
                    try:
                        typeConversions[colTyp][1](pv)
                    except Exception:
                        colTyp -= 1
                    else:
                        break
                globalProps[pn] = colTyp
                pD[pn] = typeConversions[colTyp][1](pv)
            else:
                # values containing '<' or '>' are stored verbatim
                pD[pn] = pv

    if redraw:
        # was Compute2DCoords(m): `m` is not defined in this function
        AllChem.Compute2DCoords(mol)
    if not skipSmiles:
        row.append(Chem.MolToSmiles(mol, True))
    row.append(DbModule.binaryHolder(mol.ToBinary()))
    row.append(pD)
    return row
def pct_rotatable_bonds(mol):
    """Return the share of bonds in *mol* that are rotatable.

    The value is a ratio (rotatable bonds / all bonds); a molecule without
    bonds yields 0.
    """
    total_bonds = mol.GetNumBonds()
    if total_bonds <= 0:
        return 0
    return Lipinski.NumRotatableBonds(mol) / total_bonds
def get_descriptors(mol, write=False):
    """Return a 20-element list of descriptors for *mol*, with NaN mapped to 0.

    ``write`` is accepted but not used by this function.
    """
    calculators = (
        Lipinski.NumAromaticHeterocycles,
        Lipinski.NumAromaticRings,
        Lipinski.NumHDonors,
        Lipinski.RingCount,
        Lipinski.NHOHCount,
        Lipinski.NumHeteroatoms,
        Lipinski.NumAliphaticCarbocycles,
        Lipinski.NumSaturatedCarbocycles,
        Lipinski.NumAliphaticHeterocycles,
        Lipinski.NumHAcceptors,
        Lipinski.NumSaturatedHeterocycles,
        Lipinski.NumAliphaticRings,
        Descriptors.NumRadicalElectrons,
        Descriptors.MaxPartialCharge,
        Descriptors.NumValenceElectrons,
        Lipinski.FractionCSP3,
        Descriptors.MaxAbsPartialCharge,
        Lipinski.NumAromaticCarbocycles,
        Lipinski.NumSaturatedRings,
        Lipinski.NumRotatableBonds,
    )
    cleaned = []
    for calculate in calculators:
        value = calculate(mol)
        # a value that is not equal to itself is NaN; report it as 0
        cleaned.append(0 if value != value else value)
    return cleaned
def generate(smiles):
    """Build a descriptor table with one row per SMILES string.

    Args:
        smiles: iterable of SMILES strings.

    Returns:
        A float ``DataFrame`` with columns ``MolLogP``, ``MolWt``,
        ``NumRotatableBonds`` and ``AromaticProportion``. Empty input gives
        an empty frame with those columns.
    """
    columnNames = ["MolLogP", "MolWt", "NumRotatableBonds", "AromaticProportion"]

    rows = []
    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        rows.append([
            Crippen.MolLogP(mol),
            Descriptors.MolWt(mol),
            Lipinski.NumRotatableBonds(mol),
            getAromaticProportion(mol),
        ])

    # Collect rows first and build the array once. The reshape keeps the
    # data 2-D for zero or one molecule, where the previous incremental
    # vstack produced a 1-D array that DataFrame rejected.
    baseData = np.array(rows, dtype=float).reshape(-1, len(columnNames))
    return pd.DataFrame(data=baseData, columns=columnNames)
def testMQN(self):
    """Check the element-wise sum of MQN vectors over the regression set."""
    # NOTE(review): this test used to pick between two expected vectors,
    # differing only at index 18 (9303 when the probe molecule reported 2
    # rotatable bonds, 8314 otherwise), and then unconditionally replaced
    # the choice with the vector below. The dead branch and its probe
    # molecule are removed; confirm that a single expected vector is intended
    # for both rotatable-bond definitions.
    tgt = [
        42917, 274, 870, 621, 135, 1582, 29, 3147, 5463, 6999, 470, 62588,
        19055, 4424, 309, 24059, 17822, 1, 8314, 24146, 16076, 5560, 4262,
        646, 746, 13725, 5430, 2629, 362, 24211, 15939, 292, 41, 20, 1852,
        5642, 31, 9, 1, 2, 3060, 1750
    ]
    fn = os.path.join(os.path.dirname(__file__), 'test_data', 'aromat_regress.txt')
    ms = [x for x in Chem.SmilesMolSupplier(fn, delimiter='\t')]
    vs = np.zeros((42, ), np.int32)
    for m in ms:
        vs += rdMolDescriptors.MQNs_(m)
    self.assertEqual(list(vs), tgt)
def testMQNDetails(self):
    """Compare each molecule's MQN vector with the pickled regression data."""
    data_dir = os.path.join(os.path.dirname(__file__), 'test_data')

    # figure out which rotatable-bond definition is currently in use: the
    # probe molecule reporting 2 rotors selects the non-strict reference
    probe = Chem.MolFromSmiles("CC(C)(C)c1cc(O)c(cc1O)C(C)(C)C")
    if Lipinski.NumRotatableBonds(probe) == 2:
        ref_name = 'MQNs_non_strict_regress.pkl'
    else:
        ref_name = 'MQNs_regress.pkl'
    with open(os.path.join(data_dir, ref_name), 'rb') as intf:
        refData = pickle.load(intf)

    fn = os.path.join(data_dir, 'aromat_regress.txt')
    ms = [mol for mol in Chem.SmilesMolSupplier(fn, delimiter='\t')]
    for i, m in enumerate(ms):
        mqns = rdMolDescriptors.MQNs_(m)
        expected = refData[i][1]
        if mqns != expected:
            # show (position, computed, reference) for every differing entry
            indices = [(j, x, y)
                       for j, (x, y) in enumerate(zip(mqns, expected))
                       if x != y]
            print(i, Chem.MolToSmiles(m), indices)
        self.assertEqual(mqns, expected)
def get_filter_values(mol):
    """Collect, as a dictionary, the per-molecule values used for filtering.

    Besides the computed descriptors the dictionary carries ``is_good``
    (initialised to True) and ``rejections`` (an empty list for the reasons
    a later filter may add).
    """
    assert isinstance(mol, Chem.Mol)

    values = {
        "MW": desc.CalcExactMolWt(mol),
        "logP": crip.MolLogP(mol),
        "HBA": lip.NumHAcceptors(mol),
        "HBD": lip.NumHDonors(mol),
        "tPSA": desc.CalcTPSA(mol),
        "rot_bonds": lip.NumRotatableBonds(mol),
    }
    # every bond that is not rotatable is counted as rigid
    values["rigid_bonds"] = mol.GetNumBonds() - values["rot_bonds"]
    values["num_rings"] = lip.RingCount(mol)
    values["num_hetero_atoms"] = lip.NumHeteroatoms(mol)
    # the formal charge reported by RDKit is taken at face value
    values["charge"] = rdmolops.GetFormalCharge(mol)

    carbons, charges, largest_ring = get_atom_props(mol)
    values["num_carbons"] = carbons
    values["num_charges"] = charges
    values["max_ring_size"] = largest_ring

    try:
        values["hc_ratio"] = float(values["num_hetero_atoms"]) / float(values["num_carbons"])
    except ZeroDivisionError:
        # no carbons at all: use a very large sentinel ratio
        values["hc_ratio"] = 100000000

    # number of BRICS bonds, used as a complexity indicator
    values["fc"] = len(list(Brics.FindBRICSBonds(mol)))
    values["is_good"] = True
    # distinct element symbols present in the molecule
    values["atoms"] = list({atom.GetSymbol() for atom in mol.GetAtoms()})
    values["num_chiral_centers"] = len(
        Chem.FindMolChiralCenters(mol, includeUnassigned=True))
    values["rejections"] = []
    return values
def testMQNDetails(self):
    """Compare each molecule's MQN vector with the pickled regression data."""
    data_dir = os.path.join(RDConfig.RDCodeDir, 'Chem', 'test_data')
    refFile = os.path.join(data_dir, 'MQNs_regress.pkl')
    refFile2 = os.path.join(data_dir, 'MQNs_non_strict_regress.pkl')
    # figure out which definition we are currently using
    m = Chem.MolFromSmiles("CC(C)(C)c1cc(O)c(cc1O)C(C)(C)C")
    if Lipinski.NumRotatableBonds(m) == 2:
        refFile = refFile2

    # The reference pickle is read as text so that '\r\n' line endings are
    # normalised before unpickling. The `with` block closes the file, and
    # the bytes go straight to the unpickler.
    # NOTE(review): this assumes the pickle is a text-safe protocol -- confirm.
    with open(refFile, 'r') as intf:
        buf = intf.read().replace('\r\n', '\n').encode('utf-8')
    refData = cPickle.loads(buf, encoding='bytes')

    fn = os.path.join(data_dir, 'aromat_regress.txt')
    ms = [x for x in Chem.SmilesMolSupplier(fn, delimiter='\t')]
    for i, m in enumerate(ms):
        mqns = rdMolDescriptors.MQNs_(m)
        if mqns != refData[i][1]:
            # show (position, computed, reference) for every differing entry
            indices = [
                (j, x, y) for j, x, y in zip(range(len(mqns)), mqns, refData[i][1])
                if x != y
            ]
            print(i, Chem.MolToSmiles(m), indices)
        self.assertEqual(mqns, refData[i][1])
def CalculateRotationBondNumber(mol):
    """
    Count the rotatable bonds of a molecule.

    Parameters:
        mol: rdkit molecule

    Returns:
        The value reported by ``LPK.NumRotatableBonds`` for ``mol``
    """
    rotatable_bond_count = LPK.NumRotatableBonds(mol)
    return rotatable_bond_count
def mole_proper(mol):
    """Return a six-value property tuple for *mol*.

    The order is: H-bond donors, H-bond acceptors, rotatable bonds,
    molecular weight, logP, TPSA.
    """
    return (
        Lipinski.NumHDonors(mol),
        Lipinski.NumHAcceptors(mol),
        Lipinski.NumRotatableBonds(mol),
        Descriptors.MolWt(mol),
        Crippen.MolLogP(mol),
        Descriptors.TPSA(mol),
    )
def calc_esol_descriptors(self, mol):
    """
    Calculate molecular weight, logP, rotor count and aromatic proportion
    :param mol: input molecule
    :return: ``self.Descriptor`` built with fields mw, logp, rotors and ap
    """
    return self.Descriptor(
        mw=Descriptors.MolWt(mol),
        logp=Crippen.MolLogP(mol),
        rotors=Lipinski.NumRotatableBonds(mol),
        ap=self.calc_ap(mol),
    )
def auto_sampling(mult_factor, mol, log):
    """Derive a sample count from the flexible features of *mol*.

    Each rotatable bond, NH/OH group and saturated ring contributes three
    samples; the total is scaled by ``mult_factor``. A molecule with none of
    these features gets ``mult_factor`` itself. ``log`` is not used.
    """
    feature_counts = (
        Lipinski.NumRotatableBonds(mol),   # x3, for C3 rotations
        Lipinski.NHOHCount(mol),           # x3, for OH/NH rotations
        Lipinski.NumSaturatedRings(mol),   # x3, for boat/chair/envelope confs
    )
    auto_samples = sum(3 * count for count in feature_counts)
    if auto_samples == 0:
        return mult_factor
    return mult_factor * auto_samples
def descriptors(self, mol):
    """Return an 11-element descriptor list for *mol*.

    Order: aromatic fraction, exact molecular weight, valence electrons,
    H-bond acceptors, H-bond donors, N/O count, NH/OH count, rotatable
    bonds, sp3 carbon fraction, logP, and the number of atoms matching the
    SMARTS ``[^1]``.
    """
    # NOTE(review): '[^1]' presumably selects sp-hybridised atoms -- confirm.
    sp_pattern = Chem.MolFromSmarts('[^1]')
    features = [
        self.arofrac(mol),
        Descriptors.ExactMolWt(mol, False),
        Descriptors.NumValenceElectrons(mol),
        Lipinski.NumHAcceptors(mol),
        Lipinski.NumHDonors(mol),
        Lipinski.NOCount(mol),
        Lipinski.NHOHCount(mol),
        Lipinski.NumRotatableBonds(mol),
        Lipinski.FractionCSP3(mol),
        Crippen.MolLogP(mol),
    ]
    features.append(len(mol.GetSubstructMatches(sp_pattern)))
    return features
def auto_sampling(mult_factor,mol,args,log):
    """Derive a sample count from the flexible features of *mol*.

    For metal complexes with at least one metal index the multiplier is
    first scaled by ``3 * len(args.metal_idx)``. Each rotatable bond, NH/OH
    group and saturated ring then contributes three samples, scaled by the
    multiplier; with no such features the multiplier itself is returned.
    ``log`` is not used.
    """
    if args.metal_complex and len(args.metal_idx) > 0:
        # this accounts for possible trans/cis isomers in metal complexes
        mult_factor = mult_factor * 3 * len(args.metal_idx)

    feature_counts = (
        Lipinski.NumRotatableBonds(mol),   # x3, for C3 rotations
        Lipinski.NHOHCount(mol),           # x3, for OH/NH rotations
        Lipinski.NumSaturatedRings(mol),   # x3, for boat/chair/envelope confs
    )
    auto_samples = sum(3 * count for count in feature_counts)
    if auto_samples == 0:
        return mult_factor
    return mult_factor * auto_samples
def getDiscriptor(self):
    """Compute descriptors for every structure in the input CSV and save them.

    Changes the working directory to a hard-coded data folder, reads
    ``extChronicStrcture.csv`` (columns ``CAS`` and ``canonical_smiles``,
    rows with missing values dropped) and writes one row of descriptors per
    structure to ``Descriptors.csv``. Returns nothing.
    """
    from rdkit.Chem import Crippen
    from rdkit import Chem
    import pandas as pd
    from rdkit.Chem import Descriptors, Lipinski
    import os
    os.chdir(r"G:\マイドライブ\Data\Meram Chronic Data")
    df = pd.read_csv('extChronicStrcture.csv', engine='python')
    df = df[['CAS', 'canonical_smiles']]
    df = df.dropna(how='any')
    columns = [
        'CAS', 'weight', 'logP', 'RotatableBonds', 'HeavyAtomCounts', 'AromProp',
        'TPSA', 'HDonor', 'HAcceptors', 'FractionCSP3', 'AromaticCarbocycles',
        'AromaticHeterocycles'
    ]
    # the aromatic-atom query does not depend on the molecule: compile it once
    aromatic_atom = Chem.MolFromSmarts('[a]')

    # Rows are collected in a list and turned into a frame once; growing a
    # DataFrame with concat inside the loop copies all previous rows each time.
    rows = []
    for cas, smiles in zip(df['CAS'], df['canonical_smiles']):
        mol = Chem.MolFromSmiles(smiles)
        wt = Descriptors.MolWt(mol)
        rot = Lipinski.NumRotatableBonds(mol)
        heavy = Lipinski.HeavyAtomCount(mol)
        logp = Crippen.MolLogP(mol)
        aromaticHeavyatoms = len(mol.GetSubstructMatches(aromatic_atom))
        numAtoms = mol.GetNumAtoms()
        aromprop = float(aromaticHeavyatoms / numAtoms)
        TPSA = Descriptors.TPSA(mol)
        HDonors = Descriptors.NumHDonors(mol)
        HAcceptors = Descriptors.NumHAcceptors(mol)
        FractionCSP3 = Descriptors.FractionCSP3(mol)
        AromaticCarbocycles = Descriptors.NumAromaticCarbocycles(mol)
        AromaticHeterocycles = Descriptors.NumAromaticHeterocycles(mol)
        print(HDonors, HAcceptors)
        rows.append([
            cas, wt, logp, rot, heavy, aromprop, TPSA, HDonors, HAcceptors,
            FractionCSP3, AromaticCarbocycles, AromaticHeterocycles
        ])

    resultDf = pd.DataFrame(rows, columns=columns)
    resultDf.to_csv('Descriptors.csv', index=False)
def ProcessMol(session, mol, globalProps, nDone, nameProp='_Name', nameCol='compound_id',
               redraw=False, keepHs=False, skipProps=False, addComputedProps=False,
               skipSmiles=False):
    """Create a ``Compound`` for *mol*, add it to *session* and populate it.

    Args:
        session: object whose ``add`` method receives the new compound.
        mol: the molecule; a falsy value raises ``ValueError``.
        globalProps: container of property names that are copied onto the
            compound (as lower-cased attribute names).
        nDone: counter used to build a fallback name ``Mol_<nDone>``.
        nameProp: molecule property holding the compound name.
        nameCol: attribute that receives the name; a molecule property with
            this name (case-insensitive) is not copied.
        redraw: recompute 2D coordinates before serialising.
        keepHs: sanitize the molecule first.
        skipProps: do not copy any properties.
        addComputedProps: also store donor/acceptor/rotor counts, weight
            and logP on the compound.
        skipSmiles: do not set ``cmpd.smiles``.

    Returns:
        The populated compound.
    """
    if not mol:
        raise ValueError('no molecule')
    if keepHs:
        Chem.SanitizeMol(mol)
    try:
        nm = mol.GetProp(nameProp)
    except KeyError:
        nm = None
    if not nm:
        nm = 'Mol_%d' % nDone

    cmpd = Compound()
    session.add(cmpd)

    if redraw:
        # was Compute2DCoords(m): `m` is not defined in this function
        AllChem.Compute2DCoords(mol)

    if not skipSmiles:
        cmpd.smiles = Chem.MolToSmiles(mol, True)
    cmpd.molpkl = mol.ToBinary()
    setattr(cmpd, nameCol, nm)

    if not skipProps:
        if addComputedProps:
            cmpd.DonorCount = Lipinski.NumHDonors(mol)
            cmpd.AcceptorCount = Lipinski.NumHAcceptors(mol)
            cmpd.RotatableBondCount = Lipinski.NumRotatableBonds(mol)
            cmpd.AMW = Descriptors.MolWt(mol)
            cmpd.MolLogP = Crippen.MolLogP(mol)
        for pn in list(mol.GetPropNames()):
            if pn.lower() == nameCol.lower():
                continue
            pv = mol.GetProp(pn).strip()
            if pn in globalProps:
                setattr(cmpd, pn.lower(), pv)
    return cmpd
def run_filter(self, mol):
    """
    Run the Mozziconacci drug-likeness filter on a molecule.

    To pass the filter a molecule must satisfy all of:
        # of Rotatable bonds: Max 15
        # of Rings: Max 6
        # of Oxygens: Min 1
        # of Nitrogens: Min 1
        # of Halogens: Max 7

    Inputs:
    :param rdkit.Chem.rdchem.Mol object mol: An rdkit mol object to be
        tested if it passes the filters
    Returns:
    :returns: bool bool: True if the mol passes the filter; False if it
        fails the filter
    """
    # maxMatches is set just high enough to decide each test, so the
    # substructure search can stop early.
    halogen = Chem.MolFromSmarts("[*;#9,#17,#35,#53,#85]")
    number_of_halogens = len(mol.GetSubstructMatches(halogen, maxMatches=8))
    if number_of_halogens > 7:
        return False

    oxygen = Chem.MolFromSmarts("[#8]")
    number_of_oxygens = len(mol.GetSubstructMatches(oxygen, maxMatches=2))
    if number_of_oxygens < 1:
        return False

    nitrogen = Chem.MolFromSmarts("[#7]")
    number_of_nitrogen = len(mol.GetSubstructMatches(nitrogen, maxMatches=2))
    if number_of_nitrogen < 1:
        return False

    num_rotatable_bonds = Lipinski.NumRotatableBonds(mol)
    if num_rotatable_bonds > 15:
        return False

    # GetSSSR returns the ring count in older RDKit releases and the rings
    # themselves in newer ones; reduce either form to a count.
    sssr = Chem.rdmolops.GetSSSR(mol)
    ring_count = sssr if isinstance(sssr, int) else len(sssr)
    if ring_count > 6:
        return False

    # Passes everything
    return True
def filters(mol,args):
    """Return True when *mol* passes every structure filter.

    The filters are applied in order: fewer rotatable bonds than
    ``args.max_torsions``, molecular weight below ``args.max_MolWt``, and
    every atom symbol contained in ``possible_atoms``. The salt filter
    (rejecting SMILES with several components) is currently disabled.
    """
    # First filter: number of rotatable bonds; second: molecular weight.
    # The weight is only computed when the first filter passes.
    within_limits = (Lipinski.NumRotatableBonds(mol) < args.max_torsions
                     and Descriptors.MolWt(mol) < args.max_MolWt)
    if not within_limits:
        return False
    # Fourth filter: atoms outside the scope chosen in 'possible_atoms'
    return all(atom.GetSymbol() in possible_atoms for atom in mol.GetAtoms())
def PhyChem(smiles):
    """Compute 19 physicochemical descriptors per molecule and scale them.

    Arguments:
        smiles (list): list of SMILES strings.

    Returns:
        props (ndarray): m X 19 matrix of descriptors after
            ``Scaler().fit_transform``; m is the No. of samples. A molecule
            for which any descriptor fails contributes a row of zeros
            (before scaling) and its SMILES is printed.
    """
    table = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        try:
            # order: MW, LOGP, HBA, HBD, rotatable, amide, bridgehead,
            # heteroatoms, heavy atoms, spiro, FCSP3, rings, aliphatic,
            # aromatic, saturated, heterocycles, TPSA, valence electrons, MR
            calculators = (
                desc.MolWt,
                Crippen.MolLogP,
                Lipinski.NumHAcceptors,
                Lipinski.NumHDonors,
                Lipinski.NumRotatableBonds,
                AllChem.CalcNumAmideBonds,
                AllChem.CalcNumBridgeheadAtoms,
                Lipinski.NumHeteroatoms,
                Lipinski.HeavyAtomCount,
                AllChem.CalcNumSpiroAtoms,
                AllChem.CalcFractionCSP3,
                Lipinski.RingCount,
                AllChem.CalcNumAliphaticRings,
                AllChem.CalcNumAromaticRings,
                AllChem.CalcNumSaturatedRings,
                AllChem.CalcNumHeterocycles,
                MolSurf.TPSA,
                desc.NumValenceElectrons,
                Crippen.MolMR,
            )
            row = [calculate(mol) for calculate in calculators]
        except Exception:
            print(smile)
            row = [0] * 19
        table.append(row)
    return Scaler().fit_transform(np.array(table))
def properties(fnames, labels, is_active=False):
    """Tabulate six structural counts for every molecule of every file.

    The counts are hydrogen-bond acceptors and donors, rotatable bonds,
    aliphatic rings, aromatic rings and heterocycles.

    Arguments:
        fnames (list): paths of the tab-separated molecule files.
        labels (list): the label for each file in ``fnames``.
        is_active (bool, optional): when True keep only rows with
            SCORE > 0.5 (if a SCORE column exists) or PCHEMBL_VALUE >= 6.5
            (if that column exists instead); otherwise the thresholds are
            SCORE > 0 and PCHEMBL_VALUE >= 0. (Default: False)

    Returns:
        df (DataFrame): long-format table with columns 'Set' (file label),
            'Property' (count name) and 'Number' (count value). Files are
            de-duplicated on CANONICAL_SMILES and randomly sampled down to
            1e5 molecules when larger.
    """
    max_rows = int(1e5)
    # (display name, calculator), in output order
    counters = (
        ('Hydrogen Bond\nAcceptor', Lipinski.NumHAcceptors),
        ('Hydrogen\nBond Donor', Lipinski.NumHDonors),
        ('Rotatable\nBond', Lipinski.NumRotatableBonds),
        ('Aliphatic\nRing', AllChem.CalcNumAliphaticRings),
        ('Aromatic\nRing', Lipinski.NumAromaticRings),
        ('Heterocycle', AllChem.CalcNumHeterocycles),
    ) if fnames else ()

    records = []
    for position, fname in enumerate(fnames):
        table = pd.read_table(fname)
        if 'SCORE' in table.columns:
            table = table[table.SCORE > (0.5 if is_active else 0)]
        elif 'PCHEMBL_VALUE' in table.columns:
            table = table[table.PCHEMBL_VALUE >= (6.5 if is_active else 0)]
        table = table.drop_duplicates(subset='CANONICAL_SMILES')
        if len(table) > max_rows:
            table = table.sample(max_rows)
        for smile in tqdm(table.CANONICAL_SMILES):
            mol = Chem.MolFromSmiles(smile)
            for title, count in counters:
                value = count(mol)
                records.append([labels[position], title, value])
    return pd.DataFrame(records, columns=['Set', 'Property', 'Number'])
def test1(self):
    """testing first 200 mols from NCI

    Each readable record of ``self.inFileName`` stores reference descriptor
    values as SD properties; every descriptor is recomputed and compared
    with the stored value.
    """
    # The legacy donor/acceptor SMARTS counters that used to be built here
    # were never called, so they have been dropped.

    # (calculator, reference property, failure message template)
    checks = (
        (Lipinski.NHOHCount, 'NUM_LIPINSKIHDONORS',
         'bad num h donors for mol %d (%s): %d != %d'),
        (Lipinski.NOCount, 'NUM_LIPINSKIHACCEPTORS',
         'bad num h acceptors for mol %d (%s): %d != %d'),
        (Lipinski.NumHDonors, 'NUM_HDONORS',
         'bad num h donors for mol %d (%s): %d != %d'),
        (Lipinski.NumHAcceptors, 'NUM_HACCEPTORS',
         'bad num h acceptors for mol %d (%s): %d != %d'),
        (Lipinski.NumHeteroatoms, 'NUM_HETEROATOMS',
         'bad num heteroatoms for mol %d (%s): %d != %d'),
        (Lipinski.NumRotatableBonds, 'NUM_ROTATABLEBONDS',
         'bad num rotors for mol %d (%s): %d != %d'),
    )

    # idx is the 1-based position of the record in the file; unreadable
    # records are skipped but still counted.
    for idx, m in enumerate(Chem.SDMolSupplier(self.inFileName), 1):
        if not m:
            continue
        for compute, prop_name, template in checks:
            calc = compute(m)
            orig = int(m.GetProp(prop_name))
            assert calc == orig, template % (idx, m.GetProp('SMILES'), calc, orig)
def CalculateRotationBondNumber(mol):
    """
    #################################################################
    Count the rotatable bonds in a molecule

    ---->nrot

    NOTE(review): an earlier version of this text said the value equals
    the number of single bonds; the code only delegates to
    LPK.NumRotatableBonds, so that equivalence should be confirmed before
    relying on it.

    Usage:

        result=CalculateRotationBondNumber(mol)

        Input: mol is a molecule object.

        Output: result is a numeric value.
    #################################################################
    """
    nrot = LPK.NumRotatableBonds(mol)
    return nrot
def properties(mol):
    """
    Calculates the properties that are required to calculate the QED descriptor.

    Returns an 8-element list: molecular weight, logP, H-bond acceptor
    matches, H-bond donors, polar surface area, rotatable bonds, the ring
    count left after deleting ``AliphaticRings`` matches, and the number of
    structural alerts that match.

    Raises TypeError when ``mol`` is None.
    """
    if mol is None:
        raise TypeError('You need to provide a mol argument.')
    x = [0] * 8
    x[0] = rdmd._CalcMolWt(mol)  # MW
    x[1] = Crippen.MolLogP(mol)  # ALOGP
    # HBA: total number of matches over all acceptor patterns (a pattern
    # without a match simply contributes zero)
    x[2] = sum(len(mol.GetSubstructMatches(hbaPattern)) for hbaPattern in Acceptors)
    x[3] = Lipinski.NumHDonors(mol)  # HBD
    x[4] = MolSurf.TPSA(mol)  # PSA
    x[5] = Lipinski.NumRotatableBonds(mol)  # ROTB
    # AROM: GetSSSR returns the ring count in older RDKit releases and the
    # rings themselves in newer ones; reduce either form to a count.
    sssr = Chem.GetSSSR(Chem.DeleteSubstructs(deepcopy(mol), AliphaticRings))
    x[6] = sssr if isinstance(sssr, int) else len(sssr)
    # ALERTS: each matching alert counts once
    x[7] = sum(1 for alert in StructuralAlerts if mol.HasSubstructMatch(alert))
    return x
def calc_rdkit(mol):
    """Evaluate a fixed battery of RDKit descriptors on *mol*.

    Returns a ``pandas.Series`` (default integer index) holding the
    descriptor values in the order of the table below.
    """
    calculators = (
        # Crippen and general descriptors
        Crippen.MolLogP, Crippen.MolMR,
        Descriptors.FpDensityMorgan1, Descriptors.FpDensityMorgan2,
        Descriptors.FpDensityMorgan3, Descriptors.FractionCSP3,
        Descriptors.HeavyAtomMolWt, Descriptors.MaxAbsPartialCharge,
        Descriptors.MaxPartialCharge, Descriptors.MinAbsPartialCharge,
        Descriptors.MinPartialCharge, Descriptors.MolWt,
        Descriptors.NumRadicalElectrons, Descriptors.NumValenceElectrons,
        # EState indices and EState VSA bins
        EState.EState.MaxAbsEStateIndex, EState.EState.MaxEStateIndex,
        EState.EState.MinAbsEStateIndex, EState.EState.MinEStateIndex,
        EState.EState_VSA.EState_VSA1, EState.EState_VSA.EState_VSA10,
        EState.EState_VSA.EState_VSA11, EState.EState_VSA.EState_VSA2,
        EState.EState_VSA.EState_VSA3, EState.EState_VSA.EState_VSA4,
        EState.EState_VSA.EState_VSA5, EState.EState_VSA.EState_VSA6,
        EState.EState_VSA.EState_VSA7, EState.EState_VSA.EState_VSA8,
        EState.EState_VSA.EState_VSA9,
        # fragment counts
        Fragments.fr_Al_COO, Fragments.fr_Al_OH, Fragments.fr_Al_OH_noTert,
        Fragments.fr_aldehyde, Fragments.fr_alkyl_carbamate,
        Fragments.fr_alkyl_halide, Fragments.fr_allylic_oxid,
        Fragments.fr_amide, Fragments.fr_amidine, Fragments.fr_aniline,
        Fragments.fr_Ar_COO, Fragments.fr_Ar_N, Fragments.fr_Ar_NH,
        Fragments.fr_Ar_OH, Fragments.fr_ArN, Fragments.fr_aryl_methyl,
        Fragments.fr_azide, Fragments.fr_azo, Fragments.fr_barbitur,
        Fragments.fr_benzene, Fragments.fr_benzodiazepine,
        Fragments.fr_bicyclic, Fragments.fr_C_O, Fragments.fr_C_O_noCOO,
        Fragments.fr_C_S, Fragments.fr_COO, Fragments.fr_COO2,
        Fragments.fr_diazo, Fragments.fr_dihydropyridine,
        Fragments.fr_epoxide, Fragments.fr_ester, Fragments.fr_ether,
        Fragments.fr_furan, Fragments.fr_guanido, Fragments.fr_halogen,
        Fragments.fr_hdrzine, Fragments.fr_hdrzone, Fragments.fr_HOCCN,
        Fragments.fr_imidazole, Fragments.fr_imide, Fragments.fr_Imine,
        Fragments.fr_isocyan, Fragments.fr_isothiocyan, Fragments.fr_ketone,
        Fragments.fr_ketone_Topliss, Fragments.fr_lactam,
        Fragments.fr_lactone, Fragments.fr_methoxy, Fragments.fr_morpholine,
        Fragments.fr_N_O, Fragments.fr_Ndealkylation1,
        Fragments.fr_Ndealkylation2, Fragments.fr_NH0, Fragments.fr_NH1,
        Fragments.fr_NH2, Fragments.fr_Nhpyrrole, Fragments.fr_nitrile,
        Fragments.fr_nitro, Fragments.fr_nitro_arom,
        Fragments.fr_nitro_arom_nonortho, Fragments.fr_nitroso,
        Fragments.fr_oxazole, Fragments.fr_oxime,
        Fragments.fr_para_hydroxylation, Fragments.fr_phenol,
        Fragments.fr_phenol_noOrthoHbond, Fragments.fr_phos_acid,
        Fragments.fr_phos_ester, Fragments.fr_piperdine,
        Fragments.fr_piperzine, Fragments.fr_priamide,
        Fragments.fr_prisulfonamd, Fragments.fr_pyridine,
        Fragments.fr_quatN, Fragments.fr_SH, Fragments.fr_sulfide,
        Fragments.fr_sulfonamd, Fragments.fr_sulfone,
        Fragments.fr_term_acetylene, Fragments.fr_tetrazole,
        Fragments.fr_thiazole, Fragments.fr_thiocyan,
        Fragments.fr_thiophene, Fragments.fr_unbrch_alkane,
        Fragments.fr_urea,
        # graph descriptors
        GraphDescriptors.BalabanJ, GraphDescriptors.BertzCT,
        GraphDescriptors.Chi0, GraphDescriptors.Chi0n,
        GraphDescriptors.Chi0v, GraphDescriptors.Chi1,
        GraphDescriptors.Chi1n, GraphDescriptors.Chi1v,
        GraphDescriptors.Chi2n, GraphDescriptors.Chi2v,
        GraphDescriptors.Chi3n, GraphDescriptors.Chi3v,
        GraphDescriptors.Chi4n, GraphDescriptors.Chi4v,
        GraphDescriptors.HallKierAlpha, GraphDescriptors.Ipc,
        GraphDescriptors.Kappa1, GraphDescriptors.Kappa2,
        GraphDescriptors.Kappa3,
        # Lipinski counts
        Lipinski.HeavyAtomCount, Lipinski.NHOHCount, Lipinski.NOCount,
        Lipinski.NumAliphaticCarbocycles, Lipinski.NumAliphaticHeterocycles,
        Lipinski.NumAliphaticRings, Lipinski.NumAromaticCarbocycles,
        Lipinski.NumAromaticHeterocycles, Lipinski.NumAromaticRings,
        Lipinski.NumHAcceptors, Lipinski.NumHDonors,
        Lipinski.NumHeteroatoms, Lipinski.NumRotatableBonds,
        Lipinski.NumSaturatedCarbocycles, Lipinski.NumSaturatedHeterocycles,
        Lipinski.NumSaturatedRings, Lipinski.RingCount,
        # surface-area descriptors
        MolSurf.LabuteASA,
        MolSurf.PEOE_VSA1, MolSurf.PEOE_VSA10, MolSurf.PEOE_VSA11,
        MolSurf.PEOE_VSA12, MolSurf.PEOE_VSA13, MolSurf.PEOE_VSA14,
        MolSurf.PEOE_VSA2, MolSurf.PEOE_VSA3, MolSurf.PEOE_VSA4,
        MolSurf.PEOE_VSA5, MolSurf.PEOE_VSA6, MolSurf.PEOE_VSA7,
        MolSurf.PEOE_VSA8, MolSurf.PEOE_VSA9,
        MolSurf.SlogP_VSA1, MolSurf.SlogP_VSA10, MolSurf.SlogP_VSA11,
        MolSurf.SlogP_VSA12, MolSurf.SlogP_VSA2, MolSurf.SlogP_VSA3,
        MolSurf.SlogP_VSA4, MolSurf.SlogP_VSA5, MolSurf.SlogP_VSA6,
        MolSurf.SlogP_VSA7, MolSurf.SlogP_VSA8, MolSurf.SlogP_VSA9,
        MolSurf.SMR_VSA1, MolSurf.SMR_VSA10, MolSurf.SMR_VSA2,
        MolSurf.SMR_VSA3, MolSurf.SMR_VSA4, MolSurf.SMR_VSA5,
        MolSurf.SMR_VSA6, MolSurf.SMR_VSA7, MolSurf.SMR_VSA8,
        MolSurf.SMR_VSA9, MolSurf.TPSA,
    )
    values = [calculate(mol) for calculate in calculators]
    return pd.Series(np.array(values))
def extract(x, from_smiles):
    """Return the descriptor vector for one molecule.

    ``x`` is parsed as SMILES when ``from_smiles`` is true and used as a
    molecule otherwise. The result has 24 entries, or 29 when the enclosing
    ``include_3D`` flag is set (five shape descriptors computed on an
    embedded, MMFF-optimised copy with hydrogens are appended). A missing or
    atom-less molecule yields a vector of zeros of the same length.
    """
    mol = Chem.MolFromSmiles(x) if from_smiles else x
    if (mol is None) or (len(mol.GetAtoms()) == 0):
        return [0] * (29 if include_3D else 24)

    features = [
        Crippen.MolLogP(mol),
        Crippen.MolMR(mol),
        Descriptors.MolWt(mol),
        Descriptors.ExactMolWt(mol),
        Descriptors.HeavyAtomMolWt(mol),
        Lipinski.HeavyAtomCount(mol),
        Lipinski.NHOHCount(mol),
        Lipinski.NOCount(mol),
        Lipinski.NumHAcceptors(mol),
        Lipinski.NumHDonors(mol),
        Lipinski.NumHeteroatoms(mol),
        Lipinski.NumRotatableBonds(mol),
        Descriptors.NumValenceElectrons(mol),
        rdMolDescriptors.CalcNumAmideBonds(mol),
        Lipinski.NumAliphaticRings(mol),
        Lipinski.NumAromaticRings(mol),
        Lipinski.NumSaturatedRings(mol),
        Lipinski.NumAliphaticCarbocycles(mol),
        Lipinski.NumAliphaticHeterocycles(mol),
        Lipinski.NumAromaticCarbocycles(mol),
        Lipinski.NumAromaticHeterocycles(mol),
        Lipinski.NumSaturatedCarbocycles(mol),
        Lipinski.NumSaturatedHeterocycles(mol),
        rdMolDescriptors.CalcTPSA(mol),
    ]
    if include_3D:
        # shape descriptors need a 3D conformer with explicit hydrogens
        mol_3D = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol_3D)
        AllChem.MMFFOptimizeMolecule(mol_3D)
        features.extend([
            rdMolDescriptors.CalcEccentricity(mol_3D),
            rdMolDescriptors.CalcAsphericity(mol_3D),
            rdMolDescriptors.CalcSpherocityIndex(mol_3D),
            rdMolDescriptors.CalcInertialShapeFactor(mol_3D),
            rdMolDescriptors.CalcRadiusOfGyration(mol_3D),
        ])
    return features
def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], smile_cols: int = 0):
    """Add rule-of-five style property columns to *df*.

    For every row the SMILES in column position ``smile_cols`` is parsed and
    six cells are produced: molecular weight, logP, H-bond donors, H-bond
    acceptors, rotatable bonds and QED. Each cell is a dict with a ``value``
    and a ``level`` that is ``'info'`` when the value is strictly below its
    limit on ``LipinskiRuleOfFiveDecorator`` and ``'error'`` otherwise. An
    unparsable SMILES, or a property whose calculation raises, gives
    ``{'value': '-', 'level': 'info'}``.

    Returns the same ``df`` with the six columns assigned.
    """
    limits = LipinskiRuleOfFiveDecorator

    def _cell(compute, limit, digits=None):
        # Build one cell; any failure is logged and shown as a placeholder.
        try:
            value = compute()
            shown = value if digits is None else round(value, digits)
            return {
                'value': shown,
                'level': 'info' if value < limit() else 'error'
            }
        except Exception as ex:
            logger.exception(ex)
            return {'value': '-', 'level': 'info'}

    def _acceptor_limit():
        # Acceptors used to be checked against the donor limit. Prefer a
        # dedicated acceptor limit and keep the donor limit only as a
        # fallback for classes that do not define one.
        return getattr(limits, 'MAX_H_ACCEPTORS', limits.MAX_H_DONORS)

    mol_wt = []
    mol_logp = []
    hdonors = []
    hacceptors = []
    rotatable_bonds = []
    qeds = []
    columns = (mol_logp, mol_wt, hdonors, hacceptors, rotatable_bonds, qeds)

    for idx in range(df.shape[0]):
        smiles = df.iat[idx, smile_cols]
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            for column in columns:
                column.append({'value': '-', 'level': 'info'})
            continue

        mol_logp.append(_cell(lambda: Descriptors.MolLogP(m),
                              lambda: limits.MAX_LOGP, 2))
        mol_wt.append(_cell(lambda: Descriptors.MolWt(m),
                            lambda: limits.MAX_MOL_WT, 2))
        hdonors.append(_cell(lambda: Lipinski.NumHDonors(m),
                             lambda: limits.MAX_H_DONORS))
        hacceptors.append(_cell(lambda: Lipinski.NumHAcceptors(m),
                                _acceptor_limit))
        rotatable_bonds.append(_cell(lambda: Lipinski.NumRotatableBonds(m),
                                     lambda: limits.MAX_ROTATABLE_BONDS))
        qeds.append(_cell(lambda: QED.qed(m),
                          lambda: limits.MAX_QED, 4))

    df['Molecular Weight'] = mol_wt
    df['LogP'] = mol_logp
    df['H-Bond Donors'] = hdonors
    df['H-Bond Acceptors'] = hacceptors
    df['Rotatable Bonds'] = rotatable_bonds
    df['QED'] = qeds
    return df