def check_ligand(file_path):
    """Return True if any molecule in an SD file passes the drug-likeness screen.

    A molecule passes when it satisfies all four rule components
    (H-bond donors <= 5, H-bond acceptors <= 10, exact molecular weight
    <= 500, Crippen logP <= 5) and at least four of the five variant criteria
    (logP in [-0.4, 5.6], molar refractivity in [40, 130], exact molecular
    weight in [180, 500], heavy-atom count in [20, 70], TPSA <= 140).

    :param file_path: path handed to ``Chem.SDMolSupplier``.
    :return: ``True`` if at least one readable molecule passes; ``False``
        otherwise, including when ``file_path`` is not an existing file.
    """
    if not os.path.isfile(file_path):
        return False

    for mol in Chem.SDMolSupplier(file_path):
        if mol is None:
            # Entries the supplier hands back as None are skipped.
            continue

        # Both rule sets use these two values, so compute them once.
        log_p = Crippen.MolLogP(mol)
        exact_mass = Descriptors.ExactMolWt(mol)

        # components of rule: all four must hold
        components = (
            Lipinski.NumHDonors(mol) <= 5,
            Lipinski.NumHAcceptors(mol) <= 10,
            exact_mass <= 500,
            log_p <= 5,
        )
        if not all(components):
            continue

        # variants: at least four of the five must hold
        variants = (
            -0.4 <= log_p <= 5.6,
            40 <= Crippen.MolMR(mol) <= 130,
            180 <= exact_mass <= 500,
            20 <= Lipinski.HeavyAtomCount(mol) <= 70,
            MolSurf.TPSA(mol) <= 140,
        )
        if sum(variants) >= 4:
            # One passing molecule decides the result; no need to read further.
            return True

    return False
def testIssue80(self):
    """Check that logP is unchanged after a Lipinski descriptor call.

    Computes the Crippen logP of a molecule, runs ``Lipinski.NHOHCount`` on
    the same molecule, then recomputes logP and requires the two values to be
    equal.
    """
    from rdkit.Chem import Lipinski
    m = Chem.MolFromSmiles('CCOC')
    ref = Crippen.MolLogP(m)
    Lipinski.NHOHCount(m)
    probe = Crippen.MolLogP(m)
    # assertEqual replaces the failUnless alias, which no longer exists in
    # Python 3.12+, and reports both values on failure.
    self.assertEqual(probe, ref)
def canonicalize(smi_list, showprogress=False):
    """Deduplicate SMILES through canonicalisation, then filter the result.

    Every input string is parsed with ``MolFromSmiles``; strings that parse to
    ``None`` are dropped and the rest are rewritten with ``MolToSmiles``.
    Duplicates are removed through a ``set`` (so output order is not tied to
    input order). Of the unique strings, only those with more than 17 heavy
    atoms and a Crippen logP of at most 5 are returned.

    :param smi_list: iterable of SMILES strings.
    :param showprogress: when true, print status lines and wrap both passes
        in ``tqdm``.
    :return: list of canonical SMILES strings that passed the filter.
    """
    def _with_progress(items):
        # tqdm is only touched when a progress display was requested.
        return tqdm(items) if showprogress else items

    if showprogress:
        print('Canonicalising mols')
    canonical = []
    for raw in _with_progress(smi_list):
        parsed = MolFromSmiles(raw)
        if parsed is not None:
            canonical.append(MolToSmiles(parsed))
    unique = list(set(canonical))

    if showprogress:
        print('Size of unfiltered final library: {}'.format(len(unique)))
        print('Filtering by n_heavy and logP:')
    kept = []
    for candidate in _with_progress(unique):
        parsed = MolFromSmiles(candidate)
        if parsed.GetNumHeavyAtoms() <= 17:
            continue
        if Crippen.MolLogP(parsed) <= 5:
            kept.append(candidate)
    return kept
def properties_mw_logp(filepaths, start=2000, stop=2355):
    """Collect (MW, logP, label) rows from CSV files into a DataFrame.

    Each file is labelled with its position in ``filepaths``.

    * Files whose path does not contain ``"generated"`` are expected to carry
      the molecular weight in column 2 and logP in column 3; rows where those
      cells are missing or not numeric (for example a header) are skipped and
      a blank line is printed, as before.
    * Files whose path contains ``"generated"`` carry a SMILES string in
      column 0; MW and logP are computed with RDKit. Rows that fail are
      reported on stdout and skipped; blank rows are skipped silently.

    :param filepaths: sequence of CSV file paths.
    :param start: index of the first collected row to keep.
    :param stop: index one past the last collected row to keep. The defaults
        reproduce the window that used to be hard-coded.
    :return: ``pandas.DataFrame`` with columns ``MW``, ``logP``, ``Label``.
    """
    properties = []
    for label, fname in enumerate(filepaths):
        is_generated = "generated" in fname
        with open(fname, 'r') as f:
            for row in csv.reader(f):
                if is_generated:
                    if not row:
                        # A blank line has no SMILES cell; the old handler
                        # crashed on row[0] while reporting it.
                        continue
                    try:
                        mol = Chem.MolFromSmiles(row[0])
                        x, y = desc.MolWt(mol), Crippen.MolLogP(mol)
                        properties.append([x, y, label])
                    except Exception:
                        print("Non-Canonical SMILES: " + row[0])
                else:
                    try:
                        properties.append([float(row[2]), float(row[3]), label])
                    except (ValueError, IndexError):
                        # Non-numeric or short row: skip it, keeping the
                        # blank-line output of the original implementation.
                        print("")
    return pd.DataFrame(properties[start:stop], columns=['MW', 'logP', 'Label'])
def get_filter_values(mol):
    """
    Gather, for one molecule, every value the filters look at.

    The result is a dictionary keyed by short property names (``MW``,
    ``logP``, ``HBA``, ...). ``is_good`` starts as ``True`` and ``rejections``
    as an empty list.
    """
    assert isinstance(mol, Chem.Mol)

    values = {
        "MW": desc.CalcExactMolWt(mol),
        "logP": crip.MolLogP(mol),
        "HBA": lip.NumHAcceptors(mol),
        "HBD": lip.NumHDonors(mol),
        "tPSA": desc.CalcTPSA(mol),
        "rot_bonds": lip.NumRotatableBonds(mol),
    }
    # Every bond that is not rotatable is counted as rigid (mutual exclusion assumed).
    values["rigid_bonds"] = mol.GetNumBonds() - values["rot_bonds"]
    values["num_rings"] = lip.RingCount(mol)
    values["num_hetero_atoms"] = lip.NumHeteroatoms(mol)
    # The formal charge reported by rdmolops is taken at face value.
    values["charge"] = rdmolops.GetFormalCharge(mol)

    carbon_count, charge_count, largest_ring = get_atom_props(mol)
    values["num_carbons"] = carbon_count
    values["num_charges"] = charge_count
    values["max_ring_size"] = largest_ring

    try:
        values["hc_ratio"] = float(values["num_hetero_atoms"]) / float(values["num_carbons"])
    except ZeroDivisionError:
        # No carbons at all: use a very large sentinel ratio.
        values["hc_ratio"] = 100000000

    # Number of BRICS bonds, used as a measure related to complexity.
    values["fc"] = len(list(Brics.FindBRICSBonds(mol)))
    # Defaults to true; nothing has been checked yet.
    values["is_good"] = True
    # Distinct element symbols present in the molecule.
    values["atoms"] = list({atom.GetSymbol() for atom in mol.GetAtoms()})
    values["num_chiral_centers"] = len(Chem.FindMolChiralCenters(mol, includeUnassigned=True))
    # Reasons for rejection are appended here later.
    values["rejections"] = []
    return values
def LogP(smile):
    """Return the Crippen logP for a SMILES string.

    :param smile: SMILES code; it is converted with ``str()`` before parsing.
    :return: the value of ``Crippen.MolLogP`` for the parsed molecule, or the
        string ``'NaN'`` if parsing or the calculation raises.
    """
    smile = str(smile)
    try:
        m = Chem.MolFromSmiles(smile)
        return Crippen.MolLogP(m)
    except Exception:
        # Best-effort lookup: report failure with the 'NaN' marker, but do not
        # swallow KeyboardInterrupt/SystemExit as a bare ``except:`` would.
        return 'NaN'
def ProcessMol(mol, typeConversions, globalProps, nDone, nameProp='_Name', nameCol='compound_id',
               redraw=False, keepHs=False, skipProps=False, addComputedProps=False,
               skipSmiles=False, uniqNames=None, namesSeen=None):
    """Build one database row for a molecule.

    The row is ``[name, smiles, binary, props]``; the SMILES entry is left out
    when ``skipSmiles`` is true.

    :param mol: the molecule; a falsy value raises ``ValueError``.
    :param typeConversions: indexable by type code; element ``[1]`` of each
        entry is called on the stripped property string to convert it.
    :param globalProps: dict of property name -> type code. Read for the
        starting code (default 2) and updated in place with the code that
        converted successfully.
    :param nDone: used to build the fallback name ``'Mol_<nDone>'``.
    :param nameProp: molecule property that holds the name.
    :param nameCol: property name (compared case-insensitively) that is left
        out of the property dictionary.
    :param redraw: when true, ``AllChem.Compute2DCoords`` is run on ``mol``.
    :param keepHs: when true, ``Chem.SanitizeMol`` is run on ``mol``.
    :param skipProps: when true, the property dictionary is left empty.
    :param addComputedProps: when true, ``DonorCount``, ``AcceptorCount``,
        ``RotatableBondCount``, ``AMW`` and ``MolLogP`` are stored on ``mol``
        as properties before the properties are collected.
    :param skipSmiles: when true, no SMILES entry is added to the row.
    :param uniqNames: when truthy, a name already present in ``namesSeen``
        is logged as a duplicate and ``None`` is returned.
    :param namesSeen: set of names seen so far, updated in place. May be
        ``None``, in which case names are not tracked.
    :return: the row as a list, or ``None`` for a skipped duplicate.
    :raises ValueError: if ``mol`` is falsy.
    """
    if not mol:
        raise ValueError('no molecule')
    if keepHs:
        Chem.SanitizeMol(mol)
    try:
        nm = mol.GetProp(nameProp)
    except KeyError:
        nm = None
    if not nm:
        nm = 'Mol_%d' % nDone

    # namesSeen defaults to None; guard so the default call no longer fails
    # on ``None.add`` / ``in None``.
    if namesSeen is not None:
        if uniqNames and nm in namesSeen:
            logger.error('duplicate compound id (%s) encountered. second instance skipped.', nm)
            return None
        namesSeen.add(nm)

    row = [nm]
    pD = {}
    if not skipProps:
        if addComputedProps:
            mol.SetProp('DonorCount', str(Lipinski.NumHDonors(mol)))
            mol.SetProp('AcceptorCount', str(Lipinski.NumHAcceptors(mol)))
            mol.SetProp('RotatableBondCount', str(Lipinski.NumRotatableBonds(mol)))
            mol.SetProp('AMW', str(Descriptors.MolWt(mol)))
            mol.SetProp('MolLogP', str(Crippen.MolLogP(mol)))

        for pn in list(mol.GetPropNames()):
            if pn.lower() == nameCol.lower():
                continue
            pv = mol.GetProp(pn).strip()
            if '>' in pv or '<' in pv:
                # Values containing '<' or '>' are stored as raw text.
                pD[pn] = pv
                continue
            # Start from the type code recorded for this property and step
            # down until a conversion succeeds; code 0 is used without a trial.
            colTyp = globalProps.get(pn, 2)
            while colTyp > 0:
                try:
                    typeConversions[colTyp][1](pv)
                except Exception:
                    colTyp -= 1
                else:
                    break
            globalProps[pn] = colTyp
            pD[pn] = typeConversions[colTyp][1](pv)

    if redraw:
        # The original passed an undefined name ``m`` here.
        AllChem.Compute2DCoords(mol)
    if not skipSmiles:
        row.append(Chem.MolToSmiles(mol, True))
    row.append(DbModule.binaryHolder(mol.ToBinary()))
    row.append(pD)
    return row
def logP(smile, train_smiles=None):
    """Score a SMILES string by its Crippen logP, rescaled and clipped to [0, 1].

    The raw logP is passed through ``remap`` (defined elsewhere) with fixed
    lower and upper bounds, then clipped.

    :param smile: SMILES string of the molecule to score.
    :param train_smiles: not used by this function.
    :return: the clipped score.
    """
    lower_bound, upper_bound = -2.12178879609, 6.0429063424
    molecule = Chem.MolFromSmiles(smile)
    rescaled = remap(Crippen.MolLogP(molecule), lower_bound, upper_bound)
    return np.clip(rescaled, 0.0, 1.0)
def _testLogPLong2(self):
    """ run the descriptor check for Crippen logP (includeHs=1) on column 33 of PP_descrs_regress.2.csv """
    def logp_with_hs(mol):
        return Crippen.MolLogP(mol, includeHs=1)

    self.__testDesc('PP_descrs_regress.2.csv', 33, logp_with_hs)
def calc_lipinski(self, mol):
    """
    Evaluate Lipinski's rules for a molecule.

    Returns:
        a tuple ``(passed, values)`` where
          - ``passed`` tells whether every rule below holds
          - ``values`` is a dictionary with the four measured quantities

    The rules checked are:
      - Hydrogen bond donors <= 5
      - Hydrogen bond acceptors <= 10
      - Molecular weight < 500 daltons
      - logP < 5 (logP is rounded to 4 decimals before the comparison)
    """
    donors = Lipi.NumHDonors(mol)
    acceptors = Lipi.NumHAcceptors(mol)
    weight = Descriptors.MolWt(mol)
    logp = round(Crippen.MolLogP(mol), 4)

    passed = donors <= 5 and acceptors <= 10 and weight < 500 and logp < 5
    measured = {
        'hydrogen_bond_donors': donors,
        'hydrogen_bond_acceptors': acceptors,
        'molecular_weight': weight,
        'logp': logp,
    }
    return (passed, measured)
def generate(smiles):
    """Compute four descriptors for each SMILES string.

    For every entry the Crippen logP, molecular weight, number of rotatable
    bonds and aromatic proportion (via ``getAromaticProportion``) are
    calculated.

    :param smiles: iterable of SMILES strings.
    :return: ``pandas.DataFrame`` of floats with one row per input and the
        columns ``MolLogP``, ``MolWt``, ``NumRotatableBonds``,
        ``AromaticProportion``. An empty input gives an empty frame with
        those columns.
    """
    columnNames = ["MolLogP", "MolWt", "NumRotatableBonds", "AromaticProportion"]

    rows = []
    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        rows.append([
            Crippen.MolLogP(mol),
            Descriptors.MolWt(mol),
            Lipinski.NumRotatableBonds(mol),
            getAromaticProportion(mol),
        ])

    # Always hand pandas a 2-D (n, 4) array. The previous vstack approach left
    # a 1-D array for a single molecule (and for no molecules), which the
    # DataFrame constructor rejects against four column names.
    data = np.array(rows, dtype=float).reshape(-1, len(columnNames))
    return pd.DataFrame(data=data, columns=columnNames)
def properties(mol):
    """
    Compute the molecular properties that feed the QED descriptor.

    Hydrogens are removed from the molecule first. Raises ``ValueError`` when
    ``mol`` is ``None``. Returns a ``QEDproperties`` instance.
    """
    if mol is None:
        raise ValueError('You need to provide a mol argument.')
    mol = Chem.RemoveHs(mol)

    molecular_weight = rdmd._CalcMolWt(mol)
    alogp = Crippen.MolLogP(mol)

    # Total number of matches over every acceptor pattern present in the molecule.
    acceptor_count = 0
    for pattern in Acceptors:
        if mol.HasSubstructMatch(pattern):
            acceptor_count += len(mol.GetSubstructMatches(pattern))

    donor_count = rdmd.CalcNumHBD(mol)
    polar_surface = MolSurf.TPSA(mol)
    rotatable = rdmd.CalcNumRotatableBonds(mol, rdmd.NumRotatableBondsOptions.Strict)
    # SSSR of what remains once the aliphatic rings have been deleted from a copy.
    aromatic = Chem.GetSSSR(Chem.DeleteSubstructs(Chem.Mol(mol), AliphaticRings))
    alert_count = len([alert for alert in StructuralAlerts if mol.HasSubstructMatch(alert)])

    qedProperties = QEDproperties(
        MW=molecular_weight,
        ALOGP=alogp,
        HBA=acceptor_count,
        HBD=donor_count,
        PSA=polar_surface,
        ROTB=rotatable,
        AROM=aromatic,
        ALERTS=alert_count,
    )
    # The replacement
    #   AROM=Lipinski.NumAromaticRings(mol),
    # is not identical. The expression above tends to count more rings
    #   N1C2=CC=CC=C2SC3=C1C=CC4=C3C=CC=C4
    #   OC1=C(O)C=C2C(=C1)OC3=CC(=O)C(=CC3=C2C4=CC=CC=C4)O
    #   CC(C)C1=CC2=C(C)C=CC2=C(C)C=C1 uses 2, should be 0 ?
    return qedProperties
def water_octanol_partition_coefficient_scores(mols, norm=False):
    """Return Crippen logP scores for a sequence of molecules as a NumPy array.

    Each non-``None`` molecule is scored through
    ``MolecularMetrics._avoid_sanitization_error``; a ``None`` molecule or a
    ``None`` score is replaced by -3.

    :param mols: sequence of molecules; entries may be ``None``.
    :param norm: when true, the scores are passed through
        ``MolecularMetrics.remap`` with fixed bounds and clipped to [0, 1].
    :return: ``numpy.ndarray`` of scores.
    """
    raw = []
    for mol in mols:
        if mol is None:
            raw.append(None)
        else:
            raw.append(MolecularMetrics._avoid_sanitization_error(lambda: Crippen.MolLogP(mol)))

    scores = np.array([-3 if value is None else value for value in raw])
    if not norm:
        return scores
    return np.clip(MolecularMetrics.remap(scores, -2.12178879609, 6.0429063424), 0.0, 1.0)
def get_properties(mols):
    """Return ``(molecular weight, Crippen logP)`` tuples, one per molecule.

    The molecules are iterated through ``tqdm``.
    """
    return [(Descriptors.MolWt(mol), Crippen.MolLogP(mol)) for mol in tqdm(mols)]
def testRepeat(self):
    """Check that a repeated logP / MR calculation still matches the reference.

    For every SMILES loaded by ``_readData`` each descriptor is computed twice
    on the same molecule and the second result is compared with the stored
    reference value using ``feq``.
    """
    self._readData()
    for i, smi in enumerate(self.smis):
        mol = Chem.MolFromSmiles(smi)

        clog = self.clogs[i]
        # First call is the "repeat" under test; its value is not checked.
        Crippen.MolLogP(mol)
        tmp = Crippen.MolLogP(mol)
        self.assertTrue(feq(clog, tmp), 'bad logp for %s: %4.4f != %4.4f' % (smi, clog, tmp))

        mr = self.mrs[i]
        Crippen.MolMR(mol)
        tmp = Crippen.MolMR(mol)
        self.assertTrue(feq(mr, tmp), 'bad MR for %s: %4.4f != %4.4f' % (smi, mr, tmp))
def logP(smile, train_smiles):
    """Score a SMILES string by its Crippen logP, rescaled and clipped to [0, 1].

    :param smile: SMILES string of the molecule to score.
    :param train_smiles: not used by this function.
    :return: the remapped logP, clipped to the range [0, 1].
    """
    low_logp = -2.10799552492
    high_logp = 2.71567964162
    logp = Crippen.MolLogP(Chem.MolFromSmiles(smile))
    val = remap(logp, low_logp, high_logp)
    # Clip the remapped value; the original clipped the raw logP and so threw
    # the remapping away.
    val = np.clip(val, 0.0, 1.0)
    return val
def logP(mol, train_smiles=None):
    """Return the Crippen logP of a molecule, optionally normalised.

    When the module-level ``NORMALIZE`` flag is truthy the value is passed
    through ``remap`` with fixed bounds and clipped to [0, 1]; otherwise the
    raw logP is returned.

    :param mol: molecule to score.
    :param train_smiles: not used by this function.
    """
    raw = Crippen.MolLogP(mol)
    if not NORMALIZE:
        return raw
    rescaled = remap(raw, -2.12178879609, 6.0429063424)
    return np.clip(rescaled, 0.0, 1.0)
def _rdkit_eval(entry: dict) -> dict:
    """Add RDKit-derived properties to ``entry`` and return it.

    The molecule is parsed from ``entry['smiles']``; the keys ``logP``,
    ``QED`` and ``SA_score`` are then written into the same dictionary, which
    is modified in place.
    """
    molecule = Chem.MolFromSmiles(entry['smiles'])

    partition_coefficient = Crippen.MolLogP(molecule)
    entry['logP'] = partition_coefficient

    drug_likeness = QED.qed(molecule)
    entry['QED'] = drug_likeness

    accessibility = calculateScore(molecule)
    entry['SA_score'] = accessibility

    return entry
def testLipinskiLong(self):
    """ Lipinski parameter regression checks (skipped unless doLong is set) """
    if not doLong:
        raise unittest.SkipTest('long test')

    def logp_with_hs(mol):
        return Crippen.MolLogP(mol, includeHs=1)

    # (file name, column, descriptor function), run in this order
    cases = (
        ('PP_descrs_regress.csv', 30, Lipinski.NumHDonors),
        ('PP_descrs_regress.csv', 31, Lipinski.NumHeteroatoms),
        ('PP_descrs_regress.csv', 32, Lipinski.NumRotatableBonds),
        ('PP_descrs_regress.csv', 33, logp_with_hs),
        ('Block_regress.Lip.csv', 1, Lipinski.NumHAcceptors),
        ('Block_regress.Lip.csv', 2, Lipinski.NumHDonors),
        ('Block_regress.Lip.csv', 3, Lipinski.NumHeteroatoms),
        ('Block_regress.Lip.csv', 4, Lipinski.NumRotatableBonds),
        ('PP_descrs_regress.2.csv', 33, logp_with_hs),
    )
    for fName, col, func in cases:
        self.__testDesc(fName, col, func)
def mole_proper(mol):
    """Return six descriptors of a molecule as a tuple.

    The order is: H-bond donors, H-bond acceptors, rotatable bonds,
    molecular weight, Crippen logP, TPSA.
    """
    return (
        Lipinski.NumHDonors(mol),
        Lipinski.NumHAcceptors(mol),
        Lipinski.NumRotatableBonds(mol),
        Descriptors.MolWt(mol),
        Crippen.MolLogP(mol),
        Descriptors.TPSA(mol),
    )
def log_partition_coefficient(smiles):
    '''
    Returns the octanol-water partition coefficient given a molecule SMILES
    string.

    Raises SmilesError when the SMILES cannot be turned into a molecule,
    whether parsing raises or yields None.
    '''
    message = '%s returns a None molecule' % smiles
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        raise SmilesError(message)
    if mol is None:
        # The original only raised SmilesError when parsing threw; a None
        # result fell through to Crippen.MolLogP and failed there instead.
        raise SmilesError(message)
    return Crippen.MolLogP(mol)
def get_crippen(x):
    """
    Get the logP value for molecule X

    :param x: Molecule
    :return: float: logP, or ``np.nan`` if the calculation raises
    """
    try:
        return Crippen.MolLogP(x)
    except Exception:
        # Best-effort: any calculation failure maps to NaN. Unlike a bare
        # ``except:`` this lets KeyboardInterrupt/SystemExit propagate.
        return np.nan
def evaluate_chem_mol(mol):
    """Evaluate a molecule against three property windows.

    :param mol: molecule to evaluate.
    :return: a list of four booleans
        ``[evaluated, 320 < MW < 420, 2 < logP < 3, 40 < TPSA < 60]``.
        If any step raises, all four entries are ``False``.
    """
    try:
        # Result is discarded; presumably called to set up ring information
        # before the descriptors run - TODO confirm.
        Chem.GetSSSR(mol)
        clogp = Crippen.MolLogP(mol)
        mw = MolDescriptors.CalcExactMolWt(mol)
        tpsa = Descriptors.TPSA(mol)
        ret_val = [True, 320 < mw < 420, 2 < clogp < 3, 40 < tpsa < 60]
    except Exception:
        # Narrowed from a bare ``except:`` so interpreter-exit signals are
        # not swallowed.
        ret_val = [False] * 4
    return ret_val
def calc_esol_descriptors(self, mol):
    """
    Calculate mw, logp, rotors and aromatic proportion (ap)

    :param mol: input molecule
    :return: ``self.Descriptor`` built from the four descriptor values
    """
    fields = {
        'mw': Descriptors.MolWt(mol),
        'logp': Crippen.MolLogP(mol),
        'rotors': Lipinski.NumRotatableBonds(mol),
        'ap': self.calc_ap(mol),
    }
    return self.Descriptor(**fields)
def testLogP(self):
    """Compare computed Crippen logP and MR with the stored reference values.

    For every SMILES loaded by ``_readData`` the computed logP and molar
    refractivity must match ``self.clogs`` / ``self.mrs`` according to
    ``feq``.
    """
    self._readData()
    # The disabled branch that printed 'smi,clog,mr' lines to an output file
    # was removed: it was unreachable and referred to an undefined file handle.
    for i, smi in enumerate(self.smis):
        mol = Chem.MolFromSmiles(smi)

        clog = self.clogs[i]
        tmp = Crippen.MolLogP(mol)
        self.assertTrue(feq(clog, tmp), 'bad logp for %s: %4.4f != %4.4f' % (smi, clog, tmp))

        mr = self.mrs[i]
        tmp = Crippen.MolMR(mol)
        self.assertTrue(feq(mr, tmp), 'bad MR for %s: %4.4f != %4.4f' % (smi, mr, tmp))
def canonicalize_and_filter(smi_list, showprogress=False):
    """
    Return the unique canonical SMILES from a list of input SMILES strings,
    filtered by size and logP.

    Each input is parsed with ``MolFromSmiles`` (inputs that parse to ``None``
    are dropped), rewritten as canonical SMILES and deduplicated through a
    ``set``. Only molecules with more than 17 heavy atoms and a Crippen logP
    of at most 5 are kept.

    :param smi_list: iterable of SMILES strings.
    :param showprogress: when true, print status lines and wrap both passes
        in ``tqdm``.
    :return: list of canonical SMILES strings that passed the filter.
    """
    def _with_progress(items):
        # tqdm is only touched when a progress display was requested.
        return tqdm(items) if showprogress else items

    if showprogress:
        print('Canonicalising mols')
    canonical = []
    for smi in _with_progress(smi_list):
        mol = MolFromSmiles(smi)
        if mol is not None:
            # Always store the canonical SMILES. The no-progress path used to
            # store the Mol object itself, which broke deduplication and the
            # MolFromSmiles call in the filter pass below.
            canonical.append(MolToSmiles(mol))
    unique = list(set(canonical))

    if showprogress:
        print('Size of unfiltered final library: {}'.format(len(unique)))
        print('Filtering by n_heavy and logP:')
    final_list = []
    for smi in _with_progress(unique):
        mol = MolFromSmiles(smi)
        if mol.GetNumHeavyAtoms() > 17 and Crippen.MolLogP(mol) <= 5:
            final_list.append(smi)
    return final_list
def descriptors(self, mol):
    """Return a list of eleven descriptor values for a molecule.

    Order: aromatic fraction (``self.arofrac``), exact molecular weight,
    valence electrons, H-bond acceptors, H-bond donors, N/O count, NH/OH
    count, rotatable bonds, fraction of sp3 carbons, Crippen logP, and the
    number of matches of the SMARTS ``[^1]``.
    """
    features = [
        self.arofrac(mol),
        Descriptors.ExactMolWt(mol, False),
        Descriptors.NumValenceElectrons(mol),
        Lipinski.NumHAcceptors(mol),
        Lipinski.NumHDonors(mol),
        Lipinski.NOCount(mol),
        Lipinski.NHOHCount(mol),
        Lipinski.NumRotatableBonds(mol),
        Lipinski.FractionCSP3(mol),
        Crippen.MolLogP(mol),
        # '[^1]' presumably selects sp-hybridised atoms - confirm against RDKit SMARTS docs
        len(mol.GetSubstructMatches(Chem.MolFromSmarts('[^1]'))),
    ]
    return features
def LogP(smile):
    '''
    Given the SMILES, compute the partition coefficient of the chemical.

    Inputs:
        - smile (string): original SMILES code (converted with ``str()``)
    Outputs:
        - LogP (float): partition coefficient of the chemical, or the string
          'NaN' (not a float NaN) if parsing or the calculation raises
    '''
    smile = str(smile)
    try:
        m = Chem.MolFromSmiles(smile)
        return Crippen.MolLogP(m)
    except Exception:
        # Best-effort lookup: report failure with the 'NaN' marker, but do not
        # swallow KeyboardInterrupt/SystemExit as a bare ``except:`` would.
        return 'NaN'
def apply_lead_like_filters(data_dict):
    """Apply lead-like filtering.

    A structure is kept only when both limits hold: Crippen logP < 4.5 and
    exact molecular weight < 450 g/mol. Entries whose SMILES cannot be
    parsed are dropped.

    :param data_dict: {'CHEMBL12345' : 'c1ccccc1OC'}
    :return: filtered smiles dict
    """
    new_dict = {}
    for k, v in data_dict.items():
        rdkit_mol = Chem.MolFromSmiles(v)
        if not rdkit_mol:
            continue
        # Both limits must hold. The original used ``or``, which only
        # excluded structures violating both limits at once.
        if Crippen.MolLogP(rdkit_mol) < 4.5 and Descriptors.ExactMolWt(rdkit_mol) < 450:
            new_dict[k] = v
    return new_dict
def getDiscriptor(self):
    """Compute RDKit descriptors for every structure in the input CSV.

    Changes the working directory to a fixed data folder, reads
    ``extChronicStrcture.csv``, drops rows where ``CAS`` or
    ``canonical_smiles`` is missing, computes the descriptors listed in
    ``columns`` for each SMILES and writes them to ``Descriptors.csv``
    (without the index). Nothing is returned.
    """
    from rdkit.Chem import Crippen
    from rdkit import Chem
    import pandas as pd
    from rdkit.Chem import Descriptors, Lipinski
    import os
    os.chdir(r"G:\マイドライブ\Data\Meram Chronic Data")
    df = pd.read_csv('extChronicStrcture.csv', engine='python')
    df = df[['CAS', 'canonical_smiles']]
    df = df.dropna(how='any')
    columns = [
        'CAS', 'weight', 'logP', 'RotatableBonds', 'HeavyAtomCounts',
        'AromProp', 'TPSA', 'HDonor', 'HAcceptors', 'FractionCSP3',
        'AromaticCarbocycles', 'AromaticHeterocycles'
    ]

    # Compile the aromatic-atom query once instead of once per molecule.
    aromatic_atom = Chem.MolFromSmarts('[a]')

    # Rows are collected in a list and turned into a DataFrame once; growing
    # a DataFrame with pd.concat inside the loop copies it on every pass.
    rows = []
    for cas, smiles in zip(df['CAS'], df['canonical_smiles']):
        mol = Chem.MolFromSmiles(smiles)
        wt = Descriptors.MolWt(mol)
        rot = Lipinski.NumRotatableBonds(mol)
        heavy = Lipinski.HeavyAtomCount(mol)
        logp = Crippen.MolLogP(mol)
        aromaticHeavyatoms = len(mol.GetSubstructMatches(aromatic_atom))
        numAtoms = mol.GetNumAtoms()
        aromprop = float(aromaticHeavyatoms / numAtoms)
        TPSA = Descriptors.TPSA(mol)
        HDonors = Descriptors.NumHDonors(mol)
        HAcceptors = Descriptors.NumHAcceptors(mol)
        FractionCSP3 = Descriptors.FractionCSP3(mol)
        AromaticCarbocycles = Descriptors.NumAromaticCarbocycles(mol)
        AromaticHeterocycles = Descriptors.NumAromaticHeterocycles(mol)
        # Progress output kept from the original implementation.
        print(HDonors, HAcceptors)
        rows.append([
            cas, wt, logp, rot, heavy, aromprop, TPSA, HDonors, HAcceptors,
            FractionCSP3, AromaticCarbocycles, AromaticHeterocycles
        ])

    resultDf = pd.DataFrame(rows, columns=columns)
    resultDf.to_csv('Descriptors.csv', index=False)