def _calculateDescriptors(mol): df = pd.DataFrame(index=[0]) df["SlogP"] = rdMolDescriptors.CalcCrippenDescriptors(mol)[0] df["SMR"] = rdMolDescriptors.CalcCrippenDescriptors(mol)[1] df["LabuteASA"] = rdMolDescriptors.CalcLabuteASA(mol) df["TPSA"] = Descriptors.TPSA(mol) df["AMW"] = Descriptors.MolWt(mol) df["ExactMW"] = rdMolDescriptors.CalcExactMolWt(mol) df["NumLipinskiHBA"] = rdMolDescriptors.CalcNumLipinskiHBA(mol) df["NumLipinskiHBD"] = rdMolDescriptors.CalcNumLipinskiHBD(mol) df["NumRotatableBonds"] = rdMolDescriptors.CalcNumRotatableBonds(mol) df["NumHBD"] = rdMolDescriptors.CalcNumHBD(mol) df["NumHBA"] = rdMolDescriptors.CalcNumHBA(mol) df["NumAmideBonds"] = rdMolDescriptors.CalcNumAmideBonds(mol) df["NumHeteroAtoms"] = rdMolDescriptors.CalcNumHeteroatoms(mol) df["NumHeavyAtoms"] = Chem.rdchem.Mol.GetNumHeavyAtoms(mol) df["NumAtoms"] = Chem.rdchem.Mol.GetNumAtoms(mol) df["NumRings"] = rdMolDescriptors.CalcNumRings(mol) df["NumAromaticRings"] = rdMolDescriptors.CalcNumAromaticRings(mol) df["NumSaturatedRings"] = rdMolDescriptors.CalcNumSaturatedRings(mol) df["NumAliphaticRings"] = rdMolDescriptors.CalcNumAliphaticRings(mol) df["NumAromaticHeterocycles"] = \ rdMolDescriptors.CalcNumAromaticHeterocycles(mol) df["NumSaturatedHeterocycles"] = \ rdMolDescriptors.CalcNumSaturatedHeterocycles(mol) df["NumAliphaticHeterocycles"] = \ rdMolDescriptors.CalcNumAliphaticHeterocycles(mol) df["NumAromaticCarbocycles"] = \ rdMolDescriptors.CalcNumAromaticCarbocycles(mol) df["NumSaturatedCarbocycles"] = \ rdMolDescriptors.CalcNumSaturatedCarbocycles(mol) df["NumAliphaticCarbocycles"] = \ rdMolDescriptors.CalcNumAliphaticCarbocycles(mol) df["FractionCSP3"] = rdMolDescriptors.CalcFractionCSP3(mol) df["Chi0v"] = rdMolDescriptors.CalcChi0v(mol) df["Chi1v"] = rdMolDescriptors.CalcChi1v(mol) df["Chi2v"] = rdMolDescriptors.CalcChi2v(mol) df["Chi3v"] = rdMolDescriptors.CalcChi3v(mol) df["Chi4v"] = rdMolDescriptors.CalcChi4v(mol) df["Chi1n"] = rdMolDescriptors.CalcChi1n(mol) df["Chi2n"] = rdMolDescriptors.CalcChi2n(mol) df["Chi3n"] = rdMolDescriptors.CalcChi3n(mol) df["Chi4n"] = rdMolDescriptors.CalcChi4n(mol) df["HallKierAlpha"] = rdMolDescriptors.CalcHallKierAlpha(mol) df["kappa1"] = rdMolDescriptors.CalcKappa1(mol) df["kappa2"] = rdMolDescriptors.CalcKappa2(mol) df["kappa3"] = rdMolDescriptors.CalcKappa3(mol) slogp_VSA = list(map(lambda i: "slogp_VSA" + str(i), list(range(1, 13)))) df = df.assign(**dict(zip(slogp_VSA, rdMolDescriptors.SlogP_VSA_(mol)))) smr_VSA = list(map(lambda i: "smr_VSA" + str(i), list(range(1, 11)))) df = df.assign(**dict(zip(smr_VSA, rdMolDescriptors.SMR_VSA_(mol)))) peoe_VSA = list(map(lambda i: "peoe_VSA" + str(i), list(range(1, 15)))) df = df.assign(**dict(zip(peoe_VSA, rdMolDescriptors.PEOE_VSA_(mol)))) MQNs = list(map(lambda i: "MQN" + str(i), list(range(1, 43)))) df = df.assign(**dict(zip(MQNs, rdMolDescriptors.MQNs_(mol)))) return df
def main(in_file, output): Cmpds = {} InMols = rdkit_open([in_file]) print('\n # Number of input molecule: {0}'.format(len(InMols))) for mol in InMols: m = {} name = mol.GetProp('_Name').split()[0] m['Name'] = name m['Formula'] = rd.CalcMolFormula(mol) m['SMILES'] = Chem.MolToSmiles(mol) m['MW'] = rd._CalcMolWt(mol) # Molecular Weight m['logP'] = rd.CalcCrippenDescriptors(mol)[0] # Partition coefficient m['HDon'] = rd.CalcNumLipinskiHBD(mol) # Lipinski Hbond donor m['HAcc'] = rd.CalcNumLipinskiHBA(mol) # Lipinski Hbond acceptor m['TPSA'] = rd.CalcTPSA(mol) # Topological polar surface area m['Rotat'] = rd.CalcNumRotatableBonds(mol, strict=True) # Rotatable bond m['MolRef'] = rd.CalcCrippenDescriptors(mol)[1] # Molar refractivity m['AliRing'] = rd.CalcNumAliphaticRings(mol) # Aliphatic ring number m['AroRing'] = rd.CalcNumAromaticRings(mol) # Aromatic ring number # m['Stereo'] = rd.CalcNumAtomStereoCenters(mol) # Stereo center number # m['UnspStereo'] = rd.CalcNumUnspecifiedAtomStereoCenters(mol) # unspecified stereo m['SMILES'] = Chem.MolToSmiles(mol, isomericSmiles=True, allHsExplicit=False) Cmpds[name] = m #################################### df = pd.DataFrame.from_dict(Cmpds, orient='index') df.index.name = 'Name' # Columns of data to print out Columns = [ 'Formula', 'MW', 'logP', 'HDon', 'HAcc', 'TPSA', 'Rotat', 'MolRef', 'AliRing', 'AroRing', #'Stereo', 'UnspStereo', 'SMILES', ] reorder = df[Columns] # Output to CSV reorder.to_csv( output+'.csv', sep=',', na_rep='NA', encoding='utf-8', float_format='%.5f', header=True ) # Output to Excel reorder.to_excel( output+'.xlsx', header=True, na_rep='NA' )
def feature_fp(smiles): mol = Chem.MolFromSmiles(smiles) fp = rdMolDescriptors.MQNs_(mol) fp.append(rdMolDescriptors.CalcNumRotatableBonds(mol)) fp.append(rdMolDescriptors.CalcExactMolWt(mol)) fp.append(rdMolDescriptors.CalcNumRotatableBonds(mol)) fp.append(rdMolDescriptors.CalcFractionCSP3(mol)) fp.append(rdMolDescriptors.CalcNumAliphaticCarbocycles(mol)) fp.append(rdMolDescriptors.CalcNumAliphaticHeterocycles(mol)) fp.append(rdMolDescriptors.CalcNumAliphaticRings((mol))) fp.append(rdMolDescriptors.CalcNumAromaticCarbocycles(mol)) fp.append(rdMolDescriptors.CalcNumAromaticHeterocycles(mol)) fp.append(rdMolDescriptors.CalcNumAromaticRings(mol)) fp.append(rdMolDescriptors.CalcNumBridgeheadAtoms(mol)) fp.append(rdMolDescriptors.CalcNumRings(mol)) fp.append(rdMolDescriptors.CalcNumAmideBonds(mol)) fp.append(rdMolDescriptors.CalcNumHeterocycles(mol)) fp.append(rdMolDescriptors.CalcNumSpiroAtoms(mol)) fp.append(rdMolDescriptors.CalcTPSA(mol)) return np.array(fp)
def get_global_features(self, mol): u = [] # Now get some specific features fdefName = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef') factory = ChemicalFeatures.BuildFeatureFactory(fdefName) feats = factory.GetFeaturesForMol(mol) # First get some basic features natoms = mol.GetNumAtoms() nbonds = mol.GetNumBonds() mw = Descriptors.ExactMolWt(mol) HeavyAtomMolWt = Descriptors.HeavyAtomMolWt(mol) NumValenceElectrons = Descriptors.NumValenceElectrons(mol) ''' # These four descriptors are producing the value of infinity for refcode_csd = YOLJUF (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC\t\n) MaxAbsPartialCharge = Descriptors.MaxAbsPartialCharge(mol) MaxPartialCharge = Descriptors.MaxPartialCharge(mol) MinAbsPartialCharge = Descriptors.MinAbsPartialCharge(mol) MinPartialCharge = Descriptors.MinPartialCharge(mol) ''' # FpDensityMorgan1 = Descriptors.FpDensityMorgan1(mol) # FpDensityMorgan2 = Descriptors.FpDensityMorgan2(mol) # FpDensityMorgan3 = Descriptors.FpDensityMorgan3(mol) # Get some features using chemical feature factory nbrAcceptor = 0 nbrDonor = 0 nbrHydrophobe = 0 nbrLumpedHydrophobe = 0 nbrPosIonizable = 0 nbrNegIonizable = 0 for j in range(len(feats)): #print(feats[j].GetFamily(), feats[j].GetType()) if ('Acceptor' == (feats[j].GetFamily())): nbrAcceptor = nbrAcceptor + 1 elif ('Donor' == (feats[j].GetFamily())): nbrDonor = nbrDonor + 1 elif ('Hydrophobe' == (feats[j].GetFamily())): nbrHydrophobe = nbrHydrophobe + 1 elif ('LumpedHydrophobe' == (feats[j].GetFamily())): nbrLumpedHydrophobe = nbrLumpedHydrophobe + 1 elif ('PosIonizable' == (feats[j].GetFamily())): nbrPosIonizable = nbrPosIonizable + 1 elif ('NegIonizable' == (feats[j].GetFamily())): nbrNegIonizable = nbrNegIonizable + 1 else: pass #print(feats[j].GetFamily()) # Now get some features using rdMolDescriptors moreGlobalFeatures = [rdm.CalcNumRotatableBonds(mol), rdm.CalcChi0n(mol), rdm.CalcChi0v(mol), \ rdm.CalcChi1n(mol), rdm.CalcChi1v(mol), rdm.CalcChi2n(mol), rdm.CalcChi2v(mol), \ rdm.CalcChi3n(mol), rdm.CalcChi4n(mol), rdm.CalcChi4v(mol), \ rdm.CalcFractionCSP3(mol), rdm.CalcHallKierAlpha(mol), rdm.CalcKappa1(mol), \ rdm.CalcKappa2(mol), rdm.CalcLabuteASA(mol), \ rdm.CalcNumAliphaticCarbocycles(mol), rdm.CalcNumAliphaticHeterocycles(mol), \ rdm.CalcNumAliphaticRings(mol), rdm.CalcNumAmideBonds(mol), \ rdm.CalcNumAromaticCarbocycles(mol), rdm.CalcNumAromaticHeterocycles(mol), \ rdm.CalcNumAromaticRings(mol), rdm.CalcNumBridgeheadAtoms(mol), rdm.CalcNumHBA(mol), \ rdm.CalcNumHBD(mol), rdm.CalcNumHeteroatoms(mol), rdm.CalcNumHeterocycles(mol), \ rdm.CalcNumLipinskiHBA(mol), rdm.CalcNumLipinskiHBD(mol), rdm.CalcNumRings(mol), \ rdm.CalcNumSaturatedCarbocycles(mol), rdm.CalcNumSaturatedHeterocycles(mol), \ rdm.CalcNumSaturatedRings(mol), rdm.CalcNumSpiroAtoms(mol), rdm.CalcTPSA(mol)] u = [natoms, nbonds, mw, HeavyAtomMolWt, NumValenceElectrons, \ nbrAcceptor, nbrDonor, nbrHydrophobe, nbrLumpedHydrophobe, \ nbrPosIonizable, nbrNegIonizable] u = u + moreGlobalFeatures u = np.array(u).T # Some of the descriptors produice NAN. We can convert them to 0 # If you are getting outliers in the training or validation set this could be # Because some important features were set to zero here because it produced NAN # Removing those features from the feature set might remove the outliers #u[np.isnan(u)] = 0 #u = torch.tensor(u, dtype=torch.float) return (u)
#FRAGMENTS = { # "acyl_halide": Chem.MolFromSmarts('[#9,#17,#35,#53]=O'), # C(=O)X # "anhydride": Chem.MolFromSmarts('[#6]-[#6](=O)-[#8]-[#6](-[#6])=O'), # CC(=O)OC(=O)C # "peroxide": Chem.MolFromSmarts('[#8]-[#8]'), # R-O-O-R' # "ab_unsaturated_ketone": Chem.MolFromSmarts('[#6]=[#6]-[#6]=O'), # R=CC=O #} DESCRIPTORS = { # classical molecular descriptors "num_heavy_atoms": lambda x: x.GetNumAtoms(), "molecular_weight": lambda x: round(Desc.ExactMolWt(x), 4), "num_rings": lambda x: rdMolDesc.CalcNumRings(x), "num_rings_arom": lambda x: rdMolDesc.CalcNumAromaticRings(x), "num_rings_ali": lambda x: rdMolDesc.CalcNumAliphaticRings(x), "num_hbd": lambda x: rdMolDesc.CalcNumLipinskiHBD(x), "num_hba": lambda x: rdMolDesc.CalcNumLipinskiHBA(x), "slogp": lambda x: round(Crippen.MolLogP(x), 4), "tpsa": lambda x: round(rdMolDesc.CalcTPSA(x), 4), "num_rotatable_bond": lambda x: rdMolDesc.CalcNumRotatableBonds(x), "num_atoms_oxygen": lambda x: len( [a for a in x.GetAtoms() if a.GetAtomicNum() == 8] ), "num_atoms_nitrogen": lambda x: len( [a for a in x.GetAtoms() if a.GetAtomicNum() == 7] ), "num_atoms_halogen": Fragments.fr_halogen, "num_atoms_bridgehead": rdMolDesc.CalcNumBridgeheadAtoms, # custom molecular descriptors #"ring_size_min": get_min_ring_size,
def get_molecular_features(dataframe, mol_list): df = dataframe for i in range(len(mol_list)): print("Getting molecular features for molecule: ", i) mol = mol_list[i] natoms = mol.GetNumAtoms() nbonds = mol.GetNumBonds() mw = Descriptors.ExactMolWt(mol) df.at[i,"NbrAtoms"] = natoms df.at[i,"NbrBonds"] = nbonds df.at[i,"mw"] = mw df.at[i,'HeavyAtomMolWt'] = Chem.Descriptors.HeavyAtomMolWt(mol) df.at[i,'NumValenceElectrons'] = Chem.Descriptors.NumValenceElectrons(mol) ''' # These four descriptors are producing the value of infinity for refcode_csd = YOLJUF (CCOP(=O)(Cc1ccc(cc1)NC(=S)NP(OC(C)C)(OC(C)C)[S])OCC\t\n) df.at[i,'MaxAbsPartialCharge'] = Chem.Descriptors.MaxAbsPartialCharge(mol) df.at[i,'MaxPartialCharge'] = Chem.Descriptors.MaxPartialCharge(mol) df.at[i,'MinAbsPartialCharge'] = Chem.Descriptors.MinAbsPartialCharge(mol) df.at[i,'MinPartialCharge'] = Chem.Descriptors.MinPartialCharge(mol) ''' df.at[i,'FpDensityMorgan1'] = Chem.Descriptors.FpDensityMorgan1(mol) df.at[i,'FpDensityMorgan2'] = Chem.Descriptors.FpDensityMorgan2(mol) df.at[i,'FpDensityMorgan3'] = Chem.Descriptors.FpDensityMorgan3(mol) #print(natoms, nbonds) # Now get some specific features fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef') factory = ChemicalFeatures.BuildFeatureFactory(fdefName) feats = factory.GetFeaturesForMol(mol) #df["Acceptor"] = 0 #df["Aromatic"] = 0 #df["Hydrophobe"] = 0 nbrAcceptor = 0 nbrDonor = 0 nbrHydrophobe = 0 nbrLumpedHydrophobe = 0 nbrPosIonizable = 0 nbrNegIonizable = 0 for j in range(len(feats)): #print(feats[j].GetFamily(), feats[j].GetType()) if ('Acceptor' == (feats[j].GetFamily())): nbrAcceptor = nbrAcceptor + 1 elif ('Donor' == (feats[j].GetFamily())): nbrDonor = nbrDonor + 1 elif ('Hydrophobe' == (feats[j].GetFamily())): nbrHydrophobe = nbrHydrophobe + 1 elif ('LumpedHydrophobe' == (feats[j].GetFamily())): nbrLumpedHydrophobe = nbrLumpedHydrophobe + 1 elif ('PosIonizable' == (feats[j].GetFamily())): nbrPosIonizable = nbrPosIonizable + 1 elif ('NegIonizable' == (feats[j].GetFamily())): nbrNegIonizable = nbrNegIonizable + 1 else: pass#print(feats[j].GetFamily()) df.at[i,"Acceptor"] = nbrAcceptor df.at[i,"Donor"] = nbrDonor df.at[i,"Hydrophobe"] = nbrHydrophobe df.at[i,"LumpedHydrophobe"] = nbrLumpedHydrophobe df.at[i,"PosIonizable"] = nbrPosIonizable df.at[i,"NegIonizable"] = nbrNegIonizable # We can also get some more molecular features using rdMolDescriptors df.at[i,"NumRotatableBonds"] = rdMolDescriptors.CalcNumRotatableBonds(mol) df.at[i,"CalcChi0n"] = rdMolDescriptors.CalcChi0n(mol) df.at[i,"CalcChi0v"] = rdMolDescriptors.CalcChi0v(mol) df.at[i,"CalcChi1n"] = rdMolDescriptors.CalcChi1n(mol) df.at[i,"CalcChi1v"] = rdMolDescriptors.CalcChi1v(mol) df.at[i,"CalcChi2n"] = rdMolDescriptors.CalcChi2n(mol) df.at[i,"CalcChi2v"] = rdMolDescriptors.CalcChi2v(mol) df.at[i,"CalcChi3n"] = rdMolDescriptors.CalcChi3n(mol) df.at[i,"CalcChi3v"] = rdMolDescriptors.CalcChi3v(mol) df.at[i,"CalcChi4n"] = rdMolDescriptors.CalcChi4n(mol) df.at[i,"CalcChi4v"] = rdMolDescriptors.CalcChi4v(mol) df.at[i,"CalcFractionCSP3"] = rdMolDescriptors.CalcFractionCSP3(mol) df.at[i,"CalcHallKierAlpha"] = rdMolDescriptors.CalcHallKierAlpha(mol) df.at[i,"CalcKappa1"] = rdMolDescriptors.CalcKappa1(mol) df.at[i,"CalcKappa2"] = rdMolDescriptors.CalcKappa2(mol) #df.at[i,"CalcKappa3"] = rdMolDescriptors.CalcKappa3(mol) df.at[i,"CalcLabuteASA"] = rdMolDescriptors.CalcLabuteASA(mol) df.at[i,"CalcNumAliphaticCarbocycles"] = rdMolDescriptors.CalcNumAliphaticCarbocycles(mol) df.at[i,"CalcNumAliphaticHeterocycles"] = rdMolDescriptors.CalcNumAliphaticHeterocycles(mol) df.at[i,"CalcNumAliphaticRings"] = rdMolDescriptors.CalcNumAliphaticRings(mol) df.at[i,"CalcNumAmideBonds"] = rdMolDescriptors.CalcNumAmideBonds(mol) df.at[i,"CalcNumAromaticCarbocycles"] = rdMolDescriptors.CalcNumAromaticCarbocycles(mol) df.at[i,"CalcNumAromaticHeterocycles"] = rdMolDescriptors.CalcNumAromaticHeterocycles(mol) df.at[i,"CalcNumAromaticRings"] = rdMolDescriptors.CalcNumAromaticRings(mol) df.at[i,"CalcNumBridgeheadAtoms"] = rdMolDescriptors.CalcNumBridgeheadAtoms(mol) df.at[i,"CalcNumHBA"] = rdMolDescriptors.CalcNumHBA(mol) df.at[i,"CalcNumHBD"] = rdMolDescriptors.CalcNumHBD(mol) df.at[i,"CalcNumHeteroatoms"] = rdMolDescriptors.CalcNumHeteroatoms(mol) df.at[i,"CalcNumHeterocycles"] = rdMolDescriptors.CalcNumHeterocycles(mol) df.at[i,"CalcNumLipinskiHBA"] = rdMolDescriptors.CalcNumLipinskiHBA(mol) df.at[i,"CalcNumLipinskiHBD"] = rdMolDescriptors.CalcNumLipinskiHBD(mol) df.at[i,"CalcNumRings"] = rdMolDescriptors.CalcNumRings(mol) df.at[i,"CalcNumSaturatedCarbocycles"] = rdMolDescriptors.CalcNumSaturatedCarbocycles(mol) df.at[i,"CalcNumSaturatedHeterocycles"] = rdMolDescriptors.CalcNumSaturatedHeterocycles(mol) df.at[i,"CalcNumSaturatedRings"] = rdMolDescriptors.CalcNumSaturatedRings(mol) df.at[i,"CalcNumSpiroAtoms"] = rdMolDescriptors.CalcNumSpiroAtoms(mol) df.at[i,"CalcTPSA"] = rdMolDescriptors.CalcTPSA(mol) return(df)