def check_ligand(file_path):
    """Report whether an SD file holds at least one molecule passing the filter.

    A molecule passes when it meets all four rule components
    (H-bond donors <= 5, H-bond acceptors <= 10, exact molecular
    weight <= 500, Crippen logP <= 5) and at least four of the five variant
    criteria (logP in [-0.4, 5.6], molar refractivity in [40, 130], exact
    molecular weight in [180, 500], heavy-atom count in [20, 70],
    TPSA <= 140).

    Args:
        file_path: Path handed to ``Chem.SDMolSupplier``.

    Returns:
        True if any readable record in the file passes, otherwise False.
        False is also returned when ``file_path`` is not an existing file.
    """
    if not os.path.isfile(file_path):
        return False

    for mol in Chem.SDMolSupplier(file_path):
        # The supplier yields None for records it could not parse.
        if mol is None:
            continue

        # Both values feed a rule component and a variant; compute them once.
        log_p = Crippen.MolLogP(mol)
        exact_mass = Descriptors.ExactMolWt(mol)

        meets_all_components = (
            Lipinski.NumHDonors(mol) <= 5
            and Lipinski.NumHAcceptors(mol) <= 10
            and exact_mass <= 500
            and log_p <= 5
        )
        if not meets_all_components:
            continue

        # Booleans sum as 0/1, giving the number of satisfied variants.
        variants_met = sum((
            -0.4 <= log_p <= 5.6,
            40 <= Crippen.MolMR(mol) <= 130,
            180 <= exact_mass <= 500,
            20 <= Lipinski.HeavyAtomCount(mol) <= 70,
            MolSurf.TPSA(mol) <= 140,
        ))
        if variants_met >= 4:
            # One passing molecule decides the answer; no need to read on.
            return True

    return False
def getAromaticProportion(m):
    """Return the number of aromatic atoms divided by the heavy-atom count.

    Args:
        m: Molecule object providing ``GetAtoms()``; each atom must provide
            ``GetIsAromatic()``. It is also passed to
            ``Lipinski.HeavyAtomCount``.

    Returns:
        ``aromatic_atoms / heavy_atoms`` as a float, or ``0.0`` when the
        molecule has no heavy atoms (this case used to raise
        ``ZeroDivisionError``).
    """
    heavy_atom_count = Lipinski.HeavyAtomCount(m)
    if not heavy_atom_count:
        # Nothing to divide by: define the proportion as zero.
        return 0.0

    aromatic_count = sum(1 for atom in m.GetAtoms() if atom.GetIsAromatic())
    return aromatic_count / heavy_atom_count
def desalt_compound(smiles):
    """Desalt a SMILES string by keeping only its largest fragment.

    The string is split on ``.``; every fragment is parsed and sized by its
    heavy-atom count, and the fragment with the highest count is returned.
    When several fragments share the highest count, the one appearing first
    in the input wins.

    Takes a SMILES string.
    Returns the SMILES string of the selected fragment.
    """
    fragments = smiles.split(".")
    # Size every fragment first, then pick the first one with the top count.
    heavy_counts = [
        Lipinski.HeavyAtomCount(Chem.MolFromSmiles(fragment))
        for fragment in fragments
    ]
    largest_fragment, _ = max(zip(fragments, heavy_counts),
                              key=lambda pair: pair[1])
    return largest_fragment
def getDiscriptor(self):
    """Compute RDKit descriptors for each structure and write them to CSV.

    Changes the working directory to a hard-coded data folder, reads the
    ``CAS`` and ``canonical_smiles`` columns of ``extChronicStrcture.csv``
    (rows with a missing value in either column are dropped), computes one
    descriptor row per structure and writes all rows to ``Descriptors.csv``
    in that folder.

    Rows whose SMILES cannot be parsed are reported on stdout and skipped
    instead of aborting the whole run.

    Args:
        self: Accepted but not used by this function.

    Returns:
        None. The result is the ``Descriptors.csv`` file.
    """
    from rdkit.Chem import Crippen
    from rdkit import Chem
    import pandas as pd
    from rdkit.Chem import Descriptors, Lipinski
    import os

    # NOTE: machine-specific location; the relative file names below resolve
    # against it, and the process stays in this directory afterwards.
    os.chdir(r"G:\マイドライブ\Data\Meram Chronic Data")
    df = pd.read_csv('extChronicStrcture.csv', engine='python')
    df = df[['CAS', 'canonical_smiles']].dropna(how='any')

    columns = [
        'CAS', 'weight', 'logP', 'RotatableBonds', 'HeavyAtomCounts',
        'AromProp', 'TPSA', 'HDonor', 'HAcceptors', 'FractionCSP3',
        'AromaticCarbocycles', 'AromaticHeterocycles'
    ]

    # Compile the aromatic-atom pattern once instead of once per structure.
    aromatic_atom_pattern = Chem.MolFromSmarts('[a]')

    # Collect plain rows and build the frame once: concatenating inside the
    # loop copies the whole accumulated frame on every iteration.
    rows = []
    for cas, smiles in zip(df['CAS'], df['canonical_smiles']):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            print('Skipping %s: could not parse SMILES %r' % (cas, smiles))
            continue

        wt = Descriptors.MolWt(mol)
        rot = Lipinski.NumRotatableBonds(mol)
        heavy = Lipinski.HeavyAtomCount(mol)
        logp = Crippen.MolLogP(mol)
        aromaticHeavyatoms = len(mol.GetSubstructMatches(aromatic_atom_pattern))
        numAtoms = mol.GetNumAtoms()
        # Guard the ratio against a molecule without atoms.
        aromprop = aromaticHeavyatoms / numAtoms if numAtoms else 0.0
        TPSA = Descriptors.TPSA(mol)
        HDonors = Descriptors.NumHDonors(mol)
        HAcceptors = Descriptors.NumHAcceptors(mol)
        FractionCSP3 = Descriptors.FractionCSP3(mol)
        AromaticCarbocycles = Descriptors.NumAromaticCarbocycles(mol)
        AromaticHeterocycles = Descriptors.NumAromaticHeterocycles(mol)
        print(HDonors, HAcceptors)

        rows.append([
            cas, wt, logp, rot, heavy, aromprop, TPSA, HDonors, HAcceptors,
            FractionCSP3, AromaticCarbocycles, AromaticHeterocycles
        ])

    resultDf = pd.DataFrame(rows, columns=columns)
    resultDf.to_csv('Descriptors.csv', index=False)
def PhyChem(smiles):
    """Build the scaled 19-column physicochemical descriptor matrix.

    Each SMILES string is parsed and turned into one row of 19 descriptor
    values. If any step fails for a molecule, its SMILES is printed and the
    row is filled with zeros. The stacked rows are finally passed through
    ``Scaler().fit_transform``.

    Arguments:
        smiles (list): list of SMILES strings.

    Returns:
        props (ndarray): m X 19 matrix of scaled descriptors, where m is the
            number of input strings.
    """
    rows = []
    for smi in smiles:
        molecule = Chem.MolFromSmiles(smi)
        try:
            # Column order is fixed; callers index into it positionally.
            row = [
                desc.MolWt(molecule),                        # MW
                Crippen.MolLogP(molecule),                   # LOGP
                Lipinski.NumHAcceptors(molecule),            # HBA
                Lipinski.NumHDonors(molecule),               # HBD
                Lipinski.NumRotatableBonds(molecule),        # rotatable bonds
                AllChem.CalcNumAmideBonds(molecule),         # amide bonds
                AllChem.CalcNumBridgeheadAtoms(molecule),    # bridgehead atoms
                Lipinski.NumHeteroatoms(molecule),           # hetero atoms
                Lipinski.HeavyAtomCount(molecule),           # heavy atoms
                AllChem.CalcNumSpiroAtoms(molecule),         # spiro atoms
                AllChem.CalcFractionCSP3(molecule),          # FCSP3
                Lipinski.RingCount(molecule),                # rings
                AllChem.CalcNumAliphaticRings(molecule),     # aliphatic rings
                AllChem.CalcNumAromaticRings(molecule),      # aromatic rings
                AllChem.CalcNumSaturatedRings(molecule),     # saturated rings
                AllChem.CalcNumHeterocycles(molecule),       # hetero rings
                MolSurf.TPSA(molecule),                      # TPSA
                desc.NumValenceElectrons(molecule),          # valence electrons
                Crippen.MolMR(molecule),                     # molar refractivity
            ]
        except Exception:
            # Best-effort fallback: report the offending input, keep the row.
            print(smi)
            row = [0] * 19
        rows.append(row)

    matrix = np.array(rows)
    return Scaler().fit_transform(matrix)
def __init__(self, *args, **kwargs):
    """Initialise a compound from an RDKit molecule.

    With more than two positional arguments everything is forwarded
    unchanged to the parent initialiser. Otherwise the molecule is taken
    from ``args[0]`` or, failing that, the ``mol_as_RDmol`` keyword; the
    description comes from ``args[1]`` or the ``description`` keyword and
    defaults to ``''``. The identifier, SMILES, InChI, InChI key, exact
    molecular weight, heavy-atom count and ring count are derived from the
    molecule and handed to the parent initialiser; any other keyword
    arguments are not forwarded on this path.

    Raises:
        RuntimeError: If no (truthy) molecule was supplied.
    """
    # Pass-through path: the caller supplied the parent's own arguments.
    if len(args) > 2:
        super(Compound, self).__init__(*args, **kwargs)
        return

    rdmol = (args[0] if args else None) or kwargs.get('mol_as_RDmol')
    if not rdmol:
        raise RuntimeError("No RDMol specified")

    description = (args[1] if len(args) > 1 else None) or kwargs.get('description', '')

    # The InChI is needed twice (stored and hashed into the key).
    unique_id = self._generate_id()
    smiles = Chem.MolToSmiles(rdmol, isomericSmiles=True, canonical=True)
    inchi = Chem.MolToInchi(rdmol)
    derived_fields = {
        'unique_id': unique_id,
        'smiles': smiles,
        'inchi': inchi,
        'inchi_key': Chem.InchiToInchiKey(inchi),
        'mol_weight_exact': Descriptors.ExactMolWt(rdmol),
        'heavy_atoms_count': Lipinski.HeavyAtomCount(rdmol),
        'ring_count': Lipinski.RingCount(rdmol),
        'mol': rdmol,
    }
    super(Compound, self).__init__(description=description, **derived_fields)
def extract(x, from_smiles):
    """Return the descriptor vector for one molecule.

    The vector has 24 entries, or 29 when ``include_3D`` is truthy (five
    3D shape descriptors are appended). ``include_3D`` is not a parameter;
    it is read from the surrounding scope.

    Args:
        x: A SMILES string when ``from_smiles`` is truthy, otherwise an
            already-built molecule object.
        from_smiles: Whether ``x`` must be parsed with
            ``Chem.MolFromSmiles`` first.

    Returns:
        A list of descriptor values in fixed order. A molecule that is
        ``None`` or has no atoms yields an all-zero list of the same length.
        If 3D embedding fails, the five shape descriptors are zero while the
        first 24 values are kept (previously the ignored embedding failure
        made the subsequent 3D calls raise).
    """
    mol = Chem.MolFromSmiles(x) if from_smiles else x

    if (mol is None) or (len(mol.GetAtoms()) == 0):
        return [0] * (29 if include_3D else 24)

    # Order matters: consumers address these values by position.
    features = [
        Crippen.MolLogP(mol),                       # logP
        Crippen.MolMR(mol),                         # refractivity
        Descriptors.MolWt(mol),                     # weight
        Descriptors.ExactMolWt(mol),                # exact weight
        Descriptors.HeavyAtomMolWt(mol),            # heavy-atom weight
        Lipinski.HeavyAtomCount(mol),               # heavy-atom count
        Lipinski.NHOHCount(mol),                    # NH/OH count
        Lipinski.NOCount(mol),                      # N/O count
        Lipinski.NumHAcceptors(mol),                # H-bond acceptors
        Lipinski.NumHDonors(mol),                   # H-bond donors
        Lipinski.NumHeteroatoms(mol),               # hetero atoms
        Lipinski.NumRotatableBonds(mol),            # rotatable bonds
        Descriptors.NumValenceElectrons(mol),       # valence electrons
        rdMolDescriptors.CalcNumAmideBonds(mol),    # amide bonds
        Lipinski.NumAliphaticRings(mol),            # aliphatic rings
        Lipinski.NumAromaticRings(mol),             # aromatic rings
        Lipinski.NumSaturatedRings(mol),            # saturated rings
        Lipinski.NumAliphaticCarbocycles(mol),      # aliphatic carbocycles
        Lipinski.NumAliphaticHeterocycles(mol),     # aliphatic heterocycles
        Lipinski.NumAromaticCarbocycles(mol),       # aromatic carbocycles
        Lipinski.NumAromaticHeterocycles(mol),      # aromatic heterocycles
        Lipinski.NumSaturatedCarbocycles(mol),      # saturated carbocycles
        Lipinski.NumSaturatedHeterocycles(mol),     # saturated heterocycles
        rdMolDescriptors.CalcTPSA(mol),             # TPSA
    ]

    if not include_3D:
        return features

    mol_3D = Chem.AddHs(mol)
    # EmbedMolecule reports failure with a negative conformer id; without a
    # conformer there are no coordinates to optimise or measure.
    if AllChem.EmbedMolecule(mol_3D) < 0:
        return features + [0] * 5

    AllChem.MMFFOptimizeMolecule(mol_3D)
    features.extend([
        rdMolDescriptors.CalcEccentricity(mol_3D),
        rdMolDescriptors.CalcAsphericity(mol_3D),
        rdMolDescriptors.CalcSpherocityIndex(mol_3D),
        rdMolDescriptors.CalcInertialShapeFactor(mol_3D),
        rdMolDescriptors.CalcRadiusOfGyration(mol_3D),
    ])
    return features
def calc_rdkit(mol):
    """Compute a fixed-order vector of RDKit descriptors for one molecule.

    Every descriptor function listed below is called with ``mol`` and the
    results are packed, in exactly this order, into a NumPy array wrapped in
    a ``pandas.Series`` with the default integer index. The position of a
    value is therefore its only identifier; do not reorder the calls.

    Args:
        mol: Molecule object accepted by the RDKit descriptor functions.

    Returns:
        pandas.Series of descriptor values (presumably numeric; the dtype is
        whatever ``np.array`` infers from the returned values).
    """
    descriptors = pd.Series(
        np.array([
            # --- Crippen ---
            Crippen.MolLogP(mol),
            Crippen.MolMR(mol),
            # --- Descriptors ---
            Descriptors.FpDensityMorgan1(mol),
            Descriptors.FpDensityMorgan2(mol),
            Descriptors.FpDensityMorgan3(mol),
            Descriptors.FractionCSP3(mol),
            Descriptors.HeavyAtomMolWt(mol),
            Descriptors.MaxAbsPartialCharge(mol),
            Descriptors.MaxPartialCharge(mol),
            Descriptors.MinAbsPartialCharge(mol),
            Descriptors.MinPartialCharge(mol),
            Descriptors.MolWt(mol),
            Descriptors.NumRadicalElectrons(mol),
            Descriptors.NumValenceElectrons(mol),
            # --- EState indices ---
            EState.EState.MaxAbsEStateIndex(mol),
            EState.EState.MaxEStateIndex(mol),
            EState.EState.MinAbsEStateIndex(mol),
            EState.EState.MinEStateIndex(mol),
            # --- EState_VSA (note the order: 1, 10, 11, 2..9) ---
            EState.EState_VSA.EState_VSA1(mol),
            EState.EState_VSA.EState_VSA10(mol),
            EState.EState_VSA.EState_VSA11(mol),
            EState.EState_VSA.EState_VSA2(mol),
            EState.EState_VSA.EState_VSA3(mol),
            EState.EState_VSA.EState_VSA4(mol),
            EState.EState_VSA.EState_VSA5(mol),
            EState.EState_VSA.EState_VSA6(mol),
            EState.EState_VSA.EState_VSA7(mol),
            EState.EState_VSA.EState_VSA8(mol),
            EState.EState_VSA.EState_VSA9(mol),
            # --- Fragments (fr_*) ---
            Fragments.fr_Al_COO(mol),
            Fragments.fr_Al_OH(mol),
            Fragments.fr_Al_OH_noTert(mol),
            Fragments.fr_aldehyde(mol),
            Fragments.fr_alkyl_carbamate(mol),
            Fragments.fr_alkyl_halide(mol),
            Fragments.fr_allylic_oxid(mol),
            Fragments.fr_amide(mol),
            Fragments.fr_amidine(mol),
            Fragments.fr_aniline(mol),
            Fragments.fr_Ar_COO(mol),
            Fragments.fr_Ar_N(mol),
            Fragments.fr_Ar_NH(mol),
            Fragments.fr_Ar_OH(mol),
            Fragments.fr_ArN(mol),
            Fragments.fr_aryl_methyl(mol),
            Fragments.fr_azide(mol),
            Fragments.fr_azo(mol),
            Fragments.fr_barbitur(mol),
            Fragments.fr_benzene(mol),
            Fragments.fr_benzodiazepine(mol),
            Fragments.fr_bicyclic(mol),
            Fragments.fr_C_O(mol),
            Fragments.fr_C_O_noCOO(mol),
            Fragments.fr_C_S(mol),
            Fragments.fr_COO(mol),
            Fragments.fr_COO2(mol),
            Fragments.fr_diazo(mol),
            Fragments.fr_dihydropyridine(mol),
            Fragments.fr_epoxide(mol),
            Fragments.fr_ester(mol),
            Fragments.fr_ether(mol),
            Fragments.fr_furan(mol),
            Fragments.fr_guanido(mol),
            Fragments.fr_halogen(mol),
            Fragments.fr_hdrzine(mol),
            Fragments.fr_hdrzone(mol),
            Fragments.fr_HOCCN(mol),
            Fragments.fr_imidazole(mol),
            Fragments.fr_imide(mol),
            Fragments.fr_Imine(mol),
            Fragments.fr_isocyan(mol),
            Fragments.fr_isothiocyan(mol),
            Fragments.fr_ketone(mol),
            Fragments.fr_ketone_Topliss(mol),
            Fragments.fr_lactam(mol),
            Fragments.fr_lactone(mol),
            Fragments.fr_methoxy(mol),
            Fragments.fr_morpholine(mol),
            Fragments.fr_N_O(mol),
            Fragments.fr_Ndealkylation1(mol),
            Fragments.fr_Ndealkylation2(mol),
            Fragments.fr_NH0(mol),
            Fragments.fr_NH1(mol),
            Fragments.fr_NH2(mol),
            Fragments.fr_Nhpyrrole(mol),
            Fragments.fr_nitrile(mol),
            Fragments.fr_nitro(mol),
            Fragments.fr_nitro_arom(mol),
            Fragments.fr_nitro_arom_nonortho(mol),
            Fragments.fr_nitroso(mol),
            Fragments.fr_oxazole(mol),
            Fragments.fr_oxime(mol),
            Fragments.fr_para_hydroxylation(mol),
            Fragments.fr_phenol(mol),
            Fragments.fr_phenol_noOrthoHbond(mol),
            Fragments.fr_phos_acid(mol),
            Fragments.fr_phos_ester(mol),
            Fragments.fr_piperdine(mol),
            Fragments.fr_piperzine(mol),
            Fragments.fr_priamide(mol),
            Fragments.fr_prisulfonamd(mol),
            Fragments.fr_pyridine(mol),
            Fragments.fr_quatN(mol),
            Fragments.fr_SH(mol),
            Fragments.fr_sulfide(mol),
            Fragments.fr_sulfonamd(mol),
            Fragments.fr_sulfone(mol),
            Fragments.fr_term_acetylene(mol),
            Fragments.fr_tetrazole(mol),
            Fragments.fr_thiazole(mol),
            Fragments.fr_thiocyan(mol),
            Fragments.fr_thiophene(mol),
            Fragments.fr_unbrch_alkane(mol),
            Fragments.fr_urea(mol),
            # --- GraphDescriptors ---
            GraphDescriptors.BalabanJ(mol),
            GraphDescriptors.BertzCT(mol),
            GraphDescriptors.Chi0(mol),
            GraphDescriptors.Chi0n(mol),
            GraphDescriptors.Chi0v(mol),
            GraphDescriptors.Chi1(mol),
            GraphDescriptors.Chi1n(mol),
            GraphDescriptors.Chi1v(mol),
            GraphDescriptors.Chi2n(mol),
            GraphDescriptors.Chi2v(mol),
            GraphDescriptors.Chi3n(mol),
            GraphDescriptors.Chi3v(mol),
            GraphDescriptors.Chi4n(mol),
            GraphDescriptors.Chi4v(mol),
            GraphDescriptors.HallKierAlpha(mol),
            GraphDescriptors.Ipc(mol),
            GraphDescriptors.Kappa1(mol),
            GraphDescriptors.Kappa2(mol),
            GraphDescriptors.Kappa3(mol),
            # --- Lipinski ---
            Lipinski.HeavyAtomCount(mol),
            Lipinski.NHOHCount(mol),
            Lipinski.NOCount(mol),
            Lipinski.NumAliphaticCarbocycles(mol),
            Lipinski.NumAliphaticHeterocycles(mol),
            Lipinski.NumAliphaticRings(mol),
            Lipinski.NumAromaticCarbocycles(mol),
            Lipinski.NumAromaticHeterocycles(mol),
            Lipinski.NumAromaticRings(mol),
            Lipinski.NumHAcceptors(mol),
            Lipinski.NumHDonors(mol),
            Lipinski.NumHeteroatoms(mol),
            Lipinski.NumRotatableBonds(mol),
            Lipinski.NumSaturatedCarbocycles(mol),
            Lipinski.NumSaturatedHeterocycles(mol),
            Lipinski.NumSaturatedRings(mol),
            Lipinski.RingCount(mol),
            # --- MolSurf (VSA families are ordered 1, 10.., 2..9) ---
            MolSurf.LabuteASA(mol),
            MolSurf.PEOE_VSA1(mol),
            MolSurf.PEOE_VSA10(mol),
            MolSurf.PEOE_VSA11(mol),
            MolSurf.PEOE_VSA12(mol),
            MolSurf.PEOE_VSA13(mol),
            MolSurf.PEOE_VSA14(mol),
            MolSurf.PEOE_VSA2(mol),
            MolSurf.PEOE_VSA3(mol),
            MolSurf.PEOE_VSA4(mol),
            MolSurf.PEOE_VSA5(mol),
            MolSurf.PEOE_VSA6(mol),
            MolSurf.PEOE_VSA7(mol),
            MolSurf.PEOE_VSA8(mol),
            MolSurf.PEOE_VSA9(mol),
            MolSurf.SlogP_VSA1(mol),
            MolSurf.SlogP_VSA10(mol),
            MolSurf.SlogP_VSA11(mol),
            MolSurf.SlogP_VSA12(mol),
            MolSurf.SlogP_VSA2(mol),
            MolSurf.SlogP_VSA3(mol),
            MolSurf.SlogP_VSA4(mol),
            MolSurf.SlogP_VSA5(mol),
            MolSurf.SlogP_VSA6(mol),
            MolSurf.SlogP_VSA7(mol),
            MolSurf.SlogP_VSA8(mol),
            MolSurf.SlogP_VSA9(mol),
            MolSurf.SMR_VSA1(mol),
            MolSurf.SMR_VSA10(mol),
            MolSurf.SMR_VSA2(mol),
            MolSurf.SMR_VSA3(mol),
            MolSurf.SMR_VSA4(mol),
            MolSurf.SMR_VSA5(mol),
            MolSurf.SMR_VSA6(mol),
            MolSurf.SMR_VSA7(mol),
            MolSurf.SMR_VSA8(mol),
            MolSurf.SMR_VSA9(mol),
            MolSurf.TPSA(mol)
        ]))
    return descriptors
def get_num_heavy_atoms_(mol: Mol) -> int:
    """Count the heavy atoms of ``mol``.

    Thin wrapper that delegates to ``Lipinski.HeavyAtomCount``.
    """
    heavy_atom_count = Lipinski.HeavyAtomCount(mol)
    return heavy_atom_count
def _add_descriptor_columns(feature_df, named_functions, molecules, skipped_names):
    """Add one column per public function in ``named_functions`` to ``feature_df``.

    Functions whose name starts with ``_`` or appears in ``skipped_names``
    are ignored. Each remaining function is applied to every molecule and
    the results are stored under the function's name.
    """
    for name, func in named_functions:
        if name.startswith('_') or name in skipped_names:
            continue
        feature_df[name] = [func(molecule) for molecule in molecules]


def smiles_to_all_labels(df):
    """Return a copy of ``df`` extended with RDKit-derived feature columns.

    For every row the ``SMILES`` entry is parsed once. One column is added
    for every public function found in the fragments module ``f`` and in the
    Lipinski module ``l``, except for names matching an input column that
    holds a single value for all compounds. Finally ``fr_Al_COO``,
    ``HeavyAtomCount``, ``GetNumAtoms`` and ``CalcExactMolWt`` are always
    (re)computed.

    Args:
        df: DataFrame with a ``SMILES`` column. It is not modified.

    Returns:
        A new DataFrame containing the original columns plus the features.
    """
    smilesList = df['SMILES']
    feature_df = df.copy()

    # Parse each SMILES a single time and reuse the molecule for all features.
    molecules = [chem.MolFromSmiles(smiles) for smiles in smilesList]

    # get all functions of modules
    all_lipinski = inspect.getmembers(l, inspect.isfunction)
    all_fragments = inspect.getmembers(f, inspect.isfunction)

    # bad features have the same value for all our compounds
    # (DataFrame.items replaces iteritems, which pandas 2.0 removed)
    bad_features = [
        columnName for columnName, columnData in df.items()
        if len(set(columnData.values)) == 1
    ]

    # add fragment features
    _add_descriptor_columns(feature_df, all_fragments, molecules, bad_features)
    print('fragments over')

    # add lipinski features; the filter now checks the Lipinski function's
    # own name (it used to look up the fragment list at the same index)
    _add_descriptor_columns(feature_df, all_lipinski, molecules, bad_features)
    print('lipinski over')

    # These columns are always present, even when filtered out above.
    feature_df["fr_Al_COO"] = [f.fr_Al_COO(molecule) for molecule in molecules]
    feature_df["HeavyAtomCount"] = [l.HeavyAtomCount(molecule) for molecule in molecules]
    # add getnumatoms as feature
    feature_df["GetNumAtoms"] = [molecule.GetNumAtoms() for molecule in molecules]
    # add CalcExactMolWt as feature
    feature_df["CalcExactMolWt"] = [molDesc.CalcExactMolWt(molecule) for molecule in molecules]

    return feature_df
def main():
    """Command-line entry point: write molecular descriptors of a SMILES file as CSV.

    Reads molecules from the ``-i`` SMILES file via
    ``RobustSmilesMolSupplier``, computes ten descriptors per molecule
    (MolW, HA, cLogP, AR, MR, TPSA, RotB, HBA, HBD, FC) and writes one CSV
    line per molecule to the ``-o`` file. Molecules flagged by ``is_alien``
    are reported on stderr and, with ``--remove-aliens``, left out of the
    output. A summary of encoded / alien / error / total counts is printed
    to stderr at the end.

    Exits with status 1 after printing help when called without arguments.
    ``-i`` and ``-o`` are mandatory; argparse reports a usage error when one
    of them is missing (previously this surfaced later as a ``TypeError``).
    """
    # CLI options parsing
    parser = argparse.ArgumentParser(
        description="Project molecules read from a SMILES file into a 10D "
                    "space whose dimensions are molecular descriptors: "
                    "(MolW, HA, cLogP, AR, MR, TPSA, RotB, HBA, HBD, FC)")
    parser.add_argument("-i", metavar="input_smi", dest="input_smi",
                        required=True,
                        help="input SMILES file")
    parser.add_argument("-o", metavar="output_csv", dest="output_csv",
                        required=True,
                        help="output CSV file")
    parser.add_argument('--no-header', dest='no_header',
                        action='store_true', default=False,
                        help="no CSV header in output file")
    # just warn about aliens by default
    parser.add_argument('--remove-aliens', dest='rm_aliens',
                        action='store_true', default=False,
                        help="don't allow aliens in output file")
    # parse CLI
    if len(sys.argv) == 1:
        # show help in case user has no clue of what to do
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    input_smi = args.input_smi
    output_csv = args.output_csv
    rm_aliens = args.rm_aliens
    no_header = args.no_header

    out_count = 0
    alien_count = 0
    error_count = 0
    with open(output_csv, 'w') as out_file:
        if not no_header:
            print("#name,MolW,HA,cLogP,AR,MR,TPSA,RotB,HBA,HBD,FC",
                  file=out_file)
        for i, mol, name in RobustSmilesMolSupplier(input_smi):
            if mol is None:
                # unparsable input line: counted, nothing written
                error_count += 1
                continue

            MolW = Descriptors.MolWt(mol)
            HA = Lipinski.HeavyAtomCount(mol)
            cLogP = Descriptors.MolLogP(mol)
            AR = Lipinski.NumAromaticRings(mol)
            MR = Descriptors.MolMR(mol)
            TPSA = Descriptors.TPSA(mol)
            RotB = Descriptors.NumRotatableBonds(mol)
            HBA = Descriptors.NumHAcceptors(mol)
            HBD = Descriptors.NumHDonors(mol)
            FC = Chem.rdmolops.GetFormalCharge(mol)

            alien = is_alien(MolW, cLogP, TPSA, RotB, HBA, HBD, FC)
            if alien:
                alien_str = alien_diagnose(i, name, MolW, cLogP, TPSA, RotB,
                                           HBA, HBD, FC)
                print("WARN: %s" % alien_str, file=sys.stderr)
                alien_count += 1

            # aliens are only dropped when explicitly requested
            if (not alien) or (not rm_aliens):
                csv_line = "%s,%g,%d,%g,%d,%g,%g,%d,%d,%d,%d" % \
                           (name, MolW, HA, cLogP, AR, MR, TPSA, RotB,
                            HBA, HBD, FC)
                print(csv_line, file=out_file)
                out_count += 1

    # Aliens that were written are already part of out_count; they only
    # need to be added separately when they were removed from the output.
    total_count = out_count + error_count
    if rm_aliens:
        total_count += alien_count
    print("encoded: %d aliens: %d errors: %d total: %d" %
          (out_count, alien_count, error_count, total_count),
          file=sys.stderr)