def get_lipinksi_test(mol, rule_test):
    """Check a molecule against a weight window and a shared descriptor cap.

    The molecule passes when its exact molecular weight lies strictly
    between 300 and 500 and each of Crippen logP, the Lipinski H-bond donor
    count and the Lipinski H-bond acceptor count is ``<= rule_test``.

    :param mol: RDKit molecule; its property cache is refreshed (non-strict).
    :param rule_test: upper bound applied to every descriptor value.
    :return: ``True`` when all criteria hold, ``False`` otherwise.
    """
    mol.UpdatePropertyCache(strict=False)
    exact_weight = rdMolDescriptors.CalcExactMolWt(mol)

    # CalcCrippenDescriptors returns a (logP, MR) tuple; only logP is used.
    log_p = rdMolDescriptors.CalcCrippenDescriptors(mol)[0]
    donors = rdMolDescriptors.CalcNumLipinskiHBD(mol)
    acceptors = rdMolDescriptors.CalcNumLipinskiHBA(mol)

    within_rule = all(v <= rule_test for v in (log_p, donors, acceptors))
    weight_ok = 300 < exact_weight < 500
    return bool(weight_ok and within_rule)
def testMolWt(self):
    """Check the average and exact molecular weights reported for methane."""
    methane = Chem.MolFromSmiles("C")

    # The same expectations hold with implicit and with explicit hydrogens:
    # 16.043 for the plain call and 12.011 when the second argument is True.
    for subject in (methane, Chem.AddHs(methane)):
        plain_weight = rdMD._CalcMolWt(subject)
        self.assertTrue(feq(plain_weight, 16.043, .001))
        flagged_weight = rdMD._CalcMolWt(subject, True)
        self.assertTrue(feq(flagged_weight, 12.011, .001))

    exact_weight = rdMD.CalcExactMolWt(Chem.MolFromSmiles("C"))
    self.assertTrue(feq(exact_weight, 16.031, .001))
def predict(self, mol, selected_descriptors):
    """Run the property models needed for the requested descriptors.

    Recognised descriptor names are ``'logP'``, ``'sol'``, ``'mp'``,
    ``'pka'`` and ``'mol_wt'``. Requesting ``'sol'`` also computes (and
    returns) ``'logP'``; requesting ``'mp'`` also computes ``'logP'`` and
    ``'sol'`` because the melting-point model consumes both.

    :param mol: RDKit molecule to evaluate.
    :param selected_descriptors: iterable of descriptor names.
    :return: dict mapping each computed descriptor name to its value.
    """
    requested = list(selected_descriptors)
    want_mp = 'mp' in requested
    want_sol = want_mp or 'sol' in requested
    want_logp = want_sol or 'logP' in requested
    want_pka = 'pka' in requested
    want_weight = 'mol_wt' in requested

    results = {}
    # The atom-pair fingerprint is always built; several models share it.
    fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol)

    if want_logp:
        logP = self.logP_model.run(fp)
        results['logP'] = logP

    if want_sol:
        logP_sol = self.logP_solubility_model.run(logP)
        atom_pair_sol = self.atom_pair_sol_model.run(fp)
        combined_sol = self.combined_model.run(mol, logP, logP_sol,
                                               atom_pair_sol)
        results['sol'] = logs_to_mg_ml(combined_sol, mol)

    if want_mp:
        results['mp'] = self.melting_point_model.run(combined_sol, logP)

    if want_pka:
        avalon = GetAvalonFP(mol)
        maacs = MACCSkeys.GenMACCSKeys(mol)
        results['pka'] = self.pKa_model.run(avalon + maacs + fp)

    if want_weight:
        results['mol_wt'] = rdMolDescriptors.CalcExactMolWt(mol)

    return results
def evaluate_chem_mol(mol):
    """Evaluate a molecule against fixed MW / cLogP / TPSA windows.

    :param mol: RDKit molecule to evaluate.
    :return: list of four booleans ``[valid, mw_ok, clogp_ok, tpsa_ok]``
        where the windows are ``320 < MW < 420``, ``2 < cLogP < 3`` and
        ``40 < TPSA < 60``. If any descriptor calculation fails the result
        is ``[False, False, False, False]``.
    """
    try:
        Chem.GetSSSR(mol)
        clogp = Crippen.MolLogP(mol)
        mw = MolDescriptors.CalcExactMolWt(mol)
        tpsa = Descriptors.TPSA(mol)
    except Exception:
        # Best-effort scoring: a molecule RDKit cannot process simply fails
        # every criterion. ``Exception`` (rather than a bare ``except``)
        # keeps KeyboardInterrupt / SystemExit propagating.
        return [False] * 4
    return [True, 320 < mw < 420, 2 < clogp < 3, 40 < tpsa < 60]
def choose(self, mol):
    """Return the largest covalent unit.

    The largest fragment is determined by number of atoms (including
    hydrogens). Ties are broken by taking the fragment with the higher
    molecular weight, and then by taking the first alphabetically by SMILES
    if needed. When ``self.prefer_organic`` is set, an organic fragment
    always wins over a non-organic one regardless of size.

    :param mol: The molecule to choose the largest fragment from.
    :type mol: :rdkit:`Mol <Chem.rdchem.Mol-class.html>`
    :return: The largest fragment, or ``mol`` itself when it contains no
        fragments at all.
    :rtype: :rdkit:`Mol <Chem.rdchem.Mol-class.html>`
    """
    log.debug('Running LargestFragmentChooser')
    # TODO: Alternatively allow a list of fragments to be passed as the mol parameter
    fragments = Chem.GetMolFrags(mol, asMols=True)
    largest = None
    for f in fragments:
        smiles = Chem.MolToSmiles(f, isomericSmiles=True)
        log.debug('Fragment: %s', smiles)
        organic = is_organic(f)
        if self.prefer_organic:
            # Skip this fragment if not organic and we already have an
            # organic fragment as the largest so far
            if largest and largest['organic'] and not organic:
                continue
            # Reset largest if it wasn't organic and this fragment is organic
            if largest and organic and not largest['organic']:
                largest = None
        # Count atoms, including the hydrogens attached to each one
        atoms = sum(1 + a.GetTotalNumHs() for a in f.GetAtoms())
        # Skip this fragment if fewer atoms than the largest
        if largest and atoms < largest['atoms']:
            continue
        # Skip this fragment if equal number of atoms but weight is lower
        weight = rdMolDescriptors.CalcExactMolWt(f)
        if largest and atoms == largest['atoms'] and weight < largest['weight']:
            continue
        # Skip this fragment if equal atoms and equal weight but smiles
        # comes last alphabetically
        if (largest and atoms == largest['atoms']
                and weight == largest['weight']
                and smiles > largest['smiles']):
            continue
        # Otherwise this is the largest so far
        log.debug('New largest fragment: %s (%s)', smiles, atoms)
        largest = {'smiles': smiles, 'fragment': f, 'atoms': atoms,
                   'weight': weight, 'organic': organic}
    if largest is None:
        # No fragments were produced (e.g. an empty molecule); indexing
        # None would raise an opaque TypeError, so hand the input back.
        return mol
    return largest['fragment']
def parse_epilion(abbr: str) -> dict:
    """Convert a lipid abbreviation into structure information.

    The abbreviation is normalised with ``Converter``, decoded to SMILES by
    the fatty-acid or phospholipid parser, and rendered with RDKit.

    :param abbr: lipid abbreviation to parse.
    :return: dict with keys ``'id'``, ``'formula'``, ``'exactmass'``
        (string with four decimals) and ``'img'`` (PNG data URL). The dict
        is empty when the abbreviation cannot be parsed or the structure
        cannot be generated; failures are logged, not raised.
    """
    fa_decoder = ParserFA()
    pl_decoder = ParserPL()
    info_dct = {}
    converter = Converter(abbr_cfg_path)
    epilion_id = converter.convert_abbr(abbr)

    if fa_decoder.is_fa(epilion_id):
        smi = fa_decoder.get_smi_fa(epilion_id)
        logger.info(epilion_id + ': ' + smi)
    elif pl_decoder.is_pl(epilion_id):
        smi = pl_decoder.get_smi_pl(epilion_id)
        logger.info(epilion_id + ': ' + smi)
    else:
        logger.info(f'Can NOT parse abbreviation: {epilion_id}')
        # Without a SMILES there is nothing to render. Returning here also
        # avoids referencing the unbound ``smi`` in the error handler below.
        return info_dct

    try:
        mol = Chem.MolFromSmiles(smi)
        AllChem.Compute2DCoords(mol)
        # m_mass = Descriptors.MolWt(mol)
        m_exactmass = rdMolDescriptors.CalcExactMolWt(mol)
        m_formula = rdMolDescriptors.CalcMolFormula(mol)

        # Render once into an in-memory PNG and embed it as a data URL.
        img = Draw.MolToImage(mol, size=(600, 400))
        img_io = BytesIO()
        img.save(img_io, format='png')
        img_data = base64.b64encode(img_io.getbuffer())
        img_data_url = r'data:image/png;base64,' + img_data.decode("utf-8")

        info_dct['id'] = epilion_id
        info_dct['formula'] = m_formula
        info_dct['exactmass'] = '%.4f' % m_exactmass
        info_dct['img'] = img_data_url
    except Exception as e:
        logger.error(f'! FAILED: {epilion_id}')
        logger.error(f'! FAILED to generate structure from SMILES: {smi}')
        logger.error(e)
    return info_dct
def get_filter_values(mol):
    """
    Compute, for one molecule, every value the filters look at.

    :param mol: RDKit molecule (asserted to be a ``Chem.Mol``).
    :return: dict of descriptor values plus the bookkeeping entries
        ``"is_good"`` (initially True) and ``"rejections"`` (initially empty).
    """
    assert isinstance(mol, Chem.Mol)

    exact_weight = desc.CalcExactMolWt(mol)
    log_p = crip.MolLogP(mol)
    acceptors = lip.NumHAcceptors(mol)
    donors = lip.NumHDonors(mol)
    tpsa = desc.CalcTPSA(mol)
    rotatable = lip.NumRotatableBonds(mol)
    # assume mutual exclusion: every bond is either rotatable or rigid
    rigid = mol.GetNumBonds() - rotatable
    ring_count = lip.RingCount(mol)
    hetero_count = lip.NumHeteroatoms(mol)
    # trusting this charge calculation method
    formal_charge = rdmolops.GetFormalCharge(mol)
    carbon_count, charge_count, largest_ring = get_atom_props(mol)

    try:
        hc_ratio = float(hetero_count) / float(carbon_count)
    except ZeroDivisionError:
        hc_ratio = 100000000  # if there are zero carbons

    # how many BRICS bonds, related to complexity
    brics_bond_count = len(list(Brics.FindBRICSBonds(mol)))

    # unique element symbols present in the molecule
    element_types = list({atom.GetSymbol() for atom in mol.GetAtoms()})

    chiral_count = len(
        Chem.FindMolChiralCenters(mol, includeUnassigned=True))

    return {
        "MW": exact_weight,
        "logP": log_p,
        "HBA": acceptors,
        "HBD": donors,
        "tPSA": tpsa,
        "rot_bonds": rotatable,
        "rigid_bonds": rigid,
        "num_rings": ring_count,
        "num_hetero_atoms": hetero_count,
        "charge": formal_charge,
        "num_carbons": carbon_count,
        "num_charges": charge_count,
        "max_ring_size": largest_ring,
        "hc_ratio": hc_ratio,
        "fc": brics_bond_count,
        "is_good": True,  # default to true, but not yet observed
        "atoms": element_types,
        "num_chiral_centers": chiral_count,
        "rejections": [],  # empty list to store the reasons for rejection
    }
def __init__(self, mol, atom_count=None, MW=None, Tb=None):
    """Prepare a molecule for group-contribution estimation.

    :param mol: an RDKit ``Mol`` or a SMILES string to parse into one.
    :param atom_count: total atom count including hydrogens; derived from
        the hydrogen-added molecule when omitted.
    :param MW: molecular weight; computed as the exact weight of the
        hydrogen-added molecule when omitted.
    :param Tb: boiling point to use; when omitted it is estimated from the
        fragment counts via ``self.Tb``.
    """
    if isinstance(mol, Chem.rdchem.Mol):
        self.rdkitmol = mol
    else:
        self.rdkitmol = Chem.MolFromSmiles(mol)

    # The hydrogen-added molecule backs both derived quantities, so build
    # it whenever either is missing (previously it was only created when
    # atom_count was omitted, which broke the MW fallback).
    if atom_count is None or MW is None:
        self.rdkitmol_Hs = Chem.AddHs(self.rdkitmol)

    if atom_count is None:
        self.atom_count = len(self.rdkitmol_Hs.GetAtoms())
    else:
        self.atom_count = atom_count

    if MW is None:
        self.MW = rdMolDescriptors.CalcExactMolWt(self.rdkitmol_Hs)
    else:
        self.MW = MW

    self.counts, self.success, self.status = smarts_fragment(
        J_BIGGS_JOBACK_SMARTS_id_dict, rdkitmol=self.rdkitmol)

    # Use the caller's boiling point when given; estimate it otherwise.
    # (The condition used to be inverted, discarding a supplied value and
    # leaving None when nothing was supplied.)
    if Tb is None:
        self.Tb_estimated = self.Tb(self.counts)
    else:
        self.Tb_estimated = Tb
def get_monoisotopic_mz_and_z(structure):
    """
    Determine the monoisotopic mass value and charge of an ion provided as
    a SMILES string or .sdf file.

    :param structure: str
        a valid SMILES string OR a path to an .sdf file containing a single
        ion structure.
    :return out_dict: dict
        w/ entries "charge" (int), "monoiso_mz" (float, the value returned
        by RDKit's CalcExactMolWt) and "mol" (the RDKit mol object).
    :raises TypeError: the input is neither a valid SMILES nor a readable
        .sdf path.
    :raises NotImplementedError: no molecule could be obtained from the input.
    :raises ValueError: the structure carries no formal charge.
    """
    # parse input: first as SMILES, then fall back to treating it as a path
    try:
        mol = Chem.MolFromSmiles(structure)
    except TypeError:
        mol = None
    if mol is None:
        try:
            sdf_mols = list(Chem.SDMolSupplier(structure))
        except OSError:
            raise TypeError(
                'The provide structure was neither a valid SMILES string nor a path to an .sdf file.'
            )
        # An .sdf without any record yields no molecule; let the existence
        # check below report it instead of failing with an IndexError.
        mol = sdf_mols[0] if sdf_mols else None

    # ensure mol exists
    if not mol:
        raise NotImplementedError(
            'For unknown reasons, the provided structure could not be analyzed.'
        )

    # determine properties of mol
    # NOTE(review): the exact mass is stored as "monoiso_mz" without being
    # divided by the charge - confirm callers expect this for |z| > 1.
    monoiso_mz = rdMolDescriptors.CalcExactMolWt(mol)
    charge = rdmolops.GetFormalCharge(mol)

    # ensure provided structure is of an ion
    if not charge:
        raise ValueError(
            'Provided structures must be of ions, not neutral molecules.')

    return {'charge': int(charge), 'monoiso_mz': monoiso_mz, 'mol': mol}
def calculate_properties(self, smiles=None, mol=None, props=None):
    """Calculate the requested basic properties and store them in ``self.data``.

    :param smiles: SMILES string, parsed only when ``mol`` is not given.
    :param mol: RDKit molecule to use directly.
    :param props: collection of property names to compute (``'py_formula'``,
        ``'py_em'``, ``'py_n_Cl_Br'``, ``'py_na'``, ``'py_mw'``,
        ``'py_fsp3'``, ``'py_rb'``, ``'py_tpsa'``, ``'py_clogp'``,
        ``'py_nar'``, ``'py_nhba'``, ``'py_nhbd'``).
    :return: error flag - ``True`` when nothing was computed (no properties
        requested or no molecule available), ``False`` on success.
    """
    # ``None`` replaces the former mutable ``[]`` default.
    if props is None or len(props) == 0:
        return True
    if mol is None:
        mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return True

    if 'py_formula' in props:
        self.data['py_formula'] = desc.CalcMolFormula(mol)
    if 'py_em' in props:
        self.data['py_em'] = round(desc.CalcExactMolWt(mol), 5)
    if 'py_n_Cl_Br' in props:
        # combined count of chlorine and bromine atoms
        self.data['py_n_Cl_Br'] = sum(
            1 for atom in mol.GetAtoms()
            if atom.GetSymbol() in ('Cl', 'Br'))
    if 'py_na' in props:
        self.data['py_na'] = mol.GetNumAtoms()
    if 'py_mw' in props:
        self.data['py_mw'] = desc._CalcMolWt(mol)
    if 'py_fsp3' in props:
        self.data['py_fsp3'] = desc.CalcFractionCSP3(mol)
    if 'py_rb' in props:
        self.data['py_rb'] = desc.CalcNumRotatableBonds(mol)
    if 'py_tpsa' in props:
        self.data['py_tpsa'] = desc.CalcTPSA(mol)
    if 'py_clogp' in props:
        # CalcCrippenDescriptors returns a tuple; the first entry is used
        self.data['py_clogp'] = desc.CalcCrippenDescriptors(mol)[0]
    if 'py_nar' in props:
        self.data['py_nar'] = desc.CalcNumAromaticRings(mol)
    if 'py_nhba' in props:
        self.data['py_nhba'] = desc.CalcNumHBA(mol)
    if 'py_nhbd' in props:
        self.data['py_nhbd'] = desc.CalcNumHBD(mol)
    return False
def choose_largest_fragment(mol):
    """Return the largest covalent unit.

    The largest fragment is determined by number of atoms (including
    hydrogens). Ties are broken by taking the fragment with the higher
    molecular weight, and then by taking the first alphabetically by SMILES
    if needed.

    :param mol: The molecule to choose the largest fragment from.
    :type mol: :rdkit:`Mol <Chem.rdchem.Mol-class.html>`
    :return: The largest fragment, or ``mol`` itself when it contains no
        fragments at all.
    :rtype: :rdkit:`Mol <Chem.rdchem.Mol-class.html>`
    """
    # TODO: Alternatively allow a list of fragments to be passed as the mol parameter
    fragments = Chem.GetMolFrags(mol, asMols=True)
    largest = None
    for f in fragments:
        smiles = Chem.MolToSmiles(f, isomericSmiles=True)
        # Count atoms, including the hydrogens attached to each one
        atoms = sum(1 + a.GetTotalNumHs() for a in f.GetAtoms())
        # Skip this fragment if fewer atoms than the largest
        if largest and atoms < largest['atoms']:
            continue
        # Skip this fragment if equal number of atoms but weight is lower
        weight = rdMolDescriptors.CalcExactMolWt(f)
        if largest and atoms == largest['atoms'] and weight < largest['weight']:
            continue
        # Skip this fragment if equal atoms and equal weight but smiles
        # comes last alphabetically
        if (largest and atoms == largest['atoms']
                and weight == largest['weight']
                and smiles > largest['smiles']):
            continue
        # Otherwise this is the largest so far
        largest = {
            'smiles': smiles,
            'fragment': f,
            'atoms': atoms,
            'weight': weight,
        }
    if largest is None:
        # No fragments were produced (e.g. an empty molecule); indexing
        # None would raise an opaque TypeError, so hand the input back.
        return mol
    return largest['fragment']
def computeFeatures(mol):
    """Assemble a flat feature list for one molecule.

    The list starts with the ring count, eight element counts, the double
    bond count, the Labute surface area and the exact molecular weight,
    followed by every SlogP_VSA value and then every value of the mapping
    returned by ``recurseMolHCount``.

    :param mol: RDKit molecule.
    :return: list of feature values.
    """
    ring_total = rdMolDescriptors.CalcNumRings(mol)
    # Computed as in the original, but not part of the returned features.
    rotatable_total = rdMolDescriptors.CalcNumRotatableBonds(mol)

    element_totals = [
        countNitrogens(mol),
        countOxygens(mol),
        countCarbons(mol),
        countBorons(mol),
        countPhos(mol),
        countSulfurs(mol),
        countFluorine(mol),
        countIodine(mol),
    ]
    double_bond_total = countDoubleBonds(mol)
    labute_area = rdMolDescriptors.CalcLabuteASA(mol)
    exact_weight = rdMolDescriptors.CalcExactMolWt(mol)
    slogp_bins = rdMolDescriptors.SlogP_VSA_(mol)
    hydrogen_dist = recurseMolHCount(mol)

    features = [ring_total]
    features.extend(element_totals)
    features.extend([double_bond_total, labute_area, exact_weight])
    features.extend(slogp_bins)
    features.extend(hydrogen_dist[key] for key in hydrogen_dist)
    return features
def feature_fp(smiles):
    """Build a numeric feature vector for a SMILES string.

    The vector is the MQN descriptor list followed by a fixed series of
    scalar RDKit descriptors.

    :param smiles: SMILES string of the molecule.
    :return: ``numpy`` array of the feature values.
    """
    mol = Chem.MolFromSmiles(smiles)
    values = rdMolDescriptors.MQNs_(mol)

    # Applied in order. The rotatable-bond count appears twice, exactly as
    # in the original feature layout.
    scalar_descriptors = (
        rdMolDescriptors.CalcNumRotatableBonds,
        rdMolDescriptors.CalcExactMolWt,
        rdMolDescriptors.CalcNumRotatableBonds,
        rdMolDescriptors.CalcFractionCSP3,
        rdMolDescriptors.CalcNumAliphaticCarbocycles,
        rdMolDescriptors.CalcNumAliphaticHeterocycles,
        rdMolDescriptors.CalcNumAliphaticRings,
        rdMolDescriptors.CalcNumAromaticCarbocycles,
        rdMolDescriptors.CalcNumAromaticHeterocycles,
        rdMolDescriptors.CalcNumAromaticRings,
        rdMolDescriptors.CalcNumBridgeheadAtoms,
        rdMolDescriptors.CalcNumRings,
        rdMolDescriptors.CalcNumAmideBonds,
        rdMolDescriptors.CalcNumHeterocycles,
        rdMolDescriptors.CalcNumSpiroAtoms,
        rdMolDescriptors.CalcTPSA,
    )
    for descriptor in scalar_descriptors:
        values.append(descriptor(mol))
    return np.array(values)
def calculate_scalar_descriptors(molecule, symbols):
    """Collect scalar descriptors of a molecule into one array.

    The result holds, in order: asphericity, the Crippen descriptors, exact
    weight, eccentricity, fraction of sp3 carbons, Labute ASA, NPR1, NPR2,
    Hall-Kier alpha, the H/C/N/O/F counts taken from ``symbols``, eight
    ring-related counts, and the amide-bond, H-bond acceptor and H-bond
    donor counts.

    :param molecule: RDKit molecule.
    :param symbols: sequence of element symbols, one per atom.
    :return: ``numpy`` array of the descriptor values.
    """
    values = [rdMD.CalcAsphericity(molecule)]
    values.extend(rdMD.CalcCrippenDescriptors(molecule))
    for shape_descriptor in (
            rdMD.CalcExactMolWt,
            rdMD.CalcEccentricity,
            rdMD.CalcFractionCSP3,
            rdMD.CalcLabuteASA,
            rdMD.CalcNPR1,
            rdMD.CalcNPR2,
            rdMD.CalcHallKierAlpha,
    ):
        values.append(shape_descriptor(molecule))

    # elemental distribution
    symbol_array = np.array(symbols)
    for element in ('H', 'C', 'N', 'O', 'F'):
        values.append(np.sum(symbol_array == element))

    # ring features, then other counts
    for count_descriptor in (
            rdMD.CalcNumAliphaticCarbocycles,
            rdMD.CalcNumAliphaticHeterocycles,
            rdMD.CalcNumAromaticCarbocycles,
            rdMD.CalcNumAromaticHeterocycles,
            rdMD.CalcNumSaturatedCarbocycles,
            rdMD.CalcNumSaturatedHeterocycles,
            rdMD.CalcNumSpiroAtoms,  # atom shared between rings with one bond
            rdMD.CalcNumBridgeheadAtoms,  # atom shared between rings with at least two bonds
            rdMD.CalcNumAmideBonds,
            rdMD.CalcNumHBA,  # number of hydrogen acceptors
            rdMD.CalcNumHBD,  # number of hydrogen donors
    ):
        values.append(count_descriptor(molecule))

    return np.array(values)
def _select_fa_row(fa_df, pl_info_dct, sn):
    """Return the single fatty-acid table row matching one sn position.

    ``sn`` is the key prefix (``'sn1'`` or ``'sn2'``) used to read link,
    carbon number, double-bond number and omega type from ``pl_info_dct``.
    """
    link = pl_info_dct['%s_link' % sn]
    c_num = int(pl_info_dct['%s_c_num' % sn])
    db_num = int(pl_info_dct['%s_db_num' % sn])
    omega_type = int(pl_info_dct['%s_omega_type' % sn])

    # Every clause must be joined with "and"; the sn1 omega branch used to
    # omit one, producing a malformed query expression.
    query_code = 'Link == "%s" and C == % i and DB == %i' % (
        link, c_num, db_num)
    matched_df = fa_df.query(query_code)
    if omega_type == 0:
        return matched_df.head(1)
    return matched_df.query(
        'Link == "%s" and omega == %i' % (link, omega_type)).head(1)


def _lpp_dct_to_mol(_lpp_dct, mzcalc, fingerprint_gen):
    """Build the annotated RDKit molecule for one LPP record.

    Adds ``PRECURSOR_JSON``, ``EXACT_MASS`` and ``FINGERPRINT`` to
    ``_lpp_dct`` in place and copies every entry of the dict onto the
    returned molecule as a string property.
    """
    _lpp_smiles = str(_lpp_dct['LPP_SMILES'])
    _lpp_mol = Chem.MolFromSmiles(_lpp_smiles)
    AllChem.Compute2DCoords(_lpp_mol)
    _lpp_mol.SetProp('_Name', str(_lpp_dct['LM_ID']))
    _lpp_mass = Descriptors.MolWt(_lpp_mol)
    _lpp_exactmass = rdMolDescriptors.CalcExactMolWt(_lpp_mol)
    _lpp_formula = rdMolDescriptors.CalcMolFormula(_lpp_mol)
    _lpp_mol.SetProp('EXACT_MASS', '%.6f' % _lpp_exactmass)
    _lpp_mol.SetProp('NOMINAL_MASS', '%.3f' % _lpp_mass)
    _lpp_mol.SetProp('FORMULA', _lpp_formula)

    # PC species use the formate adduct unless the sn2 chain ends with the
    # given carboxylic-acid SMILES suffix; everything else uses [M-H]-.
    _lpp_sn2_smi = _lpp_dct['SN2_SMILES']
    if (str(_lpp_dct['LPP_CLASS']) == 'PC'
            and _lpp_sn2_smi[-9:] != r'C(O)=O)=O'):
        _charge = '[M+HCOO]-'
    else:
        _charge = '[M-H]-'
    _lpp_neg_precursor_elem = mzcalc.get_elements(_lpp_formula)
    _lpp_neg_precursor_formula = mzcalc.get_formula(
        _lpp_neg_precursor_elem, charge=_charge)
    _lpp_neg_precursor_mz = mzcalc.get_mono_mz(_lpp_formula, charge=_charge)
    _lpp_neg_precursor_info = '{"%s": ["%s", %f]}' % (
        _charge, _lpp_neg_precursor_formula[0], _lpp_neg_precursor_mz)

    _lpp_dct['PRECURSOR_JSON'] = _lpp_neg_precursor_info
    _lpp_mol.SetProp('PRECURSOR_JSON', _lpp_neg_precursor_info)
    _lpp_dct['EXACT_MASS'] = _lpp_exactmass
    fp_mz_lst = fingerprint_gen.get_fingerprint(_lpp_dct)
    _lpp_dct['FINGERPRINT'] = fp_mz_lst
    _lpp_mol.SetProp('FINGERPRINT', json.dumps(fp_mz_lst))

    # Copy every record entry onto the molecule; this overwrites the
    # formatted properties set above with their str() representation.
    for _k in _lpp_dct.keys():
        _lpp_mol.SetProp(_k, str(_lpp_dct[_k]))
    return _lpp_mol


def theolpp(usr_params):
    """
    Generate theoretical oxidized phospholipids and export them.

    For every phospholipid of the selected class the sn1 and sn2 fatty acids
    are oxidized, all allowed sn1/sn2 combinations are merged into LPP
    structures, and the unique structures are written to an SDF file
    (optionally also to an MSP spectra file) followed by xlsx summaries.

    param_dct = {'lipid_class': lipid_class, 'ox_level': ox_level,
                 'oap_mode': oap_mode, 'ocp_mode': ocp_mode,
                 'lyso_oap_mode': lyso_oap_mode, 'lyso_ocp_mode': lyso_ocp_mode,
                 'ox_max': ox_max, 'keto_max': keto_max, 'ooh_max': ooh_max,
                 'epoxy_max': epoxy_max, 'lipid_lst_path': lipid_lst_path,
                 'lipid_tab': lipid_tab, 'prostane_mode': prostane_mode,
                 'ox_prostane_mode': ox_prostane_mode, 'sdf_path': sdf_path,
                 'msp_mode': msp_mode, 'msp_path': msp_path,
                 'mod_lst_path': mod_lst_path, 'fa_lst_path': fa_lst_path,
                 'prostane_mod_path': prostane_mod_path,
                 'prostane_abbr_path': prostane_abbr_path,
                 'frag_pattern_path': frag_pattern_path}

    The key 'pl_hg_path' is read as well.

    :param usr_params: dict with the keys listed above.
    :return: tuple of two status strings (number of LPP generated, number of
        phospholipids processed with the elapsed time).
    """
    # NOTE(review): time.clock, read_excel(sheetname=...) and pd.Panel are
    # not available in recent Python / pandas releases; they are kept so the
    # function keeps working on the stack it was written for.
    t_start = time.clock()

    pl_table = usr_params['lipid_lst_path']
    fa_table = usr_params['fa_lst_path']
    mod_table = usr_params['mod_lst_path']
    isop_cfg = usr_params['prostane_mod_path']
    isopabbr_cfg = usr_params['prostane_abbr_path']
    # pl_class_use_lst = ['PA', 'PC', 'PE', 'PG', 'PI', 'PIP', 'PS']
    pl_class = usr_params['lipid_class']
    pl_class_use_lst = [pl_class]
    ox_level = usr_params['ox_level']
    oap_mode = usr_params['oap_mode']
    ocp_mode = usr_params['ocp_mode']
    lyso_oap_mode = usr_params['lyso_oap_mode']
    lyso_ocp_mode = usr_params['lyso_ocp_mode']
    ox_max = usr_params['ox_max']
    keto_max = usr_params['keto_max']
    ooh_max = usr_params['ooh_max']
    epoxy_max = usr_params['epoxy_max']
    prostane_mode = usr_params['prostane_mode']
    prostane_ox_mode = usr_params['ox_prostane_mode']
    save_sdf = usr_params['sdf_path']
    save_spectra = usr_params['msp_mode']
    save_msp = usr_params['msp_path']
    score_xlsx = usr_params['frag_pattern_path']
    pl_fp_xlsx = usr_params['pl_hg_path']

    pl_df = pd.read_excel(pl_table, sheetname=usr_params['lipid_tab'])
    fa_df = pd.read_csv(fa_table, index_col=0)
    print(pl_df.head())

    # Select export species OAP, OCP, Lyso OAP, Lyso OCP
    ban_lst = ['LYSOLYSO']
    if oap_mode == 0:
        ban_lst.extend(['UNMODOAP', 'OAPUNMOD', 'OAPOAP'])
    if ocp_mode == 0:
        ban_lst.extend(['UNMODOCP', 'OCPUNMOD', 'OCPOCP'])
    if lyso_oap_mode == 0:
        ban_lst.extend(['LYSOOAP', 'OAPLYSO'])
    if lyso_ocp_mode == 0:
        ban_lst.extend(['LYSOOCP', 'OCPLYSO'])
    if ox_level == 1:
        ban_lst.extend(
            ['OAPOAP', 'OCPOCP', 'OAPOCP', 'OCPOAP', 'OAPUNMOD', 'OCPUNMOD'])

    ox_param_dct = {
        'MAX_MOD': ox_max,
        'MAX_KETO': keto_max,
        'MAX_OOH': ooh_max,
        'MAX_EPOXY': epoxy_max
    }

    # sdf_writer = Chem.SDWriter(open(save_sdf, mode='w'))
    if save_spectra == 1 and len(save_msp) > 0:
        msp_obj = open(save_msp, mode='w')
    else:
        msp_obj = None

    sdf_dct = {}
    parser = PLParser()
    abbr_gen = AbbrGenerator()
    frag_gen = TheoFrag(pl_class, score_xlsx)
    fingerprint_gen = FingerprintGen(pl_fp_xlsx)
    c_lst = []
    fa_lpp_df_dct = {}
    sum_theo_lpp_dct = {}

    for (_idx, _row) in pl_df.iterrows():
        _pl_abbr = str(_row['phospholipids'])
        _pl_elem_lst, pl_info_dct = parser.get_composition(_pl_abbr)
        print('PL composition ==>', _pl_elem_lst)
        _pl_hg_abbr = _pl_elem_lst[0]

        # get smiles from abbr
        if _pl_hg_abbr in pl_class_use_lst:
            c_lst.append(_pl_abbr)
            # prepare output
            _pl_lpp_df = pd.DataFrame()
            print('Start oxidation of ==>', _pl_abbr)
            _pl_sn1_abbr = _pl_elem_lst[1]
            _pl_sn2_abbr = _pl_elem_lst[2]

            if len(pl_info_dct.keys()) > 0:
                sn1_fa_df = _select_fa_row(fa_df, pl_info_dct, 'sn1')
                sn2_fa_df = _select_fa_row(fa_df, pl_info_dct, 'sn2')
                _pl_sn1_smiles = sn1_fa_df.loc[_pl_sn1_abbr, 'SMILES']
                _pl_sn2_smiles = sn2_fa_df.loc[_pl_sn2_abbr, 'SMILES']
                print('sn1 =>', _pl_sn1_smiles, '|| sn2 =>', _pl_sn2_smiles)
            else:
                _pl_sn1_smiles = ''
                _pl_sn2_smiles = ''

            # check if FA already oxidized to speed up
            if _pl_sn1_abbr in fa_lpp_df_dct.keys():
                sn1_mod_sum_df = fa_lpp_df_dct[_pl_sn1_abbr]
            else:
                sn1_link_dct = fa_link_filter(_pl_sn1_smiles)
                sn1_mod_sum_df = oxidizer(sn1_link_dct, mod_table, isop_cfg,
                                          isopabbr_cfg, ox_level,
                                          ox_param_dct, prostane_mode,
                                          prostane_ox_mode)
                fa_lpp_df_dct[_pl_sn1_abbr] = sn1_mod_sum_df.copy()

            if _pl_sn2_abbr in fa_lpp_df_dct.keys():
                sn2_mod_sum_df = fa_lpp_df_dct[_pl_sn2_abbr]
            else:
                sn2_link_dct = fa_link_filter(_pl_sn2_smiles)
                sn2_mod_sum_df = oxidizer(sn2_link_dct, mod_table, isop_cfg,
                                          isopabbr_cfg, ox_level,
                                          ox_param_dct, prostane_mode,
                                          prostane_ox_mode)
                fa_lpp_df_dct[_pl_sn2_abbr] = sn2_mod_sum_df.copy()

            for (_sn1_idx, _sn1_row) in sn1_mod_sum_df.iterrows():
                _sn1_mod_smiles = _sn1_row['FULL_SMILES']
                _sn1_abbr_str = _sn1_row['FA_ABBR']
                _sn1_typ_str = _sn1_row['FA_TYPE']
                _sn1_formula_str = _sn1_row['FA_FORMULA']

                for (_sn2_idx, _sn2_row) in sn2_mod_sum_df.iterrows():
                    _sn2_mod_smiles = _sn2_row['FULL_SMILES']
                    _sn2_abbr_str = _sn2_row['FA_ABBR']
                    _sn2_typ_str = _sn2_row['FA_TYPE']
                    _sn2_formula_str = _sn2_row['FA_FORMULA']

                    _oap_ocp_lst = [_sn1_typ_str, _sn2_typ_str]
                    _lpp_typ = ''.join(_oap_ocp_lst)

                    if _lpp_typ not in ban_lst:
                        _lpp_smiles = LPPmerge.pl_lpp(_pl_hg_abbr,
                                                      sn1=_sn1_mod_smiles,
                                                      sn2=_sn2_mod_smiles)
                        _lpp_id_str = str(''.join([
                            _pl_hg_abbr, '(', _sn1_abbr_str, '/',
                            _sn2_abbr_str, ')'
                        ]))
                        _lpp_sub_class_json = '{"SN1": "%s", "SN2": "%s"}' % (
                            _sn1_typ_str, _sn2_typ_str)
                        _lpp_info_dct = {
                            'LPP_ORIGIN': _pl_abbr,
                            'LPP_SMILES': _lpp_smiles,
                            'LPP_CLASS': _pl_hg_abbr,
                            'SN1_SMILES': _sn1_mod_smiles,
                            'SN2_SMILES': _sn2_mod_smiles,
                            'SN1_ABBR': _sn1_abbr_str,
                            'SN2_ABBR': _sn2_abbr_str,
                            'SN1_JSON': _sn1_row['FA_JSON'],
                            'SN2_JSON': _sn2_row['FA_JSON'],
                            'SN1_FORMULA': _sn1_formula_str,
                            'SN2_FORMULA': _sn2_formula_str,
                            'LM_ID': _lpp_id_str,
                            'SN_JSON': _lpp_sub_class_json
                        }
                        if save_spectra == 1:
                            _lpp_info_dct['MSP_JSON'] = frag_gen.calc_frags(
                                _lpp_info_dct)

                        _lpp_info_se = pd.Series(data=_lpp_info_dct)
                        _pl_lpp_df[_lpp_id_str] = _lpp_info_se

                        # check if same lpp generated already
                        # Currently use bulk settings
                        if _lpp_id_str in sdf_dct.keys():
                            _lpp_origin = sdf_dct[_lpp_id_str]['LPP_ORIGIN']
                            _lpp_origin_lst = _lpp_origin.split(',')
                            if _pl_abbr in _lpp_origin_lst:
                                pass
                            else:
                                _lpp_origin_lst.append(_pl_abbr)
                            sdf_dct[_lpp_id_str]['LPP_ORIGIN'] = ','.join(
                                _lpp_origin_lst)
                        else:
                            sdf_dct[_lpp_id_str] = _lpp_info_dct.copy()

                        # clean memory by deleting these dicts and series
                        del _lpp_info_dct, _lpp_info_se

            # generate summary table
            _pl_lpp_df = _pl_lpp_df.transpose()
            print('==> %i of LPP generated !!' % _pl_lpp_df.shape[0])
            print('==> ==> Move to next lipid==> ')
            # print(_pl_lpp_df.head())
            sum_theo_lpp_dct[_pl_abbr] = _pl_lpp_df

            # create sdf
            # for (_lpp_i, _lpp_r) in _pl_lpp_df.iterrows():

    sum_theo_lpp_pl = pd.Panel(data=sum_theo_lpp_dct)
    print(sum_theo_lpp_pl.shape)

    # write to sdf
    print('==>Start to generate SDF ==> MSP mode = %i' % save_spectra)
    print('!! %i structures in total !!' % len(sdf_dct.keys()))
    mzcalc = MZcalc()
    sdf_writer = Chem.SDWriter(open(save_sdf, mode='w'))

    if save_spectra == 1:
        for _k_lpp in sdf_dct.keys():
            _lpp_dct = sdf_dct[_k_lpp]
            # only export species for which fragments were predicted
            if len(json.loads(_lpp_dct['MSP_JSON']).keys()) > 0:
                _lpp_mol = _lpp_dct_to_mol(_lpp_dct, mzcalc, fingerprint_gen)
                sdf_writer.write(_lpp_mol)
                if save_spectra == 1 and len(save_msp) > 0:
                    MSPcreator.to_msp(msp_obj, _lpp_dct)
    elif save_spectra == 0:
        for _k_lpp in sdf_dct.keys():
            _lpp_dct = sdf_dct[_k_lpp]
            _lpp_mol = _lpp_dct_to_mol(_lpp_dct, mzcalc, fingerprint_gen)
            sdf_writer.write(_lpp_mol)

    sdf_writer.close()
    if save_spectra == 1 and len(save_msp) > 0:
        msp_obj.close()

    SDFsummary.sdf2xlsx(save_sdf, str(save_sdf)[:-4] + '.xlsx')
    # if save_spectra == 1:
    SDFsummary.sdf2sum_fa(save_sdf, str(save_sdf)[:-4] + '_FA_SUM.xlsx')

    t_spent = time.clock() - t_start
    info_updater_1 = '=>%i of LPP generated ==> ' % len(sdf_dct.keys())
    info_updater_2 = '=>==> %i of phospholipids processed in %.3fs ==> ==> Finished !!!!!!' % (
        len(c_lst), t_spent)
    return info_updater_1, info_updater_2
def mol_weight_from_smiles(smile):
    """Return the exact molecular weight RDKit computes for a SMILES string.

    :param smile: SMILES string of the molecule.
    :return: value of ``CalcExactMolWt`` for the parsed molecule.
    """
    parsed = Chem.MolFromSmiles(smile)
    exact_weight = rdMolDescriptors.CalcExactMolWt(parsed)
    return exact_weight
def HasMatch(self, mol):
    """Report whether the molecule's exact weight is outside the window.

    :param mol: RDKit molecule to test.
    :return: ``True`` when the exact weight is not within
        ``[self.minMw, self.maxMw]``.
    """
    weight = rdMolDescriptors.CalcExactMolWt(mol)
    inside_window = self.minMw <= weight <= self.maxMw
    # The parse of an invalid SMILES and the error-log call are kept with
    # their results discarded - presumably deliberate, to emit log output
    # from within the matcher; confirm before removing.
    Chem.MolFromSmiles("---")
    Chem.LogErrorMsg("dasfsadf")
    return not inside_window
def MW(self) -> float:
    """Exact molecular weight of ``self.mol``, rounded to the nearest integer."""
    exact_weight = rdMolDescriptors.CalcExactMolWt(self.mol)
    return round(exact_weight)
def _read_pdb_text(path):
    """Return the full text of the file at *path*, closing the handle."""
    with open(path, 'r') as handle:
        return handle.read()


def _select_mcs_neighbours(perturbations_all_paths, mcs_neighbours):
    """Pick the ligands most similar to the query ligand of the first pair.

    Returns up to ``mcs_neighbours`` RDKit molecules ordered by decreasing
    Dice fingerprint similarity; ligands whose similarity value equals that
    of an already kept ligand are dropped.
    """
    # generate fingerprints for all present ligands:
    perturbations_unnested = list(
        itertools.chain.from_iterable(perturbations_all_paths))
    all_members = []
    all_members_FPs = []
    for member in perturbations_unnested:
        member_mol = rdmolfiles.MolFromPDBBlock(_read_pdb_text(member))
        all_members.append(member_mol)
        all_members_FPs.append(FingerprintMols.FingerprintMol(member_mol))

    first_pair = perturbations_all_paths[0]
    candidate1 = rdmolfiles.MolFromPDBBlock(_read_pdb_text(first_pair[0]))
    candidate2 = rdmolfiles.MolFromPDBBlock(_read_pdb_text(first_pair[1]))

    # NOTE(review): the original comment says the ligand with the lowest
    # weight is used as query, but the comparison selects the one whose
    # weight is greater or equal; the code's behavior is kept - confirm.
    size_member1 = rdMolDescriptors.CalcExactMolWt(candidate1)
    size_member2 = rdMolDescriptors.CalcExactMolWt(candidate2)
    if size_member1 >= size_member2:
        query_member = FingerprintMols.FingerprintMol(candidate1)
    else:
        query_member = FingerprintMols.FingerprintMol(candidate2)

    similarities = [
        AllChem.DataStructs.FingerprintSimilarity(
            query_member, target_fp, metric=DataStructs.DiceSimilarity)
        for target_fp in all_members_FPs
    ]
    similarities_to_query = dict(zip(all_members, similarities))

    # of all the ligands and their computed similarities, keep the given top
    # amount (i.e. mcs_neighbours variable):
    mol_similarities = dict(
        sorted(similarities_to_query.items(),
               key=lambda kv: kv[1],
               reverse=True))
    ordered_mol_similarities = {}
    for key, value in mol_similarities.items():
        if value not in ordered_mol_similarities.values():
            ordered_mol_similarities[key] = value

    # output the picked most similar ligands as rdkit molecule objects:
    similar_hits = list(ordered_mol_similarities)
    return similar_hits[:mcs_neighbours]


def build_reactions(perturbations_all_paths, mcs_neighbours):
    """Describe each perturbation pair as a reaction SMILES.

    For every pair of ligand PDB paths a maximum common substructure (MCS)
    is computed from the selected neighbour ligands, removed from both pair
    members, and the remaining fragments are joined as ``"A>>B"``.

    :param perturbations_all_paths: sequence of ``(path_A, path_B)`` pairs
        pointing to ligand PDB files.
    :param mcs_neighbours: number of most similar ligands used for the MCS.
    :return: list of ``[perturbation, reaction, member1, member2]`` entries,
        or ``None`` as soon as an MCS pattern cannot be generated.
    """
    perturbation_reactions = []
    if len(perturbations_all_paths) == 0:
        return perturbation_reactions

    # The neighbour selection does not depend on the current pair, so it is
    # computed once instead of re-reading every ligand file per iteration.
    neighbours = _select_mcs_neighbours(perturbations_all_paths,
                                        mcs_neighbours)

    # loop over each perturbation in the list and load the pdb files:
    for perturbation_pair_path in perturbations_all_paths:
        # regenerate the perturbation (A>B):
        ligA = perturbation_pair_path[0].replace(
            "../fesetup/poses/", "").replace("/ligand.pdb", "")
        ligB = perturbation_pair_path[1].replace(
            "../fesetup/poses/", "").replace("/ligand.pdb", "")
        perturbation = str(ligA) + ">" + str(ligB)

        # read in PDB files:
        member1 = rdmolfiles.MolFromPDBBlock(
            _read_pdb_text(perturbation_pair_path[0]))
        member2 = rdmolfiles.MolFromPDBBlock(
            _read_pdb_text(perturbation_pair_path[1]))

        # generate MCS (taking into account substitutions in ring
        # structures) using the neighbours list:
        print("##########")
        print(str(perturbation) + ":")
        MCS_object = rdFMCS.FindMCS(neighbours, completeRingsOnly=True)
        MCS_SMARTS = Chem.MolFromSmarts(MCS_object.smartsString)
        if MCS_SMARTS is None:
            print("Could not generate MCS pattern")
            # NOTE(review): this aborts the whole run and returns None,
            # discarding the reactions collected so far - confirm intended.
            return

        # use SMARTS pattern to isolate unique patterns in each pair member
        # if multiple unique patterns exist in one molecule they are written
        # as: pattern1.pattern2 ('.' signifies a non-bonded connection)
        member1_stripped = AllChem.DeleteSubstructs(member1, MCS_SMARTS)
        member2_stripped = AllChem.DeleteSubstructs(member2, MCS_SMARTS)
        member1_stripped_smiles = Chem.MolToSmiles(member1_stripped)
        member2_stripped_smiles = Chem.MolToSmiles(member2_stripped)

        # construct SMILES string from the two members:
        reaction = str(member1_stripped_smiles) + ">>" + str(
            member2_stripped_smiles)
        print(reaction)

        # combine all results (name of perturbation, reaction SMILES,
        # ligand A SMILES and ligand B SMILES)
        perturbation_reactions.append([
            perturbation, reaction,
            str(member1_stripped_smiles),
            str(member2_stripped_smiles)
        ])
    return perturbation_reactions
def HasMatch(self, mol):
    """Report whether the molecule's exact weight is outside the window.

    :param mol: RDKit molecule to test.
    :return: ``True`` when the exact weight is not within
        ``[self.minMw, self.maxMw]``.
    """
    weight = rdMolDescriptors.CalcExactMolWt(mol)
    inside_window = self.minMw <= weight <= self.maxMw
    return not inside_window
lipinski_violators = [] counter = 0 print("Scanning molecules for Lipinski violations.") for mol in tqdm(mols): # Assume no violations dono_viol = False acceptor_viol = False mw_viol = False logp_viol = False # Use RDKit functions to get hdonors, acceptors, molecular weight and # logP. hdonors = Lipinski.NHOHCount(mol) hacceptors = Lipinski.NOCount(mol) mw = rdMolDescriptors.CalcExactMolWt(mol) logp = Crippen.MolLogP(mol) # Make the checks if the current mol actually violates a role. if hdonors > 5: dono_viol = True if hacceptors > 10: acceptor_viol = True if mw > 500: mw_viol = True if logp > 5: logp_viol = True # Check if the violation sum is greater than one and assign the molecule # as a violator saving the index to a list. if sum([dono_viol, acceptor_viol, mw_viol, logp_viol]) > 1:
def compute_Wt(self, mol_input):
    """Return ``rdMolDescriptors.CalcExactMolWt`` evaluated on ``mol_input``.

    The instance itself is not consulted; the method only forwards the
    molecule to the descriptor function and hands back its result unchanged.
    """
    exact_weight = rdMolDescriptors.CalcExactMolWt(mol_input)
    return exact_weight
# NOTE(review): this bare `return` is the tail of a definition that begins
# outside this part of the file; it is kept verbatim.
return


def check_maxes(formd, maxes):
    """Return True when every value in `formd` is strictly below its cap.

    For each key `e` of `formd`, the value is compared with `maxes[e]`, so a
    key that is missing from `maxes` raises KeyError.
    """
    # The list is fully built before `all` runs, so every key is looked up
    # even if an earlier comparison was already False.
    bools = [v < maxes[e] for e, v in formd.items()]
    return all(bools)


def rec_formula(mz, ppm=5):
    """Set up the search window around `mz` and delegate to `_rec_form`.

    The window is `mz` plus/minus `mz * ppm * 1e-6`. The starting formula maps
    every key of the module-level `eles` to 0, and the per-key caps come from
    `get_elemaxs(mz)` converted to a dict.
    """
    maxes = dict(get_elemaxs(mz))
    error = mz * (ppm * 1E-6)
    mlow, mhigh = mz - error, mz + error
    formula = {e: 0 for e in eles.keys()}
    # NOTE(review): four arguments are passed here, but `_rec_form` declares
    # five parameters (`ele_idx` first); as written this call raises TypeError.
    return _rec_form(formula, mlow, mhigh, maxes)


def _rec_form(ele_idx, formula, mlow, mhigh, maxes):
    """Generator that yields `formula` when it passes the checks below.

    NOTE(review): this looks unfinished -- `ele_idx` is never used and no
    recursion or iteration over elements happens yet.
    """
    good_form = check_formula(formula, mlow, mhigh)
    # NOTE(review): the result of `check_formula` is immediately overwritten,
    # so only the `check_maxes` result decides which branch runs.
    good_form = True
    under_maxes = check_maxes(formula, maxes)
    if good_form and under_maxes:
        yield formula
    else:
        # NOTE(review): `ele` is not bound anywhere in this function; unless it
        # is a module-level name, this lookup raises NameError.
        formula[ele]
        pass


# Script-level statements: fetch molecules from `get_soome_mols()` and compute,
# for each one, `CalcExactMolWt` and `CalcMolFormula`.
# NOTE(review): assumed to be module level rather than part of `_rec_form` --
# confirm against the original layout.
sm = get_soome_mols()
masses = [rdMolDescriptors.CalcExactMolWt(m) for m in sm]
formulas = [rdMolDescriptors.CalcMolFormula(m) for m in sm]
def formatdb(smiles):
    """Build an annotated, tab-separated structure table from a SMILES file.

    The headerless, tab-separated file at ``smiles`` is read (column 0 holds
    the SMILES strings, column 1 the identifiers) and then deleted. For every
    structure the InChI, the InChIKey (whole, plus its first and second
    dash-separated blocks), the molecular formula and the exact mass are
    derived. The classification table returned by ``query_inchikey`` is
    left-joined on the InChIKey, rows whose structure could not be processed
    are removed, and the result is written to ``smiles + '_FORMATED.txt'``.

    Args:
        smiles: Path of the input file. The file is removed right after it
            has been read.

    Returns:
        The string ``'Done'``.
    """
    df = pd.read_csv(smiles, sep='\t', header=None)
    os.remove(smiles)
    smiles_list = list(df[0])
    mols = [Chem.MolFromSmiles(x) for x in smiles_list]

    inchi = []
    ikeys = []
    ikey1 = []
    ikey2 = []
    form = []
    exmass = []
    for mol in mols:
        # Compute every value first and append afterwards: appending inside
        # the try block would leave the parallel lists with different lengths
        # whenever a later step fails for the same molecule.
        try:
            mol_inchi = Chem.rdinchi.MolToInchi(mol)[0]
            ikey = Chem.rdinchi.InchiToInchiKey(mol_inchi)
            key_blocks = ikey.split('-')
            row = (mol_inchi, ikey, key_blocks[0], key_blocks[1],
                   rdMD.CalcMolFormula(mol), rdMD.CalcExactMolWt(mol))
        except Exception:
            # Best effort: an unparsable structure becomes an empty row that
            # is filtered out before writing. KeyboardInterrupt/SystemExit
            # are deliberately not swallowed.
            row = ('', '', '', '', '', '')
        inchi.append(row[0])
        ikeys.append(row[1])
        ikey1.append(row[2])
        ikey2.append(row[3])
        form.append(row[4])
        exmass.append(row[5])

    data = {
        'inchikey': ikeys,
        'MonoisotopicMass': exmass,
        'InChI': inchi,
        'SMILES': smiles_list,
        'Identifier': list(df[1]),
        'InChIKey2': ikey2,
        'InChIKey1': ikey1,
        'MolecularFormula': form,
    }
    cn = [
        "inchikey", "MonoisotopicMass", "InChI", "SMILES", "Identifier",
        "InChIKey2", "InChIKey1", "MolecularFormula"
    ]
    formdata = pd.DataFrame(data, columns=cn)

    classy = query_inchikey(ikeys)
    # If the structure do not show a classification, try query
    # in_process = get_class(list(df[0]), chunksize=100)
    # classy = poll(in_process)
    classy = classy[['inchikey', 'kingdom', 'superclass', 'class', 'subclass']]
    classy.columns = [
        'inchikey', 'kingdom_name', 'superclass_name', 'class_name',
        'subclass_name'
    ]

    formfinal = pd.merge(formdata, classy, how='left', on=['inchikey'])
    formfinal = formfinal.fillna('')
    # Remove failed structures by key value rather than by list position, so
    # the filter stays correct even if the merge duplicates rows.
    formfinal = formfinal[formfinal['inchikey'] != '']
    formfinal = formfinal.drop('inchikey', axis=1)
    formfinal.to_csv(smiles + '_FORMATED.txt', index=False, sep='\t')
    return 'Done'
def _add_module_features(feature_df, molecules, members, bad_features):
    """Add one column per public, non-excluded descriptor function in `members`."""
    for name, func in members:
        # Skip private helpers (leading underscore) and excluded features.
        if name.startswith('_') or name in bad_features:
            continue
        feature_df[name] = [func(mol) for mol in molecules]


def smiles_to_all_labels(df):
    """Return a copy of `df` extended with descriptor columns per molecule.

    Every function found in the modules bound to `f` and `l` (via
    `inspect.getmembers(..., inspect.isfunction)`) is applied to each molecule
    parsed from `df['SMILES']`, and the results are stored in a column named
    after the function. Functions whose name starts with an underscore are
    skipped, as are functions whose name equals a column of `df` that holds a
    single distinct value ("bad features"). Afterwards the columns
    `fr_Al_COO`, `HeavyAtomCount`, `GetNumAtoms` and `CalcExactMolWt` are
    always (re)computed.

    Args:
        df: DataFrame with a `SMILES` column.

    Returns:
        A new DataFrame; `df` itself is not modified.
    """
    smilesList = df['SMILES']
    feature_df = df.copy()

    # get all functions of modules
    all_lipinski = inspect.getmembers(l, inspect.isfunction)
    all_fragments = inspect.getmembers(f, inspect.isfunction)

    # bad features have the same value for all our compounds
    # (`DataFrame.items` replaces `iteritems`, which pandas 2.0 removed).
    bad_features = {
        columnName for columnName, columnData in df.items()
        if len(set(columnData.values)) == 1
    }

    # Parse each SMILES once instead of once per feature.
    molecules = [chem.MolFromSmiles(smiles) for smiles in smilesList]

    # add fragment features
    _add_module_features(feature_df, molecules, all_fragments, bad_features)
    print('fragments over')

    # add lipinski features; the exclusion test uses the Lipinski function's
    # own name (it previously looked at the fragment list by mistake).
    _add_module_features(feature_df, molecules, all_lipinski, bad_features)
    print('lipinski over')

    feature_df["fr_Al_COO"] = [f.fr_Al_COO(mol) for mol in molecules]
    feature_df["HeavyAtomCount"] = [l.HeavyAtomCount(mol) for mol in molecules]

    # add getnumatoms as feature
    feature_df["GetNumAtoms"] = [mol.GetNumAtoms() for mol in molecules]

    # add CalcExactMolWt as feature
    feature_df["CalcExactMolWt"] = [
        molDesc.CalcExactMolWt(mol) for mol in molecules
    ]
    return feature_df
53.49... """ HeavyAtomMolWt = lambda x: MolWt(x, True) HeavyAtomMolWt.__doc__ = """The average molecular weight of the molecule ignoring hydrogens >>> HeavyAtomMolWt(Chem.MolFromSmiles('CC')) 24.02... >>> HeavyAtomMolWt(Chem.MolFromSmiles('[NH4+].[Cl-]')) 49.46 """ HeavyAtomMolWt.version = "1.0.0" ExactMolWt = lambda *x, **y: _rdMolDescriptors.CalcExactMolWt(*x, **y) ExactMolWt.version = _rdMolDescriptors._CalcExactMolWt_version ExactMolWt.__doc__ = """The exact molecular weight of the molecule >>> ExactMolWt(Chem.MolFromSmiles('CC')) 30.04... >>> ExactMolWt(Chem.MolFromSmiles('[13CH3]C')) 31.05... """ def NumValenceElectrons(mol): """ The number of valence electrons the molecule has >>> NumValenceElectrons(Chem.MolFromSmiles('CC'))
def create_features(data, types="train"):
    """Build the feature tables used for modelling from a SMILES table.

    Three feature blocks are produced for the molecules in `data["SMILES"]`:
    a 512-bit Morgan fingerprint (radius 2), a small descriptor table (atom
    counts, exact molecular weight, `fr_Al_COO` and bond-type counts) and
    vectors obtained from the word2vec model stored at
    'models/model_300dim.pkl'.

    Args:
        data: DataFrame with a `SMILES` column and, for training data, an
            `ACTIVE` column.
        types: "train" to also extract the labels from `ACTIVE`, "test" to
            return `None` as labels.

    Returns:
        dict with the keys "Morgan", "Despcritor" (spelling kept because
        callers index by it), "molvec" and 'y'.

    Raises:
        ValueError: if `types` is neither "train" nor "test".
    """
    if types == "train":
        y = np.array(data['ACTIVE'].astype(int))
    elif types == "test":
        y = None
    else:
        # Fail early instead of hitting an unbound `y` after all the work.
        raise ValueError(
            "types must be 'train' or 'test', got %r" % (types,))

    # Work on a private copy so the column assignments below never write into
    # (or warn about) a slice of the caller's frame.
    data = data[["SMILES"]].copy()
    data["SMILES_str"] = data["SMILES"]
    data["SMILES"] = data["SMILES"].apply(lambda x: Chem.MolFromSmiles(x))
    data["NumAtoms"] = data["SMILES"].apply(
        lambda x: x.GetNumAtoms())  # l.HeavyAtomCount(m)
    data["ExactMolWt"] = data["SMILES"].apply(lambda x: d.CalcExactMolWt(x))
    data["fr_Al_COO"] = data["SMILES"].apply(lambda x: f.fr_Al_COO(x))
    # Add the hydrogens explicitly before counting atoms.
    data["HsNumAtoms"] = data["SMILES"].apply(
        lambda x: Chem.AddHs(x).GetNumAtoms())

    # One space-separated "document" of bond-type names per molecule.
    bond_docs = [
        " ".join(str(bond.GetBondType()) for bond in mol.GetBonds())
        for mol in data["SMILES"]
    ]
    vec = CountVectorizer().fit(bond_docs)
    vocabulary = vec.vocabulary_
    # Convert to an ordinary dense matrix. `vocabulary_` maps term -> column
    # index, so the labels are ordered by that index rather than by dict
    # iteration order; the frame reuses `data`'s index so that the concat
    # below aligns row by row.
    bond_counts = pd.DataFrame(
        vec.transform(bond_docs).toarray(),
        index=data.index,
        columns=sorted(vocabulary, key=vocabulary.get),
    )
    data = pd.concat([data, bond_counts], axis=1)

    # A bond type that never occurs in this data set gets an all-zero column
    # instead of raising KeyError.
    traindata = data.reindex(
        columns=[
            'NumAtoms', 'ExactMolWt', 'fr_Al_COO', 'HsNumAtoms', 'double',
            'single', 'aromatic', 'triple'
        ],
        fill_value=0,
    )

    finger = [
        np.array(AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=512))
        for x in data["SMILES"]
    ]
    finger = pd.DataFrame(finger)
    finger.columns = ["morgan_" + str(x) for x in finger.columns]

    model = word2vec.Word2Vec.load('models/model_300dim.pkl')
    data['sentence'] = data.apply(
        lambda x: MolSentence(mol2alt_sentence(x['SMILES'], 1)), axis=1)
    m2v = [
        DfVec(x) for x in sentences2vec(data['sentence'], model, unseen='UNK')
    ]
    m2v = np.array([x.vec for x in m2v])
    m2v = pd.DataFrame(m2v)
    m2v.columns = ["m2v_" + str(x) for x in m2v.columns]

    datadict = {
        "Morgan": finger,
        "Despcritor": traindata,
        "molvec": m2v,
        'y': y
    }
    return datadict