def check_reaction(reactants, products):
    """Tell whether both sides of a reaction share one molecular formula.

    Each side may be a single SMILES string or a list of SMILES strings;
    a list is joined with "." into one multi-fragment SMILES before it is
    parsed with RDKit.

    Returns True when the molecular formula computed for the reactants
    equals the one computed for the products, False otherwise.
    """
    def _as_mol(side):
        # A list of fragments becomes one dot-separated SMILES string.
        smiles = ".".join(side) if isinstance(side, list) else side
        return Chem.MolFromSmiles(smiles)

    reactant_mol = _as_mol(reactants)
    product_mol = _as_mol(products)

    reactant_formula = rdMolDescriptors.CalcMolFormula(reactant_mol)
    product_formula = rdMolDescriptors.CalcMolFormula(product_mol)
    return reactant_formula == product_formula
def testMolFormula(self):
    """Check CalcMolFormula with and without isotope separation."""
    # (SMILES, keyword arguments for CalcMolFormula, expected formula)
    cases = (
        ("[2H]C([3H])O", {}, 'CH4O'),
        ("[2H]C([3H])O", {'separateIsotopes': True}, 'CH2DTO'),
        ("[2H]C([3H])O",
         {'separateIsotopes': True, 'abbreviateHIsotopes': False},
         'CH2[2H][3H]O'),
        ("[2H][13CH2]CO", {}, 'C2H6O'),
        ("[2H][13CH2]CO", {'separateIsotopes': True}, 'C[13C]H5DO'),
    )
    for smiles, options, expected in cases:
        mol = Chem.MolFromSmiles(smiles)
        self.assertEqual(rdMD.CalcMolFormula(mol, **options), expected)
def set2DStructure(self):
    """Derive weight, formula and a 2D depiction from ``self.smiles``.

    On success sets ``molwt``, ``molformula``, ``structure_image`` and
    ``structure_qt``.  When ``self.smiles`` is empty, or any step raises,
    ``molwt``, ``molformula``, ``structure_image`` and ``structure_data``
    are set to False (a raised exception is printed first).
    """
    def _mark_unavailable():
        self.molwt = False
        self.molformula = False
        self.structure_image = False
        self.structure_data = False

    if not self.smiles:
        _mark_unavailable()
        return

    try:
        mol = Chem.MolFromSmiles(self.smiles)
        self.molwt = rdMolDescriptors.CalcExactMolWt(mol)
        self.molformula = rdMolDescriptors.CalcMolFormula(mol)
        rdDepictor.Compute2DCoords(mol)
        self.structure_image = Draw.MolToImage(
            mol, size=(400, 200), kekulize=True, wedgeBonds=False)

        # Rewrite every (255, 255, 255, 255) pixel to (255, 255, 255, 0);
        # presumably this makes the white background transparent.
        pixels = self.structure_image.load()
        for row in range(self.structure_image.size[1]):
            for col in range(self.structure_image.size[0]):
                if pixels[col, row] == (255, 255, 255, 255):
                    pixels[col, row] = (255, 255, 255, 0)

        self.structure_qt = ImageQt.ImageQt(self.structure_image)
    except Exception as e:
        print(e)
        _mark_unavailable()
def epilion2sdf(abbr_lst, save_sdf):
    """Convert lipid abbreviations to structures and write them as SDF.

    :param abbr_lst: either a list of abbreviations, or the path of a text
        file holding one abbreviation per line.
    :param save_sdf: path of the SDF file to write.

    Abbreviations recognised by ``ParserFA`` or ``ParserPL`` are turned
    into SMILES; each resulting molecule is written with the properties
    ``EXACT_MASS``, ``NOMINAL_MASS`` and ``FORMULA``.  Abbreviations that
    cannot be parsed, or whose SMILES fails to convert, are logged and
    skipped.  If a path is given that is not a file, the process exits.
    """
    if isinstance(abbr_lst, str):
        try:
            if os.path.isfile(abbr_lst):
                logger.info(f'Try to open file: {abbr_lst}')
                with open(abbr_lst, 'r') as infile_obj:
                    # Strip line endings (readlines() would keep them inside
                    # every abbreviation) and ignore blank lines.
                    abbr_lst = [
                        ln.strip() for ln in infile_obj if ln.strip()
                    ]
            else:
                logger.error(f'Can NOT load input: {abbr_lst}')
                logger.info('!! END PROCESSING !!')
                exit()
        except Exception as e:
            logger.error(f'Can NOT load input: {abbr_lst}')
            logger.error(e)
            # Without this return the path string itself would be iterated
            # character by character below.
            return

    fa_decoder = ParserFA()
    pl_decoder = ParserPL()

    info_dct = {}
    for abbr in abbr_lst:
        logger.info(abbr)
        if fa_decoder.is_fa(abbr):
            smi = fa_decoder.get_smi_fa(abbr)
        elif pl_decoder.is_pl(abbr):
            smi = pl_decoder.get_smi_pl(abbr)
        else:
            logger.info(f'Can NOT parse abbreviation: {abbr}')
            continue
        logger.info(abbr + ': ' + smi)
        info_dct[abbr] = smi

    # Close both the writer and the underlying file so the SDF is fully
    # flushed to disk even when a record fails.
    with open(save_sdf, mode='w') as sdf_obj:
        sdf_writer = Chem.SDWriter(sdf_obj)
        try:
            for m in abbr_lst:
                if m not in info_dct:
                    logger.warning(f'!! Can NOT parse: {m}')
                    continue
                smi = info_dct[m]
                try:
                    mol = Chem.MolFromSmiles(smi)
                    AllChem.Compute2DCoords(mol)
                    mol.SetProp('_Name', m)
                    # NOTE(review): NOMINAL_MASS is filled from
                    # Descriptors.MolWt, exactly as before.
                    m_mass = Descriptors.MolWt(mol)
                    m_exactmass = rdMolDescriptors.CalcExactMolWt(mol)
                    m_formula = rdMolDescriptors.CalcMolFormula(mol)
                    mol.SetProp('EXACT_MASS', '%.6f' % m_exactmass)
                    mol.SetProp('NOMINAL_MASS', '%.3f' % m_mass)
                    mol.SetProp('FORMULA', m_formula)
                    sdf_writer.write(mol)
                except Exception as e:
                    logger.error(f'! FAILED: {m}')
                    logger.error(
                        f'! FAILED to generate structure from SMILES: {smi}')
                    logger.error(e)
        finally:
            sdf_writer.close()
def parse_data(input_file):
    """Build the CLASS attribute columns from NPAtlas database text.

    input_file: full text of the NPAtlas database txt file.  The first
        line is treated as a header; every other non-blank line is a
        tab-separated record read as identifier, SMILES, InChI, InChIKey
        (columns 0-3).

    Returns ``(attribute_names, overall_list)`` where ``overall_list``
    holds one list per attribute, in the order of ``attribute_names``; the
    four classification columns are filled with 'NA'.

    Raises ValueError when a record's SMILES cannot be parsed by RDKit.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']

    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    # Skip the header line.  Blank lines (including the empty string left
    # by a trailing newline) are ignored, so the last record is kept
    # whether or not the text ends with a newline.
    for raw_line in input_file.split('\n')[1:]:
        if not raw_line.strip():
            continue
        fields = raw_line.split('\t')

        mol = Chem.MolFromSmiles(fields[1])
        if mol is None:
            raise ValueError(
                'Invalid SMILES for record %r: %r' % (fields[0], fields[1]))

        # Canonical SMILES and monoisotopic mass
        SMILES_list.append(Chem.MolToSmiles(mol))
        mol_mass_list.append(Descriptors.ExactMolWt(mol))
        # Source identifier and InChI are taken verbatim from the file
        identifier_list.append(fields[0])
        inchi_list.append(fields[2])
        # InChIKey is split into its first two hyphen-separated parts
        key_parts = fields[3].split('-')
        inchi_key2_list.append(key_parts[1])
        inchi_key1_list.append(key_parts[0])
        # Molecular formula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

    # One 'NA' per structure for each unknown classification column
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list,
                    NA_list, NA_list, NA_list, NA_list]
    return attribute_names, overall_list
def get_molecular_formula(smi):
    """
    Return the molecular formula (explicit hydrogens added) for a SMILES.

    If the RDKit ``Chem`` name is not available (NameError), an error is
    logged and the process exits.
    """
    try:
        parsed = Chem.MolFromSmiles(smi)
        mol = Chem.AddHs(parsed)
    except NameError:
        logging.error('RDKit is not installed or loaded correctly.')
        sys.exit()

    formula = rdMolDescriptors.CalcMolFormula(mol)
    return formula
def main(in_file, output):
    """Compute basic descriptors for every molecule and export a table.

    :param in_file: molecule file, opened through ``rdkit_open``.
    :param output: output path prefix; ``<output>.csv`` and
        ``<output>.xlsx`` are written.

    Molecules are keyed by the first whitespace-separated token of their
    ``_Name`` property, so a later molecule with the same name replaces an
    earlier one.
    """
    Cmpds = {}
    InMols = rdkit_open([in_file])
    print('\n # Number of input molecule: {0}'.format(len(InMols)))

    for mol in InMols:
        name = mol.GetProp('_Name').split()[0]
        # Crippen descriptors come as (logP, molar refractivity); compute
        # them once instead of once per value.
        crippen = rd.CalcCrippenDescriptors(mol)

        Cmpds[name] = {
            'Name': name,
            'Formula': rd.CalcMolFormula(mol),
            # Only the isomeric SMILES is kept; the plain SMILES that used
            # to be computed first was always overwritten.
            'SMILES': Chem.MolToSmiles(mol, isomericSmiles=True,
                                       allHsExplicit=False),
            'MW': rd._CalcMolWt(mol),                  # Molecular Weight
            'logP': crippen[0],                        # Partition coefficient
            'HDon': rd.CalcNumLipinskiHBD(mol),        # Lipinski Hbond donor
            'HAcc': rd.CalcNumLipinskiHBA(mol),        # Lipinski Hbond acceptor
            'TPSA': rd.CalcTPSA(mol),                  # Topological polar surface area
            'Rotat': rd.CalcNumRotatableBonds(mol, strict=True),  # Rotatable bond
            'MolRef': crippen[1],                      # Molar refractivity
            'AliRing': rd.CalcNumAliphaticRings(mol),  # Aliphatic ring number
            'AroRing': rd.CalcNumAromaticRings(mol),   # Aromatic ring number
        }

    df = pd.DataFrame.from_dict(Cmpds, orient='index')
    df.index.name = 'Name'

    # Columns of data to print out, in output order
    Columns = [
        'Formula', 'MW', 'logP', 'HDon', 'HAcc', 'TPSA', 'Rotat',
        'MolRef', 'AliRing', 'AroRing',
        'SMILES',
    ]
    reorder = df[Columns]

    # Output to CSV
    reorder.to_csv(output + '.csv', sep=',', na_rep='NA', encoding='utf-8',
                   float_format='%.5f', header=True)

    # Output to Excel
    reorder.to_excel(output + '.xlsx', header=True, na_rep='NA')
def parse_epilion(abbr: str) -> dict:
    """Resolve a lipid abbreviation to formula, exact mass and a PNG image.

    The abbreviation is first converted with ``Converter(abbr_cfg_path)``
    and then decoded to SMILES by ``ParserFA`` or ``ParserPL``.

    :param abbr: lipid abbreviation to resolve.
    :return: dict with keys ``id``, ``formula``, ``exactmass`` (formatted
        with four decimals) and ``img`` (a base64 PNG data URL).  An empty
        dict is returned when the abbreviation cannot be parsed or the
        structure cannot be generated; both cases are logged.
    """
    fa_decoder = ParserFA()
    pl_decoder = ParserPL()
    info_dct = {}

    converter = Converter(abbr_cfg_path)
    epilion_id = converter.convert_abbr(abbr)

    if fa_decoder.is_fa(epilion_id):
        smi = fa_decoder.get_smi_fa(epilion_id)
    elif pl_decoder.is_pl(epilion_id):
        smi = pl_decoder.get_smi_pl(epilion_id)
    else:
        logger.info(f'Can NOT parse abbreviation: {epilion_id}')
        # Nothing to draw.  Previously execution continued with `smi`
        # unbound and the error handler itself raised.
        return info_dct
    logger.info(epilion_id + ': ' + smi)

    try:
        mol = Chem.MolFromSmiles(smi)
        AllChem.Compute2DCoords(mol)
        m_exactmass = rdMolDescriptors.CalcExactMolWt(mol)
        m_formula = rdMolDescriptors.CalcMolFormula(mol)

        # Render once into an in-memory PNG and embed it as a data URL.
        img = Draw.MolToImage(mol, size=(600, 400))
        img_io = BytesIO()
        img.save(img_io, format='png')
        img_data = base64.b64encode(img_io.getvalue())
        img_data_url = r'data:image/png;base64,' + img_data.decode("utf-8")

        info_dct['id'] = epilion_id
        info_dct['formula'] = m_formula
        info_dct['exactmass'] = '%.4f' % m_exactmass
        info_dct['img'] = img_data_url
    except Exception as e:
        logger.error(f'! FAILED: {epilion_id}')
        logger.error(f'! FAILED to generate structure from SMILES: {smi}')
        logger.error(e)

    return info_dct
def calcMolprops(self):
    """Fill identifier, mass, formula and molblock attributes from RDKit.

    Reads ``self.rdmol`` and ``self.name``.  Masses are rounded to four
    decimals.  The [M+H]+ and [M+Na]+ values are the exact mass plus
    ``calculate_exact_mass`` of '[H+]' and '[Na+]' respectively.
    """
    mol = self.rdmol

    self.inchi = Chem.MolToInchi(mol)
    self.inchikey = Chem.MolToInchiKey(mol)

    self.accurate_mass = round(Descriptors.ExactMolWt(mol), 4)
    self.mass = round(Descriptors.MolWt(mol), 4)

    # Adduct masses derived from the rounded exact mass
    for attr_name, ion_smiles in (('m_plus_h', '[H+]'),
                                  ('m_plus_na', '[Na+]')):
        adduct_mass = self.accurate_mass + calculate_exact_mass(ion_smiles)
        setattr(self, attr_name, round(adduct_mass, 4))

    # The name must be set before the molblock is generated so that it
    # ends up in the block.
    mol.SetProp('_Name', self.name)
    rdDepictor.Compute2DCoords(mol)
    self.molblock = Chem.MolToMolBlock(mol)

    self.formula = rdMolDescriptors.CalcMolFormula(mol)
def calculate_properties(self, smiles=None, mol=None, props=None):
    """Calculate the requested basic properties and store them in self.data.

    :param smiles: SMILES string, parsed only when ``mol`` is not given.
    :param mol: RDKit molecule; takes precedence over ``smiles``.
    :param props: iterable of property keys to compute (``py_formula``,
        ``py_em``, ``py_n_Cl_Br``, ``py_na``, ``py_mw``, ``py_fsp3``,
        ``py_rb``, ``py_tpsa``, ``py_clogp``, ``py_nar``, ``py_nhba``,
        ``py_nhbd``); unknown keys are ignored.
    :return: error flag (bool) -- True when nothing was computed (no
        properties requested, no input given, or the SMILES could not be
        parsed), False on success.
    """
    if not props:
        return True

    if mol is None:
        if smiles is None:
            # Neither a molecule nor a SMILES string: report the error
            # instead of handing None to the SMILES parser.
            return True
        mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return True

    # (key, calculator) pairs in the order results are stored.  Lambdas
    # keep each descriptor lookup lazy, so only requested ones are touched.
    calculators = (
        ('py_formula', lambda m: desc.CalcMolFormula(m)),
        ('py_em', lambda m: round(desc.CalcExactMolWt(m), 5)),
        ('py_n_Cl_Br', lambda m: sum(
            atom.GetSymbol() in ('Cl', 'Br') for atom in m.GetAtoms())),
        ('py_na', lambda m: m.GetNumAtoms()),
        ('py_mw', lambda m: desc._CalcMolWt(m)),
        ('py_fsp3', lambda m: desc.CalcFractionCSP3(m)),
        ('py_rb', lambda m: desc.CalcNumRotatableBonds(m)),
        ('py_tpsa', lambda m: desc.CalcTPSA(m)),
        ('py_clogp', lambda m: desc.CalcCrippenDescriptors(m)[0]),
        ('py_nar', lambda m: desc.CalcNumAromaticRings(m)),
        ('py_nhba', lambda m: desc.CalcNumHBA(m)),
        ('py_nhbd', lambda m: desc.CalcNumHBD(m)),
    )
    for key, calculate in calculators:
        if key in props:
            self.data[key] = calculate(mol)

    return False
def generateCompoundPropertiesTask(structure, debug=False):
    """Compute and save compound properties for a structure's molecule.

    :param structure: object exposing ``molecule`` and ``molfile``.
    :param debug: when true, attach to a pydevd debug server on
        localhost:6901 and print (rather than re-raise) an IntegrityError
        from saving.

    Descriptors are computed on the full molecule parsed from the
    molfile; ``mw_freebase`` is set only when the salt-stripped molecule
    still has atoms.

    Raises IntegrityError from ``prop.save()`` unless ``debug`` is set.
    """
    if debug:
        pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                        stderrToServer=True)

    molecule = structure.molecule
    if not molecule.compoundProperty:
        prop = CompoundProperties(molecule=molecule)
    else:
        prop = molecule.compoundProperty

    saltRemover = SaltRemover()
    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = saltRemover.StripMol(mol)

    prop.hbd = Descriptors.CalcNumHBD(mol)
    prop.hba = Descriptors.CalcNumHBA(mol)
    prop.rtb = Descriptors.CalcNumRotatableBonds(mol)
    prop.alogp = Crippen.MolLogP(mol)
    prop.psa = Descriptors.CalcTPSA(mol)
    prop.full_mwt = NewDescriptors.MolWt(mol)

    if base.GetNumAtoms():
        prop.mw_freebase = NewDescriptors.MolWt(base)

    prop.full_molformula = Descriptors.CalcMolFormula(mol)

    try:
        prop.save()
    except IntegrityError as e:
        if debug:
            # print(e) is valid on both Python 2 and 3; the old
            # `print e.message` statement was Python-2-only.
            print(e)
        else:
            # Bare raise keeps the original traceback.
            raise
def generate_data(input_file):
    """
    Generate the per-structure columns needed for the sqlite database.

    input_file: full text of the input structure txt file; each line is
        tab-separated with the SMILES in column 0 and the identifier in
        column 1.  One trailing empty line is ignored.

    Returns a list of eleven lists: mass, InChI, SMILES, identifier,
    InChIKey2, InChIKey1, formula, followed by four 'NA' columns.

    NOTE(review): a row whose SMILES RDKit cannot parse is kept as a blank
    placeholder (empty SMILES and identifier) rather than dropped -- this
    follows the most plausible reading of the original layout; confirm.
    """
    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    all_lines = input_file.split('\n')
    if all_lines[-1] == '':
        all_lines = all_lines[:-1]

    for raw_line in all_lines:
        fields = raw_line.split('\t')
        smiles_in = fields[0]
        id_name = fields[1]

        # Invalid structures are blanked out: both values become ''.
        if Chem.MolFromSmiles(smiles_in) is None:
            smiles_in = ''
            id_name = ''
        identifier_list.append(id_name)

        mol = Chem.MolFromSmiles(smiles_in)
        SMILES_list.append(Chem.MolToSmiles(mol))        # canonical SMILES
        mol_mass_list.append(Descriptors.ExactMolWt(mol))  # monoisotopic mass
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

        # InChI; anything that does not look like one is stored as 'NA'
        inchi = rdinchi.MolToInchi(mol)[0]
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

        # InChIKey, split into its first two hyphen-separated parts
        if inchi == 'NA':
            inchi_key = "NA-NA"
        else:
            inchi_key = rdinchi.InchiToInchiKey(inchi)
        key_parts = inchi_key.split('-')
        second_part, first_part = key_parts[1], key_parts[0]
        inchi_key2_list.append(second_part)
        inchi_key1_list.append(first_part)

    # One 'NA' per structure for each classification column
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list,
                    NA_list, NA_list, NA_list, NA_list]
    return overall_list
def parse_file(input_file, db_name):
    """
    Build the CLASS attribute columns from nanpdb database text.

    input_file: full text of the nanpdb database txt file; tab-separated
        lines with the SMILES in column 0 and the identifier in column 1.
        The last element of the newline split is discarded.
    db_name: database name, appended as the final field of every row.

    Returns ``(attribute_names, overall_list)``; ``overall_list`` holds one
    list per attribute, the four classification columns being 'NA'.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']

    # Normalise rows: empty fields become "NA", db_name is appended last.
    all_info_list = []
    for raw_line in input_file.split('\n')[:-1]:
        row = [value if len(value) != 0 else "NA"
               for value in raw_line.split('\t')]
        row.append(db_name)
        all_info_list.append(row)

    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    for row in all_info_list:
        mol = Chem.MolFromSmiles(row[0])
        # Mass is stored as text with its last five characters cut off
        mol_mass_list.append(str(Descriptors.ExactMolWt(mol))[:-5])
        inchi_list.append(rdinchi.MolToInchi(mol)[0])
        SMILES_list.append(row[0])
        identifier_list.append(row[1])
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

    # One 'NA' per structure for each classification column
    NA_list = ['NA'] * len(all_info_list)

    # InChIKey, split into its first two hyphen-separated parts
    for inchi in inchi_list:
        key_parts = rdinchi.InchiToInchiKey(inchi).split('-')
        first_part, second_part = key_parts[0], key_parts[1]
        inchi_key1_list.append(first_part)
        inchi_key2_list.append(second_part)

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list,
                    NA_list, NA_list, NA_list, NA_list]
    return attribute_names, overall_list
def parse_data(input_file):
    """
    Build the CLASS attribute columns from norine database text.

    input_file: full text of the norine database txt file.  The first two
        lines are skipped; every other non-blank line is tab-separated
        with the identifier in column 0 and the SMILES in column 2.

    Returns ``(attribute_names, overall_list)``; ``overall_list`` holds one
    list per attribute in the order of ``attribute_names``, the four
    classification columns being 'NA'.

    NOTE(review): a row whose SMILES RDKit cannot parse is kept as a blank
    placeholder (empty SMILES and identifier) rather than dropped -- this
    follows the most plausible reading of the original layout; confirm.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']

    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    for raw_line in input_file.split('\n')[2:]:
        # Blank lines (e.g. the empty string after a trailing newline) carry
        # no record; indexing column 2 on them used to raise IndexError.
        if not raw_line.strip():
            continue
        fields = raw_line.split('\t')
        smiles_in = fields[2]
        id_name = fields[0]

        # Invalid structures are blanked out: both values become ''.
        if Chem.MolFromSmiles(smiles_in) is None:
            smiles_in = ''
            id_name = ''
        identifier_list.append(id_name)

        mol = Chem.MolFromSmiles(smiles_in)
        SMILES_list.append(Chem.MolToSmiles(mol))
        mol_mass_list.append(Descriptors.ExactMolWt(mol))  # monoisotopic mass
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

        # InChI; anything that does not look like one is stored as 'NA'
        inchi = rdinchi.MolToInchi(mol)[0]
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

        # InChIKey, split into its first two hyphen-separated parts
        if inchi == 'NA':
            inchi_key = "NA-NA"
        else:
            inchi_key = rdinchi.InchiToInchiKey(inchi)
        key_parts = inchi_key.split('-')
        inchi_key2_list.append(key_parts[1])
        inchi_key1_list.append(key_parts[0])

    # One 'NA' per structure for each classification column
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list,
                    NA_list, NA_list, NA_list, NA_list]
    return attribute_names, overall_list
def get_molecular_formula(smi):
    """
    Return the molecular formula for a SMILES string.

    The SMILES is parsed with RDKit and explicit hydrogens are added
    before the formula is computed.
    """
    parsed = Chem.MolFromSmiles(smi)
    with_hydrogens = Chem.AddHs(parsed)
    return rdMolDescriptors.CalcMolFormula(with_hydrogens)
def _select_fa_row(fa_df, link, c_num, db_num, omega_type):
    """Return the first fatty-acid table row matching link, C and DB counts
    (additionally filtered by omega type when it is non-zero)."""
    query_code = 'Link == "%s" and C == % i and DB == %i' % (
        link, c_num, db_num)
    matched_df = fa_df.query(query_code)
    if omega_type != 0:
        matched_df = matched_df.query(
            'Link == "%s" and omega == %i' % (link, omega_type))
    return matched_df.head(1)


def _build_lpp_mol(lpp_dct, mzcalc, fingerprint_gen):
    """Build the annotated RDKit molecule for one LPP record; adds the
    PRECURSOR_JSON, EXACT_MASS and FINGERPRINT entries to ``lpp_dct``."""
    lpp_mol = Chem.MolFromSmiles(str(lpp_dct['LPP_SMILES']))
    AllChem.Compute2DCoords(lpp_mol)
    lpp_mol.SetProp('_Name', str(lpp_dct['LM_ID']))
    lpp_mass = Descriptors.MolWt(lpp_mol)
    lpp_exactmass = rdMolDescriptors.CalcExactMolWt(lpp_mol)
    lpp_formula = rdMolDescriptors.CalcMolFormula(lpp_mol)
    lpp_mol.SetProp('EXACT_MASS', '%.6f' % lpp_exactmass)
    lpp_mol.SetProp('NOMINAL_MASS', '%.3f' % lpp_mass)
    lpp_mol.SetProp('FORMULA', lpp_formula)

    # PC species use the formate adduct unless the sn2 SMILES ends with
    # 'C(O)=O)=O'; everything else uses [M-H]-.
    lpp_sn2_smi = lpp_dct['SN2_SMILES']
    if (str(lpp_dct['LPP_CLASS']) == 'PC'
            and lpp_sn2_smi[-9:] != r'C(O)=O)=O'):
        charge = '[M+HCOO]-'
    else:
        charge = '[M-H]-'
    precursor_elem = mzcalc.get_elements(lpp_formula)
    precursor_formula = mzcalc.get_formula(precursor_elem, charge=charge)
    precursor_mz = mzcalc.get_mono_mz(lpp_formula, charge=charge)
    precursor_info = '{"%s": ["%s", %f]}' % (
        charge, precursor_formula[0], precursor_mz)

    lpp_dct['PRECURSOR_JSON'] = precursor_info
    lpp_mol.SetProp('PRECURSOR_JSON', precursor_info)
    lpp_dct['EXACT_MASS'] = lpp_exactmass
    fp_mz_lst = fingerprint_gen.get_fingerprint(lpp_dct)
    lpp_dct['FINGERPRINT'] = fp_mz_lst
    lpp_mol.SetProp('FINGERPRINT', json.dumps(fp_mz_lst))

    # Every record entry becomes an SDF property.  As before, this
    # overwrites EXACT_MASS and FINGERPRINT with their str() form.
    for _k in lpp_dct.keys():
        lpp_mol.SetProp(_k, str(lpp_dct[_k]))
    return lpp_mol


def theolpp(usr_params):
    """Generate theoretical oxidized phospholipids and export them.

    For every phospholipid of the selected class in the input table, the
    sn1 and sn2 fatty acids are oxidized (``oxidizer``), all allowed
    sn1/sn2 combinations are merged into full structures and written to an
    SDF file, optionally together with an MSP spectra file.  Two xlsx
    summaries are generated from the SDF afterwards.

    :param usr_params: dict with the keys
        'lipid_class', 'ox_level', 'oap_mode', 'ocp_mode',
        'lyso_oap_mode', 'lyso_ocp_mode', 'ox_max', 'keto_max', 'ooh_max',
        'epoxy_max', 'lipid_lst_path', 'lipid_tab', 'prostane_mode',
        'ox_prostane_mode', 'sdf_path', 'msp_mode', 'msp_path',
        'mod_lst_path', 'fa_lst_path', 'prostane_mod_path',
        'prostane_abbr_path', 'frag_pattern_path', 'pl_hg_path'
    :return: tuple of two status strings (number of LPP generated, number
        of phospholipids processed with the elapsed time)
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    t_start = time.perf_counter()

    pl_table = usr_params['lipid_lst_path']
    fa_table = usr_params['fa_lst_path']
    mod_table = usr_params['mod_lst_path']
    isop_cfg = usr_params['prostane_mod_path']
    isopabbr_cfg = usr_params['prostane_abbr_path']
    pl_class = usr_params['lipid_class']
    pl_class_use_lst = [pl_class]
    ox_level = usr_params['ox_level']
    oap_mode = usr_params['oap_mode']
    ocp_mode = usr_params['ocp_mode']
    lyso_oap_mode = usr_params['lyso_oap_mode']
    lyso_ocp_mode = usr_params['lyso_ocp_mode']
    ox_max = usr_params['ox_max']
    keto_max = usr_params['keto_max']
    ooh_max = usr_params['ooh_max']
    epoxy_max = usr_params['epoxy_max']
    prostane_mode = usr_params['prostane_mode']
    prostane_ox_mode = usr_params['ox_prostane_mode']
    save_sdf = usr_params['sdf_path']
    save_spectra = usr_params['msp_mode']
    save_msp = usr_params['msp_path']
    score_xlsx = usr_params['frag_pattern_path']
    pl_fp_xlsx = usr_params['pl_hg_path']

    # NOTE(review): `sheetname` (and pd.Panel further down) only exist in
    # old pandas releases; left unchanged to stay compatible with the
    # pandas version this file targets.
    pl_df = pd.read_excel(pl_table, sheetname=usr_params['lipid_tab'])
    fa_df = pd.read_csv(fa_table, index_col=0)

    print(pl_df.head())

    # Select export species OAP, OCP, Lyso OAP, Lyso OCP
    ban_lst = ['LYSOLYSO']
    if oap_mode == 0:
        ban_lst.extend(['UNMODOAP', 'OAPUNMOD', 'OAPOAP'])
    if ocp_mode == 0:
        ban_lst.extend(['UNMODOCP', 'OCPUNMOD', 'OCPOCP'])
    if lyso_oap_mode == 0:
        ban_lst.extend(['LYSOOAP', 'OAPLYSO'])
    if lyso_ocp_mode == 0:
        ban_lst.extend(['LYSOOCP', 'OCPLYSO'])
    if ox_level == 1:
        ban_lst.extend(
            ['OAPOAP', 'OCPOCP', 'OAPOCP', 'OCPOAP', 'OAPUNMOD', 'OCPUNMOD'])

    ox_param_dct = {
        'MAX_MOD': ox_max,
        'MAX_KETO': keto_max,
        'MAX_OOH': ooh_max,
        'MAX_EPOXY': epoxy_max
    }

    if save_spectra == 1 and len(save_msp) > 0:
        msp_obj = open(save_msp, mode='w')
    else:
        msp_obj = None

    sdf_dct = {}

    parser = PLParser()
    abbr_gen = AbbrGenerator()
    frag_gen = TheoFrag(pl_class, score_xlsx)
    fingerprint_gen = FingerprintGen(pl_fp_xlsx)

    c_lst = []
    fa_lpp_df_dct = {}
    sum_theo_lpp_dct = {}

    for (_idx, _row) in pl_df.iterrows():
        _pl_abbr = str(_row['phospholipids'])

        _pl_elem_lst, pl_info_dct = parser.get_composition(_pl_abbr)
        print('PL composition ==>', _pl_elem_lst)
        _pl_hg_abbr = _pl_elem_lst[0]

        # Only lipids of the requested class are processed
        if _pl_hg_abbr not in pl_class_use_lst:
            continue

        c_lst.append(_pl_abbr)
        # prepare output
        _pl_lpp_df = pd.DataFrame()

        print('Start oxidation of ==>', _pl_abbr)
        _pl_sn1_abbr = _pl_elem_lst[1]
        _pl_sn2_abbr = _pl_elem_lst[2]

        # get smiles from abbr
        if len(pl_info_dct.keys()) > 0:
            # The sn1 omega query used to lack an `and`, which made it an
            # invalid query string; both chains now share one helper.
            sn1_fa_df = _select_fa_row(
                fa_df,
                pl_info_dct['sn1_link'],
                int(pl_info_dct['sn1_c_num']),
                int(pl_info_dct['sn1_db_num']),
                int(pl_info_dct['sn1_omega_type']))
            sn2_fa_df = _select_fa_row(
                fa_df,
                pl_info_dct['sn2_link'],
                int(pl_info_dct['sn2_c_num']),
                int(pl_info_dct['sn2_db_num']),
                int(pl_info_dct['sn2_omega_type']))

            _pl_sn1_smiles = sn1_fa_df.loc[_pl_sn1_abbr, 'SMILES']
            _pl_sn2_smiles = sn2_fa_df.loc[_pl_sn2_abbr, 'SMILES']
            print('sn1 =>', _pl_sn1_smiles, '|| sn2 =>', _pl_sn2_smiles)
        else:
            _pl_sn1_smiles = ''
            _pl_sn2_smiles = ''

        # check if FA already oxidized to speed up
        if _pl_sn1_abbr in fa_lpp_df_dct.keys():
            sn1_mod_sum_df = fa_lpp_df_dct[_pl_sn1_abbr]
        else:
            sn1_link_dct = fa_link_filter(_pl_sn1_smiles)
            sn1_mod_sum_df = oxidizer(sn1_link_dct, mod_table, isop_cfg,
                                      isopabbr_cfg, ox_level, ox_param_dct,
                                      prostane_mode, prostane_ox_mode)
            fa_lpp_df_dct[_pl_sn1_abbr] = sn1_mod_sum_df.copy()

        if _pl_sn2_abbr in fa_lpp_df_dct.keys():
            sn2_mod_sum_df = fa_lpp_df_dct[_pl_sn2_abbr]
        else:
            sn2_link_dct = fa_link_filter(_pl_sn2_smiles)
            sn2_mod_sum_df = oxidizer(sn2_link_dct, mod_table, isop_cfg,
                                      isopabbr_cfg, ox_level, ox_param_dct,
                                      prostane_mode, prostane_ox_mode)
            fa_lpp_df_dct[_pl_sn2_abbr] = sn2_mod_sum_df.copy()

        for (_sn1_idx, _sn1_row) in sn1_mod_sum_df.iterrows():
            _sn1_mod_smiles = _sn1_row['FULL_SMILES']
            _sn1_abbr_str = _sn1_row['FA_ABBR']
            _sn1_typ_str = _sn1_row['FA_TYPE']
            _sn1_formula_str = _sn1_row['FA_FORMULA']

            for (_sn2_idx, _sn2_row) in sn2_mod_sum_df.iterrows():
                _sn2_mod_smiles = _sn2_row['FULL_SMILES']
                _sn2_abbr_str = _sn2_row['FA_ABBR']
                _sn2_typ_str = _sn2_row['FA_TYPE']
                _sn2_formula_str = _sn2_row['FA_FORMULA']

                _oap_ocp_lst = [_sn1_typ_str, _sn2_typ_str]
                _lpp_typ = ''.join(_oap_ocp_lst)

                if _lpp_typ not in ban_lst:
                    _lpp_smiles = LPPmerge.pl_lpp(_pl_hg_abbr,
                                                  sn1=_sn1_mod_smiles,
                                                  sn2=_sn2_mod_smiles)
                    _lpp_id_str = str(''.join([
                        _pl_hg_abbr, '(', _sn1_abbr_str, '/',
                        _sn2_abbr_str, ')'
                    ]))

                    _lpp_sub_class_json = '{"SN1": "%s", "SN2": "%s"}' % (
                        _sn1_typ_str, _sn2_typ_str)

                    _lpp_info_dct = {
                        'LPP_ORIGIN': _pl_abbr,
                        'LPP_SMILES': _lpp_smiles,
                        'LPP_CLASS': _pl_hg_abbr,
                        'SN1_SMILES': _sn1_mod_smiles,
                        'SN2_SMILES': _sn2_mod_smiles,
                        'SN1_ABBR': _sn1_abbr_str,
                        'SN2_ABBR': _sn2_abbr_str,
                        'SN1_JSON': _sn1_row['FA_JSON'],
                        'SN2_JSON': _sn2_row['FA_JSON'],
                        'SN1_FORMULA': _sn1_formula_str,
                        'SN2_FORMULA': _sn2_formula_str,
                        'LM_ID': _lpp_id_str,
                        'SN_JSON': _lpp_sub_class_json
                    }
                    if save_spectra == 1:
                        _lpp_info_dct['MSP_JSON'] = frag_gen.calc_frags(
                            _lpp_info_dct)

                    _lpp_info_se = pd.Series(data=_lpp_info_dct)
                    _pl_lpp_df[_lpp_id_str] = _lpp_info_se

                    # check if same lpp generated already
                    # Currently use bulk settings
                    if _lpp_id_str in sdf_dct.keys():
                        _lpp_origin = sdf_dct[_lpp_id_str]['LPP_ORIGIN']
                        _lpp_origin_lst = _lpp_origin.split(',')
                        if _pl_abbr not in _lpp_origin_lst:
                            _lpp_origin_lst.append(_pl_abbr)
                            sdf_dct[_lpp_id_str]['LPP_ORIGIN'] = ','.join(
                                _lpp_origin_lst)
                    else:
                        sdf_dct[_lpp_id_str] = _lpp_info_dct.copy()

                    # clean memory by deleting these dicts and series
                    del _lpp_info_dct, _lpp_info_se

        # generate summary table
        _pl_lpp_df = _pl_lpp_df.transpose()
        print('==> %i of LPP generated !!' % _pl_lpp_df.shape[0])
        print('==> ==> Move to next lipid==> ')
        sum_theo_lpp_dct[_pl_abbr] = _pl_lpp_df

    sum_theo_lpp_pl = pd.Panel(data=sum_theo_lpp_dct)
    print(sum_theo_lpp_pl.shape)

    # write to sdf
    print('==>Start to generate SDF ==> MSP mode = %i' % save_spectra)
    print('!! %i structures in total !!' % len(sdf_dct.keys()))

    mzcalc = MZcalc()

    # Keep a handle on the file object so it can be closed (and thereby
    # flushed) before the SDF is read back for the summaries.
    sdf_obj = open(save_sdf, mode='w')
    sdf_writer = Chem.SDWriter(sdf_obj)

    # The two export modes share the same per-record work; MSP mode
    # additionally skips records without fragments and writes spectra.
    if save_spectra == 1 or save_spectra == 0:
        for _k_lpp in sdf_dct.keys():
            _lpp_dct = sdf_dct[_k_lpp]

            if save_spectra == 1:
                if not len(json.loads(_lpp_dct['MSP_JSON']).keys()) > 0:
                    continue

            _lpp_mol = _build_lpp_mol(_lpp_dct, mzcalc, fingerprint_gen)
            sdf_writer.write(_lpp_mol)

            if save_spectra == 1 and len(save_msp) > 0:
                MSPcreator.to_msp(msp_obj, _lpp_dct)

    sdf_writer.close()
    sdf_obj.close()
    if save_spectra == 1 and len(save_msp) > 0:
        msp_obj.close()

    SDFsummary.sdf2xlsx(save_sdf, str(save_sdf)[:-4] + '.xlsx')
    SDFsummary.sdf2sum_fa(save_sdf, str(save_sdf)[:-4] + '_FA_SUM.xlsx')

    t_spent = time.perf_counter() - t_start
    info_updater_1 = '=>%i of LPP generated ==> ' % len(sdf_dct.keys())
    info_updater_2 = '=>==> %i of phospholipids processed in %.3fs ==> ==> Finished !!!!!!' % (
        len(c_lst), t_spent)

    return info_updater_1, info_updater_2
return def check_maxes(formd, maxes): bools = [v < maxes[e] for e, v in formd.items()] return all(bools) def rec_formula(mz, ppm=5): maxes = dict(get_elemaxs(mz)) error = mz * (ppm * 1E-6) mlow, mhigh = mz - error, mz + error formula = {e: 0 for e in eles.keys()} return _rec_form(formula, mlow, mhigh, maxes) def _rec_form(ele_idx, formula, mlow, mhigh, maxes): good_form = check_formula(formula, mlow, mhigh) good_form = True under_maxes = check_maxes(formula, maxes) if good_form and under_maxes: yield formula else: formula[ele] pass sm = get_soome_mols() masses = [rdMolDescriptors.CalcExactMolWt(m) for m in sm] formulas = [rdMolDescriptors.CalcMolFormula(m) for m in sm]
def calculate_formula_in_dataframe(x):
    """Return the molecular formula of ``x``, or '' when ``x`` is falsy."""
    if not x:
        return ''
    return rdMolDescriptors.CalcMolFormula(x)
def formatdb(smiles):
    """Format a SMILES/identifier table into a classified structure table.

    :param smiles: path of a headerless tab-separated file with the SMILES
        in column 0 and the identifier in column 1.  The file is deleted
        right after it has been read.
    :return: the string 'Done'.

    InChI, InChIKey, molecular formula and monoisotopic mass are computed
    per structure; classification columns come from ``query_inchikey``.
    Structures for which any of these steps fails are dropped from the
    output, which is written to ``<smiles>_FORMATED.txt``.
    """
    df = pd.read_csv(smiles, sep='\t', header=None)
    os.remove(smiles)

    smi = list(df[0])
    mols = [Chem.MolFromSmiles(x) for x in smi]

    inchi = []
    ikeys = []
    ikey1 = []
    ikey2 = []
    form = []
    exmass = []

    for mol in mols:
        # Compute everything for one record first and append afterwards,
        # so a failure half-way cannot leave the lists with different
        # lengths (the previous code appended in both try and except).
        try:
            mol_inchi = Chem.rdinchi.MolToInchi(mol)[0]
            ikey = Chem.rdinchi.InchiToInchiKey(mol_inchi)
            key_parts = ikey.split('-')
            key_first, key_second = key_parts[0], key_parts[1]
            formula = rdMD.CalcMolFormula(mol)
            mass = rdMD.CalcExactMolWt(mol)
        except Exception:
            # Blank record; it is removed from the final table below.
            mol_inchi = ikey = key_first = key_second = ''
            formula = mass = ''
        inchi.append(mol_inchi)
        ikeys.append(ikey)
        ikey1.append(key_first)
        ikey2.append(key_second)
        form.append(formula)
        exmass.append(mass)

    data = {
        'inchikey': ikeys,
        'MonoisotopicMass': exmass,
        'InChI': inchi,
        'SMILES': list(df[0]),
        'Identifier': list(df[1]),
        'InChIKey2': ikey2,
        'InChIKey1': ikey1,
        'MolecularFormula': form
    }
    cn = [
        "inchikey", "MonoisotopicMass", "InChI", "SMILES", "Identifier",
        "InChIKey2", "InChIKey1", "MolecularFormula"
    ]
    formdata = pd.DataFrame(data, columns=cn)

    classy = query_inchikey(ikeys)
    classy = classy[['inchikey', 'kingdom', 'superclass', 'class',
                     'subclass']]
    classy.columns = [
        'inchikey', 'kingdom_name', 'superclass_name', 'class_name',
        'subclass_name'
    ]

    formfinal = pd.merge(formdata, classy, how='left', on=['inchikey'])
    formfinal = formfinal.fillna('')
    formfinal.drop('inchikey', axis=1, inplace=True)

    # Drop the rows that failed above (recognisable by the empty key).
    # NOTE(review): positions are taken from the pre-merge list; this
    # assumes the left merge keeps one row per input -- confirm that
    # `classy` has unique inchikeys.
    failed_rows = [pos for pos, key in enumerate(ikeys) if key == '']
    formfinal.drop(formfinal.index[failed_rows], inplace=True)

    formfinal.to_csv(smiles + '_FORMATED.txt', index=False, sep='\t')
    return 'Done'
def create_CLASS_data(data_dict):
    """
    Generate the CLASS attribute columns for a streptodb dictionary.

    data_dict: dictionary with the keys 'compound_id' and
        'canonical_smiles'; both lists are placed in the result as they
        are (not copied).

    Returns ``(attribute_names, overall_list)``; ``overall_list`` holds one
    list per attribute in the order of ``attribute_names``, the four
    classification columns being 'NA'.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']

    # Identifier and SMILES columns come straight from the input
    identifier_list = data_dict['compound_id']
    SMILES_list = data_dict['canonical_smiles']

    mol_mass_list = []
    inchi_list = []
    mol_formula_list = []
    for smiles in SMILES_list:
        mol = Chem.MolFromSmiles(smiles)
        # Mass is stored as text with its last three characters cut off
        mol_mass_list.append(str(Descriptors.ExactMolWt(mol))[:-3])
        inchi_list.append(rdinchi.MolToInchi(mol)[0])
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

    # One 'NA' per structure for each classification column
    NA_list = ['NA'] * len(data_dict['canonical_smiles'])

    # InChIKey, split into its first two hyphen-separated parts
    inchi_key1_list = []
    inchi_key2_list = []
    for inchi in inchi_list:
        key_parts = rdinchi.InchiToInchiKey(inchi).split('-')
        first_part, second_part = key_parts[0], key_parts[1]
        inchi_key1_list.append(first_part)
        inchi_key2_list.append(second_part)

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list,
                    NA_list, NA_list, NA_list, NA_list]
    return attribute_names, overall_list
def extract_molecules(xml_3d_filename, outfile):
    """
    Extract molecules and then stick into database

    Parses the CML file ``xml_3d_filename``, rebuilds every ``molecule``
    element as an RDKit molecule (atoms, formal charges, single/double/triple
    bonds and one conformer from the x3/y3/z3 coordinates) and pickles a dict
    with two DataFrames to ``outfile``:

    * ``molecules_df`` -- one row per molecule, indexed by ``mol_id``, with
      name, C/H atom counts, formula, sanitisation error message, the
      molecule object and a ``valid`` flag;
    * ``mol_atomid_to_idx`` -- maps (molecule, CML atom id) to the atom index.

    Molecules that fail sanitisation are kept with ``valid=False``, the error
    text in ``error_msg`` and an empty ``formula``.

    Raises NotImplementedError for a bond order other than 'S', 'D' or 'T'.
    """
    cml_ns = '{http://www.xml-cml.org/schema}'
    bond_types = {
        'S': Chem.rdchem.BondType.SINGLE,
        'D': Chem.rdchem.BondType.DOUBLE,
        'T': Chem.rdchem.BondType.TRIPLE,
    }

    tree3d = ET.parse(xml_3d_filename)
    root = tree3d.getroot()
    molecules = list(root.findall(cml_ns + 'molecule'))

    # Debugging knob: lower it to process only the first N molecules.
    MAX_DEBUG_ITER = 100000000

    molecules_df = []
    for m, _ in tqdm(zip(molecules, range(MAX_DEBUG_ITER)),
                     total=len(molecules)):
        mol_id = m.attrib['id']
        mol = Chem.RWMol()
        mol.SetProp("id", mol_id)
        name = ""
        if 'title' in m.attrib:
            name = m.attrib['title']
        mol.SetProp("name", name)

        atomArray = m.find(cml_ns + "atomArray")
        bondArray = m.find(cml_ns + "bondArray")

        atom_pos_map = {}
        atoms_3dloc = []
        for ai, a in enumerate(atomArray):
            atom = Chem.Atom(a.attrib['elementType'])
            x3 = float(a.attrib['x3'])
            y3 = float(a.attrib['y3'])
            z3 = float(a.attrib['z3'])
            atom.SetFormalCharge(int(a.attrib['formalCharge']))
            atom.SetProp('id', a.attrib['id'])
            idx = mol.AddAtom(atom)
            atom_pos_map[a.attrib['id']] = idx
            # Coordinates are stored by position, so RDKit indices must
            # follow the document order of the atoms.
            assert idx == ai
            atoms_3dloc.append((x3, y3, z3))

        for b in bondArray:
            atom_refs = b.attrib['atomRefs2']
            bond_order = b.attrib['order']
            a1, a2 = atom_refs.split(" ")
            if bond_order not in bond_types:
                raise NotImplementedError(
                    "unsupported bond order %r" % (bond_order,))
            mol.AddBond(atom_pos_map[a1], atom_pos_map[a2],
                        order=bond_types[bond_order])

        C_count = np.sum([a.GetSymbol() == 'C' for a in mol.GetAtoms()])
        H_count = np.sum([a.GetSymbol() == 'H' for a in mol.GetAtoms()])

        try:
            Chem.SanitizeMol(mol)
            formula = rdMD.CalcMolFormula(mol)
            error_msg = ""
            valid = True
        except ValueError as e:
            print("error sanitizing", name, e)
            # Reset the formula explicitly: otherwise it is undefined for the
            # first molecule or silently inherited from the previous one.
            formula = ""
            error_msg = str(e)
            valid = False

        mol = mol.GetMol()
        c = datautil.array_to_conf(np.array(atoms_3dloc))
        mol.AddConformer(c)

        molecules_df.append({
            'mol_id': mol_id,
            'name': name,
            'C_count': C_count,
            'H_count': H_count,
            'formula': formula,
            'error_msg': error_msg,
            'mol': mol,
            'valid': valid
        })

    molecules_df = pd.DataFrame(molecules_df).set_index('mol_id')

    out = []
    for row_i, row in tqdm(molecules_df.iterrows(), total=len(molecules_df)):
        nmrshift_mol = row.mol
        id_to_pos = {
            nmrshift_mol.GetAtomWithIdx(i).GetProp('id'): i
            for i in range(nmrshift_mol.GetNumAtoms())
        }
        for id_str, pos in id_to_pos.items():
            out.append({'atom': id_str, 'atom_idx': pos, 'molecule': row_i})
    mol_atomid_to_idx = pd.DataFrame(out).set_index(['molecule', 'atom'])

    # Use a context manager so the output file is flushed and closed.
    with open(outfile, 'wb') as out_fh:
        pickle.dump(
            {
                'molecules_df': molecules_df,
                'mol_atomid_to_idx': mol_atomid_to_idx
            }, out_fh)
'''
Created on 5 Jul 2017

@author: dghosh

Reads one SMILES string per line from ``filepath`` and writes the molecular
formula of each, one per line, to ``<filepath without extension>molFolmula.txt``.
'''
import os

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

filepath = 'insert filepath here'

with open(filepath) as f:
    listMol = f.read().splitlines()

# Strip the extension from the input path (the previous slice kept only the
# last four characters, i.e. the extension itself).
outpath = os.path.splitext(filepath)[0] + 'molFolmula.txt'

with open(outpath, 'w') as outfile:
    for molSmile in listMol:
        mol = Chem.MolFromSmiles(molSmile)
        # MolFromSmiles returns None for an unparsable string; emit an empty
        # line so that output lines stay aligned with input lines.
        if mol is None:
            formula = ''
        else:
            formula = rdMolDescriptors.CalcMolFormula(mol)
        outfile.write(formula + '\n')
def calc_mz(self, elem_info, mod=None, score=0, charge='[M-H]-',
            lpp_info_dct=None):
    """Compute the m/z and ion formula string for a (modified) species.

    Args:
        elem_info: Either a string or a dict of element counts. A string is
            first tried as SMILES; if that fails it is handed to
            ``self.parse_formula`` as a formula. Any other type is treated
            as an empty composition.
        mod: Optional modification label passed to ``self.get_mod_elem``.
            ``None`` or ``''`` means no modification is applied.
        score: Value stored unchanged under the ``'i'`` key of the result.
        charge: Charge label. Falls back to ``'[M-H]-'`` when it is missing
            from ``self.charge_mz_dct`` or ``self.charge_elem_dct``.
        lpp_info_dct: Optional dict forwarded to ``self.get_mod_elem``.

    Returns:
        dict with ``'mz'`` (rounded to 4 decimals), ``'i'`` (the score) and
        ``'formula'`` (elements in the order C, H, N, O, P, S, Na, K followed
        by ``+`` or ``-`` for the recognised charge labels).
    """
    if (charge not in self.charge_mz_dct
            or charge not in self.charge_elem_dct):
        charge = '[M-H]-'

    if isinstance(elem_info, str):
        # test if elem_info is smiles code
        try:
            _mol = Chem.MolFromSmiles(elem_info)
            AllChem.Compute2DCoords(_mol)
            _formula = rdMolDescriptors.CalcMolFormula(_mol)
            _elem_dct = self.parse_formula(_formula)
        except Exception:
            # Not usable as SMILES: interpret the string as a formula.
            _elem_dct = self.parse_formula(elem_info)
    elif isinstance(elem_info, dict):
        _elem_dct = elem_info.copy()
    else:
        _elem_dct = {}

    # The condition used to be ``mod is not None or mod != ''``, which is
    # true for every value; a modification is only applied when one is given.
    if mod is not None and mod != '':
        if _elem_dct is None or _elem_dct == {}:
            _elem_dct = self.get_mod_elem(elem_dct=None, mod=mod,
                                          lpp_info_dct=lpp_info_dct)
        else:
            _elem_dct = self.get_mod_elem(elem_dct=_elem_dct, mod=mod,
                                          lpp_info_dct=lpp_info_dct)
    elif _elem_dct is None:
        _elem_dct = {}

    ion_mz, _ion_elem_dct = self.formula_to_mz(_elem_dct, charge=charge)

    elem_order_lst = ['C', 'H', 'N', 'O', 'P', 'S', 'Na', 'K']
    _ion_elem = ''
    for _e in elem_order_lst:
        if _e in _ion_elem_dct:
            _ion_elem += _e
            # A count of one is written without a number.
            if _ion_elem_dct[_e] > 1:
                _ion_elem += str(_ion_elem_dct[_e])

    if charge in ['[M+H]+', '[M+Na]+', '[M+K]+', '[M+NH4]+']:
        _ion_elem += '+'
    elif charge in ['[M-H]-', '[M+FA-H]-', '[M+HCOO]-']:
        _ion_elem += '-'

    ion_info_dct = {
        'mz': round(ion_mz, 4),
        'i': score,
        'formula': _ion_elem
    }
    return ion_info_dct
def diff_mol_pdb(mol, pdbfile, logfile=devnull):
    """Compare a molecule with the molecule read from a PDB file.

    Runs three kinds of comparison and prints a line for each: the
    charge-stripped stoichiometric formula, the connectivity-only InChI
    ("main chain"), the standard InChI and the Fixed-H InChI.

    Parameters:
        mol: RDKit molecule from the molecule file; it is passed to
            ``remove_isotopes`` before any comparison.
        pdbfile: path handed to ``Chem.MolFromPDBFile``.
        logfile: target passed as ``to=`` to ``stdout_redirected`` for both
            ``sys_stderr`` and ``sys_stdout``.

    Returns:
        ``(failnum, newpdbmol, nhpdbmol)`` -- the number of failed checks,
        the PDB molecule after hydrogens were re-added (or ``nhpdbmol`` itself
        when the template assignment failed) and the hydrogen-free PDB
        molecule.

    Raises:
        ParsingError: if the PDB file cannot be read even without
            sanitisation.
    """
    # NOTE(review): this block was reconstructed from whitespace-collapsed
    # source. The extent of the ``with`` blocks and the nesting of the final
    # InChI checks are a best guess -- confirm against the original layout.
    with stdout_redirected(to=logfile, stdout=sys_stderr):
        with stdout_redirected(to=logfile, stdout=sys_stdout):
            remove_isotopes(mol, sanitize=True)
            nhmol = Chem.RemoveHs(mol,
                                  implicitOnly=False,
                                  updateExplicitCount=True,
                                  sanitize=True)
            # Kekulisation is best-effort; a failure is ignored.
            try:
                Chem.Kekulize(nhmol)
            except:
                pass
            checkconnect = True
            pdbmol = None
            # First try a sanitised read, then fall back to an unsanitised one.
            try:
                pdbmol = Chem.MolFromPDBFile(pdbfile,
                                             removeHs=False,
                                             sanitize=True)
            except:
                pass
            if pdbmol is None:
                pdbmol = Chem.MolFromPDBFile(pdbfile,
                                             removeHs=False,
                                             sanitize=False)
            if pdbmol is None:
                raise ParsingError("Cannot open PDB molecule.")
            pdbmol = disconnect(pdbmol)
            # catchErrors=True: sanitisation problems do not raise here.
            Chem.SanitizeMol(pdbmol, catchErrors=True)
            nhpdbmol = Chem.RemoveHs(pdbmol,
                                     implicitOnly=False,
                                     updateExplicitCount=True,
                                     sanitize=False)
            Chem.SanitizeMol(nhpdbmol, catchErrors=True)
            try:
                print(
                    'Applying bond orders and formal charges from molecule file to PDB molecule ... '
                )
                nhpdbmol = AssignBondOrdersFromTemplate(nhmol, nhpdbmol)
                newpdbmol = Chem.AddHs(nhpdbmol,
                                       addCoords=True,
                                       explicitOnly=True)
                newpdbmol.UpdatePropertyCache()
                newpdbmol = correct_hydrogen_num_from_pdbmol(pdbmol, newpdbmol)
                newpdbmol = set_hydrogen_coor_from_pdbmol(pdbmol,
                                                          newpdbmol,
                                                          refconfId=-1,
                                                          confId=-1)
            except Exception:
                # Template assignment failed: continue with the hydrogen-free
                # PDB molecule and skip the checks guarded by checkconnect.
                print(
                    "WARNING: Cannot assign bond orders from molecule file template. Checking only non-hydrogen connectivity."
                )
                checkconnect = False
                newpdbmol = nhpdbmol
                pass
            #Stoichiometric formula check
            impnum = count_implicit_hydrogens(newpdbmol)
            failnum = 0
            result = 'OK'
            unformula = remove_charge_formula(
                rdMolDescriptors.CalcMolFormula(mol))
            pdbunformula = remove_charge_formula(
                rdMolDescriptors.CalcMolFormula(newpdbmol))
            #print(pdbunformula)
            pdbunformula = fix_formula(pdbunformula, impnum)
            if unformula != pdbunformula:
                failnum += 1
                result = 'FAIL: Molecules have different Stoichiometric formulas ' + unformula + ' ' + pdbunformula + '.'
            print('Stoichiometric formula check (without charge): ' + result)
            print('Generating Fixed H InChI for molecule file ... ')
            inchi, code, msg, log, aux = rdinchi.MolToInchi(
                mol, options='-FixedH -DoNotAddH')
            # As written, msg is printed for every return code (0 included):
            # the second ``if`` is independent of the first.
            if code == 0:
                #print(inchi)
                pass
            if code == 1:
                # print(inchi)
                print(msg)
            else:
                print(msg)
            print('Generating Standard InChI for molecule file ... ')
            sinchi, code, msg, log, aux = rdinchi.MolToInchi(
                mol, options=' -DoNotAddH')
            if code == 0:
                #print(sinchi)
                pass
            if code == 1:
                #print(sinchi)
                print(msg)
            else:
                print(msg)
            # Keep only the connectivity part of the Fixed-H InChI.
            maininchi = truncate_inchi(inchi, ['connect'])
            print('Generating Fixed H InChI for PDB molecule ... ')
            pdbinchi, code, msg, log, aux = rdinchi.MolToInchi(
                newpdbmol, options='-FixedH -DoNotAddH')
            if code == 0:
                pass
            if code == 1:
                print(msg)
            else:
                print(msg)
            print('Generating Standard InChI for PDB molecule ... ')
            pdbsinchi, code, msg, log, aux = rdinchi.MolToInchi(
                newpdbmol, options=' -DoNotAddH')
            if code == 0:
                pass
            if code == 1:
                print(msg)
            else:
                print(msg)
            pdbmaininchi = truncate_inchi(pdbinchi, ['connect'])
            result = 'OK'
            # NOTE(review): the checks below are laid out coarse-to-fine, each
            # finer check running only when the coarser one passed. This
            # nesting is inferred, not certain -- TODO confirm.
            if maininchi != pdbmaininchi:
                result = 'FAIL: Molecules have diferent scaffolds\n' + maininchi + ' ' + pdbmaininchi + '.'
                failnum += 1
                print('Main chain InChI check: ' + result)
            else:
                print('Main chain InChI check: ' + result)
                result = 'OK'
                if checkconnect:
                    if sinchi != pdbsinchi:
                        result = 'FAIL: Molecules are not the same compound or have different net charge.\n' + sinchi + '\n' + pdbsinchi + '.'
                        failnum += 1
                        print('Standard InChI check: ' + result)
                    else:
                        print('Standard InChI check: ' + result)
                        result = 'OK'
                        if inchi != pdbinchi:
                            result = 'FAIL: Molecules have different protonation/tautomery\n' + inchi + '\n' + pdbinchi + '.'
                            failnum += 1
                        print('Fixed H InChI check: ' + result)
            print('OK')
            return failnum, newpdbmol, nhpdbmol
def get_mod_elem(self, elem_dct=None, mod=None, lpp_info_dct=None):
    """Return the element counts of a species after applying a modification.

    Args:
        elem_dct: Element-count dict of the unmodified species. When ``None``
            the composition is derived from ``lpp_info_dct`` instead (or is
            empty when that is ``None`` too). The dict is not modified.
        mod: Modification label, e.g. ``'[M-H2O-H]-'``. Every known loss or
            gain whose text occurs in the label (``-H2O``, ``+H2O``, ``-CO2``,
            ``-C3H9N``, ``-CH3COOH``, ``-CH3``, ``-C3H5NO2``, ``+HCOO``,
            ``+CH3``) is added to the composition. ``None`` or ``''`` applies
            nothing.
        lpp_info_dct: Dict read for the keys ``'LPP_CLASS'``,
            ``'LPP_SMILES'``, ``'SN1_SMILES'`` and ``'SN2_SMILES'``; only
            used when ``elem_dct`` is ``None``. The fragment selected by
            ``mod`` is built as SMILES and converted to a formula.

    Returns:
        dict mapping element symbol to count over the union of the base
        composition and the modification. When a modification is given, the
        elements C, H, N, O and P are always present (possibly as 0).
    """
    mod_dct = {
        '-H2O': {'H': -2, 'O': -1},
        '+H2O': {'H': 2, 'O': 1},
        '-CO2': {'C': -1, 'O': -2},
        '+HCOO': {'H': 1, 'C': 1, 'O': 2},
        '-C3H9N': {'C': -3, 'H': -9, 'N': -1},
        '-C3H5NO2': {'C': -3, 'O': -2, 'H': -5, 'N': -1},
        '-CH3COOH': {'C': -2, 'O': -2, 'H': -4},
        '-CH2': {'C': -1, 'H': -2},
        '-H': {'H': -1},
        '-CH3': {'C': -1, 'H': -3},
        '+CH3': {'C': +1, 'H': +3}
    }
    # (regex tested against ``mod``, key into mod_dct)
    # NOTE(review): the '-CH3' pattern also matches labels that contain
    # '-CH3COOH', so both corrections are applied to such labels -- confirm
    # this is intended.
    mod_checks = (
        (r'.*[-]H2O.*', '-H2O'),
        (r'.*[+]H2O.*', '+H2O'),
        (r'.*[-]CO2.*', '-CO2'),
        (r'.*[-]C3H9N.*', '-C3H9N'),
        (r'.*[-]CH3COOH.*', '-CH3COOH'),
        (r'.*[-]CH3.*', '-CH3'),
        (r'.*[-]C3H5NO2.*', '-C3H5NO2'),
        (r'.*[+]HCOO.*', '+HCOO'),
        (r'.*[+]CH3.*', '+CH3'),
    )

    # get the formula as dict
    if elem_dct is not None:
        _elem_dct = elem_dct.copy()
    elif lpp_info_dct is None:
        _elem_dct = {}
    else:
        _lpp_type = lpp_info_dct['LPP_CLASS']
        _lpp_full_smi = lpp_info_dct['LPP_SMILES']
        _sn1_smi = lpp_info_dct['SN1_SMILES']
        _sn2_smi = lpp_info_dct['SN2_SMILES']
        _lyso_smi = 'O'
        # re.match needs a string; a missing label selects no fragment.
        _frag_mod = '' if mod is None else mod

        # PC M == M-CH3, hence the optional 'CH3-' in the two loss patterns.
        if re.match(r'\[M-(CH3-)?[sS][nN][1].*', _frag_mod):
            _frag_smi = pl_lpp(_lpp_type, sn1=_lyso_smi, sn2=_sn2_smi)
        elif re.match(r'\[M-(CH3-)?[sS][nN][2].*', _frag_mod):
            _frag_smi = pl_lpp(_lpp_type, sn1=_sn1_smi, sn2=_lyso_smi)
        elif re.match(r'\[[sS][nN][1].*', _frag_mod):
            _frag_smi = _sn1_smi
        elif re.match(r'\[[sS][nN][2].*', _frag_mod):
            _frag_smi = _sn2_smi
        elif re.match(r'\[M[+-][HCFA].*', _frag_mod):
            _frag_smi = _lpp_full_smi
        else:
            _frag_smi = ''

        _mol = Chem.MolFromSmiles(_frag_smi)
        AllChem.Compute2DCoords(_mol)
        _formula = rdMolDescriptors.CalcMolFormula(_mol)
        _elem_dct = self.parse_formula(_formula)

    if mod is None or mod == '':
        _mod_sum_elem_dct = {}
    else:
        _mod_sum_elem_dct = {'C': 0, 'H': 0, 'N': 0, 'O': 0, 'P': 0}
        for _pattern, _mod_key in mod_checks:
            if not re.match(_pattern, mod):
                continue
            for _key, _count in mod_dct[_mod_key].items():
                _mod_sum_elem_dct[_key] += _count
            if _mod_key == '-CH3':
                _mod_sum_elem_dct['H'] += 1  # For PC only

        # Labels of the form '<class>_<formula><sign>' carry an explicit
        # formula that is added together with one extra hydrogen.
        _head_match = re.match(r'(P[ACEGSI]4?P?_)(.*)([+-])', mod)
        if _head_match:
            _mod_elem_dct = self.parse_formula(formula=_head_match.group(2))
            # .get() keeps this working for formulas without H and for
            # elements outside the C/H/N/O/P defaults.
            _mod_elem_dct['H'] = _mod_elem_dct.get('H', 0) + 1
            for _key, _count in _mod_elem_dct.items():
                _mod_sum_elem_dct[_key] = (
                    _mod_sum_elem_dct.get(_key, 0) + _count)

    # get sum keys form both dict; a set union works on Python 2 and 3,
    # unlike concatenating the .keys() views with sum().
    _frag_elem_dct = {}
    for _key in set(_elem_dct) | set(_mod_sum_elem_dct):
        _frag_elem_dct[_key] = (_elem_dct.get(_key, 0)
                                + _mod_sum_elem_dct.get(_key, 0))
    return _frag_elem_dct