def smiles_to_inchi_inchikey(smiles, verbose=0):
    """Convert SMILES strings to InChI strings and InChIKeys.

    Every SMILES is parsed with ``Chem.MolFromSmiles`` and converted with
    ``rdinchi.MolToInchi`` / ``rdinchi.InchiToInchiKey``.  As a round-trip
    check the InChI is converted back to a molecule and a second InChI is
    generated from it; input/round-trip pairs that differ are counted as
    warnings.  Entries that cannot be converted are represented by the
    placeholder ``"XXX"`` so that the returned lists stay aligned with the
    input.

    Args:
        smiles: iterable of SMILES strings.
        verbose: any value other than 0 prints per-entry diagnostics.  The
            final summary (error and warning totals) is always printed.

    Returns:
        A tuple ``(inchis, inchiKeys)`` of two lists with one entry per
        input SMILES.
    """
    error_counter = 0
    warning_counter = 0
    inchis = []
    inchis2 = []  # InChIs regenerated from the round-tripped molecule
    inchiKeys = []

    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        if mol is None:
            inchis.append("XXX")
            inchiKeys.append("XXX")
            inchis2.append("XXX")
            if verbose != 0:
                print(elem + ": can not converted added XXX instead! ")
            error_counter += 1
            continue

        inchi, retcode, message, logs, aux = rdinchi.MolToInchi(mol)
        inchiKey = rdinchi.InchiToInchiKey(inchi)
        inchis.append(inchi)
        inchiKeys.append(inchiKey)

        try:
            mol, retcode, message, logs = rdinchi.InchiToMol(inchi)
            if mol is not None:
                inchi2, retcode, message, logs, aux = rdinchi.MolToInchi(mol)
                inchis2.append(inchi2)
            else:
                inchis2.append("XXX")
                if verbose != 0:
                    print(elem + ": InChI cannot converted to mol object, added XXX instead!")
        except Exception:
            # Only genuine errors are treated as a failed round trip;
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            # NOTE: retcode/message hold the values of the last call that
            # returned successfully, which may predate the failing call.
            if verbose != 0:
                print(retcode)
                print(message)
                print("Smiles:" + elem)
            inchis2.append("XXX")

    # A mismatch between the direct and the round-tripped InChI is a warning.
    for i1, i2 in zip(inchis, inchis2):
        if i1 != i2:
            if verbose != 0:
                print("Warning:" + i1 + " - " + i2)
            warning_counter += 1

    print("\nGeneration of InChI and InChIKey from SMILES is completed.")
    print("Total errors:" + str(error_counter))
    print("Total warnings:" + str(warning_counter) + "\n")
    return inchis, inchiKeys
def MolToInchiAndAuxInfo(mol, options="", logLevel=None, treatWarningAsError=False):
    """Returns the standard InChI string and InChI auxInfo for a molecule

    Keyword arguments:
    logLevel -- the log level used for logging logs and messages from InChI
    API. set to None to disable the logging completely
    treatWarningAsError -- set to True to raise an exception in case of a
    molecule that generates warning in calling InChI API. The resultant InChI
    string and AuxInfo string as well as the error message are encoded in the
    exception.

    Returns:
    a tuple of the standard InChI string and the auxInfo string returned by
    InChI API, in that order, for the input molecule

    Raises:
    ValueError -- if logLevel is not None and is not a key of
    logLevelToLogFunctionLookup
    InchiReadWriteError -- if treatWarningAsError is true and the InChI API
    returned a non-zero return code
    """
    inchi, retcode, message, logs, aux = rdinchi.MolToInchi(mol, options)

    if logLevel is not None:
        if logLevel not in logLevelToLogFunctionLookup:
            # %r instead of %d: a non-numeric level (e.g. a string) must
            # produce this ValueError, not a TypeError from the formatting.
            raise ValueError("Unsupported log level: %r" % (logLevel,))
        log = logLevelToLogFunctionLookup[logLevel]
        if retcode == 0:
            log(message)

    # Non-zero return codes are always reported through the module logger:
    # 1 is logged as a warning, anything else as an error.
    if retcode != 0:
        if retcode == 1:
            logger.warning(message)
        else:
            logger.error(message)

    if treatWarningAsError and retcode != 0:
        raise InchiReadWriteError(inchi, aux, message)
    return inchi, aux
def sln_to_smiles(sln_list, verbose=0):
    """Convert SLN strings to SMILES strings.

    Every SLN is parsed with ``rdSLNParse.MolFromQuerySLN`` and written out
    with ``Chem.MolToSmiles``.  As a consistency check an InChI is generated
    both from the SLN-derived molecule and from the molecule re-parsed from
    the produced SMILES; pairs that differ are counted as warnings.

    Args:
        sln_list: iterable of SLN strings.
        verbose: any value other than 0 prints the warning details and the
            SMILES re-parse failures.  SLN parse failures and the final
            summary are always printed.

    Returns:
        List of SMILES strings, one per input SLN; ``"XXX"`` marks an SLN
        that could not be parsed.
    """
    error_counter = 0
    warning_counter = 0
    smiles_list = []
    inchis_from_sln = []
    inchis_from_smiles = []

    for elem in sln_list:
        mol = rdSLNParse.MolFromQuerySLN(elem)
        if mol is None:
            smiles_list.append("XXX")
            # Printed regardless of ``verbose`` (unchanged behaviour).
            print(elem + ": SLN can not converted to mol object, added XXX instead!")
            error_counter += 1
            continue

        smiles = Chem.MolToSmiles(mol)
        smiles_list.append(smiles)
        inchi, retcode, message, logs, aux = rdinchi.MolToInchi(mol)
        inchis_from_sln.append(inchi)

        # Re-parse the generated SMILES to verify it describes the same
        # structure as the SLN it came from.
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            inchi, retcode, message, logs, aux = rdinchi.MolToInchi(mol)
            inchis_from_smiles.append(inchi)
        else:
            inchis_from_smiles.append("XXX")
            if verbose != 0:
                print(elem + ": SMILES cannot converted to mol object, added XXX instead!")

    for i1, i2 in zip(inchis_from_sln, inchis_from_smiles):
        if i1 != i2:
            if verbose != 0:
                print("Warning:" + i1 + " - " + i2)
            warning_counter += 1

    print("\nConversion from SLN to SMILES is completed.")
    print("Total errors:" + str(error_counter))
    print("Total warnings:" + str(warning_counter) + "\n")
    return smiles_list
def parse_file(input_file, db_name):
    """
    takes all text from nanpdb database file and returns the attribute names
    together with one column (list) per attribute.

    Each non-blank line is split on tabs; the first field is used as SMILES
    and the second as identifier.  Empty fields are replaced by "NA" and
    db_name is appended to every row, so a row with a single field gets
    db_name as its identifier.

    input_file: nanpdb database text (the file content as one string)
    db_name: database name

    returns: (attribute_names, overall_list) where overall_list holds one
    list per entry of attribute_names, in the same order.  The four
    classification columns are filled with 'NA'.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']

    all_info_list = []
    for raw_line in input_file.split('\n'):
        # Skip blank lines (including the empty string left behind by a
        # trailing newline).  The last line is no longer dropped
        # unconditionally, so input without a final newline keeps its last
        # record.
        if not raw_line.strip():
            continue
        info_per_row_list = [value if value else "NA"
                             for value in raw_line.split('\t')]
        info_per_row_list.append(db_name)
        all_info_list.append(info_per_row_list)

    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    for row in all_info_list:
        smiles, identifier = row[0], row[1]
        m = Chem.MolFromSmiles(smiles)

        # MonoisotopicMass, stored as text with the last five characters cut
        # off.  NOTE(review): presumably meant to shorten the decimals --
        # confirm the intended precision.
        mol_mass_list.append(str(Descriptors.ExactMolWt(m))[:-5])

        # InChI (first element of the tuple returned by rdinchi.MolToInchi)
        inchi = rdinchi.MolToInchi(m)[0]
        inchi_list.append(inchi)

        SMILES_list.append(smiles)
        identifier_list.append(identifier)
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(m))

        # InChIKey blocks: first block -> InChIKey1, second -> InChIKey2
        inchikey = rdinchi.InchiToInchiKey(inchi).split('-')
        inchi_key1_list.append(inchikey[0])
        inchi_key2_list.append(inchikey[1])

    # Four independent 'NA' columns, so mutating one does not affect the
    # others.
    nr_of_structures = len(all_info_list)
    na_columns = [['NA'] * nr_of_structures for _ in range(4)]

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list]
    overall_list += na_columns
    return attribute_names, overall_list
def parse_data(input_file):
    """
    takes all text from norine database file and returns a list of lists
    with all CLASS data and an attribute list.

    The first two lines are skipped.  Every remaining non-blank line is
    split on tabs; the first field is the identifier and the third field the
    SMILES.  Rows whose SMILES cannot be parsed by RDKit are left out.

    input_file: norine database text (the file content as one string)

    returns: (attribute_names, overall_list) where overall_list holds one
    list per entry of attribute_names, in the same order.  The four
    classification columns are filled with 'NA'.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']
    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    for raw_line in input_file.split('\n')[2:]:
        # A trailing newline leaves an empty last line; indexing its fields
        # would raise IndexError, so blank lines are skipped.
        if not raw_line.strip():
            continue
        fields = raw_line.split('\t')
        smiles = fields[2]
        identifier = fields[0]

        # Convert to mol and leave out invalid structures
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        identifier_list.append(identifier)
        # SMILES as written back by RDKit
        SMILES_list.append(Chem.MolToSmiles(mol))
        # Monoisotopic mass
        mol_mass_list.append(Descriptors.ExactMolWt(mol))
        # Mol formula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

        # InChI; anything that does not look like an InChI becomes 'NA'
        inchi = rdinchi.MolToInchi(mol)[0]
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

        # InChIKey1 and InChIKey2 (first and second block of the key)
        if inchi == 'NA':
            inchi_key = "NA-NA"
        else:
            inchi_key = rdinchi.InchiToInchiKey(inchi)
        key_blocks = inchi_key.split('-')
        inchi_key2_list.append(key_blocks[1])
        inchi_key1_list.append(key_blocks[0])

    # Four independent 'NA' columns, so mutating one does not affect the
    # others.
    nr_of_structures = len(SMILES_list)
    na_columns = [['NA'] * nr_of_structures for _ in range(4)]

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list]
    overall_list += na_columns
    return attribute_names, overall_list
def create_CLASS_data(data_dict):
    """
    Generates CLASS data for the strepto data present in data_dict.

    data_dict: streptodb dictionary; its 'compound_id' and
    'canonical_smiles' entries are read and returned unchanged as the
    Identifier and SMILES columns.

    returns: (attribute_names, overall_list) where overall_list holds one
    list per entry of attribute_names, in the same order.  The four
    classification columns are filled with 'NA'.
    """
    attribute_names = ['MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
                       'InChIKey2', 'InChIKey1', 'MolecularFormula',
                       'kingdom_name', 'superclass_name', 'class_name',
                       'subclass_name']

    # Identifier and SMILES columns are taken from the dictionary as-is.
    identifier_list = data_dict['compound_id']
    SMILES_list = data_dict['canonical_smiles']

    mol_mass_list = []
    inchi_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    for SMILE in SMILES_list:
        # generate molecule
        m = Chem.MolFromSmiles(SMILE)

        # MonoisotopicMass, stored as text with the last three characters
        # cut off.  NOTE(review): presumably meant to shorten the decimals
        # -- confirm the intended precision.
        mol_mass_list.append(str(Descriptors.ExactMolWt(m))[:-3])

        # InChI (first element of the tuple returned by rdinchi.MolToInchi)
        inchi = rdinchi.MolToInchi(m)[0]
        inchi_list.append(inchi)

        # MolecularFormula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(m))

        # InChIKey blocks: first block -> InChIKey1, second -> InChIKey2
        inchikey = rdinchi.InchiToInchiKey(inchi).split('-')
        inchi_key1_list.append(inchikey[0])
        inchi_key2_list.append(inchikey[1])

    # Four independent 'NA' columns, so mutating one does not affect the
    # others.
    nr_of_structures = len(data_dict['canonical_smiles'])
    na_columns = [['NA'] * nr_of_structures for _ in range(4)]

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list]
    overall_list += na_columns
    return attribute_names, overall_list
def _announce_and_generate_inchi(mol, options, announcement):
    """Print ``announcement``, generate an InChI for ``mol`` and print the InChI API message."""
    print(announcement)
    inchi, code, msg, log, aux = rdinchi.MolToInchi(mol, options=options)
    # The original code branched on the return code but printed the message
    # in every case (0, 1 and anything else), so it is printed
    # unconditionally here.
    print(msg)
    return inchi


def diff_mol_pdb(mol, pdbfile, logfile=devnull):
    """Compare a molecule with the molecule read from a PDB file.

    The PDB molecule is loaded (sanitized if possible, unsanitized
    otherwise), bond orders are assigned from ``mol`` as template, and the
    two molecules are compared in up to four steps, each adding one to the
    failure count when it does not match:

    1. molecular formula with the charge removed,
    2. the 'connect' part of the fixed-H InChI,
    3. the standard InChI (only if step 2 matched and the bond-order
       assignment succeeded),
    4. the fixed-H InChI (only if step 3 matched).

    All work runs inside two ``stdout_redirected`` contexts targeting
    ``logfile``.  NOTE(review): presumably this sends stdout and stderr
    output to ``logfile`` -- confirm against ``stdout_redirected``.

    Args:
        mol: RDKit molecule used as reference; passed to ``remove_isotopes``
            before the comparison.
        pdbfile: path of the PDB file to compare against.
        logfile: target passed to ``stdout_redirected``.

    Returns:
        Tuple ``(failnum, newpdbmol, nhpdbmol)``: number of failed checks,
        the PDB molecule after bond-order assignment and hydrogen handling
        (or ``nhpdbmol`` when the assignment failed), and the PDB molecule
        without hydrogens.

    Raises:
        ParsingError: if the PDB file cannot be read as a molecule.
    """
    with stdout_redirected(to=logfile, stdout=sys_stderr):
        with stdout_redirected(to=logfile, stdout=sys_stdout):
            remove_isotopes(mol, sanitize=True)
            nhmol = Chem.RemoveHs(mol, implicitOnly=False,
                                  updateExplicitCount=True, sanitize=True)
            try:
                Chem.Kekulize(nhmol)
            except Exception:
                # Best effort: continue with the non-kekulized molecule.
                pass

            checkconnect = True
            pdbmol = None
            try:
                pdbmol = Chem.MolFromPDBFile(pdbfile, removeHs=False, sanitize=True)
            except Exception:
                # Fall through to the unsanitized read below.
                pass
            if pdbmol is None:
                pdbmol = Chem.MolFromPDBFile(pdbfile, removeHs=False, sanitize=False)
                if pdbmol is None:
                    raise ParsingError("Cannot open PDB molecule.")

            pdbmol = disconnect(pdbmol)
            Chem.SanitizeMol(pdbmol, catchErrors=True)
            nhpdbmol = Chem.RemoveHs(pdbmol, implicitOnly=False,
                                     updateExplicitCount=True, sanitize=False)
            Chem.SanitizeMol(nhpdbmol, catchErrors=True)

            try:
                print(
                    'Applying bond orders and formal charges from molecule file to PDB molecule ... '
                )
                nhpdbmol = AssignBondOrdersFromTemplate(nhmol, nhpdbmol)
                newpdbmol = Chem.AddHs(nhpdbmol, addCoords=True, explicitOnly=True)
                newpdbmol.UpdatePropertyCache()
                newpdbmol = correct_hydrogen_num_from_pdbmol(pdbmol, newpdbmol)
                newpdbmol = set_hydrogen_coor_from_pdbmol(pdbmol, newpdbmol,
                                                          refconfId=-1, confId=-1)
            except Exception:
                print(
                    "WARNING: Cannot assign bond orders from molecule file template. Checking only non-hydrogen connectivity."
                )
                # Without template bond orders the standard and fixed-H
                # InChI comparisons are skipped.
                checkconnect = False
                newpdbmol = nhpdbmol

            # Stoichiometric formula check
            impnum = count_implicit_hydrogens(newpdbmol)
            failnum = 0
            result = 'OK'
            unformula = remove_charge_formula(
                rdMolDescriptors.CalcMolFormula(mol))
            pdbunformula = remove_charge_formula(
                rdMolDescriptors.CalcMolFormula(newpdbmol))
            pdbunformula = fix_formula(pdbunformula, impnum)
            if unformula != pdbunformula:
                failnum += 1
                result = 'FAIL: Molecules have different Stoichiometric formulas ' + unformula + ' ' + pdbunformula + '.'
            print('Stoichiometric formula check (without charge): ' + result)

            inchi = _announce_and_generate_inchi(
                mol, '-FixedH -DoNotAddH',
                'Generating Fixed H InChI for molecule file ... ')
            sinchi = _announce_and_generate_inchi(
                mol, ' -DoNotAddH',
                'Generating Standard InChI for molecule file ... ')
            maininchi = truncate_inchi(inchi, ['connect'])

            pdbinchi = _announce_and_generate_inchi(
                newpdbmol, '-FixedH -DoNotAddH',
                'Generating Fixed H InChI for PDB molecule ... ')
            pdbsinchi = _announce_and_generate_inchi(
                newpdbmol, ' -DoNotAddH',
                'Generating Standard InChI for PDB molecule ... ')
            pdbmaininchi = truncate_inchi(pdbinchi, ['connect'])

            # Cascade: each finer check runs only when the coarser one
            # passed.
            result = 'OK'
            if maininchi != pdbmaininchi:
                result = 'FAIL: Molecules have diferent scaffolds\n' + maininchi + ' ' + pdbmaininchi + '.'
                failnum += 1
                print('Main chain InChI check: ' + result)
            else:
                print('Main chain InChI check: ' + result)
                result = 'OK'
                if checkconnect:
                    if sinchi != pdbsinchi:
                        result = 'FAIL: Molecules are not the same compound or have different net charge.\n' + sinchi + '\n' + pdbsinchi + '.'
                        failnum += 1
                        print('Standard InChI check: ' + result)
                    else:
                        print('Standard InChI check: ' + result)
                        result = 'OK'
                        if inchi != pdbinchi:
                            result = 'FAIL: Molecules have different protonation/tautomery\n' + inchi + '\n' + pdbinchi + '.'
                            failnum += 1
                        print('Fixed H InChI check: ' + result)
            print('OK')
            return failnum, newpdbmol, nhpdbmol
def generate_data(input_file):
    """
    takes all text from the input structure data file and returns a list of
    lists with all generated data needed for the sqlite database.

    Every non-blank line is split on tabs; the first field is the SMILES and
    the second field the identifier.  Rows whose SMILES cannot be parsed by
    RDKit are left out.

    input_file: input structure text (the file content as one string)

    returns: list of eleven columns (lists) in this order: monoisotopic
    mass, InChI, SMILES, identifier, InChIKey second block, InChIKey first
    block, molecular formula, followed by four columns filled with 'NA'.
    """
    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    for raw_line in input_file.split('\n'):
        # Blank lines anywhere in the input (not only a trailing one) are
        # skipped; indexing their fields would raise IndexError.
        if not raw_line.strip():
            continue
        fields = raw_line.split('\t')
        smiles = fields[0]
        identifier = fields[1]

        # Convert to mol and leave out invalid structures
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        # Source identifier
        identifier_list.append(identifier)
        # SMILES as written back by RDKit
        SMILES_list.append(Chem.MolToSmiles(mol))
        # Monoisotopic mass
        mol_mass_list.append(Descriptors.ExactMolWt(mol))
        # Mol formula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

        # InChI; anything that does not look like an InChI becomes 'NA'
        inchi = rdinchi.MolToInchi(mol)[0]
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

        # InChIKey1 and InChIKey2 (first and second block of the key)
        if inchi == 'NA':
            inchi_key = "NA-NA"
        else:
            inchi_key = rdinchi.InchiToInchiKey(inchi)
        key_blocks = inchi_key.split('-')
        inchi_key2_list.append(key_blocks[1])
        inchi_key1_list.append(key_blocks[0])

    # Four independent 'NA' columns, so mutating one does not affect the
    # others.
    nr_of_structures = len(SMILES_list)
    na_columns = [['NA'] * nr_of_structures for _ in range(4)]

    overall_list = [mol_mass_list, inchi_list, SMILES_list, identifier_list,
                    inchi_key2_list, inchi_key1_list, mol_formula_list]
    overall_list += na_columns
    return overall_list