def InchiToInchiKey(inchi):
    """Return the InChIKey for the given InChI string.

    The conversion is delegated to ``rdinchi.InchiToInchiKey``; any falsy
    result (for example an empty string) is reported as ``None``.
    """
    key = rdinchi.InchiToInchiKey(inchi)
    return key if key else None
def smiles_to_inchi_inchikey(smiles, verbose=0):
    """Convert SMILES strings to InChI strings and InChIKeys.

    Every SMILES is parsed into a molecule and converted to an InChI and an
    InChIKey.  As a consistency check the InChI is converted back to a
    molecule and a second InChI is generated from it; each entry whose two
    InChIs differ is counted as a warning.  A summary with the number of
    errors and warnings is always printed.

    Args:
        smiles: iterable of SMILES strings.
        verbose: any value other than 0 prints per-entry diagnostics.

    Returns:
        A tuple ``(inchis, inchiKeys)`` of two lists aligned with ``smiles``.
        Entries whose SMILES could not be parsed hold the placeholder "XXX"
        in both lists.
    """
    error_counter = 0
    warning_counter = 0
    inchis = []
    inchis2 = []
    inchiKeys = []

    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        if mol is None:
            # Unparsable SMILES: keep all lists aligned with placeholders.
            inchis.append("XXX")
            inchiKeys.append("XXX")
            inchis2.append("XXX")
            if verbose != 0:
                print(elem + ": can not converted added XXX instead! ")
            error_counter += 1
            continue

        inchi, retcode, message, logs, aux = rdinchi.MolToInchi(mol)
        inchis.append(inchi)
        inchiKeys.append(rdinchi.InchiToInchiKey(inchi))

        # Round trip InChI -> molecule -> InChI.  The result is appended
        # exactly once, whatever happens inside the try block.
        round_trip = "XXX"
        try:
            mol2, retcode, message, logs = rdinchi.InchiToMol(inchi)
            if mol2 is not None:
                round_trip = rdinchi.MolToInchi(mol2)[0]
            elif verbose != 0:
                print(elem + ": InChI cannot converted to mol object, added XXX instead!")
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so
            # KeyboardInterrupt / SystemExit still propagate.
            if verbose != 0:
                print(retcode)
                print(message)
                print("Smiles:" + elem)
        inchis2.append(round_trip)

    for i1, i2 in zip(inchis, inchis2):
        if i1 != i2:
            if verbose != 0:
                print("Warning:" + i1 + " - " + i2)
            warning_counter += 1

    print("\nGeneration of InChI and InChIKey from SMILES is completed.")
    print("Total errors:" + str(error_counter))
    print("Total warnings:" + str(warning_counter) + "\n")
    return inchis, inchiKeys
def parse_file(input_file, db_name):
    """Parse the text of a nanpdb database file into column lists.

    Each non-blank line is split on tabs; the first column is read as the
    SMILES and the second as the source identifier.  Empty columns are
    replaced by "NA" and ``db_name`` is appended as the last column of the
    row (so a one-column row yields ``db_name`` as its identifier).

    Args:
        input_file: full text of the nanpdb database txt file.
        db_name: database name appended to every parsed row.

    Returns:
        A tuple ``(attribute_names, overall_list)``.  ``overall_list`` holds
        eleven lists in the order of ``attribute_names``; the last four
        (classification columns) are filled with "NA".
    """
    attribute_names = [
        'MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
        'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',
        'superclass_name', 'class_name', 'subclass_name',
    ]
    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    for raw_line in input_file.split('\n'):
        # Skip blank lines (including the empty string left by a trailing
        # newline) instead of unconditionally dropping the last line, which
        # lost the final record of files without a trailing newline.
        if not raw_line.strip():
            continue
        row = [value or "NA" for value in raw_line.split('\t')]
        row.append(db_name)
        smiles = row[0]
        identifier = row[1]

        # generate molecule
        m = Chem.MolFromSmiles(smiles)

        # MonoisotopicMass
        # NOTE(review): the mass is stored as the float's string form minus
        # its last five characters; kept as-is for output compatibility.
        mol_mass_list.append(str(Descriptors.ExactMolWt(m))[:-5])

        # InChI; anything that is not an InChI string is recorded as 'NA'
        inchi = rdinchi.MolToInchi(m)[0]
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

        SMILES_list.append(smiles)
        identifier_list.append(identifier)

        # MolecularFormula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(m))

        # InChIKey1 and InChIKey2 are the first and second dash-separated
        # parts of the key; fall back to 'NA' for both when no key exists.
        inchi_key = rdinchi.InchiToInchiKey(inchi) if inchi != 'NA' else ''
        key_parts = (inchi_key or 'NA-NA').split('-')
        inchi_key1_list.append(key_parts[0])
        inchi_key2_list.append(key_parts[1])

    # Placeholder column shared by the four classification attributes.
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [
        mol_mass_list, inchi_list, SMILES_list, identifier_list,
        inchi_key2_list, inchi_key1_list, mol_formula_list,
        NA_list, NA_list, NA_list, NA_list,
    ]
    return attribute_names, overall_list
def parse_data(input_file):
    """Parse the text of a norine database file into CLASS column lists.

    The first two lines are skipped.  Each remaining non-blank line is split
    on tabs; column 2 is read as the SMILES and column 0 as the source
    identifier.  Rows whose SMILES cannot be parsed into a molecule are
    dropped.

    Args:
        input_file: full text of the norine database txt file.

    Returns:
        A tuple ``(attribute_names, overall_list)``.  ``overall_list`` holds
        eleven lists in the order of ``attribute_names``; the last four
        (classification columns) are filled with "NA".  A failed InChI is
        stored as "NA", with "NA" for both InChIKey parts.

    Raises:
        IndexError: if a non-blank data line has fewer than three columns.
    """
    attribute_names = [
        'MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
        'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',
        'superclass_name', 'class_name', 'subclass_name',
    ]
    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    for raw_line in input_file.split('\n')[2:]:
        # A trailing newline leaves an empty last element; indexing its
        # columns used to raise IndexError, so blank lines are skipped.
        if not raw_line.strip():
            continue
        columns = raw_line.split('\t')
        smiles = columns[2]
        identifier = columns[0]

        # Convert to mol and remove invalid structures
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        # Source identifier
        identifier_list.append(identifier)
        # SMILES as written back by Chem.MolToSmiles
        SMILES_list.append(Chem.MolToSmiles(mol))
        # Monoisotopic mass
        mol_mass_list.append(Descriptors.ExactMolWt(mol))
        # Mol formula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

        # InChI; anything that is not an InChI string is recorded as 'NA'
        inchi = rdinchi.MolToInchi(mol)[0]
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

        # InChIKey1 and InChIKey2 are the first and second dash-separated
        # parts of the key; an empty/None key falls back to 'NA-NA' rather
        # than crashing on the split.
        inchi_key = rdinchi.InchiToInchiKey(inchi) if inchi != 'NA' else ''
        key_parts = (inchi_key or 'NA-NA').split('-')
        inchi_key1_list.append(key_parts[0])
        inchi_key2_list.append(key_parts[1])

    # Placeholder column shared by the four classification attributes.
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [
        mol_mass_list, inchi_list, SMILES_list, identifier_list,
        inchi_key2_list, inchi_key1_list, mol_formula_list,
        NA_list, NA_list, NA_list, NA_list,
    ]
    return attribute_names, overall_list
def create_CLASS_data(data_dict):
    """Generate CLASS data for the strepto data present in ``data_dict``.

    Args:
        data_dict: streptodb dictionary; its 'compound_id' and
            'canonical_smiles' entries are read as parallel lists of
            identifiers and SMILES strings.

    Returns:
        A tuple ``(attribute_names, overall_list)``.  ``overall_list`` holds
        eleven lists in the order of ``attribute_names``; the identifier and
        SMILES lists are the very list objects taken from ``data_dict`` and
        the last four (classification columns) are filled with "NA".  A
        failed InChI is stored as "NA", with "NA" for both InChIKey parts.

    Raises:
        KeyError: if 'compound_id' or 'canonical_smiles' is missing.
    """
    attribute_names = [
        'MonoisotopicMass', 'InChI', 'SMILES', 'Identifier',
        'InChIKey2', 'InChIKey1', 'MolecularFormula', 'kingdom_name',
        'superclass_name', 'class_name', 'subclass_name',
    ]
    mol_mass_list = []
    inchi_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    # Identifier
    identifier_list = data_dict['compound_id']
    # SMILES
    SMILES_list = data_dict['canonical_smiles']

    for smiles in SMILES_list:
        # generate molecule
        m = Chem.MolFromSmiles(smiles)

        # MonoisotopicMass
        # NOTE(review): the mass is stored as the float's string form minus
        # its last three characters; kept as-is for output compatibility.
        mol_mass_list.append(str(Descriptors.ExactMolWt(m))[:-3])

        # InChI; anything that is not an InChI string is recorded as 'NA'
        # so that the key split below cannot fail on an empty key.
        inchi = rdinchi.MolToInchi(m)[0]
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

        # MolecularFormula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(m))

        # InChIKey1 and InChIKey2 are the first and second dash-separated
        # parts of the key.
        inchi_key = rdinchi.InchiToInchiKey(inchi) if inchi != 'NA' else ''
        key_parts = (inchi_key or 'NA-NA').split('-')
        inchi_key1_list.append(key_parts[0])
        inchi_key2_list.append(key_parts[1])

    # Placeholder column shared by the four classification attributes.
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [
        mol_mass_list, inchi_list, SMILES_list, identifier_list,
        inchi_key2_list, inchi_key1_list, mol_formula_list,
        NA_list, NA_list, NA_list, NA_list,
    ]
    return attribute_names, overall_list
def generate_data(input_file):
    """Generate the column lists needed for the sqlite database.

    Each non-blank line of the input text is split on tabs; column 0 is read
    as the SMILES and column 1 as the source identifier.  Rows whose SMILES
    cannot be parsed into a molecule are dropped.

    Args:
        input_file: full text of the input structure txt file.

    Returns:
        A list of eleven lists, in this order: monoisotopic mass, InChI,
        SMILES, identifier, second InChIKey part, first InChIKey part,
        molecular formula, followed by four columns filled with "NA".  A
        failed InChI is stored as "NA", with "NA" for both InChIKey parts.

    Raises:
        IndexError: if a non-blank line has fewer than two columns.
    """
    mol_mass_list = []
    inchi_list = []
    SMILES_list = []
    identifier_list = []
    inchi_key1_list = []
    inchi_key2_list = []
    mol_formula_list = []

    for raw_line in input_file.split('\n'):
        # Only a single trailing empty line used to be tolerated; any other
        # blank line raised IndexError on the identifier column.
        if not raw_line.strip():
            continue
        columns = raw_line.split('\t')
        smiles = columns[0]
        identifier = columns[1]

        # Convert to mol and remove invalid structures
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        # Source identifier
        identifier_list.append(identifier)
        # SMILES, canonical
        SMILES_list.append(Chem.MolToSmiles(mol))
        # Monoisotopic mass
        mol_mass_list.append(Descriptors.ExactMolWt(mol))
        # Mol formula
        mol_formula_list.append(rdMolDescriptors.CalcMolFormula(mol))

        # InChI; anything that is not an InChI string is recorded as 'NA'
        inchi = rdinchi.MolToInchi(mol)[0]
        if not str(inchi).startswith('InCh'):
            inchi = 'NA'
        inchi_list.append(inchi)

        # InChIKey1 and InChIKey2 are the first and second dash-separated
        # parts of the key; an empty/None key falls back to 'NA-NA' rather
        # than crashing on the split.
        inchi_key = rdinchi.InchiToInchiKey(inchi) if inchi != 'NA' else ''
        key_parts = (inchi_key or 'NA-NA').split('-')
        inchi_key1_list.append(key_parts[0])
        inchi_key2_list.append(key_parts[1])

    # Placeholder column shared by the four classification attributes.
    NA_list = ['NA'] * len(SMILES_list)

    overall_list = [
        mol_mass_list, inchi_list, SMILES_list, identifier_list,
        inchi_key2_list, inchi_key1_list, mol_formula_list,
        NA_list, NA_list, NA_list, NA_list,
    ]
    return overall_list