def compute_one_inchi(mol):
    """Compute InChI-based identifiers for one SMILES string.

    Args:
        mol: SMILES text; it is handed directly to ``Chem.MolFromSmiles``.

    Returns:
        A 3-tuple ``(md5_hex, inchikey, inchi_string)`` where ``md5_hex`` is
        the MD5 hex digest of the UTF-8 encoded InChI string. When the SMILES
        cannot be turned into a molecule the result is
        ``(None, None, None)``.
    """
    parsed = Chem.MolFromSmiles(mol)
    # A failed parse is reported as None; compare by identity rather than
    # relying on the molecule type's ``==`` implementation.
    if parsed is None:
        return (None, None, None)

    ini = inchi.MolToInchi(parsed)
    inikey = inchi.InchiToInchiKey(ini)
    # The digest is used as a compact identifier of the InChI text only,
    # not for any security purpose.
    hsh = hashlib.md5(ini.encode('utf-8')).hexdigest()
    return (hsh, inikey, ini)
def id_cleanup(df, version):
    """Normalise identifier columns and tag them with a version suffix.

    Rows without an ``InChIKey`` are dropped. Remaining ``InChIKey`` values
    that start with ``"InChI"`` are converted with
    ``inchi.InchiToInchiKey``; every value is then cut to its first 14
    characters. ``broad_id`` and ``deprecated_broad_id`` are stringified and
    cut to 13 characters. Duplicate rows (judged on the four identifier
    columns) are removed and the columns are renamed:

    * ``InChIKey`` -> ``InChIKey14``
    * ``pert_iname`` -> ``pert_iname_<version>``
    * ``broad_id`` -> ``broad_id_<version>``
    * ``deprecated_broad_id`` -> ``deprecated_broad_id_<version>``

    Args:
        df: DataFrame holding the four identifier columns named above.
        version: Suffix interpolated into the renamed column labels.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``df`` is not modified.
    """

    def _key14(value):
        # Full InChI strings are hashed to an InChIKey first; anything else
        # is assumed to already be a key and is only truncated.
        if value.startswith("InChI"):
            value = inchi.InchiToInchiKey(value)
        return str(value)[:14]

    def _first13(value):
        return str(value)[:13]

    cleaned = df.dropna(subset=["InChIKey"]).reset_index(drop=True)
    cleaned.InChIKey = cleaned.InChIKey.apply(_key14)
    cleaned.broad_id = cleaned.broad_id.apply(_first13)
    cleaned.deprecated_broad_id = cleaned.deprecated_broad_id.apply(_first13)

    identifier_cols = [
        "InChIKey", "pert_iname", "broad_id", "deprecated_broad_id"
    ]
    new_labels = {
        "pert_iname": f"pert_iname_{version}",
        "broad_id": f"broad_id_{version}",
        "deprecated_broad_id": f"deprecated_broad_id_{version}",
        "InChIKey": "InChIKey14",
    }

    deduplicated = cleaned.drop_duplicates(identifier_cols)
    renamed = deduplicated.rename(columns=new_labels)
    return renamed.reset_index(drop=True)
def parse_f(f):
    """Print one tab-separated identifier record for the mol file ``f``.

    The CAS key is derived from the file name: the second ``'/'``-separated
    component of ``f`` when ``f`` contains a ``'/'`` (otherwise ``f`` itself),
    cut at the first ``'.'``.

    Identifier data comes from one of two routes:

    1. The mol file is read, converted to InChI and back to a molecule, and
       SMILES / InChI / InChIKey / MW / formula are computed from that
       molecule. Name data (cid, IUPAC name, synonyms) is then looked up by
       InChIKey, going through ``mycache`` first.
    2. If route 1 fails for any reason, or ``syn_data[CAS]`` contains a
       ``'pubchem'`` or ``'formula'`` entry, the values are taken from
       ``syn_data[CAS]`` instead (via a compound lookup by cid when
       ``'pubchem'`` is given, otherwise straight from the dict, in which
       case ``'MW'`` is required).

    Names are afterwards merged with ``pdf_data[CAS]`` and the user supplied
    synonyms, filtered, and the record is written with ``print``.

    Reads the module-level names ``ignored_CASs``, ``syn_data``,
    ``pdf_data``, ``dup_names``, ``all_names``, ``all_user_names`` and
    ``serialize_formula``; reads and writes ``mycache``.
    NOTE(review): ``Chem``/``inchi``/``Descriptors``/``CalcMolFormula`` look
    like RDKit and ``Compound``/``get_compounds`` like PubChemPy - confirm
    against the file's imports.

    Returns:
        Always ``None``; the result is the printed line. Nothing is printed
        when the CAS is ignored or when no data source is available.
    """
    # Defaults used when no name lookup succeeds.
    names = ['']
    cid = -1
    # presumably paths look like "<dir>/<CAS>.mol"; note that index 1 is the
    # second path component, not the last one - verify against callers.
    CAS = f.split('/')[1] if '/' in f else f
    CAS = CAS.split('.')[0]
    if CAS in ignored_CASs:
        return None
    failed_mol = False
    try:
        if CAS in syn_data:
            d = syn_data[CAS]
            # These raises are control flow: they jump to the outer
            # ``except`` so the user-supplied data is used instead of the
            # mol file.
            if 'pubchem' in d:
                raise Exception(
                    'Pubchem specified, not trying to use the mol file')
            elif 'formula' in d:
                raise Exception(
                    'Formula specified, not trying to use the mol file')
        try:
            mol = Chem.MolFromMolFile(f)
            assert mol is not None
        except:
            print('Cannot read %s' % f)
            # Deliberate ZeroDivisionError: forces the outer ``except``.
            1 / 0
        try:
            inchi_val = inchi.MolToInchi(mol)
        except:
            print('BAILING ON %s' % f)
            1 / 0
        mol = inchi.MolFromInchi(inchi_val)  # Works better for ions
        if mol is None:
            print('BAILING ON reconversion to mol %s' % f)
            1 / 0
    except:
        # Fallback route: the mol file was unusable or explicitly bypassed.
        # Exceptions raised inside this handler are not caught here.
        failed_mol = True
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                # Cache entries keyed by cid string hold the full 8-tuple.
                if str(d['pubchem']) in mycache:
                    cid, iupac_name, names, mw, smi, inchi_val, inchikey, formula = mycache[
                        str(d['pubchem'])]
                else:
                    pc = Compound.from_cid(d['pubchem'])
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mw = pc.molecular_weight
                    smi = pc.canonical_smiles
                    inchi_val = pc.inchi
                    inchikey = pc.inchikey
                    formula = pc.molecular_formula
                    mycache[str(d['pubchem'])] = (cid, iupac_name, names, mw,
                                                  smi, inchi_val, inchikey,
                                                  formula)
            else:
                # Purely user-specified compound; only 'MW' is mandatory,
                # every other field falls back to an empty value.
                cid = -1
                names = d['synonyms'] if 'synonyms' in d else ['']
                mw = float(d['MW'])
                smi = d['smiles'] if 'smiles' in d else ''
                formula = d['formula'] if 'formula' in d else ''
                inchi_val = d['inchi'] if 'inchi' in d else ''
                inchikey = d['inchikey'] if 'inchikey' in d else ''
                iupac_name = ''
        else:
            print('FAILED on %s and no custom data was available either' % CAS)
            return None
    if not failed_mol:
        # Primary route: derive every structural identifier from the
        # molecule that survived the InChI round trip.
        smi = Chem.MolToSmiles(mol, True)
        inchi_val = inchi.MolToInchi(mol)
        inchikey = inchi.InchiToInchiKey(inchi_val)
        mw = Descriptors.MolWt(mol)
        # Disabled alternative that switched to the exact mass when any atom
        # carries an isotope label:
        # for i in mol.GetAtoms():
        #     if i.GetIsotope():
        #         mw = Descriptors.ExactMolWt(mol)
        #         break
        formula = CalcMolFormula(mol, True, True)
        iupac_name = ''
    try:
        if not failed_mol:
            # Cache entries keyed by InChIKey hold only (cid, iupac, names).
            if str(inchikey) in mycache:
                cid, iupac_name, names = mycache[str(inchikey)]
            else:
                try:
                    pc = get_compounds(inchikey, 'inchikey')[0]
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mycache[str(inchikey)] = (cid, iupac_name, names)
                except:
                    # Remember the miss so the lookup is not repeated; the
                    # local cid/iupac_name/names keep whatever they held.
                    mycache[str(inchikey)] = (-1, '', [''])
    except:
        cid = -1
        iupac_name = ''
        names = ['']
    other_CAS = []
    if CAS in pdf_data:
        d = pdf_data[CAS]
        name = d['Name']
        if 'Other Names' in d:
            # Same list object as stored in pdf_data, so the insert below
            # also modifies pdf_data[CAS]['Other Names'].
            syns = d['Other Names']
        else:
            syns = []
        # The PDF name becomes the IUPAC name when none is known yet;
        # otherwise it is promoted to the first synonym.
        if not iupac_name:
            iupac_name = name
        else:
            syns.insert(0, name)
        if 'Deleted CAS' in d:
            other_CAS.extend(d['Deleted CAS'])
        if 'Alternate CAS' in d:
            other_CAS.extend(d['Alternate CAS'])
        syns = [i for i in syns if i not in dup_names]
        names = syns + [i for i in names if i not in all_names] + other_CAS
    actual_names = []
    for name in names:
        if name in all_user_names:
            # If the name is in the user db, only add it if it corresponds to this CAS number
            if CAS in syn_data and 'synonyms' in syn_data[
                    CAS] and name in syn_data[CAS]['synonyms']:
                actual_names.append(name)
            else:
                # Discard it otherwise
                pass
        else:
            # If the name is not in the user db we're all good
            actual_names.append(name)
    if CAS in syn_data and 'synonyms' in syn_data[CAS]:
        # If the user has any synonyms for this CAS number, add those names
        # if the name hasn't already been added
        for n in syn_data[CAS]['synonyms']:
            if n not in actual_names:
                actual_names.append(n)
    # Drop empty placeholders such as the default [''].
    actual_names = [i for i in actual_names if i]
    if inchi_val is not None:
        # The standard-InChI prefix is stripped from the stored value.
        inchi_val = inchi_val.replace('InChI=1S/', '')
    formula = serialize_formula(formula)
    # Eight fixed columns followed by a variable number of name columns.
    s = '%d\t%s\t%s\t%g\t%s\t%s\t%s\t%s\t' % (cid, CAS, formula, mw, smi,
                                              inchi_val, inchikey, iupac_name)
    s += '\t'.join(actual_names)
    print(s)
    return None
# Build one output row in ``help`` from the first entry of ``bdb_help``,
# matching it against ``chembl_help`` on element [0] of each entry.
# NOTE(review): ``counter``, ``bdb_help`` and ``chembl_help`` are defined
# outside this fragment; presumably "bdb" is BindingDB - confirm.
# NOTE(review): ``help`` shadows the builtin of the same name.
help = []
help.append(counter)
# After the search below, chembl_counter is the index of the matching
# chembl_help entry (only meaningful when exister == 1).
chembl_counter = 0
exister = 0
# ``&`` on two parenthesised comparisons behaves like ``and`` here, except
# that both sides are always evaluated.
if (len(bdb_help) != 0) & (len(chembl_help) != 0):
    for j in chembl_help:
        if bdb_help[0][0] == j[0]:
            exister = 1
            break
        chembl_counter = chembl_counter + 1
# Case 1: the first bdb entry is also present in chembl
if (exister == 1) & (len(bdb_help) != 0):
    # Element [0] is used as an InChI string (it is passed to the InChI
    # helpers below); the meaning of element [1] is not visible here.
    help.append(bdb_help[0][0])
    # ``test`` is not read anywhere in the visible fragment.
    test = bdb_help[0][0]
    inchi_key = inchi.InchiToInchiKey(bdb_help[0][0])
    help.append(inchi_key)
    help.append(bdb_help[0][1])
    help.append(chembl_help[chembl_counter][1])
    mol = inchi.MolFromInchi(bdb_help[0][0], sanitize=False)
    smiles = chem.MolToSmiles(mol)
    help.append(smiles)
    # Consume the matched entries from both lists.
    chembl_help.remove(chembl_help[chembl_counter])
    bdb_help.remove(bdb_help[0])
# Case 2: not in chembl, but in bdb
elif (exister == 0) & (len(bdb_help) != 0):
    help.append(bdb_help[0][0])
    test = bdb_help[0][0]
    inchi_key = inchi.InchiToInchiKey(bdb_help[0][0])
    help.append(inchi_key)
def _dbize(ec, org, rxn, cof, all_smiles):
    """Write compound, reaction and per-organism EC documents to MongoDB.

    Uses the ``BrendaDB`` database of a default ``MongoClient()`` and its
    ``cpd_pages``, ``rxn_pages`` and ``ec_pages`` collections. All writes are
    upserts.

    Args:
        ec: EC number string; dots are replaced by underscores to build the
            ``"ec.<number>"`` field name.
        org: Mapping whose keys identify organisms and whose values are
            stored under ``"org"``.
        rxn: Mapping of reaction key to a sequence read as
            ``(organism keys, reactant names, product names, stoichiometry)``.
        cof: Iterable of entries read as ``(organism keys, cofactor name)``.
        all_smiles: Mapping of compound name to SMILES.
    """
    database = MongoClient().BrendaDB
    ec_collection = database.ec_pages
    rxn_collection = database.rxn_pages
    cpd_collection = database.cpd_pages

    def _store_compounds(compound_names):
        """Upsert one compound doc per known name and return their ids.

        Names missing from ``all_smiles`` contribute an empty string so the
        returned list stays aligned with ``compound_names``.
        """
        compound_ids = []
        for comp in compound_names:
            if comp not in all_smiles:
                compound_ids.append('')
                continue
            # The dump stores substrates/products by name; SMILES-derived
            # data is attached here.
            smiles = all_smiles[str(comp)]
            cpd_id = hashlib.sha1(smiles).hexdigest()
            inchi_text = pybel.readstring(
                'smi', smiles).write('inchi').strip('\t\n')
            inchikey = rdki.InchiToInchiKey(inchi_text)
            compound_ids.append(cpd_id)
            cpd_collection.update(
                {"_id": cpd_id},
                {
                    "$set": {
                        "smiles": smiles,
                        "inchi": inchi_text,
                        "inchikey": inchikey,
                        "name": comp
                    }
                },
                upsert=True)
        return compound_ids

    # Reaction key -> (organism keys, reaction document id).
    rxn_by_key = {}
    for rxn_key, rxn_data in rxn.iteritems():
        # Reactions lacking either side are skipped entirely.
        if len(rxn_data[1]) == 0 or len(rxn_data[2]) == 0:
            continue

        reactant_ids = _store_compounds(rxn_data[1])
        product_ids = _store_compounds(rxn_data[2])

        # The reaction doc carries names and compound ids of both sides plus
        # a stoichiometry vector. Its id is a hash of the dict's string
        # form, and is also recorded for the EC pages below.
        r_entry = {
            "r_name": rxn_data[1],
            "p_name": rxn_data[2],
            "r_smiles": reactant_ids,
            "p_smiles": product_ids,
            "s": rxn_data[3]
        }
        rxn_id = hashlib.sha1(str(r_entry)).hexdigest()
        rxn_collection.update(
            {"_id": rxn_id}, {"$set": {"rxn": r_entry}}, upsert=True)
        rxn_by_key[rxn_key] = (rxn_data[0], rxn_id)

    # One doc per organism: for this EC number it lists the reactions found
    # in that organism together with the cofactors.
    for org_key, org_value in org.iteritems():
        rxns_in = [
            entry[1] for entry in rxn_by_key.values() if org_key in entry[0]
        ]

        cofs_in = []
        for cof_entry in cof:
            if org_key in cof_entry[0]:
                cof_name = cof_entry[1]
                if cof_name in all_smiles:
                    link = hashlib.sha1(
                        all_smiles[str(cof_name)]).hexdigest()
                else:
                    link = ''
                cofs_in.append({"name": cof_name, "link": link})

        ec_collection.update(
            {"org": org_value},
            {
                "$set": {
                    "ec." + ec.replace('.', '_'): {
                        "rxns": rxns_in,
                        "cofactors": cofs_in
                    }
                }
            },
            upsert=True)
# ## Merge the Samples and Drugs data

# In[7]:

# Keep only perturbation names present in both tables.
combined_df = drug_df.merge(sample_df, how="inner", on="pert_iname")
combined_df = combined_df.reset_index(drop=True)

# Move broad_id to first column
col_order = combined_df.columns.tolist()
col_order.remove("broad_id")
col_order.insert(0, "broad_id")

# InChIKey14: values that are full InChI strings are converted to an InChIKey
# first; every value is then cut to its first 14 characters.
combined_df = combined_df.loc[:, col_order].assign(
    InChIKey14=combined_df.InChIKey.apply(
        lambda key: str(
            inchi.InchiToInchiKey(key) if key.startswith("InChI") else key
        )[:14]
    )
)

# Output to file
output_file = "repurposing_info"
combined_df.to_csv(f"{output_file}.tsv", index=False, sep="\t")

print(combined_df.shape)
combined_df.head()

# ## Create a "Long" version where we split MOA and Target delimiters
#
# Certain compounds have multiple MOA classes and targets that are delimited by pipes (`|`).
# Each MOA class and target can be considered to have equal support (see https://github.com/broadinstitute/lincs-cell-painting/issues/5).