예제 #1
0
 def compute_one_inchi(mol):
     """Derive an MD5 hash, InChIKey and InChI string from a SMILES string.

     Args:
         mol: SMILES text handed to ``Chem.MolFromSmiles``.

     Returns:
         A ``(md5_hex, inchikey, inchi)`` tuple, where ``md5_hex`` is the MD5
         hex digest of the UTF-8 encoded InChI string. ``(None, None, None)``
         is returned when the SMILES cannot be parsed or when no InChI string
         is produced for the molecule.
     """
     molfrom = Chem.MolFromSmiles(mol)
     if molfrom is None:
         return (None, None, None)
     ini = inchi.MolToInchi(molfrom)
     if not ini:
         # Without this guard every molecule whose InChI generation failed
         # would share the MD5 of the empty string as its identifier.
         return (None, None, None)
     inikey = inchi.InchiToInchiKey(ini)
     hsh = hashlib.md5(ini.encode('utf-8')).hexdigest()
     return (hsh, inikey, ini)
예제 #2
0
def id_cleanup(df, version):
    """Normalise identifier columns and tag them with a version suffix.

    Rows without an ``InChIKey`` are dropped. Values in ``InChIKey`` that
    start with ``"InChI"`` are converted with ``inchi.InchiToInchiKey``; every
    value is then stringified and cut to its first 14 characters.
    ``broad_id`` and ``deprecated_broad_id`` are stringified and cut to 13
    characters. Duplicate rows (judged on the four identifier columns) are
    removed and the columns are renamed.

    Args:
        df: DataFrame holding ``InChIKey``, ``pert_iname``, ``broad_id`` and
            ``deprecated_broad_id`` columns.
        version: Suffix appended to the renamed ``pert_iname``, ``broad_id``
            and ``deprecated_broad_id`` columns.

    Returns:
        A new DataFrame with a fresh index, where ``InChIKey`` has become
        ``InChIKey14`` and the other three columns carry ``_{version}``.
    """

    def _key14(value):
        # Full InChI strings are converted to a key first; keys pass through.
        key = inchi.InchiToInchiKey(value) if value.startswith("InChI") else value
        return str(key)[:14]

    def _first13(value):
        return str(value)[:13]

    cleaned = df.dropna(subset=["InChIKey"]).reset_index(drop=True)

    key14 = cleaned.InChIKey.apply(_key14)
    cleaned["InChIKey"] = key14

    broad13 = cleaned.broad_id.apply(_first13)
    cleaned["broad_id"] = broad13

    deprecated13 = cleaned.deprecated_broad_id.apply(_first13)
    cleaned["deprecated_broad_id"] = deprecated13

    identifier_columns = [
        "InChIKey",
        "pert_iname",
        "broad_id",
        "deprecated_broad_id",
    ]
    new_names = {
        "pert_iname": f"pert_iname_{version}",
        "broad_id": f"broad_id_{version}",
        "deprecated_broad_id": f"deprecated_broad_id_{version}",
        "InChIKey": "InChIKey14",
    }

    unique_rows = cleaned.drop_duplicates(identifier_columns)
    renamed = unique_rows.rename(columns=new_names)
    return renamed.reset_index(drop=True)
예제 #3
0
def parse_f(f):
    """Assemble and print one tab-separated identifier record for a mol file.

    The CAS key is taken from the file name: the second ``/``-separated
    component when the path contains a slash, truncated at the first ``.``.
    Structure data comes from the mol file when it can be read and
    round-tripped through InChI; otherwise (or when ``syn_data`` specifies a
    ``pubchem`` or ``formula`` entry for the CAS) it falls back to PubChem or
    to the user-supplied values in ``syn_data``. Names are merged from
    PubChem, ``pdf_data`` and ``syn_data``.

    The function reads the module-level ``ignored_CASs``, ``syn_data``,
    ``pdf_data``, ``dup_names``, ``all_names`` and ``all_user_names`` and
    reads/writes the ``mycache`` lookup cache.

    Args:
        f: Path of the mol file; its name encodes the CAS number.

    Returns:
        Always ``None``. The record is written to stdout with ``print``;
        nothing is printed for ignored CAS numbers or when neither the mol
        file nor custom data yields a structure.
    """
    names = ['']
    cid = -1
    CAS = f.split('/')[1] if '/' in f else f
    CAS = CAS.split('.')[0]
    if CAS in ignored_CASs:
        return None
    failed_mol = False
    try:
        if CAS in syn_data:
            d = syn_data[CAS]
            # User-supplied data takes priority over the mol file; raising
            # here routes control to the fallback handler below.
            if 'pubchem' in d:
                raise Exception(
                    'Pubchem specified, not trying to use the mol file')
            elif 'formula' in d:
                raise Exception(
                    'Formula specified, not trying to use the mol file')
        try:
            mol = Chem.MolFromMolFile(f)
        except Exception:
            mol = None
        if mol is None:
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            print('Cannot read %s' % f)
            raise ValueError('Cannot read %s' % f)
        try:
            inchi_val = inchi.MolToInchi(mol)
        except Exception:
            print('BAILING ON %s' % f)
            raise
        mol = inchi.MolFromInchi(inchi_val)  # Works better for ions
        if mol is None:
            print('BAILING ON reconversion to mol %s' % f)
            raise ValueError('BAILING ON reconversion to mol %s' % f)
    except Exception:
        # Any failure above means the mol file is not used; fall back to the
        # user-provided data for this CAS number.
        failed_mol = True
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                if str(d['pubchem']) in mycache:
                    cid, iupac_name, names, mw, smi, inchi_val, inchikey, formula = mycache[
                        str(d['pubchem'])]
                else:
                    pc = Compound.from_cid(d['pubchem'])
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mw = pc.molecular_weight
                    smi = pc.canonical_smiles
                    inchi_val = pc.inchi
                    inchikey = pc.inchikey
                    formula = pc.molecular_formula

                    mycache[str(d['pubchem'])] = (cid, iupac_name, names, mw,
                                                  smi, inchi_val, inchikey,
                                                  formula)
            else:
                cid = -1
                names = d['synonyms'] if 'synonyms' in d else ['']
                mw = float(d['MW'])
                smi = d['smiles'] if 'smiles' in d else ''
                formula = d['formula'] if 'formula' in d else ''
                inchi_val = d['inchi'] if 'inchi' in d else ''
                inchikey = d['inchikey'] if 'inchikey' in d else ''
                iupac_name = ''
        else:
            print('FAILED on %s and no custom data was available either' % CAS)
            return None

    if not failed_mol:
        smi = Chem.MolToSmiles(mol, True)
        inchi_val = inchi.MolToInchi(mol)
        inchikey = inchi.InchiToInchiKey(inchi_val)
        mw = Descriptors.MolWt(mol)
        # An isotope-aware variant based on Descriptors.ExactMolWt was
        # sketched here previously but is disabled.

        formula = CalcMolFormula(mol, True, True)
        iupac_name = ''
    try:
        if not failed_mol:
            if str(inchikey) in mycache:
                cid, iupac_name, names = mycache[str(inchikey)]
            else:
                try:
                    pc = get_compounds(inchikey, 'inchikey')[0]
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mycache[str(inchikey)] = (cid, iupac_name, names)
                except Exception:
                    # Remember the miss so the lookup is not repeated.
                    mycache[str(inchikey)] = (-1, '', [''])
    except Exception:
        cid = -1
        iupac_name = ''
        names = ['']

    other_CAS = []
    if CAS in pdf_data:
        d = pdf_data[CAS]
        name = d['Name']
        # Work on a copy: inserting into the stored list would modify
        # pdf_data itself and accumulate entries across calls.
        if 'Other Names' in d:
            syns = list(d['Other Names'])
        else:
            syns = []
        if not iupac_name:
            iupac_name = name
        else:
            syns.insert(0, name)
        if 'Deleted CAS' in d:
            other_CAS.extend(d['Deleted CAS'])
        if 'Alternate CAS' in d:
            other_CAS.extend(d['Alternate CAS'])

        syns = [i for i in syns if i not in dup_names]
        names = syns + [i for i in names if i not in all_names] + other_CAS
    actual_names = []
    for name in names:
        if name in all_user_names:
            # If the name is in the user db, only add it if it corresponds to this CAS number
            if CAS in syn_data and 'synonyms' in syn_data[
                    CAS] and name in syn_data[CAS]['synonyms']:
                actual_names.append(name)
            # Otherwise the name is discarded.
        else:
            # If the name is not in the user db we're all good
            actual_names.append(name)
    if CAS in syn_data and 'synonyms' in syn_data[CAS]:
        # If the user has any syns for this cas number, add those names if the name hasn't already been added
        for n in syn_data[CAS]['synonyms']:
            if n not in actual_names:
                actual_names.append(n)

    # Drop empty placeholders such as the default [''].
    actual_names = [i for i in actual_names if i]

    if inchi_val is not None:
        inchi_val = inchi_val.replace('InChI=1S/', '')

    formula = serialize_formula(formula)
    s = '%d\t%s\t%s\t%g\t%s\t%s\t%s\t%s\t' % (cid, CAS, formula, mw, smi,
                                              inchi_val, inchikey, iupac_name)

    s += '\t'.join(actual_names)
    print(s)
    return None
예제 #4
0
    help = []
    help.append(counter)
    chembl_counter = 0
    exister = 0
    if (len(bdb_help) != 0) & (len(chembl_help) != 0):
        for j in chembl_help:
            if bdb_help[0][0] == j[0]:
                exister = 1
                break
            chembl_counter = chembl_counter + 1

    #if in chembl
    if (exister == 1) & (len(bdb_help) != 0):
        help.append(bdb_help[0][0])
        test = bdb_help[0][0]
        inchi_key = inchi.InchiToInchiKey(bdb_help[0][0])
        help.append(inchi_key)
        help.append(bdb_help[0][1])
        help.append(chembl_help[chembl_counter][1])
        mol = inchi.MolFromInchi(bdb_help[0][0], sanitize=False)
        smiles = chem.MolToSmiles(mol)
        help.append(smiles)
        chembl_help.remove(chembl_help[chembl_counter])
        bdb_help.remove(bdb_help[0])

    #not in chembl, but in bdb
    elif (exister == 0) & (len(bdb_help) != 0):
        help.append(bdb_help[0][0])
        test = bdb_help[0][0]
        inchi_key = inchi.InchiToInchiKey(bdb_help[0][0])
        help.append(inchi_key)
예제 #5
0
파일: loader.py 프로젝트: tyo-nu/SimAL
def _dbize(ec, org, rxn, cof, all_smiles):
    """Write compound, reaction and per-organism EC documents to MongoDB.

    Args:
        ec: EC number string; dots are replaced by underscores to build the
            ``ec.<number>`` field name of the organism document.
        org: Mapping whose values are used as the ``org`` query value and
            whose keys are matched against reaction and cofactor entries.
        rxn: Mapping of reaction key to a sequence read as ``[0]`` organism
            container, ``[1]`` reactant names, ``[2]`` product names and
            ``[3]`` the value stored under ``"s"``.
        cof: Iterable of entries read as ``[0]`` organism container and
            ``[1]`` cofactor name.
        all_smiles: Mapping from compound name to SMILES string.
    """

    # Connect to MongoDB and pick up the three target collections.
    client = MongoClient()
    db = client.BrendaDB
    ec_collection = db.ec_pages
    rxn_collection = db.rxn_pages
    cpd_collection = db.cpd_pages

    def _store_compounds(compound_names, hashes):
        # The dump lists substrates/products by name; look up the SMILES,
        # upsert a compound document and collect the SHA-1 ids in `hashes`.
        # Names without a known SMILES contribute an empty string.
        for compound in compound_names:
            if compound not in all_smiles:
                hashes.append('')
                continue
            smiles = all_smiles[str(compound)]
            compound_id = hashlib.sha1(smiles).hexdigest()
            inchi_text = pybel.readstring('smi', smiles)
            inchi_text = inchi_text.write('inchi').strip('\t\n')
            inchikey = rdki.InchiToInchiKey(inchi_text)
            hashes.append(compound_id)
            compound_fields = {
                "smiles": smiles,
                "inchi": inchi_text,
                "inchikey": inchikey,
                "name": compound
            }
            cpd_collection.update({"_id": compound_id},
                                  {"$set": compound_fields},
                                  upsert=True)

    # Reaction key -> (organism container, reaction document id)
    r_o_dict = {}

    for rxn_key, rxn_info in rxn.iteritems():
        product_ids = []
        reactant_ids = []
        # Reactions lacking reactants or products are skipped entirely.
        if not (len(rxn_info[1]) > 0 and len(rxn_info[2]) > 0):
            continue

        _store_compounds(rxn_info[1], reactant_ids)
        _store_compounds(rxn_info[2], product_ids)

        # The reaction document holds the names and compound ids of both
        # sides plus the stoichiometry entry. Its id is a hash of the
        # stringified document and is kept for the EC pages below. Upsert
        # updates a matching document or creates a new one.
        r_entry = {
            "r_name": rxn_info[1],
            "p_name": rxn_info[2],
            "r_smiles": reactant_ids,
            "p_smiles": product_ids,
            "s": rxn_info[3]
        }
        rxn_id = hashlib.sha1(str(r_entry)).hexdigest()
        rxn_collection.update({"_id": rxn_id}, {"$set": {"rxn": r_entry}},
                              upsert=True)
        r_o_dict[rxn_key] = (rxn_info[0], rxn_id)

    # One EC entry per organism: the ids of the reactions that mention the
    # organism key, together with the matching cofactors.
    for org_key, org_value in org.iteritems():
        rxns_in = [
            pair[1] for pair in r_o_dict.values() if org_key in pair[0]
        ]
        cofs_in = [{
            "name": entry[1],
            "link": ''
        } for entry in cof if org_key in entry[0]]
        for cofactor in cofs_in:
            cof_name = cofactor["name"]
            cofactor["link"] = (hashlib.sha1(
                all_smiles[str(cof_name)]).hexdigest()
                                if cof_name in all_smiles else '')
        ec_payload = {"rxns": rxns_in, "cofactors": cofs_in}
        ec_collection.update(
            {"org": org_value},
            {"$set": {
                "ec." + ec.replace('.', '_'): ec_payload
            }},
            upsert=True)
# ## Merge the Samples and Drugs data

# In[7]:


# Inner-join the drug and sample tables on the shared perturbation name
combined_df = drug_df.merge(sample_df, how="inner", on="pert_iname")
combined_df = combined_df.reset_index(drop=True)

# Put broad_id first, leaving the remaining columns in their current order
col_order = combined_df.columns.tolist()
col_order.remove("broad_id")
col_order.insert(0, "broad_id")

# Add a 14-character key: values starting with "InChI" are converted to an
# InChIKey first, everything else is stringified and truncated as-is
combined_df = combined_df.loc[:, col_order].assign(
    InChIKey14=combined_df.InChIKey.apply(
        lambda x: str(
            inchi.InchiToInchiKey(x) if x.startswith("InChI") else x
        )[:14]
    )
)

# Write the merged table as a tab-separated file
output_file = "repurposing_info"
combined_df.to_csv(f"{output_file}.tsv", index=False, sep="\t")

print(combined_df.shape)
combined_df.head()


# ## Create a "Long" version where we split MOA and Target delimiters
# 
# Certain compounds have multiple MOA classes and targets that are delimited by pipes (`|`).
# Each MOA class and target can be considered to have equal support (see https://github.com/broadinstitute/lincs-cell-painting/issues/5).