Пример #1
0
def get_pkis_pki(dataframe=True):
    """Load pKi records from the published PKIS tab-separated file.

    The first three tab-separated columns of each row are read as a ChEMBL
    compound ID, a target gene name and a pKi value. The compound ID is
    translated through the map returned by ``load_chembl2ikey`` and the gene
    through the first map returned by ``get_protein_idmap``; rows for which
    either lookup fails are skipped.

    Args:
        dataframe: When true, the collected records are passed through
            ``pandas_df_continuous`` before being returned.

    Returns:
        A list of ``(ikey, uniprot, 'pKi', '=', pki)`` tuples, or whatever
        ``pandas_df_continuous`` returns for that list when ``dataframe`` is
        true.

    Raises:
        IndexError: If a row has fewer than three tab-separated columns.
        ValueError: If the third column of a row cannot be parsed as a float.
    """
    pkis = '../../kinome_assay/other_published/pkis_pki.tsv'
    # Only the gene -> UniProt map is used; the other three maps are ignored.
    gene2uniprot, _, _, _ = get_protein_idmap()
    chembl2ikey = load_chembl2ikey()
    data = []
    with open(pkis, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            chembl = fields[0]
            gene = fields[1]
            pki = float(fields[2])
            # NOTE(review): assumes both maps are dict-like and raise KeyError
            # on a missing key -- confirm against the helper definitions.
            try:
                ikey = chembl2ikey[chembl]
            except KeyError:
                # There may be outdated ChEMBL molecules that were removed
                # from the ChEMBL DB.
                continue
            try:
                uni = gene2uniprot[gene]
            except KeyError:
                # Target gene is not mapped to any protein.
                continue
            data.append((ikey, uni, 'pKi', '=', pki))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
Пример #2
0
def get_plos_pki(dataframe=True):
    """Load pKi records from the published PLOS tab-separated file.

    The first three tab-separated columns of each row are read as an
    InChIKey, a target gene name and a pKi value. A gene name of the form
    ``<gene>-<something ending in 'ted'>`` (matched case-insensitively) is
    split into the gene and a modification suffix; the suffix is re-attached
    to the mapped UniProt ID as ``<uniprot>-<suffix>``. Rows whose gene is not
    found in the first map returned by ``get_protein_idmap`` are skipped.

    Args:
        dataframe: When true, the collected records are passed through
            ``pandas_df_continuous`` before being returned.

    Returns:
        A list of ``(ikey, uniprot, 'pKi', '=', pki)`` tuples, or whatever
        ``pandas_df_continuous`` returns for that list when ``dataframe`` is
        true.

    Raises:
        IndexError: If a row has fewer than three tab-separated columns.
        ValueError: If the third column of a row cannot be parsed as a float.
    """
    pkis = '../../kinome_assay/other_published/plos_pki.tsv'
    # Only the gene -> UniProt map is used; the other three maps are ignored.
    gene2uniprot, _, _, _ = get_protein_idmap()
    # Compiled once instead of on every row.
    mod_pattern = re.compile(r'(.*)-(.*ted)$', re.M | re.I)
    data = []
    with open(pkis, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            ikey = fields[0]
            gene = fields[1]
            mod = None
            modobj = mod_pattern.match(gene)
            if modobj:
                gene = modobj.group(1)
                mod = modobj.group(2)
            pki = float(fields[2])
            # NOTE(review): assumes the map is dict-like and raises KeyError
            # on a missing key -- confirm against get_protein_idmap.
            try:
                uni = gene2uniprot[gene]
            except KeyError:
                # Target gene is not mapped to any protein.
                continue
            if mod is not None:
                uni = uni + '-' + mod
            data.append((ikey, uni, 'pKi', '=', pki))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
Пример #3
0
def get_jcim_pki(dataframe=True):
    """Load pKi records from the JCIM tab-separated activity file.

    The first line of the file is skipped as a header. For every other row,
    column 1 is the target gene (spaces are removed before lookup), column 2
    is the compound name and column 3 is the pKi value. Rows whose gene is not
    found in the first map returned by ``get_protein_idmap`` are skipped.
    Rows whose compound name is not found in the map returned by
    ``load_chemname2ikey`` are skipped as well, and the compound name is
    appended to ``./extra_chemicals_jcim.txt``.

    Args:
        dataframe: When true, the collected records are passed through
            ``pandas_df_continuous`` before being returned.

    Returns:
        A list of ``(ikey, uniprot, 'pKi', '=', pki)`` tuples, or whatever
        ``pandas_df_continuous`` returns for that list when ``dataframe`` is
        true.

    Raises:
        IndexError: If a data row has fewer than four tab-separated columns.
        ValueError: If the fourth column of a row cannot be parsed as a float.
    """
    jcim = '../../kinome_assay/other_published/JCIM_activity_pKi.tsv'
    # Only the gene -> UniProt map is used; the other three maps are ignored.
    gene2uniprot, _, _, _ = get_protein_idmap()
    chemname2ikey = load_chemname2ikey()
    data = []
    with open(jcim, 'r') as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split('\t')
            # Gene names in the JCIM file may contain white spaces.
            gene = fields[1].replace(' ', '')
            comp = fields[2]  # compound name
            pki = float(fields[3])
            # NOTE(review): assumes both maps are dict-like and raise KeyError
            # on a missing key -- confirm against the helper definitions.
            try:
                uni = gene2uniprot[gene]
            except KeyError:
                continue
            try:
                ikey = chemname2ikey[comp]
            except KeyError:
                # Record the unmapped compound name for later inspection.
                with open('./extra_chemicals_jcim.txt', 'a') as out:
                    out.write("{}\n".format(comp))
                continue
            data.append((ikey, uni, 'pKi', '=', pki))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
Пример #4
0
def get_bindingdb_by_assay_type(assay_type='pKd', dataframe=True):
    """Load BindingDB activity records for one assay type.

    Reads ``BindingDB_<type>.tsv`` from ``../../BindingDB/``. Every row
    supplies, in its first four tab-separated columns, an InChIKey, a UniProt
    ID, a relation and a numeric activity value.

    Args:
        assay_type: One of ``'pIC50'``/``'pic50'``, ``'pKd'``/``'pkd'`` or
            ``'pKi'``/``'pki'``. Any other value prints an error message and
            terminates via ``sys.exit`` with a non-zero status.
        dataframe: When true, the collected records are passed through
            ``pandas_df_continuous`` before being returned.

    Returns:
        A list of ``(ikey, uniprot, assay_type, relation, value)`` tuples in
        which the assay type is spelled canonically (``'pIC50'``, ``'pKd'`` or
        ``'pKi'``), or whatever ``pandas_df_continuous`` returns for that list
        when ``dataframe`` is true.

    Raises:
        SystemExit: If ``assay_type`` is not one of the accepted spellings.
        IndexError: If a row has fewer than four tab-separated columns.
        ValueError: If the fourth column of a row cannot be parsed as a float.
    """
    bindingdb_path = '../../BindingDB/'
    if assay_type in ('pIC50', 'pic50'):
        atype = 'pIC50'
    elif assay_type in ('pKd', 'pkd'):
        atype = 'pKd'
    elif assay_type in ('pKi', 'pki'):
        atype = 'pKi'
    else:
        print("Error in parsing BindingDB data. Choose a proper assay type (pIC50, pKd, or pKi)")
        # Exit with a failure status; a bare sys.exit() would report success.
        sys.exit(1)
    # The file name embeds the canonical assay type.
    infile = bindingdb_path + 'BindingDB_' + atype + '.tsv'
    data = []
    with open(infile, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            ikey = fields[0]
            uni = fields[1]
            rel = fields[2]
            val = float(fields[3])
            data.append((ikey, uni, atype, rel, val))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
Пример #5
0
def get_chembl_cyp450_by_assay_type(assay_type='pKd', dataframe=True):
    """Load ChEMBL CYP450 activity records for one assay type.

    Reads ``CYP450_<type>.tsv`` from ``../../CYP450/ChEMBL23/``. The first
    line is skipped as a header. Every other row supplies, in its first four
    tab-separated columns, an InChIKey, a UniProt ID, a relation and a numeric
    activity value.

    Args:
        assay_type: One of ``'pIC50'``/``'pic50'``, ``'pKd'``/``'pkd'`` or
            ``'pKi'``/``'pki'``. Any other value prints an error message and
            terminates via ``sys.exit`` with a non-zero status.
        dataframe: When true, the collected records are passed through
            ``pandas_df_continuous`` before being returned.

    Returns:
        A list of ``(ikey, uniprot, assay_type, relation, value)`` tuples in
        which the assay type is spelled canonically (``'pIC50'``, ``'pKd'`` or
        ``'pKi'``), or whatever ``pandas_df_continuous`` returns for that list
        when ``dataframe`` is true.

    Raises:
        SystemExit: If ``assay_type`` is not one of the accepted spellings.
        IndexError: If a data row has fewer than four tab-separated columns.
        ValueError: If the fourth column of a row cannot be parsed as a float.
    """
    fpath = '../../CYP450/ChEMBL23/'
    if assay_type in ('pIC50', 'pic50'):
        atype = 'pIC50'
    elif assay_type in ('pKd', 'pkd'):
        atype = 'pKd'
    elif assay_type in ('pKi', 'pki'):
        atype = 'pKi'
    else:
        print("Error in parsing ChEMBL CYP450 data. Choose a proper assay type (pIC50, pKd, or pKi)")
        # Exit with a failure status; a bare sys.exit() would report success.
        sys.exit(1)
    # The file name embeds the canonical assay type.
    infile = fpath + 'CYP450_' + atype + '.tsv'
    data = []
    with open(infile, 'r') as f:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            fields = line.strip().split('\t')
            ikey = fields[0]
            uni = fields[1]
            rel = fields[2]
            val = float(fields[3])
            data.append((ikey, uni, atype, rel, val))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
Пример #6
0
def get_kinomescan(assay_type='pKd', dataframe=True):
    """Load LINCS KINOMEscan activities and convert them to a -log10 scale.

    Only the file with numeric activity values is read. Each row supplies a
    compound name, a target gene and an activity value in nM in its first
    three tab-separated columns. Values are converted with
    ``-log10(value) + 9``. Rows with a non-positive, infinite or NaN value are
    skipped, as are rows whose gene is not found in the first map returned by
    ``get_protein_idmap``. A gene name of the form
    ``<gene>-<something ending in 'ted'>`` (matched case-insensitively) is
    split, and the suffix is re-attached to the mapped UniProt ID as
    ``<uniprot>-<suffix>``.

    Args:
        assay_type: ``'pKd'``/``'pkd'`` selects the Kd file and is reported as
            ``'pKd'``; ``'pi'``/``'pI'`` selects the percent-inhibition file
            and is reported as ``'pPI'``. Any other value prints a message and
            terminates via ``sys.exit`` with a non-zero status.
        dataframe: When true, the records are passed through
            ``pandas_df_continuous`` before being returned. According to the
            original comments this yields a pandas dataframe with an integer
            index from 0 and columns ``['InChIKey', 'UniProt',
            'Activity_type', 'Relation', 'Activity_value']``.

    Returns:
        A list of ``(ikey, uniprot, assay_type, '=', value)`` tuples, or
        whatever ``pandas_df_continuous`` returns for that list when
        ``dataframe`` is true.

    Raises:
        SystemExit: If ``assay_type`` is not one of the accepted spellings.
        KeyError: If a compound name with a usable value is missing from the
            map returned by ``load_chemname2ikey``.
        IndexError: If a row has fewer than three tab-separated columns.
        ValueError: If the third column of a row cannot be parsed as a float.
    """
    kinomepath = '../../kinome_assay/LINCS/'
    # Validate the request before loading any ID maps so that a bad argument
    # fails immediately.
    if assay_type in ('pKd', 'pkd'):
        assay_type = 'pKd'
        # Assays with a numeric activity value in Kd.
        activity = kinomepath + 'LINCS_kinomescan_kd_nM.tsv'
    elif assay_type in ('pi', 'pI'):
        # Percent Inhibition Standardized =
        #   compound_concentration_nM*(100-%activity)/%activity
        assay_type = 'pPI'
        # Assays with a numeric activity value in PI.
        activity = kinomepath + 'LINCS_kinomescan_pi_nM.tsv'
    else:
        print("Choose activity. pKd or pI.")
        # Exit with a failure status; a bare sys.exit() would report success.
        sys.exit(1)

    # Only the gene -> UniProt map is used; the other three maps are ignored.
    gene2uniprot, _, _, _ = get_protein_idmap()
    chemname2ikey = load_chemname2ikey()
    # Compiled once instead of on every row.
    mod_pattern = re.compile(r'(.*)-(.*ted)$', re.M | re.I)

    data = []
    with open(activity, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            drug = fields[0]
            gene = fields[1]
            mod = None
            modobj = mod_pattern.match(gene)
            if modobj:
                gene = modobj.group(1)
                mod = modobj.group(2)
            val = float(fields[2])
            if val <= 0:  # nonsense data, the activity must be positive
                continue
            val = -np.log10(val) + 9.0  # all activities are in nM
            if np.isinf(val) or np.isnan(val):
                continue  # skip inf or NaN
            # NOTE(review): unlike the gene lookup below, an unmapped compound
            # name is not skipped here and raises KeyError -- confirm that
            # this is intended.
            ikey = chemname2ikey[drug]
            try:
                uni = gene2uniprot[gene]
            except KeyError:
                # Target gene is not mapped to any protein.
                continue
            if mod is not None:
                uni = uni + '-' + mod
            data.append((ikey, uni, assay_type, '=', val))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
Пример #7
0
def get_gpcrdb(assay_type='pKd', dataframe=True):
    """Load GPCRdb assay records of one activity type on a -log10 scale.

    Uses ``../../GPCRdb/GPCR_assays.csv.gz`` (via ``decompress_gzip``) when
    that file exists, otherwise ``../../GPCRdb/GPCR_assays.csv``. The first
    line is skipped as a header. Rows are kept only when ``'p' + <activity
    type column>`` equals ``assay_type`` case-insensitively. Values are
    converted with ``-log10(value) + 9``, and a relation other than ``'='``
    has its ``<`` / ``>`` direction flipped to match the negated scale.

    A row is skipped when its compound or target ChEMBL ID is missing from the
    respective lookup map, when its value is not a number, is non-positive, or
    converts to inf/NaN, or when its relation is not ``'='`` and contains
    neither or both of ``<`` and ``>``.

    Args:
        assay_type: Activity type to keep, e.g. ``'pKd'`` or ``'pIC50'``.
        dataframe: When true, the records are passed through
            ``pandas_df_continuous`` before being returned. According to the
            original comments this yields a pandas dataframe with an integer
            index from 0 and columns ``['InChIKey', 'UniProt',
            'Activity_type', 'Relation', 'Activity_value']``.

    Returns:
        A list of ``(ikey, uniprot, activity_type, relation, value)`` tuples,
        or whatever ``pandas_df_continuous`` returns for that list when
        ``dataframe`` is true.

    Raises:
        IndexError: If a row has too few columns for the fields being read.
    """
    fpath = '../../GPCRdb/'
    gpcrfile = fpath + 'GPCR_assays.csv.gz'
    if Path(gpcrfile).is_file():  # decompress if compressed
        gpcrfile = decompress_gzip(gpcrfile)
    else:
        gpcrfile = fpath + 'GPCR_assays.csv'  # already decompressed

    chembl2uniprot = load_chembl2uniprot()
    chembl2ikey = load_chembl2ikey()
    wanted = assay_type.lower()

    data = []
    # newline='' is what the csv module documentation recommends for files
    # handed to csv.reader.
    with open(gpcrfile, 'r', newline='') as f:
        # Per the original comments:
        #   activity types : ['AC50', 'Potency', 'IC50', 'EC50', 'Kd', 'Ki']
        #   activity units : ['nM']
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        reader = csv.reader(f,
                            quotechar='"',
                            delimiter=',',
                            quoting=csv.QUOTE_ALL,
                            skipinitialspace=True)
        for row in reader:
            atype = 'p' + row[26]  # e.g. IC50 -> pIC50
            if atype.lower() != wanted:
                continue
            c_chembl = row[14]  # ChEMBL ID of the chemical molecule
            rel = row[25]
            raw_val = row[28]
            # NOTE(review): presumably the target's ChEMBL ID, since it is
            # looked up in chembl2uniprot -- confirm against the CSV header.
            p_chembl = row[29]
            # NOTE(review): assumes both maps are dict-like and raise KeyError
            # on a missing key -- confirm against the helper definitions.
            try:
                ikey = chembl2ikey[c_chembl]
                uni = chembl2uniprot[p_chembl]
            except KeyError:
                continue
            # The builtin float replaces np.float, which NumPy 1.24 removed;
            # the resulting AttributeError used to be swallowed by a bare
            # except, silently dropping every row.
            try:
                val = float(raw_val)
            except ValueError:
                continue
            if val <= 0:
                # log10 is undefined here; the original discarded these rows
                # through the inf/NaN check below.
                continue
            val = -np.log10(val) + 9.0  # all activities are in nM
            if np.isinf(val) or np.isnan(val):
                continue  # skip inf or NaN
            if rel != '=':
                # The -log conversion reverses ordering, so flip the sign.
                has_lt = '<' in rel
                has_gt = '>' in rel
                if has_lt == has_gt:
                    # Either both signs or neither: not a usable relation.
                    continue
                if has_lt:
                    rel = rel.replace('<', '>')
                else:
                    rel = rel.replace('>', '<')
            data.append((ikey, uni, atype, rel, val))

    if dataframe:
        data = pandas_df_continuous(data)
    return data