def get_pkis_pki(dataframe=True):
    """Load the PKIS pKi table and map its identifiers.

    Each row of ``pkis_pki.tsv`` is read as tab-separated
    ``<ChEMBL id> <gene> <pKi>``.  The ChEMBL id is translated through
    ``load_chembl2ikey()`` and the gene through the first mapping returned
    by ``get_protein_idmap()``; rows whose id or gene is missing from the
    corresponding mapping are skipped.

    Args:
        dataframe: When true, the collected tuples are passed through
            ``pandas_df_continuous`` before being returned.

    Returns:
        A list of ``(ikey, uniprot, 'pKi', '=', pki)`` tuples, or the result
        of ``pandas_df_continuous`` on that list when ``dataframe`` is true.

    Raises:
        ValueError: If the third column of a row is not a valid float.
        IndexError: If a row has fewer than three tab-separated columns.
    """
    pkis = '../../kinome_assay/other_published/pkis_pki.tsv'
    # Only the gene -> UniProt mapping is needed; the other three maps are
    # unpacked so the 4-tuple shape of the helper's return is still checked.
    gene2uniprot, _uniprot2genes, _mutgene2mutuniprot, _mutgene2filename = (
        get_protein_idmap())
    chembl2ikey = load_chembl2ikey()
    data = []
    with open(pkis, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            chembl = fields[0]
            gene = fields[1]
            pki = float(fields[2])
            try:
                ikey = chembl2ikey[chembl]
            except KeyError:
                # There may be outdated ChEMBL molecules that were removed
                # from the ChEMBL DB.
                continue
            try:
                uni = gene2uniprot[gene]
            except KeyError:
                # Target gene not mapped to any protein.
                continue
            data.append((ikey, uni, 'pKi', '=', pki))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
def get_plos_pki(dataframe=True):
    """Load the PLOS pKi table and map gene names to protein identifiers.

    Each row of ``plos_pki.tsv`` is read as tab-separated
    ``<ikey> <gene> <pKi>``.  A gene of the form ``<name>-<suffix ending in
    "ted">`` (matched case-insensitively) is split: ``<name>`` is looked up
    in the gene mapping and the suffix is re-appended to the mapped
    identifier as ``<uniprot>-<suffix>``.  Rows whose gene is missing from
    the mapping are skipped.

    Args:
        dataframe: When true, the collected tuples are passed through
            ``pandas_df_continuous`` before being returned.

    Returns:
        A list of ``(ikey, uniprot, 'pKi', '=', pki)`` tuples, or the result
        of ``pandas_df_continuous`` on that list when ``dataframe`` is true.

    Raises:
        ValueError: If the third column of a row is not a valid float.
        IndexError: If a row has fewer than three tab-separated columns.
    """
    pkis = '../../kinome_assay/other_published/plos_pki.tsv'
    gene2uniprot, _uniprot2genes, _mutgene2mutuniprot, _mutgene2filename = (
        get_protein_idmap())
    # Compiled once: the pattern is invariant across rows.
    modified_gene = re.compile(r'(.*)-(.*ted)$', re.M | re.I)
    data = []
    with open(pkis, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            ikey = fields[0]
            gene = fields[1]
            modobj = modified_gene.match(gene)
            if modobj:
                gene = modobj.group(1)
                mod = modobj.group(2)
            else:
                mod = None
            pki = float(fields[2])
            try:
                uni = gene2uniprot[gene]
            except KeyError:
                # Target gene not mapped to any protein.
                continue
            if mod is not None:
                uni = uni + '-' + mod
            data.append((ikey, uni, 'pKi', '=', pki))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
def get_jcim_pki(dataframe=True):
    """Load the JCIM pKi table and map gene and compound names.

    The first line of ``JCIM_activity_pKi.tsv`` is skipped as a header.
    Every remaining row is read as tab-separated columns where column 1 is
    the gene, column 2 the compound name and column 3 the activity value
    (column 0 is not used).  Spaces are removed from the gene before it is
    looked up.  Rows with an unmapped gene are skipped silently; rows with
    an unmapped compound are skipped after the compound name is appended to
    ``./extra_chemicals_jcim.txt``.

    Args:
        dataframe: When true, the collected tuples are passed through
            ``pandas_df_continuous`` before being returned.

    Returns:
        A list of ``(ikey, uniprot, 'pKi', '=', pki)`` tuples, or the result
        of ``pandas_df_continuous`` on that list when ``dataframe`` is true.

    Raises:
        ValueError: If the fourth column of a row is not a valid float.
        IndexError: If a row has fewer than four tab-separated columns.
    """
    jcim = '../../kinome_assay/other_published/JCIM_activity_pKi.tsv'
    gene2uniprot, _uniprot2genes, _mutgene2mutuniprot, _mutgene2filename = (
        get_protein_idmap())
    chemname2ikey = load_chemname2ikey()
    data = []
    with open(jcim, 'r') as f:
        next(f, None)  # skip the header; tolerate a completely empty file
        for line in f:
            fields = line.strip().split('\t')
            # Genes in JCIM may contain white spaces.
            gene = str(fields[1]).replace(' ', '')
            comp = str(fields[2])  # compound name
            pki = float(fields[3])
            try:
                uni = gene2uniprot[gene]
            except KeyError:
                continue
            try:
                ikey = chemname2ikey[comp]
            except KeyError:
                # Record compounds that could not be mapped, then skip them.
                with open('./extra_chemicals_jcim.txt', 'a') as out:
                    out.write("{}\n".format(comp))
                continue
            data.append((ikey, uni, 'pKi', '=', pki))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
def get_bindingdb_by_assay_type(assay_type='pKd', dataframe=True,
                                bindingdb_path='../../BindingDB/'):
    """Load one BindingDB activity table selected by assay type.

    The file ``BindingDB_<type>.tsv`` inside ``bindingdb_path`` is read as
    tab-separated ``<ikey> <uniprot> <relation> <value>`` rows with no
    header line.

    Args:
        assay_type: ``'pIC50'``, ``'pKd'`` or ``'pKi'``; matched
            case-insensitively.
        dataframe: When true, the collected tuples are passed through
            ``pandas_df_continuous`` before being returned.
        bindingdb_path: Directory holding the BindingDB TSV files.

    Returns:
        A list of ``(ikey, uniprot, atype, relation, value)`` tuples where
        ``atype`` is the canonical spelling of the assay type, or the result
        of ``pandas_df_continuous`` on that list when ``dataframe`` is true.

    Raises:
        SystemExit: With status 1, after printing an error message, when
            ``assay_type`` is not one of the supported types.
        ValueError: If the fourth column of a row is not a valid float.
        IndexError: If a row has fewer than four tab-separated columns.
    """
    import os  # local import: only needed to build the input path

    # Lower-cased assay type -> (canonical label, file name).
    sources = {
        'pic50': ('pIC50', 'BindingDB_pIC50.tsv'),
        'pkd': ('pKd', 'BindingDB_pKd.tsv'),
        'pki': ('pKi', 'BindingDB_pKi.tsv'),
    }
    try:
        atype, filename = sources[str(assay_type).lower()]
    except KeyError:
        print("Error in parsing BindingDB data. Choose a proper assay type (pIC50, pKd, or pKi)")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)
    infile = os.path.join(bindingdb_path, filename)

    data = []
    with open(infile, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            ikey = str(fields[0])
            uni = str(fields[1])
            rel = fields[2]
            val = float(fields[3])
            data.append((ikey, uni, atype, rel, val))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
def get_chembl_cyp450_by_assay_type(assay_type='pKd', dataframe=True,
                                    fpath='../../CYP450/ChEMBL23/'):
    """Load one ChEMBL CYP450 activity table selected by assay type.

    The file ``CYP450_<type>.tsv`` inside ``fpath`` is read as tab-separated
    ``<ikey> <uniprot> <relation> <value>`` rows; its first line is skipped
    as a header.

    Args:
        assay_type: ``'pIC50'``, ``'pKd'`` or ``'pKi'``; matched
            case-insensitively.
        dataframe: When true, the collected tuples are passed through
            ``pandas_df_continuous`` before being returned.
        fpath: Directory holding the CYP450 TSV files.

    Returns:
        A list of ``(ikey, uniprot, atype, relation, value)`` tuples where
        ``atype`` is the canonical spelling of the assay type, or the result
        of ``pandas_df_continuous`` on that list when ``dataframe`` is true.

    Raises:
        SystemExit: With status 1, after printing an error message, when
            ``assay_type`` is not one of the supported types.
        ValueError: If the fourth column of a row is not a valid float.
        IndexError: If a row has fewer than four tab-separated columns.
    """
    import os  # local import: only needed to build the input path

    # Lower-cased assay type -> (canonical label, file name).
    sources = {
        'pic50': ('pIC50', 'CYP450_pIC50.tsv'),
        'pkd': ('pKd', 'CYP450_pKd.tsv'),
        'pki': ('pKi', 'CYP450_pKi.tsv'),
    }
    try:
        atype, filename = sources[str(assay_type).lower()]
    except KeyError:
        print("Error in parsing ChEMBL CYP450 data. Choose a proper assay type (pIC50, pKd, or pKi)")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)
    infile = os.path.join(fpath, filename)

    data = []
    with open(infile, 'r') as f:
        next(f, None)  # skip the header; tolerate a completely empty file
        for line in f:
            fields = line.strip().split('\t')
            ikey = str(fields[0])
            uni = str(fields[1])
            rel = fields[2]
            val = float(fields[3])
            data.append((ikey, uni, atype, rel, val))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
def get_kinomescan(assay_type='pKd', dataframe=True):
    """Load LINCS KINOMEscan activities and convert them to a -log10 scale.

    Rows of the selected file are read as tab-separated
    ``<compound name> <gene> <value in nM>``.  Each value ``v`` is converted
    with ``-log10(v) + 9``.  A gene of the form ``<name>-<suffix ending in
    "ted">`` (matched case-insensitively) is split: ``<name>`` is looked up
    and the suffix is re-appended to the mapped identifier.

    Rows are skipped when the value is not positive, when the converted
    value is infinite or NaN, or when the gene has no mapping.  Only the
    files with numeric values are read; the ``*_inactive_null.tsv`` files
    in the same directory are not used here.

    Args:
        assay_type: ``'pKd'`` selects ``LINCS_kinomescan_kd_nM.tsv``
            (label ``'pKd'``); ``'pI'`` selects
            ``LINCS_kinomescan_pi_nM.tsv`` (label ``'pPI'``).  Matched
            case-insensitively.
        dataframe: When true, the collected tuples are passed through
            ``pandas_df_continuous`` before being returned.  According to
            the original comments the frame has an integer index from 0 and
            columns ``['InChIKey','UniProt','Activity_type','Relation',
            'Activity_value']``.

    Returns:
        A list of ``(ikey, uniprot, label, '=', value)`` tuples, or the
        result of ``pandas_df_continuous`` on that list.

    Raises:
        SystemExit: With status 1, after printing a message, when
            ``assay_type`` is not supported.
        KeyError: If a compound name is missing from the compound mapping;
            unlike unmapped genes, unmapped compounds are not skipped.
        ValueError: If the third column of a row is not a valid float.
    """
    gene2uniprot, _uniprot2genes, _mutgene2mutuniprot, _mutgene2filename = (
        get_protein_idmap())
    chemname2ikey = load_chemname2ikey()
    kinomepath = '../../kinome_assay/LINCS/'

    requested = str(assay_type).lower()
    if requested == 'pkd':
        assay_type = 'pKd'
        # Assays with a numeric activity value in Kd.
        activity = kinomepath + 'LINCS_kinomescan_kd_nM.tsv'
    elif requested == 'pi':
        # Percent Inhibition Standardized =
        #   compound_concentration_nM*(100-%activity)/%activity
        assay_type = 'pPI'
        # Assays with a numeric activity value in PI.
        activity = kinomepath + 'LINCS_kinomescan_pi_nM.tsv'
    else:
        print("Choose activity. pKd or pI.")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    # Compiled once: the pattern is invariant across rows.
    modified_gene = re.compile(r'(.*)-(.*ted)$', re.M | re.I)
    data = []
    with open(activity, 'r') as f:
        for row in f:
            fields = row.strip().split('\t')
            drug = fields[0]
            gene = fields[1]
            modobj = modified_gene.match(gene)
            if modobj:
                gene = modobj.group(1)
                mod = modobj.group(2)
            else:
                mod = None
            val = float(fields[2])
            if val <= 0:
                # Nonsense data: the value must be positive.
                continue
            val = -np.log10(val) + 9.0  # all activities are in nM
            if np.isinf(val) or np.isnan(val):
                continue
            # Deliberately unguarded: an unmapped compound raises KeyError.
            ikey = chemname2ikey[drug]
            try:
                uni = gene2uniprot[gene]
            except KeyError:
                # Gene not mapped to any protein.
                continue
            if mod is not None:
                uni = uni + '-' + mod
            data.append((ikey, uni, assay_type, '=', val))
    if dataframe:
        data = pandas_df_continuous(data)
    return data
def get_gpcrdb(assay_type='pKd', dataframe=True):
    """Load GPCRdb assay records of one activity type on a -log10 scale.

    ``GPCR_assays.csv.gz`` is decompressed with ``decompress_gzip`` when it
    exists; otherwise ``GPCR_assays.csv`` is read directly.  The first line
    is skipped as a header.  For every remaining CSV row the activity type
    in column 26 is prefixed with ``'p'`` (e.g. ``IC50`` -> ``pIC50``) and
    compared case-insensitively with ``assay_type``; non-matching rows are
    ignored.  The value in column 28 is converted with ``-log10(v) + 9``
    (per the original comments all activities are in nM).

    Because the conversion negates the value, an inequality relation is
    flipped (``<`` becomes ``>`` and vice versa).  Rows are skipped when the
    compound or target ChEMBL id has no mapping, when the value is not
    numeric, when the converted value is infinite or NaN, or when the
    relation is neither ``'='`` nor contains exactly one of ``<`` / ``>``.

    Args:
        assay_type: Activity type to keep, e.g. ``'pKd'``, ``'pKi'``,
            ``'pIC50'``.
        dataframe: When true, the collected tuples are passed through
            ``pandas_df_continuous`` before being returned.  According to
            the original comments the frame has an integer index from 0 and
            columns ``['InChIKey','UniProt','Activity_type','Relation',
            'Activity_value']``.

    Returns:
        A list of ``(ikey, uniprot, atype, relation, value)`` tuples, where
        ``atype`` is ``'p'`` plus the type as spelled in the file, or the
        result of ``pandas_df_continuous`` on that list.

    Raises:
        IndexError: If a row has too few columns.
    """
    fpath = '../../GPCRdb/'
    gpcrfile = fpath + 'GPCR_assays.csv.gz'
    if Path(gpcrfile).is_file():
        # Decompress if compressed.
        gpcrfile = decompress_gzip(gpcrfile)
    else:
        # Already decompressed.
        gpcrfile = fpath + 'GPCR_assays.csv'
    chembl2uniprot = load_chembl2uniprot()
    chembl2ikey = load_chembl2ikey()
    wanted = assay_type.lower()
    data = []
    with open(gpcrfile, 'r') as f:
        # activity types : ['AC50', 'Potency', 'IC50', 'EC50', 'Kd', 'Ki']
        # activity units : ['nM']
        next(f, None)  # skip the header; tolerate a completely empty file
        reader = csv.reader(f, quotechar='"', delimiter=',',
                            quoting=csv.QUOTE_ALL, skipinitialspace=True)
        for row in reader:
            c_chembl = row[14]  # ChEMBL id of the chemical molecule
            rel = row[25]
            atype = 'p' + row[26]  # e.g. IC50 -> pIC50
            raw_val = row[28]
            if atype.lower() != wanted:
                continue
            p_chembl = row[29]  # ChEMBL id of the target
            try:
                ikey = chembl2ikey[c_chembl]
                uni = chembl2uniprot[p_chembl]
            except KeyError:
                continue
            # Builtin float: the np.float alias no longer exists in current
            # NumPy, and a broad except here used to hide that failure and
            # silently drop every row.
            try:
                val = float(raw_val)
            except ValueError:
                continue
            val = -np.log10(val) + 9.0  # all activities are in nM
            if np.isinf(val) or np.isnan(val):
                continue
            if rel != '=':
                # The -log conversion reverses the direction of inequalities.
                has_lt = '<' in rel
                has_gt = '>' in rel
                if has_lt and has_gt:
                    # A relation should not contain both > and <.
                    continue
                if has_lt:
                    rel = rel.replace('<', '>')
                elif has_gt:
                    rel = rel.replace('>', '<')
                else:
                    continue
            data.append((ikey, uni, atype, rel, val))
    if dataframe:
        data = pandas_df_continuous(data)
    return data