def pfamDomains(release, user, pword, host, port):
    """Collect the Pfam domains of ChEMBL and human targets and export them.

    The ChEMBL targets come from ``getUniprotTargets``, the human gene names
    from ``data/proteinCoding.tab``. Both are merged by ``getAllTargets``,
    looked up with ``getPfamDomains.getDomains`` and finally handed to
    ``export.exportPfamDict``.

    All five arguments are passed through unchanged to the helper modules
    (presumably the ChEMBL release and the MySQL connection settings --
    confirm against those modules). Nothing is returned.
    """
    import getUniprotTargets
    import parse
    import getAllTargets
    import getPfamDomains
    import export

    ## All ChEMBL targets with a Uniprot accession.
    chembl_targets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Human protein coding gene names: only the part of each key in front
    ## of the first ';' is kept.
    protein_coding = parse.parse2col("data/proteinCoding.tab", True, 1, 0)
    human_targets = [key.split(";")[0] for key in protein_coding.keys()]
    print("We are dealing with %s human proteins" % len(human_targets))

    ## The targets that are fed into the getPfamDomains procedure.
    merged_targets = getAllTargets.getAllTargets(human_targets, chembl_targets)
    target_names = merged_targets.keys()

    ## Parse Pfam for the domains. According to the original note this step
    ## takes long and therefore pickles out the domain dictionary.
    pfam_dict = getPfamDomains.getDomains(target_names, release)

    ## Export the domain dictionary as a MySQL table.
    export.exportPfamDict(
        chembl_targets, pfam_dict, release, user, pword, host, port)
def master(version):
    """Run through all steps to identify mandatory multi-domain architectures.

    version -- tag of the validated-domain table, which is read from
        ``data/valid_pfam_v_<version>.tab``.

    Uses the module-level names RELEASE, USER, PWORD, HOST and PORT. The
    results are written by the ``export_*`` helpers to files whose names
    start with ``data/multi_dom_`` and end with the release. Nothing is
    returned.
    """
    ## Load the pfamDict. The context manager closes the file even when
    ## unpickling fails.
    with open('data/protCodPfamDict_%s.pkl' % RELEASE, 'r') as infile:
        pfam_d = pickle.load(infile)

    ## Load the list of validated domains. 'Pkinase_Tyr' is always removed;
    ## the del fails if the table does not contain that entry.
    valid_dom_d = readfile(
        'data/valid_pfam_v_%(version)s.tab' % {'version': version},
        'pfam_a', 'pfam_a')
    del valid_dom_d['Pkinase_Tyr']
    valid_doms = valid_dom_d.keys()

    ## Load Uniprot targets.
    ## NOTE(review): the result is not used below; the call is kept in case
    ## it has side effects -- confirm and drop if it has none.
    chembl_targets = getUniprotTargets.getUniprotTargets(
        RELEASE, USER, PWORD, HOST, PORT)

    ## Load eligible multi-domain targets.
    el_targets = get_el_targets(RELEASE, USER, PWORD, HOST, PORT)

    ## Add targets with given architecture. Only the architecture and the
    ## domain lookups are exported below; act_lkp is not used here.
    (arch_lkp, dom_lkp, act_lkp) = get_multi_doms(el_targets, pfam_d)

    ## Write multi-domain architectures to markdown tables.
    export_archs(arch_lkp, valid_doms, 'data/multi_dom_archs_%s' % RELEASE)

    ## Write domains from multi-domain architectures to markdown tables.
    export_doms(dom_lkp, valid_doms, 'data/multi_dom_doms_%s' % RELEASE)

    ## Export network file.
    export_network(arch_lkp, valid_doms, 'data/multi_dom_network_%s' % RELEASE)

    ## Export network attribute file.
    export_attribs(
        arch_lkp, valid_doms, 'data/multi_dom_attributes_%s' % RELEASE)
def master():
    """Identify architectures that bind sm through multiple domains.

    ("sm" is taken from the original comments -- presumably "small
    molecules"; confirm.)

    Reads ``data/pdbDict_<release>.pkl`` and
    ``data/protCodPfamDict_<release>.pkl``, derives the architectures with
    ``get_archs``, attaches the ChEMBL targets with ``add_targets`` and
    writes the result through ``export_archs`` to
    ``data/interface_<release>``.

    Takes no arguments; relies on the module-level names release, user,
    pword, host, port, min_res and min_ratio. Nothing is returned.
    """
    ## Load the pdb_d. The context manager closes the file even when
    ## unpickling fails.
    with open('data/pdbDict_%s.pkl' % release, 'r') as infile:
        pdb_d = pickle.load(infile)

    ## Load the pfam_d.
    with open('data/protCodPfamDict_%s.pkl' % release, 'r') as infile:
        pfam_d = pickle.load(infile)

    ## Load Uniprot targets.
    chembl_targets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Convert pfam_d to long format.
    long_pfam_d = get_long_pfams(pfam_d)

    ## Identify architectures binding sm through multiple domains.
    arch_d = get_archs(pdb_d, long_pfam_d, min_res, min_ratio)

    ## Add targets with given architecture.
    arch_d = add_targets(chembl_targets, pfam_d, arch_d)

    ## Write architectures to markdown tables.
    export_archs(arch_d, 'data/interface_%s' % release)
def query(release, user, pword, host, port):
    """Fetch the Uniprot binding site annotation for all ChEMBL targets.

    The targets are obtained from ``getUniprotTargets`` with the given
    release and connection arguments and passed, together with the release,
    to ``queryUniprot.getBindingSites``. The number of entries in the
    returned mapping is printed; nothing is returned.
    """
    import queryUniprot
    import getUniprotTargets

    ## All protein targets from ChEMBL.
    chembl_targets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Uniprot binding site annotation for each target.
    binding_sites = queryUniprot.getBindingSites(chembl_targets, release)

    n_annotated = len(binding_sites.keys())
    print(" ".join(
        ["number of targets with binding site information", str(n_annotated)]))
def query(release, user, pword, host, port):
    """Query Uniprot binding sites for every ChEMBL target and report a count.

    Calls ``getUniprotTargets.getUniprotTargets`` with all five arguments,
    then ``queryUniprot.getBindingSites`` with the targets and the release,
    and prints how many keys the resulting mapping has. Returns None.
    """
    import queryUniprot
    import getUniprotTargets

    ## Get all protein targets from ChEMBL.
    targets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Get Uniprot binding site annotation for each target.
    annotation = queryUniprot.getBindingSites(targets, release)

    label = 'number of targets with binding site information'
    print('%s %s' % (label, len(annotation.keys())))
def mapPDs(th, release, user, pword, host, port):
    """Map ligands to Pfam domains and export the mapping.

    Steps: load the ChEMBL targets and the pickled Pfam dictionary, collect
    ligands for single-domain targets, build the propDict from them (minus
    the blacklist, plus the manual whitelist), use its keys as the list of
    validated domains for the multi-domain targets, then pickle the propDict
    to ``data/propDict_<release>.pkl`` and export it to MySQL.

    th -- activity threshold; converted with ``-log10(th * 10**-6)``
        (presumably th is given in micromolar -- confirm).
    release, user, pword, host, port -- passed through unchanged to the
        helper modules.

    Nothing is returned.
    """
    import pickle

    import numpy as np

    import export
    import feedPropDict
    import getUniprotTargets
    import multiDomain
    import parse
    import singleDomain

    ## Set the threshold.
    threshold = -np.log10(th * 10 ** (-6))

    ## Get a list of all ChEMBL targets.
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Load the pfamDict. (The original passed the undefined name 'inFile'
    ## to pickle.load, which raised a NameError.)
    with open('data/protCodPfamDict_%s.pkl' % release, 'r') as infile:
        pfamDict = pickle.load(infile)

    ## Get ligands for targets with single domains.
    single = singleDomain.singleDomains(
        pfamDict, chemblTargets, threshold, release, user, pword, host, port)

    ## Construct the propDict for targets with one domain. Manually remove
    ## targets (as described in Methods section Manual curation) listed in
    ## blacklist.tab and add domains that never occur alone listed in
    ## whitelist (Pkinase_Tyr).
    blacklist = parse.col2list('data/blacklist.tab', 1, False)
    propDict = {}
    propDict = feedPropDict.dictionary(single, propDict, blacklist, 'single')
    propDict = feedPropDict.addLigs(propDict, 'manual', 'data/whitelist.tab')

    ## Extract a list of validated domains. This is taken before the
    ## multi-domain data is inserted below.
    valid = propDict.keys()

    ## Identify targets with one binding site containing domain and at least
    ## one other domain.
    multi = multiDomain.multiDomain(
        pfamDict, chemblTargets, valid, threshold,
        release, user, pword, host, port)

    ## Insert data for multi domain proteins.
    propDict = feedPropDict.dictionary(multi, propDict, blacklist, 'multi')

    ## Pickle the mapping; the file is closed (and therefore flushed) before
    ## the MySQL export starts.
    with open('data/propDict_%s.pkl' % release, 'w') as outfile:
        pickle.dump(propDict, outfile)

    ## Export the mapping to a mySQL table.
    export.exportMapsMySQL(propDict, release, user, pword, host, port)
    ## NOTE(review): 'conflicts' is not defined anywhere in this function.
    ## Unless it is a module-level name this call raises a NameError --
    ## confirm where the conflicts are supposed to come from.
    export.exportConflsMySQL(conflicts, release, user, pword, host, port)
def master():
    """Find architectures binding sm through multiple domains and export them.

    ("sm" is the wording of the original comments -- presumably "small
    molecules"; confirm.)

    Inputs are the pickles ``data/pdbDict_<release>.pkl`` and
    ``data/protCodPfamDict_<release>.pkl`` plus the ChEMBL targets returned
    by ``getUniprotTargets``. The output is written by ``export_archs``
    under the name ``data/interface_<release>``.

    The function has no parameters; release, user, pword, host, port,
    min_res and min_ratio are read from module scope. Returns None.
    """
    pdb_path = 'data/pdbDict_%s.pkl' % release
    pfam_path = 'data/protCodPfamDict_%s.pkl' % release

    ## Load the pdb_d; 'with' guarantees the handle is closed even if
    ## unpickling raises.
    with open(pdb_path, 'r') as pdb_file:
        pdb_d = pickle.load(pdb_file)

    ## Load the pfam_d.
    with open(pfam_path, 'r') as pfam_file:
        pfam_d = pickle.load(pfam_file)

    ## Load Uniprot targets.
    chembl_targets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Convert pfam_d to long format.
    long_pfam_d = get_long_pfams(pfam_d)

    ## Identify architectures binding sm through multiple domains.
    arch_d = get_archs(pdb_d, long_pfam_d, min_res, min_ratio)

    ## Add targets with given architecture.
    arch_d = add_targets(chembl_targets, pfam_d, arch_d)

    ## Write architectures to markdown tables.
    export_archs(arch_d, 'data/interface_%s' % release)
def analysis(th, release, user, pword, host, port):
    """Run the evaluation and plotting steps for the Pfam domain mapping.

    Loads the ChEMBL targets, the human protein coding genes and the pickled
    Pfam, PDB, Uniprot and propDict dictionaries, then runs the plotting
    steps in order: domain ratio statistics, binding-site evaluation against
    PDBe and Uniprot, group size summary, evaluation of the predicted
    mappings, power law plots and ligand property plots. Each plot is drawn
    by an R script started through ``os.system``.

    th -- activity threshold; converted with ``-log10(th * 10**-6)``
        (presumably th is given in micromolar -- confirm).
    release, user, pword, host, port -- passed through unchanged to the
        helper modules; release is also part of the data file names.

    Nothing is returned.
    """
    import os
    import pickle

    import numpy as np

    import countFreqs
    import evaluatePred
    import export
    import getRatioUnstruct
    import getUniprotTargets
    import groupSize
    import matchData
    import parse
    import parseUniChem
    import plplot
    import plplotRaw
    import queryDevice
    import writeTable

    r_batch = ('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 '
               'CMD BATCH --vanilla')

    def run_r(script, *args):
        # Runs '<r_batch> -<arg1> -<arg2> ... <script>' in a shell.
        options = ''.join(' -%s' % (arg,) for arg in args)
        os.system('%s%s %s' % (r_batch, options, script))

    def load_pickle(path):
        # Unpickle one file and close it even if unpickling fails.
        with open(path, 'r') as handle:
            return pickle.load(handle)

    ####
    #### Load data.
    ####

    ## Set threshold for all calculations.
    threshold = -np.log10(th * 10 ** (-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Read all human protein coding genes.
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    print("We are dealing with %s human proteins" % len(humProtCod.keys()))

    ## Get a list of all human (!) ChEMBL targets. The gene names are put
    ## into a set once so the membership test does not rebuild and scan a
    ## key list for every target.
    human_names = set(humProtCod.keys())
    humChembl = {}
    for target in chemblTargets:
        if target in human_names:
            humChembl[target] = 0

    ## Load the pfamDict, the pdbDict and the uniprotDict.
    pfamDict = load_pickle('data/protCodPfamDict_%s.pkl' % release)
    pdbDict = load_pickle('data/pdbDict_chembl%s.pkl' % release)
    uniprotDict = load_pickle('data/bsDictUniprot_chembl%s.pkl' % release)
    print('%s %s' % ('number of targets with binding site information',
                     len(uniprotDict.keys())))

    ## Load the uniDict.
    uniDict = parseUniChem.parse('data/unichemMappings.txt')

    ## Load the propDict.
    propDict = load_pickle('data/propDict_%s.pkl' % release)

    ####
    #### Generate Plots.
    ####

    ## For each target in PfamDict, calculate the ratio of domain over
    ## non-domain regions.
    pfamDict = getRatioUnstruct.getRatio(
        pfamDict, humProtCod, release, user, pword, host, port)
    writeTable.writePfam(
        pfamDict, humProtCod, humChembl, chemblTargets, release)
    run_r('plotPfamStat.R', release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    run_r('ecdf.R', 'within', "PDB", release)

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    run_r('ecdf.R', 'within', "Uni", release)

    ## Print a summary of the number of targets and domains covered by the
    ## mapping.
    ## NOTE(review): allDomains is not used below; the call is kept in case
    ## it has side effects.
    allDomains = groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    ## The original passed the undefined name 'singles' here (NameError);
    ## the value computed on the previous line is the evident intent.
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singleDomains)
    print('%s %s' % (
        "all possible groups (single, none, multi, conflict):", groupsAll))
    (single, multi, conflict) = groupSize.groupSizeMap(
        chemblTargets, release, user, pword, host, port)
    print('%s %s %s %s' % (
        "all covered targets (single, multi, conflict): ",
        len(single), len(multi), len(conflict)))
    (single, multi, conflict) = groupSize.actSizeMap(
        chemblTargets, release, user, pword, host, port)
    print('%s %s %s %s' % (
        "all covered targets (single, multi, conflict): ",
        len(single), len(multi), len(conflict)))

    ## Plot the evaluation of the mappings.
    intacts = queryDevice.queryDevice(
        "SELECT mpf.protein_accession,mpf.domain,mpf.molregno, pfd.start, "
        "pfd.end, mpf.maptype, md.chembl_id "
        "FROM map_pfam mpf "
        "JOIN pfam_domains pfd ON pfd.protein_accession = mpf.protein_accession "
        "JOIN molecule_dictionary md ON md.molregno = mpf.molregno "
        "WHERE mpf.domain = pfd.domain",
        release, user, pword, host, port)

    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    run_r('ecdf.R', 'prediction', 'PDB', release)

    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    run_r('ecdf.R', 'prediction', "Uni", release)

    ## Map the overlap
    #import overlap
    #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
    #overlap.overlap(propDict, tholds, release)

    ## Power Law Distribution of domain occurrences
    ## Prepare the data for the power law plot.
    ## 1. Count the targets and compounds per domain using the propDict
    ## 2. Count a human genes per domain using the Pfam dictionary
    ## 3. Plot the power law distributions for all domains and overlay 25 most
    ## frequent domains
    countFreqs.countLigs(
        humProtCod.keys(), chemblTargets, release, user, pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)
    filenames = ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']
    for filename in filenames:
        run_r('statPowerLaw.R', filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print('%s %s %s %s %s %s' % (
            len(freqs), minx, al, filename, type(freqs), type(freqs[1])))
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(
        selected, propDict, threshold, release, user, pword, host, port)
    filename = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
    run_r('pca.R', filename)
def analysis(release):
    """Run the evaluation and plotting steps for the Pfam domain mapping.

    The connection settings and the activity threshold are read from the
    keys 'user', 'pword', 'host', 'port' and 'threshold' of ``mpf.yaml`` in
    the current working directory. After loading the ChEMBL targets, the
    human protein coding genes and the pickled Pfam, PDB, Uniprot and
    propDict dictionaries, the plotting steps run in order: domain ratio
    statistics, binding-site evaluation against PDBe and Uniprot, group size
    summary, evaluation of the predicted mappings, power law plots and
    ligand property plots. Each plot is drawn by an R script started through
    ``os.system``.

    release -- passed through to the helper modules and used as part of the
        data file names.

    Nothing is returned.
    """
    import os
    import pickle

    import numpy as np
    import yaml

    import countFreqs
    import evaluatePred
    import export
    import getRatioUnstruct
    import getUniprotTargets
    import groupSize
    import matchData
    import parse
    import parseUniChem
    import plplot
    import plplotRaw
    import queryDevice
    import writeTable

    r_batch = ('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 '
               'CMD BATCH --vanilla')

    def run_r(script, *args):
        # Runs '<r_batch> -<arg1> -<arg2> ... <script>' in a shell.
        options = ''.join(' -%s' % (arg,) for arg in args)
        os.system('%s%s %s' % (r_batch, options, script))

    def load_pickle(path):
        # Unpickle one file and close it even if unpickling fails.
        with open(path, 'r') as handle:
            return pickle.load(handle)

    ####
    #### Load parameters.
    ####

    ## Read config file; the handle is closed as soon as it is parsed (the
    ## original never closed it).
    with open('mpf.yaml') as paramFile:
        params = yaml.safe_load(paramFile)
    user = params['user']
    pword = params['pword']
    host = params['host']
    port = params['port']
    th = params['threshold']

    ####
    #### Load data.
    ####

    ## Set threshold for all calculations.
    threshold = -np.log10(th * 10 ** (-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Get a list of all human (!) ChEMBL targets.
    ## NOTE(review): the original literal was the mangled 'H**o sapiens',
    ## which can never equal an organism name; restored to 'Homo sapiens'.
    ## Confirm this matches the values getUniprotTargets returns.
    humChembl = {}
    for target in chemblTargets.keys():
        if chemblTargets[target] == 'Homo sapiens':
            humChembl[target] = 0

    ## Read all human protein coding genes.
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    print("We are dealing with %s human proteins" % len(humProtCod.keys()))

    ## Load the pfamDict, the pdbDict and the uniprotDict.
    pfamDict = load_pickle('data/protCodPfamDict_%s.pkl' % release)
    pdbDict = load_pickle('data/pdbDict_%s.pkl' % release)
    uniprotDict = load_pickle('data/bsDictUniprot_%s.pkl' % release)
    print('%s %s' % ('number of targets with binding site information',
                     len(uniprotDict.keys())))

    ## Load the uniDict.
    uniDict = parseUniChem.parse('data/unichemMappings.txt')

    ## Load the propDict.
    propDict = load_pickle('data/propDict_%s.pkl' % release)

    ####
    #### Generate Plots.
    ####

    ## For each target in PfamDict, calculate the ratio of domain over
    ## non-domain regions.
    pfamDict = getRatioUnstruct.getRatio(
        pfamDict, humProtCod, release, user, pword, host, port)
    writeTable.writePfam(
        pfamDict, humProtCod, humChembl, chemblTargets, release)
    run_r('plotPfamStat.R', release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    run_r('ecdf.R', 'within', "PDB", release)

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    run_r('ecdf.R', 'within', "Uni", release)

    ## Print a summary of the number of targets and domains covered by the
    ## mapping.
    ## NOTE(review): allDomains is not used below; the call is kept in case
    ## it has side effects.
    allDomains = groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    ## The original passed the undefined name 'singles' here (NameError);
    ## the value computed on the previous line is the evident intent.
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singleDomains)
    print('%s %s' % (
        "all possible groups (single, none, multi, conflict):", groupsAll))
    (single, multi, conflict) = groupSize.groupSizeMap(
        chemblTargets, release, user, pword, host, port)
    print('%s %s %s %s' % (
        "all covered targets (single, multi, conflict): ",
        len(single), len(multi), len(conflict)))
    (single, multi, conflict) = groupSize.actSizeMap(
        chemblTargets, release, user, pword, host, port)
    print('%s %s %s %s' % (
        "all covered targets (single, multi, conflict): ",
        len(single), len(multi), len(conflict)))

    ## Plot the evaluation of the mappings.
    intacts = queryDevice.queryDevice(
        """SELECT mpf.protein_accession, mpf.domain,mpf.molregno, pfd.start, pfd.end, mpf.maptype, md.chembl_id
        FROM map_pfam mpf
        JOIN pfam_domains pfd ON pfd.protein_accession = mpf.protein_accession
        JOIN molecule_dictionary md ON md.molregno = mpf.molregno
        WHERE mpf.domain = pfd.domain""",
        release, user, pword, host, port)

    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    run_r('ecdf.R', 'prediction', 'PDB', release)

    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    run_r('ecdf.R', 'prediction', "Uni", release)

    ## Map the overlap
    #import overlap
    #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
    #overlap.overlap(propDict, tholds, release)

    ## Power Law Distribution of domain occurrences
    ## Prepare the data for the power law plot.
    ## 1. Count the targets and compounds per domain using the propDict
    ## 2. Count a human genes per domain using the Pfam dictionary
    ## 3. Plot the power law distributions for all domains and overlay 25 most
    ## frequent domains
    countFreqs.countLigs(
        humProtCod.keys(), chemblTargets, release, user, pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)
    filenames = ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']
    for filename in filenames:
        run_r('statPowerLaw.R', filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print('%s %s %s %s %s %s' % (
            len(freqs), minx, al, filename, type(freqs), type(freqs[1])))
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(
        selected, propDict, threshold, release, user, pword, host, port)
    filename = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
    run_r('pca.R', filename)