def power_law(x, variable, subject, radius=0.5, number_of_sets=100):
    """Fit a power law to the empirical sample ``x``, report it and plot it.

    The sample is treated as continuous when any element has a non-zero
    fractional part, and as discrete otherwise.  For discrete data an
    approximate estimate from ``plfit0.plfit0`` is used to build a grid of
    candidate scaling parameters that is handed to ``plfit.plfit`` and
    ``plpva.plpva``; for continuous data no grid is passed (``vec=None``).

    Args:
        x: Empirical data (array-like of numbers).
        variable: Label of the analysed variable; printed and forwarded to
            ``plplot.plplot``.
        subject: Suffix of the plot name (``"plplot_" + subject``).
        radius: Half-width of the alpha grid built around the rounded
            approximate estimate (discrete data only).
        number_of_sets: Number of synthetic data sets requested from
            ``plpva.plpva`` (its ``reps`` argument).

    Returns:
        None.  Results are printed and plotted via ``plplot.plplot``.
    """
    print("- Fitting power law to empirical data: %s" % variable)

    # Any non-zero fractional part marks the sample as continuous.
    if numpy.any(x - numpy.floor(x)):
        print(" CONTINUOUS")
        alpha_range = None
    else:
        alpha, xmin = plfit0.plfit0(x)
        print(" DISCRETE")
        print(" Approximate estimator for the scaling parameter of the discrete power law:")
        print(" * Scaling parameter: alpha %g" % alpha)
        print(" * Lower bound: xmin %g" % xmin)
        center = round(alpha)
        alpha_range = numpy.arange(center - radius, center + radius, 0.001)
        # Distributions with alpha <= 1 are not normalizable.
        alpha_range = alpha_range[alpha_range > 1]

    alpha, xmin, L = plfit.plfit(x, vec=alpha_range, nosmall=False, finite=True)
    print(" Numerical maximization of the logarithm of the likelihood function L:")
    print(" * Scaling parameter: alpha %g" % alpha)
    # An optimum sitting on the edge of the search grid suggests the grid was
    # too narrow.  There is no grid to check in the continuous case.
    if alpha_range is not None:
        if alpha == alpha_range.min() or alpha == alpha_range.max():
            print(" WARNING alpha_range")
    print(" * Lower bound: xmin %g" % xmin)
    print(" * Logarithm of the likelihood function: L %g" % L)

    p, gof = plpva.plpva(x, xmin, vec=alpha_range, reps=number_of_sets, quiet=True)
    print(" Generation of %d power-law distributed synthetic data sets:" % number_of_sets)
    print(" * Fraction of data sets with worse KS statistic than the empirical data: p-value %g" % p)
    print(" * KS statistic of the empirical data: D %g" % gof)

    png = "plplot_" + subject
    plplot.plplot(x, xmin, alpha, variable, p, png)
def analysis(th, release, user, pword, host, port):
    """Run the Pfam-mapping analysis pipeline for one release.

    Loads the pre-computed dictionaries from ``data/``, queries the database
    through the project helper modules, writes summary tables and invokes the
    R scripts that draw the plots.

    Args:
        th: Activity threshold; converted to ``-log10(th * 10**-6)`` before
            being passed to ``export.exportProps`` (presumably a micromolar
            value -- confirm with the caller).
        release: Release identifier used in file names, R arguments and
            database helper calls.
        user: Database user name, forwarded to the helper modules.
        pword: Database password, forwarded to the helper modules.
        host: Database host, forwarded to the helper modules.
        port: Database port, forwarded to the helper modules.

    Returns:
        None.  All results are written to disk or printed.
    """
    import os
    import pickle

    import numpy as np

    import countFreqs
    import evaluatePred
    import export
    import getRatioUnstruct
    import getUniprotTargets
    import groupSize
    import matchData
    import parse
    import parseUniChem
    import plplot
    import plplotRaw
    import queryDevice
    import writeTable

    r_batch = '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla'

    def run_r(script, *args):
        """Run ``script`` with R CMD BATCH, passing every arg as ``-arg``."""
        flags = ''.join(' -%s' % (arg,) for arg in args)
        os.system('%s%s %s' % (r_batch, flags, script))

    def load_pickle(path):
        """Unpickle ``path``; the handle is closed even if loading fails."""
        # NOTE: pickle must only be used on trusted, locally generated files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    ####
    #### Load data.
    ####

    ## Set threshold for all calculations.
    threshold = -np.log10(th * 10**(-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Read all human protein coding genes.
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    print("We are dealing with %s human proteins" % len(humProtCod))

    ## Get a list of all human (!) ChEMBL targets.
    humChembl = {}
    for target in chemblTargets:
        # Direct membership test instead of scanning the key list each time.
        if target in humProtCod:
            humChembl[target] = 0

    ## Load the pre-computed dictionaries.
    pfamDict = load_pickle('data/protCodPfamDict_%s.pkl' % release)
    pdbDict = load_pickle('data/pdbDict_chembl%s.pkl' % release)
    uniprotDict = load_pickle('data/bsDictUniprot_chembl%s.pkl' % release)
    print('number of targets with binding site information %s'
          % len(uniprotDict))
    uniDict = parseUniChem.parse('data/unichemMappings.txt')
    propDict = load_pickle('data/propDict_%s.pkl' % release)

    ####
    #### Generate Plots.
    ####

    ## For each target in PfamDict, calculate the ratio of domain over
    ## non-domain regions.
    pfamDict = getRatioUnstruct.getRatio(
        pfamDict, humProtCod, release, user, pword, host, port)
    writeTable.writePfam(pfamDict, humProtCod, humChembl, chemblTargets,
                         release)
    run_r('plotPfamStat.R', release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    run_r('ecdf.R', 'within', "PDB", release)

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    run_r('ecdf.R', 'within', "Uni", release)

    ## Print a summary of the number of targets and domains covered by the
    ## mapping.
    # Result is not used below; the call is kept in case it has side effects.
    groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    # The original passed the undefined name `singles` here (NameError).
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singleDomains)
    print("all possible groups (single, none, multi, conflict): %s"
          % (groupsAll,))
    covered_label = "all covered targets (single, multi, conflict): "
    (single, multi, conflict) = groupSize.groupSizeMap(
        chemblTargets, release, user, pword, host, port)
    print("%s %s %s %s"
          % (covered_label, len(single), len(multi), len(conflict)))
    (single, multi, conflict) = groupSize.actSizeMap(
        chemblTargets, release, user, pword, host, port)
    print("%s %s %s %s"
          % (covered_label, len(single), len(multi), len(conflict)))

    ## Plot the evaluation of the mappings.
    intacts = queryDevice.queryDevice(
        "SELECT mpf.protein_accession,mpf.domain,mpf.molregno, pfd.start, "
        "pfd.end, mpf.maptype, md.chembl_id "
        "FROM map_pfam mpf "
        "JOIN pfam_domains pfd ON pfd.protein_accession = mpf.protein_accession "
        "JOIN molecule_dictionary md ON md.molregno = mpf.molregno "
        "WHERE mpf.domain = pfd.domain",
        release, user, pword, host, port)

    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    run_r('ecdf.R', 'prediction', 'PDB', release)

    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    run_r('ecdf.R', 'prediction', "Uni", release)

    ## Map the overlap
    #import overlap
    #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
    #overlap.overlap(propDict, tholds, release)

    ## Power Law Distribution of domain occurrences
    ## Prepare the data for the power law plot.
    ## 1. Count the targets and compounds per domain using the propDict
    ## 2. Count human genes per domain using the Pfam dictionary
    ## 3. Plot the power law distributions for all domains and overlay the
    ##    25 most frequent domains
    countFreqs.countLigs(humProtCod.keys(), chemblTargets, release, user,
                         pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)
    for filename in ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']:
        run_r('statPowerLaw.R', filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print("%s %s %s %s %s %s" % (len(freqs), minx, al, filename,
                                     type(freqs), type(freqs[1])))
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(selected, propDict, threshold, release, user, pword,
                       host, port)
    props_file = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold),
                                                        release)
    run_r('pca.R', props_file)
#plothist = [] #plothist = np.append(plothist,50. *np.ones(36)) #plothist = np.append(plothist,71. *np.ones(18) ) #plothist = np.append(plothist,100. *np.ones(12) ) #plothist = np.append(plothist,141. *np.ones(6) ) #plothist = np.append(plothist,200.*np.ones(4) ) #plothist = np.append(plothist,282.*np.ones(1) ) #plothist = np.append(plothist,400.*np.ones(2) ) exit() plt.figure(3) [alpha, xmin, L] = plfit.plfit(plothist,'xmin',30)#50.) print alpha,xmin #a = plpva.plpva(plothist,30,'xmin',30) h = plplot.plplot(plothist,xmin,alpha) plt.loglog(h[0], h[1], 'k--',linewidth=2) plt.hist( plothist,\ log=True,\ bins=zebins,\ # cumulative=-1,\ normed=True,\ ) plt.xscale('log') plt.xlabel('Pressure (micro Pa)') plt.ylabel('Population (normalized)') myp.makeplotres("data",res=200,disp=False) #plt.figure(4) #[alpha, xmin, L] = plfit.plfit(plothist,'xmin',50.) #,'xmin',30.) #print alpha,xmin
def analysis(release):
    """Run the Pfam-mapping analysis pipeline for one release.

    Connection parameters and the activity threshold are read from
    ``mpf.yaml`` (keys ``user``, ``pword``, ``host``, ``port`` and
    ``threshold``).  The function then loads the pre-computed dictionaries
    from ``data/``, queries the database through the project helper modules,
    writes summary tables and invokes the R scripts that draw the plots.

    Args:
        release: Release identifier used in file names, R arguments and
            database helper calls.

    Returns:
        None.  All results are written to disk or printed.
    """
    import os
    import pickle

    import numpy as np
    import yaml

    import countFreqs
    import evaluatePred
    import export
    import getRatioUnstruct
    import getUniprotTargets
    import groupSize
    import matchData
    import parse
    import parseUniChem
    import plplot
    import plplotRaw
    import queryDevice
    import writeTable

    r_batch = '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla'

    def run_r(script, *args):
        """Run ``script`` with R CMD BATCH, passing every arg as ``-arg``."""
        flags = ''.join(' -%s' % (arg,) for arg in args)
        os.system('%s%s %s' % (r_batch, flags, script))

    def load_pickle(path):
        """Unpickle ``path``; the handle is closed even if loading fails."""
        # NOTE: pickle must only be used on trusted, locally generated files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    ####
    #### Load parameters.
    ####

    # Read config file; the handle is closed as soon as parsing is done.
    with open('mpf.yaml') as paramFile:
        params = yaml.safe_load(paramFile)
    user = params['user']
    pword = params['pword']
    host = params['host']
    port = params['port']
    th = params['threshold']

    ####
    #### Load data.
    ####

    ## Set threshold for all calculations.
    threshold = -np.log10(th * 10**(-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Get a list of all human (!) ChEMBL targets.
    # NOTE(review): the literal previously read 'H**o sapiens', which looks
    # like a censor-mangled 'Homo sapiens' and could never match an organism
    # name; confirm against the values returned by getUniprotTargets.
    humChembl = {}
    for target, organism in chemblTargets.items():
        if organism == 'Homo sapiens':
            humChembl[target] = 0

    ## Read all human protein coding genes.
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    print("We are dealing with %s human proteins" % len(humProtCod))

    ## Load the pre-computed dictionaries.
    pfamDict = load_pickle('data/protCodPfamDict_%s.pkl' % release)
    pdbDict = load_pickle('data/pdbDict_%s.pkl' % release)
    uniprotDict = load_pickle('data/bsDictUniprot_%s.pkl' % release)
    print('number of targets with binding site information %s'
          % len(uniprotDict))
    uniDict = parseUniChem.parse('data/unichemMappings.txt')
    propDict = load_pickle('data/propDict_%s.pkl' % release)

    ####
    #### Generate Plots.
    ####

    ## For each target in PfamDict, calculate the ratio of domain over
    ## non-domain regions.
    pfamDict = getRatioUnstruct.getRatio(
        pfamDict, humProtCod, release, user, pword, host, port)
    writeTable.writePfam(pfamDict, humProtCod, humChembl, chemblTargets,
                         release)
    run_r('plotPfamStat.R', release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    run_r('ecdf.R', 'within', "PDB", release)

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    run_r('ecdf.R', 'within', "Uni", release)

    ## Print a summary of the number of targets and domains covered by the
    ## mapping.
    # Result is not used below; the call is kept in case it has side effects.
    groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    # The original passed the undefined name `singles` here (NameError).
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singleDomains)
    print("all possible groups (single, none, multi, conflict): %s"
          % (groupsAll,))
    covered_label = "all covered targets (single, multi, conflict): "
    (single, multi, conflict) = groupSize.groupSizeMap(
        chemblTargets, release, user, pword, host, port)
    print("%s %s %s %s"
          % (covered_label, len(single), len(multi), len(conflict)))
    (single, multi, conflict) = groupSize.actSizeMap(
        chemblTargets, release, user, pword, host, port)
    print("%s %s %s %s"
          % (covered_label, len(single), len(multi), len(conflict)))

    ## Plot the evaluation of the mappings.
    intacts = queryDevice.queryDevice(
        "SELECT mpf.protein_accession, mpf.domain,mpf.molregno, pfd.start, "
        "pfd.end, mpf.maptype, md.chembl_id "
        "FROM map_pfam mpf "
        "JOIN pfam_domains pfd ON pfd.protein_accession = mpf.protein_accession "
        "JOIN molecule_dictionary md ON md.molregno = mpf.molregno "
        "WHERE mpf.domain = pfd.domain",
        release, user, pword, host, port)

    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    run_r('ecdf.R', 'prediction', 'PDB', release)

    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    run_r('ecdf.R', 'prediction', "Uni", release)

    ## Map the overlap
    #import overlap
    #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
    #overlap.overlap(propDict, tholds, release)

    ## Power Law Distribution of domain occurrences
    ## Prepare the data for the power law plot.
    ## 1. Count the targets and compounds per domain using the propDict
    ## 2. Count human genes per domain using the Pfam dictionary
    ## 3. Plot the power law distributions for all domains and overlay the
    ##    25 most frequent domains
    countFreqs.countLigs(humProtCod.keys(), chemblTargets, release, user,
                         pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)
    for filename in ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']:
        run_r('statPowerLaw.R', filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print("%s %s %s %s %s %s" % (len(freqs), minx, al, filename,
                                     type(freqs), type(freqs[1])))
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(selected, propDict, threshold, release, user, pword,
                       host, port)
    props_file = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold),
                                                        release)
    run_r('pca.R', props_file)