Exemplo n.º 1
0
def _denseRanks(rankTups):
  """Dense-rank (key, value) tuples that are sorted by descending value.

  The rank starts at 1 and only increases when a value is strictly smaller
  than the value of the preceding tuple, so equal values share a rank.

  Returns a dict mapping every key to its rank.
  """
  ranks = {}
  rank = 1
  # Seeded with 0: the first tuple keeps rank 1 unless its value is negative.
  prevVal = 0
  for key, val in rankTups:
    if prevVal > val:
      rank += 1
    ranks[key] = rank
    prevVal = val
  return ranks


def prepRank():
  """Rank domains by gene and ligand frequency and dump the raw counts.

  Reads 'data/genFreq.tab' and 'data/domLigs.tab' through parse.parse2col
  (header flag True, key column 0, value column 1), ranks the entries of each
  table by descending value and writes 'data/temp.tab' with one
  'domain<TAB>genFreq<TAB>ligFreq' row per entry of the ligand table.

  Returns:
    A tuple (genRankL, ligRankL, yr2). genRankL and ligRankL are parallel
    lists holding rank + 1 of every ligand-table domain in the gene and in the
    ligand ranking. yr2 holds, for each threshold in [8, 40, 200, 1000], the
    ligand rank of the first (highest-valued) domain whose value is below the
    threshold; a threshold that no domain falls below contributes nothing.

  Raises:
    KeyError: if a domain of the ligand table is missing from the gene table.
  """
  import parse
  from operator import itemgetter

  keyIndex = 0
  valIndex = 1
  header = True
  genFreq = parse.parse2col('data/genFreq.tab', header, keyIndex, valIndex)
  ligFreq = parse.parse2col('data/domLigs.tab', header, keyIndex, valIndex)
  ligRankTups = sorted(ligFreq.items(), key=itemgetter(1), reverse=True)
  genRankTups = sorted(genFreq.items(), key=itemgetter(1), reverse=True)
  ligRanks = _denseRanks(ligRankTups)
  genRanks = _denseRanks(genRankTups)

  # Parallel lists over the ligand-table domains, ranks shifted by one.
  genRankL = []
  ligRankL = []
  for dom in ligRanks:
    genRankL.append(genRanks[dom] + 1)
    ligRankL.append(ligRanks[dom] + 1)

  ## Prepare the rectangular limits
  # NOTE(review): these ranks are NOT shifted by one, unlike the two lists
  # above - confirm this is what the plotting code expects.
  yr2 = []
  for threshold in [8, 40, 200, 1000]:
    for dom, freq in ligRankTups:
      if freq < threshold:
        yr2.append(ligRanks[dom])
        break

  # The context manager closes the file even if a lookup below raises.
  with open('data/temp.tab', 'w') as out:
    out.write('domain\tgenFreq\tligFreq\n')
    for dom in ligFreq:
      out.write('%s\t%s\t%s\n' % (dom, genFreq[dom], ligFreq[dom]))

  return genRankL, ligRankL, yr2
Exemplo n.º 2
0
def _rankDescending(freqs):
    """Sort a {key: value} mapping by descending value and dense-rank it.

    Returns:
        A tuple (ordered, ranks): `ordered` is the list of (key, value)
        tuples sorted by descending value, `ranks` maps each key to its rank.
        Ranks start at 1 and grow only when a value is strictly smaller than
        the one before it, so ties share a rank.
    """
    from operator import itemgetter

    ordered = sorted(freqs.items(), key=itemgetter(1), reverse=True)
    ranks = {}
    rank = 1
    # Seeded with 0: the first entry keeps rank 1 unless its value is negative.
    previous = 0
    for key, value in ordered:
        if previous > value:
            rank += 1
        ranks[key] = rank
        previous = value
    return ordered, ranks


def prepRank():
    """Build gene/ligand rank lists per domain and write the raw frequencies.

    The two frequency tables are read with parse.parse2col (header flag True,
    key column 0, value column 1) from 'data/genFreq.tab' and
    'data/domLigs.tab'. As a side effect 'data/temp.tab' is written with a
    'domain<TAB>genFreq<TAB>ligFreq' header and one row per ligand-table
    entry.

    Returns:
        A tuple (genRankL, ligRankL, yr2). The first two are parallel lists
        with rank + 1 of each ligand-table domain in the gene ranking and in
        the ligand ranking. yr2 collects, for the thresholds 8, 40, 200 and
        1000, the ligand rank of the first domain (in descending order) whose
        value is below the threshold; thresholds without such a domain add
        nothing.

    Raises:
        KeyError: if a ligand-table domain does not occur in the gene table.
    """
    import parse

    keyIndex = 0
    valIndex = 1
    header = True
    genFreq = parse.parse2col('data/genFreq.tab', header, keyIndex, valIndex)
    ligFreq = parse.parse2col('data/domLigs.tab', header, keyIndex, valIndex)

    ligRankTups, ligRanks = _rankDescending(ligFreq)
    _, genRanks = _rankDescending(genFreq)

    # Both lists follow the iteration order of ligRanks, so they stay aligned.
    domains = list(ligRanks)
    genRankL = [genRanks[dom] + 1 for dom in domains]
    ligRankL = [ligRanks[dom] + 1 for dom in domains]

    ## Prepare the rectangular limits
    # NOTE(review): unlike the lists above these ranks carry no +1 offset -
    # confirm that is intended.
    yr2 = []
    for threshold in [8, 40, 200, 1000]:
        for dom, freq in ligRankTups:
            if freq < threshold:
                yr2.append(ligRanks[dom])
                break

    # `with` guarantees the file is closed even when a write fails.
    with open('data/temp.tab', 'w') as out:
        out.write('domain\tgenFreq\tligFreq\n')
        for dom in ligFreq:
            out.write('%s\t%s\t%s\n' % (dom, genFreq[dom], ligFreq[dom]))

    return genRankL, ligRankL, yr2
Exemplo n.º 3
0
def pfamDomains(release, user, pword, host, port):
    """Collect Pfam domains for ChEMBL and human targets and export them.

    Args:
        release: release identifier, forwarded to the helper modules.
        user, pword, host, port: connection settings, forwarded unchanged to
            getUniprotTargets.getUniprotTargets and export.exportPfamDict.

    Prints the number of entries read from 'data/proteinCoding.tab'.
    Returns None.
    """
    import getUniprotTargets
    import parse
    import getAllTargets
    import getPfamDomains
    import export

    # All ChEMBL targets that have a Uniprot accession.
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    # Human protein coding entries; only the part before the first ';' of
    # each key is kept.
    proteinCoding = parse.parse2col("data/proteinCoding.tab", True, 1, 0)
    humanTargets = [entry.split(";")[0] for entry in proteinCoding.keys()]
    print("We are dealing with %s human proteins" % len(humanTargets))

    # Every target that gets fed into the Pfam lookup.
    targetIds = getAllTargets.getAllTargets(humanTargets, chemblTargets).keys()

    # Parsing Pfam is the slow step (per the original note it also pickles
    # its result - not visible from here).
    domainsByTarget = getPfamDomains.getDomains(targetIds, release)

    # Store the domain dictionary as a MySQL table.
    export.exportPfamDict(chemblTargets, domainsByTarget, release, user,
                          pword, host, port)
Exemplo n.º 4
0
def _loadPickle(path):
  """Return the object unpickled from the file at `path`."""
  import pickle
  # Binary mode is the mode pickle is documented for, and the context manager
  # closes the file even if unpickling raises.
  # NOTE(review): pickle.load can execute arbitrary code - only use it on
  # trusted, locally generated files.
  with open(path, 'rb') as handle:
    return pickle.load(handle)


def analysis(th, release, user, pword, host, port):
  """Run the evaluation and plotting pipeline for one release.

  Loads the pickled dictionaries for `release` from 'data/', hands them to the
  project helper modules, and launches the R scripts (plotPfamStat.R, ecdf.R,
  statPowerLaw.R, pca.R) through os.system. Summary lines are printed along
  the way.

  Args:
    th: activity threshold; converted with -log10(th * 10**-6) before it is
      passed to export.exportProps (presumably a micromolar value turned
      into a pKi-style cutoff - confirm).
    release: release identifier; interpolated into data file names and into
      the R command lines.
    user, pword, host, port: database connection settings, forwarded
      unchanged to the helper modules.

  Returns:
    None.
  """
  ####
  #### Load data.
  ####

  ## Set threshold for all calculations.
  import numpy as np
  threshold = -np.log10(th*10**(-6))

  ## Get all ChEMBL targets with a Uniprot accession.
  import getUniprotTargets
  chemblTargets = getUniprotTargets.getUniprotTargets(release, user, pword, host, port)

  ## Read all human protein coding genes
  import parse
  humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
  print("We are dealing with %s human proteins" % len(humProtCod.keys()))

  ## Get a list of all human (!) ChEMBL targets
  # Membership is tested on the dict itself instead of on a fresh key list.
  humChembl = {}
  for target in chemblTargets:
    if target in humProtCod:
      humChembl[target] = 0

  ## Load the pickled dictionaries.
  pfamDict = _loadPickle('data/protCodPfamDict_%s.pkl' % release)
  pdbDict = _loadPickle('data/pdbDict_chembl%s.pkl' % release)
  uniprotDict = _loadPickle('data/bsDictUniprot_chembl%s.pkl' % release)
  print('number of targets with binding site information %s'
        % len(uniprotDict.keys()))

  ## Load the uniDict.
  import parseUniChem
  uniDict = parseUniChem.parse('data/unichemMappings.txt')

  ## Load the propDict.
  propDict = _loadPickle('data/propDict_%s.pkl' % release)

  ####
  #### Generate Plots.
  ####

  # Common prefix of every R invocation below.
  # NOTE(review): the exit status of os.system is ignored, so a failing R
  # script goes unnoticed.
  rBatch = '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla'

  ## For each target in PfamDict, calculate the ratio of domain over non-domain regions.
  import getRatioUnstruct
  import writeTable
  import os
  pfamDict = getRatioUnstruct.getRatio(pfamDict, humProtCod, release, user, pword, host, port)
  writeTable.writePfam(pfamDict, humProtCod, humChembl, chemblTargets, release)
  os.system(rBatch + ' -%s plotPfamStat.R' % release)

  ## Assess small molecule binding within Pfam domains for PDBe entries.
  import matchData
  import evaluatePred
  pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
  evaluatePred.pdbe(pdbDict, 'within', release)
  os.system(rBatch + ' -%s  -%s -%s  ecdf.R' % ('within', "PDB", release))

  ## Assess small molecule binding within Pfam domains for Uniprot entries.
  uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
  evaluatePred.uniprot(uniprotDict, 'within', release)
  os.system(rBatch + ' -%s -%s -%s  ecdf.R' % ('within', "Uni", release))

  ## Print a summary of the number of targets and domains covered by the mapping.
  import groupSize
  # The result is not used here; the call is kept in case it has side effects.
  groupSize.uniqueDomains(pfamDict)
  singleDomains = groupSize.singles(chemblTargets, pfamDict)
  # Pass the single-domain result computed above; the previous code referred
  # to an undefined name here and failed with a NameError.
  groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singleDomains)
  print("all possible groups (single, none, multi, conflict): %s" % (groupsAll,))
  (single, multi, conflict) = groupSize.groupSizeMap(chemblTargets, release, user, pword, host, port)
  print("all covered targets (single, multi, conflict):  %s %s %s"
        % (len(single), len(multi), len(conflict)))
  (single, multi, conflict) = groupSize.actSizeMap(chemblTargets, release, user, pword, host, port)
  # NOTE(review): same label as the line above although the numbers come from
  # actSizeMap - confirm the wording.
  print("all covered targets (single, multi, conflict):  %s %s %s"
        % (len(single), len(multi), len(conflict)))

  ## Plot the evaluation of the mappings.
  import queryDevice

  intacts = queryDevice.queryDevice(
      "SELECT mpf.protein_accession,mpf.domain,mpf.molregno, pfd.start, pfd.end, "
      "mpf.maptype, md.chembl_id "
      "FROM map_pfam mpf "
      "JOIN pfam_domains pfd ON pfd.protein_accession = mpf.protein_accession "
      "JOIN molecule_dictionary md ON md.molregno = mpf.molregno "
      "WHERE mpf.domain = pfd.domain",
      release, user, pword, host, port)

  # ...against PDBe
  pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
  evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
  os.system(rBatch + ' -%s -%s -%s  ecdf.R' % ('prediction', 'PDB', release))
  # ...against uniprot
  uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
  evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
  os.system(rBatch + ' -%s  -%s -%s  ecdf.R' % ('prediction', "Uni", release))

  ## Power Law Distribution of domain occurences
  ##  Prepare the data for the power law plot.
  ##  1. Count the targets and compounds per domain using the propDict
  ##  2. Count a human genes per domain using the Pfam dictionary
  ##  3. Plot the power law distributions for all domains and overlay 25 most
  ##     frequent domains
  import countFreqs
  import plplot
  import plplotRaw
  countFreqs.countLigs(humProtCod.keys(), chemblTargets, release, user, pword, host, port)
  countFreqs.countDoms(humProtCod.keys(), pfamDict)
  filenames = ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']

  for filename in filenames:
    os.system(rBatch + ' -%s statPowerLaw.R' % filename)
    al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
    freqs = parse.col2intlist('data/%s' % filename, 1, True)
    print('%s %s %s %s %s %s'
          % (len(freqs), minx, al, filename, type(freqs), type(freqs[1])))
    plplot.plplot(freqs, minx, al, filename)
    plplotRaw.plplotRaw(freqs, filename)

  ## Plot the ligand properties.
  import export
  selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
  export.exportProps(selected, propDict, threshold, release, user, pword, host, port)

  filename = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
  os.system(rBatch + " -%s pca.R" % filename)
Exemplo n.º 5
0
def _loadPickle(path):
    """Return the object unpickled from the file at `path`."""
    import pickle
    # Binary mode is the mode pickle is documented for, and the context
    # manager closes the file even if unpickling raises.
    # NOTE(review): pickle.load can execute arbitrary code - only use it on
    # trusted, locally generated files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def analysis(release):
    """Run the evaluation and plotting pipeline for one release.

    Connection settings and the activity threshold are read from the keys
    'user', 'pword', 'host', 'port' and 'threshold' of 'mpf.yaml' in the
    current directory. The pickled dictionaries for `release` are loaded from
    'data/', handed to the project helper modules, and the R scripts
    (plotPfamStat.R, ecdf.R, statPowerLaw.R, pca.R) are launched through
    os.system. Summary lines are printed along the way.

    Args:
        release: release identifier; interpolated into data file names and
            into the R command lines.

    Returns:
        None.

    Raises:
        KeyError: if one of the expected keys is missing from 'mpf.yaml'.
    """
    ####
    #### Load parameters.
    ####

    import yaml
    # Read config file; the context manager makes sure the handle is closed.
    with open('mpf.yaml') as paramFile:
        params = yaml.safe_load(paramFile)
    user = params['user']
    pword = params['pword']
    host = params['host']
    port = params['port']
    th = params['threshold']

    ####
    #### Load data.
    ####

    ## Set threshold for all calculations.
    import numpy as np
    threshold = -np.log10(th * 10**(-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    import getUniprotTargets
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Get a list of all human (!) ChEMBL targets
    # The literal had been mangled to 'H**o sapiens', which can never equal an
    # organism name and left humChembl empty.
    humChembl = {}
    for target, organism in chemblTargets.items():
        if organism == 'Homo sapiens':
            humChembl[target] = 0

    ## Read all human protein coding genes
    import parse
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    print("We are dealing with %s human proteins" % len(humProtCod.keys()))

    ## Load the pickled dictionaries.
    pfamDict = _loadPickle('data/protCodPfamDict_%s.pkl' % release)
    pdbDict = _loadPickle('data/pdbDict_%s.pkl' % release)
    uniprotDict = _loadPickle('data/bsDictUniprot_%s.pkl' % release)
    print('number of targets with binding site information %s'
          % len(uniprotDict.keys()))

    ## Load the uniDict.
    import parseUniChem
    uniDict = parseUniChem.parse('data/unichemMappings.txt')

    ## Load the propDict.
    propDict = _loadPickle('data/propDict_%s.pkl' % release)

    ####
    #### Generate Plots.
    ####

    # Common prefix of every R invocation below.
    # NOTE(review): the exit status of os.system is ignored, so a failing R
    # script goes unnoticed.
    rBatch = ('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 '
              'CMD BATCH --vanilla')

    ## For each target in PfamDict, calculate the ratio of domain over non-domain regions.
    import getRatioUnstruct
    import writeTable
    import os
    pfamDict = getRatioUnstruct.getRatio(pfamDict, humProtCod, release, user,
                                         pword, host, port)
    writeTable.writePfam(pfamDict, humProtCod, humChembl, chemblTargets,
                         release)
    os.system(rBatch + ' -%s plotPfamStat.R' % release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    import matchData
    import evaluatePred
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    os.system(rBatch + ' -%s  -%s -%s  ecdf.R' % ('within', "PDB", release))

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    os.system(rBatch + ' -%s -%s -%s  ecdf.R' % ('within', "Uni", release))

    ## Print a summary of the number of targets and domains covered by the mapping.
    import groupSize
    # The result is not used here; the call is kept in case it has side
    # effects.
    groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    # Pass the single-domain result computed above; the previous code referred
    # to an undefined name here and failed with a NameError.
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singleDomains)
    print("all possible groups (single, none, multi, conflict): %s"
          % (groupsAll,))
    (single, multi, conflict) = groupSize.groupSizeMap(chemblTargets, release,
                                                       user, pword, host, port)
    print("all covered targets (single, multi, conflict):  %s %s %s"
          % (len(single), len(multi), len(conflict)))
    (single, multi, conflict) = groupSize.actSizeMap(chemblTargets, release,
                                                     user, pword, host, port)
    # NOTE(review): same label as the line above although the numbers come
    # from actSizeMap - confirm the wording.
    print("all covered targets (single, multi, conflict):  %s %s %s"
          % (len(single), len(multi), len(conflict)))

    ## Plot the evaluation of the mappings.
    import queryDevice

    intacts = queryDevice.queryDevice(
        """SELECT mpf.protein_accession,
		mpf.domain,mpf.molregno, pfd.start, pfd.end, mpf.maptype,
	 	md.chembl_id FROM map_pfam mpf 
	JOIN pfam_domains pfd 
	  ON pfd.protein_accession = mpf.protein_accession 
	JOIN molecule_dictionary md 
	  ON md.molregno = mpf.molregno 
	WHERE mpf.domain = pfd.domain""", release, user, pword, host, port)

    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    os.system(rBatch + ' -%s -%s -%s  ecdf.R' % ('prediction', 'PDB', release))
    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    os.system(rBatch + ' -%s  -%s -%s  ecdf.R' % ('prediction', "Uni", release))

    ## Power Law Distribution of domain occurences
    ##  Prepare the data for the power law plot.
    ##  1. Count the targets and compounds per domain using the propDict
    ##  2. Count a human genes per domain using the Pfam dictionary
    ##  3. Plot the power law distributions for all domains and overlay 25 most
    ##     frequent domains
    import countFreqs
    import plplot
    import plplotRaw
    countFreqs.countLigs(humProtCod.keys(), chemblTargets, release, user,
                         pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)
    filenames = ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']

    for filename in filenames:
        os.system(rBatch + ' -%s statPowerLaw.R' % filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print('%s %s %s %s %s %s' % (len(freqs), minx, al, filename,
                                     type(freqs), type(freqs[1])))
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    import export
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(selected, propDict, threshold, release, user, pword,
                       host, port)

    filename = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
    os.system(rBatch + " -%s pca.R" % filename)