예제 #1
0
def pfamDomains(release, user, pword, host, port):

    import getUniprotTargets
    import parse
    import getAllTargets
    import getPfamDomains
    import export

    ## Get all ChEMBL targets with a Uniprot accession.
    chemblTargets = getUniprotTargets.getUniprotTargets(release, user, pword, host, port)

    ## Read all human protein coding gene names.
    humProtCod = parse.parse2col("data/proteinCoding.tab", True, 1, 0)
    humanTargets = []
    for tstr in humProtCod.keys():
        humanTargets.append(tstr.split(";")[0])
    print "We are dealing with %s human proteins" % len(humanTargets)

    ## Generate a list of all targets that are to be fed into the getPfamDomain procedure.
    allTargets = getAllTargets.getAllTargets(humanTargets, chemblTargets)
    allTargets = allTargets.keys()

    ## Get the domains by parsing Pfam. This step takes long and therefore pickles out the domainDict.
    pfamDict = getPfamDomains.getDomains(allTargets, release)

    ## Export the PfamDict as a mysql table.
    export.exportPfamDict(chemblTargets, pfamDict, release, user, pword, host, port)
예제 #2
0
def master(version):
    """
    Function:  master
    Run through all steps to identify mandatory muli-domain architectures.
    """
    ## Load the pfamDict.
    infile = open('data/protCodPfamDict_%s.pkl' %RELEASE, 'r')
    pfam_d = pickle.load(infile)
    infile.close()
    # Load the list of validated domains.
    valid_dom_d = readfile('data/valid_pfam_v_%(version)s.tab' % locals(), 'pfam_a', 'pfam_a')
    del valid_dom_d['Pkinase_Tyr']
    valid_doms = valid_dom_d.keys()
    ## Load Uniprot targets.
    chembl_targets = getUniprotTargets.getUniprotTargets(RELEASE, USER, PWORD, HOST, PORT)
    ## Load eligible multi-domain targets.
    el_targets = get_el_targets(RELEASE, USER, PWORD, HOST, PORT)
    ## Add targets with given architecture.
    (arch_lkp, dom_lkp, act_lkp) = get_multi_doms(el_targets, pfam_d)
    ##  Write multi-domain architechtures to markdown tables.
    export_archs(arch_lkp, valid_doms, 'data/multi_dom_archs_%s'% RELEASE)  
    ## Write domains from multi-domain architechtures to markdown tables.
    export_doms(dom_lkp, valid_doms, 'data/multi_dom_doms_%s'% RELEASE)
    ## export network file.
    export_network(arch_lkp, valid_doms, 'data/multi_dom_network_%s'% RELEASE)
    ## export network attribute file.
    export_attribs(arch_lkp, valid_doms, 'data/multi_dom_attributes_%s'% RELEASE)
예제 #3
0
def master():
    """
    Function:  master
    --------------------    
    [email protected]
    """
    ## Load the pdb_d.
    infile = open('data/pdbDict_%s.pkl' % release, 'r')
    pdb_d = pickle.load(infile)
    infile.close()
    ## Load the pfam_d.
    infile = open('data/protCodPfamDict_%s.pkl' % release, 'r')
    pfam_d = pickle.load(infile)
    infile.close()
    ## Load Uniprot targets.
    chembl_targets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)
    ## Convert pfam_d to long format.
    long_pfam_d = get_long_pfams(pfam_d)
    ## Identify architectures binding sm through multiple domains.
    arch_d = get_archs(pdb_d, long_pfam_d, min_res, min_ratio)
    ## Add targets with given architecture.
    arch_d = add_targets(chembl_targets, pfam_d, arch_d)
    ##  Write architechtures to markdown tables.
    export_archs(arch_d, 'data/interface_%s' % release)
예제 #4
0
def query(release, user, pword, host, port):

    import queryUniprot
    import getUniprotTargets

    ## Get all protein targets from ChEBML.
    chemblTargets = getUniprotTargets.getUniprotTargets(release, user, pword, host, port)

    ## Get Uniprot binding site annotation for each target.
    uniDict = queryUniprot.getBindingSites(chemblTargets, release)
    print "number of targets with binding site information", len(uniDict.keys())
예제 #5
0
def query(release, user, pword, host, port):
  
  import queryUniprot
  import getUniprotTargets


  ## Get all protein targets from ChEBML.                                       
  chemblTargets = getUniprotTargets.getUniprotTargets(release, user, pword, host, port)

  ## Get Uniprot binding site annotation for each target.
  uniDict = queryUniprot.getBindingSites(chemblTargets, release)
  print 'number of targets with binding site information', len(uniDict.keys())
예제 #6
0
def mapPDs(th, release, user, pword, host, port): 

  ## Set the threshold.
  import numpy as np
  threshold = -np.log10(th*10**(-6))
  
  ## Get a list of all ChEMBL targets.
  import getUniprotTargets
  chemblTargets = getUniprotTargets.getUniprotTargets(release, user, pword, host, port)

  ## Load the pfamDict.
  import pickle
  infile = open('data/protCodPfamDict_%s.pkl' %release, 'r')
  pfamDict = pickle.load(inFile)
  infile.close()    

  ## Get ligands for targets with single domains.
  import singleDomain 
  single = singleDomain.singleDomains(pfamDict, chemblTargets, threshold, release, user, pword, host, port)

  ## Construct the propDict for targets with one domain. Manually remove targets (as decribed in Methods section Manual curation) listed in blacklist.tab and add domains that never occur alone listed in whitelist (Pkinase_Tyr). 
  import feedPropDict
  import parse
  blacklist = parse.col2list('data/blacklist.tab',1, False)  
  propDict = {}
  propDict = feedPropDict.dictionary(single, propDict, blacklist, 'single')
  propDict = feedPropDict.addLigs(propDict,'manual', 'data/whitelist.tab') 
 
  ## Extract a list of validated domains.
  valid = propDict.keys() 
 
  ## Identify targets with one binding site containing domain and at least one
  ## other domain.
  import multiDomain
  multi = multiDomain.multiDomain(pfamDict, chemblTargets, valid, threshold, release, user, pword, host, port)

  ## Insert data for multi domain proteins.
  import feedPropDict
  propDict = feedPropDict.dictionary(multi, propDict, blacklist, 'multi')

  ## Export the mapping to a mySQL table.
  import export
  import pickle
  outfile = open('data/propDict_%s.pkl' %release, 'w')
  pickle.dump(propDict, outfile)
  export.exportMapsMySQL(propDict, release, user, pword, host, port)
  export.exportConflsMySQL(conflicts, release ,user, pword, host, port)
예제 #7
0
파일: arch.py 프로젝트: fak/mapChEMBLPfam
def master():
    """
    Function:  master
    --------------------    
    [email protected]
    """
    ## Load the pdb_d.
    infile = open('data/pdbDict_%s.pkl' %release, 'r')
    pdb_d = pickle.load(infile)
    infile.close()  
    ## Load the pfam_d.
    infile = open('data/protCodPfamDict_%s.pkl' %release, 'r')
    pfam_d = pickle.load(infile)
    infile.close()
    ## Load Uniprot targets.
    chembl_targets = getUniprotTargets.getUniprotTargets(release, user, pword, host, port)
    ## Convert pfam_d to long format.
    long_pfam_d = get_long_pfams(pfam_d)
    ## Identify architectures binding sm through multiple domains.
    arch_d = get_archs(pdb_d, long_pfam_d, min_res, min_ratio)
    ## Add targets with given architecture.
    arch_d = add_targets(chembl_targets, pfam_d, arch_d)
    ##  Write architechtures to markdown tables.
    export_archs(arch_d, 'data/interface_%s'%release)
예제 #8
0
def analysis(th, release, user, pword, host, port):

  ####
  #### Load data.
  ####
  
  ## Set threshold for all calculations.
  import numpy as np
  threshold = -np.log10(th*10**(-6))


  ## Get all ChEMBL targets with a Uniprot accession.
  import getUniprotTargets
  chemblTargets = getUniprotTargets.getUniprotTargets(release, user, pword, host, port)
  
  ## Read all human protein coding genes
  import parse
  humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
  #humanTargets = humanProtCodUniq.keys()
  print "We are dealing with %s human proteins" %len(humProtCod.keys())

  ## Get a list of all human (!) ChEMBL targets
  humChembl = {}
  for target in chemblTargets:
    if target in humProtCod.keys():
      humChembl[target] = 0

  ## Load the pfamDict.
  import pickle
  inFile = open('data/protCodPfamDict_%s.pkl' %release, 'r')
  pfamDict = pickle.load(inFile)
  inFile.close() 

  ## Load the pdbDict.
  import pickle
  infile = open('data/pdbDict_chembl%s.pkl' %release, 'r')
  pdbDict = pickle.load(infile)
  infile.close()

  ## Load the uniprotDict.
  import pickle
  infile  = open('data/bsDictUniprot_chembl%s.pkl'%release, 'r')
  uniprotDict = pickle.load(infile)
  infile.close()
  print 'number of targets with binding site information', len(uniprotDict.keys())


  ## Load the uniDict.
  import parseUniChem
  uniDict = parseUniChem.parse('data/unichemMappings.txt')

  ## Load the propDict.
  import pickle
  infile = open('data/propDict_%s.pkl'% release, 'r')
  propDict = pickle.load(infile)
  infile.close()

  ####
  #### Generate Plots.
  ####

  ## For each target in PfamDict, calculate the ratio of domain over non-domain regions.
  import getRatioUnstruct
  import writeTable
  import os
  pfamDict = getRatioUnstruct.getRatio(pfamDict, humProtCod, release, user, pword, host, port)
  writeTable.writePfam(pfamDict, humProtCod,humChembl, chemblTargets, release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s plotPfamStat.R' %release) 


  ## Assess small molecule binding within Pfam domains for PDBe entries.
  import matchData
  import evaluatePred 
  pdbDict = matchData.pdbe(pdbDict,pfamDict, release)
  evaluatePred.pdbe(pdbDict, 'within', release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s  -%s -%s  ecdf.R' % ('within', "PDB" , release))

  
  ## Assess small molecule binding within Pfam domains for Uniprot entries.  
  import matchData
  import evaluatePred  
  uniprotDict = matchData.uniprot(uniprotDict,pfamDict,  release)
  evaluatePred.uniprot(uniprotDict, 'within', release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s -%s -%s  ecdf.R' % ('within', "Uni" , release))

  ## Print a summary of the number of targets and domains covered by the mapping. 
  import groupSize
  import os
  allDomains = groupSize.uniqueDomains(pfamDict)
  singleDomains = groupSize.singles(chemblTargets, pfamDict)
  groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singles)
  print "all possible groups (single, none, multi, conflict):",groupsAll
  (single, multi, conflict) = groupSize.groupSizeMap(chemblTargets, release, user , pword, host, port)
  print "all covered targets (single, multi, conflict): ", len(single), len(multi), len(conflict)
  (single, multi, conflict) = groupSize.actSizeMap(chemblTargets, release, user , pword, host, port)
  print "all covered targets (single, multi, conflict): ", len(single), len(multi),len(conflict)


  ## Plot the evaluation of the mappings.
  import queryDevice
  import matchData
  import evaluatePred
  import os

  intacts = queryDevice.queryDevice("SELECT mpf.protein_accession,mpf.domain,mpf.molregno, pfd.start, pfd.end, mpf.maptype, md.chembl_id FROM map_pfam mpf JOIN pfam_domains pfd ON pfd.protein_accession = mpf.protein_accession JOIN molecule_dictionary md ON md.molregno = mpf.molregno WHERE mpf.domain = pfd.domain", release, user, pword, host, port)

  # ...against PDBe  
  pdbDict = matchData.pdbePredicted(pdbDict,  intacts, uniDict)
  evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s -%s -%s  ecdf.R' % ('prediction', 'PDB' , release))
  # ...against uniprot
  uniprotDict = matchData.uniprotPredicted(uniprotDict,  intacts)
  evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
  os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s  -%s -%s  ecdf.R' % ('prediction', "Uni" , release))


  ## Map the overlap
  #import overlap
  #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
  #overlap.overlap(propDict, tholds, release)  


  ## Power Law Distribution of domain occurences
  ##  Prepare the data for the power law plot.
  ##  1. Count the targets and compounds per domain using the propDict
  ##  2. Count a human genes per domain using the Pfam dictionary
  ##  3. Plot the power law distributions for all domains and overlay 25 most 
  ##     frequent domains
  import countFreqs
  import plplot
  import plplotRaw
  import parse 
  countFreqs.countLigs(humProtCod.keys(), chemblTargets, release ,user, pword, host, port)
  countFreqs.countDoms(humProtCod.keys(), pfamDict)
  filenames = ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']

  for filename in filenames:
    os.system('/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s statPowerLaw.R' %filename)
    al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
    freqs = parse.col2intlist('data/%s'%filename, 1, True)
    print len(freqs), minx, al, filename, type(freqs), type(freqs[1])
    plplot.plplot(freqs, minx, al, filename)
    plplotRaw.plplotRaw(freqs, filename) 


  ## Plot the ligand properties.
  import export
  import os
  selected = ['Pkinase','Pkinase_Tyr','p450','SNF','Trypsin', 'RVP']
  export.exportProps(selected,propDict, threshold, release, user, pword, host, port) 

  filename = 'data/cmpdProps_pKi%s_chembl%s.tab'%(int(threshold), release)
  os.system("/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s pca.R"%filename)
예제 #9
0
def analysis(release):

    ####
    #### Load parameters.
    ####

    import yaml
    # Read config file.
    paramFile = open('mpf.yaml')
    params = yaml.safe_load(paramFile)
    user = params['user']
    pword = params['pword']
    host = params['host']
    port = params['port']
    th = params['threshold']

    ####
    #### Load data.
    ####

    ## Set threshold for all calculations.
    import numpy as np
    threshold = -np.log10(th * 10**(-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    import getUniprotTargets
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Get a list of all human (!) ChEMBL targets
    humChembl = {}
    for target in chemblTargets.keys():
        if chemblTargets[target] == 'H**o sapiens':
            humChembl[target] = 0

    ## Read all human protein coding genes
    import parse
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    #humanTargets = humanProtCodUniq.keys()
    print "We are dealing with %s human proteins" % len(humProtCod.keys())

    ## Load the pfamDict.
    import pickle
    inFile = open('data/protCodPfamDict_%s.pkl' % release, 'r')
    pfamDict = pickle.load(inFile)
    inFile.close()

    ## Load the pdbDict.
    import pickle
    infile = open('data/pdbDict_%s.pkl' % release, 'r')
    pdbDict = pickle.load(infile)
    infile.close()

    ## Load the uniprotDict.
    import pickle
    infile = open('data/bsDictUniprot_%s.pkl' % release, 'r')
    uniprotDict = pickle.load(infile)
    infile.close()
    print 'number of targets with binding site information', len(
        uniprotDict.keys())

    ## Load the uniDict.
    import parseUniChem
    uniDict = parseUniChem.parse('data/unichemMappings.txt')

    ## Load the propDict.
    import pickle
    infile = open('data/propDict_%s.pkl' % release, 'r')
    propDict = pickle.load(infile)
    infile.close()

    ####
    #### Generate Plots.
    ####

    ## For each target in PfamDict, calculate the ratio of domain over non-domain regions.
    import getRatioUnstruct
    import writeTable
    import os
    pfamDict = getRatioUnstruct.getRatio(pfamDict, humProtCod, release, user,
                                         pword, host, port)
    writeTable.writePfam(pfamDict, humProtCod, humChembl, chemblTargets,
                         release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s plotPfamStat.R'
        % release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    import matchData
    import evaluatePred
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s  -%s -%s  ecdf.R'
        % ('within', "PDB", release))

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    import matchData
    import evaluatePred
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s -%s -%s  ecdf.R'
        % ('within', "Uni", release))

    ## Print a summary of the number of targets and domains covered by the mapping.
    import groupSize
    import os
    allDomains = groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singles)
    print "all possible groups (single, none, multi, conflict):", groupsAll
    (single, multi, conflict) = groupSize.groupSizeMap(chemblTargets, release,
                                                       user, pword, host, port)
    print "all covered targets (single, multi, conflict): ", len(single), len(
        multi), len(conflict)
    (single, multi, conflict) = groupSize.actSizeMap(chemblTargets, release,
                                                     user, pword, host, port)
    print "all covered targets (single, multi, conflict): ", len(single), len(
        multi), len(conflict)

    ## Plot the evaluation of the mappings.
    import queryDevice
    import matchData
    import evaluatePred
    import os

    intacts = queryDevice.queryDevice(
        """SELECT mpf.protein_accession,
		mpf.domain,mpf.molregno, pfd.start, pfd.end, mpf.maptype,
	 	md.chembl_id FROM map_pfam mpf 
	JOIN pfam_domains pfd 
	  ON pfd.protein_accession = mpf.protein_accession 
	JOIN molecule_dictionary md 
	  ON md.molregno = mpf.molregno 
	WHERE mpf.domain = pfd.domain""", release, user, pword, host, port)

    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s -%s -%s  ecdf.R'
        % ('prediction', 'PDB', release))
    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    os.system(
        '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s  -%s -%s  ecdf.R'
        % ('prediction', "Uni", release))

    ## Map the overlap
    #import overlap
    #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
    #overlap.overlap(propDict, tholds, release)

    ## Power Law Distribution of domain occurences
    ##  Prepare the data for the power law plot.
    ##  1. Count the targets and compounds per domain using the propDict
    ##  2. Count a human genes per domain using the Pfam dictionary
    ##  3. Plot the power law distributions for all domains and overlay 25 most
    ##     frequent domains
    import countFreqs
    import plplot
    import plplotRaw
    import parse
    countFreqs.countLigs(humProtCod.keys(), chemblTargets, release, user,
                         pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)
    filenames = ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']

    for filename in filenames:
        os.system(
            '/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s statPowerLaw.R'
            % filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print len(freqs), minx, al, filename, type(freqs), type(freqs[1])
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    import export
    import os
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(selected, propDict, threshold, release, user, pword,
                       host, port)

    filename = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
    os.system(
        "/ebi/research/software/Linux_x86_64/bin/R-2.11.0 CMD BATCH --vanilla -%s pca.R"
        % filename)