예제 #1
0
파일: ppi_parser.py 프로젝트: Ravasz/ed
def name_collector():
  """Print one name per fetched FASTA record to STDOUT.

  The IDs come from intact_parser() (the PTPN22 interactors, according to the
  original author's note) and are passed to prot_entrez_fetch, which is
  iterated as a collection of FASTA records. For each record the text after
  the last "|" of its first line is printed.
  """
  from root.ed.tools import prot_entrez_fetch
  interactorIds = intact_parser()
  for fastaRecord in prot_entrez_fetch(interactorIds, retM="text", retT="fasta"):
    headerLine = fastaRecord.partition("\n")[0]  # first line of the record
    # the last "|"-separated field of the header is taken as the name
    print(headerLine.rpartition("|")[2])
예제 #2
0
파일: peptide_parser.py 프로젝트: Ravasz/ed
def main(queryS="Orc6"):
  """Highlight the detected peptides and phosphosites of one protein in an HTML file.

  Steps:
    1. collect the peptides (and an ID, uniId) for queryS from
       ../bob/peptides.txt via peptide_finder
    2. convert the gene symbol to protein IDs and fetch the protein sequence
    3. locate every peptide in the sequence and merge overlapping or adjacent
       peptides into intervals (pStartL / pEndL)
    4. read positions for uniId from ../datafiles/Phosphorylation_site_dataset
    5. wrap the intervals in <mark> tags and the phosphosite residues in red
       <strong> tags, then write the result with html_creator to
       "<queryS>.html"

  Args:
    queryS: search term to work on; it should be a protein name like
      "Ptpn22". Defaults to "Orc6".

  Raises:
    ValueError: if no sequence record is returned for queryS.
  """
  from root.ed.tools import html_creator, prot_id_converter, prot_entrez_fetch
  print("this is peptide parser")

  print("working on: " + queryS)
  with open("../bob/peptides.txt","r") as inpF:
    pepL, uniId = peptide_finder(targetFile=inpF, targetS = queryS) # find peptides from peptides.txt for the protein name
  print(pepL)
  print("peptides found")
  targetL = [queryS]
  # NOTE(review): "10090" is presumably the NCBI taxonomy ID (mouse) - confirm against prot_id_converter
  idList = prot_id_converter(targetL, "10090", inpDB = "genesymbol",outDB="refseqproteingi")
  seqL = prot_entrez_fetch(idList, retM="gb", retT="fasta")
  seqS = None
  for seqItem in seqL:
    # NOTE(review): only the second line of each record is used; this assumes the
    # fetched record holds the whole sequence on one line - confirm.
    # If several records come back, the last one wins.
    seqS = seqItem.split("\n")[1]
    print(seqS)
  if seqS is None: # nothing was fetched: fail with a clear message instead of a NameError further down
    raise ValueError("no protein sequence was returned for " + queryS)
  print("protein sequence found")
  annotS = seqS # strings are immutable, so no copy is needed
  pStartL = []
  pEndL = []
  for pepItem in pepL: # locate peptides in full protein sequence and store positions for starts and ends. merge overlapping peptides.
    pepStart = seqS.find(pepItem)
    if pepStart == -1: # peptide does not occur in the fetched sequence: report it and carry on with the rest
      print("peptide not found in the protein sequence, skipping: " + pepItem)
      continue
    pepEnd = pepStart + len(pepItem)

    startCount = 0 # handle starts
    for startItem in pStartL:
      if startItem <= pepStart:
        startCount += 1
    endCount = 0
    for endItem in pEndL:
      if endItem <= pepStart:
        endCount += 1
    # equal counts mean every interval that began at or before pepStart has also
    # been closed by then, i.e. pepStart is not inside an existing interval
    if startCount == endCount and pepStart not in pEndL: # start new peptide
      pStartL.append(pepStart)
    elif startCount == endCount and pepStart in pEndL: # start a new peptide at the end of another peptide
      pEndL.remove(pepStart) # drop that end so the two intervals fuse into one

    overlapCount = 0
    for startItem in pStartL[:]: # handle ends; iterate over a copy because items are removed
      if pepStart<startItem<=pepEnd:
        pStartL.remove(startItem) # remove extra starts
        overlapCount += 1

    for endItem in pEndL[:]:
      if pepStart<=endItem<=pepEnd: # remove extra ends
        pEndL.remove(endItem)
        overlapCount -= 1

    if pepEnd not in pEndL and overlapCount <= 0: # add end
      curStart = 500000 # sentinel, expected to be larger than any position in the sequence
      for pSI in pStartL:
        if curStart > pSI > pepEnd:
          curStart = pSI
      curEnd = 500000
      for pEI in pEndL:
        if curEnd > pEI > pepEnd:
          curEnd = pEI
      if curStart <= curEnd:  # check if next tag is start or end. if start, add end. if end, do nothing
        pEndL.append(pepEnd)

  print(uniId)
  phL = []
  with open("../datafiles/Phosphorylation_site_dataset") as phInp: # now for the phosphosite data
    for phLine in phInp:
      phList = phLine.split("\t")
      try:
        if uniId == phList[1]:
          # drop the first and the last two characters of the 5th column and
          # shift the remaining number to a 0-based index
          phL.append(int(phList[4][1:-2]) - 1)
      except IndexError: # line with too few columns
        continue
  print(phL)

  fullL = pStartL + pEndL
  for phItem in phL:
    if phItem not in fullL: fullL.append(phItem)
  fullL.sort()

  offsetN = 0 # number of markup characters inserted so far; shifts later positions
  for posI in fullL: # from the resulting intervals, create emphasis in html file, mark phosphosites in red
    if posI in pStartL:
      annotS = annotS[:posI+offsetN] + "<mark>" + annotS[posI+offsetN:]
      offsetN += 6 # len("<mark>")
    elif posI in pEndL:
      annotS = annotS[:posI+offsetN] + "</mark>" + annotS[posI+offsetN:]
      offsetN += 7 # len("</mark>")
    if posI in phL:
      annotS = annotS[:posI+offsetN] + r"""<strong style="color: red;">""" + annotS[posI+offsetN] + r"""</strong>""" + annotS[posI+offsetN + 1 :]
      offsetN += 37 # combined length of the opening and closing <strong> tags

  print(annotS)
  html_creator(queryS + " peptides", annotS, queryS + ".html")
  print("found peptides marked in the file: " + queryS + ".html")
예제 #3
0
파일: ppi_parser.py 프로젝트: Ravasz/ed
def sh3_counter():
  """Count SH3 domains in the GenBank entries of a list of proteins.

  The IDs returned by protein_name_collector() are converted with
  prot_id_converter, and the matching GenBank flat files are fetched with
  prot_entrez_fetch. Lists longer than 200 IDs are processed in batches of
  200. The totals are printed to STDOUT.
  """
  from root.ed.tools import prot_entrez_fetch, prot_id_converter
  from root.ed.bobscripts.bobdata_parser import protein_name_collector
  batchSize = 200
  fullPreyL = protein_name_collector()
  print(fullPreyL)
  if len(fullPreyL) > batchSize: # chop up very large lists of IDs to batches of 200
    preyAllL = list(fullPreyL)
    maxBatch = (len(preyAllL) + batchSize - 1) // batchSize # ceiling division: exact number of batches
    seqL = []
    for batchCount, startI in enumerate(range(0, len(preyAllL), batchSize), 1):
      preyL = preyAllL[startI:startI + batchSize]
      print("processing batch number %d of Uniprot IDs..." % (batchCount, ))
      idList = prot_id_converter(preyL, "10090", outDB="refseqproteingi")
      # fetch the complete genbank entries from entrez using this function from tools.py
      seqL = seqL + prot_entrez_fetch(idList, retM="gb", retT="text").split("\n")
      print("this was batch number %d of %d" % (batchCount, maxBatch))
      print("")
  else:
    idList = prot_id_converter(fullPreyL, "10090", outDB="refseqproteingi")
    # fetch the complete genbank entries from entrez using this function from tools.py
    seqL = prot_entrez_fetch(idList, retM="gb", retT="text").split("\n")

  sHCount, regionCount, protCount, shProtCount = _count_sh3_regions(seqL)
  print("%d SH3 domains found in %d domains of %d proteins" % (sHCount, regionCount, protCount))
  print("%d proteins out of %d contain SH3 domains" % (shProtCount, protCount))


def _count_sh3_regions(flatLines):
  """Scan GenBank flat-file lines and count Region features that mention SH3.

  Args:
    flatLines: iterable of text lines from one or more flat-file entries.

  Returns:
    A tuple (sHCount, regionCount, protCount, shProtCount): the number of
    Region features mentioning "SH3", of all Region features, of LOCUS lines
    (one per entry), and of entries with at least one SH3 Region.
  """
  regionFlag = False # True while inside the continuation lines of a Region feature
  sHFlag = True # True until the current Region has been counted as SH3
  newProtFlag = False # True until the current entry has been counted as an SH3 protein
  regionCount = 0
  sHCount = 0
  protCount = 0
  shProtCount = 0
  for flatLine in flatLines:
    if flatLine[:10] == "LOCUS     ": # this is usually the title of an entrez flatfile and contains the protein name
      protCount += 1
      newProtFlag = True
    if flatLine[:11] == "     Region": # regions like this mark domains in the flatfile. Look here for SH3 domains
      regionCount += 1
      regionFlag = True
      # Region features can follow each other directly, so re-arm the flag here;
      # otherwise an SH3 Region right after another SH3 Region is missed
      sHFlag = True
      continue
    if regionFlag:
      if flatLine[:11] == "           ": # still a continuation line of the current feature
        if "SH3" in flatLine and sHFlag:
          sHCount += 1
          sHFlag = False # count each Region once, however many of its lines mention SH3
          if newProtFlag:
            shProtCount += 1
            newProtFlag = False
      else:
        regionFlag = False
        sHFlag = True
  return sHCount, regionCount, protCount, shProtCount