예제 #1
0
파일: bobdata_parser.py 프로젝트: Ravasz/ed
def kegg_converter():
  """Print the KEGG gene ID conversion for the proteins listed in the 24h CSV table.

  Reads ../bob/processed/24h_bobprots_up_full.csv, drops the header row and
  takes the second comma-separated field of every remaining row (the
  accession column, presumably UniProt accessions - verify against the file).
  The collected values are passed to prot_id_converter with
  outDB="kegggeneid" and whatever it returns is printed to STDOUT.

  Returns:
    None.
  """
  from root.ed.tools import prot_id_converter

  with open("../bob/processed/24h_bobprots_up_full.csv","r") as csvFile:
    next(csvFile, None) # discard the header row (no-op on an empty file)
    accessions = [row.split(",")[1] for row in csvFile]

  print(prot_id_converter(accessions, outDB="kegggeneid"))
예제 #2
0
파일: bobdata_parser.py 프로젝트: Ravasz/ed
def interactor_finder():
  """Report which proteins from the known-interactor list occur in Bob's dataset.

  Steps:
    1. Read ../datafiles/known_interactors.txt, take the first tab-separated
       field of each non-empty line and normalise its case
       (first character kept, the rest lower-cased).
    2. Convert these names with prot_id_converter (presumably mouse gene
       symbols to UniProt accessions - confirm the positional arguments in
       tools.py) and print the conversion result.
    3. Load ../bob/processed/bobprots_all.csv into a mapping of
       first column -> second column.
    4. For each converted entry print the mapped value of the first
       identifier that is present in the mapping.

  Returns:
    None. Everything is printed to STDOUT.
  """
  from root.ed.tools import prot_id_converter

  # gene names from the hand-made text file with known ptp22 interactors
  geneNames = []
  with open("../datafiles/known_interactors.txt","r") as nameFile:
    for rawLine in nameFile:
      if rawLine == "\n":
        continue
      firstField = rawLine.strip().split("\t")[0]
      geneNames.append(firstField[0] + firstField[1:].lower())

  convertedIds = prot_id_converter(geneNames, "10090", "genesymbol", "uniprotaccession")
  print(convertedIds)

  # first column -> second column for every row of Bob's dataset
  datasetMap = {}
  with open("../bob/processed/bobprots_all.csv","r") as datasetFile:
    for row in datasetFile:
      fields = row.split(",")
      datasetMap[fields[0]] = fields[1].strip()

  # each converted entry is itself iterable; only its first hit is reported
  for idGroup in convertedIds:
    firstHit = next((acc for acc in idGroup if acc in datasetMap), None)
    if firstHit is not None:
      print(datasetMap[firstHit])
예제 #3
0
파일: ppi_parser.py 프로젝트: Ravasz/ed
def intact_parser():
  """Collect the interactor identifiers from ptpn22.txt and convert them.

  Opens ptpn22_ppi_data/ptpn22.txt through file_importer, skips the header
  line and, for every following tab-separated row, takes the text after the
  last ":" in the second and then the first column. Each identifier is kept
  once, in order of first appearance. The unique identifiers are passed to
  prot_id_converter with outDB="refseqproteingi" (presumably UniProt
  accessions in, RefSeq protein GI numbers out - confirm in tools.py).

  Returns:
    Whatever prot_id_converter returns for the collected identifiers.

  Raises:
    IndexError: if a data row has fewer than two tab-separated columns.
  """
  from root.ed.tools import prot_id_converter, file_importer

  relPath = "ptpn22_ppi_data/ptpn22.txt"
  inpF = file_importer(relPath, "r")
  preyL = []
  seenIds = set() # O(1) duplicate check; preyL keeps the first-seen order
  try:
    for lineNum, inpLine in enumerate(inpF):
      if lineNum == 0: # header row
        continue
      inpList = inpLine.split("\t")
      for colI in (1, 0): # second column first, exactly as before
        inpItem = inpList[colI].split(":")[-1]
        if inpItem not in seenIds:
          seenIds.add(inpItem)
          preyL.append(inpItem)
  finally:
    inpF.close() # release the handle even when a malformed row raises

  return prot_id_converter(preyL, "", outDB="refseqproteingi")
예제 #4
0
파일: peptide_parser.py 프로젝트: Ravasz/ed
def main():
  """Mark the detected peptides and the listed phosphosites of one protein in an HTML file.

  For the hard-coded query name the function
    1. collects the peptides (and an identifier) with peptide_finder() from ../bob/peptides.txt,
    2. obtains a protein sequence through prot_id_converter() and prot_entrez_fetch(),
    3. turns the peptide positions into merged lists of start and end positions,
    4. reads matching positions from ../datafiles/Phosphorylation_site_dataset,
    5. inserts <mark>...</mark> around covered stretches and a red <strong> tag around
       each phosphosite residue, then passes the result to html_creator().

  Takes no arguments and returns None; progress and results are printed to STDOUT.
  """
  from root.ed.tools import html_creator, prot_id_converter, prot_entrez_fetch
  from copy import deepcopy
  print "this is peptide parser"
  
  
  queryS = "Orc6" # this is the search term that will be worked on. It should be a protein name like "Ptpn22"
  
  
  print "working on: " + queryS
  with open("../bob/peptides.txt","r") as inpF: 
    pepL, uniId = peptide_finder(targetFile=inpF, targetS = queryS) # find peptides from peptides.txt for the protein name
  print pepL
  print "peptides found"
  targetL = [queryS]
  # presumably maps the gene symbol to RefSeq protein GI numbers, with "10090" selecting the organism - confirm in tools.py
  idList = prot_id_converter(targetL, "10090", inpDB = "genesymbol",outDB="refseqproteingi")
  seqL = prot_entrez_fetch(idList, retM="gb", retT="fasta")
  # keep the second line of every fetched entry; after the loop seqS holds the value of the LAST entry
  # NOTE(review): seqS is never bound when seqL is empty, so the deepcopy below would raise NameError - confirm
  # NOTE(review): if an entry spreads its sequence over several lines only the first one is kept - confirm the format returned by prot_entrez_fetch
  for seqItem in seqL:
    seqS = seqItem.split("\n")[1]
    print seqS
  print "protein sequence found"
  annotS = deepcopy(seqS) # string that receives the HTML tags; seqS itself is left unchanged for the position lookups
  pStartL = [] # positions in seqS where a <mark> tag will be opened
  pEndL = [] # positions in seqS where a </mark> tag will be placed
  for pepItem in pepL: # locate peptides in full protein sequence and store positions for starts and ends. merge overlapping peptides.
    pepStart = seqS.index(pepItem) # first occurrence only; raises ValueError if the peptide is not part of seqS
    pepEnd = pepStart + len(pepItem) # index just past the last residue of the peptide
    
    startCount = 0 # handle starts
    # count the recorded starts and ends that lie at or before this peptide's start
    for startItem in pStartL:
      if startItem <= pepStart:
        startCount += 1
    endCount = 0
    for endItem in pEndL:
      if endItem <= pepStart:
        endCount += 1
    # equal counts: every interval opened up to this position has also been closed
    if startCount == endCount and pepStart not in pEndL: # start new peptide
      pStartL.append(pepStart)
    elif startCount == endCount and pepStart in pEndL: # start a new peptide at the end of another peptide
      pEndL.remove(pepStart) # dropping that end joins the two adjacent stretches
    
    overlapCount = 0 # removed starts minus removed ends inside this peptide
    for startItem in pStartL[:]: # handle ends
      if pepStart<startItem<=pepEnd:
        pStartL.remove(startItem) # remove extra starts
        overlapCount += 1
        
    for endItem in pEndL[:]: # iterate over a copy because the list is modified inside the loop
      if pepStart<=endItem<=pepEnd: # remove extra ends
        pEndL.remove(endItem)
        overlapCount -= 1
    
    if pepEnd not in pEndL and overlapCount <= 0: # add end
      curStart = 500000 # sentinel: stays 500000 when no start lies beyond pepEnd
      for pSI in pStartL:
        if curStart > pSI > pepEnd:
          curStart = pSI # closest recorded start after pepEnd
      curEnd = 500000 # sentinel: stays 500000 when no end lies beyond pepEnd
      for pEI in pEndL:
        if curEnd > pEI > pepEnd:
          curEnd = pEI # closest recorded end after pepEnd
      if curStart <= curEnd:  # check if next tag is start or end. if start, add end. if end, do nothing
        pEndL.append(pepEnd)
     
  print uniId
  phL = [] # zero-based positions read from the phosphosite file
  with open("../datafiles/Phosphorylation_site_dataset") as phInp: # now for the phosphosite data
    for phLine in phInp:
      phList = phLine.split("\t")
      try:
        if uniId == phList[1]: 
          # drop the first character and the last two of field 4, then convert to a zero-based index
          # presumably the field looks like "S123-p" - TODO confirm; a non-numeric remainder raises an uncaught ValueError
          phL.append(int(phList[4][1:-2]) - 1)
      except IndexError: # line has too few tab-separated fields
        continue
  print phL
  
  # all positions where something has to be inserted, each listed once, in ascending order
  fullL = pStartL + pEndL
  for phItem in phL:
    if phItem not in fullL: fullL.append(phItem)
  fullL.sort()
  
  offsetN = 0 # number of characters inserted into annotS so far; shifts positions of seqS to positions of annotS
  for posI in fullL: # from the resulting intervals, create emphasis in html file, mark phosphosites in red
    if posI in pStartL:
      annotS = annotS[:posI+offsetN] + "<mark>" + annotS[posI+offsetN:]
      offsetN += 6 # len("<mark>")
    elif posI in pEndL:
      annotS = annotS[:posI+offsetN] + "</mark>" + annotS[posI+offsetN:]
      offsetN += 7 # len("</mark>")
    if posI in phL:
      # wrap the single residue at this position; a position at the very end of the sequence would raise IndexError here
      annotS = annotS[:posI+offsetN] + r"""<strong style="color: red;">""" + annotS[posI+offsetN] + r"""</strong>""" + annotS[posI+offsetN + 1 :]
      offsetN += 37 # 28 characters of the opening tag + 9 of the closing tag
        
  print annotS
  html_creator(queryS + " peptides", annotS, queryS + ".html")
  print "found peptides marked in the file: ",
  print queryS + ".html"  
예제 #5
0
파일: ppi_parser.py 프로젝트: Ravasz/ed
def sh3_counter():
  """Count the SH3 domains annotated in the GenBank entries of a protein list.

  The identifiers come from protein_name_collector() (presumably UniProt IDs,
  as the original comments state - confirm in bobdata_parser.py;
  intact_parser() can be used instead for the Ptpn22 interactome). They are
  converted with prot_id_converter(..., "10090", outDB="refseqproteingi") and
  the GenBank flat-file text is downloaded with prot_entrez_fetch(). Lists
  with more than 200 identifiers are processed in batches of 200.

  Two summary lines are printed to STDOUT: the number of SH3 regions among
  all regions and proteins, and the number of proteins with at least one SH3
  region.

  Returns:
    None.
  """
  from root.ed.tools import prot_entrez_fetch, prot_id_converter
  from root.ed.bobscripts.bobdata_parser import protein_name_collector

  batchSize = 200
  fullPreyL = protein_name_collector()
  print(fullPreyL)

  if len(fullPreyL) > batchSize: # chop up very large lists into batches of batchSize
    preyItems = list(fullPreyL) # slicing needs a sequence
    # ceiling division, so an exact multiple of batchSize is not over-counted
    maxBatch = (len(preyItems) + batchSize - 1) // batchSize
    seqL = []
    for batchCount, startI in enumerate(range(0, len(preyItems), batchSize), 1):
      print("processing batch number %d of Uniprot IDs..." % (batchCount, ))
      preyL = preyItems[startI:startI + batchSize]
      idList = prot_id_converter(preyL, "10090", outDB="refseqproteingi")
      # fetch the complete genbank entries from entrez using this function from tools.py
      seqL.extend(prot_entrez_fetch(idList, retM="gb", retT="text").split("\n"))
      print("this was batch number %d of %d" % (batchCount, maxBatch))
      print("")
  else:
    idList = prot_id_converter(fullPreyL, "10090", outDB="refseqproteingi")
    # fetch the complete genbank entries from entrez using this function from tools.py
    seqL = prot_entrez_fetch(idList, retM="gb", retT="text").split("\n")

  sHCount, regionCount, protCount, shProtCount = _count_sh3_domains(seqL)
  print("%d SH3 domains found in %d domains of %d proteins" % (sHCount, regionCount, protCount))
  print("%d proteins out of %d contain SH3 domains" % (shProtCount, protCount))


def _count_sh3_domains(flatLines):
  """Tally proteins, Region features and SH3 regions in GenBank flat-file lines.

  Args:
    flatLines: iterable of text lines (without trailing newlines).

  Returns:
    Tuple (sHCount, regionCount, protCount, shProtCount):
      sHCount      - Region features with "SH3" in one of their continuation lines,
      regionCount  - all Region features,
      protCount    - lines starting with "LOCUS     ",
      shProtCount  - LOCUS entries owning at least one counted SH3 region.
  """
  regionFlag = False # True while inside the continuation lines of a Region feature
  sHFlag = True # True until the current region has been counted as SH3
  newProtFlag = False # True until the current protein has been counted as SH3 protein;
                      # initialised here so a region before any LOCUS line cannot raise
  regionCount = 0
  sHCount = 0
  protCount = 0
  shProtCount = 0
  for flatLine in flatLines:
    if flatLine[:10]  == "LOCUS     ": # this is usually the title of an entrez flatfile and contains the protein name
      protCount += 1
      newProtFlag = True
    if flatLine[:11] == "     Region": # regions like this mark domains in the flatfile. Look here for SH3 domains
      regionCount += 1
      regionFlag = True
      sHFlag = True # a Region directly following another one starts a fresh count
      continue
    if regionFlag:
      if flatLine[:11] == "           ": # continuation line of the current feature
        if "SH3" in flatLine and sHFlag:
          sHCount += 1
          sHFlag = False # count each region once, however often SH3 is mentioned
          if newProtFlag:
            shProtCount += 1
            newProtFlag = False
      else: # any other line ends the region
        regionFlag = False
        sHFlag = True
  return sHCount, regionCount, protCount, shProtCount