Example #1
def stat_parser():
  """take protein names with a significant p value and out them to a result file"""
  from root.ed.tools import file_importer, file_outporter
  from math import log
  
  print "this is stat parser"
  
  relPath = "bob/processed/24h_bobdata_ed2.csv"
  outPathUp = "bob/processed/24h_bobprots_up_full.csv"
  outPathDown = "bob/processed/24h_bobprots_down_full.csv"
  inpF = file_importer(relPath)
  outFUp = file_outporter(outPathUp)
  outFDown = file_outporter(outPathDown)
  
  
  skipFlag = True
  
  for inpLine in inpF:
    if skipFlag:
      skipFlag = False
      outFDown.write("ID,Uniprot ID,Gene name,unique peptides (unique+razor),KO1,KO2,KO3,WT1,WT2,WT3,enrichment,P value\n")
      outFUp.write("ID,Uniprot ID,Gene name,unique peptides (unique+razor),KO1,KO2,KO3,WT1,WT2,WT3,enrichment,P value\n")
      continue
    inpLine = inpLine.split("\" \"") # fields are double-quoted and separated by a single space, i.e. "A" "B" "C"
    curLine = []
    for inpI in inpLine:
      curLine.append(inpI.strip("\"\n")) # drop the surrounding quotes and any trailing newline
    try:
      curLine[-1] = float(curLine[-1])
    except ValueError:
      curLine[-1] = 1 # non-numeric p value: set it to 1 so the row is treated as not significant
    if curLine[-1] < 0.05 and int(curLine[3]) > 1: # check if protein has at least 2 unique peptides and has a significant p value
      curLine[4:10] = [int(x) for x in curLine[4:10]]
      enrScore = log((sum(curLine[4:7]) / 3.0)/(sum(curLine[7:10]) / 3.0),2) # calculate log2 enrichment score
      # print int(sum(curLine[4:7]) / 3.0), int(sum(curLine[7:10]) / 3.0)
      if sum(curLine[4:7]) / 3.0 > sum(curLine[7:10]) / 3.0: # if the mean of the KO intensities is higher than the wt  
        for outN, outI in enumerate(curLine): # write the row out, inserting the enrichment score before the p value
          outFDown.write(str(outI).strip(" "))
          if outN != len(curLine) - 1: # compare positions, not object identity, so repeated intensity values cannot confuse the check
            outFDown.write(",")
            if outN == len(curLine) - 2:
              outFDown.write(str(enrScore) + ",")
          else:
            outFDown.write("\n")
        # outFDown.write(curLine[1] + "," + curLine[2] + "\n")
      else:
        # outFUp.write(curLine[1] + "," + curLine[2] + "\n")
        for outN, outI in enumerate(curLine):
          outFUp.write(str(outI).strip(" "))
          if outN != len(curLine) - 1: # same positional check as above
            outFUp.write(",")
            if outN == len(curLine) - 2:
              outFUp.write(str(enrScore) + ",")
          else:
            outFUp.write("\n")
  
  inpF.close()
  outFUp.close()
  outFDown.close()
  print "stat_parser completed"
Example #2
def file_parser():
  """from bob"s proteinGroups.txt take: Majority protein IDs Peptide counts (razor+unique) ['LFQ intensity KO1', 'LFQ intensity KO2', 'LFQ intensity KO3', 'LFQ intensity WT1', 'LFQ intensity WT2', 'LFQ intensity WT3']
  and write them to a new file. do not select contaminants or reverse peptides"""

  from root.ed.tools import file_importer, file_outporter
  print "this is file parser"
  inpF = file_importer("bob/24h_proteingroups.csv")
  outF = file_outporter("bob/processed/24h_bobdata.csv")
  for inpLine in inpF:
    inpP = inpLine.split("\r") # rows inside the chunk are separated by carriage returns
    cN = 0
    print len(inpP)
    for inpI in inpP:
      inpItems = inpI.split("\t") 
      if inpItems[100] == "+" or inpItems[101] == "+": continue # get rid of contaminants and reverse proteins
      outF.write(str(cN) + "," + inpItems[1] + "," + inpItems[6] + "," + inpItems[3] + "," + inpItems[86] + "," + inpItems[87] + "," + inpItems[88] + "," + inpItems[89] + "," + inpItems[90] + "," + inpItems[91] + "\n")
      # print inpItems [1],"+++", inpItems [3],"+++", inpItems [6],"+++", inpItems[86:92]
      cN += 1
      # if cN == 40: break

    break # only the first "\n"-delimited chunk is processed

  inpF.close()
  outF.close()
  print cN
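file_parser depends on hard-coded column positions (1, 3, 6, 86-91, 100, 101) that only match one particular proteinGroups.txt export. As a hedged alternative, the sketch below looks the columns up by header name instead; the header strings are assumptions about the export and would need to be checked against the real file.

def find_columns(header_line):
  """Map column names to indices in a tab-separated proteinGroups.txt header.
  The names below are assumptions about the export and may need adjusting."""
  names = header_line.rstrip("\r\n").split("\t")
  wanted = ["Majority protein IDs", "Peptide counts (razor+unique)", "Gene names",
            "LFQ intensity KO1", "LFQ intensity KO2", "LFQ intensity KO3",
            "LFQ intensity WT1", "LFQ intensity WT2", "LFQ intensity WT3",
            "Potential contaminant", "Reverse"]
  return dict((name, names.index(name)) for name in wanted if name in names)

With such a mapping, an index like inpItems[86] could be replaced by a lookup such as inpItems[cols["LFQ intensity KO1"]].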
Example #3
def entry_parser():
  """remove duplicate protein name and total peptide count cell entries from bob's dataset"""
  from root.ed.tools import file_importer, file_outporter
  
  print "this is entry parser"
  
  relPath = "bob/processed/24h_bobdata.csv"
  outPath = "bob/processed/24h_bobdata_ed.csv"
  inpF = file_importer(relPath)
  outF = file_outporter(outPath)
  cN = 0
  hitN = 0
  for inpLine in inpF:
    cN += 1
    inpLine = inpLine.strip()
    inpItem = inpLine.split(",")
    geneL = inpItem[1].split(";") # semicolon-separated list of IDs in the majority protein IDs cell
    lenS = len(geneL[0])
    curGene = geneL[0]
    for geneI in geneL: # keep the shortest ID in the list
      if len(geneI) < lenS:
        lenS = len(geneI)
        curGene = geneI
    if "__" in curGene: continue # skip IDs that still contain "__"
    protL = inpItem[3].split(";")
    curProt = protL[geneL.index(curGene)] # take the peptide count entry matching the chosen ID
    if curGene[-2] == "-": # strip a one-digit isoform suffix such as "-2"
      curGene = curGene[:-2]
    if curGene[-3] == "-": # strip a two-digit isoform suffix such as "-12"
      curGene = curGene[:-3]
    outF.write(inpItem[0] + "," + curGene + "," + inpItem[2] + "," + curProt + "," + inpItem[4] + "," + inpItem[5] + "," + inpItem[6] + "," + inpItem[7]  + "," + inpItem[8] + "," + inpItem[9] + "\n")
    hitN += 1
  print cN, hitN
  inpF.close()
  outF.close()
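To make the deduplication concrete, this small standalone sketch applies the same shortest-ID selection and isoform-suffix stripping that entry_parser does to one row; the sample cell values are hypothetical.

idField = "P12345-2;P12345" # hypothetical semicolon-separated ID cell (inpItem[1])
countField = "7;9"          # hypothetical matching peptide-count cell (inpItem[3])

idL = idField.split(";")
curId = min(idL, key=len) # shortest entry, same result as the loop in entry_parser
curCount = countField.split(";")[idL.index(curId)]

if curId[-2] == "-": # strip a one-digit isoform suffix such as "-2"
  curId = curId[:-2]
if curId[-3] == "-": # strip a two-digit isoform suffix such as "-12"
  curId = curId[:-3]

print(curId + "," + curCount) # P12345,9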
Example #4
def lfq_parser():
  """remove 0 values from lfq measurements and replace them with a random number between 1 and 100
  This is needed for ttseting later in R, as each measurement there has to have some sort of noise in it"""
  from root.ed.tools import file_importer, file_outporter
  from random import randint
  
  print "this is lfq parser"
  
  relPath = "bob/processed/24h_bobdata_ed.csv"
  outPath = "bob/processed/24h_bobdata_no0_ed.csv"
  inpF = file_importer(relPath)
  outF = file_outporter(outPath)  
  headerFlag = True
  for inpLine in inpF:
    if headerFlag: 
      headerFlag = False
      outF.write(inpLine)
      continue
    inpLine = inpLine.strip()
    inpItems = inpLine.split(",")
    try:
      int(inpItems[4]) # get rid of wonky erroneous lines introduced by excel
    except ValueError:
      print "bad line found here, ignored: ", inpItems
      continue
    for inpI in inpItems[0:4]: # copy over gene name and such to new file
      outF.write(inpI)
      outF.write(",")
    
    commaCount = 0
    for inpJ in inpItems[4:]: # copy over lfq values while replacing 0-s with random values
      commaCount += 1
      if int(inpJ) == 0:
        randNum = randint(1,100)
        outF.write(str(randNum))
      else:
        outF.write(inpJ)
      if commaCount < 6: # no comma after the last of the 6 LFQ columns
        outF.write(",")
    outF.write("\n")
  inpF.close()
  outF.close()
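All four functions rely on file_importer and file_outporter from root.ed.tools, which is not shown here; presumably they simply open a path relative to a project root for reading or writing. Judging by the input and output paths, the intended order is file_parser, then entry_parser, then lfq_parser, then stat_parser; note that stat_parser reads 24h_bobdata_ed2.csv, a file none of these functions produces (presumably it comes back from the R t-testing step mentioned in lfq_parser's docstring). A minimal sketch of the assumed helpers and pipeline order:

import os

ROOT_DIR = "." # hypothetical project root; the real root.ed.tools helpers may resolve paths differently

def file_importer(relPath):
  # assumed behaviour: open a file under the project root for reading
  return open(os.path.join(ROOT_DIR, relPath), "r")

def file_outporter(relPath):
  # assumed behaviour: open a file under the project root for writing
  return open(os.path.join(ROOT_DIR, relPath), "w")

# Assumed pipeline order, based on each step's input and output files:
# file_parser()   # bob/24h_proteingroups.csv         -> bob/processed/24h_bobdata.csv
# entry_parser()  # bob/processed/24h_bobdata.csv     -> bob/processed/24h_bobdata_ed.csv
# lfq_parser()    # bob/processed/24h_bobdata_ed.csv  -> bob/processed/24h_bobdata_no0_ed.csv
# stat_parser()   # bob/processed/24h_bobdata_ed2.csv -> bob/processed/24h_bobprots_up/down_full.csv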