def kegg_converter(): """process list of uniprot accessions for KEGG pathway analysis""" from root.ed.tools import prot_id_converter protList = [] headerFlag = True with open("../bob/processed/24h_bobprots_up_full.csv","r") as inpF: for inpLine in inpF: if headerFlag: headerFlag = False continue inpList = inpLine.split(",") protList.append(inpList[1]) print prot_id_converter(protList, outDB="kegggeneid")
def interactor_finder(): """take a list of protein names and check if they are in Bob's dataset""" from root.ed.tools import prot_id_converter proteinList = [] with open("../datafiles/known_interactors.txt","r") as inpProt: # create list of gene names from hand-made text file with known ptp22 interactors for protLine in inpProt: if protLine != "\n": curName = protLine.strip().split("\t")[0] curName = curName[0] + curName[1:].lower() proteinList.append(curName) inpIdL = prot_id_converter(proteinList, "10090", "genesymbol", "uniprotaccession") # convert to uniprot accessions print inpIdL with open("../bob/processed/bobprots_all.csv","r") as targetF: # create list of all uniprot accessions in Bob's dataset (unique razor proteins only) targetD = {} for targetLine in targetF: targetD[targetLine.split(",")[0]] = targetLine.split(",")[1].strip() for inpIdItem in inpIdL: for queryI in inpIdItem: if queryI in targetD: print targetD[queryI] break
def intact_parser():
    """open ptpn22.txt and extract prey protein uniprot accessions.
    Convert those to refseq protein accessions. Return them as a list."""
    from root.ed.tools import prot_id_converter, file_importer

    relPath = "ptpn22_ppi_data/ptpn22.txt"
    inpF = file_importer(relPath, "r")
    try:  # was a bare close() at the end; ensure the handle is released even if parsing raises
        headerFlag = True
        preyL = []
        for inpLine in inpF:
            if headerFlag:  # skip the single header line
                headerFlag = False
                continue
            inpList = inpLine.split("\t")
            # columns 0 and 1 hold interactor IDs like "uniprotkb:P29350"; keep only the accession
            # after the last colon.  Column 1 is inspected first, as in the original code, so the
            # order of the returned list is unchanged.
            for colI in (1, 0):
                inpItem = inpList[colI].split(":")[-1]
                if inpItem not in preyL:  # deduplicate while preserving first-seen order
                    preyL.append(inpItem)
    finally:
        inpF.close()
    idList = prot_id_converter(preyL, "", outDB="refseqproteingi")  # convert uniprot ID to refseq accessions
    return idList
def main(): from root.ed.tools import html_creator, prot_id_converter, prot_entrez_fetch from copy import deepcopy print "this is peptide parser" queryS = "Orc6" # this is the search term that will be worked on. It should be a protein name like "Ptpn22" print "working on: " + queryS with open("../bob/peptides.txt","r") as inpF: pepL, uniId = peptide_finder(targetFile=inpF, targetS = queryS) # find peptides from peptides.txt for the protein name print pepL print "peptides found" targetL = [queryS] idList = prot_id_converter(targetL, "10090", inpDB = "genesymbol",outDB="refseqproteingi") seqL = prot_entrez_fetch(idList, retM="gb", retT="fasta") for seqItem in seqL: seqS = seqItem.split("\n")[1] print seqS print "protein sequence found" annotS = deepcopy(seqS) pStartL = [] pEndL = [] for pepItem in pepL: # locate peptides in full protein sequence and store positions for starts and ends. merge overlapping peptides. pepStart = seqS.index(pepItem) pepEnd = pepStart + len(pepItem) startCount = 0 # handle starts for startItem in pStartL: if startItem <= pepStart: startCount += 1 endCount = 0 for endItem in pEndL: if endItem <= pepStart: endCount += 1 if startCount == endCount and pepStart not in pEndL: # start new peptide pStartL.append(pepStart) elif startCount == endCount and pepStart in pEndL: # start a new peptide at the end of another peptide pEndL.remove(pepStart) overlapCount = 0 for startItem in pStartL[:]: # handle ends if pepStart<startItem<=pepEnd: pStartL.remove(startItem) # remove extra starts overlapCount += 1 for endItem in pEndL[:]: if pepStart<=endItem<=pepEnd: # remove extra ends pEndL.remove(endItem) overlapCount -= 1 if pepEnd not in pEndL and overlapCount <= 0: # add end curStart = 500000 for pSI in pStartL: if curStart > pSI > pepEnd: curStart = pSI curEnd = 500000 for pEI in pEndL: if curEnd > pEI > pepEnd: curEnd = pEI if curStart <= curEnd: # check if next tag is start or end. if start, add end. 
if end, do nothing pEndL.append(pepEnd) print uniId phL = [] with open("../datafiles/Phosphorylation_site_dataset") as phInp: # now for the phosphosite data for phLine in phInp: phList = phLine.split("\t") try: if uniId == phList[1]: phL.append(int(phList[4][1:-2]) - 1) except IndexError: continue print phL fullL = pStartL + pEndL for phItem in phL: if phItem not in fullL: fullL.append(phItem) fullL.sort() offsetN = 0 for posI in fullL: # from the resulting intervals, create emphasis in html file, mark phosphosites in red if posI in pStartL: annotS = annotS[:posI+offsetN] + "<mark>" + annotS[posI+offsetN:] offsetN += 6 elif posI in pEndL: annotS = annotS[:posI+offsetN] + "</mark>" + annotS[posI+offsetN:] offsetN += 7 if posI in phL: annotS = annotS[:posI+offsetN] + r"""<strong style="color: red;">""" + annotS[posI+offsetN] + r"""</strong>""" + annotS[posI+offsetN + 1 :] offsetN += 37 print annotS html_creator(queryS + " peptides", annotS, queryS + ".html") print "found peptides marked in the file: ", print queryS + ".html"
def sh3_counter(): """look up a list of uniprot IDs, download their full genbank entries from the Entrez database and count the number of SH3 domains the interactors have. Print the results to STDout""" from root.ed.tools import prot_entrez_fetch, prot_id_converter from root.ed.bobscripts.bobdata_parser import protein_name_collector # idList = intact_parser() # to use for Ptpn22 interactome fullPreyL = protein_name_collector() # fullPreyL = ["P20152", "Q8BFZ3", "P17182", "P17742", "P11499"] print fullPreyL if len(fullPreyL) > 200: # chop up very large lists of uniprot IDs to batches of 100 maxBatch = (len(fullPreyL)/200) + 1 lenCount = 0 idList = [] seqL = [] batchCount = 0 preyL = [] for listItem in fullPreyL: preyL.append(listItem) lenCount += 1 if lenCount == 200: batchCount += 1 print "processing batch number %d of Uniprot IDs..." % (batchCount, ) idList = prot_id_converter(preyL, "10090", outDB="refseqproteingi") seqL = seqL + prot_entrez_fetch(idList, retM="gb", retT="text").split("\n") # fetch the complete genbank entries from entrez using this function from tools.py lenCount = 0 idList = [] preyL = [] print "this was batch number %d of %d" %(batchCount, maxBatch) print "" if lenCount != 0: batchCount += 1 print "processing batch number %d of Uniprot IDs..." 
% (batchCount, ) idList = prot_id_converter(preyL, "10090", outDB="refseqproteingi") seqL = seqL + prot_entrez_fetch(idList, retM="gb", retT="text").split("\n") # fetch the complete genbank entries from entrez using this function from tools.py lenCount = 0 idList = [] preyL = [] print "this was batch number %d of %d" %(batchCount, maxBatch) print "" else: idList = prot_id_converter(fullPreyL, "10090", outDB="refseqproteingi") seqL = prot_entrez_fetch(idList, retM="gb", retT="text").split("\n") # fetch the complete genbank entries from entrez using this function from tools.py """ dumpStr = "" for seqLine in seqL: seqStr = "***".join(seqLine) dumpStr = dumpStr + "xxxxx" + seqStr # this might get huge outputDump = open("entrezdump.txt", "w") outputDump.write(dumpStr) """ regionFlag = False sHFlag = True regionCount = 0 sHCount = 0 protCount = 0 shProtCount = 0 for flatLine in seqL: if flatLine[:10] == "LOCUS ": # this is usually the title of an entrez flatfile and contains the protien name protCount+= 1 newProtFlag = True # curProt = flatLine if flatLine[:11] == " Region": # regions like this mark domains in the flatfile. Look here for SH3 domains regionCount += 1 regionFlag = True continue if regionFlag: if flatLine[:11] == " ": if "SH3" in flatLine and sHFlag: sHCount += 1 sHFlag = False if newProtFlag: shProtCount += 1 newProtFlag = False # print flatLine # print curProt else: regionFlag = False sHFlag = True print "%d SH3 domains found in %d domains of %d proteins" % (sHCount, regionCount, protCount) print "%d proteins out of %d contain SH3 domains" % (shProtCount, protCount)