def pfamHitLength_File(fileInfo):
    """Collect the hit lengths from one tab-separated Pfam hit file.

    The file ``os.path.join(fileInfo[1], fileInfo[2])`` is read line by line.
    Its first line is treated as a header; every later line is expected to
    hold tab-separated columns in this order::

        0       1               2             3       4
        PDB_ID  PdbResNumStart  PdbResNumEnd  eValue  PFAM_ACC

    For each data line the value ``PdbResNumEnd - PdbResNumStart`` is
    recorded. The resulting list is also pickled into ``conf.pfamGenFolder``
    under the input file name with ``conf.pfamExt`` replaced by ``.cPickle``.

    Args:
        fileInfo: Indexable whose element 1 is the input folder and whose
            element 2 is the input file name (element 0 is not used here).

    Returns:
        A list of int differences, one per non-blank data line, in file
        order.

    Raises:
        IndexError: if a non-blank data line has fewer than three columns.
        ValueError: if a start or end column is not an integer.
    """
    inputFolder = fileInfo[1]
    inputfile = fileInfo[2]

    lengthArray = []
    with open(os.path.join(inputFolder, inputfile), "r") as f:
        # The first line is the column header, not data.
        next(f, None)
        for line in f:
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no hit and would otherwise fail on arr[1].
            if not line.strip():
                continue
            arr = line.split("\t")
            start = int(arr[1].strip())
            end = int(arr[2].strip())
            lengthArray.append(end - start)

    # Persist the lengths next to the other generated Pfam artefacts.
    util.generateDirectories(conf.pfamGenFolder)
    outfile = os.path.join(conf.pfamGenFolder,
                           inputfile.replace(conf.pfamExt, ".cPickle"))
    with open(outfile, "wb") as f:
        dump(lengthArray, f)

    return lengthArray
# Example #2
# 0
def reDownloadSeq():
    """Rewrite every FASTA file in ``conf.PFAMFolder``, re-downloading masked sequences.

    For each input file, three paths inside ``conf.outputFolder`` are derived
    by replacing ``conf.PFamExt`` in the file name:

    * ``conf.outputExt``  -- the rewritten FASTA output (appended to),
    * ``_error.txt``      -- one ``protID,retmax`` line per failed download
      (truncated at the start of each run),
    * ``_progress.txt``   -- if present, holds the index of the first record
      to process. This function only reads it, it never writes it.

    A record whose sequence consists solely of ``X`` characters (an empty
    sequence also matches) is fetched again through ``DownloadNewSeq``,
    doubling ``retmax`` from 10 while the helper answers ``"Retmax"`` and
    ``retmax`` is below 1000. Every other record is copied to the output
    as-is.

    Returns:
        None.
    """
    util.generateDirectories(conf.outputFolder)
    PFAMFolder = conf.PFAMFolder

    for infile in os.listdir(PFAMFolder):
        outputDir = os.path.join(conf.outputFolder,
                                 infile.replace(conf.PFamExt, conf.outputExt))
        errorDir = os.path.join(conf.outputFolder,
                                infile.replace(conf.PFamExt, "_error.txt"))
        inputDir = os.path.join(PFAMFolder, infile)
        progDir = os.path.join(conf.outputFolder,
                               infile.replace(conf.PFamExt, "_progress.txt"))

        progress = 0
        if os.path.isfile(progDir):
            with open(progDir, "r") as f:
                # strip must be *called*; int() on the bound method raised
                # TypeError whenever a progress file existed.
                progress = int(f.read().strip())

        # Start each run with an empty error file, and release the handle.
        with open(errorDir, "w"):
            pass

        # NOTE(review): "rU" is the Python 2 universal-newline mode this file
        # was written for; it is rejected by Python 3.11+.
        with open(inputDir, "rU") as handle:
            records = list(SeqIO.parse(handle, "fasta"))

        total = len(records)
        for i in range(progress, total):
            record = records[i]
            seq = record.seq
            desc = record.description
            protID = desc.split(":")[0]
            seqmerge = str(seq).replace("\n", "").strip()

            if seqmerge != len(seqmerge) * "X":
                # Sequence is usable: copy the record through unchanged.
                with open(outputDir, "a") as f:
                    f.write(">" + str(desc) + "\n" + str(seq) + "\n\n")
                continue

            # Sequence is all X: try to download a replacement.
            print("downloading %s / %s %s %%" %
                  (i, total, int(i * 100 / float(total))))
            retmax = 10
            strOut = "Retmax"
            while strOut == "Retmax" and retmax < 1000:
                strOut = DownloadNewSeq(protID, retmax)
                time.sleep(.3)
                retmax = retmax * 2

            # Decide on the helper's answer rather than on retmax: retmax has
            # already been doubled past the last attempt, so testing
            # "retmax >= 1000" discarded a successful download made with
            # retmax=640.
            if strOut == "Error" or strOut == "Retmax":
                print("Cannot find Seq for: %s in %s downloads" %
                      (protID, retmax))
                with open(errorDir, "a") as f:
                    f.write(protID + "," + str(retmax) + "\n")
            else:
                with open(outputDir, "a") as f:
                    f.write(strOut)
# Example #3
# 0
def downloadInOneGo():
    """Download FASTA sequences for every protein named in ``conf.PFAMFolder``.

    For each file in ``conf.PFAMFolder``:

    1. An empty output file is created in ``conf.outputFolder`` (file name
       with ``conf.PFamExt`` replaced by ``conf.outputExt``).
    2. ``identifyProteinSequences`` is applied to the file's text and the
       result is pickled to ``<name>_proteins.cPickle`` in the same folder.
    3. Each entry of that result is passed to ``fetchFASTASeqFromPDB`` and
       the answer, followed by a blank line, is appended to the output file.
       The loop pauses 0.5 s after each request.

    Progress is printed to stdout.

    Returns:
        None.
    """
    util.generateDirectories(conf.outputFolder)
    PFAMFolder = conf.PFAMFolder

    for infile in os.listdir(PFAMFolder):
        outputDir = os.path.join(conf.outputFolder,
                                 infile.replace(conf.PFamExt, conf.outputExt))

        # Create/truncate the output file and release the handle right away
        # (the bare open() call used to leak it).
        with open(outputDir, "w"):
            pass

        # Identify the proteins we need to download.
        with open(os.path.join(PFAMFolder, infile), "r") as f:
            proteins = identifyProteinSequences(f.read())

        proteinDir = os.path.join(
            conf.outputFolder,
            infile.replace(conf.PFamExt, "_proteins.cPickle"))
        with open(proteinDir, "wb") as f:
            dump(proteins, f)

        total = len(proteins)
        print("Number of proteins %s" % total)

        for i, proteinQuery in enumerate(proteins):
            print("( %s / %s ) %s %%" %
                  (i + 1, total, int(i * 100 / float(total))))

            results = fetchFASTASeqFromPDB(proteinQuery)
            # Re-open per protein so each result is on disk before the next
            # request is made.
            with open(outputDir, "ab") as f:
                f.write(results)
                f.write("\n\n")
            time.sleep(.5)
def generateHistograms(fileInfo):
    """Plot overlaid Pfam and BLAST hit-length histograms for one input file.

    Hit lengths come from ``pfamHitLength_File(fileInfo)`` and from
    ``BLASTHitLength_File`` called on the matching BLAST file (same name with
    ``conf.pfamExt`` replaced by ``conf.blastExt``, located in
    ``conf.blastFolder``). Both histograms share 100 evenly spaced bin edges
    from 0 to the largest value found in either data set. The figure is saved
    into ``conf.histogramFolder`` as the input file name with ``conf.pfamExt``
    replaced by ``.png``.

    Args:
        fileInfo: Indexable; element 0 is forwarded unchanged to
            ``BLASTHitLength_File``, element 2 is the Pfam file name.

    Returns:
        None.
    """
    util.generateDirectories(conf.histogramFolder)

    pfamArr = pfamHitLength_File(fileInfo)
    pfamFilename = fileInfo[2]

    blastFilename = pfamFilename.replace(conf.pfamExt, conf.blastExt)
    blastArr = BLASTHitLength_File(
        (fileInfo[0], conf.blastFolder, blastFilename))

    # Common bin edges so the two distributions are directly comparable.
    numbins = 100
    maxnum = max(numpy.amax(blastArr), numpy.amax(pfamArr))
    bins = numpy.linspace(0, maxnum, numbins)

    # (data, face colour, opacity, legend label), drawn in this order.
    series = (
        (pfamArr, "red", .75, "pfam"),
        (blastArr, "blue", .25, "blast"),
    )
    for data, colour, opacity, tag in series:
        plt.hist(data, bins, normed=1, facecolor=colour, alpha=opacity,
                 label=tag)

    outdir = os.path.join(conf.histogramFolder,
                          pfamFilename.replace(conf.pfamExt, ".png"))

    plt.legend()
    plt.savefig(outdir)
    plt.close()
def build_graph(blastInfoFilename, blastdir, hspIntGraphdir, cutoffRatio,
                evalueCutoff):
    """Build the HSP/interval graph for one BLAST result file and pickle it.

    Every non-empty line of ``blastdir/blastInfoFilename`` is parsed with
    ``read_HSP``. An HSP whose ``"EValue"`` is below ``evalueCutoff`` and
    whose ``"query_id"`` differs from its ``"target_id"`` contributes two
    nodes (``nodeName(hsp, "query")`` and ``nodeName(hsp, "target")``) joined
    by an edge carrying the attribute ``eValue``. Afterwards, for every pair
    of node names that share the same first element, each pair returned by
    ``findOverlapIntervals(name1, name2, cutoffRatio)`` is added as an edge.

    A ``*`` is written to stdout roughly every tenth of the input as a
    progress indicator.

    Args:
        blastInfoFilename: Name of the BLAST info file inside ``blastdir``.
        blastdir: Folder containing the input file.
        hspIntGraphdir: Folder the pickled graph is written to (created if
            needed through ``util.generateDirectories``).
        cutoffRatio: Passed through to ``findOverlapIntervals``.
        evalueCutoff: HSPs with an e-value not below this are ignored.

    Returns:
        The file name (not the full path) of the pickled graph:
        ``blastInfoFilename`` with its last extension removed and
        ``_HSPIntGraph.gpickle`` appended.
    """
    from itertools import combinations

    util.generateDirectories(hspIntGraphdir)

    g = nx.Graph()

    with open(os.path.join(blastdir, blastInfoFilename), "r") as f:
        lines = f.read().split("\n")

    # Split once and clamp the step to 1: the original re-split the whole
    # content on every iteration and divided by zero for inputs of fewer
    # than ten lines.
    progressStep = max(1, len(lines) // 10)

    # Node names grouped by their first element (the protein name).
    nodeNames = {}

    # Add the HSP edges.
    for i, line in enumerate(lines):
        if i % progressStep == 0:
            sys.stdout.write("*")
            sys.stdout.flush()
        if not line:
            continue

        hsp = read_HSP(line)
        goodeval = hsp["EValue"] < evalueCutoff
        notsameprotein = hsp["query_id"] != hsp["target_id"]
        if not (goodeval and notsameprotein):
            continue

        # Nodes (p_1,s_1,e_1) and (p_2,s_2,e_2), joined by an edge.
        queryNode = nodeName(hsp, "query")
        targetNode = nodeName(hsp, "target")
        g.add_node(queryNode)
        g.add_node(targetNode)
        g.add_edge(queryNode, targetNode, eValue=hsp["EValue"])

        # Remember both node names under their protein name.
        addToDict(nodeNames, queryNode[0], queryNode)
        addToDict(nodeNames, targetNode[0], targetNode)

    sys.stdout.write("\n")
    sys.stdout.flush()

    # Add the interval edges: compare every unordered pair of node names
    # belonging to the same protein.
    for subNodeNames in nodeNames.values():
        for name1, name2 in combinations(subNodeNames, 2):
            for overlapPair in findOverlapIntervals(name1, name2,
                                                    cutoffRatio):
                g.add_edge(overlapPair[0], overlapPair[1])

    # Save the graph under the input name minus its last extension.
    fileExt = "." + blastInfoFilename.split(".")[-1]
    outputFile = blastInfoFilename.replace(fileExt,
                                           "") + '_HSPIntGraph.gpickle'
    outputPath = os.path.join(hspIntGraphdir, outputFile)
    with open(outputPath, 'wb') as fout:
        dump(g, fout, HIGHEST_PROTOCOL)

    return outputFile
def defineBordersFromGraph(graphFile, hspIntGraphdir, borderInfodir,
                           borderResultdir):
    """Derive per-protein module borders from a pickled HSP/interval graph.

    The graph is loaded from ``hspIntGraphdir/graphFile`` and split into its
    connected components, which are sorted by ascending node count; a
    component's position in that order is its module ID. Each node is read as
    ``(proteinName, start, end)`` and recorded as ``[moduleID, start, end]``
    under its protein, with ``collapsOverlappingBorders`` applied to the
    protein's border list after every addition beyond the first and once
    more at the end.

    Two files are written, both named after ``graphFile`` with
    ``_HSPIntGraph.gpickle`` removed:

    * ``borderInfodir/<name>_BorderInformation.gpickle`` -- a pickled pair of
      ``(label, value)`` tuples: the module count and the border dictionary.
    * ``borderResultdir/<name>_ModuleInfo.txt`` -- a tab-separated text
      report, one protein per line, proteins in sorted order.

    Args:
        graphFile: File name of the pickled graph inside ``hspIntGraphdir``.
        hspIntGraphdir: Folder containing the pickled graph.
        borderInfodir: Output folder for the pickled border information.
        borderResultdir: Output folder for the text report.

    Returns:
        0, always.
    """
    util.generateDirectories(borderInfodir)
    util.generateDirectories(borderResultdir)

    # The graph is pickled with HIGHEST_PROTOCOL, a binary format, so it has
    # to be read back in binary mode.
    with open(os.path.join(hspIntGraphdir, graphFile), "rb") as fin:
        g = load(fin)

    # NOTE(review): connected_component_subgraphs was removed in networkx 2.4;
    # confirm the networkx version this project pins.
    CCgraphs = list(nx.connected_component_subgraphs(g))
    CCgraphs.sort(key=lambda tup: len(tup.nodes()))

    # key = protein name, value = list of [moduleID, start, end] borders
    modulefamilyinfo = {}
    for moduleID, CCgraph in enumerate(CCgraphs):
        for node in CCgraph.nodes():
            proteinName = node[0]
            border = [moduleID, int(node[1]), int(node[2])]

            borders = modulefamilyinfo.get(proteinName)
            if borders is None:
                modulefamilyinfo[proteinName] = [border]
            else:
                # Protein already known: add the border and merge overlaps.
                borders.append(border)
                collapsOverlappingBorders(borders)

    # Clean up borders that ended up overlapping after later additions.
    for borders in modulefamilyinfo.values():
        collapsOverlappingBorders(borders)

    baseName = graphFile.replace('_HSPIntGraph.gpickle', "")

    # Save the border information dataset.
    infoPath = os.path.join(borderInfodir,
                            baseName + '_BorderInformation.gpickle')
    with open(infoPath, 'wb') as fout:
        moduleNumInfo = ("Number of modules detected", len(CCgraphs))
        # The backslashes are part of the stored label; they are escaped
        # explicitly so the value matches what "\{" produced before.
        familyInfoWrap = (
            "ModuleFamilyInfo, Format: Dict\\{proteinName, list[borders[moduleId,start,end]] \\}",
            modulefamilyinfo)
        dump((moduleNumInfo, familyInfoWrap), fout, HIGHEST_PROTOCOL)

    # Write the same information as a human-readable text report.
    resultPath = os.path.join(borderResultdir, baseName + '_ModuleInfo.txt')
    with open(resultPath, "w") as resultFile:
        resultFile.write("Number of modules detected: " + str(len(CCgraphs)) +
                         "\n\n")
        resultFile.write("proteinName\t borders\n")

        for protein in sorted(modulefamilyinfo):
            resultFile.write(protein)
            for moduleID, start, end in modulefamilyinfo[protein]:
                resultFile.write("\tM_" + str(moduleID) + "(" + str(start) +
                                 "," + str(end) + ")")
            resultFile.write("\n")

    return 0