def formSeqGraph(folderName, mummerLink):
    """Build the condensed sequence graph and write the improved contigs.

    The connection lists are read from "rightConnect.txt" and
    "leftConnect.txt" through readConnectList. Entry k of each list is
    unpacked as a (connector, weight) pair; the right list yields an edge
    k -> connector and the left list yields an edge connector -> k. The
    graph is then cleaned, condensed, saved to "condensedGraph.txt",
    sanity-checked, reloaded for a round-trip comparison, and finally
    handed to IORobot.readContigOut to emit "improved.fasta".

    Args:
        folderName: passed to every read/save helper called here
            (presumably the working directory -- confirm against callers).
        mummerLink: forwarded unchanged to IORobot.readContigOut.

    Returns:
        None. Results are written through the helpers called here.
    """
    # Single-argument print(...) produces the same output whether print is
    # a statement or a function.
    print("formSeqGraph")

    print("formSeqGraph: Reading in best successors and predecessors")
    rightConnect = readConnectList(folderName, "rightConnect.txt")
    leftConnect = readConnectList(folderName, "leftConnect.txt")

    # The graph is sized by the right-connection list only.
    numberOfNodes = len(rightConnect)

    print("formSeqGraph: Initializing seqGraph")
    G = graphLib.seqGraph(numberOfNodes)

    print("formSeqGraph: Adding edges to seqGraph")
    for index, (connector, weight) in enumerate(rightConnect):
        G.insertEdge(index, connector, weight)

    # Left connections are inserted with the direction reversed.
    for index, (connector, weight) in enumerate(leftConnect):
        G.insertEdge(connector, index, weight)

    G.cleanEdge()
    G.condense()

    print("formSeqGraph: Outputting seqGraph to condensedGraph.txt")
    G.saveToFile(folderName, "condensedGraph.txt")
    G.checkSelfLoops()
    G.checkCompleteness()

    # Round-trip check: reload the file just written and compare it with
    # the in-memory graph.
    G2 = graphLib.seqGraph(0)
    G2.loadFromFile(folderName, "condensedGraph.txt")
    houseKeeper.compareGraphUnitTest(G, G2)

    G.reportDummyUsefulNode()
    G.reportEdge()

    graphFileName = "condensedGraph.txt"
    contigFile = "noEmbed_Double.fasta"
    outContigFile = "improved.fasta"
    outOpenList = "openZone.txt"

    print("formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta")
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile,
                          outContigFile, outOpenList)
def formSeqGraph(folderName, mummerLink):
    """Build the condensed sequence graph and write the improved contigs.

    The connection lists are read from "rightConnect.txt" and
    "leftConnect.txt" through readConnectList. Entry k of each list is
    unpacked as a (connector, weight) pair; the right list yields an edge
    k -> connector and the left list yields an edge connector -> k. The
    graph is then cleaned, condensed, saved to "condensedGraph.txt",
    sanity-checked, reloaded for a round-trip comparison, and finally
    handed to IORobot.readContigOut to emit "improved.fasta".

    Args:
        folderName: passed to every read/save helper called here
            (presumably the working directory -- confirm against callers).
        mummerLink: forwarded unchanged to IORobot.readContigOut.

    Returns:
        None. Results are written through the helpers called here.
    """
    # Single-argument print(...) produces the same output whether print is
    # a statement or a function.
    print("formSeqGraph")

    print("formSeqGraph: Reading in best successors and predecessors")
    rightConnect = readConnectList(folderName, "rightConnect.txt")
    leftConnect = readConnectList(folderName, "leftConnect.txt")

    # The graph is sized by the right-connection list only.
    numberOfNodes = len(rightConnect)

    print("formSeqGraph: Initializing seqGraph")
    G = graphLib.seqGraph(numberOfNodes)

    print("formSeqGraph: Adding edges to seqGraph")
    for index, (connector, weight) in enumerate(rightConnect):
        G.insertEdge(index, connector, weight)

    # Left connections are inserted with the direction reversed.
    for index, (connector, weight) in enumerate(leftConnect):
        G.insertEdge(connector, index, weight)

    G.cleanEdge()
    G.condense()

    print("formSeqGraph: Outputting seqGraph to condensedGraph.txt")
    G.saveToFile(folderName, "condensedGraph.txt")
    G.checkSelfLoops()
    G.checkCompleteness()

    # Round-trip check: reload the file just written and compare it with
    # the in-memory graph.
    G2 = graphLib.seqGraph(0)
    G2.loadFromFile(folderName, "condensedGraph.txt")
    houseKeeper.compareGraphUnitTest(G, G2)

    G.reportDummyUsefulNode()
    G.reportEdge()

    graphFileName = "condensedGraph.txt"
    contigFile = "noEmbed_Double.fasta"
    outContigFile = "improved.fasta"
    outOpenList = "openZone.txt"

    print("formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta")
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile,
                          outContigFile, outOpenList)
def xPhased(folderName, mummerLink):
    """Resolve repeats with read-supported links and write improved3.fasta.

    Repeat resolution [proxy for MB]:
      1. Re-form the contig string graph with ALL connections from
         contigs only.
      2. Log down the reads and associated blocked contigs.
      3. Use reads to connect.
      4. Transform the graph by identifying the 1 successor/predecessor
         case; condense (important).
      5. Read out contigs.

    A link (wt, myin, myout) from IORobot.obtainLinkInfo becomes an edge
    only if some entry of loadEdgeFromBlockedReads has the same two
    endpoints, min(len1, len2) >= wt, and both contig lengths in lenDic
    reach confidenLenThres (currently 0).

    Args:
        folderName: passed to every helper called here (presumably the
            working directory -- confirm against callers).
        mummerLink: forwarded to IORobot.obtainLinkInfo and
            IORobot.readContigOut.

    Returns:
        None.
    """
    numberOfContig, dataSet = IORobot.obtainLinkInfo(
        folderName, mummerLink, "improved2", "mb")
    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")
    confidenLenThres = 0

    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)

    for wt, myin, myout in dataSet:
        # Drop the first six characters of each name (presumably a fixed
        # prefix -- confirm), then split into [contig index, tag, ...].
        inFields = myin[6:].split('_')
        outFields = myout[6:].split('_')

        # Tag 'p' selects the even node 2*k, anything else the odd 2*k + 1.
        offsetIn = 0 if inFields[1] == 'p' else 1
        offsetOut = 0 if outFields[1] == 'p' else 1
        src = int(inFields[0]) * 2 + offsetIn
        dst = int(outFields[0]) * 2 + offsetOut

        # Scan every blocked-read edge; one qualifying edge is enough.
        readSupported = False
        for edge in extraEdges:
            edgeStart, edgeEnd, lenA, lenB = edge[0], edge[1], edge[2], edge[3]
            sameEndpoints = [src, dst] == [edgeStart, edgeEnd]
            if (sameEndpoints
                    and min(lenA, lenB) >= wt
                    and lenDic[myin] >= confidenLenThres
                    and lenDic[myout] >= confidenLenThres):
                readSupported = True

        if readSupported:
            G.insertEdge(src, dst, wt)

    G.MBResolve()
    G.reportEdge()
    G.saveToFile(folderName, "condensedGraphMB.txt")

    IORobot.readContigOut(folderName, mummerLink,
                          "condensedGraphMB.txt",    # graph file
                          "improved2_Double.fasta",  # input contigs
                          "improved3.fasta",         # output contigs
                          "openZoneMB.txt")          # output open list

    # ## Repeat resolution [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat
    # [short cut : use contig creator : your job here is to get data into
    #  the correct formats]

    print("xPhased")
def xPhased(folderName, mummerLink):
    """Resolve repeats with read-supported links and write improved3.fasta.

    Repeat resolution [proxy for MB]:
      1. Re-form the contig string graph with ALL connections from
         contigs only.
      2. Log down the reads and associated blocked contigs.
      3. Use reads to connect.
      4. Transform the graph by identifying the 1 successor/predecessor
         case; condense (important).
      5. Read out contigs.

    A link (wt, myin, myout) from IORobot.obtainLinkInfo becomes an edge
    only if some entry of loadEdgeFromBlockedReads has the same two
    endpoints, min(len1, len2) >= wt, and both contig lengths in lenDic
    reach confidenLenThres (currently 0). A progress message is printed
    before each stage.

    Args:
        folderName: passed to every helper called here (presumably the
            working directory -- confirm against callers).
        mummerLink: forwarded to IORobot.obtainLinkInfo and
            IORobot.readContigOut.

    Returns:
        None.
    """
    print("xPhased: Aligning improved2.fasta against itself, outputting to mb*.delta")
    numberOfContig, dataSet = IORobot.obtainLinkInfo(
        folderName, mummerLink, "improved2", "mb")
    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")
    confidenLenThres = 0

    print("xPhased: Building seqGraph")
    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)

    for wt, myin, myout in dataSet:
        # Drop the first six characters of each name (presumably a fixed
        # prefix -- confirm), then split into [contig index, tag, ...].
        inFields = myin[6:].split('_')
        outFields = myout[6:].split('_')

        # Tag 'p' selects the even node 2*k, anything else the odd 2*k + 1.
        offsetIn = 0 if inFields[1] == 'p' else 1
        offsetOut = 0 if outFields[1] == 'p' else 1
        src = int(inFields[0]) * 2 + offsetIn
        dst = int(outFields[0]) * 2 + offsetOut

        # Scan every blocked-read edge; one qualifying edge is enough.
        readSupported = False
        for edge in extraEdges:
            edgeStart, edgeEnd, lenA, lenB = edge[0], edge[1], edge[2], edge[3]
            sameEndpoints = [src, dst] == [edgeStart, edgeEnd]
            if (sameEndpoints
                    and min(lenA, lenB) >= wt
                    and lenDic[myin] >= confidenLenThres
                    and lenDic[myout] >= confidenLenThres):
                readSupported = True

        if readSupported:
            G.insertEdge(src, dst, wt)

    G.MBResolve()
    G.reportEdge()

    print("xPhased: Saving condensed seqGraph to condensedGraphMB.txt")
    G.saveToFile(folderName, "condensedGraphMB.txt")

    print("xPhased: Outputting improved contigs from condensed seqGraph to improved3.fasta")
    IORobot.readContigOut(folderName, mummerLink,
                          "condensedGraphMB.txt",    # graph file
                          "improved2_Double.fasta",  # input contigs
                          "improved3.fasta",         # output contigs
                          "openZoneMB.txt")          # output open list

    # ## Repeat resolution [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat
    # [short cut : use contig creator : your job here is to get data into
    #  the correct formats]

    print("xPhased")