def findNoGoByNoHeads(noGoList, side, folderName): noGoListNew = [] sortedContigList, sortedReadList, sortedContigDic, sortedReadDic =\ formSortedDataList(folderName) lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta" ) lenDicRead = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta") for x in noGoList: rList = findAttachedReads(x, side, folderName,sortedContigList,sortedContigDic, lenDicContig,lenDicRead) cList = findAttachedContigs(rList, side, folderName, sortedReadList, sortedReadDic, lenDicContig,lenDicRead) if bestMatchContigOnly == False: bestContigIDList = findBreakContigAdv(cList) else: bestContigIDList = findBreakContig(cList) if len(rList) > 0 and len(cList) > 0: print "x, side, len(rList), len(cList), len(bestContigIDList)",\ abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList) print "cList", bestContigIDList noGoListNew = noGoListNew + bestContigIDList return noGoListNew
def findNoGoByNoHeads(noGoList, side, folderName): noGoListNew = [] sortedContigList, sortedReadList, sortedContigDic, sortedReadDic =\ formSortedDataList(folderName) lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta") lenDicRead = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta") for x in noGoList: rList = findAttachedReads(x, side, folderName, sortedContigList, sortedContigDic, lenDicContig, lenDicRead) cList = findAttachedContigs(rList, side, folderName, sortedReadList, sortedReadDic, lenDicContig, lenDicRead) if bestMatchContigOnly == False: bestContigIDList = findBreakContigAdv(cList) else: bestContigIDList = findBreakContig(cList) if len(rList) > 0 and len(cList) > 0: print "x, side, len(rList), len(cList), len(bestContigIDList)",\ abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList) print "cList", bestContigIDList noGoListNew = noGoListNew + bestContigIDList return noGoListNew
def findCoverageFromRawData(folderName): contigLenDic = IORobot.obtainLength(folderName, "contigs.fasta") readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta") G = 0 NL = 0 for eachitem in contigLenDic: G = G + contigLenDic[eachitem] for eachitem in readLenDic: NL = NL + readLenDic[eachitem] c = (NL * 1.0) / G print c return c
def findCoverageFromRawData(folderName): contigLenDic = IORobot.obtainLength(folderName, "contigs.fasta") readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta") G = 0 NL = 0 for eachitem in contigLenDic: G = G+ contigLenDic[eachitem] for eachitem in readLenDic: NL = NL+ readLenDic[eachitem] c = (NL*1.0)/G print c return c
def abunSplitAdvResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    '''
    Run the advanced abundance-split resolution steps in order.

    Algorithm (as listed by the original author):
        1) Load ContigReadGraph and form xResolvedGraph
        2) Transitive reduction and remove double pointers
        3) Bipartite resolution
        4) xResolve
        5) Form gapLookUp
        6) Read contigs out from graph
        7) CheckAns

    The graph's edges are logged after graph surgery, after BResolution and
    after XResolution.
    '''
    # Optional EM pre-processing, controlled by a global flag.
    if abunHouseKeeper.abunGlobalRunEM == True:
        emalgo.generateAssociatedReadDic(folderName)

    doubledContigFile = contigFilename + "_Double.fasta"
    lenDic = IORobot.obtainLength(folderName, doubledContigFile)
    N1 = len(lenDic)

    Gnew = graphSurgery(myCountDic, folderName, contigReadGraph,
                        mummerLink, readsetFilename, contigFilename)
    Gnew.logEdges(folderName, "graphsurgery")

    Gnew = BResolution(Gnew, folderName, contigReadGraph, N1,
                       myCountDic, lenDic, mummerLink)
    Gnew.logEdges(folderName, "BResolution")

    # XResolution's return value is not captured; the object logged next is
    # the same Gnew that was passed in.
    XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1, mummerLink)
    Gnew.logEdges(folderName, "XResolution")

    readContigForAbunSplit(folderName, mummerLink, contigFilename,
                           readsetFilename, N1, contigReadGraph)
def generateGapContentLookup( folderName, mummerLink, oldResolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic={} ): gapContentLookUpList = [] contigLenDic = IORobot.obtainLength(folderName, contigFilename + ".fasta") N1 = len(contigLenDic) * 2 resolvedList = [] print "mapDummyToRealDic", mapDummyToRealDic for eachmatchpair in oldResolvedList: tmpList = [] if eachmatchpair[0] >= N1: tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[0] - N1)][1] else: tmpList.append(eachmatchpair[0]) if eachmatchpair[-1] >= N1: tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[-1] - N1)][1] else: tmpList.append(eachmatchpair[-1]) for ii in range(len(tmpList) - 1): resolvedList.append([tmpList[ii], tmpList[ii + 1]]) gapContentLookUpList = parallelGapLookUp( resolvedList, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename ) return gapContentLookUpList
def filterEdge(adjacencyList, folderName, contigFilename): lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") thresFoPhase = 2000 smallList, largeList = [], [] for eachitem in lenDic: id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C') if lenDic[eachitem] < thresFoPhase: smallList.append(id) else: largeList.append(id) newAdjacencyList = [[] for i in range(len(adjacencyList))] for i in largeList: for eachitem in adjacencyList[i]: ######## IMPORTANT: if eachitem in largeList and eachitem / 2 != i / 2: ######## NEED TO REMOVE IN PRODUCTION if True newAdjacencyList[i].append(eachitem) print "len(smallList) , len(largeList): ", len(smallList) , len(largeList) print "lenDic: ", lenDic for eachitem in newAdjacencyList: print "newAdjacencyList :", eachitem return newAdjacencyList
def filterEdge(adjacencyList, folderName, contigFilename): lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") thresFoPhase = 2000 smallList, largeList = [], [] for eachitem in lenDic: id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C') if lenDic[eachitem] < thresFoPhase: smallList.append(id) else: largeList.append(id) newAdjacencyList = [[] for i in range(len(adjacencyList))] for i in largeList: for eachitem in adjacencyList[i]: ######## IMPORTANT: if eachitem in largeList and eachitem / 2 != i / 2: ######## NEED TO REMOVE IN PRODUCTION if True newAdjacencyList[i].append(eachitem) print "len(smallList) , len(largeList): ", len(smallList), len(largeList) print "lenDic: ", lenDic for eachitem in newAdjacencyList: print "newAdjacencyList :", eachitem return newAdjacencyList
def formExtraEdges( folderName="/home/kakitfive/kkdata2/MetaFinisherSC/dataFolderBackup/", optTypeFileHeader="phaseString", contigFilename="improved3", G=[], N1=0): dataList = alignerRobot.extractMumData(folderName, optTypeFileHeader + "CR" + "Out") dataList.sort(key=itemgetter(-2)) lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") count = 0 tmpItem = [] embedContig2ReadDic, read2EmbedContigDic = {}, {} for key, items in groupby(dataList, itemgetter(-2)): isEmbedded = False for eachitem in items: #print eachitem if eachitem[4] > lenDic[key] - 300: isEmbedded = True tmpItem = eachitem if isEmbedded: count = count + 1 readName = tmpItem[-1] embedContig2ReadDic[key] = readName read2EmbedContigDic[readName] = key print "len(embedContig2ReadDic)", len(embedContig2ReadDic) #assert(False) for contigName in embedContig2ReadDic: readName = embedContig2ReadDic[contigName] readIndex, contigIndex = abunHouseKeeper.parseEdgeNameToID( readName, 'R'), abunHouseKeeper.parseEdgeNameToID(contigName, 'C') for eachprev in G.graphNodesList[readIndex].listOfPrevNodes: idNode, wt = eachprev[0], eachprev[1] if idNode < N1: G.insertEdge(idNode, contigIndex, wt) for eachnext in G.graphNodesList[readIndex].listOfNextNodes: idNode, wt = eachnext[0], eachnext[1] if idNode < N1: G.insertEdge(contigIndex, idNode, wt) return G
def colorNodes(folderName, mummerPath,sourceFilename, contigFilename, readsetFilename): print "colorNodes" lenDic = IORobot.obtainLength(folderName, sourceFilename+".fasta") print lenDic thresForShort = 15000 shortList = [] longList = [] for eachitem in lenDic: if lenDic[eachitem] > thresForShort: longList.append(eachitem) else: shortList.append(eachitem) IORobot.putListToFileO(folderName, sourceFilename+".fasta", contigFilename, longList) IORobot.putListToFileO(folderName, sourceFilename+".fasta", readsetFilename, shortList)
def colorNodes(folderName, mummerPath, sourceFilename, contigFilename, readsetFilename): print "colorNodes" lenDic = IORobot.obtainLength(folderName, sourceFilename + ".fasta") print lenDic thresForShort = 15000 shortList = [] longList = [] for eachitem in lenDic: if lenDic[eachitem] > thresForShort: longList.append(eachitem) else: shortList.append(eachitem) IORobot.putListToFileO(folderName, sourceFilename + ".fasta", contigFilename, longList) IORobot.putListToFileO(folderName, sourceFilename + ".fasta", readsetFilename, shortList)
def decideCut(folderName, mummerPath): ''' Input : directPath.fasta, indirectPath.fasta Output : toDelete ''' thres = 50 if True: alignerRobot.useMummerAlign(mummerPath, folderName, \ "indirectvsdirect", "indirectPath.fasta", "directPath.fasta", specialForRaw = False, specialName = "", refinedVersion= True) dataList = alignerRobot.extractMumData(folderName , "indirectvsdirectOut") lenDic = IORobot.obtainLength(folderName, "directPath.fasta") ctr =0 ctrindirect = 0 dataList.sort(key = itemgetter(-1)) toDelete = True for key, items in groupby(dataList, itemgetter(-1)): print "key", key ctr = ctr + 1 isFound = False for eachitem in items: if eachitem[2] < thres and eachitem[3] > lenDic[key] - thres: isFound = True if isFound: ctrindirect = ctrindirect + 1 epsilon = 1.1 print "ctrindirect, ctr", ctrindirect, ctr if ctrindirect*1.0/ctr < (1- epsilon): toDelete = False else: toDelete = True return toDelete
def decideCut(folderName, mummerPath): ''' Input : directPath.fasta, indirectPath.fasta Output : toDelete ''' thres = 50 if True: alignerRobot.useMummerAlign(mummerPath, folderName, \ "indirectvsdirect", "indirectPath.fasta", "directPath.fasta", specialForRaw = False, specialName = "", refinedVersion= True) dataList = alignerRobot.extractMumData(folderName , "indirectvsdirectOut") lenDic = IORobot.obtainLength(folderName, "directPath.fasta") ctr =0 ctrindirect = 0 dataList.sort(key = itemgetter(-1)) toDelete = True for key, items in groupby(dataList, itemgetter(-1)): print "key", key ctr = ctr + 1 isFound = False for eachitem in items: if eachitem[2] < thres and eachitem[3] > lenDic[key] - thres: isFound = True if isFound: ctrindirect = ctrindirect + 1 epsilon = 1.1 print "ctrindirect, ctr", ctrindirect, ctr if ctrindirect*1.0/ctr < (1- epsilon): toDelete = False else: toDelete = True return toDelete
def runningTestSet(self ,myFolderName, ctexpected, commandList, matchingContigFile): print "Integration test on RepeatPhaserMain: " + myFolderName self.sourceFolder = myFolderName os.system("rm -rf "+ self.testingFolder) os.system("mkdir " + self.testingFolder) for eachitem in self.listOfFiles: os.system("cp "+ self.sourceFolder + eachitem + " " +self.testingFolder) for eachcommand in commandList: os.system(eachcommand) lenDic = IORobot.obtainLength(self.testingFolder, matchingContigFile) assert(len(lenDic) == ctexpected) os.system("rm -rf "+ self.testingFolder)
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i, adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem": newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) # cut here adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
def runningTestSet(self, myFolderName, ctexpected, commandList, matchingContigFile): print "Integration test on RepeatPhaserMain: " + myFolderName self.sourceFolder = myFolderName os.system("rm -rf " + self.testingFolder) os.system("mkdir " + self.testingFolder) for eachitem in self.listOfFiles: os.system("cp " + self.sourceFolder + eachitem + " " + self.testingFolder) for eachcommand in commandList: os.system(eachcommand) lenDic = IORobot.obtainLength(self.testingFolder, matchingContigFile) assert (len(lenDic) == ctexpected) os.system("rm -rf " + self.testingFolder)
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i , adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem" : newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) # cut here adjListToRepeatList(newAdjacencyList,folderName,repeatFilename )
def abunSplitAdvResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """
    Run graph surgery, B-resolution, X-resolution and contig read-out.

    Algorithm (as listed by the original author):
        1) Load ContigReadGraph and form xResolvedGraph
        2) Transitive reduction and remove double pointers
        3) Bipartite resolution
        4) xResolve
        5) Form gapLookUp
        6) Read contigs out from graph
        7) CheckAns
    """
    doubledContigFile = contigFilename + "_Double.fasta"
    lenDic = IORobot.obtainLength(folderName, doubledContigFile)
    N1 = len(lenDic)

    surgeryGraph = graphSurgery(myCountDic, folderName, contigReadGraph,
                                mummerLink, readsetFilename, contigFilename)
    Gnew = BResolution(surgeryGraph, folderName, contigReadGraph, N1, myCountDic, lenDic)

    # The value returned by XResolution (if any) is not used.
    XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1)

    readContigForAbunSplit(folderName, mummerLink, contigFilename,
                           readsetFilename, N1, contigReadGraph)
def checkPathLength(path, G, N1, folderName): lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta") sumLength = 0 overlapLength = 0 for index, i in zip(path, range(len(path))): header = "Read" + str((index - N1) / 2) + "_" if (index - N1) % 2 == 0: header = header + "p" else: header = header + "d" print "lenDicRR[header], ", lenDicRR[header], header print (index - N1) * 2 + 1, (index - N1) * 2 + 2 sumLength = sumLength + lenDicRR[header] if i != len(path) - 1: for eachnext in G.graphNodesList[index].listOfNextNodes: if eachnext[0] == path[i + 1]: overlapLength = overlapLength + eachnext[1] break print sumLength, overlapLength, sumLength - overlapLength
def checkPathLength(path, G, N1, folderName): lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta") sumLength = 0 overlapLength = 0 for index, i in zip(path, range(len(path))): header = "Read" + str((index - N1) / 2) + "_" if (index - N1) % 2 == 0: header = header + "p" else: header = header + "d" print "lenDicRR[header], ", lenDicRR[header], header print(index - N1) * 2 + 1, (index - N1) * 2 + 2 sumLength = sumLength + lenDicRR[header] if i != len(path) - 1: for eachnext in G.graphNodesList[index].listOfNextNodes: if eachnext[0] == path[i + 1]: overlapLength = overlapLength + eachnext[1] break print sumLength, overlapLength, sumLength - overlapLength
def generateGapContentLookup(folderName, mummerLink, oldResolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic={}): gapContentLookUpList = [] contigLenDic = IORobot.obtainLength(folderName, contigFilename + ".fasta") N1 = len(contigLenDic) * 2 resolvedList = [] print "mapDummyToRealDic", mapDummyToRealDic for eachmatchpair in oldResolvedList: tmpList = [] if eachmatchpair[0] >= N1: tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[0] - N1)][1] else: tmpList.append(eachmatchpair[0]) if eachmatchpair[-1] >= N1: tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[-1] - N1)][1] else: tmpList.append(eachmatchpair[-1]) for ii in range(len(tmpList) - 1): resolvedList.append([tmpList[ii], tmpList[ii + 1]]) gapContentLookUpList = abunGraphLib.parallelGapLookUp( resolvedList, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename) return gapContentLookUpList
def test1(): lenDic = {} coverageDic = {} lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta") f = open("/Users/kakitlam/Documents/abundata", 'r') tmp = f.readline() while len(tmp) > 0: if len(tmp) > 10: myitem = tmp[0:-1].split() coverageDic[myitem[0]] = float(myitem[1]) tmp = f.readline() f.close() myList = [] baseCt = {} for eachitem in lenDic: myList.append(lenDic[eachitem]*coverageDic[eachitem]) baseCt[eachitem] = lenDic[eachitem]*coverageDic[eachitem] for eachitem in lenDic : print eachitem, baseCt[eachitem] for eachitem in lenDic : print eachitem, lenDic[eachitem] for eachitem in lenDic : print eachitem, coverageDic[eachitem]
def viewLenDic(): folderName = "Apr10Test/" json_data = open(folderName + "myCountDic.json", 'r') myCountDic = json.load(json_data) contigLenDic = IORobot.obtainLength(folderName, "LC_n.fasta") toPlotListX = [] toPlotListY = [] for eachitem in contigLenDic: toPlotListX.append(myCountDic[eachitem]) toPlotListY.append(contigLenDic[eachitem]) print toPlotListX, toPlotListY with open(folderName + "toPlotListX.json", 'w') as f: json.dump(toPlotListX, f) with open(folderName + "toPlotListY.json", 'w') as f: json.dump(toPlotListY, f)
def continuousIntegration():
    '''
    Scratch driver holding a series of manual experiments.

    Every experiment is wrapped in `if False:` and therefore disabled; the
    only statement that executes is the final `if True:` block, which calls
    nonRedundantResolver.removeRedundantWithFile on "May11TestB/".
    '''
    # NOTE(review): the nesting of the disabled blocks below was reconstructed;
    # since all of them are guarded by `if False:` it does not affect behavior.

    # Disabled: BFS_revisit on a small hand-built graph.
    if False:
        G = graphLib.seqGraph(10)
        for i in range(5):
            G.insertEdge(i,i+1,1997)
            G.insertEdge(i,i+2, 1997)

        resultList = abunGraphLib.BFS_revisit(1,3,G,1)

        print "resultList", resultList

    # Disabled: formPathSeq on fixed direct/indirect path lists.
    if False :
        folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile = \
            "Apr10Test/", "/usr/bin/", [[1, 486, 217], [1, 8642, 217], [1, 13465, 217]], [[1, 486, 217]], "improved3_Double.fasta", "phasingSeedName_Double.fasta"

        abunGraphLib.formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile)

        # Disabled: build a contig-only graph from reachability and compare
        # the paths found in it with those of the contig/read graph.
        if False:
            lenDic = IORobot.obtainLength(folderName , contigFile)
            N1 = len(lenDic)

            print "N1", N1

            G = graphLib.seqGraph(0)
            G.loadFromFile(folderName, "phaseStringGraph1")

            adj = [[] for i in range(N1)]

            for i in range(N1):
                adj[i] = abunGraphLib.findAllReachable(i, N1, G)

            Gnew = abunGraphLib.seqGraphDynamic(N1)

            for i in range(N1):
                for j in adj[i]:
                    Gnew.insertEdge(i,j,1997)

            Gnew.initAdv()
            Gnew.doubleEdgeReduction()

            contigPaths = abunGraphLib.findAllPathK(1, 217, Gnew, 3)
            contigReadPaths = abunGraphLib.findAllPathK(1, 217, G, 5)

            print "contigPaths", contigPaths
            print "contigReadPaths", contigReadPaths

            Gnew.transitiveReduction()

    # Disabled: decideCut on Apr10Test/.
    if False:
        toDelete = abunGraphLib.decideCut("Apr10Test/", "/usr/bin/")
        print toDelete

    # Disabled: inspect xResolvedGraph and print lengths/counts of some nodes.
    if False:
        G = graphLib.seqGraph(0)
        G.loadFromFile("Apr10TestA/", "xResolvedGraph")

        if False:
            for i in range(len(G.graphNodesList)):

                v = G.graphNodesList[i]

                if len(v.nodeIndexList) > 0:
                    print i , v.listOfPrevNodes , v.listOfNextNodes

        G.reportEdge()
        lenDic = IORobot.obtainLength("Apr10TestA/", "improved3_Double.fasta")
        mylist = [401, 207, 405, 407, 344]

        json_data = open("Apr10TestA/" + "myCountDic.json", 'r')
        myCountDic = json.load(json_data)

        for x in mylist:
            print x, lenDic["Contig"+str(x/2)+"_p"], myCountDic["Segkk"+str(x/2)]

    # Disabled: compare graph size minus dummy-node count with contig count.
    if False:
        folderName = "Apr10TestA/"
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName , "xResolvedGraph")

        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
        print len(G.graphNodesList)
        print len(mapDummyToRealDic)

        print "fake N1 , real N1 ", len(G.graphNodesList) - len(mapDummyToRealDic), len(lenDic)

    # Disabled: full abunSplitter flow.
    if False:
        abunSplitter.mainFlow("Apr10TestB/", "/usr/bin/")

    # Disabled: embedded-contig removal.
    if False:
        nonRedundantResolver.removeEmbedded("Apr10TestD/", "/usr/bin/")

    # Disabled: reachability graph with adaptor-skipped pairs filtered out,
    # followed by a count of nodes having exactly two predecessors and two
    # successors.
    if False:
        folderName, contigReadGraph = "Apr10TestA/", "phaseStringGraph1"

        G = graphLib.seqGraph(0)
        kthres, edgeThres = 3, 1
        G.loadFromFile(folderName, contigReadGraph)
        lenDic = IORobot.obtainLength(folderName , "improved3_Double.fasta")

        N1 = len(lenDic)

        adj = [[] for i in range(N1)]

        for i in range(N1):
            tmpList = abunGraphLib.findAllReachable(i, N1, G)

            for j in tmpList:
                if len(abunGraphLib.findAllPathK(i,j,G,kthres)) >= edgeThres:
                    adj[i].append(j)

            #print i, adj[i]

        ### Filter adaptor skipped case

        adaptorPair = []

        # An edge between node i and its partner (i+1 for even i, i-1 for odd
        # i) is dropped from adj and remembered as an adaptor pair.
        for i in range(len(adj)):
            if i % 2 == 0:
                if i + 1 in adj[i]:
                    adj[i].remove(i+1)
                    adaptorPair.append([i, i+1])
            elif i % 2 ==1:
                if i-1 in adj[i] :
                    adj[i].remove(i-1)
                    adaptorPair.append([i, i-1])

        Gnew = abunGraphLib.seqGraphDynamic(N1)

        for i in range(N1):
            for j in adj[i]:
                Gnew.insertEdge(i,j,1997)

        for eachpair in adaptorPair:
            u, v = eachpair[0], eachpair[1]
            for x in Gnew.graphNodesList[u].listOfPrevNodes:
                xIndex = x[0]
                Gnew.removeEdge(xIndex, v)
            for y in Gnew.graphNodesList[v].listOfNextNodes:
                yIndex = y[0]
                Gnew.removeEdge(u, yIndex)

        #Gnew.reportEdge()
        count2 = 0
        for i in range(len(Gnew.graphNodesList)):
            if len(Gnew.graphNodesList[i].listOfPrevNodes) == 2 and len(Gnew.graphNodesList[i].listOfNextNodes) == 2:
                count2 = count2 + 1
                print str(i)+"{color:red}"

        print "count2, ", count2

        ### End filter adaptor skipped case

    # The only active experiment.
    if True:
        nonRedundantResolver.removeRedundantWithFile("May11TestB/" , "/usr/bin/", "abun", "abunDebug", "abunNoEmbed")
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    '''
    Build the string graph linking reads and contigs and save it to a file.

    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm:
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V

    The three alignment runs are contig-contig ("CC"), read-read ("RR",
    skipped when abunHouseKeeper.abunGlobalRRDisable is set) and contig-read
    ("CR"); their output names are prefixed with optTypeFileHeader. The graph
    is saved under graphName in folderName. Nothing is returned.
    '''

    G = []

    # a) write the "_Double" versions of the contig and read files
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")

    # b1) contig vs contig
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    # b2) read vs read (optional)
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName ,referenceFile, queryFile, mummerLink, header )

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # Leftover debugging output for one specific pair of reads.
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p" :
                print "debug" , eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p" :
                print "debug" , eachitem

        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    # b3) contig vs read
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName ,referenceFile, queryFile, mummerLink, header )
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    # Merged length lookup covering contig and read names (Python 2: items()
    # returns lists, so they can be concatenated).
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    # c) one graph node per entry of the merged length dictionary
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes

    ''' e.g. of dataListCC[0], dataListRR[0], dataListCR[0] [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d'] [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p'] [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d'] '''

    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]

    # for eachitem in dataListCC:
    #    print eachitem
    # The numeric arguments are presumably index offsets for the two sides of
    # each alignment (contigs at 0, reads at N1) - TODO confirm in addDataToList.
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    # d) persist the graph, then run the length check on it
    G.saveToFile(folderName, graphName)

    checkGraphLength(G, N1, lenDicRR)

    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
def getAllAssociatedReads(folderName, mummerLink,forFastaName):
    '''
    Grow the seed read set by repeatedly aligning it against the raw reads.

    Input : relatedReads.fasta, raw_reads.fasta
    Output : all_associated_reads.fasta
             (written as forFastaName + ".fasta" in folderName)
    Algorithm :
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i) Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads

    The number of rounds N is abunHouseKeeper.abunGlobalReadSearchDepth. When
    N is not positive, relatedReads.fasta is simply copied to the output.
    '''

    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"

    # Seed the working file with the related reads.
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)

    N = abunHouseKeeper.abunGlobalReadSearchDepth

    print "N: ", N
    if N >0 :
        for trial in range(N):
            print "trial", trial
            # The raw reads are addressed as 20 files named
            # raw_reads.part-01.fasta ... raw_reads.part-20.fasta
            # (presumably created by an earlier step - TODO confirm).
            numberOfFiles = 20

            if True:
                workerList = []

                for dummyI in range(1, numberOfFiles + 1):
                    # Two-digit, zero-padded part index.
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)

                    outputName, referenceName, queryName, specialName= header+indexOfMum, referenceFile, "raw_reads.part-"+ indexOfMum + ".fasta", header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])

                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)

            # Collect the alignment records of all parts.
            dataList = []

            for i in range(1, 1+numberOfFiles):
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out")

            filterList = []

            lenDicRR = IORobot.obtainLength(folderName, queryFile)

            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)

            # Sort so groupby yields each distinct last field (read name) once.
            filterList.sort(key=itemgetter(-1))
            newReads = []

            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)

            # Write the selected read names, one per line.
            f = open(folderName + forFastaName + ".txt", 'w')

            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()

            # The perl one-liner is given the name list and raw_reads.fasta and
            # its output replaces forFastaName + ".fasta" (presumably the
            # FASTA records whose names are listed - TODO confirm).
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
def defineRepeatAndFlanking(folderName, mummerLink,contigFilename,contigReadGraph,repeatFilename,repeatSpec ):
    '''
    Describe each repeat (interior, flanking regions, associated reads).

    Input :
    V a) String graph : G
    V b) Repeat Pairing : repeatList

    Output :
    V a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E])
    V b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
    V c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])

    Algorithm :
    V 1. Find repeat by graph operations
    V 2. Find flanking region by graph operations
    V 3. Find associated reads by graph operations

    The per-repeat results are collected in a list and written as JSON to
    folderName + repeatSpec. Only repeats with exactly two in-contigs and two
    out-contigs, or exactly one of each, are processed. Nothing is returned.
    '''

    print "defineRepeatAndFlanking: "

    # 0. Load previous data
    G = abunGraphLib.seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = abunGraphLib.formReverseGraph(G)

    # NOTE(review): this file handle is never closed explicitly.
    json_data = open(folderName + repeatFilename, 'r')
    repeatList = json.load(json_data)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)

    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)

    bigDumpList = []

    print "len(repeatList)", len(repeatList) , repeatList
    for r in repeatList:
        # r[0] / r[1] hold the in / out node IDs; halving (Python 2 integer
        # division) maps the IDs 2k and 2k+1 to the same index k.
        rIn, rOut = [], []
        for eachitem in r[0]:
            rIn.append(eachitem / 2)
        for eachitem in r[1]:
            rOut.append(eachitem / 2)

        if ( len(rIn) == 2 and len(rOut) == 2) or (len(rIn) == 1 and len(rOut) == 1):
            print rIn, rOut
            # A 1x1 repeat is duplicated so the code below can treat it as 2x2.
            if (len(rIn) == 1 and len(rOut) == 1):
                rIn = [rIn[0], rIn[0]]
                rOut = [rOut[0], rOut[0]]

            # 1. Records reachable indices
            kkIn , kkOut = [], []
            for eachkk in rIn:
                kkIn.append(str(eachkk)+"_"+"in")

            for eachkk in rOut:
                kkOut.append(str(eachkk)+"_"+"out")

            abunGraphLib.markReachableIndices(G, Grev, kkIn, kkOut, N1)

            # 2. Marks inside nodes
            singleMissList, allPassList = abunGraphLib.markInsideNodes(G, kkIn, kkOut)
            for i in range(4):
                print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)

            # 3. Finds start/end of repeat
            myStartIndex, myEndIndex = abunGraphLib.markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
            print myStartIndex, myEndIndex

            # 4. Find repeat interior by shortest path joining S/E
            repeatPathway = abunGraphLib.markInterior(G , myStartIndex, myEndIndex, N1)
            print "repeatPathway", repeatPathway
            #checkPathLength(repeatPathway, G, N1, folderName)

            # 5. Find flanking region by shortest path search again
            flankingPathsList = abunGraphLib.markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
            print flankingPathsList

            # 6. Find associated reads by graph node query
            # Rebinding repeatList here does not disturb the enclosing loop,
            # which keeps iterating over the list object loaded from JSON.
            flankingList, repeatList = abunGraphLib.markAssociatedReads(G, singleMissList, allPassList)

            # ## Experimental
            repeatList = allPassList

            # ## End Experimental
            for eachlist in flankingList:
                print len(eachlist), len(repeatList)

            bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])

    # 7. Format return and move on to the phasing
    with open(folderName + repeatSpec, 'w') as outfile:
        json.dump(bigDumpList, outfile)
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec): print "resolvingTandem" ''' Input : repeat info Output : count, join. Algorithm: 1. Find loops 2. Form repeat 3. Form chain of repeat copies back to back 4. Align reads 5. Calculate extra bases beyond flanking region 6. Calculate count 7. Join the contigs ''' # 0 ) Load all the data thres = 5 G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta") N1 = len(lenDicCC) maxDuplicate = 10 repeatTempFilename = "tandemRepeatTemplate.fasta" mummerFile = "myTandemRepeatTemplate" myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta") lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta") header = optTypeFileHeader + "RR" dataListRR = alignerRobot.extractMumData(folderName, header + "Out") dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR) dataListRRDic = {} for eachitem in dataListRR: if eachitem[2] < thres: dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4] header = optTypeFileHeader + "CR" lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) dataListCR = alignerRobot.extractMumData(folderName, header + "Out") dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR) dataListCRDic = {} for eachitem in dataListCR: if eachitem[2] < thres: dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4] print dataListCRDic json_data = open(folderName + repeatSpec, 'r') loadData = json.load(json_data) contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename+"_Double.fasta") readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta") happyTandemList = {} for eachrepProfile in loadData: # 1) startContig = eachrepProfile[-1][0][0] isTerminate, returnPathList = DFSwithPath(G, 
G.graphNodesList[startContig], [startContig], N1, False) # 2) if isTerminate: v = returnPathList[-1] i =0 tandemPath = [] while i < len(returnPathList): if returnPathList[i] == v: tandemPath = returnPathList[i:] i = len(returnPathList) i = i +1 print returnPathList print tandemPath # 3) [fix it when have time later ; to just use graph; bug at the min thing] repeatContent = "" for kk in range(len(tandemPath[0:-1])): eachitem = tandemPath[kk]- N1 nextitem = tandemPath[kk+1] - N1 readName = "Read" + str(eachitem/2) + "_" nextReadName = "Read" + str(nextitem/2) + "_" if eachitem %2 ==0 : readName = readName + "p" elif eachitem %2 ==1: readName = readName + "d" if nextitem %2 ==0 : nextReadName = nextReadName + "p" elif nextitem %2 ==1: nextReadName = nextReadName + "d" overlap = dataListRRDic[readName + ";" + nextReadName] print overlap repeatContent = repeatContent + myContigsDic[readName][0:-overlap] print "len(repeatContent)", len(repeatContent) fout = open(folderName + repeatTempFilename, 'w') fout.write(">RepeatSegment\n") repeatContentLarge = "" for i in range(maxDuplicate): fout.write(repeatContent) repeatContentLarge= repeatContentLarge + repeatContent fout.close() # 4) repeatReadList = eachrepProfile[1] myList= [] for eachitem in repeatReadList: readName = "Read" + str((eachitem- N1)/2) + "_" if eachitem %2 ==0 : readName = readName + "p" elif eachitem %2 ==1: readName = readName + "d" myList.append(readName) IORobot.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList) if True: alignerRobot.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta") dataList = alignerRobot.extractMumData(folderName, mummerFile+"Out") # 5) totalBasesMatch = 0 lrepeat = len(repeatContent) c = findCoverageFromRawData(folderName) # print "dataList[0]", dataList[0] dataList.sort(key = itemgetter(-1)) for key, values in groupby(dataList,itemgetter(-1)): maxValue = -1 for eachsub in values: if eachsub[5] > maxValue: 
maxValue = eachsub[5] #print key, maxValue totalBasesMatch = totalBasesMatch + maxValue print c, lrepeat, totalBasesMatch ct = totalBasesMatch*1.0/(c*lrepeat) print "BIG NUMBER of THE DAY: ", ct # 6) # a) find the starting point startContig = eachrepProfile[-1][0][0] firstRead = eachrepProfile[-1][0][1]-N1 contigName = "Contig"+ str(startContig/2) if startContig %2 == 0: contigName = contigName + "_p" elif startContig%2 ==1: contigName = contigName + "_d" readName = "Read"+ str(firstRead/2) if firstRead %2 == 0: readName = readName + "_p" elif firstRead%2 ==1: readName = readName + "_d" overlapFirst = dataListCRDic[contigName+";"+readName] tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName] f1 = open(folderName + "firstOverlap.fasta", 'w') f1.write(">combined\n") f1.write(tmpCombine) f1.close() if True: alignerRobot.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta") dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap"+"Out") dataList.sort(key = itemgetter(0)) maxVal = -1 maxItm = [] for eachi in dataList: if eachi[5] > maxVal: maxVal = eachi[5] maxItm = eachi print maxItm if len(maxItm) > 0 : repeatStart = maxItm[0] contigEnd = maxItm[2] else: repeatStart = 0 contigEnd = -1 # b) format return : prepare the repeat template print "ct*lrepeat", int(repeatStart + ct*lrepeat) print "repeatStart", repeatStart happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)] contigsTmp[contigName] = tmpCombine[0:contigEnd] print "len(contigsTmp[contigName])", len(contigsTmp[contigName]) print "len(happyTandemList[contigName])", len(happyTandemList[contigName]) # 7) Combine all the repeat information and do the join leaderList = [i for i in range(len(contigsTmp))] for eachrepProfile in loadData: startContig = eachrepProfile[-1][0][0] endContig = eachrepProfile[-1][-1][-1] leaderContig = leaderList[startContig] leaderName = parseIDToName(leaderContig) endName = 
parseIDToName(endContig) startName = parseIDToName(startContig) contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName] if endContig != leaderContig: contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName] contigsTmp[endName] = "" leaderList[endContig] = leaderContig leaderAgg = [[] for i in range(len(leaderList))] for i in range(len(leaderList)): leaderAgg[leaderList[i]].append(i) checkingList = [False for i in range(N1)] fout = open(folderName + "tademResolved.fasta", 'w') counter = 0 for eachcontig in contigsTmp: id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C') if checkingList[id/2] == False: fout.write(">Segkk"+str(counter)+ "\n") fout.write(contigsTmp[eachcontig]) counter = counter + 1 for eachkk in leaderAgg[leaderList[id]]: checkingList[eachkk/2] = True fout.close()
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName, needAlignment=True): ''' Input : all_associated_reads.fasta, improved3.fasta Output : (G) String Graph linking the reads and contigs Algorithm: a) Form double reads and contigs V b) Mummer the data and extract dataList three times V c) Use the subroutine to output a graph V d) Output the graph to a file phasing_String_graph.graph V ''' G = [] IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig") IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads") header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta" #if needAlignment: # alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile) if needAlignment: alignerRobot.useMummerAlignBatch( mummerLink, folderName, [[header, referenceFile, queryFile, ""]], houseKeeper.globalParallel) lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") dataListCC = alignerRobot.extractMumData(folderName, header + "Out") dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC) header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta" lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta") if not abunHouseKeeper.abunGlobalRRDisable: if needAlignment: alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header) dataListRR = alignerRobot.extractMumData(folderName, header + "Out") dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR) dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR) else: dataListRR = [] header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta" if needAlignment: 
alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header) lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) dataListCR = alignerRobot.extractMumData(folderName, header + "Out") dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR) numberOfNodes = len(lenDicCR) G = graphLib.seqGraph(numberOfNodes) N1, N2 = len(lenDicCC), len(lenDicRR) print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes ''' e.g. of dataListCC[0], dataListRR[0], dataListCR[0] [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d'] [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p'] [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d'] ''' addDataToList(dataListCC, G, 0, 0, 'C', 'C') addDataToList(dataListRR, G, N1, N1, 'R', 'R') addDataToList(dataListCR, G, 0, N1, 'C', 'R') Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1) Gnew.saveToFile(folderName, graphName) print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
import matplotlib.pyplot as plt from finisherSCCoreLib import IORobot lenDic = {} coverageDic = {} lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta") f = open("/Users/kakitlam/Documents/abundata", 'r') tmp = f.readline() while len(tmp) > 0: if len(tmp) > 10: myitem = tmp[0:-1].split() coverageDic[myitem[0]] = float(myitem[1]) tmp = f.readline() f.close() myList = [] baseCt = {} for eachitem in lenDic: myList.append(lenDic[eachitem]*coverageDic[eachitem]) baseCt[eachitem] = lenDic[eachitem]*coverageDic[eachitem] for eachitem in lenDic : print eachitem, baseCt[eachitem]
def generateAbundanceGraph(folderName, mummerLink): print "generateAbundanceGraph" """ 1. Find your favorite mappers to map read back a. MUMmer, Bowtie, bbmap, any that works V b. And then write a short parser to parse the results V """ numberOfFiles = 20 workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) """ "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", "fromMumRefine" + indexOfMum """ outputName, referenceName, queryName, specialName = ( "outAbun" + indexOfMum, "improved3.fasta", "raw_reads.part-" + indexOfMum + ".fasta", "outAbun" + indexOfMum, ) workerList.append([outputName, referenceName, queryName, specialName]) if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False) """ command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta" os.system(command) command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum os.system(command) """ dataList = [] for i in range(1, 1 + numberOfFiles): if i < 10: indexOfMum = "0" + str(i) else: indexOfMum = str(i) dataList = dataList + alignerRobot.extractMumData(folderName, "outAbun" + str(indexOfMum) + "Out") """ 2. Calculate count on the abundances a. Aggregate by taking average [put weights on bin along contigs] b. 
Inheritance and a subclass """ lenDic = IORobot.obtainLength(folderName, "improved3.fasta") readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta") myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])] thres = 30 lenSum = 0 extraDataList = [] print "len(dataList)", len(dataList) if not abunHouseKeeper.abunGlobalAvoidrefine: myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True) extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut") else: extraDataList = [] dataList = dataList + extraDataList myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False) with open(folderName + "myCountDic.json", "w") as f: json.dump(myCountDic, f) return myCountDic
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun ): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i , adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem" : newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) G2 = abunGraphLib.seqGraphWt(N1 * 2) for i in range(N1): for j in newAdjacencyList[i]: G2.insertEdge(2 * i, 2 * j + 1, 1) G2.insertEdge(2 * j + 1, 2 * i, 1) clusters = G2.findConnectedComponents() repeatList = [] for eachitem in clusters: leftList, rightList = [], [] for eachsubitem in eachitem: if eachsubitem % 2 == 0 : leftList.append(eachsubitem) else: rightList.append(eachsubitem) repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)]) with open(folderName + repeatFilename, 'w') as outfile: json.dump(repeatList, outfile) json_data = open(folderName + repeatFilename, 'r') loadData = json.load(json_data) assert(loadData == repeatList)
def performPhasing(folderName, mummerLink): print "performPhasing" ''' 1. Interface from alignmentBridge.py : shortToLongMap = formRelatedMap(f2, noisyReads, currentNode, indelRobot, toProcessList) cleaner.cleaning([noisyReads,noisyReads] ,shortToLongMap, toProcessList,indelRobot, "init") in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads],shortToLongMap, toProcessList,indelRobot, "vote") extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList,indelRobot,longReadToUse, True) 2. Format of input data data : bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList]) 3. IO : a) Input : repeatSpecification.txt, phasingSeedName_Double.fasta, graph G b) Output : improved4.fasta 3. Algorithm: a) reformatNoisyReads b) reformatToProcessList c) formShortToLongMapping ''' json_data = open(folderName + 'repeatSpecification.txt', 'r') loadData = json.load(json_data) G = graphLib.seqGraph(0) G.loadFromFile(folderName, "phaseStringGraph1") lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta") lenDicCC = IORobot.obtainLength(folderName, "improved3_Double.fasta") N1 = len(lenDicCC) lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) loadData = filterReverseComp(loadData, N1) toPhaseList = [] if True: for eachitem in loadData: # print eachitem flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3] noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1) toProcessList = reformatToProcessList(folderName , flankingList, repeatList, dicFromOriginal, N1) shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal, dicToOriginal, lenDicCR, N1) indelRobot = createIndelRobot(folderName) cleaner.cleaning([noisyReads, noisyReads] , shortToLongMap, toProcessList, indelRobot, "init") in1List, in2List, out1List, out2List, commonList, 
longReadToUse = cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "vote") extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList, indelRobot, longReadToUse, True) if extendResult != -1: print "extendResult: ", extendResult toPhaseList.append(eachitem + [extendResult]) with open(folderName + 'toPhaseList.txt', 'w') as outfile: json.dump(toPhaseList, outfile) json_data = open(folderName + 'toPhaseList.txt', 'r') toPhaseList = json.load(json_data) outputResults(folderName, mummerLink, toPhaseList, N1, G)
def resolvingTandem( folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec ): print "resolvingTandem" """ Input : repeat info Output : count, join. Algorithm: 1. Find loops 2. Form repeat 3. Form chain of repeat copies back to back 4. Align reads 5. Calculate extra bases beyond flanking region 6. Calculate count 7. Join the contigs """ # 0 ) Load all the data thres = 5 G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") N1 = len(lenDicCC) maxDuplicate = 10 repeatTempFilename = "tandemRepeatTemplate.fasta" mummerFile = "myTandemRepeatTemplate" myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta") lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta") header = optTypeFileHeader + "RR" dataListRR = alignerRobot.extractMumData(folderName, header + "Out") dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR) dataListRRDic = {} for eachitem in dataListRR: if eachitem[2] < thres: dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4] header = optTypeFileHeader + "CR" lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) dataListCR = alignerRobot.extractMumData(folderName, header + "Out") dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR) dataListCRDic = {} for eachitem in dataListCR: if eachitem[2] < thres: dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4] print dataListCRDic json_data = open(folderName + repeatSpec, "r") loadData = json.load(json_data) contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta") readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta") happyTandemList = {} for eachrepProfile in loadData: # 1) startContig = eachrepProfile[-1][0][0] isTerminate, returnPathList = 
DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False) # 2) if isTerminate: v = returnPathList[-1] i = 0 tandemPath = [] while i < len(returnPathList): if returnPathList[i] == v: tandemPath = returnPathList[i:] i = len(returnPathList) i = i + 1 print returnPathList print tandemPath # 3) [fix it when have time later ; to just use graph; bug at the min thing] repeatContent = "" for kk in range(len(tandemPath[0:-1])): eachitem = tandemPath[kk] - N1 nextitem = tandemPath[kk + 1] - N1 readName = "Read" + str(eachitem / 2) + "_" nextReadName = "Read" + str(nextitem / 2) + "_" if eachitem % 2 == 0: readName = readName + "p" elif eachitem % 2 == 1: readName = readName + "d" if nextitem % 2 == 0: nextReadName = nextReadName + "p" elif nextitem % 2 == 1: nextReadName = nextReadName + "d" overlap = dataListRRDic[readName + ";" + nextReadName] print overlap repeatContent = repeatContent + myContigsDic[readName][0:-overlap] print "len(repeatContent)", len(repeatContent) fout = open(folderName + repeatTempFilename, "w") fout.write(">RepeatSegment\n") repeatContentLarge = "" for i in range(maxDuplicate): fout.write(repeatContent) repeatContentLarge = repeatContentLarge + repeatContent fout.close() # 4) repeatReadList = eachrepProfile[1] myList = [] for eachitem in repeatReadList: readName = "Read" + str((eachitem - N1) / 2) + "_" if eachitem % 2 == 0: readName = readName + "p" elif eachitem % 2 == 1: readName = readName + "d" myList.append(readName) IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList) if True: alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta") dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out") # 5) totalBasesMatch = 0 lrepeat = len(repeatContent) c = findCoverageFromRawData(folderName) # print "dataList[0]", dataList[0] dataList.sort(key=itemgetter(-1)) for key, values in groupby(dataList, itemgetter(-1)): maxValue = -1 for eachsub 
in values: if eachsub[5] > maxValue: maxValue = eachsub[5] # print key, maxValue totalBasesMatch = totalBasesMatch + maxValue print c, lrepeat, totalBasesMatch ct = totalBasesMatch * 1.0 / (c * lrepeat) print "BIG NUMBER of THE DAY: ", ct # 6) # a) find the starting point startContig = eachrepProfile[-1][0][0] firstRead = eachrepProfile[-1][0][1] - N1 contigName = "Contig" + str(startContig / 2) if startContig % 2 == 0: contigName = contigName + "_p" elif startContig % 2 == 1: contigName = contigName + "_d" readName = "Read" + str(firstRead / 2) if firstRead % 2 == 0: readName = readName + "_p" elif firstRead % 2 == 1: readName = readName + "_d" overlapFirst = dataListCRDic[contigName + ";" + readName] tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName] f1 = open(folderName + "firstOverlap.fasta", "w") f1.write(">combined\n") f1.write(tmpCombine) f1.close() if True: alignerRobot.useMummerAlign( mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta" ) dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out") dataList.sort(key=itemgetter(0)) maxVal = -1 maxItm = [] for eachi in dataList: if eachi[5] > maxVal: maxVal = eachi[5] maxItm = eachi print maxItm if len(maxItm) > 0: repeatStart = maxItm[0] contigEnd = maxItm[2] else: repeatStart = 0 contigEnd = -1 # b) format return : prepare the repeat template print "ct*lrepeat", int(repeatStart + ct * lrepeat) print "repeatStart", repeatStart happyTandemList[contigName] = repeatContentLarge[repeatStart : int(repeatStart + ct * lrepeat)] contigsTmp[contigName] = tmpCombine[0:contigEnd] print "len(contigsTmp[contigName])", len(contigsTmp[contigName]) print "len(happyTandemList[contigName])", len(happyTandemList[contigName]) # 7) Combine all the repeat information and do the join leaderList = [i for i in range(len(contigsTmp))] for eachrepProfile in loadData: startContig = eachrepProfile[-1][0][0] endContig = eachrepProfile[-1][-1][-1] leaderContig = 
leaderList[startContig] leaderName = parseIDToName(leaderContig) endName = parseIDToName(endContig) startName = parseIDToName(startContig) contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName] if endContig != leaderContig: contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName] contigsTmp[endName] = "" leaderList[endContig] = leaderContig leaderAgg = [[] for i in range(len(leaderList))] for i in range(len(leaderList)): leaderAgg[leaderList[i]].append(i) checkingList = [False for i in range(N1)] fout = open(folderName + "tademResolved.fasta", "w") counter = 0 for eachcontig in contigsTmp: id = abunHouseKeeper.parseEdgeNameToID(eachcontig, "C") if checkingList[id / 2] == False: fout.write(">Segkk" + str(counter) + "\n") fout.write(contigsTmp[eachcontig]) counter = counter + 1 for eachkk in leaderAgg[leaderList[id]]: checkingList[eachkk / 2] = True fout.close()
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i, adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem": newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) G2 = abunGraphLib.seqGraphWt(N1 * 2) for i in range(N1): for j in newAdjacencyList[i]: G2.insertEdge(2 * i, 2 * j + 1, 1) G2.insertEdge(2 * j + 1, 2 * i, 1) clusters = G2.findConnectedComponents() repeatList = [] for eachitem in clusters: leftList, rightList = [], [] for eachsubitem in eachitem: if eachsubitem % 2 == 0: leftList.append(eachsubitem) else: rightList.append(eachsubitem) repeatList.append([ abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList) ]) with open(folderName + repeatFilename, 'w') as outfile: json.dump(repeatList, outfile) json_data = open(folderName + repeatFilename, 'r') loadData = json.load(json_data) assert (loadData == repeatList)
def getAllAssociatedReads(folderName, mummerLink,forFastaName): ''' Input : relatedReads.fasta, raw_reads.fasta Output : all_associated_reads.fasta Algorithm : a) Get all the associated reads b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total i) Align the raws and tmp_seedReads ii) Put the new reads into the SeedReads ''' header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta" gapFiller.formRelatedReadsFile(folderName, mummerLink, "improved3") command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile os.system(command) N = abunHouseKeeper.abunGlobalReadSearchDepth print "N: ", N if N >0 : for trial in range(N): print "trial", trial numberOfFiles = houseKeeper.globalParallelFileNum if True: workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) outputName, referenceName, queryName, specialName= header+indexOfMum, referenceFile, "raw_reads.part-"+ indexOfMum + ".fasta", header + indexOfMum workerList.append([outputName, referenceName, queryName, specialName]) alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False) dataList = [] for i in range(1, 1+numberOfFiles): if i < 10: indexOfMum = "0" + str(i) else: indexOfMum = str(i) dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out") filterList = [] lenDicRR = IORobot.obtainLength(folderName, queryFile) print "len(dataList)", len(dataList) for eachitem in dataList: if checkSatisfy(eachitem, lenDicRR): filterList.append(eachitem) filterList.sort(key=itemgetter(-1)) newReads = [] for key, items in groupby(filterList, itemgetter(-1)): newReads.append(key) f = open(folderName + forFastaName + ".txt", 'w') for eachitem in newReads: f.write(eachitem + "\n") f.close() command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + 
folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta" os.system(command) else: os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
def generateAbundanceGraph(folderName, mummerLink): print "generateAbundanceGraph" ''' 1. Find your favorite mappers to map read back a. MUMmer, Bowtie, bbmap, any that works V b. And then write a short parser to parse the results V ''' numberOfFiles = 20 workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) ''' "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", "fromMumRefine" + indexOfMum ''' outputName, referenceName, queryName, specialName= "outAbun"+indexOfMum, "improved3.fasta", "raw_reads.part-"+ indexOfMum + ".fasta", "outAbun" + indexOfMum workerList.append([outputName, referenceName, queryName, specialName]) if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False) ''' command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta" os.system(command) command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum os.system(command) ''' dataList = [] for i in range(1, 1+numberOfFiles): if i < 10: indexOfMum = "0" + str(i) else: indexOfMum = str(i) dataList = dataList+ alignerRobot.extractMumData(folderName, "outAbun"+ str(indexOfMum)+"Out") ''' 2. Calculate count on the abundances a. Aggregate by taking average [put weights on bin along contigs] b. 
Inheritance and a subclass ''' lenDic = IORobot.obtainLength(folderName, "improved3.fasta") readLenDic = IORobot.obtainLength(folderName , "raw_reads.fasta") myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])] thres = 30 lenSum = 0 extraDataList= [] print "len(dataList)", len(dataList) if not abunHouseKeeper.abunGlobalAvoidrefine: myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True) extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut" ) else: extraDataList = [] dataList = dataList + extraDataList myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink,False) with open(folderName + 'myCountDic.json', 'w') as f: json.dump(myCountDic, f) return myCountDic
import matplotlib.pyplot as plt from finisherSCCoreLib import IORobot lenDic = {} coverageDic = {} lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta") f = open("/Users/kakitlam/Documents/abundata", 'r') tmp = f.readline() while len(tmp) > 0: if len(tmp) > 10: myitem = tmp[0:-1].split() coverageDic[myitem[0]] = float(myitem[1]) tmp = f.readline() f.close() myList = [] baseCt = {} for eachitem in lenDic: myList.append(lenDic[eachitem] * coverageDic[eachitem]) baseCt[eachitem] = lenDic[eachitem] * coverageDic[eachitem] for eachitem in lenDic: print eachitem, baseCt[eachitem] for eachitem in lenDic: print eachitem, lenDic[eachitem]
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName): ''' Input : all_associated_reads.fasta, improved3.fasta Output : (G) String Graph linking the reads and contigs Algorithm: a) Form double reads and contigs V b) Mummer the data and extract dataList three times V c) Use the subroutine to output a graph V d) Output the graph to a file phasing_String_graph.graph V ''' G = [] IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig") IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads") header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta" if True: alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile) lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") dataListCC = alignerRobot.extractMumData(folderName, header + "Out") dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC) header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta" lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta") if not abunHouseKeeper.abunGlobalRRDisable: if True: alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header) dataListRR = alignerRobot.extractMumData(folderName, header + "Out") dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR) for eachitem in dataListRR: if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p": print "debug", eachitem if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p": print "debug", eachitem dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR) else: dataListRR = [] header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta" if 
True: alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header) #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile) lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) dataListCR = alignerRobot.extractMumData(folderName, header + "Out") dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR) numberOfNodes = len(lenDicCR) G = graphLib.seqGraph(numberOfNodes) N1, N2 = len(lenDicCC), len(lenDicRR) print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes ''' e.g. of dataListCC[0], dataListRR[0], dataListCR[0] [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d'] [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p'] [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d'] ''' # print dataListCC[0] # print dataListRR[0] # print dataListCR[0] # for eachitem in dataListCC: # print eachitem addDataToList(dataListCC, G, 0, 0, 'C', 'C') # for eachitem in dataListRR[0:10]: # print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]] addDataToList(dataListRR, G, N1, N1, 'R', 'R') addDataToList(dataListCR, G, 0, N1, 'C', 'R') # G.reportEdge() G.saveToFile(folderName, graphName) checkGraphLength(G, N1, lenDicRR) # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes) print "len(G.graphNodesList)", len(G.graphNodesList)