def formConfirmReadResolve(folderName, inList, outList, G, Grev, N1): #print "formConfirmReadResolve" resolvedList = [] confirmingReadList = [] brLFlankList = [] brRFlankList = [] ### Find possible candidate reads print "inList , outList formConfirmReadResolve()", inList, outList for eachin in inList: for eachout in outList: pathList = abunGraphLib.findAllPathK(eachin, eachout, G, 3) for path in pathList: if len(path) == 3 and path[1] >= N1: R = path[1] confirmingReadList.append(R) brLFlankList.append([eachin, R]) brRFlankList.append([eachout, R]) ### Filter simple false cases toUseReadDic = {} confirmingReadList.sort() for key, items in groupby(confirmingReadList): toUseReadDic[str(key)] = True newbrLFlankList = abunHouseKeeper.getDistinct(brLFlankList) newbrLFlankList.sort(key=itemgetter(1)) for key, items in groupby(newbrLFlankList, itemgetter(1)): mylist = list(items) if len(mylist) > 1: toUseReadDic[str(key)] = False newbrRFlankList = abunHouseKeeper.getDistinct(brRFlankList) newbrRFlankList.sort(key=itemgetter(1)) for key, items in groupby(newbrRFlankList, itemgetter(1)): mylist = list(items) if len(mylist) > 1: toUseReadDic[str(key)] = False finalSearchReadList = [] for eachitem in toUseReadDic: if toUseReadDic[eachitem] == True: finalSearchReadList.append(int(eachitem)) ### Check paths to confirm all false cases for eachR in finalSearchReadList: l1 = abunGraphLib.findAllReachable(eachR, N1, G) l2 = abunGraphLib.findAllReachable(eachR, N1, Grev) l1Distinct = abunHouseKeeper.getDistinct(l1) l2Distinct = abunHouseKeeper.getDistinct(l2) if len(l1Distinct) == 1 and len(l2Distinct) == 1: c1, c2 = l1Distinct[0], l2Distinct[0] resolvedList.append([c2, c1]) return resolvedList
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename): ### Transitive reduction and remove double pointers N1 = len(myCountDic) * 2 print "N1", N1 kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) adj = [[] for i in range(N1)] for i in range(N1): tmpList = abunGraphLib.findAllReachable(i, N1, G) for j in tmpList: if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres: adj[i].append(j) ### Filter adaptor skipped case adaptorPair = [] for i in range(len(adj)): if i % 2 == 0: if i + 1 in adj[i]: adj[i].remove(i + 1) adaptorPair.append([i, i + 1]) elif i % 2 == 1: if i - 1 in adj[i]: adj[i].remove(i - 1) adaptorPair.append([i, i - 1]) Gnew = abunGraphLib.seqGraphDynamic(N1) for i in range(N1): for j in adj[i]: Gnew.insertEdge(i, j, 1997) for eachpair in adaptorPair: u, v = eachpair[0], eachpair[1] for x in Gnew.graphNodesList[u].listOfPrevNodes: xIndex = x[0] Gnew.removeEdge(xIndex, v) for y in Gnew.graphNodesList[v].listOfNextNodes: yIndex = y[0] Gnew.removeEdge(u, yIndex) ### Trying out the new component import toCondenseFixer Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName) Gnew.symGraph() ### End filter adaptor skipped case if abunHouseKeeper.abunGlobalSplitParameterRobot.runGraphSurgery: Gnew.initAdv() if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunCondenseRemove: Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename) if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunDoubltPtr: Gnew.doubleEdgeReduction() if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunTransitive: Gnew.transitiveReduction( folderName, mummerLink, contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta", G ) Gnew.condense() Gnew.findAdjList() else: Gnew.initAdv() Gnew.condense() Gnew.findAdjList() return Gnew
def singleGapLookUp(eachmatchpair, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename): print eachmatchpair leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0], eachmatchpair[-1], 0, 0, "" succReadsList = abunGraphLib.findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1) succReadsList = [] G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) allPaths = abunGraphLib.findAllPathK(leftCtgIndex, rightCtgIndex, G, 5) # shuffle(allPaths) print "allPaths", allPaths possibleList = [] for p in allPaths: noContig = True for pp in p[1:-1]: if pp < N1: noContig = False if noContig == True: possibleList.append(p) print "possibleList", possibleList minListLen = 1000 for p in possibleList: if len(p) < minListLen: succReadsList = p minListLen = len(p) if len(succReadsList) > 0: succReadsList.pop(0) succReadsList.pop(-1) else: print "interesting item for future study" print "succReadsList", succReadsList if len(succReadsList) == 0: contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1) leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1) rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) overlap = IORobot.alignWithName( leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap contig : ", overlap leftEnd = len(leftSeg) - overlap[0] middleContent = "" else: contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1) print contigName leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) readName = abunHouseKeeper.parseIDToName(succReadsList[0], "R", N1) print readName rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) overlap = IORobot.alignWithName( leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) 
print "overlap start read : ", overlap leftEnd = len(leftSeg) - overlap[0] middleContent = "" for i in range(len(succReadsList) - 1): readName = abunHouseKeeper.parseIDToName(succReadsList[i], "R", N1) leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) readName = abunHouseKeeper.parseIDToName(succReadsList[i + 1], "R", N1) rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) overlap = IORobot.alignWithName( leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap middle read : ", overlap middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]] readName = abunHouseKeeper.parseIDToName(succReadsList[-1], "R", N1) leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1) rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) overlap = IORobot.alignWithName( leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap end read : ", overlap middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]] return [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
def formConfirmReadResolve(folderName, inList, outList, G, Grev, N1): # print "formConfirmReadResolve" resolvedList = [] confirmingReadList = [] brLFlankList = [] brRFlankList = [] ### Find possible candidate reads print "inList , outList formConfirmReadResolve()", inList, outList for eachin in inList: for eachout in outList: pathList = abunGraphLib.findAllPathK(eachin, eachout, G, 3) for path in pathList: if len(path) == 3 and path[1] >= N1: R = path[1] confirmingReadList.append(R) brLFlankList.append([eachin, R]) brRFlankList.append([eachout, R]) ### Filter simple false cases toUseReadDic = {} confirmingReadList.sort() for key, items in groupby(confirmingReadList): toUseReadDic[str(key)] = True newbrLFlankList = abunHouseKeeper.getDistinct(brLFlankList) newbrLFlankList.sort(key=itemgetter(1)) for key, items in groupby(newbrLFlankList, itemgetter(1)): mylist = list(items) if len(mylist) > 1: toUseReadDic[str(key)] = False newbrRFlankList = abunHouseKeeper.getDistinct(brRFlankList) newbrRFlankList.sort(key=itemgetter(1)) for key, items in groupby(newbrRFlankList, itemgetter(1)): mylist = list(items) if len(mylist) > 1: toUseReadDic[str(key)] = False finalSearchReadList = [] for eachitem in toUseReadDic: if toUseReadDic[eachitem] == True: finalSearchReadList.append(int(eachitem)) ### Check paths to confirm all false cases for eachR in finalSearchReadList: l1 = abunGraphLib.findAllReachable(eachR, N1, G) l2 = abunGraphLib.findAllReachable(eachR, N1, Grev) l1Distinct = abunHouseKeeper.getDistinct(l1) l2Distinct = abunHouseKeeper.getDistinct(l2) if len(l1Distinct) == 1 and len(l2Distinct) == 1: c1, c2 = l1Distinct[0], l2Distinct[0] resolvedList.append([c2, c1]) return resolvedList
def continuousIntegration():
    """Scratch driver: a series of ad-hoc experiments, each behind an ``if`` switch.

    Every experiment except the last is wrapped in ``if False:`` and therefore
    never runs.  The only enabled statement is the final ``if True:`` block,
    which calls ``nonRedundantResolver.removeRedundantWithFile`` with
    hard-coded folder, tool path and file-name arguments.

    NOTE(review): this function was recovered from a whitespace-collapsed
    source, so the nesting of the disabled blocks below is a best guess
    (in particular which ``if False:`` blocks are siblings and which are
    nested).  Because they are all disabled this does not affect what runs,
    provided the final ``if True:`` really is at function level -- confirm
    against the original file.
    """
    # Disabled: small synthetic graph passed to abunGraphLib.BFS_revisit.
    if False:
        G = graphLib.seqGraph(10)
        for i in range(5):
            G.insertEdge(i,i+1,1997)
            G.insertEdge(i,i+2, 1997)
        resultList = abunGraphLib.BFS_revisit(1,3,G,1)
        print "resultList", resultList

    # Disabled: abunGraphLib.formPathSeq on hard-coded path lists.
    if False :
        folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile = \
            "Apr10Test/", "/usr/bin/", [[1, 486, 217], [1, 8642, 217], [1, 13465, 217]], [[1, 486, 217]], "improved3_Double.fasta", "phasingSeedName_Double.fasta"
        abunGraphLib.formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile)

    # Disabled: builds a reduced graph from "phaseStringGraph1" and compares
    # findAllPathK results on the reduced graph and on the loaded graph.
    # NOTE(review): reads folderName / contigFile, which are only assigned in
    # the disabled block above.
    if False:
        lenDic = IORobot.obtainLength(folderName , contigFile)
        N1 = len(lenDic)
        print "N1", N1
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "phaseStringGraph1")
        adj = [[] for i in range(N1)]
        for i in range(N1):
            adj[i] = abunGraphLib.findAllReachable(i, N1, G)
        Gnew = abunGraphLib.seqGraphDynamic(N1)
        for i in range(N1):
            for j in adj[i]:
                Gnew.insertEdge(i,j,1997)
        Gnew.initAdv()
        Gnew.doubleEdgeReduction()
        contigPaths = abunGraphLib.findAllPathK(1, 217, Gnew, 3)
        contigReadPaths = abunGraphLib.findAllPathK(1, 217, G, 5)
        print "contigPaths", contigPaths
        print "contigReadPaths", contigReadPaths
        Gnew.transitiveReduction()

    # Disabled: prints the result of abunGraphLib.decideCut.
    if False:
        toDelete = abunGraphLib.decideCut("Apr10Test/", "/usr/bin/")
        print toDelete

    # Disabled: inspects "xResolvedGraph" and prints lengths/counts for a
    # hard-coded list of node indices.
    if False:
        G = graphLib.seqGraph(0)
        G.loadFromFile("Apr10TestA/", "xResolvedGraph")
        if False:
            for i in range(len(G.graphNodesList)):
                v = G.graphNodesList[i]
                if len(v.nodeIndexList) > 0:
                    print i , v.listOfPrevNodes , v.listOfNextNodes
        G.reportEdge()
        lenDic = IORobot.obtainLength("Apr10TestA/", "improved3_Double.fasta")
        mylist = [401, 207, 405, 407, 344]
        json_data = open("Apr10TestA/" + "myCountDic.json", 'r')
        myCountDic = json.load(json_data)
        for x in mylist:
            # x/2 is integer division here (Python 2, int operands).
            print x, lenDic["Contig"+str(x/2)+"_p"], myCountDic["Segkk"+str(x/2)]

    # Disabled: compares the node count of "xResolvedGraph" minus the size of
    # mapDummyToRealDic with the number of entries in lenDic.
    if False:
        folderName = "Apr10TestA/"
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName , "xResolvedGraph")
        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)
        lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
        print len(G.graphNodesList)
        print len(mapDummyToRealDic)
        print "fake N1 , real N1 ", len(G.graphNodesList) - len(mapDummyToRealDic), len(lenDic)

    # Disabled: full abunSplitter run on a test folder.
    if False:
        abunSplitter.mainFlow("Apr10TestB/", "/usr/bin/")

    # Disabled: nonRedundantResolver.removeEmbedded on a test folder.
    if False:
        nonRedundantResolver.removeEmbedded("Apr10TestD/", "/usr/bin/")

    # Disabled: repeats the edge-building and partner-edge filtering steps that
    # graphSurgery performs, then counts nodes having exactly two
    # predecessors and two successors.
    if False:
        folderName, contigReadGraph = "Apr10TestA/", "phaseStringGraph1"
        G = graphLib.seqGraph(0)
        kthres, edgeThres = 3, 1
        G.loadFromFile(folderName, contigReadGraph)
        lenDic = IORobot.obtainLength(folderName , "improved3_Double.fasta")
        N1 = len(lenDic)
        adj = [[] for i in range(N1)]
        for i in range(N1):
            tmpList = abunGraphLib.findAllReachable(i, N1, G)
            for j in tmpList:
                if len(abunGraphLib.findAllPathK(i,j,G,kthres)) >= edgeThres:
                    adj[i].append(j)
            #print i, adj[i]
        ### Filter adaptor skipped case
        adaptorPair = []
        for i in range(len(adj)):
            if i % 2 == 0:
                if i + 1 in adj[i]:
                    adj[i].remove(i+1)
                    adaptorPair.append([i, i+1])
            elif i % 2 ==1:
                if i-1 in adj[i] :
                    adj[i].remove(i-1)
                    adaptorPair.append([i, i-1])
        Gnew = abunGraphLib.seqGraphDynamic(N1)
        for i in range(N1):
            for j in adj[i]:
                Gnew.insertEdge(i,j,1997)
        for eachpair in adaptorPair:
            u, v = eachpair[0], eachpair[1]
            for x in Gnew.graphNodesList[u].listOfPrevNodes:
                xIndex = x[0]
                Gnew.removeEdge(xIndex, v)
            for y in Gnew.graphNodesList[v].listOfNextNodes:
                yIndex = y[0]
                Gnew.removeEdge(u, yIndex)
        #Gnew.reportEdge()
        count2 = 0
        for i in range(len(Gnew.graphNodesList)):
            if len(Gnew.graphNodesList[i].listOfPrevNodes) == 2 and len(Gnew.graphNodesList[i].listOfNextNodes) == 2:
                count2 = count2 + 1
                print str(i)+"{color:red}"
        print "count2, ", count2
        ### End filter adaptor skipped case

    # The one enabled experiment.
    if True:
        nonRedundantResolver.removeRedundantWithFile("May11TestB/" , "/usr/bin/", "abun", "abunDebug", "abunNoEmbed")
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename): ### Transitive reduction and remove double pointers N1 = len(myCountDic) * 2 print "N1", N1 kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) adj = [[] for i in range(N1)] for i in range(N1): tmpList = abunGraphLib.findAllReachable(i, N1, G) for j in tmpList: if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres: adj[i].append(j) ### Filter adaptor skipped case adaptorPair = [] for i in range(len(adj)): if i % 2 == 0: if i + 1 in adj[i]: adj[i].remove(i + 1) adaptorPair.append([i, i + 1]) elif i % 2 == 1: if i - 1 in adj[i]: adj[i].remove(i - 1) adaptorPair.append([i, i - 1]) Gnew = abunGraphLib.seqGraphDynamic(N1) for i in range(N1): for j in adj[i]: Gnew.insertEdge(i, j, 1997) for eachpair in adaptorPair: u, v = eachpair[0], eachpair[1] for x in Gnew.graphNodesList[u].listOfPrevNodes: xIndex = x[0] Gnew.removeEdge(xIndex, v) for y in Gnew.graphNodesList[v].listOfNextNodes: yIndex = y[0] Gnew.removeEdge(u, yIndex) Gnew.reportEdge() ### Trying out the new component import toCondenseFixer Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName) Gnew.symGraph() #Gnew.reportEdge() ### End filter adaptor skipped case if abunHouseKeeper.abunGlobalSplitParameterRobot.runGraphSurgery: Gnew.initAdv() if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunCondenseRemove: Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename) if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunDoubltPtr: Gnew.doubleEdgeReduction() if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunTransitive: Gnew.transitiveReduction(folderName, mummerLink, contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta", G) Gnew.condense() Gnew.findAdjList() else: Gnew.initAdv() Gnew.condense() Gnew.findAdjList() return Gnew