def abunSplit(folderName, mummerLink, myCountDic):
    '''
    Split repeat regions using abundance (coverage) information.

    Input : repeatSpecification.txt , myCountDic.json, improved3.fasta, raw_reads.fasta
    Output : abunsplit.fasta
    Algorithm :
        1. Load data from various sources [various json files]
        2. For each repeat interior:
            a) identify the abundances associated with in/out contigs
            b) perform a split and record the split
        3. Use split results to generate contigs [may already exist in newPhasing.py]
            a) use a graph to capture the split results
            b) use reads to fill in any gaps
            c) read out the contigs

    NOTE(review): this file contains several duplicate definitions of
    abunSplit; at import time the last one defined wins.
    '''
    # Fix: use a context manager so the file handle is closed instead of leaked.
    with open(folderName + "phaseRepeat.txt", 'r') as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    # Two graph nodes per contig entry — presumably forward/reverse strand
    # "doubles" (cf. the *_Double.fasta files used elsewhere); confirm.
    N1 = len(myCountDic) * 2
    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        # Match incoming to outgoing contigs by abundance, then record the
        # resulting pairings as graph edges.
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                  "improved3_Double.fasta")
def abunSplit(folderName, mummerLink, myCountDic):
    """
    Split repeat regions using abundance (coverage) information.

    Input : repeatSpecification.txt , myCountDic.json, improved3.fasta, raw_reads.fasta
    Output : abunsplit.fasta
    Algorithm :
        1. Load data from various sources [various json files]
        2. For each repeat interior:
            a) identify the abundances associated with in/out contigs
            b) perform a split and record the split
        3. Use split results to generate contigs [may already exist in newPhasing.py]
            a) use a graph to capture the split results
            b) use reads to fill in any gaps
            c) read out the contigs

    NOTE(review): duplicate definition — an identical 3-argument abunSplit
    appears earlier in this file; the last definition wins at import time.
    """
    # Fix: context manager closes the file instead of leaking the handle.
    with open(folderName + "phaseRepeat.txt", "r") as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    # Two graph nodes per contig entry — presumably strand "doubles"; confirm.
    N1 = len(myCountDic) * 2
    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        # Pair in/out contigs whose abundances match and add them as edges.
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                  "improved3_Double.fasta")
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename): ''' Input : repeatSpecification.txt , myCountDic.json, improved3.fasta, raw_reads.fasta Output : abunsplit.fasta Algorithm : 1. Load data from various sources [various json files] 2. For each repeat interior: a) identify the abundances associated with in/out contigs b) perform a split and record the split 3. Use split results to generate contigs [may already exist in newPhasing.py ] a) use a graph to capture the split results b) use reads to fill in any gaps c) read out the contigs ''' json_data = open(folderName + "phaseRepeat.txt", 'r') repeatPairs = json.load(json_data) repeatPairs = obtainNonEmpty(repeatPairs) N1 = len(myCountDic) * 2 print "N1", N1 G = graphLib.seqGraph(N1) gapContentLookUpList = [] for eachitem in repeatPairs: inList, outList = eachitem[0], eachitem[1] resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1) print "resolvedList", resolvedList gapContentLookUpList += generateGapContentLookup( folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename) addEdges(G, resolvedList) gapContentLookUpDic = {} gapContentLookUpList.sort() for eachitem in gapContentLookUpList: gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [ eachitem[2], eachitem[3], eachitem[4] ] print eachitem[2:4], len(eachitem[4]) # some how change ASplitter here by appending necessary information G.condense() IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", contigFilename + "_Double.fasta", gapContentLookUpDic)
def XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1, mummerLink):
    '''
    Resolve X-nodes of Gnew (nodes with multiple in/out branches), then save
    the resolved graph and its bookkeeping dictionaries to disk.

    Side effects: mutates Gnew in place; writes xResolvedGraph,
    mapDummyToRealDic.json and xResolvedSimplifiedList.json under folderName.
    NOTE(review): also mutates global thresholds on
    abunHouseKeeper.abunGlobalSplitParameterRobot — later callers see the
    overridden AbunLower/AbunUpper/BRThres values.
    NOTE(review): a 6-argument duplicate XResolution (without the EM branch)
    exists elsewhere in this file; the last definition wins at import time.
    '''
    if abunHouseKeeper.abunGlobalSplitParameterRobot.runXResolve:
        # Reload the raw contig-read graph and its reverse for traversals.
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, contigReadGraph)
        Grev = abunGraphLib.formReverseGraphFast(G)
        if not abunHouseKeeper.abunGlobalRunEM:
            # Two candidate resolutions: abundance-based and branch-read-based.
            xResolvedList, brResolvedListforX = [[] for i in range(N1)], [[] for i in range(N1)]
            if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunAbunX:
                # X-specific thresholds (if set) override the global ones.
                if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerX > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLower = abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerX
                if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperX > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpper = abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperX
                xResolvedList = xNodeAdvResolving(Gnew, G, folderName, myCountDic, lenDic)
            if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunBRX:
                if abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresX > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.BRThres = abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresX
                brResolvedListforX = xNodeBrResolving(Gnew, G, Grev, folderName, N1)
            # Merge the two resolution sources, discarding contradictions.
            combinedList = resolveConflictX(xResolvedList, brResolvedListforX)
        else:
            # EM-based alternative resolution path.
            combinedList = xNodeEMResolving(Gnew, G, Grev, folderName, myCountDic, lenDic, N1, mummerLink)
        print "combinedList", combinedList
        Gnew.xResolve(combinedList)
        Gnew.condense()
        Gnew.saveToFile(folderName, "xResolvedGraph")
        with open(folderName + "mapDummyToRealDic.json", 'w') as f:
            json.dump(Gnew.mapDummyToRealDic, f)
        with open(folderName + "xResolvedSimplifiedList.json", 'w') as f:
            json.dump(Gnew.xResolvedSimplifiedList, f)
    else:
        # Resolution disabled: still persist the (unresolved) graph and dicts
        # so downstream stages find the expected files.
        Gnew.saveToFile(folderName, "xResolvedGraph")
        with open(folderName + "mapDummyToRealDic.json", 'w') as f:
            json.dump(Gnew.mapDummyToRealDic, f)
        with open(folderName + "xResolvedSimplifiedList.json", 'w') as f:
            json.dump(Gnew.xResolvedSimplifiedList, f)
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename): """ Input : repeatSpecification.txt , myCountDic.json, improved3.fasta, raw_reads.fasta Output : abunsplit.fasta Algorithm : 1. Load data from various sources [various json files] 2. For each repeat interior: a) identify the abundances associated with in/out contigs b) perform a split and record the split 3. Use split results to generate contigs [may already exist in newPhasing.py ] a) use a graph to capture the split results b) use reads to fill in any gaps c) read out the contigs """ json_data = open(folderName + "phaseRepeat.txt", "r") repeatPairs = json.load(json_data) repeatPairs = obtainNonEmpty(repeatPairs) N1 = len(myCountDic) * 2 print "N1", N1 G = graphLib.seqGraph(N1) gapContentLookUpList = [] for eachitem in repeatPairs: inList, outList = eachitem[0], eachitem[1] resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1) print "resolvedList", resolvedList gapContentLookUpList += generateGapContentLookup( folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename ) addEdges(G, resolvedList) gapContentLookUpDic = {} gapContentLookUpList.sort() for eachitem in gapContentLookUpList: gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]] print eachitem[2:4], len(eachitem[4]) # some how change ASplitter here by appending necessary information G.condense() IORobot.extractGraphToContigs( G, folderName, mummerLink, "abun.fasta", contigFilename + "_Double.fasta", gapContentLookUpDic )
def readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph):
    '''
    Read out contigs from the X-resolved graph, filling inter-node gaps with
    read-derived sequence, then remove redundant contigs.

    Side effects: writes furtherGapList.json, tmpWithDummy.fasta,
    abunPre.fasta and (via nonRedundantResolver) the final abun output
    under folderName.
    '''
    json_data = open(folderName + "mapDummyToRealDic.json", 'r')
    mapDummyToRealDic = json.load(json_data)
    G = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")
    gapContentLookUpDic = {}
    furtherGapList = []
    # Condensed nodes hold several original node indices in order; every
    # consecutive pair is a junction whose gap content may still be unknown.
    for i in range(N1):
        if len(G.graphNodesList[i].nodeIndexList) > 1:
            for j in range(len(G.graphNodesList[i].nodeIndexList) - 1):
                bk, fwd = G.graphNodesList[i].nodeIndexList[j], G.graphNodesList[i].nodeIndexList[j + 1]
                key = str(bk) + "_" + str(fwd)
                # gapContentLookUpDic is empty here, so every pair is queued;
                # the guard is kept for safety / symmetry with other callers.
                if not key in gapContentLookUpDic:
                    furtherGapList.append([bk, fwd])
    with open(folderName + "furtherGapList.json", 'w') as f:
        json.dump(furtherGapList, f)
    # Resolve the queued junction gaps from spanning reads.
    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph,
        contigFilename, readsetFilename, mapDummyToRealDic)
    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
            eachitem[2], eachitem[3], eachitem[4]]
        print eachitem[2:4], len(eachitem[4])
    #segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")
    print "Final step: really hacking a file"
    # Work on a copy of the doubled contig file so the original is untouched.
    os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
    contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abunPre.fasta",
                                  "tmpWithDummy.fasta", gapContentLookUpDic,
                                  mapDummyToRealDic)
    if True:
        # Drop contigs fully contained in others (MUMmer-based redundancy check).
        nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink, "abunPre", "abunMum", "abun")
def XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1):
    '''
    Resolve X-nodes of Gnew (nodes with multiple in/out branches) using
    abundance- and branch-read-based evidence, then persist the result.

    Side effects: mutates Gnew in place; writes xResolvedGraph,
    mapDummyToRealDic.json and xResolvedSimplifiedList.json under folderName.
    NOTE(review): also mutates global thresholds on
    abunHouseKeeper.abunGlobalSplitParameterRobot.
    NOTE(review): a 7-argument duplicate XResolution (with an EM branch)
    exists elsewhere in this file; the last definition wins at import time.
    '''
    if abunHouseKeeper.abunGlobalSplitParameterRobot.runXResolve:
        # Reload the raw contig-read graph and its reverse for traversals.
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, contigReadGraph)
        Grev = abunGraphLib.formReverseGraphFast(G)
        xResolvedList, brResolvedListforX = [[] for i in range(N1)], [[] for i in range(N1)]
        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunAbunX:
            # X-specific thresholds (if set) override the global ones.
            if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerX > 0:
                abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLower = (
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerX
                )
            if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperX > 0:
                abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpper = (
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperX
                )
            xResolvedList = xNodeAdvResolving(Gnew, G, folderName, myCountDic, lenDic)
        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunBRX:
            if abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresX > 0:
                abunHouseKeeper.abunGlobalSplitParameterRobot.BRThres = (
                    abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresX
                )
            brResolvedListforX = xNodeBrResolving(Gnew, G, Grev, folderName, N1)
        # Merge the two resolution sources, discarding contradictions.
        combinedList = resolveConflictX(xResolvedList, brResolvedListforX)
        Gnew.xResolve(combinedList)
        Gnew.condense()
        Gnew.saveToFile(folderName, "xResolvedGraph")
        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(Gnew.mapDummyToRealDic, f)
        with open(folderName + "xResolvedSimplifiedList.json", "w") as f:
            json.dump(Gnew.xResolvedSimplifiedList, f)
    else:
        # Resolution disabled: still persist the (unresolved) graph and dicts
        # so downstream stages find the expected files.
        Gnew.saveToFile(folderName, "xResolvedGraph")
        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(Gnew.mapDummyToRealDic, f)
        with open(folderName + "xResolvedSimplifiedList.json", "w") as f:
            json.dump(Gnew.xResolvedSimplifiedList, f)
def readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph):
    """
    Read out contigs from the X-resolved graph, filling inter-node gaps with
    read-derived sequence, then remove redundant contigs.

    Side effects: writes furtherGapList.json, tmpWithDummy.fasta,
    abunPre.fasta and (via nonRedundantResolver) the final abun output.
    NOTE(review): duplicate of the earlier readContigForAbunSplit in this
    file; the last definition wins at import time.
    """
    json_data = open(folderName + "mapDummyToRealDic.json", "r")
    mapDummyToRealDic = json.load(json_data)
    G = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")
    gapContentLookUpDic = {}
    furtherGapList = []
    # Condensed nodes hold several original node indices in order; every
    # consecutive pair is a junction whose gap content is still unknown.
    for i in range(N1):
        if len(G.graphNodesList[i].nodeIndexList) > 1:
            for j in range(len(G.graphNodesList[i].nodeIndexList) - 1):
                bk, fwd = G.graphNodesList[i].nodeIndexList[j], G.graphNodesList[i].nodeIndexList[j + 1]
                key = str(bk) + "_" + str(fwd)
                # gapContentLookUpDic is empty at this point, so every pair
                # is queued; guard kept for safety.
                if not key in gapContentLookUpDic:
                    furtherGapList.append([bk, fwd])
    with open(folderName + "furtherGapList.json", "w") as f:
        json.dump(furtherGapList, f)
    # Resolve the queued junction gaps from spanning reads.
    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
    )
    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        print eachitem[2:4], len(eachitem[4])
    # segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")
    print "Final step: really hacking a file"
    # Work on a copy of the doubled contig file so the original is untouched.
    os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
    contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")
    IORobot.extractGraphToContigs(
        G, folderName, mummerLink, "abunPre.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
    )
    if True:
        # Drop contigs fully contained in others (MUMmer-based redundancy check).
        nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink, "abunPre", "abunMum", "abun")
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i, adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem": newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) # cut here adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i , adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem" : newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) # cut here adjListToRepeatList(newAdjacencyList,folderName,repeatFilename )
def performPhasing(folderName, mummerLink):
    # Phase repeats using flanking/interior read evidence, then write the
    # phased contigs as improved4.fasta via outputResults.
    print "performPhasing"
    '''
    1. Interface from alignmentBridge.py :
        shortToLongMap = formRelatedMap(f2, noisyReads, currentNode, indelRobot, toProcessList)
        cleaner.cleaning([noisyReads,noisyReads] ,shortToLongMap, toProcessList,indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads],shortToLongMap, toProcessList,indelRobot, "vote")
        extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList,indelRobot,longReadToUse, True)

    2. Format of input data data :
        bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])

    3. IO :
        a) Input : repeatSpecification.txt, phasingSeedName_Double.fasta, graph G
        b) Output : improved4.fasta

    3. Algorithm:
        a) reformatNoisyReads
        b) reformatToProcessList
        c) formShortToLongMapping
    '''
    json_data = open(folderName + 'repeatSpecification.txt', 'r')
    loadData = json.load(json_data)
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")
    # Lengths of reads (RR), contigs (CC), and their union (CR) by name.
    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    lenDicCC = IORobot.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    # Keep only one orientation of each repeat specification.
    loadData = filterReverseComp(loadData, N1)
    toPhaseList = []
    if True:
        for eachitem in loadData:
            # print eachitem
            flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3]
            # Re-index the reads/contigs involved in this repeat locally.
            noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1)
            toProcessList = reformatToProcessList(folderName, flankingList, repeatList, dicFromOriginal, N1)
            shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal, dicToOriginal, lenDicCR, N1)
            indelRobot = createIndelRobot(folderName)
            # Two-pass cleaning: "init" to set up, "vote" to classify reads
            # into the two haplotype branches (in1/in2/out1/out2/common).
            cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "init")
            in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "vote")
            extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList, indelRobot, longReadToUse, True)
            # -1 signals a failed extension; only successful phasings are kept.
            if extendResult != -1:
                print "extendResult: ", extendResult
                toPhaseList.append(eachitem + [extendResult])
    with open(folderName + 'toPhaseList.txt', 'w') as outfile:
        json.dump(toPhaseList, outfile)
    # Round-trip through JSON (also normalizes tuples to lists).
    json_data = open(folderName + 'toPhaseList.txt', 'r')
    toPhaseList = json.load(json_data)
    outputResults(folderName, mummerLink, toPhaseList, N1, G)
def outputResults(folderName, mummerLink, toPhaseList, N1, G):
    '''
    Algorithm :
        a) Write as contigs
        b) Add back reverse complement
        c) Create G2 as the readOut part
        d) Output the contigs by a function call
    '''
    # a) Concatenate contigs and (renamed) phasing seed reads into one file.
    combinedName = "contigAndRead_Double.fasta"
    os.system("cp " + folderName + "improved3_Double.fasta " + folderName + combinedName)
    fout = open(folderName + combinedName, 'a')
    fin = open(folderName + "phasingSeedName_Double.fasta", 'r')
    tmp = fin.readline().rstrip()
    while len(tmp) > 0:
        if tmp[0] != ">":
            fout.write(tmp + "\n")
        else:
            # Rename ">ReadX_Y" headers to ">Contig(X + N1/2)_Y" so read
            # records continue the contig numbering.
            infoArr = tmp[5:].split("_")
            fout.write(">Contig" + str(int(infoArr[0]) + N1 / 2))
            fout.write("_" + infoArr[1] + "\n")
        tmp = fin.readline().rstrip()
    fin.close()
    fout.close()
    # b) Add the reverse-complement version of each phased path.
    '''
    [28], [[2, 690, 28], [6, 126, 28], [28, 212, 0], [28, 216, 4]], 1
    [2 , 690, 28, 212, 0]
    '''
    completePhaseList = []
    for eachitem in toPhaseList:
        repeat = eachitem[-3]
        flanking = eachitem[-2]
        result = eachitem[-1]
        # node + pow(-1, node) flips between the paired even/odd indices,
        # i.e. maps a node to its reverse-complement twin.
        revrepeat = []
        for eachsub in eachitem[-3][-1::-1]:
            revrepeat.append(eachsub + pow(-1, eachsub))
        revflanking = [[] for i in range(4)]
        for j in range(2):
            # Reversing the path also swaps in-flank and out-flank roles.
            for eachsub in eachitem[-2][j + 2][-1::-1]:
                revflanking[j].append(eachsub + pow(-1, eachsub))
            for eachsub in eachitem[-2][j][-1::-1]:
                revflanking[j + 2].append(eachsub + pow(-1, eachsub))
        revresult = eachitem[-1]
        completePhaseList.append([repeat, flanking, result])
        completePhaseList.append([revrepeat, revflanking, revresult])
    print "completePhaseList", completePhaseList
    # c) Build the read-out graph G2: one fresh dummy node per interior path
    # step, real nodes only at path endpoints.
    G2 = graphLib.seqGraph(N1)
    nameDic = {}
    for i in range(N1):
        nameDic[i] = i
    for eachitem in completePhaseList:
        repeat, flanking, result = eachitem[0], eachitem[1], eachitem[2]
        path = [[], []]
        # result encodes which in-flank pairs with which out-flank.
        if result == 0:
            path[0] = flanking[0][0:-1] + repeat + flanking[2][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[3][1:]
        else:
            path[0] = flanking[0][0:-1] + repeat + flanking[3][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[2][1:]
        print path[0], path[1]
        for i in range(2):
            eachpath = path[i]
            currentNode = G2.graphNodesList[eachpath[0]]
            for nextNodeIndex, ctr in zip(eachpath[1:], range(len(eachpath[1:]))):
                if ctr != len(eachpath[1:]) - 1:
                    # Interior step: allocate a dummy node and remember the
                    # real node it represents in nameDic.
                    myindex = len(G2.graphNodesList)
                    nameDic[myindex] = nextNodeIndex
                    newNode = graphLib.seqGraphNode(myindex)
                    G2.graphNodesList.append(newNode)
                else:
                    # Last step: reconnect to the real endpoint node.
                    newNode = G2.graphNodesList[nextNodeIndex]
                # Carry over the edge weight from the original graph G.
                wt = 0
                for eachck in G.graphNodesList[nameDic[currentNode.nodeIndex]].listOfNextNodes:
                    if eachck[0] == nextNodeIndex:
                        wt = eachck[1]
                        break
                newNode.listOfPrevNodes.append([currentNode.nodeIndex, wt])
                currentNode.listOfNextNodes.append([newNode.nodeIndex, wt])
                currentNode = newNode
    # d) Persist the graph and read out improved4.fasta.
    graphFileName = "phaseGraphFinal"
    G2.condense()
    G2.saveToFile(folderName, graphFileName)
    IORobot.readContigOut(folderName, mummerLink, graphFileName, combinedName, "improved4.fasta", "outOpenListphaing", nameDic)
def resolvingTandem(folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    # Resolve tandem repeats: estimate the copy count of each repeat loop
    # from read alignments and splice the expanded repeat back into contigs.
    print "resolvingTandem"
    '''
    Input : repeat info
    Output : count, join.

    Algorithm:
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    '''
    # 0 ) Load all the data
    thres = 5  # max mismatch (field 2 of MUMmer rows) to accept an overlap
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)
    maxDuplicate = 10  # copies of the repeat written into the template
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    # Read-to-read overlaps keyed by "nameA;nameB".
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]
    # Contig-to-read overlaps keyed the same way.
    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]
    print dataListCRDic
    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    happyTandemList = {}
    for eachrepProfile in loadData:
        # 1) DFS from the repeat's start contig to find a loop in the graph.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
        # 2) Extract the cycle: the suffix of the DFS path that starts at the
        # first occurrence of its last node.
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i + 1
            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        # Stitch one repeat copy by chaining the reads along the loop,
        # trimming each read by its overlap with the next.
        repeatContent = ""
        for kk in range(len(tandemPath[0:-1])):
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            # Node index -> read name: "Read<k>_p" (even) or "Read<k>_d" (odd).
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]
        print "len(repeatContent)", len(repeatContent)
        # Write maxDuplicate back-to-back copies as the alignment template.
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()
        # 4) Align the reads associated with this repeat to the template.
        repeatReadList = eachrepProfile[1]
        myList = []
        for eachitem in repeatReadList:
            readName = "Read" + str((eachitem - N1) / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)
        IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList)
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta")
        dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out")
        # 5) Sum, per read, its best (longest) match to the template.
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)
        # print "dataList[0]", dataList[0]
        # groupby requires the list sorted by the same key (read name).
        dataList.sort(key=itemgetter(-1))
        for key, values in groupby(dataList, itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
            # print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        print c, lrepeat, totalBasesMatch
        # Estimated copy count: matched bases normalized by coverage * length.
        ct = totalBasesMatch * 1.0 / (c * lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
        # 6)
        # a) find the starting point of the repeat within the start contig
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1
        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"
        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"
        overlapFirst = dataListCRDic[contigName + ";" + readName]
        # Contig extended by the first read entering the repeat.
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta")
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out")
        dataList.sort(key=itemgetter(0))
        # Pick the longest match to locate where the repeat begins.
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        print maxItm
        if len(maxItm) > 0:
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            # No alignment found: fall back to whole template / whole contig.
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct * lrepeat)
        print "repeatStart", repeatStart
        # Keep ct copies' worth of the expanded repeat for this contig.
        happyTandemList[contigName] = repeatContentLarge[repeatStart:int(repeatStart + ct * lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
    # 7) Combine all the repeat information and do the join.
    # leaderList is a union-find-like map: each contig points at the contig
    # that absorbed it.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)
    # Emit each merged group once, marking both strands as written.
    checkingList = [False for i in range(N1)]
    fout = open(folderName + "tademResolved.fasta", 'w')
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id / 2] == False:
            fout.write(">Segkk" + str(counter) + "\n")
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True
    fout.close()
def BResolution(Gnew, folderName, contigReadGraph, N1, myCountDic, lenDic):
    '''
    Resolve bipartite (B-type) repeats of Gnew using abundance matching and
    branch-read evidence, then condense and return the graph.

    Side effects: writes phaseRepeatTR.txt, biResolvedCombineList.json and
    abunAnalysisList.json under folderName; mutates Gnew in place.
    NOTE(review): also mutates global thresholds on
    abunHouseKeeper.abunGlobalSplitParameterRobot.
    '''
    if abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve:
        print "abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve", abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve
        # Only repeats with at most maxRThres in- and out-contigs are resolved.
        maxRThres = abunHouseKeeper.abunGlobalSplitParameterRobot.RThres
        repeatFinder.adjListToRepeatList(Gnew.adj, folderName, "phaseRepeatTR.txt")
        json_data = open(folderName + "phaseRepeatTR.txt", "r")
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)
        biResolvedCombineList = []
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, contigReadGraph)
        Grev = abunGraphLib.formReverseGraphFast(G)
        abunAnalysisList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList, brResolvedList = [], []
            if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunAbunB:
                # B-specific thresholds (if set) override the global ones.
                if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerB > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLower = (
                        abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerB
                    )
                if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperB > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpper = (
                        abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperB
                    )
                # Abundance matching, per-contig or aggregate flavor.
                if not abunHouseKeeper.abunGlobalSplitParameterRobot.toRunAggB:
                    resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
                else:
                    resolvedList = determindMatchAggregate(
                        inList, outList, myCountDic, folderName, contigReadGraph, N1, Gnew, lenDic
                    )
            if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunBRB:
                if abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresB > 0:
                    abunHouseKeeper.abunGlobalSplitParameterRobot.BRThres = (
                        abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresB
                    )
                # Branch-read based resolution as a second evidence source.
                brResolvedList = formBRReolve(folderName, inList, outList, G, Grev, True, N1)
            combinedList = abunHouseKeeper.getDistinct(resolvedList + brResolvedList)
            print "resolvedList, brResolvedList, inList, outList", resolvedList, brResolvedList, inList, outList
            print "resolveConflict(combinedList)", resolveConflict(combinedList)
            abunAnalysisList.append([inList, outList, resolvedList, brResolvedList, resolveConflict(combinedList)])
            if len(inList) <= maxRThres and len(outList) <= maxRThres and len(inList) > 0 and len(outList) > 0:
                # biResolvedCombineList += resolveConflict(combinedList)
                resolvedCombine = resolveConflict(combinedList)
                ### kkdebug
                Gnew.bipartiteLocalResolve(resolvedCombine, inList, outList, folderName)
                # json_data = open(folderName + "hackBRResolveList.json", 'r')
                # dataItem = json.load(json_data)
                # Gnew.bipartiteResolve(dataItem)
                ### end kkdebug
        Gnew.condense()
        # NOTE(review): biResolvedCombineList is never appended to above (the
        # appending line is commented out), so this file is always "[]".
        with open(folderName + "biResolvedCombineList.json", "w") as f:
            json.dump(biResolvedCombineList, f)
        with open(folderName + "abunAnalysisList.json", "w") as f:
            json.dump(abunAnalysisList, f)
        # assert(1==2)
        return Gnew
    else:
        return Gnew
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename):
    '''
    Build a contig-level graph Gnew from the contig-read graph, then clean it
    (adaptor-skip filtering, optional condense/double-pointer/transitive
    reduction) and return it with its adjacency list prepared.
    '''
    ### Transitive reduction and remove double pointers
    N1 = len(myCountDic) * 2
    print "N1", N1
    kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres
    edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # Keep an edge i->j only if at least edgeThres distinct paths of length
    # <= kthres support it.
    adj = [[] for i in range(N1)]
    for i in range(N1):
        tmpList = abunGraphLib.findAllReachable(i, N1, G)
        for j in tmpList:
            if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres:
                adj[i].append(j)
    ### Filter adaptor skipped case
    # An edge between the two strands of the same contig (2k <-> 2k+1) is
    # treated as an adaptor artifact and removed.
    adaptorPair = []
    for i in range(len(adj)):
        if i % 2 == 0:
            if i + 1 in adj[i]:
                adj[i].remove(i + 1)
                adaptorPair.append([i, i + 1])
        elif i % 2 == 1:
            if i - 1 in adj[i]:
                adj[i].remove(i - 1)
                adaptorPair.append([i, i - 1])
    Gnew = abunGraphLib.seqGraphDynamic(N1)
    for i in range(N1):
        for j in adj[i]:
            # 1997 appears to be an arbitrary placeholder edge weight —
            # TODO confirm against seqGraphDynamic.insertEdge.
            Gnew.insertEdge(i, j, 1997)
    # Also sever the neighborhoods around each adaptor pair so no path can
    # sneak between the paired strands.
    for eachpair in adaptorPair:
        u, v = eachpair[0], eachpair[1]
        for x in Gnew.graphNodesList[u].listOfPrevNodes:
            xIndex = x[0]
            Gnew.removeEdge(xIndex, v)
        for y in Gnew.graphNodesList[v].listOfNextNodes:
            yIndex = y[0]
            Gnew.removeEdge(u, yIndex)
    ### Trying out the new component
    import toCondenseFixer
    Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName)
    Gnew.symGraph()
    ### End filter adaptor skipped case
    if abunHouseKeeper.abunGlobalSplitParameterRobot.runGraphSurgery:
        Gnew.initAdv()
        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunCondenseRemove:
            Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename)
        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunDoubltPtr:
            Gnew.doubleEdgeReduction()
        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunTransitive:
            Gnew.transitiveReduction(
                folderName, mummerLink, contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta", G
            )
        Gnew.condense()
        Gnew.findAdjList()
    else:
        # Surgery disabled: still initialize/condense so callers get a
        # usable graph with an adjacency list.
        Gnew.initAdv()
        Gnew.condense()
        Gnew.findAdjList()
    return Gnew
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    '''
    Abundance-based repeat splitting combined with X-node resolution.

    Pipeline stages (each currently gated by "if False:" — this function is
    staged/checkpointed development code; enable one stage at a time):
      1. bipartite abundance matching + xNodeResolving -> resolvedList.json,
         mapDummyToRealDic.json
      2. build the gap-content lookup -> gapContentLookUpDic.json
      3. assemble and condense the resolved graph -> xResolvedGraph
      4. append dummy-node sequences to a temp FASTA and extract abun.fasta
    The leading "Debug" section always runs and only reports edges.
    '''
    N1 = len(myCountDic) * 2
    print "N1", N1
    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    adj = [[] for i in range(N1)]
    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)
    Gnew = graphLib.seqGraph(N1)
    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)
    Gnew.reportEdge()
    # End Debug
    if False:
        # Stage 1: bipartite abundance matching per repeat, then X-node
        # resolution; persist the merged resolution to JSON.
        json_data = open(folderName + "phaseRepeat.txt", 'r')
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)
        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
            biResolvedCombineList += resolvedList
        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(folderName, contigReadGraph)
        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(resolvedList), len(xResolvedList), len(biResolvedCombineList)
        with open(folderName + "resolvedList.json", 'w') as f:
            json.dump(resolvedList, f)
        with open(folderName + "mapDummyToRealDic.json", 'w') as f:
            json.dump(mapDummyToRealDic, f)
    if False:
        # Stage 2: build gap-filling sequence lookup keyed "left_right".
        json_data = open(folderName + "resolvedList.json", 'r')
        resolvedList = json.load(json_data)
        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)
        gapContentLookUpList = []
        gapContentLookUpList = generateGapContentLookup(folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic)
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()
        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
            print eachitem[2:4], len(eachitem[4])
        with open(folderName + "gapContentLookUpDic.json", 'w') as f:
            json.dump(gapContentLookUpDic, f)
    if False:
        # Stage 3: graph with N1 real + dummy nodes, condensed and saved.
        json_data = open(folderName + "resolvedList.json", 'r')
        resolvedList = json.load(json_data)
        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)
        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()
        G.saveToFile(folderName, "xResolvedGraph")
    if False:
        # Stage 4: append dummy-node sequences (copies of their real contig)
        # to a temp FASTA, then extract final contigs.
        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")
        json_data = open(folderName + "gapContentLookUpDic.json", 'r')
        gapContentLookUpDic = json.load(json_data)
        print "Final step: really hacking a file"
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")
        f = open(folderName + "tmpWithDummy.fasta", 'a')
        for i in range(len(mapDummyToRealDic)):
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()
        IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic)
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun ): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i , adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem" : newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) G2 = abunGraphLib.seqGraphWt(N1 * 2) for i in range(N1): for j in newAdjacencyList[i]: G2.insertEdge(2 * i, 2 * j + 1, 1) G2.insertEdge(2 * j + 1, 2 * i, 1) clusters = G2.findConnectedComponents() repeatList = [] for eachitem in clusters: leftList, rightList = [], [] for eachsubitem in eachitem: if eachsubitem % 2 == 0 : leftList.append(eachsubitem) else: rightList.append(eachsubitem) repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)]) with open(folderName + repeatFilename, 'w') as outfile: json.dump(repeatList, outfile) json_data = open(folderName + repeatFilename, 'r') loadData = json.load(json_data) assert(loadData == repeatList)
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm:
        a) Form double reads and contigs V
        b) Mummer the data and extract dataList three times V
        c) Use the subroutine to output a graph V
        d) Output the graph to a file phasing_String_graph.graph V
    '''
    G = []
    # (a) double the contigs and reads (forward + reverse complement).
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    # (b) contig-vs-contig alignment (CC).
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    # read-vs-read alignment (RR), skippable via global flag.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # Leftover hard-coded debug trace for a specific read pair.
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p":
                print "debug", eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p":
                print "debug", eachitem
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []
    # contig-vs-read alignment (CR).
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
    # alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    # (c) nodes 0..N1-1 are contigs, N1..N1+N2-1 are reads.
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    '''
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # (d) persist and sanity-check.
    G.saveToFile(folderName, graphName)
    checkGraphLength(G, N1, lenDicRR)
    print "len(G.graphNodesList)", len(G.graphNodesList)
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i, adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem": newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) G2 = abunGraphLib.seqGraphWt(N1 * 2) for i in range(N1): for j in newAdjacencyList[i]: G2.insertEdge(2 * i, 2 * j + 1, 1) G2.insertEdge(2 * j + 1, 2 * i, 1) clusters = G2.findConnectedComponents() repeatList = [] for eachitem in clusters: leftList, rightList = [], [] for eachsubitem in eachitem: if eachsubitem % 2 == 0: leftList.append(eachsubitem) else: rightList.append(eachsubitem) repeatList.append([ abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList) ]) with open(folderName + repeatFilename, 'w') as outfile: json.dump(repeatList, outfile) json_data = open(folderName + repeatFilename, 'r') loadData = json.load(json_data) assert (loadData == repeatList)
def resolvingTandem(folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    # Resolve tandem repeats: estimate copy count from read coverage over a
    # synthetic repeat template, then stitch the repeat back into the contigs.
    print "resolvingTandem"
    """
    Input : repeat info
    Output : count, join.

    Algorithm:
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    """
    # 0 ) Load all the data
    thres = 5  # max allowed overhang (field [2]) for an alignment to count as an end-to-end overlap
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)
    maxDuplicate = 10  # number of repeat copies concatenated into the template
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    # "readA;readB" -> overlap length (alignment field [4]).
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]
    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    # "contig;read" -> overlap length.
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]
    print dataListCRDic
    json_data = open(folderName + repeatSpec, "r")
    loadData = json.load(json_data)
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    happyTandemList = {}
    for eachrepProfile in loadData:
        # 1) DFS from the repeat's starting contig to find a loop path.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
        # 2) extract the cycle: the suffix of the path starting at the first
        # occurrence of its terminal node.
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i + 1
            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        # Stitch one repeat unit from consecutive reads on the cycle,
        # trimming each read by its overlap with the next.
        repeatContent = ""
        for kk in range(len(tandemPath[0:-1])):
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]
        print "len(repeatContent)", len(repeatContent)
        # Write maxDuplicate back-to-back copies as the alignment template.
        fout = open(folderName + repeatTempFilename, "w")
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()
        # 4) align the repeat-associated reads against the template.
        repeatReadList = eachrepProfile[1]
        myList = []
        for eachitem in repeatReadList:
            readName = "Read" + str((eachitem - N1) / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)
        IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList)
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta")
        dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out")
        # 5) total matched bases = sum over reads of their best match length.
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)
        dataList.sort(key=itemgetter(-1))
        for key, values in groupby(dataList, itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
            totalBasesMatch = totalBasesMatch + maxValue
        print c, lrepeat, totalBasesMatch
        # Estimated copy count: matched bases / (coverage * repeat length).
        ct = totalBasesMatch * 1.0 / (c * lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
        # 6)
        # a) find the starting point of the repeat within the flanking contig
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1
        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"
        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"
        overlapFirst = dataListCRDic[contigName + ";" + readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        f1 = open(folderName + "firstOverlap.fasta", "w")
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta")
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out")
        dataList.sort(key=itemgetter(0))
        # Keep the longest match; its coordinates fix where the repeat starts.
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        print maxItm
        if len(maxItm) > 0:
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct * lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName] = repeatContentLarge[repeatStart:int(repeatStart + ct * lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
    # 7) Combine all the repeat information and do the join
    # Union-find-style leader list: merged contigs point at their leader.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)
    # Emit each merged group once; checkingList marks written strand-pairs.
    checkingList = [False for i in range(N1)]
    fout = open(folderName + "tademResolved.fasta", "w")
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, "C")
        if checkingList[id / 2] == False:
            fout.write(">Segkk" + str(counter) + "\n")
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True
    fout.close()
def continuousIntegration():
    '''
    Manual integration-test scratchpad: a sequence of experiments against
    fixed local test folders (Apr10Test*, May11TestB), each toggled by
    editing its "if False/True:" guard. Only the final removeRedundantWithFile
    call is currently enabled. NOTE(review): the dead blocks reference names
    (folderName, contigFile) defined only in other dead blocks — enable them
    in order or they raise NameError.
    '''
    if False:
        # BFS_revisit smoke test on a tiny synthetic graph.
        G = graphLib.seqGraph(10)
        for i in range(5):
            G.insertEdge(i, i + 1, 1997)
            G.insertEdge(i, i + 2, 1997)
        resultList = abunGraphLib.BFS_revisit(1, 3, G, 1)
        print "resultList", resultList
    if False:
        # formPathSeq smoke test on canned path lists.
        folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile = \
            "Apr10Test/", "/usr/bin/", [[1, 486, 217], [1, 8642, 217], [1, 13465, 217]], [[1, 486, 217]], "improved3_Double.fasta", "phasingSeedName_Double.fasta"
        abunGraphLib.formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile)
    if False:
        # Path-finding / reduction experiment on phaseStringGraph1.
        lenDic = IORobot.obtainLength(folderName, contigFile)
        N1 = len(lenDic)
        print "N1", N1
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "phaseStringGraph1")
        adj = [[] for i in range(N1)]
        for i in range(N1):
            adj[i] = abunGraphLib.findAllReachable(i, N1, G)
        Gnew = abunGraphLib.seqGraphDynamic(N1)
        for i in range(N1):
            for j in adj[i]:
                Gnew.insertEdge(i, j, 1997)
        Gnew.initAdv()
        Gnew.doubleEdgeReduction()
        contigPaths = abunGraphLib.findAllPathK(1, 217, Gnew, 3)
        contigReadPaths = abunGraphLib.findAllPathK(1, 217, G, 5)
        print "contigPaths", contigPaths
        print "contigReadPaths", contigReadPaths
        Gnew.transitiveReduction()
    if False:
        toDelete = abunGraphLib.decideCut("Apr10Test/", "/usr/bin/")
        print toDelete
    if False:
        # Inspect xResolvedGraph node lengths/counts for a few contigs.
        G = graphLib.seqGraph(0)
        G.loadFromFile("Apr10TestA/", "xResolvedGraph")
        if False:
            for i in range(len(G.graphNodesList)):
                v = G.graphNodesList[i]
                if len(v.nodeIndexList) > 0:
                    print i, v.listOfPrevNodes, v.listOfNextNodes
            G.reportEdge()
        lenDic = IORobot.obtainLength("Apr10TestA/", "improved3_Double.fasta")
        mylist = [401, 207, 405, 407, 344]
        json_data = open("Apr10TestA/" + "myCountDic.json", 'r')
        myCountDic = json.load(json_data)
        for x in mylist:
            print x, lenDic["Contig" + str(x / 2) + "_p"], myCountDic["Segkk" + str(x / 2)]
    if False:
        # Compare dummy-node count against the real contig count.
        folderName = "Apr10TestA/"
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")
        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)
        lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
        print len(G.graphNodesList)
        print len(mapDummyToRealDic)
        print "fake N1 , real N1 ", len(G.graphNodesList) - len(mapDummyToRealDic), len(lenDic)
    if False:
        abunSplitter.mainFlow("Apr10TestB/", "/usr/bin/")
    if False:
        nonRedundantResolver.removeEmbedded("Apr10TestD/", "/usr/bin/")
    if False:
        # Standalone replay of the graphSurgery adaptor-skip filter, counting
        # 2-in/2-out nodes after the filter.
        folderName, contigReadGraph = "Apr10TestA/", "phaseStringGraph1"
        G = graphLib.seqGraph(0)
        kthres, edgeThres = 3, 1
        G.loadFromFile(folderName, contigReadGraph)
        lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
        N1 = len(lenDic)
        adj = [[] for i in range(N1)]
        for i in range(N1):
            tmpList = abunGraphLib.findAllReachable(i, N1, G)
            for j in tmpList:
                if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres:
                    adj[i].append(j)
            # print i, adj[i]
        ### Filter adaptor skipped case
        adaptorPair = []
        for i in range(len(adj)):
            if i % 2 == 0:
                if i + 1 in adj[i]:
                    adj[i].remove(i + 1)
                    adaptorPair.append([i, i + 1])
            elif i % 2 == 1:
                if i - 1 in adj[i]:
                    adj[i].remove(i - 1)
                    adaptorPair.append([i, i - 1])
        Gnew = abunGraphLib.seqGraphDynamic(N1)
        for i in range(N1):
            for j in adj[i]:
                Gnew.insertEdge(i, j, 1997)
        for eachpair in adaptorPair:
            u, v = eachpair[0], eachpair[1]
            for x in Gnew.graphNodesList[u].listOfPrevNodes:
                xIndex = x[0]
                Gnew.removeEdge(xIndex, v)
            for y in Gnew.graphNodesList[v].listOfNextNodes:
                yIndex = y[0]
                Gnew.removeEdge(u, yIndex)
        # Gnew.reportEdge()
        count2 = 0
        for i in range(len(Gnew.graphNodesList)):
            if len(Gnew.graphNodesList[i].listOfPrevNodes) == 2 and len(Gnew.graphNodesList[i].listOfNextNodes) == 2:
                count2 = count2 + 1
                print str(i) + "{color:red}"
        print "count2, ", count2
        ### End filter adaptor skipped case
    if True:
        nonRedundantResolver.removeRedundantWithFile("May11TestB/", "/usr/bin/", "abun", "abunDebug", "abunNoEmbed")
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm:
        a) Form double reads and contigs V
        b) Mummer the data and extract dataList three times V
        c) Use the subroutine to output a graph V
        d) Output the graph to a file phasing_String_graph.graph V
    '''
    G = []
    # (a) double the contigs and reads (forward + reverse complement).
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    # (b) contig-vs-contig alignment (CC).
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    # read-vs-read alignment (RR), skippable via global flag.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # Leftover hard-coded debug trace for a specific read pair.
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p":
                print "debug", eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p":
                print "debug", eachitem
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []
    # contig-vs-read alignment (CR).
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
    #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    # (c) nodes 0..N1-1 are contigs, N1..N1+N2-1 are reads.
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    '''
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # (d) persist and sanity-check.
    G.saveToFile(folderName, graphName)
    checkGraphLength(G, N1, lenDicRR)
    print "len(G.graphNodesList)", len(G.graphNodesList)
def findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1):
    '''
    Convenience wrapper around findPathBtwEndsFast: loads the contig-read
    string graph from disk, then delegates the actual path search.
    '''
    loadedGraph = graphLib.seqGraph(0)
    loadedGraph.loadFromFile(folderName, contigReadGraph)
    result = findPathBtwEndsFast(folderName, leftCtgIndex, rightCtgIndex, loadedGraph, N1)
    return result
def singleGapLookUp(eachmatchpair, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename):
    '''
    Compute the gap-filling information for one resolved contig pair.

    eachmatchpair : resolved path; only its endpoints (left/right contig
                    indices) are used here.
    Returns [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
    where leftEnd is the trimmed end of the left contig, rightStart is always
    0, and middleContent is the read-derived filler sequence (may be "").
    '''
    print eachmatchpair
    leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0], eachmatchpair[-1], 0, 0, ""
    # NOTE(review): this result is computed then immediately discarded by the
    # reassignment below — dead call kept for fidelity; confirm and remove.
    succReadsList = abunGraphLib.findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1)
    succReadsList = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    allPaths = abunGraphLib.findAllPathK(leftCtgIndex, rightCtgIndex, G, 5)
    # shuffle(allPaths)
    print "allPaths", allPaths
    # Keep only paths whose interior is reads (node index >= N1 means read).
    possibleList = []
    for p in allPaths:
        noContig = True
        for pp in p[1:-1]:
            if pp < N1:
                noContig = False
        if noContig == True:
            possibleList.append(p)
    print "possibleList", possibleList
    # Pick the shortest read-only path as the connecting read chain.
    minListLen = 1000
    for p in possibleList:
        if len(p) < minListLen:
            succReadsList = p
            minListLen = len(p)
    if len(succReadsList) > 0:
        # Drop the contig endpoints; keep only the interior reads.
        succReadsList.pop(0)
        succReadsList.pop(-1)
    else:
        print "interesting item for future study"
    print "succReadsList", succReadsList
    if len(succReadsList) == 0:
        # No bridging reads: overlap the two contigs directly.
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1)
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
        print "overlap contig : ", overlap
        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""
    else:
        # Bridge via the read chain: left contig -> first read overlap.
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1)
        print contigName
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        readName = abunHouseKeeper.parseIDToName(succReadsList[0], 'R', N1)
        print readName
        rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
        print "overlap start read : ", overlap
        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""
        # Concatenate consecutive reads, trimming each by its overlap with
        # the next.
        for i in range(len(succReadsList) - 1):
            readName = abunHouseKeeper.parseIDToName(succReadsList[i], 'R', N1)
            leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
            readName = abunHouseKeeper.parseIDToName(succReadsList[i + 1], 'R', N1)
            rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
            overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
            print "overlap middle read : ", overlap
            middleContent = middleContent + leftSeg[0:len(leftSeg) - overlap[0]]
        # Last read -> right contig overlap closes the gap.
        readName = abunHouseKeeper.parseIDToName(succReadsList[-1], 'R', N1)
        leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
        print "overlap end read : ", overlap
        middleContent = middleContent + leftSeg[0:len(leftSeg) - overlap[0]]
    return [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
def xNodeResolving(folderName, contigReadGraph):
    '''
    Input : contigGraph , abunInfo , folderName
    Output: myresolvedList.json, gapContentLookUp.json, dummyNodeMapping.json
    Algorithm :
        1) Tranverse the graph
            a) If the node can well be fixed with sd requirement met
                i) Link it across and add the pair into the myresolvedList, gapContentLookUp
                ii) Add dummynodes and fill in the dummyNodeMapping
        2) Format return and output as temp file

    Returns (resolvedList, mapDummyToRealDic): resolved in->dummy and
    dummy->out paths, plus dummy-index -> real node index mapping.
    '''
    ### Init G, myCountDic, N1
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    with open(folderName + 'myCountDic.json') as f:
        myCountDic = json.load(f)
    N1 = len(myCountDic) * 2
    ### Add resolved edge
    # Contig-level reachability graph (reads collapsed away).
    adj = [[] for i in range(N1)]
    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)
    Gnew = graphLib.seqGraph(N1)
    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)
    extraCounter = 0
    mapDummyToRealDic = {}
    resolvedList = []
    for v in Gnew.graphNodesList:
        inList = []
        for eachitem in v.listOfPrevNodes:
            inList.append(eachitem[0])
        outList = []
        for eachitem in v.listOfNextNodes:
            outList.append(eachitem[0])
        # Fold strand pairs into single abundance counts.
        inListCt = getCtTwoToOne(inList, myCountDic)
        outListCt = getCtTwoToOne(outList, myCountDic)
        # Std-dev of all counts as the matching tolerance.
        # NOTE(review): loop-invariant; could be hoisted outside the node loop.
        sizeList = []
        for eachitem in myCountDic:
            sizeList.append(myCountDic[eachitem])
        sd = np.std(sizeList)
        for eachIn in inListCt:
            matchedOut = satisfyMatch(eachIn, outListCt, sd)
            if matchedOut != -1:
                # Find read paths in->v and v->out; on success, split v by
                # routing the pair through a fresh dummy node N1+extraCounter.
                leftCtgIndex, rightCtgIndex = eachIn[0], v.nodeIndex
                inSuccReadsList = abunGraphLib.findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1)
                leftCtgIndex, rightCtgIndex = v.nodeIndex, matchedOut
                outSuccReadsList = abunGraphLib.findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1)
                if inSuccReadsList != None and outSuccReadsList != None:
                    resolvedList.append([eachIn[0]] + inSuccReadsList + [N1 + extraCounter])
                    print "in: ", resolvedList[-1]
                    resolvedList.append([N1 + extraCounter] + outSuccReadsList + [matchedOut])
                    print "out: ", resolvedList[-1]
                    mapDummyToRealDic[extraCounter] = v.nodeIndex
                    extraCounter = extraCounter + 1
    return resolvedList, mapDummyToRealDic
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName, needAlignment=True):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm:
        a) Form double reads and contigs V
        b) Mummer the data and extract dataList three times V
        c) Use the subroutine to output a graph V
        d) Output the graph to a file phasing_String_graph.graph V

    needAlignment : when False, skip running MUMmer and reuse the existing
                    *Out alignment files on disk.
    '''
    G = []
    # (a) double the contigs and reads (forward + reverse complement).
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    # (b) contig-vs-contig alignment (CC), batched for parallelism.
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    #if needAlignment:
    #    alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    if needAlignment:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[header, referenceFile, queryFile, ""]], houseKeeper.globalParallel)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    # read-vs-read alignment (RR), skippable via global flag.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    if not abunHouseKeeper.abunGlobalRRDisable:
        if needAlignment:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []
    # contig-vs-read alignment (CR).
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if needAlignment:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    # (c) nodes 0..N1-1 are contigs, N1..N1+N2-1 are reads.
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    '''
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # (d) this variant augments the graph with extra edges before saving.
    Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1)
    Gnew.saveToFile(folderName, graphName)
    print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    '''
    Abundance-based repeat splitting with X-node resolution.

    NOTE(review): all four pipeline stages below are gated by `if False:` and
    are therefore DEAD CODE as written -- only the initial "Debug" reachability
    dump executes. The gates look like manual stage toggles (each stage reads
    the JSON files the previous stage writes), presumably flipped to True one
    at a time during development; confirm the intended configuration before
    relying on this function.

    Parameters:
        folderName      -- working directory (paths built by concatenation)
        mummerLink      -- path to the MUMmer installation
        myCountDic      -- per-contig abundance counts; len(myCountDic)*2 gives
                           the number of doubled contig nodes N1
        contigReadGraph -- file name of the saved contig+read string graph
        contigFilename  -- basename of the contig FASTA
        readsetFilename -- basename of the read FASTA

    Returns: None (stages communicate via JSON/FASTA files in folderName).
    '''
    N1 = len(myCountDic) * 2
    print "N1", N1

    # Debug
    # Build and report the contig-level reachability graph: edge i -> j when
    # contig node j is reachable from i through read nodes in G.
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]
    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)
    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    # Stage 1 (disabled): resolve repeats by abundance matching plus X-node
    # resolution, then persist the combined resolution to JSON.
    if False:
        # NOTE(review): json_data handles in this function are never closed.
        json_data = open(folderName + "phaseRepeat.txt", 'r')
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)

        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(resolvedList), len(xResolvedList), len(biResolvedCombineList)

        with open(folderName + 'resolvedList.json', 'w') as f:
            json.dump(resolvedList, f)

        with open(folderName + 'mapDummyToRealDic.json', 'w') as f:
            json.dump(mapDummyToRealDic, f)

    # Stage 2 (disabled): compute the gap-filling sequence between each
    # resolved contig pair and persist the lookup dictionary.
    if False:
        json_data = open(folderName + 'resolvedList.json', 'r')
        resolvedList = json.load(json_data)

        json_data = open(folderName + 'mapDummyToRealDic.json', 'r')
        mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = []

        gapContentLookUpList = generateGapContentLookup(folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic)

        gapContentLookUpDic = {}
        gapContentLookUpList.sort()
        for eachitem in gapContentLookUpList:
            # Key "<leftIndex>_<rightIndex>" maps to [start, end, gapSequence].
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
            print eachitem[2:4], len(eachitem[4])

        with open(folderName + 'gapContentLookUpDic.json', 'w') as f:
            json.dump(gapContentLookUpDic, f)

    # Stage 3 (disabled): rebuild the split graph (real nodes + dummy X-nodes),
    # condense it, and save it.
    if False:
        json_data = open(folderName + 'resolvedList.json', 'r')
        resolvedList = json.load(json_data)

        json_data = open(folderName + 'mapDummyToRealDic.json', 'r')
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()
        G.saveToFile(folderName, "xResolvedGraph")

    # Stage 4 (disabled): append dummy-segment sequences to a copy of the
    # doubled contig FASTA and emit the final contigs.
    if False:
        json_data = open(folderName + 'mapDummyToRealDic.json', 'r')
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        json_data = open(folderName + 'gapContentLookUpDic.json', 'r')
        gapContentLookUpDic = json.load(json_data)

        print "Final step: really hacking a file"
        # NOTE(review): shell copy via os.system; breaks if folderName/filename
        # contain spaces. Acknowledged as a hack by the author.
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")

        contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

        f = open(folderName + "tmpWithDummy.fasta", 'a')
        for i in range(len(mapDummyToRealDic)):
            # JSON round-trip turned the dict keys into strings, hence str(i).
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()

        IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic)
def xNodeResolving(folderName, contigReadGraph): ### Init G, myCountDic, N1 G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) with open(folderName + "myCountDic.json") as f: myCountDic = json.load(f) N1 = len(myCountDic) * 2 ### Add resolved edge adj = [[] for i in range(N1)] for i in range(N1): adj[i] = abunGraphLib.findAllReachable(i, N1, G) Gnew = graphLib.seqGraph(N1) for i in range(N1): for j in adj[i]: Gnew.insertEdge(i, j, 1) extraCounter = 0 mapDummyToRealDic = {} resolvedList = [] for v in Gnew.graphNodesList: inList = [] for eachitem in v.listOfPrevNodes: inList.append(eachitem[0]) outList = [] for eachitem in v.listOfNextNodes: outList.append(eachitem[0]) inListCt = getCtTwoToOne(inList, myCountDic) outListCt = getCtTwoToOne(outList, myCountDic) sizeList = [] for eachitem in myCountDic: sizeList.append(myCountDic[eachitem]) sd = np.std(sizeList) for eachIn in inListCt: matchedOut = satisfyMatch(eachIn, outListCt, sd) if matchedOut != -1: leftCtgIndex, rightCtgIndex = eachIn[0], v.nodeIndex inSuccReadsList = abunGraphLib.findPathBtwEnds( folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1 ) leftCtgIndex, rightCtgIndex = v.nodeIndex, matchedOut outSuccReadsList = abunGraphLib.findPathBtwEnds( folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1 ) if inSuccReadsList != None and outSuccReadsList != None: resolvedList.append([eachIn[0]] + inSuccReadsList + [N1 + extraCounter]) print "in: ", resolvedList[-1] resolvedList.append([N1 + extraCounter] + outSuccReadsList + [matchedOut]) print "out: ", resolvedList[-1] mapDummyToRealDic[extraCounter] = v.nodeIndex extraCounter = extraCounter + 1 return resolvedList, mapDummyToRealDic
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename): ### Transitive reduction and remove double pointers N1 = len(myCountDic) * 2 print "N1", N1 kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) adj = [[] for i in range(N1)] for i in range(N1): tmpList = abunGraphLib.findAllReachable(i, N1, G) for j in tmpList: if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres: adj[i].append(j) ### Filter adaptor skipped case adaptorPair = [] for i in range(len(adj)): if i % 2 == 0: if i + 1 in adj[i]: adj[i].remove(i + 1) adaptorPair.append([i, i + 1]) elif i % 2 == 1: if i - 1 in adj[i]: adj[i].remove(i - 1) adaptorPair.append([i, i - 1]) Gnew = abunGraphLib.seqGraphDynamic(N1) for i in range(N1): for j in adj[i]: Gnew.insertEdge(i, j, 1997) for eachpair in adaptorPair: u, v = eachpair[0], eachpair[1] for x in Gnew.graphNodesList[u].listOfPrevNodes: xIndex = x[0] Gnew.removeEdge(xIndex, v) for y in Gnew.graphNodesList[v].listOfNextNodes: yIndex = y[0] Gnew.removeEdge(u, yIndex) Gnew.reportEdge() ### Trying out the new component import toCondenseFixer Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName) Gnew.symGraph() #Gnew.reportEdge() ### End filter adaptor skipped case if abunHouseKeeper.abunGlobalSplitParameterRobot.runGraphSurgery: Gnew.initAdv() if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunCondenseRemove: Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename) if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunDoubltPtr: Gnew.doubleEdgeReduction() if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunTransitive: Gnew.transitiveReduction(folderName, mummerLink, contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta", G) Gnew.condense() Gnew.findAdjList() else: Gnew.initAdv() Gnew.condense() Gnew.findAdjList() return Gnew