def formReverseGraph(G):
    """Return a new graph that holds the edges of G with direction flipped.

    Every ordered pair of node indices (i, j) is probed: when
    ``commonLib.nameInEdgeList`` reports j in node i's ``listOfNextNodes``,
    the edge j -> i is inserted into the result.  Each inserted edge is
    given the constant weight 100, whatever G holds for that edge.

    G itself is only read.  The result has ``len(G.graphNodesList)`` nodes.
    """
    nodeCount = len(G.graphNodesList)
    reversedGraph = commonLib.seqGraph(nodeCount)

    for source, sourceNode in enumerate(G.graphNodesList):
        successors = sourceNode.listOfNextNodes
        for target in range(nodeCount):
            if commonLib.nameInEdgeList(target, successors):
                # source -> target in G becomes target -> source here
                reversedGraph.insertEdge(target, source, 100)

    return reversedGraph
def performPhasing(folderName, mummerLink):
    """Run the clean / vote / extend steps for every stored repeat profile.

    Input files (all located by concatenating *folderName* with the name):
        repeatSpecification.txt        JSON list of profiles, each
                                       [flankingList, repeatList,
                                        repeatPathway, flankingPathsList]
        phaseStringGraph1              graph loaded through seqGraph.loadFromFile
        phasingSeedName_Double.fasta   lengths read into lenDicRR
        improved3_Double.fasta         lengths read into lenDicCC

    For each profile:
        a) reformatNoisyReads
        b) reformatToProcessList
        c) formShortToLongMapping
        d) cleaner.cleaning in "init" mode, then in "vote" mode
        e) extender.readExtender on the voted lists

    The function returns nothing and writes no result file itself; the
    outcome of readExtender is only printed.

    Args:
        folderName: prefix put directly in front of every file name.
        mummerLink: accepted but not used in this body.
    """
    print "performPhasing"

    # Close the specification file as soon as it is parsed
    with open(folderName + 'repeatSpecification.txt', 'r') as json_data:
        loadData = json.load(json_data)

    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")

    lenDicRR = commonLib.obtainLength(folderName, "phasingSeedName_Double.fasta")
    lenDicCC = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)

    # Combined length table; on a name clash the RR entry wins
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())

    for eachitem in loadData:
        print eachitem
        # repeatPathway and flankingPathsList are unpacked to document the
        # record layout; they are not used below
        flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3]

        noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1)
        toProcessList = reformatToProcessList(folderName, flankingList, repeatList, dicFromOriginal, N1)
        shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal, dicToOriginal, lenDicCR, N1)
        indelRobot = createIndelRobot(folderName)

        # The "init" result is discarded; only the "vote" pass is unpacked
        cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning(
            [noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "vote")

        extendResult = extender.readExtender(in1List, in2List, out1List, out2List,
                                             commonList, indelRobot, longReadToUse, True)

        if extendResult != -1:
            print "extendResult: ", extendResult
            # NOTE(review): deliberate always-failing assert kept from the
            # original (looks like a debugging stop).  The original line
            # breaks were lost, so its nesting under this `if` is an
            # assumption - confirm against the upstream file.
            assert(1==2)
def connectContigs(toPhase, toRemove, toBR, folderName, mummerLink):
    """Join contigs along the edges in toBR and write improved4.fasta.

    Steps:
        1. Halve every index in *toRemove* to get distinct contig numbers.
           A contig whose "Contig<k>_p" length in improved3_Double.fasta
           is below 20000 has both strand indices (2k and 2k+1) marked
           for removal.
        2. Build a graph with one node per entry of the length table and
           empty ``nodeIndexList`` on every marked node (the original
           author calls this a hack to make those nodes empty).
        3. For each entry of *toBR* insert the edge
           (entry[0]/2, entry[1]/2) with weight entry[3] + 1.
        4. Condense, save as "xphasebonus" and call commonLib.readContigOut.

    Args:
        toPhase: accepted but not used in this body.
        toRemove: iterable of integer indices (halved with Python 2 ``/``).
        toBR: iterable of indexable edge records; items 0, 1 and 3 are read.
        folderName: prefix / folder handed to the commonLib helpers.
        mummerLink: forwarded to commonLib.readContigOut.

    NOTE(review): removal compares indices of the form 2k / 2k+1 against
    ``nodeIndex`` while edges are inserted at halved indices - confirm the
    two index spaces are meant to differ.
    """
    print "\nConnect Contigs"

    lengthThreshold = 20000
    contigLengths = commonLib.obtainLength(folderName, "improved3_Double.fasta")

    # Distinct contig numbers in ascending order
    contigNumbers = sorted(set(strandIndex / 2 for strandIndex in toRemove))

    removeContigIndexList = []
    for contigNumber in contigNumbers:
        forwardName = "Contig" + str(contigNumber) + "_p"
        if contigLengths[forwardName] < lengthThreshold:
            # remove both strands when detected
            removeContigIndexList.extend([2 * contigNumber, 2 * contigNumber + 1])

    print "removeContigIndexList", removeContigIndexList

    G = commonLib.seqGraph(len(contigLengths))

    # hack: an empty nodeIndexList marks the node as empty
    for node in G.graphNodesList:
        if node.nodeIndex in removeContigIndexList:
            node.nodeIndexList = []

    # form the graph, condense it, then let readContigOut emit the contigs
    for edge in toBR:
        head, tail, weight = edge[0] / 2, edge[1] / 2, edge[3] + 1
        print "i, j, wt", head, tail, weight
        G.insertEdge(head, tail, weight)

    graphName = "xphasebonus"
    G.condense()
    G.saveToFile(folderName, graphName)
    commonLib.readContigOut(folderName, mummerLink, graphName,
                            "improved3_Double.fasta", "improved4.fasta",
                            graphName + "Open")
def defineRepeatAndFlanking(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, repeatSpec):
    """Describe each repeat (interior, flanks, reads) and dump the result as JSON.

    Input:
        a) String graph saved under *contigReadGraph*
        b) Repeat pairing: JSON list in *repeatFilename*, each entry
           [inIndices, outIndices]
    Output (written to folderName + repeatSpec as a JSON list, one entry
    per processed repeat):
        [flankingList, allPassList, repeatPathway, flankingPathsList]

    Only 2-in/2-out and 1-in/1-out repeats are processed; the 1/1 case is
    padded by duplicating its single in and out index.  All other entries
    are skipped without output.

    Steps per repeat:
        1. markReachableIndices with the tags "<k>_in" / "<k>_out"
        2. markInsideNodes
        3. markStartEndNodes
        4. markInterior      (repeat interior between start and end)
        5. markFlankingRegion
        6. markAssociatedReads

    Args:
        folderName: prefix put directly in front of every file name.
        mummerLink: accepted but not used in this body.
        contigFilename: base name; "<contigFilename>_Double.fasta" gives N1.
        contigReadGraph: name passed to seqGraph.loadFromFile.
        repeatFilename: JSON file read as the repeat pairing.
        repeatSpec: JSON file written with the collected profiles.

    NOTE(review): a two-argument function with the same name appears later
    in this source.  If both live in one module the later definition
    replaces this one - confirm which is intended.
    """
    print "defineRepeatAndFlanking: "

    # 0. Load previous data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = formReverseGraph(G)

    with open(folderName + repeatFilename, 'r') as json_data:
        repeatList = json.load(json_data)

    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)
    print "len(repeatList)", len(repeatList), repeatList

    bigDumpList = []
    for r in repeatList:
        # Explicit floor division: same result as Python 2 "/" on ints
        rIn = [eachitem // 2 for eachitem in r[0]]
        rOut = [eachitem // 2 for eachitem in r[1]]

        isTwoByTwo = len(rIn) == 2 and len(rOut) == 2
        isOneByOne = len(rIn) == 1 and len(rOut) == 1
        if not (isTwoByTwo or isOneByOne):
            continue

        print rIn, rOut
        if isOneByOne:
            # Pad to the 2/2 shape the later steps index into
            rIn = [rIn[0], rIn[0]]
            rOut = [rOut[0], rOut[0]]

        # 1. Records reachable indices
        kkIn = [str(eachkk) + "_" + "in" for eachkk in rIn]
        kkOut = [str(eachkk) + "_" + "out" for eachkk in rOut]
        markReachableIndices(G, Grev, kkIn, kkOut, N1)

        # 2. Marks inside nodes
        singleMissList, allPassList = markInsideNodes(G, kkIn, kkOut)
        for i in range(4):
            print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)

        # 3. Finds start/end of repeat
        myStartIndex, myEndIndex = markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
        print myStartIndex, myEndIndex

        # 4. Find repeat interior by shortest path joining S/E
        repeatPathway = markInterior(G, myStartIndex, myEndIndex, N1)
        print "repeatPathway", repeatPathway

        # 5. Find flanking region by shortest path search again
        flankingPathsList = markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
        print flankingPathsList

        # 6. Find associated reads by graph node query
        flankingList, _unusedRepeatReads = markAssociatedReads(G, singleMissList, allPassList)

        # Experimental (kept from the original): the repeat reads reported
        # by markAssociatedReads are replaced by allPassList.  A separate
        # name is used so the list being iterated is no longer rebound.
        repeatReads = allPassList

        for eachlist in flankingList:
            print len(eachlist), len(repeatReads)

        bigDumpList.append([flankingList, repeatReads, repeatPathway, flankingPathsList])

    # 7. Format return and move on to the phasing
    with open(folderName + repeatSpec, 'w') as outfile:
        json.dump(bigDumpList, outfile)
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    """Cluster contigs into repeat groups and save them as JSON.

    Input : graph saved under *contigReadGraph*
    Output: folderName + repeatFilename, a JSON list whose entries are
            [distinct even nodes, distinct odd nodes] of one cluster
    Algorithm:
        a) Reachability test: findAllReachable(i, N1, G) for every i in
           range(N1), N1 being the number of entries of
           "<contigFilename>_Double.fasta".
        b) Bipartite graph on 2 * N1 nodes: for each partner j of i, the
           nodes 2 * i and 2 * j + 1 are linked in both directions.
        c) Connected components of that graph are the repeat groups.

    Args:
        folderName: prefix put directly in front of every file name.
        mummerLink: accepted but not used in this body.
        contigFilename: base name of the doubled contig file.
        contigReadGraph: name passed to seqGraph.loadFromFile.
        repeatFilename: JSON file to write.
        optionToRun: "tandem" keeps the reachability lists as they are;
            "xphase" passes them through filterEdge first.

    Raises:
        ValueError: optionToRun is neither "tandem" nor "xphase".
            (Previously this surfaced later as an UnboundLocalError.)
        AssertionError: the JSON read back differs from what was written.
    """
    if optionToRun not in ("tandem", "xphase"):
        raise ValueError("optionToRun must be 'tandem' or 'xphase', got %r" % (optionToRun,))

    # (a) reachability test to find partners
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    adjacencyList = []
    for i in range(N1):
        adjacencyList.append(findAllReachable(i, N1, G))
        print "i, adjacencyList[i] : ", i, adjacencyList[i]

    # (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    else:
        newAdjacencyList = filterEdge(adjacencyList, folderName, contigFilename)

    G2 = commonLib.seqGraph(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    # (c) connected components, split by node parity
    clusters = G2.findConnectedComponents()

    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0:
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)
        repeatList.append([getDistinct(leftList), getDistinct(rightList)])

    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    # Sanity check kept from the original: the file must round-trip
    with open(folderName + repeatFilename, 'r') as json_data:
        loadData = json.load(json_data)
    assert(loadData == repeatList)
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    """Build the string graph over contigs and reads and save it.

    Input : <contigFilename>.fasta and <readsetFilename>.fasta in folderName
    Output: graph saved through G.saveToFile(folderName, graphName)
    Algorithm:
        a) Write the "_Double" version of both files
           (commonLib.writeToFile_Double1).
        b) Run MUMmer three times - contig/contig (CC), read/read (RR) and
           contig/read (CR) - and filter each record list with filterData
           against the matching length table.
        c) Feed the three record lists to addDataToList.
        d) Save the graph, then run checkGraphLength.

    Example records (from the original author):
        CC [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
        RR [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
        CR [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']

    Args:
        folderName: folder / prefix handed to the commonLib helpers.
        mummerLink: forwarded to commonLib.useMummerAlign.
        contigFilename, readsetFilename: base names without ".fasta".
        optTypeFileHeader: prefix of the three alignment headers
            (suffix "CC", "RR", "CR"; results are read from "<header>Out").
        graphName: name under which the graph is saved.
    """
    contigDouble = contigFilename + "_Double.fasta"
    readDouble = readsetFilename + "_Double.fasta"

    # a) doubled contigs and reads
    commonLib.writeToFile_Double1(folderName, contigFilename + ".fasta", contigDouble, "contig")
    commonLib.writeToFile_Double1(folderName, readsetFilename + ".fasta", readDouble, "reads")

    def runAlignment(tag, referenceFile, queryFile):
        # Align one pair of files; return the name the records are read from
        header = optTypeFileHeader + tag
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
        return header + "Out"

    # b) contig vs contig
    outNameCC = runAlignment("CC", contigDouble, contigDouble)
    lenDicCC = commonLib.obtainLength(folderName, contigDouble)
    dataListCC = filterData(commonLib.extractMumData(folderName, outNameCC), lenDicCC)

    # read vs read
    outNameRR = runAlignment("RR", readDouble, readDouble)
    lenDicRR = commonLib.obtainLength(folderName, readDouble)
    dataListRR = filterData(commonLib.extractMumData(folderName, outNameRR), lenDicRR)

    # contig vs read, filtered against the merged length table
    outNameCR = runAlignment("CR", contigDouble, readDouble)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = filterData(commonLib.extractMumData(folderName, outNameCR), lenDicCR)

    N1, N2 = len(lenDicCC), len(lenDicRR)
    numberOfNodes = len(lenDicCR)
    G = commonLib.seqGraph(numberOfNodes)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes

    # c) the two numeric arguments look like index offsets for the two
    #    record names (0 for contigs, N1 for reads) - confirm in addDataToList
    for records, firstOffset, secondOffset, firstKind, secondKind in (
            (dataListCC, 0, 0, 'C', 'C'),
            (dataListRR, N1, N1, 'R', 'R'),
            (dataListCR, 0, N1, 'C', 'R')):
        addDataToList(records, G, firstOffset, secondOffset, firstKind, secondKind)

    # d) persist, then check
    G.saveToFile(folderName, graphName)
    checkGraphLength(G, N1, lenDicRR)

    print "len(G.graphNodesList)", len(G.graphNodesList)
def outputResults(folderName, mummerLink, toPhaseList, N1, G):
    """Turn the phasing decisions into a path graph and emit improved4.fasta.

    Algorithm:
        a) Write contigs and reads into one file: copy
           improved3_Double.fasta to contigAndRead_Double.fasta, then
           append phasingSeedName_Double.fasta with each header renamed
           to ">Contig<number + N1 // 2>_<suffix>".
        b) Add back the reverse complement of every phased item.
        c) Create G2 holding the chosen paths.
        d) Output the contigs through commonLib.readContigOut.

    Args:
        folderName: prefix put directly in front of every file name.
        mummerLink: forwarded to commonLib.readContigOut.
        toPhaseList: list of items whose last three entries are
            repeat   - list of node indices,
            flanking - four lists of node indices (read here as in1, in2,
                       out1, out2 - presumably; confirm with the producer),
            result   - 0 pairs flank 0 with 2 and 1 with 3, any other
                       value pairs 0 with 3 and 1 with 2.
        N1: number of nodes G2 starts with (an int; it is used in range()).
        G: graph whose ``listOfNextNodes`` supply the edge weights.

    Example from the original author:
        [28], [[2, 690, 28], [6, 126, 28], [28, 212, 0], [28, 216, 4]], 1
        [2 , 690, 28, 212, 0]
    """
    # a)
    combinedName = "contigAndRead_Double.fasta"
    os.system("cp " + folderName + "improved3_Double.fasta " + folderName + combinedName)

    # Context managers close both files even if a header fails to parse
    with open(folderName + combinedName, 'a') as fout:
        with open(folderName + "phasingSeedName_Double.fasta", 'r') as fin:
            tmp = fin.readline().rstrip()
            # As in the original, reading stops at the first empty line
            while len(tmp) > 0:
                if tmp[0] != ">":
                    fout.write(tmp + "\n")
                else:
                    # Drops the first five characters (presumably ">Read")
                    # and splits "<number>_<suffix>"
                    infoArr = tmp[5:].split("_")
                    # Explicit floor division; same value as Python 2 "/" on ints
                    fout.write(">Contig" + str(int(infoArr[0]) + N1 // 2))
                    fout.write("_" + infoArr[1] + "\n")
                tmp = fin.readline().rstrip()

    # b)
    completePhaseList = []
    for eachitem in toPhaseList:
        repeat, flanking, result = eachitem[-3], eachitem[-2], eachitem[-1]

        # x + (-1) ** x maps an even index to x + 1 and an odd one to x - 1;
        # the order is reversed as well
        revrepeat = [eachsub + pow(-1, eachsub) for eachsub in repeat[::-1]]

        revflanking = [[] for i in range(4)]
        for j in range(2):
            # flank j + 2 becomes flank j on the other strand and vice versa
            revflanking[j] = [eachsub + pow(-1, eachsub) for eachsub in flanking[j + 2][::-1]]
            revflanking[j + 2] = [eachsub + pow(-1, eachsub) for eachsub in flanking[j][::-1]]

        revresult = result

        completePhaseList.append([repeat, flanking, result])
        completePhaseList.append([revrepeat, revflanking, revresult])

    print "completePhaseList", completePhaseList

    # c)
    G2 = commonLib.seqGraph(N1)
    # nameDic maps a G2 node index to the index of the node it stands for
    nameDic = {}
    for i in range(N1):
        nameDic[i] = i

    for eachitem in completePhaseList:
        repeat, flanking, result = eachitem[0], eachitem[1], eachitem[2]

        path = [[], []]
        if result == 0:
            path[0] = flanking[0][0:-1] + repeat + flanking[2][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[3][1:]
        else:
            path[0] = flanking[0][0:-1] + repeat + flanking[3][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[2][1:]

        print path[0], path[1]

        for eachpath in path:
            currentNode = G2.graphNodesList[eachpath[0]]
            remainder = eachpath[1:]
            lastStep = len(remainder) - 1

            for ctr, nextNodeIndex in enumerate(remainder):
                if ctr != lastStep:
                    # Interior step: a fresh G2 node stands in for nextNodeIndex
                    myindex = len(G2.graphNodesList)
                    nameDic[myindex] = nextNodeIndex
                    newNode = commonLib.seqGraphNode(myindex)
                    G2.graphNodesList.append(newNode)
                else:
                    # Final step: reuse the existing node
                    newNode = G2.graphNodesList[nextNodeIndex]

                # Weight of the matching edge in G; 0 when none is found
                wt = 0
                for eachck in G.graphNodesList[nameDic[currentNode.nodeIndex]].listOfNextNodes:
                    if eachck[0] == nextNodeIndex:
                        wt = eachck[1]
                        break

                newNode.listOfPrevNodes.append([currentNode.nodeIndex, wt])
                currentNode.listOfNextNodes.append([newNode.nodeIndex, wt])
                currentNode = newNode

    # d)
    graphFileName = "phaseGraphFinal"
    G2.condense()
    G2.saveToFile(folderName, graphFileName)
    commonLib.readContigOut(folderName, mummerLink, graphFileName, combinedName,
                            "improved4.fasta", "outOpenListphaing", nameDic)
def defineRepeatAndFlanking(folderName, mummerLink): ''' Input : V a) String graph : G V b) Repeat Pairing : repeatList Output : V a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) V b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] ) V c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24]) Algorithm : V 1. Find repeat by graph operations V 2. Find flanking region by graph operations V 3. Find associated reads by graph operations ''' print "defineRepeatAndFlanking: " # 0. Load previous data G = commonLib.seqGraph(0) G.loadFromFile(folderName, "phaseStringGraph1") Grev = formReverseGraph(G) json_data = open(folderName + 'phaseRepeat.txt', 'r') repeatList = json.load(json_data) lenDicCC = commonLib.obtainLength(folderName, "improved3_Double.fasta") N1 = len(lenDicCC) print "repeatList: ", repeatList print "len(G.graphNodesList)", len(G.graphNodesList) bigDumpList = [] print "len(repeatList)",len(repeatList) , repeatList for r in repeatList: rIn, rOut = [], [] for eachitem in r[0]: rIn.append(eachitem / 2) for eachitem in r[1]: rOut.append(eachitem / 2) if len(rIn) == 2 and len(rOut) == 2: print rIn, rOut # 1. Records reachable indices kkIn , kkOut = [],[] ''' for eachnext in G.graphNodesList[4].listOfNextNodes: print 4, eachnext kkIn.append(eachnext[0]) for eachprev in G.graphNodesList[6].listOfPrevNodes: print 6, eachprev kkOut.append(eachprev[0]) print set(kkIn).intersection(set(kkOut)) print len( G.graphNodesList[0].listOfNextNodes), len( G.graphNodesList[2].listOfNextNodes) print len( G.graphNodesList[1].listOfPrevNodes), len( G.graphNodesList[3].listOfPrevNodes) print len( Grev.graphNodesList[0].listOfPrevNodes), len( Grev.graphNodesList[2].listOfPrevNodes) print len( Grev.graphNodesList[1].listOfNextNodes), len( Grev.graphNodesList[3].listOfNextNodes) ''' markReachableIndices(G, Grev, rIn, rOut, N1) # 2. 
Marks inside nodes singleMissList, allPassList = markInsideNodes(G, rIn, rOut) for i in range(4): print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList) # 3. Finds start/end of repeat myStartIndex, myEndIndex = markStartEndNodes(G, rIn, rOut, singleMissList, allPassList) print myStartIndex, myEndIndex # 4. Find repeat interior by shortest path joining S/E repeatPathway = markInterior(G , myStartIndex, myEndIndex, N1) print "repeatPathway", repeatPathway checkPathLength(repeatPathway, G, N1, folderName) # 5. Find flanking region by shortest path search again flankingPathsList = markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1) print flankingPathsList # 6. Find associated reads by graph node query flankingList, repeatList = markAssociatedReads(G, singleMissList, allPassList) ### Experimental repeatList = allPassList ### End Experimental for eachlist in flankingList: print len(eachlist), len(repeatList) bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList]) # 7. Format return and move on to the phasing with open(folderName + 'repeatSpecification.txt', 'w') as outfile: json.dump(bigDumpList, outfile)
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    """Estimate tandem-repeat copy content per repeat profile and join the contigs.

    Reads the graph *contigReadGraph*, the doubled contig/read FASTA files,
    the previously produced "<optTypeFileHeader>RROut" and
    "<optTypeFileHeader>CROut" alignment records and the JSON profile list
    in *repeatSpec*.  For each profile it builds a repeat template from a
    path returned by DFSwithPath, aligns the profile's reads against
    `maxDuplicate` back-to-back copies of it, derives the ratio `ct`, and
    stores the resulting repeat slice for the start contig.  Finally the
    contigs are concatenated and written to "tademResolved.fasta".

    Files written in folderName: tandemRepeatTemplate.fasta,
    firstOverlap.fasta, tademResolved.fasta (plus whatever
    commonLib.putListToFileO / useMummerAlign produce).

    Args:
        folderName: prefix put directly in front of every file name.
        mummerPath: forwarded to commonLib.useMummerAlign.
        contigReadGraph: name passed to seqGraph.loadFromFile.
        contigFilename, readsetFilename: base names; "_Double.fasta" is appended.
        optTypeFileHeader: prefix of the "RR" / "CR" alignment headers.
        repeatSpec: JSON file holding the repeat profiles.

    NOTE(review): the original line breaks and indentation of this function
    were lost.  The nesting below is reconstructed from the data flow; in
    particular steps 3) to 6) are placed under `if isTerminate:` because
    `tandemPath` is only assigned there - confirm against the upstream file.
    NOTE(review): `/` on indices relies on Python 2 integer division.
    """
    print "resolvingTandem"
    # The string below is a no-op expression statement kept from the original
    ''' Input : repeat info Output : count, join. Algorithm: 1. Find loops 2. Form repeat 3. Form chain of repeat copies back to back 4. Align reads 5. Calculate extra bases beyond flanking region 6. Calculate count 7. Join the contigs '''
    # 0 ) Load all the data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)

    # Number of back-to-back template copies written in step 3
    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = commonLib.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")
    lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta")

    # Read/read records -> {"<name1>;<name2>": record[4]}, kept only when
    # record[1] > record[3]
    header = optTypeFileHeader + "RR"
    dataListRR = commonLib.extractMumData(folderName, header + "Out")
    dataListRR = newPhasing.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[1] > eachitem[3]:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    # Contig/read records, same keying (lenDicCC is loaded a second time here)
    header = optTypeFileHeader + "CR"
    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = commonLib.extractMumData(folderName, header + "Out")
    dataListCR = newPhasing.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[1] > eachitem[3]:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic

    # NOTE(review): this handle is never closed explicitly
    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)

    contigsTmp = commonLib.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = commonLib.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    # contig name -> repeat sequence to append in step 7
    happyTandemList = {}

    for eachrepProfile in loadData:
        # 1) search from the first node of the profile's first flanking path
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)

        # 2) keep the path from the first occurrence of its last node onward
        if isTerminate:
            v = returnPathList[-1]
            i =0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    # jump past the end: acts as a break
                    i = len(returnPathList)
                i = i +1

            print returnPathList
            print tandemPath

            # 3) [fix it when have time later ; to just use graph; bug at the min thing]
            # Concatenate each read on the path minus its overlap with the next read
            repeatContent = ""
            for kk in range(len(tandemPath[0:-1])):
                eachitem = tandemPath[kk]- N1
                nextitem = tandemPath[kk+1] - N1
                readName = "Read" + str(eachitem/2) + "_"
                nextReadName = "Read" + str(nextitem/2) + "_"

                # even index -> "p" name, odd index -> "d" name
                if eachitem %2 ==0 :
                    readName = readName + "p"
                elif eachitem %2 ==1:
                    readName = readName + "d"

                if nextitem %2 ==0 :
                    nextReadName = nextReadName + "p"
                elif nextitem %2 ==1:
                    nextReadName = nextReadName + "d"

                overlap = dataListRRDic[readName + ";" + nextReadName]
                print overlap
                repeatContent = repeatContent + myContigsDic[readName][0:-overlap]

            print "len(repeatContent)", len(repeatContent)

            # Template file: maxDuplicate copies in a single record
            fout = open(folderName + repeatTempFilename, 'w')
            fout.write(">RepeatSegment\n")
            repeatContentLarge = ""
            for i in range(maxDuplicate):
                fout.write(repeatContent)
                repeatContentLarge= repeatContentLarge + repeatContent
            fout.close()

            # 4) align the profile's repeat reads against the template
            repeatReadList = eachrepProfile[1]
            myList= []
            for eachitem in repeatReadList:
                readName = "Read" + str((eachitem- N1)/2) + "_"
                # NOTE(review): parity is tested on eachitem, not on
                # eachitem - N1; the two agree only when N1 is even - confirm
                if eachitem %2 ==0 :
                    readName = readName + "p"
                elif eachitem %2 ==1:
                    readName = readName + "d"
                myList.append(readName)

            commonLib.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)

            if True:
                commonLib.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")

            dataList = commonLib.extractMumData(folderName, mummerFile+"Out")

            # 5) per distinct last field of the records, take the largest
            #    record[5] and sum these maxima
            totalBasesMatch = 0
            lrepeat = len(repeatContent)
            c = 50 # Important parameters : FIX needed in production
            #lengthDic = commonLib.obtainLength(folderName, readsetFilename+"_Double.fasta")
            print "dataList[0]", dataList[0]
            dataList.sort(key = itemgetter(-1))
            for key, values in groupby(dataList,itemgetter(-1)):
                maxValue = -1
                for eachsub in values:
                    if eachsub[5] > maxValue:
                        maxValue = eachsub[5]
                #print key, maxValue
                totalBasesMatch = totalBasesMatch + maxValue

            print c, lrepeat, totalBasesMatch
            # ct = matched bases / (c * template length)
            ct = totalBasesMatch*1.0/(c*lrepeat)
            print "BIG NUMBER of THE DAY: ", ct

            # 6)
            # a) find the starting point
            startContig = eachrepProfile[-1][0][0]
            firstRead = eachrepProfile[-1][0][1]-N1

            contigName = "Contig"+ str(startContig/2)
            if startContig %2 == 0:
                contigName = contigName + "_p"
            elif startContig%2 ==1:
                contigName = contigName + "_d"

            readName = "Read"+ str(firstRead/2)
            if firstRead %2 == 0:
                readName = readName + "_p"
            elif firstRead%2 ==1:
                readName = readName + "_d"

            # Contig minus its overlap with the first read, followed by that read
            overlapFirst = dataListCRDic[contigName+";"+readName]
            tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]

            f1 = open(folderName + "firstOverlap.fasta", 'w')
            f1.write(">combined\n")
            f1.write(tmpCombine)
            f1.close()

            if True:
                commonLib.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")

            dataList = commonLib.extractMumData(folderName, "myFirstOverlap"+"Out")

            # Pick the record with the largest record[5]
            dataList.sort(key = itemgetter(0))
            maxVal = -1
            maxItm = []
            for eachi in dataList:
                if eachi[5] > maxVal:
                    maxVal = eachi[5]
                    maxItm = eachi

            print maxItm
            # maxItm stays [] when dataList is empty, so the next line would
            # raise IndexError in that case
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]

            # b) format return : prepare the repeat template
            print "ct*lrepeat", int(repeatStart + ct*lrepeat)
            print "repeatStart", repeatStart
            happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
            contigsTmp[contigName] = tmpCombine[0:contigEnd]
            print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
            print "len(happyTandemList[contigName])", len(happyTandemList[contigName])

    # 7) Combine all the repeat information and do the join
    # leaderList[k] is the contig id whose sequence currently carries contig k
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]

        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)

        # NOTE(review): raises KeyError when step 6 stored nothing for startName
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]

        if endContig != leaderContig:
            # Append the end contig to the leader and blank it out
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig

    # leaderAgg[k] lists every contig id whose leader is k
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)

    # One flag per id/2, so both ids sharing that value are emitted once
    checkingList = [False for i in range(N1)]

    fout = open(folderName + "tademResolved.fasta", 'w')
    counter = 0
    for eachcontig in contigsTmp:
        # NOTE(review): `id` shadows the builtin of the same name
        id = newPhasing.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id/2] == False:
            fout.write(">Segkk"+str(counter)+ "\n")
            # NOTE(review): no "\n" is written after the sequence; unless the
            # stored sequences already end with one, the next header follows
            # on the same line - confirm
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True

    fout.close()