def coloringNodes(): folderName = "Apr10Test/" if False: alignerRobot.useMummerAlign("/usr/bin/", folderName, "debug", "reference.fasta", "LC_n.fasta") dataList = alignerRobot.extractMumData(folderName, "debugOut") dataList.sort(key = itemgetter(-1)) mappedDic = {} for key, items in groupby(dataList, itemgetter(-1)): print "key", key matchLen = -1 for eachitem in items: if eachitem[-4] > matchLen: mappedDic[key] = eachitem[-2] matchLen = eachitem[-4] for eachitem in mappedDic: if mappedDic[eachitem] == 'c3': print str(int(eachitem[5:])*2)+"{color:blue}" print str(int(eachitem[5:])*2+1)+"{color:blue}" if mappedDic[eachitem] == 'c1': print str(int(eachitem[5:])*2)+"{color:green}" print str(int(eachitem[5:])*2+1)+"{color:green}"
def formExtraEdges( folderName="/home/kakitfive/kkdata2/MetaFinisherSC/dataFolderBackup/", optTypeFileHeader="phaseString", contigFilename="improved3", G=[], N1=0): dataList = alignerRobot.extractMumData(folderName, optTypeFileHeader + "CR" + "Out") dataList.sort(key=itemgetter(-2)) lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") count = 0 tmpItem = [] embedContig2ReadDic, read2EmbedContigDic = {}, {} for key, items in groupby(dataList, itemgetter(-2)): isEmbedded = False for eachitem in items: #print eachitem if eachitem[4] > lenDic[key] - 300: isEmbedded = True tmpItem = eachitem if isEmbedded: count = count + 1 readName = tmpItem[-1] embedContig2ReadDic[key] = readName read2EmbedContigDic[readName] = key print "len(embedContig2ReadDic)", len(embedContig2ReadDic) #assert(False) for contigName in embedContig2ReadDic: readName = embedContig2ReadDic[contigName] readIndex, contigIndex = abunHouseKeeper.parseEdgeNameToID( readName, 'R'), abunHouseKeeper.parseEdgeNameToID(contigName, 'C') for eachprev in G.graphNodesList[readIndex].listOfPrevNodes: idNode, wt = eachprev[0], eachprev[1] if idNode < N1: G.insertEdge(idNode, contigIndex, wt) for eachnext in G.graphNodesList[readIndex].listOfNextNodes: idNode, wt = eachnext[0], eachnext[1] if idNode < N1: G.insertEdge(contigIndex, idNode, wt) return G
def decideCut(folderName, mummerPath): ''' Input : directPath.fasta, indirectPath.fasta Output : toDelete ''' thres = 50 if True: alignerRobot.useMummerAlign(mummerPath, folderName, \ "indirectvsdirect", "indirectPath.fasta", "directPath.fasta", specialForRaw = False, specialName = "", refinedVersion= True) dataList = alignerRobot.extractMumData(folderName , "indirectvsdirectOut") lenDic = IORobot.obtainLength(folderName, "directPath.fasta") ctr =0 ctrindirect = 0 dataList.sort(key = itemgetter(-1)) toDelete = True for key, items in groupby(dataList, itemgetter(-1)): print "key", key ctr = ctr + 1 isFound = False for eachitem in items: if eachitem[2] < thres and eachitem[3] > lenDic[key] - thres: isFound = True if isFound: ctrindirect = ctrindirect + 1 epsilon = 1.1 print "ctrindirect, ctr", ctrindirect, ctr if ctrindirect*1.0/ctr < (1- epsilon): toDelete = False else: toDelete = True return toDelete
def mapStrangePairs(): folderName = "Apr10Test/" json_data = open(folderName + "furtherGapList.json", 'r') furtherGapList = json.load(json_data) segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta") f = open(folderName + "wrongCondense.fasta", 'w') ctr = 0 for eachitem in furtherGapList: beforeI, afterI = eachitem[0], eachitem[1] f.write(">Segkk"+str(ctr)+"\n") f.write(segLookUp[beforeI]+"\n") ctr = ctr + 1 f.write(">Segkk"+str(ctr)+"\n") f.write(segLookUp[afterI]+"\n") ctr = ctr + 1 f.close() if False: alignerRobot.useMummerAlign("/usr/bin/", folderName, "wrongCondenseDebug", "reference.fasta", "wrongCondense.fasta") dataList = alignerRobot.extractMumData(folderName, "wrongCondenseDebugOut") dataList.sort(key = itemgetter(-1)) mappedDic = {} for key, items in groupby(dataList, itemgetter(-1)): print "key", key matchLen = -1 for eachitem in items: if eachitem[-4] > matchLen: mappedDic[key] = eachitem matchLen = eachitem[-4] for eachitem in mappedDic: print "results : ", eachitem, mappedDic[eachitem]
def formSortedDataList(folderName):
    '''
    Load the phaseStringCROut alignment records and return:

      sortedContigList : records sorted by contig name (field -2)
      sortedReadList   : records sorted by read name (field -1)
      sortedContigDic  : contig name -> index of its first record in sortedContigList
      sortedReadDic    : read name   -> index of its first record in sortedReadList

    Fix: the original indexed element [0] unconditionally and raised
    IndexError when the data file yielded no records; empty input now
    returns empty lists/dicts.
    '''
    dataList = alignerRobot.extractMumData(folderName, "phaseStringCROut")

    # Sorting makes records with equal keys contiguous, so recording each
    # key's first occurrence reproduces the original block-start indices.
    sortedContigList = sorted(dataList, key=itemgetter(-2))
    sortedContigDic = {}
    for i, record in enumerate(sortedContigList):
        if record[-2] not in sortedContigDic:
            sortedContigDic[record[-2]] = i

    sortedReadList = sorted(dataList, key=itemgetter(-1))
    sortedReadDic = {}
    for i, record in enumerate(sortedReadList):
        if record[-1] not in sortedReadDic:
            sortedReadDic[record[-1]] = i

    return sortedContigList, sortedReadList, sortedContigDic, sortedReadDic
def formSortedDataList(folderName):
    '''
    Build sorted views of the phaseStringCROut alignment records plus
    first-occurrence index dictionaries keyed by contig name (field -2)
    and read name (field -1).

    Returns (sortedContigList, sortedReadList, sortedContigDic, sortedReadDic).
    '''
    # NOTE(review): near-verbatim duplicate of the formSortedDataList defined
    # earlier in this file; being the later definition, this copy is the one
    # in effect at import time.
    sortedContigList, sortedReadList, sortedContigDic, sortedReadDic =\
        [], [] , {}, {}
    dataList = alignerRobot.extractMumData(folderName, "phaseStringCROut")

    # Sort by contig name; record the index where each contig's block begins.
    sortedContigList = sorted(dataList, key = itemgetter(-2))
    sortedContigDic[sortedContigList[0][-2]] = 0  # IndexError if dataList is empty
    for i in range(1, len(sortedContigList)):
        if sortedContigList[i][-2] != sortedContigList[i-1][-2]:
            sortedContigDic[sortedContigList[i][-2]] = i

    # Same first-occurrence indexing, keyed by read name.
    sortedReadList = sorted(dataList, key = itemgetter(-1))
    sortedReadDic[sortedReadList[0][-1]] = 0
    for i in range(1, len(sortedReadList)):
        if sortedReadList[i][-1] != sortedReadList[i-1][-1]:
            sortedReadDic[sortedReadList[i][-1]] = i

    return sortedContigList, sortedReadList, sortedContigDic, sortedReadDic
def generateAbundanceGraph(folderName, mummerLink):
    # Map raw reads back onto improved3.fasta in 20 parallel MUMmer batches,
    # then derive per-contig coverage counts and dump them to myCountDic.json.
    # Returns myCountDic (contig name -> coverage data from evaluateCoverage).
    print "generateAbundanceGraph"

    # NOTE: the triple-quoted strings below are not docstrings (they follow
    # executable statements); they are the author's plan/commentary.
    """
    1. Find your favorite mappers to map read back
    a. MUMmer, Bowtie, bbmap, any that works V
    b. And then write a short parser to parse the results V
    """
    numberOfFiles = 20
    workerList = []

    # Build one alignment job per read partition; partitions are numbered
    # with two-digit, zero-padded suffixes ("01".."20").
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)

        """
        "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", "fromMumRefine" + indexOfMum
        """
        outputName, referenceName, queryName, specialName = (
            "outAbun" + indexOfMum,
            "improved3.fasta",
            "raw_reads.part-" + indexOfMum + ".fasta",
            "outAbun" + indexOfMum,
        )
        workerList.append([outputName, referenceName, queryName, specialName])

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)

    """
    command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta"
    os.system(command)
    command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum
    os.system(command)
    """

    # Concatenate the parsed alignment records from all partitions.
    dataList = []
    for i in range(1, 1 + numberOfFiles):
        if i < 10:
            indexOfMum = "0" + str(i)
        else:
            indexOfMum = str(i)
        dataList = dataList + alignerRobot.extractMumData(folderName, "outAbun" + str(indexOfMum) + "Out")

    """
    2. Calculate count on the abundances
    a. Aggregate by taking average [put weights on bin along contigs]
    b. Inheritance and a subclass
    """
    lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")

    # Per-base zero counters per contig (overwritten by evaluateCoverage below).
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])]

    thres = 30      # NOTE(review): thres and lenSum appear unused here
    lenSum = 0
    extraDataList = []
    print "len(dataList)", len(dataList)

    # Optional refinement pass: first evaluate with refine=True, which also
    # produces "abunMissOut" records that are folded into a second pass.
    if not abunHouseKeeper.abunGlobalAvoidrefine:
        myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True)
        extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut")
    else:
        extraDataList = []

    dataList = dataList + extraDataList
    myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False)

    with open(folderName + "myCountDic.json", "w") as f:
        json.dump(myCountDic, f)

    return myCountDic
def getAllAssociatedReads(folderName, mummerLink,forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta
    Output : all_associated_reads.fasta

    Algorithm :
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i)  Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    '''
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"

    # Seed the search with the related-reads file produced by gapFiller.
    gapFiller.formRelatedReadsFile(folderName, mummerLink, "improved3")

    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)

    # Number of expansion rounds; each round aligns all raw-read partitions
    # against the current seed set and adds the matching reads.
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    print "N: ", N

    if N >0 :
        for trial in range(N):
            print "trial", trial
            numberOfFiles = houseKeeper.globalParallelFileNum

            if True:
                # One alignment job per raw-read partition; partitions use
                # two-digit zero-padded suffixes.
                workerList = []
                for dummyI in range(1, numberOfFiles + 1):
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)

                    outputName, referenceName, queryName, specialName= header+indexOfMum, referenceFile, "raw_reads.part-"+ indexOfMum + ".fasta", header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])

                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)

            # Concatenate the parsed alignment records from all partitions.
            dataList = []
            for i in range(1, 1+numberOfFiles):
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out")

            # Keep only records passing checkSatisfy, then collect the set of
            # distinct matching read names (groupby on sorted read name).
            filterList = []
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            print "len(dataList)", len(dataList)

            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)

            filterList.sort(key=itemgetter(-1))

            newReads = []
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)

            f = open(folderName + forFastaName + ".txt", 'w')
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()

            # Extract the named reads from raw_reads.fasta into the output
            # FASTA (perl one-liner selects records listed in the .txt file).
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        # No expansion requested: output is simply the related reads.
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName, needAlignment=True):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs

    Algorithm:
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V

    needAlignment : when False, skip the MUMmer runs and reuse the existing
                    "<header>Out" files on disk.
    '''
    G = []

    # a) Write doubled (forward + reverse) versions of contigs and reads.
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")

    # b-1) contig vs contig alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"

    #if needAlignment:
    #    alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    if needAlignment:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [[header, referenceFile, queryFile, ""]], houseKeeper.globalParallel)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    # b-2) read vs read alignment (optionally disabled globally).
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if needAlignment:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []

    # b-3) contig vs read alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if needAlignment:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    # c) Build the graph: contig nodes occupy IDs [0, N1), reads [N1, N1+N2).
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes

    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]

    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    '''

    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')

    # Extra contig edges for contigs embedded inside reads, then save (d).
    Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1)

    Gnew.saveToFile(folderName, graphName)
    print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs

    Algorithm:
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''
    # NOTE(review): duplicate of the formReadContigStringGraph defined earlier
    # in this file (without the needAlignment parameter, and with hardcoded
    # debug prints for Read164_p/Read159_p).  Whichever definition appears
    # later in the file wins at import time.
    G = []

    # a) Write doubled (forward + reverse) versions of contigs and reads.
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")

    # b-1) contig vs contig alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    # b-2) read vs read alignment (optionally disabled globally).
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName ,referenceFile, queryFile, mummerLink, header )
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)

        # Debug probe for one specific read pair; left in by the author.
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p" :
                print "debug" , eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p" :
                print "debug" , eachitem

        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []

    # b-3) contig vs read alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName ,referenceFile, queryFile, mummerLink, header )
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    # c) Build the graph: contig nodes occupy IDs [0, N1), reads [N1, N1+N2).
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes

    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]

    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    '''
    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]

    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()

    # d) Persist the graph and report basic stats.
    G.saveToFile(folderName, graphName)
    checkGraphLength(G, N1, lenDicRR)
    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
def resolvingTandem(
    folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec
):
    # Resolve tandem repeats: find repeat loops in the contig-read graph,
    # build a concatenated repeat template, estimate the copy count from read
    # coverage, and join the flanking contigs into tademResolved.fasta.
    print "resolvingTandem"

    # NOTE: not a docstring (it follows an executable statement); author plan.
    """
    Input : repeat info
    Output : count, join.

    Algorithm:
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    """
    # 0 ) Load all the data
    thres = 5
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)  # node IDs < N1 are contigs; >= N1 are reads

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    # Read-read overlaps with start offset below thres, keyed "nameA;nameB".
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    # Contig-read overlaps, same keying.
    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    print dataListCRDic

    json_data = open(folderName + repeatSpec, "r")
    loadData = json.load(json_data)

    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}

    for eachrepProfile in loadData:
        # 1) DFS from the repeat's start contig to detect a loop.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)

        # 2) Extract the cycle: suffix of the path starting at the repeated node.
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i + 1

            print returnPathList
            print tandemPath

        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        # NOTE(review): tandemPath is only assigned inside the isTerminate
        # branch above — this section presumably assumes the DFS terminated;
        # confirm against the caller.
        repeatContent = ""
        for kk in range(len(tandemPath[0:-1])):
            # Doubled-node ID -> read name: "Read<i/2>_p" (even) / "_d" (odd).
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"

            # Append each read minus its overlap with the next read.
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]

        print "len(repeatContent)", len(repeatContent)

        # Chain maxDuplicate copies of the repeat back to back as the template.
        fout = open(folderName + repeatTempFilename, "w")
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()

        # 4) Align the reads associated with this repeat to the template.
        repeatReadList = eachrepProfile[1]
        myList = []
        for eachitem in repeatReadList:
            readName = "Read" + str((eachitem - N1) / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)

        IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList)

        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta")

        dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out")

        # 5) Sum, per read, the best match length against the template.
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)

        # print "dataList[0]", dataList[0]
        dataList.sort(key=itemgetter(-1))
        for key, values in groupby(dataList, itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
            # print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue

        print c, lrepeat, totalBasesMatch
        # Estimated repeat copy count from coverage.
        ct = totalBasesMatch * 1.0 / (c * lrepeat)
        print "BIG NUMBER of THE DAY: ", ct

        # 6)
        # a) find the starting point
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1

        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"

        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"

        # Merge the start contig with its first repeat read, trimming overlap.
        overlapFirst = dataListCRDic[contigName + ";" + readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]

        f1 = open(folderName + "firstOverlap.fasta", "w")
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()

        if True:
            alignerRobot.useMummerAlign(
                mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta"
            )

        # Locate where the repeat begins inside the combined sequence.
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out")
        dataList.sort(key=itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi

        print maxItm
        if len(maxItm) > 0:
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1

        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct * lrepeat)
        print "repeatStart", repeatStart

        happyTandemList[contigName] = repeatContentLarge[repeatStart : int(repeatStart + ct * lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])

    # 7) Combine all the repeat information and do the join
    # Union-find-style leader list to merge joined contigs.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]

        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)

        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig

    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)

    checkingList = [False for i in range(N1)]

    # Emit one record per merged group; skip contigs already covered.
    fout = open(folderName + "tademResolved.fasta", "w")
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, "C")  # NOTE(review): shadows builtin id
        if checkingList[id / 2] == False:
            fout.write(">Segkk" + str(counter) + "\n")
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True
    fout.close()
def getAllAssociatedReads(folderName, mummerLink,forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta
    Output : all_associated_reads.fasta

    Algorithm :
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i)  Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    '''
    # NOTE(review): duplicate of the getAllAssociatedReads defined earlier in
    # this file; this copy hardcodes numberOfFiles = 20 and does not call
    # gapFiller.formRelatedReadsFile first.  The later definition wins at
    # import time.
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"

    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)

    # Number of expansion rounds over the raw-read partitions.
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    print "N: ", N

    if N >0 :
        for trial in range(N):
            print "trial", trial
            numberOfFiles = 20

            if True:
                # One alignment job per raw-read partition; partitions use
                # two-digit zero-padded suffixes.
                workerList = []
                for dummyI in range(1, numberOfFiles + 1):
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)

                    outputName, referenceName, queryName, specialName= header+indexOfMum, referenceFile, "raw_reads.part-"+ indexOfMum + ".fasta", header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])

                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)

            # Concatenate the parsed alignment records from all partitions.
            dataList = []
            for i in range(1, 1+numberOfFiles):
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out")

            # Keep only records passing checkSatisfy, then collect distinct
            # matching read names (groupby on sorted read name).
            filterList = []
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            print "len(dataList)", len(dataList)

            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)

            filterList.sort(key=itemgetter(-1))

            newReads = []
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)

            f = open(folderName + forFastaName + ".txt", 'w')
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()

            # Extract the named reads from raw_reads.fasta into the output
            # FASTA (perl one-liner selects records listed in the .txt file).
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        # No expansion requested: output is simply the related reads.
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    # NOTE(review): duplicate of the resolvingTandem defined earlier in this
    # file (same logic, different formatting); the later definition wins at
    # import time.
    print "resolvingTandem"

    # NOTE: not a docstring (it follows an executable statement); author plan.
    '''
    Input : repeat info
    Output : count, join.

    Algorithm:
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    '''
    # 0 ) Load all the data
    thres = 5
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)  # node IDs < N1 are contigs; >= N1 are reads

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    # Read-read overlaps with start offset below thres, keyed "nameA;nameB".
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    # Contig-read overlaps, same keying.
    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic

    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)

    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}

    for eachrepProfile in loadData:
        # 1) DFS from the repeat's start contig to detect a loop.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)

        # 2) Extract the cycle: suffix of the path starting at the repeated node.
        if isTerminate:
            v = returnPathList[-1]
            i =0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i +1

            print returnPathList
            print tandemPath

        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        # NOTE(review): tandemPath is only assigned inside the isTerminate
        # branch above — this section presumably assumes the DFS terminated;
        # confirm against the caller.
        repeatContent = ""
        for kk in range(len(tandemPath[0:-1])):
            # Doubled-node ID -> read name: "Read<i/2>_p" (even) / "_d" (odd).
            eachitem = tandemPath[kk]- N1
            nextitem = tandemPath[kk+1] - N1
            readName = "Read" + str(eachitem/2) + "_"
            nextReadName = "Read" + str(nextitem/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            if nextitem %2 ==0 :
                nextReadName = nextReadName + "p"
            elif nextitem %2 ==1:
                nextReadName = nextReadName + "d"

            # Append each read minus its overlap with the next read.
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]

        print "len(repeatContent)", len(repeatContent)

        # Chain maxDuplicate copies of the repeat back to back as the template.
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge= repeatContentLarge + repeatContent
        fout.close()

        # 4) Align the reads associated with this repeat to the template.
        repeatReadList = eachrepProfile[1]
        myList= []
        for eachitem in repeatReadList:
            readName = "Read" + str((eachitem- N1)/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            myList.append(readName)

        IORobot.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)

        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")

        dataList = alignerRobot.extractMumData(folderName, mummerFile+"Out")

        # 5) Sum, per read, the best match length against the template.
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)

        # print "dataList[0]", dataList[0]
        dataList.sort(key = itemgetter(-1))
        for key, values in groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue

        print c, lrepeat, totalBasesMatch
        # Estimated repeat copy count from coverage.
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct

        # 6)
        # a) find the starting point
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1]-N1

        contigName = "Contig"+ str(startContig/2)
        if startContig %2 == 0:
            contigName = contigName + "_p"
        elif startContig%2 ==1:
            contigName = contigName + "_d"

        readName = "Read"+ str(firstRead/2)
        if firstRead %2 == 0:
            readName = readName + "_p"
        elif firstRead%2 ==1:
            readName = readName + "_d"

        # Merge the start contig with its first repeat read, trimming overlap.
        overlapFirst = dataListCRDic[contigName+";"+readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]

        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()

        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")

        # Locate where the repeat begins inside the combined sequence.
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap"+"Out")
        dataList.sort(key = itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi

        print maxItm
        if len(maxItm) > 0 :
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1

        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart

        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])

    # 7) Combine all the repeat information and do the join
    # Union-find-style leader list to merge joined contigs.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]

        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)

        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig

    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)

    checkingList = [False for i in range(N1)]

    # Emit one record per merged group; skip contigs already covered.
    fout = open(folderName + "tademResolved.fasta", 'w')
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C')  # NOTE(review): shadows builtin id
        if checkingList[id/2] == False:
            fout.write(">Segkk"+str(counter)+ "\n")
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True
    fout.close()
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    '''
    Build the read/contig string graph and save it to disk.

    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs,
             saved under folderName as graphName.

    Algorithm:
        a) Form double (forward + reverse) reads and contigs          V
        b) Mummer the data and extract dataList three times           V
           (contig-contig "CC", read-read "RR", contig-read "CR")
        c) Use the subroutine to output a graph                       V
        d) Output the graph to a file phasing_String_graph.graph      V

    Side effects: runs MUMmer via alignerRobot/alignerSubRoutine and
    writes *_Double.fasta files plus the graph file into folderName.
    '''
    G = []
    # a) Double both input fasta files (append reverse-complement copies).
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")

    # b-CC) contig-vs-contig alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    # b-RR) read-vs-read alignment; can be disabled globally.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # NOTE(review): leftover debug trace for one specific read pair;
        # harmless but noisy — candidate for removal.
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p":
                print "debug", eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p":
                print "debug", eachitem
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []

    # b-CR) contig-vs-read alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
    #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    # Combined length dictionary: contigs occupy node ids [0, N1),
    # reads occupy [N1, N1+N2).  (Python 2 dict.items() concatenation.)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    # c) Assemble the graph: one node per doubled contig/read.
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes

    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]

    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']

    '''

    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]

    # for eachitem in dataListCC:
    #    print eachitem
    # Edge insertion: offsets map names into the node-id ranges above.
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    # d) Persist graph and run a sanity check on read lengths.
    G.saveToFile(folderName, graphName)

    checkGraphLength(G, N1, lenDicRR)

    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
def generateAbundanceGraph(folderName, mummerLink): print "generateAbundanceGraph" ''' 1. Find your favorite mappers to map read back a. MUMmer, Bowtie, bbmap, any that works V b. And then write a short parser to parse the results V ''' numberOfFiles = 20 workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) ''' "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", "fromMumRefine" + indexOfMum ''' outputName, referenceName, queryName, specialName= "outAbun"+indexOfMum, "improved3.fasta", "raw_reads.part-"+ indexOfMum + ".fasta", "outAbun" + indexOfMum workerList.append([outputName, referenceName, queryName, specialName]) if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False) ''' command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta" os.system(command) command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum os.system(command) ''' dataList = [] for i in range(1, 1+numberOfFiles): if i < 10: indexOfMum = "0" + str(i) else: indexOfMum = str(i) dataList = dataList+ alignerRobot.extractMumData(folderName, "outAbun"+ str(indexOfMum)+"Out") ''' 2. Calculate count on the abundances a. Aggregate by taking average [put weights on bin along contigs] b. 
Inheritance and a subclass ''' lenDic = IORobot.obtainLength(folderName, "improved3.fasta") readLenDic = IORobot.obtainLength(folderName , "raw_reads.fasta") myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])] thres = 30 lenSum = 0 extraDataList= [] print "len(dataList)", len(dataList) if not abunHouseKeeper.abunGlobalAvoidrefine: myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True) extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut" ) else: extraDataList = [] dataList = dataList + extraDataList myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink,False) with open(folderName + 'myCountDic.json', 'w') as f: json.dump(myCountDic, f) return myCountDic