def alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header):
    '''
    Split queryFile into parts, align every part against referenceFile with
    MUMmer in parallel, then merge the per-part coordinate files into one
    combined <header>Out file inside folderName.

    folderName    : working directory (expects trailing slash)
    referenceFile : fasta used as the MUMmer reference
    queryFile     : fasta to split and align (name must end in ".fasta")
    mummerLink    : path to the MUMmer binaries
    header        : prefix for per-part outputs and the combined result
    '''
    # alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    partCount = houseKeeper.globalParallelFileNum
    scriptDir = os.path.abspath(os.path.dirname(sys.argv[0]))
    splitCmd = (scriptDir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts "
                + str(partCount) + " " + folderName + queryFile)
    os.system(splitCmd)

    # fasta-splitter drops the part files in the CWD; move them into folderName.
    os.system("cp *.fasta " + folderName)
    os.system("rm *.fasta ")

    workerList = []
    for partIdx in range(1, partCount + 1):
        # fasta-splitter zero-pads part numbers to two digits; zfill matches that.
        tag = str(partIdx).zfill(2)
        partQuery = queryFile[0:-6] + ".part-" + tag + ".fasta"
        workerList.append([header + tag, referenceFile, partQuery, header + tag])

    alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList,
                                     houseKeeper.globalParallel,
                                     specialForRaw=False, refinedVersion=False)
    alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName,
                                        header, header + "Out", partCount)
def alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header):
    '''
    Split queryFile into parts, align each part against referenceFile with
    MUMmer in parallel, and merge the per-part coordinate files into a
    single <header>Out file.

    folderName    : working directory (expects trailing slash)
    referenceFile : fasta used as the MUMmer reference
    queryFile     : fasta to be split and aligned (name must end ".fasta")
    mummerLink    : path to the MUMmer binaries
    header        : prefix for the per-part and combined output files
    '''
    # alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    # FIX: was a hard-coded 20; use the shared parallel-file setting so the
    # number of split parts always matches the sibling definition and the
    # rest of the pipeline.
    numberOfFiles = houseKeeper.globalParallelFileNum
    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    command = (bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts "
               + str(numberOfFiles) + " " + folderName + queryFile)
    os.system(command)
    # fasta-splitter writes the part files to the CWD; move them into folderName.
    os.system("cp *.fasta " + folderName)
    os.system("rm *.fasta ")
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        # fasta-splitter zero-pads part numbers to two digits.
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        outputName, referenceName, queryName, specialName = (
            header + indexOfMum,
            referenceFile,
            queryFile[0:-6] + ".part-" + indexOfMum + ".fasta",
            header + indexOfMum,
        )
        workerList.append([outputName, referenceName, queryName, specialName])
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList,
                                     houseKeeper.globalParallel,
                                     specialForRaw=False, refinedVersion=False)
    alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName,
                                        header, header + "Out", numberOfFiles)
def generateAbundanceGraph(folderName, mummerLink): print "generateAbundanceGraph" """ 1. Find your favorite mappers to map read back a. MUMmer, Bowtie, bbmap, any that works V b. And then write a short parser to parse the results V """ numberOfFiles = 20 workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) """ "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", "fromMumRefine" + indexOfMum """ outputName, referenceName, queryName, specialName = ( "outAbun" + indexOfMum, "improved3.fasta", "raw_reads.part-" + indexOfMum + ".fasta", "outAbun" + indexOfMum, ) workerList.append([outputName, referenceName, queryName, specialName]) if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False) """ command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta" os.system(command) command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum os.system(command) """ dataList = [] for i in range(1, 1 + numberOfFiles): if i < 10: indexOfMum = "0" + str(i) else: indexOfMum = str(i) dataList = dataList + alignerRobot.extractMumData(folderName, "outAbun" + str(indexOfMum) + "Out") """ 2. Calculate count on the abundances a. Aggregate by taking average [put weights on bin along contigs] b. 
Inheritance and a subclass """ lenDic = IORobot.obtainLength(folderName, "improved3.fasta") readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta") myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])] thres = 30 lenSum = 0 extraDataList = [] print "len(dataList)", len(dataList) if not abunHouseKeeper.abunGlobalAvoidrefine: myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True) extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut") else: extraDataList = [] dataList = dataList + extraDataList myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False) with open(folderName + "myCountDic.json", "w") as f: json.dump(myCountDic, f) return myCountDic
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, continueFilter): myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = 0 dataList.sort(key=itemgetter(-1)) ctkk, ctbase = 0, 0 toAddBackDic = copy.deepcopy(readLenDic) for key, items in groupby(dataList, itemgetter(-1)): maxMatch = -1 bestname = "" for eachitem in items: ct = eachitem[6] / 100.0 * eachitem[4] if ct > maxMatch: maxMatch = ct bestname = eachitem[-2] myCountDic[bestname] += readLenDic[key] ctkk = ctkk + 1 ctbase = ctbase + readLenDic[key] toAddBackDic[key] = -1 cttot = 0 for eachitem in readLenDic: cttot = cttot + readLenDic[eachitem] print "Missed coverage ", (cttot - ctbase) / (4.7 * pow(10, 6)) print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 * len(readLenDic)) toAddReadList = [] for eachitem in toAddBackDic: if toAddBackDic[eachitem] >= 0: toAddReadList.append(eachitem) """ This part need the most parallelism because it is most intense with -l 10 split V, workerList V , combine """ if continueFilter: numberOfFiles = 20 IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw", toAddReadList) bindir = os.path.abspath(os.path.dirname(sys.argv[0])) command = ( bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "selected_raw.fasta" ) os.system(command) workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) outputName, referenceName, queryName, specialName = ( "outAbunRefine" + indexOfMum, "improved3.fasta", "selected_raw.part-" + indexOfMum + ".fasta", "abunMissOut" + indexOfMum, ) workerList.append([outputName, referenceName, queryName, specialName]) alignerRobot.useMummerAlignBatch( mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=True, refinedVersion=True ) alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles) 
for i in range(len(myCountDic)): eachitem = "Segkk" + str(i) print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem]) myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem]) return myCountDic
def getAllAssociatedReads(folderName, mummerLink,forFastaName):
    '''
    Grow the related-read seed set into the full associated-read set.

    Input  : relatedReads.fasta, raw_reads.fasta
    Output : <forFastaName>.fasta (the grown seed-read set)

    Algorithm :
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i)  Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    '''
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"

    # Regenerate relatedReads.fasta from the improved3 assembly, then use it
    # as the initial seed-read reference.
    gapFiller.formRelatedReadsFile(folderName, mummerLink, "improved3")

    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)

    # N = number of expansion rounds; each round aligns all raw reads against
    # the current seed set and absorbs the new hits.
    N = abunHouseKeeper.abunGlobalReadSearchDepth

    print "N: ", N
    if N >0 :
        for trial in range(N):
            print "trial", trial
            numberOfFiles = houseKeeper.globalParallelFileNum

            if True:
                workerList = []
                for dummyI in range(1, numberOfFiles + 1):
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)
                    # assumes raw_reads.fasta was already split into
                    # raw_reads.part-NN.fasta by an earlier step — TODO confirm
                    outputName, referenceName, queryName, specialName= header+indexOfMum, referenceFile, "raw_reads.part-"+ indexOfMum + ".fasta", header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])

                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)

            # Concatenate the per-part MUMmer coordinate records.
            dataList = []
            for i in range(1, 1+numberOfFiles):
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out")

            # Keep only alignments passing checkSatisfy, then take one entry
            # per read name (last field) via groupby on the sorted list.
            filterList = []
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)

            filterList.sort(key=itemgetter(-1))
            newReads = []
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)

            # Write the accepted read names, one per line.
            f = open(folderName + forFastaName + ".txt", 'w')
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()

            # Perl one-liner: extract the listed reads from raw_reads.fasta
            # into the new seed-read fasta for the next round.
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        # No expansion requested: the related reads ARE the associated reads.
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName, needAlignment=True):
    '''
    Build the string graph linking reads and contigs and save it to disk.

    Input  : <readsetFilename>.fasta (e.g. all_associated_reads.fasta),
             <contigFilename>.fasta (e.g. improved3.fasta)
    Output : (G) String Graph linking the reads and contigs,
             written to <graphName> inside folderName

    Algorithm:
        a) Form double reads and contigs                          V
        b) Mummer the data and extract dataList three times
           (contig-contig, read-read, contig-read)                V
        c) Use the subroutine to output a graph                   V
        d) Output the graph to a file phasing_String_graph.graph  V
    '''
    G = []

    # Write "_Double" fasta files holding both strands of each sequence.
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")

    # --- contig vs contig (CC) alignment -----------------------------------
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"

    if needAlignment:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[header, referenceFile, queryFile, ""]], houseKeeper.globalParallel)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    # --- read vs read (RR) alignment ---------------------------------------
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if needAlignment:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # presumably drops self/identical alignments — verify in abunHouseKeeper
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []

    # --- contig vs read (CR) alignment -------------------------------------
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    if needAlignment:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)

    # Python 2 dict merge: contig lengths + read lengths.
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    # Node layout: contigs occupy indices [0, N1), reads [N1, N1 + N2).
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes

    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    '''

    # Add the three alignment sets as graph edges with the proper index offsets.
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')

    Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1)
    Gnew.saveToFile(folderName, graphName)
    print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
def getAllAssociatedReads(folderName, mummerLink,forFastaName): ''' Input : relatedReads.fasta, raw_reads.fasta Output : all_associated_reads.fasta Algorithm : a) Get all the associated reads b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total i) Align the raws and tmp_seedReads ii) Put the new reads into the SeedReads ''' header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta" command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile os.system(command) N = abunHouseKeeper.abunGlobalReadSearchDepth print "N: ", N if N >0 : for trial in range(N): print "trial", trial numberOfFiles = 20 if True: workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) outputName, referenceName, queryName, specialName= header+indexOfMum, referenceFile, "raw_reads.part-"+ indexOfMum + ".fasta", header + indexOfMum workerList.append([outputName, referenceName, queryName, specialName]) alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False) dataList = [] for i in range(1, 1+numberOfFiles): if i < 10: indexOfMum = "0" + str(i) else: indexOfMum = str(i) dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out") filterList = [] lenDicRR = IORobot.obtainLength(folderName, queryFile) print "len(dataList)", len(dataList) for eachitem in dataList: if checkSatisfy(eachitem, lenDicRR): filterList.append(eachitem) filterList.sort(key=itemgetter(-1)) newReads = [] for key, items in groupby(filterList, itemgetter(-1)): newReads.append(key) f = open(folderName + forFastaName + ".txt", 'w') for eachitem in newReads: f.write(eachitem + "\n") f.close() command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + 
".fasta" os.system(command) else: os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
def generateAbundanceGraph(folderName, mummerLink): print "generateAbundanceGraph" ''' 1. Find your favorite mappers to map read back a. MUMmer, Bowtie, bbmap, any that works V b. And then write a short parser to parse the results V ''' numberOfFiles = 20 workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) ''' "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", "fromMumRefine" + indexOfMum ''' outputName, referenceName, queryName, specialName= "outAbun"+indexOfMum, "improved3.fasta", "raw_reads.part-"+ indexOfMum + ".fasta", "outAbun" + indexOfMum workerList.append([outputName, referenceName, queryName, specialName]) if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False) ''' command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta" os.system(command) command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum os.system(command) ''' dataList = [] for i in range(1, 1+numberOfFiles): if i < 10: indexOfMum = "0" + str(i) else: indexOfMum = str(i) dataList = dataList+ alignerRobot.extractMumData(folderName, "outAbun"+ str(indexOfMum)+"Out") ''' 2. Calculate count on the abundances a. Aggregate by taking average [put weights on bin along contigs] b. 
Inheritance and a subclass ''' lenDic = IORobot.obtainLength(folderName, "improved3.fasta") readLenDic = IORobot.obtainLength(folderName , "raw_reads.fasta") myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])] thres = 30 lenSum = 0 extraDataList= [] print "len(dataList)", len(dataList) if not abunHouseKeeper.abunGlobalAvoidrefine: myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True) extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut" ) else: extraDataList = [] dataList = dataList + extraDataList myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink,False) with open(folderName + 'myCountDic.json', 'w') as f: json.dump(myCountDic, f) return myCountDic
def evaluateCoverage(dataList, lenDic, readLenDic, folderName,mummerLink, continueFilter):
    '''
    Assign every aligned read to its best-matching contig, accumulate the
    read lengths as per-contig counts, and (optionally) schedule a refined
    realignment of the unassigned reads.  Returns {contigName: coverage depth}.

    dataList       : MUMmer coordinate records; [-1] = read name,
                     [-2] = contig name, [4] = match length, [6] = identity %
    lenDic         : {contigName: contig length}
    readLenDic     : {readName: read length}
    continueFilter : when True, split the unassigned reads and run a refined
                     MUMmer alignment pass over them
    '''
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = 0

    # groupby needs the records pre-sorted by the grouping key (read name).
    dataList.sort(key = itemgetter(-1))
    ctkk, ctbase = 0, 0  # reads assigned / bases assigned
    toAddBackDic = copy.deepcopy(readLenDic)

    for key, items in groupby(dataList, itemgetter(-1)):
        maxMatch = -1
        bestname = ""
        for eachitem in items:
            # score = identity% x match length
            ct = eachitem[6]/100.0 * eachitem[4]
            if ct > maxMatch:
                maxMatch = ct
                bestname = eachitem[-2]
        myCountDic[bestname] += readLenDic[key]
        ctkk = ctkk + 1
        ctbase = ctbase + readLenDic[key]
        toAddBackDic[key] = -1  # mark this read as assigned

    cttot = 0
    for eachitem in readLenDic:
        cttot = cttot + readLenDic[eachitem]

    # NOTE(review): 4.7e6 looks like a hard-coded (E. coli sized) genome
    # length — confirm before reusing on other genomes.
    print "Missed coverage ", (cttot - ctbase)/(4.7*pow(10, 6))
    print "percentage miss read", (len(readLenDic) - ctkk)/(1.0*len(readLenDic))

    # Reads never assigned to any contig (value still >= 0).
    toAddReadList = []
    for eachitem in toAddBackDic:
        if toAddBackDic[eachitem] >= 0 :
            toAddReadList.append(eachitem)

    '''
    This part need the most parallelism because it is most intense with -l 10
    split V, workerList V , combine
    '''
    if continueFilter:
        numberOfFiles= 20
        IORobot.putListToFileO(folderName, "raw_reads.fasta" , "selected_raw", toAddReadList)
        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "selected_raw.fasta"
        os.system(command)
        workerList = []
        for dummyI in range(1, numberOfFiles + 1):
            indexOfMum = ""
            if dummyI < 10:
                indexOfMum = "0" + str(dummyI)
            else:
                indexOfMum = str(dummyI)
            outputName, referenceName, queryName, specialName= "outAbunRefine"+indexOfMum, "improved3.fasta", "selected_raw.part-"+ indexOfMum + ".fasta", "abunMissOut" + indexOfMum
            workerList.append([outputName, referenceName, queryName, specialName])
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,specialForRaw = True, refinedVersion = True)
        alignerRobot.combineMultipleCoorMum( True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles)

    # NOTE(review): this normalisation assumes contigs are named "Segkk<i>";
    # any other naming raises KeyError (the 7-argument variant iterates
    # lenDic directly instead).
    for i in range(len(myCountDic)):
        eachitem = "Segkk"+str(i)
        print eachitem , myCountDic[eachitem]/(1.0*lenDic[eachitem])
        myCountDic[eachitem] = myCountDic[eachitem]/(1.0*lenDic[eachitem])
    return myCountDic
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, continueFilter, contigFilename): ''' not sure if that is the right documentation... Input : string_graph_3, improved3.fasta, raw_reads.fasta Output : string_graph_4 with weights [need a data structure to store the weight on node] Algorithm : 1. Find your favorite mappers to map read back a. MUMmer, Bowtie, bbmap, any that works V b. And then write a short parser to parse the results V 2. Calculate count on the abundances a. Aggregate by taking average [put weights on bin along contigs] b. Inheritance and a subclass 3. Find your favorite graphical tool to display a. Use a javascript library [halfviz should just work ! put weight on edge ] ''' myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = 0 dataList.sort(key=itemgetter(-1)) ctkk, ctbase = 0, 0 toAddBackDic = copy.deepcopy(readLenDic) for key, items in groupby(dataList, itemgetter(-1)): maxMatch = -1 bestname = "" for eachitem in items: ct = eachitem[6] / 100.0 * eachitem[4] if ct > maxMatch: maxMatch = ct bestname = eachitem[-2] myCountDic[bestname] += readLenDic[key] ctkk = ctkk + 1 ctbase = ctbase + readLenDic[key] toAddBackDic[key] = -1 cttot = 0 for eachitem in readLenDic: cttot = cttot + readLenDic[eachitem] print "Missed coverage ", (cttot - ctbase) / (4.7 * pow(10, 6)) print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 * len(readLenDic)) toAddReadList = [] for eachitem in toAddBackDic: if toAddBackDic[eachitem] >= 0: toAddReadList.append(eachitem) ''' This part need the most parallelism because it is most intense with -l 10 split V, workerList V , combine ''' if continueFilter: numberOfFiles = houseKeeper.globalParallelFileNum IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw", toAddReadList) bindir = os.path.abspath(os.path.dirname(sys.argv[0])) command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str( numberOfFiles) + " " + folderName + "selected_raw.fasta" 
os.system(command) workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) outputName, referenceName, queryName, specialName = "outAbunRefine" + indexOfMum, contigFilename + ".fasta", "selected_raw.part-" + indexOfMum + ".fasta", "abunMissOut" + indexOfMum workerList.append( [outputName, referenceName, queryName, specialName]) alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=True, refinedVersion=True) alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles) for eachitem in lenDic: #eachitem = "Segkk"+str(i) print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem]) myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem]) return myCountDic