def alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header):
    """
    Split queryFile into houseKeeper.globalParallelFileNum pieces, align every
    piece against referenceFile with MUMmer in parallel, then merge the
    per-piece coordinate files into a single <header>Out result.

    folderName : working directory prefix (concatenated directly with the
        filenames below, so it presumably carries a trailing separator --
        verify against callers).
    referenceFile / queryFile : FASTA filenames inside folderName; queryFile
        is assumed to end in ".fasta" (its last six characters are stripped).
    mummerLink : path to the MUMmer binaries.
    header : prefix used for the per-piece output/special names.
    """
    partCount = houseKeeper.globalParallelFileNum
    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    # The split parts apparently land in the current directory (hence the
    # cp/rm pair moving them into folderName afterwards).
    os.system(bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts "
              + str(partCount) + " " + folderName + queryFile)
    os.system("cp *.fasta " + folderName)
    # NOTE(review): this also removes any unrelated *.fasta in the CWD.
    os.system("rm *.fasta ")

    stem = queryFile[0:-6]  # queryFile without its ".fasta" suffix
    workerList = []
    for partNo in range(1, partCount + 1):
        # Two-digit zero-padded tag, matching fasta-splitter's part naming.
        tag = "0" + str(partNo) if partNo < 10 else str(partNo)
        workerList.append([header + tag,
                           referenceFile,
                           stem + ".part-" + tag + ".fasta",
                           header + tag])

    alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList,
                                     houseKeeper.globalParallel,
                                     specialForRaw=False, refinedVersion=False)
    alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName,
                                        header, header + "Out", partCount)
def alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header):
    """
    Split queryFile into parts, MUMmer-align each part against referenceFile
    in parallel, then combine the per-part coordinate outputs into <header>Out.

    folderName : working directory prefix (concatenated directly with the
        filenames, so presumably ends with a separator -- verify at call site).
    referenceFile / queryFile : FASTA filenames inside folderName; queryFile
        is assumed to end in ".fasta" (last six characters stripped below).
    mummerLink : path to the MUMmer binaries.
    header : prefix for the per-part output/special names.
    """
    # NOTE(review): alignerSubRoutine is defined twice in this file; being the
    # later definition, this one shadows the other at import time.  The two
    # differed only in the part count, so use the configured value here as
    # well instead of the hard-coded 20 that silently discarded the setting.
    numberOfFiles = houseKeeper.globalParallelFileNum
    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    command = (bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts "
               + str(numberOfFiles) + " " + folderName + queryFile)
    os.system(command)
    # fasta-splitter apparently writes its parts to the CWD: copy them into
    # the working folder, then clean up (this also removes any other *.fasta
    # sitting in the CWD).
    os.system("cp *.fasta " + folderName)
    os.system("rm *.fasta ")

    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        # Two-digit zero-padded index matching fasta-splitter's part naming.
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        outputName = header + indexOfMum
        referenceName = referenceFile
        queryName = queryFile[0:-6] + ".part-" + indexOfMum + ".fasta"
        specialName = header + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])

    alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList,
                                     houseKeeper.globalParallel,
                                     specialForRaw=False, refinedVersion=False)
    alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName,
                                        header, header + "Out", numberOfFiles)
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, continueFilter): myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = 0 dataList.sort(key=itemgetter(-1)) ctkk, ctbase = 0, 0 toAddBackDic = copy.deepcopy(readLenDic) for key, items in groupby(dataList, itemgetter(-1)): maxMatch = -1 bestname = "" for eachitem in items: ct = eachitem[6] / 100.0 * eachitem[4] if ct > maxMatch: maxMatch = ct bestname = eachitem[-2] myCountDic[bestname] += readLenDic[key] ctkk = ctkk + 1 ctbase = ctbase + readLenDic[key] toAddBackDic[key] = -1 cttot = 0 for eachitem in readLenDic: cttot = cttot + readLenDic[eachitem] print "Missed coverage ", (cttot - ctbase) / (4.7 * pow(10, 6)) print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 * len(readLenDic)) toAddReadList = [] for eachitem in toAddBackDic: if toAddBackDic[eachitem] >= 0: toAddReadList.append(eachitem) """ This part need the most parallelism because it is most intense with -l 10 split V, workerList V , combine """ if continueFilter: numberOfFiles = 20 IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw", toAddReadList) bindir = os.path.abspath(os.path.dirname(sys.argv[0])) command = ( bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "selected_raw.fasta" ) os.system(command) workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) outputName, referenceName, queryName, specialName = ( "outAbunRefine" + indexOfMum, "improved3.fasta", "selected_raw.part-" + indexOfMum + ".fasta", "abunMissOut" + indexOfMum, ) workerList.append([outputName, referenceName, queryName, specialName]) alignerRobot.useMummerAlignBatch( mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=True, refinedVersion=True ) alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles) 
for i in range(len(myCountDic)): eachitem = "Segkk" + str(i) print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem]) myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem]) return myCountDic
def evaluateCoverage(dataList, lenDic, readLenDic, folderName,mummerLink, continueFilter): myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = 0 dataList.sort(key = itemgetter(-1)) ctkk, ctbase = 0, 0 toAddBackDic = copy.deepcopy(readLenDic) for key, items in groupby(dataList, itemgetter(-1)): maxMatch = -1 bestname = "" for eachitem in items: ct = eachitem[6]/100.0 * eachitem[4] if ct > maxMatch: maxMatch = ct bestname = eachitem[-2] myCountDic[bestname] += readLenDic[key] ctkk = ctkk + 1 ctbase = ctbase + readLenDic[key] toAddBackDic[key] = -1 cttot = 0 for eachitem in readLenDic: cttot = cttot + readLenDic[eachitem] print "Missed coverage ", (cttot - ctbase)/(4.7*pow(10, 6)) print "percentage miss read", (len(readLenDic) - ctkk)/(1.0*len(readLenDic)) toAddReadList = [] for eachitem in toAddBackDic: if toAddBackDic[eachitem] >= 0 : toAddReadList.append(eachitem) ''' This part need the most parallelism because it is most intense with -l 10 split V, workerList V , combine ''' if continueFilter: numberOfFiles= 20 IORobot.putListToFileO(folderName, "raw_reads.fasta" , "selected_raw", toAddReadList) bindir = os.path.abspath(os.path.dirname(sys.argv[0])) command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "selected_raw.fasta" os.system(command) workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) outputName, referenceName, queryName, specialName= "outAbunRefine"+indexOfMum, "improved3.fasta", "selected_raw.part-"+ indexOfMum + ".fasta", "abunMissOut" + indexOfMum workerList.append([outputName, referenceName, queryName, specialName]) alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,specialForRaw = True, refinedVersion = True) alignerRobot.combineMultipleCoorMum( True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles) for i in 
range(len(myCountDic)): eachitem = "Segkk"+str(i) print eachitem , myCountDic[eachitem]/(1.0*lenDic[eachitem]) myCountDic[eachitem] = myCountDic[eachitem]/(1.0*lenDic[eachitem]) return myCountDic
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, continueFilter, contigFilename): ''' not sure if that is the right documentation... Input : string_graph_3, improved3.fasta, raw_reads.fasta Output : string_graph_4 with weights [need a data structure to store the weight on node] Algorithm : 1. Find your favorite mappers to map read back a. MUMmer, Bowtie, bbmap, any that works V b. And then write a short parser to parse the results V 2. Calculate count on the abundances a. Aggregate by taking average [put weights on bin along contigs] b. Inheritance and a subclass 3. Find your favorite graphical tool to display a. Use a javascript library [halfviz should just work ! put weight on edge ] ''' myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = 0 dataList.sort(key=itemgetter(-1)) ctkk, ctbase = 0, 0 toAddBackDic = copy.deepcopy(readLenDic) for key, items in groupby(dataList, itemgetter(-1)): maxMatch = -1 bestname = "" for eachitem in items: ct = eachitem[6] / 100.0 * eachitem[4] if ct > maxMatch: maxMatch = ct bestname = eachitem[-2] myCountDic[bestname] += readLenDic[key] ctkk = ctkk + 1 ctbase = ctbase + readLenDic[key] toAddBackDic[key] = -1 cttot = 0 for eachitem in readLenDic: cttot = cttot + readLenDic[eachitem] print "Missed coverage ", (cttot - ctbase) / (4.7 * pow(10, 6)) print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 * len(readLenDic)) toAddReadList = [] for eachitem in toAddBackDic: if toAddBackDic[eachitem] >= 0: toAddReadList.append(eachitem) ''' This part need the most parallelism because it is most intense with -l 10 split V, workerList V , combine ''' if continueFilter: numberOfFiles = houseKeeper.globalParallelFileNum IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw", toAddReadList) bindir = os.path.abspath(os.path.dirname(sys.argv[0])) command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str( numberOfFiles) + " " + folderName + "selected_raw.fasta" 
os.system(command) workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) outputName, referenceName, queryName, specialName = "outAbunRefine" + indexOfMum, contigFilename + ".fasta", "selected_raw.part-" + indexOfMum + ".fasta", "abunMissOut" + indexOfMum workerList.append( [outputName, referenceName, queryName, specialName]) alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=True, refinedVersion=True) alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles) for eachitem in lenDic: #eachitem = "Segkk"+str(i) print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem]) myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem]) return myCountDic