def agouti_path_main(agoutiPaths, dSenses, vertex2Name, dGFFs, dCtgPair2GenePair, oriScafPathFile, outDir, prefix):
    """Reconcile AGOUTI scaffolding paths with the original scaffold paths.

    Reads the shred-info file (oriScafPathFile), reports consistency between
    the original paths and the AGOUTI paths, then recovers sequences that the
    original scaffolding joined but AGOUTI left untouched.

    Returns the (possibly extended) agoutiPaths, dCtgPair2GenePair, dSenses.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_path")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    agPathProgress = agLOG.PROGRESS_METER(moduleName)
    agPathProgress.logger.info("Analyzing scaffolding paths")
    outDebugFile = os.path.join(moduleOutDir, prefix) + ".agouti_path.debug"
    # FIX: debug logger was mislabeled "SHREDDER" (copy-paste from the shred
    # module); use this module's own name like every other module does
    agPathDebug = agLOG.DEBUG(moduleName, outDebugFile)
    agPathProgress.logger.info("[BEGIN] Reading file with shred info")
    # dOriGaps is currently unused but part of read_original_path's contract
    dOriPaths, dOriGaps = read_original_path(oriScafPathFile, agPathProgress)
    agPathProgress.logger.info("[DONE]")
    # shut it off for now; working to improve it
    #agPathProgress.logger.info("[BEGIN] Checking consistency")
    #compare(dOriPaths, agoutiPaths, vertex2Name, outDir, prefix)
    #agPathProgress.logger.info("[DONE]")
    report_consistency(agoutiPaths, dOriPaths, vertex2Name, outDir, prefix)
    # FIX: typo in log message ("Recovring" -> "Recovering")
    agPathProgress.logger.info("[BEGIN] Recovering original scaffolding")
    agoutiPaths, dCtgPair2GenePair, dSenses = recover_untouched_sequences(
        dOriPaths, agoutiPaths, vertex2Name, dGFFs,
        dCtgPair2GenePair, dSenses, agPathProgress, agPathDebug)
    agPathProgress.logger.info("[DONE]")
    return agoutiPaths, dCtgPair2GenePair, dSenses
def update_local(args):
    '''
    Update the local AGOUTI checkout to the latest tagged release.

    Checks that git is on PATH, compares the local HEAD commit against the
    commit of the newest remote tag, and checks out that tag on a new branch
    when they differ. Exits the process in every path (success or failure).
    '''
    version = agLOG.PROGRESS_METER("UPDATE")
    repoDir = os.path.dirname(os.path.realpath(__file__))
    # first check git availability
    checkGitVersion = "git --version"
    p = sp.Popen(shlex.split(checkGitVersion), stdout=sp.PIPE, stderr=sp.PIPE)
    pout, perr = p.communicate()
    if p.returncode:
        version.logger.info("Please check your PATH for git")
        version.logger.info("Update unsuccessful")
        sys.exit(1)
    # Then compare local with remote
    version.logger.info("Checking available updates of AGOUTI")
    # FIX: was --pretty="%%H", which makes git print the literal string "%H"
    # instead of the commit hash, so the version comparison below could never
    # match and an update was attempted on every run
    checkLocal = "git log -n 1 --pretty=\"%H\""
    localVersion = sp.check_output(shlex.split(checkLocal), cwd=repoDir).strip()
    gitCmd = "git ls-remote origin"
    heads = sp.check_output(shlex.split(gitCmd), cwd=repoDir).split("\n")
    tags = []
    dVersions = {}
    for line in heads:
        if line:
            tmpLine = line.strip().split("\t")
            if re.search(r"refs/tag", tmpLine[1]):
                # "tag^{}" lines carry the peeled (commit) hash of an
                # annotated tag; record it under the bare tag name
                if re.search(r"\^\{\}$", tmpLine[1]):
                    dVersions[tmpLine[1].strip("^{}")] = tmpLine[0]
                    continue
                else:
                    # FIX: keep the tag's own hash instead of "" so that
                    # lightweight tags (which have no ^{} line) still get a
                    # usable hash; annotated tags are overwritten by the
                    # peeled line that follows in ls-remote output
                    dVersions[tmpLine[1]] = tmpLine[0]
                    tags.append(tmpLine[1])
    # NOTE(review): lexicographic sort — assumes tag names compare correctly
    # as strings (e.g. breaks once v0.10 follows v0.9); confirm tag scheme
    latestTag = sorted(tags)[-1]
    latestHash = dVersions[latestTag]
    if latestHash != localVersion:
        gitCmd = "git fetch --all"
        p = sp.Popen(shlex.split(gitCmd), stdout=sp.PIPE, stderr=sp.PIPE, cwd=repoDir)
        pout, perr = p.communicate()
        if p.returncode:
            version.logger.error("git fetch error: %s" % (perr))
            sys.exit(1)
        gitCmd = "git checkout -q %s -b %s" % (latestTag, latestTag.split("/")[-1])
        p = sp.Popen(shlex.split(gitCmd), stdout=sp.PIPE, stderr=sp.PIPE, cwd=repoDir)
        pout, perr = p.communicate()
        if p.returncode:
            version.logger.error("git checkout error: %s" % (perr))
            sys.exit(1)
        version.logger.info("Update successful")
        sys.exit(0)
    version.logger.info("Current version is the LATEST. No need to update")
def agouti_shred_main(assemblyFile, gffFile, prefix, minGaps, minCtgLen):
    """Shred an assembly at gap stretches; shred its annotation too when a
    GFF file is supplied."""
    shredProgress = agLOG.PROGRESS_METER("SHREDDER")
    shredProgress.logger.info("[BEGIN] Shredding assembly")
    # make sure the directory holding the output prefix exists
    prefixDir = os.path.dirname(os.path.realpath(prefix))
    if not os.path.exists(prefixDir):
        os.makedirs(prefixDir)
    dHeader2Intervals = shred_assembly(assemblyFile, shredProgress,
                                       prefix, minGaps, minCtgLen)
    if gffFile:
        shred_annotation(dHeader2Intervals, gffFile, prefix, shredProgress)
def run_scaffolder(args):
    """Drive the full AGOUTI scaffolding pipeline from parsed CLI args.

    Logs the run parameters, then chains: read assembly -> read gene models
    -> extract joining pairs from BAM -> denoise -> scaffold -> (optionally)
    reconcile with original scaffold paths -> write updated assembly/GFF.
    """
    bamFile = args.bamFile
    gffFile = os.path.realpath(args.gff)
    prefix = args.prefix
    outDir = os.path.realpath(args.outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    # record the invocation parameters in their own log file
    paramsLogFile = os.path.join(outDir, "%s.parameters.txt" % (args.prefix))
    paramsLog = agLOG.PROGRESS_METER(parse_args.__name__)
    paramsLog.add_file_handler(paramsLogFile)
    paramsLog.logger.info("Assembly: %s" % (os.path.realpath(args.assemblyFile)))
    paramsLog.logger.info("Gene Model: %s" % (gffFile))
    if args.oriScafPath:
        paramsLog.logger.info("Original scaffold path: %s" % (args.oriScafPath))
    paramsLog.logger.info("Output directory: %s" % (outDir))
    paramsLog.logger.info("Output prefix: %s" % (prefix))
    paramsLog.logger.info("Minimum number of supports: %d" % (args.minSupport))
    paramsLog.logger.info("Length of gaps to fill between contigs: %d" % (args.nFills))

    # pipeline stages, each backed by its own module
    vertex2Name, dSeqs = agSeq.agouti_seq_main(args.assemblyFile, outDir,
                                               prefix, args.debug)
    dGFFs = agGFF.get_gene_models(gffFile, outDir, prefix, args.debug)
    dContigPairs = agBAM.agouti_sam_main(bamFile, outDir, prefix,
                                         args.overwrite, args.minMQ,
                                         args.minFracOvl, args.maxFracMM,
                                         args.debug)
    dCtgPair2GenePair, dCtgPairDenoise = agDENOISE.denoise_joining_pairs(
        dContigPairs, dGFFs, vertex2Name, outDir, prefix,
        args.minSupport, args.debug)
    agoutiPaths, dSenses = agSCAFF.run_scaffolding(
        vertex2Name, dCtgPairDenoise, dCtgPair2GenePair, outDir, prefix,
        args.minSupport, args.debug)
    if args.oriScafPath:
        agoutiPaths, dCtgPair2GenePair, dSenses = agPATH.agouti_path_main(
            agoutiPaths, dSenses, vertex2Name, dGFFs, dCtgPair2GenePair,
            args.oriScafPath, outDir, prefix)
    agUPDATE.agouti_update(agoutiPaths, dSeqs, vertex2Name, dSenses, dGFFs,
                           dCtgPair2GenePair, outDir, prefix, args.nFills,
                           args.debug, args.no_update_gff)

    # ru_maxrss is in kilobytes on Linux, hence /1024/1024 for GB
    paramsLog.logger.info("Peak memory use: %.5f GB" %
                          (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
                           / (1024 * 1024)))
def agouti_seq_main(assemblyFile, outDir, prefix, debug=0):
    """Read the input assembly and index its sequences.

    Returns (contigs, dSeqs): ``contigs`` maps vertex index -> FASTA header,
    with a "NONE" placeholder at index 0 so real contigs number from 1;
    ``dSeqs`` maps vertex index -> sequence string. Exits the process when a
    duplicated header is encountered.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_seq")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(moduleOutDir,
                                   "%s.agouti_seq.progressMeter" % (prefix))
    agSeqProgress = agLOG.PROGRESS_METER(moduleName)
    agSeqProgress.add_file_handler(progressLogFile)

    agSeqProgress.logger.info("[BEGIN] Reading the initial assembly")
    # placeholder at index 0 so that genuine contig indices start at 1
    contigs = ["NONE"]
    dSeqs = {}
    seenHeaders = {}
    seqLens = []
    for header, seq in read_fasta(assemblyFile):
        # duplicated headers would silently corrupt the vertex <-> name
        # mapping, so treat them as fatal
        if header in seenHeaders:
            agSeqProgress.logger.error("AGOUTI found DUPLICATED header: %s" % (header))
            agSeqProgress.logger.error("QUIT")
            sys.exit(1)
        seenHeaders[header] = 1
        contigs.append(header)
        # the new contig's vertex index is its position in ``contigs``
        dSeqs[len(contigs) - 1] = seq
        seqLens.append(len(seq))
    n50 = get_assembly_NXX(seqLens)
    agSeqProgress.logger.info("%d sequences parsed" % (len(dSeqs)))
    agSeqProgress.logger.info("The given assembly N50: %d" % (n50))
    agSeqProgress.logger.info("[DONE]")
    return contigs, dSeqs
def agouti_update(agoutiPaths, dSeqs, vertex2Name, dSenses, dGFFs,
                  dCtgPair2GenePair, outDir, prefix, nFills=1000, debug=0,
                  no_update_gff=0):
    """Build final scaffold sequences from AGOUTI paths and update gene models.

    Walks each path in agoutiPaths, stitching contig sequences together with
    runs of ``nFills`` Ns while tracking orientation, and (unless
    no_update_gff) merging/transferring gene models onto the new scaffold
    coordinates. Writes <prefix>.agouti.fasta plus the updated GFF, and logs
    summary statistics. Module-level agUPDATEProgress/agUPDATEDebug loggers
    are (re)bound as globals here.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_update")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(moduleOutDir,
                                   "%s.agouti_update.progressMeter" % (prefix))
    global agUPDATEProgress
    agUPDATEProgress = agLOG.PROGRESS_METER(moduleName)
    agUPDATEProgress.add_file_handler(progressLogFile)
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_update.debug" % (prefix))
        global agUPDATEDebug
        agUPDATEDebug = agLOG.DEBUG(moduleName, debugLogFile)
    if not no_update_gff:
        agUPDATEProgress.logger.info("[BEGIN] Updating gene models")
    outFasta = os.path.join(outDir, "%s.agouti.fasta" % (prefix))
    fFASTA = open(outFasta, 'w')
    dUpdateGFFs = collections.defaultdict(list)     # scaffold name -> updated gene models
    dMergedGene2Ctgs = collections.defaultdict(list)   # merged gene ID -> member contigs
    dMergedGene2Genes = collections.defaultdict(list)  # merged gene ID -> member gene IDs
    scafPaths = []
    numMergedGene = 0
    nCtgScaffolded = 0
    scaffoldedCtgs = {}      # contig names consumed by some scaffold
    seqLens = []
    dScafGaps = {}           # scaffold name -> list of (gapStart, gapStop)
    dScafStats = {}          # scaffold/contig name -> sequence length
    scafID = 0
    mergedGenes = []
    for i in range(len(agoutiPaths)):
        path = agoutiPaths[i]
        scafID += 1
        scafName = prefix + "_scaf_%d" % (scafID)
        dScafStats[scafName] = 0
        dScafGaps[scafName] = []
        # seed the scaffold with the first vertex of the path, forward sense
        curVertex = path[0]
        sequence = dSeqs[curVertex]
        curSense = "+"
        curCtg = vertex2Name[curVertex]
        preCtg = ""
        scafPath = [curVertex]
        preGeneID, curGeneID = "", ""
        mergedGene = agGFF.AGOUTI_GFF()
        preMergedGene = agGFF.AGOUTI_GFF()
        gapStart, gapStop = 0, 0
        offset = 0
        orientation = ""
        updatedGeneIDs = []
        mergedGenesPerPath = []
        excludeGeneIDs = []
        for nextVertex in path[1:]:
            nextCtg = vertex2Name[nextVertex]
            if preCtg == "":
                if debug:
                    agUPDATEDebug.debugger.debug(
                        "UPDATE_MAIN\t>scaf_%d - path - %s" %
                        (scafID, str([vertex2Name[vertex] for vertex in path])))
            if debug:
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tcurVertex - %d - %s - nextVertex - %d - %s" %
                    (curVertex, curCtg, nextVertex, nextCtg))
            if not no_update_gff:
                #curGene, nextGene = ctgpair2genepair(dCtgPair2GenePair, curCtg, nextCtg)
                curGene, nextGene = ctgpair2genepair(dCtgPair2GenePair,
                                                     curVertex, nextVertex)
                #!!! I should not break here, should continue#
                if curGene is None and nextGene is None:
                    agUPDATEProgress.logger.error(
                        "%s - %s found no gene models joining them" %
                        (curCtg, nextCtg))
                    agUPDATEProgress.logger.error("This is NOT EXPECTED, REPORT!")
                    sys.exit(1)
                curGeneID = curGene.geneID
                excludeGeneIDs = [preGeneID] + [curGeneID]
                if debug:
                    agUPDATEDebug.debugger.debug(
                        "UPDATE_MAIN\t\tpreGene - %s - curGene - %s - nextGene - %s" %
                        (preGeneID, curGene.geneID, nextGene.geneID))
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tscafName - %s" % (scafName))
            FR, FF, RR, RF = get_orientation_counts(curVertex, nextVertex, dSenses)
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tcurSense=%s" % (curSense))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tFR=%d - FF=%d - RF=%d - RR=%d" % (FR, FF, RF, RR))
            # if the growing scaffold currently ends in reverse sense,
            # re-map the orientation counts to the scaffold's frame
            if curSense == "-":
                temp1 = FR
                temp2 = FF
                FR = RR
                FF = RF
                RR = temp1
                RF = temp2
            orientation = decide_orientation(FR, FF, RR, RF)
            # gap coordinates on the growing scaffold (0-based here; stored 1-based)
            gapStart = gapStop + len(dSeqs[curVertex])
            gapStop = gapStart + nFills - 1
            dScafGaps[scafName].append((gapStart + 1, gapStop + 1))
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tcurSense=%s" % (curSense))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tFR=%d - FF=%d - RF=%d - RR=%d" % (FR, FF, RF, RR))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\toffset - %d - curCtgLen - %d" %
                    (offset, len(dSeqs[curVertex])))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tgapstart - %d - gapstop - %d" %
                    (gapStart, gapStop + 1))
            # NOTE(review): 'valid' is assigned but never used below
            valid = 0
            if orientation == "FR":
                # forward-forward join: append nextCtg as-is
                if not no_update_gff:
                    if curGeneID != preGeneID:
                        # first time curGene participates in a merge
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene, scafName,
                                                      numMergedGene, offset,
                                                      gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [curCtg, nextCtg]
                        #if curGene.geneStop != 0:
                        #	dMergedGene2Genes[mergedGene.geneID] += [curGeneID]
                        #if nextGene.geneStop != 0:
                        #	dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [curGeneID,
                                                                     nextGene.geneID]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        # curGene already merged on a previous join: extend it
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        if nextGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence += 'N' * nFills + dSeqs[nextVertex]
                scafPath += [nextVertex]
                curSense = "+"
            elif orientation == "FF":
                # next contig joins reversed: flip its gene models first
                if not no_update_gff:
                    #nextGene = reverse_gene_model(nextGene, len(dSeqs[nextVertex]), debug)
                    dGFFs[nextCtg] = reverse_gene_models(dGFFs[nextCtg],
                                                         len(dSeqs[nextVertex]),
                                                         debug)
                    if curGeneID != preGeneID:
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene, scafName,
                                                      numMergedGene, offset,
                                                      gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [curCtg, nextCtg]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [curGeneID,
                                                                     nextGene.geneID]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence += 'N' * nFills + agSeq.rc_seq(dSeqs[nextVertex])
                # negative vertex records reverse orientation in the path
                scafPath += [-1 * nextVertex]
                curSense = "-"
            elif orientation == "RR":
                # current end joins reversed: reverse-complement what we have
                if not no_update_gff:
                    if curGene.geneID != preGeneID:
                        dGFFs[curCtg] = reverse_gene_models(dGFFs[curCtg],
                                                            len(dSeqs[curVertex]),
                                                            debug)
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene, scafName,
                                                      numMergedGene, offset,
                                                      gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [curCtg, nextCtg]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [curGeneID,
                                                                     nextGene.geneID]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug)
                        # flip all models already placed on the scaffold
                        dUpdateGFFs[scafName] = reverse_gene_models(
                            dUpdateGFFs[scafName], gapStart - 1, debug)
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence = agSeq.rc_seq(sequence) + \
                           'N'*nFills + dSeqs[nextVertex]
                scafPath[-1] = -1 * scafPath[-1]
                scafPath += [nextVertex]
                curSense = "+"
            elif orientation == "RF":
                # both sides reversed: flip both contigs' models and sequence
                if not no_update_gff:
                    dGFFs[nextCtg] = reverse_gene_models(dGFFs[nextCtg],
                                                         len(dSeqs[nextVertex]),
                                                         debug)
                    if curGene.geneID != preGeneID:
                        dGFFs[curCtg] = reverse_gene_models(dGFFs[curCtg],
                                                            len(dSeqs[curVertex]),
                                                            debug)
                        #nextGene = reverse_gene_model(nextGene, len(dSeqs[nextVertex]), debug)
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene, scafName,
                                                      numMergedGene, offset,
                                                      gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [curCtg, nextCtg]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [curGeneID,
                                                                     nextGene.geneID]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug)
                        dUpdateGFFs[scafName] = reverse_gene_models(
                            dUpdateGFFs[scafName],
                            gapStop + len(dSeqs[curVertex]), debug)
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence = agSeq.rc_seq(sequence) + \
                           'N'*nFills + \
                           agSeq.rc_seq(dSeqs[nextVertex])
                scafPath[-1] = -1 * scafPath[-1]
                scafPath += [-1 * nextVertex]
                curSense = "-"
            if debug:
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tscafPath in vertices updates- %s" %
                    (str(scafPath)))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tdMergedGene2Gene - %s" %
                    (str(dMergedGene2Genes[mergedGene.geneID])))
            if not no_update_gff:
                mergedGenesPerPath.append(mergedGene.geneID)
                preGeneID = nextGene.geneID
            # advance the walk
            offset = gapStop
            preCtg = curCtg
            curVertex = nextVertex
            curCtg = vertex2Name[curVertex]
        # convert vertex indices to (possibly "-"-prefixed) contig names
        for i in range(len(scafPath)):
            v = scafPath[i]
            if v < 0:
                scafPath[i] = "-" + vertex2Name[-1 * v]
            else:
                scafPath[i] = vertex2Name[v]
        scafPaths += [scafPath]
        if debug:
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tscafPath in human-readable updates- %s" %
                (str(scafPath)))
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tappend last curCtg - %s" % (curCtg))
            agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tscafPath - %s" %
                                         (str(scafPath)))
        if not no_update_gff:
            # place the remaining models of the path's last contig
            excludeGeneIDs = [preGeneID]
            mergedGenes.append(mergedGenesPerPath)
            dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                dGFFs[curCtg], dUpdateGFFs[scafName], scafName, offset,
                excludeGeneIDs, debug)
        fFASTA.write(">%s |%dbp |%s\n%s\n" % (scafName, len(sequence),
                                              ",".join(scafPath), sequence))
        dScafStats[scafName] = len(sequence)
        seqLens.append(len(sequence))
        #agPaths.append(scafPath)
        nCtgScaffolded += len(scafPath)
        scaffoldedCtgs.update(dict((contig, 1) for contig in scafPath))
        if debug:
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tmergedGenesPerPath - %s" %
                (str(mergedGenesPerPath)))
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t-------------------------------------")
    agPATH.report_scaffold_path(scafPaths, vertex2Name, outDir, prefix)
    # other contigs need to be output
    agUPDATEProgress.logger.info("Finalizing sequences")
    for vertex in dSeqs:
        # skip contigs consumed by a scaffold (in either orientation)
        if vertex2Name[vertex] in scaffoldedCtgs or \
           "-" + vertex2Name[vertex] in scaffoldedCtgs:
            continue
        fFASTA.write(">%s\n%s\n" % (vertex2Name[vertex], dSeqs[vertex]))
        dScafStats[vertex2Name[vertex]] = len(dSeqs[vertex])
        seqLens.append(len(dSeqs[vertex]))
    fFASTA.close()
    n50 = agSeq.get_assembly_NXX(seqLens)
    agUPDATEProgress.logger.info("Outputting updated Gene Moddels")
    # drop per-contig models for contigs absorbed into scaffolds
    for vertex in dSeqs:
        if vertex2Name[vertex] in scaffoldedCtgs:
            if vertex2Name[vertex] in dGFFs:
                del dGFFs[vertex2Name[vertex]]
    if not no_update_gff:
        dFinalGFFs = dict(dGFFs, **dUpdateGFFs)
        numGenes = output_gff(dFinalGFFs, dMergedGene2Ctgs, dMergedGene2Genes,
                              dScafStats, dScafGaps, outDir, prefix)
        agUPDATEProgress.logger.info("Summarizing AGOUTI gene paths")
        summarize_gene_path(dMergedGene2Genes, dMergedGene2Ctgs, outDir, prefix)
    agUPDATEProgress.logger.info("-----------Summary-----------")
    agUPDATEProgress.logger.info("number of contigs scaffoled: %d" %
                                 (nCtgScaffolded))
    agUPDATEProgress.logger.info("number of scaffolds: %d" % (scafID))
    agUPDATEProgress.logger.info(
        "number of contigs in the final assembly: %d" % (len(seqLens)))
    agUPDATEProgress.logger.info("Final assembly N50: %d" % (n50))
    if not no_update_gff:
        agUPDATEProgress.logger.info("Final number of genes: %d" % (numGenes))
    agUPDATEProgress.logger.info("Succeeded")
def denoise_joining_pairs(dContigPairs, dGFFs, vertex2Name, outDir, prefix,
                          minSupport, debug=0):
    """Filter contig-pair joining reads down to a noise-free set.

    A contig pair survives when (1) it has >= minSupport read pairs, (2) a
    pair of gene models can be assigned to its two contigs, and (3) the
    gene-end/read-orientation combination matches one of the four accepted
    join geometries (FR 3'-5', RR 5'-5', FF 3'-3', RF 5'-3') or the
    single-gene-per-contig special case. Surviving read pairs are written to
    <prefix>.agouti.join_pairs.noise_free.txt.

    Returns (dCtgPair2GenePair, dCtgPairDenoise) keyed by vertex-index pairs.
    Side effects: may insert fake gene models into dGFFs and deletes
    under-supported pairs from dContigPairs.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_denoise")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_denoise.progressMeter" % (prefix))
    agDENOISEProgress = agLOG.PROGRESS_METER(moduleName)
    agDENOISEProgress.add_file_handler(progressLogFile)
    debugLogFile = ""
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_denoise.debug" % (prefix))
        global agDENOISEDebug
        agDENOISEDebug = agLOG.DEBUG(moduleName, debugLogFile)
    agDENOISEProgress.logger.info("[BEGIN] Denoising joining pairs")
    startTime = time.clock()
    dCtgPair2GenePair = collections.defaultdict()
    dCtgPairDenoise = collections.defaultdict()
    # NOTE(review): dMappedPos and daddedModels are never used below
    dMappedPos = collections.defaultdict()
    daddedModels = collections.defaultdict(list)
    nFail4Combination = 0
    nFailGeneModel = 0
    nFailK = 0
    outDenoiseJPFile = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.noise_free.txt" % (prefix))
    fOUT = open(outDenoiseJPFile, 'w')
    # Python 2: .items() yields a list, so deleting inside the loop is safe
    for ctgPair, pairInfo in dContigPairs.items():
        # filter 1: not enough supporting read pairs
        if len(pairInfo) < minSupport:
            nFailK += 1
            del dContigPairs[ctgPair]
            continue
        ctgA = ctgPair[0]
        ctgB = ctgPair[1]
        if debug:
            agDENOISEDebug.debugger.debug(
                "DENOISE_MAIN\t>contigA - %s - contigB - %s" % (ctgA, ctgB))
        # NOTE(review): pairToRemove and pairs are collected but never used
        pairToRemove = []
        mapIntervalsA = []
        mapIntervalsB = []
        pairs = []
        senses = []
        keep = 0
        for i in xrange(len(pairInfo)):
            startA, startB, stopA, stopB, senseA, senseB, readID = pairInfo[i]
            mapIntervalsA += [(startA, stopA)]
            mapIntervalsB += [(startB, stopB)]
            pairs += [(startA, stopA, startB, stopB)]
            senses += [(senseA, senseB)]
        genePair = get_genePair_for_contigPair(dGFFs, ctgA, ctgB,
                                               mapIntervalsA, mapIntervalsB,
                                               senses, debug)
        geneModelsA = dGFFs[ctgA]
        geneModelsB = dGFFs[ctgB]
        if genePair is None:
            # filter 2: no usable pair of gene models for this contig pair
            nFailGeneModel += 1
            if debug:
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tFail to find a pair of gene models")
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t----------------------------------")
        else:
            geneIndexA, geneIndexB, endA, endB, intervalsA, intervalsB, senses = genePair
            sensesCounter = collections.Counter(senses)
            if debug:
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tsensesCounter: %s" % (str(sensesCounter)))
            # Normalize geneIndexB/endB; -1/+1 markers mean "no model at that
            # flank yet" -> fabricate one from the mapping intervals
            if geneIndexB != 0:
                # create gene model according to endB using intervalsB
                if geneIndexB == -1 and (endB == 5 or endB == 0):
                    dGFFs[ctgB] = create_fake_genes(geneModelsB, 0, ctgB,
                                                    intervalsB, debug)
                    geneIndexB = 0
                    endB = 5
                elif geneIndexB == 1 and (endB == 3 or endB == 0):
                    dGFFs[ctgB] = create_fake_genes(geneModelsB,
                                                    len(geneModelsB), ctgB,
                                                    intervalsB, debug)
                    geneIndexB = len(dGFFs[ctgB]) - 1
                    endB = 3
            else:
                if endB == 0:
                    endB = 5
                elif endB == 3:
                    geneIndexB = len(dGFFs[ctgB]) - 1
            # same normalization for contig A (note default end is 3', since
            # A is the upstream contig of the join)
            if geneIndexA != 0:
                # create gene model according to endA using intervalsA
                if geneIndexA == -1 and (endA == 5 or endA == 0):
                    dGFFs[ctgA] = create_fake_genes(geneModelsA, 0, ctgA,
                                                    intervalsA, debug)
                    geneIndexA = 0
                    endA = 5
                elif geneIndexA == 1 and (endA == 3 or endA == 0):
                    dGFFs[ctgA] = create_fake_genes(geneModelsA,
                                                    len(geneModelsA), ctgA,
                                                    intervalsA, debug)
                    geneIndexA = len(dGFFs[ctgA]) - 1
                    endA = 3
            else:
                if endA == 0:
                    endA = 3
                elif endA == 3:
                    geneIndexA = len(dGFFs[ctgA]) - 1
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tgenePair: %s" %
                                              (str(genePair)))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t# models on ctgA - %d - # models on ctgB - %d" %
                    (len(dGFFs[ctgA]), len(dGFFs[ctgB])))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tgeneIndexA - %d - endA - %d - geneIndexB - %d - endB - %d" %
                    (geneIndexA, endA, geneIndexB, endB))
            # majority vote on the read-pair orientation
            sense = sorted(sensesCounter.items(), key=operator.itemgetter(1),
                           reverse=True)[0][0]
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tsensePair - %s" %
                                              (str(sense)))
            # filter 3: only four end/orientation geometries are consistent
            # with a genuine join (plus the single-gene special case)
            if (geneIndexA == len(dGFFs[ctgA])-1 and endA == 3) and \
               (geneIndexB == 0 and endB == 5) and sense == ('+', '-'):
                # FR + 3'-5'
                keep = 1
            elif (geneIndexA == 0 and endA == 5) and \
                 (geneIndexB == 0 and endB == 5) and sense == ('-', '-'):
                # RR + 5'-5'
                keep = 1
            elif (geneIndexA == len(dGFFs[ctgA])-1 and endA == 3) and \
                 (geneIndexB == len(dGFFs[ctgB])-1 and endB == 3) and \
                 sense == ('+', '+'):
                # FF + 3'-3'
                keep = 1
            elif (geneIndexA == 0 and endA == 5) and \
                 (geneIndexB == len(dGFFs[ctgB])-1 and endB == 3) and \
                 sense == ('-', '+'):
                # RF + 5'-3'
                keep = 1
            elif (geneIndexA == 0 and (endA == 0 or endA == 3)) and \
                 (geneIndexB == 0 and (endB == 0 or endB == 5)) and \
                 sense == ('+', '-'):
                # only one gene on the contig
                # it doesn't matter which end
                keep = 1
            if keep:
                geneA = dGFFs[ctgA][geneIndexA]
                geneB = dGFFs[ctgB][geneIndexB]
                dCtgPair2GenePair[vertex2Name.index(ctgA),
                                  vertex2Name.index(ctgB)] = [geneA, geneB]
                if debug:
                    agDENOISEDebug.debugger.debug("DENOISE_MAIN\tNOISE-FREE")
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\tgeneA ID - %s - startA - %d - stopA = %d" %
                        (geneA.geneID, geneA.geneStart, geneA.geneStop))
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\tgeneB ID - %s - startB - %d - stopB = %d" %
                        (geneB.geneID, geneB.geneStart, geneB.geneStop))
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\t----------------------------------")
                senseA = sense[0]
                senseB = sense[1]
                weight = 0
                # count and emit read pairs that do not conflict with the
                # chosen gene pair; empty intervals mean "use all reads"
                for i in xrange(len(pairInfo)):
                    startA, startB, stopA, stopB, _, _, readID = pairInfo[i]
                    intervalA = (startA, stopA)
                    intervalB = (startB, stopB)
                    #print "intervalA", intervalA, "intervalB", intervalB
                    if len(intervalsA) == 0:
                        if len(intervalsB) == 0:
                            #print "use all"
                            fOUT.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                                       (readID, ctgA, startA, senseA,
                                        ctgB, startB, senseB))
                            weight += 1
                        else:
                            #print "use all A, not all B"
                            overlap = find_overlap(
                                intervalB, (geneB.geneStart, geneB.geneStop))
                            if overlap == 0:
                                fOUT.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                                           (readID, ctgA, startA, senseA,
                                            ctgB, startB, senseB))
                                weight += 1
                    else:
                        if len(intervalsB) == 0:
                            #print "use all B, not all A"
                            overlap = find_overlap(
                                intervalA, (geneA.geneStart, geneA.geneStop))
                            if overlap == 0:
                                fOUT.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                                           (readID, ctgA, startA, senseA,
                                            ctgB, startB, senseB))
                                weight += 1
                        else:
                            #print "not all Both"
                            overlapA = find_overlap(
                                intervalA, (geneA.geneStart, geneA.geneStop))
                            overlapB = find_overlap(
                                intervalB, (geneB.geneStart, geneB.geneStop))
                            if overlapA == 0 and overlapB == 0:
                                fOUT.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                                           (readID, ctgA, startA, senseA,
                                            ctgB, startB, senseB))
                                weight += 1
                dCtgPairDenoise[vertex2Name.index(ctgA),
                                vertex2Name.index(ctgB)] = [weight,
                                                            (senseA, senseB)]
            else:
                nFail4Combination += 1
                # if len(sensesCounter) == 1:
                #	sense = sensesCounter.keys()[0]
                # else:
                #	print "multiple sense pairs"
                #	senses = sorted(sensesCounter.items(), key=operator.itemgetter(1), reverse=True)[0:2]
                #	print "senses", senses
                #	ratio = float(senses[0][1])/(senses[0][1]+senses[1][1])
                #	print "ratio", ratio
    fOUT.close()
    agDENOISEProgress.logger.info("Succeeded")
    agDENOISEProgress.logger.info("Denoise took in %.2f min CPU time" %
                                  ((time.clock() - startTime) / 60))
    agDENOISEProgress.logger.info(
        "%d contig pairs filtered for spanning across >1 gene models" %
        (nFailGeneModel))
    agDENOISEProgress.logger.info(
        "%d contig pairs filtered for not being one of the four combinations" %
        (nFail4Combination))
    agDENOISEProgress.logger.info("%d contig pairs filtered for less support" %
                                  (nFailK))
    agDENOISEProgress.logger.info("%d contig pairs for scaffolding" %
                                  (len(dCtgPairDenoise)))
    return dCtgPair2GenePair, dCtgPairDenoise
def get_joining_pairs(bamStream, outDir, prefix, overwrite, minMapQ=5,
                      minFracOvl=0.0, maxFracMismatch=1.0, debug=0):
    """Extract read pairs that join two different contigs from a SAM stream.

    Reads two SAM lines at a time from ``bamStream`` (assumes the stream is
    name-collated so mates are adjacent), keeps pairs whose mates map to
    different contigs and pass the mapping-quality / overlap-fraction /
    mismatch-fraction thresholds, logs them to
    <prefix>.agouti.join_pairs.all.txt, and returns a dict mapping each
    (contigA, contigB) pair (lexicographically ordered) to its list of
    (startA, startB, stopA, stopB, senseA, senseB, readID) tuples.
    When a previous run's output exists and overwrite is false, tries to
    reuse it instead of re-parsing.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_join_pairs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_join_pairs.progressMeter" % (prefix))
    agBAMOutAllJoinPairs = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.all.txt" % (prefix))
    agBAMProgress = agLOG.PROGRESS_METER(moduleName)
    if not os.path.exists(progressLogFile):
        # fresh run: no previous progress log
        agBAMProgress.add_file_handler(progressLogFile)
        agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
    else:
        if not overwrite:
            # try to reuse the join-pairs file from the previous run
            agBAMProgress.add_file_handler(progressLogFile, 'a')
            dContigPairs = retrieve_joininng_pairs(agBAMProgress,
                                                   agBAMOutAllJoinPairs)
            if dContigPairs is not None:
                return dContigPairs
            else:
                agBAMProgress.logger.info(
                    "Fail to pick up results from the previous run")
                agBAMProgress.logger.info("Re-processing the BAM file")
        else:
            agBAMProgress.add_file_handler(progressLogFile)
            agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
            agBAMProgress.logger.info(
                "Overwrite results from the previous run")
    agBAMDebug = None
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_join_pairs.debug" % (prefix))
        agBAMDebug = agLOG.DEBUG(moduleName, debugLogFile)
    with open(agBAMOutAllJoinPairs, 'w') as fOUT:
        agBAMProgress.logger.info(
            "# processed\t| Current Reads ID\t| Elapsed Time")
        if debug:
            agBAMDebug.debugger.debug(
                "Reads_ID\tLocationA\tLocationB\tmapQA\tmapQB\tsenseA\tsenseB\treadLenA\treadLenB"
            )
        startTime = time.time()
        dContigPairs = collections.defaultdict(list)
        nJoinPairs = 0
        nReadsPairs = 0
        while True:
            # mates are assumed adjacent in the stream
            pairA = bamStream.readline().strip().split("\t")
            pairB = bamStream.readline().strip().split("\t")
            # reach the end of the file
            if len(pairA) == 1 or len(pairB) == 1:
                break
            readsID = pairA[0]
            contigA = pairA[2]  # SAM col 3: RNAME
            contigB = pairB[2]
            nReadsPairs += 1
            # same read name, mates on two different contigs -> a joining pair
            if pairA[0] == pairB[0] and contigA != contigB:
                alnLenA = getCIGAR(pairA[5])     # SAM col 6: CIGAR
                alnLenB = getCIGAR(pairB[5])
                leftMostPosA = int(pairA[3])     # SAM col 4: POS
                leftMostPosB = int(pairB[3])
                readLenA = len(pairA[9])         # SAM col 10: SEQ
                readLenB = len(pairB[9])
                nMismatchesA = getMismatches(pairA[11:])  # optional tags
                nMismatchesB = getMismatches(pairB[11:])
                mapQA = int(pairA[4])            # SAM col 5: MAPQ
                mapQB = int(pairB[4])
                flagsA = explainSAMFlag(int(pairA[1]))
                flagsB = explainSAMFlag(int(pairB[1]))
                senseA = flagsA[4]  # strand decoded from the FLAG field
                senseB = flagsB[4]
                if debug:
                    agBAMDebug.debugger.debug(
                        "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d" %
                        (readsID, contigA + ":" + str(leftMostPosA),
                         contigB + ":" + str(leftMostPosB), mapQA, mapQB,
                         senseA, senseB, readLenA, readLenB))
                # NOTE(review): under Python 2 these are int divisions unless
                # `from __future__ import division` is in effect at file top
                # (not visible in this chunk) — confirm before porting
                if (min(alnLenA / readLenA, alnLenB / readLenB) >= minFracOvl and  # minimum fraction of overlaps
                        max(nMismatchesA / alnLenA, nMismatchesB / alnLenB) <= maxFracMismatch and  # maximum fraction of mismatches
                        min(mapQA, mapQB) >= minMapQ):  # minimum mapping quality
                    startA = leftMostPosA + 1
                    stopA = startA + 1 + int(alnLenA)
                    startB = leftMostPosB + 1
                    stopB = startB + 1 + int(alnLenB)
                    nJoinPairs += 1
                    # store under the lexicographically smaller contig first
                    if contigA <= contigB:
                        if (contigA, contigB) not in dContigPairs:
                            dContigPairs[contigA, contigB] = [
                                (startA, startB, stopA, stopB, senseA,
                                 senseB, readsID)
                            ]
                        else:
                            dContigPairs[contigA, contigB] += [
                                (startA, startB, stopA, stopB, senseA,
                                 senseB, readsID)
                            ]
                        fOUT.write("%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n" %
                                   (readsID, contigA, startA, stopA, senseA,
                                    contigB, startB, stopB, senseB))
                    else:
                        if (contigB, contigA) not in dContigPairs:
                            dContigPairs[contigB, contigA] = [
                                (startB, startA, stopB, stopA, senseB,
                                 senseA, readsID)
                            ]
                        else:
                            dContigPairs[contigB, contigA] += [
                                (startB, startA, stopB, stopA, senseB,
                                 senseA, readsID)
                            ]
                        fOUT.write("%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n" %
                                   (readsID, contigB, startB, stopB, senseB,
                                    contigA, startA, stopA, senseA))
            # progress heartbeat every 5M read pairs
            if nReadsPairs % 5000000 == 0:
                elapsedTime = float((time.time() - startTime) / 60)
                agBAMProgress.logger.info("%d parsed\t| %s\t| %.2f m" %
                                          (nReadsPairs, readsID, elapsedTime))
    agBAMProgress.logger.info("%d joining pairs parsed" % (nJoinPairs))
    agBAMProgress.logger.info("%d contig pairs given by these joining pairs" %
                              (len(dContigPairs)))
    if nJoinPairs == 0:
        agBAMProgress.logger.error("No joining pairs extracted")
        agBAMProgress.logger.error("Cannot SCAFFOLD without joining-pairs")
        sys.exit(1)
    else:
        agBAMProgress.logger.info("Succeeded")
        return dContigPairs
def agouti_sam_main(bamFile, outDir, prefix, overwrite, minMapQ, minFracOvl,
                    maxFracMismatch, debug=0):
    """
    Extract "joining pairs" from a BAM file: read pairs whose two mates map
    to two DIFFERENT contigs.  These pairs are the evidence AGOUTI uses to
    scaffold contigs.

    bamFile         -- path to the BAM file (streamed via run_samtools)
    outDir          -- AGOUTI output dir; an "agouti_join_pairs" subdir is created
    prefix          -- prefix for the output/progress/debug file names
    overwrite       -- when falsy and a previous run's output exists, try to
                       reuse it instead of re-reading the BAM
    minMapQ         -- minimum mapping quality required of BOTH mates
    minFracOvl      -- minimum aligned fraction of the read (alnLen/readLen)
    maxFracMismatch -- maximum mismatch fraction (nMismatches/alnLen)
    debug           -- when truthy, log per-pair details to a debug file

    Returns a dict keyed by a contig pair (ctg1, ctg2) with ctg1 <= ctg2
    (lexicographic canonical order), each value a list of tuples
    (start1, start2, stop1, stop2, sense1, sense2, readsID).
    Exits the process (sys.exit(1)) if no joining pairs are found.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_join_pairs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_join_pairs.progressMeter" % (prefix))
    # all joining pairs found in this run are also written to this text file;
    # it is what a later resumed run reads back
    agBAMOutAllJoinPairs = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.all.txt" % (prefix))
    agBAMProgress = agLOG.PROGRESS_METER(moduleName)
    if not os.path.exists(progressLogFile):
        # fresh run: no previous progress log
        agBAMProgress.add_file_handler(progressLogFile)
        agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
    else:
        if not overwrite:
            # a previous run exists: try to pick up its results and return
            # early instead of re-scanning the BAM
            agBAMProgress.add_file_handler(progressLogFile, 'a')
            dContigPairs = retrieve_joininng_pairs(agBAMProgress,
                                                   agBAMOutAllJoinPairs)
            if dContigPairs is not None:
                return dContigPairs
            else:
                agBAMProgress.logger.info(
                    "Fail to pick up results from the previous run")
                agBAMProgress.logger.info("Re-processing the BAM file")
        else:
            agBAMProgress.add_file_handler(progressLogFile)
            agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
            agBAMProgress.logger.info(
                "Overwrite results from the previous run")
    agBAMDebug = None
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_join_pairs.debug" % (prefix))
        agBAMDebug = agLOG.DEBUG(moduleName, debugLogFile)
    # before running samtools, check its availability
    agBAMProgress.logger.info("check SAMtools")
    check_samtools(agBAMProgress)
    # running samtools
    agBAMProgress.logger.info("run SAMtools")
    try:
        with open(agBAMOutAllJoinPairs, 'w') as fOUT:
            agBAMProgress.logger.info(
                "# processed\t| Current Reads ID\t| Elapsed Time")
            if debug:
                agBAMDebug.debugger.debug(
                    "Reads_ID\tLocationA\tLocationB\tmapQA\tmapQB\tsenseA\tsenseB\treadLenA\treadLenB")
            startTime = time.time()
            dContigPairs = collections.defaultdict(list)
            nJoinPairs = 0
            nReadsPairs = 0
            # each record holds the two mates of one pair as two
            # newline-separated SAM lines (see the split below) —
            # presumably run_samtools pairs consecutive alignment lines;
            # confirm against its implementation
            for record in run_samtools(bamFile, agBAMProgress):
                tmpRecord = record.split("\n")
                # SAM columns: 0=QNAME, 1=FLAG, 2=RNAME, 3=POS(1-based),
                # 4=MAPQ, 5=CIGAR, 6=RNEXT, 9=SEQ, 11+=optional tags
                pairA = tmpRecord[0].split("\t")
                pairB = tmpRecord[1].split("\t")
                readsID = pairA[0]
                contigA = pairA[2]
                contigB = pairB[2]
                # NOTE(review): mateCtgA/mateCtgB are assigned but never
                # used anywhere in this function
                mateCtgB = pairA[6]
                mateCtgA = pairB[6]
                nReadsPairs += 1
                # the first condition makes sure single-end BAMs are gonna
                # have zero joining-pairs extracted ("*" = unmapped RNAME)
                if contigA == "*" or contigB == "*":
                    continue
                # only keep pairs where both mates share a read name but
                # map to different contigs
                if pairA[0] == pairB[0] and contigA != contigB:
                    alnLenA = getCIGAR(pairA[5])
                    alnLenB = getCIGAR(pairB[5])
                    leftMostPosA = int(pairA[3])  # 1-based in SAM
                    leftMostPosB = int(pairB[3])
                    readLenA = len(pairA[9])
                    readLenB = len(pairB[9])
                    nMismatchesA = getMismatches(pairA[11:])
                    nMismatchesB = getMismatches(pairB[11:])
                    mapQA = int(pairA[4])
                    mapQB = int(pairB[4])
                    flagsA = explainSAMFlag(int(pairA[1]))
                    flagsB = explainSAMFlag(int(pairB[1]))
                    # 5th field of the decoded FLAG — presumably the
                    # strand/orientation of the mate; confirm in
                    # explainSAMFlag
                    senseA = flagsA[4]
                    senseB = flagsB[4]
                    if debug:
                        agBAMDebug.debugger.debug(
                            "%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%d\t%d" %
                            (readsID, contigA + ":" + str(leftMostPosA),
                             contigB + ":" + str(leftMostPosB),
                             int(alnLenA), int(alnLenB), mapQA, mapQB,
                             senseA, senseB, readLenA, readLenB))
                    # NOTE(review): this file targets Python 2 (xrange is
                    # used elsewhere) — these divisions truncate to 0
                    # unless getCIGAR/getMismatches return floats; confirm
                    fracOvlA = alnLenA / readLenA
                    fracOvlB = alnLenB / readLenB
                    fracMismatchA = nMismatchesA / alnLenA
                    fracMismatchB = nMismatchesB / alnLenB
                    if (min(fracOvlA, fracOvlB) >= minFracOvl and  # minimum fraction of overlaps
                            max(fracMismatchA, fracMismatchB) <= maxFracMismatch and  # maximum fraction of mismatches
                            min(mapQA, mapQB) >= minMapQ):  # minimum mapping quality
                        # 1-based inclusive aligned interval on each contig
                        startA = leftMostPosA
                        stopA = startA + int(alnLenA) - 1
                        startB = leftMostPosB
                        stopB = startB + int(alnLenB) - 1
                        nJoinPairs += 1
                        # store under the canonical (lexicographically
                        # smaller-first) contig pair so both orientations
                        # land in the same bucket
                        if contigA <= contigB:
                            if (contigA, contigB) not in dContigPairs:
                                dContigPairs[contigA, contigB] = [
                                    (startA, startB, stopA, stopB,
                                     senseA, senseB, readsID)
                                ]
                            else:
                                dContigPairs[contigA, contigB] += [
                                    (startA, startB, stopA, stopB,
                                     senseA, senseB, readsID)
                                ]
                            fOUT.write(
                                "%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n" %
                                (readsID, contigA, startA, stopA, senseA,
                                 contigB, startB, stopB, senseB))
                        else:
                            # swapped order: B becomes the first contig
                            if (contigB, contigA) not in dContigPairs:
                                dContigPairs[contigB, contigA] = [
                                    (startB, startA, stopB, stopA,
                                     senseB, senseA, readsID)
                                ]
                            else:
                                dContigPairs[contigB, contigA] += [
                                    (startB, startA, stopB, stopA,
                                     senseB, senseA, readsID)
                                ]
                            fOUT.write(
                                "%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n" %
                                (readsID, contigB, startB, stopB, senseB,
                                 contigA, startA, stopA, senseA))
                # progress heartbeat every 5M read pairs
                if nReadsPairs % 5000000 == 0:
                    elapsedTime = float((time.time() - startTime) / 60)
                    agBAMProgress.logger.info(
                        "%d parsed\t| %s\t| %.2f m" %
                        (nReadsPairs, readsID, elapsedTime))
    except KeyboardInterrupt:
        agBAMProgress.logger.info(
            "Extract Joining-pairs INTERRUPTED by Keyboard")
        sys.exit(1)
    agBAMProgress.logger.info("%d reads pairs in the give BAM" %
                              (nReadsPairs))
    agBAMProgress.logger.info("%d joining pairs parsed" % (nJoinPairs))
    agBAMProgress.logger.info("%d contig pairs given by these joining pairs" %
                              (len(dContigPairs)))
    if nJoinPairs == 0:
        # nothing to scaffold with — hard stop
        agBAMProgress.logger.error("No joining pairs extracted")
        agBAMProgress.logger.error("Cannot SCAFFOLD without joining-pairs")
        sys.exit(1)
    else:
        agBAMProgress.logger.info("Succeeded")
        return dContigPairs
def get_gene_models(gff, outDir, prefix, debug=0):
    """
    Parse gene models from a GFF file and group them by contig.

    gff    -- path to the annotation file (GFF format, tab-separated)
    outDir -- AGOUTI output directory; an "agouti_GFFs" subdir is created
    prefix -- prefix used to name the progress/debug log files
    debug  -- when truthy, also write a per-contig gene-count debug log

    Returns a dict mapping contig ID -> list of AGOUTI_GFF gene models,
    each list sorted in ascending (geneStart, geneStop) order.

    Exits the process (sys.exit(1)) if the file contains no "gene" rows.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_GFFs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(moduleOutDir,
                                   "%s.agouti_gff.progressMeter" % (prefix))
    agGFFProgress = agLOG.PROGRESS_METER(moduleName)
    agGFFProgress.add_file_handler(progressLogFile)
    agGFFProgress.logger.info("[BEGIN] Getting gene models")
    dGFFs = collections.defaultdict(list)
    nGene = 0
    with open(gff, 'r') as fIN:
        # First pass: count "gene" rows so the model list can be
        # pre-allocated.  Stop at an embedded FASTA section, if any.
        for line in fIN:
            if line.startswith("##FASTA") or line.startswith("##Fasta"):
                break
            # skip empty lines and comment lines starting with '#'
            if not line.startswith('#') and len(line.strip()) > 0:
                tmp_line = line.strip().split("\t")
                if tmp_line[2] == "gene":
                    nGene += 1
        if nGene == 0:
            agGFFProgress.logger.error("Found zero genes")
            agGFFProgress.logger.error("Please check your GFF file")
            sys.exit(1)
        lobj_GeneModels = [AGOUTI_GFF() for i in xrange(nGene)]
        geneIndex = -1
        fIN.seek(0)
        # Second pass: populate each model.  A finished model is flushed
        # to dGFFs when the NEXT "gene" row is reached; the last model is
        # flushed after the loop.
        for line in fIN:
            # stop before getting into the FASTA zone
            if line.startswith("##FASTA") or line.startswith("##Fasta"):
                break
            if not line.startswith('#') and line.strip():
                tmp_line = line.strip().split("\t")
                if tmp_line[2] == "gene":
                    geneIndex += 1
                    # pull the gene ID out of the column-9 attributes
                    attrs = tmp_line[8].split(';')
                    for attr in attrs:
                        # tolerate empty/malformed fields (e.g. a trailing
                        # ';') instead of crashing on tuple unpacking;
                        # maxsplit=1 keeps '=' inside attribute values intact
                        if '=' not in attr:
                            continue
                        attrID, attrVal = attr.split('=', 1)
                        if attrID == "ID":
                            geneID = attrVal
                            break
                    # NOTE(review): if a gene row carries no ID attribute,
                    # geneID silently keeps the previous gene's value (or
                    # is unbound for the first gene) — confirm the input
                    # GFF always provides ID=
                    if geneIndex > 0:
                        # flush the previous, now-complete gene model
                        preCtgID = lobj_GeneModels[geneIndex - 1].ctgID
                        dGFFs[preCtgID].append(lobj_GeneModels[geneIndex - 1])
                    lobj_GeneModels[geneIndex].setGene(geneID,
                                                       int(tmp_line[3]),
                                                       int(tmp_line[4]))
                    lobj_GeneModels[geneIndex].setProgram(tmp_line[1])
                    lobj_GeneModels[geneIndex].setContigID(tmp_line[0])
                    lobj_GeneModels[geneIndex].setStrand(tmp_line[6])
                elif tmp_line[2] == "stop_codon":
                    lobj_GeneModels[geneIndex].setStopCodon()
                elif tmp_line[2] == "start_codon":
                    lobj_GeneModels[geneIndex].setStartCodon()
                elif tmp_line[2] == "CDS":
                    lobj_GeneModels[geneIndex].updateCDS(int(tmp_line[3]),
                                                         int(tmp_line[4]))
        # Flush the final gene model.  The original code skipped this when
        # the loop broke at a "##FASTA" line, silently dropping the last
        # gene of any GFF with embedded sequences; flush unconditionally.
        if geneIndex >= 0:
            dGFFs[lobj_GeneModels[geneIndex].ctgID].append(
                lobj_GeneModels[geneIndex])
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_gff.debug" % (prefix))
        agGFFDebug = agLOG.DEBUG(moduleName, debugLogFile)
        agGFFDebug.debugger.debug("Sequence\tNum_Gene_Models")
    nGeneModels = 0
    for k, v in sorted(dGFFs.items()):
        # make sure gene models are in ascending coordinate order
        tmpV = sorted(v, key=lambda gene: (gene.geneStart, gene.geneStop))
        dGFFs[k] = tmpV
        nGeneModels += len(tmpV)
        if debug:
            agGFFDebug.debugger.debug("%s\t%d" % (k, len(tmpV)))
    agGFFProgress.logger.info("%d Gene Models parsed" % (nGeneModels))
    agGFFProgress.logger.info("[DONE]")
    return dGFFs