def computeTreePhyloP(args):
    """Compute phyloP wiggle tracks for args.root and all of its descendants.

    Genomes are visited from a stack seeded with args.root, so a parent is
    always handled before its children.  For each genome:

    1. (non-root only) halAlignedExtract --complement writes a bed file that
       is then passed to halPhyloPMP.py through --refBed.
    2. halPhyloPMP.py is run to produce the genome's wig file.
    3. (non-root only) if the parent's wig file exists, it is lifted onto the
       genome with halWiggleLiftover --append.
    4. If args.bigWig is True and the wig file exists, a wig -> bigWig
       conversion command is queued.

    The queued conversions are executed together at the very end with
    runParallelShellCommands, using args.numProc.

    Attributes read from args: root, hal, mod, numProc, subtree, prec, bigWig
    (plus whatever outFileName itself consumes).
    """
    pending = [args.root]
    bigwigCmds = []

    while pending:
        genome = pending.pop()
        hasParent = genome != args.root

        # Bed file of all regions of genome that don't align to its parent.
        # The root has no parent, so nothing is extracted for it and phyloP
        # is run without a --refBed restriction.
        insertsBed = outFileName(args, genome, "bed", "inserts", True)
        refBedOpt = ""
        if hasParent:
            extractCmd = "halAlignedExtract %s %s --alignedFile %s --complement" % (
                args.hal, genome, insertsBed)
            runShellCommand(extractCmd)
            refBedOpt = "--refBed %s" % insertsBed

        # Run halPhyloP on the inserts.
        genomeWig = outFileName(args, genome, "wig", "phyloP", False)
        phyloPCmd = "halPhyloPMP.py %s %s %s %s --numProc %d %s" % (
            args.hal, genome, args.mod, refBedOpt, args.numProc, genomeWig)
        if args.subtree is not None:
            phyloPCmd += " --subtree %s" % args.subtree
        if args.prec is not None:
            phyloPCmd += " --prec %d" % args.prec
        runShellCommand(phyloPCmd)

        # The bed file is only needed for the phyloP run above.
        runShellCommand("rm -f %s" % insertsBed)

        # Lift down from the parent, appending to the wig file computed above.
        if hasParent:
            parentName = getHalParentName(args.hal, genome)
            parentWig = outFileName(args, parentName, "wig", "phyloP", False)
            if os.path.isfile(parentWig):
                liftCmd = "halWiggleLiftover %s %s %s %s %s --append" % (
                    args.hal, parentName, parentWig, genome, genomeWig)
                runShellCommand(liftCmd)

        # Queue the bigWig conversion (which also deletes the wig file).  It is
        # deferred so the wig is still on disk when the children lift from it.
        if args.bigWig is True and os.path.isfile(genomeWig):
            chromSizes = outFileName(args, genome, "sizes", "chr", True)
            bigWigOut = outFileName(args, genome, "bw", "phyloP", False)
            bigwigCmds.append("".join([
                "halStats %s --chromSizes %s > %s && " % (
                    args.hal, genome, chromSizes),
                "wigToBigWig %s %s %s && " % (genomeWig, chromSizes, bigWigOut),
                "rm -f %s &&" % genomeWig,
                "rm -f %s" % chromSizes,
            ]))

        # Descend into the children.
        pending.extend(getHalChildrenNames(args.hal, genome))

    # Parallel bigwig conversion.
    runParallelShellCommands(bigwigCmds, args.numProc)
def runParallelSlices(options):
    """Cut the reference into slices, run one command per slice, then merge.

    The reference genome is options.refGenome, or the HAL root (via
    getHalRootName) when that is None.  Two partitioning modes exist:

    * Sequence coordinates (options.splitBySequence is True or
      options.refSequence is set): every reference sequence -- or only the
      one named by options.refSequence -- is sliced on its own with
      computeSlices.  Sequences shorter than options.smallSize are flagged
      with smallFile = True on their per-slice options.
    * Genome coordinates (otherwise): the whole genome is sliced.  When no
      slice size was given and more than one process is requested, the slice
      size is derived from the reference length and options.numProc.

    Each slice yields one command from getHal2MafCmd and a deep-copied
    options snapshot.  The commands are run with runParallelShellCommands
    and both lists are handed to concatenateSlices.

    Side effects on the caller's options: smallFile is reset to False and
    firstSmallFile to True, and firstSmallFile is switched to False once a
    non-empty small sequence has been processed.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []

    if options.splitBySequence is True or options.refSequence is not None:
        # We are going to deal with sequence coordinates.
        for sequence, seqLen, nt, nb in refSequenceStats:
            if (options.refSequence is not None and
                    sequence != options.refSequence):
                continue
            seqOpts = copy.deepcopy(options)
            if seqLen < options.smallSize:
                seqOpts.smallFile = True
            seqOpts.refGenome = refGenome
            seqOpts.refSequence = sequence
            for sStart, sLen, sIdx in computeSlices(seqOpts, seqLen):
                seqOpts.start = sStart
                seqOpts.length = sLen
                seqOpts.sliceNumber = sIdx
                sliceCmds.append(getHal2MafCmd(seqOpts))
                # Snapshot: seqOpts is mutated again on the next iteration.
                sliceOpts.append(copy.deepcopy(seqOpts))
            # Copies of options taken for later sequences will see that a
            # small sequence has already been handled.
            if seqOpts.smallFile is True and seqLen > 0:
                options.firstSmallFile = False
    else:
        # We are slicing the genome coordinates directly.
        seqOpts = copy.deepcopy(options)
        assert seqOpts.splitBySequence is False
        genomeLen = getHalGenomeLength(seqOpts.halFile, refGenome)
        # Auto compute slice size from numProc.
        if seqOpts.sliceSize is None and seqOpts.numProc > 1:
            refLen = genomeLen
            if seqOpts.length is not None and seqOpts.length > 0:
                refLen = seqOpts.length
            # Force true division: under Python 2, int / int truncates before
            # math.ceil ever sees the value, which made the ceil a no-op.
            seqOpts.sliceSize = int(
                math.ceil(float(refLen) / seqOpts.numProc))
        for sStart, sLen, sIdx in computeSlices(seqOpts, genomeLen):
            seqOpts.start = sStart
            seqOpts.length = sLen
            seqOpts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(seqOpts))
            sliceOpts.append(copy.deepcopy(seqOpts))

    # Run in parallel.
    runParallelShellCommands(sliceCmds, options.numProc)

    # Concatenate into output if desired.
    concatenateSlices(sliceOpts, sliceCmds)
def runParallelSlices(options):
    """Slice the reference, run one phyloP command per slice, then merge.

    The reference genome is options.refGenome, or the HAL root (via
    getHalRootName) when that is None.  The range to slice is the length of
    options.refSequence when one is given, otherwise the whole genome length
    from getHalGenomeLength.  When no slice size was supplied and more than
    one process is requested, the slice size is derived from that length (or
    from options.length if positive) and options.numProc.

    Each slice from computeSlices yields one command from getHalPhyloPCmd and
    a deep-copied options snapshot.  The commands are run with
    runParallelShellCommands, the results are passed to concatenateSlices,
    and finally writeChromSizes(options) is called.

    Side effects on the caller's options: smallFile is set to False and
    firstSmallFile to True.

    Raises:
        RuntimeError: options.refSequence is set but does not match exactly
            one entry of the reference genome's sequence stats.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []

    if options.refSequence is not None:
        # Entries are assumed to be (name, length, ...) tuples -- confirm
        # against getHalSequenceStats.  Match on the name column, then read
        # the length from the single matching entry.
        refStat = [x for x in refSequenceStats
                   if x[0] == options.refSequence]
        if len(refStat) != 1:
            # Report the resolved genome: options.refGenome may be None.
            raise RuntimeError("Sequence %s not found in genome %s" % (
                options.refSequence, refGenome))
        totalLength = int(refStat[0][1])
    else:
        totalLength = getHalGenomeLength(options.halFile, refGenome)

    seqOpts = copy.deepcopy(options)

    # Auto compute slice size from numProc.
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        refLen = totalLength
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        # Force true division: under Python 2, int / int truncates before
        # math.ceil ever sees the value, which made the ceil a no-op.
        seqOpts.sliceSize = int(math.ceil(float(refLen) / seqOpts.numProc))

    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        # Snapshot: seqOpts is mutated again on the next iteration.
        sliceOpts.append(copy.deepcopy(seqOpts))

    # Run in parallel.
    runParallelShellCommands(sliceCmds, options.numProc)

    # Concatenate into output if desired.
    concatenateSlices(sliceOpts, sliceCmds)
    writeChromSizes(options)
def createLods(halPath, outLodPath, outDir, maxBlock, scale, overwrite,
               maxDNA, absPath, trans, inMemory, probeFrac, minSeqFrac,
               scaleCorFac, numProc, chunk, minLod0, cutOff, minCovFrac):
    """Write the LOD index file and generate the interpolated HAL files.

    The index file at outLodPath gets one line per level of detail:
    "0 <path of the input HAL>" first, then "<maxQueryLength> <path>" for
    every step returned by getSteps after the first.  The path is the
    step's output HAL file, or MaxLodToken for the final step when getSteps
    reports lastIsMax.

    For every non-maximal step whose output file is missing (or always, when
    overwrite is True) a command from getHalLodExtractCmd is queued.  With
    trans set to True, steps after the first extract from the previous
    step's output instead of from halPath.  The queued commands are executed
    with runParallelShellCommands(…, numProc) after the index file has been
    closed.

    The remaining parameters (probeFrac, minSeqFrac, chunk, minCovFrac,
    inMemory, minLod0, cutOff) are forwarded unchanged to getSteps and/or
    getHalLodExtractCmd.
    """
    lodExtractCmds = []

    # "with" guarantees the index file is closed even if a helper raises.
    with open(outLodPath, "w") as lodFile:
        lodFile.write(
            "0 %s\n" % formatOutHalPath(outLodPath, halPath, absPath))
        steps, lastIsMax = getSteps(halPath, maxBlock, scale, minLod0, cutOff,
                                    minSeqFrac, minCovFrac)
        curStepFactor = scaleCorFac
        prevStep = None
        for stepIdx in xrange(1, len(steps)):
            step = int(max(1, steps[stepIdx] * curStepFactor))
            maxQueryLength = maxBlock * steps[stepIdx - 1]
            keepSequences = maxQueryLength <= maxDNA
            # We no longer pass the step to the halLodExtract executable;
            # rather we give the corresponding scale factor and let the step
            # get computed for each internal node (instead of using the step
            # here, which is a global minimum).
            stepScale = (scale ** stepIdx) * curStepFactor
            outHalPath = makePath(halPath, outDir, step, "lod", "hal")
            srcPath = halPath
            if trans is True and stepIdx > 1:
                srcPath = makePath(halPath, outDir, prevStep, "lod", "hal")
            isMaxLod = stepIdx == len(steps) - 1 and lastIsMax is True
            if not isMaxLod and (overwrite is True or
                                 not os.path.isfile(outHalPath)):
                lodExtractCmds.append(
                    getHalLodExtractCmd(srcPath, outHalPath, stepScale,
                                        keepSequences, inMemory, probeFrac,
                                        minSeqFrac, chunk, minCovFrac))
            lodPath = formatOutHalPath(outLodPath, outHalPath, absPath)
            if isMaxLod:
                lodPath = MaxLodToken
            lodFile.write("%d %s\n" % (maxQueryLength, lodPath))

            # prevStep is None on the first pass.  Python 2 evaluates
            # None > int as False; the explicit guard keeps that result
            # without relying on cross-type ordering.
            if prevStep is not None and prevStep > steps[-1]:
                break
            prevStep = step
            curStepFactor *= scaleCorFac

    runParallelShellCommands(lodExtractCmds, numProc)
def runParallelSlices(options):
    """Slice the reference, run one phyloP command per slice, then merge.

    The reference genome is options.refGenome, or the HAL root (via
    getHalRootName) when that is None.  The range to slice is the length of
    options.refSequence when one is given, otherwise the whole genome length
    from getHalGenomeLength.  When no slice size was supplied and more than
    one process is requested, the slice size is derived from that length (or
    from options.length if positive) and options.numProc.

    Each slice from computeSlices yields one command from getHalPhyloPCmd and
    a deep-copied options snapshot.  The commands are run with
    runParallelShellCommands, the results are passed to concatenateSlices,
    and finally writeChromSizes(options) is called.

    Side effects on the caller's options: smallFile is set to False and
    firstSmallFile to True.

    Raises:
        RuntimeError: options.refSequence is set but does not match exactly
            one entry of the reference genome's sequence stats.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []

    if options.refSequence is not None:
        # Entries are assumed to be (name, length, ...) tuples -- confirm
        # against getHalSequenceStats.  Match on the name column, then read
        # the length from the single matching entry.
        refStat = [x for x in refSequenceStats
                   if x[0] == options.refSequence]
        if len(refStat) != 1:
            # Report the resolved genome: options.refGenome may be None.
            raise RuntimeError("Sequence %s not found in genome %s" %
                               (options.refSequence, refGenome))
        totalLength = int(refStat[0][1])
    else:
        totalLength = getHalGenomeLength(options.halFile, refGenome)

    seqOpts = copy.deepcopy(options)

    # Auto compute slice size from numProc.
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        refLen = totalLength
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        # Force true division: under Python 2, int / int truncates before
        # math.ceil ever sees the value, which made the ceil a no-op.
        seqOpts.sliceSize = int(math.ceil(float(refLen) / seqOpts.numProc))

    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        # Snapshot: seqOpts is mutated again on the next iteration.
        sliceOpts.append(copy.deepcopy(seqOpts))

    # Run in parallel.
    runParallelShellCommands(sliceCmds, options.numProc)

    # Concatenate into output if desired.
    concatenateSlices(sliceOpts, sliceCmds)
    writeChromSizes(options)
def runParallelSlices(options):
    """Partition the job into slices, run them in parallel, and merge.

    The reference genome is options.refGenome, falling back to
    getHalRootName(options.halFile) when it is None.  One of three
    partitioners supplies the (commands, per-slice options) pair:

    * partitionRefTargets     -- options.refTargets is truthy
    * partitionBySeqCoords    -- options.splitBySequence is True, or
                                 options.refSequence is set
    * partitionByGenomeCoords -- everything else

    After the commands have run, any file named by a slice's refTargets
    attribute that exists on disk is removed, and the slices are handed to
    concatenateSlices.

    Side effects on the caller's options: smallFile is set to False and
    firstSmallFile to True before partitioning.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)

    options.smallFile = False
    options.firstSmallFile = True

    # Choose the partitioning strategy; each returns (commands, options).
    if options.refTargets:
        partitioned = partitionRefTargets(options)
    elif options.splitBySequence is True or options.refSequence is not None:
        partitioned = partitionBySeqCoords(options, refGenome)
    else:
        partitioned = partitionByGenomeCoords(options, refGenome)
    sliceCmds, sliceOpts = partitioned

    # Run in parallel.
    runParallelShellCommands(sliceCmds, options.numProc)

    # Clean up temporary bed files (if present).
    for sliceOpt in sliceOpts:
        bedPath = sliceOpt.refTargets
        if not bedPath:
            continue
        if os.path.isfile(bedPath):
            os.remove(bedPath)

    # Concatenate into output if desired.
    concatenateSlices(sliceOpts, sliceCmds)