def getSteps(halPath, maxBlock, scaleFactor, minLod0, cutOffFrac, minSeqFrac,
             minCovFrac):
    """Compute the sequence of level-of-detail step sizes for a HAL file.

    The first step is the larger of ceil(minLod0 / maxBlock) and the value
    returned by getMinAvgBlockSize().  Each following step is the previous
    one multiplied by scaleFactor.  Generation stops after the first step
    that exceeds ceil(maxGenomeLength / maxBlock) * cutOffFrac, or -- when
    both minSeqFrac and minCovFrac are positive -- after the first step for
    which getMinCoverageFrac() falls below minCovFrac.

    Returns:
        A pair (steps, lastIsMax).  steps is the list of generated steps,
        each truncated to int.  lastIsMax is True only when generation
        stopped because of the coverage test.

    Raises:
        AssertionError: if the maximum genome length is not positive.
        RuntimeError: if multiplying by scaleFactor leaves the step
            unchanged, in which case the loop could never make progress.
    """
    statsTable = getHalStats(halPath)
    # per-genome sequence statistics, keyed by the first column of each row
    sequenceStatsTable = dict()
    for row in statsTable:
        sequenceStatsTable[row[0]] = getHalSequenceStats(halPath, row[0])

    maxLen = getMaxGenomeLength(statsTable)
    assert maxLen > 0
    maxStep = math.ceil(float(maxLen) / float(maxBlock))
    lodBaseStep = math.ceil(float(minLod0) / float(maxBlock))
    baseStep = max(lodBaseStep, getMinAvgBlockSize(statsTable))

    # the coverage test is only active when both fractions are positive
    checkCoverage = minSeqFrac > 0. and minCovFrac > 0.

    outList = []
    step = baseStep
    # last LOD is just "max" token which tells browser it and anything
    # beyond is disabled.
    lastIsMax = False
    while True:
        outList.append(step)
        if step > maxStep * cutOffFrac:
            break
        if checkCoverage:
            minCoverageFrac = getMinCoverageFrac(
                sequenceStatsTable, math.floor(step * minSeqFrac))
            if minCoverageFrac < minCovFrac:
                lastIsMax = True
                break
        nextStep = step * scaleFactor
        if nextStep == step:
            # e.g. scaleFactor == 1 or step == 0: every later iteration would
            # see the same state, so fail loudly instead of growing outList
            # without bound.
            raise RuntimeError(
                "LOD step %s does not change when scaled by %s" %
                (step, scaleFactor))
        step = nextStep
    return [int(x) for x in outList], lastIsMax
def partitionByGenomeCoords(options, refGenome):
    """Build slice commands by slicing the genome coordinates directly.

    Works on a deep copy of options.  When no slice size was given and more
    than one process is requested, the slice size is derived from
    options.length if it is positive, otherwise from the median of the
    reference sequence lengths, divided by numProc and rounded up.  Every
    row of the reference genome's sequence statistics (row[0] is used as the
    sequence name, row[1] as its length) is then cut with computeSlices().

    Returns:
        A pair (sliceCmds, sliceOpts): the command produced by
        getHal2MafCmd() for every slice, and a deep copy of the options used
        for that slice, in matching order.  Both lists are empty when the
        genome has no sequences.

    Raises:
        AssertionError: if options.splitBySequence is not False.
    """
    sliceCmds = []
    sliceOpts = []
    seqOpts = copy.deepcopy(options)
    assert seqOpts.splitBySequence is False
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    if not refSequenceStats:
        # nothing to slice; also avoids statistics.median() raising on an
        # empty list below
        return sliceCmds, sliceOpts

    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        else:
            # use median of sequence lengths
            refLen = int(statistics.median([r[1] for r in refSequenceStats]))
        seqOpts.sliceSize = math.ceil(refLen / seqOpts.numProc)

    # NOTE(review): seqOpts is shared by all sequences, so the start/length
    # assigned for one sequence's last slice are still set when
    # computeSlices() is called for the next sequence -- confirm that
    # computeSlices() does not depend on them.
    for refSeqStat in refSequenceStats:
        seqOpts.refSequence = refSeqStat[0]
        for sStart, sLen, sIdx in computeSlices(seqOpts, refSeqStat[1]):
            seqOpts.start = sStart
            seqOpts.length = sLen
            seqOpts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(seqOpts))
            sliceOpts.append(copy.deepcopy(seqOpts))
    return sliceCmds, sliceOpts
def writeChromSizes(options):
    """Write a tab-separated table of the reference genome's sequences.

    Does nothing when options.chromSizes is None.  Otherwise one line per
    row returned by getHalSequenceStats(options.halFile, options.refGenome)
    is written to the file named by options.chromSizes, holding the row's
    first two columns separated by a tab.

    Raises:
        AssertionError: if getHalSequenceStats() returns None.
    """
    if options.chromSizes is None:
        return
    # Fetch the statistics before opening the output so that a failure here
    # does not truncate an existing file.
    refSequenceStats = getHalSequenceStats(options.halFile, options.refGenome)
    assert refSequenceStats is not None
    # the context manager closes the file even if a write raises
    with open(options.chromSizes, "w") as csFile:
        for seqStat in refSequenceStats:
            csFile.write("%s\t%s\n" % (seqStat[0], seqStat[1]))
def runParallelSlices(options):
    """Split a hal2maf job into slices, run them in parallel and merge them.

    The reference genome defaults to getHalRootName(options.halFile) when
    options.refGenome is None.  options.smallFile and options.firstSmallFile
    are reset on the passed-in options object before slicing.

    Two slicing modes exist:
      * sequence coordinates, used when options.splitBySequence is True or
        options.refSequence is set: every matching reference sequence gets
        its own deep copy of the options and is cut with computeSlices();
      * genome coordinates otherwise: the whole genome length is cut, with
        the slice size derived from numProc when none was given.

    The commands built with getHal2MafCmd() are handed to
    runParallelShellCommands() and the results to concatenateSlices().

    Raises:
        AssertionError: in genome-coordinate mode, if
            options.splitBySequence is not False.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []

    # we are going to deal with sequence coordinates
    if options.splitBySequence is True or options.refSequence is not None:
        for sequence, seqLen, nt, nb in refSequenceStats:
            if options.refSequence is None or sequence == options.refSequence:
                seqOpts = copy.deepcopy(options)
                if seqLen < options.smallSize:
                    seqOpts.smallFile = True
                seqOpts.refGenome = refGenome
                seqOpts.refSequence = sequence
                for sStart, sLen, sIdx in computeSlices(seqOpts, seqLen):
                    seqOpts.start = sStart
                    seqOpts.length = sLen
                    seqOpts.sliceNumber = sIdx
                    sliceCmds.append(getHal2MafCmd(seqOpts))
                    sliceOpts.append(copy.deepcopy(seqOpts))
                # once a non-empty small sequence has been handled, the
                # option copies made for later sequences see
                # firstSmallFile == False
                if seqOpts.smallFile is True and seqLen > 0:
                    options.firstSmallFile = False
    # we are slicing the genome coordinates directly
    else:
        seqOpts = copy.deepcopy(options)
        assert seqOpts.splitBySequence is False
        genomeLen = getHalGenomeLength(seqOpts.halFile, refGenome)
        # auto compute slice size from numprocs
        if seqOpts.sliceSize is None and seqOpts.numProc > 1:
            refLen = genomeLen
            if seqOpts.length is not None and seqOpts.length > 0:
                refLen = seqOpts.length
            # explicit float division so that the ceil really rounds up
            seqOpts.sliceSize = int(
                math.ceil(float(refLen) / seqOpts.numProc))
        for sStart, sLen, sIdx in computeSlices(seqOpts, genomeLen):
            seqOpts.start = sStart
            seqOpts.length = sLen
            seqOpts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(seqOpts))
            sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)
    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)
def runParallelSlices(options):
    """Split a halPhyloP job into slices, run them in parallel and merge.

    The reference genome defaults to getHalRootName(options.halFile) when
    options.refGenome is None.  The length to slice is the length of
    options.refSequence when one is given, otherwise the value returned by
    getHalGenomeLength().  When no slice size was given and numProc > 1, the
    slice size is that length (or options.length if positive) divided by
    numProc, rounded up.

    The commands built with getHalPhyloPCmd() are handed to
    runParallelShellCommands(), the results to concatenateSlices(), and
    finally writeChromSizes(options) is called.

    Raises:
        RuntimeError: if options.refSequence does not match exactly one row
            of the reference genome's sequence statistics.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []

    if options.refSequence is not None:
        # each stats row starts with (sequence name, sequence length)
        refStat = [x for x in refSequenceStats
                   if x[0] == options.refSequence]
        if len(refStat) != 1:
            raise RuntimeError("Sequence %s not found in genome %s" % (
                options.refSequence, refGenome))
        totalLength = int(refStat[0][1])
    else:
        totalLength = getHalGenomeLength(options.halFile, refGenome)

    seqOpts = copy.deepcopy(options)

    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        refLen = totalLength
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        # explicit float division so that the ceil really rounds up
        seqOpts.sliceSize = int(math.ceil(float(refLen) / seqOpts.numProc))

    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)
    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)
    writeChromSizes(options)
def runParallelSlices(options):
    """Slice a halPhyloP run, execute the slices in parallel, then merge.

    Uses options.refGenome, or getHalRootName(options.halFile) when it is
    None.  If options.refSequence is set, only that sequence's length is
    sliced; otherwise the length from getHalGenomeLength() is used.  A
    missing slice size is computed from numProc (only when numProc > 1) as
    ceil(length / numProc), where length is options.length if positive and
    the total length otherwise.

    Side effects: resets options.smallFile / options.firstSmallFile, runs
    the getHalPhyloPCmd() commands through runParallelShellCommands(), calls
    concatenateSlices() and then writeChromSizes(options).

    Raises:
        RuntimeError: if options.refSequence does not identify exactly one
            sequence of the reference genome.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True

    if options.refSequence is None:
        totalLength = getHalGenomeLength(options.halFile, refGenome)
    else:
        # column 0 of a stats row is the sequence name, column 1 its length
        matches = [row for row in refSequenceStats
                   if row[0] == options.refSequence]
        if len(matches) != 1:
            raise RuntimeError("Sequence %s not found in genome %s" %
                               (options.refSequence, refGenome))
        totalLength = int(matches[0][1])

    seqOpts = copy.deepcopy(options)

    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        else:
            refLen = totalLength
        # float division keeps the ceil meaningful on any interpreter
        seqOpts.sliceSize = int(math.ceil(float(refLen) / seqOpts.numProc))

    sliceCmds = []
    sliceOpts = []
    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)
    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)
    writeChromSizes(options)
def partitionBySeqCoords(options, refGenome):
    """Build slice commands sequence by sequence.

    Every row of the reference genome's sequence statistics whose name
    matches options.refSequence (or every row, when options.refSequence is
    None) gets its own deep copy of the options, which is then cut with
    computeSlices().  A copy is flagged smallFile when the sequence is
    shorter than options.smallSize.  After a non-empty sequence whose copy
    has smallFile set, options.firstSmallFile is set to False on the
    passed-in options object.

    Returns:
        A pair (sliceCmds, sliceOpts): the getHal2MafCmd() command for each
        slice and a deep copy of the options used for it, in matching order.
    """
    commands = []
    commandOpts = []
    wanted = options.refSequence
    stats = getHalSequenceStats(options.halFile, refGenome)
    for name, length, _nt, _nb in stats:
        # skip sequences other than the requested one, if one was requested
        if wanted is not None and name != wanted:
            continue

        perSeqOpts = copy.deepcopy(options)
        if length < options.smallSize:
            perSeqOpts.smallFile = True
        perSeqOpts.refGenome = refGenome
        perSeqOpts.refSequence = name

        for start, sliceLen, sliceIdx in computeSlices(perSeqOpts, length):
            perSeqOpts.start = start
            perSeqOpts.length = sliceLen
            perSeqOpts.sliceNumber = sliceIdx
            commands.append(getHal2MafCmd(perSeqOpts))
            commandOpts.append(copy.deepcopy(perSeqOpts))

        if perSeqOpts.smallFile is True and length > 0:
            options.firstSmallFile = False
    return commands, commandOpts