def __init__(self, options, project, event, schedule, depProjects, memory=None, cores=None):
    """Create a preemptable RoundedJob and record its inputs.

    ``memory`` and ``cores`` are forwarded unchanged to
    ``RoundedJob.__init__``; every other argument is stored on the
    instance under an attribute of the same name.
    """
    RoundedJob.__init__(
        self,
        memory=memory,
        cores=cores,
        preemptable=True,
    )
    self.options, self.project = options, project
    self.event, self.schedule = event, schedule
    self.depProjects = depProjects
def __init__(self, options, project, event, eventExpWrapper, schedule, memory=None, cores=None):
    """Create a preemptable RoundedJob and keep the arguments on the instance.

    ``memory`` and ``cores`` are handed straight to
    ``RoundedJob.__init__``. ``options``, ``project``, ``event``,
    ``eventExpWrapper`` and ``schedule`` are stored as same-named
    attributes.
    """
    RoundedJob.__init__(
        self,
        memory=memory,
        cores=cores,
        preemptable=True,
    )
    self.options, self.project = options, project
    self.event, self.eventExpWrapper = event, eventExpWrapper
    self.schedule = schedule
def __init__(self, prepOptions, inSequenceID, chunksToCompute=None):
    """Create a preemptable RoundedJob sized from the input sequence.

    Cores and memory are read from ``prepOptions.cpu`` and
    ``prepOptions.memory``. The disk request is three times
    ``inSequenceID.size`` when that attribute exists, and ``None``
    otherwise. All three arguments are stored on the instance.
    """
    # An input without a ``size`` attribute gives no disk estimate, so
    # None is passed through to RoundedJob in that case.
    if hasattr(inSequenceID, "size"):
        diskRequest = 3 * inSequenceID.size
    else:
        diskRequest = None

    RoundedJob.__init__(
        self,
        cores=prepOptions.cpu,
        memory=prepOptions.memory,
        disk=diskRequest,
        preemptable=True,
    )
    self.prepOptions = prepOptions
    self.inSequenceID = inSequenceID
    self.chunksToCompute = chunksToCompute
def __init__(self, repeatMaskOptions, queryID, targetIDs):
    """Create a preemptable RoundedJob sized from the query and targets.

    The disk request is twice the combined ``size`` of ``queryID`` and
    every entry of ``targetIDs``; the memory request is a fixed constant.
    All three arguments are stored on the instance.
    """
    combinedTargetSize = 0
    for targetID in targetIDs:
        combinedTargetSize += targetID.size

    # 4 * 1024**3; presumably a byte count (4 GiB) -- confirm against RoundedJob.
    memoryRequest = 4 * 1024 ** 3
    diskRequest = 2 * (queryID.size + combinedTargetSize)

    RoundedJob.__init__(
        self,
        memory=memoryRequest,
        disk=diskRequest,
        preemptable=True,
    )
    self.repeatMaskOptions = repeatMaskOptions
    self.queryID = queryID
    self.targetIDs = targetIDs
def __init__(self, prepOptions, inChunkID):
    """Create a preemptable RoundedJob for a single input chunk.

    Memory and cores come from ``prepOptions.memory`` and
    ``prepOptions.cpu``; the disk request equals ``inChunkID.size``.
    Both arguments are stored on the instance.
    """
    RoundedJob.__init__(
        self,
        memory=prepOptions.memory,
        cores=prepOptions.cpu,
        disk=inChunkID.size,
        preemptable=True,
    )
    self.prepOptions, self.inChunkID = prepOptions, inChunkID
def __init__(self, prepOptions, chunkIDList):
    """Create a preemptable RoundedJob sized from a list of chunks.

    Cores and memory come from ``prepOptions.cpu`` and
    ``prepOptions.memory``; the disk request is twice the summed
    ``size`` of the entries in ``chunkIDList``. Both arguments are
    stored on the instance.
    """
    totalChunkSize = sum(chunkID.size for chunkID in chunkIDList)
    RoundedJob.__init__(
        self,
        cores=prepOptions.cpu,
        memory=prepOptions.memory,
        disk=2 * totalChunkSize,
        preemptable=True,
    )
    self.prepOptions, self.chunkIDList = prepOptions, chunkIDList
def __init__(self, inputSequenceIDs, configNode):
    """Create a preemptable RoundedJob sized from the input sequences.

    The disk request is the summed ``size`` of those entries of
    ``inputSequenceIDs`` that have a ``size`` attribute; entries without
    one contribute nothing. Both arguments are stored on the instance.
    """
    sizedInputs = (seqID for seqID in inputSequenceIDs if hasattr(seqID, 'size'))
    diskRequest = sum(seqID.size for seqID in sizedInputs)

    RoundedJob.__init__(self, disk=diskRequest, preemptable=True)
    self.inputSequenceIDs, self.configNode = inputSequenceIDs, configNode
def __init__(self, prepOptions, seqIDs, proportionSampled, inChunkID):
    """Create a preemptable RoundedJob sized from sequences plus one chunk.

    Memory and cores come from ``prepOptions.memory`` and
    ``prepOptions.cpu``. The disk request is the summed ``size`` of
    ``seqIDs`` plus three times ``inChunkID.size``. ``prepOptions``,
    ``seqIDs`` and ``inChunkID`` are stored on the instance.
    """
    # NOTE(review): ``proportionSampled`` is accepted but neither stored nor
    # used here -- confirm whether that is intentional.
    totalSeqSize = sum(seqID.size for seqID in seqIDs)
    diskRequest = totalSeqSize + 3 * inChunkID.size

    RoundedJob.__init__(
        self,
        memory=prepOptions.memory,
        cores=prepOptions.cpu,
        disk=diskRequest,
        preemptable=True,
    )
    self.prepOptions = prepOptions
    self.seqIDs = seqIDs
    self.inChunkID = inChunkID
def __init__(self, repeatMaskOptions, fragmentsID, targetIDs):
    """Create a preemptable RoundedJob sized from fragments and targets.

    When ``fragmentsID`` has a ``size`` attribute, memory is a fixed
    constant and disk is twice the combined ``size`` of ``fragmentsID``
    and every entry of ``targetIDs``. Otherwise both requests are
    ``None``. All three arguments are stored on the instance.
    """
    memoryRequest = None
    diskRequest = None
    if hasattr(fragmentsID, "size"):
        combinedTargetSize = sum(targetID.size for targetID in targetIDs)
        memoryRequest = 3500000000
        diskRequest = 2 * (fragmentsID.size + combinedTargetSize)

    RoundedJob.__init__(
        self,
        memory=memoryRequest,
        disk=diskRequest,
        preemptable=True,
    )
    self.repeatMaskOptions = repeatMaskOptions
    self.fragmentsID = fragmentsID
    self.targetIDs = targetIDs
def __init__(self, fastaID, inputBedID=None, eventName=None, minLength=None):
    """Create a preemptable RoundedJob sized from the fasta input.

    The memory request equals ``fastaID.size`` and the disk request is
    twice that. All four arguments are stored on the instance.
    """
    fastaSize = fastaID.size
    RoundedJob.__init__(
        self,
        disk=2 * fastaSize,
        memory=fastaSize,
        preemptable=True,
    )
    self.fastaID, self.minLength = fastaID, minLength
    self.inputBedID, self.eventName = inputBedID, eventName
def __init__(self, fastaID, minLength, dnabrnnOpts):
    """Create a preemptable RoundedJob that requests every local core.

    Memory is a fixed constant, disk is twice ``fastaID.size``, and the
    core request is whatever ``cpu_count()`` returns. All three
    arguments are stored on the instance.
    """
    # 4 * 1024**3; presumably a byte count (4 GiB) -- confirm against RoundedJob.
    memoryRequest = 4 * 1024 ** 3
    diskRequest = 2 * fastaID.size
    # todo: clean up
    coreRequest = cpu_count()

    RoundedJob.__init__(
        self,
        memory=memoryRequest,
        disk=diskRequest,
        cores=coreRequest,
        preemptable=True,
    )
    self.fastaID = fastaID
    self.minLength = minLength
    self.dnabrnnOpts = dnabrnnOpts
def __init__(self, repeatMaskOptions, queryID, targetIDs):
    """Create a preemptable RoundedJob sized from the query and targets.

    Memory is a fixed constant and disk is twice the combined ``size``
    of ``queryID`` and every entry of ``targetIDs``. When
    ``repeatMaskOptions.gpuLastz`` is truthy the core request is
    ``cpu_count()``; otherwise it is ``None``. All three arguments are
    stored on the instance.
    """
    combinedTargetSize = sum(targetID.size for targetID in targetIDs)
    # 4 * 1024**3; presumably a byte count (4 GiB) -- confirm against RoundedJob.
    memoryRequest = 4 * 1024 ** 3
    diskRequest = 2 * (queryID.size + combinedTargetSize)
    # gpu jobs get the whole node (same hack as used in blast phase)
    coreRequest = cpu_count() if repeatMaskOptions.gpuLastz else None

    RoundedJob.__init__(
        self,
        memory=memoryRequest,
        disk=diskRequest,
        cores=coreRequest,
        preemptable=True,
    )
    self.repeatMaskOptions = repeatMaskOptions
    self.queryID = queryID
    self.targetIDs = targetIDs
def __init__(self, fastaID, dnabrnnOpts, cpu, minLength=None, mergeLength=None, action=None):
    """Create a preemptable RoundedJob with a capped core request.

    Memory is a fixed constant, disk is twice ``fastaID.size``, and the
    core request is the smaller of ``cpu_count()`` and ``cpu``.
    ``fastaID``, ``minLength``, ``mergeLength``, ``action`` and
    ``dnabrnnOpts`` are stored on the instance; ``cpu`` itself is not.
    """
    # 4 * 1024**3; presumably a byte count (4 GiB) -- confirm against RoundedJob.
    memoryRequest = 4 * 1024 ** 3
    diskRequest = 2 * fastaID.size
    coreRequest = min(cpu_count(), cpu)

    RoundedJob.__init__(
        self,
        memory=memoryRequest,
        disk=diskRequest,
        cores=coreRequest,
        preemptable=True,
    )
    self.fastaID = fastaID
    self.minLength, self.mergeLength = minLength, mergeLength
    self.action = action
    self.dnabrnnOpts = dnabrnnOpts
def __init__(self, fastaID, dnabrnnOpts, cpu, minLength=None, action=None, inputBedID=None, eventName=None):
    """Create a preemptable RoundedJob with a capped core request.

    Memory is a fixed constant, disk is twice ``fastaID.size``, and the
    core request is the smaller of ``cpu_count()`` and ``cpu``. Every
    argument except ``cpu`` is stored on the instance under the same
    name.
    """
    # 4 * 1024**3; presumably a byte count (4 GiB) -- confirm against RoundedJob.
    memoryRequest = 4 * 1024 ** 3
    diskRequest = 2 * fastaID.size
    coreRequest = min(cpu_count(), cpu)

    RoundedJob.__init__(
        self,
        memory=memoryRequest,
        disk=diskRequest,
        cores=coreRequest,
        preemptable=True,
    )
    self.fastaID = fastaID
    self.minLength, self.action = minLength, action
    self.dnabrnnOpts = dnabrnnOpts
    # todo: moved to fileMasking --> remove from here
    self.inputBedID = inputBedID
    self.eventName = eventName
def __init__(self, inputSequenceID, configNode):
    """Create a preemptable RoundedJob with no explicit resource requests.

    Both arguments are stored on the instance under the same name.
    """
    RoundedJob.__init__(self, preemptable=True)
    self.inputSequenceID, self.configNode = inputSequenceID, configNode
def get_plan(options, project, inSeqFile, outSeqFile, toil):
    """Build the preprocess / align / HAL-merge plan and return it as text.

    The output depends on the mode flags read from ``options``:

    * ``options.wdl``: the plan is assembled from the ``wdl_*`` helper
      calls, and ``options.pp_map`` is reset to an empty dict.
    * ``options.toil``: jobs are attached to a Toil job graph rooted at an
      empty ``RoundedJob`` and the graph is run with ``toil.start``.
    * neither: the plan is a list of shell command lines.

    Args:
        options: parsed options; only the attributes read below are used.
        project: passed to ``Schedule.loadProject`` and the helper calls;
            ``project.mcTree.getRootName()`` names the root event.
        inSeqFile: input seqfile, forwarded to the preprocess helpers.
        outSeqFile: output seqfile; its ``tree`` and ``pathMap`` supply the
            leaf names and the event names.
        toil: object whose ``start`` method runs the root job (Toil mode
            only).

    Returns:
        The plan as a single string. When ``options.preprocessOnly`` is
        set, the function returns right after the preprocessing section.

    Raises:
        SystemExit: with status 1 (after writing the remaining events and
            their dependencies to stderr) when no remaining event has all
            of its dependencies resolved.
    """
    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)
        options.pp_map = {}

    if options.toil:
        # kick things off with an empty job which we will hook subsequent jobs onto
        # (using RoundedJob because root job must be subclass of Job,
        #  https://github.com/ComparativeGenomicsToolkit/cactus/pull/284#issuecomment-684125478)
        start_job = RoundedJob()
        parent_job = start_job
        job_idx = {}

    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, pre_batch)
        elif options.toil:
            # NOTE(review): only the first leaf of each batch gets a job here, so this
            # looks like it relies on preprocessBatchSize being 1 in toil mode -- confirm.
            job_idx[("preprocess", leaves[i])] = parent_job.addChildJobFn(
                toil_call_preprocess, options, inSeqFile, outSeqFile, leaves[i],
                cores=options.preprocessCores,
                memory=options.preprocessMemory,
                disk=options.preprocessDisk)
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                              options.outHal, options.cactusOptions)
        return plan

    # schedule up the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies: a follow-on must wait for every event
    # that names it, so keep a set of events per follow-on instead of only the
    # last one seen
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps.setdefault(fo, set()).add(event)

    def get_deps(event):
        """Return the set of event names that must be resolved before event."""
        deps = set(schedule.deps(event))
        # add whole event names (calling set() on a name would split it into
        # single characters)
        deps.update(follow_on_deps.get(event, ()))
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            node_id = outSeqFile.tree.getNodeId(event)
        except Exception:
            # event is not a node of the output tree
            node_id = None
        if node_id is not None:
            for node in outSeqFile.tree.getChildren(node_id):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    events_and_virtuals = set(events)
    # add all events, potentially looping through virtual dependency chains
    # (hence the double loop)
    batch = set(events_and_virtuals)
    while batch:
        next_batch = set()
        for event in batch:
            for dep in get_deps(event):
                if dep not in events_and_virtuals:
                    next_batch.add(dep)
                    events_and_virtuals.add(dep)
        batch = next_batch

    # group jobs into rounds, where all jobs of round i can be run in parallel
    groups = []
    while events_and_virtuals:
        group = []
        to_remove = []
        for event in events_and_virtuals:
            if all(dep in resolved for dep in get_deps(event)):
                # virtual events are resolved but never scheduled
                if not schedule.isVirtual(event):
                    group.append(event)
                to_remove.append(event)
        if not to_remove:
            # nothing could be resolved this round: report and bail out
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(event, get_deps(event)))
            sys.exit(1)
        for tr in to_remove:
            resolved.add(tr)
            events_and_virtuals.remove(tr)
        groups.append(group)

    def halPath(event):
        """Return options.outHal for the root event, else <outDir>/<event>.hal."""
        if event == project.mcTree.getRootName():
            return options.outHal
        return os.path.join(options.outDir, event + '.hal')

    def cigarPath(event):
        """Return <outDir>/<event>.cigar."""
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        if options.toil:
            # advance toil phase
            # todo: recapitulate exact dependencies
            parent_job = parent_job.addFollowOn(Job())
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event), halPath(event),
                                       outSeqFile.pathMap[event])
            elif options.toil:
                # promises only get fulfilled if they are passed directly as arguments
                # to the toil job, so we pull out the ones we need here
                leaf_deps, anc_deps = get_dep_names(options, project, event)
                fa_promises = ([job_idx[("preprocess", dep)].rv() for dep in leaf_deps] +
                               [job_idx[("align", dep)].rv(0) for dep in anc_deps])
                # NOTE(review): the blast job requests options.preprocessDisk rather than
                # a blast-specific disk option -- confirm this is intended.
                job_idx[("blast", event)] = parent_job.addChildJobFn(
                    toil_call_blast, options, outSeqFile, project, event, cigarPath(event),
                    leaf_deps + anc_deps, *fa_promises,
                    cores=options.blastCores,
                    memory=options.blastMemory,
                    disk=options.preprocessDisk)
                job_idx[("align", event)] = job_idx[("blast", event)].addFollowOnJobFn(
                    toil_call_align, options, outSeqFile, project, event, cigarPath(event),
                    halPath(event), outSeqFile.pathMap[event], job_idx[("blast", event)].rv(),
                    leaf_deps + anc_deps, *fa_promises,
                    cores=options.alignCores,
                    memory=options.alignMemory,
                    disk=options.alignDisk)
            else:
                # todo: support cactus interface (it's easy enough here, but cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {} --database {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'align'), options.database)
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(halPath(event), event, options.halOptions,
                                                           outSeqFile.pathMap[event])

    # advance toil phase
    if options.toil:
        parent_job = parent_job.addFollowOn(Job())

    # stitch together the final tree
    plan += '\n## HAL merging\n'
    root = project.mcTree.getRootName()
    prev_event = None
    append_count = 0
    event_list = []
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                elif not options.toil:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
                event_list.append(event)
            prev_event = event

    if options.toil:
        job_idx['hal_append'] = parent_job.addChildJobFn(
            toil_call_hal_append_subtrees, options, project, root,
            job_idx[('align', root)].rv(1), event_list,
            *[job_idx[('align', e)].rv(1) for e in event_list],
            cores=1,
            memory=options.alignMemory,
            disk=options.halAppendDisk)

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    if options.toil:
        start_time = timeit.default_timer()
        toil.start(start_job)
        end_time = timeit.default_timer()
        run_time = end_time - start_time
        logger.info("cactus-prepare-toil has finished after {} seconds".format(run_time))

    return plan
def __init__(self, prepOptions, chunkIDList):
    """Create a preemptable RoundedJob with no explicit resource requests.

    ``prepOptions`` and ``chunkIDList`` are stored on the instance under
    the same names.
    """
    RoundedJob.__init__(self, preemptable=True)
    self.prepOptions, self.chunkIDList = prepOptions, chunkIDList
def __init__(self, repeatMaskOptions, alignmentsID, queryID):
    """Create a preemptable RoundedJob with no explicit resource requests.

    ``repeatMaskOptions``, ``alignmentsID`` and ``queryID`` are stored on
    the instance under the same names.
    """
    RoundedJob.__init__(self, preemptable=True)
    self.repeatMaskOptions = repeatMaskOptions
    self.alignmentsID, self.queryID = alignmentsID, queryID
def __init__(self, repeatMaskOptions, queryID, targetIDs):
    """Create a preemptable RoundedJob with no explicit resource requests.

    ``repeatMaskOptions``, ``queryID`` and ``targetIDs`` are stored on
    the instance under the same names.
    """
    RoundedJob.__init__(self, preemptable=True)
    self.repeatMaskOptions = repeatMaskOptions
    self.queryID, self.targetIDs = queryID, targetIDs
def __init__(self, prepOptions, chunkIDList):
    """Create a preemptable RoundedJob sized from the given chunks.

    The disk request is twice the summed ``size`` of the entries in
    ``chunkIDList``; cores and memory are read from ``prepOptions.cpu``
    and ``prepOptions.memory``. Both arguments are stored on the
    instance.
    """
    chunkSizes = (chunkID.size for chunkID in chunkIDList)
    diskRequest = 2 * sum(chunkSizes)

    RoundedJob.__init__(
        self,
        cores=prepOptions.cpu,
        memory=prepOptions.memory,
        disk=diskRequest,
        preemptable=True,
    )
    self.prepOptions = prepOptions
    self.chunkIDList = chunkIDList
def __init__(self, options, project, memory=None, cores=None):
    """Create a preemptable RoundedJob and record options and project.

    ``memory`` and ``cores`` are forwarded unchanged to
    ``RoundedJob.__init__``; ``options`` and ``project`` are stored on
    the instance.
    """
    RoundedJob.__init__(
        self,
        memory=memory,
        cores=cores,
        preemptable=True,
    )
    self.options, self.project = options, project
def __init__(self, fastaID, cutBefore, cutAfter):
    """Create a preemptable RoundedJob sized from the fasta input.

    The disk request is twice ``fastaID.size``. All three arguments are
    stored on the instance under the same names.
    """
    RoundedJob.__init__(self, disk=2 * fastaID.size, preemptable=True)
    self.fastaID = fastaID
    self.cutBefore, self.cutAfter = cutBefore, cutAfter
def __init__(self, inputSequenceIDs, configNode):
    """Create a preemptable RoundedJob sized from the input sequences.

    The disk request is the summed ``size`` of every entry in
    ``inputSequenceIDs`` (each entry must therefore have a ``size``
    attribute). Both arguments are stored on the instance.
    """
    diskRequest = sum(seqID.size for seqID in inputSequenceIDs)

    RoundedJob.__init__(self, disk=diskRequest, preemptable=True)
    self.inputSequenceIDs, self.configNode = inputSequenceIDs, configNode
def __init__(self, prepXmlElems, inSequenceID, iteration=0):
    """Create a preemptable RoundedJob and record its inputs.

    The base-class initializer now runs before any instance attribute is
    assigned, matching the other job initializers in this code and
    ensuring base-class setup cannot overwrite the values stored here.

    Args:
        prepXmlElems: stored as ``self.prepXmlElems``.
        inSequenceID: stored as ``self.inSequenceID``.
        iteration: stored as ``self.iteration``; defaults to 0.
    """
    RoundedJob.__init__(self, preemptable=True)
    self.prepXmlElems = prepXmlElems
    self.inSequenceID = inSequenceID
    self.iteration = iteration