def testRunsOnRandom(self):
    # Smoke test: build a DAG from every random tree with fewer than 120
    # nodes and check that a Schedule can be computed on it without raising.
    for tree in randomTreeSet():
        if tree.size() >= 120:
            # Larger trees are skipped.
            continue
        graph = self.__addDagEdges(tree)
        schedule = Schedule()
        schedule.inGraph = graph
        schedule.compute()
def run(self, fileStore):
    """Queue the preprocessor as a child job and the progressive-down phase as a follow-on.

    The project config is read from the file store, assembly statistics are
    logged for every input sequence, the preprocessor is added as a child
    job, and a schedule is computed for the project.

    Returns the ``rv()`` promise of the follow-on job.
    """
    project = self.project

    # Job-level copy of the config, with predefined constants expanded.
    localConfigPath = fileStore.readGlobalFile(project.getConfigID())
    self.configNode = ET.parse(localConfigPath).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    # Log the stats for the un-preprocessed assemblies
    for genomeName, sequenceID in project.getInputSequenceIDMap().items():
        self.addChildJobFn(logAssemblyStats, "Before preprocessing", genomeName, sequenceID)

    # Create jobs to create the output sequences.  A second, independent
    # parse of the config is handed to the preprocessor.
    logger.info("Reading config file from: %s" % project.getConfigID())
    preprocessorConfig = ET.parse(fileStore.readGlobalFile(project.getConfigID())).getroot()
    ConfigWrapper(preprocessorConfig).substituteAllPredefinedConstantsWithLiterals()  # This is necessary..

    # Add the preprocessor child job. Its output is a job promise value that
    # will be converted into a list of the IDs of the preprocessed sequences
    # in the follow-on job.
    preprocessor = CactusPreprocessor(project.getInputSequenceIDs(), preprocessorConfig)
    preprocessorJob = self.addChild(preprocessor)
    sequenceCount = len(project.getInputSequenceIDs())
    project.setOutputSequenceIDs([preprocessorJob.rv(index) for index in range(sequenceCount)])

    # Now build the progressive-down job
    schedule = Schedule()
    schedule.loadProject(project, fileStore=fileStore)
    schedule.compute()

    tree = project.mcTree
    self.options.event = tree.getRootName()
    leafNames = [tree.getName(leaf) for leaf in tree.getLeaves()]
    fileStore.logToMaster("Leaf names = %s" % leafNames)
    self.options.globalLeafEventSet = set(leafNames)

    followOn = RunCactusPreprocessorThenProgressiveDown2(
        options=self.options,
        project=project,
        event=self.options.event,
        schedule=schedule,
        memory=self.configWrapper.getDefaultMemory())
    return self.addFollowOn(followOn).rv()
def run(self, fileStore):
    """Queue the preprocessor as a child job and the progressive-down phase as a follow-on.

    The project config is read from the file store and reported to the
    master log, assembly statistics are logged for every input sequence, the
    preprocessor is added as a child job, and a schedule is computed for the
    project.

    Returns the ``rv()`` promise of the follow-on job.
    """
    project = self.project

    # Job-level copy of the config, with predefined constants expanded.
    localConfigPath = fileStore.readGlobalFile(project.getConfigID())
    self.configNode = ET.parse(localConfigPath).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()
    configDump = ET.tostring(self.configNode)
    fileStore.logToMaster("Using the following configuration:\n%s" % configDump)

    # Log the stats for the un-preprocessed assemblies
    for genomeName, sequenceID in project.getInputSequenceIDMap().items():
        self.addChildJobFn(logAssemblyStats, "Before preprocessing", genomeName, sequenceID)

    # Create jobs to create the output sequences.  A second, independent
    # parse of the config is handed to the preprocessor.
    logger.info("Reading config file from: %s" % project.getConfigID())
    preprocessorConfig = ET.parse(fileStore.readGlobalFile(project.getConfigID())).getroot()
    ConfigWrapper(preprocessorConfig).substituteAllPredefinedConstantsWithLiterals()  # This is necessary..

    # Add the preprocessor child job. Its output is a job promise value that
    # will be converted into a list of the IDs of the preprocessed sequences
    # in the follow-on job.
    preprocessor = CactusPreprocessor(project.getInputSequenceIDs(), preprocessorConfig)
    preprocessorJob = self.addChild(preprocessor)
    sequenceCount = len(project.getInputSequenceIDs())
    project.setOutputSequenceIDs([preprocessorJob.rv(index) for index in range(sequenceCount)])

    # Now build the progressive-down job
    schedule = Schedule()
    schedule.loadProject(project, fileStore=fileStore)
    schedule.compute()

    tree = project.mcTree
    self.options.event = tree.getRootName()
    leafNames = [tree.getName(leaf) for leaf in tree.getLeaves()]
    fileStore.logToMaster("Leaf names = %s" % leafNames)
    self.options.globalLeafEventSet = set(leafNames)

    followOn = RunCactusPreprocessorThenProgressiveDown2(
        options=self.options,
        project=project,
        event=self.options.event,
        schedule=schedule,
        memory=self.configWrapper.getDefaultMemory())
    return self.addFollowOn(followOn).rv()
def run(self):
    """Load the project, queue the preprocessor, then hand off to ProgressiveDown.

    The project XML path is taken from ``self.args[0]``.  When
    ``self.options.event`` is unset (``None``) it defaults to the root name
    of the project tree.  ``self.options.globalLeafEventSet`` is set to the
    names of all leaves of the project tree.

    Raises:
        AssertionError: if the selected event is not in ``project.expMap``.
    """
    # Load the multi-cactus project
    project = MultiCactusProject()
    project.readXML(self.args[0])

    # Create jobs to create the output sequences
    configNode = ET.parse(project.getConfigPath()).getroot()
    ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()  # This is necessary..

    # Create the preprocessor
    inputPaths = project.getInputSequencePaths()
    outputPaths = CactusPreprocessor.getOutputSequenceFiles(
        project.getInputSequencePaths(), project.getOutputSequenceDir())
    self.addChildTarget(CactusPreprocessor(inputPaths, outputPaths, configNode))

    # Now build the progressive-down target
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # Identity test: None is a singleton, and "== None" can be hijacked by a
    # custom __eq__.
    if self.options.event is None:
        self.options.event = project.mcTree.getRootName()
    assert self.options.event in project.expMap, \
        "event %r not found in project.expMap" % (self.options.event,)

    leafNames = [project.mcTree.getName(i) for i in project.mcTree.getLeaves()]
    self.options.globalLeafEventSet = set(leafNames)
    self.setFollowOnTarget(ProgressiveDown(self.options, project, self.options.event, schedule))
def get_plan(options, project, inSeqFile, outSeqFile):
    """Build the textual execution plan (shell commands or WDL) for an alignment.

    The plan has up to three sections: preprocessing of the leaf genomes,
    alignment of the internal events grouped into rounds whose members can
    run in parallel, and merging of the per-event HAL files into the root.

    Args:
        options: parsed options; ``options.wdl`` selects WDL output and
            ``options.preprocessOnly`` stops after the preprocessing section.
        project: project loaded into the ``Schedule``; also supplies the
            root event name via ``project.mcTree``.
        inSeqFile: input seqfile, forwarded to the WDL helpers.
        outSeqFile: output seqfile; its ``tree`` and ``pathMap`` drive the plan.

    Returns:
        The plan as a single string.

    Exits the process with status 1 (after writing a diagnostic to stderr) if
    no event can be scheduled in some round, i.e. the dependencies deadlock.
    """
    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)

    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            assert len(pre_batch) == 1
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, leaves[i])
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        # if we're only making preprocess jobs, we can end early
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                             options.outHal, options.cactusOptions)
        return plan

    # schedule up the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies: a follow-on must wait for every
    # event that names it, so keep a set of predecessors per follow-on
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps.setdefault(fo, set()).add(event)

    def get_deps(event):
        """Return the set of event names that must finish before `event`."""
        deps = set(schedule.deps(event))
        # add whole event names (wrapping a name in set() would split it
        # into single characters)
        deps |= follow_on_deps.get(event, set())
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            node_id = outSeqFile.tree.getNodeId(event)
        except Exception:
            # event has no node in the tree
            node_id = None
        if node_id is not None:
            for node in outSeqFile.tree.getChildren(node_id):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    events_and_virtuals = set()
    # there can be chains of virtual events not caught by a single round
    # of get_deps, so we iterate till we have them all
    to_add = set(events)
    while to_add:
        to_add1 = set()
        for event in to_add:
            events_and_virtuals.add(event)
            for dep in get_deps(event):
                if dep not in events_and_virtuals and dep not in to_add:
                    to_add1.add(dep)
        to_add = to_add1

    # group jobs into rounds, where all jobs of round i can be run in parallel
    groups = []
    while events_and_virtuals:
        group = []
        to_remove = []
        added = 0
        for event in events_and_virtuals:
            if all(dep in resolved for dep in get_deps(event)):
                # virtual events are resolved but never emitted as jobs
                if not schedule.isVirtual(event):
                    group.append(event)
                to_remove.append(event)
                added += 1
        if added == 0:
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(
                    event,
                    ["{}(resolved={})".format(d, d in resolved) for d in get_deps(event)]))
            sys.exit(1)
        # only mark as resolved after the scan, so every member of a round
        # depends solely on earlier rounds
        for tr in to_remove:
            resolved.add(tr)
            events_and_virtuals.remove(tr)
        groups.append(group)

    def halPath(event):
        """HAL output path for `event`; the root event maps to options.outHal."""
        if event == project.mcTree.getRootName():
            return options.outHal
        else:
            return os.path.join(options.outDir, event + '.hal')

    def cigarPath(event):
        """Cigar output path for `event`."""
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event),
                                       halPath(event), outSeqFile.pathMap[event])
            else:
                # todo: support cactus interface (it's easy enough here, but
                # cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'align'))
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(
                    halPath(event), event, options.halOptions, outSeqFile.pathMap[event])

    # stitch together the final tree
    plan += '\n## HAL merging\n'
    root = project.mcTree.getRootName()
    prev_event = None
    append_count = 0
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                else:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
                prev_event = event

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    return plan
def get_plan(options, project, inSeqFile, outSeqFile, toil):
    """Build the execution plan for an alignment, optionally running it with Toil.

    The plan has up to three sections: preprocessing of the leaf genomes,
    alignment of the internal events grouped into rounds whose members can
    run in parallel, and merging of the per-event HAL files into the root.
    With ``options.wdl`` the plan text is WDL; with ``options.toil`` the
    steps are attached to a Toil job graph and started via ``toil.start``
    instead of being written as commands; otherwise shell commands are
    emitted.

    Args:
        options: parsed options (``wdl``, ``toil``, ``preprocessOnly``,
            resource settings, output paths, ...).  ``options.pp_map`` is
            reset to an empty dict.
        project: project loaded into the ``Schedule``; also supplies the
            root event name via ``project.mcTree``.
        inSeqFile: input seqfile, forwarded to the WDL/Toil helpers.
        outSeqFile: output seqfile; its ``tree`` and ``pathMap`` drive the plan.
        toil: object whose ``start`` method is called with the root job when
            ``options.toil`` is set.

    Returns:
        The plan as a single string.

    Exits the process with status 1 (after writing a diagnostic to stderr) if
    no event can be scheduled in some round, i.e. the dependencies deadlock.
    """
    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)
    options.pp_map = {}

    if options.toil:
        # kick things off with an empty job which we will hook subsequent jobs onto
        # (using RoundedJob because root job must be subclass of Job,
        # https://github.com/ComparativeGenomicsToolkit/cactus/pull/284#issuecomment-684125478)
        start_job = RoundedJob()
        parent_job = start_job
        job_idx = {}

    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, pre_batch)
        elif options.toil:
            # NOTE(review): only the first leaf of each batch gets a job here;
            # presumably the batch size is 1 in toil mode -- confirm.
            job_idx[("preprocess", leaves[i])] = parent_job.addChildJobFn(
                toil_call_preprocess, options, inSeqFile, outSeqFile, leaves[i],
                cores=options.preprocessCores,
                memory=options.preprocessMemory,
                disk=options.preprocessDisk)
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                             options.outHal, options.cactusOptions)
        return plan

    # schedule up the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies: a follow-on must wait for every
    # event that names it, so keep a set of predecessors per follow-on
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps.setdefault(fo, set()).add(event)

    def get_deps(event):
        """Return the set of event names that must finish before `event`."""
        deps = set(schedule.deps(event))
        # add whole event names (wrapping a name in set() would split it
        # into single characters)
        deps |= follow_on_deps.get(event, set())
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            node_id = outSeqFile.tree.getNodeId(event)
        except Exception:
            # event has no node in the tree
            node_id = None
        if node_id is not None:
            for node in outSeqFile.tree.getChildren(node_id):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    events_and_virtuals = set(events)
    # add all events, potentially looping through virtual dependency chains
    # (hence the double loop)
    batch = set(events_and_virtuals)
    while batch:
        next_batch = set()
        for event in batch:
            for dep in get_deps(event):
                if dep not in events_and_virtuals:
                    next_batch.add(dep)
                    events_and_virtuals.add(dep)
        batch = next_batch

    # group jobs into rounds, where all jobs of round i can be run in parallel
    groups = []
    while events_and_virtuals:
        group = []
        to_remove = []
        added = 0
        for event in events_and_virtuals:
            if all(dep in resolved for dep in get_deps(event)):
                # virtual events are resolved but never emitted as jobs
                if not schedule.isVirtual(event):
                    group.append(event)
                to_remove.append(event)
                added += 1
        if added == 0:
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(event, get_deps(event)))
            sys.exit(1)
        # only mark as resolved after the scan, so every member of a round
        # depends solely on earlier rounds
        for tr in to_remove:
            resolved.add(tr)
            events_and_virtuals.remove(tr)
        groups.append(group)

    def halPath(event):
        """HAL output path for `event`; the root event maps to options.outHal."""
        if event == project.mcTree.getRootName():
            return options.outHal
        else:
            return os.path.join(options.outDir, event + '.hal')

    def cigarPath(event):
        """Cigar output path for `event`."""
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        if options.toil:
            # advance toil phase
            # todo: recapitulate exact dependencies
            parent_job = parent_job.addFollowOn(Job())
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event),
                                       halPath(event), outSeqFile.pathMap[event])
            elif options.toil:
                # promises only get fulfilled if they are passed directly as
                # arguments to the toil job, so we pull out the ones we need here
                leaf_deps, anc_deps = get_dep_names(options, project, event)
                fa_promises = ([job_idx[("preprocess", dep)].rv() for dep in leaf_deps]
                               + [job_idx[("align", dep)].rv(0) for dep in anc_deps])
                # NOTE(review): the blast job is given options.preprocessDisk,
                # not a blast-specific disk setting -- confirm this is intended.
                job_idx[("blast", event)] = parent_job.addChildJobFn(
                    toil_call_blast, options, outSeqFile, project, event, cigarPath(event),
                    leaf_deps + anc_deps, *fa_promises,
                    cores=options.blastCores,
                    memory=options.blastMemory,
                    disk=options.preprocessDisk)
                job_idx[("align", event)] = job_idx[("blast", event)].addFollowOnJobFn(
                    toil_call_align, options, outSeqFile, project, event, cigarPath(event),
                    halPath(event), outSeqFile.pathMap[event],
                    job_idx[("blast", event)].rv(),
                    leaf_deps + anc_deps, *fa_promises,
                    cores=options.alignCores,
                    memory=options.alignMemory,
                    disk=options.alignDisk)
            else:
                # todo: support cactus interface (it's easy enough here, but
                # cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {} --database {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'align'), options.database)
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(
                    halPath(event), event, options.halOptions, outSeqFile.pathMap[event])

    # advance toil phase
    if options.toil:
        parent_job = parent_job.addFollowOn(Job())

    # stitch together the final tree
    plan += '\n## HAL merging\n'
    root = project.mcTree.getRootName()
    prev_event = None
    append_count = 0
    event_list = []
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                elif not options.toil:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
                event_list.append(event)
                prev_event = event

    if options.toil:
        # a single job appends every non-root subtree; the HAL promises are
        # passed as direct arguments so that Toil resolves them
        job_idx['hal_append'] = parent_job.addChildJobFn(
            toil_call_hal_append_subtrees, options, project, root,
            job_idx[('align', root)].rv(1),
            event_list,
            *[job_idx[('align', e)].rv(1) for e in event_list],
            cores=1,
            memory=options.alignMemory,
            disk=options.halAppendDisk)

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    if options.toil:
        start_time = timeit.default_timer()
        toil.start(start_job)
        end_time = timeit.default_timer()
        run_time = end_time - start_time
        logger.info("cactus-prepare-toil has finished after {} seconds".format(run_time))

    return plan