Пример #1
0
 def testRunsOnRandom(self):
     """Smoke-test Schedule.compute() on every small tree from randomTreeSet().

     Trees whose ``size()`` is 120 or more are skipped.  The test makes no
     assertions of its own: it passes as long as building the DAG and
     computing the schedule do not raise.
     """
     for candidate in randomTreeSet():
         if not candidate.size() < 120:
             continue
         # Turn the tree into a DAG, then hand it to a fresh scheduler.
         graph = self.__addDagEdges(candidate)
         scheduler = Schedule()
         scheduler.inGraph = graph
         scheduler.compute()
Пример #2
0
 def testRunsOnRandom(self):
     """Smoke-test Schedule.compute() on every small tree from randomTreeSet().

     Trees whose ``size()`` is 120 or more are skipped.  The test makes no
     assertions of its own: it passes as long as building the DAG and
     computing the schedule do not raise.
     """
     for candidate in randomTreeSet():
         if not candidate.size() < 120:
             continue
         # Turn the tree into a DAG, then hand it to a fresh scheduler.
         graph = self.__addDagEdges(candidate)
         scheduler = Schedule()
         scheduler.inGraph = graph
         scheduler.compute()
Пример #3
0
    def run(self, fileStore):
        """Queue preprocessing, compute the schedule and chain the progressive-down step.

        Steps, in order:
          1. Parse the project config from the file store, keep it on ``self``
             and log it to the master.
          2. Add one ``logAssemblyStats`` child job per input sequence.
          3. Parse the config a second time and add a ``CactusPreprocessor``
             child job with that separate copy; its return values are stored
             on the project as the output sequence IDs.
          4. Compute the ``Schedule`` and record the root event and the set of
             leaf names on ``self.options``.

        :param fileStore: file store used to read the config and log to the master.
        :return: the return-value promise of the
                 ``RunCactusPreprocessorThenProgressiveDown2`` follow-on job.
        """
        project = self.project

        localConfigPath = fileStore.readGlobalFile(project.getConfigID())
        self.configNode = ET.parse(localConfigPath).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        configDump = ET.tostring(self.configNode)
        fileStore.logToMaster("Using the following configuration:\n%s" % configDump)

        # Report stats for the assemblies as they are before preprocessing.
        for genomeName, sequenceID in project.getInputSequenceIDMap().items():
            self.addChildJobFn(logAssemblyStats, "Before preprocessing", genomeName, sequenceID)

        # The preprocessor gets its own freshly parsed config tree.
        logger.info("Reading config file from: %s" % project.getConfigID())
        preprocessorConfig = ET.parse(fileStore.readGlobalFile(project.getConfigID())).getroot()
        # Constants have to be substituted in this copy as well.
        ConfigWrapper(preprocessorConfig).substituteAllPredefinedConstantsWithLiterals()

        # The child job's outputs are promises; they become the IDs of the
        # preprocessed sequences once the follow-on job runs.
        preprocessorJob = self.addChild(CactusPreprocessor(project.getInputSequenceIDs(), preprocessorConfig))
        outputPromises = []
        for index in range(len(project.getInputSequenceIDs())):
            outputPromises.append(preprocessorJob.rv(index))
        project.setOutputSequenceIDs(outputPromises)

        # Build the schedule that the progressive-down job will follow.
        schedule = Schedule()
        schedule.loadProject(project, fileStore=fileStore)
        schedule.compute()

        tree = project.mcTree
        self.options.event = tree.getRootName()
        leafNames = []
        for leaf in tree.getLeaves():
            leafNames.append(tree.getName(leaf))
        fileStore.logToMaster("Leaf names = %s" % leafNames)
        self.options.globalLeafEventSet = set(leafNames)

        followOn = RunCactusPreprocessorThenProgressiveDown2(
            options=self.options,
            project=project,
            event=self.options.event,
            schedule=schedule,
            memory=self.configWrapper.getDefaultMemory())
        return self.addFollowOn(followOn).rv()
Пример #4
0
 def run(self):
     """Load the project, queue preprocessing and set up the progressive-down follow-on.

     The project XML path is taken from ``self.args[0]``.  If
     ``self.options.event`` is unset (``None``) it defaults to the root name
     of the project tree.

     Raises:
         AssertionError: if the selected event is not in ``project.expMap``.
     """
     #Load the multi-cactus project
     project = MultiCactusProject()
     project.readXML(self.args[0])
     #Create jobs to create the output sequences
     configNode = ET.parse(project.getConfigPath()).getroot()
     ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals() #This is necessary..
     #Create the preprocessor
     self.addChildTarget(CactusPreprocessor(project.getInputSequencePaths(), 
                                            CactusPreprocessor.getOutputSequenceFiles(project.getInputSequencePaths(), project.getOutputSequenceDir()),
                                            configNode))
     #Now build the progressive-down target
     schedule = Schedule()
     schedule.loadProject(project)
     schedule.compute()
     # Identity comparison: None is a singleton, and "== None" can be
     # fooled by objects that override __eq__.
     if self.options.event is None:
         self.options.event = project.mcTree.getRootName()
     assert self.options.event in project.expMap, \
         "event %s not found in the project's experiment map" % self.options.event
     leafNames = [ project.mcTree.getName(i) for i in project.mcTree.getLeaves() ]
     self.options.globalLeafEventSet = set(leafNames)
     self.setFollowOnTarget(ProgressiveDown(self.options, project, self.options.event, schedule))
Пример #5
0
    def run(self, fileStore):
        """Queue preprocessing, compute the schedule and chain the progressive-down step.

        Steps, in order:
          1. Parse the project config from the file store and keep it on ``self``.
          2. Add one ``logAssemblyStats`` child job per input sequence.
          3. Parse the config a second time and add a ``CactusPreprocessor``
             child job with that separate copy; its return values are stored
             on the project as the output sequence IDs.
          4. Compute the ``Schedule`` and record the root event and the set of
             leaf names on ``self.options``.

        :param fileStore: file store used to read the config and log to the master.
        :return: the return-value promise of the
                 ``RunCactusPreprocessorThenProgressiveDown2`` follow-on job.
        """
        project = self.project

        localConfigPath = fileStore.readGlobalFile(project.getConfigID())
        self.configNode = ET.parse(localConfigPath).getroot()
        self.configWrapper = ConfigWrapper(self.configNode)
        self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

        # Report stats for the assemblies as they are before preprocessing.
        for genomeName, sequenceID in project.getInputSequenceIDMap().items():
            self.addChildJobFn(logAssemblyStats, "Before preprocessing", genomeName, sequenceID)

        # The preprocessor gets its own freshly parsed config tree.
        logger.info("Reading config file from: %s" % project.getConfigID())
        preprocessorConfig = ET.parse(fileStore.readGlobalFile(project.getConfigID())).getroot()
        # Constants have to be substituted in this copy as well.
        ConfigWrapper(preprocessorConfig).substituteAllPredefinedConstantsWithLiterals()

        # The child job's outputs are promises; they become the IDs of the
        # preprocessed sequences once the follow-on job runs.
        preprocessorJob = self.addChild(CactusPreprocessor(project.getInputSequenceIDs(), preprocessorConfig))
        outputPromises = []
        for index in range(len(project.getInputSequenceIDs())):
            outputPromises.append(preprocessorJob.rv(index))
        project.setOutputSequenceIDs(outputPromises)

        # Build the schedule that the progressive-down job will follow.
        schedule = Schedule()
        schedule.loadProject(project, fileStore=fileStore)
        schedule.compute()

        tree = project.mcTree
        self.options.event = tree.getRootName()
        leafNames = []
        for leaf in tree.getLeaves():
            leafNames.append(tree.getName(leaf))
        fileStore.logToMaster("Leaf names = %s" % leafNames)
        self.options.globalLeafEventSet = set(leafNames)

        followOn = RunCactusPreprocessorThenProgressiveDown2(
            options=self.options,
            project=project,
            event=self.options.event,
            schedule=schedule,
            memory=self.configWrapper.getDefaultMemory())
        return self.addFollowOn(followOn).rv()
Пример #6
0
def get_plan(options, project, inSeqFile, outSeqFile):
    """Build the text of a step-by-step cactus plan (shell commands or WDL).

    The plan has three sections: preprocessing of the leaf genomes, rounds of
    blast/align jobs for the remaining events (every job in a round only
    depends on results of earlier rounds), and merging of the per-event HAL
    files into the root HAL.

    Args:
        options: parsed options; reads ``wdl``, ``preprocessBatchSize``,
            ``preprocessOnly``, ``seqFile``, ``outSeqFile``, ``outHal``,
            ``outDir``, ``cactusOptions`` and ``halOptions``.
        project: project loaded into the ``Schedule``; its ``mcTree`` supplies
            the root event name.
        inSeqFile: input seqfile, forwarded to the WDL helpers.
        outSeqFile: output seqfile; its ``tree`` and ``pathMap`` define the
            leaves and the events to align.

    Returns:
        The plan as a string.

    Exits the process with status 1 (after writing a diagnostic to stderr) if
    no remaining event has all of its dependencies resolved.
    """
    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)

    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            assert len(pre_batch) == 1
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, leaves[i])
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        # if we're only making preprocess jobs, we can end early with a
        # single cactus command instead of the decomposed rounds
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                              options.outHal, options.cactusOptions)
        return plan

    # schedule up the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies: a follow-on depends on every event
    # that names it, so collect them in a set instead of keeping only the last
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps.setdefault(fo, set()).add(event)

    def get_deps(event):
        """Return the set of event names that must finish before event."""
        deps = set(schedule.deps(event))
        # add the preceding event names themselves; set(name) on a single
        # name would split the string into individual characters
        deps |= follow_on_deps.get(event, set())
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            has_name = outSeqFile.tree.getNodeId(event) is not None
        except Exception:
            # best effort: events the tree does not know about have no
            # children to add (SystemExit/KeyboardInterrupt still propagate)
            has_name = False
        if has_name:
            for node in outSeqFile.tree.getChildren(outSeqFile.tree.getNodeId(event)):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    events_and_virtuals = set()
    # there can be chains of virtual events not caught by a single round
    # of get_deps, so we iterate till we have them all
    to_add = set(events)
    while to_add:
        to_add1 = set()
        for event in to_add:
            events_and_virtuals.add(event)
            for dep in get_deps(event):
                if dep not in events_and_virtuals and dep not in to_add:
                    to_add1.add(dep)
        to_add = to_add1

    # group jobs into rounds.  where all jobs of round i can be run in parallel
    groups = []
    while events_and_virtuals:
        group = []
        to_remove = []
        for event in events_and_virtuals:
            if all(dep in resolved for dep in get_deps(event)):
                # virtual events are resolved but never emitted as jobs
                if not schedule.isVirtual(event):
                    group.append(event)
                to_remove.append(event)
        if not to_remove:
            # nothing became runnable this round: the remaining events can
            # never be resolved
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(event, ["{}(resolved={})".format(d, d in resolved) for d in get_deps(event)]))
            sys.exit(1)
        for tr in to_remove:
            resolved.add(tr)
            events_and_virtuals.remove(tr)
        groups.append(group)

    def halPath(event):
        """Return the HAL path for event (the root uses options.outHal)."""
        if event == project.mcTree.getRootName():
            return options.outHal
        else:
            return os.path.join(options.outDir, event + '.hal')
    def cigarPath(event):
        """Return the cigar file path for event inside options.outDir."""
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event), halPath(event), outSeqFile.pathMap[event])
            else:
                # todo: support cactus interface (it's easy enough here, but cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'align'))
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(halPath(event), event, options.halOptions, outSeqFile.pathMap[event])

    # stitch together the final tree, walking the rounds from last to first
    plan += '\n## HAL merging\n'
    root = project.mcTree.getRootName()
    prev_event = None
    append_count = 0
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                else:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
            prev_event = event

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    return plan
Пример #7
0
def get_plan(options, project, inSeqFile, outSeqFile, toil):
    """Build a step-by-step cactus plan and, in toil mode, run it.

    The plan has three sections: preprocessing of the leaf genomes, rounds of
    blast/align jobs for the remaining events (every job in a round only
    depends on results of earlier rounds), and merging of the per-event HAL
    files into the root HAL.  Depending on ``options`` each step is emitted as
    a shell command, emitted as a WDL call, or added as a toil job.

    Args:
        options: parsed options; selects the mode through ``wdl`` / ``toil``
            and supplies paths, batch size and per-step resource settings.
        project: project loaded into the ``Schedule``; its ``mcTree`` supplies
            the root event name.
        inSeqFile: input seqfile, forwarded to the WDL/toil helpers.
        outSeqFile: output seqfile; its ``tree`` and ``pathMap`` define the
            leaves and the events to align.
        toil: toil workflow handle; ``toil.start`` is called with the root job
            when ``options.toil`` is set.

    Returns:
        The plan as a string.

    Exits the process with status 1 (after writing a diagnostic to stderr) if
    no remaining event has all of its dependencies resolved.
    """
    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)
        options.pp_map = {}

    if options.toil:
        # kick things off with an empty job which we will hook subsequent jobs onto
        # (using RoundedJob because root job must be subclass of Job,
        #  https://github.com/ComparativeGenomicsToolkit/cactus/pull/284#issuecomment-684125478)
        start_job = RoundedJob()
        parent_job = start_job
        job_idx = {}

    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, pre_batch)
        elif options.toil:
            # NOTE(review): only the first leaf of each batch gets a job and a
            # job_idx entry, so this presumably relies on preprocessBatchSize
            # being 1 in toil mode -- confirm against the caller
            job_idx[("preprocess", leaves[i])] = parent_job.addChildJobFn(toil_call_preprocess, options, inSeqFile, outSeqFile, leaves[i],
                                                                          cores=options.preprocessCores,
                                                                          memory=options.preprocessMemory,
                                                                          disk=options.preprocessDisk)
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        # NOTE(review): in toil mode this returns before toil.start() is
        # reached, so the jobs queued above are not started here -- confirm
        # that preprocessOnly is never combined with toil
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                              options.outHal, options.cactusOptions)
        return plan

    # schedule up the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies: a follow-on depends on every event
    # that names it, so collect them in a set instead of keeping only the last
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps.setdefault(fo, set()).add(event)

    def get_deps(event):
        """Return the set of event names that must finish before event."""
        deps = set(schedule.deps(event))
        # add the preceding event names themselves; set(name) on a single
        # name would split the string into individual characters
        deps |= follow_on_deps.get(event, set())
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            has_name = outSeqFile.tree.getNodeId(event) is not None
        except Exception:
            # best effort: events the tree does not know about have no
            # children to add (SystemExit/KeyboardInterrupt still propagate)
            has_name = False
        if has_name:
            for node in outSeqFile.tree.getChildren(outSeqFile.tree.getNodeId(event)):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    events_and_virtuals = set(events)
    # add all events, potentially looping through virtual dependency chains
    # (hence the double loop)
    batch = set(events_and_virtuals)
    while batch:
        next_batch = set()
        for event in batch:
            for dep in get_deps(event):
                if dep not in events_and_virtuals:
                    next_batch.add(dep)
                    events_and_virtuals.add(dep)
        batch = next_batch

    # group jobs into rounds.  where all jobs of round i can be run in parallel
    groups = []
    while events_and_virtuals:
        group = []
        to_remove = []
        for event in events_and_virtuals:
            if all(dep in resolved for dep in get_deps(event)):
                # virtual events are resolved but never emitted as jobs
                if not schedule.isVirtual(event):
                    group.append(event)
                to_remove.append(event)
        if not to_remove:
            # nothing became runnable this round: the remaining events can
            # never be resolved
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(event, get_deps(event)))
            sys.exit(1)
        for tr in to_remove:
            resolved.add(tr)
            events_and_virtuals.remove(tr)
        groups.append(group)

    def halPath(event):
        """Return the HAL path for event (the root uses options.outHal)."""
        if event == project.mcTree.getRootName():
            return options.outHal
        else:
            return os.path.join(options.outDir, event + '.hal')
    def cigarPath(event):
        """Return the cigar file path for event inside options.outDir."""
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        if options.toil:
            # advance toil phase
            # todo: recapitulate exact dependencies
            parent_job = parent_job.addFollowOn(Job())
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event), halPath(event), outSeqFile.pathMap[event])
            elif options.toil:
                # promises only get fulfilled if they are passed directly as arguments to the toil job, so we pull out the ones we need here
                leaf_deps, anc_deps = get_dep_names(options, project, event)
                fa_promises = [job_idx[("preprocess", dep)].rv() for dep in leaf_deps] + [job_idx[("align", dep)].rv(0) for dep in anc_deps]
                # NOTE(review): the blast job is given preprocessDisk; this
                # looks like it may be a copy-paste of the preprocess settings
                # -- confirm whether a blast-specific disk option is intended
                job_idx[("blast", event)] = parent_job.addChildJobFn(toil_call_blast,
                                                                     options,
                                                                     outSeqFile,
                                                                     project,
                                                                     event,
                                                                     cigarPath(event),
                                                                     leaf_deps + anc_deps,
                                                                     *fa_promises,
                                                                     cores=options.blastCores,
                                                                     memory=options.blastMemory,
                                                                     disk=options.preprocessDisk)
                job_idx[("align", event)] = job_idx[("blast", event)].addFollowOnJobFn(toil_call_align,
                                                                                       options, outSeqFile,
                                                                                       project,
                                                                                       event,
                                                                                       cigarPath(event),
                                                                                       halPath(event),
                                                                                       outSeqFile.pathMap[event],
                                                                                       job_idx[("blast", event)].rv(),
                                                                                       leaf_deps + anc_deps, *fa_promises,
                                                                                       cores=options.alignCores,
                                                                                       memory=options.alignMemory,
                                                                                       disk=options.alignDisk)
            else:
                # todo: support cactus interface (it's easy enough here, but cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {} --database {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'align'), options.database)
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(halPath(event), event, options.halOptions, outSeqFile.pathMap[event])

    # advance toil phase
    if options.toil:
        parent_job = parent_job.addFollowOn(Job())

    # stitch together the final tree, walking the rounds from last to first
    plan += '\n## HAL merging\n'
    root = project.mcTree.getRootName()
    prev_event = None
    append_count = 0
    event_list = []
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                elif not options.toil:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
                event_list.append(event)
            prev_event = event

    if options.toil:
        # a single job appends every non-root subtree; the align promises are
        # passed as direct arguments so that toil fulfils them
        job_idx['hal_append'] = parent_job.addChildJobFn(toil_call_hal_append_subtrees,
                                                         options,
                                                         project,
                                                         root,
                                                         job_idx[('align', root)].rv(1),
                                                         event_list,
                                                         *[job_idx[('align', e)].rv(1) for e in event_list],
                                                         cores=1,
                                                         memory=options.alignMemory,
                                                         disk=options.halAppendDisk)

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    if options.toil:
        start_time = timeit.default_timer()
        toil.start(start_job)
        end_time = timeit.default_timer()
        run_time = end_time - start_time
        logger.info("cactus-prepare-toil has finished after {} seconds".format(run_time))

    return plan