Пример #1
0
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences."""

    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    # prepare output sequence directory
    # todo: support remote (ie s3) output directory
    try:
        os.makedirs(options.outSeqDir)
    except:
        pass
    if not os.path.isdir(options.outSeqDir):
        raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outSeqDir))
    if not os.access(options.outSeqDir, os.W_OK):
        logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outSeqDir))

    # hack the configfile to skip preprocessing and write it to the output dir
    if options.preprocessOnly:
        config.removePreprocessors()
        options.configFile = os.path.join(options.outSeqDir, 'config.xml')
        config.writeXML(options.configFile)
        
    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml"):
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix = config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree= tree
    outSeqFile.pathMap = seqFile.pathMap
    outSeqFile.outgroups = seqFile.outgroups

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        if leaf or (not leaf and name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outSeqDir, os.path.basename(out_basename))

    # write the output
    with open(options.outSeqFile, 'w') as out_sf:
        out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, outSeqFile))
Пример #2
0
def runCactusAfterBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
            def get_input_path(suffix=''):
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(
                            os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(
                        makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    assert not options.pangenome
                except:
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found
                                        and genome in outgroups):
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)

                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [
                    experiment.getSequenceID(outgroup)
                    for outgroup in outgroups
                ]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                #configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(
                makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(
                        makeURL(get_input_path('.secondary')))
                except:
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(
                            makeURL(get_input_path(
                                '.ig_coverage_{}'.format(i)))))

            halID = toil.start(
                Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
Пример #3
0
def main():
    options = get_options()

    with Toil(options) as workflow:
        setupBinaries(options)

        importSingularityImage(options)

        ## Preprocessing:
        if (options.pathOverrides or options.pathOverrideNames):
            if not options.pathOverrides or not options.pathOverrideNames or \
            len(options.pathOverrideNames) != len(options.pathOverrides):
                raise RuntimeError(
                    'same number of values must be passed to --pathOverrides and --pathOverrideNames'
                )

        # apply path overrides.  this was necessary for wdl which doesn't take kindly to
        # text files of local paths (ie seqfile).  one way to fix would be to add support
        # for s3 paths and force wdl to use it.  a better way would be a more fundamental
        # interface shift away from files of paths throughout all of cactus
        if options.pathOverrides:
            seqFile = SeqFile(options.seqFile)
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            tree = MultiCactusTree(seqFile.tree)
            tree.nameUnlabeledInternalNodes(
                prefix=config.getDefaultInternalNodePrefix())
            for name, override in zip(options.pathOverrideNames,
                                      options.pathOverrides):
                seqFile.pathMap[name] = override
            override_seq = os.path.join(options.cactusDir, 'seqFile.override')
            with open(override_seq, 'w') as out_sf:
                out_sf.write(str(seqFile))
            options.seqFile = override_seq

        # Import asms; by default, prepends unique IDs in the technique used in cactus-blast.
        asms = get_asms_from_seqfile(options.seqFile, workflow)

        ## Perform alignments:
        if not workflow.options.restart:
            alignments = workflow.start(
                Job.wrapJobFn(run_cactus_reference_align, asms, options.refID,
                              options.debug_export, options.dipcall_bed_filter,
                              options.dipcall_vcf_filter))

        else:
            alignments = workflow.restart()

        if options.debug_export:
            # first, ensure the debug dir exists.
            if not os.path.isdir(options.debug_export_dir):
                os.mkdir(options.debug_export_dir)

            print(alignments)
            # Then return value is: (all_primary, all_secondary, ref_mappings, primary_mappings, secondary_mappings)
            for asm, mapping_file in alignments[2].items():
                workflow.exportFile(
                    mapping_file, 'file://' +
                    os.path.abspath("mappings_for_" + asm + ".paf"))
            for asm, mapping_file in alignments[3].items():
                workflow.exportFile(
                    mapping_file, 'file://' +
                    os.path.abspath("mappings_for_" + asm + ".cigar"))
            for asm, mapping_file in alignments[4].items():
                workflow.exportFile(
                    mapping_file, 'file://' +
                    os.path.abspath("mappings_for_" + asm + ".cigar.secondry"))

        ## Save alignments:
        if options.dipcall_vcf_filter:  # this is substantially less restrictive than the dipcall_bed_filter.
            dipcall_filtered = workflow.start(
                Job.wrapJobFn(apply_dipcall_vcf_filter, alignments[0]))
            workflow.exportFile(dipcall_filtered, makeURL(options.outputFile))
            workflow.exportFile(
                alignments[1],
                makeURL(options.outputFile + ".unfiltered.secondary"))
        else:
            workflow.exportFile(alignments[0], makeURL(options.outputFile))
            workflow.exportFile(alignments[1],
                                makeURL(options.outputFile + ".secondary"))
Пример #4
0
def runCactusGraphMap(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # get the minigraph "virutal" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(
                configNode, "refgraph"),
                                            "assemblyName",
                                            default="__MINIGRAPH_SEQUENCES__")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            logger.info("Genomes for graphmap, {}".format(seqFile.pathMap))

            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError(
                    "{} assembly not found in seqfile so it must be specified with --outputFasta"
                    .format(graph_event))

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    seqIDMap[genome] = toil.importFile(seq)

            # run the workflow
            paf_id, gfa_fa_id = toil.start(
                Job.wrapJobFn(minigraph_workflow, options, config, seqIDMap,
                              gfa_id, graph_event))

        #export the paf
        toil.exportFile(paf_id, makeURL(options.outputPAF))
        if gfa_fa_id:
            toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

        # update the input seqfile (in place!)
        add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta),
                              graph_event)
Пример #5
0
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences."""

    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare output sequence directory
        # todo: support remote (ie s3) output directory
        try:
            os.makedirs(options.outDir)
        except:
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the configfile to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the configfile to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)
        
    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml") and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix = config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree= tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        if leaf or (not leaf and name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))
Пример #6
0
def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames,
                                  options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(
            prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError(
                "Genome specified with --acyclic, {}, not found in tree leaves"
                .format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options,
                                 proj_options.configFile,
                                 ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(
                    os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(
                makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found
                                and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles(
                    [os.path.join(seq, subSeq) for subSeq in os.listdir(seq)],
                    tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [
            experiment.getSequenceID(outgroup) for outgroup in outgroups
        ]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(
                options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(
            configWrapper.xmlRoot,
            "bar").attrib["partialOrderAlignmentMaskFilter"] = str(
                options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(
            configWrapper.xmlRoot,
            "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot,
                         "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot,
                         "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file,
                         pg_file,
                         region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(
                pg_file))

    workFlowArgs = CactusWorkflowArguments(options,
                                           experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(
                makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(
                    makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
Пример #7
0
def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(
                    prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames,
                                          options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir,
                                            'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options,
                                         proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(
                options.cactusDir, ProjectWrapper.alignmentDirName,
                '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(
                options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(
                options.root, list(genome_set)))

            print(str(project.inputSequenceMap))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([
                            os.path.join(seq, subSeq)
                            for subSeq in os.listdir(seq)
                        ], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            workFlowArgs = CactusWorkflowArguments(
                options,
                experimentFile=experimentFile,
                configNode=configNode,
                seqIDMap=project.inputSequenceIDMap)

            outWorkFlowArgs = toil.start(
                CactusTrimmingBlastPhase(standAlone=True,
                                         cactusWorkflowArguments=workFlowArgs,
                                         phaseName="trimBlast"))

        # export the alignments
        toil.exportFile(outWorkFlowArgs.alignmentsID,
                        makeURL(options.outputFile))
        # optional secondary alignments
        if outWorkFlowArgs.secondaryAlignmentsID:
            toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                            makeURL(options.outputFile) + '.secondary')
        # outgroup fragments and coverage are necessary for cactus-align, as the sequence names got changed in the above alignemnts
        for i, outgroupFragmentID in enumerate(
                outWorkFlowArgs.outgroupFragmentIDs):
            toil.exportFile(
                outgroupFragmentID,
                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
        # cactus-align can recompute coverage on the fly, but we save them because we have them
        for i, ingroupCoverageID in enumerate(
                outWorkFlowArgs.ingroupCoverageIDs):
            toil.exportFile(
                ingroupCoverageID,
                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))