Example #1
    def getConfigPath(self):
        config = self.xmlRoot.attrib["config"]
        if config == 'default':
            config = os.path.join(cactusRootPath(), "cactus_config.xml")
        if config == 'defaultProgressive':
            config = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
        return config
    def processConfig(self):
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            rootDir = cactusRootPath()
            configPath = os.path.join(rootDir,
                                      "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it.  Probably
        # need to do something for other "combined" batch systems?
        if self.options.batchSystem == 'singleMachine' and \
               self.options.database == 'kyoto_tycoon':
            if int(self.options.maxThreads) < \
                   self.configWrapper.getMaxParallelSubtrees() * 3:
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, int(self.options.maxThreads) // 3))

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface. 
        if self.options.legacy:
            self.configWrapper.setSubtreeSize(sys.maxsize)  # sys.maxint in Python 2
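
The threads-to-subtrees ratio above is the interesting part: with an in-process kyoto_tycoon database, each concurrent subtree appears to need about three threads. A minimal, self-contained sketch of that rule (the helper itself is hypothetical, not part of cactus):

def throttled_subtrees(max_threads, configured_subtrees):
    # cap parallel subtrees at roughly one per three available threads
    if max_threads < configured_subtrees * 3:
        return max(1, max_threads // 3)
    return configured_subtrees

assert throttled_subtrees(4, 3) == 1    # too few threads: fall back to one subtree
assert throttled_subtrees(12, 3) == 3   # enough threads: keep the configured value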
Example #3
    def run(self, fileStore):
        """
        mask alpha satellites with dna-brnn
        """
        fastaFile = fileStore.readGlobalFile(self.fastaID)

        cmd = ['dna-brnn', fastaFile] + self.dnabrnnOpts.split()
        if '-i' not in self.dnabrnnOpts.split():
            # fall back to the bundled model
            # todo: is there a more robust way?
            cmd += ['-i', os.path.join(cactusRootPath(), 'attcc-alpha.knm')]

        if self.cores:
            cmd += ['-t', str(self.cores)]

        bedFile = fileStore.getLocalTempFile()

        # run dna-brnn to make a bed file
        cactus_call(outfile=bedFile, parameters=cmd)

        maskedFile = fileStore.getLocalTempFile()

        mask_cmd = [
            'cactus_fasta_softmask_intervals.py', '--origin=zero',
            '--minLength={}'.format(self.minLength), bedFile
        ]

        # do the softmasking
        cactus_call(infile=fastaFile, outfile=maskedFile, parameters=mask_cmd)

        return fileStore.writeGlobalFile(maskedFile)
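
A job like this is normally wired into a Toil workflow. Below is a minimal sketch, assuming the run() method above belongs to a Job subclass called DnaBrnnMaskJob here (the class name, constructor arguments, and file paths are hypothetical; only the standard Toil API is used):

import os
from toil.common import Toil
from toil.job import Job

class DnaBrnnMaskJob(Job):
    # hypothetical wrapper owning the run() method shown above
    def __init__(self, fastaID, dnabrnnOpts='', minLength=100000, cores=2):
        Job.__init__(self, cores=cores)
        self.fastaID = fastaID
        self.dnabrnnOpts = dnabrnnOpts
        self.minLength = minLength
    # run(self, fileStore) as in the example above

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./jobstore")
    with Toil(options) as toil:
        fasta_id = toil.importFile("file://" + os.path.abspath("genome.fa"))
        masked_id = toil.start(DnaBrnnMaskJob(fasta_id))
        toil.exportFile(masked_id, "file://" + os.path.abspath("genome.masked.fa"))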
Example #4
def wdl_workflow_start(options, in_seq_file):

    s = 'version 1.0\n\n'
    s += wdl_task_preprocess(options) + '\n'
    s += wdl_task_blast(options) + '\n'
    s += wdl_task_align(options) + '\n'
    s += wdl_task_hal_append(options) + '\n'
    
    s += 'workflow cactus_prepared {\n\n'

    # we need to explicitly import local files
    s += '    input {\n'

    s += '        File seq_file'
    if not options.noLocalInputs:
        s += '=\"{}\"'.format(os.path.abspath(options.seqFile))
    s += '\n'

    s += '        File? config_file'
    if not options.noLocalInputs and options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml"):
        s += '=\"{}\"'.format(os.path.abspath(options.configFile))
    s += '\n'
    
    for name, fa_path in in_seq_file.pathMap.items():
        # todo: replace with check from toil
        if '://' not in fa_path:
            s += '        File {}'.format(input_fa_name(name))
            if not options.noLocalInputs:
                s += '=\"{}\"'.format(os.path.abspath(fa_path))
            s += '\n'
    s += '    }\n'
    return s
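
Note that only local inputs get a WDL default value; anything with a '://' scheme is skipped entirely. A self-contained sketch of just that rendering loop (input_fa_name here is a hypothetical stand-in for the real sanitizer):

import os

def input_fa_name(name):
    # hypothetical stand-in for the real sanitizer
    return '{}_fa'.format(name)

def render_fasta_inputs(path_map, no_local_inputs=False):
    s = ''
    for name, fa_path in path_map.items():
        if '://' not in fa_path:  # remote (e.g. s3://) paths are skipped
            s += '        File {}'.format(input_fa_name(name))
            if not no_local_inputs:
                s += '="{}"'.format(os.path.abspath(fa_path))
            s += '\n'
    return s

print(render_fasta_inputs({'simHuman': 'simHuman.fa',
                           'simMouse': 's3://bucket/simMouse.fa'}))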
Example #5
    def processConfig(self):
        # read in the default right out of cactus
        if self.options.configFile is not None:
            configPath = self.options.configFile
        else:
            rootDir = cactusRootPath()
            configPath = os.path.join(rootDir, "cactus_progressive_config.xml")
        configXml = ET.parse(configPath).getroot()
        self.configWrapper = ConfigWrapper(configXml)
        # here we can go through the options and apply some to the config
        self.configWrapper.setBuildHal(True)
        self.configWrapper.setBuildFasta(True)
        if self.options.outputMaf is not None:
            self.configWrapper.setBuildMaf(True)
            self.configWrapper.setJoinMaf(True)
        # pre-emptively turn down maxParallelSubtree for singleMachine
        # mode if not enough threads are provided to support it.  Probably
        # need to do something for other "combined" batch systems?
        if self.options.batchSystem == 'singleMachine' and \
               self.options.database == 'kyoto_tycoon':
            if int(self.options.maxThreads) < \
                   self.configWrapper.getMaxParallelSubtrees() * 3:
                self.configWrapper.setMaxParallelSubtrees(
                    max(1, int(self.options.maxThreads) // 3))

        # this is a little hack to effectively toggle back to the
        # non-progressive version of cactus (as published in Gen. Res. 2011)
        # from the high-level interface.
        if self.options.legacy:
            self.configWrapper.setSubtreeSize(sys.maxsize)  # sys.maxint in Python 2
Example #6
def getRandomConfigFile():
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(), "cactus_config.xml")).getroot()
    cafNode = config.find("caf")
    assert len(config.findall("caf")) == 1
    
    annealingRounds = 1 + int(random.random() * 10)
    cafNode.attrib["annealingRounds"] = " ".join([ str(1 + int(random.random() * 10)) for i in xrange(annealingRounds) ])
    deannealingRounds = list(set([ 1 + int(random.random() * 10) for i in xrange(int(random.random() * 10)) ]))
    deannealingRounds.sort()
    cafNode.attrib["deannealingRounds"] = " ".join([ str(i) for i in deannealingRounds ])
    cafNode.attrib["trim"] = " ".join([ str(1 + int(random.random() * 5)) for i in xrange(annealingRounds) ])
    
    cafNode.attrib["alignRepeatsAtLoop"] = str(random.random() * annealingRounds)
    
    cafNode.attrib["minimumTreeCoverage"] = str(random.random())
    cafNode.attrib["blockTrim"] = str(int(random.random() * 5))
    cafNode.attrib["ignoreAllChainsLessThanMinimumTreeCoverage"] = str(random.choice([0, 1]))
    cafNode.attrib["minimumBlockDegree"] = str(random.choice([0, 5]))
    
    checkNode = config.find("check")
    checkNode.attrib["runCheck"] = "1"
    
    checkNode = config.find("normal")
    checkNode.attrib["iterations"] = "2"
    
    #Now write the file..
    ET.ElementTree(config).write(tempConfigFile)
    if getLogLevelString() == "DEBUG":
        system("cat %s" % tempConfigFile)
    return tempConfigFile
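
A minimal usage sketch for the fuzzer above, assuming the same imports as the example:

path = getRandomConfigFile()
root = ET.parse(path).getroot()
assert root.find("check").attrib["runCheck"] == "1"
assert len(root.find("caf").attrib["annealingRounds"].split()) >= 1
os.remove(path)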
Example #7
def getConfigFile(matchingAlgorithm="greedy"):
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(
        os.path.join(cactusRootPath(),
                     "cactus_progressive_config.xml")).getroot()
    config.find("reference").attrib["matching_algorithm"] = matchingAlgorithm
    ET.ElementTree(config).write(tempConfigFile)
    return os.path.abspath(tempConfigFile)
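
Usage sketch; "blossom5" is illustrative only, since the set of valid names depends on the reference code (assumes the same imports as the example):

configPath = getConfigFile(matchingAlgorithm="blossom5")
root = ET.parse(configPath).getroot()
assert root.find("reference").attrib["matching_algorithm"] == "blossom5"
os.remove(configPath)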
Example #8
 def setUp(self):
     self.batchSystem = "singleMachine"
     if getBatchSystem() is not None:
         self.batchSystem = getBatchSystem()
     unittest.TestCase.setUp(self)
     self.configFile = os.path.join(cactusRootPath(), "cactus_config.xml")
     self.configNode = ET.parse(self.configFile).getroot()
     self.barNode = self.configNode.find("bar")
     assert self.barNode is not None
Example #9
 def setUp(self):
     self.batchSystem = "singleMachine"
     if getBatchSystem() is not None:
         self.batchSystem = getBatchSystem()
     unittest.TestCase.setUp(self)
     self.configFile = os.path.join(cactusRootPath(), "cactus_config.xml")
     self.configNode = ET.parse(self.configFile).getroot()
     self.barNode = self.configNode.find("bar")
     assert self.barNode is not None
def getConfigFile(matchingAlgorithm="greedy"):
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(), "cactus_progressive_config.xml")).getroot()
    #Set the matching algorithm
    config.find("reference").attrib["matching_algorithm"] = matchingAlgorithm
    #Now write the file..
    ET.ElementTree(config).write(tempConfigFile)
    return os.path.abspath(tempConfigFile)
Example #11
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences."""

    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    # prepare output sequence directory
    # todo: support remote (ie s3) output directory
    try:
        os.makedirs(options.outSeqDir)
    except OSError:
        pass
    if not os.path.isdir(options.outSeqDir):
        raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outSeqDir))
    if not os.access(options.outSeqDir, os.W_OK):
        logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outSeqDir))

    # hack the configfile to skip preprocessing and write it to the output dir
    if options.preprocessOnly:
        config.removePreprocessors()
        options.configFile = os.path.join(options.outSeqDir, 'config.xml')
        config.writeXML(options.configFile)
        
    # pass through the config file to the options
    # todo: avoid this second hard-coded check of the .xml path
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml"):
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix = config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = seqFile.pathMap
    outSeqFile.outgroups = seqFile.outgroups

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        if leaf or (not leaf and name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outSeqDir, os.path.basename(out_basename))

    # write the output
    with open(options.outSeqFile, 'w') as out_sf:
        out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, outSeqFile))
Example #12
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("outputSequenceDir", help='Directory where the processed sequences will be placed')
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("inputSequences", nargs='+', help='input FASTA file(s)')

    options = parser.parse_args()
    setLoggingFromOptions(options)

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=options.outputSequenceDir, configFile=options.configFile, inputSequences=options.inputSequences, toil=toil, restart=options.restart)
Example #13
def getConfigFile(matchingAlgorithm="greedy"):
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(
        os.path.join(cactusRootPath(),
                     "cactus_progressive_config.xml")).getroot()
    #Set the matching algorithm
    config.find("reference").attrib["matching_algorithm"] = matchingAlgorithm
    #Now write the file..
    ET.ElementTree(config).write(tempConfigFile)
    return os.path.abspath(tempConfigFile)
Example #14
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                      " must appear in NEWICK tree in <seqfile>) to use as a "
                      "root for the alignment.  Any genomes not below this node "
                      "in the tree may be used as outgroups but will never appear"
                      " in the output.  If no root is specifed then the root"
                      " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected.  Cactus requires at least 2')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))
Example #15
 def processConfig(self):
     # read in the default right out of cactus
     if self.options.configFile is not None:
         configPath = self.options.configFile
     else:
         rootDir = cactusRootPath()
         configPath = os.path.join(rootDir, "cactus_progressive_config.xml")
     configXml = ET.parse(configPath).getroot()
     self.configWrapper = ConfigWrapper(configXml)
     # here we can go through the options and apply some to the config
     self.configWrapper.setBuildHal(True)
     self.configWrapper.setBuildFasta(True)
Example #16
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("outputSequenceDir", help='Directory where the processed sequences will be placed')
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("inputSequences", nargs='+', help='input FASTA file(s)')

    options = parser.parse_args()
    setLoggingFromOptions(options)

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=options.outputSequenceDir, configFile=options.configFile, inputSequences=options.inputSequences, toil=toil, restart=options.restart)
Example #17
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file (will be modified if necessary to include graph Fasta sequence)")
    parser.add_argument("minigraphGFA", help = "Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("outputPAF", type=str, help = "Output pairwise alignment file in PAF format")
    parser.add_argument("--outputFasta", type=str, help = "Output graph sequence file in FASTA format (required if not present in seqFile)")
    parser.add_argument("--maskFilter", type=int, help = "Ignore softmasked sequence intervals > Nbp (overrides config option of same name)")    
    parser.add_argument("--outputGAFDir", type=str, help = "Output GAF alignments (raw minigraph output before PAF conversion) to this directory")
    parser.add_argument("--refFromGFA", type=str, help = "Do not align given genome from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)")

    #WDL hacks
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*", help="names (must be same number as --pathOverrides) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outputGAFDir:
        if not os.path.isdir(options.outputGAFDir):
            os.makedirs(options.outputGAFDir)

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMap(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap has finished after {} seconds".format(run_time))
Example #18
 def setUp(self):
     self.batchSystem = "singleMachine"
     if getBatchSystem() is not None:
         self.batchSystem = getBatchSystem()
     unittest.TestCase.setUp(self)
     self.useOutgroup = False
     self.doSelfAlignment = False
     #Load the config file, turn on the checks.
     configWrapper = ConfigWrapper(ET.parse(os.path.join(cactusRootPath(), "cactus_progressive_config.xml")).getroot())
     configWrapper.turnAllModesOn()
     self.tempDir = getTempDirectory(os.getcwd())
     self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
     configWrapper.writeXML(self.configFile)
Example #19
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("--vg", required=True, nargs='+',  help = "Input vg files (PackedGraph or HashGraph format)")
    parser.add_argument("--outDir", required=True, type=str, help = "Output directory")
    parser.add_argument("--outName", required=True, type=str, help = "Basename of all output files")
    parser.add_argument("--reference", required=True, type=str, help = "Reference event name")
    parser.add_argument("--vcfReference", type=str, help = "Reference event for VCF (if different from --reference)")
    parser.add_argument("--rename", nargs='+', default = [], help = "Path renaming, each of form src>dest (see clip-vg -r)")
    parser.add_argument("--clipLength", type=int, default=None, help = "clip out unaligned sequences longer than this")
    parser.add_argument("--wlineSep", type=str, help = "wline separator for vg convert")
    parser.add_argument("--indexCores", type=int, default=1, help = "cores for indexing processes")
    parser.add_argument("--decoyGraph", help= "decoy sequences vg graph to add (PackedGraph or HashGraph format)")
    parser.add_argument("--hal", nargs='+', default = [], help = "Input hal files (for merging)")
    
    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)

    if options.hal and len(options.hal) != len(options.vg):
        raise RuntimeError("If --hal and --vg should specify the same number of files")
        
    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapJoin(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-join has finished after {} seconds".format(run_time))
Example #20
 def processConfig(self):
     # read in the default right out of cactus
     if self.options.configFile is not None:
         configPath = self.options.configFile
     else:
         rootDir = cactusRootPath()
         configPath = os.path.join(rootDir,
                                   "cactus_progressive_config.xml")
     log.info("Using config from path %s." % configPath)
     configXml = ET.parse(configPath).getroot()
     self.configWrapper = ConfigWrapper(configXml)
     # here we can go through the options and apply some to the config
     self.configWrapper.setBuildHal(True)
     self.configWrapper.setBuildFasta(True)
Example #21
 def setUp(self):
     self.batchSystem = "singleMachine"
     if getBatchSystem() is not None:
         self.batchSystem = getBatchSystem()
     unittest.TestCase.setUp(self)
     self.useOutgroup = False
     self.doSelfAlignment = False
     #Load the config file, turn on the checks.
     configWrapper = ConfigWrapper(ET.parse(os.path.join(cactusRootPath(), "cactus_progressive_config.xml")).getroot())
     configWrapper.turnAllModesOn()
     configWrapper.turnOffHeaderChecks()
     self.tempDir = getTempDirectory(os.getcwd())
     self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
     configWrapper.writeXML(self.configFile)
Example #22
def loadDnaBrnnModel(toil, configNode, maskAlpha=False):
    """ store the model in a toil file id so it can be used in any workflow """
    for prepXml in configNode.findall("preprocessor"):
        if prepXml.attrib["preprocessJob"] == "dna-brnn":
            if maskAlpha or getOptionalAttrib(
                    prepXml, "active", typeFn=bool, default=False):
                dnabrnnOpts = getOptionalAttrib(prepXml,
                                                "dna-brnnOpts",
                                                default="")
                # tokenize so we pull the path argument, not a single character
                opts = dnabrnnOpts.split()
                if '-i' in opts:
                    model_path = opts[opts.index('-i') + 1]
                else:
                    model_path = os.path.join(cactusRootPath(),
                                              'attcc-alpha.knm')
                os.environ["CACTUS_DNA_BRNN_MODEL_ID"] = toil.importFile(
                    makeURL(model_path))
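
The imported file id travels through the environment, so any later job can pull the model back out. A minimal sketch of the consuming side (the job class is hypothetical; only the standard Toil fileStore API is assumed):

import os
from toil.job import Job

class DnaBrnnModelConsumer(Job):
    # hypothetical consumer job
    def run(self, fileStore):
        # loadDnaBrnnModel stashed the imported file id in this variable
        model_id = os.environ["CACTUS_DNA_BRNN_MODEL_ID"]
        model_path = fileStore.readGlobalFile(model_id)
        return model_path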
Example #23
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file (gzipped fastas supported)")
    parser.add_argument("minigraphGFA", help = "Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("graphmapPAF", type=str, help = "Output pairwise alignment file in PAF format (can be gzipped)")
    parser.add_argument("--outDir", required=True, type=str, help = "Output directory")
    parser.add_argument("--refContigs", nargs="*", help = "Subset to these reference contigs (multiple allowed)", default=[])
    parser.add_argument("--refContigsFile", type=str, help = "Subset to (newline-separated) reference contigs in this file")
    parser.add_argument("--otherContig", type=str, help = "Lump all reference contigs unselected by above options into single one with this name")
    parser.add_argument("--reference", type=str, help = "Name of reference (in seqFile).  Ambiguity filters will not be applied to it")
    parser.add_argument("--maskFilter", type=int, help = "Ignore softmasked sequence intervals > Nbp")
    
    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)
        
    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapSplit(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-split has finished after {} seconds".format(run_time))
Example #24
def getRandomConfigFile():
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(),
                                   "cactus_config.xml")).getroot()
    cafNode = config.find("caf")
    assert len(config.findall("caf")) == 1

    annealingRounds = 1 + int(random.random() * 10)
    cafNode.attrib["annealingRounds"] = " ".join(
        [str(1 + int(random.random() * 10)) for i in range(annealingRounds)])
    deannealingRounds = list(
        set([
            1 + int(random.random() * 10)
            for i in range(int(random.random() * 10))
        ]))
    deannealingRounds.sort()
    cafNode.attrib["deannealingRounds"] = " ".join(
        [str(i) for i in deannealingRounds])
    cafNode.attrib["trim"] = " ".join(
        [str(1 + int(random.random() * 5)) for i in range(annealingRounds)])

    cafNode.attrib["alignRepeatsAtLoop"] = str(random.random() *
                                               annealingRounds)

    cafNode.attrib["minimumTreeCoverage"] = str(random.random())
    cafNode.attrib["blockTrim"] = str(int(random.random() * 5))
    cafNode.attrib["ignoreAllChainsLessThanMinimumTreeCoverage"] = str(
        random.choice([0, 1]))
    cafNode.attrib["minimumBlockDegree"] = str(random.choice([0, 5]))

    checkNode = config.find("check")
    checkNode.attrib["runCheck"] = "1"

    checkNode = config.find("normal")
    checkNode.attrib["iterations"] = "2"

    #Now write the file..
    ET.ElementTree(config).write(tempConfigFile)
    if getLogLevelString() == "DEBUG":
        system("cat %s" % tempConfigFile)
    return tempConfigFile
Example #25
def main():
    parser = ArgumentParser()
    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("outSeqDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed')
    parser.add_argument("outSeqFile", help = "Path for annotated Seq file output")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3, help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--jobStore", type=str, default="$JOBSTORE", help="jobstore to use in suggested commands")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo", help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true", help="only decompose into preprocessor and cactus jobs")

    options = parser.parse_args()
    options.database = 'kyoto_tycoon'
    #todo support root option
    options.root = None

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
Example #26
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument(
        "cigarsFile",
        nargs="+",
        help=
        "Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)"
    )
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None,
        required=True)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonCactusInput",
        action="store_true",
        help=
        "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars"
    )
    parser.add_argument(
        "--pangenome",
        action="store_true",
        help=
        "Override some CAF settings whose defaults are not suited to star trees"
    )
    parser.add_argument(
        "--pafInput",
        action="store_true",
        help="'cigarsFile' arugment is in PAF format, rather than lastz cigars."
    )
    parser.add_argument("--database",
                        choices=["kyoto_tycoon", "redis"],
                        help="The type of database",
                        default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    if options.pafInput:
        # cactus-graphmap does not do any prepending to simplify interface with minigraph node names
        # so it must be done here
        options.nonCactusInput = True

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
Example #27
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile",
                        type=str,
                        nargs='?',
                        default=None,
                        help="Input Seq file")
    parser.add_argument(
        "outSeqFile",
        type=str,
        nargs='?',
        default=None,
        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--inputNames",
        nargs='*',
        help=
        'input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)'
    )
    parser.add_argument(
        "--inPaths",
        nargs='*',
        help=
        'Space-separated list of input fasta paths (to be used in place of --inSeqFile)'
    )
    parser.add_argument(
        "--outPaths",
        nargs='*',
        help=
        'Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)'
    )
    parser.add_argument("--maskAlpha",
                        action='store_true',
                        help='Use dna-brnn instead of lastz for repeatmasking')
    parser.add_argument(
        "--clipAlpha",
        action='store_true',
        help=
        'Use dna-brnn instead of lastz for repeatmasking.  Also, clip sequence using given minimum length instead of softmasking'
    )
    parser.add_argument(
        "--ignore",
        nargs='*',
        help='Space-separated list of genomes from inSeqFile to ignore',
        default=[])
    parser.add_argument(
        "--maskPAF",
        type=str,
        help=
        'Incorporate coverage gaps from given PAF when masking.  Only implemented for dna-brnn masking'
    )
    parser.add_argument(
        "--brnnCores",
        type=int,
        help=
        'Specify number of cores for each dna-brnn job (overriding default value from the config)'
    )
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # we have two modes: operate directly on paths or rely on the seqfiles.  they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError(
                '--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths'
            )
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile:
            raise RuntimeError(
                '--inPaths must be used in conjunction with --outPaths and not with --inSeqFile nor --outSeqFile'
            )
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError(
                '--inPaths and --outPaths must have the same number of arguments'
            )
    else:
        raise RuntimeError(
            '--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input'
        )
    if options.maskAlpha and options.clipAlpha:
        raise RuntimeError(
            '--maskAlpha and --clipAlpha cannot be used together')
    if options.clipAlpha:
        options.maskAlpha = True
    if options.maskPAF and not options.inputNames and not options.inSeqFile:
        raise RuntimeError(
            '--maskPAF requires event names specified either with an input seqfile or with --inputNames'
        )
    if options.ignore and not options.clipAlpha:
        raise RuntimeError('--ignore can only be used with --clipAlpha')

    inSeqPaths = []
    outSeqPaths = []
    inNames = options.inputNames
    eventNames = []

    #load cactus config
    configNode = ET.parse(options.configFile).getroot()
    #we never want to preprocess minigraph sequences
    graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    options.ignore.append(graph_event)

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        if not inNames:
            inNames = [
                inSeqFile.tree.getName(node)
                for node in inSeqFile.tree.getLeaves()
            ]

        for inName in inNames:
            if inName in options.ignore:
                # "convenience" functionality: we let the --ignore option update the output seqfile
                # to reflect the fact that we're not touching the original input
                outSeqFile.pathMap[inName] = inSeqFile.pathMap[inName]
                continue
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError(
                    '{} must be present in both input and output Seq files'.format(
                        inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                try:
                    os.makedirs(outPath)
                except OSError:
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [
                    os.path.join(inPath, seqPath)
                    for seqPath in os.listdir(inPath)
                ]
                outSeqPaths += [
                    os.path.join(outPath, seqPath)
                    for seqPath in os.listdir(inPath)
                ]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]
            eventNames.append(inName)

        if options.ignore:
            # see comment above
            with open(options.outSeqFile, 'w') as outSF:
                outSF.write(str(outSeqFile))

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths,
                      maskAlpha=options.maskAlpha,
                      clipAlpha=options.clipAlpha,
                      maskPAF=options.maskPAF,
                      inputEventNames=eventNames,
                      brnnCores=options.brnnCores)
Example #28
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("seqFile", help="Input Seq file")
    parser.add_argument(
        "outSeqFile",
        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--inputNames",
        nargs='*',
        help=
        'input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)'
    )
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    inSeqFile = SeqFile(options.seqFile)
    outSeqFile = SeqFile(options.outSeqFile)

    inNames = options.inputNames
    if not inNames:
        inNames = [
            inSeqFile.tree.getName(node)
            for node in inSeqFile.tree.getLeaves()
        ]

    inSeqPaths = []
    outSeqPaths = []

    for inName in inNames:
        if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
            raise RuntimeError(
                '{} must be present in both input and output Seq files'.format(inName))
        inPath = inSeqFile.pathMap[inName]
        outPath = outSeqFile.pathMap[inName]
        if os.path.isdir(inPath):
            try:
                os.makedirs(outPath)
            except OSError:
                pass
            assert os.path.isdir(inPath) == os.path.isdir(outPath)
            inSeqPaths += [
                os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)
            ]
            outSeqPaths += [
                os.path.join(outPath, seqPath)
                for seqPath in os.listdir(inPath)
            ]
        else:
            inSeqPaths += [inPath]
            outSeqPaths += [outPath]

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None,
                      configFile=options.configFile,
                      inputSequences=inSeqPaths,
                      toil=toil,
                      restart=options.restart,
                      outputSequences=outSeqPaths)
Example #29
def get_options():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    # addCactusWorkflowOptions(parser)

    # ### For quick debugging of apply_dipcall_bed_filter:
    # parser.add_argument('paf', type=str,
    #                     help='For quick debugging of apply_dipcall_bed_filter.')

    # options for basic input/output
    parser.add_argument(
        'seqFile',
        type=str,
        help=
        'A file containing all the information cactus needs for construction. This aligner ignores the newick tree.'
    )
    parser.add_argument(
        'refID',
        type=str,
        help=
        'Specifies which asm in seqFile should be treated as the reference.')
    parser.add_argument("outputFile",
                        type=str,
                        help="Output pairwise alignment file")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    # dipcall-like filters
    parser.add_argument(
        '--dipcall_bed_filter',
        action='store_true',
        help=
        "Applies filters & minimap2 arguments used to make the bedfile in dipcall. Only affects the primary mappings file. Secondary mappings aren't used in dipcall."
    )
    parser.add_argument(
        '--dipcall_vcf_filter',
        action='store_true',
        help=
        "Applies filters & minimap2 arguments used to make the vcf in dipcall. Only affects the primary mappings file. Secondary mappings aren't used in dipcall."
    )

    # Progressive Cactus Options:
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")

    ## options for importing assemblies:
    # following arguments are only useful under --non_blast_output
    # parser.add_argument('--non_blast_output', action='store_true',
    #                 help="Instead of using cactus-blast-style prepended ids, use an alternative import method that only alters contig ids if absolutely necessary.")
    # parser.add_argument('--all_unique_ids', action='store_true',
    #                     help="Only take effect when called with --non_blast_output. Prevents the program from touching the assembly files; the user promises that they don't contain any duplicate contig ids. In reality, there should never be contig renamings if there are no duplicate fasta ids.")
    # parser.add_argument('--overwrite_assemblies', action='store_true',
    #                     help="When cleaning the assembly files to make sure there are no duplicate contig ids, overwrite the assembly files. Copy them to a neigboring folder with the affix '_edited_for_duplicate_contig_ids' instead.")

    # # Useful in normal asms import
    # parser.add_argument('--assembly_save_dir', type=str, default='./unique_id_assemblies/',
    #                     help='While deduplicating contig ids in the input fastas, save the assemblies in this directory. Ignored when used in conjunction with --overwrite_assemblies.')

    # for debugging:
    parser.add_argument(
        '--debug_export',
        action='store_true',
        help='Export several other files for debugging inspection.')
    parser.add_argument('--debug_export_dir',
                        type=str,
                        default='./debug_export_dir/',
                        help='Location of the exported debug files.')
    options = parser.parse_args()
    return options
Example #30
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument(
        "cigarsFile",
        nargs="*",
        help=
        "Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)"
    )
    parser.add_argument("outHal",
                        type=str,
                        help="Output HAL file (or directory in --batch mode)")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    #Pangenome Options
    parser.add_argument(
        "--pangenome",
        action="store_true",
        help=
        "Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings."
        " The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument(
        "--pafInput",
        action="store_true",
        help="'cigarsFile' arugment is in PAF format, rather than lastz cigars."
    )
    parser.add_argument(
        "--usePafSecondaries",
        action="store_true",
        help=
        "use the secondary alignments from the PAF input.  They are ignored by default."
    )
    parser.add_argument("--singleCopySpecies",
                        type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument(
        "--barMaskFilter",
        type=int,
        default=None,
        help=
        "BAR's POA aligner will ignore softmasked regions greater than this length. (overrides partialOrderAlignmentMaskFilter in config)"
    )
    parser.add_argument(
        "--outVG",
        action="store_true",
        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument(
        "--outGFA",
        action="store_true",
        help="export pangenome grpah in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument(
        "--batch",
        action="store_true",
        help=
        "Launch batch of alignments.  Input seqfile is expected to be chromfile as generated by cactus-graphmap-slit"
    )
    parser.add_argument(
        "--stagger",
        type=int,
        help=
        "Stagger alignment jobs in batch mode by this many seconds (to avoid starting all at once)",
        default=0)
    parser.add_argument(
        "--acyclic",
        type=str,
        help=
        "Ensure that given genome is cyclic by deleting all paralogy edges in postprocessing"
    )

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonCactusInput",
        action="store_true",
        help=
        "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars"
    )
    parser.add_argument("--database",
                        choices=["kyoto_tycoon", "redis"],
                        help="The type of database",
                        default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError(
                "S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(
            options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file,
                 options.outHal if options.outHal.endswith('.hal') else
                 os.path.join(options.outHal, 'test'),
                 region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore),
                                  options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
        assert len(options.cigarsFile) == 0
    else:
        assert len(options.cigarsFile) > 0

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect.  Numeric (from cactus-blast) or Eventname (cactus-refmap or cactus-graphmap)
    # This is a bit ugly, since we don't have a good way to differentiate refmap from blast, and use --pangenome as a proxy
    # But I don't think there's a real use case yet of making a separate parameter
    options.eventNameAsID = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if options.eventNameAsID is not None:
        # the variable arrives as a string: '' and '0' mean False, anything else True
        options.eventNameAsID = bool(options.eventNameAsID) and options.eventNameAsID != '0'
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(
        int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(
                Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(
                        results[0],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(
                            results[1],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(
                            results[2],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.gfa.gz'.format(chrom))))
            else:
                assert len(results_dict) == 1 and None in results_dict
                halID, vgID, gfaID = results_dict[None][0], results_dict[None][
                    1], results_dict[None][2]
                # export the hal
                toil.exportFile(halID, makeURL(options.outHal))
                # export the vg
                if options.outVG:
                    toil.exportFile(
                        vgID,
                        makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
                if options.outGFA:
                    toil.exportFile(
                        gfaID,
                        makeURL(
                            os.path.splitext(options.outHal)[0] + '.gfa.gz'))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
Example #31
def main_batch():
    """ this is a bit like cactus-align --batch except it will use toil-in-toil to assign each chromosome to a machine.
    pros: much less chance of a problem with one chromosome affecting anything else
          more forgiving for inexact resource specs
          could be ported to Terra
    cons: less efficient use of resources
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("chromFile", help="chroms file")
    parser.add_argument("outHal",
                        type=str,
                        help="Output directory (can be s3://)")
    parser.add_argument(
        "--alignOptions",
        type=str,
        help=
        "Options to pass through to cactus-align (don't forget to wrap in quotes)"
    )
    parser.add_argument("--alignCores",
                        type=int,
                        help="Number of cores per align job")
    parser.add_argument(
        "--alignCoresOverrides",
        nargs="*",
        help=
        "Override align job cores for a chromosome. Space-separated list of chrom,cores pairse epxected"
    )

    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))

    options = parser.parse_args()

    options.containerImage = None
    options.binariesMode = None
    options.root = None
    options.latest = None
    options.database = "kyoto_tycoon"
    # This entry point always runs in batch mode.  Its parser does not define
    # --batch/--outVG/--outGFA, but the export code below reads them, so set
    # them here (outVG/outGFA inferred from the pass-through align options;
    # this is a fix for attributes the original left undefined).
    options.batch = True
    options.outVG = bool(options.alignOptions) and '--outVG' in options.alignOptions
    options.outGFA = bool(options.alignOptions) and '--outGFA' in options.alignOptions

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # Turn the overrides into a dict
    cores_overrides = {}
    if options.alignCoresOverrides:
        for o in options.alignCoresOverrides:
            try:
                chrom, cores = o.split(',')
                cores_overrides[chrom] = int(cores)
            except ValueError:
                raise RuntimeError(
                    "Error parsing alignCoresOverrides \"{}\"".format(o))
    options.alignCoresOverrides = cores_overrides

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            config_id = toil.importFile(makeURL(options.configFile))
            # load the chromfile into memory
            chrom_dict = {}
            with open(options.chromFile, 'r') as chrom_file:
                for line in chrom_file:
                    toks = line.strip().split()
                    if len(toks):
                        assert len(toks) == 3
                        chrom, seqfile, alnFile = toks[0], toks[1], toks[2]
                        chrom_dict[chrom] = toil.importFile(
                            makeURL(seqfile)), toil.importFile(
                                makeURL(alnFile))
            results_dict = toil.start(
                Job.wrapJobFn(align_toil_batch, chrom_dict, config_id,
                              options))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(
                        results[0],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(
                            results[1],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(
                            results[2],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.gfa.gz'.format(chrom))))
                    toil.exportFile(
                        results[3],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal.log'.format(chrom))))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info(
        "cactus-align-batch has finished after {} seconds".format(run_time))
Example #32
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile", type=str, nargs='?', default=None, help = "Input Seq file")
    parser.add_argument("outSeqFile", type=str, nargs='?', default=None, help = "Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*', help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--inPaths", nargs='*', help='Space-separated list of input fasta paths (to be used in place of --inSeqFile')
    parser.add_argument("--outPaths", nargs='*', help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # we have two modes: operate directly on paths or rely on the seqfiles.  they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError('--inSeqFile must be used in conjunction with --outSeqFile, and not with --inPaths or --outPaths')
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile or options.inputNames:
            raise RuntimeError('--inPaths must be used in conjunction with --outPaths, and not with --inSeqFile, --outSeqFile, or --inputNames')
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError('--inPaths and --outPaths must have the same number of arguments')
    else:
        raise RuntimeError('--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input')


    inSeqPaths = []
    outSeqPaths = []

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        inNames = options.inputNames
        if not inNames:
            inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]


        for inName in inNames:
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError('{} must be present in both input and output Seq files'.format(inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                os.makedirs(outPath, exist_ok=True)
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)]
                outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in os.listdir(inPath)]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None, configFile=options.configFile, inputSequences=inSeqPaths, toil=toil, restart=options.restart, outputSequences=outSeqPaths)
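# Aside: the seqfile branch above expands a directory entry into per-file
# input/output pairs by mirroring the input listing.  The same rule in
# isolation (expand_seq_paths is an illustrative name, not part of cactus):
import os

def expand_seq_paths(in_path, out_path):
    """Return parallel (input, output) path lists: a directory input maps each
    contained file to the same basename under out_path; a file maps one-to-one."""
    if os.path.isdir(in_path):
        os.makedirs(out_path, exist_ok=True)
        names = os.listdir(in_path)
        return ([os.path.join(in_path, n) for n in names],
                [os.path.join(out_path, n) for n in names])
    return [in_path], [out_path]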
Example #33
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    options.database = "kyoto_tycoon"

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None and options.batchSystem != 'singleMachine':
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))
Example #34
def main():
    parser = ArgumentParser()
    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("--outDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed.'
                        ' Required when not using --wdl')
    parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]")
    parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]")
    parser.add_argument("--wdl", action="store_true", help="output wdl workflow instead of list of commands")
    parser.add_argument("--noLocalInputs", action="store_true", help="dont embed local input paths in WDL script (as they will need"
                        " to be respecified when running on Terra")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3, help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--jobStore", type=str, default="./jobstore", help="base directory of jobStores to use in suggested commands")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo", help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true", help="only decompose into preprocessor and cactus jobs")
    parser.add_argument("--dockerImage", type=str, help="docker image to use as wdl runtime")
    
    parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast")
    parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)")
    parser.add_argument("--gpuCount", default=1, help="GPU count (to set in WDL runtime parameters)")
    parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version")
    parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task")
    parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks")

    parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified")
    parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job")
    parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job")
    parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job")

    parser.add_argument("--defaultMem", type=float, help="Memory in GB for each job unless otherwise specified")
    parser.add_argument("--preprocessMem", type=float, help="Memory in GB for each cactus-preprocess job")
    parser.add_argument("--blastMem", type=float, help="Memory in GB for each cactus-blast job")
    parser.add_argument("--alignMem", type=float, help="Memory in GB for each cactus-align job")

    parser.add_argument("--defaultDisk", type=int, help="Disk in GB for each job unless otherwise specified")
    parser.add_argument("--preprocessDisk", type=int, help="Disk in GB for each cactus-preprocess job")
    parser.add_argument("--blastDisk", type=int, help="Disk in GB for each cactus-blast job")
    parser.add_argument("--alignDisk", type=int, help="Disk in GB for each cactus-align job")
    parser.add_argument("--halAppendDisk", type=int, help="Disk in GB for each halAppendSubtree job")

    parser.add_argument("--preprocessPreemptible", type=int, help="Preemptible in GB for each cactus-preprocess job [default=2]", default=2)
    parser.add_argument("--blastPreemptible", type=int, help="Preemptible in GB for each cactus-blast job [default=1]", default=1)
    parser.add_argument("--alignPreemptible", type=int, help="Preemptible in GB for each cactus-align job [default=1]", default=1)
    parser.add_argument("--halAppendPreemptible", type=int, help="Preemptible in GB for each halAppendSubtree job [default=1]", default=1)

    options = parser.parse_args()
    options.database = 'kyoto_tycoon'
    #todo support root option
    options.root = None

    if not options.wdl:
        if not options.outDir:
            raise RuntimeError("--outDir option required when not using --wdl")
        if not options.outSeqFile:
            options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile))
            if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile):
                options.outSeqFile += '.1'
                
    if (not options.wdl or not options.gpu) and (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"):
        raise RuntimeError("--gpuType and gpuCount can only be used with --wdl --gpu")

    if not options.outHal:
        options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal')

    if options.wdl:
        if options.preprocessBatchSize != 1:
            if options.preprocessBatchSize != 3:
                # hacky way to only warn for non-default
                sys.stderr.write("Warning: --preprocessBatchSize reset to 1 for --wdl support\n")
            options.preprocessBatchSize = 1
        # wdl handles output file structure
        if options.outDir:
            sys.stderr.write("Warning: --outDir option ignored with --wdl\n")
        options.outDir = "."
        if options.outSeqFile:
            sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n")
            options.outSeqFile = None
        if options.preprocessOnly:
            raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl')
    if not options.dockerImage:
        options.dockerImage = getDockerImage()
    # apply defaults
    if options.defaultCores:
        if not options.preprocessCores:
            options.preprocessCores = options.defaultCores
        if not options.blastCores:
            options.blastCores = options.defaultCores
        if not options.alignCores:
            options.alignCores = options.defaultCores
    if options.defaultMem:
        if not options.preprocessMem:
            options.preprocessMem = options.defaultMem
        if not options.blastMem:
            options.blastMem = options.defaultMem
        if not options.alignMem:
            options.alignMem = options.defaultMem
    if not options.alignCores or options.alignCores == 1:
        if options.alignCores == 1:
            sys.stderr.write("Warning: --alignCores changed from 1 to 2\n")
        options.alignCores = 2
    if options.defaultDisk:
        if not options.preprocessDisk:
            options.preprocessDisk = options.defaultDisk
        if not options.blastDisk:
            options.blastDisk = options.defaultDisk
        if not options.alignDisk:
            options.alignDisk = options.defaultDisk
        if not options.halAppendDisk:
            options.halAppendDisk = options.defaultDisk

    # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion
    # note: k80 not included as WGA_GPU doesn't run on it.  
    acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4']
    if options.gpuType not in acceptable_gpus:
        raise RuntimeError('--gpuType {} not supported by Terra.  Acceptable types are {}'.format(
            options.gpuType, acceptable_gpus))

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()
    # used to uniquify the jobstore name
    options.jobStoreCount = 0

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
Example #35
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences."""

    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare output sequence directory
        # todo: support remote (ie s3) output directory
        try:
            os.makedirs(options.outDir)
        except OSError:
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the configfile to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the configfile to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)
        
    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml") and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix = config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        if leaf or (not leaf and name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))
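# Aside: the traversal above rewrites the path map with one rule: a genome
# with a provided sequence keeps its input basename, an ancestor without one
# gets '<name>.fa', everything lands in outDir, and --wdl appends '.pp' to
# avoid name collisions.  The rule in isolation (out_path_for is an
# illustrative name, not part of cactus):
import os

def out_path_for(name, in_path_map, out_dir, wdl=False):
    """Compute the output sequence path for one genome, per the rule above."""
    basename = os.path.basename(in_path_map[name]) if name in in_path_map else '{}.fa'.format(name)
    path = os.path.join(out_dir, basename)
    return path + '.pp' if wdl else path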
Example #36
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("blastOutput", nargs="+", help = "Blast output (from cactus-blast)")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowd) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*", help="names (must be same number as --paths) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment.  Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output.  If no root is specifed then the root"
                        " of the tree is used. ", default=None, required=True)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonBlastInput", action="store_true",
                        help="Input does not come from cactus-blast: Do not append ids to fasta names")
    parser.add_argument("--nonBlastMegablockFilter", action="store_true",
                        help="By default, the megablock filter is off for --nonBlastInput, as it does not play"
                        "nicely with reference-based alignments.  This flag will turn it back on")
    parser.add_argument("--pafInput", action="store_true",
                        help="'blastOutput' input is in paf format, rather than lastz cigars.")    

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected.  Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    options.database = "kyoto_tycoon"

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
Example #37
def main(toil_mode=False):
    parser = ArgumentParser()
    if toil_mode:
        Job.Runner.addToilOptions(parser)
        parser.add_argument("--latest", dest="latest", action="store_true",
                            help="Use the latest version of the docker container "
                            "rather than pulling one matching this version of cactus")
        parser.add_argument("--containerImage", dest="containerImage", default=None,
                            help="Use the the specified pre-built containter image "
                            "rather than pulling one from quay.io")
        parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                            help="The way to run the Cactus binaries (at top level; use --cactusOpts to set it in nested calls)",
                            default=None)
    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("--outDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed.'
                        ' Required when not using --wdl')
    parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]")
    parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]", required=toil_mode)
    if not toil_mode:
        parser.add_argument("--wdl", action="store_true", help="output wdl workflow instead of list of commands")
        parser.add_argument("--noLocalInputs", action="store_true", help="dont embed local input paths in WDL script (as they will need"
                            " to be respecified when running on Terra")
        parser.add_argument("--jobStore", type=str, default="./jobstore", help="base directory of jobStores to use in suggested commands")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3, help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo --retryCount 0", help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true", help="only decompose into preprocessor and cactus jobs")
    parser.add_argument("--dockerImage", type=str, help="docker image to use as wdl runtime")
    
    parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast")
    parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)")
    parser.add_argument("--gpuCount", default=1, help="GPU count (to set in WDL runtime parameters)")
    parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version")
    parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task")
    parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks")

    if not toil_mode:
        parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified")
    parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job")
    parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job")
    parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job")

    if not toil_mode:
        parser.add_argument("--defaultMemory", type=human2bytesN, help="Memory for each job unless otherwise specified. "
                            "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessMemory", type=human2bytesN, help="Memory for each cactus-preprocess job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastMemory", type=human2bytesN, help="Memory for each cactus-blast job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")                                
    parser.add_argument("--alignMemory", type=human2bytesN, help="Memory for each cactus-align job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")

    if not toil_mode:
        parser.add_argument("--defaultDisk", type=human2bytesN, help="Disk for each job unless otherwise specified. "
                            "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessDisk", type=human2bytesN, help="Disk for each cactus-preprocess job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastDisk", type=human2bytesN, help="Disk for each cactus-blast job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--alignDisk", type=human2bytesN, help="Disk for each cactus-align job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--halAppendDisk", type=human2bytesN, help="Disk for each halAppendSubtree job. "
                        "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes)")

    parser.add_argument("--preprocessPreemptible", type=int, help="Preemptible attempt count for each cactus-preprocess job [default=2]", default=2)
    parser.add_argument("--blastPreemptible", type=int, help="Preemptible attempt count for each cactus-blast job [default=1]", default=1)
    parser.add_argument("--alignPreemptible", type=int, help="Preemptible attempt count for each cactus-align job [default=1]", default=1)
    parser.add_argument("--halAppendPreemptible", type=int, help="Preemptible attempt count for each halAppendSubtree job [default=1]", default=1)
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"], help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()
    #todo support root option
    options.root = None

    if toil_mode:
        options.wdl = False
        options.noLocalInputs = False
        options.outDir = '.'
        setupBinaries(options)
        # need to avoid nested container calls, so set toil-inside-toil jobs to local by default
        if "--binariesMode" not in options.cactusOptions:
            options.cactusOptions += " --binariesMode local"
        if options.jobStore.startswith('aws'):
            if not options.outHal.startswith('s3://'):
                raise RuntimeError("--outHal must be s3:// address when using s3 job store")
            if not has_s3:
                raise RuntimeError("S3 support requires toil to be installed with [aws]")
    options.toil = toil_mode

    if not options.wdl and not options.toil:
        if not options.outDir:
            raise RuntimeError("--outDir option required when not using --wdl")
        if not options.outSeqFile:
            options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile))
            if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile):
                options.outSeqFile += '.1'
                
    if (not options.wdl or not options.gpu) and (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"):
        raise RuntimeError("--gpuType and gpuCount can only be used with --wdl --gpu")

    if not options.outHal:
        options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal')

    if options.wdl:
        # wdl handles output file structure
        if options.outDir:
            sys.stderr.write("Warning: --outDir option ignored with --wdl\n")
        options.outDir = "."
        if options.outSeqFile:
            sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n")
            options.outSeqFile = None
        if options.preprocessOnly:
            raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl')
    if not options.dockerImage:
        options.dockerImage = getDockerImage()
    # apply defaults
    if options.defaultCores:
        if not options.preprocessCores:
            options.preprocessCores = options.defaultCores
        if not options.blastCores:
            options.blastCores = options.defaultCores
        if not options.alignCores:
            options.alignCores = options.defaultCores
    if options.defaultMemory:
        if not options.preprocessMemory:
            options.preprocessMemory = options.defaultMemory
        if not options.blastMemory:
            options.blastMemory = options.defaultMemory
        if not options.alignMemory:
            options.alignMemory = options.defaultMemory
    if not options.alignCores or options.alignCores == 1:
        if options.alignCores == 1:
            sys.stderr.write("Warning: --alignCores changed from 1 to 2\n")
        options.alignCores = 2
    if options.defaultDisk:
        if not options.preprocessDisk:
            options.preprocessDisk = options.defaultDisk
        if not options.blastDisk:
            options.blastDisk = options.defaultDisk
        if not options.alignDisk:
            options.alignDisk = options.defaultDisk
        if not options.halAppendDisk:
            options.halAppendDisk = options.defaultDisk

    # todo: no reason not to support non-1 batch size, but mirror wdl logic for now
    if options.toil:
        if options.preprocessBatchSize != 1:
            if options.preprocessBatchSize != 3:
                # hacky way to only warn for non-default
                sys.stderr.write("Warning: --preprocessBatchSize reset to 1 for --wdl support\n")
            options.preprocessBatchSize = 1
        # todo: could also support this
        assert not options.preprocessOnly

    # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion
    # note: k80 not included as WGA_GPU doesn't run on it.  
    acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4']
    if options.gpuType not in acceptable_gpus:
        raise RuntimeError('--gpuType {} not supported by Terra.  Acceptable types are {}'.format(
            options.gpuType, acceptable_gpus))

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()
    # used to uniquify the jobstore name
    options.jobStoreCount = 0

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
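# Aside: the three fallback cascades above (cores, memory, disk) share one
# pattern: a per-stage value falls back to the shared default when unset.
# A compact sketch of that pattern (apply_default is an illustrative name,
# not part of cactus):
def apply_default(options, default_attr, stage_attrs):
    """Fill unset per-stage options from a shared default, as done above."""
    default = getattr(options, default_attr, None)
    if default:
        for attr in stage_attrs:
            if not getattr(options, attr, None):
                setattr(options, attr, default)

# usage sketch:
# apply_default(options, 'defaultMemory', ['preprocessMemory', 'blastMemory', 'alignMemory'])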