示例#1
0
def main():
    usage = "usage: %prog [options] <multicactus project>"
    description = "Progressive version of cactus_workflow"
    parser = OptionParser(usage=usage, description=description)
    Stack.addJobTreeOptions(parser)
    addCactusWorkflowOptions(parser)
    
    parser.add_option("--nonRecursive", dest="nonRecursive", action="store_true",
                      help="Only process given event (not children) [default=False]", 
                      default=False)
    
    parser.add_option("--event", dest="event", 
                      help="Target event to process [default=root]", default=None)
    
    parser.add_option("--overwrite", dest="overwrite", action="store_true",
                      help="Recompute and overwrite output files if they exist [default=False]",
                      default=False)
    
    options, args = parser.parse_args()
    setLoggingFromOptions(options)

    if len(args) != 1:
        parser.print_help()
        raise RuntimeError("Unrecognised input arguments: %s" % " ".join(args))

    Stack(RunCactusPreprocessorThenProgressiveDown(options, args)).startJobTree(options)
示例#2
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file (will be modified if necessary to include graph Fasta sequence)")
    parser.add_argument("minigraphGFA", help = "Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("outputPAF", type=str, help = "Output pairwise alignment file in PAF format")
    parser.add_argument("--outputFasta", type=str, help = "Output graph sequence file in FASTA format (required if not present in seqFile)")
    parser.add_argument("--maskFilter", type=int, help = "Ignore softmasked sequence intervals > Nbp (overrides config option of same name)")    
    parser.add_argument("--outputGAFDir", type=str, help = "Output GAF alignments (raw minigraph output before PAF conversion) to this directory")
    parser.add_argument("--refFromGFA", type=str, help = "Do not align given genome from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)")

    #WDL hacks
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*", help="names (must be same number as --pathOverrides) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outputGAFDir:
        if not os.path.isdir(options.outputGAFDir):
            os.makedirs(options.outputGAFDir)

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMap(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap has finished after {} seconds".format(run_time))
示例#3
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                      " must appear in NEWICK tree in <seqfile>) to use as a "
                      "root for the alignment.  Any genomes not below this node "
                      "in the tree may be used as outgroups but will never appear"
                      " in the output.  If no root is specifed then the root"
                      " of the tree is used. ", default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected.  Cactus requires at least 2')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("--vg", required=True, nargs='+',  help = "Input vg files (PackedGraph or HashGraph format)")
    parser.add_argument("--outDir", required=True, type=str, help = "Output directory")
    parser.add_argument("--outName", required=True, type=str, help = "Basename of all output files")
    parser.add_argument("--reference", required=True, type=str, help = "Reference event name")
    parser.add_argument("--vcfReference", type=str, help = "Reference event for VCF (if different from --reference)")
    parser.add_argument("--rename", nargs='+', default = [], help = "Path renaming, each of form src>dest (see clip-vg -r)")
    parser.add_argument("--clipLength", type=int, default=None, help = "clip out unaligned sequences longer than this")
    parser.add_argument("--wlineSep", type=str, help = "wline separator for vg convert")
    parser.add_argument("--indexCores", type=int, default=1, help = "cores for indexing processes")
    parser.add_argument("--decoyGraph", help= "decoy sequences vg graph to add (PackedGraph or HashGraph format)")
    parser.add_argument("--hal", nargs='+', default = [], help = "Input hal files (for merging)")
    
    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)

    if options.hal and len(options.hal) != len(options.vg):
        raise RuntimeError("If --hal and --vg should specify the same number of files")
        
    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapJoin(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-join has finished after {} seconds".format(run_time))
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file (gzipped fastas supported)")
    parser.add_argument("minigraphGFA", help = "Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("graphmapPAF", type=str, help = "Output pairwise alignment file in PAF format (can be gzipped)")
    parser.add_argument("--outDir", required=True, type=str, help = "Output directory")
    parser.add_argument("--refContigs", nargs="*", help = "Subset to these reference contigs (multiple allowed)", default=[])
    parser.add_argument("--refContigsFile", type=str, help = "Subset to (newline-separated) reference contigs in this file")
    parser.add_argument("--otherContig", type=str, help = "Lump all reference contigs unselected by above options into single one with this name")
    parser.add_argument("--reference", type=str, help = "Name of reference (in seqFile).  Ambiguity filters will not be applied to it")
    parser.add_argument("--maskFilter", type=int, help = "Ignore softmasked sequence intervals > Nbp")
    
    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)
        
    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapSplit(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-split has finished after {} seconds".format(run_time))
示例#6
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument(
        "cigarsFile",
        nargs="+",
        help=
        "Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)"
    )
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None,
        required=True)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonCactusInput",
        action="store_true",
        help=
        "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars"
    )
    parser.add_argument(
        "--pangenome",
        action="store_true",
        help=
        "Override some CAF settings whose defaults are not suited to star trees"
    )
    parser.add_argument(
        "--pafInput",
        action="store_true",
        help="'cigarsFile' arugment is in PAF format, rather than lastz cigars."
    )
    parser.add_argument("--database",
                        choices=["kyoto_tycoon", "redis"],
                        help="The type of database",
                        default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    if options.pafInput:
        # cactus-graphmap does not do any prepending to simplify interface with minigraph node names
        # so it must be done here
        options.nonCactusInput = True

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
示例#7
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    options.database = "kyoto_tycoon"

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None and options.batchSystem != 'singleMachine':
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))
示例#8
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("blastOutput", nargs="+", help = "Blast output (from cactus-blast)")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowd) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*", help="names (must be same number as --paths) of path overrides")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment.  Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output.  If no root is specifed then the root"
                        " of the tree is used. ", default=None, required=True)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonBlastInput", action="store_true",
                        help="Input does not come from cactus-blast: Do not append ids to fasta names")
    parser.add_argument("--nonBlastMegablockFilter", action="store_true",
                        help="By default, the megablock filter is off for --nonBlastInput, as it does not play"
                        "nicely with reference-based alignments.  This flag will turn it back on")
    parser.add_argument("--pafInput", action="store_true",
                        help="'blastOutput' input is in paf format, rather than lastz cigars.")    

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected.  Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    options.database = "kyoto_tycoon"
        
    options.database = 'kyoto_tycoon'

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
示例#9
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database",
                        dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5
    runCactusProgressive(options)
示例#10
0
def main_batch():
    """ this is a bit like cactus-align --batch except it will use toil-in-toil to assign each chromosome to a machine.
    pros: much less chance of a problem with one chromosome affecting anything else
          more forgiving for inexact resource specs
          could be ported to Terra
    cons: less efficient use of resources
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("chromFile", help="chroms file")
    parser.add_argument("outHal",
                        type=str,
                        help="Output directory (can be s3://)")
    parser.add_argument(
        "--alignOptions",
        type=str,
        help=
        "Options to pass through to cactus-align (don't forget to wrap in quotes)"
    )
    parser.add_argument("--alignCores",
                        type=int,
                        help="Number of cores per align job")
    parser.add_argument(
        "--alignCoresOverrides",
        nargs="*",
        help=
        "Override align job cores for a chromosome. Space-separated list of chrom,cores pairse epxected"
    )

    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))

    options = parser.parse_args()

    options.containerImage = None
    options.binariesMode = None
    options.root = None
    options.latest = None
    options.database = "kyoto_tycoon"

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # Turn the overrides into a dict
    cores_overrides = {}
    if options.alignCoresOverrides:
        for o in options.alignCoresOverrides:
            try:
                chrom, cores = o.split(',')
                cores_overrides[chrom] = int(cores)
            except:
                raise RuntimeError(
                    "Error parsing alignCoresOverrides \"{}\"".format(o))
    options.alignCoresOverrides = cores_overrides

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            config_id = toil.importFile(makeURL(options.configFile))
            # load the chromfile into memory
            chrom_dict = {}
            with open(options.chromFile, 'r') as chrom_file:
                for line in chrom_file:
                    toks = line.strip().split()
                    if len(toks):
                        assert len(toks) == 3
                        chrom, seqfile, alnFile = toks[0], toks[1], toks[2]
                        chrom_dict[chrom] = toil.importFile(
                            makeURL(seqfile)), toil.importFile(
                                makeURL(alnFile))
            results_dict = toil.start(
                Job.wrapJobFn(align_toil_batch, chrom_dict, config_id,
                              options))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(
                        results[0],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(
                            results[1],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(
                            results[2],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.gfa.gz'.format(chrom))))
                    toil.exportFile(
                        results[3],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal.log'.format(chrom))))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info(
        "cactus-align-batch has finished after {} seconds".format(run_time))
示例#11
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument(
        "cigarsFile",
        nargs="*",
        help=
        "Pairiwse aliginments (from cactus-blast, cactus-refmap or cactus-graphmap)"
    )
    parser.add_argument("outHal",
                        type=str,
                        help="Output HAL file (or directory in --batch mode)")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowd) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --paths) of path overrides")

    #Pangenome Options
    parser.add_argument(
        "--pangenome",
        action="store_true",
        help=
        "Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings."
        " The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument(
        "--pafInput",
        action="store_true",
        help="'cigarsFile' arugment is in PAF format, rather than lastz cigars."
    )
    parser.add_argument(
        "--usePafSecondaries",
        action="store_true",
        help=
        "use the secondary alignments from the PAF input.  They are ignored by default."
    )
    parser.add_argument("--singleCopySpecies",
                        type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument(
        "--barMaskFilter",
        type=int,
        default=None,
        help=
        "BAR's POA aligner will ignore softmasked regions greater than this length. (overrides partialOrderAlignmentMaskFilter in config)"
    )
    parser.add_argument(
        "--outVG",
        action="store_true",
        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument(
        "--outGFA",
        action="store_true",
        help="export pangenome grpah in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument(
        "--batch",
        action="store_true",
        help=
        "Launch batch of alignments.  Input seqfile is expected to be chromfile as generated by cactus-graphmap-slit"
    )
    parser.add_argument(
        "--stagger",
        type=int,
        help=
        "Stagger alignment jobs in batch mode by this many seconds (to avoid starting all at once)",
        default=0)
    parser.add_argument(
        "--acyclic",
        type=str,
        help=
        "Ensure that given genome is cyclic by deleting all paralogy edges in postprocessing"
    )

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonCactusInput",
        action="store_true",
        help=
        "Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars"
    )
    parser.add_argument("--database",
                        choices=["kyoto_tycoon", "redis"],
                        help="The type of database",
                        default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil?  That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError(
                    'Only 1 CPU detected.  Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError(
                "S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(
            options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file,
                 options.outHal if options.outHal.endswith('.hal') else
                 os.path.join(options.outHal, 'test'),
                 region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore),
                                  options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
        assert len(options.cigarsFile) == 0
    else:
        assert len(options.cigarsFile) > 0

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect.  Numeric (from cactus-blast) or Eventname (cactus-refmap or cactus-grpahmap)
    # This is a bit ugly, since we don't have a good way to differentiate refmap from blast, and use --pangenome as a proxy
    # But I don't think there's a real use case yet of making a separate parameter
    options.eventNameAsID = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if options.eventNameAsID is not None:
        options.eventNameAsID = False if not bool(
            eventName) or eventName == '0' else True
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(
        int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(
                Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(
                        results[0],
                        makeURL(
                            os.path.join(options.outHal,
                                         '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(
                            results[1],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(
                            results[2],
                            makeURL(
                                os.path.join(options.outHal,
                                             '{}.gfa.gz'.format(chrom))))
            else:
                assert len(results_dict) == 1 and None in results_dict
                halID, vgID, gfaID = results_dict[None][0], results_dict[None][
                    1], results_dict[None][2]
                # export the hal
                toil.exportFile(halID, makeURL(options.outHal))
                # export the vg
                if options.outVG:
                    toil.exportFile(
                        vgID,
                        makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
                if options.outGFA:
                    toil.exportFile(
                        gfaID,
                        makeURL(
                            os.path.splitext(options.outHal)[0] + '.gfa.gz'))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
示例#12
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database",
                        dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None)
    parser.add_argument("--latest",
                        dest="latest",
                        action="store_true",
                        help="Use the latest, locally-built docker container "
                        "rather than pulling from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()
    options.cactusDir = getTempDirectory()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    #Create the progressive cactus project
    projWrapper = ProjectWrapper(options)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([
                        os.path.join(seq, subSeq) for subSeq in os.listdir(seq)
                    ], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(
                    makeURL(project.getConfigPath()))
            logger.info("Setting config id to: %s" % cactusConfigID)
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            project.writeXML(pjPath)
            halID = toil.start(
                RunCactusPreprocessorThenProgressiveDown(
                    options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))
示例#13
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("blastOutput",
                        type=str,
                        help="Blast output (from cactus-blast)")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None,
        required=True)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)
    parser.add_argument(
        "--nonBlastInput",
        action="store_true",
        help=
        "Input does not come from cactus-blast: Do not append ids to fasta names"
    )

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    options.database = 'kyoto_tycoon'

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-blast has finished after {} seconds".format(run_time))
示例#14
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputFile",
                        type=str,
                        help="Output pairwise alignment file")
    parser.add_argument(
        "--pathOverrides",
        nargs="*",
        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument(
        "--pathOverrideNames",
        nargs="*",
        help="names (must be same number as --pathOverrides) of path overrides"
    )

    #Progressive Cactus Options
    parser.add_argument("--database",
                        dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile",
                        dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(),
                                             "cactus_progressive_config.xml"))
    parser.add_argument(
        "--root",
        dest="root",
        help="Name of ancestral node (which"
        " must appear in NEWICK tree in <seqfile>) to use as a "
        "root for the alignment.  Any genomes not below this node "
        "in the tree may be used as outgroups but will never appear"
        " in the output.  If no root is specifed then the root"
        " of the tree is used. ",
        default=None,
        required=True)
    parser.add_argument(
        "--latest",
        dest="latest",
        action="store_true",
        help="Use the latest version of the docker container "
        "rather than pulling one matching this version of cactus")
    parser.add_argument(
        "--containerImage",
        dest="containerImage",
        default=None,
        help="Use the the specified pre-built containter image "
        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode",
                        choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries",
                        default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError(
                'same number of values must be passed to --pathOverrides and --pathOverrideNames'
            )

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-blast has finished after {} seconds".format(run_time))
示例#15
0
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help = "Seq file")
    parser.add_argument("outputHal", type=str, help = "Output HAL file")

    #Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                      help="Database type: tokyo_cabinet or kyoto_tycoon"
                      " [default: %(default)s]",
                      default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                      help="Specify cactus configuration file",
                      default=None)
    parser.add_argument("--root", dest="root", help="Name of ancestral node (which"
                      " must appear in NEWICK tree in <seqfile>) to use as a "
                      "root for the alignment.  Any genomes not below this node "
                      "in the tree may be used as outgroups but will never appear"
                      " in the output.  If no root is specifed then the root"
                      " of the tree is used. ", default=None)   
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the the specified pre-built containter image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    with Toil(options) as toil:
        importSingularityImage()
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            #Create the progressive cactus project 
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)
            #import the sequences
            seqIDs = []
            print "Importing %s sequences" % (len(project.getInputSequencePaths()))
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()


            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(options, project, memory=configWrapper.getDefaultMemory()))

        toil.exportFile(halID, makeURL(options.outputHal))