Example #1
 def load_tool_configfile(self):
     """Test path of tools_path.yaml on default install path, home or argument"""
     if USER_TOOLS_PATH.exists() and not ARGS_TOOLS_PATH.exists():
         self.tools_config = load_configfile(USER_TOOLS_PATH)
     elif ARGS_TOOLS_PATH.exists():
         self.tools_config = load_configfile(ARGS_TOOLS_PATH)
         ARGS_TOOLS_PATH.unlink()
     else:
         self.tools_config = load_configfile(GIT_TOOLS_PATH)
Example #2
def load_cluster_config(path):
    """Load config to dict either from absolute path or relative to profile dir."""
    if path:
        path = os.path.join(os.path.dirname(__file__), os.path.expandvars(path))
        default_cluster_config = io.load_configfile(path)
    else:
        default_cluster_config = {}
    if "__default__" not in default_cluster_config:
        default_cluster_config["__default__"] = {}
    return default_cluster_config
Example #3
    def sconfig(self, line, cell):
        " Load JSON or YAML into workflow's config object "
        workflow = self.get_workflow()

        # create a temp file, so we can use snakemake.load_configfile
        #  it wouldn't be hard to roll our own to avoid this...
        cell_config_file = tempfile.NamedTemporaryFile('w', delete=False)
        cell_config_file.write(cell)
        cell_config_file.close()

        snakemake.workflow.config.update(load_configfile(
            cell_config_file.name))
        logger.debug(repr(snakemake.workflow.config))
        os.unlink(cell_config_file.name)
Example #4
    def run_workflow(self,
                     workflow="recover_mags",
                     cores=16,
                     profile=None,
                     dryrun=False,
                     conda_frontend="mamba",
                     snakemake_args=""):
        """Runs the aviary pipeline
        By default all steps are executed
        Needs a config-file which is generated by given inputs.
        Most snakemake arguments can be appended to the command for more info see 'snakemake --help'
        """

        if not os.path.exists(self.config):
            logging.critical(f"config-file not found: {self.config}\n")
            sys.exit(1)

        self.validate_config()

        conf = load_configfile(self.config)

        cmd = (
            "snakemake --snakefile {snakefile} --directory {working_dir} "
            "{jobs} --rerun-incomplete "
            "--configfile '{config_file}' --nolock "
            " {profile} {conda_frontend} --use-conda {conda_prefix} {dryrun} "
            " {target_rule} "
            " {args} ").format(
                snakefile=get_snakefile(),
                working_dir=self.output,
                jobs="--jobs {}".format(cores) if cores is not None else "",
                config_file=self.config,
                profile="" if
                (profile is None) else "--profile {}".format(profile),
                dryrun="--dryrun" if dryrun else "",
                args=" ".join(snakemake_args),
                target_rule=workflow if workflow != "None" else "",
                conda_prefix="--conda-prefix " + self.conda_prefix,
                conda_frontend="--conda-frontend " + conda_frontend)
        logging.info("Executing: %s" % cmd)
        try:
            subprocess.check_call(cmd, shell=True)
        except subprocess.CalledProcessError as e:
            # removes the traceback
            logging.critical(e)
            exit(1)
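
A rough usage sketch of the method above; `pipeline` stands in for an instance of whatever class defines run_workflow (the instance and its construction are hypothetical, only the keyword arguments come from the signature shown):

    # `pipeline` is a hypothetical instance of the class defining run_workflow()
    pipeline.run_workflow(workflow="recover_mags",
                          cores=32,
                          dryrun=True,
                          snakemake_args=["--keep-going"])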
Example #5
    def __init__(self, workflow, config):
        # workflow is available only in __init__
        # print("\n".join(list(workflow.__dict__.keys())))
        # print(workflow.__dict__)
        self.snakefile = SNAKEFILE
        self.tools_config = None
        self.path_config = workflow.overwrite_configfiles[0]
        self.config = config

        self.use_env_modules = workflow.use_env_modules
        self.use_conda = workflow.use_conda
        self.use_singularity = workflow.use_singularity
        # check whether a cluster_config.yaml was passed to snakemake
        if not workflow.overwrite_clusterconfig and get_install_mode() == "cluster":
            self.cluster_config = load_configfile(
                DEFAULT_PROFILE.joinpath("cluster_config.yaml"))
        elif not workflow.overwrite_clusterconfig and get_install_mode() == "local":
            self.cluster_config = None
        else:
            self.cluster_config = workflow.overwrite_clusterconfig

        self.load_tool_configfile()
Example #6
def validate_assembly_config(config):
    c = load_configfile(config)
    valid = True
    if not "samples" in c:
        logging.critical("'samples' is not defined in %s" % config)
        valid = False
    try:
        if len(c["samples"].keys()) == 0:
            logging.critical("no samples are defined under 'samples' in %s" %
                             config)
            valid = False
        for sample, meta in c["samples"].items():
            if sample == "coassemblies":
                for coassembly, file_list in meta["coassemblies"].items():
                    for co_sample in file_list:
                        if co_sample not in c["samples"]:
                            logging.critical(
                                "Sample %s under coassembly %s is not a defined sample in the configuration"
                                % (co_sample, coassembly))
                            valid = False
            # common/known bad characters
            if " " in sample or "_" in sample:
                logging.critical(
                    "The sample ID for %s contains invalid characters; use words or words separated by dashes only"
                )
                valid = False
            if not "path" in meta:
                logging.critical("'path' is not set for sample %s" % sample)
                valid = False
                continue
            for f in meta["path"]:
                if not os.path.exists(f):
                    logging.critical("%s does not exist for sample %s" %
                                     (f, sample))
                    valid = False
    except KeyError:
        pass

    if not "preprocessing" in c:
        logging.critical("'preprocessing' is not defined in %s" % config)
        valid = False
    try:
        if not "adapters" in c["preprocessing"]:
            logging.critical(
                "'adapters' is not defined under 'preprocessing' in %s" %
                config)
            valid = False
        for f in c["preprocessing"]["adapters"].split(","):
            if not os.path.exists(f):
                logging.critical("adapters file [%s] does not exist" % f)
                valid = False
        if not "contamination" in c["preprocessing"]:
            logging.critical(
                "'contamination' is not defined under 'preprocessing' in %s" %
                config)
            valid = False
        if not "references" in c["preprocessing"]["contamination"]:
            logging.critical(
                "'references' is not defined under 'contamination' in %s" %
                config)
            valid = False
        if not "rRNA" in c["preprocessing"]["contamination"]["references"]:
            logging.critical(
                "'rRNA' is not a defined contamination reference in %s" %
                config)
            valid = False
        for ref, f in c["preprocessing"]["contamination"]["references"].items(
        ):
            if not os.path.exists(f):
                logging.critical(
                    "contamination reference file [%s] does not exist for %s" %
                    (f, ref))
                valid = False
        if not "normalization" in c["preprocessing"]:
            logging.critical("'normalization' is not defined in %s" % config)
            valid = False
    except KeyError:
        pass

    if not "assembly" in c:
        logging.critical("'assembly' is not defined in %s" % config)
        valid = False
    try:
        if not c["assembly"]["assembler"] == "megahit" and not c["assembly"][
                "assembler"] == "spades":
            logging.critical(
                "'assembler' entry [%s] is not a supported assembler" %
                c["assembly"]["assembler"])
            valid = False
    except KeyError:
        pass

    if not "annotation" in c:
        logging.critical("'annotation' is not defined in %s" % config)
        valid = False
    try:
        if not "references" in c["annotation"]:
            logging.critical("'references' is not defined in %s" % config)
            valid = False

        if "refseq" in c["annotation"]["references"]:
            if not os.path.exists(
                    c["annotation"]["references"]["refseq"]["namemap"]):
                logging.critical(
                    "namemap reference file [%s] does not exist" %
                    c["annotation"]["references"]["refseq"]["namemap"])
                valid = False
            if not os.path.exists(
                    c["annotation"]["references"]["refseq"]["tree"]):
                logging.critical(
                    "tree reference file [%s] does not exist" %
                    c["annotation"]["references"]["refseq"]["tree"])
                valid = False
            if not os.path.exists(
                    c["annotation"]["references"]["refseq"]["dmnd"]):
                logging.critical(
                    "fasta reference file [%s] does not exist" %
                    c["annotation"]["references"]["refseq"]["fasta"])
                valid = False

        if "cazy" in c["annotation"]["references"]:
            if not os.path.exists(
                    c["annotation"]["references"]["cazy"]["namemap"]):
                logging.critical(
                    "namemap reference file [%s] does not exist" %
                    c["annotation"]["references"]["cazy"]["namemap"])
                valid = False
            if not os.path.exists(
                    c["annotation"]["references"]["cazy"]["dmnd"]):
                logging.critical(
                    "fasta reference file [%s] does not exist" %
                    c["annotation"]["references"]["cazy"]["fasta"])
                valid = False

        if "cog" in c["annotation"]["references"]:
            if not os.path.exists(
                    c["annotation"]["references"]["cog"]["namemap"]):
                logging.critical(
                    "namemap reference file [%s] does not exist" %
                    c["annotation"]["references"]["cog"]["namemap"])
                valid = False
            if not os.path.exists(
                    c["annotation"]["references"]["cog"]["dmnd"]):
                logging.critical("fasta reference file [%s] does not exist" %
                                 c["annotation"]["references"]["cog"]["fasta"])
                valid = False

        if "enzyme" in c["annotation"]["references"]:
            if not os.path.exists(
                    c["annotation"]["references"]["enzyme"]["namemap"]):
                logging.critical(
                    "namemap reference file [%s] does not exist" %
                    c["annotation"]["references"]["enzyme"]["namemap"])
                valid = False
            if not os.path.exists(
                    c["annotation"]["references"]["enzyme"]["dmnd"]):
                logging.critical(
                    "fasta reference file [%s] does not exist" %
                    c["annotation"]["references"]["enzyme"]["fasta"])
                valid = False

        if "eggnog" in c["annotation"]["references"]:
            if not os.path.exists(
                    c["annotation"]["references"]["eggnog"]["namemap"]):
                logging.critical(
                    "namemap reference file [%s] does not exist" %
                    c["annotation"]["references"]["eggnog"]["namemap"])
                valid = False
            if not os.path.exists(
                    c["annotation"]["references"]["eggnog"]["dmnd"]):
                logging.critical(
                    "fasta reference file [%s] does not exist" %
                    c["annotation"]["references"]["eggnog"]["fasta"])
                valid = False

    except KeyError:
        valid = False

    return valid
Example #7
                        help='pre-processed by HVG filtering',
                        action='store_true')
    parser.add_argument('-s',
                        '--scale',
                        action='store_true',
                        help='pre-processed by scaling')

    args = parser.parse_args()
    config = args.config
    task = args.task
    hvgs = args.hvgs
    scale = args.scale
    method = args.method

    # Load config file
    params = load_configfile(config)

    # Check inputs
    if method not in params['METHODS']:
        raise ValueError(
            f'{method} is not a valid method.\n'
            f'Please choose one of: {list(params["METHODS"].keys())}')

    if task not in params['DATA_SCENARIOS']:
        raise ValueError(
            f'{task} is not a valid integration task.\n'
            f'Please choose one of: {list(params["DATA_SCENARIOS"].keys())}')

    # Get path values
    folder = params['ROOT']
    t_folder = task
Example #8
def validate_config(config, workflow):
    conf = load_configfile(config)
Example #9
#!/usr/bin/env python3

import os
import pandas as pd
from snakemake.io import load_configfile

config = load_configfile("config.yaml")

# Make a file with col1 as contig group name and col2 as a comma-separated list
# of contigs. This was written for use with GATK, for splitting the variant
# calling steps by chromosome/contig. We group the unplaced contigs together
# since those tend to be small.
fai_file = config["ref"]["fai"]
contigs_file = "grouped_contigs.tsv"

os.system("perl -e 'print qq/name\tcontigs\n/' >" + contigs_file)
os.system("grep -Po '(^chr\S+|^\d+)' " + fai_file +
          " | perl -lne 'print qq/$_\t$_/' >>" + contigs_file)
os.system(
    "grep -Pv '(^chr\S+|^\d+)' " + fai_file +
    " | cut -f1 | perl -npe 's/\n/,/g' | perl -lne 's/,$//; print qq/unplaced_contigs\t$_/'  >>"
    + contigs_file)
contig_groups = pd.read_table(contigs_file)

# check chromosomes/contigs parsed correctly.
num_contigs_fai = sum(1 for line in open(fai_file))
num_contigs_parsed = contig_groups.shape[0] + contig_groups['contigs'][
    contig_groups.shape[0] - 1].count(',')
assert num_contigs_fai == num_contigs_parsed, "Chromosomes in .fai not parsed correctly."
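
The same grouping can be sketched without shelling out to perl; the following pandas-only variant is an assumption mirroring the regular expressions above, not part of the original script:

    # Pandas-only rewrite of the grouping above (a sketch, not the original code).
    import pandas as pd

    fai = pd.read_table(fai_file, header=None)
    contigs = fai[0].astype(str)
    is_main = contigs.str.match(r"(chr\S+|\d+)$")    # chr* or purely numeric names

    rows = [(c, c) for c in contigs[is_main]]        # one group per main contig
    unplaced = ",".join(contigs[~is_main])           # pool the remaining contigs
    if unplaced:
        rows.append(("unplaced_contigs", unplaced))

    pd.DataFrame(rows, columns=["name", "contigs"]).to_csv(
        contigs_file, sep="\t", index=False)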
Example #10
def snakemake(snakefile,
              listrules=False,
              list_target_rules=False,
              cores=1,
              nodes=1,
              local_cores=1,
              resources=dict(),
              config=dict(),
              configfile=None,
              config_args=None,
              workdir=None,
              targets=None,
              dryrun=False,
              touch=False,
              forcetargets=False,
              forceall=False,
              forcerun=[],
              prioritytargets=[],
              stats=None,
              printreason=False,
              printshellcmds=False,
              printdag=False,
              printrulegraph=False,
              printd3dag=False,
              nocolor=False,
              quiet=False,
              keepgoing=False,
              cluster=None,
              cluster_config=None,
              cluster_sync=None,
              drmaa=None,
              jobname="snakejob.{rulename}.{jobid}.sh",
              immediate_submit=False,
              standalone=False,
              ignore_ambiguity=False,
              snakemakepath=None,
              lock=True,
              unlock=False,
              cleanup_metadata=None,
              force_incomplete=False,
              ignore_incomplete=False,
              list_version_changes=False,
              list_code_changes=False,
              list_input_changes=False,
              list_params_changes=False,
              list_resources=False,
              summary=False,
              detailed_summary=False,
              latency_wait=3,
              benchmark_repeats=1,
              wait_for_files=None,
              print_compilation=False,
              debug=False,
              notemp=False,
              nodeps=False,
              keep_target_files=False,
              allowed_rules=None,
              jobscript=None,
              timestamp=False,
              greediness=None,
              no_hooks=False,
              overwrite_shellcmd=None,
              updated_files=None,
              log_handler=None,
              keep_logger=False,
              verbose=False):
    """Run snakemake on a given snakefile.

    This function provides access to the whole snakemake functionality. It is not thread-safe.

    Args:
        snakefile (str):            the path to the snakefile
        listrules (bool):           list rules (default False)
        list_target_rules (bool):   list target rules (default False)
        cores (int):                the number of provided cores (ignored when using cluster support) (default 1)
        nodes (int):                the number of provided cluster nodes (ignored without cluster support) (default 1)
        local_cores (int):                the number of provided local cores if in cluster mode (ignored without cluster support) (default 1)
        resources (dict):           provided resources, a dictionary assigning integers to resource names, e.g. {gpu=1, io=5} (default {})
        config (dict):              override values for workflow config
        workdir (str):              path to working directory (default None)
        targets (list):             list of targets, e.g. rule or file names (default None)
        dryrun (bool):              only dry-run the workflow (default False)
        touch (bool):               only touch all output files if present (default False)
        forcetargets (bool):        force given targets to be re-created (default False)
        forceall (bool):            force all output files to be re-created (default False)
        forcerun (list):             list of files and rules that shall be re-created/re-executed (default [])
        prioritytargets (list):     list of targets that shall be run with maximum priority (default [])
        stats (str):                path to file that shall contain stats about the workflow execution (default None)
        printreason (bool):         print the reason for the execution of each job (default false)
        printshellcmds (bool):      print the shell command of each job (default False)
        printdag (bool):            print the dag in the graphviz dot language (default False)
        printrulegraph (bool):      print the graph of rules in the graphviz dot language (default False)
        printd3dag (bool):          print a D3.js compatible JSON representation of the DAG (default False)
        nocolor (bool):             do not print colored output (default False)
        quiet (bool):               do not print any default job information (default False)
        keepgoing (bool):           keep going upon errors (default False)
        cluster (str):              submission command of a cluster or batch system to use, e.g. qsub (default None)
        cluster_config (str):       configuration file for cluster options (default None)
        cluster_sync (str):         blocking cluster submission command (like SGE 'qsub -sync y')  (default None)
        drmaa (str):                if not None use DRMAA for cluster support, str specifies native args passed to the cluster when submitting a job
        jobname (str):              naming scheme for cluster job scripts (default "snakejob.{rulename}.{jobid}.sh")
        immediate_submit (bool):    immediately submit all cluster jobs, regardless of dependencies (default False)
        standalone (bool):          kill all processes very rudely in case of failure (do not use this if you use this API) (default False)
        ignore_ambiguity (bool):    ignore ambiguous rules and always take the first possible one (default False)
        snakemakepath (str):        path to the snakemake executable (default None)
        lock (bool):                lock the working directory when executing the workflow (default True)
        unlock (bool):              just unlock the working directory (default False)
        cleanup_metadata (bool):    just cleanup metadata of output files (default False)
        force_incomplete (bool):    force the re-creation of incomplete files (default False)
        ignore_incomplete (bool):   ignore incomplete files (default False)
        list_version_changes (bool): list output files with changed rule version (default False)
        list_code_changes (bool):   list output files with changed rule code (default False)
        list_input_changes (bool):  list output files with changed input files (default False)
        list_params_changes (bool): list output files with changed params (default False)
        summary (bool):             list summary of all output files and their status (default False)
        latency_wait (int):         how many seconds to wait for an output file to appear after the execution of a job, e.g. to handle filesystem latency (default 3)
        benchmark_repeats (int):    number of repeated runs of a job if declared for benchmarking (default 1)
        wait_for_files (list):      wait for given files to be present before executing the workflow
        list_resources (bool):      list resources used in the workflow (default False)
        summary (bool):             list summary of all output files and their status (default False). If no option is specified a basic summary will be output. If 'detailed' is added as an option, e.g. --summary detailed, extra info about the input and shell commands will be included
        detailed_summary (bool):    list summary of all input and output files and their status (default False)
        print_compilation (bool):   print the compilation of the snakefile (default False)
        debug (bool):               allow to use the debugger within rules
        notemp (bool):              ignore temp file flags, e.g. do not delete output files marked as temp after use (default False)
        nodeps (bool):              ignore dependencies (default False)
        keep_target_files (bool):   Do not adjust the paths of given target files relative to the working directory.
        allowed_rules (set):        Restrict allowed rules to the given set. If None or empty, all rules are used.
        jobscript (str):            path to a custom shell script template for cluster jobs (default None)
        timestamp (bool):           print time stamps in front of any output (default False)
        greediness (float):         set the greediness of scheduling. This value between 0 and 1 determines how careful jobs are selected for execution. The default value (0.5 if prioritytargets are used, 1.0 else) provides the best speed and still acceptable scheduling quality.
        overwrite_shellcmd (str):   a shell command that shall be executed instead of those given in the workflow. This is for debugging purposes only.
        updated_files(list):        a list that will be filled with the files that are updated or created during the workflow execution
        verbose(bool):              show additional debug output (default False)
        log_handler (function):     redirect snakemake output to this custom log handler, a function that takes a log message dictionary (see below) as its only argument (default None). The log message dictionary for the log handler has the following entries:

            :level:
                the log level ("info", "error", "debug", "progress", "job_info")

            :level="info", "error" or "debug":
                :msg:
                    the log message
            :level="progress":
                :done:
                    number of already executed jobs

                :total:
                    number of total jobs

            :level="job_info":
                :input:
                    list of input files of a job

                :output:
                    list of output files of a job

                :log:
                    path to log file of a job

                :local:
                    whether a job is executed locally (i.e. ignoring cluster)

                :msg:
                    the job message

                :reason:
                    the job reason

                :priority:
                    the job priority

                :threads:
                    the threads of the job


    Returns:
        bool:   True if workflow execution was successful.

    """

    if updated_files is None:
        updated_files = list()

    if cluster or cluster_sync or drmaa:
        cores = sys.maxsize
    else:
        nodes = sys.maxsize

    if cluster_config:
        cluster_config = load_configfile(cluster_config)
    else:
        cluster_config = dict()

    if not keep_logger:
        setup_logger(handler=log_handler,
                     quiet=quiet,
                     printreason=printreason,
                     printshellcmds=printshellcmds,
                     nocolor=nocolor,
                     stdout=dryrun,
                     debug=verbose,
                     timestamp=timestamp)

    if greediness is None:
        greediness = 0.5 if prioritytargets else 1.0
    else:
        if not (0 <= greediness <= 1.0):
            logger.error("Error: greediness must be a float between 0 and 1.")
            return False

    if not os.path.exists(snakefile):
        logger.error("Error: Snakefile \"{}\" not present.".format(snakefile))
        return False
    snakefile = os.path.abspath(snakefile)

    cluster_mode = ((cluster is not None) + (cluster_sync is not None) +
                    (drmaa is not None))
    if cluster_mode > 1:
        logger.error("Error: cluster and drmaa args are mutually exclusive")
        return False
    if debug and (cores > 1 or cluster_mode):
        logger.error(
            "Error: debug mode cannot be used with more than one core or cluster execution.")
        return False

    overwrite_config = dict()
    if configfile:
        overwrite_config.update(load_configfile(configfile))
    if config:
        overwrite_config.update(config)

    if workdir:
        olddir = os.getcwd()
        if not os.path.exists(workdir):
            logger.info(
                "Creating specified working directory {}.".format(workdir))
            os.makedirs(workdir)
        workdir = os.path.abspath(workdir)
        os.chdir(workdir)
    workflow = Workflow(snakefile=snakefile,
                        snakemakepath=snakemakepath,
                        jobscript=jobscript,
                        overwrite_shellcmd=overwrite_shellcmd,
                        overwrite_config=overwrite_config,
                        overwrite_workdir=workdir,
                        overwrite_configfile=configfile,
                        config_args=config_args,
                        debug=debug)

    if standalone:
        try:
            # set the process group
            os.setpgrp()
        except:
            # ignore: if it does not work we can still work without it
            pass

    success = True
    try:
        workflow.include(snakefile,
                         overwrite_first_rule=True,
                         print_compilation=print_compilation)
        workflow.check()

        if not print_compilation:
            if listrules:
                workflow.list_rules()
            elif list_target_rules:
                workflow.list_rules(only_targets=True)
            elif list_resources:
                workflow.list_resources()
            else:
                # if not printdag and not printrulegraph:
                # handle subworkflows
                subsnakemake = partial(snakemake,
                                       cores=cores,
                                       nodes=nodes,
                                       local_cores=local_cores,
                                       resources=resources,
                                       dryrun=dryrun,
                                       touch=touch,
                                       printreason=printreason,
                                       printshellcmds=printshellcmds,
                                       nocolor=nocolor,
                                       quiet=quiet,
                                       keepgoing=keepgoing,
                                       cluster=cluster,
                                       cluster_config=cluster_config,
                                       cluster_sync=cluster_sync,
                                       drmaa=drmaa,
                                       jobname=jobname,
                                       immediate_submit=immediate_submit,
                                       standalone=standalone,
                                       ignore_ambiguity=ignore_ambiguity,
                                       snakemakepath=snakemakepath,
                                       lock=lock,
                                       unlock=unlock,
                                       cleanup_metadata=cleanup_metadata,
                                       force_incomplete=force_incomplete,
                                       ignore_incomplete=ignore_incomplete,
                                       latency_wait=latency_wait,
                                       benchmark_repeats=benchmark_repeats,
                                       verbose=verbose,
                                       notemp=notemp,
                                       nodeps=nodeps,
                                       jobscript=jobscript,
                                       timestamp=timestamp,
                                       greediness=greediness,
                                       no_hooks=no_hooks,
                                       overwrite_shellcmd=overwrite_shellcmd,
                                       config=config,
                                       config_args=config_args,
                                       keep_logger=True)
                success = workflow.execute(
                    targets=targets,
                    dryrun=dryrun,
                    touch=touch,
                    cores=cores,
                    nodes=nodes,
                    local_cores=local_cores,
                    forcetargets=forcetargets,
                    forceall=forceall,
                    forcerun=forcerun,
                    prioritytargets=prioritytargets,
                    quiet=quiet,
                    keepgoing=keepgoing,
                    printshellcmds=printshellcmds,
                    printreason=printreason,
                    printrulegraph=printrulegraph,
                    printdag=printdag,
                    cluster=cluster,
                    cluster_config=cluster_config,
                    cluster_sync=cluster_sync,
                    jobname=jobname,
                    drmaa=drmaa,
                    printd3dag=printd3dag,
                    immediate_submit=immediate_submit,
                    ignore_ambiguity=ignore_ambiguity,
                    stats=stats,
                    force_incomplete=force_incomplete,
                    ignore_incomplete=ignore_incomplete,
                    list_version_changes=list_version_changes,
                    list_code_changes=list_code_changes,
                    list_input_changes=list_input_changes,
                    list_params_changes=list_params_changes,
                    summary=summary,
                    latency_wait=latency_wait,
                    benchmark_repeats=benchmark_repeats,
                    wait_for_files=wait_for_files,
                    detailed_summary=detailed_summary,
                    nolock=not lock,
                    unlock=unlock,
                    resources=resources,
                    notemp=notemp,
                    nodeps=nodeps,
                    keep_target_files=keep_target_files,
                    cleanup_metadata=cleanup_metadata,
                    subsnakemake=subsnakemake,
                    updated_files=updated_files,
                    allowed_rules=allowed_rules,
                    greediness=greediness,
                    no_hooks=no_hooks)

    except BrokenPipeError:
        # ignore this exception and stop. It occurs if snakemake output is piped into less and less quits before reading the whole output.
        # in such a case, snakemake shall stop scheduling and quit with error 1
        success = False
    except (Exception, BaseException) as ex:
        print_exception(ex, workflow.linemaps)
        success = False
    if workdir:
        os.chdir(olddir)
    if workflow.persistence:
        workflow.persistence.unlock()
    if not keep_logger:
        logger.cleanup()
    return success
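
A minimal sketch of driving this API directly with a custom log handler; the Snakefile path and config values are placeholders, while the keyword arguments and the log-message dictionary come from the docstring above:

    def my_log_handler(msg):
        # msg is a dict; "level" is always present, "msg" holds info/error/debug text
        if msg["level"] in ("info", "error"):
            print(msg.get("msg", ""))

    success = snakemake("Snakefile",
                        cores=4,
                        config={"genome": "hg38"},
                        printshellcmds=True,
                        log_handler=my_log_handler)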
Example #11
File: conf.py Project: lijiyang/atlas
def validate_config(config, workflow):
    conf = load_configfile(config)
    validate_sample_defs(conf, workflow)
Example #12
def main(root_dir, args):
    if len(args) < 3:
        print("python run.py unfinished config.yaml cores")
        sys.exit(1)

    unfinished_dir = args[1]
    config_file = [args[2]]
    cores = 79 if len(args) <= 3 else int(args[3])
    parallel = 10 if len(args) <= 4 else int(args[4])
    # load config
    config = load_configfile(config_file[0])
    # set default parameters
    if 'mail' not in config.keys():
        config['mail'] = False
    if 'bark' not in config.keys():
        config['bark'] = False
    if 'feishu' not in config.keys():
        config['feishu'] = False
    # check_config
    if not check_config(config):
        sys.exit(1)

    finished_dir = "finished"
    duplication_dir = "duplication"
    metdata_dir = "metadata"

    if not os.path.exists(metdata_dir):
        os.makedirs(metdata_dir)
    # finished dir
    if not os.path.exists(finished_dir):
        os.makedirs(finished_dir)
    # duplication metadir
    if not os.path.exists(duplication_dir):
        os.makedirs(duplication_dir)

    dup_file = ".file_duplication.json"
    # preprocess
    sample_files = glob.glob(os.path.join(unfinished_dir, "*.txt"))
    # remove duplication before running
    sample_files = remove_duplication(sample_files, dup_file, duplication_dir)

    # select unfinished files
    db = "meta_info.sqlite3"
    if not os.path.exists(db):
        # create database if not exists
        df = build_metadata_table(sample_files)
        df2 = build_sample_table(df, unfinished_dir)
        table_to_sql(df, table_name="meta", db=db)
        table_to_sql(df2, table_name="sample", db=db)
    else:
        # otherwise, upgrade the db
        df = build_metadata_table(sample_files, table_name="meta", db=db)
        if df.shape[0] > 0 and df.shape[1] > 0:
            table_to_sql(df, table_name="meta", db=db)
            df2 = build_sample_table(df, unfinished_dir)
            table_to_sql(df2, table_name="sample", db=db)

    # get the unfinished meta files
    df = table_from_sql(table_name="meta", db=db)
    sample_files = df.loc[df['status'] == 0, 'meta_file'].to_list()
    sample_files = [os.path.join(unfinished_dir, f) for f in sample_files]

    sf = get_snakefile(root_dir, "Snakefile")

    #todo_files = []

    while len(sample_files) > 0:
        for i in range(min(parallel, len(sample_files))):
            file = sample_files.pop()
            shutil.move(file, metdata_dir)
            #todo_files.append(os.path.join( metdata_dir ,os.path.basename(file))  )

        run_snakemake(sf, config_file, cores, unlock=True)
        status = run_snakemake(sf, config_file, cores)
        # move the finished file to finished

        if status:
            contents = "snakemake run successfully"
            if config['bark']:
                bark_notification(config['bark_api'], contents)
            if config['feishu']:
                feishu_notification(config['feishu_api'], contents)
            finished_file = glob.glob(os.path.join(metdata_dir, "*.txt"))
            for _ in range(len(finished_file)):
                file = finished_file.pop()
                update_status(file, table_name="meta", db=db)

                if os.path.isfile(
                        os.path.join(finished_dir, os.path.basename(file))):
                    shutil.copy2(file, finished_dir)
                    os.unlink(file)
                else:
                    shutil.move(file, finished_dir)
        else:
            contents = "snakemake run failed"
            if config['bark']:
                bark_notification(config['bark_api'], contents)
            if config['feishu']:
                feishu_notification(config['feishu_api'], contents)
            broken_file = glob.glob(os.path.join(metdata_dir, "*.txt"))
            for _ in range(len(broken_file)):
                file = broken_file.pop()
                shutil.move(file, "unfinished")
Example #13
    def _create_snakemake_dag(snakefile: str,
                              configfiles: Optional[List[str]] = None,
                              **kwargs: Any) -> DAG:
        """Create ``snakemake.dag.DAG`` instance.

        The code of this function comes from the Snakemake codebase and is adapted
        to fulfil REANA's purpose of getting the needed metadata.

        :param snakefile: Path to Snakefile.
        :type snakefile: string
        :param configfiles: List of config files paths.
        :type configfiles: List
        :param kwargs: Snakemake args.
        :type kwargs: Any
        """
        overwrite_config = dict()
        if configfiles is None:
            configfiles = []
        for f in configfiles:
            # get values to override. Later configfiles override earlier ones.
            overwrite_config.update(load_configfile(f))
        # convert provided paths to absolute paths
        configfiles = list(map(os.path.abspath, configfiles))
        workflow = Workflow(
            snakefile=snakefile,
            overwrite_configfiles=configfiles,
            overwrite_config=overwrite_config,
        )

        workflow.include(snakefile=snakefile, overwrite_first_rule=True)
        workflow.check()

        # code copied and adapted from `snakemake.workflow.Workflow.execute()`
        # in order to build the DAG and calculate the job dependencies.
        # https://github.com/snakemake/snakemake/blob/75a544ba528b30b43b861abc0ad464db4d6ae16f/snakemake/workflow.py#L525
        def rules(items):
            return map(
                workflow._rules.__getitem__,
                filter(workflow.is_rule, items),
            )

        if kwargs.get("keep_target_files"):

            def files(items):
                return filterfalse(workflow.is_rule, items)

        else:

            def files(items):
                relpath = (lambda f: f if os.path.isabs(f) or f.startswith(
                    "root://") else os.path.relpath(f))
                return map(relpath, filterfalse(workflow.is_rule, items))

        if not kwargs.get("targets"):
            targets = ([workflow.first_rule]
                       if workflow.first_rule is not None else list())

        prioritytargets = kwargs.get("prioritytargets", [])
        forcerun = kwargs.get("forcerun", [])
        until = kwargs.get("until", [])
        omit_from = kwargs.get("omit_from", [])

        priorityrules = set(rules(prioritytargets))
        priorityfiles = set(files(prioritytargets))
        forcerules = set(rules(forcerun))
        forcefiles = set(files(forcerun))
        untilrules = set(rules(until))
        untilfiles = set(files(until))
        omitrules = set(rules(omit_from))
        omitfiles = set(files(omit_from))

        targetrules = set(
            chain(
                rules(targets),
                filterfalse(Rule.has_wildcards, priorityrules),
                filterfalse(Rule.has_wildcards, forcerules),
                filterfalse(Rule.has_wildcards, untilrules),
            ))
        targetfiles = set(
            chain(files(targets), priorityfiles, forcefiles, untilfiles))
        dag = DAG(
            workflow,
            workflow.rules,
            targetrules=targetrules,
            targetfiles=targetfiles,
            omitfiles=omitfiles,
            omitrules=omitrules,
        )

        workflow.persistence = Persistence(dag=dag)
        dag.init()
        dag.update_checkpoint_dependencies()
        dag.check_dynamic()
        return dag
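
As a rough usage sketch (the paths are placeholders and the call assumes access to the method from its defining class), the helper could be used to inspect the resulting job graph:

    # Hypothetical paths; keyword arguments mirror the Snakemake args handled above.
    dag = _create_snakemake_dag("workflow/Snakefile",
                                configfiles=["config.yaml"],
                                keep_target_files=False)
    for job in dag.jobs:
        print(job.rule.name, list(job.output))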
Example #14
File: conf.py Project: pythseq/atlas
def validate_config(config):
    conf = load_configfile(config)
    validate_sample_defs(conf)
Example #15
 def validate_config(self):
     load_configfile(self.config)
Example #16
def run_workflow(workflow, working_dir, config_file, jobs, max_mem, profile,
                 dryrun, snakemake_args):
    """Runs the ATLAS pipline

    By default all steps are executed but a sub-workflow can be specified.
    Needs a config-file and expects to find a sample table in the working-directory. Both can be generated with 'atlas init'

    Most snakemake arguments can be appended to the command for more info see 'snakemake --help'

    For more details, see: https://metagenome-atlas.readthedocs.io
    """

    logger.info(f"Atlas version: {__version__}")

    if config_file is None:
        config_file = os.path.join(working_dir, "config.yaml")

    if not os.path.exists(config_file):
        logger.critical(f"config-file not found: {config_file}\n"
                        "generate one with 'atlas init'")
        exit(1)

    sample_file = os.path.join(working_dir, "samples.tsv")

    if not os.path.exists(sample_file):
        logger.critical(f"sample.tsv not found in the working directory. "
                        "Generate one with 'atlas init'")
        exit(1)

    validate_config(config_file, workflow)

    conf = load_configfile(config_file)

    db_dir = conf["database_dir"]

    cmd = ("snakemake --snakefile {snakefile} --directory {working_dir} "
           "{jobs} --rerun-incomplete "
           "--configfile '{config_file}' --nolock "
           " {profile} --use-conda {conda_prefix} {dryrun} "
           " {max_mem_string} "
           " --scheduler greedy "
           " {target_rule} "
           " {args} ").format(
               snakefile=get_snakefile(),
               working_dir=working_dir,
               jobs="--jobs {}".format(jobs) if jobs is not None else "",
               config_file=config_file,
               profile="" if
               (profile is None) else "--profile {}".format(profile),
               dryrun="--dryrun" if dryrun else "",
               args=" ".join(snakemake_args),
               target_rule=workflow if workflow != "None" else "",
               conda_prefix="--conda-prefix " +
               os.path.join(db_dir, "conda_envs"),
               max_mem_string=handle_max_mem(max_mem, profile),
           )
    logger.debug("Executing: %s" % cmd)
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        # removes the traceback
        logger.critical(e)
        exit(1)