Example #1
    def finish(self, job, update_dynamic=True):
        self._finished.add(job)
        try:
            self._ready_jobs.remove(job)
        except KeyError:
            pass
        # mark depending jobs as ready
        for job_ in self.depending[job]:
            if self.needrun(job_) and self._ready(job_):
                self._ready_jobs.add(job_)

        if update_dynamic and job.dynamic_output:
            logger.info("Dynamically updating jobs")
            newjob = self.update_dynamic(job)
            if newjob:
                # simulate that this job ran and was finished before
                self.omitforce.add(newjob)
                self._needrun.add(newjob)
                self._finished.add(newjob)

                self.postprocess()
                self.handle_protected(newjob)
                self.handle_touch(newjob)
                # add finished jobs to len as they are not counted after new postprocess
                self._len += len(self._finished)
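The ready-promotion step above relies on Snakemake's DAG bookkeeping. Below is a toy sketch of the same idea using plain dictionaries and the standard logging module; all job names and data structures here are made up for illustration.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hypothetical miniature DAG: job -> jobs that depend on it / jobs it depends on.
depending = {"align": {"sort"}, "sort": {"index"}, "index": set()}
dependencies = {"align": set(), "sort": {"align"}, "index": {"sort"}}
finished, ready = set(), set()

def finish(job):
    """Mark a job as finished and promote dependents whose inputs are all done."""
    finished.add(job)
    ready.discard(job)
    for dependent in depending[job]:
        if dependencies[dependent] <= finished:
            ready.add(dependent)
            logger.info("Job {} is now ready.".format(dependent))

finish("align")   # -> Job sort is now ready.
finish("sort")    # -> Job index is now ready.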
Example #2
    def printjob(self, job):
        # skip dynamic jobs that will be "executed" only in dryrun mode
        if self.dag.dynamic(job):
            return

        def format_files(job, io, ruleio, dynamicio):
            for f in io:
                f_ = ruleio[f]
                if f in dynamicio:
                    yield "{} (dynamic)".format(f.format_dynamic())
                else:
                    yield f

        priority = self.dag.priority(job)
        logger.job_info(jobid=self.dag.jobid(job),
                        msg=job.message,
                        name=job.rule.name,
                        local=self.workflow.is_local(job.rule),
                        input=list(format_files(job, job.input, job.ruleio,
                                                job.dynamic_input)),
                        output=list(format_files(job, job.output, job.ruleio,
                                                 job.dynamic_output)),
                        log=list(job.log),
                        benchmark=job.benchmark,
                        wildcards=job.wildcards_dict,
                        reason=str(self.dag.reason(job)),
                        resources=job.resources_dict,
                        priority="highest"
                        if priority == Job.HIGHEST_PRIORITY else priority,
                        threads=job.threads)

        if job.dynamic_output:
            logger.info("Subsequent jobs will be added dynamically "
                        "depending on the output of this rule")
Example #3
    def d3dag(self, max_jobs=10000):
        def node(job):
            jobid = self.jobid(job)
            return {
                "id": jobid,
                "value": {
                    "jobid": jobid,
                    "label": job.rule.name,
                    "rule": job.rule.name
                }
            }

        def edge(a, b):
            return {"u": self.jobid(a), "v": self.jobid(b)}

        jobs = list(self.jobs)

        if len(jobs) > max_jobs:
            logger.info(
                "Job-DAG is too large for visualization (>{} jobs).".format(
                    max_jobs))
        else:
            logger.d3dag(nodes=[node(job) for job in jobs],
                         edges=[edge(dep, job) for job in jobs for dep in
                                self.dependencies[job] if self.needrun(dep)])
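A toy sketch of the node/edge serialization above for a three-job chain; the job ids, rule names, and dependency mapping are made up:

import json
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

jobs = {1: "align", 2: "sort", 3: "index"}       # jobid -> rule name
dependencies = {1: [], 2: [1], 3: [2]}           # jobid -> upstream jobids

nodes = [{"id": i, "value": {"jobid": i, "label": rule, "rule": rule}}
         for i, rule in jobs.items()]
edges = [{"u": dep, "v": job}
         for job, deps in dependencies.items() for dep in deps]

logger.info(json.dumps({"nodes": nodes, "edges": edges}, indent=2))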
Example #4
    def schedule(self):
        """ Schedule jobs that are ready, maximizing cpu usage. """
        try:
            while True:
                # work around so that the wait does not prevent keyboard interrupts
                while not self._open_jobs.wait(1):
                    pass

                # obtain needrun and running jobs in a thread-safe way
                with self._lock:
                    needrun = list(self.open_jobs)
                    running = list(self.running)
                # free the event
                self._open_jobs.clear()

                # handle errors
                if not self.keepgoing and self._errors:
                    logger.info("Will exit after finishing "
                                "currently running jobs.")
                    if not running:
                        self._executor.shutdown()
                        logger.error(_ERROR_MSG_FINAL)
                        return False
                    continue
                # normal shutdown because all jobs have been finished
                if not needrun and not running:
                    self._executor.shutdown()
                    if self._errors:
                        logger.error(_ERROR_MSG_FINAL)
                    return not self._errors

                # continue if no new job needs to be executed
                if not needrun:
                    continue

                logger.debug("Resources before job selection: {}".format(
                    self.resources))
                logger.debug("Ready jobs ({}):\n\t".format(len(needrun)) +
                             "\n\t".join(map(str, needrun)))

                # select jobs by solving knapsack problem
                run = self.job_selector(needrun)
                logger.debug("Selected jobs ({}):\n\t".format(len(run)) +
                             "\n\t".join(map(str, run)))
                # update running jobs
                with self._lock:
                    self.running.update(run)
                logger.debug(
                    "Resources after job selection: {}".format(self.resources))
                # actually run jobs
                for job in run:
                    self.run(job)
        except (KeyboardInterrupt, SystemExit):
            logger.info("Terminating processes on user request.")
            self._executor.cancel()
            with self._lock:
                running = list(self.running)
            for job in running:
                job.cleanup()
            return False
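The `while not self._open_jobs.wait(1)` idiom at the top of the loop is what keeps the scheduler responsive to Ctrl-C. A minimal, self-contained sketch of that pattern, assuming only the standard library:

import logging
import threading
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

open_jobs = threading.Event()

def worker():
    time.sleep(2)                     # pretend a job runs for two seconds
    logger.info("Job finished, waking the scheduler.")
    open_jobs.set()

threading.Thread(target=worker, daemon=True).start()

# Waiting in one-second slices keeps the loop responsive to KeyboardInterrupt,
# which a bare Event.wait() without a timeout can block on some platforms.
while not open_jobs.wait(1):
    pass
open_jobs.clear()                     # free the event for the next round
logger.info("Scheduler loop woke up.")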
Example #5
 def download_from_remote(self):
     if self.is_remote and self.remote_object.exists():
         logger.info("Downloading from remote: {}".format(self.file))
         self.remote_object.download()
     else:
         raise RemoteFileException(
             "The file to be downloaded does not seem to exist remotely.")
Example #6
 def cleanup(self):
     """ Cleanup output files. """
     to_remove = [f for f in self.expanded_output if f.exists]
     if to_remove:
         logger.info("Removing output files of failed job {}"
                     " since they might be corrupted:\n{}".format(
                         self, ", ".join(to_remove)))
         for f in to_remove:
             f.remove()
Example #7
 def finish_job(self, job):
     super().finish_job(job)
     self.stats.report_job_end(job)
     try:
         self.workflow.persistence.finished(job)
     except IOError as e:
         logger.info("Failed to remove marker file for job started "
                     "({}). Please ensure write permissions for the "
                     "directory {}".format(e,
                                           self.workflow.persistence.path))
Example #8
 def _error(self, job):
     """ Clear jobs and stop the workflow. """
     with self._lock:
         self._errors = True
         self.running.remove(job)
         self.failed.add(job)
         self._free_resources(job)
         if self.keepgoing:
             logger.info("Job failed, going on with independent jobs.")
         self._open_jobs.set()
Example #9
 def _run(self, job, callback=None, error_callback=None):
     super()._run(job)
     self.stats.report_job_start(job)
     try:
         self.workflow.persistence.started(job)
     except IOError as e:
         logger.info(
             "Failed to set marker file for job started ({}). "
             "Snakemake will work, but cannot ensure that output files "
             "are complete in case of a kill signal or power loss. "
             "Please ensure write permissions for the "
             "directory {}".format(e, self.workflow.persistence.path))
Example #10
@contextlib.contextmanager
def change_working_directory(directory=None):
    """ Change working directory in execution context if provided. """
    if directory:
        try:
            saved_directory = os.getcwd()
            logger.info("Changing to shadow directory: {}".format(directory))
            os.chdir(directory)
            yield
        finally:
            os.chdir(saved_directory)
    else:
        yield
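This function is only useful as a context manager (note the `yield`). A self-contained sketch of how it might be decorated and used, with `logger` replaced by a plain `logging` logger and a throwaway directory standing in for the shadow directory:

import logging
import os
import tempfile
from contextlib import contextmanager

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@contextmanager
def change_working_directory(directory=None):
    """Change working directory for the duration of the with-block, if given."""
    if directory:
        saved_directory = os.getcwd()
        logger.info("Changing to shadow directory: {}".format(directory))
        os.chdir(directory)
        try:
            yield
        finally:
            os.chdir(saved_directory)
    else:
        yield

shadow_dir = tempfile.mkdtemp()
with change_working_directory(shadow_dir):
    logger.info("Inside shadow directory: {}".format(os.getcwd()))
logger.info("Back in: {}".format(os.getcwd()))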
Example #11
def wait_for_files(files, latency_wait=3):
    """Wait for given files to be present in filesystem."""
    files = list(files)
    get_missing = lambda: [f for f in files if not os.path.exists(f)]
    missing = get_missing()
    if missing:
        logger.info("Waiting at most {} seconds for missing files.".format(
            latency_wait))
        for _ in range(latency_wait):
            if not get_missing():
                return
            time.sleep(1)
        raise IOError("Missing files after {} seconds:\n{}".format(
            latency_wait, "\n".join(get_missing())))
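A small, runnable sketch of how this helper might be called, with the file created by a background thread to stand in for filesystem latency (the file name and delay are made up):

import logging
import os
import tempfile
import threading
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def wait_for_files(files, latency_wait=3):
    """Wait for given files to be present in filesystem."""
    files = list(files)
    get_missing = lambda: [f for f in files if not os.path.exists(f)]
    if get_missing():
        logger.info("Waiting at most {} seconds for missing files.".format(latency_wait))
        for _ in range(latency_wait):
            if not get_missing():
                return
            time.sleep(1)
        raise IOError("Missing files after {} seconds:\n{}".format(
            latency_wait, "\n".join(get_missing())))

# The file shows up one second after we start waiting, mimicking NFS latency.
target = os.path.join(tempfile.mkdtemp(), "result.txt")
threading.Timer(1.0, lambda: open(target, "w").close()).start()
wait_for_files([target], latency_wait=5)
logger.info("All files present.")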
Example #12
    def cleanup(self):
        """ Cleanup output files. """
        to_remove = [f for f in self.expanded_output if f.exists]

        to_remove.extend([f for f in self.remote_input if f.exists])
        to_remove.extend([f for f in self.remote_output if f.exists_local])
        if to_remove:
            logger.info("Removing output files of failed job {}"
                        " since they might be corrupted:\n{}".format(
                            self, ", ".join(to_remove)))
            for f in to_remove:
                f.remove()

            self.rmdir_empty_remote_dirs()
Example #13
def print_exception(ex, linemaps):
    """
    Print an error message for a given exception.

    Arguments
    ex -- the exception
    linemaps -- a dict of a dict that maps for each snakefile
        the compiled lines to source code lines in the snakefile.
    """
    tb = "Full " + "".join(traceback.format_exception(type(ex), ex, ex.__traceback__))
    logger.debug(tb)
    if isinstance(ex, SyntaxError) or isinstance(ex, IndentationError):
        logger.error(format_error(ex, ex.lineno,
                                  linemaps=linemaps,
                                  snakefile=ex.filename,
                                  show_traceback=True))
        return
    origin = get_exception_origin(ex, linemaps)
    if origin is not None:
        lineno, file = origin
        logger.error(format_error(ex, lineno,
                                  linemaps=linemaps,
                                  snakefile=file,
                                  show_traceback=True))
        return
    elif isinstance(ex, TokenError):
        logger.error(format_error(ex, None, show_traceback=False))
    elif isinstance(ex, MissingRuleException):
        logger.error(format_error(ex, None,
                                  linemaps=linemaps,
                                  snakefile=ex.filename,
                                  show_traceback=False))
    elif isinstance(ex, RuleException):
        for e in ex._include + [ex]:
            if not e.omit:
                logger.error(format_error(e, e.lineno,
                                          linemaps=linemaps,
                                          snakefile=e.filename,
                                          show_traceback=True))
    elif isinstance(ex, WorkflowError):
        logger.error(format_error(ex, ex.lineno,
                                  linemaps=linemaps,
                                  snakefile=ex.snakefile,
                                  show_traceback=True))
    elif isinstance(ex, KeyboardInterrupt):
        logger.info("Cancelling snakemake on user request.")
    else:
        traceback.print_exception(type(ex), ex, ex.__traceback__)
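A stripped-down sketch of the same dispatch-on-exception-type pattern, with Snakemake's `format_error`, linemaps, and custom exception classes replaced by plain logging (everything below is a simplification, not the real implementation):

import logging
import traceback

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

def print_exception_sketch(ex):
    # Always keep the full traceback at debug level.
    logger.debug("Full " + "".join(
        traceback.format_exception(type(ex), ex, ex.__traceback__)))
    if isinstance(ex, SyntaxError):
        logger.error("Syntax error in {} line {}: {}".format(
            ex.filename, ex.lineno, ex.msg))
    elif isinstance(ex, KeyboardInterrupt):
        logger.info("Cancelling on user request.")
    else:
        # Unknown error: fall back to the raw traceback.
        traceback.print_exception(type(ex), ex, ex.__traceback__)

try:
    compile("def broken(:\n pass", "Snakefile", "exec")
except SyntaxError as e:
    print_exception_sketch(e)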
Example #14
    def printjob(self, job):
        # skip dynamic jobs that will be "executed" only in dryrun mode
        if self.dag.dynamic(job):
            return

        def format_files(job, io, ruleio, dynamicio):
            for f in io:
                f_ = ruleio[f]
                if f in dynamicio:
                    yield "{} (dynamic)".format(f_)
                else:
                    yield f

        def format_ruleitem(name, value):
            return "" if not value else "\t{}: {}".format(name, value)

        desc = list()
        if not self.quiet:
            if job.message:
                desc.append(job.message)
            else:
                desc.append("{}rule {}:".format(self.rule_prefix(job), job.rule.name))
                for name, value in (
                    ("input", ", ".join(format_files(
                        job, job.input, job.ruleio, job.dynamic_input))),
                    ("output", ", ".join(format_files(
                        job, job.output, job.ruleio,
                        job.dynamic_output))),
                    ("log", job.log),
                    ("reason",
                        self.dag.reason(job) if self.printreason else None)):
                    if value:
                        desc.append(format_ruleitem(name, value))
                priority = self.dag.priority(job)
                if priority > 1:
                    desc.append(format_ruleitem(
                        "priority", "highest"
                        if priority == Job.HIGHEST_PRIORITY
                        else priority))
                if self.printthreads and job.threads > 1:
                    desc.append(format_ruleitem("threads", job.threads))
        if self.printshellcmds and job.shellcmd:
            desc.append(job.shellcmd)
        if desc:
            logger.info("\n".join(desc))
            if job.dynamic_output:
                logger.warning("Subsequent jobs will be added dynamically "
                    "depending on the output of this rule")
Example #15
def remove(file, remove_non_empty_dir=False):
    if os.path.exists(file):
        if os.path.isdir(file):
            if remove_non_empty_dir:
                shutil.rmtree(file)
            else:
                try:
                    os.removedirs(file)
                except OSError as e:
                    # skip non empty directories
                    if e.errno == 39:
                        logger.info("Skipped removing empty directory {}".format(e.filename))
                    else:
                        logger.warning(str(e))
        else:
            os.remove(file)
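A quick, self-contained check of the errno branch above; `errno.ENOTEMPTY` is the portable spelling of the hard-coded 39 (which is Linux-specific):

import errno
import logging
import os
import tempfile

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

workdir = tempfile.mkdtemp()
open(os.path.join(workdir, "keep.txt"), "w").close()

try:
    os.removedirs(workdir)          # refuses to delete non-empty directories
except OSError as e:
    if e.errno == errno.ENOTEMPTY:  # == 39 on Linux, matching the check above
        logger.info("Skipped removing non-empty directory {}".format(e.filename))
    else:
        logger.warning(str(e))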
Example #16
    def include(self, snakefile,
                overwrite_first_rule=False,
                print_compilation=False,
                overwrite_shellcmd=None):
        """
        Include a snakefile.
        """
        # check if snakefile is a path to the filesystem
        if not urllib.parse.urlparse(snakefile).scheme:
            if not os.path.isabs(snakefile) and self.included_stack:
                current_path = os.path.dirname(self.included_stack[-1])
                snakefile = os.path.join(current_path, snakefile)
            # Could still be an url if relative import was used
            if not urllib.parse.urlparse(snakefile).scheme:
                snakefile = os.path.abspath(snakefile)
        # else it could be an url.
        # at least we don't want to modify the path for clarity.

        if snakefile in self.included:
            logger.info("Multiple include of {} ignored".format(snakefile))
            return
        self.included.append(snakefile)
        self.included_stack.append(snakefile)

        global workflow

        workflow = self

        first_rule = self.first_rule
        code, linemap, rulecount = parse(snakefile,
                                         overwrite_shellcmd=self.overwrite_shellcmd,
                                         rulecount=self._rulecount)
        self._rulecount = rulecount

        if print_compilation:
            print(code)

        # insert the current directory into sys.path
        # this allows to import modules from the workflow directory
        sys.path.insert(0, os.path.dirname(snakefile))

        self.linemaps[snakefile] = linemap
        exec(compile(code, snakefile, "exec"), self.globals)
        if not overwrite_first_rule:
            self.first_rule = first_rule
        self.included_stack.pop()
Example #17
    def __init__(
        self,
        path,
        job,
        caption,
        env,
        category,
        workflow,
        wildcards_overwrite=None,
        mode_embedded=True,
        aux_files=None,
        name_overwrite=None,
    ):
        self.name_overwrite = name_overwrite
        self.mode_embedded = mode_embedded
        self.path = path
        self.target = os.path.basename(path)
        self.size = os.path.getsize(self.path)
        logger.info("Adding {} ({:.2g} MB).".format(self.name, self.size / 1e6))
        self.raw_caption = caption
        self.mime, _ = mime_from_file(self.path)
        self.workflow = workflow

        h = hashlib.sha256()
        h.update(path.encode())

        self.id = h.hexdigest()
        self.job = job
        self._wildcards = (
            job.wildcards if wildcards_overwrite is None else wildcards_overwrite
        )
        self.wildcards = logging.format_wildcards(self._wildcards)
        self.params = (
            logging.format_dict(job.params).replace("\n", r"\n").replace('"', r"\"")
        )
        self.category = category

        self.aux_files = aux_files or []

        self.data_uri = self._data_uri()
        self.png_uri = self._png_uri()
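A minimal sketch of the size logging and sha256-of-path identifier used in this constructor, run on a throwaway temporary file:

import hashlib
import logging
import os
import tempfile

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

fd, path = tempfile.mkstemp(suffix=".txt")
os.close(fd)
with open(path, "w") as f:
    f.write("example report payload\n")

size = os.path.getsize(path)
logger.info("Adding {} ({:.2g} MB).".format(os.path.basename(path), size / 1e6))

# Stable identifier derived from the path, as in the constructor above.
file_id = hashlib.sha256(path.encode()).hexdigest()
logger.info("Report file id: {}".format(file_id))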
Example #18
    def createSampleFileMapping(self, sample_annotation):
        """
        create a sample file mapping with unique entries of existing files
            columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ]
        """

        assay_mapping = {'RNA_ID': 'RNA_BAM_FILE', 'DNA_ID': 'DNA_VCF_FILE'}

        assay_subsets = []
        for id_, file_type in assay_mapping.items():
            df = sample_annotation[[id_, file_type]].drop_duplicates().copy()
            df.rename(columns={
                id_: 'ID',
                file_type: 'FILE_PATH'
            },
                      inplace=True)
            df['ASSAY'] = id_
            df['FILE_TYPE'] = file_type
            assay_subsets.append(df)

        file_mapping = pd.concat(assay_subsets)

        # cleaning SAMPLE_FILE_MAPPING
        file_mapping.dropna(inplace=True)
        existent = [
            pathlib.Path(x).exists() for x in file_mapping["FILE_PATH"]
        ]
        if sum(existent) < file_mapping.shape[0]:
            logger.info(
                "WARNING: there are files in the sample annotation that do not exist"
            )
        file_mapping = file_mapping[existent].drop_duplicates()
        if file_mapping.shape[0] == 0:
            raise ValueError(
                "No files exist in sample annotation. Please check your sample annotation."
            )

        file_mapping.to_csv(self.getProcDataDir() + "/file_mapping.csv",
                            index=False)

        return file_mapping
Example #19
def advanced_argument_conversion(arg_dict):
    """Experimental adjustment of sbatch arguments to the given or default partition."""
    # Currently not adjusting for multiple node jobs
    nodes = int(arg_dict.get("nodes", 1))
    if nodes > 1:
        return arg_dict
    partition = arg_dict.get("partition", None) or _get_default_partition()
    constraint = arg_dict.get("constraint", None)
    ncpus = int(arg_dict.get("cpus-per-task", 1))
    runtime = arg_dict.get("time", None)
    config = _get_cluster_configuration(partition, constraint, arg_dict.get("mem", 0))
    mem = arg_dict.get("mem", ncpus * min(config["MEMORY_PER_CPU"]))
    if mem > max(config["MEMORY"]):
        logger.info(
            f"requested memory ({mem}) > max memory ({max(config['MEMORY'])}); "
            "adjusting memory settings"
        )
        mem = max(config["MEMORY"])

    # Calculate available memory as defined by the number of requested
    # cpus times memory per cpu
    AVAILABLE_MEM = ncpus * min(config["MEMORY_PER_CPU"])
    # Add additional cpus if memory is larger than AVAILABLE_MEM
    if mem > AVAILABLE_MEM:
        logger.info(
            f"requested memory ({mem}) > "
            f"ncpus x MEMORY_PER_CPU ({AVAILABLE_MEM}); "
            "trying to adjust number of cpus up"
        )
        ncpus = int(math.ceil(mem / min(config["MEMORY_PER_CPU"])))
    if ncpus > max(config["CPUS"]):
        logger.info(
            f"ncpus ({ncpus}) > available cpus ({max(config['CPUS'])}); "
            "adjusting number of cpus down"
        )
        ncpus = min(int(max(config["CPUS"])), ncpus)
    adjusted_args = {"mem": int(mem), "cpus-per-task": ncpus}

    # Update time. If requested time is larger than maximum allowed time, reset
    if runtime:
        runtime = time_to_minutes(runtime)
        time_limit = max(config["TIMELIMIT_MINUTES"])
        if runtime > time_limit:
            logger.info(
                f"time ({runtime}) > time limit {time_limit}; adjusting time down"
            )
            adjusted_args["time"] = time_limit

    # update and return
    arg_dict.update(adjusted_args)
    return arg_dict
Example #20
    def unshadow_output(self, job):
        """ Move files from shadow directory to real output paths. """
        if not job.shadow_dir or not job.expanded_output:
            return
        for real_output in chain(job.expanded_output, job.log):
            shadow_output = job.shadowed_path(real_output).file
            # Remake absolute symlinks as relative
            if os.path.islink(shadow_output):
                dest = os.readlink(shadow_output)
                if os.path.isabs(dest):
                    rel_dest = os.path.relpath(dest, job.shadow_dir)
                    os.remove(shadow_output)
                    os.symlink(rel_dest, shadow_output)

            if os.path.realpath(shadow_output) == os.path.realpath(
                    real_output):
                continue
            logger.info("Moving shadow output {} to destination {}".format(
                shadow_output, real_output))
            shutil.move(shadow_output, real_output)
        shutil.rmtree(job.shadow_dir)
Example #21
 def pull(self, dryrun=False):
     if self.is_local:
         return
     if dryrun:
         logger.info("Singularity image {} will be pulled.".format(
             self.url))
         return
     logger.debug("Singularity image location: {}".format(self.path))
     if not os.path.exists(self.path):
         logger.info("Pulling singularity image {}.".format(self.url))
         try:
             p = subprocess.check_output([
                 "singularity", "pull", "--name", "{}.simg".format(
                     self.hash), self.url
             ],
                                         cwd=self._img_dir,
                                         stderr=subprocess.STDOUT)
         except subprocess.CalledProcessError as e:
             raise WorkflowError("Failed to pull singularity image "
                                 "from {}:\n{}".format(
                                     self.url, e.stdout.decode()))
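A generic sketch of the pull-only-if-missing pattern above. The actual `singularity pull` call is replaced by a harmless Python subprocess so the snippet runs without Singularity installed; the image name is hypothetical:

import logging
import os
import subprocess
import sys
import tempfile

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

img_dir = tempfile.mkdtemp()
image_path = os.path.join(img_dir, "abc123.simg")   # hypothetical image hash

if not os.path.exists(image_path):
    logger.info("Pulling image into {}.".format(img_dir))
    try:
        # Stand-in for: singularity pull --name abc123.simg <url>
        subprocess.check_output(
            [sys.executable, "-c", "open(r'{}', 'w').close()".format(image_path)],
            cwd=img_dir,
            stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("Failed to pull image:\n{}".format(e.output.decode()))
else:
    logger.debug("Image already cached at {}".format(image_path))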
Example #22
    def handle_temp(self, job):
        """ Remove temp files if they are no longer needed. """
        if self.notemp:
            return

        needed = lambda job_, f: any(
            f in files for j, files in self.depending[job_].items()
            if not self.finished(j) and self.needrun(j) and j != job)

        def unneeded_files():
            for job_, files in self.dependencies[job].items():
                for f in job_.temp_output & files:
                    if not needed(job_, f):
                        yield f
            for f in filterfalse(partial(needed, job), job.temp_output):
                if not f in self.targetfiles:
                    yield f

        for f in unneeded_files():
            logger.info("Removing temporary output file {}.".format(f))
            f.remove(remove_non_empty_dir=True)
Example #23
def fastqc():
    # input: expand("data/xenograft-20201201/fastq/{{fq_id}}_{rg}.fq.gz", rg=[1, 2])
    # output: expand("results/20201130-week-49/xenograft/fastqc/{{fq_id}}_{rg}_fastqc.{ext}", rg=[1, 2], ext=["html", "zip"])
    with TemporaryDirectory() as tempdir:
        job_label = snakemake.params.get("label")
        if job_label:
            logger.info(f"Job started: {job_label}")
        logger.info(f"Using temporary directory: {tempdir}")

        cmd = f"fastqc -t {snakemake.threads} -o {tempdir} -f fastq {snakemake.input}"
        logger.info(cmd)
        shell(cmd)

        destdir = os.path.dirname(snakemake.output[0])
        logger.info(f"Copying to destination: {destdir}")

        for f in os.listdir(tempdir):
            if f.endswith("fastqc.html") or f.endswith("fastqc.zip"):
                logger.info("Copying {f}")
                full_path = path.join(tempdir, f)
                shell("mv {full_path} {destdir}")
Example #24
    def _handle_error(self, job):
        """Clear jobs and stop the workflow.

        If Snakemake is configured to restart jobs then the job might have
        "restart_times" left and we just decrement and let the scheduler
        try to run the job again.
        """
        self.get_executor(job).handle_job_error(job)
        self.running.remove(job)
        self._free_resources(job)
        # attempt starts counting from 1, but the first attempt is not
        # a restart, hence we subtract 1.
        if job.restart_times > job.attempt - 1:
            logger.info("Trying to restart job {}.".format(self.dag.jobid(job)))
            job.attempt += 1
        else:
            self._errors = True
            self.failed.add(job)
            if self.keepgoing:
                logger.info("Job failed, going on with independent jobs.")
        self._open_jobs.release()
Example #25
    def handle_temp(self, job):
        """ Remove temp files if they are no longer needed. """
        if self.notemp:
            return

        needed = lambda job_, f: any(
            f in files for j, files in self.depending[job_].items()
            if not self.finished(j) and self.needrun(j) and j != job)

        def unneeded_files():
            for job_, files in self.dependencies[job].items():
                for f in job_.temp_output & files:
                    if not needed(job_, f):
                        yield f
            for f in filterfalse(partial(needed, job), job.temp_output):
                if not f in self.targetfiles:
                    yield f

        for f in unneeded_files():
            logger.info("Removing temporary output file {}.".format(f))
            f.remove()
Example #26
    def migrate_v1_to_v2(self):
        logger.info("Migrating .snakemake folder to new format...")
        i = 0
        for path, _, filenames in os.walk(self._metadata_path):
            path = Path(path)
            for filename in filenames:
                with open(path / filename, "r") as f:
                    try:
                        record = json.load(f)
                    except json.JSONDecodeError:
                        continue  # not a properly formatted JSON file

                    if record.get("incomplete", False):
                        target_path = Path(
                            self._incomplete_path) / path.relative_to(
                                self._metadata_path)
                        os.makedirs(target_path, exist_ok=True)
                        shutil.copyfile(
                            path / filename,
                            target_path / filename,
                        )
                i += 1
                # this can take a while for large folders...
                if (i % 10000) == 0 and i > 0:
                    logger.info("{} files migrated".format(i))

        logger.info("Migration complete")
Example #27
    def updateParamFiles(self,path,filename,sa_df,param_cols,ID,include):
        """
        path: string. path to where to write the param files (separate from filename to build path if not existing)
        filename: string. name of file to write in path
        param_cols: list. list of sample annotation columns to use to determine parameters for that job
        ID: list. list containing the identifier for the sa_col. either the [sample name] or the [samples in drop group]
        include: boolean. True- include all of the columns in param_cols to build param file. False- use all other columns in SA
        """
        # build the path to the param file
        path.mkdir(parents=True, exist_ok=True)

        param_cols = [col for col in param_cols if col in sa_df.columns] # remove params that are not columns in SA table

        # take the complement of columns if indicated by !include
        if not include:
            param_cols = [col for col in sa_df.columns if col not in param_cols]
    
        # full path of the param file to write
        true_filename = "{path}/{filename}".format(path=path, filename=filename)

    
        # if a file by the desired name exists. 
        if os.path.isfile(true_filename):

            # replace any strings of nan with "NA"
            current_SA = sa_df.loc[sa_df["RNA_ID"].isin(ID),param_cols].reset_index(drop = True)
            current_SA = current_SA.replace("nan","NA").fillna(value = "NA")
            old_SA = pd.read_csv(true_filename).reset_index(drop = True).fillna(value = "NA")

            if not current_SA.equals(old_SA):
                # if they differ, overwrite the existing file with the current SA table
                logger.info("{} Param Files do not match. Updating to current Sample Annotation\n".format(filename))
                current_SA.to_csv(true_filename, index=False, header=True, na_rep="NA")
                
        # if the param file doesn't exist, just write to the desired file
        else:
            logger.info("{} Param File did not already exist. Writing it\n".format(filename))
            sa_df.loc[sa_df["RNA_ID"].isin(ID), param_cols].to_csv(true_filename, index=False, header=True, na_rep="NA")
Example #28
def wait_for_files(files, latency_wait=3, force_stay_on_remote=False, ignore_pipe=False):
    """Wait for given files to be present in filesystem."""
    files = list(files)
    def get_missing():
        return [
            f for f in files
            if not (f.exists_remote
                    if (isinstance(f, _IOFile) and
                       f.is_remote and
                       (force_stay_on_remote or f.should_stay_on_remote))
                    else os.path.exists(f) if not (is_flagged(f, "pipe") and ignore_pipe) else True)]

    missing = get_missing()
    if missing:
        logger.info("Waiting at most {} seconds for missing files.".format(
            latency_wait))
        for _ in range(latency_wait):
            if not get_missing():
                return
            time.sleep(1)
        raise IOError("Missing files after {} seconds:\n{}".format(
            latency_wait, "\n".join(get_missing())))
Example #29
    def _error(self, job):
        """Clear jobs and stop the workflow.

        If Snakemake is configured to restart jobs then the job might have
        "restart_times" left and we just decrement and let the scheduler
        try to run the job again.
        """
        with self._lock:
            self.running.remove(job)
            self._free_resources(job)
            self._open_jobs.set()
            if job.restart_times > 0:
                msg = (("Trying to restart job for rule {} with "
                        "wildcards {}").format(job.rule.name,
                                               job.wildcards_dict))
                logger.info(msg)
                job.restart_times -= 1
            else:
                self._errors = True
                self.failed.add(job)
                if self.keepgoing:
                    logger.info("Job failed, going on with independent jobs.")
Example #30
    def run(self,
            job,
            callback=None,
            submit_callback=None,
            error_callback=None):
        super()._run(job)
        jobscript = self.get_jobscript(job)
        self.spawn_jobscript(job, jobscript)

        try:
            drmaa_args = job.format_wildcards(
                self.drmaa_args, cluster=self.cluster_wildcards(job))
        except AttributeError as e:
            raise WorkflowError(str(e), rule=job.rule)

        import drmaa
        try:
            jt = self.session.createJobTemplate()
            jt.remoteCommand = jobscript
            jt.nativeSpecification = drmaa_args
            jt.jobName = os.path.basename(jobscript)

            jobid = self.session.runJob(jt)
        except (drmaa.errors.InternalException,
                drmaa.errors.InvalidAttributeValueException) as e:
            print_exception(WorkflowError("DRMAA Error: {}".format(e)),
                            self.workflow.linemaps)
            error_callback(job)
            return
        logger.info("Submitted DRMAA job (jobid {})".format(jobid))
        self.submitted.append(jobid)
        self.session.deleteJobTemplate(jt)

        submit_callback(job)

        with self.lock:
            self.active_jobs.append(
                DRMAAClusterJob(job, jobid, callback, error_callback,
                                jobscript))
Example #31
    def cleanup(self):
        """ Cleanup output files. """
        to_remove = [f for f in self.expanded_output if f.exists]

        to_remove.extend(
            [
                f
                for f in self.remote_output
                if (
                    f.exists_remote
                    if (f.is_remote and f.should_stay_on_remote)
                    else f.exists_local
                )
            ]
        )
        if to_remove:
            logger.info(
                "Removing output files of failed job {}"
                " since they might be corrupted:\n{}".format(self, ", ".join(to_remove))
            )
            for f in to_remove:
                f.remove()
Example #32
    def subsetGroups(self, ids_by_group, subset_groups, warn=30, error=10):
        if subset_groups is None:
            subset = ids_by_group
        else:
            subset_groups = [
                subset_groups
            ] if subset_groups.__class__ == str else subset_groups
            subset = {
                gr: ids
                for gr, ids in ids_by_group.items() if gr in subset_groups
            }

        # iterate over the selected groups (also works when subset_groups is None)
        for group in subset:
            if len(subset[group]) < error:
                raise ValueError(
                    f'Too few IDs in DROP_GROUP {group}, please ensure that it has at least {error} IDs'
                )
            elif len(subset[group]) < warn:
                logger.info(
                    f'WARNING: Less than {warn} IDs in DROP_GROUP {group}')

        return subset
Example #33
    def execute_script(self, fname, edit=None):
        import nbformat

        fname_out = self.log.get("notebook", None)
        if fname_out is None or edit:
            output_parameter = ""
        else:
            fname_out = os.path.join(os.getcwd(), fname_out)
            output_parameter = "--output {fname_out:q}"

        if edit is not None:
            logger.info("Opening notebook for editing.")
            cmd = (
                "jupyter notebook --browser ':' --no-browser --log-level ERROR --ip {edit.ip} --port {edit.port} "
                "--NotebookApp.quit_button=True {{fname:q}}".format(edit=edit))
        else:
            cmd = (
                "jupyter-nbconvert --log-level ERROR --execute {output_parameter} "
                "--to notebook --ExecutePreprocessor.timeout=-1 {{fname:q}}".
                format(output_parameter=output_parameter))

        if ON_WINDOWS:
            fname = fname.replace("\\", "/")
            fname_out = fname_out.replace("\\",
                                          "/") if fname_out else fname_out

        self._execute_cmd(cmd, fname_out=fname_out, fname=fname)

        if edit:
            logger.info("Saving modified notebook.")
            nb = nbformat.read(fname, as_version=4)

            self.remove_preamble_cell(nb)

            # clean up all outputs
            for cell in nb["cells"]:
                cell["outputs"] = []

            nbformat.write(nb, self.local_path)
Example #34
    def code(self):
        try:
            from pygments.lexers import get_lexer_by_name
            from pygments.formatters import HtmlFormatter
            from pygments import highlight
            import pygments.util
        except ImportError:
            raise WorkflowError(
                "Python package pygments must be installed to create reports."
            )
        source, language = None, None
        if self._rule.shellcmd is not None:
            source = self._rule.shellcmd
            language = "bash"
        elif self._rule.script is not None:
            logger.info("Loading script code for rule {}".format(self.name))
            _, source, language = script.get_source(
                self._rule.script, self._rule.basedir
            )
            source = source.decode()
        elif self._rule.wrapper is not None:
            logger.info("Loading wrapper code for rule {}".format(self.name))
            _, source, language = script.get_source(
                wrapper.get_script(
                    self._rule.wrapper, prefix=self._rule.workflow.wrapper_prefix
                )
            )
            source = source.decode()

        try:
            lexer = get_lexer_by_name(language)
            return highlight(
                source,
                lexer,
                HtmlFormatter(linenos=True, cssclass="source", wrapcode=True),
            )
        except pygments.util.ClassNotFound:
            return "<pre><code>source</code></pre>"
Example #35
def get_config(config_dir, config_name):
    """
    # search order:
    config_some_profile_name|variant.yaml
    config_some_profile|variant.yaml
    config_some|variant.yaml
    config|variant.yaml
    config_some_profile_name.yaml
    config_some_profile.yaml
    config_some.yaml
    config.yaml
    """
    config_dict = {}
    if os.path.isdir(os.path.join(config_dir, 'configs')):
        config_dir = os.path.join(config_dir, 'configs')

    try_config_name = copy(config_name)

    keep_variant = True
    while not config_dict:
        try:
            config_dict = load_config_file(os.path.join(config_dir,
                                                        try_config_name))
        except FileNotFoundError:
            parent_config_name = get_parent_config_name(try_config_name)

            if parent_config_name:
                logger.info(f"{try_config_name} not found, "
                            f"trying {parent_config_name}")
                try_config_name = parent_config_name
            else:
                if keep_variant:
                    # retry once without the |variant suffix, then give up
                    keep_variant = False
                    name, ext = os.path.splitext(config_name)
                    try_config_name = name.split('|')[0] + ext
                else:
                    raise FileNotFoundError("No corresponding config file found!")

    return config_dict
Example #36
File: setupDrop.py Project: adeslatt/drop
def installRPackages(config: DropConfig = None):
    logger.info("check for missing R packages")
    script = Path(drop.__file__).parent / "installRPackages.R"
    requirements = Path(drop.__file__).parent / 'requirementsR.txt'

    # install main packages
    response = subprocess.run(["Rscript", script, requirements],
                              stderr=subprocess.STDOUT)
    response.check_returncode()

    # install pipeline depending packages
    if config is not None:
        pkg_assembly_name = config.getBSGenomeName()
        response = subprocess.run(["Rscript", script, pkg_assembly_name],
                                  stderr=subprocess.STDOUT)
        response.check_returncode()

        pkg_mafdb_name = config.getMafDbName()
        if pkg_mafdb_name is not None and config.get("mae").get(
                'addAF') is True:
            response = subprocess.run(["Rscript", script, pkg_mafdb_name],
                                      stderr=subprocess.STDOUT)
            response.check_returncode()
Example #37
def bam_stats():
    with TemporaryDirectory() as tempdir:
        job_label = snakemake.params.get("label")
        if job_label:
            logger.info(f"Job started: {job_label}")
        logger.info(f"Using temporary directory: {tempdir}")

        nthreads = snakemake.threads
        total_mem = snakemake.resources.mem_mb
        logger.info(f"# of cores available: {nthreads}")
        logger.info(f"Total memory available: {total_mem}MB")

        shell("samtools stats {snakemake.input.bam} > {tempdir}/stats.txt")
        shell(
            "samtools idxstats {snakemake.input.bam} > {tempdir}/idxstats.txt")
        shell(
            "samtools flagstats {snakemake.input.bam} > {tempdir}/flagstats.txt"
        )

        logger.info(f"Copying to destination")
        shell("mv {tempdir}/stats.txt {snakemake.output.stats}")
        shell("mv {tempdir}/idxstats.txt {snakemake.output.idxstats}")
        shell("mv {tempdir}/flagstats.txt {snakemake.output.flagstats}")
Example #38
    def store(self, job: Job):
        """
        Store generated job output in the cache.
        """

        with TemporaryDirectory(dir=self.path) as tmpdirname:
            tmpdir = Path(tmpdirname)

            for outputfile, cachefile in self.get_outputfiles_and_cachefiles(
                    job):
                self.check_writeable(cachefile)
                logger.info(
                    "Moving output file {} to cache.".format(outputfile))

                tmp = tmpdir / cachefile.name
                # First move is performed into a tempdir (it might involve a copy if not on the same FS).
                # This is important, such that network filesystem latency
                # does not lead to concurrent writes to the same file.
                # We can use the plain copy method of shutil, because we do not care about the metadata.
                shutil.move(outputfile, tmp, copy_function=shutil.copy)
                # make readable/writeable for all
                os.chmod(
                    tmp,
                    stat.S_IRUSR
                    | stat.S_IWUSR
                    | stat.S_IRGRP
                    | stat.S_IWGRP
                    | stat.S_IROTH
                    | stat.S_IWOTH,
                )

                # Move to the actual path (now we are on the same FS, hence move is atomic).
                # Here we use the default copy function, also copying metadata (which is important here).
                # It will always work, because we are guaranteed to be in the same FS.
                shutil.move(tmp, cachefile)
                # now restore the outputfile via a symlink
                self.symlink(cachefile, outputfile, utime=False)
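A self-contained sketch of the two-stage move used above: stage one into a temporary directory on the cache filesystem (may copy across filesystems), stage two an atomic rename into place. Paths and the cache key are hypothetical, and the final symlink step assumes a POSIX filesystem:

import logging
import os
import shutil
import stat
import tempfile
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

cache_dir = Path(tempfile.mkdtemp())            # stands in for the persistent cache
outputfile = Path(tempfile.mkdtemp()) / "result.txt"
outputfile.write_text("cached payload\n")
cachefile = cache_dir / "deadbeef"              # hypothetical cache key

with tempfile.TemporaryDirectory(dir=cache_dir) as tmpdirname:
    tmp = Path(tmpdirname) / cachefile.name
    logger.info("Moving output file {} to cache.".format(outputfile))
    # Stage 1: may copy across filesystems; concurrent readers never see it.
    shutil.move(str(outputfile), str(tmp), copy_function=shutil.copy)
    os.chmod(tmp, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
    # Stage 2: same filesystem as the cache, so this rename is atomic.
    shutil.move(str(tmp), str(cachefile))

# Restore the job's output as a symlink into the cache.
os.symlink(cachefile, outputfile)
logger.info("Output restored as symlink: {} -> {}".format(outputfile, cachefile))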
Example #39
File: local.py Project: wook2014/snakemake
    def store(self, job: Job):
        """
        Store generated job output in the cache.
        """

        if not os.access(self.path, os.W_OK):
            raise WorkflowError(
                "Cannot access cache location {}. Please ensure that "
                "it is present and writeable.".format(self.path))
        with TemporaryDirectory(dir=self.path) as tmpdirname:
            tmpdir = Path(tmpdirname)

            for outputfile, cachefile in self.get_outputfiles_and_cachefiles(
                    job):
                if not os.path.exists(outputfile):
                    raise WorkflowError(
                        "Cannot move output file {} to cache. It does not exist "
                        "(maybe it was not created by the job?).")
                self.check_writeable(cachefile)
                logger.info(
                    "Moving output file {} to cache.".format(outputfile))

                tmp = tmpdir / cachefile.name
                # First move is performed into a tempdir (it might involve a copy if not on the same FS).
                # This is important, such that network filesystem latency
                # does not lead to concurrent writes to the same file.
                # We can use the plain copy method of shutil, because we do not care about the metadata.
                shutil.move(outputfile, tmp, copy_function=shutil.copy)

                self.set_permissions(tmp)

                # Move to the actual path (now we are on the same FS, hence move is atomic).
                # Here we use the default copy function, also copying metadata (which is important here).
                # It will always work, because we are guaranteed to be in the same FS.
                shutil.move(tmp, cachefile)
                # now restore the outputfile via a symlink
                self.symlink(cachefile, outputfile, utime=False)
Example #40
    def run(self, job,
            callback=None,
            submit_callback=None,
            error_callback=None):
        super()._run(job)
        jobscript = self.get_jobscript(job)
        self.spawn_jobscript(job, jobscript)

        try:
            drmaa_args = job.format_wildcards(
                self.drmaa_args,
                cluster=self.cluster_wildcards(job))
        except AttributeError as e:
            raise WorkflowError(str(e), rule=job.rule)

        import drmaa
        try:
            jt = self.session.createJobTemplate()
            jt.remoteCommand = jobscript
            jt.nativeSpecification = drmaa_args
            jt.jobName = os.path.basename(jobscript)

            jobid = self.session.runJob(jt)
        except (drmaa.errors.InternalException,
                drmaa.errors.InvalidAttributeValueException) as e:
            print_exception(WorkflowError("DRMAA Error: {}".format(e)),
                            self.workflow.linemaps)
            error_callback(job)
            return
        logger.info("Submitted DRMAA job (jobid {})".format(jobid))
        self.submitted.append(jobid)
        self.session.deleteJobTemplate(jt)

        submit_callback(job)

        with self.lock:
            self.active_jobs.append(DRMAAClusterJob(job, jobid, callback, error_callback, jobscript))
Example #41
    def createSampleFileMapping(self):
        """
        create a sample file mapping with unique entries of existing files
            columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ]
        """

        assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE'], 'DNA_ID': ['DNA_VCF_FILE']}
        assay_subsets = []
        for id_, file_types in assay_mapping.items():
            for file_type in file_types:
                df = self.annotationTable[[id_, file_type]].dropna().drop_duplicates().copy()
                df.rename(columns={id_: 'ID', file_type: 'FILE_PATH'}, inplace=True)
                df['ASSAY'] = id_
                df['FILE_TYPE'] = file_type
                assay_subsets.append(df)
        file_mapping = pd.concat(assay_subsets)

        # cleaning SAMPLE_FILE_MAPPING
        file_mapping.dropna(inplace=True)
        file_mapping.drop_duplicates(inplace=True)

        # check for missing files
        existing = utils.checkFileExists(file_mapping["FILE_PATH"])
        if len(existing) == 0:
            message = "File mapping is empty. "
            message += "Please check that all files in your sample annotation exist."
            raise FileNotFoundError(message)
        elif len(existing) < file_mapping.shape[0]:
            missing = set(file_mapping["FILE_PATH"]) - set(existing)
            logger.info(f"WARNING: {len(missing)} files missing in samples annotation. Ignoring...")
            logger.debug(f"Missing files: {missing}")

            file_mapping = file_mapping[file_mapping["FILE_PATH"].isin(existing)]

        # write file mapping
        file_mapping.to_csv(self.root / "file_mapping.csv", index=False)
        return file_mapping
Example #42
def test_wrappers(args_dict):
    """"""
    # Cleanup data and leave
    if args_dict["clean_output"]:
        logger.info("Removing output data")
        for wrapper_name in args_dict["wrappers"]:
            wrapper_workdir = os.path.join(args_dict["workdir"], wrapper_name)
            shutil.rmtree(wrapper_workdir, ignore_errors=True)
        sys.exit()

    # Test wrappers
    for wrapper_name in args_dict["wrappers"]:
        logger.warning("Testing Wrapper {}".format(wrapper_name))
        try:
            snakefile = get_snakefile_fn(workflow_dir=WRAPPER_DIR,
                                         workflow=wrapper_name)
            wrapper_workdir = os.path.join(args_dict["workdir"], wrapper_name)
            logger.debug("Working in directory: {}".format(wrapper_workdir))

            #Run Snakemake through the API
            snakemake(snakefile=snakefile,
                      workdir=wrapper_workdir,
                      config={"data_dir": DATA_DIR},
                      wrapper_prefix=WRAPPER_PREFIX,
                      use_conda=True,
                      cores=args_dict["cores"],
                      verbose=args_dict["verbose"],
                      quiet=args_dict["quiet"])

        finally:
            logger.debug("List of file generated: {}".format(
                os.listdir(wrapper_workdir)))
            shutil.rmtree(os.path.join(wrapper_workdir, ".snakemake"),
                          ignore_errors=True)
            if not args_dict["keep_output"]:
                logger.debug("Removing temporary directory")
                shutil.rmtree(wrapper_workdir, ignore_errors=True)
Example #43
    def printjob(self, job):
        # skip dynamic jobs that will be "executed" only in dryrun mode
        if self.dag.dynamic(job):
            return

        def format_files(job, io, ruleio, dynamicio):
            for f in io:
                f_ = ruleio[f]
                if f in dynamicio:
                    yield "{} (dynamic)".format(f.format_dynamic())
                else:
                    yield f

        priority = self.dag.priority(job)
        logger.job_info(jobid=self.dag.jobid(job),
                        msg=job.message,
                        name=job.rule.name,
                        local=self.workflow.is_local(job.rule),
                        input=list(
                            format_files(job, job.input, job.ruleio,
                                         job.dynamic_input)),
                        output=list(
                            format_files(job, job.output, job.ruleio,
                                         job.dynamic_output)),
                        log=list(job.log),
                        benchmark=job.benchmark,
                        wildcards=job.wildcards_dict,
                        reason=str(self.dag.reason(job)),
                        resources=job.resources_dict,
                        priority="highest"
                        if priority == Job.HIGHEST_PRIORITY else priority,
                        threads=job.threads)

        if job.dynamic_output:
            logger.info("Subsequent jobs will be added dynamically "
                        "depending on the output of this rule")
Example #44
def query_all_division(args):
    """collect info from all division into dataframe indexed on organism name
    """
    BAC_REPLACE = [(' sp ', ' sp. '), (' pv ', ' pv. '), (' str ', ' str. '),
                   (' subsp ', ' subsp. '), ('(', ''), (')', '')]
    if args.kingdoms and not isinstance(args.kingdoms, (list, tuple)):
        args.kingdoms = args.kingdoms.split(',')
    else:
        args.kingdoms = _ens_rest_query(ext="/info/divisions?")

    SP = {}
    for k in args.kingdoms:
        species = _ens_rest_query(ext='/info/species?', params={'division':
                                                                k})['species']
        logger.info('adding {} species from {}'.format(len(species), k))
        for s in species:
            if k == 'EnsemblBacteria':
                name = s['name']
                for old, new in BAC_REPLACE:
                    name = name.replace(old, new)
                s['name'] = name
            SP[s['name']] = s

    df = pd.DataFrame.from_dict(SP, orient='index')
    return df
Example #45
def setupTempFiles(config):
    # create temporary directory
    if not TMP_DIR.exists():
        logger.info(f"create temporary files directory {TMP_DIR}")
        TMP_DIR.mkdir(parents=True)

    # save config file
    CONF_FILE = getConfFile(config)
    with open(CONF_FILE, 'w') as f:
        yaml.safe_dump(config.copy(), f)

    done_files = {}
    for method in METHODS.keys():

        # final rule output file
        done_file = getMethodPath(method, type_='final_file', str_=False)
        done_files[method] = str(done_file)

        # create module tmp Dir if missing
        tmp_dir = getMethodPath(method, type_='tmp_dir', str_=False)
        if not tmp_dir.exists():
            tmp_dir.mkdir(parents=True)

    return TMP_DIR, CONF_FILE, done_files
Example #46
def installRPackages():
    logger.info("check for missing R packages")
    script = pathlib.Path(drop.__file__).parent / "installRPackages.R"
    requirements = pathlib.Path(drop.__file__).parent / 'requirementsR.txt'

    #packages = [x.strip().split("#")[0] for x in open(requirements, 'r')]
    #packages = [x for x in packages if x != '']

    #for package in packages:
    #    logger.info(f"check {package}")
    call = subprocess.Popen(["Rscript", script, requirements],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)

    # check output for errors
    stdout, stderr = call.communicate()
    if stderr:
        print(stderr)
        exit(1)
    stdout = stdout.decode()
    ep = re.compile("Execution halted|^ERROR", re.M)
    if ep.search(stdout):
        print(stdout)
        exit(1)
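A generic sketch of the pattern above (capture combined stdout/stderr, then scan it for error markers), using a harmless Python subprocess in place of Rscript so it runs anywhere:

import logging
import re
import subprocess
import sys

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Stand-in for: Rscript installRPackages.R requirementsR.txt
call = subprocess.Popen(
    [sys.executable, "-c", "print('all packages installed')"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT)

stdout, _ = call.communicate()
stdout = stdout.decode()

# Same error heuristic as above: R prints "Execution halted" on failure.
if call.returncode != 0 or re.search(r"Execution halted|^ERROR", stdout, re.M):
    logger.error(stdout)
    sys.exit(1)
logger.info("check for missing R packages: OK")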
Example #47
def remove(file, remove_non_empty_dir=False):
    if file.is_remote and file.should_stay_on_remote:
        if file.exists_remote:
            file.remote_object.remove()
    elif os.path.isdir(file) and not os.path.islink(file):
        if remove_non_empty_dir:
            shutil.rmtree(file)
        else:
            try:
                os.removedirs(file)
            except OSError as e:
                # skip non empty directories
                if e.errno == 39:
                    logger.info("Skipped removing non-empty directory {}".format(e.filename))
                else:
                    logger.warning(str(e))
    #Remember that dangling symlinks fail the os.path.exists() test, but
    #we definitely still want to zap them. try/except is the safest way.
    #Also, we don't want to remove the null device if it is an output.
    elif os.devnull != str(file):
        try:
            os.remove(file)
        except FileNotFoundError:
            pass
Example #48
def writeDependencyFile():
    """
    Entry point for writing .wBuild.depend.
    """
    #if not wbuildVersionIsCurrent():
    #    print(bcolors.WARNING + "Version of the project's static .wBuild lib is not the same as the dynamically loaded "
    #                            "wBuild"
    #                            "version. It is strongly recommended to update .wBuild lib using \'wbuild update\'; "
    #                            "otherwise, the consistency of the build can not be guaranteed." + bcolors.ENDC)
    logger.info("Structuring dependencies...")
    conf = Config()
    htmlOutputPath = conf.get("htmlOutputPath")
    logger.debug("Loaded config.\n html output path (key htmlOutputPath): " +
                 htmlOutputPath + "\n")
    scriptsPath = conf.get("scriptsPath")
    wbData = parseWBInfosFromRFiles(script_dir=scriptsPath,
                                    htmlPath=htmlOutputPath)
    mdData = parseMDFiles(script_dir=scriptsPath, htmlPath=htmlOutputPath)
    dependFile = tempfile.NamedTemporaryFile('w', delete=False)
    with dependFile as f:  #start off with the header
        f.write('######\n')
        f.write('# This is an autogenerated snakemake file by wBuild\n')
        f.write('#wBuild by Leonhard Wachutka\n')
        f.write('######\n')
        # write rules
        for r in wbData:
            writeRule(r, f)
        # write md rules
        for r in mdData:
            writeMdRule(r, f)

        # write build index rule
        writeIndexRule(wbData, mdData, f)
    logger.info("Dependencies file generated.\n")

    return (dependFile.name)
Example #49
 def lock_warn_only(self):
     if self.locked:
         logger.info(
             "Error: Directory cannot be locked. This usually "
             "means that another Snakemake instance is running on this directory."
             "Another possiblity is that a previous run exited unexpectedly.")
Example #50
    def cancel(self):
        logger.info("Will exit after finishing currently running jobs.")
        self.shutdown()
Example #51
0
    def handle_protected(self, job):
        """ Write-protect output files that are marked with protected(). """
        for f in job.expanded_output:
            if f in job.protected_output:
                logger.info("Write-protecting output file {}.".format(f))
                f.protect()
Example #52
0
    def handle_touch(self, job):
        """ Touches those output files that are marked for touching. """
        for f in job.expanded_output:
            if f in job.touch_output:
                logger.info("Touching output file {}.".format(f))
                f.touch_or_create()
Example #53
0
    def progress(self):
        """ Display the progress. """
        logger.info("{} of {} steps ({:.0%}) done".format(
            self.finished_jobs, len(self.dag), self.finished_jobs / len(self.dag)))
Example #54
0
    def execute(self,
                targets=None,
                dryrun=False,
                touch=False,
                cores=1,
                nodes=1,
                local_cores=1,
                forcetargets=False,
                forceall=False,
                forcerun=None,
                prioritytargets=None,
                quiet=False,
                keepgoing=False,
                printshellcmds=False,
                printreason=False,
                printdag=False,
                cluster=None,
                cluster_config=None,
                cluster_sync=None,
                jobname=None,
                immediate_submit=False,
                ignore_ambiguity=False,
                printrulegraph=False,
                printd3dag=False,
                drmaa=None,
                stats=None,
                force_incomplete=False,
                ignore_incomplete=False,
                list_version_changes=False,
                list_code_changes=False,
                list_input_changes=False,
                list_params_changes=False,
                summary=False,
                detailed_summary=False,
                latency_wait=3,
                benchmark_repeats=3,
                wait_for_files=None,
                nolock=False,
                unlock=False,
                resources=None,
                notemp=False,
                nodeps=False,
                cleanup_metadata=None,
                subsnakemake=None,
                updated_files=None,
                keep_target_files=False,
                allowed_rules=None,
                greediness=1.0,
                no_hooks=False):

        self.global_resources = dict() if resources is None else resources
        self.global_resources["_cores"] = cores
        self.global_resources["_nodes"] = nodes

        def rules(items):
            return map(self._rules.__getitem__, filter(self.is_rule, items))

        if keep_target_files:

            def files(items):
                return filterfalse(self.is_rule, items)
        else:

            def files(items):
                return map(os.path.relpath, filterfalse(self.is_rule, items))

        if not targets:
            targets = [self.first_rule
                       ] if self.first_rule is not None else list()
        if prioritytargets is None:
            prioritytargets = list()
        if forcerun is None:
            forcerun = list()

        priorityrules = set(rules(prioritytargets))
        priorityfiles = set(files(prioritytargets))
        forcerules = set(rules(forcerun))
        forcefiles = set(files(forcerun))
        targetrules = set(chain(rules(targets),
                                filterfalse(Rule.has_wildcards, priorityrules),
                                filterfalse(Rule.has_wildcards, forcerules)))
        targetfiles = set(chain(files(targets), priorityfiles, forcefiles))
        if forcetargets:
            forcefiles.update(targetfiles)
            forcerules.update(targetrules)

        rules = self.rules
        if allowed_rules:
            rules = [rule for rule in rules if rule.name in set(allowed_rules)]

        if wait_for_files is not None:
            try:
                snakemake.io.wait_for_files(wait_for_files,
                                            latency_wait=latency_wait)
            except IOError as e:
                logger.error(str(e))
                return False

        dag = DAG(
            self, rules,
            dryrun=dryrun,
            targetfiles=targetfiles,
            targetrules=targetrules,
            forceall=forceall,
            forcefiles=forcefiles,
            forcerules=forcerules,
            priorityfiles=priorityfiles,
            priorityrules=priorityrules,
            ignore_ambiguity=ignore_ambiguity,
            force_incomplete=force_incomplete,
            ignore_incomplete=ignore_incomplete or printdag or printrulegraph,
            notemp=notemp)

        self.persistence = Persistence(
            nolock=nolock,
            dag=dag,
            warn_only=dryrun or printrulegraph or printdag or summary or
            list_version_changes or list_code_changes or list_input_changes or
            list_params_changes)

        if cleanup_metadata:
            for f in cleanup_metadata:
                self.persistence.cleanup_metadata(f)
            return True

        dag.init()
        dag.check_dynamic()

        if unlock:
            try:
                self.persistence.cleanup_locks()
                logger.info("Unlocking working directory.")
                return True
            except IOError:
                logger.error("Error: Unlocking the directory {} failed. Maybe "
                             "you don't have the permissions?")
                return False
        try:
            self.persistence.lock()
        except IOError:
            logger.error(
                "Error: Directory cannot be locked. Please make "
                "sure that no other Snakemake process is trying to create "
                "the same files in the following directory:\n{}\n"
                "If you are sure that no other "
                "instances of snakemake are running on this directory, "
                "the remaining lock was likely caused by a kill signal or "
                "a power loss. It can be removed with "
                "the --unlock argument.".format(os.getcwd()))
            return False

        if self.subworkflows and not printdag and not printrulegraph:
            # backup globals
            globals_backup = dict(self.globals)
            # execute subworkflows
            for subworkflow in self.subworkflows:
                subworkflow_targets = subworkflow.targets(dag)
                updated = list()
                if subworkflow_targets:
                    logger.info(
                        "Executing subworkflow {}.".format(subworkflow.name))
                    if not subsnakemake(subworkflow.snakefile,
                                        workdir=subworkflow.workdir,
                                        targets=subworkflow_targets,
                                        updated_files=updated):
                        return False
                    dag.updated_subworkflow_files.update(subworkflow.target(f)
                                                         for f in updated)
                else:
                    logger.info("Subworkflow {}: Nothing to be done.".format(
                        subworkflow.name))
            if self.subworkflows:
                logger.info("Executing main workflow.")
            # rescue globals
            self.globals.update(globals_backup)

        dag.check_incomplete()
        dag.postprocess()

        if nodeps:
            missing_input = [f for job in dag.targetjobs for f in job.input
                             if dag.needrun(job) and not os.path.exists(f)]
            if missing_input:
                logger.error(
                    "Dependency resolution disabled (--nodeps) "
                    "but missing input "
                    "files detected. If this happens on a cluster, please make sure "
                    "that you handle the dependencies yourself or turn of "
                    "--immediate-submit. Missing input files:\n{}".format(
                        "\n".join(missing_input)))
                return False

        updated_files.extend(f for job in dag.needrun_jobs for f in job.output)

        if printd3dag:
            dag.d3dag()
            return True
        elif printdag:
            print(dag)
            return True
        elif printrulegraph:
            print(dag.rule_dot())
            return True
        elif summary:
            print("\n".join(dag.summary(detailed=False)))
            return True
        elif detailed_summary:
            print("\n".join(dag.summary(detailed=True)))
            return True
        elif list_version_changes:
            items = list(
                chain(*map(self.persistence.version_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_code_changes:
            items = list(chain(*map(self.persistence.code_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_input_changes:
            items = list(chain(*map(self.persistence.input_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_params_changes:
            items = list(
                chain(*map(self.persistence.params_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True

        scheduler = JobScheduler(self, dag, cores,
                                 local_cores=local_cores,
                                 dryrun=dryrun,
                                 touch=touch,
                                 cluster=cluster,
                                 cluster_config=cluster_config,
                                 cluster_sync=cluster_sync,
                                 jobname=jobname,
                                 immediate_submit=immediate_submit,
                                 quiet=quiet,
                                 keepgoing=keepgoing,
                                 drmaa=drmaa,
                                 printreason=printreason,
                                 printshellcmds=printshellcmds,
                                 latency_wait=latency_wait,
                                 benchmark_repeats=benchmark_repeats,
                                 greediness=greediness)

        if not dryrun and not quiet:
            if len(dag):
                if cluster or cluster_sync or drmaa:
                    logger.resources_info(
                        "Provided cluster nodes: {}".format(nodes))
                else:
                    logger.resources_info("Provided cores: {}".format(cores))
                    logger.resources_info("Rules claiming more threads will be scaled down.")
                provided_resources = format_resources(resources)
                if provided_resources:
                    logger.resources_info(
                        "Provided resources: " + provided_resources)
                ignored_resources = format_resource_names(
                    set(resource for job in dag.needrun_jobs for resource in
                        job.resources_dict if resource not in resources))
                if ignored_resources:
                    logger.resources_info(
                        "Ignored resources: " + ignored_resources)
                logger.run_info("\n".join(dag.stats()))
            else:
                logger.info("Nothing to be done.")
        if dryrun and not len(dag):
            logger.info("Nothing to be done.")

        success = scheduler.schedule()

        if success:
            if dryrun:
                if not quiet and len(dag):
                    logger.run_info("\n".join(dag.stats()))
            elif stats:
                scheduler.stats.to_json(stats)
            if not dryrun and not no_hooks:
                self._onsuccess(logger.get_logfile())
            return True
        else:
            if not dryrun and not no_hooks:
                self._onerror(logger.get_logfile())
            return False
Example #55
0
    def list_resources(self):
        for resource in set(resource for rule in self.rules
                            for resource in rule.resources):
            if resource not in "_cores _nodes".split():
                logger.info(resource)
Example #56
0
    def upload_to_remote(self):
        if self.is_remote:
            logger.info("Uploading to remote: {}".format(self.file))
            self.remote_object.upload()
Example #57
0
def snakemake(snakefile,
              listrules=False,
              list_target_rules=False,
              cores=1,
              nodes=1,
              local_cores=1,
              resources=dict(),
              config=dict(),
              configfile=None,
              config_args=None,
              workdir=None,
              targets=None,
              dryrun=False,
              touch=False,
              forcetargets=False,
              forceall=False,
              forcerun=[],
              prioritytargets=[],
              stats=None,
              printreason=False,
              printshellcmds=False,
              printdag=False,
              printrulegraph=False,
              printd3dag=False,
              nocolor=False,
              quiet=False,
              keepgoing=False,
              cluster=None,
              cluster_config=None,
              cluster_sync=None,
              drmaa=None,
              jobname="snakejob.{rulename}.{jobid}.sh",
              immediate_submit=False,
              standalone=False,
              ignore_ambiguity=False,
              snakemakepath=None,
              lock=True,
              unlock=False,
              cleanup_metadata=None,
              force_incomplete=False,
              ignore_incomplete=False,
              list_version_changes=False,
              list_code_changes=False,
              list_input_changes=False,
              list_params_changes=False,
              list_resources=False,
              summary=False,
              detailed_summary=False,
              latency_wait=3,
              benchmark_repeats=1,
              wait_for_files=None,
              print_compilation=False,
              debug=False,
              notemp=False,
              nodeps=False,
              keep_target_files=False,
              allowed_rules=None,
              jobscript=None,
              timestamp=False,
              greediness=None,
              no_hooks=False,
              overwrite_shellcmd=None,
              updated_files=None,
              log_handler=None,
              keep_logger=False,
              verbose=False):
    """Run snakemake on a given snakefile.

    This function provides access to the whole snakemake functionality. It is not thread-safe.

    Args:
        snakefile (str):            the path to the snakefile
        listrules (bool):           list rules (default False)
        list_target_rules (bool):   list target rules (default False)
        cores (int):                the number of provided cores (ignored when using cluster support) (default 1)
        nodes (int):                the number of provided cluster nodes (ignored without cluster support) (default 1)
        local_cores (int):          the number of provided local cores if in cluster mode (ignored without cluster support) (default 1)
        resources (dict):           provided resources, a dictionary assigning integers to resource names, e.g. {gpu=1, io=5} (default {})
        config (dict):              override values for workflow config
        workdir (str):              path to working directory (default None)
        targets (list):             list of targets, e.g. rule or file names (default None)
        dryrun (bool):              only dry-run the workflow (default False)
        touch (bool):               only touch all output files if present (default False)
        forcetargets (bool):        force given targets to be re-created (default False)
        forceall (bool):            force all output files to be re-created (default False)
        forcerun (list):             list of files and rules that shall be re-created/re-executed (default [])
        prioritytargets (list):     list of targets that shall be run with maximum priority (default [])
        stats (str):                path to file that shall contain stats about the workflow execution (default None)
        printreason (bool):         print the reason for the execution of each job (default False)
        printshellcmds (bool):      print the shell command of each job (default False)
        printdag (bool):            print the dag in the graphviz dot language (default False)
        printrulegraph (bool):      print the graph of rules in the graphviz dot language (default False)
        printd3dag (bool):          print a D3.js compatible JSON representation of the DAG (default False)
        nocolor (bool):             do not print colored output (default False)
        quiet (bool):               do not print any default job information (default False)
        keepgoing (bool):           keep going upon errors (default False)
        cluster (str):              submission command of a cluster or batch system to use, e.g. qsub (default None)
        cluster_config (str):       configuration file for cluster options (default None)
        cluster_sync (str):         blocking cluster submission command (like SGE 'qsub -sync y')  (default None)
        drmaa (str):                if not None use DRMAA for cluster support, str specifies native args passed to the cluster when submitting a job
        jobname (str):              naming scheme for cluster job scripts (default "snakejob.{rulename}.{jobid}.sh")
        immediate_submit (bool):    immediately submit all cluster jobs, regardless of dependencies (default False)
        standalone (bool):          kill all processes very rudely in case of failure (do not use this if you use this API) (default False)
        ignore_ambiguity (bool):    ignore ambiguous rules and always take the first possible one (default False)
        snakemakepath (str):        path to the snakemake executable (default None)
        lock (bool):                lock the working directory when executing the workflow (default True)
        unlock (bool):              just unlock the working directory (default False)
        cleanup_metadata (bool):    just cleanup metadata of output files (default False)
        force_incomplete (bool):    force the re-creation of incomplete files (default False)
        ignore_incomplete (bool):   ignore incomplete files (default False)
        list_version_changes (bool): list output files with changed rule version (default False)
        list_code_changes (bool):   list output files with changed rule code (default False)
        list_input_changes (bool):  list output files with changed input files (default False)
        list_params_changes (bool): list output files with changed params (default False)
        summary (bool):             list summary of all output files and their status (default False)
        latency_wait (int):         how many seconds to wait for an output file to appear after the execution of a job, e.g. to handle filesystem latency (default 3)
        benchmark_repeats (int):    number of repeated runs of a job if declared for benchmarking (default 1)
        wait_for_files (list):      wait for given files to be present before executing the workflow
        list_resources (bool):      list resources used in the workflow (default False)
        detailed_summary (bool):    list summary of all output files and their status, with extra info about input files and shell commands (default False)
        print_compilation (bool):   print the compilation of the snakefile (default False)
        debug (bool):               allow to use the debugger within rules
        notemp (bool):              ignore temp file flags, e.g. do not delete output files marked as temp after use (default False)
        nodeps (bool):              ignore dependencies (default False)
        keep_target_files (bool):   Do not adjust the paths of given target files relative to the working directory.
        allowed_rules (set):        Restrict allowed rules to the given set. If None or empty, all rules are used.
        jobscript (str):            path to a custom shell script template for cluster jobs (default None)
        timestamp (bool):           print time stamps in front of any output (default False)
        greediness (float):         set the greediness of scheduling. This value between 0 and 1 determines how carefully jobs are selected for execution. The default value (0.5 if prioritytargets are used, otherwise 1.0) provides the best speed and still acceptable scheduling quality.
        overwrite_shellcmd (str):   a shell command that shall be executed instead of those given in the workflow. This is for debugging purposes only.
        updated_files (list):       a list that will be filled with the files that are updated or created during the workflow execution
        verbose (bool):             show additional debug output (default False)
        log_handler (function):     redirect snakemake output to this custom log handler, a function that takes a log message dictionary (see below) as its only argument (default None). The log message dictionary for the log handler has the following entries:

            :level:
                the log level ("info", "error", "debug", "progress", "job_info")

            :level="info", "error" or "debug":
                :msg:
                    the log message
            :level="progress":
                :done:
                    number of already executed jobs

                :total:
                    number of total jobs

            :level="job_info":
                :input:
                    list of input files of a job

                :output:
                    list of output files of a job

                :log:
                    path to log file of a job

                :local:
                    whether a job is executed locally (i.e. ignoring cluster)

                :msg:
                    the job message

                :reason:
                    the job reason

                :priority:
                    the job priority

                :threads:
                    the threads of the job


    Returns:
        bool:   True if workflow execution was successful.

    """

    if updated_files is None:
        updated_files = list()

    if cluster or cluster_sync or drmaa:
        cores = sys.maxsize
    else:
        nodes = sys.maxsize

    if cluster_config:
        cluster_config = load_configfile(cluster_config)
    else:
        cluster_config = dict()

    if not keep_logger:
        setup_logger(handler=log_handler,
                     quiet=quiet,
                     printreason=printreason,
                     printshellcmds=printshellcmds,
                     nocolor=nocolor,
                     stdout=dryrun,
                     debug=verbose,
                     timestamp=timestamp)

    if greediness is None:
        greediness = 0.5 if prioritytargets else 1.0
    else:
        if not (0 <= greediness <= 1.0):
            logger.error("Error: greediness must be a float between 0 and 1.")
            return False

    if not os.path.exists(snakefile):
        logger.error("Error: Snakefile \"{}\" not present.".format(snakefile))
        return False
    snakefile = os.path.abspath(snakefile)

    cluster_mode = ((cluster is not None) + (cluster_sync is not None) +
                    (drmaa is not None))
    if cluster_mode > 1:
        logger.error("Error: cluster and drmaa args are mutually exclusive")
        return False
    if debug and (cores > 1 or cluster_mode):
        logger.error(
            "Error: debug mode cannot be used with more than one core or cluster execution.")
        return False

    overwrite_config = dict()
    if configfile:
        overwrite_config.update(load_configfile(configfile))
    if config:
        overwrite_config.update(config)

    if workdir:
        olddir = os.getcwd()
        if not os.path.exists(workdir):
            logger.info(
                "Creating specified working directory {}.".format(workdir))
            os.makedirs(workdir)
        workdir = os.path.abspath(workdir)
        os.chdir(workdir)
    workflow = Workflow(snakefile=snakefile,
                        snakemakepath=snakemakepath,
                        jobscript=jobscript,
                        overwrite_shellcmd=overwrite_shellcmd,
                        overwrite_config=overwrite_config,
                        overwrite_workdir=workdir,
                        overwrite_configfile=configfile,
                        config_args=config_args,
                        debug=debug)

    if standalone:
        try:
            # set the process group
            os.setpgrp()
        except:
            # ignore: if it does not work we can still work without it
            pass

    success = True
    try:
        workflow.include(snakefile,
                         overwrite_first_rule=True,
                         print_compilation=print_compilation)
        workflow.check()

        if not print_compilation:
            if listrules:
                workflow.list_rules()
            elif list_target_rules:
                workflow.list_rules(only_targets=True)
            elif list_resources:
                workflow.list_resources()
            else:
                # if not printdag and not printrulegraph:
                # handle subworkflows
                subsnakemake = partial(snakemake,
                                       cores=cores,
                                       nodes=nodes,
                                       local_cores=local_cores,
                                       resources=resources,
                                       dryrun=dryrun,
                                       touch=touch,
                                       printreason=printreason,
                                       printshellcmds=printshellcmds,
                                       nocolor=nocolor,
                                       quiet=quiet,
                                       keepgoing=keepgoing,
                                       cluster=cluster,
                                       cluster_config=cluster_config,
                                       cluster_sync=cluster_sync,
                                       drmaa=drmaa,
                                       jobname=jobname,
                                       immediate_submit=immediate_submit,
                                       standalone=standalone,
                                       ignore_ambiguity=ignore_ambiguity,
                                       snakemakepath=snakemakepath,
                                       lock=lock,
                                       unlock=unlock,
                                       cleanup_metadata=cleanup_metadata,
                                       force_incomplete=force_incomplete,
                                       ignore_incomplete=ignore_incomplete,
                                       latency_wait=latency_wait,
                                       benchmark_repeats=benchmark_repeats,
                                       verbose=verbose,
                                       notemp=notemp,
                                       nodeps=nodeps,
                                       jobscript=jobscript,
                                       timestamp=timestamp,
                                       greediness=greediness,
                                       no_hooks=no_hooks,
                                       overwrite_shellcmd=overwrite_shellcmd,
                                       config=config,
                                       config_args=config_args,
                                       keep_logger=True)
                success = workflow.execute(
                    targets=targets,
                    dryrun=dryrun,
                    touch=touch,
                    cores=cores,
                    nodes=nodes,
                    local_cores=local_cores,
                    forcetargets=forcetargets,
                    forceall=forceall,
                    forcerun=forcerun,
                    prioritytargets=prioritytargets,
                    quiet=quiet,
                    keepgoing=keepgoing,
                    printshellcmds=printshellcmds,
                    printreason=printreason,
                    printrulegraph=printrulegraph,
                    printdag=printdag,
                    cluster=cluster,
                    cluster_config=cluster_config,
                    cluster_sync=cluster_sync,
                    jobname=jobname,
                    drmaa=drmaa,
                    printd3dag=printd3dag,
                    immediate_submit=immediate_submit,
                    ignore_ambiguity=ignore_ambiguity,
                    stats=stats,
                    force_incomplete=force_incomplete,
                    ignore_incomplete=ignore_incomplete,
                    list_version_changes=list_version_changes,
                    list_code_changes=list_code_changes,
                    list_input_changes=list_input_changes,
                    list_params_changes=list_params_changes,
                    summary=summary,
                    latency_wait=latency_wait,
                    benchmark_repeats=benchmark_repeats,
                    wait_for_files=wait_for_files,
                    detailed_summary=detailed_summary,
                    nolock=not lock,
                    unlock=unlock,
                    resources=resources,
                    notemp=notemp,
                    nodeps=nodeps,
                    keep_target_files=keep_target_files,
                    cleanup_metadata=cleanup_metadata,
                    subsnakemake=subsnakemake,
                    updated_files=updated_files,
                    allowed_rules=allowed_rules,
                    greediness=greediness,
                    no_hooks=no_hooks)

    except BrokenPipeError:
        # ignore this exception and stop. It occurs if snakemake output is piped into less and less quits before reading the whole output.
        # in such a case, snakemake shall stop scheduling and quit with error 1
        success = False
    except BaseException as ex:
        print_exception(ex, workflow.linemaps)
        success = False
    if workdir:
        os.chdir(olddir)
    if workflow.persistence:
        workflow.persistence.unlock()
    if not keep_logger:
        logger.cleanup()
    return success
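
For reference, the function above is the programmatic entry point. A minimal usage sketch, assuming the usual import path (from snakemake import snakemake) and a Snakefile in the current working directory; the handler behaviour and paths are illustrative, and only keyword arguments documented in the docstring are used:

from snakemake import snakemake

def handler(msg):
    # msg is the log message dictionary described in the docstring above
    if msg["level"] == "progress":
        print("{done}/{total} jobs done".format(**msg))
    elif msg["level"] == "job_info":
        print("job:", msg.get("msg") or msg.get("output"))

updated = []
ok = snakemake("Snakefile",            # assumed workflow file in the current directory
               cores=4,
               dryrun=True,            # only report what would be done
               printshellcmds=True,
               updated_files=updated,  # filled with files the run would create or update
               log_handler=handler)
print("success:", ok)
print("files that would be updated:", updated)
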