예제 #1
0
 def inner(self, outfile, *args, **kwargs):
     """Run the wrapped callable and always create *outfile*.

     Any exception raised by ``f`` is logged as a warning instead of
     propagating; *outfile* is touched regardless of success so that
     downstream pipeline dependencies are satisfied.

     NOTE(review): ``f`` comes from an enclosing closure that is not
     visible in this chunk — presumably the decorated task function.
     """
     try:
         f()
     except Exception as e:
         E.warn("received exception {} - touching {}".format(
             str(e), outfile))
     # touch unconditionally: even a failed run produces the sentinel
     IOTools.touch_file(outfile)
예제 #2
0
    def run(self, infile, outfile, params):
        """Align *infile* against a reference with nucmer and plot the result.

        Runs nucmer, dnadiff and mummerplot in sequence, then touches
        *outfile* as the task sentinel.

        Raises
        ------
        ValueError
            If ``params.reference_fasta`` is not set.
        """
        # A reference is mandatory for the alignment - fail early.
        if params.reference_fasta is None:
            raise ValueError("please provide a reference database")

        template = (
            "{params.path_nucmer} -p {outfile} {params.reference_fasta} {infile} >& {outfile}.nucmer; "
            "{params.path_dnadiff} -p {outfile} -d {outfile}.delta >& {outfile}.dnadiff; "
            "{params.path_mummerplot} --large --fat --png {outfile}.1delta >& {outfile}.mummerplot")
        statement = template.format(infile=infile,
                                    outfile=outfile,
                                    params=params)

        result = P.run(statement)
        # the tools write derived files; touch the nominal output so the
        # pipeline sees the task as complete
        IOTools.touch_file(outfile)
        return result
예제 #3
0
    def ignore_task(self, infiles, outfiles, params):
        """return True if task should be ignored.

        This method will also create the output file(s).
        """
        if self._ignore:
            m = str(outfiles)
            for ignore in IOTools.val2list(self._ignore):
                if ignore in m:
                    E.warn("task {} will be ignored".format(self.__name__))
                    for f in IOTools.val2list(outfiles):
                        E.info("creating empty file {}".format(f))
                        IOTools.touch_file(f)
                    return True
        return False
예제 #4
0
    def run(self, infiles, outfile, params):
        """Filter ONT fastq files and write surviving reads to *outfile*.

        Input files may first be dropped by minimum size
        (``params.min_size_bytes``) and by modification time
        (``params.newer_than``).  The remainder is piped through
        ``daisy fastq2fastq --method=filter-ONT``; removed reads go to a
        sibling ``-fail.fastq.gz`` file.

        Raises
        ------
        ValueError
            If *outfile* does not end in ``-pass.fastq.gz``.
        """
        if not outfile.endswith("-pass.fastq.gz"):
            raise ValueError(
                "outfile must end in -pass.fastq.gz, got {}".format(outfile))

        if params.min_size_bytes:
            n_before = len(infiles)
            infiles = [fn for fn in infiles
                       if os.path.getsize(fn) >= params.min_size_bytes]
            E.debug(
                "removing small files: after={}, before={}, removed={}".format(
                    len(infiles), n_before, n_before - len(infiles)))

        if params.newer_than:
            n_before = len(infiles)
            threshold = os.path.getmtime(params.newer_than)
            infiles = [fn for fn in infiles
                       if os.path.getmtime(fn) > threshold]
            E.debug(
                "removing old files: after={}, before={}, removed={}".format(
                    len(infiles), n_before, n_before - len(infiles)))

        # everything filtered out: create an empty sentinel and stop
        if not infiles:
            E.warn("no files left after filtering, creating empty file")
            IOTools.touch_file(outfile)
            return

        infiles = " ".join(infiles)
        outfile_fail = IOTools.snip(
            outfile, "-pass.fastq.gz") + "-fail.fastq.gz"

        statement = ("zcat {infiles} "
                     "| daisy fastq2fastq "
                     "--method=filter-ONT "
                     "--min-average-quality={params.min_average_quality} "
                     "--log={outfile}.log "
                     "--min-length={params.min_length} "
                     "--output-removed-fastq={outfile_fail} "
                     "- "
                     "| gzip "
                     "> {outfile}".format(infiles=infiles,
                                          outfile=outfile,
                                          outfile_fail=outfile_fail,
                                          params=params))
        return P.run(statement)
예제 #5
0
    def run(self, infile, outfile, params):
        """Align *infile* with lastal and render a dotplot.

        Builds a shell pipeline (lastal | last-split | maf-sort | gzip)
        producing ``{outfile}.maf.gz``, then runs last-dotplot on a
        length-filtered view of the alignments.  Touches *outfile* as
        the task sentinel.

        Raises
        ------
        ValueError
            If ``params.reference_database`` is not set.
        """
        # fail early without a reference database
        if params.reference_database is None:
            raise ValueError("please provide a reference database")

        template = (
            "{params.path_lastal} {params.lastal_options} "
            "{params.reference_database} {infile} "
            "| {params.path_lastsplit} {params.lastsplit_options} "
            "| {params.path_mafsort} "
            "| gzip "
            "> {outfile}.maf.gz; "
            "{params.path_lastdotplot} "
            "<(zcat {outfile}.maf.gz "
            "| daisy maf2maf --log={outfile}.filter.log --min-length={params.min_contig_length} ) "
            "{outfile}.png ")
        statement = template.format(infile=infile,
                                    outfile=outfile,
                                    params=params)

        result = P.run(statement, job_memory="15G")
        IOTools.touch_file(outfile)
        return result
예제 #6
0
 def touch_and_mark_as_mounted(source, dest):
     """Create *dest* mirroring *source*'s timestamps and record its mount.

     A sidecar file ``dest + ".mnt"`` is written containing the mounted
     location of *source* (as returned by ``get_mounted_location``), so
     that later stages can map the placeholder back to the real file.
     """
     source_stat = os.stat(source)
     IOTools.touch_file(dest,
                        times=(source_stat.st_atime, source_stat.st_mtime))
     # sidecar records where the (remote) source is mounted
     with open(dest + ".mnt", "w") as outf:
         outf.write(get_mounted_location(source))
예제 #7
0
    def __call__(self, infiles, outfile, only_info=False):
        """Execute the task: save metadata, honour ignore rules, then run.

        Arguments
        ---------
        infiles : string or list
            Input filename(s); entries may be placeholders that resolve
            to files on a remote mount (see ``map_to_mount`` below).
        outfile : string
            Output filename; its directory is created if missing.
        only_info : bool
            If set, only the ``.info`` metadata file is updated.  NOTE:
            the value is overridden below from ``P.PARAMS`` (see comment).
        """

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        # ensure output directory exists.
        # This should be done on the pipeline level, but
        # ruffus currently seems not to allow this.
        outdir = os.path.dirname(outfile)
        if outdir and not os.path.exists(outdir):
            os.makedirs(outdir)

        # map each declared table name onto a concrete output file
        output_files = [
            self.map_table_to_file(x, outfile) for x in self.tablenames
        ]

        kwargs = {
            'output_files': output_files,
            'input_files': infiles,
            'outdir': outdir
        }

        # optionally derive an alias for the task from the input names
        if self._runtime_regex:
            kwargs["alias"] = self.build_alias(str(infiles),
                                               regex=self._runtime_regex,
                                               alias=self._runtime_alias)

        self.save_meta(outfile, **kwargs)

        # runtime ignore list: skip the task if outdir matches any entry,
        # creating an empty output so downstream dependencies are met
        if self.ignore:
            found = False
            for i in self.ignore:
                if i in outdir:
                    found = True
                    break

            if found:
                E.warn("skipping task {} at runtime, an empty file is created".
                       format(outfile))
                IOTools.touch_file(outfile)
                return

        # if self.runtime_filter:
        # TODO: create empty outfile if regex matches
        #    pass

        if only_info:
            E.warn(
                "only_info - meta information in {} has been updated".format(
                    IOTools.snip(outfile) + ".info"))
            return

        # AH: duplicated from above?
        params = self.build_params(output_files=output_files)

        # validate the error-handling policy before doing any real work
        on_error_options = ["raise", "ignore"]
        on_error = params.get("on_error", "raise")
        if on_error not in on_error_options:
            raise ValueError("unknown option to 'on_error': '{}' "
                             "should be one of '{}'".format(
                                 on_error, ",".join(on_error_options)))

        # ignore_task creates empty outputs itself when it returns True
        if self.ignore_task(infiles, outfile, params):
            return

        # deal with placeholder files created by identity that are
        # located on a remote mount point
        def map_to_mount(fn):
            # A sibling "<fn>.mnt" file stores the path below the
            # configured mount point; translate to the local location.
            if os.path.exists(fn + ".mnt"):
                if not P.PARAMS["mount_point"]:
                    raise ValueError(
                        "encountered mounted file {}, but no mount point present"
                        .format(fn))
                with open(fn + ".mnt") as inf:
                    mount_path = inf.read()
                return os.path.join(P.PARAMS["mount_point"], mount_path)
            else:
                return fn

        # replace infiles with mount locations if necessary
        if isinstance(infiles, list):
            infiles = [map_to_mount(x) for x in infiles]
        else:
            infiles = map_to_mount(infiles)

        try:
            benchmark = self.run(infiles, outfile, as_namedtuple(params))
        except Exception as ex:
            on_error = params.get("on_error", "raise")
            if on_error == "raise":
                raise
            elif on_error == "ignore":
                # best-effort mode: log, create an empty output, carry on
                E.warn(
                    "error occured during execution of {} but will be ignored:\n{}"
                    .format(self.__name__, ex))
                E.warn(
                    "an empty output file {} will be created.".format(outfile))
                IOTools.touch_file(outfile)
                benchmark = None

        if benchmark:
            self.save_benchmark(outfile, benchmark)
예제 #8
0
    def run(self, infile, outfile, params):
        """Run FastQC on *infile* and split its report into upload tables.

        The FastQC executable is invoked only if the extracted report
        ("fastqc_data.txt") is not already present; otherwise *outfile*
        is simply touched.  The report is then split into one TSV per
        section plus a per-section pass/warn/fail summary TSV.
        """
        # TODO: bam_fastqc_sequence_length_distribution.tsv may
        # contain ranges such as '30-31'. Convert to beginning of
        # range like in this perl command:
        #
        # perl -p -i -e "s/\-\d+//"
        # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv

        # FastQC writes its output into "<prefix>_fastqc"; strip a
        # trailing ".gz" before deriving the prefix
        if infile.endswith(".gz"):
            prefix = IOTools.snip(os.path.basename(infile[:-3]))
        else:
            prefix = IOTools.snip(os.path.basename(infile))

        outdir = os.path.dirname(outfile)

        datafile = os.path.join(outdir, "{}_fastqc".format(prefix),
                                "fastqc_data.txt")

        if not os.path.exists(datafile):
            if not os.path.exists(outdir):
                os.makedirs(outdir)

            retval = P.run(
                "{params.path} "
                "{params.options} "
                "--extract "
                "--outdir {outdir} "
                "{infile} "
                ">& {outfile} ".format(**locals()), **params._asdict())
        else:
            # report already extracted - just mark the task as done
            IOTools.touch_file(outfile)
            retval = None

        def _split_output(lines):
            # Generator yielding (section, header, body, status) for each
            # ">>section ... >>END_MODULE" block of the FastQC report.
            body, header, section, status = [], None, None, None
            for line in lines:
                if line.startswith("##FastQC"):
                    continue
                elif line.startswith("#"):
                    # column header line; resets the body accumulator
                    header, body = line[1:-1].split("\t"), []
                elif line.startswith(">>END_MODULE"):
                    yield section, header, body, status
                    body, header, section, status = [], None, None, None
                elif line.startswith(">>"):
                    section, status = line[2:-1].split("\t")
                else:
                    fields = line[:-1].split("\t")
                    body.append(fields)

        # split into separate files for upload
        summary_data = []
        with IOTools.open_file(datafile) as inf:
            for section, header, body, status in _split_output(inf):
                if len(body) == 0:
                    continue
                summary_data.append((section, status))
                tablename = "{}_".format(self.name) + re.sub(
                    " ", "_", section).lower()
                if tablename not in self.tablenames:
                    raise ValueError(
                        "unknown tablename {}, expected one of {}".format(
                            tablename, self.tablenames))
                output_file = ".".join((outfile, tablename, "tsv"))
                with open(output_file, "w") as outf:
                    outf.write("\t".join([x.lower() for x in header]) + "\n")
                    # NOTE(review): a previous comment said "remove first
                    # column, which contains the identifier", but all
                    # columns are written here - confirm intended behaviour.
                    outf.write("\n".join(["\t".join(x) for x in body]) + "\n")

        # write the per-section pass/warn/fail summary table
        output_file = ".".join(
            (outfile, "{}_summary".format(self.name), "tsv"))
        with IOTools.open_file(output_file, "w") as outf:
            outf.write("section\tstatus\n")
            for section, status in summary_data:
                outf.write("{}\t{}\n".format(section, status))

        return retval