    def run(self, infile, outfile, params):

        try:
            retval = P.run("{params.path} view -H "
                           "{infile} "
                           "2> {outfile}.log "
                           "> {outfile}.tmp; ".format(**locals()))
        except OSError as e:
            E.warn("input file {} gave the following errors: {}".format(
                infile, str(e)))
            return None

        with open(outfile, "w") as outf, open(outfile + ".tmp") as inf:
            outf.write("header_tag\ttag\tlineno\tvalue\n")
            for lineno, line in enumerate(inf):
                # strip the leading "@" and the trailing newline
                fields = line[1:-1].split("\t")
                header_tag = fields[0]
                if header_tag == "CO":
                    # Do not split comment lines
                    outf.write("\t".join((header_tag, "", str(lineno),
                                          "\t".join(fields[1:]))) + "\n")
                else:
                    for field in fields[1:]:
                        sub_tag, content = field.split(":", 1)
                        outf.write("\t".join((header_tag, sub_tag, str(lineno),
                                              content)) + "\n")

        os.unlink(outfile + ".tmp")
        return retval
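
    # For illustration: "samtools view -H" emits header lines such as
    # "@SQ\tSN:chr1\tLN:248956422"; for such a line at enumerate position i,
    # the parser above writes one row per sub-tag into the output table:
    #   SQ    SN    i    chr1
    #   SQ    LN    i    248956422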
Example #2
    def inner(self, outfile, *args, **kwargs):
        # inner wrapper of a decorator around the task method ``f``: warn on
        # failure and make sure that the output file exists afterwards.
        try:
            f(self, outfile, *args, **kwargs)
        except Exception as e:
            E.warn("received exception {} - touching {}".format(
                str(e), outfile))
        IOTools.touch_file(outfile)
    def __call__(self, infiles, outfile, only_info=False):

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        self.input_files = collect_file_meta_information({"in": infiles})
        self.input_alias = self.build_alias(str(infiles),
                                            regex=self._input_regex,
                                            alias=self._input_alias)

        is_empty_outfile = outfile == []
        # patch for missing outfiles when number of outfiles is not
        # known (is it a ruffus thing?)
        if is_empty_outfile:
            assert isinstance(infiles, str)
            outdir = "{}/{}.dir".format(os.path.dirname(infiles), self.name)
            outfile = os.path.join(outdir, self.output)
            # dummy.info gets removed and then added
            self.save_meta(os.path.join(outdir, "dummy.info"))
        else:
            self.save_meta(outfile)

        if only_info:
            E.warn(
                "only_info - meta information in {} has been updated".format(
                    self.build_meta_filename(outfile, "benchmark.info")))
            return

        params = self.build_params()
        benchmark = self.run(infiles, outfile, as_namedtuple(params))

        if not is_empty_outfile:
            self.save_benchmark(outfile, benchmark)
    def run(self, outfile, params):

        bam = resolve_argument(params.bam, sep=" ")
        reference_fasta = get_reference(params)

        # warning: requires -m or -c in the options
        if "--multiallelic-caller" not in params.options and \
           "-m" not in params.options and \
           "-c" not in params.options and \
           "--consensus-caller" not in params.options:
            E.warn("bcftools call requires -m or -c, got {}".format(
                params.options))

        # limit the number of jobs per node to limit I/O
        job_threads = 4

        return P.run("{params.path_samtools} mpileup "
                     "-ug "
                     "-f {reference_fasta} "
                     "{params.samtools_options} "
                     "{bam} "
                     "2> {outfile}.pileup.log "
                     "| {params.path} call "
                     "--variants-only "
                     "--output-type z "
                     "{params.options} "
                     "2> {outfile}.call.log "
                     "> {outfile}; "
                     "tabix -p vcf {outfile} ".format(**locals()))
def get_reference_for_bam(bamfile, fastafiles):
    """deduce reference sequence used within BAM files.

    This method compares the sequence dictionary in the bamfile with a
    list of fastafiles. The comparison will stop at the first match
    that is found.

    :param bamfile: :term:`BAM` formatted file
    :param fastafiles: list of :term:`fasta` formatted files. The
        fasta files need to indexed with samtools faidx.
    :return: a tuple (filename, diffs). The first is the filename if
        found, otherwise None.  If not found, diffs is a list of all
        input files with a list missing contigs or length mismatches.

    """

    diffs = []

    # Temporary fix: see issue SYS-517
    if not os.path.exists(bamfile):
        E.warn("could not find file {}".format(bamfile))

    try:
        with pysam.AlignmentFile(bamfile, check_sq=False) as inf:
            sequence_dict = dict(list(zip(inf.references, inf.lengths)))
    except IOError as ex:
        E.warn("could not open bamfile {}: {}".format(bamfile, ex))
        return None, None

    fastafn, diffs = match_sequence_dictionaries(sequence_dict, fastafiles)

    return fastafn, diffs
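
A minimal usage sketch; the BAM and fasta paths are hypothetical and the
fasta files are assumed to be indexed with samtools faidx:

    fastafn, diffs = get_reference_for_bam("sample.bam",
                                           ["grch37.fa", "grch38.fa"])
    if fastafn is None:
        E.warn("no matching reference found: {}".format(diffs))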
Example #6
    def save_benchmark(self, outfile, benchmark):

        if not isinstance(benchmark, list):
            benchmark = [benchmark]

        # flatten if nested list and remove None
        benchmark = [
            x for x in IOTools.flatten(benchmark, ltypes=(list, ))
            if x is not None
        ]

        filename = self.build_meta_filename(outfile, "benchmark.bench")

        if not benchmark:
            E.warn("could not save benchmark info to {}".format(filename))
            return

        try:
            header = benchmark[0]._fields
        except AttributeError as ex:
            E.warn("could not save benchmark timings for {}:"
                   " {} from {}".format(outfile, str(ex), str(benchmark[0])))
            return

        with open(filename, "w") as outf:
            outf.write("\t".join(header) + "\n")
            for b in benchmark:
                outf.write("\t".join(map(str, b)) + "\n")
    def run(self, infile, outfiles, params):

        tbxfile = pysam.VariantFile(infile)
        statements = []

        for chrom in list(tbxfile.header.contigs):
            output_file = outfiles.format(chrom)
            output_dir = os.path.dirname(output_file)
            statements.append(
                "mkdir {output_dir}; "
                "tabix -h {infile} {chrom} | bgzip > {output_file}; "
                "tabix -p vcf {output_file} ".format(**locals()))

        retvals = P.run(statements)

        # clean up empty vcfs, opening empty VCF in pysam throws
        # ValueError
        for chrom in list(tbxfile.header.contigs):
            output_file = outfiles.format(chrom)
            output_dir = os.path.dirname(output_file)
            try:
                f = pysam.VariantFile(output_file)
                f.close()
            except ValueError:
                E.warn("removing empty VCF {}".format(output_file))
                shutil.rmtree(output_dir)

        tbxfile.close()
    def run(self, outfile, params):

        bam = resolve_argument(params.bam)
        reference_fasta = get_reference(params)

        stmnts = []

        prefix = IOTools.snip(outfile, ".vcf.gz")
        vcf_output = prefix + ".raw.vcf.gz"

        if not os.path.exists(vcf_output):
            stmnts.append("java "
                          "-Djava.io.tmpdir=%(tmpdir)s "
                          "-jar {self.path} "
                          "--analysis_type HaplotypeCaller "
                          "--input_file {bam} "
                          "--reference_sequence {reference_fasta} "
                          "--logging_level INFO "
                          "--log_to_file {outfile}.HaplotypeCaller.log "
                          "{params.haplotypecaller} "
                          "--out {vcf_output} "
                          ">& {prefix}.HaplotypeCaller.err".format(**locals()))
        else:
            E.warn("output file {vcf_output} already exists - "
                   "it will not be recomputed".format(**locals()))

        stmnts.extend(
            self.build_calibration_workflow(outfile, prefix, vcf_output,
                                            params))

        return self.run_statements(stmnts, job_memory="5G")
Example #9
    def _test_task_will_run(self, taskf):

        # check if task is installed
        version = get_task_version(taskf)
        if version is None:
            self.skipTest("tools for task {} not available".format(taskf.name))
            return

        # define input/output files
        tool_config = self.test_config["tool"]
        input_files = {}
        for expected in taskf.expected:
            if taskf.name in tool_config:
                # task specific input files
                p = tool_config[taskf.name].get(
                    expected, tool_config.get(expected, None))
            else:
                # generic input files
                p = tool_config.get(expected, None)
            if p is None:
                self.skipTest(
                    "data for input slot {} not provided for {}".format(
                        expected, taskf.name))
                return

            input_files[expected] = p

        tmpdir = tempfile.mkdtemp(dir=".", prefix="tmp_{}".format(taskf.name))
        if isinstance(taskf.output, str):
            outfile = os.path.join(tmpdir, taskf.output)
        else:
            outfile = [os.path.join(tmpdir, x) for x in taskf.output]

        # instantiate task
        t = taskf()
        # set custom options for test
        if taskf.name in tool_config:
            for key, value in tool_config[taskf.name].items():
                setattr(t, key, value)

        # run task
        t.register_input(input_files)
        t(input_files.values(), outfile)

        # check if tool produced non-zero output
        if isinstance(outfile, list):
            for x in outfile:
                self.assertTrue(os.path.exists(x))
                self.assertGreater(os.path.getsize(x), 0)
        else:
            self.assertTrue(os.path.exists(outfile))
            self.assertGreater(os.path.getsize(outfile), 0)

        # cleanup
        try:
            shutil.rmtree(tmpdir)
        except OSError as ex:
            E.warn("could not remove {}: {}".format(tmpdir, ex))
    def run(self, infile, outfile, params):

        if "reference_fasta" in params._fields:
            reference_fasta = "REFERENCE_SEQUENCE={}".format(
                params.reference_fasta)
        else:
            reference_fasta = ""

        # the command can fail (for example when one metric produces no
        # output), but may still have produced usable output for the others,
        # hence ignore_errors=True below.
        # 12G job memory is required to cover the java overhead on top of -Xmx8000m
        retval = P.run("java -Xmx8000m -jar {params.path} "
                       "CollectMultipleMetrics "
                       "{reference_fasta} "
                       "INPUT={infile} "
                       "TMP_DIR=%(tmpdir)s "
                       "{params.options} "
                       "OUTPUT={outfile} "
                       ">& {outfile} ".format(**locals()),
                       job_memory="12G",
                       ignore_errors=True)

        def get_section(section, data):
            pattern = "## {}".format(section)
            keep = False
            result = []
            for line in data:
                if line.startswith("##"):
                    if line.startswith(pattern):
                        keep = True
                    else:
                        keep = False
                if keep:
                    result.append(line)
            return result
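
        # For example, a picard metrics file contains a block such as
        #   ## METRICS CLASS        picard.analysis.InsertSizeMetrics
        #   MEDIAN_INSERT_SIZE      ...
        # get_section("METRICS", data) returns the "## METRICS ..." line and
        # all following lines up to, but not including, the next "##" line.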

        for tablename in self.tablenames:
            filename = re.sub("histogram", "metrics", tablename)
            raw = filename[len("picard_"):]
            src = outfile + "." + raw
            dest = outfile + "." + tablename + ".tsv"

            if not os.path.exists(src):
                E.warn("no file {}, ignored".format(src))
                continue

            with IOTools.open_file(src) as inf:
                data = inf.readlines()

            if tablename.endswith("metrics"):
                data = get_section("METRICS", data)
            elif tablename.endswith("histogram"):
                data = get_section("HISTOGRAM", data)

            with IOTools.open_file(dest, "w") as outf:
                outf.write("".join(data))

        return retval
    def run(self, outfile, params):

        prefix = IOTools.snip(outfile, ".vcf.gz")

        bam = resolve_argument(params.bam, sep=",")
        reference_fasta = get_reference(params)

        bam = " ".join(["--input_file {}".format(x) for x in bam.split(",")])
        stmnts = []
        if not os.path.exists(prefix + ".annotated.vcf.gz"):
            tmpfile, pre_statement, post_statement = self.pre_process(
                params.vcf, outfile, params)

            stmnts.append(pre_statement)
            stmnts.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type VariantAnnotator "
                "--variant {tmpfile} "
                "{bam} "
                "--reference_sequence {reference_fasta} "
                "--logging_level INFO "
                "--log_to_file {prefix}.VariantAnnotator.log "
                "--annotation FisherStrand "
                "--annotation StrandOddsRatio "
                "--annotation ReadPosRankSumTest "
                "--annotation RMSMappingQuality "
                "--annotation MappingQualityRankSumTest "
                "{params.options} "
                "--out {prefix}.annotated.vcf.gz "
                ">& {prefix}.VariantAnnotator.err".format(**locals()))

            stmnts.extend(
                self.build_calibration_workflow(outfile, prefix,
                                                prefix + ".annotated.vcf.gz",
                                                params))

            stmnts.append(post_statement)
        else:
            E.warn("using pre-existing file {} with annotated variants".format(
                prefix + ".annotated.vcf.gz"))

            stmnts.extend(
                self.build_calibration_workflow(outfile, prefix,
                                                prefix + ".annotated.vcf.gz",
                                                params))

        return self.run_statements(stmnts, job_memory="3G")
Example #12
    def ignore_task(self, infiles, outfiles, params):
        """return True if task should be ignored.

        This method will also create the output file(s).
        """
        if self._ignore:
            m = str(outfiles)
            for ignore in IOTools.val2list(self._ignore):
                if ignore in m:
                    E.warn("task {} will be ignored".format(self.__name__))
                    for f in IOTools.val2list(outfiles):
                        E.info("creating empty file {}".format(f))
                        IOTools.touch_file(f)
                    return True
        return False
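
    # A typical use of ignore_task at the start of a task's __call__ (see the
    # larger __call__ example further below): skip the task body entirely if
    # the output matches one of the ignore patterns, e.g.
    #
    #   if self.ignore_task(infiles, outfile, params):
    #       return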
    def run(self, infile, outfile, params):

        with open(outfile, "w") as outf:
            outf.write("chromosome\tsize\tmapped\tunmapped\n")

        try:
            retval = P.run("{params.path} idxstats "
                           "{infile} "
                           "2> {outfile}.log "
                           ">> {outfile}; ".format(**locals()))
        except OSError as e:
            E.warn("input  file {} gave the following errors: {}".format(
                infile, str(e)))
            retval = None

        return retval
    def run(self, infile, outfile, params):

        with open(outfile, "w") as outf:
            outf.write("counts\tcounts_fail\tcategory\n")

        try:
            retval = P.run("{params.path} flagstat "
                           "{infile} "
                           "2> {outfile}.log "
                           "| perl -p -e 's/ \+ /\\t/; s/ /\\t/; s/\\(.*//' "
                           ">> {outfile}; ".format(**locals()))
        except OSError as e:
            E.warn("input file {} gave the following errors: {}".format(
                infile, str(e)))
            retval = None

        return retval
Example #15
    def _test_task_will_run(self, taskf):
        # check if task is installed
        version = get_task_version(taskf)
        if version is None:
            self.skipTest("tools for task {} not available".format(taskf.name))
            return

        # define input/output files
        metric_config = self.test_config["metric"]
        infiles = None
        for key, values in metric_config.items():
            if key == taskf.name:
                infiles = values.get("files", None)
            elif "patterns" in values:
                for pattern in values["patterns"]:
                    if re.search(pattern, taskf.name):
                        infiles = values.get("files", None)
                        break
            if infiles:
                break

        if infiles is None:
            self.skipTest("no input files specified for {}".format(taskf.name))
            return

        tmpdir = tempfile.mkdtemp(dir=".", prefix="tmp_{}_".format(taskf.name))
        outfile = os.path.join(tmpdir, "output.tsv")

        # instantiate task
        task = taskf()
        # set custom options for test
        if taskf.name in metric_config:
            for key, value in metric_config[taskf.name].items():
                setattr(task, key, value)

        # run task
        task(infiles, outfile)

        # check if tool produced non-zero output
        self.assertTrue(os.path.exists(outfile))
        self.assertGreater(os.path.getsize(outfile), 0)

        # cleanup
        try:
            shutil.rmtree(tmpdir)
        except OSError as ex:
            E.warn("could not remove {}: {}".format(tmpdir, ex))
Example #16
    def run(self, infiles, outfile, params):

        if not outfile.endswith("-pass.fastq.gz"):
            raise ValueError(
                "outfile must end in -pass.fastq.gz, got {}".format(outfile))

        if params.min_size_bytes:
            before = len(infiles)
            infiles = [
                x for x in infiles
                if os.path.getsize(x) >= params.min_size_bytes
            ]
            E.debug(
                "removing small files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if params.newer_than:
            before = len(infiles)
            cutoff = os.path.getmtime(params.newer_than)
            infiles = [x for x in infiles if os.path.getmtime(x) > cutoff]
            E.debug(
                "removing old files: after={}, before={}, removed={}".format(
                    len(infiles), before, before - len(infiles)))

        if len(infiles) == 0:
            E.warn("no files left after filtering, creating empty file")
            IOTools.touch_file(outfile)
            return

        infiles = " ".join(infiles)

        outfile_fail = IOTools.snip(outfile,
                                    "-pass.fastq.gz") + "-fail.fastq.gz"

        statement = ("zcat {infiles} "
                     "| daisy fastq2fastq "
                     "--method=filter-ONT "
                     "--min-average-quality={params.min_average_quality} "
                     "--log={outfile}.log "
                     "--min-length={params.min_length} "
                     "--output-removed-fastq={outfile_fail} "
                     "- "
                     "| gzip "
                     "> {outfile}".format(**locals()))
        return P.run(statement)
    def __call__(self, dataframe, map_sample2label={}):

        df = dataframe.pivot(index="gc_bin", columns="sample",
                             values="mean").reset_index()

        # remove duplicate sample names
        # (in Cancer analysis: two blood samples)
        to_drop = [x for x in df.columns if x.startswith("2:")]
        df.drop(to_drop, axis=1, inplace=True)

        df.columns = [map_sample2label.get(x, x) for x in df.columns]

        if df.empty:
            E.warn("no data, no plot will be output")
            return

        ax = df.plot(kind="line", x="gc_bin")

        return ax
Example #18
def get_sequence_length_dict(fastafn):
    """return sequence/length dictionary from a
    fasta file.

    The fasta file needs to be indexed with samtools faidx.
    """
    # Temporary fix: see issue SYS-517
    if not os.path.exists(fastafn):
        E.warn("could not find file {}".format(fastafn))
        return None

    try:
        with pysam.FastaFile(fastafn) as inf:
            fastadict = dict(list(zip(inf.references, inf.lengths)))
    except IOError as ex:
        E.warn("file {} could not be opened".format(ex))
        fastadict = None

    return fastadict
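
A minimal usage sketch; the fasta path is hypothetical and must have a
matching .fai index:

    lengths = get_sequence_length_dict("grch38.fa")
    if lengths is not None:
        E.info("chr1 length: {}".format(lengths.get("chr1")))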
    def __call__(self, track, column_is_norm=None, *args, **kwargs):

        fn = track

        table = pandas.read_csv(
            fn,
            comment="#",
            sep="\t",
            dtype={
                "CHROM": object
            },
        ).set_index(["CHROM", "POS"])

        columns = table.columns

        if len(columns) > 2:
            E.warn("too many columns {}".format(columns))

        if column_is_norm:
            if len(columns) == 2:
                num, den = columns
                if num in column_is_norm:
                    num, den = den, num
            else:
                den = [x for x in columns if x in column_is_norm][0]
                num = [x for x in columns if x not in column_is_norm][0]

            if den not in column_is_norm:
                raise ValueError(
                    "denominator is {}, but not norm, cols = {}".format(
                        den, columns))

        table = table[(table[num] >= self.min_depth)
                      & (table[den] >= self.min_depth)]
        for column in columns:
            table[column] = table[column] / table[column].median()

        table["sample"] = num
        table["copy number"] = 2.0 * table[num] / table[den]
        table.drop(columns, axis=1, inplace=True)
        return table
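
    # Worked example of the normalisation above: each depth column is divided
    # by its own median, so a diploid region has a normalised ratio of ~1 in
    # both columns and "copy number" = 2 * 1 / 1 = 2; a region where the
    # sample column is at 1.5x its median while the normalising column stays
    # at its median gets 2 * 1.5 / 1.0 = 3.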
Example #20
    def __call__(self, infiles, outfile, only_info=False):

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        if self.mountpoint:
            # revert mount redirection for arvados to allow redirection
            # on individual cluster nodes
            for d, key, value in IOTools.nested_iter(infiles):
                d[key] = re.sub(self.mountpoint, "arv=", value)

        self.instantiate_input(infiles)
        self.save_meta(outfile, output_file=outfile)

        if only_info:
            E.warn("only_info - meta information has been updated")
            return

        params = self.build_params(output_file=outfile)
        benchmark = self.run(outfile, as_namedtuple(params))
        self.save_benchmark(outfile, benchmark)
    def __call__(self, infiles, outfile, only_info=False):

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        self.input_files = collect_file_meta_information({"in": infiles})
        self.input_alias = self.build_alias(str(infiles),
                                            regex=self._input_regex,
                                            alias=self._input_alias)

        if isinstance(outfile, list):
            outdir = [os.path.dirname(x) for x in outfile]
            basefile = os.path.commonprefix(outfile)
        else:
            outdir = os.path.dirname(outfile)
            basefile = outfile

        kwargs = {'output_file': outfile,
                  'input_files': infiles,
                  'outdir': outdir}

        self.save_meta(outfile, **kwargs)

        if only_info:
            E.warn(
                "only_info - meta information in {} has been updated".format(
                    os.path.join(os.path.dirname(basefile), "benchmark.info")))
            return

        params = self.build_params()
        benchmark = self.run(infiles, outfile, as_namedtuple(params))

        self.save_benchmark(
            basefile,
            benchmark)
    def run(self, infile, outfile, params):

        options = []
        reference_fasta = params.reference_fasta
        reference_fasta_map = build_reference_fasta_map(
            params.reference_fasta_map)
        reference_label = None
        use_target_regions = True
        if params.reference_fasta:
            map_path2name = dict([(x[1], x[0])
                                  for x in list(reference_fasta_map.items())])
            if params.reference_fasta == "auto":

                fasta = resolve_argument(list(reference_fasta_map.values()),
                                         ",").split(",")

                reference_fasta, diffs = get_reference_for_bam(
                    infile, fastafiles=fasta)

                if reference_fasta:
                    options.append("--ref-seq {}".format(reference_fasta))
                    reference_label = map_path2name[reference_fasta]
                elif diffs:
                    E.warn(
                        "attempted to detect reference fasta, but unable to do so. "
                        "diffs: {}".format(diffs))
                else:
                    E.warn("sequence dict is empty, BAM likely to be empty. "
                           "target_regions will be ignored")
                    use_target_regions = False
            else:
                options.append("--ref-seq {}".format(params.reference_fasta))
                reference_label = map_path2name.get(params.reference_fasta,
                                                    None)

        if params.target_regions and use_target_regions:
            target_regions = get_associated_file(params, reference_label,
                                                 "target_regions")
            # convert to 1-based coordinates and decompress
            if target_regions.endswith(".bed.gz"):
                target_regions = (
                    "<(zcat {} "
                    "| awk '{{printf(\"%%s\\t%%i\\t%%i\\n\", $1, $2+1, $3)}}')"
                    .format(target_regions))
            options.append("--target-regions {}".format(target_regions))

        options = " ".join(options)
        if not os.path.exists(outfile + ".tmp"):
            try:
                retval = P.run("{params.path} stats "
                               "{self.options} "
                               "{options} "
                               "{infile} "
                               "2> {outfile}.log "
                               "> {outfile}.tmp; ".format(**locals()),
                               job_memory="16G")
            except OSError as e:
                E.warn("input file {} gave the following errors: {}".format(
                    infile, str(e)))
                return None
        else:
            retval = None

        def split_output(lines):
            is_comment = True
            section, body = None, []
            for line in lines:
                if line.startswith("#"):
                    if body:
                        yield section, body
                    body = []
                    is_comment = True
                else:
                    # strip trailing per-line comments; re.sub keeps the
                    # newline, which is removed by the slice below
                    line = re.sub("\t#.*", "", line)
                    fields = line[:-1].split("\t")
                    section = fields[0]
                    body.append(fields[1:])
                    is_comment = False

            if body:
                yield section, body
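
        # As an illustration, samtools stats lines such as
        #   "SN\traw total sequences:\t2000"
        #   "SN\treads mapped:\t1995"
        # are yielded by split_output as
        #   ("SN", [["raw total sequences:", "2000"],
        #           ["reads mapped:", "1995"]])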

        # split into separate files for upload
        with IOTools.open_file(outfile + ".tmp") as inf:
            for section, body in split_output(inf):
                try:
                    tablename, columns = self._map_section_to_table[section]
                except KeyError:
                    continue

                output_file = self.map_table_to_file(tablename, outfile)
                with IOTools.open_file(output_file, "w") as outf:

                    if len(columns) > 1 and columns[1].startswith("VAR_"):
                        outf.write("{}\t{}\n".format(columns[0],
                                                     columns[1][4:]))
                        for data in body:
                            outf.write("{}\t{}\n".format(
                                data[0], ",".join(data)))
                    else:
                        outf.write("\t".join(columns) + "\n")
                        # the first column (the section identifier) has
                        # already been removed in split_output
                        outf.write("\n".join(["\t".join(x)
                                              for x in body]) + "\n")

        os.rename(outfile + ".tmp", outfile)

        return retval
Example #23
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--restrict-regex",
        dest="restrict_regex",
        action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory",
        dest="data_directory",
        help="directory with sample data sets. This will override the default "
        "datadir in the configuration file and the environment variable "
        "DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory",
        dest="library_directory",
        action="append",
        help="directory TaskLibrary functions. Will be added to the built-in "
        "and the one specified in DAISY_TASKLIBRARY environment variable "
        "[%default]")

    parser.add_option("--always-mount",
                      dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.add_option("--keep-failed-temp",
                      dest="keep_failed_temp",
                      action="store_true",
                      help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)), "TaskLibrary",
                     "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get("DAISY_TEST_DATADIR",
                                            test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.load(re.sub("DATADIR", data_directory, raw_txt))
            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
        ("tool", TestTool, map_tool_to_runner),
        ("metric", TestMetric, map_metric_to_runner)
    ]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
Example #24
    def run(self, outfile, params):

        min_job_memory = 3
        if "-t" in params.options:
            job_threads = int(re.search(r"-t\s*(\d+)",
                                        params.options).groups()[0])
        else:
            job_threads = 1

        job_memory = "{}G".format(
            float(min_job_memory + 1.0 * job_threads) / job_threads)

        cram_fasta = params.cram_fasta
        if params.cram_fasta is None:
            cram_fasta = params.reference_fasta

        if params.set_readgroup or params.readgroup_id_regex is not None:
            readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
                outfile, params)

            # pipes.quote needs to become shlex.quote in py3
            readgroup_option = "-R {}".format(pipes.quote(readgroup_string))
            # add additional level of quoting:
            readgroup_option = re.sub("\\t", "\\\\t", readgroup_option)
        else:
            readgroup_option = ""

        fastq = " ".join(sra_peek(params.sra))
        outfile = os.path.abspath(outfile)

        if params.extract_to_temp:
            tmpdir = P.get_temp_filename(clear=True)
            tmpdir_pre = "mkdir {};".format(tmpdir)
            tmpdir_post = "rm -rf {}".format(tmpdir)
        else:
            tmpdir = os.path.dirname(outfile)
            tmpdir_pre = ""
            tmpdir_post = ""

        # AH: fastq-dump hangs with arv mounts, thus try copying first
        if not IOTools.is_local(params.sra):
            E.warn("copying file {} to temporary directory".format(params.sra))
            temp_sra = os.path.join(
                tmpdir, os.path.basename(params.sra))
            fastq_dump = (
                "cp {params.sra}* {tmpdir}; "
                "fastq-dump --split-files --gzip {temp_sra} >& {outfile}.dump.log ".format(
                    **locals()))
            tmpdir_post = "rm -f {}*; {}".format(
                temp_sra, tmpdir_post)
        else:
            fastq_dump = (
                "fastq-dump --split-files --gzip {params.sra} "
                ">& {outfile}.dump.log ".format(**locals()))

        return P.run(
            "{tmpdir_pre} "
            "cd {tmpdir}; "
            "{fastq_dump}; "
            "{self.path} mem -v 3 "
            "{readgroup_option} "
            "{params.options} "
            "{params.reference_fasta} "
            "{fastq} "
            "2> {outfile}.map.log "
            "| samtools view -O cram --reference {params.cram_fasta} /dev/stdin "
            "2> {outfile}.view.log "
            "| samtools sort -T {tmpdir} -O cram /dev/stdin "
            "2> {outfile}.sort.log "
            "> {outfile}; "
            "samtools index {outfile} >& {outfile}.index.log; "
            "{tmpdir_post}".format(**locals()))
Example #25
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-l",
                      "--link",
                      dest="link",
                      action="store_true",
                      help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner, config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = os.path.basename(os.path.join(taskf.__name__ + ".dir"))
        new_data.append((result_dir, taskf))
        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        max_count = max(counts.values())
        max_count_items = [
            x for x, y in list(counts.items()) if y == max_count
        ]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write("\t".join(map(str, (old_dir, new_dir,
                                                 max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()
Example #26
    def run(self, outfile, params):

        local_options = []
        outfile = os.path.abspath(outfile)
        outdir = os.path.dirname(outfile)

        # assumption is that the index directory is named like the reference
        # fasta (xyz.fa), but without the .fa/.fasta suffix
        reference_fasta = IOTools.snip(params.reference_fasta, ".fa", ".fasta")
        if not os.path.exists(reference_fasta):
            raise ValueError("input reference {} does not exist".format(reference_fasta))

        if "--jobs" in params.options or "-j" in params.options:
            job_threads = int(re.search(r"(--jobs|-j)\s*(\d+)",
                                        params.options).groups()[1])
        else:
            job_threads = 8

        if "--memory-limit" in params.options or "-m" in params.options:
            job_memory_gb = int(re.search(r"(--memory-limit|-m)\s*(\d+)",
                                          params.options).groups()[1])
        else:
            job_memory_gb = 60
            local_options.append("--memory-limit {}".format(job_memory_gb))

        if job_memory_gb < 60:
            E.warn("isaac-align likely to require at least 60Gb of memory, {}G requested".format(
                job_memory_gb))

        job_memory = "{}G".format(float(job_memory_gb) / job_threads)

        fastq_dir = os.path.join(outdir, "input_fastq")
        if not os.path.exists(fastq_dir):
            os.makedirs(fastq_dir)

        if len(params.fastq) == 2:
            if not os.path.exists(os.path.join(fastq_dir, "lane1_read1.fastq.gz")):
                os.symlink(os.path.abspath(params.fastq[0]), os.path.join(fastq_dir, "lane1_read1.fastq.gz"))
            if not os.path.exists(os.path.join(fastq_dir, "lane1_read2.fastq.gz")):
                os.symlink(os.path.abspath(params.fastq[1]), os.path.join(fastq_dir, "lane1_read2.fastq.gz"))
        else:
            raise NotImplementedError(
                "expected 2 fastq files, received {}".format(len(params.fastq)))

        intermediate_bam = os.path.join(outdir,
                                        "Aligned",
                                        "Projects",
                                        "default",
                                        "default",
                                        "sorted.bam")

        # picard statement to set readgroup
        picard_statement = self.build_picard_statement(
            intermediate_bam,
            outfile,
            params)

        tmpdir = os.path.join(outdir, "TEMP")

        local_options = " ".join(local_options)
        # isaac generates output files in working directory, so do a cd and make
        # sure that absolute path names are used elsewhere.
        statement = (
            "cd {outdir}; "
            "{self.path} "
            "--reference-genome {reference_fasta}/sorted-reference.xml "
            "--base-calls {fastq_dir} "
            "--base-calls-format fastq-gz "
            "--temp-directory {tmpdir} "
            "--cleanup-intermediary 1 "
            "--bam-gzip-level {params.bam_gzip_level} "
            "{params.options} "
            "{local_options} "
            ">& {outfile}.isaac.log; "
            "{picard_statement}; "
            "rm -rf {tmpdir} "
            .format(**locals()))

        return P.run(statement)
Example #27
    def __call__(self, infiles, outfile, only_info=False):

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        # ensure output directory exists.
        # This should be done on the pipeline level, but
        # ruffus currently seems not to allow this.
        outdir = os.path.dirname(outfile)
        if outdir and not os.path.exists(outdir):
            os.makedirs(outdir)

        output_files = [
            self.map_table_to_file(x, outfile) for x in self.tablenames
        ]

        kwargs = {
            'output_files': output_files,
            'input_files': infiles,
            'outdir': outdir
        }

        if self._runtime_regex:
            kwargs["alias"] = self.build_alias(str(infiles),
                                               regex=self._runtime_regex,
                                               alias=self._runtime_alias)

        self.save_meta(outfile, **kwargs)

        if self.ignore:
            found = False
            for i in self.ignore:
                if i in outdir:
                    found = True
                    break

            if found:
                E.warn("skipping task {} at runtime, an empty file is created".
                       format(outfile))
                IOTools.touch_file(outfile)
                return

        # if self.runtime_filter:
        # TODO: create empty outfile if regex matches
        #    pass

        if only_info:
            E.warn(
                "only_info - meta information in {} has been updated".format(
                    IOTools.snip(outfile) + ".info"))
            return

        # AH: duplicated from above?
        params = self.build_params(output_files=output_files)

        on_error_options = ["raise", "ignore"]
        on_error = params.get("on_error", "raise")
        if on_error not in on_error_options:
            raise ValueError("unknown option to 'on_error': '{}' "
                             "should be one of '{}'".format(
                                 on_error, ",".join(on_error_options)))

        if self.ignore_task(infiles, outfile, params):
            return

        # deal with placeholder files created by identity that are
        # located on a remote mount point
        def map_to_mount(fn):
            if os.path.exists(fn + ".mnt"):
                if not P.PARAMS["mount_point"]:
                    raise ValueError(
                        "encountered mounted file {}, but no mount point present"
                        .format(fn))
                with open(fn + ".mnt") as inf:
                    mount_path = inf.read()
                return os.path.join(P.PARAMS["mount_point"], mount_path)
            else:
                return fn

        # replace infiles with mount locations if necessary
        if isinstance(infiles, list):
            infiles = [map_to_mount(x) for x in infiles]
        else:
            infiles = map_to_mount(infiles)

        try:
            benchmark = self.run(infiles, outfile, as_namedtuple(params))
        except Exception as ex:
            on_error = params.get("on_error", "raise")
            if on_error == "raise":
                raise
            elif on_error == "ignore":
                E.warn(
                    "error occured during execution of {} but will be ignored:\n{}"
                    .format(self.__name__, ex))
                E.warn(
                    "an empty output file {} will be created.".format(outfile))
                IOTools.touch_file(outfile)
                benchmark = None

        if benchmark:
            self.save_benchmark(outfile, benchmark)
Example #28
    def run(self, infiles, outfile, params):

        phase1_statements = []
        outdir = os.path.dirname(outfile)
        retvals = []
        vcf_files = []
        for fn in infiles:
            prefix = re.search(params.regex_filename, fn).groups()[0]

            out_fn = os.path.join(outdir, "file_{}".format(prefix))
            vcf_files.append(out_fn)

            if os.path.exists(out_fn + ".bcf"):
                continue

            phase1_statements.append("{self.path} ingest1 "
                                     "--output {out_fn} "
                                     "--fasta-ref {params.reference_fasta} "
                                     "{fn} "
                                     ">& {out_fn}.log; ".format(**locals()))

        phase2_statements = []
        block_files = []
        for start in range(0, len(vcf_files), self.block_size):

            out_fn = os.path.join(outdir, "block_{}".format(start))

            block_files.append(out_fn)

            if os.path.exists(out_fn + ".bcf"):
                continue

            end = start + self.block_size
            files = " ".join(
                ["{}.bcf".format(x) for x in vcf_files[start:end]])

            phase2_statements.append("{self.path} ingest2 "
                                     "--output {out_fn} "
                                     "{files} "
                                     ">& {out_fn}.log; ".format(**locals()))

        if phase2_statements:
            if phase1_statements:
                retvals.extend(P.run(phase1_statements, job_memory="4G"))
            else:
                E.warn("all files complete for phase 1")
            retvals.extend(P.run(phase2_statements, job_memory="4G"))
        else:
            E.warn("all files complete for phase 2")

        with pysam.VariantFile(block_files[0] + ".bcf") as bcf_file:
            contigs = list(bcf_file.header.contigs)

        files = " ".join(["{}.bcf".format(x) for x in block_files])
        phase3_statements = []
        chromosome_files = []
        for contig in contigs:

            out_fn = os.path.join(outdir, "chr_{}.bcf".format(contig))
            chromosome_files.append(out_fn)
            if os.path.exists(out_fn):
                continue

            phase3_statements.append(
                "{self.path} genotype "
                "--thread 4 "
                "--output-file {out_fn} "
                "--output-type b "
                "-r {contig} "
                "{files} "
                ">& {out_fn}.log; "
                "bcftools index {out_fn}".format(**locals()))

        retvals.extend(P.run(phase3_statements, job_memory="4G",
                             job_threads=4))

        if phase3_statements or not os.path.exists(outfile):
            files = " ".join(chromosome_files)
            retvals.extend(
                P.run("bcftools concat "
                      "-o {outfile} "
                      "-O z "
                      "{files} "
                      ">& {outfile}_concat.log; "
                      "tabix -p vcf {outfile}".format(**locals())))

        return retvals