def run(self, infile, outfiles, params):
        """Split a VCF file into one bgzipped, tabix-indexed VCF per contig.

        For every contig listed in the header of ``infile`` a shell
        statement is built that extracts the contig with ``tabix``,
        compresses it with ``bgzip`` and indexes the result. Outputs
        that pysam cannot open afterwards are treated as empty and
        their directory is removed.

        :param infile: VCF file; opened with ``pysam.VariantFile`` and
            queried with ``tabix`` in the shell statement.
        :param outfiles: filename template; ``outfiles.format(chrom)``
            gives the output filename for contig ``chrom``.
        :param params: not used by this method.
        """

        # Read the contig names once and release the file handle before
        # any job is started, so that a failing job cannot leak it.
        with pysam.VariantFile(infile) as tbxfile:
            contigs = list(tbxfile.header.contigs)

        output_files = [outfiles.format(chrom) for chrom in contigs]

        statements = []
        for chrom, output_file in zip(contigs, output_files):
            output_dir = os.path.dirname(output_file)
            # -p: do not complain if the directory already exists
            statements.append(
                "mkdir -p {output_dir}; "
                "tabix -h {infile} {chrom} | bgzip > {output_file}; "
                "tabix -p vcf {output_file} ".format(
                    output_dir=output_dir,
                    infile=infile,
                    chrom=chrom,
                    output_file=output_file))

        P.run(statements)

        # clean up empty vcfs, opening empty VCF in pysam throws
        # ValueError
        for output_file in output_files:
            try:
                pysam.VariantFile(output_file).close()
            except ValueError:
                E.warn("removing empty VCF {}".format(output_file))
                shutil.rmtree(os.path.dirname(output_file))
def get_reference_for_bam(bamfile, fastafiles):
    """Find the fasta file matching the sequence dictionary of a BAM file.

    The reference names and lengths stored in ``bamfile`` are collected
    into a dictionary and handed, together with ``fastafiles``, to
    ``match_sequence_dictionaries``.

    :param bamfile: :term:`BAM` formatted file.
    :param fastafiles: list of :term:`fasta` formatted files; passed on
        unchanged to ``match_sequence_dictionaries``.
    :return: the tuple (filename, diffs) produced by
        ``match_sequence_dictionaries``, or ``(None, None)`` if the
        BAM file could not be opened.

    """

    # Temporary fix: see issue SYS-517
    # A missing file is only reported here; opening is still attempted
    # below and its failure is what ends the function.
    if not os.path.exists(bamfile):
        E.warn("could not find file {}".format(bamfile))

    try:
        with pysam.AlignmentFile(bamfile, check_sq=False) as bam_handle:
            sequence_dict = dict(
                zip(bam_handle.references, bam_handle.lengths))
    except IOError as ex:
        E.warn("could not open bamfile {}: {}".format(bamfile, ex))
        return None, None

    matched_fasta, mismatches = match_sequence_dictionaries(
        sequence_dict, fastafiles)
    return matched_fasta, mismatches
示例#3
0
    def save_benchmark(self, outfile, benchmark):
        """Write benchmark records to a tab-separated meta file.

        :param outfile: output filename of the task; the benchmark
            filename is derived from it with
            ``self.build_meta_filename(outfile, "benchmark.bench")``.
        :param benchmark: a single record or a (possibly nested) list
            of records. Records are expected to expose ``_fields``
            (as namedtuples do); ``None`` entries are dropped.
        """
        records = benchmark if isinstance(benchmark, list) else [benchmark]

        # flatten if nested list and remove None
        records = [
            record
            for record in IOTools.flatten(records, ltypes=(list, ))
            if record is not None
        ]

        filename = self.build_meta_filename(outfile, "benchmark.bench")

        if not records:
            E.warn("could not save benchmark info to {}".format(filename))
            return

        try:
            header = records[0]._fields
        except AttributeError as ex:
            E.warn("could not save benchmark timings for {}:"
                   " {} from {}".format(outfile, str(ex), str(records[0])))
            return

        with open(filename, "w") as outf:
            outf.write("\t".join(header) + "\n")
            outf.writelines(
                "\t".join(str(value) for value in record) + "\n"
                for record in records)
    def __call__(self, infiles, outfile, only_info=False):
        """Record meta information, run the task and save its benchmark.

        :param infiles: input file(s) of the task.
        :param outfile: output file of the task. An empty list means
            the output name is not known; it is then derived from
            ``infiles``, ``self.name`` and ``self.output``.
        :param only_info: ignored; the value is always taken from the
            presence of ``"only_info"`` in ``P.PARAMS``.
        """

        # NOTE: extras not implemented in ruffus 2.6.3, thus
        # use parameter:
        only_info = "only_info" in P.PARAMS

        self.input_files = collect_file_meta_information({"in": infiles})
        self.input_alias = self.build_alias(
            str(infiles),
            regex=self._input_regex,
            alias=self._input_alias)

        is_empty_outfile = outfile == []
        if not is_empty_outfile:
            self.save_meta(outfile)
        else:
            # patch for missing outfiles when number of outfiles is not
            # known (is it a ruffus thing?)
            assert isinstance(infiles, str)
            outdir = "{}/{}.dir".format(os.path.dirname(infiles), self.name)
            outfile = os.path.join(outdir, self.output)
            # dummy.info gets removed and then added
            self.save_meta(os.path.join(outdir, "dummy.info"))

        if only_info:
            meta_filename = self.build_meta_filename(outfile,
                                                     "benchmark.info")
            E.warn(
                "only_info - meta information in {} has been updated".format(
                    meta_filename))
            return

        benchmark = self.run(infiles, outfile,
                             as_namedtuple(self.build_params()))

        # without a known output file there is nowhere to attach timings
        if not is_empty_outfile:
            self.save_benchmark(outfile, benchmark)
    def run(self, infile, outfile, params):
        """Report which reference sequence a BAM file matches.

        Writes a two-line, tab-separated table (``filename``,
        ``reference``, ``path``) to ``outfile``. ``reference`` is the
        matching fasta file, ``"corrupted"`` if the lookup returned no
        differences at all, or ``"unknown"`` if no fasta file matched.

        :param infile: BAM file passed to ``get_reference_for_bam``.
        :param outfile: table to write.
        :param params: must provide ``reference_fasta_map``.
        :raises ValueError: if ``params.reference_fasta_map`` is None.
        :return: None
        """

        if params.reference_fasta_map is None:
            raise ValueError("bam2reference requires a reference sequence map")

        reference_fasta_map = build_reference_fasta_map(
            params.reference_fasta_map)

        fasta = resolve_argument(list(reference_fasta_map.values()),
                                 ",").split(",")
        retval, diff = get_reference_for_bam(infile, fasta)

        if retval is not None:
            # invert the map to look up the name belonging to the path
            map_path2name = {
                fasta_path: name
                for name, fasta_path in reference_fasta_map.items()
            }
            path = map_path2name.get(retval, os.path.basename(retval))
        elif diff is None:
            retval, path = "corrupted", ""
        else:
            retval, path = "unknown", ""
            E.debug("differences: {}".format(str(diff)))

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("filename\treference\tpath\n")
            outf.write("\t".join((infile, retval, path)) + "\n")

        return None
    def run(self, infile, outfile, params):
        """Convert the header of an alignment file into a tidy table.

        The header is dumped with ``{params.path} view -H`` into
        ``<outfile>.tmp`` and then rewritten to ``outfile`` as a
        tab-separated table with the columns ``header_tag``, ``tag``,
        ``lineno`` and ``value``. Comment lines (``CO``) are kept as a
        single value; all other lines yield one row per ``tag:value``
        field.

        :param infile: alignment file given to the ``view -H`` command.
        :param outfile: table to write; ``<outfile>.log`` receives
            stderr and ``<outfile>.tmp`` is removed at the end.
        :param params: must provide ``path``, the executable to call.
        :return: the return value of ``P.run``, or None if it raised
            ``OSError``.
        """

        try:
            retval = P.run("{params.path} view -H  "
                           "{infile} "
                           "2> {outfile}.log "
                           "> {outfile}.tmp; ".format(
                               params=params,
                               infile=infile,
                               outfile=outfile))
        except OSError as e:
            E.warn("input file {} gave the following errors: {}".format(
                infile, str(e)))
            # keep the name bound so the final return cannot fail
            retval = None

        with open(outfile, "w") as outf, open(outfile + ".tmp") as inf:
            outf.write("header_tag\ttag\tlineno\tvalue\n")
            for lineno, line in enumerate(inf):
                # drop the leading "@" and the line terminator only;
                # a last line without newline keeps its final character
                fields = line.rstrip("\n")[1:].split("\t")
                header_tag = fields[0]
                if header_tag == "CO":
                    # Do not split comment lines
                    outf.write("\t".join((header_tag, "", str(lineno),
                                          "\t".join(fields[1:]))) + "\n")
                else:
                    for field in fields[1:]:
                        sub_tag, content = field.split(":", 1)
                        outf.write("\t".join((header_tag, sub_tag, str(lineno),
                                              content)) + "\n")

        os.unlink(outfile + ".tmp")
        return retval
    def run(self, outfile, params):
        """Build and run a HaplotypeCaller command plus calibration steps.

        The caller statement is only added if the raw output
        (``IOTools.snip(outfile, ".vcf.gz") + ".raw.vcf.gz"``) does not
        exist yet; the statements returned by
        ``self.build_calibration_workflow`` are always appended.

        :param outfile: final output filename.
        :param params: must provide ``bam`` and ``haplotypecaller``;
            also passed to ``get_reference`` and
            ``build_calibration_workflow``.
        :return: the return value of ``self.run_statements``.
        """

        bam = resolve_argument(params.bam)
        reference_fasta = get_reference(params)

        prefix = IOTools.snip(outfile, ".vcf.gz")
        vcf_output = prefix + ".raw.vcf.gz"

        stmnts = []
        if os.path.exists(vcf_output):
            E.warn("output file {vcf_output} already exists - "
                   "it will not be recomputed".format(vcf_output=vcf_output))
        else:
            # %(tmpdir)s is left untouched by str.format
            stmnts.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type HaplotypeCaller "
                "--input_file {bam} "
                "--reference_sequence {reference_fasta} "
                "--logging_level INFO "
                "--log_to_file {outfile}.HaplotypeCaller.log "
                "{params.haplotypecaller} "
                "--out {vcf_output} "
                ">& {prefix}.HaplotypeCaller.err".format(
                    self=self,
                    bam=bam,
                    reference_fasta=reference_fasta,
                    outfile=outfile,
                    params=params,
                    vcf_output=vcf_output,
                    prefix=prefix))

        stmnts.extend(
            self.build_calibration_workflow(outfile, prefix, vcf_output,
                                            params))

        return self.run_statements(stmnts, job_memory="5G")
示例#8
0
 def inner(self, outfile, *args, **kwargs):
     """Call ``f`` and touch ``outfile`` afterwards.

     Any exception raised by ``f`` is reported as a warning and
     swallowed; ``outfile`` is touched whether or not ``f`` failed.
     ``f`` is called without arguments, ``args`` and ``kwargs`` are
     not forwarded.

     NOTE(review): ``f`` is not defined in this block - presumably it
     is captured from an enclosing decorator; confirm.
     """
     try:
         f()
     except Exception as ex:
         message = "received exception {} - touching {}".format(
             str(ex), outfile)
         E.warn(message)
     IOTools.touch_file(outfile)
    def run(self, outfile, params):
        """Call variants by piping ``mpileup`` output into ``call``.

        The result is written to ``outfile`` and indexed with
        ``tabix``; stderr of the two stages goes to
        ``<outfile>.pileup.log`` and ``<outfile>.call.log``.

        :param outfile: compressed VCF file to create.
        :param params: must provide ``bam``, ``options``,
            ``samtools_options``, ``path_samtools`` and ``path``.
        :return: the return value of ``P.run``.
        """

        bam = resolve_argument(params.bam, sep=" ")
        reference_fasta = get_reference(params)

        # warning: requires -m or -c in the options
        if not any(flag in params.options
                   for flag in ("--multiallelic-caller", "-m",
                                "-c", "--consensus-caller")):
            E.warn("bcftools call requires -m or -c, got {}".format(
                params.options))

        # limit number of jobs to node to limit I/O
        # NOTE(review): never read in this method - presumably picked up
        # by P.run from the caller's local variables; keep the name.
        job_threads = 4

        return P.run(
            ("{params.path_samtools} mpileup "
             "-ug "
             "-f {reference_fasta} "
             "{params.samtools_options} "
             "{bam} "
             "2> {outfile}.pileup.log "
             "| {params.path} call "
             "--variants-only "
             "--output-type z "
             "{params.options} "
             "2> {outfile}.call.log "
             "> {outfile}; "
             "tabix -p vcf {outfile} ").format(
                 params=params,
                 reference_fasta=reference_fasta,
                 bam=bam,
                 outfile=outfile))
    def get_version(self):
        """Return the manta and strelka versions joined by ``-``.

        Each version is the stripped standard output of calling the
        respective executable (``self.path_manta``,
        ``self.path_strelka``) with ``--version``.
        """
        versions = []
        for executable in (self.path_manta, self.path_strelka):
            versions.append(
                E.run("{} --version".format(executable),
                      return_stdout=True).strip())
        return "-".join(versions)
示例#11
0
    def _test_task_will_run(self, taskf):
        """Run a tool task on configured test data and check its output.

        The test is skipped if ``get_task_version`` returns None for the
        task or if any input slot listed in ``taskf.expected`` has no
        data in ``self.test_config["tool"]``. Task-specific entries
        take precedence over generic ones.

        :param taskf: task class; instantiated without arguments.
        """

        # check if task is installed
        if get_task_version(taskf) is None:
            self.skipTest("tools for task {} not available".format(taskf.name))
            return

        # define input/output files
        tool_config = self.test_config["tool"]
        input_files = {}
        for slot in taskf.expected:
            # generic input files ...
            path = tool_config.get(slot, None)
            if taskf.name in tool_config:
                # ... unless task specific input files are given
                path = tool_config[taskf.name].get(slot, path)

            if path is None:
                self.skipTest(
                    "data for input slot {} not provided for {}".format(
                        slot, taskf.name))
                return

            input_files[slot] = path

        tmpdir = tempfile.mkdtemp(dir=".", prefix="tmp_{}".format(taskf.name))
        if isinstance(taskf.output, str):
            outfile = os.path.join(tmpdir, taskf.output)
        else:
            outfile = [os.path.join(tmpdir, name) for name in taskf.output]

        # instantiate task
        task = taskf()
        # set custom options for test
        if taskf.name in tool_config:
            for option, value in tool_config[taskf.name].items():
                setattr(task, option, value)

        # run task
        task.register_input(input_files)
        task(input_files.values(), outfile)

        # check if tool produced non-zero output
        produced = outfile if isinstance(outfile, list) else [outfile]
        for filename in produced:
            self.assertTrue(os.path.exists(filename))
            self.assertGreater(os.path.getsize(filename), 0)

        # cleanup
        try:
            shutil.rmtree(tmpdir)
        except OSError as ex:
            E.warn("could not remove {}: {}".format(tmpdir, ex))
    def run(self, infile, outfile, params):
        """Run picard CollectMultipleMetrics and split its output tables.

        For every name in ``self.tablenames`` the picard output
        ``<outfile>.<raw>`` is read and written to
        ``<outfile>.<tablename>.tsv``. ``raw`` is the table name with
        ``histogram`` replaced by ``metrics`` and the leading
        ``picard_`` characters removed. Only the ``## METRICS`` or
        ``## HISTOGRAM`` section is kept for tables ending in
        ``metrics`` or ``histogram``.

        :param infile: file passed to picard as ``INPUT``.
        :param outfile: prefix passed to picard as ``OUTPUT``; it also
            receives stdout/stderr of the command.
        :param params: must provide ``path`` and ``options``;
            ``reference_fasta`` is used if it is one of its fields.
        :return: the return value of ``P.run``.
        """

        reference_fasta = (
            "REFERENCE_SEQUENCE={}".format(params.reference_fasta)
            if "reference_fasta" in params._fields else "")

        # command can fail when no output is produced, but still produce output
        # 12G is required for java overhead
        retval = P.run(
            ("java -Xmx8000m -jar {params.path} "
             "CollectMultipleMetrics "
             "{reference_fasta} "
             "INPUT={infile} "
             "TMP_DIR=%(tmpdir)s "
             "{params.options} "
             "OUTPUT={outfile} "
             ">& {outfile} ").format(
                 params=params,
                 reference_fasta=reference_fasta,
                 infile=infile,
                 outfile=outfile),
            job_memory="12G",
            ignore_errors=True)

        def get_section(section, data):
            # keep the "## <section>" header line and everything up to
            # the next line starting with "##"
            pattern = "## {}".format(section)
            result, keep = [], False
            for line in data:
                if line.startswith("##"):
                    keep = line.startswith(pattern)
                if keep:
                    result.append(line)
            return result

        for tablename in self.tablenames:
            raw = re.sub("histogram", "metrics", tablename)[len("picard_"):]
            src = outfile + "." + raw
            dest = outfile + "." + tablename + ".tsv"

            if not os.path.exists(src):
                E.warn("no file {}, ignored".format(src))
                continue

            with IOTools.open_file(src) as inf:
                data = inf.readlines()

            for suffix, section in (("metrics", "METRICS"),
                                    ("histogram", "HISTOGRAM")):
                if tablename.endswith(suffix):
                    data = get_section(section, data)
                    break

            with IOTools.open_file(dest, "w") as outf:
                outf.write("".join(data))

        return retval
    def run(self, outfile, params):
        """Call variants per BAM file as GVCF and genotype them jointly.

        One HaplotypeCaller statement is run per BAM file, writing
        ``<prefix>.<idx>.g.vcf``; existing GVCF files are not
        recomputed. All GVCF files are then combined with
        GenotypeGVCFs into ``<prefix>.raw.vcf.gz``, followed by the
        statements returned by ``self.build_calibration_workflow``.
        ``prefix`` is ``IOTools.snip(outfile, ".vcf.gz")``.

        :param outfile: final output filename.
        :param params: must provide ``bam``, ``haplotypecaller`` and
            ``genotypegvcfs``; also passed to ``get_reference`` and
            ``build_calibration_workflow``.
        :return: the return value of the final ``self.run_statements``.
        """

        prefix = IOTools.snip(outfile, ".vcf.gz")
        bams = resolve_argument(params.bam, ",")
        reference_fasta = get_reference(params)

        statements, gvcfs = [], []
        # TODO: sort out multi-threading
        for idx, bam in enumerate(bams.split(",")):
            output = prefix + "." + str(idx) + ".g.vcf"
            gvcfs.append(output)

            if os.path.exists(output):
                E.info("{} already exists - skipped".format(output))
                continue

            statements.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type HaplotypeCaller "
                "--input_file {bam} "
                "--reference_sequence {reference_fasta} "
                "--emitRefConfidence GVCF "
                "--logging_level INFO "
                "--log_to_file {prefix}.HaplotypeCaller.{idx}.log "
                "{params.haplotypecaller} "
                "--out {output} "
                ">& {prefix}.HaplotypeCaller.{idx}.err".format(
                    self=self,
                    bam=bam,
                    reference_fasta=reference_fasta,
                    prefix=prefix,
                    idx=idx,
                    params=params,
                    output=output))

        if statements:
            self.run_statements(statements, job_memory="4G")

        # command line arguments listing every GVCF, old and new
        variant_args = " ".join("--variant {}".format(x) for x in gvcfs)
        vcf_output = prefix + ".raw.vcf.gz"

        # stderr goes to a ".err" file like for the other java commands
        stmnts = [
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type GenotypeGVCFs "
            "--reference_sequence {reference_fasta} "
            "{gvcfs} "
            "--logging_level INFO "
            "--log_to_file {prefix}.GenotypeGVCFs.log "
            "{params.genotypegvcfs} "
            "--out {vcf_output} "
            ">& {prefix}.GenotypeGVCFs.err".format(
                self=self,
                reference_fasta=reference_fasta,
                gvcfs=variant_args,
                prefix=prefix,
                params=params,
                vcf_output=vcf_output)
        ]

        stmnts.extend(
            self.build_calibration_workflow(outfile, prefix, vcf_output,
                                            params))

        return self.run_statements(stmnts, job_memory="4G")
    def run(self, outfile, params):
        """Annotate variants with VariantAnnotator and calibrate them.

        If ``<prefix>.annotated.vcf.gz`` already exists, only the
        statements from ``self.build_calibration_workflow`` are run.
        Otherwise the statements returned by ``self.pre_process``
        surround the annotation and calibration statements.
        ``prefix`` is ``IOTools.snip(outfile, ".vcf.gz")``.

        :param outfile: final output filename.
        :param params: must provide ``bam``, ``vcf`` and ``options``;
            also passed to ``get_reference``, ``pre_process`` and
            ``build_calibration_workflow``.
        :return: the return value of ``self.run_statements``.
        """

        prefix = IOTools.snip(outfile, ".vcf.gz")
        annotated = prefix + ".annotated.vcf.gz"

        bam = resolve_argument(params.bam, sep=",")
        reference_fasta = get_reference(params)

        # one --input_file argument per BAM file
        bam = " ".join(["--input_file {}".format(x) for x in bam.split(",")])

        stmnts = []
        if os.path.exists(annotated):
            E.warn("using pre-existing file {} with annotated variants".format(
                annotated))

            stmnts.extend(
                self.build_calibration_workflow(outfile, prefix, annotated,
                                                params))
        else:
            tmpfile, pre_statement, post_statement = self.pre_process(
                params.vcf, outfile, params)

            stmnts.append(pre_statement)
            stmnts.append(
                "java "
                "-Djava.io.tmpdir=%(tmpdir)s "
                "-jar {self.path} "
                "--analysis_type VariantAnnotator "
                "--variant {tmpfile} "
                "{bam} "
                "--reference_sequence {reference_fasta} "
                "--logging_level INFO "
                "--log_to_file {prefix}.VariantAnnotator.log "
                "--annotation FisherStrand "
                "--annotation StrandOddsRatio "
                "--annotation ReadPosRankSumTest "
                "--annotation RMSMappingQuality "
                "--annotation MappingQualityRankSumTest "
                "{params.options} "
                "--out {prefix}.annotated.vcf.gz "
                ">& {prefix}.VariantAnnotator.err".format(
                    self=self,
                    tmpfile=tmpfile,
                    bam=bam,
                    reference_fasta=reference_fasta,
                    prefix=prefix,
                    params=params))

            stmnts.extend(
                self.build_calibration_workflow(outfile, prefix, annotated,
                                                params))

            stmnts.append(post_statement)

        return self.run_statements(stmnts, job_memory="3G")
示例#15
0
    def ignore_task(self, infiles, outfiles, params):
        """Tell whether this task should be skipped.

        The task is ignored if any entry of ``self._ignore`` occurs as
        a substring of ``str(outfiles)``. In that case every output
        file is created as an empty file before True is returned.

        :param infiles: not used.
        :param outfiles: output filename(s) of the task.
        :param params: not used.
        :return: True if the task is ignored, otherwise False.
        """
        if not self._ignore:
            return False

        target = str(outfiles)
        if not any(pattern in target
                   for pattern in IOTools.val2list(self._ignore)):
            return False

        E.warn("task {} will be ignored".format(self.__name__))
        for filename in IOTools.val2list(outfiles):
            E.info("creating empty file {}".format(filename))
            IOTools.touch_file(filename)
        return True
示例#16
0
 def get_version(self):
     """Check that vcf-concat can be run; its version is not reported.

     :return: the string ``"unknown"`` if the output of
         ``<self.path> -h`` contains the vcf-concat usage line.
     :raises ValueError: if the usage line is missing.
     """
     help_string = E.run("{} -h".format(self.path),
                         return_stderr=True).strip()
     if "vcf-concat [OPTIONS]" not in help_string:
         raise ValueError("vcf-concat not found")
     return "unknown"
    def run(self, infile, outfile, params):
        """Write the ``flagstat`` summary of an alignment file as a table.

        A header line (``counts``, ``counts_fail``, ``category``) is
        written to ``outfile`` first; the command output, reformatted
        by a perl one-liner, is appended to it.

        :param infile: alignment file given to the ``flagstat`` command.
        :param outfile: table to write; stderr goes to
            ``<outfile>.log``.
        :param params: must provide ``path``, the executable to call.
        :return: the return value of ``P.run``, or None if it raised
            ``OSError``.
        """

        with open(outfile, "w") as outf:
            outf.write("counts\tcounts_fail\tcategory\n")

        try:
            # backslashes are doubled so that perl receives \+, \t and \(
            retval = P.run("{params.path} flagstat "
                           "{infile} "
                           "2> {outfile}.log "
                           "| perl -p -e 's/ \\+ /\\t/; s/ /\\t/; s/\\(.*//' "
                           ">> {outfile}; ".format(
                               params=params,
                               infile=infile,
                               outfile=outfile))
        except OSError as e:
            E.warn("input  file {} gave the following errors: {}".format(
                infile, str(e)))
            # keep the name bound so the final return cannot fail
            retval = None

        return retval
    def run(self, infile, outfile, params):
        """Write the ``idxstats`` output of an alignment file as a table.

        A header line (``chromosome``, ``size``, ``mapped``,
        ``unmapped``) is written to ``outfile`` first; the command
        output is appended to it.

        :param infile: alignment file given to the ``idxstats`` command.
        :param outfile: table to write; stderr goes to
            ``<outfile>.log``.
        :param params: must provide ``path``, the executable to call.
        :return: the return value of ``P.run``, or None if it raised
            ``OSError``.
        """

        header = "\t".join(("chromosome", "size", "mapped", "unmapped"))
        with open(outfile, "w") as outf:
            outf.write(header + "\n")

        try:
            return P.run(
                ("{params.path} idxstats "
                 "{infile} "
                 "2> {outfile}.log "
                 ">> {outfile}; ").format(
                     params=params,
                     infile=infile,
                     outfile=outfile))
        except OSError as e:
            E.warn("input  file {} gave the following errors: {}".format(
                infile, str(e)))
            return None
示例#19
0
def expand_globs(config, is_test=False):
    """detect and expand glob expressions in the input section.

    A glob expression is any filename that contains a '*'. Multiple
    glob expressions can be combined on the same line by a ','.

    A "find" expression is detected starting with 'find'. These
    expressions will be evaluated in a shell and the results inserted
    into the dictionary.

    If a filename starts with "file=", the contents of the file
    following the "=" are read and inserted. Multiple files can be
    separated by a ','.

    If a glob or find expression is evaluated to nothing, an exception
    is raised unless ``is_test`` is set. In that case, two files will be
    returned called "test1" and "test2".

    :param config: nested configuration; modified in place.
    :param is_test: if True, substitute dummy file names for
        expressions that expand to nothing instead of raising.
    :return: ``config``
    :raises ValueError: if an expression expands to nothing and
        ``is_test`` is not set.
    """

    for d, key, value in IOTools.nested_iter(config):
        if not isinstance(value, str):
            continue

        if value.startswith("find"):
            try:
                data = E.run(value, return_stdout=True)
            except Exception as ex:
                # use the output attached to the error, if any;
                # otherwise report the original error instead of an
                # unrelated AttributeError
                data = getattr(ex, "output", None)
                if data is None:
                    raise
            d[key] = [x for x in data.split("\n") if x]
        elif "*" in value:
            if "," in value:
                d[key] = [fn
                          for pattern in value.split(",")
                          for fn in glob.glob(pattern.strip())]
            else:
                d[key] = glob.glob(value)
        elif value.startswith("file="):
            # split on the first "=" only, file names may contain "="
            filenames = [x.strip()
                         for x in value.split("=", 1)[1].split(",")]
            paths = []
            for fn in filenames:
                with IOTools.open_file(fn) as inf:
                    paths.extend([x.strip() for x in inf if x.strip()])
            d[key] = paths

        # NOTE: this also applies to plain strings that are empty
        if len(d[key]) == 0:
            if not is_test:
                raise ValueError(
                    "expression '{}' expanded to nothing".format(value))

            # insert some random files for testing purposes:
            if "*" in value:
                # replace glob expressions
                value = re.sub(",.*", "", value)
                d[key] = [re.sub("[*]", "test1", value),
                          re.sub("[*]", "test2", value)]
            elif "bam" in value:
                d[key] = ["test1.bam", "test2.bam"]
            elif "vcf" in value:
                d[key] = ["test1.vcf.gz", "test2.vcf.gz"]
            else:
                d[key] = ["test1.txt", "test2.txt"]
    return config
 def get_version(self):
     """Check that pbsim can be run; its version is not reported.

     :return: the string ``"unknown"`` if the output of running
         ``self.path`` without arguments contains the pbsim usage line.
     :raises ValueError: if the usage line is missing.
     """
     help_string = E.run("{}".format(self.path),
                         return_stderr=True).strip()
     if "USAGE: pbsim" not in help_string:
         raise ValueError("pbsim not found")
     return "unknown"
示例#21
0
    def _test_task_will_run(self, taskf):
        """Run a metric task on configured test data and check its output.

        Input files are taken from the first section of
        ``self.test_config["metric"]`` that yields a non-empty
        ``files`` entry and whose key equals the task name or, failing
        that, one of whose ``patterns`` matches the task name. The
        test is skipped if ``get_task_version`` returns None or no
        input files are found.

        :param taskf: task class; instantiated without arguments.
        """
        # check if task is installed
        if get_task_version(taskf) is None:
            self.skipTest("tools for task {} not available".format(taskf.name))
            return

        # define input/output files
        metric_config = self.test_config["metric"]
        infiles = None
        for section, values in metric_config.items():
            if section == taskf.name:
                infiles = values.get("files", None)
            elif "patterns" in values:
                if any(re.search(pattern, taskf.name)
                       for pattern in values["patterns"]):
                    infiles = values.get("files", None)
            if infiles:
                break

        if infiles is None:
            self.skipTest("no input files specified for {}".format(taskf.name))
            return

        tmpdir = tempfile.mkdtemp(dir=".", prefix="tmp_{}_".format(taskf.name))
        outfile = os.path.join(tmpdir, "output.tsv")

        # instantiate task
        task = taskf()
        # set custom options for test
        if taskf.name in metric_config:
            for option, value in metric_config[taskf.name].items():
                setattr(task, option, value)

        # run task
        task(infiles, outfile)

        # check if tool produced non-zero output
        self.assertTrue(os.path.exists(outfile))
        self.assertGreater(os.path.getsize(outfile), 0)

        # cleanup
        try:
            shutil.rmtree(tmpdir)
        except OSError as ex:
            E.warn("could not remove {}: {}".format(tmpdir, ex))
示例#22
0
 def get_version(self):
     """Return the bedtools and gat versions separated by a space.

     The bedtools part comes from
     ``run_metric_bedtools_intersection.get_version``; the gat part is
     parsed from the output of ``<self.gat_path> --version``.
     """
     version_bedtools = run_metric_bedtools_intersection.get_version(self)
     help_string = E.run(
         "{} --version 2> /dev/null".format(self.gat_path),
         return_stdout=True).strip()
     match = re.search(r"gat-run.py version: (\S+):", help_string)
     return "{} {}".format(version_bedtools, match.group(1))
示例#23
0
 def get_version(self):
     """Return the BBMap version parsed from ``<self.path> -version``.

     :raises ValueError: if the command output is empty or contains
         ``not found``.
     """
     help_text = E.run("{} -version".format(self.path),
                       return_stderr=True).strip()
     if not help_text or "not found" in help_text:
         raise ValueError("bbmap not found at/as {}: {}".format(
             self.path, help_text))
     return re.search(r"BBMap version (\S+)", help_text).group(1)
 def get_version(self):
     """Return the Canu version parsed from ``<self.path> --version``.

     :raises ValueError: if the command output is empty or contains
         ``not found``.
     """
     help_string = E.run("{} --version".format(self.path),
                         return_stderr=True).strip()
     if not help_string or "not found" in help_string:
         raise ValueError("canu not found at/as {}: {}".format(
             self.path, help_string))
     return re.search("Canu (.+)", help_string).group(1)
示例#25
0
def main(argv=None):
    """Create a documentation stub for every tool module.

    For each ``*.py`` file in ``../src/daisy/tools`` (relative to this
    script) a file ``tools/<tool-name>.rst`` is written from
    ``TEMPLATE_TOOL``. ``__init__`` and ``cli`` are ignored; existing
    files are only overwritten if ``options.output_force`` is set.

    :param argv: command line arguments, passed on to ``E.start``.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    tools = glob.glob(
        os.path.join(os.path.dirname(__file__), "..", "src", "daisy", "tools",
                     "*.py"))

    counter = E.Counter()
    for tool in tools:
        counter.found += 1
        # strip only the file extension, not every "?py" in the name
        tool_module = os.path.splitext(os.path.basename(tool))[0]
        tool_name = tool_module.replace("_", "-")
        # compare the module name: in tool_name the underscores of
        # "__init__" have already been replaced
        if tool_module in ("__init__", "cli"):
            counter.ignored += 1
            continue

        dest = os.path.join("tools", "{}.rst".format(tool_name))

        if os.path.exists(dest) and not options.output_force:
            counter.skipped += 1
            continue

        # the template may refer to any local variable of this function
        with IOTools.openFile(dest, "w") as outf:
            outf.write(TEMPLATE_TOOL.format(**locals()))

        counter.new += 1

    E.info(counter)
    E.stop()
 def get_version(self):
     """Return the Delly version parsed from its output without arguments.

     :raises ValueError: if running ``self.path`` produced no output.
     """
     help_string = E.run("{self.path} ".format(**locals()),
                         return_stdout=True,
                         on_error="ignore").strip()
     if help_string:
         # raw string: "\(" and "\S" are not valid string escapes
         return re.search(r"Delly \(Version: (\S+)\)",
                          help_string).groups()[0]
     else:
         raise ValueError("delly not found at/as {}".format(self.path))
    def __call__(self, dataframe, map_sample2label=None):
        """Plot the mean value per GC bin as one line per sample.

        :param dataframe: table with the columns ``gc_bin``,
            ``sample`` and ``mean``.
        :param map_sample2label: optional dictionary mapping sample
            names to the labels used for the plotted columns. Samples
            not in the dictionary keep their name.
        :return: the object returned by ``DataFrame.plot``, or None if
            there is no data.
        """
        if map_sample2label is None:
            map_sample2label = {}

        df = dataframe.pivot(index="gc_bin", columns="sample",
                             values="mean").reset_index()

        # remove duplicate sample names
        # (in Cancer analysis: two blood samples)
        to_drop = [x for x in df.columns if x.startswith("2:")]
        df.drop(to_drop, axis=1, inplace=True)

        # relabel the columns; "gc_bin" is kept unless explicitly mapped
        df.columns = [map_sample2label.get(x, x) for x in df.columns]

        if df.empty:
            E.warn("no data, no plot will be output")
            return

        ax = df.plot(kind="line", x="gc_bin")

        return ax
示例#28
0
def get_sequence_length_dict(fastafn):
    """return sequence/length dictionary from a
    fasta file.

    The fasta file needs to be indexed with samtools faidx.

    :param fastafn: filename of the fasta file.
    :return: dictionary mapping sequence names to lengths, or None if
        the file does not exist or could not be opened.
    """
    # Temporary fix: see issue SYS-517
    if not os.path.exists(fastafn):
        E.warn("could not find file {}".format(fastafn))
        return None

    try:
        with pysam.FastaFile(fastafn) as inf:
            return dict(zip(inf.references, inf.lengths))
    except IOError as ex:
        # report the file name as well as the reason
        E.warn("file {} could not be opened: {}".format(fastafn, ex))
        return None
 def get_version(self):
     """Return the RTG Tools version parsed from ``<self.path> version``.

     :raises ValueError: if the command output is empty or contains
         ``not found``.
     """
     help_string = E.run(
         "{} version 2> /dev/null".format(self.path),
         return_stdout=True,
         on_error="ignore").strip()
     if not help_string or "not found" in help_string:
         raise ValueError("rtg not found at/as {}: {}".format(
             self.path, help_string))
     return re.search(r"Product: RTG Tools (\S+)", help_string).group(1)
 def get_version(self):
     """Check that lumpy can be run; version parsing is not implemented.

     :raises ValueError: if running ``self.path`` produced no output.
     :raises NotImplementedError: in all other cases.
     """
     help_string = E.run("{self.path} ".format(**locals()),
                         return_stdout=True,
                         on_error="ignore").strip()
     # lumpy express without arguments ends in error
     if not help_string:
         raise ValueError("lumpy not found at/as {}".format(self.path))

     # The version was meant to be parsed with the pattern
     # r"lumpy \(Version: (\S+)\)", but that code was never reachable.
     raise NotImplementedError()