def run(self, infile, outfiles, params):
    tbxfile = pysam.VariantFile(infile)
    statements = []
    for chrom in list(tbxfile.header.contigs):
        output_file = outfiles.format(chrom)
        output_dir = os.path.dirname(output_file)
        statements.append(
            "mkdir {output_dir}; "
            "tabix -h {infile} {chrom} | bgzip > {output_file}; "
            "tabix -p vcf {output_file} ".format(**locals()))

    retvals = P.run(statements)

    # clean up empty VCFs: opening an empty VCF in pysam throws
    # ValueError
    for chrom in list(tbxfile.header.contigs):
        output_file = outfiles.format(chrom)
        output_dir = os.path.dirname(output_file)
        try:
            f = pysam.VariantFile(output_file)
            f.close()
        except ValueError:
            E.warn("removing empty VCF {}".format(output_file))
            shutil.rmtree(output_dir)

    tbxfile.close()
    return retvals
def get_reference_for_bam(bamfile, fastafiles):
    """deduce the reference sequence used within a BAM file.

    This method compares the sequence dictionary in the bamfile
    with a list of fastafiles. The comparison stops at the first
    match that is found.

    :param bamfile: :term:`BAM` formatted file
    :param fastafiles: list of :term:`fasta` formatted files. The
        fasta files need to be indexed with samtools faidx.

    :return: a tuple (filename, diffs). The first element is the
        filename of the matching fasta file if one was found,
        otherwise None. If no match was found, diffs is a list of
        all input files together with their missing contigs and
        length mismatches.
    """

    diffs = []

    # Temporary fix: see issue SYS-517
    if not os.path.exists(bamfile):
        E.warn("could not find file {}".format(bamfile))

    try:
        with pysam.AlignmentFile(bamfile, check_sq=False) as inf:
            sequence_dict = dict(list(zip(inf.references, inf.lengths)))
    except IOError as ex:
        E.warn("could not open bamfile {}: {}".format(bamfile, ex))
        return None, None

    fastafn, diffs = match_sequence_dictionaries(
        sequence_dict, fastafiles)

    return fastafn, diffs
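# Hedged usage sketch for get_reference_for_bam (file names are
# hypothetical; each fasta must have been indexed with "samtools faidx"),
# following the return contract documented above:
#
#   fastafn, diffs = get_reference_for_bam(
#       "sample.bam", ["grch37.fa", "grch38.fa"])
#   if fastafn is not None:
#       E.info("BAM was aligned against {}".format(fastafn))
#   elif diffs is not None:
#       E.warn("no matching reference, differences: {}".format(diffs))
#   else:
#       E.warn("BAM file could not be opened")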
def save_benchmark(self, outfile, benchmark):
    if not isinstance(benchmark, list):
        benchmark = [benchmark]

    # flatten if nested list and remove None
    benchmark = [
        x for x in IOTools.flatten(benchmark, ltypes=(list,))
        if x is not None
    ]

    filename = self.build_meta_filename(outfile, "benchmark.bench")
    if not benchmark:
        E.warn("could not save benchmark info to {}".format(filename))
        return

    try:
        header = benchmark[0]._fields
    except AttributeError as ex:
        E.warn("could not save benchmark timings for {}:"
               " {} from {}".format(outfile, str(ex), str(benchmark[0])))
        return

    with open(filename, "w") as outf:
        outf.write("\t".join(header) + "\n")
        for b in benchmark:
            outf.write("\t".join(map(str, b)) + "\n")
def __call__(self, infiles, outfile, only_info=False):

    # NOTE: extras are not implemented in ruffus 2.6.3, thus
    # use a parameter instead:
    only_info = "only_info" in P.PARAMS

    self.input_files = collect_file_meta_information({"in": infiles})
    self.input_alias = self.build_alias(str(infiles),
                                        regex=self._input_regex,
                                        alias=self._input_alias)

    is_empty_outfile = outfile == []
    # patch for missing outfiles when the number of outfiles is not
    # known in advance (is it a ruffus thing?)
    if is_empty_outfile:
        assert isinstance(infiles, str)
        outdir = "{}/{}.dir".format(os.path.dirname(infiles), self.name)
        outfile = os.path.join(outdir, self.output)
        # dummy.info gets removed and then added
        self.save_meta(os.path.join(outdir, "dummy.info"))
    else:
        self.save_meta(outfile)

    if only_info:
        E.warn(
            "only_info - meta information in {} has been updated".format(
                self.build_meta_filename(outfile, "benchmark.info")))
        return

    params = self.build_params()
    benchmark = self.run(infiles, outfile, as_namedtuple(params))
    if not is_empty_outfile:
        self.save_benchmark(outfile, benchmark)
def run(self, infile, outfile, params):

    if params.reference_fasta_map is None:
        raise ValueError("bam2reference requires a reference sequence map")

    reference_fasta_map = build_reference_fasta_map(
        params.reference_fasta_map)

    fasta = resolve_argument(
        list(reference_fasta_map.values()), ",").split(",")
    retval, diff = get_reference_for_bam(infile, fasta)
    if retval is None:
        if diff is None:
            retval = "corrupted"
        else:
            retval = "unknown"
            E.debug("differences: {}".format(str(diff)))
        path = ""
    else:
        map_path2name = dict(
            [(x[1], x[0]) for x in list(reference_fasta_map.items())])
        path = map_path2name.get(retval, os.path.basename(retval))

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("filename\treference\tpath\n")
        outf.write("\t".join((infile, retval, path)) + "\n")

    return None
def run(self, infile, outfile, params): try: retval = P.run("{params.path} view -H " "{infile} " "2> {outfile}.log " "> {outfile}.tmp; ".format(**locals())) except OSError as e: E.warn("input file {} gave the following errors: {}".format( infile, str(e))) with open(outfile, "w") as outf, open(outfile + ".tmp") as inf: outf.write("header_tag\ttag\tlineno\tvalue\n") for lineno, line in enumerate(inf): fields = line[1:-1].split("\t") header_tag = fields[0] if header_tag == "CO": # Do not split comment lines outf.write("\t".join((header_tag, "", str(lineno), "\t".join(fields[1:]))) + "\n") else: for field in fields[1:]: sub_tag, content = field.split(":", 1) outf.write("\t".join((header_tag, sub_tag, str(lineno), content)) + "\n") os.unlink(outfile + ".tmp") return retval
def run(self, outfile, params):
    bam = resolve_argument(params.bam)
    reference_fasta = get_reference(params)

    stmnts = []
    prefix = IOTools.snip(outfile, ".vcf.gz")
    vcf_output = prefix + ".raw.vcf.gz"
    if not os.path.exists(vcf_output):
        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type HaplotypeCaller "
                      "--input_file {bam} "
                      "--reference_sequence {reference_fasta} "
                      "--logging_level INFO "
                      "--log_to_file {outfile}.HaplotypeCaller.log "
                      "{params.haplotypecaller} "
                      "--out {vcf_output} "
                      ">& {prefix}.HaplotypeCaller.err".format(**locals()))
    else:
        E.warn("output file {vcf_output} already exists - "
               "it will not be recomputed".format(**locals()))

    stmnts.extend(self.build_calibration_workflow(
        outfile, prefix, vcf_output, params))

    return self.run_statements(stmnts, job_memory="5G")
def inner(self, outfile, *args, **kwargs):
    try:
        # forward all arguments to the wrapped function; calling
        # f() without arguments would always raise a TypeError
        return f(self, outfile, *args, **kwargs)
    except Exception as e:
        E.warn("received exception {} - touching {}".format(
            str(e), outfile))
        IOTools.touch_file(outfile)
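# The wrapper above is presumably the closure returned by a decorator;
# a minimal sketch of the assumed enclosing decorator (the name
# "touch_file_on_error" is hypothetical, only "inner" appears here):
#
#   def touch_file_on_error(f):
#       def inner(self, outfile, *args, **kwargs):
#           ...  # body as above
#       return inner
#
# Applied to a task's run method, a failing command then leaves an
# empty placeholder output behind instead of aborting the pipeline:
#
#   @touch_file_on_error
#   def run(self, outfile, params):
#       ...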
def run(self, outfile, params): bam = resolve_argument(params.bam, sep=" ") reference_fasta = get_reference(params) # warning: requires -m or -c in the options if "--multiallelic-caller" not in params.options and \ "-m" not in params.options and \ "-c" not in params.options and \ "--consensus-caller" not in params.options: E.warn("bcftools call requires -m or -c, got {}".format( params.options)) # limit number of jobs to node to limit I/O job_threads = 4 return P.run("{params.path_samtools} mpileup " "-ug " "-f {reference_fasta} " "{params.samtools_options} " "{bam} " "2> {outfile}.pileup.log " "| {params.path} call " "--variants-only " "--output-type z " "{params.options} " "2> {outfile}.call.log " "> {outfile}; " "tabix -p vcf {outfile} ".format(**locals()))
def get_version(self):
    help_string1 = E.run(
        "{self.path_manta} --version".format(**locals()),
        return_stdout=True).strip()
    help_string2 = E.run(
        "{self.path_strelka} --version".format(**locals()),
        return_stdout=True).strip()
    return "-".join([help_string1, help_string2])
def _test_task_will_run(self, taskf): # check if task is installed version = get_task_version(taskf) if version is None: self.skipTest("tools for task {} not available".format(taskf.name)) return # define input/output files tool_config = self.test_config["tool"] input_files = {} for expected in taskf.expected: if taskf.name in tool_config: # task specific input files p = tool_config[taskf.name].get( expected, tool_config.get(expected, None)) else: # generic input files p = tool_config.get(expected, None) if p is None: self.skipTest( "data for input slot {} not provided for {}".format( expected, taskf.name)) return input_files[expected] = p tmpdir = tempfile.mkdtemp(dir=".", prefix="tmp_{}".format(taskf.name)) if isinstance(taskf.output, str): outfile = os.path.join(tmpdir, taskf.output) else: outfile = [os.path.join(tmpdir, x) for x in taskf.output] # instantiate task t = taskf() # set custom options for test if taskf.name in tool_config: for key, value in tool_config[taskf.name].items(): setattr(t, key, value) # run task t.register_input(input_files) t(input_files.values(), outfile) # check if tool produced non-zero output if isinstance(outfile, list): for x in outfile: self.assertTrue(os.path.exists(x)) self.assertGreater(os.path.getsize(x), 0) else: self.assertTrue(os.path.exists(outfile)) self.assertGreater(os.path.getsize(outfile), 0) # cleanup try: shutil.rmtree(tmpdir) except OSError as ex: E.warn("could not remove {}: {}".format(tmpdir, ex))
def run(self, infile, outfile, params): if "reference_fasta" in params._fields: reference_fasta = "REFERENCE_SEQUENCE={}".format( params.reference_fasta) else: reference_fasta = "" # command can fail when no output is produced, but still produce output # 12G is required for java overhead retval = P.run("java -Xmx8000m -jar {params.path} " "CollectMultipleMetrics " "{reference_fasta} " "INPUT={infile} " "TMP_DIR=%(tmpdir)s " "{params.options} " "OUTPUT={outfile} " ">& {outfile} ".format(**locals()), job_memory="12G", ignore_errors=True) def get_section(section, data): pattern = "## {}".format(section) keep = False result = [] for line in data: if line.startswith("##"): if line.startswith(pattern): keep = True else: keep = False if keep: result.append(line) return result for tablename in self.tablenames: filename = re.sub("histogram", "metrics", tablename) raw = filename[len("picard_"):] src = outfile + "." + raw dest = outfile + "." + tablename + ".tsv" if not os.path.exists(src): E.warn("no file {}, ignored".format(src)) continue with IOTools.open_file(src) as inf: data = inf.readlines() if tablename.endswith("metrics"): data = get_section("METRICS", data) elif tablename.endswith("histogram"): data = get_section("HISTOGRAM", data) with IOTools.open_file(dest, "w") as outf: outf.write("".join(data)) return retval
def run(self, outfile, params): prefix = IOTools.snip(outfile, ".vcf.gz") bams = resolve_argument(params.bam, ",") reference_fasta = get_reference(params) statements, gvcfs = [], [] # TODO: sort out multi-threading for idx, bam in enumerate(bams.split(",")): output = prefix + "." + str(idx) + ".g.vcf" gvcfs.append(output) if os.path.exists(output): E.info("{} already exists - skipped".format(output)) continue statements.append( "java " "-Djava.io.tmpdir=%(tmpdir)s " "-jar {self.path} " "--analysis_type HaplotypeCaller " "--input_file {bam} " "--reference_sequence {reference_fasta} " "--emitRefConfidence GVCF " "--logging_level INFO " "--log_to_file {prefix}.HaplotypeCaller.{idx}.log " "{params.haplotypecaller} " "--out {output} " ">& {prefix}.HaplotypeCaller.{idx}.err".format(**locals())) if statements: self.run_statements(statements, job_memory="4G") stmnts = [] gvcfs = " ".join(["--variant {}".format(x) for x in gvcfs]) vcf_output = prefix + ".raw.vcf.gz" stmnts.append("java " "-Djava.io.tmpdir=%(tmpdir)s " "-jar {self.path} " "--analysis_type GenotypeGVCFs " "--reference_sequence {reference_fasta} " "{gvcfs} " "--logging_level INFO " "--log_to_file {prefix}.GenotypeGVCFs.log " "{params.genotypegvcfs} " "--out {vcf_output} " ">& {prefix}.GenotypeGVCFs".format(**locals())) stmnts.extend( self.build_calibration_workflow(outfile, prefix, vcf_output, params)) return self.run_statements(stmnts, job_memory="4G")
def run(self, outfile, params): prefix = IOTools.snip(outfile, ".vcf.gz") bam = resolve_argument(params.bam, sep=",") reference_fasta = get_reference(params) bam = " ".join(["--input_file {}".format(x) for x in bam.split(",")]) stmnts = [] if not os.path.exists(prefix + ".annotated.vcf.gz"): tmpfile, pre_statement, post_statement = self.pre_process( params.vcf, outfile, params) stmnts.append(pre_statement) stmnts.append( "java " "-Djava.io.tmpdir=%(tmpdir)s " "-jar {self.path} " "--analysis_type VariantAnnotator " "--variant {tmpfile} " "{bam} " "--reference_sequence {reference_fasta} " "--logging_level INFO " "--log_to_file {prefix}.VariantAnnotator.log " "--annotation FisherStrand " "--annotation StrandOddsRatio " "--annotation ReadPosRankSumTest " "--annotation RMSMappingQuality " "--annotation MappingQualityRankSumTest " "{params.options} " "--out {prefix}.annotated.vcf.gz " ">& {prefix}.VariantAnnotator.err".format(**locals())) stmnts.extend( self.build_calibration_workflow(outfile, prefix, prefix + ".annotated.vcf.gz", params)) stmnts.append(post_statement) else: E.warn("using pre-existing file {} with annotated variants".format( prefix + ".annotated.vcf.gz")) stmnts.extend( self.build_calibration_workflow(outfile, prefix, prefix + ".annotated.vcf.gz", params)) return self.run_statements(stmnts, job_memory="3G")
def ignore_task(self, infiles, outfiles, params):
    """return True if task should be ignored.

    This method will also create the output file(s).
    """
    if self._ignore:
        m = str(outfiles)
        for ignore in IOTools.val2list(self._ignore):
            if ignore in m:
                E.warn("task {} will be ignored".format(self.__name__))
                for f in IOTools.val2list(outfiles):
                    E.info("creating empty file {}".format(f))
                    IOTools.touch_file(f)
                return True
    return False
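# Hedged usage sketch for ignore_task (the value of self._ignore is
# hypothetical): with self._ignore = ["chrUn"], any task whose output
# filenames mention "chrUn" is skipped after empty placeholder outputs
# have been touched, so a caller would typically bail out early:
#
#   if self.ignore_task(infiles, outfiles, params):
#       return None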
def get_version(self): help_string = E.run("{self.path} -h".format(**locals()), return_stderr=True).strip() if "vcf-concat [OPTIONS]" in help_string: return "unknown" else: raise ValueError("vcf-concat not found")
def run(self, infile, outfile, params): with open(outfile, "w") as outf: outf.write("counts\tcounts_fail\tcategory\n") try: retval = P.run("{params.path} flagstat " "{infile} " "2> {outfile}.log " "| perl -p -e 's/ \+ /\\t/; s/ /\\t/; s/\\(.*//' " ">> {outfile}; ".format(**locals())) except OSError as e: E.warn("input file {} gave the following errors: {}".format( infile, str(e))) return retval
def run(self, infile, outfile, params): with open(outfile, "w") as outf: outf.write("chromosome\tsize\tmapped\tunmapped\n") try: retval = P.run("{params.path} idxstats " "{infile} " "2> {outfile}.log " ">> {outfile}; ".format(**locals())) except OSError as e: E.warn("input file {} gave the following errors: {}".format( infile, str(e))) retval = None return retval
def expand_globs(config, is_test=False):
    """detect and expand glob expressions in the input section.

    A glob expression is any filename that contains a '*'. Multiple
    glob expressions can be combined on the same line by a ','.

    A "find" expression is detected by a value starting with 'find'.
    Such expressions are evaluated in a shell and the results are
    inserted into the dictionary.

    If a filename starts with "file=", the contents of the file
    following the "=" are read and inserted. Multiple files can be
    separated by a ','.

    If a glob or find expression evaluates to nothing, an exception
    is raised unless ``is_test`` is set. In that case, two dummy
    files named "test1" and "test2" are inserted.
    """

    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if value.startswith("find"):
                try:
                    data = E.run(value, return_stdout=True)
                except Exception as e:
                    data = e.output
                d[key] = [x for x in data.split("\n") if x]
            elif "*" in value:
                if "," in value:
                    v = [glob.glob(x.strip()) for x in value.split(",")]
                    v = [item for sublist in v for item in sublist]
                else:
                    v = glob.glob(value)
                d[key] = v
            elif value.startswith("file="):
                filenames = [x.strip()
                             for x in value.split("=")[1].split(",")]
                paths = []
                for fn in filenames:
                    with IOTools.open_file(fn) as inf:
                        paths.extend([x.strip() for x in inf if x.strip()])
                d[key] = paths

            if len(d[key]) == 0:
                if not is_test:
                    raise ValueError(
                        "expression '{}' expanded to nothing".format(value))
                else:
                    # insert some random files for testing purposes:
                    if "*" in value:
                        # replace glob expressions
                        value = re.sub(",.*", "", value)
                        d[key] = [re.sub("[*]", "test1", value),
                                  re.sub("[*]", "test2", value)]
                    else:
                        if "bam" in value:
                            d[key] = ["test1.bam", "test2.bam"]
                        elif "vcf" in value:
                            d[key] = ["test1.vcf.gz", "test2.vcf.gz"]
                        else:
                            d[key] = ["test1.txt", "test2.txt"]
    return config
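# Hedged usage sketch for expand_globs (paths and keys are hypothetical),
# showing the three expression types the docstring describes:
#
#   config = {"input": {"bam": "data/*.bam",
#                       "vcf": "file=vcf_list.txt",
#                       "bed": "find . -name '*.bed.gz'"}}
#   config = expand_globs(config)
#   # each value is now a list of filenames, e.g.
#   # config["input"]["bam"] == ["data/s1.bam", "data/s2.bam"]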
def get_version(self): help_string = E.run("{self.path}".format(**locals()), return_stderr=True).strip() if "USAGE: pbsim" in help_string: return "unknown" else: raise ValueError("pbsim not found")
def _test_task_will_run(self, taskf): # check if task is installed version = get_task_version(taskf) if version is None: self.skipTest("tools for task {} not available".format(taskf.name)) return # define input/output files metric_config = self.test_config["metric"] infiles = None for key, values in metric_config.items(): if key == taskf.name: infiles = values.get("files", None) elif "patterns" in values: for pattern in values["patterns"]: if re.search(pattern, taskf.name): infiles = values.get("files", None) break if infiles: break if infiles is None: self.skipTest("no input files specified for {}".format(taskf.name)) return tmpdir = tempfile.mkdtemp(dir=".", prefix="tmp_{}_".format(taskf.name)) outfile = os.path.join(tmpdir, "output.tsv") # instantiate task task = taskf() # set custom options for test if taskf.name in metric_config: for key, value in metric_config[taskf.name].items(): setattr(task, key, value) # run task task(infiles, outfile) # check if tool produced non-zero output self.assertTrue(os.path.exists(outfile)) self.assertGreater(os.path.getsize(outfile), 0) # cleanup try: shutil.rmtree(tmpdir) except OSError as ex: E.warn("could not remove {}: {}".format(tmpdir, ex))
def get_version(self):
    version_bedtools = run_metric_bedtools_intersection.get_version(self)
    help_string = E.run(
        "{self.gat_path} --version 2> /dev/null".format(**locals()),
        return_stdout=True).strip()
    return "{} {}".format(
        version_bedtools,
        re.search(r"gat-run.py version: (\S+):", help_string).groups()[0])
def get_version(self):
    help_text = E.run("{self.path} -version".format(**locals()),
                      return_stderr=True).strip()
    if help_text and "not found" not in help_text:
        return re.search(r"BBMap version (\S+)", help_text).groups()[0]
    else:
        raise ValueError("bbmap not found at/as {}: {}".format(
            self.path, help_text))
def get_version(self): help_string = E.run("{self.path} --version".format(**locals()), return_stderr=True).strip() if help_string and "not found" not in help_string: return re.search("Canu (.+)", help_string).groups()[0] else: raise ValueError("canu not found at/as {}: {}".format( self.path, help_string))
def main(argv=None):
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    tools = glob.glob(
        os.path.join(os.path.dirname(__file__),
                     "..", "src", "daisy", "tools", "*.py"))

    counter = E.Counter()
    for tool in tools:
        counter.found += 1
        tool_module = re.sub(".py", "", os.path.basename(tool))
        tool_name = re.sub("_", "-", tool_module)
        # check the module name: tool_name has underscores replaced
        # by dashes, so "__init__" would never match it
        if tool_module in ("__init__", "cli"):
            counter.ignored += 1
            continue

        dest = os.path.join("tools", "{}.rst".format(tool_name))
        if os.path.exists(dest) and not options.output_force:
            counter.skipped += 1
            continue

        with IOTools.open_file(dest, "w") as outf:
            outf.write(TEMPLATE_TOOL.format(**locals()))
        counter.new += 1

    E.info(counter)
    E.stop()
def get_version(self): help_string = E.run("{self.path} ".format(**locals()), return_stdout=True, on_error="ignore").strip() if help_string: return re.search("Delly \(Version: (\S+)\)", help_string).groups()[0] else: raise ValueError("delly not found at/as {}".format(self.path))
def __call__(self, dataframe, map_sample2label={}):
    df = dataframe.pivot(index="gc_bin", columns="sample",
                         values="mean").reset_index()

    # remove duplicate sample names
    # (in Cancer analysis: two blood samples)
    to_drop = [x for x in df.columns if x.startswith("2:")]
    df.drop(to_drop, axis=1, inplace=True)

    df.columns = [map_sample2label.get(x, x) for x in df.columns]

    if df.empty:
        E.warn("no data, no plot will be output")
        return

    ax = df.plot(kind="line", x="gc_bin")
    return ax
def get_sequence_length_dict(fastafn):
    """return a sequence/length dictionary from a fasta file.

    The fasta file needs to be indexed with samtools faidx.
    """
    # Temporary fix: see issue SYS-517
    if not os.path.exists(fastafn):
        E.warn("could not find file {}".format(fastafn))
        return None

    try:
        with pysam.FastaFile(fastafn) as inf:
            fastadict = dict(list(zip(inf.references, inf.lengths)))
    except IOError as ex:
        E.warn("file {} could not be opened: {}".format(fastafn, ex))
        fastadict = None

    return fastadict
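# Hedged usage sketch (the filename is hypothetical; it assumes a
# "ref.fasta.fai" index created with "samtools faidx ref.fasta"):
#
#   lengths = get_sequence_length_dict("ref.fasta")
#   if lengths is not None:
#       for contig, length in lengths.items():
#           print(contig, length)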
def get_version(self): help_string = E.run( "{self.path} version 2> /dev/null".format(**locals()), return_stdout=True, on_error="ignore").strip() if help_string and "not found" not in help_string: return re.search(r"Product: RTG Tools (\S+)", help_string).groups()[0] else: raise ValueError("rtg not found at/as {}: {}".format( self.path, help_string))
def get_version(self): help_string = E.run("{self.path} ".format(**locals()), return_stdout=True, on_error="ignore").strip() # lumpy express without arguments ends in error if help_string: raise NotImplementedError() return re.search(r"lumpy \(Version: (\S+)\)", help_string).groups()[0] else: raise ValueError("lumpy not found at/as {}".format(self.path))