def run(self, outfile, params):
    bam = resolve_argument(params.bam)
    reference_fasta = get_reference(params)

    stmnts = []
    prefix = IOTools.snip(outfile, ".vcf.gz")
    vcf_output = prefix + ".raw.vcf.gz"
    if not os.path.exists(vcf_output):
        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type HaplotypeCaller "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--logging_level INFO "
            "--log_to_file {outfile}.HaplotypeCaller.log "
            "{params.haplotypecaller} "
            "--out {vcf_output} "
            ">& {prefix}.HaplotypeCaller.err".format(**locals()))
    else:
        E.warn("output file {vcf_output} already exists - "
               "it will not be recomputed".format(**locals()))

    stmnts.extend(self.build_calibration_workflow(
        outfile, prefix, vcf_output, params))

    return self.run_statements(stmnts, job_memory="5G")
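# The "{...}" fields above are expanded immediately via .format(**locals()),
# while "%(tmpdir)s" is deliberately left for the pipeline runner to
# interpolate at execution time. A minimal sketch of what run_statements
# is assumed to do, namely dispatch each shell statement through the
# pipeline runner (the real implementation may batch or chain statements):
def run_statements_sketch(statements, **kwargs):
    # run each statement and collect the benchmark/return values
    return [P.run(statement, **kwargs) for statement in statements]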
def run(self, outfile, params):
    retvals = []
    prefix = IOTools.snip(outfile, ".bed.gz")
    vcffile = prefix + ".vcf.gz"
    if not os.path.exists(vcffile):
        retvals.extend(run_tool_delly.run(self, vcffile, params))

    statements = []
    statements.append(
        "{self.path_bcftools} query "
        "{params.bcftools_options} "
        "-f \"%%CHROM\\t%%POS\\t%%END\\t%%SVTYPE\\n\" "
        "{vcffile} "
        "| awk -v OFS='\\t' '$3 != \".\" {{ switch ($4) {{"
        "case \"DEL\": $5=0; break; "
        "case \"DUP\": $5=3; break; "
        "case \"INS\": next; break; "
        "}}; print }}' "
        "| bgzip "
        "> {outfile}".format(**locals()))
    statements.append("tabix -f -p bed {outfile}".format(**locals()))

    statement = "; ".join(statements)
    retvals.append(P.run(statement))
    return retvals
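# The awk filter above (note: "switch" is a gawk extension) appends a
# copy-number-like code as a fifth column and drops insertions. A
# pure-Python sketch of the same per-record logic, assuming the
# tab-separated (chrom, pos, end, svtype) records emitted by the
# bcftools query (helper name is made up):
def sv_record_to_bed_sketch(fields):
    chrom, pos, end, svtype = fields[:4]
    if end == ".":
        return None                      # no END coordinate: dropped
    if svtype == "INS":
        return None                      # insertions are skipped
    out = [chrom, pos, end, svtype]
    code = {"DEL": "0", "DUP": "3"}.get(svtype)
    if code is not None:
        out.append(code)                 # DEL -> 0, DUP -> 3
    return "\t".join(out)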
def run(self, outfile, params):
    bam = resolve_argument(params.bam)
    reference_fasta = get_reference(params)

    stmnts = []
    prefix = IOTools.snip(outfile, ".bam")
    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type RealignerTargetCreator "
        "--input_file {bam} "
        "--reference_sequence {reference_fasta} "
        "--logging_level INFO "
        "--log_to_file {outfile}.RealignerTargetCreator.log "
        "{params.realignertargetcreator} "
        "--out {outfile}.realign.intervals "
        ">& {outfile}.RealignerTargetCreator.err".format(**locals()))
    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type IndelRealigner "
        "--input_file {bam} "
        "--reference_sequence {reference_fasta} "
        "--targetIntervals {outfile}.realign.intervals "
        "--logging_level INFO "
        "--log_to_file {outfile}.IndelRealigner.log "
        "{params.indelrealigner} "
        "--out @[email protected] "
        ">& {outfile}.IndelRealigner.err".format(**locals()))
    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type BaseRecalibrator "
        "--input_file @[email protected] "
        "--reference_sequence {reference_fasta} "
        "--logging_level INFO "
        "{params.baserecalibrator} "
        "--log_to_file {outfile}.BaseRecalibrator.log "
        "--out {outfile}.recal_data.table "
        ">& {outfile}.BaseRecalibrator.err".format(**locals()))
    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type PrintReads "
        "--input_file @[email protected] "
        "--reference_sequence {reference_fasta} "
        "--BQSR {outfile}.recal_data.table "
        "--logging_level INFO "
        "--log_to_file {outfile}.PrintReads.log "
        "--out {outfile} "
        ">& {outfile}.PrintReads.err".format(**locals()))
    # PrintReads writes the index as {prefix}.bai; rename it to the
    # conventional {outfile}.bai. The original statement neither applied
    # .format() nor produced a sensible target name.
    stmnts.append("mv {prefix}.bai {outfile}.bai".format(**locals()))

    return self.run_statements(stmnts, job_memory="3G")
def build_readgroup_string(outfile, params):
    if params.readgroup_id_regex is None:
        readgroup_id = IOTools.snip(os.path.basename(outfile), ".bam")
    else:
        try:
            readgroup_id = "-".join(re.search(
                params.readgroup_id_regex, outfile).groups())
        except AttributeError:
            raise AttributeError(
                "regular expression {} does not match {}".format(
                    params.readgroup_id_regex, outfile))

    if params.readgroup_sample_regex is None:
        readgroup_sample = readgroup_id
    else:
        try:
            readgroup_sample = "-".join(re.search(
                params.readgroup_sample_regex, outfile).groups())
        except AttributeError:
            raise AttributeError(
                "regular expression {} does not match {}".format(
                    params.readgroup_sample_regex, outfile))

    readgroup_string = "@RG\tID:{}\tSM:{}".format(
        readgroup_id, readgroup_sample)

    if params.readgroup_header:
        readgroup_string += "\t{}".format(params.readgroup_header)

    return readgroup_string, readgroup_id, readgroup_sample
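# A minimal usage sketch with a hypothetical capture regex; the joined
# match groups become both ID and SM (all values below are made up for
# illustration):
from types import SimpleNamespace

example_readgroup_params = SimpleNamespace(
    readgroup_id_regex=r"([^/]+)/([^/]+)\.bam$",
    readgroup_sample_regex=None,
    readgroup_header=None)
# build_readgroup_string("runs/sample1/lane2.bam", example_readgroup_params)
# -> ("@RG\tID:sample1-lane2\tSM:sample1-lane2",
#     "sample1-lane2", "sample1-lane2")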
def run(self, infile, outfile, params):
    outfile_pass = IOTools.snip(outfile, ".tsv") + "-pass.fastq.gz"
    outfile_fail = IOTools.snip(outfile, ".tsv") + "-fail.fastq.gz"

    statement = (
        "zcat {infile} "
        "| daisy fastq2fastq "
        "--method=filter-ONT "
        "--min-average-quality={params.min_average_quality} "
        "--log={outfile}.log "
        "--min-length={params.min_length} "
        "--output-removed-fastq={outfile_fail} "
        "--output-stats-tsv={outfile} "
        "- "
        "| gzip "
        "> {outfile_pass}".format(**locals()))
    return P.run(statement)
def run(self, outfile, params):
    prefix = IOTools.snip(outfile, ".vcf.gz")
    bams = resolve_argument(params.bam, ",")
    reference_fasta = get_reference(params)

    statements, gvcfs = [], []
    # TODO: sort out multi-threading
    for idx, bam in enumerate(bams.split(",")):
        output = prefix + "." + str(idx) + ".g.vcf"
        gvcfs.append(output)
        if os.path.exists(output):
            E.info("{} already exists - skipped".format(output))
            continue

        statements.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type HaplotypeCaller "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--emitRefConfidence GVCF "
            "--logging_level INFO "
            "--log_to_file {prefix}.HaplotypeCaller.{idx}.log "
            "{params.haplotypecaller} "
            "--out {output} "
            ">& {prefix}.HaplotypeCaller.{idx}.err".format(**locals()))

    if statements:
        self.run_statements(statements, job_memory="4G")

    stmnts = []
    gvcfs = " ".join(["--variant {}".format(x) for x in gvcfs])
    vcf_output = prefix + ".raw.vcf.gz"
    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type GenotypeGVCFs "
        "--reference_sequence {reference_fasta} "
        "{gvcfs} "
        "--logging_level INFO "
        "--log_to_file {prefix}.GenotypeGVCFs.log "
        "{params.genotypegvcfs} "
        "--out {vcf_output} "
        ">& {prefix}.GenotypeGVCFs.err".format(**locals()))

    stmnts.extend(self.build_calibration_workflow(
        outfile, prefix, vcf_output, params))

    return self.run_statements(stmnts, job_memory="4G")
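# For params.bam = "a.bam,b.bam" the loop above produces per-sample
# GVCFs "{prefix}.0.g.vcf" and "{prefix}.1.g.vcf", and the joint
# genotyping step then receives
#   --variant {prefix}.0.g.vcf --variant {prefix}.1.g.vcf
# i.e. all samples are genotyped together in one GenotypeGVCFs call.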
def run(self, outfile, params):
    prefix = IOTools.snip(outfile, ".vcf.gz")
    bam = resolve_argument(params.bam, sep=",")
    reference_fasta = get_reference(params)

    bam = " ".join(["--input_file {}".format(x) for x in bam.split(",")])

    stmnts = []
    if not os.path.exists(prefix + ".annotated.vcf.gz"):
        tmpfile, pre_statement, post_statement = self.pre_process(
            params.vcf, outfile, params)

        stmnts.append(pre_statement)
        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type VariantAnnotator "
            "--variant {tmpfile} "
            "{bam} "
            "--reference_sequence {reference_fasta} "
            "--logging_level INFO "
            "--log_to_file {prefix}.VariantAnnotator.log "
            "--annotation FisherStrand "
            "--annotation StrandOddsRatio "
            "--annotation ReadPosRankSumTest "
            "--annotation RMSMappingQuality "
            "--annotation MappingQualityRankSumTest "
            "{params.options} "
            "--out {prefix}.annotated.vcf.gz "
            ">& {prefix}.VariantAnnotator.err".format(**locals()))

        stmnts.extend(self.build_calibration_workflow(
            outfile, prefix, prefix + ".annotated.vcf.gz", params))

        stmnts.append(post_statement)
    else:
        E.warn("using pre-existing file {} with annotated variants".format(
            prefix + ".annotated.vcf.gz"))

        stmnts.extend(self.build_calibration_workflow(
            outfile, prefix, prefix + ".annotated.vcf.gz", params))

    return self.run_statements(stmnts, job_memory="3G")
def run(self, infiles, outfile, params):
    if not outfile.endswith("-pass.fastq.gz"):
        raise ValueError(
            "outfile must end in -pass.fastq.gz, got {}".format(outfile))

    if params.min_size_bytes:
        before = len(infiles)
        infiles = [x for x in infiles
                   if os.path.getsize(x) >= params.min_size_bytes]
        E.debug("removing small files: after={}, before={}, removed={}".format(
            len(infiles), before, before - len(infiles)))

    if params.newer_than:
        before = len(infiles)
        cutoff = os.path.getmtime(params.newer_than)
        infiles = [x for x in infiles if os.path.getmtime(x) > cutoff]
        E.debug("removing old files: after={}, before={}, removed={}".format(
            len(infiles), before, before - len(infiles)))

    if len(infiles) == 0:
        E.warn("no files left after filtering, creating empty file")
        IOTools.touch_file(outfile)
        return

    infiles = " ".join(infiles)
    outfile_fail = IOTools.snip(outfile, "-pass.fastq.gz") + "-fail.fastq.gz"

    statement = (
        "zcat {infiles} "
        "| daisy fastq2fastq "
        "--method=filter-ONT "
        "--min-average-quality={params.min_average_quality} "
        "--log={outfile}.log "
        "--min-length={params.min_length} "
        "--output-removed-fastq={outfile_fail} "
        "- "
        "| gzip "
        "> {outfile}".format(**locals()))
    return P.run(statement)
def run(self, outfile, params):
    bam = resolve_argument(params.bam, sep=",")
    # "-T {outfile}.tmpdir -k "
    outfile = IOTools.snip(outfile, ".gz")

    # note that lumpy removes the temporary directory
    # after running, thus make sure it is unique and exists
    return P.run(
        "{params.path} "
        "-B {bam} "
        "-o {outfile} "
        "-T %(tmpdir)s_{self.__name__} "
        "-v "
        "{params.options} "
        ">& {outfile}.log; "
        "vcf-sort {outfile} "
        "| bgzip > {outfile}.gz; "
        "tabix -p vcf {outfile}.gz".format(**locals()))
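# The sort/compress/index tail of the statement above is a recurring
# shell pattern. A small helper sketch that produces the same fragment
# for any plain-text VCF (helper name is hypothetical):
def vcf_sort_compress_index_sketch(vcf):
    return ("vcf-sort {vcf} | bgzip > {vcf}.gz; "
            "tabix -p vcf {vcf}.gz".format(vcf=vcf))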
def run(self, outfile, params):
    bam = resolve_argument(params.bam)

    # rename index from x.bai to x.bam.bai
    outprefix = IOTools.snip(outfile, ".bam", ".cram")

    statement = (
        "java -Xmx8000m -jar {params.path} "
        "MarkDuplicates "
        "INPUT={bam} "
        "TMP_DIR=%(tmpdir)s "
        "CREATE_INDEX=TRUE "
        "REFERENCE_SEQUENCE={params.reference_fasta} "
        "METRICS_FILE={outfile}.metrics "
        "{params.options} "
        "OUTPUT={outfile} "
        ">& {outfile}.log; "
        "mv {outprefix}.bai {outfile}.bai".format(**locals()))

    # 12G is required for java overhead
    return P.run(statement, job_memory="12G")
def __call__(self, infiles, outfile, only_info=False):
    # NOTE: extras not implemented in ruffus 2.6.3, thus
    # use parameter:
    only_info = "only_info" in P.PARAMS

    # ensure output directory exists.
    # This should be done on the pipeline level, but
    # ruffus currently seems not to allow this.
    outdir = os.path.dirname(outfile)
    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    output_files = [self.map_table_to_file(x, outfile)
                    for x in self.tablenames]

    kwargs = {'output_files': output_files,
              'input_files': infiles,
              'outdir': outdir}

    if self._runtime_regex:
        kwargs["alias"] = self.build_alias(str(infiles),
                                           regex=self._runtime_regex,
                                           alias=self._runtime_alias)

    self.save_meta(outfile, **kwargs)

    if self.ignore:
        found = False
        for i in self.ignore:
            if i in outdir:
                found = True
                break
        if found:
            E.warn("skipping task {} at runtime, "
                   "an empty file is created".format(outfile))
            IOTools.touch_file(outfile)
            return

    # if self.runtime_filter:
    #     TODO: create empty outfile if regex matches
    #     pass

    if only_info:
        E.warn("only_info - meta information in {} has been updated".format(
            IOTools.snip(outfile) + ".info"))
        return

    # AH: duplicated from above?
    params = self.build_params(output_files=output_files)

    on_error_options = ["raise", "ignore"]
    on_error = params.get("on_error", "raise")
    if on_error not in on_error_options:
        raise ValueError("unknown option to 'on_error': '{}' "
                         "should be one of '{}'".format(
                             on_error, ",".join(on_error_options)))

    if self.ignore_task(infiles, outfile, params):
        return

    # deal with placeholder files created by identity that are
    # located on a remote mount point
    def map_to_mount(fn):
        if os.path.exists(fn + ".mnt"):
            if not P.PARAMS["mount_point"]:
                raise ValueError(
                    "encountered mounted file {}, "
                    "but no mount point present".format(fn))
            with open(fn + ".mnt") as inf:
                mount_path = inf.read()
            return os.path.join(P.PARAMS["mount_point"], mount_path)
        else:
            return fn

    # replace infiles with mount locations if necessary
    if isinstance(infiles, list):
        infiles = [map_to_mount(x) for x in infiles]
    else:
        infiles = map_to_mount(infiles)

    try:
        benchmark = self.run(infiles, outfile, as_namedtuple(params))
    except Exception as ex:
        on_error = params.get("on_error", "raise")
        if on_error == "raise":
            raise
        elif on_error == "ignore":
            E.warn("error occurred during execution of {} "
                   "but will be ignored:\n{}".format(self.__name__, ex))
            E.warn("an empty output file {} will be created.".format(
                outfile))
            IOTools.touch_file(outfile)
            benchmark = None

    if benchmark:
        self.save_benchmark(outfile, benchmark)
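# run() receives the parameter dict wrapped for attribute access
# (params.bam, params.options, params._asdict()). A minimal sketch of
# what as_namedtuple is assumed to do (the real helper may handle
# nested dicts or defaults differently):
import collections

def as_namedtuple_sketch(d):
    # build a namedtuple type from the dict keys and instantiate it
    return collections.namedtuple("Params", sorted(d))(**d)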
def run(self, outfile, params):
    local_options = []
    outfile = os.path.abspath(outfile)
    outdir = os.path.dirname(outfile)

    # assumption is that the index directory is named like the
    # reference, i.e. xyz for xyz.fa, without the .fa/.fasta suffix
    reference_fasta = IOTools.snip(params.reference_fasta, ".fa", ".fasta")
    if not os.path.exists(reference_fasta):
        raise ValueError(
            "input reference {} does not exist".format(reference_fasta))

    if "--jobs" in params.options or "-j" in params.options:
        job_threads = int(re.search(
            r"(--jobs|-j)\s*(\d+)", params.options).groups()[1])
    else:
        job_threads = 8

    if "--memory-limit" in params.options or "-m" in params.options:
        job_memory_gb = int(re.search(
            r"(--memory-limit|-m)\s*(\d+)", params.options).groups()[1])
    else:
        job_memory_gb = 60
        local_options.append("--memory-limit {}".format(job_memory_gb))

    if job_memory_gb < 60:
        E.warn("isaac-align likely to require at least 60Gb of memory, "
               "{}G requested".format(job_memory_gb))

    job_memory = "{}G".format(float(job_memory_gb) / job_threads)

    fastq_dir = os.path.join(outdir, "input_fastq")
    if not os.path.exists(fastq_dir):
        os.makedirs(fastq_dir)

    if len(params.fastq) == 2:
        if not os.path.exists(os.path.join(fastq_dir, "lane1_read1.fastq.gz")):
            os.symlink(os.path.abspath(params.fastq[0]),
                       os.path.join(fastq_dir, "lane1_read1.fastq.gz"))
        if not os.path.exists(os.path.join(fastq_dir, "lane1_read2.fastq.gz")):
            os.symlink(os.path.abspath(params.fastq[1]),
                       os.path.join(fastq_dir, "lane1_read2.fastq.gz"))
    else:
        raise NotImplementedError(
            "expected 2 fastq files, received {}".format(len(params.fastq)))

    intermediate_bam = os.path.join(outdir, "Aligned", "Projects",
                                    "default", "default", "sorted.bam")

    # picard statement to set readgroup
    picard_statement = self.build_picard_statement(
        intermediate_bam, outfile, params)

    tmpdir = os.path.join(outdir, "TEMP")

    local_options = " ".join(local_options)

    # isaac generates output files in the working directory, so cd into
    # outdir and make sure that absolute path names are used elsewhere.
    statement = (
        "cd {outdir}; "
        "{self.path} "
        "--reference-genome {reference_fasta}/sorted-reference.xml "
        "--base-calls {fastq_dir} "
        "--base-calls-format fastq-gz "
        "--temp-directory {tmpdir} "
        "--cleanup-intermediary 1 "
        "--bam-gzip-level {params.bam_gzip_level} "
        "{params.options} "
        "{local_options} "
        ">& {outfile}.isaac.log; "
        "{picard_statement}; "
        "rm -rf {tmpdir}".format(**locals()))

    return P.run(statement)
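# Worked example of the option parsing above: for
#   params.options = "--jobs 16 --memory-limit 120"
# job_threads is 16, job_memory_gb is 120, and the per-thread
# job_memory handed to the scheduler is "7.5G" (120 / 16).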
module_dirs = [os.path.join(os.path.dirname(__file__))]
module_dirs.extend([x.strip() for x in
                    os.environ.get("DAISY_TASKLIBRARY", "").split(",")
                    if x.strip()])

modules = []
for idx, root in enumerate(module_dirs):
    for module in glob.glob(os.path.join(root, "*.py")):
        if "flycheck" in module:
            continue
        if module.endswith("__init__.py"):
            continue
        module_name = IOTools.snip(os.path.basename(module))
        if idx == 0:
            modules.append(importlib.import_module(
                "daisy.TaskLibrary.{}".format(module_name)))
        else:
            spec = importlib.util.spec_from_file_location(
                "daisy.UserLibrary.{}".format(module_name), module)
            foo = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(foo)
            modules.append(foo)

# TODO: use derivation instead of name prefix
map_tool_to_runner = dict()
map_metric_to_runner = dict()
map_collate_to_runner = dict()
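# How the runner maps get populated is assumed here to follow the
# name-prefix convention visible elsewhere in this library (e.g. the
# run_tool_delly class used above); the TODO notes that derivation
# should eventually replace it. An illustrative registration sketch:
import inspect

def register_runners_sketch(modules, mapping, prefix="run_tool_"):
    for module in modules:
        for name, obj in inspect.getmembers(module, inspect.isclass):
            if name.startswith(prefix):
                # key the runner by its name with the prefix stripped
                mapping[name[len(prefix):]] = obj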
def pre_process(self, infile, outfile, params):
    statements = []
    infile = IOTools.snip(infile, ".bam")
    tmpdir = P.get_parameters_as_namedtuple().tmpdir

    outprefix = os.path.basename(os.path.dirname(outfile))

    if params.copy_bam:
        statements.append("cp @[email protected] @[email protected]; "
                          "cp @[email protected] @[email protected]")

    if params.split_bam:
        statements.append("daisy bam2bam-split-reads "
                          "-i @[email protected] "
                          "-o - "
                          "{params.split_bam} "
                          "--log={outfile}_split_bam.log "
                          "2> {outfile}_split_bam.err "
                          "> @[email protected]; ".format(**locals()))

    if params.bam2bam:
        statements.append("daisy bam2bam "
                          "--stdin=@[email protected] "
                          "{params.bam2bam} "
                          "--log={outfile}_bam2bam.log "
                          "2> {outfile}_bam2bam.err "
                          "> @[email protected]; ".format(**locals()))

    if params.region:
        statements.append(
            "samtools view -b @[email protected] {} > @[email protected]".format(
                params.region))

    if params.shift_quality:
        statements.append("samtools view -h @[email protected] "
                          "| perl -lane "
                          "'if(/^@/) {{print; next;}} "
                          "@qual=split(//, $F[10]); "
                          "$_=chr(ord($_)+{}) for (@qual); "
                          "$F[10]=join(\"\",@qual); "
                          "print join(\"\\t\", @F)' "
                          "| samtools view -bS > @[email protected]".format(
                              params.shift_quality))

    if is_true(params.remove_chr):
        # also substitute chrM to MT.
        statements.append("samtools view -h @[email protected] "
                          "| awk -v OFS='\\t' '"
                          "$1 == \"@SQ\" "
                          "{{ gsub(\"chrM\", \"chrMT\", $2); "
                          "  gsub(\"chr\", \"\", $2); print; next }} "
                          "{{ gsub(\"chrM\", \"chrMT\", $3); "
                          "  gsub(\"chr\", \"\", $3); print; next}} '"
                          "| samtools view -bS - "
                          "2> {outfile}_remove_chr.log "
                          "> @[email protected]; ".format(**locals()))

    if not statements:
        return infile + ".bam", "", ""

    filename, build_statement, cleanup_statement = P.join_statements(
        statements, infile)
    filename += ".bam"
    build_statement += (
        "; samtools index {filename} >& {outfile}.index.log".format(
            **locals()))

    return filename, build_statement, cleanup_statement
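# The @IN@/@OUT@ tokens above are placeholders that P.join_statements
# resolves when chaining statements: each step's @OUT@ becomes the next
# step's @IN@, threading intermediate files from the original infile to
# the returned filename. A rough sketch of that chaining under these
# assumptions (the real helper also builds proper temporary names and
# cleanup commands):
def join_statements_sketch(statements, infile):
    parts, current = [], infile
    for idx, statement in enumerate(statements):
        nxt = "{}_{}".format(infile, idx)
        parts.append(
            statement.replace("@IN@", current).replace("@OUT@", nxt))
        current = nxt
    return current, "; ".join(parts), "rm -f {}_*".format(infile)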
def run(self, infile, outfile, params):
    # TODO: bam_fastqc_sequence_length_distribution.tsv may
    # contain ranges such as '30-31'. Convert to beginning of
    # range like in this perl command:
    #
    # perl -p -i -e "s/\-\d+//"
    # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv

    if infile.endswith(".gz"):
        prefix = IOTools.snip(os.path.basename(infile[:-3]))
    else:
        prefix = IOTools.snip(os.path.basename(infile))

    outdir = os.path.dirname(outfile)

    datafile = os.path.join(outdir,
                            "{}_fastqc".format(prefix),
                            "fastqc_data.txt")

    if not os.path.exists(datafile):
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        retval = P.run(
            "{params.path} "
            "{params.options} "
            "--extract "
            "--outdir {outdir} "
            "{infile} "
            ">& {outfile} ".format(**locals()), **params._asdict())
    else:
        IOTools.touch_file(outfile)
        retval = None

    def _split_output(lines):
        body, header, section, status = [], None, None, None
        for line in lines:
            if line.startswith("##FastQC"):
                continue
            elif line.startswith("#"):
                header, body = line[1:-1].split("\t"), []
            elif line.startswith(">>END_MODULE"):
                yield section, header, body, status
                body, header, section, status = [], None, None, None
            elif line.startswith(">>"):
                section, status = line[2:-1].split("\t")
            else:
                fields = line[:-1].split("\t")
                body.append(fields)

    # split into separate files for upload
    summary_data = []
    with IOTools.open_file(datafile) as inf:
        for section, header, body, status in _split_output(inf):
            if len(body) == 0:
                continue
            summary_data.append((section, status))
            tablename = "{}_".format(self.name) + re.sub(
                " ", "_", section).lower()
            if tablename not in self.tablenames:
                raise ValueError(
                    "unknown tablename {}, expected one of {}".format(
                        tablename, self.tablenames))
            output_file = ".".join((outfile, tablename, "tsv"))
            with open(output_file, "w") as outf:
                outf.write("\t".join([x.lower() for x in header]) + "\n")
                # remove first column, which contains the identifier
                outf.write("\n".join(["\t".join(x) for x in body]) + "\n")

    output_file = ".".join(
        (outfile, "{}_summary".format(self.name), "tsv"))
    with IOTools.open_file(output_file, "w") as outf:
        outf.write("section\tstatus\n")
        for section, status in summary_data:
            outf.write("{}\t{}\n".format(section, status))

    return retval
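# A tiny worked example of the fastqc_data.txt structure parsed by
# _split_output above (tab-separated fields):
#
#   ##FastQC	0.11.9
#   >>Per base sequence quality	pass
#   #Base	Mean
#   1	33.5
#   >>END_MODULE
#
# yields one tuple: ("Per base sequence quality", ["Base", "Mean"],
# [["1", "33.5"]], "pass").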