def __init__(self, input_file, output_prefix, threads=1, options=None, dependencies=()):
    """Node running AdapterRemoval on a single-end FASTQ file.

    input_file    -- FASTQ file containing the SE reads.
    output_prefix -- prefix for output files; '<prefix>.truncated.gz',
                     '<prefix>.discarded.gz' and '<prefix>.settings' are
                     produced.
    threads       -- number of threads used by AdapterRemoval.
    options       -- optional dict of extra command-line options.
    """
    # 'options=None' avoids the shared-mutable-default pitfall of the
    # previous 'options={}' signature; behavior is unchanged for callers.
    if options is None:
        options = {}

    # See below for parameters in common between SE/PE
    cmd = _get_common_parameters(threads=threads, options=options)

    # Prefix for output files, ensure that all end up in temp folder
    cmd.set_option("--basename", "%(TEMP_OUT_BASENAME)s")

    output_tmpl = output_prefix + ".%s.gz"
    cmd.set_kwargs(
        TEMP_OUT_BASENAME=os.path.basename(output_prefix),
        OUT_SETTINGS=output_prefix + ".settings",
        OUT_MATE_1=output_tmpl % ("truncated", ),
        OUT_DISCARDED=output_tmpl % ("discarded", ),
    )

    cmd.set_option("--file1", "%(IN_READS_1)s")
    cmd.set_kwargs(IN_READS_1=input_file)

    apply_options(cmd, options)

    CommandNode.__init__(
        self,
        command=cmd.finalize(),
        threads=threads,
        description="<AdapterRM (SE): %s -> '%s.*'>" % (
            fileutils.describe_files(input_file),
            output_prefix,
        ),
        dependencies=dependencies,
    )
def __init__(self, config, reference, infiles, outfile, threads=1, dependencies=()):
    """Node running the GATK RealignerTargetCreator over a set of BAMs.

    Produces an intervals file ('outfile') listing candidate regions for
    indel realignment around the given reference.
    """
    # Thread count may be capped per-reference; see _get_max_threads.
    threads = _get_max_threads(reference, threads)
    infiles = safe_coerce_to_tuple(infiles)
    jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    command = AtomicJavaCmdBuilder(jar_file, jre_options=config.jre_options)
    command.set_option("-T", "RealignerTargetCreator")
    command.set_option("-R", "%(IN_REFERENCE)s")
    command.set_option("-o", "%(OUT_INTERVALS)s")
    # '-nt' enables GATK's own multi-threading.
    command.set_option("-nt", threads)
    _set_input_files(command, infiles)
    # GATK also requires the sequence dictionary next to the reference.
    command.set_kwargs(IN_REFERENCE=reference,
                       IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                       OUT_INTERVALS=outfile,
                       CHECK_GATK=_get_gatk_version_check(config))

    description = "<GATK Indel Realigner (training): %s -> %r>" \
        % (describe_files(infiles), outfile)
    CommandNode.__init__(self,
                         threads=threads,
                         description=description,
                         command=command.finalize(),
                         dependencies=dependencies)
def __init__(self, input_files, output_file, dependencies=()):
    # Passive node: no command is built here; it only registers the
    # inputs/output so duplicate detection runs within the graph.
    label = "<Detect Input Duplication: %s>" % describe_files(input_files)
    Node.__init__(self,
                  description=label,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, parameters):
    # Thin wrapper: the Picard command was configured elsewhere and is
    # merely finalized and forwarded here.
    PicardNode.__init__(
        self,
        command=parameters.command.finalize(),
        description="<MarkDuplicates: %s>"
        % (describe_files(parameters.input_bams),),
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    # Finalize the pre-built Picard command and hand it to the base class.
    input_desc = describe_files(parameters.input_bams)
    PicardNode.__init__(self,
                        command=parameters.command.finalize(),
                        description="<MarkDuplicates: %s>" % (input_desc,),
                        dependencies=parameters.dependencies)
def __init__(self, config, input_bams, output_bam, keep_dupes=True, dependencies=()):
    """Node filtering duplicates among collapsed reads via 'rmdup_collapsed'.

    The tool reads a merged BAM from the pipe-file set up by
    MultiBAMInputNode and writes the result to 'output_bam'.
    """
    input_bams = safe_coerce_to_tuple(input_bams)
    builder = factory.new("rmdup_collapsed")
    # Input is the named pipe provided by MultiBAMInputNode.
    builder.add_value("%(TEMP_IN_BAM)s")
    builder.set_kwargs(OUT_STDOUT=output_bam,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    builder.add_multiple_kwargs(input_bams)

    if not keep_dupes:
        # Presumably drops duplicate reads entirely rather than flagging
        # them -- confirm against the rmdup_collapsed tool's help.
        builder.set_option("--remove-duplicates")

    description = "<FilterCollapsedBAM: %s>" \
        % (describe_files(input_bams),)
    MultiBAMInputNode.__init__(self,
                               config=config,
                               input_bams=input_bams,
                               command=builder.finalize(),
                               description=description,
                               dependencies=dependencies)
def __init__(self, input_files, output_file, dependencies=()):
    # No external command is involved; this node exists purely to hook
    # input-duplication checks into the pipeline's dependency graph.
    Node.__init__(
        self,
        description="<Detect Input Duplication: %s>"
        % describe_files(input_files),
        input_files=input_files,
        output_files=output_file,
        dependencies=dependencies,
    )
def __init__(self, config, reference, intervals, infiles, outfile, dependencies=()):
    """Node running GATK IndelRealigner piped through 'samtools calmd'.

    The realigner writes an uncompressed BAM which calmd post-processes,
    recalculating MD/NM tags against the reference. NOTE(review): calmd's
    output goes to '<basename>.calmd' in the temp dir; presumably a
    teardown step renames it onto 'outfile' -- not visible here.
    """
    self._basename = os.path.basename(outfile)

    infiles = safe_coerce_to_tuple(infiles)
    jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    command = AtomicJavaCmdBuilder(jar_file, jre_options=config.jre_options)
    command.set_option("-T", "IndelRealigner")
    command.set_option("-R", "%(IN_REFERENCE)s")
    command.set_option("-targetIntervals", "%(IN_INTERVALS)s")
    command.set_option("-o", "%(OUT_BAMFILE)s")
    # Skip compression/indexing; the BAM is immediately re-processed.
    command.set_option("--bam_compression", 0)
    command.set_option("--disable_bam_indexing")
    _set_input_files(command, infiles)
    command.set_kwargs(IN_REFERENCE=reference,
                       IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                       IN_INTERVALS=intervals,
                       OUT_BAMFILE=outfile,
                       CHECK_GATK=_get_gatk_version_check(config))

    # calmd reads the realigned BAM (by basename, in the temp dir) and
    # writes BAM ('-b') to stdout, captured as '<basename>.calmd'.
    calmd = AtomicCmd(["samtools", "calmd", "-b",
                       "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
                      TEMP_IN_BAM=self._basename,
                      IN_REF=reference,
                      TEMP_OUT_STDOUT=self._basename + ".calmd",
                      CHECK_VERSION=SAMTOOLS_VERSION)

    description = "<GATK Indel Realigner (aligning): %s -> %r>" \
        % (describe_files(infiles), outfile)

    CommandNode.__init__(self,
                         description=description,
                         command=ParallelCmds([command.finalize(), calmd]),
                         dependencies=dependencies)
def __init__(self, input_files, output_file, offset, dependencies=()):
    # 'offset' is stored for use during validation (presumably the FASTQ
    # quality-score offset, e.g. 33/64 -- confirm against _run).
    self._offset = offset

    Node.__init__(
        self,
        description="<Validate FASTQ Files: %s>"
        % describe_files(input_files),
        input_files=input_files,
        output_files=output_file,
        dependencies=dependencies,
    )
def __init__(self, input_files, output_file, dependencies=()):
    Node.__init__(self,
                  description="<Validate FASTA Files: %s>"
                  % describe_files(input_files),
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)

    # Sanity check: exactly one output file is expected for this node.
    assert len(self.output_files) == 1, self.output_files
def __init__(self, input_files, output_file, dependencies=()):
    self._output_file = output_file

    # Single merged coverage table produced from all input tables.
    label = "<MergeCoverage: %s -> '%s'>" % (describe_files(input_files),
                                             output_file)
    Node.__init__(self,
                  description=label,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, input_files, output_file, dependencies=()):
    description = "<Validate FASTA Files: %s>" % describe_files(input_files)
    Node.__init__(self,
                  description=description,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
    # This node must have exactly one output file; fail fast otherwise.
    assert len(self.output_files) == 1, self.output_files
def __init__(self, parameters):
    # The command was assembled during the customization step; only
    # finalization and forwarding happens here.
    MultiBAMInputNode.__init__(
        self,
        config=parameters.config,
        input_bams=parameters.input_files,
        command=parameters.command.finalize(),
        description="<mapDamage (plots): %s -> '%s'>"
        % (describe_files(parameters.input_files),
           parameters.output_directory),
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    # Forward the pre-configured mapDamage command to the BAM-merging
    # base class.
    label = "<mapDamage (plots): %s -> '%s'>" \
        % (describe_files(parameters.input_files),
           parameters.output_directory)
    MultiBAMInputNode.__init__(self,
                               config=parameters.config,
                               input_bams=parameters.input_files,
                               command=parameters.command.finalize(),
                               description=label,
                               dependencies=parameters.dependencies)
def __init__(
    self,
    reference,
    input_files,
    output_directory,
    title="mapDamage",
    options=None,
    dependencies=(),
):
    """Node running mapDamage to produce damage plots/statistics.

    reference        -- FASTA reference the BAMs were mapped against.
    input_files      -- BAM files merged and piped into mapDamage.
    output_directory -- directory receiving the plots and tables.
    title            -- value for mapDamage's '-t' option.
    options          -- optional dict of extra command-line options.
    """
    # 'options=None' avoids the shared-mutable-default pitfall of the
    # previous 'options={}' signature; behavior is unchanged for callers.
    if options is None:
        options = {}

    merge = merge_bam_files_command(input_files)
    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--no-stats",
            # Prevent references with many contigs from using excessive
            # amounts of memory, at the cost of per-contig statistics:
            "--merge-reference-sequences",
            "-t",
            title,
            "-i",
            "-",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
        ],
        IN_STDIN=merge,
        IN_REFERENCE=reference,
        OUT_FREQ_3p=os.path.join(output_directory, "3pGtoA_freq.txt"),
        OUT_FREQ_5p=os.path.join(output_directory, "5pCtoT_freq.txt"),
        OUT_COMP_USER=os.path.join(output_directory, "dnacomp.txt"),
        OUT_PLOT_FRAG=os.path.join(
            output_directory, "Fragmisincorporation_plot.pdf"
        ),
        OUT_PLOT_LEN=os.path.join(output_directory, "Length_plot.pdf"),
        OUT_LENGTH=os.path.join(output_directory, "lgdistribution.txt"),
        OUT_MISINCORP=os.path.join(output_directory, "misincorporation.txt"),
        OUT_LOG=os.path.join(output_directory, "Runtime_log.txt"),
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
    )

    apply_options(command, options)

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, command.finalize()]),
        description="<mapDamage (plots): %s -> '%s'>"
        % (describe_files(merge.input_files), output_directory,),
        dependencies=dependencies,
    )
def __init__(self, main_tree_files, support_tree_files, output_file, dependencies=()):
    self._output_file = output_file
    self._main_tree_files = safe_coerce_to_tuple(main_tree_files)
    self._support_tree_files = safe_coerce_to_tuple(support_tree_files)

    # Both the main trees and the support trees are inputs to this node.
    all_inputs = self._main_tree_files + self._support_tree_files
    Node.__init__(self,
                  description="<NewickSupport: %s>"
                  % (describe_files(main_tree_files),),
                  input_files=all_inputs,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, parameters):
    finalized = parameters.command.finalize()

    # With multiple input files, an auxiliary 'cat' process concatenates
    # them into the "uncompressed_input" pipe alongside the main command.
    self._multi_file_input = len(parameters.input_files) > 1
    if self._multi_file_input:
        cat = _build_cat_command(parameters.input_files,
                                 "uncompressed_input")
        finalized = ParallelCmds((finalized, cat))

    CommandNode.__init__(
        self,
        command=finalized,
        threads=parameters.threads,
        description="<AdapterRM (SE): %s -> '%s.*'>"
        % (fileutils.describe_files(parameters.input_files),
           parameters.output_prefix),
        dependencies=parameters.dependencies,
    )
def __init__(self, input_files, destination, dependencies=()):
    # The pipeline's 'cat' command streams the input files to stdout,
    # which is piped into gzip to produce the compressed destination.
    cat_builder = factory.new("cat")
    cat_builder.add_multiple_values(input_files)
    cat_builder.set_kwargs(OUT_STDOUT=AtomicCmd.PIPE)
    cat = cat_builder.finalize()

    gzip = AtomicCmd("gzip", IN_STDIN=cat, OUT_STDOUT=destination)

    CommandNode.__init__(self,
                         description="<Cat %s -> %s>"
                         % (fileutils.describe_files(input_files),
                            destination),
                         command=ParallelCmds((cat, gzip)),
                         dependencies=dependencies)
def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
    self._output_file = output_file
    self._tree_files = safe_coerce_to_tuple(tree_files)
    self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

    # Midpoint rooting unless explicit taxa were given, in which case
    # the (sorted) taxa are shown in the description.
    if self._reroot_on_taxa:
        reroot_on = repr("', '".join(sorted(self._reroot_on_taxa)))
    else:
        reroot_on = "midpoint"

    Node.__init__(self,
                  description="<NewickReroot (on %s): %s>"
                  % (reroot_on, describe_files(tree_files)),
                  input_files=self._tree_files,
                  output_files=self._output_file,
                  dependencies=dependencies)
def __init__(self, config, input_files, output_file, dependencies=()):
    input_files = safe_coerce_to_tuple(input_files)

    # 'duphist' reads the merged BAM from the pipe-file provided by
    # MultiBAMInputNode; its stdout becomes the histogram file.
    builder = factory.new("duphist")
    builder.add_value('%(TEMP_IN_BAM)s')
    builder.set_kwargs(OUT_STDOUT=output_file,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    builder.add_multiple_kwargs(input_files)

    MultiBAMInputNode.__init__(self,
                               config=config,
                               input_bams=input_files,
                               command=builder.finalize(),
                               description="<DuplicateHistogram: %s -> %r>"
                               % (describe_files(input_files), output_file),
                               dependencies=dependencies)
def __init__(self, input_files, output_file, offset, dependencies=()):
    """Node validating a set of FASTQ files described by read type.

    input_files -- dict mapping read-type (e.g. "Paired") to a filename
                   template; "Paired" templates contain a '{Pair}'
                   placeholder expanded to mate 1 and mate 2.
    offset      -- stored for validation (presumably the quality-score
                   offset -- confirm against the validation code).
    """
    self._offset = offset
    self._files = set()
    # '.items()' instead of the Python-2-only '.iteritems()'; both
    # behave identically here and this also works on Python 3.
    for (read_type, filename) in input_files.items():
        if read_type == "Paired":
            self._files.add((read_type, filename.format(Pair=1)))
            self._files.add((read_type, filename.format(Pair=2)))
        else:
            self._files.add((read_type, filename))

    input_files = [filename for _, filename in self._files]
    Node.__init__(self,
                  description="<Validate FASTQ Files: %s>"
                  % (describe_files(input_files)),
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, input_files, output_file, offset, dependencies=()):
    """Node validating FASTQ files given a {read-type: template} dict.

    "Paired" templates are expanded via their '{Pair}' placeholder into
    the two mate filenames; other types are used as-is.
    """
    self._offset = offset
    self._files = set()
    # Use '.items()' rather than the Python-2-only '.iteritems()' so the
    # code runs unchanged on both Python 2 and 3.
    for (read_type, filename) in input_files.items():
        if read_type == "Paired":
            self._files.add((read_type, filename.format(Pair=1)))
            self._files.add((read_type, filename.format(Pair=2)))
        else:
            self._files.add((read_type, filename))

    input_files = [filename for _, filename in self._files]
    Node.__init__(self,
                  description="<Validate FASTQ Files: %s>"
                  % (describe_files(input_files)),
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, input_files, destination, dependencies=()):
    # Build 'cat inputs | gzip > destination' as two cooperating
    # atomic commands.
    concat = factory.new("cat")
    concat.add_multiple_values(input_files)
    concat.set_kwargs(OUT_STDOUT=AtomicCmd.PIPE)
    concat = concat.finalize()

    compress = AtomicCmd("gzip",
                         IN_STDIN=concat,
                         OUT_STDOUT=destination)

    description = "<Cat %s -> %s>" % (fileutils.describe_files(input_files),
                                      destination)
    CommandNode.__init__(self,
                         description=description,
                         command=ParallelCmds((concat, compress)),
                         dependencies=dependencies)
def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
    self._output_file = output_file
    self._tree_files = safe_coerce_to_tuple(tree_files)
    self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

    # Default to midpoint rooting when no taxa are specified.
    reroot_on = (repr("', '".join(sorted(self._reroot_on_taxa)))
                 if self._reroot_on_taxa else "midpoint")

    description = "<NewickReroot (on %s): %s>" \
        % (reroot_on, describe_files(tree_files))
    Node.__init__(self,
                  description=description,
                  input_files=self._tree_files,
                  output_files=self._output_file,
                  dependencies=dependencies)
def __init__(self, main_tree_files, support_tree_files, output_file, dependencies=()):
    self._output_file = output_file
    self._main_tree_files = safe_coerce_to_tuple(main_tree_files)
    self._support_tree_files = safe_coerce_to_tuple(support_tree_files)

    description = "<NewickSupport: %s>" % (describe_files(main_tree_files),)

    # The node depends on both groups of trees as inputs.
    Node.__init__(self,
                  description=description,
                  input_files=(self._main_tree_files
                               + self._support_tree_files),
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(
    self,
    reference,
    input_files,
    output_file,
    directory,
    options=None,
    dependencies=(),
):
    """Node running mapDamage in '--rescale-only' mode to rescale
    quality scores in the merged BAM, writing the result to output_file.

    directory -- stored on the instance; presumably holds results from a
                 previous mapDamage run used during setup (not visible
                 in this block -- confirm against _setup/_teardown).
    options   -- optional dict of extra mapDamage command-line options.
    """
    # 'options=None' avoids the shared-mutable-default pitfall of the
    # previous 'options={}' signature; behavior is unchanged for callers.
    if options is None:
        options = {}

    stats_out_fname = "Stats_out_MCMC_correct_prob.csv"
    merge = merge_bam_files_command(input_files)
    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--rescale-only",
            "-i",
            "-",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
            "--rescale-out",
            "%(OUT_BAM)s",
        ],
        IN_STDIN=merge,
        IN_REFERENCE=reference,
        TEMP_OUT_LOG="Runtime_log.txt",
        TEMP_OUT_CSV=stats_out_fname,
        OUT_BAM=output_file,
        CHECK_VERSION=MAPDAMAGE_VERSION,
    )

    apply_options(command, options)

    self._directory = directory

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, command.finalize()]),
        description="<mapDamage (rescale): %s -> %r>"
        % (describe_files(merge.input_files), output_file,),
        dependencies=dependencies,
    )
def __init__(self, input_files, output_file, offset, collapsed=False, dependencies=()):
    # Build the ':validate_fastq' factory command; validation results go
    # to stdout, which is captured in the requested output file.
    builder = factory.new(":validate_fastq")
    builder.set_option("--offset", offset)
    if collapsed:
        builder.set_option("--collapsed")
    builder.add_multiple_values(input_files)
    builder.set_kwargs(OUT_STDOUT=output_file)

    CommandNode.__init__(
        self,
        description="<Validate FASTQ Files: %s>"
        % describe_files(input_files),
        command=builder.finalize(),
        dependencies=dependencies,
    )
def __init__(self, config, input_bams, output_bam, keep_dupes=True, dependencies=()):
    # Inputs are merged into one BAM stream which feeds the
    # 'rmdup_collapsed' filter via stdin.
    merge = merge_bam_files_command(input_bams)

    filter_cmd = factory.new("rmdup_collapsed")
    filter_cmd.set_kwargs(IN_STDIN=merge, OUT_STDOUT=output_bam)
    if not keep_dupes:
        filter_cmd.set_option("--remove-duplicates")

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, filter_cmd.finalize()]),
        description="<FilterCollapsedBAM: %s>"
        % (describe_files(merge.input_files),),
        dependencies=dependencies,
    )
def __init__(self, config, target_name, input_files, output_file, prefix,
             regions_file=None, dependencies=()):
    """Node computing a depth histogram across one or more BAM files.

    When a regions_file is given (presumably a BED file -- confirm),
    depths are restricted to those regions, which requires an index for
    the merged BAM; its extension comes from prefix['IndexFormat'].
    """
    input_files = safe_coerce_to_tuple(input_files)
    # Index only required when restricting depths to specific regions.
    index_format = regions_file and prefix['IndexFormat']

    builder = factory.new("depths")
    # Merged input arrives via the MultiBAMInputNode pipe-file.
    builder.add_value("%(TEMP_IN_BAM)s")
    builder.add_value("%(OUT_FILE)s")
    builder.set_option("--target-name", target_name)
    builder.set_kwargs(OUT_FILE=output_file,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    builder.add_multiple_kwargs(input_files)

    if regions_file:
        # The index is expected next to the pipe-file in the temp dir.
        index_file = swap_ext(MultiBAMInputNode.PIPE_FILE, index_format)
        builder.set_option('--regions-file', '%(IN_REGIONS)s')
        builder.set_kwargs(IN_REGIONS=regions_file,
                           TEMP_IN_INDEX=index_file)

    description = "<DepthHistogram: %s -> '%s'>" \
        % (describe_files(input_files), output_file)
    MultiBAMInputNode.__init__(self,
                               config=config,
                               input_bams=input_files,
                               index_format=index_format,
                               command=builder.finalize(),
                               description=description,
                               dependencies=dependencies)
def __init__(
    self,
    config,
    input_bams,
    output_bam,
    output_metrics=None,
    keep_dupes=False,
    dependencies=(),
):
    """Node running Picard MarkDuplicates over one or more BAM files.

    output_metrics defaults to the output BAM's path with a '.metrics'
    extension when not given. With keep_dupes=False (the default),
    duplicate records are removed rather than only flagged.
    """
    params = picard_command(config, "MarkDuplicates")
    _set_max_open_files(params, "MAX_FILE_HANDLES")

    params.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
    params.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
    # Validation is mostly left to manual ValidateSamFile runs; required
    # because .csi indexed BAM records can have "invalid" bins.
    params.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
    params.add_multiple_options("I", input_bams, sep="=")

    if not keep_dupes:
        # Remove duplicates from output by default to save disk-space
        params.set_option("REMOVE_DUPLICATES", "True", sep="=", fixed=False)

    output_metrics = output_metrics or swap_ext(output_bam, ".metrics")
    params.set_kwargs(OUT_BAM=output_bam, OUT_METRICS=output_metrics)

    description = "<MarkDuplicates: %s>" % (describe_files(input_bams), )
    PicardNode.__init__(
        self,
        command=params.finalize(),
        description=description,
        dependencies=dependencies,
    )
def test_describe_files__no_files():
    # An empty collection yields the fixed placeholder string.
    assert_equal(describe_files(tuple()), "No files")
def test_describe_files__same_path_abs__1_differences():
    # Names differing by a single character collapse into a '?' glob.
    paths = ("/var/foo/faz", "/var/foo/fao")
    assert_equal(describe_files(paths), "'/var/foo/fa?'")
def test_describe_files__same_path_abs__3_differences():
    # Too many differing characters: falls back to count + directory.
    assert_equal(describe_files(("/var/foo/bar", "/var/foo/foo")),
                 "2 files in '/var/foo'")
def test_describe_files__different_paths_rel():
    # Files in distinct directories: only the count is reported.
    filenames = ("var/foo/bar", "var/bar/foo")
    assert_equal(describe_files(filenames), "2 files")
def test_describe_files__same_path_rel():
    # A shared relative directory is shown alongside the count.
    assert_equal(describe_files(("var/foo/bar", "var/foo/foo")),
                 "2 files in 'var/foo'")
def test_describe_files__no_files():
    # Zero input files produce the literal "No files" description.
    no_files = ()
    assert_equal(describe_files(no_files), "No files")
def test_describe_files__iterable():
    # Plain iterators are accepted, not only concrete sequences.
    paths = iter(("/var/foo/bar", "/var/foo/foo"))
    assert_equal(describe_files(paths), "2 files in '/var/foo'")
def test_describe_files__same_path_abs__3_differences():
    # Several character differences prevent glob-collapsing; the shared
    # directory plus a count is used instead.
    filenames = ("/var/foo/bar", "/var/foo/foo")
    expected = "2 files in '/var/foo'"
    assert_equal(describe_files(filenames), expected)
def test_describe_files__single_file():
    # A lone file is rendered as the repr() of its path.
    path = "/var/foo/bar"
    assert_equal(describe_files((path,)), repr(path))
def test_describe_files__same_path_abs__1_differences():
    # A one-character difference is masked by '?' in the description.
    expected = "'/var/foo/fa?'"
    assert_equal(describe_files(("/var/foo/faz", "/var/foo/fao")), expected)
def test_describe_files__same_path_rel():
    # Relative paths sharing a directory report that directory.
    filenames = ("var/foo/bar", "var/foo/foo")
    result = describe_files(filenames)
    assert_equal(result, "2 files in 'var/foo'")
def test_describe_files__different_paths_rel():
    # No common directory: the description degrades to a bare count.
    result = describe_files(("var/foo/bar", "var/bar/foo"))
    assert_equal(result, "2 files")
def test_describe_files__iterable():
    # describe_files must consume a one-shot iterator correctly.
    source = ("/var/foo/bar", "/var/foo/foo")
    assert_equal(describe_files(iter(source)), "2 files in '/var/foo'")
def test_describe_files__single_file():
    # With exactly one file, the description is simply its repr().
    single = "/var/foo/bar"
    result = describe_files((single, ))
    assert_equal(result, repr(single))