def _do_test_parallel_commands__ready_two(first, second, result):
    """Helper: ready() of a two-command set matches the expected result."""
    mock_a = flexmock(AtomicCmd(["ls"]))
    mock_a.should_receive('ready').and_return(first).at_least.once
    mock_b = flexmock(AtomicCmd(["ls"]))
    mock_b.should_receive('ready').and_return(second)
    parallel = ParallelCmds([mock_a, mock_b])
    assert_equal(parallel.ready(), result)
def test_parallel_commands__join_before_run():
    """join() before run() must not call join() on the wrapped commands."""
    cmd_mocks = []
    for value in reversed(range(3)):
        mocked = flexmock(AtomicCmd("true"))
        mocked.should_receive('join').and_return([value]).never
        cmd_mocks.append(mocked)
    parallel = ParallelCmds(cmd_mocks)
    assert_equal(parallel.join(), [None, None, None])
def test_parallel_commands__ready_single(value):
    """ready() of a single-command set mirrors that command's ready()."""
    command = AtomicCmd(["ls"])
    command.ready = Mock()
    command.ready.return_value = value
    parallel = ParallelCmds([command])
    assert parallel.ready() == value
    command.ready.assert_called()
def test_parallel_commands__run():
    """run() forwards the temp-dir argument to every wrapped command."""
    cmd_mocks = []
    for _ in range(3):
        mocked = flexmock(AtomicCmd(["ls"]))
        mocked.should_receive('run').with_args("xTMPx").once
        cmd_mocks.append(mocked)
    parallel = ParallelCmds(cmd_mocks)
    parallel.run("xTMPx")
def test_parallel_commands__join_before_run():
    """join() before run() returns None per command and calls nothing."""
    recorder = Mock()
    first = AtomicCmd(["ls"])
    first.join = recorder.join_1
    second = AtomicCmd(["ls"])
    second.join = recorder.join_2
    third = AtomicCmd(["ls"])
    third.join = recorder.join_3
    parallel = ParallelCmds((third, second, first))
    assert parallel.join() == [None, None, None]
    # No command's join() may have been invoked
    assert recorder.mock_calls == []
def test_parallel_commands__ready_two(first, second, result):
    """ready() of a two-command set matches the expected combined result."""
    lhs = AtomicCmd(["ls"])
    lhs.ready = Mock()
    lhs.ready.return_value = first
    rhs = AtomicCmd(["ls"])
    rhs.ready = Mock()
    rhs.ready.return_value = second
    parallel = ParallelCmds([lhs, rhs])
    assert parallel.ready() == result
    lhs.ready.assert_called()
    # The second command is only polled when the first reported ready
    assert bool(first) == bool(rhs.ready.call_count)
def __init__(self, infile, outfile, regions, options, dependencies=()):
    """Node filtering a VCF file and bgzip-compressing the result."""
    filter_cmd = factory.new("vcf_filter")
    filter_cmd.add_value("%(IN_VCF)s")
    # One --homozygous-chromosome flag per listed contig
    for contig in regions["HomozygousContigs"]:
        filter_cmd.add_option("--homozygous-chromosome", contig)
    filter_cmd.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)
    apply_options(filter_cmd, options)

    compress_cmd = AtomicCmdBuilder(["bgzip"],
                                    IN_STDIN=filter_cmd,
                                    OUT_STDOUT=outfile)

    CommandNode.__init__(
        self,
        description="<VCFFilter: '%s' -> '%s'>" % (infile, outfile),
        command=ParallelCmds([filter_cmd.finalize(), compress_cmd.finalize()]),
        dependencies=dependencies,
    )
def __init__(self, config, reference, intervals, infiles, outfile, dependencies=()):
    """Node running the GATK IndelRealigner, piping into 'samtools calmd'."""
    self._basename = os.path.basename(outfile)
    infiles = safe_coerce_to_tuple(infiles)

    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    gatk = AtomicJavaCmdBuilder(gatk_jar, jre_options=config.jre_options)
    gatk.set_option("-T", "IndelRealigner")
    gatk.set_option("-R", "%(IN_REFERENCE)s")
    gatk.set_option("-targetIntervals", "%(IN_INTERVALS)s")
    gatk.set_option("-o", "%(OUT_BAMFILE)s")
    # Compression and indexing are skipped here; calmd post-processes the BAM
    gatk.set_option("--bam_compression", 0)
    gatk.set_option("--disable_bam_indexing")
    _set_input_files(gatk, infiles)
    gatk.set_kwargs(
        IN_REFERENCE=reference,
        IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
        IN_INTERVALS=intervals,
        OUT_BAMFILE=outfile,
        CHECK_GATK=_get_gatk_version_check(config),
    )

    calmd = AtomicCmd(
        ["samtools", "calmd", "-b", "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
        TEMP_IN_BAM=self._basename,
        IN_REF=reference,
        TEMP_OUT_STDOUT=self._basename + ".calmd",
        CHECK_VERSION=SAMTOOLS_VERSION,
    )

    CommandNode.__init__(
        self,
        description="<GATK Indel Realigner (aligning): %s -> %r>"
        % (describe_files(infiles), outfile),
        command=ParallelCmds([gatk.finalize(), calmd]),
        dependencies=dependencies,
    )
def __init__(
    self,
    input_file_1,
    output_file,
    reference,
    prefix,
    input_file_2=None,
    threads=1,
    algorithm="mem",
    mapping_options=None,
    cleanup_options=None,
    dependencies=(),
):
    """Node mapping SE or PE reads with 'bwa mem'/'bwa bwasw', piped into cleanup.

    Raises NotImplementedError for unsupported algorithms.
    """
    # None defaults avoid the shared mutable-default pitfall ({} is
    # evaluated once and shared between every instantiation)
    if mapping_options is None:
        mapping_options = {}
    if cleanup_options is None:
        cleanup_options = {}

    if algorithm not in ("mem", "bwasw"):
        raise NotImplementedError("BWA algorithm %r not implemented" % (algorithm, ))

    threads = _get_max_threads(reference, threads)

    aln = _new_bwa_command(
        ("bwa", algorithm, prefix, "%(IN_FILE_1)s"),
        prefix,
        IN_FILE_1=input_file_1,
        OUT_STDOUT=AtomicCmd.PIPE,
    )

    if input_file_2:
        aln.add_value("%(IN_FILE_2)s")
        aln.set_kwargs(IN_FILE_2=input_file_2)

    aln.set_option("-t", threads)
    # Mark alternative hits as secondary; required by e.g. Picard
    aln.set_option("-M")

    cleanup = _new_cleanup_command(aln, output_file, reference,
                                   paired_end=input_file_1 and input_file_2)

    apply_options(aln, mapping_options)
    apply_options(cleanup, cleanup_options)

    description = _get_node_description(
        name="BWA",
        algorithm="%s%s" % (algorithm.upper(), "_PE" if input_file_2 else "_SE"),
        input_files_1=input_file_1,
        input_files_2=input_file_2,
        prefix=prefix,
    )

    CommandNode.__init__(
        self,
        command=ParallelCmds([aln.finalize(), cleanup.finalize()]),
        description=description,
        threads=threads,
        dependencies=dependencies,
    )
def test_pformat__sets__nested():
    """pformat renders nested Parallel/Sequential sets with piped processes."""
    echo = AtomicCmd(("echo", "foo"), OUT_STDOUT=AtomicCmd.PIPE)
    gzip_cmd = AtomicCmd("gzip", IN_STDIN=echo)
    sha1 = AtomicCmd("sha1sum")
    nested = SequentialCmds((ParallelCmds((echo, gzip_cmd)), sha1))

    expected = (
        "Sequential processes:\n"
        " Parallel processes:\n"
        " Process 1:\n"
        " Command = echo foo\n"
        " STDOUT = Piped to process 2\n"
        " STDERR* = '${{TEMP_DIR}}/pipe_echo_{cmd_1}.stderr'\n"
        "\n"
        " Process 2:\n"
        " Command = gzip\n"
        " STDIN = Piped from process 1\n"
        " STDOUT* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2}.stdout'\n"
        " STDERR* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2}.stderr'\n"
        "\n"
        " Process 3:\n"
        " Command = sha1sum\n"
        " STDOUT* = '${{TEMP_DIR}}/pipe_sha1sum_{cmd_3}.stdout'\n"
        " STDERR* = '${{TEMP_DIR}}/pipe_sha1sum_{cmd_3}.stderr'"
    ).format(cmd_1=id(echo), cmd_2=id(gzip_cmd), cmd_3=id(sha1))

    assert pformat(nested) == expected
def test_parallel_commands__run():
    """run() invokes run(temp-dir) on each command, in insertion order."""
    recorder = Mock()
    first = AtomicCmd(["ls"])
    first.run = recorder.run_1
    second = AtomicCmd(["ls"])
    second.run = recorder.run_2
    third = AtomicCmd(["ls"])
    third.run = recorder.run_3

    parallel = ParallelCmds((first, second, third))
    parallel.run("xTMPx")

    assert recorder.mock_calls == [
        call.run_1("xTMPx"),
        call.run_2("xTMPx"),
        call.run_3("xTMPx"),
    ]
def __init__(
    self,
    reference,
    input_files,
    output_directory,
    title="mapDamage",
    options=None,
    dependencies=(),
):
    """Node producing mapDamage damage plots from a set of merged BAMs."""
    # A None default avoids sharing one dict between every instantiation
    if options is None:
        options = {}

    merge = merge_bam_files_command(input_files)
    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--no-stats",
            # Prevent references with many contigs from using excessive
            # amounts of memory, at the cost of per-contig statistics:
            "--merge-reference-sequences",
            "-t",
            title,
            "-i",
            "-",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
        ],
        IN_STDIN=merge,
        IN_REFERENCE=reference,
        OUT_FREQ_3p=os.path.join(output_directory, "3pGtoA_freq.txt"),
        OUT_FREQ_5p=os.path.join(output_directory, "5pCtoT_freq.txt"),
        OUT_COMP_USER=os.path.join(output_directory, "dnacomp.txt"),
        OUT_PLOT_FRAG=os.path.join(
            output_directory, "Fragmisincorporation_plot.pdf"
        ),
        OUT_PLOT_LEN=os.path.join(output_directory, "Length_plot.pdf"),
        OUT_LENGTH=os.path.join(output_directory, "lgdistribution.txt"),
        OUT_MISINCORP=os.path.join(output_directory, "misincorporation.txt"),
        OUT_LOG=os.path.join(output_directory, "Runtime_log.txt"),
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
    )

    apply_options(command, options)

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, command.finalize()]),
        description="<mapDamage (plots): %s -> '%s'>"
        % (describe_files(merge.input_files), output_directory,),
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """Node chaining the prepared cat | filter | bgzip commands."""
    finalized = [parameters.commands[key].finalize()
                 for key in ("cat", "filter", "bgzip")]

    CommandNode.__init__(
        self,
        description="<VCFFilter: '%s' -> '%s'>"
        % (parameters.infile, parameters.outfile),
        command=ParallelCmds(finalized),
        dependencies=parameters.dependencies,
    )
def __init__(
    self,
    reference,
    infile,
    bedfile,
    outfile,
    mpileup_options=None,
    bcftools_options=None,
    dependencies=(),
):
    """Node genotyping a BAM via 'bcftools mpileup | bcftools call'.

    Writes a compressed VCF to 'outfile'; 'bedfile' may be falsy to
    genotype without region restrictions.
    """
    # None defaults avoid sharing option dicts between instantiations
    if mpileup_options is None:
        mpileup_options = {}
    if bcftools_options is None:
        bcftools_options = {}

    mpileup = AtomicCmdBuilder(
        ("bcftools", "mpileup", "%(IN_BAMFILE)s"),
        IN_BAMFILE=infile,
        IN_INTERVALS=bedfile,
        OUT_STDOUT=AtomicCmd.PIPE,
        CHECK_VERSION=BCFTOOLS_VERSION,
    )

    # Ignore read-groups for pileup
    mpileup.add_option("--ignore-RG")
    # Reference sequence (FASTA)
    mpileup.add_option("--fasta-ref", reference)
    # '-O u' emits uncompressed BCF, the cheapest format for piping into
    # 'bcftools call' (the original comment claimed compressed VCF)
    mpileup.add_option("--output-type", "u")

    if bedfile:
        mpileup.set_option("--regions-file", "%(IN_INTERVALS)s")

    apply_options(mpileup, mpileup_options)

    genotype = AtomicCmdBuilder(
        ("bcftools", "call", "-"),
        IN_STDIN=mpileup,
        IN_BAMFILE=infile,
        OUT_STDOUT=outfile,
        CHECK_VERSION=BCFTOOLS_VERSION,
    )

    # '-O z' produces a bgzip-compressed VCF as the final output
    genotype.set_option("--output-type", "z")

    apply_options(genotype, bcftools_options)

    CommandNode.__init__(
        self,
        description="<GenotypeRegions: '%s' -> '%s'>" % (
            infile,
            outfile,
        ),
        command=ParallelCmds([mpileup.finalize(), genotype.finalize()]),
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """Node running the prepared 'bwa samse' command-chain in parallel."""
    finalized = [parameters.commands[key].finalize()
                 for key in parameters.order]

    CommandNode.__init__(
        self,
        command=ParallelCmds(finalized),
        description=_get_node_description(
            name="BWA Samse",
            input_files_1=parameters.input_file_fq,
            prefix=parameters.prefix,
        ),
        dependencies=parameters.dependencies,
    )
def __init__(
    self,
    input_file_fq_1,
    input_file_fq_2,
    input_file_sai_1,
    input_file_sai_2,
    output_file,
    reference,
    prefix,
    mapping_options=None,
    cleanup_options=None,
    dependencies=(),
):
    """Node running 'bwa sampe' on paired-end reads, piped into cleanup."""
    # None defaults avoid sharing option dicts between instantiations
    if mapping_options is None:
        mapping_options = {}
    if cleanup_options is None:
        cleanup_options = {}

    sampe = _new_bwa_command(
        (
            "bwa",
            "sampe",
            prefix,
            "%(IN_SAI_1)s",
            "%(IN_SAI_2)s",
            "%(IN_FQ_1)s",
            "%(IN_FQ_2)s",
        ),
        prefix,
        IN_SAI_1=input_file_sai_1,
        IN_SAI_2=input_file_sai_2,
        IN_FQ_1=input_file_fq_1,
        IN_FQ_2=input_file_fq_2,
        OUT_STDOUT=AtomicCmd.PIPE,
    )

    cleanup = _new_cleanup_command(sampe, output_file, reference, paired_end=True)

    apply_options(sampe, mapping_options)
    apply_options(cleanup, cleanup_options)

    CommandNode.__init__(
        self,
        command=ParallelCmds([sampe.finalize(), cleanup.finalize()]),
        description=_get_node_description(
            name="BWA Sampe",
            input_files_1=input_file_fq_1,
            input_files_2=input_file_fq_2,
            prefix=prefix,
        ),
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """AdapterRemoval (SE) node; cats multiple inputs into a single stream."""
    command = parameters.command.finalize()
    self._multi_file_input = len(parameters.input_files) > 1
    if self._multi_file_input:
        # With several input files, feed AdapterRemoval via an explicit cat
        cat = _build_cat_command(parameters.input_files, "uncompressed_input")
        command = ParallelCmds((command, cat))

    CommandNode.__init__(
        self,
        command=command,
        threads=parameters.threads,
        description="<AdapterRM (SE): %s -> '%s.*'>"
        % (fileutils.describe_files(parameters.input_files),
           parameters.output_prefix),
        dependencies=parameters.dependencies,
    )
def __init__(self, input_files, destination, dependencies=()):
    """Node concatenating files and writing them gzip-compressed."""
    cat = factory.new("cat")
    cat.add_multiple_values(input_files)
    cat.set_kwargs(OUT_STDOUT=AtomicCmd.PIPE)
    cat = cat.finalize()

    compress = AtomicCmd("gzip", IN_STDIN=cat, OUT_STDOUT=destination)

    CommandNode.__init__(
        self,
        description="<Cat %s -> %s>"
        % (fileutils.describe_files(input_files), destination),
        command=ParallelCmds((cat, compress)),
        dependencies=dependencies,
    )
def __init__(self, parameters):
    """BWA backtrack node built from customize()-style parameters."""
    finalized = [parameters.commands[key].finalize()
                 for key in parameters.order]

    CommandNode.__init__(
        self,
        command=ParallelCmds(finalized),
        description=_get_node_description(
            name="BWA",
            algorithm='Backtrack',
            input_files_1=parameters.input_file,
            prefix=parameters.prefix,
            threads=parameters.threads,
        ),
        threads=parameters.threads,
        dependencies=parameters.dependencies,
    )
def __init__(self, parameters):
    """BWA node built from customize()-style parameters.

    Finalizes every prepared command and runs them as one parallel set.
    """
    _check_bwa_prefix(parameters.prefix)
    algorithm = parameters.algorithm.upper()
    algorithm += "_PE" if parameters.input_file_2 else "_SE"
    desc = _get_node_description(name="BWA",
                                 algorithm=algorithm,
                                 input_files_1=parameters.input_file_1,
                                 input_files_2=parameters.input_file_2,
                                 prefix=parameters.prefix)

    # dict.values() is valid on both Python 2 and 3; 'itervalues' is
    # Python 2 only and raises AttributeError on Python 3
    command = ParallelCmds([cmd.finalize()
                            for cmd in parameters.commands.values()])
    CommandNode.__init__(self,
                         command=command,
                         description=desc,
                         threads=parameters.threads,
                         dependencies=parameters.dependencies)
def __init__(self, parameters):
    """Bowtie2 node built from customize()-style parameters."""
    finalized = [parameters.commands[key].finalize()
                 for key in parameters.order]
    description = _get_node_description(
        name="Bowtie2",
        algorithm="PE" if parameters.input_file_2 else "SE",
        input_files_1=parameters.input_file_1,
        input_files_2=parameters.input_file_2,
        prefix=parameters.prefix,
        threads=parameters.threads,
    )

    CommandNode.__init__(self,
                         command=ParallelCmds(finalized),
                         description=description,
                         threads=parameters.threads,
                         dependencies=parameters.dependencies)
def __init__(
    self,
    reference,
    input_files,
    output_file,
    directory,
    options=None,
    dependencies=(),
):
    """Node rescaling BAM quality scores with mapDamage '--rescale-only'."""
    # A None default avoids sharing one dict between every instantiation
    if options is None:
        options = {}

    stats_out_fname = "Stats_out_MCMC_correct_prob.csv"
    merge = merge_bam_files_command(input_files)
    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--rescale-only",
            "-i",
            "-",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
            "--rescale-out",
            "%(OUT_BAM)s",
        ],
        IN_STDIN=merge,
        IN_REFERENCE=reference,
        TEMP_OUT_LOG="Runtime_log.txt",
        TEMP_OUT_CSV=stats_out_fname,
        OUT_BAM=output_file,
        CHECK_VERSION=MAPDAMAGE_VERSION,
    )

    apply_options(command, options)

    self._directory = directory

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, command.finalize()]),
        description="<mapDamage (rescale): %s -> %r>"
        % (describe_files(merge.input_files), output_file,),
        dependencies=dependencies,
    )
def __init__(self, config, input_bams, output_bam, keep_dupes=True, dependencies=()):
    """Node piping merged BAMs through 'rmdup_collapsed'.

    The 'config' argument is accepted for signature compatibility but is
    not referenced in this constructor.
    """
    merge = merge_bam_files_command(input_bams)
    rmdup = factory.new("rmdup_collapsed")
    rmdup.set_kwargs(IN_STDIN=merge, OUT_STDOUT=output_bam)
    if not keep_dupes:
        rmdup.set_option("--remove-duplicates")

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, rmdup.finalize()]),
        description="<FilterCollapsedBAM: %s>" % (describe_files(
            merge.input_files), ),
        dependencies=dependencies,
    )
def __init__(self, config, input_bams, command, index_format=None,
             description=None, threads=1, dependencies=()):
    """Wraps 'command', prefixing it with Picard MergeSamFiles when
    multiple input BAMs are given.

    Raises ValueError for empty input, for index requirements with
    multiple inputs, and for unknown index formats.
    """
    self._input_bams = safe_coerce_to_tuple(input_bams)
    self._index_format = index_format

    if not self._input_bams:
        raise ValueError("No input BAM files specified!")

    if len(self._input_bams) > 1 and index_format:
        raise ValueError("BAM index cannot be required for > 1 file")

    if index_format not in (None, ".bai", ".csi"):
        raise ValueError("Unknown index format %r" % (index_format, ))

    if len(self._input_bams) > 1:
        merger = picard_command(config, "MergeSamFiles")
        merger.set_option("SO", "coordinate", sep="=")
        merger.set_option("COMPRESSION_LEVEL", 0, sep="=")
        merger.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
        # Validation is mostly left to manual ValidateSamFile runs; this
        # is because .csi indexed BAM records can have "invalid" bins.
        merger.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
        merger.add_multiple_options("I", input_bams, sep="=")
        merger.set_kwargs(TEMP_OUT_BAM=self.PIPE_FILE)

        command = ParallelCmds([merger.finalize(), command])

    CommandNode.__init__(self,
                         command=command,
                         description=description,
                         threads=threads,
                         dependencies=dependencies)
def __init__(
    self,
    input_file_fq,
    input_file_sai,
    output_file,
    reference,
    prefix,
    mapping_options=None,
    cleanup_options=None,
    dependencies=(),
):
    """Node running 'bwa samse' on single-end reads, piped into cleanup."""
    # None defaults avoid sharing option dicts between instantiations
    if mapping_options is None:
        mapping_options = {}
    if cleanup_options is None:
        cleanup_options = {}

    samse = _new_bwa_command(
        ("bwa", "samse"),
        prefix,
        IN_FILE_SAI=input_file_sai,
        IN_FILE_FQ=input_file_fq,
        OUT_STDOUT=AtomicCmd.PIPE,
    )
    # Positional arguments: <prefix> <sai> <fastq>
    samse.add_value(prefix)
    samse.add_value("%(IN_FILE_SAI)s")
    samse.add_value("%(IN_FILE_FQ)s")

    cleanup = _new_cleanup_command(samse, output_file, reference)

    apply_options(samse, mapping_options)
    apply_options(cleanup, cleanup_options)

    CommandNode.__init__(
        self,
        command=ParallelCmds([samse.finalize(), cleanup.finalize()]),
        description=_get_node_description(name="BWA Samse",
                                          input_files_1=input_file_fq,
                                          prefix=prefix),
        dependencies=dependencies,
    )
def test_parallel_commands__reject_empty_commandset():
    """Constructing a ParallelCmds with no commands raises CmdError."""
    with pytest.raises(CmdError):
        ParallelCmds([])
def test_parallel_commands__join_after_run(temp_folder):
    """After run(), join() returns each command's zero exit-code."""
    commands = ParallelCmds([AtomicCmd("true") for _ in range(3)])
    commands.run(temp_folder)
    assert_equal(commands.join(), [0, 0, 0])
def test_parallel_commands__join_failure_3(temp_folder):
    """A failing third command results in the first two being SIGTERMed."""
    commands = ParallelCmds(_setup_mocks_for_failure(True, True, False))
    commands.run(temp_folder)
    assert_equal(commands.join(), ['SIGTERM', 'SIGTERM', 1])
def test_parallel_commands__reject_sequential():
    """ParallelCmds refuses to wrap a SequentialCmds instance."""
    nested = SequentialCmds([AtomicCmd(["ls"])])
    with pytest.raises(CmdError):
        ParallelCmds([nested])
def test_parallel_commands__reject_noncommand():
    """Arbitrary objects are rejected by the ParallelCmds constructor."""
    with pytest.raises(CmdError):
        ParallelCmds([object()])
def __init__(
    self,
    input_file_1,
    input_file_2,
    output_file,
    reference,
    prefix,
    threads=2,
    log_file=None,
    mapping_options=None,
    cleanup_options=None,
    dependencies=(),
):
    """Node mapping SE/PE reads with Bowtie2, piped into the cleanup command.

    Raises NodeError unless input 1 (SE) or inputs 1 and 2 (PE) are given.
    """
    # None defaults avoid sharing option dicts between instantiations
    if mapping_options is None:
        mapping_options = {}
    if cleanup_options is None:
        cleanup_options = {}

    # Setting IN_FILE_2 to None makes AtomicCmd ignore this key
    aln = _bowtie2_template(
        ("bowtie2", ),
        prefix,
        OUT_STDOUT=AtomicCmd.PIPE,
        CHECK_VERSION=BOWTIE2_VERSION,
    )

    aln.set_option("-x", prefix)

    if log_file is not None:
        aln.set_kwargs(OUT_STDERR=log_file)

    if input_file_1 and not input_file_2:
        aln.add_option("-U", input_file_1)
    elif input_file_1 and input_file_2:
        aln.add_option("-1", input_file_1)
        aln.add_option("-2", input_file_2)
    else:
        raise NodeError("Input 1, OR both input 1 and input 2 must "
                        "be specified for Bowtie2 node")

    max_threads = _get_max_threads(reference, threads)
    aln.set_option("--threads", max_threads)

    cleanup = _new_cleanup_command(aln, output_file, reference,
                                   paired_end=input_file_1 and input_file_2)

    apply_options(aln, mapping_options)
    apply_options(cleanup, cleanup_options)

    algorithm = "PE" if input_file_2 else "SE"
    description = _get_node_description(
        name="Bowtie2",
        algorithm=algorithm,
        input_files_1=input_file_1,
        input_files_2=input_file_2,
        prefix=prefix,
        threads=threads,
    )

    # NOTE(review): the node reserves 'threads' while bowtie2 itself is
    # capped at 'max_threads' — confirm this asymmetry is intentional
    CommandNode.__init__(
        self,
        command=ParallelCmds([aln.finalize(), cleanup.finalize()]),
        description=description,
        threads=threads,
        dependencies=dependencies,
    )
def test_sequential_commands__accept_parallel():
    """A ParallelCmds instance is a valid member of a SequentialCmds set."""
    parallel = ParallelCmds([AtomicCmd(["ls"])])
    SequentialCmds([parallel])
def _do_test_parallel_commands__ready_single(value):
    """Helper: ready() of a one-command set equals that command's ready()."""
    mocked = flexmock(AtomicCmd(["ls"]))
    mocked.should_receive('ready').and_return(value).at_least.once
    parallel = ParallelCmds([mocked])
    assert_equal(parallel.ready(), value)