def test_builder__set_kwargs__after_finalize():
    # A finalized builder must reject further keyword changes, and the
    # keywords that were set before finalization must be left untouched.
    cmd_builder = AtomicCmdBuilder("echo")
    cmd_builder.set_kwargs(IN_PATH="/a/b/")
    cmd_builder.finalize()

    assert_raises(
        AtomicCmdBuilderError,
        cmd_builder.set_kwargs,
        OUT_PATH="/dst/file",
    )

    kwargs_before_finalize = {"IN_PATH": "/a/b/"}
    assert_equal(cmd_builder.kwargs, kwargs_before_finalize)
def __init__(
    self,
    reference,
    infile,
    bedfile,
    outfile,
    mpileup_options=None,
    bcftools_options=None,
    dependencies=(),
):
    """Set up a 'bcftools mpileup | bcftools call' pipeline for one BAM file.

    Arguments:
      reference        -- Reference sequence passed to mpileup via --fasta-ref.
      infile           -- BAM file that is piled up.
      bedfile          -- File of regions; if truthy, mpileup is limited to
                          these regions via --regions-file.
      outfile          -- File receiving the stdout of 'bcftools call'.
      mpileup_options  -- Dict of extra options for 'bcftools mpileup', applied
                          with apply_options; None means no extra options.
      bcftools_options -- Dict of extra options for 'bcftools call', applied
                          with apply_options; None means no extra options.
      dependencies     -- Passed on unchanged to CommandNode.
    """
    # Create the defaults per call; a literal `{}` default is a single dict
    # object shared between every instantiation of the node.
    if mpileup_options is None:
        mpileup_options = {}
    if bcftools_options is None:
        bcftools_options = {}

    mpileup = AtomicCmdBuilder(
        ("bcftools", "mpileup", "%(IN_BAMFILE)s"),
        IN_BAMFILE=infile,
        IN_INTERVALS=bedfile,
        OUT_STDOUT=AtomicCmd.PIPE,
        CHECK_VERSION=BCFTOOLS_VERSION,
    )

    # Ignore read-groups for pileup
    mpileup.add_option("--ignore-RG")
    # Reference sequence (FASTA)
    mpileup.add_option("--fasta-ref", reference)
    # 'u' selects uncompressed BCF; the output is only piped into 'call'
    mpileup.add_option("--output-type", "u")

    if bedfile:
        mpileup.set_option("--regions-file", "%(IN_INTERVALS)s")

    apply_options(mpileup, mpileup_options)

    # NOTE(review): 'call' reads its input from stdin ("-"); IN_BAMFILE is
    # presumably only listed to register the BAM as an input -- confirm.
    genotype = AtomicCmdBuilder(
        ("bcftools", "call", "-"),
        IN_STDIN=mpileup,
        IN_BAMFILE=infile,
        OUT_STDOUT=outfile,
        CHECK_VERSION=BCFTOOLS_VERSION,
    )

    # 'z' selects compressed VCF for the final output file
    genotype.set_option("--output-type", "z")

    apply_options(genotype, bcftools_options)

    CommandNode.__init__(
        self,
        description="<GenotypeRegions: '%s' -> '%s'>" % (infile, outfile),
        command=ParallelCmds([mpileup.finalize(), genotype.finalize()]),
        dependencies=dependencies,
    )
def __init__(self, infile, outfile, regions, options, dependencies=()):
    """Filter a VCF with 'vcf_filter' and bgzip-compress the result.

    Arguments:
      infile       -- VCF file read by the filter.
      outfile      -- File receiving the bgzip output.
      regions      -- Mapping whose "HomozygousContigs" entry lists contigs
                      passed to the filter via --homozygous-chromosome.
      options      -- Extra filter options, applied with apply_options.
      dependencies -- Passed on unchanged to CommandNode.
    """
    filter_cmd = factory.new("vcf_filter")
    filter_cmd.add_value("%(IN_VCF)s")

    homozygous_contigs = regions["HomozygousContigs"]
    for name in homozygous_contigs:
        filter_cmd.add_option("--homozygous-chromosome", name)

    # The filter writes to a pipe which is consumed by bgzip below
    filter_cmd.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)
    apply_options(filter_cmd, options)

    compress_cmd = AtomicCmdBuilder(
        ["bgzip"],
        IN_STDIN=filter_cmd,
        OUT_STDOUT=outfile,
    )

    pipeline = ParallelCmds([filter_cmd.finalize(), compress_cmd.finalize()])

    CommandNode.__init__(
        self,
        description="<VCFFilter: '%s' -> '%s'>" % (infile, outfile),
        command=pipeline,
        dependencies=dependencies,
    )
def __init__(self, input_file, output_file, algorithm="auto", options=None, dependencies=()):
    """Set up a MAFFT alignment of a FASTA file.

    Arguments:
      input_file   -- FASTA file to be aligned.
      output_file  -- File receiving the stdout of the command.
      algorithm    -- Name of a preset in _PRESETS; matched case-insensitively.
      options      -- Dict of extra options, applied with apply_options;
                      None means no extra options.
      dependencies -- Passed on unchanged to CommandNode.
    """
    # Create the default per call; a literal `{}` default is a single dict
    # object shared between every instantiation of the node.
    if options is None:
        options = {}

    # The preset supplies the executable and its fixed arguments; the input
    # file is appended as the final argument.
    preset = _PRESETS[algorithm.lower()]
    command = AtomicCmdBuilder(
        preset + ["%(IN_FASTA)s"],
        IN_FASTA=input_file,
        OUT_STDOUT=output_file,
        CHECK_VERSION=MAFFT_VERSION,
    )

    apply_options(command, options)

    self._output_file = output_file

    CommandNode.__init__(
        self,
        command=command.finalize(),
        description="<MAFFTNode (%s): '%s' -> '%s'>"
        % (algorithm, input_file, output_file),
        dependencies=dependencies,
    )
def merge_bam_files_command(input_files):
    """Return a finalized 'samtools merge' command for the given files.

    The command's stdout is set to a pipe, and every entry of `input_files`
    is added as a value after the fixed arguments.
    """
    settings = {
        "OUT_STDOUT": AtomicCmd.PIPE,
        "CHECK_VERSION": SAMTOOLS_VERSION,
    }

    builder = AtomicCmdBuilder(["samtools", "merge", "-u", "-"], **settings)
    builder.add_multiple_values(input_files)

    return builder.finalize()
def test_builder__finalize__calls_atomiccmd():
    # finalize() must construct an AtomicCmd from exactly the call and the
    # keywords collected by the builder; the mock verifies both and records
    # that it was instantiated.
    expected_args = (["echo", "-out", "%(OUT_FILE)s", "%(IN_FILE)s"],)
    expected_kwargs = {
        "IN_FILE": "/in/file",
        "OUT_FILE": "/out/file",
        "set_cwd": True,
    }
    invocations = []

    class _AtomicCmdMock:
        def __init__(self, *args, **kwargs):
            assert_equal(args, expected_args)
            assert_equal(kwargs, expected_kwargs)
            invocations.append(True)

    with Monkeypatch("paleomix.atomiccmd.builder.AtomicCmd", _AtomicCmdMock):
        cmd_builder = AtomicCmdBuilder("echo", set_cwd=True)
        cmd_builder.add_option("-out", "%(OUT_FILE)s")
        cmd_builder.add_value("%(IN_FILE)s")
        cmd_builder.set_kwargs(OUT_FILE="/out/file", IN_FILE="/in/file")

        cmd_builder.finalize()

    assert invocations
def __init__(
    self,
    reference,
    input_files,
    output_directory,
    title="mapDamage",
    options=None,
    dependencies=(),
):
    """Set up a mapDamage run (--no-stats) over one or more BAM files.

    The input files are merged by the command returned from
    merge_bam_files_command and piped to mapDamage on stdin.

    Arguments:
      reference        -- Reference sequence passed to mapDamage via -r.
      input_files      -- BAM files to be merged and analysed.
      output_directory -- Directory in which the OUT_* result files are placed.
      title            -- Title passed to mapDamage via -t.
      options          -- Dict of extra options, applied with apply_options;
                          None means no extra options.
      dependencies     -- Passed on unchanged to CommandNode.
    """
    # Create the default per call; a literal `{}` default is a single dict
    # object shared between every instantiation of the node.
    if options is None:
        options = {}

    merge = merge_bam_files_command(input_files)
    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--no-stats",
            # Prevent references with many contigs from using excessive
            # amounts of memory, at the cost of per-contig statistics:
            "--merge-reference-sequences",
            "-t",
            title,
            "-i",
            "-",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
        ],
        IN_STDIN=merge,
        IN_REFERENCE=reference,
        OUT_FREQ_3p=os.path.join(output_directory, "3pGtoA_freq.txt"),
        OUT_FREQ_5p=os.path.join(output_directory, "5pCtoT_freq.txt"),
        OUT_COMP_USER=os.path.join(output_directory, "dnacomp.txt"),
        OUT_PLOT_FRAG=os.path.join(
            output_directory, "Fragmisincorporation_plot.pdf"
        ),
        OUT_PLOT_LEN=os.path.join(output_directory, "Length_plot.pdf"),
        OUT_LENGTH=os.path.join(output_directory, "lgdistribution.txt"),
        OUT_MISINCORP=os.path.join(output_directory, "misincorporation.txt"),
        OUT_LOG=os.path.join(output_directory, "Runtime_log.txt"),
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
    )

    apply_options(command, options)

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, command.finalize()]),
        description="<mapDamage (plots): %s -> '%s'>"
        % (describe_files(merge.input_files), output_directory),
        dependencies=dependencies,
    )
def test_builder__finalize__calls_atomiccmd():
    # Replace AtomicCmd with a mock that checks the arguments it is built
    # with, and verify that finalize() actually instantiated it.
    observed = []

    class _AtomicCmdMock(object):
        def __init__(self, *args, **kwargs):
            wanted_call = ["echo", "-out", "%(OUT_FILE)s", "%(IN_FILE)s"]
            wanted_kwargs = {
                "IN_FILE": "/in/file",
                "OUT_FILE": "/out/file",
                "set_cwd": True,
            }

            assert_equal(args, (wanted_call,))
            assert_equal(kwargs, wanted_kwargs)
            observed.append(True)

    with Monkeypatch("paleomix.atomiccmd.builder.AtomicCmd", _AtomicCmdMock):
        echo = AtomicCmdBuilder("echo", set_cwd=True)
        echo.add_option("-out", "%(OUT_FILE)s")
        echo.add_value("%(IN_FILE)s")
        echo.set_kwargs(OUT_FILE="/out/file", IN_FILE="/in/file")
        echo.finalize()

    assert observed
def __init__(self, input_file, k_groups, output_root, samples=None, dependencies=()):
    """Set up an 'admixture' run for 2 or 3 groups on a .bed/.bim/.fam set.

    Arguments:
      input_file   -- The .bed file; the .bim and .fam files are located by
                      swapping the extension of this filename.
      k_groups     -- Number of groups; must be 2 or 3.
      output_root  -- Directory in which the .P, .Q and .log files are placed.
      samples      -- Optional mapping of rows; the run is supervised if any
                      row has a value other than '-' in its "Group(K)" column.
      dependencies -- Passed on unchanged to CommandNode.
    """
    self._samples = samples
    self._input_file = input_file
    self._k_groups = k_groups

    group_key = "Group(%i)" % (self._k_groups,)
    # values() is available on both Python 2 and 3 (itervalues() is not), and
    # bool() ensures that a proper flag is stored when `samples` is empty.
    self._supervised = bool(samples) and any(
        row[group_key] != "-" for row in samples.values()
    )

    assert k_groups in (2, 3), k_groups
    prefix = os.path.splitext(os.path.basename(input_file))[0]
    output_prefix = os.path.join(output_root, "%s.%i" % (prefix, k_groups))

    # The command is run with set_cwd=True and given a bare .bed filename;
    # the TEMP_OUT_* entries name the files expected next to it.
    cmd = AtomicCmdBuilder(
        "admixture",
        IN_FILE_BED=input_file,
        IN_FILE_BIM=fileutils.swap_ext(input_file, ".bim"),
        IN_FILE_FAM=fileutils.swap_ext(input_file, ".fam"),
        TEMP_OUT_FILE_BED=prefix + ".bed",
        TEMP_OUT_FILE_BIM=prefix + ".bim",
        TEMP_OUT_FILE_FAM=prefix + ".fam",
        TEMP_OUT_FILE_POP=prefix + ".pop",
        OUT_P=output_prefix + ".P",
        OUT_Q=output_prefix + ".Q",
        OUT_STDOUT=output_prefix + ".log",
        CHECK_VERSION=ADMIXTURE_VERSION,
        set_cwd=True,
    )

    # Random seed for the run
    cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))

    if self._supervised:
        cmd.set_option("--supervised")

    cmd.add_value("%(TEMP_OUT_FILE_BED)s")
    cmd.add_value(int(k_groups))

    CommandNode.__init__(
        self,
        description="<Admixture -> '%s.*''>" % (output_prefix,),
        command=cmd.finalize(),
        dependencies=dependencies,
    )
def __init__(self, reference, directory, options=None, dependencies=()):
    """Set up a mapDamage model-fitting run (--stats-only).

    Arguments:
      reference    -- Reference sequence passed to mapDamage via -r.
      directory    -- Directory in which the OUT_* result files are placed.
      options      -- Dict of extra options, applied with apply_options;
                      None means no extra options.
      dependencies -- Passed on unchanged to CommandNode.
    """
    # Create the default per call; a literal `{}` default is a single dict
    # object shared between every instantiation of the node.
    if options is None:
        options = {}

    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--stats-only",
            "-r",
            "%(IN_REFERENCE)s",
            "-d",
            "%(TEMP_DIR)s",
        ],
        IN_REFERENCE=reference,
        TEMP_OUT_FREQ_3p="3pGtoA_freq.txt",
        TEMP_OUT_FREQ_5p="5pCtoT_freq.txt",
        # Was the placeholder "******"; the sibling plotting constructor
        # names the OUT_COMP_USER file "dnacomp.txt".
        TEMP_OUT_COMP_USER="dnacomp.txt",
        TEMP_OUT_MISINCORP="misincorporation.txt",
        TEMP_OUT_LOG="Runtime_log.txt",
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        OUT_COMP_GENOME=os.path.join(directory, "dnacomp_genome.csv"),
        OUT_MCMC_PROBS=os.path.join(directory, "Stats_out_MCMC_correct_prob.csv"),
        OUT_MCMC_HIST=os.path.join(directory, "Stats_out_MCMC_hist.pdf"),
        OUT_MCMC_ITER=os.path.join(directory, "Stats_out_MCMC_iter.csv"),
        OUT_MCMC_ITERSUM=os.path.join(
            directory, "Stats_out_MCMC_iter_summ_stat.csv"
        ),
        OUT_MCMC_POSTPRED=os.path.join(directory, "Stats_out_MCMC_post_pred.pdf"),
        OUT_MCMC_TRACE=os.path.join(directory, "Stats_out_MCMC_trace.pdf"),
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
        # Requirements built with rtools.requirement, one per R package
        CHECK_R_INLINE=rtools.requirement("inline"),
        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
        CHECK_R_RCPP=rtools.requirement("Rcpp"),
        CHECK_R_GAM=rtools.requirement("gam"),
        CHECK_R_RCPPGSL=rtools.requirement("RcppGSL"),
    )

    apply_options(command, options)

    self._directory = directory

    CommandNode.__init__(
        self,
        command=command.finalize(),
        description="<mapDamage (model): %r>" % (directory,),
        dependencies=dependencies,
    )
def __init__(self, input_alignment, input_partition, output_file, dependencies=()):
    """
    Arguments:
    input_alignment  -- An alignment file in a format readable by RAxML.
    input_partition  -- A set of partitions in a format readable by RAxML.
    output_file      -- Filename for the output binary sequence."""
    alignment_name = os.path.basename(input_alignment)
    partition_name = os.path.basename(input_partition)

    builder = AtomicCmdBuilder("parse-examl", set_cwd=True)

    # The alignment and partitions are referred to by filename only
    builder.set_option("-s", "%(TEMP_OUT_ALN)s")
    builder.set_option("-q", "%(TEMP_OUT_PART)s")
    # Output file will be named output.binary, and placed in the CWD
    builder.set_option("-n", "output")
    # Substitution model
    builder.set_option("-m", "DNA", fixed=False)

    builder.set_kwargs(
        # Auto-delete: Symlinks
        TEMP_OUT_PART=partition_name,
        TEMP_OUT_ALN=alignment_name,
        # Input files, are not used directly
        IN_ALIGNMENT=input_alignment,
        IN_PARTITION=input_partition,
        # Final output file, is not created directly
        OUT_BINARY=output_file,
        CHECK_EXAML=PARSER_VERSION,
    )

    description = "<ExaMLParser: '%s' -> '%s'>" % (input_alignment, output_file)

    CommandNode.__init__(
        self,
        command=builder.finalize(),
        description=description,
        dependencies=dependencies,
    )

    # Absolute paths of the two input files, and the bare output filename
    self._symlinks = [
        os.path.abspath(path) for path in (input_alignment, input_partition)
    ]
    self._output_file = os.path.basename(output_file)
def __init__(
    self,
    reference,
    input_files,
    output_file,
    directory,
    options=None,
    dependencies=(),
):
    """Set up a mapDamage rescaling run (--rescale-only).

    The input files are merged by the command returned from
    merge_bam_files_command and piped to mapDamage on stdin.

    Arguments:
      reference    -- Reference sequence passed to mapDamage via -r.
      input_files  -- BAM files to be merged and rescaled.
      output_file  -- Rescaled BAM, passed to mapDamage via --rescale-out.
      directory    -- Stored on the node as self._directory; not used by
                      this constructor otherwise.
      options      -- Dict of extra options, applied with apply_options;
                      None means no extra options.
      dependencies -- Passed on unchanged to CommandNode.
    """
    # Create the default per call; a literal `{}` default is a single dict
    # object shared between every instantiation of the node.
    if options is None:
        options = {}

    stats_out_fname = "Stats_out_MCMC_correct_prob.csv"

    merge = merge_bam_files_command(input_files)
    command = AtomicCmdBuilder(
        [
            "mapDamage",
            "--rescale-only",
            "-i",
            "-",
            "-d",
            "%(TEMP_DIR)s",
            "-r",
            "%(IN_REFERENCE)s",
            "--rescale-out",
            "%(OUT_BAM)s",
        ],
        IN_STDIN=merge,
        IN_REFERENCE=reference,
        TEMP_OUT_LOG="Runtime_log.txt",
        TEMP_OUT_CSV=stats_out_fname,
        OUT_BAM=output_file,
        CHECK_VERSION=MAPDAMAGE_VERSION,
    )

    apply_options(command, options)

    self._directory = directory

    CommandNode.__init__(
        self,
        command=ParallelCmds([merge, command.finalize()]),
        description="<mapDamage (rescale): %s -> %r>"
        % (describe_files(merge.input_files), output_file),
        dependencies=dependencies,
    )
def __init__(self, input_alignment, input_partitions, output_tree, dependencies=()):
    """Set up a 'raxmlHPC -y' run, computing a parsimony starting tree.

    Arguments:
      input_alignment  -- Alignment file, registered as IN_ALIGNMENT.
      input_partitions -- Partitions file, registered as IN_PARTITION.
      output_tree      -- Destination of the resulting tree (OUT_TREE).
      dependencies     -- Passed on unchanged to CommandNode.
    """
    self._input_alignment = input_alignment
    self._input_partitions = input_partitions
    self._output_tree = output_tree

    raxml = AtomicCmdBuilder("raxmlHPC")

    # Compute a randomized parsimony starting tree
    raxml.set_option("-y")
    # Output files are saved with a .Pypeline postfix, and subsequently renamed
    raxml.set_option("-n", "Pypeline")
    # Model required, but not used
    raxml.set_option("-m", "GTRGAMMA")
    # Ensures that output is saved to the temporary directory
    raxml.set_option("-w", "%(TEMP_DIR)s")

    # Random seed; may be set by the user to allow replicability
    seed = int(random.random() * 2 ** 31 - 1)
    raxml.set_option("-p", seed, fixed=False)

    # Symlink to sequence and partitions, to prevent the creation of
    # *.reduced files outside temp folder
    raxml.set_option("-s", "%(TEMP_OUT_ALIGNMENT)s")
    raxml.set_option("-q", "%(TEMP_OUT_PARTITION)s")

    raxml.set_kwargs(
        IN_ALIGNMENT=input_alignment,
        IN_PARTITION=input_partitions,
        # TEMP_OUT_ is used to automatically remove these files
        TEMP_OUT_ALIGNMENT="RAxML_alignment",
        TEMP_OUT_PARTITION="RAxML_partitions",
        TEMP_OUT_INFO="RAxML_info.Pypeline",
        OUT_TREE=output_tree,
        CHECK_VERSION=RAXML_VERSION,
    )

    description = "<RAxMLParsimonyTree: '%s' -> '%s'>" % (
        input_alignment,
        output_tree,
    )

    CommandNode.__init__(
        self,
        command=raxml.finalize(),
        description=description,
        dependencies=dependencies,
    )
def __init__(self, input_file, k_groups, output_root, groups, dependencies=()):
    """Set up a supervised 'admixture' run on a .bed/.bim/.fam set.

    Arguments:
      input_file   -- The .bed file; the .bim and .fam files are located by
                      swapping the extension of this filename.
      k_groups     -- Number of groups, passed to admixture as an int.
      output_root  -- Directory in which the .P, .Q and .log files are placed.
      groups       -- Stored on the node as self._groups.
      dependencies -- Passed on unchanged to CommandNode.
    """
    self._groups = groups
    self._input_file = input_file

    basename = os.path.basename(input_file)
    prefix = os.path.splitext(basename)[0]
    output_prefix = os.path.join(output_root, "%s.%i" % (prefix, k_groups))

    bim_file = fileutils.swap_ext(input_file, ".bim")
    fam_file = fileutils.swap_ext(input_file, ".fam")

    # The command is run with set_cwd=True and given a bare .bed filename;
    # the TEMP_OUT_* entries name the files expected next to it.
    admixture = AtomicCmdBuilder(
        "admixture",
        IN_FILE_BED=input_file,
        IN_FILE_BIM=bim_file,
        IN_FILE_FAM=fam_file,
        TEMP_OUT_FILE_BED=prefix + ".bed",
        TEMP_OUT_FILE_BIM=prefix + ".bim",
        TEMP_OUT_FILE_FAM=prefix + ".fam",
        TEMP_OUT_FILE_POP=prefix + ".pop",
        OUT_P=output_prefix + ".P",
        OUT_Q=output_prefix + ".Q",
        OUT_STDOUT=output_prefix + ".log",
        CHECK_VERSION=ADMIXTURE_VERSION,
        set_cwd=True,
    )

    # Random seed for the run
    seed = random.randint(0, 2**16 - 1)
    admixture.set_option("-s", seed)
    admixture.set_option("--supervised")

    admixture.add_value("%(TEMP_OUT_FILE_BED)s")
    admixture.add_value(int(k_groups))

    description = "<Admixture -> '%s.*''>" % (output_prefix,)

    CommandNode.__init__(
        self,
        description=description,
        command=admixture.finalize(),
        dependencies=dependencies,
    )
def _do_test_builder__add_or_set_option__after_finalize(setter):
    # Shared check: `setter` is called as setter(builder, key, value) on a
    # builder that has already been finalized, and must raise.
    finalized = AtomicCmdBuilder("find")
    finalized.finalize()

    option = ("-size", "1")
    assert_raises(AtomicCmdBuilderError, setter, finalized, *option)
def test_builder__finalize__returns_singleton():
    # Repeated calls to finalize() must return the very same object.
    builder = AtomicCmdBuilder("echo")

    first = builder.finalize()
    second = builder.finalize()

    assert first is second
def __init__(
    self,
    input_alignment,
    output_template,
    input_partition=None,
    model="GTRGAMMAI",
    replicates="autoMRE",
    threads=1,
    dependencies=(),
):
    """
    Arguments:
    input_alignment  -- An alignment file in a format readable by RAxML.
    input_partition  -- A set of partitions in a format readable by RAxML.
    output_template  -- A template string used to construct final filenames.
                        Should consist of a full path, including a single
                        '%s', which is replaced with the variable part of
                        RAxML output files (e.g. 'info', 'bestTree', ...).
                        Example destination: '/disk/project/SN013420.RAxML.%s'
                        Example output: '/disk/project/SN013420.RAxML.bestTree'
    model            -- Substitution model passed to RAxML via -m.
    replicates       -- Value passed to RAxML via -N.
    threads          -- Number of threads; the PTHREADS executable is used
                        if this is greater than 1.
    """
    # Select the executable (and the matching version check)
    if threads > 1:
        raxml = AtomicCmdBuilder("raxmlHPC-PTHREADS")
        raxml.set_option("-T", threads)
        version = RAXML_PTHREADS_VERSION
    else:
        raxml = AtomicCmdBuilder("raxmlHPC")
        version = RAXML_VERSION

    # Perform rapid bootstrapping
    raxml.set_option("-f", "a")
    # Output files are saved with a .PALEOMIX postfix, and subsequently renamed
    raxml.set_option("-n", "PALEOMIX")
    # Ensures that output is saved to the temporary directory
    raxml.set_option("-w", "%(TEMP_DIR)s")

    # Symlink to sequence and partitions, to prevent the creation of
    # *.reduced files outside temp folder. In addition, it may be necessary
    # to remove the .reduced files if created
    raxml.set_option("-s", "%(TEMP_OUT_ALN)s")

    if input_partition is not None:
        partition_name = os.path.basename(input_partition)

        raxml.set_option("-q", "%(TEMP_OUT_PART)s")
        raxml.set_kwargs(
            IN_PARTITION=input_partition,
            TEMP_OUT_PART=partition_name,
            TEMP_OUT_PART_R=partition_name + ".reduced",
        )

    alignment_name = os.path.basename(input_alignment)
    raxml.set_kwargs(
        # Auto-delete: Symlinks and .reduced files that RAxML may generate
        TEMP_OUT_ALN=alignment_name,
        TEMP_OUT_ALN_R=alignment_name + ".reduced",
        # Input files, are not used directly
        IN_ALIGNMENT=input_alignment,
        # Final output files, are not created directly
        OUT_INFO=output_template % "info",
        OUT_BESTTREE=output_template % "bestTree",
        OUT_BOOTSTRAP=output_template % "bootstrap",
        OUT_BIPART=output_template % "bipartitions",
        OUT_BIPARTLABEL=output_template % "bipartitionsBranchLabels",
        CHECK_VERSION=version,
    )

    # Model of substitution, as given by the `model` argument
    raxml.set_option("-m", model, fixed=False)

    # Enable rapid bootstrapping and set random seed. May be set to a fixed
    # value to allow replicability.
    bootstrap_seed = int(random.random() * 2 ** 31 - 1)
    raxml.set_option("-x", bootstrap_seed, fixed=False)

    # Set random seed for parsimony inference. May be set to allow
    # replicability.
    parsimony_seed = int(random.random() * 2 ** 31 - 1)
    raxml.set_option("-p", parsimony_seed, fixed=False)

    # Terminate bootstrapping upon convergence, not after N repetitions
    raxml.set_option("-N", replicates, fixed=False)

    # Note that the second entry is None when no partition file was given
    self._symlinks = [input_alignment, input_partition]
    self._template = os.path.basename(output_template)

    description = "<RAxMLRapidBS: '%s' -> '%s'>" % (
        input_alignment,
        output_template % ("*",),
    )

    CommandNode.__init__(
        self,
        command=raxml.finalize(),
        description=description,
        threads=threads,
        dependencies=dependencies,
    )