def expand_globs(config, is_test=False):
    """detect and expand glob expressions in the input section.

    A glob expression is any filename that contains a '*'. Multiple
    glob expressions can be combined on the same line by a ','.

    A "find" expression is detected by a value starting with 'find'.
    These expressions will be evaluated in a shell and the results
    inserted into the dictionary.

    If a filename starts with "file=", the contents of the file
    following the "=" are read and inserted. Multiple files can be
    separated by a ','.

    If a glob or find expression evaluates to nothing, an exception
    is raised unless ``is_test`` is set. In that case, two files
    called "test1" and "test2" will be returned.
    """
    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if value.startswith("find"):
                try:
                    data = E.run(value, return_stdout=True)
                except Exception as e:
                    data = e.output
                d[key] = [x for x in data.split("\n") if x]
            elif "*" in value:
                if "," in value:
                    v = [glob.glob(x.strip()) for x in value.split(",")]
                    v = [item for sublist in v for item in sublist]
                else:
                    v = glob.glob(value)
                d[key] = v
            elif value.startswith("file="):
                filenames = [x.strip() for x in value.split("=")[1].split(",")]
                paths = []
                for fn in filenames:
                    with IOTools.open_file(fn) as inf:
                        paths.extend([x.strip() for x in inf if x.strip()])
                d[key] = paths
            if len(d[key]) == 0:
                if not is_test:
                    raise ValueError(
                        "expression '{}' expanded to nothing".format(value))
                else:
                    # insert some random files for testing purposes:
                    if "*" in value:
                        # replace glob expressions
                        value = re.sub(",.*", "", value)
                        d[key] = [re.sub("[*]", "test1", value),
                                  re.sub("[*]", "test2", value)]
                    else:
                        if "bam" in value:
                            d[key] = ["test1.bam", "test2.bam"]
                        elif "vcf" in value:
                            d[key] = ["test1.vcf.gz", "test2.vcf.gz"]
                        else:
                            d[key] = ["test1.txt", "test2.txt"]
    return config
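# Hedged usage sketch (not part of the original module): how a nested input
# section with a glob pattern expands when no files match and ``is_test`` is
# set. The "input"/"bam" keys and the path are hypothetical.
def _example_expand_globs():
    config = {"input": {"bam": "./does-not-exist/*.bam"}}
    expanded = expand_globs(config, is_test=True)
    # assuming the glob matches nothing, placeholder files are inserted:
    # expanded["input"]["bam"] == ["./does-not-exist/test1.bam",
    #                              "./does-not-exist/test2.bam"]
    return expanded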
def inner(self, outfile, *args, **kwargs):
    try:
        # call the wrapped function with the original arguments
        f(self, outfile, *args, **kwargs)
    except Exception as e:
        E.warn("received exception {} - touching {}".format(
            str(e), outfile))
        IOTools.touch_file(outfile)
def run(self, infile, outfile, params):
    if "reference_fasta" in params._fields:
        reference_fasta = "REFERENCE_SEQUENCE={}".format(
            params.reference_fasta)
    else:
        reference_fasta = ""

    # The command can fail when no output is produced, yet still write
    # partial output, so errors are ignored.
    # 12G is required for the java overhead.
    retval = P.run("java -Xmx8000m -jar {params.path} "
                   "CollectMultipleMetrics "
                   "{reference_fasta} "
                   "INPUT={infile} "
                   "TMP_DIR=%(tmpdir)s "
                   "{params.options} "
                   "OUTPUT={outfile} "
                   ">& {outfile} ".format(**locals()),
                   job_memory="12G",
                   ignore_errors=True)

    def get_section(section, data):
        pattern = "## {}".format(section)
        keep = False
        result = []
        for line in data:
            if line.startswith("##"):
                if line.startswith(pattern):
                    keep = True
                else:
                    keep = False
            if keep:
                result.append(line)
        return result

    for tablename in self.tablenames:
        filename = re.sub("histogram", "metrics", tablename)
        raw = filename[len("picard_"):]
        src = outfile + "." + raw
        dest = outfile + "." + tablename + ".tsv"

        if not os.path.exists(src):
            E.warn("no file {}, ignored".format(src))
            continue

        with IOTools.open_file(src) as inf:
            data = inf.readlines()

        if tablename.endswith("metrics"):
            data = get_section("METRICS", data)
        elif tablename.endswith("histogram"):
            data = get_section("HISTOGRAM", data)

        with IOTools.open_file(dest, "w") as outf:
            outf.write("".join(data))

    return retval
def run(self, infile, outfile, params):
    if params.reference_fasta is None:
        raise ValueError("please provide a reference database")

    statement = (
        "{params.path_nucmer} -p {outfile} "
        "{params.reference_fasta} {infile} "
        ">& {outfile}.nucmer; "
        "{params.path_dnadiff} -p {outfile} -d {outfile}.delta "
        ">& {outfile}.dnadiff; "
        "{params.path_mummerplot} --large --fat --png {outfile}.1delta "
        ">& {outfile}.mummerplot".format(**locals()))

    retval = P.run(statement)
    IOTools.touch_file(outfile)
    return retval
def ignore_task(self, infiles, outfiles, params):
    """return True if task should be ignored.

    This method will also create the output file(s).
    """
    if self._ignore:
        m = str(outfiles)
        for ignore in IOTools.val2list(self._ignore):
            if ignore in m:
                E.warn("task {} will be ignored".format(self.__name__))
                for f in IOTools.val2list(outfiles):
                    E.info("creating empty file {}".format(f))
                    IOTools.touch_file(f)
                return True
    return False
def save_benchmark(self, outfile, benchmark):
    if not isinstance(benchmark, list):
        benchmark = [benchmark]

    # flatten if nested list and remove None
    benchmark = [x for x in IOTools.flatten(benchmark, ltypes=(list,))
                 if x is not None]

    filename = self.build_meta_filename(outfile, "benchmark.bench")
    if not benchmark:
        E.warn("could not save benchmark info to {}".format(filename))
        return

    try:
        header = benchmark[0]._fields
    except AttributeError as ex:
        E.warn("could not save benchmark timings for {}:"
               " {} from {}".format(outfile, str(ex), str(benchmark[0])))
        return

    with open(filename, "w") as outf:
        outf.write("\t".join(header) + "\n")
        for b in benchmark:
            outf.write("\t".join(map(str, b)) + "\n")
def run(self, infile, outfile, params):
    if params.reference_fasta_map is None:
        raise ValueError("bam2reference requires a reference sequence map")

    reference_fasta_map = build_reference_fasta_map(
        params.reference_fasta_map)

    fasta = resolve_argument(
        list(reference_fasta_map.values()), ",").split(",")
    retval, diff = get_reference_for_bam(infile, fasta)
    if retval is None:
        if diff is None:
            retval = "corrupted"
        else:
            retval = "unknown"
            E.debug("differences: {}".format(str(diff)))
        path = ""
    else:
        map_path2name = dict([(x[1], x[0])
                              for x in list(reference_fasta_map.items())])
        path = map_path2name.get(retval, os.path.basename(retval))

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("filename\treference\tpath\n")
        outf.write("\t".join((infile, retval, path)) + "\n")

    return None
def run(self, outfile, params):
    bam = resolve_argument(params.bam)
    reference_fasta = get_reference(params)

    stmnts = []
    prefix = IOTools.snip(outfile, ".vcf.gz")
    vcf_output = prefix + ".raw.vcf.gz"
    if not os.path.exists(vcf_output):
        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type HaplotypeCaller "
                      "--input_file {bam} "
                      "--reference_sequence {reference_fasta} "
                      "--logging_level INFO "
                      "--log_to_file {outfile}.HaplotypeCaller.log "
                      "{params.haplotypecaller} "
                      "--out {vcf_output} "
                      ">& {prefix}.HaplotypeCaller.err".format(**locals()))
    else:
        E.warn("output file {vcf_output} already exists - "
               "it will not be recomputed".format(**locals()))

    stmnts.extend(
        self.build_calibration_workflow(outfile, prefix, vcf_output, params))

    return self.run_statements(stmnts, job_memory="5G")
def run(self, infiles, outfile, params):

    def _link(infile, outfile):
        if os.path.exists(os.path.abspath(outfile)):
            return
        dirname = os.path.dirname(outfile)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        os.symlink(infile, os.path.abspath(outfile))

    rx = re.compile(params.regex)

    outfiles = []
    for infile in infiles:
        outpath = os.path.join(
            os.path.dirname(outfile),
            rx.search(infile).expand(params.pattern_out))
        for suffix in self.suffixes:
            for fn in glob.glob(infile + suffix):
                _link(fn, outpath + suffix)
        _link(os.path.abspath(infile), outpath)
        outfiles.append(outpath)

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("\n".join(outfiles) + "\n")
def expand_generators(config):
    """expand generator expressions in option lists.

    A generator expression is valid python syntax and has the
    following form::

        options: generate=["--chrom={}".format(x) for x in [1,2,3,4,5]]
    """
    to_delete = []
    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if value.startswith("generate="):
                expression = re.sub(r"^generate=\s*", "", value)
                if expression.startswith("'") and expression.endswith("'"):
                    expression = expression[1:-1]
                try:
                    argument_list = eval(expression)
                except SyntaxError as ex:
                    raise ValueError(
                        "error occurred while evaluating generator "
                        "expression {}: {}".format(expression, ex))
                if isinstance(d, list):
                    d.extend(argument_list)
                    to_delete.append((d, key))
                else:
                    d[key] = argument_list

    for d, key in to_delete[::-1]:
        del d[key]

    return config
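# Hedged usage sketch (not part of the original module): expanding a
# ``generate=`` expression inside an option list. The "tool"/"options" keys
# are hypothetical, and the sketch assumes IOTools.nested_iter yields
# (container, index, value) tuples for list elements, as the code above
# expects.
def _example_expand_generators():
    config = {"tool": {"options": [
        "generate=['--chrom={}'.format(x) for x in [1, 2]]"]}}
    expanded = expand_generators(config)
    # the generated arguments replace the generator expression in the list:
    # expanded["tool"]["options"] == ["--chrom=1", "--chrom=2"]
    return expanded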
def run(self, outfile, params):
    bam = resolve_argument(params.bam)
    reference_fasta = get_reference(params)

    stmnts = []
    prefix = IOTools.snip(outfile, ".bam")

    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type RealignerTargetCreator "
        "--input_file {bam} "
        "--reference_sequence {reference_fasta} "
        "--logging_level INFO "
        "--log_to_file {outfile}.RealignerTargetCreator.log "
        "{params.realignertargetcreator} "
        "--out {outfile}.realign.intervals "
        ">& {outfile}.RealignerTargetCreator.err".format(**locals()))

    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type IndelRealigner "
        "--input_file {bam} "
        "--reference_sequence {reference_fasta} "
        "--targetIntervals {outfile}.realign.intervals "
        "--logging_level INFO "
        "--log_to_file {outfile}.IndelRealigner.log "
        "{params.indelrealigner} "
        "--out @[email protected] "
        ">& {outfile}.IndelRealigner.err".format(**locals()))

    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type BaseRecalibrator "
        "--input_file @[email protected] "
        "--reference_sequence {reference_fasta} "
        "--logging_level INFO "
        "{params.baserecalibrator} "
        "--log_to_file {outfile}.BaseRecalibrator.log "
        "--out {outfile}.recal_data.table "
        ">& {outfile}.BaseRecalibrator.err".format(**locals()))

    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type PrintReads "
        "--input_file @[email protected] "
        "--reference_sequence {reference_fasta} "
        "--BQSR {outfile}.recal_data.table "
        "--logging_level INFO "
        "--log_to_file {outfile}.PrintReads.log "
        "--out {outfile} "
        ">& {outfile}.PrintReads.err".format(**locals()))

    stmnts.append("mv {prefix}.bai {outfile}.bam.bai".format(**locals()))

    return self.run_statements(stmnts, job_memory="3G")
def run(self, outfile, params):
    retvals = []
    prefix = IOTools.snip(outfile, ".bed.gz")
    vcffile = prefix + ".vcf.gz"
    if not os.path.exists(vcffile):
        retvals.extend(run_tool_delly.run(self, vcffile, params))

    statements = []
    statements.append(
        "{self.path_bcftools} query "
        "{params.bcftools_options} "
        "-f \"%%CHROM\\t%%POS\\t%%END\\t%%SVTYPE\\n\" "
        "{vcffile} "
        "| awk -v OFS='\\t' '$3 != \".\" {{ switch ($4) {{"
        "case \"DEL\": $5=0; break; "
        "case \"DUP\": $5=3; break; "
        "case \"INS\": next; break; "
        "}}; print }}' "
        "| bgzip "
        "> {outfile}".format(**locals()))
    statements.append("tabix -f -p bed {outfile}".format(**locals()))

    statement = "; ".join(statements)
    retvals.append(P.run(statement))

    return retvals
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    tools = glob.glob(
        os.path.join(os.path.dirname(__file__), "..",
                     "src", "daisy", "tools", "*.py"))

    counter = E.Counter()
    for tool in tools:
        counter.found += 1
        tool_module = re.sub(".py", "", os.path.basename(tool))
        tool_name = re.sub("_", "-", tool_module)
        if tool_name in ("__init__", "cli"):
            counter.ignored += 1
            continue

        dest = os.path.join("tools", "{}.rst".format(tool_name))

        if os.path.exists(dest) and not options.output_force:
            counter.skipped += 1
            continue

        with IOTools.open_file(dest, "w") as outf:
            outf.write(TEMPLATE_TOOL.format(**locals()))
        counter.new += 1

    E.info(counter)
    E.stop()
def build_readgroup_string(outfile, params):

    if params.readgroup_id_regex is None:
        readgroup_id = IOTools.snip(os.path.basename(outfile), ".bam")
    else:
        try:
            readgroup_id = "-".join(re.search(
                params.readgroup_id_regex, outfile).groups())
        except AttributeError as ex:
            raise AttributeError(
                "regular expression {} does not match {}".format(
                    params.readgroup_id_regex, outfile))

    if params.readgroup_sample_regex is None:
        readgroup_sample = readgroup_id
    else:
        try:
            readgroup_sample = "-".join(re.search(
                params.readgroup_sample_regex, outfile).groups())
        except AttributeError as ex:
            raise AttributeError(
                "regular expression {} does not match {}".format(
                    params.readgroup_sample_regex, outfile))

    readgroup_string = "@RG\tID:{}\tSM:{}".format(
        readgroup_id, readgroup_sample)

    if params.readgroup_header:
        readgroup_string += "\t{}".format(params.readgroup_header)

    return readgroup_string, readgroup_id, readgroup_sample
def run(self, infile, outfile, params):
    if params.reference_bed is None:
        raise ValueError("{} requires reference_bed to be set".format(
            self.name))

    # requires a consistent sort order, so sort both files.
    # It also requires the chromosome content to be identical,
    # so restrict output to common sets.
    tmpf = P.get_temp_filename(clear=True)
    tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
    stmnt = standardise_bed_files(tmpf_test, tmpf_truth,
                                  infile, params.reference_bed)

    statements = [stmnt]
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa "
                      "| bgzip "
                      "> {outfile}.shared.bed.gz")
    statements.append("{params.path} intersect "
                      "-a {tmpf_test} "
                      "-b {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_test.bed.gz")
    statements.append("{params.path} intersect "
                      "-b {tmpf_test} "
                      "-a {tmpf_truth} "
                      "-wa -v "
                      "| bgzip "
                      "> {outfile}.unique_truth.bed.gz")
    statements.append("rm -f {tmpf_test} {tmpf_truth}")

    for section in self.sections:
        statements.append(
            "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

    statement = "; ".join(statements)
    retval = P.run(statement.format(**locals()))

    # these are small files, so count them here.
    # TODO: implement a tabix.count() method.
    counts = dict()
    for section in self.sections:
        # with pysam.Tabixfile(outfile + "." + section + ".bed.gz") as inf:
        inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
        counts[section] = len(list(inf.fetch()))
        inf.close()

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("section\tcounts\n")
        outf.write("\n".join(
            ["\t".join(map(str, x)) for x in list(counts.items())]) + "\n")

    return retval
def run(self, infiles, outfile, params):

    if not outfile.endswith("-pass.fastq.gz"):
        raise ValueError(
            "outfile must end in -pass.fastq.gz, got {}".format(outfile))

    if params.min_size_bytes:
        before = len(infiles)
        infiles = [x for x in infiles
                   if os.path.getsize(x) >= params.min_size_bytes]
        E.debug(
            "removing small files: after={}, before={}, removed={}".format(
                len(infiles), before, before - len(infiles)))

    if params.newer_than:
        before = len(infiles)
        cutoff = os.path.getmtime(params.newer_than)
        infiles = [x for x in infiles if os.path.getmtime(x) > cutoff]
        E.debug(
            "removing old files: after={}, before={}, removed={}".format(
                len(infiles), before, before - len(infiles)))

    if len(infiles) == 0:
        E.warn("no files left after filtering, creating empty file")
        IOTools.touch_file(outfile)
        return

    infiles = " ".join(infiles)

    outfile_fail = IOTools.snip(outfile, "-pass.fastq.gz") + "-fail.fastq.gz"

    statement = ("zcat {infiles} "
                 "| daisy fastq2fastq "
                 "--method=filter-ONT "
                 "--min-average-quality={params.min_average_quality} "
                 "--log={outfile}.log "
                 "--min-length={params.min_length} "
                 "--output-removed-fastq={outfile_fail} "
                 "- "
                 "| gzip "
                 "> {outfile}".format(**locals()))
    return P.run(statement)
def run(self, infile, outfile, params):

    outfile_pass = IOTools.snip(outfile, ".tsv") + "-pass.fastq.gz"
    outfile_fail = IOTools.snip(outfile, ".tsv") + "-fail.fastq.gz"

    statement = ("zcat {infile} "
                 "| daisy fastq2fastq "
                 "--method=filter-ONT "
                 "--min-average-quality={params.min_average_quality} "
                 "--log={outfile}.log "
                 "--min-length={params.min_length} "
                 "--output-removed-fastq={outfile_fail} "
                 "--output-stats-tsv={outfile} "
                 "- "
                 "| gzip "
                 "> {outfile_pass} "
                 "".format(**locals()))
    return P.run(statement)
def run(self, infile, outfile, params):
    if params.reference_database is None:
        raise ValueError("please provide a reference database")

    statement = (
        "{params.path_lastal} {params.lastal_options} "
        "{params.reference_database} {infile} "
        "| {params.path_lastsplit} {params.lastsplit_options} "
        "| {params.path_mafsort} "
        "| gzip "
        "> {outfile}.maf.gz; "
        "{params.path_lastdotplot} "
        "<(zcat {outfile}.maf.gz "
        "| daisy maf2maf --log={outfile}.filter.log "
        "--min-length={params.min_contig_length} ) "
        "{outfile}.png ".format(**locals()))

    retval = P.run(statement, job_memory="15G")
    IOTools.touch_file(outfile)
    return retval
def get_default_params():
    """return default parameters for tools and metrics.

    This could be refactored to read defaults from a user-specified
    file. The current implementation reads the defaults.yml file
    located within the repository.
    """
    with IOTools.open_file(
            os.path.join(os.path.dirname(__file__), "defaults.yml")) as inf:
        result = yaml.load(inf, Loader=RoundTripLoader)
    return result
def resolve_argument(argument, sep=","): """if argument is a container type (dict, list, tuple) resolve its contents to comma-separated list. """ if isinstance(argument, dict): if len(argument) != 1: raise ValueError( "expected a single entry dictionary, got '{}'".format( argument)) return sep.join(x[2] for x in IOTools.nested_iter(argument)) elif isinstance(argument, list) or isinstance(argument, tuple): return sep.join(argument) # special treatment for output from run_collate_link_output elif "filelist" in argument: f = [ x.strip() for x in IOTools.open_file(argument).readlines() if not x.startswith("#") ] return sep.join([x for x in f if x]) return argument
def run(self, outfile, params):

    prefix = IOTools.snip(outfile, ".vcf.gz")
    bams = resolve_argument(params.bam, ",")
    reference_fasta = get_reference(params)

    statements, gvcfs = [], []
    # TODO: sort out multi-threading
    for idx, bam in enumerate(bams.split(",")):
        output = prefix + "." + str(idx) + ".g.vcf"
        gvcfs.append(output)
        if os.path.exists(output):
            E.info("{} already exists - skipped".format(output))
            continue

        statements.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type HaplotypeCaller "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--emitRefConfidence GVCF "
            "--logging_level INFO "
            "--log_to_file {prefix}.HaplotypeCaller.{idx}.log "
            "{params.haplotypecaller} "
            "--out {output} "
            ">& {prefix}.HaplotypeCaller.{idx}.err".format(**locals()))

    if statements:
        self.run_statements(statements, job_memory="4G")

    stmnts = []
    gvcfs = " ".join(["--variant {}".format(x) for x in gvcfs])
    vcf_output = prefix + ".raw.vcf.gz"
    stmnts.append("java "
                  "-Djava.io.tmpdir=%(tmpdir)s "
                  "-jar {self.path} "
                  "--analysis_type GenotypeGVCFs "
                  "--reference_sequence {reference_fasta} "
                  "{gvcfs} "
                  "--logging_level INFO "
                  "--log_to_file {prefix}.GenotypeGVCFs.log "
                  "{params.genotypegvcfs} "
                  "--out {vcf_output} "
                  ">& {prefix}.GenotypeGVCFs".format(**locals()))

    stmnts.extend(
        self.build_calibration_workflow(outfile, prefix, vcf_output, params))

    return self.run_statements(stmnts, job_memory="4G")
def line_grouper(filename):
    rx = re.compile(r"\d{4}-\d{2}-\d{2} ")
    with IOTools.open_file(filename) as infile:
        last_line = None
        for line in infile:
            line = line.strip()
            if not rx.match(line):
                last_line = " ".join((last_line, line))
            else:
                if last_line:
                    yield last_line
                last_line = line
        yield last_line
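# Hedged usage sketch (not part of the original module): line_grouper joins
# continuation lines onto the preceding time-stamped log line. The sample
# log content below is hypothetical.
def _example_line_grouper():
    import tempfile
    with tempfile.NamedTemporaryFile("w", suffix=".log", delete=False) as outf:
        outf.write("2019-01-01 12:00:00 starting task\n"
                   "with extra detail on a second line\n"
                   "2019-01-01 12:00:05 task finished\n")
        fn = outf.name
    # yields two records, the continuation line merged into the first:
    return list(line_grouper(fn))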
def run(self, outfile, params):

    prefix = IOTools.snip(outfile, ".vcf.gz")
    bam = resolve_argument(params.bam, sep=",")
    reference_fasta = get_reference(params)

    bam = " ".join(["--input_file {}".format(x) for x in bam.split(",")])

    stmnts = []
    if not os.path.exists(prefix + ".annotated.vcf.gz"):
        tmpfile, pre_statement, post_statement = self.pre_process(
            params.vcf, outfile, params)

        stmnts.append(pre_statement)
        stmnts.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type VariantAnnotator "
            "--variant {tmpfile} "
            "{bam} "
            "--reference_sequence {reference_fasta} "
            "--logging_level INFO "
            "--log_to_file {prefix}.VariantAnnotator.log "
            "--annotation FisherStrand "
            "--annotation StrandOddsRatio "
            "--annotation ReadPosRankSumTest "
            "--annotation RMSMappingQuality "
            "--annotation MappingQualityRankSumTest "
            "{params.options} "
            "--out {prefix}.annotated.vcf.gz "
            ">& {prefix}.VariantAnnotator.err".format(**locals()))

        stmnts.extend(
            self.build_calibration_workflow(
                outfile, prefix, prefix + ".annotated.vcf.gz", params))

        stmnts.append(post_statement)
    else:
        E.warn("using pre-existing file {} with annotated variants".format(
            prefix + ".annotated.vcf.gz"))

        stmnts.extend(
            self.build_calibration_workflow(
                outfile, prefix, prefix + ".annotated.vcf.gz", params))

    return self.run_statements(stmnts, job_memory="3G")
def run(self, infile, outfile, params):

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("contig\tcount\tsum\tmin\tmax\tmean\t"
                   "median\tstddev\tcollapse\n")

    retval = P.run("zcat {infile} "
                   "| awk '{{printf(\"%%s\\t%%i\\n\", $1, $3-$2); "
                   " printf(\"total\\t%%i\\n\", $3-$2)}}' "
                   "| sort -k1,1 "
                   "| {params.path} groupby "
                   "-g 1 "
                   "-c 2 "
                   "-o count,sum,min,max,mean,median,stddev,collapse "
                   "{params.options} "
                   "2> {outfile}.log "
                   ">> {outfile}; ".format(**locals()))

    return retval
def run(self, outfile, params):
    bam = resolve_argument(params.bam, sep=",")
    # "-T {outfile}.tmpdir -k "
    outfile = IOTools.snip(outfile, ".gz")

    # note that lumpy removes the temporary directory after running,
    # thus make sure it is unique and exists
    return P.run("{params.path} "
                 "-B {bam} "
                 "-o {outfile} "
                 "-T %(tmpdir)s_{self.__name__} "
                 "-v "
                 "{params.options} "
                 ">& {outfile}.log; "
                 "vcf-sort {outfile} "
                 "| bgzip > {outfile}.gz; "
                 "tabix -p vcf {outfile}.gz".format(**locals()))
def run(self, outfile, params):
    bam = resolve_argument(params.bam)

    # rename index from x.bai to x.bam.bai
    outprefix = IOTools.snip(outfile, ".bam", ".cram")

    statement = ("java -Xmx8000m -jar {params.path} "
                 "MarkDuplicates "
                 "INPUT={bam} "
                 "TMP_DIR=%(tmpdir)s "
                 "CREATE_INDEX=TRUE "
                 "REFERENCE_SEQUENCE={params.reference_fasta} "
                 "METRICS_FILE={outfile}.metrics "
                 "{params.options} "
                 "OUTPUT={outfile} "
                 ">& {outfile}.log; "
                 "mv {outprefix}.bai {outfile}.bai".format(**locals()))

    # 12G is required for java overhead
    return P.run(statement, job_memory="12G")
def __call__(self, infiles, outfile, only_info=False):

    # NOTE: "extras" are not implemented in ruffus 2.6.3, so the flag
    # is passed as a pipeline parameter instead:
    only_info = "only_info" in P.PARAMS

    if self.mountpoint:
        # revert mount redirection for arvados to allow redirection
        # on individual cluster nodes
        for d, key, value in IOTools.nested_iter(infiles):
            d[key] = re.sub(self.mountpoint, "arv=", value)

    self.instantiate_input(infiles)
    self.save_meta(outfile, output_file=outfile)

    if only_info:
        E.warn("only_info - meta information has been updated")
        return

    params = self.build_params(output_file=outfile)
    benchmark = self.run(outfile, as_namedtuple(params))
    self.save_benchmark(outfile, benchmark)
def collect_file_meta_information(file_dict, nostats=False):
    """collect meta information on files.

    Args:
        file_dict(dict): nested dictionary of filenames

    Returns:
        info(list)
    """
    results = []
    for d, key, filenames in IOTools.nested_iter(file_dict):
        if filenames is None:
            continue
        if isinstance(filenames, str):
            filenames = filenames.split(",")
        filenames = [x.strip() for x in filenames]
        for filename in filenames:
            abspath = os.path.realpath(filename)
            if nostats:
                st_size, st_mtime, st_ctime = 0, 0, 0
            else:
                if not os.path.exists(abspath):
                    raise OSError(
                        "file {} does not exist".format(filename))
                s = os.stat(filename)
                st_size, st_mtime, st_ctime = \
                    s.st_size, s.st_mtime, s.st_ctime

            results.append(
                collections.OrderedDict(
                    list(zip(("path", "abspath", "size",
                              "modification_time", "creation_time"),
                             (filename, abspath, st_size,
                              st_mtime, st_ctime)))))

    return results
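# Hedged usage sketch (not part of the original module): collecting meta
# information for a nested file dictionary. The "bam"/"reference" keys and
# file names are hypothetical, and the sketch assumes IOTools.nested_iter
# yields the leaf values of the dictionary.
def _example_collect_file_meta_information():
    file_dict = {"bam": "a.bam,b.bam", "reference": None}
    # nostats=True skips the existence check and the stat() calls, so this
    # works even though the files do not exist; the result is one
    # OrderedDict per file with path, abspath and zeroed size/times.
    return collect_file_meta_information(file_dict, nostats=True)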
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--restrict-regex", dest="restrict_regex", action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory", dest="data_directory",
        help="directory with sample data sets. This will override the "
        "default datadir in the configuration file and the environment "
        "variable DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory", dest="library_directories", action="append",
        help="directory with TaskLibrary functions. Will be added to the "
        "built-in library and the one specified in the DAISY_TASKLIBRARY "
        "environment variable [%default]")

    parser.add_option(
        "--always-mount", dest="always_mount", action="store_true",
        help="force mounting of arvados keep [%default]")

    parser.add_option(
        "--keep-failed-temp", dest="keep_failed_temp", action="store_true",
        help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)),
                     "TaskLibrary", "test_task_library.yml")]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get(
                "DAISY_TEST_DATADIR",
                test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.load(
                re.sub("DATADIR", data_directory, raw_txt))

            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
            ("tool", TestTool, map_tool_to_runner),
            ("metric", TestMetric, map_metric_to_runner)]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()

    return failed
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n", "--dry-run", dest="dry_run", action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option(
        "-l", "--link", dest="link", action="store_true",
        help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner, config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions,
                                                input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)

        result_dir = os.path.basename(os.path.join(taskf.__name__ + ".dir"))
        new_data.append((result_dir, taskf))

        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        max_count = max(counts.values())
        max_count_items = [x for x, y in list(counts.items())
                           if y == max_count]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write(
            "\t".join(map(str, (old_dir, new_dir, max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()