def test_validate_report_spec(self):
    """Every report-specs*.json fixture must validate and load as a ReportSpec."""
    for path in walker(DATA_DIR_REPORT_SPECS, json_filter):
        # only the report-specs fixtures are in scope; skip everything else
        if not os.path.basename(path).startswith("report-specs"):
            continue
        check = _to_assertion(path, validate_report_spec)
        check(self)
        self.assertIsInstance(load_report_spec_from_json(path), ReportSpec)
def test_validate_pipeline_datastore_view_rules(self):
    """Every datastore-view-rules JSON fixture must validate and load as a model.

    Each file is validated against the schema, then round-tripped through
    the loader to confirm it deserializes to PipelineDataStoreViewRules.
    """
    for path in walker(DATA_DIR_DSVIEW, json_filter):
        f = _to_assertion(path, validate_datastore_view_rules)
        f(self)
        # Use unittest's assertIsInstance rather than a bare `assert`:
        # the bare assert is stripped under `python -O` and gives no
        # diagnostic about the actual type on failure.  This also matches
        # the style of the sibling validation tests in this file.
        self.assertIsInstance(
            load_pipeline_datastore_view_rules_from_json(path),
            PipelineDataStoreViewRules)
def test_validate_pipeline_datastore_view_rules(self):
    """Each datastore-view-rules fixture validates and loads as a model."""
    for rules_json in walker(DATA_DIR_DSVIEW, json_filter):
        validate = _to_assertion(rules_json, validate_datastore_view_rules)
        validate(self)
        loaded = load_pipeline_datastore_view_rules_from_json(rules_json)
        self.assertIsInstance(loaded, PipelineDataStoreViewRules)
def dataset_walker(root_dir):
    """Walk *root_dir* and yield only XML dataset files."""
    return walker(root_dir, is_xml_dataset)
def _run_bam_to_fastx(program_name, fastx_reader, fastx_writer,
                      input_file_name, output_file_name, tmp_dir=None,
                      seqid_prefix=None, subreads_in=None):
    """
    Converts a dataset to a set of fastx file, possibly archived.
    Can take a subreadset or consensusreadset as input.
    Will convert to either fasta or fastq.
    If the dataset is barcoded, it will split the fastx files per-barcode.
    If the output file is .zip, the fastx file(s) will be archived accordingly.

    :param program_name: external converter executable, e.g. "bam2fasta" or
        "bam2fastq" (the "bam2" prefix is used to derive the output extension)
    :param fastx_reader: unused in this function body — presumably consumed
        by a caller or an earlier revision; TODO confirm before removing
    :param fastx_writer: unused in this function body — see note above
    :param input_file_name: path to the input dataset XML
    :param output_file_name: final output path; .zip/.tar.gz/.tgz triggers
        per-barcode splitting and archiving
    :param tmp_dir: optional parent directory for the temporary workspace
    :param seqid_prefix: optional prefix passed through to --seqid-prefix
    :param subreads_in: optional SubreadSet path used to map barcode labels
        to biosample names
    :return: 0 on success (or the result of archive_files in archive mode)
    """
    assert isinstance(program_name, str)
    barcode_mode = False
    barcode_sets = set()
    # archive outputs imply per-barcode splitting below
    output_is_archive = (output_file_name.endswith(".zip") or
                         output_file_name.endswith(".tar.gz") or
                         output_file_name.endswith(".tgz"))
    if output_is_archive:
        with openDataSet(input_file_name) as ds_in:
            barcode_mode = ds_in.isBarcoded
            if barcode_mode:
                # attempt to collect the labels of barcodes used on this
                # dataset.  assumes that all BAM files used the same barcodes
                for bam in ds_in.externalResources:
                    if bam.barcodes is not None:
                        barcode_sets.add(bam.barcodes)
    barcode_labels = []
    bio_samples_to_bc = None
    if barcode_mode:
        if len(barcode_sets) == 1:
            # exactly one BarcodeSet: read its record IDs as labels
            bc_file = list(barcode_sets)[0]
            log.info("Reading barcode labels from %s", bc_file)
            try:
                with BarcodeSet(bc_file) as bc_in:
                    for bc in bc_in:
                        barcode_labels.append(bc.id)
            except IOError as e:
                # best-effort: fall back to numeric indices in file names
                log.error("Can't read %s", bc_file)
                log.error(e)
        elif len(barcode_sets) > 1:
            # ambiguous label sources — leave barcode_labels empty
            log.warning("Multiple barcode sets used for this SubreadSet:")
            for fn in sorted(list(barcode_sets)):
                log.warning("  %s", fn)
        else:
            log.info("No barcode labels available")
        if subreads_in is not None:
            bio_samples_to_bc = {}
            with SubreadSet(subreads_in, strict=True) as subread_ds:
                if subread_ds.isBarcoded:  # pylint: disable=no-member
                    bio_samples_to_bc = get_barcode_sample_mappings(subread_ds)
    # e.g. "bam2fasta" -> ".fasta"; converter emits gzipped fastx
    base_ext = re.sub("bam2", ".", program_name)
    suffix = "{f}.gz".format(f=base_ext)
    tmp_out_dir = tempfile.mkdtemp(dir=tmp_dir)
    tmp_out_prefix = op.join(tmp_out_dir, "tmp_fastx")
    args = [
        program_name,
        "-o", tmp_out_prefix,
        input_file_name,
    ]
    if barcode_mode:
        args.insert(1, "--split-barcodes")
    if seqid_prefix is not None:
        args.extend(["--seqid-prefix", pipes.quote(seqid_prefix)])
    log.info(" ".join(args))
    remove_files = []
    result = run_cmd(" ".join(args),
                     stdout_fh=sys.stdout,
                     stderr_fh=sys.stderr)

    def _is_fastx_file(fn):
        # matches the converter's gzipped outputs under our tmp prefix
        return fn.startswith(tmp_out_prefix) and fn.endswith(suffix)

    try:
        assert result.exit_code == 0, "{p} exited with code {c}".format(
            p=program_name, c=result.exit_code)
        if output_is_archive:
            tc_out_dir = op.dirname(output_file_name)
            fastx_file_names = []
            # find the barcoded FASTX files and un-gzip them to the same
            # output directory and file prefix as the ultimate output
            for fn in walker(tmp_out_dir, _is_fastx_file):
                if barcode_mode:
                    # bam2fastx outputs files with the barcode indices
                    # encoded in the file names; here we attempt to
                    # translate these to barcode labels, falling back on
                    # the original indices if necessary
                    bc_fwd_rev = fn.split(".")[-3].split("_")
                    bc_label = "unbarcoded"
                    # 65535/-1 are the converter's "no barcode" sentinels
                    # — TODO confirm against bam2fastx docs
                    if (bc_fwd_rev != ["65535", "65535"] and
                            bc_fwd_rev != ["-1", "-1"]):
                        def _label_or_none(x):
                            # translate a numeric index to its label;
                            # return the raw token when translation fails
                            try:
                                bc = int(x)
                                if bc < 0:
                                    return "none"
                                elif bc < len(barcode_labels):
                                    return barcode_labels[bc]
                            except ValueError as e:
                                pass
                            return x
                        bc_fwd_label = _label_or_none(bc_fwd_rev[0])
                        bc_rev_label = _label_or_none(bc_fwd_rev[1])
                        bc_label = "{f}--{r}".format(f=bc_fwd_label,
                                                     r=bc_rev_label)
                    suffix2 = ".{l}{t}".format(l=bc_label, t=base_ext)
                    if bio_samples_to_bc is not None:
                        # prepend the biosample name when a mapping exists
                        sample = bio_samples_to_bc.get(bc_label, "unknown")
                        suffix2 = ".{}".format(sample) + suffix2
                else:
                    suffix2 = base_ext
                # strip any archive extension off the final output name
                base = re.sub(".zip$", "",
                              re.sub(".tar.gz", "",
                                     re.sub(".tgz", "",
                                            op.basename(output_file_name))))
                fn_out = base
                if not fn_out.endswith(suffix2):
                    fn_out = re.sub(base_ext, suffix2, fn_out)
                fastx_out = op.join(tc_out_dir, fn_out)
                _ungzip_fastx(fn, fastx_out)
                fastx_file_names.append(fastx_out)
                remove_files.append(fn)
            assert len(fastx_file_names) > 0
            # the unpacked fastx files are only inputs to the archive,
            # so they are cleaned up in the finally block after archiving
            remove_files.extend(fastx_file_names)
            return archive_files(fastx_file_names, output_file_name)
        else:
            # single-file mode: just decompress the converter's output
            tmp_out = "{p}{b}.gz".format(p=tmp_out_prefix, b=base_ext)
            _ungzip_fastx(tmp_out, output_file_name)
            remove_files = [tmp_out]
    finally:
        # clean up intermediates regardless of success or failure
        for fn in remove_files:
            os.remove(fn)
    return 0
def test_validate_pipeline_presets(self):
    """Every pipeline-presets JSON fixture must validate and load as a model.

    Each file is schema-validated, then loaded to confirm it deserializes
    to a PipelinePreset instance.
    """
    for path in walker(DATA_DIR_PRESETS, json_filter):
        f = _to_assertion(path, validate_presets)
        f(self)
        # Use unittest's assertIsInstance rather than a bare `assert`:
        # the bare assert is stripped under `python -O` and gives no
        # diagnostic about the actual type on failure.  This also matches
        # the style of the sibling validation tests in this file.
        self.assertIsInstance(load_pipeline_presets_from(path), PipelinePreset)
def test_validate_pipeline_presets(self):
    """Each pipeline-presets fixture validates and loads as a PipelinePreset."""
    for presets_json in walker(DATA_DIR_PRESETS, json_filter):
        validate = _to_assertion(presets_json, validate_presets)
        validate(self)
        loaded = load_pipeline_presets_from(presets_json)
        self.assertIsInstance(loaded, PipelinePreset)
def test_validate_tool_contracts(self):
    """Each tool-contract fixture validates and loads as a ToolContract."""
    for tc_json in walker(DATA_DIR_TC, json_filter):
        validate = _to_assertion(tc_json, validate_tc)
        validate(self)
        loaded = load_tool_contract_from(tc_json)
        self.assertIsInstance(loaded, ToolContract)