def run_consolidate(dataset_file, output_file, datastore_file,
                    consolidate, n_files, task_id=Constants.TOOL_ID):
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) != 1:
                new_resource_file = op.splitext(output_file)[0] + ".bam"
                ds_in.consolidate(new_resource_file, numFiles=n_files)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            task_id + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam)
                    datastore_files.append(ds_file)
                    for index in ext_res.indices:
                        if index.metaType in Constants.BAI_FILE_TYPES:
                            ds_file = DataStoreFile(index.uniqueId,
                                                    task_id + "-out-3",
                                                    index.metaType,
                                                    index.resourceId)
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0

def _make_datastore(subreads):
    files = [
        DataStoreFile(uuid.uuid4(),
                      "barcoding.tasks.lima-out-0",
                      FileTypes.DS_SUBREADS.file_type_id,
                      subreads)
    ]
    ds = DataStore(files)
    ds_path = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    ds.write_json(ds_path)
    return ds_path

def dataset_to_datastore(dataset_file, datastore_file,
                         source_id="dataset_to_datastore"):
    """Copied from pbcoretools.tasks.barcoding"""
    # FIXME: replace barcoding
    dsmd = get_dataset_metadata(dataset_file)
    ds_file = DataStoreFile(dsmd.uuid, source_id, dsmd.metatype, dataset_file)
    ds_out = DataStore([ds_file])
    ds_out.write_json(datastore_file)
    return 0

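# A minimal round-trip sketch for dataset_to_datastore above. The input path is
# hypothetical; it assumes a valid SubreadSet XML exists at that location and
# that pbcommand and get_dataset_metadata (as used by the helper) are available.
def _example_dataset_to_datastore():
    from pbcommand.models import DataStore
    dataset_xml = "movie.subreadset.xml"      # hypothetical input
    datastore_json = "movie.datastore.json"   # output written by the helper
    dataset_to_datastore(dataset_xml, datastore_json, source_id="example-out-0")
    # The datastore should now contain exactly one file record whose UUID and
    # metatype were copied from the dataset metadata.
    ds = DataStore.load_from_json(datastore_json)
    assert len(ds.files) == 1
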
def run_consolidate(dataset_file, output_file, datastore_file,
                    consolidate, n_files,
                    consolidate_f=lambda ds: ds.consolidate):
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(
                        dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file,
                                 numFiles=n_files,
                                 useTmp=False)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicated index files being added to datastore,
                    # since consolidated dataset may contain multiple indices
                    # pointing to the same physical file
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES
                                and index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(
                                    reads_name.lower()))
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0

def test_datastore_paths(self):
    tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    base_dir = os.path.dirname(tmpfile)
    tmp_ds = os.path.join(base_dir, "datastore.json")
    dsf = DataStoreFile(str(uuid.uuid4()),
                        "pbcommand.tasks.dev_task",
                        FileTypes.DS_SUBREADS.file_type_id,
                        os.path.basename(tmpfile),
                        False,
                        "Subreads",
                        "Subread DataSet XML")
    ds = DataStore([dsf])
    ds.write_json(tmp_ds)
    with open(tmp_ds) as json_in:
        d = json.loads(json_in.read())
        self.assertFalse(os.path.isabs(d['files'][0]['path']))
    ds = DataStore.load_from_json(tmp_ds)
    self.assertEqual(ds.files.values()[0].path, tmpfile)

def datastore_to_datastorefile_objs(in_datastore_json,
                                    allowed_types=ALLOWED_TYPES):
    """Return (datastorefile_objs, type_id, cls, ext)

    datastorefile_objs -- a list of DataStoreFile objects.
    type_id -- the file type id shared by all files in the datastore
    cls -- the corresponding DataSet class, e.g., SubreadSet
    ext -- the corresponding file extension, e.g., subreadset.xml
    """
    datastore = DataStore.load_from_json(in_datastore_json)
    allowed_type_ids = [t.file_type_id for t in allowed_types]

    # Is input datastore empty?
    if len(datastore.files) == 0:
        raise ValueError(
            "Expected one or more dataset files in datastore {}".format(
                in_datastore_json))

    # Do all files share the same type?
    observed_type_ids = list(
        set([f.file_type_id for f in datastore.files.values()]))
    if len(observed_type_ids) != 1:
        raise ValueError(
            "Could not handle datastore of mixed types: {}!".format(
                observed_type_ids))

    # Is it an allowed file type?
    type_id = observed_type_ids[0]
    if type_id not in allowed_type_ids:
        raise ValueError(
            "Could not handle {} dataset in datastore file {}, only support {}!"
            .format(type_id, in_datastore_json, allowed_type_ids))

    cls = _type_id_to_cls(type_id)
    ext = _type_id_to_ext(type_id)
    return datastore.files.values(), type_id, cls, ext

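# A minimal usage sketch for datastore_to_datastorefile_objs, assuming
# ALLOWED_TYPES includes SubreadSet and that the datastore JSON path below
# (hypothetical) lists files of a single dataset type.
def _example_datastore_to_objs():
    files, type_id, cls, ext = datastore_to_datastorefile_objs(
        "barcoded.datastore.json")  # hypothetical input
    # All files share one type, so cls/ext can be used to merge them into a
    # single dataset of that type, e.g., SubreadSet / "subreadset.xml".
    merged = cls(*[f.path for f in files])
    merged.write("merged.{}".format(ext))
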
def to_reports(subreads, output_dir):
    output_files = []
    log.info("Loading {f}".format(f=subreads))
    ds = SubreadSet(subreads)
    ds.loadStats()
    for base, module in [("filter_stats_xml", filter_stats_xml),
                         ("adapter_xml", adapter_xml),
                         ("loading_xml", loading_xml),
                         ("control", control)]:
        constants = getattr(module, "Constants")
        task_id = constants.TOOL_ID
        to_report = getattr(module, "to_report_impl")
        try:
            rpt_output_dir = os.path.join(output_dir, base)
            os.mkdir(rpt_output_dir)
            file_name = os.path.join(rpt_output_dir, "{b}.json".format(b=base))
            report = to_report(ds, rpt_output_dir)
            log.info("Writing {f}".format(f=file_name))
            report.write_json(file_name)
            output_files.append(DataStoreFile(
                uuid=report.uuid,
                source_id=task_id,
                type_id=FileTypes.REPORT.file_type_id,
                path=file_name,
                is_chunked=False,
                name=base))
        except InvalidStatsError as e:
            log.error("This dataset lacks some required statistics")
            log.error("Skipping generation of {b} report".format(b=base))
    datastore = DataStore(output_files)
    return datastore

def test_load_datastore_from_file(self):
    """Can load a DataStore from a datastore JSON file."""
    ds = DataStore.load_from_json(_to_ds_json(self.job_dir))
    self.assertIsInstance(ds, DataStore)

def mock_update_barcoded_sample_metadata(base_dir,
                                         datastore_file,
                                         input_reads,
                                         barcode_set,
                                         use_barcode_uuids=True):
    """
    Function to mimic the actual update function, without actually reading
    any barcoding information from the datasets.  Instead, the barcodes
    defined in the input dataset will be applied sequentially.
    """
    barcode_names, bio_samples_d, barcode_uuids_d, update_files, parent_info = _load_files_for_update(
        input_reads, barcode_set, datastore_file, None)
    barcode_ids = {name: i for i, name in enumerate(barcode_names)}
    bc_pairs = []
    ds_files = {}
    for bc_label in barcode_uuids_d.keys():
        bc_fw_label, bc_rev_label = bc_label.split("--")
        bc_pairs.append((barcode_ids[bc_fw_label], barcode_ids[bc_rev_label]))
        suffix = ".{l}.subreadset.xml".format(l=bc_label)
        for ds_file in update_files:
            if ds_file.path.endswith(suffix):
                ds_files[bc_pairs[-1]] = ds_file
    new_files = []
    assert len(bc_pairs) >= len(update_files)
    for bc_pair in bc_pairs:
        ds_file = ds_files[bc_pair]
        new_files.append(
            _mock_update_barcoded_sample_metadata(base_dir,
                                                  ds_file,
                                                  barcode_names,
                                                  parent_info,
                                                  use_barcode_uuids,
                                                  bc_pair,
                                                  bio_samples_d,
                                                  barcode_uuids_d))
    return DataStore(new_files)

def run_dev_txt_to_datastore(rtc):
    p = os.path.dirname(rtc.task.output_files[0])
    sleep_multiplier = rtc.task.options[
        'pbsmrtpipe.task_options.sleep_multiplier']
    t_sleep = sleep_multiplier * random.random()
    log.info("Sleeping for %.1f seconds", t_sleep)
    time.sleep(t_sleep)

    from pbcore.io import SubreadSet

    num_subreadsets = rtc.task.options[
        'pbsmrtpipe.task_options.num_subreadsets']
    sset = SubreadSet(rtc.task.input_files[0])
    add_parent = True
    if len(sset.metadata.provenance) > 0:
        log.warn("Not adding provenance since input already has a parent")
        add_parent = False

    def to_f(x):
        source_id = "out-1"
        sset_out = sset.copy()
        sset_out.newUuid(random=True)
        if add_parent:
            sset_out.metadata.addParentDataSet(sset.uuid,
                                               sset.datasetType,
                                               createdBy="AnalysisJob",
                                               timeStampedName="")
        file_name = "file-{x:03d}.subreadset.xml".format(x=x)
        out_path = os.path.join(p, file_name)
        sset_out.write(out_path)
        sset_uuid = sset_out.uniqueId
        name = "subreadset-{}".format(x)
        dsf = DataStoreFile(sset_uuid,
                            source_id,
                            FileTypes.DS_SUBREADS.file_type_id,
                            file_name,
                            name=name,
                            description="{} Example Description".format(name))
        return dsf

    files = [to_f(i + 1) for i in xrange(num_subreadsets)]
    ds = DataStore(files)
    ds.write_json(rtc.task.output_files[0])
    return 0

def test_datastore_file_name_and_description(self):
    """Make sure output files have non-blank name and description."""
    ds = DataStore.load_from_json(_to_ds_json(self.job_dir))
    rx = re.compile(r'[a-zA-Z0-9]{1,}')
    for fd in ds.files.values():
        for x in (fd.name, fd.description):
            self.assertTrue(rx.search(x))

def iterate_datastore_read_set_files(
        datastore_file,
        allowed_read_types=Constants.ALLOWED_BC_TYPES):
    """
    Iterate over dataset (e.g., SubreadSet or ConsensusReadSet) files listed
    in a datastore JSON.
    """
    ds = DataStore.load_from_json(datastore_file)
    files = ds.files.values()
    for f in files:
        if f.file_type_id in allowed_read_types:
            yield f

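# Usage sketch for iterate_datastore_read_set_files, assuming a demultiplexing
# datastore JSON at the hypothetical path below whose entries include read-set
# dataset files of the default allowed types.
def _example_iterate_read_sets():
    for ds_file in iterate_datastore_read_set_files("lima.datastore.json"):
        print(ds_file.file_id, ds_file.path)
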
def _run_auto_ccs_outputs_barcoded(datastore_in,
                                   datastore_out,
                                   nproc=Constants.MAX_NPROC):
    base_dir = op.dirname(datastore_out)
    files = DataStore.load_from_json(datastore_in).files.values()
    ccs_files = []
    for ds_file in files:
        # FIXME use a better file_id
        if (ds_file.file_type_id == FileTypes.DS_CCS.file_type_id
                and ds_file.file_id == "barcoding.tasks.lima-0"):
            ccs_files.append(ds_file.path)
            log.info("Exporting %s", ds_file.path)
    log.info("Exporting %d CCS datasets", len(ccs_files))
    args = [(f, base_dir) for f in ccs_files]
    output_files = list(itertools.chain.from_iterable(
        pool_map(__run_ccs_bam_fastq_exports, args, nproc)))
    output_files.extend([
        _create_zipped_fastq(output_files, "all_barcodes.fastq.tar.gz"),
        _create_zipped_fasta(output_files, "all_barcodes.fasta.tar.gz")
    ])
    DataStore(output_files).write_json(datastore_out)
    return 0

def run_args(args):
    dstore = DataStore.load_from_json(os.path.realpath(args.datastore))
    ds_in = ConsensusReadSet(args.ccs_in, trustCounts=True)
    ds_out = ConsensusReadSet(*([f.path for f in dstore.files.values()]),
                              trustCounts=True)
    sanitize_dataset_tags(ds_out, remove_hidden=True)
    ds_out.name = ds_in.name.replace(" (filtered)", "") + " (trimmed)"
    ds_out.subdatasets = []
    ds_out.write("trimmed.consensusreadset.xml")
    return 0

def run_after(self, rtc, output_dir):
    with openDataSet(rtc.task.output_files[0]) as f:
        f.assertIndexed()
        self.assertEqual(len(f.toExternalFiles()), 1)
        # test for bug 33778
        qnames = set()
        for rec in f:
            qnames.add(rec.qName)
        self.assertEqual(len(qnames), len(f))
    ds = DataStore.load_from_json(rtc.task.output_files[1])
    self.assertEqual(len(ds.files), 2)

def wrapper(self):
    ds_path = os.path.join(self.job_dir, "workflow", "datastore.json")
    ds = DataStore.load_from_json(ds_path)
    # log.info("Loaded datastore {d}".format(d=ds))
    for ds_file in ds.files.values():
        if ds_file.file_type_id == file_type_id:
            started_at = time.time()
            validator_func(ds_file.path, **kwargs)
            run_time = time.time() - started_at
            log.debug("Successfully validated in {s:.2f} sec {p}".format(
                p=ds_file.path, s=run_time))
    self.assertTrue(True)

def gather_chunks(chunks, output_file, nproc=1):
    if len(chunks) == 1:
        datastore = DataStore.load_from_json(op.realpath(chunks[0]))
        log.info("Writing datastore to %s", output_file)
        datastore.write_json(output_file)
        return len(datastore.files)
    file_names_by_bc = defaultdict(list)
    datastore_files_by_bc = {}
    for file_name in chunks:
        log.info("Reading datastore from %s", file_name)
        datastore = DataStore.load_from_json(op.realpath(file_name))
        for ds_file in datastore.files.values():
            ds_file_name = op.realpath(ds_file.path)
            base_name = op.basename(ds_file_name)
            fields = base_name.split(".")
            bc_pair = fields[-3]
            file_names_by_bc[bc_pair].append(ds_file_name)
            datastore_files_by_bc[bc_pair] = ds_file
    log.info("Found %d unique barcode pairs", len(file_names_by_bc))
    _results = []
    pool = multiprocessing.Pool(nproc)
    for bc_pair, file_names in file_names_by_bc.items():
        _results.append(
            pool.apply_async(_merge_chunks,
                             (file_names, datastore_files_by_bc[bc_pair])))
    pool.close()
    pool.join()
    datastore_files = [r.get() for r in _results]
    datastore_out = DataStore(datastore_files)
    log.info("Writing datastore to %s", output_file)
    datastore_out.write_json(output_file)
    return len(datastore_files)

def test_gather_datastore_json():
    import subprocess
    from pbcommand.models import DataStore
    d = '/pbi/dept/secondary/siv/testdata/pbsvtools-unittest/data/test_scatter_align_datastore/'
    if1 = op.join(d, '1.aln.datastore.json')
    if2 = op.join(d, '2.aln.datastore.json')
    of = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    args = ['python', '-m', 'pbcoretools.tasks.gather', of, if1, if2]
    subprocess.check_call(args)
    out_fns = DataStore.load_from_json(of).to_dict()['files']
    expected_bam_1 = op.join(d, '1.bam')
    expected_bam_2 = op.join(d, '2.bam')
    assert out_fns[0]['path'] == expected_bam_1
    assert out_fns[1]['path'] == expected_bam_2

def update_barcoded_sample_metadata(
        base_dir,
        datastore_file,
        input_reads,
        barcode_set,
        isoseq_mode=False,
        use_barcode_uuids=True,
        nproc=1,
        min_score_filter=Constants.BARCODE_QUALITY_GREATER_THAN):
    """
    Given a datastore JSON of SubreadSets produced by barcoding, apply the
    following updates to each:

    1. Include only the BioSample(s) corresponding to its barcode
    2. Add the BioSample name to the dataset name
    3. Add a ParentDataSet record in the Provenance section.
    """
    barcode_names, bio_samples_d, barcode_uuids_d, update_files, parent_info = _load_files_for_update(
        input_reads, barcode_set, datastore_file)
    pool = multiprocessing.Pool(nproc)
    _results = []
    for ds_file in update_files:
        _results.append(
            pool.apply_async(_update_barcoded_sample_metadata,
                             (base_dir, ds_file, barcode_names, parent_info,
                              isoseq_mode, use_barcode_uuids, bio_samples_d,
                              barcode_uuids_d, min_score_filter)))
    pool.close()
    pool.join()
    datastore_files = [r.get() for r in _results]
    # copy over the un-barcoded reads BAM
    dstore = DataStore.load_from_json(datastore_file)
    files = dstore.files.values()
    for f in files:
        if f.file_id != "barcoding.tasks.lima-0":
            datastore_files.append(f)
    return DataStore(datastore_files)

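# A hedged invocation sketch for update_barcoded_sample_metadata above. All
# paths are hypothetical; it assumes a lima output datastore, the original
# SubreadSet, and the BarcodeSet exist on disk.
def _example_update_barcoded_metadata():
    dstore = update_barcoded_sample_metadata(
        base_dir="/tmp/barcoding_job",           # hypothetical job directory
        datastore_file="lima.datastore.json",    # hypothetical lima output
        input_reads="movie.subreadset.xml",      # hypothetical parent dataset
        barcode_set="barcodes.barcodeset.xml",   # hypothetical barcode set
        nproc=4)
    # The returned DataStore holds the updated per-barcode datasets plus any
    # non-barcoded files carried over from the input datastore.
    dstore.write_json("updated.datastore.json")
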
def test_integration(self):
    ds_out = op.join(self._output_dir, "datastore.json")
    args = [
        "python", "-m", "pbreports.report.subreads_reports",
        pbtestdata.get_file("subreads-sequel"), ds_out
    ]
    o, c, m = backticks(" ".join(args))
    self.assertEqual(c, 0)
    self.assertTrue(op.exists(ds_out))
    datastore = DataStore.load_from_json(ds_out)
    datastore_files = [f for u, f in datastore.files.iteritems()]
    self.assertEqual(sorted([f.file_id for f in datastore_files]), [
        "pbreports.tasks.adapter_report_xml",
        "pbreports.tasks.filter_stats_report_xml",
        "pbreports.tasks.loading_report_xml"
    ])

def _get_barcoded_datasets(reads_file):
    dir_name = os.path.dirname(os.path.abspath(reads_file))
    if reads_file.endswith(".datastore.json"):
        datastore = DataStore.load_from_json(reads_file)
        datasets = [
            _to_abs_path(dir_name, f.path)
            for u, f in datastore.files.iteritems()
            if f.file_type_id in Constants.VALID_FT_IDS
        ]
        if len(datasets) == 0:
            raise ValueError("No datasets containing barcoded reads were " +
                             "present in the input. This could mean that " +
                             "demultiplexing was run with incorrect inputs " +
                             "or an overly restrictive minimum barcode score.")
        return datasets
    else:
        return [reads_file]

def test_datastore_report_file_uuid(self):
    """Test that the DataStore file and the underlying Report have the same UUID"""
    ds = DataStore.load_from_json(_to_ds_json(self.job_dir))
    n_tested = 0
    for ds_file in ds.files.values():
        if ds_file.file_type_id == FileTypes.REPORT.file_type_id:
            rpt = load_report_from_json(ds_file.path)
            emsg = "{p}: {u1} != {u2}".format(p=ds_file.path,
                                              u1=rpt.uuid,
                                              u2=ds_file.uuid)
            # by convention the DataStore UUID and the Report UUID should be
            # the same value
            self.assertEqual(rpt.uuid, ds_file.uuid, emsg)
            n_tested += 1
    if n_tested == 0:
        raise unittest.SkipTest(
            "Warning. No Report JSON files in datastore.")

def _validate_datastore_reports(self, validate_func):
    ds = DataStore.load_from_json(_to_ds_json(self.job_dir))
    # set to True once one or more valid Reports are found
    have_reports = False
    for ds_file in ds.files.values():
        if ds_file.file_type_id == FileTypes.REPORT.file_type_id:
            try:
                _ = validate_func(ds_file.path)
            except ValueError as e:
                self.fail(
                    "Report validation failed:\n{e}".format(e=str(e)))
            else:
                have_reports = True
    if not have_reports:
        raise unittest.SkipTest("No Report JSON files in datastore.")
    return have_reports

def run_args(args):
    sample_name = None
    if not args.single_sample and not args.all_samples:
        bam = openDataFile(args.samples_file)
        sample_name = bam.readGroupTable[0].SampleName
        log.info("Sample name is {}".format(sample_name))
    elif args.all_samples:
        sample_name = "All Samples"
    files = []
    for file_id, file_type, label in FILE_IDS_AND_NAMES:
        file_path = getattr(args, file_id)
        if file_path is None:
            log.info("Skipping {}".format(file_id))
            continue
        assert file_path is not None and op.exists(file_path)
        if sample_name:
            label += " ({})".format(sample_name)
        files.append(to_datastore_file(file_path, file_id, file_type, label))
    DataStore(files).write_json(args.datastore)
    return 0

def run_args(args):
    datastore_out = op.abspath(args.datastore_out)
    base_dir = op.dirname(datastore_out)
    datastore_files = []
    with ConsensusReadSet(args.dataset_file, strict=True) as ds:
        bam_file_name, file_prefix = get_prefix_and_bam_file_name(
            ds, is_barcoded=False)
        if args.mode == "fasta":
            datastore_files.extend(to_fastx_files(
                FileTypes.FASTA, ds, args.dataset_file,
                Constants.FASTA_FILE_IDS, base_dir, file_prefix, args.min_rq,
                no_zip=args.no_zip))
        elif args.mode == "fastq":
            datastore_files.extend(to_fastx_files(
                FileTypes.FASTQ, ds, args.dataset_file,
                Constants.FASTQ_FILE_IDS, base_dir, file_prefix, args.min_rq,
                no_zip=args.no_zip))
        elif args.mode == "consolidate":
            if bam_file_name is None:
                datastore_files.append(
                    consolidate_bam(base_dir, file_prefix, ds,
                                    min_rq=args.min_rq))
    DataStore(datastore_files).write_json(datastore_out)
    return 0

def test_datastore_dataset_file_uuid(self):
    """Test that the DataStore file and the underlying DataSet XML have the same UUID"""
    dataset_type_ids = FileTypes.ALL_DATASET_TYPES().keys()
    ds = DataStore.load_from_json(_to_ds_json(self.job_dir))
    n_tested = 0
    for ds_file in ds.files.values():
        if ds_file.file_type_id in dataset_type_ids:
            path = ds_file.path
            dsf_uuid = ds_file.uuid
            uuid = getDataSetUuid(path)
            self.assertEqual(
                uuid, dsf_uuid,
                "{p}: {u1} != {u2}".format(p=path, u1=uuid, u2=dsf_uuid))
            n_tested += 1
    if n_tested == 0:
        raise unittest.SkipTest(
            "Warning. No DataSet XML files in datastore.")

def test_failure_no_inputs(self):
    ds = DataStore([])
    ds_path = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    ds.write_json(ds_path)
    with self.assertRaises(ValueError) as err:
        report = run_to_report(ds_path, self.barcodes, self.subreads)

def _check_datastore(self, file_name):
    ds = DataStore.load_from_json(file_name)
    files = sorted([f.source_id for f in ds.files.values()])
    assert files == ["mapped_bam", "mapped_bam_bai"]

def write_and_initialize_data_store_json(file_name, ds_files):
    ds = DataStore(ds_files)
    ds.write_json(file_name)
    return ds

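# A minimal sketch building a datastore from scratch with pbcommand and writing
# it via write_and_initialize_data_store_json; the UUID, source id, and path
# below are illustrative only.
def _example_write_datastore():
    import uuid
    from pbcommand.models import DataStoreFile, FileTypes
    dsf = DataStoreFile(str(uuid.uuid4()),
                        "example_task-out-0",
                        FileTypes.DS_SUBREADS.file_type_id,
                        "movie.subreadset.xml")  # hypothetical path
    return write_and_initialize_data_store_json("example.datastore.json", [dsf])
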
def gather_datastore(input_files, output_file, skip_empty=True):
    ds = DataStore([])
    for i_fn in input_files:
        for uuid, f in DataStore.load_from_json(i_fn).files.items():
            ds.add(f)
    ds.write_json(output_file)

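# Usage sketch for gather_datastore: merge per-chunk datastore JSONs
# (hypothetical filenames) into a single gathered datastore. Note that
# skip_empty is accepted but not used by the implementation above.
def _example_gather():
    gather_datastore(["chunk-1.datastore.json", "chunk-2.datastore.json"],
                     "gathered.datastore.json")
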
def _to_datastore(dx):
    # Friction to get around service endpoint not returning a list of files
    ds_files = [_to_ds_file(d) for d in dx]
    return DataStore(ds_files)