def gather_chunks(chunks, output_file, nproc=1):
    """
    Combine several chunked datastore JSON files into one datastore JSON.

    :param chunks: paths of the datastore JSON files to combine
    :param output_file: path of the datastore JSON to write
    :param nproc: number of worker processes used for merging
    :return: number of files in the datastore that was written
    """
    # A single chunk needs no merging: re-write it under the output name.
    if len(chunks) == 1:
        only_store = DataStore.load_from_json(op.realpath(chunks[0]))
        log.info("Writing datastore to %s", output_file)
        only_store.write_json(output_file)
        return len(only_store.files)

    paths_by_bc = defaultdict(list)
    # Last datastore file seen for each barcode pair; handed to _merge_chunks.
    ds_file_by_bc = {}
    for chunk_json in chunks:
        log.info("Reading datastore from %s", chunk_json)
        chunk_store = DataStore.load_from_json(op.realpath(chunk_json))
        for entry in chunk_store.files.values():
            real_path = op.realpath(entry.path)
            # The grouping key is the third-from-last dot-separated field
            # of the file's base name.
            bc_pair = op.basename(real_path).split(".")[-3]
            paths_by_bc[bc_pair].append(real_path)
            ds_file_by_bc[bc_pair] = entry
    log.info("Found %d unique barcode pairs", len(paths_by_bc))

    pool = multiprocessing.Pool(nproc)
    pending = [
        pool.apply_async(_merge_chunks, (paths, ds_file_by_bc[bc_pair]))
        for bc_pair, paths in paths_by_bc.items()
    ]
    pool.close()
    pool.join()
    merged_files = [job.get() for job in pending]

    merged_store = DataStore(merged_files)
    log.info("Writing datastore to %s", output_file)
    merged_store.write_json(output_file)
    return len(merged_files)
def test_load_datastore_from_file(self):
    """Loading the job's datastore JSON yields a DataStore instance."""
    json_path = _to_ds_json(self.job_dir)
    loaded = DataStore.load_from_json(json_path)
    self.assertIsInstance(loaded, DataStore)
def datastore_to_datastorefile_objs(in_datastore_json,
                                    allowed_types=ALLOWED_TYPES):
    """
    Load a datastore JSON and return its files plus their common type.

    :param in_datastore_json: path to the datastore JSON file
    :param allowed_types: file types accepted; each must expose
        ``file_type_id``
    :return: tuple ``(datastorefile_objs, type_id, cls, ext)`` where

        - datastorefile_objs -- a list of DataStoreFile objects
        - type_id -- the file type id shared by all files
        - cls -- e.g., SubreadSet
        - ext -- e.g., subreadset.xml
    :raises ValueError: if the datastore is empty, contains more than one
        file type, or its file type is not among ``allowed_types``
    """
    datastore = DataStore.load_from_json(in_datastore_json)
    allowed_type_ids = [t.file_type_id for t in allowed_types]
    # Materialize once so callers really get a list (a dict view on
    # Python 3 cannot be indexed).
    files = list(datastore.files.values())

    # Is input datastore empty?
    if not files:
        raise ValueError(
            "Expected one or more dataset files in datastore {}".format(
                in_datastore_json))

    # Do all files share the same type?
    observed_type_ids = list(set([f.file_type_id for f in files]))
    if len(observed_type_ids) != 1:
        raise ValueError(
            "Could not handle datastore of mixed types: {}!".format(
                observed_type_ids))

    # Is it an allowed file type?
    type_id = observed_type_ids[0]
    if type_id not in allowed_type_ids:
        raise ValueError(
            "Could not handle {} dataset in datastore file {}, only support {}!"
            .format(type_id, in_datastore_json, allowed_type_ids))

    cls = _type_id_to_cls(type_id)
    ext = _type_id_to_ext(type_id)
    return files, type_id, cls, ext
def iterate_datastore_read_set_files(
        datastore_file,
        allowed_read_types=Constants.ALLOWED_BC_TYPES):
    """
    Yield the datastore files whose type is one of ``allowed_read_types``.

    The datastore JSON lists dataset files (e.g., SubreadSet or
    ConsensusReadSet); files of any other type are skipped.

    :param datastore_file: path to the datastore JSON
    :param allowed_read_types: collection of accepted file type ids
    """
    store = DataStore.load_from_json(datastore_file)
    for entry in store.files.values():
        if entry.file_type_id not in allowed_read_types:
            continue
        yield entry
def test_datastore_file_name_and_description(self):
    """
    Every datastore file must have a name and a description that contain
    at least one alphanumeric character.
    """
    store = DataStore.load_from_json(_to_ds_json(self.job_dir))
    has_alnum = re.compile(r'[a-zA-Z0-9]{1,}')
    for entry in store.files.values():
        for text in (entry.name, entry.description):
            self.assertTrue(has_alnum.search(text))
def run_args(args):
    """
    Build a ConsensusReadSet from the files listed in a datastore and write
    it to ``trimmed.consensusreadset.xml`` in the current directory.

    :param args: parsed arguments providing ``datastore`` (datastore JSON
        path) and ``ccs_in`` (input ConsensusReadSet, used for its name)
    :return: 0
    """
    datastore_path = os.path.realpath(args.datastore)
    dstore = DataStore.load_from_json(datastore_path)
    ds_in = ConsensusReadSet(args.ccs_in, trustCounts=True)

    member_paths = [entry.path for entry in dstore.files.values()]
    ds_out = ConsensusReadSet(*member_paths, trustCounts=True)
    sanitize_dataset_tags(ds_out, remove_hidden=True)

    # Output name is the input name with any " (filtered)" marker removed
    # and " (trimmed)" appended.
    base_name = ds_in.name.replace(" (filtered)", "")
    ds_out.name = base_name + " (trimmed)"
    ds_out.subdatasets = []
    ds_out.write("trimmed.consensusreadset.xml")
    return 0
def wrapper(self):
    """
    Run ``validator_func`` on every file of type ``file_type_id`` listed in
    the job's ``workflow/datastore.json``.

    NOTE(review): ``file_type_id``, ``validator_func`` and ``kwargs`` are
    not defined here; presumably they come from an enclosing scope that is
    not visible in this block -- confirm.
    """
    datastore_json = os.path.join(self.job_dir, "workflow", "datastore.json")
    store = DataStore.load_from_json(datastore_json)
    for entry in store.files.values():
        if entry.file_type_id != file_type_id:
            continue
        t_start = time.time()
        validator_func(entry.path, **kwargs)
        elapsed = time.time() - t_start
        log.debug("Successfully validated in {s:.2f} sec {p}".format(
            p=entry.path, s=elapsed))
    # Reaching this point means no validator call raised.
    self.assertTrue(True)
def run_after(self, rtc, output_dir):
    """
    Check the task outputs: the first output is a dataset that is indexed,
    has one external file and unique record qNames; the second is a
    datastore JSON listing two files.
    """
    dataset_path = rtc.task.output_files[0]
    with openDataSet(dataset_path) as dset:
        dset.assertIndexed()
        self.assertEqual(len(dset.toExternalFiles()), 1)
        # test for bug 33778
        unique_qnames = {rec.qName for rec in dset}
        self.assertEqual(len(unique_qnames), len(dset))
    store = DataStore.load_from_json(rtc.task.output_files[1])
    self.assertEqual(len(store.files), 2)
def test_gather_datastore_json():
    """
    Run ``python -m pbcoretools.tasks.gather`` on two datastore JSON files
    and check that the gathered datastore lists the two expected BAM paths,
    in order.
    """
    import subprocess
    from pbcommand.models import DataStore

    data_dir = '/pbi/dept/secondary/siv/testdata/pbsvtools-unittest/data/test_scatter_align_datastore/'
    input_jsons = [
        op.join(data_dir, '1.aln.datastore.json'),
        op.join(data_dir, '2.aln.datastore.json'),
    ]
    # Only the generated file name is used as the output path.
    out_json = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    cmd = ['python', '-m', 'pbcoretools.tasks.gather', out_json] + input_jsons
    subprocess.check_call(cmd)

    gathered = DataStore.load_from_json(out_json).to_dict()['files']
    expected_bams = [op.join(data_dir, '1.bam'), op.join(data_dir, '2.bam')]
    for idx, bam_path in enumerate(expected_bams):
        assert gathered[idx]['path'] == bam_path
def test_datastore_paths(self):
    """
    Round-trip a datastore whose single file is given by base name only.

    Asserts that the path stored in the written JSON is not absolute, and
    that the path of the file after re-loading equals the original
    absolute temp-file path.
    """
    # Only the generated file name is needed, not the open file object.
    tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    base_dir = os.path.dirname(tmpfile)
    tmp_ds = os.path.join(base_dir, "datastore.json")
    dsf = DataStoreFile(str(uuid.uuid4()),
                        "pbcommand.tasks.dev_task",
                        FileTypes.DS_SUBREADS.file_type_id,
                        os.path.basename(tmpfile),
                        False,
                        "Subreads",
                        "Subread DataSet XML")
    ds = DataStore([dsf])
    ds.write_json(tmp_ds)
    with open(tmp_ds) as json_in:
        d = json.loads(json_in.read())
        self.assertFalse(os.path.isabs(d['files'][0]['path']))
    ds = DataStore.load_from_json(tmp_ds)
    # list() makes the indexing work on Python 3, where values() is a
    # non-indexable view; on Python 2 it is a harmless copy.
    self.assertEqual(list(ds.files.values())[0].path, tmpfile)
def test_integration(self):
    """
    Run ``pbreports.report.subreads_reports`` as a subprocess and check that
    it exits with code 0 and writes a datastore JSON containing exactly the
    three expected report file ids.
    """
    ds_out = op.join(self._output_dir, "datastore.json")
    args = [
        "python", "-m", "pbreports.report.subreads_reports",
        pbtestdata.get_file("subreads-sequel"), ds_out
    ]
    o, c, m = backticks(" ".join(args))
    self.assertEqual(c, 0)
    self.assertTrue(op.exists(ds_out))
    datastore = DataStore.load_from_json(ds_out)
    # values() works on both Python 2 and 3; iteritems() is Python 2 only.
    datastore_files = list(datastore.files.values())
    self.assertEqual(sorted(f.file_id for f in datastore_files), [
        "pbreports.tasks.adapter_report_xml",
        "pbreports.tasks.filter_stats_report_xml",
        "pbreports.tasks.loading_report_xml"
    ])
def test_datastore_report_file_uuid(self):
    """
    Each Report entry in the datastore must carry the same UUID as the
    report JSON it points to. Skips when the datastore has no reports.
    """
    store = DataStore.load_from_json(_to_ds_json(self.job_dir))
    report_type_id = FileTypes.REPORT.file_type_id
    checked = 0
    for entry in store.files.values():
        if entry.file_type_id != report_type_id:
            continue
        report = load_report_from_json(entry.path)
        mismatch_msg = "{p}: {u1} != {u2}".format(
            p=entry.path, u1=report.uuid, u2=entry.uuid)
        # by convention the datastore UUID and the report UUID are the same
        self.assertEqual(report.uuid, entry.uuid, mismatch_msg)
        checked += 1

    if checked == 0:
        raise unittest.SkipTest(
            "Warning. No Report JSON files in datastore.")
def _get_barcoded_datasets(reads_file):
    """
    Return the dataset paths to use for ``reads_file``.

    If ``reads_file`` ends with ``.datastore.json``, it is loaded as a
    datastore and the paths of all files whose type is in
    ``Constants.VALID_FT_IDS`` are returned, each passed through
    ``_to_abs_path`` with the directory of ``reads_file``. Otherwise
    ``reads_file`` itself is returned as a one-element list.

    :raises ValueError: if the datastore contains no file of a valid type
    """
    dir_name = os.path.dirname(os.path.abspath(reads_file))
    if not reads_file.endswith(".datastore.json"):
        return [reads_file]

    datastore = DataStore.load_from_json(reads_file)
    # values() works on both Python 2 and 3; iteritems() is Python 2 only.
    datasets = [
        _to_abs_path(dir_name, f.path)
        for f in datastore.files.values()
        if f.file_type_id in Constants.VALID_FT_IDS
    ]
    if not datasets:
        raise ValueError("No datasets containing barcoded reads were "
                         "present in the input. This could mean that "
                         "demultiplexing was run with incorrect inputs "
                         "or an overly restrictive minimum barcode score.")
    return datasets
def _run_auto_ccs_outputs_barcoded(datastore_in,
                                   datastore_out,
                                   nproc=Constants.MAX_NPROC):
    """
    Export every barcoded CCS dataset listed in ``datastore_in`` and write
    a datastore of the exported files to ``datastore_out``.

    Exports are written relative to the directory of ``datastore_out``.
    Two combined archives (``all_barcodes.fastq.tar.gz`` and
    ``all_barcodes.fasta.tar.gz``) are added to the output datastore.

    :return: 0
    """
    base_dir = op.dirname(datastore_out)
    ccs_type_id = FileTypes.DS_CCS.file_type_id
    ccs_files = []
    for entry in DataStore.load_from_json(datastore_in).files.values():
        # FIXME use a better file_id
        if entry.file_type_id != ccs_type_id:
            continue
        if entry.file_id != "barcoding.tasks.lima-0":
            continue
        ccs_files.append(entry.path)
        log.info("Exporting %s", entry.path)
    log.info("Exporting %d CCS datasets", len(ccs_files))

    export_args = [(ccs_path, base_dir) for ccs_path in ccs_files]
    per_dataset_outputs = pool_map(
        __run_ccs_bam_fastq_exports, export_args, nproc)
    output_files = list(itertools.chain.from_iterable(per_dataset_outputs))

    # Build both archives from the per-dataset outputs before either
    # archive entry is added to the list.
    zipped_fastq = _create_zipped_fastq(
        output_files, "all_barcodes.fastq.tar.gz")
    zipped_fasta = _create_zipped_fasta(
        output_files, "all_barcodes.fasta.tar.gz")
    output_files.extend([zipped_fastq, zipped_fasta])

    DataStore(output_files).write_json(datastore_out)
    return 0
def _validate_datastore_reports(self, validate_func):
    """
    Run ``validate_func`` on every Report JSON listed in the job datastore.

    :param validate_func: callable taking a report path; a ``ValueError``
        raised by it fails the test
    :return: True when at least one report was validated
    :raises unittest.SkipTest: if the datastore contains no Report files
    """
    ds = DataStore.load_from_json(_to_ds_json(self.job_dir))
    # Becomes True once one Report validates. It must start as False,
    # otherwise the SkipTest below can never trigger.
    have_reports = False
    for ds_file in ds.files.values():
        if ds_file.file_type_id == FileTypes.REPORT.file_type_id:
            try:
                _ = validate_func(ds_file.path)
            except ValueError as e:
                self.fail(
                    "Report validation failed:\n{e}".format(e=str(e)))
            else:
                have_reports = True

    if not have_reports:
        raise unittest.SkipTest("No Report JSON files in datastore.")
    return have_reports
def test_datastore_dataset_file_uuid(self):
    """
    Each DataSet entry in the datastore must carry the same UUID as the
    dataset file it points to. Skips when the datastore has no datasets.
    """
    dataset_type_ids = FileTypes.ALL_DATASET_TYPES().keys()
    store = DataStore.load_from_json(_to_ds_json(self.job_dir))
    checked = 0
    for entry in store.files.values():
        if entry.file_type_id not in dataset_type_ids:
            continue
        dataset_path = entry.path
        expected_uuid = entry.uuid
        actual_uuid = getDataSetUuid(dataset_path)
        self.assertEqual(
            actual_uuid, expected_uuid,
            "{p}: {u1} != {u2}".format(
                p=dataset_path, u1=actual_uuid, u2=expected_uuid))
        checked += 1

    if checked == 0:
        raise unittest.SkipTest(
            "Warning. No DataSet XML files in datastore.")
def update_barcoded_sample_metadata(
        base_dir,
        datastore_file,
        input_reads,
        barcode_set,
        isoseq_mode=False,
        use_barcode_uuids=True,
        nproc=1,
        min_score_filter=Constants.BARCODE_QUALITY_GREATER_THAN):
    """
    Given a datastore JSON of SubreadSets produced by barcoding, apply the
    following updates to each:

    1. Include only the BioSample(s) corresponding to its barcode
    2. Add the BioSample name to the dataset name
    3. Add a ParentDataSet record in the Provenance section.

    The per-file work is delegated to ``_update_barcoded_sample_metadata``
    and run in a pool of ``nproc`` processes. Files in ``datastore_file``
    whose file id is not ``barcoding.tasks.lima-0`` are carried over as-is.

    :return: a new DataStore holding the updated and carried-over files
    """
    (barcode_names, bio_samples_d, barcode_uuids_d,
     update_files, parent_info) = _load_files_for_update(
         input_reads, barcode_set, datastore_file)

    pool = multiprocessing.Pool(nproc)
    pending = [
        pool.apply_async(
            _update_barcoded_sample_metadata,
            (base_dir, ds_file, barcode_names, parent_info, isoseq_mode,
             use_barcode_uuids, bio_samples_d, barcode_uuids_d,
             min_score_filter))
        for ds_file in update_files
    ]
    pool.close()
    pool.join()
    datastore_files = [job.get() for job in pending]

    # copy over the un-barcoded reads BAM
    original_store = DataStore.load_from_json(datastore_file)
    datastore_files.extend(
        entry for entry in original_store.files.values()
        if entry.file_id != "barcoding.tasks.lima-0")
    return DataStore(datastore_files)
def _check_datastore(self, file_name):
    """
    Assert that the datastore at ``file_name`` contains exactly the source
    ids ``mapped_bam`` and ``mapped_bam_bai``.
    """
    store = DataStore.load_from_json(file_name)
    source_ids = [entry.source_id for entry in store.files.values()]
    source_ids.sort()
    assert source_ids == ["mapped_bam", "mapped_bam_bai"]
def gather_datastore(input_files, output_file, skip_empty=True):
    """
    Merge the files of several datastore JSONs into one datastore JSON.

    :param input_files: paths of the datastore JSON files to merge
    :param output_file: path of the merged datastore JSON to write
    :param skip_empty: accepted but not used by this function
    """
    merged = DataStore([])
    for json_path in input_files:
        loaded = DataStore.load_from_json(json_path)
        for entry in loaded.files.values():
            merged.add(entry)
    merged.write_json(output_file)