def gather_chunks(chunks, output_file, nproc=1):
    """
    Merge per-chunk datastore JSON files into a single datastore, combining
    chunk outputs that belong to the same barcode pair.

    :param chunks: list of paths to chunk datastore JSON files
    :param output_file: path of the merged datastore JSON to write
    :param nproc: number of worker processes used to merge barcode groups
    :return: number of files in the output datastore
    """
    # Trivial case: a single chunk is re-written as-is.
    if len(chunks) == 1:
        datastore = DataStore.load_from_json(op.realpath(chunks[0]))
        log.info("Writing datastore to %s", output_file)
        datastore.write_json(output_file)
        return len(datastore.files)
    file_names_by_bc = defaultdict(list)
    datastore_files_by_bc = {}
    for file_name in chunks:
        log.info("Reading datastore from %s", file_name)
        datastore = DataStore.load_from_json(op.realpath(file_name))
        for ds_file in datastore.files.values():
            ds_file_name = op.realpath(ds_file.path)
            base_name = op.basename(ds_file_name)
            fields = base_name.split(".")
            # The barcode-pair label is assumed to be the third-to-last
            # dot-separated field of the file name, e.g.
            # "movie.bc1001--bc1001.subreadset.xml" -> "bc1001--bc1001".
            # TODO(review): confirm all inputs follow this naming scheme.
            bc_pair = fields[-3]
            file_names_by_bc[bc_pair].append(ds_file_name)
            # Keep one representative DataStoreFile per barcode pair;
            # the last one seen wins.
            datastore_files_by_bc[bc_pair] = ds_file
    log.info("Found %d unique barcode pairs", len(file_names_by_bc))
    _results = []
    # Merge each barcode pair's chunk files in a separate worker process.
    pool = multiprocessing.Pool(nproc)
    for bc_pair, file_names in file_names_by_bc.items():
        _results.append(
            pool.apply_async(_merge_chunks,
                             (file_names, datastore_files_by_bc[bc_pair])))
    pool.close()
    pool.join()
    datastore_files = [r.get() for r in _results]
    datastore_out = DataStore(datastore_files)
    log.info("Writing datastore to %s", output_file)
    datastore_out.write_json(output_file)
    return len(datastore_files)
def to_reports(subreads, output_dir):
    """
    Generate the per-module statistics reports for a SubreadSet and collect
    the resulting report JSON files into a DataStore.

    Modules that raise InvalidStatsError (missing required statistics) are
    logged and skipped rather than failing the whole run.
    """
    log.info("Loading {f}".format(f=subreads))
    sset = SubreadSet(subreads)
    sset.loadStats()
    report_modules = [
        ("filter_stats_xml", filter_stats_xml),
        ("adapter_xml", adapter_xml),
        ("loading_xml", loading_xml),
        ("control", control),
    ]
    ds_files = []
    for base, module in report_modules:
        source_id = getattr(module, "Constants").TOOL_ID
        make_report = getattr(module, "to_report_impl")
        try:
            report_dir = os.path.join(output_dir, base)
            os.mkdir(report_dir)
            json_path = os.path.join(report_dir, "{b}.json".format(b=base))
            report = make_report(sset, report_dir)
            log.info("Writing {f}".format(f=json_path))
            report.write_json(json_path)
        except InvalidStatsError:
            log.error("This dataset lacks some required statistics")
            log.error("Skipping generation of {b} report".format(b=base))
        else:
            ds_files.append(DataStoreFile(
                uuid=report.uuid,
                source_id=source_id,
                type_id=FileTypes.REPORT.file_type_id,
                path=json_path,
                is_chunked=False,
                name=base))
    return DataStore(ds_files)
def run_consolidate(dataset_file, output_file, datastore_file, consolidate,
                    n_files, task_id=Constants.TOOL_ID):
    """
    Optionally consolidate a dataset's external BAM resources, write the
    re-UUID'd dataset XML, and emit a datastore JSON listing the BAM/BAI
    resources.

    :param dataset_file: input dataset XML path
    :param output_file: output dataset XML path
    :param datastore_file: output datastore JSON path
    :param consolidate: if True, merge external resources into new BAM(s)
    :param n_files: maximum number of BAM files after consolidation
    :param task_id: prefix used for the sourceId of emitted DataStoreFiles
    :return: 0 (shell-style success code)
    """
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            # A dataset with exactly one external file is left untouched.
            if len(ds_in.toExternalFiles()) != 1:
                new_resource_file = op.splitext(output_file)[0] + ".bam"
                ds_in.consolidate(new_resource_file, numFiles=n_files)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            task_id + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam)
                    datastore_files.append(ds_file)
                    # Also expose each BAM's BAI index files.
                    for index in ext_res.indices:
                        if index.metaType in Constants.BAI_FILE_TYPES:
                            ds_file = DataStoreFile(index.uniqueId,
                                                    task_id + "-out-3",
                                                    index.metaType,
                                                    index.resourceId)
                            datastore_files.append(ds_file)
        # New UUID so the written dataset is distinct from its input.
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
def mock_update_barcoded_sample_metadata(base_dir,
                                         datastore_file,
                                         input_reads,
                                         barcode_set,
                                         use_barcode_uuids=True):
    """
    Function to mimic the actual update function, without actually reading
    any barcoding information from the datasets.  Instead, the barcodes
    defined in the input dataset will be applied sequentially.
    """
    barcode_names, bio_samples_d, barcode_uuids_d, update_files, parent_info = _load_files_for_update(
        input_reads, barcode_set, datastore_file, None)
    # Map barcode name -> its index within the barcode set.
    barcode_ids = {name: i for i, name in enumerate(barcode_names)}
    bc_pairs = []
    ds_files = {}
    for bc_label in barcode_uuids_d.keys():
        # Labels have the form "<forward>--<reverse>".
        bc_fw_label, bc_rev_label = bc_label.split("--")
        bc_pairs.append((barcode_ids[bc_fw_label], barcode_ids[bc_rev_label]))
        suffix = ".{l}.subreadset.xml".format(l=bc_label)
        # Associate the datastore file whose path carries this label with
        # the barcode pair just appended.
        for ds_file in update_files:
            if ds_file.path.endswith(suffix):
                ds_files[bc_pairs[-1]] = ds_file
    new_files = []
    # Every file to update must have a matching barcode pair.
    assert len(bc_pairs) >= len(update_files)
    for bc_pair in bc_pairs:
        ds_file = ds_files[bc_pair]
        new_files.append(
            _mock_update_barcoded_sample_metadata(base_dir,
                                                  ds_file,
                                                  barcode_names,
                                                  parent_info,
                                                  use_barcode_uuids,
                                                  bc_pair,
                                                  bio_samples_d,
                                                  barcode_uuids_d))
    return DataStore(new_files)
def _make_datastore(subreads):
    """Write a single-entry datastore JSON wrapping *subreads* and return
    the path of the freshly created temporary JSON file."""
    ds_file = DataStoreFile(uuid.uuid4(),
                            "barcoding.tasks.lima-out-0",
                            FileTypes.DS_SUBREADS.file_type_id,
                            subreads)
    tmp_json = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    DataStore([ds_file]).write_json(tmp_json)
    return tmp_json
def dataset_to_datastore(dataset_file, datastore_file,
                         source_id="dataset_to_datastore"):
    """Copied from pbcoretools.tasks.barcoding"""
    # FIXME: replace barcoding
    metadata = get_dataset_metadata(dataset_file)
    entry = DataStoreFile(metadata.uuid, source_id, metadata.metatype,
                          dataset_file)
    DataStore([entry]).write_json(datastore_file)
    return 0
def run_consolidate(dataset_file, output_file, datastore_file,
                    consolidate, n_files,
                    consolidate_f=lambda ds: ds.consolidate):
    """
    Optionally consolidate a dataset's BAM resources into at most *n_files*
    files, write the re-UUID'd dataset XML, and emit a datastore JSON
    listing the BAM and BAI resources for display.

    :param dataset_file: input dataset XML path
    :param output_file: output dataset XML path
    :param datastore_file: output datastore JSON path
    :param consolidate: if True, merge external resources into new BAM(s)
    :param n_files: maximum number of BAM files after consolidation
    :param consolidate_f: hook returning the bound consolidate method to
        call (overridable for testing/alternate dataset types)
    :return: 0 (shell-style success code)
    """
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(
                        dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file, numFiles=n_files,
                                 useTmp=False)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else to
            # make view rule whitelisting work
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicated index files being added to datastore,
                    # since consolidated dataset may contain multiple indices
                    # pointing to the same physical file
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES and
                                index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(
                                    reads_name.lower()))
                            datastore_files.append(ds_file)
        # New UUID so the written dataset is distinct from its input.
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
def test_datastore_paths(self):
    """Paths in the written datastore JSON should be relative, and should
    resolve back to absolute paths when the JSON is loaded."""
    tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    base_dir = os.path.dirname(tmpfile)
    tmp_ds = os.path.join(base_dir, "datastore.json")
    dsf = DataStoreFile(str(uuid.uuid4()),
                        "pbcommand.tasks.dev_task",
                        FileTypes.DS_SUBREADS.file_type_id,
                        os.path.basename(tmpfile),
                        False,
                        "Subreads",
                        "Subread DataSet XML")
    ds = DataStore([dsf])
    ds.write_json(tmp_ds)
    with open(tmp_ds) as json_in:
        d = json.loads(json_in.read())
        self.assertFalse(os.path.isabs(d['files'][0]['path']))
    ds = DataStore.load_from_json(tmp_ds)
    # BUG FIX: dict.values() returns a non-indexable view on Python 3, so
    # the original `ds.files.values()[0]` raised TypeError; materialize
    # the view before indexing.
    self.assertEqual(list(ds.files.values())[0].path, tmpfile)
def run_dev_txt_to_datastore(rtc):
    """
    Dev/demo task: sleep for a random, option-scaled interval, then write N
    copies of the input SubreadSet (each with a fresh UUID and, when
    possible, a parent-provenance record) and a datastore JSON listing them.

    :param rtc: resolved tool contract providing input/output files and
        the sleep_multiplier / num_subreadsets options
    :return: 0 (shell-style success code)
    """
    p = os.path.dirname(rtc.task.output_files[0])
    sleep_multiplier = rtc.task.options[
        'pbsmrtpipe.task_options.sleep_multiplier']
    t_sleep = sleep_multiplier * random.random()
    log.info("Sleeping for %.1f seconds", t_sleep)
    time.sleep(t_sleep)
    from pbcore.io import SubreadSet
    num_subreadsets = rtc.task.options[
        'pbsmrtpipe.task_options.num_subreadsets']
    sset = SubreadSet(rtc.task.input_files[0])
    add_parent = True
    if len(sset.metadata.provenance) > 0:
        # FIX: log.warn is a deprecated alias; use log.warning
        log.warning("Not adding provenance since input already has a parent")
        add_parent = False

    def to_f(x):
        # Build the x-th renamed copy of the input SubreadSet plus the
        # DataStoreFile record describing it.
        source_id = "out-1"
        sset_out = sset.copy()
        sset_out.newUuid(random=True)
        if add_parent:
            sset_out.metadata.addParentDataSet(sset.uuid,
                                               sset.datasetType,
                                               createdBy="AnalysisJob",
                                               timeStampedName="")
        file_name = "file-{x:03d}.subreadset.xml".format(x=x)
        out_path = os.path.join(p, file_name)
        sset_out.write(out_path)
        sset_uuid = sset_out.uniqueId
        name = "subreadset-{}".format(x)
        dsf = DataStoreFile(sset_uuid,
                            source_id,
                            FileTypes.DS_SUBREADS.file_type_id,
                            file_name,
                            name=name,
                            description="{} Example Description".format(name))
        return dsf

    # BUG FIX: xrange does not exist on Python 3; range is equivalent here.
    files = [to_f(i + 1) for i in range(num_subreadsets)]
    ds = DataStore(files)
    ds.write_json(rtc.task.output_files[0])
    return 0
def _run_auto_ccs_outputs_barcoded(datastore_in, datastore_out,
                                   nproc=Constants.MAX_NPROC):
    """Export BAM/FASTX files for every barcoded CCS dataset listed in the
    input datastore, bundle the per-barcode FASTX outputs into combined
    tarballs, and write the resulting datastore JSON."""
    base_dir = op.dirname(datastore_out)
    ccs_files = []
    for ds_file in DataStore.load_from_json(datastore_in).files.values():
        # FIXME use a better file_id
        if (ds_file.file_type_id == FileTypes.DS_CCS.file_type_id and
                ds_file.file_id == "barcoding.tasks.lima-0"):
            ccs_files.append(ds_file.path)
            log.info("Exporting %s", ds_file.path)
    log.info("Exporting %d CCS datasets", len(ccs_files))
    export_args = [(f, base_dir) for f in ccs_files]
    output_files = list(itertools.chain.from_iterable(
        pool_map(__run_ccs_bam_fastq_exports, export_args, nproc)))
    output_files.extend([
        _create_zipped_fastq(output_files, "all_barcodes.fastq.tar.gz"),
        _create_zipped_fasta(output_files, "all_barcodes.fasta.tar.gz")
    ])
    DataStore(output_files).write_json(datastore_out)
    return 0
def run_args(args):
    """Dispatch on args.mode to export FASTA/FASTQ files or consolidate the
    BAM for a ConsensusReadSet, writing the collected outputs to a
    datastore JSON."""
    datastore_out = op.abspath(args.datastore_out)
    base_dir = op.dirname(datastore_out)
    datastore_files = []
    with ConsensusReadSet(args.dataset_file, strict=True) as ds:
        bam_file_name, file_prefix = get_prefix_and_bam_file_name(
            ds, is_barcoded=False)
        if args.mode in ("fasta", "fastq"):
            # Both FASTX modes share the same export call; only the file
            # type and source IDs differ.
            if args.mode == "fasta":
                fastx_type, fastx_ids = FileTypes.FASTA, Constants.FASTA_FILE_IDS
            else:
                fastx_type, fastx_ids = FileTypes.FASTQ, Constants.FASTQ_FILE_IDS
            datastore_files.extend(to_fastx_files(
                fastx_type, ds, args.dataset_file, fastx_ids, base_dir,
                file_prefix, args.min_rq, no_zip=args.no_zip))
        elif args.mode == "consolidate":
            # Only consolidate when no BAM file name was resolved.
            if bam_file_name is None:
                datastore_files.append(
                    consolidate_bam(base_dir, file_prefix, ds,
                                    min_rq=args.min_rq))
    DataStore(datastore_files).write_json(datastore_out)
    return 0
def run_args(args):
    """Collect the optional output files named in FILE_IDS_AND_NAMES into a
    datastore JSON, appending the sample name to each label when one can be
    determined."""
    sample_name = None
    if args.all_samples:
        sample_name = "All Samples"
    elif not args.single_sample:
        # Neither flag set: read the sample name from the BAM read group.
        bam = openDataFile(args.samples_file)
        sample_name = bam.readGroupTable[0].SampleName
        log.info("Sample name is {}".format(sample_name))
    ds_files = []
    for file_id, file_type, label in FILE_IDS_AND_NAMES:
        path = getattr(args, file_id)
        if path is None:
            log.info("Skipping {}".format(file_id))
            continue
        assert path is not None and op.exists(path)
        display_label = label
        if sample_name:
            display_label = label + " ({})".format(sample_name)
        ds_files.append(
            to_datastore_file(path, file_id, file_type, display_label))
    DataStore(ds_files).write_json(args.datastore)
    return 0
def update_barcoded_sample_metadata(
        base_dir,
        datastore_file,
        input_reads,
        barcode_set,
        isoseq_mode=False,
        use_barcode_uuids=True,
        nproc=1,
        min_score_filter=Constants.BARCODE_QUALITY_GREATER_THAN):
    """
    Given a datastore JSON of SubreadSets produced by barcoding, apply the
    following updates to each:
    1. Include only the BioSample(s) corresponding to its barcode
    2. Add the BioSample name to the dataset name
    3. Add a ParentDataSet record in the Provenance section.
    """
    barcode_names, bio_samples_d, barcode_uuids_d, update_files, parent_info = _load_files_for_update(
        input_reads, barcode_set, datastore_file)
    # Update each barcoded dataset in a separate worker process.
    pool = multiprocessing.Pool(nproc)
    _results = []
    for ds_file in update_files:
        _results.append(
            pool.apply_async(_update_barcoded_sample_metadata,
                             (base_dir, ds_file, barcode_names, parent_info,
                              isoseq_mode, use_barcode_uuids, bio_samples_d,
                              barcode_uuids_d, min_score_filter)))
    pool.close()
    pool.join()
    datastore_files = [r.get() for r in _results]
    # copy over the un-barcoded reads BAM
    dstore = DataStore.load_from_json(datastore_file)
    files = dstore.files.values()
    for f in files:
        if f.file_id != "barcoding.tasks.lima-0":
            datastore_files.append(f)
    return DataStore(datastore_files)
def test_failure_no_inputs(self):
    """run_to_report must raise ValueError when given an empty datastore."""
    empty_store = DataStore([])
    store_path = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    empty_store.write_json(store_path)
    with self.assertRaises(ValueError):
        run_to_report(store_path, self.barcodes, self.subreads)
def gather_datastore(input_files, output_file, skip_empty=True):
    """
    Combine the files from multiple datastore JSONs into a single datastore
    JSON.

    :param input_files: paths of datastore JSON files to merge
    :param output_file: path of the combined datastore JSON to write
    :param skip_empty: NOTE(review): currently unused — kept for interface
        compatibility; either honor it or drop it from callers
    """
    ds_out = DataStore([])
    for file_name in input_files:
        # FIX: iterate values directly — the UUID keys were unused, and the
        # original loop variable shadowed the stdlib `uuid` module.
        for ds_file in DataStore.load_from_json(file_name).files.values():
            ds_out.add(ds_file)
    ds_out.write_json(output_file)
def _to_datastore(dx):
    """Wrap the raw service entries in *dx* as a DataStore."""
    # Friction to get around service endpoint not returning a list of files
    return DataStore([_to_ds_file(entry) for entry in dx])
def write_and_initialize_data_store_json(file_name, ds_files):
    """Build a DataStore from *ds_files*, persist it to *file_name*, and
    return the in-memory instance."""
    datastore = DataStore(ds_files)
    datastore.write_json(file_name)
    return datastore