def run_consolidate(dataset_file, output_file, datastore_file,
                    consolidate, n_files, task_id=Constants.TOOL_ID):
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) != 1:
                new_resource_file = op.splitext(output_file)[0] + ".bam"
                ds_in.consolidate(new_resource_file, numFiles=n_files)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else
            # to make view rule whitelisting work
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            task_id + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam)
                    datastore_files.append(ds_file)
                    for index in ext_res.indices:
                        if index.metaType in Constants.BAI_FILE_TYPES:
                            ds_file = DataStoreFile(index.uniqueId,
                                                    task_id + "-out-3",
                                                    index.metaType,
                                                    index.resourceId)
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
def run_consolidate(dataset_file, output_file, datastore_file,
                    consolidate, n_files,
                    consolidate_f=lambda ds: ds.consolidate):
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(
                        dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file, numFiles=n_files,
                                 useTmp=False)
            # always display the BAM/BAI if consolidation is enabled
            # XXX there is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else
            # to make view rule whitelisting work
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicated index files being added to the
                    # datastore, since a consolidated dataset may contain
                    # multiple indices pointing to the same physical file
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES and
                                index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(
                                    reads_name.lower()))
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
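# A minimal sketch (not part of the original module) of why consolidate_f is
# injectable: a test can substitute a stub so no real BAM merging happens.
# The stub and the commented-out file paths below are hypothetical.
def _fake_consolidate(ds):
    def _consolidate(new_resource_file, numFiles=1, useTmp=False):
        # record the request instead of actually merging BAM files
        print("would consolidate {} file(s) into {}".format(
            len(ds.toExternalFiles()), new_resource_file))
    return _consolidate

# run_consolidate("in.subreadset.xml", "out.subreadset.xml",
#                 "out.datastore.json", consolidate=True, n_files=1,
#                 consolidate_f=_fake_consolidate)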
def test_datastore_file(self):
    tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    ds = DataStoreFile(str(uuid.uuid4()), "pbcommand.tasks.dev_task",
                       FileTypes.DS_SUBREADS.file_type_id, tmpfile, False,
                       "Subreads", "Subread DataSet XML")
    log.info("DataStoreFile: {s}".format(s=ds))
    ds2 = DataStoreFile.from_dict(ds.to_dict())
    for attr in ["uuid", "file_type_id", "file_id", "path", "is_chunked",
                 "name", "description"]:
        self.assertEqual(getattr(ds2, attr), getattr(ds, attr))
def to_reports(subreads, output_dir):
    output_files = []
    log.info("Loading {f}".format(f=subreads))
    ds = SubreadSet(subreads)
    ds.loadStats()
    for base, module in [("filter_stats_xml", filter_stats_xml),
                         ("adapter_xml", adapter_xml),
                         ("loading_xml", loading_xml),
                         ("control", control)]:
        constants = getattr(module, "Constants")
        task_id = constants.TOOL_ID
        to_report = getattr(module, "to_report_impl")
        try:
            rpt_output_dir = os.path.join(output_dir, base)
            os.mkdir(rpt_output_dir)
            file_name = os.path.join(rpt_output_dir,
                                     "{b}.json".format(b=base))
            report = to_report(ds, rpt_output_dir)
            log.info("Writing {f}".format(f=file_name))
            report.write_json(file_name)
            output_files.append(DataStoreFile(
                uuid=report.uuid,
                source_id=task_id,
                type_id=FileTypes.REPORT.file_type_id,
                path=file_name,
                is_chunked=False,
                name=base))
        except InvalidStatsError as e:
            log.error("This dataset lacks some required statistics")
            log.error("Skipping generation of {b} report".format(b=base))
    datastore = DataStore(output_files)
    return datastore
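# Illustrative sketch of the per-report module contract that to_reports()
# discovers via getattr(): a Constants holder with a TOOL_ID, plus a
# to_report_impl() callable. The names mirror the real modules
# (adapter_xml, loading_xml, ...) but the body here is hypothetical.
class _ExampleReportModule:

    class Constants:
        TOOL_ID = "example.tasks.example_report"

    @staticmethod
    def to_report_impl(subread_set, output_dir):
        # would build and return a pbcommand Report for subread_set,
        # raising InvalidStatsError if required sts.xml metrics are absent
        raise NotImplementedError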
def to_datastore_file(file_name, file_id, file_type, label):
    return DataStoreFile(uuid.uuid4(),
                         file_id,
                         file_type.file_type_id,
                         op.abspath(file_name),
                         name=label,
                         description=label)
def _to_datastore_file(file_name, file_id, file_type, description):
    return DataStoreFile(uuid.uuid4(),
                         file_id,
                         file_type.file_type_id,
                         op.abspath(file_name),
                         name=op.basename(file_name),
                         description=description)
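# A self-contained sketch of how helpers like the two above are typically
# consumed: wrap the DataStoreFile in a DataStore and serialize it to JSON.
# The report path and source id are hypothetical; the imports assume the
# pbcommand package used throughout these snippets.
import os.path as op
import uuid

from pbcommand.models import DataStore, DataStoreFile, FileTypes


def write_single_file_datastore(report_json, datastore_json):
    ds_file = DataStoreFile(uuid.uuid4(),
                            "example_task-out-0",  # hypothetical source id
                            FileTypes.REPORT.file_type_id,
                            op.abspath(report_json),
                            name=op.basename(report_json),
                            description="Example report")
    DataStore([ds_file]).write_json(datastore_json)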
def __create_zipped_fastx(file_type_id, source_id, ds_files, output_file):
    fastx_files = [f.path for f in ds_files
                   if f.file_type_id == file_type_id]
    with tarfile.open(output_file, mode="w:gz") as tgz_out:

        def _write_fastx(fh):
            arcname = re.sub(r"\.gz$", "", op.basename(fh.name))
            fastx_in_info = tgz_out.gettarinfo(fileobj=fh, arcname=arcname)
            # XXX This is very slow but necessary: gettarinfo() reports the
            # compressed on-disk size, so for gzipped input we seek to the
            # end of the decompressed stream to get the true member size
            if fh.name.endswith(".gz"):
                fastx_in_info.size = fh.seek(0, io.SEEK_END)
                fh.seek(0)
            tgz_out.addfile(fastx_in_info, fileobj=fh)

        for file_name in fastx_files:
            if file_name.endswith(".zip"):
                with ZipFile(file_name, "r") as zip_in:
                    for fn in zip_in.namelist():
                        with zip_in.open(fn, mode="r") as fastx_in:
                            _write_fastx(fastx_in)
            elif file_name.endswith(".gz"):
                with gzip.open(file_name, "rb") as fastx_in:
                    _write_fastx(fastx_in)
            else:
                # binary mode, so tarfile can stream raw bytes
                with open(file_name, "rb") as fastx_in:
                    _write_fastx(fastx_in)
    file_type_label = file_type_id.split(".")[-1].upper()
    return DataStoreFile(uuid.uuid4(),
                         source_id,
                         FileTypes.TGZ.file_type_id,
                         op.abspath(output_file),
                         name="All Barcodes ({l})".format(l=file_type_label))
def _update_analysis_reports_and_datastore(tnode_, task_):
    # ds, job_resources and the helper functions used below are resolved
    # from the enclosing scope in the original module
    assert (len(tnode_.meta_task.output_file_display_names) ==
            len(tnode_.meta_task.output_file_descriptions) ==
            len(tnode_.meta_task.output_types) ==
            len(task_.output_files))
    for i_file, (file_type_, path_, name, description) in enumerate(zip(
            tnode_.meta_task.output_types,
            task_.output_files,
            tnode_.meta_task.output_file_display_names,
            tnode_.meta_task.output_file_descriptions)):
        source_id = "{t}-out-{i}".format(t=task_.task_id, i=i_file)
        if tnode_.meta_task.datastore_source_id is not None:
            source_id = tnode_.meta_task.datastore_source_id
        ds_uuid = _get_or_create_uuid_from_file(path_, file_type_)
        is_chunked_ = _is_chunked_task_node_type(tnode_)
        ds_file_ = DataStoreFile(ds_uuid, source_id,
                                 file_type_.file_type_id, path_,
                                 is_chunked=is_chunked_,
                                 name=name,
                                 description=description)
        ds.add(ds_file_)
        ds.write_update_json(job_resources.datastore_json)

        # Update Services
        services_add_datastore_file(ds_file_)

        dsr = DU.datastore_to_report(ds)
        R.write_report_to_html(dsr, os.path.join(job_resources.html,
                                                 'datastore.html'))
        if file_type_ == FileTypes.REPORT:
            T.write_task_report(job_resources, task_.task_id, path_,
                                DU._get_images_in_dir(task_.output_dir))
            update_analysis_file_links(tnode_.idx, path_)
def _to_ds_file(d):
    # is_chunked isn't exposed at the service level, so default it to False
    return DataStoreFile(d['uuid'], d['sourceId'], d['fileTypeId'],
                         d['path'],
                         is_chunked=False,
                         name=d.get("name", ""),
                         description=d.get("description", ""))
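# An illustration of the camelCase payload _to_ds_file() consumes from the
# services endpoint; every value below is made up for the example.
import uuid

_example_service_dict = {
    "uuid": str(uuid.uuid4()),
    "sourceId": "pbsmrtpipe.tasks.dev_task-out-0",
    "fileTypeId": "PacBio.FileTypes.JsonReport",
    "path": "/path/to/example.report.json",
    "name": "Example Report",
    "description": "Example report file",
}
# ds_file = _to_ds_file(_example_service_dict)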
def _make_datastore(subreads):
    files = [
        DataStoreFile(uuid.uuid4(),
                      "barcoding.tasks.lima-out-0",
                      FileTypes.DS_SUBREADS.file_type_id,
                      subreads)
    ]
    ds = DataStore(files)
    ds_path = tempfile.NamedTemporaryFile(suffix=".datastore.json").name
    ds.write_json(ds_path)
    return ds_path
def dataset_to_datastore(dataset_file, datastore_file, source_id="dataset_to_datastore"): """Copied from pbcoretools.tasks.barcoding""" # FIXME: replace barcoding dsmd = get_dataset_metadata(dataset_file) ds_file = DataStoreFile(dsmd.uuid, source_id, dsmd.metatype, dataset_file) ds_out = DataStore([ds_file]) ds_out.write_json(datastore_file) return 0
def _merge_chunks(file_names, datastore_file):
    output_file = op.basename(file_names[0])
    ds = openDataSet(*file_names)
    if len(file_names) > 1:
        bam_file = ".".join(output_file.split(".")[:-2] + ["bam"])
        ds.consolidate(bam_file, useTmp=False)
    bc_file = ds.subdatasets[0].externalResources[0].barcodes
    if bc_file is not None:
        bc_file = op.realpath(bc_file)
        ds.externalResources[0].barcodes = bc_file
    ds.write(output_file, relPaths=False)
    log.info("Wrote %s", output_file)
    return DataStoreFile(ds.uuid,
                         datastore_file.source_id,
                         ds.datasetType,
                         op.abspath(output_file))
def test_datastore_paths(self):
    tmpfile = tempfile.NamedTemporaryFile(suffix=".subreadset.xml").name
    base_dir = os.path.dirname(tmpfile)
    tmp_ds = os.path.join(base_dir, "datastore.json")
    dsf = DataStoreFile(str(uuid.uuid4()), "pbcommand.tasks.dev_task",
                        FileTypes.DS_SUBREADS.file_type_id,
                        os.path.basename(tmpfile), False, "Subreads",
                        "Subread DataSet XML")
    ds = DataStore([dsf])
    ds.write_json(tmp_ds)
    with open(tmp_ds) as json_in:
        d = json.loads(json_in.read())
        self.assertFalse(os.path.isabs(d['files'][0]['path']))
    ds = DataStore.load_from_json(tmp_ds)
    # list() so the indexing also works on Python 3 dict views
    self.assertEqual(list(ds.files.values())[0].path, tmpfile)
def _update_analysis_reports_and_datastore(tnode_, task_):
    for file_type_, path_ in zip(tnode_.meta_task.output_types,
                                 task_.output_files):
        source_id = "{t}-{f}".format(t=task_.task_id,
                                     f=file_type_.file_type_id)
        ds_uuid = _get_dataset_uuid_or_create_uuid(path_)
        ds_file_ = DataStoreFile(ds_uuid, source_id,
                                 file_type_.file_type_id, path_)
        ds.add(ds_file_)
        ds.write_update_json(job_resources.datastore_json)

        # Update Services
        services_add_datastore_file(ds_file_)

        dsr = DU.datastore_to_report(ds)
        R.write_report_to_html(dsr, os.path.join(job_resources.html,
                                                 'datastore.html'))
        if file_type_ == FileTypes.REPORT:
            T.write_task_report(job_resources, task_.task_id, path_,
                                DU._get_images_in_dir(task_.output_dir))
            update_analysis_file_links(tnode_.idx, path_)
def to_f(x):
    # sset, add_parent and p are resolved from the enclosing scope
    source_id = "out-1"
    sset_out = sset.copy()
    sset_out.newUuid(random=True)
    if add_parent:
        sset_out.metadata.addParentDataSet(sset.uuid,
                                           sset.datasetType,
                                           createdBy="AnalysisJob",
                                           timeStampedName="")
    file_name = "file-{x:03d}.subreadset.xml".format(x=x)
    out_path = os.path.join(p, file_name)
    sset_out.write(out_path)
    sset_uuid = sset_out.uniqueId
    name = "subreadset-{}".format(x)
    dsf = DataStoreFile(sset_uuid, source_id,
                        FileTypes.DS_SUBREADS.file_type_id,
                        file_name,
                        name=name,
                        description="{} Example Description".format(name))
    return dsf
def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts,
                                       workflow_level_opts, ep_d):
    """
    Create job resource dirs and setup log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """
    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")
    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path,
                        stdout_level)

    log.info("Starting pbsmrtpipe v{v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)
    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType and store in the DataStore? or
    # does DataStore only represent outputs?
    smrtpipe_log_df = DataStoreFile(str(uuid.uuid4()),
                                    "pbsmrtpipe::pbsmrtpipe.log",
                                    FileTypes.LOG.file_type_id,
                                    pb_log_path,
                                    name="Analysis Log",
                                    description="pbsmrtpipe log")
    master_log_df = DataStoreFile(str(uuid.uuid4()),
                                  "pbsmrtpipe::master.log",
                                  FileTypes.LOG.file_type_id,
                                  master_log_path,
                                  name="Master Log",
                                  description="Master log")
    ds = write_and_initialize_data_store_json(job_resources.datastore_json,
                                              [smrtpipe_log_df,
                                               master_log_df])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(
        workflow_level_opts,
        os.path.join(job_resources.workflow, 'options-workflow.json'))

    if workflow_level_opts.system_message is not None:
        slog.info("Command: {m}".format(m=workflow_level_opts.system_message))

    slog.info("Entry Points:")
    slog.info("\n" + pprint.pformat(ep_d, indent=4))

    slog.info("Workflow Options:")
    slog.info("\n" + pprint.pformat(workflow_level_opts.to_dict(), indent=4))

    slog.info("Task Options:")
    slog.info("\n" + pprint.pformat(task_opts, indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)
    log.info("wrote current env to {e}".format(e=env_path))

    try:
        sa_system, sa_components = \
            IO.get_smrtanalysis_system_and_components_from_env()
        log.info(sa_system)
        for c in sa_components:
            log.info(c)
    except Exception:
        # black hole exception; the version lookup is best-effort only
        log.warn("unable to determine SMRT Analysis version.")

    slog.info(
        "completed setting up job directory resources and logs in {r}".format(
            r=job_root_dir))

    return job_resources, ds, master_log_df
def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts,
                                       workflow_level_opts, ep_d):
    """
    Create job resource dirs and setup log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """
    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")
    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path,
                        stdout_level)

    log.info("Starting pbsmrtpipe {v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)
    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType and store in the DataStore? or
    # does DataStore only represent outputs?

    # For historical reasons, this is a bit non-obvious. The "master" log now
    # lives at the SMRT Link level, so the pbsmrtpipe "master" log (i.e.,
    # master.log) has been promoted to be the Analysis Details Log, using the
    # pbsmrtpipe::pbsmrtpipe.log source id. There is also a friction point
    # between marketing's "Analysis" and "pbsmrtpipe", which has generated
    # some inconsistency.
    smrtpipe_log_df = DataStoreFile(str(uuid.uuid4()),
                                    GlobalConstants.SOURCE_ID_INFO_LOG,
                                    FileTypes.LOG.file_type_id,
                                    pb_log_path,
                                    name="Analysis Log",
                                    description="pbsmrtpipe INFO log")
    master_log_df = DataStoreFile(str(uuid.uuid4()),
                                  GlobalConstants.SOURCE_ID_MASTER_LOG,
                                  FileTypes.LOG.file_type_id,
                                  master_log_path,
                                  name="Analysis Details Log",
                                  description="Analysis Details log")
    ds = write_and_initialize_data_store_json(job_resources.datastore_json,
                                              [smrtpipe_log_df,
                                               master_log_df])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(
        workflow_level_opts,
        os.path.join(job_resources.workflow, 'options-workflow.json'))

    if workflow_level_opts.system_message is not None:
        slog.info("Command: {m}".format(m=workflow_level_opts.system_message))

    slog.info("Entry Points:")
    slog.info("\n" + pprint.pformat(ep_d, indent=4))

    slog.info("Workflow Options:")
    slog.info("\n" + pprint.pformat(workflow_level_opts.to_dict(), indent=4))

    slog.info("Task Options:")
    slog.info("\n" + pprint.pformat(task_opts, indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)
    log.info("wrote current env to {e}".format(e=env_path))

    slog.info(
        "completed setting up job directory resources and logs in {r}".format(
            r=job_root_dir))

    return job_resources, ds, master_log_df