def test_version(self):
    # this is just for logging purposes.
    log.info("Running pbsmrtpipe version {x}".format(x=pbsmrtpipe.get_version()))
    log.info("Running on platform {p}".format(p=platform.platform()))
    log.info("Running on {s}".format(s=platform.node()))
    log.info("Running nproc {n}".format(n=multiprocessing.cpu_count()))
    self.assertIsNotNone(pbsmrtpipe.get_version())
def pipeline_template_to_dict(pipeline, rtasks):
    """
    Convert and write the pipeline template to an avro-compatible dict

    :type pipeline: Pipeline
    """
    options = []
    task_pboptions = []
    joptions = _pipeline_to_task_options(rtasks, pipeline)

    for jtopt in joptions:
        try:
            pbopt = _option_jschema_to_pb_option(jtopt)
            task_pboptions.append(pbopt)
        except Exception as e:
            sys.stderr.write("Failed to convert {p}\n".format(p=jtopt))
            raise e

    all_entry_points = [_to_entry_bindings(rtasks, bs[0], bs[1])
                        for bs in pipeline.entry_bindings]
    entry_points_d = {d['entryId']: d for d in all_entry_points}

    bindings = [PipelineBinding(_to_pipeline_binding(b_out), _to_pipeline_binding(b_in))
                for b_out, b_in in pipeline.bindings]

    tags = list(set(pipeline.tags))

    desc = "Pipeline {i} description".format(i=pipeline.idx) if pipeline.description is None else pipeline.description
    comment = "Created pipeline {i} with pbsmrtpipe v{v}".format(i=pipeline.idx, v=pbsmrtpipe.get_version())

    return dict(id=pipeline.pipeline_id,
                name=pipeline.display_name,
                _comment=comment,
                version=pipeline.version,
                entryPoints=entry_points_d.values(),
                bindings=[b.to_dict() for b in bindings],
                tags=tags,
                options=options,
                taskOptions=[x.to_dict() for x in task_pboptions],
                description=desc)
def get_main_parser():
    desc = "Tool for testing current workflow configuration."
    p = get_default_argparser(pbsmrtpipe.get_version(), desc)
    _add_preset_xml_option(p)
    _add_full_option(p)
    p.set_defaults(func=_args_run_diagnostics)
    return p
def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts, workflow_level_opts, ep_d):
    """
    Create job resource dirs and setup log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """
    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")

    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path, stdout_level)

    log.info("Starting pbsmrtpipe v{v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)
    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType
    ds = write_and_initialize_data_store_json(job_resources.datastore_json, [])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(workflow_level_opts,
                            os.path.join(job_resources.workflow, 'options-workflow.json'))
    log.info("Workflow Options:")
    log.info(pprint.pformat(workflow_level_opts.to_dict(), indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)

    try:
        sa_system, sa_components = IO.get_smrtanalysis_system_and_components_from_env()
        log.info(sa_system)
        for c in sa_components:
            log.info(c)
    except Exception:
        # black hole exception
        log.warn("unable to determine SMRT Analysis version.")

    slog.info("completed setting up job directory resources and logs in {r}".format(r=job_root_dir))
    return job_resources, ds
def get_parser():
    desc = "Pbsmrtpipe workflow engine"
    p = get_default_argparser(pbsmrtpipe.get_version(), desc)

    sp = p.add_subparsers(help='commands')

    def builder(subparser_id, description, options_func, exe_func):
        TU.subparser_builder(sp, subparser_id, description, options_func, exe_func)

    wf_desc = "Run a pipeline using a pipeline template or with explicit Bindings and EntryPoints."
    builder('pipeline', wf_desc, add_pipline_parser_options, _args_run_pipeline)

    # Run a pipeline by id
    pipeline_id_desc = "Run a registered pipeline by specifying the pipeline id."
    builder('pipeline-id', pipeline_id_desc, add_pipeline_id_parser_options, _args_run_pipeline_id)

    builder('task', "Run Task by id.", add_task_parser_options, _args_task_runner)

    # Show Templates
    desc = "List all pipeline templates. A pipeline 'id' can be referenced in " \
           "your my_pipeline.xml file using '<import-template id=\"pbsmrtpipe.pipelines.my_pipeline_id\" />'. This " \
           "can replace the explicit listing of EntryPoints and Bindings."
    builder('show-templates', desc, add_run_show_templates_options, _args_run_show_templates)

    # Show Template Details
    builder('show-template-details', "Show details about a specific Pipeline template.",
            add_show_template_details_parser_options, _args_run_show_template_details)

    # Show Tasks
    show_tasks_desc = "Show the complete list of Tasks by id. Use ENV {x} to define a " \
                      "custom directory of tool contracts. These TCs will override " \
                      "the installed TCs (e.g., {x}=/path/to/my-tc-dir/)".format(x=ENV_TC_DIR)
    builder('show-tasks', show_tasks_desc, lambda x: x, _args_run_show_tasks)

    # Show Task id details
    desc_task_details = "Show details of a particular task by id (e.g., 'pbsmrtpipe.tasks.filter_report'). " \
                        "Use 'show-tasks' to get the complete list of registered tasks."
    builder('show-task-details', desc_task_details, add_show_task_options, _args_run_show_task_details)

    wfo_desc = "Display all workflow level options that can be set in <options /> for preset.xml"
    builder('show-workflow-options', wfo_desc, _add_output_preset_xml_option, _args_run_show_workflow_level_options)

    diag_desc = "Diagnostic tests of preset.xml and cluster configuration"
    builder('run-diagnostic', diag_desc, add_args_run_diagnstic, _args_run_diagnostics)

    return p
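# --- Example (not part of pbsmrtpipe) ---
# A minimal standalone sketch of the subparser-builder pattern used in
# get_parser() above. pbsmrtpipe's TU.subparser_builder presumably does
# similar wiring; the names here (subparser_builder, _example) are
# illustrative only.
import argparse
import sys


def subparser_builder(subparsers, subparser_id, description, options_func, exe_func):
    # Create the subcommand, let options_func attach its arguments, then
    # bind the runner so a dispatcher can call args.func(args).
    p = subparsers.add_parser(subparser_id, help=description)
    options_func(p)
    p.set_defaults(func=exe_func)
    return p


def _example():
    p = argparse.ArgumentParser(description="demo")
    sp = p.add_subparsers(help='commands')
    subparser_builder(sp, 'greet', "Print a greeting",
                      lambda x: x.add_argument('name'),
                      lambda args: sys.stdout.write("Hello {n}\n".format(n=args.name)))
    args = p.parse_args(['greet', 'world'])
    return args.func(args)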
def to_dict(self):
    t = self.task.to_dict()
    if self.cluster:
        cr = {name: str(t) for name, t in self.cluster.cluster_templates.iteritems()}
    else:
        cr = None
    return dict(id=self.task.task_id,
                task=t,
                env={},
                cluster=cr,
                version=pbsmrtpipe.get_version(),
                resource_types=self.task.resources)
def to_dict(self):
    t = self.task.to_dict()
    if self.cluster:
        cr = {name: str(t) for name, t in self.cluster.cluster_templates.iteritems()}
    else:
        cr = None
    # this duplication of task ids should just be flattened out by clarifying
    # at the model level
    return dict(id=self.task.task_id,
                uuid=self.task.uuid,
                task=t,
                env={},
                cluster=cr,
                version=pbsmrtpipe.get_version(),
                resource_types=self.task.resources)
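# --- Example (not part of pbsmrtpipe) ---
# A quick check of the name shadowing in the two to_dict() methods above: the
# dict comprehension reuses 't' as its loop variable. In Python 2.7 and 3,
# dict comprehensions get their own scope, so the outer 't' (the task dict)
# is untouched. The sample values below are made up.
t = "outer"
cr = {name: str(t) for name, t in [("a", 1), ("b", 2)]}
assert t == "outer" and cr == {"a": "1", "b": "2"}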
def _pbsmrtipe_setup_log(alog, **kwargs):
    """Set up the stdout log.

    pbsmrtpipe will set up pbsmrtpipe.log and master.log.

    This should only emit 'status.*' messages.
    """
    str_formatter = '[%(levelname)s] %(asctime)-15s %(message)s'
    level = kwargs.get('level', logging.INFO)

    setup_log(alog,
              level=level,
              file_name=None,
              log_filter=StdOutStatusLogFilter(),
              str_formatter=str_formatter)

    slog.info("Starting pbsmrtpipe v{v}".format(v=pbsmrtpipe.get_version()))
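# --- Example (not part of pbsmrtpipe) ---
# A hypothetical sketch of what a filter like StdOutStatusLogFilter might do,
# given the docstring above: pass only records emitted by loggers under a
# 'status' namespace, so the stdout handler stays quiet for everything else.
# This is an assumption about its behavior, not pbsmrtpipe's actual class.
import logging


class _StatusOnlyFilter(logging.Filter):

    def filter(self, record):
        # Accept only records whose logger name is rooted at 'status'.
        return record.name == 'status' or record.name.startswith('status.')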
def _to_report(bg, job_output_dir, job_id, state, was_successful, run_time, error_message=None):
    """
    High Level Report of the workflow state

    Write the output of the workflow datastore to a pbreports report object

    Workflow summary .dot/svg (collapsed workflow)
    Workflow details .dot/svg (chunked workflow)

    To add:
    - Resolved WorkflowSettings (e.g., nproc, max_workers)

    :type bg: BindingsGraph
    """
    emsg = "" if error_message is None else error_message

    attributes = [Attribute('was_successful', was_successful, name="Was Successful"),
                  Attribute('total_run_time_sec', int(run_time), name="Walltime (sec)"),
                  Attribute('error_message', emsg, name="Error Message"),
                  Attribute('job_id', job_id, name="Job Id"),
                  Attribute('job_state', state, name="Job State"),
                  Attribute('job_output_dir', job_output_dir, name="Job Output Directory"),
                  Attribute('pbsmrtpipe_version', pbsmrtpipe.get_version(), name="pbsmrtpipe Version")]

    columns = [Column('task_id', header='Task id'),
               Column('was_successful', header='Was Successful'),
               Column('state', header="Task State"),
               Column('run_time_sec', header="Run Time (sec)"),
               Column('nproc', header="# of procs")]

    tasks_table = Table('tasks', columns=columns)
    for tnode in bg.all_task_type_nodes():
        tasks_table.add_data_by_column_id('task_id', str(tnode))
        tasks_table.add_data_by_column_id('nproc', bg.node[tnode]['nproc'])
        tasks_table.add_data_by_column_id('state', bg.node[tnode]['state'])
        tasks_table.add_data_by_column_id('was_successful', bg.node[tnode]['state'] == TaskStates.SUCCESSFUL)
        # rt_ = bg.node[tnode]['run_time']
        # rtime = None if rt_ is None else int(rt_)
        tasks_table.add_data_by_column_id('run_time_sec', bg.node[tnode]['run_time'])

    ep_table = _to_table("entry_points", bg, bg.entry_binding_nodes())
    fnodes_table = _to_table("file_node", bg, bg.file_nodes())

    report = Report('pbsmrtpipe',
                    tables=[tasks_table, ep_table, fnodes_table],
                    attributes=attributes)
    return report
def _pbsmrtipe_setup_log(alog, **kwargs):
    """Set up the stdout log.

    pbsmrtpipe will set up pbsmrtpipe.log and master.log.

    This should only emit 'status.*' messages.
    """
    # This is essentially just a bootstrapping step before the job-dir/logs
    # can be created and the proper log files (pbsmrtpipe.log, master.log)
    # are set up. For this to work with the new global dict setup model, it
    # would have to be extended to support adding a custom filter.
    str_formatter = '%(message)s'
    level = kwargs.get('level', logging.INFO)

    setup_log(alog,
              level=level,
              file_name=None,
              log_filter=StdOutStatusLogFilter(),
              str_formatter=str_formatter)

    slog.info("Starting pbsmrtpipe v{v}".format(v=pbsmrtpipe.get_version()))
def _wrapper(*args, **kwargs):
    started_at = time.time()
    state = False
    try:
        state = func(*args, **kwargs)
    except Exception as e:
        emsg = "Error executing function {f}".format(f=func.__name__)
        log.error(emsg)
        sys.stderr.write(emsg + "\n")
        slog.error(e)
        exc_type, exc_value, exc_tb = sys.exc_info()
        traceback.print_exception(exc_type, exc_value, exc_tb, file=sys.stderr)
        type_, value_, traceback_ = sys.exc_info()
        log_traceback(slog, e, traceback_)
        log_traceback(log, e, traceback_)
        state = False
    finally:
        print "Shutting down."
        run_time = time.time() - started_at
        run_time_min = run_time / 60.0
        _m = "was Successful" if state else "Failed"
        msg = "Completed execution pbsmrtpipe v{x}. Workflow {s} in {r:.2f} sec ({m:.2f} min)".format(
            s=_m, r=run_time, x=pbsmrtpipe.get_version(), m=run_time_min)
        slog.info(msg)
        log.info(msg)
        if not state:
            sys.stderr.write(msg + "\n")
    return 0 if state else -1
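# --- Example (not part of pbsmrtpipe) ---
# A standalone sketch of the decorator pattern _wrapper implements: time the
# call, trap any exception, report a summary, and map success/failure to a
# process exit code. The names (as_exit_code, etc.) are illustrative only.
import functools
import sys
import time
import traceback


def as_exit_code(func):

    @functools.wraps(func)
    def _wrapper(*args, **kwargs):
        started_at = time.time()
        state = False
        try:
            state = func(*args, **kwargs)
        except Exception:
            traceback.print_exc(file=sys.stderr)
        finally:
            run_time = time.time() - started_at
            _m = "was Successful" if state else "Failed"
            sys.stderr.write("Workflow {s} in {r:.2f} sec\n".format(s=_m, r=run_time))
        return 0 if state else -1

    return _wrapper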
def pipeline_template_to_dict(pipeline, rtasks):
    """
    Convert and write the pipeline template to an avro-compatible dict

    :type pipeline: Pipeline
    """
    options = []
    task_pboptions = []
    joptions = _pipeline_to_task_options(rtasks, pipeline)

    for jtopt in joptions:
        try:
            pbopt = _option_jschema_to_pb_option(jtopt)
            task_pboptions.append(pbopt)
        except Exception as e:
            log.error("Failed to convert {p}\n".format(p=jtopt))
            raise e

    # The Pipeline entry points and bindings should have been objects, not
    # these encoded "simple" versions. This generates a bunch of
    # dictionary-mania nonsense
    all_entry_points = [_to_entry_bindings(rtasks, bs[0], bs[1])
                        for bs in pipeline.entry_bindings]

    # The Entry Points only communicate the fundamental interface to the
    # pipeline. This allows the pipeline instance to be loaded from the JSON file
    entry_points_d = {}
    for d in all_entry_points:
        i = d['entryId']
        if i in entry_points_d:
            entry_points_d[i]['tasks'].append(d['task'])
        else:
            entry_points_d[i] = dict(entryId=i, name=d['name'],
                                     fileTypeId=d['fileTypeId'],
                                     tasks=[d['task']])

    bindings = [PipelineBinding(_to_pipeline_binding(b_out), _to_pipeline_binding(b_in))
                for b_out, b_in in pipeline.bindings]

    tags = list(set(pipeline.tags))

    desc = "Pipeline {i} description".format(i=pipeline.idx) if pipeline.description is None else pipeline.description
    comment = "Created pipeline {i} with pbsmrtpipe v{v}".format(i=pipeline.idx, v=pbsmrtpipe.get_version())

    # Sort the Task Options by id to group by namespace and have slightly
    # better diffs on the json files
    sorted_task_options_d = sorted([x.to_dict() for x in task_pboptions], key=lambda x: x['id'])

    return dict(id=pipeline.pipeline_id,
                name=pipeline.display_name,
                _comment=comment,
                version=pipeline.version,
                entryPoints=entry_points_d.values(),
                bindings=[b.to_dict() for b in bindings],
                tags=tags,
                options=options,
                taskOptions=sorted_task_options_d,
                description=desc)
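# --- Example (not part of pbsmrtpipe) ---
# A minimal standalone illustration of the entry_points_d grouping in
# pipeline_template_to_dict above: records sharing an 'entryId' collapse into
# one record whose 'tasks' list accumulates each bound task. The sample
# records are made up.
all_entry_points = [
    dict(entryId='e_01', name='Entry 1', fileTypeId='F.1', task='t.a'),
    dict(entryId='e_01', name='Entry 1', fileTypeId='F.1', task='t.b'),
    dict(entryId='e_02', name='Entry 2', fileTypeId='F.2', task='t.c'),
]

entry_points_d = {}
for d in all_entry_points:
    i = d['entryId']
    if i in entry_points_d:
        entry_points_d[i]['tasks'].append(d['task'])
    else:
        entry_points_d[i] = dict(entryId=i, name=d['name'],
                                 fileTypeId=d['fileTypeId'], tasks=[d['task']])

assert entry_points_d['e_01']['tasks'] == ['t.a', 't.b']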
def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts, workflow_level_opts, ep_d):
    """
    Create job resource dirs and setup log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """
    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")

    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path, stdout_level)

    log.info("Starting pbsmrtpipe v{v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)
    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType and store in the DataStore? or
    # does DataStore only represent outputs?
    smrtpipe_log_df = DataStoreFile(str(uuid.uuid4()),
                                    "pbsmrtpipe::pbsmrtpipe.log",
                                    FileTypes.LOG.file_type_id,
                                    pb_log_path,
                                    name="Analysis Log",
                                    description="pbsmrtpipe log")
    master_log_df = DataStoreFile(str(uuid.uuid4()),
                                  "pbsmrtpipe::master.log",
                                  FileTypes.LOG.file_type_id,
                                  master_log_path,
                                  name="Master Log",
                                  description="Master log")
    ds = write_and_initialize_data_store_json(job_resources.datastore_json,
                                              [smrtpipe_log_df, master_log_df])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(workflow_level_opts,
                            os.path.join(job_resources.workflow, 'options-workflow.json'))

    if workflow_level_opts.system_message is not None:
        slog.info("Command: {m}".format(m=workflow_level_opts.system_message))

    slog.info("Entry Points:")
    slog.info("\n" + pprint.pformat(ep_d, indent=4))
    slog.info("Workflow Options:")
    slog.info("\n" + pprint.pformat(workflow_level_opts.to_dict(), indent=4))
    slog.info("Task Options:")
    slog.info("\n" + pprint.pformat(task_opts, indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)
    log.info("wrote current env to {e}".format(e=env_path))

    try:
        sa_system, sa_components = IO.get_smrtanalysis_system_and_components_from_env()
        log.info(sa_system)
        for c in sa_components:
            log.info(c)
    except Exception:
        # black hole exception
        log.warn("unable to determine SMRT Analysis version.")

    slog.info("completed setting up job directory resources and logs in {r}".format(r=job_root_dir))
    return job_resources, ds, master_log_df
def _to_report(bg, job_output_dir, job_id, state, was_successful, run_time,
               error_message=None, report_uuid=None):
    """
    High Level Report of the workflow state

    Write the output of the workflow datastore to a pbreports report object

    Workflow summary .dot/svg (collapsed workflow)
    Workflow details .dot/svg (chunked workflow)

    To add:
    - Resolved WorkflowSettings (e.g., nproc, max_workers)

    :type bg: BindingsGraph
    """
    emsg = "" if error_message is None else error_message

    columns = [Column('task_id', header='Task id'),
               Column('was_successful', header='Was Successful'),
               Column('state', header="Task State"),
               Column('run_time_sec', header="Run Time (sec)"),
               Column('nproc', header="# of procs"),
               Column("num_core_hours", header="Core Hours")]

    tasks_table = Table('tasks', title="Tasks", columns=columns)
    for tnode in bg.all_task_type_nodes():
        nproc = bg.node[tnode]['nproc']
        # the task might not be completed.
        run_time_sec = bg.node[tnode]['run_time']
        if run_time_sec is None:
            core_hours = 0.0
        else:
            core_hours = (run_time_sec / 60.0 / 60.0) * nproc

        tasks_table.add_data_by_column_id('task_id', str(tnode))
        tasks_table.add_data_by_column_id('nproc', bg.node[tnode]['nproc'])
        tasks_table.add_data_by_column_id('state', bg.node[tnode]['state'])
        tasks_table.add_data_by_column_id('was_successful', bg.node[tnode]['state'] == TaskStates.SUCCESSFUL)
        # rt_ = bg.node[tnode]['run_time']
        # rtime = None if rt_ is None else int(rt_)
        tasks_table.add_data_by_column_id('run_time_sec', bg.node[tnode]['run_time'])
        tasks_table.add_data_by_column_id('num_core_hours', round(core_hours, 4))

    total_core_hours = sum(tasks_table.get_column_by_id('num_core_hours').values)

    attributes = [Attribute('was_successful', was_successful, name="Was Successful"),
                  Attribute('total_run_time_sec', int(run_time), name="Walltime (sec)"),
                  Attribute('error_message', emsg, name="Error Message"),
                  Attribute('job_id', job_id, name="Job Id"),
                  Attribute('job_state', state, name="Job State"),
                  Attribute('job_output_dir', job_output_dir, name="Job Output Directory"),
                  Attribute('pbsmrtpipe_version', pbsmrtpipe.get_version(), name="pbsmrtpipe Version"),
                  Attribute('total_core_hours', round(total_core_hours, 4), "Total core hours")]

    ep_table = _to_table("entry_points", bg, bg.entry_binding_nodes(), "Entry Points")
    fnodes_table = _to_table("file_node", bg, bg.file_nodes(), "File Nodes")

    # It would be nice if the DataSet UUIDs of the entry points were added to
    # the dataset_uuids of the report.
    report = Report('pbsmrtpipe',
                    tables=[tasks_table, ep_table, fnodes_table],
                    attributes=attributes,
                    uuid=report_uuid)
    return report
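# --- Example (not part of pbsmrtpipe) ---
# A worked instance of the core-hour formula used in _to_report above:
# core_hours = (run_time_sec / 60.0 / 60.0) * nproc. A task that ran for
# 5400 seconds (1.5 hours) on 8 procs contributes 12.0 core hours.
run_time_sec = 5400
nproc = 8
core_hours = (run_time_sec / 60.0 / 60.0) * nproc
assert round(core_hours, 4) == 12.0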
def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts, workflow_level_opts, ep_d):
    """
    Create job resource dirs and setup log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """
    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")

    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path, stdout_level)

    log.info("Starting pbsmrtpipe {v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)
    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType and store in the DataStore? or
    # does DataStore only represent outputs?

    # For historical reasons, this is a bit non-obvious. The "master" log now
    # lives at the SMRT Link level, so we've promoted the pbsmrtpipe "master"
    # log (i.e., master.log) to be the Analysis Details Log using the
    # pbsmrtpipe::pbsmrtpipe.log source Id. There's also the friction point of
    # marketing using "Analysis" vs "pbsmrtpipe", which has generated some
    # inconsistency.
    smrtpipe_log_df = DataStoreFile(str(uuid.uuid4()),
                                    GlobalConstants.SOURCE_ID_INFO_LOG,
                                    FileTypes.LOG.file_type_id,
                                    pb_log_path,
                                    name="Analysis Log",
                                    description="pbsmrtpipe INFO log")
    master_log_df = DataStoreFile(str(uuid.uuid4()),
                                  GlobalConstants.SOURCE_ID_MASTER_LOG,
                                  FileTypes.LOG.file_type_id,
                                  master_log_path,
                                  name="Analysis Details Log",
                                  description="Analysis Details log")
    ds = write_and_initialize_data_store_json(job_resources.datastore_json,
                                              [smrtpipe_log_df, master_log_df])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(workflow_level_opts,
                            os.path.join(job_resources.workflow, 'options-workflow.json'))

    if workflow_level_opts.system_message is not None:
        slog.info("Command: {m}".format(m=workflow_level_opts.system_message))

    slog.info("Entry Points:")
    slog.info("\n" + pprint.pformat(ep_d, indent=4))
    slog.info("Workflow Options:")
    slog.info("\n" + pprint.pformat(workflow_level_opts.to_dict(), indent=4))
    slog.info("Task Options:")
    slog.info("\n" + pprint.pformat(task_opts, indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)
    log.info("wrote current env to {e}".format(e=env_path))

    slog.info("completed setting up job directory resources and logs in {r}".format(r=job_root_dir))
    return job_resources, ds, master_log_df