def run_pipeline(registered_pipelines_d, registered_file_types_d, registered_tasks_d,
                 chunk_operators, workflow_template_xml, entry_points_d, output_dir,
                 preset_xml, rc_preset_or_none, service_uri,
                 force_distribute=None, force_chunk_mode=None, debug_mode=None):
    """
    Entry point for running a pipeline

    :param workflow_template_xml: path to workflow xml
    :param entry_points_d:
    :param output_dir:
    :param preset_xml: path to preset xml (or None)
    :return: exit code

    :type registered_tasks_d: dict[str, pbsmrtpipe.pb_tasks.core.MetaTask]
    :type registered_file_types_d: dict[str, pbsmrtpipe.pb_tasks.core.FileType]
    :type workflow_template_xml: str
    :type output_dir: str
    :type preset_xml: str | None
    :type service_uri: str | None
    :type force_distribute: None | bool

    :rtype: int
    """
    log.debug(pprint.pformat(entry_points_d))

    workflow_bindings, workflow_level_opts, task_opts, cluster_render = _load_io_for_workflow(
        registered_tasks_d, registered_pipelines_d, workflow_template_xml,
        entry_points_d, preset_xml, rc_preset_or_none,
        force_distribute=force_distribute, force_chunk_mode=force_chunk_mode,
        debug_mode=debug_mode)

    slog.info("building graph")
    bg = B.binding_strs_to_binding_graph(registered_tasks_d, workflow_bindings)
    slog.info("successfully loaded graph from bindings.")

    # Disable chunk operators if necessary
    valid_chunk_operators = {}
    if workflow_level_opts.chunk_mode is False:
        slog.info("Chunk mode is False. Disabling {n} chunk operators.".format(n=len(chunk_operators)))
    else:
        # Validate chunk operators; skip any that are malformed.
        for chunk_operator_id, chunk_operator in chunk_operators.iteritems():
            try:
                validate_operator(chunk_operator, registered_tasks_d)
                valid_chunk_operators[chunk_operator_id] = chunk_operator
            except MalformedChunkOperatorError as e:
                log.warn("Invalid chunk operator {i}. {m}".format(i=chunk_operator_id, m=e.message))

    filtered_chunk_operators_d = _filter_chunk_operators(bg, valid_chunk_operators)

    # Container to hold all the resources
    global_registry = GlobalRegistry(registered_tasks_d,
                                     registered_file_types_d,
                                     filtered_chunk_operators_d,
                                     cluster_render)

    return exe_workflow(global_registry, entry_points_d, bg, task_opts,
                        workflow_level_opts, output_dir, service_uri)
def run_pipeline(registered_pipelines_d, registered_file_types_d, registered_tasks_d,
                 chunk_operators, workflow_template_xml, entry_points_d, output_dir,
                 preset_xmls, rc_preset_or_none, service_uri,
                 force_distribute=None, force_chunk_mode=None, debug_mode=None):
    """
    Entry point for running a pipeline

    :param workflow_template_xml: path to workflow xml
    :param entry_points_d:
    :param output_dir:
    :param preset_xmls: list of paths to preset XML files
    :return: exit code

    :type registered_tasks_d: dict[str, pbsmrtpipe.pb_tasks.core.MetaTask]
    :type registered_file_types_d: dict[str, pbsmrtpipe.pb_tasks.core.FileType]
    :type workflow_template_xml: str
    :type output_dir: str
    :type preset_xmls: list[str]
    :type service_uri: str | None
    :type force_distribute: None | bool

    :rtype: int
    """
    log.debug(pprint.pformat(entry_points_d))

    workflow_bindings, workflow_level_opts, task_opts, cluster_render = _load_io_for_workflow(
        registered_tasks_d, registered_pipelines_d, workflow_template_xml,
        entry_points_d, preset_xmls, rc_preset_or_none,
        force_distribute=force_distribute, force_chunk_mode=force_chunk_mode,
        debug_mode=debug_mode)

    slog.info("building graph")
    bg = B.binding_strs_to_binding_graph(registered_tasks_d, workflow_bindings)
    slog.info("successfully loaded graph from bindings.")

    # Disable chunk operators if necessary
    valid_chunk_operators = {}
    if workflow_level_opts.chunk_mode is False:
        slog.info("Chunk mode is False. Disabling {n} chunk operators.".format(n=len(chunk_operators)))
    else:
        # Validate chunk operators; skip any that are malformed.
        for chunk_operator_id, chunk_operator in chunk_operators.iteritems():
            try:
                validate_operator(chunk_operator, registered_tasks_d)
                valid_chunk_operators[chunk_operator_id] = chunk_operator
            except MalformedChunkOperatorError as e:
                log.warn("Invalid chunk operator {i}. {m}".format(i=chunk_operator_id, m=e.message))

    filtered_chunk_operators_d = _filter_chunk_operators(bg, valid_chunk_operators)

    # Container to hold all the resources
    global_registry = GlobalRegistry(registered_tasks_d,
                                     registered_file_types_d,
                                     filtered_chunk_operators_d,
                                     cluster_render)

    return exe_workflow(global_registry, entry_points_d, bg, task_opts,
                        workflow_level_opts, output_dir, service_uri)
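# A hypothetical invocation sketch for run_pipeline. The loader call mirrors
# L.load_all() in test_all_sane below; the entry-point id, file paths, and
# pipeline XML are made-up placeholders, not values from this codebase.
rtasks, rfile_types, chunk_operators, pipelines = L.load_all()
entry_points_d = {"eid_subread": "/path/to/movie.subreadset.xml"}  # hypothetical
exit_code = run_pipeline(pipelines, rfile_types, rtasks, chunk_operators,
                         "/path/to/pipeline.xml",  # workflow_template_xml
                         entry_points_d,
                         "/path/to/job_output",    # output_dir
                         ["/path/to/preset.xml"],  # preset_xmls is a list of paths
                         None,                     # rc_preset_or_none
                         None)                     # service_uri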
def run_single_task(registered_file_types_d, registered_tasks_d, chunk_operators,
                    entry_points_d, task_id, output_dir, preset_xmls, rc_preset_or_none,
                    service_config, force_distribute=None, force_chunk_mode=None,
                    debug_mode=None):
    """
    Entry Point for running a single task

    :param task_id:
    :param output_dir:
    :return:
    """
    log.debug(pprint.pformat(entry_points_d))

    meta_task = registered_tasks_d.get(task_id, None)
    if meta_task is None:
        raise KeyError("Unable to find task id '{i}' in registered tasks. Use "
                       "'show-tasks' to get a list of registered tasks.".format(i=task_id))

    workflow_level_opts, task_opts, cluster_render = _load_io_for_task(
        registered_tasks_d, entry_points_d, preset_xmls, rc_preset_or_none,
        force_distribute=force_distribute, force_chunk_mode=force_chunk_mode,
        debug_mode=debug_mode)

    slog.info("building bindings graph")
    binding_str = _task_to_binding_strings(meta_task)
    bg = B.binding_strs_to_binding_graph(registered_tasks_d, binding_str)
    slog.info("successfully built bindings graph for task {i}".format(i=task_id))

    # Keep only the chunk operators that validate against the registered tasks
    valid_chunk_operators = {k: v for k, v in chunk_operators.iteritems()
                             if validate_operator(v, registered_tasks_d)}
    filtered_chunk_operators_d = _filter_chunk_operators(bg, valid_chunk_operators)

    # Container to hold all the resources
    global_registry = GlobalRegistry(registered_tasks_d,
                                     registered_file_types_d,
                                     filtered_chunk_operators_d,
                                     cluster_render)

    return exe_workflow(global_registry, entry_points_d, bg, task_opts,
                        workflow_level_opts, output_dir, service_config)
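# A hypothetical call sketch for running one registered task end-to-end; the
# task id, entry-point mapping, and paths are illustrative assumptions only.
exit_code = run_single_task(rfile_types, rtasks, chunk_operators,
                            {"e_01": "/path/to/input.fasta"},    # entry_points_d
                            "pbsmrtpipe.tasks.dev_hello_world",  # task_id (hypothetical)
                            "/path/to/task_output",
                            ["/path/to/preset.xml"],             # preset_xmls
                            None,                                # rc_preset_or_none
                            None)                                # service_config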
def setUpClass(cls):
    pipeline = REGISTERED_PIPELINES[cls.PB_PIPELINE_ID]
    log.debug(pipeline)
    cls.bindings = pipeline.all_bindings
    cls.EPOINTS_D = {k: get_temp_file(v) for k, v in cls.EPOINTS_NAMES.iteritems()}
    log.debug(pprint.pformat(cls.bindings, indent=4))
    log.debug("Number of registered tasks {n}".format(n=len(REGISTERED_TASKS)))
    cls.bgraph = B.binding_strs_to_binding_graph(REGISTERED_TASKS, cls.bindings)
    d = os.path.expanduser('~/scratch/tmp_pbsmrtpipe') if getpass.getuser() == 'mkocher' else None
    cls.output_dir = tempfile.mkdtemp(prefix='job_test_', dir=d)
    preset_record = IO.parse_pipeline_preset_xml(os.path.join(TEST_DATA_DIR, cls.PRESET_XML))
    cls.workflow_options = preset_record.to_workflow_level_opt()
    # leave this for now
    cls.envs = []
    cls.cluster_engine = C.load_installed_cluster_templates_by_name("sge")
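# A minimal sketch of a concrete test case supplying the class attributes that
# setUpClass above reads; the base-class name, pipeline id, entry-point names,
# and preset file are all hypothetical.
class TestDevPipeline(_TestBase):
    PB_PIPELINE_ID = "pbsmrtpipe.pipelines.dev_01"
    EPOINTS_NAMES = {"e_01": "input.fasta"}
    PRESET_XML = "preset_dev.xml"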
def registry_runner(registry_, rtasks, output_dir, emit_xml=False):
    """
    :type registry_: PipelineRegistry
    :param rtasks:
    :type output_dir: str
    :type emit_xml: bool

    :rtype: int
    """
    # This will emit the PTs (pipeline templates) to an output dir
    import pbsmrtpipe.pb_io as IO  # get around circular imports
    import pbsmrtpipe.graph.bgraph as B

    r = registry_

    log.info("Validating pipelines")
    # Validating pipeline integrity
    for i, p in r.pipelines.iteritems():
        log.debug("Validating pipeline {}".format(i))
        # or this will raise
        bg = B.binding_strs_to_binding_graph(rtasks, list(p.all_bindings))
        is_valid = B.validate_binding_graph_integrity(bg)
        log.info("Pipeline {} is valid? {}".format(i, is_valid))

    # Make the dir if it doesn't exist
    output_pipeline_dir = os.path.abspath(os.path.expanduser(output_dir))

    log.info("Writing {x} Pipeline Templates to {o}".format(o=output_dir, x=len(r.pipelines)))
    print "Emitting pipelines to output dir {d}".format(d=output_pipeline_dir)

    IO.write_pipeline_templates_to_json(r.pipelines.values(), rtasks, output_pipeline_dir)

    if emit_xml:
        for p in r.pipelines.values():
            file_name = p.idx + "_pipeline.xml"
            path = os.path.join(output_dir, file_name)
            xml = IO.pipeline_to_xml(p)
            with open(path, 'w') as f:
                f.write(str(xml))
            log.info("writing pipeline {x}".format(x=path))

    _d = dict(n=len(r.pipelines), d=output_pipeline_dir,
              x=len(r.original_pipeline_ids), a=len(r.all_pipelines))
    log.info("Successfully wrote {n} new pipelines (previously loaded: {x}, all pipelines: {a}) to {d}".format(**_d))
    return 0
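# A hypothetical usage sketch: validate every pipeline in a registry, then
# emit the templates as JSON plus the XML form of each pipeline. 'registry'
# and 'rtasks' are assumed to come from the caller's loader.
exit_code = registry_runner(registry, rtasks, "/path/to/pipeline-templates", emit_xml=True)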
def test_all_sane(self):
    """Test that all pipelines are well defined"""
    errors = []
    rtasks, rfiles_types, chunk_operators, pipelines = L.load_all()

    for pipeline_id, pipeline in pipelines.items():
        emsg = "Pipeline {p} is not valid.".format(p=pipeline_id)
        log.debug("Checking Sanity of registered Pipeline {i}".format(i=pipeline_id))
        log.info(pipeline_id)
        log.debug(pipeline)
        try:
            # Validate with Avro
            d = pipeline_template_to_dict(pipeline, rtasks)
            _ = validate_pipeline_template(d)
            name = pipeline_id + "_pipeline_template.avro"
            output_file = get_temp_file(suffix=name)
            log.info("{p} converted to avro successfully".format(p=pipeline_id))

            bg = BU.binding_strs_to_binding_graph(rtasks, pipeline.all_bindings)
            BU.validate_binding_graph_integrity(bg)
            BU.validate_compatible_binding_file_types(bg)
            validate_entry_points(d)
            # pprint.pprint(d)  # for debugging purposes

            output_json = output_file.replace(".avro", '.json')
            log.info("writing pipeline to {p}".format(p=output_json))
            with open(output_json, 'w') as j:
                j.write(json.dumps(d, sort_keys=True, indent=4))

            log.info("writing pipeline template to {o}".format(o=output_file))
            # Test writing to avro if the pipeline is actually valid
            write_pipeline_template_to_avro(pipeline, rtasks, output_file)
            log.info("Pipeline {p} is valid.".format(p=pipeline_id))

            log.info("Loading avro {i} from {p}".format(i=pipeline_id, p=output_file))
            pipeline_d = load_pipeline_template_from_avro(output_file)
            self.assertIsInstance(pipeline_d, dict)
        except Exception as e:
            m = emsg + " Error: " + e.message
            log.error(m)
            errors.append(m)
            log.error(emsg)
            log.error(e)

    msg = "\n".join(errors) if errors else ""
    self.assertEqual([], errors, msg)
def _run_driver_from_job_config(job_config):
    """
    :type job_config: JobConfig
    :param job_config:
    :return:
    """
    job_output_dir = job_config.tmp_dir_func(job_config.job_name)
    tmp_dir = job_config.tmp_dir_func(job_config.job_name + '_tmp')
    ep_d = {e_id: job_config.tmp_file_func(file_name)
            for e_id, file_name in job_config.ep_d.iteritems()}

    rtasks, chunk_operators = _get_registered_tasks_and_operators()
    rfiles = _get_registered_files()

    bgraph_ = B.binding_strs_to_binding_graph(rtasks, job_config.bindings_str)

    state = _test_run_driver(chunk_operators, rtasks, rfiles, ep_d, bgraph_,
                             job_output_dir, tmp_dir, job_config.task_opts,
                             job_config.cluster_renderer)
    return state
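# A sketch of the JobConfig shape this driver consumes, inferred from the
# attributes accessed above; the real class may differ.
import collections
JobConfig = collections.namedtuple(
    "JobConfig",
    ["job_name", "tmp_dir_func", "tmp_file_func", "ep_d",
     "bindings_str", "task_opts", "cluster_renderer"])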