Example #1
def run_pipeline(registered_pipelines_d, registered_file_types_d, registered_tasks_d,
                 chunk_operators, workflow_template_xml, entry_points_d,
                 output_dir, preset_xml, rc_preset_or_none, service_uri,
                 force_distribute=None, force_chunk_mode=None, debug_mode=None):
    """
    Entry point for running a pipeline

    :param workflow_template_xml: path to workflow xml
    :param entry_points_d: dict of entry point id to input file path
    :param output_dir: path to the job output directory
    :param preset_xml: path to preset xml (or None)
    :return: exit code

    :type registered_tasks_d: dict[str, pbsmrtpipe.pb_tasks.core.MetaTask]
    :type registered_file_types_d: dict[str, pbsmrtpipe.pb_tasks.core.FileType]
    :type workflow_template_xml: str
    :type output_dir: str
    :type preset_xml: str | None
    :type service_uri: str | None
    :type force_distribute: None | bool

    :rtype: int
    """
    log.debug(pprint.pformat(entry_points_d))

    workflow_bindings, workflow_level_opts, task_opts, cluster_render = _load_io_for_workflow(registered_tasks_d,
                                                                                              registered_pipelines_d,
                                                                                              workflow_template_xml,
                                                                                              entry_points_d, preset_xml,
                                                                                              rc_preset_or_none,
                                                                                              force_distribute=force_distribute,
                                                                                              force_chunk_mode=force_chunk_mode,
                                                                                              debug_mode=debug_mode)

    slog.info("building graph")
    bg = B.binding_strs_to_binding_graph(registered_tasks_d, workflow_bindings)
    slog.info("successfully loaded graph from bindings.")

    valid_chunk_operators = {}
    # Disable chunk operators if necessary
    if workflow_level_opts.chunk_mode is False:
        slog.info("Chunk mode is False. Disabling {n} chunk operators.".format(n=len(chunk_operators)))
    else:
        # Validate chunk operators, or skip if malformed.
        for chunk_operator_id, chunk_operator in chunk_operators.iteritems():
            try:
                validate_operator(chunk_operator, registered_tasks_d)
                valid_chunk_operators[chunk_operator_id] = chunk_operator
            except MalformedChunkOperatorError as e:
                log.warn("Invalid chunk operator {i}. {m}".format(i=chunk_operator_id, m=e.message))

    filtered_chunk_operators_d = _filter_chunk_operators(bg, valid_chunk_operators)
    # Container to hold all the resources
    global_registry = GlobalRegistry(registered_tasks_d,
                                     registered_file_types_d,
                                     filtered_chunk_operators_d,
                                     cluster_render)

    return exe_workflow(global_registry, entry_points_d, bg, task_opts,
                        workflow_level_opts, output_dir, service_uri)
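For context, a minimal driver sketch for calling run_pipeline is shown below. The loader call follows the load_all() signature seen in Example #6; the module alias, the entry-point id "e_01", and the keyword choices are illustrative assumptions, not part of the original example.

# Hypothetical wrapper around run_pipeline (sketch only). Assumes
# pbsmrtpipe.loader.load_all() returns (rtasks, rfile_types, chunk_operators,
# rpipelines) as in Example #6; the entry-point id "e_01" is a placeholder.
import pbsmrtpipe.loader as L

def example_run_pipeline(workflow_xml, input_path, output_dir):
    rtasks, rfile_types, chunk_operators, rpipelines = L.load_all()
    entry_points_d = {"e_01": input_path}
    return run_pipeline(rpipelines, rfile_types, rtasks,
                        chunk_operators, workflow_xml, entry_points_d,
                        output_dir,
                        preset_xml=None,
                        rc_preset_or_none=None,
                        service_uri=None,
                        force_distribute=False,
                        force_chunk_mode=False,
                        debug_mode=True)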
Example #2
def run_pipeline(registered_pipelines_d, registered_file_types_d, registered_tasks_d,
                 chunk_operators, workflow_template_xml, entry_points_d,
                 output_dir, preset_xmls, rc_preset_or_none, service_uri,
                 force_distribute=None, force_chunk_mode=None, debug_mode=None):
    """
    Entry point for running a pipeline

    :param workflow_template_xml: path to workflow xml
    :param entry_points_d: dict of entry point id to input file path
    :param output_dir: path to the job output directory
    :param preset_xmls: list of paths to preset xml files
    :return: exit code

    :type registered_tasks_d: dict[str, pbsmrtpipe.pb_tasks.core.MetaTask]
    :type registered_file_types_d: dict[str, pbsmrtpipe.pb_tasks.core.FileType]
    :type workflow_template_xml: str
    :type output_dir: str
    :type preset_xmls: list[str]
    :type service_uri: str | None
    :type force_distribute: None | bool

    :rtype: int
    """
    log.debug(pprint.pformat(entry_points_d))

    workflow_bindings, workflow_level_opts, task_opts, cluster_render = _load_io_for_workflow(registered_tasks_d,
                                                                                              registered_pipelines_d,
                                                                                              workflow_template_xml,
                                                                                              entry_points_d, preset_xmls,
                                                                                              rc_preset_or_none,
                                                                                              force_distribute=force_distribute,
                                                                                              force_chunk_mode=force_chunk_mode,
                                                                                              debug_mode=debug_mode)

    slog.info("building graph")
    bg = B.binding_strs_to_binding_graph(registered_tasks_d, workflow_bindings)
    slog.info("successfully loaded graph from bindings.")

    valid_chunk_operators = {}
    # Disable chunk operators if necessary
    if workflow_level_opts.chunk_mode is False:
        slog.info("Chunk mode is False. Disabling {n} chunk operators.".format(n=len(chunk_operators)))
    else:
        # Validate chunk operators, or skip if malformed.
        for chunk_operator_id, chunk_operator in chunk_operators.iteritems():
            try:
                validate_operator(chunk_operator, registered_tasks_d)
                valid_chunk_operators[chunk_operator_id] = chunk_operator
            except MalformedChunkOperatorError as e:
                log.warn("Invalid chunk operator {i}. {m}".format(i=chunk_operator_id, m=e.message))

    filtered_chunk_operators_d = _filter_chunk_operators(bg, valid_chunk_operators)
    # Container to hold all the resources
    global_registry = GlobalRegistry(registered_tasks_d,
                                     registered_file_types_d,
                                     filtered_chunk_operators_d,
                                     cluster_render)

    return exe_workflow(global_registry, entry_points_d, bg, task_opts,
                        workflow_level_opts, output_dir, service_uri)
Example #3
def run_single_task(registered_file_types_d,
                    registered_tasks_d,
                    chunk_operators,
                    entry_points_d,
                    task_id,
                    output_dir,
                    preset_xmls,
                    rc_preset_or_none,
                    service_config,
                    force_distribute=None,
                    force_chunk_mode=None,
                    debug_mode=None):
    """
    Entry Point for running a single task

    :param task_id: id of the registered task to run
    :param output_dir: path to the job output directory
    :return: exit code
    """

    print entry_points_d
    meta_task = registered_tasks_d.get(task_id, None)

    if meta_task is None:
        raise KeyError(
            "Unable to find task id '{i}' in registered tasks. Use "
            "'show-tasks' to get a list of registered tasks.".format(
                i=task_id))

    workflow_level_opts, task_opts, cluster_render = _load_io_for_task(
        registered_tasks_d,
        entry_points_d,
        preset_xmls,
        rc_preset_or_none,
        force_distribute=force_distribute,
        force_chunk_mode=force_chunk_mode,
        debug_mode=debug_mode)

    slog.info("building bindings graph")
    binding_str = _task_to_binding_strings(meta_task)

    bg = B.binding_strs_to_binding_graph(registered_tasks_d, binding_str)
    slog.info("successfully bindings graph for task {i}".format(i=task_id))

    # Validate chunk operators
    valid_chunk_operators = {
        k: v
        for k, v in chunk_operators.iteritems()
        if validate_operator(v, registered_tasks_d)
    }
    filtered_chunk_operators_d = _filter_chunk_operators(
        bg, valid_chunk_operators)
    # Container to hold all the resources
    global_registry = GlobalRegistry(registered_tasks_d,
                                     registered_file_types_d,
                                     filtered_chunk_operators_d,
                                     cluster_render)

    return exe_workflow(global_registry, entry_points_d, bg, task_opts,
                        workflow_level_opts, output_dir, service_config)
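A hedged sketch of invoking run_single_task follows; the task id, the entry-point id, and the loader usage are placeholders inferred from the surrounding examples, not taken from the original source.

import pbsmrtpipe.loader as L  # assumed loader module, as in Example #6

def example_run_single_task(task_id, input_path, output_dir, preset_xmls):
    # Illustrative entry-point id only; the real ids depend on the task.
    rtasks, rfile_types, chunk_operators, _ = L.load_all()
    entry_points_d = {"e_01": input_path}
    return run_single_task(rfile_types, rtasks, chunk_operators,
                           entry_points_d, task_id, output_dir,
                           preset_xmls,
                           rc_preset_or_none=None,
                           service_config=None,
                           force_distribute=False,
                           force_chunk_mode=False)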
Example #4
    def setUpClass(cls):
        pipeline = REGISTERED_PIPELINES[cls.PB_PIPELINE_ID]
        log.debug(pipeline)

        cls.bindings = pipeline.all_bindings
        cls.EPOINTS_D = {
            k: get_temp_file(v)
            for k, v in cls.EPOINTS_NAMES.iteritems()
        }

        log.debug(pprint.pformat(cls.bindings, indent=4))
        log.debug(
            "Number of registered tasks {n}".format(n=len(REGISTERED_TASKS)))

        cls.bgraph = B.binding_strs_to_binding_graph(REGISTERED_TASKS,
                                                     cls.bindings)
        d = os.path.expanduser('~/scratch/tmp_pbsmrtpipe') if getpass.getuser() == 'mkocher' else None
        cls.output_dir = tempfile.mkdtemp(prefix='job_test_', dir=d)

        preset_record = IO.parse_pipeline_preset_xml(
            os.path.join(TEST_DATA_DIR, cls.PRESET_XML))
        cls.workflow_options = preset_record.to_workflow_level_opt()

        # leave this for now
        cls.envs = []
        cls.cluster_engine = C.load_installed_cluster_templates_by_name("sge")
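The setUpClass above expects the test class to define PB_PIPELINE_ID, EPOINTS_NAMES, and PRESET_XML. A sketch of such a subclass is below; the pipeline id, entry-point names, and base class name are invented placeholders.

class ExamplePipelineTestCase(PipelineTestBase):  # hypothetical base class
    # Placeholder ids and file names; real values come from the registered
    # pipelines and the test data directory.
    PB_PIPELINE_ID = "pbsmrtpipe.pipelines.dev_01"
    EPOINTS_NAMES = {"e_01": "input.fasta"}
    PRESET_XML = "preset_01.xml"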
Example #5
def registry_runner(registry_, rtasks, output_dir, emit_xml=False):
    """

    :type registry_: PipelineRegistry
    :param rtasks: dict of registered task id to MetaTask
    :type output_dir: str
    :type emit_xml: bool

    :rtype: int
    """
    # this will emit the pipeline templates (PTs) to an output dir

    import pbsmrtpipe.pb_io as IO
    # get around circular imports
    import pbsmrtpipe.graph.bgraph as B

    r = registry_

    log.info("Validating pipelines")
    # Validating pipeline integrity
    for i, p in r.pipelines.iteritems():
        log.debug("Validating pipeline {}".format(i))
        bg = B.binding_strs_to_binding_graph(rtasks, list(p.all_bindings))
        # or this will raise
        is_valid = B.validate_binding_graph_integrity(bg)
        log.info("Pipeline {} is valid? {}".format(i, is_valid))

    # Make the dir if it doesn't exist
    output_pipeline_dir = os.path.abspath(os.path.expanduser(output_dir))

    log.info("Writing {x} Pipeline Templates to {o}".format(o=output_dir,
                                                            x=len(
                                                                r.pipelines)))
    print "Emitting pipelines to output dir {d}".format(d=output_pipeline_dir)

    IO.write_pipeline_templates_to_json(r.pipelines.values(), rtasks,
                                        output_pipeline_dir)

    for p in r.pipelines.values():
        if emit_xml:
            file_name = p.idx + "_pipeline.xml"
            path = os.path.join(output_dir, file_name)
            xml = IO.pipeline_to_xml(p)
            with open(path, 'w') as f:
                f.write(str(xml))
            log.info("writing pipeline {x}".format(x=path))

    _d = dict(n=len(r.pipelines),
              d=output_pipeline_dir,
              x=len(r.original_pipeline_ids),
              a=len(r.all_pipelines))
    log.info(
        "Successfully wrote {n} new pipelines (previously loaded {x}, {a} pipelines total) to {d}"
        .format(**_d))
    return 0
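As a usage note, registry_runner takes an already-populated PipelineRegistry plus the registered-tasks dict; a minimal, hypothetical call might look like the sketch below.

def example_emit_pipelines(registry, rtasks, out_dir):
    # `registry` is a populated PipelineRegistry and `rtasks` the registered-tasks
    # dict (e.g. from the loader); writes JSON templates plus optional XML files.
    return registry_runner(registry, rtasks, out_dir, emit_xml=True)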
Example #6
    def test_all_sane(self):
        """Test that all pipelines are well defined"""
        errors = []
        rtasks, rfiles_types, chunk_operators, pipelines = L.load_all()

        for pipeline_id, pipeline in pipelines.items():
            emsg = "Pipeline {p} is not valid.".format(p=pipeline_id)
            log.debug("Checking Sanity of registered Pipeline {i}".format(
                i=pipeline_id))
            log.info(pipeline_id)
            log.debug(pipeline)
            try:
                # Validate with Avro
                d = pipeline_template_to_dict(pipeline, rtasks)
                _ = validate_pipeline_template(d)
                name = pipeline_id + "_pipeline_template.avro"
                output_file = get_temp_file(suffix=name)
                log.info(
                    "{p} converted to avro successfully".format(p=pipeline_id))

                bg = BU.binding_strs_to_binding_graph(rtasks,
                                                      pipeline.all_bindings)
                BU.validate_binding_graph_integrity(bg)
                BU.validate_compatible_binding_file_types(bg)
                validate_entry_points(d)
                # pprint.pprint(d)

                # for debugging purposes
                output_json = output_file.replace(".avro", '.json')
                log.info("writing pipeline to {p}".format(p=output_json))
                with open(output_json, 'w') as j:
                    j.write(json.dumps(d, sort_keys=True, indent=4))

                log.info(
                    "writing pipeline template to {o}".format(o=output_file))

                # Test writing to avro if the pipeline is actually valid
                write_pipeline_template_to_avro(pipeline, rtasks, output_file)
                log.info("Pipeline {p} is valid.".format(p=pipeline_id))

                log.info("Loading avro {i} from {p}".format(i=pipeline_id,
                                                            p=output_file))
                pipeline_d = load_pipeline_template_from_avro(output_file)
                self.assertIsInstance(pipeline_d, dict)

            except Exception as e:
                m = emsg + " Error: " + e.message
                log.error(m)
                errors.append(m)
                log.error(emsg)
                log.error(e)

        msg = "\n".join(errors) if errors else ""
        self.assertEqual([], errors, msg)
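The same graph checks can be run for a single pipeline outside the test; a small sketch using the helpers from pbsmrtpipe.graph.bgraph (module path as imported in Example #5) is shown below.

import pbsmrtpipe.graph.bgraph as B  # module path as imported in Example #5

def validate_one_pipeline(pipeline, rtasks):
    # Build the binding graph for one pipeline and check its integrity;
    # either call raises if the pipeline is malformed.
    bg = B.binding_strs_to_binding_graph(rtasks, list(pipeline.all_bindings))
    return B.validate_binding_graph_integrity(bg)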
Example #7
def registry_runner(registry_, rtasks, output_dir, emit_xml=False):
    """

    :type registry_: PipelineRegistry
    :param rtasks: dict of registered task id to MetaTask
    :type output_dir: str
    :type emit_xml: bool

    :rtype: int
    """
    # this will emit the pipeline templates (PTs) to an output dir

    import pbsmrtpipe.pb_io as IO
    # get around circular imports
    import pbsmrtpipe.graph.bgraph as B

    r = registry_

    log.info("Validating pipelines")
    # Validating pipeline integrity
    for i, p in r.pipelines.iteritems():
        log.debug("Validating pipeline {}".format(i))
        bg = B.binding_strs_to_binding_graph(rtasks, list(p.all_bindings))
        # or this will raise
        is_valid = B.validate_binding_graph_integrity(bg)
        log.info("Pipeline {} is valid? {}".format(i, is_valid))

    # Make the dir if it doesn't exist
    output_pipeline_dir = os.path.abspath(os.path.expanduser(output_dir))

    log.info("Writing {x} Pipeline Templates to {o}".format(o=output_dir, x=len(r.pipelines)))
    print "Emitting pipelines to output dir {d}".format(d=output_pipeline_dir)

    IO.write_pipeline_templates_to_json(r.pipelines.values(), rtasks, output_pipeline_dir)

    for p in r.pipelines.values():
        if emit_xml:
            file_name = p.idx + "_pipeline.xml"
            path = os.path.join(output_dir, file_name)
            xml = IO.pipeline_to_xml(p)
            with open(path, 'w') as f:
                f.write(str(xml))
            log.info("writing pipeline {x}".format(x=path))

    _d = dict(n=len(r.pipelines),
              d=output_pipeline_dir,
              x=len(r.original_pipeline_ids),
              a=len(r.all_pipelines))
    log.info("Successfully wrote {n} new pipelines (previously loaded {x} all pipelines {a} to {d}".format(**_d))
    return 0
Example #8
def _run_driver_from_job_config(job_config):
    """
    :type job_config: JobConfig
    :param job_config:
    :return:
    """
    job_output_dir = job_config.tmp_dir_func(job_config.job_name)
    tmp_dir = job_config.tmp_dir_func(job_config.job_name + '_tmp')

    ep_d = {e_id: job_config.tmp_file_func(file_name) for e_id, file_name in job_config.ep_d.iteritems()}

    rtasks, chunk_operators = _get_registered_tasks_and_operators()
    rfiles = _get_registered_files()
    bgraph_ = B.binding_strs_to_binding_graph(rtasks, job_config.bindings_str)

    state = _test_run_driver(chunk_operators, rtasks, rfiles, ep_d, bgraph_,
                             job_output_dir, tmp_dir, job_config.task_opts,
                             job_config.cluster_renderer)
    return state
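The helper above only touches a handful of JobConfig attributes. A hedged reconstruction of that container is sketched below; the namedtuple definition is inferred from the attribute access in the function, not copied from the source.

from collections import namedtuple

# Fields inferred from the attribute access in _run_driver_from_job_config;
# the real JobConfig class may define more than this.
JobConfig = namedtuple("JobConfig",
                       ["job_name", "bindings_str", "ep_d", "task_opts",
                        "cluster_renderer", "tmp_dir_func", "tmp_file_func"])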
Example #9
def run_single_task(registered_file_types_d, registered_tasks_d, chunk_operators,
                    entry_points_d, task_id, output_dir, preset_xmls, rc_preset_or_none,
                    service_config,
                    force_distribute=None,
                    force_chunk_mode=None,
                    debug_mode=None):
    """
    Entry Point for running a single task

    :param task_id: id of the registered task to run
    :param output_dir: path to the job output directory
    :return: exit code
    """

    print entry_points_d
    meta_task = registered_tasks_d.get(task_id, None)

    if meta_task is None:
        raise KeyError("Unable to find task id '{i}' in registered tasks. Use "
                       "'show-tasks' to get a list of registered tasks.".format(i=task_id))

    workflow_level_opts, task_opts, cluster_render = _load_io_for_task(registered_tasks_d, entry_points_d,
                                                                       preset_xmls, rc_preset_or_none,
                                                                       force_distribute=force_distribute,
                                                                       force_chunk_mode=force_chunk_mode,
                                                                       debug_mode=debug_mode)

    slog.info("building bindings graph")
    binding_str = _task_to_binding_strings(meta_task)

    bg = B.binding_strs_to_binding_graph(registered_tasks_d, binding_str)
    slog.info("successfully bindings graph for task {i}".format(i=task_id))

    # Validate chunk operators
    valid_chunk_operators = {k: v for k, v in chunk_operators.iteritems() if validate_operator(v, registered_tasks_d)}
    filtered_chunk_operators_d = _filter_chunk_operators(bg, valid_chunk_operators)
    # Container to hold all the resources
    global_registry = GlobalRegistry(registered_tasks_d,
                                     registered_file_types_d,
                                     filtered_chunk_operators_d,
                                     cluster_render)

    return exe_workflow(global_registry, entry_points_d, bg, task_opts,
                        workflow_level_opts, output_dir, service_config)
Example #10
    def setUpClass(cls):
        pipeline = REGISTERED_PIPELINES[cls.PB_PIPELINE_ID]
        log.debug(pipeline)

        cls.bindings = pipeline.all_bindings
        cls.EPOINTS_D = {k: get_temp_file(v) for k, v in cls.EPOINTS_NAMES.iteritems()}

        log.debug(pprint.pformat(cls.bindings, indent=4))
        log.debug("Number of registered tasks {n}".format(n=len(REGISTERED_TASKS)))

        cls.bgraph = B.binding_strs_to_binding_graph(REGISTERED_TASKS, cls.bindings)
        d = os.path.expanduser('~/scratch/tmp_pbsmrtpipe') if getpass.getuser() == 'mkocher' else None
        cls.output_dir = tempfile.mkdtemp(prefix='job_test_', dir=d)

        preset_record = IO.parse_pipeline_preset_xml(os.path.join(TEST_DATA_DIR, cls.PRESET_XML))
        cls.workflow_options = preset_record.to_workflow_level_opt()

        # leave this for now
        cls.envs = []
        cls.cluster_engine = C.load_installed_cluster_templates_by_name("sge")
Example #11
def _run_driver_from_job_config(job_config):
    """
    :type job_config: JobConfig
    :param job_config:
    :return:
    """
    job_output_dir = job_config.tmp_dir_func(job_config.job_name)
    tmp_dir = job_config.tmp_dir_func(job_config.job_name + '_tmp')

    ep_d = {
        e_id: job_config.tmp_file_func(file_name)
        for e_id, file_name in job_config.ep_d.iteritems()
    }

    rtasks, chunk_operators = _get_registered_tasks_and_operators()
    rfiles = _get_registered_files()
    bgraph_ = B.binding_strs_to_binding_graph(rtasks, job_config.bindings_str)

    state = _test_run_driver(chunk_operators, rtasks, rfiles, ep_d, bgraph_,
                             job_output_dir, tmp_dir, job_config.task_opts,
                             job_config.cluster_renderer)
    return state