Example #1
class KlioJobConfig(object):
    """Job-specific config representing the ``job_config`` key of
    ``klio-job.yaml``.

    ``job_config`` holds both any user-specific job configuration needed
    and Klio-related configuration (e.g. timeouts, metrics).

    See :ref:`documentation <job-config>` for information on available
    configuration.

    Attributes:
        job_name (str): Name of Klio job.
        version (int): Version of Klio job.
        allow_non_klio_messages (bool): Allow this job to process free-form,
            non-KlioMessage messages.
        blocking (bool): Wait for Dataflow job to finish before exiting.
        metrics (dict): Dictionary representing desired metrics configuration.
        events (``KlioIOConfigContainer``): Job event I/O configuration.
        data (``KlioIOConfigContainer``): Job data I/O configuration.

    Args:
        config_dict (dict): dictionary representation of ``job_config``
            as parsed from ``klio-job.yaml``.
    """

    ATTRIBS_TO_SKIP = ["version", "job_name"]

    # required attributes
    job_name = utils.field(type=str, repr=True)
    version = utils.field(type=int)

    # optional attributes
    allow_non_klio_messages = utils.field(type=bool, default=False)
    metrics = utils.field(default={})
    blocking = utils.field(type=bool, default=False)

    def __config_post_init__(self, config_dict):
        self._raw = config_dict
        self._scanned_io_subclasses = None
        self.USER_ATTRIBS = []

        self._parse_io(config_dict)

        declared_config = self._as_dict()
        for key, value in self._raw.items():
            if key not in declared_config:
                # set user attributes from job_config - but only the first
                # level; for example, this
                # job_config:
                #    foo:
                #      key1: value1
                #      list1:
                #        - one
                #        - two
                # gets parsed to:
                # job_config.foo -> {"key1": "value1", "list1": ["one", "two"]}
                setattr(self, key, value)
                # keep track of user-set attributes so that when as_dict()
                # is called, we re-add them to what _as_dict() returns
                self.USER_ATTRIBS.append({key: value})

    def _parse_io(self, config_dict):
        event_inputs = self._create_config_objects(
            config_dict.get("events", {}).get("inputs", {}),
            io.KlioIOType.EVENT,
            io.KlioIODirection.INPUT,
        )
        event_outputs = self._create_config_objects(
            config_dict.get("events", {}).get("outputs", {}),
            io.KlioIOType.EVENT,
            io.KlioIODirection.OUTPUT,
        )
        self.events = KlioIOConfigContainer(inputs=event_inputs,
                                            outputs=event_outputs)

        data_inputs = self._create_config_objects(
            config_dict.get("data", {}).get("inputs", {}),
            io.KlioIOType.DATA,
            io.KlioIODirection.INPUT,
        )
        data_outputs = self._create_config_objects(
            config_dict.get("data", {}).get("outputs", {}),
            io.KlioIOType.DATA,
            io.KlioIODirection.OUTPUT,
        )
        self.data = KlioIOConfigContainer(inputs=data_inputs,
                                          outputs=data_outputs)

    def _get_all_config_subclasses(self):
        if self._scanned_io_subclasses is not None:
            return self._scanned_io_subclasses

        # note: this will also include intermediate classes, but that
        # shouldn't matter since they shouldn't "support" any valid
        # combination of type/direction
        all_subclasses = []

        def traverse(cls):
            for subclass in cls.__subclasses__():
                all_subclasses.append(subclass)
                traverse(subclass)

        traverse(io.KlioIOConfig)
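        # at this point all_subclasses holds every transitive subclass of
        # io.KlioIOConfig in depth-first order - e.g. for a toy hierarchy
        # KlioIOConfig -> A -> B it would be [A, B]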
        self._scanned_io_subclasses = all_subclasses
        return all_subclasses

    def _create_config_objects(self, configs, io_type, io_direction):
        options = {
            x.name: x
            for x in self._get_all_config_subclasses()
            if x.supports_type(io_type) and x.supports_direction(io_direction)
        }
        objs = []
        for name, config in configs.items():
            type_name = config["type"].lower()
            if type_name not in options:
                raise ValueError("{} is not a valid {} {}".format(
                    config["type"], io_type.name, io_direction.name))
            subclass = options[type_name]
            objs.append(subclass.from_dict(config, io_type, io_direction))
        return objs

    def _as_dict(self):
        config_dict = attr.asdict(
            self, filter=lambda x, _: x.name not in self.ATTRIBS_TO_SKIP)
        config_dict["events"] = {}
        config_dict["events"]["inputs"] = [
            ei.as_dict() for ei in self.events.inputs
        ]
        config_dict["events"]["outputs"] = [
            eo.as_dict() for eo in self.events.outputs
        ]

        config_dict["data"] = {}
        config_dict["data"]["inputs"] = [
            di.as_dict() for di in self.data.inputs
        ]
        config_dict["data"]["outputs"] = [
            do.as_dict() for do in self.data.outputs
        ]
        return config_dict

    def as_dict(self):
        """Return a dictionary representation of the :class:`KlioJobConfig`
        object.

        .. tip::

            Use this method to access any custom config key/value pairs
            defined under ``klio-job.yaml::job_config``.
        """

        config_dict = self._as_dict()
        for attrib in self.USER_ATTRIBS:
            for key, value in attrib.items():
                config_dict[key] = value

        return config_dict

    def __repr__(self):
        return "KlioJobConfig(job_name=%s)" % self.job_name
Example #2
class KlioPipelineConfig(object):
    """Pipeline-specific config representing the ``pipeline_options`` key
    of ``klio-job.yaml``.

    .. note::

        ``pipeline_options`` map 1:1 to options supported in Apache Beam and
        its runners (i.e. Dataflow). See `all supported pipeline options
        <https://github.com/apache/beam/blob/master/sdks/python/apache_beam/
        options/pipeline_options.py>`_ for available options.

    Any instance attribute not defined in this class but available in Apache
    Beam or its runners will still be passed through when running the
    pipeline.

    See :ref:`documentation <kliopipelineconfig>` for information on available
    configuration.

    Args:
        config_dict (dict): dictionary representation of ``pipeline_options``
            as parsed from ``klio-job.yaml``.

    Attributes:
        job_name (str): Name of Klio job.
        version (int): Version of Klio job.
        (remaining attributes): See `all supported pipeline options
            <https://github.com/apache/beam/blob/master/sdks/python/
            apache_beam/options/pipeline_options.py>`_ for all available
            remaining attributes.
    """

    ATTRIBS_TO_SKIP = ["version"]

    job_name = utils.field(repr=True, type=str)
    version = utils.field(type=int)

    # TODO: Do we still want this to be the default?
    runner = utils.field(type=str, default="DataflowRunner")
    streaming = utils.field(type=bool, default=True)

    # debug options
    # TODO: add validation - if job is in streaming mode and no setup.py,
    # then config must use the "beam_fn_api" experiment (we could also just
    # automatically put it in there)
    experiments = utils.field(default=[])

    # setup options
    sdk_location = utils.field(type=str, default=None)
    setup_file = utils.field(type=str, default=None)
    requirements_file = utils.field(type=str, default=None)

    # GCP options
    project = utils.field(type=str, default=None)
    staging_location = utils.field(type=str, default=None)
    temp_location = utils.field(type=str, default=None)
    region = utils.field(type=str, default="europe-west1")
    subnetwork = utils.field(type=str, default=None)
    update = utils.field(type=bool, default=False)
    dataflow_endpoint = utils.field(type=str, default=None)
    service_account_email = utils.field(type=str, default=None)
    no_auth = utils.field(type=bool, default=False)
    template_location = utils.field(type=str, default=None)
    labels = utils.field(default=[])
    label = utils.field(type=str, default=None)
    transform_name_mapping = utils.field(type=str, default=None)
    enable_streaming_engine = utils.field(type=bool, default=False)
    dataflow_kms_key = utils.field(type=str, default=None)
    flexrs_goal = utils.field(
        type=str,
        default=None,
        validator=attr.validators.optional(
            attr.validators.in_(["COST_OPTIMIZED", "SPEED_OPTIMIZED"])),
    )

    # worker options
    autoscaling_algorithm = utils.field(
        type=str,
        default="NONE",
        validator=attr.validators.optional(
            attr.validators.in_(["THROUGHPUT_BASED", "NONE"])),
    )
    num_workers = utils.field(type=int, default=2)
    disk_size_gb = utils.field(type=int, default=32)
    worker_machine_type = utils.field(type=str, default="n1-standard-2")
    worker_harness_container_image = utils.field(type=str, default=None)
    worker_disk_type = utils.field(type=str, default=None)
    use_public_ips = utils.field(type=bool, default=None)
    min_cpu_platform = utils.field(type=str, default=None)
    dataflow_worker_jar = utils.field(type=str, default=None)

    # Direct on GKE options
    gke_namespace = utils.field(type=str, default=None)

    # profiling options
    profile_location = utils.field(type=str, default=None)
    profile_cpu = utils.field(type=bool, default=None)
    profile_memory = utils.field(type=bool, default=None)
    profile_sample_rate = utils.field(type=float, default=None)

    def __config_post_init__(self, config_dict):
        # fold the single-label convenience option into the labels list
        if self.label:
            self.labels.append(self.label)

        # derive max_num_workers from num_workers, with a floor of 2
        self.max_num_workers = max(2, self.num_workers)
        self.USER_ATTRIBS = []

        valid_disk_types = ["local-ssd", "pd-ssd", "pd-standard"]

        def format_disk_type(simple_type):
            return WORKER_DISK_TYPE_URL.format(
                project=self.project,
                region=self.region,
                disk_type=simple_type,
            )

        # worker_disk_type may or may not already be formatted as a URL
        if self.worker_disk_type is not None:
            if self.worker_disk_type in valid_disk_types:
                self.worker_disk_type = format_disk_type(self.worker_disk_type)
            elif self.worker_disk_type != format_disk_type(
                    self.worker_disk_type.split("/")[-1]):
                raise ValueError(
                    "Invalid pipeline_options.worker_disk_type: '{}'".format(
                        self.worker_disk_type))

        declared_config = self._as_dict()
        for key, value in config_dict.items():
            if key not in declared_config:
                # set user attributes from pipeline_options - but only the
                # first level; for example, this
                # pipeline_options:
                #    foo:
                #      key1: value1
                #      list1:
                #        - one
                #        - two
                # gets parsed to:
                # pipeline_options.foo ->
                #   {"key1": "value1", "list1": ["one", "two"]}
                setattr(self, key, value)
                # keep track of user-set attributes so that when as_dict()
                # is called, we re-add them to what _as_dict() returns
                self.USER_ATTRIBS.append({key: value})

    def _as_dict(self):
        return attr.asdict(
            self, filter=lambda x, _: x.name not in self.ATTRIBS_TO_SKIP)

    def as_dict(self):
        """Return a dictionary representation of the
        :class:`KlioPipelineConfig` object.
        """
        config_dict = self._as_dict()
        for attrib in self.USER_ATTRIBS:
            for key, value in attrib.items():
                config_dict[key] = value

        return config_dict

    def __repr__(self):
        return "KlioPipelineConfig(job_name=%s)" % self.job_name
Example #3
class ConfigTestClass(object):
    f1 = utils.field(type=str, default=None)  # optional: has a default
    f2 = utils.field(type=str)                # required: no default
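
``utils.field`` itself is not shown in these examples. A minimal stand-in
consistent with how it is called above (``type``, ``default``, ``repr``, and
``validator`` keywords) could wrap ``attr.ib``; everything below, including
the ``kw_only`` choice, is an assumption for illustration rather than klio's
actual implementation::

    import attr

    # hypothetical stand-in for utils.field; kw_only=True is assumed so that
    # defaulted fields (f1) may be declared before required ones (f2)
    def field(type=None, default=attr.NOTHING, repr=False, validator=None):
        return attr.ib(
            type=type, default=default, repr=repr, validator=validator,
            kw_only=True,
        )

    @attr.s
    class ConfigTestClass(object):
        f1 = field(type=str, default=None)  # optional
        f2 = field(type=str)                # required, keyword-only

    cfg = ConfigTestClass(f2="hello")
    assert cfg.f1 is None and cfg.f2 == "hello"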