Пример #1
0
class AllocationConfiguration:
    """Tunable defaults applied when resource allocations are enforced.

    Groups the CPU quota period, the CPU shares multiplier and the default
    limits of the root RDT group.
    """

    # Default value for cpu.cpu_period (used as the denominator when computing quota).
    # NOTE(review): the unit is given as [ms] here - confirm against the cgroup cpu
    # controller documentation.
    cpu_quota_period: Numeric(1000, 1000000) = 1000

    # Multiplier applied to the AllocationType.CPU_SHARES allocation value.
    # E.g. with the default of 1000, setting 'CPU_SHARES' to 2.0 effectively
    # sets 2000 shares in the cgroup cpu controller.
    cpu_shares_unit: Numeric(1000, 1000000) = 1000

    # Default resource allocation for last level cache (L3) and memory bandwidth (MB)
    # of the root RDT group.
    # The root RDT group is the default group for all tasks, unless explicitly
    # reconfigured by the allocator.
    # `None` (the default value) means no limit (effectively the maximum available value).
    default_rdt_l3: Str = None
    default_rdt_mb: Str = None
    def __init__(
            self,
            node: nodes.Node,
            metrics_storage: storage.Storage = DEFAULT_STORAGE,
            action_delay: Numeric(0, 60) = 1.,  # [s]
            rdt_enabled: Optional[bool] = None,  # Defaults(None) - auto configuration.
            extra_labels: Dict[Str, Str] = None,
            event_names: List[str] = None,
            enable_derived_metrics: bool = False,
            _allocation_configuration: Optional[AllocationConfiguration] = None,
    ):
        """Keep the runner settings and initialise the iteration bookkeeping.

        Args:
            node: node object, kept as ``self._node``.
            metrics_storage: storage object, kept as ``self._metrics_storage``.
            action_delay: delay between iterations, in seconds.
            rdt_enabled: ``None`` means auto configuration.
            extra_labels: additional labels; a falsy value becomes an empty dict.
            event_names: event names; a falsy value falls back to ``DEFAULT_EVENTS``.
            enable_derived_metrics: flag kept as ``self._enable_derived_metrics``.
            _allocation_configuration: optional allocation settings, stored as given.
        """
        # Values taken straight from the arguments.
        self._node, self._metrics_storage = node, metrics_storage
        self._action_delay, self._rdt_enabled = action_delay, rdt_enabled
        self._enable_derived_metrics = enable_derived_metrics
        self._allocation_configuration = _allocation_configuration

        # Arguments with a fallback for None/empty values.
        self._extra_labels = extra_labels if extra_labels else dict()
        self._event_names = event_names if event_names else DEFAULT_EVENTS

        # Both RDT control requirements are off here; subclasses may override them.
        self._rdt_mb_control_required = self._rdt_cache_control_required = False

        # Loop state.
        self._finish = False  # Guard to stop iterations.
        self._last_iteration = time.time()  # Used internally by wait function.
Пример #3
0
class Prometheus:
    """Small client for Prometheus instant queries over HTTP(S)."""
    host: str
    port: int
    timeout: Optional[Numeric(1, 60)] = 1.0
    ssl: Optional[SSL] = None
    time: Optional[str] = None  # Evaluation timestamp.

    def do_query(self, query: str, use_time: bool = True):
        """Run an instant query and return the ``data.result`` part of the response.

        Implements: https://prometheus.io/docs/prometheus/2.16/querying/api/#instant-queries

        Args:
            query: query expression placed into the request URL.
            use_time: when true and ``self.time`` is set, the evaluation
                timestamp is appended to the URL as the ``time`` parameter.

        Returns:
            The value stored under ``['data']['result']`` in the JSON response.

        Raises:
            PrometheusDataProviderException: when the request fails or the
                server answers with an HTTP error status.
        """
        from urllib.parse import urlsplit

        url = URL_TPL.format(
            prometheus_ip='{}:{}'.format(self.host, str(self.port)),
            path=QUERY_PATH,
            name=query)

        if self.time and use_time:
            url += '&time={}'.format(self.time)

        try:
            if self.ssl:
                # Mount the adapter on the scheme://host:port prefix of the
                # URL that is actually requested, so it is guaranteed to be
                # selected for this call.
                parts = urlsplit(url)
                session = requests.Session()
                try:
                    session.mount('{}://{}'.format(parts.scheme, parts.netloc),
                                  HTTPSAdapter())
                    response = session.get(url,
                                           timeout=self.timeout,
                                           verify=self.ssl.server_verify,
                                           cert=self.ssl.get_client_certs())
                finally:
                    # The body is already downloaded (no streaming), so the
                    # pooled connections can be released right away.
                    session.close()
            else:
                response = requests.get(url, timeout=self.timeout)

            # Check the HTTP status for both the SSL and the plain path.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise PrometheusDataProviderException(e) from e

        return response.json()['data']['result']
Пример #4
0
class Kubeapi:
    """Thin HTTPS client for a kubeapi endpoint authenticated with a bearer token."""
    host: Str = None
    port: Str = None  # Because !Env is String and another type cast might be problematic

    client_token_path: Optional[Path(absolute=True,
                                     mode=os.R_OK)] = SERVICE_TOKEN_FILENAME
    server_cert_ca_path: Optional[Path(absolute=True,
                                       mode=os.R_OK)] = SERVICE_CERT_FILENAME

    timeout: Numeric(1, 60) = 5  # [s]
    monitored_namespaces: List[Str] = field(
        default_factory=lambda: ["default"])

    def __post_init__(self):
        """Build the endpoint URL and load the service token from disk."""
        self.endpoint = "https://{}:{}".format(self.host, self.port)

        log.debug("Created kubeapi endpoint %s", self.endpoint)

        with pathlib.Path(self.client_token_path).open() as f:
            self.service_token = f.read()

    def _call(self, http_method, target):
        """Send one authenticated request and return the decoded JSON body.

        Args:
            http_method: callable with the ``requests.get`` calling convention
                (e.g. ``requests.get`` or ``requests.delete``).
            target: path joined with ``self.endpoint`` to form the request URL.

        Raises:
            requests.exceptions.HTTPError: for a non-success HTTP status
                (raised by ``raise_for_status``).
        """
        full_url = urljoin(self.endpoint, target)

        r = http_method(full_url,
                        headers={
                            "Authorization":
                            "Bearer {}".format(self.service_token),
                        },
                        timeout=self.timeout,
                        verify=self.server_cert_ca_path)

        if not r.ok:
            # Log the response body; the raw urllib3 object only renders as
            # an uninformative repr.
            log.error(
                'An unexpected error occurred for target "%s": %i %s - %s',
                target, r.status_code, r.reason, r.text)
        r.raise_for_status()

        return r.json()

    def request_kubeapi(self, target):
        """GET ``target`` from the endpoint and return the parsed JSON response."""
        return self._call(requests.get, target)

    def delete(self, target):
        """DELETE ``target`` on the endpoint and return the parsed JSON response."""
        return self._call(requests.delete, target)
Пример #5
0
    def __init__(
        self,
        node: nodes.Node,
        allocator: Allocator,
        metrics_storage: storage.Storage = DEFAULT_STORAGE,
        anomalies_storage: storage.Storage = DEFAULT_STORAGE,
        allocations_storage: storage.Storage = DEFAULT_STORAGE,
        action_delay: Numeric(0, 60) = 1.,  # [s]
        rdt_enabled: Optional[bool] = None,  # Defaults(None) - auto configuration.
        rdt_mb_control_required: bool = False,
        rdt_cache_control_required: bool = False,
        extra_labels: Dict[Str, Str] = None,
        allocation_configuration: Optional[AllocationConfiguration] = None,
        remove_all_resctrl_groups: bool = False,
        event_names: Optional[List[str]] = None,
        enable_derived_metrics: bool = False,
        task_label_generators: Dict[str, TaskLabelGenerator] = None,
    ):
        """Set up the base runner and then the allocation/anomaly specific state.

        The allocation configuration falls back to a default-constructed
        ``AllocationConfiguration`` when the argument is falsy, and is handed
        to the superclass initializer together with the common settings.
        The RDT "control required" flags are assigned after the superclass
        call, because the superclass resets them to ``False``.
        """
        if not allocation_configuration:
            allocation_configuration = AllocationConfiguration()
        self._allocation_configuration = allocation_configuration

        super().__init__(
            node,
            metrics_storage,
            action_delay,
            rdt_enabled,
            extra_labels,
            _allocation_configuration=self._allocation_configuration,
            event_names=event_names,
            enable_derived_metrics=enable_derived_metrics,
            task_label_generators=task_label_generators,
        )

        # Override the False values set by the superclass.
        self._rdt_mb_control_required = rdt_mb_control_required
        self._rdt_cache_control_required = rdt_cache_control_required

        # Allocation related collaborators and options.
        self._allocator = allocator
        self._allocations_storage = allocations_storage
        self._remove_all_resctrl_groups = remove_all_resctrl_groups

        # Internal allocation statistics.
        self._allocations_counter = self._allocations_errors = 0

        # Anomaly reporting.
        self._anomalies_storage = anomalies_storage
        self._anomalies_statistics = AnomalyStatistics()
Пример #6
0
    def __init__(
        self,
        node: nodes.Node,
        metrics_storage: storage.Storage = DEFAULT_STORAGE,
        action_delay: Numeric(0, 60) = 1.,  # [s]
        rdt_enabled: Optional[bool] = None,  # Defaults(None) - auto configuration.
        extra_labels: Dict[Str, Str] = None,
        event_names: List[str] = DEFAULT_EVENTS,
        enable_derived_metrics: bool = False,
        task_label_generators: Dict[str, TaskLabelGenerator] = None,
        _allocation_configuration: Optional[AllocationConfiguration] = None,
    ):
        """Store the runner settings and prepare the task label generators.

        Args:
            node: node object, kept as ``self._node``.
            metrics_storage: storage object, kept as ``self._metrics_storage``.
            action_delay: delay between iterations, in seconds.
            rdt_enabled: ``None`` means auto configuration.
            extra_labels: additional labels; a falsy value becomes an empty dict.
            event_names: event names, stored exactly as passed (an explicit
                ``None`` is kept and does not fall back to ``DEFAULT_EVENTS``).
            enable_derived_metrics: flag kept as ``self._enable_derived_metrics``.
            task_label_generators: mapping of label name to generator; when
                ``None`` the 'application' and 'application_version_name'
                regex generators are used. The mapping is copied, so the
                caller's dict is never modified.
            _allocation_configuration: optional allocation settings, stored as given.
        """
        self._node = node
        self._metrics_storage = metrics_storage
        self._action_delay = action_delay
        self._rdt_enabled = rdt_enabled
        # Disabled by default, to be overridden by subclasses.
        self._rdt_mb_control_required = False
        # Disabled by default, to be overridden by subclasses.
        self._rdt_cache_control_required = False
        self._extra_labels = extra_labels or dict()
        self._finish = False  # Guard to stop iterations.
        self._last_iteration = time.time()  # Used internally by wait function.
        self._allocation_configuration = _allocation_configuration
        self._event_names = event_names

        self._enable_derived_metrics = enable_derived_metrics

        # Default value for task_label_generators.
        if task_label_generators is None:
            self._task_label_generators = {
                'application':
                TaskLabelRegexGenerator('$', '', 'task_name'),
                'application_version_name':
                TaskLabelRegexGenerator('.*$', '', 'task_name'),
            }
        else:
            # Work on a private copy: an extra entry is added below and must
            # not leak into the mapping owned by the caller.
            self._task_label_generators = dict(task_label_generators)

        # Generate a label value with the initial cpu assignment, to simplify
        #   management of the distributed model system for the plugin:
        #   https://github.com/intel/platform-resource-manager/tree/master/prm
        #
        # To not risk subtle bugs in 1.0.x it is not added to the default
        #   _task_label_generators, but hardcoded here so it can be removed later.
        self._task_label_generators['initial_task_cpu_assignment'] = \
            TaskLabelResourceGenerator('cpus')
    def __init__(
        self,
        node: nodes.Node,
        detector: detectors.AnomalyDetector,
        metrics_storage: storage.Storage = DEFAULT_STORAGE,
        anomalies_storage: storage.Storage = DEFAULT_STORAGE,
        action_delay: Numeric(0, 60) = 1.,
        rdt_enabled: Optional[bool] = None,
        extra_labels: Dict[Str, Str] = None,
        event_names: Optional[List[str]] = None,
        enable_derived_metrics: bool = False,
    ):
        """Initialise the base runner, then attach the detector and anomaly state.

        The common settings are forwarded positionally to the superclass
        initializer; ``detector`` and ``anomalies_storage`` are kept on this
        instance together with a fresh ``AnomalyStatistics`` object.
        """
        base_arguments = (
            node,
            metrics_storage,
            action_delay,
            rdt_enabled,
            extra_labels,
            event_names,
            enable_derived_metrics,
        )
        super().__init__(*base_arguments)

        # Anomaly detection specific members.
        self._detector = detector
        self._anomalies_storage = anomalies_storage
        self._anomalies_statistics = AnomalyStatistics()
Пример #8
0
class MesosNode(Node):
    """Node implementation that discovers running tasks through a Mesos agent."""
    mesos_agent_endpoint: Url = 'https://127.0.0.1:5051'

    # Timeout to access mesos agent.
    timeout: Numeric(1, 60) = 5.  # [s]

    # https://github.com/kennethreitz/requests/blob/5c1f72e80a7d7ac129631ea5b0c34c7876bc6ed7/requests/api.py#L41
    ssl: Optional[SSL] = None

    METHOD = 'GET_STATE'
    api_path = '/api/v1'

    def get_tasks(self):
        """Return the tasks the agent reports as running.

        Posts a ``GET_STATE`` call to the agent API and builds one
        ``MesosTask`` per launched task whose latest status is running.
        Tasks are skipped when they have no statuses, when the latest status
        carries no ``executor_pid``, or when no cgroup can be found for that
        pid.

        Raises:
            requests.exceptions.HTTPError: for a non-success HTTP status
                (raised by ``raise_for_status``).
        """
        full_url = urllib.parse.urljoin(self.mesos_agent_endpoint,
                                        self.api_path)

        request_options = dict(json=dict(type=self.METHOD),
                               timeout=self.timeout)
        if self.ssl:
            request_options.update(verify=self.ssl.server_verify,
                                   cert=self.ssl.get_client_certs())
        r = requests.post(full_url, **request_options)

        r.raise_for_status()
        state = r.json()

        # Fast return path if there are no launched tasks at all.
        if 'launched_tasks' not in state['get_state']['get_tasks']:
            return []

        tasks = []
        for launched_task in state['get_state']['get_tasks']['launched_tasks']:
            statuses = launched_task.get('statuses')
            if not statuses:
                continue

            # Assume the last one is the latest state # TODO: confirm
            last_status = statuses[-1]
            if last_status['state'] != MESOS_TASK_STATE_RUNNING:
                continue

            if 'executor_pid' not in last_status['container_status']:
                log.warning(
                    "'executor_pid' not found in container status for task %s on agent %s",
                    last_status['task_id']['value'],
                    last_status['agent_id']['value'])
                continue

            executor_pid = last_status['container_status']['executor_pid']

            try:
                cgroup_path = find_cgroup(executor_pid)
            except MesosCgroupNotFoundException:
                # Use the module logger, like the other warning in this method.
                log.warning(
                    'Cannot find pid/cgroup mesos path for %s. '
                    'Ignoring task (inconsistent state returned from Mesos).',
                    executor_pid)
                continue

            # A task without labels has no 'labels' entry; treat it as an
            # empty label set instead of failing the whole discovery.
            labels = {
                label['key']: label['value']
                for label in launched_task.get('labels', {}).get('labels', [])
            }

            # Extract scalar resources.
            resources = {
                resource['name']: float(resource['scalar']['value'])
                for resource in launched_task['resources']
                if resource['type'] == 'SCALAR'
            }

            tasks.append(
                MesosTask(name=launched_task['name'],
                          executor_pid=executor_pid,
                          cgroup_path=cgroup_path,
                          subcgroups_paths=[],
                          container_id=last_status['container_status']
                          ['container_id']['value'],
                          task_id=last_status['task_id']['value'],
                          agent_id=last_status['agent_id']['value'],
                          executor_id=last_status['executor_id']['value'],
                          labels=labels,
                          resources=resources))

        return tasks
Пример #9
0
class ZookeeperDatabase(Database):
    """Key/value database kept in Zookeeper and accessed through a Kazoo client."""
    # used as prefix for key, to namespace all queries
    hosts: List[str]
    namespace: str
    timeout: Numeric(
        1, 60) = 5.  # request timeout in seconds (tries another host) [s]
    ssl: Optional[SSL] = None

    def __post_init__(self):
        """Create the Kazoo client (plain or SSL) and start it.

        Raises:
            ValidationError: when ``ssl.server_verify`` is neither a string
                nor a boolean.
        """
        from kazoo.client import KazooClient

        if not self.ssl:
            self._client = KazooClient(hosts=self.hosts, timeout=self.timeout)
        else:
            server_verify = self.ssl.server_verify
            if isinstance(server_verify, str):
                # A string is used as the CA location, with verification on.
                verification = dict(verify_certs=True, ca=server_verify)
            elif isinstance(server_verify, bool):
                verification = dict(verify_certs=server_verify)
            else:
                raise ValidationError(
                    'SSL server verify must be type of Path or boolean!')

            self._client = KazooClient(
                hosts=self.hosts,
                timeout=self.timeout,
                handler=SecureSequentialThreadingHandler(),
                use_ssl=True,
                certfile=self.ssl.client_cert_path,
                keyfile=self.ssl.client_key_path,
                **verification,
            )

        self._client.start()

    def _node_path(self, key: bytes):
        """Return the namespaced node path for an ASCII-encoded key."""
        return os.path.join(self.namespace, key.decode('ascii'))

    def set(self, key: bytes, value: bytes):
        """Validate key and value, then store the value under the namespaced key."""
        _validate_key(key)
        _validate_value(value)

        node_path = self._node_path(key)
        self._client.ensure_path(node_path)
        self._client.set(node_path, value)

    def get(self, key: bytes) -> bytes:
        """Return the stored bytes for the key, or None when the node does not exist."""
        from kazoo.exceptions import NoNodeError
        _validate_key(key)

        node_path = self._node_path(key)

        try:
            node = self._client.get(node_path)
        except NoNodeError:
            return None
        else:
            # The first element of the returned pair holds the payload.
            return bytes(node[0])
Пример #10
0
class EtcdDatabase(Database):
    """Access etcd using internal grpc-gateway.

    Support version: 3.2.x (version) (other versions require change of api_path)

    https://coreos.com/etcd/docs/latest/dev-guide/api_grpc_gateway.html
    """

    hosts: List[str]
    timeout: Optional[Numeric(1, 60)] = 5.0
    api_path: Optional[str] = '/v3alpha'
    ssl: Optional[SSL] = None

    def _send(self, url, data):
        """POST ``data`` as JSON to the first host that answers in time.

        Hosts are tried in order; a host that times out is logged and the
        next one is tried.

        Returns:
            The decoded JSON response, or None when every host timed out.

        Raises:
            requests.exceptions.HTTPError: for a non-success HTTP status
                (raised by ``raise_for_status``); other non-timeout request
                errors propagate as well.
        """
        payload = json.dumps(data)

        for host in self.hosts:
            full_url = '{}{}{}'.format(host, self.api_path, url)
            try:
                if self.ssl:
                    s = requests.Session()
                    try:
                        s.mount(host, HTTPSAdapter())
                        r = s.post(full_url,
                                   data=payload,
                                   timeout=self.timeout,
                                   verify=self.ssl.server_verify,
                                   cert=self.ssl.get_client_certs())
                    finally:
                        # The body is already downloaded (no streaming), so
                        # the pooled connections can be released right away.
                        s.close()
                else:
                    r = requests.post(full_url,
                                      data=payload,
                                      timeout=self.timeout)

                r.raise_for_status()
                return r.json()
            except requests.exceptions.Timeout:
                log.warning('EtcdDatabase: Timeout on host {}'.format(host))

        return None

    def _format_data(self, data):
        """Return a copy of ``data`` with every bytes value base64-encoded as ASCII text."""
        return {
            name: base64.b64encode(raw).decode('ascii')
            for name, raw in data.items()
        }

    def set(self, key: bytes, value: bytes):
        """Store ``value`` under ``key``.

        Raises:
            TimeoutOnAllHosts: when no host returned a (non-empty) response.
        """
        _validate_key(key)
        _validate_value(value)

        formatted_data = self._format_data({'key': key, 'value': value})

        response_data = self._send('/kv/put', formatted_data)

        if not response_data:
            raise TimeoutOnAllHosts(
                'EtcdDatabase: Cannot put key "{}": Timeout on all hosts!'.
                format(key))

    def get(self, key) -> Optional[bytes]:
        """Return the value stored under ``key``, or None when the response holds no value.

        Raises:
            TimeoutOnAllHosts: when no host returned a (non-empty) response.
        """
        _validate_key(key)

        formatted_data = self._format_data({'key': key})

        response_data = self._send('/kv/range', formatted_data)

        if not response_data:
            raise TimeoutOnAllHosts(
                'EtcdDatabase: Cannot get key "{}": Timeout on all hosts!'.
                format(key))

        # Guard against both a missing and an empty 'kvs' list.
        kvs = response_data.get('kvs')
        if kvs and 'value' in kvs[0]:
            return base64.b64decode(kvs[0]['value'])

        return None
Пример #11
0
class KafkaStorage(Storage):
    """rst
    Storage for saving metrics in Kafka.

    - ``topic``: **Str**

        name of a kafka topic where message should be saved

    - ``brokers_ips``: **List[IpPort]** = *"127.0.0.1:9092"*

        list of addresses with ports of all kafka brokers (kafka nodes)

    - ``max_timeout_in_seconds``: **Numeric(0, 5)** = *0.5*

        if a message was not delivered in maximum_timeout seconds
        self.store will throw FailedDeliveryException

    - ``extra_config``: **Dict[Str, Str]** = *None*

        additional key value pairs that will be passed to kafka driver
        https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
        e.g. {'debug':'broker,topic,msg'} to enable logging for kafka producer threads

    - ``ssl``: **Optional[SSL]** = *None*

        secure socket layer object
    """
    topic: Str
    brokers_ips: List[IpPort] = field(default=("127.0.0.1:9092", ))
    max_timeout_in_seconds: Numeric(0, 5) = 0.5  # defaults half of a second
    extra_config: Dict[Str, Str] = None
    ssl: Optional[SSL] = None

    def __post_init__(self) -> None:
        """Check the kafka dependency, apply the SSL settings and create the producer.

        Raises:
            KafkaConsumerInitializationException: when the SSL configuration
                or the producer creation fails.
        """
        check_kafka_dependency()
        try:
            self._get_ssl_config()
            self.producer = create_kafka_consumer(self.brokers_ips,
                                                  self.extra_config)
        except Exception as e:
            log.exception('Exception during kafka consumer initialization:')
            raise KafkaConsumerInitializationException(str(e)) from e

        # Used to pass an error from within callback_on_delivery
        # (called from a different thread) to the KafkaStorage instance.
        self.error_from_callback = None

    def _get_ssl_config(self) -> None:
        """Translate the ``ssl`` object into librdkafka settings in ``extra_config``.

        https://github.com/edenhill/librdkafka/wiki/Using-SSL-with-librdkafka

        Does nothing when ``ssl`` is None. Values coming from the SSL object
        replace the matching ``extra_config`` entries (with a warning).

        Raises:
            SSLConfigError: when verification is requested without a CA path,
                or when client cert and key are not both provided.
        """
        if self.ssl is None:
            return

        if self.extra_config is None:
            self.extra_config = dict()

        self.extra_config['security.protocol'] = 'ssl'

        if isinstance(self.ssl.server_verify, str):
            if 'ssl.ca.location' in self.extra_config:
                log.warning(
                    'KafkaStorage `ssl.ca.location` in config replaced with SSL object!'
                )
            self.extra_config['ssl.ca.location'] = self.ssl.server_verify
        elif self.ssl.server_verify is True:
            raise SSLConfigError(
                "It's necessary to provide CA cert path if you want to check it!"
            )

        client_certs = self.ssl.get_client_certs()
        if not isinstance(client_certs, tuple):
            raise SSLConfigError(
                "It's necessary to provide both client cert and key paths!")

        if 'ssl.certificate.location' in self.extra_config:
            log.warning('KafkaStorage `ssl.certificate.location` '
                        'in config replaced with SSL object!')
        self.extra_config['ssl.certificate.location'] = client_certs[0]

        if 'ssl.key.location' in self.extra_config:
            log.warning('KafkaStorage `ssl.key.location` '
                        'in config replaced with SSL object!')
        self.extra_config['ssl.key.location'] = client_certs[1]

        if 'ssl.cipher.suites' in self.extra_config:
            log.warning('KafkaStorage SSL uses extra config cipher suites!')
        else:
            self.extra_config['ssl.cipher.suites'] = SECURE_CIPHERS

        if 'ssl.enabled.protocols' in self.extra_config:
            log.warning('KafkaStorage SSL `ssl.enabled.protocols` not supported!')
            self.extra_config.pop('ssl.enabled.protocols')

    def callback_on_delivery(self, err, msg) -> None:
        """Called once for each message produced to indicate delivery result.
        Triggered by poll() or flush()."""
        if err is not None:
            self.error_from_callback = err
            log.error('KafkaStorage failed to send message; error message: {}'.
                      format(err))
        else:
            log.log(
                logger.TRACE,
                'KafkaStorage succeeded to send message; message: {}'.format(
                    msg))

    @staticmethod
    def divide_message(msg):
        """Split an over-sized message into chunks that Kafka will accept.

        Kafka rejects over-sized messages (the original note mentions 1 MB),
        so a message whose UTF-8 size reaches ``MAX_SIZE`` bytes is cut on
        line boundaries. Comment lines (starting with ``#``) always stay in
        the same chunk as the first non-comment line that follows them.

        Returns:
            A list of chunks; joined together they reproduce every line of
            ``msg`` exactly once and in order. A message below the limit is
            returned unchanged as a single-element list. A single unit larger
            than ``MAX_SIZE`` becomes a chunk of its own.
        """
        MAX_SIZE = 10**5

        def encoded_size(text):
            # Size in bytes of what is actually sent to Kafka.
            return len(text.encode('utf-8'))

        if encoded_size(msg) < MAX_SIZE:
            return [msg]

        lines = msg.split('\n')
        if lines[-1] == '':
            # A trailing newline yields one empty last element - not a line.
            lines.pop()

        chunks = []
        chunk_lines, chunk_size = [], 0
        unit_lines, unit_size = [], 0  # comments + the sample line after them
        for line in lines:
            entry = line + '\n'
            unit_lines.append(entry)
            unit_size += encoded_size(entry)
            if line.startswith('#'):
                # Keep collecting until the line the comments describe.
                continue

            if chunk_lines and chunk_size + unit_size > MAX_SIZE:
                chunks.append(''.join(chunk_lines))
                chunk_lines, chunk_size = [], 0
            chunk_lines.extend(unit_lines)
            chunk_size += unit_size
            unit_lines, unit_size = [], 0

        # Trailing comment lines (if any) and the last, still open chunk.
        chunk_lines.extend(unit_lines)
        if chunk_lines:
            chunks.append(''.join(chunk_lines))

        return chunks

    def store(self, metrics: List[Metric]) -> None:
        """Stores synchronously metrics in kafka.

        The function returns only after sending the message -
        by using synchronous self.producer.flush to block until
        the message (metrics) are delivered to the kafka.

        Raises:
            * InconvertibleToPrometheusExpositionFormat - if metrics are not convertible
                into prometheus exposition format.
            * FailedDeliveryException - if a message could not be written to kafka.
        """

        if not metrics:
            log.warning('Empty list of metrics, store is skipped!')
            return

        is_convertible, error_message = is_convertable_to_prometheus_exposition_format(
            metrics)
        if not is_convertible:
            log.error('KafkaStorage failed to convert metrics into '
                      'prometheus exposition format; error: "{}"'.format(
                          error_message))
            raise InconvertibleToPrometheusExpositionFormat(error_message)

        timestamp = get_current_time()

        msg = convert_to_prometheus_exposition_format(metrics, timestamp)
        for message in self.divide_message(msg):
            self.producer.produce(self.topic,
                                  message.encode('utf-8'),
                                  callback=self.callback_on_delivery)
            # Block until everything queued so far has been sent.
            r = self.producer.flush(self.max_timeout_in_seconds)

            # check if timeout expired
            if r > 0:
                raise FailedDeliveryException(
                    "Maximum timeout {} for sending message had passed.".
                    format(self.max_timeout_in_seconds))

            # check if any failed to be delivered
            if self.error_from_callback is not None:
                # Reset the shared error slot before raising, keeping the
                # original value for the exception message.
                error_from_callback__original_ref = self.error_from_callback
                self.error_from_callback = None

                raise FailedDeliveryException(
                    "Message has failed to be writen to kafka. API error message: {}."
                    .format(error_from_callback__original_ref))

            log.debug(
                'KafkaStorage: Message size=%i with timestamp=%s stored in kafka topic=%r',
                len(message), timestamp, self.topic)

        return  # the message has been send to kafka
Пример #12
0
from wca.config import assure_type, ValidationError, WeakValidationError, \
    Url, Path, Numeric, Str, IpPort


class Foo:
    # Empty user-defined class; an instance of it is paired with the class
    # itself (and with List[Foo]) in the parametrized type-check cases.
    pass


class FooEnum(Enum):
    # Two-member enum; the parametrized cases pair FooEnum with both a member
    # (FooEnum.BAR) and a raw value (1).
    BAR = 1
    BAZ = 2


@pytest.mark.parametrize(
    'value, expected_type',
    [(1, int), (1, Numeric(0, 3)), (3.5, Numeric(2., 5.)), (1.2, float),
     (True, bool), (True, Optional[bool]), (None, Optional[bool]),
     (1, Optional[int]), (None, Optional[int]), ('str', str),
     ('str', Union[str, float]), (1.2, Union[str, float]), (Foo(), Foo),
     ([Foo()], List[Foo]), ([[1]], List[List[int]]), ({
         'x': 2
     }, Dict[str, int]), ({
         'x': 2.5
     }, Dict[str, Union[int, float]]),
     ({
         2: {
             'x': 2.5
         }
     }, Dict[int, Dict[str, Union[int, float]]]), (FooEnum.BAR, FooEnum),
     (1, FooEnum), (1, Numeric(0, 3)), (3.5, Numeric(2., 5.)),
     ('small_string', Str), ('small_string', Str()),
class KubernetesNode(Node):
    """Node implementation that discovers ready pods through the kubelet API."""
    # We need to know what cgroup driver is used to properly build cgroup paths for pods.
    #   Reference in source code for kubernetes version stable 1.13:
    #   https://github.com/kubernetes/kubernetes/blob/v1.13.3/pkg/kubelet/cm/cgroup_manager_linux.go#L207
    cgroup_driver: CgroupDriverType = field(
        default_factory=lambda: CgroupDriverType(CgroupDriverType.CGROUPFS))

    ssl: Optional[SSL] = None

    # By default use localhost, however kubelet may not listen on it.
    kubelet_endpoint: Url = 'https://127.0.0.1:10250'

    # Timeout to access kubernetes agent.
    timeout: Numeric(1, 60) = 5  # [s]

    # List of namespaces to monitor pods in.
    monitored_namespaces: List[Str] = field(
        default_factory=lambda: ["default"])

    def _request_kubelet(self):
        """Fetch the pod list from the kubelet and return the decoded JSON.

        Raises:
            requests.exceptions.HTTPError: for a non-success HTTP status
                (raised by ``raise_for_status``).
        """
        PODS_PATH = '/pods'
        full_url = urljoin(self.kubelet_endpoint, PODS_PATH)

        if self.ssl:
            s = requests.Session()
            try:
                s.mount(self.kubelet_endpoint, HTTPSAdapter())
                r = s.get(full_url,
                          json=dict(type='GET_STATE'),
                          timeout=self.timeout,
                          verify=self.ssl.server_verify,
                          cert=self.ssl.get_client_certs())
            finally:
                # The body is already downloaded (no streaming), so the
                # pooled connections can be released right away.
                s.close()
        else:
            r = requests.get(full_url,
                             json=dict(type='GET_STATE'),
                             timeout=self.timeout)

        r.raise_for_status()

        return r.json()

    def get_tasks(self) -> List[Task]:
        """Returns only running tasks.

        A pod becomes a task when it has container statuses, lives in one of
        the monitored namespaces and all of its containers are ready.

        Raises:
            TaskSynchronizationException: when the kubelet cannot be reached.
        """
        try:
            kubelet_json_response = self._request_kubelet()
        except requests.exceptions.ConnectionError as e:
            raise TaskSynchronizationException('%s' % e) from e

        tasks = []
        # 'items' may be missing or null (e.g. no pods); treat it as empty.
        for pod in kubelet_json_response.get('items') or []:
            status = pod.get('status')
            metadata = pod.get('metadata')

            container_statuses = status.get('containerStatuses')
            if not container_statuses:
                # Lacking needed information.
                continue

            # Ignore pods in not monitored namespaces.
            namespace = metadata.get('namespace')
            if namespace not in self.monitored_namespaces:
                continue

            # Read into variables essential information about pod.
            pod_id = metadata.get('uid')
            pod_name = metadata.get('name')
            qos = status.get('qosClass').lower()
            task_name = namespace + "/" + pod_name
            assert QosClass.has_value(qos)

            pod_labels = metadata.get('labels')
            if pod_labels:
                labels = {
                    _sanitize_label(key): value
                    for key, value in pod_labels.items()
                }
            else:
                labels = {}
            # Add label with QOS class of the pod.
            labels[_sanitize_label(QOS_LABELNAME)] = qos

            # Collect the cgroup of every container; the pod is skipped
            # entirely as soon as one container is found not ready.
            containers_cgroups = []
            for container in container_statuses:
                if not container.get('ready'):
                    container_state = list(container.get('state').keys())[0]
                    log.debug(
                        'Ignore pod with uid={} name={}. Container {} is in state={} .'
                        .format(pod_id, pod_name, container.get('name'),
                                container_state))
                    break

                container_id = container.get('containerID').split(
                    'docker://')[1]
                containers_cgroups.append(
                    _build_cgroup_path(self.cgroup_driver, qos, pod_id,
                                       container_id))
            else:
                # Reached only when no container broke out of the loop.
                log.debug(
                    'Pod with uid={} name={} is ready and monitored by the system.'
                    .format(pod_id, pod_name))

                container_spec = pod.get('spec').get('containers')
                tasks.append(
                    KubernetesTask(
                        name=task_name,
                        task_id=pod_id,
                        qos=qos,
                        labels=labels,
                        resources=_calculate_pod_resources(container_spec),
                        cgroup_path=_build_cgroup_path(self.cgroup_driver, qos,
                                                       pod_id),
                        subcgroups_paths=containers_cgroups))

        _log_found_tasks(tasks)

        return tasks
Пример #14
0
class MesosNode(Node):
    """rst
    Class to communicate with orchestrator: Mesos.
    Derived from abstract Node class providing get_tasks interface.

    - ``mesos_agent_endpoint``: **Url** = *'https://127.0.0.1:5051'*

        By default localhost.

    - ``timeout``: **Numeric(1, 60)** = *5*

        Timeout to access mesos agent [seconds].

    - ``ssl``: **Optional[SSL]** = *None*

        ssl object used to communicate with mesos agent
    """
    mesos_agent_endpoint: Url = 'https://127.0.0.1:5051'

    # Timeout to access mesos agent.
    timeout: Numeric(1, 60) = 5.  # [s]

    # https://github.com/kennethreitz/requests/blob/5c1f72e80a7d7ac129631ea5b0c34c7876bc6ed7/requests/api.py#L41
    ssl: Optional[SSL] = None

    # Value of the "type" field of the JSON call sent to the agent.
    METHOD = 'GET_STATE'
    # Path joined with ``mesos_agent_endpoint`` to build the request URL.
    api_path = '/api/v1'

    def __post_init__(self):
        log.info('Mesos task discovery on: %r', self.mesos_agent_endpoint)

    def get_tasks(self):
        """Return the tasks that the Mesos agent reports as running.

        Posts a ``GET_STATE`` call to the agent and builds one ``MesosTask``
        for every launched task whose last status is
        ``MESOS_TASK_STATE_RUNNING``. Tasks without any status, without an
        ``executor_pid`` or without a resolvable cgroup are skipped.

        Returns:
            List of ``MesosTask`` objects (empty when nothing is launched).

        Raises:
            TaskSynchronizationException: on connection error or read timeout.
            requests.exceptions.HTTPError: when the agent answers with an
                error status code (raised by ``raise_for_status``).
        """
        full_url = urllib.parse.urljoin(self.mesos_agent_endpoint, self.api_path)

        try:
            if self.ssl:
                # The context manager closes the session (and its pooled
                # connections); the response body is already read by then.
                with requests.Session() as s:
                    s.mount(self.mesos_agent_endpoint, HTTPSAdapter())
                    r = s.post(
                            full_url,
                            json=dict(type=self.METHOD),
                            timeout=self.timeout,
                            verify=self.ssl.server_verify,
                            cert=self.ssl.get_client_certs())
            else:
                r = requests.post(
                        full_url,
                        json=dict(type=self.METHOD),
                        timeout=self.timeout)
        except requests.exceptions.ConnectionError as e:
            raise TaskSynchronizationException('%s' % e) from e
        except requests.exceptions.ReadTimeout as e:
            # Handled the same way as in KubernetesNode.get_tasks.
            raise TaskSynchronizationException('timeout: %s' % e) from e

        r.raise_for_status()
        state = r.json()

        tasks = []

        # Fast return path if there are no launched tasks.
        if 'launched_tasks' not in state['get_state']['get_tasks']:
            return []

        for launched_task in state['get_state']['get_tasks']['launched_tasks']:
            statuses = launched_task.get('statuses')
            if not statuses:
                continue

            last_status = statuses[-1]  # Assume the last one is the latest state # TODO: confirm
            if last_status['state'] != MESOS_TASK_STATE_RUNNING:
                continue

            if 'executor_pid' not in last_status['container_status']:
                log.warning("'executor_pid' not found in container status for task %s on agent %s",
                            last_status['task_id']['value'],
                            last_status['agent_id']['value'])
                continue

            executor_pid = last_status['container_status']['executor_pid']
            task_name = launched_task['name']

            try:
                cgroup_path = find_cgroup(executor_pid)
            except MesosCgroupNotFoundException as e:
                log.warning('Cannot determine proper cgroup for task=%r! '
                            'Ignoring this task. Reason: %s', task_name, e)
                continue

            # A task launched without labels may not carry the 'labels' key at
            # all; treat that as "no labels" instead of failing with KeyError.
            raw_labels = (launched_task.get('labels') or {}).get('labels') or []
            labels = {sanitize_label(label['key']): label['value']
                      for label in raw_labels}

            # Extract scalar resources.
            resources = calculate_scalar_resources(launched_task['resources'])

            tasks.append(
                MesosTask(
                    name=task_name,
                    executor_pid=executor_pid,
                    cgroup_path=cgroup_path,
                    subcgroups_paths=[],
                    container_id=last_status['container_status']['container_id']['value'],
                    task_id=last_status['task_id']['value'],
                    agent_id=last_status['agent_id']['value'],
                    executor_id=last_status['executor_id']['value'],
                    labels=labels,
                    resources=resources
                )
            )

        return tasks
Пример #15
0
class KafkaStorage(Storage):
    """Storage for saving metrics in Kafka.

    Args:
        topic: name of a kafka topic where message should be saved
        brokers_ips:  list of addresses with ports of all kafka brokers (kafka nodes)
        max_timeout_in_seconds: if a message was not delivered in
            max_timeout_in_seconds seconds self.store will throw FailedDeliveryException
        extra_config: additional key value pairs that will be passed to kafka driver
            https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
            e.g. {'debug':'broker,topic,msg'} to enable logging for kafka producer threads
    """
    topic: Str
    brokers_ips: List[IpPort] = field(default=("127.0.0.1:9092",))
    max_timeout_in_seconds: Numeric(0, 5) = 0.5  # defaults half of a second
    extra_config: Dict[Str, Str] = None

    def __post_init__(self) -> None:
        """Check the kafka dependency and create the producer.

        Raises:
            KafkaConsumerInitializationException: when the producer cannot be
                created; the original exception is chained as the cause.
        """
        check_kafka_dependency()
        try:
            self.producer = create_kafka_consumer(self.brokers_ips, self.extra_config)
        except Exception as e:
            log.exception('Exception during kafka consumer initialization:')
            raise KafkaConsumerInitializationException(str(e)) from e
        # Used to pass an error from within callback_on_delivery
        # (called from different thread) to the KafkaStorage instance.
        self.error_from_callback = None

    def callback_on_delivery(self, err, msg) -> None:
        """Called once for each message produced to indicate delivery result.
        Triggered by poll() or flush()."""
        if err is not None:
            # Remember the error so that store() can report it to its caller.
            self.error_from_callback = err
            log.error(
                'KafkaStorage failed to send message; error message: {}'.format(err))
        else:
            log.log(logger.TRACE,
                    'KafkaStorage succeeded to send message; message: {}'.format(msg))

    def store(self, metrics: List[Metric]) -> None:
        """Stores synchronously metrics in kafka.

        The function returns only after sending the message -
        by using synchronous self.producer.flush to block until
        the message (metrics) are delivered to the kafka.
        An empty list of metrics is skipped with a warning.

        Raises:
            * InconvertibleToPrometheusExpositionFormat - if metrics are not convertible
                into prometheus exposition format.
            * FailedDeliveryException - if a message could not be written to kafka.
        """

        if not metrics:
            log.warning('Empty list of metrics, store is skipped!')
            return

        is_convertible, error_message = is_convertable_to_prometheus_exposition_format(metrics)
        if not is_convertible:
            log.error('KafkaStorage failed to convert metrics into '
                      'prometheus exposition format; error: "{}"'
                      .format(error_message))
            raise InconvertibleToPrometheusExpositionFormat(error_message)

        timestamp = get_current_time()

        msg = convert_to_prometheus_exposition_format(metrics, timestamp)
        self.producer.produce(self.topic, msg.encode('utf-8'),
                              callback=self.callback_on_delivery)

        r = self.producer.flush(self.max_timeout_in_seconds)  # block until all send

        # check if timeout expired
        if r > 0:
            raise FailedDeliveryException(
                "Maximum timeout {} for sending message had passed.".format(
                    self.max_timeout_in_seconds))

        # check if any failed to be delivered
        if self.error_from_callback is not None:
            # Reset the stored error before raising, keeping a reference
            # to the original value to pass it to the exception.
            error_from_callback__original_ref = self.error_from_callback
            self.error_from_callback = None

            raise FailedDeliveryException(
                "Message has failed to be writen to kafka. API error message: {}.".format(
                    error_from_callback__original_ref))

        # The message has been sent to kafka.
        log.debug('message size=%i with timestamp=%s stored in kafka topic=%r',
                  len(msg), timestamp, self.topic)
Пример #16
0
    def __init__(
            self,
            node: Node,
            metrics_storage: Storage = DEFAULT_STORAGE,
            interval: Numeric(0, 60) = 1.,
            rdt_enabled: Optional[bool] = None,
            gather_hw_mm_topology: Optional[bool] = None,
            extra_labels: Optional[Dict[Str, Str]] = None,
            event_names: Optional[List[str]] = None,
            perf_aggregate_cpus: bool = True,
            enable_derived_metrics: bool = False,
            uncore_event_names: Optional[List[Union[List[str], str]]] = None,
            task_label_generators: Optional[Dict[str, TaskLabelGenerator]] = None,
            allocation_configuration: Optional[AllocationConfiguration] = None,
            wss_reset_cycles: Optional[int] = None,
            wss_stable_cycles: int = 0,
            wss_membw_threshold: Optional[float] = None,
            include_optional_labels: bool = False,
            zoneinfo: Union[Str, bool] = True,
            vmstat: Union[Str, bool] = True,
            sched: Union[Str, bool] = False,
    ):
        """Store the configuration and validate the regexp based options.

        Arguments not listed below are only stored on the instance under the
        matching private attribute.

        Args:
            extra_labels: optional mapping; values are converted with ``str``.
            event_names: perf event names. A name containing ``'__r'`` is
                treated as a dynamic raw event and gets an entry registered in
                the module-level ``METRICS_METADATA``. ``None`` (the default)
                means no events.
            perf_aggregate_cpus: when false, ``levels`` of every entry of
                ``METRICS_METADATA`` with source ``PERF_SUBSYSTEM_WITH_CGROUPS``
                or ``DERIVED_PERF_WITH_CGROUPS`` is set to ``['cpu']``.
            uncore_event_names: stored as ``_uncore_events``; ``None`` (the
                default) means an empty list.
            task_label_generators: ``None`` is replaced with an empty dict.
            zoneinfo: ``True`` uses ``zoneinfo_module.DEFAULT_REGEXP``, ``False``
                disables collection, any other value is used as the regexp.
                The regexp has to compile and contain exactly two groups.
            vmstat: ``True``/``False`` is stored as is, any other value is
                compiled as a regexp.
            sched: ``True``/``False`` is stored as is, any other value is
                compiled as a regexp.

        Raises:
            ValidationError: when one of the regexps does not compile or the
                zoneinfo regexp does not have two groups.
        """

        self._node = node
        self._metrics_storage = metrics_storage
        self._interval = interval
        self._rdt_enabled = rdt_enabled
        self._gather_hw_mm_topology = gather_hw_mm_topology
        self._include_optional_labels = include_optional_labels

        self._extra_labels = {k: str(v) for k, v in
                              extra_labels.items()} if extra_labels else dict()
        log.debug('Extra labels: %r', self._extra_labels)
        self._finish = False  # Guard to stop iterations.
        self._last_iteration = time.time()  # Used internally by wait function.
        self._allocation_configuration = allocation_configuration
        # A fresh list per instance - a list literal used as the default value
        # would be shared by every instance created without this argument.
        self._event_names = [] if event_names is None else event_names
        self._perf_aggregate_cpus = perf_aggregate_cpus

        # TODO: fix those workarounds for dynamic levels and dynamic perf event metrics.
        # First add dynamic metrics
        for event_name in self._event_names:
            # is dynamic raw event
            if '__r' in event_name:
                log.debug('Creating metadata for dynamic metric: %r', event_name)
                METRICS_METADATA[event_name] = MetricMetadata(
                    'Hardware PMU counter (raw event)',
                    MetricType.COUNTER,
                    MetricUnit.NUMERIC,
                    MetricSource.PERF_SUBSYSTEM_WITH_CGROUPS,
                    MetricGranularity.TASK,
                    [],
                    'no (event_names)',
                )
        # We have to modify levels for all metrics:
        # set proper levels based on perf_aggregate_cpus value.
        if not perf_aggregate_cpus:
            log.debug('Enabling "cpu" level for PERF_SUBSYSTEM_WITH_CGROUPS and derived metrics.')
            for metric_metadata in METRICS_METADATA.values():
                if metric_metadata.source == MetricSource.PERF_SUBSYSTEM_WITH_CGROUPS:
                    metric_metadata.levels = ['cpu']
                if metric_metadata.source == MetricSource.DERIVED_PERF_WITH_CGROUPS:
                    metric_metadata.levels = ['cpu']

        self._enable_derived_metrics = enable_derived_metrics
        self._uncore_events = [] if uncore_event_names is None else uncore_event_names

        self._task_label_generators = task_label_generators or {}

        self._wss_reset_cycles = wss_reset_cycles
        self._wss_stable_cycles = wss_stable_cycles
        self._wss_membw_threshold = wss_membw_threshold

        self._uncore_pmu = None

        self._initialize_rdt_callback = None
        self._iterate_body_callback = None
        self._cached_bandwidth = None

        if zoneinfo is True:
            self._zoneinfo = zoneinfo
            zoneinfo_regexp = zoneinfo_module.DEFAULT_REGEXP
            log.debug('Enabled zoneinfo collection')
        elif zoneinfo is False:
            self._zoneinfo = zoneinfo
            log.debug('Disabled zoneinfo collection')
            zoneinfo_regexp = None
        else:
            # Anything else is taken as a user provided regexp.
            zoneinfo_regexp = zoneinfo
            self._zoneinfo = True

        # Validate zoneinfo regexp.
        log.debug('zoneinfo=%r regexp=%r', self._zoneinfo, zoneinfo_regexp)
        self._zoneinfo_regexp_compiled = None
        if self._zoneinfo:
            try:
                self._zoneinfo_regexp_compiled = re.compile(zoneinfo_regexp)
            except re.error as e:
                raise ValidationError('zoneinfo_regexp_compile improper regexp: %s' % e) from e

            if self._zoneinfo_regexp_compiled.groups != 2:
                raise ValidationError(
                    'zoneinfo_regexp_compile improper number of groups: should be 2')

        # Validate config and vmstat regexp.
        if vmstat in (True, False):
            self._vmstat = vmstat
        else:
            # Got regexp - compile and check...
            try:
                self._vmstat = re.compile(vmstat)
            except re.error as e:
                raise ValidationError('vmstat_regexp_compile improper regexp: %s' % e) from e

        # Validate config and sched regexp.
        if sched in (True, False):
            self._sched = sched
        else:
            # Got regexp - compile and check...
            try:
                self._sched = re.compile(sched)
            except re.error as e:
                raise ValidationError('sched regex compile improper regexp: %s' % e) from e
 def reschedule_interval(self, interval: Numeric(0, 60)):
     """Call ``self.reschedule()`` in an endless loop, pausing between runs.

     After every ``reschedule()`` call the current thread sleeps for
     ``interval`` (passed unchanged to ``time.sleep``). The loop has no exit
     condition, so the method only ends when one of those calls raises.
     """
     while True:
         # One scheduling pass ...
         self.reschedule()
         # ... followed by the pause before the next one.
         time.sleep(interval)
class KubernetesNode(Node):
    """rst
    Class to communicate with orchestrator: Kubernetes.
    Derived from abstract Node class providing get_tasks interface.

    - ``cgroup_driver``: **CgroupDriverType** = *CgroupDriverType.CGROUPFS*

        We need to know what cgroup driver is used to properly build cgroup paths for pods.
        Reference in source code for kubernetes version stable 1.13:
        https://github.com/kubernetes/kubernetes/blob/v1.13.3/pkg/kubelet/cm/cgroup_manager_linux.go#L207


    - ``ssl``: **Optional[SSL]** = *None*

        ssl object used to communicate with kubernetes

    - ``client_token_path``: **Optional[Path]** = *SERVICE_TOKEN_FILENAME*

        Default path is using by pods. You can override it to use wca outside pod.

    - ``server_cert_ca_path``: **Optional[Path]** = *SERVICE_CERT_FILENAME*

        Default path is using by pods. You can override it to use wca outside pod.

    - ``kubelet_enabled``: **bool** = *False*

        If true use **kubelet**, otherwise **kubeapi**.

    - ``kubelet_endpoint``: **Url** = *'https://127.0.0.1:10250'*

        By default use localhost.

    - ``kubeapi_host``: **Str** = *None*

    - ``kubeapi_port``: **Str** = *None*

    - ``node_ip``: **Str** = *None*

    - ``timeout``: **Numeric(1, 60)** = *5*

        Timeout to access kubernetes agent [seconds].

    - ``monitored_namespaces``: **List[Str]** =  *["default"]*

        List of namespaces to monitor pods in.
    """
    cgroup_driver: CgroupDriverType = CgroupDriverType.CGROUPFS
    ssl: Optional[SSL] = None

    client_token_path: Optional[Path(absolute=True,
                                     mode=os.R_OK)] = SERVICE_TOKEN_FILENAME
    server_cert_ca_path: Optional[Path(absolute=True,
                                       mode=os.R_OK)] = SERVICE_CERT_FILENAME

    kubelet_enabled: bool = False
    kubelet_endpoint: Url = 'https://127.0.0.1:10250'

    kubeapi_host: Str = None
    kubeapi_port: Str = None  # Because !Env is String and another type cast might be problematic
    node_ip: Str = None

    timeout: Numeric(1, 60) = 5  # [s]

    monitored_namespaces: List[Str] = field(
        default_factory=lambda: ["default"])

    def _request_kubeapi(self):
        """Return pods of all monitored namespaces as listed by kubeapi.

        One GET request is sent per namespace, authorized with the bearer
        token read from ``client_token_path`` and verified against
        ``server_cert_ca_path``.

        Raises:
            requests.exceptions.HTTPError: when a response has an error status.
        """
        kubeapi_endpoint = "https://{}:{}".format(self.kubeapi_host,
                                                  self.kubeapi_port)
        log.debug("Created kubeapi endpoint %s", kubeapi_endpoint)

        with pathlib.Path(self.client_token_path).open() as f:
            service_token = f.read()

        pod_list_from_all_namespaces = []
        for namespace in self.monitored_namespaces:
            full_url = urljoin(kubeapi_endpoint,
                               "/api/v1/namespaces/{}/pods".format(namespace))

            r = requests.get(
                full_url,
                headers={"Authorization": "Bearer {}".format(service_token)},
                timeout=self.timeout,
                verify=self.server_cert_ca_path)

            if not r.ok:
                # Log the response body - r.raw would only print an object repr.
                log.error(
                    'An unexpected error occurred for namespace "%s": %i %s - %s',
                    namespace, r.status_code, r.reason, r.text)
            r.raise_for_status()

            # A response without 'items' contributes no pods.
            pod_list_from_namespace = r.json().get('items') or []
            pod_list_from_all_namespaces.extend(pod_list_from_namespace)

        return pod_list_from_all_namespaces

    def _request_kubelet(self):
        """Return pods listed by the kubelet ``/pods`` endpoint.

        Raises:
            requests.exceptions.HTTPError: when the response has an error status.
        """
        PODS_PATH = '/pods'
        full_url = urljoin(self.kubelet_endpoint, PODS_PATH)

        if self.ssl:
            # The context manager closes the session (and its pooled
            # connections); the response body is already read by then.
            with requests.Session() as s:
                s.mount(self.kubelet_endpoint, HTTPSAdapter())
                r = s.get(full_url,
                          json=dict(type='GET_STATE'),
                          timeout=self.timeout,
                          verify=self.ssl.server_verify,
                          cert=self.ssl.get_client_certs())
        else:
            r = requests.get(full_url,
                             json=dict(type='GET_STATE'),
                             timeout=self.timeout)

        if not r.ok:
            # Log the response body - r.raw would only print an object repr.
            log.error('%i %s - %s', r.status_code, r.reason, r.text)
        r.raise_for_status()

        # A response without 'items' means no pods.
        return r.json().get('items') or []

    def get_tasks(self) -> List[Task]:
        """Returns only running tasks.

        A pod becomes a task only when it belongs to this node (kubeapi mode)
        or to a monitored namespace (kubelet mode), reports container
        statuses and all of its containers are ready.

        Raises:
            ValueError: when kubeapi is used and ``node_ip`` is not configured.
            TaskSynchronizationException: on connection error or read timeout.
        """
        # Configuration problem - report it before any request is sent.
        if not self.kubelet_enabled and self.node_ip is None:
            raise ValueError("node_ip is not set in config")

        try:
            if self.kubelet_enabled:
                podlist_json_response = self._request_kubelet()
            else:
                podlist_json_response = self._request_kubeapi()
        except requests.exceptions.ConnectionError as e:
            raise TaskSynchronizationException('connection error: %s' %
                                               e) from e
        except requests.exceptions.ReadTimeout as e:
            raise TaskSynchronizationException('timeout: %s' % e) from e

        # Only needed (and guaranteed to be set) in kubeapi mode.
        node_ip = None if self.kubelet_enabled else self.node_ip.strip()

        tasks = []
        for pod in podlist_json_response:
            container_statuses = pod.get('status').get('containerStatuses')

            # Kubeapi returns all pods in cluster. A pod may have no 'hostIP'
            # yet; such a pod cannot belong to this node, so skip it as well.
            if not self.kubelet_enabled and \
                    pod.get('status').get('hostIP') != node_ip:
                continue

            # Kubelet return all pods on the node. Ignore pods in not monitored namespaces.
            if self.kubelet_enabled and \
                    pod.get('metadata').get('namespace') not in self.monitored_namespaces:
                continue

            # Lacking needed information.
            if not container_statuses:
                continue

            # Read into variables essential information about pod.
            pod_id = pod.get('metadata').get('uid')
            pod_name = pod.get('metadata').get('name')
            qos = pod.get('status').get('qosClass').lower()
            task_name = pod.get('metadata').get('namespace') + "/" + pod_name
            assert QosClass.has_value(qos)
            if pod.get('metadata').get('labels'):
                labels = {
                    _sanitize_label(key): value
                    for key, value in pod.get('metadata').get(
                        'labels').items()
                }
            else:
                labels = {}
            labels[_sanitize_label(
                QOS_LABELNAME)] = qos  # Add label with QOS class of the pod.

            # Apart from obvious part of the loop it checks whether all
            # containers are in ready state -
            # if at least one is not ready then skip this pod.
            containers_cgroups = []
            are_all_containers_ready = True
            for container in container_statuses:
                if not container.get('ready'):
                    are_all_containers_ready = False
                    container_state = list(container.get('state').keys())[0]
                    log.debug(
                        'Ignore pod with uid={} name={}. Container {} is in state={} .'
                        .format(pod_id, pod_name, container.get('name'),
                                container_state))
                    break

                container_id = container.get('containerID').split(
                    'docker://')[1]
                containers_cgroups.append(
                    _build_cgroup_path(self.cgroup_driver, qos, pod_id,
                                       container_id))
            if not are_all_containers_ready:
                continue

            log.debug(
                'Pod with uid={} name={} is ready and monitored by the system.'
                .format(pod_id, pod_name))

            container_spec = pod.get('spec').get('containers')
            tasks.append(
                KubernetesTask(
                    name=task_name,
                    task_id=pod_id,
                    qos=qos,
                    labels=labels,
                    resources=calculate_pod_resources(container_spec),
                    cgroup_path=_build_cgroup_path(self.cgroup_driver, qos,
                                                   pod_id),
                    subcgroups_paths=containers_cgroups))

        _log_found_tasks(tasks)

        return tasks
    def __init__(
            self,
            node: Node,
            metrics_storage: Storage = DEFAULT_STORAGE,
            interval: Numeric(0, 60) = 1.,
            rdt_enabled: Optional[bool] = None,
            gather_hw_mm_topology: bool = False,
            extra_labels: Optional[Dict[Str, Str]] = None,
            event_names: Optional[List[str]] = None,
            perf_aggregate_cpus: bool = True,
            enable_derived_metrics: bool = False,
            enable_perf_uncore: Optional[bool] = None,
            task_label_generators: Optional[Dict[str, TaskLabelGenerator]] = None,
            allocation_configuration: Optional[AllocationConfiguration] = None,
            wss_reset_interval: int = 0,
            include_optional_labels: bool = False
    ):
        """Store the configuration on the instance.

        Arguments not listed below are only stored on the instance under the
        matching private attribute.

        Args:
            extra_labels: optional mapping; values are converted with ``str``.
            event_names: perf event names. A name containing ``'__r'`` is
                treated as a dynamic raw event and gets an entry registered in
                the module-level ``METRICS_METADATA``. ``None`` (the default)
                means no events.
            perf_aggregate_cpus: when false, ``levels`` of every entry of
                ``METRICS_METADATA`` with source ``PERF_SUBSYSTEM_WITH_CGROUPS``
                is set to ``['cpu']``.
            task_label_generators: when ``None``, default generators for the
                ``application`` and ``application_version_name`` labels are
                created.
        """

        self._node = node
        self._metrics_storage = metrics_storage
        self._interval = interval
        self._rdt_enabled = rdt_enabled
        self._gather_hw_mm_topology = gather_hw_mm_topology
        self._include_optional_labels = include_optional_labels

        self._extra_labels = {k: str(v) for k, v in
                              extra_labels.items()} if extra_labels else dict()
        log.debug('Extra labels: %r', self._extra_labels)
        self._finish = False  # Guard to stop iterations.
        self._last_iteration = time.time()  # Used internally by wait function.
        self._allocation_configuration = allocation_configuration
        # A fresh list per instance - a list literal used as the default value
        # would be shared by every instance created without this argument.
        self._event_names = [] if event_names is None else event_names
        log.info('Enabling %i perf events: %s', len(self._event_names),
                 ', '.join(self._event_names))
        self._perf_aggregate_cpus = perf_aggregate_cpus

        # TODO: fix those workarounds for dynamic levels and dynamic perf event metrics.
        # First add dynamic metrics
        for event_name in self._event_names:
            # is dynamic raw event
            if '__r' in event_name:
                log.debug('Creating metadata for dynamic metric: %r', event_name)
                METRICS_METADATA[event_name] = MetricMetadata(
                    'Hardware PMU counter (raw event)',
                    MetricType.COUNTER,
                    MetricUnit.NUMERIC,
                    MetricSource.PERF_SUBSYSTEM_WITH_CGROUPS,
                    MetricGranularity.TASK,
                    [],
                    'no (event_names)',
                )
        # We have to modify levels for all metrics:
        # set proper levels based on perf_aggregate_cpus value.
        if not perf_aggregate_cpus:
            log.debug('Enabling "cpu" level for PERF_SUBSYSTEM_WITH_CGROUPS metrics.')
            for metric_metadata in METRICS_METADATA.values():
                if metric_metadata.source == MetricSource.PERF_SUBSYSTEM_WITH_CGROUPS:
                    metric_metadata.levels = ['cpu']

        self._enable_derived_metrics = enable_derived_metrics
        self._enable_perf_uncore = enable_perf_uncore

        # Default value for task_labels_generator.
        if task_label_generators is None:
            self._task_label_generators = {
                'application':
                    TaskLabelRegexGenerator('$', '', 'task_name'),
                'application_version_name':
                    TaskLabelRegexGenerator('.*$', '', 'task_name'),
            }
        else:
            self._task_label_generators = task_label_generators

        self._wss_reset_interval = wss_reset_interval

        self._uncore_pmu = None

        self._initialize_rdt_callback = None
        self._iterate_body_callback = None