Example #1
    def _load_conf(self, instance):
        self._excluded_filesystems = instance.get('excluded_filesystems', [])
        self._excluded_disks = instance.get('excluded_disks', [])
        self._excluded_mountpoint_re = re.compile(
            instance.get('excluded_mountpoint_re', '^$'))
        self._tag_by_filesystem = _is_affirmative(
            instance.get('tag_by_filesystem', False))
        self._all_partitions = _is_affirmative(
            instance.get('all_partitions', False))
        self._device_tag_re = instance.get('device_tag_re', {})
        self._custom_tags = instance.get('tags', [])
        self._service_check_rw = _is_affirmative(
            instance.get('service_check_rw', False))

        # Force exclusion of CDROM (iso9660) from disk check
        self._excluded_filesystems.append('iso9660')

        # FIXME: 6.x, drop use_mount option in datadog.conf
        self._load_legacy_option(instance,
                                 'use_mount',
                                 False,
                                 operation=_is_affirmative)
        # FIXME: 6.x, drop device_blacklist_re option in datadog.conf
        self._load_legacy_option(instance,
                                 'excluded_disk_re',
                                 '^$',
                                 legacy_name='device_blacklist_re',
                                 operation=re.compile)
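
Every example on this page funnels loosely typed YAML config values through _is_affirmative before treating them as booleans. The helper ships with the Datadog agent; the sketch below is only an approximation of its semantics for readers without the agent source at hand, not the actual implementation.

def _is_affirmative(value):
    """Rough sketch: None is False, numbers and bools go by truthiness,
    and the usual affirmative strings ('yes', 'true', '1') count as True."""
    if value is None:
        return False
    if isinstance(value, (bool, int)):
        return bool(value)
    return str(value).strip().lower() in ('yes', 'true', '1')


# Typical values seen in instance configs:
assert _is_affirmative('yes') is True
assert _is_affirmative(False) is False
assert _is_affirmative('0') is False
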
Example #2
    def check_health_v1(self, config, tags):
        url = config['api_url'] + '/sys/health'
        health_data = self.access_api(url, config, tags).json()

        cluster_name = health_data.get('cluster_name')
        if cluster_name:
            tags.append('cluster_name:{}'.format(cluster_name))

        vault_version = health_data.get('version')
        if vault_version:
            tags.append('vault_version:{}'.format(vault_version))

        unsealed = not _is_affirmative(health_data.get('sealed'))
        if unsealed:
            self.service_check(self.SERVICE_CHECK_UNSEALED,
                               AgentCheck.OK,
                               tags=tags)
        else:
            self.service_check(self.SERVICE_CHECK_UNSEALED,
                               AgentCheck.CRITICAL,
                               tags=tags)

        initialized = _is_affirmative(health_data.get('initialized'))
        if initialized:
            self.service_check(self.SERVICE_CHECK_INITIALIZED,
                               AgentCheck.OK,
                               tags=tags)
        else:
            self.service_check(self.SERVICE_CHECK_INITIALIZED,
                               AgentCheck.CRITICAL,
                               tags=tags)
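
For orientation, the fields read above come from Vault's /v1/sys/health endpoint. An illustrative payload (not captured from a real server) showing only the keys the check touches:

health_data = {
    "cluster_name": "vault-cluster-eu1",  # tagged as cluster_name:<value>
    "version": "0.10.1",                  # tagged as vault_version:<value>
    "sealed": False,                      # drives SERVICE_CHECK_UNSEALED
    "initialized": True,                  # drives SERVICE_CHECK_INITIALIZED
}
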
Example #3
    def check(self, instance):
        try:
            directory = instance['directory']
        except KeyError:
            raise Exception('DirectoryCheck: missing `directory` in config')

        abs_directory = abspath(directory)
        name = instance.get('name', directory)
        pattern = instance.get('pattern')
        recursive = _is_affirmative(instance.get('recursive', False))
        dirtagname = instance.get('dirtagname', 'name')
        filetagname = instance.get('filetagname', 'filename')
        filegauges = _is_affirmative(instance.get('filegauges', False))
        countonly = _is_affirmative(instance.get('countonly', False))
        ignore_missing = _is_affirmative(instance.get('ignore_missing', False))
        custom_tags = instance.get('tags', [])

        if not exists(abs_directory):
            if ignore_missing:
                self.log.info(
                    'DirectoryCheck: the directory `{}` does not exist. Skipping.'
                    .format(abs_directory))
                return

            raise Exception(
                'DirectoryCheck: the directory `{}` does not exist'.format(abs_directory))

        self._get_stats(abs_directory, name, dirtagname, filetagname,
                        filegauges, pattern, recursive, countonly, custom_tags)
Example #4
    def check(self, instance):
        instance_name = instance.get('name')
        if instance_name is None:
            raise Exception("Each instance must have a unique name")

        ssl_validation = _is_affirmative(instance.get('ssl_validation', True))

        server = instance.get('server')
        if server is None:
            raise Exception("Each instance must have a server")

        build_conf = instance.get('build_configuration')
        if build_conf is None:
            raise Exception("Each instance must have a build configuration")

        host = instance.get('host_affected') or self.hostname
        tags = instance.get('tags')
        is_deployment = _is_affirmative(instance.get('is_deployment', False))
        basic_http_authentication = _is_affirmative(
            instance.get('basic_http_authentication', False))

        self._initialize_if_required(instance_name, server, build_conf,
                                     ssl_validation, basic_http_authentication)

        # Look for new successful builds
        if basic_http_authentication:
            new_build_url = self.NEW_BUILD_URL_AUTHENTICATED.format(
                server=server,
                build_conf=build_conf,
                since_build=self.last_build_ids[instance_name])
        else:
            new_build_url = self.NEW_BUILD_URL.format(
                server=server,
                build_conf=build_conf,
                since_build=self.last_build_ids[instance_name])

        try:
            resp = requests.get(new_build_url,
                                timeout=self.DEFAULT_TIMEOUT,
                                headers=self.HEADERS,
                                verify=ssl_validation)
            resp.raise_for_status()

            new_builds = resp.json()

            if new_builds["count"] == 0:
                self.log.debug("No new builds found.")
            else:
                self._build_and_send_event(new_builds["build"][0],
                                           instance_name, is_deployment, host,
                                           tags)
        except requests.exceptions.HTTPError:
            self.log.exception(
                "Couldn't fetch last build, got code {0}".format(
                    resp.status_code))
            raise
        except Exception:
            self.log.exception(
                "Couldn't fetch last build, unhandled exception")
            raise
Example #5
    def check(self, instance):
        if "directory" not in instance:
            raise Exception('DirectoryCheck: missing "directory" in config')

        directory = instance["directory"]
        abs_directory = abspath(directory)
        name = instance.get("name", directory)
        pattern = instance.get("pattern", "*")
        recursive = _is_affirmative(instance.get("recursive", False))
        dirtagname = instance.get("dirtagname", "name")
        filetagname = instance.get("filetagname", "filename")
        filegauges = _is_affirmative(instance.get("filegauges", False))
        countonly = _is_affirmative(instance.get("countonly", False))
        ignore_missing = _is_affirmative(instance.get("ignore_missing", False))
        custom_tags = instance.get("tags", [])

        if not exists(abs_directory):
            if ignore_missing:
                self.log.info("DirectoryCheck: the directory (%s) does not exist. "
                              "Skipping." % abs_directory)
                return

            raise Exception("DirectoryCheck: the directory (%s) does not exist"
                            % abs_directory)

        self._get_stats(abs_directory, name, dirtagname, filetagname,
                        filegauges, pattern, recursive, countonly, custom_tags)
Example #6
    def check(self, instance):
        instance_name = instance.get("name")
        if instance_name is None:
            raise Exception("Each instance must have a unique name")

        server = instance.get("server")
        if server is None:
            raise Exception("Each instance must have a server")

        # Check the server URL for HTTP or HTTPS designation,
        #   fall back to http:// if no scheme present (allows for backwards compatibility).
        server = self._normalize_server_url(server)

        build_conf = instance.get("build_configuration")
        if build_conf is None:
            raise Exception("Each instance must have a build configuration")

        host = instance.get("host_affected") or self.hostname
        tags = instance.get("tags")
        is_deployment = _is_affirmative(instance.get("is_deployment", False))
        basic_http_authentication = _is_affirmative(
            instance.get("basic_http_authentication", False))

        self._initialize_if_required(instance_name, server, build_conf,
                                     basic_http_authentication)

        # Look for new successful builds
        if basic_http_authentication:
            new_build_url = self.NEW_BUILD_URL_AUTHENTICATED.format(
                server=server,
                build_conf=build_conf,
                since_build=self.last_build_ids[instance_name])
        else:
            new_build_url = self.NEW_BUILD_URL.format(
                server=server,
                build_conf=build_conf,
                since_build=self.last_build_ids[instance_name])

        try:
            resp = self.http.get(new_build_url)
            resp.raise_for_status()

            new_builds = resp.json()

            if new_builds["count"] == 0:
                self.log.debug("No new builds found.")
            else:
                self._build_and_send_event(new_builds["build"][0],
                                           instance_name, is_deployment, host,
                                           tags)
        except requests.exceptions.HTTPError:
            self.log.exception("Couldn't fetch last build, got code {}".format(
                resp.status_code))
            raise
        except Exception:
            self.log.exception(
                "Couldn't fetch last build, unhandled exception")
            raise
Example #7
    def check(self, instance):

        # Get properties from conf file
        rm_address = instance.get('resourcemanager_uri', DEFAULT_RM_URI)
        app_tags = instance.get('application_tags', {})
        queue_blacklist = instance.get('queue_blacklist', [])

        if type(app_tags) is not dict:
            self.log.error("application_tags is incorrect: {} is not a dictionary".format(app_tags))
            app_tags = {}

        filtered_app_tags = {}
        for dd_prefix, yarn_key in app_tags.iteritems():
            if yarn_key in self._ALLOWED_APPLICATION_TAGS:
                filtered_app_tags[dd_prefix] = yarn_key
        app_tags = filtered_app_tags

        # Collected by default
        app_tags['app_name'] = 'name'

        # Authenticate our connection to endpoint if required
        username = instance.get('username')
        password = instance.get('password')
        auth = None
        if username is not None and password is not None:
            auth = (username, password)

        # Option to disable verifying ssl certificate
        ssl_verify = _is_affirmative(instance.get('ssl_verify', True))

        # Get additional tags from the conf file
        custom_tags = instance.get('tags', [])
        tags = list(set(custom_tags))

        # Get the cluster name from the conf file
        cluster_name = instance.get('cluster_name')
        if cluster_name is None:
            self.warning(
                "The cluster_name must be specified in the instance configuration, "
                "defaulting to '{}'".format(DEFAULT_CLUSTER_NAME)
            )
            cluster_name = DEFAULT_CLUSTER_NAME

        tags.append('cluster_name:{}'.format(cluster_name))

        # Get metrics from the Resource Manager
        self._yarn_cluster_metrics(rm_address, auth, ssl_verify, tags)
        if _is_affirmative(instance.get('collect_app_metrics', DEFAULT_COLLECT_APP_METRICS)):
            self._yarn_app_metrics(rm_address, auth, ssl_verify, app_tags, tags)
        self._yarn_node_metrics(rm_address, auth, ssl_verify, tags)
        self._yarn_scheduler_metrics(rm_address, auth, ssl_verify, tags, queue_blacklist)
Example #8
    def _load_conf(self, instance):
        tags = instance.get("tags", [])
        ip_address = instance["ip_address"]
        metrics = instance.get('metrics', [])
        if _is_affirmative(instance.get('use_global_metrics', True)):
            metrics.extend(self.init_config.get('global_metrics', []))
        timeout = int(instance.get('timeout', self.DEFAULT_TIMEOUT))
        retries = int(instance.get('retries', self.DEFAULT_RETRIES))
        enforce_constraints = _is_affirmative(
            instance.get('enforce_mib_constraints', True))
        snmp_engine, mib_view_controller = self.create_snmp_engine(
            self.mibs_path)

        return snmp_engine, mib_view_controller, ip_address, tags, metrics, timeout, retries, enforce_constraints
Example #9
    def check(self, instance):
        # Metrics collection
        endpoint = instance.get('prometheus_endpoint')
        if endpoint is None:
            raise CheckException(
                "Unable to find prometheus_endpoint in config file.")

        # By default we send the buckets
        send_buckets = _is_affirmative(
            instance.get('send_histograms_buckets', True))
        custom_tags = instance.get('tags', [])

        try:
            self.process(endpoint,
                         send_histograms_buckets=send_buckets,
                         instance=instance)
            self.service_check(self.PROMETHEUS_SERVICE_CHECK_NAME,
                               PrometheusCheck.OK,
                               tags=custom_tags)
        except requests.exceptions.ConnectionError as e:
            # Unable to connect to the metrics endpoint
            self.service_check(
                self.PROMETHEUS_SERVICE_CHECK_NAME,
                PrometheusCheck.CRITICAL,
                message="Unable to retrieve Prometheus metrics from endpoint {}: {}".format(
                    endpoint, e.message),
                tags=custom_tags,
            )

        # Service check to check Gitlab's health endpoints
        for check_type in self.ALLOWED_SERVICE_CHECKS:
            self._check_health_endpoint(instance, check_type, custom_tags)
Example #10
    def _cache_morlist_raw(self, instance):
        """
        Initiate the first layer to refresh the list of MORs (`self.morlist`).

        Resolve the vCenter `rootFolder` and initiate hosts and virtual machines discovery.

        """

        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        for resource_type in RESOURCE_TYPE_METRICS:
            if i_key in self.morlist_raw and len(self.morlist_raw[i_key].get(
                    resource_type, [])) > 0:
                last = self.cache_config.get_last(CacheConfig.Morlist, i_key)
                self.log.debug(
                    "Skipping morlist collection now, RAW results "
                    "processing not over (latest refresh was {}s ago)".format(
                        time.time() - last))
                return
        self.morlist_raw[i_key] = {}

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        include_only_marked = _is_affirmative(
            instance.get('include_only_marked', False))

        # Discover hosts and virtual machines
        self.pool.apply_async(self._cache_morlist_raw_atomic,
                              args=(instance, [instance_tag], regexes,
                                    include_only_marked))

        self.cache_config.set_last(CacheConfig.Morlist, i_key, time.time())
Example #11
    def _verify_ssl(self, instance):
        # Load the ssl configuration
        ssl_cert_validation = _is_affirmative(
            instance.get('ssl_cert_validation', True))
        ssl_ca_certs = instance.get('ssl_ca_certs', True)

        return ssl_ca_certs if ssl_cert_validation else False
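
The value returned here is meant to be handed to the HTTP layer: requests accepts True (use the default CA bundle), a path to a CA bundle, or False for its verify argument. A standalone, hedged mirror of that flow; the instance keys are the ones used above, while the paths and URL are illustrative:

def verify_from_instance(instance):
    """Standalone mirror of _verify_ssl above (the real method also runs
    ssl_cert_validation through _is_affirmative)."""
    ssl_cert_validation = instance.get('ssl_cert_validation', True)
    ssl_ca_certs = instance.get('ssl_ca_certs', True)
    return ssl_ca_certs if ssl_cert_validation else False


instance = {'ssl_cert_validation': True, 'ssl_ca_certs': '/etc/ssl/certs/ca-bundle.pem'}
verify = verify_from_instance(instance)  # -> '/etc/ssl/certs/ca-bundle.pem'
# requests.get('https://vault.example.com/v1/sys/health', verify=verify)  # hypothetical call site
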
Example #12
    def check(self, instance):
        status_url = instance.get('status_url')
        ping_url = instance.get('ping_url')
        ping_reply = instance.get('ping_reply')

        auth = None
        user = instance.get('user')
        password = instance.get('password')

        tags = instance.get('tags', [])
        http_host = instance.get('http_host')

        timeout = instance.get('timeout', DEFAULT_TIMEOUT)

        disable_ssl_validation = _is_affirmative(
            instance.get('disable_ssl_validation', False))

        if user and password:
            auth = (user, password)

        if status_url is None and ping_url is None:
            raise BadConfigError(
                "No status_url or ping_url specified for this instance")

        pool = None
        if status_url is not None:
            try:
                pool = self._process_status(status_url, auth, tags, http_host,
                                            timeout, disable_ssl_validation)
            except Exception as e:
                self.log.error("Error running php_fpm check: {}".format(e))

        if ping_url is not None:
            self._process_ping(ping_url, ping_reply, auth, tags, pool,
                               http_host, timeout, disable_ssl_validation)
Example #13
    def check_leader_v1(self, config, tags):
        url = config['api_url'] + '/sys/leader'
        leader_data = self.access_api(url, config, tags).json()

        is_leader = _is_affirmative(leader_data.get('is_self'))
        tags.append('is_leader:{}'.format('true' if is_leader else 'false'))

        current_leader = leader_data.get('leader_address')
        previous_leader = config['leader']
        if config['detect_leader'] and current_leader:
            if previous_leader is not None and current_leader != previous_leader:
                self.event({
                    'timestamp': timestamp(),
                    'event_type': self.EVENT_LEADER_CHANGE,
                    'msg_title': 'Leader change',
                    'msg_text': 'Leader changed from `{}` to `{}`.'.format(
                        previous_leader, current_leader),
                    'alert_type': 'info',
                    'source_type_name': self.CHECK_NAME,
                    'host': self.hostname,
                    'tags': tags,
                })
            config['leader'] = current_leader
Example #14
    def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
        use_sudo = _is_affirmative(instance.get('use_sudo', False))
        ceph_args = []
        if use_sudo:
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo != 0:
                raise Exception('The dd-agent user does not have sudo access')
            ceph_args = 'sudo {}'.format(ceph_cmd)
        else:
            ceph_args = ceph_cmd

        ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)

        raw = {}
        for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats',
                    'osd perf', 'health detail'):
            try:
                args = '{} {} -fjson'.format(ceph_args, cmd)
                output, _, _ = get_subprocess_output(args.split(), self.log)
                res = json.loads(output)
            except Exception as e:
                self.log.warning('Unable to parse data from cmd=%s: %s' %
                                 (cmd, str(e)))
                continue

            name = cmd.replace(' ', '_')
            raw[name] = res

        return raw
Example #15
    def _cache_morlist_raw(self, instance):
        """
        Initiate the first layer to refresh the list of MORs (`self.morlist`).

        Resolve the vCenter `rootFolder` and initiate hosts and virtual machines discovery.

        """

        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)
        for resource_type in RESOURCE_TYPE_MAP:
            if i_key in self.morlist_raw and len(self.morlist_raw[i_key].get(
                    resource_type, [])) > 0:
                self.log.debug(
                    "Skipping morlist collection now, RAW results "
                    "processing not over (latest refresh was {0}s ago)".format(
                        time.time() - self.cache_times[i_key][MORLIST][LAST]))
                return
        self.morlist_raw[i_key] = {}

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        include_only_marked = _is_affirmative(
            instance.get('include_only_marked', False))

        # Discover hosts and virtual machines
        self._discover_mor(instance, [instance_tag], regexes,
                           include_only_marked)

        self.cache_times[i_key][MORLIST][LAST] = time.time()
Example #16
    def check(self, instance):
        url = instance.get("url")
        username = instance.get("username")
        password = instance.get("password")
        custom_tags = instance.get('tags', [])
        max_queues = int(instance.get("max_queues", MAX_ELEMENTS))
        max_topics = int(instance.get("max_topics", MAX_ELEMENTS))
        max_subscribers = int(instance.get("max_subscribers", MAX_ELEMENTS))
        detailed_queues = instance.get("detailed_queues", [])
        detailed_topics = instance.get("detailed_topics", [])
        detailed_subscribers = instance.get("detailed_subscribers", [])
        suppress_errors = _is_affirmative(
            instance.get("suppress_errors", False))

        tags = custom_tags + ["url:{0}".format(url)]

        self.log.debug("Processing ActiveMQ data for %s" % url)
        data = self._fetch_data(url, QUEUE_URL, username, password,
                                suppress_errors)
        if data:
            self._process_data(data, "queue", tags, max_queues,
                               detailed_queues)

        data = self._fetch_data(url, TOPIC_URL, username, password,
                                suppress_errors)
        if data:
            self._process_data(data, "topic", tags, max_topics,
                               detailed_topics)

        data = self._fetch_data(url, SUBSCRIBER_URL, username, password,
                                suppress_errors)
        if data:
            self._process_subscriber_data(data, tags, max_subscribers,
                                          detailed_subscribers)
Example #17
    def get_config(self, instance):
        instance_id = hash_mutable(instance)
        config = self.config.get(instance_id)
        if config is None:
            config = {}

            try:
                api_url = instance['api_url']
                api_version = api_url[-1]
                if api_version not in self.api_versions:
                    self.log.warning(
                        'Unknown Vault API version `{}`, using version '
                        '`{}`'.format(api_version, self.DEFAULT_API_VERSION))

                config['api_url'] = api_url
                config['api'] = self.api_versions.get(
                    api_version, self.DEFAULT_API_VERSION)['functions']
            except KeyError:
                self.log.error(
                    'Vault configuration setting `api_url` is required')
                return

            client_token = instance.get('client_token')
            config['headers'] = {
                'X-Vault-Token': client_token
            } if client_token else None

            username = instance.get('username')
            password = instance.get('password')
            config['auth'] = (username,
                              password) if username and password else None

            config['ssl_verify'] = _is_affirmative(
                instance.get('ssl_verify', True))
            config['proxies'] = self.get_instance_proxy(
                instance, config['api_url'])
            config['timeout'] = int(instance.get('timeout', 20))
            config['tags'] = instance.get('tags', [])

            # Keep track of the previous cluster leader to detect changes.
            config['leader'] = None
            config['detect_leader'] = _is_affirmative(
                instance.get('detect_leader'))

            self.config[instance_id] = config

        return config
Example #18
    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.cluster_name = None
        for instance in instances or []:
            url = instance.get('url', '')
            parsed_url = urlparse(url)
            ssl_verify = not _is_affirmative(instance.get('disable_ssl_validation', False))
            if not ssl_verify and parsed_url.scheme == 'https':
                self.log.warning('Skipping SSL cert validation for %s based on configuration.' % url)
Example #19
    def _get_pg_attrs(self, instance):
        if _is_affirmative(instance.get('use_psycopg2', False)):
            if psycopg2 is None:
                self.log.error("Unable to import psycopg2, falling back to pg8000")
            else:
                return psycopg2_connect, psycopg2.InterfaceError, psycopg2.ProgrammingError

        # Let's use pg8000
        return pg8000.connect, pg8000.InterfaceError, pg8000.ProgrammingError
Example #20
    def _should_process(self, data_dict, collect_aggregates_only):
        """if collect_aggregates_only, we process only the aggregates
        """
        if _is_affirmative(collect_aggregates_only):
            return self._is_aggregate(data_dict)
        elif str(collect_aggregates_only).lower() == 'both':
            return True

        return data_dict['svname'] != Services.BACKEND
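
The docstring above is terse: collect_aggregates_only effectively has three modes (truthy, the string 'both', and everything else). A hedged, standalone restatement of that decision table, assuming the HAProxy convention that aggregate rows carry svname values of FRONTEND or BACKEND:

def should_process(svname, collect_aggregates_only):
    """Standalone restatement of the logic above; assumes _is_aggregate
    means the row's svname is FRONTEND or BACKEND."""
    mode = str(collect_aggregates_only).strip().lower()
    if mode in ('yes', 'true', '1'):
        return svname in ('FRONTEND', 'BACKEND')  # aggregates only
    if mode == 'both':
        return True                               # aggregates and per-server rows
    return svname != 'BACKEND'                    # default: everything except BACKEND rows
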
Example #21
def from_instance(instance):
    """
    Create a config object from an instance dictionary
    """
    url = instance.get('url')
    if not url:
        raise ConfigurationError("A URL must be specified in the instance")

    pshard_stats = _is_affirmative(instance.get('pshard_stats', False))
    pshard_graceful_to = _is_affirmative(
        instance.get('pshard_graceful_timeout', False))
    index_stats = _is_affirmative(instance.get('index_stats', False))
    cluster_stats = _is_affirmative(instance.get('cluster_stats', False))
    if 'is_external' in instance:
        cluster_stats = _is_affirmative(instance.get('is_external', False))
    pending_task_stats = _is_affirmative(
        instance.get('pending_task_stats', True))
    admin_forwarder = _is_affirmative(instance.get('admin_forwarder', False))

    # Support URLs that have a path in them from the config, for
    # backwards-compatibility.
    parsed = urlparse.urlparse(url)
    if parsed[2] and not admin_forwarder:
        url = '{}://{}'.format(parsed[0], parsed[1])
    port = parsed.port
    host = parsed.hostname

    custom_tags = instance.get('tags', [])
    service_check_tags = [
        'host:{}'.format(host),
        'port:{}'.format(port),
    ]
    service_check_tags.extend(custom_tags)

    # Tag by URL so we can differentiate the metrics
    # from multiple instances
    tags = ['url:{}'.format(url)]
    tags.extend(custom_tags)

    timeout = instance.get('timeout') or DEFAULT_TIMEOUT

    config = ESInstanceConfig(admin_forwarder=admin_forwarder,
                              pshard_stats=pshard_stats,
                              pshard_graceful_to=pshard_graceful_to,
                              cluster_stats=cluster_stats,
                              index_stats=index_stats,
                              password=instance.get('password'),
                              service_check_tags=service_check_tags,
                              health_tags=[],
                              ssl_cert=instance.get('ssl_cert'),
                              ssl_key=instance.get('ssl_key'),
                              ssl_verify=instance.get('ssl_verify'),
                              tags=tags,
                              timeout=timeout,
                              url=url,
                              username=instance.get('username'),
                              pending_task_stats=pending_task_stats)
    return config
Example #22
    def get_instance_config(self, instance):
        url = instance.get('url')
        if url is None:
            raise Exception("A URL must be specified in the instance")

        pshard_stats = _is_affirmative(instance.get('pshard_stats', False))
        pshard_graceful_to = _is_affirmative(instance.get('pshard_graceful_timeout', False))
        index_stats = _is_affirmative(instance.get('index_stats', False))
        cluster_stats = _is_affirmative(instance.get('cluster_stats', False))
        if 'is_external' in instance:
            cluster_stats = _is_affirmative(instance.get('is_external', False))

        pending_task_stats = _is_affirmative(instance.get('pending_task_stats', True))
        admin_forwarder = _is_affirmative(instance.get('admin_forwarder', False))
        # Support URLs that have a path in them from the config, for
        # backwards-compatibility.
        parsed = urlparse.urlparse(url)
        if parsed[2] != "" and not admin_forwarder:
            url = "%s://%s" % (parsed[0], parsed[1])
        port = parsed.port
        host = parsed.hostname

        custom_tags = instance.get('tags', [])
        service_check_tags = [
            'host:%s' % host,
            'port:%s' % port
        ]
        service_check_tags.extend(custom_tags)

        # Tag by URL so we can differentiate the metrics
        # from multiple instances
        tags = ['url:%s' % url]
        tags.extend(custom_tags)

        timeout = instance.get('timeout') or self.DEFAULT_TIMEOUT

        config = ESInstanceConfig(
            admin_forwarder=admin_forwarder,
            pshard_stats=pshard_stats,
            pshard_graceful_to=pshard_graceful_to,
            cluster_stats=cluster_stats,
            index_stats=index_stats,
            password=instance.get('password'),
            service_check_tags=service_check_tags,
            health_tags=[],
            ssl_cert=instance.get('ssl_cert'),
            ssl_key=instance.get('ssl_key'),
            ssl_verify=instance.get('ssl_verify'),
            tags=tags,
            timeout=timeout,
            url=url,
            username=instance.get('username'),
            pending_task_stats=pending_task_stats
        )
        return config
Example #23
    def check(self, instance):
        if 'url' not in instance:
            raise Exception('Mesos instance missing "url" value.')

        url = instance['url']
        instance_tags = instance.get('tags', [])
        if instance_tags is None:
            instance_tags = []
        tasks = instance.get('tasks', [])
        default_timeout = self.init_config.get('default_timeout', 5)
        timeout = float(instance.get('timeout', default_timeout))
        master_port = instance.get("master_port", DEFAULT_MASTER_PORT)
        ssl_verify = not _is_affirmative(instance.get('disable_ssl_validation', False))

        state_metrics = self._get_constant_attributes(url, timeout, master_port, ssl_verify, instance_tags)
        tags = None

        if state_metrics is None:
            state_metrics = self._get_state(url, timeout, ssl_verify, instance_tags)
        if state_metrics:
            tags = ['mesos_pid:{0}'.format(state_metrics['pid']), 'mesos_node:slave']
            if self.cluster_name:
                tags.append('mesos_cluster:{0}'.format(self.cluster_name))

            tags += instance_tags
            for task in tasks:
                for framework in state_metrics['frameworks']:
                    for executor in framework['executors']:
                        for t in executor['tasks']:
                            if task.lower() in t['name'].lower() and t['slave_id'] == state_metrics['id']:
                                task_tags = ['task_name:' + t['name']] + tags
                                self.service_check(t['name'] + '.ok', self.TASK_STATUS[t['state']], tags=task_tags)
                                for key_name, (metric_name, metric_func) in iteritems(self.TASK_METRICS):
                                    metric_func(self, metric_name, t['resources'][key_name], tags=task_tags)

        stats_metrics = self._get_stats(url, timeout, ssl_verify, instance_tags)
        if stats_metrics:
            tags = tags if tags else instance_tags
            metrics = [
                self.SLAVE_TASKS_METRICS,
                self.SYSTEM_METRICS,
                self.SLAVE_RESOURCE_METRICS,
                self.SLAVE_EXECUTORS_METRICS,
                self.STATS_METRICS,
            ]
            for m in metrics:
                for key_name, (metric_name, metric_func) in iteritems(m):
                    if key_name in stats_metrics:
                        metric_func(self, metric_name, stats_metrics[key_name], tags=tags)

        self.service_check_needed = True
Example #24
    def _load_conf(self, instance):
        tags = instance.get("tags", [])
        ip_address = instance["ip_address"]
        metrics = instance.get('metrics', [])
        timeout = int(instance.get('timeout', self.DEFAULT_TIMEOUT))
        retries = int(instance.get('retries', self.DEFAULT_RETRIES))
        enforce_constraints = _is_affirmative(
            instance.get('enforce_mib_constraints', True))

        instance_key = instance['name']
        cmd_generator = self.generators.get(instance_key, None)
        if not cmd_generator:
            cmd_generator = self.create_command_generator(
                self.mibs_path, self.ignore_nonincreasing_oid)
            self.generators[instance_key] = cmd_generator

        return cmd_generator, ip_address, tags, metrics, timeout, retries, enforce_constraints
Example #25
    def __init__(self, name, init_config, agentConfig, instances):
        for instance in instances:
            if 'name' not in instance:
                instance['name'] = self._get_instance_key(instance)

        # Set OID batch size
        self.oid_batch_size = int(
            init_config.get("oid_batch_size", DEFAULT_OID_BATCH_SIZE))

        # Load Custom MIB directory
        self.mibs_path = None
        self.ignore_nonincreasing_oid = False
        if init_config is not None:
            self.mibs_path = init_config.get("mibs_folder")
            self.ignore_nonincreasing_oid = _is_affirmative(
                init_config.get("ignore_nonincreasing_oid", False))

        NetworkCheck.__init__(self, name, init_config, agentConfig, instances)
Example #26
    def get_instance_config(self, instance):
        if 'url' not in instance:
            raise Exception('Marathon instance missing "url" value.')

        # Load values from the instance config
        url = instance['url']
        user = instance.get('user')
        password = instance.get('password')
        acs_url = instance.get('acs_url')
        if user is not None and password is not None:
            auth = (user, password)
        else:
            auth = None
        ssl_verify = not _is_affirmative(
            instance.get('disable_ssl_validation', False))
        group = instance.get('group', None)

        tags = instance.get('tags', [])
        default_timeout = self.init_config.get('default_timeout',
                                               self.DEFAULT_TIMEOUT)
        timeout = float(instance.get('timeout', default_timeout))

        return url, auth, acs_url, ssl_verify, group, tags, timeout
Example #27
    def _connect(self, instance):
        for e in ("access_id", "access_secret"):
            if e not in instance:
                raise Exception("{0} parameter is required.".format(e))

        s3_settings = {
            "aws_access_key_id": instance.get('access_id', None),
            "aws_secret_access_key": instance.get('access_secret', None),
            "proxy": instance.get('host', 'localhost'),
            "proxy_port": int(instance.get('port', 8080)),
            "is_secure": _is_affirmative(instance.get('is_secure', True))
        }

        if instance.get('s3_root'):
            s3_settings['host'] = instance['s3_root']

        aggregation_key = s3_settings['proxy'] + ":" + str(
            s3_settings['proxy_port'])
        tags = instance.get("tags", [])
        if tags is None:
            tags = []
        tags.append("aggregation_key:{0}".format(aggregation_key))

        try:
            s3 = S3Connection(**s3_settings)
        except Exception as e:
            self.log.error("Error connecting to {0}: {1}".format(
                aggregation_key, e))
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=tags,
                               message=str(e))
            raise

        metrics = instance.get("metrics", [])

        return s3, aggregation_key, tags, metrics
Example #28
    def _cache_morlist_raw(self, instance):
        """
        Initiate the first layer to refresh the list of MORs (`self.morlist`).
        Resolve the vCenter `rootFolder` and initiate hosts and virtual machines
        discovery.
        """
        i_key = self._instance_key(instance)
        self.log.debug("Caching the morlist for vcenter instance %s" % i_key)

        # If the queue is not completely empty, don't do anything
        for resource_type in RESOURCE_TYPE_METRICS:
            if (self.mor_objects_queue.contains(i_key)
                    and self.mor_objects_queue.size(i_key, resource_type)):
                last = self.cache_config.get_last(CacheConfig.Morlist, i_key)
                self.log.debug(
                    "Skipping morlist collection: the objects queue for the "
                    "resource type '{}' is still being processed "
                    "(latest refresh was {}s ago)".format(
                        resource_type,
                        time.time() - last))
                return

        instance_tag = "vcenter_server:%s" % instance.get('name')
        regexes = {
            'host_include': instance.get('host_include_only_regex'),
            'vm_include': instance.get('vm_include_only_regex')
        }
        include_only_marked = _is_affirmative(
            instance.get('include_only_marked', False))

        # Discover hosts and virtual machines
        self.pool.apply_async(self._cache_morlist_raw_atomic,
                              args=(instance, [instance_tag], regexes,
                                    include_only_marked))

        self.cache_config.set_last(CacheConfig.Morlist, i_key, time.time())
Example #29
    def _psutil_config_to_stats(self, instance):
        """
        Reads `init_config` for `psutil` methods to call on the current process
        Calls those methods and stores the raw output

        :returns a dictionary of statistic_name: value
        """
        process_metrics = instance.get(
            'process_metrics', self.init_config.get('process_metrics', None))
        if not process_metrics:
            self.log.error('No metrics configured for AgentMetrics check!')
            return {}

        methods, metric_types = zip(*[(p['name'], p.get('type', GAUGE))
                                      for p in process_metrics
                                      if _is_affirmative(p.get('active'))])

        names_to_metric_types = {}
        for i, m in enumerate(methods):
            stat_name = AgentMetrics._get_statistic_name_from_method(m)
            names_to_metric_types[stat_name] = metric_types[i]

        stats = AgentMetrics._collect_internal_stats(methods)
        return stats, names_to_metric_types
Example #30
    def _check_connectivity_to_master(self, instance, tags):
        url = instance.get('gitlab_url')
        if url is None:
            # Simply ignore this service check if not configured
            return

        parsed_url = urlparse(url)
        gitlab_host = parsed_url.hostname
        gitlab_port = 443 if parsed_url.scheme == 'https' else (parsed_url.port or 80)
        service_check_tags = [
            'gitlab_host:{}'.format(gitlab_host),
            'gitlab_port:{}'.format(gitlab_port)
        ]
        service_check_tags.extend(tags)

        # Load the ssl configuration
        ssl_cert_validation = _is_affirmative(
            instance.get('ssl_cert_validation', True))
        ssl_ca_certs = instance.get('ssl_ca_certs', True)

        verify_ssl = ssl_ca_certs if ssl_cert_validation else False

        # Timeout settings
        timeouts = (
            int(
                instance.get('connect_timeout',
                             GitlabRunnerCheck.DEFAULT_CONNECT_TIMEOUT)),
            int(
                instance.get('receive_timeout',
                             GitlabRunnerCheck.DEFAULT_RECEIVE_TIMEOUT)),
        )

        # Auth settings
        auth = None
        if 'gitlab_user' in instance and 'gitlab_password' in instance:
            auth = (instance['gitlab_user'], instance['gitlab_password'])

        try:
            self.log.debug("checking connectivity against {}".format(url))
            r = requests.get(url,
                             auth=auth,
                             verify=verify_ssl,
                             timeout=timeouts,
                             headers=headers(self.agentConfig))
            if r.status_code != 200:
                self.service_check(
                    self.MASTER_SERVICE_CHECK_NAME,
                    OpenMetricsBaseCheck.CRITICAL,
                    message="Got {} when hitting {}".format(
                        r.status_code, url),
                    tags=service_check_tags,
                )
                raise Exception("Http status code {} on url {}".format(
                    r.status_code, url))
            else:
                r.raise_for_status()

        except requests.exceptions.Timeout:
            # If there's a timeout
            self.service_check(
                self.MASTER_SERVICE_CHECK_NAME,
                OpenMetricsBaseCheck.CRITICAL,
                message="Timeout when hitting {}".format(url),
                tags=service_check_tags,
            )
            raise
        except Exception as e:
            self.service_check(
                self.MASTER_SERVICE_CHECK_NAME,
                OpenMetricsBaseCheck.CRITICAL,
                message="Error hitting {}. Error: {}".format(url, e),
                tags=service_check_tags,
            )
            raise
        else:
            self.service_check(self.MASTER_SERVICE_CHECK_NAME,
                               OpenMetricsBaseCheck.OK,
                               tags=service_check_tags)
        self.log.debug("gitlab check succeeded")