class AgentCheck(object): OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3) SOURCE_TYPE_NAME = None DEFAULT_MIN_COLLECTION_INTERVAL = 0 def __init__(self, name, init_config, agentConfig, instances=None): """ Initialize a new check. :param name: The name of the check :param init_config: The config for initializing the check :param agentConfig: The global configuration for the agent :param instances: A list of configuration objects for each instance. """ from aggregator import MetricsAggregator self.name = name self.init_config = init_config or {} self.agentConfig = agentConfig self.in_developer_mode = agentConfig.get( 'developer_mode') and psutil is not None self._internal_profiling_stats = None self.hostname = agentConfig.get('checksd_hostname') or get_hostname( agentConfig) self.log = logging.getLogger('%s.%s' % (__name__, name)) self.aggregator = MetricsAggregator( self.hostname, formatter=agent_formatter, recent_point_threshold=agentConfig.get('recent_point_threshold', None), histogram_aggregates=agentConfig.get('histogram_aggregates'), histogram_percentiles=agentConfig.get('histogram_percentiles')) self.events = [] self.service_checks = [] self.instances = instances or [] self.warnings = [] self.library_versions = None self.last_collection_time = defaultdict(int) self._instance_metadata = [] self.svc_metadata = [] def instance_count(self): """ Return the number of instances that are configured for this check. """ return len(self.instances) def gauge(self, metric, value, tags=None, hostname=None, device_name=None, timestamp=None): """ Record the value of a gauge, with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value of the gauge :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. 
:param device_name: (optional) The device name for this metric :param timestamp: (optional) The timestamp for this metric value """ self.aggregator.gauge(metric, value, tags, hostname, device_name, timestamp) def increment(self, metric, value=1, tags=None, hostname=None, device_name=None): """ Increment a counter with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value to increment by :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.increment(metric, value, tags, hostname, device_name) def decrement(self, metric, value=-1, tags=None, hostname=None, device_name=None): """ Increment a counter with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value to decrement by :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.decrement(metric, value, tags, hostname, device_name) def count(self, metric, value=0, tags=None, hostname=None, device_name=None): """ Submit a raw count with optional tags, hostname and device name :param metric: The name of the metric :param value: The value :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.submit_count(metric, value, tags, hostname, device_name) def monotonic_count(self, metric, value=0, tags=None, hostname=None, device_name=None): """ Submits a raw count with optional tags, hostname and device name based on increasing counter values. E.g. 1, 3, 5, 7 will submit 6 on flush. Note that reset counters are skipped. 
:param metric: The name of the metric :param value: The value of the rate :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.count_from_counter(metric, value, tags, hostname, device_name) def rate(self, metric, value, tags=None, hostname=None, device_name=None): """ Submit a point for a metric that will be calculated as a rate on flush. Values will persist across each call to `check` if there is not enough point to generate a rate on the flush. :param metric: The name of the metric :param value: The value of the rate :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.rate(metric, value, tags, hostname, device_name) def histogram(self, metric, value, tags=None, hostname=None, device_name=None): """ Sample a histogram value, with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value to sample for the histogram :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. :param device_name: (optional) The device name for this metric """ self.aggregator.histogram(metric, value, tags, hostname, device_name) def set(self, metric, value, tags=None, hostname=None, device_name=None): """ Sample a set value, with optional tags, hostname and device name. :param metric: The name of the metric :param value: The value for the set :param tags: (optional) A list of tags for this metric :param hostname: (optional) A hostname for this metric. Defaults to the current hostname. 
:param device_name: (optional) The device name for this metric """ self.aggregator.set(metric, value, tags, hostname, device_name) def event(self, event): """ Save an event. :param event: The event payload as a dictionary. Has the following structure: { "timestamp": int, the epoch timestamp for the event, "event_type": string, the event time name, "agent_key": string, the api key of the account to associate the event with, "msg_title": string, the title of the event, "msg_text": string, the text body of the event, "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info'). Defaults to 'info'. "source_type_name": (optional) string, the source type name, "host": (optional) string, the name of the host, "tags": (optional) list, a list of tags to associate with this event } """ # Events are disabled. return if event.get('agent_key') is None: event['agent_key'] = self.agentConfig['agent_key'] self.events.append(event) def service_check(self, check_name, status, tags=None, timestamp=None, hostname=None, check_run_id=None, message=None): """ Save a service check. :param check_name: string, name of the service check :param status: int, describing the status. 0 for success, 1 for warning, 2 for failure :param tags: (optional) list of strings, a list of tags for this run :param timestamp: (optional) float, unix timestamp for when the run occurred :param hostname: (optional) str, host that generated the service check. Defaults to the host_name of the agent :param check_run_id: (optional) int, id used for logging and tracing purposes. Don't need to be unique. If not specified, one will be generated. """ if hostname is None: hostname = self.hostname if message is not None: message = str(message) self.service_checks.append( create_service_check(check_name, status, tags, timestamp, hostname, check_run_id, message)) def service_metadata(self, meta_name, value): """ Save metadata. 
:param meta_name: metadata key name :type meta_name: string :param value: metadata value :type value: string """ self._instance_metadata.append((meta_name, str(value))) def has_events(self): """ Check whether the check has saved any events @return whether or not the check has saved any events @rtype boolean """ return len(self.events) > 0 def get_metrics(self): """ Get all metrics, including the ones that are tagged. @return the list of samples @rtype [(metric_name, timestamp, value, {"tags": ["tag1", "tag2"]}), ...] """ return self.aggregator.flush() def get_events(self): """ Return a list of the events saved by the check, if any @return the list of events saved by this check @rtype list of event dictionaries """ events = self.events self.events = [] return events def get_service_checks(self): """ Return a list of the service checks saved by the check, if any and clears them out of the instance's service_checks list @return the list of service checks saved by this check @rtype list of service check dicts """ service_checks = self.service_checks self.service_checks = [] return service_checks def _roll_up_instance_metadata(self): """ Concatenate and flush instance metadata. """ self.svc_metadata.append( dict((k, v) for (k, v) in self._instance_metadata)) self._instance_metadata = [] def get_service_metadata(self): """ Return a list of the metadata dictionaries saved by the check -if any- and clears them out of the instance's service_checks list @return the list of metadata saved by this check @rtype list of metadata dicts """ if self._instance_metadata: self._roll_up_instance_metadata() service_metadata = self.svc_metadata self.svc_metadata = [] return service_metadata def has_warnings(self): """ Check whether the instance run created any warnings """ return len(self.warnings) > 0 def warning(self, warning_message): """ Add a warning message that will be printed in the info page :param warning_message: String. 
Warning message to be displayed """ self.warnings.append(str(warning_message)) def get_library_info(self): if self.library_versions is not None: return self.library_versions try: self.library_versions = self.get_library_versions() except NotImplementedError: pass def get_library_versions(self): """ Should return a string that shows which version of the needed libraries are used """ raise NotImplementedError def get_warnings(self): """ Return the list of warnings messages to be displayed in the info page """ warnings = self.warnings self.warnings = [] return warnings @staticmethod def _get_statistic_name_from_method(method_name): return method_name[4:] if method_name.startswith( 'get_') else method_name @staticmethod def _collect_internal_stats(methods=None): current_process = psutil.Process(os.getpid()) methods = methods or DEFAULT_PSUTIL_METHODS filtered_methods = [m for m in methods if hasattr(current_process, m)] stats = {} for method in filtered_methods: # Go from `get_memory_info` -> `memory_info` stat_name = AgentCheck._get_statistic_name_from_method(method) try: raw_stats = getattr(current_process, method)() try: stats[stat_name] = raw_stats._asdict() except AttributeError: if isinstance(raw_stats, numbers.Number): stats[stat_name] = raw_stats else: log.warn( "Could not serialize output of {0} to dict".format( method)) except psutil.AccessDenied: log.warn( "Cannot call psutil method {0} : Access Denied".format( method)) return stats def _set_internal_profiling_stats(self, before, after): self._internal_profiling_stats = {'before': before, 'after': after} def _get_internal_profiling_stats(self): """ If in developer mode, return a dictionary of statistics about the check run """ stats = self._internal_profiling_stats self._internal_profiling_stats = None return stats def run(self): """ Run all instances. 
""" # Store run statistics if needed before, after = None, None if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME: try: before = AgentCheck._collect_internal_stats() except Exception: # It's fine if we can't collect stats for the run, just log and proceed self.log.debug( "Failed to collect Agent Stats before check {0}".format( self.name)) instance_statuses = [] for i, instance in enumerate(self.instances): try: min_collection_interval = instance.get( 'min_collection_interval', self.init_config.get('min_collection_interval', self.DEFAULT_MIN_COLLECTION_INTERVAL)) now = time.time() if now - self.last_collection_time[i] < min_collection_interval: self.log.debug( "Not running instance #{0} of check {1} as it ran less than {2}s ago" .format(i, self.name, min_collection_interval)) continue self.last_collection_time[i] = now check_start_time = None if self.in_developer_mode: check_start_time = timeit.default_timer() self.check(copy.deepcopy(instance)) instance_check_stats = None if check_start_time is not None: instance_check_stats = { 'run_time': timeit.default_timer() - check_start_time } if self.has_warnings(): instance_status = check_status.InstanceStatus( i, check_status.STATUS_WARNING, warnings=self.get_warnings(), instance_check_stats=instance_check_stats) else: instance_status = check_status.InstanceStatus( i, check_status.STATUS_OK, instance_check_stats=instance_check_stats) except Exception, e: self.log.exception("Check '%s' instance #%s failed" % (self.name, i)) instance_status = check_status.InstanceStatus( i, check_status.STATUS_ERROR, error=str(e), tb=traceback.format_exc()) finally:
class AgentCheck(object):
    """Base class for agent checks.

    Subclasses implement ``check(instance)``; this class provides the metric
    submission helpers (gauge/rate/histogram/...), event and service-check
    buffering, and the ``run()`` driver that iterates over the configured
    instances.

    NOTE(review): this file targets Python 2 (``unicode`` is used below).
    """

    # Service check status codes.
    OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3)

    SOURCE_TYPE_NAME = None
    # Extra aggregator expiry slack added on top of min_collection_interval.
    DEFAULT_EXPIRY_SECONDS = 300
    DEFAULT_MIN_COLLECTION_INTERVAL = 0

    # Class-level registry of names of checks that have been instantiated.
    _enabled_checks = []

    @classmethod
    def is_check_enabled(cls, name):
        """Return True if a check with this name has been instantiated."""
        return name in cls._enabled_checks

    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator

        # NOTE(review): appending then rebinding shadows the class attribute
        # with a deduplicated instance-level copy; the class-level list (the
        # one is_check_enabled reads) keeps accumulating duplicates, but
        # membership tests still behave correctly. Preserved as-is.
        self._enabled_checks.append(name)
        self._enabled_checks = list(set(self._enabled_checks))

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        # Developer mode needs psutil to collect profiling statistics.
        self.in_developer_mode = agentConfig.get('developer_mode') and psutil
        self._internal_profiling_stats = None
        self.default_integration_http_timeout = float(
            agentConfig.get('default_integration_http_timeout', 9))

        self.hostname = agentConfig.get('checksd_hostname') or get_hostname(
            agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.min_collection_interval = self.init_config.get(
            'min_collection_interval', self.DEFAULT_MIN_COLLECTION_INTERVAL)

        self.aggregator = MetricsAggregator(
            self.hostname,
            expiry_seconds=self.min_collection_interval +
            self.DEFAULT_EXPIRY_SECONDS,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold',
                                                   None),
            histogram_aggregates=agentConfig.get('histogram_aggregates'),
            histogram_percentiles=agentConfig.get('histogram_percentiles'))

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        # Per-instance-index timestamp of the last successful collection.
        self.last_collection_time = defaultdict(int)
        self._instance_metadata = []
        self.svc_metadata = []
        # historate() state: metric context -> (last value, last timestamp).
        self.historate_dict = {}

        # Set proxy settings
        self.proxy_settings = get_proxy(self.agentConfig)
        self._use_proxy = False if init_config is None else init_config.get(
            "use_agent_proxy", True)
        self.proxies = {
            "http": None,
            "https": None,
        }

        if self.proxy_settings and self._use_proxy:
            uri = "{host}:{port}".format(host=self.proxy_settings['host'],
                                         port=self.proxy_settings['port'])
            if self.proxy_settings['user'] and self.proxy_settings['password']:
                uri = "{user}:{password}@{uri}".format(
                    user=self.proxy_settings['user'],
                    password=self.proxy_settings['password'],
                    uri=uri)
            self.proxies['http'] = "http://{uri}".format(uri=uri)
            self.proxies['https'] = "https://{uri}".format(uri=uri)

    def instance_count(self):
        """ Return the number of instances that are configured for this check. """
        return len(self.instances)

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None,
              timestamp=None):
        """
        Record the value of a gauge, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, tags, hostname, device_name,
                              timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None,
                  device_name=None):
        """
        Increment a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None,
                  device_name=None):
        """
        Decrement a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def count(self, metric, value=0, tags=None, hostname=None,
              device_name=None):
        """
        Submit a raw count with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.submit_count(metric, value, tags, hostname,
                                     device_name)

    def monotonic_count(self, metric, value=0, tags=None, hostname=None,
                        device_name=None):
        """
        Submit a raw count with optional tags, hostname and device name
        based on increasing counter values. E.g. 1, 3, 5, 7 will submit 6
        on flush. Note that reset counters are skipped.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.count_from_counter(metric, value, tags, hostname,
                                           device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Submit a point for a metric that will be calculated as a rate on flush.
        Values will persist across each call to `check` if there is not
        enough point to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None,
                  device_name=None):
        """
        Sample a histogram value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    @classmethod
    def generate_historate_func(cls, excluding_tags):
        """Return a histogram-like submit function bound to historate()."""
        def fct(self, metric, value, tags=None, hostname=None,
                device_name=None):
            cls.historate(self, metric, value, excluding_tags,
                          tags=tags, hostname=hostname,
                          device_name=device_name)

        return fct

    @classmethod
    def generate_histogram_func(cls, excluding_tags):
        """Return a histogram submit function that strips `excluding_tags`."""
        def fct(self, metric, value, tags=None, hostname=None,
                device_name=None):
            # Use a copy of the list to avoid removing tags from original.
            tags = list(tags)
            for tag in list(tags):
                for exc_tag in excluding_tags:
                    if tag.startswith(exc_tag + ":"):
                        tags.remove(tag)
            cls.histogram(self, metric, value, tags=tags, hostname=hostname,
                          device_name=device_name)

        return fct

    def historate(self, metric, value, excluding_tags, tags=None,
                  hostname=None, device_name=None):
        """
        Function to create a histogram metric for "rate" like metrics.
        Warning this doesn't use the harmonic mean, beware of what it
        means when using it.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param excluding_tags: A list of tags that will be removed when
            computing the histogram
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        # FIX: only copy when tags is provided; the original called
        # list(tags) unconditionally, raising TypeError for the documented
        # default tags=None.
        if tags is not None:
            # Use a copy of the list to avoid removing tags from original.
            tags = list(tags)

        # Build a stable identity for this (metric, tags, host, device) combo
        # so successive values can be turned into a rate.
        context = [metric]
        if tags is not None:
            context.append("-".join(sorted(tags)))
        if hostname is not None:
            context.append("host:" + hostname)
        if device_name is not None:
            context.append("device:" + device_name)

        now = time.time()
        context = tuple(context)

        if context in self.historate_dict:
            # Strip the excluded tags only on the submitted sample; they must
            # stay in `context` so the rate pairs up correctly.
            if tags is not None:
                for tag in list(tags):
                    for exc_tag in excluding_tags:
                        if tag.startswith("{0}:".format(exc_tag)):
                            tags.remove(tag)

            prev_value, prev_ts = self.historate_dict[context]
            rate = float(value - prev_value) / float(now - prev_ts)
            self.aggregator.histogram(metric, rate, tags, hostname,
                                      device_name)

        self.historate_dict[context] = (value, now)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Sample a set value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """
        Save an event.

        :param event: The event payload as a dictionary. Has the following structure:

            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
                    Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "tags": (optional) list, a list of tags to associate with this event
            }
        """
        self.events.append(event)

    def service_check(self, check_name, status, tags=None, timestamp=None,
                      hostname=None, check_run_id=None, message=None):
        """
        Save a service check.

        :param check_name: string, name of the service check
        :param status: int, describing the status.
                       0 for success, 1 for warning, 2 for failure
        :param tags: (optional) list of strings, a list of tags for this run
        :param timestamp: (optional) float, unix timestamp for when the run occurred
        :param hostname: (optional) str, host that generated the service
                          check. Defaults to the host_name of the agent
        :param check_run_id: (optional) int, id used for logging and tracing
                             purposes. Doesn't need to be unique. If not
                             specified, one will be generated.
        """
        if hostname is None:
            hostname = self.hostname
        if message is not None:
            message = unicode(
                message)  # ascii converts to unicode but not viceversa
        self.service_checks.append(
            create_service_check(check_name, status, tags, timestamp,
                                 hostname, check_run_id, message))

    def service_metadata(self, meta_name, value):
        """
        Save metadata.

        :param meta_name: metadata key name
        :type meta_name: string
        :param value: metadata value
        :type value: string
        """
        self._instance_metadata.append((meta_name, unicode(value)))

    def has_events(self):
        """
        Check whether the check has saved any events

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self):
        """
        Get all metrics, including the ones that are tagged.

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"tags": ["tag1", "tag2"]}), ...]
        """
        return self.aggregator.flush()

    def get_events(self):
        """
        Return a list of the events saved by the check, if any

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        events = self.events
        self.events = []
        return events

    def get_service_checks(self):
        """
        Return a list of the service checks saved by the check, if any
        and clears them out of the instance's service_checks list

        @return the list of service checks saved by this check
        @rtype list of service check dicts
        """
        service_checks = self.service_checks
        self.service_checks = []
        return service_checks

    def _roll_up_instance_metadata(self):
        """
        Concatenate and flush instance metadata.
        """
        self.svc_metadata.append(
            dict((k, v) for (k, v) in self._instance_metadata))
        self._instance_metadata = []

    def get_service_metadata(self):
        """
        Return a list of the metadata dictionaries saved by the check -if any-
        and clears them out of the instance's service_checks list

        @return the list of metadata saved by this check
        @rtype list of metadata dicts
        """
        if self._instance_metadata:
            self._roll_up_instance_metadata()
        service_metadata = self.svc_metadata
        self.svc_metadata = []
        return service_metadata

    def has_warnings(self):
        """
        Check whether the instance run created any warnings
        """
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """
        Add a warning message that will be printed in the info page

        :param warning_message: String. Warning message to be displayed
        """
        warning_message = str(warning_message)
        self.log.warning(warning_message)
        self.warnings.append(warning_message)

    def get_library_info(self):
        """Return the (cached) library version info for this check."""
        if self.library_versions is not None:
            return self.library_versions
        try:
            self.library_versions = self.get_library_versions()
        except NotImplementedError:
            pass
        # FIX: the original fell off the end here, returning None even when
        # get_library_versions() had just populated the cache.
        return self.library_versions

    def get_library_versions(self):
        """ Should return a string that shows which version
        of the needed libraries are used """
        raise NotImplementedError

    def get_warnings(self):
        """
        Return the list of warnings messages to be displayed in the info page
        """
        warnings = self.warnings
        self.warnings = []
        return warnings

    @staticmethod
    def _get_statistic_name_from_method(method_name):
        # Go from `get_memory_info` -> `memory_info`.
        return method_name[4:] if method_name.startswith(
            'get_') else method_name

    @staticmethod
    def _collect_internal_stats(methods=None):
        """Snapshot psutil statistics for the agent's own process."""
        current_process = psutil.Process(os.getpid())

        methods = methods or DEFAULT_PSUTIL_METHODS
        # Only call methods this psutil version actually provides.
        filtered_methods = [m for m in methods if hasattr(current_process, m)]

        stats = {}

        for method in filtered_methods:
            stat_name = AgentCheck._get_statistic_name_from_method(method)
            try:
                raw_stats = getattr(current_process, method)()
                try:
                    # psutil returns namedtuples for most stats.
                    stats[stat_name] = raw_stats._asdict()
                except AttributeError:
                    if isinstance(raw_stats, numbers.Number):
                        stats[stat_name] = raw_stats
                    else:
                        # `warn` is a deprecated alias of `warning`.
                        log.warning(
                            "Could not serialize output of {0} to dict".format(
                                method))
            except psutil.AccessDenied:
                log.warning("Cannot call psutil method {} : Access Denied".format(
                    method))

        return stats

    def _set_internal_profiling_stats(self, before, after):
        self._internal_profiling_stats = {'before': before, 'after': after}

    def _get_internal_profiling_stats(self):
        """
        If in developer mode, return a dictionary of statistics about the
        check run
        """
        stats = self._internal_profiling_stats
        self._internal_profiling_stats = None
        return stats

    def run(self):
        """ Run all instances. """
        # Store run statistics if needed
        before, after = None, None
        if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
            try:
                before = AgentCheck._collect_internal_stats()
            except Exception:
                # It's fine if we can't collect stats for the run, just log and proceed
                self.log.debug(
                    "Failed to collect Agent Stats before check {0}".format(
                        self.name))

        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                # Per-instance interval overrides the check-level default.
                min_collection_interval = instance.get(
                    'min_collection_interval', self.min_collection_interval)
                now = time.time()
                if now - self.last_collection_time[i] < min_collection_interval:
                    self.log.debug(
                        "Not running instance #{0} of check {1} as it ran less than {2}s ago"
                        .format(i, self.name, min_collection_interval))
                    continue

                self.last_collection_time[i] = now

                check_start_time = None
                if self.in_developer_mode:
                    check_start_time = timeit.default_timer()
                # Deep-copy so a check can't mutate the stored configuration.
                self.check(copy.deepcopy(instance))

                instance_check_stats = None
                if check_start_time is not None:
                    instance_check_stats = {
                        'run_time': timeit.default_timer() - check_start_time
                    }

                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_WARNING,
                        warnings=self.get_warnings(),
                        instance_check_stats=instance_check_stats)
                else:
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_OK,
                        instance_check_stats=instance_check_stats)
            except Exception as e:
                self.log.exception("Check '%s' instance #%s failed" %
                                   (self.name, i))
                instance_status = check_status.InstanceStatus(
                    i,
                    check_status.STATUS_ERROR,
                    error=str(e),
                    tb=traceback.format_exc())
            finally:
                self._roll_up_instance_metadata()

            # Skipped instances (`continue` above) report no status.
            instance_statuses.append(instance_status)

        if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
            try:
                after = AgentCheck._collect_internal_stats()
                self._set_internal_profiling_stats(before, after)
                log.info("\n \t %s %s" %
                         (self.name,
                          pretty_statistics(self._internal_profiling_stats)))
            except Exception:
                # It's fine if we can't collect stats for the run, just log and proceed
                self.log.debug(
                    "Failed to collect Agent Stats after check {0}".format(
                        self.name))

        return instance_statuses

    def check(self, instance):
        """
        Overriden by the check class. This will be called to run the check.

        :param instance: A dict with the instance information. This will vary
        depending on your config structure.
        """
        raise NotImplementedError()

    def stop(self):
        """
        To be executed when the agent is being stopped to clean ressources
        """
        pass

    @classmethod
    def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None,
                  check_name=None):
        """
        A method used for testing your check without running the agent.
        """
        if path_to_yaml:
            check_name = os.path.basename(path_to_yaml).split('.')[0]
            try:
                f = open(path_to_yaml)
            except IOError:
                raise Exception('Unable to open yaml config: %s' %
                                path_to_yaml)
            yaml_text = f.read()
            f.close()

        config = yaml.load(yaml_text, Loader=yLoader)
        try:
            check = cls(check_name, config.get('init_config') or {},
                        agentConfig or {}, config.get('instances'))
        except TypeError:
            # Compatibility for the check not supporting instances
            check = cls(check_name, config.get('init_config') or {},
                        agentConfig or {})

        return check, config.get('instances', [])

    def normalize(self, metric, prefix=None, fix_case=False):
        """
        Turn a metric into a well-formed metric name prefix.b.c

        :param metric The metric name to normalize
        :param prefix A prefix to to add to the normalized name, default None
        :param fix_case A boolean, indicating whether to make sure that
                        the metric name returned is in underscore_case
        """
        if isinstance(metric, unicode):
            metric_name = unicodedata.normalize('NFKD', metric).encode(
                'ascii', 'ignore')
        else:
            metric_name = metric

        if fix_case:
            name = self.convert_to_underscore_separated(metric_name)
            if prefix is not None:
                prefix = self.convert_to_underscore_separated(prefix)
        else:
            name = re.sub(r"[,\+\*\-/()\[\]{}\s]", "_", metric_name)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is not None:
            return prefix + "." + name
        else:
            return name

    FIRST_CAP_RE = re.compile('(.)([A-Z][a-z]+)')
    ALL_CAP_RE = re.compile('([a-z0-9])([A-Z])')
    METRIC_REPLACEMENT = re.compile(r'([^a-zA-Z0-9_.]+)|(^[^a-zA-Z]+)')
    DOT_UNDERSCORE_CLEANUP = re.compile(r'_*\._*')

    def convert_to_underscore_separated(self, name):
        """
        Convert from CamelCase to camel_case
        And substitute illegal metric characters
        """
        metric_name = self.FIRST_CAP_RE.sub(r'\1_\2', name)
        metric_name = self.ALL_CAP_RE.sub(r'\1_\2', metric_name).lower()
        metric_name = self.METRIC_REPLACEMENT.sub('_', metric_name)
        return self.DOT_UNDERSCORE_CLEANUP.sub('.', metric_name).strip('_')

    @staticmethod
    def read_config(instance, key, message=None, cast=None):
        """Fetch a required key from an instance config, optionally casting.

        Raises Exception with `message` (or a default) when the key is absent.
        """
        val = instance.get(key)
        if val is None:
            message = message or 'Must provide `%s` value in instance config' % key
            raise Exception(message)

        if cast is None:
            return val
        else:
            return cast(val)
class AgentCheck(object):
    # Early revision of the check base class: metrics aggregation and event
    # collection only (no service checks, warnings or collection intervals).

    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        # Imported lazily to avoid a circular import at module load time.
        from aggregator import MetricsAggregator

        self.name = name
        self.init_config = init_config
        self.agentConfig = agentConfig
        self.hostname = gethostname(agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))
        self.aggregator = MetricsAggregator(self.hostname,
                                            formatter=agent_formatter)
        self.events = []
        self.instances = instances or []

    def instance_count(self):
        """ Return the number of instances that are configured for this check. """
        return len(self.instances)

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None,
              timestamp=None):
        """
        Record the value of a gauge, with optional tags, hostname and device
        name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, tags, hostname, device_name,
                              timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None,
                  device_name=None):
        """
        Increment a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None,
                  device_name=None):
        """
        Increment a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Submit a point for a metric that will be calculated as a rate on flush.
        Values will persist across each call to `check` if there is not enough
        point to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None,
                  device_name=None):
        """
        Sample a histogram value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Sample a set value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """
        Save an event.

        :param event: The event payload as a dictionary. Has the following
        structure:

            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "api_key": string, the api key of the account to associate the event with,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
                    Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "tags": (optional) list, a list of tags to associate with this event
            }
        """
        self.events.append(event)

    def has_events(self):
        """
        Check whether the check has saved any events

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self):
        """
        Get all metrics, including the ones that are tagged.

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"tags": ["tag1", "tag2"]}), ...]
        """
        return self.aggregator.flush()

    def get_events(self):
        """
        Return a list of the events saved by the check, if any

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        # Flush: hand back the buffered events and reset the buffer.
        events = self.events
        self.events = []
        return events

    def run(self):
        """ Run all instances. """
        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                self.check(instance)
                instance_status = check_status.InstanceStatus(
                    i, check_status.STATUS_OK)
            except Exception, e:
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                # Send the traceback (located at sys.exc_info()[2]) into the
                # InstanceStatus otherwise a traceback won't be able to be printed
                instance_status = check_status.InstanceStatus(
                    i, check_status.STATUS_ERROR, e, sys.exc_info()[2])
            instance_statuses.append(instance_status)
        return instance_statuses
class AgentCheck(object):
    # Revision of the check base class with service checks, warnings,
    # per-instance collection intervals, developer-mode profiling and
    # decryption of secrets embedded in instance configs.

    # Service check status codes.
    OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3)

    SOURCE_TYPE_NAME = None

    # 0 means "run on every collection cycle".
    DEFAULT_MIN_COLLECTION_INTERVAL = 0

    # Class-level registry of every check name that has been instantiated.
    _enabled_checks = []

    @classmethod
    def is_check_enabled(cls, name):
        """ Return True if a check with this name has been instantiated. """
        return name in cls._enabled_checks

    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        # Imported lazily to avoid a circular import at module load time.
        from aggregator import MetricsAggregator

        # Record this check in the class-level registry, de-duplicated.
        self._enabled_checks.append(name)
        self._enabled_checks = list(set(self._enabled_checks))

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        # Truthy only when developer mode is on AND psutil imported.
        self.in_developer_mode = agentConfig.get('developer_mode') and psutil
        self._internal_profiling_stats = None
        self.hostname = agentConfig.get('checksd_hostname') or get_hostname(
            agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))
        self.aggregator = MetricsAggregator(
            self.hostname,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold',
                                                   None),
            histogram_aggregates=agentConfig.get('histogram_aggregates'),
            histogram_percentiles=agentConfig.get('histogram_percentiles'))
        self.events = []
        self.service_checks = []
        if instances:
            # Instance configs may carry encrypted secrets delimited by
            # `>>>`/`<<<`; round-trip through JSON so every occurrence can be
            # substituted with its decrypted value.
            # NOTE(review): `decrypted` and `convert_to_str` are defined
            # elsewhere in this module — verify against imports.
            jsoned_instances = json.dumps(instances)
            encrypted_passwd_list = re.findall('>>>.*?<<<', jsoned_instances)
            if encrypted_passwd_list:
                for encrypted_passwd in encrypted_passwd_list:
                    decrypted_passwd = decrypted(encrypted_passwd)
                    jsoned_instances = jsoned_instances.replace(
                        encrypted_passwd, decrypted_passwd)
                self.instances = convert_to_str(
                    json.loads(jsoned_instances, encoding='utf-8'))
            else:
                self.instances = instances
        else:
            self.instances = []
        self.warnings = []
        self.library_versions = None
        # Per-instance timestamp of the last successful run, keyed by index.
        self.last_collection_time = defaultdict(int)
        self._instance_metadata = []
        self.svc_metadata = []
        # (metric, tags, host, device) context -> (value, ts) for historate.
        self.historate_dict = {}

    def instance_count(self):
        """ Return the number of instances configured for this check. """
        return len(self.instances)

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None,
              timestamp=None):
        """ Record the value of a gauge. """
        self.aggregator.gauge(metric, value, tags, hostname, device_name,
                              timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None,
                  device_name=None):
        """ Increment a counter. """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None,
                  device_name=None):
        """ Decrement a counter. """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def count(self, metric, value=0, tags=None, hostname=None,
              device_name=None):
        """ Submit a raw count. """
        self.aggregator.submit_count(metric, value, tags, hostname,
                                     device_name)

    def monotonic_count(self, metric, value=0, tags=None, hostname=None,
                        device_name=None):
        """ Submit a count derived from a monotonically increasing counter. """
        self.aggregator.count_from_counter(metric, value, tags, hostname,
                                           device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Submit a point for a metric calculated as a rate on flush. """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None,
                  device_name=None):
        """ Sample a histogram value. """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    @classmethod
    def generate_historate_func(cls, excluding_tags):
        """ Build a gauge-like submitter that routes through `historate`,
        dropping the given tag prefixes. """
        def fct(self, metric, value, tags=None, hostname=None,
                device_name=None):
            cls.historate(self, metric, value, excluding_tags,
                          tags=tags, hostname=hostname,
                          device_name=device_name)

        return fct

    @classmethod
    def generate_histogram_func(cls, excluding_tags):
        """ Build a histogram submitter that strips tags whose key is in
        `excluding_tags` before sampling. """
        def fct(self, metric, value, tags=None, hostname=None,
                device_name=None):
            # Work on a copy so the caller's tag list is not mutated.
            tags = list(tags)
            for tag in list(tags):
                for exc_tag in excluding_tags:
                    if tag.startswith(exc_tag + ":"):
                        tags.remove(tag)
            cls.histogram(self, metric, value, tags=tags, hostname=hostname,
                          device_name=device_name)

        return fct

    def historate(self, metric, value, excluding_tags, tags=None,
                  hostname=None, device_name=None):
        """ Submit a histogram of the rate of change of `metric`, computed
        against the previous value seen for the same context. Tags whose key
        is in `excluding_tags` are removed before the histogram is sampled
        but still participate in the context key. """
        # Work on a copy so the caller's tag list is not mutated.
        tags = list(tags)
        context = [metric]
        if tags is not None:
            context.append("-".join(sorted(tags)))
        if hostname is not None:
            context.append("host:" + hostname)
        if device_name is not None:
            context.append("device:" + device_name)

        now = time.time()
        context = tuple(context)

        if context in self.historate_dict:
            if tags is not None:
                for tag in list(tags):
                    for exc_tag in excluding_tags:
                        if tag.startswith("{0}:".format(exc_tag)):
                            tags.remove(tag)

            prev_value, prev_ts = self.historate_dict[context]
            rate = float(value - prev_value) / float(now - prev_ts)
            self.aggregator.histogram(metric, rate, tags, hostname,
                                      device_name)

        self.historate_dict[context] = (value, now)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Sample a set value. """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """ Save an event, defaulting its api_key from the agent config. """
        if event.get('api_key') is None:
            event['api_key'] = self.agentConfig['api_key']
        self.events.append(event)

    def service_check(self, check_name, status, tags=None, timestamp=None,
                      hostname=None, check_run_id=None, message=None):
        """ Save a service check result (status is one of OK/WARNING/
        CRITICAL/UNKNOWN). Hostname defaults to the agent's hostname. """
        if hostname is None:
            hostname = self.hostname
        if message is not None:
            # ascii converts to unicode but not vice versa.
            message = unicode(message)
        self.service_checks.append(
            create_service_check(check_name, status, tags, timestamp, hostname,
                                 check_run_id, message))

    def service_metadata(self, meta_name, value):
        """ Save a (name, value) metadata pair for the current instance. """
        self._instance_metadata.append((meta_name, unicode(value)))

    def has_events(self):
        """ Return True if the check has saved any events. """
        return len(self.events) > 0

    def get_metrics(self):
        """ Flush and return all aggregated metric samples. """
        return self.aggregator.flush()

    def get_events(self):
        """ Return the saved events and clear the buffer. """
        events = self.events
        self.events = []
        return events

    def get_service_checks(self):
        """ Return the saved service checks and clear the buffer. """
        service_checks = self.service_checks
        self.service_checks = []
        return service_checks

    def _roll_up_instance_metadata(self):
        """ Concatenate and flush instance metadata into svc_metadata. """
        self.svc_metadata.append(
            dict((k, v) for (k, v) in self._instance_metadata))
        self._instance_metadata = []

    def get_service_metadata(self):
        """ Return the accumulated metadata dicts and clear the buffer. """
        if self._instance_metadata:
            self._roll_up_instance_metadata()
        service_metadata = self.svc_metadata
        self.svc_metadata = []
        return service_metadata

    def has_warnings(self):
        """ Return True if the instance run created any warnings. """
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """ Log a warning and record it for the info page. """
        warning_message = str(warning_message)
        self.log.warning(warning_message)
        self.warnings.append(warning_message)

    def get_library_info(self):
        """ Return cached library version info, computing it on first call.
        NOTE(review): on the first successful call this returns None — the
        freshly computed value is cached but not returned; confirm callers
        tolerate that. """
        if self.library_versions is not None:
            return self.library_versions
        try:
            self.library_versions = self.get_library_versions()
        except NotImplementedError:
            pass

    def get_library_versions(self):
        """ Should return which versions of the needed libraries are used. """
        raise NotImplementedError

    def get_warnings(self):
        """ Return the accumulated warnings and clear the buffer. """
        warnings = self.warnings
        self.warnings = []
        return warnings

    @staticmethod
    def _get_statistic_name_from_method(method_name):
        """ Map e.g. `get_memory_info` -> `memory_info`. """
        return method_name[4:] if method_name.startswith(
            'get_') else method_name

    @staticmethod
    def _collect_internal_stats(methods=None):
        """ Collect psutil statistics about the agent process itself. """
        current_process = psutil.Process(os.getpid())

        methods = methods or DEFAULT_PSUTIL_METHODS
        # Only call methods this psutil version actually provides.
        filtered_methods = [m for m in methods if hasattr(current_process, m)]

        stats = {}

        for method in filtered_methods:
            stat_name = AgentCheck._get_statistic_name_from_method(method)
            try:
                raw_stats = getattr(current_process, method)()
                try:
                    # psutil usually returns namedtuples.
                    stats[stat_name] = raw_stats._asdict()
                except AttributeError:
                    if isinstance(raw_stats, numbers.Number):
                        stats[stat_name] = raw_stats
                    else:
                        log.warn(
                            "Could not serialize output of {0} to dict".format(
                                method))
            except psutil.AccessDenied:
                log.warn("Cannot call psutil method {} : Access Denied".format(
                    method))

        return stats

    def _set_internal_profiling_stats(self, before, after):
        """ Store before/after run statistics (developer mode). """
        self._internal_profiling_stats = {'before': before, 'after': after}

    def _get_internal_profiling_stats(self):
        """ Return and clear the developer-mode run statistics. """
        stats = self._internal_profiling_stats
        self._internal_profiling_stats = None
        return stats

    def run(self):
        """ Run all instances, honouring min_collection_interval. """
        before, after = None, None
        if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
            try:
                before = AgentCheck._collect_internal_stats()
            except Exception:
                # It's fine if we can't collect stats, just log and proceed.
                self.log.debug(
                    "Failed to collect Agent Stats before check {0}".format(
                        self.name))

        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                # Per-instance setting wins over init_config, which wins over
                # the class default.
                min_collection_interval = instance.get(
                    'min_collection_interval',
                    self.init_config.get('min_collection_interval',
                                         self.DEFAULT_MIN_COLLECTION_INTERVAL))
                now = time.time()
                if now - self.last_collection_time[i] < min_collection_interval:
                    self.log.debug(
                        "Not running instance #{0} of check {1} as it ran less than {2}s ago"
                        .format(i, self.name, min_collection_interval))
                    continue

                self.last_collection_time[i] = now

                check_start_time = None
                if self.in_developer_mode:
                    check_start_time = timeit.default_timer()
                # Deep-copy so the check cannot mutate the stored config.
                self.check(copy.deepcopy(instance))

                instance_check_stats = None
                if check_start_time is not None:
                    instance_check_stats = {
                        'run_time': timeit.default_timer() - check_start_time
                    }

                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_WARNING,
                        warnings=self.get_warnings(),
                        instance_check_stats=instance_check_stats)
                else:
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_OK,
                        instance_check_stats=instance_check_stats)
            except Exception, e:
                self.log.exception("Check '%s' instance #%s failed" %
                                   (self.name, i))
                # NOTE(review): tb=str(e) stores the exception message, not a
                # traceback — a later revision in this file passes
                # traceback.format_exc() instead; confirm and align.
                instance_status = check_status.InstanceStatus(
                    i, check_status.STATUS_ERROR, error=str(e), tb=str(e))
            finally:
class AgentCheck(object):
    """
    Base class for agent checks: aggregates metrics, collects events,
    service checks, warnings and instance metadata, and runs every
    configured instance with per-instance collection intervals and
    optional developer-mode profiling.
    """

    # Service check status codes.
    OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3)

    SOURCE_TYPE_NAME = None

    # 0 means "run on every collection cycle".
    DEFAULT_MIN_COLLECTION_INTERVAL = 0

    # Class-level registry of every check name that has been instantiated.
    _enabled_checks = []

    @classmethod
    def is_check_enabled(cls, name):
        """ Return True if a check with this name has been instantiated. """
        return name in cls._enabled_checks

    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        # Imported lazily to avoid a circular import at module load time.
        from aggregator import MetricsAggregator

        # Record this check in the class-level registry, de-duplicated.
        self._enabled_checks.append(name)
        self._enabled_checks = list(set(self._enabled_checks))

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        # Truthy only when developer mode is on AND psutil imported.
        self.in_developer_mode = agentConfig.get("developer_mode") and psutil
        self._internal_profiling_stats = None
        self.hostname = agentConfig.get("checksd_hostname") or get_hostname(agentConfig)
        self.log = logging.getLogger("%s.%s" % (__name__, name))

        self.aggregator = MetricsAggregator(
            self.hostname,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get("recent_point_threshold", None),
            histogram_aggregates=agentConfig.get("histogram_aggregates"),
            histogram_percentiles=agentConfig.get("histogram_percentiles"),
        )

        # Point psutil at a custom procfs mount (e.g. inside containers).
        if Platform.is_linux() and psutil is not None:
            procfs_path = self.agentConfig.get("procfs_path", "/proc").rstrip("/")
            psutil.PROCFS_PATH = procfs_path

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        # Per-instance timestamp of the last successful run, keyed by index.
        self.last_collection_time = defaultdict(int)
        self._instance_metadata = []
        self.svc_metadata = []
        # (metric, tags, host, device) context -> (value, ts) for historate.
        self.historate_dict = {}

    def instance_count(self):
        """ Return the number of instances that are configured for this check. """
        return len(self.instances)

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None, timestamp=None):
        """
        Record the value of a gauge, with optional tags, hostname and device
        name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, tags, hostname, device_name, timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None, device_name=None):
        """
        Increment a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None, device_name=None):
        """
        Increment a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def count(self, metric, value=0, tags=None, hostname=None, device_name=None):
        """
        Submit a raw count with optional tags, hostname and device name

        :param metric: The name of the metric
        :param value: The value
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.submit_count(metric, value, tags, hostname, device_name)

    def monotonic_count(self, metric, value=0, tags=None, hostname=None, device_name=None):
        """
        Submits a raw count with optional tags, hostname and device name
        based on increasing counter values. E.g. 1, 3, 5, 7 will submit
        6 on flush. Note that reset counters are skipped.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.count_from_counter(metric, value, tags, hostname, device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Submit a point for a metric that will be calculated as a rate on flush.
        Values will persist across each call to `check` if there is not enough
        point to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Sample a histogram value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    @classmethod
    def generate_historate_func(cls, excluding_tags):
        """ Build a gauge-like submitter that routes through `historate`,
        dropping the given tag prefixes. """
        def fct(self, metric, value, tags=None, hostname=None, device_name=None):
            cls.historate(
                self, metric, value, excluding_tags, tags=tags, hostname=hostname, device_name=device_name
            )

        return fct

    @classmethod
    def generate_histogram_func(cls, excluding_tags):
        """ Build a histogram submitter that strips tags whose key is in
        `excluding_tags` before sampling. """
        def fct(self, metric, value, tags=None, hostname=None, device_name=None):
            tags = list(tags)  # Use a copy of the list to avoid removing tags from originial
            for tag in list(tags):
                for exc_tag in excluding_tags:
                    if tag.startswith(exc_tag + ":"):
                        tags.remove(tag)
            cls.histogram(self, metric, value, tags=tags, hostname=hostname, device_name=device_name)

        return fct

    def historate(self, metric, value, excluding_tags, tags=None, hostname=None, device_name=None):
        """
        Function to create a histogram metric for "rate" like metrics.
        Warning this doesn't use the harmonic mean, beware of what it means when using it.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param excluding_tags: A list of tags that will be removed when computing the histogram
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        tags = list(tags)  # Use a copy of the list to avoid removing tags from originial
        # The context key identifies the previous sample this rate is computed
        # against; it is built from the ORIGINAL tags, before exclusion.
        context = [metric]
        if tags is not None:
            context.append("-".join(sorted(tags)))
        if hostname is not None:
            context.append("host:" + hostname)
        if device_name is not None:
            context.append("device:" + device_name)

        now = time.time()
        context = tuple(context)

        if context in self.historate_dict:
            if tags is not None:
                # Strip excluded tags only for the submitted histogram sample.
                for tag in list(tags):
                    for exc_tag in excluding_tags:
                        if tag.startswith("{0}:".format(exc_tag)):
                            tags.remove(tag)

            prev_value, prev_ts = self.historate_dict[context]
            rate = float(value - prev_value) / float(now - prev_ts)
            self.aggregator.histogram(metric, rate, tags, hostname, device_name)

        self.historate_dict[context] = (value, now)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Sample a set value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """
        Save an event.

        :param event: The event payload as a dictionary. Has the following
        structure:

            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
                    Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "tags": (optional) list, a list of tags to associate with this event
            }
        """
        self.events.append(event)

    def service_check(
        self, check_name, status, tags=None, timestamp=None, hostname=None, check_run_id=None, message=None
    ):
        """
        Save a service check.

        :param check_name: string, name of the service check
        :param status: int, describing the status.
                       0 for success, 1 for warning, 2 for failure
        :param tags: (optional) list of strings, a list of tags for this run
        :param timestamp: (optional) float, unix timestamp for when the run occurred
        :param hostname: (optional) str, host that generated the service
                          check. Defaults to the host_name of the agent
        :param check_run_id: (optional) int, id used for logging and tracing
                             purposes. Doesn't need to be unique. If not
                             specified, one will be generated.
        """
        if hostname is None:
            hostname = self.hostname
        if message is not None:
            message = unicode(message)  # ascii converts to unicode but not viceversa
        self.service_checks.append(
            create_service_check(check_name, status, tags, timestamp, hostname, check_run_id, message)
        )

    def service_metadata(self, meta_name, value):
        """
        Save metadata.

        :param meta_name: metadata key name
        :type meta_name: string

        :param value: metadata value
        :type value: string
        """
        self._instance_metadata.append((meta_name, unicode(value)))

    def has_events(self):
        """
        Check whether the check has saved any events

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self):
        """
        Get all metrics, including the ones that are tagged.

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"tags": ["tag1", "tag2"]}), ...]
        """
        return self.aggregator.flush()

    def get_events(self):
        """
        Return a list of the events saved by the check, if any

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        events = self.events
        self.events = []
        return events

    def get_service_checks(self):
        """
        Return a list of the service checks saved by the check, if any
        and clears them out of the instance's service_checks list

        @return the list of service checks saved by this check
        @rtype list of service check dicts
        """
        service_checks = self.service_checks
        self.service_checks = []
        return service_checks

    def _roll_up_instance_metadata(self):
        """
        Concatenate and flush instance metadata.
        """
        self.svc_metadata.append(dict((k, v) for (k, v) in self._instance_metadata))
        self._instance_metadata = []

    def get_service_metadata(self):
        """
        Return a list of the metadata dictionaries saved by the check -if any-
        and clears them out of the instance's service_checks list

        @return the list of metadata saved by this check
        @rtype list of metadata dicts
        """
        if self._instance_metadata:
            self._roll_up_instance_metadata()
        service_metadata = self.svc_metadata
        self.svc_metadata = []
        return service_metadata

    def has_warnings(self):
        """
        Check whether the instance run created any warnings
        """
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """
        Add a warning message that will be printed in the info page

        :param warning_message: String. Warning message to be displayed
        """
        warning_message = str(warning_message)
        self.log.warning(warning_message)
        self.warnings.append(warning_message)

    def get_library_info(self):
        """ Return cached library version info, computing it on first call.
        NOTE(review): on the first successful call this returns None — the
        freshly computed value is cached but not returned; confirm callers
        tolerate that. """
        if self.library_versions is not None:
            return self.library_versions
        try:
            self.library_versions = self.get_library_versions()
        except NotImplementedError:
            pass

    def get_library_versions(self):
        """ Should return a string that shows which version
        of the needed libraries are used """
        raise NotImplementedError

    def get_warnings(self):
        """
        Return the list of warnings messages to be displayed in the info page
        """
        warnings = self.warnings
        self.warnings = []
        return warnings

    @staticmethod
    def _get_statistic_name_from_method(method_name):
        """ Map e.g. `get_memory_info` -> `memory_info`. """
        return method_name[4:] if method_name.startswith("get_") else method_name

    @staticmethod
    def _collect_internal_stats(methods=None):
        """ Collect psutil statistics about the agent process itself. """
        current_process = psutil.Process(os.getpid())

        methods = methods or DEFAULT_PSUTIL_METHODS
        # Only call methods this psutil version actually provides.
        filtered_methods = [m for m in methods if hasattr(current_process, m)]

        stats = {}

        for method in filtered_methods:
            # Go from `get_memory_info` -> `memory_info`
            stat_name = AgentCheck._get_statistic_name_from_method(method)
            try:
                raw_stats = getattr(current_process, method)()
                try:
                    # psutil usually returns namedtuples.
                    stats[stat_name] = raw_stats._asdict()
                except AttributeError:
                    if isinstance(raw_stats, numbers.Number):
                        stats[stat_name] = raw_stats
                    else:
                        log.warn("Could not serialize output of {0} to dict".format(method))

            except psutil.AccessDenied:
                log.warn("Cannot call psutil method {} : Access Denied".format(method))

        return stats

    def _set_internal_profiling_stats(self, before, after):
        """ Store before/after run statistics (developer mode). """
        self._internal_profiling_stats = {"before": before, "after": after}

    def _get_internal_profiling_stats(self):
        """
        If in developer mode, return a dictionary of statistics about the check run
        """
        stats = self._internal_profiling_stats
        self._internal_profiling_stats = None
        return stats

    def run(self):
        """ Run all instances. """
        # Store run statistics if needed
        before, after = None, None
        if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
            try:
                before = AgentCheck._collect_internal_stats()
            except Exception:
                # It's fine if we can't collect stats for the run, just log and proceed
                self.log.debug("Failed to collect Agent Stats before check {0}".format(self.name))

        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                # Per-instance setting wins over init_config, which wins over
                # the class default.
                min_collection_interval = instance.get(
                    "min_collection_interval",
                    self.init_config.get("min_collection_interval", self.DEFAULT_MIN_COLLECTION_INTERVAL),
                )
                now = time.time()
                if now - self.last_collection_time[i] < min_collection_interval:
                    self.log.debug(
                        "Not running instance #{0} of check {1} as it ran less than {2}s ago".format(
                            i, self.name, min_collection_interval
                        )
                    )
                    continue

                self.last_collection_time[i] = now

                check_start_time = None
                if self.in_developer_mode:
                    check_start_time = timeit.default_timer()
                # Deep-copy so the check cannot mutate the stored config.
                self.check(copy.deepcopy(instance))

                instance_check_stats = None
                if check_start_time is not None:
                    instance_check_stats = {"run_time": timeit.default_timer() - check_start_time}

                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_WARNING,
                        warnings=self.get_warnings(),
                        instance_check_stats=instance_check_stats,
                    )
                else:
                    instance_status = check_status.InstanceStatus(
                        i, check_status.STATUS_OK, instance_check_stats=instance_check_stats
                    )
            except Exception as e:
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                instance_status = check_status.InstanceStatus(
                    i, check_status.STATUS_ERROR, error=str(e), tb=traceback.format_exc()
                )
            finally:
                # Runs even on `continue`: flush any metadata the instance
                # produced before the interval check skipped it.
                self._roll_up_instance_metadata()

            instance_statuses.append(instance_status)

        if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
            try:
                after = AgentCheck._collect_internal_stats()
                self._set_internal_profiling_stats(before, after)
                log.info("\n \t %s %s" % (self.name, pretty_statistics(self._internal_profiling_stats)))
            except Exception:
                # It's fine if we can't collect stats for the run, just log and proceed
                self.log.debug("Failed to collect Agent Stats after check {0}".format(self.name))

        return instance_statuses

    def check(self, instance):
        """
        Overriden by the check class. This will be called to run the check.

        :param instance: A dict with the instance information. This will vary
        depending on your config structure.
        """
        raise NotImplementedError()

    def stop(self):
        """
        To be executed when the agent is being stopped to clean ressources
        """
        pass

    @classmethod
    def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None, check_name=None):
        """
        A method used for testing your check without running the agent.

        Either `path_to_yaml` or (`yaml_text` + `check_name`) must be given.
        Returns a (check, instances) tuple.
        """
        if path_to_yaml:
            # Derive the check name from the file name and read the yaml
            # text from disk.
            check_name = os.path.basename(path_to_yaml).split(".")[0]
            try:
                f = open(path_to_yaml)
            except IOError:
                raise Exception("Unable to open yaml config: %s" % path_to_yaml)
            yaml_text = f.read()
            f.close()

        config = yaml.load(yaml_text, Loader=yLoader)
        try:
            check = cls(check_name, config.get("init_config") or {}, agentConfig or {}, config.get("instances"))
        except TypeError:
            # Compatibility for the check not supporting instances
            check = cls(check_name, config.get("init_config") or {}, agentConfig or {})

        return check, config.get("instances", [])

    def normalize(self, metric, prefix=None, fix_case=False):
        """
        Turn a metric into a well-formed metric name
        prefix.b.c

        :param metric The metric name to normalize
        :param prefix A prefix to to add to the normalized name, default None
        :param fix_case A boolean, indicating whether to make sure that
            the metric name returned is in underscore_case
        """
        # Strip accents/diacritics down to plain ASCII for unicode input.
        if isinstance(metric, unicode):
            metric_name = unicodedata.normalize("NFKD", metric).encode("ascii", "ignore")
        else:
            metric_name = metric

        if fix_case:
            name = self.convert_to_underscore_separated(metric_name)
            if prefix is not None:
                prefix = self.convert_to_underscore_separated(prefix)
        else:
            # Replace metric-illegal punctuation and whitespace with _
            name = re.sub(r"[,\+\*\-/()\[\]{}\s]", "_", metric_name)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is not None:
            return prefix + "." + name
        else:
            return name

    # Precompiled patterns used by convert_to_underscore_separated.
    FIRST_CAP_RE = re.compile("(.)([A-Z][a-z]+)")
    ALL_CAP_RE = re.compile("([a-z0-9])([A-Z])")
    METRIC_REPLACEMENT = re.compile(r"([^a-zA-Z0-9_.]+)|(^[^a-zA-Z]+)")
    DOT_UNDERSCORE_CLEANUP = re.compile(r"_*\._*")

    def convert_to_underscore_separated(self, name):
        """
        Convert from CamelCase to camel_case
        And substitute illegal metric characters
        """
        metric_name = self.FIRST_CAP_RE.sub(r"\1_\2", name)
        metric_name = self.ALL_CAP_RE.sub(r"\1_\2", metric_name).lower()
        metric_name = self.METRIC_REPLACEMENT.sub("_", metric_name)
        return self.DOT_UNDERSCORE_CLEANUP.sub(".", metric_name).strip("_")

    @staticmethod
    def read_config(instance, key, message=None, cast=None):
        """
        Read a required value from an instance config dict.

        :param instance: the instance configuration dict
        :param key: the key to look up
        :param message: (optional) error message raised when the key is missing
        :param cast: (optional) callable applied to the value before returning
        """
        val = instance.get(key)
        if val is None:
            message = message or "Must provide `%s` value in instance config" % key
            raise Exception(message)

        if cast is None:
            return val
        else:
            return cast(val)
class AgentCheck(object):
    # Service-check statuses shared by all checks.
    OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3)

    SOURCE_TYPE_NAME = None

    # Instances are skipped if they ran less than this many seconds ago.
    DEFAULT_MIN_COLLECTION_INTERVAL = 0

    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        # Developer mode needs psutil to self-profile the agent process.
        self.in_developer_mode = agentConfig.get('developer_mode') and psutil is not None
        self._internal_profiling_stats = None
        self.hostname = agentConfig.get('checksd_hostname') or get_hostname(agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.aggregator = MetricsAggregator(
            self.hostname,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold', None),
            histogram_aggregates=agentConfig.get('histogram_aggregates'),
            histogram_percentiles=agentConfig.get('histogram_percentiles')
        )

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        # Per-instance-index timestamp of the last run.
        self.last_collection_time = defaultdict(int)
        # Metadata gathered during one instance run; rolled up into
        # svc_metadata at the end of each run.
        self._instance_metadata = []
        self.svc_metadata = []

    def instance_count(self):
        """ Return the number of instances that are configured for this check. """
        return len(self.instances)

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None, timestamp=None):
        """
        Record the value of a gauge.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) Defaults to the current hostname
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, tags, hostname, device_name, timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None, device_name=None):
        """ Increment a counter by `value` (default 1). """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None, device_name=None):
        """ Decrement a counter by `value` (default -1). """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def count(self, metric, value=0, tags=None, hostname=None, device_name=None):
        """ Submit a raw count. """
        self.aggregator.submit_count(metric, value, tags, hostname, device_name)

    def monotonic_count(self, metric, value=0, tags=None, hostname=None, device_name=None):
        """
        Submit a raw count based on increasing counter values.
        E.g. 1, 3, 5, 7 will submit 6 on flush. Reset counters are skipped.
        """
        self.aggregator.count_from_counter(metric, value, tags, hostname, device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Submit a point for a metric that will be calculated as a rate on flush.
        Values persist across `check` calls until a rate can be computed.
        """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Sample a histogram value. """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Sample a set value. """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """
        Save an event.

        :param event: dict with at least `timestamp`, `event_type`, `api_key`,
            `msg_title`, `msg_text`; optionally `alert_type`,
            `source_type_name`, `host`, `tags`.
        """
        # Fall back to the agent-wide api key when the event carries none.
        if event.get('api_key') is None:
            event['api_key'] = self.agentConfig['api_key']
        self.events.append(event)

    def service_check(self, check_name, status, tags=None, timestamp=None,
                      hostname=None, check_run_id=None, message=None):
        """
        Save a service check.

        :param check_name: string, name of the service check
        :param status: int, 0 success / 1 warning / 2 failure
        :param tags: (optional) list of strings
        :param timestamp: (optional) float, unix timestamp of the run
        :param hostname: (optional) str, defaults to the agent's hostname
        :param check_run_id: (optional) int, id for logging/tracing
        :param message: (optional) coerced to str before being stored
        """
        if hostname is None:
            hostname = self.hostname
        if message is not None:
            message = str(message)
        self.service_checks.append(
            create_service_check(check_name, status, tags, timestamp,
                                 hostname, check_run_id, message)
        )

    def service_metadata(self, meta_name, value):
        """
        Save metadata for the current instance run.

        :param meta_name: metadata key name (string)
        :param value: metadata value; coerced to str
        """
        self._instance_metadata.append((meta_name, str(value)))

    def has_events(self):
        """ Return whether the check has saved any events. """
        return len(self.events) > 0

    def get_metrics(self):
        """
        Get all metrics, including the ones that are tagged.

        @return [(metric_name, timestamp, value, {"tags": [...]}), ...]
        """
        return self.aggregator.flush()

    def get_events(self):
        """ Return and clear the events saved by the check. """
        events = self.events
        self.events = []
        return events

    def get_service_checks(self):
        """ Return and clear the service checks saved by the check. """
        service_checks = self.service_checks
        self.service_checks = []
        return service_checks

    def _roll_up_instance_metadata(self):
        """ Concatenate and flush instance metadata into svc_metadata. """
        self.svc_metadata.append(dict((k, v) for (k, v) in self._instance_metadata))
        self._instance_metadata = []

    def get_service_metadata(self):
        """ Return and clear the metadata dicts saved by the check, if any. """
        if self._instance_metadata:
            self._roll_up_instance_metadata()
        service_metadata = self.svc_metadata
        self.svc_metadata = []
        return service_metadata

    def has_warnings(self):
        """ Return whether the instance run created any warnings. """
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """
        Add a warning message that will be printed in the info page.

        :param warning_message: coerced to str before being stored.
        """
        self.warnings.append(str(warning_message))

    def get_library_info(self):
        # NOTE(review): on the first call this computes the versions but
        # still returns None (only later calls return the cached value);
        # preserved as-is since callers may rely on it.
        if self.library_versions is not None:
            return self.library_versions
        try:
            self.library_versions = self.get_library_versions()
        except NotImplementedError:
            pass

    def get_library_versions(self):
        """ Should return the versions of the libraries used by the check. """
        raise NotImplementedError

    def get_warnings(self):
        """ Return and clear the warning messages for the info page. """
        warnings = self.warnings
        self.warnings = []
        return warnings

    @staticmethod
    def _get_statistic_name_from_method(method_name):
        # Go from `get_memory_info` -> `memory_info`
        return method_name[4:] if method_name.startswith('get_') else method_name

    @staticmethod
    def _collect_internal_stats(methods=None):
        """ Collect psutil stats about the agent's own process. """
        current_process = psutil.Process(os.getpid())

        methods = methods or DEFAULT_PSUTIL_METHODS
        filtered_methods = [m for m in methods if hasattr(current_process, m)]

        stats = {}
        for method in filtered_methods:
            stat_name = AgentCheck._get_statistic_name_from_method(method)
            try:
                raw_stats = getattr(current_process, method)()
                try:
                    stats[stat_name] = raw_stats._asdict()
                except AttributeError:
                    if isinstance(raw_stats, numbers.Number):
                        stats[stat_name] = raw_stats
                    else:
                        log.warn("Could not serialize output of {0} to dict".format(method))
            except psutil.AccessDenied:
                log.warn("Cannot call psutil method {} : Access Denied".format(method))

        return stats

    def _set_internal_profiling_stats(self, before, after):
        self._internal_profiling_stats = {'before': before, 'after': after}

    def _get_internal_profiling_stats(self):
        """ In developer mode, return and clear the run statistics. """
        stats = self._internal_profiling_stats
        self._internal_profiling_stats = None
        return stats

    def run(self):
        """ Run all instances. """
        # Store run statistics if needed
        before, after = None, None
        if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
            try:
                before = AgentCheck._collect_internal_stats()
            except Exception:
                # It's fine if we can't collect stats for the run, just log and proceed
                self.log.debug("Failed to collect Agent Stats before check {0}".format(self.name))

        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                min_collection_interval = instance.get(
                    'min_collection_interval',
                    self.init_config.get(
                        'min_collection_interval',
                        self.DEFAULT_MIN_COLLECTION_INTERVAL
                    )
                )
                now = time.time()
                # Throttle instances that ran too recently.
                if now - self.last_collection_time[i] < min_collection_interval:
                    self.log.debug("Not running instance #{0} of check {1} as it ran less than {2}s ago".format(i, self.name, min_collection_interval))
                    continue

                self.last_collection_time[i] = now

                check_start_time = None
                if self.in_developer_mode:
                    check_start_time = timeit.default_timer()
                # Deep-copy so the check cannot mutate the stored config.
                self.check(copy.deepcopy(instance))

                instance_check_stats = None
                if check_start_time is not None:
                    instance_check_stats = {'run_time': timeit.default_timer() - check_start_time}

                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_WARNING,
                        warnings=self.get_warnings(),
                        instance_check_stats=instance_check_stats
                    )
                else:
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_OK,
                        instance_check_stats=instance_check_stats
                    )
            except Exception as e:  # FIX: py2-only `except Exception, e` syntax
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                instance_status = check_status.InstanceStatus(
                    i,
                    check_status.STATUS_ERROR,
                    error=str(e),
                    tb=traceback.format_exc()
                )
            finally:
                # FIX(review): the source was truncated right after a bare
                # `finally:` (a syntax error). Completed from the identical
                # class tail visible earlier in this file: roll up metadata,
                # record the status, then mirror the "before check" stats
                # collection above -- TODO confirm against upstream history.
                self._roll_up_instance_metadata()

            instance_statuses.append(instance_status)

        if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
            try:
                after = AgentCheck._collect_internal_stats()
                self._set_internal_profiling_stats(before, after)
            except Exception:
                # It's fine if we can't collect stats for the run, just log and proceed
                self.log.debug("Failed to collect Agent Stats after check {0}".format(self.name))

        return instance_statuses
class AgentCheck(object):
    def __init__(self, name, init_config, agentConfig):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        """
        from aggregator import MetricsAggregator

        self.name = name
        self.init_config = init_config
        self.agentConfig = agentConfig
        self.hostname = gethostname(agentConfig)
        self.log = logging.getLogger('checks.%s' % name)
        self.aggregator = MetricsAggregator(self.hostname, formatter=agent_formatter)
        self.events = []

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None, timestamp=None):
        """
        Record the value of a gauge.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) Defaults to the current hostname
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, tags, hostname, device_name, timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None, device_name=None):
        """ Increment a counter by `value` (default 1). """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None, device_name=None):
        """ Decrement a counter by `value` (default -1). """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Submit a point for a metric that will be calculated as a rate on flush.
        Values persist across `check` calls until a rate can be computed.
        """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Sample a histogram value. """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Sample a set value. """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """
        Save an event.

        :param event: dict with at least `timestamp`, `event_type`, `api_key`,
            `msg_title`, `msg_text`; optionally `alert_type`,
            `source_type_name`, `host`, `tags`.
        """
        self.events.append(event)

    def has_events(self):
        """ Return whether the check has saved any events. """
        return len(self.events) > 0

    def get_metrics(self):
        """
        Get all metrics, including the ones that are tagged.

        @return [(metric_name, timestamp, value, {"tags": [...]}), ...]
        """
        return self.aggregator.flush()

    def get_events(self):
        """ Return and clear the events saved by the check. """
        events = self.events
        self.events = []
        return events

    def check(self, instance):
        """
        Overridden by the check class. This will be called to run the check.

        :param instance: A dict with the instance information. This will vary
        depending on your config structure.
        """
        raise NotImplementedError()

    @classmethod
    def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None, check_name=None):
        """ A method used for testing your check without running the agent. """
        from util import yaml, yLoader

        if path_to_yaml:
            check_name = os.path.basename(path_to_yaml).split('.')[0]
            try:
                f = open(path_to_yaml)
            except IOError:
                raise Exception('Unable to open yaml config: %s' % path_to_yaml)
            try:
                # FIX: close the handle even if read() raises; previously the
                # file object was leaked on a read error.
                yaml_text = f.read()
            finally:
                f.close()

        config = yaml.load(yaml_text, Loader=yLoader)
        check = cls(check_name, config.get('init_config') or {}, agentConfig or {})
        return check, config.get('instances', [])

    def normalize(self, metric, prefix=None):
        """
        Turn a metric into a well-formed metric name: prefix.b.c

        :param metric The metric name to normalize
        :param prefix A prefix to add to the normalized name, default None
        """
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is not None:
            return prefix + "." + name
        else:
            return name
class AgentCheck(object):
    # Service-check statuses shared by all checks.
    OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3)

    SOURCE_TYPE_NAME = None

    # Instances are skipped if they ran less than this many seconds ago.
    DEFAULT_MIN_COLLECTION_INTERVAL = 0

    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        self.hostname = agentConfig.get('checksd_hostname') or get_hostname(agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.aggregator = MetricsAggregator(
            self.hostname,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold', None),
            histogram_aggregates=agentConfig.get('histogram_aggregates'),
            histogram_percentiles=agentConfig.get('histogram_percentiles')
        )

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        # Per-instance-index timestamp of the last run.
        self.last_collection_time = defaultdict(int)

    def instance_count(self):
        """ Return the number of instances that are configured for this check. """
        return len(self.instances)

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None, timestamp=None):
        """
        Record the value of a gauge.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) Defaults to the current hostname
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, tags, hostname, device_name, timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None, device_name=None):
        """ Increment a counter by `value` (default 1). """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None, device_name=None):
        """ Decrement a counter by `value` (default -1). """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def count(self, metric, value=0, tags=None, hostname=None, device_name=None):
        """ Submit a raw count. """
        self.aggregator.submit_count(metric, value, tags, hostname, device_name)

    def monotonic_count(self, metric, value=0, tags=None, hostname=None, device_name=None):
        """
        Submit a raw count based on increasing counter values.
        E.g. 1, 3, 5, 7 will submit 6 on flush. Reset counters are skipped.
        """
        self.aggregator.count_from_counter(metric, value, tags, hostname, device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Submit a point for a metric that will be calculated as a rate on flush.
        Values persist across `check` calls until a rate can be computed.
        """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Sample a histogram value. """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """ Sample a set value. """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """
        Save an event.

        :param event: dict with at least `timestamp`, `event_type`, `api_key`,
            `msg_title`, `msg_text`; optionally `alert_type`,
            `source_type_name`, `host`, `tags`.
        """
        # Fall back to the agent-wide api key when the event carries none.
        if event.get('api_key') is None:
            event['api_key'] = self.agentConfig['api_key']
        self.events.append(event)

    def service_check(self, check_name, status, tags=None, timestamp=None,
                      hostname=None, check_run_id=None, message=None):
        """
        Save a service check.

        :param check_name: string, name of the service check
        :param status: int, 0 success / 1 warning / 2 failure
        :param tags: (optional) list of strings
        :param timestamp: (optional) float, unix timestamp of the run
        :param hostname: (optional) str, defaults to the agent's hostname
        :param check_run_id: (optional) int, id for logging/tracing
        :param message: (optional) coerced to str before being stored
        """
        if hostname is None:
            hostname = self.hostname
        # Coerce to str for consistency with the newer AgentCheck version
        # elsewhere in this file.
        if message is not None:
            message = str(message)
        self.service_checks.append(
            create_service_check(check_name, status, tags, timestamp,
                                 hostname, check_run_id, message)
        )

    def has_events(self):
        """ Return whether the check has saved any events. """
        return len(self.events) > 0

    def get_metrics(self):
        """
        Get all metrics, including the ones that are tagged.

        @return [(metric_name, timestamp, value, {"tags": [...]}), ...]
        """
        return self.aggregator.flush()

    def get_events(self):
        """ Return and clear the events saved by the check. """
        events = self.events
        self.events = []
        return events

    def get_service_checks(self):
        """ Return and clear the service checks saved by the check. """
        service_checks = self.service_checks
        self.service_checks = []
        return service_checks

    def has_warnings(self):
        """ Return whether the instance run created any warnings. """
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """
        Add a warning message that will be printed in the info page.

        :param warning_message: coerced to str, consistent with the newer
            AgentCheck version elsewhere in this file.
        """
        self.warnings.append(str(warning_message))

    def get_library_info(self):
        # NOTE(review): on the first call this computes the versions but
        # still returns None; preserved as-is since callers may rely on it.
        if self.library_versions is not None:
            return self.library_versions
        try:
            self.library_versions = self.get_library_versions()
        except NotImplementedError:
            pass

    def get_library_versions(self):
        """ Should return the versions of the libraries used by the check. """
        raise NotImplementedError

    def get_warnings(self):
        """ Return and clear the warning messages for the info page. """
        warnings = self.warnings
        self.warnings = []
        return warnings

    def run(self):
        """ Run all instances. """
        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                min_collection_interval = instance.get(
                    'min_collection_interval',
                    self.init_config.get('min_collection_interval',
                                         self.DEFAULT_MIN_COLLECTION_INTERVAL)
                )
                now = time.time()
                # Throttle instances that ran too recently.
                if now - self.last_collection_time[i] < min_collection_interval:
                    self.log.debug("Not running instance #{0} of check {1} as it ran less than {2}s ago".format(i, self.name, min_collection_interval))
                    continue

                self.last_collection_time[i] = now
                # Deep-copy so the check cannot mutate the stored config.
                self.check(copy.deepcopy(instance))

                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(
                        i,
                        check_status.STATUS_WARNING,
                        warnings=self.get_warnings()
                    )
                else:
                    instance_status = check_status.InstanceStatus(i, check_status.STATUS_OK)
            except Exception as e:  # FIX: py2-only `except Exception, e` syntax
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                instance_status = check_status.InstanceStatus(
                    i,
                    check_status.STATUS_ERROR,
                    # FIX: store the message, not the exception object, matching
                    # the newer AgentCheck version elsewhere in this file.
                    error=str(e),
                    tb=traceback.format_exc()
                )
            instance_statuses.append(instance_status)
        return instance_statuses
# NOTE(review): this file defines AgentCheck more than once; this later,
# simpler definition shadows the earlier one (no service checks, no
# min_collection_interval throttling). Confirm which version is intended.
class AgentCheck(object):
    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator

        self.name = name
        self.init_config = init_config
        self.agentConfig = agentConfig
        self.hostname = get_hostname(agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))
        # All metric submissions below funnel through this aggregator and are
        # drained by get_metrics().
        self.aggregator = MetricsAggregator(
            self.hostname,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold',
                                                   None))
        self.events = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None

    def instance_count(self):
        """ Return the number of instances that are configured for this check. """
        return len(self.instances)

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None,
              timestamp=None):
        """
        Record the value of a gauge, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, tags, hostname, device_name,
                              timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None,
                  device_name=None):
        """
        Increment a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None,
                  device_name=None):
        """
        Decrement a counter with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Submit a point for a metric that will be calculated as a rate on flush.
        Values will persist across each call to `check` if there is not enough
        point to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None,
                  device_name=None):
        """
        Sample a histogram value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    def set(self, metric, value, tags=None, hostname=None, device_name=None):
        """
        Sample a set value, with optional tags, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param tags: (optional) A list of tags for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, tags, hostname, device_name)

    def event(self, event):
        """
        Save an event.

        :param event: The event payload as a dictionary. Has the following structure:

            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "api_key": string, the api key of the account to associate the event with,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
                    Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "tags": (optional) list, a list of tags to associate with this event
            }
        """
        self.events.append(event)

    def has_events(self):
        """
        Check whether the check has saved any events

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self):
        """
        Get all metrics, including the ones that are tagged.

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"tags": ["tag1", "tag2"]}), ...]
        """
        return self.aggregator.flush()

    def get_events(self):
        """
        Return a list of the events saved by the check, if any

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        # Draining read: the buffer is reset so each event is reported once.
        events = self.events
        self.events = []
        return events

    def has_warnings(self):
        """
        Check whether the instance run created any warnings
        """
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """
        Add a warning message that will be printed in the info page
        :param warning_message: String. Warning message to be displayed
        """
        self.warnings.append(warning_message)

    def get_library_info(self):
        # Lazily resolve and cache library version information.
        # NOTE(review): on the first successful call the value is cached but
        # NOT returned (implicit None); only later calls return it.
        if self.library_versions is not None:
            return self.library_versions
        try:
            self.library_versions = self.get_library_versions()
        except NotImplementedError:
            pass

    def get_library_versions(self):
        """ Should return a string that shows which version
        of the needed libraries are used """
        raise NotImplementedError

    def get_warnings(self):
        """
        Return the list of warnings messages to be displayed in the info page
        """
        warnings = self.warnings
        self.warnings = []
        return warnings

    def run(self):
        """ Run all instances. """
        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                self.check(instance)
                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(
                        i, check_status.STATUS_WARNING,
                        warnings=self.get_warnings())
                else:
                    instance_status = check_status.InstanceStatus(
                        i, check_status.STATUS_OK)
            except Exception, e:  # NOTE(review): Python 2-only except syntax
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                instance_status = check_status.InstanceStatus(
                    i, check_status.STATUS_ERROR, error=e,
                    tb=traceback.format_exc())
            instance_statuses.append(instance_status)
        return instance_statuses
class ScriptRunner(object):
    """Run one user script on a fixed interval and forward its output.

    Each line of the script's stdout containing 'metric' is submitted to the
    aggregator; each line containing 'event' becomes an event. Errors while
    reading or executing the script are themselves reported as events.
    """

    def __init__(self, agent_config, hostname, emitters, path):
        self.run_count = 0
        self.events = []
        self.continue_running = True
        self.agent_config = agent_config
        self.hostname = hostname
        self.emitters = emitters
        self.path = path
        # Predefined standards parsed from the script file itself; defaults to
        # {'name': 'default', 'interval': 15, 'version': '1.0.0'}.
        self.standard = {'name': None, 'interval': 15, 'version': '1.0.0'}
        self.interval = self.standard['interval']
        # Mirror the check-plugin code: metrics go through a MetricsAggregator.
        self.aggregator = MetricsAggregator(self.hostname,
                                            formatter=agent_formatter)
        self.tag = self._get_tags()

    def run(self, payload):
        """Main loop: execute the script, parse its output, emit the payload.

        :param payload: template payload; a deep copy is filled and emitted on
            every iteration so the template itself is never mutated.
        """
        log.info("Calling %s" % self.path)
        while self.continue_running:
            timer = Timer()
            payload_temp = deepcopy(payload)
            try:
                self._standard_parser(self.path)
                # stderr has been observed to be an empty string in test runs
                # whether the script succeeds or fails; kept for future use.
                stdout_data, stderr_data = self._get_value()
                for i in stdout_data:
                    if 'metric' in i:
                        self._parse_metric(i)
                    elif 'event' in i:
                        self._parse_events(i)
            except IOError as e:
                io_err_event = self._format_event(
                    str(e), "Reading script file failed", "error",
                    "path:%s" % self.path)
                self.events.append(io_err_event)
            except OSError as e:
                os_err_event = self._format_event(
                    str(e), "Executing script file failed", "error",
                    "path:%s" % self.path)
                self.events.append(os_err_event)
            except KeyError as e:
                key_err_event = self._format_event(
                    str(e),
                    "Output of script file missing or misspelled, or unsupported script file",
                    "error", "path:%s" % self.path)
                self.events.append(key_err_event)
            except Exception as e:
                common_err_event = self._format_event(
                    str(e), "Uncatergorized error when calling script file",
                    "error", "path:%s" % self.path)
                self.events.append(common_err_event)
            finally:
                payload_temp["metrics"].extend(self.aggregator.flush())
                payload_temp['events'][self.path] = self.events
                if not payload_temp["metrics"] and not payload_temp['events'][
                        self.path]:
                    no_out_event = self._format_event(
                        "There is no output when executing this script",
                        "Uncatergorized error when calling script file",
                        "error", "path:%s" % self.path)
                    # NOTE(review): a single event dict is stored here, where a
                    # *list* of events is stored above — confirm the consumer
                    # accepts both shapes.
                    payload_temp['events'][self.path] = no_out_event
                self.events = []
            collect_duration = timer.step()
            payload_temp.emit(log, self.agent_config, self.emitters,
                              self.continue_running)
            emit_duration = timer.step()
            # Record run statistics: info level for the first few runs and
            # then periodically, debug level otherwise.
            if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
                log.info(
                    "Script: %s. Finished run #%s. Collection time: %ss. Emit time: %ss"
                    % (self.path, self.run_count, round(
                        collect_duration, 2), round(emit_duration, 2)))
                if self.run_count == FLUSH_LOGGING_INITIAL:
                    # BUG FIX: the two format arguments were not wrapped in a
                    # tuple ("..." % self.path, FLUSH_LOGGING_PERIOD), which
                    # raised TypeError whenever this branch ran.
                    log.info(
                        "Script: %s. First flushes done, next flushes will be logged every %s flushes."
                        % (self.path, FLUSH_LOGGING_PERIOD))
            else:
                log.debug(
                    "Script: %s. Finished run #%s. Collection time: %ss. Emit time: %ss"
                    % (self.path, self.run_count, round(
                        collect_duration, 2), round(emit_duration, 2)))
            # BUG FIX: run_count was never advanced, so every run logged at
            # info level and the "first flushes done" message never fired.
            self.run_count += 1
            time.sleep(self.interval)

    def _parse_metric(self, line):
        """Parse a 'metric' output line and submit it to the aggregator."""
        metric_dict = format_line(line)
        tags = metric_dict.get('tags', '')
        if tags:
            tags = tags.split(',')
        else:
            tags = []
        tags.extend(self.tag)
        # The 'type' field names the submission method to use
        # (gauge/increment/rate/...), dispatched on this instance.
        self.__getattribute__(metric_dict['type'])(metric_dict['metric'],
                                                   int(metric_dict['value']),
                                                   tags)

    def _parse_events(self, line):
        """Parse an 'event' output line and save it as an event."""
        event_dict = format_line(line)
        tags = [event_dict['tags']]
        tags.extend(self.tag)
        self.event(
            self._format_event(event_dict['msg'], event_dict['event'],
                               event_dict['severity'], tags))

    def _format_event(self, msg, title, alert_type, tags):
        """Build an event payload dict for this script.

        :param msg: event body text
        :param title: event title
        :param alert_type: one of 'error', 'warning', 'success', 'info'
        :param tags: tags to attach (list, or a single "k:v" string)
        """
        timestamp = time.time()
        msg_title = title
        msg_body = msg
        event = {
            'timestamp': timestamp,
            'host': self.hostname,
            'event_type': self.path,
            'msg_title': msg_title,
            'msg_text': msg_body,
            "alert_type": alert_type,
            "tags": tags
        }
        return event

    def _get_tags(self):
        """Return the base tag list: script version plus agent-config tags."""
        tags = self.agent_config.get('tags', None)
        tag_temp = ["version:%s" % self.standard['version']]
        if tags:
            tag_temp.extend([tag.strip() for tag in tags.split(",")])
        return tag_temp

    def _standard_parser(self, path):
        """Parse name/interval/version ('key=value' lines) from the script file."""
        if not self.standard['name']:
            self.standard['name'] = self.path
        # BUG FIX: use a context manager so the file handle is not leaked if
        # reading raises (the old open()/close() pair skipped close on error).
        with open(path) as file_object:
            lines = file_object.readlines()
            for key in self.standard.keys():
                for line in lines:
                    if key in line:
                        index = line.index('=')
                        self.standard[key] = line[index + 1:].strip("\n")
                        break
        log.info("Script: %s Standards: %s" % (self.path, str(self.standard)))
        self.interval = int(self.standard['interval'])

    def _get_value(self):
        """Execute the script according to its file extension.

        :return: (stdout_lines, stderr_lines), each a list of unicode strings.
        :raises KeyError: if the file extension is unsupported (handled in run()).
        """

        def caller(cmd):
            # Run the command and capture both output streams.
            stdout_data, stderr_data = subprocess.Popen(
                cmd, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE).communicate()
            return stdout_data, stderr_data

        def rm_unexps(out, err):
            # Strip quote characters, split on the detected line ending
            # (\r\n, \r or \n) and coerce each line to unicode; lines that
            # cannot be coerced are replaced with ''.
            quots = ["\"", "\'"]
            for quot in quots:
                out = out.replace(quot, '')
                err = err.replace(quot, '')
            if '\r\n' in out or '\r\n' in err:
                out = out.split('\r\n')
                err = err.split('\r\n')
            elif '\r' in out or '\r' in err:
                out = out.split('\r')
                err = err.split('\r')
            elif '\n' in out or '\n' in err:
                out = out.split('\n')
                err = err.split('\n')
            else:
                out = [out]
                # BUG FIX: previously `err = [out]`, which wrapped the stdout
                # list instead of the stderr string.
                err = [err]
            r_out = []
            r_err = []
            for i in out:
                try:
                    i = u'' + i
                except Exception:
                    i = ''
                finally:
                    r_out.append(i)
            for i in err:
                try:
                    i = u'' + i
                except Exception:
                    i = ''
                finally:
                    r_err.append(i)
            return r_out, r_err

        def vbscript_caller(path):
            cmd = 'cscript ' + path
            stdout_data, stderr_data = caller(cmd)
            return rm_unexps(stdout_data, stderr_data)

        def shscript_caller(path):
            cmd = ['sudo', 'bash', path]
            stdout_data, stderr_data = caller(cmd)
            # BUG FIX: was a bare log.info() with no message, which raised
            # TypeError; log the command and its output instead.
            log.info("cmd:{}, stdout_data:{}, stderr_data: {}".format(
                cmd, stdout_data, stderr_data))
            return rm_unexps(stdout_data, stderr_data)

        def batscrit_caller(path):
            cmd = path
            stdout_data, stderr_data = caller(cmd)
            return rm_unexps(stdout_data, stderr_data)

        def pyscript_caller(path):
            cmd = [sys.executable, path]
            stdout_data, stderr_data = caller(cmd)
            return rm_unexps(stdout_data, stderr_data)

        # Dispatch on the file extension; unsupported extensions raise
        # KeyError, which run() turns into an error event.
        file_type_methods = {
            'vbs': vbscript_caller,
            'sh': shscript_caller,
            'bat': batscrit_caller,
            'py': pyscript_caller
        }
        path_sects = self.path.split('.')
        res = file_type_methods[path_sects[-1]](self.path)
        return res

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None,
              timestamp=None):
        """Record the value of a gauge with optional tags/hostname/device."""
        self.aggregator.gauge(metric, value, tags, hostname, device_name,
                              timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None,
                  device_name=None):
        """Increment a counter with optional tags/hostname/device."""
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None,
                  device_name=None):
        """Decrement a counter with optional tags/hostname/device."""
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def count(self, metric, value=0, tags=None, hostname=None,
              device_name=None):
        """Submit a raw count with optional tags/hostname/device."""
        self.aggregator.submit_count(metric, value, tags, hostname,
                                     device_name)

    def monotonic_count(self, metric, value=0, tags=None, hostname=None,
                        device_name=None):
        """Submit a count derived from an increasing counter value.

        E.g. successive values 1, 3, 5, 7 submit 6 on flush; counter resets
        are skipped.
        """
        self.aggregator.count_from_counter(metric, value, tags, hostname,
                                           device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """Submit a point for a metric computed as a rate on flush."""
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None,
                  device_name=None):
        """Sample a histogram value with optional tags/hostname/device."""
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    def event(self, event):
        """Save an event dict, defaulting its api_key from the agent config."""
        if event.get('api_key') is None:
            event['api_key'] = self.agent_config['api_key']
        self.events.append(event)
# NOTE(review): this file defines ScriptRunner more than once; this later
# definition shadows the earlier one (it runs .sh through the shell and .py
# through the agent's embedded interpreter instead of sudo bash/sys.executable).
class ScriptRunner(object):
    """Run one user script on a fixed interval and forward its output.

    Each line of the script's stdout containing 'metric' is submitted to the
    aggregator; each line containing 'event' becomes an event. Errors while
    reading or executing the script are themselves reported as events.
    """

    def __init__(self, agent_config, hostname, emitters, path):
        self.run_count = 0
        self.events = []
        self.continue_running = True
        self.agent_config = agent_config
        self.hostname = hostname
        self.emitters = emitters
        self.path = path
        # Predefined standards parsed from the script file itself.
        self.standard = {'name': None, 'interval': 15, 'version': '1.0.0'}
        self.interval = self.standard['interval']
        self.aggregator = MetricsAggregator(self.hostname,
                                            formatter=agent_formatter)
        self.tag = self._get_tags()

    def run(self, payload):
        """Main loop: execute the script, parse its output, emit the payload.

        :param payload: template payload; a deep copy is filled and emitted on
            every iteration so the template itself is never mutated.
        """
        log.info("Calling %s" % self.path)
        while self.continue_running:
            timer = Timer()
            payload_temp = deepcopy(payload)
            try:
                self._standard_parser(self.path)
                stdout_data, stderr_data = self._get_value()
                for i in stdout_data:
                    if 'metric' in i:
                        self._parse_metric(i)
                    elif 'event' in i:
                        self._parse_events(i)
            except IOError as e:
                io_err_event = self._format_event(
                    str(e), "Reading script file failed", "error",
                    "path:%s" % self.path)
                self.events.append(io_err_event)
            except OSError as e:
                os_err_event = self._format_event(
                    str(e), "Executing script file failed", "error",
                    "path:%s" % self.path)
                self.events.append(os_err_event)
            except KeyError as e:
                key_err_event = self._format_event(
                    str(e),
                    "Output of script file missing or misspelled, or unsupported script file",
                    "error", "path:%s" % self.path)
                self.events.append(key_err_event)
            except Exception as e:
                common_err_event = self._format_event(
                    str(e), "Uncatergorized error when calling script file",
                    "error", "path:%s" % self.path)
                self.events.append(common_err_event)
            finally:
                payload_temp["metrics"].extend(self.aggregator.flush())
                payload_temp['events'][self.path] = self.events
                if not payload_temp["metrics"] and not payload_temp['events'][
                        self.path]:
                    no_out_event = self._format_event(
                        "There is no output when executing this script",
                        "Uncatergorized error when calling script file",
                        "error", "path:%s" % self.path)
                    # NOTE(review): a single event dict is stored here, where a
                    # *list* of events is stored above — confirm the consumer
                    # accepts both shapes.
                    payload_temp['events'][self.path] = no_out_event
                self.events = []
            collect_duration = timer.step()
            payload_temp.emit(log, self.agent_config, self.emitters,
                              self.continue_running)
            emit_duration = timer.step()
            # Log run stats at info level for the first few runs and then
            # periodically, debug level otherwise.
            if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
                log.info(
                    "Script: %s. Finished run #%s. Collection time: %ss. Emit time: %ss"
                    % (self.path, self.run_count, round(
                        collect_duration, 2), round(emit_duration, 2)))
                if self.run_count == FLUSH_LOGGING_INITIAL:
                    # BUG FIX: the two format arguments were not wrapped in a
                    # tuple ("..." % self.path, FLUSH_LOGGING_PERIOD), which
                    # raised TypeError whenever this branch ran.
                    log.info(
                        "Script: %s. First flushes done, next flushes will be logged every %s flushes."
                        % (self.path, FLUSH_LOGGING_PERIOD))
            else:
                log.debug(
                    "Script: %s. Finished run #%s. Collection time: %ss. Emit time: %ss"
                    % (self.path, self.run_count, round(
                        collect_duration, 2), round(emit_duration, 2)))
            # BUG FIX: run_count was never advanced, so every run logged at
            # info level and the "first flushes done" message never fired.
            self.run_count += 1
            time.sleep(self.interval)

    def _parse_metric(self, line):
        """Parse a 'metric' output line and submit it to the aggregator."""
        metric_dict = format_line(line)
        tags = metric_dict.get('tags', '')
        if tags:
            tags = tags.split(',')
        else:
            tags = []
        tags.extend(self.tag)
        # The 'type' field names the submission method to use
        # (gauge/increment/rate/...), dispatched on this instance.
        self.__getattribute__(metric_dict['type'])(metric_dict['metric'],
                                                   int(metric_dict['value']),
                                                   tags)

    def _parse_events(self, line):
        """Parse an 'event' output line and save it as an event."""
        event_dict = format_line(line)
        tags = [event_dict['tags']]
        tags.extend(self.tag)
        self.event(
            self._format_event(event_dict['msg'], event_dict['event'],
                               event_dict['severity'], tags))

    def _format_event(self, msg, title, alert_type, tags):
        """Build an event payload dict for this script.

        :param msg: event body text
        :param title: event title
        :param alert_type: one of 'error', 'warning', 'success', 'info'
        :param tags: tags to attach (list, or a single "k:v" string)
        """
        timestamp = time.time()
        msg_title = title
        msg_body = msg
        event = {
            'timestamp': timestamp,
            'host': self.hostname,
            'event_type': self.path,
            'msg_title': msg_title,
            'msg_text': msg_body,
            "alert_type": alert_type,
            "tags": tags
        }
        return event

    def _get_tags(self):
        """Return the base tag list: script version plus agent-config tags."""
        tags = self.agent_config.get('tags', None)
        tag_temp = ["version:%s" % self.standard['version']]
        if tags:
            tag_temp.extend([tag.strip() for tag in tags.split(",")])
        return tag_temp

    def _standard_parser(self, path):
        """Parse name/interval/version ('key=value' lines) from the script file."""
        if not self.standard['name']:
            self.standard['name'] = self.path
        # BUG FIX: use a context manager so the file handle is not leaked if
        # reading raises (the old open()/close() pair skipped close on error).
        with open(path) as file_object:
            lines = file_object.readlines()
            for key in self.standard.keys():
                for line in lines:
                    if key in line:
                        index = line.index('=')
                        self.standard[key] = line[index + 1:].strip("\n")
                        break
        log.info("Script: %s Standards: %s" % (self.path, str(self.standard)))
        self.interval = int(self.standard['interval'])

    def _get_value(self):
        """Execute the script according to its file extension.

        :return: (stdout_lines, stderr_lines), each a list of unicode strings.
        :raises KeyError: if the file extension is unsupported (handled in run()).
        """

        def caller(cmd, shell=0):
            # Run the command and capture both output streams.
            stdout_data, stderr_data = subprocess.Popen(
                cmd, shell=shell, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE).communicate()
            return stdout_data, stderr_data

        def rm_unexps(out, err):
            # Strip quote characters, split on the detected line ending
            # (\r\n, \r or \n) and coerce each line to unicode; lines that
            # cannot be coerced are replaced with ''.
            quots = ["\"", "\'"]
            for quot in quots:
                out = out.replace(quot, '')
                err = err.replace(quot, '')
            if '\r\n' in out or '\r\n' in err:
                out = out.split('\r\n')
                err = err.split('\r\n')
            elif '\r' in out or '\r' in err:
                out = out.split('\r')
                err = err.split('\r')
            elif '\n' in out or '\n' in err:
                out = out.split('\n')
                err = err.split('\n')
            else:
                out = [out]
                # BUG FIX: previously `err = [out]`, which wrapped the stdout
                # list instead of the stderr string.
                err = [err]
            r_out = []
            r_err = []
            for i in out:
                try:
                    i = u'' + i
                except Exception:
                    i = ''
                finally:
                    r_out.append(i)
            for i in err:
                try:
                    i = u'' + i
                except Exception:
                    i = ''
                finally:
                    r_err.append(i)
            return r_out, r_err

        def vbscript_caller(path):
            cmd = 'cscript ' + path
            stdout_data, stderr_data = caller(cmd)
            log.info("cmd:{}, stdout_data:{}, stderr_data: {}".format(
                cmd, stdout_data, stderr_data))
            return rm_unexps(stdout_data, stderr_data)

        def shscript_caller(path):
            # Shell scripts are executed through the shell (shell=1).
            cmd = path
            stdout_data, stderr_data = caller(cmd, 1)
            log.info("cmd:{}, stdout_data:{}, stderr_data: {}".format(
                cmd, stdout_data, stderr_data))
            return rm_unexps(stdout_data, stderr_data)

        def batscrit_caller(path):
            cmd = path
            stdout_data, stderr_data = caller(cmd)
            log.info("cmd:{}, stdout_data:{}, stderr_data: {}".format(
                cmd, stdout_data, stderr_data))
            return rm_unexps(stdout_data, stderr_data)

        def pyscript_caller(path):
            # Run with the agent's embedded Python interpreter, located
            # relative to ANT_AGENT_DIR.
            cwd = os.environ.get('ANT_AGENT_DIR')
            if IS_WINDOWS:
                python_execute = os.path.join(cwd, "embedded\python.exe")
            else:
                python_execute = os.path.join(cwd, 'embedded/bin/python')
            cmd = [python_execute, path]
            stdout_data, stderr_data = caller(cmd)
            log.info("cmd:{}, stdout_data:{}, stderr_data: {}".format(
                cmd, stdout_data, stderr_data))
            return rm_unexps(stdout_data, stderr_data)

        # Dispatch on the file extension; unsupported extensions raise
        # KeyError, which run() turns into an error event.
        file_type_methods = {
            'vbs': vbscript_caller,
            'sh': shscript_caller,
            'bat': batscrit_caller,
            'py': pyscript_caller
        }
        path_sects = self.path.split('.')
        res = file_type_methods[path_sects[-1]](self.path)
        return res

    def gauge(self, metric, value, tags=None, hostname=None, device_name=None,
              timestamp=None):
        """Record the value of a gauge with optional tags/hostname/device."""
        self.aggregator.gauge(metric, value, tags, hostname, device_name,
                              timestamp)

    def increment(self, metric, value=1, tags=None, hostname=None,
                  device_name=None):
        """Increment a counter with optional tags/hostname/device."""
        self.aggregator.increment(metric, value, tags, hostname, device_name)

    def decrement(self, metric, value=-1, tags=None, hostname=None,
                  device_name=None):
        """Decrement a counter with optional tags/hostname/device."""
        self.aggregator.decrement(metric, value, tags, hostname, device_name)

    def count(self, metric, value=0, tags=None, hostname=None,
              device_name=None):
        """Submit a raw count with optional tags/hostname/device."""
        self.aggregator.submit_count(metric, value, tags, hostname,
                                     device_name)

    def monotonic_count(self, metric, value=0, tags=None, hostname=None,
                        device_name=None):
        """Submit a count derived from an increasing counter value."""
        self.aggregator.count_from_counter(metric, value, tags, hostname,
                                           device_name)

    def rate(self, metric, value, tags=None, hostname=None, device_name=None):
        """Submit a point for a metric computed as a rate on flush."""
        self.aggregator.rate(metric, value, tags, hostname, device_name)

    def histogram(self, metric, value, tags=None, hostname=None,
                  device_name=None):
        """Sample a histogram value with optional tags/hostname/device."""
        self.aggregator.histogram(metric, value, tags, hostname, device_name)

    def event(self, event):
        """Save an event dict, defaulting its api_key from the agent config."""
        if event.get('api_key') is None:
            event['api_key'] = self.agent_config['api_key']
        self.events.append(event)