def collect(self, api):
    """Collect oplog sizing and time-window data and submit it as the ``oplog`` payload.

    Fetches information analogous to Mongo's ``db.getReplicationInfo()`` from
    the ``local`` database of ``api``.

    Nothing is submitted when reading the oplog collection options raises
    ``pymongo.errors.OperationFailure``. In every other case the payload is
    submitted, possibly empty or only partially filled.
    """
    local_db = api["local"]
    payload = {}

    # Keep the first candidate oplog collection that reports non-empty options.
    try:
        for oplog_name in ("oplog.rs", "oplog.$main"):
            options = local_db[oplog_name].options()
            if options:
                break
    except pymongo.errors.OperationFailure as e:
        # In theory this error should only happen when connected to mongos or arbiter.
        self.log.debug("Unable to collect oplog metrics from replica set member. Error is: %s", e)
        return

    if not options:
        # Neither candidate collection reported options: submit the empty payload.
        self._submit_payload({'oplog': payload})
        return

    bytes_per_mb = 2.0**20
    try:
        payload['logSizeMB'] = round_value(options['size'] / bytes_per_mb, 2)

        oplog = local_db[oplog_name]
        coll_stats = local_db.command("collstats", oplog_name)
        payload['usedSizeMB'] = round_value(coll_stats['size'] / bytes_per_mb, 2)

        # First entry carrying a `ts` field in natural order, from each end.
        oldest = oplog.find({"ts": {"$exists": 1}}).sort("$natural", pymongo.ASCENDING).limit(1)
        newest = oplog.find({"ts": {"$exists": 1}}).sort("$natural", pymongo.DESCENDING).limit(1)

        try:
            oldest_dt = oldest[0]['ts'].as_datetime()
            newest_dt = newest[0]['ts'].as_datetime()
            payload['timeDiff'] = (newest_dt - oldest_dt).total_seconds()
        except (IndexError, KeyError):
            # The oplog collection has no entries, or the entry has no `ts`
            # value: leave `timeDiff` out of the payload.
            pass
    except KeyError:
        # `size` was missing from the collection options or from collstats.
        self.log.warning(u"Failed to record `ReplicationInfo` metrics.")

    self._submit_payload({'oplog': payload})
def collect(self, client):
    """Collect oplog sizing and time-window data and submit it as the ``oplog`` payload.

    Fetches information analogous to Mongo's ``db.getReplicationInfo()`` from
    the ``local`` database of ``client``.

    If reading the oplog collection options raises
    ``pymongo.errors.OperationFailure`` the failure is logged at debug level
    and nothing is submitted, instead of letting the exception escape the
    collector. In every other case the payload is submitted, possibly empty
    or only partially filled.
    """
    localdb = client["local"]
    oplog_data = {}

    try:
        for collection_name in ("oplog.rs", "oplog.$main"):
            ol_options = localdb[collection_name].options()
            if ol_options:
                break
    except pymongo.errors.OperationFailure as e:
        # Presumably only happens on members that do not expose an oplog
        # (e.g. mongos or an arbiter) -- TODO confirm against deployment.
        self.log.debug("Unable to collect oplog metrics from replica set member. Error is: %s", e)
        return

    if ol_options:
        try:
            oplog_data['logSizeMB'] = round_value(ol_options['size'] / 2.0**20, 2)

            oplog = localdb[collection_name]

            oplog_data['usedSizeMB'] = round_value(
                localdb.command("collstats", collection_name)['size'] / 2.0**20, 2
            )

            # First entry carrying a `ts` field in natural order, from each end.
            op_asc_cursor = oplog.find({"ts": {"$exists": 1}}).sort("$natural", pymongo.ASCENDING).limit(1)
            op_dsc_cursor = oplog.find({"ts": {"$exists": 1}}).sort("$natural", pymongo.DESCENDING).limit(1)

            try:
                first_timestamp = op_asc_cursor[0]['ts'].as_datetime()
                last_timestamp = op_dsc_cursor[0]['ts'].as_datetime()
                time_diff = last_timestamp - first_timestamp
                oplog_data['timeDiff'] = time_diff.total_seconds()
            except (IndexError, KeyError):
                # if the oplog collection doesn't have any entries
                # if an object in the collection doesn't have a ts value, we ignore it
                pass
        except KeyError:
            # encountered an error trying to access options.size for the oplog collection
            self.log.warning(u"Failed to record `ReplicationInfo` metrics.")

    self._submit_payload({'oplog': oplog_data})
def calculate_elapsed_time(datestamp, timestamp, qm_timezone, current_time=None):
    """
    Calculate elapsed time in seconds from IBM MQ queue status date and timestamps

    Expected Timestamp format: %H.%M.%S, e.g. 18.45.20
    Expected Datestamp format: %Y-%m-%d, e.g. 2021-09-15
    https://www.ibm.com/docs/en/ibm-mq/9.2?topic=reference-display-qstatus-display-queue-status#q086260___3

    :param datestamp: date string reported by the queue manager
    :param timestamp: time string reported by the queue manager
    :param qm_timezone: time zone name of the queue manager; ``None`` means UTC
    :param current_time: reference time the elapsed value is measured against;
        when ``None`` it is taken from ``get_timestamp()``
    :return: ``round_value(current_time - <POSIX time of the stamp>)``, or
        ``None`` when either ``datestamp`` or ``timestamp`` is empty
    :raises ValueError: if ``qm_timezone`` is given but cannot be resolved
    """
    # The time zone is validated first so that a bad configuration is reported
    # even when there is no date/time to convert.
    if qm_timezone is not None:
        qm_tz = tz.gettz(qm_timezone)
        if qm_tz is None or isinstance(qm_tz, str):
            msg = (
                'Time zone `{}` is not recognized or may be deprecated. '
                'Please specify a valid time zone in IANA/Olson format.'.format(qm_timezone)
            )
            raise ValueError(msg)
    else:
        qm_tz = tz.UTC

    if not (datestamp and timestamp):
        return None

    if current_time is None:
        current_time = get_timestamp()

    # 1. Construct a datetime object from the IBM MQ timestamp string format
    # 2. Set the QM time zone on the datetime object.
    # 3. Calculate the POSIX timestamp in seconds since EPOCH
    timestamp_str = sanitize_strings(datestamp) + ' ' + sanitize_strings(timestamp)
    timestamp_dt = datetime.strptime(timestamp_str, '%Y-%m-%d %H.%M.%S')
    timestamp_tz = timestamp_dt.replace(tzinfo=qm_tz)
    timestamp_posix = (timestamp_tz - EPOCH).total_seconds()

    return round_value(current_time - timestamp_posix)
def _check_db(self, instance, custom_tags=None):
    """Query Redis ``INFO`` for one instance and submit the derived metrics.

    Emits the ``redis.can_connect`` service check (OK when ``INFO`` could be
    read and processed, CRITICAL otherwise), the ``INFO`` round-trip latency,
    per-database key statistics, the configured gauge/rate keys, the command
    rate, and then delegates to the key-length, replication and (when
    ``command_stats`` is enabled on the instance) command-stats collectors.

    :param instance: instance configuration mapping
    :param custom_tags: extra tags forwarded to ``self._get_tags``
    :raises Exception: whatever the connection / ``INFO`` processing raised,
        re-raised after the CRITICAL service check has been sent
    """
    conn = self._get_conn(instance)
    tags = self._get_tags(custom_tags, instance)

    # Ping the database for info, and track the latency.
    # Process the service check: the check passes if we can connect to Redis
    start = time.time()
    try:
        info = conn.info()
        latency_ms = round_value((time.time() - start) * 1000, 2)

        if "role" in info:
            tags = sorted(tags + ["redis_role:%s" % info["role"]])
        else:
            # A missing role must not turn a successful connection into a
            # CRITICAL service check.
            self.log.debug("Redis role was not found")
            tags = sorted(tags)

        self.gauge('redis.info.latency_ms', latency_ms, tags=tags)
        self.service_check('redis.can_connect', AgentCheck.OK, tags=tags)
        self._collect_metadata(info)
    except Exception:
        # Covers ValueError as well; both used to have identical handlers.
        self.service_check('redis.can_connect', AgentCheck.CRITICAL, tags=tags)
        raise

    # Save the database statistics.
    for key in info.keys():
        if not self.db_key_pattern.match(key):
            continue

        db_tags = tags + ["redis_db:" + key]

        # allows tracking percentage of expired keys as DD does not
        # currently allow arithmetic on metric for monitoring
        expires_keys = info[key]["expires"]
        total_keys = info[key]["keys"]
        persist_keys = total_keys - expires_keys
        self.gauge("redis.persist", persist_keys, tags=db_tags)
        if total_keys:
            # Percentages are undefined for an empty keyspace; skip them
            # rather than abort the whole check on a division by zero.
            self.gauge("redis.persist.percent", 100 * persist_keys / total_keys, tags=db_tags)
            self.gauge("redis.expires.percent", 100 * expires_keys / total_keys, tags=db_tags)

        for subkey in self.subkeys:
            # Old redis module on ubuntu 10.04 (python-redis 0.6.1) does not
            # returns a dict for those key but a string: keys=3,expires=0
            # Try to parse it (see lighthouse #46)
            try:
                val = info[key].get(subkey, -1)
            except AttributeError:
                val = self._parse_dict_string(info[key], subkey, -1)
            metric = 'redis.{}'.format(subkey)
            self.gauge(metric, val, tags=db_tags)

    # Save a subset of db-wide statistics
    for info_name in info:
        if info_name in self.GAUGE_KEYS:
            self.gauge(self.GAUGE_KEYS[info_name], info[info_name], tags=tags)
        elif info_name in self.RATE_KEYS:
            self.rate(self.RATE_KEYS[info_name], info[info_name], tags=tags)

    # Save the number of commands.
    self.rate('redis.net.commands', info['total_commands_processed'], tags=tags)
    if 'instantaneous_ops_per_sec' in info:
        self.gauge('redis.net.instantaneous_ops_per_sec', info['instantaneous_ops_per_sec'], tags=tags)

    # Check some key lengths if asked
    self._check_key_lengths(conn, instance, list(tags))

    # Check replication
    self._check_replication(info, tags)
    if instance.get("command_stats", False):
        self._check_command_stats(conn, tags)
def check(self, instance):
    """
    Run one collection pass against the MongoDB server described by `instance`.

    Nothing is returned: the data gathered from `serverStatus`, `dbstats`,
    `replSetGetStatus`, `top`, the oplog, `collstats` and the custom queries
    is submitted through the metric and service-check methods of the check.

    Instance keys read here: `server` (required), `ssl*`, `additional_metrics`,
    `tags`, `timeout`, `replica_check`, `collections_indexes_stats`,
    `collections` and `custom_queries`.

    Raises:
        Exception: when `server` is missing, when `serverStatus` reports
            `ok == 0`, or when connecting / running `serverStatus` fails (the
            original error is re-raised after a CRITICAL service check).
        TypeError: when a metric selected for collection is not numeric.
    """

    def total_seconds(td):
        """
        Returns total seconds of a timedelta in a way that's safe for
        Python < 2.7
        """
        if hasattr(td, 'total_seconds'):
            return td.total_seconds()
        else:
            # Must use the argument itself, not a variable of the enclosing scope.
            return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6

    if 'server' not in instance:
        raise Exception("Missing 'server' in mongo config")

    # x.509 authentication
    ssl_params = {
        'ssl': instance.get('ssl', None),
        'ssl_keyfile': instance.get('ssl_keyfile', None),
        'ssl_certfile': instance.get('ssl_certfile', None),
        'ssl_cert_reqs': instance.get('ssl_cert_reqs', None),
        'ssl_ca_certs': instance.get('ssl_ca_certs', None),
    }

    # Drop unset options; iterate over a copy because the dict is mutated.
    for key, param in list(iteritems(ssl_params)):
        if param is None:
            del ssl_params[key]

    server = instance['server']
    username, password, db_name, nodelist, clean_server_name, auth_source = self._parse_uri(
        server, sanitize_username=bool(ssl_params)
    )

    additional_metrics = instance.get('additional_metrics', [])

    # Get the list of metrics to collect
    collect_tcmalloc_metrics = 'tcmalloc' in additional_metrics
    metrics_to_collect = self._get_metrics_to_collect(server, additional_metrics)

    # Tagging
    tags = instance.get('tags', [])
    # ...de-dupe tags to avoid a memory leak
    tags = list(set(tags))

    if not db_name:
        self.log.info('No MongoDB database found in URI. Defaulting to admin.')
        db_name = 'admin'

    service_check_tags = ["db:%s" % db_name]
    service_check_tags.extend(tags)

    # ...add the `server` tag to the metrics' tags only
    # (it's added in the backend for service checks)
    tags.append('server:%s' % clean_server_name)

    if nodelist:
        host = nodelist[0][0]
        port = nodelist[0][1]
        service_check_tags = service_check_tags + ["host:%s" % host, "port:%s" % port]

    # The configured timeout is multiplied by 1000 to feed the *MS client options.
    timeout = float(instance.get('timeout', DEFAULT_TIMEOUT)) * 1000
    try:
        cli = pymongo.mongo_client.MongoClient(
            server,
            socketTimeoutMS=timeout,
            connectTimeoutMS=timeout,
            serverSelectionTimeoutMS=timeout,
            read_preference=pymongo.ReadPreference.PRIMARY_PREFERRED,
            **ssl_params
        )
        # some commands can only go against the admin DB
        admindb = cli['admin']
        db = cli[db_name]
    except Exception:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags)
        raise

    # Authenticate
    do_auth = True
    use_x509 = ssl_params and not password

    if not username:
        self.log.debug(u"A username is required to authenticate to `%s`", server)
        do_auth = False

    if do_auth:
        if auth_source:
            msg = "authSource was specified in the the server URL: using '%s' as the authentication database"
            self.log.info(msg, auth_source)
            self._authenticate(
                cli[auth_source], username, password, use_x509, clean_server_name, service_check_tags
            )
        else:
            self._authenticate(db, username, password, use_x509, clean_server_name, service_check_tags)

    try:
        status = db.command('serverStatus', tcmalloc=collect_tcmalloc_metrics)
    except Exception:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags)
        raise
    else:
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)

    if status['ok'] == 0:
        raise Exception(status['errmsg'].__str__())

    ops = db.current_op()
    status['fsyncLocked'] = 1 if ops.get('fsyncLock') else 0

    status['stats'] = db.command('dbstats')
    dbstats = {db_name: {'stats': status['stats']}}

    # Handle replica data, if any
    # See
    # http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus  # noqa
    if is_affirmative(instance.get('replica_check', True)):
        try:
            data = {}

            replSet = admindb.command('replSetGetStatus')
            if replSet:
                primary = None
                current = None

                # need a new connection to deal with replica sets
                setname = replSet.get('set')
                cli_rs = pymongo.mongo_client.MongoClient(
                    server,
                    socketTimeoutMS=timeout,
                    connectTimeoutMS=timeout,
                    serverSelectionTimeoutMS=timeout,
                    replicaset=setname,
                    read_preference=pymongo.ReadPreference.NEAREST,
                    **ssl_params
                )

                if do_auth:
                    if auth_source:
                        self._authenticate(
                            cli_rs[auth_source], username, password, use_x509, server, service_check_tags
                        )
                    else:
                        self._authenticate(
                            cli_rs[db_name], username, password, use_x509, server, service_check_tags
                        )

                # Replication set information
                replset_name = replSet['set']
                replset_state = self.get_state_name(replSet['myState']).lower()

                tags.extend(
                    [u"replset_name:{0}".format(replset_name), u"replset_state:{0}".format(replset_state)]
                )

                # Find nodes: master and current node (ourself)
                for member in replSet.get('members'):
                    if member.get('self'):
                        current = member
                    if int(member.get('state')) == 1:
                        primary = member

                # Compute a lag time
                if current is not None and primary is not None:
                    if 'optimeDate' in primary and 'optimeDate' in current:
                        lag = primary['optimeDate'] - current['optimeDate']
                        data['replicationLag'] = total_seconds(lag)

                if current is not None:
                    data['health'] = current['health']

                data['state'] = replSet['myState']

                if current is not None:
                    total = 0.0
                    cfg = cli_rs['local']['system.replset'].find_one()
                    for member in cfg.get('members'):
                        total += member.get('votes', 1)
                        if member['_id'] == current['_id']:
                            data['votes'] = member.get('votes', 1)
                    data['voteFraction'] = data['votes'] / total

                status['replSet'] = data

                # Submit events
                self._report_replica_set_state(data['state'], clean_server_name, replset_name)

        except Exception as e:
            # A server that is not part of a replica set is expected here;
            # anything else is a genuine failure.
            if "OperationFailure" in repr(e) and (
                "not running with --replSet" in str(e) or "replSetGetStatus" in str(e)
            ):
                pass
            else:
                raise

    # If these keys exist, remove them for now as they cannot be serialized
    try:
        status['backgroundFlushing'].pop('last_finished')
    except KeyError:
        pass
    try:
        status.pop('localTime')
    except KeyError:
        pass

    dbnames = cli.database_names()
    self.gauge('mongodb.dbs', len(dbnames), tags=tags)

    for db_n in dbnames:
        db_aux = cli[db_n]
        dbstats[db_n] = {'stats': db_aux.command('dbstats')}

    # Go through the metrics and save the values
    for metric_name in metrics_to_collect:
        # each metric is of the form: x.y.z with z optional
        # and can be found at status[x][y][z]
        value = status

        if metric_name.startswith('stats'):
            # `stats*` metrics are handled in the per-database loop below
            continue
        else:
            try:
                for c in metric_name.split("."):
                    value = value[c]
            except KeyError:
                continue

        # value is now status[x][y][z]
        if not isinstance(value, (int, long, float)):
            raise TypeError(
                u"{0} value is a {1}, it should be an int, a float or a long instead.".format(
                    metric_name, type(value)
                )
            )

        # Submit the metric
        submit_method, metric_name_alias = self._resolve_metric(metric_name, metrics_to_collect)
        submit_method(self, metric_name_alias, value, tags=tags)

    for st, value in iteritems(dbstats):
        for metric_name in metrics_to_collect:
            if not metric_name.startswith('stats.'):
                continue

            try:
                val = value['stats'][metric_name.split('.')[1]]
            except KeyError:
                continue

            # val is now dbstats[st]['stats'][y]
            if not isinstance(val, (int, long, float)):
                raise TypeError(
                    u"{0} value is a {1}, it should be an int, a float or a long instead.".format(
                        metric_name, type(val)
                    )
                )

            # Submit the metric
            metrics_tags = tags + [
                u"cluster:db:{0}".format(st),  # FIXME 6.0 - keep for backward compatibility
                u"db:{0}".format(st),
            ]

            submit_method, metric_name_alias = self._resolve_metric(metric_name, metrics_to_collect)
            submit_method(self, metric_name_alias, val, tags=metrics_tags)

    if is_affirmative(instance.get('collections_indexes_stats')):
        mongo_version = cli.server_info().get('version', '0.0')
        if LooseVersion(mongo_version) >= LooseVersion("3.2"):
            self._collect_indexes_stats(instance, db, tags)
        else:
            msg = "'collections_indexes_stats' is only available starting from mongo 3.2: your mongo version is %s"
            self.log.error(msg, mongo_version)

    # Report the usage metrics for dbs/collections
    if 'top' in additional_metrics:
        try:
            dbtop = admindb.command('top')
            for ns, ns_metrics in iteritems(dbtop['totals']):
                if "." not in ns:
                    continue

                # configure tags for db name and collection name
                dbname, collname = ns.split(".", 1)
                ns_tags = tags + ["db:%s" % dbname, "collection:%s" % collname]

                # iterate over DBTOP metrics
                for m in self.TOP_METRICS:
                    # each metric is of the form: x.y.z with z optional
                    # and can be found at ns_metrics[x][y][z]
                    value = ns_metrics
                    try:
                        for c in m.split("."):
                            value = value[c]
                    except Exception:
                        continue

                    # value is now ns_metrics[x][y][z]
                    if not isinstance(value, (int, long, float)):
                        raise TypeError(
                            u"{0} value is a {1}, it should be an int, a float or a long instead.".format(
                                m, type(value)
                            )
                        )

                    # Submit the metric
                    submit_method, metric_name_alias = self._resolve_metric(m, metrics_to_collect, prefix="usage")
                    submit_method(self, metric_name_alias, value, tags=ns_tags)
                    # Keep old incorrect metric
                    if metric_name_alias.endswith('countps'):
                        GAUGE(self, metric_name_alias[:-2], value, tags=ns_tags)
        except Exception as e:
            self.log.warning('Failed to record `top` metrics %s', e)

    if 'local' in dbnames:  # it might not be if we are connecting through mongos
        # Fetch information analogous to Mongo's db.getReplicationInfo()
        localdb = cli['local']

        oplog_data = {}

        for ol_collection_name in ("oplog.rs", "oplog.$main"):
            ol_options = localdb[ol_collection_name].options()
            if ol_options:
                break

        if ol_options:
            try:
                oplog_data['logSizeMB'] = round_value(ol_options['size'] / 2.0 ** 20, 2)

                oplog = localdb[ol_collection_name]

                oplog_data['usedSizeMB'] = round_value(
                    localdb.command("collstats", ol_collection_name)['size'] / 2.0 ** 20, 2
                )

                op_asc_cursor = oplog.find({"ts": {"$exists": 1}}).sort("$natural", pymongo.ASCENDING).limit(1)
                op_dsc_cursor = oplog.find({"ts": {"$exists": 1}}).sort("$natural", pymongo.DESCENDING).limit(1)

                try:
                    first_timestamp = op_asc_cursor[0]['ts'].as_datetime()
                    last_timestamp = op_dsc_cursor[0]['ts'].as_datetime()
                    oplog_data['timeDiff'] = total_seconds(last_timestamp - first_timestamp)
                except (IndexError, KeyError):
                    # if the oplog collection doesn't have any entries
                    # if an object in the collection doesn't have a ts value, we ignore it
                    pass
            except KeyError:
                # encountered an error trying to access options.size for the oplog collection
                self.log.warning(u"Failed to record `ReplicationInfo` metrics.")

        for m, value in iteritems(oplog_data):
            submit_method, metric_name_alias = self._resolve_metric('oplog.%s' % m, metrics_to_collect)
            submit_method(self, metric_name_alias, value, tags=tags)

    else:
        self.log.debug('"local" database not in dbnames. Not collecting ReplicationInfo metrics')

    # get collection level stats
    try:
        # Ensure that you're on the right db
        db = cli[db_name]
        # grab the collections from the configuration
        coll_names = instance.get('collections', [])
        # loop through the collections
        for coll_name in coll_names:
            # grab the stats from the collection
            stats = db.command("collstats", coll_name)
            # loop through the metrics
            for m in self.collection_metrics_names:
                coll_tags = tags + ["db:%s" % db_name, "collection:%s" % coll_name]
                value = stats.get(m, None)
                if not value:
                    continue

                # if it's the index sizes, then it's a dict.
                if m == 'indexSizes':
                    submit_method, metric_name_alias = self._resolve_metric(
                        'collection.%s' % m, self.COLLECTION_METRICS
                    )
                    # loop through the indexes
                    for idx, val in iteritems(value):
                        # we tag the index
                        idx_tags = coll_tags + ["index:%s" % idx]
                        submit_method(self, metric_name_alias, val, tags=idx_tags)
                else:
                    submit_method, metric_name_alias = self._resolve_metric(
                        'collection.%s' % m, self.COLLECTION_METRICS
                    )
                    submit_method(self, metric_name_alias, value, tags=coll_tags)
    except Exception as e:
        self.log.warning(u"Failed to record `collection` metrics.")
        self.log.exception(e)

    custom_queries = instance.get("custom_queries", [])
    custom_query_tags = tags + ["db:{}".format(db_name)]
    for raw_query in custom_queries:
        try:
            self._collect_custom_metrics_for_query(db, raw_query, custom_query_tags)
        except Exception as e:
            metric_prefix = raw_query.get('metric_prefix')
            self.log.warning("Errors while collecting custom metrics with prefix %s", metric_prefix, exc_info=e)
def submit_perf_metrics(self, instance, container_tags, container_id, container_stats):
    """Submit CPU, memory and block-I/O metrics for one container.

    :param instance: instance configuration (not used by this method)
    :param container_tags: mapping of container id to the tag list to apply
    :param container_id: id of the container the stats belong to
    :param container_stats: stats mapping for the container, or ``None``

    Any exception raised while processing is reported through ``self.warning``
    and stops the processing of this container; nothing is raised to the caller.
    """
    try:
        if container_stats is None:
            self.log.debug("Empty stats for container %s", container_id)
            return

        tags = container_tags[container_id]

        # CPU metrics
        cpu_stats = container_stats.get('cpu_stats', {})
        prev_cpu_stats = container_stats.get('precpu_stats', {})

        value_system = cpu_stats.get('system_cpu_usage')
        if value_system is not None:
            self.rate('ecs.fargate.cpu.system', value_system, tags)

        value_total = cpu_stats.get('cpu_usage', {}).get('total_usage')
        if value_total is not None:
            self.rate('ecs.fargate.cpu.user', value_total, tags)

        prevalue_total = prev_cpu_stats.get('cpu_usage', {}).get('total_usage')
        prevalue_system = prev_cpu_stats.get('system_cpu_usage')

        # All four samples are required: a missing *current* value used to reach
        # float(None), and the resulting TypeError aborted every metric below.
        samples = (value_total, value_system, prevalue_total, prevalue_system)
        if all(sample is not None for sample in samples):
            cpu_delta = float(value_total) - float(prevalue_total)
            system_delta = float(value_system) - float(prevalue_system)
        else:
            cpu_delta = 0.0
            system_delta = 0.0

        active_cpus = float(cpu_stats.get('online_cpus', 0.0))

        cpu_percent = 0.0
        if system_delta > 0 and cpu_delta > 0 and active_cpus > 0:
            cpu_percent = (cpu_delta / system_delta) * active_cpus * 100.0
            cpu_percent = round_value(cpu_percent, 2)
        self.gauge('ecs.fargate.cpu.percent', cpu_percent, tags)

        # Memory metrics
        memory_stats = container_stats.get('memory_stats', {})

        for metric in MEMORY_GAUGE_METRICS:
            value = memory_stats.get('stats', {}).get(metric)
            if value is not None and value < CGROUP_NO_VALUE:
                self.gauge('ecs.fargate.mem.' + metric, value, tags)
        for metric in MEMORY_RATE_METRICS:
            value = memory_stats.get('stats', {}).get(metric)
            if value is not None:
                self.rate('ecs.fargate.mem.' + metric, value, tags)

        value = memory_stats.get('max_usage')
        if value is not None:
            self.gauge('ecs.fargate.mem.max_usage', value, tags)

        value = memory_stats.get('usage')
        if value is not None:
            self.gauge('ecs.fargate.mem.usage', value, tags)

        value = memory_stats.get('limit')
        if value is not None:
            self.gauge('ecs.fargate.mem.limit', value, tags)

        # I/O metrics
        for blkio_cat, metric_name in iteritems(IO_METRICS):
            read_counter = write_counter = 0
            blkio_entries = container_stats.get("blkio_stats", {}).get(blkio_cat)
            if blkio_entries is None:
                # Category absent or explicitly null: report zero counters.
                blkio_entries = []
            for blkio_stat in blkio_entries:
                if blkio_stat["op"] == "Read" and "value" in blkio_stat:
                    read_counter += blkio_stat["value"]
                elif blkio_stat["op"] == "Write" and "value" in blkio_stat:
                    write_counter += blkio_stat["value"]
            self.rate(metric_name + 'read', read_counter, tags)
            self.rate(metric_name + 'write', write_counter, tags)

    except Exception as e:
        self.warning("Cannot retrieve metrics for %s: %s", container_id, e)
def test_round_modify_sig_digits(self):
    """round_value honours the ``precision`` keyword for each listed case."""
    cases = (
        # (raw value, precision, expected result)
        (2.555, 2, 2.560),
        (4.2345, 2, 4.23),
        (4.2345, 3, 4.235),
    )
    for raw, digits, expected in cases:
        assert round_value(raw, precision=digits) == expected
def test_round_modify_method(self):
    """With ROUND_HALF_DOWN as the rounding method, 3.5 rounds to 3.0."""
    rounded = round_value(3.5, rounding_method=ROUND_HALF_DOWN)
    assert rounded == 3.0
def test_round_half_up(self):
    """With default arguments, 3.5 rounds to 4.0."""
    rounded = round_value(3.5)
    assert rounded == 4.0
def _check_db(self):
    """Query Redis ``INFO`` for the configured instance and submit the derived metrics.

    Emits the ``redis.can_connect`` service check (OK when ``INFO`` could be
    read and processed, CRITICAL otherwise), the ``INFO`` round-trip latency,
    per-database key statistics, the configured gauge/rate keys, the
    ``maxclients`` configuration gauge when it can be queried, per-client-name
    connection counts when ``self.collect_client_metrics`` is set, the command
    rate, and then delegates to the key-length, replication and (when
    ``command_stats`` is enabled on the instance) command-stats collectors.

    :raises Exception: whatever the connection / ``INFO`` processing raised,
        re-raised after the CRITICAL service check has been sent
    """
    conn = self._get_conn(self.instance)

    # Ping the database for info, and track the latency.
    # Process the service check: the check passes if we can connect to Redis
    start = time.time()
    try:
        info = conn.info()
        latency_ms = round_value((time.time() - start) * 1000, 2)

        tags = list(self.tags)
        if info.get("role"):
            tags.append("redis_role:{}".format(info["role"]))
        else:
            self.log.debug("Redis role was not found")

        self.gauge('redis.info.latency_ms', latency_ms, tags=tags)

        try:
            config = conn.config_get("maxclients")
        except redis.ResponseError:
            # config_get is disabled on some environments
            self.log.debug("Error querying config")
            config = {}

        self.service_check('redis.can_connect', AgentCheck.OK, tags=tags)
        self._collect_metadata(info)
    except Exception:
        # Covers ValueError as well; both used to have identical handlers.
        # `self.tags` is used because `tags` may not have been assigned yet.
        self.service_check('redis.can_connect', AgentCheck.CRITICAL, tags=self.tags)
        raise

    # Save the database statistics.
    for key in info.keys():
        if not self.db_key_pattern.match(key):
            continue

        db_tags = tags + ["redis_db:" + key]

        # allows tracking percentage of expired keys as DD does not
        # currently allow arithmetic on metric for monitoring
        expires_keys = info[key]["expires"]
        total_keys = info[key]["keys"]
        persist_keys = total_keys - expires_keys
        self.gauge("redis.persist", persist_keys, tags=db_tags)
        if total_keys:
            # Percentages are undefined for an empty keyspace; skip them
            # rather than abort the whole check on a division by zero.
            self.gauge("redis.persist.percent", 100 * persist_keys / total_keys, tags=db_tags)
            self.gauge("redis.expires.percent", 100 * expires_keys / total_keys, tags=db_tags)

        for subkey in self.subkeys:
            # Old redis module on ubuntu 10.04 (python-redis 0.6.1) does not
            # returns a dict for those key but a string: keys=3,expires=0
            # Try to parse it (see lighthouse #46)
            try:
                val = info[key].get(subkey, -1)
            except AttributeError:
                val = self._parse_dict_string(info[key], subkey, -1)
            metric = 'redis.{}'.format(subkey)
            self.gauge(metric, val, tags=db_tags)

    # Save a subset of db-wide statistics
    for info_name in info:
        if info_name in self.GAUGE_KEYS:
            self.gauge(self.GAUGE_KEYS[info_name], info[info_name], tags=tags)
        elif info_name in self.RATE_KEYS:
            self.rate(self.RATE_KEYS[info_name], info[info_name], tags=tags)

    for config_key, value in iteritems(config):
        metric_name = self.CONFIG_GAUGE_KEYS.get(config_key)
        if metric_name is not None:
            self.gauge(metric_name, value, tags=tags)

    if self.collect_client_metrics:
        # Save client connections statistics
        clients = conn.client_list()
        clients_by_name = Counter(client["name"] or DEFAULT_CLIENT_NAME for client in clients)
        for name, count in clients_by_name.items():
            self.gauge("redis.net.connections", count, tags=tags + ['source:' + name])

    # Save the number of commands.
    self.rate('redis.net.commands', info['total_commands_processed'], tags=tags)
    if 'instantaneous_ops_per_sec' in info:
        self.gauge('redis.net.instantaneous_ops_per_sec', info['instantaneous_ops_per_sec'], tags=tags)

    # Check some key lengths if asked
    self._check_key_lengths(conn, list(tags))

    # Check replication
    self._check_replication(info, tags)
    if self.instance.get("command_stats", False):
        self._check_command_stats(conn, tags)
def submit_perf_metrics(self, container_tags, container_id, container_stats):
    """Submit CPU, memory, block-I/O and network metrics for one container.

    :param container_tags: mapping of container id to the tag list to apply
    :param container_id: id of the container the stats belong to
    :param container_stats: stats mapping for the container, or ``None``

    Any exception raised while processing is reported through ``self.warning``
    and stops the processing of this container; nothing is raised to the caller.
    """
    try:
        if container_stats is None:
            self.log.debug("Empty stats for container %s", container_id)
            return

        tags = container_tags[container_id]

        # CPU metrics
        cpu_stats = container_stats.get('cpu_stats', {})
        prev_cpu_stats = container_stats.get('precpu_stats', {})

        value_system = cpu_stats.get('cpu_usage', {}).get('usage_in_kernelmode')
        if value_system is not None:
            self.rate('ecs.fargate.cpu.system', value_system, tags)

        value_user = cpu_stats.get('cpu_usage', {}).get('usage_in_usermode')
        if value_user is not None:
            self.rate('ecs.fargate.cpu.user', value_user, tags)

        value_total = cpu_stats.get('cpu_usage', {}).get('total_usage')
        if value_total is not None:
            self.rate('ecs.fargate.cpu.usage', value_total, tags)

        available_cpu = cpu_stats.get('system_cpu_usage')
        preavailable_cpu = prev_cpu_stats.get('system_cpu_usage')
        prevalue_total = prev_cpu_stats.get('cpu_usage', {}).get('total_usage')

        # This is always false on Windows because the available cpu is not exposed
        if (
            available_cpu is not None
            and preavailable_cpu is not None
            and value_total is not None
            and prevalue_total is not None
        ):
            cpu_delta = float(value_total) - float(prevalue_total)
            system_delta = float(available_cpu) - float(preavailable_cpu)
        else:
            cpu_delta = 0.0
            system_delta = 0.0

        # Not reported on Windows
        active_cpus = float(cpu_stats.get('online_cpus', 0.0))

        cpu_percent = 0.0
        if system_delta > 0 and cpu_delta > 0 and active_cpus > 0:
            if system_delta > cpu_delta:
                cpu_percent = (cpu_delta / system_delta) * active_cpus * 100.0
                cpu_percent = round_value(cpu_percent, 2)
                self.gauge('ecs.fargate.cpu.percent', cpu_percent, tags)
            else:
                # There is a bug where container CPU usage is occasionally reported as greater than system
                # CPU usage (which, in fact, represents the maximum available CPU time during this timeframe),
                # leading to a non-sensical CPU percentage to be reported. To mitigate this we substitute the
                # system_delta with (t1 - t0)*active_cpus (with a scale factor to convert to nanoseconds)
                self.log.debug(
                    "Anomalous CPU value for container_id: %s. cpu_percent: %f",
                    container_id,
                    cpu_percent,
                )
                self.log.debug("ECS container_stats for container_id %s: %s", container_id, container_stats)

                # example format: '2021-09-22T04:55:52.490012924Z',
                t1 = container_stats.get('read', '')
                t0 = container_stats.get('preread', '')
                try:
                    t_delta = int((parser.isoparse(t1) - parser.isoparse(t0)).total_seconds())
                    # The int() truncation makes a sub-second interval 0; dividing by it
                    # would raise ZeroDivisionError, which is not caught here and would
                    # abort the memory, I/O and network metrics below. A non-positive
                    # interval cannot yield a meaningful percentage, so skip the gauge.
                    if t_delta > 0:
                        # Simplified formula for cpu_percent where system_delta = t_delta * active_cpus * (10 ** 9)
                        cpu_percent = (cpu_delta / (t_delta * (10**9))) * 100.0
                        cpu_percent = round_value(cpu_percent, 2)
                        self.gauge('ecs.fargate.cpu.percent', cpu_percent, tags)
                except ValueError:
                    # Timestamps missing or not parseable: no percentage for this run.
                    pass

        # Memory metrics
        memory_stats = container_stats.get('memory_stats', {})

        for metric in MEMORY_GAUGE_METRICS:
            value = memory_stats.get('stats', {}).get(metric)
            if value is not None and value < CGROUP_NO_VALUE:
                self.gauge('ecs.fargate.mem.' + metric, value, tags)
        for metric in MEMORY_RATE_METRICS:
            value = memory_stats.get('stats', {}).get(metric)
            if value is not None:
                self.rate('ecs.fargate.mem.' + metric, value, tags)

        value = memory_stats.get('max_usage')
        if value is not None:
            self.gauge('ecs.fargate.mem.max_usage', value, tags)

        value = memory_stats.get('usage')
        if value is not None:
            self.gauge('ecs.fargate.mem.usage', value, tags)

        value = memory_stats.get('limit')
        # When there is no hard-limit defined, the ECS API returns that value of 8 EiB
        # It's not exactly 2^63, but a rounded value of it most probably because of a int->float->int conversion
        if value is not None and value != 9223372036854771712:
            self.gauge('ecs.fargate.mem.limit', value, tags)

        # I/O metrics
        for blkio_cat, metric_name in iteritems(IO_METRICS):
            read_counter = write_counter = 0

            blkio_stats = container_stats.get("blkio_stats", {}).get(blkio_cat)
            # In Windows is always "None" (string), so don't report anything
            if blkio_stats == 'None':
                continue
            elif blkio_stats is None:
                blkio_stats = []

            for blkio_stat in blkio_stats:
                if blkio_stat["op"] == "Read" and "value" in blkio_stat:
                    read_counter += blkio_stat["value"]
                elif blkio_stat["op"] == "Write" and "value" in blkio_stat:
                    write_counter += blkio_stat["value"]
            self.rate(metric_name + 'read', read_counter, tags)
            self.rate(metric_name + 'write', write_counter, tags)

        # Network metrics
        networks = container_stats.get('networks', {})
        for network_interface, network_stats in iteritems(networks):
            network_tags = tags + ["interface:{}".format(network_interface)]
            for field_name, metric_name in iteritems(NETWORK_GAUGE_METRICS):
                metric_value = network_stats.get(field_name)
                if metric_value is not None:
                    self.gauge(metric_name, metric_value, network_tags)
            for field_name, metric_name in iteritems(NETWORK_RATE_METRICS):
                metric_value = network_stats.get(field_name)
                if metric_value is not None:
                    self.rate(metric_name, metric_value, network_tags)

    except Exception as e:
        self.warning("Cannot retrieve metrics for %s: %s", container_id, e)