def _update_app_rrds(data, app_metrics_dir, rrdclient, step, tm_env):
    """Update the rrd file of every application listed in ``data``.

    :param data:
        Container keyed by app unique name; ``data[name]`` is passed to
        ``rrd.update`` unchanged.
    :param app_metrics_dir:
        Directory holding one ``<app unique name>.rrd`` file per app.
    :param rrdclient:
        Client object forwarded to ``rrd.prepare`` and ``rrd.update``.
    :param step:
        Collection step; ``int(step) * 2`` is forwarded as the interval.
    :param tm_env:
        Environment whose ``svc_localdisk.get`` yields the app's disk info.
    :returns:
        Number of apps for which ``rrd.update`` returned a truthy value.
    """
    interval = int(step) * 2
    updated = 0

    for name in data:
        try:
            disk = tm_env.svc_localdisk.get(name)
            maj_min = '{major}:{minor}'.format(
                major=disk['dev_major'],
                minor=disk['dev_minor'],
            )
        except (exc.TreadmillError, IOError, OSError):
            # Lookup failed: carry on without a device major:minor.
            maj_min = None

        target = os.path.join(
            app_metrics_dir, '{app}.rrd'.format(app=name)
        )
        _LOGGER.debug('Update %s metrics from maj:min %s', name, maj_min)

        rrd.prepare(rrdclient, target, step, interval)
        succeeded = rrd.update(rrdclient, target, data[name], maj_min)
        if succeeded:
            updated += 1

    _LOGGER.debug('Updated %d container metrics', updated)
    return updated
def _update_core_rrds(data, core_metrics_dir, rrdclient, step, sys_maj_min): """Update core rrds""" interval = int(step) * 2 total = 0 for cgrp in data: rrd_basename = CORE_RRDS[cgrp] rrdfile = os.path.join(core_metrics_dir, rrd_basename) rrd.prepare(rrdclient, rrdfile, step, interval) if rrd.update(rrdclient, rrdfile, data[cgrp], sys_maj_min): total += 1 return total
def _update_service_rrds(data, core_metrics_dir, rrdclient, step,
                         sys_maj_min):
    """Update the rrd file of every core service listed in ``data``.

    :param data:
        Container keyed by service name; ``data[svc]`` is passed to
        ``rrd.update`` unchanged.
    :param core_metrics_dir:
        Directory holding one ``<service>.rrd`` file per service.
    :param rrdclient:
        Client object forwarded to ``rrd.prepare`` and ``rrd.update``.
    :param step:
        Collection step; ``int(step) * 2`` is forwarded as the interval.
    :param sys_maj_min:
        Device major:minor forwarded to ``rrd.update`` for every service.
    :returns:
        Number of services for which ``rrd.update`` returned a truthy value.
    """
    interval = int(step) * 2
    updated = 0

    for service in data:
        target = os.path.join(
            core_metrics_dir, '{svc}.rrd'.format(svc=service)
        )
        rrd.prepare(rrdclient, target, step, interval)

        succeeded = rrd.update(
            rrdclient, target, data[service], sys_maj_min
        )
        if succeeded:
            updated += 1

    _LOGGER.debug(
        'Updated %d service metrics from maj:min %s', updated, sys_maj_min
    )
    return updated
def metrics(step, approot):
    """Collect node and container metrics.

    Prepares the ``apps`` and ``core`` directories under the environment's
    ``metrics_dir``, then loops without ever leaving. On every pass it:

    * updates the three core rrds (apps, core and system cgroups);
    * updates one rrd per core service returned by ``_core_svcs``;
    * updates one rrd per directory found under ``tm_env.apps_dir``;
    * forgets and deletes the rrd of each app that was monitored on the
      previous pass but no longer has a directory;
    * sleeps for whatever remains of ``step`` seconds.

    :param step:
        Collection period in seconds; ``int(step) * 2`` is used as the
        interval when an rrd is created.
    :param approot:
        Root directory of the application environment.
    """
    tm_env = appenv.AppEnvironment(root=approot)

    app_metrics_dir = os.path.join(tm_env.metrics_dir, 'apps')
    core_metrics_dir = os.path.join(tm_env.metrics_dir, 'core')
    for directory in (app_metrics_dir, core_metrics_dir):
        fs.mkdir_safe(directory)

    interval = int(step) * 2

    rrdclient = rrdutils.RRDClient('/tmp/treadmill.rrd')

    def _create_if_missing(path):
        """Create the rrd at ``path`` unless something already exists."""
        if not os.path.exists(path):
            rrdclient.create(path, step, interval)

    # Seed the monitored set with the rrd files already present on disk.
    suffix = '.rrd'
    monitored_apps = {
        os.path.basename(found)[:-len(suffix)]
        for found in glob.glob('%s/*' % app_metrics_dir)
        if found.endswith(suffix)
    }

    sys_svcs = _core_svcs(approot)
    # Never populated in this function, so no service is skipped below.
    sys_svcs_no_metrics = set()

    sys_maj_min = '{}:{}'.format(*fs.path_to_maj_min(approot))
    sys_block_dev = fs.maj_min_to_blk(*fs.path_to_maj_min(approot))
    _LOGGER.info(
        'Device %s maj:min = %s for approot: %s',
        sys_block_dev, sys_maj_min, approot
    )

    # (rrd file, cgroup) pairs for the node-level aggregates.
    core_targets = [
        (os.path.join(core_metrics_dir, 'treadmill.apps.rrd'),
         'treadmill/apps'),
        (os.path.join(core_metrics_dir, 'treadmill.core.rrd'),
         'treadmill/core'),
        (os.path.join(core_metrics_dir, 'treadmill.system.rrd'),
         'treadmill'),
    ]
    for core_rrdfile, _ in core_targets:
        _create_if_missing(core_rrdfile)

    while True:
        started = time.time()

        # Node-level aggregates.
        for core_rrdfile, core_cgrp in core_targets:
            rrd.update(
                rrdclient, core_rrdfile, core_cgrp,
                sys_maj_min, sys_block_dev
            )
        count = len(core_targets)

        # Core services.
        for svc in sys_svcs:
            if svc in sys_svcs_no_metrics:
                continue

            svc_rrdfile = os.path.join(
                core_metrics_dir, '{svc}.rrd'.format(svc=svc)
            )
            _create_if_missing(svc_rrdfile)

            rrd.update(
                rrdclient, svc_rrdfile,
                os.path.join('treadmill', 'core', svc),
                sys_maj_min, sys_block_dev
            )
            count += 1

        # Applications that currently have a directory under apps_dir.
        seen_apps = set()
        for app_dir in glob.glob('%s/*' % tm_env.apps_dir):
            if not os.path.isdir(app_dir):
                continue

            name = os.path.basename(app_dir)
            seen_apps.add(name)

            try:
                disk = tm_env.svc_localdisk.get(name)
                app_maj_min = '{major}:{minor}'.format(
                    major=disk['dev_major'],
                    minor=disk['dev_minor'],
                )
                app_block_dev = disk['block_dev']
            except (exc.TreadmillError, IOError, OSError):
                # Lookup failed: update without device information.
                app_maj_min = None
                app_block_dev = None

            app_rrdfile = os.path.join(
                app_metrics_dir, '{app}.rrd'.format(app=name)
            )
            _create_if_missing(app_rrdfile)

            rrd.update(
                rrdclient, app_rrdfile,
                os.path.join('treadmill', 'apps', name),
                app_maj_min, app_block_dev
            )
            count += 1

        # Drop the metrics of apps that are not present anymore.
        for gone in monitored_apps - seen_apps:
            stale_rrdfile = os.path.join(
                app_metrics_dir, '{app}.rrd'.format(app=gone)
            )
            _LOGGER.info('removing %r', stale_rrdfile)
            rrdclient.forget(stale_rrdfile)
            os.unlink(stale_rrdfile)

        monitored_apps = seen_apps

        elapsed = time.time() - started
        _LOGGER.info(
            'Got %d cgroups metrics in %.3f seconds', count, elapsed
        )
        # Sleep only for the part of the period not spent collecting.
        if step > elapsed:
            time.sleep(step - elapsed)

    # Graceful shutdown.
    # NOTE(review): the loop above has no exit, so this line is not reached.
    _LOGGER.info('service shutdown.')