def _kill_process(leash):
    leash.terminate()

    time.sleep(3)

    # `returncode` is only populated once the child has been reaped
    # via `poll()` or `wait()`
    leash.poll()

    if leash.returncode is None:
        log.error(
            'Process %s did not stop gracefully, killing...' % leash.pid)
        leash.kill()

    leash.wait()
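# A minimal usage sketch (hypothetical command): `leash` is assumed to be a
# `subprocess.Popen` handle, consistent with what `_run_process` below
# returns:
#
#     proc = subprocess.Popen(['/bin/sleep', '60'])
#     _kill_process(proc)  # SIGTERM first, SIGKILL after a 3s grace period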
def get_recording(data_dir, filename, exists=False, not_exists=False,
                  ensure_path=False):
    # Normalize the data root so that the prefix check below is meaningful
    data_dir = os.path.abspath(data_dir)

    filename = os.path.abspath(os.path.join(data_dir, filename))

    if not filename.startswith(data_dir):
        log.error('Requested recording %s is outside of data root' % filename)
        raise error.ControlPlaneError('No such recording')

    if exists and not os.path.exists(filename):
        log.error('Requested recording %s does not exist' % filename)
        raise error.ControlPlaneError('No such recording')

    if not_exists and os.path.exists(filename):
        log.error('Requested recording %s unexpectedly exists' % filename)
        raise error.ControlPlaneError('No such recording')

    if ensure_path:
        directory = os.path.dirname(filename)

        if not os.path.exists(directory):
            try:
                os.makedirs(directory)

            except OSError as exc:
                log.error('Failed to create %s: %s', directory, exc)
                raise error.ControlPlaneError('No such recording')

        elif not os.path.isdir(directory):
            raise error.ControlPlaneError('No such recording')

    return os.path.dirname(filename), os.path.basename(filename)
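# A usage sketch with hypothetical paths: a well-formed name comes back
# split into directory and basename, while a traversal attempt resolves
# outside the data root and is rejected:
#
#     get_recording('/var/snmpsim/data', 'devices/router-1.snmprec')
#     # -> ('/var/snmpsim/data/devices', 'router-1.snmprec')
#
#     get_recording('/var/snmpsim/data', '../../etc/passwd')
#     # -> raises error.ControlPlaneError('No such recording')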
def import_metrics(jsondoc):
    """Update metrics DB from `dict` data structure.

    The input data structure is expected to be the one produced
    by SNMP simulator's command responder `fulljson` reporting
    module.
    """
    flavor = jsondoc.get('format')

    importer = KNOWN_IMPORTERS.get(flavor)
    if not importer:
        log.error('Unknown metric flavor %s, '
                  'ignoring' % (flavor or '<unspecified>'))
        return

    try:
        importer(jsondoc)

    except Exception as exc:
        log.error('Metric importer %s failed: %s' % (flavor, exc))
        log.error('JSON document causing failure is: %s' % jsondoc)
        db.session.rollback()
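# A hypothetical call sketch: only the `format` key is interpreted here to
# pick a flavor-specific importer from `KNOWN_IMPORTERS`; the rest of the
# document is passed through (the `fulljson` schema itself is not
# reproduced here):
#
#     import_metrics({'format': 'fulljson', ...})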
def watch_metrics(watch_dir):
    log.info('Watching directory %s' % watch_dir)

    while True:
        try:
            files = _traverse_dir(watch_dir)

        except Exception as exc:
            log.error('Directory %s traversal failure: %s' % (watch_dir, exc))
            time.sleep(10)
            continue

        for filename in files:
            log.info('Processing %s' % filename)

            try:
                with open(filename) as fl:
                    jsondoc = json.loads(fl.read())

            except Exception as exc:
                log.error('Error reading file %s: %s' % (filename, exc))
                continue

            finally:
                # Consume the file regardless of the outcome
                os.unlink(filename)

            try:
                manager.import_metrics(jsondoc)

            except Exception as exc:
                log.error('Error processing file %s: %s' % (filename, exc))
                continue

        time.sleep(POLL_PERIOD)
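# A sketch of how a producer feeds this loop (hypothetical filename): the
# command responder drops JSON documents into `watch_dir`; each file is
# parsed, unconditionally unlinked, then imported:
#
#     with open(os.path.join(watch_dir, 'metrics-0001.json'), 'w') as fl:
#         fl.write(json.dumps(jsondoc))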
def manage_executables(watch_dir):
    known_instances = {}

    log.info('Watching directory %s' % watch_dir)

    while True:
        # Collect and log processes output

        rlist = {
            x['pipe'][0]: x['executable']
            for x in known_instances.values()
            if x['state'] == STATE_RUNNING
        }

        while True:
            try:
                r, w, x = select.select(rlist, [], [], 0.1)

            except Exception as exc:
                log.error(exc)
                break

            if not r:
                break

            timestamp = int(time.time())

            for fd in r:
                executable = rlist[fd]
                instance = known_instances[executable]
                console = instance['console']

                log.msg('Output from process "%s" begins' % executable)

                page_text = os.read(fd, console.MAX_CONSOLE_SIZE)
                page_text = page_text.decode(errors='ignore')

                console.add(page_text, timestamp)

                log.msg(page_text)

                log.msg('Output from process "%s" ends' % executable)

        # Watch executables

        existing_files = set()

        try:
            files = _traverse_dir(watch_dir)

        except Exception as exc:
            log.error('Directory %s traversal failure: %s' % (watch_dir, exc))
            time.sleep(10)
            continue

        for fl in files:
            instance = known_instances.get(fl)

            stat = os.stat(fl).st_mtime

            if not instance:
                instance = {
                    'pid': 0,
                    'executable': fl,
                    'file_info': stat,
                    'leash': None,  # `subprocess.Popen` handle once started
                    'pipe': (None, None),
                    'state': STATE_ADDED,
                    'created': time.time(),
                    'started': None,
                    'stopped': None,
                    'runtime': lifecycle.Counter(0),
                    'changes': lifecycle.Counter(0),
                    'exits': lifecycle.Counter(0),
                    'console': lifecycle.ConsoleLog(),
                }
                known_instances[fl] = instance

                log.info('Start tracking executable %s' % fl)

            pid = instance['leash'].pid if instance['leash'] else '?'

            if instance['file_info'] != stat:
                instance['file_info'] = stat
                instance['state'] = STATE_CHANGED
                instance['changes'] += 1

                log.info('Existing executable %s (PID %s) has '
                         'changed' % (fl, pid))

            if instance['state'] == STATE_RUNNING:
                executable = instance['leash']

                executable.poll()

                if executable.returncode is not None:
                    instance['state'] = STATE_DIED
                    instance['stopped'] = time.time()
                    instance['exits'] += 1

                    uptime = int(
                        time.time() - (instance['started'] or time.time()))

                    log.info('Executable %s (PID %s) has died '
                             '(rc=%s), uptime %s' % (
                                 fl, pid, executable.returncode, uptime))

            existing_files.add(fl)

        removed_files = set(known_instances) - existing_files

        for fl in removed_files:
            instance = known_instances[fl]

            instance['state'] = STATE_REMOVED
            instance['changes'] += 1

            log.info('Existing executable %s (PID %s) has been '
                     'removed' % (fl, instance['pid']))

        for fl, instance in tuple(known_instances.items()):
            state = instance['state']

            if state in (STATE_ADDED, STATE_DIED):
                if state == STATE_DIED:
                    r, w = instance['pipe']

                    try:
                        os.close(r)
                        os.close(w)

                    except OSError as exc:
                        log.error(exc)

                r, w = os.pipe()

                leash = _run_process(fl, w)

                instance['leash'] = leash
                instance['pipe'] = r, w

                if leash:
                    instance['state'] = STATE_RUNNING
                    instance['started'] = time.time()
                    instance['pid'] = leash.pid

                    log.info('Executable %s (PID %s) has been '
                             'started' % (fl, leash.pid))

            elif state in (STATE_CHANGED, STATE_REMOVED):
                leash = instance['leash']

                if leash:
                    _kill_process(leash)

                    log.info('Executable %s (PID %s) has been '
                             'stopped' % (fl, leash.pid))

                r, w = instance['pipe']

                if r:
                    try:
                        os.close(r)
                        os.close(w)

                    except OSError as exc:
                        log.error(exc)

                if state == STATE_CHANGED:
                    # Flag for restart on the next pass
                    instance['state'] = STATE_DIED

                else:
                    known_instances.pop(fl)

                    log.info('Stopped tracking executable %s' % fl)

            elif state == STATE_RUNNING:
                leash = instance['leash']

                if _process_is_running(leash):
                    now = time.time()

                    instance['runtime'] = lifecycle.Counter(
                        now - instance['created'])

                else:
                    instance['state'] = STATE_DIED
                    instance['exits'] += 1

                    log.info('Executable %s (PID %s) has '
                             'died' % (fl, leash.pid))

        ReportingManager.process_metrics(watch_dir, *known_instances.values())

        time.sleep(POLL_PERIOD)
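# Summary of the state machine driven by the loop above:
#
#     ADDED / DIED      -> (re)start process        -> RUNNING
#     RUNNING           -> process exit detected    -> DIED
#     file mtime change -> CHANGED -> stop process  -> DIED (restart next pass)
#     file disappeared  -> REMOVED -> stop process  -> untracked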
def _run_process(fl, fd):
    try:
        return subprocess.Popen([fl], stdout=fd, stderr=fd)

    except Exception as exc:
        log.error('Executable %s failed to start: %s' % (fl, exc))
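# A wiring sketch (hypothetical path): the supervisor creates a pipe, hands
# the write end to the child for combined stdout/stderr, and later drains
# console output from the read end via select():
#
#     r, w = os.pipe()
#     leash = _run_process('/path/to/executable', w)
#     if leash:
#         page_text = os.read(r, 1024)  # child's combined console output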
def collect_metrics(*instances):
    """Collect process metrics.

    Example
    -------

    .. code-block:: python

        {
            'executable': '/path/to/executable',
            'memory': 0,  # memory being used (MB, gauge)
            'cpu': 0,  # consumed cpu time (ms, cumulative)
            'files': 0,  # number of open files (gauge)
            'runtime': 0,  # total time this executable has been running
                           # (cumulative)
            'exits': 0,  # number of unexpected exits (cumulative)
            'restarts': 0,  # number of restarts because of changes
                            # (cumulative)
            'endpoints': {  # allocated network endpoints (gauge)
                'udpv4': [
                    '127.0.0.1:161',
                    '127.0.0.2:161'
                ]
            },
            'console': [
                {
                    'timestamp': {time},
                    'text': '{text}'
                }
            ]
        }
    """
    all_metrics = []

    for instance in instances:
        pid = instance['pid']
        if not pid:
            continue

        try:
            process = psutil.Process(pid)

            process_info = process.as_dict()

            endpoints = collections.defaultdict(list)

            for kind in ENDPOINT_MAP:
                for conn in process.connections(kind):
                    endpoints[ENDPOINT_MAP[kind]].append(
                        '%s:%s' % (conn.laddr.ip, conn.laddr.port)
                    )

        except psutil.Error as exc:
            log.error(exc)
            continue

        metrics = {
            'memory': lifecycle.Gauge(
                process_info['memory_info'].vms // 1024 // 1024),
            'cpu': lifecycle.Counter(
                (process_info['cpu_times'].user
                 + process_info['cpu_times'].system) * 1000),
            'endpoints': endpoints,
            'files': lifecycle.Gauge(process_info['num_fds']),
        }

        metrics.update(
            **{metric: instance[metric] for metric in LIFECYCLE_METRICS})

        all_metrics.append(metrics)

    return all_metrics
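# A consumption sketch (hypothetical instance record): each `instance` is
# assumed to be a tracking record maintained by `manage_executables` above,
# carrying `pid` and the LIFECYCLE_METRICS counters:
#
#     for metrics in collect_metrics(*known_instances.values()):
#         log.info('memory=%s endpoints=%s' % (
#             metrics['memory'], dict(metrics['endpoints'])))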