def prometheus_cleanup_worker(pid):
    """Aggregate dead worker's metrics into a single archive file.

    Marks the worker ``pid`` as dead, merges its histogram and counter
    files with any archive files that already exist, writes the merged
    metrics to temporary files and then, while holding the prometheus lock,
    renames them over the archive files and deletes the worker's own files.

    Returns early (after marking the process dead) when the worker left
    neither a histogram nor a counter file.  If the lock cannot be acquired
    the failure is logged and the worker files are left in place.

    :param pid: process id of the worker that exited.
    """
    from prometheus_client import multiprocess
    multiprocess.mark_process_dead(pid)  # this takes care of gauges
    prom_dir = os.environ['prometheus_multiproc_dir']
    worker_files = [
        'histogram_{}.db'.format(pid),
        'counter_{}.db'.format(pid),
    ]
    paths = _filter_exists(os.path.join(prom_dir, f) for f in worker_files)

    # check at least one worker file exists
    if not paths:
        return

    histogram_path = os.path.join(prom_dir, histogram_archive)
    counter_path = os.path.join(prom_dir, counter_archive)
    archive_paths = _filter_exists([histogram_path, counter_path])

    collect_paths = paths + archive_paths
    collector = multiprocess.MultiProcessCollector(None)

    try:
        metrics = collector.merge(collect_paths, accumulate=False)
    except AttributeError:
        # NOTE(review): presumably a prometheus_client without merge();
        # confirm which versions need the legacy path.
        metrics = legacy_collect(collect_paths)

    # Create the temporary files next to the archives: os.rename() is only
    # reliable (and atomic) within a single filesystem, and the default
    # temporary directory may live on a different one.  The handles are
    # closed straight away; only the reserved names are needed.
    tmp_paths = []
    try:
        for _ in range(2):
            with tempfile.NamedTemporaryFile(dir=prom_dir, delete=False) as tmp:
                tmp_paths.append(tmp.name)
        tmp_histogram, tmp_counter = tmp_paths

        write_metrics(metrics, tmp_histogram, tmp_counter)

        try:
            # ensure reader does not get partial state
            with try_prometheus_lock():
                os.rename(tmp_histogram, histogram_path)
                os.rename(tmp_counter, counter_path)

                for path in paths:
                    os.unlink(path)
        except PrometheusLockTimeout:
            logging.getLogger(__name__).exception(
                'Failed to acquire prometheus lock to clean worker files',
                extra={
                    'pid': pid,
                    'paths': paths,
                }
            )
    finally:
        # Anything still present was not renamed into place (lock timeout
        # or an error); do not leave it behind.
        for tmp_path in tmp_paths:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
def serve(self, args):
    """Run the master process: spawn workers, wait for a stop, shut down.

    Starts one rest and one notify worker process per ``args.workers`` and,
    when ``args.with_metrics`` is set and usable, one metrics process.  It
    then blocks in ``signal.pause()`` until ``self.running`` becomes false
    or a keyboard interrupt arrives, and finally stops the workers and
    removes their unix sockets.

    :param args: parsed options; this method reads ``process_name``,
        ``log_level``, ``socket_path``, ``translations_path``, ``workers``,
        ``insecure``, ``with_experimental`` and ``with_metrics``.
    """
    # current_thread()/.name replace the camelCase aliases deprecated in
    # Python 3.10.
    threading.current_thread().name = 'master'
    if SETPROCTITLE:
        setproctitle.setproctitle(args.process_name + ' master %s' % ' '.join(sys.argv[1:]))

    # Initialize logging, keep this at the beginning!
    self.init_logging(args.log_level)

    # Remove sockets left over from a previous run.
    for f in glob.glob(os.path.join(args.socket_path, 'rest*.sock')):
        os.unlink(f)
    for f in glob.glob(os.path.join(args.socket_path, 'notify*.sock')):
        os.unlink(f)

    # Initialize translations
    self.translations = self.get_translations(args.translations_path)
    if not self.translations:
        logging.warning(
            'no po files found, no translations will be available')
    else:
        # TODO: lazy-logging, info message?
        logging.debug("translations available for: '%s'", ', '.join(self.translations.keys()))

    if not UJSON:
        logging.warning(
            'ujson module is not available, falling back to slower stdlib json implementation'
        )

    logging.info('starting kopano-mfr')

    # Fake exit queue.
    # NOTE(review): the single queued item is taken off and marked done in
    # the clean-shutdown path below; presumably the Runner instances wait
    # on it — confirm against Runner.
    queue = multiprocessing.JoinableQueue(1)
    queue.put(True)

    workers = []
    for n in range(args.workers):
        rest_runner = Runner(queue, self.run_rest, 'rest', args.process_name, n)
        rest_process = multiprocessing.Process(
            target=rest_runner.run,
            name='rest{}'.format(n),
            args=(args.socket_path, n, args))
        workers.append(rest_process)

        notify_runner = Runner(queue, self.run_notify, 'notify', args.process_name, n)
        notify_process = multiprocessing.Process(
            target=notify_runner.run,
            name='notify{}'.format(n),
            args=(args.socket_path, n, args))
        workers.append(notify_process)

    for worker in workers:
        worker.daemon = True
        worker.start()

    if args.insecure:
        logging.warning(
            'insecure mode - TLS client connections are susceptible to man-in-the-middle attacks and safety checks are off - this is not suitable for production use'
        )

    if args.with_experimental:
        logging.warning('experimental endpoints are enabled')

    if args.with_metrics:
        if PROMETHEUS:
            if os.environ.get('prometheus_multiproc_dir'):
                # Spawn the metrics process later, so we can pass along worker name and pids.
                monitor_workers = [(worker.name, worker.pid) for worker in workers]
                # Include master process.
                monitor_workers.append(('master', os.getpid()))
                metrics_runner = Runner(queue, self.run_metrics, 'metrics', args.process_name, 0)
                metrics_process = multiprocessing.Process(
                    target=metrics_runner.run,
                    args=(args.socket_path, args, monitor_workers))
                metrics_process.daemon = True
                metrics_process.start()
                workers.append(metrics_process)
            else:
                logging.error('please export "prometheus_multiproc_dir"')
                # Skips the wait loop below and goes straight to shutdown.
                self.running = False
        else:
            logging.error(
                'please install prometheus client python bindings')
            self.running = False

    signal.signal(signal.SIGCHLD, self.sigchld)
    signal.signal(signal.SIGTERM, self.sigterm)

    try:
        while self.running:
            signal.pause()
    except KeyboardInterrupt:
        self.running = False
        logging.info('keyboard interrupt')

    logging.info('starting shutdown')
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)

    if not self.abnormal_shutdown:
        # Flush queue, to tell workers to cleanly exit.
        queue.get()
        try:
            queue.task_done()
        except ValueError:
            # NOTE(longsleep): If a process encountered an error task_done() was
            # already called, thus it errors which is ok and can be ignored.
            pass

    # Wait for workers to exit (at most 5 seconds, polled in 1 second steps).
    deadline = time.monotonic() + 5
    done = []
    while deadline > time.monotonic():
        ready = multiprocessing.connection.wait([
            worker.sentinel for worker in workers if worker.sentinel not in done
        ], timeout=1)
        done.extend(ready)
        if len(done) == len(workers):
            break

    # Kill off workers which did not exit.
    kill = len(done) != len(workers)
    metrics_active = os.environ.get('prometheus_multiproc_dir') and args.with_metrics and PROMETHEUS
    for worker in workers:
        if kill and worker.is_alive():
            if self.abnormal_shutdown:
                logging.critical('killing worker: %d', worker.pid)
                os.kill(worker.pid, signal.SIGKILL)
            else:
                logging.warning('terminating worker: %d', worker.pid)
                worker.terminate()
        if metrics_active:
            prometheus_multiprocess.mark_process_dead(worker.pid)
        worker.join()

    # Cleanup potentially left over sockets.
    socket_names = ['rest%d.sock' % n for n in range(args.workers)]
    socket_names += ['notify%d.sock' % n for n in range(args.workers)]
    for socket_name in socket_names:
        unix_socket = os.path.join(args.socket_path, socket_name)
        try:
            os.unlink(unix_socket)
        except OSError as err:
            # A socket that is already gone is the expected case.
            if err.errno != errno.ENOENT:
                logging.warning(
                    'failed to remove socket %s on shutdown, error: %s',
                    unix_socket, err)

    logging.info('shutdown complete')
def child_exit(server, worker):
    """Report the exited worker's pid via ``multiprocess.mark_process_dead``.

    ``server`` is accepted but not used.
    """
    exited_pid = worker.pid
    multiprocess.mark_process_dead(exited_pid)
def child_exit(server, worker):
    """Mark the exited worker as dead for prometheus_client multiprocess mode.

    ``server`` is accepted but not used.  prometheus_client is imported
    inside the function, as in the original.
    """
    from prometheus_client import multiprocess as prom_multiprocess

    exited_pid = worker.pid
    prom_multiprocess.mark_process_dead(exited_pid)
def test_mark_process_dead_respects_lowercase(self):
    """mark_process_dead must work with the lowercase environment variable."""
    # Only the absence of an exception is checked here; the actual logic
    # is tested elsewhere.
    env_override = {'prometheus_multiproc_dir': self.tempdir}
    os.environ.update(env_override)
    mark_process_dead(123)
async def after_server_stop(*args: Any, **kwargs: Any) -> None:
    """Mark the current process as dead via ``multiprocess.mark_process_dead``.

    All positional and keyword arguments are accepted and ignored.
    NOTE(review): presumably registered as a server-stop hook, going by
    the name — confirm at the registration site.
    """
    current_pid = os.getpid()
    multiprocess.mark_process_dead(current_pid)
def main():
    """Entry point: start the worker processes, wait, then shut them down.

    Parses options, writes the pid file, removes stale sockets, and starts
    ``nworkers`` rest processes, one notify process and (with
    ``options.with_metrics``) one metrics process.  It then blocks in
    ``signal.pause()`` until ``RUNNING`` becomes false or a keyboard
    interrupt arrives, terminates the workers and removes their sockets.

    Exits with status -1 when metrics are requested but either the
    prometheus bindings or ``prometheus_multiproc_dir`` is missing.
    """
    # Local import: only the socket cleanup below needs it.
    import errno

    global RUNNING

    options, _ = opt_args()
    if SETPROCTITLE:
        setproctitle.setproctitle(options.process_name + ' master')

    socket_path = options.socket_path or SOCKET_PATH
    nworkers = options.workers if options.workers is not None else WORKERS

    create_pidfile(options.pid_file)

    # Remove sockets left over from a previous run.
    for f in glob.glob(os.path.join(socket_path, 'rest*.sock')):
        os.unlink(f)
    for f in glob.glob(os.path.join(socket_path, 'notify*.sock')):
        os.unlink(f)

    q_listener, q = logger_init()
    logging.info('starting kopano-mfr')

    workers = []
    for n in range(nworkers):
        process = multiprocessing.Process(target=run_app, args=(socket_path, n, options))
        workers.append(process)

    notify_process = multiprocessing.Process(target=run_notify, args=(socket_path, options))
    workers.append(notify_process)

    if options.with_metrics:
        if PROMETHEUS:
            if not os.environ.get('prometheus_multiproc_dir'):
                logging.error('please export "prometheus_multiproc_dir"')
                sys.exit(-1)
            metrics_process = multiprocessing.Process(target=run_metrics, args=(socket_path, options))
            workers.append(metrics_process)
        else:
            logging.error('please install prometheus client python bindings')
            sys.exit(-1)

    for worker in workers:
        worker.daemon = True
        worker.start()

    signal.signal(signal.SIGCHLD, sigchld)
    signal.signal(signal.SIGTERM, sigterm)

    try:
        while RUNNING:
            signal.pause()
    except KeyboardInterrupt:
        RUNNING = False
        logging.info('keyboard interrupt')

    logging.info('starting shutdown')

    for worker in workers:
        worker.terminate()
        worker.join()

    q_listener.stop()

    # Remove the sockets the workers may have left behind.
    socket_names = ['rest%d.sock' % n for n in range(nworkers)]
    socket_names.append('notify.sock')
    for socket_name in socket_names:
        unix_socket = os.path.join(socket_path, socket_name)
        try:
            os.unlink(unix_socket)
        except OSError as err:
            # A socket that is already gone is expected; anything else is
            # reported instead of being silently dropped.
            if err.errno != errno.ENOENT:
                logging.warning(
                    'failed to remove socket %s on shutdown, error: %s',
                    unix_socket, err)

    if options.with_metrics:
        for worker in workers:
            multiprocess.mark_process_dead(worker.pid)

    logging.info('shutdown complete')
def cleanup_prometheus_files_at_exit():
    """Mark the current process as dead when ``PROMETHEUS_MULTIPROC_DIR`` is set.

    Does nothing when ``PROMETHEUS_MULTIPROC_DIR`` is falsy.
    """
    if not PROMETHEUS_MULTIPROC_DIR:
        return
    multiprocess.mark_process_dead(os.getpid())