Example #1
def prometheus_cleanup_worker(pid):
    """Aggregate dead worker's metrics into a single archive file."""
    from prometheus_client import multiprocess
    multiprocess.mark_process_dead(pid)  # this takes care of gauges
    prom_dir = os.environ['prometheus_multiproc_dir']
    worker_files = [
        'histogram_{}.db'.format(pid),
        'counter_{}.db'.format(pid),
    ]
    paths = _filter_exists(os.path.join(prom_dir, f) for f in worker_files)

    # nothing to aggregate if no worker files exist
    if not paths:
        return

    histogram_path = os.path.join(prom_dir, histogram_archive)
    counter_path = os.path.join(prom_dir, counter_archive)
    archive_paths = _filter_exists([histogram_path, counter_path])

    collect_paths = paths + archive_paths
    collector = multiprocess.MultiProcessCollector(None)

    try:
        metrics = collector.merge(collect_paths, accumulate=False)
    except AttributeError:
        metrics = legacy_collect(collect_paths)

    tmp_histogram = tempfile.NamedTemporaryFile(delete=False)
    tmp_counter = tempfile.NamedTemporaryFile(delete=False)
    write_metrics(metrics, tmp_histogram.name, tmp_counter.name)

    try:
        # ensure readers do not observe partial state
        with try_prometheus_lock():
            os.rename(tmp_histogram.name, histogram_path)
            os.rename(tmp_counter.name, counter_path)

            for path in paths:
                os.unlink(path)
    except PrometheusLockTimeout:
        logging.getLogger(__name__).exception(
            'Failed to acquire prometheus lock to clean worker files',
            extra={
                'pid': pid,
                'paths': paths,
            }
        )
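The function above relies on several module-level names that the snippet does not show: _filter_exists, try_prometheus_lock, PrometheusLockTimeout, and the histogram_archive/counter_archive filenames. A minimal sketch of what these could look like, assuming an advisory flock on a file inside the multiproc directory (the original project's implementation may differ):

import fcntl
import os
import time
from contextlib import contextmanager

# Assumed archive filenames; the real names are defined elsewhere.
histogram_archive = 'histogram_archive.db'
counter_archive = 'counter_archive.db'


class PrometheusLockTimeout(Exception):
    pass


def _filter_exists(paths):
    # Keep only the paths that are actually present on disk.
    return [p for p in paths if os.path.exists(p)]


@contextmanager
def try_prometheus_lock(timeout=10.0):
    # Serialise archive rewrites against concurrent readers with an
    # advisory flock; give up with PrometheusLockTimeout after `timeout`.
    lock_path = os.path.join(os.environ['prometheus_multiproc_dir'], 'lock')
    fd = os.open(lock_path, os.O_CREAT | os.O_RDWR)
    deadline = time.monotonic() + timeout
    try:
        while True:
            try:
                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except OSError:
                if time.monotonic() >= deadline:
                    raise PrometheusLockTimeout(lock_path)
                time.sleep(0.05)
        yield
    finally:
        os.close(fd)  # closing the descriptor also releases the lock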
Example #2
    def serve(self, args):
        threading.current_thread().name = 'master'
        if SETPROCTITLE:
            setproctitle.setproctitle(args.process_name +
                                      ' master %s' % ' '.join(sys.argv[1:]))

        # Initialize logging, keep this at the beginning!
        self.init_logging(args.log_level)

        for f in glob.glob(os.path.join(args.socket_path, 'rest*.sock')):
            os.unlink(f)
        for f in glob.glob(os.path.join(args.socket_path, 'notify*.sock')):
            os.unlink(f)

        # Initialize translations
        self.translations = self.get_translations(args.translations_path)

        if not self.translations:
            logging.warning(
                'no po files found, no translations will be available')
        else:
            # TODO: lazy-logging, info message?
            logging.debug("translations available for: '%s'",
                          ', '.join(self.translations.keys()))

        if not UJSON:
            logging.warning(
                'ujson module is not available, falling back to slower stdlib json implementation'
            )

        logging.info('starting kopano-mfr')

        # Fake exit queue.
        queue = multiprocessing.JoinableQueue(1)
        queue.put(True)

        workers = []
        for n in range(args.workers):
            rest_runner = Runner(queue, self.run_rest, 'rest',
                                 args.process_name, n)
            rest_process = multiprocessing.Process(target=rest_runner.run,
                                                   name='rest{}'.format(n),
                                                   args=(args.socket_path, n,
                                                         args))
            workers.append(rest_process)
            notify_runner = Runner(queue, self.run_notify, 'notify',
                                   args.process_name, n)
            notify_process = multiprocessing.Process(target=notify_runner.run,
                                                     name='notify{}'.format(n),
                                                     args=(args.socket_path, n,
                                                           args))
            workers.append(notify_process)

        for worker in workers:
            worker.daemon = True
            worker.start()

        if args.insecure:
            logging.warning(
                'insecure mode - TLS client connections are susceptible to man-in-the-middle attacks and safety checks are off - this is not suitable for production use'
            )

        if args.with_experimental:
            logging.warning('experimental endpoints are enabled')

        if args.with_metrics:
            if PROMETHEUS:
                if os.environ.get('prometheus_multiproc_dir'):
                    # Spawn the metrics process later, so we can pass along worker names and pids.
                    monitor_workers = [(worker.name, worker.pid)
                                       for worker in workers]
                    # Include master process.
                    monitor_workers.append(('master', os.getpid()))
                    metrics_runner = Runner(queue, self.run_metrics, 'metrics',
                                            args.process_name, 0)
                    metrics_process = multiprocessing.Process(
                        target=metrics_runner.run,
                        args=(args.socket_path, args, monitor_workers))
                    metrics_process.daemon = True
                    metrics_process.start()
                    workers.append(metrics_process)
                else:
                    logging.error('please export "prometheus_multiproc_dir"')
                    self.running = False
            else:
                logging.error(
                    'please install prometheus client python bindings')
                self.running = False

        signal.signal(signal.SIGCHLD, self.sigchld)
        signal.signal(signal.SIGTERM, self.sigterm)

        try:
            while self.running:
                signal.pause()
        except KeyboardInterrupt:
            self.running = False
            logging.info('keyboard interrupt')

        logging.info('starting shutdown')

        signal.signal(signal.SIGCHLD, signal.SIG_IGN)

        if not self.abnormal_shutdown:
            # Flush queue, to tell workers to cleanly exit.
            queue.get()
            try:
                queue.task_done()
            except ValueError:
                # NOTE(longsleep): If a process encountered an error,
                # task_done() was already called, so this raises; that is
                # expected and can be ignored.
                pass

        # Wait for workers to exit.
        deadline = time.monotonic() + 5
        done = []
        while deadline > time.monotonic():
            ready = multiprocessing.connection.wait(
                [worker.sentinel for worker in workers
                 if worker.sentinel not in done],
                timeout=1,
            )
            done.extend(ready)
            if len(done) == len(workers):
                break

        # Kill off workers which did not exit.
        kill = len(done) != len(workers)
        for worker in workers:
            if kill and worker.is_alive():
                if self.abnormal_shutdown:
                    logging.critical('killing worker: %d', worker.pid)
                    os.kill(worker.pid, signal.SIGKILL)
                else:
                    logging.warning('terminating worker: %d', worker.pid)
                    worker.terminate()
            if (args.with_metrics and PROMETHEUS
                    and os.environ.get('prometheus_multiproc_dir')):
                prometheus_multiprocess.mark_process_dead(worker.pid)
            worker.join()

        # Cleanup potentially left over sockets.
        sockets = []
        for n in range(args.workers):
            sockets.append('rest%d.sock' % n)
        for n in range(args.workers):
            sockets.append('notify%d.sock' % n)
        for socket in sockets:  # noqa: F402
            try:
                unix_socket = os.path.join(args.socket_path, socket)
                os.unlink(unix_socket)
            except OSError as err:
                if err.errno != errno.ENOENT:
                    logging.warning(
                        'failed to remove socket %s on shutdown, error: %s',
                        unix_socket, err)

        logging.info('shutdown complete')
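The SETPROCTITLE, UJSON, and PROMETHEUS flags used in serve() are presumably set by guarded imports at module level; a plausible sketch (the actual module may name things differently):

try:
    import setproctitle
    SETPROCTITLE = True
except ImportError:
    SETPROCTITLE = False

try:
    import ujson  # noqa: F401
    UJSON = True
except ImportError:
    UJSON = False

try:
    from prometheus_client import multiprocess as prometheus_multiprocess
    PROMETHEUS = True
except ImportError:
    PROMETHEUS = False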
Example #3
def child_exit(server, worker):
    multiprocess.mark_process_dead(worker.pid)
Example #4
def child_exit(server, worker):
    from prometheus_client import multiprocess
    multiprocess.mark_process_dead(worker.pid)
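Examples #3 and #4 are gunicorn child_exit server hooks, typically placed in gunicorn.conf.py; gunicorn invokes them in the master whenever a worker exits. The scrape side then merges every worker's files per request. A minimal WSGI endpoint following the pattern in the prometheus_client multiprocess documentation:

from prometheus_client import CollectorRegistry, generate_latest, multiprocess


def metrics_app(environ, start_response):
    # Build a fresh registry per scrape; the collector merges all the
    # per-worker .db files found in the multiproc directory.
    registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(registry)
    data = generate_latest(registry)
    start_response('200 OK', [
        ('Content-Type', 'text/plain; version=0.0.4; charset=utf-8'),
        ('Content-Length', str(len(data))),
    ])
    return [data]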
Example #5
    def test_mark_process_dead_respects_lowercase(self):
        os.environ['prometheus_multiproc_dir'] = self.tempdir
        # Just test that this does not raise with a lowercase env var. The
        # logic is tested elsewhere.
        mark_process_dead(123)
Example #6
async def after_server_stop(*args: Any, **kwargs: Any) -> None:
    multiprocess.mark_process_dead(os.getpid())
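This coroutine has the signature of a Sanic server-event listener; a minimal sketch of the wiring, assuming a Sanic app object named app:

import os
from typing import Any

from prometheus_client import multiprocess
from sanic import Sanic

app = Sanic('example')


@app.listener('after_server_stop')
async def after_server_stop(*args: Any, **kwargs: Any) -> None:
    # Drop this process's live-gauge files once it stops serving, so
    # stale gauge values are not exported by the multiprocess collector.
    multiprocess.mark_process_dead(os.getpid())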
Example #7
def main():
    global RUNNING

    options, args = opt_args()

    if SETPROCTITLE:
        setproctitle.setproctitle(options.process_name + ' master')

    socket_path = options.socket_path or SOCKET_PATH
    nworkers = options.workers if options.workers is not None else WORKERS

    create_pidfile(options.pid_file)

    for f in glob.glob(os.path.join(socket_path, 'rest*.sock')):
        os.unlink(f)
    for f in glob.glob(os.path.join(socket_path, 'notify*.sock')):
        os.unlink(f)

    q_listener, q = logger_init()
    logging.info('starting kopano-mfr')

    workers = []
    for n in range(nworkers):
        process = multiprocessing.Process(target=run_app, args=(socket_path, n, options))
        workers.append(process)

    notify_process = multiprocessing.Process(target=run_notify, args=(socket_path, options))
    workers.append(notify_process)

    if options.with_metrics:
        if PROMETHEUS:
            if not os.environ.get('prometheus_multiproc_dir'):
                logging.error('please export "prometheus_multiproc_dir"')
                sys.exit(-1)

            metrics_process = multiprocessing.Process(target=run_metrics, args=(socket_path, options))
            workers.append(metrics_process)
        else:
            logging.error('please install prometheus client python bindings')
            sys.exit(-1)

    for worker in workers:
        worker.daemon = True
        worker.start()

    signal.signal(signal.SIGCHLD, sigchld)
    signal.signal(signal.SIGTERM, sigterm)

    try:
        while RUNNING:
            signal.pause()
    except KeyboardInterrupt:
        RUNNING = False
        logging.info('keyboard interrupt')

    logging.info('starting shutdown')

    for worker in workers:
        worker.terminate()
        worker.join()

    q_listener.stop()

    sockets = []
    for n in range(nworkers):
        sockets.append('rest%d.sock' % n)
    sockets.append('notify.sock')
    for socket in sockets:
        try:
            unix_socket = os.path.join(socket_path, socket)
            os.unlink(unix_socket)
        except OSError:
            pass

    if options.with_metrics:
        for worker in workers:
            multiprocess.mark_process_dead(worker.pid)

    logging.info('shutdown complete')
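The sigchld and sigterm handlers referenced in main() are not shown; a plausible minimal shape simply clears the RUNNING flag so the signal.pause() loop falls through to shutdown (the real handlers may log or inspect the exiting child):

RUNNING = True


def sigchld(signum, frame):
    # A child exited; leave the main loop so cleanup can run.
    global RUNNING
    RUNNING = False


def sigterm(signum, frame):
    global RUNNING
    RUNNING = False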
Example #8
def cleanup_prometheus_files_at_exit():
    if PROMETHEUS_MULTIPROC_DIR:
        multiprocess.mark_process_dead(os.getpid())
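The name suggests this function is registered with the standard atexit module; a minimal sketch, assuming PROMETHEUS_MULTIPROC_DIR is read from the environment at import time:

import atexit
import os

from prometheus_client import multiprocess

PROMETHEUS_MULTIPROC_DIR = os.environ.get('PROMETHEUS_MULTIPROC_DIR')


def cleanup_prometheus_files_at_exit():
    if PROMETHEUS_MULTIPROC_DIR:
        # Remove this process's live-gauge files so stale values are
        # not exported after it exits.
        multiprocess.mark_process_dead(os.getpid())


atexit.register(cleanup_prometheus_files_at_exit)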