def _start_prod_batching_server(
    saved_bundle_path: str,
    api_server_port: int,
    config: BentoMLConfiguration,
    prometheus_lock: Optional[multiprocessing.Lock] = None,
):
    logger.info("Starting BentoML Batching server in production mode...")

    container = BentoMLContainer()
    container.config.from_dict(config.as_dict())

    from bentoml import marshal
    from bentoml.server.marshal_server import GunicornMarshalServer

    container.wire(packages=[sys.modules[__name__], marshal])

    # avoid loading the model before the gunicorn fork
    marshal_server = GunicornMarshalServer(
        bundle_path=saved_bundle_path,
        prometheus_lock=prometheus_lock,
        outbound_host="localhost",
        outbound_port=api_server_port,
    )
    marshal_server.run()
def _start_prod_proxy(
    saved_bundle_path: str,
    port: int,
    api_server_port: int,
    workers: int,
    timeout: int,
    outbound_workers: int,
    enable_microbatch: bool,
    mb_max_batch_size: int,
    mb_max_latency: int,
    prometheus_lock: Optional[multiprocessing.Lock] = None,
):
    logger.info("Starting BentoML proxy in production mode...")

    from bentoml.server.marshal_server import GunicornMarshalServer

    # avoid loading the model before the gunicorn fork
    marshal_server = GunicornMarshalServer(
        bundle_path=saved_bundle_path,
        prometheus_lock=prometheus_lock,
        port=port,
        workers=workers,
        timeout=timeout,
        outbound_host="localhost",
        outbound_port=api_server_port,
        outbound_workers=outbound_workers,
        enable_microbatch=enable_microbatch,
        mb_max_batch_size=mb_max_batch_size,
        mb_max_latency=mb_max_latency,
    )
    marshal_server.run()
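# `_start_prod_batching_server` and `_start_prod_proxy` above are written to
# run in their own processes, listening on the public `port` and forwarding
# batched requests to an API server on `api_server_port`. A hedged sketch of
# that wiring; the helper name, process layout, and parameter values here are
# illustrative assumptions, not code from this module:
def _launch_proxy_process(saved_bundle_path, port, api_server_port):
    import multiprocessing

    prometheus_lock = multiprocessing.Lock()
    proxy = multiprocessing.Process(
        target=_start_prod_proxy,
        kwargs=dict(
            saved_bundle_path=saved_bundle_path,
            port=port,
            api_server_port=api_server_port,
            workers=1,
            timeout=60,
            outbound_workers=1,
            enable_microbatch=True,
            mb_max_batch_size=2000,
            mb_max_latency=10000,
            prometheus_lock=prometheus_lock,
        ),
        daemon=True,
    )
    proxy.start()  # the parent process is then free to run the gunicorn app
    return proxy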
def serve_gunicorn(
    port,
    workers,
    timeout,
    bento=None,
    with_conda=False,
    enable_microbatch=False,
    microbatch_workers=1,
):
    track_cli('serve_gunicorn')
    bento_service_bundle_path = resolve_bundle_path(
        bento, pip_installed_bundle_path
    )

    if with_conda:
        run_with_conda_env(
            pip_installed_bundle_path,
            'bentoml serve_gunicorn {bento} -p {port} -w {workers} '
            '--timeout {timeout} {flags}'.format(
                bento=bento_service_bundle_path,
                port=port,
                workers=workers,
                timeout=timeout,
                flags="--enable-microbatch" if enable_microbatch else "",
            ),
        )
        return

    if workers is None:
        workers = get_gunicorn_num_of_workers()

    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid loading the model before the gunicorn fork; the reserved port
        # is released when the `with` block exits, just before the servers bind
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=bento_service_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
            )
            gunicorn_app = GunicornBentoServer(
                bento_service_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            bento_service_bundle_path, port, workers, timeout
        )
        gunicorn_app.run()
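# For reference, the `reserve_free_port` context manager used above relies on
# the standard bind-to-port-0 trick: the OS picks a free port, which stays
# bound until the `with` block exits, so the servers constructed inside the
# block can bind it immediately afterwards. A minimal sketch of the idea
# (not necessarily the exact bentoml.utils implementation):
import contextlib
import socket


@contextlib.contextmanager
def _reserve_free_port_sketch(host="localhost"):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((host, 0))  # port 0 asks the OS for any free port
    port = sock.getsockname()[1]
    try:
        yield port
    finally:
        sock.close()  # release the port right before the caller binds it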
def start_prod_server(
    saved_bundle_path: str,
    port: int,
    timeout: int,
    workers: int,
    enable_microbatch: bool,
    microbatch_workers: int,
    enable_swagger: bool,
):
    logger.info("Starting BentoML API server in production mode...")

    import psutil
    import multiprocessing

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer
    from bentoml.server.utils import get_gunicorn_num_of_workers
    from bentoml.utils import reserve_free_port

    if workers is None:
        workers = get_gunicorn_num_of_workers()

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid loading the model before the gunicorn fork
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=saved_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
            )
            gunicorn_app = GunicornBentoServer(
                saved_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
                enable_swagger,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            saved_bundle_path, port, workers, timeout, enable_swagger=enable_swagger
        )
        gunicorn_app.run()
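# When `workers` is None, `get_gunicorn_num_of_workers` picks a default from
# the machine's CPU count. Its exact formula lives in bentoml.server.utils; a
# common gunicorn heuristic, shown here only as an illustrative stand-in, is
# (2 * cores) + 1:
def _default_workers_sketch():
    import multiprocessing

    return multiprocessing.cpu_count() * 2 + 1  # hypothetical stand-in formula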
def serve_gunicorn(
    port,
    workers,
    timeout,
    bento=None,
    with_conda=False,
    enable_microbatch=False,
    microbatch_workers=1,
):
    if not psutil.POSIX:
        _echo(
            "The `bentoml serve_gunicorn` command is only supported on POSIX. "
            "On Windows, use `bentoml serve` for local API testing and "
            "docker for running production API endpoint: "
            "https://docs.docker.com/docker-for-windows/ "
        )
        return

    bento_service_bundle_path = resolve_bundle_path(
        bento, pip_installed_bundle_path
    )

    if with_conda:
        return run_with_conda_env(
            pip_installed_bundle_path,
            'bentoml serve_gunicorn {bento} -p {port} -w {workers} '
            '--timeout {timeout} {flags}'.format(
                bento=bento_service_bundle_path,
                port=port,
                workers=workers,
                timeout=timeout,
                flags="--enable-microbatch" if enable_microbatch else "",
            ),
        )

    if workers is None:
        workers = get_gunicorn_num_of_workers()

    # Gunicorn only supports POSIX platforms
    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid loading the model before the gunicorn fork
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=bento_service_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
            )
            gunicorn_app = GunicornBentoServer(
                bento_service_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            bento_service_bundle_path, port, workers, timeout
        )
        gunicorn_app.run()
def start_prod_server(
    saved_bundle_path: str,
    port: int = Provide[BentoMLContainer.config.api_server.port],
    timeout: int = Provide[BentoMLContainer.config.api_server.timeout],
    workers: int = Provide[BentoMLContainer.api_server_workers],
    enable_microbatch: bool = Provide[
        BentoMLContainer.config.api_server.enable_microbatch
    ],
    mb_max_batch_size: int = Provide[
        BentoMLContainer.config.marshal_server.max_batch_size
    ],
    mb_max_latency: int = Provide[
        BentoMLContainer.config.marshal_server.max_latency
    ],
    microbatch_workers: int = Provide[
        BentoMLContainer.config.marshal_server.workers
    ],
    enable_swagger: bool = Provide[
        BentoMLContainer.config.api_server.enable_swagger
    ],
):
    logger.info("Starting BentoML API server in production mode...")

    import multiprocessing

    import psutil

    assert (
        psutil.POSIX
    ), "BentoML API Server production mode only supports POSIX platforms"

    from bentoml.server.gunicorn_server import GunicornBentoServer
    from bentoml.server.marshal_server import GunicornMarshalServer
    from bentoml.utils import reserve_free_port

    if enable_microbatch:
        prometheus_lock = multiprocessing.Lock()
        # avoid loading the model before the gunicorn fork
        with reserve_free_port() as api_server_port:
            marshal_server = GunicornMarshalServer(
                bundle_path=saved_bundle_path,
                port=port,
                workers=microbatch_workers,
                prometheus_lock=prometheus_lock,
                outbound_host="localhost",
                outbound_port=api_server_port,
                outbound_workers=workers,
                mb_max_batch_size=mb_max_batch_size,
                mb_max_latency=mb_max_latency,
            )
            gunicorn_app = GunicornBentoServer(
                saved_bundle_path,
                api_server_port,
                workers,
                timeout,
                prometheus_lock,
                enable_swagger,
            )
        marshal_server.async_run()
        gunicorn_app.run()
    else:
        gunicorn_app = GunicornBentoServer(
            saved_bundle_path, port, workers, timeout, enable_swagger=enable_swagger
        )
        gunicorn_app.run()
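# Example invocation of the `Provide`-based variant above. With dependency
# injection wired, every argument except the bundle path can fall back to the
# values configured in BentoMLContainer, so a caller typically passes only
# overrides (the path and values below are illustrative):
#
#     start_prod_server(
#         saved_bundle_path="/tmp/my_bento_bundle",
#         port=5000,
#         workers=4,
#         enable_microbatch=True,
#     )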