def main():
    # wait until neuron-rtd sidecar is ready
    uses_inferentia = os.getenv("CORTEX_ACTIVE_NEURON")
    if uses_inferentia:
        wait_neuron_rtd()

    # strictly for Inferentia
    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        base_serving_port = int(os.environ["CORTEX_TF_BASE_SERVING_PORT"])
        num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"])
        used_ports = {}
        for w in range(num_processes):
            used_ports[str(base_serving_port + w)] = False
        with open("/run/used_ports.json", "w+") as f:
            json.dump(used_ports, f)

    # get API spec
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    if provider == "local":
        storage = LocalStorage(os.getenv("CORTEX_CACHE_DIR"))
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])
    raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)

    # load tensorflow models into TFS
    if raw_api_spec["predictor"]["type"] == "tensorflow":
        load_tensorflow_serving_models()
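The script above only marks every TF Serving port as free in /run/used_ports.json; the per-worker claiming logic lives elsewhere. Below is a minimal sketch of how a worker could consume that file, assuming an advisory flock is acceptable for coordination. This is an illustration of the idea, not Cortex's actual implementation, and claim_port is a hypothetical helper.

import fcntl
import json


def claim_port(path="/run/used_ports.json"):
    # take an exclusive advisory lock so concurrent workers don't grab the same
    # port; the lock is released automatically when the file is closed
    with open(path, "r+") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        used_ports = json.load(f)
        for port, in_use in used_ports.items():
            if not in_use:
                used_ports[port] = True  # mark the port as taken
                f.seek(0)
                json.dump(used_ports, f)
                f.truncate()
                return int(port)
    raise RuntimeError("no free TF Serving port left")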
def main():
    with open("/src/cortex/serve/log_config.yaml", "r") as f:
        log_config = yaml.load(f, yaml.FullLoader)

    # wait until neuron-rtd sidecar is ready
    uses_inferentia = os.getenv("CORTEX_ACTIVE_NEURON")
    if uses_inferentia:
        wait_neuron_rtd()

    # strictly for Inferentia
    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        base_serving_port = int(os.environ["CORTEX_TF_BASE_SERVING_PORT"])
        num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"])
        used_ports = {}
        for w in range(num_processes):
            used_ports[str(base_serving_port + w)] = False
        with open("/run/used_ports.json", "w+") as f:
            json.dump(used_ports, f)

    # get API spec
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    if provider == "local":
        storage = LocalStorage(os.getenv("CORTEX_CACHE_DIR"))
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])
    raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)

    # load tensorflow models into TFS
    if raw_api_spec["predictor"]["type"] == "tensorflow":
        load_tensorflow_serving_models()

    if raw_api_spec["kind"] == "RealtimeAPI":
        # https://github.com/encode/uvicorn/blob/master/uvicorn/config.py
        uvicorn.run(
            "cortex.serve.wsgi:app",
            host="0.0.0.0",
            port=int(os.environ["CORTEX_SERVING_PORT"]),
            workers=int(os.environ["CORTEX_PROCESSES_PER_REPLICA"]),
            limit_concurrency=int(os.environ["CORTEX_MAX_PROCESS_CONCURRENCY"]),  # this is a per-process limit
            backlog=int(os.environ["CORTEX_SO_MAX_CONN"]),
            log_config=log_config,
            log_level="info",
        )
    else:
        from cortex.serve import batch

        batch.start()
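Most of the behavior in the RealtimeAPI branch comes directly from uvicorn's own configuration. The following self-contained sketch uses the same knobs with a throwaway ASGI app so each parameter can be tried in isolation; the module name example, the port, and the limit values are placeholders, not Cortex defaults (save the file as example.py for the import string to resolve).

import uvicorn


async def app(scope, receive, send):
    # trivial ASGI app that answers every HTTP request with 200 OK
    assert scope["type"] == "http"
    await send({"type": "http.response.start", "status": 200, "headers": []})
    await send({"type": "http.response.body", "body": b"ok"})


if __name__ == "__main__":
    uvicorn.run(
        "example:app",         # uvicorn needs an import string when workers > 1
        host="0.0.0.0",
        port=8080,             # placeholder for CORTEX_SERVING_PORT
        workers=2,             # placeholder for CORTEX_PROCESSES_PER_REPLICA
        limit_concurrency=64,  # per-process concurrency cap, like CORTEX_MAX_PROCESS_CONCURRENCY
        backlog=1024,          # socket listen backlog, like CORTEX_SO_MAX_CONN
        log_level="info",
    )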
def main():
    # wait until neuron-rtd sidecar is ready
    uses_inferentia = os.getenv("CORTEX_ACTIVE_NEURON")
    if uses_inferentia:
        wait_neuron_rtd()

    # strictly for Inferentia
    has_multiple_tf_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"])
    if has_multiple_tf_servers:
        base_serving_port = int(os.environ["CORTEX_TF_BASE_SERVING_PORT"])
        used_ports = {}
        for w in range(num_processes):
            used_ports[str(base_serving_port + w)] = False
        with open("/run/used_ports.json", "w+") as f:
            json.dump(used_ports, f)

    # get API spec
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    cache_dir = os.getenv("CORTEX_CACHE_DIR")  # when it's deployed locally
    region = os.getenv("AWS_REGION")  # when it's deployed to AWS
    _, api_spec = get_spec(provider, spec_path, cache_dir, region)

    predictor_type = predictor_type_from_api_spec(api_spec)
    multiple_processes = api_spec["predictor"]["processes_per_replica"] > 1
    caching_enabled = is_model_caching_enabled(api_spec)
    model_dir = os.getenv("CORTEX_MODEL_DIR")

    # start live-reloading when model caching is not enabled
    cron = None
    if not caching_enabled:
        # create cron dirs if they don't exist
        os.makedirs("/run/cron", exist_ok=True)
        os.makedirs("/tmp/cron", exist_ok=True)

        # prepare crons
        if predictor_type in [PythonPredictorType, ONNXPredictorType] and are_models_specified(api_spec):
            cron = FileBasedModelsTreeUpdater(
                interval=10,
                api_spec=api_spec,
                download_dir=model_dir,
            )
            cron.start()
        elif predictor_type == TensorFlowPredictorType:
            tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
            tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")
            cron = TFSModelLoader(
                interval=10,
                api_spec=api_spec,
                address=f"{tf_serving_host}:{tf_serving_port}",
                tfs_model_dir=model_dir,
                download_dir=model_dir,
            )
            cron.start()
        elif predictor_type == TensorFlowNeuronPredictorType:
            cron = prepare_tfs_servers_api(api_spec, model_dir)
            cron.start()

    # wait until the cron finishes its first pass
    if cron:
        while cron.is_alive() and not cron.ran_once():
            time.sleep(0.25)

        # disable live reloading when the BatchAPI kind is used
        # disable live reloading for the TF predictor when Inferentia is used and when multiple processes are used (num procs > 1)
        if api_spec["kind"] != "RealtimeAPI" or (
            predictor_type == TensorFlowNeuronPredictorType
            and has_multiple_tf_servers
            and num_processes > 1
        ):
            cron.stop()

    # to synchronize with the other serving processes
    open("/mnt/workspace/init_script_run.txt", "a").close()

    # don't exit the script if the cron is running
    while cron and cron.is_alive():
        time.sleep(0.25)

    # exit if cron has exited with errors
    if cron and isinstance(cron.exitcode, int) and cron.exitcode != 0:
        # if it was killed by a signal
        if cron.exitcode < 0:
            sys.exit(-cron.exitcode)
        sys.exit(cron.exitcode)
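Note that this version of the init script only relies on a narrow interface from the reloader crons: start(), is_alive(), ran_once(), stop(), and exitcode. The sketch below shows one way such an interface could be built on multiprocessing; it is a rough approximation for illustration, not the real FileBasedModelsTreeUpdater or TFSModelLoader, and scan_models is a stand-in callback.

import multiprocessing as mp
import time


class ModelReloaderSketch(mp.Process):
    """Approximation of the cron interface the init script relies on."""

    def __init__(self, interval, reload_fn):
        super().__init__()
        self._interval = interval
        self._reload_fn = reload_fn   # callback that re-scans the model tree
        self._ran_once = mp.Event()   # set after the first completed pass
        self._stopped = mp.Event()    # set by stop() to request shutdown

    def run(self):
        # reload periodically until stop() is called
        while not self._stopped.is_set():
            self._reload_fn()
            self._ran_once.set()
            self._stopped.wait(self._interval)

    def ran_once(self):
        return self._ran_once.is_set()

    def stop(self):
        self._stopped.set()


def scan_models():
    print("re-scanning model directory")


if __name__ == "__main__":
    cron = ModelReloaderSketch(interval=10, reload_fn=scan_models)
    cron.start()
    # same wait pattern as the init script: block until the first pass is done
    while cron.is_alive() and not cron.ran_once():
        time.sleep(0.25)
    cron.stop()
    cron.join()
    print("cron exitcode:", cron.exitcode)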