def get_spec(
    provider: str,
    spec_path: str,
    cache_dir: str,
    region: Optional[str] = None,
) -> Tuple[Union[LocalStorage, S3, GCS], dict]:
    """
    Args:
        provider: "local", "aws" or "gcp".
        spec_path: Path to API spec (e.g. "s3://cortex-dev-0/apis/iris-classifier/api/69b93378fa5c0218-jy1fjtyihu-9fcc10739e7fc8050cefa8ca27ece1ee/master-spec.json").
        cache_dir: Local directory where the API spec gets saved to.
        region: Region of the bucket. Only required for the "aws" provider.
    """

    if provider == "local":
        storage = LocalStorage(cache_dir)
    elif provider == "aws":
        bucket, key = S3.deconstruct_s3_path(spec_path)
        storage = S3(bucket=bucket, region=region)
    elif provider == "gcp":
        bucket, key = GCS.deconstruct_gcs_path(spec_path)
        storage = GCS(bucket=bucket)
    else:
        raise ValueError('invalid "provider" argument')

    if provider == "local":
        return storage, read_json(spec_path)

    local_spec_path = os.path.join(cache_dir, "api_spec.json")
    if not os.path.isfile(local_spec_path):
        storage.download_file(key, local_spec_path)
    return storage, read_json(local_spec_path)

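# Hypothetical usage sketch of get_spec above. The bucket/key in spec_path are
# placeholders, and the env vars mirror the ones read by the entrypoints below;
# none of these values are prescribed by the function itself.
# storage, api_spec = get_spec(
#     provider="aws",
#     spec_path="s3://<bucket>/<key>/master-spec.json",
#     cache_dir=os.environ["CORTEX_CACHE_DIR"],
#     region=os.environ["AWS_REGION"],
# )
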
def main():
    with open("/src/cortex/serve/log_config.yaml", "r") as f:
        log_config = yaml.load(f, yaml.FullLoader)

    # get API spec
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    if provider == "local":
        storage = LocalStorage(os.getenv("CORTEX_CACHE_DIR"))
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])
    raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)

    # load tensorflow models into TFS
    if raw_api_spec["predictor"]["type"] == "tensorflow":
        load_tensorflow_serving_models()

    # https://github.com/encode/uvicorn/blob/master/uvicorn/config.py
    uvicorn.run(
        "cortex.serve.wsgi:app",
        host="0.0.0.0",
        port=int(os.environ["CORTEX_SERVING_PORT"]),
        workers=int(os.environ["CORTEX_WORKERS_PER_REPLICA"]),
        limit_concurrency=int(os.environ["CORTEX_MAX_WORKER_CONCURRENCY"]),
        backlog=int(os.environ["CORTEX_SO_MAX_CONN"]),
        log_config=log_config,
        log_level="info",
    )

def main():
    # wait until neuron-rtd sidecar is ready
    uses_inferentia = os.getenv("CORTEX_ACTIVE_NEURON")
    if uses_inferentia:
        wait_neuron_rtd()

    # strictly for Inferentia
    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        base_serving_port = int(os.environ["CORTEX_TF_BASE_SERVING_PORT"])
        num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"])
        used_ports = {}
        for w in range(num_processes):
            used_ports[str(base_serving_port + w)] = False
        with open("/run/used_ports.json", "w+") as f:
            json.dump(used_ports, f)

    # get API spec
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    if provider == "local":
        storage = LocalStorage(os.getenv("CORTEX_CACHE_DIR"))
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])
    raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)

    # load tensorflow models into TFS
    if raw_api_spec["predictor"]["type"] == "tensorflow":
        load_tensorflow_serving_models()

def start():
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    project_dir = os.environ["CORTEX_PROJECT_DIR"]
    model_dir = os.getenv("CORTEX_MODEL_DIR", None)
    tf_serving_port = os.getenv("CORTEX_TF_SERVING_PORT", "9000")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

    if provider == "local":
        storage = LocalStorage(os.getenv("CORTEX_CACHE_DIR"))
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])

    try:
        raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)
        api = API(provider=provider, storage=storage, cache_dir=cache_dir, **raw_api_spec)
        client = api.predictor.initialize_client(
            model_dir, tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
        )
        cx_logger().info("loading the predictor from {}".format(api.predictor.path))
        predictor_impl = api.predictor.initialize_impl(project_dir, client)

        local_cache["api"] = api
        local_cache["provider"] = provider
        local_cache["client"] = client
        local_cache["predictor_impl"] = predictor_impl
        local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args

        predict_route = "/"
        if provider != "local":
            predict_route = "/predict"
        local_cache["predict_route"] = predict_route
    except Exception:
        cx_logger().exception("failed to start api")
        sys.exit(1)

    if (
        provider != "local"
        and api.monitoring is not None
        and api.monitoring.model_type == "classification"
    ):
        try:
            local_cache["class_set"] = api.get_cached_classes()
        except Exception:
            cx_logger().warn("an error occurred while attempting to load classes", exc_info=True)

    app.add_api_route(local_cache["predict_route"], predict, methods=["POST"])
    app.add_api_route(local_cache["predict_route"], get_summary, methods=["GET"])

    return app

def main():
    with open("/src/cortex/serve/log_config.yaml", "r") as f:
        log_config = yaml.load(f, yaml.FullLoader)

    # wait until neuron-rtd sidecar is ready
    uses_inferentia = os.getenv("CORTEX_ACTIVE_NEURON")
    if uses_inferentia:
        wait_neuron_rtd()

    # strictly for Inferentia
    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        base_serving_port = int(os.environ["CORTEX_TF_BASE_SERVING_PORT"])
        num_processes = int(os.environ["CORTEX_PROCESSES_PER_REPLICA"])
        used_ports = {}
        for w in range(num_processes):
            used_ports[str(base_serving_port + w)] = False
        with open("/run/used_ports.json", "w+") as f:
            json.dump(used_ports, f)

    # get API spec
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    if provider == "local":
        storage = LocalStorage(os.getenv("CORTEX_CACHE_DIR"))
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])
    raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)

    # load tensorflow models into TFS
    if raw_api_spec["predictor"]["type"] == "tensorflow":
        load_tensorflow_serving_models()

    if raw_api_spec["kind"] == "RealtimeAPI":
        # https://github.com/encode/uvicorn/blob/master/uvicorn/config.py
        uvicorn.run(
            "cortex.serve.wsgi:app",
            host="0.0.0.0",
            port=int(os.environ["CORTEX_SERVING_PORT"]),
            workers=int(os.environ["CORTEX_PROCESSES_PER_REPLICA"]),
            limit_concurrency=int(
                os.environ["CORTEX_MAX_PROCESS_CONCURRENCY"]
            ),  # this is a per-process limit
            backlog=int(os.environ["CORTEX_SO_MAX_CONN"]),
            log_config=log_config,
            log_level="info",
        )
    else:
        from cortex.serve import batch

        batch.start()

def get_spec(
    provider: str,
    spec_path: str,
    cache_dir: Optional[str],
    bucket: Optional[str],
    region: Optional[str],
) -> Tuple[Union[LocalStorage, S3], dict]:
    if provider == "local":
        storage = LocalStorage(cache_dir)
    else:
        storage = S3(bucket=bucket, region=region)

    if provider == "local":
        return storage, read_json(spec_path)

    local_spec_path = os.path.join(cache_dir, "api_spec.json")
    if not os.path.isfile(local_spec_path):
        _, key = S3.deconstruct_s3_path(spec_path)
        storage.download_file(key, local_spec_path)
    return storage, read_json(local_spec_path)

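# Note on this get_spec variant: unlike the version above, it takes the bucket
# explicitly, only handles "local" and S3-backed providers, and re-derives the
# S3 key from spec_path whenever the spec isn't already cached on disk.
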
def start_fn():
    cache_dir = os.environ["CORTEX_CACHE_DIR"]
    provider = os.environ["CORTEX_PROVIDER"]
    spec_path = os.environ["CORTEX_API_SPEC"]
    project_dir = os.environ["CORTEX_PROJECT_DIR"]
    model_dir = os.getenv("CORTEX_MODEL_DIR")
    tf_serving_port = os.getenv("CORTEX_TF_BASE_SERVING_PORT", "9000")
    tf_serving_host = os.getenv("CORTEX_TF_SERVING_HOST", "localhost")

    if provider == "local":
        storage = LocalStorage(os.getenv("CORTEX_CACHE_DIR"))
    else:
        storage = S3(bucket=os.environ["CORTEX_BUCKET"], region=os.environ["AWS_REGION"])

    has_multiple_servers = os.getenv("CORTEX_MULTIPLE_TF_SERVERS")
    if has_multiple_servers:
        # claim the first free TF Serving port under a file lock so that
        # concurrent processes don't pick the same port
        with FileLock("/run/used_ports.json.lock"):
            with open("/run/used_ports.json", "r+") as f:
                used_ports = json.load(f)
                for port in used_ports.keys():
                    if not used_ports[port]:
                        tf_serving_port = port
                        used_ports[port] = True
                        break
                f.seek(0)
                json.dump(used_ports, f)
                f.truncate()

    try:
        raw_api_spec = get_spec(provider, storage, cache_dir, spec_path)
        api = API(
            provider=provider,
            storage=storage,
            model_dir=model_dir,
            cache_dir=cache_dir,
            **raw_api_spec,
        )
        client = api.predictor.initialize_client(
            tf_serving_host=tf_serving_host, tf_serving_port=tf_serving_port
        )
        cx_logger().info("loading the predictor from {}".format(api.predictor.path))
        predictor_impl = api.predictor.initialize_impl(project_dir, client)

        local_cache["api"] = api
        local_cache["provider"] = provider
        local_cache["client"] = client
        local_cache["predictor_impl"] = predictor_impl
        local_cache["predict_fn_args"] = inspect.getfullargspec(predictor_impl.predict).args

        predict_route = "/"
        if provider != "local":
            predict_route = "/predict"
        local_cache["predict_route"] = predict_route
    except Exception:
        cx_logger().exception("failed to start api")
        sys.exit(1)

    if (
        provider != "local"
        and api.monitoring is not None
        and api.monitoring.model_type == "classification"
    ):
        try:
            local_cache["class_set"] = api.get_cached_classes()
        except Exception:
            cx_logger().warn("an error occurred while attempting to load classes", exc_info=True)

    app.add_api_route(local_cache["predict_route"], predict, methods=["POST"])
    app.add_api_route(local_cache["predict_route"], get_summary, methods=["GET"])

    return app

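# For illustration (values assumed, not prescribed): with
# CORTEX_TF_BASE_SERVING_PORT=9000 and CORTEX_PROCESSES_PER_REPLICA=2, main()
# above seeds /run/used_ports.json as {"9000": false, "9001": false}; each
# start_fn() process then claims the first free port under the file lock and
# flips its flag to true, so two processes end up on 9000 and 9001.
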
def __init__(self, **kwargs):
    if "cache_dir" in kwargs:
        self.cache_dir = kwargs["cache_dir"]
    elif "local_path" in kwargs:
        local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
        self.cache_dir = os.path.join(local_path_dir, "cache")
    else:
        raise ValueError("cache_dir must be specified (or inferred from local_path)")
    util.mkdir_p(self.cache_dir)

    if "local_path" in kwargs:
        self.ctx = util.read_msgpack(kwargs["local_path"])
    elif "obj" in kwargs:
        self.ctx = kwargs["obj"]
    elif "raw_obj" in kwargs:
        self.ctx = kwargs["raw_obj"]
    elif "s3_path" in kwargs:
        local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
        bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
        S3(bucket, client_config={}).download_file(key, local_ctx_path)
        self.ctx = util.read_msgpack(local_ctx_path)
    else:
        raise ValueError("invalid context args: " + str(kwargs))

    self.workload_id = kwargs.get("workload_id")

    self.id = self.ctx["id"]
    self.key = self.ctx["key"]
    self.metadata_root = self.ctx["metadata_root"]
    self.cortex_config = self.ctx["cortex_config"]
    self.deployment_version = self.ctx["deployment_version"]
    self.root = self.ctx["root"]
    self.status_prefix = self.ctx["status_prefix"]
    self.app = self.ctx["app"]
    self.apis = self.ctx["apis"] or {}
    self.api_version = self.cortex_config["api_version"]
    self.monitoring = None
    self.project_id = self.ctx["project_id"]
    self.project_key = self.ctx["project_key"]

    if "local_storage_path" in kwargs:
        self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
    else:
        self.storage = S3(
            bucket=self.cortex_config["bucket"],
            region=self.cortex_config["region"],
            client_config={},
        )

    host_ip = os.environ["HOST_IP"]
    datadog.initialize(statsd_host=host_ip, statsd_port="8125")
    self.statsd = datadog.statsd

    if self.api_version != consts.CORTEX_VERSION:
        raise ValueError(
            "API version mismatch (Context: {}, Image: {})".format(
                self.api_version, consts.CORTEX_VERSION
            )
        )

    # This affects Tensorflow S3 access
    os.environ["AWS_REGION"] = self.cortex_config.get("region", "")

    # ID maps
    self.apis_id_map = ResourceMap(self.apis) if self.apis else None
    self.id_map = self.apis_id_map

class Context:
    def __init__(self, **kwargs):
        if "cache_dir" in kwargs:
            self.cache_dir = kwargs["cache_dir"]
        elif "local_path" in kwargs:
            local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
            self.cache_dir = os.path.join(local_path_dir, "cache")
        else:
            raise ValueError("cache_dir must be specified (or inferred from local_path)")
        util.mkdir_p(self.cache_dir)

        if "local_path" in kwargs:
            self.ctx = util.read_msgpack(kwargs["local_path"])
        elif "obj" in kwargs:
            self.ctx = kwargs["obj"]
        elif "raw_obj" in kwargs:
            self.ctx = kwargs["raw_obj"]
        elif "s3_path" in kwargs:
            local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
            bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
            S3(bucket, client_config={}).download_file(key, local_ctx_path)
            self.ctx = util.read_msgpack(local_ctx_path)
        else:
            raise ValueError("invalid context args: " + str(kwargs))

        self.workload_id = kwargs.get("workload_id")

        self.id = self.ctx["id"]
        self.key = self.ctx["key"]
        self.metadata_root = self.ctx["metadata_root"]
        self.cortex_config = self.ctx["cortex_config"]
        self.deployment_version = self.ctx["deployment_version"]
        self.root = self.ctx["root"]
        self.status_prefix = self.ctx["status_prefix"]
        self.app = self.ctx["app"]
        self.apis = self.ctx["apis"] or {}
        self.api_version = self.cortex_config["api_version"]
        self.monitoring = None
        self.project_id = self.ctx["project_id"]
        self.project_key = self.ctx["project_key"]

        if "local_storage_path" in kwargs:
            self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
        else:
            self.storage = S3(
                bucket=self.cortex_config["bucket"],
                region=self.cortex_config["region"],
                client_config={},
            )

        host_ip = os.environ["HOST_IP"]
        datadog.initialize(statsd_host=host_ip, statsd_port="8125")
        self.statsd = datadog.statsd

        if self.api_version != consts.CORTEX_VERSION:
            raise ValueError(
                "API version mismatch (Context: {}, Image: {})".format(
                    self.api_version, consts.CORTEX_VERSION
                )
            )

        # This affects Tensorflow S3 access
        os.environ["AWS_REGION"] = self.cortex_config.get("region", "")

        # ID maps
        self.apis_id_map = ResourceMap(self.apis) if self.apis else None
        self.id_map = self.apis_id_map

    def download_file(self, impl_key, cache_impl_path):
        if not os.path.isfile(cache_impl_path):
            self.storage.download_file(impl_key, cache_impl_path)
        return cache_impl_path

    def download_python_file(self, impl_key, module_name):
        cache_impl_path = os.path.join(self.cache_dir, "{}.py".format(module_name))
        self.download_file(impl_key, cache_impl_path)
        return cache_impl_path

    def load_module(self, module_prefix, module_name, impl_path):
        full_module_name = "{}_{}".format(module_prefix, module_name)
        try:
            impl = imp.load_source(full_module_name, impl_path)
        except Exception as e:
            raise UserException("unable to load python file", str(e)) from e
        return impl

    def get_request_handler_impl(self, api_name, project_dir):
        api = self.apis[api_name]

        try:
            impl = self.load_module(
                "request_handler", api["name"], os.path.join(project_dir, api["request_handler"])
            )
        except CortexException as e:
            e.wrap("api " + api_name, "request_handler " + api["request_handler"])
            raise

        try:
            _validate_impl(impl, REQUEST_HANDLER_IMPL_VALIDATION)
        except CortexException as e:
            e.wrap("api " + api_name, "request_handler " + api["request_handler"])
            raise
        return impl

    def get_resource_status(self, resource):
        key = self.resource_status_key(resource)
        return self.storage.get_json(key, num_retries=5)

    def upload_resource_status_start(self, *resources):
        timestamp = util.now_timestamp_rfc_3339()
        for resource in resources:
            key = self.resource_status_key(resource)
            status = {
                "resource_id": resource["id"],
                "resource_type": resource["resource_type"],
                "workload_id": resource["workload_id"],
                "app_name": self.app["name"],
                "start": timestamp,
            }
            self.storage.put_json(status, key)

    def upload_resource_status_no_op(self, *resources):
        timestamp = util.now_timestamp_rfc_3339()
        for resource in resources:
            key = self.resource_status_key(resource)
            status = {
                "resource_id": resource["id"],
                "resource_type": resource["resource_type"],
                "workload_id": resource["workload_id"],
                "app_name": self.app["name"],
                "start": timestamp,
                "end": timestamp,
                "exit_code": "succeeded",
            }
            self.storage.put_json(status, key)

    def upload_resource_status_success(self, *resources):
        self.upload_resource_status_end("succeeded", *resources)

    def upload_resource_status_failed(self, *resources):
        self.upload_resource_status_end("failed", *resources)

    def upload_resource_status_end(self, exit_code, *resources):
        timestamp = util.now_timestamp_rfc_3339()
        for resource in resources:
            status = self.get_resource_status(resource)
            if status.get("end") is not None:
                continue
            status["end"] = timestamp
            status["exit_code"] = exit_code
            key = self.resource_status_key(resource)
            self.storage.put_json(status, key)

    def resource_status_key(self, resource):
        return os.path.join(self.status_prefix, resource["id"], resource["workload_id"])

    def publish_metrics(self, metrics):
        if self.statsd is None:
            raise CortexException("statsd client not initialized")  # unexpected

        for metric in metrics:
            tags = ["{}:{}".format(dim["Name"], dim["Value"]) for dim in metric["Dimensions"]]
            if metric.get("Unit") == "Count":
                self.statsd.increment(metric["MetricName"], value=metric["Value"], tags=tags)
            else:
                self.statsd.histogram(metric["MetricName"], value=metric["Value"], tags=tags)

def __init__(self, **kwargs):
    if "cache_dir" in kwargs:
        self.cache_dir = kwargs["cache_dir"]
    elif "local_path" in kwargs:
        local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
        self.cache_dir = os.path.join(local_path_dir, "cache")
    else:
        raise ValueError("cache_dir must be specified (or inferred from local_path)")
    util.mkdir_p(self.cache_dir)

    if "local_path" in kwargs:
        ctx_raw = util.read_msgpack(kwargs["local_path"])
        self.ctx = _deserialize_raw_ctx(ctx_raw)
    elif "obj" in kwargs:
        self.ctx = kwargs["obj"]
    elif "raw_obj" in kwargs:
        ctx_raw = kwargs["raw_obj"]
        self.ctx = _deserialize_raw_ctx(ctx_raw)
    elif "s3_path" in kwargs:
        local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
        bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
        S3(bucket, client_config={}).download_file(key, local_ctx_path)
        ctx_raw = util.read_msgpack(local_ctx_path)
        self.ctx = _deserialize_raw_ctx(ctx_raw)
    else:
        raise ValueError("invalid context args: " + str(kwargs))

    self.workload_id = kwargs.get("workload_id")

    self.id = self.ctx["id"]
    self.key = self.ctx["key"]
    self.cortex_config = self.ctx["cortex_config"]
    self.dataset_version = self.ctx["dataset_version"]
    self.root = self.ctx["root"]
    self.raw_dataset = self.ctx["raw_dataset"]
    self.status_prefix = self.ctx["status_prefix"]
    self.app = self.ctx["app"]
    self.environment = self.ctx["environment"]
    self.python_packages = self.ctx["python_packages"] or {}
    self.raw_columns = self.ctx["raw_columns"] or {}
    self.transformed_columns = self.ctx["transformed_columns"] or {}
    self.transformers = self.ctx["transformers"] or {}
    self.aggregators = self.ctx["aggregators"] or {}
    self.aggregates = self.ctx["aggregates"] or {}
    self.constants = self.ctx["constants"] or {}
    self.models = self.ctx["models"] or {}
    self.estimators = self.ctx["estimators"] or {}
    self.apis = self.ctx["apis"] or {}
    self.training_datasets = {k: v["dataset"] for k, v in self.models.items()}
    self.api_version = self.cortex_config["api_version"]

    if "local_storage_path" in kwargs:
        self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
    else:
        self.storage = S3(
            bucket=self.cortex_config["bucket"],
            region=self.cortex_config["region"],
            client_config={},
        )

    if self.api_version != consts.CORTEX_VERSION:
        raise ValueError(
            "API version mismatch (Context: {}, Image: {})".format(
                self.api_version, consts.CORTEX_VERSION
            )
        )

    self.columns = util.merge_dicts_overwrite(self.raw_columns, self.transformed_columns)

    self.raw_column_names = list(self.raw_columns.keys())
    self.transformed_column_names = list(self.transformed_columns.keys())
    self.column_names = list(self.columns.keys())

    # Internal caches
    self._transformer_impls = {}
    self._aggregator_impls = {}
    self._estimator_impls = {}
    self._metadatas = {}
    self._obj_cache = {}
    self.spark_uploaded_impls = {}

    # This affects Tensorflow S3 access
    os.environ["AWS_REGION"] = self.cortex_config.get("region", "")

    # Id map
    self.pp_id_map = ResourceMap(self.python_packages) if self.python_packages else None
    self.rf_id_map = ResourceMap(self.raw_columns) if self.raw_columns else None
    self.ag_id_map = ResourceMap(self.aggregates) if self.aggregates else None
    self.tf_id_map = ResourceMap(self.transformed_columns) if self.transformed_columns else None
    self.td_id_map = ResourceMap(self.training_datasets) if self.training_datasets else None
    self.models_id_map = ResourceMap(self.models) if self.models else None
    self.apis_id_map = ResourceMap(self.apis) if self.apis else None
    self.constants_id_map = ResourceMap(self.constants) if self.constants else None

    self.id_map = util.merge_dicts_overwrite(
        self.pp_id_map,
        self.rf_id_map,
        self.ag_id_map,
        self.tf_id_map,
        self.td_id_map,
        self.models_id_map,
        self.apis_id_map,
        self.constants_id_map,
    )

class Context:
    def __init__(self, **kwargs):
        if "cache_dir" in kwargs:
            self.cache_dir = kwargs["cache_dir"]
        elif "local_path" in kwargs:
            local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
            self.cache_dir = os.path.join(local_path_dir, "cache")
        else:
            raise ValueError("cache_dir must be specified (or inferred from local_path)")
        util.mkdir_p(self.cache_dir)

        if "local_path" in kwargs:
            ctx_raw = util.read_msgpack(kwargs["local_path"])
            self.ctx = _deserialize_raw_ctx(ctx_raw)
        elif "obj" in kwargs:
            self.ctx = kwargs["obj"]
        elif "raw_obj" in kwargs:
            ctx_raw = kwargs["raw_obj"]
            self.ctx = _deserialize_raw_ctx(ctx_raw)
        elif "s3_path" in kwargs:
            local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
            bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
            S3(bucket, client_config={}).download_file(key, local_ctx_path)
            ctx_raw = util.read_msgpack(local_ctx_path)
            self.ctx = _deserialize_raw_ctx(ctx_raw)
        else:
            raise ValueError("invalid context args: " + str(kwargs))

        self.workload_id = kwargs.get("workload_id")

        self.id = self.ctx["id"]
        self.key = self.ctx["key"]
        self.cortex_config = self.ctx["cortex_config"]
        self.dataset_version = self.ctx["dataset_version"]
        self.root = self.ctx["root"]
        self.raw_dataset = self.ctx["raw_dataset"]
        self.status_prefix = self.ctx["status_prefix"]
        self.app = self.ctx["app"]
        self.environment = self.ctx["environment"]
        self.python_packages = self.ctx["python_packages"] or {}
        self.raw_columns = self.ctx["raw_columns"] or {}
        self.transformed_columns = self.ctx["transformed_columns"] or {}
        self.transformers = self.ctx["transformers"] or {}
        self.aggregators = self.ctx["aggregators"] or {}
        self.aggregates = self.ctx["aggregates"] or {}
        self.constants = self.ctx["constants"] or {}
        self.models = self.ctx["models"] or {}
        self.estimators = self.ctx["estimators"] or {}
        self.apis = self.ctx["apis"] or {}
        self.training_datasets = {k: v["dataset"] for k, v in self.models.items()}
        self.api_version = self.cortex_config["api_version"]

        if "local_storage_path" in kwargs:
            self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
        else:
            self.storage = S3(
                bucket=self.cortex_config["bucket"],
                region=self.cortex_config["region"],
                client_config={},
            )

        if self.api_version != consts.CORTEX_VERSION:
            raise ValueError(
                "API version mismatch (Context: {}, Image: {})".format(
                    self.api_version, consts.CORTEX_VERSION
                )
            )

        self.columns = util.merge_dicts_overwrite(self.raw_columns, self.transformed_columns)

        self.raw_column_names = list(self.raw_columns.keys())
        self.transformed_column_names = list(self.transformed_columns.keys())
        self.column_names = list(self.columns.keys())

        # Internal caches
        self._transformer_impls = {}
        self._aggregator_impls = {}
        self._estimator_impls = {}
        self._metadatas = {}
        self._obj_cache = {}
        self.spark_uploaded_impls = {}

        # This affects Tensorflow S3 access
        os.environ["AWS_REGION"] = self.cortex_config.get("region", "")

        # Id map
        self.pp_id_map = ResourceMap(self.python_packages) if self.python_packages else None
        self.rf_id_map = ResourceMap(self.raw_columns) if self.raw_columns else None
        self.ag_id_map = ResourceMap(self.aggregates) if self.aggregates else None
        self.tf_id_map = ResourceMap(self.transformed_columns) if self.transformed_columns else None
        self.td_id_map = ResourceMap(self.training_datasets) if self.training_datasets else None
        self.models_id_map = ResourceMap(self.models) if self.models else None
        self.apis_id_map = ResourceMap(self.apis) if self.apis else None
        self.constants_id_map = ResourceMap(self.constants) if self.constants else None

        self.id_map = util.merge_dicts_overwrite(
            self.pp_id_map,
            self.rf_id_map,
            self.ag_id_map,
            self.tf_id_map,
            self.td_id_map,
            self.models_id_map,
            self.apis_id_map,
            self.constants_id_map,
        )

    def is_raw_column(self, name):
        return name in self.raw_columns

    def is_transformed_column(self, name):
        return name in self.transformed_columns

    def is_constant(self, name):
        return name in self.constants

    def is_aggregate(self, name):
        return name in self.aggregates

    def download_file(self, impl_key, cache_impl_path):
        if not os.path.isfile(cache_impl_path):
            self.storage.download_file(impl_key, cache_impl_path)
        return cache_impl_path

    def download_python_file(self, impl_key, module_name):
        cache_impl_path = os.path.join(self.cache_dir, "{}.py".format(module_name))
        self.download_file(impl_key, cache_impl_path)
        return cache_impl_path

    def get_obj(self, key):
        if key in self._obj_cache:
            return self._obj_cache[key]

        cache_path = os.path.join(self.cache_dir, key)
        self.download_file(key, cache_path)
        self._obj_cache[key] = util.read_msgpack(cache_path)
        return self._obj_cache[key]

    def load_module(self, module_prefix, module_name, impl_key):
        full_module_name = "{}_{}".format(module_prefix, module_name)

        try:
            impl_path = self.download_python_file(impl_key, full_module_name)
        except CortexException as e:
            e.wrap("unable to find python file " + module_name)
            raise

        try:
            impl = imp.load_source(full_module_name, impl_path)
        except Exception as e:
            raise UserException("unable to load python module " + module_name) from e

        return impl, impl_path

    def get_aggregator_impl(self, aggregate_name):
        aggregator_name = self.aggregates[aggregate_name]["aggregator"]
        if aggregator_name in self._aggregator_impls:
            return self._aggregator_impls[aggregator_name]

        aggregator = self.aggregators[aggregator_name]

        module_prefix = "aggregator"
        if aggregator.get("namespace") is not None:
            module_prefix += "_" + aggregator["namespace"]

        try:
            impl, impl_path = self.load_module(
                module_prefix, aggregator["name"], aggregator["impl_key"]
            )
        except CortexException as e:
            e.wrap("aggregate " + aggregate_name, "aggregator")
            raise

        try:
            _validate_impl(impl, AGGREGATOR_IMPL_VALIDATION)
        except CortexException as e:
            e.wrap("aggregate " + aggregate_name, "aggregator " + aggregator["name"])
            raise

        self._aggregator_impls[aggregator_name] = (impl, impl_path)
        return (impl, impl_path)

    def get_transformer_impl(self, column_name):
        if not self.is_transformed_column(column_name):
            return None, None

        transformer_name = self.transformed_columns[column_name]["transformer"]
        if transformer_name in self._transformer_impls:
            return self._transformer_impls[transformer_name]

        transformer = self.transformers[transformer_name]

        module_prefix = "transformer"
        if transformer.get("namespace") is not None:
            module_prefix += "_" + transformer["namespace"]

        try:
            impl, impl_path = self.load_module(
                module_prefix, transformer["name"], transformer["impl_key"]
            )
        except CortexException as e:
            e.wrap("transformed column " + column_name, "transformer")
            raise

        try:
            _validate_impl(impl, TRANSFORMER_IMPL_VALIDATION)
        except CortexException as e:
            e.wrap("transformed column " + column_name, "transformer " + transformer["name"])
            raise

        self._transformer_impls[transformer_name] = (impl, impl_path)
        return (impl, impl_path)

    def get_estimator_impl(self, model_name):
        estimator_name = self.models[model_name]["estimator"]
        if estimator_name in self._estimator_impls:
            return self._estimator_impls[estimator_name]

        estimator = self.estimators[estimator_name]

        module_prefix = "estimator"
        if estimator.get("namespace") is not None:
            module_prefix += "_" + estimator["namespace"]

        try:
            impl, impl_path = self.load_module(
                module_prefix, estimator["name"], estimator["impl_key"]
            )
        except CortexException as e:
            e.wrap("model " + model_name, "estimator")
            raise

        try:
            _validate_impl(impl, MODEL_IMPL_VALIDATION)
        except CortexException as e:
            e.wrap("model " + model_name, "estimator " + estimator["name"])
            raise

        self._estimator_impls[estimator_name] = (impl, impl_path)
        return (impl, impl_path)

    def get_request_handler_impl(self, api_name):
        api = self.apis[api_name]

        module_prefix = "request_handler"

        try:
            impl, impl_path = self.load_module(
                module_prefix, api["name"], api["request_handler_impl_key"]
            )
        except CortexException as e:
            e.wrap("api " + api_name, "request_handler")
            raise

        try:
            _validate_impl(impl, REQUEST_HANDLER_IMPL_VALIDATION)
        except CortexException as e:
            e.wrap("api " + api_name, "request_handler " + api["request_handler"])
            raise
        return impl

    # Mode must be "training" or "evaluation"
    def get_training_data_parts(self, model_name, mode, part_prefix="part"):
        training_dataset = self.models[model_name]["dataset"]
        if mode == "training":
            data_key = training_dataset["train_key"]
        elif mode == "evaluation":
            data_key = training_dataset["eval_key"]
        else:
            raise CortexException(
                'unrecognized training/evaluation mode {}; must be one of ("training", "evaluation")'.format(
                    mode
                )
            )
        training_data_parts_prefix = os.path.join(data_key, part_prefix)
        return self.storage.search(prefix=training_data_parts_prefix)

    def store_aggregate_result(self, result, aggregate):
        self.storage.put_msgpack(result, aggregate["key"])

    def extract_column_names(self, input):
        column_names = set()
        for resource_name in util.extract_resource_refs(input):
            if resource_name in self.columns:
                column_names.add(resource_name)
        return column_names

    def model_config(self, model_name):
        model = self.models[model_name]
        if model is None:
            return None
        estimator = self.estimators[model["estimator"]]
        target_column = self.columns[util.get_resource_ref(model["target_column"])]

        if estimator.get("target_column") is not None:
            target_col_type = self.get_inferred_column_type(target_column["name"])
            if target_col_type not in estimator["target_column"]:
                raise UserException(
                    "model " + model_name,
                    "target_column",
                    target_column["name"],
                    "unsupported type (expected type {}, got type {})".format(
                        util.data_type_str(estimator["target_column"]),
                        util.data_type_str(target_col_type),
                    ),
                )

        model_config = deepcopy(model)
        config_keys = [
            "name",
            "estimator",
            "estimator_path",
            "target_column",
            "input",
            "training_input",
            "hparams",
            "prediction_key",
            "data_partition_ratio",
            "training",
            "evaluation",
            "tags",
        ]
        util.keep_dict_keys(model_config, config_keys)

        model_config["target_column"] = target_column["name"]
        model_config["input"] = self.populate_values(
            model["input"], estimator["input"], preserve_column_refs=False
        )
        if model.get("training_input") is not None:
            model_config["training_input"] = self.populate_values(
                model["training_input"], estimator["training_input"], preserve_column_refs=False
            )
        if model.get("hparams") is not None:
            model_config["hparams"] = self.populate_values(
                model["hparams"], estimator["hparams"], preserve_column_refs=False
            )

        return model_config

    def get_resource_status(self, resource):
        key = self.resource_status_key(resource)
        return self.storage.get_json(key)

    def upload_resource_status_start(self, *resources):
        timestamp = util.now_timestamp_rfc_3339()
        for resource in resources:
            key = self.resource_status_key(resource)
            status = {
                "resource_id": resource["id"],
                "resource_type": resource["resource_type"],
                "workload_id": resource["workload_id"],
                "app_name": self.app["name"],
                "start": timestamp,
            }
            self.storage.put_json(status, key)

    def upload_resource_status_no_op(self, *resources):
        timestamp = util.now_timestamp_rfc_3339()
        for resource in resources:
            key = self.resource_status_key(resource)
            status = {
                "resource_id": resource["id"],
                "resource_type": resource["resource_type"],
                "workload_id": resource["workload_id"],
                "app_name": self.app["name"],
                "start": timestamp,
                "end": timestamp,
                "exit_code": "succeeded",
            }
            self.storage.put_json(status, key)

    def upload_resource_status_success(self, *resources):
        self.upload_resource_status_end("succeeded", *resources)

    def upload_resource_status_failed(self, *resources):
        self.upload_resource_status_end("failed", *resources)

    def upload_resource_status_end(self, exit_code, *resources):
        timestamp = util.now_timestamp_rfc_3339()
        for resource in resources:
            status = self.get_resource_status(resource)
            if status.get("end") is not None:
                continue
            status["end"] = timestamp
            status["exit_code"] = exit_code
            key = self.resource_status_key(resource)
            self.storage.put_json(status, key)

    def resource_status_key(self, resource):
        return os.path.join(self.status_prefix, resource["id"], resource["workload_id"])

    def get_metadata_url(self, resource_id):
        return os.path.join(self.ctx["metadata_root"], resource_id + ".json")

    def write_metadata(self, resource_id, metadata):
        if resource_id in self._metadatas and self._metadatas[resource_id] == metadata:
            return

        self._metadatas[resource_id] = metadata
        self.storage.put_json(metadata, self.get_metadata_url(resource_id))

    def get_metadata(self, resource_id, use_cache=True):
        if use_cache and resource_id in self._metadatas:
            return self._metadatas[resource_id]

        metadata = self.storage.get_json(self.get_metadata_url(resource_id), allow_missing=True)
        self._metadatas[resource_id] = metadata
        return metadata

    def get_inferred_column_type(self, column_name):
        column = self.columns[column_name]
        column_type = self.columns[column_name]["type"]
        if column_type == consts.COLUMN_TYPE_INFERRED:
            column_type = self.get_metadata(column["id"])["type"]
            self.columns[column_name]["type"] = column_type

        return column_type

    # Replace aggregates and constants with their values, and columns with
    # their names (unless preserve_column_refs == False).
    # Also validate against input_schema (if not None).
    def populate_values(self, input, input_schema, preserve_column_refs):
        if input is None:
            if input_schema is None:
                return None
            if input_schema.get("_allow_null") is True:
                return None
            raise UserException("Null value is not allowed")

        if util.is_resource_ref(input):
            res_name = util.get_resource_ref(input)

            if res_name in self.constants:
                if self.constants[res_name].get("value") is not None:
                    const_val = self.constants[res_name]["value"]
                elif self.constants[res_name].get("path") is not None:
                    const_val = self.storage.get_json_external(self.constants[res_name]["path"])
                try:
                    return self.populate_values(const_val, input_schema, preserve_column_refs)
                except CortexException as e:
                    e.wrap("constant " + res_name)
                    raise

            if res_name in self.aggregates:
                agg_val = self.get_obj(self.aggregates[res_name]["key"])
                try:
                    return self.populate_values(agg_val, input_schema, preserve_column_refs)
                except CortexException as e:
                    e.wrap("aggregate " + res_name)
                    raise

            if res_name in self.columns:
                if input_schema is not None:
                    col_type = self.get_inferred_column_type(res_name)
                    if col_type not in input_schema["_type"]:
                        raise UserException(
                            "column {}: unsupported input type (expected type {}, got type {})".format(
                                res_name,
                                util.data_type_str(input_schema["_type"]),
                                util.data_type_str(col_type),
                            )
                        )
                if preserve_column_refs:
                    return input
                else:
                    return res_name

        if util.is_list(input):
            elem_schema = None
            if input_schema is not None:
                if not util.is_list(input_schema["_type"]):
                    raise UserException(
                        "unsupported input type (expected type {}, got {})".format(
                            util.data_type_str(input_schema["_type"]), util.user_obj_str(input)
                        )
                    )
                elem_schema = input_schema["_type"][0]

                min_count = input_schema.get("_min_count")
                if min_count is not None and len(input) < min_count:
                    raise UserException(
                        "list has length {}, but the minimum allowed length is {}".format(
                            len(input), min_count
                        )
                    )

                max_count = input_schema.get("_max_count")
                if max_count is not None and len(input) > max_count:
                    raise UserException(
                        "list has length {}, but the maximum allowed length is {}".format(
                            len(input), max_count
                        )
                    )

            casted = []
            for i, elem in enumerate(input):
                try:
                    casted.append(self.populate_values(elem, elem_schema, preserve_column_refs))
                except CortexException as e:
                    e.wrap("index " + str(i))
                    raise
            return casted

        if util.is_dict(input):
            if input_schema is None:
                casted = {}
                for key, val in input.items():
                    key_casted = self.populate_values(key, None, preserve_column_refs)
                    try:
                        val_casted = self.populate_values(val, None, preserve_column_refs)
                    except CortexException as e:
                        e.wrap(util.user_obj_str(key))
                        raise
                    casted[key_casted] = val_casted
                return casted

            if not util.is_dict(input_schema["_type"]):
                raise UserException(
                    "unsupported input type (expected type {}, got {})".format(
                        util.data_type_str(input_schema["_type"]), util.user_obj_str(input)
                    )
                )

            min_count = input_schema.get("_min_count")
            if min_count is not None and len(input) < min_count:
                raise UserException(
                    "map has length {}, but the minimum allowed length is {}".format(
                        len(input), min_count
                    )
                )

            max_count = input_schema.get("_max_count")
            if max_count is not None and len(input) > max_count:
                raise UserException(
                    "map has length {}, but the maximum allowed length is {}".format(
                        len(input), max_count
                    )
                )

            is_generic_map = False
            if len(input_schema["_type"]) == 1:
                input_type_key = next(iter(input_schema["_type"].keys()))
                if is_compound_type(input_type_key):
                    is_generic_map = True
                    generic_map_key_schema = input_schema_from_type_schema(input_type_key)
                    generic_map_value = input_schema["_type"][input_type_key]

            if is_generic_map:
                casted = {}
                for key, val in input.items():
                    key_casted = self.populate_values(
                        key, generic_map_key_schema, preserve_column_refs
                    )
                    try:
                        val_casted = self.populate_values(
                            val, generic_map_value, preserve_column_refs
                        )
                    except CortexException as e:
                        e.wrap(util.user_obj_str(key))
                        raise
                    casted[key_casted] = val_casted
                return casted

            # fixed map
            casted = {}
            for key, val_schema in input_schema["_type"].items():
                if key in input:
                    val = input[key]
                else:
                    if val_schema.get("_optional") is not True:
                        raise UserException("missing key: " + util.user_obj_str(key))
                    if val_schema.get("_default") is None:
                        continue
                    val = val_schema["_default"]
                try:
                    val_casted = self.populate_values(val, val_schema, preserve_column_refs)
                except CortexException as e:
                    e.wrap(util.user_obj_str(key))
                    raise
                casted[key] = val_casted
            return casted

        if input_schema is None:
            return input
        if not util.is_str(input_schema["_type"]):
            raise UserException(
                "unsupported input type (expected type {}, got {})".format(
                    util.data_type_str(input_schema["_type"]), util.user_obj_str(input)
                )
            )
        return cast_compound_type(input, input_schema["_type"])

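# A hypothetical input_schema for populate_values, using only the keys the
# method inspects above ("_type", "_optional", "_default", "_min_count",
# "_max_count", "_allow_null"); the type strings and field names are
# illustrative, not values prescribed by the source.
example_schema = {
    "_type": {
        "col": {"_type": "FLOAT_COLUMN"},  # column refs are type-checked via get_inferred_column_type
        "alpha": {"_type": "FLOAT", "_optional": True, "_default": 0.1},
    },
    "_min_count": 1,
}
# ctx.populate_values(user_input, example_schema, preserve_column_refs=False)
# (assuming `ctx` is an initialized Context and `user_input` is a dict)
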
class Context:
    def __init__(self, **kwargs):
        if "cache_dir" in kwargs:
            self.cache_dir = kwargs["cache_dir"]
        elif "local_path" in kwargs:
            local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
            self.cache_dir = os.path.join(local_path_dir, "cache")
        else:
            raise ValueError("cache_dir must be specified (or inferred from local_path)")
        util.mkdir_p(self.cache_dir)

        if "local_path" in kwargs:
            self.ctx = util.read_msgpack(kwargs["local_path"])
        elif "obj" in kwargs:
            self.ctx = kwargs["obj"]
        elif "raw_obj" in kwargs:
            self.ctx = kwargs["raw_obj"]
        elif "s3_path" in kwargs:
            local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
            bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
            S3(bucket, client_config={}).download_file(key, local_ctx_path)
            self.ctx = util.read_msgpack(local_ctx_path)
        else:
            raise ValueError("invalid context args: " + str(kwargs))

        self.workload_id = kwargs.get("workload_id")

        self.id = self.ctx["id"]
        self.key = self.ctx["key"]
        self.metadata_root = self.ctx["metadata_root"]
        self.cluster_config = self.ctx["cluster_config"]
        self.deployment_version = self.ctx["deployment_version"]
        self.root = self.ctx["root"]
        self.status_prefix = self.ctx["status_prefix"]
        self.app = self.ctx["app"]
        self.apis = self.ctx["apis"] or {}
        self.api_version = self.cluster_config["api_version"]
        self.monitoring = None
        self.project_id = self.ctx["project_id"]
        self.project_key = self.ctx["project_key"]

        if "local_storage_path" in kwargs:
            self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
        else:
            self.storage = S3(
                bucket=self.cluster_config["bucket"],
                region=self.cluster_config["region"],
                client_config={},
            )

        host_ip = os.environ["HOST_IP"]
        datadog.initialize(statsd_host=host_ip, statsd_port="8125")
        self.statsd = datadog.statsd

        if self.api_version != consts.CORTEX_VERSION:
            raise ValueError(
                "api version mismatch (context: {}, image: {})".format(
                    self.api_version, consts.CORTEX_VERSION
                )
            )

        # This affects TensorFlow S3 access
        os.environ["AWS_REGION"] = self.cluster_config.get("region", "")

        # ID maps
        self.apis_id_map = ResourceMap(self.apis) if self.apis else None
        self.id_map = self.apis_id_map

    def download_file(self, impl_key, cache_impl_path):
        if not os.path.isfile(cache_impl_path):
            self.storage.download_file(impl_key, cache_impl_path)
        return cache_impl_path

    def download_python_file(self, impl_key, module_name):
        cache_impl_path = os.path.join(self.cache_dir, "{}.py".format(module_name))
        self.download_file(impl_key, cache_impl_path)
        return cache_impl_path

    def load_module(self, module_prefix, module_name, impl_path):
        full_module_name = "{}_{}".format(module_prefix, module_name)

        if impl_path.endswith(".pickle"):
            try:
                impl = imp.new_module(full_module_name)

                with open(impl_path, "rb") as pickle_file:
                    pickled_dict = dill.load(pickle_file)
                    for key in pickled_dict:
                        setattr(impl, key, pickled_dict[key])
            except Exception as e:
                raise UserException("unable to load pickle", str(e)) from e
        else:
            try:
                impl = imp.load_source(full_module_name, impl_path)
            except Exception as e:
                raise UserException(str(e)) from e

        return impl

    def get_predictor_class(self, api_name, project_dir):
        api = self.apis[api_name]

        if api["predictor"]["type"] == "tensorflow":
            target_class_name = "TensorFlowPredictor"
            validations = TENSORFLOW_CLASS_VALIDATION
        elif api["predictor"]["type"] == "onnx":
            target_class_name = "ONNXPredictor"
            validations = ONNX_CLASS_VALIDATION
        elif api["predictor"]["type"] == "python":
            target_class_name = "PythonPredictor"
            validations = PYTHON_CLASS_VALIDATION

        try:
            impl = self.load_module(
                "predictor", api["name"], os.path.join(project_dir, api["predictor"]["path"])
            )
        except CortexException as e:
            e.wrap("api " + api_name, "error in " + api["predictor"]["path"])
            raise
        finally:
            refresh_logger()

        try:
            classes = inspect.getmembers(impl, inspect.isclass)
            predictor_class = None
            for class_df in classes:
                if class_df[0] == target_class_name:
                    if predictor_class is not None:
                        raise UserException(
                            "multiple definitions for {} class found; please check your imports and class definitions and ensure that there is only one Predictor class definition".format(
                                target_class_name
                            )
                        )
                    predictor_class = class_df[1]
            if predictor_class is None:
                raise UserException("{} class is not defined".format(target_class_name))

            _validate_impl(predictor_class, validations)
        except CortexException as e:
            e.wrap("api " + api_name, "error in " + api["predictor"]["path"])
            raise
        return predictor_class

    def get_resource_status(self, resource):
        key = self.resource_status_key(resource)
        return self.storage.get_json(key, num_retries=5)

    def upload_resource_status_start(self, *resources):
        timestamp = util.now_timestamp_rfc_3339()
        for resource in resources:
            key = self.resource_status_key(resource)
            status = {
                "resource_id": resource["id"],
                "resource_type": resource["resource_type"],
                "workload_id": resource["workload_id"],
                "app_name": self.app["name"],
                "start": timestamp,
            }
            self.storage.put_json(status, key)

    def upload_resource_status_no_op(self, *resources):
        timestamp = util.now_timestamp_rfc_3339()
        for resource in resources:
            key = self.resource_status_key(resource)
            status = {
                "resource_id": resource["id"],
                "resource_type": resource["resource_type"],
                "workload_id": resource["workload_id"],
                "app_name": self.app["name"],
                "start": timestamp,
                "end": timestamp,
                "exit_code": "succeeded",
            }
            self.storage.put_json(status, key)

    def upload_resource_status_success(self, *resources):
        self.upload_resource_status_end("succeeded", *resources)

    def upload_resource_status_failed(self, *resources):
        self.upload_resource_status_end("failed", *resources)

    def upload_resource_status_end(self, exit_code, *resources):
        timestamp = util.now_timestamp_rfc_3339()
        for resource in resources:
            status = self.get_resource_status(resource)
            if status.get("end") is not None:
                continue
            status["end"] = timestamp
            status["exit_code"] = exit_code
            key = self.resource_status_key(resource)
            self.storage.put_json(status, key)

    def resource_status_key(self, resource):
        return os.path.join(self.status_prefix, resource["id"], resource["workload_id"])

    def publish_metrics(self, metrics):
        if self.statsd is None:
            raise CortexException("statsd client not initialized")  # unexpected

        for metric in metrics:
            tags = ["{}:{}".format(dim["Name"], dim["Value"]) for dim in metric["Dimensions"]]
            if metric.get("Unit") == "Count":
                self.statsd.increment(metric["MetricName"], value=metric["Value"], tags=tags)
            else:
                self.statsd.histogram(metric["MetricName"], value=metric["Value"], tags=tags)
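# Hypothetical usage of get_predictor_class above (api name and project dir are
# placeholders): the returned class is what the serving entrypoint instantiates;
# its constructor signature depends on the user's Predictor implementation.
# PredictorClass = ctx.get_predictor_class("iris-classifier", "/mnt/project")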