import json
import math
import os
import re
import threading as td
import time
from http import HTTPStatus
from typing import Callable, Dict, List, Union

import boto3
import cortex as cx
import requests
import yaml

# local test helpers (TEST_APIS_DIR, endpoint_ready, job_done, etc. live in the
# suite's shared utils module; the exact import path depends on the repo layout)
from e2e.utils import (
    TEST_APIS_DIR,
    check_futures_healthy,
    delete_apis,
    endpoint_ready,
    job_done,
    jobs_done,
    load_generator,
    request_batch_prediction,
    request_task,
    request_tasks_concurrently,
    retrieve_results_concurrently,
    stream_job_logs,
    wait_on_event,
    wait_on_futures,
)


def test_batch_api(
    client: cx.Client,
    api: str,
    test_s3_path: str,
    deploy_timeout: int = None,
    job_timeout: int = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1
    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        with open(str(api_dir / "sample.json")) as f:
            payload = json.load(f)

        response = None
        for _ in range(retry_attempts + 1):
            response = request_batch_prediction(
                client,
                api_name,
                item_list=payload,
                batch_size=2,
                config={"dest_s3_dir": test_s3_path},
            )
            if response.status_code == HTTPStatus.OK:
                break
            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        # monitor job progress
        assert job_done(
            client=client,
            api_name=job_spec["api_name"],
            job_id=job_spec["job_id"],
            timeout=job_timeout,
        ), f"job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"
    finally:
        delete_apis(client, [api_name])
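# Each test directory is expected to contain an api config that parses to a
# single-element list, which is why the helpers assert len(api_specs) == 1 and
# read api_specs[0]["name"]. A minimal, hypothetical BatchAPI config sketch --
# the api name and predictor fields are illustrative only, and the exact schema
# varies across Cortex versions:
#
#   # test/apis/<api>/cortex.yaml
#   - name: sum
#     kind: BatchAPI
#     predictor:
#       type: python
#       path: predictor.py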
def test_task_api(
    client: cx.Client,
    api: str,
    deploy_timeout: int = None,
    job_timeout: int = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1
    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        response = None
        for _ in range(retry_attempts + 1):
            response = request_task(client, api_name)
            if response.status_code == HTTPStatus.OK:
                break
            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        assert job_done(
            client=client,
            api_name=api_name,
            job_id=job_spec["job_id"],
            timeout=job_timeout,
        ), f"task job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"
    finally:
        delete_apis(client, [api_name])
def test_load_task(
    printer: Callable,
    client: cx.Client,
    api: str,
    load_config: Dict[str, Union[int, float]],
    deploy_timeout: int = None,
    retry_attempts: int = 0,
    poll_sleep_seconds: int = 1,
    api_config_name: str = "cortex.yaml",
):
    jobs = load_config["jobs"]
    concurrency = load_config["concurrency"]
    submit_timeout = load_config["submit_timeout"]
    workload_timeout = load_config["workload_timeout"]

    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1
    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    request_stopper = td.Event()
    map_stopper = td.Event()
    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        # give the operator time to start
        time.sleep(1 * retry_attempts)

        # submit jobs
        printer(f"submitting {jobs} jobs concurrently")
        job_specs = []
        threads_futures = request_tasks_concurrently(
            client, api_name, request_stopper, concurrency, jobs, job_specs
        )
        assert wait_on_event(
            request_stopper, submit_timeout
        ), f"{jobs} jobs couldn't be submitted in {submit_timeout}s"
        check_futures_healthy(threads_futures)
        wait_on_futures(threads_futures)

        # poll until all submitted jobs finish
        printer("waiting on the jobs")
        job_ids = [job_spec.json()["job_id"] for job_spec in job_specs]
        retrieve_results_concurrently(
            client,
            api_name,
            concurrency,
            map_stopper,
            job_ids,
            poll_sleep_seconds=poll_sleep_seconds,
            timeout=workload_timeout,
        )
    except:
        # best effort
        try:
            api_info = client.get_api(api_name)

            # only keep the last 10 job statuses
            if "task_job_statuses" in api_info and len(api_info["task_job_statuses"]) > 10:
                api_info["task_job_statuses"] = api_info["task_job_statuses"][-10:]

            printer(json.dumps(api_info, indent=2))
        except:
            pass
        raise
    finally:
        map_stopper.set()
        delete_apis(client, [api_name])
def test_load_batch(
    printer: Callable,
    client: cx.Client,
    api: str,
    test_s3_path: str,
    load_config: Dict[str, Union[int, float]],
    deploy_timeout: int = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):
    jobs = load_config["jobs"]
    workers_per_job = load_config["workers_per_job"]
    items_per_job = load_config["items_per_job"]
    batch_size = load_config["batch_size"]
    workload_timeout = load_config["workload_timeout"]

    bucket, key = re.match("s3://(.+?)/(.+)", test_s3_path).groups()
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)
    assert len(api_specs) == 1

    sample_generator_path = api_dir / "sample_generator.py"
    assert (
        sample_generator_path.exists()
    ), "sample_generator.py must be present for the batch load test"
    sample_generator = load_generator(sample_generator_path)

    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)
    api_endpoint = client.get_api(api_name)["endpoint"]

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        # submit jobs
        printer(f"submitting {jobs} jobs")
        job_specs = []
        for _ in range(jobs):
            for _ in range(retry_attempts + 1):
                response = request_batch_prediction(
                    client,
                    api_name,
                    item_list=[sample_generator() for _ in range(items_per_job)],
                    batch_size=batch_size,
                    workers=workers_per_job,
                    config={"dest_s3_dir": test_s3_path},
                )
                if response.status_code == HTTPStatus.OK:
                    break
                time.sleep(1)
            # retries are only needed for the first job
            retry_attempts = 0

            assert (
                response.status_code == HTTPStatus.OK
            ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"
            job_specs.append(response.json())

        # wait for the jobs to finish
        printer("waiting on the jobs")
        assert jobs_done(
            client, api_name, [job_spec["job_id"] for job_spec in job_specs], workload_timeout
        ), f"not all jobs succeeded in {workload_timeout}s"

        # verify each job's final status and output
        printer("checking the jobs' responses")
        for job_spec in job_specs:
            job_id: str = job_spec["job_id"]
            job_status = requests.get(f"{api_endpoint}?jobID={job_id}").json()["job_status"]
            assert (
                job_status["batches_in_queue"] == 0
            ), f"there are still batches in queue ({job_status['batches_in_queue']}) for job ID {job_id}"
            assert job_status["batch_metrics"]["succeeded"] == math.ceil(
                items_per_job / batch_size
            )

            # each job must have written exactly one result object to S3
            num_objects = 0
            for page in paginator.paginate(Bucket=bucket, Prefix=os.path.join(key, job_id)):
                num_objects += len(page.get("Contents", []))
            assert num_objects == 1
    except:
        # best effort
        try:
            api_info = client.get_api(api_name)

            # only keep the last 10 job statuses
            if "batch_job_statuses" in api_info and len(api_info["batch_job_statuses"]) > 10:
                api_info["batch_job_statuses"] = api_info["batch_job_statuses"][-10:]

            printer(json.dumps(api_info, indent=2))
        except:
            pass
        raise
    finally:
        delete_apis(client, [api_name])
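# `load_generator` is expected to load test/apis/<api>/sample_generator.py and
# return its zero-argument callable; each call must produce one
# JSON-serializable work item for the submitted item_list. A minimal,
# hypothetical generator (the field names are illustrative only):
#
#   # test/apis/<api>/sample_generator.py
#   import random
#
#   def sample_generator():
#       return {"value": random.random()}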
def test_task_api(
    printer: Callable,
    client: cx.Client,
    api: str,
    deploy_timeout: int = None,
    job_timeout: int = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1
    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        response = None
        for _ in range(retry_attempts + 1):
            response = request_task(client, api_name)
            if response.status_code == HTTPStatus.OK:
                break
            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        assert job_done(
            client=client,
            api_name=api_name,
            job_id=job_spec["job_id"],
            timeout=job_timeout,
        ), f"task job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"
    except:
        # best effort
        try:
            api_info = client.get_api(api_name)
            printer(json.dumps(api_info, indent=2))

            job_status = client.get_job(api_name, job_spec["job_id"])
            printer(json.dumps(job_status, indent=2))

            # dump a few seconds of job logs before re-raising
            td.Thread(
                target=lambda: client.stream_job_logs(api_name, job_spec["job_id"]),
                daemon=True,
            ).start()
            time.sleep(5)
        except:
            pass
        raise
    finally:
        delete_apis(client, [api_name])
def test_batch_api(
    printer: Callable,
    client: cx.Client,
    api: str,
    test_s3_path: str,
    deploy_timeout: int = None,
    job_timeout: int = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex.yaml",
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1
    api_name = api_specs[0]["name"]
    client.create_api(api_spec=api_specs[0], project_dir=api_dir)

    try:
        assert endpoint_ready(
            client=client, api_name=api_name, timeout=deploy_timeout
        ), f"api {api_name} not ready"

        with open(str(api_dir / "sample.json")) as f:
            payload = json.load(f)

        response = None
        for _ in range(retry_attempts + 1):
            response = request_batch_prediction(
                client,
                api_name,
                item_list=payload,
                batch_size=2,
                config={"dest_s3_dir": test_s3_path},
            )
            if response.status_code == HTTPStatus.OK:
                break
            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        # monitor job progress
        assert job_done(
            client=client,
            api_name=job_spec["api_name"],
            job_id=job_spec["job_id"],
            timeout=job_timeout,
        ), f"job did not succeed (api_name: {api_name}, job_id: {job_spec['job_id']})"
    except:
        # best effort
        try:
            api_info = client.get_api(api_name)
            printer(json.dumps(api_info, indent=2))

            job_status = client.get_job(api_name, job_spec["job_id"])
            printer(json.dumps(job_status, indent=2))

            # dump a few seconds of job logs before re-raising
            td.Thread(
                target=lambda: client.stream_job_logs(api_name, job_spec["job_id"]),
                daemon=True,
            ).start()
            time.sleep(5)
        except:
            pass
        raise
    finally:
        delete_apis(client, [api_name])
def test_task_api(
    printer: Callable,
    client: cx.Client,
    api: str,
    deploy_timeout: int = None,
    job_timeout: int = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex_cpu.yaml",
    node_groups: List[str] = [],
    local_operator: bool = False,
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1
    if len(node_groups) > 0:
        api_specs[0]["node_groups"] = node_groups

    api_name = api_specs[0]["name"]
    client.deploy(api_spec=api_specs[0])

    try:
        endpoint_override = (
            f"http://localhost:8888/tasks/{api_name}" if local_operator else None
        )
        assert endpoint_ready(
            client=client,
            api_name=api_name,
            timeout=deploy_timeout,
            endpoint_override=endpoint_override,
        ), f"api {api_name} not ready"

        response = None
        for _ in range(retry_attempts + 1):
            response = request_task(client, api_name, local_operator=local_operator)
            if response.status_code == HTTPStatus.OK:
                break
            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()
        job_id = job_spec["job_id"]
        endpoint_override = (
            f"http://localhost:8888/tasks/{api_name}?jobID={job_id}" if local_operator else None
        )
        assert job_done(
            client=client,
            api_name=api_name,
            job_id=job_id,
            timeout=job_timeout,
            endpoint_override=endpoint_override,
        ), f"task job did not succeed (api_name: {api_name}, job_id: {job_id})"
    except:
        # best effort
        try:
            api_info = client.get_api(api_name)
            printer(json.dumps(api_info, indent=2))

            job_status = client.get_job(api_name, job_spec["job_id"])
            printer(json.dumps(job_status, indent=2))

            # dump a few seconds of job logs before re-raising
            td.Thread(
                target=lambda: stream_job_logs(client, api_name, job_spec["job_id"]),
                daemon=True,
            ).start()
            time.sleep(5)
        except:
            pass
        raise
    finally:
        delete_apis(client, [api_name])
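# The `local_operator` flag assumes the Cortex operator is reachable at
# localhost:8888 (the same base URL used for endpoint_override above), e.g. via
# a port-forward. A plausible sketch of the `request_task` helper under that
# assumption -- not the suite's actual implementation:
#
#   def request_task(client, api_name, local_operator=False):
#       endpoint = (
#           f"http://localhost:8888/tasks/{api_name}"
#           if local_operator
#           else client.get_api(api_name)["endpoint"]
#       )
#       return requests.post(endpoint, json={})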
def test_batch_api(
    printer: Callable,
    client: cx.Client,
    api: str,
    test_s3_path: str,
    deploy_timeout: int = None,
    job_timeout: int = None,
    retry_attempts: int = 0,
    api_config_name: str = "cortex_cpu.yaml",
    node_groups: List[str] = [],
    local_operator: bool = False,
):
    api_dir = TEST_APIS_DIR / api
    with open(str(api_dir / api_config_name)) as f:
        api_specs = yaml.safe_load(f)

    assert len(api_specs) == 1
    if len(node_groups) > 0:
        api_specs[0]["node_groups"] = node_groups

    api_name = api_specs[0]["name"]
    client.deploy(api_spec=api_specs[0])

    try:
        endpoint_override = (
            f"http://localhost:8888/batch/{api_name}" if local_operator else None
        )
        assert endpoint_ready(
            client=client,
            api_name=api_name,
            timeout=deploy_timeout,
            endpoint_override=endpoint_override,
        ), f"api {api_name} not ready"

        with open(str(api_dir / "sample.json")) as f:
            payload = json.load(f)

        response = None
        for _ in range(retry_attempts + 1):
            response = request_batch_prediction(
                client,
                api_name,
                item_list=payload,
                batch_size=2,
                config={"dest_s3_dir": test_s3_path},
                local_operator=local_operator,
            )
            if response.status_code == HTTPStatus.OK:
                break
            time.sleep(1)

        assert (
            response.status_code == HTTPStatus.OK
        ), f"status code: got {response.status_code}, expected {HTTPStatus.OK} ({response.text})"

        job_spec = response.json()

        # monitor job progress
        job_id = job_spec["job_id"]
        endpoint_override = (
            f"http://localhost:8888/batch/{api_name}?jobID={job_id}" if local_operator else None
        )
        assert job_done(
            client=client,
            api_name=api_name,
            job_id=job_id,
            timeout=job_timeout,
            endpoint_override=endpoint_override,
        ), f"job did not succeed (api_name: {api_name}, job_id: {job_id})"
    except:
        # best effort
        try:
            api_info = client.get_api(api_name)
            printer(json.dumps(api_info, indent=2))

            job_status = client.get_job(api_name, job_spec["job_id"])
            printer(json.dumps(job_status, indent=2))

            # dump a few seconds of job logs before re-raising
            td.Thread(
                target=lambda: stream_job_logs(client, api_name, job_spec["job_id"]),
                daemon=True,
            ).start()
            time.sleep(5)
        except:
            pass
        raise
    finally:
        delete_apis(client, [api_name])
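# A minimal, hypothetical pytest invocation of the helpers above; the `client`
# and `printer` fixtures, the "batch/sum" test api, and the S3 path are
# assumptions for illustration and should be adjusted to the suite's conftest:
#
#   def test_sum_batch(printer, client):
#       test_batch_api(
#           printer=printer,
#           client=client,
#           api="batch/sum",
#           test_s3_path="s3://my-test-bucket/job-results",
#           deploy_timeout=300,
#           job_timeout=300,
#           retry_attempts=5,
#       )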