def test_inference_job_scale(args):
    """Scale an inference job's deployment up and then back down.

    Submits a long-running inference job, waits until it is running, and
    verifies that ``utils.scale_job`` drives the backing Kubernetes
    deployment's replica count to the requested value in both directions.

    Skipped when the launcher is "controller" (scaling is exercised only
    under the other launcher in this suite).
    """
    if utils.get_launcher(args.config) == "controller":
        return

    # "sleep 600" keeps the job alive long enough to observe both scalings.
    job_spec = utils.gen_default_job_description("inference",
                                                 args.email,
                                                 args.uid,
                                                 args.vc,
                                                 cmd="sleep 600")
    with utils.run_job(args.rest, job_spec) as job:
        job_id = job.jid
        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"

        # The deployment backing an inference job is named "<jobId>-deployment"
        # and starts with a single replica.
        deployment_name = job_id + "-deployment"
        deployment = utils.kube_get_deployment(args.config, "default",
                                               deployment_name)
        assert 1 == deployment.spec.replicas

        _scale_and_verify(args, job_id, deployment_name, 2, "up")
        _scale_and_verify(args, job_id, deployment_name, 1, "down")


def _scale_and_verify(args, job_id, deployment_name, desired_replicas,
                      direction):
    """Request *desired_replicas* for *job_id* and assert the deployment
    converges to that count.

    *direction* is "up" or "down" and is used only for the log message.
    """
    # Lazy %-args: logging formats only if the record is emitted.
    logger.info("scale %s job %s to %d", direction, job_id, desired_replicas)
    resp = utils.scale_job(args.rest, args.email, job_id, desired_replicas)
    assert "Success" == resp

    # Fixed settle period for the controller to reconcile the deployment.
    time.sleep(30)

    deployment = utils.kube_get_deployment(args.config, "default",
                                           deployment_name)
    assert desired_replicas == deployment.spec.replicas
def test_fault_tolerance(args):
    """Verify a job survives pod deletion: the pod is recreated and its SSH
    endpoint comes back and is reachable.

    Flow: submit a regular job, request an SSH endpoint, wait for the job to
    run and the endpoint to become ready, delete the job's pod, wait for the
    endpoint to cycle through "pending" back to ready, then exec into the
    jobmanager pod and SSH to the new endpoint to confirm connectivity.
    """
    # Job is only retried when launcher is controller.
    if utils.get_launcher(args.config) == "python":
        return
    job_spec = utils.gen_default_job_description("regular", args.email,
                                                 args.uid, args.vc)
    with utils.run_job(args.rest, job_spec) as job:
        # Request the SSH endpoint before the job is running; exactly one
        # endpoint id is expected back for the single "ssh" request.
        endpoints = utils.create_endpoint(args.rest, args.email, job.jid,
                                          ["ssh"])
        endpoints_ids = list(endpoints.keys())
        assert len(endpoints_ids) == 1
        endpoint_id = endpoints_ids[0]
        state = job.block_until_state_not_in(
            {"unapproved", "queued", "scheduling"})
        assert state == "running"
        # Default wait is for the endpoint's ready/running state — TODO
        # confirm the default `state` in utils.wait_endpoint_state.
        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)
        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"], ssh_endpoint["domain"])
        ssh_port = ssh_endpoint["port"]
        logger.info("current ssh endpoint is %s:%s", ssh_host, ssh_port)
        # Kill the job's pod; the controller launcher should recreate it
        # (possibly on a different node/port).
        pod = utils.kube_get_pods(args.config, "default",
                                  "jobId=%s" % (job.jid))[0]
        utils.kube_delete_pod(args.config, "default", pod.metadata.name)
        # The endpoint must first transition to "pending" (old pod gone) and
        # then become ready again — the two waits in this order prove the
        # endpoint actually cycled rather than never noticing the deletion.
        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id,
                                                 state="pending")
        ssh_endpoint = utils.wait_endpoint_state(args.rest, args.email,
                                                 job.jid, endpoint_id)
        ssh_host = "%s.%s" % (ssh_endpoint["nodeName"], ssh_endpoint["domain"])
        ssh_port = ssh_endpoint["port"]
        logger.info("current ssh endpoint is %s:%s", ssh_host, ssh_port)
        # exec into jobmanager to execute ssh to avoid firewall
        job_manager_pod = utils.kube_get_pods(args.config, "default",
                                              "app=jobmanager")[0]
        job_manager_pod_name = job_manager_pod.metadata.name
        # User alias (local part of the email) names both the SSH login and
        # the key path under /dlwsdata/work.
        alias = args.email.split("@")[0]
        # NOTE(review): ssh_port is passed into the command list as-is —
        # presumably already a string in the endpoint payload; verify.
        cmd = [
            "ssh", "-i", "/dlwsdata/work/%s/.ssh/id_rsa" % alias, "-p",
            ssh_port, "-o", "StrictHostKeyChecking=no", "-o",
            "LogLevel=ERROR", "%s@%s" % (alias, ssh_host), "--", "echo",
            "dummy"
        ]
        code, output = utils.kube_pod_exec(args.config, "default",
                                           job_manager_pod_name, "jobmanager",
                                           cmd)
        # Exit code 0 and the echoed marker prove the SSH session reached the
        # recreated pod end-to-end.
        assert code == 0, "code is %s, output is %s" % (code, output)
        assert output == "dummy\n", "output is %s" % (output)