Example #1
def _submit_job(image_name, client_args, container_args):
    client = k8s.Client(
        image_name=image_name,
        namespace=client_args.namespace,
        job_name=client_args.job_name,
        event_callback=None,
        cluster_spec=client_args.cluster_spec,
    )

    client.create_master(
        resource_requests=client_args.master_resource_request,
        resource_limits=client_args.master_resource_limit,
        args=container_args,
        pod_priority=client_args.master_pod_priority,
        image_pull_policy=client_args.image_pull_policy,
        restart_policy=client_args.restart_policy,
        volume=client_args.volume,
        envs=parse_envs(client_args.envs),
    )
    logger.info(
        "ElasticDL job %s was successfully submitted. The master pod is: %s." %
        (client_args.job_name, client.get_master_pod_name()))
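
The call above passes envs=parse_envs(client_args.envs), but parse_envs itself is not shown. As a rough illustration only, a helper with that role could turn a comma-separated "NAME=value" string into a dict; this is a minimal sketch inferred from the call site, not ElasticDL's actual implementation, which may build Kubernetes env-var objects and handle more edge cases.

def parse_envs(env_str):
    # Hypothetical sketch: turn "NAME1=value1,NAME2=value2" into a dict.
    envs = {}
    if not env_str:
        return envs
    for kv in env_str.split(","):
        name, _, value = kv.partition("=")
        envs[name.strip()] = value.strip()
    return envs
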
Example #2
def _submit_job(image_name, client_args, container_args):
    client = k8s.Client(
        image_name=image_name,
        namespace=client_args.namespace,
        job_name=client_args.job_name,
        event_callback=None,
        cluster_spec=client_args.cluster_spec,
        force_use_kube_config_file=client_args.force_use_kube_config_file,
    )

    if client_args.yaml:
        client.dump_master_yaml(
            resource_requests=client_args.master_resource_request,
            resource_limits=client_args.master_resource_limit,
            args=container_args,
            pod_priority=client_args.master_pod_priority,
            image_pull_policy=client_args.image_pull_policy,
            restart_policy=client_args.restart_policy,
            volume=client_args.volume,
            envs=parse_envs(client_args.envs),
            yaml=client_args.yaml,
        )
        logger.info("ElasticDL job %s YAML has been dumped into file %s." %
                    (client_args.job_name, client_args.yaml))
    else:
        client.create_master(
            resource_requests=client_args.master_resource_request,
            resource_limits=client_args.master_resource_limit,
            args=container_args,
            pod_priority=client_args.master_pod_priority,
            image_pull_policy=client_args.image_pull_policy,
            restart_policy=client_args.restart_policy,
            volume=client_args.volume,
            envs=parse_envs(client_args.envs),
        )
        logger.info("ElasticDL job %s was successfully submitted. "
                    "The master pod is: %s." %
                    (client_args.job_name, client.get_master_pod_name()))
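
For context, the client_args object in this variant needs at least the attributes referenced above (including yaml and force_use_kube_config_file). Below is a hedged sketch of an argparse setup that would produce such a namespace; the flag names and defaults are assumptions for illustration, not ElasticDL's actual CLI definition.

import argparse

def build_client_arg_parser():
    # Hypothetical flags; only the attribute names are taken from the example.
    parser = argparse.ArgumentParser(description="Submit an ElasticDL job")
    parser.add_argument("--namespace", default="default")
    parser.add_argument("--job_name", required=True)
    parser.add_argument("--cluster_spec", default="")
    parser.add_argument("--force_use_kube_config_file", action="store_true")
    parser.add_argument("--master_resource_request", default="cpu=1,memory=4096Mi")
    parser.add_argument("--master_resource_limit", default="cpu=1,memory=4096Mi")
    parser.add_argument("--master_pod_priority", default=None)
    parser.add_argument("--image_pull_policy", default=None)
    parser.add_argument("--restart_policy", default="Never")
    parser.add_argument("--volume", default=None)
    parser.add_argument("--envs", default="")
    parser.add_argument(
        "--yaml",
        default=None,
        help="Dump the master pod YAML to this file instead of submitting",
    )
    return parser
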
Example #3
    def start_embedding_pod_and_redis(
        self,
        command,
        args,
        embedding_service_id=0,
        resource_request="cpu=1,memory=4096Mi",
        resource_limit="cpu=1,memory=4096Mi",
        pod_priority=None,
        volume=None,
        image_pull_policy=None,
        restart_policy="Never",
        **kargs,
    ):
        logger.info("Starting pod for embedding service ...")
        self._k8s_client = k8s.Client(event_callback=None, **kargs)
        pod = self._k8s_client.create_embedding_service(
            worker_id=embedding_service_id,
            resource_requests=resource_request,
            resource_limits=resource_limit,
            pod_priority=pod_priority,
            volume=volume,
            image_pull_policy=image_pull_policy,
            command=command,
            args=args,
            restart_policy=restart_policy,
        )

        # TODO: assign the address using the pod's domain name instead of
        # the pod's IP, and do not hard-code the ports.
        address_ip = pod.status.pod_ip
        # Poll until the pod has been assigned an IP address.
        while not address_ip:
            pod = self._k8s_client.get_embedding_service_pod(
                embedding_service_id)
            address_ip = pod.status.pod_ip
        self._embedding_service_endpoint = {
            address_ip: [30001 + i for i in range(6)]
        }
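
The endpoint built above maps the embedding-service pod's IP to the six fixed ports 30001-30006. A small helper (hypothetical, named here only for illustration) can flatten such a mapping into "ip:port" strings, which is the form a Redis client typically expects.

def endpoint_to_addresses(endpoint):
    # Hypothetical helper: {"10.0.0.5": [30001, 30002]} -> ["10.0.0.5:30001", ...]
    return [
        "%s:%d" % (ip, port)
        for ip, ports in endpoint.items()
        for port in ports
    ]

# Usage sketch:
# endpoint_to_addresses({"10.0.0.5": [30001 + i for i in range(6)]})
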
Example #4
    def __init__(self,
                 task_d,
                 num_workers=1,
                 worker_command=None,
                 worker_args=None,
                 worker_resource_request="cpu=1,memory=4096Mi",
                 worker_resource_limit="cpu=1,memory=4096Mi",
                 worker_pod_priority=None,
                 num_ps=0,
                 ps_command=None,
                 ps_args=None,
                 ps_resource_request="cpu=1,memory=4096Mi",
                 ps_resource_limit="cpu=1,memory=4096Mi",
                 ps_pod_priority=None,
                 volume=None,
                 image_pull_policy=None,
                 restart_policy="Never",
                 envs=None,
                 **kwargs):
        self._num_workers = num_workers
        self._worker_command = worker_command
        self._worker_args = worker_args
        self._worker_resource_request = worker_resource_request
        self._worker_resource_limit = worker_resource_limit
        self._worker_pod_priority = worker_pod_priority

        self._num_ps = num_ps
        self._ps_command = ps_command
        self._ps_args = ps_args
        self._ps_resource_request = ps_resource_request
        self._ps_resource_limit = ps_resource_limit
        self._ps_pod_priority = ps_pod_priority

        self._restart_policy = restart_policy
        self._volume = volume
        self._image_pull_policy = image_pull_policy
        self._envs = envs
        self._task_d = task_d
        self._next_worker_id = itertools.count().__next__

        # Protects the following variables, which are accessed from event_cb.
        self._lock = threading.Lock()
        # worker id to (pod name, phase) mapping
        # phase: None/Pending/Running/Succeeded/Failed/Unknown
        #   None: worker was just launched; no event has been received yet.
        #   Pending: worker pod not started yet
        #   Running: worker pod is running
        #   Succeeded: worker pod finishes all tasks and terminates with
        #       no issue.
        #   Failed: worker pod is killed for some reason
        #   Unknown: unknown
        self._worker_pods_phase = {}
        # pod name to worker id mapping
        self._worker_pod_name_to_id = {}

        self._relaunch_deleted_live_worker = True

        self._ps_pods_phase = {}
        self._ps_pod_name_to_id = {}
        self._relaunch_deleted_live_ps = True

        self._failed_pods = []

        self._k8s_client = k8s.Client(event_callback=self._event_cb, **kwargs)
        self._ps_addrs = self._get_addrs(
            self._num_ps, self._k8s_client.get_ps_service_address)
        # TODO: Select a worker address to be used for broadcasting model
        # parameters under allreduce-strategy.
        self._worker_addrs = self._get_addrs(
            self._num_workers, self._k8s_client.get_worker_service_address)
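
The _get_addrs helper is not shown here. Judging only from its call sites, it presumably collects one service address per replica index by calling the getter passed in. A minimal standalone sketch under that assumption follows; the real ElasticDL method may differ (for example, by joining the addresses into a single string).

def get_addrs(num_addrs, addr_get_fn):
    # Hypothetical sketch: call the getter once per replica index.
    return [addr_get_fn(addr_id) for addr_id in range(num_addrs)]

# Usage sketch with a fake getter:
# get_addrs(2, lambda i: "elasticdl-ps-%d.default.svc:2222" % i)
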
Example #5
    def test_client(self):
        tracker = WorkerTracker()

        c = k8s.Client(
            image_name="gcr.io/google-samples/hello-app:1.0",
            namespace="default",
            job_name="test-job-%d-%d" %
            (int(time.time()), random.randint(1, 101)),
            event_callback=tracker.event_cb,
        )

        # Start master
        resource = "cpu=100m,memory=64M"
        c.create_master(
            resource_requests=resource,
            resource_limits=resource,
            pod_priority=None,
            args=None,
            volume=None,
            image_pull_policy="Never",
            restart_policy="Never",
        )
        while tracker._count < 1:
            time.sleep(1)

        # Check master pod labels
        master = c.get_master_pod()
        self.assertEqual(master.metadata.labels[k8s.ELASTICDL_JOB_KEY],
                         c.job_name)
        self.assertEqual(
            master.metadata.labels[k8s.ELASTICDL_REPLICA_TYPE_KEY], "master")
        self.assertEqual(
            master.metadata.labels[k8s.ELASTICDL_REPLICA_INDEX_KEY], "0")

        # Start 3 workers
        for i in range(3):
            _ = c.create_worker(
                worker_id=str(i),
                resource_requests=resource,
                resource_limits=resource,
                command=["echo"],
                pod_priority=None,
                args=None,
                volume=None,
                image_pull_policy="Never",
                restart_policy="Never",
                expose_ports=False,
            )
            time.sleep(5)

        # Wait for workers to be added
        while tracker._count < 4:
            time.sleep(1)

        # Check worker pods labels
        for i in range(3):
            worker = c.get_worker_pod(i)
            self.assertEqual(worker.metadata.labels[k8s.ELASTICDL_JOB_KEY],
                             c.job_name)
            self.assertEqual(
                worker.metadata.labels[k8s.ELASTICDL_REPLICA_TYPE_KEY],
                "worker",
            )
            self.assertEqual(
                worker.metadata.labels[k8s.ELASTICDL_REPLICA_INDEX_KEY],
                str(i))

        # Start 3 worker services
        for i in range(3):
            c.create_worker_service(i)

        # Check worker services
        for i in range(3):
            service = c.get_worker_service(i)
            self.assertIsNotNone(service)
            self.assertEqual(service.spec.selector[k8s.ELASTICDL_JOB_KEY],
                             c.job_name)
            self.assertEqual(
                service.spec.selector[k8s.ELASTICDL_REPLICA_TYPE_KEY],
                "worker")
            self.assertEqual(
                service.spec.selector[k8s.ELASTICDL_REPLICA_INDEX_KEY], str(i))

        # Start 2 ps pods
        for i in range(2):
            _ = c.create_ps(
                ps_id=str(i),
                resource_requests=resource,
                resource_limits=resource,
                command=["echo"],
                pod_priority=None,
                args=None,
                volume=None,
                image_pull_policy="Never",
                restart_policy="Never",
                expose_ports=False,
            )
            time.sleep(5)

        # Wait for ps to be added
        while tracker._count < 6:
            time.sleep(1)

        # Check ps pods labels
        for i in range(2):
            ps = c.get_ps_pod(i)
            self.assertEqual(ps.metadata.labels[k8s.ELASTICDL_JOB_KEY],
                             c.job_name)
            self.assertEqual(
                ps.metadata.labels[k8s.ELASTICDL_REPLICA_TYPE_KEY], "ps")
            self.assertEqual(
                ps.metadata.labels[k8s.ELASTICDL_REPLICA_INDEX_KEY], str(i))

        # Start 2 ps services
        for i in range(2):
            c.create_ps_service(i)

        # Check ps services
        for i in range(2):
            service = c.get_ps_service(i)
            self.assertIsNotNone(service)
            self.assertEqual(service.spec.selector[k8s.ELASTICDL_JOB_KEY],
                             c.job_name)
            self.assertEqual(
                service.spec.selector[k8s.ELASTICDL_REPLICA_TYPE_KEY], "ps")
            self.assertEqual(
                service.spec.selector[k8s.ELASTICDL_REPLICA_INDEX_KEY], str(i))

        # Delete the master; all ps and worker pods should be deleted as well
        c.delete_master()

        # Wait for all ps, workers, and services to be deleted
        while tracker._count > 0:
            time.sleep(1)
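
The test relies on a WorkerTracker whose event_cb keeps a running count of live pods. A plausible sketch is shown below, assuming the callback receives Kubernetes watch events as dicts with a "type" key (ADDED/MODIFIED/DELETED); the real test helper may differ in detail.

import threading

class WorkerTracker(object):
    # Hypothetical sketch: count pods seen via watch events so the test can
    # wait for pods to appear and, after delete_master(), to disappear.
    def __init__(self):
        self._count = 0
        self._lock = threading.Lock()

    def event_cb(self, event):
        with self._lock:
            if event["type"] == "ADDED":
                self._count += 1
            elif event["type"] == "DELETED":
                self._count -= 1
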
Example #6
    def __init__(self,
                 task_d,
                 rendezvous_server=None,
                 num_workers=1,
                 worker_command=None,
                 worker_args=None,
                 worker_resource_request="cpu=1,memory=4096Mi",
                 worker_resource_limit="cpu=1,memory=4096Mi",
                 worker_pod_priority=None,
                 num_ps=0,
                 ps_command=None,
                 ps_args=None,
                 ps_resource_request="cpu=1,memory=4096Mi",
                 ps_resource_limit="cpu=1,memory=4096Mi",
                 ps_pod_priority=None,
                 volume=None,
                 image_pull_policy=None,
                 restart_policy="Never",
                 envs=None,
                 disable_relaunch=False,
                 log_file_path=None,
                 **kwargs):
        self._num_workers = num_workers
        self._worker_command = worker_command
        self._worker_args = worker_args
        self._worker_resource_request = worker_resource_request
        self._worker_resource_limit = worker_resource_limit
        self._worker_pod_priority = _parse_worker_pod_priority(
            self._num_workers, worker_pod_priority)

        self._num_ps = num_ps
        self._ps_command = ps_command
        self._ps_args = ps_args
        self._ps_resource_request = ps_resource_request
        self._ps_resource_limit = ps_resource_limit
        self._ps_pod_priority = ps_pod_priority

        self._restart_policy = restart_policy
        self._volume = volume
        self._image_pull_policy = image_pull_policy
        self._envs = envs
        self._task_d = task_d
        self._rendezvous_server = rendezvous_server
        self._next_worker_id = itertools.count().__next__
        self._log_file_path = log_file_path

        # Protects the following variables, which are accessed from event_cb.
        self._lock = threading.Lock()

        self._init_worker_pod_status()

        if disable_relaunch:
            self._k8s_client = k8s.Client(**kwargs)
        else:
            self._k8s_client = k8s.Client(
                event_callback=self._event_cb,
                periodic_call_func=self._process_worker,
                **kwargs)
        self._ps_addrs = self._get_addrs(
            self._num_ps, self._k8s_client.get_ps_service_address)
        self._worker_addrs = []
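
_parse_worker_pod_priority is not shown in this example. Based only on its call site, a minimal sketch would expand a single priority value into a per-worker mapping; the actual ElasticDL helper may support richer formats (for instance, assigning different priorities to different workers), so treat the following as an assumption.

def _parse_worker_pod_priority(num_workers, worker_pod_priority):
    # Hypothetical sketch: give every worker the same pod priority.
    return {worker_id: worker_pod_priority for worker_id in range(num_workers)}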