def get_task_pod_spec(self,
                      volume_mounts,
                      volumes,
                      persistence_outputs=None,
                      persistence_data=None,
                      outputs_refs_jobs=None,
                      outputs_refs_experiments=None,
                      env_vars=None,
                      command=None,
                      args=None,
                      resources=None,
                      node_selector=None,
                      affinity=None,
                      tolerations=None,
                      restart_policy='OnFailure'):
    """Pod spec to be used to create pods for tasks: master, worker, ps.

    Assembles the main container (plus the sidecar container when
    `self.use_sidecar` is set), extends the given volumes/mounts with the
    GPU definitions derived from `resources`, and resolves the scheduling
    options against the `*_JOBS` values from `settings`.

    Returns a `client.V1PodSpec`.
    """
    volume_mounts = get_list(volume_mounts)
    volumes = get_list(volumes)

    # GPU definitions come back as (mounts, volumes), in that order.
    gpu_mounts, gpu_vols = get_gpu_volumes_def(resources)
    volume_mounts += gpu_mounts
    volumes += gpu_vols

    main_container = self.get_pod_container(
        volume_mounts=volume_mounts,
        persistence_outputs=persistence_outputs,
        persistence_data=persistence_data,
        outputs_refs_jobs=outputs_refs_jobs,
        outputs_refs_experiments=outputs_refs_experiments,
        env_vars=env_vars,
        command=command,
        args=args,
        resources=resources)
    containers = [main_container]
    if self.use_sidecar:
        containers.append(self.get_sidecar_container())

    resolved_node_selector = get_node_selector(
        node_selector=node_selector,
        default_node_selector=settings.NODE_SELECTOR_JOBS)
    resolved_affinity = get_affinity(
        affinity=affinity,
        default_affinity=settings.AFFINITY_JOBS)
    resolved_tolerations = get_tolerations(
        tolerations=tolerations,
        default_tolerations=settings.TOLERATIONS_JOBS)

    # A service account is only attached when RBAC is enabled.
    service_account_name = (
        settings.K8S_SERVICE_ACCOUNT_NAME if settings.K8S_RBAC_ENABLED else None)

    init_containers = to_list(self.get_init_container(persistence_outputs))
    return client.V1PodSpec(
        restart_policy=restart_policy,
        service_account_name=service_account_name,
        init_containers=init_containers,
        containers=containers,
        volumes=volumes,
        node_selector=resolved_node_selector,
        affinity=resolved_affinity,
        tolerations=resolved_tolerations)
def start_dockerizer(self,
                     resources=None,
                     node_selector=None,
                     affinity=None,
                     tolerations=None):
    """Create or update the dockerizer pod and return the response as a dict.

    Scheduling options are resolved against the `*_BUILDS` values read from
    `conf`. The pod is built with the configured dockerizer image, receives
    the job uuid as its only argument and uses a `Never` restart policy.
    """
    docker_volumes, docker_volume_mounts = get_docker_volumes()

    build_node_selector = get_node_selector(
        node_selector=node_selector,
        default_node_selector=conf.get('NODE_SELECTOR_BUILDS'))
    build_affinity = get_affinity(
        affinity=affinity,
        default_affinity=conf.get('AFFINITY_BUILDS'))
    build_tolerations = get_tolerations(
        tolerations=tolerations,
        default_tolerations=conf.get('TOLERATIONS_BUILDS'))

    pod_definition = pods.get_pod(
        namespace=self.namespace,
        app=conf.get('APP_LABELS_DOCKERIZER'),
        name=DOCKERIZER_JOB_NAME,
        project_name=self.project_name,
        project_uuid=self.project_uuid,
        job_name=self.job_name,
        job_uuid=self.job_uuid,
        volume_mounts=docker_volume_mounts,
        volumes=docker_volumes,
        image=conf.get('JOB_DOCKERIZER_IMAGE'),
        image_pull_policy=conf.get('JOB_DOCKERIZER_IMAGE_PULL_POLICY'),
        command=None,
        args=[self.job_uuid],
        ports=[],
        env_vars=self.get_env_vars(),
        container_name=conf.get('CONTAINER_NAME_DOCKERIZER_JOB'),
        resources=resources,
        node_selector=build_node_selector,
        affinity=build_affinity,
        tolerations=build_tolerations,
        role=conf.get('ROLE_LABELS_WORKER'),
        type=conf.get('TYPE_LABELS_RUNNER'),
        service_account_name=conf.get('K8S_SERVICE_ACCOUNT_BUILDS'),
        restart_policy='Never')

    pod_name = JOB_NAME_FORMAT.format(job_uuid=self.job_uuid,
                                      name=DOCKERIZER_JOB_NAME)
    # The second element of the result is not needed here.
    response, _ = self.create_or_update_pod(name=pod_name, data=pod_definition)
    return response.to_dict()
def start_dockerizer(self,
                     resources=None,
                     node_selector=None,
                     affinity=None,
                     tolerations=None):
    """Create or update the dockerizer pod and return the response as a dict.

    Scheduling options are resolved against the `*_BUILDS` values from
    `settings`. The pod is built with `settings.JOB_DOCKERIZER_IMAGE`,
    receives the job uuid as its only argument and uses a `Never` restart
    policy.
    """
    docker_volumes, docker_volume_mounts = get_docker_volumes()

    build_node_selector = get_node_selector(
        node_selector=node_selector,
        default_node_selector=settings.NODE_SELECTOR_BUILDS)
    build_affinity = get_affinity(
        affinity=affinity,
        default_affinity=settings.AFFINITY_BUILDS)
    build_tolerations = get_tolerations(
        tolerations=tolerations,
        default_tolerations=settings.TOLERATIONS_BUILDS)

    # What gets built here is a pod definition, not a deployment.
    pod = pods.get_pod(
        namespace=self.namespace,
        app=settings.APP_LABELS_DOCKERIZER,
        name=self.DOCKERIZER_JOB_NAME,
        project_name=self.project_name,
        project_uuid=self.project_uuid,
        job_name=self.job_name,
        job_uuid=self.job_uuid,
        volume_mounts=docker_volume_mounts,
        volumes=docker_volumes,
        image=settings.JOB_DOCKERIZER_IMAGE,
        command=None,
        args=[self.job_uuid],
        ports=[],
        env_vars=self.get_env_vars(),
        container_name=settings.CONTAINER_NAME_DOCKERIZER_JOB,
        resources=resources,
        node_selector=build_node_selector,
        affinity=build_affinity,
        tolerations=build_tolerations,
        role=settings.ROLE_LABELS_WORKER,
        type=settings.TYPE_LABELS_RUNNER,
        restart_policy='Never')

    pod_name = constants.JOB_NAME.format(job_uuid=self.job_uuid,
                                         name=self.DOCKERIZER_JOB_NAME)
    # The second element of the result is not needed here.
    response, _ = self.create_or_update_pod(name=pod_name, data=pod)
    return response.to_dict()
def _get_affinity(self, affinity):
    """Resolve `affinity` against the default stored in `conf` under `AFFINITIES_EXPERIMENTS`."""
    default_affinity = conf.get(AFFINITIES_EXPERIMENTS)
    return get_affinity(affinity=affinity, default_affinity=default_affinity)
def start_notebook(self,
                   image,
                   persistence_outputs=None,
                   persistence_data=None,
                   outputs_refs_jobs=None,
                   outputs_refs_experiments=None,
                   resources=None,
                   secret_refs=None,
                   configmap_refs=None,
                   node_selector=None,
                   affinity=None,
                   tolerations=None,
                   allow_commits=False):
    """Create or update the notebook deployment and its service.

    When `self._use_ingress()` is truthy, an ingress routing
    `/notebook/<project name with '.' replaced by '/'>` to the service is
    created or updated as well.

    Returns a dict with the `deployment` and `service` responses converted
    through `to_dict()`.
    """
    ports = [self.request_notebook_port()]
    target_ports = [self.PORT]

    volumes, volume_mounts = get_pod_volumes(
        persistence_outputs=persistence_outputs,
        persistence_data=persistence_data)

    # Outputs of referenced jobs first, then of referenced experiments.
    for outputs_refs in (outputs_refs_jobs, outputs_refs_experiments):
        refs_volumes, refs_volume_mounts = get_pod_refs_outputs_volumes(
            outputs_refs=outputs_refs,
            persistence_outputs=persistence_outputs)
        volumes += refs_volumes
        volume_mounts += refs_volume_mounts

    shm_volumes, shm_volume_mounts = get_shm_volumes()
    volumes += shm_volumes
    volume_mounts += shm_volume_mounts

    notebook_outputs_path = get_notebook_job_outputs_path(
        persistence_outputs=persistence_outputs,
        notebook_job=self.job_name)
    env_vars = get_job_env_vars(
        persistence_outputs=persistence_outputs,
        outputs_path=notebook_outputs_path,
        persistence_data=persistence_data,
        outputs_refs_jobs=outputs_refs_jobs,
        outputs_refs_experiments=outputs_refs_experiments)

    secret_refs = validate_secret_refs(secret_refs)
    configmap_refs = validate_configmap_refs(configmap_refs)
    env_from = get_pod_env_from(secret_refs=secret_refs,
                                configmap_refs=configmap_refs)

    code_volume, code_volume_mount = self.get_notebook_code_volume()
    volumes.append(code_volume)
    volume_mounts.append(code_volume_mount)

    # The same name is shared by the deployment, the service and the ingress.
    resource_name = JOB_NAME_FORMAT.format(name=NOTEBOOK_JOB_NAME,
                                           job_uuid=self.job_uuid)

    resolved_node_selector = get_node_selector(
        node_selector=node_selector,
        default_node_selector=settings.NODE_SELECTOR_EXPERIMENTS)
    resolved_affinity = get_affinity(
        affinity=affinity,
        default_affinity=settings.AFFINITY_EXPERIMENTS)
    resolved_tolerations = get_tolerations(
        tolerations=tolerations,
        default_tolerations=settings.TOLERATIONS_EXPERIMENTS)

    deployment = deployments.get_deployment(
        namespace=self.namespace,
        app=settings.APP_LABELS_NOTEBOOK,
        name=NOTEBOOK_JOB_NAME,
        project_name=self.project_name,
        project_uuid=self.project_uuid,
        job_name=self.job_name,
        job_uuid=self.job_uuid,
        volume_mounts=volume_mounts,
        volumes=volumes,
        image=image,
        command=["/bin/sh", "-c"],
        args=self.get_notebook_args(deployment_name=resource_name,
                                    ports=ports,
                                    allow_commits=allow_commits),
        ports=target_ports,
        container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
        env_vars=env_vars,
        env_from=env_from,
        resources=resources,
        node_selector=resolved_node_selector,
        affinity=resolved_affinity,
        tolerations=resolved_tolerations,
        role=settings.ROLE_LABELS_DASHBOARD,
        type=settings.TYPE_LABELS_RUNNER,
        service_account_name=settings.K8S_SERVICE_ACCOUNT_EXPERIMENTS)
    labels = deployments.get_labels(
        app=settings.APP_LABELS_NOTEBOOK,
        project_name=self.project_name,
        project_uuid=self.project_uuid,
        job_name=self.job_name,
        job_uuid=self.job_uuid,
        role=settings.ROLE_LABELS_DASHBOARD,
        type=settings.TYPE_LABELS_RUNNER)
    deployment_response, _ = self.create_or_update_deployment(
        name=resource_name, data=deployment)

    service = services.get_service(
        namespace=self.namespace,
        name=resource_name,
        labels=labels,
        ports=ports,
        target_ports=target_ports,
        service_type=self._get_service_type())
    service_response, _ = self.create_or_update_service(
        name=resource_name, data=service)

    results = {
        'deployment': deployment_response.to_dict(),
        'service': service_response.to_dict(),
    }

    if not self._use_ingress():
        return results

    annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
    backend = {'serviceName': resource_name, 'servicePort': ports[0]}
    paths = [{
        'path': '/notebook/{}'.format(self.project_name.replace('.', '/')),
        'backend': backend,
    }]
    ingress = ingresses.get_ingress(
        namespace=self.namespace,
        name=resource_name,
        labels=labels,
        annotations=annotations,
        paths=paths)
    self.create_or_update_ingress(name=resource_name, data=ingress)
    return results
def _get_affinity(self, affinity):
    """Resolve `affinity` against the default stored in `conf` under 'AFFINITY_JOBS'."""
    default_affinity = conf.get('AFFINITY_JOBS')
    return get_affinity(affinity=affinity, default_affinity=default_affinity)
def _get_affinity(self, affinity):
    """Resolve `affinity` against the default stored in `conf` under 'AFFINITY_TENSORBOARDS'."""
    default_affinity = conf.get('AFFINITY_TENSORBOARDS')
    return get_affinity(affinity=affinity, default_affinity=default_affinity)
def _get_affinity(self, affinity):
    """Resolve `affinity` against the default stored in `conf` under `AFFINITIES_JOBS`."""
    default_affinity = conf.get(AFFINITIES_JOBS)
    return get_affinity(affinity=affinity, default_affinity=default_affinity)
def _get_affinity(self, affinity):
    """Resolve `affinity` against the default stored in `conf` under 'AFFINITY_EXPERIMENTS'."""
    default_affinity = conf.get('AFFINITY_EXPERIMENTS')
    return get_affinity(affinity=affinity, default_affinity=default_affinity)
def _get_affinity(self, affinity):
    """Resolve `affinity` against the default stored in `conf` under `AFFINITIES_TENSORBOARDS`."""
    default_affinity = conf.get(AFFINITIES_TENSORBOARDS)
    return get_affinity(affinity=affinity, default_affinity=default_affinity)
def get_task_pod_spec(self,
                      task_type,
                      task_idx,
                      volume_mounts,
                      volumes,
                      env_vars=None,
                      command=None,
                      args=None,
                      sidecar_args=None,
                      persistence_outputs=None,
                      persistence_data=None,
                      outputs_refs_jobs=None,
                      outputs_refs_experiments=None,
                      resources=None,
                      node_selector=None,
                      affinity=None,
                      tolerations=None,
                      restart_policy='OnFailure'):
    """Pod spec to be used to create pods for tasks: master, worker, ps.

    Besides the given env vars, the main container receives one extra
    variable named `constants.CONFIG_MAP_TASK_INFO_KEY_NAME` whose value is
    the JSON encoding of `{'type': task_type, 'index': task_idx}`. The
    sidecar container is added when `self.use_sidecar` is set, and the
    scheduling options are resolved against the `*_EXPERIMENTS` values from
    `settings`.

    Returns a `client.V1PodSpec`.
    """
    volume_mounts = get_list(volume_mounts)
    volumes = get_list(volumes)

    # GPU definitions come back as (mounts, volumes), in that order.
    gpu_mounts, gpu_vols = get_gpu_volumes_def(resources)
    volume_mounts += gpu_mounts
    volumes += gpu_vols

    # Add job information
    task_info = json.dumps({'type': task_type, 'index': task_idx})
    env_vars = get_list(env_vars)
    env_vars.append(
        client.V1EnvVar(name=constants.CONFIG_MAP_TASK_INFO_KEY_NAME,
                        value=task_info))

    main_container = self.get_pod_container(
        volume_mounts=volume_mounts,
        env_vars=env_vars,
        command=command,
        args=args,
        persistence_outputs=persistence_outputs,
        persistence_data=persistence_data,
        outputs_refs_jobs=outputs_refs_jobs,
        outputs_refs_experiments=outputs_refs_experiments,
        resources=resources)
    containers = [main_container]
    if self.use_sidecar:
        containers.append(
            self.get_sidecar_container(task_type=task_type,
                                       task_idx=task_idx,
                                       args=sidecar_args))

    resolved_node_selector = get_node_selector(
        node_selector=node_selector,
        default_node_selector=settings.NODE_SELECTOR_EXPERIMENTS)
    resolved_affinity = get_affinity(
        affinity=affinity,
        default_affinity=settings.AFFINITY_EXPERIMENTS)
    resolved_tolerations = get_tolerations(
        tolerations=tolerations,
        default_tolerations=settings.TOLERATIONS_EXPERIMENTS)

    # A service account is only attached when RBAC is enabled.
    service_account_name = (
        settings.K8S_SERVICE_ACCOUNT_NAME if settings.K8S_RBAC_ENABLED else None)

    init_containers = to_list(self.get_init_container(persistence_outputs))
    return client.V1PodSpec(
        restart_policy=restart_policy,
        service_account_name=service_account_name,
        init_containers=init_containers,
        containers=containers,
        volumes=volumes,
        node_selector=resolved_node_selector,
        tolerations=resolved_tolerations,
        affinity=resolved_affinity)
def test_pod_affinity(self):
    """Check that `get_affinity` prefers the explicit value over the default."""
    # With neither an affinity nor a default there is nothing to return.
    assert get_affinity(None, None) is None

    # (affinity, default_affinity) pairs that must all resolve to the same dict.
    cases = (
        ({'foo': 'bar'}, None),
        (None, {"foo": "bar"}),
        ({'foo': 'bar'}, {"foo": "moo"}),
    )
    for given_affinity, default_affinity in cases:
        assert get_affinity(given_affinity, default_affinity) == {'foo': 'bar'}
def start_tensorboard(self,
                      image,
                      outputs_path,
                      persistence_outputs,
                      outputs_specs=None,
                      outputs_refs_jobs=None,
                      outputs_refs_experiments=None,
                      resources=None,
                      node_selector=None,
                      affinity=None,
                      tolerations=None):
    """Create or update the tensorboard deployment and its service.

    The container command is the stores-secrets command args followed by the
    `tensorboard` invocation on `outputs_path` and `self.PORT`, joined with
    ` && ` and run through `/bin/sh -c`. When `self._use_ingress()` is
    truthy, an ingress routing
    `/tensorboard/<project name with '.' replaced by '/'>` to the service is
    created or updated as well.

    Returns a dict with the `deployment` and `service` responses converted
    through `to_dict()`.
    """
    ports = [self.request_tensorboard_port()]
    target_ports = [self.PORT]

    volumes, volume_mounts = get_pod_outputs_volume(persistence_outputs)

    # Referenced jobs first, then the outputs specs, then referenced experiments.
    for outputs_refs in (outputs_refs_jobs, outputs_specs, outputs_refs_experiments):
        refs_volumes, refs_volume_mounts = get_pod_refs_outputs_volumes(
            outputs_refs=outputs_refs,
            persistence_outputs=persistence_outputs)
        volumes += refs_volumes
        volume_mounts += refs_volume_mounts

    # Add volumes for persistence outputs secrets
    stores_secrets = get_stores_secrets(specs=outputs_specs)
    self.validate_stores_secrets_keys(stores_secrets=stores_secrets)
    secrets_volumes, secrets_volume_mounts = self.get_stores_secrets_volumes(
        stores_secrets=stores_secrets)
    volumes += secrets_volumes
    volume_mounts += secrets_volume_mounts

    # Get persistence outputs secrets auth commands
    command_args = self.get_stores_secrets_command_args(
        stores_secrets=stores_secrets)
    tensorboard_command = "tensorboard --logdir={} --port={}".format(
        outputs_path, self.PORT)
    command_args.append(tensorboard_command)

    resolved_node_selector = get_node_selector(
        node_selector=node_selector,
        default_node_selector=settings.NODE_SELECTOR_TENSORBOARDS)
    resolved_affinity = get_affinity(
        affinity=affinity,
        default_affinity=settings.AFFINITY_TENSORBOARDS)
    resolved_tolerations = get_tolerations(
        tolerations=tolerations,
        default_tolerations=settings.TOLERATIONS_TENSORBOARDS)

    deployment = deployments.get_deployment(
        namespace=self.namespace,
        app=settings.APP_LABELS_TENSORBOARD,
        name=TENSORBOARD_JOB_NAME,
        project_name=self.project_name,
        project_uuid=self.project_uuid,
        job_name=self.job_name,
        job_uuid=self.job_uuid,
        volume_mounts=volume_mounts,
        volumes=volumes,
        image=image,
        command=["/bin/sh", "-c"],
        args=[' && '.join(command_args)],
        ports=target_ports,
        container_name=settings.CONTAINER_NAME_PLUGIN_JOB,
        resources=resources,
        node_selector=resolved_node_selector,
        affinity=resolved_affinity,
        tolerations=resolved_tolerations,
        role=settings.ROLE_LABELS_DASHBOARD,
        type=settings.TYPE_LABELS_RUNNER)

    # The same name is shared by the deployment, the service and the ingress.
    resource_name = JOB_NAME_FORMAT.format(name=TENSORBOARD_JOB_NAME,
                                           job_uuid=self.job_uuid)
    labels = deployments.get_labels(
        app=settings.APP_LABELS_TENSORBOARD,
        project_name=self.project_name,
        project_uuid=self.project_uuid,
        job_name=self.job_name,
        job_uuid=self.job_uuid,
        role=settings.ROLE_LABELS_DASHBOARD,
        type=settings.TYPE_LABELS_RUNNER)
    deployment_response, _ = self.create_or_update_deployment(
        name=resource_name, data=deployment)

    service = services.get_service(
        namespace=self.namespace,
        name=resource_name,
        labels=labels,
        ports=ports,
        target_ports=target_ports,
        service_type=self._get_service_type())
    service_response, _ = self.create_or_update_service(
        name=resource_name, data=service)

    results = {
        'deployment': deployment_response.to_dict(),
        'service': service_response.to_dict(),
    }

    if not self._use_ingress():
        return results

    annotations = json.loads(settings.K8S_INGRESS_ANNOTATIONS)
    backend = {'serviceName': resource_name, 'servicePort': ports[0]}
    paths = [{
        'path': '/tensorboard/{}'.format(self.project_name.replace('.', '/')),
        'backend': backend,
    }]
    ingress = ingresses.get_ingress(
        namespace=self.namespace,
        name=resource_name,
        labels=labels,
        annotations=annotations,
        paths=paths)
    self.create_or_update_ingress(name=resource_name, data=ingress)
    return results