def tolerations(self):
    """Collect the tolerations configured for each TensorFlow task type.

    Returns:
        A dict with three entries, in this order:

        * ``TaskType.MASTER`` -> ``{0: self.spec.master_tolerations}``
        * ``TaskType.WORKER`` -> result of
          ``TensorflowSpecification.get_worker_tolerations``
        * ``TaskType.PS`` -> result of
          ``TensorflowSpecification.get_ps_tolerations``
    """
    spec = self.spec
    cluster, is_distributed = spec.cluster_def

    # Both lookups take exactly the same keyword arguments.
    lookup_kwargs = dict(
        environment=spec.config.tensorflow,
        cluster=cluster,
        is_distributed=is_distributed,
    )
    for_workers = TensorflowSpecification.get_worker_tolerations(**lookup_kwargs)
    for_ps = TensorflowSpecification.get_ps_tolerations(**lookup_kwargs)

    by_task_type = {}
    # The master tolerations are stored under index 0.
    # NOTE(review): presumably the worker/ps values are index-keyed mappings
    # too -- confirm against TensorflowSpecification.
    by_task_type[TaskType.MASTER] = {0: spec.master_tolerations}
    by_task_type[TaskType.WORKER] = for_workers
    by_task_type[TaskType.PS] = for_ps
    return by_task_type
def create_tensorflow_experiment_jobs(experiment, spawner):
    """Call ``create_job`` for every replica of a TensorflowSpecification experiment.

    One job is created for the master, then one per worker uuid, then one
    per parameter-server uuid, in that order.

    Args:
        experiment: The experiment the jobs belong to; its ``backend`` decides
            whether the master is registered with the ``TaskType.CHIEF`` role
            (Kubeflow backend) or the ``TaskType.MASTER`` role (otherwise).
        spawner: Provides ``job_uuids`` (indexed by task type) and ``spec``,
            from which resources, node selectors, affinities and tolerations
            are read.
    """
    master_uuid = spawner.job_uuids[TaskType.MASTER][0]
    spec = spawner.spec

    # The master replica is labelled "chief" when running on Kubeflow.
    master_role = (
        TaskType.CHIEF
        if experiment.backend == ExperimentBackend.KUBEFLOW
        else TaskType.MASTER
    )
    create_job(
        job_uuid=master_uuid,
        experiment=experiment,
        role=master_role,
        resources=spec.master_resources,
        node_selector=spec.master_node_selector,
        affinity=spec.master_affinity,
        tolerations=spec.master_tolerations,
    )

    cluster, is_distributed = spec.cluster_def
    # Every per-replica lookup below receives the same keyword arguments.
    lookup_kwargs = dict(
        environment=spec.config.tensorflow,
        cluster=cluster,
        is_distributed=is_distributed,
    )

    def _create_replica_jobs(role, resources, node_selectors, affinities, tolerations):
        # One job per uuid; the position in the uuid list is both the job's
        # sequence number and the key into each per-replica mapping.
        for index, replica_uuid in enumerate(spawner.job_uuids[role]):
            create_job(
                job_uuid=replica_uuid,
                experiment=experiment,
                role=role,
                sequence=index,
                resources=resources.get(index),
                node_selector=node_selectors.get(index),
                affinity=affinities.get(index),
                tolerations=tolerations.get(index),
            )

    _create_replica_jobs(
        TaskType.WORKER,
        TensorflowSpecification.get_worker_resources(**lookup_kwargs),
        TensorflowSpecification.get_worker_node_selectors(**lookup_kwargs),
        TensorflowSpecification.get_worker_affinities(**lookup_kwargs),
        TensorflowSpecification.get_worker_tolerations(**lookup_kwargs),
    )
    _create_replica_jobs(
        TaskType.PS,
        TensorflowSpecification.get_ps_resources(**lookup_kwargs),
        TensorflowSpecification.get_ps_node_selectors(**lookup_kwargs),
        TensorflowSpecification.get_ps_affinities(**lookup_kwargs),
        TensorflowSpecification.get_ps_tolerations(**lookup_kwargs),
    )