def tolerations(self):
    """Return the tolerations of every task of the experiment, grouped by role.

    Returns:
        A dict keyed by ``TaskType``. The master entry is a dict holding
        ``self.spec.master_tolerations`` under index ``0``; the worker and
        server entries are the values returned by
        ``MXNetSpecification.get_worker_tolerations`` and
        ``MXNetSpecification.get_ps_tolerations`` for the current cluster
        definition.
    """
    cluster, is_distributed = self.spec.cluster_def

    # Both specification lookups take exactly the same keyword arguments.
    lookup_kwargs = {
        'environment': self.spec.environment,
        'cluster': cluster,
        'is_distributed': is_distributed,
    }
    per_worker = MXNetSpecification.get_worker_tolerations(**lookup_kwargs)
    per_server = MXNetSpecification.get_ps_tolerations(**lookup_kwargs)

    by_role = {}
    # There is a single master, so it is registered under index 0.
    by_role[TaskType.MASTER] = {0: self.spec.master_tolerations}
    by_role[TaskType.WORKER] = per_worker
    by_role[TaskType.SERVER] = per_server
    return by_role
def affinities(self):
    """Return the affinity of every task of the experiment, grouped by role.

    Returns:
        A dict keyed by ``TaskType``. The master entry is a dict holding
        ``self.spec.master_affinity`` under index ``0``; the worker and
        server entries are the values returned by
        ``MXNetSpecification.get_worker_affinities`` and
        ``MXNetSpecification.get_ps_affinities`` for the current cluster
        definition.
    """
    cluster, is_distributed = self.spec.cluster_def

    # NOTE(review): the environment passed here is ``self.spec.config.mxnet``,
    # while ``tolerations`` in this file passes ``self.spec.environment`` --
    # confirm that both are intended.
    lookup_kwargs = {
        'environment': self.spec.config.mxnet,
        'cluster': cluster,
        'is_distributed': is_distributed,
    }
    per_worker = MXNetSpecification.get_worker_affinities(**lookup_kwargs)
    per_server = MXNetSpecification.get_ps_affinities(**lookup_kwargs)

    by_role = {}
    # There is a single master, so it is registered under index 0.
    by_role[TaskType.MASTER] = {0: self.spec.master_affinity}
    by_role[TaskType.WORKER] = per_worker
    by_role[TaskType.SERVER] = per_server
    return by_role
def handle_mxnet_experiment(experiment, spawner, response):
    """Create one job record per task started for an MXNet experiment.

    Args:
        experiment: The experiment the jobs belong to; forwarded to
            ``create_job``.
        spawner: The spawner that started the tasks. Its ``spec`` supplies
            the master settings, the cluster definition and the environment
            used to look up per-task resources, node selectors, affinities
            and tolerations.
        response: Mapping from ``TaskType`` to what was started: a single
            task payload for ``TaskType.MASTER`` and an iterable of payloads
            for ``TaskType.WORKER`` and ``TaskType.SERVER``. Each payload
            carries its job uuid under
            ``['pod']['metadata']['labels']['job_uuid']``.

    Raises:
        KeyError: If ``response`` lacks one of the task types or a payload
            lacks the job uuid label.
        ValueError: If a job uuid label is not a valid UUID string.
    """

    def job_uuid_of(task):
        # The job uuid is stored as a string label on the pod.
        return uuid.UUID(task['pod']['metadata']['labels']['job_uuid'])

    # Master: a single task whose settings live directly on the spec.
    master = response[TaskType.MASTER]
    create_job(job_uuid=job_uuid_of(master),
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources,
               node_selector=spawner.spec.master_node_selector,
               affinity=spawner.spec.master_affinity,
               tolerations=spawner.spec.master_tolerations)

    cluster, is_distributed = spawner.spec.cluster_def
    # Every specification lookup below takes the same keyword arguments.
    lookup_kwargs = {
        'environment': spawner.spec.environment,
        'cluster': cluster,
        'is_distributed': is_distributed,
    }

    # Workers: settings are looked up per task index; ``.get`` yields None
    # for a worker without an entry.
    worker_resources = MXNetSpecification.get_worker_resources(**lookup_kwargs)
    worker_node_selectors = MXNetSpecification.get_worker_node_selectors(**lookup_kwargs)
    worker_affinities = MXNetSpecification.get_worker_affinities(**lookup_kwargs)
    worker_tolerations = MXNetSpecification.get_worker_tolerations(**lookup_kwargs)
    for i, worker in enumerate(response[TaskType.WORKER]):
        create_job(job_uuid=job_uuid_of(worker),
                   experiment=experiment,
                   definition=get_job_definition(worker),
                   role=TaskType.WORKER,
                   sequence=i,
                   resources=worker_resources.get(i),
                   node_selector=worker_node_selectors.get(i),
                   affinity=worker_affinities.get(i),
                   tolerations=worker_tolerations.get(i))

    # Parameter servers: same per-index lookup as the workers.
    server_resources = MXNetSpecification.get_ps_resources(**lookup_kwargs)
    server_node_selectors = MXNetSpecification.get_ps_node_selectors(**lookup_kwargs)
    server_affinities = MXNetSpecification.get_ps_affinities(**lookup_kwargs)
    server_tolerations = MXNetSpecification.get_ps_tolerations(**lookup_kwargs)
    for i, server in enumerate(response[TaskType.SERVER]):
        create_job(job_uuid=job_uuid_of(server),
                   experiment=experiment,
                   definition=get_job_definition(server),
                   role=TaskType.SERVER,
                   sequence=i,
                   resources=server_resources.get(i),
                   # Pass this server's own entry rather than the whole
                   # index-keyed mapping, mirroring the worker loop.
                   node_selector=server_node_selectors.get(i),
                   affinity=server_affinities.get(i),
                   tolerations=server_tolerations.get(i))
def create_mxnet_experiment_jobs(experiment, spawner):
    """Create the job records of an MXNet experiment from the spawner's job uuids.

    One job is created for the master, then one per worker and one per
    parameter server, in the order of ``spawner.job_uuids``.

    Args:
        experiment: The experiment the jobs belong to; forwarded to
            ``create_job``.
        spawner: The spawner for the experiment. ``spawner.job_uuids`` maps
            each ``TaskType`` to a sequence of job uuids, and
            ``spawner.spec`` supplies the master settings, the cluster
            definition and the environment used to look up per-task
            resources, node selectors, affinities and tolerations.
    """
    # Master: a single task (index 0) whose settings live directly on the spec.
    master_job_uuid = spawner.job_uuids[TaskType.MASTER][0]
    create_job(job_uuid=master_job_uuid,
               experiment=experiment,
               resources=spawner.spec.master_resources,
               node_selector=spawner.spec.master_node_selector,
               affinity=spawner.spec.master_affinity,
               tolerations=spawner.spec.master_tolerations)

    cluster, is_distributed = spawner.spec.cluster_def
    # Every specification lookup below takes the same keyword arguments.
    lookup_kwargs = {
        'environment': spawner.spec.environment,
        'cluster': cluster,
        'is_distributed': is_distributed,
    }

    # Workers: settings are looked up per task index; ``.get`` yields None
    # for a worker without an entry.
    worker_resources = MXNetSpecification.get_worker_resources(**lookup_kwargs)
    worker_node_selectors = MXNetSpecification.get_worker_node_selectors(**lookup_kwargs)
    worker_affinities = MXNetSpecification.get_worker_affinities(**lookup_kwargs)
    worker_tolerations = MXNetSpecification.get_worker_tolerations(**lookup_kwargs)
    for i, worker_job_uuid in enumerate(spawner.job_uuids[TaskType.WORKER]):
        create_job(job_uuid=worker_job_uuid,
                   experiment=experiment,
                   role=TaskType.WORKER,
                   sequence=i,
                   resources=worker_resources.get(i),
                   node_selector=worker_node_selectors.get(i),
                   affinity=worker_affinities.get(i),
                   tolerations=worker_tolerations.get(i))

    # Parameter servers: same per-index lookup as the workers.
    server_resources = MXNetSpecification.get_ps_resources(**lookup_kwargs)
    server_node_selectors = MXNetSpecification.get_ps_node_selectors(**lookup_kwargs)
    server_affinities = MXNetSpecification.get_ps_affinities(**lookup_kwargs)
    server_tolerations = MXNetSpecification.get_ps_tolerations(**lookup_kwargs)
    for i, server_job_uuid in enumerate(spawner.job_uuids[TaskType.SERVER]):
        create_job(job_uuid=server_job_uuid,
                   experiment=experiment,
                   role=TaskType.SERVER,
                   sequence=i,
                   resources=server_resources.get(i),
                   # Pass this server's own entry rather than the whole
                   # index-keyed mapping, mirroring the worker loop.
                   node_selector=server_node_selectors.get(i),
                   affinity=server_affinities.get(i),
                   tolerations=server_tolerations.get(i))