def cluster_def(self):
    cluster = {
        TaskType.MASTER: 1,
    }
    is_distributed = False
    environment = self.environment

    if not environment:
        return cluster, is_distributed

    if environment.tensorflow:
        return TensorflowSpecification.get_cluster_def(
            cluster=cluster,
            tensorflow_config=environment.tensorflow)
    if environment.horovod:
        return HorovodSpecification.get_cluster_def(
            cluster=cluster,
            horovod_config=environment.horovod)
    if environment.mxnet:
        return MXNetSpecification.get_cluster_def(
            cluster=cluster,
            mxnet_config=environment.mxnet)
    if environment.pytorch:
        return PytorchSpecification.get_cluster_def(
            cluster=cluster,
            pytorch_config=environment.pytorch)

    # No framework specified; return the default standalone cluster definition.
    return cluster, is_distributed
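For reference, a minimal sketch of the two shapes cluster_def can return; the standalone value follows from the default branch above, and the distributed value matches the assertion in the PyTorch test at the end of this section.

# Illustrative only, assuming a spec object like the one built in the test below:
#
#   spec.cluster_def
#   # no framework configured      -> ({TaskType.MASTER: 1}, False)
#   # pytorch with n_workers == 5  -> ({TaskType.MASTER: 1, TaskType.WORKER: 5}, True)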
def handle_pytorch_experiment(experiment, spawner, response):
    # Create job records for the master and worker tasks this experiment started.
    master = response[TaskType.MASTER]
    job_uuid = master['pod']['metadata']['labels']['job_uuid']
    job_uuid = uuid.UUID(job_uuid)
    create_job(job_uuid=job_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources)

    cluster, is_distributed = spawner.spec.cluster_def
    worker_resources = PytorchSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    for i, worker in enumerate(response[TaskType.WORKER]):
        job_uuid = worker['pod']['metadata']['labels']['job_uuid']
        job_uuid = uuid.UUID(job_uuid)
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(worker),
                   role=TaskType.WORKER,
                   resources=worker_resources.get(i))
def node_selectors(self):
    cluster, is_distributed = self.spec.cluster_def
    worker_node_selectors = PytorchSpecification.get_worker_node_selectors(
        environment=self.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    return {
        TaskType.MASTER: {0: self.spec.master_node_selectors},
        TaskType.WORKER: worker_node_selectors,
    }
def affinities(self):
    cluster, is_distributed = self.spec.cluster_def
    worker_affinities = PytorchSpecification.get_worker_affinities(
        environment=self.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    return {
        TaskType.MASTER: {0: self.spec.master_affinity},
        TaskType.WORKER: worker_affinities,
    }
def total_resources(self):
    environment = self.environment
    if not environment:
        return None

    cluster, is_distributed = self.cluster_def

    # Check if any framework is defined.
    if environment.tensorflow:
        return TensorflowSpecification.get_total_resources(
            master_resources=self.master_resources,
            environment=environment,
            cluster=cluster,
            is_distributed=is_distributed)
    if environment.horovod:
        return HorovodSpecification.get_total_resources(
            master_resources=self.master_resources,
            environment=environment,
            cluster=cluster,
            is_distributed=is_distributed)
    if environment.mxnet:
        return MXNetSpecification.get_total_resources(
            master_resources=self.master_resources,
            environment=environment,
            cluster=cluster,
            is_distributed=is_distributed)
    if environment.pytorch:
        return PytorchSpecification.get_total_resources(
            master_resources=self.master_resources,
            environment=environment,
            cluster=cluster,
            is_distributed=is_distributed)

    # No framework specified; fall back to the master resources.
    return self.master_resources
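A brief sketch of the aggregated dict total_resources can produce; the keys and the example numbers are taken from the assertion in the distributed PyTorch test below, not from this method itself.

# Illustrative only:
#
#   spec.total_resources
#   # e.g. {'cpu': {'requests': 13, 'limits': 14},
#   #       'memory': {'requests': 1324, 'limits': 1324},
#   #       'gpu': None}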
def test_distributed_pytorch_passes(self):
    plxfile = PolyaxonFile(
        os.path.abspath('tests/fixtures/distributed_pytorch_file.yml'))
    spec = plxfile.specification
    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert isinstance(spec.settings, SettingsConfig)
    assert isinstance(spec.settings.logging, LoggingConfig)
    assert spec.settings.matrix is None
    assert isinstance(spec.environment, EnvironmentConfig)
    assert spec.is_runnable
    assert spec.framework == Frameworks.PYTORCH
    assert spec.environment.pytorch.n_workers == 5

    assert isinstance(spec.environment.resources, PodResourcesConfig)
    assert isinstance(spec.environment.resources.cpu, K8SResourcesConfig)
    assert spec.environment.resources.cpu.requests == 1
    assert spec.environment.resources.cpu.limits == 2

    assert isinstance(spec.environment.pytorch.default_worker_resources,
                      PodResourcesConfig)
    assert isinstance(spec.environment.pytorch.default_worker_resources.cpu,
                      K8SResourcesConfig)
    assert spec.environment.pytorch.default_worker_resources.cpu.requests == 3
    assert spec.environment.pytorch.default_worker_resources.cpu.limits == 3
    assert isinstance(spec.environment.pytorch.default_worker_resources.memory,
                      K8SResourcesConfig)
    assert spec.environment.pytorch.default_worker_resources.memory.requests == 256
    assert spec.environment.pytorch.default_worker_resources.memory.limits == 256

    assert isinstance(spec.environment.pytorch.worker_resources[0],
                      PodResourcesConfig)
    assert isinstance(spec.environment.pytorch.worker_resources[0].memory,
                      K8SResourcesConfig)
    assert spec.environment.pytorch.worker_resources[0].index == 3
    assert spec.environment.pytorch.worker_resources[0].memory.requests == 300
    assert spec.environment.pytorch.worker_resources[0].memory.limits == 300

    # Check that the properties returning per-task configs and resources work.
    cluster, is_distributed = spec.cluster_def
    worker_resources = PytorchSpecification.get_worker_resources(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    assert len(worker_resources) == spec.environment.pytorch.n_workers
    assert set(worker_resources.values()) == {
        spec.environment.pytorch.default_worker_resources,
        spec.environment.pytorch.worker_resources[0]
    }

    # Check total resources.
    assert spec.total_resources == {
        'cpu': {'requests': 1 + 3 * 4, 'limits': 2 + 3 * 4},
        'memory': {'requests': 300 + 256 * 4, 'limits': 300 + 256 * 4},
        'gpu': None
    }

    assert spec.cluster_def == ({TaskType.MASTER: 1, TaskType.WORKER: 5}, True)