Пример #1
0
def handle_horovod_experiment(experiment, spawner, response):
    """Create job records for the master and worker tasks of a Horovod experiment.

    Args:
        experiment: passed through unchanged to every `create_job` call.
        spawner: object whose `spec` supplies `master_resources`,
            `cluster_def` and `environment`.
        response: container indexed by `TaskType.MASTER` (a single task) and
            `TaskType.WORKER` (an iterable of tasks); each task carries its
            job uuid at `['pod']['metadata']['labels']['job_uuid']`.
    """

    def _job_uuid(task):
        # The uuid is read as a string label from the task's pod metadata.
        return uuid.UUID(task['pod']['metadata']['labels']['job_uuid'])

    # Master job: created without an explicit role argument.
    master_task = response[TaskType.MASTER]
    create_job(
        job_uuid=_job_uuid(master_task),
        experiment=experiment,
        definition=get_job_definition(master_task),
        resources=spawner.spec.master_resources,
    )

    cluster, is_distributed = spawner.spec.cluster_def
    resources_by_index = HorovodSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed,
    )

    # Worker jobs: resources are looked up by the worker's position in the
    # response; `.get` yields None when no entry exists for that index.
    for index, worker_task in enumerate(response[TaskType.WORKER]):
        create_job(
            job_uuid=_job_uuid(worker_task),
            experiment=experiment,
            definition=get_job_definition(worker_task),
            role=TaskType.WORKER,
            resources=resources_by_index.get(index),
        )
Пример #2
0
def handle_horovod_experiment(experiment, spawner, response):
    """Create job records for the master and worker tasks of a Horovod experiment.

    Args:
        experiment: passed through unchanged to every `create_job` call.
        spawner: object whose `spec` supplies `master_resources`,
            `cluster_def` and `environment`.
        response: container indexed by `TaskType.MASTER` (a single task) and
            `TaskType.WORKER` (an iterable of tasks); each task carries its
            job uuid at `['pod']['metadata']['labels']['job_uuid']`.
    """

    def _job_uuid(task):
        # The uuid is read as a string label from the task's pod metadata.
        return uuid.UUID(task['pod']['metadata']['labels']['job_uuid'])

    # Master job: created without an explicit role argument.
    master_task = response[TaskType.MASTER]
    create_job(
        job_uuid=_job_uuid(master_task),
        experiment=experiment,
        definition=get_job_definition(master_task),
        resources=spawner.spec.master_resources,
    )

    cluster, is_distributed = spawner.spec.cluster_def
    resources_by_index = HorovodSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed,
    )

    # Worker jobs: resources are looked up by the worker's position in the
    # response; `.get` yields None when no entry exists for that index.
    for index, worker_task in enumerate(response[TaskType.WORKER]):
        create_job(
            job_uuid=_job_uuid(worker_task),
            experiment=experiment,
            definition=get_job_definition(worker_task),
            role=TaskType.WORKER,
            resources=resources_by_index.get(index),
        )
Пример #3
0
 def resources(self):
     """Return the resources of this spec's tasks, grouped by task type.

     The master entry is a one-item dict keyed by index 0 holding
     `spec.master_resources`; the worker entry is the value returned by
     `HorovodSpecification.get_worker_resources` for this spec's
     environment and cluster definition.
     """
     spec = self.spec
     cluster, is_distributed = spec.cluster_def
     by_worker = HorovodSpecification.get_worker_resources(
         environment=spec.environment,
         cluster=cluster,
         is_distributed=is_distributed,
     )
     resources_by_task = {TaskType.MASTER: {0: spec.master_resources}}
     resources_by_task[TaskType.WORKER] = by_worker
     return resources_by_task
Пример #4
0
 def resources(self):
     """Return the resources of this spec's tasks, grouped by task type.

     The master entry is a one-item dict keyed by index 0 holding
     `spec.master_resources`; the worker entry is the value returned by
     `HorovodSpecification.get_worker_resources` for this spec's
     environment and cluster definition.
     """
     spec = self.spec
     cluster, is_distributed = spec.cluster_def
     by_worker = HorovodSpecification.get_worker_resources(
         environment=spec.environment,
         cluster=cluster,
         is_distributed=is_distributed,
     )
     resources_by_task = {TaskType.MASTER: {0: spec.master_resources}}
     resources_by_task[TaskType.WORKER] = by_worker
     return resources_by_task
Пример #5
0
    def test_distributed_horovod_passes(self):
        """Parse the distributed Horovod fixture and check the resulting spec.

        Covers the top-level sections, the master/default-worker/per-worker
        resource configs, the per-worker resource mapping, the aggregated
        totals and the cluster definition.
        """
        fixture_path = os.path.abspath(
            'tests/fixtures/distributed_horovod_file.yml')
        plxfile = PolyaxonFile(fixture_path)
        spec = plxfile.specification

        # Top-level sections of the specification.
        assert spec.version == 1
        assert spec.project.name == 'project1'
        settings = spec.settings
        assert isinstance(settings, SettingsConfig)
        assert isinstance(settings.logging, LoggingConfig)
        assert settings.matrix is None
        environment = spec.environment
        assert isinstance(environment, EnvironmentConfig)
        assert spec.is_runnable
        assert spec.framework == Frameworks.HOROVOD
        horovod = environment.horovod
        assert horovod.n_workers == 5

        # Resources declared directly on the environment.
        env_resources = environment.resources
        assert isinstance(env_resources, PodResourcesConfig)
        assert isinstance(env_resources.cpu, K8SResourcesConfig)
        assert env_resources.cpu.requests == 1
        assert env_resources.cpu.limits == 2

        # Default resources applied to workers.
        default_resources = horovod.default_worker_resources
        assert isinstance(default_resources, PodResourcesConfig)
        assert isinstance(default_resources.cpu, K8SResourcesConfig)
        assert default_resources.cpu.requests == 3
        assert default_resources.cpu.limits == 3
        assert isinstance(default_resources.memory, K8SResourcesConfig)
        assert default_resources.memory.requests == 256
        assert default_resources.memory.limits == 256

        # The single explicit per-worker override, declared for index 3.
        override = horovod.worker_resources[0]
        assert isinstance(override, PodResourcesConfig)
        assert isinstance(override.memory, K8SResourcesConfig)
        assert override.index == 3
        assert override.memory.requests == 300
        assert override.memory.limits == 300

        # Check that the helper returning the per-worker resources works.
        cluster, is_distributed = spec.cluster_def
        worker_resources = HorovodSpecification.get_worker_resources(
            environment=environment,
            cluster=cluster,
            is_distributed=is_distributed,
        )
        assert len(worker_resources) == horovod.n_workers
        expected_configs = {default_resources, override}
        assert set(worker_resources.values()) == expected_configs

        # Check total resources
        expected_totals = {
            'cpu': {'requests': 1 + 3 * 4, 'limits': 2 + 3 * 4},
            'memory': {'requests': 300 + 256 * 4, 'limits': 300 + 256 * 4},
            'gpu': None,
        }
        assert spec.total_resources == expected_totals

        expected_cluster = {TaskType.MASTER: 1, TaskType.WORKER: 5}
        assert spec.cluster_def == (expected_cluster, True)