def resources(self):
    """Resolve the per-task resources for this experiment.

    Returns a dict keyed by task type: the master resources under
    index 0, plus the per-index worker and ps resources computed
    from the spec's environment and cluster definition.
    """
    cluster, is_distributed = self.spec.cluster_def
    worker_resources = TensorflowSpecification.get_worker_resources(
        environment=self.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    ps_resources = TensorflowSpecification.get_ps_resources(
        environment=self.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    return {
        TaskType.MASTER: {0: self.spec.master_resources},
        TaskType.WORKER: worker_resources,
        TaskType.PS: ps_resources,
    }
def handle_tensorflow_experiment(experiment, spawner, response):
    """Record job entries for every pod started for this experiment.

    Creates one job for the master pod, then one job per worker and
    per ps pod, attaching the per-index resources resolved from the
    spawner's spec.
    """
    # The master is a single pod; its uuid comes from the pod labels.
    master = response[TaskType.MASTER]
    job_uuid = uuid.UUID(master['pod']['metadata']['labels']['job_uuid'])
    create_job(job_uuid=job_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources)

    # Resolve per-index resources once, before iterating the pods.
    cluster, is_distributed = spawner.spec.cluster_def
    worker_resources = TensorflowSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    ps_resources = TensorflowSpecification.get_ps_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)

    for i, worker in enumerate(response[TaskType.WORKER]):
        job_uuid = uuid.UUID(worker['pod']['metadata']['labels']['job_uuid'])
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(worker),
                   role=TaskType.WORKER,
                   resources=worker_resources.get(i))

    for i, ps in enumerate(response[TaskType.PS]):
        job_uuid = uuid.UUID(ps['pod']['metadata']['labels']['job_uuid'])
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(ps),
                   role=TaskType.PS,
                   resources=ps_resources.get(i))
def test_advanced_file_passes(self):
    """The advanced fixture parses with TF settings but no custom configs/resources."""
    plxfile = PolyaxonFile(
        os.path.abspath('tests/fixtures/advanced_file.yml'))
    spec = plxfile.specification

    # Top-level spec attributes.
    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert isinstance(spec.settings, SettingsConfig)
    assert isinstance(spec.settings.logging, LoggingConfig)
    assert spec.settings.matrix is None
    assert spec.is_runnable
    assert isinstance(spec.environment, EnvironmentConfig)
    assert spec.framework == Frameworks.TENSORFLOW

    # Tensorflow environment and run config.
    assert spec.environment.tensorflow.n_workers == 5
    assert spec.environment.tensorflow.n_ps == 10
    assert spec.environment.tensorflow.delay_workers_by_global_step is True
    assert isinstance(spec.environment.tensorflow.run_config, RunConfig)
    assert spec.environment.tensorflow.run_config.tf_random_seed == 100
    assert spec.environment.tensorflow.run_config.save_summary_steps == 100
    assert spec.environment.tensorflow.run_config.save_checkpoints_secs == 60
    assert isinstance(spec.environment.tensorflow.run_config.session,
                      SessionConfig)
    assert spec.environment.tensorflow.run_config.session.allow_soft_placement is True
    assert spec.environment.tensorflow.run_config.session.intra_op_parallelism_threads == 2
    assert spec.environment.tensorflow.run_config.session.inter_op_parallelism_threads == 2

    # No custom worker/ps configs or resources in this fixture.
    assert spec.environment.tensorflow.worker_configs is None
    assert spec.environment.tensorflow.ps_configs is None
    assert spec.environment.tensorflow.worker_resources is None
    assert spec.environment.tensorflow.ps_resources is None

    cluster, is_distributed = spec.cluster_def
    assert TensorflowSpecification.get_worker_configs(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed) == {}
    assert TensorflowSpecification.get_ps_configs(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed) == {}
    assert TensorflowSpecification.get_worker_resources(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed) == {}
    assert TensorflowSpecification.get_ps_resources(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed) == {}

    assert spec.cluster_def == ({TaskType.MASTER: 1,
                                 TaskType.WORKER: 5,
                                 TaskType.PS: 10}, True)

    # Model definition.
    assert isinstance(spec.model, ClassifierConfig)
    assert isinstance(spec.model.loss, MeanSquaredErrorConfig)
    assert isinstance(spec.model.optimizer, AdamConfig)
    assert spec.model.optimizer.learning_rate == 0.21
    assert isinstance(spec.model.graph, GraphConfig)
    assert len(spec.model.graph.layers) == 7
    assert spec.model.graph.input_layers == [['images', 0, 0]]
    assert len(spec.model.graph.output_layers) == 3
    assert ['super_dense', 0, 0] in spec.model.graph.output_layers

    # Train/eval pipelines.
    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
    assert len(
        spec.train.data_pipeline.feature_processors.feature_processors) == 1
    assert isinstance(spec.eval.data_pipeline, TFRecordImagePipelineConfig)
    assert spec.eval.data_pipeline.feature_processors is None
def test_distributed_tensorflow_passes(self):
    """The distributed fixture resolves default + per-index resources and totals."""
    plxfile = PolyaxonFile(
        os.path.abspath('tests/fixtures/distributed_tensorflow_file.yml'))
    spec = plxfile.specification

    # Top-level spec attributes.
    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert isinstance(spec.settings, SettingsConfig)
    assert isinstance(spec.settings.logging, LoggingConfig)
    assert spec.settings.matrix is None
    assert isinstance(spec.environment, EnvironmentConfig)
    assert spec.is_runnable
    assert spec.framework == Frameworks.TENSORFLOW
    assert spec.environment.tensorflow.n_workers == 5
    assert spec.environment.tensorflow.n_ps == 10

    # Master (environment-level) resources.
    assert isinstance(spec.environment.resources, PodResourcesConfig)
    assert isinstance(spec.environment.resources.cpu, K8SResourcesConfig)
    assert spec.environment.resources.cpu.requests == 1
    assert spec.environment.resources.cpu.limits == 2

    # Default worker resources.
    assert isinstance(spec.environment.tensorflow.default_worker_resources,
                      PodResourcesConfig)
    assert isinstance(
        spec.environment.tensorflow.default_worker_resources.cpu,
        K8SResourcesConfig)
    assert spec.environment.tensorflow.default_worker_resources.cpu.requests == 3
    assert spec.environment.tensorflow.default_worker_resources.cpu.limits == 3
    assert isinstance(
        spec.environment.tensorflow.default_worker_resources.memory,
        K8SResourcesConfig)
    assert spec.environment.tensorflow.default_worker_resources.memory.requests == 256
    assert spec.environment.tensorflow.default_worker_resources.memory.limits == 256

    # Worker override at index 3.
    assert isinstance(spec.environment.tensorflow.worker_resources[0],
                      PodResourcesConfig)
    assert isinstance(
        spec.environment.tensorflow.worker_resources[0].memory,
        K8SResourcesConfig)
    assert spec.environment.tensorflow.worker_resources[0].index == 3
    assert spec.environment.tensorflow.worker_resources[
        0].memory.requests == 300
    assert spec.environment.tensorflow.worker_resources[
        0].memory.limits == 300

    # Default ps resources and the override at index 9.
    assert isinstance(spec.environment.tensorflow.default_ps_resources,
                      PodResourcesConfig)
    assert isinstance(spec.environment.tensorflow.default_ps_resources.cpu,
                      K8SResourcesConfig)
    assert spec.environment.tensorflow.default_ps_resources.cpu.requests == 2
    assert spec.environment.tensorflow.default_ps_resources.cpu.limits == 4
    assert isinstance(spec.environment.tensorflow.ps_resources[0],
                      PodResourcesConfig)
    assert isinstance(spec.environment.tensorflow.ps_resources[0].memory,
                      K8SResourcesConfig)
    assert spec.environment.tensorflow.ps_resources[0].index == 9
    assert spec.environment.tensorflow.ps_resources[
        0].memory.requests == 512
    assert spec.environment.tensorflow.ps_resources[
        0].memory.limits == 1024

    # Properties returning the per-index maps of resources work.
    cluster, is_distributed = spec.cluster_def
    worker_resources = TensorflowSpecification.get_worker_resources(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    assert len(worker_resources) == spec.environment.tensorflow.n_workers
    assert set(worker_resources.values()) == {
        spec.environment.tensorflow.default_worker_resources,
        spec.environment.tensorflow.worker_resources[0]
    }

    ps_resources = TensorflowSpecification.get_ps_resources(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    assert len(ps_resources) == spec.environment.tensorflow.n_ps
    assert set(ps_resources.values()) == {
        spec.environment.tensorflow.default_ps_resources,
        spec.environment.tensorflow.ps_resources[0]
    }

    # Total resources aggregate master + workers + ps.
    assert spec.total_resources == {
        'cpu': {
            'requests': 1 + 3 * 4 + 2 * 9,
            'limits': 2 + 3 * 4 + 4 * 9
        },
        'memory': {
            'requests': 300 + 256 * 4 + 512,
            'limits': 300 + 256 * 4 + 1024
        },
        'gpu': None
    }

    assert spec.cluster_def == ({TaskType.MASTER: 1,
                                 TaskType.WORKER: 5,
                                 TaskType.PS: 10}, True)
def test_advanced_file_with_custom_configs_and_resources_passes(self):
    """The fixture with custom session configs and ps resources parses fully."""
    plxfile = PolyaxonFile(
        os.path.abspath(
            'tests/fixtures/advanced_file_with_custom_configs_and_resources.yml'
        ))
    spec = plxfile.specification

    # Top-level spec attributes.
    assert spec.version == 1
    assert spec.project.name == 'project1'
    assert isinstance(spec.settings, SettingsConfig)
    assert isinstance(spec.settings.logging, LoggingConfig)
    assert spec.settings.matrix is None
    assert isinstance(spec.environment, EnvironmentConfig)
    assert spec.is_runnable
    assert spec.framework == Frameworks.TENSORFLOW
    assert spec.environment.tensorflow.n_workers == 5
    assert spec.environment.tensorflow.n_ps == 10
    assert spec.environment.tensorflow.delay_workers_by_global_step is True

    # Run config and master resources.
    assert isinstance(spec.environment.tensorflow.run_config, RunConfig)
    assert spec.environment.tensorflow.run_config.tf_random_seed == 100
    assert spec.environment.tensorflow.run_config.save_summary_steps == 100
    assert spec.environment.tensorflow.run_config.save_checkpoints_secs == 60
    assert isinstance(spec.environment.resources, PodResourcesConfig)
    assert isinstance(spec.environment.resources.cpu, K8SResourcesConfig)
    assert spec.environment.resources.cpu.requests == 1
    assert spec.environment.resources.cpu.limits == 2
    assert isinstance(spec.environment.tensorflow.run_config.session,
                      SessionConfig)
    assert spec.environment.tensorflow.run_config.session.allow_soft_placement is True
    assert spec.environment.tensorflow.run_config.session.intra_op_parallelism_threads == 2
    assert spec.environment.tensorflow.run_config.session.inter_op_parallelism_threads == 2

    # Default worker session config and the override at index 3.
    assert isinstance(spec.environment.tensorflow.default_worker_config,
                      SessionConfig)
    assert spec.environment.tensorflow.default_worker_config.allow_soft_placement is True
    assert spec.environment.tensorflow.default_worker_config.intra_op_parallelism_threads == 1
    assert spec.environment.tensorflow.default_worker_config.inter_op_parallelism_threads == 1
    assert isinstance(spec.environment.tensorflow.worker_configs[0],
                      SessionConfig)
    assert spec.environment.tensorflow.worker_configs[0].index == 3
    assert spec.environment.tensorflow.worker_configs[
        0].allow_soft_placement is False
    assert spec.environment.tensorflow.worker_configs[
        0].intra_op_parallelism_threads == 5
    assert spec.environment.tensorflow.worker_configs[
        0].inter_op_parallelism_threads == 5

    # No ps configs / worker resources; ps resources are customized.
    assert spec.environment.tensorflow.ps_configs is None
    assert spec.environment.tensorflow.worker_resources is None
    assert isinstance(spec.environment.tensorflow.default_ps_resources,
                      PodResourcesConfig)
    assert isinstance(spec.environment.tensorflow.default_ps_resources.cpu,
                      K8SResourcesConfig)
    assert spec.environment.tensorflow.default_ps_resources.cpu.requests == 2
    assert spec.environment.tensorflow.default_ps_resources.cpu.limits == 4
    assert isinstance(spec.environment.tensorflow.ps_resources[0],
                      PodResourcesConfig)
    assert isinstance(spec.environment.tensorflow.ps_resources[0].memory,
                      K8SResourcesConfig)
    assert spec.environment.tensorflow.ps_resources[0].index == 9
    assert spec.environment.tensorflow.ps_resources[
        0].memory.requests == 512
    assert spec.environment.tensorflow.ps_resources[
        0].memory.limits == 1024

    # Properties returning per-index maps of configs and resources work.
    cluster, is_distributed = spec.cluster_def
    worker_configs = TensorflowSpecification.get_worker_configs(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    assert len(worker_configs) == spec.environment.tensorflow.n_workers
    assert set(worker_configs.values()) == {
        spec.environment.tensorflow.default_worker_config,
        spec.environment.tensorflow.worker_configs[0]
    }
    assert TensorflowSpecification.get_ps_configs(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed) == {}
    assert TensorflowSpecification.get_worker_resources(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed) == {}
    ps_resources = TensorflowSpecification.get_ps_resources(
        environment=spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    assert len(ps_resources) == spec.environment.tensorflow.n_ps
    assert set(ps_resources.values()) == {
        spec.environment.tensorflow.default_ps_resources,
        spec.environment.tensorflow.ps_resources[0]
    }

    # Total resources aggregate master + ps (workers have no resources).
    assert spec.total_resources == {
        'cpu': {
            'requests': 1 + 2 * 9,
            'limits': 2 + 4 * 9
        },
        'memory': {
            'requests': 512,
            'limits': 1024
        },
        'gpu': None
    }

    assert spec.cluster_def == ({TaskType.MASTER: 1,
                                 TaskType.WORKER: 5,
                                 TaskType.PS: 10}, True)

    # Model definition.
    assert isinstance(spec.model, ClassifierConfig)
    assert isinstance(spec.model.loss, MeanSquaredErrorConfig)
    assert isinstance(spec.model.optimizer, AdamConfig)
    assert spec.model.optimizer.learning_rate == 0.21
    assert isinstance(spec.model.graph, GraphConfig)
    assert len(spec.model.graph.layers) == 7
    assert spec.model.graph.input_layers == [['images', 0, 0]]
    assert len(spec.model.graph.output_layers) == 3
    assert ['super_dense', 0, 0] in spec.model.graph.output_layers

    # Train/eval pipelines.
    assert isinstance(spec.train.data_pipeline, TFRecordImagePipelineConfig)
    assert len(
        spec.train.data_pipeline.feature_processors.feature_processors) == 1
    assert isinstance(spec.eval.data_pipeline, TFRecordImagePipelineConfig)
    assert spec.eval.data_pipeline.feature_processors is None