def test_get_nvidia_label(self):
    """Tests getting the NVIDIA Docker label for the GPUs assigned to a job"""

    node_id = 6
    job_id = 10
    gpu_count = 2
    required_gpus = 2

    GPUManager.define_node_gpus(node_id, gpu_count)
    GPUManager.reserve_gpus_for_job(node_id, required_gpus)
    GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus)
    nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
    self.assertEqual(nvidia_label, "0,1")

    # A second job on the same node should be assigned the next available GPU indices
    gpu_count = 4
    job_id = 11

    GPUManager.define_node_gpus(node_id, gpu_count)
    GPUManager.reserve_gpus_for_job(node_id, required_gpus)
    GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus)
    nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
    self.assertEqual(nvidia_label, "2,3")
def test_calls_where_node_has_no_gpus(self):
    """Tests that GPU calls fail gracefully for a node that has no GPUs defined"""

    node_id = 7
    job_id = 10
    gpu_count = 2
    required_gpus = 2

    GPUManager.define_node_gpus(node_id, gpu_count)

    # Node 8 has no GPUs defined, so every call against it should fail
    node_id = 8
    self.assertFalse(GPUManager.reserve_gpus_for_job(node_id, required_gpus))
    self.assertFalse(GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))
    nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
    self.assertEqual(nvidia_label, "")
def test_release_gpu(self):
    """Tests that releasing a job's GPUs makes them available for reservation again"""

    node_id = 7
    job_id = 10
    gpu_count = 2
    required_gpus = 2

    GPUManager.define_node_gpus(node_id, gpu_count)
    GPUManager.reserve_gpus_for_job(node_id, required_gpus)
    self.assertTrue(GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))

    job_id = 11
    # Shouldn't have enough GPUs left for another reservation
    self.assertFalse(GPUManager.reserve_gpus_for_job(node_id, required_gpus))

    GPUManager.release_gpus(node_id, 10)

    # GPUs should be available again after being released from job 10
    self.assertTrue(GPUManager.reserve_gpus_for_job(node_id, required_gpus))
    self.assertTrue(GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))
    nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
    self.assertEqual(nvidia_label, "0,1")
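# Added illustrative sketch, not part of the original suite: the label asserted in the
# tests above is assumed to be a comma-separated list of the GPU indices assigned to a
# job ("0,1", "2,3", ...). The helper below is hypothetical and only mirrors that
# format with plain Python; GPUManager remains the authoritative source.
def _expected_nvidia_label(self, assigned_gpu_indices):
    """Builds the comma-separated index string that the assertions above expect"""

    return ','.join(str(index) for index in sorted(assigned_gpu_indices))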
def _configure_all_tasks(self, config, job_exe, job_type):
    """Configures the given execution with items that apply to all tasks

    :param config: The execution configuration
    :type config: :class:`job.execution.configuration.json.exe_config.ExecutionConfiguration`
    :param job_exe: The job execution model being scheduled
    :type job_exe: :class:`job.models.JobExecution`
    :param job_type: The job type model
    :type job_type: :class:`job.models.JobType`
    """

    config.set_task_ids(job_exe.get_cluster_id())

    for task_type in config.get_task_types():
        # Configure env vars describing allocated task resources
        env_vars = {}
        nvidia_docker_label = None

        for resource in config.get_resources(task_type).resources:
            env_name = 'ALLOCATED_%s' % normalize_env_var_name(resource.name)
            env_vars[env_name] = '%.1f' % resource.value  # Assumes scalar resources
            if resource.name == "gpus" and int(resource.value) > 0:
                gpu_list = GPUManager.get_nvidia_docker_label(job_exe.node_id, job_exe.job_id)
                nvidia_docker_label = DockerParameter('env',
                                                      'NVIDIA_VISIBLE_DEVICES={}'.format(gpu_list.strip(',')))

        # Configure env vars for Scale meta-data
        env_vars['SCALE_JOB_ID'] = unicode(job_exe.job_id)
        env_vars['SCALE_EXE_NUM'] = unicode(job_exe.exe_num)
        if job_exe.recipe_id:
            env_vars['SCALE_RECIPE_ID'] = unicode(job_exe.recipe_id)
        if job_exe.batch_id:
            env_vars['SCALE_BATCH_ID'] = unicode(job_exe.batch_id)

        # Configure workspace volumes
        workspace_volumes = {}
        for task_workspace in config.get_workspaces(task_type):
            logger.debug(self._workspaces)
            workspace_model = self._workspaces[task_workspace.name]
            # TODO: Should refactor workspace broker to return a Volume object and remove BrokerVolume
            if workspace_model.volume:
                vol_name = get_workspace_volume_name(job_exe, task_workspace.name)
                cont_path = get_workspace_volume_path(workspace_model.name)
                if workspace_model.volume.host:
                    host_path = workspace_model.volume.remote_path
                    volume = Volume(vol_name, cont_path, task_workspace.mode, is_host=True, host_path=host_path)
                else:
                    driver = workspace_model.volume.driver
                    driver_opts = {}
                    # TODO: Hack alert for nfs broker, as stated above, we should return Volume from broker
                    if driver == 'nfs':
                        driver_opts = {'share': workspace_model.volume.remote_path}
                    volume = Volume(vol_name, cont_path, task_workspace.mode, is_host=False, driver=driver,
                                    driver_opts=driver_opts)
                workspace_volumes[task_workspace.name] = volume

        config.add_to_task(task_type, env_vars=env_vars, wksp_volumes=workspace_volumes)

    # Labels for metric grouping
    job_id_label = DockerParameter('label', 'scale-job-id={}'.format(job_exe.job_id))
    job_execution_id_label = DockerParameter('label', 'scale-job-execution-id={}'.format(job_exe.exe_num))
    job_type_name_label = DockerParameter('label', 'scale-job-type-name={}'.format(job_type.name))
    job_type_version_label = DockerParameter('label', 'scale-job-type-version={}'.format(job_type.version))
    main_label = DockerParameter('label', 'scale-task-type=main')

    if nvidia_docker_label:
        nvidia_runtime_param = DockerParameter('runtime', 'nvidia')
        config.add_to_task('main', docker_params=[job_id_label, job_type_name_label, job_type_version_label,
                                                  job_execution_id_label, main_label, nvidia_docker_label,
                                                  nvidia_runtime_param])
    else:
        config.add_to_task('main', docker_params=[job_id_label, job_type_name_label, job_type_version_label,
                                                  job_execution_id_label, main_label])

    if not job_type.is_system:
        pre_label = DockerParameter('label', 'scale-task-type=pre')
        post_label = DockerParameter('label', 'scale-task-type=post')
        config.add_to_task('pre', docker_params=[job_id_label, job_type_name_label, job_type_version_label,
                                                 job_execution_id_label, pre_label])
        config.add_to_task('post', docker_params=[job_id_label, job_type_name_label, job_type_version_label,
                                                  job_execution_id_label, post_label])

    # Configure tasks for logging
    if settings.LOGGING_ADDRESS is not None:
        log_driver = DockerParameter('log-driver', 'fluentd')
        fluent_precision = DockerParameter('log-opt', 'fluentd-sub-second-precision=true')
        log_address = DockerParameter('log-opt', 'fluentd-address=%s' % settings.LOGGING_ADDRESS)

        if not job_type.is_system:
            pre_task_tag = DockerParameter('log-opt', 'tag=%s|%s|%s|%s|%s' % (config.get_task_id('pre'),
                                                                              job_type.name, job_type.version,
                                                                              job_exe.job_id, job_exe.exe_num))
            config.add_to_task('pre', docker_params=[log_driver, fluent_precision, log_address, pre_task_tag])
            post_task_tag = DockerParameter('log-opt', 'tag=%s|%s|%s|%s|%s' % (config.get_task_id('post'),
                                                                               job_type.name, job_type.version,
                                                                               job_exe.job_id, job_exe.exe_num))
            config.add_to_task('post', docker_params=[log_driver, fluent_precision, log_address, post_task_tag])
            # TODO: remove es_urls parameter when Scale no longer supports old style job types
            # Post task needs ElasticSearch URL to grab logs for old artifact registration
            es_param = DockerParameter('env', 'ELASTICSEARCH_URL=%s' % settings.ELASTICSEARCH_URL)
            config.add_to_task('post', docker_params=[es_param])

        main_task_tag = DockerParameter('log-opt', 'tag=%s|%s|%s|%s|%s' % (config.get_task_id('main'),
                                                                           job_type.name, job_type.version,
                                                                           job_exe.job_id, job_exe.exe_num))
        config.add_to_task('main', docker_params=[log_driver, fluent_precision, log_address, main_task_tag])
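# Added illustrative sketch, not part of the original module: the fluentd tag built in
# _configure_all_tasks above is assumed to follow the format
# '<task id>|<job type name>|<job type version>|<job id>|<execution number>'.
# _build_fluentd_tag is a hypothetical stand-alone helper that only mirrors the
# format string used above for clarity.
def _build_fluentd_tag(task_id, job_type_name, job_type_version, job_id, exe_num):
    """Builds the log-opt tag value used to route a task's container logs through fluentd"""

    return 'tag=%s|%s|%s|%s|%s' % (task_id, job_type_name, job_type_version, job_id, exe_num)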