def test_validate_gpu_spec_against_limits_deterministic(self):
  """Deterministic cases covering every branch of
  util.validate_gpu_spec_against_limits.
  """
  # Requested GPU type is not present in the limits at all -> invalid.
  self.assertFalse(
      util.validate_gpu_spec_against_limits(
          gpu_spec=ct.GPUSpec(ct.GPU.K80, 1),
          gpu_limits={ct.GPU.P100: 1},
          limit_type='zone',
      ))

  # Requested count exceeds the configured limit -> invalid.
  self.assertFalse(
      util.validate_gpu_spec_against_limits(
          gpu_spec=ct.GPUSpec(ct.GPU.K80, 2),
          gpu_limits={
              ct.GPU.P100: 1,
              ct.GPU.K80: 1,
          },
          limit_type='zone',
      ))

  # Supported type with a count within the limit -> valid.
  self.assertTrue(
      util.validate_gpu_spec_against_limits(
          gpu_spec=ct.GPUSpec(ct.GPU.K80, 1),
          gpu_limits={
              ct.GPU.P100: 1,
              ct.GPU.K80: 1,
          },
          limit_type='zone',
      ))
def test_gpuspec_parse_arg(self):
  """GPUSpec.parse_arg rejects malformed strings and accepts valid ones."""
  # Each of these must raise ArgumentTypeError.
  invalid_specs = [
      "face",           # no 'x' separator at all.
      "randomxV100",    # the count portion is not a number.
      "8xNONSTANDARD",  # unknown GPU type.
      "15xV100",        # invalid count for an otherwise-valid GPU type.
  ]
  for spec_str in invalid_specs:
    with self.assertRaises(ArgumentTypeError):
      ct.GPUSpec.parse_arg(spec_str)

  # Disabling count validation lets an otherwise-invalid count through.
  self.assertEqual(
      ct.GPUSpec(ct.GPU.V100, 7),
      ct.GPUSpec.parse_arg("7xV100", validate_count=False))

  # Fully valid spec parses successfully.
  self.assertEqual(
      ct.GPUSpec(ct.GPU.V100, 8),
      ct.GPUSpec.parse_arg("8xV100"))
def test_validate_gpu_spec_against_limits(
    self,
    limits: List[int],
    gpu_type: ct.GPU,
    count: int,
):
  """Tests gpu validation against limits.

  NOTE(review): presumably driven by a property-based decorator (e.g.
  hypothesis @given) supplying `limits`, `gpu_type` and `count` — the
  decorator is outside this view; confirm against the full file.
  """
  # Pair each GPU type with its limit, dropping zero/falsy limits so an
  # absent entry means "GPU type unavailable". zip truncates safely if
  # `limits` is shorter than the GPU enum (the old index-based loop
  # would have raised IndexError on a longer `limits`).
  gpu_limits = {gpu: limit for gpu, limit in zip(ct.GPU, limits) if limit}

  spec = ct.GPUSpec(gpu_type, count)
  valid = util.validate_gpu_spec_against_limits(spec, gpu_limits, 'test')

  if spec.gpu not in gpu_limits:
    # Unsupported GPU type: validation must fail.
    self.assertFalse(valid)
  else:
    # Otherwise validity is exactly "requested count within the limit".
    self.assertEqual(valid, spec.count <= gpu_limits[spec.gpu])
def test_job_mode(self):
  """Tests for all possible combinations of the three arguments to
  resolve_job_mode.
  """
  gpu = ct.GPUSpec(ct.GPU.P100, 4)
  tpu = ct.TPUSpec(ct.TPU.V2, 8)

  # Table of (expected mode, use_gpu flag, gpu_spec, tpu_spec).
  cases = [
      # --nogpu and no override.
      (JobMode.CPU, False, None, None),
      # TPU doesn't need GPUs.
      (JobMode.CPU, False, None, tpu),
      # Default GPUSpec filled in.
      (JobMode.GPU, True, None, None),
      # Explicit GPU spec, so GPU gets attached.
      (JobMode.GPU, True, gpu, None),
      (JobMode.GPU, True, gpu, tpu),
      # If NO explicit GPU is supplied but a TPU is supplied, execute in
      # CPU mode, ie, don't attach a GPU.
      (JobMode.CPU, True, None, tpu),
  ]
  for expected, use_gpu, gpu_spec, tpu_spec in cases:
    self.assertEqual(expected, c._job_mode(use_gpu, gpu_spec, tpu_spec))

  # An explicit GPU spec is incompatible with --nogpu, irrespective of
  # any TPU spec.
  for tpu_spec in (None, tpu):
    with self.assertRaises(AssertionError):
      c._job_mode(False, gpu, tpu_spec)
def submit_ml_job(
    job_mode: conf.JobMode,
    docker_args: Dict[str, Any],
    region: ct.Region,
    project_id: str,
    credentials_path: Optional[str] = None,
    dry_run: bool = False,
    job_name: Optional[str] = None,
    machine_type: Optional[ct.MachineType] = None,
    gpu_spec: Optional[ct.GPUSpec] = None,
    tpu_spec: Optional[ct.TPUSpec] = None,
    image_tag: Optional[str] = None,
    labels: Optional[Dict[str, str]] = None,
    experiment_config: Optional[ce.ExpConf] = None,
    script_args: Optional[List[str]] = None,
    request_retries: Optional[int] = None,
    xgroup: Optional[str] = None,
) -> None:
  """Top level function in the module.

  This function:

  - builds an image using the supplied docker_args, in either CPU or GPU
    mode
  - pushes that image to the Cloud Container Repository of the supplied
    project_id
  - generates a sequence of 'JobSpec' instances, one for every combination
    in the supplied experiment_config, and
  - batch-submits all jobs to AI Platform

  Keyword args:

  - job_mode: caliban.config.JobMode.
  - docker_args: these arguments are passed through to
    caliban.docker.build.build_image.
  - region: the region to use for AI Platform job submission. Different
    regions support different GPUs.
  - project_id: GCloud project ID for container storage and job submission.
  - credentials_path: explicit path to a service account JSON file, if it
    exists.
  - dry_run: if True, no actual jobs will be submitted and docker won't
    actually build; logging side effects will show the user what will happen
    without dry_run=True.
  - job_name: optional custom name. This is applied as a label to every job,
    and used as a prefix for all jobIds submitted to Cloud.
  - machine_type: the machine type to allocate for each job. Must be one
    supported by Cloud.
  - gpu_spec: if None and job_mode is GPU, defaults to a standard single
    GPU. Else, configures the count and type of GPUs to attach to the
    machine that runs each job.
  - tpu_spec: if None, defaults to no TPU attached. Else, configures the
    count and type of TPUs to attach to the machine that runs each job.
  - image_tag: optional explicit tag of a Container-Registry-available
    Docker container. If supplied, submit_ml_job will skip the docker build
    and push phases and use this image_tag directly.
  - labels: dictionary of KV pairs to apply to each job. User args will also
    be applied as labels, plus a few default labels supplied by Caliban.
  - experiment_config: dict of string to list, boolean, string or int. Any
    lists will trigger a cartesian product out with the rest of the config.
    A job will be submitted for every combination of parameters in the
    experiment config.
  - script_args: these are extra arguments that will be passed to every job
    executed, in addition to the arguments created by expanding out the
    experiment config.
  - request_retries: the number of times to retry each request if it fails
    for a timeout or a rate limiting request.
  - xgroup: experiment group for this submission, if None a new group will
    be created
  """
  # Fill in defaults for every optional argument that the rest of the
  # function requires to be concrete.
  if script_args is None:
    script_args = []

  if job_name is None:
    job_name = "caliban_{}".format(u.current_user())

  # GPU mode with no explicit spec defaults to a single P100.
  if job_mode == conf.JobMode.GPU and gpu_spec is None:
    gpu_spec = ct.GPUSpec(ct.GPU.P100, 1)

  if machine_type is None:
    machine_type = conf.DEFAULT_MACHINE_TYPE[job_mode]

  if experiment_config is None:
    experiment_config = {}

  if labels is None:
    labels = {}

  if request_retries is None:
    request_retries = 10

  caliban_config = docker_args.get('caliban_config', {})

  # Dry runs use an in-memory DB so nothing is persisted.
  engine = get_mem_engine() if dry_run else get_sql_engine()

  with session_scope(engine) as session:
    container_spec = generate_container_spec(session, docker_args, image_tag)

    # Only generate (build + push) an image tag when the caller didn't
    # supply a prebuilt one.
    if image_tag is None:
      image_tag = generate_image_tag(project_id, docker_args, dry_run=dry_run)

    # One experiment per combination in the (expanded) experiment config.
    experiments = create_experiments(
        session=session,
        container_spec=container_spec,
        script_args=script_args,
        experiment_config=experiment_config,
        xgroup=xgroup,
    )

    specs = build_job_specs(
        job_name=job_name,
        image_tag=image_tag,
        region=region,
        machine_type=machine_type,
        experiments=experiments,
        user_labels=labels,
        gpu_spec=gpu_spec,
        tpu_spec=tpu_spec,
        caliban_config=caliban_config,
    )

    # Dry run: log what would happen and stop before submission.
    if dry_run:
      return execute_dry_run(specs)

    # Best-effort submission: failures are logged (with traceback) rather
    # than propagated, so the session commit below still runs.
    try:
      submit_job_specs(
          specs=specs,
          project_id=project_id,
          credentials_path=credentials_path,
          num_specs=len(experiments),
          request_retries=request_retries,
      )
    except Exception as e:
      logging.error(f'exception: {e}')
      logging.error(f'{traceback.format_exc()}')

    session.commit()  # commit here, otherwise will be rolled back

  logging.info("")
  logging.info(
      t.green("Visit {} to see the status of all jobs.".format(
          job_url(project_id, ''))))
  logging.info("")