Example #1
    def test_validate_gpu_spec_against_limits_deterministic(self):
        """Deterministic test to make sure we get full coverage."""

        # GPU type not supported.
        cfg = {
            'gpu_spec': ct.GPUSpec(ct.GPU.K80, 1),
            'gpu_limits': {
                ct.GPU.P100: 1
            },
            'limit_type': 'zone',
        }
        assert not util.validate_gpu_spec_against_limits(**cfg)

        # request above limit
        cfg = {
            'gpu_spec': ct.GPUSpec(ct.GPU.K80, 2),
            'gpu_limits': {
                ct.GPU.P100: 1,
                ct.GPU.K80: 1,
            },
            'limit_type': 'zone',
        }
        assert not util.validate_gpu_spec_against_limits(**cfg)

        # valid request
        cfg = {
            'gpu_spec': ct.GPUSpec(ct.GPU.K80, 1),
            'gpu_limits': {
                ct.GPU.P100: 1,
                ct.GPU.K80: 1,
            },
            'limit_type': 'zone',
        }
        assert util.validate_gpu_spec_against_limits(**cfg)
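
The three cases above pin down the validation contract: the requested GPU type must appear in the limits map, and the requested count must not exceed that type's limit. A minimal sketch consistent with those assertions follows; this is a hypothetical reconstruction, not Caliban's actual util.validate_gpu_spec_against_limits, which presumably also reports why validation failed.

def validate_gpu_spec_against_limits(gpu_spec, gpu_limits, limit_type):
    # Hypothetical sketch of the contract the test exercises; not the
    # actual Caliban implementation.
    if gpu_spec.gpu not in gpu_limits:
        # The requested GPU type isn't available at all.
        return False
    # The type is available; the requested count must fit under the limit.
    return gpu_spec.count <= gpu_limits[gpu_spec.gpu]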
Example #2
    def test_gpuspec_parse_arg(self):
        with self.assertRaises(ArgumentTypeError):
            # Invalid format string: no x separator.
            ct.GPUSpec.parse_arg("face")

        with self.assertRaises(ArgumentTypeError):
            # Invalid number.
            ct.GPUSpec.parse_arg("randomxV100")

        with self.assertRaises(ArgumentTypeError):
            # Invalid GPU type.
            ct.GPUSpec.parse_arg("8xNONSTANDARD")

        with self.assertRaises(ArgumentTypeError):
            # Invalid number for the valid GPU type.
            ct.GPUSpec.parse_arg("15xV100")

        # Valid, because count validation is disabled.
        self.assertEqual(ct.GPUSpec(ct.GPU.V100, 7),
                         ct.GPUSpec.parse_arg("7xV100", validate_count=False))

        # Valid!
        self.assertEqual(ct.GPUSpec(ct.GPU.V100, 8),
                         ct.GPUSpec.parse_arg("8xV100"))
Example #3
    def test_validate_gpu_spec_against_limits(
        self,
        limits: List[int],
        gpu_type: ct.GPU,
        count: int,
    ):
        """tests gpu validation against limits"""

        gpu_list = [g for g in ct.GPU]
        gpu_limits = dict([(gpu_list[i], limits[i]) for i in range(len(limits))
                           if limits[i]])
        spec = ct.GPUSpec(gpu_type, count)
        valid = util.validate_gpu_spec_against_limits(spec, gpu_limits, 'test')

        if spec.gpu not in gpu_limits:
            self.assertFalse(valid)
        else:
            self.assertTrue(valid == (spec.count <= gpu_limits[spec.gpu]))

        return
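
The limits, gpu_type, and count arguments are presumably injected by a property-based testing decorator. One plausible hypothesis setup is sketched below; the strategy bounds and the suite's class name are assumptions, not the test suite's actual decorator.

import unittest
from typing import List

import hypothesis.strategies as st
from hypothesis import given

NUM_GPU_TYPES = len(list(ct.GPU))

class UtilTestSuite(unittest.TestCase):  # hypothetical suite name
    # Hypothetical strategies; the real @given decorator may differ.
    @given(
        limits=st.lists(st.integers(min_value=0, max_value=4),
                        min_size=NUM_GPU_TYPES, max_size=NUM_GPU_TYPES),
        gpu_type=st.sampled_from(list(ct.GPU)),
        count=st.integers(min_value=1, max_value=4),
    )
    def test_validate_gpu_spec_against_limits(self, limits: List[int],
                                              gpu_type: ct.GPU, count: int):
        ...  # body as shown above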
Example #4
    def test_job_mode(self):
        """Tests for all possible combinations of the three arguments to
    resolve_job_mode.

    """
        gpu_spec = ct.GPUSpec(ct.GPU.P100, 4)
        tpu_spec = ct.TPUSpec(ct.TPU.V2, 8)

        def assertMode(expected_mode, use_gpu, gpu_spec, tpu_spec):
            mode = c._job_mode(use_gpu, gpu_spec, tpu_spec)
            self.assertEqual(mode, expected_mode)

        # --nogpu and no override.
        assertMode(JobMode.CPU, False, None, None)

        # TPU doesn't need GPUs
        assertMode(JobMode.CPU, False, None, tpu_spec)

        # Default GPUSpec filled in.
        assertMode(JobMode.GPU, True, None, None)

        # Explicit GPU spec, so GPU gets attached.
        assertMode(JobMode.GPU, True, gpu_spec, None)
        assertMode(JobMode.GPU, True, gpu_spec, tpu_spec)

        # If NO explicit GPU is supplied but a TPU is supplied, execute in CPU
        # mode, i.e., don't attach a GPU.
        assertMode(JobMode.CPU, True, None, tpu_spec)

        # An explicit GPU spec is incompatible with --nogpu in both of the
        # following cases, irrespective of the TPU spec.
        with self.assertRaises(AssertionError):
            c._job_mode(False, gpu_spec, None)

        with self.assertRaises(AssertionError):
            c._job_mode(False, gpu_spec, tpu_spec)
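
Taken together, these assertions fully determine the decision table, so the logic under test can be reconstructed. The following is a minimal sketch consistent with every case above; it is a hypothetical reconstruction, not necessarily the actual c._job_mode.

def _job_mode(use_gpu, gpu_spec, tpu_spec):
    # Hypothetical reconstruction of the decision table the test pins down.
    # An explicit GPU spec contradicts --nogpu.
    assert use_gpu or gpu_spec is None
    if not use_gpu:
        return JobMode.CPU
    # GPUs are enabled: a TPU-only request (no explicit GPU spec) runs the
    # job in CPU mode; everything else attaches a GPU.
    if gpu_spec is None and tpu_spec is not None:
        return JobMode.CPU
    return JobMode.GPU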
Example #5
def submit_ml_job(
    job_mode: conf.JobMode,
    docker_args: Dict[str, Any],
    region: ct.Region,
    project_id: str,
    credentials_path: Optional[str] = None,
    dry_run: bool = False,
    job_name: Optional[str] = None,
    machine_type: Optional[ct.MachineType] = None,
    gpu_spec: Optional[ct.GPUSpec] = None,
    tpu_spec: Optional[ct.TPUSpec] = None,
    image_tag: Optional[str] = None,
    labels: Optional[Dict[str, str]] = None,
    experiment_config: Optional[ce.ExpConf] = None,
    script_args: Optional[List[str]] = None,
    request_retries: Optional[int] = None,
    xgroup: Optional[str] = None,
) -> None:
  """Top level function in the module. This function:

  - builds an image using the supplied docker_args, in either CPU or GPU mode
  - pushes that image to the Container Registry of the supplied
    project_id
  - generates a sequence of 'JobSpec' instances, one for every combination in
    the supplied experiment_config, and
  - batch-submits all jobs to AI Platform

  Keyword args:

  - job_mode: caliban.config.JobMode.
  - docker_args: these arguments are passed through to
    caliban.docker.build.build_image.
  - region: the region to use for AI Platform job submission. Different regions
    support different GPUs.
  - project_id: GCloud project ID for container storage and job submission.
  - credentials_path: explicit path to a service account JSON file, if it exists.
  - dry_run: if True, no jobs will be submitted and docker won't actually
    build; logging side effects will show the user what would happen with
    dry_run=False.
  - job_name: optional custom name. This is applied as a label to every job,
    and used as a prefix for all jobIds submitted to Cloud.
  - machine_type: the machine type to allocate for each job. Must be one
    supported by Cloud.
  - gpu_spec: if None and job_mode is GPU, defaults to a standard single GPU.
    Else, configures the count and type of GPUs to attach to the machine that
    runs each job.
  - tpu_spec: if None, defaults to no TPU attached. Else, configures the count
    and type of TPUs to attach to the machine that runs each job.
  - image_tag: optional explicit tag of a Container-Registry-available Docker
    container. If supplied, submit_ml_job will skip the docker build and push
    phases and use this image_tag directly.
  - labels: dictionary of KV pairs to apply to each job. User args will also be
    applied as labels, plus a few default labels supplied by Caliban.
  - experiment_config: dict of string to list, boolean, string or int. Any
    lists will trigger a cartesian product with the rest of the config. A
    job will be submitted for every combination of parameters in the
    experiment config.
  - script_args: these are extra arguments that will be passed to every job
    executed, in addition to the arguments created by expanding out the
    experiment config.
  - request_retries: the number of times to retry each request if it fails
    due to a timeout or rate limiting.
  - xgroup: experiment group for this submission; if None, a new group will
    be created.
  """
  if script_args is None:
    script_args = []

  if job_name is None:
    job_name = "caliban_{}".format(u.current_user())

  if job_mode == conf.JobMode.GPU and gpu_spec is None:
    gpu_spec = ct.GPUSpec(ct.GPU.P100, 1)

  if machine_type is None:
    machine_type = conf.DEFAULT_MACHINE_TYPE[job_mode]

  if experiment_config is None:
    experiment_config = {}

  if labels is None:
    labels = {}

  if request_retries is None:
    request_retries = 10

  caliban_config = docker_args.get('caliban_config', {})

  engine = get_mem_engine() if dry_run else get_sql_engine()

  with session_scope(engine) as session:
    container_spec = generate_container_spec(session, docker_args, image_tag)

    if image_tag is None:
      image_tag = generate_image_tag(project_id, docker_args, dry_run=dry_run)

    experiments = create_experiments(
        session=session,
        container_spec=container_spec,
        script_args=script_args,
        experiment_config=experiment_config,
        xgroup=xgroup,
    )

    specs = build_job_specs(
        job_name=job_name,
        image_tag=image_tag,
        region=region,
        machine_type=machine_type,
        experiments=experiments,
        user_labels=labels,
        gpu_spec=gpu_spec,
        tpu_spec=tpu_spec,
        caliban_config=caliban_config,
    )

    if dry_run:
      return execute_dry_run(specs)

    try:
      submit_job_specs(
          specs=specs,
          project_id=project_id,
          credentials_path=credentials_path,
          num_specs=len(experiments),
          request_retries=request_retries,
      )
    except Exception as e:
      logging.error(f'exception: {e}')
      logging.error(traceback.format_exc())
      session.commit()  # commit here; otherwise the session will be rolled back

    logging.info("")
    logging.info(
        t.green("Visit {} to see the status of all jobs.".format(
            job_url(project_id, ''))))
    logging.info("")