def test_job_spec(engine: Engine):
  """tests creation, persistence, and deduplication of JobSpec entries"""
  job_spec = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
  container_spec = {
      'nogpu': True,
      'cloud_key': '/path/to/key.json',
      'image_tag': None,
      'dir': ['/extra/path0', '/extra/path2'],
      'base_dir': '/home/foo',
      'module': 'train.py'
  }

  def check_single_spec(session) -> JobSpec:
    # exactly one JobSpec should exist, matching what we stored above
    rows = session.query(JobSpec).all()
    assert len(rows) == 1
    row = rows[0]
    assert row.platform == Platform.LOCAL
    assert row.spec == job_spec
    return row

  # test basic creation
  with session_scope(engine) as session:
    group = ExperimentGroup.get_or_create(session=session)
    cspec = ContainerSpec.get_or_create(session=session, spec=container_spec)
    exp = Experiment.get_or_create(xgroup=group, container_spec=cspec)
    JobSpec.get_or_create(
        experiment=exp,
        spec=job_spec,
        platform=Platform.LOCAL,
    )
    session.add(group)

  # test basic persistence, then add duplicate
  with session_scope(engine) as session:
    existing = check_single_spec(session)
    session.add(
        JobSpec.get_or_create(
            experiment=existing.experiment,
            spec=job_spec,
            platform=Platform.LOCAL,
        ))

  # test get_or_create, then create new spec on a different platform
  with session_scope(engine) as session:
    existing = check_single_spec(session)
    session.add(
        JobSpec.get_or_create(
            experiment=existing.experiment,
            spec=job_spec,
            platform=Platform.CAIP,
        ))

  # verify that the new spec was persisted alongside the original
  with session_scope(engine) as session:
    rows = session.query(JobSpec).all()
    assert len(rows) == 2
    assert rows[0].spec == rows[1].spec
    assert rows[0].platform != rows[1].platform
def test_job(engine: Engine):
  """tests creation and persistence of a Job with all its related rows"""
  args = ['a', 4]
  kwargs = {'k0': 0, 'k1': 'xyz'}
  job_spec = {'a': 2, 'b': [0, 1, 2], 'c': {'x': 1, 'y': 'foo'}}
  container_spec = {
      'nogpu': True,
      'cloud_key': '/path/to/key.json',
      'image_tag': None,
      'dir': ['/extra/path0', '/extra/path2'],
      'base_dir': '/home/foo',
      'module': 'train.py'
  }

  # test basic job creation; adding the experiment cascades to the job
  with session_scope(engine) as session:
    group = ExperimentGroup()
    cspec = ContainerSpec.get_or_create(session=session, spec=container_spec)
    exp = Experiment.get_or_create(
        xgroup=group,
        container_spec=cspec,
        args=args,
        kwargs=kwargs,
    )
    spec = JobSpec.get_or_create(
        experiment=exp,
        spec=job_spec,
        platform=Platform.TEST,
    )
    job = Job(spec=spec, container='container0', details={'job_id': 123})
    session.add(exp)

  # test job persistence: read everything back and check each field
  with session_scope(engine) as session:
    rows = session.query(Job).all()
    assert len(rows) == 1
    (persisted,) = rows
    assert persisted.container == 'container0'
    assert persisted.experiment.args == args
    assert persisted.experiment.kwargs == kwargs
    assert persisted.spec.spec == job_spec
    assert persisted.details['job_id'] == 123
def replace_caip_job_spec_image(spec: JobSpec, image_id: str) -> JobSpec:
  '''generates a new JobSpec based on an existing one, but replacing the
  image id

  Args:
  spec: job spec used as basis
  image_id: new image id

  Returns:
  new JobSpec
  '''
  updated = deepcopy(spec.spec)
  updated['trainingInput']['masterConfig']['imageUri'] = image_id
  return JobSpec.get_or_create(
      experiment=spec.experiment,
      spec=updated,
      platform=Platform.CAIP,
  )
def replace_gke_job_spec_image(spec: JobSpec, image_id: str) -> JobSpec:
  '''generates a new JobSpec based on an existing one, but replacing the
  image id

  Args:
  spec: job spec used as basis
  image_id: new image id

  Returns:
  new JobSpec
  '''
  new_spec = deepcopy(spec.spec)
  # a GKE spec may have several containers; point every one at the new image
  for container in new_spec['template']['spec']['containers']:
    container['image'] = image_id
  return JobSpec.get_or_create(
      experiment=spec.experiment,
      spec=new_spec,
      platform=Platform.GKE,
  )
def replace_local_job_spec_image(spec: JobSpec, image_id: str) -> JobSpec:
  '''generates a new JobSpec based on an existing one, but replacing the
  image id

  Args:
  spec: job spec used as basis
  image_id: new image id

  Returns:
  new JobSpec
  '''
  prior_image = spec.spec['container']
  # swap every occurrence of the old image id in the command for the new one
  rewritten_cmd = [
      image_id if token == prior_image else token
      for token in spec.spec['command']
  ]
  return JobSpec.get_or_create(
      experiment=spec.experiment,
      spec={
          'command': rewritten_cmd,
          'container': image_id,
      },
      platform=Platform.LOCAL,
  )
def run_experiments(job_mode: c.JobMode,
                    run_args: Optional[List[str]] = None,
                    script_args: Optional[List[str]] = None,
                    image_id: Optional[str] = None,
                    dry_run: bool = False,
                    experiment_config: Optional[ce.ExpConf] = None,
                    xgroup: Optional[str] = None,
                    **build_image_kwargs) -> None:
  """Builds an image using the supplied **build_image_kwargs and calls `docker
  run` on the resulting image using sensible defaults.

  Keyword args:

  - job_mode: c.JobMode.

  - run_args: extra arguments to supply to `docker run` after our defaults.
  - script_args: extra arguments to supply to the entrypoint. (You can
  - override the default container entrypoint by supplying a new one inside
    run_args.)
  - image_id: ID of the image to run. Supplying this will skip an image build.
  - experiment_config: dict of string to list, boolean, string or int. Any
    lists will trigger a cartesian product out with the rest of the config. A
    job will be executed for every combination of parameters in the experiment
    config.
  - dry_run: if True, no actual jobs will be executed and docker won't
    actually build; logging side effects will show the user what will happen
    without dry_run=True.
  - xgroup: name of the experiment group to associate the created experiments
    with; passed through to create_experiments.

  any extra kwargs supplied are passed through to build_image.
  """
  if run_args is None:
    run_args = []

  if script_args is None:
    script_args = []

  if experiment_config is None:
    experiment_config = {}

  # copy so adding job_mode doesn't mutate the caller's kwargs
  docker_args = dict(build_image_kwargs)
  docker_args['job_mode'] = job_mode

  # dry runs use an in-memory database so nothing is persisted
  engine = get_mem_engine() if dry_run else get_sql_engine()

  with session_scope(engine) as session:
    container_spec = generate_container_spec(session, docker_args, image_id)

    if image_id is None:
      if dry_run:
        logging.info("Dry run - skipping actual 'docker build'.")
        image_id = 'dry_run_tag'
      else:
        image_id = b.build_image(**docker_args)

    experiments = create_experiments(
        session=session,
        container_spec=container_spec,
        script_args=script_args,
        experiment_config=experiment_config,
        xgroup=xgroup,
    )

    job_specs = [
        JobSpec.get_or_create(
            experiment=x,
            spec=_create_job_spec_dict(
                experiment=x,
                job_mode=job_mode,
                run_args=run_args,
                image_id=image_id,
            ),
            platform=Platform.LOCAL,
        ) for x in experiments
    ]

    try:
      execute_jobs(job_specs=job_specs, dry_run=dry_run)
    except Exception as e:
      logging.error(f'exception: {e}')
      session.commit()  # commit here, otherwise will be rolled back
def create_simple_job_spec(
    self,
    experiment: Experiment,
    name: str,
    image: str,
    min_cpu: int,
    min_mem: int,
    command: Optional[List[str]] = None,
    env: Optional[Dict[str, str]] = None,
    accelerator: Optional[Accelerator] = None,
    accelerator_count: int = 1,
    namespace: str = k.DEFAULT_NAMESPACE,
    machine_type: Optional[MachineType] = None,
    preemptible: bool = True,
    preemptible_tpu: bool = True,
    tpu_driver: str = k.DEFAULT_TPU_DRIVER) -> Optional[JobSpec]:
  """creates a simple kubernetes job (1 container, 1 pod) JobSpec for this cluster

  Args:
  experiment: experiment whose args/kwargs are converted to container args
  name: job name
  image: container image url (gcr.io/...)
  min_cpu: minimum cpu needed, in milli-cpu
  min_mem: minimum memory needed, in MB
  command: command to execute, None = container entrypoint
  env: environment vars for container, None = no extra env vars
  accelerator: accelerator type, None=cpu only
  accelerator_count: accelerator count
  namespace: kubernetes namespace
  machine_type: machine type, None=default for mode (cpu/gpu)
  preemptible: use preemptible instance
  preemptible_tpu: use preemptible tpus
  tpu_driver: tpu driver to use

  Returns:
  JobSpec on success, None otherwise
  """
  # avoid the mutable-default-argument pitfall: treat None as "no env vars"
  if env is None:
    env = {}

  args = conf.experiment_to_args(experiment.kwargs, experiment.args)

  # ------------------------------------------------------------------------
  # container

  # tpu/gpu resources
  container_resources = V1ResourceRequirements(
      requests=Cluster.container_requests(min_cpu, min_mem),
      limits=Cluster.container_limits(
          accelerator,
          accelerator_count,
          preemptible_tpu,
      ),
  )

  container_env = [V1EnvVar(name=k, value=v) for k, v in env.items()]

  # this is a simple 1-container, 1-pod job, so we just name the
  # container the same thing (minus the generated suffix) as the job itself
  container = V1Container(
      name=name,
      image=image,
      command=command,
      args=args,
      resources=container_resources,
      env=container_env,
      image_pull_policy='Always',
  )

  # ------------------------------------------------------------------------
  # template

  # todo: should we support anything other than a 'never' restart policy?
  # see this for discussion
  # https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy

  tolerations = Cluster.tolerations(preemptible=preemptible)

  # backoff count plus 'OnFailure' may be correct here
  template_spec = V1PodSpec(
      restart_policy='Never',
      containers=[container],
      tolerations=tolerations,
      node_selector=Cluster.node_selector(
          preemptible=preemptible,
          machine_type=machine_type,
          accelerator=accelerator,
      ),
      host_ipc=True,
  )

  template = V1PodTemplateSpec(
      metadata=Cluster.template_metadata(
          accelerator=accelerator,
          tpu_driver=tpu_driver,
      ),
      spec=template_spec,
  )

  # ------------------------------------------------------------------------
  # job
  job_spec = V1JobSpec(template=template, backoff_limit=4)

  # NOTE(review): the `namespace` parameter is accepted but not used in this
  # method body — confirm whether it should be applied to the job metadata
  return JobSpec.get_or_create(
      experiment=experiment,
      spec=ApiClient().sanitize_for_serialization(job_spec),
      platform=Platform.GKE,
  )