async def nginx_job_async(
    nmrc_path: Path, loop: asyncio.AbstractEventLoop
) -> AsyncIterator[Tuple[str, str]]:
    """Run a temporary NGINX job and yield its id together with a secret.

    The job writes a freshly generated UUID into
    ``/usr/share/nginx/html/secret.txt`` and then runs NGINX in the
    foreground under ``timeout 15m``.  The job is killed (best effort) when
    the generator is finalized.

    Args:
        nmrc_path: Configuration path handed to ``api_get``.
        loop: Not referenced in the body; presumably requested only so the
            event loop is set up before this generator runs -- TODO confirm.

    Yields:
        A ``(job_id, secret)`` pair, where ``secret`` is the UUID string
        written to ``secret.txt``.

    Raises:
        AssertionError: If the job reports ``JobStatus.FAILED`` or does not
            reach ``JobStatus.RUNNING`` within 60 one-second polls.
    """
    async with api_get(path=nmrc_path) as client:
        secret = uuid4()
        command = (
            f"bash -c \"echo -n '{secret}' > /usr/share/nginx/html/secret.txt; "
            f"timeout 15m /usr/sbin/nginx -g 'daemon off;'\""
        )
        container = Container(
            image=RemoteImage.new_external_image(name="nginx", tag="latest"),
            command=command,
            resources=Resources(20, 0.1, None, None, True, None, None),
        )

        job = await client.jobs.run(
            container, is_preemptible=False, description="test NGINX job"
        )
        try:
            for _ in range(60):
                status = await client.jobs.status(job.id)
                if status.status == JobStatus.RUNNING:
                    break
                if status.status == JobStatus.FAILED:
                    # A failed job will never become RUNNING, so there is no
                    # point in burning the remaining polling budget.
                    raise AssertionError("Cannot start NGINX job")
                await asyncio.sleep(1)
            else:
                raise AssertionError("Cannot start NGINX job")
            yield job.id, str(secret)
        finally:
            # Best-effort cleanup: the job may already be gone.
            with suppress(Exception):
                await client.jobs.kill(job.id)
async def arun_job_and_wait_state(
    self,
    image: str,
    command: str = "",
    *,
    description: Optional[str] = None,
    name: Optional[str] = None,
    tty: bool = False,
    wait_state: JobStatus = JobStatus.RUNNING,
    stop_state: JobStatus = JobStatus.FAILED,
) -> str:
    """Submit a job on the ``cpu-micro`` preset and poll until *wait_state*.

    Args:
        image: Image reference, parsed with ``client.parse.remote_image``.
        command: Command line for the container.
        description: Optional job description forwarded to ``jobs.run``.
        name: Optional job name forwarded to ``jobs.run``.
        tty: Whether the container is created with a TTY.
        wait_state: Status that ends the wait successfully.
        stop_state: Status that aborts the wait.

    Returns:
        The id of the submitted job.

    Raises:
        JobWaitStateStopReached: If the job reaches *stop_state* first.
        AssertionError: If more than ``JOB_TIMEOUT`` whole seconds elapse
            before *wait_state* is observed.
    """
    __tracebackhide__ = True
    async with api_get(timeout=CLIENT_TIMEOUT, path=self._nmrc_path) as client:
        micro = client.presets["cpu-micro"]
        resources = Resources(memory_mb=micro.memory_mb, cpu=micro.cpu)
        container = Container(
            image=client.parse.remote_image(image),
            command=command,
            resources=resources,
            tty=tty,
        )
        job = await client.jobs.run(
            container,
            is_preemptible=micro.is_preemptible,
            description=description,
            name=name,
        )

        began_at = time()
        while True:
            if job.status == wait_state:
                return job.id
            if job.status == stop_state:
                raise JobWaitStateStopReached(
                    f"failed running job {job.id}: {stop_state}"
                )
            elapsed = int(time() - began_at)
            if elapsed > JOB_TIMEOUT:
                raise AssertionError(
                    f"timeout exceeded, last output: '{job.status}'"
                )
            await asyncio.sleep(JOB_WAIT_SLEEP_SECONDS)
            job = await client.jobs.status(job.id)
async def cp_s3(
    root: Root, src: URL, dst: URL, recursive: bool, update: bool, progress: bool
) -> None:
    """Copy between the local filesystem and storage through an S3 gateway.

    A MinIO server is started as a platform job with the storage root mounted
    at ``/mnt``; an AWS CLI container is then run through the local Docker
    daemon and pointed at the job's HTTP URL to perform ``aws s3 cp``/``sync``.

    Args:
        root: Provides the API client plus ``tty``/``color``/``quiet``/
            ``verbosity`` settings.
        src: Source URL; scheme must be ``file`` or ``storage``.
        dst: Destination URL; scheme must be the other one of the two.
        recursive: Adds ``--recursive`` to the AWS command.
        update: Uses ``aws s3 sync`` instead of ``aws s3 cp``.
        progress: Also echo the client container's stdout.

    Raises:
        RuntimeError: For an unsupported scheme combination, or when the AWS
            container exits with a non-zero code.
        IllegalArgumentError: If removing the temporary MinIO directory still
            fails after 10 attempts.
    """
    # Work out direction: exactly one side must be local, the other storage.
    if src.scheme == "file" and dst.scheme == "storage":
        storage_uri = dst
        local_uri = src
        upload = True
    elif src.scheme == "storage" and dst.scheme == "file":
        storage_uri = src
        local_uri = dst
        upload = False
    else:
        raise RuntimeError(
            f"Copy operation of the file with scheme '{src.scheme}'"
            f" to the file with scheme '{dst.scheme}'"
            f" is not supported"
        )

    # One-off credentials and a uniquely named scratch directory for MinIO.
    access_key = secrets.token_urlsafe(nbytes=16)
    secret_key = secrets.token_urlsafe(nbytes=16)
    minio_dir = f"minio-{secrets.token_hex(nbytes=8)}"
    s3_uri = f"s3://bucket{storage_uri.path}"
    # The symlink makes the whole /mnt mount visible under the name "bucket"
    # inside MinIO's data directory, matching the "bucket" used in s3_uri.
    minio_script = f"""\
mkdir /mnt/{minio_dir}
ln -s /mnt /mnt/{minio_dir}/bucket
minio server /mnt/{minio_dir}
"""
    volume = Volume(
        storage_uri=storage_uri.with_path(""), container_path="/mnt", read_only=False
    )
    server_container = Container(
        image=RemoteImage(MINIO_IMAGE_NAME, MINIO_IMAGE_TAG),
        entrypoint="sh",
        command=f"-c {shlex.quote(minio_script)}",
        http=HTTPPort(port=9000, requires_auth=False),
        resources=Resources(
            memory_mb=1024,
            cpu=1,
            gpu=0,
            gpu_model=None,
            shm=True,
            tpu_type=None,
            tpu_software_version=None,
        ),
        env={"MINIO_ACCESS_KEY": access_key, "MINIO_SECRET_KEY": secret_key},
        volumes=[volume],
    )

    log.info(f"Launching Amazon S3 gateway for {str(storage_uri.with_path(''))!r}")
    job_name = f"neuro-upload-server-{secrets.token_hex(nbytes=8)}"
    job = await root.client.jobs.run(server_container, name=job_name)
    try:
        jsprogress = JobStartProgress.create(
            tty=root.tty, color=root.color, quiet=root.quiet
        )
        # NOTE(review): this only waits for the job to leave PENDING; it does
        # not verify that the job actually reached RUNNING before continuing.
        while job.status == JobStatus.PENDING:
            await asyncio.sleep(0.2)
            job = await root.client.jobs.status(job.id)
            jsprogress(job)
        jsprogress.close()

        # The local side is bind-mounted at /data.  When the local path is
        # not a directory, its parent is mounted and the file is addressed
        # by name below /data.
        local_path = "/data"
        if not os.path.isdir(local_uri.path):
            local_path = f"/data/{local_uri.name}"
            local_uri = local_uri.parent
        binding = f"{local_uri.path}:/data"
        if upload:
            # Uploads never need to write to the local side.
            binding += ":ro"

        # Assemble the "aws s3 ..." sub-command; argument order is
        # <source> <destination>.
        cp_cmd = ["sync" if update else "cp"]
        if recursive:
            cp_cmd.append("--recursive")
        if root.verbosity < 0:
            cp_cmd.append("--quiet")
        if upload:
            cp_cmd.append(local_path)
            cp_cmd.append(s3_uri)
        else:
            cp_cmd.append(s3_uri)
            cp_cmd.append(local_path)

        aws_script = f"""\
aws configure set default.s3.max_concurrent_requests 100
aws configure set default.s3.max_queue_size 10000
aws --endpoint-url {job.http_url} s3 {" ".join(map(shlex.quote, cp_cmd))}
"""
        if root.verbosity >= 2:
            # Trace shell commands at high verbosity.
            aws_script = "set -x\n" + aws_script

        log.info(f"Launching Amazon S3 client for {local_uri.path!r}")
        docker = aiodocker.Docker()
        try:
            aws_image = f"{AWS_IMAGE_NAME}:{AWS_IMAGE_TAG}"
            async for info in await docker.images.pull(aws_image, stream=True):
                # TODO Use some of Progress classes
                log.debug(str(info))
            client_container = await docker.containers.create(
                config={
                    "Image": aws_image,
                    "Entrypoint": "sh",
                    "Cmd": ["-c", aws_script],
                    "Env": [
                        f"AWS_ACCESS_KEY_ID={access_key}",
                        f"AWS_SECRET_ACCESS_KEY={secret_key}",
                    ],
                    "HostConfig": {"Binds": [binding]},
                    "Tty": True,
                },
                name=f"neuro-upload-client-{secrets.token_hex(nbytes=8)}",
            )
            try:
                await client_container.start()
                # Wait for the container and stream its logs concurrently.
                tasks = [client_container.wait()]

                async def printlogs(err: bool) -> None:
                    # Echo either the stderr (err=True) or the stdout stream.
                    async for piece in await client_container.log(
                        stdout=not err,
                        stderr=err,
                        follow=True,
                        details=(root.verbosity > 1),
                    ):
                        click.echo(piece, nl=False, err=err)

                if not root.quiet:
                    tasks.append(printlogs(err=True))
                if root.verbosity > 0 or progress:
                    tasks.append(printlogs(err=False))
                await asyncio.gather(*tasks)
                exit_code = (await client_container.show())["State"]["ExitCode"]
                if exit_code:
                    raise RuntimeError(f"AWS copying failed with code {exit_code}")
            finally:
                # Always remove the client container, even on failure.
                await client_container.delete(force=True)
        finally:
            await docker.close()
    finally:
        # Cleanup runs whether or not the copy succeeded: kill the gateway
        # job first, then remove the scratch directory even if the kill fails.
        try:
            await root.client.jobs.kill(job.id)
        finally:
            # Retry removal up to 10 times with a doubling delay (0.2s, 0.4s,
            # ...); presumably the directory can be rejected while the job is
            # still shutting down -- TODO confirm.
            attempts = 10
            delay = 0.2
            while True:
                try:
                    await root.client.storage.rm(
                        URL(f"storage:{minio_dir}"), recursive=True
                    )
                except IllegalArgumentError:
                    attempts -= 1
                    if not attempts:
                        raise
                    log.info(
                        "Failed attempt to remove the MinIO directory", exc_info=True
                    )
                    await asyncio.sleep(delay)
                    delay *= 2
                    continue
                break
async def run_job(
    root: Root,
    *,
    image: RemoteImage,
    gpu: Optional[int],
    gpu_model: Optional[str],
    tpu_type: Optional[str],
    tpu_software_version: Optional[str],
    cpu: float,
    memory: int,
    extshm: bool,
    http: Optional[int],
    http_auth: Optional[bool],
    entrypoint: Optional[str],
    cmd: Sequence[str],
    working_dir: Optional[str],
    volume: Sequence[str],
    env: Sequence[str],
    env_file: Sequence[str],
    restart: str,
    life_span: Optional[str],
    port_forward: List[Tuple[int, int]],
    preemptible: bool,
    name: Optional[str],
    tags: Sequence[str],
    description: Optional[str],
    wait_start: bool,
    pass_config: bool,
    browse: bool,
    detach: bool,
    tty: bool,
    schedule_timeout: Optional[str],
) -> JobDescription:
    """Validate the options, submit a job and optionally wait/browse/attach.

    The container is assembled from the image, command, resources, parsed
    environment and parsed volumes, then submitted via ``root.client.jobs.run``.
    With *wait_start* the job is polled every 0.2 s while it is pending.

    Returns:
        The most recently fetched description of the job.

    Raises:
        click.UsageError: On inconsistent ``--http``/``--http-auth``/
            ``--browse``/``--no-wait-start`` combinations.
        ValueError: If *tpu_type* is given without *tpu_software_version*,
            or *pass_config* is set while ``NEURO_STEAL_CONFIG`` is already
            present in the environment.
        SystemExit: If the job ends up in ``JobStatus.FAILED``.
    """
    # --- option validation -------------------------------------------------
    if http_auth is None:
        http_auth = True
    elif not http:
        if http_auth:
            raise click.UsageError("--http-auth requires --http")
        raise click.UsageError("--no-http-auth requires --http")
    if browse:
        if not http:
            raise click.UsageError("--browse requires --http")
        if not wait_start:
            raise click.UsageError(
                "Cannot use --browse and --no-wait-start together")

    # Not waiting for the start implies not attaching either.
    detach = detach or not wait_start
    if not detach:
        _check_tty(root, tty)

    # --- scheduling parameters ---------------------------------------------
    restart_policy = JobRestartPolicy(restart)
    log.debug(f"Job restart policy: {restart_policy}")
    run_time_limit = await calc_life_span(
        root.client, life_span, DEFAULT_JOB_LIFE_SPAN, "job"
    )
    log.debug(f"Job run-time limit: {run_time_limit}")
    schedule_timeout_sec = (
        None
        if schedule_timeout is None
        else parse_timedelta(schedule_timeout).total_seconds()
    )
    log.debug(f"Job schedule timeout: {schedule_timeout_sec}")

    # --- environment, command and resources --------------------------------
    parsed_env = root.client.parse.envs(env, env_file)
    env_dict = parsed_env.env
    secret_env_dict = parsed_env.secret_env
    real_cmd = _parse_cmd(cmd)
    log.debug(f'entrypoint="{entrypoint}"')
    log.debug(f'cmd="{real_cmd}"')
    log.info(f"Using image '{image}'")

    if tpu_type and not tpu_software_version:
        raise ValueError(
            "--tpu-sw-version cannot be empty while --tpu-type specified")
    resources = Resources(
        memory_mb=memory,
        cpu=cpu,
        gpu=gpu,
        gpu_model=gpu_model,
        shm=extshm,
        tpu_type=tpu_type,
        tpu_software_version=tpu_software_version,
    )

    # --- volumes -----------------------------------------------------------
    parsed_volumes = root.client.parse.volumes(volume)
    volumes = list(parsed_volumes.volumes)
    secret_files = parsed_volumes.secret_files
    disk_volumes = parsed_volumes.disk_volumes

    if pass_config:
        if NEURO_STEAL_CONFIG in env_dict:
            raise ValueError(
                f"{NEURO_STEAL_CONFIG} is already set to "
                f"{env_dict[NEURO_STEAL_CONFIG]}"
            )
        config_env, config_volume = await upload_and_map_config(root)
        env_dict[NEURO_STEAL_CONFIG] = config_env
        volumes.append(config_volume)

    if volumes:
        listing = "\n".join(f" {volume_to_verbose_str(v)}" for v in volumes)
        log.info("Using volumes: \n" + listing)

    # --- submit ------------------------------------------------------------
    http_port = HTTPPort(http, http_auth) if http else None
    container = Container(
        image=image,
        entrypoint=entrypoint,
        command=real_cmd,
        working_dir=working_dir,
        http=http_port,
        resources=resources,
        env=env_dict,
        volumes=volumes,
        secret_env=secret_env_dict,
        secret_files=secret_files,
        disk_volumes=disk_volumes,
        tty=tty,
    )
    job = await root.client.jobs.run(
        container,
        is_preemptible=preemptible,
        name=name,
        tags=tags,
        description=description,
        restart_policy=restart_policy,
        life_span=run_time_limit,
        schedule_timeout=schedule_timeout_sec,
    )

    # --- wait for start ----------------------------------------------------
    progress = JobStartProgress.create(
        tty=root.tty, color=root.color, quiet=root.quiet
    )
    progress.begin(job)
    if wait_start:
        while job.status == JobStatus.PENDING:
            await asyncio.sleep(0.2)
            job = await root.client.jobs.status(job.id)
            progress.step(job)
    progress.end(job)

    # Even if we detached, but the job has failed to start
    # (most common reason - no resources), the command fails
    if job.status == JobStatus.FAILED:
        sys.exit(job.history.exit_code or EX_PLATFORMERROR)

    if browse:
        await browse_job(root, job)
    if not detach:
        await process_attach(
            root, job, tty=tty, logs=True, port_forward=port_forward
        )
    return job
async def run_job(
    root: Root,
    *,
    image: RemoteImage,
    gpu: Optional[int],
    gpu_model: Optional[str],
    tpu_type: Optional[str],
    tpu_software_version: Optional[str],
    cpu: float,
    memory: int,
    extshm: bool,
    http: Optional[int],
    http_auth: Optional[bool],
    entrypoint: Optional[str],
    cmd: Sequence[str],
    volume: Sequence[str],
    env: Sequence[str],
    env_file: Optional[str],
    preemptible: bool,
    name: Optional[str],
    description: Optional[str],
    wait_start: bool,
    pass_config: bool,
    browse: bool,
    detach: bool,
) -> JobDescription:
    """Validate the options, submit a job and optionally wait/browse/attach.

    When not detached, the job's logs are printed and the process exits with
    the job's exit code once it is known.  When detached, the process exits
    with code 125 if the job is in ``JobStatus.FAILED``.

    Returns:
        The most recently fetched description of the job (only reached when
        no exit code was determined).

    Raises:
        click.UsageError: On inconsistent ``--http``/``--http-auth``/
            ``--browse``/``--no-wait-start`` combinations.
        ValueError: If *tpu_type* is given without *tpu_software_version*,
            or *pass_config* is set while ``CONFIG_ENV_NAME`` is already
            present in the environment.
        SystemExit: When an exit code was determined (see above).
    """
    # --- option validation -------------------------------------------------
    if http_auth is None:
        http_auth = True
    elif not http:
        if http_auth:
            raise click.UsageError("--http-auth requires --http")
        raise click.UsageError("--no-http-auth requires --http")
    if browse:
        if not http:
            raise click.UsageError("--browse requires --http")
        if not wait_start:
            raise click.UsageError(
                "Cannot use --browse and --no-wait-start together")

    # Not waiting for the start implies not attaching either.
    detach = detach or not wait_start

    # --- environment, command and resources --------------------------------
    env_dict = build_env(env, env_file)
    command_line = None if cmd is None else " ".join(cmd)
    log.debug(f'entrypoint="{entrypoint}"')
    log.debug(f'cmd="{command_line}"')
    log.info(f"Using image '{image}'")

    if tpu_type and not tpu_software_version:
        raise ValueError(
            "--tpu-sw-version cannot be empty while --tpu-type specified")
    resources = Resources(
        memory_mb=memory,
        cpu=cpu,
        gpu=gpu,
        gpu_model=gpu_model,
        shm=extshm,
        tpu_type=tpu_type,
        tpu_software_version=tpu_software_version,
    )

    # --- volumes -----------------------------------------------------------
    volumes = await _build_volumes(root, volume, env_dict)
    if pass_config:
        if CONFIG_ENV_NAME in env_dict:
            raise ValueError(
                f"{CONFIG_ENV_NAME} is already set to {env_dict[CONFIG_ENV_NAME]}"
            )
        config_env, config_volume = await upload_and_map_config(root)
        env_dict[CONFIG_ENV_NAME] = config_env
        volumes.add(config_volume)

    if volumes:
        listing = "\n".join(f" {volume_to_verbose_str(v)}" for v in volumes)
        log.info("Using volumes: \n" + listing)

    # --- submit ------------------------------------------------------------
    http_port = HTTPPort(http, http_auth) if http else None
    container = Container(
        image=image,
        entrypoint=entrypoint,
        command=command_line,
        http=http_port,
        resources=resources,
        env=env_dict,
        volumes=list(volumes),
    )
    job = await root.client.jobs.run(
        container,
        is_preemptible=preemptible,
        name=name,
        description=description,
    )
    click.echo(JobFormatter(root.quiet)(job))

    # --- wait for start ----------------------------------------------------
    progress = JobStartProgress.create(
        tty=root.tty, color=root.color, quiet=root.quiet
    )
    if wait_start:
        while job.status == JobStatus.PENDING:
            await asyncio.sleep(0.2)
            job = await root.client.jobs.status(job.id)
            progress(job)
    progress.close()

    if browse and job.status != JobStatus.FAILED:
        await browse_job(root, job)

    # --- attach or report failure ------------------------------------------
    if detach:
        # Even if we detached, but the job has failed to start
        # (most common reason - no resources), the command fails
        exit_code = 125 if job.status == JobStatus.FAILED else None
    else:
        msg = textwrap.dedent(
            """\
            Terminal is attached to the remote job, so you receive the job's output.
            Use 'Ctrl-C' to detach (it will NOT terminate the job), or restart the job
            with `--detach` option.
            """
        )
        click.echo(click.style(msg, dim=True))
        await _print_logs(root, job.id)
        job = await root.client.jobs.status(job.id)
        exit_code = job.history.exit_code

    if exit_code is not None:
        sys.exit(exit_code)
    return job