async def nginx_job_async(
    nmrc_path: Path, loop: asyncio.AbstractEventLoop
) -> AsyncIterator[Tuple[str, str]]:
    async with api_get(path=nmrc_path) as client:
        secret = uuid4()
        command = (
            f"bash -c \"echo -n '{secret}' > /usr/share/nginx/html/secret.txt; "
            f"timeout 15m /usr/sbin/nginx -g 'daemon off;'\""
        )
        container = Container(
            image=RemoteImage.new_external_image(name="nginx", tag="latest"),
            command=command,
            resources=Resources(
                memory_mb=20,
                cpu=0.1,
                gpu=None,
                gpu_model=None,
                shm=True,
                tpu_type=None,
                tpu_software_version=None,
            ),
        )

        job = await client.jobs.run(
            container, is_preemptible=False, description="test NGINX job"
        )
        try:
            # Poll for up to ~60 seconds until the job reaches RUNNING.
            for _ in range(60):
                status = await client.jobs.status(job.id)
                if status.status == JobStatus.RUNNING:
                    break
                await asyncio.sleep(1)
            else:
                raise AssertionError("Cannot start NGINX job")
            yield job.id, str(secret)
        finally:
            with suppress(Exception):
                await client.jobs.kill(job.id)
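A minimal usage sketch, assuming a pytest-asyncio style test; the assertions are illustrative only:

async def test_nginx_job(nmrc_path: Path) -> None:
    loop = asyncio.get_event_loop()
    # Iterating drives the generator: setup runs first, the loop body sees
    # the live job, and the finally block kills it when iteration resumes.
    async for job_id, secret in nginx_job_async(nmrc_path, loop):
        assert job_id
        assert secret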
Example #2
    async def arun_job_and_wait_state(
        self,
        image: str,
        command: str = "",
        *,
        description: Optional[str] = None,
        name: Optional[str] = None,
        tty: bool = False,
        wait_state: JobStatus = JobStatus.RUNNING,
        stop_state: JobStatus = JobStatus.FAILED,
    ) -> str:
        __tracebackhide__ = True
        async with api_get(timeout=CLIENT_TIMEOUT,
                           path=self._nmrc_path) as client:
            preset = client.presets["cpu-micro"]
            resources = Resources(memory_mb=preset.memory_mb, cpu=preset.cpu)
            container = Container(
                image=client.parse.remote_image(image),
                command=command,
                resources=resources,
                tty=tty,
            )
            job = await client.jobs.run(
                container,
                is_preemptible=preset.is_preemptible,
                description=description,
                name=name,
            )

            start_time = time()
            while job.status != wait_state:
                if job.status == stop_state:
                    raise JobWaitStateStopReached(
                        f"failed running job {job.id}: {stop_state}")
                if time() - start_time > JOB_TIMEOUT:
                    raise AssertionError(
                        f"timeout exceeded, last status: '{job.status}'")
                await asyncio.sleep(JOB_WAIT_SLEEP_SECONDS)
                job = await client.jobs.status(job.id)

            return job.id
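A hedged call-site sketch; `helper` stands in for whatever test fixture exposes this method:

job_id = await helper.arun_job_and_wait_state(
    "ubuntu:latest",
    "sleep 300",
    description="smoke test",
    wait_state=JobStatus.RUNNING,
    stop_state=JobStatus.FAILED,
)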
Example #3
async def cp_s3(
    root: Root, src: URL, dst: URL, recursive: bool, update: bool, progress: bool
) -> None:
    if src.scheme == "file" and dst.scheme == "storage":
        storage_uri = dst
        local_uri = src
        upload = True
    elif src.scheme == "storage" and dst.scheme == "file":
        storage_uri = src
        local_uri = dst
        upload = False
    else:
        raise RuntimeError(
            f"Copy operation from scheme '{src.scheme}'"
            f" to scheme '{dst.scheme}' is not supported"
        )

    access_key = secrets.token_urlsafe(nbytes=16)
    secret_key = secrets.token_urlsafe(nbytes=16)
    minio_dir = f"minio-{secrets.token_hex(nbytes=8)}"
    s3_uri = f"s3://bucket{storage_uri.path}"
    minio_script = f"""\
mkdir /mnt/{minio_dir}
ln -s /mnt /mnt/{minio_dir}/bucket
minio server /mnt/{minio_dir}
"""
    volume = Volume(
        storage_uri=storage_uri.with_path(""), container_path="/mnt", read_only=False
    )
    server_container = Container(
        image=RemoteImage(MINIO_IMAGE_NAME, MINIO_IMAGE_TAG),
        entrypoint="sh",
        command=f"-c {shlex.quote(minio_script)}",
        http=HTTPPort(port=9000, requires_auth=False),
        resources=Resources(
            memory_mb=1024,
            cpu=1,
            gpu=0,
            gpu_model=None,
            shm=True,
            tpu_type=None,
            tpu_software_version=None,
        ),
        env={"MINIO_ACCESS_KEY": access_key, "MINIO_SECRET_KEY": secret_key},
        volumes=[volume],
    )

    log.info(f"Launching Amazon S3 gateway for {str(storage_uri.with_path(''))!r}")
    job_name = f"neuro-upload-server-{secrets.token_hex(nbytes=8)}"
    job = await root.client.jobs.run(server_container, name=job_name)
    try:
        jsprogress = JobStartProgress.create(
            tty=root.tty, color=root.color, quiet=root.quiet
        )
        while job.status == JobStatus.PENDING:
            await asyncio.sleep(0.2)
            job = await root.client.jobs.status(job.id)
            jsprogress(job)
        jsprogress.close()

        local_path = "/data"
        if not os.path.isdir(local_uri.path):
            local_path = f"/data/{local_uri.name}"
            local_uri = local_uri.parent
        binding = f"{local_uri.path}:/data"
        if upload:
            binding += ":ro"
        cp_cmd = ["sync" if update else "cp"]
        if recursive:
            cp_cmd.append("--recursive")
        if root.verbosity < 0:
            cp_cmd.append("--quiet")
        if upload:
            cp_cmd.append(local_path)
            cp_cmd.append(s3_uri)
        else:
            cp_cmd.append(s3_uri)
            cp_cmd.append(local_path)

        aws_script = f"""\
aws configure set default.s3.max_concurrent_requests 100
aws configure set default.s3.max_queue_size 10000
aws --endpoint-url {job.http_url} s3 {" ".join(map(shlex.quote, cp_cmd))}
"""
        if root.verbosity >= 2:
            aws_script = "set -x\n" + aws_script
        log.info(f"Launching Amazon S3 client for {local_uri.path!r}")
        docker = aiodocker.Docker()
        try:
            aws_image = f"{AWS_IMAGE_NAME}:{AWS_IMAGE_TAG}"
            async for info in await docker.images.pull(aws_image, stream=True):
                # TODO Use some of Progress classes
                log.debug(str(info))
            client_container = await docker.containers.create(
                config={
                    "Image": aws_image,
                    "Entrypoint": "sh",
                    "Cmd": ["-c", aws_script],
                    "Env": [
                        f"AWS_ACCESS_KEY_ID={access_key}",
                        f"AWS_SECRET_ACCESS_KEY={secret_key}",
                    ],
                    "HostConfig": {"Binds": [binding]},
                    "Tty": True,
                },
                name=f"neuro-upload-client-{secrets.token_hex(nbytes=8)}",
            )
            try:
                await client_container.start()
                tasks = [client_container.wait()]

                async def printlogs(err: bool) -> None:
                    async for piece in await client_container.log(
                        stdout=not err,
                        stderr=err,
                        follow=True,
                        details=(root.verbosity > 1),
                    ):
                        click.echo(piece, nl=False, err=err)

                if not root.quiet:
                    tasks.append(printlogs(err=True))
                if root.verbosity > 0 or progress:
                    tasks.append(printlogs(err=False))
                await asyncio.gather(*tasks)
                exit_code = (await client_container.show())["State"]["ExitCode"]
                if exit_code:
                    raise RuntimeError(f"AWS copying failed with code {exit_code}")
            finally:
                await client_container.delete(force=True)
        finally:
            await docker.close()
    finally:
        try:
            await root.client.jobs.kill(job.id)
        finally:
            attempts = 10
            delay = 0.2
            while True:
                try:
                    await root.client.storage.rm(
                        URL(f"storage:{minio_dir}"), recursive=True
                    )
                except IllegalArgumentError:
                    attempts -= 1
                    if not attempts:
                        raise
                    log.info(
                        "Failed attempt to remove the MinIO directory", exc_info=True
                    )
                    await asyncio.sleep(delay)
                    delay *= 2
                    continue
                break
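The cleanup above retries storage removal with exponential backoff; the same pattern extracted as a standalone helper (a sketch, all names illustrative):

from typing import Awaitable, Callable

async def retry_with_backoff(
    op: Callable[[], Awaitable[None]],
    *,
    attempts: int = 10,
    delay: float = 0.2,
) -> None:
    # Retry `op`, doubling the pause after every failed attempt.
    while True:
        try:
            await op()
            return
        except IllegalArgumentError:
            attempts -= 1
            if not attempts:
                raise
            await asyncio.sleep(delay)
            delay *= 2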
Example #4
async def run_job(
    root: Root,
    *,
    image: RemoteImage,
    gpu: Optional[int],
    gpu_model: Optional[str],
    tpu_type: Optional[str],
    tpu_software_version: Optional[str],
    cpu: float,
    memory: int,
    extshm: bool,
    http: Optional[int],
    http_auth: Optional[bool],
    entrypoint: Optional[str],
    cmd: Sequence[str],
    working_dir: Optional[str],
    volume: Sequence[str],
    env: Sequence[str],
    env_file: Sequence[str],
    restart: str,
    life_span: Optional[str],
    port_forward: List[Tuple[int, int]],
    preemptible: bool,
    name: Optional[str],
    tags: Sequence[str],
    description: Optional[str],
    wait_start: bool,
    pass_config: bool,
    browse: bool,
    detach: bool,
    tty: bool,
    schedule_timeout: Optional[str],
) -> JobDescription:
    if http_auth is None:
        http_auth = True
    elif not http:
        if http_auth:
            raise click.UsageError("--http-auth requires --http")
        else:
            raise click.UsageError("--no-http-auth requires --http")
    if browse and not http:
        raise click.UsageError("--browse requires --http")
    if browse and not wait_start:
        raise click.UsageError(
            "Cannot use --browse and --no-wait-start together")
    if not wait_start:
        detach = True
    if not detach:
        _check_tty(root, tty)

    job_restart_policy = JobRestartPolicy(restart)
    log.debug(f"Job restart policy: {job_restart_policy}")

    job_life_span = await calc_life_span(root.client, life_span,
                                         DEFAULT_JOB_LIFE_SPAN, "job")
    log.debug(f"Job run-time limit: {job_life_span}")

    if schedule_timeout is None:
        job_schedule_timeout = None
    else:
        job_schedule_timeout = parse_timedelta(
            schedule_timeout).total_seconds()
    log.debug(f"Job schedule timeout: {job_schedule_timeout}")

    env_parse_result = root.client.parse.envs(env, env_file)
    env_dict, secret_env_dict = env_parse_result.env, env_parse_result.secret_env
    real_cmd = _parse_cmd(cmd)

    log.debug(f'entrypoint="{entrypoint}"')
    log.debug(f'cmd="{real_cmd}"')

    log.info(f"Using image '{image}'")

    if tpu_type and not tpu_software_version:
        raise ValueError(
            "--tpu-sw-version cannot be empty while --tpu-type is specified")
    resources = Resources(
        memory_mb=memory,
        cpu=cpu,
        gpu=gpu,
        gpu_model=gpu_model,
        shm=extshm,
        tpu_type=tpu_type,
        tpu_software_version=tpu_software_version,
    )

    volume_parse_result = root.client.parse.volumes(volume)
    volumes = list(volume_parse_result.volumes)
    secret_files = volume_parse_result.secret_files
    disk_volumes = volume_parse_result.disk_volumes

    if pass_config:
        env_name = NEURO_STEAL_CONFIG
        if env_name in env_dict:
            raise ValueError(
                f"{env_name} is already set to {env_dict[env_name]}")
        env_var, secret_volume = await upload_and_map_config(root)
        env_dict[NEURO_STEAL_CONFIG] = env_var
        volumes.append(secret_volume)

    if volumes:
        log.info("Using volumes: \n" +
                 "\n".join(f"  {volume_to_verbose_str(v)}" for v in volumes))

    container = Container(
        image=image,
        entrypoint=entrypoint,
        command=real_cmd,
        working_dir=working_dir,
        http=HTTPPort(http, http_auth) if http else None,
        resources=resources,
        env=env_dict,
        volumes=volumes,
        secret_env=secret_env_dict,
        secret_files=secret_files,
        disk_volumes=disk_volumes,
        tty=tty,
    )

    job = await root.client.jobs.run(
        container,
        is_preemptible=preemptible,
        name=name,
        tags=tags,
        description=description,
        restart_policy=job_restart_policy,
        life_span=job_life_span,
        schedule_timeout=job_schedule_timeout,
    )
    progress = JobStartProgress.create(tty=root.tty,
                                       color=root.color,
                                       quiet=root.quiet)
    progress.begin(job)
    while wait_start and job.status == JobStatus.PENDING:
        await asyncio.sleep(0.2)
        job = await root.client.jobs.status(job.id)
        progress.step(job)
    progress.end(job)
    # Even if we detached, the command must fail when the job has failed
    # to start (most commonly because no resources were available).
    if job.status == JobStatus.FAILED:
        sys.exit(job.history.exit_code or EX_PLATFORMERROR)

    if browse:
        await browse_job(root, job)

    if not detach:
        await process_attach(root,
                             job,
                             tty=tty,
                             logs=True,
                             port_forward=port_forward)

    return job
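The PENDING-polling loop is a recurring pattern in these examples; a minimal sketch built on the same `client.jobs.status` call (the helper name and the `Client` annotation are assumptions):

async def wait_until_not_pending(
    client: Client, job: JobDescription, *, poll: float = 0.2
) -> JobDescription:
    # Poll until the scheduler moves the job out of PENDING.
    while job.status == JobStatus.PENDING:
        await asyncio.sleep(poll)
        job = await client.jobs.status(job.id)
    return job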
Example #5
async def run_job(
    root: Root,
    *,
    image: RemoteImage,
    gpu: Optional[int],
    gpu_model: Optional[str],
    tpu_type: Optional[str],
    tpu_software_version: Optional[str],
    cpu: float,
    memory: int,
    extshm: bool,
    http: Optional[int],
    http_auth: Optional[bool],
    entrypoint: Optional[str],
    cmd: Sequence[str],
    volume: Sequence[str],
    env: Sequence[str],
    env_file: Optional[str],
    preemptible: bool,
    name: Optional[str],
    description: Optional[str],
    wait_start: bool,
    pass_config: bool,
    browse: bool,
    detach: bool,
) -> JobDescription:
    if http_auth is None:
        http_auth = True
    elif not http:
        if http_auth:
            raise click.UsageError("--http-auth requires --http")
        else:
            raise click.UsageError("--no-http-auth requires --http")
    if browse and not http:
        raise click.UsageError("--browse requires --http")
    if browse and not wait_start:
        raise click.UsageError(
            "Cannot use --browse and --no-wait-start together")
    if not wait_start:
        detach = True

    env_dict = build_env(env, env_file)

    cmd = " ".join(cmd) if cmd is not None else None
    log.debug(f'entrypoint="{entrypoint}"')
    log.debug(f'cmd="{cmd}"')

    log.info(f"Using image '{image}'")

    if tpu_type and not tpu_software_version:
        raise ValueError(
            "--tpu-sw-version cannot be empty while --tpu-type is specified")
    resources = Resources(
        memory_mb=memory,
        cpu=cpu,
        gpu=gpu,
        gpu_model=gpu_model,
        shm=extshm,
        tpu_type=tpu_type,
        tpu_software_version=tpu_software_version,
    )
    volumes = await _build_volumes(root, volume, env_dict)

    if pass_config:
        if CONFIG_ENV_NAME in env_dict:
            raise ValueError(
                f"{CONFIG_ENV_NAME} is already set to {env_dict[CONFIG_ENV_NAME]}"
            )
        env_var, secret_volume = await upload_and_map_config(root)
        env_dict[CONFIG_ENV_NAME] = env_var
        volumes.add(secret_volume)

    if volumes:
        log.info("Using volumes: \n" +
                 "\n".join(f"  {volume_to_verbose_str(v)}" for v in volumes))

    container = Container(
        image=image,
        entrypoint=entrypoint,
        command=real_cmd,
        http=HTTPPort(http, http_auth) if http else None,
        resources=resources,
        env=env_dict,
        volumes=list(volumes),
    )

    job = await root.client.jobs.run(container,
                                     is_preemptible=preemptible,
                                     name=name,
                                     description=description)
    click.echo(JobFormatter(root.quiet)(job))
    progress = JobStartProgress.create(tty=root.tty,
                                       color=root.color,
                                       quiet=root.quiet)
    while wait_start and job.status == JobStatus.PENDING:
        await asyncio.sleep(0.2)
        job = await root.client.jobs.status(job.id)
        progress(job)
    progress.close()
    if browse and job.status != JobStatus.FAILED:
        await browse_job(root, job)

    exit_code = None
    if not detach:
        msg = textwrap.dedent("""\
            Terminal is attached to the remote job, so you receive the job's output.
            Use 'Ctrl-C' to detach (it will NOT terminate the job), or restart the job
            with the `--detach` option.
        """)
        click.echo(click.style(msg, dim=True))
        await _print_logs(root, job.id)
        job = await root.client.jobs.status(job.id)
        exit_code = job.history.exit_code
    else:
        # Even if we detached, the command must fail when the job has failed
        # to start (most commonly because no resources were available).
        if job.status == JobStatus.FAILED:
            exit_code = 125

    if exit_code is not None:
        sys.exit(exit_code)

    return job
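`build_env` is not shown in this example; a plausible minimal sketch that layers `KEY=VALUE` pairs from an env file and the command line (purely illustrative, not the project's actual implementation):

import os
from typing import Dict, List, Optional, Sequence

def build_env(env: Sequence[str], env_file: Optional[str]) -> Dict[str, str]:
    entries: List[str] = []
    if env_file:
        with open(env_file) as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and comments in the env file.
                if line and not line.startswith("#"):
                    entries.append(line)
    # Command-line pairs come last, so they override the file.
    entries.extend(env)
    env_dict: Dict[str, str] = {}
    for item in entries:
        name, sep, value = item.partition("=")
        # A bare NAME (no '=') inherits its value from the local environment.
        env_dict[name] = value if sep else os.environ.get(name, "")
    return env_dict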