Example #1
        def copy_input(task, r):
            if isinstance(r, InputResourceFile):
                if r not in copied_input_resource_files:
                    copied_input_resource_files.add(r)

                    if r._input_path.startswith('gs://'):
                        return [
                            f'gsutil cp {r._input_path} {r._get_path(tmpdir)}'
                        ]
                    else:
                        absolute_input_path = shq(
                            os.path.realpath(r._input_path))
                        if task._image is not None:  # pylint: disable-msg=W0640
                            return [
                                f'cp {absolute_input_path} {r._get_path(tmpdir)}'
                            ]
                        else:
                            return [
                                f'ln -sf {absolute_input_path} {r._get_path(tmpdir)}'
                            ]
                else:
                    return []
            else:
                assert isinstance(r, TaskResourceFile)
                return []
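
For reference, `shq` in these examples is a shell-quoting helper; as far as I can tell it is a thin wrapper around `shlex.quote`. A minimal sketch of the assumed behavior (this wrapper is illustrative, not the library source):

import shlex

def shq(s: str) -> str:
    # Quote a string so it can be interpolated safely into a bash command line.
    return shlex.quote(s)

print(shq('plain.txt'))      # -> plain.txt (no quoting needed)
print(shq('my file.txt'))    # -> 'my file.txt'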
Example #2
async def docker_run(*args: str):
    script = ' '.join([shq(a) for a in args])
    outerr = await check_shell_output(script)

    cid = outerr[0].decode('ascii').strip()

    outerr = await check_shell_output(f'docker wait {cid}')

    exit_code = int(outerr[0].decode('ascii').strip())
    return cid, exit_code == 0
Example #3
async def docker_run(*args: str):
    script = ' '.join([shq(a) for a in args])
    outerr = await check_shell_output(script, echo=True)
    print(f'Container output: {outerr[0]}\n' f'Container error: {outerr[1]}')

    cid = outerr[0].decode('ascii').strip()

    outerr = await check_shell_output(f'docker wait {cid}')
    exit_code = int(outerr[0].decode('ascii').strip())
    return cid, exit_code == 0
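
A hedged usage sketch for `docker_run` above, assuming `check_shell_output` runs a shell command and returns `(stdout, stderr)` as bytes, as the `.decode('ascii')` calls suggest. The image and command are illustrative:

import asyncio

async def main():
    # `docker run -d` prints the new container id on stdout, which docker_run
    # then passes to `docker wait` to obtain the exit code.
    cid, succeeded = await docker_run('docker', 'run', '-d', 'ubuntu:20.04', 'true')
    print(f'container {cid} finished {"successfully" if succeeded else "with an error"}')

# asyncio.run(main())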
Example #4
        def handler(match_obj):
            groups = match_obj.groupdict()

            if groups['JOB']:
                raise BatchException(
                    f"found a reference to a Job object in command '{command}'."
                )
            if groups['BATCH']:
                raise BatchException(
                    f"found a reference to a Batch object in command '{command}'."
                )
            if groups['PYTHON_RESULT'] and not allow_python_results:
                raise BatchException(
                    f"found a reference to a PythonResult object. hint: Use one of the methods `as_str`, `as_json` or `as_repr` on a PythonResult. command: '{command}'"
                )

            assert (groups['RESOURCE_FILE'] or groups['RESOURCE_GROUP']
                    or groups['PYTHON_RESULT'])
            r_uid = match_obj.group()
            r = self._batch._resource_map.get(r_uid)

            if r is None:
                raise BatchException(
                    f"undefined resource '{r_uid}' in command '{command}'.\n"
                    f"Hint: resources must be from the same batch as the current job."
                )

            if r._source != self:
                self._add_inputs(r)
                if r._source is not None:
                    if r not in r._source._valid:
                        name = r._source._resources_inverse[r]
                        raise BatchException(
                            f"undefined resource '{name}'\n"
                            f"Hint: resources must be defined within "
                            f"the job methods 'command' or 'declare_resource_group'"
                        )
                    self._dependencies.add(r._source)
                    r._source._add_internal_outputs(r)
            else:
                _add_resource_to_set(self._valid, r)

            self._mentioned.add(r)
            return '${BATCH_TMPDIR}' + shq(r._get_path(''))
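
The handler above is presumably installed as a replacement callback via `re.sub` over the job's command string, with named groups identifying which kind of reference matched. A simplified, self-contained sketch of that pattern (the regex and group names below are illustrative, not Batch's actual pattern):

import re

# Toy pattern with named alternatives, mimicking the RESOURCE_FILE / RESOURCE_GROUP style.
pattern = re.compile(r'(?P<RESOURCE_FILE>\{file_[0-9]+\})|(?P<RESOURCE_GROUP>\{group_[0-9]+\})')

def handler(match_obj):
    groups = match_obj.groupdict()
    if groups['RESOURCE_FILE']:
        return '${BATCH_TMPDIR}/files/' + match_obj.group()
    assert groups['RESOURCE_GROUP']
    return '${BATCH_TMPDIR}/groups/' + match_obj.group()

command = 'cat {file_1} > {group_2}'
print(pattern.sub(handler, command))
# cat ${BATCH_TMPDIR}/files/{file_1} > ${BATCH_TMPDIR}/groups/{group_2}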
Example #5
    def build(self, batch, code, deploy):
        if self.inputs:
            input_files = []
            for i in self.inputs:
                input_files.append(
                    (f'{BUCKET}/build/{batch.attributes["token"]}{i["from"]}',
                     f'/io/{os.path.basename(i["to"])}'))
        else:
            input_files = None

        config = self.input_config(code, deploy)

        if self.context_path:
            context = f'repo/{self.context_path}'
            init_context = ''
        else:
            context = 'context'
            init_context = 'mkdir context'

        dockerfile = 'Dockerfile'
        render_dockerfile = f'python3 jinja2_render.py {shq(json.dumps(config))} {shq(f"repo/{self.dockerfile}")} Dockerfile'

        if self.publish_as:
            published_latest = shq(
                f'gcr.io/{GCP_PROJECT}/{self.publish_as}:latest')
            # published_latest is already shell-quoted above, so interpolate it directly
            pull_published_latest = f'docker pull {published_latest} || true'
            cache_from_published_latest = f'--cache-from {published_latest}'
        else:
            pull_published_latest = ''
            cache_from_published_latest = ''

        push_image = f'''
time docker push {self.image}
'''
        if deploy and self.publish_as:
            push_image = f'''
docker tag {shq(self.image)} {self.base_image}:latest
docker push {self.base_image}:latest
''' + push_image

        copy_inputs = ''
        if self.inputs:
            for i in self.inputs:
                # to is relative to docker context
                copy_inputs = copy_inputs + f'''
mkdir -p {shq(os.path.dirname(f'{context}{i["to"]}'))}
cp {shq(f'/io/{os.path.basename(i["to"])}')} {shq(f'{context}{i["to"]}')}
'''

        script = f'''
set -ex
date

rm -rf repo
mkdir repo
(cd repo; {code.checkout_script()})
{render_dockerfile}
{init_context}
{copy_inputs}

FROM_IMAGE=$(awk '$1 == "FROM" {{ print $2; exit }}' {shq(dockerfile)})

gcloud -q auth activate-service-account \
  --key-file=/secrets/gcr-push-service-account-key/gcr-push-service-account-key.json
gcloud -q auth configure-docker

docker pull $FROM_IMAGE
{pull_published_latest}
docker build --memory="1.5g" --cpu-period=100000 --cpu-quota=100000 -t {shq(self.image)} \
  -f {dockerfile} \
  --cache-from $FROM_IMAGE {cache_from_published_latest} \
  {context}
{push_image}

date
'''

        log.info(f'step {self.name}, script:\n{script}')

        volumes = [{
            'volume': {
                'name': 'docker-sock-volume',
                'hostPath': {
                    'path': '/var/run/docker.sock',
                    'type': 'File'
                }
            },
            'volume_mount': {
                'mountPath': '/var/run/docker.sock',
                'name': 'docker-sock-volume'
            }
        }, {
            'volume': {
                'name': 'gcr-push-service-account-key',
                'secret': {
                    'optional': False,
                    'secretName': 'gcr-push-service-account-key'
                }
            },
            'volume_mount': {
                'mountPath': '/secrets/gcr-push-service-account-key',
                'name': 'gcr-push-service-account-key',
                'readOnly': True
            }
        }]

        self.job = batch.create_job(CI_UTILS_IMAGE,
                                    command=['bash', '-c', script],
                                    resources={
                                        'requests': {
                                            'memory': '2G',
                                            'cpu': '1'
                                        },
                                        'limits': {
                                            'memory': '2G',
                                            'cpu': '1'
                                        }
                                    },
                                    attributes={'name': self.name},
                                    volumes=volumes,
                                    input_files=input_files,
                                    parents=self.deps_parents())
Example #6
    def build(self, batch, code, scope):
        if self.inputs:
            input_files = []
            for i in self.inputs:
                input_files.append(
                    (f'{BUCKET}/build/{batch.attributes["token"]}{i["from"]}',
                     f'/io/{os.path.basename(i["to"])}'))
        else:
            input_files = None

        config = self.input_config(code, scope)

        if self.context_path:
            context = f'repo/{self.context_path}'
            init_context = ''
        else:
            context = 'context'
            init_context = 'mkdir context'

        rendered_dockerfile = 'Dockerfile'
        if isinstance(self.dockerfile, dict):
            assert ['inline'] == list(self.dockerfile.keys())
            render_dockerfile = f'echo {shq(self.dockerfile["inline"])} > Dockerfile.{self.token};\n'
            unrendered_dockerfile = f'Dockerfile.{self.token}'
        else:
            assert isinstance(self.dockerfile, str)
            render_dockerfile = ''
            unrendered_dockerfile = f'repo/{self.dockerfile}'
        render_dockerfile += (
            f'python3 jinja2_render.py {shq(json.dumps(config))} '
            f'{shq(unrendered_dockerfile)} {shq(rendered_dockerfile)}')

        if self.publish_as:
            published_latest = shq(
                f'gcr.io/{GCP_PROJECT}/{self.publish_as}:latest')
            # published_latest is already shell-quoted above, so interpolate it directly
            pull_published_latest = f'retry docker pull {published_latest} || true'
            cache_from_published_latest = f'--cache-from {published_latest}'
        else:
            pull_published_latest = ''
            cache_from_published_latest = ''

        push_image = f'''
time retry docker push {self.image}
'''
        if scope == 'deploy' and self.publish_as and not is_test_deployment:
            push_image = f'''
docker tag {shq(self.image)} {self.base_image}:latest
retry docker push {self.base_image}:latest
''' + push_image

        copy_inputs = ''
        if self.inputs:
            for i in self.inputs:
                # to is relative to docker context
                copy_inputs = copy_inputs + f'''
mkdir -p {shq(os.path.dirname(f'{context}{i["to"]}'))}
cp {shq(f'/io/{os.path.basename(i["to"])}')} {shq(f'{context}{i["to"]}')}
'''

        script = f'''
set -ex
date

{ RETRY_FUNCTION_SCRIPT }

rm -rf repo
mkdir repo
(cd repo; {code.checkout_script()})
{render_dockerfile}
{init_context}
{copy_inputs}

FROM_IMAGE=$(awk '$1 == "FROM" {{ print $2; exit }}' {shq(rendered_dockerfile)})

gcloud -q auth activate-service-account \
  --key-file=/secrets/gcr-push-service-account-key/gcr-push-service-account-key.json
gcloud -q auth configure-docker

retry docker pull $FROM_IMAGE
{pull_published_latest}
docker build --memory="1.5g" --cpu-period=100000 --cpu-quota=100000 -t {shq(self.image)} \
  -f {rendered_dockerfile} \
  --cache-from $FROM_IMAGE {cache_from_published_latest} \
  {context}
{push_image}

date
'''

        log.info(f'step {self.name}, script:\n{script}')

        self.job = batch.create_job(CI_UTILS_IMAGE,
                                    command=['bash', '-c', script],
                                    mount_docker_socket=True,
                                    secrets=[{
                                        'namespace': BATCH_PODS_NAMESPACE,
                                        'name': 'gcr-push-service-account-key',
                                        'mount_path': '/secrets/gcr-push-service-account-key'
                                    }],
                                    resources={
                                        'memory': '2G',
                                        'cpu': '1'
                                    },
                                    attributes={'name': self.name},
                                    input_files=input_files,
                                    parents=self.deps_parents())
Example #7
    def _run(self,
             batch: 'batch.Batch',
             dry_run: bool,
             verbose: bool,
             delete_scratch_on_exit: bool,
             **backend_kwargs):  # pylint: disable=R0915
        """
        Execute a batch.

        Warning
        -------
        This method should not be called directly. Instead, use :meth:`.batch.Batch.run`.

        Parameters
        ----------
        batch:
            Batch to execute.
        dry_run:
            If `True`, don't execute code.
        verbose:
            If `True`, print debugging output.
        delete_scratch_on_exit:
            If `True`, delete temporary directories with intermediate files.
        """

        if backend_kwargs:
            raise ValueError(f'LocalBackend does not support any of these keywords: {backend_kwargs}')

        tmpdir = self._get_scratch_dir()

        lines = ['set -e' + ('x' if verbose else ''),
                 '\n',
                 '# cd into the scratch (tmp) directory',
                 f"cd {tmpdir}",
                 '\n']

        copied_input_resource_files = set()
        os.makedirs(tmpdir + '/inputs/', exist_ok=True)

        if batch.requester_pays_project:
            requester_pays_project = f'-u {batch.requester_pays_project}'
        else:
            requester_pays_project = ''

        def copy_input(job, r):
            if isinstance(r, resource.InputResourceFile):
                if r not in copied_input_resource_files:
                    copied_input_resource_files.add(r)

                    if r._input_path.startswith('gs://'):
                        return [f'gsutil {requester_pays_project} cp {shq(r._input_path)} {shq(r._get_path(tmpdir))}']

                    absolute_input_path = os.path.realpath(r._input_path)

                    dest = r._get_path(tmpdir)
                    dir = os.path.dirname(dest)
                    os.makedirs(dir, exist_ok=True)

                    if job._image is not None:  # pylint: disable-msg=W0640
                        return [f'cp {shq(absolute_input_path)} {shq(dest)}']

                    return [f'ln -sf {shq(absolute_input_path)} {shq(dest)}']

                return []

            assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
            return []

        def copy_external_output(r):
            def _cp(dest):
                if not dest.startswith('gs://'):
                    dest = os.path.abspath(dest)
                    directory = os.path.dirname(dest)
                    os.makedirs(directory, exist_ok=True)
                    return 'cp'
                return f'gsutil {requester_pays_project} cp'

            if isinstance(r, resource.InputResourceFile):
                return [f'{_cp(dest)} {shq(r._input_path)} {shq(dest)}'
                        for dest in r._output_paths]

            assert isinstance(r, (resource.JobResourceFile, resource.PythonResult))
            return [f'{_cp(dest)} {r._get_path(tmpdir)} {shq(dest)}'
                    for dest in r._output_paths]

        def symlink_input_resource_group(r):
            symlinks = []
            if isinstance(r, resource.ResourceGroup) and r._source is None:
                for name, irf in r._resources.items():
                    src = irf._get_path(tmpdir)
                    dest = f'{r._get_path(tmpdir)}.{name}'
                    symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
            return symlinks

        write_inputs = [x for r in batch._input_resources for x in copy_external_output(r)]
        if write_inputs:
            lines += ["# Write input resources to output destinations"]
            lines += write_inputs
            lines += ['\n']

        for job in batch._jobs:
            if isinstance(job, _job.PythonJob):
                job._compile(tmpdir, tmpdir)

            os.makedirs(f'{tmpdir}/{job._job_id}/', exist_ok=True)

            lines.append(f"# {job._job_id}: {job.name if job.name else ''}")

            lines += [x for r in job._inputs for x in copy_input(job, r)]
            lines += [x for r in job._mentioned for x in symlink_input_resource_group(r)]

            resource_defs = [r._declare(tmpdir) for r in job._mentioned]
            env = [f'export {k}={v}' for k, v in job._env.items()]

            job_shell = job._shell if job._shell else self._DEFAULT_SHELL

            defs = '; '.join(resource_defs) + '; ' if resource_defs else ''
            joined_env = '; '.join(env) + '; ' if env else ''

            cmd = " && ".join(f'{{\n{x}\n}}' for x in job._command)

            quoted_job_script = shq(joined_env + defs + cmd)

            if job._image:
                cpu = f'--cpus={job._cpu}' if job._cpu else ''

                memory = job._memory
                if memory is not None:
                    memory_ratios = {'lowmem': 1024**3, 'standard': 4 * 1024**3, 'highmem': 7 * 1024**3}
                    if memory in memory_ratios:
                        if job._cpu is not None:
                            mcpu = parse_cpu_in_mcpu(job._cpu)
                            if mcpu is not None:
                                memory = str(int(memory_ratios[memory] * (mcpu / 1000)))
                            else:
                                raise BatchException(f'invalid value for cpu: {job._cpu}')
                        else:
                            raise BatchException(f'must specify cpu when using {memory} to specify the memory')
                    memory = f'-m {memory}' if memory else ''

                lines.append(f"docker run "
                             "--entrypoint=''"
                             f"{self._extra_docker_run_flags} "
                             f"-v {tmpdir}:{tmpdir} "
                             f"-w {tmpdir} "
                             f"{memory} "
                             f"{cpu} "
                             f"{job._image} "
                             f"{job_shell} -c {quoted_job_script}")
            else:
                lines.append(f"{job_shell} -c {quoted_job_script}")

            lines += [x for r in job._external_outputs for x in copy_external_output(r)]
            lines += ['\n']

        script = "\n".join(lines)

        if dry_run:
            print(script)
        else:
            try:
                sp.check_call(script, shell=True)
            except sp.CalledProcessError as e:
                print(e)
                print(e.output)
                raise
            finally:
                if delete_scratch_on_exit:
                    sp.run(f'rm -rf {tmpdir}', shell=True, check=False)

        print('Batch completed successfully!')
Example #8
 def _get_path(self, directory):
     assert self._source is not None
     assert self._value is not None
     return shq(directory + '/' + self._source._uid + '/' + self._value)
Example #9
 def _get_path(self, directory):
     assert self._value is not None
     return shq(directory + '/inputs/' + self._value)
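
Note that both `_get_path` variants above return an already shell-quoted path, so callers can interpolate the result directly into a command string. A tiny illustration under that assumption (the class here is invented for the example):

import shlex

class FakeInputResourceFile:
    def __init__(self, value):
        self._value = value

    def _get_path(self, directory):
        assert self._value is not None
        # Equivalent to shq(...) in the examples above.
        return shlex.quote(directory + '/inputs/' + self._value)

r = FakeInputResourceFile('my data.vcf')
print(f"cp {r._get_path('/tmp/batch')} /io/")   # path comes back safely quoted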
Example #10
    async def _async_run(
            self,
            batch: 'batch.Batch',
            dry_run: bool,
            verbose: bool,
            delete_scratch_on_exit: bool,
            wait: bool = True,
            open: bool = False,
            disable_progress_bar: bool = False,
            callback: Optional[str] = None,
            token: Optional[str] = None,
            **backend_kwargs):  # pylint: disable-msg=too-many-statements
        if backend_kwargs:
            raise ValueError(
                f'ServiceBackend does not support any of these keywords: {backend_kwargs}'
            )

        build_dag_start = time.time()

        uid = uuid.uuid4().hex[:6]
        batch_remote_tmpdir = f'{self.remote_tmpdir}{uid}'
        local_tmpdir = f'/io/batch/{uid}'

        default_image = 'ubuntu:20.04'

        attributes = copy.deepcopy(batch.attributes)
        if batch.name is not None:
            attributes['name'] = batch.name

        bc_batch = self._batch_client.create_batch(
            attributes=attributes,
            callback=callback,
            token=token,
            cancel_after_n_failures=batch._cancel_after_n_failures)

        n_jobs_submitted = 0
        used_remote_tmpdir = False

        job_to_client_job_mapping: Dict[_job.Job, bc.Job] = {}
        jobs_to_command = {}
        commands = []

        bash_flags = 'set -e' + ('x' if verbose else '')

        def copy_input(r):
            if isinstance(r, resource.InputResourceFile):
                return [(r._input_path, r._get_path(local_tmpdir))]
            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(batch_remote_tmpdir),
                     r._get_path(local_tmpdir))]

        def copy_internal_output(r):
            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(local_tmpdir),
                     r._get_path(batch_remote_tmpdir))]

        def copy_external_output(r):
            if isinstance(r, resource.InputResourceFile):
                return [(r._input_path, dest) for dest in r._output_paths]
            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return [(r._get_path(local_tmpdir), dest)
                    for dest in r._output_paths]

        def symlink_input_resource_group(r):
            symlinks = []
            if isinstance(r, resource.ResourceGroup) and r._source is None:
                for name, irf in r._resources.items():
                    src = irf._get_path(local_tmpdir)
                    dest = f'{r._get_path(local_tmpdir)}.{name}'
                    symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
            return symlinks

        write_external_inputs = [
            x for r in batch._input_resources for x in copy_external_output(r)
        ]
        if write_external_inputs:
            transfers_bytes = orjson.dumps([{
                "from": src,
                "to": dest
            } for src, dest in write_external_inputs])
            transfers = transfers_bytes.decode('utf-8')
            write_cmd = [
                'python3', '-m', 'hailtop.aiotools.copy', 'null', transfers
            ]
            if dry_run:
                commands.append(' '.join(shq(x) for x in write_cmd))
            else:
                j = bc_batch.create_job(
                    image=HAIL_GENETICS_HAIL_IMAGE,
                    command=write_cmd,
                    attributes={'name': 'write_external_inputs'})
                jobs_to_command[j] = ' '.join(shq(x) for x in write_cmd)
                n_jobs_submitted += 1

        pyjobs = [j for j in batch._jobs if isinstance(j, _job.PythonJob)]
        for job in pyjobs:
            if job._image is None:
                version = sys.version_info
                if version.major != 3 or version.minor not in (6, 7, 8):
                    raise BatchException(
                        f"You must specify 'image' for Python jobs if you are using a Python version other than 3.6, 3.7, or 3.8 (you are using {version})"
                    )
                job._image = f'hailgenetics/python-dill:{version.major}.{version.minor}-slim'

        with tqdm(total=len(batch._jobs),
                  desc='upload code',
                  disable=disable_progress_bar) as pbar:

            async def compile_job(job):
                used_remote_tmpdir = await job._compile(local_tmpdir,
                                                        batch_remote_tmpdir,
                                                        dry_run=dry_run)
                pbar.update(1)
                return used_remote_tmpdir

            used_remote_tmpdir_results = await bounded_gather(
                *[functools.partial(compile_job, j) for j in batch._jobs],
                parallelism=150)
            used_remote_tmpdir |= any(used_remote_tmpdir_results)

        for job in tqdm(batch._jobs,
                        desc='create job objects',
                        disable=disable_progress_bar):
            inputs = [x for r in job._inputs for x in copy_input(r)]

            outputs = [
                x for r in job._internal_outputs
                for x in copy_internal_output(r)
            ]
            if outputs:
                used_remote_tmpdir = True
            outputs += [
                x for r in job._external_outputs
                for x in copy_external_output(r)
            ]

            symlinks = [
                x for r in job._mentioned
                for x in symlink_input_resource_group(r)
            ]

            if job._image is None:
                if verbose:
                    print(
                        f"Using image '{default_image}' since no image was specified."
                    )

            make_local_tmpdir = f'mkdir -p {local_tmpdir}/{job._dirname}'

            job_command = [cmd.strip() for cmd in job._wrapper_code]
            prepared_job_command = (f'{{\n{x}\n}}' for x in job_command)
            cmd = f'''
{bash_flags}
{make_local_tmpdir}
{"; ".join(symlinks)}
{" && ".join(prepared_job_command)}
'''

            user_code = '\n\n'.join(job._user_code) if job._user_code else None

            if dry_run:
                formatted_command = f'''
================================================================================
# Job {job._job_id} {f": {job.name}" if job.name else ''}

--------------------------------------------------------------------------------
## USER CODE
--------------------------------------------------------------------------------
{user_code}

--------------------------------------------------------------------------------
## COMMAND
--------------------------------------------------------------------------------
{cmd}
================================================================================
'''
                commands.append(formatted_command)
                continue

            parents = [job_to_client_job_mapping[j] for j in job._dependencies]

            attributes = copy.deepcopy(
                job.attributes) if job.attributes else {}
            if job.name:
                attributes['name'] = job.name

            resources: Dict[str, Any] = {}
            if job._cpu:
                resources['cpu'] = job._cpu
            if job._memory:
                resources['memory'] = job._memory
            if job._storage:
                resources['storage'] = job._storage
            if job._machine_type:
                resources['machine_type'] = job._machine_type
            if job._preemptible is not None:
                resources['preemptible'] = job._preemptible

            image = job._image if job._image else default_image
            image_ref = parse_docker_image_reference(image)
            if (image_ref.hosted_in('dockerhub')
                    and image_ref.name() not in HAIL_GENETICS_IMAGES):
                warnings.warn(f'Using an image {image} from Docker Hub. '
                              f'Jobs may fail due to Docker Hub rate limits.')

            env = {**job._env, 'BATCH_TMPDIR': local_tmpdir}

            j = bc_batch.create_job(
                image=image,
                command=[
                    job._shell if job._shell else DEFAULT_SHELL, '-c', cmd
                ],
                parents=parents,
                attributes=attributes,
                resources=resources,
                input_files=inputs if len(inputs) > 0 else None,
                output_files=outputs if len(outputs) > 0 else None,
                always_run=job._always_run,
                timeout=job._timeout,
                cloudfuse=job._cloudfuse if len(job._cloudfuse) > 0 else None,
                env=env,
                requester_pays_project=batch.requester_pays_project,
                mount_tokens=True,
                user_code=user_code)

            n_jobs_submitted += 1

            job_to_client_job_mapping[job] = j
            jobs_to_command[j] = cmd

        if dry_run:
            print("\n\n".join(commands))
            return None

        if delete_scratch_on_exit and used_remote_tmpdir:
            parents = list(jobs_to_command.keys())
            j = bc_batch.create_job(image=HAIL_GENETICS_HAIL_IMAGE,
                                    command=[
                                        'python3', '-m',
                                        'hailtop.aiotools.delete',
                                        batch_remote_tmpdir
                                    ],
                                    parents=parents,
                                    attributes={'name': 'remove_tmpdir'},
                                    always_run=True)
            # record the cleanup command itself, not the stale `cmd` left over from the job loop
            jobs_to_command[j] = f'python3 -m hailtop.aiotools.delete {batch_remote_tmpdir}'
            n_jobs_submitted += 1

        if verbose:
            print(
                f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - build_dag_start, 3)} seconds.'
            )

        submit_batch_start = time.time()
        batch_handle = bc_batch.submit(
            disable_progress_bar=disable_progress_bar)

        jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

        if verbose:
            print(
                f'Submitted batch {batch_handle.id} with {n_jobs_submitted} jobs in {round(time.time() - submit_batch_start, 3)} seconds:'
            )
            for jid, cmd in jobs_to_command.items():
                print(f'{jid}: {cmd}')
            print('')

        deploy_config = get_deploy_config()
        url = deploy_config.url('batch', f'/batches/{batch_handle.id}')
        print(f'Submitted batch {batch_handle.id}, see {url}')

        if open:
            webbrowser.open(url)
        if wait:
            print(f'Waiting for batch {batch_handle.id}...')
            status = batch_handle.wait()
            print(f'batch {batch_handle.id} complete: {status["state"]}')
        return batch_handle
Example #11
    def _run(self, batch: 'batch.Batch', dry_run: bool, verbose: bool,
             delete_scratch_on_exit: bool, **backend_kwargs) -> None:  # pylint: disable=R0915
        """
        Execute a batch.

        Warning
        -------
        This method should not be called directly. Instead, use :meth:`.batch.Batch.run`.

        Parameters
        ----------
        batch:
            Batch to execute.
        dry_run:
            If `True`, don't execute code.
        verbose:
            If `True`, print debugging output.
        delete_scratch_on_exit:
            If `True`, delete temporary directories with intermediate files.
        """

        if backend_kwargs:
            raise ValueError(
                f'LocalBackend does not support any of these keywords: {backend_kwargs}'
            )

        tmpdir = self._get_scratch_dir()

        def new_code_block():
            return [
                'set -e' + ('x' if verbose else ''), '\n',
                '# cd into the scratch (tmp) directory', f"cd {tmpdir}", '\n'
            ]

        def run_code(code):
            code = '\n'.join(code)
            if dry_run:
                print(code)
            else:
                try:
                    sp.check_call(code, shell=True)
                except sp.CalledProcessError as e:
                    print(e)
                    print(e.output)
                    raise

        copied_input_resource_files = set()
        os.makedirs(tmpdir + '/inputs/', exist_ok=True)

        requester_pays_project_json = orjson.dumps(
            batch.requester_pays_project).decode('utf-8')

        def copy_input(job, r):
            if isinstance(r, resource.InputResourceFile):
                if r not in copied_input_resource_files:
                    copied_input_resource_files.add(r)

                    input_scheme = url_scheme(r._input_path)
                    if input_scheme != '':
                        transfers_bytes = orjson.dumps([{
                            "from": r._input_path,
                            "to": r._get_path(tmpdir)
                        }])
                        transfers = transfers_bytes.decode('utf-8')
                        return [
                            f'python3 -m hailtop.aiotools.copy {shq(requester_pays_project_json)} {shq(transfers)}'
                        ]

                    absolute_input_path = os.path.realpath(
                        os.path.expanduser(r._input_path))

                    dest = r._get_path(os.path.expanduser(tmpdir))
                    dir = os.path.dirname(dest)
                    os.makedirs(dir, exist_ok=True)

                    if job._image is not None:  # pylint: disable-msg=W0640
                        return [f'cp {shq(absolute_input_path)} {shq(dest)}']

                    return [f'ln -sf {shq(absolute_input_path)} {shq(dest)}']

                return []

            assert isinstance(
                r, (resource.JobResourceFile, resource.PythonResult))
            return []

        def symlink_input_resource_group(r):
            symlinks = []
            if isinstance(r, resource.ResourceGroup) and r._source is None:
                for name, irf in r._resources.items():
                    src = irf._get_path(tmpdir)
                    dest = f'{r._get_path(tmpdir)}.{name}'
                    symlinks.append(f'ln -sf {shq(src)} {shq(dest)}')
            return symlinks

        def transfer_dicts_for_resource_file(
            res_file: Union[resource.ResourceFile, resource.PythonResult]
        ) -> List[dict]:
            if isinstance(res_file, resource.InputResourceFile):
                source = res_file._input_path
            else:
                assert isinstance(
                    res_file,
                    (resource.JobResourceFile, resource.PythonResult))
                source = res_file._get_path(tmpdir)

            return [{
                "from": source,
                "to": dest
            } for dest in res_file._output_paths]

        try:
            input_transfer_dicts = [
                transfer_dict for input_resource in batch._input_resources
                for transfer_dict in transfer_dicts_for_resource_file(
                    input_resource)
            ]

            if input_transfer_dicts:
                input_transfers = orjson.dumps(input_transfer_dicts).decode(
                    'utf-8')
                code = new_code_block()
                code += ["# Write input resources to output destinations"]
                code += [
                    f'python3 -m hailtop.aiotools.copy {shq(requester_pays_project_json)} {shq(input_transfers)}'
                ]
                code += ['\n']
                run_code(code)

            for job in batch._jobs:
                async_to_blocking(job._compile(tmpdir, tmpdir))

                os.makedirs(f'{tmpdir}/{job._dirname}/', exist_ok=True)

                code = new_code_block()

                code.append(f"# {job._job_id}: {job.name if job.name else ''}")

                if job._user_code:
                    code.append('# USER CODE')
                    user_code = [
                        f'# {line}' for cmd in job._user_code
                        for line in cmd.split('\n')
                    ]
                    code.append('\n'.join(user_code))

                code += [x for r in job._inputs for x in copy_input(job, r)]
                code += [
                    x for r in job._mentioned
                    for x in symlink_input_resource_group(r)
                ]

                env = {**job._env, 'BATCH_TMPDIR': tmpdir}
                env_declarations = [f'export {k}={v}' for k, v in env.items()]
                joined_env = '; '.join(env_declarations) + '; ' if env else ''

                job_shell = job._shell if job._shell else DEFAULT_SHELL

                cmd = " && ".join(f'{{\n{x}\n}}' for x in job._wrapper_code)

                quoted_job_script = shq(joined_env + cmd)

                if job._image:
                    cpu = f'--cpus={job._cpu}' if job._cpu else ''

                    memory = job._memory
                    if memory is not None:
                        memory_ratios = {
                            'lowmem': 1024**3,
                            'standard': 4 * 1024**3,
                            'highmem': 7 * 1024**3
                        }
                        if memory in memory_ratios:
                            if job._cpu is not None:
                                mcpu = parse_cpu_in_mcpu(job._cpu)
                                if mcpu is not None:
                                    memory = str(
                                        int(memory_ratios[memory] *
                                            (mcpu / 1000)))
                                else:
                                    raise BatchException(
                                        f'invalid value for cpu: {job._cpu}')
                            else:
                                raise BatchException(
                                    f'must specify cpu when using {memory} to specify the memory'
                                )
                        memory = f'-m {memory}' if memory else ''
                    else:
                        memory = ''

                    code.append(f"docker run "
                                "--entrypoint=''"
                                f"{self._extra_docker_run_flags} "
                                f"-v {tmpdir}:{tmpdir} "
                                f"-w {tmpdir} "
                                f"{memory} "
                                f"{cpu} "
                                f"{job._image} "
                                f"{job_shell} -c {quoted_job_script}")
                else:
                    code.append(f"{job_shell} -c {quoted_job_script}")

                output_transfer_dicts = [
                    transfer_dict for output_resource in job._external_outputs
                    for transfer_dict in transfer_dicts_for_resource_file(
                        output_resource)
                ]
                output_transfers = orjson.dumps(output_transfer_dicts).decode(
                    'utf-8')

                code += [
                    f'python3 -m hailtop.aiotools.copy {shq(requester_pays_project_json)} {shq(output_transfers)}'
                ]
                code += ['\n']

                run_code(code)
        finally:
            if delete_scratch_on_exit:
                sp.run(f'rm -rf {tmpdir}', shell=True, check=False)

        print('Batch completed successfully!')
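
The `_run` implementations above back `Batch.run` on the local backend. A hedged end-to-end usage sketch based on the public `hailtop.batch` API as I understand it (the output path and command are illustrative):

import hailtop.batch as hb

# Run a one-job batch entirely on the local machine.
backend = hb.LocalBackend()
b = hb.Batch(backend=backend, name='local-example')
j = b.new_job(name='hello')
j.command(f'echo "hello world" > {j.ofile}')
b.write_output(j.ofile, '/tmp/hello.txt')   # illustrative destination
b.run(verbose=True)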
Example #12
    async def build(self, batch, pr):
        if self.inputs:
            input_files = []
            for i in self.inputs:
                input_files.append((f'{BUCKET}/build/{batch.attributes["token"]}{i["from"]}', f'/io/{os.path.basename(i["to"])}'))
        else:
            input_files = None

        config = self.input_config(pr)

        if self.context_path:
            context = f'repo/{self.context_path}'
            init_context = ''
        else:
            context = 'context'
            init_context = 'mkdir context'

        dockerfile = 'Dockerfile'
        render_dockerfile = f'python3 jinja2_render.py {shq(json.dumps(config))} {shq(f"repo/{self.dockerfile}")} Dockerfile'

        if self.publish_as:
            published_latest = shq(f'gcr.io/{GCP_PROJECT}/{self.publish_as}:latest')
            # published_latest is already shell-quoted above, so interpolate it directly
            pull_published_latest = f'docker pull {published_latest} || true'
            cache_from_published_latest = f'--cache-from {published_latest}'
        else:
            pull_published_latest = ''
            cache_from_published_latest = ''

        copy_inputs = ''
        if self.inputs:
            for i in self.inputs:
                # to is relative to docker context
                copy_inputs = copy_inputs + f'''
mkdir -p {shq(os.path.dirname(f'{context}{i["to"]}'))}
mv {shq(f'/io/{os.path.basename(i["to"])}')} {shq(f'{context}{i["to"]}')}
'''

        script = f'''
set -ex

git clone {shq(pr.target_branch.branch.repo.url)} repo

git -C repo config user.email [email protected]
git -C repo config user.name hail-ci-leader

git -C repo remote add {shq(pr.source_repo.short_str())} {shq(pr.source_repo.url)}
git -C repo fetch -q {shq(pr.source_repo.short_str())}
git -C repo checkout {shq(pr.target_branch.sha)}
git -C repo merge {shq(pr.source_sha)} -m 'merge PR'

{render_dockerfile}
{init_context}
{copy_inputs}

FROM_IMAGE=$(awk '$1 == "FROM" {{ print $2; exit }}' {shq(dockerfile)})

gcloud -q auth activate-service-account \
  --key-file=/secrets/gcr-push-service-account-key/gcr-push-service-account-key.json
gcloud -q auth configure-docker

docker pull $FROM_IMAGE
{pull_published_latest}
docker build -t {shq(self.image)} \
  -f {dockerfile} \
  --cache-from $FROM_IMAGE {cache_from_published_latest} \
  {context}
docker push {shq(self.image)}
'''

        log.info(f'step {self.name}, script:\n{script}')

        volumes = [{
            'volume': {
                'name': 'docker-sock-volume',
                'hostPath': {
                    'path': '/var/run/docker.sock',
                    'type': 'File'
                }
            },
            'volume_mount': {
                'mountPath': '/var/run/docker.sock',
                'name': 'docker-sock-volume'
            }
        }, {
            'volume': {
                'name': 'gcr-push-service-account-key',
                'secret': {
                    'optional': False,
                    'secretName': 'gcr-push-service-account-key'
                }
            },
            'volume_mount': {
                'mountPath': '/secrets/gcr-push-service-account-key',
                'name': 'gcr-push-service-account-key',
                'readOnly': True
            }
        }]

        sa = None
        if self.inputs is not None:
            sa = 'ci2'

        self.job = await batch.create_job(CI_UTILS_IMAGE,
                                          command=['bash', '-c', script],
                                          attributes={'name': self.name},
                                          volumes=volumes,
                                          input_files=input_files,
                                          copy_service_account_name=sa,
                                          parent_ids=self.deps_parent_ids())