Example #1
    def _run(self, pipeline, dry_run, verbose,
             delete_scratch_on_exit):  # pylint: disable=too-many-statements
        build_dag_start = time.time()

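        # Give each run a unique scratch area, mirrored in the batch
        # bucket on GCS and under /io on each worker.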
        bucket = self._batch_client.bucket
        subdir_name = 'pipeline-{}'.format(uuid.uuid4().hex[:12])

        remote_tmpdir = f'gs://{bucket}/pipeline/{subdir_name}'
        local_tmpdir = f'/io/pipeline/{subdir_name}'

        default_image = 'ubuntu:latest'

        attributes = pipeline.attributes
        if pipeline.name is not None:
            attributes['name'] = pipeline.name

        batch = self._batch_client.create_batch(attributes=attributes)

        n_jobs_submitted = 0
        used_remote_tmpdir = False

        task_to_job_mapping = {}
        jobs_to_command = {}
        commands = []

        bash_flags = 'set -e' + ('x' if verbose else '') + '; '

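        # gcloud/gsutil commands run inside jobs must first authenticate
        # as the batch service account.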
        activate_service_account = 'gcloud -q auth activate-service-account ' \
                                   '--key-file=/gsa-key/privateKeyData'

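        # Helpers mapping a resource to the (src, dst) pairs used to
        # stage files into and out of a job.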
        def copy_input(r):
            if isinstance(r, InputResourceFile):
                return [(r._input_path, r._get_path(local_tmpdir))]
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(remote_tmpdir), r._get_path(local_tmpdir))]

        def copy_internal_output(r):
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(local_tmpdir), r._get_path(remote_tmpdir))]

        def copy_external_output(r):
            if isinstance(r, InputResourceFile):
                return [(r._input_path, dest) for dest in r._output_paths]
            assert isinstance(r, TaskResourceFile)
            return [(r._get_path(local_tmpdir), dest)
                    for dest in r._output_paths]

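        # Input resources that were also assigned output paths get copied
        # to those destinations by a dedicated first job.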
        write_external_inputs = [
            x for r in pipeline._input_resources
            for x in copy_external_output(r)
        ]
        if write_external_inputs:

            def _cp(src, dst):
                return f'gsutil -m cp -R {src} {dst}'

            write_cmd = bash_flags + activate_service_account + ' && ' + \
                ' && '.join([_cp(*files) for files in write_external_inputs])

            if dry_run:
                commands.append(write_cmd)
            else:
                j = batch.create_job(
                    image='google/cloud-sdk:237.0.0-alpine',
                    command=['/bin/bash', '-c', write_cmd],
                    attributes={'name': 'write_external_inputs'})
                jobs_to_command[j] = write_cmd
                n_jobs_submitted += 1

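        # Translate each pipeline task into one batch job, staging its
        # files and wiring its dependencies.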
        for task in pipeline._tasks:
            inputs = [x for r in task._inputs for x in copy_input(r)]

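            # Internal outputs are staged to remote scratch (which must
            # then be cleaned up); external outputs go directly to
            # user-requested paths.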
            outputs = [
                x for r in task._internal_outputs
                for x in copy_internal_output(r)
            ]
            if outputs:
                used_remote_tmpdir = True
            outputs += [
                x for r in task._external_outputs
                for x in copy_external_output(r)
            ]

            resource_defs = [
                r._declare(directory=local_tmpdir) for r in task._mentioned
            ]

            if task._image is None:
                if verbose:
                    print(
                        f"Using image '{default_image}' since no image was specified."
                    )

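            # Build the job's shell command: create the task's scratch
            # dir, define resource path variables, then chain the task's
            # commands.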
            make_local_tmpdir = f'mkdir -p {local_tmpdir}/{task._uid}/; '
            defs = '; '.join(resource_defs) + '; ' if resource_defs else ''
            task_command = [cmd.strip() for cmd in task._command]

            cmd = bash_flags + make_local_tmpdir + defs + " && ".join(
                task_command)
            if dry_run:
                commands.append(cmd)
                continue

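            # Parent jobs encode this task's dependency edges in the DAG.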
            parents = [task_to_job_mapping[t] for t in task._dependencies]

            attributes = {'task_uid': task._uid}
            if task.name:
                attributes['name'] = task.name
            attributes.update(task.attributes)

            resources = {}
            if task._cpu:
                resources['cpu'] = task._cpu
            if task._memory:
                resources['memory'] = task._memory

            j = batch.create_job(
                image=task._image if task._image else default_image,
                command=['/bin/bash', '-c', cmd],
                parents=parents,
                attributes=attributes,
                resources=resources,
                input_files=inputs if len(inputs) > 0 else None,
                output_files=outputs if len(outputs) > 0 else None,
                pvc_size=task._storage)
            n_jobs_submitted += 1

            task_to_job_mapping[task] = j
            jobs_to_command[j] = cmd

        if dry_run:
            print("\n\n".join(commands))
            return

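        # If any job wrote to remote scratch, append an always-run cleanup
        # job that removes it after all other jobs finish.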
        if delete_scratch_on_exit and used_remote_tmpdir:
            parents = list(jobs_to_command.keys())
            rm_cmd = f'gsutil -m rm -r {remote_tmpdir}'
            cmd = bash_flags + f'{activate_service_account} && {rm_cmd}'
            j = batch.create_job(image='google/cloud-sdk:237.0.0-alpine',
                                 command=['/bin/bash', '-c', cmd],
                                 parents=parents,
                                 attributes={'name': 'remove_tmpdir'},
                                 always_run=True)
            jobs_to_command[j] = cmd
            n_jobs_submitted += 1

        if verbose:
            print(
                f'Built DAG with {n_jobs_submitted} jobs in {round(time.time() - build_dag_start, 3)} seconds.'
            )

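        # The DAG is fully built; submit it as a single batch.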
        submit_batch_start = time.time()
        batch = batch.submit()

        jobs_to_command = {j.id: cmd for j, cmd in jobs_to_command.items()}

        if verbose:
            print(
                f'Submitted batch {batch.id} with {n_jobs_submitted} jobs in {round(time.time() - submit_batch_start, 3)} seconds:'
            )
            for jid, cmd in jobs_to_command.items():
                print(f'{jid}: {cmd}')

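        # Block until the batch reaches a terminal state.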
        status = batch.wait()

        if status['state'] == 'success':
            print('Pipeline completed successfully!')
            return

        # Collect (job id, overall exit code) pairs for every job that
        # did not exit cleanly.
        failed_jobs = [(j, Job.exit_code(j)) for j in status['jobs']]
        failed_jobs = [((j['batch_id'], j['job_id']), ec)
                       for j, ec in failed_jobs if ec != 0]

        fail_msg = ''
        for jid, ec in failed_jobs:
            job = self._batch_client.get_job(*jid)
            log = job.log()
            name = job.status()['attributes'].get('name', None)
            fail_msg += (f"Job {jid} failed with exit code {ec}:\n"
                         f"  Task name:\t{name}\n"
                         f"  Command:\t{jobs_to_command[jid]}\n"
                         f"  Log:\t{log}\n")

        raise PipelineException(fail_msg)
Example #2
def batch_status_exit_codes(batch_status):
    return [Job._get_exit_codes(j) for j in batch_status['jobs']]
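
A minimal usage sketch (hypothetical data; the exact shape of the status
dict, with per-container exit codes under 'exit_code', is an assumption
based on how Example #1 reads the result of batch.wait()):

# Hypothetical status dict, mimicking what batch.wait() returns in
# Example #1; assumes batch_status_exit_codes from above is in scope.
batch_status = {
    'state': 'failure',
    'jobs': [
        {'batch_id': 7, 'job_id': 1, 'exit_code': {'main': 0}},
        {'batch_id': 7, 'job_id': 2, 'exit_code': {'main': 1}},
    ],
}

# Returns one entry per job, in submission order.
print(batch_status_exit_codes(batch_status))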