Example #1
 def _metadata_to_json(self, run_id, step_name, task_id, metadata):
     user = get_username()
     return [{
         'flow_id': self._flow_name,
         'run_number': run_id,
         'step_name': step_name,
         'task_id': task_id,
         'field_name': datum.field,
         'type': datum.type,
         'value': datum.value,
         'tags': datum.tags,
         'user_name': user,
         'ts_epoch': int(round(time.time() * 1000))
     } for datum in metadata]
Example #2
 def _metadata_to_json(self, run_id, step_name, task_id, metadata):
     user = get_username()
     return [{
         "flow_id": self._flow_name,
         "run_number": run_id,
         "step_name": step_name,
         "task_id": task_id,
         "field_name": datum.field,
         "type": datum.type,
         "value": datum.value,
         "tags": list(set(datum.tags)) if datum.tags else [],
         "user_name": user,
         "ts_epoch": int(round(time.time() * 1000)),
     } for datum in metadata]
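
A quick note on the input these two methods expect: each datum only needs field, type, value, and tags attributes. The namedtuple below is a hypothetical stand-in for the provider's own datum type, used just to show the shape of the data; the field values are illustrative.

from collections import namedtuple

# Hypothetical stand-in for the provider's metadata datum objects; only the
# attributes read by _metadata_to_json (field, type, value, tags) are needed.
Datum = namedtuple("Datum", ["field", "type", "value", "tags"])

metadata = [
    Datum(field="attempt", type="attempt", value="0", tags=["attempt_id:0"]),
    Datum(field="origin-run-id", type="origin-run-id", value="None", tags=[]),
]

# records = provider._metadata_to_json(run_id, step_name, task_id, metadata)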
Example #3
    def get_runtime_environment(self, runtime_name):
        '''
        Returns a dictionary of environment variables to be set

        Parameters
        ----------
        runtime_name : string
            Name of the runtime for which to get the environment

        Returns
        -------
        dict[string] -> string
            Environment variables from this metadata provider
        '''
        return {'METAFLOW_RUNTIME_NAME': runtime_name, 'USER': get_username()}
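
The returned dictionary is meant to be set as environment variables for the task process. A minimal, self-contained sketch of that hand-off, using a hypothetical provider class in place of the real one:

import os

class DemoMetadataProvider:
    # Hypothetical provider exposing the same method as the example above.
    def get_runtime_environment(self, runtime_name):
        return {
            "METAFLOW_RUNTIME_NAME": runtime_name,
            "USER": os.environ.get("USER", "unknown"),
        }

# Export the provider's variables into the current process environment.
os.environ.update(DemoMetadataProvider().get_runtime_environment("local"))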
Example #4
 def __init__(self, name, flow, graph, code_package, code_package_url,
              metadata, datastore, environment, event_logger, monitor,
              image, image_pull_secrets, env, env_from, labels, annotations,
              max_workers, volumes, workflow_timeout):
     self.name = name
     self.flow = flow
     self.graph = graph
     self.code_package = code_package
     self.code_package_url = code_package_url
     self.metadata = metadata
     self.datastore = datastore
     self.environment = environment
     self.event_logger = event_logger
     self.monitor = monitor
     self.image = image
     self.image_pull_secrets = image_pull_secrets
     self.env = env
     self.env_from = env_from
     self.labels = labels
     self.annotations = annotations
     self.volumes = volumes
     self.attributes = {
         'labels': {
             'app': 'metaflow',
             'metaflow/workflow_template': name,
             'app.kubernetes.io/created-by': get_username(),
         },
         # TODO: Add annotations based on https://kubernetes.io/blog/2021/04/20/annotating-k8s-for-humans/
         'annotations': {
             'metaflow/flow_name': self.flow.name,
         },
     }
     # Add Metaflow system tags as labels
     self.system_tags = {
         "metaflow/%s" % sys_tag[:sys_tag.index(":")]:
             sanitize_label_value(sys_tag[sys_tag.index(":") + 1:])
         for sys_tag in self.metadata.sticky_sys_tags
     }
     self.attributes['labels'].update(self.system_tags)
     self.max_workers = max_workers
     self.workflow_timeout = workflow_timeout
     self._flow_attributes = self._parse_flow_decorator()
     self._workflow = remove_empty_elements(self._compile())
     self._cron = self._cron()
Example #5
 def _prepare_environment(self, attr, env_decorator):
     default = {
         'METAFLOW_USER': get_username(),
         'METAFLOW_DATASTORE_SYSROOT_S3': DATASTORE_SYSROOT_S3,
     }
     if DEFAULT_METADATA:
         default['METAFLOW_DEFAULT_METADATA'] = DEFAULT_METADATA
     if METADATA_SERVICE_URL:
         default['METAFLOW_SERVICE_URL'] = METADATA_SERVICE_URL
     if METADATA_SERVICE_HEADERS:
         default['METADATA_SERVICE_HEADERS'] = METADATA_SERVICE_HEADERS
     # add env vars from @environment decorator if exist
     default.update(env_decorator.get('vars', {}))
     default_env = [{'name': k, 'value': v} for k, v in default.items()]
     env = default_env + self._flow_attributes.get(
         'env', []) + self.env + attr.get('env', [])
     env_from = self._flow_attributes.get(
         'envFrom', []) + self.env_from + attr.get('envFrom', [])
     return env, env_from
Example #6
 def flow_init(self, flow, graph, environment, datastore, logger, echo,
               options):
     self._option_values = options
     project_name = self.attributes.get('name')
     project_flow_name, branch_name = format_name(flow.name, project_name,
                                                  options['production'],
                                                  options['branch'],
                                                  get_username())
     is_user_branch = options['branch'] is None and not options['production']
     echo("Project: *%s*, Branch: *%s*" % (project_name, branch_name),
          fg='magenta',
          highlight='green')
     current._update_env({
         'project_name': project_name,
         'branch_name': branch_name,
         'is_user_branch': is_user_branch,
         'is_production': options['production'],
         'project_flow_name': project_flow_name
     })
Example #7
def make_flow(obj,
              token,
              name,
              tags,
              namespace,
              max_workers,
              workflow_timeout,
              is_project):
    datastore = obj.datastore(obj.flow.name,
                              mode='w',
                              metadata=obj.metadata,
                              event_logger=obj.event_logger,
                              monitor=obj.monitor)
    if datastore.TYPE != 's3':
        raise MetaflowException("AWS Step Functions requires --datastore=s3.")

    # Attach AWS Batch decorator to the flow
    decorators._attach_decorators(obj.flow, [BatchDecorator.name])
    decorators._init_step_decorators(
            obj.flow, obj.graph, obj.environment, obj.datastore, obj.logger)

    obj.package = MetaflowPackage(
        obj.flow, obj.environment, obj.echo, obj.package_suffixes)
    package_url = datastore.save_data(
        obj.package.sha, TransformableObject(obj.package.blob))

    return StepFunctions(name,
                         obj.graph,
                         obj.flow,
                         obj.package,
                         package_url,
                         token,
                         obj.metadata,
                         obj.datastore,
                         obj.environment,
                         obj.event_logger,
                         obj.monitor,
                         tags=tags,
                         namespace=namespace,
                         max_workers=max_workers,
                         username=get_username(),
                         workflow_timeout=workflow_timeout,
                         is_project=is_project)
Example #8
def make_flow(obj, token, name, tags, namespace, max_workers, workflow_timeout,
              workflow_priority):
    # TODO: Make this check less specific to Amazon S3 as we introduce
    #       support for more cloud object stores.
    if obj.flow_datastore.TYPE != "s3":
        raise MetaflowException("Argo Workflows requires --datastore=s3.")

    # Attach @kubernetes and @environment decorator to the flow to
    # ensure that the related decorator hooks are invoked.
    decorators._attach_decorators(
        obj.flow, [KubernetesDecorator.name, EnvironmentDecorator.name])

    decorators._init_step_decorators(obj.flow, obj.graph, obj.environment,
                                     obj.flow_datastore, obj.logger)

    # Save the code package in the flow datastore so that both user code and
    # metaflow package can be retrieved during workflow execution.
    obj.package = MetaflowPackage(obj.flow, obj.environment, obj.echo,
                                  obj.package_suffixes)
    package_url, package_sha = obj.flow_datastore.save_data([obj.package.blob],
                                                            len_hint=1)[0]

    return ArgoWorkflows(
        name,
        obj.graph,
        obj.flow,
        package_sha,
        package_url,
        token,
        obj.metadata,
        obj.flow_datastore,
        obj.environment,
        obj.event_logger,
        obj.monitor,
        tags=tags,
        namespace=namespace,
        max_workers=max_workers,
        username=get_username(),
        workflow_timeout=workflow_timeout,
        workflow_priority=workflow_priority,
    )
Example #9
def _execute_cmd(func, flow_name, run_id, user, my_runs, echo):
    if user and my_runs:
        raise CommandException("--user and --my-runs are mutually exclusive.")

    if run_id and my_runs:
        raise CommandException("--run_id and --my-runs are mutually exclusive.")

    if my_runs:
        user = util.get_username()

    # Resolve the latest locally-recorded run id by default; skip that when a
    # user filter is given without an explicit run id.
    latest_run = True

    if user and not run_id:
        latest_run = False

    if not run_id and latest_run:
        run_id = util.get_latest_run_id(echo, flow_name)
        if run_id is None:
            raise CommandException("A previous run id was not found. Specify --run-id.")

    func(flow_name, run_id, user, echo)
Example #10
def make_flow(
    obj, token, name, tags, namespace, max_workers, workflow_timeout, is_project
):
    if obj.flow_datastore.TYPE != "s3":
        raise MetaflowException("AWS Step Functions requires --datastore=s3.")

    # Attach AWS Batch decorator to the flow
    decorators._attach_decorators(obj.flow, [BatchDecorator.name])
    decorators._init_step_decorators(
        obj.flow, obj.graph, obj.environment, obj.flow_datastore, obj.logger
    )

    obj.package = MetaflowPackage(
        obj.flow, obj.environment, obj.echo, obj.package_suffixes
    )
    package_url, package_sha = obj.flow_datastore.save_data(
        [obj.package.blob], len_hint=1
    )[0]

    return StepFunctions(
        name,
        obj.graph,
        obj.flow,
        package_sha,
        package_url,
        token,
        obj.metadata,
        obj.flow_datastore,
        obj.environment,
        obj.event_logger,
        obj.monitor,
        tags=tags,
        namespace=namespace,
        max_workers=max_workers,
        username=get_username(),
        workflow_timeout=workflow_timeout,
        is_project=is_project,
    )
Example #11
def step(ctx,
         step_name,
         code_package_sha,
         code_package_url,
         executable=None,
         image=None,
         iam_role=None,
         execution_role=None,
         cpu=None,
         gpu=None,
         memory=None,
         queue=None,
         run_time_limit=None,
         shared_memory=None,
         max_swap=None,
         swappiness=None,
         **kwargs):
    def echo(batch_id, msg, stream=sys.stdout):
        ctx.obj.echo_always("[%s] %s" % (batch_id, msg))

    if ctx.obj.datastore.datastore_root is None:
        ctx.obj.datastore.datastore_root = ctx.obj.datastore.get_datastore_root_from_config(
            echo)

    if R.use_r():
        entrypoint = R.entrypoint()
    else:
        if executable is None:
            executable = ctx.obj.environment.executable(step_name)
        entrypoint = '%s -u %s' % (executable, os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    input_paths = kwargs.get("input_paths")
    split_vars = None
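    # Chunk a potentially very long input-paths value into multiple
    # environment variables and reference them from the CLI via ${VAR} expansion.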
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i:i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = u"{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint,
        top_args=top_args,
        step=step_name,
        step_args=step_args)
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(retry_deco[0].attributes.get(
            "minutes_between_retries", 1))

    # Set batch attributes
    attrs = {
        "metaflow.user": util.get_username(),
        "metaflow.flow_name": ctx.obj.flow.name,
        "metaflow.step_name": step_name,
        "metaflow.run_id": kwargs["run_id"],
        "metaflow.task_id": kwargs["task_id"],
        "metaflow.retry_count": str(retry_count),
        "metaflow.version": ctx.obj.environment.get_environment_info()[
            "metaflow_version"
        ],
    }

    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env = env_deco[0].attributes["vars"]
    else:
        env = {}

    datastore_root = os.path.join(
        ctx.obj.datastore.make_path(ctx.obj.flow.name, kwargs['run_id'],
                                    step_name, kwargs['task_id']))
    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    if retry_count:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next AWS Batch retry" %
            minutes_between_retries)
        time.sleep(minutes_between_retries * 60)
    batch = Batch(ctx.obj.metadata, ctx.obj.environment)
    try:
        with ctx.obj.monitor.measure("metaflow.batch.launch"):
            batch.launch_job(step_name,
                             step_cli,
                             code_package_sha,
                             code_package_url,
                             ctx.obj.datastore.TYPE,
                             image=image,
                             queue=queue,
                             iam_role=iam_role,
                             execution_role=execution_role,
                             cpu=cpu,
                             gpu=gpu,
                             memory=memory,
                             run_time_limit=run_time_limit,
                             shared_memory=shared_memory,
                             max_swap=max_swap,
                             swappiness=swappiness,
                             env=env,
                             attrs=attrs)
    except Exception as e:
        print(e)
        _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        batch.wait(echo=echo)
    except BatchKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    _sync_metadata(echo, ctx.obj.metadata, datastore_root, retry_count)
Example #12
def step(
    ctx,
    step_name,
    code_package_sha,
    code_package_url,
    executable=None,
    image=None,
    iam_role=None,
    execution_role=None,
    cpu=None,
    gpu=None,
    memory=None,
    queue=None,
    run_time_limit=None,
    shared_memory=None,
    max_swap=None,
    swappiness=None,
    host_volumes=None,
    **kwargs
):
    def echo(msg, stream="stderr", batch_id=None):
        msg = util.to_unicode(msg)
        if batch_id:
            msg = "[%s] %s" % (batch_id, msg)
        ctx.obj.echo_always(msg, err=(stream == sys.stderr))

    if R.use_r():
        entrypoint = R.entrypoint()
    else:
        if executable is None:
            executable = ctx.obj.environment.executable(step_name)
        entrypoint = "%s -u %s" % (executable, os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = u"{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint,
        top_args=top_args,
        step=step_name,
        step_args=step_args,
    )
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 1)
        )

    # Set batch attributes
    task_spec = {
        "flow_name": ctx.obj.flow.name,
        "step_name": step_name,
        "run_id": kwargs["run_id"],
        "task_id": kwargs["task_id"],
        "retry_count": str(retry_count),
    }
    attrs = {"metaflow.%s" % k: v for k, v in task_spec.items()}
    attrs["metaflow.user"] = util.get_username()
    attrs["metaflow.version"] = ctx.obj.environment.get_environment_info()[
        "metaflow_version"
    ]

    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env = env_deco[0].attributes["vars"]
    else:
        env = {}

    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    if retry_count:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next AWS Batch retry"
            % minutes_between_retries
        )
        time.sleep(minutes_between_retries * 60)

    # this information is needed for log tailing
    ds = ctx.obj.flow_datastore.get_task_datastore(
        mode="w",
        run_id=kwargs["run_id"],
        step_name=step_name,
        task_id=kwargs["task_id"],
        attempt=int(retry_count),
    )
    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")

    def _sync_metadata():
        if ctx.obj.metadata.TYPE == "local":
            sync_local_metadata_from_datastore(
                DATASTORE_LOCAL_DIR,
                ctx.obj.flow_datastore.get_task_datastore(
                    kwargs["run_id"], step_name, kwargs["task_id"]
                ),
            )

    batch = Batch(ctx.obj.metadata, ctx.obj.environment)
    try:
        with ctx.obj.monitor.measure("metaflow.aws.batch.launch_job"):
            batch.launch_job(
                step_name,
                step_cli,
                task_spec,
                code_package_sha,
                code_package_url,
                ctx.obj.flow_datastore.TYPE,
                image=image,
                queue=queue,
                iam_role=iam_role,
                execution_role=execution_role,
                cpu=cpu,
                gpu=gpu,
                memory=memory,
                run_time_limit=run_time_limit,
                shared_memory=shared_memory,
                max_swap=max_swap,
                swappiness=swappiness,
                env=env,
                attrs=attrs,
                host_volumes=host_volumes,
            )
    except Exception as e:
        traceback.print_exc()
        _sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        batch.wait(stdout_location, stderr_location, echo=echo)
    except BatchKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        _sync_metadata()
Example #13
def resolve_token(
    name, token_prefix, obj, authorize, given_token, generate_new_token, is_project
):

    # 1) retrieve the previous deployment, if one exists
    workflow = StepFunctions.get_existing_deployment(name)
    if workflow is None:
        obj.echo(
            "It seems this is the first time you are deploying *%s* to "
            "AWS Step Functions." % name
        )
        prev_token = None
    else:
        prev_user, prev_token = workflow

    # 2) authorize this deployment
    if prev_token is not None:
        if authorize is None:
            authorize = load_token(token_prefix)
        elif authorize.startswith("production:"):
            authorize = authorize[11:]

        # we allow the user who deployed the previous version to re-deploy,
        # even if they don't have the token
        if prev_user != get_username() and authorize != prev_token:
            obj.echo(
                "There is an existing version of *%s* on AWS Step "
                "Functions which was deployed by the user "
                "*%s*." % (name, prev_user)
            )
            obj.echo(
                "To deploy a new version of this flow, you need to use "
                "the same production token that they used. "
            )
            obj.echo(
                "Please reach out to them to get the token. Once you "
                "have it, call this command:"
            )
            obj.echo("    step-functions create --authorize MY_TOKEN", fg="green")
            obj.echo(
                'See "Organizing Results" at docs.metaflow.org for more '
                "information about production tokens."
            )
            raise IncorrectProductionToken(
                "Try again with the correct " "production token."
            )

    # 3) do we need a new token or should we use the existing token?
    if given_token:
        if is_project:
            # we rely on a known prefix for @project tokens, so we can't
            # allow the user to specify a custom token with an arbitrary prefix
            raise MetaflowException(
                "--new-token is not supported for "
                "@projects. Use --generate-new-token to "
                "create a new token."
            )
        if given_token.startswith("production:"):
            given_token = given_token[11:]
        token = given_token
        obj.echo("")
        obj.echo("Using the given token, *%s*." % token)
    elif prev_token is None or generate_new_token:
        token = new_token(token_prefix, prev_token)
        if token is None:
            if prev_token is None:
                raise MetaflowInternalError(
                    "We could not generate a new " "token. This is unexpected. "
                )
            else:
                raise MetaflowException(
                    "--generate-new-token option is not "
                    "supported after using --new-token. "
                    "Use --new-token to make a new "
                    "namespace."
                )
        obj.echo("")
        obj.echo("A new production token generated.")
    else:
        token = prev_token

    obj.echo("")
    obj.echo("The namespace of this production flow is")
    obj.echo("    production:%s" % token, fg="green")
    obj.echo(
        "To analyze results of this production flow " "add this line in your notebooks:"
    )
    obj.echo('    namespace("production:%s")' % token, fg="green")
    obj.echo(
        "If you want to authorize other people to deploy new versions "
        "of this flow to AWS Step Functions, they need to call"
    )
    obj.echo("    step-functions create --authorize %s" % token, fg="green")
    obj.echo("when deploying this flow to AWS Step Functions for the first " "time.")
    obj.echo(
        'See "Organizing Results" at https://docs.metaflow.org/ for more '
        "information about production tokens."
    )
    obj.echo("")
    store_token(token_prefix, token)
    return token
Example #14
    def container_template(self, node):
        """
        Returns an argo container template spec. to execute a step
        """
        attr = parse_step_decorator(node, ArgoStepDecorator)
        env_decorator = parse_step_decorator(node, EnvironmentDecorator)
        retry_decorator = parse_step_decorator(node, RetryDecorator)
        catch_decorator = parse_step_decorator(node, CatchDecorator)
        res_decorator = parse_step_decorator(node, ResourcesDecorator)
        k8s_decorator = parse_step_decorator(node, KubernetesDecorator)
        resources = merge_resources(
            res_decorator, {
                k: v
                for k, v in k8s_decorator.items()
                if k in ResourcesDecorator.defaults
            })
        image = attr.get('image') or k8s_decorator.get(
            'image') or self._default_image()
        env, env_from = self._prepare_environment(attr, env_decorator)
        res = self._resources(resources)
        volume_mounts = attr.get('volumeMounts', [])
        volume_mounts.append(self._shared_memory(resources))

        user_code_retries = retry_decorator.get('times', 0)
        total_retries = user_code_retries + 1 if catch_decorator else user_code_retries
        retry_count = '{{retries}}' if total_retries else '0'
        cmd = self._commands(node, retry_count, user_code_retries)

        metadata = {
            'labels': {
                **attr.get('labels', {}),
                **self.attributes['labels'],
                'metaflow/step_name': sanitize_label_value(dns_name(node.name)),
                'app.kubernetes.io/name': 'metaflow-task',
                'app.kubernetes.io/part-of': 'metaflow',
                'app.kubernetes.io/created-by': get_username(),
            },
            'annotations': {
                **attr.get('annotations', {}),
                **self.attributes['annotations'],
                # should be a label but cannot sanitize argo variables
                'metaflow/attempt': retry_count,
            },
        }
        metadata['labels'].update(self.system_tags)

        template = {
            'name': dns_name(node.name),
            'metadata': metadata,
            'activeDeadlineSeconds': get_run_time_limit_for_task(node.decorators),
            'inputs': {
                'parameters': [{
                    'name': 'input-paths'
                }],
                'artifacts': attr.get('input_artifacts'),
            },
            'outputs': {
                'parameters': [{
                    'name': 'task-id',
                    'value': '{{pod.name}}'
                }],
                'artifacts': attr.get('output_artifacts')
            },
            'nodeSelector': attr.get('nodeSelector'),
            'container': {
                'image': image,
                'volumeMounts': volume_mounts,
                'command': [cmd[0]],
                'args': cmd[1:],
                'env': env,
                'envFrom': env_from,
                'resources': {
                    'requests': res,
                    'limits': res
                }
            },
        }

        if total_retries:
            template['retryStrategy'] = {
                'retryPolicy': 'Always',
                # fallback_step for @catch is only executed if retry_count > user_code_retries
                'limit': str(total_retries),
                'backoff': {
                    'duration': '%sm' % str(
                        retry_decorator['minutes_between_retries']
                        if user_code_retries else 0
                    ),
                }
            }

        if self._is_foreach_first_child(node):
            template['inputs']['parameters'].append({'name': 'split-index'})

        if node.type == 'foreach':
            template['outputs']['parameters'].append({
                'name': 'num-splits',
                'valueFrom': {
                    'path': ArgoInternalStepDecorator.splits_file_path
                }
            })

        return template
Example #15
def step(
    ctx,
    step_name,
    code_package_sha,
    code_package_url,
    executable=None,
    image=None,
    service_account=None,
    secrets=None,
    node_selector=None,
    k8s_namespace=None,
    cpu=None,
    disk=None,
    memory=None,
    gpu=None,
    gpu_vendor=None,
    run_time_limit=None,
    **kwargs
):
    def echo(msg, stream="stderr", job_id=None):
        msg = util.to_unicode(msg)
        if job_id:
            msg = "[%s] %s" % (job_id, msg)
        ctx.obj.echo_always(msg, err=(stream == sys.stderr))

    node = ctx.obj.graph[step_name]

    # Construct entrypoint CLI
    if executable is None:
        executable = ctx.obj.environment.executable(step_name)

    # Set environment
    env = {}
    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env = env_deco[0].attributes["vars"]

    # Set input paths.
    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())
        env.update(split_vars)

    # Set retry policy.
    retry_count = int(kwargs.get("retry_count", 0))
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 2)
        )
    if retry_count:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next retry" % minutes_between_retries
        )
        time.sleep(minutes_between_retries * 60)

    step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint="%s -u %s" % (executable, os.path.basename(sys.argv[0])),
        top_args=" ".join(util.dict_to_cli_options(ctx.parent.parent.params)),
        step=step_name,
        step_args=" ".join(util.dict_to_cli_options(kwargs)),
    )

    # Set log tailing.
    ds = ctx.obj.flow_datastore.get_task_datastore(
        mode="w",
        run_id=kwargs["run_id"],
        step_name=step_name,
        task_id=kwargs["task_id"],
        attempt=int(retry_count),
    )
    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")

    def _sync_metadata():
        if ctx.obj.metadata.TYPE == "local":
            sync_local_metadata_from_datastore(
                DATASTORE_LOCAL_DIR,
                ctx.obj.flow_datastore.get_task_datastore(
                    kwargs["run_id"], step_name, kwargs["task_id"]
                ),
            )

    try:
        kubernetes = Kubernetes(
            datastore=ctx.obj.flow_datastore,
            metadata=ctx.obj.metadata,
            environment=ctx.obj.environment,
        )
        # Configure and launch Kubernetes job.
        with ctx.obj.monitor.measure("metaflow.kubernetes.launch_job"):
            kubernetes.launch_job(
                flow_name=ctx.obj.flow.name,
                run_id=kwargs["run_id"],
                step_name=step_name,
                task_id=kwargs["task_id"],
                attempt=str(retry_count),
                user=util.get_username(),
                code_package_sha=code_package_sha,
                code_package_url=code_package_url,
                code_package_ds=ctx.obj.flow_datastore.TYPE,
                step_cli=step_cli,
                docker_image=image,
                service_account=service_account,
                secrets=secrets,
                node_selector=node_selector,
                namespace=k8s_namespace,
                cpu=cpu,
                disk=disk,
                memory=memory,
                gpu=gpu,
                gpu_vendor=gpu_vendor,
                run_time_limit=run_time_limit,
                env=env,
            )
    except Exception as e:
        traceback.print_exc(chain=False)
        _sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        kubernetes.wait(stdout_location, stderr_location, echo=echo)
    except KubernetesKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        _sync_metadata()
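
All of the examples above use Metaflow's get_username helper (called either as get_username() or util.get_username()) to tag runs, deployments, and jobs with the invoking user. The real implementation is not reproduced on this page; a rough, hypothetical stand-in with the same contract (return the current user's name, or None when it cannot be determined) could look like this:

import os

def get_username():
    # Illustrative sketch only, not Metaflow's actual implementation:
    # prefer an explicit Metaflow override, then common login variables,
    # and return None when nothing is available.
    for var in ("METAFLOW_USER", "USER", "USERNAME", "LOGNAME"):
        value = os.environ.get(var)
        if value:
            return value
    return None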