Example #1
 def _process_parameters(self):
     parameters = []
     has_schedule = self._cron() is not None
     for var, param in self.flow._get_parameters():
         valuetype = param.kwargs.get('type', str)
         value = deploy_time_eval(param.kwargs.get('default'))
         required = param.kwargs.get('required', False)
         # Throw an exception if the flow has optional parameters
         # with no default value.
         if value is None and required is False:
             raise MetaflowException("The value of parameter *%s* is "
                                     "ambiguous. It does not have a "
                                     "default and it is not required." %
                                     param.name)
         # Throw an exception if a schedule is set for a flow with required
         # parameters with no defaults. We currently don't have any notion
         # of data triggers in AWS Event Bridge.
         if value is None and required and has_schedule:
             raise MetaflowException("The parameter *%s* does not have a "
                                     "default and is required. Scheduling "
                                     "such parameters via AWS Event Bridge "
                                     "is not currently supported." %
                                     param.name)
         parameters.append(dict(name=param.name, value=value))
     return parameters
Example #2
def get_aws_client(module, with_error=False, params={}):
    from metaflow.exception import MetaflowException
    from metaflow.metaflow_config import AWS_SANDBOX_ENABLED, \
        AWS_SANDBOX_STS_ENDPOINT_URL, AWS_SANDBOX_API_KEY
    import requests
    try:
        import boto3
        from botocore.exceptions import ClientError
    except (NameError, ImportError):
        raise MetaflowException(
            "Could not import module 'boto3'. Install boto3 first.")

    if AWS_SANDBOX_ENABLED:
        global cached_aws_sandbox_creds
        if cached_aws_sandbox_creds is None:
            # authenticate using STS
            url = "%s/auth/token" % AWS_SANDBOX_STS_ENDPOINT_URL
            headers = {'x-api-key': AWS_SANDBOX_API_KEY}
            try:
                r = requests.get(url, headers=headers)
                r.raise_for_status()
                cached_aws_sandbox_creds = r.json()
            except requests.exceptions.HTTPError as e:
                raise MetaflowException(repr(e))
        if with_error:
            return boto3.session.Session(**cached_aws_sandbox_creds).client(
                module, **params), ClientError
        return boto3.session.Session(**cached_aws_sandbox_creds).client(
            module, **params)
    if with_error:
        return boto3.client(module, **params), ClientError
    return boto3.client(module, **params)
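A brief, hedged usage sketch (not part of the snippet above): get_aws_client() returns an ordinary boto3 client, plus the botocore ClientError class when with_error=True; the bucket name below is made up.
# Plain client:
s3 = get_aws_client("s3")
# Client plus the botocore ClientError class, useful for error handling:
s3, ClientError = get_aws_client("s3", with_error=True)
try:
    s3.head_bucket(Bucket="my-example-bucket")  # hypothetical bucket name
except ClientError as err:
    print("Bucket check failed:", err)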
Example #3
    def _process_parameters(self):
        parameters = []
        has_schedule = self._cron() is not None
        seen = set()
        for var, param in self.flow._get_parameters():
            # Throw an exception if the parameter is specified twice.
            norm = param.name.lower()
            if norm in seen:
                raise MetaflowException("Parameter *%s* is specified twice. "
                                        "Note that parameter names are "
                                        "case-insensitive." % param.name)
            seen.add(norm)

            is_required = param.kwargs.get('required', False)
            # Throw an exception if a schedule is set for a flow with required
            # parameters with no defaults. We currently don't have any notion
            # of data triggers in AWS Event Bridge.
            if 'default' not in param.kwargs and is_required and has_schedule:
                raise MetaflowException("The parameter *%s* does not have a "
                                        "default and is required. Scheduling "
                                        "such parameters via AWS Event Bridge "
                                        "is not currently supported." %
                                        param.name)
            value = deploy_time_eval(param.kwargs.get('default'))
            parameters.append(dict(name=param.name, value=value))
        return parameters
Example #4
    def get_client(module, with_error=False, params={}, s3_role_arn=None):
        from metaflow.exception import MetaflowException
        from metaflow.metaflow_config import (
            AWS_SANDBOX_ENABLED,
            AWS_SANDBOX_STS_ENDPOINT_URL,
            AWS_SANDBOX_API_KEY,
        )
        import requests

        try:
            import boto3
            import botocore
            from botocore.exceptions import ClientError
        except (NameError, ImportError):
            raise MetaflowException(
                "Could not import module 'boto3'. Install boto3 first."
            )

        if AWS_SANDBOX_ENABLED:
            # role is ignored in the sandbox
            global cached_aws_sandbox_creds
            if cached_aws_sandbox_creds is None:
                # authenticate using STS
                url = "%s/auth/token" % AWS_SANDBOX_STS_ENDPOINT_URL
                headers = {"x-api-key": AWS_SANDBOX_API_KEY}
                try:
                    r = requests.get(url, headers=headers)
                    r.raise_for_status()
                    cached_aws_sandbox_creds = r.json()
                except requests.exceptions.HTTPError as e:
                    raise MetaflowException(repr(e))
            if with_error:
                return (
                    boto3.session.Session(**cached_aws_sandbox_creds).client(
                        module, **params
                    ),
                    ClientError,
                )
            return boto3.session.Session(**cached_aws_sandbox_creds).client(
                module, **params
            )
        session = boto3.session.Session()
        if s3_role_arn:
            fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
                client_creator=session._session.create_client,
                source_credentials=session._session.get_credentials(),
                role_arn=s3_role_arn,
                extra_args={},
            )
            creds = botocore.credentials.DeferredRefreshableCredentials(
                method="assume-role", refresh_using=fetcher.fetch_credentials
            )
            botocore_session = botocore.session.Session()
            botocore_session._credentials = creds
            session = boto3.session.Session(botocore_session=botocore_session)
        if with_error:
            return session.client(module, **params), ClientError
        return session.client(module, **params)
Example #5
def resolve_workflow_name(obj, name):
    project = current.get("project_name")
    obj._is_workflow_name_modified = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. Use --branch instead.")
        workflow_name = current.project_flow_name
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s" %
            to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16])
        is_project = True
        # Argo Workflow names can't be longer than 253 characters, so we truncate
        # by default. Also, while project and branch allow for underscores, Argo
        # Workflows doesn't (DNS Subdomain names as defined in RFC 1123) - so we will
        # remove any underscores as well as convert the name to lower case.
        if len(workflow_name) > 253:
            name_hash = to_unicode(
                base64.b32encode(sha1(
                    to_bytes(workflow_name)).digest()))[:8].lower()
            workflow_name = "%s-%s" % (workflow_name[:242], name_hash)
            obj._is_workflow_name_modified = True
        if not VALID_NAME.search(workflow_name):
            workflow_name = (re.compile(r"^[^A-Za-z0-9]+").sub(
                "", workflow_name).replace("_", "").lower())
            obj._is_workflow_name_modified = True
    else:
        if name and not VALID_NAME.search(name):
            raise MetaflowException(
                "Name '%s' contains invalid characters. The "
                "name must consist of lower case alphanumeric characters, '-' or '.'"
                ", and must start and end with an alphanumeric character." %
                name)

        workflow_name = name if name else current.flow_name
        token_prefix = workflow_name
        is_project = False

        if len(workflow_name) > 253:
            msg = ("The full name of the workflow:\n*%s*\nis longer than 253 "
                   "characters.\n\n"
                   "To deploy this workflow to Argo Workflows, please "
                   "assign a shorter name\nusing the option\n"
                   "*argo-workflows --name <name> create*." % workflow_name)
            raise ArgoWorkflowsNameTooLong(msg)

        if not VALID_NAME.search(workflow_name):
            workflow_name = (re.compile(r"^[^A-Za-z0-9]+").sub(
                "", workflow_name).replace("_", "").lower())
            obj._is_workflow_name_modified = True

    return workflow_name, token_prefix.lower(), is_project
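A hedged illustration of the non-project branch above; the flow names are made up, and VALID_NAME is assumed to accept only lowercase DNS-style names, as the comments suggest.
# current.flow_name == "Hello_World_Flow"
#   -> fails VALID_NAME, so underscores are removed and the name is lower-cased:
#      "helloworldflow"; obj._is_workflow_name_modified becomes True
# current.flow_name == "helloflow"
#   -> assumed to pass VALID_NAME and is used as-is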
Example #6
def resolve_state_machine_name(obj, name):
    def attach_prefix(name):
        if SFN_STATE_MACHINE_PREFIX is not None:
            return SFN_STATE_MACHINE_PREFIX + "_" + name
        return name

    project = current.get("project_name")
    obj._is_state_machine_name_hashed = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. " "Use --branch instead."
            )
        state_machine_name = attach_prefix(current.project_flow_name)
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s"
            % to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16]
        )
        is_project = True
        # AWS Step Functions has a limit of 80 chars for state machine names.
        # We truncate the state machine name if the computed name is greater
        # than 60 chars and append a hashed suffix to ensure uniqueness.
        if len(state_machine_name) > 60:
            name_hash = to_unicode(
                base64.b32encode(sha1(to_bytes(state_machine_name)).digest())
            )[:16].lower()
            state_machine_name = "%s-%s" % (state_machine_name[:60], name_hash)
            obj._is_state_machine_name_hashed = True
    else:
        if name and VALID_NAME.search(name):
            raise MetaflowException("Name '%s' contains invalid characters." % name)

        state_machine_name = attach_prefix(name if name else current.flow_name)
        token_prefix = state_machine_name
        is_project = False

        if len(state_machine_name) > 80:
            msg = (
                "The full name of the workflow:\n*%s*\nis longer than 80 "
                "characters.\n\n"
                "To deploy this workflow to AWS Step Functions, please "
                "assign a shorter name\nusing the option\n"
                "*step-functions --name <name> create*." % state_machine_name
            )
            raise StepFunctionsStateMachineNameTooLong(msg)

    return state_machine_name, token_prefix.lower(), is_project
Example #7
 def step_init(self, flow, graph, step, decos, environment, flow_datastore,
               logger):
     # The total number of attempts must not exceed MAX_ATTEMPTS.
     # attempts = normal task (1) + retries (N) + @catch fallback (1)
     if int(self.attributes["times"]) + 2 > MAX_ATTEMPTS:
         raise MetaflowException("The maximum number of retries is "
                                 "@retry(times=%d)." % (MAX_ATTEMPTS - 2))
Example #8
    def __init__(self, namespace=None):

        try:
            from kubernetes import client, config
        except (NameError, ImportError):
            raise MetaflowException(
                "Could not import module 'kubernetes'.\n\nInstall kubernetes "
                "Python package (https://pypi.org/project/kubernetes/) first.\n"
                "You can install the module by executing - "
                "%s -m pip install kubernetes\n"
                "or equivalent through your favorite Python package manager." %
                sys.executable)

        if os.getenv("KUBERNETES_SERVICE_HOST"):
            # We are inside a pod, authenticate via ServiceAccount assigned
            # to us
            config.load_incluster_config()
        else:
            # Use kubeconfig, likely $HOME/.kube/config
            # TODO (savin):
            #     1. Support generating kubeconfig on the fly using boto3
            #     2. Support auth via OIDC -
            #        https://docs.aws.amazon.com/eks/latest/userguide/authenticate-oidc-identity-provider.html
            config.load_kube_config()

        self._client = client
        self._namespace = namespace or "default"
        self._group = "argoproj.io"
        self._version = "v1alpha1"
Example #9
def make_flow(obj, token, name, tags, namespace, max_workers, workflow_timeout,
              is_project):
    if obj.flow_datastore.TYPE != "s3":
        raise MetaflowException("AWS Step Functions requires --datastore=s3.")

    # Attach AWS Batch decorator to the flow
    decorators._attach_decorators(obj.flow, [BatchDecorator.name])
    decorators._init_step_decorators(obj.flow, obj.graph, obj.environment,
                                     obj.flow_datastore, obj.logger)

    obj.package = MetaflowPackage(obj.flow, obj.environment, obj.echo,
                                  obj.package_suffixes)
    package_url, package_sha = obj.flow_datastore.save_data([obj.package.blob],
                                                            len_hint=1)[0]

    return StepFunctions(
        name,
        obj.graph,
        obj.flow,
        package_sha,
        package_url,
        token,
        obj.metadata,
        obj.flow_datastore,
        obj.environment,
        obj.event_logger,
        obj.monitor,
        tags=tags,
        namespace=namespace,
        max_workers=max_workers,
        username=get_username(),
        workflow_timeout=workflow_timeout,
        is_project=is_project,
    )
Example #10
    def __init__(self, k8s_namespace):
        try:
            from kubernetes import client, config
        except (NameError, ImportError):
            raise MetaflowException(
                "Could not import module 'kubernetes'. Install kubernetes "
                "Python package (https://pypi.org/project/kubernetes/) first.")
        config.load_kube_config()
        self.client = client

        if k8s_namespace is None:
            k8s_namespace = 'default'
        self.k8s_namespace = from_conf('METAFLOW_K8S_NAMESPACE',
                                       default=k8s_namespace)
        try:
            core_api_instance = self.client.CoreV1Api()
            core_api_instance.read_namespace(self.k8s_namespace)
        except self.client.rest.ApiException as e:
            msg = json.loads(
                e.body)["message"] if e.body is not None else e.reason
            raise ArgoException(msg)

        self.api_instance = self.client.CustomObjectsApi()
        self.kwargs = {
            'version': 'v1alpha1',
            'group': 'argoproj.io',
            'namespace': self.k8s_namespace
        }
Example #11
    def control_task_step_func(self, flow, graph, retry_count):
        from metaflow import current
        run_id = current.run_id
        step_name = current.step_name
        control_task_id = current.task_id
        (_, split_step_name, split_task_id) = control_task_id.split('-')[1:]
        # If we are running inside Conda, we use the base executable FIRST;
        # the conda environment will then be used when runtime_step_cli is
        # called. This is so that it can properly set up all the metaflow
        # aliases needed.
        env_to_use = getattr(self.environment, 'base_env', self.environment)
        executable = env_to_use.executable(step_name)
        script = sys.argv[0]

        # Access the `unbounded_foreach` param using `flow` (as datastore).
        assert(flow._unbounded_foreach)
        foreach_iter = flow.input
        if not isinstance(foreach_iter, InternalTestUnboundedForeachInput):
            raise MetaflowException('Expected type to be '\
                                    'InternalTestUnboundedForeachInput. Found %s'\
                                    % (type(foreach_iter)))
        foreach_num_splits = sum(1 for _ in foreach_iter)

        print('Simulating UnboundedForeach over value:',
              foreach_iter, 'num_splits:', foreach_num_splits)
        mapper_tasks = []

        for i in range(foreach_num_splits):
            task_id = \
                '%s-%d' % (control_task_id.replace('control-', 'test-ubf-'), i)
            pathspec = '%s/%s/%s' % (run_id, step_name, task_id)
            mapper_tasks.append(to_unicode(pathspec))
            input_paths = '%s/%s/%s' % (run_id, split_step_name, split_task_id)

            # Override specific `step` kwargs.
            kwargs = cli_args.step_kwargs
            kwargs['split_index'] = str(i)
            kwargs['run_id'] = run_id
            kwargs['task_id'] = task_id
            kwargs['input_paths'] = input_paths
            kwargs['ubf_context'] = UBF_TASK
            kwargs['retry_count'] = 0

            cmd = cli_args.step_command(executable, script, step_name,
                                        step_kwargs=kwargs)
            step_cli = u' '.join(cmd)
            # Print cmdline for execution. Doesn't work without the temporary
            # unicode object while using `print`.
            print(u'[${cwd}] Starting split#{split} with cmd:{cmd}'\
                  .format(cwd=os.getcwd(),
                          split=i,
                          cmd=step_cli))
            output_bytes = subprocess.check_output(cmd)
            output = to_unicode(output_bytes)
            for line in output.splitlines():
                print('[Split#%d] %s' % (i, line))
        # Save the list of (child) mapper task pathspec(s) into a designated
        # artifact `_control_mapper_tasks`.
        flow._control_mapper_tasks = mapper_tasks
Example #12
    def _request(cls, monitor, path, data=None, retry_409_path=None):
        if cls.INFO is None:
            raise MetaflowException('Missing Metaflow Service URL. '
                'Specify with METAFLOW_SERVICE_URL environment variable')
        url = os.path.join(cls.INFO, path.lstrip('/'))
        for i in range(METADATA_SERVICE_NUM_RETRIES):
            try:
                if data is None:
                    if monitor:
                        with monitor.measure('metaflow.service_metadata.get'):
                            resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
                    else:
                        resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
                else:
                    if monitor:
                        with monitor.measure('metaflow.service_metadata.post'):
                            resp = requests.post(url, headers=METADATA_SERVICE_HEADERS, json=data)
                    else:
                        resp = requests.post(url, headers=METADATA_SERVICE_HEADERS, json=data)
            except:  # noqa E722
                if monitor:
                    with monitor.count('metaflow.service_metadata.failed_request'):
                        if i == METADATA_SERVICE_NUM_RETRIES - 1:
                            raise
                else:
                    if i == METADATA_SERVICE_NUM_RETRIES - 1:
                        raise
                resp = None
            else:
                if resp.status_code < 300:
                    return resp.json()
                elif resp.status_code == 409 and data is not None:
                    # a special case: the post fails due to a conflict
                    # this could occur when we missed a success response
                    # from the first POST request but the request
                    # actually went though, so a subsequent POST
                    # returns 409 (conflict) or we end up with a
                    # conflict while running on AWS Step Functions
                    # instead of retrying the post we retry with a get since
                    # the record is guaranteed to exist
                    if retry_409_path:
                        return cls._request(monitor, retry_409_path)
                    else:
                        return
                elif resp.status_code != 503:
                    raise ServiceException('Metadata request (%s) failed (code %s): %s'
                                           % (path, resp.status_code, resp.text),
                                           resp.status_code,
                                           resp.text)
            time.sleep(2**i)

        if resp:
            raise ServiceException('Metadata request (%s) failed (code %s): %s'
                                   % (path, resp.status_code, resp.text),
                                   resp.status_code,
                                   resp.text)
        else:
            raise ServiceException('Metadata request (%s) failed' % path)
Example #13
 def step_init(self, flow, graph, step, decos, environment, datastore,
               logger):
     # handling _foreach_var and _foreach_num_splits requires some
     # deeper thinking, so let's not support that use case for now
     self.logger = logger
     if graph[step].type == 'foreach':
         raise MetaflowException('@catch is defined for the step *%s* '
                                 'but @catch is not supported in foreach '
                                 'split steps.' % step)
Example #14
def format_name(flow_name, project_name, deploy_prod, given_branch, user_name):

    if not project_name:
        # an empty string is not a valid project name
        raise MetaflowException(
            "@project needs a name. " "Try @project(name='some_name')"
        )
    elif re.search(VALID_NAME_RE, project_name):
        raise MetaflowException(
            "The @project name must contain only "
            "lowercase alphanumeric characters "
            "and underscores."
        )
    elif len(project_name) > VALID_NAME_LEN:
        raise MetaflowException(
            "The @project name must be shorter than " "%d characters." % VALID_NAME_LEN
        )

    if given_branch:
        if re.search(VALID_NAME_RE, given_branch):
            raise MetaflowException(
                "The branch name must contain only "
                "lowercase alphanumeric characters "
                "and underscores."
            )
        elif len(given_branch) > VALID_NAME_LEN:
            raise MetaflowException(
                "Branch name is too long. "
                "The maximum is %d characters." % VALID_NAME_LEN
            )
        if deploy_prod:
            branch = "prod.%s" % given_branch
        else:
            branch = "test.%s" % given_branch
    elif deploy_prod:
        branch = "prod"
    else:
        # For AWS Step Functions, we set the branch to the value of
        # environment variable `METAFLOW_OWNER`, since AWS Step Functions
        # has no notion of user name.
        branch = "user.%s" % os.environ.get("METAFLOW_OWNER", user_name)

    return ".".join((project_name, branch, flow_name)), branch
Example #15
    def _request(cls, monitor, path, data=None):
        if cls.INFO is None:
            raise MetaflowException(
                'Missing Metaflow Service URL. '
                'Specify with METAFLOW_SERVICE_URL environment variable')
        url = os.path.join(cls.INFO, path.lstrip('/'))
        for i in range(METADATA_SERVICE_NUM_RETRIES):
            try:
                if data is None:
                    if monitor:
                        with monitor.measure('metaflow.service_metadata.get'):
                            resp = requests.get(
                                url, headers=METADATA_SERVICE_HEADERS)
                    else:
                        resp = requests.get(url,
                                            headers=METADATA_SERVICE_HEADERS)
                else:
                    if monitor:
                        with monitor.measure('metaflow.service_metadata.post'):
                            resp = requests.post(
                                url,
                                headers=METADATA_SERVICE_HEADERS,
                                json=data)
                    else:
                        resp = requests.post(url,
                                             headers=METADATA_SERVICE_HEADERS,
                                             json=data)
            except:  # noqa E722
                if monitor:
                    with monitor.count(
                            'metaflow.service_metadata.failed_request'):
                        if i == METADATA_SERVICE_NUM_RETRIES - 1:
                            raise
                else:
                    if i == METADATA_SERVICE_NUM_RETRIES - 1:
                        raise
                resp = None
            else:
                if resp.status_code < 300:
                    return resp.json()
                elif resp.status_code != 503:
                    raise ServiceException(
                        'Metadata request (%s) failed (code %s): %s' %
                        (path, resp.status_code, resp.text), resp.status_code,
                        resp.text)
            time.sleep(2**i)

        if resp:
            raise ServiceException(
                'Metadata request (%s) failed (code %s): %s' %
                (path, resp.status_code, resp.text), resp.status_code,
                resp.text)
        else:
            raise ServiceException('Metadata request (%s) failed' % path)
Example #16
def create(obj,
           image,
           image_pull_secrets,
           env,
           env_from,
           labels,
           annotations,
           k8s_namespace,
           embedded,
           max_workers,
           volumes,
           workflow_timeout=None,
           only_json=False):
    obj.echo("Deploying *%s* to Argo Workflow Templates..." %
             obj.workflow_template_name,
             bold=True)

    if obj.flow_datastore.TYPE != 's3':
        raise MetaflowException("Argo Workflows require --datastore=s3.")

    # When using conda attach AWS Batch decorator to the flow.
    # This results in 'linux-64' libraries to be packaged.
    decorators._attach_decorators(obj.flow, [BatchDecorator.name])
    decorators._init_step_decorators(obj.flow, obj.graph, obj.environment,
                                     obj.flow_datastore, obj.logger)

    obj.package = MetaflowPackage(obj.flow, obj.environment, obj.echo,
                                  obj.package_suffixes)
    package_url, package_sha = obj.flow_datastore.save_data([obj.package.blob],
                                                            len_hint=1)[0]

    warn_use_argo_image(obj)
    workflow = ArgoWorkflow(obj.workflow_template_name, obj.flow, obj.graph,
                            obj.package, package_url if not embedded else None,
                            obj.metadata, obj.flow_datastore, obj.environment,
                            obj.event_logger, obj.monitor, image,
                            image_pull_secrets, env, env_from, labels,
                            annotations, max_workers, volumes,
                            workflow_timeout)

    if only_json:
        obj.echo_always(workflow.to_json(), err=False, no_bold=True, nl=False)
    else:
        workflow.deploy(k8s_namespace)
        obj.echo(
            "WorkflowTemplate *{name}* is pushed to Argo Workflows successfully.\n"
            .format(name=obj.workflow_template_name),
            bold=True)
        workflow.schedule(k8s_namespace)
        obj.echo("What will trigger execution of the workflow:", bold=True)
        obj.echo(workflow.trigger_explanation(), indent=True)
Example #17
def get_authenticated_boto3_client(module):
    from metaflow.exception import MetaflowException
    import requests
    try:
        import boto3
    except (NameError, ImportError):
        raise MetaflowException(
            "Could not import module 'boto3'. Install boto3 first.")

    if AWS_SANDBOX_ENABLED:
        global cached_aws_sandbox_creds
        if cached_aws_sandbox_creds is None:
            # authenticate using STS
            url = "%s/auth/token" % AWS_SANDBOX_STS_ENDPOINT_URL
            headers = {'x-api-key': AWS_SANDBOX_API_KEY}
            try:
                r = requests.get(url, headers=headers)
                r.raise_for_status()
                cached_aws_sandbox_creds = r.json()
            except requests.exceptions.HTTPError as e:
                raise MetaflowException(repr(e))
        return boto3.session.Session(**cached_aws_sandbox_creds).client(module)
    return boto3.client(module)
Example #18
    def __init__(self):
        # TODO: Look into removing the usage of Kubernetes Python SDK
        # at some point in the future. Given that Kubernetes Python SDK
        # aggressively drops support for older kubernetes clusters, continued
        # dependency on it may bite our users.

        try:
            # Kubernetes is a soft dependency.
            from kubernetes import client, config
        except (NameError, ImportError):
            raise MetaflowException(
                "Could not import module 'kubernetes'. Install kubernetes "
                "Python package (https://pypi.org/project/kubernetes/) first.")
        self._refresh_client()
Example #19
def init_config():
    # Read configuration from $METAFLOW_HOME/config_<profile>.json.
    home = os.environ.get('METAFLOW_HOME', '~/.metaflowconfig')
    profile = os.environ.get('METAFLOW_PROFILE')
    path_to_config = os.path.join(home, 'config.json')
    if profile:
        path_to_config = os.path.join(home, 'config_%s.json' % profile)
    path_to_config = os.path.expanduser(path_to_config)
    config = {}
    if os.path.exists(path_to_config):
        with open(path_to_config) as f:
            return json.load(f)
    elif profile:
        raise MetaflowException('Unable to locate METAFLOW_PROFILE \'%s\' in \'%s\'' %
                                (profile, home))
    return config
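A hedged summary of the lookup performed above (paths are the defaults implied by the code):
# METAFLOW_PROFILE unset -> read $METAFLOW_HOME/config.json
#                           (default ~/.metaflowconfig/config.json); a missing file
#                           yields an empty config {}
# METAFLOW_PROFILE=dev   -> read $METAFLOW_HOME/config_dev.json; a missing file
#                           raises MetaflowException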
Example #20
def compute_resource_attributes(decos, compute_deco, resource_defaults):
    """
    Compute resource values taking into account defaults, the values specified
    in the compute decorator (like @batch or @kubernetes) directly, and
    resources specified via @resources decorator.

    Returns a dictionary of resource attr -> value (str).
    """
    assert compute_deco is not None

    # Use the value from resource_defaults by default (don't use None)
    result = {k: v for k, v in resource_defaults.items() if v is not None}

    for deco in decos:
        # If resource decorator is used
        if deco.name == "resources":
            for k, v in deco.attributes.items():
                my_val = compute_deco.attributes.get(k)
                # We use the non None value if there is only one or the larger value
                # if they are both non None. Note this considers "" to be equivalent to
                # the value zero.
                if my_val is None and v is None:
                    continue
                if my_val is not None and v is not None:
                    try:
                        result[k] = str(max(int(my_val or 0), int(v or 0)))
                    except ValueError:
                        # Here, we don't have ints so we compare the value and raise
                        # an exception if not equal
                        if my_val != v:
                            raise MetaflowException(
                                "'resources' and compute decorator have conflicting "
                                "values for '%s'. Please use consistent values or "
                                "specify this resource constraint once" % k)
                elif my_val is not None:
                    result[k] = str(my_val or "0")
                else:
                    result[k] = str(v or "0")
            return result

    # If there is no resources decorator, values from compute_deco override
    # the defaults.
    for k in resource_defaults:
        if compute_deco.attributes.get(k) is not None:
            result[k] = str(compute_deco.attributes[k] or "0")

    return result
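A minimal sketch (not from the Metaflow sources) of how the merge behaves, assuming the function above is in scope; SimpleNamespace stands in for real decorator objects, which only need name and attributes here.
from types import SimpleNamespace

resources = SimpleNamespace(name="resources", attributes={"cpu": "8", "memory": None})
batch = SimpleNamespace(name="batch", attributes={"cpu": 2, "memory": 4096})
defaults = {"cpu": "1", "memory": "4096", "gpu": None}

# @resources wins on cpu (8 > 2); memory comes from @batch since @resources left it unset.
print(compute_resource_attributes([resources, batch], batch, defaults))
# {'cpu': '8', 'memory': '4096'}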
Example #21
def init_config():
    # Read configuration from $METAFLOW_HOME/config_<profile>.json.
    home = os.environ.get("METAFLOW_HOME", "~/.metaflowconfig")
    profile = os.environ.get("METAFLOW_PROFILE")
    path_to_config = os.path.join(home, "config.json")
    if profile:
        path_to_config = os.path.join(home, "config_%s.json" % profile)
    path_to_config = os.path.expanduser(path_to_config)
    config = {}
    if os.path.exists(path_to_config):
        with open(path_to_config) as f:
            return json.load(f)
    elif profile:
        raise MetaflowException(
            "Unable to locate METAFLOW_PROFILE '%s' in '%s')" %
            (profile, home))
    return config
Example #22
 def _version(cls, monitor):
     if cls.INFO is None:
         raise MetaflowException(
             "Missing Metaflow Service URL. "
             "Specify with METAFLOW_SERVICE_URL environment variable"
         )
     path = "ping"
     url = os.path.join(cls.INFO, path)
     for i in range(METADATA_SERVICE_NUM_RETRIES):
         try:
             if monitor:
                 with monitor.measure("metaflow.service_metadata.get"):
                     resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
             else:
                 resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
         except:
             if monitor:
                 with monitor.count("metaflow.service_metadata.failed_request"):
                     if i == METADATA_SERVICE_NUM_RETRIES - 1:
                         raise
             else:
                 if i == METADATA_SERVICE_NUM_RETRIES - 1:
                     raise
             resp = None
         else:
             if resp.status_code < 300:
                 return resp.headers.get("METADATA_SERVICE_VERSION", None)
             elif resp.status_code != 503:
                 raise ServiceException(
                     "Metadata request (%s) failed"
                     " (code %s): %s" % (url, resp.status_code, resp.text),
                     resp.status_code,
                     resp.text,
                 )
         time.sleep(2 ** i)
     if resp:
         raise ServiceException(
             "Metadata request (%s) failed (code %s): %s"
             % (url, resp.status_code, resp.text),
             resp.status_code,
             resp.text,
         )
     else:
         raise ServiceException("Metadata request (%s) failed" % url)
Example #23
 def _parameters(self):
     parameters = []
     has_schedule = self._cron() is not None
     for _, param in self.flow._get_parameters():
         # Throw an exception if a schedule is set for a flow with required
         # parameters with no defaults.
         is_required = param.kwargs.get('required', False)
         if 'default' not in param.kwargs and is_required and has_schedule:
             raise MetaflowException(
                 "The parameter *%s* does not have a "
                 "default and is required. Scheduling "
                 "such parameters via Argo CronWorkflow "
                 "is not currently supported." % param.name)
         p = {'name': param.name}
         if 'default' in param.kwargs:
             v = deploy_time_eval(param.kwargs.get('default'))
             p['value'] = json.dumps(v)
         parameters.append(p)
     return parameters
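A hedged illustration of the parameter dicts produced above (parameter names and defaults are made up):
# Parameter("alpha", default=0.5)  -> {'name': 'alpha', 'value': '0.5'}  (default JSON-encoded)
# Parameter("run_date")            -> {'name': 'run_date'}  (no default given)
# Parameter("run_date", required=True) on a scheduled flow -> MetaflowException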
Example #24
def make_flow(obj,
              token,
              name,
              tags,
              namespace,
              max_workers,
              workflow_timeout,
              is_project):
    datastore = obj.datastore(obj.flow.name,
                              mode='w',
                              metadata=obj.metadata,
                              event_logger=obj.event_logger,
                              monitor=obj.monitor)
    if datastore.TYPE != 's3':
        raise MetaflowException("AWS Step Functions requires --datastore=s3.")

    # Attach AWS Batch decorator to the flow
    decorators._attach_decorators(obj.flow, [BatchDecorator.name])
    decorators._init_step_decorators(
            obj.flow, obj.graph, obj.environment, obj.datastore, obj.logger)

    obj.package = MetaflowPackage(
        obj.flow, obj.environment, obj.echo, obj.package_suffixes)
    package_url = datastore.save_data(
        obj.package.sha, TransformableObject(obj.package.blob))

    return StepFunctions(name,
                         obj.graph,
                         obj.flow,
                         obj.package,
                         package_url,
                         token,
                         obj.metadata,
                         obj.datastore,
                         obj.environment,
                         obj.event_logger,
                         obj.monitor,
                         tags=tags,
                         namespace=namespace,
                         max_workers=max_workers,
                         username=get_username(),
                         workflow_timeout=workflow_timeout,
                         is_project=is_project)
Example #25
def make_flow(obj, token, name, tags, namespace, max_workers, workflow_timeout,
              workflow_priority):
    # TODO: Make this check less specific to Amazon S3 as we introduce
    #       support for more cloud object stores.
    if obj.flow_datastore.TYPE != "s3":
        raise MetaflowException("Argo Workflows requires --datastore=s3.")

    # Attach @kubernetes and @environment decorator to the flow to
    # ensure that the related decorator hooks are invoked.
    decorators._attach_decorators(
        obj.flow, [KubernetesDecorator.name, EnvironmentDecorator.name])

    decorators._init_step_decorators(obj.flow, obj.graph, obj.environment,
                                     obj.flow_datastore, obj.logger)

    # Save the code package in the flow datastore so that both user code and
    # metaflow package can be retrieved during workflow execution.
    obj.package = MetaflowPackage(obj.flow, obj.environment, obj.echo,
                                  obj.package_suffixes)
    package_url, package_sha = obj.flow_datastore.save_data([obj.package.blob],
                                                            len_hint=1)[0]

    return ArgoWorkflows(
        name,
        obj.graph,
        obj.flow,
        package_sha,
        package_url,
        token,
        obj.metadata,
        obj.flow_datastore,
        obj.environment,
        obj.event_logger,
        obj.monitor,
        tags=tags,
        namespace=namespace,
        max_workers=max_workers,
        username=get_username(),
        workflow_timeout=workflow_timeout,
        workflow_priority=workflow_priority,
    )
Example #26
 def __call__(self):
     unit = ['B', 'KB', 'MB', 'GB', 'TB']
     sz = self._size
     pos = 0
     while pos < len(unit) and sz >= 1024:
         sz = sz // 1024
         pos += 1
     if pos >= 3:
         extra = '(this may take a while)'
     else:
         extra = ''
     self._logger('Including file %s of size %d%s %s' %
                  (self._path, sz, unit[pos], extra))
     if self._is_text:
         return io.open(self._path, mode='rt',
                        encoding=self._encoding).read()
     try:
         return io.open(self._path, mode='rb').read()
     except IOError:
         # If we get an error here, since we know that the file exists already,
         # it means that read failed which happens with Python 2.7 for large files
         raise MetaflowException(
             'Cannot read file at %s -- this is likely because it is too '
             'large to be properly handled by Python 2.7' % self._path)
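A hedged example of the size formatting in the logger call above (sizes are made up):
# self._size == 5 * 1024**2 -> logs "... of size 5MB" with no extra note
# self._size == 7 * 1024**3 -> logs "... of size 7GB (this may take a while)"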
Example #27
def _local_multinode_control_task_step_func(flow, env_to_use, step_func,
                                            retry_count):
    """
    Used as multinode UBF control task when run in local mode.
    """
    from metaflow import current
    from metaflow.cli_args import cli_args
    from metaflow.unbounded_foreach import UBF_TASK
    import subprocess

    assert flow._unbounded_foreach
    foreach_iter = flow._parallel_ubf_iter
    if foreach_iter.__class__.__name__ != "ParallelUBF":
        raise MetaflowException(
            "Expected ParallelUBFIter iterator object, got:" +
            foreach_iter.__class__.__name__)

    num_parallel = foreach_iter.num_parallel
    os.environ["MF_PARALLEL_NUM_NODES"] = str(num_parallel)
    os.environ["MF_PARALLEL_MAIN_IP"] = "127.0.0.1"

    run_id = current.run_id
    step_name = current.step_name
    control_task_id = current.task_id

    (_, split_step_name, split_task_id) = control_task_id.split("-")[1:]
    # UBF handling for multinode case
    top_task_id = control_task_id.replace("control-", "")  # chop "-0"
    mapper_task_ids = [control_task_id]
    # If we are running inside Conda, we use the base executable FIRST;
    # the conda environment will then be used when runtime_step_cli is
    # called. This is so that it can properly set up all the metaflow
    # aliases needed.
    executable = env_to_use.executable(step_name)
    script = sys.argv[0]

    # start workers
    subprocesses = []
    for node_index in range(1, num_parallel):
        task_id = "%s_node_%d" % (top_task_id, node_index)
        mapper_task_ids.append(task_id)
        os.environ["MF_PARALLEL_NODE_INDEX"] = str(node_index)
        input_paths = "%s/%s/%s" % (run_id, split_step_name, split_task_id)
        # Override specific `step` kwargs.
        kwargs = cli_args.step_kwargs
        kwargs["split_index"] = str(node_index)
        kwargs["run_id"] = run_id
        kwargs["task_id"] = task_id
        kwargs["input_paths"] = input_paths
        kwargs["ubf_context"] = UBF_TASK
        kwargs["retry_count"] = str(retry_count)

        cmd = cli_args.step_command(executable,
                                    script,
                                    step_name,
                                    step_kwargs=kwargs)
        p = subprocess.Popen(cmd)
        subprocesses.append(p)

    flow._control_mapper_tasks = [
        "%s/%s/%s" % (run_id, step_name, mapper_task_id)
        for mapper_task_id in mapper_task_ids
    ]
    flow._control_task_is_mapper_zero = True

    # run the step function ourselves
    os.environ["MF_PARALLEL_NODE_INDEX"] = "0"
    step_func()

    # join the subprocesses
    for p in subprocesses:
        p.wait()
        if p.returncode:
            raise Exception("Subprocess failed, return code {}".format(
                p.returncode))
Example #28
def resolve_token(
    name, token_prefix, obj, authorize, given_token, generate_new_token, is_project
):

    # 1) retrieve the previous deployment, if one exists
    workflow = StepFunctions.get_existing_deployment(name)
    if workflow is None:
        obj.echo(
            "It seems this is the first time you are deploying *%s* to "
            "AWS Step Functions." % name
        )
        prev_token = None
    else:
        prev_user, prev_token = workflow

    # 2) authorize this deployment
    if prev_token is not None:
        if authorize is None:
            authorize = load_token(token_prefix)
        elif authorize.startswith("production:"):
            authorize = authorize[11:]

        # we allow the user who deployed the previous version to re-deploy,
        # even if they don't have the token
        if prev_user != get_username() and authorize != prev_token:
            obj.echo(
                "There is an existing version of *%s* on AWS Step "
                "Functions which was deployed by the user "
                "*%s*." % (name, prev_user)
            )
            obj.echo(
                "To deploy a new version of this flow, you need to use "
                "the same production token that they used. "
            )
            obj.echo(
                "Please reach out to them to get the token. Once you "
                "have it, call this command:"
            )
            obj.echo("    step-functions create --authorize MY_TOKEN", fg="green")
            obj.echo(
                'See "Organizing Results" at docs.metaflow.org for more '
                "information about production tokens."
            )
            raise IncorrectProductionToken(
                "Try again with the correct " "production token."
            )

    # 3) do we need a new token or should we use the existing token?
    if given_token:
        if is_project:
            # we rely on a known prefix for @project tokens, so we can't
            # allow the user to specify a custom token with an arbitrary prefix
            raise MetaflowException(
                "--new-token is not supported for "
                "@projects. Use --generate-new-token to "
                "create a new token."
            )
        if given_token.startswith("production:"):
            given_token = given_token[11:]
        token = given_token
        obj.echo("")
        obj.echo("Using the given token, *%s*." % token)
    elif prev_token is None or generate_new_token:
        token = new_token(token_prefix, prev_token)
        if token is None:
            if prev_token is None:
                raise MetaflowInternalError(
                    "We could not generate a new " "token. This is unexpected. "
                )
            else:
                raise MetaflowException(
                    "--generate-new-token option is not "
                    "supported after using --new-token. "
                    "Use --new-token to make a new "
                    "namespace."
                )
        obj.echo("")
        obj.echo("A new production token generated.")
    else:
        token = prev_token

    obj.echo("")
    obj.echo("The namespace of this production flow is")
    obj.echo("    production:%s" % token, fg="green")
    obj.echo(
        "To analyze results of this production flow " "add this line in your notebooks:"
    )
    obj.echo('    namespace("production:%s")' % token, fg="green")
    obj.echo(
        "If you want to authorize other people to deploy new versions "
        "of this flow to AWS Step Functions, they need to call"
    )
    obj.echo("    step-functions create --authorize %s" % token, fg="green")
    obj.echo("when deploying this flow to AWS Step Functions for the first " "time.")
    obj.echo(
        'See "Organizing Results" at https://docs.metaflow.org/ for more '
        "information about production tokens."
    )
    obj.echo("")
    store_token(token_prefix, token)
    return token
Example #29
    def step_init(self, flow, graph, step, decos, environment, flow_datastore,
                  logger):
        # Executing Kubernetes jobs requires a non-local datastore.
        if flow_datastore.TYPE != "s3":
            raise KubernetesException(
                "The *@kubernetes* decorator requires --datastore=s3 at the moment."
            )

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore

        if any([deco.name == "batch" for deco in decos]):
            raise MetaflowException(
                "Step *{step}* is marked for execution both on AWS Batch and "
                "Kubernetes. Please use one or the other.".format(step=step))

        for deco in decos:
            if getattr(deco, "IS_PARALLEL", False):
                raise KubernetesException(
                    "@kubernetes does not support parallel execution currently."
                )

        # Set run time limit for the Kubernetes job.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise KubernetesException(
                "The timeout for step *{step}* should be at least 60 seconds for "
                "execution on Kubernetes.".format(step=step))

        for deco in decos:
            if isinstance(deco, ResourcesDecorator):
                for k, v in deco.attributes.items():
                    # TODO: Special case GPUs when they are introduced in @resources.
                    if k in self.attributes:
                        if self.defaults[k] is None:
                            # skip if expected value isn't an int/float
                            continue
                        # We use the larger of @resources and @batch attributes
                        # TODO: Fix https://github.com/Netflix/metaflow/issues/467
                        my_val = self.attributes.get(k)
                        if not (my_val is None and v is None):
                            self.attributes[k] = str(
                                max(float(my_val or 0), float(v or 0)))

        # Check GPU vendor.
        if self.attributes["gpu_vendor"].lower() not in ("amd", "nvidia"):
            raise KubernetesException(
                "GPU vendor *{}* for step *{step}* is not currently supported."
                .format(self.attributes["gpu_vendor"], step=step))

        # CPU, Disk, and Memory values should be greater than 0.
        for attr in ["cpu", "disk", "memory"]:
            if not (isinstance(self.attributes[attr],
                               (int, unicode, basestring, float))
                    and float(self.attributes[attr]) > 0):
                raise KubernetesException(
                    "Invalid {} value *{}* for step *{step}*; it should be greater than 0"
                    .format(attr, self.attributes[attr], step=step))

        if self.attributes["gpu"] is not None and not (
                isinstance(self.attributes["gpu"], (int, unicode, basestring))
                and float(self.attributes["gpu"]).is_integer()):
            raise KubernetesException(
                "Invalid GPU value *{}* for step *{step}*; it should be an integer"
                .format(self.attributes["gpu"], step=step))
Example #30
    def _visit(self, node, tasks=[], nested_dags=[], exit_node=None):
        """
        Traverse graph nodes.
        Special treatment of split and foreach subgraphs
        """
        def _linear_or_first_dag_task(node):
            if self._is_foreach_first_child(node):
                return dag_first_task(node)
            elif node.name == 'start':
                return start_task()
            else:
                return linear_task(node)

        if node.type == 'end':
            tasks.append(linear_task(node))

        elif node == exit_node:
            pass  # end recursion

        elif node.type in ('linear', 'join'):
            tasks.append(_linear_or_first_dag_task(node))
            tasks, nested_dags = self._visit(self.graph[node.out_funcs[0]],
                                             tasks, nested_dags, exit_node)

        elif node.type == 'split-and':
            tasks.append(_linear_or_first_dag_task(node))
            join = self.graph[node.matching_join]
            # traverse branches
            for out in node.out_funcs:
                tasks, nested_dags = self._visit(self.graph[out],
                                                 tasks,
                                                 nested_dags,
                                                 exit_node=join)
            # finally continue with join node
            tasks, nested_dags = self._visit(join, tasks, nested_dags,
                                             exit_node)

        elif node.type == 'foreach':
            tasks.append(_linear_or_first_dag_task(node))
            for_each = foreach_task(node)
            tasks.append(for_each)
            join = self.graph[node.matching_join]
            # create nested dag and add tasks for foreach block
            nested_tasks, nested_dags = self._visit(
                self.graph[node.out_funcs[0]],
                tasks=[],
                nested_dags=nested_dags,
                exit_node=join)
            nested_dags.append(nested_dag(for_each['name'], nested_tasks))
            # join ends the foreach block
            tasks.append(join_foreach_task(join, parent_task=for_each))
            # continue with node after join
            tasks, nested_dags = self._visit(self.graph[join.out_funcs[0]],
                                             tasks, nested_dags, exit_node)

        else:
            raise MetaflowException(
                'Undefined node type: {} in step: {}'.format(
                    node.type, node.name))

        return tasks, nested_dags