def _process_parameters(self):
    parameters = []
    has_schedule = self._cron() is not None
    for var, param in self.flow._get_parameters():
        valuetype = param.kwargs.get('type', str)
        value = deploy_time_eval(param.kwargs.get('default'))
        required = param.kwargs.get('required', False)
        # Throw an exception if the flow has optional parameters
        # with no default value.
        if value is None and required is False:
            raise MetaflowException("The value of parameter *%s* is "
                                    "ambiguous. It does not have a "
                                    "default and it is not required."
                                    % param.name)
        # Throw an exception if a schedule is set for a flow with required
        # parameters with no defaults. We currently don't have any notion
        # of data triggers in AWS Event Bridge.
        if value is None and required and has_schedule:
            raise MetaflowException("The parameter *%s* does not have a "
                                    "default and is required. Scheduling "
                                    "such parameters via AWS Event Bridge "
                                    "is not currently supported."
                                    % param.name)
        parameters.append(dict(name=param.name, value=value))
    return parameters
def get_aws_client(module, with_error=False, params={}):
    from metaflow.exception import MetaflowException
    from metaflow.metaflow_config import AWS_SANDBOX_ENABLED, \
        AWS_SANDBOX_STS_ENDPOINT_URL, AWS_SANDBOX_API_KEY
    import requests
    try:
        import boto3
        from botocore.exceptions import ClientError
    except (NameError, ImportError):
        raise MetaflowException(
            "Could not import module 'boto3'. Install boto3 first.")

    if AWS_SANDBOX_ENABLED:
        global cached_aws_sandbox_creds
        if cached_aws_sandbox_creds is None:
            # authenticate using STS
            url = "%s/auth/token" % AWS_SANDBOX_STS_ENDPOINT_URL
            headers = {'x-api-key': AWS_SANDBOX_API_KEY}
            try:
                r = requests.get(url, headers=headers)
                r.raise_for_status()
                cached_aws_sandbox_creds = r.json()
            except requests.exceptions.HTTPError as e:
                raise MetaflowException(repr(e))
        if with_error:
            return boto3.session.Session(**cached_aws_sandbox_creds).client(
                module, **params), ClientError
        return boto3.session.Session(**cached_aws_sandbox_creds).client(
            module, **params)
    if with_error:
        return boto3.client(module, **params), ClientError
    return boto3.client(module, **params)
def _process_parameters(self):
    parameters = []
    has_schedule = self._cron() is not None
    seen = set()
    for var, param in self.flow._get_parameters():
        # Throw an exception if the parameter is specified twice.
        norm = param.name.lower()
        if norm in seen:
            raise MetaflowException("Parameter *%s* is specified twice. "
                                    "Note that parameter names are "
                                    "case-insensitive." % param.name)
        seen.add(norm)

        is_required = param.kwargs.get('required', False)
        # Throw an exception if a schedule is set for a flow with required
        # parameters with no defaults. We currently don't have any notion
        # of data triggers in AWS Event Bridge.
        if 'default' not in param.kwargs and is_required and has_schedule:
            raise MetaflowException("The parameter *%s* does not have a "
                                    "default and is required. Scheduling "
                                    "such parameters via AWS Event Bridge "
                                    "is not currently supported."
                                    % param.name)
        value = deploy_time_eval(param.kwargs.get('default'))
        parameters.append(dict(name=param.name, value=value))
    return parameters
def get_client(module, with_error=False, params={}, s3_role_arn=None):
    from metaflow.exception import MetaflowException
    from metaflow.metaflow_config import (
        AWS_SANDBOX_ENABLED,
        AWS_SANDBOX_STS_ENDPOINT_URL,
        AWS_SANDBOX_API_KEY,
    )
    import requests

    try:
        import boto3
        import botocore
        from botocore.exceptions import ClientError
    except (NameError, ImportError):
        raise MetaflowException(
            "Could not import module 'boto3'. Install boto3 first."
        )

    if AWS_SANDBOX_ENABLED:
        # role is ignored in the sandbox
        global cached_aws_sandbox_creds
        if cached_aws_sandbox_creds is None:
            # authenticate using STS
            url = "%s/auth/token" % AWS_SANDBOX_STS_ENDPOINT_URL
            headers = {"x-api-key": AWS_SANDBOX_API_KEY}
            try:
                r = requests.get(url, headers=headers)
                r.raise_for_status()
                cached_aws_sandbox_creds = r.json()
            except requests.exceptions.HTTPError as e:
                raise MetaflowException(repr(e))
        if with_error:
            return (
                boto3.session.Session(**cached_aws_sandbox_creds).client(
                    module, **params
                ),
                ClientError,
            )
        return boto3.session.Session(**cached_aws_sandbox_creds).client(
            module, **params
        )

    session = boto3.session.Session()
    if s3_role_arn:
        fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
            client_creator=session._session.create_client,
            source_credentials=session._session.get_credentials(),
            role_arn=s3_role_arn,
            extra_args={},
        )
        creds = botocore.credentials.DeferredRefreshableCredentials(
            method="assume-role", refresh_using=fetcher.fetch_credentials
        )
        botocore_session = botocore.session.Session()
        botocore_session._credentials = creds
        session = boto3.session.Session(botocore_session=botocore_session)
    if with_error:
        return session.client(module, **params), ClientError
    return session.client(module, **params)
def resolve_workflow_name(obj, name):
    project = current.get("project_name")
    obj._is_workflow_name_modified = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. Use --branch instead.")
        workflow_name = current.project_flow_name
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s"
            % to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16])
        is_project = True
        # Argo Workflow names can't be longer than 253 characters, so we
        # truncate by default. Also, while project and branch allow for
        # underscores, Argo Workflows doesn't (DNS Subdomain names as defined
        # in RFC 1123) - so we will remove any underscores as well as convert
        # the name to lower case.
        if len(workflow_name) > 253:
            name_hash = to_unicode(
                base64.b32encode(sha1(
                    to_bytes(workflow_name)).digest()))[:8].lower()
            workflow_name = "%s-%s" % (workflow_name[:242], name_hash)
            obj._is_workflow_name_modified = True
        if not VALID_NAME.search(workflow_name):
            workflow_name = (re.compile(r"^[^A-Za-z0-9]+").sub(
                "", workflow_name).replace("_", "").lower())
            obj._is_workflow_name_modified = True
    else:
        if name and not VALID_NAME.search(name):
            raise MetaflowException(
                "Name '%s' contains invalid characters. The "
                "name must consist of lower case alphanumeric characters, '-' or '.'"
                ", and must start and end with an alphanumeric character." % name)
        workflow_name = name if name else current.flow_name
        token_prefix = workflow_name
        is_project = False

        if len(workflow_name) > 253:
            msg = ("The full name of the workflow:\n*%s*\nis longer than 253 "
                   "characters.\n\n"
                   "To deploy this workflow to Argo Workflows, please "
                   "assign a shorter name\nusing the option\n"
                   "*argo-workflows --name <name> create*." % workflow_name)
            raise ArgoWorkflowsNameTooLong(msg)
        if not VALID_NAME.search(workflow_name):
            workflow_name = (re.compile(r"^[^A-Za-z0-9]+").sub(
                "", workflow_name).replace("_", "").lower())
            obj._is_workflow_name_modified = True
    return workflow_name, token_prefix.lower(), is_project
def resolve_state_machine_name(obj, name):
    def attach_prefix(name):
        if SFN_STATE_MACHINE_PREFIX is not None:
            return SFN_STATE_MACHINE_PREFIX + "_" + name
        return name

    project = current.get("project_name")
    obj._is_state_machine_name_hashed = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. "
                "Use --branch instead."
            )
        state_machine_name = attach_prefix(current.project_flow_name)
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s"
            % to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16]
        )
        is_project = True
        # AWS Step Functions has a limit of 80 chars for state machine names.
        # We truncate the state machine name if the computed name is greater
        # than 60 chars and append a hashed suffix to ensure uniqueness.
        if len(state_machine_name) > 60:
            name_hash = to_unicode(
                base64.b32encode(sha1(to_bytes(state_machine_name)).digest())
            )[:16].lower()
            state_machine_name = "%s-%s" % (state_machine_name[:60], name_hash)
            obj._is_state_machine_name_hashed = True
    else:
        if name and VALID_NAME.search(name):
            raise MetaflowException("Name '%s' contains invalid characters." % name)

        state_machine_name = attach_prefix(name if name else current.flow_name)
        token_prefix = state_machine_name
        is_project = False

        if len(state_machine_name) > 80:
            msg = (
                "The full name of the workflow:\n*%s*\nis longer than 80 "
                "characters.\n\n"
                "To deploy this workflow to AWS Step Functions, please "
                "assign a shorter name\nusing the option\n"
                "*step-functions --name <name> create*." % state_machine_name
            )
            raise StepFunctionsStateMachineNameTooLong(msg)

    return state_machine_name, token_prefix.lower(), is_project
def step_init(self, flow, graph, step, decos, environment, flow_datastore,
              logger):
    # The total number of attempts must not exceed MAX_ATTEMPTS.
    # attempts = normal task (1) + retries (N) + @catch fallback (1)
    if int(self.attributes["times"]) + 2 > MAX_ATTEMPTS:
        raise MetaflowException("The maximum number of retries is "
                                "@retry(times=%d)." % (MAX_ATTEMPTS - 2))
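# A minimal sketch of the attempt-budget arithmetic enforced above, assuming
# MAX_ATTEMPTS is 6 (its value in metaflow_config at the time of writing);
# the helper name below is hypothetical and only illustrates the check.
MAX_ATTEMPTS = 6

def validate_retry_times(times):
    # 1 normal attempt + `times` retries + 1 @catch fallback attempt
    if 1 + times + 1 > MAX_ATTEMPTS:
        raise ValueError("The maximum number of retries is @retry(times=%d)."
                         % (MAX_ATTEMPTS - 2))

validate_retry_times(4)    # ok: 1 + 4 + 1 == 6
# validate_retry_times(5)  # would raise: 1 + 5 + 1 == 7 > 6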
def __init__(self, namespace=None):
    try:
        from kubernetes import client, config
    except (NameError, ImportError):
        raise MetaflowException(
            "Could not import module 'kubernetes'.\n\nInstall kubernetes "
            "Python package (https://pypi.org/project/kubernetes/) first.\n"
            "You can install the module by executing - "
            "%s -m pip install kubernetes\n"
            "or equivalent through your favorite Python package manager."
            % sys.executable)
    if os.getenv("KUBERNETES_SERVICE_HOST"):
        # We are inside a pod, authenticate via ServiceAccount assigned
        # to us
        config.load_incluster_config()
    else:
        # Use kubeconfig, likely $HOME/.kube/config
        # TODO (savin):
        #     1. Support generating kubeconfig on the fly using boto3
        #     2. Support auth via OIDC -
        #        https://docs.aws.amazon.com/eks/latest/userguide/authenticate-oidc-identity-provider.html
        config.load_kube_config()
    self._client = client
    self._namespace = namespace or "default"
    self._group = "argoproj.io"
    self._version = "v1alpha1"
def make_flow(obj, token, name, tags, namespace, max_workers,
              workflow_timeout, is_project):
    if obj.flow_datastore.TYPE != "s3":
        raise MetaflowException("AWS Step Functions requires --datastore=s3.")

    # Attach AWS Batch decorator to the flow
    decorators._attach_decorators(obj.flow, [BatchDecorator.name])
    decorators._init_step_decorators(obj.flow, obj.graph, obj.environment,
                                     obj.flow_datastore, obj.logger)

    obj.package = MetaflowPackage(obj.flow, obj.environment, obj.echo,
                                  obj.package_suffixes)
    package_url, package_sha = obj.flow_datastore.save_data(
        [obj.package.blob], len_hint=1)[0]

    return StepFunctions(
        name,
        obj.graph,
        obj.flow,
        package_sha,
        package_url,
        token,
        obj.metadata,
        obj.flow_datastore,
        obj.environment,
        obj.event_logger,
        obj.monitor,
        tags=tags,
        namespace=namespace,
        max_workers=max_workers,
        username=get_username(),
        workflow_timeout=workflow_timeout,
        is_project=is_project,
    )
def __init__(self, k8s_namespace):
    try:
        from kubernetes import client, config
    except (NameError, ImportError):
        raise MetaflowException(
            "Could not import module 'kubernetes'. Install kubernetes "
            "Python package (https://pypi.org/project/kubernetes/) first.")
    config.load_kube_config()
    self.client = client
    if k8s_namespace is None:
        k8s_namespace = 'default'
    self.k8s_namespace = from_conf('METAFLOW_K8S_NAMESPACE',
                                   default=k8s_namespace)
    try:
        core_api_instance = self.client.CoreV1Api()
        core_api_instance.read_namespace(self.k8s_namespace)
    except self.client.rest.ApiException as e:
        msg = json.loads(
            e.body)["message"] if e.body is not None else e.reason
        raise ArgoException(msg)
    self.api_instance = self.client.CustomObjectsApi()
    self.kwargs = {
        'version': 'v1alpha1',
        'group': 'argoproj.io',
        'namespace': self.k8s_namespace
    }
def control_task_step_func(self, flow, graph, retry_count):
    from metaflow import current
    run_id = current.run_id
    step_name = current.step_name
    control_task_id = current.task_id
    (_, split_step_name, split_task_id) = control_task_id.split('-')[1:]
    # If we are running inside Conda, we use the base executable FIRST;
    # the conda environment will then be used when runtime_step_cli is
    # called. This is so that it can properly set up all the metaflow
    # aliases needed.
    env_to_use = getattr(self.environment, 'base_env', self.environment)
    executable = env_to_use.executable(step_name)
    script = sys.argv[0]

    # Access the `unbounded_foreach` param using `flow` (as datastore).
    assert(flow._unbounded_foreach)
    foreach_iter = flow.input
    if not isinstance(foreach_iter, InternalTestUnboundedForeachInput):
        raise MetaflowException('Expected type to be '
                                'InternalTestUnboundedForeachInput. Found %s'
                                % (type(foreach_iter)))
    foreach_num_splits = sum(1 for _ in foreach_iter)

    print('Simulating UnboundedForeach over value:',
          foreach_iter, 'num_splits:', foreach_num_splits)
    mapper_tasks = []

    for i in range(foreach_num_splits):
        task_id = \
            '%s-%d' % (control_task_id.replace('control-', 'test-ubf-'), i)
        pathspec = '%s/%s/%s' % (run_id, step_name, task_id)
        mapper_tasks.append(to_unicode(pathspec))
        input_paths = '%s/%s/%s' % (run_id, split_step_name, split_task_id)

        # Override specific `step` kwargs.
        kwargs = cli_args.step_kwargs
        kwargs['split_index'] = str(i)
        kwargs['run_id'] = run_id
        kwargs['task_id'] = task_id
        kwargs['input_paths'] = input_paths
        kwargs['ubf_context'] = UBF_TASK
        kwargs['retry_count'] = 0

        cmd = cli_args.step_command(executable, script, step_name,
                                    step_kwargs=kwargs)
        step_cli = u' '.join(cmd)
        # Print cmdline for execution. Doesn't work without the temporary
        # unicode object while using `print`.
        print(u'[${cwd}] Starting split#{split} with cmd:{cmd}'
              .format(cwd=os.getcwd(), split=i, cmd=step_cli))
        output_bytes = subprocess.check_output(cmd)
        output = to_unicode(output_bytes)
        for line in output.splitlines():
            print('[Split#%d] %s' % (i, line))

    # Save the list of (child) mapper task pathspec(s) into a designated
    # artifact `_control_mapper_tasks`.
    flow._control_mapper_tasks = mapper_tasks
def _request(cls, monitor, path, data=None, retry_409_path=None):
    if cls.INFO is None:
        raise MetaflowException('Missing Metaflow Service URL. '
                                'Specify with METAFLOW_SERVICE_URL '
                                'environment variable')
    url = os.path.join(cls.INFO, path.lstrip('/'))
    for i in range(METADATA_SERVICE_NUM_RETRIES):
        try:
            if data is None:
                if monitor:
                    with monitor.measure('metaflow.service_metadata.get'):
                        resp = requests.get(url,
                                            headers=METADATA_SERVICE_HEADERS)
                else:
                    resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
            else:
                if monitor:
                    with monitor.measure('metaflow.service_metadata.post'):
                        resp = requests.post(url,
                                             headers=METADATA_SERVICE_HEADERS,
                                             json=data)
                else:
                    resp = requests.post(url,
                                         headers=METADATA_SERVICE_HEADERS,
                                         json=data)
        except:  # noqa E722
            if monitor:
                with monitor.count('metaflow.service_metadata.failed_request'):
                    if i == METADATA_SERVICE_NUM_RETRIES - 1:
                        raise
            else:
                if i == METADATA_SERVICE_NUM_RETRIES - 1:
                    raise
            resp = None
        else:
            if resp.status_code < 300:
                return resp.json()
            elif resp.status_code == 409 and data is not None:
                # a special case: the post fails due to a conflict
                # this could occur when we missed a success response
                # from the first POST request but the request
                # actually went through, so a subsequent POST
                # returns 409 (conflict) or we end up with a
                # conflict while running on AWS Step Functions
                # instead of retrying the post we retry with a get since
                # the record is guaranteed to exist
                if retry_409_path:
                    return cls._request(monitor, retry_409_path)
                else:
                    return
            elif resp.status_code != 503:
                raise ServiceException('Metadata request (%s) failed '
                                       '(code %s): %s'
                                       % (path, resp.status_code, resp.text),
                                       resp.status_code,
                                       resp.text)
        time.sleep(2**i)
    if resp:
        raise ServiceException('Metadata request (%s) failed (code %s): %s'
                               % (path, resp.status_code, resp.text),
                               resp.status_code,
                               resp.text)
    else:
        raise ServiceException('Metadata request (%s) failed' % path)
def step_init(self, flow, graph, step, decos, environment, datastore, logger):
    # handling _foreach_var and _foreach_num_splits requires some
    # deeper thinking, so let's not support that use case for now
    self.logger = logger
    if graph[step].type == 'foreach':
        raise MetaflowException('@catch is defined for the step *%s* '
                                'but @catch is not supported in foreach '
                                'split steps.' % step)
def format_name(flow_name, project_name, deploy_prod, given_branch, user_name):
    if not project_name:
        # an empty string is not a valid project name
        raise MetaflowException(
            "@project needs a name. "
            "Try @project(name='some_name')"
        )
    elif re.search(VALID_NAME_RE, project_name):
        raise MetaflowException(
            "The @project name must contain only "
            "lowercase alphanumeric characters "
            "and underscores."
        )
    elif len(project_name) > VALID_NAME_LEN:
        raise MetaflowException(
            "The @project name must be shorter than "
            "%d characters." % VALID_NAME_LEN
        )

    if given_branch:
        if re.search(VALID_NAME_RE, given_branch):
            raise MetaflowException(
                "The branch name must contain only "
                "lowercase alphanumeric characters "
                "and underscores."
            )
        elif len(given_branch) > VALID_NAME_LEN:
            raise MetaflowException(
                "Branch name is too long. "
                "The maximum is %d characters." % VALID_NAME_LEN
            )
        if deploy_prod:
            branch = "prod.%s" % given_branch
        else:
            branch = "test.%s" % given_branch
    elif deploy_prod:
        branch = "prod"
    else:
        # For AWS Step Functions, we set the branch to the value of
        # environment variable `METAFLOW_OWNER`, since AWS Step Functions
        # has no notion of user name.
        branch = "user.%s" % os.environ.get("METAFLOW_OWNER", user_name)

    return ".".join((project_name, branch, flow_name)), branch
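# Hedged usage sketch of the naming scheme above; the argument values are
# hypothetical and METAFLOW_OWNER is assumed to be unset, so the user name
# falls through to the last positional argument.
#
#   format_name("MyFlow", "demo", False, None, "alice")
#   -> ("demo.user.alice.MyFlow", "user.alice")
#
#   format_name("MyFlow", "demo", True, None, "alice")
#   -> ("demo.prod.MyFlow", "prod")
#
#   format_name("MyFlow", "demo", True, "eu", "alice")
#   -> ("demo.prod.eu.MyFlow", "prod.eu")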
def _request(cls, monitor, path, data=None):
    if cls.INFO is None:
        raise MetaflowException(
            'Missing Metaflow Service URL. '
            'Specify with METAFLOW_SERVICE_URL environment variable')
    url = os.path.join(cls.INFO, path.lstrip('/'))
    for i in range(METADATA_SERVICE_NUM_RETRIES):
        try:
            if data is None:
                if monitor:
                    with monitor.measure('metaflow.service_metadata.get'):
                        resp = requests.get(
                            url, headers=METADATA_SERVICE_HEADERS)
                else:
                    resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
            else:
                if monitor:
                    with monitor.measure('metaflow.service_metadata.post'):
                        resp = requests.post(
                            url, headers=METADATA_SERVICE_HEADERS, json=data)
                else:
                    resp = requests.post(url,
                                         headers=METADATA_SERVICE_HEADERS,
                                         json=data)
        except:  # noqa E722
            if monitor:
                with monitor.count(
                        'metaflow.service_metadata.failed_request'):
                    if i == METADATA_SERVICE_NUM_RETRIES - 1:
                        raise
            else:
                if i == METADATA_SERVICE_NUM_RETRIES - 1:
                    raise
            resp = None
        else:
            if resp.status_code < 300:
                return resp.json()
            elif resp.status_code != 503:
                raise ServiceException(
                    'Metadata request (%s) failed (code %s): %s' %
                    (path, resp.status_code, resp.text),
                    resp.status_code,
                    resp.text)
        time.sleep(2**i)
    if resp:
        raise ServiceException(
            'Metadata request (%s) failed (code %s): %s' %
            (path, resp.status_code, resp.text),
            resp.status_code,
            resp.text)
    else:
        raise ServiceException('Metadata request (%s) failed' % path)
def create(obj, image, image_pull_secrets, env, env_from, labels, annotations,
           k8s_namespace, embedded, max_workers, volumes,
           workflow_timeout=None, only_json=False):
    obj.echo("Deploying *%s* to Argo Workflow Templates..."
             % obj.workflow_template_name, bold=True)

    if obj.flow_datastore.TYPE != 's3':
        raise MetaflowException("Argo Workflows require --datastore=s3.")

    # When using conda, attach the AWS Batch decorator to the flow.
    # This results in 'linux-64' libraries being packaged.
    decorators._attach_decorators(obj.flow, [BatchDecorator.name])
    decorators._init_step_decorators(obj.flow, obj.graph, obj.environment,
                                     obj.flow_datastore, obj.logger)

    obj.package = MetaflowPackage(obj.flow, obj.environment, obj.echo,
                                  obj.package_suffixes)
    package_url, package_sha = obj.flow_datastore.save_data(
        [obj.package.blob], len_hint=1)[0]

    warn_use_argo_image(obj)

    workflow = ArgoWorkflow(obj.workflow_template_name, obj.flow, obj.graph,
                            obj.package,
                            package_url if not embedded else None,
                            obj.metadata, obj.flow_datastore, obj.environment,
                            obj.event_logger, obj.monitor, image,
                            image_pull_secrets, env, env_from, labels,
                            annotations, max_workers, volumes,
                            workflow_timeout)

    if only_json:
        obj.echo_always(workflow.to_json(), err=False, no_bold=True, nl=False)
    else:
        workflow.deploy(k8s_namespace)
        obj.echo(
            "WorkflowTemplate *{name}* is pushed to Argo Workflows successfully.\n"
            .format(name=obj.workflow_template_name), bold=True)
        workflow.schedule(k8s_namespace)
        obj.echo("What will trigger execution of the workflow:", bold=True)
        obj.echo(workflow.trigger_explanation(), indent=True)
def get_authenticated_boto3_client(module):
    from metaflow.exception import MetaflowException
    import requests
    try:
        import boto3
    except (NameError, ImportError):
        raise MetaflowException(
            "Could not import module 'boto3'. Install boto3 first.")

    if AWS_SANDBOX_ENABLED:
        global cached_aws_sandbox_creds
        if cached_aws_sandbox_creds is None:
            # authenticate using STS
            url = "%s/auth/token" % AWS_SANDBOX_STS_ENDPOINT_URL
            headers = {'x-api-key': AWS_SANDBOX_API_KEY}
            try:
                r = requests.get(url, headers=headers)
                r.raise_for_status()
                cached_aws_sandbox_creds = r.json()
            except requests.exceptions.HTTPError as e:
                raise MetaflowException(repr(e))
        return boto3.session.Session(
            **cached_aws_sandbox_creds).client(module)
    return boto3.client(module)
def __init__(self):
    # TODO: Look into removing the usage of Kubernetes Python SDK
    # at some point in the future. Given that Kubernetes Python SDK
    # aggressively drops support for older kubernetes clusters, continued
    # dependency on it may bite our users.
    try:
        # Kubernetes is a soft dependency.
        from kubernetes import client, config
    except (NameError, ImportError):
        raise MetaflowException(
            "Could not import module 'kubernetes'. Install kubernetes "
            "Python package (https://pypi.org/project/kubernetes/) first.")
    self._refresh_client()
def init_config():
    # Read configuration from $METAFLOW_HOME/config_<profile>.json.
    home = os.environ.get('METAFLOW_HOME', '~/.metaflowconfig')
    profile = os.environ.get('METAFLOW_PROFILE')
    path_to_config = os.path.join(home, 'config.json')
    if profile:
        path_to_config = os.path.join(home, 'config_%s.json' % profile)
    path_to_config = os.path.expanduser(path_to_config)
    config = {}
    if os.path.exists(path_to_config):
        with open(path_to_config) as f:
            return json.load(f)
    elif profile:
        raise MetaflowException('Unable to locate METAFLOW_PROFILE \'%s\' '
                                'in \'%s\'' % (profile, home))
    return config
def compute_resource_attributes(decos, compute_deco, resource_defaults):
    """
    Compute resource values taking into account defaults, the values specified
    in the compute decorator (like @batch or @kubernetes) directly, and
    resources specified via @resources decorator.

    Returns a dictionary of resource attr -> value (str).
    """
    assert compute_deco is not None

    # Use the value from resource_defaults by default (don't use None)
    result = {k: v for k, v in resource_defaults.items() if v is not None}

    for deco in decos:
        # If resource decorator is used
        if deco.name == "resources":
            for k, v in deco.attributes.items():
                my_val = compute_deco.attributes.get(k)
                # We use the non None value if there is only one or the larger
                # value if they are both non None. Note this considers "" to
                # be equivalent to the value zero.
                if my_val is None and v is None:
                    continue
                if my_val is not None and v is not None:
                    try:
                        result[k] = str(max(int(my_val or 0), int(v or 0)))
                    except ValueError:
                        # Here, we don't have ints so we compare the value and
                        # raise an exception if not equal
                        if my_val != v:
                            raise MetaflowException(
                                "'resources' and compute decorator have "
                                "conflicting values for '%s'. Please use "
                                "consistent values or specify this resource "
                                "constraint once" % k)
                elif my_val is not None:
                    result[k] = str(my_val or "0")
                else:
                    result[k] = str(v or "0")
            return result

    # If there is no resources decorator, values from compute_deco override
    # the defaults.
    for k in resource_defaults:
        if compute_deco.attributes.get(k) is not None:
            result[k] = str(compute_deco.attributes[k] or "0")
    return result
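# A small self-contained sketch of the max-merge above, using hypothetical
# stand-in decorator objects (only the `.name` and `.attributes` fields that
# compute_resource_attributes relies on are modeled).
class _FakeDeco:
    def __init__(self, name, attributes):
        self.name = name
        self.attributes = attributes

resources_deco = _FakeDeco("resources", {"cpu": "2", "memory": "8000"})
batch_deco = _FakeDeco("batch", {"cpu": "4", "memory": None})

# @batch wins on cpu (4 > 2); @resources supplies memory since @batch left it
# unset; all resolved values come back as strings.
print(compute_resource_attributes([resources_deco, batch_deco], batch_deco,
                                  {"cpu": "1", "memory": "4000"}))
# -> {'cpu': '4', 'memory': '8000'}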
def init_config(): # Read configuration from $METAFLOW_HOME/config_<profile>.json. home = os.environ.get("METAFLOW_HOME", "~/.metaflowconfig") profile = os.environ.get("METAFLOW_PROFILE") path_to_config = os.path.join(home, "config.json") if profile: path_to_config = os.path.join(home, "config_%s.json" % profile) path_to_config = os.path.expanduser(path_to_config) config = {} if os.path.exists(path_to_config): with open(path_to_config) as f: return json.load(f) elif profile: raise MetaflowException( "Unable to locate METAFLOW_PROFILE '%s' in '%s')" % (profile, home)) return config
def _version(cls, monitor):
    if cls.INFO is None:
        raise MetaflowException(
            "Missing Metaflow Service URL. "
            "Specify with METAFLOW_SERVICE_URL environment variable"
        )
    path = "ping"
    url = os.path.join(cls.INFO, path)
    for i in range(METADATA_SERVICE_NUM_RETRIES):
        try:
            if monitor:
                with monitor.measure("metaflow.service_metadata.get"):
                    resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
            else:
                resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
        except:
            if monitor:
                with monitor.count("metaflow.service_metadata.failed_request"):
                    if i == METADATA_SERVICE_NUM_RETRIES - 1:
                        raise
            else:
                if i == METADATA_SERVICE_NUM_RETRIES - 1:
                    raise
            resp = None
        else:
            if resp.status_code < 300:
                return resp.headers.get("METADATA_SERVICE_VERSION", None)
            elif resp.status_code != 503:
                raise ServiceException(
                    "Metadata request (%s) failed"
                    " (code %s): %s" % (url, resp.status_code, resp.text),
                    resp.status_code,
                    resp.text,
                )
        time.sleep(2 ** i)
    if resp:
        raise ServiceException(
            "Metadata request (%s) failed (code %s): %s"
            % (url, resp.status_code, resp.text),
            resp.status_code,
            resp.text,
        )
    else:
        raise ServiceException("Metadata request (%s) failed" % url)
def _parameters(self):
    parameters = []
    has_schedule = self._cron() is not None
    for _, param in self.flow._get_parameters():
        # Throw an exception if a schedule is set for a flow with required
        # parameters with no defaults.
        is_required = param.kwargs.get('required', False)
        if 'default' not in param.kwargs and is_required and has_schedule:
            raise MetaflowException(
                "The parameter *%s* does not have a "
                "default and is required. Scheduling "
                "such parameters via Argo CronWorkflow "
                "is not currently supported." % param.name)
        p = {'name': param.name}
        if 'default' in param.kwargs:
            v = deploy_time_eval(param.kwargs.get('default'))
            p['value'] = json.dumps(v)
        parameters.append(p)
    return parameters
def make_flow(obj, token, name, tags, namespace, max_workers,
              workflow_timeout, is_project):
    datastore = obj.datastore(obj.flow.name,
                              mode='w',
                              metadata=obj.metadata,
                              event_logger=obj.event_logger,
                              monitor=obj.monitor)
    if datastore.TYPE != 's3':
        raise MetaflowException("AWS Step Functions requires --datastore=s3.")

    # Attach AWS Batch decorator to the flow
    decorators._attach_decorators(obj.flow, [BatchDecorator.name])
    decorators._init_step_decorators(
        obj.flow, obj.graph, obj.environment, obj.datastore, obj.logger)

    obj.package = MetaflowPackage(
        obj.flow, obj.environment, obj.echo, obj.package_suffixes)
    package_url = datastore.save_data(
        obj.package.sha, TransformableObject(obj.package.blob))

    return StepFunctions(name,
                         obj.graph,
                         obj.flow,
                         obj.package,
                         package_url,
                         token,
                         obj.metadata,
                         obj.datastore,
                         obj.environment,
                         obj.event_logger,
                         obj.monitor,
                         tags=tags,
                         namespace=namespace,
                         max_workers=max_workers,
                         username=get_username(),
                         workflow_timeout=workflow_timeout,
                         is_project=is_project)
def make_flow(obj, token, name, tags, namespace, max_workers,
              workflow_timeout, workflow_priority):
    # TODO: Make this check less specific to Amazon S3 as we introduce
    # support for more cloud object stores.
    if obj.flow_datastore.TYPE != "s3":
        raise MetaflowException("Argo Workflows requires --datastore=s3.")

    # Attach @kubernetes and @environment decorator to the flow to
    # ensure that the related decorator hooks are invoked.
    decorators._attach_decorators(
        obj.flow, [KubernetesDecorator.name, EnvironmentDecorator.name])

    decorators._init_step_decorators(obj.flow, obj.graph, obj.environment,
                                     obj.flow_datastore, obj.logger)

    # Save the code package in the flow datastore so that both user code and
    # metaflow package can be retrieved during workflow execution.
    obj.package = MetaflowPackage(obj.flow, obj.environment, obj.echo,
                                  obj.package_suffixes)
    package_url, package_sha = obj.flow_datastore.save_data(
        [obj.package.blob], len_hint=1)[0]

    return ArgoWorkflows(
        name,
        obj.graph,
        obj.flow,
        package_sha,
        package_url,
        token,
        obj.metadata,
        obj.flow_datastore,
        obj.environment,
        obj.event_logger,
        obj.monitor,
        tags=tags,
        namespace=namespace,
        max_workers=max_workers,
        username=get_username(),
        workflow_timeout=workflow_timeout,
        workflow_priority=workflow_priority,
    )
def __call__(self):
    unit = ['B', 'KB', 'MB', 'GB', 'TB']
    sz = self._size
    pos = 0
    while pos < len(unit) and sz >= 1024:
        sz = sz // 1024
        pos += 1
    if pos >= 3:
        extra = '(this may take a while)'
    else:
        extra = ''
    self._logger('Including file %s of size %d%s %s'
                 % (self._path, sz, unit[pos], extra))
    if self._is_text:
        return io.open(self._path, mode='rt', encoding=self._encoding).read()
    try:
        return io.open(self._path, mode='rb').read()
    except IOError:
        # If we get an error here, since we know that the file exists
        # already, it means that read failed which happens with Python 2.7
        # for large files
        raise MetaflowException(
            'Cannot read file at %s -- this is likely because it is too '
            'large to be properly handled by Python 2.7' % self._path)
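# A standalone sketch of the unit-reduction loop above (the helper name is
# hypothetical); it walks the size down through B/KB/MB/GB/TB with integer
# division, clamping at TB so the index stays in range.
def _human_size(size):
    unit = ['B', 'KB', 'MB', 'GB', 'TB']
    pos = 0
    while pos < len(unit) - 1 and size >= 1024:
        size = size // 1024
        pos += 1
    return '%d%s' % (size, unit[pos])

print(_human_size(5300000))        # -> 5MB
print(_human_size(7 * 1024 ** 4))  # -> 7TB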
def _local_multinode_control_task_step_func(flow, env_to_use, step_func,
                                            retry_count):
    """
    Used as multinode UBF control task when run in local mode.
    """
    from metaflow import current
    from metaflow.cli_args import cli_args
    from metaflow.unbounded_foreach import UBF_TASK
    import subprocess

    assert flow._unbounded_foreach
    foreach_iter = flow._parallel_ubf_iter
    if foreach_iter.__class__.__name__ != "ParallelUBF":
        raise MetaflowException(
            "Expected ParallelUBFIter iterator object, got:"
            + foreach_iter.__class__.__name__)

    num_parallel = foreach_iter.num_parallel
    os.environ["MF_PARALLEL_NUM_NODES"] = str(num_parallel)
    os.environ["MF_PARALLEL_MAIN_IP"] = "127.0.0.1"

    run_id = current.run_id
    step_name = current.step_name
    control_task_id = current.task_id

    (_, split_step_name, split_task_id) = control_task_id.split("-")[1:]
    # UBF handling for multinode case
    top_task_id = control_task_id.replace("control-", "")  # chop "-0"
    mapper_task_ids = [control_task_id]
    # If we are running inside Conda, we use the base executable FIRST;
    # the conda environment will then be used when runtime_step_cli is
    # called. This is so that it can properly set up all the metaflow
    # aliases needed.
    executable = env_to_use.executable(step_name)
    script = sys.argv[0]

    # start workers
    subprocesses = []
    for node_index in range(1, num_parallel):
        task_id = "%s_node_%d" % (top_task_id, node_index)
        mapper_task_ids.append(task_id)
        os.environ["MF_PARALLEL_NODE_INDEX"] = str(node_index)
        input_paths = "%s/%s/%s" % (run_id, split_step_name, split_task_id)
        # Override specific `step` kwargs.
        kwargs = cli_args.step_kwargs
        kwargs["split_index"] = str(node_index)
        kwargs["run_id"] = run_id
        kwargs["task_id"] = task_id
        kwargs["input_paths"] = input_paths
        kwargs["ubf_context"] = UBF_TASK
        kwargs["retry_count"] = str(retry_count)

        cmd = cli_args.step_command(executable, script, step_name,
                                    step_kwargs=kwargs)
        p = subprocess.Popen(cmd)
        subprocesses.append(p)

    flow._control_mapper_tasks = [
        "%s/%s/%s" % (run_id, step_name, mapper_task_id)
        for mapper_task_id in mapper_task_ids
    ]
    flow._control_task_is_mapper_zero = True

    # run the step function ourselves
    os.environ["MF_PARALLEL_NODE_INDEX"] = "0"
    step_func()

    # join the subprocesses
    for p in subprocesses:
        p.wait()
        if p.returncode:
            raise Exception("Subprocess failed, return code {}".format(
                p.returncode))
def resolve_token(
    name, token_prefix, obj, authorize, given_token, generate_new_token, is_project
):
    # 1) retrieve the previous deployment, if one exists
    workflow = StepFunctions.get_existing_deployment(name)
    if workflow is None:
        obj.echo(
            "It seems this is the first time you are deploying *%s* to "
            "AWS Step Functions." % name
        )
        prev_token = None
    else:
        prev_user, prev_token = workflow

    # 2) authorize this deployment
    if prev_token is not None:
        if authorize is None:
            authorize = load_token(token_prefix)
        elif authorize.startswith("production:"):
            authorize = authorize[11:]

        # we allow the user who deployed the previous version to re-deploy,
        # even if they don't have the token
        if prev_user != get_username() and authorize != prev_token:
            obj.echo(
                "There is an existing version of *%s* on AWS Step "
                "Functions which was deployed by the user "
                "*%s*." % (name, prev_user)
            )
            obj.echo(
                "To deploy a new version of this flow, you need to use "
                "the same production token that they used. "
            )
            obj.echo(
                "Please reach out to them to get the token. Once you "
                "have it, call this command:"
            )
            obj.echo(" step-functions create --authorize MY_TOKEN", fg="green")
            obj.echo(
                'See "Organizing Results" at docs.metaflow.org for more '
                "information about production tokens."
            )
            raise IncorrectProductionToken(
                "Try again with the correct production token."
            )

    # 3) do we need a new token or should we use the existing token?
    if given_token:
        if is_project:
            # we rely on a known prefix for @project tokens, so we can't
            # allow the user to specify a custom token with an arbitrary
            # prefix
            raise MetaflowException(
                "--new-token is not supported for "
                "@projects. Use --generate-new-token to "
                "create a new token."
            )
        if given_token.startswith("production:"):
            given_token = given_token[11:]
        token = given_token
        obj.echo("")
        obj.echo("Using the given token, *%s*." % token)
    elif prev_token is None or generate_new_token:
        token = new_token(token_prefix, prev_token)
        if token is None:
            if prev_token is None:
                raise MetaflowInternalError(
                    "We could not generate a new token. This is unexpected. "
                )
            else:
                raise MetaflowException(
                    "--generate-new-token option is not "
                    "supported after using --new-token. "
                    "Use --new-token to make a new "
                    "namespace."
                )
        obj.echo("")
        obj.echo("A new production token generated.")
    else:
        token = prev_token

    obj.echo("")
    obj.echo("The namespace of this production flow is")
    obj.echo(" production:%s" % token, fg="green")
    obj.echo(
        "To analyze results of this production flow "
        "add this line in your notebooks:"
    )
    obj.echo(' namespace("production:%s")' % token, fg="green")
    obj.echo(
        "If you want to authorize other people to deploy new versions "
        "of this flow to AWS Step Functions, they need to call"
    )
    obj.echo(" step-functions create --authorize %s" % token, fg="green")
    obj.echo("when deploying this flow to AWS Step Functions for the first "
             "time.")
    obj.echo(
        'See "Organizing Results" at https://docs.metaflow.org/ for more '
        "information about production tokens."
    )
    obj.echo("")
    store_token(token_prefix, token)
    return token
def step_init(self, flow, graph, step, decos, environment, flow_datastore,
              logger):
    # Executing Kubernetes jobs requires a non-local datastore.
    if flow_datastore.TYPE != "s3":
        raise KubernetesException(
            "The *@kubernetes* decorator requires --datastore=s3 at the moment."
        )

    # Set internal state.
    self.logger = logger
    self.environment = environment
    self.step = step
    self.flow_datastore = flow_datastore

    if any([deco.name == "batch" for deco in decos]):
        raise MetaflowException(
            "Step *{step}* is marked for execution both on AWS Batch and "
            "Kubernetes. Please use one or the other.".format(step=step))

    for deco in decos:
        if getattr(deco, "IS_PARALLEL", False):
            raise KubernetesException(
                "@kubernetes does not support parallel execution currently."
            )

    # Set run time limit for the Kubernetes job.
    self.run_time_limit = get_run_time_limit_for_task(decos)
    if self.run_time_limit < 60:
        raise KubernetesException(
            "The timeout for step *{step}* should be at least 60 seconds for "
            "execution on Kubernetes.".format(step=step))

    for deco in decos:
        if isinstance(deco, ResourcesDecorator):
            for k, v in deco.attributes.items():
                # TODO: Special case GPUs when they are introduced in
                # @resources.
                if k in self.attributes:
                    if self.defaults[k] is None:
                        # skip if expected value isn't an int/float
                        continue
                    # We use the larger of @resources and @batch attributes
                    # TODO: Fix https://github.com/Netflix/metaflow/issues/467
                    my_val = self.attributes.get(k)
                    if not (my_val is None and v is None):
                        self.attributes[k] = str(
                            max(float(my_val or 0), float(v or 0)))

    # Check GPU vendor.
    if self.attributes["gpu_vendor"].lower() not in ("amd", "nvidia"):
        raise KubernetesException(
            "GPU vendor *{}* for step *{step}* is not currently supported."
            .format(self.attributes["gpu_vendor"], step=step))

    # CPU, Disk, and Memory values should be greater than 0.
    for attr in ["cpu", "disk", "memory"]:
        if not (isinstance(self.attributes[attr],
                           (int, unicode, basestring, float))
                and float(self.attributes[attr]) > 0):
            raise KubernetesException(
                "Invalid {} value *{}* for step *{step}*; it should be "
                "greater than 0".format(attr, self.attributes[attr],
                                        step=step))

    if self.attributes["gpu"] is not None and not (
        isinstance(self.attributes["gpu"], (int, unicode, basestring))
        and float(self.attributes["gpu"]).is_integer()
    ):
        raise KubernetesException(
            "Invalid GPU value *{}* for step *{step}*; it should be an integer"
            .format(self.attributes["gpu"], step=step))
def _visit(self, node, tasks=[], nested_dags=[], exit_node=None):
    """
    Traverse graph nodes.
    Special treatment of split and foreach subgraphs
    """
    def _linear_or_first_dag_task(node):
        if self._is_foreach_first_child(node):
            return dag_first_task(node)
        elif node.name == 'start':
            return start_task()
        else:
            return linear_task(node)

    if node.type == 'end':
        tasks.append(linear_task(node))

    elif node == exit_node:
        pass  # end recursion

    elif node.type in ('linear', 'join'):
        tasks.append(_linear_or_first_dag_task(node))
        tasks, nested_dags = self._visit(self.graph[node.out_funcs[0]],
                                         tasks, nested_dags, exit_node)

    elif node.type == 'split-and':
        tasks.append(_linear_or_first_dag_task(node))
        join = self.graph[node.matching_join]
        # traverse branches
        for out in node.out_funcs:
            tasks, nested_dags = self._visit(self.graph[out], tasks,
                                             nested_dags, exit_node=join)
        # finally continue with join node
        tasks, nested_dags = self._visit(join, tasks, nested_dags, exit_node)

    elif node.type == 'foreach':
        tasks.append(_linear_or_first_dag_task(node))
        for_each = foreach_task(node)
        tasks.append(for_each)
        join = self.graph[node.matching_join]
        # create nested dag and add tasks for foreach block
        nested_tasks, nested_dags = self._visit(
            self.graph[node.out_funcs[0]], tasks=[],
            nested_dags=nested_dags, exit_node=join)
        nested_dags.append(nested_dag(for_each['name'], nested_tasks))
        # join ends the foreach block
        tasks.append(join_foreach_task(join, parent_task=for_each))
        # continue with node after join
        tasks, nested_dags = self._visit(self.graph[join.out_funcs[0]],
                                         tasks, nested_dags, exit_node)

    else:
        raise MetaflowException(
            'Undefined node type: {} in step: {}'.format(
                node.type, node.name))

    return tasks, nested_dags