def _step_cli(self, node, paths, code_package_url, user_code_retries):
    cmds = []

    script_name = os.path.basename(sys.argv[0])
    executable = self.environment.executable(node.name)

    if R.use_r():
        entrypoint = [R.entrypoint()]
    else:
        entrypoint = [executable, script_name]

    # Use the AWS Batch job identifier as the globally unique task identifier.
    task_id = "${AWS_BATCH_JOB_ID}"

    # FlowDecorators can define their own top-level options. They are
    # responsible for adding their own top-level options and values through
    # the get_top_level_options() hook. See similar logic in runtime.py.
    top_opts_dict = {}
    for deco in flow_decorators():
        top_opts_dict.update(deco.get_top_level_options())
    top_opts = list(dict_to_cli_options(top_opts_dict))

    if node.name == "start":
        # We need a separate unique ID for the special _parameters task.
        task_id_params = "%s-params" % task_id
        # Export user-defined parameters into the runtime environment.
        param_file = "".join(
            random.choice(string.ascii_lowercase) for _ in range(10)
        )
        export_params = (
            "python -m "
            "metaflow.plugins.aws.step_functions.set_batch_environment "
            "parameters %s && . `pwd`/%s" % (param_file, param_file)
        )
        params = entrypoint + top_opts + [
            "--quiet",
            "--metadata=%s" % self.metadata.TYPE,
            "--environment=%s" % self.environment.TYPE,
            "--datastore=s3",
            "--event-logger=%s" % self.event_logger.logger_type,
            "--monitor=%s" % self.monitor.monitor_type,
            "--no-pylint",
            "init",
            "--run-id sfn-$METAFLOW_RUN_ID",
            "--task-id %s" % task_id_params,
        ]

        # Assign tags to run objects.
        if self.tags:
            params.extend("--tag %s" % tag for tag in self.tags)

        # If the start step gets retried, we must be careful not to
        # regenerate multiple parameters tasks. Hence we check first if
        # _parameters exists already.
        exists = entrypoint + [
            "dump",
            "--max-value-size=0",
            "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % task_id_params,
        ]
        cmd = "if ! %s >/dev/null 2>/dev/null; then %s && %s; fi" % (
            " ".join(exists),
            export_params,
            " ".join(params),
        )
        cmds.append(cmd)
        paths = "sfn-${METAFLOW_RUN_ID}/_parameters/%s" % task_id_params

    if node.type == "join" and self.graph[node.split_parents[-1]].type == "foreach":
        parent_tasks_file = "".join(
            random.choice(string.ascii_lowercase) for _ in range(10)
        )
        export_parent_tasks = (
            "python -m "
            "metaflow.plugins.aws.step_functions.set_batch_environment "
            "parent_tasks %s && . `pwd`/%s"
            % (parent_tasks_file, parent_tasks_file)
        )
        cmds.append(export_parent_tasks)

    top_level = top_opts + [
        "--quiet",
        "--metadata=%s" % self.metadata.TYPE,
        "--environment=%s" % self.environment.TYPE,
        "--datastore=%s" % self.flow_datastore.TYPE,
        "--datastore-root=%s" % self.flow_datastore.datastore_root,
        "--event-logger=%s" % self.event_logger.logger_type,
        "--monitor=%s" % self.monitor.monitor_type,
        "--no-pylint",
        "--with=step_functions_internal",
    ]

    step = [
        "step",
        node.name,
        "--run-id sfn-$METAFLOW_RUN_ID",
        "--task-id %s" % task_id,
        # Since retries are handled by AWS Batch, we can rely on
        # AWS_BATCH_JOB_ATTEMPT as the job counter.
        "--retry-count $((AWS_BATCH_JOB_ATTEMPT-1))",
        "--max-user-code-retries %d" % user_code_retries,
        "--input-paths %s" % paths,
        # Run with the batch decorator so that its `task_*` hooks execute.
        "--with=batch",
    ]
    if any(self.graph[n].type == "foreach" for n in node.in_funcs):
        # Set `METAFLOW_SPLIT_INDEX` through JSONPath to pass the split
        # index along from the parent DynamoDB state for foreach splits.
        step.append("--split-index $METAFLOW_SPLIT_INDEX")
    if self.tags:
        step.extend("--tag %s" % tag for tag in self.tags)
    if self.namespace is not None:
        step.append("--namespace=%s" % self.namespace)
    cmds.append(" ".join(entrypoint + top_level + step))
    return " && ".join(cmds)

def _set_constants(self, graph, kwargs):
    from metaflow.decorators import (
        flow_decorators,
    )  # To prevent circular dependency

    # Persist values for parameters and other constants (class-level
    # variables) only once. This method is called before persist_constants,
    # which persists all values set here using setattr.
    seen = set()
    for var, param in self._get_parameters():
        norm = param.name.lower()
        if norm in seen:
            raise MetaflowException(
                "Parameter *%s* is specified twice. "
                "Note that parameter names are "
                "case-insensitive." % param.name
            )
        seen.add(norm)
    seen.clear()

    self._success = True

    parameters_info = []
    for var, param in self._get_parameters():
        seen.add(var)
        val = kwargs[param.name.replace("-", "_").lower()]
        # Support for delayed evaluation of parameters. This is used for
        # IncludeFile in particular.
        if callable(val):
            val = val()
        val = val.split(param.separator) if val and param.separator else val
        setattr(self, var, val)
        parameters_info.append({"name": var, "type": param.__class__.__name__})

    # Do the same for class variables, which are forced constant: modifications
    # to them don't propagate well, since we create a new process for each step
    # and re-read the flow file.
    constants_info = []
    for var in dir(self.__class__):
        if var[0] == "_" or var in self._NON_PARAMETERS or var in seen:
            continue
        val = getattr(self.__class__, var)
        if isinstance(val, (MethodType, FunctionType, property, type)):
            continue
        constants_info.append({"name": var, "type": type(val).__name__})
        setattr(self, var, val)

    # We store the DAG information as an artifact called _graph_info.
    steps_info, graph_structure = graph.output_steps()

    graph_info = {
        "file": os.path.basename(os.path.abspath(sys.argv[0])),
        "parameters": parameters_info,
        "constants": constants_info,
        "steps": steps_info,
        "graph_structure": graph_structure,
        "doc": graph.doc,
        "decorators": [
            {
                "name": deco.name,
                "attributes": deco.attributes,
                "statically_defined": deco.statically_defined,
            }
            for deco in flow_decorators()
            if not deco.name.startswith("_")
        ],
    }
    self._graph_info = graph_info
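
# Illustration only (hypothetical flow, not part of the original module): a
# minimal sketch of what `_set_constants` records. Given a flow such as
#
#   class MyFlow(FlowSpec):
#       alpha = Parameter("alpha", default=0.5)  # user-defined parameter
#       THRESHOLD = 10                           # plain class variable
#
# the method would set self.alpha from the CLI kwargs, treat THRESHOLD as a
# forced constant, and persist roughly the following (besides steps, graph
# structure, doc, and decorators) in the _graph_info artifact:
#
#   {"parameters": [{"name": "alpha", "type": "Parameter"}],
#    "constants":  [{"name": "THRESHOLD", "type": "int"}], ...}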