def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
    import pyspark as _pyspark

    ctx = FlyteContextManager.current_context()
    sess_builder = _pyspark.sql.SparkSession.builder.appName(f"FlyteSpark: {user_params.execution_id}")
    if not (ctx.execution_state and ctx.execution_state.mode == ExecutionState.Mode.TASK_EXECUTION):
        # If either of the above conditions does not hold, we are in a local execution of this task.
        # Add system spark-conf for local/notebook based execution.
        spark_conf = _pyspark.SparkConf()
        for k, v in self.task_config.spark_conf.items():
            spark_conf.set(k, v)
        # In local execution, propagate PYTHONPATH to executors too. This makes the spark
        # execution hermetic to the execution environment. For example, it allows running
        # Spark applications using Bazel, without major changes.
        if "PYTHONPATH" in os.environ:
            spark_conf.setExecutorEnv("PYTHONPATH", os.environ["PYTHONPATH"])
        sess_builder = sess_builder.config(conf=spark_conf)

    self.sess = sess_builder.getOrCreate()
    return user_params.builder().add_attr("SPARK_SESSION", self.sess).build()
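# A minimal usage sketch, not part of the source above: once pre_execute has run, the
# "SPARK_SESSION" attr it injects is reachable from task code through
# flytekit.current_context(). The task below is hypothetical; the spark_conf value is
# an arbitrary example.
import random

import flytekit
from flytekit import task
from flytekitplugins.spark import Spark


@task(task_config=Spark(spark_conf={"spark.driver.memory": "1g"}))
def estimate_pi(partitions: int) -> float:
    # The session created in pre_execute, exposed via ExecutionParameters.
    sess = flytekit.current_context().spark_session
    n = 100000 * partitions

    def inside(_) -> int:
        x, y = random.random(), random.random()
        return 1 if x * x + y * y <= 1 else 0

    count = sess.sparkContext.parallelize(range(n), partitions).map(inside).sum()
    return 4.0 * count / n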
def wrapper(*args, **kwargs):
    # Get the current flyte context to obtain access to the compilation state of the workflow DAG.
    ctx = FlyteContextManager.current_context()

    # Define the before node.
    before_node = create_node(before)
    # ctx.compilation_state.nodes == [before_node]

    # Under the hood, the flytekit compiler defines and threads
    # together nodes within the `my_workflow` function body.
    outputs = fn(*args, **kwargs)
    # ctx.compilation_state.nodes == [before_node, *nodes_created_by_fn]

    # Define the after node.
    after_node = create_node(after)
    # ctx.compilation_state.nodes == [before_node, *nodes_created_by_fn, after_node]

    # Compile the workflow correctly by making sure `before_node`
    # runs before the first workflow node and `after_node`
    # runs after the last workflow node.
    if ctx.compilation_state is not None:
        # ctx.compilation_state.nodes is a list of nodes defined in the
        # order of execution above.
        workflow_node0 = ctx.compilation_state.nodes[1]
        workflow_node1 = ctx.compilation_state.nodes[-2]
        before_node >> workflow_node0
        workflow_node1 >> after_node
    return outputs
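# A self-contained sketch of how a wrapper like the one above is applied to a
# workflow. The decorator, task, and workflow names (setup_teardown, before, after,
# t1, my_workflow) are hypothetical stand-ins for whatever surrounds `wrapper` in
# the original; the wrapper body is the logic above, condensed.
import functools

from flytekit import task, workflow
from flytekit.core.context_manager import FlyteContextManager
from flytekit.core.node_creation import create_node


@task
def before():
    print("setting up")


@task
def after():
    print("tearing down")


@task
def t1(a: int) -> int:
    return a + 1


def setup_teardown(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        ctx = FlyteContextManager.current_context()
        before_node = create_node(before)
        outputs = fn(*args, **kwargs)
        after_node = create_node(after)
        if ctx.compilation_state is not None:
            before_node >> ctx.compilation_state.nodes[1]
            ctx.compilation_state.nodes[-2] >> after_node
        return outputs

    return wrapper


@workflow
@setup_teardown
def my_workflow(x: int) -> int:
    return t1(a=x)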
def test_create_native_named_tuple():
    ctx = FlyteContextManager.current_context()
    t = create_native_named_tuple(ctx, promises=None, entity_interface=Interface())
    assert t is None

    p1 = Promise(var="x", val=TypeEngine.to_literal(ctx, 1, int, LiteralType(simple=SimpleType.INTEGER)))
    p2 = Promise(var="y", val=TypeEngine.to_literal(ctx, 2, int, LiteralType(simple=SimpleType.INTEGER)))

    # A single promise collapses to the bare value rather than a 1-tuple.
    t = create_native_named_tuple(ctx, promises=p1, entity_interface=Interface(outputs={"x": int}))
    assert t
    assert t == 1

    t = create_native_named_tuple(ctx, promises=[], entity_interface=Interface())
    assert t is None

    t = create_native_named_tuple(ctx, promises=[p1, p2], entity_interface=Interface(outputs={"x": int, "y": int}))
    assert t
    assert t == (1, 2)

    # output_tuple_name carries through to the generated named tuple class.
    t = create_native_named_tuple(
        ctx,
        promises=[p1, p2],
        entity_interface=Interface(outputs={"x": int, "y": int}, output_tuple_name="Tup"),
    )
    assert t
    assert t == (1, 2)
    assert t.__class__.__name__ == "Tup"

    # Declaring fewer outputs than promises should fail.
    with pytest.raises(KeyError):
        create_native_named_tuple(
            ctx, promises=[p1, p2], entity_interface=Interface(outputs={"x": int}, output_tuple_name="Tup")
        )
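# For context, a sketch of the user-facing convention this test exercises: a task
# declared with a typing.NamedTuple return type names its outputs, and the tuple's
# class name is what flows into output_tuple_name. The task here is hypothetical.
import typing

from flytekit import task

Tup = typing.NamedTuple("Tup", [("x", int), ("y", int)])


@task
def two_outputs() -> Tup:
    return Tup(x=1, y=2)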
def test_deck_in_jupyter(mock_ipython_check):
    mock_ipython_check.return_value = True

    ctx = FlyteContextManager.current_context()
    ctx.user_space_params._decks = [ctx.user_space_params.default_deck]
    _output_deck("test_task", ctx.user_space_params)

    @task()
    def t1(a: int) -> str:
        return str(a)

    with flytekit.new_context() as ctx:
        t1(a=3)
        deck = ctx.get_deck()
        assert deck is not None
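# A minimal sketch of the user-facing side of what this test checks (the task is
# hypothetical, and assumes decks are enabled for the task): append HTML to the
# default deck inside a task; when the IPython check above returns True,
# _output_deck renders it inline in the notebook instead of writing it out.
import flytekit
from flytekit import task


@task
def greet(name: str) -> str:
    # Append raw HTML to this execution's default deck.
    flytekit.current_context().default_deck.append(f"<h3>hello {name}</h3>")
    return name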
def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
    """
    Pre-execute for Sagemaker will automatically add the distributed context to the execution params, but only
    if the number of execution instances is > 1. Otherwise this is considered to be a single-node execution.
    """
    if self._is_distributed():
        logger.info("Distributed context detected!")
        exec_state = FlyteContextManager.current_context().execution_state
        if exec_state and exec_state.mode == ExecutionState.Mode.TASK_EXECUTION:
            """
            This mode indicates we are actually in a remote execution environment (within Sagemaker in this case)
            """
            dist_ctx = DistributedTrainingContext.from_env()
        else:
            dist_ctx = DistributedTrainingContext.local_execute()
        return user_params.builder().add_attr("DISTRIBUTED_TRAINING_CONTEXT", dist_ctx).build()

    return user_params
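# A hedged sketch of consuming the attr added above from inside task code at
# runtime. Attribute lookup on ExecutionParameters is case-insensitive, so the
# "DISTRIBUTED_TRAINING_CONTEXT" attr set in pre_execute resolves here; the field
# names on the context (hosts, current_host) are assumptions based on the
# Sagemaker SM_HOSTS / SM_CURRENT_HOST environment variables it is built from.
import flytekit


def rank_of_current_host() -> int:
    dist_ctx = flytekit.current_context().distributed_training_context
    return dist_ctx.hosts.index(dist_ctx.current_host)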
def test_deck():
    df = pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [1, 22]})
    ctx = FlyteContextManager.current_context()
    ctx.user_space_params._decks = [ctx.user_space_params.default_deck]
    renderer = TopFrameRenderer()
    deck_name = "test"
    deck = Deck(deck_name)
    deck.append(renderer.to_html(df))
    assert deck.name == deck_name
    assert deck.html is not None
    assert len(ctx.user_space_params.decks) == 2

    _output_deck("test_task", ctx.user_space_params)

    @task()
    def t1(a: int) -> str:
        return str(a)

    t1(a=3)
    assert len(ctx.user_space_params.decks) == 2  # input, output decks
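# The same renderer used from a task, as a short sketch (the task is hypothetical):
# instantiating a Deck registers it on the current execution's user-space params,
# which is why the deck count in the test above goes up.
import pandas as pd

from flytekit import task
from flytekit.deck.deck import Deck
from flytekit.deck.renderer import TopFrameRenderer


@task
def show_ages() -> None:
    df = pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [1, 22]})
    Deck("ages", TopFrameRenderer().to_html(df))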
def dispatch_execute(
    self, ctx: FlyteContext, input_literal_map: _literal_models.LiteralMap
) -> Union[_literal_models.LiteralMap, _dynamic_job.DynamicJobSpec]:
    """
    This function is mostly copied from the base PythonTask, but differs in that we have to infer the Python
    interface before executing. Also, we refer to ``self.task_template`` rather than just ``self`` like in task
    classes that derive from the base ``PythonTask``.
    """
    # Invoked before the task is executed
    new_user_params = self.pre_execute(ctx.user_space_params)

    # Create another execution context with the new user params, but let's keep the same working dir
    with FlyteContextManager.with_context(
        ctx.with_execution_state(ctx.execution_state.with_params(user_space_params=new_user_params))
    ) as exec_ctx:
        # Added: Have to reverse the Python interface from the task template Flyte interface
        # See docstring for more details.
        guessed_python_input_types = TypeEngine.guess_python_types(self.task_template.interface.inputs)
        native_inputs = TypeEngine.literal_map_to_kwargs(exec_ctx, input_literal_map, guessed_python_input_types)

        logger.info(f"Invoking FlyteTask executor {self.task_template.id.name} with inputs: {native_inputs}")
        try:
            native_outputs = self.execute(**native_inputs)
        except Exception as e:
            logger.exception(f"Exception when executing {e}")
            raise e

        logger.info(f"Task executed successfully in user level, outputs: {native_outputs}")
        # Let's run the post_execute method. This may result in an IgnoreOutputs exception, which is
        # bubbled up to be handled at the callee layer.
        native_outputs = self.post_execute(new_user_params, native_outputs)

        # Short circuit the translation to literal map because what's returned may be a dj spec (or an
        # already-constructed LiteralMap if the dynamic task was a no-op), not python native values.
        if isinstance(native_outputs, (_literal_models.LiteralMap, _dynamic_job.DynamicJobSpec)):
            return native_outputs

        expected_output_names = list(self.task_template.interface.outputs.keys())
        if len(expected_output_names) == 1:
            # Here we have to handle the fact that the task could've been declared with a typing.NamedTuple of
            # length one. That convention is used for naming outputs - and single-length-NamedTuples are
            # particularly troublesome, but elegant handling of them is not a high priority.
            # Again, we're using the output_tuple_name as a proxy.
            # Deleted some stuff
            native_outputs_as_map = {expected_output_names[0]: native_outputs}
        elif len(expected_output_names) == 0:
            native_outputs_as_map = {}
        else:
            native_outputs_as_map = {
                expected_output_names[i]: native_outputs[i] for i, _ in enumerate(native_outputs)
            }

        # We manually construct a LiteralMap here because task inputs and outputs actually violate the assumption
        # built into the IDL that all the values of a literal map are of the same type.
        literals = {}
        for k, v in native_outputs_as_map.items():
            literal_type = self.task_template.interface.outputs[k].type
            py_type = type(v)

            if isinstance(v, tuple):
                raise AssertionError(
                    f"Output({k}) in task {self.task_template.id.name} received a tuple {v}, instead of {py_type}"
                )
            try:
                literals[k] = TypeEngine.to_literal(exec_ctx, v, py_type, literal_type)
            except Exception as e:
                raise AssertionError(f"failed to convert return value for var {k}") from e

        outputs_literal_map = _literal_models.LiteralMap(literals=literals)
        # After the execute has been successfully completed
        return outputs_literal_map
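# A standalone sketch of the literal conversion dispatch_execute performs in both
# directions, using the public TypeEngine API (the value 3 is an arbitrary example):
from flytekit.core.context_manager import FlyteContextManager
from flytekit.core.type_engine import TypeEngine

ctx = FlyteContextManager.current_context()
lt = TypeEngine.to_literal_type(int)          # python type  -> LiteralType
lit = TypeEngine.to_literal(ctx, 3, int, lt)  # python value -> Literal
assert TypeEngine.to_python_value(ctx, lit, int) == 3  # and back again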
import pandas as pd
import pyarrow as pa

from flytekit import kwtypes
from flytekit.core.context_manager import FlyteContext, FlyteContextManager
from flytekit.models import literals
from flytekit.models.literals import StructuredDatasetMetadata
from flytekit.models.types import StructuredDatasetType
from flytekit.types.structured.structured_dataset import (
    BIGQUERY,
    DF,
    LOCAL,
    PARQUET,
    S3,
    StructuredDataset,
    StructuredDatasetDecoder,
    StructuredDatasetEncoder,
    StructuredDatasetTransformerEngine,
)

PANDAS_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory()
NUMPY_PATH = FlyteContextManager.current_context().file_access.get_random_local_directory()
BQ_PATH = "bq://flyte-dataset:flyte.table"

my_cols = kwtypes(Name=str, Age=int)
fields = [("Name", pa.string()), ("Age", pa.int32())]
arrow_schema = pa.schema(fields)
pd_df = pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]})


class MockBQEncodingHandlers(StructuredDatasetEncoder):
    def __init__(self):
        super().__init__(pd.DataFrame, BIGQUERY, "")

    def encode(
        self,
        ctx: FlyteContext,
        structured_dataset: StructuredDataset,
        structured_dataset_type: StructuredDatasetType,
    ) -> literals.StructuredDataset:
        # The original excerpt is truncated at the signature; the parameters and
        # return type above are completed from the StructuredDatasetEncoder base
        # class, and the body is elided.
        ...