def apply(
    self,
    objects: Union[
        Entity,
        FeatureView,
        FeatureService,
        List[Union[FeatureView, Entity, FeatureService]],
    ],
) -> None:
    """
    Apply ``objects`` to the Feast feature store and upload the online store file.

    :param objects: a single entity / feature view / feature service, or a list of them,
        handed unchanged to the feature store's ``apply``.
    """
    feature_store = self._build_feast_feature_store()
    # Applying also initializes the sqlite tables in the online store
    feature_store.apply(objects)

    # Push the local online store file to its location in the configured bucket.
    file_access = FlyteContext.current_context().file_access
    local_store_path = self.config.online_store_path
    remote_store_path = f"s3://{self.config.s3_bucket}/{self.config.online_store_path}"
    file_access.upload(local_store_path, remote_store_path)
def all(self, **kwargs) -> pyspark.sql.DataFrame:
    """
    Read the whole schema as a Spark dataframe.

    Only the Parquet format is handled; any other format raises ``AssertionError``.
    The Spark session is taken from the current context's user-space parameters.
    """
    if self._fmt != SchemaFormat.PARQUET:
        raise AssertionError(
            "Only Parquet type files are supported for spark dataframe currently"
        )

    user_params = FlyteContext.current_context().user_space_params
    reader = user_params.spark_session.read
    return reader.parquet(self.from_path)
def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
    """
    Create (or reuse) the Spark session and expose it to the task as ``SPARK_SESSION``.

    When the task is not running in ``TASK_EXECUTION`` mode (i.e. local / notebook
    execution), the task's ``spark_conf`` entries are applied to the session builder and
    ``PYTHONPATH`` is propagated to the executors.

    :param user_params: execution parameters of the task about to run.
    :return: a copy of ``user_params`` with the ``SPARK_SESSION`` attribute added.
    """
    import pyspark as _pyspark

    ctx = FlyteContext.current_context()
    sess_builder = _pyspark.sql.SparkSession.builder.appName(
        f"FlyteSpark: {user_params.execution_id}"
    )

    # Compare the *instance* attribute ``mode``. ``Mode`` is the enum class itself, which
    # never equals one of its members, so the previous check always took the local branch.
    in_task_execution = (
        ctx.execution_state
        and ctx.execution_state.mode == ExecutionState.Mode.TASK_EXECUTION
    )
    if not in_task_execution:
        # If either of above cases is not true, then we are in local execution of this task
        # Add system spark-conf for local/notebook based execution.
        spark_conf = _pyspark.SparkConf()
        for k, v in self.task_config.spark_conf.items():
            spark_conf.set(k, v)
        # In local execution, propagate PYTHONPATH to executors too. This makes the spark
        # execution hermetic to the execution environment. For example, it allows running
        # Spark applications using Bazel, without major changes.
        if "PYTHONPATH" in os.environ:
            spark_conf.setExecutorEnv("PYTHONPATH", os.environ["PYTHONPATH"])
        sess_builder = sess_builder.config(conf=spark_conf)

    self.sess = sess_builder.getOrCreate()
    return user_params.builder().add_attr("SPARK_SESSION", self.sess).build()
def get_online_features(
    self,
    features: Union[List[str], FeatureService],
    entity_rows: List[Dict[str, Any]],
    feature_refs: Optional[List[str]] = None,
    full_feature_names: bool = False,
) -> Dict[str, Any]:
    """
    Download the online store file and query it for online features.

    All four arguments are forwarded positionally, in order, to the feature store's
    ``get_online_features``; the response is returned via its ``to_dict()``.
    """
    # Fetch the online store file from the configured bucket to its local path.
    file_access = FlyteContext.current_context().file_access
    remote_store_path = f"s3://{self.config.s3_bucket}/{self.config.online_store_path}"
    file_access.download(remote_store_path, self.config.online_store_path)

    feature_store = self._build_feast_feature_store()
    response = feature_store.get_online_features(
        features, entity_rows, feature_refs, full_feature_names
    )
    return response.to_dict()
def materialize(
    self,
    start_date: datetime,
    end_date: datetime,
    feature_views: Optional[List[str]] = None,
) -> None:
    """
    Materialize features into the online store and upload the updated store file.

    The online store file is downloaded first, the feature store materializes into it,
    and the result is uploaded back to the same bucket location.

    :param start_date: start of the time range to materialize.
    :param end_date: end of the time range to materialize.
    :param feature_views: optional names of the feature views to materialize. ``None``
        leaves the selection to the feature store's default.
    """
    remote_store_path = f"s3://{self.config.s3_bucket}/{self.config.online_store_path}"
    FlyteContext.current_context().file_access.download(
        remote_store_path,
        self.config.online_store_path,
    )

    fs = self._build_feast_feature_store()
    fs.materialize(
        start_date=start_date,
        end_date=end_date,
        # Previously dropped: the caller's selection was silently ignored.
        feature_views=feature_views,
    )

    FlyteContext.current_context().file_access.upload(
        self.config.online_store_path,
        remote_store_path,
    )
def __init__(self, base_dir: str = None):
    """
    Set up the base directory and the list of files found in it.

    :param base_dir: directory to index. When given, ``_files`` holds the joined path of
        every entry returned by ``os.listdir``. When ``None``, the working directory of
        the current context's user-space parameters is used and ``_files`` starts empty.
    """
    if base_dir is not None:
        self._base_dir = base_dir
        self._files = [os.path.join(base_dir, entry) for entry in os.listdir(base_dir)]
        return

    self._base_dir = FlyteContext.current_context().user_space_params.working_directory
    self._files = []
def test_parameter_ranges_transformer():
    """Round-trip a ``ParameterRangeOneOf`` through ``ParameterRangesTransformer``."""
    transformer = ParameterRangesTransformer()
    expected_literal_type = Generic.to_flyte_literal_type()
    assert transformer.get_literal_type(ParameterRangeOneOf) == expected_literal_type

    original = ParameterRangeOneOf(param=IntegerParameterRange(10, 0, 1))
    ctx = FlyteContext.current_context()

    # Python value -> literal
    literal = transformer.to_literal(
        ctx, python_val=original, python_type=ParameterRangeOneOf, expected=None
    )
    assert literal is not None
    assert literal.scalar.generic is not None

    # literal -> Python value, which must equal what we started with
    restored = transformer.to_python_value(ctx, literal, ParameterRangeOneOf)
    assert restored is not None
    assert restored == original
def execute(self, **kwargs) -> typing.Any:
    """
    Download the configured SQLite database and run this task's query against it.

    The database is fetched into a temporary directory (and unarchived there when the
    task config marks it as compressed). The result of ``pd.read_sql_query`` for
    ``self.get_query(**kwargs)`` is returned.
    """
    config = self.task_config
    with tempfile.TemporaryDirectory() as workdir:
        ctx = FlyteContext.current_context()
        # Keep the remote file name for the local copy.
        db_path = os.path.join(workdir, os.path.basename(config.uri))
        ctx.file_access.download(config.uri, db_path)
        if config.compressed:
            db_path = unarchive_file(db_path, workdir)

        print(f"Connecting to db {db_path}")
        with contextlib.closing(sqlite3.connect(db_path)) as connection:
            return pd.read_sql_query(self.get_query(**kwargs), connection)
def decode(
    self,
    ctx: FlyteContext,
    flyte_value: literals.StructuredDataset,
    current_task_metadata: StructuredDatasetMetadata,
) -> DataFrame:
    """
    Read the structured dataset at ``flyte_value.uri`` as a Spark dataframe.

    When the task metadata declares columns, only those columns are selected. The Spark
    session comes from the current context's user-space parameters; the ``ctx`` argument
    is not consulted.
    """
    user_ctx = FlyteContext.current_context().user_space_params
    dataset_type = current_task_metadata.structured_dataset_type

    if not (dataset_type and dataset_type.columns):
        return user_ctx.spark_session.read.parquet(flyte_value.uri)

    column_names = [
        column.name
        for column in current_task_metadata.structured_dataset_type.columns
    ]
    full_frame = user_ctx.spark_session.read.parquet(flyte_value.uri)
    return full_frame.select(*column_names)
def execute_from_model(self, tt: task_models.TaskTemplate, **kwargs) -> typing.Any:
    """
    Run the query described by a task template against its SQLite database.

    Reads ``uri``, ``compressed`` and ``query_template`` from ``tt.custom``, downloads
    (and optionally unarchives) the database into a temporary directory, interpolates the
    query with ``kwargs`` and returns the result of ``pd.read_sql_query``.
    """
    custom = tt.custom
    with tempfile.TemporaryDirectory() as workdir:
        ctx = FlyteContext.current_context()
        # Keep the remote file name for the local copy.
        db_path = os.path.join(workdir, os.path.basename(custom["uri"]))
        ctx.file_access.download(custom["uri"], db_path)
        if custom["compressed"]:
            db_path = unarchive_file(db_path, workdir)

        print(f"Connecting to db {db_path}")
        interpolated_query = SQLite3Task.interpolate_query(
            custom["query_template"], **kwargs
        )
        print(f"Interpolated query {interpolated_query}")
        with contextlib.closing(sqlite3.connect(db_path)) as connection:
            return pd.read_sql_query(interpolated_query, connection)
def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
    """
    Create (or reuse) the Spark session and expose it to the task as ``SPARK_SESSION``.

    When the task is not running in ``TASK_EXECUTION`` mode (i.e. local / notebook
    execution), the task's ``spark_conf`` entries plus ``spark.master=local`` are applied
    to the session builder.

    :param user_params: execution parameters of the task about to run.
    :return: a copy of ``user_params`` with the ``SPARK_SESSION`` attribute added.
    """
    import pyspark as _pyspark

    ctx = FlyteContext.current_context()
    sess_builder = _pyspark.sql.SparkSession.builder.appName(
        f"FlyteSpark: {user_params.execution_id}"
    )

    # Compare the *instance* attribute ``mode``. ``Mode`` is the enum class itself, which
    # never equals one of its members, so the previous check always took the local branch.
    in_task_execution = (
        ctx.execution_state
        and ctx.execution_state.mode == ExecutionState.Mode.TASK_EXECUTION
    )
    if not in_task_execution:
        # If either of above cases is not true, then we are in local execution of this task
        # Add system spark-conf for local/notebook based execution.
        spark_conf = _pyspark.SparkConf()
        for k, v in self.task_config.spark_conf.items():
            spark_conf.set(k, v)
        # Set last so local execution always runs against a local master.
        spark_conf.set("spark.master", "local")
        # The conf used to be built and then discarded; hand it to the builder so it
        # actually takes effect.
        sess_builder = sess_builder.config(conf=spark_conf)

    sess = sess_builder.getOrCreate()
    return user_params.builder().add_attr("SPARK_SESSION", sess).build()
def test_hpoconfig_transformer():
    """Round-trip a ``HyperparameterTuningJobConfig`` through its transformer."""
    transformer = HPOTuningJobConfigTransformer()
    expected_literal_type = Generic.to_flyte_literal_type()
    assert transformer.get_literal_type(HyperparameterTuningJobConfig) == expected_literal_type

    objective = HyperparameterTuningObjective(
        objective_type=HyperparameterTuningObjectiveType.MINIMIZE,
        metric_name="x",
    )
    original = HyperparameterTuningJobConfig(
        tuning_strategy=1,
        tuning_objective=objective,
        training_job_early_stopping_type=TrainingJobEarlyStoppingType.OFF,
    )
    ctx = FlyteContext.current_context()

    # Python value -> literal
    literal = transformer.to_literal(
        ctx, python_val=original, python_type=HyperparameterTuningJobConfig, expected=None
    )
    assert literal is not None
    assert literal.scalar.generic is not None

    # literal -> Python value, which must equal what we started with
    restored = transformer.to_python_value(ctx, literal, HyperparameterTuningJobConfig)
    assert restored is not None
    assert restored == original
def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
    """
    Attach a distributed training context to the execution parameters when needed.

    If ``self._is_distributed()`` is false the parameters are returned untouched.
    Otherwise a ``DISTRIBUTED_TRAINING_CONTEXT`` attribute is added, built from the
    environment in ``TASK_EXECUTION`` mode and from the local-execution factory in any
    other case.
    """
    if not self._is_distributed():
        # Single node execution: nothing to add.
        return user_params

    logging.info("Distributed context detected!")
    exec_state = FlyteContext.current_context().execution_state
    if exec_state and exec_state.mode == ExecutionState.Mode.TASK_EXECUTION:
        # This mode indicates we are actually in a remote execute environment
        # (within sagemaker in this case)
        dist_ctx = DistributedTrainingContext.from_env()
    else:
        dist_ctx = DistributedTrainingContext.local_execute()

    builder = user_params.builder()
    return builder.add_attr("DISTRIBUTED_TRAINING_CONTEXT", dist_ctx).build()
def record_outputs(**kwargs) -> str:
    """
    Use this method to record outputs from a notebook.

    Every keyword argument is converted to a Flyte literal using the literal type that
    ``TypeEngine`` derives from the value's Python type, and the resulting map is
    returned in its Flyte IDL form. For Files, Directories, please use FlyteFile or
    FlyteDirectory, or wrap up your paths in these decorators.

    NOTE(review): ``**kwargs`` is always a dict, so the ``None`` guard below does not fire
    on ordinary calls, and the value returned on the main path is whatever
    ``LiteralMap.to_flyte_idl()`` produces rather than a ``str`` - confirm the intended
    contract before changing either.
    """
    if kwargs is None:
        return ""

    ctx = FlyteContext.current_context()
    literal_by_name = {
        name: TypeEngine.to_literal(
            ctx,
            python_type=type(value),
            python_val=value,
            expected=TypeEngine.to_literal_type(type(value)),
        )
        for name, value in kwargs.items()
    }
    return LiteralMap(literals=literal_by_name).to_flyte_idl()
def execute(self, **kwargs) -> Any:
    """
    Run the notebook through Papermill and collect the task's declared outputs.

    TODO: Figure out how to share FlyteContext ExecutionParameters with the notebook kernel (as notebook kernel
         is executed in a separate python process)
    For Spark, the notebooks today need to use the new_session or just getOrCreate session and get a handle to the
    singleton

    :param kwargs: task inputs, passed to the notebook as Papermill parameters.
    :return: a tuple with one value per entry of ``self.python_interface.outputs``, in
        that order. The two implicit outputs resolve to the executed notebook path and
        the rendered notebook path; every other output is converted from the literal
        recorded by the notebook.
    :raises RuntimeError: if a declared output was not recorded by the notebook.
    """
    logger.info(
        f"Hijacking the call for task-type {self.task_type}, to call notebook."
    )
    # Execute Notebook via Papermill.
    pm.execute_notebook(self._notebook_path, self.output_notebook_path, parameters=kwargs)  # type: ignore

    outputs = self.extract_outputs(self.output_notebook_path)
    self.render_nb_html(self.output_notebook_path, self.rendered_output_path)

    # Literals recorded by the notebook, keyed by output name (empty if none were found).
    m = outputs.literals if outputs else {}

    output_list = []
    for k, type_v in self.python_interface.outputs.items():
        if k == self._IMPLICIT_OP_NOTEBOOK:
            output_list.append(self.output_notebook_path)
        elif k == self._IMPLICIT_RENDERED_NOTEBOOK:
            output_list.append(self.rendered_output_path)
        elif k in m:
            v = TypeEngine.to_python_value(
                ctx=FlyteContext.current_context(),
                lv=m[k],
                expected_python_type=type_v,
            )
            output_list.append(v)
        else:
            # Report the declared type (``type_v``). The previous message interpolated
            # ``v``, which is unbound on the first miss or stale from an earlier output.
            raise RuntimeError(
                f"Expected output {k} of type {type_v} not found in the notebook outputs"
            )

    return tuple(output_list)
def dispatch_execute( self, ctx: FlyteContext, input_literal_map: _literal_models.LiteralMap ) -> Union[_literal_models.LiteralMap, _dynamic_job.DynamicJobSpec]: """ This function is mostly copied from the base PythonTask, but differs in that we have to infer the Python interface before executing. Also, we refer to ``self.task_template`` rather than just ``self`` like in task classes that derive from the base ``PythonTask``. """ # Invoked before the task is executed new_user_params = self.pre_execute(ctx.user_space_params) # Create another execution context with the new user params, but let's keep the same working dir with ctx.new_execution_context( mode=ctx.execution_state.mode, execution_params=new_user_params, working_dir=ctx.execution_state.working_dir, ) as exec_ctx: # Added: Have to reverse the Python interface from the task template Flyte interface # See docstring for more details. guessed_python_input_types = TypeEngine.guess_python_types( self.task_template.interface.inputs) native_inputs = TypeEngine.literal_map_to_kwargs( exec_ctx, input_literal_map, guessed_python_input_types) logger.info( f"Invoking FlyteTask executor {self.task_template.id.name} with inputs: {native_inputs}" ) try: native_outputs = self.execute(**native_inputs) except Exception as e: logger.exception(f"Exception when executing {e}") raise e logger.info( f"Task executed successfully in user level, outputs: {native_outputs}" ) # Lets run the post_execute method. This may result in a IgnoreOutputs Exception, which is # bubbled up to be handled at the callee layer. 
native_outputs = self.post_execute(new_user_params, native_outputs) # Short circuit the translation to literal map because what's returned may be a dj spec (or an # already-constructed LiteralMap if the dynamic task was a no-op), not python native values if isinstance(native_outputs, _literal_models.LiteralMap) or isinstance( native_outputs, _dynamic_job.DynamicJobSpec): return native_outputs expected_output_names = list( self.task_template.interface.outputs.keys()) if len(expected_output_names) == 1: # Here we have to handle the fact that the task could've been declared with a typing.NamedTuple of # length one. That convention is used for naming outputs - and single-length-NamedTuples are # particularly troublesome but elegant handling of them is not a high priority # Again, we're using the output_tuple_name as a proxy. # Deleted some stuff native_outputs_as_map = { expected_output_names[0]: native_outputs } elif len(expected_output_names) == 0: native_outputs_as_map = {} else: native_outputs_as_map = { expected_output_names[i]: native_outputs[i] for i, _ in enumerate(native_outputs) } # We manually construct a LiteralMap here because task inputs and outputs actually violate the assumption # built into the IDL that all the values of a literal map are of the same type. literals = {} for k, v in native_outputs_as_map.items(): literal_type = self.task_template.interface.outputs[k].type py_type = type(v) if isinstance(v, tuple): raise AssertionError( f"Output({k}) in task{self.task_template.id.name} received a tuple {v}, instead of {py_type}" ) try: literals[k] = TypeEngine.to_literal( exec_ctx, v, py_type, literal_type) except Exception as e: raise AssertionError( f"failed to convert return value for var {k}") from e outputs_literal_map = _literal_models.LiteralMap(literals=literals) # After the execute has been successfully completed return outputs_literal_map