def apply(
    self,
    objects: Union[
        Entity,
        FeatureView,
        FeatureService,
        List[Union[FeatureView, Entity, FeatureService]],
    ],
) -> None:
    fs = self._build_feast_feature_store()
    fs.apply(objects)

    # Applying also initializes the sqlite tables in the online store
    FlyteContext.current_context().file_access.upload(
        self.config.online_store_path,
        f"s3://{self.config.s3_bucket}/{self.config.online_store_path}",
    )
def all(self, **kwargs) -> pyspark.sql.DataFrame:
    if self._fmt == SchemaFormat.PARQUET:
        ctx = FlyteContext.current_context().user_space_params
        return ctx.spark_session.read.parquet(self.from_path)
    raise AssertionError("Only Parquet type files are supported for spark dataframe currently")
def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
    import pyspark as _pyspark

    ctx = FlyteContext.current_context()
    sess_builder = _pyspark.sql.SparkSession.builder.appName(f"FlyteSpark: {user_params.execution_id}")
    if not (ctx.execution_state and ctx.execution_state.mode == ExecutionState.Mode.TASK_EXECUTION):
        # If either of the above conditions is false, we are in a local execution of this task.
        # Add the user-supplied spark-conf for local/notebook based execution.
        spark_conf = _pyspark.SparkConf()
        for k, v in self.task_config.spark_conf.items():
            spark_conf.set(k, v)
        # In local execution, propagate PYTHONPATH to executors too. This makes the spark
        # execution hermetic to the execution environment. For example, it allows running
        # Spark applications using Bazel, without major changes.
        if "PYTHONPATH" in os.environ:
            spark_conf.setExecutorEnv("PYTHONPATH", os.environ["PYTHONPATH"])
        sess_builder = sess_builder.config(conf=spark_conf)

    self.sess = sess_builder.getOrCreate()
    return user_params.builder().add_attr("SPARK_SESSION", self.sess).build()
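# A minimal usage sketch (not part of the original code): a task configured with this Spark
# plugin can pick up the session that pre_execute registered under the "SPARK_SESSION"
# attribute via flytekit.current_context(). The spark_conf values below are illustrative
# assumptions, not settings taken from the original snippet.
import flytekit
from flytekit import task
from flytekitplugins.spark import Spark


@task(task_config=Spark(spark_conf={"spark.driver.memory": "1g"}))
def spark_row_count() -> int:
    # The session created in pre_execute is exposed on the execution parameters.
    sess = flytekit.current_context().spark_session
    return sess.range(100).count()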
def get_online_features(
    self,
    features: Union[List[str], FeatureService],
    entity_rows: List[Dict[str, Any]],
    feature_refs: Optional[List[str]] = None,
    full_feature_names: bool = False,
) -> Dict[str, Any]:
    FlyteContext.current_context().file_access.download(
        f"s3://{self.config.s3_bucket}/{self.config.online_store_path}",
        self.config.online_store_path,
    )
    fs = self._build_feast_feature_store()

    online_response = fs.get_online_features(features, entity_rows, feature_refs, full_feature_names)
    return online_response.to_dict()
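# Hypothetical usage sketch for the method above; `store` is assumed to be an instance of
# the surrounding feature-store wrapper, and the feature references and entity key below
# are illustrative, not taken from the original code.
online = store.get_online_features(
    features=["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"],
    entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}],
)
print(online["conv_rate"])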
def materialize(
    self,
    start_date: datetime,
    end_date: datetime,
    feature_views: Optional[List[str]] = None,
) -> None:
    FlyteContext.current_context().file_access.download(
        f"s3://{self.config.s3_bucket}/{self.config.online_store_path}",
        self.config.online_store_path,
    )
    fs = self._build_feast_feature_store()
    fs.materialize(
        start_date=start_date,
        end_date=end_date,
        feature_views=feature_views,
    )
    FlyteContext.current_context().file_access.upload(
        self.config.online_store_path,
        f"s3://{self.config.s3_bucket}/{self.config.online_store_path}",
    )
def __init__(self, base_dir: str = None):
    if base_dir is None:
        self._base_dir = FlyteContext.current_context().user_space_params.working_directory
        self._files = []
    else:
        self._base_dir = base_dir
        files = os.listdir(base_dir)
        self._files = [os.path.join(base_dir, f) for f in files]
def test_parameter_ranges_transformer():
    t = ParameterRangesTransformer()
    assert t.get_literal_type(ParameterRangeOneOf) == Generic.to_flyte_literal_type()
    o = ParameterRangeOneOf(param=IntegerParameterRange(10, 0, 1))
    ctx = FlyteContext.current_context()
    lit = t.to_literal(ctx, python_val=o, python_type=ParameterRangeOneOf, expected=None)
    assert lit is not None
    assert lit.scalar.generic is not None
    ro = t.to_python_value(ctx, lit, ParameterRangeOneOf)
    assert ro is not None
    assert ro == o
def execute(self, **kwargs) -> typing.Any:
    with tempfile.TemporaryDirectory() as temp_dir:
        ctx = FlyteContext.current_context()
        file_ext = os.path.basename(self.task_config.uri)
        local_path = os.path.join(temp_dir, file_ext)
        ctx.file_access.download(self.task_config.uri, local_path)
        if self.task_config.compressed:
            local_path = unarchive_file(local_path, temp_dir)

        print(f"Connecting to db {local_path}")
        with contextlib.closing(sqlite3.connect(local_path)) as con:
            df = pd.read_sql_query(self.get_query(**kwargs), con)
            return df
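# A minimal declaration sketch, assuming the execute() above belongs to flytekit's
# SQLite3Task with its SQLite3Config; the dataset URI and query below are illustrative,
# not taken from the original code.
from flytekit import kwtypes
from flytekit.extras.sqlite3.task import SQLite3Config, SQLite3Task

example_query_task = SQLite3Task(
    name="sqlite3.example_query",
    query_template="SELECT * FROM tracks LIMIT {{ .inputs.limit }}",
    inputs=kwtypes(limit=int),
    task_config=SQLite3Config(uri="https://example.com/chinook.zip", compressed=True),
)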
def decode(
    self,
    ctx: FlyteContext,
    flyte_value: literals.StructuredDataset,
    current_task_metadata: StructuredDatasetMetadata,
) -> DataFrame:
    user_ctx = FlyteContext.current_context().user_space_params
    if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
        columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
        return user_ctx.spark_session.read.parquet(flyte_value.uri).select(*columns)
    return user_ctx.spark_session.read.parquet(flyte_value.uri)
def execute_from_model(self, tt: task_models.TaskTemplate, **kwargs) -> typing.Any:
    with tempfile.TemporaryDirectory() as temp_dir:
        ctx = FlyteContext.current_context()
        file_ext = os.path.basename(tt.custom["uri"])
        local_path = os.path.join(temp_dir, file_ext)
        ctx.file_access.download(tt.custom["uri"], local_path)
        if tt.custom["compressed"]:
            local_path = unarchive_file(local_path, temp_dir)

        print(f"Connecting to db {local_path}")
        interpolated_query = SQLite3Task.interpolate_query(tt.custom["query_template"], **kwargs)
        print(f"Interpolated query {interpolated_query}")
        with contextlib.closing(sqlite3.connect(local_path)) as con:
            df = pd.read_sql_query(interpolated_query, con)
            return df
def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
    import pyspark as _pyspark

    ctx = FlyteContext.current_context()
    sess_builder = _pyspark.sql.SparkSession.builder.appName(f"FlyteSpark: {user_params.execution_id}")
    if not (ctx.execution_state and ctx.execution_state.mode == ExecutionState.Mode.TASK_EXECUTION):
        # If either of the above conditions is false, we are in a local execution of this task.
        # Add the user-supplied spark-conf for local/notebook based execution and default to
        # a local master.
        conf_pairs = set()
        for k, v in self.task_config.spark_conf.items():
            conf_pairs.add((k, v))
        conf_pairs.add(("spark.master", "local"))
        # Attach the conf to the session builder so the settings actually take effect.
        sess_builder = sess_builder.config(conf=_pyspark.SparkConf().setAll(list(conf_pairs)))

    sess = sess_builder.getOrCreate()
    return user_params.builder().add_attr("SPARK_SESSION", sess).build()
def test_hpoconfig_transformer():
    t = HPOTuningJobConfigTransformer()
    assert t.get_literal_type(HyperparameterTuningJobConfig) == Generic.to_flyte_literal_type()
    o = HyperparameterTuningJobConfig(
        tuning_strategy=1,
        tuning_objective=HyperparameterTuningObjective(
            objective_type=HyperparameterTuningObjectiveType.MINIMIZE,
            metric_name="x",
        ),
        training_job_early_stopping_type=TrainingJobEarlyStoppingType.OFF,
    )
    ctx = FlyteContext.current_context()
    lit = t.to_literal(ctx, python_val=o, python_type=HyperparameterTuningJobConfig, expected=None)
    assert lit is not None
    assert lit.scalar.generic is not None
    ro = t.to_python_value(ctx, lit, HyperparameterTuningJobConfig)
    assert ro is not None
    assert ro == o
def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
    """
    Pre-execute for SageMaker automatically adds the distributed-training context to the
    execution params, but only if the number of execution instances is > 1. Otherwise this
    is considered to be a single-node execution.
    """
    if self._is_distributed():
        logging.info("Distributed context detected!")
        exec_state = FlyteContext.current_context().execution_state
        if exec_state and exec_state.mode == ExecutionState.Mode.TASK_EXECUTION:
            # This mode indicates we are actually in a remote execution environment
            # (within SageMaker in this case).
            dist_ctx = DistributedTrainingContext.from_env()
        else:
            dist_ctx = DistributedTrainingContext.local_execute()
        return user_params.builder().add_attr("DISTRIBUTED_TRAINING_CONTEXT", dist_ctx).build()

    return user_params
def record_outputs(**kwargs) -> str:
    """
    Use this method to record outputs from a notebook. It will convert all outputs to a
    Flyte-understandable format. For files and directories, use FlyteFile or FlyteDirectory,
    or wrap your paths in these types.
    """
    if kwargs is None:
        return ""

    m = {}
    ctx = FlyteContext.current_context()
    for k, v in kwargs.items():
        expected = TypeEngine.to_literal_type(type(v))
        lit = TypeEngine.to_literal(ctx, python_type=type(v), python_val=v, expected=expected)
        m[k] = lit
    return LiteralMap(literals=m).to_flyte_idl()
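# Hedged usage sketch (not part of the original code): inside the notebook's output cell,
# record the values that the wrapping NotebookTask should expose. The variable and its
# value below are illustrative assumptions.
from flytekitplugins.papermill import record_outputs

square = 4
record_outputs(square=square)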
def execute(self, **kwargs) -> Any:
    """
    TODO: Figure out how to share FlyteContext ExecutionParameters with the notebook kernel
          (as the notebook kernel is executed in a separate python process).

    For Spark, the notebooks today need to use new_session or just getOrCreate to obtain a
    handle to the singleton session.
    """
    logger.info(f"Hijacking the call for task-type {self.task_type}, to call notebook.")

    # Execute the notebook via Papermill.
    pm.execute_notebook(self._notebook_path, self.output_notebook_path, parameters=kwargs)  # type: ignore

    outputs = self.extract_outputs(self.output_notebook_path)
    self.render_nb_html(self.output_notebook_path, self.rendered_output_path)

    m = {}
    if outputs:
        m = outputs.literals
    output_list = []
    for k, type_v in self.python_interface.outputs.items():
        if k == self._IMPLICIT_OP_NOTEBOOK:
            output_list.append(self.output_notebook_path)
        elif k == self._IMPLICIT_RENDERED_NOTEBOOK:
            output_list.append(self.rendered_output_path)
        elif k in m:
            v = TypeEngine.to_python_value(
                ctx=FlyteContext.current_context(), lv=m[k], expected_python_type=type_v
            )
            output_list.append(v)
        else:
            raise RuntimeError(f"Expected output {k} of type {type_v} not found in the notebook outputs")

    return tuple(output_list)
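# A minimal declaration sketch, assuming the execute() above belongs to
# flytekitplugins.papermill's NotebookTask; the notebook path, inputs, and outputs below
# are illustrative, not taken from the original code.
from flytekit import kwtypes
from flytekitplugins.papermill import NotebookTask

nb_task = NotebookTask(
    name="simple_notebook_task",
    notebook_path="notebooks/simple.ipynb",
    inputs=kwtypes(n=int),
    outputs=kwtypes(square=int),
)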