def test_sd():
    """Check StructuredDataset accessor guards and decoder return-type handling.

    Covers three situations:
      * ``all()`` / ``iter()`` called without a preceding ``open(...)``;
      * a decoder that yields (generator): ``iter()`` works, ``all()`` raises;
      * a decoder that returns a DataFrame: ``iter()`` raises.
    """
    plain = StructuredDataset(dataframe="hi")
    plain.uri = "my uri"
    assert plain.file_format == PARQUET

    # Neither accessor has been preceded by open(...), so both must refuse.
    with pytest.raises(ValueError, match="No dataframe type set"):
        plain.all()
    with pytest.raises(ValueError, match="No dataframe type set."):
        plain.iter()

    def _tmpfs_backed_dataset():
        # A dataset that only carries a literal pointing at the "tmpfs" protocol.
        dataset = StructuredDataset()
        dataset._literal_sd = literals.StructuredDataset(
            uri="tmpfs://somewhere",
            metadata=StructuredDatasetMetadata(StructuredDatasetType(format="")),
        )
        return dataset

    # --- decoder implemented as a generator ---------------------------------
    class MockPandasDecodingHandlers(StructuredDatasetDecoder):
        def decode(
            self,
            ctx: FlyteContext,
            flyte_value: literals.StructuredDataset,
            current_task_metadata: StructuredDatasetMetadata,
        ) -> typing.Union[typing.Generator[pd.DataFrame, None, None]]:
            yield pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]})

    StructuredDatasetTransformerEngine.register(
        MockPandasDecodingHandlers(pd.DataFrame, "tmpfs"),
        default_for_type=False,
    )

    streaming = _tmpfs_backed_dataset()
    assert isinstance(streaming.open(pd.DataFrame).iter(), typing.Generator)
    with pytest.raises(ValueError):
        streaming.open(pd.DataFrame).all()

    # --- decoder that returns a whole DataFrame -----------------------------
    class MockPandasDecodingHandlers(StructuredDatasetDecoder):
        def decode(
            self,
            ctx: FlyteContext,
            flyte_value: literals.StructuredDataset,
            current_task_metadata: StructuredDatasetMetadata,
        ) -> pd.DataFrame:
            return pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]})

    # override=True replaces the generator-based handler registered above.
    StructuredDatasetTransformerEngine.register(
        MockPandasDecodingHandlers(pd.DataFrame, "tmpfs"),
        default_for_type=False,
        override=True,
    )

    eager = _tmpfs_backed_dataset()
    with pytest.raises(ValueError):
        eager.open(pd.DataFrame).iter()
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Return a literal StructuredDataset whose URI is the empty string.

    None of the arguments are read; no data is written and no metadata is
    attached to the returned literal.
    """
    empty_literal = literals.StructuredDataset(uri="")
    return empty_literal
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Return a literal pointing at the fixed URI ``bq://bucket/key``.

    The dataframe is not written anywhere; only ``structured_dataset_type`` is
    used, to build the metadata attached to the returned literal.
    """
    sd_metadata = StructuredDatasetMetadata(structured_dataset_type)
    return literals.StructuredDataset(uri="bq://bucket/key", metadata=sd_metadata)
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Return a literal pointing at the fixed path ``/tmp/avro``.

    The dataframe is not written anywhere; only ``structured_dataset_type`` is
    used, to build the metadata attached to the returned literal.
    """
    sd_metadata = StructuredDatasetMetadata(structured_dataset_type)
    return literals.StructuredDataset(uri="/tmp/avro", metadata=sd_metadata)
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Write the wrapped dataframe as parquet and return a literal for it.

    The destination is the dataset's own ``uri`` when it is truthy, otherwise a
    random remote directory obtained from ``ctx.file_access``. The write uses
    ``mode("overwrite")``.
    """
    destination = typing.cast(str, structured_dataset.uri)
    if not destination:
        destination = ctx.file_access.get_random_remote_directory()

    frame = typing.cast(DataFrame, structured_dataset.dataframe)
    writer = frame.write.mode("overwrite")
    writer.parquet(destination)

    sd_metadata = StructuredDatasetMetadata(structured_dataset_type)
    return literals.StructuredDataset(uri=destination, metadata=sd_metadata)
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Write the wrapped table to ``<uri>/00000`` as parquet via an fsspec filesystem.

    ``uri`` is the dataset's own URI when truthy, otherwise a random remote
    directory. When the URI is not remote, the directory is created first.
    The returned literal carries the directory URI, not the file path.
    """
    uri = typing.cast(str, structured_dataset.uri)
    if not uri:
        uri = ctx.file_access.get_random_remote_directory()

    # Create the target directory when the destination is not remote.
    if not ctx.file_access.is_remote(uri):
        Path(uri).mkdir(parents=True, exist_ok=True)

    target_file = os.path.join(uri, f"{0:05}")
    persistence = FSSpecPersistence(data_config=ctx.file_access.data_config)
    target_fs = persistence.get_filesystem(target_file)
    pq.write_table(
        structured_dataset.dataframe,
        strip_protocol(target_file),
        filesystem=target_fs,
    )

    sd_metadata = StructuredDatasetMetadata(structured_dataset_type)
    return literals.StructuredDataset(uri=uri, metadata=sd_metadata)
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Write the wrapped table to a local parquet file, then upload its directory.

    The upload destination is the dataset's own ``uri`` when truthy, otherwise
    a random remote path from ``ctx.file_access``. The file is staged as
    ``00000`` inside a random local directory.
    """
    destination = typing.cast(str, structured_dataset.uri)
    if not destination:
        destination = ctx.file_access.get_random_remote_path()

    table = structured_dataset.dataframe
    staging_dir = ctx.file_access.get_random_local_directory()
    staged_file = os.path.join(staging_dir, f"{0:05}")
    pq.write_table(table, staged_file)
    ctx.file_access.upload_directory(staging_dir, destination)

    sd_metadata = StructuredDatasetMetadata(structured_dataset_type)
    return literals.StructuredDataset(uri=destination, metadata=sd_metadata)
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Convert the wrapped array to an arrow table, write parquet, and upload it.

    Columns are named ``col0``, ``col1``, ... with one name per element along
    the array's first axis (``len(df)``). The file is staged locally as
    ``00000`` and its directory uploaded to the dataset's ``uri`` (or a random
    remote directory when the uri is falsy).

    Side effect: sets ``structured_dataset_type.format`` to ``PARQUET`` on the
    object passed in by the caller.
    """
    destination = typing.cast(str, structured_dataset.uri)
    if not destination:
        destination = ctx.file_access.get_random_remote_directory()

    arrays = typing.cast(np.ndarray, structured_dataset.dataframe)
    column_names = []
    for idx in range(len(arrays)):
        column_names.append("col" + str(idx))
    table = pa.Table.from_arrays(arrays, column_names)

    staging_dir = ctx.file_access.get_random_local_directory()
    staged_file = os.path.join(staging_dir, f"{0:05}")
    pq.write_table(table, staged_file)
    ctx.file_access.upload_directory(staging_dir, destination)

    structured_dataset_type.format = PARQUET
    sd_metadata = StructuredDatasetMetadata(structured_dataset_type)
    return literals.StructuredDataset(uri=destination, metadata=sd_metadata)
def convert_to_structured_dataset(
    self,
    ctx: typing.Optional[click.Context],
    param: typing.Optional[click.Parameter],
    value: Directory,
) -> Literal:
    """Wrap a directory value in a structured-dataset ``Literal``.

    The URI comes from ``self.get_uri_for_dir(value, "00000.parquet")`` and the
    dataset type from ``self._literal_type``. ``ctx`` and ``param`` are unused.
    """
    dataset_uri = self.get_uri_for_dir(value, "00000.parquet")
    sd_metadata = literals.StructuredDatasetMetadata(
        structured_dataset_type=self._literal_type.structured_dataset_type,
    )
    sd_literal = literals.StructuredDataset(uri=dataset_uri, metadata=sd_metadata)
    return Literal(scalar=Scalar(structured_dataset=sd_literal))
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Write the wrapped pandas DataFrame to ``<uri>/00000`` as parquet.

    ``uri`` is the dataset's own URI when truthy, otherwise a random remote
    directory. When the URI is not remote, the directory is created first.
    Timestamps are coerced to microseconds and truncation is disallowed.

    Side effect: sets ``structured_dataset_type.format`` to ``PARQUET`` on the
    object passed in by the caller.
    """
    uri = typing.cast(str, structured_dataset.uri)
    if not uri:
        uri = ctx.file_access.get_random_remote_directory()

    # Create the target directory when the destination is not remote.
    if not ctx.file_access.is_remote(uri):
        Path(uri).mkdir(parents=True, exist_ok=True)

    target_file = os.path.join(uri, f"{0:05}")
    frame = typing.cast(pd.DataFrame, structured_dataset.dataframe)
    storage_opts = get_storage_options(ctx.file_access.data_config, target_file)
    frame.to_parquet(
        target_file,
        coerce_timestamps="us",
        allow_truncated_timestamps=False,
        storage_options=storage_opts,
    )

    structured_dataset_type.format = PARQUET
    sd_metadata = StructuredDatasetMetadata(structured_dataset_type)
    return literals.StructuredDataset(uri=uri, metadata=sd_metadata)
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Write the wrapped pandas DataFrame to a local parquet file and upload it.

    The file is staged as ``00000`` in a random local directory, and that
    directory is uploaded to the dataset's ``uri`` (or a random remote
    directory when the uri is falsy). Timestamps are coerced to microseconds
    and truncation is disallowed.

    Side effect: sets ``structured_dataset_type.format`` to ``PARQUET`` on the
    object passed in by the caller.
    """
    destination = typing.cast(str, structured_dataset.uri)
    if not destination:
        destination = ctx.file_access.get_random_remote_directory()

    frame = typing.cast(pd.DataFrame, structured_dataset.dataframe)
    staging_dir = ctx.file_access.get_random_local_directory()
    staged_file = os.path.join(staging_dir, f"{0:05}")
    frame.to_parquet(
        staged_file,
        coerce_timestamps="us",
        allow_truncated_timestamps=False,
    )
    ctx.file_access.upload_directory(staging_dir, destination)

    structured_dataset_type.format = PARQUET
    sd_metadata = StructuredDatasetMetadata(structured_dataset_type)
    return literals.StructuredDataset(uri=destination, metadata=sd_metadata)
def test_structured_dataset():
    """Build a structured-dataset Scalar, round-trip it through IDL, and check fields."""

    def _simple(kind):
        return _types.LiteralType(simple=kind)

    # Column name -> literal type, in declaration order.
    column_types = {
        "a": _simple(_types.SimpleType.INTEGER),
        "b": _simple(_types.SimpleType.STRING),
        "c": _types.LiteralType(collection_type=_simple(_types.SimpleType.INTEGER)),
        "d": _types.LiteralType(map_value_type=_simple(_types.SimpleType.INTEGER)),
    }
    my_cols = [
        _types.StructuredDatasetType.DatasetColumn(col_name, col_type)
        for col_name, col_type in column_types.items()
    ]

    dataset_type = _types.StructuredDatasetType(columns=my_cols, format="parquet")
    ds = literals.StructuredDataset(
        uri="s3://bucket",
        metadata=literals.StructuredDatasetMetadata(structured_dataset_type=dataset_type),
    )

    def _assert_only_structured_dataset(scalar):
        # Every other scalar variant must be unset; the dataset must be intact.
        assert scalar.blob is None
        assert scalar.binary is None
        assert scalar.schema is None
        assert scalar.none_type is None
        assert scalar.structured_dataset is not None
        assert scalar.value.uri == "s3://bucket"
        assert len(scalar.value.metadata.structured_dataset_type.columns) == 4

    original = literals.Scalar(structured_dataset=ds)
    assert original.error is None
    _assert_only_structured_dataset(original)

    round_tripped = literals.Scalar.from_flyte_idl(original.to_flyte_idl())
    assert original == round_tripped
    _assert_only_structured_dataset(round_tripped)
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    """Write the wrapped polars DataFrame to a local parquet file and upload it.

    The file is staged as ``00000`` in a random local directory, and that
    directory is uploaded to the dataset's ``uri`` (or a random remote
    directory when the uri is falsy).
    """
    frame = typing.cast(pl.DataFrame, structured_dataset.dataframe)
    staging_dir = ctx.file_access.get_random_local_directory()
    staged_file = f"{staging_dir}/00000"

    # Polars 0.13.12 deprecated to_parquet in favor of write_parquet, so use
    # whichever writer this polars version provides.
    write_parquet = frame.write_parquet if hasattr(frame, "write_parquet") else frame.to_parquet
    write_parquet(staged_file)

    destination = typing.cast(str, structured_dataset.uri)
    if not destination:
        destination = ctx.file_access.get_random_remote_directory()
    ctx.file_access.upload_directory(staging_dir, destination)

    sd_metadata = StructuredDatasetMetadata(structured_dataset_type)
    return literals.StructuredDataset(uri=destination, metadata=sd_metadata)
def to_python_value(self, ctx: FlyteContext, lv: Literal, expected_python_type: Type[T]) -> T:
    """
    Convert an incoming Literal into the Python value requested by the running task.

    The only tricky thing with converting a Literal (say the output of an earlier task), to a Python value at
    the start of a task execution, is the column subsetting behavior. For example, if you have,

        def t1() -> Annotated[StructuredDataset, kwtypes(col_a=int, col_b=float)]: ...
        def t2(in_a: Annotated[StructuredDataset, kwtypes(col_b=float)]): ...

    where t2(in_a=t1()), when t2 does in_a.open(pd.DataFrame).all(), it should get a DataFrame
    with only one column.

    How the StructuredDatasetType handed to the decoder is chosen:

    * The currently running task HAS columns defined (whether or not the incoming Literal has columns):
      the StructuredDatasetType passed to the decoder will have the columns as defined by the type
      annotation of the currently running task. Decoders **should** then subset the incoming data to the
      columns requested.
    * The currently running task has [] columns or None, and the incoming Literal HAS columns defined:
      the StructuredDatasetType passed to the decoder will have the columns from the incoming Literal.
      This is the scenario where the Literal returned by the running task will have more information than
      the running task's signature.
    * Both the currently running task and the incoming Literal have [] columns or None:
      the StructuredDatasetType passed to the decoder will have an empty list of columns.

    :param ctx: Context forwarded to ``self.open_as`` when a dataframe has to be materialized.
    :param lv: The incoming literal; either ``lv.scalar.schema`` (legacy FlyteSchema) or
        ``lv.scalar.structured_dataset`` is read.
    :param expected_python_type: The (possibly annotated) type requested by the task signature.
    :return: An un-opened StructuredDataset-like object when a StructuredDataset (sub)class was
        requested, otherwise whatever ``self.open_as`` returns for the requested dataframe type.
    """
    # Detect annotations and extract out all the relevant information that the user might supply
    # (storage_fmt and pa_schema are unpacked but not used further in this method.)
    expected_python_type, column_dict, storage_fmt, pa_schema = extract_cols_and_format(
        expected_python_type)

    # The literal that we get in might be an old FlyteSchema.
    # We'll continue to support this for the time being. There is some duplicated logic here but let's
    # keep it copy/pasted for clarity
    if lv.scalar.schema is not None:
        schema_columns = lv.scalar.schema.type.columns

        # See the repeated logic below for comments
        if column_dict is None or len(column_dict) == 0:
            final_dataset_columns = []
            if schema_columns is not None and schema_columns != []:
                # Translate each legacy schema column into a StructuredDataset column.
                for c in schema_columns:
                    final_dataset_columns.append(
                        StructuredDatasetType.DatasetColumn(
                            name=c.name,
                            literal_type=LiteralType(
                                simple=convert_schema_type_to_structured_dataset_type(c.type),
                            ),
                        ))
            # Dataframe will always be serialized to parquet file by FlyteSchema transformer
            new_sdt = StructuredDatasetType(columns=final_dataset_columns, format=PARQUET)
        else:
            final_dataset_columns = self._convert_ordered_dict_of_columns_to_list(column_dict)
            # Dataframe will always be serialized to parquet file by FlyteSchema transformer
            new_sdt = StructuredDatasetType(columns=final_dataset_columns, format=PARQUET)

        metad = literals.StructuredDatasetMetadata(structured_dataset_type=new_sdt)
        # Re-express the legacy schema literal as a StructuredDataset literal at the same uri.
        sd_literal = literals.StructuredDataset(
            uri=lv.scalar.schema.uri,
            metadata=metad,
        )

        if issubclass(expected_python_type, StructuredDataset):
            # NOTE(review): this branch instantiates the base StructuredDataset and does not set
            # file_format, whereas the non-legacy branch below instantiates expected_python_type and
            # sets file_format - confirm the difference is intentional.
            sd = StructuredDataset(dataframe=None, metadata=metad)
            sd._literal_sd = sd_literal
            return sd
        else:
            return self.open_as(ctx, sd_literal, expected_python_type, metad)

    # Start handling for StructuredDataset scalars, first look at the columns
    incoming_columns = lv.scalar.structured_dataset.metadata.structured_dataset_type.columns

    # If the incoming literal, also doesn't have columns, then we just have an empty list, so initialize here
    final_dataset_columns = []
    # If the current running task's input does not have columns defined, or has an empty list of columns
    if column_dict is None or len(column_dict) == 0:
        # but if it does, then we just copy it over
        if incoming_columns is not None and incoming_columns != []:
            final_dataset_columns = incoming_columns.copy()
    # If the current running task's input does have columns defined
    else:
        final_dataset_columns = self._convert_ordered_dict_of_columns_to_list(column_dict)

    # Everything except the columns is carried over unchanged from the incoming literal's type.
    new_sdt = StructuredDatasetType(
        columns=final_dataset_columns,
        format=lv.scalar.structured_dataset.metadata.structured_dataset_type.format,
        external_schema_type=lv.scalar.structured_dataset.metadata.structured_dataset_type.external_schema_type,
        external_schema_bytes=lv.scalar.structured_dataset.metadata.structured_dataset_type.external_schema_bytes,
    )
    metad = StructuredDatasetMetadata(structured_dataset_type=new_sdt)

    # A StructuredDataset type, for example
    #   t1(input_a: StructuredDataset)
    # or
    #   t1(input_a: Annotated[StructuredDataset, my_cols])
    if issubclass(expected_python_type, StructuredDataset):
        sd = expected_python_type(
            dataframe=None,
            # Note here that the requested type itself (possibly a StructuredDataset subclass) is
            # what gets instantiated, and it receives the updated metadata computed above.
            metadata=metad,
        )
        sd._literal_sd = lv.scalar.structured_dataset
        sd.file_format = metad.structured_dataset_type.format
        return sd

    # If the requested type was not a StructuredDataset, then it means it was a plain dataframe type, which means
    # we should do the opening/downloading and whatever else it might entail right now. No iteration option here.
    return self.open_as(ctx, lv.scalar.structured_dataset, df_type=expected_python_type, updated_metadata=metad)
def to_literal(
    self,
    ctx: FlyteContext,
    python_val: Union[StructuredDataset, typing.Any],
    python_type: Union[Type[StructuredDataset], Type],
    expected: LiteralType,
) -> Literal:
    """
    Convert a Python value (a StructuredDataset wrapper or a raw dataframe) into a Literal.

    :param ctx: Context forwarded to ``self.encode`` when an encoder has to be invoked.
    :param python_val: Either a StructuredDataset instance or a dataframe object.
    :param python_type: The (possibly annotated) declared type of ``python_val``.
    :param expected: The expected literal type; when it carries a ``structured_dataset_type``, its
        columns, format and external schema fields are used for the outgoing dataset type.
    :return: A Literal holding a structured-dataset scalar, or whatever ``self.encode`` returns.
    :raises ValueError: If a StructuredDataset carries both a literal and a dataframe, or carries
        neither a dataframe nor a uri.
    """
    # NOTE(review): an earlier comment here said a copy is made before handing off to encoders; no copy
    # is made in this method, so encoders receive the objects built/passed below as-is - confirm intent.
    # Check first to see if it's even an SD type. For backwards compatibility, we may be getting a FlyteSchema
    # (attrs captures the remaining extracted values; they are not used further in this method.)
    python_type, *attrs = extract_cols_and_format(python_type)
    # In case it's a FlyteSchema
    sdt = StructuredDatasetType(format=self.DEFAULT_FORMATS.get(python_type, None))

    # Prefer the dataset type declared on the expected literal type when one is present.
    if expected and expected.structured_dataset_type:
        sdt = StructuredDatasetType(
            columns=expected.structured_dataset_type.columns,
            format=expected.structured_dataset_type.format,
            external_schema_type=expected.structured_dataset_type.external_schema_type,
            external_schema_bytes=expected.structured_dataset_type.external_schema_bytes,
        )

    # If the type signature has the StructuredDataset class, it will, or at least should, also be a
    # StructuredDataset instance.
    if issubclass(python_type, StructuredDataset) and isinstance(python_val, StructuredDataset):
        # There are three cases that we need to take care of here.

        # 1. A task returns a StructuredDataset that was just a passthrough input. If this happens
        # then return the original literals.StructuredDataset without invoking any encoder
        #
        # Ex.
        #   def t1(dataset: Annotated[StructuredDataset, my_cols]) -> Annotated[StructuredDataset, my_cols]:
        #       return dataset
        if python_val._literal_sd is not None:
            if python_val.dataframe is not None:
                raise ValueError(
                    f"Shouldn't have specified both literal {python_val._literal_sd} and dataframe {python_val.dataframe}"
                )
            return Literal(scalar=Scalar(structured_dataset=python_val._literal_sd))

        # 2. A task returns a python StructuredDataset with a uri.
        # Note: this case is also what happens we start a local execution of a task with a python StructuredDataset.
        # It gets converted into a literal first, then back into a python StructuredDataset.
        #
        # Ex.
        #   def t2(uri: str) -> Annotated[StructuredDataset, my_cols]
        #       return StructuredDataset(uri=uri)
        if python_val.dataframe is None:
            if not python_val.uri:
                raise ValueError(f"If dataframe is not specified, then the uri should be specified. {python_val}")
            sd_model = literals.StructuredDataset(
                uri=python_val.uri,
                metadata=StructuredDatasetMetadata(structured_dataset_type=sdt),
            )
            return Literal(scalar=Scalar(structured_dataset=sd_model))

        # 3. This is the third and probably most common case. The python StructuredDataset object wraps a dataframe
        # that we will need to invoke an encoder for. Figure out which encoder to call and invoke it.
        df_type = type(python_val.dataframe)
        # With no uri, use the default protocol registered for this dataframe type; otherwise take
        # the protocol from the uri itself.
        if python_val.uri is None:
            protocol = self.DEFAULT_PROTOCOLS[df_type]
        else:
            protocol = protocol_prefix(python_val.uri)
        return self.encode(
            ctx,
            python_val,
            df_type,
            protocol,
            # Fall back to the StructuredDataset's default file format when sdt.format is falsy.
            sdt.format or typing.cast(StructuredDataset, python_val).DEFAULT_FILE_FORMAT,
            sdt,
        )

    # Otherwise assume it's a dataframe instance. Wrap it with some defaults
    fmt = self.DEFAULT_FORMATS[python_type]
    protocol = self.DEFAULT_PROTOCOLS[python_type]
    meta = StructuredDatasetMetadata(structured_dataset_type=expected.structured_dataset_type if expected else None)

    sd = StructuredDataset(dataframe=python_val, metadata=meta)
    return self.encode(ctx, sd, python_type, protocol, fmt, sdt)