Example #1
# Imports assumed for this excerpt, following flytekit's module layout:
import typing

import pandas as pd
import pytest

from flytekit.core.context_manager import FlyteContext
from flytekit.models import literals
from flytekit.models.literals import StructuredDatasetMetadata
from flytekit.models.types import StructuredDatasetType
from flytekit.types.structured.structured_dataset import (
    PARQUET,
    StructuredDataset,
    StructuredDatasetDecoder,
    StructuredDatasetTransformerEngine,
)


def test_sd():
    sd = StructuredDataset(dataframe="hi")
    sd.uri = "my uri"
    assert sd.file_format == PARQUET

    with pytest.raises(ValueError, match="No dataframe type set"):
        sd.all()

    with pytest.raises(ValueError, match="No dataframe type set."):
        sd.iter()

    class MockPandasDecodingHandlers(StructuredDatasetDecoder):
        def decode(
            self,
            ctx: FlyteContext,
            flyte_value: literals.StructuredDataset,
            current_task_metadata: StructuredDatasetMetadata,
        ) -> typing.Generator[pd.DataFrame, None, None]:
            yield pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]})

    StructuredDatasetTransformerEngine.register(
        MockPandasDecodingHandlers(pd.DataFrame, "tmpfs"),
        default_for_type=False,
    )
    sd = StructuredDataset()
    sd._literal_sd = literals.StructuredDataset(
        uri="tmpfs://somewhere",
        metadata=StructuredDatasetMetadata(StructuredDatasetType(format="")))
    assert isinstance(sd.open(pd.DataFrame).iter(), typing.Generator)

    with pytest.raises(ValueError):
        sd.open(pd.DataFrame).all()

    class MockPandasDecodingHandlers(StructuredDatasetDecoder):
        def decode(
            self,
            ctx: FlyteContext,
            flyte_value: literals.StructuredDataset,
            current_task_metadata: StructuredDatasetMetadata,
        ) -> pd.DataFrame:
            return pd.DataFrame({"Name": ["Tom", "Joseph"], "Age": [20, 22]})

    StructuredDatasetTransformerEngine.register(
        MockPandasDecodingHandlers(pd.DataFrame, "tmpfs"),
        default_for_type=False,
        override=True,
    )
    sd = StructuredDataset()
    sd._literal_sd = literals.StructuredDataset(
        uri="tmpfs://somewhere",
        metadata=StructuredDatasetMetadata(StructuredDatasetType(format="")))

    with pytest.raises(ValueError):
        sd.open(pd.DataFrame).iter()
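The mocks above show the contract the engine enforces: `iter()` needs a decoder that returns a generator, `all()` one that returns a whole dataframe, and a mismatch raises ValueError. A hedged sketch of the happy path for the second (dataframe-returning) decoder, reusing the test's own setup:

import pandas as pd

from flytekit.models import literals
from flytekit.models.literals import StructuredDatasetMetadata
from flytekit.models.types import StructuredDatasetType
from flytekit.types.structured.structured_dataset import StructuredDataset


def consume_registered_decoder() -> pd.DataFrame:
    # Build an SD backed by a literal, as the test does, and let the engine
    # dispatch to whatever decoder is registered for (pd.DataFrame, "tmpfs").
    sd = StructuredDataset()
    sd._literal_sd = literals.StructuredDataset(
        uri="tmpfs://somewhere",
        metadata=StructuredDatasetMetadata(StructuredDatasetType(format="")),
    )
    # all() materializes the full frame; iter() would stream chunks instead.
    return sd.open(pd.DataFrame).all()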
Example #2
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    return literals.StructuredDataset(uri="")


def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    return literals.StructuredDataset(
        uri="bq://bucket/key", metadata=StructuredDatasetMetadata(structured_dataset_type)
    )
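These two variants are bodies of StructuredDatasetEncoder.encode: the first emits a bare literal with an empty uri, the second attaches the uri and metadata. A minimal sketch of how such an encoder is typically wrapped and registered (class name and the bq:// uri are illustrative; import paths assume flytekit's layout):

import pandas as pd

from flytekit.core.context_manager import FlyteContext
from flytekit.models import literals
from flytekit.models.literals import StructuredDatasetMetadata
from flytekit.models.types import StructuredDatasetType
from flytekit.types.structured.structured_dataset import (
    StructuredDataset,
    StructuredDatasetEncoder,
    StructuredDatasetTransformerEngine,
)


class ExampleBQEncoder(StructuredDatasetEncoder):
    def __init__(self):
        # Handle pd.DataFrame values destined for bq:// uris, any format.
        super().__init__(pd.DataFrame, "bq", "")

    def encode(
        self,
        ctx: FlyteContext,
        structured_dataset: StructuredDataset,
        structured_dataset_type: StructuredDatasetType,
    ) -> literals.StructuredDataset:
        return literals.StructuredDataset(
            uri="bq://bucket/key", metadata=StructuredDatasetMetadata(structured_dataset_type)
        )


StructuredDatasetTransformerEngine.register(ExampleBQEncoder(), default_for_type=False)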
Example #4
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    return literals.StructuredDataset(
        uri="/tmp/avro",
        metadata=StructuredDatasetMetadata(structured_dataset_type))
Example #5
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    path = typing.cast(str, structured_dataset.uri) or ctx.file_access.get_random_remote_directory()
    df = typing.cast(DataFrame, structured_dataset.dataframe)
    df.write.mode("overwrite").parquet(path)
    return literals.StructuredDataset(
        uri=path,
        metadata=StructuredDatasetMetadata(structured_dataset_type))
Example #6
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    uri = typing.cast(str, structured_dataset.uri) or ctx.file_access.get_random_remote_directory()
    if not ctx.file_access.is_remote(uri):
        Path(uri).mkdir(parents=True, exist_ok=True)
    path = os.path.join(uri, f"{0:05}")  # single part file named "00000"
    fp = FSSpecPersistence(data_config=ctx.file_access.data_config)
    filesystem = fp.get_filesystem(path)
    pq.write_table(structured_dataset.dataframe, strip_protocol(path), filesystem=filesystem)
    return literals.StructuredDataset(uri=uri, metadata=StructuredDatasetMetadata(structured_dataset_type))
Example #7
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    path = typing.cast(str, structured_dataset.uri) or ctx.file_access.get_random_remote_path()
    df = structured_dataset.dataframe
    local_dir = ctx.file_access.get_random_local_directory()
    local_path = os.path.join(local_dir, f"{0:05}")
    pq.write_table(df, local_path)
    ctx.file_access.upload_directory(local_dir, path)
    return literals.StructuredDataset(
        uri=path,
        metadata=StructuredDatasetMetadata(structured_dataset_type))


def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    path = typing.cast(str, structured_dataset.uri) or ctx.file_access.get_random_remote_directory()
    df = typing.cast(np.ndarray, structured_dataset.dataframe)
    # Each entry along the ndarray's first axis becomes a named column.
    name = ["col" + str(i) for i in range(len(df))]
    table = pa.Table.from_arrays(df, name)
    local_dir = ctx.file_access.get_random_local_directory()
    local_path = os.path.join(local_dir, f"{0:05}")
    pq.write_table(table, local_path)
    ctx.file_access.upload_directory(local_dir, path)
    structured_dataset_type.format = PARQUET
    return literals.StructuredDataset(uri=path, metadata=StructuredDatasetMetadata(structured_dataset_type))
Example #9
    def convert_to_structured_dataset(self,
                                      ctx: typing.Optional[click.Context],
                                      param: typing.Optional[click.Parameter],
                                      value: Directory) -> Literal:

        uri = self.get_uri_for_dir(value, "00000.parquet")

        lit = Literal(
            scalar=Scalar(
                structured_dataset=literals.StructuredDataset(
                    uri=uri,
                    metadata=literals.StructuredDatasetMetadata(
                        structured_dataset_type=self._literal_type.structured_dataset_type),
                ),
            ),
        )

        return lit
Example #10
def encode(
    self,
    ctx: FlyteContext,
    structured_dataset: StructuredDataset,
    structured_dataset_type: StructuredDatasetType,
) -> literals.StructuredDataset:
    uri = typing.cast(str, structured_dataset.uri) or ctx.file_access.get_random_remote_directory()
    if not ctx.file_access.is_remote(uri):
        Path(uri).mkdir(parents=True, exist_ok=True)
    path = os.path.join(uri, f"{0:05}")
    df = typing.cast(pd.DataFrame, structured_dataset.dataframe)
    df.to_parquet(
        path,
        coerce_timestamps="us",
        allow_truncated_timestamps=False,
        storage_options=get_storage_options(ctx.file_access.data_config, path),
    )
    structured_dataset_type.format = PARQUET
    return literals.StructuredDataset(uri=uri, metadata=StructuredDatasetMetadata(structured_dataset_type))
Example #11
    def encode(
        self,
        ctx: FlyteContext,
        structured_dataset: StructuredDataset,
        structured_dataset_type: StructuredDatasetType,
    ) -> literals.StructuredDataset:

        path = typing.cast(str, structured_dataset.uri) or ctx.file_access.get_random_remote_directory()
        df = typing.cast(pd.DataFrame, structured_dataset.dataframe)
        local_dir = ctx.file_access.get_random_local_directory()
        local_path = os.path.join(local_dir, f"{0:05}")
        df.to_parquet(local_path,
                      coerce_timestamps="us",
                      allow_truncated_timestamps=False)
        ctx.file_access.upload_directory(local_dir, path)
        structured_dataset_type.format = PARQUET
        return literals.StructuredDataset(
            uri=path,
            metadata=StructuredDatasetMetadata(structured_dataset_type))
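Examples #7 and #11 (and #13 below) share a two-step write: serialize to a random local directory, then upload that directory to the destination uri, whereas Examples #6 and #10 write straight through an fsspec filesystem. A condensed sketch of the upload variant (hedged; `ctx` stands in for the FlyteContext available inside an encoder, and the helper name is hypothetical):

import os

import pandas as pd


def write_local_then_upload(ctx, df: pd.DataFrame, remote_path: str) -> str:
    # Serialize to a scratch directory first; upload_directory then moves the
    # single "00000" part file to the (possibly remote) destination.
    local_dir = ctx.file_access.get_random_local_directory()
    df.to_parquet(os.path.join(local_dir, f"{0:05}"))
    ctx.file_access.upload_directory(local_dir, remote_path)
    return remote_path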
Example #12
# Imports assumed for this excerpt:
from flytekit.models import literals
from flytekit.models import types as _types


def test_structured_dataset():
    my_cols = [
        _types.StructuredDatasetType.DatasetColumn(
            "a", _types.LiteralType(simple=_types.SimpleType.INTEGER)),
        _types.StructuredDatasetType.DatasetColumn(
            "b", _types.LiteralType(simple=_types.SimpleType.STRING)),
        _types.StructuredDatasetType.DatasetColumn(
            "c",
            _types.LiteralType(collection_type=_types.LiteralType(
                simple=_types.SimpleType.INTEGER))),
        _types.StructuredDatasetType.DatasetColumn(
            "d",
            _types.LiteralType(map_value_type=_types.LiteralType(
                simple=_types.SimpleType.INTEGER))),
    ]
    ds = literals.StructuredDataset(
        uri="s3://bucket",
        metadata=literals.StructuredDatasetMetadata(
            structured_dataset_type=_types.StructuredDatasetType(
                columns=my_cols, format="parquet")),
    )
    obj = literals.Scalar(structured_dataset=ds)
    assert obj.error is None
    assert obj.blob is None
    assert obj.binary is None
    assert obj.schema is None
    assert obj.none_type is None
    assert obj.structured_dataset is not None
    assert obj.value.uri == "s3://bucket"
    assert len(obj.value.metadata.structured_dataset_type.columns) == 4
    obj2 = literals.Scalar.from_flyte_idl(obj.to_flyte_idl())
    assert obj == obj2
    assert obj2.blob is None
    assert obj2.binary is None
    assert obj2.schema is None
    assert obj2.none_type is None
    assert obj2.structured_dataset is not None
    assert obj2.value.uri == "s3://bucket"
    assert len(obj2.value.metadata.structured_dataset_type.columns) == 4
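Example #12 builds a literal by hand and round-trips the whole Scalar through protobuf. The same to_flyte_idl / from_flyte_idl pair works on the StructuredDataset model alone, which can be a quicker sanity check (a sketch using the same imports as the test):

ds = literals.StructuredDataset(
    uri="s3://bucket",
    metadata=literals.StructuredDatasetMetadata(
        structured_dataset_type=_types.StructuredDatasetType(columns=[], format="parquet")),
)
# Model objects compare by their serialized form, so equality here means the
# literal survived the proto round trip intact.
assert literals.StructuredDataset.from_flyte_idl(ds.to_flyte_idl()) == ds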
Example #13
    def encode(
        self,
        ctx: FlyteContext,
        structured_dataset: StructuredDataset,
        structured_dataset_type: StructuredDatasetType,
    ) -> literals.StructuredDataset:
        df = typing.cast(pl.DataFrame, structured_dataset.dataframe)

        local_dir = ctx.file_access.get_random_local_directory()
        local_path = f"{local_dir}/00000"

        # Polars 0.13.12 deprecated to_parquet in favor of write_parquet
        if hasattr(df, "write_parquet"):
            df.write_parquet(local_path)
        else:
            df.to_parquet(local_path)
        remote_dir = typing.cast(str, structured_dataset.uri) or ctx.file_access.get_random_remote_directory()
        ctx.file_access.upload_directory(local_dir, remote_dir)
        return literals.StructuredDataset(
            uri=remote_dir,
            metadata=StructuredDatasetMetadata(structured_dataset_type))
Example #14
    def to_python_value(self, ctx: FlyteContext, lv: Literal,
                        expected_python_type: Type[T]) -> T:
        """
        The only tricky thing with converting a Literal (say the output of an earlier task), to a Python value at
        the start of a task execution, is the column subsetting behavior. For example, if you have,

        def t1() -> Annotated[StructuredDataset, kwtypes(col_a=int, col_b=float)]: ...
        def t2(in_a: Annotated[StructuredDataset, kwtypes(col_b=float)]): ...

        where t2(in_a=t1()), when t2 does in_a.open(pd.DataFrame).all(), it should get a DataFrame
        with only one column.

        +-----------------------------+-----------------------------------------+--------------------------------------+
        |                             |          StructuredDatasetType of the incoming Literal                         |
        +-----------------------------+-----------------------------------------+--------------------------------------+
        | StructuredDatasetType       | Has columns defined                     |  [] columns or None                  |
        | of currently running task   |                                         |                                      |
        +=============================+=========================================+======================================+
        |    Has columns              | The StructuredDatasetType passed to the decoder will have the columns          |
        |    defined                  | as defined by the type annotation of the currently running task.               |
        |                             |                                                                                |
        |                             | Decoders **should** then subset the incoming data to the columns requested.    |
        |                             |                                                                                |
        +-----------------------------+-----------------------------------------+--------------------------------------+
        |   [] columns or None        | StructuredDatasetType passed to decoder | StructuredDatasetType passed to the  |
        |                             | will have the columns from the incoming | decoder will have an empty list of   |
        |                             | Literal. This is the scenario where     | columns.                             |
        |                             | the Literal returned by the running     |                                      |
        |                             | task will have more information than    |                                      |
        |                             | the running task's signature.           |                                      |
        +-----------------------------+-----------------------------------------+--------------------------------------+
        """
        # Detect annotations and extract out all the relevant information that the user might supply
        expected_python_type, column_dict, storage_fmt, pa_schema = extract_cols_and_format(expected_python_type)

        # The literal that we get in might be an old FlyteSchema.
        # We'll continue to support this for the time being. There is some duplicated logic here but let's
        # keep it copy/pasted for clarity
        if lv.scalar.schema is not None:
            schema_columns = lv.scalar.schema.type.columns

            # See the repeated logic below for comments
            if column_dict is None or len(column_dict) == 0:
                final_dataset_columns = []
                if schema_columns is not None and schema_columns != []:
                    for c in schema_columns:
                        final_dataset_columns.append(
                            StructuredDatasetType.DatasetColumn(
                                name=c.name,
                                literal_type=LiteralType(
                                    simple=convert_schema_type_to_structured_dataset_type(c.type),
                                ),
                            ))
                # Dataframe will always be serialized to parquet file by FlyteSchema transformer
                new_sdt = StructuredDatasetType(columns=final_dataset_columns,
                                                format=PARQUET)
            else:
                final_dataset_columns = self._convert_ordered_dict_of_columns_to_list(
                    column_dict)
                # Dataframe will always be serialized to parquet file by FlyteSchema transformer
                new_sdt = StructuredDatasetType(columns=final_dataset_columns,
                                                format=PARQUET)

            metad = literals.StructuredDatasetMetadata(
                structured_dataset_type=new_sdt)
            sd_literal = literals.StructuredDataset(
                uri=lv.scalar.schema.uri,
                metadata=metad,
            )

            if issubclass(expected_python_type, StructuredDataset):
                sd = StructuredDataset(dataframe=None, metadata=metad)
                sd._literal_sd = sd_literal
                return sd
            else:
                return self.open_as(ctx, sd_literal, expected_python_type,
                                    metad)

        # Start handling for StructuredDataset scalars, first look at the columns
        incoming_columns = lv.scalar.structured_dataset.metadata.structured_dataset_type.columns

        # If the incoming literal also doesn't have columns, we end up with an empty list, so initialize it here
        final_dataset_columns = []
        # If the currently running task's input does not have columns defined, or has an empty list of columns,
        if column_dict is None or len(column_dict) == 0:
            # but the incoming literal does, then copy the incoming columns over
            if incoming_columns is not None and incoming_columns != []:
                final_dataset_columns = incoming_columns.copy()
        # If the current running task's input does have columns defined
        else:
            final_dataset_columns = self._convert_ordered_dict_of_columns_to_list(
                column_dict)

        new_sdt = StructuredDatasetType(
            columns=final_dataset_columns,
            format=lv.scalar.structured_dataset.metadata.structured_dataset_type.format,
            external_schema_type=lv.scalar.structured_dataset.metadata.structured_dataset_type.external_schema_type,
            external_schema_bytes=lv.scalar.structured_dataset.metadata.structured_dataset_type.external_schema_bytes,
        )
        metad = StructuredDatasetMetadata(structured_dataset_type=new_sdt)

        # A StructuredDataset type, for example
        #   t1(input_a: StructuredDataset)  # or
        #   t1(input_a: Annotated[StructuredDataset, my_cols])
        if issubclass(expected_python_type, StructuredDataset):
            sd = expected_python_type(
                dataframe=None,
                # Note that the metadata passed in here reflects the currently running task's annotated type
                metadata=metad,
            )
            sd._literal_sd = lv.scalar.structured_dataset
            sd.file_format = metad.structured_dataset_type.format
            return sd

        # If the requested type was not a StructuredDataset, then it means it was a plain dataframe type, which means
        # we should do the opening/downloading and whatever else it might entail right now. No iteration option here.
        return self.open_as(ctx,
                            lv.scalar.structured_dataset,
                            df_type=expected_python_type,
                            updated_metadata=metad)
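To make the docstring's subsetting table concrete, here is a sketch of the t1/t2 pair it describes (hedged: as the table notes, decoders **should** subset, so whether the frame really shrinks is decoder-dependent):

from typing_extensions import Annotated

import pandas as pd

from flytekit import kwtypes, task
from flytekit.types.structured.structured_dataset import StructuredDataset


@task
def t1() -> Annotated[StructuredDataset, kwtypes(col_a=int, col_b=float)]:
    df = pd.DataFrame({"col_a": [1, 2], "col_b": [1.0, 2.0]})
    return StructuredDataset(dataframe=df)


@task
def t2(in_a: Annotated[StructuredDataset, kwtypes(col_b=float)]) -> int:
    # The running task declares columns, so the decoder is asked for exactly
    # those columns and the materialized frame should contain just col_b.
    df = in_a.open(pd.DataFrame).all()
    return len(df.columns)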
Example #15
    def to_literal(
        self,
        ctx: FlyteContext,
        python_val: Union[StructuredDataset, typing.Any],
        python_type: Union[Type[StructuredDataset], Type],
        expected: LiteralType,
    ) -> Literal:
        # Make a copy in case we need to hand off to encoders, since we can't be sure of mutations.
        # Check first to see if it's even an SD type. For backwards compatibility, we may be getting a FlyteSchema
        python_type, *attrs = extract_cols_and_format(python_type)
        # In case it's a FlyteSchema
        sdt = StructuredDatasetType(
            format=self.DEFAULT_FORMATS.get(python_type, None))

        if expected and expected.structured_dataset_type:
            sdt = StructuredDatasetType(
                columns=expected.structured_dataset_type.columns,
                format=expected.structured_dataset_type.format,
                external_schema_type=expected.structured_dataset_type.external_schema_type,
                external_schema_bytes=expected.structured_dataset_type.external_schema_bytes,
            )

        # If the type signature has the StructuredDataset class, it will, or at least should, also be a
        # StructuredDataset instance.
        if issubclass(python_type, StructuredDataset) and isinstance(python_val, StructuredDataset):
            # There are three cases that we need to take care of here.

            # 1. A task returns a StructuredDataset that was just a passthrough input. If this happens
            # then return the original literals.StructuredDataset without invoking any encoder
            #
            # Ex.
            #   def t1(dataset: Annotated[StructuredDataset, my_cols]) -> Annotated[StructuredDataset, my_cols]:
            #       return dataset
            if python_val._literal_sd is not None:
                if python_val.dataframe is not None:
                    raise ValueError(
                        f"Shouldn't have specified both literal {python_val._literal_sd} and dataframe {python_val.dataframe}"
                    )
                return Literal(scalar=Scalar(
                    structured_dataset=python_val._literal_sd))

            # 2. A task returns a python StructuredDataset with a uri.
            # Note: this case is also what happens when we start a local execution of a task with a python StructuredDataset.
            #  It gets converted into a literal first, then back into a python StructuredDataset.
            #
            # Ex.
            #   def t2(uri: str) -> Annotated[StructuredDataset, my_cols]:
            #       return StructuredDataset(uri=uri)
            if python_val.dataframe is None:
                if not python_val.uri:
                    raise ValueError(
                        f"If dataframe is not specified, then the uri should be specified. {python_val}"
                    )
                sd_model = literals.StructuredDataset(
                    uri=python_val.uri,
                    metadata=StructuredDatasetMetadata(
                        structured_dataset_type=sdt),
                )
                return Literal(scalar=Scalar(structured_dataset=sd_model))

            # 3. This is the third and probably most common case. The python StructuredDataset object wraps a dataframe
            # that we will need to invoke an encoder for. Figure out which encoder to call and invoke it.
            df_type = type(python_val.dataframe)
            if python_val.uri is None:
                protocol = self.DEFAULT_PROTOCOLS[df_type]
            else:
                protocol = protocol_prefix(python_val.uri)
            return self.encode(
                ctx,
                python_val,
                df_type,
                protocol,
                sdt.format or typing.cast(StructuredDataset, python_val).DEFAULT_FILE_FORMAT,
                sdt,
            )

        # Otherwise assume it's a dataframe instance. Wrap it with some defaults
        fmt = self.DEFAULT_FORMATS[python_type]
        protocol = self.DEFAULT_PROTOCOLS[python_type]
        meta = StructuredDatasetMetadata(
            structured_dataset_type=expected.structured_dataset_type if expected else None)

        sd = StructuredDataset(dataframe=python_val, metadata=meta)
        return self.encode(ctx, sd, python_type, protocol, fmt, sdt)
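The three branches above correspond to three ways a task can hand back a StructuredDataset. Sketched as tasks, following the Ex. comments in the code (hedged; task names are illustrative):

from typing_extensions import Annotated

import pandas as pd

from flytekit import kwtypes, task
from flytekit.types.structured.structured_dataset import StructuredDataset

my_cols = kwtypes(a=int)


@task
def passthrough(dataset: Annotated[StructuredDataset, my_cols]) -> Annotated[StructuredDataset, my_cols]:
    return dataset  # case 1: _literal_sd already set, no encoder is invoked


@task
def from_uri(uri: str) -> Annotated[StructuredDataset, my_cols]:
    return StructuredDataset(uri=uri)  # case 2: uri only, the literal just points at it


@task
def from_dataframe() -> Annotated[StructuredDataset, my_cols]:
    # case 3: a wrapped dataframe; an encoder is selected by (df type, protocol, format)
    return StructuredDataset(dataframe=pd.DataFrame({"a": [1, 2]}))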