Пример #1
0
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> DataFrame:
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
                self.api.float32(): pd.Float32Dtype(),
                self.api.float64(): pd.Float64Dtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Пример #2
0
 def test_numeric_nullable_dtypes(self):
     dtypes = [
         pd.StringDtype(),
         pd.BooleanDtype(),
         pd.Float64Dtype(),
         pd.Float32Dtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
         pd.StringDtype(),
     ]
     # TODO: Re-add (".xml", "xml"),
     # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         # TODO: include xml
         for dtype in dtypes:
             with tmpfile(suffix) as path:
                 try:
                     df = Ind2Col2.convert(
                         Ind2Col2(
                             sample_data_ind2_col2_pd_na())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise
Пример #3
0
    def test_to_table_nullable(self):
        boolean_array = pd.array([True, False, None], dtype=pd.BooleanDtype())
        int8_array = pd.array([1, 2, None], dtype=pd.Int8Dtype())
        int16_array = pd.array([1, 2, None], dtype=pd.Int16Dtype())
        int32_array = pd.array([1, 2, None], dtype=pd.Int32Dtype())
        int64_array = pd.array([1, 2, None], dtype=pd.Int64Dtype())
        float_array = pd.array([1.1, 2.2, None], dtype=pd.Float32Dtype())
        double_array = pd.array([1.1, 2.2, None], dtype=pd.Float64Dtype())
        string_array = pd.array(["s11", "s22", None], dtype=pd.StringDtype())
        object_array = pd.array([pd.NA, "s22", None], dtype=object)

        df = pd.DataFrame({
            "NullableBoolean": boolean_array,
            "NullableInt8": int8_array,
            "NullableInt16": int16_array,
            "NullableInt32": int32_array,
            "NullableInt64": int64_array,
            "NullableFloat": float_array,
            "NullableDouble": double_array,
            "NullableString": string_array,
            "NullableObject": object_array,
        })

        table = to_table(df)
        self.assertIs(table.columns[0].data_type, dtypes.bool_)
        self.assertIs(table.columns[1].data_type, dtypes.int8)
        self.assertIs(table.columns[2].data_type, dtypes.int16)
        self.assertIs(table.columns[3].data_type, dtypes.int32)
        self.assertIs(table.columns[4].data_type, dtypes.int64)
        self.assertIs(table.columns[5].data_type, dtypes.float32)
        self.assertIs(table.columns[6].data_type, dtypes.double)
        self.assertIs(table.columns[7].data_type, dtypes.string)
        self.assertIs(table.columns[8].data_type, dtypes.PyObject)
        self.assertEqual(table.size, 3)
        table_string = table.to_string()
        self.assertEqual(9, table_string.count("null"))
Пример #4
0
                    "foo buzz",
                    "",
                    None,
                    "rapids ai",
                ],
                dtype=pd.StringDtype(),
            ),
        ),
        (
            cudf.Series(
                [1, 2, None, 10.2, None],
                dtype="float32",
            ),
            pd.Series(
                [1, 2, None, 10.2, None],
                dtype=pd.Float32Dtype(),
            ),
        ),
    ],
)
def test_series_to_pandas_nullable_dtypes(sr, expected_psr):
    actual_psr = sr.to_pandas(nullable=True)

    assert_eq(actual_psr, expected_psr)


def test_series_pipe():
    psr = pd.Series([10, 20, 30, 40])
    gsr = cudf.Series([10, 20, 30, 40])

    def custom_add_func(sr, val):
Пример #5
0
pandas_dtypes_to_cudf_dtypes = {
    pd.UInt8Dtype(): np.dtype("uint8"),
    pd.UInt16Dtype(): np.dtype("uint16"),
    pd.UInt32Dtype(): np.dtype("uint32"),
    pd.UInt64Dtype(): np.dtype("uint64"),
    pd.Int8Dtype(): np.dtype("int8"),
    pd.Int16Dtype(): np.dtype("int16"),
    pd.Int32Dtype(): np.dtype("int32"),
    pd.Int64Dtype(): np.dtype("int64"),
    pd.BooleanDtype(): np.dtype("bool_"),
    pd.StringDtype(): np.dtype("object"),
}

if PANDAS_GE_120:
    cudf_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype()
    cudf_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype()
    pandas_dtypes_to_cudf_dtypes[pd.Float32Dtype()] = np.dtype("float32")
    pandas_dtypes_to_cudf_dtypes[pd.Float64Dtype()] = np.dtype("float64")

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
    "datetime64[s]",
    "datetime64[ms]",
    "datetime64[us]",
    "datetime64[ns]",
Пример #6
0
                [
                    "abc",
                    "a",
                    None,
                    "hello world",
                    "foo buzz",
                    "",
                    None,
                    "rapids ai",
                ],
                dtype=pd.StringDtype(),
            ),
        ),
        (
            cudf.Series([1, 2, None, 10.2, None], dtype="float32",),
            pd.Series([1, 2, None, 10.2, None], dtype=pd.Float32Dtype(),),
        ),
    ],
)
def test_series_to_pandas_nullable_dtypes(sr, expected_psr):
    actual_psr = sr.to_pandas(nullable=True)

    assert_eq(actual_psr, expected_psr)


def test_series_pipe():
    psr = pd.Series([10, 20, 30, 40])
    gsr = cudf.Series([10, 20, 30, 40])

    def custom_add_func(sr, val):
        new_sr = sr + val
Пример #7
0
    class FLOAT32(FLOAT64):
        """Semantic representation of a :class:`pandas.Float32Dtype`."""

        type = pd.Float32Dtype()
        bit_width: int = 32
Пример #8
0
    builtin_name="float",
    pandera_name="Float",
    sizes=[128, 64, 32, 16] if FLOAT_128_AVAILABLE else [64, 32, 16],
)

if PANDAS_1_2_0_PLUS:

    @Engine.register_dtype(equivalents=[pd.Float64Dtype, pd.Float64Dtype()])
    @immutable
    class FLOAT64(DataType, dtypes.Float):
        """Semantic representation of a :class:`pandas.Float64Dtype`."""

        type = pd.Float64Dtype()
        bit_width: int = 64

    @Engine.register_dtype(equivalents=[pd.Float32Dtype, pd.Float32Dtype()])
    @immutable
    class FLOAT32(FLOAT64):
        """Semantic representation of a :class:`pandas.Float32Dtype`."""

        type = pd.Float32Dtype()
        bit_width: int = 32


# ###############################################################################
# # complex
# ###############################################################################

_register_numpy_numbers(
    builtin_name="complex",
    pandera_name="Complex",
Пример #9
0
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool8: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}

if FLOAT_NAN_IMPLEMENTED:  # pragma: no cover
    _PYTHON_TO_SQL.update({
        pd.Float32Dtype(): SqlTypeName.FLOAT,
        pd.Float64Dtype(): SqlTypeName.FLOAT
    })

# Default mapping between SQL types and python types
# for values
_SQL_TO_PYTHON_SCALARS = {
    "DOUBLE": np.float64,
    "FLOAT": np.float32,
    "DECIMAL": np.float32,
    "BIGINT": np.int64,
    "INTEGER": np.int32,
    "SMALLINT": np.int16,
    "TINYINT": np.int8,
    "BOOLEAN": np.bool8,
    "VARCHAR": str,
Пример #10
0
class DataMapping:
    """
    Map primary data between different supported data frameworks, preserving equivalent data types.

    DataMapping is for primary data, to map metadata types and values use
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.TypeMapping>` and
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.MetadataCodec>`.
    """

    __log = _util.logger_for_namespace(_DataInternal.__module__ +
                                       ".DataMapping")

    # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

    __TRAC_DECIMAL_PRECISION = 38
    __TRAC_DECIMAL_SCALE = 12
    __TRAC_TIMESTAMP_UNIT = "ms"
    __TRAC_TIMESTAMP_ZONE = None

    __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
        _meta.BasicType.BOOLEAN:
        pa.bool_(),
        _meta.BasicType.INTEGER:
        pa.int64(),
        _meta.BasicType.FLOAT:
        pa.float64(),
        _meta.BasicType.DECIMAL:
        pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
        _meta.BasicType.STRING:
        pa.utf8(),
        _meta.BasicType.DATE:
        pa.date32(),
        _meta.BasicType.DATETIME:
        pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
    }

    # Check the Pandas dtypes for handling floats are available before setting up the type mapping
    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype

    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
    __ARROW_TO_PANDAS_TYPE_MAPPING = {
        pa.bool_(): pd.BooleanDtype(),
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        pa.float16(): pd.Float32Dtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.utf8(): pd.StringDtype()
    }

    @staticmethod
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:

        if pa.types.is_boolean(arrow_type):
            return bool

        if pa.types.is_integer(arrow_type):
            return int

        if pa.types.is_floating(arrow_type):
            return float

        if pa.types.is_decimal(arrow_type):
            return decimal.Decimal

        if pa.types.is_string(arrow_type):
            return str

        if pa.types.is_date(arrow_type):
            return dt.date

        if pa.types.is_timestamp(arrow_type):
            return dt.datetime

        raise _ex.ETracInternal(
            f"No Python type mapping available for Arrow type [{arrow_type}]")

    @classmethod
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                                 cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT,
                                cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(
            f"No Arrow type mapping available for Python type [{python_type}]")

    @classmethod
    def trac_to_arrow_type(cls,
                           trac_type: _meta.TypeDescriptor) -> pa.DataType:

        return cls.trac_to_arrow_basic_type(trac_type.basicType)

    @classmethod
    def trac_to_arrow_basic_type(
            cls, trac_basic_type: _meta.BasicType) -> pa.DataType:

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(
            trac_basic_type)

        if arrow_type is None:
            raise _ex.ETracInternal(
                f"No Arrow type mapping available for TRAC type [{trac_basic_type}]"
            )

        return arrow_type

    @classmethod
    def trac_to_arrow_schema(cls,
                             trac_schema: _meta.SchemaDefinition) -> pa.Schema:

        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(
                f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow"
            )

        arrow_fields = [(f.fieldName,
                         cls.trac_to_arrow_basic_type(f.fieldType))
                        for f in trac_schema.table.fields]

        return pa.schema(arrow_fields, metadata={})

    @classmethod
    def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:

        return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION,
                             cls.__TRAC_DECIMAL_SCALE)

    @classmethod
    def pandas_datetime_type(cls):
        return cls.__PANDAS_DATETIME_TYPE

    @classmethod
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal(f"Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(
                f"Data view for part [{part.opaque_key}] does not contain any items"
            )

        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        batches = {
            batch
            for delta in deltas for batch in (
                delta.batches if delta.batches else delta.table.to_batches())
        }

        table = pa.Table.from_batches(batches)  # noqa
        return table.to_pandas()

    @classmethod
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:

        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            table = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(table)

        raise _ex.ETracInternal(f"Data item does not contain any usable data")

    @classmethod
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:

        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)

    @classmethod
    def pandas_to_view(cls, df: pd.DataFrame, prior_view: DataView,
                       part: DataPartKey):

        item = cls.pandas_to_item(df, prior_view.arrow_schema)
        return cls.add_item_to_view(prior_view, part, item)

    @classmethod
    def pandas_to_item(cls, df: pd.DataFrame,
                       schema: tp.Optional[pa.Schema]) -> DataItem:

        table = cls.pandas_to_arrow(df, schema)
        return DataItem(table.schema, table)

    @classmethod
    def pandas_to_arrow(cls,
                        df: pd.DataFrame,
                        schema: tp.Optional[pa.Schema] = None) -> pa.Table:

        # Here we convert the whole Pandas df and then pass it to conformance
        # An optimization would be to filter columns before applying conformance
        # To do this, we'd need the case-insensitive field matching logic, including output of warnings

        # Also, note that schema is not applied in from_pandas
        # This is because the conformance logic allows for a wider range of conversions
        # Applying the schema directly would fail for some types where casting is possible

        if len(df) == 0:
            df_schema = pa.Schema.from_pandas(df, preserve_index=False)  # noqa
            table = pa.Table.from_batches(list(), df_schema)  # noqa
        else:
            table = pa.Table.from_pandas(df, preserve_index=False)  # noqa

        # If there is no explict schema, give back the table exactly as it was received from Pandas
        # There could be an option here to coerce types to the appropriate TRAC standard types
        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type

        if schema is None:
            return table
        else:
            return DataConformance.conform_to_schema(table, schema, df.dtypes)

    @classmethod
    def add_item_to_view(cls, view: DataView, part: DataPartKey,
                         item: DataItem) -> DataView:

        prior_deltas = view.parts.get(part) or list()
        deltas = [*prior_deltas, item]
        parts = {**view.parts, part: deltas}

        return DataView(view.trac_schema, view.arrow_schema, parts)