def test_as_spark_type_extension_dtypes(self):
    """Nullable integer extension dtypes must map to the matching Spark integral types."""
    from pandas import Int16Dtype, Int32Dtype, Int64Dtype, Int8Dtype

    expected_pairs = [
        (Int8Dtype(), ByteType()),
        (Int16Dtype(), ShortType()),
        (Int32Dtype(), IntegerType()),
        (Int64Dtype(), LongType()),
    ]
    for pandas_dtype, expected_spark_type in expected_pairs:
        # Both the plain conversion and the (dtype, spark_type) pair lookup
        # must agree for every extension dtype.
        self.assertEqual(as_spark_type(pandas_dtype), expected_spark_type)
        self.assertEqual(
            koalas_dtype(pandas_dtype), (pandas_dtype, expected_spark_type)
        )
def integral_extension_dtypes(self):
    """Return the integral extension dtypes to test against.

    Yields both the string aliases ("Int8", ...) and the instantiated
    pandas dtypes; an empty list when the running pandas build does not
    support extension dtypes.
    """
    if not extension_dtypes_available:
        return []
    return [
        "Int8",
        "Int16",
        "Int32",
        "Int64",
        Int8Dtype(),
        Int16Dtype(),
        Int32Dtype(),
        Int64Dtype(),
    ]
def spark_type_to_pandas_dtype(
    spark_type: types.DataType, *, use_extension_dtypes: bool = False
) -> Dtype:
    """Return the given Spark DataType to pandas dtype.

    When ``use_extension_dtypes`` is True (and the running pandas build
    supports them), integral Spark types map to the nullable pandas
    extension dtypes (Int8/16/32/64) instead of plain numpy ints; boolean,
    string and float types likewise map to their extension counterparts
    where available. Otherwise falls through to a numpy dtype, using
    pyarrow's type conversion for anything not special-cased below.
    """
    # NOTE(review): source formatting was lost; the object/float extension
    # branches are reconstructed as nested under the use_extension_dtypes
    # guard — confirm against upstream before relying on this layout.
    if use_extension_dtypes and extension_dtypes_available:
        # IntegralType
        if isinstance(spark_type, types.ByteType):
            return Int8Dtype()
        elif isinstance(spark_type, types.ShortType):
            return Int16Dtype()
        elif isinstance(spark_type, types.IntegerType):
            return Int32Dtype()
        elif isinstance(spark_type, types.LongType):
            return Int64Dtype()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(spark_type, types.BooleanType):
                return BooleanDtype()
            # StringType
            elif isinstance(spark_type, types.StringType):
                return StringDtype()

        # FractionalType
        if extension_float_dtypes_available:
            if isinstance(spark_type, types.FloatType):
                return Float32Dtype()
            elif isinstance(spark_type, types.DoubleType):
                return Float64Dtype()

    # Types pyarrow cannot (or should not) convert directly: represent as
    # generic object dtype.
    if isinstance(
        spark_type,
        (
            types.DateType,
            types.NullType,
            types.ArrayType,
            types.MapType,
            types.StructType,
            types.UserDefinedType,
        ),
    ):
        return np.dtype("object")
    elif isinstance(spark_type, types.TimestampType):
        # pandas' native nanosecond-resolution timestamp.
        return np.dtype("datetime64[ns]")
    else:
        # Everything else: delegate to pyarrow's Spark->arrow->pandas mapping.
        return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
def convert_dtype(schema: Dict[str, str], data: DataFrame) -> DataFrame:
    """Convert all columns in `data` to the appropriate dtype according to `schema`.

    Columns named in ``schema`` but absent from ``data`` are skipped.
    "str" columns are copied as-is; "float" and "int" columns go through
    ``nullable_method_call`` so missing values survive the cast ("int"
    uses pandas' nullable Int32 dtype).

    Raises:
        TypeError: if ``schema`` names a dtype other than str/float/int.
    """
    converted = DataFrame(index=data.index)
    for name, dtype in schema.items():
        if name not in data.columns:
            continue
        source = data[name]
        if dtype == "str":
            converted[name] = source
        elif dtype == "float":
            to_float = partial(nullable_method_call, float, print_exc=False)
            converted[name] = source.apply(to_float).astype(float)
        elif dtype == "int":
            to_int = partial(nullable_method_call, int, print_exc=False)
            converted[name] = source.apply(to_int).astype(Int32Dtype())
        else:
            raise TypeError(f"Unknown dtype {dtype}")
    return converted
""" pybbda data module some data, and blah """ from pandas import Int32Dtype from pybbda.data.sources.lahman.data import LahmanData from pybbda.data.sources.baseball_reference.data import BaseballReferenceData from pybbda.data.sources.retrosheet.data import RetrosheetData from pybbda.data.sources.fangraphs.data import FangraphsData from pybbda.data.sources.statcast.data import StatcastData nullable_int = Int32Dtype() __all__ = [ "LahmanData", "BaseballReferenceData", "RetrosheetData", "FangraphsData", "StatcastData", ]