def test_data_type_ops(self):
    """Each (pandas dtype, Spark type) pair must dispatch to its DataTypeOps
    subclass, and an unrecognized Spark type must raise TypeError."""
    any_spark_type = DataType()
    any_dtype = ExtensionDtype()
    cases = [
        (CategoricalDtype(), any_spark_type, CategoricalOps),
        (any_dtype, DecimalType(), DecimalOps),
        (any_dtype, FractionalType(), FractionalOps),
        (any_dtype, IntegralType(), IntegralOps),
        (any_dtype, StringType(), StringOps),
        (any_dtype, BooleanType(), BooleanOps),
        (any_dtype, TimestampType(), DatetimeOps),
        (any_dtype, TimestampNTZType(), DatetimeNTZOps),
        (any_dtype, DateType(), DateOps),
        (any_dtype, DayTimeIntervalType(), TimedeltaOps),
        (any_dtype, BinaryType(), BinaryOps),
        (any_dtype, ArrayType(StringType()), ArrayOps),
        (any_dtype, MapType(StringType(), IntegralType()), MapOps),
        (any_dtype, StructType(), StructOps),
        (any_dtype, NullType(), NullOps),
        (any_dtype, UserDefinedType(), UDTOps),
    ]
    for dtype, spark_type, expected_ops in cases:
        self.assertIsInstance(DataTypeOps(dtype, spark_type), expected_ops)
    # A bare DataType() is not a concrete Spark type, so dispatch must fail.
    unknown_spark_type = any_spark_type
    self.assertRaises(TypeError, DataTypeOps, BooleanType(), unknown_spark_type)
def test_create_data_frame_to_pandas_timestamp_ntz(self):
    # SPARK-36626: Test TimestampNTZ in createDataFrame and toPandas
    with self.sql_conf({"spark.sql.session.timeZone": "America/Los_Angeles"}):
        expected = pd.DataFrame({"a": [datetime.datetime(2012, 2, 2, 2, 2, 2)]})
        schema = StructType([StructField("a", TimestampNTZType(), True)])
        df = self.spark.createDataFrame(expected, schema=schema)
        # An NTZ value must come back verbatim, unshifted by the session zone.
        df.selectExpr("assert_true('2012-02-02 02:02:02' == CAST(a AS STRING))").collect()
        # Round-trip must match with Arrow both disabled and enabled.
        pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
        assert_frame_equal(expected, pdf)
        assert_frame_equal(pdf, pdf_arrow)
def _to_pandas(self):
    """Build a small DataFrame covering the basic Spark types (including
    TimestampNTZ and DayTimeInterval) and convert it via toPandas()."""
    from datetime import date, datetime, timedelta

    schema = StructType(
        [
            StructField("a", IntegerType()),
            StructField("b", StringType()),
            StructField("c", BooleanType()),
            StructField("d", FloatType()),
            StructField("dt", DateType()),
            StructField("ts", TimestampType()),
            StructField("ts_ntz", TimestampNTZType()),
            StructField("dt_interval", DayTimeIntervalType()),
        ]
    )
    rows = [
        (
            1,
            "foo",
            True,
            3.0,
            date(1969, 1, 1),
            datetime(1969, 1, 1, 1, 1, 1),
            datetime(1969, 1, 1, 1, 1, 1),
            timedelta(days=1),
        ),
        # One row of all-null temporal columns to exercise None handling.
        (2, "foo", True, 5.0, None, None, None, None),
        (
            3,
            "bar",
            False,
            -1.0,
            date(2012, 3, 3),
            datetime(2012, 3, 3, 3, 3, 3),
            datetime(2012, 3, 3, 3, 3, 3),
            timedelta(hours=-1, milliseconds=421),
        ),
        (
            4,
            "bar",
            False,
            6.0,
            date(2100, 4, 4),
            datetime(2100, 4, 4, 4, 4, 4),
            datetime(2100, 4, 4, 4, 4, 4),
            timedelta(microseconds=123),
        ),
    ]
    return self.spark.createDataFrame(rows, schema).toPandas()
def test_udf_timestamp_ntz(self):
    # SPARK-36626: Test TimestampNTZ in Python UDF
    @udf(TimestampNTZType())
    def noop(x):
        # The executor-side value must equal the unshifted wall-clock input.
        assert x == datetime.datetime(1970, 1, 1, 0, 0)
        return x

    with self.sql_conf({"spark.sql.session.timeZone": "Pacific/Honolulu"}):
        source = self.spark.createDataFrame(
            [(datetime.datetime(1970, 1, 1, 0, 0),)], schema="dt timestamp_ntz"
        )
        df = source.select(noop("dt").alias("dt"))
        # NTZ must not be shifted by the (non-UTC) session time zone.
        df.selectExpr("assert_true('1970-01-01 00:00:00' == CAST(dt AS STRING))").collect()
        self.assertEqual(df.schema[0].dataType.typeName(), "timestamp_ntz")
        self.assertEqual(df.first()[0], datetime.datetime(1970, 1, 1, 0, 0))
def _to_pandas(self):
    """Build a small DataFrame covering the basic Spark types (including
    TimestampNTZ) and convert it via toPandas()."""
    from datetime import date, datetime

    schema = (
        StructType()
        .add("a", IntegerType())
        .add("b", StringType())
        .add("c", BooleanType())
        .add("d", FloatType())
        .add("dt", DateType())
        .add("ts", TimestampType())
        .add("ts_ntz", TimestampNTZType())
    )
    rows = [
        (
            1,
            "foo",
            True,
            3.0,
            date(1969, 1, 1),
            datetime(1969, 1, 1, 1, 1, 1),
            datetime(1969, 1, 1, 1, 1, 1),
        ),
        # All temporal columns null to exercise None handling.
        (2, "foo", True, 5.0, None, None, None),
        (
            3,
            "bar",
            False,
            -1.0,
            date(2012, 3, 3),
            datetime(2012, 3, 3, 3, 3, 3),
            datetime(2012, 3, 3, 3, 3, 3),
        ),
        (
            4,
            "bar",
            False,
            6.0,
            date(2100, 4, 4),
            datetime(2100, 4, 4, 4, 4, 4),
            datetime(2100, 4, 4, 4, 4, 4),
        ),
    ]
    return self.spark.createDataFrame(rows, schema).toPandas()
def test_toPandas_empty_df_arrow_enabled(self):
    # SPARK-30537 test that toPandas() on an empty dataframe has the correct dtypes
    # when arrow is enabled
    from datetime import date
    from decimal import Decimal

    # Field names follow an alphabetical sequence; the second field was
    # mistakenly also named "a" in the original — fixed to "b" so every
    # column in the schema has a distinct name.
    schema = StructType(
        [
            StructField("a", StringType(), True),
            StructField("b", IntegerType(), True),
            StructField("c", TimestampType(), True),
            StructField("d", NullType(), True),
            StructField("e", LongType(), True),
            StructField("f", FloatType(), True),
            StructField("g", DateType(), True),
            StructField("h", BinaryType(), True),
            StructField("i", DecimalType(38, 18), True),
            StructField("k", TimestampNTZType(), True),
            StructField("L", DayTimeIntervalType(0, 3), True),
        ]
    )
    df = self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(), schema=schema)
    non_empty_df = self.spark.createDataFrame(
        [
            (
                "a",
                1,
                datetime.datetime(1969, 1, 1, 1, 1, 1),
                None,
                10,
                0.2,
                date(1969, 1, 1),
                bytearray(b"a"),
                Decimal("2.0"),
                datetime.datetime(1969, 1, 1, 1, 1, 1),
                datetime.timedelta(microseconds=123),
            )
        ],
        schema=schema,
    )

    pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
    pdf_non_empty, pdf_arrow_non_empty = self._toPandas_arrow_toggle(non_empty_df)
    # The empty conversion must agree with itself (Arrow vs. non-Arrow) and
    # its dtypes must match those inferred from a populated dataframe.
    assert_frame_equal(pdf, pdf_arrow)
    self.assertTrue(pdf_arrow.dtypes.equals(pdf_arrow_non_empty.dtypes))
    self.assertTrue(pdf_arrow.dtypes.equals(pdf_non_empty.dtypes))
def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType:
    """
    Convert pyarrow type to Spark data type.

    Timestamps without a zone map to TimestampNTZType only when
    ``prefer_timestamp_ntz`` is set; otherwise to TimestampType.
    Raises TypeError for Arrow types that have no Spark equivalent.
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    if types.is_boolean(at):
        return BooleanType()
    if types.is_int8(at):
        return ByteType()
    if types.is_int16(at):
        return ShortType()
    if types.is_int32(at):
        return IntegerType()
    if types.is_int64(at):
        return LongType()
    if types.is_float32(at):
        return FloatType()
    if types.is_float64(at):
        return DoubleType()
    if types.is_decimal(at):
        return DecimalType(precision=at.precision, scale=at.scale)
    if types.is_string(at):
        return StringType()
    if types.is_binary(at):
        return BinaryType()
    if types.is_date32(at):
        return DateType()
    if types.is_timestamp(at):
        # A zone-less timestamp becomes NTZ only when the caller opts in.
        if prefer_timestamp_ntz and at.tz is None:
            return TimestampNTZType()
        return TimestampType()
    if types.is_list(at):
        # Timestamp elements are not supported inside arrays.
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        return ArrayType(from_arrow_type(at.value_type))
    if types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        # Timestamps are not supported as map keys or values.
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        return MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    if types.is_struct(at):
        # Only one level of struct nesting is supported.
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [
                StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
                for field in at
            ]
        )
    if types.is_dictionary(at):
        # Dictionary-encoded columns convert as their value type.
        return from_arrow_type(at.value_type)
    if types.is_null(at):
        return NullType()
    raise TypeError("Unsupported type in conversion from Arrow: " + str(at))