Example #1
    def test_assert_true(self):
        from py4j.protocol import Py4JJavaError

        from pyspark.sql import Row
        from pyspark.sql.functions import assert_true

        df = self.spark.range(3)

        self.assertEqual(
            df.select(assert_true(df.id < 3)).toDF("val").collect(),
            [Row(val=None), Row(val=None), Row(val=None)],
        )

        with self.assertRaises(Py4JJavaError) as cm:
            df.select(assert_true(df.id < 2, 'too big')).toDF("val").collect()
        self.assertIn("java.lang.RuntimeException", str(cm.exception))
        self.assertIn("too big", str(cm.exception))

        with self.assertRaises(Py4JJavaError) as cm:
            df.select(assert_true(df.id < 2,
                                  df.id * 1e6)).toDF("val").collect()
        self.assertIn("java.lang.RuntimeException", str(cm.exception))
        self.assertIn("2000000", str(cm.exception))

        with self.assertRaises(TypeError) as cm:
            df.select(assert_true(df.id < 2, 5))
        self.assertEqual(
            "errMsg should be a Column or a str, got <class 'int'>",
            str(cm.exception))
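
A note for readers trying assert_true outside the test harness: the following is a minimal standalone sketch, assuming PySpark 3.1+ (where assert_true was added) and a local session; the session bootstrap is not part of the test above.

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import assert_true

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.range(3)

    # assert_true evaluates to NULL on every row where the predicate holds.
    print(df.select(assert_true(df.id < 3).alias("val")).collect())
    # [Row(val=None), Row(val=None), Row(val=None)]

    # On a failing row, collect() raises; the optional errMsg (str or Column)
    # is embedded in the java.lang.RuntimeException message.
    try:
        df.select(assert_true(df.id < 2, "too big")).collect()
    except Exception as e:
        print("too big" in str(e))  # True

    spark.stop()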
Example #2
    def test_create_data_frame_to_pandas_day_time_interval(self):
        # SPARK-37279: Test DayTimeIntervalType in createDataFrame and toPandas
        import datetime

        import pandas as pd
        from pandas.testing import assert_frame_equal
        from pyspark.sql.functions import assert_true, lit

        origin = pd.DataFrame({"a": [datetime.timedelta(microseconds=123)]})
        df = self.spark.createDataFrame(origin)
        df.select(
            assert_true(lit("INTERVAL '0 00:00:00.000123' DAY TO SECOND") == df.a.cast("string"))
        ).collect()

        pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
        assert_frame_equal(origin, pdf)
        assert_frame_equal(pdf, pdf_arrow)
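
The _toPandas_arrow_toggle helper above is internal to Spark's test suite. A hedged approximation, assuming it simply runs toPandas() once with Arrow disabled and once enabled via the spark.sql.execution.arrow.pyspark.enabled conf (the helper name and exact behavior are inferred from its usage, not from its source):

    # Hypothetical stand-in for the suite's _toPandas_arrow_toggle helper.
    def toPandas_arrow_toggle(spark, df):
        spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
        try:
            pdf = df.toPandas()  # plain (non-Arrow) conversion path
        finally:
            spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        pdf_arrow = df.toPandas()  # Arrow-backed conversion path
        return pdf, pdf_arrow

The two assert_frame_equal calls then check that both conversion paths round-trip the DayTimeIntervalType column identically.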
Example #3
    def test_udf_daytime_interval(self):
        # SPARK-37277: Support DayTimeIntervalType in Python UDF
        import datetime

        from pyspark.sql.functions import assert_true, lit, udf
        from pyspark.sql.types import DayTimeIntervalType

        @udf(DayTimeIntervalType(DayTimeIntervalType.DAY, DayTimeIntervalType.SECOND))
        def noop(x):
            assert x == datetime.timedelta(microseconds=123)
            return x

        df = self.spark.createDataFrame(
            [(datetime.timedelta(microseconds=123),)], schema="td interval day to second"
        ).select(noop("td").alias("td"))

        df.select(
            assert_true(lit("INTERVAL '0 00:00:00.000123' DAY TO SECOND") == df.td.cast("string"))
        ).collect()
        self.assertEqual(df.schema[0].dataType.simpleString(), "interval day to second")
        self.assertEqual(df.first()[0], datetime.timedelta(microseconds=123))
Example #4
    def test_pandas_udf_day_time_interval_type(self):
        # SPARK-37277: Test DayTimeIntervalType in pandas UDF
        import datetime

        import pandas as pd
        from pyspark.sql.functions import assert_true, lit, pandas_udf
        from pyspark.sql.types import DayTimeIntervalType

        @pandas_udf(DayTimeIntervalType(DayTimeIntervalType.DAY, DayTimeIntervalType.SECOND))
        def noop(s: pd.Series) -> pd.Series:
            assert s.iloc[0] == datetime.timedelta(microseconds=123)
            return s

        df = self.spark.createDataFrame(
            [(datetime.timedelta(microseconds=123),)], schema="td interval day to second"
        ).select(noop("td").alias("td"))

        df.select(
            assert_true(lit("INTERVAL '0 00:00:00.000123' DAY TO SECOND") == df.td.cast("string"))
        ).collect()
        self.assertEqual(df.schema[0].dataType.simpleString(), "interval day to second")
        self.assertEqual(df.first()[0], datetime.timedelta(microseconds=123))
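
Examples #3 and #4 differ only in execution model: a plain udf is invoked once per row and sees a datetime.timedelta, while a pandas_udf is invoked on Arrow batches and sees a pd.Series of timedeltas. A minimal standalone sketch of the same round trip, assuming Spark 3.3+ (SPARK-37277) and a local session; the noop names here are illustrative, not from the tests above.

    import datetime

    import pandas as pd
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import pandas_udf, udf
    from pyspark.sql.types import DayTimeIntervalType

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame(
        [(datetime.timedelta(microseconds=123),)], schema="td interval day to second"
    )

    # Row-at-a-time UDF: the Python function receives one timedelta per call.
    row_noop = udf(lambda td: td, DayTimeIntervalType())

    # Vectorized UDF: the function receives a whole pandas Series of timedeltas.
    @pandas_udf(DayTimeIntervalType())
    def vec_noop(s: pd.Series) -> pd.Series:
        return s

    df.select(row_noop("td"), vec_noop("td")).show(truncate=False)
    spark.stop()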
Example #5
    def test_shiftrightunsigned(self):
        from pyspark.sql.functions import assert_true, col, shiftRightUnsigned, shiftrightunsigned

        self.spark.range(10).select(
            assert_true(shiftRightUnsigned(col("id"), 2) == shiftrightunsigned(col("id"), 2))
        ).collect()
Example #6
    def test_shiftleft(self):
        from pyspark.sql.functions import assert_true, col, shiftLeft, shiftleft

        self.spark.range(10).select(
            assert_true(shiftLeft(col("id"), 2) == shiftleft(col("id"), 2))
        ).collect()
Example #7
    def test_sum_distinct(self):
        from pyspark.sql.functions import assert_true, col, sum_distinct, sumDistinct

        self.spark.range(10).select(
            assert_true(sum_distinct(col("id")) == sumDistinct(col("id")))
        ).collect()
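
Examples #5 through #7 are parity checks for the Spark 3.2 renaming of camelCase functions to snake_case (shiftLeft -> shiftleft, shiftRightUnsigned -> shiftrightunsigned, sumDistinct -> sum_distinct); the old names were kept as deprecated aliases that delegate to the new ones. A small sketch of observing the deprecation, assuming a 3.2.x/3.3.x release where the camelCase aliases still exist (they may be absent in newer releases, so treat this as illustrative):

    import warnings

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import sum_distinct, sumDistinct

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        col_old = sumDistinct("id")  # deprecated alias: warns, then delegates
    col_new = sum_distinct("id")     # preferred snake_case name
    print([str(w.message) for w in caught])  # expect a deprecation notice
    spark.stop()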