def test_map_expr_literal_keys_dupe_exception():
    data_gen = [('a', StringGen(nullable=False)), ('b', StringGen(nullable=False))]
    assert_gpu_and_cpu_error(
        lambda spark: gen_df(spark, data_gen).selectExpr(
            'map("key1", b, "key1", a) as m1').collect(),
        conf={'spark.sql.mapKeyDedupPolicy': 'EXCEPTION'},
        error_message="Duplicate map key")
def test_cast_string_to_day_time_interval_exception(invalid_string):
    dtType = DayTimeIntervalType(0, 3)
    def fun(spark):
        data = [invalid_string]
        df = spark.createDataFrame(data, StringType())
        return df.select(f.col('value').cast(dtType)).collect()
    assert_gpu_and_cpu_error(fun, {}, "java.lang.IllegalArgumentException")
def test_regexp_extract_idx_negative():
    gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}')
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, gen).selectExpr(
            'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)$", -1)').collect(),
        error_message="The specified group index cannot be less than zero",
        conf=_regexp_conf)
def test_sequence_too_long_sequence(stop_gen):
    assert_gpu_and_cpu_error(
        # To avoid OOM, reduce the row number to 1; it is enough to verify this case.
        lambda spark: unary_op_df(spark, stop_gen, 1).selectExpr(
            "sequence(0, a)").collect(),
        conf={},
        error_message="Too long sequence")
def test_simple_get_map_value_ansi_fail(data_gen):
    message = "org.apache.spark.SparkNoSuchElementException" if is_databricks104_or_later() \
        else "java.util.NoSuchElementException"
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'a["NOT_FOUND"]').collect(),
        conf=ansi_enabled_conf,
        error_message=message)
def test_regexp_extract_idx_out_of_bounds():
    gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}')
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, gen).selectExpr(
            'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)$", 4)').collect(),
        error_message="Regex group count is 3, but the specified group index is 4",
        conf=_regexp_conf)
def test_array_element_at_ansi_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).select(
            element_at(col('a'), 100)).collect(),
        conf={'spark.sql.ansi.enabled': True,
              'spark.sql.legacy.allowNegativeScaleOfDecimal': True},
        error_message='java.lang.ArrayIndexOutOfBoundsException')
def test_map_element_at_ansi_fail(data_gen):
    message = "org.apache.spark.SparkNoSuchElementException" \
        if (not is_before_spark_330() or is_databricks104_or_later()) \
        else "java.util.NoSuchElementException"
    # For 3.3.0+ strictIndexOperator should not affect element_at
    test_conf = copy_and_update(ansi_enabled_conf, {'spark.sql.ansi.strictIndexOperator': 'false'})
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'element_at(a, "NOT_FOUND")').collect(),
        conf=test_conf,
        error_message=message)
def test_dateaddinterval_ansi_exception():
    assert_gpu_and_cpu_error(
        # specify a non-zero `seconds` part so adding the interval to a date fails in ANSI mode
        lambda spark: unary_op_df(
            spark, DateGen(start=date(200, 1, 1), end=date(800, 1, 1)), seed=1)
            .selectExpr('a + (interval {} days {} seconds)'.format(1, 5)).collect(),
        conf=copy_and_update(ansi_enabled_conf, legacy_interval_enabled_conf),
        error_message="IllegalArgumentException")
def test_map_element_at_ansi_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'element_at(a, "NOT_FOUND")').collect(),
        conf={'spark.sql.ansi.enabled': True,
              'spark.sql.legacy.allowNegativeScaleOfDecimal': True},
        error_message='java.util.NoSuchElementException')
def test_array_item_ansi_fail_invalid_index(index):
    message = "SparkArrayIndexOutOfBoundsException" if is_databricks104_or_later() \
        else "java.lang.ArrayIndexOutOfBoundsException"
    if isinstance(index, int):
        test_func = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(col('a')[index]).collect()
    else:
        test_func = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr('a[b]').collect()
    assert_gpu_and_cpu_error(
        test_func,
        conf=ansi_enabled_conf,
        error_message=message)
def test_div_overflow_exception_when_ansi(expr, ansi_enabled):
    ansi_conf = {'spark.sql.ansi.enabled': ansi_enabled}
    if ansi_enabled == 'true':
        assert_gpu_and_cpu_error(
            df_fun=lambda spark: _get_div_overflow_df(spark, expr).collect(),
            conf=ansi_conf,
            error_message='java.lang.ArithmeticException: Overflow in integral divide')
    else:
        assert_gpu_and_cpu_are_equal_collect(
            func=lambda spark: _get_div_overflow_df(spark, expr),
            conf=ansi_conf)
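# The test above relies on a `_get_div_overflow_df` helper defined elsewhere in the
# suite. As a minimal sketch only (not the suite's actual helper): the only 64-bit
# integral division that overflows is LONG_MIN / -1, so a single row is enough. The
# name `_example_div_overflow_df`, the column names, and the hard-coded constant below
# are assumptions for illustration.
def _example_div_overflow_df(spark, expr):
    # -9223372036854775808 / -1 cannot be represented as a signed 64-bit long, so
    # evaluating an expression such as 'a div b' raises under ANSI mode.
    return spark.createDataFrame([(-9223372036854775808, -1)], ['a', 'b']).selectExpr(expr)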
def test_array_element_at_zero_index_fail(index, ansi_enabled):
    message = "SQL array indices start at 1"
    if isinstance(index, int):
        test_func = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(
            element_at(col('a'), index)).collect()
    else:
        test_func = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr(
            'element_at(a, b)').collect()
    assert_gpu_and_cpu_error(
        test_func,
        conf={'spark.sql.ansi.enabled': ansi_enabled},
        error_message=message)
def _test_div_by_zero(ansi_mode, expr):
    ansi_conf = {'spark.sql.ansi.enabled': ansi_mode == 'ansi'}
    data_gen = lambda spark: two_col_df(
        spark, IntegerGen(), IntegerGen(min_val=0, max_val=0), length=1)
    div_by_zero_func = lambda spark: data_gen(spark).selectExpr(expr)
    if ansi_mode == 'ansi':
        assert_gpu_and_cpu_error(
            df_fun=lambda spark: div_by_zero_func(spark).collect(),
            conf=ansi_conf,
            error_message='java.lang.ArithmeticException: divide by zero')
    else:
        assert_gpu_and_cpu_are_equal_collect(div_by_zero_func, ansi_conf)
def test_simple_get_map_value_with_strict_index(strict_index, data_gen):
    message = "org.apache.spark.SparkNoSuchElementException"
    test_conf = copy_and_update(ansi_enabled_conf, {'spark.sql.ansi.strictIndexOperator': strict_index})
    if strict_index == 'true':
        assert_gpu_and_cpu_error(
            lambda spark: unary_op_df(spark, data_gen).selectExpr(
                'a["NOT_FOUND"]').collect(),
            conf=test_conf,
            error_message=message)
    else:
        assert_gpu_and_cpu_are_equal_collect(
            lambda spark: unary_op_df(spark, data_gen).selectExpr(
                'a["NOT_FOUND"]'),
            conf=test_conf)
def test_array_element_at_ansi_fail_invalid_index(index):
    message = "ArrayIndexOutOfBoundsException" if is_before_spark_330() \
        else "SparkArrayIndexOutOfBoundsException"
    if isinstance(index, int):
        test_func = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(
            element_at(col('a'), index)).collect()
    else:
        test_func = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr(
            'element_at(a, b)').collect()
    # For 3.3.0+ strictIndexOperator should not affect element_at
    test_conf = copy_and_update(ansi_enabled_conf, {'spark.sql.ansi.strictIndexOperator': 'false'})
    assert_gpu_and_cpu_error(
        test_func,
        conf=test_conf,
        error_message=message)
def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, ansi_enabled,
                                 time_parser_policy, spark_tmp_table_factory):
    updated_conf = copy_and_update(_enable_all_types_conf,
                                   {'spark.sql.ansi.enabled': ansi_enabled,
                                    'spark.sql.legacy.timeParserPolicy': time_parser_policy})
    f = read_func(std_input_path + '/' + filename, schema, spark_tmp_table_factory, {})
    if time_parser_policy == 'EXCEPTION':
        assert_gpu_and_cpu_error(
            df_fun=lambda spark: f(spark).collect(),
            conf=updated_conf,
            error_message='DateTimeException')
    elif time_parser_policy == 'LEGACY' and ansi_enabled == 'true':
        assert_gpu_fallback_collect(f, 'FileSourceScanExec', conf=updated_conf)
    else:
        assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf)
def test_array_item_with_strict_index(strict_index_enabled, index):
    message = "SparkArrayIndexOutOfBoundsException"
    if isinstance(index, int):
        test_df = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(col('a')[index])
    else:
        test_df = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr('a[b]')
    test_conf = copy_and_update(
        ansi_enabled_conf,
        {'spark.sql.ansi.strictIndexOperator': strict_index_enabled})
    if strict_index_enabled:
        assert_gpu_and_cpu_error(
            lambda spark: test_df(spark).collect(),
            conf=test_conf,
            error_message=message)
    else:
        assert_gpu_and_cpu_are_equal_collect(
            test_df,
            conf=test_conf)
def test_logical_with_side_effect(ansi_enabled, lhs_arg, int_arg, logic_op):
    def do_it(spark, lhs_bool_arg, arith_arg, op):
        schema = StructType([
            StructField("a", BooleanType()),
            StructField("b", BooleanType()),
            StructField("c", IntegerType())])
        return spark.createDataFrame(
            [(False, True, arith_arg), (False, True, 1), (False, True, -5)],
            schema=schema
        ).selectExpr('{} {} (c + 2) > 0'.format(lhs_bool_arg, op))
    ansi_conf = {'spark.sql.ansi.enabled': ansi_enabled}
    bypass_map = {'AND': 'a', 'OR': 'b'}
    expect_error = int_arg == INT_MAX and (lhs_arg == 'NULL' or bypass_map[logic_op] != lhs_arg)
    if ansi_enabled == 'true' and expect_error:
        assert_gpu_and_cpu_error(
            df_fun=lambda spark: do_it(spark, lhs_arg, int_arg, logic_op).collect(),
            conf=ansi_conf,
            error_message="ArithmeticException")
    else:
        assert_gpu_and_cpu_are_equal_collect(
            func=lambda spark: do_it(spark, lhs_arg, int_arg, logic_op),
            conf=ansi_conf)
def test_read_valid_and_invalid_dates(std_input_path, filename, v1_enabled_list, ansi_enabled,
                                      time_parser_policy):
    data_path = std_input_path + '/' + filename
    updated_conf = copy_and_update(_enable_all_types_conf,
                                   {'spark.sql.sources.useV1SourceList': v1_enabled_list,
                                    'spark.sql.ansi.enabled': ansi_enabled,
                                    'spark.sql.legacy.timeParserPolicy': time_parser_policy})
    if time_parser_policy == 'EXCEPTION':
        assert_gpu_and_cpu_error(
            lambda spark: spark.read \
                .schema(_date_schema) \
                .csv(data_path) \
                .collect(),
            conf=updated_conf,
            error_message='DateTimeException')
    else:
        assert_gpu_and_cpu_are_equal_collect(
            lambda spark: spark.read \
                .schema(_date_schema) \
                .csv(data_path),
            conf=updated_conf)
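# The CSV test above reads with a `_date_schema` defined elsewhere in the suite. A
# minimal sketch, assuming a single DATE column; the variable name
# `_example_date_schema` and the column name are assumptions for illustration.
_example_date_schema = StructType([StructField('number', DateType())])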
def test_gettimestamp_ansi_exception():
    assert_gpu_and_cpu_error(
        lambda spark: invalid_date_string_df(spark).select(
            f.to_date(f.col("a"), "yyyy-MM-dd")).collect(),
        error_message="Exception",
        conf=ansi_enabled_conf)
def test_string_unix_timestamp_ansi_exception():
    assert_gpu_and_cpu_error(
        lambda spark: invalid_date_string_df(spark).select(
            f.unix_timestamp(f.col('a'), 'yyyy/MM/dd')).collect(),
        error_message="Exception",
        conf=ansi_enabled_conf)
def test_string_to_unix_timestamp_ansi_exception():
    assert_gpu_and_cpu_error(
        lambda spark: invalid_date_string_df(spark).selectExpr(
            "to_unix_timestamp(a, '{}')".format('yyyy/MM/dd')).collect(),
        error_message="Exception",
        conf=ansi_enabled_conf)
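# The three ANSI exception tests above all build their input with an
# `invalid_date_string_df` helper defined elsewhere in the suite. A minimal sketch,
# assuming it only needs one string column whose value cannot be parsed with patterns
# like 'yyyy-MM-dd' or 'yyyy/MM/dd'; the name `_example_invalid_date_string_df` and the
# literal value are assumptions for illustration.
def _example_invalid_date_string_df(spark):
    # A single unparseable row is enough to make to_date/unix_timestamp raise on both
    # CPU and GPU when ANSI mode is enabled.
    return spark.createDataFrame([['invalid_date']], "a string")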
def test_transform_keys_null_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'transform_keys(a, (key, value) -> CAST(null as INT))').collect(),
        conf={},
        error_message='Cannot use null as map key')
def test_transform_keys_duplicate_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'transform_keys(a, (key, value) -> 1)').collect(),
        conf={},
        error_message='Duplicate map key')
def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen):
    assert_gpu_and_cpu_error(
        lambda spark: three_col_df(spark, start_gen, stop_gen, step_gen
            ).selectExpr("sequence(a, b, c)").collect(),
        conf={},
        error_message="Illegal sequence boundaries")
def test_cast_double_to_timestamp(type):
    def fun(spark):
        data = [float("inf"), float("-inf"), float("nan")]
        # use the parameterized floating-point element type rather than hard-coding DoubleType
        df = spark.createDataFrame(data, type)
        return df.select(f.col('value').cast(TimestampType())).collect()
    assert_gpu_and_cpu_error(fun, {"spark.sql.ansi.enabled": True}, "java.time.DateTimeException")
def test_re_replace_backrefs_idx_out_of_bounds():
    gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}')
    assert_gpu_and_cpu_error(lambda spark: unary_op_df(spark, gen).selectExpr(
        'REGEXP_REPLACE(a, "(T)(E)(S)(T)", "[$5]")').collect(),
        conf=_regexp_conf,
        error_message='')