# NOTE(review): this line is a whitespace-collapsed excerpt. It opens inside a function body
# whose `def` line is not visible here -- presumably the `_test_div_by_zero(ansi_mode, expr)`
# helper that the two tests below call -- and it stops on the unfinished
# `div_overflow_exprs = [` literal.
#
# What the visible code does:
#   * ansi_conf sets 'spark.sql.ansi.enabled' to the result of `ansi_mode == 'ansi'`.
#   * data_gen builds a two-column DataFrame of length 1 whose second column generator is
#     IntegerGen(min_val=0, max_val=0); div_by_zero_func applies `expr` to it via selectExpr.
#   * When ansi_mode is 'ansi', both CPU and GPU are expected to fail with
#     'java.lang.ArithmeticException: divide by zero'; otherwise the collected CPU and GPU
#     results are compared for equality.
#   * test_div_by_zero_ansi is marked xfail when is_before_spark_311() is true
#     (reason: https://github.com/apache/spark/pull/29882); test_div_by_zero_nonansi has no
#     such mark. Both run the expressions '1/0', 'a/0' and 'a/b'.
#   * _get_div_overflow_df creates a single-row DataFrame (LONG_MIN, -1) with columns
#     'a' and 'b' and applies `expr` to it.
ansi_conf = {'spark.sql.ansi.enabled': ansi_mode == 'ansi'} data_gen = lambda spark: two_col_df( spark, IntegerGen(), IntegerGen(min_val=0, max_val=0), length=1) div_by_zero_func = lambda spark: data_gen(spark).selectExpr(expr) if ansi_mode == 'ansi': assert_gpu_and_cpu_error( df_fun=lambda spark: div_by_zero_func(spark).collect(), conf=ansi_conf, error_message='java.lang.ArithmeticException: divide by zero') else: assert_gpu_and_cpu_are_equal_collect(div_by_zero_func, ansi_conf) @pytest.mark.parametrize('expr', ['1/0', 'a/0', 'a/b']) @pytest.mark.xfail(condition=is_before_spark_311(), reason='https://github.com/apache/spark/pull/29882') def test_div_by_zero_ansi(expr): _test_div_by_zero(ansi_mode='ansi', expr=expr) @pytest.mark.parametrize('expr', ['1/0', 'a/0', 'a/b']) def test_div_by_zero_nonansi(expr): _test_div_by_zero(ansi_mode='nonAnsi', expr=expr) def _get_div_overflow_df(spark, expr): return spark.createDataFrame([(LONG_MIN, -1)], ['a', 'b']).selectExpr(expr) div_overflow_exprs = [
from spark_session import is_before_spark_311
from pyspark.sql.types import *
import pyspark.sql.functions as f


# Looks up a mix of keys (including a null key and keys named "key_9" / "NOT_FOUND") in a
# string->string map column and checks that CPU and GPU return the same rows.
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
def test_simple_get_map_value(data_gen):
    lookups = [
        'a["key_0"]',
        'a["key_1"]',
        'a[null]',
        'a["key_9"]',
        'a["NOT_FOUND"]',
        'a["key_5"]',
    ]

    def select_map_values(spark):
        return unary_op_df(spark, data_gen).selectExpr(*lookups)

    assert_gpu_and_cpu_are_equal_collect(select_map_values)


# With ANSI mode enabled, looking up the key "NOT_FOUND" is expected to raise
# java.util.NoSuchElementException on both CPU and GPU. Skipped before Spark 3.1.1.
@pytest.mark.skipif(
    is_before_spark_311(),
    reason="Only in Spark 3.1.1 + ANSI mode, map key throws on no such element")
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
def test_simple_get_map_value_ansi_fail(data_gen):
    ansi_conf = {
        'spark.sql.ansi.enabled': True,
        'spark.sql.legacy.allowNegativeScaleOfDecimal': True,
    }

    def collect_missing_key(spark):
        return unary_op_df(spark, data_gen).selectExpr('a["NOT_FOUND"]').collect()

    assert_gpu_and_cpu_error(
        collect_missing_key,
        conf=ansi_conf,
        error_message='java.util.NoSuchElementException')
# limitations under the License. import pytest from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect from spark_session import is_before_spark_311 from data_gen import * from marks import ignore_order, allow_non_gpu import pyspark.sql.functions as f nested_scalar_mark=pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/1459") @pytest.mark.parametrize('data_gen', [pytest.param((StructGen([['child0', DecimalGen(7, 2)]]), StructGen([['child1', IntegerGen()]])), marks=nested_scalar_mark), (StructGen([['child0', DecimalGen(7, 2)]], nullable=False), StructGen([['child1', IntegerGen()]], nullable=False))], ids=idfn) @pytest.mark.skipif(is_before_spark_311(), reason="This is supported only in Spark 3.1.1+") # This tests the union of DF of structs with different types of cols as long as the struct itself # isn't null. This is a limitation in cudf because we don't support nested types as literals def test_union_struct_missing_children(data_gen): left_gen, right_gen = data_gen assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, left_gen).unionByName(binary_op_df( spark, right_gen), True)) @pytest.mark.parametrize('data_gen', all_gen + [all_basic_struct_gen, StructGen([['child0', DecimalGen(7, 2)]])], ids=idfn) # This tests union of two DFs of two cols each. The types of the left col and right col is the same def test_union(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).union(binary_op_df(spark, data_gen))) @pytest.mark.parametrize('data_gen', all_gen + [pytest.param(all_basic_struct_gen, marks=nested_scalar_mark),
# NOTE(review): this line is a whitespace-collapsed excerpt. It opens in the middle of a
# decorator argument list (the visible tail lists sort orders on column 'a' and closes with
# `ids=idfn`) and it stops in the middle of the call inside test_multi_orderby.
#
# What the visible code does:
#   * test_single_sort_in_part sorts a one-column DataFrame within partitions by `order` and
#     compares CPU and GPU results, passing allow_negative_scale_of_decimal_conf as conf.
#   * orderable_gens_sort lists the generators used for sort tests; float_gen and double_gen
#     are wrapped in pytest.param with an xfail mark that applies when is_before_spark_311()
#     is true (reason string: 'Spark has -0.0 < 0.0 before Spark 3.1'); decimal_gens is
#     appended to the list.
#   * test_multi_orderby orders a two-column DataFrame by 'a' ascending and 'b' descending;
#     the remainder of that call is not visible here.
f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first() ], ids=idfn) def test_single_sort_in_part(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).sortWithinPartitions(order), conf=allow_negative_scale_of_decimal_conf) orderable_gens_sort = [ byte_gen, short_gen, int_gen, long_gen, pytest.param(float_gen, marks=pytest.mark.xfail( condition=is_before_spark_311(), reason='Spark has -0.0 < 0.0 before Spark 3.1')), pytest.param(double_gen, marks=pytest.mark.xfail( condition=is_before_spark_311(), reason='Spark has -0.0 < 0.0 before Spark 3.1')), boolean_gen, timestamp_gen, date_gen, string_gen, null_gen ] + decimal_gens @pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn) def test_multi_orderby(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark: binary_op_df(spark, data_gen).orderBy( f.col('a'), f.col('b').desc()),
def test_single_nested_orderby_with_limit(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).orderBy(order).limit(100), conf = { 'spark.rapids.allowCpuRangePartitioning': False }) @pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn) @pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn) def test_single_sort_in_part(data_gen, order): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).sortWithinPartitions(order), conf = allow_negative_scale_of_decimal_conf) orderable_gens_sort = [byte_gen, short_gen, int_gen, long_gen, pytest.param(float_gen, marks=pytest.mark.xfail(condition=is_before_spark_311(), reason='Spark has -0.0 < 0.0 before Spark 3.1')), pytest.param(double_gen, marks=pytest.mark.xfail(condition=is_before_spark_311(), reason='Spark has -0.0 < 0.0 before Spark 3.1')), boolean_gen, timestamp_gen, date_gen, string_gen, null_gen] + decimal_gens @pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn) def test_multi_orderby(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).orderBy(f.col('a'), f.col('b').desc()), conf = allow_negative_scale_of_decimal_conf) # SPARK CPU itself has issue with negative scale for take ordered and project orderable_gens_sort_without_neg_decimal = [n for n in orderable_gens_sort if not (isinstance(n, DecimalGen) and n.scale < 0)] @pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal, ids=idfn) def test_multi_orderby_with_limit(data_gen): assert_gpu_and_cpu_are_equal_collect(
from marks import ignore_order import pyspark.sql.functions as f nested_scalar_mark = pytest.mark.xfail( reason="https://github.com/NVIDIA/spark-rapids/issues/1459") @pytest.mark.parametrize('data_gen', [ pytest.param((StructGen([['child0', DecimalGen(7, 2)] ]), StructGen([['child1', IntegerGen()]])), marks=nested_scalar_mark), (StructGen([['child0', DecimalGen(7, 2)]], nullable=False), StructGen([['child1', IntegerGen()]], nullable=False)) ], ids=idfn) @pytest.mark.skipif(is_before_spark_311(), reason="This is supported only in Spark 3.1.1+") # This tests the union of DF of structs with different types of cols as long as the struct itself # isn't null. This is a limitation in cudf because we don't support nested types as literals def test_union_struct_missing_children(data_gen): left_gen, right_gen = data_gen assert_gpu_and_cpu_are_equal_collect(lambda spark: binary_op_df( spark, left_gen).unionByName(binary_op_df(spark, right_gen), True)) @pytest.mark.parametrize( 'data_gen', all_gen + [all_basic_struct_gen, StructGen([['child0', DecimalGen(7, 2)]])], ids=idfn)
# Test array_contains() with a literal key that is extracted from the input array of doubles
# that does contain NaNs. Note that the config is still set to indicate that the input has NaNs
# but we verify that the plan is on the GPU despite that if the value being looked up is not a NaN.
@pytest.mark.parametrize('data_gen', [double_gen], ids=idfn)
def test_array_contains_for_nans(data_gen):
    arr_gen = ArrayGen(data_gen)

    def contains_non_nan_value(spark):
        df = three_col_df(spark, arr_gen, data_gen, arr_gen)
        # Take the first array element of each row, drop the NaN ones, and use the first
        # surviving value as the lookup key.
        first_elements = df.select(col('a')[0].alias('t'))
        non_nan_rows = first_elements.filter(~isnan(col('t'))).collect()
        chk_val = non_nan_rows[0][0]
        return df.select(array_contains(col('a'), chk_val))

    assert_gpu_and_cpu_are_equal_collect(contains_non_nan_value)


# Index 100 is read from each array with ANSI mode on; CPU and GPU are both expected to raise
# java.lang.ArrayIndexOutOfBoundsException. Skipped before Spark 3.1.1.
@pytest.mark.skipif(
    is_before_spark_311(),
    reason="Only in Spark 3.1.1 + ANSI mode, array index throws on out of range indexes")
@pytest.mark.parametrize('data_gen', array_gens_sample, ids=idfn)
def test_get_array_item_ansi_fail(data_gen):
    ansi_conf = {
        'spark.sql.ansi.enabled':True,
        'spark.sql.legacy.allowNegativeScaleOfDecimal': True,
    }

    def collect_item_100(spark):
        return unary_op_df(spark, data_gen).select(col('a')[100]).collect()

    assert_gpu_and_cpu_error(
        collect_item_100,
        conf=ansi_conf,
        error_message='java.lang.ArrayIndexOutOfBoundsException')


# Counterpart of the test above for Spark before 3.1.1: the same lookup is compared between
# CPU and GPU instead of being expected to raise.
@pytest.mark.skipif(
    not is_before_spark_311(),
    reason="For Spark before 3.1.1 + ANSI mode, null will be returned instead of an exception if index is out of range")
@pytest.mark.parametrize('data_gen', array_gens_sample, ids=idfn)
def test_get_array_item_ansi_not_fail(data_gen):
    ansi_conf = {
        'spark.sql.ansi.enabled':True,
        'spark.sql.legacy.allowNegativeScaleOfDecimal': True,
    }

    def select_item_100(spark):
        return unary_op_df(spark, data_gen).select(col('a')[100])

    assert_gpu_and_cpu_are_equal_collect(select_item_100, conf=ansi_conf)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) def test_hash_multiple_mode_query_avg_distincts(data_gen, conf): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .selectExpr('avg(distinct a)', 'avg(distinct b)','avg(distinct c)'), conf=conf) @approximate_float @ignore_order @incompat @pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) @pytest.mark.parametrize('parameterless', ['true', pytest.param('false', marks=pytest.mark.xfail( condition=not is_before_spark_311(), reason="parameterless count not supported by default in Spark 3.1+"))]) def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf, parameterless): conf.update({'spark.sql.legacy.allowParameterlessCount': parameterless}) assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=100), "hash_agg_table", 'select avg(a),' + 'avg(distinct b),' + 'avg(distinct c),' + 'sum(distinct a),' + 'count(distinct b),' + 'count(a),' + 'count(),' + 'sum(a),' + 'min(a),'+ 'max(a) from hash_agg_table group by a',