Example #1
def _test_div_by_zero(ansi_mode, expr):
    ansi_conf = {'spark.sql.ansi.enabled': ansi_mode == 'ansi'}
    data_gen = lambda spark: two_col_df(
        spark, IntegerGen(), IntegerGen(min_val=0, max_val=0), length=1)
    div_by_zero_func = lambda spark: data_gen(spark).selectExpr(expr)

    if ansi_mode == 'ansi':
        assert_gpu_and_cpu_error(
            df_fun=lambda spark: div_by_zero_func(spark).collect(),
            conf=ansi_conf,
            error_message='java.lang.ArithmeticException: divide by zero')
    else:
        assert_gpu_and_cpu_are_equal_collect(div_by_zero_func, ansi_conf)


@pytest.mark.parametrize('expr', ['1/0', 'a/0', 'a/b'])
@pytest.mark.xfail(condition=is_before_spark_311(),
                   reason='https://github.com/apache/spark/pull/29882')
def test_div_by_zero_ansi(expr):
    _test_div_by_zero(ansi_mode='ansi', expr=expr)


@pytest.mark.parametrize('expr', ['1/0', 'a/0', 'a/b'])
def test_div_by_zero_nonansi(expr):
    _test_div_by_zero(ansi_mode='nonAnsi', expr=expr)


def _get_div_overflow_df(spark, expr):
    return spark.createDataFrame([(LONG_MIN, -1)], ['a', 'b']).selectExpr(expr)
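
# Illustrative note (added, not from the source): LONG_MIN / -1 is the one
# long-division input that overflows, because negating LONG_MIN yields 2**63,
# one more than LONG_MAX. In pure Python terms:
#
#     assert -(-2 ** 63) == (2 ** 63 - 1) + 1  # quotient exceeds LONG_MAX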


div_overflow_exprs = [
Example #2
import pytest

from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error
from data_gen import *
from spark_session import is_before_spark_311
from pyspark.sql.types import *
import pyspark.sql.functions as f


@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen],
                         ids=idfn)
def test_simple_get_map_value(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).selectExpr(
            'a["key_0"]', 'a["key_1"]', 'a[null]', 'a["key_9"]',
            'a["NOT_FOUND"]', 'a["key_5"]'))


@pytest.mark.skipif(
    is_before_spark_311(),
    reason="Only in Spark 3.1.1 + ANSI mode, map key throws on no such element"
)
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen],
                         ids=idfn)
def test_simple_get_map_value_ansi_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(
            spark, data_gen).selectExpr('a["NOT_FOUND"]').collect(),
        conf={
            'spark.sql.ansi.enabled': True,
            'spark.sql.legacy.allowNegativeScaleOfDecimal': True
        },
        error_message='java.util.NoSuchElementException')
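
# Note (added, not in the source): without ANSI mode the same missing-key
# lookup simply returns null, which is why test_simple_get_map_value above can
# compare CPU and GPU results for 'a["NOT_FOUND"]' rather than expecting an
# exception.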

Example #3

import pytest

from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect
from spark_session import is_before_spark_311
from data_gen import *
from marks import ignore_order, allow_non_gpu
import pyspark.sql.functions as f

nested_scalar_mark = pytest.mark.xfail(
    reason="https://github.com/NVIDIA/spark-rapids/issues/1459")

@pytest.mark.parametrize('data_gen', [pytest.param((StructGen([['child0', DecimalGen(7, 2)]]),
                                                    StructGen([['child1', IntegerGen()]])), marks=nested_scalar_mark),
                                      (StructGen([['child0', DecimalGen(7, 2)]], nullable=False),
                                       StructGen([['child1', IntegerGen()]], nullable=False))], ids=idfn)
@pytest.mark.skipif(is_before_spark_311(), reason="This is supported only in Spark 3.1.1+")
# This tests the union of DF of structs with different types of cols as long as the struct itself
# isn't null. This is a limitation in cudf because we don't support nested types as literals
def test_union_struct_missing_children(data_gen):
    left_gen, right_gen = data_gen
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, left_gen).unionByName(binary_op_df(
            spark, right_gen), True))
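
# Note (added, an assumption based on the Spark API): the second positional
# argument to unionByName above is allowMissingColumns, introduced in Spark
# 3.1, which fills in nulls for columns or struct fields that exist on only
# one side; that is why the test is skipped before Spark 3.1.1.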

@pytest.mark.parametrize('data_gen', all_gen + [all_basic_struct_gen, StructGen([['child0', DecimalGen(7, 2)]])], ids=idfn)
# This tests union of two DFs of two cols each. The types of the left col and right col is the same
def test_union(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark: binary_op_df(spark, data_gen).union(binary_op_df(spark, data_gen)))

@pytest.mark.parametrize('data_gen', all_gen + [pytest.param(all_basic_struct_gen, marks=nested_scalar_mark),
Example #4
@pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen,
                         ids=idfn)
@pytest.mark.parametrize('order', [
    f.col('a').asc(),
    f.col('a').asc_nulls_last(),
    f.col('a').desc(),
    f.col('a').desc_nulls_first()
],
                         ids=idfn)
def test_single_sort_in_part(data_gen, order):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).sortWithinPartitions(order),
        conf=allow_negative_scale_of_decimal_conf)


orderable_gens_sort = [
    byte_gen, short_gen, int_gen, long_gen,
    pytest.param(float_gen,
                 marks=pytest.mark.xfail(
                     condition=is_before_spark_311(),
                     reason='Spark has -0.0 < 0.0 before Spark 3.1')),
    pytest.param(double_gen,
                 marks=pytest.mark.xfail(
                     condition=is_before_spark_311(),
                     reason='Spark has -0.0 < 0.0 before Spark 3.1')),
    boolean_gen, timestamp_gen, date_gen, string_gen, null_gen
] + decimal_gens
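
# Background (added, illustrative): IEEE 754 defines -0.0 == 0.0, yet a total
# sort order must place one before the other. Spark ordered -0.0 before 0.0
# until 3.1 normalized the two values, which is what the float and double
# xfails above track. For example:
#
#     assert -0.0 == 0.0  # numerically equal, but distinct bit patterns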


@pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn)
def test_multi_orderby(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, data_gen).orderBy(
            f.col('a'),
            f.col('b').desc()),
        conf=allow_negative_scale_of_decimal_conf)
Example #5
def test_single_nested_orderby_with_limit(data_gen, order):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).orderBy(order).limit(100),
        conf={
            'spark.rapids.allowCpuRangePartitioning': False
        })
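
# Note (added, an assumption based on the config name): setting
# spark.rapids.allowCpuRangePartitioning to False appears to keep the range
# partitioning for this sort-with-limit on the GPU instead of letting it
# silently fall back to the CPU.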

@pytest.mark.parametrize('data_gen', orderable_gens + orderable_not_null_gen, ids=idfn)
@pytest.mark.parametrize('order', [f.col('a').asc(), f.col('a').asc_nulls_last(), f.col('a').desc(), f.col('a').desc_nulls_first()], ids=idfn)
def test_single_sort_in_part(data_gen, order):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).sortWithinPartitions(order),
        conf=allow_negative_scale_of_decimal_conf)

orderable_gens_sort = [byte_gen, short_gen, int_gen, long_gen,
        pytest.param(float_gen, marks=pytest.mark.xfail(condition=is_before_spark_311(),
            reason='Spark has -0.0 < 0.0 before Spark 3.1')),
        pytest.param(double_gen, marks=pytest.mark.xfail(condition=is_before_spark_311(),
            reason='Spark has -0.0 < 0.0 before Spark 3.1')),
        boolean_gen, timestamp_gen, date_gen, string_gen, null_gen] + decimal_gens

@pytest.mark.parametrize('data_gen', orderable_gens_sort, ids=idfn)
def test_multi_orderby(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: binary_op_df(spark, data_gen).orderBy(
            f.col('a'), f.col('b').desc()),
        conf=allow_negative_scale_of_decimal_conf)

# Spark on the CPU itself has an issue with negative decimal scale for take
# ordered and project, so filter those gens out.
orderable_gens_sort_without_neg_decimal = [
    n for n in orderable_gens_sort
    if not (isinstance(n, DecimalGen) and n.scale < 0)]
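
# Background (added note, illustrative): a negative decimal scale means each
# value is a multiple of a power of ten; with the legacy
# spark.sql.legacy.allowNegativeScaleOfDecimal flag enabled, a
# DecimalType(5, -2) column holds multiples of 100. That is the input shape
# the CPU-side issue above trips on.
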
@pytest.mark.parametrize('data_gen', orderable_gens_sort_without_neg_decimal, ids=idfn)
def test_multi_orderby_with_limit(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
Example #6
from marks import ignore_order
import pyspark.sql.functions as f

nested_scalar_mark = pytest.mark.xfail(
    reason="https://github.com/NVIDIA/spark-rapids/issues/1459")


@pytest.mark.parametrize('data_gen', [
    pytest.param((StructGen([['child0', DecimalGen(7, 2)]
                             ]), StructGen([['child1', IntegerGen()]])),
                 marks=nested_scalar_mark),
    (StructGen([['child0', DecimalGen(7, 2)]], nullable=False),
     StructGen([['child1', IntegerGen()]], nullable=False))
],
                         ids=idfn)
@pytest.mark.skipif(is_before_spark_311(),
                    reason="This is supported only in Spark 3.1.1+")
# This tests the union of DF of structs with different types of cols as long as the struct itself
# isn't null. This is a limitation in cudf because we don't support nested types as literals
def test_union_struct_missing_children(data_gen):
    left_gen, right_gen = data_gen
    assert_gpu_and_cpu_are_equal_collect(lambda spark: binary_op_df(
        spark, left_gen).unionByName(binary_op_df(spark, right_gen), True))


@pytest.mark.parametrize(
    'data_gen',
    all_gen +
    [all_basic_struct_gen,
     StructGen([['child0', DecimalGen(7, 2)]])],
    ids=idfn)
Example #7

# Test array_contains() with a literal key extracted from an input array of doubles
# that does contain NaNs. The config still indicates that the input has NaNs, but we
# verify that the plan stays on the GPU as long as the value being looked up is not a NaN.
@pytest.mark.parametrize('data_gen', [double_gen], ids=idfn)
def test_array_contains_for_nans(data_gen):
    arr_gen = ArrayGen(data_gen)

    def main_df(spark):
        df = three_col_df(spark, arr_gen, data_gen, arr_gen)
        chk_val = df.select(col('a')[0].alias('t')).filter(~isnan(col('t'))).collect()[0][0]
        return df.select(array_contains(col('a'), chk_val))
    assert_gpu_and_cpu_are_equal_collect(main_df)
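
# Why NaN needs care here (added, illustrative): IEEE 754 NaN is not equal to
# itself, so an equality-based contains check can never match a NaN key. That
# is why the lookup value above is explicitly filtered to be non-NaN:
#
#     assert float('nan') != float('nan')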

@pytest.mark.skipif(is_before_spark_311(), reason="Only in Spark 3.1.1 + ANSI mode, array index throws on out of range indexes")
@pytest.mark.parametrize('data_gen', array_gens_sample, ids=idfn)
def test_get_array_item_ansi_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(
            spark, data_gen).select(col('a')[100]).collect(),
        conf={'spark.sql.ansi.enabled': True,
              'spark.sql.legacy.allowNegativeScaleOfDecimal': True},
        error_message='java.lang.ArrayIndexOutOfBoundsException')

@pytest.mark.skipif(not is_before_spark_311(), reason="For Spark before 3.1.1 + ANSI mode, null will be returned instead of an exception if index is out of range")
@pytest.mark.parametrize('data_gen', array_gens_sample, ids=idfn)
def test_get_array_item_ansi_not_fail(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).select(col('a')[100]),
        conf={'spark.sql.ansi.enabled': True,
              'spark.sql.legacy.allowNegativeScaleOfDecimal': True})
Example #8
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs),
                         ids=idfn)
def test_hash_multiple_mode_query_avg_distincts(data_gen, conf):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, data_gen, length=100)
            .selectExpr('avg(distinct a)', 'avg(distinct b)','avg(distinct c)'),
        conf=conf)


@approximate_float
@ignore_order
@incompat
@pytest.mark.parametrize('data_gen', _init_list_no_nans, ids=idfn)
@pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn)
@pytest.mark.parametrize('parameterless', [
    'true',
    pytest.param('false', marks=pytest.mark.xfail(
        condition=not is_before_spark_311(),
        reason="parameterless count not supported by default in Spark 3.1+"))])
def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf, parameterless):
    conf = dict(conf)  # copy so the shared parametrized conf dict is not mutated
    conf.update({'spark.sql.legacy.allowParameterlessCount': parameterless})
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark: gen_df(spark, data_gen, length=100),
        "hash_agg_table",
        'select avg(a),' +
        'avg(distinct b),' +
        'avg(distinct c),' +
        'sum(distinct a),' +
        'count(distinct b),' +
        'count(a),' +
        'count(),' +
        'sum(a),' +
        'min(a),' +
        'max(a) from hash_agg_table group by a',