Example #1
def test_missing_column_names_filter():
    if is_spark_300():
        pytest.skip("Apache Spark 3.0.0 does not handle ORC files without column names")

    with_cpu_session(setup_orc_file_no_column_names)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark : spark.sql("SELECT _col3,_col2 FROM test_orc_data WHERE _col2 = '155'"))
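Every snippet here gates on is_spark_300(), whose definition is not shown. A minimal sketch, assuming the helper reads the version off the active session (the helper's module and exact check are assumptions):

# Hypothetical sketch of the version gate used throughout these examples.
from pyspark.sql import SparkSession

def is_spark_300():
    # True only on Apache Spark 3.0.0, where several of these tests are
    # skipped or expected to fail.
    return SparkSession.getActiveSession().version == '3.0.0'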
Example #2
def test_cache_shuffled_hash_join(data_gen, join_type):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")

    def do_join(spark):
        left, right = create_df(spark, data_gen, 50, 500)
        cached = left.join(right, left.a == right.r_a, join_type).cache()
        cached.count()  # populates the cache
        return cached
    assert_gpu_and_cpu_are_equal_collect(do_join)
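This test (and Examples #3 and #5 below) relies on a create_df helper that is not shown; the r_a column in the join condition implies the right-hand columns are renamed. A minimal sketch, assuming the gen_df/StructGen data-generation utilities referenced elsewhere in these snippets:

# Hypothetical sketch of create_df: a left/right pair from one generator,
# with the right side's columns renamed to avoid ambiguity in the join.
def create_df(spark, data_gen, left_length, right_length):
    left = gen_df(spark, StructGen([('a', data_gen), ('b', data_gen)], nullable=False),
                  length=left_length)
    right = gen_df(spark, StructGen([('a', data_gen), ('b', data_gen)], nullable=False),
                   length=right_length) \
        .withColumnRenamed('a', 'r_a').withColumnRenamed('b', 'r_b')
    return left, right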
Example #3
def test_cache_broadcast_hash_join(data_gen, join_type, enableVectorizedConf):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")

    def do_join(spark):
        left, right = create_df(spark, data_gen, 500, 500)
        cached = left.join(right.hint("broadcast"), left.a == right.r_a, join_type).cache()
        cached.count()  # populates the cache
        return cached

    assert_gpu_and_cpu_are_equal_collect(do_join, conf=enableVectorizedConf)
Example #4
def test_missing_column_names_filter(spark_tmp_table_factory):
    if is_spark_300():
        pytest.skip(
            "Apache Spark 3.0.0 does not handle ORC files without column names"
        )

    table_name = spark_tmp_table_factory.get()
    with_cpu_session(
        lambda spark: setup_orc_file_no_column_names(spark, table_name))
    assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.sql(
        "SELECT _col3,_col2 FROM {} WHERE _col2 = '155'".format(table_name)))
Example #5
def test_cached_join_filter(data_gen, join_type):
    data, filter = data_gen
    if is_spark_300() and data.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")

    def do_join(spark):
        left, right = create_df(spark, data, 500, 500)
        cached = left.join(right, left.a == right.r_a, join_type).cache()
        cached.count()  # populates the cache
        return cached.filter(filter)

    assert_gpu_and_cpu_are_equal_collect(do_join)
Example #6
def test_cache_posexplode_makearray(spark_tmp_path, data_gen, ts_rebase, ts_write):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'
    def write_posExplode(data_path):
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr('posexplode(array(b, c, d))', 'a').cache()
            cached.count()  # populates the cache
            cached.write.parquet(data_path)
            # Return the rows read back so assert_equal below compares real
            # data; as originally written, the read result was discarded and
            # both sessions returned None.
            return spark.read.parquet(data_path).collect()
        return posExplode
    from_cpu = with_cpu_session(write_posExplode(data_path_cpu),
                 conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
                       'spark.sql.parquet.outputTimestampType': ts_write})
    from_gpu = with_gpu_session(write_posExplode(data_path_gpu),
                  conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
                        'spark.sql.parquet.outputTimestampType': ts_write})
    assert_equal(from_cpu, from_gpu)
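The ts_rebase and ts_write parameters feed two real Spark SQL configs, but the parametrize decorators are not part of the snippet. A plausible reconstruction (the exact value lists are assumptions; the configs and the listed values do exist in Apache Spark 3.0):

import pytest

# Hypothetical parametrization for Example #6.
@pytest.mark.parametrize('ts_rebase', ['CORRECTED', 'LEGACY'])
@pytest.mark.parametrize('ts_write', ['TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS'])
def test_cache_posexplode_makearray(spark_tmp_path, data_gen, ts_rebase, ts_write):
    ...  # body as in Example #6 above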
Example #7
def test_cache_posexplode_makearray(spark_tmp_path, data_gen):
    if is_spark_300() and data_gen.data_type == BooleanType():
        pytest.xfail("https://issues.apache.org/jira/browse/SPARK-32672")
    data_path_cpu = spark_tmp_path + '/PARQUET_DATA_CPU'
    data_path_gpu = spark_tmp_path + '/PARQUET_DATA_GPU'

    def write_posExplode(data_path):
        def posExplode(spark):
            cached = four_op_df(spark, data_gen).selectExpr(
                'posexplode(array(b, c, d))', 'a').cache()
            cached.count()  # populates the cache
            cached.write.parquet(data_path)
            # Return the rows read back so assert_equal below compares real
            # data rather than None.
            return spark.read.parquet(data_path).collect()

        return posExplode

    from_cpu = with_cpu_session(write_posExplode(data_path_cpu))
    from_gpu = with_gpu_session(write_posExplode(data_path_gpu))
    assert_equal(from_cpu, from_gpu)
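Examples #6 and #7 build their input with four_op_df, which is not shown. A minimal sketch in the same gen_df/StructGen style assumed earlier (the name bindings and defaults are assumptions):

# Hypothetical sketch of four_op_df: four columns a..d from one generator.
def four_op_df(spark, gen, length=2048, seed=0):
    return gen_df(spark,
                  StructGen([('a', gen), ('b', gen), ('c', gen), ('d', gen)],
                            nullable=False),
                  length=length, seed=seed)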
Example #8
    # The opening of this snippet (decorators and the test def) was truncated;
    # the enclosing helper is inferred from the matching call in
    # test_count_distinct_with_nan_floats below.
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark : gen_df(spark, data_gen, length=1024),
        "hash_agg_table",
        'select a, '
        'count(*) as count_stars, '
        'count(b) as count_bees, '
        'sum(b) as sum_of_bees, '
        'max(c) as max_seas, '
        'min(c) as min_seas, '
        'count(distinct c) as count_distinct_cees, '
        'avg(c) as average_seas '
        'from hash_agg_table group by a',
        _no_nans_float_conf)


@pytest.mark.xfail(
    condition=with_spark_session(lambda spark : is_spark_300()),
    reason="[SPARK-32038][SQL] NormalizeFloatingNumbers should also work on distinct aggregate "
           "(https://github.com/apache/spark/pull/28876) "
           "Fixed in later Apache Spark releases.")
@approximate_float
@ignore_order
@pytest.mark.parametrize('data_gen', [ _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn)
def test_count_distinct_with_nan_floats(data_gen):
    assert_gpu_and_cpu_are_equal_sql(
        lambda spark : gen_df(spark, data_gen, length=1024),
        "hash_agg_table",
        'select a, count(distinct b) as count_distinct_bees from hash_agg_table group by a',
        _no_nans_float_conf)

# TODO: Literal tests
# TODO: First and Last tests
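A minimal sketch of the kind of cached-Literal test the first TODO calls for, following the cache-then-count pattern used throughout these examples (the test name and the projected literals are illustrative, not from the source):

# Hypothetical Literal test; not part of the original suite.
def test_cache_literals():
    def do_project(spark):
        cached = spark.range(100).selectExpr(
            "id", "42 AS lit_int", "'foo' AS lit_str").cache()
        cached.count()  # populates the cache
        return cached
    assert_gpu_and_cpu_are_equal_collect(do_project)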