def test_part_write_round_trip(spark_tmp_path, orc_gen):
    gen_list = [('a', RepeatSeqGen(orc_gen, 10)), ('b', orc_gen)]
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').orc(path),
            lambda spark, path: spark.read.orc(path),
            data_path)

def test_compress_write_round_trip(spark_tmp_path, compress):
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: binary_op_df(spark, long_gen).coalesce(1).write.orc(path),
            lambda spark, path: spark.read.orc(path),
            data_path,
            conf={'spark.sql.orc.compression.codec': compress})

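# Hypothetical parametrization sketch, not part of the original file: the
# 'compress' argument above is a pytest parameter, typically driven by a mark
# like the one below. The codec list here is an assumption, though 'none',
# 'snappy', and 'zlib' are all valid values for spark.sql.orc.compression.codec.
import pytest

@pytest.mark.parametrize('compress', ['none', 'snappy', 'zlib'])
def test_orc_codec_matrix_example(spark_tmp_path, compress):
    ...  # placeholder; a real body would match test_compress_write_round_trip above
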
def test_roundtrip_with_rebase_values(spark_tmp_path, ts_write_data_gen,
                                      date_time_rebase_read, date_time_rebase_write,
                                      int96_rebase_read, int96_rebase_write):
    ts_write, gen = ts_write_data_gen
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = {'spark.sql.parquet.outputTimestampType': ts_write}
    all_confs.update({
        'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': date_time_rebase_write,
        'spark.sql.legacy.parquet.int96RebaseModeInWrite': int96_rebase_write})
    all_confs.update({
        'spark.sql.legacy.parquet.datetimeRebaseModeInRead': date_time_rebase_read,
        'spark.sql.legacy.parquet.int96RebaseModeInRead': int96_rebase_read})
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: unary_op_df(spark, gen).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)

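# Hypothetical parametrization sketch, not part of the original file: each of
# the four rebase-mode parameters above takes one of Spark's three legal
# values, 'LEGACY', 'CORRECTED', or 'EXCEPTION'. Stacked parametrize marks
# like these would exercise the combinations:
rebase_modes = ['LEGACY', 'CORRECTED', 'EXCEPTION']

@pytest.mark.parametrize('date_time_rebase_write', rebase_modes)
@pytest.mark.parametrize('int96_rebase_write', rebase_modes)
def test_rebase_mode_combinations_example(date_time_rebase_write, int96_rebase_write):
    ...  # placeholder; a real body would match test_roundtrip_with_rebase_values above
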
def test_write_round_trip(spark_tmp_path, orc_gens):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.orc(path),
            lambda spark, path: spark.read.orc(path),
            data_path)

def test_write_sql_save_table(spark_tmp_path, orc_gens, ts_type, orc_impl, spark_tmp_table_factory):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: write_orc_sql_from(spark, gen_df(spark, gen_list).coalesce(1),
                                                   path, spark_tmp_table_factory.get()),
            lambda spark, path: spark.read.orc(path),
            data_path,
            conf={'spark.sql.orc.impl': orc_impl})

def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.orc(path),
            lambda spark, path: spark.read.orc(path),
            data_path,
            conf={'spark.sql.orc.impl': orc_impl,
                  'spark.rapids.sql.format.orc.write.enabled': True})

def test_compress_write_round_trip(spark_tmp_path, compress, mt_opt, v1_enabled_list):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: binary_op_df(spark, long_gen).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.parquet.compression.codec': compress,
                  'spark.rapids.sql.format.parquet.multiThreadedRead.enabled': mt_opt,
                  'spark.sql.sources.useV1SourceList': v1_enabled_list})

def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase):
    gen = TimestampGen()
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: unary_op_df(spark, gen).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
                  'spark.sql.parquet.outputTimestampType': ts_type})

def test_compress_write_round_trip(spark_tmp_path, compress):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = {'spark.sql.parquet.compression.codec': compress}
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: binary_op_df(spark, long_gen).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)

def test_write_round_trip(spark_tmp_path, parquet_gens):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED',
                  'spark.sql.parquet.outputTimestampType': 'TIMESTAMP_MICROS'})

def test_part_write_round_trip(spark_tmp_path, parquet_gen):
    gen_list = [('a', RepeatSeqGen(parquet_gen, 10)), ('b', parquet_gen)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=writer_confs)

def test_write_round_trip(spark_tmp_path, parquet_gens):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=writer_confs)

def test_write_daytime_interval(spark_tmp_path):
    gen_list = [('_c1', DayTimeIntervalGen())]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=writer_confs)

def test_compress_write_round_trip(spark_tmp_path, compress, v1_enabled_list, reader_confs):
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list,
                      'spark.sql.parquet.compression.codec': compress})
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: binary_op_df(spark, long_gen).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)

def test_part_write_round_trip(spark_tmp_path, parquet_gen):
    gen_list = [('a', RepeatSeqGen(parquet_gen, 10)), ('b', parquet_gen)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED',
                  'spark.sql.parquet.outputTimestampType': 'TIMESTAMP_MICROS'})

def test_write_sql_save_table(spark_tmp_path, parquet_gens, ts_type, spark_tmp_table_factory):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = {'spark.sql.parquet.outputTimestampType': ts_type}
    all_confs.update(writer_confs)
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: write_parquet_sql_from(spark, gen_df(spark, gen_list).coalesce(1),
                                                       path, spark_tmp_table_factory.get()),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)

def test_write_save_table(spark_tmp_path, orc_gens, orc_impl, spark_tmp_table_factory):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
    data_path = spark_tmp_path + '/ORC_DATA'
    all_confs = {'spark.sql.sources.useV1SourceList': 'orc',
                 'spark.sql.orc.impl': orc_impl}
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.format('orc')
                .mode('overwrite').option('path', path).saveAsTable(spark_tmp_table_factory.get()),
            lambda spark, path: spark.read.orc(path),
            data_path,
            conf=all_confs)

def test_timestamp_write_round_trip(spark_tmp_path, parquet_gens, ts_type):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = copy_and_update(writer_confs,
                                {'spark.sql.parquet.outputTimestampType': ts_type})
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)

def test_write_round_trip(spark_tmp_path, parquet_gens, mt_opt, v1_enabled_list, ts_type):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED',
                  'spark.sql.parquet.outputTimestampType': ts_type,
                  'spark.rapids.sql.format.parquet.multiThreadedRead.enabled': mt_opt,
                  'spark.sql.sources.useV1SourceList': v1_enabled_list})

def test_write_empty_orc_round_trip(spark_tmp_path, orc_gens):
    # writing a zero-row dataframe should still produce a readable ORC file
    def create_empty_df(spark, path):
        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)]
        return gen_df(spark, gen_list, length=0).write.orc(path)
    data_path = spark_tmp_path + '/ORC_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            create_empty_df,
            lambda spark, path: spark.read.orc(path),
            data_path,
            conf={'spark.rapids.sql.format.orc.write.enabled': True})

def test_write_empty_parquet_round_trip(spark_tmp_path, parquet_gens):
    def create_empty_df(spark, path):
        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
        return gen_df(spark, gen_list, length=0).write.parquet(path)
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            create_empty_df,
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=writer_confs)

def test_write_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_factory):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.format('parquet')
                .mode('overwrite').option('path', path).saveAsTable(spark_tmp_table_factory.get()),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=writer_confs)

def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase):
    # we are limiting TimestampGen to avoid overflowing the INT96 value
    # (the bounds below match the range representable by 64-bit nanosecond timestamps)
    # see https://github.com/rapidsai/cudf/issues/8070
    gen = TimestampGen(start=datetime(1677, 9, 22, tzinfo=timezone.utc),
                       end=datetime(2262, 4, 11, tzinfo=timezone.utc))
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: unary_op_df(spark, gen).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase,
                  'spark.sql.legacy.parquet.int96RebaseModeInWrite': ts_rebase,
                  'spark.sql.parquet.outputTimestampType': ts_type})

def test_part_write_round_trip(spark_tmp_path, parquet_gen, v1_enabled_list, ts_type, reader_confs):
    gen_list = [('a', RepeatSeqGen(parquet_gen, 10)), ('b', parquet_gen)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list,
                      'spark.sql.parquet.outputTimestampType': ts_type})
    all_confs.update(writer_confs)
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)

def test_write_round_trip(spark_tmp_path, parquet_gens, v1_enabled_list, ts_type, reader_confs):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    all_confs = reader_confs.copy()
    all_confs.update({'spark.sql.sources.useV1SourceList': v1_enabled_list,
                      'spark.sql.parquet.outputTimestampType': ts_type})
    all_confs.update(writer_confs)
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=all_confs)

def test_all_null_int96(spark_tmp_path):
    # generator that overrides TimestampGen to produce only null values
    class AllNullTimestampGen(TimestampGen):
        def start(self, rand):
            self._start(rand, lambda: None)
    data_path = spark_tmp_path + '/PARQUET_DATA'
    confs = copy_and_update(writer_confs,
                            {'spark.sql.parquet.outputTimestampType': 'INT96'})
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: unary_op_df(spark, AllNullTimestampGen()).coalesce(1).write.parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf=confs)

def test_part_write_round_trip(spark_tmp_path, parquet_gen, mt_opt, v1_enabled_list):
    gen_list = [('a', RepeatSeqGen(parquet_gen, 10)), ('b', parquet_gen)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    assert_gpu_and_cpu_writes_are_equal_collect(
            lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.partitionBy('a').parquet(path),
            lambda spark, path: spark.read.parquet(path),
            data_path,
            conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED',
                  'spark.sql.parquet.outputTimestampType': 'TIMESTAMP_MICROS',
                  'spark.rapids.sql.format.parquet.multiThreadedRead.enabled': mt_opt,
                  'spark.sql.sources.useV1SourceList': v1_enabled_list})

def test_write_map_nullable(spark_tmp_path):
    data_path = spark_tmp_path + '/PARQUET_DATA'

    def generate_map_with_empty_validity(spark, path):
        gen_data = StructGen([['number', IntegerGen()], ['word', LongGen()]], nullable=False)
        df = gen_df(spark, gen_data)
        # drop rows with null keys before building the map, since map keys cannot be null
        df_no_nulls = df.filter('number is not null')
        df_map = df_no_nulls.withColumn('map', f.create_map(['number', 'word'])) \
            .drop('number').drop('word')
        df_map.coalesce(1).write.parquet(path)

    assert_gpu_and_cpu_writes_are_equal_collect(
            generate_map_with_empty_validity,
            lambda spark, path: spark.read.parquet(path),
            data_path)

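# Context note, an assumption rather than part of this excerpt: the tests above
# rely on helpers supplied by the spark-rapids integration-test harness, e.g.
# gen_df, unary_op_df, binary_op_df, assert_gpu_and_cpu_writes_are_equal_collect,
# the data generators (TimestampGen, RepeatSeqGen, StructGen, IntegerGen,
# LongGen, DayTimeIntervalGen), and shared conf dicts such as writer_confs and
# reader_confs. The only standard-library and PySpark imports they need
# directly are:
from datetime import datetime, timezone  # used by test_write_ts_millis
import pyspark.sql.functions as f        # used by test_write_map_nullable
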