def test_dataframe_apply_rows(dtype, has_nulls):
    count = 1000
    gdf_series_a = gen_rand_series(dtype, count, has_nulls=has_nulls)
    gdf_series_b = gen_rand_series(dtype, count, has_nulls=has_nulls)

    gdf_series_expected = gdf_series_a * gdf_series_b

    df_expected = cudf.DataFrame(
        {
            "a": gdf_series_a,
            "b": gdf_series_b,
            "out": gdf_series_expected,
        }
    )

    df_original = cudf.DataFrame({"a": gdf_series_a, "b": gdf_series_b})

    df_actual = df_original.apply_rows(
        _kernel_multiply, ["a", "b"], {"out": dtype}, {}
    )

    assert_eq(df_expected, df_actual)

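# The apply_rows tests above and below reference a _kernel_multiply helper that
# is not part of this excerpt. A minimal sketch of such a row-wise kernel,
# assuming the usual cuDF apply_rows convention (a per-row loop over the
# selected input columns that writes into the preallocated output column):
def _kernel_multiply(a, b, out):
    for i, (x, y) in enumerate(zip(a, b)):
        out[i] = x * y
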
def test_operator_func_between_series(dtype, func, has_nulls, fill_value):
    count = 1000
    gdf_series_a = utils.gen_rand_series(
        dtype, count, has_nulls=has_nulls, stride=10000
    )
    gdf_series_b = utils.gen_rand_series(
        dtype, count, has_nulls=has_nulls, stride=100
    )
    pdf_series_a = gdf_series_a.to_pandas()
    pdf_series_b = gdf_series_b.to_pandas()

    gdf_result = getattr(gdf_series_a, func)(gdf_series_b, fill_value=fill_value)
    pdf_result = getattr(pdf_series_a, func)(pdf_series_b, fill_value=fill_value)

    utils.assert_eq(pdf_result, gdf_result)

def test_operator_func_series_and_scalar_logical(
    dtype, func, has_nulls, scalar, fill_value
):
    gdf_series = utils.gen_rand_series(
        dtype, 1000, has_nulls=has_nulls, stride=10000
    )
    pdf_series = gdf_series.to_pandas()

    gdf_series_result = getattr(gdf_series, func)(scalar, fill_value=fill_value)
    pdf_series_result = getattr(pdf_series, func)(scalar, fill_value=fill_value)

    utils.assert_eq(pdf_series_result, gdf_series_result)

def test_dataframe_apply_rows(dtype, has_nulls, pessimistic):
    count = 1000
    gdf_series_a = gen_rand_series(dtype, count, has_nulls=has_nulls)
    gdf_series_b = gen_rand_series(dtype, count, has_nulls=has_nulls)
    gdf_series_c = gen_rand_series(dtype, count, has_nulls=has_nulls)

    if pessimistic:
        # pessimistically combine the null masks
        gdf_series_expected = gdf_series_a * gdf_series_b
    else:
        # optimistically ignore the null masks
        a = cudf.Series(column.build_column(gdf_series_a.data, dtype))
        b = cudf.Series(column.build_column(gdf_series_b.data, dtype))
        gdf_series_expected = a * b

    df_expected = cudf.DataFrame(
        {
            "a": gdf_series_a,
            "b": gdf_series_b,
            "c": gdf_series_c,
            "out": gdf_series_expected,
        }
    )

    df_original = cudf.DataFrame(
        {"a": gdf_series_a, "b": gdf_series_b, "c": gdf_series_c}
    )

    df_actual = df_original.apply_rows(
        _kernel_multiply,
        ["a", "b"],
        {"out": dtype},
        {},
        pessimistic_nulls=pessimistic,
    )

    assert_eq(df_expected, df_actual)

def test_orc_write_statistics(tmpdir, datadir, nrows):
    supported_stat_types = supported_numpy_dtypes + ["str"]
    # Can't write random bool columns until issue #6763 is fixed
    if nrows == 6000000:
        supported_stat_types.remove("bool")

    # Make a dataframe
    gdf = cudf.DataFrame(
        {
            "col_" + str(dtype): gen_rand_series(dtype, nrows, has_nulls=True)
            for dtype in supported_stat_types
        }
    )
    fname = tmpdir.join("gdf.orc")

    # Write said dataframe to ORC with cuDF
    gdf.to_orc(fname.strpath)

    # Read back written ORC's statistics
    orc_file = pa.orc.ORCFile(fname)
    (
        file_stats,
        stripes_stats,
    ) = cudf.io.orc.read_orc_statistics(fname)

    # check file stats
    for col in gdf:
        if "minimum" in file_stats[col]:
            stats_min = file_stats[col]["minimum"]
            actual_min = gdf[col].min()
            assert normalized_equals(actual_min, stats_min)
        if "maximum" in file_stats[col]:
            stats_max = file_stats[col]["maximum"]
            actual_max = gdf[col].max()
            assert normalized_equals(actual_max, stats_max)

    # compare stripe statistics with actual min/max
    for stripe_idx in range(0, orc_file.nstripes):
        stripe = orc_file.read_stripe(stripe_idx)
        # pandas is unable to handle min/max of string col with nulls
        stripe_df = cudf.DataFrame(stripe.to_pandas())
        for col in stripe_df:
            if "minimum" in stripes_stats[stripe_idx][col]:
                actual_min = stripe_df[col].min()
                stats_min = stripes_stats[stripe_idx][col]["minimum"]
                assert normalized_equals(actual_min, stats_min)

            if "maximum" in stripes_stats[stripe_idx][col]:
                actual_max = stripe_df[col].max()
                stats_max = stripes_stats[stripe_idx][col]["maximum"]
                assert normalized_equals(actual_max, stats_max)

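# The statistics tests above and below compare values through a
# normalized_equals helper that is not shown in this excerpt. A plausible
# sketch, assuming its only job is to smooth over type differences between
# cuDF results and the values decoded from ORC statistics (datetime
# resolution, int vs. float comparisons); the imports would normally live at
# the top of the module:
import datetime

import numpy as np


def normalized_equals(value1, value2):
    # Normalize datetimes to a common (millisecond) resolution
    if isinstance(value1, (datetime.datetime, np.datetime64)):
        value1 = np.datetime64(value1, "ms")
    if isinstance(value2, (datetime.datetime, np.datetime64)):
        value2 = np.datetime64(value2, "ms")
    # Compare floats (and int-vs-float pairs) approximately
    if isinstance(value1, float) or isinstance(value2, float):
        return np.isclose(value1, value2)
    return value1 == value2
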
def test_orc_bool_encode_fail():
    np.random.seed(0)
    buffer = BytesIO()

    # Generate a boolean column longer than a single stripe
    fail_df = cudf.DataFrame({"col": gen_rand_series("bool", 600000)})
    # Invalidate the first row in the second stripe to break encoding
    fail_df["col"][500000] = None

    # Should throw instead of generating a file that is incompatible
    # with other readers (see issue #6763)
    with pytest.raises(RuntimeError):
        fail_df.to_orc(buffer)

    # Generate a boolean column that fits into a single stripe
    okay_df = cudf.DataFrame({"col": gen_rand_series("bool", 500000)})
    okay_df["col"][500000 - 1] = None
    # Invalid row is in the last row group of the stripe;
    # encoding is assumed to be correct
    okay_df.to_orc(buffer)

    # Also validate data
    pdf = pa.orc.ORCFile(buffer).read().to_pandas()
    assert_eq(okay_df, pdf)

def test_operator_func_series_and_scalar(
    dtype, func, has_nulls, fill_value, use_cudf_scalar
):
    count = 1000
    scalar = 59
    gdf_series = utils.gen_rand_series(
        dtype, count, has_nulls=has_nulls, stride=10000
    )
    pdf_series = gdf_series.to_pandas()

    gdf_series_result = getattr(gdf_series, func)(
        cudf.Scalar(scalar) if use_cudf_scalar else scalar,
        fill_value=fill_value,
    )
    pdf_series_result = getattr(pdf_series, func)(scalar, fill_value=fill_value)

    utils.assert_eq(pdf_series_result, gdf_series_result)

def test_orc_write_bool_statistics(tmpdir, datadir, nrows):
    # Make a dataframe
    gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)})
    fname = tmpdir.join("gdf.orc")

    # Write said dataframe to ORC with cuDF
    gdf.to_orc(fname.strpath)

    # Read back written ORC's statistics
    orc_file = pa.orc.ORCFile(fname)
    (
        file_stats,
        stripes_stats,
    ) = cudf.io.orc.read_orc_statistics(fname)

    # check file stats
    col = "col_bool"
    if "true_count" in file_stats[col]:
        stats_true_count = file_stats[col]["true_count"]
        actual_true_count = gdf[col].sum()
        assert normalized_equals(actual_true_count, stats_true_count)

    if "number_of_values" in file_stats[col]:
        stats_valid_count = file_stats[col]["number_of_values"]
        actual_valid_count = gdf[col].valid_count
        assert normalized_equals(actual_valid_count, stats_valid_count)

    # compare stripe statistics with actual true/valid counts
    for stripe_idx in range(0, orc_file.nstripes):
        stripe = orc_file.read_stripe(stripe_idx)
        stripe_df = cudf.DataFrame(stripe.to_pandas())

        if "true_count" in stripes_stats[stripe_idx][col]:
            actual_true_count = stripe_df[col].sum()
            stats_true_count = stripes_stats[stripe_idx][col]["true_count"]
            assert normalized_equals(actual_true_count, stats_true_count)

        if "number_of_values" in stripes_stats[stripe_idx][col]:
            actual_valid_count = stripe_df[col].valid_count
            stats_valid_count = stripes_stats[stripe_idx][col][
                "number_of_values"
            ]
            assert normalized_equals(actual_valid_count, stats_valid_count)