def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns):
    source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc")
    fname = "test_orc_reader.orc"
    bname = "orc"
    expect = pa.orc.ORCFile(source_file).read().to_pandas()

    with open(source_file, "rb") as f:
        buffer = f.read()

    with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}):
        got = cudf.read_orc(
            f"s3://{bname}/{fname}",
            columns=columns,
            storage_options=s3so,
            use_python_file_object=use_python_file_object,
        )

    if columns:
        expect = expect[columns]
    assert_eq(expect, got)
def test_chunked_orc_writer_lists():
    num_rows = 12345
    pdf_in = pd.DataFrame(
        {
            "ls": [[str(i), str(2 * i)] for i in range(num_rows)],
            "ld": [[dec(i / 2)] * 5 for i in range(num_rows)],
        }
    )

    gdf = cudf.from_pandas(pdf_in)
    expect = pd.concat([pdf_in, pdf_in]).reset_index(drop=True)

    buffer = BytesIO()
    writer = ORCWriter(buffer)
    writer.write_table(gdf)
    writer.write_table(gdf)
    writer.close()

    got = pa.orc.ORCFile(buffer).read().to_pandas()
    assert_eq(expect, got)
def test_read_csv_byte_range(
    s3_base, s3so, pdf, bytes_per_thread, use_python_file_object
):
    # Write to buffer
    fname = "test_csv_reader_byte_range.csv"
    bname = "csv"
    buffer = pdf.to_csv(index=False)

    # Use fsspec file object
    with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}):
        got = cudf.read_csv(
            f"s3://{bname}/{fname}",
            storage_options=s3so,
            byte_range=(74, 73),
            bytes_per_thread=bytes_per_thread,
            header=None,
            names=["Integer", "Float", "Integer2", "String", "Boolean"],
            use_python_file_object=use_python_file_object,
        )

    assert_eq(pdf.iloc[-2:].reset_index(drop=True), got)
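# The ``s3_context`` helper used by the S3 tests above is not defined in this
# section. Below is a minimal sketch, assuming the ``s3_base`` fixture provides the
# endpoint URL of a local (moto-style) S3 server; the body and argument handling
# here are assumptions for illustration, not the actual fixture implementation.
from contextlib import contextmanager

import s3fs


@contextmanager
def s3_context(s3_base, bucket, files=None):
    # Create the bucket on the test endpoint, upload the given payloads,
    # yield the filesystem, then clean everything up.
    files = files or {}
    fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base})
    fs.mkdir(bucket)
    for name, data in files.items():
        if isinstance(data, str):
            data = data.encode()
        with fs.open(f"{bucket}/{name}", "wb") as f:
            f.write(data)
    try:
        yield fs
    finally:
        fs.rm(bucket, recursive=True)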
def test_series_nlargest(data, n):
    """Indirectly tests Series.sort_values()"""
    sr = Series(data)
    psr = pd.Series(data)
    assert_eq(sr.nlargest(n), psr.nlargest(n))
    assert_eq(sr.nlargest(n, keep="last"), psr.nlargest(n, keep="last"))

    assert_exceptions_equal(
        lfunc=psr.nlargest,
        rfunc=sr.nlargest,
        lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}),
        rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}),
        expected_error_message='keep must be either "first", "last"',
    )
def test_hdf_reader(hdf_files, columns):
    hdf_df_file, hdf_series, format, nrows = hdf_files
    if format == "fixed" and columns is not None:
        pytest.skip("Can't use columns with format 'fixed'")
    if format == "table" and nrows == 0:
        pytest.skip("Can't read 0 row table with format 'table'")
    expect_df = pd.read_hdf(hdf_df_file, columns=columns)
    got_df = cudf.read_hdf(hdf_df_file, columns=columns)
    assert_eq(
        expect_df, got_df, check_categorical=False, check_index_type=False
    )

    for column in hdf_series.keys():
        expect_series = pd.read_hdf(hdf_series[column])
        got_series = cudf.read_hdf(hdf_series[column])
        assert_eq(expect_series, got_series, check_index_type=False)
def test_replace_inplace(pframe, replace_args):
    gpu_frame = cudf.from_pandas(pframe)
    pandas_frame = pframe.copy()

    gpu_copy = gpu_frame.copy()
    cpu_copy = pandas_frame.copy()

    assert_eq(gpu_frame, pandas_frame)
    assert_eq(gpu_copy, cpu_copy)
    gpu_frame.replace(**replace_args)
    pandas_frame.replace(**replace_args)
    assert_eq(gpu_frame, pandas_frame)
    assert_eq(gpu_copy, cpu_copy)
def test_series_update(data, other):
    gs = data.copy(deep=True)
    if isinstance(other, cudf.Series):
        g_other = other.copy(deep=True)
        p_other = g_other.to_pandas()
    else:
        g_other = other
        p_other = other

    ps = gs.to_pandas()

    gs_column_before = gs._column
    gs.update(g_other)
    gs_column_after = gs._column

    assert_eq(gs_column_before.to_array(), gs_column_after.to_array())

    ps.update(p_other)
    assert_eq(gs, ps)
def test_empty():
    # empty should not throw
    order, quadtree = cuspatial.quadtree_on_points(
        cudf.Series([]),  # x
        cudf.Series([]),  # y
        *bbox_1,  # bbox
        1,  # scale
        1,  # max_depth
        1,  # min_size
    )
    assert_eq(
        quadtree,
        cudf.DataFrame(
            {
                "key": cudf.Series([], dtype=np.uint32),
                "level": cudf.Series([], dtype=np.uint8),
                "is_quad": cudf.Series([], dtype=np.bool_),
                "length": cudf.Series([], dtype=np.uint32),
                "offset": cudf.Series([], dtype=np.uint32),
            }
        ),
    )
def test_rolling_series_numba_udf_basic(data, index, center):
    psr = cudf.utils.utils._create_pandas_series(data=data, index=index)
    gsr = cudf.from_pandas(psr)

    def some_func(A):
        b = 0
        for a in A:
            b = max(b, math.sqrt(a))
        return b

    for window_size in range(1, len(data) + 1):
        for min_periods in range(1, window_size + 1):
            assert_eq(
                psr.rolling(window_size, min_periods, center)
                .apply(some_func)
                .fillna(-1),
                gsr.rolling(window_size, min_periods, center)
                .apply(some_func)
                .fillna(-1),
                check_dtype=False,
            )
def test_series_set_equal_length_object_by_mask(replace_data):
    psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64")
    gsr = cudf.from_pandas(psr)

    # Lengths match in trivial case
    pd_bool_col = pd.Series([True] * len(psr), dtype="boolean")
    gd_bool_col = cudf.from_pandas(pd_bool_col)
    psr[pd_bool_col] = (
        replace_data.to_pandas(nullable=True)
        if hasattr(replace_data, "to_pandas")
        else replace_data
    )
    gsr[gd_bool_col] = replace_data

    assert_eq(psr.astype("float"), gsr.astype("float"))

    # Test partial masking
    psr[psr > 1] = (
        replace_data.to_pandas()
        if hasattr(replace_data, "to_pandas")
        else replace_data
    )
    gsr[gsr > 1] = replace_data

    assert_eq(psr.astype("float"), gsr.astype("float"))
def test_concat_join_no_overlapping_columns(
    pdf1, pdf2, ignore_index, sort, join, axis
):
    gdf1 = gd.from_pandas(pdf1)
    gdf2 = gd.from_pandas(pdf2)

    assert_eq(
        pd.concat(
            [pdf1, pdf2],
            sort=sort,
            join=join,
            ignore_index=ignore_index,
            axis=axis,
        ),
        gd.concat(
            [gdf1, gdf2],
            sort=sort,
            join=join,
            ignore_index=ignore_index,
            axis=axis,
        ),
    )
def test_rolling_dataframe_numba_udf_basic(data, center):
    pdf = pd.DataFrame(data)
    gdf = cudf.from_pandas(pdf)

    def some_func(A):
        b = 0
        for a in A:
            b = b + a ** 2
        return b / len(A)

    for window_size in range(1, len(data) + 1):
        for min_periods in range(1, window_size + 1):
            assert_eq(
                pdf.rolling(window_size, min_periods, center)
                .apply(some_func)
                .fillna(-1),
                gdf.rolling(window_size, min_periods, center)
                .apply(some_func)
                .fillna(-1),
                check_dtype=False,
            )
def test_read_avro(datadir, hdfs, test_url):
    fname = datadir / "avro" / "example.avro"
    # Read from local file system as buffer
    with open(fname, mode="rb") as f:
        buffer = BytesIO(f.read())
    # Write to hdfs
    hdfs.upload(basedir + "/file.avro", buffer)

    if test_url:
        hd_fpath = f"hdfs://{host}:{port}{basedir}/file.avro"
    else:
        hd_fpath = f"hdfs://{basedir}/file.avro"

    got = cudf.read_avro(hd_fpath)
    with open(fname, mode="rb") as f:
        expect = pd.DataFrame.from_records(fa.reader(f))

    for col in expect.columns:
        expect[col] = expect[col].astype(got[col].dtype)
    assert_eq(expect, got)
def test_series_round(arr, decimals):
    pser = pd.Series(arr)
    ser = cudf.Series(arr)
    result = ser.round(decimals)
    expected = pser.round(decimals)

    assert_eq(result, expected)

    # with nulls, maintaining existing null mask
    arr = arr.astype("float64")  # for pandas nulls
    arr.ravel()[
        np.random.choice(arr.shape[0], arr.shape[0] // 2, replace=False)
    ] = np.nan

    pser = pd.Series(arr)
    ser = cudf.Series(arr)
    result = ser.round(decimals)
    expected = pser.round(decimals)

    assert_eq(result, expected)
def test_mixed_lines():
    buffers = GeoArrowBuffers(
        {
            "lines_xy": range(24),
            "lines_offsets": np.array(range(5)) * 6,
            "mlines": [1, 3],
        }
    )
    assert_eq(cudf.Series(range(24)), buffers.lines.xy)
    assert len(buffers.lines) == 3
    column = GeoColumn(buffers)
    assert_eq(
        GeoSeries(column),
        gpGeoSeries(
            [
                LineString([[0, 1], [2, 3], [4, 5]]),
                MultiLineString(
                    [
                        LineString([[6, 7], [8, 9], [10, 11]]),
                        LineString([[12, 13], [14, 15], [16, 17]]),
                    ]
                ),
                LineString([[18, 19], [20, 21], [22, 23]]),
            ]
        ),
    )
def test_concat_decimal_dataframe(ltype, rtype):
    gdf1 = gd.DataFrame(
        {"id": np.random.randint(0, 10, 3), "val": ["22.3", "59.5", "81.1"]}
    )
    gdf2 = gd.DataFrame(
        {"id": np.random.randint(0, 10, 3), "val": ["2.35", "5.59", "8.14"]}
    )

    gdf1["val"] = gdf1["val"].astype(ltype)
    gdf2["val"] = gdf2["val"].astype(rtype)

    pdf1 = gdf1.to_pandas()
    pdf2 = gdf2.to_pandas()

    got = gd.concat([gdf1, gdf2])
    expected = pd.concat([pdf1, pdf2])

    assert_eq(expected, got)
def test_dataframe_replace(df, to_replace, value):
    gdf = df
    pdf = gdf.to_pandas()

    pd_value = value
    if isinstance(value, pd.Series):
        gd_value = cudf.from_pandas(value)
    else:
        gd_value = value

    pd_to_replace = to_replace
    if isinstance(to_replace, pd.Series):
        gd_to_replace = cudf.from_pandas(to_replace)
    else:
        gd_to_replace = to_replace

    expected = pdf.replace(to_replace=pd_to_replace, value=pd_value)
    actual = gdf.replace(to_replace=gd_to_replace, value=gd_value)

    assert_eq(expected, actual)
def test_categorical_reorder_categories(
    pd_str_cat, from_ordered, to_ordered, inplace
):
    pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered))
    cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(from_ordered))

    assert_eq(pd_sr, cd_sr)
    assert str(pd_sr) == str(cd_sr)

    kwargs = dict(ordered=to_ordered, inplace=inplace)

    pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs)
    cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs)
    pd_sr_1 = pd_sr if pd_sr_1 is None else pd_sr_1
    cd_sr_1 = cd_sr if cd_sr_1 is None else cd_sr_1

    assert_eq(pd_sr_1, cd_sr_1)
    assert str(cd_sr_1) == str(pd_sr_1)
def test_timedelta_series_ops_with_cudf_scalars(data, cpu_scalar, dtype, op):
    gpu_scalar = cudf.Scalar(cpu_scalar)

    gsr = cudf.Series(data=data, dtype=dtype)
    psr = gsr.to_pandas()

    if op == "add":
        expected = psr + cpu_scalar
        actual = gsr + gpu_scalar
    elif op == "sub":
        expected = psr - cpu_scalar
        actual = gsr - gpu_scalar
    elif op == "truediv":
        expected = psr / cpu_scalar
        actual = gsr / gpu_scalar
    elif op == "floordiv":
        expected = psr // cpu_scalar
        actual = gsr // gpu_scalar
    elif op == "mod":
        expected = psr % cpu_scalar
        actual = gsr % gpu_scalar

    assert_eq(expected, actual)

    if op == "add":
        expected = cpu_scalar + psr
        actual = gpu_scalar + gsr
    elif op == "sub":
        expected = cpu_scalar - psr
        actual = gpu_scalar - gsr
    elif op == "truediv":
        expected = cpu_scalar / psr
        actual = gpu_scalar / gsr
    elif op == "floordiv":
        expected = cpu_scalar // psr
        actual = gpu_scalar // gsr
    elif op == "mod":
        expected = cpu_scalar % psr
        actual = gpu_scalar % gsr

    assert_eq(expected, actual)
def test_timedelta_index_properties(data, dtype, name):
    gdi = cudf.Index(data, dtype=dtype, name=name)
    pdi = gdi.to_pandas()

    def local_assert(expected, actual):
        if actual._values.null_count:
            assert_eq(expected, actual.astype("float64"))
        else:
            assert_eq(expected, actual)

    expected_days = pdi.days
    actual_days = gdi.days
    local_assert(expected_days, actual_days)

    expected_seconds = pdi.seconds
    actual_seconds = gdi.seconds
    local_assert(expected_seconds, actual_seconds)

    expected_microseconds = pdi.microseconds
    actual_microseconds = gdi.microseconds
    local_assert(expected_microseconds, actual_microseconds)

    expected_nanoseconds = pdi.nanoseconds
    actual_nanoseconds = gdi.nanoseconds
    local_assert(expected_nanoseconds, actual_nanoseconds)

    expected_components = pdi.components
    actual_components = gdi.components

    if actual_components.isnull().any().any():
        assert_eq(expected_components, actual_components.astype("float"))
    else:
        assert_eq(
            expected_components,
            actual_components,
            check_index_type=not actual_components.empty,
        )
def test_can_detect_dtype_from_avro_type_nested(
    avro_type, expected_dtype, namespace, nullable
):
    avro_type = avro_type if not nullable else ["null", avro_type]

    schema_leaf = {
        "name": "leaf",
        "type": "record",
        "fields": [{"name": "prop3", "type": avro_type}],
    }

    schema_child = {
        "name": "child",
        "type": "record",
        "fields": [{"name": "prop2", "type": schema_leaf}],
    }

    schema_root = {
        "name": "root",
        "type": "record",
        "namespace": namespace,
        "fields": [{"name": "prop1", "type": schema_child}],
    }

    actual = cudf_from_avro_util(schema_root, [])

    col_name = "{ns}child.{ns}leaf.prop3".format(
        ns="" if namespace is None else namespace + "."
    )

    expected = cudf.DataFrame(
        {col_name: cudf.Series(None, None, expected_dtype)}
    )

    assert_eq(expected, actual)
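# ``cudf_from_avro_util`` is referenced above but not defined in this section.
# Below is a minimal sketch, assuming it serializes the given records against the
# schema with fastavro and reads the result back through cudf.read_avro; the body
# is an assumption for illustration, not the actual helper.
import io

import fastavro


def cudf_from_avro_util(schema, records):
    # Encode schema + records into an in-memory Avro file, then decode with cuDF.
    parsed_schema = fastavro.parse_schema(schema)
    buffer = io.BytesIO()
    fastavro.writer(buffer, parsed_schema, records)
    buffer.seek(0)
    return cudf.read_avro(buffer)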
def test_series_drop_edge_inputs():
    gs = cudf.Series([42], name="a")
    ps = gs.to_pandas()

    assert_eq(ps.drop(columns=["b"]), gs.drop(columns=["b"]))
    assert_eq(ps.drop(columns="b"), gs.drop(columns="b"))

    assert_exceptions_equal(
        lfunc=ps.drop,
        rfunc=gs.drop,
        lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}),
        rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}),
        expected_error_message="Cannot specify both",
    )

    assert_exceptions_equal(
        lfunc=ps.drop,
        rfunc=gs.drop,
        lfunc_args_and_kwargs=([], {}),
        rfunc_args_and_kwargs=([], {}),
        expected_error_message="Need to specify at least one",
    )

    assert_exceptions_equal(
        lfunc=ps.drop,
        rfunc=gs.drop,
        lfunc_args_and_kwargs=(["b"], {"axis": 1}),
        rfunc_args_and_kwargs=(["b"], {"axis": 1}),
        expected_error_message="No axis named 1",
    )
def test_class_triple_six_splits():
    t = cudf.Series(
        [0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5]
    ).astype("float32")
    x = cudf.Series(
        [3, 2, 3, 4, 3, 1, 3, 2, 3, 4, 3, 1, 3, 2, 3, 4, 3, 1]
    ).astype("float32")
    prefixes = cudf.Series([0, 6, 12, 18]).astype("int32")
    g = cuspatial.interpolate.CubicSpline(t, x, prefixes=prefixes)
    groups = cudf.Series(
        np.ravel(
            np.array([np.repeat(0, 12), np.repeat(1, 12), np.repeat(2, 12)])
        )
    )
    split_t = cudf.Series(
        np.ravel(
            (
                np.linspace(0, 5, 11),
                np.linspace(0, 5, 11),
                np.linspace(0, 5, 11),
            )
        ),
        dtype="float32",
    )
    split_t_ind = [
        0, 2, 4, 6, 8, 10, 11, 13, 15, 17, 19, 21, 22, 24, 26, 28, 30, 32,
    ]
    assert_eq(
        g(split_t, groups=groups)[split_t_ind].reset_index(drop=True), x
    )
def compare_dataframe(left, right, nullable=True):
    if nullable and isinstance(left, cudf.DataFrame):
        left = left.to_pandas(nullable=True)
    if nullable and isinstance(right, cudf.DataFrame):
        right = right.to_pandas(nullable=True)
    if len(left.index) == 0 and len(right.index) == 0:
        check_index_type = False
    else:
        check_index_type = True
    return assert_eq(left, right, check_index_type=check_index_type)
def check_serialization(df):
    # basic
    assert_frame_picklable(df)
    # sliced
    assert_frame_picklable(df[:-1])
    assert_frame_picklable(df[1:])
    assert_frame_picklable(df[2:-2])
    # sorted
    sortvaldf = df.sort_values("vals")
    assert isinstance(sortvaldf.index, GenericIndex)
    assert_frame_picklable(sortvaldf)
    # out-of-band
    if pickle.HIGHEST_PROTOCOL >= 5:
        buffers = []
        serialbytes = pickle.dumps(
            df, protocol=5, buffer_callback=buffers.append
        )
        for b in buffers:
            assert isinstance(b, pickle.PickleBuffer)
        loaded = pickle.loads(serialbytes, buffers=buffers)
        assert_eq(loaded, df)
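# ``assert_frame_picklable`` is used by check_serialization() above but not shown
# in this section. A minimal sketch, assuming it simply round-trips the frame
# through pickle and compares the result; the body is an assumption, only the name
# comes from the calls above.
import pickle


def assert_frame_picklable(df):
    # Serialize, deserialize, and verify the frame survives unchanged.
    serialbytes = pickle.dumps(df)
    loaded = pickle.loads(serialbytes)
    assert_eq(loaded, df)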
def test_df_stack(nulls, num_cols, num_rows, dtype):
    if dtype not in ["float32", "float64"] and nulls in ["some"]:
        pytest.skip(msg="nulls not supported in dtype: " + dtype)

    pdf = pd.DataFrame()
    for i in range(num_cols):
        colname = str(i)
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            idx = np.random.choice(
                num_rows, size=int(num_rows / 2), replace=False
            )
            data[idx] = np.nan
        pdf[colname] = data

    gdf = cudf.from_pandas(pdf)

    got = gdf.stack()
    expect = pdf.stack()

    assert_eq(expect, got)
def test_json_corner_case_with_escape_and_double_quote_char_with_pandas(
    tmpdir,
):
    fname = tmpdir.mkdir("gdf_json").join("tmp_json_escape_double_quote")

    pdf = pd.DataFrame(
        {
            "a": ['ab"cd', "\\\b", "\r\\", "'"],
            "b": ["a\tb\t", "\\", '\\"', "\t"],
            "c": ["aeiou", "try", "json", "cudf"],
        }
    )
    pdf.to_json(fname, compression="infer", lines=True, orient="records")

    df = cudf.read_json(
        fname, compression="infer", lines=True, orient="records"
    )
    pdf = pd.read_json(
        fname, compression="infer", lines=True, orient="records"
    )

    assert_eq(cudf.DataFrame(pdf), df)
def test_storage_options(tmpdir, pdf, hdfs):
    fname = tmpdir.mkdir("csv").join("file.csv")
    # Write to local file system
    pdf.to_csv(fname)
    # Read from local file system as buffer
    with open(fname, mode="rb") as f:
        buffer = BytesIO(f.read())
    # Write to hdfs
    hdfs.upload(basedir + "/file.csv", buffer)

    hd_fpath = f"hdfs://{basedir}/file.csv"
    storage_options = {"host": host, "port": port}

    got = cudf.read_csv(hd_fpath, storage_options=storage_options)

    # Read pandas from byte buffer
    with hdfs.open(basedir + "/file.csv") as f:
        expect = pd.read_csv(f)

    assert_eq(expect, got)
async def test_ping_pong_cudf(g):
    # if this test appears after cupy an import error arises
    # *** ImportError: /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `CXXABI_1.3.11'
    # not found (required by python3.7/site-packages/pyarrow/../../../libarrow.so.12)
    cudf = pytest.importorskip("cudf")
    from cudf.testing._utils import assert_eq

    cudf_obj = g(cudf)

    com, serv_com = await get_comm_pair()
    msg = {"op": "ping", "data": to_serialize(cudf_obj)}

    await com.write(msg)
    result = await serv_com.read()

    cudf_obj_2 = result.pop("data")
    assert result["op"] == "ping"
    assert_eq(cudf_obj, cudf_obj_2)

    await com.close()
    await serv_com.close()
def test_cut_series(x, bins, right, include_lowest, ordered, precision):
    pcat = pd.cut(
        x=x,
        bins=bins,
        right=right,
        precision=precision,
        include_lowest=include_lowest,
        ordered=ordered,
    )
    gcat = cut(
        x=x,
        bins=bins,
        right=right,
        precision=precision,
        include_lowest=include_lowest,
        ordered=ordered,
    )

    assert_eq(pcat, gcat)