示例#1
0
def test_searchsorted(side, obj_class, vals_class):
    """Compare cudf ``searchsorted`` with pandas on randomly masked data.

    A sorted, randomly masked float64 reference object is searched for a
    second randomly masked set of values; the positions reported by cudf
    must equal those reported by pandas for the same ``side``.

    Parameters
    ----------
    side
        Forwarded unchanged as the ``side`` argument of ``searchsorted``.
    obj_class
        ``"index"`` resets the index of the reference Series, ``"column"``
        swaps the Series for its underlying ``_column``; any other value
        leaves the sorted Series as is.
    vals_class
        ``"index"`` resets the index of the searched values; any other
        value leaves them as is.
    """
    nelem = 1000
    column_data = gen_rand("float64", nelem)
    column_mask = random_bitmask(nelem)

    values_data = gen_rand("float64", nelem)
    values_mask = random_bitmask(nelem)

    sr = cudf.Series.from_masked_array(column_data, column_mask)
    vals = cudf.Series.from_masked_array(values_data, values_mask)

    sr = sr.sort_values()

    # Reference object can be Series, Index, or Column
    if obj_class == "index":
        # reset_index returns a new object; the result used to be
        # discarded, which made this branch a no-op.
        # NOTE(review): this still yields a Series with a fresh default
        # index rather than an Index object -- confirm the intent.
        sr = sr.reset_index(drop=True)
    elif obj_class == "column":
        sr = sr._column

    # Values can be Series or Index
    if vals_class == "index":
        # Same fix as above: keep the result instead of dropping it.
        vals = vals.reset_index(drop=True)

    psr = sr.to_pandas()
    pvals = vals.to_pandas()

    expect = psr.searchsorted(pvals, side)
    got = sr.searchsorted(vals, side)

    # cudf returns device data; bring it to the host before comparing.
    assert_eq(expect, cupy.asnumpy(got))
示例#2
0
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    """Slice a masked two-column DataFrame and compare with pandas.

    Columns ``"a"`` and ``"b"`` are filled with consecutive integers, each
    gets its own random bitmask, and the frame is sliced with
    ``[slice_start:slice_end]`` both before and after conversion to
    pandas. The two results must match, ignoring dtype.
    """
    window = slice(slice_start, slice_end)

    gdf = cudf.DataFrame()
    gdf["a"] = list(range(nelem))
    gdf["b"] = list(range(nelem, 2 * nelem))

    # Attach an independent random mask to each column, "a" first.
    for name in ("a", "b"):
        packed_mask = utils.random_bitmask(nelem)
        gdf[name] = gdf[name]._column.set_mask(packed_mask)

    # Slice after converting to pandas ...
    expect = gdf.to_pandas()[window]
    # ... and slice on the cudf side before converting.
    got = gdf[window].to_pandas()

    assert_eq(expect, got, check_dtype=False)
示例#3
0
def test_null_series(nrows, dtype):
    """Check that the repr of a null-containing cudf Series matches pandas.

    A five-element randomly masked Series is cast to ``dtype`` and an
    equivalent pandas Series is built. The pandas repr is rewritten so its
    null markers (``NaN``/``NaT``/``None``) read ``<NA>`` and its nullable
    integer dtype names are lower-cased; the whitespace-split tokens must
    then equal those of the cudf repr.

    Parameters
    ----------
    nrows
        Value assigned to pandas' ``display.max_rows`` while the reprs are
        produced (converted with ``int``).
    dtype
        Target dtype for the Series. Assumed to be a ``str`` because
        ``startswith`` is called on it.
    """
    size = 5
    mask = utils.random_bitmask(size)
    data = cudf.Series(np.random.randint(1, 9, size))
    column = data.set_mask(mask)
    sr = cudf.Series(column).astype(dtype)
    if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}:
        # Integer dtypes: build the pandas Series from the raw host data
        # with the mapped pandas dtype, then blank out the null positions.
        ps = pd.Series(
            sr._column.data_array_view.copy_to_host(),
            dtype=cudf_dtypes_to_pandas_dtypes.get(cudf.dtype(dtype),
                                                   cudf.dtype(dtype)),
        )
        ps[sr.isnull().to_pandas()] = pd.NA
    else:
        ps = sr.to_pandas()

    pd.options.display.max_rows = int(nrows)
    try:
        psrepr = repr(ps)
        for null_marker in ("NaN", "NaT", "None"):
            psrepr = psrepr.replace(null_marker, "<NA>")
        if dtype.startswith(("int", "uint", "long")):
            psrepr = psrepr.replace(
                str(sr._column.default_na_value()) + "\n", "<NA>\n")
        # "UInt" must be checked first because it contains "Int".
        if "UInt" in psrepr:
            psrepr = psrepr.replace("UInt", "uint")
        elif "Int" in psrepr:
            psrepr = psrepr.replace("Int", "int")
        assert psrepr.split() == repr(sr).split()
    finally:
        # Restore the display option even when the assertion fails so the
        # setting does not leak into other tests.
        pd.reset_option("display.max_rows")
示例#4
0
def test_validity_ceil(nelem):
    """Check that ``Series.ceil`` preserves the validity mask.

    The ceiling of a randomly masked Series is computed (expecting the
    deprecation ``FutureWarning``), nulls are replaced by a sentinel, and
    the result is compared with ``np.ceil`` of the raw data in which the
    masked-out positions carry the same sentinel.
    """
    # Input data and mask
    values = np.random.random(nelem) * 100
    packed_mask = utils.random_bitmask(nelem)
    unpacked_mask = utils.expand_bits_to_bytes(packed_mask)[:nelem]
    series = Series.from_masked_array(values, packed_mask)

    # Operation under test
    deprecation = "Series.ceil and DataFrame.ceil are deprecated"
    with pytest.warns(FutureWarning, match=deprecation):
        ceiled = series.ceil()

    sentinel = -100000
    got = ceiled.fillna(sentinel).to_numpy()

    # Reference result: ceil everything, then overwrite masked-out slots.
    valid = np.asarray(unpacked_mask, dtype=np.bool_)[: values.size]
    expect = np.ceil(values)
    expect[~valid] = sentinel

    # Emit both arrays for debugging
    print("expect", expect, sep="\n")
    print("got", got, sep="\n")

    np.testing.assert_array_equal(expect, got)
示例#5
0
def test_applymap_round(nelem, masked):
    """Round a Series through ``applymap`` and compare with ``np.round``.

    Parameters
    ----------
    nelem
        Number of random values to generate.
    masked
        When truthy, a random bitmask is applied to the Series; the
        masked-out inputs are set to NaN and nulls in the output are
        filled with NaN before comparing.
    """
    # Reproducible input
    np.random.seed(0)
    values = np.random.random(nelem) * 100

    packed_mask = None
    if masked:
        packed_mask = utils.random_bitmask(nelem)
        valid = np.asarray(
            utils.expand_bits_to_bytes(packed_mask), dtype=np.bool_
        )[:nelem]
        # Masked-out slots become NaN in the reference data.
        values[~valid] = np.nan

    series = Series(values)
    if masked:
        series = series.set_mask(packed_mask)

    def round_half_up(x):
        # Fractional part >= 0.5 rounds up, otherwise down.
        return floor(x) + 1 if x - floor(x) >= 0.5 else floor(x)

    rounded = series.applymap(round_half_up)
    if masked:
        # Turn nulls back into NaN so they line up with the reference.
        rounded = rounded.fillna(np.nan)

    np.testing.assert_array_almost_equal(
        np.round(values), rounded.to_array()
    )
示例#6
0
def test_serialize_masked_series():
    """Round-trip a masked Series through ``serialize``/``deserialize``.

    A 50-element Series is built from random data and a random bitmask
    (with an explicit null count), serialized, deserialized, and compared
    with the original.
    """
    nelem = 50
    values = np.random.random(nelem)
    packed_mask = utils.random_bitmask(nelem)

    # Count the zero entries of the unpacked mask to obtain the null count.
    unpacked_mask = utils.expand_bits_to_bytes(packed_mask)[:nelem]
    n_nulls = utils.count_zero(unpacked_mask)
    assert n_nulls >= 0

    original = cudf.Series.from_masked_array(
        values, packed_mask, null_count=n_nulls
    )
    payload = original.serialize()
    restored = cudf.Series.deserialize(*payload)
    assert_eq(original, restored)
示例#7
0
def test_sum_masked(nelem):
    """Check that ``Series.sum`` skips masked-out elements.

    The sum of a randomly masked float64 Series must approximately equal
    the NumPy sum over only those elements whose mask entry is set.
    """
    dtype = np.float64
    values = gen_rand(dtype, nelem)

    packed_mask = utils.random_bitmask(nelem)
    unpacked_mask = utils.expand_bits_to_bytes(packed_mask)[:nelem]
    n_nulls = utils.count_zero(unpacked_mask)

    series = Series.from_masked_array(values, packed_mask, n_nulls)
    got = series.sum()

    # Reference: sum only the entries flagged as valid.
    valid = np.asarray(unpacked_mask, dtype=np.bool_)[: values.size]
    expect = values[valid].sum()

    # Looser precision is used for float32 than for other dtypes.
    if dtype == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
示例#8
0
def test_null_dataframe(ncols):
    """Check that the repr of a null-containing cudf DataFrame matches pandas.

    One randomly masked 20-row column is created per entry of
    ``dtype_categories``. The pandas repr of the converted frame, with its
    null markers (``NaN``/``NaT``/``None``) rewritten to ``<NA>``, must
    equal the cudf repr token for token (whitespace-split).

    Parameters
    ----------
    ncols
        Value assigned to pandas' ``display.max_columns`` while the reprs
        are produced (converted with ``int``).
    """
    size = 20
    gdf = cudf.DataFrame()
    for dtype in dtype_categories:
        mask = utils.random_bitmask(size)
        data = cudf.Series(np.random.randint(0, 128, size))
        column = data.set_mask(mask)
        gdf[dtype] = cudf.Series(column).astype(dtype)
    pdf = gdf.to_pandas()

    pd.options.display.max_columns = int(ncols)
    try:
        pdfrepr = repr(pdf)
        for null_marker in ("NaN", "NaT", "None"):
            pdfrepr = pdfrepr.replace(null_marker, "<NA>")
        assert pdfrepr.split() == repr(gdf).split()
    finally:
        # Restore the display option even when the assertion fails so the
        # setting does not leak into other tests.
        pd.reset_option("display.max_columns")
示例#9
0
def test_onehot_masked():
    """One-hot encode a masked integer column and verify every indicator.

    Random integers in ``[0, 5)`` are masked with a random bitmask; the
    masked-out slots are set to ``-1`` in the reference array so they
    match no category. Each ``a_<k>`` indicator column must be 1 exactly
    where the reference array equals ``k``.
    """
    np.random.seed(0)
    n_categories = 5
    n_rows = 100
    values = np.random.randint(low=0, high=n_categories, size=n_rows)
    packed_mask = utils.random_bitmask(n_rows)
    unpacked_mask = utils.expand_bits_to_bytes(packed_mask)[:n_rows]
    valid = np.asarray(unpacked_mask, dtype=np.bool_)
    values[~valid] = -1

    frame = DataFrame()
    frame["a"] = Series(values).set_mask(packed_mask)

    encoded = frame.one_hot_encoding(
        "a", cats=list(range(n_categories)), prefix="a", dtype=np.int32
    )

    assert tuple(encoded.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4")
    # Check the indicator columns in category order.
    for category in range(n_categories):
        indicator = encoded["a_" + str(category)] == 1
        np.testing.assert_array_equal(
            indicator.to_array(), values == category
        )
示例#10
0
def test_validity_ceil(nelem):
    """Check that ``Series.ceil`` preserves the validity mask.

    The ceiling of a randomly masked Series is computed, nulls are
    replaced by a sentinel, and the result is compared with ``np.ceil`` of
    the raw data in which the masked-out positions carry the same
    sentinel.
    """
    # Input data and mask
    values = np.random.random(nelem) * 100
    packed_mask = utils.random_bitmask(nelem)
    unpacked_mask = utils.expand_bits_to_bytes(packed_mask)[:nelem]
    series = Series.from_masked_array(values, packed_mask)

    # Operation under test
    ceiled = series.ceil()

    sentinel = -100000
    got = ceiled.fillna(sentinel).to_array()

    # Reference result: ceil everything, then overwrite masked-out slots.
    valid = np.asarray(unpacked_mask, dtype=np.bool_)[:values.size]
    expect = np.ceil(values)
    expect[~valid] = sentinel

    # Emit both arrays for debugging
    print("expect", expect, sep="\n")
    print("got", got, sep="\n")

    np.testing.assert_array_equal(expect, got)