Пример #1
0
def test_null_series(nrows, dtype):
    """Check that a cudf Series with nulls prints like its pandas twin.

    A 5-element integer series gets a random null mask and is cast to
    ``dtype``. A pandas series is built from it, pandas' null markers are
    rewritten to ``<NA>``, and the whitespace-split reprs are compared.

    Parameters
    ----------
    nrows
        Value for pandas' ``display.max_rows`` option (passed through
        ``int()``).
    dtype : str
        Name of the dtype the series is cast to.
    """
    size = 5
    mask = utils.random_bitmask(size)
    data = cudf.Series(np.random.randint(1, 9, size))
    column = data.set_mask(mask)
    sr = cudf.Series(column).astype(dtype)
    if dtype != "category" and np.dtype(dtype).kind in {"u", "i"}:
        # Integer dtypes: build the pandas series from the raw host copy of
        # the data using the mapped pandas dtype (falling back to the NumPy
        # dtype), then overwrite the null positions with pd.NA.
        np_dtype = np.dtype(dtype)
        ps = pd.Series(
            sr._column.data_array_view.copy_to_host(),
            dtype=cudf_dtypes_to_pandas_dtypes.get(np_dtype, np_dtype),
        )
        ps[sr.isnull().to_pandas()] = pd.NA
    else:
        ps = sr.to_pandas()

    # Scope the display option to this test: option_context restores the
    # previous value even when an exception is raised, so the setting does
    # not leak into other tests.
    with pd.option_context("display.max_rows", int(nrows)):
        psrepr = ps.__repr__()
        srrepr = sr.__repr__()

    # Normalise every pandas null marker to "<NA>".
    for null_marker in ("NaN", "NaT", "None"):
        psrepr = psrepr.replace(null_marker, "<NA>")
    if dtype.startswith(("int", "uint", "long")):
        # Lines ending in the column's default NA value are rewritten as
        # nulls too.
        psrepr = psrepr.replace(
            str(sr._column.default_na_value()) + "\n", "<NA>\n")
    # Lower-case the pandas dtype spelling ("UInt"/"Int") in the repr.
    if "UInt" in psrepr:
        psrepr = psrepr.replace("UInt", "uint")
    elif "Int" in psrepr:
        psrepr = psrepr.replace("Int", "int")
    assert psrepr.split() == srrepr.split()
Пример #2
0
def test_validity_add(nelem, lhs_nulls, rhs_nulls):
    """Check null propagation when adding two (optionally masked) Series.

    Parameters
    ----------
    nelem : int
        Number of elements in each operand.
    lhs_nulls, rhs_nulls : str
        ``"some"`` attaches a random bitmask to that operand; any other
        value builds the operand without a mask.

    The expected result is ``lhs_data + rhs_data`` with ``na_value`` written
    wherever the expected result mask is false. When both operands are
    masked, the expected mask is the bitwise AND of the two bitmasks.
    """
    np.random.seed(0)
    lhs_masked = lhs_nulls == "some"
    rhs_masked = rhs_nulls == "some"

    # LHS
    lhs_data = np.random.random(nelem)
    if lhs_masked:
        lhs_mask = utils.random_bitmask(nelem)
        lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
        lhs_null_count = utils.count_zero(lhs_bitmask)
        assert lhs_null_count >= 0
        lhs = Series.from_masked_array(lhs_data, lhs_mask)
        assert lhs.null_count == lhs_null_count
    else:
        lhs = Series(lhs_data)

    # RHS
    rhs_data = np.random.random(nelem)
    if rhs_masked:
        rhs_mask = utils.random_bitmask(nelem)
        rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
        rhs_null_count = utils.count_zero(rhs_bitmask)
        assert rhs_null_count >= 0
        rhs = Series.from_masked_array(rhs_data, rhs_mask)
        assert rhs.null_count == rhs_null_count
    else:
        rhs = Series(rhs_data)

    # Result
    res = lhs + rhs

    # Pick the bitmask the result is expected to carry. The branches mirror
    # how the operands were built above, so an unmasked operand never
    # contributes a mask.
    if lhs_masked and rhs_masked:
        res_bits = lhs_mask & rhs_mask
    elif lhs_masked:
        res_bits = lhs_mask
    elif rhs_masked:
        res_bits = rhs_mask
    else:
        res_bits = None

    # Fill NA values
    na_value = -10000
    got = res.fillna(na_value).to_array()
    expect = lhs_data + rhs_data
    if res_bits is not None:
        # np.bool_ replaces the plain ``np.bool`` alias, which NumPy
        # deprecated in 1.20 and removed in 1.24.
        res_mask = np.asarray(
            utils.expand_bits_to_bytes(res_bits), dtype=np.bool_
        )[:nelem]
        expect[~res_mask] = na_value

    np.testing.assert_array_equal(expect, got)
Пример #3
0
def test_serialize_masked_series():
    """Round-trip a masked Series through ``serialize``/``deserialize``."""
    n_values = 50
    values = np.random.random(n_values)
    validity = utils.random_bitmask(n_values)

    # The null count is taken from the byte-expanded mask, trimmed to the
    # series length.
    expanded = utils.expand_bits_to_bytes(validity)
    n_nulls = utils.count_zero(expanded[:n_values])
    assert n_nulls >= 0

    original = cudf.Series.from_masked_array(
        values, validity, null_count=n_nulls
    )
    serialized = original.serialize()
    restored = cudf.Series.deserialize(*serialized)
    assert_eq(original, restored)
Пример #4
0
def test_serialize_masked_series():
    """Round-trip a masked Series through ``serialize``/``deserialize``.

    A 50-element random series with a random bitmask is serialized and
    deserialized; the pandas conversions of the original and the result
    must be equal.
    """
    nelem = 50
    data = np.random.random(nelem)
    mask = utils.random_bitmask(nelem)
    # The null count is taken from the byte-expanded mask, trimmed to the
    # series length.
    bitmask = utils.expand_bits_to_bytes(mask)[:nelem]
    null_count = utils.count_zero(bitmask)
    assert null_count >= 0
    sr = cudf.Series.from_masked_array(data, mask, null_count=null_count)
    outsr = deserialize(*serialize(sr))
    # pd.testing is the public home of the assertion helpers; the old
    # pd.util.testing module was deprecated in pandas 1.0 and removed in 2.0.
    pd.testing.assert_series_equal(sr.to_pandas(), outsr.to_pandas())
Пример #5
0
def test_searchsorted(side, obj_class):
    """Compare cudf ``searchsorted`` with pandas on masked float64 data.

    Parameters
    ----------
    side
        Forwarded as the second argument of both ``searchsorted`` calls.
    obj_class : str
        When equal to ``"series"``, the sorted series is converted with
        ``cudf.Series.as_index`` before searching.
    """
    n_items = 1000
    column_data = gen_rand("float64", n_items)
    column_mask = random_bitmask(n_items)

    values_data = gen_rand("float64", n_items)
    values_mask = random_bitmask(n_items)

    haystack = cudf.Series.from_masked_array(column_data, column_mask)
    needles = cudf.Series.from_masked_array(values_data, values_mask)

    # The searched object is sorted before the lookup.
    haystack = haystack.sort_values()

    # NOTE(review): the conversion to an index happens for "series", not
    # "index" -- confirm against the parametrisation that this is intended.
    if obj_class == "series":
        haystack = cudf.Series.as_index(haystack)

    pandas_haystack = haystack.to_pandas()
    pandas_needles = needles.to_pandas()

    expect = pandas_haystack.searchsorted(pandas_needles, side)
    got = haystack.searchsorted(needles, side)

    assert_eq(expect, got.to_array())
Пример #6
0
def test_null_series(nrows, dtype):
    """Check that a cudf Series with nulls prints like its pandas twin.

    A 5-element integer series gets a random null mask and is cast to
    ``dtype``. The pandas repr has its null markers rewritten to ``null``,
    then the whitespace-split reprs are compared.

    Parameters
    ----------
    nrows
        Value for pandas' ``display.max_rows`` option (passed through
        ``int()``).
    dtype
        Dtype the series is cast to.
    """
    size = 5
    mask = utils.random_bitmask(size)
    data = cudf.Series(np.random.randint(0, 128, size))
    column = data.set_mask(mask)
    sr = cudf.Series(column).astype(dtype)
    ps = sr.to_pandas()

    # Scope the display option to this test: option_context restores the
    # previous value even when an exception is raised, so the setting does
    # not leak into other tests.
    with pd.option_context("display.max_rows", int(nrows)):
        psrepr = ps.__repr__()
        psrepr = psrepr.replace("NaN", "null")
        psrepr = psrepr.replace("NaT", "null")
        # presumably "-1" is how nulls surface after to_pandas() for some
        # dtypes -- TODO confirm
        psrepr = psrepr.replace("-1\n", "null\n")
        print(psrepr)
        print(sr)
        srrepr = sr.__repr__()
    assert psrepr.split() == srrepr.split()
Пример #7
0
def test_sum_masked(nelem):
    """Check that ``Series.sum`` on masked data matches a NumPy reference.

    The reference sums only the elements whose byte-expanded mask entry is
    true.
    """
    dtype = np.float64
    values = gen_rand(dtype, nelem)

    validity = utils.random_bitmask(nelem)
    expanded = utils.expand_bits_to_bytes(validity)[:nelem]
    n_nulls = utils.count_zero(expanded)

    masked_series = Series.from_masked_array(values, validity, n_nulls)
    got = masked_series.sum()

    # Reference: sum over the entries flagged true in the expanded mask.
    keep = np.asarray(expanded, dtype=np.bool_)[: values.size]
    expect = values[keep].sum()

    # A looser tolerance applies to float32.
    if dtype == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
Пример #8
0
def test_null_dataframe(ncols):
    """Check that a cudf DataFrame with nulls prints like its pandas twin.

    One 20-row column is built per entry of ``dtype_categories``, each with
    a random null mask. The pandas repr has its null markers rewritten to
    ``<NA>``, then the whitespace-split reprs are compared.

    Parameters
    ----------
    ncols
        Value for pandas' ``display.max_columns`` option (passed through
        ``int()``).
    """
    size = 20
    gdf = cudf.DataFrame()
    for dtype in dtype_categories:
        mask = utils.random_bitmask(size)
        data = cudf.Series(np.random.randint(0, 128, size))
        column = data.set_mask(mask)
        gdf[dtype] = cudf.Series(column).astype(dtype)
    pdf = gdf.to_pandas()

    # option_context restores the previous option value even when the
    # comparison below fails; a trailing reset_option call would be skipped
    # on failure and leave the option changed for later tests.
    with pd.option_context("display.max_columns", int(ncols)):
        pdfrepr = pdf.__repr__()
        for null_marker in ("NaN", "NaT", "None"):
            pdfrepr = pdfrepr.replace(null_marker, "<NA>")
        assert pdfrepr.split() == gdf.__repr__().split()
Пример #9
0
def test_onehot_masked():
    """One-hot encode a masked integer column and check every category.

    Entries whose mask byte is false are set to -1 in the reference array,
    so they match none of the categories 0..4.
    """
    np.random.seed(0)
    high = 5
    size = 100
    arr = np.random.randint(low=0, high=high, size=size)
    bitmask = utils.random_bitmask(size)
    expanded = utils.expand_bits_to_bytes(bitmask)[:size]
    valid = np.asarray(expanded, dtype=np.bool_)
    arr[~valid] = -1

    df = DataFrame()
    df['a'] = Series(arr).set_mask(bitmask)

    categories = list(range(high))
    out = df.one_hot_encoding('a', cats=categories, prefix='a',
                              dtype=np.int32)

    encoded_columns = ('a_0', 'a_1', 'a_2', 'a_3', 'a_4')
    assert tuple(out.columns) == ('a',) + encoded_columns
    # Each encoded column must be 1 exactly where the reference array holds
    # the matching category.
    for category, column in enumerate(encoded_columns):
        np.testing.assert_array_equal(out[column] == 1, arr == category)
Пример #10
0
def test_validity_ceil(nelem):
    """Check that ``Series.ceil`` on masked data matches a NumPy reference.

    The reference is ``np.ceil`` of the data, with a sentinel written
    wherever the byte-expanded mask is false; the cudf result is filled
    with the same sentinel before comparing.
    """
    # Data
    values = np.random.random(nelem) * 100
    validity = utils.random_bitmask(nelem)
    expanded = utils.expand_bits_to_bytes(validity)[:nelem]
    masked_series = Series.from_masked_array(values, validity)

    # Result
    ceiled = masked_series.ceil()

    na_value = -100000
    got = ceiled.fillna(na_value).to_array()

    # Reference: ceil where the mask is true, sentinel elsewhere.
    keep = np.asarray(expanded, dtype=np.bool_)[: values.size]
    expect = np.where(keep, np.ceil(values), na_value)

    # Check
    print("expect")
    print(expect)
    print("got")
    print(got)

    np.testing.assert_array_equal(expect, got)
Пример #11
0
def test_onehot_masked():
    """One-hot encode a masked integer column and check every category.

    Entries whose mask byte is false are set to -1 in the reference array,
    so they match none of the categories 0..4. Encoded columns are copied
    out with ``to_array()`` before being compared.
    """
    np.random.seed(0)
    high = 5
    size = 100
    arr = np.random.randint(low=0, high=high, size=size)
    bitmask = utils.random_bitmask(size)
    expanded = utils.expand_bits_to_bytes(bitmask)[:size]
    valid = np.asarray(expanded, dtype=np.bool_)
    arr[~valid] = -1

    df = DataFrame()
    df["a"] = Series(arr).set_mask(bitmask)

    categories = list(range(high))
    out = df.one_hot_encoding("a", cats=categories, prefix="a",
                              dtype=np.int32)

    encoded_columns = ("a_0", "a_1", "a_2", "a_3", "a_4")
    assert tuple(out.columns) == ("a",) + encoded_columns
    # Each encoded column must be 1 exactly where the reference array holds
    # the matching category.
    for category, column in enumerate(encoded_columns):
        hits = (out[column] == 1).to_array()
        np.testing.assert_array_equal(hits, arr == category)