Пример #1
0
def test_operator_func_series_and_scalar(dtype, func, has_nulls, fill_value):
    """Compare a named binary-operator method with pandas for a scalar operand.

    ``func`` is the name of the Series method to call (resolved with
    ``getattr``). It is invoked with the scalar ``59.0`` and ``fill_value`` on
    both the cudf Series and its pandas conversion, and the two results are
    compared. When ``has_nulls == 'some'`` the cudf Series is built with a
    random validity bitmask.
    """
    nelem = 1000
    arr = utils.gen_rand(dtype, nelem) * 10000
    scalar = 59.0

    if has_nulls == 'some':
        nulls = utils.random_bitmask(nelem)
        sr = Series.from_masked_array(arr, nulls)
    else:
        sr = Series(arr)

    psr = sr.to_pandas()

    expect = getattr(psr, func)(scalar, fill_value=fill_value)
    got = getattr(sr, func)(scalar, fill_value=fill_value)

    # This is being done because of the various gymnastics required to support
    # equality for null values. cudf.Series().to_pandas() replaces nulls with
    # None and so a bool Series becomes object Series. Which does not match the
    # output of equality op in pandas which remains a bool. Furthermore, NaN
    # values are treated as not comparable and always return False in a bool op
    # except in not-equal op where bool(NaN != NaN) gives True.
    #
    # ``np.bool_`` is used instead of the ``np.bool`` alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    if got.dtype == np.bool_:
        got = got.fillna(True) if func == 'ne' else got.fillna(False)

    utils.assert_eq(expect, got)
Пример #2
0
def test_fillna_dataframe(fill_type, inplace):
    """Compare ``DataFrame.fillna`` between cudf and pandas.

    ``fill_type`` selects the fill value: ``"scalar"``, ``"series"``, or (for
    any other value) a dict mixing a scalar and a Series. ``inplace`` is
    forwarded to the cudf ``fillna`` call only.
    """
    pdf = pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]})
    gdf = DataFrame.from_pandas(pdf)

    if fill_type == "scalar":
        pd_fill = 5
        cudf_fill = pd_fill
    elif fill_type == "series":
        pd_fill = pd.Series([3, 4, 5])
        cudf_fill = Series.from_pandas(pd_fill)
    else:
        pd_fill = {"a": 5, "b": pd.Series([3, 4, 5])}
        cudf_fill = {
            "a": pd_fill["a"],
            "b": Series.from_pandas(pd_fill["b"]),
        }

    # Workaround for https://github.com/pandas-dev/pandas/issues/27197:
    # when the fill value is a Series, build the expected frame one column
    # at a time instead of calling DataFrame.fillna directly.
    if not isinstance(pd_fill, pd.Series):
        expect = pdf.fillna(pd_fill)
    else:
        expect = pd.DataFrame()
        for name in pdf.columns:
            expect[name] = pdf[name].fillna(pd_fill)

    returned = gdf.fillna(cudf_fill, inplace=inplace)

    # With inplace=True the filled data lives in ``gdf`` itself.
    got = gdf if inplace else returned

    assert_eq(expect, got)
Пример #3
0
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge on an integer key where the right frame carries string data.

    The left keys are 1..5 and the right keys are taken from 6..10 (truncated
    to the length of ``str_data_nulls``), so the key sets are disjoint. The
    cudf merge result is compared with the pandas one.
    """
    left_vals = ['a', 'b', 'c', 'd', 'e']
    left_keys = [1, 2, 3, 4, 5]
    right_keys = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    # pandas frames
    pdf = pd.DataFrame()
    pdf['vals'] = pd.Series(left_vals, dtype='str')
    pdf['key'] = left_keys

    pdf2 = pd.DataFrame()
    pdf2['vals'] = pd.Series(str_data_nulls, dtype='str')
    pdf2['key'] = pd.Series(right_keys, dtype='int64')

    # cudf frames, built column by column in the same way
    gdf = DataFrame()
    gdf['vals'] = Series(left_vals, dtype='str')
    gdf['key'] = left_keys

    gdf2 = DataFrame()
    gdf2['vals'] = Series(str_data_nulls, dtype='str')
    gdf2['key'] = Series(right_keys, dtype='int64')

    expect = pdf.merge(pdf2, on='key', how='left')
    got = gdf.merge(gdf2, on='key', how='left')

    # For empty results, normalise the index and the column order before
    # comparing.
    both_empty = len(expect) == 0 and len(got) == 0
    if both_empty:
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
Пример #4
0
def test_categorical_compare_ordered():
    """Check ==, !=, < and > on ordered categorical Series against pandas."""
    first = pd.Categorical(
        ['a', 'a', 'b', 'c', 'a'],
        categories=['a', 'b', 'c'],
        ordered=True,
    )
    second = pd.Categorical(
        ['a', 'b', 'a', 'c', 'b'],
        categories=['a', 'b', 'c'],
        ordered=True,
    )
    pdsr1 = pd.Series(first)
    sr1 = Series(first)
    pdsr2 = pd.Series(second)
    sr2 = Series(second)

    # Equality with itself: boolean result, all True.
    same = sr1 == sr1
    assert same.dtype == np.bool_
    assert type(same[0]) == np.bool_
    assert np.all(same)
    assert np.all(pdsr1 == pdsr1)

    # Inequality with itself: nothing set.
    differs = sr1 != sr1
    assert not np.any(differs)
    assert not np.any(pdsr1 != pdsr1)

    # Both libraries must report the categorical as ordered.
    assert pdsr1.cat.ordered
    assert sr1.cat.ordered

    # Ordered comparison operators must agree element-wise.
    np.testing.assert_array_equal(pdsr1 < pdsr2, sr1 < sr2)
    np.testing.assert_array_equal(pdsr1 > pdsr2, sr1 > sr2)
Пример #5
0
def test_series_fillna_invalid_dtype(data_dtype):
    """Filling a ``data_dtype`` Series with 2.5 must raise a TypeError.

    The error message is matched against the expected "cannot safely cast"
    text, built from the type names of the fill value and of the Series.
    """
    gdf = Series([1, 2, None, 3], dtype=data_dtype)
    fill_value = 2.5

    with pytest.raises(TypeError) as raises:
        gdf.fillna(fill_value)

    fill_type_name = np.dtype(type(fill_value)).type.__name__
    series_type_name = gdf.dtype.type.__name__
    expected_message = "Cannot safely cast non-equivalent {} to {}".format(
        fill_type_name, series_type_name)
    raises.match(expected_message)
Пример #6
0
def test_validity_add(nelem):
    """Adding two masked Series must combine their validity masks.

    Two random float arrays with random bitmasks are added as cudf Series.
    The expected result is the NumPy sum with every position that is not
    valid in both inputs replaced by a sentinel; the cudf result has its
    nulls filled with the same sentinel before the comparison.
    """
    np.random.seed(0)
    # LHS
    lhs_data = np.random.random(nelem)
    lhs_mask = utils.random_bitmask(nelem)
    lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
    lhs_null_count = utils.count_zero(lhs_bitmask)
    assert lhs_null_count >= 0
    lhs = Series.from_masked_array(lhs_data, lhs_mask)
    assert lhs.null_count == lhs_null_count
    # RHS
    rhs_data = np.random.random(nelem)
    rhs_mask = utils.random_bitmask(nelem)
    rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
    rhs_null_count = utils.count_zero(rhs_bitmask)
    assert rhs_null_count >= 0
    rhs = Series.from_masked_array(rhs_data, rhs_mask)
    assert rhs.null_count == rhs_null_count
    # Result
    res = lhs + rhs
    # ``np.bool_`` replaces the ``np.bool`` alias, which was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    res_mask = np.asarray(utils.expand_bits_to_bytes(lhs_mask & rhs_mask),
                          dtype=np.bool_)[:nelem]
    # Fill NA values
    na_value = -10000
    got = res.fillna(na_value).to_array()
    expect = lhs_data + rhs_data
    expect[~res_mask] = na_value
    # Check
    print('expect')
    print(expect)
    print('got')
    print(got)

    np.testing.assert_array_equal(expect, got)
Пример #7
0
def test_series_unique():
    """``unique``/``nunique`` must ignore masked-out entries.

    For sizes 1, 10, ..., 10**4, random integers in [-1, 10) are generated and
    every -1 is masked out; the remaining distinct values must equal the
    Series' unique values, and their count must equal ``nunique``.
    """
    for exponent in range(5):
        size = 10 ** exponent
        values = np.random.randint(low=-1, high=10, size=size)
        valid = values != -1
        sr = Series.from_masked_array(values, Series(valid).as_mask())
        assert set(values[valid]) == set(sr.unique().to_array())
        assert len(set(values[valid])) == sr.nunique()
Пример #8
0
def test_categorical_basic():
    """Check codes, dtype, ``.cat`` attributes and repr of a categorical Series."""
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    cudf_cat = as_index(cat)

    pdsr = pd.Series(cat)
    sr = Series(cat)
    np.testing.assert_array_equal(cat.codes, sr.to_array())
    assert sr.dtype == pdsr.dtype

    # Categorical accessor attributes must agree with pandas.
    pd_accessor = pdsr.cat
    assert tuple(pd_accessor.categories) == tuple(sr.cat.categories)
    assert pd_accessor.ordered == sr.cat.ordered

    pd_codes = pdsr.cat.codes
    np.testing.assert_array_equal(pd_codes.values, sr.cat.codes.to_array())
    np.testing.assert_array_equal(pd_codes.dtype, sr.cat.codes.dtype)

    # Compare the printed form token by token; zip stops at the shorter of
    # the two token lists.
    expect_str = """
0 a
1 a
2 b
3 c
4 a
"""
    got_tokens = str(sr).split()
    expect_tokens = expect_str.split()
    assert all(x == y for x, y in zip(got_tokens, expect_tokens))
    assert_eq(cat.codes, cudf_cat.codes.to_array())
Пример #9
0
def test_string_astype(dtype):
    """Cast a string Series to ``dtype`` and compare with pandas.

    The sample strings are chosen from the prefix of ``dtype`` so that they
    parse as that type.
    """
    # Checked in order; the first matching prefix wins.
    samples_by_prefix = (
        ("int", ["1", "2", "3", "4", "5"]),
        ("float", ["1.0", "2.0", "3.0", "4.0", "5.0"]),
        ("bool", ["True", "False", "True", "False", "False"]),
        (
            "datetime64",
            [
                "2019-06-04T00:00:00Z",
                "2019-06-04T12:12:12Z",
                "2019-06-03T00:00:00Z",
                "2019-05-04T00:00:00Z",
                "2018-06-04T00:00:00Z",
            ],
        ),
    )
    for prefix, samples in samples_by_prefix:
        if dtype.startswith(prefix):
            data = samples
            break
    else:
        if dtype in ("str", "object"):
            data = ["ab", "cd", "ef", "gh", "ij"]

    ps = pd.Series(data)
    gs = Series(data)

    # Pandas str --> bool typecasting always returns True if there's a string,
    # so the boolean expectation is built by comparing against "True".
    if not dtype.startswith("bool"):
        expect = ps.astype(dtype)
    else:
        expect = ps == "True"
    got = gs.astype(dtype)

    assert_eq(expect, got)
Пример #10
0
def test_reflected_ops_scalar(func, dtype, obj_class):
    """Apply ``func`` to a cudf object and to pandas data; results must be close.

    ``obj_class == 'Index'`` makes the cudf operand an index built with
    ``as_index`` instead of a Series.
    """
    import pandas as pd

    # Seeded random input in [10, 11).
    np.random.seed(12)
    random_series = pd.Series(np.random.sample(100) + 10, dtype=dtype)

    # GPU counterpart of the same data.
    gs = Series(random_series)

    use_index = obj_class == 'Index'
    if use_index:
        gs = as_index(gs)

    gs_result = func(gs)

    if use_index:
        gs = Series(gs)

    # Reference result from pandas.
    ps_result = func(random_series)

    np.testing.assert_allclose(ps_result, gs_result)
Пример #11
0
def test_generic_ptx(dtype):
    """Run a numba-compiled device function through ``binops.apply_op_udf``.

    ``a**3 + b`` is compiled for ``dtype``, its PTX is handed to
    ``apply_op_udf`` together with two random columns, and the output is
    compared with the same expression evaluated by NumPy.
    """
    size = 500

    lhs_arr = np.random.random(size).astype(dtype)
    lhs_col = Series(lhs_arr)._column

    rhs_arr = np.random.random(size).astype(dtype)
    rhs_col = Series(rhs_arr)._column

    @numba.cuda.jit(device=True)
    def generic_function(a, b):
        return a**3 + b

    nb_type = numba.numpy_support.from_dtype(np.dtype(dtype))
    type_signature = (nb_type, nb_type)

    compiled = generic_function.compile(type_signature)
    ptx_bytes = generic_function.inspect_ptx(type_signature)
    ptx_code = ptx_bytes.decode("utf-8")

    # The output dtype comes from the return type of the compiled signature.
    return_type = compiled.signature.return_type
    output_type = numba.numpy_support.as_dtype(return_type)

    out_col = binops.apply_op_udf(lhs_col, rhs_col, ptx_code, output_type.type)

    expect = lhs_arr**3 + rhs_arr

    np.testing.assert_almost_equal(expect, out_col)
Пример #12
0
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge two frames on ``key`` and compare cudf with pandas.

    The right-hand frame takes its string column from ``str_data_nulls`` and
    its keys from 6..10, which never overlap the left-hand keys 1..5.
    """
    str_data = ["a", "b", "c", "d", "e"]
    other_data = [1, 2, 3, 4, 5]
    n_right = len(str_data_nulls)
    other_data_nulls = [6, 7, 8, 9, 10][:n_right]

    # Left side: same columns assigned to a pandas and a cudf frame.
    pdf, gdf = pd.DataFrame(), DataFrame()
    pdf["vals"] = pd.Series(str_data, dtype="str")
    gdf["vals"] = Series(str_data, dtype="str")
    pdf["key"] = other_data
    gdf["key"] = other_data

    # Right side: string values supplied by the caller.
    pdf2, gdf2 = pd.DataFrame(), DataFrame()
    pdf2["vals"] = pd.Series(str_data_nulls, dtype="str")
    gdf2["vals"] = Series(str_data_nulls, dtype="str")
    pdf2["key"] = pd.Series(other_data_nulls, dtype="int64")
    gdf2["key"] = Series(other_data_nulls, dtype="int64")

    merge_kwargs = {"on": "key", "how": "left"}
    expect = pdf.merge(pdf2, **merge_kwargs)
    got = gdf.merge(gdf2, **merge_kwargs)

    # Empty results: reset the pandas index and align the column order.
    if len(expect) == 0 and len(got) == 0:
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
Пример #13
0
def test_string_equality():
    """String ``==`` against another Series and against a scalar.

    Nulls in the cudf result are filled with False before comparing with
    pandas in the first two checks.
    """
    left = ["b", "c", "d", "a", "c"]
    right = ["a", None, "c", "a", "c"]

    ps_left = pd.Series(left)
    ps_right = pd.Series(right)
    gs_left = Series(left)
    gs_right = Series(right)

    # Series == Series (the right-hand side contains a null).
    pd_result = ps_left == ps_right
    cudf_result = gs_left == gs_right
    assert_eq(pd_result, cudf_result.fillna(False))

    # Series == scalar that matches nothing.
    pd_result = ps_left == "m"
    cudf_result = gs_left == "m"
    assert_eq(pd_result, cudf_result.fillna(False))

    # Single-element Series == scalar, compared without null filling.
    ps_single = pd.Series(["a"])
    gs_single = Series(["a"])

    pd_result = ps_single == "m"
    cudf_result = gs_single == "m"
    assert_eq(pd_result, cudf_result)
Пример #14
0
def test_series_groupby_agg(agg):
    """``Series.groupby(...).agg(agg)`` must match pandas.

    The dtype check is skipped for the ``'count'`` aggregation.
    """
    values = [1, 2, 3]
    pd_series = pd.Series(values)
    cudf_series = Series(values)

    pd_result = pd_series.groupby(pd_series // 2).agg(agg)
    cudf_result = cudf_series.groupby(cudf_series // 2).agg(agg)

    assert_eq(pd_result, cudf_result, check_dtype=(agg != 'count'))
Пример #15
0
def test_dataframe_setitem_from_masked_object():
    """NaN/None inputs must produce a null mask with the right null count.

    Covers construction from a host array, from a pandas DataFrame, from a
    device array, and column assignment from a list containing None.
    """
    n_total, n_missing = 100, 20
    ary = np.random.randn(n_total)
    mask = np.zeros(n_total, dtype=bool)
    mask[:n_missing] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan

    # Host array -> Series
    from_host = Series(ary)
    assert from_host.has_null_mask
    assert from_host.null_count == 20

    # pandas DataFrame -> cudf DataFrame
    from_pandas = DataFrame.from_pandas(pd.DataFrame({'a': ary}))
    assert from_pandas['a'].has_null_mask
    assert from_pandas['a'].null_count == 20

    # Device array -> Series
    from_device = Series(rmm.to_device(ary))
    assert from_device.has_null_mask
    assert from_device.null_count == 20

    # List with None assigned as a column
    frame = DataFrame()
    frame['lst'] = [1, 2, None, 4, 5, 6, None, 8, 9]
    assert frame['lst'].has_null_mask
    assert frame['lst'].null_count == 2
Пример #16
0
def test_categorical_compare_ordered(data):
    """Comparison operators on ``dgd.from_cudf`` categorical series vs pandas.

    ``data`` supplies two categoricals (indices 0 and 1); each cudf Series is
    wrapped with ``dgd.from_cudf`` using two partitions.
    """
    cat1, cat2 = data[0], data[1]
    pdsr1 = pd.Series(cat1)
    pdsr2 = pd.Series(cat2)
    sr1 = Series(cat1)
    sr2 = Series(cat2)
    dsr1 = dgd.from_cudf(sr1, npartitions=2)
    dsr2 = dgd.from_cudf(sr2, npartitions=2)

    # A series equals itself everywhere and the result is boolean.
    equal = dsr1 == dsr1
    assert equal.dtype == np.bool_
    assert np.all(equal.compute().to_array())
    assert np.all(pdsr1 == pdsr1)

    # ...and differs from itself nowhere.
    unequal = dsr1 != dsr1
    assert not np.any(unequal.compute().to_array())
    assert not np.any(pdsr1 != pdsr1)

    assert dsr1.cat.ordered
    assert pdsr1.cat.ordered

    # Ordered operators, computed before comparing with pandas.
    less = (dsr1 < dsr2).compute()
    np.testing.assert_array_equal(pdsr1 < pdsr2, less)
    greater = (dsr1 > dsr2).compute()
    np.testing.assert_array_equal(pdsr1 > pdsr2, greater)
Пример #17
0
def test_string_numeric_astype(dtype):
    """Cast bool/int/float/datetime Series to ``"str"`` and compare with pandas."""
    is_datetime = dtype.startswith("datetime64")

    if dtype.startswith("bool"):
        data = [1, 0, 1, 0, 1]
    elif dtype.startswith("int"):
        data = [1, 2, 3, 4, 5]
    elif dtype.startswith("float"):
        data = [1.0, 2.0, 3.0, 4.0, 5.0]
    elif is_datetime:
        data = [1000000000, 2000000000, 3000000000, 4000000000, 5000000000]

    if is_datetime:
        ps = pd.Series(data, dtype="datetime64[ns]")
        gs = Series.from_pandas(ps)
        # Pandas datetime64 --> str typecasting returns arbitrary format
        # depending on the data, so making it consistent unless we choose to
        # match the behavior
        expect = ps.dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    else:
        ps = pd.Series(data, dtype=dtype)
        gs = Series(data, dtype=dtype)
        expect = ps.astype("str")

    got = gs.astype("str")

    assert_eq(expect, got)
Пример #18
0
def test_series_std(ddof):
    """``Series.std(ddof=...)`` must approximately equal the pandas value."""
    np.random.seed(0)
    samples = np.random.random(100) - 0.5
    sr = Series(samples)
    # Named ``psr`` so the local does not shadow the ``pd`` module alias.
    psr = sr.to_pandas()
    got = sr.std(ddof=ddof)
    expect = psr.std(ddof=ddof)
    np.testing.assert_approx_equal(expect, got)
Пример #19
0
def test_max(dtype, nelem):
    """``Series.max`` must equal the NumPy maximum converted with ``dtype``."""
    host_values = gen_rand(dtype, nelem)
    series = Series(host_values)

    got = series.max()
    expect = dtype(host_values.max())

    assert expect == got
Пример #20
0
def test_typecast_to_from_datetime(data, from_dtype, to_dtype):
    """Round-trip ``from_dtype -> to_dtype -> from_dtype`` must match NumPy."""
    host = data.astype(from_dtype)
    device = Series(host)

    host_roundtrip = host.astype(to_dtype).astype(from_dtype)
    device_roundtrip = device.astype(to_dtype).astype(from_dtype)

    np.testing.assert_equal(host_roundtrip, np.array(device_roundtrip))
Пример #21
0
def test_series_compare(cmpop):
    """``cmpop`` on Series pairs must match the same operator on the arrays."""
    arr1 = np.random.random(100)
    arr2 = np.random.random(100)
    sr1 = Series(arr1)
    sr2 = Series(arr2)

    # (series operands, matching array operands), checked in this order.
    cases = (
        ((sr1, sr1), (arr1, arr1)),
        ((sr2, sr2), (arr2, arr2)),
        ((sr1, sr2), (arr1, arr2)),
    )
    for (sr_left, sr_right), (arr_left, arr_right) in cases:
        np.testing.assert_equal(
            cmpop(sr_left, sr_right).to_array(),
            cmpop(arr_left, arr_right),
        )
Пример #22
0
def test_series_groupby(agg):
    """Calling the aggregation named ``agg`` on a grouped Series matches pandas."""
    values = [1, 2, 3]
    pd_series = pd.Series(values)
    cudf_series = Series(values)

    pd_grouped = pd_series.groupby(pd_series // 2)
    cudf_grouped = cudf_series.groupby(cudf_series // 2)

    pd_result = getattr(pd_grouped, agg)()
    cudf_result = getattr(cudf_grouped, agg)()

    assert_eq(pd_result, cudf_result)
Пример #23
0
def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype):
    """Casting a column from ``from_dtype`` to ``to_dtype`` must match NumPy.

    The cast is done on the Series' underlying ``_column``.
    """
    host = data.astype(from_dtype)
    column = Series(host)._column

    host_casted = host.astype(to_dtype)
    column_casted = column.astype(to_dtype)

    np.testing.assert_equal(host_casted, column_casted.to_array())
Пример #24
0
    def func(index):
        """Apply ``binop`` to an int32 cast and the original float data.

        The cudf result must match NumPy to five decimals. ``index`` is not
        used in the body.
        """
        values = np.random.random(100) * 10
        series = Series(values)

        got = binop(series.astype('int32'), series)
        reference = binop(values.astype('int32'), values)

        np.testing.assert_almost_equal(got.to_array(), reference, decimal=5)
Пример #25
0
def test_typecast_to_datetime(data, dtype):
    """Casting ``dtype`` data to ``datetime64[ms]`` must match NumPy."""
    target = 'datetime64[ms]'
    host = data.astype(dtype)
    device = Series(host)

    host_casted = host.astype(target)
    device_casted = device.astype(target)

    np.testing.assert_equal(host_casted, np.array(device_casted))
Пример #26
0
def test_sum(dtype, nelem):
    """``Series.sum`` must approximately equal the NumPy sum.

    Four significant digits are required for float32, six otherwise.
    """
    host_values = gen_rand(dtype, nelem)
    series = Series(host_values)

    got = series.sum()
    expect = dtype(host_values.sum())

    if dtype == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
Пример #27
0
def test_misc_quantiles(data, q):
    """``Series.quantile(q)`` must match pandas for the given ``data``."""
    from cudf.tests import utils

    pandas_series = pd.Series(data)
    cudf_series = Series(data)

    expected = pandas_series.quantile(q)
    actual = cudf_series.quantile(q)

    utils.assert_eq(expected, actual)
Пример #28
0
def test_series_sort_index(nelem, asc):
    """Sorting by value and then by index must restore the original order.

    With ``asc`` false the index sort is descending, so the result is
    reversed before being compared with the original data.
    """
    np.random.seed(0)
    sr = Series(100 * np.random.random(nelem))
    original = sr.to_array()

    by_value = sr.sort_values()
    resorted = by_value.sort_index(ascending=asc).to_array()

    # Undo the descending order so both arrays run in the same direction.
    got = resorted if asc else resorted[::-1]
    np.testing.assert_array_equal(original, got)
Пример #29
0
def test_string_unique(item):
    """``Series.unique`` on strings must match the sorted pandas uniques."""
    ps = pd.Series(item)
    gs = Series(item)

    # Pandas `unique` returns a numpy array, so wrap it in a Series again.
    pandas_unique = pd.Series(ps.unique())
    # Nvstrings returns sorted unique with `None` placed before other strings,
    # so sort the pandas result the same way and renumber its index.
    pandas_sorted = pandas_unique.sort_values(na_position="first")
    pres = pandas_sorted.reset_index(drop=True)

    gres = gs.unique()
    assert_eq(pres, gres)
Пример #30
0
def test_string_empty_astype(dtype):
    """Casting an empty string Series to ``dtype`` must match pandas."""
    empty = []
    pandas_series = pd.Series(empty, dtype="str")
    cudf_series = Series(empty, dtype="str")

    expect = pandas_series.astype(dtype)
    got = cudf_series.astype(dtype)

    assert_eq(expect, got)