示例#1
0
def test_operator_func_between_series_logical(
    dtype, func, scalar_a, scalar_b, fill_value
):
    """Check a logical binary operator on one-element cudf vs. pandas Series.

    ``func`` is the name of the Series method to call (looked up with
    ``getattr``); it is invoked as ``lhs.<func>(rhs, fill_value=fill_value)``
    on both the cudf Series and their pandas conversions, and the two results
    are compared.
    """
    lhs_gdf = Series([scalar_a]).astype(dtype)
    rhs_gdf = Series([scalar_b]).astype(dtype)
    lhs_pdf = lhs_gdf.to_pandas()
    rhs_pdf = rhs_gdf.to_pandas()

    got = getattr(lhs_gdf, func)(rhs_gdf, fill_value=fill_value)
    expect = getattr(lhs_pdf, func)(rhs_pdf, fill_value=fill_value)

    null_like = [None, np.nan]
    if scalar_a in null_like and scalar_b in null_like:
        # cudf binary operations return `None` when both the left- and
        # right-hand values are `None`, and `np.nan` when either side is
        # `np.nan`. Converting such a result gdf => pdf inside assert_eq
        # would therefore yield dtype='object' (all inputs are none), so the
        # nulls are filled first with the boolean the comparison should give.
        got.fillna(func == "ne", inplace=True)

    utils.assert_eq(expect, got)
示例#2
0
def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
    """Inner-join two frames whose integer key columns have different dtypes.

    The expected key column is built with the NumPy-promoted dtype of
    ``dtype_l`` and ``dtype_r``, and only the keys present on both sides
    (1 and 2) are expected in the result.
    """
    other_data = ["a", "b", "c"]

    join_data_l = Series([1, 2, 3], dtype=dtype_l)
    join_data_r = Series([1, 2, 4], dtype=dtype_r)

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    # np.find_common_type is deprecated since NumPy 1.25 and removed in 2.0;
    # np.result_type on the two dtypes yields the same promoted dtype.
    exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r))

    exp_join_data = [1, 2]
    exp_other_data = ["a", "b"]
    exp_join_col = Series(exp_join_data, dtype=exp_dtype)

    expect = DataFrame({
        "join_col": exp_join_col,
        "B_x": exp_other_data,
        "B_y": exp_other_data,
    })

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_eq(expect, got)
示例#3
0
def test_string_groupby_key_index():
    """Group by a string column and compare cudf's count() with pandas'.

    The comparison ignores dtypes (``check_dtype=False``).
    """
    keys = ["a", "b", "c", "d", "e"]
    values = [1, 2, 3, 4, 5]

    # pandas reference frame
    pdf = pd.DataFrame()
    pdf["a"] = pd.Series(keys, dtype="str")
    pdf["b"] = values

    # frame under test, built the same way
    gdf = DataFrame()
    gdf["a"] = Series(keys, dtype="str")
    gdf["b"] = values

    assert_eq(
        pdf.groupby("a").count(),
        gdf.groupby("a").count(),
        check_dtype=False,
    )
示例#4
0
def test_typecast_on_join_no_float_round():
    """Left-join an int8 key column against a float32 key column.

    The right-hand keys 4.01 and 4.99 must not match the left-hand keys 4
    and 5: the expected right-side values for those rows are ``None`` and
    the expected key column is float32.
    """
    labels = ["a", "b", "c", "d", "e"]

    left_keys = Series([1, 2, 3, 4, 5], dtype="int8")
    right_keys = Series([1, 2, 3, 4.01, 4.99], dtype="float32")

    gdf_l = DataFrame({"join_col": left_keys, "B": labels})
    gdf_r = DataFrame({"join_col": right_keys, "B": labels})

    expect = DataFrame({
        "join_col": Series([1, 2, 3, 4, 5], dtype="float32"),
        "B_x": ["a", "b", "c", "d", "e"],
        "B_y": ["a", "b", "c", None, None],
    })

    got = gdf_l.merge(gdf_r, on="join_col", how="left")

    assert_eq(expect, got)
示例#5
0
def test_typecast_on_join_indexes():
    """Inner-join two frames on indexes of different integer dtypes.

    The left index is int8 and the right index is int32; only the four
    shared keys are expected, with ``_x``/``_y`` suffixes on the overlapping
    column.
    """
    left_keys = Series([1, 2, 3, 4, 5], dtype="int8")
    right_keys = Series([1, 2, 3, 4, 6], dtype="int32")
    labels = ["a", "b", "c", "d", "e"]

    gdf_l = DataFrame({"join_col": left_keys, "B": labels}).set_index(
        "join_col"
    )
    gdf_r = DataFrame({"join_col": right_keys, "B": labels}).set_index(
        "join_col"
    )

    matched_labels = ["a", "b", "c", "d"]
    expect = DataFrame({
        "join_col": [1, 2, 3, 4],
        "B_x": matched_labels,
        "B_y": matched_labels,
    }).set_index("join_col")

    got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)
示例#6
0
文件: test_onehot.py 项目: vyasr/cudf
def test_onehot_generic_index():
    """One-hot encode a column whose Series carries a GenericIndex.

    Checks that the encoded frame has one ``fo_<k>`` column per category
    value 0..3 and that each column matches ``values == k``.
    """
    np.random.seed(0)
    size = 33
    # Draw order matters for reproducibility: index labels first, then values.
    indices = np.random.randint(low=0, high=100, size=size)
    values = np.random.randint(low=0, high=4, size=size)

    df = DataFrame()
    df["fo"] = Series(values, index=GenericIndex(indices))
    out = df.one_hot_encoding(
        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
    )

    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}
    for code in range(4):
        encoded = getattr(out, "fo_%d" % code)
        np.testing.assert_array_equal(values == code, encoded.to_array())
示例#7
0
def test_applymap(dtype):
    """Apply a cubing function through a column's applymap and compare
    the result with NumPy's ``**`` on the same data."""
    size = 500

    host_values = np.random.random(size).astype(dtype)
    column = Series(host_values)._column

    def cube(a):
        return a ** 3

    got = column.applymap(cube)
    expect = host_values ** 3

    np.testing.assert_almost_equal(expect, got.to_array())
示例#8
0
def test_string_split(data, pat, n, expand, expand_raise):
    """Compare ``str.split`` between pandas and cudf string Series.

    One specific whitespace-heavy input combined with ``pat=None`` is marked
    as an expected failure. The comparison runs inside the context manager
    returned by ``raise_builder([expand_raise], NotImplementedError)``.
    """
    whitespace_case = ["a b", " c ", "   d", "e   ", "f"]
    if data in (whitespace_case,) and pat is None:
        pytest.xfail("None pattern split algorithm not implemented yet")

    ps = pd.Series(data, dtype="str")
    gs = Series(data, dtype="str")

    with raise_builder([expand_raise], NotImplementedError):
        expect = ps.str.split(pat=pat, n=n, expand=expand)
        got = gs.str.split(pat=pat, n=n, expand=expand)

        assert_eq(expect, got)
示例#9
0
def array_to_series(array):
    """Convert a pyarrow array (or chunked array) into a ``Series``.

    A ``pa.ChunkedArray`` is converted chunk by chunk and the pieces are
    concatenated. For a plain array, the buffers returned by
    ``make_device_arrays`` are interpreted according to the arrow type:
    dictionary arrays become a ``CategoricalColumn``, string arrays are
    handed to ``nvstrings.from_offsets``, and anything else is sliced to the
    array's offset/length. If the array has nulls and the resulting Series
    does not already report a null mask, the mask is attached before
    returning.
    """

    if isinstance(array, pa.ChunkedArray):
        # Recurse on every chunk, then concatenate the per-chunk Series.
        return Series._concat(
            [array_to_series(chunk) for chunk in array.chunks])

    array_len = len(array)
    null_count = array.null_count
    buffers = make_device_arrays(array)
    # First buffer is treated as the validity mask, second as the data.
    mask, data = buffers[0], buffers[1]
    dtype = arrow_to_pandas_dtype(array.type)

    if pa.types.is_dictionary(array.type):
        from cudf.core.column import CategoricalColumn

        # Indices and dictionary are converted recursively; the codes' data
        # is combined with this array's mask and null count.
        codes = array_to_series(array.indices)
        categories = array_to_series(array.dictionary)
        data = CategoricalColumn(
            data=codes.data,
            mask=mask,
            null_count=null_count,
            categories=categories,
            ordered=array.type.ordered,
        )
    elif pa.types.is_string(array.type):
        import nvstrings

        # For strings, buffer 1 holds the offsets and buffer 2 the characters.
        offs, data = buffers[1], buffers[2]
        # array_len + 1 offsets are needed to delimit array_len strings.
        offs = offs[array.offset:array.offset + array_len + 1]
        # nvstrings is given raw pointer values (or None when a buffer is
        # absent) rather than the buffer objects themselves.
        data = None if data is None else data.device_ctypes_pointer.value
        mask = None if mask is None else mask.device_ctypes_pointer.value
        data = nvstrings.from_offsets(
            data,
            offs.device_ctypes_pointer.value,
            array_len,
            mask,
            null_count,
            # NOTE(review): meaning of this positional flag is not visible
            # here — confirm against the nvstrings.from_offsets signature.
            True,
        )
    elif data is not None:
        # Restrict the data buffer to the window this array covers.
        data = data[array.offset:array.offset + len(array)]

    series = Series(data, dtype=dtype)

    # Attach the mask only when there are nulls, a mask exists, and the
    # Series constructed above does not already have one.
    if null_count > 0 and mask is not None and not series.has_null_mask:
        return series.set_mask(mask, null_count)

    return series
示例#10
0
def test_product(dtype, nelem):
    """Compare ``Series.product()`` with NumPy's product of the same data.

    For signed-integer dtypes the data is all ones with up to 30 randomly
    chosen positions overwritten, which keeps the product small; for other
    dtypes the data comes from ``gen_rand``. Results are compared to 4
    significant digits for float32 and 6 otherwise.
    """
    if np.dtype(dtype).kind == "i":
        data = np.ones(nelem, dtype=dtype)
        # Set at most 30 items to [0..2) to keep the value within 2^32
        # (the float is truncated on assignment into the integer array).
        for _ in range(30):
            data[random.randrange(nelem)] = random.random() * 2
    else:
        data = gen_rand(dtype, nelem)

    sr = Series(data)

    got = sr.product()
    # np.product was a deprecated alias removed in NumPy 2.0; np.prod is the
    # supported spelling and computes the same value.
    expect = np.prod(data)

    significant = 4 if dtype == np.float32 else 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
示例#11
0
def test_exact_quantiles_int(int_method):
    """Compare exact quantiles of an integer Series with pandas.

    ``int_method`` is passed as the ``interpolation`` argument on both sides.
    """
    data = np.asarray([7, 0, 3, 4, 2, 1, -1, 1, 6])
    quantiles = [0.0, 0.25, 0.33, 0.5, 1.0]

    pdf = pd.DataFrame(data)
    gsr = Series(data)

    got = gsr.quantile(quantiles, interpolation=int_method, exact=True)
    expect = pdf.quantile(quantiles, interpolation=int_method)

    # pandas returns a frame; flatten it to line up with the Series values.
    expect_values = np.array(expect.values).T.flatten()
    np.testing.assert_allclose(
        got.to_pandas().values, expect_values, rtol=1e-10
    )
示例#12
0
def test_exact_quantiles(int_method):
    """Compare exact quantiles of a float Series with pandas.

    ``int_method`` is passed as the ``interpolation`` argument on both sides.
    """
    data = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7])
    quantiles = [0.0, 0.25, 0.33, 0.5, 1.0]

    pdf = pd.DataFrame(data)
    gsr = Series(data)

    got = gsr.quantile(quantiles, interpolation=int_method, exact=True)
    expect = pdf.quantile(quantiles, interpolation=int_method)

    # pandas returns a frame; flatten it to line up with the Series values.
    expect_values = np.array(expect.values).T.flatten()
    np.testing.assert_allclose(
        got.to_pandas().values, expect_values, rtol=1e-10
    )
示例#13
0
def test_applymap_python_lambda(dtype):
    """Run a membership-test lambda through ``Series.applymap`` and compare
    the output with ``np.isin`` on the same data."""
    size = 500

    host_values = np.random.random(size).astype(dtype)
    series = Series(host_values)

    # The list literal must stay inside the lambda. Referring to an outer
    # variable instead does NOT compile with numba, e.g.:
    #   test_list = [1, 2, 3, 4]
    #   series.applymap(lambda x: x in test_list)
    got = series.applymap(lambda x: x in [1, 2, 3, 4])

    expect = np.isin(host_values, [1, 2, 3, 4])

    np.testing.assert_almost_equal(expect, got.to_array())
示例#14
0
def test_datetime_scalar_timeunit_cast(timeunit):
    """Build Series and DataFrame columns from a ``np.datetime64`` scalar of
    the given time unit and compare cudf with pandas."""
    scalar = np.datetime64("2016-11-20", timeunit)

    # Scalar -> Series
    assert_eq(pd.Series(scalar), Series(scalar))

    # Scalar assigned as a whole column next to an integer column
    gdf = DataFrame()
    pdf = pd.DataFrame()
    for frame in (gdf, pdf):
        frame["a"] = np.arange(5)
        frame["b"] = scalar

    assert_eq(pdf, gdf)
示例#15
0
def test_string_replace_with_backrefs(find, replace):
    """Compare cudf's ``str.replace_with_backrefs`` with pandas' regex
    ``str.replace`` on data containing empty, null and non-ASCII strings."""
    strings = [
        "A543",
        "Z756",
        "",
        None,
        "tést-string",
        "two-thréé four-fivé",
        "abcd-éfgh",
        "tést-string-again",
    ]

    got = Series(strings).str.replace_with_backrefs(find, replace)
    expected = pd.Series(strings).str.replace(find, replace, regex=True)

    assert_eq(got, expected)
示例#16
0
def test_sum_of_squares(dtype, nelem):
    """Compare ``Series.sum_of_squares()`` with ``(data**2).sum()``.

    Signed-integer results are only checked when the expected value lies in
    ``[0, iinfo(dtype).max]``; otherwise the case is treated as an overflow
    and skipped with a printed note. Other dtypes are compared to the number
    of significant digits given by ``accuracy_for_dtype``.
    """
    data = gen_rand(dtype, nelem)

    got = Series(data).sum_of_squares()
    expect = (data**2).sum()

    if np.dtype(dtype).kind != "i":
        np.testing.assert_approx_equal(
            expect, got, significant=accuracy_for_dtype[dtype]
        )
        return

    in_range = 0 <= expect <= np.iinfo(dtype).max
    if not in_range:
        print("overflow, passing")
    else:
        np.testing.assert_array_almost_equal(expect, got)
示例#17
0
def test_strings_rsplit(data, n, expand):
    """Compare ``str.rsplit`` between pandas and cudf.

    The default-separator case is compared as frames after ``reset_index``
    (ignoring the index type); the ``","`` and ``"-"`` separators are
    compared with ``assert_eq``.
    """
    gs = Series(data)
    ps = pd.Series(data)

    expect_default = ps.str.rsplit(n=n, expand=expand).reset_index()
    got_default = (
        gs.str.rsplit(n=n, expand=expand).to_pandas().reset_index()
    )
    pd.testing.assert_frame_equal(
        expect_default, got_default, check_index_type=False
    )

    for separator in (",", "-"):
        assert_eq(
            ps.str.rsplit(separator, n=n, expand=expand),
            gs.str.rsplit(separator, n=n, expand=expand),
        )
示例#18
0
def test_string_slice_str(string, number, diff):
    """Compare ``str.slice`` between pandas and cudf for several
    start/stop/step combinations; ``step`` cases run only when ``diff``
    is non-zero."""
    pds = pd.Series(string)
    gds = Series(string)

    slice_kwargs = [
        {"start": number},
        {"stop": number},
        {},
        {"start": number, "stop": number + diff},
    ]
    if diff != 0:
        slice_kwargs.append({"step": diff})
        slice_kwargs.append(
            {"start": number, "stop": number + diff, "step": diff}
        )

    for kwargs in slice_kwargs:
        assert_eq(pds.str.slice(**kwargs), gds.str.slice(**kwargs))
示例#19
0
def test_string_groupby_key(str_data, num_keys):
    """Group by ``num_keys`` identical string columns and compare counts.

    Both results are sorted by the first key column and re-indexed before
    being compared without dtype checking.
    """
    values = [1, 2, 3, 4, 5][:len(str_data)]
    key_columns = range(num_keys)

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for column in key_columns:
        pdf[column] = pd.Series(str_data, dtype="str")
        gdf[column] = Series(str_data, dtype="str")
    pdf["a"] = values
    gdf["a"] = values

    expect = (
        pdf.groupby(list(key_columns), as_index=False)
        .count()
        .sort_values([0])
        .reset_index(drop=True)
    )
    got = (
        gdf.groupby(list(key_columns), as_index=False)
        .count()
        .sort_values([0])
        .reset_index(drop=True)
    )

    assert_eq(expect, got, check_dtype=False)
示例#20
0
def test_string_char_case(case_op, data):
    """Compare string case/character-class methods between cudf and pandas.

    ``case_op`` names the parametrized method; a fixed set of further
    methods is checked as well, and cudf's ``isempty`` is compared against
    ``ps == ""``.
    """
    gs = Series(data)
    ps = pd.Series(data)

    assert_eq(getattr(gs.str, case_op)(), getattr(ps.str, case_op)())

    always_checked = (
        "capitalize",
        "isdecimal",
        "isalnum",
        "isalpha",
        "isdigit",
        "isnumeric",
        "isspace",
    )
    for method in always_checked:
        assert_eq(getattr(gs.str, method)(), getattr(ps.str, method)())

    assert_eq(gs.str.isempty(), ps == "")
示例#21
0
def test_series_reductions_concurrency(method):
    """Run a Series reduction from many threads at once.

    ``method`` is the name of the reduction to call. The same Series is
    submitted 50 times to a 10-worker thread pool; each task adds 1 to the
    Series and calls the reduction (``std``/``var`` are called with
    ``ddof=1``). The test passes if no task raises.
    """
    from concurrent.futures import ThreadPoolExecutor

    np.random.seed(0)
    srs = [Series(np.random.random(10000)) for _ in range(1)]

    def call_test(sr):
        fn = getattr(sr, method)
        if method in ["std", "var"]:
            return fn(ddof=1)
        else:
            return fn()

    def f(sr):
        return call_test(sr + 1)

    # Use the executor as a context manager so its worker threads are shut
    # down when the test finishes, even if a task raises.
    with ThreadPoolExecutor(10) as e:
        # list() drains the iterator so any exception from a task surfaces.
        list(e.map(f, srs * 50))
示例#22
0
def test_product(dtype, nelem):
    """Compare ``Series.product()`` with NumPy's product of the same data.

    For integer dtypes (signed or unsigned) the data is all ones with up to
    30 randomly chosen positions overwritten, which keeps the product small;
    for other dtypes the data comes from ``gen_rand``. Results are compared
    to 4 significant digits for float32 and 6 otherwise.
    """
    np.random.seed(0)
    dtype = np.dtype(dtype).type
    if np.dtype(dtype).kind in {"u", "i"}:
        data = np.ones(nelem, dtype=dtype)
        # Set at most 30 items to [0..2) to keep the value within 2^32
        # (the float is truncated on assignment into the integer array).
        for _ in range(30):
            data[np.random.randint(low=0, high=nelem,
                                   size=1)] = (np.random.uniform() * 2)
    else:
        data = gen_rand(dtype, nelem)

    sr = Series(data)

    got = sr.product()
    # np.product was a deprecated alias removed in NumPy 2.0; np.prod is the
    # supported spelling and computes the same value.
    expect = np.prod(data)

    significant = 4 if dtype == np.float32 else 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
示例#23
0
def test_strings_filling_tests(data, width, fillchar):
    """Compare ``str.ljust`` and ``str.rjust`` between pandas and cudf."""
    gs = Series(data)
    ps = pd.Series(data)

    # TODO: uncomment .str.center tests once this
    # is fixed: https://github.com/rapidsai/cudf/issues/4354
    # as .str.center is nothing but .str.pad(side="both")
    # assert_eq(
    #     ps.str.center(width=width, fillchar=fillchar),
    #     gs.str.center(width=width, fillchar=fillchar),
    # )
    for justify in ("ljust", "rjust"):
        expect = getattr(ps.str, justify)(width=width, fillchar=fillchar)
        got = getattr(gs.str, justify)(width=width, fillchar=fillchar)
        assert_eq(expect, got)
示例#24
0
def test_string_str_rindex(data, sub, er):
    """Compare ``str.rindex`` between pandas and cudf.

    ``er`` is either ``None`` (no error expected; results are compared
    without dtype checking) or the exception type that both pandas and cudf
    are expected to raise for the given ``sub``.
    """
    ps = pd.Series(data)
    gs = Series(data)

    if er is None:
        # Success path: any unexpected exception propagates and fails the
        # test directly.
        assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False)
        return

    # Error path: exercise rindex itself (the method under test) on both
    # libraries and require the expected exception from each.
    for label, series in (("pandas", ps), ("cudf", gs)):
        try:
            series.str.rindex(sub)
        except er:
            pass
        else:
            raise AssertionError(
                "%s str.rindex did not raise %s" % (label, er.__name__)
            )
示例#25
0
def test_series_nlargest(data, n):
    """Indirectly tests Series.sort_values()

    Compares ``nlargest`` (default and ``keep="last"``) between cudf and
    pandas, then checks that an invalid ``keep`` value raises the same error
    on both sides.
    """
    sr = Series(data)
    psr = pd.Series(data)

    assert_eq(sr.nlargest(n), psr.nlargest(n))
    assert_eq(sr.nlargest(n, keep="last"), psr.nlargest(n, keep="last"))

    invalid_kwargs = {"n": 3, "keep": "what"}
    assert_exceptions_equal(
        lfunc=psr.nlargest,
        rfunc=sr.nlargest,
        # Each side gets its own fresh args list and kwargs dict.
        lfunc_args_and_kwargs=([], dict(invalid_kwargs)),
        rfunc_args_and_kwargs=([], dict(invalid_kwargs)),
        expected_error_message='keep must be either "first", "last"',
    )
示例#26
0
def test_string_replace_multi():
    """Check cudf's list-based ``str.replace`` against pandas.

    Covers three cases: two literal replacements in one call (compared with
    two chained pandas calls), a single regex pattern, and the same pattern
    with ``regex=False``.
    """
    # Two replacements at once vs. two chained pandas replacements.
    words = ["hello", "goodbye"]
    expect = pd.Series(words).str.replace("e", "E").str.replace("o", "O")
    got = Series(words).str.replace(["e", "o"], ["E", "O"])
    assert_eq(expect, got)

    # Single pattern treated as a regex.
    ps = pd.Series(["foo", "fuz", np.nan])
    gs = Series.from_pandas(ps)
    assert_eq(
        ps.str.replace("f.", "ba", regex=True),
        gs.str.replace(["f."], ["ba"], regex=True),
    )

    # Same pattern treated literally.
    ps = pd.Series(["f.o", "fuz", np.nan])
    gs = Series.from_pandas(ps)
    assert_eq(
        ps.str.replace("f.", "ba", regex=False),
        gs.str.replace(["f."], ["ba"], regex=False),
    )
示例#27
0
def test_string_groupby_non_key(str_data, num_cols, agg):
    """Group by a numeric column and aggregate ``num_cols`` string columns.

    ``agg`` is the name of the aggregation method called on the groupby
    object. Results are sorted by the key and re-indexed, then compared
    without dtype checking.
    """
    keys = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for column in range(num_cols):
        pdf[column] = pd.Series(str_data, dtype="str")
        gdf[column] = Series(str_data, dtype="str")
    pdf["a"] = keys
    gdf["a"] = keys

    expect = getattr(pdf.groupby("a", as_index=False), agg)()
    got = getattr(gdf.groupby("a", as_index=False), agg)()

    expect = expect.sort_values(["a"]).reset_index(drop=True)
    got = got.sort_values(["a"]).reset_index(drop=True)

    both_empty = len(expect) == 0 and len(got) == 0
    if agg in ["min", "max"] and both_empty:
        # For empty min/max results, cast the pandas string columns to "str"
        # before comparing.
        for column in range(num_cols):
            expect[column] = expect[column].astype("str")

    assert_eq(expect, got, check_dtype=False)
示例#28
0
def test_categorical_masking():
    """
    Test the common operation of selecting all rows that match a certain
    category.
    """
    categorical = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    pdsr = pd.Series(categorical)
    sr = Series(categorical)

    # Comparing against a scalar category must give the same boolean mask.
    expect_mask = pdsr == "a"
    got_mask = sr == "a"
    np.testing.assert_array_equal(expect_mask.values, got_mask.to_array())

    # Applying the mask must select the same rows, none of them null.
    expect_selected = pdsr[expect_mask]
    got_selected = sr[got_mask]

    assert len(expect_selected) == len(got_selected)
    assert len(expect_selected) == got_selected.valid_count
    assert_eq(got_selected, expect_selected)
示例#29
0
def test_categorical_integer():
    """Check codes, null count and string form of a categorical Series
    built from data containing values outside the declared categories."""
    categorical = pd.Categorical(
        ["a", "_", "_", "c", "a"], categories=["a", "b", "c"]
    )
    pdsr = pd.Series(categorical)
    sr = Series(categorical)

    # The two "_" entries are not in the categories, so they count as nulls.
    np.testing.assert_array_equal(
        categorical.codes, sr.to_array(fillna="pandas")
    )
    assert sr.null_count == 2

    pandas_codes = pdsr.cat.codes
    np.testing.assert_array_equal(
        pandas_codes.values, sr.cat.codes.fillna(-1).to_array()
    )
    np.testing.assert_equal(pandas_codes.dtype, sr.cat.codes.dtype)

    rendered = str(sr)
    expect_str = """
0 a
1 null
2 null
3 c
4 a
dtype: category
Categories (3, object): [a, b, c]
"""
    # Compare token by token so whitespace differences are ignored.
    assert rendered.split() == expect_str.split()
示例#30
0
def test_string_slice_replace(string, number, diff, repr):
    """Compare ``str.slice_replace`` between pandas and cudf for several
    start/stop/repl combinations; two of the cases skip dtype checking."""
    pds = pd.Series(string)
    gds = Series(string)

    # (arguments for slice_replace, extra arguments for assert_eq)
    cases = [
        ({"start": number, "repl": repr}, {"check_dtype": False}),
        ({"stop": number, "repl": repr}, {}),
        ({}, {}),
        ({"start": number, "stop": number + diff}, {}),
        (
            {"start": number, "stop": number + diff, "repl": repr},
            {"check_dtype": False},
        ),
    ]

    for call_kwargs, eq_kwargs in cases:
        assert_eq(
            pds.str.slice_replace(**call_kwargs),
            gds.str.slice_replace(**call_kwargs),
            **eq_kwargs
        )