Exemplo n.º 1
0
def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na):
    """Compare ``cudf.get_dummies`` with ``pd.get_dummies`` on a NaN column.

    ``nan_as_null`` is forwarded to ``DataFrame.from_pandas`` and
    ``dummy_na`` to both ``get_dummies`` implementations.
    """
    host_frame = pd.DataFrame({"a": [0, 1, np.nan]})
    device_frame = DataFrame.from_pandas(host_frame, nan_as_null=nan_as_null)

    expected = pd.get_dummies(host_frame, dummy_na=dummy_na, columns=["a"])
    got = cudf.get_dummies(device_frame, dummy_na=dummy_na, columns=["a"])

    if dummy_na and nan_as_null:
        # In this combination the cudf column ``a_null`` is renamed to the
        # pandas-side ``a_nan`` and the columns are put in pandas order.
        renamed = got.rename(columns={"a_null": "a_nan"})
        got = renamed[expected.columns]

    utils.assert_eq(expected, got)
Exemplo n.º 2
0
def test_from_pandas_with_index():
    """Round-trip a pandas frame with a custom index through cudf.

    The frame has an integer column, a float column containing a missing
    value, and a non-default integer index. Columns, index values and the
    whole frame (after ``to_pandas``) are compared with the source.
    """
    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    pdf = pdf.set_index(np.asarray([4, 3, 2, 1]))
    df = DataFrame.from_pandas(pdf)

    # Check columns
    np.testing.assert_array_equal(df.a.to_array(fillna="pandas"), pdf.a)
    np.testing.assert_array_equal(df.b.to_array(fillna="pandas"), pdf.b)
    # Check index
    np.testing.assert_array_equal(df.index.values, pdf.index.values)
    # Check again using pandas testing tool on frames.
    # ``pd.testing`` is the public location; the former ``pd.util.testing``
    # alias is deprecated and absent from recent pandas releases.
    pd.testing.assert_frame_equal(df.to_pandas(), pdf)
Exemplo n.º 3
0
def test_from_pandas_with_index():
    """Convert an explicitly indexed pandas frame and compare via assert_eq."""
    host = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    host = host.set_index(np.asarray([4, 3, 2, 1]))
    device = DataFrame.from_pandas(host)

    # Column-by-column comparison.
    for column in ("a", "b"):
        assert_eq(getattr(device, column), getattr(host, column))

    # Raw index values.
    assert_eq(device.index.values, host.index.values)

    # Finally compare the two frames as a whole.
    assert_eq(device, host)
Exemplo n.º 4
0
def test_groupby_std():
    """Group by ``x`` and compare the standard deviation with pandas.

    Both the key column and the value column contain a ``None`` entry.
    """
    host = pd.DataFrame(
        {
            "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2],
            "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9],
        }
    )
    device = DataFrame.from_pandas(host)

    expected = host.groupby("x").std()
    actual = device.groupby("x").std()

    assert_groupby_results_equal(expected, actual)
Exemplo n.º 5
0
def test_from_pandas_ex1():
    """Convert a small frame with one missing float and inspect the columns."""
    host = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    print(host)
    device = DataFrame.from_pandas(host)
    print(device)

    assert tuple(device.columns) == tuple(host.columns)
    assert np.all(device["a"].to_array() == host["a"])

    b_on_host = device["b"].to_array(fillna="pandas")
    elementwise = b_on_host == host["b"]
    # Position 2 compares NaN with NaN, which is never equal.
    assert np.all(elementwise == [True, True, False, True])
    assert np.isnan(b_on_host[2])
    assert np.isnan(host["b"][2])
Exemplo n.º 6
0
def test_dataframe_with_nulls_where_with_scalars(fill_value):
    """``where`` with a scalar replacement on a frame that contains ``None``.

    Entries whose value is not a multiple of three are replaced by
    ``fill_value``; the cudf result must equal the pandas result.
    """
    host = pd.DataFrame(
        {
            "A": [-1, 2, -3, None, 5, 6, -7, 0],
            "B": [4, -2, 3, None, 7, 6, 8, 0],
        }
    )
    device = DataFrame.from_pandas(host)

    host_keep = host % 3 == 0
    device_keep = device % 3 == 0

    expect = host.where(host_keep, fill_value)
    got = device.where(device_keep, fill_value)

    assert_eq(expect, got)
Exemplo n.º 7
0
def test_dataframe_clip(lower, upper, inplace):
    """Clip a mixed int/float frame and compare with ``pandas`` clipping.

    With ``inplace=True`` the clipped data is read from the cudf frame
    itself; otherwise from the value returned by ``clip``.
    """
    host = pd.DataFrame(
        {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]}
    )
    device = DataFrame.from_pandas(host)

    got = device.clip(lower=lower, upper=upper, inplace=inplace)
    expect = host.clip(lower=lower, upper=upper, axis=1)

    clipped = device if inplace is True else got
    assert_eq(expect, clipped)
Exemplo n.º 8
0
def test_categorical_index():
    """Categorical index built via ``from_pandas`` and via ``set_index``.

    One cudf frame is converted from an already-indexed pandas frame, the
    other is assembled column by column and indexed afterwards. Both must
    expose a ``CategoricalIndex`` that matches pandas, including the codes.
    """
    host = pd.DataFrame()
    host["a"] = [1, 2, 3]
    host["index"] = pd.Categorical(["a", "b", "c"])
    unindexed_device = DataFrame.from_pandas(host)
    host = host.set_index("index")
    converted = DataFrame.from_pandas(host)

    assembled = DataFrame()
    assembled["a"] = [1, 2, 3]
    assembled["index"] = pd.Categorical(["a", "b", "c"])
    # Before indexing, both cudf frames should carry the same default index.
    assert_eq(unindexed_device.index, assembled.index)
    assembled = assembled.set_index("index")

    for candidate in (converted, assembled):
        assert isinstance(candidate.index, CategoricalIndex)
        assert_eq(host, candidate)
        assert_eq(host.index, candidate.index)
        assert_eq(host.index.codes, candidate.index.codes.to_array())
Exemplo n.º 9
0
def test_dataframe_category_clip(lower, upper, inplace):
    """Clip a categorical cudf column and compare with a pandas string clip.

    The cudf side is cast to ``category`` before clipping and back to
    ``str`` before the comparison.
    """
    host = pd.DataFrame({"a": ["a", "b", "c", "d", "e"]})
    device = DataFrame.from_pandas(host)
    device["a"] = device["a"].astype("category")

    expect = host.clip(lower=lower, upper=upper)
    got = device.clip(lower=lower, upper=upper, inplace=inplace)

    clipped = device if inplace is True else got
    assert_eq(expect, clipped.astype("str"))
Exemplo n.º 10
0
def test_multiindex_clip(lower, upper, inplace):
    """Clip a two-level cudf index and compare with clipping a pandas frame."""
    host = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]})
    device = DataFrame.from_pandas(host)

    multi = device.set_index(["a", "b"]).index

    expected = host.clip(lower=lower, upper=upper, inplace=inplace, axis=1)
    got = multi.clip(lower=lower, upper=upper, inplace=inplace)

    if inplace is True:
        # In-place mode: compare the operands themselves rather than the
        # values returned by the two ``clip`` calls.
        assert_eq(host, multi.to_frame(index=False))
        return

    assert_eq(expected, got.to_frame(index=False))
Exemplo n.º 11
0
def test_safe_merging_with_left_empty():
    """Merge a zero-row left frame with a five-row right frame.

    Only the result lengths are compared between pandas and cudf.
    """
    import numpy as np
    from cudf import DataFrame
    import pandas as pd

    np.random.seed(0)

    left_names, right_names = "bcd", "b"
    host_left = pd.DataFrame()
    host_right = pd.DataFrame()
    for name in left_names:
        host_left[name] = np.random.randint(0, 10, 0)
    for name in right_names:
        host_right[name] = np.random.randint(0, 10, 5)
    device_left = DataFrame.from_pandas(host_left)
    device_right = DataFrame.from_pandas(host_right)

    host_merged = host_left.merge(host_right)
    device_merged = device_left.merge(device_right)
    # Simplify test because pandas does not consider empty Index and RangeIndex
    # to be equivalent. TODO: Allow empty Index objects to have equivalence.
    assert len(host_merged) == len(device_merged)
Exemplo n.º 12
0
    def test_rank_error_arguments(self):
        """Invalid ``method`` / ``na_option`` values must fail like pandas.

        Each invalid keyword combination is passed to both the pandas and
        the cudf ``rank`` through ``assert_exceptions_equal``.
        """
        pdf = pd.DataFrame(index=self.index)
        pdf["col1"] = self.col1
        pdf["col2"] = self.col2
        gdf = DataFrame.from_pandas(pdf)

        invalid_choices = (
            {"method": "randomname", "na_option": "keep"},
            {"method": "first", "na_option": "randomname"},
        )
        for choice in invalid_choices:
            rank_kwargs = dict(choice, ascending=True, pct=True)
            assert_exceptions_equal(
                lfunc=pdf["col1"].rank,
                rfunc=gdf["col1"].rank,
                # Each side receives its own copy of the keyword mapping.
                lfunc_args_and_kwargs=([], dict(rank_kwargs)),
                rfunc_args_and_kwargs=([], dict(rank_kwargs)),
            )
Exemplo n.º 13
0
def test_string_join_values_nulls():
    """Left-join on a string key where some keys have no partner.

    Unmatched left rows produce missing values in column ``c``; the sorted
    cudf result must equal the sorted pandas result.
    """
    left_rows = [
        ("MATCH 1", 1.0),
        ("MATCH 1", 1.0),
        ("LEFT NO MATCH 1", -1.0),
        ("MATCH 2", 2.0),
        ("MATCH 2", 2.0),
        ("MATCH 1", 1.0),
        ("MATCH 1", 1.0),
        ("MATCH 2", 2.0),
        ("MATCH 2", 2.0),
        ("LEFT NO MATCH 2", -2.0),
        ("MATCH 3", 3.0),
        ("MATCH 3", 3.0),
    ]
    right_rows = [
        ("RIGHT NO MATCH 1", -1.0),
        ("MATCH 3", 3.0),
        ("MATCH 2", 2.0),
        ("RIGHT NO MATCH 2", -2.0),
        ("RIGHT NO MATCH 3", -3.0),
        ("MATCH 1", 1.0),
    ]

    left_pdf = pd.DataFrame([{"b": key, "a": val} for key, val in left_rows])
    right_pdf = pd.DataFrame([{"b": key, "c": val} for key, val in right_rows])

    left_gdf = DataFrame.from_pandas(left_pdf)
    right_gdf = DataFrame.from_pandas(right_pdf)

    expect = left_pdf.merge(right_pdf, how="left", on="b")
    got = left_gdf.merge(right_gdf, how="left", on="b")

    # Row order is normalised on both sides before comparing.
    sort_keys = ["a", "b", "c"]
    expect = expect.sort_values(by=sort_keys).reset_index(drop=True)
    got = got.sort_values(by=sort_keys).reset_index(drop=True)

    assert_eq(expect, got)
Exemplo n.º 14
0
def test_dataframe_join_mismatch_cats(how):
    """Index join of two frames whose categorical keys only partly overlap."""
    left_host = pd.DataFrame(
        {
            "join_col": ["a", "b", "c", "d", "e"],
            "data_col_left": [10, 20, 30, 40, 50],
        }
    )
    right_host = pd.DataFrame(
        {"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]}
    )

    left_host["join_col"] = left_host["join_col"].astype("category")
    right_host["join_col"] = right_host["join_col"].astype("category")

    left_device = DataFrame.from_pandas(left_host).set_index("join_col")
    right_device = DataFrame.from_pandas(right_host).set_index("join_col")

    left_host = left_host.set_index("join_col")
    right_host = right_host.set_index("join_col")

    joined_device = left_device.join(
        right_device, how=how, sort=True, method="hash"
    )
    joined_host = left_host.join(right_host, how=how)

    got = joined_device.to_pandas()
    expect = joined_host.fillna(-1)  # note: cudf join doesn't mask NA

    # We yield a categorical here whereas pandas gives Object.
    expect.index = expect.index.astype("category")
    # cudf creates the columns in different order than pandas for right join
    if how == "right":
        got = got[["data_col_left", "data_col_right"]]

    # Bring the pandas data columns to int64 before comparing.
    for column in ("data_col_right", "data_col_left"):
        expect[column] = expect[column].astype(np.int64)

    assert_eq(expect, got)
Exemplo n.º 15
0
def test_dataframe_replace():
    """``DataFrame.replace`` parity with pandas.

    Covers scalar, categorical, list and dict forms of the arguments.
    """
    check = pd.testing.assert_frame_equal

    # numerical
    num_host = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]})
    num_device = DataFrame.from_pandas(num_host)
    check(num_device.replace(0, 4).to_pandas(), num_host.replace(0, 4))

    # categorical
    cat_host = pd.DataFrame(
        {
            "a": ["one", "two", "three"],
            "b": ["one", "two", "three"],
        },
        dtype="category",
    )
    cat_device = DataFrame.from_pandas(cat_host)
    check(
        cat_device.replace("two", "three").to_pandas(),
        cat_host.replace("two", "three"),
    )

    # list input
    check(
        num_device.replace([0, 1], [4, 5]).to_pandas(),
        num_host.replace([0, 1], [4, 5]),
    )
    check(
        num_device.replace([0, 1], 4).to_pandas(),
        num_host.replace([0, 1], 4),
    )

    # dict input:
    check(
        num_device.replace({"a": 0, "b": 0}, {"a": 4, "b": 5}).to_pandas(),
        num_host.replace({"a": 0, "b": 0}, {"a": 4, "b": 5}),
    )
    check(
        num_device.replace({"a": 0}, {"a": 4}).to_pandas(),
        num_host.replace({"a": 0}, {"a": 4}),
    )
Exemplo n.º 16
0
def test_query_with_index_keyword(query, a_val, b_val, c_val):
    """Run the same ``query`` string on a pandas and a cudf frame.

    ``a_val``, ``b_val`` and ``c_val`` are not referenced in the body;
    presumably the ``query`` strings refer to them as ``@a_val`` etc. and
    the query engines look them up in this function's locals -- verify
    against the parametrization. Do not rename them without checking.
    """
    pdf = pd.DataFrame({
        "a": [1, None, 3, 4, 5],
        "b": [5, 4, 3, 2, 1],
        "c": [12, 15, 17, 19, 27],
    })
    # NOTE(review): the return value of ``set_index`` is discarded, so
    # ``pdf`` keeps its original index here. Confirm whether
    # ``pdf = pdf.set_index("a")`` was intended.
    pdf.set_index("a")

    gdf = DataFrame.from_pandas(pdf)

    out = gdf.query(query)
    expect = pdf.query(query)

    assert_eq(out, expect)
Exemplo n.º 17
0
def test_multiindex_sample_basic(n, frac, replace, axis):
    """``sample`` on a cudf index built from a frame mirrors pandas.

    The pandas ``DataFrame.sample`` call is used as the oracle: if it
    raises, the cudf call must raise the same exception type with the same
    message; otherwise only the shapes of the two results are compared.
    """
    import re

    # as we currently don't support column with same name
    if axis == 1 and replace:
        return
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "float": [0.05, 0.2, 0.3, 0.2, 0.25],
            "int": [1, 3, 5, 4, 2],
        },
    )
    mul_index = cudf.Index(DataFrame.from_pandas(pdf))

    sample_kwargs = {
        "n": n,
        "frac": frac,
        "replace": replace,
        "random_state": 0,
        "axis": axis,
    }

    # ``Exception`` rather than ``BaseException``: KeyboardInterrupt,
    # SystemExit and pytest's own outcome exceptions must not be captured
    # as the "expected" pandas error.
    try:
        pout = pdf.sample(**sample_kwargs)
    except Exception as err:
        expected_error = err
    else:
        expected_error = None

    if expected_error is not None:
        # ``match`` is a regular expression, so the pandas message is
        # escaped to be compared literally.
        with pytest.raises(
            type(expected_error), match=re.escape(str(expected_error))
        ):
            mul_index.sample(**sample_kwargs)
        return

    gout = mul_index.sample(**sample_kwargs)
    assert pout.shape == gout.shape
Exemplo n.º 18
0
def test_from_pandas():
    """Column names, dtypes and lengths survive ``DataFrame.from_pandas``."""
    host = pd.DataFrame()
    host["a"] = np.arange(10, dtype=np.int32)
    host["b"] = np.arange(10, 20, dtype=np.float64)

    device = DataFrame.from_pandas(host)

    assert tuple(device.columns) == tuple(host.columns)

    names = ("a", "b")
    for name in names:
        assert device[name].dtype == host[name].dtype
    for name in names:
        assert len(device[name]) == len(host[name])
Exemplo n.º 19
0
def test_dataframe_pairs_of_triples(pairs, max, rows, how):
    """Merge two randomly filled frames without specifying the join keys.

    ``pairs[0]`` / ``pairs[1]`` are iterables of column names for the left
    and right frame; every column holds ``rows`` random integers in
    ``[0, max)``. When the frames share no column, both libraries must
    refuse the merge; otherwise the merged columns must agree after
    sorting.

    Note: ``max`` shadows the builtin but is kept because it is the
    parameter name callers and parametrization use.
    """
    np.random.seed(0)

    pdf_left = pd.DataFrame()
    pdf_right = pd.DataFrame()
    for left_column in pairs[0]:
        pdf_left[left_column] = np.random.randint(0, max, rows)
    for right_column in pairs[1]:
        pdf_right[right_column] = np.random.randint(0, max, rows)
    gdf_left = DataFrame.from_pandas(pdf_left)
    gdf_right = DataFrame.from_pandas(pdf_right)

    # A second branch used to re-test "no shared column" with a list
    # comprehension; it was equivalent to this check and therefore
    # unreachable, so it has been dropped.
    if not set(pdf_left.columns).intersection(pdf_right.columns):
        no_common = "No common columns to perform merge on"
        with pytest.raises(pd.core.reshape.merge.MergeError, match=no_common):
            pdf_left.merge(pdf_right)
        with pytest.raises(ValueError, match=no_common):
            gdf_left.merge(gdf_right)
        return

    pdf_result = pdf_left.merge(pdf_right, how=how)
    gdf_result = gdf_left.merge(gdf_right, how=how)
    assert np.array_equal(gdf_result.columns, pdf_result.columns)
    for column in gdf_result:
        # Missing values become -1 and rows are sorted so that the
        # comparison does not depend on row order.
        gdf_col_result_sorted = gdf_result[column].fillna(-1).sort_values()
        pd_col_result_sorted = pdf_result[column].fillna(-1).sort_values()
        assert np.array_equal(
            gdf_col_result_sorted.to_pandas().values,
            pd_col_result_sorted.values,
        )
Exemplo n.º 20
0
def test_index_join(lhs, rhs, how, level):
    """``Index.join`` parity between pandas and cudf.

    ``lhs`` and ``rhs`` select which columns form the left and right index.
    """
    left_host = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]})
    right_host = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4]})
    left_device = DataFrame.from_pandas(left_host)
    right_device = DataFrame.from_pandas(right_host)

    p_lhs = left_host.set_index(lhs).index
    p_rhs = right_host.set_index(rhs).index
    g_lhs = left_device.set_index(lhs).index
    g_rhs = right_device.set_index(rhs).index

    def normalised(joined_index):
        # Turn the joined index into a frame sorted by the left keys.
        as_frame = joined_index.to_frame(index=False)
        return as_frame.sort_values(by=lhs).reset_index(drop=True)

    expected = normalised(p_lhs.join(p_rhs, level=level, how=how))
    got = normalised(g_lhs.join(g_rhs, level=level, how=how))

    assert_eq(expected, got)
Exemplo n.º 21
0
 def test_rank_error_arguments(self):
     """Unknown ``method`` or ``na_option`` values must raise ``KeyError``."""
     pdf = pd.DataFrame(index=self.index)
     pdf["col1"] = self.col1
     pdf["col2"] = self.col2
     gdf = DataFrame.from_pandas(pdf)

     invalid_choices = (
         {"method": "randomname", "na_option": "keep"},
         {"method": "first", "na_option": "randomname"},
     )
     for choice in invalid_choices:
         with pytest.raises(KeyError):
             gdf["col1"].rank(ascending=True, pct=True, **choice)
Exemplo n.º 22
0
def test_groupby_std():
    """Compare the grouped standard deviation of ``y`` with pandas.

    Only the ``y`` values are compared, with the index reset on both sides.
    """
    host = pd.DataFrame(
        {
            "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2],
            "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9],
        }
    )
    device = DataFrame.from_pandas(host)

    host_std = host.groupby("x").std()
    device_std = device.groupby("x").std()

    # There's a lot left to add to python bindings like index name
    # so this is a temporary workaround
    expected = host_std["y"].reset_index(drop=True)
    actual = device_std["y"].reset_index(drop=True)
    assert_eq(expected, actual)
Exemplo n.º 23
0
def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype):
    """``melt`` parity with pandas, via the function and the method form.

    ``num_id_vars`` identifier columns (``id0``, ``id1``, ...) and
    ``num_value_vars`` value columns (``val0``, ...) are filled with random
    integers cast to ``dtype``. ``nulls`` selects how many entries become
    NaN: ``"some"`` (half of the rows), ``"all"``, or anything else for
    none. Combinations of nulls with a non-float ``dtype`` are skipped.
    """
    if dtype not in ["float32", "float64"] and nulls in ["some", "all"]:
        # The reason is passed positionally: the ``msg`` keyword of
        # ``pytest.skip`` is deprecated in favour of ``reason``.
        pytest.skip("nulls not supported in dtype: " + dtype)

    def random_column():
        # One column of random values with the requested share of NaNs.
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            idx = np.random.choice(
                num_rows, size=int(num_rows / 2), replace=False
            )
            data[idx] = np.nan
        elif nulls == "all":
            data[:] = np.nan
        return data

    id_vars = ["id" + str(i) for i in range(num_id_vars)]
    value_vars = ["val" + str(i) for i in range(num_value_vars)]

    # Identifier columns are generated before value columns, as before, so
    # the sequence of random draws is unchanged.
    pdf = pd.DataFrame()
    for colname in id_vars + value_vars:
        pdf[colname] = random_column()

    gdf = DataFrame.from_pandas(pdf)

    got = cudf_melt(frame=gdf, id_vars=id_vars, value_vars=value_vars)
    got_from_melt_method = gdf.melt(id_vars=id_vars, value_vars=value_vars)

    expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars)
    # pandas' melt makes the 'variable' column of 'object' type (string)
    # cuDF's melt makes it Categorical because it doesn't support strings
    expect["variable"] = expect["variable"].astype("category")

    assert_eq(expect, got)

    assert_eq(expect, got_from_melt_method)
Exemplo n.º 24
0
def test_string_index():
    """Assign string indexes of several container types to both frames.

    The same index object is set on the pandas and the cudf frame, which
    must then compare equal. Tried in turn: a list, a NumPy array, a
    ``StringIndex`` and a ``StringColumn``.
    """
    pdf = pd.DataFrame(np.random.rand(5, 5))
    gdf = DataFrame.from_pandas(pdf)

    def assign_and_compare(new_index):
        pdf.index = new_index
        gdf.index = new_index
        assert_eq(pdf, gdf)

    assign_and_compare(list("abcde"))
    assign_and_compare(np.array(list("abcde")))
    assign_and_compare(StringIndex(list("abcde"), name="name"))
    assign_and_compare(StringColumn(list("abcde"), name="name"))
Exemplo n.º 25
0
def test_dataframe_multi_column(num_cols, num_rows, dtype, ascending,
                                na_position):
    """Sort by the first ``num_cols`` of five random columns.

    The sort-key columns of the cudf and pandas results are compared after
    resetting the index.
    """
    np.random.seed(0)
    by = list(string.ascii_lowercase[:num_cols])

    pdf = pd.DataFrame()
    for colname in string.ascii_lowercase[:5]:
        pdf[colname] = np.random.randint(0, 26, num_rows).astype(dtype)

    gdf = DataFrame.from_pandas(pdf)

    got = gdf.sort_values(by, ascending=ascending, na_position=na_position)
    expect = pdf.sort_values(by, ascending=ascending, na_position=na_position)

    got_keys = got[by].reset_index(drop=True)
    expect_keys = expect[by].reset_index(drop=True)
    assert_eq(got_keys, expect_keys)
Exemplo n.º 26
0
def test_rank_all_arguments(
    pdf, dtype, ascending, method, na_option, pct, numeric_only
):
    """``rank`` parity with pandas over the full argument grid.

    Series results must always match. For whole frames only one argument
    combination is asserted equal; every other combination is expected to
    differ (see the pandas issue referenced below).
    """
    if dtype == "O" and method == "first":
        # not supported by pandas
        return

    pdf = pdf.copy(deep=True)  # for parallel pytest
    if numeric_only:
        pdf["str"] = np.array(list("abcde12345"))
    gdf = DataFrame.from_pandas(pdf)

    kwargs = dict(
        method=method,
        na_option=na_option,
        ascending=ascending,
        pct=pct,
        numeric_only=numeric_only,
    )

    # Series
    for column in ("col1", "col2"):
        assert_eq(gdf[column].rank(**kwargs), pdf[column].rank(**kwargs))
    if numeric_only:
        expect = pdf["str"].rank(**kwargs)
        got = gdf["str"].rank(**kwargs)
        assert expect.empty == got.empty

    # TODO: https://github.com/pandas-dev/pandas/issues/32593
    # Dataframe (bug in pandas)
    frames_should_match = (
        na_option == "top"
        and method == "first"
        and dtype != "O"
        and ascending
    )
    if frames_should_match:
        assert_eq(gdf.rank(**kwargs), pdf.rank(**kwargs))
    else:
        with pytest.raises(AssertionError, match="values are different"):
            assert_eq(gdf.rank(**kwargs), pdf.rank(**kwargs))
Exemplo n.º 27
0
    def test_rank_all_arguments(self, dtype, ascending, method, na_option,
                                pct):
        """Series ``rank`` parity with pandas for ``col1`` and ``col2``."""
        if method == "first" and dtype == "O":
            # not supported by pandas
            return

        pdf = pd.DataFrame(index=self.index)
        pdf["col1"] = self.col1.astype(dtype)
        pdf["col2"] = self.col2.astype(dtype)
        gdf = DataFrame.from_pandas(pdf)

        rank_kwargs = {
            "method": method,
            "na_option": na_option,
            "ascending": ascending,
            "pct": pct,
        }

        # Series
        for column in ("col1", "col2"):
            ranked_gs = gdf[column].rank(**rank_kwargs)
            ranked_ps = pdf[column].rank(**rank_kwargs)
            assert_eq(ranked_ps, ranked_gs.to_pandas())
Exemplo n.º 28
0
def test_issue_165():
    """Datetime equality in ``query`` and ``==`` for three scalar types.

    A six-day date column is compared against its first date, expressed in
    turn as ``datetime.datetime``, ``pd.Timestamp`` and ``np.datetime64``.
    For each type the cudf ``query`` result and the cudf boolean mask must
    match pandas and must select at least one row.

    The query strings refer to the locals ``start_date``,
    ``start_date_ts`` and ``start_date_np`` through ``@``; keep those
    names unchanged.
    """
    df_pandas = pd.DataFrame()
    start_date = dt.datetime.strptime("2000-10-21", "%Y-%m-%d")
    # Six consecutive days starting at start_date.
    data = [(start_date + dt.timedelta(days=x)) for x in range(6)]
    df_pandas["dates"] = data
    df_pandas["num"] = [1, 2, 3, 4, 5, 6]
    df_cudf = DataFrame.from_pandas(df_pandas)

    # --- datetime.datetime scalar ---
    base = df_pandas.query("dates==@start_date")
    test = df_cudf.query("dates==@start_date")
    assert_frame_equal(base, test.to_pandas())
    # Guard against a vacuous pass on two empty results.
    assert len(test) > 0

    mask = df_cudf.dates == start_date
    base_mask = df_pandas.dates == start_date
    assert_series_equal(mask.to_pandas(), base_mask, check_names=False)
    assert mask.to_pandas().sum() > 0

    # --- pd.Timestamp scalar ---
    start_date_ts = pd.Timestamp(start_date)
    test = df_cudf.query("dates==@start_date_ts")
    base = df_pandas.query("dates==@start_date_ts")
    assert_frame_equal(base, test.to_pandas())
    assert len(test) > 0

    mask = df_cudf.dates == start_date_ts
    base_mask = df_pandas.dates == start_date_ts
    assert_series_equal(mask.to_pandas(), base_mask, check_names=False)
    assert mask.to_pandas().sum() > 0

    # --- np.datetime64 scalar (nanosecond unit) ---
    start_date_np = np.datetime64(start_date_ts, "ns")
    test = df_cudf.query("dates==@start_date_np")
    base = df_pandas.query("dates==@start_date_np")
    assert_frame_equal(base, test.to_pandas())
    assert len(test) > 0

    mask = df_cudf.dates == start_date_np
    base_mask = df_pandas.dates == start_date_np
    assert_series_equal(mask.to_pandas(), base_mask, check_names=False)
    assert mask.to_pandas().sum() > 0
Exemplo n.º 29
0
def test_query_splitted_combine():
    """Querying two halves and concatenating equals querying the whole."""
    np.random.seed(0)
    host = pd.DataFrame(
        {
            "x": np.random.randint(0, 5, size=10),
            "y": np.random.normal(size=10),
        }
    )
    gdf = DataFrame.from_pandas(host)

    expr = "x > 2"

    # Query each half of the cudf frame separately.
    first_half, second_half = gdf[:5], gdf[5:]
    filtered_halves = [first_half.query(expr), second_half.query(expr)]

    # Stitch the partial results back together.
    got = cudf.concat(filtered_halves).to_pandas()

    # Reference: the same query on the unsplit cudf frame.
    expect = gdf.query(expr).to_pandas()
    assert_eq(got, expect)
Exemplo n.º 30
0
def test_string_index():
    """Assign string indexes of several container types to both frames.

    The same index object is set on the pandas and the cudf frame, which
    must then compare equal. Tried in turn: a list, a NumPy array, a
    ``StringIndex`` and an index built with ``as_index(as_column(...))``.
    """
    from cudf.core.column import as_column
    from cudf.core.index import as_index

    pdf = pd.DataFrame(np.random.rand(5, 5))
    gdf = DataFrame.from_pandas(pdf)

    def assign_and_compare(new_index):
        pdf.index = new_index
        gdf.index = new_index
        assert_eq(pdf, gdf)

    assign_and_compare(list("abcde"))
    assign_and_compare(np.array(list("abcde")))
    assign_and_compare(StringIndex(list("abcde"), name="name"))
    assign_and_compare(as_index(as_column(list("abcde")), name="name"))