예제 #1
0
def test_fillna_dict_series():
    frame_data = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    df_equals(modin_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5}))

    df_equals(
        modin_df.fillna({
            "a": 0,
            "b": 5,
            "d": 7
        }),
        df.fillna({
            "a": 0,
            "b": 5,
            "d": 7
        }),
    )

    # Series treated same as dict
    df_equals(modin_df.fillna(modin_df.max()), df.fillna(df.max()))
예제 #2
0
def test_reindex_multiindex():
    data1, data2 = np.random.randint(1, 20, (5, 5)), np.random.randint(10, 25, 6)
    index = np.array(["AUD", "BRL", "CAD", "EUR", "INR"])
    modin_midx = pd.MultiIndex.from_product(
        [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]], names=["Bank", "Curency"]
    )
    pandas_midx = pandas.MultiIndex.from_product(
        [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]], names=["Bank", "Curency"]
    )
    modin_df1, modin_df2 = (
        pd.DataFrame(data=data1, index=index, columns=index),
        pd.DataFrame(data2, modin_midx),
    )
    pandas_df1, pandas_df2 = (
        pandas.DataFrame(data=data1, index=index, columns=index),
        pandas.DataFrame(data2, pandas_midx),
    )
    modin_df2.columns, pandas_df2.columns = ["Notional"], ["Notional"]
    md_midx = pd.MultiIndex.from_product([modin_df2.index.levels[0], modin_df1.index])
    pd_midx = pandas.MultiIndex.from_product(
        [pandas_df2.index.levels[0], pandas_df1.index]
    )
    # reindex without axis, index, or columns
    modin_result = modin_df1.reindex(md_midx, fill_value=0)
    pandas_result = pandas_df1.reindex(pd_midx, fill_value=0)
    df_equals(modin_result, pandas_result)
    # reindex with only axis
    modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0)
    pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0)
    df_equals(modin_result, pandas_result)
    # reindex with axis and level
    modin_result = modin_df1.reindex(md_midx, fill_value=0, axis=0, level=0)
    pandas_result = pandas_df1.reindex(pd_midx, fill_value=0, axis=0, level=0)
    df_equals(modin_result, pandas_result)
예제 #3
0
def test___getitem__(data):
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    key = modin_df.columns[0]
    modin_col = modin_df.__getitem__(key)
    assert isinstance(modin_col, pd.Series)

    pd_col = pandas_df[key]
    df_equals(pd_col, modin_col)

    slices = [
        (None, -1),
        (-1, None),
        (1, 2),
        (1, None),
        (None, 1),
        (1, -1),
        (-3, -1),
        (1, -1, 2),
    ]

    # slice test
    for slice_param in slices:
        s = slice(*slice_param)
        df_equals(modin_df[s], pandas_df[s])

    # Test empty
    df_equals(pd.DataFrame([])[:10], pandas.DataFrame([])[:10])
예제 #4
0
def test_indexing_duplicate_axis(data):
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    modin_df.index = pandas_df.index = [i // 3 for i in range(len(modin_df))]
    assert any(modin_df.index.duplicated())
    assert any(pandas_df.index.duplicated())

    df_equals(modin_df.iloc[0], pandas_df.iloc[0])
    df_equals(modin_df.loc[0], pandas_df.loc[0])
    df_equals(modin_df.iloc[0, 0:4], pandas_df.iloc[0, 0:4])
    df_equals(
        modin_df.loc[0, modin_df.columns[0:4]],
        pandas_df.loc[0, pandas_df.columns[0:4]],
    )
예제 #5
0
def test_export_indivisible_chunking(data_has_nulls):
    """
    Test ``.get_chunks(n_chunks)`` when internal PyArrow table's is 'indivisibly chunked'.

    The setup for the test is a PyArrow table having one of the chunk consisting of a single row,
    meaning that the chunk can't be subdivide.
    """
    data = get_data_of_all_types(has_nulls=data_has_nulls,
                                 exclude_dtypes=["category"])
    pd_df = pandas.DataFrame(data)
    pd_chunks = (pd_df.iloc[:1], pd_df.iloc[1:])

    chunked_at = pa.concat_tables(
        [pa.Table.from_pandas(pd_df) for pd_df in pd_chunks])
    md_df = from_arrow(chunked_at)
    assert (len(md_df._query_compiler._modin_frame._partitions[0][0].get().
                column(0).chunks) == md_df.__dataframe__().num_chunks() == 2)
    # Meaning that we can't subdivide first chunk
    np.testing.assert_array_equal(md_df.__dataframe__()._chunk_slices,
                                  [0, 1, len(pd_df)])

    exported_df = export_frame(md_df, n_chunks=2)
    df_equals(md_df, exported_df)

    exported_df = export_frame(md_df, n_chunks=4)
    df_equals(md_df, exported_df)

    exported_df = export_frame(md_df, n_chunks=40)
    df_equals(md_df, exported_df)
예제 #6
0
def test_select_dtypes():
    frame_data = {
        "test1": list("abc"),
        "test2": np.arange(3, 6).astype("u1"),
        "test3": np.arange(8.0, 11.0, dtype="float64"),
        "test4": [True, False, True],
        "test5": pandas.date_range("now", periods=3).values,
        "test6": list(range(5, 8)),
    }
    df = pandas.DataFrame(frame_data)
    rd = pd.DataFrame(frame_data)

    include = np.float, "integer"
    exclude = (np.bool_,)
    r = rd.select_dtypes(include=include, exclude=exclude)

    e = df[["test2", "test3", "test6"]]
    df_equals(r, e)

    r = rd.select_dtypes(include=np.bool_)
    e = df[["test4"]]
    df_equals(r, e)

    r = rd.select_dtypes(exclude=np.bool_)
    e = df[["test1", "test2", "test3", "test5", "test6"]]
    df_equals(r, e)

    try:
        pd.DataFrame().select_dtypes()
        assert False
    except ValueError:
        assert True
예제 #7
0
def test_set_index(request, data, drop, append):
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    if "empty_data" not in request.node.name:
        key = modin_df.columns[0]
        modin_result = modin_df.set_index(key,
                                          drop=drop,
                                          append=append,
                                          inplace=False)
        pandas_result = pandas_df.set_index(key,
                                            drop=drop,
                                            append=append,
                                            inplace=False)
        df_equals(modin_result, pandas_result)

        modin_df_copy = modin_df.copy()
        modin_df.set_index(key, drop=drop, append=append, inplace=True)

        # Check that the copy and original are different
        try:
            df_equals(modin_df, modin_df_copy)
        except AssertionError:
            assert True
        else:
            assert False

        pandas_df.set_index(key, drop=drop, append=append, inplace=True)
        df_equals(modin_df, pandas_df)
예제 #8
0
def test_set_axis(data, axis):
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    x = pandas.DataFrame()._get_axis_number(axis)
    index = modin_df.columns if x else modin_df.index
    labels = ["{0}_{1}".format(index[i], i) for i in range(modin_df.shape[x])]

    modin_result = modin_df.set_axis(labels, axis=axis, inplace=False)
    pandas_result = pandas_df.set_axis(labels, axis=axis, inplace=False)
    df_equals(modin_result, pandas_result)

    modin_df_copy = modin_df.copy()
    modin_df.set_axis(labels, axis=axis, inplace=True)

    # Check that the copy and original are different
    try:
        df_equals(modin_df, modin_df_copy)
    except AssertionError:
        assert True
    else:
        assert False

    pandas_df.set_axis(labels, axis=axis, inplace=True)
    df_equals(modin_df, pandas_df)
예제 #9
0
def test_matmul(data):
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    col_len = len(modin_df.columns)

    # Test list input
    arr = np.arange(col_len)
    modin_result = modin_df @ arr
    pandas_result = pandas_df @ arr
    df_equals(modin_result, pandas_result)

    # Test bad dimensions
    with pytest.raises(ValueError):
        modin_result = modin_df @ np.arange(col_len + 10)

    # Test series input
    modin_series = pd.Series(np.arange(col_len), index=modin_df.columns)
    pandas_series = pandas.Series(np.arange(col_len), index=pandas_df.columns)
    modin_result = modin_df @ modin_series
    pandas_result = pandas_df @ pandas_series
    df_equals(modin_result, pandas_result)

    # Test dataframe input
    modin_result = modin_df @ modin_df.T
    pandas_result = pandas_df @ pandas_df.T
    df_equals(modin_result, pandas_result)

    # Test when input series index doesn't line up with columns
    with pytest.raises(ValueError):
        modin_result = modin_df @ pd.Series(np.arange(col_len))
예제 #10
0
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    if Engine.get() == "Ray":
        pytest.xfail("Distributed read_sql is broken, see GH#2194")
        filename = "test_from_sql_distributed.db"
        table = "test_from_sql_distributed"
        conn = make_sql_connection(filename, table)
        query = "select * from {0}".format(table)

        pandas_df = pandas.read_sql(query, conn)
        modin_df_from_query = pd.read_sql(query,
                                          conn,
                                          partition_column="col1",
                                          lower_bound=0,
                                          upper_bound=6)
        modin_df_from_table = pd.read_sql(table,
                                          conn,
                                          partition_column="col1",
                                          lower_bound=0,
                                          upper_bound=6)

        df_equals(modin_df_from_query, pandas_df)
        df_equals(modin_df_from_table, pandas_df)
예제 #11
0
def test_simple_import():
    modin_df_producer = pd.DataFrame(test_data["int_data"])
    internal_modin_df_producer = modin_df_producer.__dataframe__()
    # Our configuration in pytest.ini requires that we explicitly catch all
    # instances of defaulting to pandas, this one raises a warning on `.from_dataframe`
    with warns_that_defaulting_to_pandas():
        modin_df_consumer = from_dataframe(modin_df_producer)
        internal_modin_df_consumer = from_dataframe(internal_modin_df_producer)

    # TODO: the following assertions verify that `from_dataframe` doesn't return
    # the same object untouched due to optimization branching, it actually should
    # do so but the logic is not implemented yet, so the assertions are passing
    # for now. It's required to replace the producer's type with a different one
    # to consumer when we have some other implementation of the protocol as the
    # assertions may start failing shortly.
    assert modin_df_producer is not modin_df_consumer
    assert internal_modin_df_producer is not internal_modin_df_consumer
    assert (modin_df_producer._query_compiler._modin_frame
            is not modin_df_consumer._query_compiler._modin_frame)

    df_equals(modin_df_producer, modin_df_consumer)
    df_equals(modin_df_producer, internal_modin_df_consumer)
예제 #12
0
def test_export_when_delayed_computations():
    """
    Test that export works properly when OmnisciOnNative has delayed computations.

    If there are delayed functions and export is required, it has to trigger the execution
    first prior materializing protocol's buffers, so the buffers contain actual result
    of the computations.
    """
    # OmniSci can't import 'uint64' as well as booleans, so exclude them
    # issue for bool: https://github.com/modin-project/modin/issues/4299
    data = get_data_of_all_types(has_nulls=True,
                                 exclude_dtypes=["uint64", "bool"])
    md_df = pd.DataFrame(data)
    pd_df = pandas.DataFrame(data)

    md_res = md_df.fillna({"float32_null": 32.0, "float64_null": 64.0})
    pd_res = pd_df.fillna({"float32_null": 32.0, "float64_null": 64.0})
    assert (not md_res._query_compiler._modin_frame._has_arrow_table()
            ), "There are no delayed computations for the frame"

    exported_df = export_frame(md_res)
    df_equals(exported_df, pd_res)
예제 #13
0
def test_tz_convert():
    modin_idx = pd.date_range(
        "1/1/2012", periods=500, freq="2D", tz="America/Los_Angeles"
    )
    pandas_idx = pandas.date_range(
        "1/1/2012", periods=500, freq="2D", tz="America/Los_Angeles"
    )
    data = np.random.randint(0, 100, size=(len(modin_idx), 4))
    modin_df = pd.DataFrame(data, index=modin_idx)
    pandas_df = pandas.DataFrame(data, index=pandas_idx)
    modin_result = modin_df.tz_convert("UTC", axis=0)
    pandas_result = pandas_df.tz_convert("UTC", axis=0)
    df_equals(modin_result, pandas_result)

    modin_multi = pd.MultiIndex.from_arrays([modin_idx, range(len(modin_idx))])
    pandas_multi = pandas.MultiIndex.from_arrays([pandas_idx, range(len(modin_idx))])
    modin_series = pd.DataFrame(data, index=modin_multi)
    pandas_series = pandas.DataFrame(data, index=pandas_multi)
    df_equals(
        modin_series.tz_convert("UTC", axis=0, level=0),
        pandas_series.tz_convert("UTC", axis=0, level=0),
    )
예제 #14
0
    def test_h2o_q3(self):
        df = self._get_h2o_df()

        ref = df.groupby(["id3"], observed=True).agg({
            "v1": "sum",
            "v3": "mean"
        })
        ref.reset_index(inplace=True)

        modin_df = pd.DataFrame(df)
        set_execution_mode(modin_df, "lazy")
        modin_df = modin_df.groupby(["id3"], observed=True,
                                    as_index=False).agg({
                                        "v1": "sum",
                                        "v3": "mean"
                                    })
        set_execution_mode(modin_df, None)

        exp = to_pandas(modin_df)
        exp["id3"] = exp["id3"].astype("category")

        df_equals(ref, exp)
예제 #15
0
def test_copy(data):
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)  # noqa F841

    # pandas_df is unused but there so there won't be confusing list comprehension
    # stuff in the pytest.mark.parametrize
    new_modin_df = modin_df.copy()

    assert new_modin_df is not modin_df
    assert np.array_equal(
        new_modin_df._query_compiler._modin_frame._partitions,
        modin_df._query_compiler._modin_frame._partitions,
    )
    assert new_modin_df is not modin_df
    df_equals(new_modin_df, modin_df)

    # Shallow copy tests
    modin_df = pd.DataFrame(data)
    modin_df_cp = modin_df.copy(False)

    modin_df[modin_df.columns[0]] = 0
    df_equals(modin_df, modin_df_cp)
예제 #16
0
def test_fillna_dataframe():
    frame_data = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    df = pandas.DataFrame(frame_data, index=list("VWXYZ"))
    modin_df = pd.DataFrame(frame_data, index=list("VWXYZ"))

    # df2 may have different index and columns
    df2 = pandas.DataFrame(
        {
            "a": [np.nan, 10, 20, 30, 40],
            "b": [50, 60, 70, 80, 90],
            "foo": ["bar"] * 5,
        },
        index=list("VWXuZ"),
    )
    modin_df2 = pd.DataFrame(df2)

    # only those columns and indices which are shared get filled
    df_equals(modin_df.fillna(modin_df2), df.fillna(df2))
예제 #17
0
def test_reorder_levels():
    data = np.random.randint(1, 100, 12)
    modin_df = pd.DataFrame(
        data,
        index=pd.MultiIndex.from_tuples(
            [(num, letter, color) for num in range(1, 3)
             for letter in ["a", "b", "c"] for color in ["Red", "Green"]],
            names=["Number", "Letter", "Color"],
        ),
    )
    pandas_df = pandas.DataFrame(
        data,
        index=pandas.MultiIndex.from_tuples(
            [(num, letter, color) for num in range(1, 3)
             for letter in ["a", "b", "c"] for color in ["Red", "Green"]],
            names=["Number", "Letter", "Color"],
        ),
    )
    df_equals(
        modin_df.reorder_levels(["Letter", "Color", "Number"]),
        pandas_df.reorder_levels(["Letter", "Color", "Number"]),
    )
예제 #18
0
def test_getitem_same_name():
    data = [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
        [17, 18, 19, 20],
    ]
    columns = ["c1", "c2", "c1", "c3"]
    modin_df = pd.DataFrame(data, columns=columns)
    pandas_df = pandas.DataFrame(data, columns=columns)
    df_equals(modin_df["c1"], pandas_df["c1"])
    df_equals(modin_df["c2"], pandas_df["c2"])
    df_equals(modin_df[["c1", "c2"]], pandas_df[["c1", "c2"]])
    df_equals(modin_df["c3"], pandas_df["c3"])
예제 #19
0
def test_replace():
    modin_df = pd.DataFrame({
        "A": [0, 1, 2, 3, 4],
        "B": [5, 6, 7, 8, 9],
        "C": ["a", "b", "c", "d", "e"]
    })
    pandas_df = pandas.DataFrame({
        "A": [0, 1, 2, 3, 4],
        "B": [5, 6, 7, 8, 9],
        "C": ["a", "b", "c", "d", "e"]
    })
    modin_result = modin_df.replace({"A": 0, "B": 5}, 100)
    pandas_result = pandas_df.replace({"A": 0, "B": 5}, 100)
    df_equals(modin_result, pandas_result)

    modin_result = modin_df.replace({"A": {0: 100, 4: 400}})
    pandas_result = pandas_df.replace({"A": {0: 100, 4: 400}})
    df_equals(modin_result, pandas_result)

    modin_df = pd.DataFrame({
        "A": ["bat", "foo", "bait"],
        "B": ["abc", "bar", "xyz"]
    })
    pandas_df = pandas.DataFrame({
        "A": ["bat", "foo", "bait"],
        "B": ["abc", "bar", "xyz"]
    })
    modin_result = modin_df.replace(regex={r"^ba.$": "new", "foo": "xyz"})
    pandas_result = pandas_df.replace(regex={r"^ba.$": "new", "foo": "xyz"})
    df_equals(modin_result, pandas_result)

    modin_result = modin_df.replace(regex=[r"^ba.$", "foo"], value="new")
    pandas_result = pandas_df.replace(regex=[r"^ba.$", "foo"], value="new")
    df_equals(modin_result, pandas_result)

    modin_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True)
    pandas_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True)
    df_equals(modin_df, pandas_df)
예제 #20
0
def test_unwrap_partitions(axis):
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df = pd.DataFrame(data)

    if axis is None:
        expected_partitions = df._query_compiler._modin_frame._partitions
        actual_partitions = np.array(unwrap_partitions(df, axis=axis))
        assert (expected_partitions.shape[0] == actual_partitions.shape[0] and
                expected_partitions.shape[1] == expected_partitions.shape[1])
        for row_idx in range(expected_partitions.shape[0]):
            for col_idx in range(expected_partitions.shape[1]):
                if Engine.get() == "Ray":
                    assert (expected_partitions[row_idx][col_idx].oid ==
                            actual_partitions[row_idx][col_idx])
                if Engine.get() == "Dask":
                    assert (expected_partitions[row_idx][col_idx].future ==
                            actual_partitions[row_idx][col_idx])
    else:
        expected_axis_partitions = (
            df._query_compiler._modin_frame._frame_mgr_cls.axis_partition(
                df._query_compiler._modin_frame._partitions, axis ^ 1))
        expected_axis_partitions = [
            axis_partition.force_materialization().unwrap(squeeze=True)
            for axis_partition in expected_axis_partitions
        ]
        actual_axis_partitions = unwrap_partitions(df, axis=axis)
        assert len(expected_axis_partitions) == len(actual_axis_partitions)
        for item_idx in range(len(expected_axis_partitions)):
            if Engine.get() == "Ray":
                df_equals(
                    ray.get(expected_axis_partitions[item_idx]),
                    ray.get(actual_axis_partitions[item_idx]),
                )
            if Engine.get() == "Dask":
                df_equals(
                    expected_axis_partitions[item_idx].result(),
                    actual_axis_partitions[item_idx].result(),
                )
예제 #21
0
def test_take():
    modin_df = pd.DataFrame(
        [
            ("falcon", "bird", 389.0),
            ("parrot", "bird", 24.0),
            ("lion", "mammal", 80.5),
            ("monkey", "mammal", np.nan),
        ],
        columns=["name", "class", "max_speed"],
        index=[0, 2, 3, 1],
    )
    pandas_df = pandas.DataFrame(
        [
            ("falcon", "bird", 389.0),
            ("parrot", "bird", 24.0),
            ("lion", "mammal", 80.5),
            ("monkey", "mammal", np.nan),
        ],
        columns=["name", "class", "max_speed"],
        index=[0, 2, 3, 1],
    )
    df_equals(modin_df.take([0, 3]), pandas_df.take([0, 3]))
    df_equals(modin_df.take([2], axis=1), pandas_df.take([2], axis=1))
예제 #22
0
def test_reset_index_with_named_index():
    modin_df = pd.DataFrame(test_data_values[0])
    pandas_df = pandas.DataFrame(test_data_values[0])

    modin_df.index.name = pandas_df.index.name = "NAME_OF_INDEX"
    df_equals(modin_df, pandas_df)
    df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False))

    modin_df.reset_index(drop=True, inplace=True)
    pandas_df.reset_index(drop=True, inplace=True)
    df_equals(modin_df, pandas_df)

    modin_df = pd.DataFrame(test_data_values[0])
    pandas_df = pandas.DataFrame(test_data_values[0])
    modin_df.index.name = pandas_df.index.name = "NEW_NAME"
    df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False))
예제 #23
0
def test_nsmallest():
    data = {
        "population": [
            59000000,
            65000000,
            434000,
            434000,
            434000,
            337000,
            11300,
            11300,
            11300,
        ],
        "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
        "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
    }
    index = [
        "Italy",
        "France",
        "Malta",
        "Maldives",
        "Brunei",
        "Iceland",
        "Nauru",
        "Tuvalu",
        "Anguilla",
    ]
    modin_df = pd.DataFrame(data=data, index=index)
    pandas_df = pandas.DataFrame(data=data, index=index)
    df_equals(
        modin_df.nsmallest(n=3, columns="population"),
        pandas_df.nsmallest(n=3, columns="population"),
    )
    df_equals(
        modin_df.nsmallest(n=2, columns=["population", "GDP"], keep="all"),
        pandas_df.nsmallest(n=2, columns=["population", "GDP"], keep="all"),
    )
예제 #24
0
def test_fillna_sanity():
    # with different dtype
    frame_data = [
        ["a", "a", np.nan, "a"],
        ["b", "b", np.nan, "b"],
        ["c", "c", np.nan, "c"],
    ]
    df = pandas.DataFrame(frame_data)

    result = df.fillna({2: "foo"})
    modin_df = pd.DataFrame(frame_data).fillna({2: "foo"})

    df_equals(modin_df, result)

    modin_df = pd.DataFrame(df)
    df.fillna({2: "foo"}, inplace=True)
    modin_df.fillna({2: "foo"}, inplace=True)
    df_equals(modin_df, result)

    frame_data = {
        "Date": [pandas.NaT, pandas.Timestamp("2014-1-1")],
        "Date2": [pandas.Timestamp("2013-1-1"), pandas.NaT],
    }
    df = pandas.DataFrame(frame_data)
    result = df.fillna(value={"Date": df["Date2"]})
    modin_df = pd.DataFrame(frame_data).fillna(value={"Date": df["Date2"]})
    df_equals(modin_df, result)

    frame_data = {"A": [pandas.Timestamp("2012-11-11 00:00:00+01:00"), pandas.NaT]}
    df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    df_equals(modin_df.fillna(method="pad"), df.fillna(method="pad"))

    frame_data = {"A": [pandas.NaT, pandas.Timestamp("2012-11-11 00:00:00+01:00")]}
    df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data).fillna(method="bfill")
    df_equals(modin_df, df.fillna(method="bfill"))
예제 #25
0
def test_at_time():
    i = pd.date_range("2008-01-01", periods=1000, freq="12H")
    modin_df = pd.DataFrame({"A": list(range(1000)), "B": list(range(1000))}, index=i)
    pandas_df = pandas.DataFrame(
        {"A": list(range(1000)), "B": list(range(1000))}, index=i
    )
    df_equals(modin_df.at_time("12:00"), pandas_df.at_time("12:00"))
    df_equals(modin_df.at_time("3:00"), pandas_df.at_time("3:00"))
    df_equals(modin_df.T.at_time("12:00", axis=1), pandas_df.T.at_time("12:00", axis=1))
예제 #26
0
def test_drop_api_equivalence():
    # equivalence of the labels/axis and index/columns API's
    frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]]

    modin_df = pd.DataFrame(frame_data,
                            index=["a", "b", "c"],
                            columns=["d", "e", "f"])

    modin_df1 = modin_df.drop("a")
    modin_df2 = modin_df.drop(index="a")
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop("d", 1)
    modin_df2 = modin_df.drop(columns="d")
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop(labels="e", axis=1)
    modin_df2 = modin_df.drop(columns="e")
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop(["a"], axis=0)
    modin_df2 = modin_df.drop(index=["a"])
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop(["a"], axis=0).drop(["d"], axis=1)
    modin_df2 = modin_df.drop(index=["a"], columns=["d"])
    df_equals(modin_df1, modin_df2)

    with pytest.raises(ValueError):
        modin_df.drop(labels="a", index="b")

    with pytest.raises(ValueError):
        modin_df.drop(labels="a", columns="b")

    with pytest.raises(ValueError):
        modin_df.drop(axis=1)
예제 #27
0
def test_astype_category_large():
    series_length = 10_000
    modin_df = pd.DataFrame(
        {
            "col1": ["str{0}".format(i) for i in range(0, series_length)],
            "col2": [i for i in range(0, series_length)],
        }
    )
    pandas_df = pandas.DataFrame(
        {
            "col1": ["str{0}".format(i) for i in range(0, series_length)],
            "col2": [i for i in range(0, series_length)],
        }
    )

    modin_result = modin_df.astype({"col1": "category"})
    pandas_result = pandas_df.astype({"col1": "category"})
    df_equals(modin_result, pandas_result)
    assert modin_result.dtypes.equals(pandas_result.dtypes)

    modin_result = modin_df.astype("category")
    pandas_result = pandas_df.astype("category")
    df_equals(modin_result, pandas_result)
    assert modin_result.dtypes.equals(pandas_result.dtypes)
예제 #28
0
def test_fillna_datetime_columns():
    frame_data = {
        "A": [-1, -2, np.nan],
        "B": pd.date_range("20130101", periods=3),
        "C": ["foo", "bar", None],
        "D": ["foo2", "bar2", None],
    }
    df = pandas.DataFrame(frame_data, index=pd.date_range("20130110", periods=3))
    modin_df = pd.DataFrame(frame_data, index=pd.date_range("20130110", periods=3))
    df_equals(modin_df.fillna("?"), df.fillna("?"))

    frame_data = {
        "A": [-1, -2, np.nan],
        "B": [
            pandas.Timestamp("2013-01-01"),
            pandas.Timestamp("2013-01-02"),
            pandas.NaT,
        ],
        "C": ["foo", "bar", None],
        "D": ["foo2", "bar2", None],
    }
    df = pandas.DataFrame(frame_data, index=pd.date_range("20130110", periods=3))
    modin_df = pd.DataFrame(frame_data, index=pd.date_range("20130110", periods=3))
    df_equals(modin_df.fillna("?"), df.fillna("?"))
예제 #29
0
    def test_h2o_q10(self):
        df = self._get_h2o_df()

        ref = df.groupby(["id1", "id2", "id3", "id4", "id5", "id6"],
                         observed=True).agg({
                             "v3": "sum",
                             "v1": "count"
                         })
        ref.reset_index(inplace=True)

        modin_df = pd.DataFrame(df)
        modin_df = modin_df.groupby(["id1", "id2", "id3", "id4", "id5", "id6"],
                                    observed=True).agg({
                                        "v3": "sum",
                                        "v1": "count"
                                    })
        modin_df.reset_index(inplace=True)

        exp = to_pandas(modin_df)
        exp["id1"] = exp["id1"].astype("category")
        exp["id2"] = exp["id2"].astype("category")
        exp["id3"] = exp["id3"].astype("category")

        df_equals(ref, exp)
예제 #30
0
    def test_h2o_q7(self):
        df = self._get_h2o_df()

        ref = (df.groupby(["id3"], observed=True).agg({
            "v1": "max",
            "v2": "min"
        }).assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["range_v1_v2"]])
        ref.reset_index(inplace=True)

        modin_df = pd.DataFrame(df)
        set_execution_mode(modin_df, "lazy")
        modin_df = modin_df.groupby(["id3"], observed=True).agg({
            "v1": "max",
            "v2": "min"
        })
        modin_df["range_v1_v2"] = modin_df["v1"] - modin_df["v2"]
        modin_df = modin_df[["range_v1_v2"]]
        modin_df.reset_index(inplace=True)
        set_execution_mode(modin_df, None)

        exp = to_pandas(modin_df)
        exp["id3"] = exp["id3"].astype("category")

        df_equals(ref, exp)