Пример #1
0
def test_fillna_dict_series():
    """Check ``fillna`` with dict and Series fill values against pandas."""
    raw = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    pandas_frame = pandas.DataFrame(raw)
    modin_frame = pd.DataFrame(raw)

    # The second mapping also names "d", a column the frame does not have.
    fill_maps = (
        {"a": 0, "b": 5},
        {"a": 0, "b": 5, "d": 7},
    )
    for fill_map in fill_maps:
        df_equals(modin_frame.fillna(fill_map), pandas_frame.fillna(fill_map))

    # A Series (here the per-column maxima) is passed where a dict was before.
    df_equals(
        modin_frame.fillna(modin_frame.max()),
        pandas_frame.fillna(pandas_frame.max()),
    )
Пример #2
0
def test_reindex_multiindex():
    """Compare ``reindex`` onto a MultiIndex between modin and pandas."""
    square_values = np.random.randint(1, 20, (5, 5))
    notional_values = np.random.randint(10, 25, 6)
    currencies = np.array(["AUD", "BRL", "CAD", "EUR", "INR"])
    level_values = [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]]
    level_names = ["Bank", "Curency"]

    modin_bank_idx = pd.MultiIndex.from_product(level_values, names=level_names)
    pandas_bank_idx = pandas.MultiIndex.from_product(level_values, names=level_names)

    modin_df1 = pd.DataFrame(data=square_values, index=currencies, columns=currencies)
    modin_df2 = pd.DataFrame(notional_values, modin_bank_idx)
    pandas_df1 = pandas.DataFrame(
        data=square_values, index=currencies, columns=currencies
    )
    pandas_df2 = pandas.DataFrame(notional_values, pandas_bank_idx)
    modin_df2.columns = ["Notional"]
    pandas_df2.columns = ["Notional"]

    # Target index: first level of the bank index crossed with the currency index.
    md_target = pd.MultiIndex.from_product(
        [modin_df2.index.levels[0], modin_df1.index]
    )
    pd_target = pandas.MultiIndex.from_product(
        [pandas_df2.index.levels[0], pandas_df1.index]
    )

    # In order: no axis/index/columns keyword, only ``axis``, ``axis`` plus ``level``.
    keyword_variants = ({}, {"axis": 0}, {"axis": 0, "level": 0})
    for extra in keyword_variants:
        modin_result = modin_df1.reindex(md_target, fill_value=0, **extra)
        pandas_result = pandas_df1.reindex(pd_target, fill_value=0, **extra)
        df_equals(modin_result, pandas_result)
Пример #3
0
def test___getitem__(data):
    """Check column lookup and row slicing through ``__getitem__``."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Single-label lookup must return a modin Series equal to the pandas column.
    first_label = modin_df.columns[0]
    modin_column = modin_df.__getitem__(first_label)
    assert isinstance(modin_column, pd.Series)
    df_equals(pandas_df[first_label], modin_column)

    # (start, stop[, step]) argument tuples for ``slice``.
    slice_args = (
        (None, -1),
        (-1, None),
        (1, 2),
        (1, None),
        (None, 1),
        (1, -1),
        (-3, -1),
        (1, -1, 2),
    )
    for args in slice_args:
        row_slice = slice(*args)
        df_equals(modin_df[row_slice], pandas_df[row_slice])

    # Slicing a frame built from an empty list.
    df_equals(pd.DataFrame([])[:10], pandas.DataFrame([])[:10])
Пример #4
0
def test_indexing_duplicate_axis(data):
    """Check ``iloc``/``loc`` lookups on frames whose row index has duplicates."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Every label is repeated for three consecutive rows: 0, 0, 0, 1, 1, 1, ...
    repeated_labels = [position // 3 for position in range(len(modin_df))]
    modin_df.index = pandas_df.index = repeated_labels
    assert any(modin_df.index.duplicated())
    assert any(pandas_df.index.duplicated())

    df_equals(modin_df.iloc[0], pandas_df.iloc[0])
    df_equals(modin_df.loc[0], pandas_df.loc[0])
    df_equals(modin_df.iloc[0, 0:4], pandas_df.iloc[0, 0:4])

    modin_selection = modin_df.loc[0, modin_df.columns[0:4]]
    pandas_selection = pandas_df.loc[0, pandas_df.columns[0:4]]
    df_equals(modin_selection, pandas_selection)
Пример #5
0
def test_export_indivisible_chunking(data_has_nulls):
    """
    Test ``.get_chunks(n_chunks)`` when the internal PyArrow table is 'indivisibly chunked'.

    The PyArrow table is built by concatenating a single-row table with a table
    holding the remaining rows, so its first chunk cannot be subdivided.
    """
    data = get_data_of_all_types(
        has_nulls=data_has_nulls, exclude_dtypes=["category"]
    )
    pd_df = pandas.DataFrame(data)

    arrow_pieces = [
        pa.Table.from_pandas(piece) for piece in (pd_df.iloc[:1], pd_df.iloc[1:])
    ]
    chunked_at = pa.concat_tables(arrow_pieces)
    md_df = from_arrow(chunked_at)

    first_column = md_df._query_compiler._modin_frame._partitions[0][0].get().column(0)
    assert len(first_column.chunks) == md_df.__dataframe__().num_chunks() == 2
    # Chunk boundaries at [0, 1, len]: the first chunk holds exactly one row.
    np.testing.assert_array_equal(
        md_df.__dataframe__()._chunk_slices, [0, 1, len(pd_df)]
    )

    for n_chunks in (2, 4, 40):
        exported_df = export_frame(md_df, n_chunks=n_chunks)
        df_equals(md_df, exported_df)
Пример #6
0
def test_select_dtypes():
    """Check ``select_dtypes`` include/exclude filtering against pandas selections."""
    frame_data = {
        "test1": list("abc"),
        "test2": np.arange(3, 6).astype("u1"),
        "test3": np.arange(8.0, 11.0, dtype="float64"),
        "test4": [True, False, True],
        "test5": pandas.date_range("now", periods=3).values,
        "test6": list(range(5, 8)),
    }
    df = pandas.DataFrame(frame_data)
    rd = pd.DataFrame(frame_data)

    # ``np.float`` was just an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin expresses the same selection.
    include = float, "integer"
    exclude = (np.bool_,)
    r = rd.select_dtypes(include=include, exclude=exclude)
    e = df[["test2", "test3", "test6"]]
    df_equals(r, e)

    r = rd.select_dtypes(include=np.bool_)
    e = df[["test4"]]
    df_equals(r, e)

    r = rd.select_dtypes(exclude=np.bool_)
    e = df[["test1", "test2", "test3", "test5", "test6"]]
    df_equals(r, e)

    # Calling without ``include`` and ``exclude`` must raise ValueError.
    try:
        pd.DataFrame().select_dtypes()
    except ValueError:
        pass
    else:
        raise AssertionError("select_dtypes() without arguments did not raise ValueError")
Пример #7
0
def test_set_index(request, data, drop, append):
    """Check ``set_index`` (copying and in-place) against pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Nothing is checked for the parametrization whose name contains "empty_data".
    if "empty_data" in request.node.name:
        return

    column = modin_df.columns[0]
    options = {"drop": drop, "append": append}

    modin_result = modin_df.set_index(column, inplace=False, **options)
    pandas_result = pandas_df.set_index(column, inplace=False, **options)
    df_equals(modin_result, pandas_result)

    snapshot = modin_df.copy()
    modin_df.set_index(column, inplace=True, **options)

    # The in-place call must make the frame differ from the earlier copy.
    differs_from_snapshot = False
    try:
        df_equals(modin_df, snapshot)
    except AssertionError:
        differs_from_snapshot = True
    assert differs_from_snapshot

    pandas_df.set_index(column, inplace=True, **options)
    df_equals(modin_df, pandas_df)
Пример #8
0
def test_set_axis(data, axis):
    """Check ``set_axis`` (copying and in-place) against pandas."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    axis_number = pandas.DataFrame()._get_axis_number(axis)
    if axis_number:
        current_labels = modin_df.columns
    else:
        current_labels = modin_df.index
    # New labels are "<old label>_<position>".
    labels = [
        f"{current_labels[position]}_{position}"
        for position in range(modin_df.shape[axis_number])
    ]

    df_equals(
        modin_df.set_axis(labels, axis=axis, inplace=False),
        pandas_df.set_axis(labels, axis=axis, inplace=False),
    )

    snapshot = modin_df.copy()
    modin_df.set_axis(labels, axis=axis, inplace=True)

    # The in-place call must make the frame differ from the earlier copy.
    differs_from_snapshot = False
    try:
        df_equals(modin_df, snapshot)
    except AssertionError:
        differs_from_snapshot = True
    assert differs_from_snapshot

    pandas_df.set_axis(labels, axis=axis, inplace=True)
    df_equals(modin_df, pandas_df)
Пример #9
0
def test_matmul(data):
    """Check the ``@`` operator with array, Series and DataFrame operands."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    n_cols = len(modin_df.columns)

    # Array operand with one entry per column.
    weights = np.arange(n_cols)
    df_equals(modin_df @ weights, pandas_df @ weights)

    # Array operand whose length does not match the number of columns.
    with pytest.raises(ValueError):
        _ = modin_df @ np.arange(n_cols + 10)

    # Series operand indexed by the frame's columns.
    modin_series = pd.Series(np.arange(n_cols), index=modin_df.columns)
    pandas_series = pandas.Series(np.arange(n_cols), index=pandas_df.columns)
    df_equals(modin_df @ modin_series, pandas_df @ pandas_series)

    # DataFrame operand: the frame's own transpose.
    df_equals(modin_df @ modin_df.T, pandas_df @ pandas_df.T)

    # Series operand whose index does not line up with the columns.
    with pytest.raises(ValueError):
        _ = modin_df @ pd.Series(np.arange(n_cols))
Пример #10
0
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    """Compare partitioned ``pd.read_sql`` calls against ``pandas.read_sql``.

    The comparison is written for both a SQL query and a bare table name,
    each read with ``partition_column="col1"`` and bounds 0..6.
    """
    if Engine.get() == "Ray":
        # NOTE(review): ``pytest.xfail`` raises as soon as it is called, so the
        # statements below it in this branch never execute. When the engine is
        # not "Ray" the branch is skipped and the test checks nothing. Confirm
        # whether the body should run at all before re-enabling it (GH#2194).
        pytest.xfail("Distributed read_sql is broken, see GH#2194")
        filename = "test_from_sql_distributed.db"
        table = "test_from_sql_distributed"
        conn = make_sql_connection(filename, table)
        query = "select * from {0}".format(table)

        # Reference result read by plain pandas.
        pandas_df = pandas.read_sql(query, conn)
        modin_df_from_query = pd.read_sql(query,
                                          conn,
                                          partition_column="col1",
                                          lower_bound=0,
                                          upper_bound=6)
        modin_df_from_table = pd.read_sql(table,
                                          conn,
                                          partition_column="col1",
                                          lower_bound=0,
                                          upper_bound=6)

        df_equals(modin_df_from_query, pandas_df)
        df_equals(modin_df_from_table, pandas_df)
Пример #11
0
def test_simple_import():
    """Check ``from_dataframe`` on a modin frame and on its protocol object."""
    producer = pd.DataFrame(test_data["int_data"])
    producer_protocol = producer.__dataframe__()

    # pytest.ini requires every default-to-pandas to be caught explicitly, and
    # `.from_dataframe` raises that warning.
    with warns_that_defaulting_to_pandas():
        consumer = from_dataframe(producer)
        consumer_from_protocol = from_dataframe(producer_protocol)

    # TODO: these identity checks confirm that `from_dataframe` does not hand
    # back the very same object through an optimization branch. It actually
    # should do so, but that logic is not implemented yet, which is why the
    # checks pass for now. Once another implementation of the protocol exists,
    # the producer's type has to be swapped for one different from the
    # consumer's, as the checks may start failing.
    assert producer is not consumer
    assert producer_protocol is not consumer_from_protocol
    producer_frame = producer._query_compiler._modin_frame
    consumer_frame = consumer._query_compiler._modin_frame
    assert producer_frame is not consumer_frame

    df_equals(producer, consumer)
    df_equals(producer, consumer_from_protocol)
Пример #12
0
def test_export_when_delayed_computations():
    """
    Test that export works properly when OmnisciOnNative has delayed computations.

    When delayed functions are pending and an export is requested, execution has to be
    triggered before the protocol's buffers are materialized, so that the buffers hold
    the actual result of the computations.
    """
    # OmniSci can't import 'uint64' as well as booleans, so exclude them
    # issue for bool: https://github.com/modin-project/modin/issues/4299
    data = get_data_of_all_types(
        has_nulls=True, exclude_dtypes=["uint64", "bool"]
    )
    fill_values = {"float32_null": 32.0, "float64_null": 64.0}

    md_df = pd.DataFrame(data)
    pd_df = pandas.DataFrame(data)
    md_res = md_df.fillna(fill_values)
    pd_res = pd_df.fillna(fill_values)

    has_arrow_table = md_res._query_compiler._modin_frame._has_arrow_table()
    assert not has_arrow_table, "There are no delayed computations for the frame"

    df_equals(export_frame(md_res), pd_res)
Пример #13
0
def test_tz_convert():
    """Check ``tz_convert`` on a flat and on a multi-level datetime index."""
    range_options = {"periods": 500, "freq": "2D", "tz": "America/Los_Angeles"}
    modin_idx = pd.date_range("1/1/2012", **range_options)
    pandas_idx = pandas.date_range("1/1/2012", **range_options)

    values = np.random.randint(0, 100, size=(len(modin_idx), 4))
    modin_df = pd.DataFrame(values, index=modin_idx)
    pandas_df = pandas.DataFrame(values, index=pandas_idx)
    df_equals(
        modin_df.tz_convert("UTC", axis=0),
        pandas_df.tz_convert("UTC", axis=0),
    )

    # Same data under a two-level index; only level 0 (the datetimes) is converted.
    positions = range(len(modin_idx))
    modin_multi = pd.MultiIndex.from_arrays([modin_idx, positions])
    pandas_multi = pandas.MultiIndex.from_arrays([pandas_idx, positions])
    modin_multi_df = pd.DataFrame(values, index=modin_multi)
    pandas_multi_df = pandas.DataFrame(values, index=pandas_multi)
    df_equals(
        modin_multi_df.tz_convert("UTC", axis=0, level=0),
        pandas_multi_df.tz_convert("UTC", axis=0, level=0),
    )
Пример #14
0
    def test_h2o_q3(self):
        """Group by ``id3`` with sum of ``v1`` and mean of ``v3`` in lazy mode."""
        source = self._get_h2o_df()
        aggregations = {"v1": "sum", "v3": "mean"}

        # Reference result computed on the frame returned by ``_get_h2o_df``.
        expected = source.groupby(["id3"], observed=True).agg(aggregations)
        expected.reset_index(inplace=True)

        modin_frame = pd.DataFrame(source)
        set_execution_mode(modin_frame, "lazy")
        modin_frame = modin_frame.groupby(
            ["id3"], observed=True, as_index=False
        ).agg(aggregations)
        set_execution_mode(modin_frame, None)

        actual = to_pandas(modin_frame)
        # Cast the key column to category before comparing with the reference.
        actual["id3"] = actual["id3"].astype("category")

        df_equals(expected, actual)
Пример #15
0
def test_copy(data):
    """Check ``copy()`` returns a distinct but equal frame, and shallow-copy behavior."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)  # noqa F841

    # pandas_df is unused but there so there won't be confusing list comprehension
    # stuff in the pytest.mark.parametrize
    new_modin_df = modin_df.copy()

    # The copy is a different object (checked once; the original repeated this
    # identity assertion) whose partitions array compares equal to the source's.
    assert new_modin_df is not modin_df
    assert np.array_equal(
        new_modin_df._query_compiler._modin_frame._partitions,
        modin_df._query_compiler._modin_frame._partitions,
    )
    df_equals(new_modin_df, modin_df)

    # Shallow copy tests: after overwriting the first column of the source,
    # the shallow copy is expected to compare equal to the source.
    modin_df = pd.DataFrame(data)
    modin_df_cp = modin_df.copy(False)

    modin_df[modin_df.columns[0]] = 0
    df_equals(modin_df, modin_df_cp)
Пример #16
0
def test_fillna_dataframe():
    """Check ``fillna`` when the fill value is another DataFrame."""
    row_labels = list("VWXYZ")
    base_values = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    pandas_base = pandas.DataFrame(base_values, index=row_labels)
    modin_base = pd.DataFrame(base_values, index=row_labels)

    # The filler's index ("u" instead of "Y") and columns ("foo", no "c")
    # differ from the base frame's.
    filler_values = {
        "a": [np.nan, 10, 20, 30, 40],
        "b": [50, 60, 70, 80, 90],
        "foo": ["bar"] * 5,
    }
    pandas_filler = pandas.DataFrame(filler_values, index=list("VWXuZ"))
    modin_filler = pd.DataFrame(pandas_filler)

    # only those columns and indices which are shared get filled
    df_equals(modin_base.fillna(modin_filler), pandas_base.fillna(pandas_filler))
Пример #17
0
def test_reorder_levels():
    """Check ``reorder_levels`` on a three-level row index."""
    values = np.random.randint(1, 100, 12)
    level_names = ["Number", "Letter", "Color"]
    # 2 numbers x 3 letters x 2 colors = 12 index tuples, one per value.
    index_tuples = [
        (num, letter, color)
        for num in range(1, 3)
        for letter in ["a", "b", "c"]
        for color in ["Red", "Green"]
    ]

    modin_index = pd.MultiIndex.from_tuples(index_tuples, names=level_names)
    modin_df = pd.DataFrame(values, index=modin_index)
    pandas_index = pandas.MultiIndex.from_tuples(index_tuples, names=level_names)
    pandas_df = pandas.DataFrame(values, index=pandas_index)

    new_order = ["Letter", "Color", "Number"]
    df_equals(
        modin_df.reorder_levels(new_order),
        pandas_df.reorder_levels(new_order),
    )
Пример #18
0
def test_getitem_same_name():
    """Check ``__getitem__`` on a frame where the column label "c1" occurs twice."""
    rows = [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
        [17, 18, 19, 20],
    ]
    labels = ["c1", "c2", "c1", "c3"]
    modin_df = pd.DataFrame(rows, columns=labels)
    pandas_df = pandas.DataFrame(rows, columns=labels)

    # Duplicated label, unique label, list of labels, another unique label.
    for key in ("c1", "c2", ["c1", "c2"], "c3"):
        df_equals(modin_df[key], pandas_df[key])
Пример #19
0
def test_replace():
    """Check ``replace`` with dict, nested-dict and regex arguments against pandas."""
    mixed_columns = {
        "A": [0, 1, 2, 3, 4],
        "B": [5, 6, 7, 8, 9],
        "C": ["a", "b", "c", "d", "e"],
    }
    modin_df = pd.DataFrame(mixed_columns)
    pandas_df = pandas.DataFrame(mixed_columns)

    # Per-column value to replace, with a single replacement value.
    df_equals(
        modin_df.replace({"A": 0, "B": 5}, 100),
        pandas_df.replace({"A": 0, "B": 5}, 100),
    )

    # Nested dict: column -> {old value: new value}.
    df_equals(
        modin_df.replace({"A": {0: 100, 4: 400}}),
        pandas_df.replace({"A": {0: 100, 4: 400}}),
    )

    text_columns = {
        "A": ["bat", "foo", "bait"],
        "B": ["abc", "bar", "xyz"],
    }
    modin_df = pd.DataFrame(text_columns)
    pandas_df = pandas.DataFrame(text_columns)

    # Regex patterns given as a pattern -> replacement mapping.
    df_equals(
        modin_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}),
        pandas_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}),
    )

    # Regex patterns given as a list sharing one replacement value.
    df_equals(
        modin_df.replace(regex=[r"^ba.$", "foo"], value="new"),
        pandas_df.replace(regex=[r"^ba.$", "foo"], value="new"),
    )

    # Same replacement, applied in place.
    modin_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True)
    pandas_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True)
    df_equals(modin_df, pandas_df)
Пример #20
0
def test_unwrap_partitions(axis):
    """Check ``unwrap_partitions`` against the frame's internal partitions.

    With ``axis=None`` the unwrapped objects are compared element-wise with the
    ``oid`` (Ray) or ``future`` (Dask) attribute of each internal partition.
    Otherwise the unwrapped axis partitions are compared with the materialized
    axis partitions built through the frame's partition manager class.
    """
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df = pd.DataFrame(data)
    engine = Engine.get()

    if axis is None:
        expected_partitions = df._query_compiler._modin_frame._partitions
        actual_partitions = np.array(unwrap_partitions(df, axis=axis))
        # Both dimensions are compared against ``actual_partitions``; the column
        # check previously compared the expected shape with itself.
        assert (expected_partitions.shape[0] == actual_partitions.shape[0] and
                expected_partitions.shape[1] == actual_partitions.shape[1])
        for row_idx in range(expected_partitions.shape[0]):
            for col_idx in range(expected_partitions.shape[1]):
                if engine == "Ray":
                    assert (expected_partitions[row_idx][col_idx].oid ==
                            actual_partitions[row_idx][col_idx])
                if engine == "Dask":
                    assert (expected_partitions[row_idx][col_idx].future ==
                            actual_partitions[row_idx][col_idx])
    else:
        # The partition manager is called with the opposite axis (``axis ^ 1``).
        expected_axis_partitions = (
            df._query_compiler._modin_frame._frame_mgr_cls.axis_partition(
                df._query_compiler._modin_frame._partitions, axis ^ 1))
        expected_axis_partitions = [
            axis_partition.force_materialization().unwrap(squeeze=True)
            for axis_partition in expected_axis_partitions
        ]
        actual_axis_partitions = unwrap_partitions(df, axis=axis)
        assert len(expected_axis_partitions) == len(actual_axis_partitions)
        for expected_item, actual_item in zip(expected_axis_partitions,
                                              actual_axis_partitions):
            if engine == "Ray":
                df_equals(ray.get(expected_item), ray.get(actual_item))
            if engine == "Dask":
                df_equals(expected_item.result(), actual_item.result())
Пример #21
0
def test_take():
    """Check positional ``take`` along rows and along columns."""
    records = [
        ("falcon", "bird", 389.0),
        ("parrot", "bird", 24.0),
        ("lion", "mammal", 80.5),
        ("monkey", "mammal", np.nan),
    ]
    frame_options = {
        "columns": ["name", "class", "max_speed"],
        "index": [0, 2, 3, 1],
    }
    modin_df = pd.DataFrame(records, **frame_options)
    pandas_df = pandas.DataFrame(records, **frame_options)

    # Rows by position, then a single column by position.
    df_equals(modin_df.take([0, 3]), pandas_df.take([0, 3]))
    df_equals(modin_df.take([2], axis=1), pandas_df.take([2], axis=1))
Пример #22
0
def test_reset_index_with_named_index():
    """Check ``reset_index`` on frames whose index carries a name."""
    source = test_data_values[0]

    modin_df = pd.DataFrame(source)
    pandas_df = pandas.DataFrame(source)
    modin_df.index.name = pandas_df.index.name = "NAME_OF_INDEX"
    df_equals(modin_df, pandas_df)
    df_equals(
        modin_df.reset_index(drop=False),
        pandas_df.reset_index(drop=False),
    )

    # Dropping the named index in place.
    for frame in (modin_df, pandas_df):
        frame.reset_index(drop=True, inplace=True)
    df_equals(modin_df, pandas_df)

    # Fresh frames with a different index name.
    modin_df = pd.DataFrame(source)
    pandas_df = pandas.DataFrame(source)
    modin_df.index.name = pandas_df.index.name = "NEW_NAME"
    df_equals(
        modin_df.reset_index(drop=False),
        pandas_df.reset_index(drop=False),
    )
Пример #23
0
def test_nsmallest():
    """Check ``nsmallest`` with one and with several ordering columns."""
    populations = [
        59000000, 65000000, 434000, 434000, 434000, 337000, 11300, 11300, 11300,
    ]
    data = {
        "population": populations,
        "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
        "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
    }
    countries = [
        "Italy", "France", "Malta", "Maldives", "Brunei",
        "Iceland", "Nauru", "Tuvalu", "Anguilla",
    ]
    modin_df = pd.DataFrame(data=data, index=countries)
    pandas_df = pandas.DataFrame(data=data, index=countries)

    call_variants = (
        {"n": 3, "columns": "population"},
        {"n": 2, "columns": ["population", "GDP"], "keep": "all"},
    )
    for kwargs in call_variants:
        df_equals(modin_df.nsmallest(**kwargs), pandas_df.nsmallest(**kwargs))
Пример #24
0
def test_fillna_sanity():
    """Check basic ``fillna`` cases: string columns, datetimes and fill methods."""
    # with different dtype
    rows = [
        ["a", "a", np.nan, "a"],
        ["b", "b", np.nan, "b"],
        ["c", "c", np.nan, "c"],
    ]
    df = pandas.DataFrame(rows)
    expected = df.fillna({2: "foo"})
    df_equals(pd.DataFrame(rows).fillna({2: "foo"}), expected)

    # Same fill applied in place; the modin frame is built before pandas is mutated.
    modin_df = pd.DataFrame(df)
    df.fillna({2: "foo"}, inplace=True)
    modin_df.fillna({2: "foo"}, inplace=True)
    df_equals(modin_df, expected)

    # Fill one datetime column from another column of the pandas frame.
    date_columns = {
        "Date": [pandas.NaT, pandas.Timestamp("2014-1-1")],
        "Date2": [pandas.Timestamp("2013-1-1"), pandas.NaT],
    }
    df = pandas.DataFrame(date_columns)
    expected = df.fillna(value={"Date": df["Date2"]})
    modin_df = pd.DataFrame(date_columns).fillna(value={"Date": df["Date2"]})
    df_equals(modin_df, expected)

    # Timezone-aware timestamp followed by NaT, filled with method="pad".
    trailing_nat = {"A": [pandas.Timestamp("2012-11-11 00:00:00+01:00"), pandas.NaT]}
    df = pandas.DataFrame(trailing_nat)
    modin_df = pd.DataFrame(trailing_nat)
    df_equals(modin_df.fillna(method="pad"), df.fillna(method="pad"))

    # NaT followed by a timezone-aware timestamp, filled with method="bfill".
    leading_nat = {"A": [pandas.NaT, pandas.Timestamp("2012-11-11 00:00:00+01:00")]}
    df = pandas.DataFrame(leading_nat)
    modin_df = pd.DataFrame(leading_nat).fillna(method="bfill")
    df_equals(modin_df, df.fillna(method="bfill"))
Пример #25
0
def test_at_time():
    """Check ``at_time`` along rows and, on the transposed frame, along columns."""
    timestamps = pd.date_range("2008-01-01", periods=1000, freq="12H")
    columns = {"A": list(range(1000)), "B": list(range(1000))}
    modin_df = pd.DataFrame(columns, index=timestamps)
    pandas_df = pandas.DataFrame(columns, index=timestamps)

    for time_of_day in ("12:00", "3:00"):
        df_equals(modin_df.at_time(time_of_day), pandas_df.at_time(time_of_day))

    df_equals(
        modin_df.T.at_time("12:00", axis=1),
        pandas_df.T.at_time("12:00", axis=1),
    )
Пример #26
0
def test_drop_api_equivalence():
    """Check that the labels/axis and index/columns forms of ``drop`` agree."""
    # equivalence of the labels/axis and index/columns API's
    frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]]

    modin_df = pd.DataFrame(frame_data,
                            index=["a", "b", "c"],
                            columns=["d", "e", "f"])

    modin_df1 = modin_df.drop("a")
    modin_df2 = modin_df.drop(index="a")
    df_equals(modin_df1, modin_df2)

    # ``axis`` is passed by keyword: the positional form ``drop("d", 1)`` is
    # deprecated in pandas and not accepted by the pandas 2.0 signature.
    modin_df1 = modin_df.drop("d", axis=1)
    modin_df2 = modin_df.drop(columns="d")
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop(labels="e", axis=1)
    modin_df2 = modin_df.drop(columns="e")
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop(["a"], axis=0)
    modin_df2 = modin_df.drop(index=["a"])
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop(["a"], axis=0).drop(["d"], axis=1)
    modin_df2 = modin_df.drop(index=["a"], columns=["d"])
    df_equals(modin_df1, modin_df2)

    # Mixing ``labels`` with ``index``/``columns`` must be rejected.
    with pytest.raises(ValueError):
        modin_df.drop(labels="a", index="b")

    with pytest.raises(ValueError):
        modin_df.drop(labels="a", columns="b")

    # An axis without anything to drop must be rejected as well.
    with pytest.raises(ValueError):
        modin_df.drop(axis=1)
Пример #27
0
def test_astype_category_large():
    """Check ``astype("category")`` on a 10,000-row frame, per column and whole-frame."""
    series_length = 10_000
    columns = {
        "col1": [f"str{i}" for i in range(series_length)],
        "col2": list(range(series_length)),
    }
    modin_df = pd.DataFrame(columns)
    pandas_df = pandas.DataFrame(columns)

    # First only "col1" is converted, then every column.
    for target in ({"col1": "category"}, "category"):
        modin_result = modin_df.astype(target)
        pandas_result = pandas_df.astype(target)
        df_equals(modin_result, pandas_result)
        assert modin_result.dtypes.equals(pandas_result.dtypes)
Пример #28
0
def test_fillna_datetime_columns():
    """Check ``fillna("?")`` on frames that contain a datetime column."""
    # Two variants of column "B": a complete date range, and timestamps ending in NaT.
    b_variants = (
        pd.date_range("20130101", periods=3),
        [
            pandas.Timestamp("2013-01-01"),
            pandas.Timestamp("2013-01-02"),
            pandas.NaT,
        ],
    )
    for b_values in b_variants:
        frame_data = {
            "A": [-1, -2, np.nan],
            "B": b_values,
            "C": ["foo", "bar", None],
            "D": ["foo2", "bar2", None],
        }
        pandas_df = pandas.DataFrame(
            frame_data, index=pd.date_range("20130110", periods=3)
        )
        modin_df = pd.DataFrame(
            frame_data, index=pd.date_range("20130110", periods=3)
        )
        df_equals(modin_df.fillna("?"), pandas_df.fillna("?"))
Пример #29
0
    def test_h2o_q10(self):
        """Group by ``id1``..``id6`` with sum of ``v3`` and count of ``v1``."""
        source = self._get_h2o_df()
        group_keys = ["id1", "id2", "id3", "id4", "id5", "id6"]
        aggregations = {"v3": "sum", "v1": "count"}

        # Reference result computed on the frame returned by ``_get_h2o_df``.
        expected = source.groupby(group_keys, observed=True).agg(aggregations)
        expected.reset_index(inplace=True)

        modin_frame = pd.DataFrame(source)
        modin_frame = modin_frame.groupby(group_keys, observed=True).agg(aggregations)
        modin_frame.reset_index(inplace=True)

        actual = to_pandas(modin_frame)
        # Cast the first three key columns to category before comparing.
        for key in ("id1", "id2", "id3"):
            actual[key] = actual[key].astype("category")

        df_equals(expected, actual)
Пример #30
0
    def test_h2o_q7(self):
        """Group by ``id3`` and compare ``max(v1) - min(v2)`` in lazy mode."""
        source = self._get_h2o_df()
        aggregations = {"v1": "max", "v2": "min"}

        # Reference result computed on the frame returned by ``_get_h2o_df``.
        expected = source.groupby(["id3"], observed=True).agg(aggregations)
        expected = expected.assign(range_v1_v2=lambda x: x["v1"] - x["v2"])
        expected = expected[["range_v1_v2"]]
        expected.reset_index(inplace=True)

        modin_frame = pd.DataFrame(source)
        set_execution_mode(modin_frame, "lazy")
        modin_frame = modin_frame.groupby(["id3"], observed=True).agg(aggregations)
        modin_frame["range_v1_v2"] = modin_frame["v1"] - modin_frame["v2"]
        modin_frame = modin_frame[["range_v1_v2"]]
        modin_frame.reset_index(inplace=True)
        set_execution_mode(modin_frame, None)

        actual = to_pandas(modin_frame)
        # Cast the key column to category before comparing with the reference.
        actual["id3"] = actual["id3"].astype("category")

        df_equals(expected, actual)