Python DataFrame示例，legate.pandas.DataFrame Python示例

示例#1

0

显示文件

def test():
    """Check that pandas-style containers interoperate with NumPy functions.

    Three things are verified against plain NumPy arrays:

    * converting a ``Series`` with ``np.asarray``,
    * applying ``np.add`` to a ``Series`` and an array,
    * applying ``np.add`` to two columns of a ``DataFrame``.
    """
    lhs = np.array([1, 2, 3])
    rhs = np.array([4, 5, 6])
    expected_sum = lhs + rhs

    # Series -> ndarray conversion keeps the values.
    series = pd.Series([1, 2, 3])
    assert np.array_equal(np.asarray(series), lhs)

    # A ufunc mixing a Series with a plain array.
    assert np.array_equal(np.add(series, rhs), expected_sum)

    # The same ufunc applied to two DataFrame columns.
    frame = pd.DataFrame({"x": lhs, "y": rhs})
    assert np.array_equal(np.add(frame["x"], frame["y"]), expected_sum)

示例#2

0

显示文件

                "c1": c1,
                "c3": c3_l,
                "key1": np.array(key_left, dtype=key_dtype1),
                "key2": np.array(key_left, dtype=key_dtype2),
            }
        )
        df2 = pd.DataFrame(
            {
                "c2": c2,
                "c3": c3_r,
                "key1": np.array(key_right, dtype=key_dtype1),
                "key2": np.array(key_right, dtype=key_dtype2),
            }
        )

        ldf1 = lp.DataFrame(df1)
        ldf2 = lp.DataFrame(df2)

        join_pandas = df1.merge(df2, on=keys)
        join_legate = ldf1.merge(ldf2, on=keys, method="broadcast")
        join_legate_hash = ldf1.merge(ldf2, on=keys, method="hash")

        assert sort_and_compare(join_pandas, to_pandas(join_legate))
        assert sort_and_compare(join_pandas, to_pandas(join_legate_hash))

    key_left = list(chain(*[[x] * 3 for x in range(n // 3, 0, -1)]))
    for pair in product(key_dtypes, key_dtypes[1:] + key_dtypes[:1]):
        key_dtype1, key_dtype2 = pair
        print(
            "Type: left, Size: %u, Key dtype1: %s, Key dtype2: %s "
            % (n, str(key_dtype1), str(key_dtype2))

示例#3

0

显示文件

# Copyright 2021 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd

from legate import pandas as lp

# Build the same two-column frame in pandas and in Legate.
columns = {"col1": [1, 2, 3], "col2": [4, 5, 6]}
df = pd.DataFrame(columns)
ldf = lp.DataFrame(df)

# Copy "col1" into a new "col3" column, first on the pandas frame and then
# on the Legate frame.
for frame in (df, ldf):
    frame["col3"] = frame["col1"]

# The Legate frame is compared against a Legate frame rebuilt from the
# updated pandas frame.
expected = lp.DataFrame(df)
assert ldf.equals(expected)

示例#4

0

显示文件

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd

from legate import pandas as lp

# Four columns: two integer ranges, a column of repeated-digit strings and a
# column with only three distinct string values.
data = {
    "a": range(10),
    "b": range(1, 11),
    "c": [str(i) * 3 for i in range(10)],
    "d": [str(i % 3) for i in range(10)],
}
df = pd.DataFrame(data)
df["c"] = df["c"].astype(pd.StringDtype())
df["d"] = df["d"].astype("category")
ldf = lp.DataFrame(df)

# tail() is checked against pandas for a short and a nearly-full row count.
for num_rows in (2, 9):
    assert ldf.tail(num_rows).equals(df.tail(num_rows))

示例#5

0

显示文件

# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd

from legate import pandas as lp
from tests.utils import equals, must_fail


def _test(ex, df, *args):
    """Run ``df.insert(*args)`` through ``must_fail`` with exception ``ex``.

    NOTE(review): ``must_fail`` comes from ``tests.utils``; presumably it
    asserts that the call raises ``ex`` -- confirm against that helper.
    """
    insert = df.insert
    must_fail(ex, insert, *args)


# Case 1: insert a scalar column into completely empty frames.
df = pd.DataFrame()
ldf = lp.DataFrame()

# The same insert (position 0, column "a", scalar value 1) is applied to the
# pandas frame and to the Legate frame.
df.insert(0, "a", 1)
ldf.insert(0, "a", 1)

assert equals(ldf, df)

# Case 2: frames that have an index but no columns.
df = pd.DataFrame(index=[1, 2, 3])
ldf = lp.DataFrame(index=[1, 2, 3])

df.insert(0, "a", 1)
ldf.insert(0, "a", 1)

assert equals(ldf, df)

# Case 3 starts from a frame that already has two columns.
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

示例#6

0

显示文件

文件： df_from_numpy.py 项目： nv-legate/legate.pandas

#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

try:
    from legate.numpy.random import random
except ModuleNotFoundError:
    from numpy.random import random

import pandas as pd

from legate import pandas as lp

# Two random vectors of length 10 serve as the column data.
x, y = random(10), random(10)
data = {"x": x, "y": y}
df = pd.DataFrame(data)

# Construct Legate frames from a dict of arrays and from another Legate frame.
ldf1 = lp.DataFrame(data)
# FIXME: We don't handle this case correctly now. DataFrame's ctor
#        should align all series in the dictionary.
# ldf2 = lp.DataFrame({"x": lp.Series(x), "y": lp.Series(y)})
ldf3 = lp.DataFrame(ldf1)

# Each construction path is compared with a Legate frame built from the
# pandas reference.
# (ldf2 is left out until the FIXME above is resolved.)
for candidate in (ldf1, ldf3):
    assert candidate.equals(lp.DataFrame(df))

示例#7

0

显示文件

文件： sr_dropna.py 项目： magnatelee/legate.pandas

#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd

from legate import pandas as lp

n = 17

# Every fourth entry (0, 4, 8, ...) is missing; the rest are three-fold
# repetitions of the position's decimal digits.
a = [None if pos % 4 == 0 else str(pos) * 3 for pos in range(n)]

s = pd.Series(a).astype(pd.StringDtype())

# NOTE(review): the file is named sr_dropna.py but the series is wrapped in
# lp.DataFrame -- confirm that lp.Series was not intended.
ls = lp.DataFrame(s)

# dropna() on the Legate object is compared with dropna() on the pandas one.
out_pd = s.dropna()
out_lp = ls.dropna()
assert out_lp.equals(out_pd)

示例#8

0

显示文件

文件： df_binary_external.py 项目： nv-legate/legate.pandas

# limitations under the License.
#

import pandas as pd
from numpy.random import permutation, randn

from legate import pandas as lp
from tests.utils import equals

n = 17
# Two index flavours: a RangeIndex starting at 1 and a shuffled integer index.
indices = [pd.RangeIndex(1, n + 1), pd.Index(permutation(n))]

for index in indices:
    print(f"Index: {index}")
    # Both operands use integer column labels (1, 2, 5) and share the index.
    df1 = pd.DataFrame({1: randn(n), 2: randn(n), 5: randn(n)}, index=index)
    ldf1 = lp.DataFrame(df1)
    df2 = pd.DataFrame({1: randn(n), 2: randn(n), 5: randn(n)}, index=index)

    # Legate frame + pandas frame (the right operand is not converted).
    out_pd = df1 + df2
    out_lp = ldf1 + df2
    assert equals(out_lp, out_pd)

    # Legate frame + the raw values of the pandas frame.
    out_pd = df1 + df2.values
    out_lp = ldf1 + df2.values
    assert equals(out_lp, out_pd)

    # add() with the values of a single column, applied along axis 0.
    out_pd = df1.add(df2[1].values, axis=0)
    out_lp = ldf1.add(df2[1].values, axis=0)
    assert equals(out_lp, out_pd)

    # add() with a plain Python list, applied along axis 0.
    out_pd = df1.add(df2[1].to_list(), axis=0)

示例#9

0

显示文件

文件： df_merge_index.py 项目： nv-legate/legate.pandas

            {"c1": c1, "key1": np.array(key_left, dtype=key_dtype), "c3": c3_l}
        )
        df1["key"] = df1["key1"]
        df1_key_on_index = df1.set_index("key")

        df2 = pd.DataFrame(
            {
                "c2": c2,
                "key2": np.array(key_right, dtype=key_dtype),
                "c3": c3_r,
            }
        )
        df2["key"] = df2["key2"]
        df2_key_on_index = df2.set_index("key")

        ldf1 = lp.DataFrame(df1)
        ldf1_key_on_index = lp.DataFrame(df1_key_on_index)
        ldf2 = lp.DataFrame(df2)
        ldf2_key_on_index = lp.DataFrame(df2_key_on_index)

        join_pandas2 = df1.merge(
            df2_key_on_index, left_on="key1", right_index=True
        )
        join_pandas4 = df1_key_on_index.merge(
            df2, right_on="key2", left_index=True
        )
        # XXX: Pandas sorts the keys in the output when both left_index and
        #      right_index are True, whereas Legate does not, for performance
        #      reasons. In this test we sorted the keys in the input dataframes
        #      so that Pandas' join output coincides with Legate's. We can't
        #      and won't guarantee this semantic equivalence in general.

示例#10

0

显示文件

文件： df_loc_empty.py 项目： nv-legate/legate.pandas

import pandas as pd

from legate import pandas as lp
from tests.utils import equals, must_fail


def _test(ex, df, *args):
    """Run the lookup ``df.loc[args]`` through ``must_fail`` with ``ex``.

    ``args`` is the tuple of positional arguments, so a single argument is
    passed to ``.loc`` as a one-element tuple.

    NOTE(review): ``must_fail`` comes from ``tests.utils``; presumably it
    asserts that calling ``_loc`` raises ``ex`` -- confirm against that helper.
    """
    # The lookup is wrapped in a zero-argument callable so that it is
    # evaluated by must_fail rather than here.
    def _loc():
        df.loc[args]

    must_fail(ex, _loc)


n = 17

# The index labels run from 3 to n + 2, once as a RangeIndex and once as a
# generic Index built from a list; n + 3 and n + 4 are therefore absent.
for index in [pd.RangeIndex(3, n + 3), pd.Index(list(range(3, n + 3)))]:
    # df_copy is an independently constructed frame with the same contents,
    # kept as the reference for the comparisons below.
    df_copy = lp.DataFrame({"a": range(n)}, index=index)
    df = lp.DataFrame({"a": range(n)}, index=index)

    # Scalar lookups of absent labels are expected to fail with KeyError.
    _test(KeyError, df, n + 3)
    _test(KeyError, df, n + 4, "a")

    # A slice over absent labels yields an empty result.
    assert len(df.loc[n + 3:n + 4]) == 0

    # After assigning to an absent label, the test expects df to still equal
    # the reference frame.
    df.loc[n + 3] = 100
    assert equals(df_copy, df)

    # Same expectation after assigning to a slice over absent labels.
    df.loc[n + 3:n + 4] = 200
    assert equals(df_copy, df)

示例#11

0

显示文件

        pd.RangeIndex(21, 1, -2),
        pd.Index(permutation(10)),
]:
    print(f"Index: {index}")
    df = pd.DataFrame(
        {
            "a": range(10),
            "b": range(1, 11),
            "c": [str(i) * 3 for i in range(10)],
            "d": [str(i % 3) for i in range(10)],
        },
        index=index,
    )
    df["c"] = df["c"].astype(pd.StringDtype())
    df["d"] = df["d"].astype("category")
    ldf = lp.DataFrame(df)

    for idx in range(4):
        print(f"Testing ldf.iat[{index[idx + 3]}, {idx}].__getitem__")
        out_pd = df.iat[idx + 3, idx]
        out_lp = ldf.iat[idx + 3, idx]
        assert equals_scalar(out_lp, out_pd)

    for idx, val in enumerate([100, 200, "5678"]):
        print(f"Testing ldf.iat[{index[idx + 3]}, {idx}].__setitem__")
        df.iat[idx + 3, idx] = val
        ldf.iat[idx + 3, idx] = val

        out_pd = df.iat[idx + 3, idx]
        out_lp = ldf.iat[idx + 3, idx]
        assert equals_scalar(out_lp, out_pd)

示例#12

0

显示文件

文件： df_create.py 项目： nv-legate/legate.pandas

import pandas as pd

from legate import pandas as lp
from tests.utils import equals

# RangeIndex variants with three elements each: default start, shifted start,
# and two stepped ranges.
indices = [
    pd.RangeIndex(3),
    pd.RangeIndex(1, 4),
    pd.RangeIndex(6, step=2),
    pd.RangeIndex(1, 10, step=3),
]

for index in indices:
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=index)
    ldf = lp.DataFrame(df)

    # Passing Legate dataframes as arguments
    # (with a dtype override, a column selection, and both together; each
    # result is compared with the equivalent pandas construction).
    assert equals(lp.DataFrame(ldf, dtype="float64"),
                  pd.DataFrame(df, dtype="float64"))
    assert equals(lp.DataFrame(ldf, columns=["a"]),
                  pd.DataFrame(df, columns=["a"]))
    assert equals(
        lp.DataFrame(ldf, columns=["a"], dtype="float64"),
        pd.DataFrame(df, columns=["a"], dtype="float64"),
    )

    # Passing Legate series as arguments
    assert equals(lp.DataFrame(ldf["a"]), pd.DataFrame(df["a"]))
    assert equals(
        lp.DataFrame(ldf["a"], dtype="float32"),

示例#13

0

显示文件

文件： df_create_empty.py 项目： nv-legate/legate.pandas

#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pandas as pd

from legate import pandas as lp
from tests.utils import equals

# Completely empty frames (no index, no columns).
df = pd.DataFrame()
ldf = lp.DataFrame()

assert equals(ldf, df)

# Assign a scalar to a new column "a" on both empty frames.
df["a"] = 0
ldf["a"] = 0

assert equals(ldf, df)

# Frames with column labels but no rows.
df = pd.DataFrame(columns=["a", "b"])
ldf = lp.DataFrame(columns=["a", "b"])

assert equals(ldf, df)

# Assign a scalar string to column "a" through .loc with a full row slice.
df.loc[:, "a"] = "1"
ldf.loc[:, "a"] = "1"

示例#14

0

显示文件

文件： sort.py 项目： nv-legate/legate.pandas

def test(
    size_per_proc=1000,
    num_procs=1,
    num_runs=1,
    scale_lhs_only=False,
    package="legate",
    ty="int64",
    key_length=40,
    pad_side="right",
):
    if package == "legate":
        from legate import numpy as np, pandas as pd
        from legate.numpy.random import randn

    elif package == "cudf":
        import cudf as pd
        import cupy as np
        from cupy.random import randn

    elif package == "pandas":
        import numpy as np
        import pandas as pd
        from numpy.random import randn

    else:
        print("Unknown dataframe package: %s" % package)
        assert False

    if package == "legate":
        from legate.timing import time

        def block():
            pass

        def get_timestamp():
            return time()

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) / 1000.0

    else:
        import time

        def block():
            pass

        get_timestamp = time.process_time

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) * 1000.0

    size = size_per_proc * num_procs

    key = np.arange(size, dtype=np.int64) % size_per_proc
    payload = randn(size)

    df = pd.DataFrame({"key": key, "payload": payload})
    if ty == "int64":
        df["key"] = df["key"] * -1
        ascending = True
    if ty == "string":
        df["key"] = (
            df["key"]
            .astype(str)
            .str.pad(width=key_length, side=pad_side, fillchar="0")
        )
        ascending = False

    print("Size: %u, Key dtype: %s" % (size, df["key"].dtype))

    block()

    for i in range(num_runs):
        start_ts = get_timestamp()

        result = df.sort_values("key", ignore_index=True, ascending=ascending)

        stop_ts = get_timestamp()

        print(
            "[Run %d] Elapsed time: %lf ms"
            % (i + 1, compute_elapsed_time(start_ts, stop_ts))
        )

        del result

示例#15

0

显示文件

文件： df_merge_chained.py 项目： nv-legate/legate.pandas

        "key1": np.array(key1, dtype=np.int64),
        "key2": np.array(key1[::-1], dtype=np.int64),
    })
    df2 = pd.DataFrame({
        "c2": c2,
        "key1": np.array(key2, dtype=np.int64),
        "key2": np.array(key2[::-1], dtype=np.int64),
    })
    df3 = pd.DataFrame({"c3": c3, "key1": np.array(key3, dtype=np.int64)})
    df4 = pd.DataFrame({
        "c4": c3,
        "key1": np.array(key3, dtype=np.int64),
        "key2": np.array(key3[::-1], dtype=np.int64),
    })

    ldf1 = lp.DataFrame(df1)
    ldf2 = lp.DataFrame(df2)
    ldf3 = lp.DataFrame(df3)
    ldf4 = lp.DataFrame(df4)

    join_pandas = (df1.merge(df2,
                             on=["key1",
                                 "key2"]).merge(df3,
                                                on="key1").merge(df4,
                                                                 on="key1"))
    join_legate = (ldf1.merge(ldf2,
                              on=["key1",
                                  "key2"]).merge(ldf3,
                                                 on="key1").merge(ldf4,
                                                                  on="key1"))

示例#16

0

显示文件

文件： merge.py 项目： nv-legate/legate.pandas

def test(
    size_per_proc=1000,
    num_procs=1,
    num_runs=1,
    ty="int64",
    key_length=10,
    scale_lhs_only=False,
    package="legate",
):
    if package == "legate":
        from legate import numpy as np, pandas as pd
        from legate.numpy.random import randn

    elif package == "cudf":
        import cudf as pd
        import cupy as np
        from cupy.random import randn

    elif package == "pandas":
        import numpy as np
        import pandas as pd
        from numpy.random import randn

    elif package == "dask" or package == "daskcudf":
        import dask.array as da
        import dask.dataframe as df
        import numpy as np

        if package == "daskcudf":
            import cudf

    else:
        print("Unknown dataframe package: %s" % package)
        assert False

    if package == "legate":
        from legate.timing import time

        def block(*args):
            pass

        def get_timestamp():
            return time()

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) / 1000.0

    elif package == "dask" or package == "daskcudf":
        import time

        def block(*args):
            for arg in args:
                arg.compute()

        get_timestamp = time.process_time

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) * 1000.0

    else:
        import time

        def block(*args):
            pass

        get_timestamp = time.process_time

        def compute_elapsed_time(start_ts, stop_ts):
            return (stop_ts - start_ts) * 1000.0

    if scale_lhs_only:
        size = size_per_proc * num_procs
        size_rhs = size // 3

        if package == "dask" or package == "daskcudf":
            # Dask array does not have randn so use arrange
            c1 = da.arange(size, dtype=np.float64, chunks=size_per_proc)
            c2 = da.arange(
                size_rhs,
                dtype=np.float64,
                chunks=(size_per_proc + num_procs - 1) // num_procs,
            )
        else:
            c1 = randn(size)
            c2 = randn(size_rhs)

        key_dtype = np.int64
        if package == "dask" or package == "daskcudf":
            key_left = (
                da.arange(size, dtype=key_dtype, chunks=size_per_proc)
                % size_per_proc
            )
            key_right = da.arange(
                size_rhs,
                dtype=key_dtype,
                chunks=(size_per_proc + num_procs - 1) // num_procs,
            )
            da.multiply(key_right, 3, out=key_right)
        else:
            key_left = np.arange(size, dtype=key_dtype) % size_per_proc
            key_right = np.arange(size_rhs, dtype=key_dtype)
            np.multiply(key_right, 3, out=key_right)

    else:
        size = size_per_proc * num_procs
        size_rhs = size

        if package == "dask" or package == "daskcudf":
            # Dask array does not have randn so use arrange
            c1 = da.arange(size, dtype=np.float64, chunks=size_per_proc)
            c2 = da.arange(size, dtype=np.float64, chunks=size_per_proc)
        else:
            c1 = randn(size)
            c2 = randn(size)

        key_dtype = np.int64
        if package == "dask" or package == "daskcudf":
            key_left = da.arange(size, dtype=key_dtype, chunks=size_per_proc)
            key_right = da.arange(size, dtype=key_dtype, chunks=size_per_proc)
        else:
            key_left = np.arange(size, dtype=key_dtype)
            key_right = np.arange(size, dtype=key_dtype)
        # np.floor_divide(key_right, 3, out=key_right)
        # np.multiply(key_right, 3, out=key_right)

    if package == "dask" or package == "daskcudf":
        df1 = df.multi.concat(
            [df.from_dask_array(a) for a in [c1, key_left]], axis=1
        )
        df1.columns = ["c1", "key"]
        df2 = df.multi.concat(
            [df.from_dask_array(a) for a in [c2, key_right]], axis=1
        )
        df2.columns = ["c2", "key"]
        if package == "daskcudf":
            df1 = df1.map_partitions(cudf.from_pandas)
            df2 = df2.map_partitions(cudf.from_pandas)
    else:
        df1 = pd.DataFrame({"c1": c1, "key": key_left})
        df2 = pd.DataFrame({"c2": c2, "key": key_right})
    df2["key"] = df2["key"] // 3 * 3

    if ty == "string":
        df1["key"] = (
            df1["key"]
            .astype("string")
            .str.pad(width=key_length, side="both", fillchar="0")
        )
        df2["key"] = (
            df2["key"]
            .astype("string")
            .str.pad(width=key_length, side="both", fillchar="0")
        )

    print(
        "Type: inner, Size: %u x %u, Key dtype: %s"
        % (size, size_rhs, str(key_dtype))
    )

    block(df1, df2)

    for i in range(num_runs):
        start_ts = get_timestamp()

        df_result = df1.merge(df2, on="key")

        block(df_result)

        stop_ts = get_timestamp()

        print(
            "[Run %d] Elapsed time: %lf ms"
            % (i + 1, compute_elapsed_time(start_ts, stop_ts))
        )

        del df_result