Example No. 1
import random

import numpy as np
import pandas as pd

from pycylon import CylonEnv, DataFrame
from pycylon.net import MPIConfig


def assert_eq(df_c: DataFrame, df_p: pd.DataFrame, sort=False):
    if sort:  # sort by every column, since distributed row order may differ
        # debug output: print both frames before asserting equality
        print(
            df_c.sort_values(by=[*range(0, df_c.shape[1])]).to_numpy(
                order='F', zero_copy_only=False))
        print(df_p.to_numpy())
        assert np.array_equal(
            df_c.sort_values(
                by=[*range(0, df_c.shape[1])]).to_pandas().to_numpy(),
            df_p.sort_values(by=[*range(0, df_p.shape[1])]).to_numpy())
    else:
        assert np.array_equal(df_c.to_numpy(order='F', zero_copy_only=False),
                              df_p.to_numpy())


def test_df_dist_sorting():
    df1 = DataFrame(
        [random.sample(range(10, 30), 5),
         random.sample(range(10, 30), 5)])

    def check_sort(df, col, ascending):
        arr = df.to_pandas()[col]
        for i in range(len(arr) - 1):
            if ascending:
                assert arr[i] <= arr[i + 1]
            else:
                assert arr[i] >= arr[i + 1]

    # local sort
    df = df1.sort_values('0', ascending=True)
    check_sort(df, '0', True)

    df = df1.sort_values('0', ascending=False)
    check_sort(df, '0', False)

    # distributed sort
    env = CylonEnv(config=MPIConfig(), distributed=True)

    print("Distributed Sort", env.rank, env.world_size)

    df3 = df1.sort_values(by=[0], env=env, ascending=True)
    check_sort(df3, '0', True)

    df3 = df1.sort_values(by=[0], env=env, ascending=False)
    check_sort(df3, '0', False)
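Row order is generally not deterministic after a distributed operation, so `assert_eq` above is meant to be called with `sort=True` when comparing a Cylon result against a pandas reference. A minimal usage sketch with made-up data (note the transpose: Cylon treats each inner list as a column, pandas as a row):

data = [[3, 1, 2], [30, 10, 20]]
df_c = DataFrame(data)                  # each inner list becomes a column
df_p = pd.DataFrame(np.array(data).T)  # transpose to match the Cylon layout
assert_eq(df_c, df_p, sort=True)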
Example No. 3
def create_high_price_items_df(env: CylonEnv, item_df: DataFrame) -> DataFrame:
    """
    grouped_item_df = (
        item_df[["i_category", "i_current_price"]]
            .groupby(["i_category"])
            .agg({"i_current_price": "mean"})
    )
    grouped_item_df = grouped_item_df.rename(columns={"i_current_price": "avg_price"})
    grouped_item_df = grouped_item_df.reset_index(drop=False)

    item_df = item_df.merge(grouped_item_df)
    item_df = item_df[
        item_df["i_current_price"] > item_df["avg_price"] * q07_HIGHER_PRICE_RATIO
        ].reset_index(drop=True)
    high_price_items_df = item_df
    del item_df
    return high_price_items_df
    """
    # item_df = ["i_item_sk", "i_current_price", "i_category"]
    grouped_item_df = item_df[["i_category", "i_current_price"]].groupby(by=0, env=env) \
        .agg({"i_current_price": 'mean'})
    # [i_category, mean_i_current_price]

    grouped_item_df.rename({"mean_i_current_price": "avg_price"})
    # [i_category, avg_price]

    # todo: this join has unbalanced work & can create empty tables
    item_df = item_df.merge(grouped_item_df, how='inner', algorithm='sort',
                            on=['i_category'], suffixes=("", "rt-"), env=env) \
        .drop(['rt-i_category'])
    item_df.rename({"rt-avg_price": "avg_price"})
    # [i_item_sk, i_current_price, i_category, avg_price]

    if len(item_df):
        item_df = item_df[item_df["i_current_price"] > item_df["avg_price"] *
                          q07_HIGHER_PRICE_RATIO]
        # [i_item_sk, i_current_price, i_category, avg_price]

    return item_df
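A hedged driver sketch for create_high_price_items_df; the toy values below are illustrative only, and q07_HIGHER_PRICE_RATIO is assumed to be defined in the surrounding module:

# illustrative driver; q07_HIGHER_PRICE_RATIO comes from the enclosing module
env = CylonEnv(config=MPIConfig())
item_df = DataFrame([[1, 2, 3, 4],           # i_item_sk
                     [9.9, 1.0, 8.5, 2.0],   # i_current_price
                     ["a", "a", "b", "b"]])  # i_category
item_df.rename(["i_item_sk", "i_current_price", "i_category"])
high_price_items_df = create_high_price_items_df(env, item_df)
print(high_price_items_df)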
Example No. 4
    def read(self,
             env: CylonEnv,
             table,
             relevant_cols=None,
             **kwargs) -> DataFrame:
        filepath = self.table_path_mapping[table].replace('$TABLE', table)

        names, _ = get_schema(table)
        # csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
        # .with_delimiter('|')
        read_opts = ReadOptions(column_names=names, block_size=(1 << 30))
        parse_opts = ParseOptions(delimiter='|')
        convert_opts = ConvertOptions(include_columns=relevant_cols)

        # if table is in refresh_tables list, read that table and concat
        # NOTE: refresh tables have the same parallelism as its data tables
        if table in REFRESH_TABLES:
            data_table = pa_read_csv(filepath,
                                     read_options=read_opts,
                                     parse_options=parse_opts,
                                     convert_options=convert_opts)
            refresh_path = filepath.replace('/data/', '/data_refresh/')

            refresh_table = pa_read_csv(refresh_path,
                                        read_options=read_opts,
                                        parse_options=parse_opts,
                                        convert_options=convert_opts)

            pa_table = pa_concat_tables([data_table, refresh_table])
        else:
            pa_table = pa_read_csv(filepath,
                                   read_options=read_opts,
                                   parse_options=parse_opts,
                                   convert_options=convert_opts)

        return DataFrame(Table.from_arrow(env.context, pa_table))
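The pyarrow helpers above are presumably aliased imports; a sketch of the imports this method relies on (the aliases are assumptions inferred from usage, the APIs themselves are standard pyarrow/pycylon):

from pyarrow import concat_tables as pa_concat_tables
from pyarrow.csv import (ConvertOptions, ParseOptions, ReadOptions,
                         read_csv as pa_read_csv)

from pycylon import DataFrame, Table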
Example No. 5

import random

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig

df1 = DataFrame([random.sample(range(10, 100), 50),
                 random.sample(range(10, 100), 50)])

# local sort
df3 = df1.sort_values(by=[0])
print("Local Sort")
print(df3)

# distributed sort
env = CylonEnv(config=MPIConfig())

df1 = DataFrame([random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
                 random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)])
print("Distributed Sort", env.rank)
df3 = df1.sort_values(by=[0], env=env)
print(df3)
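Since the environment is MPI-backed, the distributed sort only does real work when the script is launched with multiple processes, e.g. (the script name here is a placeholder):

mpirun -n 4 python sort_example.py

Each rank builds its own partition from env.rank, and sort_values(by=[0], env=env) then produces a globally sorted result partitioned across the ranks.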
Example No. 6
##
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

from pycylon import DataFrame

df1 = DataFrame([[0, 0, 1, 1], [1, 10, 1, 5], [10, 20, 30, 40]])

df3 = df1.groupby(by=0).agg({"1": "sum", "2": "min"})
print(df3)

df4 = df1.groupby(by=0).min()
print(df4)

df5 = df1.groupby(by=[0, 1]).max()
print(df5)
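For comparison, a pandas sketch of the same aggregation; the transpose and the string column names follow the conventions seen in the other examples here:

import numpy as np
import pandas as pd

pdf = pd.DataFrame(np.array([[0, 0, 1, 1], [1, 10, 1, 5], [10, 20, 30, 40]]).T,
                   columns=['0', '1', '2'])
print(pdf.groupby('0').agg({'1': 'sum', '2': 'min'}))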
Example No. 7
from typing import Tuple

def create_df(data) -> Tuple[DataFrame, pd.DataFrame]:
    # np.array(data).T is a temp fix for the layout inconsistency: Cylon
    # treats each inner list as a column, pandas treats it as a row
    return DataFrame(data), pd.DataFrame(np.array(data).T)
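A quick usage sketch (the values are illustrative):

# both frames now hold the same values in the same column layout
df_c, df_p = create_df([[1, 2, 3], [10, 20, 30]])
print(df_c)  # Cylon frame
print(df_p)  # pandas frame with the same values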
Example No. 8
def test_pd_read_csv():
    env = CylonEnv(config=MPIConfig())

    df1 = DataFrame(pd.read_csv('/tmp/user_usage_tm_1.csv'))
    df2 = DataFrame(pd.read_csv('/tmp/user_device_tm_1.csv'))

    df1 = df1.set_index([3], drop=True)
    df2 = df2.set_index([0], drop=True)

    df1.to_table().retain_memory(False)
    df2.to_table().retain_memory(False)

    df3 = df1.merge(right=df2,
                    left_on=[3],
                    right_on=[0],
                    algorithm='sort',
                    env=env)

    assert len(df3)
Example No. 9

from pycylon import DataFrame, read_csv, CylonEnv
from pycylon.net import MPIConfig
import sys
import pandas as pd

# using cylon native reader
df = read_csv(sys.argv[1])
print(df)

# using pandas to load csv
df = DataFrame(
    pd.read_csv(
        "http://data.un.org/_Docs/SYB/CSV/SYB63_1_202009_Population,%20Surface%20Area%20and%20Density.csv",
        skiprows=6,
        usecols=[0, 1]))
print(df)

# loading json
df = DataFrame(pd.read_json("https://api.exchangerate-api.com/v4/latest/USD"))
print(df)

# distributed loading: run in distributed mode with MPI or UCX
env = CylonEnv(config=MPIConfig())
df = read_csv(sys.argv[1], slice=True, env=env)
print(df)
Example No. 10

import random

import pycylon as cn
from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig

df1 = DataFrame(
    [random.sample(range(10, 100), 5),
     random.sample(range(10, 100), 5)])
df2 = DataFrame(
    [random.sample(range(10, 100), 5),
     random.sample(range(10, 100), 5)])
df3 = DataFrame(
    [random.sample(range(10, 100), 10),
     random.sample(range(10, 100), 10)])

# local concat
df4 = cn.concat(axis=0, objs=[df1, df2, df3])
print("Local concat axis0")
print(df4)

# rename columns so that a subsequent column-wise concat would not clash
df2.rename(['00', '11'])
df3.rename(['000', '111'])
Example No. 11

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig
import random

df1 = DataFrame(
    [random.sample(range(10, 100), 50),
     random.sample(range(10, 100), 50)])
df2 = DataFrame(
    [random.sample(range(10, 100), 50),
     random.sample(range(10, 100), 50)])
df2.set_index([0], inplace=True)

# local join
df3 = df1.join(other=df2, on=[0])
print("Local Join")
print(df3)

# distributed join
env = CylonEnv(config=MPIConfig())

# each rank builds its own partition, as in the local case above
df1 = DataFrame([
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)
])
df2 = DataFrame([
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)
])
df2.set_index([0], inplace=True)

print("Distributed Join", env.rank)
df3 = df1.join(other=df2, on=[0], env=env)
print(df3)
Example No. 12

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig
import random

df1 = DataFrame(
    [random.sample(range(10, 100), 50),
     random.sample(range(10, 100), 50)])
df2 = DataFrame(
    [random.sample(range(10, 100), 50),
     random.sample(range(10, 100), 50)])

# local merge
df3 = df1.merge(right=df2, on=[0])
print("Local Merge")
print(df3)

# distributed merge
env = CylonEnv(config=MPIConfig())

# each rank builds its own partition, as in the local case above
df1 = DataFrame([
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)
])
df2 = DataFrame([
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)
])

print("Distributed Merge", env.rank)
df3 = df1.merge(right=df2, on=[0], env=env)
print(df3)
Example No. 13

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig
import random

df1 = DataFrame(
    [random.sample(range(10, 100), 50),
     random.sample(range(10, 100), 50)])

# local unique
df3 = df1.drop_duplicates()
print("Local Unique")
print(df3)

# distributed unique
env = CylonEnv(config=MPIConfig())

df1 = DataFrame([
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)
])
print("Distributed Unique", env.rank)
Example No. 14
def main(env, config):
    # import cudf

    # Conf variables

    q09_year = 2001

    q09_part1_ca_country = "United States"
    # q09_part1_ca_state_IN = "KY", "GA", "NM"
    q09_part1_ca_state_IN = ["KY", "GA", "NM"]
    q09_part1_net_profit_min = 0
    q09_part1_net_profit_max = 2000
    q09_part1_education_status = "4 yr Degree"
    q09_part1_marital_status = "M"
    q09_part1_sales_price_min = 100
    q09_part1_sales_price_max = 150

    q09_part2_ca_country = "United States"
    # q09_part2_ca_state_IN = "MT", "OR", "IN"
    q09_part2_ca_state_IN = ["MT", "OR", "IN"]
    q09_part2_net_profit_min = 150
    q09_part2_net_profit_max = 3000
    q09_part2_education_status = "4 yr Degree"
    q09_part2_marital_status = "M"
    q09_part2_sales_price_min = 50
    q09_part2_sales_price_max = 200

    q09_part3_ca_country = "United States"
    # q09_part3_ca_state_IN = "WI", "MO", "WV"
    q09_part3_ca_state_IN = ["WI", "MO", "WV"]
    q09_part3_net_profit_min = 50
    q09_part3_net_profit_max = 25000
    q09_part3_education_status = "4 yr Degree"
    q09_part3_marital_status = "M"
    q09_part3_sales_price_min = 150
    q09_part3_sales_price_max = 200
    """
    (
        store_sales,
        customer_address,
        customer_demographics,
        date_dim,
        store,
    ) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    """
    (
        store_sales,
        customer_address,
        customer_demographics_1part,
        date_dim_1part,
        store,
    ) = read_tables(env, config=config)
    """
    date_dim = date_dim.query(
        "d_year==@q09_year", meta=date_dim._meta, local_dict={"q09_year": q09_year}
    ).reset_index(drop=True)
   """
    date_dim_1part = date_dim_1part[date_dim_1part['d_year'] == q09_year]
    # d_year: int64 d_date_sk: int64
    """
    output_table = store_sales.merge(
        date_dim, left_on=["ss_sold_date_sk"], right_on=["d_date_sk"], how="inner"
    )
    """
    output_table = store_sales.merge(date_dim_1part,
                                     how="inner",
                                     algorithm='sort',
                                     left_on=["ss_sold_date_sk"],
                                     right_on=["d_date_sk"],
                                     suffixes=('', ''))  # local
    # output_table.rename([x.split('-')[1] for x in output_table.column_names])
    # ss_quantity: int64
    # ss_sold_date_sk: int64
    # ss_addr_sk: int64
    # ss_store_sk: int64
    # ss_cdemo_sk: int64
    # ss_sales_price: double
    # ss_net_profit: double
    # d_year: int64
    # d_date_sk: int64
    """
    output_table = output_table.drop(
        columns=["d_year", "d_date_sk", "ss_sold_date_sk"]
    )  # Drop the columns that are not needed
    """
    # Drop the columns that are not needed
    output_table = output_table.drop(
        ["d_year", "d_date_sk", "ss_sold_date_sk"])
    """
    output_table = output_table.merge(
        store, left_on=["ss_store_sk"], right_on=["s_store_sk"], how="inner"
    )
    output_table = output_table.drop(columns=["ss_store_sk", "s_store_sk"])
    """
    output_table = output_table.merge(store,
                                      how="inner",
                                      algorithm='sort',
                                      left_on=["ss_store_sk"],
                                      right_on=["s_store_sk"],
                                      suffixes=('', ''),
                                      env=env)
    output_table = output_table.drop(["ss_store_sk", "s_store_sk"])
    # output_table.rename([x.split('-')[1] for x in output_table.column_names])
    # ss_quantity: int64
    # ss_addr_sk: int64
    # ss_cdemo_sk: int64
    # ss_sales_price: double
    # ss_net_profit: double
    """
    output_table = output_table.merge(
        customer_demographics,
        left_on=["ss_cdemo_sk"],
        right_on=["cd_demo_sk"],
        how="inner",
    )
    """
    output_table = output_table.merge(customer_demographics_1part,
                                      how="inner",
                                      algorithm='sort',
                                      left_on=["ss_cdemo_sk"],
                                      right_on=["cd_demo_sk"],
                                      suffixes=('', ''))  # local
    # output_table.rename([x.split('-')[1] for x in output_table.column_names])
    # ss_quantity: int64
    # ss_addr_sk: int64
    # ss_cdemo_sk: int64
    # ss_sales_price: double
    # ss_net_profit: double
    # cd_demo_sk: int64
    # cd_marital_status: string
    # cd_education_status: string
    """
    output_table = output_table[
        (
                (output_table["cd_marital_status"] == q09_part1_marital_status)
                & (output_table["cd_education_status"] == q09_part1_education_status)
                & (output_table["ss_sales_price"] >= q09_part1_sales_price_min)
                & (output_table["ss_sales_price"] <= q09_part1_sales_price_max)
        )
        | (
                (output_table["cd_marital_status"] == q09_part2_marital_status)
                & (output_table["cd_education_status"] == q09_part2_education_status)
                & (output_table["ss_sales_price"] >= q09_part2_sales_price_min)
                & (output_table["ss_sales_price"] <= q09_part2_sales_price_max)
        )
        | (
                (output_table["cd_marital_status"] == q09_part3_marital_status)
                & (output_table["cd_education_status"] == q09_part3_education_status)
                & (output_table["ss_sales_price"] >= q09_part3_sales_price_min)
                & (output_table["ss_sales_price"] <= q09_part3_sales_price_max)
        )
        ].reset_index(drop=True)
    output_table = output_table.drop(
        columns=[
            "ss_cdemo_sk",
            "cd_demo_sk",
            "cd_marital_status",
            "cd_education_status",
            "ss_sales_price",
        ]
    )
    """
    output_table = output_table[
        ((output_table["cd_marital_status"] == q09_part1_marital_status)
         & (output_table["cd_education_status"] == q09_part1_education_status)
         & (output_table["ss_sales_price"] >= q09_part1_sales_price_min)
         & (output_table["ss_sales_price"] <= q09_part1_sales_price_max))
        | ((output_table["cd_marital_status"] == q09_part2_marital_status)
           &
           (output_table["cd_education_status"] == q09_part2_education_status)
           & (output_table["ss_sales_price"] >= q09_part2_sales_price_min)
           & (output_table["ss_sales_price"] <= q09_part2_sales_price_max))
        | ((output_table["cd_marital_status"] == q09_part3_marital_status)
           &
           (output_table["cd_education_status"] == q09_part3_education_status)
           & (output_table["ss_sales_price"] >= q09_part3_sales_price_min)
           & (output_table["ss_sales_price"] <= q09_part3_sales_price_max))]
    output_table = output_table.drop([
        "ss_cdemo_sk",
        "cd_demo_sk",
        "cd_marital_status",
        "cd_education_status",
        "ss_sales_price",
    ])
    # ss_quantity: int64
    # ss_addr_sk: int64
    # ss_net_profit: double
    """    
    output_table = output_table.merge(
        customer_address,
        left_on=["ss_addr_sk"],
        right_on=["ca_address_sk"],
        how="inner",
    )"""
    output_table = output_table.merge(customer_address,
                                      how="inner",
                                      algorithm='sort',
                                      left_on=["ss_addr_sk"],
                                      right_on=["ca_address_sk"],
                                      suffixes=('', ''),
                                      env=env)
    # output_table.rename([x.split('-')[1] for x in output_table.column_names])
    """
    output_table = output_table[
        (
                (output_table["ca_country"] == q09_part1_ca_country)
                & (output_table["ca_state"].str.contains("|".join(q09_part1_ca_state_IN)))
                & (output_table["ss_net_profit"] >= q09_part1_net_profit_min)
                & (output_table["ss_net_profit"] <= q09_part1_net_profit_max)
        )
        | (
                (output_table["ca_country"] == q09_part2_ca_country)
                & (output_table["ca_state"].str.contains("|".join(q09_part2_ca_state_IN)))
                & (output_table["ss_net_profit"] >= q09_part2_net_profit_min)
                & (output_table["ss_net_profit"] <= q09_part2_net_profit_max)
        )
        | (
                (output_table["ca_country"] == q09_part3_ca_country)
                & (output_table["ca_state"].str.contains("|".join(q09_part3_ca_state_IN)))
                & (output_table["ss_net_profit"] >= q09_part3_net_profit_min)
                & (output_table["ss_net_profit"] <= q09_part3_net_profit_max)
        )
        ].reset_index(drop=True)
    output_table = output_table.drop(
        columns=[
            "ss_addr_sk",
            "ca_address_sk",
            "ca_country",
            "ca_state",
            "ss_net_profit",
        ]
    )"""

    output_table = output_table[
        ((output_table["ca_country"] == q09_part1_ca_country)
         & (output_table["ca_state"].isin(q09_part1_ca_state_IN))
         & (output_table["ss_net_profit"] >= q09_part1_net_profit_min)
         & (output_table["ss_net_profit"] <= q09_part1_net_profit_max))
        | ((output_table["ca_country"] == q09_part2_ca_country)
           & (output_table["ca_state"].isin(q09_part2_ca_state_IN))
           & (output_table["ss_net_profit"] >= q09_part2_net_profit_min)
           & (output_table["ss_net_profit"] <= q09_part2_net_profit_max))
        | ((output_table["ca_country"] == q09_part3_ca_country)
           & (output_table["ca_state"].isin(q09_part3_ca_state_IN))
           & (output_table["ss_net_profit"] >= q09_part3_net_profit_min)
           & (output_table["ss_net_profit"] <= q09_part3_net_profit_max))]
    output_table = output_table.drop([
        "ss_addr_sk",
        "ca_address_sk",
        "ca_country",
        "ca_state",
        "ss_net_profit",
    ])

    # this is a scalar, so there is no need to transform
    """
    result = output_table["ss_quantity"].sum().persist()
    result = result.compute()
    result_df = cudf.DataFrame({"sum(ss_quantity)": [result]})
    """
    # print(output_table.to_arrow())

    result = DataFrame(output_table.to_table().sum("ss_quantity"))

    result.rename(["sum(ss_quantity)"])

    print(result)
    return result
Example No. 15
def inventory_before_after(df: DataFrame, date) -> DataFrame:
    # split the on-hand quantity into "before" and "after" columns around date
    df["inv_before"] = df["inv_quantity_on_hand"].copy()
    df.loc[df["d_date"] >= date, "inv_before"] = 0
    df["inv_after"] = df["inv_quantity_on_hand"].copy()
    df.loc[df["d_date"] < date, "inv_after"] = 0
    return df
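A sketch of the split with a small pandas frame (pandas is used here purely for illustration, since it supports the same column assignment and .loc indexing the function relies on; the data is made up):

import pandas as pd

inv = pd.DataFrame({"d_date": [20010110, 20010120, 20010130],
                    "inv_quantity_on_hand": [5, 7, 9]})
out = inventory_before_after(inv, 20010120)
# rows on/after the date contribute 0 to inv_before; rows before it, 0 to inv_after
print(out[["inv_before", "inv_after"]])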