Example #1
def test_reregistration(c):
    def f(x):
        return x**2

    # Registering the same function again is fine
    c.register_function(f, "f", [("x", np.float64)], np.float64)
    c.register_function(f, "f", [("x", np.int64)], np.int64)

    def f(x):
        return x**3

    # Registering a different function under the same name is not
    with pytest.raises(ValueError):
        c.register_function(f, "f", [("x", np.float64)], np.float64)

    # ...unless we explicitly replace it
    c.register_function(f, "f", [("x", np.float64)], np.float64, replace=True)

    fagg = dd.Aggregation("f", lambda x: x.sum(), lambda x: x.sum())
    c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)
    c.register_aggregation(fagg, "fagg", [("x", np.int64)], np.int64)

    fagg = dd.Aggregation("f", lambda x: x.mean(), lambda x: x.mean())

    with pytest.raises(ValueError):
        c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)

    c.register_aggregation(fagg,
                           "fagg", [("x", np.float64)],
                           np.float64,
                           replace=True)
Example #2
def q12(lineitem, orders):
    t1 = time.time()
    date1 = datetime.strptime("1994-01-01", '%Y-%m-%d')
    date2 = datetime.strptime("1995-01-01", '%Y-%m-%d')
    sel = (
        (lineitem.L_RECEIPTDATE < date2)
        & (lineitem.L_COMMITDATE < date2)
        & (lineitem.L_SHIPDATE < date2)
        & (lineitem.L_SHIPDATE < lineitem.L_COMMITDATE)
        & (lineitem.L_COMMITDATE < lineitem.L_RECEIPTDATE)
        & (lineitem.L_RECEIPTDATE >= date1)
        & ((lineitem.L_SHIPMODE == "MAIL") | (lineitem.L_SHIPMODE == "SHIP"))
    )
    flineitem = lineitem[sel]
    jn = flineitem.merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY")
    gb = jn.groupby("L_SHIPMODE")["O_ORDERPRIORITY"]

    def g1(x): 
        return x.apply(lambda s: ((s == "1-URGENT") | (s == "2-HIGH")).sum())

    def g2(x): 
        return x.apply(lambda s: ((s != "1-URGENT") & (s != "2-HIGH")).sum())

    g1_agg = pd.Aggregation('g1', g1, lambda s0: s0.sum())
    g2_agg = pd.Aggregation('g2', g2, lambda s0: s0.sum())
    total = gb.agg([g1_agg, g2_agg])
    total = total.compute().reset_index().sort_values("L_SHIPMODE")
    print(total)
    print("Q12 Execution time (s): ", time.time() - t1)
Example #3
def group_data(df):
    """Aggregate the DataFrame and return the grouped DataFrame.

    :param df: DataFrame
    :returns: DataFrame
    """
    # round timestamps down to an hour
    df['ts'] = df['ts'].dt.floor('1H')

    # group on customer, timestamp (rounded) and url
    gb = df.groupby(['customer', 'url', 'ts'])

    counter = dd.Aggregation(
        'counter',
        lambda s: s.apply(counter_chunk),
        lambda s: s.apply(counter_agg),
    )

    count_unique = dd.Aggregation('count_unique',
                                  lambda s: s.apply(nunique_chunk),
                                  lambda s: s.apply(nunique_agg))

    ag = gb.agg({'session_id': [count_unique, 'count'], 'referrer': counter})

    ag = ag.reset_index()

    # get rid of multilevel columns
    ag.columns = [
        'customer', 'url', 'ts', 'visitors', 'page_views', 'referrers'
    ]
    ag = ag.repartition(npartitions=df.npartitions)

    return ag
Example #4
    def get_function(self, agg_type=Library.PANDAS):
        if agg_type == Library.DASK:

            def chunk(s):
                def format_chunk(x):
                    return x[:].fillna(0)

                chunk_sum = s.agg(lambda x: format_chunk(x).sum())
                chunk_len = s.agg(lambda x: len(format_chunk(x)))
                if chunk_sum.dtype == 'bool':
                    chunk_sum = chunk_sum.astype('int64')
                if chunk_len.dtype == 'bool':
                    chunk_len = chunk_len.astype('int64')
                return (chunk_sum, chunk_len)

            def agg(val, length):
                return (val.sum(), length.sum())

            def finalize(total, length):
                return total / length

            return dd.Aggregation(self.name,
                                  chunk=chunk,
                                  agg=agg,
                                  finalize=finalize)

        def percent_true(s):
            return s.fillna(0).mean()

        return percent_true
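Outside of the class above, the same chunk/agg/finalize pattern (per-partition sum and count, combined, then divided) can be exercised on its own. A sketch on synthetic data; the standalone name and toy values are illustrative, not taken from the original project:

import dask.dataframe as dd
import pandas as pd

percent_true = dd.Aggregation(
    "percent_true",
    chunk=lambda s: (s.agg(lambda x: x.fillna(0).sum()),
                     s.agg(lambda x: len(x.fillna(0)))),
    agg=lambda val, length: (val.sum(), length.sum()),
    finalize=lambda total, length: total / length,
)

pdf = pd.DataFrame({"g": [0, 0, 1, 1], "flag": [True, False, True, True]})
ddf = dd.from_pandas(pdf, npartitions=2)
print(ddf.groupby("g")["flag"].agg(percent_true).compute())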
Example #5
def test_function(c):
    c.sql("CREATE SCHEMA other")
    c.sql("USE SCHEMA root")

    def f(x):
        return x**2

    c.register_function(f,
                        "f", [("x", np.float64)],
                        np.float64,
                        schema_name="other")

    with pytest.raises(ParsingException):
        c.sql("SELECT F(a) AS a FROM df")

    c.sql("SELECT other.F(a) AS a FROM df")

    c.sql("USE SCHEMA other")
    c.sql("SELECT F(a) AS a FROM root.df")

    c.sql("USE SCHEMA root")
    fagg = dd.Aggregation("f", lambda x: x.sum(), lambda x: x.sum())
    c.register_aggregation(fagg,
                           "fagg", [("x", np.float64)],
                           np.float64,
                           schema_name="other")

    with pytest.raises(ParsingException):
        c.sql("SELECT FAGG(b) AS test FROM df")

    c.sql("SELECT other.FAGG(b) AS test FROM df")

    c.sql("USE SCHEMA other")
    c.sql("SELECT FAGG(b) AS test FROM root.df")
Example #6
def test_groupby_agg_custom__mode():
    # A mode function that passes its intermediates around as pure Python objects.
    # To protect the results from pandas' unpacking inside apply, they are
    # returned as single-item lists.
    def agg_mode(s):
        def impl(s):
            res, = s.iloc[0]

            for i, in s.iloc[1:]:
                res = res.add(i, fill_value=0)

            return [res]

        return s.apply(impl)

    agg_func = dd.Aggregation(
        'custom_mode',
        lambda s: s.apply(lambda s: [s.value_counts()]),
        agg_mode,
        lambda s: s.map(lambda i: i[0].argmax()),
    )

    d = pd.DataFrame({
        'g0': [0, 0, 0, 1, 1] * 3,
        'g1': [0, 0, 0, 1, 1] * 3,
        'cc': [4, 5, 4, 6, 6] * 3,
    })
    a = dd.from_pandas(d, npartitions=5)

    actual = a['cc'].groupby([a['g0'], a['g1']]).agg(agg_func)

    # cheat to get the correct index
    expected = pd.DataFrame({'g0': [0, 1], 'g1': [0, 1], 'cc': [4, 6]})
    expected = expected['cc'].groupby([expected['g0'], expected['g1']]).agg('sum')

    assert_eq(actual, expected)
Example #7
def dask_agg_largest():
    return dd.Aggregation(name='largest',
                          chunk=lambda grouped: (grouped.max(), grouped.min()),
                          agg=lambda chunk_max, chunk_min:
                          (chunk_max.max(), chunk_min.min()),
                          finalize=lambda M, m: np.sign(M + m) * abs(
                              pd.concat([M, m], axis=1)).max(axis=1))
Example #8
    def get_function(self, agg_type=Library.PANDAS):
        if agg_type == Library.DASK:

            def chunk(s):
                def inner_chunk(x):
                    x = x[:].dropna()
                    return set(x.unique())

                return s.agg(inner_chunk)

            def agg(s):
                def inner_agg(x):
                    x = x[:].dropna()
                    return (set().union(*x.values))

                return s.agg(inner_agg)

            def finalize(s):
                return s.apply(lambda x: len(x))

            return dd.Aggregation(self.name,
                                  chunk=chunk,
                                  agg=agg,
                                  finalize=finalize)

        elif agg_type == Library.KOALAS:
            return 'nunique'

        return pd.Series.nunique
Example #9
    def get_dask_aggregation(self):
        def chunk(s):
            return s.agg(np.all)

        def agg(s):
            return s.agg(np.all)

        return dd.Aggregation(self.name, chunk=chunk, agg=agg)
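This works because "all" is associative: reducing each partition and then reducing the per-partition results gives the same answer as a single global reduction. A usage sketch on made-up data, with the primitive's self.name replaced by a literal string:

import dask.dataframe as dd
import numpy as np
import pandas as pd

agg_all = dd.Aggregation("all",
                         chunk=lambda s: s.agg(np.all),
                         agg=lambda s: s.agg(np.all))

pdf = pd.DataFrame({"g": [0, 0, 1, 1], "ok": [True, True, True, False]})
ddf = dd.from_pandas(pdf, npartitions=2)
print(ddf.groupby("g")["ok"].agg(agg_all).compute())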
Example #10
def test_groupby_agg_custom__name_clash_with_internal_same_column():
    """for a single input column only unique names are allowed"""
    d = pd.DataFrame({'g': [0, 0, 1] * 3, 'b': [1, 2, 3] * 3})
    a = dd.from_pandas(d, npartitions=2)

    agg_func = dd.Aggregation('sum', lambda s: s.sum(), lambda s0: s0.sum())

    with pytest.raises(ValueError):
        a.groupby('g').aggregate({'b': [agg_func, 'sum']})
Example #11
def test_aggregate_function(c):
    fagg = dd.Aggregation("f", lambda x: x.sum(), lambda x: x.sum())
    c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)

    return_df = c.sql("""
        SELECT FAGG(b) AS test, SUM(b) AS "S"
        FROM df
        """)

    assert_eq(return_df["test"], return_df["S"], check_names=False)
Example #12
        def get_dask_aggregation(self):
            def chunk(s):
                return s.sum()

            def agg(s):
                return s.sum()

            def finalize(s):
                return s * self.n

            return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)
Example #13
    def get_dask_aggregation(self):
        def chunk(s):
            chunk_sum = s.agg(np.sum)
            if chunk_sum.dtype == 'bool':
                chunk_sum = chunk_sum.astype('int64')
            return chunk_sum

        def agg(s):
            return s.agg(np.sum)

        return dd.Aggregation(self.name, chunk=chunk, agg=agg)
Example #14
def test_aggregate_function(c):
    fagg = dd.Aggregation("f", lambda x: x.sum(), lambda x: x.sum())
    c.register_aggregation(fagg, "fagg", [("x", np.float64)], np.float64)

    return_df = c.sql("""
        SELECT fagg(b) AS test, sum(b) AS "S"
        FROM df
        """)
    return_df = return_df.compute()

    assert (return_df["test"] == return_df["S"]).all()
Example #15
    def get_function(self, agg_type=Library.PANDAS):
        if agg_type == Library.DASK:

            def chunk(s):
                return s.agg(np.all)

            def agg(s):
                return s.agg(np.all)

            return dd.Aggregation(self.name, chunk=chunk, agg=agg)

        return np.all
Example #16
def execute_group_concat_series_gb(op,
                                   data,
                                   sep,
                                   _,
                                   aggcontext=None,
                                   **kwargs):
    custom_group_concat = dd.Aggregation(
        name='custom_group_concat',
        chunk=lambda s: s.apply(list),
        agg=lambda s0: s0.apply(lambda chunks: sep.join(
            str(s) for s in itertools.chain.from_iterable(chunks))),
    )
    return data.agg(custom_group_concat)
Example #17
    def get_function(self, agg_type=Library.PANDAS):
        if agg_type == Library.DASK:

            def chunk(s):
                chunk_sum = s.agg(np.sum)
                if chunk_sum.dtype == 'bool':
                    chunk_sum = chunk_sum.astype('int64')
                return chunk_sum

            def agg(s):
                return s.agg(np.sum)

            return dd.Aggregation(self.name, chunk=chunk, agg=agg)

        return np.sum
Example #18
def test_groupby_agg_custom__name_clash_with_internal_different_column():
    """custom aggregation functions can share the name of a builtin function"""
    d = pd.DataFrame({'g': [0, 0, 1] * 3, 'b': [1, 2, 3] * 3, 'c': [4, 5, 6] * 3})
    a = dd.from_pandas(d, npartitions=2)

    # NOTE: this function is purposefully misnamed
    agg_func = dd.Aggregation(
        'sum',
        lambda s: (s.count(), s.sum()),
        lambda s0, s1: (s0.sum(), s1.sum()),
        lambda s0, s1: s1 / s0,
    )

    # NOTE: the name of agg-func is suppressed in the output,
    # since only a single agg func per column was specified
    result = a.groupby('g').aggregate({'b': agg_func, 'c': 'sum'})
    expected = d.groupby('g').aggregate({'b': 'mean', 'c': 'sum'})

    assert_eq(result, expected, check_dtype=False)
Example #19
    def get_dask_aggregation(self):
        def chunk(s):
            def inner_chunk(x):
                x = x[:].dropna()
                return set(x.unique())

            return s.agg(inner_chunk)

        def agg(s):
            def inner_agg(x):
                x = x[:].dropna()
                return(set().union(*x.values))

            return s.agg(inner_agg)

        def finalize(s):
            return s.apply(lambda x: len(x))

        return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)
Example #20
    def get_dask_aggregation(self):
        def chunk(s):
            def format_chunk(x):
                return x[:].fillna(0)

            chunk_sum = s.agg(lambda x: format_chunk(x).sum())
            chunk_len = s.agg(lambda x: len(format_chunk(x)))
            if chunk_sum.dtype == 'bool':
                chunk_sum = chunk_sum.astype('int64')
            if chunk_len.dtype == 'bool':
                chunk_len = chunk_len.astype('int64')
            return (chunk_sum, chunk_len)

        def agg(val, length):
            return (val.sum(), length.sum())

        def finalize(total, length):
            return total / length

        return dd.Aggregation(self.name, chunk=chunk, agg=agg, finalize=finalize)
Example #21
def make_datasets(in_csv, out_dir):
    """Processes csv file and saves a curated dataset to disk.

    Parameters
    ----------
    in_csv: str
        path to the csv file on local disk
    out_dir: str
        directory where the output files should be saved.

    Returns
    -------
    None
    """
    log = logging.getLogger('make-dataset')
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Connect to the dask cluster
    log.info(
        f'Starting make_datasets with in_csv: {in_csv} and out_dir: {out_dir}')
    log.info('Connecting to cluster')
    c = Client('dask-scheduler:8786')

    # Load the data as a dask DataFrame; if you have trouble with dask,
    # fall back to pandas or numpy
    log.info('Reading csv file')
    ddf = dd.read_csv(in_csv, blocksize=1e6)

    log.info('output dataframe head')
    log.info(ddf.head())
    log.info('Trace 1')
    # we set the index so we can properly execute loc below
    ddf = ddf.set_index('Unnamed: 0')

    # trigger computation
    n_samples = len(ddf)

    # Fill NaN values with new 'Unknown' category
    ddf['country'] = ddf['country'].fillna('Unknown')
    ddf['province'] = ddf['province'].fillna('Unknown')
    ddf['taster_name'] = ddf['taster_name'].fillna('Unknown')
    log.info('Trace 2')
    # Fill region_1 missing values using the 'province' column.
    # Most common value for each province will be used. Rest are labeled Unknown
    mode = dd.Aggregation('mode', chunk, agg, finalize)
    most_common_region = ddf.groupby(['province']).agg({
        'region_1': mode
    }).compute()
    ddf['region_1'] = ddf.apply(
        lambda x: most_common_region.loc[x.province, 'region_1']
        if x.province in most_common_region['region_1'].index else 'Unknown',
        axis=1).where(ddf['region_1'].isna(), ddf['region_1'])
    log.info('Trace 3')
    # We fill price values with the province's average price. If that is
    # not available, we use the global average price
    mean_prices = ddf.groupby(['province'])['price'].mean().compute()
    global_mean = ddf['price'].mean().compute()
    mean_prices = mean_prices.fillna(global_mean)
    ddf['price'] = ddf.apply(lambda x: mean_prices[x['province']],
                             axis=1,
                             meta=('x', 'f8')).where(ddf['price'].isna(),
                                                     ddf['price'])
    # Drop these columns as explained in the notebook
    ddf = ddf.drop([
        'description', 'designation', 'region_2', 'taster_twitter_handle',
        'title'
    ],
                   axis=1)

    # Encode categorical values using one-hot encoding.
    # This results in >6k columns. Maybe we'll need to change the encoding type
    # for some features such as 'winery' with so many unique values.
    # Also, I think this should be done in the model task.
    ddf = ddf.categorize()
    # encoder = DummyEncoder()
    # ddf = encoder.fit_transform(ddf)

    # # Normalize price values
    # scaler = StandardScaler()
    # ddf['price'] = scaler.fit_transform(ddf[['price']]).price
    log.info('dataset processed')

    # split dataset into train test feel free to adjust test percentage
    idx = np.arange(n_samples)
    test_idx = idx[:n_samples // 10]
    test = ddf.loc[test_idx]

    train_idx = idx[n_samples // 10:]
    train = ddf.loc[train_idx]

    # This also shuffles the data. Not sure if csv was shuffled before..
    # train, test = ddf.random_split([0.9, 0.1], shuffle=True)

    _save_datasets(train, test, out_dir)
Example #22
)
from ibis.backends.pandas.execution.arrays import (
    execute_array_index,
    execute_array_length,
)

DASK_DISPATCH_TYPES: TypeRegistrationDict = {
    ops.ArrayLength: [((dd.Series, ), execute_array_length)],
    ops.ArrayIndex: [((dd.Series, int), execute_array_index)],
}

register_types_to_dispatcher(execute_node, DASK_DISPATCH_TYPES)

collect_list = dd.Aggregation(
    name="collect_list",
    chunk=lambda s: s.apply(list),
    agg=lambda s0: s0.apply(lambda chunks: list(
        itertools.chain.from_iterable(chunks))),
)
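# Usage sketch for collect_list (illustrative only; inside the backend it is
# normally invoked through the aggregation context below rather than called
# directly):
#
#   ddf.groupby("key")["value"].agg(collect_list)
#
# gathers each group's values into a single Python list.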


@execute_node.register(ops.ArrayColumn, list)
def execute_array_column(op, cols, **kwargs):
    df = dd.concat(cols, axis=1)
    return df.apply(lambda row: np.array(row, dtype=object),
                    axis=1,
                    meta=(None, 'object'))


# TODO - aggregations - #2553
@execute_node.register(ops.ArrayCollect, dd.Series)
def execute_array_collect(op, data, aggcontext=None, **kwargs):
Example #23
def dask_agg_absmax():
    return dd.Aggregation(name='absmax',
                          chunk=lambda grouped: abs(grouped.max()),
                          agg=lambda chunk_max: abs(chunk_max.max()))
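Usage is the same as for any other custom aggregation. Note that, unlike dask_agg_largest in Example #7 (which carries both the per-chunk max and min through to finalize), this variant only reduces per-chunk maxima. A sketch on toy data:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"g": [0, 0, 1, 1], "x": [-5.0, 2.0, 3.0, -1.0]})
ddf = dd.from_pandas(pdf, npartitions=2)
print(ddf.groupby("g")["x"].agg(dask_agg_absmax()).compute())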
Example #24
    # Compute on dask DataFrame without divisions (requires shuffling)
    result = ddf_no_divs.groupby(group_args).apply(apply_func)
    assert_eq(expected, result, check_divisions=False)

    # Check that divisions were preserved (all None in this case)
    assert ddf_no_divs.divisions == result.divisions

    # Crude check to see if shuffling was performed.
    # The groupby operation should add more than 1 task per partition
    assert len(result.dask) > (len(ddf_no_divs.dask) + ddf_no_divs.npartitions)


custom_mean = dd.Aggregation(
    'mean',
    lambda s: (s.count(), s.sum()),
    lambda s0, s1: (s0.sum(), s1.sum()),
    lambda s0, s1: s1 / s0,
)

custom_sum = dd.Aggregation('sum', lambda s: s.sum(), lambda s0: s0.sum())


@pytest.mark.parametrize('pandas_spec, dask_spec, check_dtype', [
    ({'b': 'mean'}, {'b': custom_mean}, False),
    ({'b': 'sum'}, {'b': custom_sum}, True),
    (['mean', 'sum'], [custom_mean, custom_sum], False),
    ({'b': ['mean', 'sum']}, {'b': [custom_mean, custom_sum]}, False),
])
def test_dataframe_groupby_agg_custom_sum(pandas_spec, dask_spec, check_dtype):
    df = pd.DataFrame({'g': [0, 0, 1] * 3, 'b': [1, 2, 3] * 3})
    ddf = dd.from_pandas(df, npartitions=2)
Example #25
def process_social(file):
    # Create the output file; one for every day
    cbgs_out = output_folder + file.split('/')[-1]

    if os.path.exists(cbgs_out):
        return None

    print('loading: ' + file)
    # Load in Dask DF
    #     dtype={'distance_traveled_from_home': 'float64', }
    social_df = dd.read_csv(file, error_bad_lines=False, dtype=start_dtype)
    #     social_df = dd.from_pandas(pd.read_csv(file, nrows=10), npartitions=1)
    #     social_df = social_df.fillna(method='ffill')

    # Create date and origin_fips cols
    social_df['date_start'] = social_df['date_range_start'].map(
        lambda x: x[:10])
    social_df['date_end'] = social_df['date_range_end'].map(lambda x: x[:10])

    social_df['origin_fips'] = social_df['origin_census_block_group'].apply(
        lambda x: cbgs_to_county_fips(x),
        meta=('origin_census_block_group', str))

    # Groupby and Sum
    agg_dict = {
        x: ['min', 'max', 'sum', 'prod', 'mean', 'std']
        for x in agg_cols
    }
    bucket_agg = dd.Aggregation(
        'join',
        lambda x: x.agg(''.join),
        lambda x0: x0.agg(''.join),
    )
    bucket_agg2 = dd.Aggregation(
        'new',
        lambda x: x.agg(lambda x: dict_flatten2(x, num_fips)),
        lambda x0: x0.agg(''.join),
    )

    bucket_dict = {
        x: bucket_agg
        for x in bucket_cols + home_cols + destination_cols
    }

    for col in bucket_cols:
        social_df[col] = social_df[col].astype(str)

    social_df = social_df[groupby_cols + bucket_cols + agg_cols + home_cols + destination_cols] \
            .groupby(groupby_cols) \
            .agg(dict(agg_dict, **bucket_dict)) # most efficient way to add two dicts
    #     print(social_df.compute())

    # Kill the dreaded MultiIndex
    social_df.columns = [
        '_'.join(col).strip() for col in social_df.columns.values
    ]
    social_df = social_df.reset_index()

    # Redo MetaData
    for col in bucket_cols:
        #         print(col)
        social_df[col + '_join'] = social_df[col + '_join'].astype(str)

    for bucket in bucket_cols:
        cols = bucket_col_dict[bucket]
        raw_cols = raw_bucket_col_dict[bucket]
        col = bucket + '_join'
        social_df[cols] = social_df.map_partitions(
            lambda x: x[col].apply(lambda z: dict_flatten3(z, raw_cols, cols)))
        social_df = social_df.drop(col, axis=1)

    social_df = social_df.compute()

    social_df[destination_fips_cols] = social_df[
        'destination_cbgs_join'].apply(
            lambda z: cbgs_dict_flatten2(z, num_fips))
    print(social_df)
    print('uploading 10000000 lines of data')
    social_df.to_csv(cbgs_out  #,single_file = True
                     #           chunksize=chunksize
                     )
Example #26
class LogicalAggregatePlugin(BaseRelPlugin):
    """
    A LogicalAggregate is used in GROUP BY clauses, but also
    when aggregating a function over the full dataset.

    In the first case we need to find out which columns we need to
    group over; in the second case we "cheat" and add a column of ones
    to the dataframe, which allows us to reuse every aggregation
    function we already know of.

    The rest is just a lot of column-name bookkeeping.
    Fortunately, calcite will already make sure that each
    aggregation function is only ever called with a single input
    column (by splitting the inner calculation into a preceding step).
    """

    class_name = "org.apache.calcite.rel.logical.LogicalAggregate"

    AGGREGATION_MAPPING = {
        "$sum0":
        "sum",
        "any_value":
        dd.Aggregation(
            "any_value",
            lambda s: s.sample(n=1).values,
            lambda s0: s0.sample(n=1).values,
        ),
        "avg":
        "mean",
        "bit_and":
        ReduceAggregation("bit_and", operator.and_),
        "bit_or":
        ReduceAggregation("bit_or", operator.or_),
        "bit_xor":
        ReduceAggregation("bit_xor", operator.xor),
        "count":
        "count",
        "every":
        dd.Aggregation("every", lambda s: s.all(), lambda s0: s0.all()),
        "max":
        "max",
        "min":
        "min",
        "single_value":
        "first",
    }

    def convert(self, rel: "org.apache.calcite.rel.RelNode",
                context: "dask_sql.Context") -> DataContainer:
        (dc, ) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # We make our life easier by having unique column names
        cc = cc.make_unique()

        # I have no idea what that is, but so far it was always of length 1
        assert len(
            rel.getGroupSets()) == 1, "Do not know how to handle this case!"

        # Extract the information, which columns we need to group for
        group_column_indices = [int(i) for i in rel.getGroupSet()]
        group_columns = [
            cc.get_backend_by_frontend_index(i) for i in group_column_indices
        ]

        # Always keep an additional column around for empty groups and aggregates
        additional_column_name = str(uuid.uuid4())

        # NOTE: it might be the case that
        # we do not need this additional
        # column, but hopefully adding a single
        # column of 1 is not so problematic...
        df = df.assign(**{additional_column_name: 1})
        cc = cc.add(additional_column_name)
        dc = DataContainer(df, cc)

        # Collect all aggregates
        filtered_aggregations, output_column_order = self._collect_aggregations(
            rel, dc, group_columns, additional_column_name, context)

        if not group_columns:
            # There was actually no GROUP BY specified in the SQL
            # Still, this plan can also be used if we need to aggregate something over the full
            # data sample
            # To reuse the code, we just create a new column at the end with a single value
            # It is important to do this after creating the aggregations,
            # as we do not want this additional column to be used anywhere
            group_columns = [additional_column_name]

            logger.debug("Performing full-table aggregation")

        # Now we can perform the aggregates
        # We iterate through all pairs of (possibly pre-filtered)
        # dataframes and the aggregations to perform on this data...
        df_agg = None
        for filtered_df_desc, aggregation in filtered_aggregations.items():
            filtered_column = filtered_df_desc.filtered_column
            if filtered_column:
                logger.debug(
                    f"Aggregating {dict(aggregation)} on the data filtered by {filtered_column}"
                )
            else:
                logger.debug(f"Aggregating {dict(aggregation)} on the data")

            # ... we perform the aggregations ...
            filtered_df = filtered_df_desc.df
            # TODO: we could use the type information for
            # pre-calculating the meta information
            filtered_df_agg = filtered_df.groupby(
                by=group_columns).agg(aggregation)

            # ... fix the column names to a single level ...
            filtered_df_agg.columns = filtered_df_agg.columns.get_level_values(
                -1)

            # ... and finally concat the new data with the already present columns
            if df_agg is None:
                df_agg = filtered_df_agg
            else:
                df_agg = df_agg.assign(**{
                    col: filtered_df_agg[col]
                    for col in filtered_df_agg.columns
                })

        # SQL does not care about the index, but we do not want to have any multiindices
        df_agg = df_agg.reset_index(drop=True)

        # Fix the column names and the order of them, as this was messed with during the aggregations
        df_agg.columns = df_agg.columns.get_level_values(-1)
        cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df_agg, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc

    def _collect_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        dc: DataContainer,
        group_columns: List[str],
        additional_column_name: str,
        context: "dask_sql.Context",
    ) -> Tuple[Dict[GroupDatasetDescription, AggregationDescription],
               List[int], ]:
        """
        Create a mapping of dataframe -> aggregations (in the form input column, output column, aggregation)
        and the expected order of output columns.
        """
        aggregations = defaultdict(lambda: defaultdict(dict))
        output_column_order = []
        df = dc.df
        cc = dc.column_container

        # SQL needs to copy the old content also. As the values of the group columns
        # are the same for a single group anyways, we just use the first row
        for col in group_columns:
            aggregations[GroupDatasetDescription(df)][col][col] = "first"
            output_column_order.append(col)

        # Now collect all aggregations
        for agg_call in rel.getNamedAggCalls():
            output_col = str(agg_call.getValue())
            expr = agg_call.getKey()

            if expr.hasFilter():
                filter_column = cc.get_backend_by_frontend_index(
                    expr.filterArg)
                filter_expression = df[filter_column]
                filtered_df = df[filter_expression]

                grouped_df = GroupDatasetDescription(filtered_df,
                                                     filter_column)
            else:
                grouped_df = GroupDatasetDescription(df)

            if expr.isDistinct():
                raise NotImplementedError(
                    "DISTINCT is not implemented (yet)")  # pragma: no cover

            aggregation_name = str(expr.getAggregation().getName())
            aggregation_name = aggregation_name.lower()
            try:
                aggregation_function = self.AGGREGATION_MAPPING[
                    aggregation_name]
            except KeyError:
                try:
                    aggregation_function = context.functions[
                        aggregation_name].f
                except KeyError:  # pragma: no cover
                    raise NotImplementedError(
                        f"Aggregation function {aggregation_name} not implemented (yet)."
                    )

            inputs = expr.getArgList()
            if len(inputs) == 1:
                input_col = cc.get_backend_by_frontend_index(inputs[0])
            elif len(inputs) == 0:
                input_col = additional_column_name
            else:
                raise NotImplementedError(
                    "Can not cope with more than one input"
                )  # pragma: no cover

            aggregations[grouped_df][input_col][
                output_col] = aggregation_function
            output_column_order.append(output_col)

        return aggregations, output_column_order
Example #27
    def get_annotations(self,
                        index: str,
                        columns: list,
                        agg: str = "concat",
                        filter_values: pd.Series = None):
        """Returns the Database's DataFrame such that it's indexed by :param
        index:, which then applies a groupby operation and aggregates all other
        columns by concatenating all unique values.

        Args:
            index (str): The column name of the DataFrame to join by.
            columns (list): a list of column names.
            agg (str): Function to aggregate with when there is more than one value
                for each index instance. E.g. ['first', 'last', 'sum', 'mean',
                'size', 'concat'], default 'concat'.
            filter_values (pd.Series): The values on the `index` column to
                filter before performing the groupby-agg operations.

        Returns:
            DataFrame: A dataframe to be used for annotation
        """
        if not set(columns).issubset(set(self.data.columns)):
            raise Exception(
                "The columns argument must be a list such that it's subset of the following columns in the dataframe",
                "These columns doesn't exist in database:",
                set(columns) - set(self.data.columns.tolist()))

        # Select the df columns plus the index column. However, the `columns` list itself shouldn't contain the index column
        if index in columns:
            columns.pop(columns.index(index))

        df = self.data[columns + [index]]

        if filter_values is not None:
            df = df[df[index].isin(list(filter_values))]

        # if index != self.data.index.name and index in self.data.columns:
        #     df = df.set_index(index)

        # Groupby index
        groupby = df.groupby(index)

        #  Aggregate by all columns by concatenating unique values
        if agg == "concat":
            if isinstance(df, pd.DataFrame):
                aggregated = groupby.agg(
                    {col: concat_uniques
                     for col in columns})

            elif isinstance(df, dd.DataFrame):
                collect_concat = dd.Aggregation(
                    name='collect_concat',
                    chunk=lambda s1: s1.apply(list),
                    agg=lambda s2: s2.apply(lambda chunks: filter(
                        lambda x: False if x == "None" or x is None else True,
                        set(itertools.chain.from_iterable(chunks)))),
                    finalize=lambda s3: s3.apply(lambda xx: '|'.join(xx)))
                aggregated = groupby.agg(
                    {col: collect_concat
                     for col in columns})

            else:
                raise Exception("Unsupported dataframe: {}".format(df))

        # Any other aggregation functions
        else:
            aggregated = groupby.agg({col: agg for col in columns})

        # if aggregated.index.duplicated().sum() > 0:
        #     raise ValueError("DataFrame must not have duplicates in index")
        return aggregated
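A toy illustration of the collect_concat pattern above, outside the class; the data and the dropped "None" filtering are simplified for brevity:

import itertools

import dask.dataframe as dd
import pandas as pd

collect_concat = dd.Aggregation(
    name='collect_concat',
    chunk=lambda s1: s1.apply(list),
    agg=lambda s2: s2.apply(
        lambda chunks: set(itertools.chain.from_iterable(chunks))),
    finalize=lambda s3: s3.apply(lambda xs: '|'.join(str(x) for x in xs)))

toy = dd.from_pandas(
    pd.DataFrame({'gene': ['a', 'a', 'b'], 'go_term': ['x', 'y', 'x']}),
    npartitions=2)
print(toy.groupby('gene')['go_term'].agg(collect_concat).compute())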
Example #28
cfuns = {
    "name" : 'category',
    "progtype" : 'category'
}

y = dd.read_csv(okfile, sep = ';', dtype = cfuns, parse_dates = ['lastdate'])

y.head()

y.name.unique().compute()
y.user.unique().compute()

uni_len = dd.Aggregation(
    name = 'uni_len',
    chunk = lambda x : x.unique(),
    agg = lambda xa : len(xa)
)

a = y.groupby('name').agg({'lang':sum,'times':sum, 'user': uni_len}).compute()

y['hour'] = y.lastdate.dt.hour

y.hour = y['hour'].cat.as_known()
y.name = y['name'].cat.as_known()

a = y.pivot_table(index='name', columns='hour', values='lang', aggfunc='sum').compute()

a.sort_values(by=['0'])

a.to_excel("E:/pivot_hour.xlsx")
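The uni_len aggregation above reduces the per-partition uniques with a bare len. A distinct-count pattern along the lines of Examples #19 and #30, collecting sets per partition and merging them across partitions, is sketched below on made-up data:

import dask.dataframe as dd
import pandas as pd

count_unique = dd.Aggregation(
    name='count_unique',
    chunk=lambda s: s.apply(lambda x: set(x)),                   # uniques per partition
    agg=lambda s0: s0.apply(lambda parts: set().union(*parts)),  # merge the sets
    finalize=lambda s1: s1.apply(len),                           # count them
)

toy = pd.DataFrame({'name': ['a', 'a', 'b', 'b'], 'user': [1, 2, 1, 1]})
b = dd.from_pandas(toy, npartitions=2)
print(b.groupby('name')['user'].agg(count_unique).compute())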
Example #29
class LogicalAggregatePlugin(BaseRelPlugin):
    """
    A LogicalAggregate is used in GROUP BY clauses, but also
    when aggregating a function over the full dataset.

    In the first case we need to find out which columns we need to
    group over; in the second case we "cheat" and add a column of ones
    to the dataframe, which allows us to reuse every aggregation
    function we already know of.
    As NULLs are not groupable in dask, we handle them specially
    by adding a temporary column which is True for all NULL values
    and False otherwise (and also group by it).

    The rest is just a lot of column-name bookkeeping.
    Fortunately, calcite will already make sure that each
    aggregation function is only ever called with a single input
    column (by splitting the inner calculation into a preceding step).

    Open TODO: So far we are following the dask default
    of having only a single partition after the group by (which is usually
    a reasonable assumption). It would be nice to control
    these things via HINTs.
    """

    class_name = "org.apache.calcite.rel.logical.LogicalAggregate"

    AGGREGATION_MAPPING = {
        "$sum0": AggregationSpecification("sum", AggregationOnPandas("sum")),
        "any_value": AggregationSpecification(
            dd.Aggregation(
                "any_value",
                lambda s: s.sample(n=1).values,
                lambda s0: s0.sample(n=1).values,
            )
        ),
        "avg": AggregationSpecification("mean", AggregationOnPandas("mean")),
        "bit_and": AggregationSpecification(
            ReduceAggregation("bit_and", operator.and_)
        ),
        "bit_or": AggregationSpecification(ReduceAggregation("bit_or", operator.or_)),
        "bit_xor": AggregationSpecification(ReduceAggregation("bit_xor", operator.xor)),
        "count": AggregationSpecification("count"),
        "every": AggregationSpecification(
            dd.Aggregation("every", lambda s: s.all(), lambda s0: s0.all())
        ),
        "max": AggregationSpecification("max", AggregationOnPandas("max")),
        "min": AggregationSpecification("min", AggregationOnPandas("min")),
        "single_value": AggregationSpecification("first"),
    }

    def convert(
        self, rel: "org.apache.calcite.rel.RelNode", context: "dask_sql.Context"
    ) -> DataContainer:
        (dc,) = self.assert_inputs(rel, 1, context)

        df = dc.df
        cc = dc.column_container

        # We make our life easier by having unique column names
        cc = cc.make_unique()

        # I have no idea what that is, but so far it was always of length 1
        assert len(rel.getGroupSets()) == 1, "Do not know how to handle this case!"

        # Extract the information, which columns we need to group for
        group_column_indices = [int(i) for i in rel.getGroupSet()]
        group_columns = [
            cc.get_backend_by_frontend_index(i) for i in group_column_indices
        ]

        dc = DataContainer(df, cc)

        if not group_columns:
            # There was actually no GROUP BY specified in the SQL
            # Still, this plan can also be used if we need to aggregate something over the full
            # data sample
            # To reuse the code, we just create a new column at the end with a single value
            logger.debug("Performing full-table aggregation")

        # Do all aggregates
        df_result, output_column_order = self._do_aggregations(
            rel, dc, group_columns, context,
        )

        # SQL does not care about the index, but we do not want to have any multiindices
        df_agg = df_result.reset_index(drop=True)

        # Fix the column names and the order of them, as this was messed with during the aggregations
        df_agg.columns = df_agg.columns.get_level_values(-1)
        cc = ColumnContainer(df_agg.columns).limit_to(output_column_order)

        cc = self.fix_column_to_row_type(cc, rel.getRowType())
        dc = DataContainer(df_agg, cc)
        dc = self.fix_dtype_to_row_type(dc, rel.getRowType())
        return dc

    def _do_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        dc: DataContainer,
        group_columns: List[str],
        context: "dask_sql.Context",
    ) -> Tuple[dd.DataFrame, List[str]]:
        """
        Main functionality: return the result dataframe
        and the output column order
        """
        df = dc.df
        cc = dc.column_container

        # We might need it later.
        # If not, let's hope that adding a single column
        # is not a huge problem...
        additional_column_name = new_temporary_column(df)
        df = df.assign(**{additional_column_name: 1})

        # Add an entry for every grouped column, as SQL wants them first
        output_column_order = group_columns.copy()

        # Collect all aggregations we need to do
        collected_aggregations, output_column_order = self._collect_aggregations(
            rel, df, cc, context, additional_column_name, output_column_order
        )

        if not collected_aggregations:
            return df[group_columns].drop_duplicates(), output_column_order

        # SQL needs to have a column with the grouped values as the first
        # output column.
        # As the values of the group columns
        # are the same for a single group anyways, we just use the first row
        for col in group_columns:
            collected_aggregations[None].append((col, col, "first"))

        # Now we can go ahead and use these grouped aggregations
        # to perform the actual aggregation
        # It is very important to start with the non-filtered entry.
        # Otherwise we might lose some entries in the grouped columns
        df_result = None
        key = None
        if key in collected_aggregations:
            aggregations = collected_aggregations.pop(key)
            df_result = self._perform_aggregation(
                df, None, aggregations, additional_column_name, group_columns,
            )

        # Now we can also do the rest
        for filter_column, aggregations in collected_aggregations.items():
            agg_result = self._perform_aggregation(
                df, filter_column, aggregations, additional_column_name, group_columns,
            )

            # ... and finally concat the new data with the already present columns
            if df_result is None:
                df_result = agg_result
            else:
                df_result = df_result.assign(
                    **{col: agg_result[col] for col in agg_result.columns}
                )

        return df_result, output_column_order

    def _collect_aggregations(
        self,
        rel: "org.apache.calcite.rel.RelNode",
        df: dd.DataFrame,
        cc: ColumnContainer,
        context: "dask_sql.Context",
        additional_column_name: str,
        output_column_order: List[str],
    ) -> Tuple[Dict[Tuple[str, str], List[Tuple[str, str, Any]]], List[str]]:
        """
        Collect together all aggregations which have the same filter column,
        so that the aggregations only need to be done once.

        Returns the aggregations as mapping filter_column -> List of Aggregations
        where the aggregations are in the form (input_col, output_col, aggregation function (or string))
        """
        collected_aggregations = defaultdict(list)

        for agg_call in rel.getNamedAggCalls():
            expr = agg_call.getKey()

            # Find out about the input column
            inputs = expr.getArgList()
            if len(inputs) == 1:
                input_col = cc.get_backend_by_frontend_index(inputs[0])
            elif len(inputs) == 0:
                input_col = additional_column_name
            else:
                raise NotImplementedError("Can not cope with more than one input")

            # Extract flags (filtering/distinct)
            if expr.isDistinct():  # pragma: no cover
                raise ValueError("Apache Calcite should optimize them away!")

            filter_column = None
            if expr.hasFilter():
                filter_column = cc.get_backend_by_frontend_index(expr.filterArg)

            # Find out which aggregation function to use
            aggregation_name = str(expr.getAggregation().getName())
            aggregation_name = aggregation_name.lower()
            try:
                aggregation_function = self.AGGREGATION_MAPPING[aggregation_name]
            except KeyError:
                try:
                    aggregation_function = context.functions[aggregation_name]
                except KeyError:  # pragma: no cover
                    raise NotImplementedError(
                        f"Aggregation function {aggregation_name} not implemented (yet)."
                    )
            if isinstance(aggregation_function, AggregationSpecification):
                dtype = df[input_col].dtype
                if pd.api.types.is_numeric_dtype(dtype):
                    aggregation_function = aggregation_function.numerical_aggregation
                else:
                    aggregation_function = (
                        aggregation_function.non_numerical_aggregation
                    )

            # Finally, extract the output column name
            output_col = str(agg_call.getValue())

            # Store the aggregation
            key = filter_column
            value = (input_col, output_col, aggregation_function)
            collected_aggregations[key].append(value)
            output_column_order.append(output_col)

        return collected_aggregations, output_column_order

    def _perform_aggregation(
        self,
        df: dd.DataFrame,
        filter_column: str,
        aggregations: List[Tuple[str, str, Any]],
        additional_column_name: str,
        group_columns: List[str],
    ):
        tmp_df = df

        if filter_column:
            filter_expression = tmp_df[filter_column]
            tmp_df = tmp_df[filter_expression]

            logger.debug(f"Filtered by {filter_column} before aggregation.")

        group_columns = [tmp_df[group_column] for group_column in group_columns]
        group_columns_and_nulls = get_groupby_with_nulls_cols(
            tmp_df, group_columns, additional_column_name
        )
        grouped_df = tmp_df.groupby(by=group_columns_and_nulls)

        # Convert into the correct format for dask
        aggregations_dict = defaultdict(dict)
        for aggregation in aggregations:
            input_col, output_col, aggregation_f = aggregation

            aggregations_dict[input_col][output_col] = aggregation_f

        # Now apply the aggregation
        logger.debug(f"Performing aggregation {dict(aggregations_dict)}")
        agg_result = grouped_df.agg(aggregations_dict)

        # ... fix the column names to a single level ...
        agg_result.columns = agg_result.columns.get_level_values(-1)

        return agg_result
Example #30
df_schema = pd.read_json("/srv/retail_schema.json", lines=True)


def json_engine(*args, **kwargs):
    df = pd.read_json(*args, **kwargs)
    for c in set(df_schema.columns) - set(df.columns):
        df[c] = pd.Series(dtype=df_schema[c].dtype)
    df = df.drop(columns=set(df.columns) - set(df_schema.columns))
    return df.loc[:, df_schema.columns]


nunique = dd.Aggregation(
    name="nunique",
    chunk=lambda s: s.apply(lambda x: list(set(x))),
    agg=lambda s0: s0._selected_obj.groupby(level=list(
        range(s0._selected_obj.index.nlevels))).sum(),
    finalize=lambda s1: s1.apply(lambda final: len(set(final))),
)
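# Usage sketch for the nunique aggregation above, on made-up data (the real
# retail columns come from retail_schema.json and are not shown here):
#
#   toy = dd.from_pandas(
#       pd.DataFrame({"store": ["a", "a", "b"], "sku": [1, 2, 1]}),
#       npartitions=2)
#   toy.groupby("store")["sku"].agg(nunique).compute()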

df = dd.read_json(
    "s3://retail-bucket/topics/retail/**.json",
    lines=True,
    engine=json_engine,
    storage_options={
        "key": "access_me",
        "secret": "i_am_a_secret",
        "client_kwargs": {
            "endpoint_url": "http://minio:9000"
        },
    },