示例#1
0
def _distinct(__data, *args, _keep_all=False, **kwargs):
    """Append a SELECT DISTINCT over the requested columns to ``__data``.

    Args:
        __data: table object; its ``last_op`` is the select to build on and
            its ``append_op`` receives the new select.
        *args: column references. Each is run through ``simple_varname`` and
            must resolve to a column name (not ``None``).
        _keep_all: must be falsy whenever ``args`` or ``kwargs`` are given.
        **kwargs: named expressions. When present they are first computed
            with ``mutate`` and their names join the distinct columns.

    Returns:
        Whatever ``__data.append_op`` returns for the distinct select.

    Raises:
        NotImplementedError: variables were specified with ``_keep_all`` set.
        KeyError: a positional argument is not a simple column reference.
    """
    specified_vars = args or kwargs
    if specified_vars and _keep_all:
        raise NotImplementedError(
            "Distinct with variables specified in sql requires _keep_all = False"
        )

    # new columns from kwargs have to exist in the select before going distinct
    if kwargs:
        inner_sel = mutate(__data, **kwargs).last_op
    else:
        inner_sel = __data.last_op

    # TODO: this is copied from the df distinct version
    # a dict doubles as an ordered set of column names here
    cols = dict.fromkeys(map(simple_varname, args), True)
    cols.update(kwargs)

    if None in cols:
        raise KeyError("positional arguments must be simple column, "
                       "e.g. _.colname or _['colname']")

    # nothing specified: distinct over every column of the select
    if not cols:
        cols = list(inner_sel.columns.keys())

    if len(inner_sel._order_by_clause):
        # select distinct has to include any columns in the order by clause,
        # so an ordered statement is wrapped (aliased) and selected from
        wrapped = inner_sel.alias()
        picked = [wrapped.columns[name] for name in cols]
        sel = sql.select(picked, from_obj=wrapped).distinct()
    else:
        # no order by, so the existing statement can be narrowed directly
        lifted = lift_inner_cols(inner_sel)
        picked = [lifted[name] for name in cols]
        sel = inner_sel.with_only_columns(picked).distinct()

    return __data.append_op(sel)
示例#2
0
def test_dply_grouped_mutate_of_agg_order():
    """Grouped mutate with a per-group min lines up with the input rows.

    See issue #139: the groups appear out of sorted order in the input
    ('b', 'a', 'b'), and each row is expected to receive its own group's
    minimum of ``x``.
    """
    frame = pd.DataFrame({"g": ["b", "a", "b"], "x": [0, 1, 2]})
    grouped = frame.groupby("g")

    result = mutate(grouped, g_min=lambda d: d.x.min())

    actual = ungroup(result)
    expected = frame.assign(g_min=[0, 1, 0])
    assert_frame_equal(actual, expected)
示例#3
0
def _count(__data, *args, sort = False, wt = None, **kwargs):
    """Count rows per group, yielding the grouping columns plus a column ``n``.

    Two selects are built, similar to the filter verb: an inner one (via
    ``mutate``) that holds any columns derived from ``kwargs``, and an outer
    one that groups the aliased inner select and adds the count.

    Args:
        __data: table object handed to ``mutate``; the mutated table's
            ``last_op``, ``group_by`` and ``append_op`` are used.
        *args: columns to count by. Each must resolve to a plain column name
            through ``simple_varname``.
        sort: NOTE(review): accepted but never consulted; the result is
            always ordered by ``n`` descending. Confirm whether that is
            intended before honouring this flag.
        wt: weights; not implemented, must be ``None``.
        **kwargs: named expressions, computed by ``mutate`` and then used as
            grouping columns.

    Returns:
        Whatever ``append_op`` returns for the outer select, with the table's
        ``order_by`` reset to an empty tuple.

    Raises:
        NotImplementedError: ``wt`` is given, or a positional argument is not
            a simple column reference.
    """
    # TODO: if already col named n, use name nn, etc.. get logic from tidy.py
    if wt is not None:
        raise NotImplementedError("TODO")

    res_name = "n"

    # inner select ----
    # holds any mutation style columns
    arg_names = []
    for arg in args:
        name = simple_varname(arg)
        if name is None:
            raise NotImplementedError(
                    "Count positional arguments must be single column name. "
                    "Use a named argument to count using complex expressions."
                    )
        arg_names.append(name)

    tbl_inner = mutate(__data, **kwargs)
    sel_inner = tbl_inner.last_op

    # outer select ----
    # holds selected columns and tally (n)
    sel_inner_cte = sel_inner.alias()
    inner_cols = sel_inner_cte.columns
    sel_outer = sql.select(from_obj = sel_inner_cte)

    # Group vars from an earlier group_by verb come first, then the ones named
    # in this call. dict.fromkeys keeps first-seen order while dropping
    # repeats, so a column named in both places is grouped/selected only once.
    group_names = list(dict.fromkeys([*tbl_inner.group_by, *arg_names, *kwargs]))
    outer_group_cols = [inner_cols[k] for k in group_names]

    # append_column takes a single column, so add them one at a time
    # (unpacking several columns into one call fails with 2+ group vars)
    for col in outer_group_cols:
        sel_outer.append_group_by(col)
        sel_outer.append_column(col)

    count_col = sql.functions.count().label(res_name)
    sel_outer.append_column(count_col)

    # count is like summarize, so removes order_by
    return tbl_inner.append_op(
            sel_outer.order_by(count_col.desc()),
            order_by = tuple()
            )
示例#4
0
def fast_mutate(__data, **kwargs):
    """Warning: this function is experimental

    Assign new columns on grouped data, using ``grouped_eval`` when possible.

    The keyword expressions are first passed to ``_transform_args``. If that
    returns ``None`` the call is delegated to the regular ``mutate``.
    Otherwise each transformed expression is evaluated against ``__data``
    with ``grouped_eval`` and stored on a copy of ``__data.obj``, which is
    then re-grouped by the original groupings.

    Args:
        __data: grouped object exposing ``obj`` and ``grouper.groupings``.
        **kwargs: new column name -> expression.

    Returns:
        ``mutate(__data, **kwargs)`` on the fallback path, otherwise the
        modified copy grouped by ``__data.grouper.groupings``.
    """
    # transform call trees, potentially bail out to slow method --------
    transformed = _transform_args(kwargs.values())
    if transformed is None:
        return mutate(__data, **kwargs)

    # perform fast method ----
    result = __data.obj.copy()
    groupings = __data.grouper.groupings

    # every expression is evaluated against the original grouped data
    for col_name, call in zip(kwargs, transformed):
        result[col_name] = grouped_eval(__data, call)

    return result.groupby(groupings)
示例#5
0
def _group_by(__data, *args, add = False, **kwargs):
    """Return a copy of the table with its grouping columns set.

    Args:
        __data: table object exposing ``last_op``, ``group_by`` and ``copy``.
        *args: column references; each must resolve to a column name through
            ``simple_varname``.
        add: when truthy, combine the new groups with the existing
            ``group_by`` via ``ordered_union`` instead of replacing them.
        **kwargs: named expressions; computed with ``mutate`` first, then
            their names are used as grouping columns.

    Returns:
        ``data.copy(group_by=...)``, where ``data`` is the mutated table when
        ``kwargs`` were given and ``__data`` otherwise.

    Raises:
        NotImplementedError: a positional argument is not a simple column.
        KeyError: a grouping name is not a column of the current select.
    """
    data = mutate(__data, **kwargs) if kwargs else __data
    available = data.last_op.columns

    # positional names first, kwarg names last, mirroring the call order
    positional = tuple(simple_varname(arg) for arg in args)
    groups = positional + tuple(kwargs)
    if None in groups:
        raise NotImplementedError("Complex expressions not supported in sql group_by")

    unmatched = set(groups).difference(available.keys())
    if unmatched:
        raise KeyError("group_by specifies columns missing from table: %s" %unmatched)

    new_groups = ordered_union(data.group_by, groups) if add else groups
    return data.copy(group_by = new_groups)
示例#6
0
def test_dply_mutate_sym(df1):
    """``mutate`` with a ``_``-based expression matches ``DataFrame.assign``.

    The same expression object (``_.stars * 1000``) is handed to both calls
    and the two resulting frames must be equal.
    """
    times_1k = _.stars * 1000

    assert_frame_equal(
        mutate(df1, stars_1k=times_1k),
        df1.assign(stars_1k=times_1k),
    )