def _distinct(__data, *args, _keep_all=False, **kwargs):
    """Append a SELECT DISTINCT step to a lazy SQL table.

    Positional arguments are reduced to column names with ``simple_varname``;
    keyword arguments are first added through ``mutate`` and then included by
    their keyword name. When no columns are given at all, every column of the
    current select is used.

    Raises:
        NotImplementedError: columns were specified together with a truthy
            ``_keep_all``.
        KeyError: ``simple_varname`` produced ``None`` for a positional
            argument (i.e. it was not a plain column reference).
    """
    requested = args or kwargs
    if requested and _keep_all:
        raise NotImplementedError(
            "Distinct with variables specified in sql requires _keep_all = False"
        )

    # keyword expressions have to exist as columns before we can select them
    if kwargs:
        base_sel = mutate(__data, **kwargs).last_op
    else:
        base_sel = __data.last_op

    # TODO: this is copied from the df distinct version
    # a dict is used here as an insertion-ordered set of column names
    names = {}
    for arg in args:
        names[simple_varname(arg)] = True
    names.update(kwargs)

    if None in names:
        raise KeyError(
            "positional arguments must be simple column, "
            "e.g. _.colname or _['colname']"
        )

    # nothing requested: distinct over all columns
    if not names:
        names = list(base_sel.columns.keys())

    if len(base_sel._order_by_clause):
        # an ORDER BY is present, so wrap the statement and select from the
        # wrapped result instead of modifying it in place
        wrapped = base_sel.alias()
        picked = [wrapped.columns[key] for key in names]
        result = sql.select(picked, from_obj=wrapped).distinct()
    else:
        # select distinct has to include any columns in the order by clause,
        # so the existing statement is only modified when there is no order by
        available = lift_inner_cols(base_sel)
        picked = [available[key] for key in names]
        result = base_sel.with_only_columns(picked).distinct()

    return __data.append_op(result)
def test_arranges_back_to_back(backend):
    """Two chained arrange calls keep both sort keys, in call order."""
    source = data_frame(x=range(1, 5), g=[1, 1, 2, 2])
    tbl = backend.load_df(source)

    arranged_once = tbl >> arrange(_.x)
    arranged_twice = arranged_once >> arrange(_.g)

    # the lazy table's own bookkeeping of sort expressions
    tracked_names = tuple(map(simple_varname, arranged_twice.order_by))
    assert tracked_names == ("x", "g")

    # the ORDER BY clause attached to the generated select statement
    clause_names = [clause.name for clause in arranged_twice.last_op._order_by_clause]
    assert clause_names == ["x", "g"]
def _count(__data, *args, sort=False, wt=None, **kwargs):
    """Count rows per group for a lazy SQL table, producing a column ``n``.

    Positional arguments must be plain column references; keyword arguments
    are computed via ``mutate`` and grouped on by their keyword name. Grouping
    columns already set on the table (``group_by``) come first in the output,
    followed by the columns named in this call, followed by the tally.

    Parameters:
        sort: accepted but not consulted here. The result is always ordered
            by ``n`` descending.
            NOTE(review): confirm whether ``sort=False`` should skip ordering.
        wt: weighting is not implemented; any non-None value raises.

    Raises:
        NotImplementedError: ``wt`` was given, or a positional argument is not
            a simple column name.
    """
    # TODO: if already col named n, use name nn, etc.. get logic from tidy.py
    if wt is not None:
        raise NotImplementedError("TODO")

    res_name = "n"

    # similar to filter verb, we need two select statements,
    # an inner one for derived cols, and outer to group by them

    # inner select ----
    # holds any mutation style columns
    arg_names = []
    for arg in args:
        name = simple_varname(arg)
        if name is None:
            raise NotImplementedError(
                "Count positional arguments must be single column name. "
                "Use a named argument to count using complex expressions."
            )
        arg_names.append(name)

    tbl_inner = mutate(__data, **kwargs)
    sel_inner_cte = tbl_inner.last_op.alias()
    inner_cols = sel_inner_cte.columns

    # outer select ----
    # holds selected columns and tally (n)
    sel_outer = sql.select(from_obj=sel_inner_cte)

    # group vars from an earlier group_by verb call go first, then the ones
    # defined in this count call
    group_names = list(tbl_inner.group_by) + arg_names + list(kwargs)
    for k in group_names:
        col = inner_cols[k]
        sel_outer.append_group_by(col)
        # append_column takes one column per call, so columns are added one
        # at a time (several prior group columns used to be unpacked into a
        # single call, which fails)
        sel_outer.append_column(col)

    count_col = sql.functions.count().label(res_name)
    sel_outer.append_column(count_col)

    # count is like summarize, so removes order_by
    return tbl_inner.append_op(
        sel_outer.order_by(count_col.desc()),
        order_by=tuple(),
    )
def _count(__data, *args, sort=False, wt=None, **kwargs):
    """Count rows per group for a lazy SQL table, producing a column ``n``.

    Positional arguments may be plain column references or expressions. An
    expression is evaluated against the current columns, labelled with its
    compiled text (e.g. "id + 1"), and added to an inner select so the outer
    select can group on it. Grouping columns already set on the table
    (``group_by``) come first in the output.

    Parameters:
        sort: when truthy, order the result by ``n`` descending.
        wt: weighting is not implemented; any non-None value raises.
        **kwargs: named count expressions are not implemented; any raise.

    Raises:
        NotImplementedError: ``kwargs`` or ``wt`` was supplied.
    """
    # TODO: if already col named n, use name nn, etc.. get logic from tidy.py
    if kwargs:
        raise NotImplementedError("TODO")

    if wt is not None:
        raise NotImplementedError("TODO")

    # similar to filter verb, we need two select statements,
    # an inner one for derived cols, and outer to group by them
    sel = __data.last_op.alias()
    sel_inner = sql.select([sel], from_obj=sel)

    # inner select ----
    # holds any mutation style columns
    group_cols = []
    for arg in args:
        col_name = simple_varname(arg)
        if col_name is None:
            # evaluate call
            col_expr = arg(sel.columns) if callable(arg) else arg

            # compile, so we can use the expr as its name (e.g. "id + 1")
            col_name = str(compile_el(__data, col_expr))
            label = col_expr.label(col_name)
            sel_inner.append_column(label)

        group_cols.append(col_name)

    # outer select ----
    # holds selected columns and tally (n)
    sel_inner_cte = sel_inner.alias()
    inner_cols = sel_inner_cte.columns

    sel_outer = sql.select(from_obj=sel_inner_cte)

    # group vars from an earlier group_by verb call go first, then the ones
    # defined in this count call
    for k in list(__data.group_by) + group_cols:
        col = inner_cols[k]
        sel_outer.append_group_by(col)
        # append_column takes one column per call, so columns are added one
        # at a time (several prior group columns used to be unpacked into a
        # single call, which fails)
        sel_outer.append_column(col)

    count_col = sql.functions.count().label("n")
    sel_outer.append_column(count_col)

    # honour the sort flag: largest groups first
    if sort:
        sel_outer = sel_outer.order_by(count_col.desc())

    return __data.append_op(sel_outer)
def _rename(__data, **kwargs):
    """Relabel columns of a lazy SQL table.

    Each keyword maps a new column name to an existing column reference, e.g.
    ``rename(tbl, new_name = _.old_name)``. Columns not mentioned keep their
    current name and position. A keyword whose target name does not match any
    current column has no effect.

    Raises:
        KeyError: a keyword value is not a plain column reference
            (``simple_varname`` returned ``None`` for it).
    """
    sel = __data.last_op
    columns = lift_inner_cols(sel)

    # maps existing column name -> requested new name
    old_to_new = {simple_varname(v): k for k, v in kwargs.items()}

    if None in old_to_new:
        # rename only takes keyword arguments, so the message describes those
        # rather than positional arguments
        raise KeyError(
            "rename arguments must be simple column, "
            "e.g. new_name = _.colname or new_name = _['colname']"
        )

    labs = [
        c.label(old_to_new[k]) if k in old_to_new else c
        for k, c in columns.items()
    ]

    new_sel = sel.with_only_columns(labs)

    return __data.append_op(new_sel)
def _group_by(__data, *args, add = False, **kwargs):
    """Record grouping columns on a lazy SQL table.

    Positional arguments are reduced to column names with ``simple_varname``;
    keyword arguments are first created through ``mutate`` and then grouped on
    by their keyword name. With ``add`` truthy the new groups are combined
    with the table's existing ones via ``ordered_union``; otherwise they
    replace them.

    Raises:
        NotImplementedError: a positional argument is not a simple column
            (``simple_varname`` returned ``None``).
        KeyError: a requested group is not a column of the current select.
    """
    # keyword groups must exist as columns before they can be grouped on
    data = mutate(__data, **kwargs) if kwargs else __data
    available = data.last_op.columns

    # put kwarg grouping vars last, so similar order to function call
    positional = tuple(simple_varname(arg) for arg in args)
    groups = positional + tuple(kwargs)

    if None in groups:
        raise NotImplementedError("Complex expressions not supported in sql group_by")

    unmatched = set(groups).difference(available.keys())
    if unmatched:
        raise KeyError("group_by specifies columns missing from table: %s" % unmatched)

    final_groups = ordered_union(data.group_by, groups) if add else groups

    return data.copy(group_by = final_groups)