def _visit_filter_TopK(self, expr):
    """Fold a TopK filter into ``self.table_set`` as a semi join.

    Top K is rewritten as:

    - an aggregation of ``op.by`` grouped by ``op.arg``
    - a sort by the aggregated metric
    - a limit of ``op.k`` rows
    - a semi join of the current table set against that ranked set

    Parameters
    ----------
    expr : expression whose ``op()`` exposes ``arg``, ``by`` and ``k``

    Returns
    -------
    None
        The result is stored by replacing ``self.table_set``.
    """
    topk = expr.op()
    ranking_name = '__tmp__'
    ranking_metrics = [topk.by.name(ranking_name)]

    arg_source = L.find_base_table(topk.arg)
    by_source = L.find_base_table(topk.by)

    # Aggregate directly on the argument's base table when the ranking
    # metric comes from that same table; otherwise aggregate over the
    # current table set.
    if arg_source.equals(by_source):
        aggregation_root = arg_source
    else:
        aggregation_root = self.table_set
    aggregated = aggregation_root.aggregate(ranking_metrics, by=[topk.arg])

    # NOTE(review): the False flag presumably requests descending order so
    # the largest metric values come first -- confirm against sort_by.
    ordered = aggregated.sort_by([(ranking_name, False)])
    rank_set = ordered.limit(topk.k)

    # Keep only rows whose key appears among the top-k keys.
    key_column = getattr(rank_set, topk.arg.get_name())
    membership = (topk.arg == key_column)
    self.table_set = self.table_set.semi_join(rank_set, [membership])
    return None
def value_counts(arg, metric_name="count"):
    """
    Compute a frequency table for this value expression

    Parameters
    ----------
    arg : value expression
      Expression whose values are grouped and counted
    metric_name : string, default "count"
      Name given to the count metric in the result

    Returns
    -------
    counts : TableExpr
      Aggregated table
    """
    source_table = _L.find_base_table(arg)
    frequency = source_table.count().name(metric_name)

    # Group by a named expression: when the argument cannot report a name
    # (get_name raises ExpressionError), label it "unnamed".
    try:
        arg.get_name()
    except _com.ExpressionError:
        arg = arg.name("unnamed")

    grouped = source_table.group_by(arg)
    return grouped.aggregate(frequency)
def value_counts(arg, metric_name='count'):
    """
    Compute a frequency table for this value expression

    Parameters
    ----------
    arg : value expression
      Expression to group by; its occurrences are counted
    metric_name : string, default 'count'
      Name assigned to the resulting count column

    Returns
    -------
    counts : TableExpr
      Aggregated table
    """
    parent = _L.find_base_table(arg)
    row_count = parent.count().name(metric_name)

    # An argument without a name of its own (get_name raises
    # ExpressionError) is given the placeholder name 'unnamed' before it is
    # used as the grouping key.
    try:
        arg.get_name()
    except _com.ExpressionError:
        arg = arg.name('unnamed')

    by_value = parent.group_by(arg)
    return by_value.aggregate(row_count)
def _reduction_to_aggregation(expr, agg_name='tmp'):
    """Wrap a reduction expression in an aggregation of its base table.

    Parameters
    ----------
    expr : expression
      Expression to use as the single aggregation metric
    agg_name : string, default 'tmp'
      Name given to the metric in the aggregated table

    Returns
    -------
    Result of ``aggregate`` on the base table found for ``expr``
    """
    source_table = L.find_base_table(expr)
    named_metric = expr.name(agg_name)
    return source_table.aggregate([named_metric])
def _adapt_expr(expr):
    """Adapt an expression to an executable form plus a result handler.

    Non-table expressions need to be adapted to some well-formed table
    expression, along with a way to adapt the results to the desired
    arity (whether array-like or scalar, for example).

    Canonical case is scalar values or arrays produced by some reductions
    (simple reductions, or distinct, say).

    Parameters
    ----------
    expr : ir.TableExpr, ir.ScalarExpr, ir.ExprList or ir.ArrayExpr

    Returns
    -------
    (adapted_expr, result_handler) : tuple
      ``result_handler`` is a one-argument callable applied to the results:
      identity for tables and expression lists, ``results['tmp'][0]`` for
      scalars, and ``results[column_name]`` for arrays.

    Raises
    ------
    NotImplementedError
      For scalar expressions that are not reductions but do reference a
      table, for expression lists mixing reductions with non-reductions,
      and for any other expression type.
    """
    def as_is(x):
        return x

    def _is_scalar_reduction(x):
        return isinstance(x, ir.ScalarExpr) and ops.is_reduction(x)

    if isinstance(expr, ir.TableExpr):
        return expr, as_is

    if isinstance(expr, ir.ScalarExpr):
        def scalar_handler(results):
            return results['tmp'][0]

        if _is_scalar_reduction(expr):
            aggregated = _reduction_to_aggregation(expr, agg_name='tmp')
            return aggregated, scalar_handler

        # Only scalars with no table refs can be handled without a
        # reduction.
        if L.find_base_table(expr) is None:
            return expr.name('tmp'), scalar_handler
        raise NotImplementedError(expr._repr())

    if isinstance(expr, ir.ExprList):
        exprs = expr.exprs()
        # One flag per member, evaluated in order.
        reduction_flags = [_is_scalar_reduction(x) for x in exprs]

        if all(reduction_flags):
            # Every member is a reduction: run them as one aggregation on
            # the base table of the first member.
            base = L.find_base_table(exprs[0])
            return base.aggregate(exprs), as_is
        if not any(reduction_flags):
            return expr, as_is
        # A mix of reductions and non-reductions is not supported.
        raise NotImplementedError(expr._repr())

    if isinstance(expr, ir.ArrayExpr):
        op = expr.op()

        def _column_getter(name):
            def column_handler(results):
                return results[name]
            return column_handler

        if isinstance(op, ops.TableColumn):
            return op.table, _column_getter(op.name)

        # Something more complicated.
        source = L.find_source_table(expr)

        if not isinstance(op, ops.DistinctArray):
            projected = source.projection([expr.name('tmp')])
            return projected, _column_getter('tmp')

        # Distinct: project the underlying argument under its own name
        # (or 'tmp' when it has none) and apply distinct to the table.
        distinct_arg = op.arg
        try:
            column = op.arg.get_name()
        except Exception:
            column = 'tmp'
        projected = source.projection([distinct_arg.name(column)])
        return projected.distinct(), _column_getter(column)

    raise NotImplementedError
def _notall_expand(expr):
    """Expand a not-all expression into ``sum(arg) < count(base table)``.

    The first argument of ``expr.op()`` is summed and compared against the
    row count of its base table.
    NOTE(review): presumably the argument is boolean, so the sum counts the
    true rows -- confirm against the callers.
    """
    operand = expr.op().args[0]
    source_table = L.find_base_table(operand)
    operand_total = operand.sum()
    total_rows = source_table.count()
    return operand_total < total_rows
def _adapt_expr(expr):
    """Pair an expression with an executable form and a result adapter.

    Non-table expressions need to be adapted to some well-formed table
    expression, along with a way to adapt the results to the desired
    arity (whether array-like or scalar, for example).

    Canonical case is scalar values or arrays produced by some reductions
    (simple reductions, or distinct, say).

    Parameters
    ----------
    expr : ir.TableExpr, ir.ScalarExpr, ir.ExprList or ir.ArrayExpr

    Returns
    -------
    (adapted_expr, result_handler) : tuple
      ``result_handler`` takes the results and returns them unchanged
      (tables, expression lists), ``results['tmp'][0]`` (scalars) or
      ``results[column_name]`` (arrays).

    Raises
    ------
    NotImplementedError
      For non-reduction scalars that reference a table, expression lists
      mixing reductions with non-reductions, and unsupported expression
      types.
    """
    def as_is(x):
        return x

    def _reduces_to_scalar(x):
        return isinstance(x, ir.ScalarExpr) and ops.is_reduction(x)

    def _adapt_scalar(scalar):
        # Scalars are read back from row 0 of the 'tmp' column.
        def scalar_handler(results):
            return results['tmp'][0]

        if _reduces_to_scalar(scalar):
            aggregated = _reduction_to_aggregation(scalar, agg_name='tmp')
            return aggregated, scalar_handler
        if L.find_base_table(scalar) is not None:
            raise NotImplementedError(scalar._repr())
        # expr with no table refs
        return scalar.name('tmp'), scalar_handler

    def _adapt_list(expr_list):
        members = expr_list.exprs()
        # One flag per member, evaluated in order.
        flags = [_reduces_to_scalar(member) for member in members]
        if all(flags):
            # All reductions: aggregate them together on the base table of
            # the first member.
            base = L.find_base_table(members[0])
            return base.aggregate(members), as_is
        if any(flags):
            # Reductions mixed with non-reductions are not supported.
            raise NotImplementedError(expr_list._repr())
        return expr_list, as_is

    def _adapt_array(array):
        op = array.op()

        def _pick_column(name):
            def column_handler(results):
                return results[name]
            return column_handler

        if isinstance(op, ops.TableColumn):
            return op.table, _pick_column(op.name)

        # Something more complicated.
        source = L.find_source_table(array)
        if isinstance(op, ops.DistinctArray):
            # Project the argument under its own name when it has one,
            # falling back to 'tmp', then apply distinct to the table.
            inner = op.arg
            try:
                label = op.arg.get_name()
            except Exception:
                label = 'tmp'
            distinct_table = source.projection([inner.name(label)]).distinct()
            return distinct_table, _pick_column(label)

        projected = source.projection([array.name('tmp')])
        return projected, _pick_column('tmp')

    if isinstance(expr, ir.TableExpr):
        return expr, as_is
    if isinstance(expr, ir.ScalarExpr):
        return _adapt_scalar(expr)
    if isinstance(expr, ir.ExprList):
        return _adapt_list(expr)
    if isinstance(expr, ir.ArrayExpr):
        return _adapt_array(expr)
    raise NotImplementedError