def fully_originate_from(exprs, parents):
    """Return whether every table dependency of `exprs` appears in `parents`."""

    def collect_tables(expr):
        node = expr.op()
        if isinstance(expr, ir.TableExpr):
            # yield the table's op and keep walking
            return lin.proceed, node
        # blocking ops cut off the traversal; everything else is transparent
        action = lin.halt if node.blocks() else lin.proceed
        return action, None

    # unique table ops each side depends on
    required = set(lin.traverse(collect_tables, exprs))
    available = set(lin.traverse(collect_tables, parents))
    return required <= available
def fully_originate_from(exprs, parents):
    """Check that the table dependencies of `exprs` are a subset of those of `parents`."""

    def finder(expr):
        node = expr.op()
        if isinstance(expr, ir.TableExpr):
            return lin.proceed, node
        if node.blocks():
            return lin.halt, None
        return lin.proceed, None

    # unique table dependencies of exprs and parents
    expr_tables = set(lin.traverse(finder, exprs))
    parent_tables = set(lin.traverse(finder, parents))
    return expr_tables.issubset(parent_tables)
def find_subqueries(expr: ir.Expr) -> Counter:
    """Count occurrences of table operations in `expr`.

    Returns a ``Counter`` mapping each Selection/Aggregation/other table
    node encountered to the number of times it occurs in the expression
    tree. Duplicates are deliberately kept during traversal so repeated
    use of the same table op is visible in the counts.
    """

    def predicate(
        counts: Counter, expr: ir.Expr
    ) -> tuple[Sequence[ir.Table] | bool, None]:
        # First tuple element is the traversal control: a proceed/halt
        # flag, a bool, or an explicit list of child expressions to visit.
        op = expr.op()

        if isinstance(op, ops.Join):
            # recurse into both join inputs, skipping the join node itself
            return [op.left, op.right], None
        elif isinstance(op, ops.PhysicalTable):
            # physical tables terminate this branch
            return lin.halt, None
        elif isinstance(op, ops.SelfReference):
            return lin.proceed, None
        elif isinstance(op, (ops.Selection, ops.Aggregation)):
            counts[op] += 1
            # descend only into the underlying table
            return [op.table], None
        elif isinstance(op, ops.TableNode):
            counts[op] += 1
            return lin.proceed, None
        elif isinstance(op, ops.TableColumn):
            # descend into the column's table only if it hasn't been counted yet
            return op.table.op() not in counts, None
        else:
            return lin.proceed, None

    counts = Counter()
    iterator = lin.traverse(
        functools.partial(predicate, counts),
        expr,
        # keep duplicates so we can determine where an expression is used
        # more than once
        dedup=False,
    )

    # consume the iterator
    collections.deque(iterator, maxlen=0)
    return counts
def has_reduction(expr):
    """Return whether `expr` contains at least one reduction.

    Parameters
    ----------
    expr : ibis.expr.types.Expr
        An ibis expression

    Returns
    -------
    bool
        Whether or not there's at least one reduction in `expr`

    Notes
    -----
    Traversal halts at table nodes, so only the non-table expressions that
    precede the first table expression are examined.
    """

    def fn(expr):
        node = expr.op()
        # don't go below any table nodes
        if isinstance(node, ops.TableNode):
            return lin.halt, None
        # found a reduction: stop this branch and report it
        if isinstance(node, ops.Reduction):
            return lin.halt, True
        return lin.proceed, None

    return any(lin.traverse(fn, expr))
def has_reduction(expr):
    """Does `expr` contain a reduction?

    Parameters
    ----------
    expr : ibis.expr.types.Expr
        An ibis expression

    Returns
    -------
    bool
        Whether or not there's at least one reduction in `expr`

    Notes
    -----
    The table-node check below means only non-table expressions above the
    first table expression are examined.
    """

    def detect(e):
        node = e.op()
        if isinstance(node, ops.Reduction):
            return lin.halt, True
        elif isinstance(node, ops.TableNode):
            # stop at table boundaries without yielding anything
            return lin.halt, None
        else:
            return lin.proceed, None

    return any(lin.traverse(detect, expr))
def get_result(self):
    """Validate every table-column reference within the predicate."""

    def check(expr):
        # only table columns need validation; everything else yields None
        result = (
            self._validate_projection(expr)
            if isinstance(expr.op(), ops.TableColumn)
            else None
        )
        return lin.proceed, result

    return all(lin.traverse(check, self.pred, type=ir.Value))
def _walk(self, expr):
    """Traverse `expr`, validating each table column encountered."""

    def check(e):
        if isinstance(e.op(), ops.TableColumn):
            return lin.proceed, self._validate_column(e)
        return lin.proceed, None

    return lin.traverse(check, expr, type=ir.ValueExpr)
def is_analytic(expr):
    """Return whether `expr` contains a reduction or analytic operation."""

    def finder(e):
        node = e.op()
        if isinstance(node, (ops.Reduction, ops.Analytic)):
            return lin.halt, True
        return lin.proceed, None

    return any(lin.traverse(finder, expr))
def _walk(self, expr):
    """Walk `expr`, running column validation on every table column seen."""

    def check(e):
        # validation result is yielded for columns, None otherwise
        value = (
            self._validate_column(e)
            if isinstance(e.op(), ops.TableColumn)
            else None
        )
        return lin.proceed, value

    return lin.traverse(check, expr, type=ir.ValueExpr)
def generate_setup_queries(self):
    """Build one UDF definition query per distinct UDF node in the expression."""
    definitions = (
        BigQueryUDFDefinition(node, context=self.context)
        for node in lin.traverse(find_bigquery_udf, self.expr)
    )
    # UDFs are uniquely identified by the name of the Node subclass we
    # generate.
    return list(
        toolz.unique(definitions, key=lambda d: type(d.expr.op()).__name__)
    )
def __init__(self, client, ddl, query_parameters=None):
    """Initialize the query, resolving BigQuery scalar parameters by name."""
    super(BigQueryQuery, self).__init__(client, ddl)

    # map each scalar-parameter op found in the parent expression to its name
    names = dict(lin.traverse(_find_scalar_parameter, ddl.parent_expr))

    params = []
    for param, value in (query_parameters or {}).items():
        named_expr = param.to_expr().name(names[param])
        params.append(bigquery_param(named_expr, value))
    self.query_parameters = params
def __init__(self, client, ddl, query_parameters=None):
    """Initialize the query and build BigQuery parameter bindings."""
    super().__init__(client, ddl)

    # self.expr comes from the parent class; find every scalar parameter in it
    parameter_names = dict(lin.traverse(_find_scalar_parameter, self.expr))

    bindings = []
    for param, value in (query_parameters or {}).items():
        expr = param.to_expr().name(parameter_names[param])
        bindings.append(bigquery_param(expr, value))
    self.query_parameters = bindings
def flatten_predicate(expr):
    """Yield the conjuncts of a predicate, splitting on `And` nodes.

    Parameters
    ----------
    expr : ir.BooleanColumn

    Returns
    -------
    List[ir.BooleanColumn]

    Examples
    --------
    >>> import ibis
    >>> t = ibis.table([('a', 'int64'), ('b', 'string')], name='t')
    >>> filt = (t.a == 1) & (t.b == 'foo')
    >>> predicates = flatten_predicate(filt)
    >>> len(predicates)
    2
    """

    def fn(e):
        # descend through conjunctions; every non-And leaf is a conjunct
        if isinstance(e.op(), ops.And):
            return lin.proceed, None
        return lin.halt, e

    return list(lin.traverse(fn, expr, type=ir.BooleanColumn))
def flatten_predicate(expr):
    """Split a boolean predicate into the list of its top-level conjuncts.

    `And` nodes are traversed; anything else stops traversal and is
    collected as one conjunct.

    Parameters
    ----------
    expr : ir.BooleanColumn

    Returns
    -------
    List[ir.BooleanColumn]

    Examples
    --------
    >>> import ibis
    >>> t = ibis.table([('a', 'int64'), ('b', 'string')], name='t')
    >>> filt = (t.a == 1) & (t.b == 'foo')
    >>> len(flatten_predicate(filt))
    2
    """

    def splitter(node_expr):
        is_conjunction = isinstance(node_expr.op(), ops.And)
        if is_conjunction:
            return lin.proceed, None
        else:
            return lin.halt, node_expr

    return list(lin.traverse(splitter, expr, type=ir.BooleanColumn))
def _generate_setup_queries(expr, context):
    """Generate DDL for temporary resources."""
    udf_nodes = lin.traverse(find_bigquery_udf, expr)
    definitions = (
        BigQueryUDFDefinition(node, context=context) for node in udf_nodes
    )
    # UDFs are uniquely identified by the name of the Node subclass we
    # generate.
    return list(
        toolz.unique(definitions, key=lambda d: type(d.expr.op()).__name__)
    )
def find_predicates(expr, flatten=True):
    """Collect boolean-column predicates from `expr`.

    When `flatten` is true, `And` nodes are traversed so each conjunct is
    returned separately.
    """

    def finder(e):
        if not isinstance(e, ir.BooleanColumn):
            return lin.proceed, None
        # keep walking through conjunctions when flattening is requested
        if flatten and isinstance(e.op(), ops.And):
            return lin.proceed, None
        return lin.halt, e

    return list(lin.traverse(finder, expr))
def generate_setup_queries(self):
    """Create setup queries defining each distinct BigQuery UDF used."""
    udfs = lin.traverse(find_bigquery_udf, self.expr)
    queries = (BigQueryUDFDefinition(u, context=self.context) for u in udfs)
    # UDFs are uniquely identified by the name of the Node subclass we
    # generate.
    unique_queries = toolz.unique(
        queries, key=lambda q: type(q.expr.op()).__name__
    )
    return list(unique_queries)
def generate_setup_queries(
    self,
):  # TODO validate if I need to override this function
    """Create setup queries defining each distinct Teradata UDF used."""
    definitions = (
        TeradataUDFDefinition(node, context=self.context)
        for node in lin.traverse(find_teradata_udf, self.expr)
    )
    # UDFs are uniquely identified by the name of the Node subclass we
    # generate.
    return list(
        toolz.unique(definitions, key=lambda d: type(d.expr.op()).__name__)
    )
def find_first_base_table(expr):
    """Return the first table expression found in `expr`, or None.

    Traversal halts at the first op that is a ``TableNode``; if no table
    node is present the function returns ``None``.

    Parameters
    ----------
    expr : ir.Expr

    Returns
    -------
    ir.Expr or None
    """

    def predicate(expr):
        op = expr.op()
        if isinstance(op, ops.TableNode):
            return lin.halt, expr
        else:
            return lin.proceed, None

    # next() with a default replaces the manual try/except StopIteration
    return next(lin.traverse(predicate, expr), None)
def find_immediate_parent_tables(expr):
    """Yield each first occurrence of a :class:`ibis.expr.types.TableExpr`
    in `expr`.

    Parameters
    ----------
    expr : ir.Expr

    Yields
    ------
    e : ir.Expr

    Notes
    -----
    This function does not traverse into TableExpr objects, so e.g. the
    underlying PhysicalTable of a Selection will not be yielded.

    Examples
    --------
    >>> import ibis, toolz
    >>> t = ibis.table([('a', 'int64')], name='t')
    >>> expr = t.mutate(foo=t.a + 1)
    >>> len(list(find_immediate_parent_tables(expr)))
    1
    """

    def finder(e):
        hit = isinstance(e, ir.TableExpr)
        return (lin.halt, e) if hit else (lin.proceed, None)

    return lin.traverse(finder, expr)
def find_immediate_parent_tables(expr):
    """Find every first occurrence of a :class:`ibis.expr.types.TableExpr`
    object in `expr`.

    Parameters
    ----------
    expr : ir.Expr

    Yields
    ------
    e : ir.Expr

    Notes
    -----
    Traversal stops at each TableExpr, so tables nested below another
    table (e.g. a Selection's PhysicalTable) are not yielded.

    Examples
    --------
    >>> import ibis, toolz
    >>> t = ibis.table([('a', 'int64')], name='t')
    >>> expr = t.mutate(foo=t.a + 1)
    >>> result = list(find_immediate_parent_tables(expr))
    >>> len(result)
    1
    """

    def finder(e):
        found = isinstance(e, ir.TableExpr)
        return (lin.halt if found else lin.proceed), (e if found else None)

    return lin.traverse(finder, expr)
def find_source_table(expr):  # pragma: no cover
    """Find the first table expression observed for each argument that the
    expression depends on.

    Parameters
    ----------
    expr : ir.Expr

    Returns
    -------
    table_expr : ir.Table

    Raises
    ------
    NotImplementedError
        If more than one distinct base table is found.

    Examples
    --------
    >>> import ibis
    >>> t = ibis.table([('a', 'double'), ('b', 'string')], name='t')
    >>> expr = t.mutate(c=t.a + 42.0)
    >>> find_source_table(expr) is not None
    True
    >>> left = ibis.table([('a', 'int64'), ('b', 'string')])
    >>> right = ibis.table([('c', 'int64'), ('d', 'string')])
    >>> result = left.inner_join(right, left.a == right.c)
    >>> find_source_table(result)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    NotImplementedError: More than one base table not implemented
    """

    def finder(e):
        if isinstance(e, ir.Table):
            return lin.halt, e
        return lin.proceed, None

    candidates = lin.traverse(finder, expr.op().flat_args())
    # deduplicate by the underlying op so distinct wrappers of one table
    # count once
    tables = list(toolz.unique(candidates, key=operator.methodcaller('op')))
    if len(tables) > 1:
        raise NotImplementedError('More than one base table not implemented')
    return tables[0]
def is_row_order_preserving(exprs) -> bool:
    """Detects if the operation preserves row ordering.

    Certain operations we know will not affect the ordering of rows in the
    dataframe (for example elementwise operations on ungrouped dataframes).
    In these cases we may be able to avoid expensive joins and assign
    directly into the parent dataframe.

    NOTE(review): despite the ``-> bool`` annotation, this actually returns
    the lazy iterator produced by ``lin.traverse`` (which yields booleans),
    not a single bool — presumably callers reduce it (e.g. with ``all``);
    confirm at call sites and consider fixing the annotation.
    """

    def _is_row_order_preserving(expr: ir.Expr):
        # reductions and window ops can collapse or reorder rows, so halt
        # this branch yielding False; everything else yields True
        if isinstance(expr.op(), (ops.Reduction, ops.WindowOp)):
            return (lin.halt, False)
        else:
            return (lin.proceed, True)

    return lin.traverse(_is_row_order_preserving, exprs)
def is_ancestor(self, other):
    """Return whether this node is `other` or an ancestor of it."""
    import ibis.expr.lineage as lin

    if isinstance(other, ir.Expr):
        other = other.op()

    if self.equals(other):
        return True

    def to_op(e):
        # always keep walking, yielding every op along the way
        return lin.proceed, e.op()

    expr = self.to_expr()
    return any(child.equals(other) for child in lin.traverse(to_op, expr))
def execute(self):
    """Register every referenced Spark UDF, then delegate execution."""
    found = lin.traverse(find_spark_udf, self.expr)
    # UDFs are uniquely identified by the name of the Node subclass we
    # generate.
    deduped = toolz.unique(found, key=lambda node: type(node).__name__)

    # register UDFs in pyspark
    for node in deduped:
        self.client._session.udf.register(
            type(node).__name__,
            node.udf_func,
        )

    return super().execute()
def execute(self, expr, params=None, limit='default', **kwargs):
    """Register referenced Spark UDFs, run the query, then drop the UDFs."""
    found = lin.traverse(find_spark_udf, expr)
    # UDFs are uniquely identified by the name of the Node subclass we
    # generate.
    registered = list(toolz.unique(found, key=lambda n: type(n).__name__))

    # register UDFs in pyspark
    for node in registered:
        self._session.udf.register(type(node).__name__, node.udf_func)

    result = super().execute(expr, params, limit, **kwargs)

    # clean up: drop every function registered above
    for node in registered:
        stmt = ddl.DropFunction(type(node).__name__, must_exist=True)
        self.raw_sql(stmt.compile())

    return result
def _is_ancestor(parent, child):  # pragma: no cover
    """Check whether an operation is an ancestor node of another.

    Parameters
    ----------
    parent : ops.Node
    child : ops.Node

    Returns
    -------
    bool
    """

    def matches(expr):
        return (lin.halt, True) if expr.op() == child else (lin.proceed, None)

    return any(lin.traverse(matches, parent.to_expr()))
def find_immediate_parent_tables(expr):
    """Find every first occurrence of a :class:`ibis.expr.types.Table`
    object in `expr`.

    Parameters
    ----------
    expr : ir.Expr

    Yields
    ------
    e : ir.Expr

    Notes
    -----
    This function does not traverse into Table objects, so e.g. the
    underlying PhysicalTable of a Selection will not be yielded.

    Examples
    --------
    >>> import ibis, toolz
    >>> t = ibis.table([('a', 'int64')], name='t')
    >>> expr = t.mutate(foo=t.a + 1)
    >>> result = list(find_immediate_parent_tables(expr))
    >>> len(result)
    1
    """

    def finder(e):
        if isinstance(e, ir.Table):
            return lin.halt, e
        return lin.proceed, None

    return toolz.unique(lin.traverse(finder, expr))
def find_data(expr):
    """Find data sources bound to `expr`.

    Parameters
    ----------
    expr : ibis.expr.types.Expr

    Returns
    -------
    data : collections.OrderedDict
    """

    def finder(e):
        node = e.op()
        if hasattr(node, 'source'):
            # ops with a `source` attribute: look up their backing data by name
            entry = (node, node.source.dictionary.get(node.name, None))
        elif isinstance(node, ir.Literal):
            entry = (node, node.value)
        else:
            entry = None
        return lin.proceed, entry

    return collections.OrderedDict(lin.traverse(finder, expr))
def execute(self):
    """Register referenced Spark UDFs, run the query, then drop the UDFs."""
    found = lin.traverse(find_spark_udf, self.expr)
    # UDFs are uniquely identified by the name of the Node subclass we
    # generate.
    registered = list(toolz.unique(found, key=lambda n: type(n).__name__))

    # register UDFs in pyspark
    for node in registered:
        self.client._session.udf.register(
            type(node).__name__,
            node.udf_func,
        )

    result = super().execute()

    # drop every function registered above
    for node in registered:
        drop_stmt = ddl.DropFunction(type(node).__name__, must_exist=True)
        self.client._execute(drop_stmt.compile())

    return result
def is_reduction(expr):
    """Check whether an expression contains a reduction.

    Aggregations yield typed scalar expressions, since the result of an
    aggregation is a single value. When creating a table expression
    containing a GROUP BY equivalent, we need to be able to easily check
    that we are looking at the result of an aggregation, e.g.::

        foo.sum().log10() + bar.sum().log10()

    We examine the operator DAG to determine if any aggregations are
    present. A bound aggregation referencing a separate table is a "false
    aggregation" in a GROUP BY-type expression and should be treated as a
    literal, computed as a separate query and stored in a temporary
    variable (or joined, for bound aggregations with keys).

    Parameters
    ----------
    expr : ir.Expr

    Returns
    -------
    bool
    """

    def predicate(e):
        node = e.op()
        if isinstance(node, ops.Reduction):
            return lin.halt, True
        if isinstance(node, ops.TableNode):
            # don't go below any table nodes
            return lin.halt, None
        return lin.proceed, None

    return any(lin.traverse(predicate, expr))
def flatten_predicate(expr):
    """Yield the expressions corresponding to the `And` nodes of a predicate.

    Parameters
    ----------
    expr : ir.BooleanColumn

    Returns
    -------
    exprs : List[ir.BooleanColumn]

    Examples
    --------
    >>> import ibis
    >>> t = ibis.table([('a', 'int64'), ('b', 'string')], name='t')
    >>> filt = (t.a == 1) & (t.b == 'foo')
    >>> predicates = flatten_predicate(filt)
    >>> len(predicates)
    2
    """

    def split(e):
        # keep descending through conjunctions; collect everything else
        conjunction = isinstance(e.op(), ops.And)
        return (lin.proceed, None) if conjunction else (lin.halt, e)

    return list(lin.traverse(split, expr, type=ir.BooleanColumn))
def find_source_table(expr):
    """Find the first table expression observed for each argument that the
    expression depends on.

    Parameters
    ----------
    expr : ir.Expr

    Returns
    -------
    table_expr : ir.TableExpr

    Raises
    ------
    NotImplementedError
        If more than one distinct base table is found.

    Examples
    --------
    >>> import ibis
    >>> t = ibis.table([('a', 'double'), ('b', 'string')], name='t')
    >>> expr = t.mutate(c=t.a + 42.0)
    >>> find_source_table(expr) is not None
    True
    >>> left = ibis.table([('a', 'int64'), ('b', 'string')])
    >>> right = ibis.table([('c', 'int64'), ('d', 'string')])
    >>> result = left.inner_join(right, left.a == right.c)
    >>> find_source_table(result)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    NotImplementedError: More than one base table not implemented
    """

    def finder(expr):
        if isinstance(expr, ir.TableExpr):
            return lin.halt, expr
        else:
            return lin.proceed, None

    first_tables = lin.traverse(finder, expr.op().flat_args())
    # Deduplicate by the underlying op rather than `key=id`: id() of a
    # transient expression wrapper can be reused after garbage collection,
    # and two distinct wrappers of the same table must count as one base
    # table (matches the sibling implementations keyed on `.op()`).
    options = list(toolz.unique(first_tables, key=lambda t: t.op()))
    if len(options) > 1:
        raise NotImplementedError('More than one base table not implemented')
    return options[0]
def find_source_table(expr):
    """Find the first table expression observed for each argument that the
    expression depends on.

    Parameters
    ----------
    expr : ir.Expr

    Returns
    -------
    table_expr : ir.TableExpr

    Raises
    ------
    NotImplementedError
        If more than one distinct base table is found.

    Examples
    --------
    >>> import ibis
    >>> t = ibis.table([('a', 'double'), ('b', 'string')], name='t')
    >>> expr = t.mutate(c=t.a + 42.0)
    >>> find_source_table(expr) is not None
    True
    >>> left = ibis.table([('a', 'int64'), ('b', 'string')])
    >>> right = ibis.table([('c', 'int64'), ('d', 'string')])
    >>> result = left.inner_join(right, left.a == right.c)
    >>> find_source_table(result)  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    NotImplementedError: More than one base table not implemented
    """

    def finder(e):
        found = isinstance(e, ir.TableExpr)
        return (lin.halt, e) if found else (lin.proceed, None)

    candidates = lin.traverse(finder, expr.op().flat_args())
    # deduplicate by the underlying op so distinct wrappers of one table
    # count once
    tables = list(toolz.unique(candidates, key=methodcaller('op')))
    if len(tables) > 1:
        raise NotImplementedError('More than one base table not implemented')
    return tables[0]
def shares_some_roots(exprs, parents):
    """Return whether `exprs` and `parents` depend on at least one common root table."""
    # unique root-table dependencies of each side
    expr_roots = set(lin.traverse(_find_root_table, exprs))
    parent_roots = set(lin.traverse(_find_root_table, parents))
    # non-empty intersection <=> the sets are not disjoint
    return not expr_roots.isdisjoint(parent_roots)