class ShiftBase(Analytic):
    """Base class for window shift operations (presumably lead/lag —
    confirm against subclasses)."""

    # Column whose values are shifted.
    arg = rlz.column(rlz.any)
    # How far to shift: a row count or a time interval.
    offset = rlz.optional(rlz.one_of((rlz.integer, rlz.interval)))
    # Fill value for positions shifted past the window edge.
    default = rlz.optional(rlz.any)
    # Result type mirrors the shifted column.
    output_dtype = rlz.dtype_like("arg")
class Bucket(BucketLike):
    """Assign each value of a column to a discrete bucket.

    ``buckets`` gives the ordered bucket edges; ``include_under`` and
    ``include_over`` add open-ended buckets below the first and above the
    last edge respectively.
    """

    arg = rlz.column(rlz.any)
    buckets = rlz.tuple_of(rlz.scalar(rlz.any))
    # Which side of each bucket interval is closed.
    closed = rlz.optional(rlz.isin({'left', 'right'}), default='left')
    close_extreme = rlz.optional(rlz.instance_of(bool), default=True)
    include_under = rlz.optional(rlz.instance_of(bool), default=False)
    include_over = rlz.optional(rlz.instance_of(bool), default=False)

    def __init__(self, buckets, include_under, include_over, **kwargs):
        # Idiomatic emptiness check (was `if not len(buckets)`).
        if not buckets:
            raise ValueError('Must be at least one bucket edge')
        if len(buckets) == 1 and not (include_under and include_over):
            # A single edge only defines buckets together with both
            # open-ended buckets on either side of it.
            raise ValueError('If one bucket edge provided, must have '
                             'include_under=True and include_over=True')
        super().__init__(
            buckets=buckets,
            include_under=include_under,
            include_over=include_over,
            **kwargs,
        )

    @property
    def nbuckets(self):
        # n edges yield n - 1 closed buckets; each open-ended bucket adds
        # one more (bools count as 0/1 here).
        return len(self.buckets) - 1 + self.include_over + self.include_under
def test_optional(validator, input):
    """An optional validator matches the wrapped validator on real input
    and maps ``None`` straight through to ``None``."""
    expected = validator(input)
    optional_validator = rlz.optional(validator)
    if isinstance(expected, ibis.Expr):
        # Expression results need structural comparison, not ==.
        assert optional_validator(input).equals(expected)
    else:
        assert optional_validator(input) == expected
    assert optional_validator(None) is None
class Clip(Value):
    """Constrain a numeric column to an optional [lower, upper] range."""

    arg = rlz.strict_numeric
    lower = rlz.optional(rlz.strict_numeric)
    upper = rlz.optional(rlz.strict_numeric)
    # Output keeps the argument's dtype; shape broadcasts over all args.
    output_dtype = rlz.dtype_like("arg")
    output_shape = rlz.shape_like("args")
class StringFind(Value):
    """Locate ``substr`` within ``arg``, optionally within [start, end)."""

    arg = rlz.string
    substr = rlz.string
    start = rlz.optional(rlz.integer)
    end = rlz.optional(rlz.integer)
    output_shape = rlz.shape_like("arg")
    # Position result is a 64-bit integer.
    output_dtype = dt.int64
class AsOfJoin(Join):
    """Join on nearest-key match, with optional grouping keys and tolerance."""

    # TODO(kszucs): convert to proper predicate rules
    by = rlz.optional(lambda x, this: x, default=())
    tolerance = rlz.optional(rlz.interval)

    def __init__(self, left, right, by, predicates, **kwargs):
        # Normalize `by` into cleaned join predicates before the base
        # Join performs its own predicate handling.
        by = _clean_join_predicates(left, right, util.promote_list(by))
        super().__init__(
            left=left,
            right=right,
            by=by,
            predicates=predicates,
            **kwargs,
        )
class LPad(Value):
    """Left-pad a string column to ``length`` with an optional pad string."""

    arg = rlz.string
    length = rlz.integer
    pad = rlz.optional(rlz.string)
    output_shape = rlz.shape_like("arg")
    output_dtype = dt.string
class ArraySlice(Value):
    """Slice an array column from ``start`` up to an optional ``stop``."""

    arg = rlz.array
    start = rlz.integer
    stop = rlz.optional(rlz.integer)
    # A slice of an array has the same element type as the input.
    output_dtype = rlz.dtype_like("arg")
    output_shape = rlz.shape_like("arg")
class Join(TableNode):
    """Relational join of two tables over a list of predicates."""

    left = rlz.table
    right = rlz.table
    # TODO(kszucs): convert to proper predicate rules
    predicates = rlz.optional(lambda x, this: x, default=())

    def __init__(self, left, right, predicates, **kwargs):
        # Deduplicate the two sides and normalize the predicate list
        # before rule validation runs.
        left, right, predicates = _make_distinct_join_predicates(
            left, right, util.promote_list(predicates)
        )
        super().__init__(
            left=left, right=right, predicates=predicates, **kwargs
        )

    @property
    def schema(self):
        # For joins retaining both table schemas, merge them together here
        return self.left.schema().append(self.right.schema())

    @util.deprecated(version="4.0", instead="")
    def has_schema(self):
        # True only when the two sides share no column names.
        return not set(self.left.columns) & set(self.right.columns)

    def root_tables(self):
        if util.all_of([self.left.op(), self.right.op()], (Join, Selection)):
            # Unraveling is not possible
            return [self.left.op(), self.right.op()]
        return distinct_roots(self.left, self.right)
class Substring(Value):
    """Extract a substring beginning at ``start`` with optional ``length``."""

    arg = rlz.string
    start = rlz.integer
    length = rlz.optional(rlz.integer)
    output_dtype = dt.string
    output_shape = rlz.shape_like('arg')
class StringToTimestamp(Value):
    """Parse a string column into a timestamp using ``format_str``."""

    arg = rlz.string
    format_str = rlz.string
    timezone = rlz.optional(rlz.string)
    output_shape = rlz.shape_like("arg")
    # Result is always a UTC timestamp regardless of the input timezone.
    output_dtype = dt.Timestamp(timezone='UTC')
class UnboundTable(PhysicalTable):
    """A table with a known schema but no backing data source."""

    schema = rlz.instance_of(sch.Schema)
    # A unique name is generated when none is provided.
    name = rlz.optional(rlz.instance_of(str), default=genname)

    def has_resolved_name(self):
        # The name rule always supplies one, so it is always resolved.
        return True

    def resolve_name(self):
        return self.name
class DropNa(TableNode, sch.HasSchema):
    """Drop null values in the table.

    ``how`` selects between dropping rows with any null ('any') or only
    all-null rows ('all'); ``subset`` limits the columns inspected.
    """

    table = rlz.table
    how = rlz.isin({'any', 'all'})
    subset = rlz.optional(rlz.tuple_of(rlz.column_from("table")), default=())

    @property
    def schema(self):
        # Dropping rows never changes the column layout.
        return self.table.schema()
class TimestampFromYMDHMS(Value):
    """Build a timestamp from separate date/time component columns."""

    year = rlz.integer
    month = rlz.integer
    day = rlz.integer
    hours = rlz.integer
    minutes = rlz.integer
    seconds = rlz.integer
    timezone = rlz.optional(rlz.string)
    output_dtype = dt.timestamp
    # Shape broadcasts across all component arguments.
    output_shape = rlz.shape_like("args")
class Histogram(BucketLike):
    """Bin a numeric column into buckets.

    Exactly one of ``nbins`` and ``binwidth`` must be supplied.
    """

    arg = rlz.numeric
    nbins = rlz.optional(rlz.instance_of(int))
    binwidth = rlz.optional(rlz.scalar(rlz.numeric))
    base = rlz.optional(rlz.scalar(rlz.numeric))
    closed = rlz.optional(rlz.isin({'left', 'right'}), default='left')
    aux_hash = rlz.optional(rlz.instance_of(str))

    def __init__(self, nbins, binwidth, **kwargs):
        # The sizing parameters are mutually exclusive, but one is required.
        if nbins is None:
            if binwidth is None:
                raise ValueError('Must indicate nbins or binwidth')
        elif binwidth is not None:
            raise ValueError('nbins and binwidth are mutually exclusive')
        super().__init__(nbins=nbins, binwidth=binwidth, **kwargs)

    @property
    def output_dtype(self):
        # always undefined cardinality (for now)
        return dt.category
class AlchemyTable(ops.DatabaseTable):
    """Database table backed by a SQLAlchemy table object."""

    sqla_table = rlz.instance_of(object)
    name = rlz.optional(rlz.instance_of(str), default=None)
    schema = rlz.optional(rlz.instance_of(sch.Schema), default=None)

    def __init__(self, source, sqla_table, name, schema):
        # Fall back to the SQLAlchemy table for the name and an inferred
        # schema when the caller does not supply them.
        if name is None:
            name = sqla_table.name
        if schema is None:
            schema = sch.infer(sqla_table, schema=schema)
        super().__init__(
            name=name, schema=schema, sqla_table=sqla_table, source=source
        )

    # TODO(kszucs): remove this
    def __equals__(self, other: AlchemyTable) -> bool:
        # override the default implementation to not compare
        # sqla_table instances
        return (
            self.name == other.name
            and self.source == other.source
            and self.schema.equals(other.schema)
        )
class Round(Value):
    """Round a numeric column to an optional number of digits."""

    arg = rlz.numeric
    digits = rlz.optional(rlz.numeric)
    output_shape = rlz.shape_like("arg")

    @property
    def output_dtype(self):
        arg_type = self.arg.type()
        # Decimals keep their exact type; otherwise rounding to whole
        # numbers yields int64 and digit-rounding yields double.
        if isinstance(arg_type, dt.Decimal):
            return arg_type
        if self.digits is None:
            return dt.int64
        return dt.double
class CategoryLabel(Value):
    """Map a category column onto string labels, one per category."""

    arg = rlz.category
    labels = rlz.tuple_of(rlz.instance_of(str))
    # Label used for null values, if any.
    nulls = rlz.optional(rlz.instance_of(str))
    output_dtype = dt.string
    output_shape = rlz.shape_like("arg")

    def __init__(self, arg, labels, **kwargs):
        # Every category needs exactly one label.
        cardinality = arg.type().cardinality
        if len(labels) != cardinality:
            raise ValueError('Number of labels must match number of '
                             f'categories: {cardinality}')
        super().__init__(arg=arg, labels=labels, **kwargs)
class ParseURL(Value):
    """Extract one component of a URL string.

    ``key`` further selects a single query parameter — presumably only
    meaningful with extract='QUERY'; confirm against backends.
    """

    arg = rlz.string
    extract = rlz.isin({
        'PROTOCOL',
        'HOST',
        'PATH',
        'REF',
        'AUTHORITY',
        'FILE',
        'USERINFO',
        'QUERY',
    })
    key = rlz.optional(rlz.string)
    output_shape = rlz.shape_like("arg")
    output_dtype = dt.string
class SortKey(Node):
    """A column paired with a sort direction."""

    expr = rlz.column(rlz.any)
    # Normalize truthy/falsy direction spellings to a plain bool.
    ascending = rlz.optional(
        rlz.map_to({
            True: True,
            False: False,
            1: True,
            0: False,
        }),
        default=True,
    )
    output_type = ir.SortExpr

    def root_tables(self):
        # A sort key is rooted wherever its column expression is.
        return self.expr.op().root_tables()

    def resolve_name(self):
        return self.expr.get_name()
class ScalarParameter(Value):
    """A typed scalar placeholder identified by a monotonically
    increasing counter."""

    # Shared counter guarantees each parameter a distinct default id.
    _counter = itertools.count()

    dtype = rlz.datatype
    counter = rlz.optional(
        rlz.instance_of(int),
        default=lambda: next(ScalarParameter._counter),
    )
    output_shape = rlz.Shape.SCALAR
    output_dtype = property(attrgetter("dtype"))

    def resolve_name(self):
        return f'param_{self.counter:d}'

    def __hash__(self):
        # Identity is (dtype, counter); two parameters with the same pair
        # hash alike.
        return hash((self.dtype, self.counter))

    @property
    def inputs(self):
        # Parameters have no input expressions.
        return ()

    def root_tables(self):
        return []
class Arbitrary(Filterable, Reduction):
    """Reduce a column to one arbitrarily chosen value."""

    arg = rlz.column(rlz.any)
    # Selection strategy; backend-dependent semantics.
    how = rlz.optional(rlz.isin({'first', 'last', 'heavy'}))
    output_dtype = rlz.dtype_like('arg')
class Filterable(Value):
    """Mixin adding an optional boolean ``where`` filter to a value op."""

    where = rlz.optional(rlz.boolean)
class StringSQLLike(FuzzySearch):
    """SQL ``LIKE`` pattern match with an optional escape character."""

    arg = rlz.string
    pattern = rlz.string
    escape = rlz.optional(rlz.instance_of(str))
class NullLiteral(Literal, Singleton):
    """Typeless NULL literal"""

    # Value is always None; dtype defaults to the null type.
    value = rlz.optional(type(None))
    dtype = rlz.optional(rlz.instance_of(dt.Null), default=dt.null)
class Op(Annotable):
    """Minimal annotable with one optional list argument.

    ``default`` here is a name defined elsewhere in this file — presumably
    a fixture value; verify against the surrounding test module.
    """

    arg = rlz.optional(rlz.instance_of(list), default=default)
class Selection(TableNode, sch.HasSchema):
    """Projection/filter/sort over a single table.

    ``selections`` are the projected expressions (empty means all columns),
    ``predicates`` filter rows, and ``sort_keys`` order the result.
    """

    table = rlz.table
    selections = rlz.optional(
        rlz.tuple_of(
            rlz.one_of((
                rlz.table,
                rlz.column_from("table"),
                rlz.function_of("table"),
                rlz.any,
            ))
        ),
        default=(),
    )
    predicates = rlz.optional(rlz.tuple_of(rlz.boolean), default=())
    sort_keys = rlz.optional(
        rlz.tuple_of(
            rlz.one_of((
                rlz.column_from("table"),
                rlz.function_of("table"),
                rlz.sort_key(from_="table"),
                rlz.pair(
                    rlz.one_of((
                        rlz.column_from("table"),
                        rlz.function_of("table"),
                        rlz.any,
                    )),
                    # Normalize the many direction spellings to a bool.
                    rlz.map_to({
                        True: True,
                        False: False,
                        "desc": False,
                        "descending": False,
                        "asc": True,
                        "ascending": True,
                        1: True,
                        0: False,
                    }),
                ),
            ))
        ),
        default=(),
    )

    def __init__(self, table, selections, predicates, sort_keys, **kwargs):
        from ibis.expr.analysis import shares_all_roots, shares_some_roots

        # All projected/sorted expressions must originate from `table`.
        if not shares_all_roots(selections + sort_keys, table):
            raise com.RelationError(
                "Selection expressions don't fully originate from "
                "dependencies of the table expression.")
        for predicate in predicates:
            if not shares_some_roots(predicate, table):
                raise com.RelationError(
                    "Predicate doesn't share any roots with table")
        super().__init__(
            table=table,
            selections=selections,
            predicates=predicates,
            sort_keys=sort_keys,
            **kwargs,
        )
        # Validate no overlapping columns in schema
        assert self.schema

    @cached_property
    def _projection(self):
        # The same selection without predicates or sort keys.
        return self.__class__(table=self.table, selections=self.selections)

    @cached_property
    def schema(self):
        """Derive the output schema from the selections (or pass through
        the table schema when there are none)."""
        if not self.selections:
            return self.table.schema()
        types = []
        names = []
        for projection in self.selections:
            if isinstance(projection, ir.DestructColumn):
                # If this is a destruct, then we destructure
                # the result and assign to multiple columns
                struct_type = projection.type()
                for name in struct_type.names:
                    names.append(name)
                    types.append(struct_type[name])
            elif isinstance(projection, ir.Value):
                names.append(projection.get_name())
                types.append(projection.type())
            elif isinstance(projection, ir.Table):
                # A whole-table selection contributes all of its columns.
                schema = projection.schema()
                names.extend(schema.names)
                types.extend(schema.types)
        return sch.Schema(names, types)

    def blocks(self):
        # A bare filter/sort (no selections) does not block unraveling.
        return bool(self.selections)

    @util.deprecated(instead="instantiate Selection directly", version="4.0.0")
    def substitute_table(self, table_expr):  # pragma: no cover
        return Selection(table_expr, self.selections)

    def root_tables(self):
        return [self]

    @util.deprecated(instead="", version="4.0.0")
    def can_add_filters(self, wrapped_expr, predicates):  # pragma: no cover
        pass

    @util.deprecated(instead="", version="4.0.0")
    def empty_or_equal(self, other) -> bool:  # pragma: no cover
        """For each field, either side may be empty; otherwise the entries
        must match pairwise."""
        for field in "selections", "sort_keys", "predicates":
            selfs = getattr(self, field)
            others = getattr(other, field)
            # BUG FIX: the original assigned a bare generator expression,
            # which is always truthy, so `valid` could never be False.
            # Wrap the pairwise comparison in all().
            valid = (
                not selfs
                or not others
                or all(a.equals(b) for a, b in zip(selfs, others))
            )
            if not valid:
                return False
        return True

    @util.deprecated(instead="", version="4.0.0")
    def compatible_with(self, other):  # pragma: no cover
        # self and other are equivalent except for predicates, selections, or
        # sort keys any of which is allowed to be empty. If both are not empty
        # then they must be equal
        if self.equals(other):
            return True
        if not isinstance(other, type(self)):
            return False
        return self.table.equals(other.table) and self.empty_or_equal(other)

    def aggregate(self, this, metrics, by=None, having=None):
        if len(self.selections) > 0:
            return Aggregation(this, metrics, by=by, having=having)
        else:
            helper = AggregateSelection(this, metrics, by, having)
            return helper.get_result()

    def sort_by(self, expr, sort_exprs):
        from ibis.expr.analysis import shares_all_roots
        resolved_keys = _maybe_convert_sort_keys([self.table, expr],
                                                 sort_exprs)
        if not self.blocks():
            # Fuse the new keys into this selection when it is non-blocking
            # and the keys originate from the same table.
            if shares_all_roots(resolved_keys, self.table):
                return Selection(
                    self.table,
                    self.selections,
                    predicates=self.predicates,
                    sort_keys=self.sort_keys + tuple(resolved_keys),
                )
        return Selection(expr, [], sort_keys=resolved_keys)
class Log(Logarithm):
    """Logarithm with an optional base (base semantics when omitted are
    backend-defined — confirm; commonly the natural log)."""

    arg = rlz.strict_numeric
    base = rlz.optional(rlz.strict_numeric)
class Aggregation(TableNode, sch.HasSchema):
    """Grouped aggregation over a table.

    metrics : per-group scalar aggregates
    by : group expressions
    having : post-aggregation predicate
    TODO: not putting this in the aggregate operation yet
    where : pre-aggregation predicate
    """

    table = rlz.table
    metrics = rlz.optional(
        rlz.tuple_of(
            rlz.one_of((
                rlz.function_of(
                    "table",
                    output_rule=rlz.one_of(
                        (rlz.reduction, rlz.scalar(rlz.any))
                    ),
                ),
                rlz.reduction,
                rlz.scalar(rlz.any),
                rlz.tuple_of(rlz.scalar(rlz.any)),
            )),
            flatten=True,
        ),
        default=(),
    )
    by = rlz.optional(
        rlz.tuple_of(
            rlz.one_of((
                rlz.function_of("table"),
                rlz.column_from("table"),
                rlz.column(rlz.any),
            ))
        ),
        default=(),
    )
    having = rlz.optional(
        rlz.tuple_of(
            rlz.one_of((
                rlz.function_of("table", output_rule=rlz.scalar(rlz.boolean)),
                rlz.scalar(rlz.boolean),
            )),
        ),
        default=(),
    )
    predicates = rlz.optional(rlz.tuple_of(rlz.boolean), default=())
    sort_keys = rlz.optional(
        rlz.tuple_of(
            rlz.one_of((
                rlz.column_from("table"),
                rlz.function_of("table"),
                rlz.sort_key(from_="table"),
                rlz.pair(
                    rlz.one_of((
                        rlz.column_from("table"),
                        rlz.function_of("table"),
                        rlz.any,
                    )),
                    # Normalize direction spellings to a plain bool.
                    rlz.map_to({
                        True: True,
                        False: False,
                        "desc": False,
                        "descending": False,
                        "asc": True,
                        "ascending": True,
                        1: True,
                        0: False,
                    }),
                ),
            ))
        ),
        default=(),
    )

    def __init__(self, table, metrics, by, having, predicates, sort_keys):
        from ibis.expr.analysis import shares_all_roots, shares_some_roots

        # All non-scalar refs originate from the input table
        if not shares_all_roots(metrics + by + having + sort_keys, table):
            raise com.RelationError(
                "Selection expressions don't fully originate from "
                "dependencies of the table expression.")
        # invariant due to Aggregation and AggregateSelection requiring a
        # valid Selection
        assert all(
            shares_some_roots(predicate, table) for predicate in predicates
        )
        # Sorting is meaningless without grouping keys.
        if not by:
            sort_keys = tuple()
        super().__init__(
            table=table,
            metrics=metrics,
            by=by,
            having=having,
            predicates=predicates,
            sort_keys=sort_keys,
        )
        # Validate schema has no overlapping columns
        assert self.schema

    def blocks(self):
        # Aggregations always block unraveling.
        return True

    @util.deprecated(instead="instantiate Aggregation directly",
                     version="4.0.0")
    def substitute_table(self, table_expr):  # pragma: no cover
        return Aggregation(table_expr, self.metrics, by=self.by,
                           having=self.having)

    @cached_property
    def schema(self):
        """Output schema: group keys followed by metric columns."""
        names = []
        types = []
        for expr in self.by + self.metrics:
            if isinstance(expr, ir.DestructValue):
                # If this is a destruct, then we destructure
                # the result and assign to multiple columns
                struct_type = expr.type()
                for name in struct_type.names:
                    names.append(name)
                    types.append(struct_type[name])
            else:
                names.append(expr.get_name())
                types.append(expr.type())
        return sch.Schema(names, types)

    def sort_by(self, expr, sort_exprs):
        from ibis.expr.analysis import shares_all_roots

        resolved_keys = _maybe_convert_sort_keys([self.table, expr],
                                                 sort_exprs)
        # Fuse the new sort keys into this aggregation when they originate
        # from the same table; otherwise wrap in a fresh Selection.
        if shares_all_roots(resolved_keys, self.table):
            return Aggregation(
                self.table,
                self.metrics,
                by=self.by,
                having=self.having,
                predicates=self.predicates,
                sort_keys=self.sort_keys + tuple(resolved_keys),
            )
        return Selection(expr, [], sort_keys=resolved_keys)
class Log(ops.Node):
    """Bare log node with a double argument and optional double base —
    presumably a test fixture; verify against the surrounding module."""

    arg = rlz.double()
    base = rlz.optional(rlz.double())