class Covariance(Filterable, Reduction):
    """Covariance of a set of number pairs."""

    # Paired numeric columns to compute covariance over.
    left = rlz.column(rlz.numeric)
    right = rlz.column(rlz.numeric)
    # 'sample' -> sample covariance; 'pop' -> population covariance.
    how = rlz.isin({'sample', 'pop'})

    output_dtype = dt.float64
class Correlation(Filterable, Reduction):
    """Coefficient of correlation of a set of number pairs."""

    # Paired numeric columns to correlate.
    left = rlz.column(rlz.numeric)
    right = rlz.column(rlz.numeric)
    # 'sample' -> sample correlation; 'pop' -> population correlation.
    how = rlz.isin({'sample', 'pop'})

    output_dtype = dt.float64
class MyOp(ops.ValueOp):
    """Example op using the legacy rules-based input specification."""

    # Expects a single table argument whose schema contains at least the
    # numeric columns 'group' and 'value'.
    input_type = [
        rules.table(name='table', schema=rules.table.with_column_subset(
            rules.column(name='group', value_type=rules.number),
            rules.column(name='value', value_type=rules.number)))
    ]
    # Result type mirrors the type of the first argument (the table).
    output_type = rules.type_of_arg(0)
class NotAny(Reduction, _Negatable):
    """Negated form of the `Any` reduction over a boolean column."""

    arg = rlz.column(rlz.boolean)

    output_dtype = dt.boolean

    def negate(self) -> Any:
        # NOTE: `Any` here is the reduction op class, not typing.Any.
        return Any(*self.args)
class Bucket(BucketLike):
    """Assign values of `arg` to discrete buckets bounded by `buckets` edges."""

    arg = rlz.column(rlz.any)
    # Scalar edges that delimit the buckets.
    buckets = rlz.tuple_of(rlz.scalar(rlz.any))
    # Which side of each interval is closed.
    closed = rlz.optional(rlz.isin({'left', 'right'}), default='left')
    close_extreme = rlz.optional(rlz.instance_of(bool), default=True)
    # Whether to add open-ended buckets below/above the outermost edges.
    include_under = rlz.optional(rlz.instance_of(bool), default=False)
    include_over = rlz.optional(rlz.instance_of(bool), default=False)

    def __init__(self, buckets, include_under, include_over, **kwargs):
        n_edges = len(buckets)
        # Guard clauses: reject degenerate edge lists up front.
        if n_edges == 0:
            raise ValueError('Must be at least one bucket edge')
        if n_edges == 1 and not (include_under and include_over):
            raise ValueError('If one bucket edge provided, must have '
                             'include_under=True and include_over=True')
        super().__init__(
            buckets=buckets,
            include_under=include_under,
            include_over=include_over,
            **kwargs,
        )

    @property
    def nbuckets(self):
        # N edges define N-1 interior buckets, plus one optional open-ended
        # bucket on each side.
        return len(self.buckets) - 1 + self.include_over + self.include_under
class ShiftBase(Analytic):
    """Base class for window shift (lead/lag-style) operations."""

    arg = rlz.column(rlz.any)
    # Optional shift amount: an integer row count or a time interval.
    offset = rlz.optional(rlz.one_of((rlz.integer, rlz.interval)))
    # Optional fill value for positions shifted in from outside the column.
    default = rlz.optional(rlz.any)

    # The shifted column keeps the input column's type.
    output_dtype = rlz.dtype_like("arg")
class ApproxMedian(Filterable, Reduction):
    """Compute the approximate median of a set of comparable values.

    Uses the Count-Min-Sketch algorithm. Exposed in Impala using
    APPX_MEDIAN.
    """

    arg = rlz.column(rlz.any)

    # Result keeps the input column's type.
    output_dtype = rlz.dtype_like('arg')
class Sum(Filterable, Reduction):
    """Sum reduction over a numeric (or boolean) column."""

    arg = rlz.column(rlz.numeric)

    @immutable_property
    def output_dtype(self):
        # Booleans sum as int64; other numerics widen to the largest type of
        # their family.
        is_boolean = isinstance(self.arg, ir.BooleanValue)
        return dt.int64 if is_boolean else self.arg.type().largest
class VarianceBase(Filterable, Reduction):
    """Shared base for sample/population variance-style reductions."""

    arg = rlz.column(rlz.numeric)
    # 'sample' or 'pop' selects the normalization.
    how = rlz.isin({'sample', 'pop'})

    @immutable_property
    def output_dtype(self):
        # Non-decimal inputs produce float64; decimals widen to the largest
        # decimal type.
        if not isinstance(self.arg, ir.DecimalValue):
            return dt.float64
        return self.arg.type().largest
class ApproxCountDistinct(Filterable, Reduction):
    """Approximate number of unique values using HyperLogLog algorithm.

    Impala offers the NDV built-in function for this.
    """

    arg = rlz.column(rlz.any)

    # NOTE: Impala 2.0 and higher returns a DOUBLE for NDV, but this op's
    # result is typed as int64.
    output_dtype = dt.int64
class Contains(Value):
    """Membership test of `value` against a collection of `options`."""

    value = rlz.any
    # The searched collection: a literal value list, a set, a column, or an
    # array value.
    options = rlz.one_of([
        rlz.value_list_of(rlz.any),
        rlz.set_,
        rlz.column(rlz.any),
        rlz.array_of(rlz.any),
    ])

    output_dtype = dt.boolean
    # Result shape follows the broadcast shape of all arguments.
    output_shape = rlz.shape_like("args")
class CumulativeMean(CumulativeOp):
    """Cumulative mean. Requires an order window."""

    arg = rlz.column(rlz.numeric)

    @immutable_property
    def output_dtype(self):
        # Decimals widen to the largest decimal type; other numerics yield
        # float64.
        return (
            self.arg.type().largest
            if isinstance(self.arg, ir.DecimalValue)
            else dt.float64
        )
class CumulativeSum(CumulativeOp):
    """Cumulative sum. Requires an ordering window."""

    arg = rlz.column(rlz.numeric)

    @immutable_property
    def output_dtype(self):
        # Booleans accumulate as int64; other numerics widen to the largest
        # type of their family.
        is_boolean = isinstance(self.arg, ir.BooleanValue)
        return dt.int64 if is_boolean else self.arg.type().largest
class Mean(Filterable, Reduction):
    """Arithmetic mean of a numeric column."""

    arg = rlz.column(rlz.numeric)

    @immutable_property
    def output_dtype(self):
        # Decimal inputs keep their exact decimal type; all other numerics
        # produce float64.
        decimal_input = isinstance(self.arg, ir.DecimalValue)
        return self.arg.type() if decimal_input else dt.float64

    def root_tables(self):
        return distinct_roots(self.arg)
class TopK(Node):
    """Select the top `k` groups of `arg` ranked by the `by` metric."""

    arg = rlz.column(rlz.any)
    k = rlz.non_negative_integer
    by = rlz.one_of((rlz.function_of(rlz.base_table_of("arg")), rlz.any))

    output_type = ir.TopK

    def blocks(self):  # pragma: no cover
        return True

    def root_tables(self):  # pragma: no cover
        # Only expression arguments contribute root tables.
        exprs = [arg for arg in self.flat_args() if isinstance(arg, ir.Expr)]
        return distinct_roots(*exprs)
class BitOr(Filterable, Reduction): """Aggregate bitwise OR operation. All elements in an integer column are ORed together. This can be used to determine which bit flags are set on any element. Resources: * BigQuery [`BIT_OR`](https://cloud.google.com/bigquery/docs/reference/standard-sql/aggregate_functions#bit_or) * MySQL [`BIT_OR`](https://dev.mysql.com/doc/refman/5.7/en/aggregate-functions.html#function_bit-or) """ # noqa: E501 arg = rlz.column(rlz.integer) output_dtype = rlz.dtype_like('arg')
class BitXor(Filterable, Reduction): """Aggregate bitwise XOR operation. All elements in an integer column are XORed together. This can be used as a parity checksum of element values. Resources: * BigQuery [`BIT_XOR`](https://cloud.google.com/bigquery/docs/reference/standard-sql/aggregate_functions#bit_xor) * MySQL [`BIT_XOR`](https://dev.mysql.com/doc/refman/5.7/en/aggregate-functions.html#function_bit-xor) """ # noqa: E501 arg = rlz.column(rlz.integer) output_dtype = rlz.dtype_like('arg')
class ArrayColumn(Value):
    """Build an array-valued column from one or more same-typed columns."""

    cols = rlz.value_list_of(rlz.column(rlz.any), min_length=1)

    output_shape = rlz.Shape.COLUMNAR

    def __init__(self, cols):
        # All inputs must share a single element type.
        distinct_types = {col.type() for col in cols}
        if len(distinct_types) > 1:
            raise com.IbisTypeError(
                f'The types of all input columns must match exactly in a '
                f'{type(self).__name__} operation.')
        super().__init__(cols=cols)

    @immutable_property
    def output_dtype(self):
        # Types match (enforced in __init__), so the first column's type is
        # the array element type.
        return dt.Array(self.cols[0].type())
class VectorizedUDF(Value):
    """Base op for vectorized user-defined functions over columns."""

    # The Python callable implementing the UDF.
    func = rlz.instance_of((FunctionType, LambdaType))
    # Column expressions passed as the callable's arguments.
    func_args = rlz.tuple_of(rlz.column(rlz.any))
    # TODO(kszucs): should rename these arguments to
    # input_dtypes and return_dtype
    input_type = rlz.tuple_of(rlz.datatype)
    return_type = rlz.datatype

    @property
    def inputs(self):
        return self.func_args

    @property
    def output_dtype(self):
        # The declared return datatype is the op's output type.
        return self.return_type

    def root_tables(self):
        return distinct_roots(*self.func_args)
class SortKey(Node):
    """A sort specification: a column expression plus a direction flag."""

    expr = rlz.column(rlz.any)
    # Accept boolean and 0/1 aliases for the direction.
    # NOTE: since 1 == True and 0 == False in Python, this dict literal
    # evaluates to {True: True, False: False}; the mapping is unchanged.
    ascending = rlz.optional(
        rlz.map_to(
            {
                True: True,
                False: False,
                1: True,
                0: False,
            },
        ),
        default=True,
    )

    output_type = ir.SortExpr

    def root_tables(self):
        # Delegate to the underlying expression's roots.
        return self.expr.op().root_tables()

    def resolve_name(self):
        # A sort key is named after the column it sorts by.
        return self.expr.get_name()
class MinRank(RankBase):
    """Compute position of first element within each equal-value group in
    sorted order.

    Equivalent to SQL RANK().

    Examples
    --------
    values   ranks
    1        0
    1        0
    2        2
    2        2
    2        2
    3        5

    Returns
    -------
    Int64Column
        The min rank
    """

    arg = rlz.column(rlz.any)
class DenseRank(RankBase):
    """Compute position of first element within each equal-value group in
    sorted order, ignoring duplicate values.

    Equivalent to SQL DENSE_RANK().

    Examples
    --------
    values   ranks
    1        0
    1        0
    2        1
    2        1
    2        1
    3        2

    Returns
    -------
    IntegerColumn
        The rank
    """

    arg = rlz.column(rlz.any)
assert result.equals(expected) @pytest.mark.parametrize( ('units', 'value', 'expected'), [({'Y'}, ibis.interval(hours=1), IbisTypeError), ({'Y', 'M', 'D'}, ibis.interval(hours=1), IbisTypeError), ({'Q', 'W', 'D'}, ibis.interval(seconds=1), IbisTypeError)]) def test_invalid_interval(units, value, expected): with pytest.raises(expected): rlz.interval(value, units=units) @pytest.mark.parametrize( ('validator', 'value', 'expected'), [(rlz.column(rlz.any), table.int_col, table.int_col), (rlz.column(rlz.string), table.string_col, table.string_col), (rlz.scalar(rlz.integer), ibis.literal(3), ibis.literal(3)), (rlz.scalar(rlz.any), 'caracal', ibis.literal('caracal'))]) def test_valid_column_or_scalar(validator, value, expected): result = validator(value) assert result.equals(expected) @pytest.mark.parametrize(('validator', 'value', 'expected'), [ (rlz.column(rlz.integer), table.double_col, IbisTypeError), (rlz.column(rlz.any), ibis.literal(3), IbisTypeError), (rlz.column(rlz.integer), ibis.literal(3), IbisTypeError), ]) def test_invalid_column_or_scalar(validator, value, expected): with pytest.raises(expected):
('units', 'value', 'expected'), [ ({'Y'}, ibis.interval(hours=1), IbisTypeError), ({'Y', 'M', 'D'}, ibis.interval(hours=1), IbisTypeError), ({'Q', 'W', 'D'}, ibis.interval(seconds=1), IbisTypeError), ], ) def test_invalid_interval(units, value, expected): with pytest.raises(expected): rlz.interval(value, units=units) @pytest.mark.parametrize( ('validator', 'value', 'expected'), [ (rlz.column(rlz.any), table.int_col, table.int_col), (rlz.column(rlz.string), table.string_col, table.string_col), (rlz.scalar(rlz.integer), ibis.literal(3), ibis.literal(3)), (rlz.scalar(rlz.any), 'caracal', ibis.literal('caracal')), ], ) def test_valid_column_or_scalar(validator, value, expected): result = validator(value) assert result.equals(expected) @pytest.mark.parametrize( ('validator', 'value', 'expected'), [ (rlz.column(rlz.integer), table.double_col, IbisTypeError), (rlz.column(rlz.any), ibis.literal(3), IbisTypeError),
class Arbitrary(Filterable, Reduction):
    """Select an arbitrary value from the column.

    `how` selects the strategy: 'first', 'last', or 'heavy'.
    """

    arg = rlz.column(rlz.any)
    how = rlz.optional(rlz.isin({'first', 'last', 'heavy'}))
    # Result keeps the input column's type.
    output_dtype = rlz.dtype_like('arg')
class ArrayCollect(Reduction):
    """Collect the values of a column into a single array."""

    arg = rlz.column(rlz.any)

    @immutable_property
    def output_dtype(self):
        # An array whose element type is the input column's type.
        element_type = self.arg.type()
        return dt.Array(element_type)
class CountDistinct(Filterable, Reduction):
    """Number of distinct values in a column."""

    arg = rlz.column(rlz.any)
    output_dtype = dt.int64
class All(Reduction):
    """Reduction over a boolean column ('all' semantics per its name)."""

    arg = rlz.column(rlz.boolean)
    output_dtype = dt.boolean

    def negate(self):
        # The logical negation is expressed as a distinct op.
        return NotAll(self.arg)
class Count(Filterable, Reduction):
    """Count reduction; accepts either a column or a whole table."""

    arg = rlz.one_of((rlz.column(rlz.any), rlz.table))
    output_dtype = dt.int64
class GroupConcat(Filterable, Reduction):
    """Concatenate column values into one string, joined by `sep`."""

    arg = rlz.column(rlz.any)
    # Separator placed between concatenated values.
    sep = rlz.string
    output_dtype = dt.string
class Min(Filterable, Reduction):
    """Minimum value of a column."""

    arg = rlz.column(rlz.any)
    # Result keeps the input column's type.
    output_dtype = rlz.dtype_like('arg')