def infer_pandas_schema(df, schema=None):
    schema = schema if schema is not None else {}

    pairs = []
    for column_name, pandas_dtype in df.dtypes.iteritems():
        if not isinstance(column_name, str):
            raise TypeError(
                'Column names must be strings to use the pandas backend'
            )

        if column_name in schema:
            ibis_dtype = dt.dtype(schema[column_name])
        elif pandas_dtype == np.object_:
            inferred_dtype = infer_pandas_dtype(df[column_name], skipna=True)
            if inferred_dtype in {'mixed', 'decimal'}:
                # TODO: in principle we can handle decimal (added in pandas
                # 0.23)
                raise TypeError(
                    'Unable to infer type of column {0!r}. Try instantiating '
                    'your table from the client with client.table('
                    "'my_table', schema={{{0!r}: <explicit type>}})".format(
                        column_name
                    )
                )
            ibis_dtype = _inferable_pandas_dtypes[inferred_dtype]
        else:
            ibis_dtype = dt.dtype(pandas_dtype)

        pairs.append((column_name, ibis_dtype))

    return sch.schema(pairs)
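# Hedged usage sketch, not part of the original module: shows why the
# `schema` override above exists. The helper name, DataFrame contents, and
# the 'decimal(12, 2)' type string are illustrative assumptions.
def _example_infer_pandas_schema():
    import decimal

    import pandas as pd

    df = pd.DataFrame(
        {
            'ints': [1, 2, 3],
            'prices': [decimal.Decimal('1.50'), decimal.Decimal('2.25')],
        }
    )
    # Without the override, the object-typed 'prices' column is inferred as
    # 'decimal' and the loop above raises TypeError asking for an explicit
    # type.
    return infer_pandas_schema(df, schema={'prices': 'decimal(12, 2)'})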
def schema_from_table(table, schema=None):
    """Retrieve an ibis schema from a SQLAlchemy ``Table``.

    Parameters
    ----------
    table : sa.Table
    schema : dict, optional
        Mapping of column name to an explicit ibis type, overriding the
        type derived from the column's SQLAlchemy type.

    Returns
    -------
    schema : ibis.expr.schema.Schema
        An ibis schema corresponding to the types of the columns in `table`.
    """
    schema = schema if schema is not None else {}
    pairs = []
    for name, column in table.columns.items():
        if name in schema:
            dtype = dt.dtype(schema[name])
        else:
            dtype = dt.dtype(
                getattr(table.bind, 'dialect', SQLAlchemyDialect()),
                column.type,
                nullable=column.nullable,
            )
        pairs.append((name, dtype))
    return sch.schema(pairs)
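# Hedged usage sketch, not part of the original module: builds a small
# SQLAlchemy table and converts it to an ibis schema, overriding one column.
# The engine URL, table, and column names are illustrative assumptions, and
# MetaData(bind=...) assumes SQLAlchemy < 2.0.
def _example_schema_from_table():
    import sqlalchemy as sa

    engine = sa.create_engine('sqlite://')
    t = sa.Table(
        't',
        sa.MetaData(bind=engine),
        sa.Column('id', sa.Integer, nullable=False),
        sa.Column('price', sa.Numeric),
    )
    # Pin 'price' to a concrete ibis type instead of relying on the
    # dialect-based translation of sa.Numeric.
    return schema_from_table(t, schema={'price': 'decimal(12, 2)'})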
def test_literal_promotions(table, op, name, case, ex_type):
    col = table[name]

    result = op(col, case)
    assert result.type() == dt.dtype(ex_type)

    result = op(case, col)
    assert result.type() == dt.dtype(ex_type)
def test_interval_invalid_unit(unit):
    definition = "interval('{}')".format(unit)

    with pytest.raises(ValueError):
        dt.dtype(definition)

    with pytest.raises(ValueError):
        dt.Interval(dt.int32, unit)
def test_interval(unit):
    definition = "interval('{}')".format(unit)
    assert dt.Interval(unit, dt.int32) == dt.dtype(definition)

    definition = "interval<uint16>('{}')".format(unit)
    assert dt.Interval(unit, dt.uint16) == dt.dtype(definition)

    definition = "interval<int64>('{}')".format(unit)
    assert dt.Interval(unit, dt.int64) == dt.dtype(definition)
def test_string_to_number(table, type):
    casted = table.g.cast(type)
    casted_literal = ibis.literal('5').cast(type).name('bar')

    assert isinstance(casted, ir.ColumnExpr)
    assert casted.type() == dt.dtype(type)

    assert isinstance(casted_literal, ir.ScalarExpr)
    assert casted_literal.type() == dt.dtype(type)
    assert casted_literal.get_name() == 'bar'
def param(type):
    """Create a parameter of a particular type to be defined just before
    execution.

    Parameters
    ----------
    type : dt.DataType
        The type of the unbound parameter, e.g., double, int64, date, etc.

    Returns
    -------
    ScalarExpr

    Examples
    --------
    >>> import ibis
    >>> import ibis.expr.datatypes as dt
    >>> start = ibis.param(dt.date)
    >>> end = ibis.param(dt.date)
    >>> schema = [('timestamp_col', 'timestamp'), ('value', 'double')]
    >>> t = ibis.table(schema)
    >>> predicates = [t.timestamp_col >= start, t.timestamp_col <= end]
    >>> expr = t.filter(predicates).value.sum()
    """
    import ibis.expr.datatypes as dt
    import ibis.expr.operations as ops

    return ops.ScalarParameter(dt.dtype(type)).to_expr()
def sa_array(dialect, satype, nullable=True):
    dimensions = satype.dimensions
    if dimensions is not None and dimensions != 1:
        raise NotImplementedError('Nested array types not yet supported')

    value_dtype = dt.dtype(dialect, satype.item_type)
    return dt.Array(value_dtype, nullable=nullable)
def test_literal_with_implicit_type(value, expected_type):
    expr = ibis.literal(value)

    assert isinstance(expr, ir.ScalarExpr)
    assert expr.type() == dt.dtype(expected_type)

    assert isinstance(expr.op(), ops.Literal)
    assert expr.op().value is value
def test_zero_subtract_literal_promotions(
    table, op, left_fn, right_fn, ex_type
):
    # in case of zero subtract the order of operands matters
    left, right = left_fn(table), right_fn(table)
    result = op(left, right)

    assert result.type() == dt.dtype(ex_type)
def trans_struct(t, context):
    return 'STRUCT<{}>'.format(
        ', '.join(
            '{} {}'.format(
                name, ibis_type_to_bigquery_type(dt.dtype(type), context)
            )
            for name, type in zip(t.names, t.types)
        )
    )
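# Hedged sketch of the rendering above, not from the original source: a
# struct with fields a: int64 and b: string would come out roughly as
# 'STRUCT<a INT64, b STRING>', with each field type translated by
# ibis_type_to_bigquery_type (the exact rendered names depend on that
# function).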
def infer_parquet_schema(schema):
    pairs = []
    for field in schema.to_arrow_schema():
        ibis_dtype = dt.dtype(field.type, nullable=field.nullable)
        name = field.name
        if not re.match(r'^__index_level_\d+__$', name):
            pairs.append((name, ibis_dtype))
    return sch.schema(pairs)
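# Hedged usage sketch, not part of the original module: `pq` is
# pyarrow.parquet and the file path is illustrative. Pandas index columns
# named __index_level_N__ are dropped by the regex filter above.
def _example_infer_parquet_schema():
    import pyarrow.parquet as pq

    parquet_file = pq.ParquetFile('data.parquet')
    return infer_parquet_schema(parquet_file.schema)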
def shape_like(arg, dtype=None):
    if isinstance(arg, (tuple, list, ir.ListExpr)):
        datatype = dtype or highest_precedence_dtype(arg)
        columnar = util.any_of(arg, ir.AnyColumn)
    else:
        datatype = dtype or arg.type()
        columnar = isinstance(arg, ir.AnyColumn)

    dtype = dt.dtype(datatype)

    if columnar:
        return dtype.column_type()
    else:
        return dtype.scalar_type()
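# Hedged sketch of the promotion rule in shape_like, not from the original
# source: if any input is columnar the result is the column variant of the
# dtype, otherwise the scalar variant. The table and column names are
# illustrative.
#
#     t = ibis.table([('a', 'int64')], name='t')
#     shape_like(t.a)                           # column variant of int64
#     shape_like(ibis.literal(1))               # scalar variant of int8
#     shape_like([t.a, ibis.literal(1)], 'int64')  # column variant: one
#                                                  # input is columnar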
def parse_type(t):
    t = t.lower()
    if t in _impala_to_ibis_type:
        return _impala_to_ibis_type[t]
    else:
        if 'varchar' in t or 'char' in t:
            return 'string'
        elif 'decimal' in t:
            result = dt.dtype(t)
            if result:
                return t
            else:
                raise ValueError(t)
        else:
            raise Exception(t)
def value(dtype, arg):
    """Validates that the given argument is a Value with a particular
    datatype

    Parameters
    ----------
    dtype : DataType subclass or DataType instance
    arg : python literal or an ibis expression
        If a python literal is given the validator tries to coerce it to an
        ibis literal.

    Returns
    -------
    arg : AnyValue
        An ibis value expression with the specified datatype
    """
    if not isinstance(arg, ir.Expr):
        # coerce python literal to ibis literal
        arg = ir.literal(arg)

    if not isinstance(arg, ir.AnyValue):
        raise com.IbisTypeError(
            'Given argument with type {} is not a value '
            'expression'.format(type(arg))
        )

    # retrieve literal values for implicit cast check
    value = getattr(arg.op(), 'value', None)

    if isinstance(dtype, type) and isinstance(arg.type(), dtype):
        # dtype class has been specified like dt.Interval or dt.Decimal
        return arg
    elif dt.castable(arg.type(), dt.dtype(dtype), value=value):
        # dtype instance or string has been specified and arg's dtype is
        # implicitly castable to it, like dt.int8 is castable to dt.int64
        return arg
    else:
        raise com.IbisTypeError(
            'Given argument with datatype {} is not '
            'subtype of {} nor implicitly castable to '
            'it'.format(arg.type(), dtype)
        )
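# Hedged sketch of the validator's three accept/reject paths, not from the
# original source; the example values are illustrative:
#
#     value(dt.int64, 5)        # 5 is coerced to an int8 literal, which is
#                               # implicitly castable to int64 -> returned
#     value(dt.Interval, expr)  # a DataType *class*: accepted if expr.type()
#                               # is an instance of dt.Interval
#     value(dt.int8, 'foo')     # a string literal is not castable to int8
#                               # -> raises IbisTypeError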
def test_struct():
    orders = """array<struct<
                    oid: int64,
                    status: string,
                    totalprice: decimal(12, 2),
                    order_date: string,
                    items: array<struct<
                        iid: int64,
                        name: string,
                        price: decimal(12, 2),
                        discount_perc: decimal(12, 2),
                        shipdate: string
                    >>
                >>"""
    expected = dt.Array(
        dt.Struct.from_tuples(
            [
                ('oid', dt.int64),
                ('status', dt.string),
                ('totalprice', dt.Decimal(12, 2)),
                ('order_date', dt.string),
                (
                    'items',
                    dt.Array(
                        dt.Struct.from_tuples(
                            [
                                ('iid', dt.int64),
                                ('name', dt.string),
                                ('price', dt.Decimal(12, 2)),
                                ('discount_perc', dt.Decimal(12, 2)),
                                ('shipdate', dt.string),
                            ]
                        )
                    ),
                ),
            ]
        )
    )

    assert dt.dtype(orders) == expected
def test_char_varchar_invalid(spec):
    with pytest.raises(IbisTypeError):
        dt.dtype(spec)
def literal(value, type=None):
    """Create a scalar expression from a Python value.

    Parameters
    ----------
    value : some Python basic type
        A Python value
    type : ibis type or string, optional
        An instance of :class:`ibis.expr.datatypes.DataType` or a string
        indicating the ibis type of `value`. This parameter should only be
        used in cases where ibis's type inference isn't sufficient for
        discovering the type of `value`.

    Returns
    -------
    literal_value : Literal
        An expression representing a literal value

    Examples
    --------
    >>> import ibis
    >>> x = ibis.literal(42)
    >>> x.type()
    int8
    >>> y = ibis.literal(42, type='double')
    >>> y.type()
    float64
    >>> ibis.literal('foobar', type='int64')  # doctest: +ELLIPSIS
    Traceback (most recent call last):
      ...
    TypeError: Value 'foobar' cannot be safely coerced to int64
    """
    import ibis.expr.datatypes as dt
    import ibis.expr.operations as ops

    if hasattr(value, 'op') and isinstance(value.op(), ops.Literal):
        return value

    try:
        inferred_dtype = dt.infer(value)
    except com.InputTypeError:
        has_inferred = False
    else:
        has_inferred = True

    if type is None:
        has_explicit = False
    else:
        has_explicit = True
        explicit_dtype = dt.dtype(type)

    if has_explicit and has_inferred:
        try:
            # ensure type correctness: check that the inferred dtype is
            # implicitly castable to the explicitly given dtype and value
            dtype = inferred_dtype.cast(explicit_dtype, value=value)
        except com.IbisTypeError:
            raise TypeError(
                'Value {!r} cannot be safely coerced to {}'.format(value, type)
            )
    elif has_explicit:
        dtype = explicit_dtype
    elif has_inferred:
        dtype = inferred_dtype
    else:
        raise TypeError(
            'The datatype of value {!r} cannot be inferred, try '
            'passing it explicitly with the `type` keyword.'.format(value)
        )

    if dtype is dt.null:
        return null().cast(dtype)
    else:
        return ops.Literal(value, dtype=dtype).to_expr()
def test_decimal_failure(case):
    with pytest.raises(IbisTypeError):
        dt.dtype(case)
def _grouped(input_type, output_type, base_class, output_type_method):
    """Define a user-defined function that is applied per group.

    Parameters
    ----------
    input_type : List[ibis.expr.datatypes.DataType]
        A list of the types found in :mod:`~ibis.expr.datatypes`. The
        length of this list must match the number of arguments to the
        function. Variadic arguments are not yet supported.
    output_type : ibis.expr.datatypes.DataType
        The return type of the function.
    base_class : Type[T]
        The base class of the generated Node
    output_type_method : Callable
        A callable that determines the method to call to get the expression
        type of the UDF

    See Also
    --------
    ibis.pandas.udf.reduction
    ibis.pandas.udf.analytic
    """
    input_type = list(map(dt.dtype, input_type))
    output_type = dt.dtype(output_type)

    def wrapper(func):
        funcsig = valid_function_signature(input_type, func)

        UDAFNode = type(
            func.__name__,
            (base_class,),
            {
                'signature': sig.TypeSignature.from_dtypes(input_type),
                'output_type': output_type_method(output_type),
            },
        )

        # An execution rule for a simple aggregate node
        @execute_node.register(
            UDAFNode, *udf_signature(input_type, pin=None, klass=pd.Series)
        )
        def execute_udaf_node(op, *args, **kwargs):
            args, kwargs = arguments_from_signature(funcsig, *args, **kwargs)
            return func(*args, **kwargs)

        # An execution rule for a grouped aggregation node. This
        # includes aggregates applied over a window.
        nargs = len(input_type)
        group_by_signatures = [
            udf_signature(input_type, pin=pin, klass=SeriesGroupBy)
            for pin in range(nargs)
        ]

        @toolz.compose(
            *(
                execute_node.register(UDAFNode, *types)
                for types in group_by_signatures
            )
        )
        def execute_udaf_node_groupby(op, *args, **kwargs):
            # construct a generator that yields the next group of data
            # for every argument excluding the first (pandas performs
            # the iteration for the first argument) for each argument
            # that is a SeriesGroupBy.
            #
            # If the argument is not a SeriesGroupBy then keep
            # repeating it until all groups are exhausted.
            aggcontext = kwargs.pop('aggcontext', None)
            assert aggcontext is not None, 'aggcontext is None'

            if isinstance(aggcontext, Window):
                # Call the func differently for Window because of
                # the custom rolling logic.
                result = aggcontext.agg(args[0], func, *args, **kwargs)
            else:
                iters = create_gens_from_args_groupby(args[1:])
                funcsig = signature(func)

                # TODO: Unify calling convention here to be more like
                # window
                def aggregator(first, *rest, **kwargs):
                    # map(next, *rest) gets the inputs for the next group
                    # TODO: might be inefficient to do this on every call
                    args, kwargs = arguments_from_signature(
                        funcsig, first, *map(next, rest), **kwargs
                    )
                    return func(*args, **kwargs)

                result = aggcontext.agg(args[0], aggregator, *iters, **kwargs)

            return result

        @functools.wraps(func)
        def wrapped(*args):
            return UDAFNode(*args).to_expr()

        return wrapped

    return wrapper
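# Hedged usage sketch, not from the original source: the public decorators
# referenced in `See Also` above are built on _grouped. The import path and
# keyword names are assumptions based on that docstring.
#
#     from ibis.pandas import udf
#
#     @udf.reduction(input_type=[dt.double], output_type=dt.double)
#     def my_mean(series):
#         return series.mean()
#
#     # used like a built-in aggregation:
#     # t.groupby('key').aggregate(avg=my_mean(t.value))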
def test_pandas_dtype(pandas_dtype, ibis_dtype):
    assert dt.dtype(pandas_dtype) == ibis_dtype
def test_token_error():
    with pytest.raises(IbisTypeError):
        dt.dtype('array<string>>')
def test_map_does_not_allow_non_primitive_keys():
    with pytest.raises(IbisTypeError):
        dt.dtype('map<array<string>, double>')
def test_array():
    assert dt.dtype('ARRAY<DOUBLE>') == dt.Array(dt.double)
def test_timestamp_with_timezone_parser_double_quote():
    t = dt.dtype('timestamp("US/Eastern")')
    assert isinstance(t, dt.Timestamp)
    assert t.timezone == 'US/Eastern'
def test_char_varchar_invalid(spec):
    with pytest.raises(SyntaxError):
        dt.dtype(spec)
def test_primitive(spec, expected):
    assert dt.dtype(spec) == expected
def test_char_varchar(spec):
    assert dt.dtype(spec) == dt.string
    (operator.pow, 'b', 1.5, 'double'),
    (operator.pow, 'c', 1.5, 'double'),
    (operator.pow, 'd', 1.5, 'double'),
    (operator.pow, 'e', 2, 'float'),
    (operator.pow, 'f', 2, 'double'),
    (operator.pow, 'a', -2, 'double'),
    (operator.pow, 'b', -2, 'double'),
    (operator.pow, 'c', -2, 'double'),
    (operator.pow, 'd', -2, 'double'),
], ids=lambda arg: str(getattr(arg, '__name__', arg)))
def test_literal_promotions(table, op, name, case, ex_type):
    col = table[name]

    result = op(col, case)
    assert result.type() == dt.dtype(ex_type)

    result = op(case, col)
    assert result.type() == dt.dtype(ex_type)


@pytest.mark.parametrize(('op', 'left_fn', 'right_fn', 'ex_type'), [
    (operator.sub, lambda t: t['a'], lambda t: 0, 'int8'),
    (operator.sub, lambda t: 0, lambda t: t['a'], 'int16'),
    (operator.sub, lambda t: t['b'], lambda t: 0, 'int16'),
    (operator.sub, lambda t: 0, lambda t: t['b'], 'int32'),
    (operator.sub, lambda t: t['c'], lambda t: 0, 'int32'),
    (operator.sub, lambda t: 0, lambda t: t['c'], 'int64'),
], ids=lambda arg: str(getattr(arg, '__name__', arg)))
def test_zero_subtract_literal_promotions(table, op, left_fn, right_fn,
                                          ex_type):
    # in case of zero subtract the order of operands matters
    left, right = left_fn(table), right_fn(table)
    result = op(left, right)

    assert result.type() == dt.dtype(ex_type)
def spark_dataframe_schema(df):
    """Infer the schema of a Spark SQL `DataFrame` object."""
    # df.schema is a pt.StructType
    schema_struct = dt.dtype(df.schema)
    return sch.schema(schema_struct.names, schema_struct.types)
def test_timestamp_with_timezone_parser_invalid_timezone():
    ts = dt.dtype("timestamp('US/Ea')")
    assert str(ts) == "timestamp('US/Ea')"
def test_empty_complex_type():
    with pytest.raises(IbisTypeError):
        dt.dtype('map<>')
def test_nested_array():
    assert dt.dtype('array<array<string>>') == dt.Array(dt.Array(dt.string))
def test_time_valid():
    assert dt.dtype('time').equals(dt.time)
def test_interval_invalid_type():
    with pytest.raises(TypeError):
        dt.Interval('m', dt.float32)

    with pytest.raises(TypeError):
        dt.dtype("interval<float>('s')")
def literal(value: Any, type: dt.DataType | str | None = None) -> ScalarExpr:
    """Create a scalar expression from a Python value.

    !!! tip "Use specific functions for arrays, structs and maps"

        Ibis supports literal construction of arrays using the following
        functions:

        1. [`ibis.array`][ibis.array]
        1. [`ibis.struct`][ibis.struct]
        1. [`ibis.map`][ibis.map]

        Constructing these types using `literal` will be deprecated in a
        future release.

    Parameters
    ----------
    value
        A Python value
    type
        An instance of [`DataType`][ibis.expr.datatypes.DataType] or a string
        indicating the ibis type of `value`. This parameter can be used in
        cases where ibis's type inference isn't sufficient for discovering
        the type of `value`.

    Returns
    -------
    ScalarExpr
        An expression representing a literal value

    Examples
    --------
    Construct an integer literal

    >>> import ibis
    >>> x = ibis.literal(42)
    >>> x.type()
    Int8(nullable=True)

    Construct a `float64` literal from an `int`

    >>> y = ibis.literal(42, type='double')
    >>> y.type()
    Float64(nullable=True)

    Ibis checks for invalid types

    >>> ibis.literal('foobar', type='int64')  # doctest: +ELLIPSIS
    Traceback (most recent call last):
      ...
    TypeError: Value 'foobar' cannot be safely coerced to int64
    """
    import ibis.expr.datatypes as dt
    import ibis.expr.operations as ops

    if hasattr(value, 'op') and isinstance(value.op(), ops.Literal):
        return value

    try:
        inferred_dtype = dt.infer(value)
    except com.InputTypeError:
        has_inferred = False
    else:
        has_inferred = True

    if type is None:
        has_explicit = False
    else:
        has_explicit = True
        explicit_dtype = dt.dtype(type)

    if has_explicit and has_inferred:
        try:
            # ensure type correctness: check that the inferred dtype is
            # implicitly castable to the explicitly given dtype and value
            dtype = inferred_dtype.cast(explicit_dtype, value=value)
        except com.IbisTypeError:
            raise TypeError(
                f'Value {value!r} cannot be safely coerced to {type}'
            )
    elif has_explicit:
        dtype = explicit_dtype
    elif has_inferred:
        dtype = inferred_dtype
    else:
        raise TypeError(
            'The datatype of value {!r} cannot be inferred, try '
            'passing it explicitly with the `type` keyword.'.format(value)
        )

    if dtype is dt.null:
        return null().cast(dtype)
    else:
        value = dt._normalize(dtype, value)
        return ops.Literal(value, dtype=dtype).to_expr()
def test_map():
    assert dt.dtype('map<string, double>') == dt.Map(dt.string, dt.double)
def test_numpy_dtype(numpy_dtype, ibis_dtype):
    assert dt.dtype(np.dtype(numpy_dtype)) == ibis_dtype
def __init__(self, input_type, output_type):
    self.input_type = list(map(dt.dtype, input_type))
    self.output_type = dt.dtype(output_type)
    self.spark_output_type = spark_dtype(self.output_type)
def test_map_does_not_allow_non_primitive_keys():
    with pytest.raises(SyntaxError):
        dt.dtype('map<array<string>, double>')
def test_dtype(spec, expected):
    assert dt.dtype(spec) == expected
def test_primitive_from_string(spec, expected):
    assert dt.dtype(spec) == expected
def infer_numpy_scalar(value):
    return dt.dtype(value.dtype)
def infer_array(value):
    # TODO(kszucs): infer series
    return dt.Array(dt.dtype(value.dtype.name))
def test_empty_complex_type():
    with pytest.raises(parsy.ParseError):
        dt.dtype('map<>')
def test_string_argument_parsing_failure_mode(case):
    with pytest.raises(IbisTypeError):
        dt.dtype(case)
def test_token_error():
    with pytest.raises(parsy.ParseError):
        dt.dtype('array<string>>')
def test_char_varchar_invalid(spec):
    with pytest.raises(parsy.ParseError):
        dt.dtype(spec)
def test_nested_map():
    expected = dt.Map(dt.int64, dt.Array(dt.Map(dt.string, dt.int8)))
    assert dt.dtype('map<int64, array<map<string, int8>>>') == expected
def datatype(arg):
    return dt.dtype(arg)