class DataType(Annotable, Comparable):
    """Common ancestor of every data type.

    [`DataType`][ibis.expr.datatypes.DataType] instances are immutable;
    calling an instance constructs a new one rather than modifying it.
    """

    nullable = optional(instance_of(bool), default=True)

    def __call__(self, nullable: bool = True) -> DataType:
        """Build a new instance of this type with the requested nullability.

        Parameters
        ----------
        nullable
            Must be exactly `True` or `False`.

        Returns
        -------
        DataType
            A new instance of the same class with the same arguments, except
            for `nullable`.

        Raises
        ------
        TypeError
            If `nullable` is anything other than the `True`/`False` objects.
        """
        if not (nullable is True or nullable is False):
            raise TypeError(
                "__call__ only accepts the 'nullable' argument. "
                "Please construct a new instance of the type to change the "
                "values of the attributes."
            )
        fields = {**dict(zip(self.argnames, self.args)), "nullable": nullable}
        return self.__class__(**fields)

    @property
    def _pretty_piece(self) -> str:
        """Suffix appended to the type name by `__str__`; empty here."""
        return ""

    @property
    def name(self) -> str:
        """Return the name of the data type (the class name)."""
        return type(self).__name__

    def __str__(self) -> str:
        # Non-nullable types are rendered with a leading "!".
        marker = "" if self.nullable else "!"
        return f"{marker}{self.name.lower()}{self._pretty_piece}"

    def __equals__(
        self,
        other: typing.Any,
    ) -> bool:
        """Compare the argument tuples of `self` and `other`."""
        return self.args == other.args

    def equals(self, other):
        """Compare against another data type via `__cached_equals__`.

        Raises
        ------
        TypeError
            If `other` is not a `DataType`.
        """
        if isinstance(other, DataType):
            return super().__cached_equals__(other)
        raise TypeError(
            "invalid equality comparison between DataType and "
            f"{type(other)}"
        )

    def castable(self, target, **kwargs):
        """Return whether this data type is castable to `target`."""
        return castable(self, target, **kwargs)

    def cast(self, target, **kwargs):
        """Cast this data type to `target`."""
        return cast(self, target, **kwargs)
class Timestamp(DataType):
    """Timestamp values."""

    timezone = optional(instance_of(str))
    """The timezone of values of this type."""

    scalar = ir.TimestampScalar
    column = ir.TimestampColumn

    @property
    def _pretty_piece(self) -> str:
        """Render the timezone as ``(<repr>)``, or nothing when it is unset."""
        tz = self.timezone
        return "" if tz is None else f"({tz!r})"
class Category(DataType):
    """Data type carrying an optional integer `cardinality`."""

    cardinality = optional(instance_of(int))

    scalar = ir.CategoryScalar
    column = ir.CategoryColumn

    def __repr__(self):
        """Show the type name with its cardinality, or ``unknown`` if unset."""
        known = self.cardinality is not None
        shown = repr(self.cardinality) if known else "unknown"
        return f"{self.name}(cardinality={shown})"

    def to_integer_type(self):
        """Return `int64` when the cardinality is unset, otherwise the result
        of `infer` applied to the cardinality."""
        if self.cardinality is not None:
            return infer(self.cardinality)
        return int64
class GeoSpatial(DataType):
    """Geospatial values."""

    geotype = optional(isin({"geography", "geometry"}))
    """The specific geospatial type"""

    srid = optional(instance_of(int))
    """The spatial reference identifier."""

    column = ir.GeoSpatialColumn
    scalar = ir.GeoSpatialScalar

    @property
    def _pretty_piece(self) -> str:
        """Render ``:<geotype>`` and ``;<srid>``, each only when it is set."""
        parts = []
        if self.geotype is not None:
            parts.append(f":{self.geotype}")
        if self.srid is not None:
            parts.append(f";{self.srid}")
        return "".join(parts)
class Schema(Annotable, Comparable):
    """An object for holding table schema information, i.e., column names and
    types.

    Parameters
    ----------
    names : Sequence[str]
        A sequence of ``str`` indicating the name of each column.
    types : Sequence[DataType]
        A sequence of :class:`ibis.expr.datatypes.DataType` objects
        representing type of each column.
    """

    __slots__ = ('_name_locs',)

    names: Sequence[str] = tuple_of(instance_of((str, UnnamedMarker)))
    types: Sequence[dt.DataType] = tuple_of(datatype)

    @immutable_property
    def _name_locs(self):
        """Map each column name to its position.

        Raises
        ------
        IntegrityError
            If the same column name appears more than once.
        """
        # validate unique field names
        name_locs = {v: i for i, v in enumerate(self.names)}
        if len(name_locs) < len(self.names):
            # Remove one occurrence of every distinct name; whatever is left
            # over are the repeated names.
            duplicate_names = list(self.names)
            for v in name_locs:
                duplicate_names.remove(v)
            raise IntegrityError(
                f'Duplicate column name(s): {duplicate_names}'
            )
        return name_locs

    def __repr__(self):
        # Pad names to the longest one plus two spaces so types line up.
        space = 2 + max(map(len, self.names), default=0)
        rows = ''.join(
            f'\n{name.ljust(space)}{type_!s}'
            for name, type_ in zip(self.names, self.types)
        )
        return "ibis.Schema {{{}\n}}".format(indent(rows, 2))

    def __len__(self):
        return len(self.names)

    def __iter__(self):
        return iter(self.names)

    def __contains__(self, name):
        return name in self._name_locs

    def __getitem__(self, name):
        return self.types[self._name_locs[name]]

    def __equals__(self, other):
        # The hash comparison comes first so unequal schemas can be rejected
        # before the names and types are compared.
        return (
            self._hash == other._hash
            and self.names == other.names
            and self.types == other.types
        )

    def equals(self, other):
        """Compare against another schema via `__cached_equals__`.

        Raises
        ------
        TypeError
            If `other` is not a `Schema`.
        """
        if not isinstance(other, Schema):
            raise TypeError(
                "invalid equality comparison between Schema and "
                f"{type(other)}"
            )
        return self.__cached_equals__(other)

    def delete(self, names_to_delete):
        """Return a new schema without the given columns.

        Parameters
        ----------
        names_to_delete
            Iterable of column names to drop. One-shot iterables such as
            generators are supported.

        Raises
        ------
        KeyError
            If any requested name is not a column of this schema.
        """
        # Materialize once: the names are needed for validation and again for
        # filtering, and a generator would be exhausted after the first pass.
        names_to_delete = tuple(names_to_delete)
        for name in names_to_delete:
            if name not in self:
                raise KeyError(name)

        # Every name passed the (hash-based) membership check above, so a set
        # is safe here and gives constant-time lookups.
        doomed = frozenset(names_to_delete)
        kept = [
            (name, type_)
            for name, type_ in zip(self.names, self.types)
            if name not in doomed
        ]
        new_names = [name for name, _ in kept]
        new_types = [type_ for _, type_ in kept]
        return Schema(new_names, new_types)

    @classmethod
    def from_tuples(cls, values):
        """Build a schema from an iterable of ``(name, type)`` pairs.

        An empty iterable produces an empty schema.
        """
        if not isinstance(values, (list, tuple)):
            values = list(values)
        names, types = zip(*values) if values else ([], [])
        return cls(names, types)

    @classmethod
    def from_dict(cls, dictionary):
        """Build a schema from a mapping of column name to type.

        An empty mapping produces an empty schema.
        """
        names, types = zip(*dictionary.items()) if dictionary else ([], [])
        return cls(names, types)

    def __gt__(self, other):
        # Strict superset of the other's (name, type) pairs.
        return set(self.items()) > set(other.items())

    def __ge__(self, other):
        # Superset of (or equal to) the other's (name, type) pairs.
        return set(self.items()) >= set(other.items())

    def append(self, schema):
        """Return a new schema with the columns of `schema` added at the end."""
        return Schema(self.names + schema.names, self.types + schema.types)

    def items(self):
        """Return an iterator over ``(name, type)`` pairs."""
        return zip(self.names, self.types)

    def name_at_position(self, i):
        """Return the name of the column at index `i`.

        Raises
        ------
        ValueError
            If `i` is not between 0 and ``len(self.names) - 1``, inclusive.
        """
        upper = len(self.names) - 1
        if not 0 <= i <= upper:
            raise ValueError(
                'Column index must be between 0 and {:d}, inclusive'.format(
                    upper
                )
            )
        return self.names[i]

    def apply_to(self, df):
        """Applies the Ibis schema to a pandas DataFrame

        Parameters
        ----------
        df : pandas.DataFrame

        Returns
        -------
        df : pandas.DataFrame

        Notes
        -----
        Mutates `df`
        """
        schema_names = self.names
        data_columns = df.columns

        assert len(schema_names) == len(
            data_columns
        ), "schema column count does not match input data column count"

        for column, dtype in zip(data_columns, self.types):
            pandas_dtype = dtype.to_pandas()

            col = df[column]
            col_dtype = col.dtype

            try:
                not_equal = pandas_dtype != col_dtype
            except TypeError:
                # ugh, we can't compare dtypes coming from pandas,
                # assume not equal
                not_equal = True

            # String and Struct columns are always converted, even when the
            # dtypes compare equal.
            if not_equal or isinstance(dtype, (dt.String, dt.Struct)):
                new_col = convert(col_dtype, dtype, col)
            else:
                new_col = col
            df[column] = new_col

        # return data with the schema's columns which may be different than the
        # input columns
        df.columns = schema_names
        return df
def client(arg, **kwargs):
    """Validate `arg` with `instance_of` against `BaseBackend`.

    Extra keyword arguments are accepted but not used.
    """
    # Imported inside the function, as in the original definition.
    from ibis.backends.base import BaseBackend

    backend_cls = BaseBackend
    return instance_of(backend_cls, arg)
def column(inner, arg, **kwargs):
    """Apply `inner` to `arg`, then validate the result with `instance_of`
    against `ir.Column`."""
    validated = inner(arg, **kwargs)
    return instance_of(ir.Column, validated)
def scalar(inner, arg, **kwargs):
    """Apply `inner` to `arg`, then validate the result with `instance_of`
    against `ir.Scalar`."""
    validated = inner(arg, **kwargs)
    return instance_of(ir.Scalar, validated)
class Struct(DataType):
    """Structured values."""

    names = tuple_of(instance_of(str))
    types = tuple_of(datatype)

    scalar = ir.StructScalar
    column = ir.StructColumn

    @classmethod
    def from_tuples(
        cls,
        pairs: Iterable[tuple[str, str | DataType]],
        nullable: bool = True,
    ) -> Struct:
        """Construct a `Struct` type from pairs.

        Parameters
        ----------
        pairs
            An iterable of pairs of field name and type. An empty iterable
            yields a struct with no fields.
        nullable
            Whether the resulting type is nullable.

        Returns
        -------
        Struct
            Struct data type instance
        """
        # `zip(*[])` produces nothing to unpack, so the empty case needs its
        # own branch; materializing first also makes the emptiness check work
        # for one-shot iterables.
        pairs = list(pairs)
        names, types = zip(*pairs) if pairs else ((), ())
        return cls(names, types, nullable=nullable)

    @classmethod
    def from_dict(
        cls,
        pairs: Mapping[str, str | DataType],
        nullable: bool = True,
    ) -> Struct:
        """Construct a `Struct` type from a [`dict`][dict].

        Parameters
        ----------
        pairs
            A [`dict`][dict] of `field: type`
        nullable
            Whether the resulting type is nullable.

        Returns
        -------
        Struct
            Struct data type instance
        """
        names, types = pairs.keys(), pairs.values()
        return cls(names, types, nullable=nullable)

    @property
    def pairs(self) -> Mapping[str, DataType]:
        """Return a mapping from names to data type instances.

        Returns
        -------
        Mapping[str, DataType]
            Mapping of field name to data type
        """
        return dict(zip(self.names, self.types))

    def __getitem__(self, key: str) -> DataType:
        """Return the data type of the field named `key`."""
        return self.pairs[key]

    def __repr__(self) -> str:
        return '{}({}, nullable={})'.format(
            self.name, list(self.pairs.items()), self.nullable
        )

    @property
    def _pretty_piece(self) -> str:
        """Render the fields as ``<name: type, ...>``."""
        pairs = ", ".join(map("{}: {}".format, self.names, self.types))
        return f"<{pairs}>"
class Interval(DataType):
    """Interval values."""

    unit = optional(
        map_to(
            {
                "days": "D",
                "hours": "h",
                "minutes": "m",
                "seconds": "s",
                "milliseconds": "ms",
                "microseconds": "us",
                "nanoseconds": "ns",
                "Y": "Y",
                "Q": "Q",
                "M": "M",
                "W": "W",
                "D": "D",
                "h": "h",
                "m": "m",
                "s": "s",
                "ms": "ms",
                "us": "us",
                "ns": "ns",
            }
        ),
        default="s",
    )
    """The time unit of the interval."""

    value_type = optional(
        compose_of([datatype, instance_of(Integer)]),
        default=Int32(),
    )
    """The underlying type of the stored values."""

    scalar = ir.IntervalScalar
    column = ir.IntervalColumn

    # based on numpy's units
    _units = {
        "Y": "year",
        "Q": "quarter",
        "M": "month",
        "W": "week",
        "D": "day",
        "h": "hour",
        "m": "minute",
        "s": "second",
        "ms": "millisecond",
        "us": "microsecond",
        "ns": "nanosecond",
    }

    # TODO(kszucs): assert that the nullability of the value_type is equal
    # to the interval's nullability

    @property
    def bounds(self):
        """The bounds of the underlying value type."""
        underlying = self.value_type
        return underlying.bounds

    @property
    def resolution(self):
        """The interval unit's name."""
        code = self.unit
        return self._units[code]

    @property
    def _pretty_piece(self) -> str:
        """Render as ``<value_type>(unit=<repr of unit>)``."""
        inner, code = self.value_type, self.unit
        return f"<{inner}>(unit={code!r})"
class Decimal(DataType): """Fixed-precision decimal values.""" precision = optional(instance_of(int)) """The number of decimal places values of this type can hold.""" scale = optional(instance_of(int)) """The number of values after the decimal point.""" scalar = ir.DecimalScalar column = ir.DecimalColumn def __init__( self, precision: int | None = None, scale: int | None = None, **kwargs: Any, ) -> None: if precision is not None: if not isinstance(precision, numbers.Integral): raise TypeError( "Decimal type precision must be an integer; " f"got {type(precision)}" ) if precision < 0: raise ValueError('Decimal type precision cannot be negative') if not precision: raise ValueError('Decimal type precision cannot be zero') if scale is not None: if not isinstance(scale, numbers.Integral): raise TypeError('Decimal type scale must be an integer') if scale < 0: raise ValueError('Decimal type scale cannot be negative') if precision is not None and precision < scale: raise ValueError( 'Decimal type precision must be greater than or equal to ' 'scale. Got precision={:d} and scale={:d}'.format( precision, scale ) ) super().__init__(precision=precision, scale=scale, **kwargs) @property def largest(self): """Return the largest type of decimal.""" return self.__class__( precision=max(self.precision, 38) if self.precision is not None else None, scale=max(self.scale, 2) if self.scale is not None else None, ) @property def _pretty_piece(self) -> str: args = [] if (precision := self.precision) is not None: args.append(f"prec={precision:d}") if (scale := self.scale) is not None: args.append(f"scale={scale:d}")