def __init__(self, parent, grouper, apply):
    """Build a split-apply-combine node over ``parent``.

    Both ``grouper`` and ``apply`` are rebased onto an anonymous
    TableSymbol so this node does not embed the concrete parent
    expression.

    Raises
    ------
    TypeError
        If ``apply`` is not a reduction (its dshape still has a
        leading dimension).
    """
    self.parent = parent
    # Anonymous stand-in sharing the parent's schema/columnness.
    child = TableSymbol('', parent.schema, parent.iscolumn)
    self.grouper = grouper.subs({parent: child})
    self.apply = apply.subs({parent: child})
    if isdimension(self.apply.dshape[0]):
        raise TypeError("Expected Reduction")
def dshape_to_alchemy(dshape):
    """Translate a datashape into the matching SQLAlchemy type.

    >>> dshape_to_alchemy('int')
    <class 'sqlalchemy.sql.sqltypes.Integer'>

    >>> dshape_to_alchemy('string')
    <class 'sqlalchemy.sql.sqltypes.String'>

    >>> dshape_to_alchemy('{name: string, amount: int}')
    [Column('name', String(), table=None, nullable=False), Column('amount', Integer(), table=None, nullable=False)]

    >>> dshape_to_alchemy('{name: ?string, amount: ?int}')
    [Column('name', String(), table=None), Column('amount', Integer(), table=None)]
    """
    if isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    # Option measures map to their inner type; nullability is expressed
    # on the Column in the Record branch below.
    if isinstance(dshape, Option):
        return dshape_to_alchemy(dshape.ty)
    key = str(dshape)
    if key in types:
        return types[key]
    if isinstance(dshape, datashape.Record):
        columns = []
        for field, typ in dshape.parameters[0]:
            columns.append(sql.Column(field,
                                      dshape_to_alchemy(typ),
                                      nullable=isinstance(typ[0], Option)))
        return columns
    if isinstance(dshape, datashape.DataShape):
        # Drop a leading dimension if present; otherwise unwrap the measure.
        inner = dshape[1] if isdimension(dshape[0]) else dshape[0]
        return dshape_to_alchemy(inner)
    raise NotImplementedError("No SQLAlchemy dtype match for datashape: %s"
                              % dshape)
def __init__(self, name, dshape=None, iscolumn=False):
    """Create a named symbolic table with the given datashape.

    Parameters
    ----------
    name : str
        Identifier for this symbol.
    dshape : str or DataShape
        Shape of the table; a ``var`` outer dimension is prepended when
        the shape is not already dimensioned.
        NOTE(review): despite the ``None`` default, a real dshape
        appears to be required -- ``dshape[0]`` would fail on None.
    iscolumn : bool
        Whether this symbol represents a single column.
    """
    self._name = name
    if isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    # Bare record/measure shapes gain a variable-length outer dimension
    # so the symbol is always tabular.
    if not isdimension(dshape[0]):
        dshape = datashape.var * dshape
    self.dshape = dshape
    self.iscolumn = iscolumn
def discover(coll, n=50):
    """Discover the datashape of a collection by sampling documents.

    Samples up to ``n`` documents, strips the Mongo-internal ``_id``
    field, and extrapolates the discovered per-document shape to the
    full collection length.

    Parameters
    ----------
    coll : collection exposing ``find()`` and ``count()``
    n : int
        Maximum number of documents to sample (default 50).

    Raises
    ------
    ValueError
        If no consistent dimensioned datashape is found in the sample.
    """
    items = list(take(n, coll.find()))
    for item in items:
        # pop() with a default instead of del: tolerate documents that
        # lack an '_id' field rather than raising KeyError.
        item.pop('_id', None)
    ds = discover(items)
    if isdimension(ds[0]):
        # Scale the sampled shape up to the full collection size.
        return coll.count() * ds.subshape[0]
    raise ValueError("Consistent datashape not found")
def discover(coll, n=50):
    """Discover the datashape of a collection by sampling documents.

    Samples up to ``n`` documents, strips the Mongo-internal ``_id``
    field, and extrapolates the discovered per-document shape to the
    full collection length.

    Parameters
    ----------
    coll : collection exposing ``find()`` and ``count()``
    n : int
        Maximum number of documents to sample (default 50).

    Raises
    ------
    ValueError
        If no consistent dimensioned datashape is found in the sample.
    """
    items = list(take(n, coll.find()))
    for item in items:
        # pop() with a default instead of del: tolerate documents that
        # lack an '_id' field rather than raising KeyError.
        item.pop('_id', None)
    ds = discover(items)
    if isdimension(ds[0]):
        # Scale the sampled shape up to the full collection size.
        return coll.count() * ds.subshape[0]
    raise ValueError("Consistent datashape not found")
def __init__(self, data, dshape=None, name=None, columns=None,
             iscolumn=False, schema=None):
    """Wrap a concrete data source in a tabular expression.

    Parameters
    ----------
    data : str or table-like object
        The data source; a string is resolved via ``resource``.
    dshape : str or DataShape, optional
        Full datashape of the data (mutually exclusive with ``schema``).
    name : str, optional
        Name for this table; auto-generated from ``names`` when omitted.
    columns : list, optional
        Column names, required when they cannot be inferred from the
        datashape.
    iscolumn : bool
        Whether this table represents a single column.
    schema : str or DataShape, optional
        Per-row schema (mutually exclusive with ``dshape``).

    Raises
    ------
    ValueError
        If both ``schema`` and ``dshape`` are given.
    TypeError
        If column names or types cannot be inferred, or if the data
        object's own schema disagrees with the one supplied here.
    """
    # A string is treated as a URI/path and resolved to a data object.
    if isinstance(data, str):
        data = resource(data)
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    # Normalize: schema -> var-dimensioned dshape; str -> DataShape;
    # prepend a var dimension when the shape is not dimensioned.
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if dshape and not isdimension(dshape[0]):
        dshape = var * dshape
    # Fall back to discovering the shape from the data itself.
    if not dshape:
        dshape = discover(data)
    types = None
    # Derive column names and types from the measure of the dshape.
    if isinstance(dshape[1], Tuple):
        # Tuples carry no field names; use positional indices.
        columns = columns or list(range(len(dshape[1].dshapes)))
        types = dshape[1].dshapes
    if isinstance(dshape[1], Record):
        columns = columns or dshape[1].names
        types = dshape[1].types
    if isinstance(dshape[1], Fixed):
        # Homogeneous fixed-width rows: repeat the element type.
        types = (dshape[2], ) * int(dshape[1])
    if not columns:
        raise TypeError("Could not infer column names from data. "
                        "Please specify column names with `columns=` "
                        "keyword")
    if not types:
        raise TypeError("Could not infer data types from data. "
                        "Please specify schema with `schema=` keyword")
    # Rebuild the dshape as dimension * Record(columns, types).
    dshape = dshape[0] * datashape.dshape(
        Record(list(zip(columns, types))))
    self.dshape = datashape.dshape(dshape)
    self.data = data
    # Guard against a mismatch between the declared schema and the data
    # object's own schema attribute (when it has one).
    if (hasattr(data, 'schema') and
            isinstance(data.schema, (DataShape, str, unicode)) and
            self.schema != data.schema):
        raise TypeError('%s schema %s does not match %s schema %s' %
                        (type(data).__name__, data.schema,
                         type(self).__name__, self.schema))
    self._name = name or next(names)
    self.iscolumn = iscolumn
def __init__(self, data, dshape=None, name=None, columns=None,
             iscolumn=False, schema=None):
    """Wrap a concrete data source in a tabular expression.

    Parameters
    ----------
    data : str or table-like object
        The data source; a string is resolved via ``resource``.
    dshape : str or DataShape, optional
        Full datashape of the data (mutually exclusive with ``schema``).
    name : str, optional
        Name for this table; auto-generated from ``names`` when omitted.
    columns : list, optional
        Column names, required when they cannot be inferred from the
        datashape.
    iscolumn : bool
        Whether this table represents a single column.
    schema : str or DataShape, optional
        Per-row schema (mutually exclusive with ``dshape``).

    Raises
    ------
    ValueError
        If both ``schema`` and ``dshape`` are given.
    TypeError
        If column names or types cannot be inferred, or if the data
        object's own schema disagrees with the one supplied here.
    """
    # A string is treated as a URI/path and resolved to a data object.
    if isinstance(data, str):
        data = resource(data)
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    # Normalize: schema -> var-dimensioned dshape; str -> DataShape;
    # prepend a var dimension when the shape is not dimensioned.
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if dshape and not isdimension(dshape[0]):
        dshape = var * dshape
    # Fall back to discovering the shape from the data itself.
    if not dshape:
        dshape = discover(data)
    types = None
    # Derive column names and types from the measure of the dshape.
    if isinstance(dshape[1], Tuple):
        # Tuples carry no field names; use positional indices.
        columns = columns or list(range(len(dshape[1].dshapes)))
        types = dshape[1].dshapes
    if isinstance(dshape[1], Record):
        columns = columns or dshape[1].names
        types = dshape[1].types
    if isinstance(dshape[1], Fixed):
        # Homogeneous fixed-width rows: repeat the element type.
        types = (dshape[2],) * int(dshape[1])
    if not columns:
        # BUG FIX: the message previously said `column=`, but the actual
        # keyword argument is `columns=`.
        raise TypeError("Could not infer column names from data. "
                        "Please specify column names with `columns=` "
                        "keyword")
    if not types:
        raise TypeError("Could not infer data types from data. "
                        "Please specify schema with `schema=` keyword")
    # Rebuild the dshape as dimension * Record(columns, types).
    dshape = dshape[0] * datashape.dshape(Record(list(zip(columns, types))))
    self.dshape = datashape.dshape(dshape)
    self.data = data
    # Guard against a mismatch between the declared schema and the data
    # object's own schema attribute (when it has one).
    if (hasattr(data, 'schema') and
            isinstance(data.schema, (DataShape, str, unicode)) and
            self.schema != data.schema):
        raise TypeError('%s schema %s does not match %s schema %s' %
                        (type(data).__name__, data.schema,
                         type(self).__name__, self.schema))
    self._name = name or next(names)
    self.iscolumn = iscolumn
def dshape_to_alchemy(dshape):
    """Translate a datashape into the matching SQLAlchemy type.

    >>> dshape_to_alchemy('int')
    <class 'sqlalchemy.sql.sqltypes.Integer'>

    >>> dshape_to_alchemy('string')
    <class 'sqlalchemy.sql.sqltypes.Text'>

    >>> dshape_to_alchemy('{name: string, amount: int}')
    [Column('name', Text(), table=None, nullable=False), Column('amount', Integer(), table=None, nullable=False)]

    >>> dshape_to_alchemy('{name: ?string, amount: ?int}')
    [Column('name', Text(), table=None), Column('amount', Integer(), table=None)]
    """
    if isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    # Option measures map to their inner type; nullability is expressed
    # on the Column in the Record branch below.
    if isinstance(dshape, Option):
        return dshape_to_alchemy(dshape.ty)
    key = str(dshape)
    if key in types:
        return types[key]
    if isinstance(dshape, datashape.Record):
        columns = []
        for field, typ in dshape.parameters[0]:
            columns.append(sql.Column(field,
                                      dshape_to_alchemy(typ),
                                      nullable=isinstance(typ[0], Option)))
        return columns
    if isinstance(dshape, datashape.DataShape):
        # Drop a leading dimension if present; otherwise unwrap the measure.
        inner = dshape[1] if isdimension(dshape[0]) else dshape[0]
        return dshape_to_alchemy(inner)
    if isinstance(dshape, datashape.String):
        # Unbounded strings become Text; fixed-length strings become
        # Unicode or String depending on the declared encoding.
        if dshape[0].fixlen is None:
            return sql.types.Text
        if 'U' in dshape.encoding:
            return sql.types.Unicode(length=dshape[0].fixlen)
        if 'A' in dshape.encoding:
            return sql.types.String(length=dshape[0].fixlen)
    if isinstance(dshape, datashape.DateTime):
        # Preserve timezone-awareness in the SQL type.
        return sql.types.DateTime(timezone=bool(dshape.tz))
    raise NotImplementedError("No SQLAlchemy dtype match for datashape: %s"
                              % dshape)
def deoption(ds):
    """Strip an Option (nullability) wrapper from a datashape.

    >>> deoption('int32')
    ctype("int32")

    >>> deoption('?int32')
    ctype("int32")
    """
    if isinstance(ds, str):
        ds = dshape(ds)
    # Unwrap a measure-only DataShape (never strips a real dimension) so
    # the Option check sees the bare measure.
    if isinstance(ds, DataShape) and not isdimension(ds[0]):
        return deoption(ds[0])
    return ds.ty if isinstance(ds, Option) else ds
def deoption(ds):
    """Strip an Option (nullability) wrapper from a datashape.

    >>> deoption('int32')
    ctype("int32")

    >>> deoption('?int32')
    ctype("int32")
    """
    if isinstance(ds, str):
        ds = dshape(ds)
    # Unwrap a measure-only DataShape (no leading dimension) so the
    # Option check below sees the bare measure.
    if isinstance(ds, DataShape) and not isdimension(ds[0]):
        return deoption(ds[0])
    if isinstance(ds, Option):
        return ds.ty
    else:
        return ds
def dshape_to_alchemy(dshape):
    """Convert a datashape into the corresponding SQLAlchemy type.

    >>> dshape_to_alchemy('int')
    <class 'sqlalchemy.sql.sqltypes.Integer'>
    >>> dshape_to_alchemy('string')
    <class 'sqlalchemy.sql.sqltypes.Text'>
    >>> dshape_to_alchemy('{name: string, amount: int}')
    [Column('name', Text(), table=None, nullable=False), Column('amount', Integer(), table=None, nullable=False)]
    >>> dshape_to_alchemy('{name: ?string, amount: ?int}')
    [Column('name', Text(), table=None), Column('amount', Integer(), table=None)]
    """
    if isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    # Option (nullable) measures map to their inner type; nullability is
    # carried by the Column in the Record branch below.
    if isinstance(dshape, Option):
        return dshape_to_alchemy(dshape.ty)
    if str(dshape) in types:
        return types[str(dshape)]
    if isinstance(dshape, datashape.Record):
        return [sql.Column(name,
                           dshape_to_alchemy(typ),
                           nullable=isinstance(typ[0], Option))
                for name, typ in dshape.parameters[0]]
    if isinstance(dshape, datashape.DataShape):
        # Drop a leading dimension; otherwise unwrap the single measure.
        if isdimension(dshape[0]):
            return dshape_to_alchemy(dshape[1])
        else:
            return dshape_to_alchemy(dshape[0])
    if isinstance(dshape, datashape.String):
        # Unbounded strings -> Text; fixed-length -> Unicode/String by
        # encoding.  NOTE(review): fixlen is read via dshape[0] but
        # encoding via dshape directly -- confirm both accessors are
        # valid on datashape.String.
        if dshape[0].fixlen is None:
            return sql.types.Text
        if 'U' in dshape.encoding:
            return sql.types.Unicode(length=dshape[0].fixlen)
        if 'A' in dshape.encoding:
            return sql.types.String(length=dshape[0].fixlen)
    if isinstance(dshape, datashape.DateTime):
        # Preserve timezone-awareness in the SQL type.
        if dshape.tz:
            return sql.types.DateTime(timezone=True)
        else:
            return sql.types.DateTime(timezone=False)
    raise NotImplementedError("No SQLAlchemy dtype match for datashape: %s"
                              % dshape)
def ds_to_sparksql(ds):
    """ Convert datashape to SparkSQL type system

    >>> print(ds_to_sparksql('int32'))  # doctest: +SKIP
    IntegerType

    >>> print(ds_to_sparksql('5 * int32'))  # doctest: +SKIP
    ArrayType(IntegerType,false)

    >>> print(ds_to_sparksql('5 * ?int32'))  # doctest: +SKIP
    ArrayType(IntegerType,true)

    >>> print(ds_to_sparksql('{name: string, amount: int32}'))  # doctest: +SKIP
    StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,false)))

    >>> print(ds_to_sparksql('10 * {name: string, amount: ?int32}'))  # doctest: +SKIP
    ArrayType(StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,true))),false)
    """
    if isinstance(ds, str):
        return ds_to_sparksql(dshape(ds))
    if isinstance(ds, Record):
        # Each record field keeps its nullability flag on the StructField.
        fields = [sql.StructField(name,
                                  ds_to_sparksql(deoption(typ)),
                                  isinstance(typ, datashape.Option))
                  for name, typ in ds.fields]
        return sql.StructType(fields)
    if isinstance(ds, DataShape):
        if not isdimension(ds[0]):
            return ds_to_sparksql(ds[0])
        elem = ds.subshape[0]
        # Collapse a one-measure DataShape to its bare measure.
        if isinstance(elem, DataShape) and len(elem) == 1:
            elem = elem[0]
        return sql.ArrayType(ds_to_sparksql(deoption(elem)),
                             isinstance(elem, Option))
    if ds in types:
        return types[ds]
    raise NotImplementedError()
def ds_to_sparksql(ds):
    """ Convert datashape to SparkSQL type system

    >>> print(ds_to_sparksql('int32'))  # doctest: +SKIP
    IntegerType

    >>> print(ds_to_sparksql('5 * int32'))  # doctest: +SKIP
    ArrayType(IntegerType,false)

    >>> print(ds_to_sparksql('5 * ?int32'))  # doctest: +SKIP
    ArrayType(IntegerType,true)

    >>> print(ds_to_sparksql('{name: string, amount: int32}'))  # doctest: +SKIP
    StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,false)))

    >>> print(ds_to_sparksql('10 * {name: string, amount: ?int32}'))  # doctest: +SKIP
    ArrayType(StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,true))),false)
    """
    if isinstance(ds, str):
        return ds_to_sparksql(dshape(ds))
    if isinstance(ds, Record):
        # Each record field keeps its nullability flag on the StructField.
        return sql.StructType([
            sql.StructField(name,
                            ds_to_sparksql(deoption(typ)),
                            isinstance(typ, datashape.Option))
            for name, typ in ds.fields])
    if isinstance(ds, DataShape):
        if isdimension(ds[0]):
            elem = ds.subshape[0]
            # Collapse a one-measure DataShape to its bare measure.
            if isinstance(elem, DataShape) and len(elem) == 1:
                elem = elem[0]
            return sql.ArrayType(ds_to_sparksql(deoption(elem)),
                                 isinstance(elem, Option))
        else:
            return ds_to_sparksql(ds[0])
    if ds in types:
        return types[ds]
    raise NotImplementedError()
def discover(c):
    """Discover the datashape of an iterable from its first element.

    Returns a variable-length datashape whose measure is the first
    element's per-item shape.

    Raises
    ------
    TypeError
        If the first element's discovered shape is not dimensioned.
        (Previously an ``assert``, which is silently stripped under
        ``python -O``.)
    """
    ds = discover(first(c))
    # Explicit raise instead of assert: validation must survive -O.
    if not isdimension(ds[0]):
        raise TypeError("Expected dimensioned datashape, got %s" % ds)
    return var * ds.subshape[0]
def discover(c):
    """Discover the datashape of an iterable from its first element.

    Returns a variable-length datashape whose measure is the first
    element's per-item shape.

    Raises
    ------
    TypeError
        If the first element's discovered shape is not dimensioned.
        (Previously an ``assert``, which is silently stripped under
        ``python -O``.)
    """
    ds = discover(first(c))
    # Explicit raise instead of assert: validation must survive -O.
    if not isdimension(ds[0]):
        raise TypeError("Expected dimensioned datashape, got %s" % ds)
    return var * ds.subshape[0]
def schema(self):
    """Per-row schema of this expression.

    Raises
    ------
    TypeError
        If the datashape has no leading dimension (non-tabular).
    """
    # Guard clause: only dimensioned (tabular) shapes have a row schema.
    if not isdimension(self.dshape[0]):
        raise TypeError("Non-tabular datashape, %s" % self.dshape)
    return self.dshape.subshape[0]
def by(child, grouper, apply):
    """Split-apply-combine: group ``child`` by ``grouper``, reduce with
    ``apply``.

    ``apply`` must be a reduction (scalar-producing) expression.
    """
    result_shape = apply.dshape[0]
    if isdimension(result_shape):
        raise TypeError("Expected Reduction")
    return By(child, grouper, apply)