Пример #1
0
 def dshape(self):
     """Return the datashape of this expression.

     The measure is a record that pairs each entry of ``self.names`` with
     the ``_dtype`` of the matching entry of ``self.values``.  When
     ``self.keepdims`` is truthy the record is preceded by one length-1
     dimension for every dimension of the child expression; otherwise the
     result has no dimensions at all.
     """
     names = self.names
     dtypes = [value._dtype for value in self.values]
     record = Record(list(zip(names, dtypes)))
     if not self.keepdims:
         return DataShape(record)
     unit_dims = (1,) * self._child.ndim
     return DataShape(*(unit_dims + (record,)))
Пример #2
0
def Data(data,
         dshape=None,
         name=None,
         fields=None,
         columns=None,
         schema=None,
         **kwargs):
    """Wrap ``data`` in an ``InteractiveSymbol``.

    Parameters
    ----------
    data
        The object to wrap.  A string is handed to ``resource``; if it
        contains ``'::'`` the part after it is kept as a ``/``-separated
        path of fields to index into the resulting symbol.  An iterator
        (other than the types listed in ``not_an_iterator``) is
        materialised into a tuple.
    dshape
        Datashape of the data, as an object or a string.  Discovered from
        ``data`` when omitted.
    name
        Name of the symbol; the next value of ``names`` is used when falsy.
    fields
        Field names applied to a discovered datashape.
    columns
        Deprecated alias of ``fields``; emits a ``DeprecationWarning``.
    schema
        Measure of the data; turned into ``var * schema``.  Mutually
        exclusive with ``dshape``.
    **kwargs
        Forwarded to ``resource`` when ``data`` is a string.

    Raises
    ------
    ValueError
        If both ``schema`` and ``dshape`` are given.
    """
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(
            data, schema=schema, dshape=dshape, columns=columns, **kwargs)

    # One-shot iterators are consumed here so the data can be re-read later.
    if isinstance(data, Iterator):
        if not isinstance(data, tuple(not_an_iterator)):
            data = tuple(data)

    if columns:
        warnings.warn("columns kwarg deprecated.  Use fields instead",
                      DeprecationWarning)
        if not fields:
            fields = columns

    if schema:
        if dshape:
            raise ValueError("Please specify one of schema= or dshape= keyword"
                             " arguments")
        dshape = var * schema

    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)

    if not dshape:
        dshape = discover(data)
        if fields:
            measure = dshape.measure
            if isinstance(measure, Tuple):
                column_types = dshape[1].dshapes
                dims = dshape.shape
            elif isscalar(measure):
                # The last dimension becomes the record's fields, so it is
                # dropped from the shape.
                column_types = (measure, ) * int(dshape[-2])
                dims = dshape.shape[:-1]
            elif isrecord(measure):
                column_types = measure.types
                dims = dshape.shape
            else:
                column_types = None
            if column_types is not None:
                record = Record(list(zip(fields, column_types)))
                dshape = DataShape(*(dims + (record, )))

    ds = datashape.dshape(dshape)

    name = name or next(names)
    result = InteractiveSymbol(data, ds, name)

    # An empty sub_uri splits to [''], which the filter discards.
    for field in filter(None, sub_uri.split('/')):
        result = result[field]

    return result
Пример #3
0
def Data(data,
         dshape=None,
         name=None,
         fields=None,
         columns=None,
         schema=None,
         **kwargs):
    """Wrap ``data`` in an ``InteractiveSymbol``.

    Parameters
    ----------
    data
        The object to wrap.  An ``InteractiveSymbol`` is unwrapped and its
        underlying data re-wrapped; a string is handed to ``resource``; an
        iterator (other than the types listed in ``not_an_iterator``) is
        materialised into a tuple.
    dshape
        Datashape of the data, as an object or a string.  Discovered from
        ``data`` when omitted.
    name
        Name passed through to ``InteractiveSymbol``.
    fields
        Field names applied to a discovered datashape.  For data whose
        discovered measure is already a record they must equal the
        discovered names.
    columns
        No longer supported; any truthy value raises ``ValueError``.
    schema
        Measure of the data; turned into ``var * schema``.  Mutually
        exclusive with ``dshape``.
    **kwargs
        Forwarded to ``resource`` when ``data`` is a string.

    Raises
    ------
    ValueError
        If ``columns`` is given, if both ``schema`` and ``dshape`` are
        given, or if ``fields`` disagrees with the names of record data.
    """
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")

    if isinstance(data, InteractiveSymbol):
        return Data(data.data, dshape, name, fields, columns, schema, **kwargs)

    if isinstance(data, _strtypes):
        data = resource(data,
                        schema=schema,
                        dshape=dshape,
                        columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator)
            and not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
        if isinstance(dshape.measure, Tuple) and fields:
            types = dshape[1].dshapes
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))
        elif isscalar(dshape.measure) and fields:
            # The last dimension becomes the record's fields, so it is
            # dropped from the shape.
            types = (dshape.measure, ) * int(dshape[-2])
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
        elif isrecord(dshape.measure) and fields:
            # ``dshape`` was discovered from ``data`` a few lines up, so it
            # is reused here instead of running discovery a second time.
            names = dshape.measure.names
            # Compare element-wise so that an equal tuple and list of names
            # are not reported as a mismatch.
            if list(names) != list(fields):
                raise ValueError(
                    'data column names %s\n'
                    '\tnot equal to fields parameter %s,\n'
                    '\tuse Data(data).relabel(%s) to rename '
                    'fields' %
                    (names, fields, ', '.join('%s=%r' % (k, v)
                                              for k, v in zip(names, fields))))
            types = dshape.measure.types
            schema = Record(list(zip(fields, types)))
            dshape = DataShape(*(dshape.shape + (schema, )))

    ds = datashape.dshape(dshape)
    return InteractiveSymbol(data, ds, name)
Пример #4
0
    def __init__(self,
                 data,
                 dshape=None,
                 name=None,
                 fields=None,
                 columns=None,
                 schema=None,
                 **kwargs):
        """Bind ``data`` to a datashape and a name.

        Parameters
        ----------
        data
            The object to wrap.  A string is handed to ``resource``.
        dshape
            Datashape of the data, as an object or a string.  Discovered
            from ``data`` when omitted.
        name
            Name of the symbol; the next value of ``names`` is used when
            falsy.
        fields
            Field names applied to a discovered datashape.
        columns
            Deprecated alias of ``fields``; emits a ``DeprecationWarning``.
        schema
            Measure of the data; turned into ``var * schema``.  Mutually
            exclusive with ``dshape``.
        **kwargs
            Forwarded to ``resource`` when ``data`` is a string.

        Raises
        ------
        ValueError
            If both ``schema`` and ``dshape`` are given.
        TypeError
            If ``data`` carries its own ``schema`` (a ``DataShape`` or a
            string) that differs from this object's schema.
        """
        if isinstance(data, _strtypes):
            data = resource(data,
                            schema=schema,
                            dshape=dshape,
                            columns=columns,
                            **kwargs)
        if columns:
            warnings.warn("columns kwarg deprecated.  Use fields instead",
                          DeprecationWarning)
        if columns and not fields:
            fields = columns
        if schema and dshape:
            raise ValueError("Please specify one of schema= or dshape= keyword"
                             " arguments")
        if schema and not dshape:
            dshape = var * schema
        if dshape and isinstance(dshape, _strtypes):
            dshape = datashape.dshape(dshape)
        if not dshape:
            dshape = discover(data)
            if isinstance(dshape.measure, Tuple) and fields:
                types = dshape[1].dshapes
                schema = Record(list(zip(fields, types)))
                dshape = DataShape(*(dshape.shape + (schema, )))
            elif isscalar(dshape.measure) and fields:
                types = (dshape.measure, ) * int(dshape[-2])
                schema = Record(list(zip(fields, types)))
                # The trailing dimension has just been folded into the
                # record's fields, so it must not also stay in the shape.
                dshape = DataShape(*(dshape.shape[:-1] + (schema, )))
            elif isrecord(dshape.measure) and fields:
                types = dshape.measure.types
                schema = Record(list(zip(fields, types)))
                dshape = DataShape(*(dshape.shape + (schema, )))

        self.dshape = datashape.dshape(dshape)

        self.data = data

        # ``_strtypes`` is the string-type check already used above; it
        # replaces the bare ``unicode`` name, which is undefined on
        # Python 3.  isinstance accepts the nested tuple.
        if (hasattr(data, 'schema')
                and isinstance(data.schema, (DataShape, _strtypes))
                and self.schema != data.schema):
            raise TypeError('%s schema %s does not match %s schema %s' %
                            (type(data).__name__, data.schema,
                             type(self).__name__, self.schema))

        self._name = name or next(names)
Пример #5
0
def discover_h5py_dataset(d):
    """Return the datashape of dataset ``d``, with object types as strings.

    The datashape is built from ``d.shape`` and ``d.dtype``.  For a record
    measure, ``record_dshape_replace`` is used to swap ``object_`` for
    ``string`` in the fields.  For any other measure the datashape is
    returned as is, unless it compares equal to ``object_``, in which case
    the measure becomes ``string``.
    """
    dshape = datashape.from_numpy(d.shape, d.dtype)
    shape = dshape.shape
    measure = dshape.measure

    if isrecord(measure):
        fields = list(record_dshape_replace(measure,
                                            datashape.object_,
                                            datashape.string))
        return DataShape(*(shape + (datashape.Record(fields), )))

    # NOTE(review): this compares the whole datashape, not just the
    # measure, against object_ -- confirm that is the intended check.
    if dshape == datashape.object_:
        return DataShape(*(shape + (datashape.string, )))
    return dshape
Пример #6
0
def list_to_dynd(L, **kwargs):
    """Build an ``nd.array`` from ``L`` using the ``dshape`` keyword.

    A tuple measure is first rewritten as a record whose fields are named
    ``f0``, ``f1``, ... in positional order.  The ``dshape`` keyword is
    required; a missing key raises ``KeyError``.
    """
    ds = kwargs['dshape']
    measure = ds.measure
    if isinstance(measure, Tuple):
        fields = [['f%d' % position, typ]
                  for position, typ in enumerate(measure.parameters[0])]
        ds = DataShape(*(ds.shape + (Record(fields), )))
    return nd.array(L, dtype=str(ds))
Пример #7
0
def compute_up(expr, data, **kwargs):
    """Compute ``expr`` over chunked ``data`` in two ``atop`` passes.

    ``expr`` is split into a per-chunk expression and an aggregate
    expression.  The first pass applies the per-chunk expression to every
    chunk of ``data``; the second concatenates those results along
    ``expr.axis`` and applies the aggregate expression, dropping the axes
    listed in ``expr.axis`` from the output indices.
    """
    leaf = expr._leaves()[0]

    # Symbol standing for one chunk: the first chunk length along every
    # dimension, with the leaf's measure.
    first_chunk_dims = tuple(map(first, data.chunks))
    chunk = symbol(
        'chunk', DataShape(*(first_chunk_dims + (leaf.dshape.measure, ))))
    (chunk, chunk_expr), (agg, agg_expr) = split(
        expr._child, expr, chunk=chunk)

    inds = tuple(range(ndim(leaf)))
    dtype = expr.dshape.measure.to_numpy_dtype()

    per_chunk = curry(compute_it, chunk_expr, [chunk], **kwargs)
    tmp = atop(per_chunk, inds, data, inds, dtype=dtype)

    combine = compose(
        curry(compute_it, agg_expr, [agg], **kwargs),
        curry(_concatenate2, axes=expr.axis),
    )
    kept_inds = tuple(i for i in inds if i not in expr.axis)
    return atop(combine, kept_inds, tmp, inds, dtype=dtype)
Пример #8
0
 def _schema(self):
     """Return the schema of this expression.

     Starts from the first element of the child's schema.  A record with
     exactly one field is replaced by that field's type; anything else is
     used unchanged.
     """
     child_measure = self._child.schema[0]
     single_field = (isinstance(child_measure, Record)
                     and len(child_measure.types) == 1)
     if single_field:
         return DataShape(toolz.first(child_measure.types))
     return DataShape(child_measure)
Пример #9
0
 def __init__(self, name, dshape):
     """Store ``name`` and a normalised ``dshape``.

     A string ``dshape`` is parsed with ``datashape.dshape``.  A ``Mono``
     that is not already a ``DataShape`` is wrapped in one.  Any other
     value is stored unchanged.
     """
     self._name = name
     parsed = (datashape.dshape(dshape)
               if isinstance(dshape, _strtypes) else dshape)
     needs_wrapping = (isinstance(parsed, Mono)
                       and not isinstance(parsed, DataShape))
     self.dshape = DataShape(parsed) if needs_wrapping else parsed
Пример #10
0
    def dshape(self):
        """Return the datashape of the field ``self._name`` of the child.

        The result has the child's dimensions followed by the field's own
        dimensions, with the field's measure as the measure.
        """
        child_ds = self._child.dshape
        outer_dims = child_ds.shape
        field_ds = child_ds.measure.dict[self._name]
        all_dims = outer_dims + field_ds.shape
        return DataShape(*(all_dims + (field_ds.measure, )))
Пример #11
0
def discover(metadata):
    """Return a record datashape describing every table in ``metadata``.

    The metadata is reflected first, including views when the bound
    dialect reports support for them; if that raises
    ``NotImplementedError`` a plain reflection is done instead.  Tables
    are visited in order of their ``name``.  A table whose type cannot be
    discovered is skipped with a warning rather than aborting the whole
    discovery.

    Returns
    -------
    DataShape
        A record with one ``[table name, table datashape]`` field for each
        table that was discovered successfully.
    """
    try:
        metadata.reflect(views=metadata.bind.dialect.supports_views)
    except NotImplementedError:
        metadata.reflect()
    pairs = []
    for table in sorted(metadata.tables.values(), key=attrgetter('name')):
        name = table.name
        try:
            # presumably resolved to the table-level implementation by the
            # dispatch machinery this function is registered with -- verify
            pairs.append([name, discover(table)])
        except sa.exc.CompileError as e:
            warnings.warn(
                "Can not discover type of table {name}.\n"
                "SQLAlchemy provided this error message:\n\t{msg}"
                "\nSkipping.".format(
                    name=name,
                    # Python 3 exceptions have no ``message`` attribute;
                    # fall back to str(e) so the handler itself cannot
                    # fail with AttributeError.
                    msg=getattr(e, 'message', str(e)),
                ),
                stacklevel=3,
            )
        except NotImplementedError as e:
            warnings.warn(
                "Odo does not understand a SQLAlchemy type.\n"
                "Odo provided the following error:\n\t{msg}"
                "\nSkipping.".format(
                    # args may hold non-string objects; str.join needs str.
                    msg="\n\t".join(map(str, e.args)),
                ),
                stacklevel=3,
            )
    return DataShape(Record(pairs))
Пример #12
0
    def _dshape(self):
        """Return the datashape of the concatenation of ``lhs`` and ``rhs``.

        pandas supports concat for string columns, so blaze does the same.
        If either operand's measure is an ``Option``, that operand's schema
        is used (``lhs`` is checked first).  Otherwise both encodings must
        match and the result is a variable-length string with that
        encoding.  The dimensions are always those of ``lhs``.
        """
        shape = self.lhs.dshape.shape

        lhs_schema = self.lhs.schema
        if isinstance(lhs_schema.measure, Option):
            return DataShape(*(shape + (lhs_schema, )))

        rhs_schema = self.rhs.schema
        if isinstance(rhs_schema.measure, Option):
            return DataShape(*(shape + (rhs_schema, )))

        _, lhs_encoding = self.lhs.schema.measure.parameters
        _, rhs_encoding = self.rhs.schema.measure.parameters
        assert lhs_encoding == rhs_encoding
        # A length of None turns a fixed-length string into a
        # variable-length one.
        variable_string = DataShape(String(None, lhs_encoding))
        return DataShape(*(shape + (variable_string, )))
Пример #13
0
    def schema(self):
        """Return the child's record schema with ``self.labels`` applied.

        ``self.labels`` is read as a mapping from old field name to new
        field name.  Fields that do not appear in it keep their name, and
        field types are left untouched.
        """
        subs = dict(self.labels)
        child_fields = self._child.dshape.measure.parameters[0]
        return DataShape(
            Record([[subs.get(name, name), dtype]
                    for name, dtype in child_fields]))
Пример #14
0
 def dshape(self):
     """Return the datashape of ``lhs`` and ``rhs`` joined along ``axis``.

     Every dimension and the measure are taken from ``lhs``, except the
     dimension at ``self.axis``, which is ``_shape_add`` of the two
     operands' dimensions on that axis.
     """
     axis = self.axis
     lhs_ds = self.lhs.dshape
     dims = lhs_ds.shape
     joined = _shape_add(dims[axis], self.rhs.dshape.shape[axis])
     new_dims = dims[:axis] + (joined, ) + dims[axis + 1:]
     return DataShape(*(new_dims + (lhs_ds.measure, )))
Пример #15
0
 def _schema(self):
     """Return the schema of the result.

     The child's measure is unwrapped through its ``ty`` attribute when it
     has one.  ``Decimal`` and ``TimeDelta`` bases are kept; every other
     base becomes ``float64``.  If the child's measure is an ``Option``,
     the result is wrapped in ``Option`` again.
     """
     measure = self._child.schema.measure
     base = getattr(measure, 'ty', measure)
     if isinstance(base, (Decimal, TimeDelta)):
         result = base
     else:
         result = ct.float64
     if isinstance(measure, Option):
         result = Option(result)
     return DataShape(result)
Пример #16
0
def date_to_datetime_dshape(ds):
    """Return ``ds`` with ``date`` record fields changed to ``datetime``.

    Only fields of a record measure are inspected; a non-record measure
    is passed through unchanged.  The dimensions are preserved.
    """
    measure = ds.measure
    if isinstance(measure, Record):
        converted = []
        for name, typ in measure.parameters[0]:
            converted.append([name, ct.datetime_ if typ == ct.date_ else typ])
        measure = Record(converted)
    return DataShape(*(ds.shape + (measure, )))
Пример #17
0
def sql_table(table, colnames, measures, conn):
    """
    Create a new blaze Array from an SQL table description.

    The array's datashape is a variable-length dimension of records whose
    fields pair ``colnames`` with ``measures``; the data is every column
    (``'*'``) of ``table`` read over ``conn``.
    """
    row_type = Record(list(zip(colnames, measures)))
    record_dshape = DataShape(coretypes.Var(), row_type)
    selection = TableSelection(table, '*')
    descriptor = SQLDataDescriptor(record_dshape, selection, conn)
    return Array(descriptor)
Пример #18
0
 def _dshape(self):
     """Return the datashape of the reduction.

     With ``self.keepdims`` the dimensions listed in ``self.axis`` become
     1; without it they are removed.  The measure is ``self.schema``.
     """
     axis = self.axis
     child_shape = self._child.shape
     if self.keepdims:
         shape = tuple(1 if position in axis else dim
                       for position, dim in enumerate(child_shape))
     else:
         shape = tuple(dim
                       for position, dim in enumerate(child_shape)
                       if position not in axis)
     return DataShape(*(shape + (self.schema, )))
Пример #19
0
    def _dshape(self):
        """Return the datashape of the field ``self._name`` of the child.

        The child's measure is unwrapped through its ``value`` attribute
        when it has one.  The result has the child's dimensions followed
        by the field's own dimensions, with the field's measure.
        """
        child_ds = self._child.dshape
        outer_dims = child_ds.shape

        # TODO: is this too special-case-y?
        record = getattr(child_ds.measure, 'value', child_ds.measure)
        field_ds = record.dict[self._name]

        return DataShape(*(outer_dims + field_ds.shape + (field_ds.measure,)))
Пример #20
0
def fsql(engine, fcsv, name):
    """Yield a resource named ``name`` under ``url``, dropped afterwards.

    The resource's datashape is ``var`` records whose fields are named
    ``a`` and ``b`` and typed after the measure discovered from ``fcsv``.
    The test is skipped when opening the resource raises
    ``OperationalError``.
    """
    discovered = discover(fcsv)
    named_columns = list(zip('ab', discovered.measure.types))
    dshape = DataShape(var, Record(named_columns))
    try:
        t = resource('%s::%s' % (url, name), dshape=dshape)
    except sqlalchemy.exc.OperationalError as e:
        pytest.skip(str(e))
    else:
        # Kept outside the try body so that errors raised while the
        # resource is in use are not turned into skips.
        yield t
        drop(t)
Пример #21
0
 def _dshape(self):
     """Return the datashape of the combined reductions.

     With ``self.keepdims`` the dimensions listed in ``self.axis`` become
     1; without it they are removed.  The measure is a record pairing
     ``self.names`` with the ``schema`` of each entry of ``self.values``.
     """
     axis = self.axis
     if self.keepdims:
         dims = [1 if i in axis else d
                 for i, d in enumerate(self._child.shape)]
     else:
         dims = [d for i, d in enumerate(self._child.shape)
                 if i not in axis]
     names = self.names
     schemas = [value.schema for value in self.values]
     measure = Record(list(zip(names, schemas)))
     return DataShape(*(tuple(dims) + (measure, )))
Пример #22
0
    def dshape(self):
        """Return the datashape of the contraction of ``lhs`` and ``rhs``.

        The shape is the dimensions of ``lhs`` not in ``self._left_axes``
        followed by the dimensions of ``rhs`` not in ``self._right_axes``.
        The measure is whatever a multiply followed by an add of the two
        operand measures produces.
        """
        lhs_kept = [dim for pos, dim in enumerate(self.lhs.shape)
                    if pos not in self._left_axes]
        rhs_kept = [dim for pos, dim in enumerate(self.rhs.shape)
                    if pos not in self._right_axes]
        shape = tuple(lhs_kept + rhs_kept)

        # Mimic a mul and an add on scalar symbols to get the measure.
        l = symbol('l', self.lhs.dshape.measure)
        r = symbol('r', self.rhs.dshape.measure)
        product_sum = (l * r) + (l * r)

        return DataShape(*(shape + (product_sum.dshape.measure,)))
Пример #23
0
def discover(metadata):
    """Return a record datashape describing every table in ``metadata``.

    The metadata is reflected first, including views when the bound
    dialect reports support for them.  Tables are visited in order of
    their key in ``metadata.tables``.  A table whose type cannot be
    discovered is reported on stdout and skipped rather than aborting the
    whole discovery.

    Returns
    -------
    DataShape
        A record with one ``[table name, table datashape]`` field for each
        table that was discovered successfully.
    """
    metadata.reflect(views=metadata.bind.dialect.supports_views)
    pairs = []
    for name, table in sorted(metadata.tables.items(), key=first):
        try:
            pairs.append([name, discover(table)])
        except sa.exc.CompileError as e:
            # Python 3 exceptions have no ``message`` attribute; fall back
            # to str(e) so the handler itself cannot fail.
            msg = getattr(e, 'message', str(e))
            print("Can not discover type of table %s.\n" % name +
                "SQLAlchemy provided this error message:\n\t%s" % msg +
                "\nSkipping.")
        except NotImplementedError as e:
            msg = getattr(e, 'message', str(e))
            print("Blaze does not understand a SQLAlchemy type.\n"
                "Blaze provided the following error:\n\t%s" % msg +
                "\nSkipping.")
    return DataShape(Record(pairs))
def column_dshape(dshape, colname):
    """
    Given a record dshape, project a column out.

    Returns a datashape with the dimensions of ``dshape`` and the type of
    field ``colname`` as its measure.

    Raises ``TypeError`` if the measure of ``dshape`` is not a record and
    ``ValueError`` if the record has no field ``colname``.
    """
    rec = dshape.measure

    if not isinstance(rec, Record):
        raise TypeError("Can only select fields from record type")

    fields = rec.fields
    if colname not in fields:
        raise ValueError("No such field %r" % (colname, ))

    return DataShape(*(tuple(dshape.shape) + (fields[colname], )))
Пример #25
0
def compute_down(expr, data, **kwargs):
    """ Compute expressions on H5Py datasets by operating on chunks

    blaze.expr.split breaks the full-array computation into a per-chunk
    expression and an on-aggregate expression.  blaze.partition picks the
    chunks out of the dataset; each chunk is computed separately and its
    result is written into an intermediate in-memory numpy array.  The
    on-aggregate expression is then computed on that intermediate array.

    The expression must contain some sort of Reduction, otherwise
    MDNotImplementedError is raised.  Both the intermediate result and the
    final result are assumed to fit into memory.

    The chunk size is taken from the ``chunksize`` keyword when given and
    from ``data.chunks`` otherwise.
    """
    leaf = expr._leaves()[0]
    has_reduction = any(isinstance(node, Reduction)
                        for node in path(expr, leaf))
    if not has_reduction:
        raise MDNotImplementedError()

    # Chunk size (this should be improved)
    chunksize = kwargs.get('chunksize', data.chunks)

    # Per-chunk and on-aggregate pieces of the expression
    chunk_dshape = DataShape(*(chunksize + (leaf.dshape.measure, )))
    chunk = Symbol('chunk', chunk_dshape)
    (chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)

    # In-memory array that receives the per-chunk results
    shape, dtype = to_numpy(agg.dshape)
    intermediate = np.empty(shape=shape, dtype=dtype)

    out_chunksize = chunk_expr.shape
    sources = partitions(data, chunksize=chunksize)
    targets = partitions(intermediate, chunksize=out_chunksize)

    # chunk -> chunk_expr for every partition; this could be parallelized
    for source, target in zip(sources, targets):
        piece = partition_get(data, source, chunksize=chunksize)
        piece_result = compute(chunk_expr, {chunk: piece})
        partition_set(intermediate, target, piece_result,
                      chunksize=out_chunksize)

    # Finish on the aggregate
    return compute(agg_expr, {agg: intermediate})
Пример #26
0
def compute_down(expr, data, map=None, **kwargs):
    """ Compute expressions on H5Py datasets by operating on chunks

    blaze.expr.split breaks the full-array computation into a per-chunk
    expression and an on-aggregate expression.  blaze.partition picks the
    chunks out of the dataset; ``compute_chunk`` is applied to every pair
    of source and target partition through ``map``, filling an
    intermediate in-memory numpy array.  The on-aggregate expression is
    then computed on that intermediate array with ``return_type='native'``.

    The expression must contain some sort of Reduction, otherwise
    MDNotImplementedError is raised.  Both the intermediate result and the
    final result are assumed to fit into memory.

    ``map`` is resolved through ``_get_map``.  The chunk size is taken
    from the ``chunksize`` keyword when given and from ``data.chunks``
    otherwise.
    """
    map = _get_map(map)

    leaf = expr._leaves()[0]
    has_reduction = any(isinstance(node, Reduction)
                        for node in path(expr, leaf))
    if not has_reduction:
        raise MDNotImplementedError()

    # Chunk size (this should be improved)
    chunksize = kwargs.get('chunksize', data.chunks)

    # Per-chunk and on-aggregate pieces of the expression
    chunk_dshape = DataShape(*(chunksize + (leaf.dshape.measure,)))
    chunk = symbol('chunk', chunk_dshape)
    (chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)

    # In-memory array that receives the per-chunk results
    shape, dtype = to_numpy(agg.dshape)
    intermediate = np.empty(shape=shape, dtype=dtype)

    # Partitions of the source data and of the intermediate array
    source_parts = list(partitions(data, chunksize=chunksize, keepdims=True))
    target_parts = list(partitions(intermediate,
                                   chunksize=chunk_expr.shape,
                                   keepdims=True))

    # list() forces evaluation in case ``map`` is lazy.
    process_pair = curry(compute_chunk, data, intermediate, chunk, chunk_expr)
    list(map(process_pair, zip(source_parts, target_parts)))

    # Finish on the aggregate
    return compute(agg_expr, {agg: intermediate}, return_type='native')
Пример #27
0
    def dynd_arr(self):
        """Return the query result as one dynd array, caching it.

        The array is allocated for the combined length of all chunks in
        ``self.query_result`` and then filled chunk by chunk.  Later calls
        return the cached array.
        """
        # TODO: This should really use blz
        cached = self._dynd_result
        if cached is not None:
            return cached

        # Empty dynd array sized for all chunks together
        total = sum(map(len, self.query_result))
        result = nd.empty(str(DataShape(total, self.dshape.measure)))

        # Copy every chunk into its slice
        start = 0
        for chunk in self.query_result:
            stop = start + len(chunk)
            result[start:stop] = chunk
            start = stop

        self._dynd_result = result
        return result
Пример #28
0
def dynd_chunk_iterator(result, chunk_size=1024):
    """
    Turn a query Result into a bunch of DyND arrays.

    Rows are fetched from ``result.cursor`` in batches of ``chunk_size``
    or the cursor's ``arraysize``, whichever is larger.  Iteration stops
    when a fetch returns no rows or raises ``db.Error``.
    """
    cursor = result.cursor
    batch_size = max(cursor.arraysize, chunk_size)

    while True:
        try:
            rows = cursor.fetchmany(batch_size)
        except db.Error:
            return

        if not rows:
            return

        batch_dshape = DataShape(len(rows), result.dshape.measure)
        chunk = nd.empty(str(batch_dshape))
        chunk[:] = list(iter_result(rows, batch_dshape))
        yield chunk
Пример #29
0
def coalesce(a, b):
    """Return an expression for ``a`` with ``b`` as its fallback.

    Shortcuts, decided from the discovered measure of ``a``:

    * a null measure (possibly inside an ``Option``) returns ``b``;
    * a measure that is not an ``Option`` returns ``a``.

    Otherwise a ``Coalesce`` node is built whose datashape combines the
    ``maxshape`` of both shapes with the ``promote`` of both measures.
    """
    a_dshape = discover(a)
    outer_measure = a_dshape.measure
    a_is_option = isinstance(outer_measure, Option)
    a_measure = outer_measure.ty if a_is_option else outer_measure

    if isinstance(a_measure, Null):
        # a is always null, this is just b
        return b

    if not a_is_option:
        # a is not an option, this is just a
        return a

    b_dshape = discover(b)
    shape = maxshape((a_dshape.shape, b_dshape.shape))
    measure = promote(a_measure, b_dshape.measure)
    return Coalesce(a, b, DataShape(*(shape + (measure,))))
Пример #30
0
def sql_table(table_name, colnames, measures, conn):
    """
    Create a new blaze Array from an SQL table description.

    The array's datashape is a variable-length dimension of records whose
    fields pair ``colnames`` with ``measures``; the data is every column
    (``'*'``) of the table.

    Parameters
    ----------
    table_name : str
        Table name.
    colnames : [str]
        Column names.
    measures : [DataShape]
        Measure (element type) for each column.
    conn : pyodbc/whatever Connection
        Connection the table is read over.
    """
    row_type = Record(list(zip(colnames, measures)))
    record_dshape = DataShape(coretypes.Var(), row_type)
    selection = TableSelection(table_name, '*')
    descriptor = SQL_DDesc(record_dshape, selection, conn)
    return Array(descriptor)