Example #1
    def test_grouped_slices(self):
        a = nd.asarray([[1, 2, 3], [1, 4, 5]])
        gb = nd.groupby(a[:, 1:], a[:, 0])
        self.assertEqual(nd.as_py(gb.groups), [1])
        self.assertEqual(nd.as_py(gb), [[[2, 3], [4, 5]]])

        a = nd.asarray([[1, 2, 3], [3, 1, 7], [1, 4, 5], [2, 6, 7], [3, 2, 5]])
        gb = nd.groupby(a[:, 1:], a[:, 0])
        self.assertEqual(nd.as_py(gb.groups), [1, 2, 3])
        self.assertEqual(nd.as_py(gb), [[[2, 3], [4, 5]],
                                        [[6, 7]],
                                        [[1, 7], [2, 5]]])
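
For reference, the grouping in the second case can be reproduced with plain Python
dictionaries (a minimal cross-check of the asserted values above, not part of the
original test):

def group_rows(rows):
    # Map each row's leading key to the list of its remaining columns
    groups = {}
    for row in rows:
        groups.setdefault(row[0], []).append(row[1:])
    return groups

grouped = group_rows([[1, 2, 3], [3, 1, 7], [1, 4, 5], [2, 6, 7], [3, 2, 5]])
assert sorted(grouped) == [1, 2, 3]        # the same groups nd.groupby reports
assert grouped[1] == [[2, 3], [4, 5]]
assert grouped[2] == [[6, 7]]
assert grouped[3] == [[1, 7], [2, 5]]
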
    def groupby(self, json_cmd):
        print('GroupBy operation')
        cmd = json.loads(json_cmd)
        array_url = cmd.get('input', self.base_url + self.array_name)
        if not array_url.startswith(self.base_url):
            raise RuntimeError('Input array must start with the base url')
        array_name = array_url[len(self.base_url):]
        fields = cmd['fields']

        arr = self.get_session_array(array_name)[...].ddesc.dynd_arr()

        # Do the groupby, get its groups, then
        # evaluate it because deferred operations
        # through the groupby won't work well yet.
        res = nd.groupby(arr, nd.fields(arr, *fields))
        groups = res.groups
        res = res.eval()

        # Write out the groupby result
        defarr_gb = self.array_provider.create_deferred_array_filename(
            self.session_name, 'groupby_', array(res))
        dshape_gb = nd.dshape_of(res)
        defarr_gb[0].write(
            json.dumps({
                'dshape': dshape_gb,
                'command': 'groupby',
                'params': {
                    'fields': fields
                }
            }))
        defarr_gb[0].close()

        # Write out the groups
        defarr_groups = self.array_provider.create_deferred_array_filename(
            self.session_name, 'groups_', groups)
        dshape_groups = nd.dshape_of(groups)
        defarr_groups[0].write(
            json.dumps({
                'dshape': dshape_groups,
                'command': 'groupby.groups',
                'params': {
                    'fields': fields
                }
            }))
        defarr_groups[0].close()

        content_type = 'application/json; charset=utf-8'
        body = json.dumps({
            'session': self.base_url + self.session_name,
            'output_gb': self.base_url + defarr_gb[1],
            'dshape_gb': dshape_gb,
            'output_groups': self.base_url + defarr_groups[1],
            'dshape_groups': dshape_groups
        })
        return (content_type, body)
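
A minimal sketch of the JSON command this handler expects and of the reply it builds
(the field name and the `session` object below are made up for illustration; `session`
stands for an instance of the class this method belongs to):

import json

# Hypothetical request: group the session's default array by its 'cat' field.
# 'input' is optional and defaults to base_url + array_name.
json_cmd = json.dumps({'fields': ['cat']})

content_type, body = session.groupby(json_cmd)
reply = json.loads(body)
# The reply points at the freshly written deferred arrays and their datashapes
print(reply['output_gb'], reply['dshape_gb'])
print(reply['output_groups'], reply['dshape_groups'])
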
Example #4
 def test_immutable(self):
     a = nd.array([
             ('x', 0),
             ('y', 1),
             ('x', 2),
             ('x', 3),
             ('y', 4)],
             dtype='{A: string, B: int32}').eval_immutable()
     gb = nd.groupby(a, nd.fields(a, 'A'))
     self.assertEqual(nd.as_py(gb.groups), [{'A': 'x'}, {'A': 'y'}])
     self.assertEqual(nd.as_py(gb), [
             [{'A': 'x', 'B': 0},
              {'A': 'x', 'B': 2},
              {'A': 'x', 'B': 3}],
             [{'A': 'y', 'B': 1},
              {'A': 'y', 'B': 4}]])
Example #5
 def test_aggregate(self):
     a = nd.array([
         ('A', 1, 2),
         ('A', 3, 4),
         ('B', 1.5, 2.5),
         ('A', 0.5, 9),
         ('C', 1, 5),
         ('B', 2, 2)],
         dtype='c{cat: string, x: float32, y: float32}')
     gb = nd.groupby(a, nd.fields(a, 'cat')).eval()
     b = nd.make_computed_fields(gb, 1,
             fields=[('sum_x', ndt.float32, 'sum(x)'),
                     ('mean_y', ndt.float32, 'mean(y)'),
                     ('max_x', ndt.float32, 'max(x)'),
                     ('max_y', ndt.float32, 'max(y)')])
     self.assertEqual(nd.as_py(b.sum_x), [4.5, 3.5, 1])
     self.assertEqual(nd.as_py(b.mean_y), [5, 2.25, 5])
     self.assertEqual(nd.as_py(b.max_x), [3, 2, 1])
     self.assertEqual(nd.as_py(b.max_y), [9, 2.5, 5])
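
As a sanity check, the aggregates asserted above can be reproduced with plain Python
from the same six records (the group order 'A', 'B', 'C' matches the order the test
asserts):

data = [('A', 1, 2), ('A', 3, 4), ('B', 1.5, 2.5),
        ('A', 0.5, 9), ('C', 1, 5), ('B', 2, 2)]
cats = ['A', 'B', 'C']
xs = {c: [x for cat, x, _ in data if cat == c] for c in cats}
ys = {c: [y for cat, _, y in data if cat == c] for c in cats}
assert [sum(xs[c]) for c in cats] == [4.5, 3.5, 1]              # sum_x
assert [sum(ys[c]) / len(ys[c]) for c in cats] == [5, 2.25, 5]  # mean_y
assert [max(xs[c]) for c in cats] == [3, 2, 1]                  # max_x
assert [max(ys[c]) for c in cats] == [9, 2.5, 5]                # max_y
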
Example #6
 def test_type_id(self):
     # Numeric type id
     self.assertEqual(self.type_id_of(ndt.bool),
                     _lowlevel.type_id.BOOL)
     self.assertEqual(self.type_id_of(ndt.int8),
                     _lowlevel.type_id.INT8)
     self.assertEqual(self.type_id_of(ndt.int16),
                     _lowlevel.type_id.INT16)
     self.assertEqual(self.type_id_of(ndt.int32),
                     _lowlevel.type_id.INT32)
     self.assertEqual(self.type_id_of(ndt.int64),
                     _lowlevel.type_id.INT64)
     self.assertEqual(self.type_id_of(ndt.uint8),
                     _lowlevel.type_id.UINT8)
     self.assertEqual(self.type_id_of(ndt.uint16),
                     _lowlevel.type_id.UINT16)
     self.assertEqual(self.type_id_of(ndt.uint32),
                     _lowlevel.type_id.UINT32)
     self.assertEqual(self.type_id_of(ndt.uint64),
                     _lowlevel.type_id.UINT64)
     self.assertEqual(self.type_id_of(ndt.float32),
                     _lowlevel.type_id.FLOAT32)
     self.assertEqual(self.type_id_of(ndt.float64),
                     _lowlevel.type_id.FLOAT64)
     self.assertEqual(self.type_id_of(ndt.complex_float32),
                     _lowlevel.type_id.COMPLEX_FLOAT32)
     self.assertEqual(self.type_id_of(ndt.complex_float64),
                     _lowlevel.type_id.COMPLEX_FLOAT64)
     # String/bytes
     self.assertEqual(self.type_id_of(ndt.string),
                     _lowlevel.type_id.STRING)
     self.assertEqual(self.type_id_of(ndt.make_fixedstring(16)),
                     _lowlevel.type_id.FIXEDSTRING)
     self.assertEqual(self.type_id_of(ndt.bytes),
                     _lowlevel.type_id.BYTES)
     self.assertEqual(self.type_id_of(ndt.make_fixedbytes(16)),
                     _lowlevel.type_id.FIXEDBYTES)
     self.assertEqual(self.type_id_of(ndt.json),
                     _lowlevel.type_id.JSON)
     # Date
     self.assertEqual(self.type_id_of(ndt.date),
                     _lowlevel.type_id.DATE)
     # Property
     self.assertEqual(self.type_id_of(nd.type_of(ndt.date(2000, 1, 1).year)),
                     _lowlevel.type_id.PROPERTY)
     # Categorical
     self.assertEqual(self.type_id_of(ndt.make_categorical([1, 2, 3])),
                     _lowlevel.type_id.CATEGORICAL)
     # Struct
     self.assertEqual(self.type_id_of(ndt.make_struct(
                                 [ndt.int32, ndt.int32], ['x', 'y'])),
                     _lowlevel.type_id.STRUCT)
     self.assertEqual(self.type_id_of(ndt.type('{x : int32, y : int32}')),
                     _lowlevel.type_id.FIXEDSTRUCT)
     # Convert/byteswap/view
     self.assertEqual(self.type_id_of(ndt.make_convert(
                                 ndt.int32, ndt.int8)),
                     _lowlevel.type_id.CONVERT)
     self.assertEqual(self.type_id_of(ndt.make_byteswap(ndt.int32)),
                     _lowlevel.type_id.BYTESWAP)
     self.assertEqual(self.type_id_of(ndt.make_view(
                                 ndt.int32, ndt.uint32)),
                     _lowlevel.type_id.VIEW)
     # CUDA types
     if ndt.cuda_support:
         self.assertEqual(self.type_id_of(ndt.type('cuda_device[int32]')),
                          _lowlevel.type_id.CUDA_DEVICE)
         self.assertEqual(self.type_id_of(ndt.type('cuda_host[int32]')),
                          _lowlevel.type_id.CUDA_HOST)
     # Uniform arrays
     self.assertEqual(self.type_id_of(ndt.type('3 * int32')),
                     _lowlevel.type_id.FIXED_DIM)
     self.assertEqual(self.type_id_of(ndt.type('strided * int32')),
                     _lowlevel.type_id.STRIDED_DIM)
     self.assertEqual(self.type_id_of(ndt.type('var * int32')),
                     _lowlevel.type_id.VAR_DIM)
     # GroupBy
     self.assertEqual(self.type_id_of(nd.type_of(nd.groupby([1, 2],
                                                            ['a', 'a']))),
                     _lowlevel.type_id.GROUPBY)
     # Type
     self.assertEqual(self.type_id_of(ndt.type('type')),
                     _lowlevel.type_id.TYPE)
def groupby(sreader, key, val, dtype, path=None, lines_per_chunk=LPC):
    """Group the `val` field in `sreader` stream of lines by `key` index.

    Parameters
    ----------
    sreader : iterator
        Iterator over a stream of CSV lines.
    key : string
        The name of the field to be grouped by.
    val : string
        The name of the field whose values are to be grouped.
    dtype : dynd dtype
        The DyND data type with all the fields of the CSV lines,
        including the `key` and `val` names.
    path : string
        The path of the directory where the BLZ table with the final
        grouping will be stored.  If None (default), the BLZ table is
        kept in memory (and hence is not persistent).
    lines_per_chunk : int
        The number of lines that are read and grouped in memory at a
        time.  Some experimentation may be needed to find the optimal
        value; the default should work reasonably well, though.

    Returns
    -------
    output : BLZ table
        A BLZ table whose column names are the groups resulting from
        the groupby operation.  The columns are filled with the `val`
        field of the lines delivered by `sreader`.

    """

    try:
        nptype = get_nptype(dtype, val)
    except ValueError:
        raise ValueError("`val` should be a valid field")

    # Start reading chunks
    prev_keys = set()
    while True:
        ndbuf = nd.array(islice(sreader, lines_per_chunk), dtype)
        if len(ndbuf) == 0: break   # CSV data exhausted

        # Do the groupby for this chunk
        keys = getattr(ndbuf, key)
        if val is None:
            vals = ndbuf
        else:
            vals = getattr(ndbuf, val)
        sby = nd.groupby(vals, keys)
        lkeys = nd.as_py(sby.groups)
        skeys = set(lkeys)
        # BLZ does not understand dynd objects (yet)
        sby = nd.as_py(sby.eval())

        if len(prev_keys) == 0:
            # Add the initial keys to a BLZ table
            columns = [np.array(sby[i], nptype) for i in range(len(lkeys))]
            ssby = blz.btable(columns=columns, names=lkeys, rootdir=path,
                              mode='w')
        else:
            # Do we have any new keys?
            new_keys = skeys.difference(prev_keys)
            for new_key in new_keys:
                # Get the index of the new key
                idx = lkeys.index(new_key)
                # and add its values as a new column
                ssby.addcol(sby[idx], new_key, dtype=nptype)
            # Now fill the pre-existing keys
            existing_keys = skeys.intersection(prev_keys)
            for existing_key in existing_keys:
                # Get the index of the existing key
                idx = lkeys.index(existing_key)
                # and append the values here
                ssby[existing_key].append(sby[idx])

        # Add the new keys to the existing ones
        prev_keys |= skeys

    # Before returning, flush all data to disk
    if path is not None:
        ssby.flush()
    return ssby
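
For illustration, the function above can be driven with a small in-memory CSV
(the data and field names are made up, and this assumes, like the code itself,
that DyND can convert the rows produced by csv.reader with the given struct type):

import csv
from io import StringIO
from dynd import ndt

csvbuf = u"A,1\nB,2\nA,3\nC,4\nB,5\n"
sreader = csv.reader(StringIO(csvbuf))
dt = ndt.type('c{cat: string, x: int32}')    # one struct per CSV line

# Group the 'x' values by 'cat', keeping the result in memory (path=None)
ssby = groupby(sreader, 'cat', 'x', dt, path=None, lines_per_chunk=2)
print(ssby.names)         # one BLZ column per group, e.g. ['A', 'B', 'C']
print(list(ssby['A']))    # the x values that fell into group 'A'
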
Example #9
def groupby(sreader, key, val, dtype, path=None, lines_per_chunk=LPC):
    """Group the `val` field in `sreader` stream of lines by `key` index.

    Parameters
    ----------
    sreader : iterator
        Iterator over a stream of CSV lines.
    key : string
        The name of the field to be grouped by.
    val : string
        The name of the field whose values are to be grouped.
    dtype : dynd dtype
        The DyND data type with all the fields of the CSV lines,
        including the `key` and `val` names.
    path : string
        The path of the directory where the BLZ table with the final
        grouping will be stored.  If None (default), the BLZ table is
        kept in memory (and hence is not persistent).
    lines_per_chunk : int
        The number of lines that are read and grouped in memory at a
        time.  Some experimentation may be needed to find the optimal
        value; the default should work reasonably well, though.

    Returns
    -------
    output : BLZ table
        A BLZ table whose column names are the groups resulting from
        the groupby operation.  The columns are filled with the `val`
        field of the lines delivered by `sreader`.

    """

    def get_nptype(dtype, val):
        # Convert the `val` field into a numpy dtype
        dytype = dtype[nd.as_py(dtype.field_names).index(val)]
        # strings and bytes cannot be natively represented in numpy
        if dytype == ndt.string:
            nptype = np.dtype("U%d" % MAXCHARS)
        elif dytype == ndt.bytes:
            nptype = np.dtype("S%d" % MAXCHARS)
        else:
            # There should be no problems with the rest
            nptype = dytype.as_numpy()
        return nptype

    if val is None:
        types = [(bytes(name), get_nptype(dtype, name))
                 for name in nd.as_py(dtype.field_names)]
        nptype = np.dtype(types)
    else:
        nptype = get_nptype(dtype, val)

    # Start reading chunks
    prev_keys = set()
    while True:
        ndbuf = nd.array(islice(sreader, lines_per_chunk), dtype)
        if len(ndbuf) == 0: break   # CSV data exhausted

        # Do the groupby for this chunk
        keys = getattr(ndbuf, key)
        if val is None:
            vals = ndbuf
        else:
            vals = getattr(ndbuf, val)
        sby = nd.groupby(vals, keys)
        lkeys = nd.as_py(sby.groups)   # keep the key order aligned with sby's groups
        skeys = set(lkeys)
        # BLZ does not understand dynd objects (yet)
        sby = nd.as_py(sby.eval())

        if len(prev_keys) == 0:
            # If a persistent path was given and already exists, remove it
            # and every directory below it
            if path is not None and os.path.exists(path): rmtree(path)
            # Add the initial keys to a BLZ table
            columns = [np.array(sby[i], nptype) for i in range(len(lkeys))]
            ssby = blz.btable(columns=columns, names=lkeys, rootdir=path)
        else:
            # Do we have any new keys?
            new_keys = skeys.difference(prev_keys)
            for new_key in new_keys:
                # Get the index of the new key
                idx = lkeys.index(new_key)
                # and add its values as a new column
                ssby.addcol(sby[idx], new_key, dtype=nptype)
            # Now fill the pre-existing keys
            existing_keys = skeys.intersection(prev_keys)
            for existing_key in existing_keys:
                # Get the index of the existing key
                idx = lkeys.index(existing_key)
                # and append the values here
                ssby[existing_key].append(sby[idx])
            assert skeys == existing_keys | new_keys

        # Add the new keys to the existing ones
        prev_keys |= skeys

    # Before returning, flush all data to disk
    ssby.flush()
    return ssby
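
The inline get_nptype helper is what bridges DyND field types and the numpy dtypes
that BLZ stores: variable-length string and bytes fields become fixed-width numpy
strings of MAXCHARS characters, while numeric fields are converted directly. A small
sketch of that mapping (MAXCHARS is assumed to be a module-level constant, e.g. 64):

import numpy as np
from dynd import ndt

MAXCHARS = 64    # assumed; defined elsewhere in the original module

print(np.dtype("U%d" % MAXCHARS))    # what a 'string' field is stored as
print(np.dtype("S%d" % MAXCHARS))    # what a 'bytes' field is stored as
print(ndt.float32.as_numpy())        # numeric fields go through DyND's as_numpy()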