def test_grouped_slices(self):
    a = nd.asarray([[1, 2, 3], [1, 4, 5]])
    gb = nd.groupby(a[:, 1:], a[:, 0])
    self.assertEqual(nd.as_py(gb.groups), [1])
    self.assertEqual(nd.as_py(gb), [[[2, 3], [4, 5]]])

    a = nd.asarray([[1, 2, 3], [3, 1, 7], [1, 4, 5], [2, 6, 7], [3, 2, 5]])
    gb = nd.groupby(a[:, 1:], a[:, 0])
    self.assertEqual(nd.as_py(gb.groups), [1, 2, 3])
    self.assertEqual(nd.as_py(gb),
                     [[[2, 3], [4, 5]],
                      [[6, 7]],
                      [[1, 7], [2, 5]]])
def groupby(self, json_cmd):
    print('GroupBy operation')
    cmd = json.loads(json_cmd)
    array_url = cmd.get('input', self.base_url + self.array_name)
    if not array_url.startswith(self.base_url):
        raise RuntimeError('Input array must start with the base url')
    array_name = array_url[len(self.base_url):]
    fields = cmd['fields']
    arr = self.get_session_array(array_name)[...].ddesc.dynd_arr()
    # Do the groupby, get its groups, then evaluate it because
    # deferred operations through the groupby won't work well yet.
    res = nd.groupby(arr, nd.fields(arr, *fields))
    groups = res.groups
    res = res.eval()
    # Write out the groupby result
    defarr_gb = self.array_provider.create_deferred_array_filename(
        self.session_name, 'groupby_', array(res))
    dshape_gb = nd.dshape_of(res)
    defarr_gb[0].write(json.dumps({
        'dshape': dshape_gb,
        'command': 'groupby',
        'params': {
            'fields': fields
        }
    }))
    defarr_gb[0].close()
    # Write out the groups
    defarr_groups = self.array_provider.create_deferred_array_filename(
        self.session_name, 'groups_', groups)
    dshape_groups = nd.dshape_of(groups)
    defarr_groups[0].write(json.dumps({
        'dshape': dshape_groups,
        'command': 'groupby.groups',
        'params': {
            'fields': fields
        }
    }))
    defarr_groups[0].close()
    content_type = 'application/json; charset=utf-8'
    body = json.dumps({
        'session': self.base_url + self.session_name,
        'output_gb': self.base_url + defarr_gb[1],
        'dshape_gb': dshape_gb,
        'output_groups': self.base_url + defarr_groups[1],
        'dshape_groups': dshape_groups
    })
    return (content_type, body)
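# A minimal sketch of how a caller might drive the groupby handler above.
# `server` stands for a hypothetical instance of the class that defines the
# method; the base URL, array name and field names are placeholders, not
# values taken from the original code.
import json

json_cmd = json.dumps({
    # 'input' is optional; the handler falls back to base_url + array_name
    'input': 'http://localhost:8080/my_session/my_array',
    # struct fields of the deferred array to group by
    'fields': ['cat'],
})
content_type, body = server.groupby(json_cmd)
response = json.loads(body)
# The response carries URLs and dshapes for both outputs:
#   'session', 'output_gb', 'dshape_gb', 'output_groups', 'dshape_groups'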
def test_immutable(self):
    a = nd.array([
        ('x', 0),
        ('y', 1),
        ('x', 2),
        ('x', 3),
        ('y', 4)],
        dtype='{A: string, B: int32}').eval_immutable()
    gb = nd.groupby(a, nd.fields(a, 'A'))
    self.assertEqual(nd.as_py(gb.groups), [{'A': 'x'}, {'A': 'y'}])
    self.assertEqual(nd.as_py(gb), [
        [{'A': 'x', 'B': 0}, {'A': 'x', 'B': 2}, {'A': 'x', 'B': 3}],
        [{'A': 'y', 'B': 1}, {'A': 'y', 'B': 4}]])
def test_aggregate(self):
    a = nd.array([
        ('A', 1, 2),
        ('A', 3, 4),
        ('B', 1.5, 2.5),
        ('A', 0.5, 9),
        ('C', 1, 5),
        ('B', 2, 2)],
        dtype='c{cat: string, x: float32, y: float32}')
    gb = nd.groupby(a, nd.fields(a, 'cat')).eval()
    b = nd.make_computed_fields(gb, 1,
            fields=[('sum_x', ndt.float32, 'sum(x)'),
                    ('mean_y', ndt.float32, 'mean(y)'),
                    ('max_x', ndt.float32, 'max(x)'),
                    ('max_y', ndt.float32, 'max(y)')])
    self.assertEqual(nd.as_py(b.sum_x), [4.5, 3.5, 1])
    self.assertEqual(nd.as_py(b.mean_y), [5, 2.25, 5])
    self.assertEqual(nd.as_py(b.max_x), [3, 2, 1])
    self.assertEqual(nd.as_py(b.max_y), [9, 2.5, 5])
def test_type_id(self):
    # Numeric type id
    self.assertEqual(self.type_id_of(ndt.bool), _lowlevel.type_id.BOOL)
    self.assertEqual(self.type_id_of(ndt.int8), _lowlevel.type_id.INT8)
    self.assertEqual(self.type_id_of(ndt.int16), _lowlevel.type_id.INT16)
    self.assertEqual(self.type_id_of(ndt.int32), _lowlevel.type_id.INT32)
    self.assertEqual(self.type_id_of(ndt.int64), _lowlevel.type_id.INT64)
    self.assertEqual(self.type_id_of(ndt.uint8), _lowlevel.type_id.UINT8)
    self.assertEqual(self.type_id_of(ndt.uint16), _lowlevel.type_id.UINT16)
    self.assertEqual(self.type_id_of(ndt.uint32), _lowlevel.type_id.UINT32)
    self.assertEqual(self.type_id_of(ndt.uint64), _lowlevel.type_id.UINT64)
    self.assertEqual(self.type_id_of(ndt.float32), _lowlevel.type_id.FLOAT32)
    self.assertEqual(self.type_id_of(ndt.float64), _lowlevel.type_id.FLOAT64)
    self.assertEqual(self.type_id_of(ndt.complex_float32),
                     _lowlevel.type_id.COMPLEX_FLOAT32)
    self.assertEqual(self.type_id_of(ndt.complex_float64),
                     _lowlevel.type_id.COMPLEX_FLOAT64)
    # String/bytes
    self.assertEqual(self.type_id_of(ndt.string), _lowlevel.type_id.STRING)
    self.assertEqual(self.type_id_of(ndt.make_fixedstring(16)),
                     _lowlevel.type_id.FIXEDSTRING)
    self.assertEqual(self.type_id_of(ndt.bytes), _lowlevel.type_id.BYTES)
    self.assertEqual(self.type_id_of(ndt.make_fixedbytes(16)),
                     _lowlevel.type_id.FIXEDBYTES)
    self.assertEqual(self.type_id_of(ndt.json), _lowlevel.type_id.JSON)
    # Date
    self.assertEqual(self.type_id_of(ndt.date), _lowlevel.type_id.DATE)
    # Property
    self.assertEqual(self.type_id_of(nd.type_of(ndt.date(2000, 1, 1).year)),
                     _lowlevel.type_id.PROPERTY)
    # Categorical
    self.assertEqual(self.type_id_of(ndt.make_categorical([1, 2, 3])),
                     _lowlevel.type_id.CATEGORICAL)
    # Struct
    self.assertEqual(self.type_id_of(ndt.make_struct(
                         [ndt.int32, ndt.int32], ['x', 'y'])),
                     _lowlevel.type_id.STRUCT)
    self.assertEqual(self.type_id_of(ndt.type('{x : int32, y : int32}')),
                     _lowlevel.type_id.FIXEDSTRUCT)
    # Convert/byteswap/view
    self.assertEqual(self.type_id_of(ndt.make_convert(ndt.int32, ndt.int8)),
                     _lowlevel.type_id.CONVERT)
    self.assertEqual(self.type_id_of(ndt.make_byteswap(ndt.int32)),
                     _lowlevel.type_id.BYTESWAP)
    self.assertEqual(self.type_id_of(ndt.make_view(ndt.int32, ndt.uint32)),
                     _lowlevel.type_id.VIEW)
    # CUDA types
    if ndt.cuda_support:
        self.assertEqual(self.type_id_of(ndt.type('cuda_device[int32]')),
                         _lowlevel.type_id.CUDA_DEVICE)
        self.assertEqual(self.type_id_of(ndt.type('cuda_host[int32]')),
                         _lowlevel.type_id.CUDA_HOST)
    # Uniform arrays
    self.assertEqual(self.type_id_of(ndt.type('3 * int32')),
                     _lowlevel.type_id.FIXED_DIM)
    self.assertEqual(self.type_id_of(ndt.type('strided * int32')),
                     _lowlevel.type_id.STRIDED_DIM)
    self.assertEqual(self.type_id_of(ndt.type('var * int32')),
                     _lowlevel.type_id.VAR_DIM)
    # GroupBy
    self.assertEqual(self.type_id_of(nd.type_of(nd.groupby([1, 2], ['a', 'a']))),
                     _lowlevel.type_id.GROUPBY)
    # Type
    self.assertEqual(self.type_id_of(ndt.type('type')), _lowlevel.type_id.TYPE)
def groupby(sreader, key, val, dtype, path=None, lines_per_chunk=LPC):
    """Group the `val` field in the `sreader` stream of lines by the `key` field.

    Parameters
    ----------
    sreader : iterator
        Iterator over a stream of CSV lines.
    key : string
        The name of the field to be grouped by.
    val : string
        The field name with the values that have to be grouped.
    dtype : dynd dtype
        The DyND data type with all the fields of the CSV lines,
        including the `key` and `val` names.
    path : string
        The path of the file where the BLZ array with the final
        grouping will be stored.  If None (default), the BLZ will be
        stored in-memory (and hence non-persistent).
    lines_per_chunk : int
        The number of lines that are read and grouped in memory at a
        time.  Some experimentation may be needed for optimal
        performance, but the default value should work reasonably well.

    Returns
    -------
    output : BLZ table
        Returns a BLZ table whose column names are the groups resulting
        from the groupby operation.  The columns are filled with the
        `val` field of the lines delivered by `sreader`.

    """
    try:
        nptype = get_nptype(dtype, val)
    except ValueError:
        raise ValueError("`val` should be a valid field")

    # Start reading chunks
    prev_keys = set()
    while True:
        ndbuf = nd.array(islice(sreader, lines_per_chunk), dtype)
        if len(ndbuf) == 0:
            break  # CSV data exhausted

        # Do the groupby for this chunk
        keys = getattr(ndbuf, key)
        if val is None:
            vals = ndbuf
        else:
            vals = getattr(ndbuf, val)
        sby = nd.groupby(vals, keys)
        lkeys = nd.as_py(sby.groups)
        skeys = set(lkeys)
        # BLZ does not understand dynd objects (yet)
        sby = nd.as_py(sby.eval())

        if len(prev_keys) == 0:
            # Add the initial keys to a BLZ table
            columns = [np.array(sby[i], nptype) for i in range(len(lkeys))]
            ssby = blz.btable(columns=columns, names=lkeys,
                              rootdir=path, mode='w')
        else:
            # Do we have new keys?
            new_keys = skeys.difference(prev_keys)
            for new_key in new_keys:
                # Get the index of the new key
                idx = lkeys.index(new_key)
                # and add the values as a new column
                ssby.addcol(sby[idx], new_key, dtype=nptype)
            # Now fill the pre-existing keys
            existing_keys = skeys.intersection(prev_keys)
            for existing_key in existing_keys:
                # Get the index of the existing key
                idx = lkeys.index(existing_key)
                # and append the values here
                ssby[existing_key].append(sby[idx])

        # Add the new keys to the existing ones
        prev_keys |= skeys

    # Before returning, flush all data to disk
    if path is not None:
        ssby.flush()
    return ssby
def groupby(sreader, key, val, dtype, path=None, lines_per_chunk=LPC):
    """Group the `val` field in the `sreader` stream of lines by the `key` field.

    Parameters
    ----------
    sreader : iterator
        Iterator over a stream of CSV lines.
    key : string
        The name of the field to be grouped by.
    val : string
        The field name with the values that have to be grouped.
    dtype : dynd dtype
        The DyND data type with all the fields of the CSV lines,
        including the `key` and `val` names.
    path : string
        The path of the file where the BLZ array with the final
        grouping will be stored.  If None (default), the BLZ will be
        stored in-memory (and hence non-persistent).
    lines_per_chunk : int
        The number of lines that are read and grouped in memory at a
        time.  Some experimentation may be needed for optimal
        performance, but the default value should work reasonably well.

    Returns
    -------
    output : BLZ table
        Returns a BLZ table whose column names are the groups resulting
        from the groupby operation.  The columns are filled with the
        `val` field of the lines delivered by `sreader`.

    """
    def get_nptype(dtype, val):
        # Convert the `val` field into a numpy dtype
        dytype = dtype[nd.as_py(dtype.field_names).index(val)]
        # strings and bytes cannot be natively represented in numpy
        if dytype == ndt.string:
            nptype = np.dtype("U%d" % MAXCHARS)
        elif dytype == ndt.bytes:
            nptype = np.dtype("S%d" % MAXCHARS)
        else:
            # There should be no problems with the rest
            nptype = dytype.as_numpy()
        return nptype

    if val is None:
        # No `val` given: keep every field as a structured numpy dtype
        types = [(bytes(name), get_nptype(dtype, name))
                 for name in nd.as_py(dtype.field_names)]
        nptype = np.dtype(types)
    else:
        nptype = get_nptype(dtype, val)

    # Start reading chunks
    prev_keys = set()
    while True:
        ndbuf = nd.array(islice(sreader, lines_per_chunk), dtype)
        if len(ndbuf) == 0:
            break  # CSV data exhausted

        # Do the groupby for this chunk
        keys = getattr(ndbuf, key)
        if val is None:
            vals = ndbuf
        else:
            vals = getattr(ndbuf, val)
        sby = nd.groupby(vals, keys)
        # Keep the group order so indexes into `sby` line up with key names
        lkeys = nd.as_py(sby.groups)
        skeys = set(lkeys)
        # BLZ does not understand dynd objects (yet)
        sby = nd.as_py(sby.eval())

        if len(prev_keys) == 0:
            # Check path and, if it exists, remove it and every
            # directory below it
            if path is not None and os.path.exists(path):
                rmtree(path)
            # Add the initial keys to a BLZ table
            columns = [np.array(sby[i], nptype) for i in range(len(lkeys))]
            ssby = blz.btable(columns=columns, names=lkeys, rootdir=path)
        else:
            # Do we have new keys?
            new_keys = skeys.difference(prev_keys)
            for new_key in new_keys:
                # Get the index of the new key
                idx = lkeys.index(new_key)
                # and add the values as a new column
                ssby.addcol(sby[idx], new_key, dtype=nptype)
            # Now fill the pre-existing keys
            existing_keys = skeys.intersection(prev_keys)
            for existing_key in existing_keys:
                # Get the index of the existing key
                idx = lkeys.index(existing_key)
                # and append the values here
                ssby[existing_key].append(sby[idx])
            assert skeys == existing_keys | new_keys

        # Add the new keys to the existing ones
        prev_keys |= skeys

    # Before returning, flush all data to disk
    ssby.flush()
    return ssby
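# A minimal sketch of how the chunked groupby above might be driven.  The file
# name, field names, dtype and output directory are hypothetical placeholders;
# whether `nd.array` accepts raw CSV lines or pre-split rows depends on the
# DyND version, so passing the file iterator directly is an assumption taken
# from the docstring ("stream of CSV lines").
from dynd import nd, ndt

# One dynd struct type describing every CSV column, including `key` and `val`
dtype = ndt.type('{key: string, val: float64}')

with open('input.csv') as csvfile:
    next(csvfile)  # skip the header line
    ssby = groupby(csvfile, 'key', 'val', dtype,
                   path='grouped.blz', lines_per_chunk=100)

print(ssby)  # the resulting btable: one column per distinct value of `key`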