def from_keys(cls, keys, dtype=None, fingerprint=''):
    """Build a ``HashMapUnique`` from an array of keys.

    ``keys`` is first converted with
    ``vaex.array_types.convert(keys, 'numpy-arrow')``.  At most one null
    (masked or missing) key is accepted; its position is handed to the
    underlying ordered set as ``null_value`` (``-1`` when there is none).

    :param keys: array with the keys
    :param dtype: dtype used to pick the ordered set type; derived from ``keys`` when None
    :param fingerprint: fingerprint string forwarded to the ordered set
    :return: ``HashMapUnique`` wrapping the freshly constructed ordered set
    :raises ValueError: when ``keys`` contains more than one null value
    """
    def single_null_index(mask):
        # Exactly one entry of ``mask`` is set, so ``np.where`` yields a
        # length-1 index array. Pass on a plain int instead of that array:
        # implicit conversion of non-0d arrays to scalars is deprecated in NumPy.
        return int(np.where(mask == 1)[0][0])

    keys = vaex.array_types.convert(keys, 'numpy-arrow')
    dtype = vaex.dtype_of(keys) if dtype is None else dtype
    nancount = np.isnan(keys).sum() if dtype == float else 0
    null_count = 0
    null_value = -1
    if np.ma.isMaskedArray(keys):
        null_count = keys.mask.sum()
        if null_count == 1:
            null_value = single_null_index(keys.mask)
        elif null_count > 1:
            raise ValueError('key arrays contained more than 1 null value')
        # the mask is now described by null_value/null_count, continue with the raw data
        keys = keys.data
    set_type = vaex.hash.ordered_set_type_from_dtype(dtype)
    if dtype.is_string:
        values = vaex.column.ColumnStringArrow.from_arrow(keys)
        string_sequence = values.string_sequence
        mask = string_sequence.mask()
        if mask is not None:
            null_count = mask.sum()
            if null_count == 1:
                null_value = single_null_index(mask)
            elif null_count > 1:
                raise ValueError('key arrays contained more than 1 null value')
        hash_map_unique_internal = set_type(string_sequence, null_value, nancount, null_count, fingerprint)
    else:
        hash_map_unique_internal = set_type(keys, null_value, nancount, null_count, fingerprint)
    return HashMapUnique(dtype, _internal=hash_map_unique_internal)
def reduce(self, others):
    """Merge the sets of ``others`` into the set of this task.

    Tasks whose ``set`` is None are skipped.  When ``self.return_inverse``
    is set, the values of all chunks are flattened into ``self.values``
    using the merged set.  When ``self.unique_limit`` is set and the merged
    set is larger, ``vaex.RowLimitException`` is raised.  The merged set is
    stored on ``self.set`` and gets a fingerprint derived from
    ``self.fingerprint``.

    :param others: list of tasks whose ``set`` should be merged into ours
    """
    import time

    # only tasks that actually produced a set take part in the merge
    available = [task.set for task in [self] + others if task.set is not None]
    set_merged, *remaining = available
    t0 = time.time()
    set_merged.merge(remaining)
    logger.info(
        f'merge took {time.time()-t0} seconds, size {len(set_merged):,}, byte_size {sys.getsizeof(set_merged):,}'
    )

    if self.return_inverse:
        # sort by row index
        self.chunks.sort(key=lambda chunk: chunk[0])
        length = sum(len(values) for _, _, values, _ in self.chunks)
        self.values = np.empty(length, vaex.dtype_of(self.chunks[0][2]).numpy)
        # TODO: we could do this parallel, but overhead is small
        for i1, i2, values, map_index in self.chunks:
            set_merged.flatten_values(values, map_index, self.values[i1:i2])

    if self.unique_limit is not None:
        count = len(set_merged)
        if count > self.unique_limit:
            raise vaex.RowLimitException(
                f'Resulting set has {count:,} unique combinations, which is larger than the allowed value of {self.unique_limit:,}'
            )

    self.set = set_merged
    self.set.fingerprint = f'set-{self.fingerprint}'
def process(_ignore):
    """Translate the combined bin values back into the labels of the parents.

    Each value in ``self.bin_values`` is split, level by level, into one
    index per parent using ``multipliers`` (integer division gives the
    index, the remainder is passed on to the next level).  These indices
    are then used to ``take`` the matching entries from every parent's
    ``bin_values``.

    :param _ignore: unused
    :return: ``pa.StructArray`` with one field per (flattened) parent label
    """
    logger.info(f"extracing indices of parent groupers ({self.N:,} unique rows)")
    decoder = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
    decoder[f'index_0'] = decoder['bin_value'] // multipliers[0]
    decoder[f'leftover_0'] = decoder[f'bin_value'] % multipliers[0]
    for i in range(1, len(multipliers)):
        decoder[f'index_{i}'] = decoder[f'leftover_{i-1}'] // multipliers[i]
        decoder[f'leftover_{i}'] = decoder[f'leftover_{i-1}'] % multipliers[i]
    columns = [f'index_{i}' for i in range(len(multipliers))]
    evaluated = decoder.evaluate(columns, progress=progressbar)

    def compress(ar):
        # integer indices are stored in the smallest dtype that can hold their maximum
        if vaex.dtype_of(ar).kind != 'i':
            return ar
        ar = vaex.array_types.to_numpy(ar)
        max_value = ar.max()
        return ar.astype(vaex.utils.required_dtype_for_max(max_value))

    indices_parents = [compress(ar) for ar in evaluated]

    labels = {}
    logger.info(f"extracing labels of parent groupers...")
    # NOTE: we can also use dict encoding instead of take
    for indices, parent in zip(indices_parents, parents):
        if sort:
            assert parent.pre_sort, "cannot sort while parent not presorted"
            assert parent.sort_indices is None
        parent_values = parent.bin_values
        if vaex.dtype_of(parent_values).is_struct:
            # collapse parent struct into our flat struct
            for field, ar in zip(parent_values.type, parent_values.flatten()):
                labels[field.name] = ar.take(indices)
        else:
            labels[parent.label] = parent_values.take(indices)
    logger.info(f"extracing labels of parent groupers done")
    return pa.StructArray.from_arrays(labels.values(), labels.keys())
def compress(ar):
    """Downcast an integer array to the dtype required for its maximum.

    Arrays whose dtype kind is not ``'i'`` are returned untouched.

    :param ar: array to compress
    :return: numpy array with the dtype given by
        ``vaex.utils.required_dtype_for_max``, or ``ar`` itself
    """
    if vaex.dtype_of(ar).kind != 'i':
        return ar
    as_numpy = vaex.array_types.to_numpy(ar)
    max_value = as_numpy.max()
    return as_numpy.astype(vaex.utils.required_dtype_for_max(max_value))
def process(set):
    """Finish the construction of the grouper once the set is available.

    Stores the keys of ``set`` as ``self.bin_values`` (downcast for
    integers, masked at ``set.null_value`` when the set holds a null and
    the dtype is primitive or datetime), optionally sorts them, and fills
    in ``self.sort_indices``, ``self.set``, ``self.basename`` and
    ``self.N``.

    :param set: ordered set holding the unique values of the expression
    """
    self.bin_values = set.key_array()

    if isinstance(self.bin_values, vaex.superstrings.StringList64):
        # TODO: find out why the more efficient path via
        # ColumnStringArrow.from_string_sequence + pa.array does not work
        self.bin_values = pa.array(self.bin_values.to_numpy())
    if vaex.dtype_of(self.bin_values) == int:
        max_value = self.bin_values.max()
        self.bin_values = self.bin_values.astype(vaex.utils.required_dtype_for_max(max_value))
    logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

    if set.has_null and (dtype.is_primitive or dtype.is_datetime):
        mask = np.zeros(shape=self.bin_values.shape, dtype="?")
        mask[set.null_value] = 1
        self.bin_values = np.ma.array(self.bin_values, mask=mask)

    if not self.sort:
        self.sort_indices = None
    else:
        self.bin_values = vaex.array_types.to_arrow(self.bin_values)
        indices = pa.compute.sort_indices(self.bin_values)
        if pre_sort:
            self.bin_values = pa.compute.take(self.bin_values, indices)
            # arrow sorts with null last
            null_value = -1 if not set.has_null else len(self.bin_values) - 1
            fingerprint = set.fingerprint + "-sorted"
            # rebuild a set of the same type from the sorted keys
            if dtype.is_string:
                sorted_keys = vaex.column.ColumnStringArrow.from_arrow(self.bin_values).string_sequence
            else:
                sorted_keys = self.bin_values
            set = type(set)(sorted_keys, null_value, set.nan_count, set.null_count, fingerprint)
            self.sort_indices = None
        else:
            self.sort_indices = vaex.array_types.to_numpy(indices)
            # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
            self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)

    self.set = set
    self.basename = 'set_%s' % vaex.utils._python_save_name(str(self.expression) + "_" + set.fingerprint)
    self.N = len(self.bin_values)
    # for datetimes, we converted to int
    if dtype.is_datetime:
        self.bin_values = dtype.create_array(self.bin_values)
def decode(encoding, obj_spec):
    """Reconstruct a ``vaex.hash.HashMapUnique`` from its specification.

    :param encoding: object whose ``decode`` method restores the ``'array'``
        and ``'dtype'`` entries
    :param obj_spec: dict with the keys ``'class'``, ``'data'`` and ``'dtype'``
    :return: ``vaex.hash.HashMapUnique`` wrapping the rebuilt internal hash map
    """
    internal_type = getattr(vaex.hash, obj_spec['class'])
    data = obj_spec['data']
    keys = encoding.decode('array', data['keys'])
    # string keys are passed to the internal hash map as a string sequence
    if vaex.dtype_of(keys).is_string:
        keys = vaex.strings.to_string_sequence(keys)
    internal = internal_type(
        keys,
        data['null_value'],
        data['nan_count'],
        data['missing_count'],
        data['fingerprint'],
    )
    return vaex.hash.HashMapUnique(encoding.decode('dtype', obj_spec['dtype']), _internal=internal)
def list_unwrap(ar, level=-1):
    '''Returns the values in a (nested) list, and a callable that puts it back in the same structure

    :param ar: (nested) list array; a non-list array is returned as is
    :param level: which nesting level to return, -1 (default) for the innermost values
    :return: tuple ``(values, wrapper)`` where ``wrapper(new_values)`` wraps
        ``new_values`` in list arrays built from the recorded buffers
    '''
    from .convert import trim_offsets
    list_parameters = []
    dtype = vaex.dtype_of(ar)
    array_levels = [ar]
    while dtype.is_list:
        list_parameters.append([ar.type, len(ar), ar.buffers(), ar.null_count, ar.offset])
        i1 = ar.offsets[0].as_py()
        i2 = ar.offsets[-1].as_py()
        # pyarrow's Array.slice takes (offset, length), not (start, stop); passing
        # i2 as the length would include values past the last list when i1 > 0
        ar = ar.values.slice(i1, i2 - i1)
        array_levels.append(ar)
        dtype = dtype.value_type

    if level == -1:
        ar = array_levels[-1]
    else:
        # NOTE(review): array_levels[level] has been unwrapped `level` times, while
        # level + 1 list levels are kept for re-wrapping - confirm this is intended
        ar = array_levels[level]
        list_parameters = list_parameters[:level + 1]

    def wrapper(new_values):
        if not list_parameters:
            return new_values
        wrapped = None
        # rebuild from the innermost list level outwards
        for list_type, length, buffers, null_count, offset in list_parameters[::-1]:
            if wrapped is None:
                # innermost level: the values were sliced to start at the first
                # offset, so the offsets are trimmed and the array offset reset
                buffers = trim_offsets(offset, length, *buffers[:2])
                offset = 0
                new_values = vaex.array_types.to_arrow(new_values)
                list_type = pa.list_(new_values.type)
                child = new_values
            else:
                child = wrapped
            wrapped = pa.ListArray.from_buffers(list_type, length, [buffers[0], buffers[1]], null_count, offset, children=[child])
        return wrapped

    return ar, wrapper
def __init__(self, expression, df, multipliers, parents, sort, row_limit=None):
    '''Group by a single expression that is composed of multiple expressions.

    Used in the sparse/combined group by.  After the parent class has been
    initialized, the combined ``self.bin_values`` are replaced by a struct
    array holding the matching bin values of every parent.

    :param multipliers: one multiplier per parent, the last one must be 1
    :param parents: groupers whose bin values were combined into ``expression``
    '''
    super().__init__(expression, df, sort=sort, row_limit=row_limit)
    assert len(multipliers) == len(parents)
    assert multipliers[-1] == 1
    self.df = df
    self.label = 'SHOULD_NOT_BE_USED'
    self.expression = expression

    # efficient way to find the original bin values (parent.bin_value) from the 'compressed'
    # self.bin_values: divide by the multiplier of a level to get the index of that parent,
    # and hand the remainder to the next level
    decoder = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
    decoder[f'index_0'] = decoder['bin_value'] // multipliers[0]
    decoder[f'leftover_0'] = decoder[f'bin_value'] % multipliers[0]
    for i in range(1, len(multipliers)):
        decoder[f'index_{i}'] = decoder[f'leftover_{i-1}'] // multipliers[i]
        decoder[f'leftover_{i}'] = decoder[f'leftover_{i-1}'] % multipliers[i]
    columns = [f'index_{i}' for i in range(len(multipliers))]
    indices_parents = decoder.evaluate(columns)

    labels = {}
    for indices, parent in zip(indices_parents, parents):
        parent_values = parent.bin_values
        if vaex.dtype_of(parent_values).is_struct:
            # collapse parent struct into our flat struct
            for field, ar in zip(parent_values.type, parent_values.flatten()):
                labels[field.name] = ar.take(indices)
        else:
            labels[parent.label] = parent_values.take(indices)
    self.bin_values = pa.StructArray.from_arrays(labels.values(), labels.keys())
def __init__(self, expression, df, multipliers, parents, sort, row_limit=None):
    '''Group by a single expression that is composed of multiple expressions.

    Used in the sparse/combined group by.  After the parent class has been
    initialized, the combined ``self.bin_values`` are replaced by a struct
    array holding the matching bin values of every parent.

    :param multipliers: one multiplier per parent, the last one must be 1
    :param parents: groupers whose bin values were combined into ``expression``
    '''
    super().__init__(expression, df, sort=sort, row_limit=row_limit)
    assert len(multipliers) == len(parents)
    assert multipliers[-1] == 1
    self.df = df
    self.label = 'SHOULD_NOT_BE_USED'
    self.expression = expression

    # efficient way to find the original bin values (parent.bin_value) from the 'compressed'
    # self.bin_values: divide by the multiplier of a level to get the index of that parent,
    # and hand the remainder to the next level
    decoder = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
    decoder[f'index_0'] = decoder['bin_value'] // multipliers[0]
    decoder[f'leftover_0'] = decoder[f'bin_value'] % multipliers[0]
    for i in range(1, len(multipliers)):
        decoder[f'index_{i}'] = decoder[f'leftover_{i-1}'] // multipliers[i]
        decoder[f'leftover_{i}'] = decoder[f'leftover_{i-1}'] % multipliers[i]
    columns = [f'index_{i}' for i in range(len(multipliers))]
    evaluated = decoder.evaluate(columns)

    def compress(ar):
        # integer indices are stored in the smallest dtype that can hold their maximum
        if vaex.dtype_of(ar).kind != 'i':
            return ar
        ar = vaex.array_types.to_numpy(ar)
        max_value = ar.max()
        return ar.astype(vaex.utils.required_dtype_for_max(max_value))

    indices_parents = [compress(ar) for ar in evaluated]

    labels = {}
    # NOTE: we can also use dict encoding (pa.DictionaryArray.from_arrays) instead of take
    for indices, parent in zip(indices_parents, parents):
        parent_values = parent.bin_values
        if vaex.dtype_of(parent_values).is_struct:
            # collapse parent struct into our flat struct
            for field, ar in zip(parent_values.type, parent_values.flatten()):
                labels[field.name] = ar.take(indices)
        else:
            labels[parent.label] = parent_values.take(indices)
    self.bin_values = pa.StructArray.from_arrays(labels.values(), labels.keys())
def _map_hdf5_array(self, data, mask=None, as_arrow=False):
    """Map an hdf5 dataset to an array or column.

    When the dataset reports no offset (and is not empty) it is wrapped in
    a ``ColumnNumpyLike`` and ``self._all_mmapped`` is set to False.
    Otherwise it is mapped via ``self._map_array``, optionally converted to
    arrow, and optionally combined with ``mask``.

    :param data: hdf5 dataset to map
    :param mask: optional hdf5 dataset holding a byte mask
    :param as_arrow: return an arrow array (or lazily cast column) instead of numpy
    :raises TypeError: when both ``mask`` and ``as_arrow`` are given
    """
    offset = data.id.get_offset()
    if len(data) == 0 and offset is None:
        offset = 0  # we don't care about the offset for empty arrays

    if offset is None:
        # non contiguous array, chunked arrays etc
        # we don't support masked in this case
        assert as_arrow is False
        column = ColumnNumpyLike(data)
        self._all_mmapped = False
        return column

    dtype = data.dtype
    # ignore the special str type, which is not a numpy dtype
    if "dtype" in data.attrs and data.attrs["dtype"] != "str":
        dtype = data.attrs["dtype"]
        if dtype == 'utf32':
            dtype = np.dtype('U' + str(data.attrs['dlength']))
    array = self._map_array(offset, dtype=dtype, shape=data.shape)

    if as_arrow:
        if isinstance(array, np.ndarray):
            array = vaex.array_types.to_arrow(array)
        else:
            array = vaex.column.ColumnArrowLazyCast(array, vaex.dtype_of(array).arrow)

    if mask is None:
        return array
    if as_arrow:
        raise TypeError('Arrow does not support byte masks')
    mask_array = self._map_hdf5_array(mask)
    if isinstance(array, np.ndarray):
        return np.ma.array(array, mask=mask_array, shrink=False)
    return vaex.column.ColumnMaskedNumpy(array, mask_array)
def process(hashmap_unique):
    """Finish the construction of the grouper once the hash map is available.

    When simplification is allowed and the integer keys fill a range with
    at most 25% unused slots, a ``BinnerInteger`` is stored on
    ``self.simpler`` and nothing else is set.  Otherwise the keys become
    ``self.bin_values`` (downcast for integers, optionally sorted) and
    ``self.sort_indices``, ``self.hashmap_unique``, ``self.basename`` and
    ``self.N`` are filled in.

    :param hashmap_unique: hash map holding the unique values of the expression
    """
    self.bin_values = hashmap_unique.keys()

    if self.allow_simplify and dtype == int and len(self.bin_values):
        vmin = self.bin_values.min()
        vmax = self.bin_values.max()
        int_range = vmax - vmin + 1
        # we allow for 25% unused 'slots'
        bins = len(self.bin_values)
        if int_range <= (bins * 4 / 3):
            self.simpler = BinnerInteger(
                self.expression,
                min_value=vmin,
                max_value=vmax,
                dropmissing=not hashmap_unique.has_null,
                dense=bins == int_range,
            )
            return

    if vaex.dtype_of(self.bin_values) == int and len(self.bin_values):
        max_value = self.bin_values.max()
        self.bin_values = self.bin_values.astype(vaex.utils.required_dtype_for_max(max_value))
    logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

    if not self.sort:
        self.sort_indices = None
    elif pre_sort:
        hashmap_unique, self.bin_values = hashmap_unique.sorted(keys=self.bin_values, return_keys=True)
        self.sort_indices = None
    else:
        indices = pa.compute.sort_indices(self.bin_values)
        self.sort_indices = vaex.array_types.to_numpy(indices)
        # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
        self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)

    self.hashmap_unique = hashmap_unique
    self.basename = 'hashmap_unique_%s' % vaex.utils._python_save_name(str(self.expression) + "_" + hashmap_unique.fingerprint)
    self.N = len(self.bin_values)
    # for datetimes, we converted to int
    if dtype.is_datetime:
        self.bin_values = dtype.create_array(self.bin_values)
def __init__(self, expression, df=None, sort=False, pre_sort=True, row_limit=None, df_original=None, materialize_experimental=False):
    """Bin an expression by the set of unique values found in the dataframe.

    :param expression: expression (or its string form) to group by
    :param df: dataframe the grouper operates on (and modifies); ``expression.ds`` when None
    :param sort: sort the bin values
    :param pre_sort: when sorting, rebuild the set from the sorted values
        instead of keeping ``sort_indices``
    :param row_limit: passed as ``unique_limit`` when computing the set
    :param df_original: dataframe used to compute the set; ``df`` when None
    :param materialize_experimental: experimental path that adds the inverse
        of the set as a ``__materialized_<label>`` column to ``df``
    """
    # explicit None check: ``df or expression.ds`` would discard a dataframe that was
    # passed in but evaluates as falsy (presumably one without rows - truth testing
    # falls back to ``__len__``)
    self.df = df if df is not None else expression.ds
    # we prefer to calculate the set the original dataframe to have better cache hits, and modify df
    if df_original is None:
        df_original = self.df
    self.sort = sort
    self.expression = expression
    # make sure it's an expression
    self.expression = self.df[_ensure_string_from_expression(self.expression)]
    self.label = self.expression._label

    if materialize_experimental:
        set, values = df_original._set(self.expression, unique_limit=row_limit, return_inverse=True)
        # TODO: add column should have a unique argument
        self.df.add_column(f'__materialized_{self.label}', values)

        self.bin_values = set.key_array()
        if isinstance(self.bin_values, vaex.superstrings.StringList64):
            self.bin_values = pa.array(self.bin_values.to_numpy())
        # NOTE(review): 'bla' looks like a placeholder expression of this experimental path
        self.binby_expression = 'bla'
        self.N = len(self.bin_values)
        self.min_value = 0
        self.binner = self.df._binner_ordinal('bla', self.N, self.min_value)
        self.sort_indices = None
    else:
        set = df_original._set(self.expression, unique_limit=row_limit)
        self.bin_values = set.key_array()

        if isinstance(self.bin_values, vaex.superstrings.StringList64):
            # TODO: find out why this more efficient path does not work
            # col = vaex.column.ColumnStringArrow.from_string_sequence(self.bin_values)
            # self.bin_values = pa.array(col)
            self.bin_values = pa.array(self.bin_values.to_numpy())
        if vaex.dtype_of(self.bin_values).kind == 'i':
            # store integer keys in the smallest dtype that can hold the maximum
            max_value = self.bin_values.max()
            self.bin_values = self.bin_values.astype(vaex.utils.required_dtype_for_max(max_value))
        logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

        # since nan and null are at the start, we skip them with sorting
        if self.sort:
            dtype = self.expression.dtype
            indices = pa.compute.sort_indices(self.bin_values)
            if pre_sort:
                self.bin_values = pa.compute.take(self.bin_values, indices)
                # arrow sorts with null last
                null_value = -1 if not set.has_null else len(self.bin_values)-1
                fingerprint = set.fingerprint + "-sorted"
                # rebuild a set of the same type from the sorted values
                if dtype.is_string:
                    bin_values = vaex.column.ColumnStringArrow.from_arrow(self.bin_values)
                    string_sequence = bin_values.string_sequence
                    set = type(set)(string_sequence, null_value, set.nan_count, set.null_count, fingerprint)
                else:
                    set = type(set)(self.bin_values, null_value, set.nan_count, set.null_count, fingerprint)
                self.sort_indices = None
            else:
                # TODO: skip first or first two values (null and/or nan)
                self.sort_indices = vaex.array_types.to_numpy(indices)
                # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
        else:
            self.sort_indices = None
        self.set = set

        # TODO: we modify the dataframe in place, this is not nice
        basename = 'set_%s' % vaex.utils._python_save_name(str(expression))
        self.setname = self.df.add_variable(basename, self.set, unique=True)

        self.binby_expression = '_ordinal_values(%s, %s)' % (self.expression, self.setname)
        self.N = len(self.bin_values)
        self.bin_values = self.expression.dtype.create_array(self.bin_values)
        self.binner = self.df._binner_ordinal(self.binby_expression, self.N)
def _load_columns(self, h5data, first=[]):
    """Load every column group found in ``h5data`` and register it on this dataset.

    Groups carrying a ``type`` attribute are loaded as sparse ``csr_matrix``
    data or as a ``dictionary_encoded`` column.  Groups without a ``type``
    attribute are regular columns: their optional ``alias``, ``ucd``,
    ``description`` and ``unit``/``units`` attributes are read before the
    data is mapped.  Finally ``self._columns`` is rebuilt so that the names
    listed in the ``column_order`` attribute come first, followed by the
    remaining columns.

    :param h5data: group holding the columns directly (``self._version == 1``)
        or under ``h5data['columns']``
    :param first: unused, kept so the signature stays unchanged (the default
        list is never mutated)
    :raises ValueError: when a dictionary encoded column carries null data
    :raises TypeError: for an unknown ``type`` attribute, or a column without dtype
    """
    # unit strings as written by Amuse, mapped to the astropy unit they stand for
    amuse_units = {
        "(0.01 * system.get('S.I.').base('length'))": "cm",
        "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))": "cm/s",
        "(0.001 * system.get('S.I.').base('mass'))": "gram",
        "system.get('S.I.').base('length')": "m",
        "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))": "m/s",
        "system.get('S.I.').base('mass')": "kg",
    }

    if "description" in h5data.attrs:
        self.description = ensure_string(h5data.attrs["description"])
    # hdf5, or h5py doesn't keep the order of columns, so manually track that, also enables reordering later
    h5columns = h5data if self._version == 1 else h5data['columns']
    if "column_order" in h5columns.attrs:
        column_order = ensure_string(h5columns.attrs["column_order"]).split(",")
    else:
        column_order = []

    for group_name in list(h5columns):
        logger.debug('loading column: %s', group_name)
        group = h5columns[group_name]
        if 'type' in group.attrs:
            group_type = group.attrs['type']
            if group_type in ['csr_matrix']:
                from scipy.sparse import csr_matrix

                class csr_matrix_nocheck(csr_matrix):
                    # skip the format check when wrapping the mapped arrays
                    def check_format(self, *args, **kwargs):
                        pass
                data = self._map_hdf5_array(group['data'])
                indptr = self._map_hdf5_array(group['indptr'])
                indices = self._map_hdf5_array(group['indices'])
                # make sure we keep the original order
                groups = [(name, value) for name, value in group.items() if isinstance(value, h5py.Group)]
                column_names = [None] * len(groups)
                for name, column in groups:
                    column_names[column.attrs['column_index']] = name
                matrix = csr_matrix_nocheck((data, indices, indptr), shape=(len(indptr) - 1, len(column_names)))
                # the matrix should wrap the mapped arrays, not copies of them
                assert matrix.data is data
                # assert matrix.indptr is indptr
                assert matrix.indices is indices
                self.add_columns(column_names, matrix)
            # elif: with a plain `if` a csr_matrix group fell through to the
            # `else` below and raised TypeError after its columns were added
            elif group_type == 'dictionary_encoded':
                index = self._map_column(group['indices'], as_arrow=True)
                values = self._map_column(group['dictionary'], as_arrow=True)
                if 'null_bitmap' in group['indices'] or 'mask' in group['indices']:
                    raise ValueError(f'Did not expect null data in encoded column {group_name}')
                if isinstance(values, vaex.column.Column):
                    encoded = vaex.column.ColumnArrowDictionaryEncoded(index, values)
                else:
                    encoded = pa.DictionaryArray.from_arrays(index, values)
                self.add_column(group_name, encoded)
            else:
                raise TypeError(f'Unexpected type {group_type!r} in {group_name}')
        else:
            column_name = group_name
            column = h5columns[column_name]
            if "alias" in column.attrs:
                column_name = column.attrs["alias"]
            if "ucd" in column.attrs:
                self.ucds[column_name] = ensure_string(column.attrs["ucd"])
            if "description" in column.attrs:
                self.descriptions[column_name] = ensure_string(column.attrs["description"])
            if "unit" in column.attrs:
                # best effort: a unit that cannot be parsed is logged, not fatal
                try:
                    unitname = ensure_string(column.attrs["unit"])
                    if unitname and unitname != "None":
                        self.units[column_name] = _try_unit(unitname)
                except Exception:
                    logger.exception("error parsing unit: %s", column.attrs["unit"])
            if "units" in column.attrs:  # Amuse case
                unitname = ensure_string(column.attrs["units"])
                logger.debug("amuse unit: %s", unitname)
                if unitname in amuse_units:
                    self.units[column_name] = astropy.units.Unit(amuse_units[unitname])
            if self._version == 1:
                column = self._map_hdf5_array(column)
                self.add_column(column_name, column)
            elif hasattr(column["data"], "dtype"):
                column = self._map_column(column)
                self.add_column(column_name, column)
                dtype = vaex.dtype_of(column)
                logger.debug("adding column %r with dtype %r", column_name, dtype)
            else:
                raise TypeError(f'{group_name} is missing dtype')

    all_columns = dict(self._columns)
    # in case the column_order refers to non-existing columns
    column_order = [k for k in column_order if k in all_columns]
    self._columns = {}
    for name in column_order:
        self._columns[name] = all_columns.pop(name)
    # add the rest
    self._columns.update(all_columns)
def __init__(self, expression, values, keep_other=True, other_value=None, sort=False, label=None, df=None):
    """Group an expression by a fixed list of values.

    The bin values are ``values`` followed by ``other_value``; an ordered
    set is built from ``values`` only.

    :param expression: expression to group by
    :param values: the values to create bins for (arrow chunked arrays are concatenated)
    :param keep_other: only True is supported
    :param other_value: bin value appended after ``values``
    :param sort: sort ``values`` before use
    :param label: label of the grouper, defaults to the label of the expression
    :param df: dataframe to use; ``expression.df`` when None
    :raises NotImplementedError: when ``keep_other`` is False
    """
    # explicit None check: ``df or expression.df`` would discard a dataframe that was
    # passed in but evaluates as falsy (presumably one without rows - truth testing
    # falls back to ``__len__``)
    self.df = df if df is not None else expression.df
    self.sort = sort
    self.pre_sort = True
    self.expression = self.df[str(expression)]
    self.label = label or self.expression._label
    self.keep_other = keep_other
    if isinstance(values, pa.ChunkedArray):
        values = pa.concat_arrays(values.chunks)
    if sort:
        indices = pa.compute.sort_indices(values)
        values = pa.compute.take(values, indices)

    if not self.keep_other:
        # although we can support this, it will fail with _combine, because of
        # the mapping of the set to -1
        raise NotImplementedError("not supported yet")
    self.bin_values = pa.array(vaex.array_types.tolist(values) + [other_value])
    # everything except the trailing other_value
    self.values = self.bin_values.slice(0, len(self.bin_values) - 1)
    self.N = len(self.bin_values)

    dtype = vaex.dtype_of(self.values)
    set_type = vaex.hash.ordered_set_type_from_dtype(dtype)
    values_list = self.values.tolist()
    try:
        null_value = values_list.index(None)
        null_count = 1
    except ValueError:
        null_value = -1
        null_count = 0
    nancount = np.isnan(self.values).sum() if dtype == float else 0

    fp = vaex.cache.fingerprint(values)
    fingerprint = f"set-grouper-fixed-{fp}"
    if dtype.is_string:
        values = vaex.column.ColumnStringArrow.from_arrow(self.values)
        string_sequence = values.string_sequence
        self.set = set_type(string_sequence, null_value, nancount, null_count, fingerprint)
    else:
        self.set = set_type(self.values, null_value, nancount, null_count, fingerprint)

    self.basename = "set_%s" % vaex.utils._python_save_name(str(self.expression) + "_" + self.set.fingerprint)
    self.binby_expression = expression
    self.sort_indices = None
    self._promise = vaex.promise.Promise.fulfilled(None)