def reduce(self, others):
    # module-level imports assumed: sys, numpy as np, vaex, and a module logger
    # collect the non-empty hash sets from this task and its siblings
    # (renamed from `all`/`others`, which shadowed the builtin and the parameter)
    sets = [k.set for k in [self] + others if k.set is not None]
    set_merged, *rest = sets
    import time
    t0 = time.time()
    set_merged.merge(rest)
    logger.info(
        f'merge took {time.time() - t0} seconds, size {len(set_merged):,}, '
        f'byte_size {sys.getsizeof(set_merged):,}'
    )
    if self.return_inverse:
        # sort by row index, so the inverse values come out in row order
        self.chunks.sort(key=lambda x: x[0])
        length = 0
        for i1, i2, values, map_index in self.chunks:
            length += len(values)
        self.values = np.empty(length, vaex.dtype_of(self.chunks[0][2]).numpy)
        # TODO: we could do this in parallel, but the overhead is small
        for i1, i2, values, map_index in self.chunks:
            # rewrite the per-chunk local indices into the merged set's key order
            set_merged.flatten_values(values, map_index, self.values[i1:i2])
    if self.unique_limit is not None:
        count = len(set_merged)
        if count > self.unique_limit:
            raise vaex.RowLimitException(
                f'Resulting set has {count:,} unique combinations, '
                f'which is larger than the allowed value of {self.unique_limit:,}'
            )
    self.set = set_merged
    self.set.fingerprint = f'set-{self.fingerprint}'
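# A minimal, self-contained sketch of the merge/flatten pattern above, using a
# plain Python dict instead of vaex's native hash set. The names
# `merge_partials_sketch` and `remap` are illustrative, not vaex API: each
# worker produces a key -> local-index map, the maps are merged into one global
# ordering, and the per-chunk local indices can then be rewritten to global
# ones (what flatten_values does for the real set).
def merge_partials_sketch(partial_maps):
    merged = {}
    remaps = []
    for local in partial_maps:
        remap = {}
        for key, local_index in local.items():
            if key not in merged:
                merged[key] = len(merged)  # assign the next global index
            remap[local_index] = merged[key]
        remaps.append(remap)
    return merged, remaps

# Example: two 'threads' saw overlapping keys; after merging, their local
# inverse indices translate into the shared index space.
merged, remaps = merge_partials_sketch([{'a': 0, 'b': 1}, {'b': 0, 'c': 1}])
assert merged == {'a': 0, 'b': 1, 'c': 2}
assert remaps[1] == {0: 1, 1: 2}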
def __init__(self, expression, df=None, sort=False, row_limit=None):
    self.df = df or expression.ds
    self.sort = sort
    # make sure it's an expression
    expression = self.df[str(expression)]
    self.expression_original = expression
    self.label = expression._label
    # for encoded (categorical) data, group on the underlying index values
    self.expression = expression.index_values() if expression.dtype.is_encoded else expression
    self.bin_values = self.df.category_labels(self.expression_original, aslist=False)
    if self.sort:
        self.sort_indices = pa.compute.sort_indices(self.bin_values)
        self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
    else:
        self.sort_indices = None
    if isinstance(self.bin_values, list):
        self.bin_values = pa.array(self.bin_values)
    self.N = self.df.category_count(self.expression_original)
    if row_limit is not None:
        if self.N > row_limit:
            raise vaex.RowLimitException(
                f'Resulting grouper has {self.N:,} unique combinations, '
                f'which is larger than the allowed row limit of {row_limit:,}'
            )
    self.min_value = self.df.category_offset(self.expression_original)
    # TODO: what do we do with null values for categories?
    # if self.set.has_null:
    #     self.N += 1
    #     keys += ['null']
    self.binner = self.df._binner_ordinal(self.expression, self.N, self.min_value)
    self.binby_expression = str(self.expression)
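# A minimal sketch of the sort step above, assuming only pyarrow: sort_indices
# returns the permutation that orders the labels, and take applies it. The
# same indices can later be used to reorder aggregation results so they line
# up with the sorted bin_values.
import pyarrow as pa
import pyarrow.compute as pc

labels = pa.array(['pear', 'apple', 'orange'])
sort_indices = pc.sort_indices(labels)         # -> [1, 2, 0]
sorted_labels = pc.take(labels, sort_indices)  # -> ['apple', 'orange', 'pear']
assert sorted_labels.to_pylist() == ['apple', 'orange', 'pear']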
def process(self, thread_index, i1, i2, filter_mask, ar):
    from vaex.column import _to_string_sequence
    if self.set is None:
        self.set = self.ordered_set_type()
    if self.selection:
        selection_mask = self.df.evaluate_selection_mask(self.selection, i1=i1, i2=i2, cache=True)
        # filter here is the module-level array helper, not the builtin
        ar = filter(ar, selection_mask)
    if self.dtype.is_list and self.flatten:
        ar = ar.values
    if self.dtype_item.is_string:
        ar = _to_string_sequence(ar)
    else:
        ar = vaex.array_types.to_numpy(ar)
    if np.ma.isMaskedArray(ar):
        mask = np.ma.getmaskarray(ar)
        self.set.update(ar, mask)
    else:
        self.set.update(ar)
    if self.unique_limit is not None:
        count = self.set.count
        # we skip null and nan here, since this is just an early bail out
        if count > self.unique_limit:
            raise vaex.RowLimitException(
                f'Resulting set would have >= {self.unique_limit} unique combinations'
            )
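# A minimal sketch of the masked-array branch above, assuming only numpy: for
# a masked array, np.ma.getmaskarray always returns a full boolean mask (even
# when nothing is masked), so masked entries can be kept apart from the
# regular values, which is what set.update(ar, mask) does natively.
import numpy as np

ar = np.ma.MaskedArray([1, 2, 2, 3], mask=[False, False, True, False])
mask = np.ma.getmaskarray(ar)
unique_values = set(v for v, m in zip(ar.data, mask) if not m)
assert unique_values == {1, 2, 3}
assert bool(mask.any())  # at least one missing value was seen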
def _check_row_limit(self):
    if self.limit is not None:
        # we only raise when we EXCEED the limit
        if self.limit_raise and len(self.hash_map_unique) > self.limit:
            raise vaex.RowLimitException(
                f'Resulting hash_map_unique would have >= {self.limit} unique combinations'
            )
        # but we can stop when we are AT the limit
        if not self.limit_raise and len(self.hash_map_unique) >= self.limit:
            self.stopped = True
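# A minimal sketch of the two limit modes above (class and names are
# illustrative, not vaex API): with limit_raise=True the limit is a hard cap
# and exceeding it is an error; with limit_raise=False reaching the limit just
# means "we have enough", so processing can stop early.
class LimitedCollector:
    def __init__(self, limit, limit_raise):
        self.limit = limit
        self.limit_raise = limit_raise
        self.seen = set()
        self.stopped = False

    def add(self, value):
        if self.stopped:
            return
        self.seen.add(value)
        if self.limit_raise and len(self.seen) > self.limit:
            raise ValueError(f'would have >= {self.limit} unique values')
        if not self.limit_raise and len(self.seen) >= self.limit:
            self.stopped = True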
def __init__(self, expression, df=None, sort=False, row_limit=None, pre_sort=True):
    self.df = df or expression.ds
    self.sort = sort
    self.pre_sort = pre_sort
    # make sure it's an expression
    expression = self.df[str(expression)]
    self.expression_original = expression
    self.label = expression._label
    self.expression = expression.index_values() if expression.dtype.is_encoded else expression
    self.row_limit = row_limit
    self.min_value = self.df.category_offset(self.expression_original)
    self.bin_values = self.df.category_labels(self.expression_original, aslist=False)
    self.N = self.df.category_count(self.expression_original)
    dtype = self.expression.dtype
    if self.sort:
        # not pre-sorting is faster
        sort_indices = pa.compute.sort_indices(self.bin_values)
        self.bin_values = pa.compute.take(self.bin_values, sort_indices)
        if self.pre_sort:
            sort_indices = vaex.array_types.to_numpy(sort_indices)
            # TODO: this is kind of like expression.map
            from .hash import ordered_set_type_from_dtype
            ordered_set_type = ordered_set_type_from_dtype(dtype)
            fingerprint = self.expression.fingerprint() + "-grouper-sort-mapper"
            self.set = ordered_set_type(sort_indices + self.min_value, -1, 0, 0, fingerprint)
            self.min_value = 0
            self.sort_indices = None
            self.basename = "set_%s" % vaex.utils._python_save_name(str(self.expression) + "_" + self.set.fingerprint)
        else:
            self.sort_indices = sort_indices
    else:
        self.sort_indices = None
    if isinstance(self.bin_values, list):
        self.bin_values = pa.array(self.bin_values)
    if row_limit is not None:
        if self.N > row_limit:
            raise vaex.RowLimitException(
                f'Resulting grouper has {self.N:,} unique combinations, '
                f'which is larger than the allowed row limit of {row_limit:,}'
            )
    # TODO: what do we do with null values for categories?
    # if self.set.has_null:
    #     self.N += 1
    #     keys += ['null']
    self._promise = vaex.promise.Promise.fulfilled(None)
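# A minimal sketch of the pre-sort mapping above, assuming only numpy. Instead
# of sorting results after aggregation, a lookup is built up front that sends
# each original ordinal code straight to its bin position in the sorted label
# order (the "kind of like expression.map" idea in the TODO). Names here are
# illustrative, not vaex API.
import numpy as np

labels = np.array(['pear', 'apple', 'orange'])  # ordinal code i means labels[i]
sort_indices = np.argsort(labels)               # -> [1, 2, 0]
# code_to_sorted_bin[original_code] == bin index in sorted-label order
code_to_sorted_bin = np.empty_like(sort_indices)
code_to_sorted_bin[sort_indices] = np.arange(len(sort_indices))
assert list(labels[sort_indices]) == ['apple', 'orange', 'pear']
assert list(code_to_sorted_bin) == [2, 0, 1]    # 'pear' -> bin 2, 'apple' -> 0, 'orange' -> 1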
def reduce(self, others):
    set_merged = self.set
    for other in others:
        if set_merged is None and other.set is not None:
            set_merged = other.set
        elif other.set is not None:
            set_merged.merge(other.set)
    if self.unique_limit is not None:
        count = set_merged.count
        # count excludes nan/null, so add one bin for each when present
        if set_merged.has_nan:
            count += 1
        if set_merged.has_null:
            count += 1
        if count > self.unique_limit:
            raise vaex.RowLimitException(
                f'Resulting set has {count:,} unique combinations, '
                f'which is larger than the allowed value of {self.unique_limit:,}'
            )
    self.set = set_merged
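# A minimal sketch of the pairwise reduce above, with plain Python sets as
# stand-ins for the native hash sets (None means "this worker saw no data"):
# the first non-empty set becomes the accumulator and the rest are merged in.
def reduce_sets_sketch(sets):
    merged = None
    for s in sets:
        if merged is None and s is not None:
            merged = s
        elif s is not None:
            merged |= s  # stand-in for set_merged.merge(other.set)
    return merged

assert reduce_sets_sketch([None, {1, 2}, {2, 3}]) == {1, 2, 3}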
def _check_row_limit(self):
    if self.unique_limit is not None:
        if len(self.set) > self.unique_limit:
            raise vaex.RowLimitException(
                f'Resulting set would have >= {self.unique_limit} unique combinations'
            )