def __init__(self, df, by, sort=False, combine=False, expand=True, row_limit=None):
    '''Build the binners used to group ``df`` by ``by``.

    Note that row_limit only works in combination with combine=True

    :param df: dataframe to group; a shallow copy is stored because the groupers mutate it
    :param by: a single key or an iterable of keys; every key that is not already a
        ``BinnerBase`` is wrapped in a ``GrouperCategory`` (categorical columns) or a ``Grouper``
    :param sort: forwarded to the groupers and to ``_combine``
    :param combine: ``True`` merges two or more groupers into one via ``_combine``;
        ``'auto'`` does so only when the average occupancy (rows per grid cell) is below 10
    :param expand: stored on the instance as ``self.expand``
    :param row_limit: forwarded to the groupers and to ``_combine``
    '''
    df_original = df
    df = df.copy()  # we're gonna mutate, so create a shallow copy
    self.df = df
    self.sort = sort
    self.expand = expand  # keep as pyarrow struct?

    # a single key (including a plain string) becomes a one-element list
    if not isinstance(by, collections_abc.Iterable) or isinstance(by, six.string_types):
        by = [by]

    self.by = []
    self.by_original = by
    for by_value in by:
        if not isinstance(by_value, BinnerBase):
            expression = df[_ensure_string_from_expression(by_value)]
            if df.is_category(by_value):
                by_value = GrouperCategory(expression, sort=sort, row_limit=row_limit)
            else:
                by_value = Grouper(expression, sort=sort, row_limit=row_limit, df_original=df_original)
        self.by.append(by_value)

    if combine is True and len(self.by) >= 2:
        self.by = [_combine(self.df, self.by, sort=sort, row_limit=row_limit)]
        self.combine = True
    elif combine == 'auto' and len(self.by) >= 2:
        # by default assume we do not combine
        self.combine = False
        cells = product([grouper.N for grouper in self.by])
        # a grouper without any bins gives 0 cells; the occupancy is undefined then
        # (it used to raise ZeroDivisionError), so we do not even try to combine
        if cells > 0:
            rows = df.length_unfiltered()  # we don't want to trigger a computation
            occupancy = rows / cells
            logger.debug('%s rows and %s grid cells => occupancy=%s', rows, cells, occupancy)
            # we want each cell to have a least 10x occupacy
            if occupancy < 10:
                logger.info(f'Combining {len(self.by)} groupers into 1')
                self.by = [_combine(self.df, self.by, sort=sort, row_limit=row_limit)]
                self.combine = True
    else:
        self.combine = False

    # binby may be an expression based on self.by.expression
    # if we want to have all columns, minus the columns grouped by
    # we should keep track of the original expressions, but binby
    self.groupby_expression = [str(by.expression) for by in self.by]
    self.binners = tuple(by.binner for by in self.by)
    self.shape = [by.N for by in self.by]
    self.dims = self.groupby_expression[:]
def __init__(self, expression, df=None, sort=False, pre_sort=True, row_limit=None, df_original=None):
    """Create a grouper that bins ``expression`` by its unique values.

    :param expression: expression (or its string form) whose unique values become the bins
    :param df: dataframe the grouper is bound to (a variable is added to it);
        defaults to ``expression.ds``
    :param sort: order the bins by value
    :param pre_sort: only used when ``sort`` is true; ``True`` rebuilds the set with the keys in
        sorted order, ``False`` keeps the set as is and stores the ordering in ``self.sort_indices``
    :param row_limit: forwarded to ``_set`` as ``unique_limit``
    :param df_original: dataframe the set is computed on; defaults to the bound dataframe
    """
    # test against None explicitly: `df or ...` would evaluate the truthiness of the
    # dataframe (its length), so an empty dataframe would silently be replaced
    self.df = df if df is not None else expression.ds
    # we prefer to calculate the set the original dataframe to have better cache hits, and modify df
    if df_original is None:
        df_original = self.df
    self.sort = sort
    # make sure it's an expression (bound to self.df)
    self.expression = self.df[_ensure_string_from_expression(expression)]
    self.label = self.expression._label

    value_set = df_original._set(self.expression, unique_limit=row_limit)
    keys = value_set.keys()
    self.sort_indices = None
    if self.sort:
        order = np.argsort(keys)
        keys = np.array(keys)[order].tolist()
        if pre_sort:
            # rebuild the set so that the ordinal of each key follows the sorted order
            ordinals = {key: ordinal for ordinal, key in enumerate(keys)}
            value_set = type(value_set)(ordinals, value_set.count, value_set.nan_count, value_set.null_count)
        else:
            self.sort_indices = order
    self.set = value_set

    # TODO: we modify the dataframe in place, this is not nice
    basename = 'set_%s' % vaex.utils._python_save_name(str(expression))
    self.setname = self.df.add_variable(basename, self.set, unique=True)

    self.bin_values = keys
    self.binby_expression = '_ordinal_values(%s, %s)' % (self.expression, self.setname)
    self.N = len(self.bin_values)
    # null and nan each get their own bin, prepended before the regular keys
    if self.set.has_null:
        self.N += 1
        self.bin_values = [None] + self.bin_values
    if self.set.has_nan:
        self.N += 1
        self.bin_values = [np.nan] + self.bin_values
    if self.sort_indices is not None:
        # shift the ordering so that the prepended null/nan bins stay in front
        if self.set.has_null and self.set.has_nan:
            self.sort_indices = np.concatenate([[0, 1], self.sort_indices + 2])
        elif self.set.has_null or self.set.has_nan:
            self.sort_indices = np.concatenate([[0], self.sort_indices + 1])
    self.bin_values = self.expression.dtype.create_array(self.bin_values)
    self.binner = self.df._binner_ordinal(self.binby_expression, self.N)
def __init__(self, expression, df=None, sort=False, pre_sort=True, row_limit=None, df_original=None, materialize_experimental=False):
    """Create a grouper that bins ``expression`` by its unique values.

    :param expression: expression (or its string form) whose unique values become the bins
    :param df: dataframe the grouper is bound to (a variable or column is added to it);
        defaults to ``expression.ds``
    :param sort: order the bins by value
    :param pre_sort: only used when ``sort`` is true; ``True`` rebuilds the set from the sorted
        keys, ``False`` keeps the set and stores the ordering in ``self.sort_indices``
    :param row_limit: forwarded to ``_set`` as ``unique_limit``
    :param df_original: dataframe the set is computed on; defaults to the bound dataframe
    :param materialize_experimental: experimental path that also requests the inverse mapping
        from ``_set``, adds it as a ``__materialized_<label>`` column and bins on the
        placeholder expression ``'bla'``
    """
    # test against None explicitly: `df or ...` would evaluate the truthiness of the
    # dataframe (its length), so an empty dataframe would silently be replaced
    self.df = df if df is not None else expression.ds
    # we prefer to calculate the set the original dataframe to have better cache hits, and modify df
    if df_original is None:
        df_original = self.df
    self.sort = sort
    # make sure it's an expression (bound to self.df)
    self.expression = self.df[_ensure_string_from_expression(expression)]
    self.label = self.expression._label

    if materialize_experimental:
        value_set, values = df_original._set(self.expression, unique_limit=row_limit, return_inverse=True)
        # TODO: add column should have a unique argument
        self.df.add_column(f'__materialized_{self.label}', values)

        self.bin_values = value_set.key_array()
        if isinstance(self.bin_values, vaex.superstrings.StringList64):
            self.bin_values = pa.array(self.bin_values.to_numpy())
        self.binby_expression = 'bla'
        self.N = len(self.bin_values)
        self.min_value = 0
        self.binner = self.df._binner_ordinal('bla', self.N, self.min_value)
        self.sort_indices = None
    else:
        value_set = df_original._set(self.expression, unique_limit=row_limit)
        self.bin_values = value_set.key_array()

        if isinstance(self.bin_values, vaex.superstrings.StringList64):
            # TODO: find out why this more efficient path does not work
            # col = vaex.column.ColumnStringArrow.from_string_sequence(self.bin_values)
            # self.bin_values = pa.array(col)
            self.bin_values = pa.array(self.bin_values.to_numpy())
        # shrink integer keys to the smallest dtype that fits; skipped when there are
        # no keys, since taking the max of an empty array raises
        if vaex.dtype_of(self.bin_values).kind == 'i' and len(self.bin_values):
            max_value = self.bin_values.max()
            self.bin_values = self.bin_values.astype(vaex.utils.required_dtype_for_max(max_value))
        logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

        # since nan and null are at the start, we skip them with sorting
        if self.sort:
            dtype = self.expression.dtype
            indices = pa.compute.sort_indices(self.bin_values)
            if pre_sort:
                self.bin_values = pa.compute.take(self.bin_values, indices)
                # arrow sorts with null last
                null_value = -1 if not value_set.has_null else len(self.bin_values) - 1
                fingerprint = value_set.fingerprint + "-sorted"
                if dtype.is_string:
                    string_column = vaex.column.ColumnStringArrow.from_arrow(self.bin_values)
                    sorted_keys = string_column.string_sequence
                else:
                    sorted_keys = self.bin_values
                # rebuild the set from the sorted keys
                value_set = type(value_set)(sorted_keys, null_value, value_set.nan_count, value_set.null_count, fingerprint)
                self.sort_indices = None
            else:
                # TODO: skip first or first two values (null and/or nan)
                self.sort_indices = vaex.array_types.to_numpy(indices)
                # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
        else:
            self.sort_indices = None
        self.set = value_set

        # TODO: we modify the dataframe in place, this is not nice
        basename = 'set_%s' % vaex.utils._python_save_name(str(expression))
        self.setname = self.df.add_variable(basename, self.set, unique=True)

        self.binby_expression = '_ordinal_values(%s, %s)' % (self.expression, self.setname)
        self.N = len(self.bin_values)
        self.bin_values = self.expression.dtype.create_array(self.bin_values)
        self.binner = self.df._binner_ordinal(self.binby_expression, self.N)
def join(df, other, on=None, left_on=None, right_on=None, lprefix='', rprefix='', lsuffix='', rsuffix='',
         how='left', allow_duplication=False, prime_growth=False, cardinality_other=None, inplace=False):
    """Join the columns of ``other`` onto ``df`` and return the resulting dataframe.

    :param df: left dataframe
    :param other: right dataframe; it is indexed on its join column
    :param on: join column for both sides, used where ``left_on``/``right_on`` is not given
    :param left_on: join column of the left dataframe
    :param right_on: join column of the right dataframe; when neither join column is given
        the join is row based (the datasets are only merged)
    :param lprefix: prefix for left column names that collide with a right column name
    :param rprefix: prefix for right column names that collide with a left column name
    :param lsuffix: suffix for left column names that collide with a right column name
    :param rsuffix: suffix for right column names that collide with a left column name
    :param how: ``'left'``, ``'right'`` (both sides and their prefixes/suffixes/join columns
        are swapped) or ``'inner'`` (only left rows with a match on the right are kept)
    :param allow_duplication: allow duplicate keys on the right side, which adds rows to the result
    :param prime_growth: forwarded to ``right._index``
    :param cardinality_other: forwarded to ``right._index`` as ``cardinality``
    :param inplace: modify the left dataframe instead of a copy of it
    :return: the left dataframe (or its copy) extended with the columns of the right one
    :raises ValueError: for an unsupported ``how``, for a column name collision that the
        prefixes/suffixes do not resolve, or for duplicate keys on the right side while
        ``allow_duplication`` is false
    :raises NameError: when a function of the right dataframe already exists on the left
    """
    # implementation of DataFrameLocal.join
    inner = False
    left = df
    right = other
    # the untouched inputs and arguments are recorded in the DatasetJoin at the end
    left_original = left.copy()
    right_original = right.copy()
    rprefix_original, lprefix_original = rprefix, lprefix
    rsuffix_original, lsuffix_original = rsuffix, lsuffix
    right_on_original, left_on_original = right_on, left_on
    if how == 'left':
        pass
    elif how == 'right':
        left, right = right, left
        lprefix, rprefix = rprefix, lprefix
        lsuffix, rsuffix = rsuffix, lsuffix
        left_on, right_on = right_on, left_on
    elif how == 'inner':
        inner = True
    else:
        raise ValueError('join type not supported: {}, only left, right and inner'.format(how))
    left = left if inplace else left.copy()

    on = _ensure_string_from_expression(on)
    left_on = _ensure_string_from_expression(left_on)
    right_on = _ensure_string_from_expression(right_on)
    left_on = left_on or on
    right_on = right_on or on
    for name in right:
        if left_on and (rprefix + name + rsuffix == lprefix + left_on + lsuffix):
            continue  # it's ok when we join on the same column name
        if name in left and rprefix + name + rsuffix == lprefix + name + lsuffix:
            raise ValueError('column name collision: {} exists in both column, and no proper suffix given'
                             .format(name))

    right = right.extract()  # get rid of filters and active_range
    assert left.length_unfiltered() == left.length_original()
    if left_on is None and right_on is None:
        lookup = None
    else:
        # we index the right side, this assumes right is smaller in size
        index = right._index(right_on, prime_growth=prime_growth, cardinality=cardinality_other)
        dtype = left.data_type(left_on)
        duplicates_right = index.has_duplicates

        if duplicates_right and not allow_duplication:
            raise ValueError('This join will lead to duplication of rows which is disabled, pass allow_duplication=True')

        # our max value for the lookup table is the row index number, so if we join a small
        # df with say 100 rows, we can do it with a int8
        lookup_dtype = vaex.utils.required_dtype_for_max(len(right))
        # we put in the max value to maximize triggering failures in the case of a bug (we don't want
        # to point to row 0 in case we do, we'd rather crash)
        lookup = np.full(left._length_original, np.iinfo(lookup_dtype).max, dtype=lookup_dtype)
        nthreads = left.executor.thread_pool.nthreads
        lookup_masked = [False] * nthreads  # does the lookup contain masked/-1 values?
        lookup_extra_chunks = []

        from vaex.column import _to_string_sequence

        def fill_lookup(thread_index, i1, i2, selection_masks, blocks):
            # fills lookup[i1:i2] with the matching right row index of each left row
            ar = blocks[0]
            if vaex.array_types.is_string_type(dtype):
                # NOTE(review): presumably kept so the original block stays referenced while
                # the string sequence is in use; confirm before removing
                previous_ar = ar
                ar = _to_string_sequence(ar)
            if dtype.is_datetime:
                ar = ar.view(np.int64)
            if np.ma.isMaskedArray(ar):
                mask = np.ma.getmaskarray(ar)
                found_masked = index.map_index_masked(ar.data, mask, lookup[i1:i2])
                lookup_masked[thread_index] = lookup_masked[thread_index] or found_masked
                if duplicates_right:
                    extra = index.map_index_duplicates(ar.data, mask, i1)
                    lookup_extra_chunks.append(extra)
            else:
                found_masked = index.map_index(ar, lookup[i1:i2])
                lookup_masked[thread_index] = lookup_masked[thread_index] or found_masked
                if duplicates_right:
                    extra = index.map_index_duplicates(ar, i1)
                    lookup_extra_chunks.append(extra)

        def reduce_noop(a, b):
            # all results are written into `lookup` and `lookup_extra_chunks`
            pass

        left.map_reduce(fill_lookup, reduce_noop, [left_on], delay=False, name='fill looking', info=True,
                        to_numpy=False, ignore_filter=True)
        if len(lookup_extra_chunks):
            # if the right has duplicates, we increase the left of left, and the lookup array
            lookup_left = np.concatenate([k[0] for k in lookup_extra_chunks])
            lookup_right = np.concatenate([k[1] for k in lookup_extra_chunks])
            left = left.concat(left.take(lookup_left))
            lookup = np.concatenate([lookup, lookup_right])

        if inner:
            left_mask_matched = lookup != -1  # all the places where we found a match to the right
            lookup = lookup[left_mask_matched]  # filter the lookup table to the right
            left_indices_matched = np.where(left_mask_matched)[0]  # convert mask to indices for the left
            # indices can still refer to filtered rows, so do not drop the filter
            left = left.take(left_indices_matched, filtered=False, dropfilter=False)

    def mangle_name(prefix, name, suffix):
        # hidden names keep their leading '__' in front of the prefix
        if name.startswith('__'):
            return '__' + prefix + name[2:] + suffix
        else:
            return prefix + name + suffix

    # first, do renaming, so all column names are unique
    right_names = right.get_names(hidden=True)
    left_names = left.get_names(hidden=True)
    for name in right_names:
        if name in left_names:
            # find a unique name across both dataframe, including the new name for the left
            all_names = list(set(right_names + left_names))
            all_names.append(mangle_name(lprefix, name, lsuffix))  # we dont want to steal the left's name
            all_names.remove(name)  # we could even claim the original name
            new_name = mangle_name(rprefix, name, rsuffix)
            # we will not add this column twice when it is the join column
            if new_name != left_on:
                if new_name in all_names:  # it's still not unique
                    new_name = vaex.utils.find_valid_name(new_name, all_names)
                right.rename(name, new_name)
                right_names[right_names.index(name)] = new_name

                # and the same for the left
                all_names = list(set(right_names + left_names))
                all_names.remove(name)
                new_name = mangle_name(lprefix, name, lsuffix)
                if new_name in all_names:  # still not unique
                    new_name = vaex.utils.find_valid_name(new_name, all_names)
                left.rename(name, new_name)
                left_names[left_names.index(name)] = new_name

    # now we add columns from the right, to the left
    right_names = right.get_names(hidden=True)
    left_names = left.get_names(hidden=True)
    right_columns = []
    for name in right_names:
        if name == left_on and name in left_names:
            continue  # skip when it's the join column
        assert name not in left_names
        if name in right.variables:
            left.set_variable(name, right.variables[name])
        elif name in right.virtual_columns:
            left.add_virtual_column(name, right.virtual_columns[name])
        elif name in right.functions:
            if name in left.functions:
                raise NameError(f'Name collision for function {name}')
            left.functions[name] = right.functions[name]
        else:
            right_columns.append(name)
            # we already add the column name here to get the same order
            left.column_names.append(name)
            left._initialize_column(name)

    # merge the two datasets
    right_dataset = right.dataset.project(*right_columns)
    if lookup is not None:
        # if lookup is None, we do a row based join
        # and we only need to merge.
        # if we have an array of lookup indices, we 'take' those
        right_dataset = right_dataset.take(lookup, masked=any(lookup_masked))
    dataset = left.dataset.merged(right_dataset)
    # row number etc should not have changed, we only append new columns
    # so no need to reset caches
    left._dataset = DatasetJoin(
        dataset, left_original, right_original,
        on=on, left_on=left_on_original, right_on=right_on_original,
        lprefix=lprefix_original, rprefix=rprefix_original,
        lsuffix=lsuffix_original, rsuffix=rsuffix_original,
        how=how, allow_duplication=allow_duplication,
        prime_growth=prime_growth, cardinality_other=cardinality_other,
    )
    return left
def derivative(self, var, simplify=True):
    """Return a new expression holding the derivative of this one with respect to ``var``.

    :param var: variable to differentiate to, given as an expression or its string form
    :param simplify: forwarded to ``expresso.derivative``
    :return: instance of the same class as ``self``, bound to the same ``self.ds``
    """
    variable = _ensure_string_from_expression(var)
    derived = expresso.derivative(self.expression, variable, simplify=simplify)
    expression_class = self.__class__
    return expression_class(self.ds, derived)
def __init__(self, df, by, sort=False, combine=False, expand=True, row_limit=None, copy=True, progress=None):
    '''Schedule the creation of the binners used to group ``df`` by ``by``.

    Note that row_limit only works in combination with combine=True
    (passing a ``row_limit`` makes two or more groupers combine, as ``combine=True`` does).

    The binners are finalized in delayed callbacks; ``self._promise_by`` holds the resulting promise.

    :param df: dataframe to group
    :param by: a single key or an iterable of keys; keys that are not already a ``BinnerBase``
        become a ``GrouperCategory``, ``BinnerInteger`` or ``Grouper``
    :param sort: forwarded to the groupers and to ``_combine``
    :param combine: ``True`` merges two or more groupers into one; ``'auto'`` does so only
        when the average occupancy (rows per grid cell) is below 10
    :param expand: stored on the instance as ``self.expand``
    :param row_limit: forwarded to the groupers and to ``_combine``
    :param copy: work on a copy of ``df`` so that the given dataframe is left unchanged
    :param progress: passed to ``vaex.utils.progressbars``
    '''
    source_df = df
    if copy:
        # we will mutate the df (Add variables), this will keep the original dataframe unchanged
        df = df.copy()
    self.df = df
    self.sort = sort
    self.expand = expand  # keep as pyarrow struct?
    self.progressbar = vaex.utils.progressbars(progress)
    self.progressbar_groupers = self.progressbar.add("groupers")

    # a single key (including a plain string) becomes a one-element list
    if isinstance(by, six.string_types) or not isinstance(by, collections_abc.Iterable):
        by = [by]

    self.by = []
    self.by_original = by
    for key in by:
        if isinstance(key, BinnerBase):
            self.by.append(key)
            continue
        expression = df[_ensure_string_from_expression(key)]
        if df.is_category(key):
            binner = GrouperCategory(expression, sort=sort, row_limit=row_limit)
        else:
            dtype = expression.dtype
            if any(dtype == np.dtype(name) for name in ('uint8', 'int8', 'bool')):
                binner = BinnerInteger(expression)  # always sorted, and pre_sorted
            else:
                # we cannot mix _combine with BinnerInteger yet
                binner = Grouper(expression, sort=sort, row_limit=row_limit, df_original=source_df,
                                 progress=self.progressbar_groupers, allow_simplify=True)
        self.by.append(binner)

    @vaex.delayed
    def possible_combine(*binner_promises):
        # if a binner realized there is a simpler way (e.g. grouper -> intbinner)
        self.by = [binner if binner.simpler is None else binner.simpler for binner in self.by]
        # because binners can be created from other dataframes (we make a copy)
        # we let it mutate *our* dataframe
        for binner in self.by:
            binner._create_binner(self.df)
        cells = product([binner.N for binner in self.by])

        @vaex.delayed
        def set_combined(combined):
            combined._create_binner(self.df)
            self.by = [combined]
            self.combine = True

        def combine_all():
            # merge all current binners into one and install it once it is available
            return set_combined(_combine(self.df, self.by, sort=sort, row_limit=row_limit,
                                         progress=self.progressbar_groupers))

        several = len(self.by) >= 2
        forced = (row_limit is not None) or (combine is True)
        if forced and several and cells > 0:
            promise = combine_all()
        elif combine == 'auto' and several:
            # default assume we cannot combined
            self.combine = False
            promise = vaex.promise.Promise.fulfilled(None)
            # don't even try when one grouper has 0 options
            if cells > 0:
                rows = df.length_unfiltered()  # we don't want to trigger a computation
                occupancy = rows / cells
                logger.debug('%s rows and %s grid cells => occupancy=%s', rows, cells, occupancy)
                # we want each cell to have a least 10x occupacy
                if occupancy < 10:
                    logger.info(f'Combining {len(self.by)} groupers into 1')
                    promise = combine_all()
                    self.combine = True
        else:
            self.combine = False
            promise = vaex.promise.Promise.fulfilled(None)

        @vaex.delayed
        def process(_ignore):
            binners = self.by
            self.dense = len(binners) == 1 and binners[0].dense
            self.groupby_expression = [str(binner.expression) for binner in binners]
            self.binners = tuple(binner.binner for binner in binners)
            self.shape = [binner.N for binner in binners]
            self.dims = [binner.label for binner in binners]
            self.progressbar_groupers(1)

        return process(promise)

    binner_promises = [binner._promise for binner in self.by]
    self._promise_by = self.progressbar_groupers.exit_on(possible_combine(*binner_promises))
def __init__(self, expression, df=None, sort=False, pre_sort=True, row_limit=None, df_original=None, materialize_experimental=False, progress=None, allow_simplify=False):
    """Create a grouper that bins ``expression`` by its unique values.

    Unless ``materialize_experimental`` is set, the unique values are requested with
    ``delay=True`` and processed in a delayed callback; ``self._promise`` holds its promise.

    :param expression: expression (or its string form) whose unique values become the bins
    :param df: dataframe the grouper is bound to; defaults to ``expression.ds``
    :param sort: order the bins by value
    :param pre_sort: only used when ``sort`` is true; ``True`` replaces the hash map by its
        ``sorted(...)`` version, ``False`` keeps it and stores the ordering in ``self.sort_indices``
    :param row_limit: forwarded as ``limit`` when computing the unique values
    :param df_original: dataframe the unique values are computed on; defaults to the bound dataframe
    :param materialize_experimental: experimental path that also requests the inverse mapping,
        adds it as a ``__materialized_<label>`` column and bins on the placeholder expression ``'bla'``
    :param progress: passed to ``vaex.utils.progressbars``
    :param allow_simplify: allow replacing this grouper by a ``BinnerInteger`` (stored in
        ``self.simpler``) when the integer keys fill their value range for at least 75%
    """
    # test against None explicitly: `df or ...` would evaluate the truthiness of the
    # dataframe (its length), so an empty dataframe would silently be replaced
    self.df = df if df is not None else expression.ds
    self.sort = sort
    self.pre_sort = pre_sort
    # we prefer to calculate the set the original dataframe to have better cache hits, and modify df
    if df_original is None:
        df_original = self.df
    self.allow_simplify = allow_simplify
    # make sure it's an expression (bound to self.df)
    self.expression = self.df[_ensure_string_from_expression(expression)]
    self.label = self.expression._label
    self.progressbar = vaex.utils.progressbars(progress, title=f"grouper: {repr(self.label)}")
    dtype = self.expression.dtype

    if materialize_experimental:
        # NOTE(review): this branch does not set self._promise
        value_set, values = df_original._set(self.expression, limit=row_limit, return_inverse=True)
        # TODO: add column should have a unique argument
        self.df.add_column(f'__materialized_{self.label}', values)

        self.bin_values = value_set.key_array()
        if isinstance(self.bin_values, vaex.superstrings.StringList64):
            self.bin_values = pa.array(self.bin_values.to_numpy())
        self.binby_expression = 'bla'
        self.N = len(self.bin_values)
        self.min_value = 0
        self.binner = self.df._binner_ordinal('bla', self.N, self.min_value)
        self.sort_indices = None
    else:
        @vaex.delayed
        def process(hashmap_unique):
            self.bin_values = hashmap_unique.keys()
            if self.allow_simplify and dtype == int and len(self.bin_values):
                vmin = self.bin_values.min()
                vmax = self.bin_values.max()
                int_range = vmax - vmin + 1
                # we allow for 25% unused 'slots'
                bins = len(self.bin_values)
                if int_range <= (bins * 4 / 3):
                    dense = bins == int_range
                    self.simpler = BinnerInteger(self.expression, min_value=vmin, max_value=vmax,
                                                 dropmissing=not hashmap_unique.has_null, dense=dense)
                    # the simpler binner takes over, nothing else is set up on this grouper
                    return
            # shrink integer keys to the smallest dtype that fits
            if vaex.dtype_of(self.bin_values) == int and len(self.bin_values):
                max_value = self.bin_values.max()
                self.bin_values = self.bin_values.astype(vaex.utils.required_dtype_for_max(max_value))
            logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

            if self.sort:
                if pre_sort:
                    hashmap_unique, self.bin_values = hashmap_unique.sorted(keys=self.bin_values, return_keys=True)
                    self.sort_indices = None
                else:
                    indices = pa.compute.sort_indices(self.bin_values)
                    self.sort_indices = vaex.array_types.to_numpy(indices)
                    # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                    self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
            else:
                self.sort_indices = None
            self.hashmap_unique = hashmap_unique

            self.basename = 'hashmap_unique_%s' % vaex.utils._python_save_name(str(self.expression) + "_" + hashmap_unique.fingerprint)

            self.N = len(self.bin_values)
            # for datetimes, we converted to int
            if dtype.is_datetime:
                self.bin_values = dtype.create_array(self.bin_values)

        self._promise = process(df_original._hash_map_unique(self.expression, limit=row_limit, delay=True, progress=self.progressbar))
def __init__(self, expression, df=None, sort=False, pre_sort=True, row_limit=None, df_original=None, materialize_experimental=False, progress=None):
    """Create a grouper that bins ``expression`` by its unique values.

    Unless ``materialize_experimental`` is set, the set of unique values is requested with
    ``delay=True`` and processed in a delayed callback; ``self._promise`` holds its promise.

    :param expression: expression (or its string form) whose unique values become the bins
    :param df: dataframe the grouper is bound to; defaults to ``expression.ds``
    :param sort: order the bins by value
    :param pre_sort: only used when ``sort`` is true; ``True`` rebuilds the set from the sorted
        keys, ``False`` keeps the set and stores the ordering in ``self.sort_indices``
    :param row_limit: forwarded to ``_set`` as ``unique_limit``
    :param df_original: dataframe the set is computed on; defaults to the bound dataframe
    :param materialize_experimental: experimental path that also requests the inverse mapping
        from ``_set``, adds it as a ``__materialized_<label>`` column and bins on the
        placeholder expression ``'bla'``
    :param progress: passed to ``vaex.utils.progressbars``
    """
    # test against None explicitly: `df or ...` would evaluate the truthiness of the
    # dataframe (its length), so an empty dataframe would silently be replaced
    self.df = df if df is not None else expression.ds
    self.sort = sort
    self.pre_sort = pre_sort
    # we prefer to calculate the set the original dataframe to have better cache hits, and modify df
    if df_original is None:
        df_original = self.df
    # make sure it's an expression (bound to self.df)
    self.expression = self.df[_ensure_string_from_expression(expression)]
    self.label = self.expression._label
    self.progressbar = vaex.utils.progressbars(progress, title=f"grouper: {repr(self.label)}")
    dtype = self.expression.dtype

    if materialize_experimental:
        # NOTE(review): this branch does not set self._promise
        value_set, values = df_original._set(self.expression, unique_limit=row_limit, return_inverse=True)
        # TODO: add column should have a unique argument
        self.df.add_column(f'__materialized_{self.label}', values)

        self.bin_values = value_set.key_array()
        if isinstance(self.bin_values, vaex.superstrings.StringList64):
            self.bin_values = pa.array(self.bin_values.to_numpy())
        self.binby_expression = 'bla'
        self.N = len(self.bin_values)
        self.min_value = 0
        self.binner = self.df._binner_ordinal('bla', self.N, self.min_value)
        self.sort_indices = None
    else:
        @vaex.delayed
        def process(value_set):
            self.bin_values = value_set.key_array()

            if isinstance(self.bin_values, vaex.superstrings.StringList64):
                # TODO: find out why this more efficient path does not work
                # col = vaex.column.ColumnStringArrow.from_string_sequence(self.bin_values)
                # self.bin_values = pa.array(col)
                self.bin_values = pa.array(self.bin_values.to_numpy())
            # shrink integer keys to the smallest dtype that fits; skipped when there are
            # no keys, since taking the max of an empty array raises
            if vaex.dtype_of(self.bin_values) == int and len(self.bin_values):
                max_value = self.bin_values.max()
                self.bin_values = self.bin_values.astype(vaex.utils.required_dtype_for_max(max_value))
            logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

            if value_set.has_null and (dtype.is_primitive or dtype.is_datetime):
                # mask the slot that stands for null
                mask = np.zeros(shape=self.bin_values.shape, dtype="?")
                mask[value_set.null_value] = 1
                self.bin_values = np.ma.array(self.bin_values, mask=mask)
            if self.sort:
                self.bin_values = vaex.array_types.to_arrow(self.bin_values)
                indices = pa.compute.sort_indices(self.bin_values)
                if pre_sort:
                    self.bin_values = pa.compute.take(self.bin_values, indices)
                    # arrow sorts with null last
                    null_value = -1 if not value_set.has_null else len(self.bin_values) - 1
                    fingerprint = value_set.fingerprint + "-sorted"
                    if dtype.is_string:
                        string_column = vaex.column.ColumnStringArrow.from_arrow(self.bin_values)
                        sorted_keys = string_column.string_sequence
                    else:
                        sorted_keys = self.bin_values
                    # rebuild the set from the sorted keys
                    value_set = type(value_set)(sorted_keys, null_value, value_set.nan_count, value_set.null_count, fingerprint)
                    self.sort_indices = None
                else:
                    self.sort_indices = vaex.array_types.to_numpy(indices)
                    # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                    self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
            else:
                self.sort_indices = None
            self.set = value_set

            self.basename = 'set_%s' % vaex.utils._python_save_name(str(self.expression) + "_" + value_set.fingerprint)

            self.N = len(self.bin_values)
            # for datetimes, we converted to int
            if dtype.is_datetime:
                self.bin_values = dtype.create_array(self.bin_values)

        self._promise = process(df_original._set(self.expression, unique_limit=row_limit, delay=True, progress=self.progressbar))
def derivative(self, var, simplify=True):
    """Return a new expression holding the derivative of this one with respect to ``var``.

    :param var: variable to differentiate to, given as an expression or its string form
    :param simplify: forwarded to ``expresso.derivative``
    :return: instance of the same class as ``self``, bound to the same ``self.ds``
    """
    var = _ensure_string_from_expression(var)
    derived = expresso.derivative(self.expression, var, simplify=simplify)
    # the first constructor argument is the dataframe the expression belongs to
    # (self.ds), not the expression itself
    return self.__class__(self.ds, derived)
def __init__(self, df, by, sort=False, combine=False, expand=True, row_limit=None, copy=True):
    '''Schedule the creation of the binners used to group ``df`` by ``by``.

    Note that row_limit only works in combination with combine=True

    The binners are finalized in delayed callbacks; ``self._promise_by`` holds the resulting promise.

    :param df: dataframe to group
    :param by: a single key or an iterable of keys; keys that are not already a ``BinnerBase``
        become a ``GrouperCategory``, ``BinnerInteger`` or ``Grouper``
    :param sort: forwarded to the groupers and to ``_combine``
    :param combine: ``True`` merges two or more groupers into one; ``'auto'`` does so only
        when the average occupancy (rows per grid cell) is below 10
    :param expand: stored on the instance as ``self.expand``
    :param row_limit: forwarded to the groupers and to ``_combine``
    :param copy: work on a copy of ``df`` so that the given dataframe is left unchanged
    '''
    df_original = df
    if copy:
        # we will mutate the df (Add variables), this will keep the original dataframe unchanged
        df = df.copy()
    self.df = df
    self.sort = sort
    self.expand = expand  # keep as pyarrow struct?

    # a single key (including a plain string) becomes a one-element list
    if not isinstance(by, collections_abc.Iterable) or isinstance(by, six.string_types):
        by = [by]

    self.by = []
    self.by_original = by
    for by_value in by:
        if not isinstance(by_value, BinnerBase):
            expression = df[_ensure_string_from_expression(by_value)]
            if df.is_category(by_value):
                by_value = GrouperCategory(expression, sort=sort, row_limit=row_limit)
            else:
                dtype = expression.dtype
                if dtype == np.dtype('uint8') or dtype == np.dtype('bool'):
                    by_value = BinnerInteger(expression)  # doesn't modify, always sorted
                else:
                    by_value = Grouper(expression, sort=sort, row_limit=row_limit, df_original=df_original)
        self.by.append(by_value)

    @vaex.delayed
    def possible_combine(*binner_promises):
        # because binners can be created from other dataframes (we make a copy)
        # we let it mutate *our* dataframe
        for binner in self.by:
            binner._create_binner(self.df)

        @vaex.delayed
        def set_combined(combined):
            combined._create_binner(self.df)
            self.by = [combined]
            self.combine = True

        if combine is True and len(self.by) >= 2:
            promise = set_combined(_combine(self.df, self.by, sort=sort, row_limit=row_limit))
        elif combine == 'auto' and len(self.by) >= 2:
            # by default assume we do not combine
            self.combine = False
            promise = vaex.promise.Promise.fulfilled(None)
            cells = product([grouper.N for grouper in self.by])
            # a grouper without any bins gives 0 cells; the occupancy is undefined then
            # (it used to raise ZeroDivisionError), so we do not even try to combine
            if cells > 0:
                rows = df.length_unfiltered()  # we don't want to trigger a computation
                occupancy = rows / cells
                logger.debug('%s rows and %s grid cells => occupancy=%s', rows, cells, occupancy)
                # we want each cell to have a least 10x occupacy
                if occupancy < 10:
                    logger.info(f'Combining {len(self.by)} groupers into 1')
                    promise = set_combined(_combine(self.df, self.by, sort=sort, row_limit=row_limit))
                    self.combine = True
        else:
            self.combine = False
            promise = vaex.promise.Promise.fulfilled(None)

        @vaex.delayed
        def process(_ignore):
            self.groupby_expression = [str(by.expression) for by in self.by]
            self.binners = tuple(by.binner for by in self.by)
            self.shape = [by.N for by in self.by]
            self.dims = self.groupby_expression[:]

        return process(promise)

    self._promise_by = possible_combine(*[by._promise for by in self.by])