def process(_ignore):
    '''Decode the combined bin values into one label array per parent grouper.

    Every combined ``bin_value`` is split by repeated floor-division and
    modulo with ``multipliers``: the quotient at step ``i`` is the index into
    ``parents[i].bin_values`` and the remainder is carried to the next step.

    Relies on ``self``, ``multipliers``, ``parents``, ``sort`` and
    ``progressbar`` from the enclosing scope.

    ``_ignore`` is unused (presumably the value handed over by whatever this
    callback is chained to -- confirm against the caller).

    Returns a ``pa.StructArray`` with one field per parent label; a parent
    whose bin values are themselves a struct contributes one field per
    struct field.
    '''
    logger.info(f"extracting indices of parent groupers ({self.N:,} unique rows)")
    df = vaex.from_dict({
        'row': vaex.vrange(0, self.N, dtype='i8'),
        'bin_value': self.bin_values,
    })
    df['index_0'] = df['bin_value'] // multipliers[0]
    df['leftover_0'] = df['bin_value'] % multipliers[0]
    for i, multiplier in enumerate(multipliers[1:], start=1):
        df[f'index_{i}'] = df[f'leftover_{i-1}'] // multiplier
        df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multiplier
    columns = [f'index_{i}' for i in range(len(multipliers))]
    indices_parents = df.evaluate(columns, progress=progressbar)

    def compress(ar):
        # Integer index arrays are cast to the dtype that
        # vaex.utils.required_dtype_for_max picks for their maximum; any
        # other array is handed back unchanged.
        if vaex.dtype_of(ar).kind != 'i':
            return ar
        ar = vaex.array_types.to_numpy(ar)
        if ar.size == 0:
            # max() of a zero-size array raises ValueError; nothing to cast.
            return ar
        return ar.astype(vaex.utils.required_dtype_for_max(ar.max()))

    indices_parents = [compress(ar) for ar in indices_parents]

    bin_values = {}
    logger.info("extracting labels of parent groupers...")
    # NOTE: we can also use dict encoding instead of take, i.e.
    # pa.DictionaryArray.from_arrays(indices, <parent values>)
    for indices, parent in zip(indices_parents, parents):
        if sort:
            assert parent.pre_sort, "cannot sort while parent not presorted"
            assert parent.sort_indices is None
        dtype = vaex.dtype_of(parent.bin_values)
        if dtype.is_struct:
            # collapse parent struct into our flat struct
            for field, ar in zip(parent.bin_values.type, parent.bin_values.flatten()):
                bin_values[field.name] = ar.take(indices)
        else:
            bin_values[parent.label] = parent.bin_values.take(indices)
    logger.info("extracting labels of parent groupers done")
    return pa.StructArray.from_arrays(bin_values.values(), bin_values.keys())
def test_vconstant(value):
    '''A vconstant column reports its length and yields ``value``, filtered or not.'''
    n_rows = 100
    expected_head = [value] * 3

    constant = vaex.vconstant(value=value, length=n_rows)
    df = vaex.from_arrays(x=constant, y=vaex.vrange(0, n_rows))
    assert len(df.columns['x']) == n_rows
    assert df.x[:3].tolist() == expected_head

    # Filtering on the range column must keep the constant column usable.
    filtered = df[df.y < 31]
    assert len(filtered) == 31
    assert filtered.x[:3].tolist() == expected_head
def test_vrange():
    '''Check length, trimming, slicing and derived columns of a vrange column.'''
    # One billion rows: presumably only practical if the range is not
    # materialized in memory.
    n_rows = 1000**3
    df = vaex.from_arrays(x=vaex.vrange(0, n_rows))

    x_column = df.columns['x']
    assert len(x_column) == n_rows

    window = x_column.trim(2, 4)
    assert window.start == 2
    assert window.stop == 4

    assert len(df) == n_rows
    assert len(df[0:10]) == 10

    # Float range on purpose, exactly as the comparison has always been written.
    expected_x = np.arange(1, 11.).tolist()
    assert df[1:11].x.tolist() == expected_x

    df['y'] = df.x**2
    expected_y = (np.arange(1, 11)**2).tolist()
    assert df[1:11].y.tolist() == expected_y
def __init__(self, expression, df, multipliers, parents, sort, row_limit=None):
    '''Group by a single expression that is composed of several expressions.

    Used in the sparse/combined group by. After the base initializer has
    run, the combined ``self.bin_values`` are decoded back into the bin
    values of each parent and stored as one flat struct array.

    ``multipliers`` and ``parents`` must have the same length and the last
    multiplier must be 1 (both are asserted).
    '''
    super().__init__(expression, df, sort=sort, row_limit=row_limit)
    assert len(multipliers) == len(parents)
    assert multipliers[-1] == 1
    self.df = df
    self.label = 'SHOULD_NOT_BE_USED'
    self.expression = expression

    # Efficient way to recover the original bin values (parent.bin_values)
    # from the 'compressed' self.bin_values: peel off one parent index per
    # multiplier with floor-division, carrying the remainder forward.
    # NOTE(review): self.N and self.bin_values are presumably set by the
    # base initializer above -- confirm.
    decoder = vaex.from_dict({
        'row': vaex.vrange(0, self.N, dtype='i8'),
        'bin_value': self.bin_values,
    })
    decoder['index_0'] = decoder['bin_value'] // multipliers[0]
    decoder['leftover_0'] = decoder['bin_value'] % multipliers[0]
    for step, multiplier in enumerate(multipliers[1:], start=1):
        carried = f'leftover_{step-1}'
        decoder[f'index_{step}'] = decoder[carried] // multiplier
        decoder[f'leftover_{step}'] = decoder[carried] % multiplier

    index_columns = [f'index_{step}' for step in range(len(multipliers))]
    parent_indices = decoder.evaluate(index_columns)

    labels = {}
    for indices, parent in zip(parent_indices, parents):
        if not vaex.dtype_of(parent.bin_values).is_struct:
            labels[parent.label] = parent.bin_values.take(indices)
            continue
        # collapse parent struct into our flat struct
        for field, values in zip(parent.bin_values.type, parent.bin_values.flatten()):
            labels[field.name] = values.take(indices)
    self.bin_values = pa.StructArray.from_arrays(labels.values(), labels.keys())
def test_columns():
    '''Smoke test: a dataframe can be built from a vrange and from an Arrow string column.'''
    # No assertions: the test passes as long as neither construction raises.
    range_df = vaex.from_arrays(x=vaex.vrange(0, 10))

    strings = pa.array(['foo', 'bar'])
    string_column = vaex.column.ColumnStringArrow.from_arrow(strings)
    string_df = vaex.from_arrays(x=string_column)