Exemplo n.º 1
0
 def process(_ignore):
     """Decode the combined bin values and gather the labels of every parent grouper.

     Each combined ``bin_value`` is split into one index per parent by
     successive floor-division / remainder with ``multipliers``; those indices
     are then used to ``take`` the matching labels out of each parent's
     ``bin_values``.

     ``self``, ``multipliers``, ``parents``, ``sort``, ``progressbar`` and
     ``logger`` are free variables resolved from the enclosing scope.

     Args:
         _ignore: unused; accepted only so the function fits the caller's
             callback signature.

     Returns:
         The result of ``pa.StructArray.from_arrays`` with one child per parent
         label (struct-typed parents contribute one child per field).
     """
     logger.info(f"extracing indices of parent groupers ({self.N:,} unique rows)")
     df = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
     # Peel off one parent index per multiplier; the remainder feeds the next level.
     df['index_0'] = df['bin_value'] // multipliers[0]
     df['leftover_0'] = df['bin_value'] % multipliers[0]
     for i in range(1, len(multipliers)):
         df[f'index_{i}'] = df[f'leftover_{i-1}'] // multipliers[i]
         df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multipliers[i]
     columns = [f'index_{i}' for i in range(len(multipliers))]
     indices_parents = df.evaluate(columns, progress=progressbar)

     def compress(ar):
         """Downcast an integer index array to the dtype chosen for its maximum.

         Non-integer arrays and empty arrays are returned as-is instead of
         falling through to an implicit ``None`` / failing on ``max()``.
         """
         if vaex.dtype_of(ar).kind != 'i':
             return ar
         ar = vaex.array_types.to_numpy(ar)
         if len(ar) == 0:
             # max() of a zero-length array raises; there is nothing to shrink anyway
             return ar
         return ar.astype(vaex.utils.required_dtype_for_max(ar.max()))

     indices_parents = [compress(ar) for ar in indices_parents]
     bin_values = {}
     logger.info("extracing labels of parent groupers...")
     # NOTE: we can also use dict encoding instead of take
     for indices, parent in zip(indices_parents, parents):
         if sort:
             assert parent.pre_sort, "cannot sort while parent not presorted"
             assert parent.sort_indices is None
         dtype = vaex.dtype_of(parent.bin_values)
         if dtype.is_struct:
             # collapse parent struct into our flat struct
             for field, ar in zip(parent.bin_values.type, parent.bin_values.flatten()):
                 bin_values[field.name] = ar.take(indices)
         else:
             bin_values[parent.label] = parent.bin_values.take(indices)
     logger.info("extracing labels of parent groupers done")
     return pa.StructArray.from_arrays(bin_values.values(), bin_values.keys())
Exemplo n.º 2
0
def test_vconstant(value):
    """A constant virtual column keeps its length and value, also after filtering."""
    n_rows = 100
    constant_column = vaex.vconstant(value=value, length=n_rows)
    df = vaex.from_arrays(x=constant_column, y=vaex.vrange(0, n_rows))
    expected_head = [value] * 3

    assert len(df.columns['x']) == n_rows
    assert df.x[:3].tolist() == expected_head

    # y runs 0..99, so `y < 31` keeps exactly the first 31 rows
    below_31 = df[df.y < 31]
    assert len(below_31) == 31
    assert below_31.x[:3].tolist() == expected_head
Exemplo n.º 3
0
def test_vrange():
    """Exercise length, trimming, slicing and a derived column on a `vrange` column."""
    total = 1000 ** 3
    df = vaex.from_arrays(x=vaex.vrange(0, total))
    x_column = df.columns['x']
    assert len(x_column) == total

    window = x_column.trim(2, 4)
    assert window.start == 2
    assert window.stop == 4

    assert len(df) == total
    assert len(df[0:10]) == 10

    expected_x = np.arange(1, 11.).tolist()
    assert df[1:11].x.tolist() == expected_x

    # virtual column built on top of the range column
    df['y'] = df.x ** 2
    expected_y = (np.arange(1, 11) ** 2).tolist()
    assert df[1:11].y.tolist() == expected_y
Exemplo n.º 4
0
def test_vrange():
    """Check that a billion-row `vrange` column reports sizes, trims and slices correctly."""
    row_count = 1000 ** 3
    frame = vaex.from_arrays(x=vaex.vrange(0, row_count))
    assert len(frame.columns['x']) == row_count

    trimmed_column = frame.columns['x'].trim(2, 4)
    assert trimmed_column.start == 2
    assert trimmed_column.stop == 4

    assert len(frame) == row_count
    first_ten = frame[0:10]
    assert len(first_ten) == 10

    x_slice = frame[1:11].x.tolist()
    assert x_slice == np.arange(1, 11.).tolist()

    # squared values are computed from the range column
    frame['y'] = frame.x ** 2
    y_slice = frame[1:11].y.tolist()
    assert y_slice == (np.arange(1, 11) ** 2).tolist()
Exemplo n.º 5
0
    def __init__(self,
                 expression,
                 df,
                 multipliers,
                 parents,
                 sort,
                 row_limit=None):
        '''Group by a single expression that combines several parent groupers.

        Used in the sparse/combined group by.

        After the base class is initialised, the combined ``self.bin_values``
        are split into one index per parent (floor-division / remainder by each
        entry of ``multipliers``) and replaced by a struct array holding the
        parents' own bin values at those indices.
        '''
        super().__init__(expression, df, sort=sort, row_limit=row_limit)
        assert len(multipliers) == len(parents)

        assert multipliers[-1] == 1
        self.df = df
        self.label = 'SHOULD_NOT_BE_USED'
        self.expression = expression

        # Helper frame used to recover each parent's index from the combined value.
        decoder = vaex.from_dict({
            'row': vaex.vrange(0, self.N, dtype='i8'),
            'bin_value': self.bin_values
        })
        decoder['index_0'] = decoder['bin_value'] // multipliers[0]
        decoder['leftover_0'] = decoder['bin_value'] % multipliers[0]
        for level in range(1, len(multipliers)):
            remainder = decoder[f'leftover_{level-1}']
            multiplier = multipliers[level]
            decoder[f'index_{level}'] = remainder // multiplier
            decoder[f'leftover_{level}'] = remainder % multiplier
        index_names = [f'index_{level}' for level in range(len(multipliers))]
        indices_parents = decoder.evaluate(index_names)

        # Look up every parent's labels; struct parents are flattened field by field.
        gathered = {}
        for indices, parent in zip(indices_parents, parents):
            if not vaex.dtype_of(parent.bin_values).is_struct:
                gathered[parent.label] = parent.bin_values.take(indices)
                continue
            fields = parent.bin_values.type
            children = parent.bin_values.flatten()
            for field, child in zip(fields, children):
                gathered[field.name] = child.take(indices)
        self.bin_values = pa.StructArray.from_arrays(gathered.values(),
                                                     gathered.keys())
Exemplo n.º 6
0
def test_columns():
    """Smoke test: frames can be built from a `vrange` column and an arrow string column."""
    df = vaex.from_arrays(x=vaex.vrange(0, 10))
    arrow_strings = pa.array(['foo', 'bar'])
    string_column = vaex.column.ColumnStringArrow.from_arrow(arrow_strings)
    df = vaex.from_arrays(x=string_column)