示例#1
0
文件: hash.py 项目: t-triobox/vaex
 def from_keys(cls, keys, dtype=None, fingerprint=''):
     """Create a ``HashMapUnique`` whose keys are taken from ``keys``.

     :param keys: array of keys; converted with ``vaex.array_types.convert(keys, 'numpy-arrow')``
     :param dtype: dtype of the keys, derived from ``keys`` when ``None``
     :param fingerprint: fingerprint passed on to the underlying set
     :raises ValueError: when the keys contain more than one null value
     """
     keys = vaex.array_types.convert(keys, 'numpy-arrow')
     if dtype is None:
         dtype = vaex.dtype_of(keys)
     # only float keys can hold NaN values
     nancount = np.isnan(keys).sum() if dtype == float else 0
     null_count, null_value = 0, -1

     if np.ma.isMaskedArray(keys):
         key_mask = keys.mask
         null_count = key_mask.sum()
         if null_count > 1:
             raise ValueError('key arrays contained more than 1 null value')
         if null_count == 1:
             # remember where the single masked key sits
             null_value = np.where(key_mask == 1)[0]
         # from here on we work with the raw (unmasked) data
         keys = keys.data

     set_type = vaex.hash.ordered_set_type_from_dtype(dtype)
     source = keys
     if dtype.is_string:
         # strings are handed to the set as a string sequence, which carries its own mask
         string_sequence = vaex.column.ColumnStringArrow.from_arrow(keys).string_sequence
         mask = string_sequence.mask()
         if mask is not None:
             null_count = mask.sum()
             if null_count > 1:
                 raise ValueError('key arrays contained more than 1 null value')
             if null_count == 1:
                 null_value = np.where(mask == 1)[0]
         source = string_sequence
     hash_map_unique_internal = set_type(source, null_value, nancount, null_count, fingerprint)
     return HashMapUnique(dtype, _internal=hash_map_unique_internal)
示例#2
0
    def reduce(self, others):
        """Merge the sets of ``others`` into the first available set and store it on ``self.set``.

        Tasks whose ``set`` is ``None`` are skipped. When ``self.return_inverse``
        is set, ``self.values`` is filled chunk by chunk from the merged set.

        :raises vaex.RowLimitException: when the merged set holds more entries than ``self.unique_limit``
        """
        candidates = [self] + others
        available_sets = [k.set for k in candidates if k.set is not None]
        set_merged, *others = available_sets
        import time
        t0 = time.time()
        set_merged.merge(others)
        logger.info(
            f'merge took {time.time()-t0} seconds, size {len(set_merged):,}, byte_size {sys.getsizeof(set_merged):,}'
        )

        if self.return_inverse:
            # order the chunks by their starting row index
            self.chunks.sort(key=lambda chunk: chunk[0])
            total_length = sum(len(values) for _, _, values, _ in self.chunks)
            values_dtype = vaex.dtype_of(self.chunks[0][2]).numpy
            self.values = np.empty(total_length, values_dtype)
            # TODO: we could do this parallel, but overhead is small
            for start, stop, values, map_index in self.chunks:
                set_merged.flatten_values(values, map_index, self.values[start:stop])

        if self.unique_limit is not None:
            count = len(set_merged)
            if count > self.unique_limit:
                raise vaex.RowLimitException(
                    f'Resulting set has {count:,} unique combinations, which is larger than the allowed value of {self.unique_limit:,}'
                )
        self.set = set_merged
        self.set.fingerprint = f'set-{self.fingerprint}'
示例#3
0
文件: groupby.py 项目: t-triobox/vaex
 def process(_ignore):
     """Map every combined bin value back to the labels of its parent groupers.

     The combined ``bin_value`` is decomposed, one parent at a time, with
     floor division and modulo by the matching entry of ``multipliers``; the
     resulting per-parent indices are used to ``take`` the parent labels.

     :param _ignore: unused
     :return: a ``pa.StructArray`` with one field per parent label (a struct
         parent contributes one field per struct member)
     """
     logger.info(f"extracing indices of parent groupers ({self.N:,} unique rows)")
     df = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
     df[f'index_0'] = df['bin_value'] // multipliers[0]
     df[f'leftover_0'] = df[f'bin_value'] % multipliers[0]
     for i in range(1, len(multipliers)):
         df[f'index_{i}'] = df[f'leftover_{i-1}'] // multipliers[i]
         df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multipliers[i]
     columns = [f'index_{i}' for i in range(len(multipliers))]
     indices_parents = df.evaluate(columns, progress=progressbar)

     def compress(ar):
         # Cast signed integer indices to the dtype chosen by
         # required_dtype_for_max for their largest value.
         if vaex.dtype_of(ar).kind == 'i':
             ar = vaex.array_types.to_numpy(ar)
             max_value = ar.max()
             ar = ar.astype(vaex.utils.required_dtype_for_max(max_value))
         # Always hand back an array: previously any other dtype fell off the
         # end of the function and became None in indices_parents.
         return ar
     indices_parents = [compress(ar) for ar in indices_parents]
     bin_values = {}
     logger.info(f"extracing labels of parent groupers...")
     # NOTE: we can also use dict encoding instead of take
     for indices, parent in zip(indices_parents, parents):
         if sort:
             assert parent.pre_sort, "cannot sort while parent not presorted"
             assert parent.sort_indices is None
         dtype = vaex.dtype_of(parent.bin_values)
         if dtype.is_struct:
             # collapse parent struct into our flat struct
             for field, ar in zip(parent.bin_values.type, parent.bin_values.flatten()):
                 bin_values[field.name] = ar.take(indices)
                 # bin_values[field.name] = pa.DictionaryArray.from_arrays(indices, ar)
         else:
             bin_values[parent.label] = parent.bin_values.take(indices)
             # bin_values[parent.label] = pa.DictionaryArray.from_arrays(indices, parent.bin_values)
     logger.info(f"extracing labels of parent groupers done")
     return pa.StructArray.from_arrays(bin_values.values(), bin_values.keys())
示例#4
0
 def compress(ar):
     """Cast a signed integer array to the dtype chosen for its largest value.

     Signed integer input is converted to numpy and cast to the dtype returned
     by ``vaex.utils.required_dtype_for_max``. Any other input is returned
     unchanged (it used to fall through and yield ``None``).
     """
     if vaex.dtype_of(ar).kind == 'i':
         ar = vaex.array_types.to_numpy(ar)
         max_value = ar.max()
         ar = ar.astype(
             vaex.utils.required_dtype_for_max(max_value))
     return ar
示例#5
0
            def process(set):
                """Finish the grouper setup once the set of unique values is available.

                Fills in ``self.bin_values``, ``self.sort_indices``, ``self.set``,
                ``self.basename`` and ``self.N``. ``dtype``, ``expression`` and
                ``pre_sort`` are taken from the enclosing scope.
                """
                self.bin_values = set.key_array()

                if isinstance(self.bin_values, vaex.superstrings.StringList64):
                    # TODO: find out why this more efficient path does not work
                    # col = vaex.column.ColumnStringArrow.from_string_sequence(self.bin_values)
                    # self.bin_values = pa.array(col)
                    self.bin_values = pa.array(self.bin_values.to_numpy())
                if vaex.dtype_of(self.bin_values) == int:
                    # cast to the dtype required_dtype_for_max picks for the largest key
                    max_value = self.bin_values.max()
                    self.bin_values = self.bin_values.astype(
                        vaex.utils.required_dtype_for_max(max_value))
                logger.debug(
                    'Constructed grouper for expression %s with %i values',
                    str(expression), len(self.bin_values))

                if set.has_null and (dtype.is_primitive or dtype.is_datetime):
                    # mark the position of the null key as masked
                    mask = np.zeros(shape=self.bin_values.shape, dtype="?")
                    mask[set.null_value] = 1
                    self.bin_values = np.ma.array(self.bin_values, mask=mask)
                if self.sort:
                    self.bin_values = vaex.array_types.to_arrow(
                        self.bin_values)
                    indices = pa.compute.sort_indices(
                        self.bin_values)  #[offset:])
                    if pre_sort:
                        # sort the keys themselves and rebuild the set from the sorted keys,
                        # so no separate sort indices are needed
                        self.bin_values = pa.compute.take(
                            self.bin_values, indices)
                        # arrow sorts with null last
                        null_value = -1 if not set.has_null else len(
                            self.bin_values) - 1
                        fingerprint = set.fingerprint + "-sorted"
                        if dtype.is_string:
                            # string sets are constructed from a string sequence
                            bin_values = vaex.column.ColumnStringArrow.from_arrow(
                                self.bin_values)
                            string_sequence = bin_values.string_sequence
                            set = type(set)(string_sequence, null_value,
                                            set.nan_count, set.null_count,
                                            fingerprint)
                        else:
                            set = type(set)(self.bin_values, null_value,
                                            set.nan_count, set.null_count,
                                            fingerprint)
                        self.sort_indices = None
                    else:
                        # keep the original set and remember the sort order separately
                        self.sort_indices = vaex.array_types.to_numpy(indices)
                        # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                        self.bin_values = pa.compute.take(
                            self.bin_values, self.sort_indices)
                else:
                    self.sort_indices = None
                self.set = set

                # the name combines the expression and the set fingerprint
                self.basename = 'set_%s' % vaex.utils._python_save_name(
                    str(self.expression) + "_" + set.fingerprint)

                self.N = len(self.bin_values)
                # for datetimes, we converted to int
                if dtype.is_datetime:
                    self.bin_values = dtype.create_array(self.bin_values)
示例#6
0
文件: hash.py 项目: t-triobox/vaex
 def decode(encoding, obj_spec):
     """Rebuild a ``HashMapUnique`` from its encoded specification.

     :param encoding: object whose ``decode(kind, spec)`` restores arrays and dtypes
     :param obj_spec: dict with the keys ``'class'``, ``'data'`` and ``'dtype'``
     """
     internal_type = getattr(vaex.hash, obj_spec['class'])
     data = obj_spec['data']
     keys = encoding.decode('array', data['keys'])
     if vaex.dtype_of(keys).is_string:
         # string keys are passed to the internal map as a string sequence
         keys = vaex.strings.to_string_sequence(keys)
     _hash_map_internal = internal_type(
         keys,
         data['null_value'],
         data['nan_count'],
         data['missing_count'],
         data['fingerprint'],
     )
     # the dtype is stored separately from the keys
     dtype = encoding.decode('dtype', obj_spec['dtype'])
     return vaex.hash.HashMapUnique(dtype, _internal=_hash_map_internal)
示例#7
0
def list_unwrap(ar, level=-1):
    '''Returns the values in a (nested) list, and a callable that puts it back in the same structure

    :param ar: array to unwrap; when it is not a list type it is returned as is
    :param level: which nesting level to return, -1 (the default) means the innermost values
    :return: tuple of (values at the requested level, wrapper), where ``wrapper(new_values)``
        wraps ``new_values`` in the list structure recorded while unwrapping
    '''
    from .convert import trim_offsets
    list_parameters = []
    dtype = vaex.dtype_of(ar)
    array_levels = [ar]
    while dtype.is_list:
        list_parameters.append(
            [ar.type, len(ar),
             ar.buffers(), ar.null_count, ar.offset])
        i1 = ar.offsets[0].as_py()
        i2 = ar.offsets[-1].as_py()
        # Array.slice takes (offset, length), not (start, stop): passing i2 as
        # the length kept trailing values that do not belong to this array
        # whenever the list array was sliced at its tail.
        ar = ar.values.slice(i1, i2 - i1)
        array_levels.append(ar)
        dtype = dtype.value_type

    if level == -1:
        ar = array_levels[-1]
    else:
        ar = array_levels[level]
        list_parameters = list_parameters[:level + 1]

    def wrapper(new_values):
        if not list_parameters:
            # nothing was unwrapped, so there is nothing to put back
            return new_values
        wrapped = None
        # rebuild from the innermost list level outwards
        for list_type, length, buffers, null_count, offset in reversed(list_parameters):
            if wrapped is None:
                # innermost level: the values were sliced, so the offsets are
                # trimmed and the array offset is reset accordingly
                buffers = trim_offsets(offset, length, *buffers[:2])
                offset = 0
                new_values = vaex.array_types.to_arrow(new_values)
                # the new values may have a different type than the original ones
                list_type = pa.list_(new_values.type)
                children = [new_values]
            else:
                children = [wrapped]
            wrapped = pa.ListArray.from_buffers(list_type,
                                                length,
                                                [buffers[0], buffers[1]],
                                                null_count,
                                                offset,
                                                children=children)
        return wrapped

    return ar, wrapper
示例#8
0
    def __init__(self, expression, df, multipliers, parents, sort, row_limit=None):
        '''Group by a single expression that is built up from multiple expressions.

        Used in the sparse/combined group by. ``multipliers`` and ``parents``
        must have the same length and the last multiplier must be 1.
        '''
        super().__init__(expression, df, sort=sort, row_limit=row_limit)
        assert len(multipliers) == len(parents)

        assert multipliers[-1] == 1
        self.df = df
        self.label = 'SHOULD_NOT_BE_USED'
        self.expression = expression
        # efficient way to find the original bin values (parent.bin_value) from the 'compressed'
        # self.bin_values
        lookup = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
        first_multiplier = multipliers[0]
        lookup['index_0'] = lookup['bin_value'] // first_multiplier
        lookup['leftover_0'] = lookup['bin_value'] % first_multiplier
        # every next level works on what was left over from the previous level
        for i in range(1, len(multipliers)):
            previous = f'leftover_{i-1}'
            multiplier = multipliers[i]
            lookup[f'index_{i}'] = lookup[previous] // multiplier
            lookup[f'leftover_{i}'] = lookup[previous] % multiplier
        columns = [f'index_{i}' for i in range(len(multipliers))]
        indices_parents = lookup.evaluate(columns)

        labels = {}
        for indices, parent in zip(indices_parents, parents):
            if vaex.dtype_of(parent.bin_values).is_struct:
                # collapse parent struct into our flat struct
                fields = parent.bin_values.type
                members = parent.bin_values.flatten()
                for field, ar in zip(fields, members):
                    labels[field.name] = ar.take(indices)
            else:
                labels[parent.label] = parent.bin_values.take(indices)
        self.bin_values = pa.StructArray.from_arrays(labels.values(), labels.keys())
示例#9
0
文件: groupby.py 项目: sthagen/vaex
    def __init__(self, expression, df, multipliers, parents, sort, row_limit=None):
        '''Will group by 1 expression, which is built up from multiple expressions.

        Used in the sparse/combined group by.

        :param expression: the combined expression to group by
        :param df: dataframe, stored on ``self.df``
        :param multipliers: one multiplier per parent, the last one must be 1
        :param parents: the parent groupers, same length as ``multipliers``
        :param sort: passed on to the base class
        :param row_limit: passed on to the base class
        '''
        super().__init__(expression, df, sort=sort, row_limit=row_limit)
        assert len(multipliers) == len(parents)

        assert multipliers[-1] == 1
        self.df = df
        self.label = 'SHOULD_NOT_BE_USED'
        self.expression = expression
        # efficient way to find the original bin values (parent.bin_value) from the 'compressed'
        # self.bin_values
        df = vaex.from_dict({'row': vaex.vrange(0, self.N, dtype='i8'), 'bin_value': self.bin_values})
        df[f'index_0'] = df['bin_value'] // multipliers[0]
        df[f'leftover_0'] = df[f'bin_value'] % multipliers[0]
        for i in range(1, len(multipliers)):
            df[f'index_{i}'] = df[f'leftover_{i-1}'] // multipliers[i]
            df[f'leftover_{i}'] = df[f'leftover_{i-1}'] % multipliers[i]
        columns = [f'index_{i}' for i in range(len(multipliers))]
        indices_parents = df.evaluate(columns)

        def compress(ar):
            # Cast signed integer indices to the dtype chosen by
            # required_dtype_for_max for their largest value.
            if vaex.dtype_of(ar).kind == 'i':
                ar = vaex.array_types.to_numpy(ar)
                max_value = ar.max()
                ar = ar.astype(vaex.utils.required_dtype_for_max(max_value))
            # Always hand back an array: previously any other dtype fell off
            # the end of the function and became None in indices_parents.
            return ar
        indices_parents = [compress(ar) for ar in indices_parents]
        bin_values = {}
        # NOTE: we can also use dict encoding instead of take
        for indices, parent in zip(indices_parents, parents):
            dtype = vaex.dtype_of(parent.bin_values)
            if dtype.is_struct:
                # collapse parent struct into our flat struct
                for field, ar in zip(parent.bin_values.type, parent.bin_values.flatten()):
                    bin_values[field.name] = ar.take(indices)
                    # bin_values[field.name] = pa.DictionaryArray.from_arrays(indices, ar)
            else:
                bin_values[parent.label] = parent.bin_values.take(indices)
                # bin_values[parent.label] = pa.DictionaryArray.from_arrays(indices, parent.bin_values)
        self.bin_values = pa.StructArray.from_arrays(bin_values.values(), bin_values.keys())
示例#10
0
文件: dataset.py 项目: t-triobox/vaex
 def _map_hdf5_array(self, data, mask=None, as_arrow=False):
     """Map an hdf5 dataset (and optionally its mask) to an array or column.

     :param data: hdf5 dataset to map
     :param mask: optional hdf5 dataset holding a mask for ``data``
     :param as_arrow: return an arrow array (or a lazily casting column) instead of numpy
     :raises TypeError: when both ``mask`` and ``as_arrow`` are given
     """
     offset = data.id.get_offset()
     if len(data) == 0 and offset is None:
         offset = 0  # we don't care about the offset for empty arrays

     if offset is None:  # non contiguous array, chunked arrays etc
         # we don't support masked in this case
         assert as_arrow is False
         column = ColumnNumpyLike(data)
         self._all_mmapped = False
         return column

     shape = data.shape
     dtype = data.dtype
     # a dtype attribute overrides the dataset dtype, except for the special
     # str type, which is not a numpy dtype
     if "dtype" in data.attrs and data.attrs["dtype"] != "str":
         dtype = data.attrs["dtype"]
         if dtype == 'utf32':
             dtype = np.dtype('U' + str(data.attrs['dlength']))
     array = self._map_array(offset, dtype=dtype, shape=shape)

     if as_arrow:
         if isinstance(array, np.ndarray):
             array = vaex.array_types.to_arrow(array)
         else:
             arrow_type = vaex.dtype_of(array).arrow
             array = vaex.column.ColumnArrowLazyCast(array, arrow_type)

     if mask is None:
         return array
     if as_arrow:
         raise TypeError('Arrow does not support byte masks')
     mask_array = self._map_hdf5_array(mask)
     if isinstance(array, np.ndarray):
         return np.ma.array(array, mask=mask_array, shrink=False)
     return vaex.column.ColumnMaskedNumpy(array, mask_array)
示例#11
0
文件: groupby.py 项目: t-triobox/vaex
            def process(hashmap_unique):
                """Finish the grouper setup once the unique keys are known.

                When simplification is allowed and the integer keys cover their
                range densely enough, only ``self.simpler`` is set. Otherwise
                ``self.bin_values``, ``self.sort_indices``, ``self.hashmap_unique``,
                ``self.basename`` and ``self.N`` are filled in.
                """
                self.bin_values = hashmap_unique.keys()
                if self.allow_simplify and dtype == int and len(self.bin_values):
                    vmin = self.bin_values.min()
                    vmax = self.bin_values.max()
                    int_range = vmax - vmin + 1
                    bins = len(self.bin_values)
                    # we allow for 25% unused 'slots'
                    if int_range <= (bins * 4 / 3):
                        self.simpler = BinnerInteger(
                            self.expression,
                            min_value=vmin,
                            max_value=vmax,
                            dropmissing=not hashmap_unique.has_null,
                            dense=bins == int_range,
                        )
                        return

                if vaex.dtype_of(self.bin_values) == int and len(self.bin_values):
                    # cast to the dtype required_dtype_for_max picks for the largest key
                    compact_dtype = vaex.utils.required_dtype_for_max(self.bin_values.max())
                    self.bin_values = self.bin_values.astype(compact_dtype)
                logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

                if not self.sort:
                    self.sort_indices = None
                elif pre_sort:
                    # the hashmap itself is replaced by a sorted one, no indices needed
                    hashmap_unique, self.bin_values = hashmap_unique.sorted(keys=self.bin_values, return_keys=True)
                    self.sort_indices = None
                else:
                    order = pa.compute.sort_indices(self.bin_values)
                    self.sort_indices = vaex.array_types.to_numpy(order)
                    # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                    self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
                self.hashmap_unique = hashmap_unique

                name_source = str(self.expression) + "_" + hashmap_unique.fingerprint
                self.basename = 'hashmap_unique_%s' % vaex.utils._python_save_name(name_source)

                self.N = len(self.bin_values)
                # for datetimes, we converted to int
                if dtype.is_datetime:
                    self.bin_values = dtype.create_array(self.bin_values)
示例#12
0
文件: groupby.py 项目: sthagen/vaex
    def __init__(self, expression, df=None, sort=False, pre_sort=True, row_limit=None, df_original=None, materialize_experimental=False):
        """Set up a grouper for ``expression`` based on the set of its unique values.

        :param expression: expression to group by
        :param df: dataframe to use; falls back to ``expression.ds``
        :param sort: sort the bin values
        :param pre_sort: when sorting, rebuild the set from the sorted values instead
            of keeping separate sort indices
        :param row_limit: passed to ``_set`` as ``unique_limit``
        :param df_original: dataframe on which the set is calculated, defaults to ``self.df``
        :param materialize_experimental: experimental path that adds the inverse
            values as a ``__materialized_<label>`` column
        """
        self.df = df or expression.ds
        # we prefer to calculate the set the original dataframe to have better cache hits, and modify df
        if df_original is None:
            df_original = self.df
        self.sort = sort
        self.expression = expression
        # make sure it's an expression
        self.expression = self.df[_ensure_string_from_expression(self.expression)]
        self.label = self.expression._label
        if materialize_experimental:
            set, values = df_original._set(self.expression, unique_limit=row_limit, return_inverse=True)
            # TODO: add column should have a unique argument
            self.df.add_column(f'__materialized_{self.label}', values)

            self.bin_values = set.key_array()
            if isinstance(self.bin_values, vaex.superstrings.StringList64):
                self.bin_values = pa.array(self.bin_values.to_numpy())
            # NOTE(review): 'bla' looks like a placeholder expression name - confirm before relying on this path
            self.binby_expression = 'bla'
            self.N = len(self.bin_values)
            self.min_value = 0
            self.binner = self.df._binner_ordinal('bla', self.N, self.min_value)
            self.sort_indices = None
        else:
            set = df_original._set(self.expression, unique_limit=row_limit)
            self.bin_values = set.key_array()

            if isinstance(self.bin_values, vaex.superstrings.StringList64):
                # TODO: find out why this more efficient path does not work
                # col = vaex.column.ColumnStringArrow.from_string_sequence(self.bin_values)
                # self.bin_values = pa.array(col)
                self.bin_values = pa.array(self.bin_values.to_numpy())
            if vaex.dtype_of(self.bin_values).kind == 'i':
                # cast to the dtype required_dtype_for_max picks for the largest key
                max_value = self.bin_values.max()
                self.bin_values = self.bin_values.astype(vaex.utils.required_dtype_for_max(max_value))
            logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

            # since nan and null are at the start, we skip them with sorting
            if self.sort:
                dtype = self.expression.dtype
                indices = pa.compute.sort_indices(self.bin_values)#[offset:])
                if pre_sort:
                    # sort the keys themselves and rebuild the set from the sorted keys
                    self.bin_values = pa.compute.take(self.bin_values, indices)
                    # arrow sorts with null last
                    null_value = -1 if not set.has_null else len(self.bin_values)-1
                    fingerprint = set.fingerprint + "-sorted"
                    if dtype.is_string:
                        # string sets are constructed from a string sequence
                        bin_values = vaex.column.ColumnStringArrow.from_arrow(self.bin_values)
                        string_sequence = bin_values.string_sequence
                        set = type(set)(string_sequence, null_value, set.nan_count, set.null_count, fingerprint)
                    else:
                        set = type(set)(self.bin_values, null_value, set.nan_count, set.null_count, fingerprint)
                    self.sort_indices = None
                else:
                    # TODO: skip first or first two values (null and/or nan)
                    self.sort_indices = vaex.array_types.to_numpy(indices)
                    # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                    self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
            else:
                self.sort_indices = None
            self.set = set

            # TODO: we modify the dataframe in place, this is not nice
            basename = 'set_%s' % vaex.utils._python_save_name(str(expression))
            # the set is stored as a variable on the dataframe so the binby expression can refer to it by name
            self.setname = self.df.add_variable(basename, self.set, unique=True)

            self.binby_expression = '_ordinal_values(%s, %s)' % (self.expression, self.setname)
            self.N = len(self.bin_values)
            self.bin_values = self.expression.dtype.create_array(self.bin_values)
            self.binner = self.df._binner_ordinal(self.binby_expression, self.N)
示例#13
0
文件: dataset.py 项目: t-triobox/vaex
    def _load_columns(self, h5data, first=[]):
        """Add every column found in ``h5data`` to this dataset.

        Groups carrying a ``type`` attribute are loaded as a sparse matrix
        (``csr_matrix``) or as a dictionary encoded column
        (``dictionary_encoded``). All other groups are loaded as regular
        columns, including their alias, ucd, description and unit attributes.
        Finally ``self._columns`` is reordered according to the stored
        ``column_order`` attribute, with columns not listed there at the end.

        :param h5data: hdf5 group holding the columns (directly for version 1
            files, otherwise under ``'columns'``)
        :param first: not used by this method
        :raises TypeError: for an unknown ``type`` attribute, or a column without dtype
        :raises ValueError: when a dictionary encoded column contains null data
        """
        # unit strings as written by Amuse, mapped to their astropy unit name
        amuse_units = {
            "(0.01 * system.get('S.I.').base('length'))": "cm",
            "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))": "cm/s",
            "(0.001 * system.get('S.I.').base('mass'))": "gram",
            "system.get('S.I.').base('length')": "m",
            "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))": "m/s",
            "system.get('S.I.').base('mass')": "kg",
        }

        if "description" in h5data.attrs:
            self.description = ensure_string(h5data.attrs["description"])
        # hdf5, or h5py doesn't keep the order of columns, so manually track that, also enables reordering later
        h5columns = h5data if self._version == 1 else h5data['columns']
        if "column_order" in h5columns.attrs:
            column_order = ensure_string(
                h5columns.attrs["column_order"]).split(",")
        else:
            column_order = []
        for group_name in list(h5columns):
            logger.debug('loading column: %s', group_name)
            group = h5columns[group_name]
            if 'type' in group.attrs:
                group_type = group.attrs['type']
                if group_type in ['csr_matrix']:
                    from scipy.sparse import csr_matrix

                    # skip the format check of the regular csr_matrix constructor
                    class csr_matrix_nocheck(csr_matrix):
                        def check_format(self, *args, **kwargs):
                            pass

                    data = self._map_hdf5_array(group['data'])
                    indptr = self._map_hdf5_array(group['indptr'])
                    indices = self._map_hdf5_array(group['indices'])
                    # make sure we keep the original order
                    groups = [(name, value) for name, value in group.items()
                              if isinstance(value, h5py.Group)]
                    column_names = [None] * len(groups)
                    for name, column in groups:
                        column_names[column.attrs['column_index']] = name
                    matrix = csr_matrix_nocheck(
                        (data, indices, indptr),
                        shape=(len(indptr) - 1, len(column_names)))
                    assert matrix.data is data
                    # assert matrix.indptr is indptr
                    assert matrix.indices is indices
                    self.add_columns(column_names, matrix)
                # elif, not if: otherwise a csr_matrix group always ended up in
                # the else branch below and raised after it was loaded
                elif group_type == 'dictionary_encoded':
                    index = self._map_column(group['indices'], as_arrow=True)
                    values = self._map_column(group['dictionary'],
                                              as_arrow=True)
                    if 'null_bitmap' in group['indices'] or 'mask' in group[
                            'indices']:
                        raise ValueError(
                            f'Did not expect null data in encoded column {group_name}'
                        )
                    if isinstance(values, vaex.column.Column):
                        encoded = vaex.column.ColumnArrowDictionaryEncoded(
                            index, values)
                    else:
                        encoded = pa.DictionaryArray.from_arrays(index, values)
                    self.add_column(group_name, encoded)
                else:
                    raise TypeError(
                        f'Unexpected type {group_type!r} in {group_name}')
            else:
                column_name = group_name
                column = h5columns[column_name]
                if "alias" in column.attrs:
                    column_name = column.attrs["alias"]
                if "ucd" in column.attrs:
                    self.ucds[column_name] = ensure_string(column.attrs["ucd"])
                if "description" in column.attrs:
                    self.descriptions[column_name] = ensure_string(
                        column.attrs["description"])
                if "unit" in column.attrs:
                    # a unit that cannot be parsed is logged, it does not stop the loading
                    try:
                        unitname = ensure_string(column.attrs["unit"])
                        if unitname and unitname != "None":
                            self.units[column_name] = _try_unit(unitname)
                    except Exception:
                        logger.exception("error parsing unit: %s",
                                         column.attrs["unit"])
                if "units" in column.attrs:  # Amuse case
                    unitname = ensure_string(column.attrs["units"])
                    logger.debug("amuse unit: %s", unitname)
                    amuse_unit = amuse_units.get(unitname)
                    if amuse_unit is not None:
                        self.units[column_name] = astropy.units.Unit(amuse_unit)
                if self._version == 1:
                    column = self._map_hdf5_array(column)
                    self.add_column(column_name, column)
                elif hasattr(column["data"], "dtype"):
                    column = self._map_column(column)
                    self.add_column(column_name, column)
                    dtype = vaex.dtype_of(column)
                    logger.debug("adding column %r with dtype %r", column_name,
                                 dtype)
                else:
                    raise TypeError(f'{group_name} is missing dtype')

        all_columns = dict(**self._columns)
        # in case the column_order refers to non-existing columns
        column_order = [k for k in column_order if k in all_columns]
        self._columns = {}
        for name in column_order:
            self._columns[name] = all_columns.pop(name)
        # add the rest
        for name, col in all_columns.items():
            self._columns[name] = col
示例#14
0
    def __init__(self,
                 expression,
                 values,
                 keep_other=True,
                 other_value=None,
                 sort=False,
                 label=None,
                 df=None):
        """Set up a grouper that groups by a fixed collection of ``values``.

        :param expression: expression to group by
        :param values: the values to group by; a ``pa.ChunkedArray`` is concatenated first
        :param keep_other: must be true, ``other_value`` is appended to the bin values as an extra bin
        :param other_value: label of the extra bin
        :param sort: sort ``values`` before building the bins
        :param label: label of the grouper, defaults to the label of the expression
        :param df: dataframe to use, defaults to ``expression.df``
        :raises NotImplementedError: when ``keep_other`` is false
        """
        self.df = df or expression.df
        self.sort = sort
        self.pre_sort = True
        self.expression = self.df[str(expression)]
        self.label = label or self.expression._label
        self.keep_other = keep_other
        if isinstance(values, pa.ChunkedArray):
            values = pa.concat_arrays(values.chunks)
        if sort:
            indices = pa.compute.sort_indices(values)
            values = pa.compute.take(values, indices)

        if not self.keep_other:
            # although we can support this, it will fail with _combine, because of
            # the mapping of the set to -1
            raise NotImplementedError("not supported yet")
        self.bin_values = pa.array(
            vaex.array_types.tolist(values) + [other_value])
        # the values proper are the bin values without the trailing other_value
        self.values = self.bin_values.slice(0, len(self.bin_values) - 1)
        self.N = len(self.bin_values)
        dtype = vaex.dtype_of(self.values)
        set_type = vaex.hash.ordered_set_type_from_dtype(dtype)
        values_list = self.values.tolist()
        # the set needs the position of the null value, or -1 when there is none
        try:
            null_value = values_list.index(None)
            null_count = 1
        except ValueError:
            null_value = -1
            null_count = 0
        if dtype == float:
            nancount = np.isnan(self.values).sum()
        else:
            nancount = 0

        fp = vaex.cache.fingerprint(values)
        fingerprint = f"set-grouper-fixed-{fp}"
        if dtype.is_string:
            # string sets are constructed from a string sequence
            values = vaex.column.ColumnStringArrow.from_arrow(self.values)
            string_sequence = values.string_sequence
            self.set = set_type(string_sequence, null_value, nancount,
                                null_count, fingerprint)
        else:
            self.set = set_type(self.values, null_value, nancount, null_count,
                                fingerprint)

        self.basename = "set_%s" % vaex.utils._python_save_name(
            str(self.expression) + "_" + self.set.fingerprint)
        self.binby_expression = expression
        self.sort_indices = None
        self._promise = vaex.promise.Promise.fulfilled(None)