Пример #1
0
    def reduce(self, others):
        """Merge the sets held by this part and by ``others`` into a single set.

        The first set that is not ``None`` becomes the merge target; all
        remaining sets are handed to its ``merge`` method in one call. The
        result is stored on ``self.set`` and gets a fingerprint derived from
        ``self.fingerprint``. When ``self.return_inverse`` is set,
        ``self.values`` is additionally filled chunk by chunk through the
        merged set's ``flatten_values``.

        :param others: list of the other parts whose ``set`` attribute should
            be merged in
        :raises ValueError: when neither ``self`` nor any of ``others`` holds
            a set
        :raises vaex.RowLimitException: when the merged set has more than
            ``self.unique_limit`` entries
        """
        import time

        partial_sets = [
            part.set for part in [self] + others if part.set is not None
        ]
        if not partial_sets:
            # same exception type the bare unpacking would raise, but with a
            # message that names the actual problem
            raise ValueError('no sets to merge: every part has set=None')
        set_merged, *remaining = partial_sets

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments
        t0 = time.perf_counter()
        set_merged.merge(remaining)
        elapsed = time.perf_counter() - t0
        logger.info(
            f'merge took {elapsed} seconds, size {len(set_merged):,}, byte_size {sys.getsizeof(set_merged):,}'
        )

        # check the limit before building the inverse mapping, so that no
        # work is spent on a result that gets rejected anyway
        if self.unique_limit is not None:
            count = len(set_merged)
            if count > self.unique_limit:
                raise vaex.RowLimitException(
                    f'Resulting set has {count:,} unique combinations, which is larger than the allowed value of {self.unique_limit:,}'
                )

        if self.return_inverse:
            # sort by row index
            self.chunks.sort(key=lambda chunk: chunk[0])
            length = sum(len(values) for _, _, values, _ in self.chunks)
            # the output dtype is taken from the values of the first chunk
            self.values = np.empty(length,
                                   vaex.dtype_of(self.chunks[0][2]).numpy)
            # TODO: we could do this parallel, but overhead is small
            for i1, i2, values, map_index in self.chunks:
                set_merged.flatten_values(values, map_index,
                                          self.values[i1:i2])

        self.set = set_merged
        self.set.fingerprint = f'set-{self.fingerprint}'
Пример #2
0
    def __init__(self, expression, df=None, sort=False, row_limit=None):
        """Prepare grouping state for a categorical expression.

        :param expression: expression to group by; it is re-resolved through
            ``self.df[str(expression)]``
        :param df: dataframe to resolve the expression against; when ``None``
            ``expression.ds`` is used
        :param sort: when true, the category labels are sorted and the
            ordering is kept in ``self.sort_indices``
        :param row_limit: optional upper bound on the number of categories
        :raises vaex.RowLimitException: when the category count exceeds
            ``row_limit``
        """
        # Explicit None test: ``df or ...`` would evaluate the truthiness of
        # the dataframe and silently discard a passed-in df that is falsy.
        # NOTE(review): presumably that is the case for an empty dataframe.
        self.df = df if df is not None else expression.ds
        self.sort = sort
        # make sure it's an expression
        expression = self.df[str(expression)]
        self.expression_original = expression
        self.label = expression._label
        self.expression = expression.index_values() if expression.dtype.is_encoded else expression

        # check the limit first, so no labels are fetched or sorted for a
        # grouper that is rejected anyway
        self.N = self.df.category_count(self.expression_original)
        if row_limit is not None and self.N > row_limit:
            raise vaex.RowLimitException(f'Resulting grouper has {self.N:,} unique combinations, which is larger than the allowed row limit of {row_limit:,}')

        self.bin_values = self.df.category_labels(self.expression_original, aslist=False)
        if self.sort:
            self.sort_indices = pa.compute.sort_indices(self.bin_values)
            self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
        else:
            self.sort_indices = None
        if isinstance(self.bin_values, list):
            self.bin_values = pa.array(self.bin_values)

        self.min_value = self.df.category_offset(self.expression_original)
        # TODO: what do we do with null values for categories?
        # if self.set.has_null:
        #     self.N += 1
        #     keys += ['null']
        self.binner = self.df._binner_ordinal(self.expression, self.N, self.min_value)
        self.binby_expression = str(self.expression)
Пример #3
0
 def process(self, thread_index, i1, i2, filter_mask, ar):
     """Add the values of one chunk to this part's set.

     ``thread_index`` and ``filter_mask`` are accepted but not used here.

     :param i1: start of the row range, forwarded to the selection mask
         evaluation
     :param i2: end of the row range, forwarded likewise
     :param ar: the values of the chunk
     :raises vaex.RowLimitException: as soon as ``self.set.count`` exceeds
         ``self.unique_limit``
     """
     from vaex.column import _to_string_sequence

     if self.set is None:
         # the set is created lazily, on the first chunk this part sees
         self.set = self.ordered_set_type()

     if self.selection:
         # NOTE(review): ``filter`` is called as (values, mask), so it is
         # presumably a module-level helper and not the builtin - confirm
         ar = filter(
             ar,
             self.df.evaluate_selection_mask(self.selection,
                                             i1=i1,
                                             i2=i2,
                                             cache=True),
         )

     if self.dtype.is_list and self.flatten:
         ar = ar.values

     # strings go through the string sequence wrapper, everything else is
     # converted to numpy
     ar = (_to_string_sequence(ar)
           if self.dtype_item.is_string else vaex.array_types.to_numpy(ar))

     if np.ma.isMaskedArray(ar):
         self.set.update(ar, np.ma.getmaskarray(ar))
     else:
         self.set.update(ar)

     if self.unique_limit is None:
         return
     # we skip null and nan here, since this is just an early bail out
     if self.set.count > self.unique_limit:
         raise vaex.RowLimitException(
             f'Resulting set would have >= {self.unique_limit} unique combinations'
         )
Пример #4
0
 def _check_row_limit(self):
     if self.limit is not None:
         # we only raise when we EXCEED the limit
         if self.limit_raise and len(self.hash_map_unique) > self.limit:
             raise vaex.RowLimitException(f'Resulting hash_map_unique would have >= {self.limit} unique combinations')
         # but we can stop when we are AT the limit
         if not self.limit_raise and len(self.hash_map_unique) >= self.limit:
             self.stopped = True
Пример #5
0
    def __init__(self,
                 expression,
                 df=None,
                 sort=False,
                 row_limit=None,
                 pre_sort=True):
        """Prepare grouping state for a categorical expression.

        :param expression: expression to group by; it is re-resolved through
            ``self.df[str(expression)]``
        :param df: dataframe to resolve the expression against; when ``None``
            ``expression.ds`` is used
        :param sort: when true, the category labels are sorted
        :param row_limit: optional upper bound on the number of categories
        :param pre_sort: only relevant together with ``sort``. When true, an
            ordered set built from the sort order is stored on ``self.set``
            and ``self.sort_indices`` stays ``None``; when false, the sort
            order itself is kept in ``self.sort_indices``
        :raises vaex.RowLimitException: when the category count exceeds
            ``row_limit``
        """
        # Explicit None test: ``df or ...`` would evaluate the truthiness of
        # the dataframe and silently discard a passed-in df that is falsy.
        # NOTE(review): presumably that is the case for an empty dataframe.
        self.df = df if df is not None else expression.ds
        self.sort = sort
        self.pre_sort = pre_sort
        # make sure it's an expression
        expression = self.df[str(expression)]
        self.expression_original = expression
        self.label = expression._label
        self.expression = expression.index_values(
        ) if expression.dtype.is_encoded else expression
        self.row_limit = row_limit

        self.min_value = self.df.category_offset(self.expression_original)
        self.bin_values = self.df.category_labels(self.expression_original,
                                                  aslist=False)
        self.N = self.df.category_count(self.expression_original)
        # check the limit before sorting, so no sort or set construction is
        # done for a grouper that is rejected anyway
        if row_limit is not None and self.N > row_limit:
            raise vaex.RowLimitException(
                f'Resulting grouper has {self.N:,} unique combinations, which is larger than the allowed row limit of {row_limit:,}'
            )
        dtype = self.expression.dtype
        if self.sort:
            # not pre-sorting is faster
            sort_indices = pa.compute.sort_indices(self.bin_values)
            self.bin_values = pa.compute.take(self.bin_values, sort_indices)
            if self.pre_sort:
                sort_indices = vaex.array_types.to_numpy(sort_indices)
                # TODO: this is kind of like expression.map
                from .hash import ordered_set_type_from_dtype

                ordered_set_type = ordered_set_type_from_dtype(dtype)
                fingerprint = self.expression.fingerprint(
                ) + "-grouper-sort-mapper"
                # NOTE(review): the meaning of the positional arguments
                # (-1, 0, 0) is defined by the ordered set type - confirm there
                self.set = ordered_set_type(sort_indices + self.min_value, -1,
                                            0, 0, fingerprint)
                # the offset has been folded into the keys of the set above
                self.min_value = 0
                self.sort_indices = None
                self.basename = "set_%s" % vaex.utils._python_save_name(
                    str(self.expression) + "_" + self.set.fingerprint)
            else:
                self.sort_indices = sort_indices
        else:
            self.sort_indices = None
        if isinstance(self.bin_values, list):
            self.bin_values = pa.array(self.bin_values)

        # TODO: what do we do with null values for categories?
        # if self.set.has_null:
        #     self.N += 1
        #     keys += ['null']
        self._promise = vaex.promise.Promise.fulfilled(None)
Пример #6
0
 def reduce(self, others):
     """Merge the sets of ``others`` into this part's set.

     The first set that is not ``None`` (starting with ``self.set``) becomes
     the merge target and is modified in place; the result is stored on
     ``self.set``. When no part holds a set, ``self.set`` stays ``None``.

     :param others: the other parts whose ``set`` attribute should be
         merged in
     :raises vaex.RowLimitException: when the merged set, counting one extra
         entry each for nan and null if present, has more than
         ``self.unique_limit`` entries
     """
     set_merged = self.set
     for other in others:
         if other.set is None:
             continue
         if set_merged is None:
             # adopt the first available set as the merge target
             set_merged = other.set
         else:
             set_merged.merge(other.set)

     # Without any set there is nothing to count; reading ``.count`` on
     # None would raise AttributeError instead of leaving the result empty.
     if self.unique_limit is not None and set_merged is not None:
         count = set_merged.count
         if set_merged.has_nan:
             count += 1
         if set_merged.has_null:
             count += 1
         if count > self.unique_limit:
             raise vaex.RowLimitException(
                 f'Resulting set has {count:,} unique combinations, which is larger than the allowed value of {self.unique_limit:,}'
             )
     self.set = set_merged
Пример #7
0
 def _check_row_limit(self):
     if self.unique_limit is not None:
         if len(self.set) > self.unique_limit:
             raise vaex.RowLimitException(
                 f'Resulting set would have >= {self.unique_limit} unique combinations'
             )