Example #1
File: groupby.py Project: sthagen/vaex
    def __init__(self, df, by, sort=False, combine=False, expand=True, row_limit=None):
        '''Note that row_limit only works in combination with combine=True'''
        df_original = df
        df = df.copy()  # we're gonna mutate, so create a shallow copy
        self.df = df
        self.sort = sort
        self.expand = expand  # keep as pyarrow struct?

        if not isinstance(by, collections_abc.Iterable)\
            or isinstance(by, six.string_types):
            by = [by]

        self.by = []
        self.by_original = by
        for by_value in by:
            if not isinstance(by_value, BinnerBase):
                if df.is_category(by_value):
                    by_value = GrouperCategory(df[_ensure_string_from_expression(by_value)], sort=sort, row_limit=row_limit)
                else:
                    by_value = Grouper(df[_ensure_string_from_expression(by_value)], sort=sort, row_limit=row_limit, df_original=df_original)
            self.by.append(by_value)
        if combine is True and len(self.by) >= 2:
            self.by = [_combine(self.df, self.by, sort=sort, row_limit=row_limit)]
            self.combine = True
        elif combine == 'auto' and len(self.by) >= 2:
            cells = product([grouper.N for grouper in self.by])
            dim = len(self.by)
            rows = df.length_unfiltered()  # we don't want to trigger a computation
            occupancy = rows/cells
            logger.debug('%s rows and %s grid cells => occupancy=%s', rows, cells, occupancy)
            # we want each cell to have at least 10x occupancy
            if occupancy < 10:
                logger.info(f'Combining {len(self.by)} groupers into 1')
                self.by = [_combine(self.df, self.by, sort=sort, row_limit=row_limit)]
                self.combine = True
            else:
                self.combine = False
        else:
            self.combine = False


        # binby may be an expression based on self.by.expression
        # if we want to have all columns, minus the columns grouped by
        # we should keep track of the original expressions, but binby
        self.groupby_expression = [str(by.expression) for by in self.by]
        self.binners = tuple(by.binner for by in self.by)
        self.shape = [by.N for by in self.by]
        self.dims = self.groupby_expression[:]
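
The constructor above is normally reached through the public df.groupby API rather than instantiated directly. A minimal usage sketch, assuming a recent vaex release where groupby accepts agg and sort (the column names g and x are invented for illustration):

import numpy as np
import vaex

df = vaex.from_arrays(g=np.array([0, 1, 0, 1, 1]), x=np.arange(5.0))
# one grouper is built per `by` entry; sort=True maps to the sort flag above
result = df.groupby('g', agg={'xsum': vaex.agg.sum('x')}, sort=True)
print(result)  # two rows, one per unique value of g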
Example #2
    def __init__(self,
                 expression,
                 df=None,
                 sort=False,
                 pre_sort=True,
                 row_limit=None,
                 df_original=None):
        self.df = df or expression.ds
        # we prefer to calculate the set on the original dataframe to have better cache hits, and modify df
        if df_original is None:
            df_original = self.df
        self.sort = sort
        self.expression = expression
        # make sure it's an expression
        self.expression = self.df[_ensure_string_from_expression(
            self.expression)]
        self.label = self.expression._label
        set = df_original._set(self.expression, unique_limit=row_limit)
        keys = set.keys()
        if self.sort:
            if pre_sort:
                sort_indices = np.argsort(keys)
                keys = np.array(keys)[sort_indices].tolist()
                set_dict = dict(zip(keys, range(len(keys))))
                set = type(set)(set_dict, set.count, set.nan_count,
                                set.null_count)
                self.sort_indices = None
            else:
                self.sort_indices = np.argsort(keys)
                keys = np.array(keys)[self.sort_indices].tolist()
        else:
            self.sort_indices = None
        self.set = set

        # TODO: we modify the dataframe in place, this is not nice
        basename = 'set_%s' % vaex.utils._python_save_name(str(expression))
        self.setname = self.df.add_variable(basename, self.set, unique=True)

        self.bin_values = keys
        self.binby_expression = '_ordinal_values(%s, %s)' % (self.expression,
                                                             self.setname)
        self.N = len(self.bin_values)
        if self.set.has_null:
            self.N += 1
            self.bin_values = [None] + self.bin_values
        if self.set.has_nan:
            self.N += 1
            self.bin_values = [np.nan] + self.bin_values
        if self.sort_indices is not None:
            if self.set.has_null and self.set.has_nan:
                self.sort_indices = np.concatenate([[0, 1],
                                                    self.sort_indices + 2])
            elif self.set.has_null or self.set.has_nan:
                self.sort_indices = np.concatenate([[0],
                                                    self.sort_indices + 1])
        self.bin_values = self.expression.dtype.create_array(self.bin_values)
        self.binner = self.df._binner_ordinal(self.binby_expression, self.N)
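
What _set plus _ordinal_values accomplish can be pictured without vaex internals: unique keys are collected into a hash map, optionally sorted once up front (the pre_sort=True branch), and each row is binned by its key's ordinal position. A standalone sketch of the idea, not vaex code:

import numpy as np

values = np.array([30, 10, 20, 10, 30])
keys = list(dict.fromkeys(values.tolist()))  # unique keys, first-seen order
sorted_keys = sorted(keys)                   # the pre_sort path: sort the keys once
codes = {k: i for i, k in enumerate(sorted_keys)}
ordinals = np.array([codes[v] for v in values.tolist()])
print(sorted_keys, ordinals)  # [10, 20, 30] [2 0 1 0 2]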
Example #3
File: groupby.py Project: sthagen/vaex
    def __init__(self, expression, df=None, sort=False, pre_sort=True, row_limit=None, df_original=None, materialize_experimental=False):
        self.df = df or expression.ds
        # we prefer to calculate the set on the original dataframe to have better cache hits, and modify df
        if df_original is None:
            df_original = self.df
        self.sort = sort
        self.expression = expression
        # make sure it's an expression
        self.expression = self.df[_ensure_string_from_expression(self.expression)]
        self.label = self.expression._label
        if materialize_experimental:
            set, values = df_original._set(self.expression, unique_limit=row_limit, return_inverse=True)
            # TODO: add column should have a unique argument
            self.df.add_column(f'__materialized_{self.label}', values)

            self.bin_values = set.key_array()
            if isinstance(self.bin_values, vaex.superstrings.StringList64):
                self.bin_values = pa.array(self.bin_values.to_numpy())
            self.binby_expression = 'bla'
            self.N = len(self.bin_values)
            self.min_value = 0
            self.binner = self.df._binner_ordinal('bla', self.N, self.min_value)
            self.sort_indices = None
        else:
            set = df_original._set(self.expression, unique_limit=row_limit)
            self.bin_values = set.key_array()

            if isinstance(self.bin_values, vaex.superstrings.StringList64):
                # TODO: find out why this more efficient path does not work
                # col = vaex.column.ColumnStringArrow.from_string_sequence(self.bin_values)
                # self.bin_values = pa.array(col)
                self.bin_values = pa.array(self.bin_values.to_numpy())
            if vaex.dtype_of(self.bin_values).kind == 'i':
                max_value = self.bin_values.max()
                self.bin_values = self.bin_values.astype(vaex.utils.required_dtype_for_max(max_value))
            logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

            # since nan and null are at the start, we skip them with sorting
            if self.sort:
                dtype = self.expression.dtype
                indices = pa.compute.sort_indices(self.bin_values)#[offset:])
                if pre_sort:
                    self.bin_values = pa.compute.take(self.bin_values, indices)
                    # arrow sorts with null last
                    null_value = -1 if not set.has_null else len(self.bin_values)-1
                    fingerprint = set.fingerprint + "-sorted"
                    if dtype.is_string:
                        bin_values = vaex.column.ColumnStringArrow.from_arrow(self.bin_values)
                        string_sequence = bin_values.string_sequence
                        set = type(set)(string_sequence, null_value, set.nan_count, set.null_count, fingerprint)
                    else:
                        set = type(set)(self.bin_values, null_value, set.nan_count, set.null_count, fingerprint)
                    self.sort_indices = None
                else:
                    # TODO: skip first or first two values (null and/or nan)
                    self.sort_indices = vaex.array_types.to_numpy(indices)
                    # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                    self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
            else:
                self.sort_indices = None
            self.set = set

            # TODO: we modify the dataframe in place, this is not nice
            basename = 'set_%s' % vaex.utils._python_save_name(str(expression))
            self.setname = self.df.add_variable(basename, self.set, unique=True)

            self.binby_expression = '_ordinal_values(%s, %s)' % (self.expression, self.setname)
            self.N = len(self.bin_values)
            self.bin_values = self.expression.dtype.create_array(self.bin_values)
            self.binner = self.df._binner_ordinal(self.binby_expression, self.N)
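
The null_value = len(self.bin_values) - 1 bookkeeping above relies on Arrow placing nulls last when sorting. A small check of that behaviour, assuming pyarrow is installed:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(['b', None, 'a'])
indices = pc.sort_indices(arr)    # nulls sort last by default
print(pc.take(arr, indices))      # ["a", "b", null]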
Example #4
def join(df, other, on=None, left_on=None, right_on=None, lprefix='', rprefix='', lsuffix='', rsuffix='', how='left', allow_duplication=False, prime_growth=False, cardinality_other=None, inplace=False):
    # implementation of DataFrameLocal.join
    inner = False
    left = df
    right = other
    left_original = left.copy()
    right_original = right.copy()
    rprefix_original, lprefix_original = rprefix, lprefix
    rsuffix_original, lsuffix_original = rsuffix, lsuffix
    right_on_original, left_on_original = right_on, left_on
    if how == 'left':
        pass
    elif how == 'right':
        left, right = right, left
        lprefix, rprefix = rprefix, lprefix
        lsuffix, rsuffix = rsuffix, lsuffix
        left_on, right_on = right_on, left_on
    elif how == 'inner':
        inner = True
    else:
        raise ValueError('join type not supported: {}, only left, right and inner are supported'.format(how))
    left = left if inplace else left.copy()

    on = _ensure_string_from_expression(on)
    left_on = _ensure_string_from_expression(left_on)
    right_on = _ensure_string_from_expression(right_on)
    left_on = left_on or on
    right_on = right_on or on
    for name in right:
        if left_on and (rprefix + name + rsuffix == lprefix + left_on + lsuffix):
            continue  # it's ok when we join on the same column name
        if name in left and rprefix + name + rsuffix == lprefix + name + lsuffix:
            raise ValueError('column name collision: {} exists in both columns, and no proper suffix given'
                                .format(name))

    right = right.extract()  # get rid of filters and active_range
    assert left.length_unfiltered() == left.length_original()
    N = left.length_unfiltered()
    N_other = len(right)
    if left_on is None and right_on is None:
        lookup = None
    else:
        df = left
        # we index the right side, this assumes right is smaller in size
        index = right._index(right_on, prime_growth=prime_growth, cardinality=cardinality_other)
        dtype = left.data_type(left_on)
        duplicates_right = index.has_duplicates

        if duplicates_right and not allow_duplication:
            raise ValueError('This join will lead to duplication of rows which is disabled, pass allow_duplication=True')

        # our max value for the lookup table is the row index number, so if we join a small
        # df with, say, 100 rows, we can do it with an int8
        lookup_dtype = vaex.utils.required_dtype_for_max(len(right))
        # we fill with the max value so that a bug fails loudly: rather than silently
        # pointing to row 0, a stale entry should crash
        lookup = np.full(left._length_original, np.iinfo(lookup_dtype).max, dtype=lookup_dtype)
        nthreads = df.executor.thread_pool.nthreads
        lookup_masked = [False] * nthreads  # does the lookup contain masked/-1 values?
        lookup_extra_chunks = []

        from vaex.column import _to_string_sequence
        def map(thread_index, i1, i2, selection_masks, blocks):
            ar = blocks[0]
            if vaex.array_types.is_string_type(dtype):
                previous_ar = ar
                ar = _to_string_sequence(ar)
            if dtype.is_datetime:
                ar = ar.view(np.int64)
            if np.ma.isMaskedArray(ar):
                mask = np.ma.getmaskarray(ar)
                found_masked = index.map_index_masked(ar.data, mask, lookup[i1:i2])
                lookup_masked[thread_index] = lookup_masked[thread_index] or found_masked
                if duplicates_right:
                    extra = index.map_index_duplicates(ar.data, mask, i1)
                    lookup_extra_chunks.append(extra)
            else:
                found_masked = index.map_index(ar, lookup[i1:i2])
                lookup_masked[thread_index] = lookup_masked[thread_index] or found_masked
                if duplicates_right:
                    extra = index.map_index_duplicates(ar, i1)
                    lookup_extra_chunks.append(extra)
        def reduce(a, b):
            pass
        left.map_reduce(map, reduce, [left_on], delay=False, name='fill looking', info=True, to_numpy=False, ignore_filter=True)
        if len(lookup_extra_chunks):
            # if the right has duplicates, we grow the left dataframe and the lookup array
            lookup_left = np.concatenate([k[0] for k in lookup_extra_chunks])
            lookup_right = np.concatenate([k[1] for k in lookup_extra_chunks])
            left = left.concat(left.take(lookup_left))
            lookup = np.concatenate([lookup, lookup_right])

        if inner:
            left_mask_matched = lookup != -1  # all the places where we found a match to the right
            lookup = lookup[left_mask_matched]  # filter the lookup table to the right
            left_indices_matched = np.where(left_mask_matched)[0]  # convert mask to indices for the left
            # indices can still refer to filtered rows, so do not drop the filter
            left = left.take(left_indices_matched, filtered=False, dropfilter=False)
    direct_indices_map = {}  # for performance, keeps a cache of two levels of indirection of indices

    def mangle_name(prefix, name, suffix):
        if name.startswith('__'):
            return '__' + prefix + name[2:] + suffix
        else:
            return prefix + name + suffix

    # first, do renaming, so all column names are unique
    right_names = right.get_names(hidden=True)
    left_names = left.get_names(hidden=True)
    for name in right_names:
        if name in left_names:
            # find a unique name across both dataframe, including the new name for the left
            all_names = list(set(right_names + left_names))
            all_names.append(mangle_name(lprefix, name, lsuffix))  # we don't want to steal the left's name
            all_names.remove(name)  # we could even claim the original name
            new_name = mangle_name(rprefix, name, rsuffix)
            # we will not add this column twice when it is the join column
            if new_name != left_on:
                if new_name in all_names:  # it's still not unique
                    new_name = vaex.utils.find_valid_name(new_name, all_names)
                right.rename(name, new_name)
                right_names[right_names.index(name)] = new_name

            # and the same for the left
            all_names = list(set(right_names + left_names))
            all_names.remove(name)
            new_name = mangle_name(lprefix, name, lsuffix)
            if new_name in all_names:  # still not unique
                new_name = vaex.utils.find_valid_name(new_name, all_names)
            left.rename(name, new_name)
            left_names[left_names.index(name)] = new_name

    # now we add columns from the right, to the left
    right_names = right.get_names(hidden=True)
    left_names = left.get_names(hidden=True)
    right_columns = []
    for name in right_names:
        column_name = name
        if name == left_on and name in left_names:
            continue  # skip when it's the join column
        assert name not in left_names
        if name in right.variables:
            left.set_variable(name, right.variables[name])
        elif column_name in right.virtual_columns:
            left.add_virtual_column(name, right.virtual_columns[column_name])
        elif column_name in right.functions:
            if name in left.functions:
                raise NameError(f'Name collision for function {name}')
            left.functions[name] = right.functions[name]
        else:
            right_columns.append(name)
            # we already add the column name here to get the same order
            left.column_names.append(name)
            left._initialize_column(name)
    # merge the two datasets
    right_dataset = right.dataset.project(*right_columns)
    if lookup is not None:
        # if lookup is None, we do a row based join
        # and we only need to merge.
        # if we have an array of lookup indices, we 'take' those
        right_dataset = right_dataset.take(lookup, masked=any(lookup_masked))
    dataset = left.dataset.merged(right_dataset)
    # row number etc should not have changed, we only append new columns
    # so no need to reset caches
    left._dataset = DatasetJoin(dataset, left_original, right_original,
        on=on, left_on=left_on_original, right_on=right_on_original,
        lprefix=lprefix_original, rprefix=rprefix_original, lsuffix=lsuffix_original, rsuffix=rsuffix_original,
        how=how, allow_duplication=allow_duplication, prime_growth=prime_growth, cardinality_other=cardinality_other
    )
    return left
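
A hedged usage sketch of the function above through the public DataFrame.join API (the column names are invented; with how='left', unmatched rows get missing values):

import numpy as np
import vaex

left = vaex.from_arrays(key=np.array([1, 2, 3]), x=np.array([10.0, 20.0, 30.0]))
right = vaex.from_arrays(key=np.array([2, 3, 4]), y=np.array([0.2, 0.3, 0.4]))
# rsuffix would disambiguate any non-join columns that exist on both sides
joined = left.join(right, on='key', how='left', rsuffix='_r')
print(joined)  # y is missing where key=1 found no match on the right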
Example #5
    def derivative(self, var, simplify=True):
        var = _ensure_string_from_expression(var)
        return self.__class__(
            self.ds,
            expresso.derivative(self.expression, var, simplify=simplify))
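
A usage sketch for derivative (Examples #5 above and #9 below are two revisions of the same method); this assumes the Expression class shown exposes it on a stock install, and the exact string form of the result may differ between versions:

import numpy as np
import vaex

df = vaex.from_arrays(x=np.linspace(0.0, 1.0, 5))
expr = df.x ** 2
print(expr.derivative('x'))  # a new expression, something like (2 * x)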
Example #6
File: groupby.py Project: t-triobox/vaex
    def __init__(self, df, by, sort=False, combine=False, expand=True, row_limit=None, copy=True, progress=None):
        '''Note that row_limit only works in combination with combine=True'''
        df_original = df
        if copy:
            df = df.copy()  # we will mutate the df (add variables); copying keeps the original dataframe unchanged
        self.df = df
        self.sort = sort
        self.expand = expand  # keep as pyarrow struct?
        self.progressbar = vaex.utils.progressbars(progress)
        self.progressbar_groupers = self.progressbar.add("groupers")

        if not isinstance(by, collections_abc.Iterable)\
            or isinstance(by, six.string_types):
            by = [by]

        self.by = []
        self.by_original = by
        for by_value in by:
            if not isinstance(by_value, BinnerBase):
                expression = df[_ensure_string_from_expression(by_value)]
                if df.is_category(by_value):
                    by_value = GrouperCategory(expression, sort=sort, row_limit=row_limit)
                else:
                    dtype = expression.dtype
                    if dtype == np.dtype('uint8') or dtype == np.dtype('int8') or dtype == np.dtype('bool'):
                        by_value = BinnerInteger(expression)  # always sorted, and pre_sorted
                    else:
                        # we cannot mix _combine with BinnerInteger yet
                        by_value = Grouper(expression, sort=sort, row_limit=row_limit, df_original=df_original, progress=self.progressbar_groupers, allow_simplify=True)
            self.by.append(by_value)
        @vaex.delayed
        def possible_combine(*binner_promises):
            # if a binner realized there is a simpler way (e.g. grouper -> intbinner)
            self.by = [by.simpler if by.simpler is not None else by for by in self.by]
            # because binners can be created from other dataframes (we make a copy)
            # we let it mutate *our* dataframe
            for binner in self.by:
                binner._create_binner(self.df)
            cells = product([grouper.N for grouper in self.by])
            @vaex.delayed
            def set_combined(combined):
                combined._create_binner(self.df)
                self.by = [combined]
                self.combine = True
            if ((row_limit is not None) or (combine is True)) and len(self.by) >= 2 and cells > 0:
                promise = set_combined(_combine(self.df, self.by, sort=sort, row_limit=row_limit, progress=self.progressbar_groupers))
            elif combine == 'auto' and len(self.by) >= 2:
                # by default, assume we cannot combine
                self.combine = False
                promise = vaex.promise.Promise.fulfilled(None)
                # don't even try when one grouper has 0 options
                if cells > 0:
                    dim = len(self.by)
                    rows = df.length_unfiltered()  # we don't want to trigger a computation
                    occupancy = rows/cells
                    logger.debug('%s rows and %s grid cells => occupancy=%s', rows, cells, occupancy)
                    # we want each cell to have at least 10x occupancy
                    if occupancy < 10:
                        logger.info(f'Combining {len(self.by)} groupers into 1')
                        promise = set_combined(_combine(self.df, self.by, sort=sort, row_limit=row_limit, progress=self.progressbar_groupers))
                        self.combine = True
            else:
                self.combine = False
                promise = vaex.promise.Promise.fulfilled(None)
            @vaex.delayed
            def process(_ignore):
                self.dense = len(self.by) == 1 and self.by[0].dense
                self.groupby_expression = [str(by.expression) for by in self.by]
                self.binners = tuple(by.binner for by in self.by)
                self.shape = [by.N for by in self.by]
                self.dims = [by.label for by in self.by]
                self.progressbar_groupers(1)
            return process(promise)

        self._promise_by = self.progressbar_groupers.exit_on(possible_combine(*[by._promise for by in self.by]))
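
The combine='auto' branch above boils down to a small heuristic: estimate rows per grid cell and combine the groupers into one when occupancy would drop below 10. A standalone restatement (the snippets' product helper is replaced here with functools.reduce):

from functools import reduce
from operator import mul

def should_combine(rows, grouper_sizes, min_occupancy=10):
    cells = reduce(mul, grouper_sizes, 1)  # total number of grid cells
    return cells > 0 and rows / cells < min_occupancy

print(should_combine(1_000_000, [100, 100]))    # False: occupancy is 100
print(should_combine(1_000_000, [1000, 1000]))  # True: occupancy is 1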
Example #7
File: groupby.py Project: t-triobox/vaex
    def __init__(self, expression, df=None, sort=False, pre_sort=True, row_limit=None, df_original=None, materialize_experimental=False, progress=None, allow_simplify=False):
        self.df = df or expression.ds
        self.sort = sort
        self.pre_sort = pre_sort
        # we prefer to calculate the set on the original dataframe to have better cache hits, and modify df
        if df_original is None:
            df_original = self.df
        self.sort = sort
        self.expression = expression
        self.allow_simplify = allow_simplify
        # make sure it's an expression
        self.expression = self.df[_ensure_string_from_expression(self.expression)]
        self.label = self.expression._label
        self.progressbar = vaex.utils.progressbars(progress, title=f"grouper: {repr(self.label)}")
        dtype = self.expression.dtype
        if materialize_experimental:
            set, values = df_original._set(self.expression, limit=row_limit, return_inverse=True)
            # TODO: add column should have a unique argument
            self.df.add_column(f'__materialized_{self.label}', values)

            self.bin_values = set.key_array()
            if isinstance(self.bin_values, vaex.superstrings.StringList64):
                self.bin_values = pa.array(self.bin_values.to_numpy())
            self.binby_expression = 'bla'
            self.N = len(self.bin_values)
            self.min_value = 0
            self.binner = self.df._binner_ordinal('bla', self.N, self.min_value)
            self.sort_indices = None
        else:
            @vaex.delayed
            def process(hashmap_unique):
                self.bin_values = hashmap_unique.keys()
                if self.allow_simplify and dtype == int and len(self.bin_values):
                    vmin = self.bin_values.min()
                    vmax = self.bin_values.max()
                    int_range = vmax - vmin + 1
                    # we allow for 25% unused 'slots'
                    bins = len(self.bin_values)
                    if int_range <= (bins * 4 / 3):
                        dense = bins == int_range
                        self.simpler = BinnerInteger(self.expression, min_value=vmin, max_value=vmax, dropmissing=not hashmap_unique.has_null, dense=dense)
                        return

                if vaex.dtype_of(self.bin_values) == int and len(self.bin_values):
                    max_value = self.bin_values.max()
                    self.bin_values = self.bin_values.astype(vaex.utils.required_dtype_for_max(max_value))
                logger.debug('Constructed grouper for expression %s with %i values', str(expression), len(self.bin_values))

                if self.sort:
                    if pre_sort:
                        hashmap_unique, self.bin_values = hashmap_unique.sorted(keys=self.bin_values, return_keys=True)
                        self.sort_indices = None
                    else:
                        indices = pa.compute.sort_indices(self.bin_values)
                        self.sort_indices = vaex.array_types.to_numpy(indices)
                        # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                        self.bin_values = pa.compute.take(self.bin_values, self.sort_indices)
                else:
                    self.sort_indices = None
                self.hashmap_unique = hashmap_unique

                self.basename = 'hashmap_unique_%s' % vaex.utils._python_save_name(str(self.expression) + "_" + hashmap_unique.fingerprint)

                self.N = len(self.bin_values)
                # for datetimes, we converted to int
                if dtype.is_datetime:
                    self.bin_values = dtype.create_array(self.bin_values)
            self._promise = process(df_original._hash_map_unique(self.expression, limit=row_limit, delay=True, progress=self.progressbar))
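
The allow_simplify branch above switches from a hash-map Grouper to a direct BinnerInteger when the unique integer keys nearly fill their value range, wasting at most roughly 25% of the slots. The test in isolation, with can_simplify as a made-up name:

def can_simplify(unique_keys):
    vmin, vmax = min(unique_keys), max(unique_keys)
    int_range = vmax - vmin + 1  # slots a direct integer binner would need
    return int_range <= len(unique_keys) * 4 / 3

print(can_simplify([3, 4, 5, 7]))   # True: 5 slots for 4 keys
print(can_simplify([0, 100, 200]))  # False: 201 slots for 3 keys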
Example #8
    def __init__(self,
                 expression,
                 df=None,
                 sort=False,
                 pre_sort=True,
                 row_limit=None,
                 df_original=None,
                 materialize_experimental=False,
                 progress=None):
        self.df = df or expression.ds
        self.sort = sort
        self.pre_sort = pre_sort
        # we prefer to calculate the set on the original dataframe to have better cache hits, and modify df
        if df_original is None:
            df_original = self.df
        self.sort = sort
        self.expression = expression
        # make sure it's an expression
        self.expression = self.df[_ensure_string_from_expression(
            self.expression)]
        self.label = self.expression._label
        self.progressbar = vaex.utils.progressbars(
            progress, title=f"grouper: {repr(self.label)}")
        dtype = self.expression.dtype
        if materialize_experimental:
            set, values = df_original._set(self.expression,
                                           unique_limit=row_limit,
                                           return_inverse=True)
            # TODO: add column should have a unique argument
            self.df.add_column(f'__materialized_{self.label}', values)

            self.bin_values = set.key_array()
            if isinstance(self.bin_values, vaex.superstrings.StringList64):
                self.bin_values = pa.array(self.bin_values.to_numpy())
            self.binby_expression = 'bla'
            self.N = len(self.bin_values)
            self.min_value = 0
            self.binner = self.df._binner_ordinal('bla', self.N,
                                                  self.min_value)
            self.sort_indices = None
        else:

            @vaex.delayed
            def process(set):
                self.bin_values = set.key_array()

                if isinstance(self.bin_values, vaex.superstrings.StringList64):
                    # TODO: find out why this more efficient path does not work
                    # col = vaex.column.ColumnStringArrow.from_string_sequence(self.bin_values)
                    # self.bin_values = pa.array(col)
                    self.bin_values = pa.array(self.bin_values.to_numpy())
                if vaex.dtype_of(self.bin_values) == int:
                    max_value = self.bin_values.max()
                    self.bin_values = self.bin_values.astype(
                        vaex.utils.required_dtype_for_max(max_value))
                logger.debug(
                    'Constructed grouper for expression %s with %i values',
                    str(expression), len(self.bin_values))

                if set.has_null and (dtype.is_primitive or dtype.is_datetime):
                    mask = np.zeros(shape=self.bin_values.shape, dtype="?")
                    mask[set.null_value] = 1
                    self.bin_values = np.ma.array(self.bin_values, mask=mask)
                if self.sort:
                    self.bin_values = vaex.array_types.to_arrow(
                        self.bin_values)
                    indices = pa.compute.sort_indices(
                        self.bin_values)  #[offset:])
                    if pre_sort:
                        self.bin_values = pa.compute.take(
                            self.bin_values, indices)
                        # arrow sorts with null last
                        null_value = -1 if not set.has_null else len(
                            self.bin_values) - 1
                        fingerprint = set.fingerprint + "-sorted"
                        if dtype.is_string:
                            bin_values = vaex.column.ColumnStringArrow.from_arrow(
                                self.bin_values)
                            string_sequence = bin_values.string_sequence
                            set = type(set)(string_sequence, null_value,
                                            set.nan_count, set.null_count,
                                            fingerprint)
                        else:
                            set = type(set)(self.bin_values, null_value,
                                            set.nan_count, set.null_count,
                                            fingerprint)
                        self.sort_indices = None
                    else:
                        self.sort_indices = vaex.array_types.to_numpy(indices)
                        # the bin_values will still be pre sorted, maybe that is confusing (implementation detail)
                        self.bin_values = pa.compute.take(
                            self.bin_values, self.sort_indices)
                else:
                    self.sort_indices = None
                self.set = set

                self.basename = 'set_%s' % vaex.utils._python_save_name(
                    str(self.expression) + "_" + set.fingerprint)

                self.N = len(self.bin_values)
                # for datetimes, we converted to int
                if dtype.is_datetime:
                    self.bin_values = dtype.create_array(self.bin_values)

            self._promise = process(
                df_original._set(self.expression,
                                 unique_limit=row_limit,
                                 delay=True,
                                 progress=self.progressbar))
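
The masked-array step above reserves one slot among the bin values for the null key (set.null_value) and masks it so it round-trips as missing. In miniature, with made-up values:

import numpy as np

bin_values = np.array([0, 10, 20])  # slot 0 stands in for the null key
null_value = 0                      # analogous to set.null_value above
mask = np.zeros(bin_values.shape, dtype="?")
mask[null_value] = True
print(np.ma.array(bin_values, mask=mask))  # [-- 10 20]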
Example #9
    def derivative(self, var, simplify=True):
        var = _ensure_string_from_expression(var)
        return self.__class__(self, expresso.derivative(self.expression, var, simplify=simplify))
Example #10
    def __init__(self,
                 df,
                 by,
                 sort=False,
                 combine=False,
                 expand=True,
                 row_limit=None,
                 copy=True):
        '''Note that row_limit only works in combination with combine=True'''
        df_original = df
        if copy:
            df = df.copy()  # we will mutate the df (add variables); copying keeps the original dataframe unchanged
        self.df = df
        self.sort = sort
        self.expand = expand  # keep as pyarrow struct?

        if not isinstance(by, collections_abc.Iterable)\
            or isinstance(by, six.string_types):
            by = [by]

        self.by = []
        self.by_original = by
        for by_value in by:
            if not isinstance(by_value, BinnerBase):
                expression = df[_ensure_string_from_expression(by_value)]
                if df.is_category(by_value):
                    by_value = GrouperCategory(expression,
                                               sort=sort,
                                               row_limit=row_limit)
                else:
                    dtype = expression.dtype
                    if dtype == np.dtype('uint8') or dtype == np.dtype('bool'):
                        by_value = BinnerInteger(
                            expression)  # doesn't modify, always sorted
                    else:
                        by_value = Grouper(expression,
                                           sort=sort,
                                           row_limit=row_limit,
                                           df_original=df_original)
            self.by.append(by_value)

        @vaex.delayed
        def possible_combine(*binner_promises):
            # because binners can be created from other dataframes (we make a copy)
            # we let it mutate *our* dataframe
            for binner in self.by:
                binner._create_binner(self.df)

            @vaex.delayed
            def set_combined(combined):
                combined._create_binner(self.df)
                self.by = [combined]
                self.combine = True

            if combine is True and len(self.by) >= 2:
                promise = set_combined(
                    _combine(self.df, self.by, sort=sort, row_limit=row_limit))
            elif combine == 'auto' and len(self.by) >= 2:
                cells = product([grouper.N for grouper in self.by])
                dim = len(self.by)
                rows = df.length_unfiltered()  # we don't want to trigger a computation
                occupancy = rows / cells
                logger.debug('%s rows and %s grid cells => occupancy=%s', rows,
                             cells, occupancy)
                # we want each cell to have at least 10x occupancy
                if occupancy < 10:
                    logger.info(f'Combining {len(self.by)} groupers into 1')
                    promise = set_combined(
                        _combine(self.df,
                                 self.by,
                                 sort=sort,
                                 row_limit=row_limit))
                    self.combine = True
                else:
                    self.combine = False
                    promise = vaex.promise.Promise.fulfilled(None)
            else:
                self.combine = False
                promise = vaex.promise.Promise.fulfilled(None)

            @vaex.delayed
            def process(_ignore):
                self.groupby_expression = [
                    str(by.expression) for by in self.by
                ]
                self.binners = tuple(by.binner for by in self.by)
                self.shape = [by.N for by in self.by]
                self.dims = self.groupby_expression[:]

            return process(promise)

        self._promise_by = possible_combine(*[by._promise for by in self.by])
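
Examples #6, #7, #8 and #10 all lean on the @vaex.delayed promise pattern: a decorated function runs only once its promise arguments resolve, which lets vaex fuse several aggregations into a single pass over the data. A minimal sketch against the public API (vaex.example() ships with vaex):

import vaex

df = vaex.example()

@vaex.delayed
def mean_of_x(total, count):
    return total / count

result = mean_of_x(df.sum(df.x, delay=True), df.count(delay=True))
df.execute()         # both aggregations run in one pass over the data
print(result.get())  # the resolved promise holds the mean of x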