示例#1
0
    def unique(self, col_or_col_list):
        """
        Return the unique values of a column, or of every column in a list.

        For each requested column the unique values are read from the
        disk-based factorization (``<column rootdir>.values``) when
        ``self.cache_valid(col)`` is true; otherwise the column is factorized
        on the fly.

        :param col_or_col_list: a column or a list of columns
        :return: a list of unique values when a single column was given, or a
            list containing one such list per column when a list was given
        """
        col_is_list = isinstance(col_or_col_list, list)
        col_list = col_or_col_list if col_is_list else [col_or_col_list]

        output = []
        for col in col_list:
            if self.cache_valid(col):
                # retrieve values from existing disk-based factorization
                col_values_rootdir = self[col].rootdir + '.values'
                carray_values = bcolz.carray(rootdir=col_values_rootdir, mode='r')
                values = list(carray_values)
            else:
                # factorize on-the-fly
                _, values = ctable_ext.factorize(self[col])
                # materialize the result so that both branches hand back a
                # real list (on Python 3 dict.values() is only a view)
                values = list(values.values())

            output.append(values)

        # a single (non-list) column yields its values directly
        return output if col_is_list else output[0]
示例#2
0
    def factorize_groupby_cols(self, groupby_cols):
        """
        Factorize every groupby column, reusing cached carrays when valid.

        For a column whose cache is valid the ``.factor`` and ``.values``
        carrays next to the column rootdir are opened read-only; any other
        column is factorized in memory.

        :type self: ctable
        :param groupby_cols: the columns to factorize
        :return: tuple ``(factor_list, values_list)`` holding one factor
            carray and one values carray per column, in the given order
        """
        factor_list, values_list = [], []

        for col in groupby_cols:
            if not self.cache_valid(col):
                # no usable cache: factorize the column right now
                labels, uniques = ctable_ext.factorize(self[col])
                unique_values = bcolz.carray(
                    np.fromiter(uniques.values(), dtype=self[col].dtype))
            else:
                # open the previously stored factorization
                base_dir = self[col].rootdir
                factor_dir = base_dir + '.factor'
                values_dir = base_dir + '.values'
                labels = bcolz.carray(rootdir=factor_dir, mode='r')
                unique_values = bcolz.carray(rootdir=values_dir, mode='r')

            factor_list.append(labels)
            values_list.append(unique_values)

        return factor_list, values_list
示例#3
0
文件: ctable.py 项目: ankravch/bquery
    def cache_factor(self, col_list, refresh=False):
        """
        Create (or refresh) the on-disk factorization cache of columns.

        Existing todos here are: these should be hidden helper carrays
        As in: not normal columns that you would normally see as a user

        The factor (label index) carray is as long as the original carray
        (and the rest of the table therefore)
        But the (unique) values carray is not as long (as long as the number
        of unique values)

        Both carrays are first written to a temporary directory and then moved
        to ``<column rootdir>.factor`` / ``<column rootdir>.values``.

        :param col_list: a column or a list of columns to cache
        :param refresh: when true, first remove every entry of the table
            rootdir whose name contains '.factor' or '.values' and rebuild the
            cache of the given columns even if it is still valid
        :return: None
        :raises TypeError: if the ctable has no rootdir (in-memory table)
        """

        if not self.rootdir:
            raise TypeError('Only out-of-core ctables can have '
                            'factorization caching at the moment')

        if not isinstance(col_list, list):
            col_list = [col_list]

        if refresh:
            kill_list = [x for x in os.listdir(self.rootdir) if '.factor' in x or '.values' in x]
            for kill_dir in kill_list:
                rm_file_or_dir(os.path.join(self.rootdir, kill_dir))

        for col in col_list:

            # nothing to do for a column whose cache is still usable
            # (previously an ``else`` branch tried to clean a temp dir here,
            # reading a variable that is not defined for such a column)
            if not refresh and self.cache_valid(col):
                continue

            # todo: also add locking mechanism here

            # create directories
            col_rootdir = self[col].rootdir
            col_factor_rootdir = col_rootdir + '.factor'
            col_values_rootdir = col_rootdir + '.values'
            col_factor_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')
            col_values_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')

            try:
                # create factor
                carray_factor = \
                    bcolz.carray([], dtype='int64', expectedlen=self.size,
                                 rootdir=col_factor_rootdir_tmp, mode='w')
                _, values = \
                    ctable_ext.factorize(self[col], labels=carray_factor)
                carray_factor.flush()

                rm_file_or_dir(col_factor_rootdir, ignore_errors=True)
                shutil.move(col_factor_rootdir_tmp, col_factor_rootdir)

                # create values
                carray_values = \
                    bcolz.carray(np.fromiter(values.values(), dtype=self[col].dtype),
                                 rootdir=col_values_rootdir_tmp, mode='w')
                carray_values.flush()
                rm_file_or_dir(col_values_rootdir, ignore_errors=True)
                shutil.move(col_values_rootdir_tmp, col_values_rootdir)
            finally:
                # after a successful move these paths no longer exist; on a
                # failure this removes whatever temporary data was left behind
                rm_file_or_dir(col_factor_rootdir_tmp, ignore_errors=True)
                rm_file_or_dir(col_values_rootdir_tmp, ignore_errors=True)
示例#4
0
 def _calc_group_index(eval_list, factor_set, vm=None):
     factorize_list = []
     for eval_node in eval_list:
         # calculate the cartesian group index for each row
         factor_input = bcolz.eval(eval_node[0], user_dict=factor_set, vm=vm)
         # now factorize the unique groupby combinations
         sub_factor_carray, sub_values = ctable_ext.factorize(factor_input)
         factorize_list.append((sub_factor_carray, sub_values))
     return factorize_list
示例#5
0
    def cache_factor(self, col_list, refresh=False):
        """
        Write the factorization cache of the given columns to disk.

        Existing todos here are: these should be hidden helper carrays
        As in: not normal columns that you would normally see as a user

        Per column two carrays are stored next to the column rootdir: the
        factor (label index) carray, which is as long as the original carray,
        and the (unique) values carray, which is as long as the number of
        unique values.

        :param col_list: a column or a list of columns
        :param refresh: when true, rebuild the cache even if it is valid
        :return: None
        :raises TypeError: if the ctable has no rootdir
        """
        if not self.rootdir:
            raise TypeError('Only out-of-core ctables can have '
                            'factorization caching at the moment')

        columns = col_list if isinstance(col_list, list) else [col_list]

        for col in columns:
            # skip columns whose cache can be kept as it is
            if not refresh and self.cache_valid(col):
                continue

            base_dir = self[col].rootdir
            factor_dir = base_dir + '.factor'
            values_dir = base_dir + '.values'

            # the labels are written into the factor carray by factorize
            labels = bcolz.carray([], dtype='int64', expectedlen=self.size,
                                  rootdir=factor_dir, mode='w')
            _, uniques = ctable_ext.factorize(self[col], labels=labels)
            labels.flush()

            unique_values = bcolz.carray(
                np.fromiter(uniques.values(), dtype=self[col].dtype),
                rootdir=values_dir, mode='w')
            unique_values.flush()
示例#6
0
文件: ctable.py 项目: useric/bquery
    def unique(self, col_or_col_list):
        """
        Return the unique values of a column, or of every column in a list.

        When ``self.auto_cache`` is set or the column already has a valid
        cache, the values are read from the disk-based factorization (which is
        created first if it is not valid yet). Otherwise the column is
        factorized on the fly.

        :param col_or_col_list: a column or a list of columns
        :return: a list of unique values when a single column was given, or a
            list containing one such list per column when a list was given
        """
        col_is_list = isinstance(col_or_col_list, list)
        col_list = col_or_col_list if col_is_list else [col_or_col_list]

        output = []
        for col in col_list:
            if self.auto_cache or self.cache_valid(col):
                # create factorization cache
                if not self.cache_valid(col):
                    self.cache_factor([col])

                # retrieve values from existing disk-based factorization
                col_values_rootdir = self[col].rootdir + '.values'
                carray_values = bcolz.carray(rootdir=col_values_rootdir, mode='r')
                values = list(carray_values)
            else:
                # factorize on-the-fly
                _, values = ctable_ext.factorize(self[col])
                # materialize the result so that both branches hand back a
                # real list (on Python 3 dict.values() is only a view)
                values = list(values.values())

            output.append(values)

        # a single (non-list) column yields its values directly
        return output if col_is_list else output[0]
示例#7
0
文件: ctable.py 项目: useric/bquery
    def factorize_groupby_cols(self, groupby_cols):
        """
        Factorize all columns that are used in the groupby.

        Cached carrays are used when they are valid; when ``self.auto_cache``
        is set, a missing cache is created first and then used. In every other
        case the column is factorized in memory.

        :param groupby_cols: the columns to factorize
        :return: tuple ``(factor_list, values_list)`` holding one factor
            carray and one values carray per column, in the given order
        """
        factor_list, values_list = [], []

        for col in groupby_cols:
            use_disk_cache = self.auto_cache or self.cache_valid(col)

            if not use_disk_cache:
                # plain in-memory factorization
                labels, uniques = ctable_ext.factorize(self[col])
                unique_values = bcolz.carray(
                    np.fromiter(uniques.values(), dtype=self[col].dtype))
            else:
                # create factorization cache if needed
                if not self.cache_valid(col):
                    self.cache_factor([col])

                base_dir = self[col].rootdir
                factor_dir = base_dir + '.factor'
                values_dir = base_dir + '.values'
                labels = bcolz.carray(rootdir=factor_dir, mode='r')
                unique_values = bcolz.carray(rootdir=values_dir, mode='r')

            factor_list.append(labels)
            values_list.append(unique_values)

        return factor_list, values_list
示例#8
0
文件: ctable.py 项目: useric/bquery
    def make_group_index(self, groupby_cols, bool_arr):
        '''Create unique groups for groupby loop

            Args:
                groupby_cols: the columns to group by (may be empty)
                bool_arr: optional filter that is multiplied into the group
                    index; None when no filter applies

            Returns:
                carray: (carray_factor) the group index of every row
                int: (nr_groups) the number of resulting groups
                int: (skip_key) the key of the group holding the filtered-out
                    rows, or nr_groups when there is no such group
        '''
        factor_list, values_list = self.factorize_groupby_cols(groupby_cols)
        nr_factors = len(factor_list)

        # create unique groups for groupby loop
        if nr_factors == 0:
            # nothing to group over: every row belongs to the one total group
            zeros_rootdir = self.create_tmp_rootdir()
            group_factor = bcolz.zeros(len(self), dtype='int64', rootdir=zeros_rootdir, mode='w')
            group_values = ['Total']
        elif nr_factors == 1:
            # a single column maps 1:1 onto its own factorization
            group_factor, group_values = factor_list[0], values_list[0]
        elif self.group_cache_valid(col_list=groupby_cols):
            # multi column groupby with a usable group cache
            cache_base = os.path.join(self.rootdir, self.create_group_base_name(groupby_cols))
            group_factor = bcolz.carray(rootdir=cache_base + '.factor')
            group_values = bcolz.carray(rootdir=cache_base + '.values')
        else:
            # multi column groupby: build a brand new column combination
            group_factor, group_values = \
                self.create_group_column_factor(factor_list, groupby_cols, cache=self.auto_cache)

        nr_groups = len(group_values)
        skip_key = None

        if bool_arr is not None:
            # make all non relevant combinations -1
            masked_rootdir = self.create_tmp_rootdir()
            masked_factor = bcolz.eval(
                '(factor + 1) * bool - 1',
                user_dict={'factor': group_factor, 'bool': bool_arr}, rootdir=masked_rootdir, mode='w')
            # now check how many unique values there are left
            labels_rootdir = self.create_tmp_rootdir()
            labels = bcolz.carray([], dtype='int64', expectedlen=len(masked_factor),
                                  rootdir=labels_rootdir, mode='w')
            group_factor, remaining = ctable_ext.factorize(masked_factor, labels)
            # the result might contain one value too much (-1); it has to be
            # searched for because the mapping goes from key to value
            for key, value in remaining.items():
                if value == -1:
                    skip_key = key
                    break
            # the new nr of groups depends on the outcome after filtering
            nr_groups = len(remaining)

        # using nr_groups as a total length might be one off due to the
        # skip_key (skipping a row in aggregation), but that is okay normally
        if skip_key is None:
            # nothing to skip: point at the first row after the last group
            skip_key = nr_groups

        return group_factor, nr_groups, skip_key
示例#9
0
文件: ctable.py 项目: useric/bquery
    def create_group_column_factor(self, factor_list, groupby_cols, cache=False):
        """
        Create a unique, factorized column out of several individual columns

        Parameters
        ----------
        factor_list
            the factor (label) carray of every groupby column
        groupby_cols
            the names of the groupby columns, matching factor_list
        cache
            when true (and the table is disk-based), try to move the result to
            its permanent location below the table rootdir; a lock file
            decides whether this process may do so

        Returns
        -------
        tuple
            (carray_factor, carray_values): the combined group index per row
            and the carray built from the factorization's values

        """
        if not self.rootdir:
            # in-memory scenario: nothing is written to disk
            input_rootdir = None
            col_factor_rootdir_tmp = None
            col_values_rootdir_tmp = None
        else:
            # temporary
            input_rootdir = tempfile.mkdtemp(prefix='bcolz-')
            col_factor_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')
            col_values_rootdir_tmp = tempfile.mkdtemp(prefix='bcolz-')

        # create combination of groupby columns
        group_array = bcolz.zeros(0, dtype=np.int64, expectedlen=len(self), rootdir=input_rootdir, mode='w')
        factor_table = bcolz.ctable(factor_list, names=groupby_cols)
        ctable_iter = factor_table.iter(outcols=groupby_cols, out_flavor=tuple)
        ctable_ext.create_group_index(ctable_iter, len(groupby_cols), group_array)

        # now factorize the results
        carray_factor = \
            bcolz.carray([], dtype='int64', expectedlen=self.size, rootdir=col_factor_rootdir_tmp, mode='w')
        carray_factor, values = ctable_ext.factorize(group_array, labels=carray_factor)
        carray_factor.flush()

        carray_values = \
            bcolz.carray(np.fromiter(values.values(), dtype=np.int64), rootdir=col_values_rootdir_tmp, mode='w')
        carray_values.flush()

        del group_array
        if input_rootdir:
            # the intermediate input is not needed anymore, whether or not the
            # result gets cached, so never leave its temporary directory behind
            rm_file_or_dir(input_rootdir, ignore_errors=True)

        # caching needs a table rootdir to store the result under
        if cache and self.rootdir:
            # official end destination
            col_rootdir = os.path.join(self.rootdir, self.create_group_base_name(groupby_cols))
            col_factor_rootdir = col_rootdir + '.factor'
            col_values_rootdir = col_rootdir + '.values'
            lock_file = col_rootdir + '.lock'

            # only works for linux
            lock = False
            if not os.path.exists(lock_file):
                uid = str(uuid.uuid4())
                try:
                    with open(lock_file, 'a+') as fn:
                        fn.write(uid + '\n')
                    with open(lock_file, 'r') as fn:
                        lock_lines = fn.read().splitlines()
                    # we own the lock only if our id was the first one written
                    lock = lock_lines[0] == uid
                except Exception:
                    # best effort: any failure simply means "no lock", but do
                    # not swallow KeyboardInterrupt / SystemExit
                    lock = False

            if lock:
                rm_file_or_dir(col_factor_rootdir, ignore_errors=False)
                shutil.move(col_factor_rootdir_tmp, col_factor_rootdir)
                carray_factor = bcolz.carray(rootdir=col_factor_rootdir, mode='r')

                rm_file_or_dir(col_values_rootdir, ignore_errors=False)
                shutil.move(col_values_rootdir_tmp, col_values_rootdir)
                carray_values = bcolz.carray(rootdir=col_values_rootdir, mode='r')
            else:
                # another process has a lock, we will work with our current files and clean up later
                # NOTE(review): the current files live in the temporary
                # directories, so those are the ones registered for cleanup
                # (registering the official destination would remove the
                # cache owned by the other process) - confirm against the
                # code that consumes _dir_clean_list
                self._dir_clean_list.append(col_factor_rootdir_tmp)
                self._dir_clean_list.append(col_values_rootdir_tmp)

        return carray_factor, carray_values
示例#10
0
    def make_group_index(self, factor_list, values_list, groupby_cols,
                         array_length, bool_arr):
        '''Create unique groups for groupby loop

            Args:
                factor_list: the factor (label) carray of every groupby column
                values_list: the unique values of every groupby column
                groupby_cols: the names of the groupby columns
                array_length: number of rows, used when there are no
                    groupby columns
                bool_arr: optional filter that is multiplied into the group
                    index; None when no filter applies

            Returns:
                carray: (factor_carray) the group index of every row
                int: (nr_groups) the number of resulting groups
                int: (skip_key) the key of the group holding the filtered-out
                    rows, or nr_groups when there is no such group
        '''

        def _create_eval_str(groupby_cols, values_list, check_overflow=True):
            # Sort evaluated columns by their number of values
            ordered = sorted(zip(groupby_cols, values_list),
                             key=lambda pair: len(pair[1]))

            def _as_node(terms, cols):
                # an expression without any term stays an empty string
                expression = '-2147483648 + ' + ' + '.join(terms) if terms else ''
                return expression, cols

            eval_list = []
            terms, cols_in_expr, multiplier = [], [], 1

            for col, values in ordered:
                nr_values = len(values)

                # close the running expression before it would overflow
                if check_overflow and multiplier * nr_values > 4294967295:
                    eval_list.append(_as_node(terms, cols_in_expr))
                    terms, cols_in_expr, multiplier = [], [], 1

                terms.append(str(multiplier) + '*' + col)
                cols_in_expr.append(col)
                multiplier *= nr_values

            eval_list.append(_as_node(terms, cols_in_expr))
            return eval_list

        def _calc_group_index(eval_list, factor_set, vm=None):
            results = []
            for node in eval_list:
                # calculate the cartesian group index for each row
                combined_index = bcolz.eval(node[0], user_dict=factor_set, vm=vm)
                # now factorize the unique groupby combinations
                labels, uniques = ctable_ext.factorize(combined_index)
                results.append((labels, uniques))
            return results

        def _is_reducible(eval_list):
            # an expression that combines several columns can be factorized
            # into a single new column
            return any(len(node[1]) > 1 for node in eval_list)

        def calc_index(groupby_cols, values_list, factor_set, vm=None):
            # Initialize eval list
            eval_list = _create_eval_str(groupby_cols, values_list)

            # Reduce expression as possible
            while _is_reducible(eval_list):
                # let go of the previous round before computing the next one
                groupby_cols = values_list = None
                reduced = _calc_group_index(eval_list, factor_set)
                factor_set, groupby_cols, values_list = {}, [], []
                for position, (sub_factor, sub_values) in enumerate(reduced):
                    name = 'g' + str(position)
                    factor_set[name] = sub_factor
                    groupby_cols.append(name)
                    values_list.append(sub_values)
                eval_list = _create_eval_str(groupby_cols, values_list)

            # If we have multiple expressions that cannot be reduced anymore,
            # rewrite as a single one and use Python vm
            if len(eval_list) > 1:
                eval_list = _create_eval_str(groupby_cols, values_list, check_overflow=False)
                vm = 'python'

            groupby_cols = values_list = None

            # Now we have a single expression, factorize it
            return _calc_group_index(eval_list, factor_set, vm=vm)[0]

        # create unique groups for groupby loop
        nr_factors = len(factor_list)
        if nr_factors == 0:
            # nothing to group over: aggregate everything to 1 total
            # (index 0/zero)
            factor_carray = bcolz.zeros(array_length, dtype='int64')
            values = ['Total']
        elif nr_factors == 1:
            # a single column maps 1:1 onto its own factorization
            factor_carray, values = factor_list[0], values_list[0]
        else:
            # multi column groupby
            # nb: this might also be cached in the future
            # combine the factorized columns through an expression that
            # calculates the place on a cartesian join index
            factor_set = dict(zip(groupby_cols, factor_list))
            factor_carray, values = calc_index(groupby_cols, values_list, factor_set)

        skip_key = None

        if bool_arr is not None:
            # make all non relevant combinations -1
            masked_factor = bcolz.eval(
                '(factor + 1) * bool - 1',
                user_dict={'factor': factor_carray, 'bool': bool_arr})
            # now check how many unique values there are left
            factor_carray, values = ctable_ext.factorize(masked_factor)
            # the result might contain one value too much (-1); it has to be
            # searched for because the mapping goes from key to value
            for key, value in values.items():
                if value == -1:
                    skip_key = key
                    break

        # using nr_groups as a total length might be one off due to the
        # skip_key (skipping a row in aggregation), but that is okay normally
        nr_groups = len(values)
        if skip_key is None:
            # nothing to skip: point at the first row after the last group
            skip_key = nr_groups

        return factor_carray, nr_groups, skip_key