Example #1
class Regression(CompoundExpression):
    """abstract base class for all regressions"""
    @staticmethod
    def add_filter(expr, filter):
        if filter is not None:
            missing_value = missing_values[getdtype(expr, None)]
            return Where(filter, expr, missing_value)
        else:
            return expr

    dtype = always(float)
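
The add_filter pattern in isolation, as a hedged NumPy-only sketch (np.where stands in for the Where expression and nan for the missing value of a float column):

import numpy as np

values = np.array([1.5, 2.0, 3.5])
filter_ = np.array([True, False, True])
# rows excluded by the filter get the missing value instead of the result
result = np.where(filter_, values, np.nan)
# -> array([1.5, nan, 3.5])
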
Example #2
class Trunc(FunctionExpr):
    # TODO: check that the dtype is correct at compilation time (__init__ is too
    # early since we do not have the context yet)
    # assert getdtype(self.args[0], context) == float
    def compute(self, context, expr):
        if isinstance(expr, np.ndarray):
            return expr.astype(int)
        else:
            return int(expr)

    dtype = always(int)
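
Note that both branches truncate toward zero rather than round; a quick NumPy check:

import numpy as np

assert int(1.7) == 1 and int(-1.7) == -1
assert (np.array([1.7, -1.7]).astype(int) == [1, -1]).all()
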
Example #3
class Std(WeightedFilteredAggregateFunction):
    dtype = always(float)

    def compute(self, context, expr, filter=None, skip_na=True, weights=None):
        values, weights = self.get_filtered_values_weights(
            expr, filter_values=filter, weights=weights, skip_na=skip_na)
        if weights is None:
            return np.std(values)
        else:
            average = np.average(values, weights=weights)
            variance = np.average((values - average)**2, weights=weights)
            return np.sqrt(variance)
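
The weighted branch is the square root of the weighted (population) variance; with unit weights it reduces to np.std. A small numeric check (NumPy only):

import numpy as np

values = np.array([1.0, 2.0, 4.0])
weights = np.ones(len(values))
average = np.average(values, weights=weights)
variance = np.average((values - average) ** 2, weights=weights)
assert np.isclose(np.sqrt(variance), np.std(values))
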
Example #4
class LogitRegr(Regression):
    funcname = 'logit_regr'

    def build_expr(self, context, expr, filter=None, align=None):
        score_expr = LogitScore(expr)
        if align is not None:
            # we do not need add_filter because Alignment already handles it
            return Alignment(score_expr, align, filter=filter)
        else:
            return self.add_filter(ComparisonOp('>', score_expr, 0.5), filter)

    dtype = always(bool)
Example #5
class Erf(FunctionExpr):
    def compute(self, context, expr):
        if scipy is None:
            raise ImportError(
                "Failed to import scipy, which is required for erf(). Please make sure scipy is installed and working.",
            )
        if isinstance(expr, np.ndarray):
            return special.erf(expr)
        else:
            # scipy.special.erf also accepts plain Python scalars
            return float(special.erf(expr))

    dtype = always(float)
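
scipy.special.erf is element-wise on arrays and also accepts scalars; for plain Python floats the stdlib math.erf gives the same value (sketch assumes SciPy is installed):

import math
import numpy as np
from scipy import special

assert np.isclose(special.erf(0.5), math.erf(0.5))
assert special.erf(np.array([0.0, 0.5])).shape == (2,)
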
Example #6
class Gini(WeightedFilteredAggregateFunction):
    def compute(self, context, expr, filter=None, skip_na=True, weights=None):
        values, weights = self.get_filtered_values_weights(
            expr, filter_values=filter, weights=weights, skip_na=skip_na)
        if weights is not None:
            # ported from a GPL algorithm written in R found at:
            # https://rdrr.io/cran/acid/src/R/weighted.gini.R
            sorted_indices = np.argsort(values)
            sorted_values = values[sorted_indices]
            sorted_weights = weights[sorted_indices]
            # force float to avoid overflows with integer inputs
            cumw = np.cumsum(sorted_weights, dtype=float)
            cumvalw = np.cumsum(sorted_values * sorted_weights, dtype=float)
            sumw = cumw[-1]
            sumvalw = cumvalw[-1]
            if sumvalw == 0:
                print(
                    "WARNING: gini(%s, filter=%s): value * weight is all zeros (or nan) for filter"
                    % (self.args[0], self.args[1]))
            # FWIW, with all weights equal to 1 this formula simplifies to the "usual" unweighted gini formula,
            # as seen below. Using c = cumvalw for concision:
            # cumw = np.arange(1, n + 1)
            # gini = sum(c[1] * 1 - c[0] * 2 + c[2] * 2 - c[1] * 3 + ... + c[-1] * (n - 1) - c[-2] * n) / (c[-1] * n)
            # gini = sum(- 2 * c[0] - 2 * c[1] - 2 * c[2] - ... - 2 * c[-2] + c[-1] * (n - 1)) / (c[-1] * n)
            # gini = (- 2 * sum(c) + (n + 1) * c[-1]) / (c[-1] * n)
            # gini = (n + 1 - 2 * sum(c) / c[-1]) / n
            return np.sum(cumvalw[1:] * cumw[:-1] -
                          cumvalw[:-1] * cumw[1:]) / (sumvalw * sumw)
        else:
            sorted_values = np.sort(values)
            n = len(values)
            # force float to avoid overflows with integer input expressions
            cumval = np.cumsum(sorted_values, dtype=float)
            sumval = cumval[-1]
            if sumval == 0:
                print(
                    "WARNING: gini(%s, filter=%s): expression is all zeros (or nan) for filter"
                    % (self.args[0], self.args[1]))
            # From Wikipedia (https://en.wikipedia.org/wiki/Gini_coefficient)
            # G = 1/n * (n + 1 - 2 * (sum((n + 1 - i) * a[i]) / sum(a[i])))
            #                        i=1..n                    i=1..n
            # but since in Python we are indexing from 0, a[i] should be written a[i - 1].
            # The central part is thus:
            #  = sum((n + 1 - i) * a[i - 1])
            #   i=1..n
            #  = sum((n - i) * a[i])
            #   i=0..n-1
            #  = n * a[0] + (n - 1) * a[1] + (n - 2) * a[2] + ... + 1 * a[n - 1]
            #  = sum(cumsum(a))
            return (n + 1 - 2 * np.sum(cumval) / sumval) / n

    dtype = always(float)
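
A quick numeric check (NumPy only) that the weighted formula with unit weights matches the unweighted shortcut used in the else branch:

import numpy as np

values = np.array([1.0, 2.0, 4.0, 8.0])
n = len(values)
cumval = np.cumsum(np.sort(values), dtype=float)

# unweighted shortcut
gini_plain = (n + 1 - 2 * np.sum(cumval) / cumval[-1]) / n

# weighted formula with all weights equal to 1
cumw = np.arange(1, n + 1, dtype=float)
cumvalw = cumval
gini_weighted = np.sum(cumvalw[1:] * cumw[:-1] -
                       cumvalw[:-1] * cumw[1:]) / (cumvalw[-1] * cumw[-1])
assert np.isclose(gini_plain, gini_weighted)
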
Example #7
class Avg(Sum):
    def count(self, source_rows, expr_value, weights_value):
        sums = super(Avg, self).count(source_rows, expr_value, weights_value)
        if weights_value is None:
            counts = np.bincount(source_rows)
        else:
            # sum weights
            counts = super(Avg, self).count(source_rows,
                                            expr_value=weights_value,
                                            weights_value=None)
        return sums / counts

    dtype = always(float)
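
The per-group average pattern with np.bincount in isolation (a sketch; in the class above, the superclass count is assumed to return per-row sums of expr_value):

import numpy as np

source_rows = np.array([0, 0, 1, 2, 2, 2])   # target row per source individual
expr_value = np.array([1.0, 3.0, 5.0, 2.0, 2.0, 2.0])

sums = np.bincount(source_rows, weights=expr_value)   # [4., 5., 6.]
counts = np.bincount(source_rows)                     # [2, 1, 3]
assert (sums / counts == [2.0, 5.0, 2.0]).all()
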
Example #8
class Average(FunctionExpr):
    funcname = 'avg'
    no_eval = ('expr', 'filter', 'weights')

    def compute(self, context, expr, filter=None, skip_na=True, weights=None):
        # FIXME: either take "contextual filter" into account here (by using
        # self._getfilter), or don't do it in sum & gini

        if filter is not None and weights is not None:
            filter_weight_expr = BinaryOp('*', filter, weights)
        elif filter is not None:
            filter_weight_expr = filter
        elif weights is not None:
            filter_weight_expr = weights
        else:
            filter_weight_expr = True

        filter_weight = expr_eval(filter_weight_expr, context)
        if filter_weight is not True:
            filter_weight = np.asarray(filter_weight)

        # we do not build an expression for values * filter_weight because we will need filter_weight "alone" to
        # compute numrows
        values = expr_eval(expr, context)
        if np.isscalar(values):
            values = np.array([values])
        else:
            values = np.asarray(values)

        sum_func = np.sum
        if skip_na:
            filter_weight = filter_weight * ispresent(values)
            # even though we already set filter_weight to 0 for na, we still need to ignore nans because nan * 0 == nan
            sum_func = np.nansum

        # we cannot simply test "filter_weight is True" because it can also be 1 (due to the "* ispresent(values)" above)
        if np.isscalar(filter_weight) and filter_weight:
            values_filter_weight = values
            numrows = len(values)
        else:
            values_filter_weight = values * filter_weight
            numrows = np.sum(filter_weight)

        if not numrows:
            return float('nan')

        return sum_func(values_filter_weight) / numrows

    dtype = always(float)
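
The skip_na trick in isolation: zeroing the weight of missing rows is not enough because nan * 0 is still nan, hence np.nansum. A NumPy-only sketch:

import numpy as np

values = np.array([1.0, np.nan, 3.0])
weights = np.array([1.0, 1.0, 2.0])

present = ~np.isnan(values)          # stand-in for ispresent()
filter_weight = weights * present    # the nan row gets weight 0
numrows = np.sum(filter_weight)      # 3.0
avg = np.nansum(values * filter_weight) / numrows
assert np.isclose(avg, (1.0 + 3.0 * 2.0) / 3.0)
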
Example #9
class Median(WeightedFilteredAggregateFunction):
    dtype = always(float)

    def compute(self,
                context,
                expr,
                filter=None,
                skip_na=True,
                weights=None,
                weights_type='sampling'):
        values, weights = self.get_filtered_values_weights(
            expr, filter_values=filter, weights=weights, skip_na=skip_na)
        if weights is None:
            return np.median(values)
        else:
            return wpercentile(values, weights, 50, weights_type=weights_type)
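
wpercentile is a helper from this codebase (not shown here). For unit weights the result matches np.median; a rough NumPy-only sketch of a weighted median under the sampling-weights interpretation (the real helper may interpolate differently):

import numpy as np

def weighted_median(values, weights):
    # value at which the cumulative weight reaches half of the total weight
    order = np.argsort(values)
    values, weights = values[order], weights[order]
    cumw = np.cumsum(weights)
    return values[np.searchsorted(cumw, 0.5 * cumw[-1])]

v = np.array([1.0, 2.0, 3.0, 10.0])
w = np.array([1.0, 1.0, 5.0, 1.0])
assert weighted_median(v, w) == 3.0
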
Example #10
class TimeAverage(TimeFunction):
    funcname = 'tavg'

    def compute(self, context, expr):
        entity = context.entity

        baseperiod = entity.base_period
        period = context.period - 1

        res_size = len(entity.array)

        num_values = np.zeros(res_size, dtype=int)
        # current period
        last_period_wh_value = np.full(res_size, context.period, dtype=int)

        sum_values = np.zeros(res_size, dtype=float)
        id_to_rownum = context.id_to_rownum
        while period >= baseperiod:
            ids, values = self.value_for_period(expr,
                                                period,
                                                context,
                                                fill=None)

            # filter out lines which are present because there was a value for
            # that individual at that period but not for that column
            acceptable_rows = hasvalue(values)
            acceptable_ids = ids[acceptable_rows]
            if len(acceptable_ids):
                acceptable_values = values[acceptable_rows]

                value_rows = id_to_rownum[acceptable_ids]

                has_value = np.zeros(res_size, dtype=bool)
                safe_put(has_value, value_rows, True)

                period_value = np.zeros(res_size, dtype=float)
                safe_put(period_value, value_rows, acceptable_values)

                # a value observed at `period` counts once for each period up
                # to (but excluding) the next period with a value, making this
                # a time-weighted average
                num_values += has_value * (last_period_wh_value - period)
                sum_values += period_value
                last_period_wh_value[has_value] = period
            period -= 1
        return sum_values / num_values

    dtype = always(float)
Example #11
def make_np_class(baseclass, signature, dtypefunc):
    name, args = split_signature(signature)
    if isinstance(dtypefunc, type):
        dtypefunc = always(dtypefunc)
    evalfunc = getattr(np.random, name)

    # we need to explicitly set funcname, because the usual mechanism of
    # getting it from the class name during class creation (in the metaclass)
    # does not work because the class name is not set yet.
    class FuncClass(baseclass):
        np_func = evalfunc
        funcname = name
        argspec = argspec(args, **baseclass.kwonlyargs)
        if dtypefunc is not None:
            dtype = dtypefunc
    FuncClass.__name__ = name.capitalize()
    FuncClass.__doc__ = clean_docstring(evalfunc.__doc__)
    return FuncClass
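
The factory pattern in isolation: a class statement inside a function closes over local variables, and __name__ is patched afterwards because the generated name is not known at class-creation time. A simplified, self-contained sketch:

import numpy as np

def make_wrapper(name):
    func = getattr(np.random, name)

    class Wrapper(object):
        np_func = staticmethod(func)
        funcname = name

    Wrapper.__name__ = name.capitalize()
    Wrapper.__doc__ = func.__doc__
    return Wrapper

Uniform = make_wrapper('uniform')
assert Uniform.__name__ == 'Uniform' and Uniform.funcname == 'uniform'
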
Example #12
class LogitScore(CompoundExpression):
    funcname = 'logit_score'

    def build_expr(self, context, expr):
        if isinstance(expr, basestring):
            # assume it is a filename
            expr = ExtExpr(expr)

        u = Uniform()
        # expr in (0, 0.0, False, '')
        if not isinstance(expr, Expr) and not expr:
            expr = u
        else:
            epsilon = Logit(u)
            # expr = logistic(expr - epsilon)
            expr = Logistic(BinaryOp('-', expr, epsilon))
        return expr

    dtype = always(float)
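
The logistic(expr - logit(u)) construction yields a Monte Carlo draw: the score exceeds 0.5 exactly when u < logistic(expr), so thresholding at 0.5 (as logit_regr does in Example #4) fires with the predicted probability. A NumPy-only check:

import numpy as np

def logit(p):
    return np.log(p / (1 - p))

def logistic(x):
    return 1 / (1 + np.exp(-x))

np.random.seed(0)
x = 0.3                              # the linear predictor
u = np.random.uniform(size=100000)
score = logistic(x - logit(u))
# score > 0.5  <=>  x - logit(u) > 0  <=>  u < logistic(x)
assert abs(np.mean(score > 0.5) - logistic(x)) < 0.01
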
Example #13
class Count(FunctionExpr):
    def compute(self, context, filter=None, weights=None):
        if filter is not None:
            filter = np.asarray(filter)

            # TODO: check this at "compile" time (in __init__), though for that we need to know the type of all
            # temporary variables first
            if not np.issubdtype(filter.dtype, np.bool_):
                raise ValueError("count filter must be a boolean expression")
        if weights is not None:
            weights = np.asarray(weights)

        if filter is None and weights is None:
            return context_length(context)
        elif weights is None:
            return np.sum(filter)
        elif filter is None:
            return np.sum(weights)
        else:
            return np.sum(weights * filter)

    # FIXME: this is wrong when weights are floats
    dtype = always(int)
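
Count semantics in plain NumPy: a boolean filter sums to the number of True rows, weights sum to a (possibly fractional) population size, and combined they sum the weights of the filtered rows, which is why the int dtype above is wrong for float weights:

import numpy as np

filter_ = np.array([True, False, True])
weights = np.array([0.5, 2.0, 1.5])

assert np.sum(filter_) == 2
assert np.sum(weights) == 4.0
assert np.sum(weights * filter_) == 2.0
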
Example #14
class Matching(FilteredExpression):
    """
    Base class for matching functions
    """
    dtype = always(int)
Example #15
class AlignmentAbsoluteValues(FilteredExpression):
    funcname = 'align_abs'
    no_eval = ('filter', 'secondary_axis', 'expressions', 'method')

    def __init__(self, *args, **kwargs):
        super(AlignmentAbsoluteValues, self).__init__(*args, **kwargs)

        need = self.args[1]
        if isinstance(need, basestring):
            fpath = os.path.join(config.input_directory, need)
            need = load_ndarray(fpath, float)
            # XXX: store args in a list so that we can modify it?
            # self.args[1] = load_ndarray(fpath, float)
            # XXX: but we should be able to do better than a list, eg.
            # self.args.need = load_ndarray(fpath, float)
            self.args = (self.args[0], need) + self.args[2:]
        self.past_error = None

    def collect_variables(self):
        # args[9] is the "link" argument
        # if self.args.link is None:
        if self.args[9] is None:
            return FilteredExpression.collect_variables(self)
        else:
            # in this case, it's tricky
            return set()

    def _eval_need(self,
                   context,
                   need,
                   expressions,
                   possible_values,
                   expressions_context=None):
        assert isinstance(need, (np.ndarray, la.LArray))
        if expressions_context is None:
            expressions_context = context
        # When given a 0d array, we convert it to 1d. This can happen e.g. for
        # >>> b = True; x = ne.evaluate('where(b, 0.1, 0.2)')
        # >>> isinstance(x, np.ndarray)
        # True
        # >>> x.shape
        # ()
        if not need.shape:
            need = np.array([need])

        if isinstance(need, la.LArray):
            if not expressions:
                expressions = [
                    Variable(expressions_context.entity, name)
                    for name in need.axes.names
                ]
            if not possible_values:
                possible_values = need.axes.labels

        assert isinstance(need, (np.ndarray, la.LArray))

        if len(expressions) != len(possible_values):
            raise Exception("align() expressions and possible_values "
                            "have different length: %d vs %d" %
                            (len(expressions), len(possible_values)))

        if 'period' in [str(e) for e in expressions]:
            period = context.period
            expressions, possible_values, need = \
                kill_axis('period', period, expressions, possible_values, need)

        # kill any axis where the value is constant for all individuals
        # satisfying the filter
        # tokill = [(expr, column[0])
        #           for expr, column in zip(expressions, columns)
        #           if isconstant(column, filter_value)]
        # for expr, value in tokill:
        #     expressions, possible_values, need = \
        #         kill_axis(str(expr), value, expressions, possible_values,
        #                   need)

        return need, expressions, possible_values

    def _handle_frac_need(self, need, method):
        # handle the "fractional people problem"
        if not np.issubdtype(need.dtype, np.integer):
            if method == 'uniform':
                int_need = need.astype(int)
                if config.debug and config.log_level == "processes":
                    print()
                    print("random sequence position before:",
                          np.random.get_state()[2])
                u = np.random.uniform(size=need.shape)
                if config.debug and config.log_level == "processes":
                    print("random sequence position after:",
                          np.random.get_state()[2])
                need = int_need + (u < need - int_need)
            elif method == 'cutoff':
                int_need = need.astype(int)
                frac_need = need - int_need
                need = int_need

                # the sum of fractional objects is the number of extra objects
                # we want aligned
                extra_wanted = int(round(np.sum(frac_need)))
                if extra_wanted:
                    # search the cutoff point yielding:
                    # sum(frac_need >= cutoff) == extra_wanted
                    sorted_frac_need = frac_need.flatten()
                    sorted_frac_need.sort()
                    cutoff = sorted_frac_need[-extra_wanted]
                    extra = frac_need >= cutoff
                    if np.sum(extra) > extra_wanted:
                        # This case can only happen when several bins have the
                        # same frac_need. In this case we could try to be even
                        # closer to our target by randomly selecting X out of
                        # the Y bins which have a frac_need equal to the
                        # cutoff (in addition to all those which are strictly
                        # greater than it).
                        assert np.sum(frac_need == cutoff) > 1
                    need += extra
            elif method == 'round':
                # always use 0.5 as a cutoff point
                need = (need + 0.5).astype(int)

        assert np.issubdtype(need.dtype, np.integer)
        return need

    def _add_past_error(self, context, need, method='default'):
        if method == 'default':
            method = context.get('__on_align_error__')

        if method == 'carry':
            if self.past_error is None:
                # TODO: we should store this somewhere in the context instead
                # XXX: I wonder if past_error shouldn't be float (and whether
                # _add_past_error should be called *before* _handle_frac_need
                # instead of after like it is now).
                self.past_error = np.zeros(need.shape, dtype=int)

            print("adding %d individuals from last period error" %
                  np.sum(self.past_error))
            need += self.past_error

        return need

    def _display_unaligned(self, expressions, ids, columns, unaligned):
        print("Warning: %d individual(s) do not fit in any alignment "
              "category" % np.sum(unaligned))
        header = ['id'] + [str(e) for e in expressions]
        columns = [ids] + columns
        num_rows = len(ids)
        print(
            PrettyTable([header] +
                        [[col[row] for col in columns]
                         for row in range(num_rows) if unaligned[row]]))

    def compute(self,
                context,
                score,
                need=None,
                filter=None,
                take=None,
                leave=None,
                expressions=None,
                possible_values=None,
                errors='default',
                frac_need='uniform',
                link=None,
                secondary_axis=None,
                method='bysorting'):

        if method not in ("bysorting", "sidewalk"):
            raise Exception(
                "Method for alignment should be either 'bysorting' "
                "or 'sidewalk'")
        if method == 'bysorting' and need is None:
            raise Exception("need argument is required when using the "
                            "'bysorting' method (which is the default)")

        if method == "sidewalk":
            # need is computed from score; we could consider computing it
            # without leave_filter and take_filter
            if need is None:
                need = sum(score)

        # XXX: move this to _eval_need?
        # need is a single scalar
        if np.isscalar(need):
            need = [need]

        # need is a non-ndarray sequence
        if isinstance(need, (tuple, list)):
            need = np.array(need)
        assert isinstance(need, (np.ndarray, la.LArray))

        if expressions is None:
            expressions = []

        if possible_values is None:
            possible_values = []
        else:
            possible_values = [np.array(pv) for pv in possible_values]

        if frac_need not in ('uniform', 'cutoff', 'round'):
            cls = ValueError if isinstance(frac_need,
                                           basestring) else TypeError
            raise cls("frac_need should be one of: 'uniform', 'cutoff' or "
                      "'round'")

        if secondary_axis is not None and link is None:
            raise Exception("the 'secondary_axis' argument is only valid in "
                            "combination with the 'link' argument")
        if not isinstance(secondary_axis, (type(None), int, Variable)):
            raise Exception(
                "'secondary_axis' should be either an integer or "
                "an axis name (but got '%s' which is of type '%s')" %
                (secondary_axis, type(secondary_axis)))

        func = self.align_no_link if link is None else self.align_link
        return func(context, score, need, filter, take, leave, expressions,
                    possible_values, errors, frac_need, link, secondary_axis,
                    method)

    def align_no_link(self, context, score, need, filter, take, leave,
                      expressions, possible_values, errors, frac_need, link,
                      secondary_axis, method):

        ctx_length = context_length(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values)

        filter_value = expr_eval(self._getfilter(context, filter), context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        # retrieve the columns we need to work with
        if expressions:
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum of group sizes is not necessarily equal to the number of
        # individuals to align, because some individuals might not fit in any
        # group (e.g. if some alignment data is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        # noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        # FIXME: either handle past_error in the no-link case (currently, the
        #        past error is added... but never computed, so it is always 0!)
        #        or raise an error when errors='carry' is used without link.
        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    score, take, leave, method)

    def align_link(self, context, score, need, filter, take, leave,
                   expressions, possible_values, errors, frac_need, link,
                   secondary_axis, method):
        target_context = link._target_context(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values,
                            target_context)

        # handle secondary axis
        if isinstance(secondary_axis, Expr):
            axis_name = str(secondary_axis)
            try:
                secondary_axis = need.axes.names.index(axis_name)
            except ValueError:
                raise ValueError("invalid value for secondary_axis: there is "
                                 "no axis named '%s' in the need array" %
                                 axis_name)
        elif isinstance(secondary_axis, int):
            if secondary_axis >= need.ndim:
                raise Exception("%d is an invalid value for secondary_axis: "
                                "it should be smaller than the number of "
                                "dimension of the need array (%d)" %
                                (secondary_axis, need.ndim))
        else:
            assert secondary_axis is None

        # evaluate columns
        target_columns = [expr_eval(e, target_context) for e in expressions]
        # this is a one2many, so the link column is on the target side
        link_column = target_context[link._link_field]

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            reverse_link = Many2One("reverse", link._link_field,
                                    context.entity.name)
            target_filter = LinkGet(reverse_link, filter_expr, False)
            target_filter_value = expr_eval(target_filter, target_context)

            # It is often not a good idea to pre-filter columns like this
            # because we lose information about "indices", but in this case,
            # it is fine, because we do not need that information afterwards.
            filtered_columns = [
                col[target_filter_value]
                if isinstance(col, np.ndarray) and col.shape else [col]
                for col in target_columns
            ]

            link_column = link_column[target_filter_value]
        else:
            filtered_columns = target_columns
            target_filter_value = None

        # compute labels for filtered columns
        # -----------------------------------
        # We can't use _group_labels_light because group_labels assigns labels
        # on a first come, first served basis, not in the order they appear
        # in pvalues
        fcols_labels = []
        filtered_length = len(filtered_columns[0])
        unaligned = np.zeros(filtered_length, dtype=bool)
        # XXX: probably needs to use "possible_values" instead
        for fcol, pvalues in zip(filtered_columns, need.axes.labels):
            pvalues_index = dict((v, i) for i, v in enumerate(pvalues))
            fcol_labels = np.empty(filtered_length, dtype=np.int32)
            for i in range(filtered_length):
                value_idx = pvalues_index.get(fcol[i], -1)
                if value_idx == -1:
                    unaligned[i] = True
                fcol_labels[i] = value_idx
            fcols_labels.append(fcol_labels)

        num_unaligned = np.sum(unaligned)
        if num_unaligned:
            # further filter label columns and link_column
            validlabels = ~unaligned
            fcols_labels = [labels[validlabels] for labels in fcols_labels]
            link_column = link_column[validlabels]

            # display who are the evil ones
            ids = target_context['id']
            if target_filter_value is not None:
                filtered_ids = ids[target_filter_value]
            else:
                filtered_ids = ids
            self._display_unaligned(expressions, filtered_ids,
                                    filtered_columns, unaligned)
        else:
            del unaligned

        id_to_rownum = context.id_to_rownum
        missing_int = missing_values[int]
        source_ids = link_column

        if len(id_to_rownum):
            source_rows = id_to_rownum[source_ids]
            # filter out missing values: those where the value of the link
            # points to nowhere (-1)
            source_rows[source_ids == missing_int] = missing_int
        else:
            assert np.all(source_ids == missing_int)
            source_rows = []

        # filtered_columns are not filtered further on invalid labels
        # (num_unaligned) but this is not a problem since those will be
        # ignored by GroupBy anyway.
        # TODO: the result of this is ugly because a groupby on *values* returns an LArray with those
        # values (ndarrays) as axes *names*. Ugh.
        groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

        # FIXME: target_context is not correct, as it is not filtered while
        # filtered_columns are. Since we do not use the context "columns" it
        # mostly works but I had to disable an assertion in utils.expand
        # because the length of the context is not correct.
        num_candidates = expr_eval(groupby_expr, target_context)

        # fetch the list of linked individuals for each local individual.
        # e.g. the list of person ids for each household
        hh = np.empty(context_length(context), dtype=object)
        # we can't use .fill([]) because it reuses the same list for all
        # objects
        for i in range(len(hh)):
            hh[i] = []

        # Even though this is highly sub-optimal, the time taken to create
        # those lists of ids is very small compared to the total time taken
        # for align_other (0.2s vs 4.26s), so I shouldn't care too much about
        # it for now.

        # target_row (row of person) is an index valid for *filtered/label*
        # columns !
        for target_row, source_row in enumerate(source_rows):
            if source_row == -1:
                continue
            hh[source_row].append(target_row)

        class FakeContainer(object):
            def __init__(self, length):
                self.length = length

            def __len__(self):
                return self.length

        groups = [FakeContainer(g) for g in num_candidates]
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        aligned, error = \
            align_link_nd(score, need, num_candidates, hh, fcols_labels,
                          secondary_axis)
        self.past_error = error
        return aligned

    def _get_need_correction(self, groups, possible_values):
        return 1

    dtype = always(bool)
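
The 'cutoff' branch of _handle_frac_need in isolation: fractional needs are truncated, then the bins with the largest fractional parts each get one extra unit until the wanted number of extra individuals is reached. A NumPy-only sketch:

import numpy as np

need = np.array([1.2, 2.7, 0.6])
int_need = need.astype(int)                    # [1, 2, 0]
frac_need = need - int_need                    # [~0.2, ~0.7, 0.6]
extra_wanted = int(round(np.sum(frac_need)))   # 2 extra individuals

sorted_frac = np.sort(frac_need.flatten())
cutoff = sorted_frac[-extra_wanted]            # 0.6
result = int_need + (frac_need >= cutoff)
assert result.tolist() == [1, 3, 1]
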
Example #16
class Exp(NumexprFunction):
    argspec = argspec('expr')
    dtype = always(float)
Example #17
class New(FilteredExpression):
    no_eval = ('filter', 'kwargs')

    def _initial_values(self, array, to_give_birth, num_birth, default_values):
        return get_default_array(num_birth, array.dtype, default_values)

    @classmethod
    def _collect_kwargs_variables(cls, kwargs):
        used_variables = set()
        # kwargs are stored as a list of (k, v) pairs
        for k, v in kwargs.items():
            used_variables.update(collect_variables(v))
        return used_variables

    def compute(self,
                context,
                entity_name=None,
                filter=None,
                number=None,
                **kwargs):
        if filter is not None and number is not None:
            # Having neither is allowed, though, as there can be a contextual
            # filter. Also, there is no reason to prevent the whole
            # population from giving birth, even though the usefulness of such
            # usage seems dubious.
            raise ValueError("new() 'filter' and 'number' arguments are "
                             "mutually exclusive")
        source_entity = context.entity
        if entity_name is None:
            target_entity = source_entity
        else:
            target_entity = context.entities[entity_name]

        # target context is the context where the new individuals will be
        # created
        if target_entity is source_entity:
            target_context = context
        else:
            # we do need to copy the data (.extra) because we will insert into
            # the entity.array anyway => fresh_data=True
            target_context = context.clone(fresh_data=True,
                                           entity_name=target_entity.name)

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            to_give_birth = expr_eval(filter_expr, context)
            num_birth = to_give_birth.sum()
        elif number is not None:
            to_give_birth = None
            num_birth = number
        else:
            to_give_birth = np.ones(len(context), dtype=bool)
            num_birth = len(context)

        array = target_entity.array
        default_values = target_entity.fields.default_values

        id_to_rownum = target_entity.id_to_rownum
        num_individuals = len(id_to_rownum)

        children = self._initial_values(array, to_give_birth, num_birth,
                                        default_values)
        if num_birth:
            children['id'] = np.arange(num_individuals,
                                       num_individuals + num_birth)
            children['period'] = context.period

            used_variables = [
                v.name for v in self._collect_kwargs_variables(kwargs)
            ]
            if to_give_birth is None:
                assert not used_variables
                child_context = context.empty(num_birth)
            else:
                child_context = context.subset(to_give_birth, used_variables,
                                               filter_expr)
            for k, v in kwargs.items():
                if k not in array.dtype.names:
                    print("WARNING: {} is unknown, ignoring it!".format(k))
                    continue
                children[k] = expr_eval(v, child_context)

        add_individuals(target_context, children)

        expr_cache.invalidate(context.period, context.entity_name)

        # result is the ids of the new individuals corresponding to the source
        # entity
        if to_give_birth is not None:
            result = np.full(context_length(context), -1, dtype=int)
            if source_entity is target_entity:
                extra_bools = np.zeros(num_birth, dtype=bool)
                to_give_birth = np.concatenate((to_give_birth, extra_bools))
            # Note that np.place is a bit faster, but is currently buggy when
            # working with columns of structured arrays.
            # See https://github.com/numpy/numpy/issues/2462
            result[to_give_birth] = children['id']
            return result
        else:
            return None

    dtype = always(int)
Example #18
class Dump(TableExpression):
    no_eval = ('args', )
    kwonlyargs = {
        'filter': None,
        'missing': None,
        'header': True,
        'limit': None
    }

    def compute(self, context, *args, **kwargs):
        filter_value = kwargs.pop('filter', None)
        missing = kwargs.pop('missing', None)
        # periods = kwargs.pop('periods', None)
        header = kwargs.pop('header', True)
        limit = kwargs.pop('limit', None)
        entity = context.entity

        if args:
            expressions = list(args)
        else:
            # extra=False because we don't want globals nor "system" variables
            # (nan, period, __xxx__)
            # FIXME: we should also somehow "traverse" expressions in this case
            # too (args is ()) => all keys in the current context
            expressions = [
                Variable(entity, name) for name in context.keys(extra=False)
            ]

        str_expressions = [str(e) for e in expressions]
        if 'id' not in str_expressions:
            str_expressions.insert(0, 'id')
            expressions.insert(0, Variable(entity, 'id'))
            id_pos = 0
        else:
            id_pos = str_expressions.index('id')

        # if (self.periods is not None and len(self.periods) and
        #         'period' not in str_expressions):
        #     str_expressions.insert(0, 'period')
        #     expressions.insert(0, Variable('period'))
        #     id_pos += 1

        columns = []
        for expr in expressions:
            if filter_value is False:
                # dtype does not matter much
                expr_value = np.empty(0)
            else:
                # TODO: set filter before evaluating expressions
                expr_value = expr_eval(expr, context)
                if (filter_value is not None
                        and isinstance(expr_value, np.ndarray)
                        and expr_value.shape):
                    expr_value = expr_value[filter_value]
            columns.append(expr_value)

        ids = columns[id_pos]
        if isinstance(ids, np.ndarray) and ids.shape:
            numrows = len(ids)
        else:
            # FIXME: we need a test for this case (no idea how this can happen)
            numrows = 1

        # expand scalar columns to full columns in memory
        for idx, col in enumerate(columns):
            dtype = None
            if not isinstance(col, np.ndarray):
                dtype = type(col)
            elif not col.shape:
                dtype = col.dtype.type

            if dtype is not None:
                # TODO: try using itertools.repeat instead as it seems to be a
                # bit faster and would consume less memory (however, it might
                # not play very well with Pandas.to_csv)
                newcol = np.full(numrows, col, dtype=dtype)
                columns[idx] = newcol
            elif col.ndim > 1:
                # move last axis (should be id axis) first
                # np.moveaxis requires numpy >= 1.11
                # columns[idx] = np.moveaxis(col, -1, 0)
                columns[idx] = col.transpose((-1, ) +
                                             tuple(range(col.ndim - 1)))

        assert all(isinstance(col, np.ndarray) for col in columns)
        bad_lengths = {
            str_expr: col.shape
            for col, str_expr in zip(columns, str_expressions)
            if col.shape[0] != numrows
        }
        if bad_lengths:
            raise ValueError(
                "the first dimension of some columns is not the same length as the id column (%d): %s"
                % (numrows, str(bad_lengths)))

        if limit is not None:
            assert isinstance(limit, (int, long))
            columns = [col[:limit] for col in columns]

        # Transform to Python lists of plain Python types (i.e. no numpy
        # types). On py2, csv.writer uses repr(value) for floats and
        # str(value) for everything else, but on py3, since str(float) ==
        # repr(float), they switched to str(value) for everything. However,
        # str(np.float64) does not have full precision (it is truncated at
        # the 12th decimal). Besides, this seems to be faster (but probably
        # takes more memory). Also, on python2, converting produces
        # nicer/shorter float strings (see issue #225).
        columns = [c.tolist() for c in columns]
        data = zip(*columns)
        if header:
            table = [str_expressions]
            table.extend(data)
        else:
            table = list(data)
        return PrettyTable(table, missing)

    dtype = always(None)
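
The scalar-expansion and row-building steps in isolation (the 'weight' header is only for illustration):

import numpy as np

columns = [np.array([1, 2, 3]), 0.5]     # one real column, one scalar
numrows = 3
columns = [col if isinstance(col, np.ndarray) and col.shape
           else np.full(numrows, col) for col in columns]
columns = [c.tolist() for c in columns]
table = [['id', 'weight']] + list(zip(*columns))
# -> [['id', 'weight'], (1, 0.5), (2, 0.5), (3, 0.5)]
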
Example #19
class All(NumpyAggregate):
    np_func = np.all
    dtype = always(bool)
Example #20
class Any(NumpyAggregate):
    np_func = np.any
    dtype = always(bool)