class Regression(CompoundExpression):
    """Abstract base class for all regressions"""

    @staticmethod
    def add_filter(expr, filter):
        if filter is not None:
            missing_value = missing_values[getdtype(expr, None)]
            return Where(filter, expr, missing_value)
        else:
            return expr

    dtype = always(float)


class Trunc(FunctionExpr):
    # TODO: check that the dtype is correct at compilation time (__init__ is
    # too early since we do not have the context yet)
    # assert getdtype(self.args[0], context) == float
    def compute(self, context, expr):
        if isinstance(expr, np.ndarray):
            return expr.astype(int)
        else:
            return int(expr)

    dtype = always(int)


class Std(WeightedFilteredAggregateFunction):
    dtype = always(float)

    def compute(self, context, expr, filter=None, skip_na=True, weights=None):
        values, weights = self.get_filtered_values_weights(
            expr, filter_values=filter, weights=weights, skip_na=skip_na)
        if weights is None:
            return np.std(values)
        else:
            average = np.average(values, weights=weights)
            variance = np.average((values - average) ** 2, weights=weights)
            return np.sqrt(variance)


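# Illustrative sketch (not part of the library): the weighted standard
# deviation used by Std above, as a standalone helper assuming only numpy.
# With unit weights it reduces to np.std (population std, ddof=0).
def _weighted_std_sketch(values, weights):
    values = np.asarray(values, dtype=float)
    weights = np.asarray(weights, dtype=float)
    average = np.average(values, weights=weights)
    # weighted mean of squared deviations, then square root
    variance = np.average((values - average) ** 2, weights=weights)
    return np.sqrt(variance)

# e.g. _weighted_std_sketch([1, 2, 3, 4], [1, 1, 1, 1]) == np.std([1, 2, 3, 4])

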
class LogitRegr(Regression):
    funcname = 'logit_regr'

    def build_expr(self, context, expr, filter=None, align=None):
        score_expr = LogitScore(expr)
        if align is not None:
            # we do not need add_filter because Alignment already handles it
            return Alignment(score_expr, align, filter=filter)
        else:
            return self.add_filter(ComparisonOp('>', score_expr, 0.5), filter)

    dtype = always(bool)


class Erf(FunctionExpr):
    def compute(self, context, expr):
        if scipy is None:
            raise ImportError(
                "Failed to import scipy, which is required for erf(). "
                "Please make sure scipy is installed and working.")
        if isinstance(expr, np.ndarray):
            return special.erf(expr)
        else:
            # scipy.special.erf also handles scalars (there is no
            # scipy.math module)
            return float(special.erf(expr))

    dtype = always(float)


class Gini(WeightedFilteredAggregateFunction):
    def compute(self, context, expr, filter=None, skip_na=True, weights=None):
        values, weights = self.get_filtered_values_weights(
            expr, filter_values=filter, weights=weights, skip_na=skip_na)
        if weights is not None:
            # ported from a GPL algorithm written in R found at:
            # https://rdrr.io/cran/acid/src/R/weighted.gini.R
            sorted_indices = np.argsort(values)
            sorted_values = values[sorted_indices]
            sorted_weights = weights[sorted_indices]
            # force float to avoid overflows with integer inputs
            cumw = np.cumsum(sorted_weights, dtype=float)
            cumvalw = np.cumsum(sorted_values * sorted_weights, dtype=float)
            sumw = cumw[-1]
            sumvalw = cumvalw[-1]
            if sumvalw == 0:
                print("WARNING: gini(%s, filter=%s): value * weight is all "
                      "zeros (or nan) for filter"
                      % (self.args[0], self.args[1]))
            # FWIW, with all weights equal to 1, this formula simplifies to
            # the "usual" gini formula without weights, as seen below.
            # Using c = cumvalw for concision:
            # cumw = np.arange(1, n + 1)
            # gini = sum(c[1] * 1 - c[0] * 2 + c[2] * 2 - c[1] * 3 + ...
            #            + c[-1] * (n - 1) - c[-2] * n) / (c[-1] * n)
            # gini = sum(- 2 * c[0] - 2 * c[1] - 2 * c[2] - ... - 2 * c[-2]
            #            + c[-1] * (n - 1)) / (c[-1] * n)
            # gini = (- 2 * sum(c) + (n + 1) * c[-1]) / (c[-1] * n)
            # gini = (n + 1 - 2 * sum(c) / c[-1]) / n
            return (np.sum(cumvalw[1:] * cumw[:-1] - cumvalw[:-1] * cumw[1:])
                    / (sumvalw * sumw))
        else:
            sorted_values = np.sort(values)
            n = len(values)
            # force float to avoid overflows with integer input expressions
            cumval = np.cumsum(sorted_values, dtype=float)
            sumval = cumval[-1]
            if sumval == 0:
                print("WARNING: gini(%s, filter=%s): expression is all zeros "
                      "(or nan) for filter"
                      % (self.args[0], self.args[1]))
            # From Wikipedia (https://en.wikipedia.org/wiki/Gini_coefficient)
            # G = 1/n * (n + 1 - 2 * (sum((n + 1 - i) * a[i]) / sum(a[i])))
            #                          i=1..n               i=1..n
            # but since in Python we are indexing from 0, a[i] should be
            # written a[i - 1]. The central part is thus:
            #   sum((n + 1 - i) * a[i - 1])    for i=1..n
            # = sum((n - i) * a[i])            for i=0..n-1
            # = n * a[0] + (n - 1) * a[1] + (n - 2) * a[2] + ... + 1 * a[n - 1]
            # = sum(cumsum(a))
            return (n + 1 - 2 * np.sum(cumval) / sumval) / n

    dtype = always(float)


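# Illustrative sketch (not part of the library): the unweighted Gini formula
# derived in the comments above, as a standalone helper assuming only numpy.
def _gini_sketch(values):
    sorted_values = np.sort(np.asarray(values, dtype=float))
    n = len(sorted_values)
    cumval = np.cumsum(sorted_values)
    # G = (n + 1 - 2 * sum(cumsum(a)) / sum(a)) / n
    return (n + 1 - 2 * np.sum(cumval) / cumval[-1]) / n

# e.g. _gini_sketch([1, 1, 1, 1]) -> 0.0 (perfect equality)
#      _gini_sketch([0, 0, 0, 1]) -> 0.75, i.e. (n - 1) / n (maximal inequality)

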
class Avg(Sum):
    def count(self, source_rows, expr_value, weights_value):
        sums = super(Avg, self).count(source_rows, expr_value, weights_value)
        if weights_value is None:
            counts = np.bincount(source_rows)
        else:
            # sum weights
            counts = super(Avg, self).count(source_rows,
                                            expr_value=weights_value,
                                            weights_value=None)
        return sums / counts

    dtype = always(float)


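# Illustrative sketch (not part of the library): the np.bincount pattern that
# Avg builds on, computing a per-group (weighted) average in one pass.
# `group_rows` is a hypothetical array mapping each value to a group index.
def _group_average_sketch(group_rows, values, weights=None):
    if weights is None:
        sums = np.bincount(group_rows, weights=values)
        counts = np.bincount(group_rows)
    else:
        sums = np.bincount(group_rows, weights=values * weights)
        counts = np.bincount(group_rows, weights=weights)
    return sums / counts

# e.g. _group_average_sketch(np.array([0, 0, 1]), np.array([1.0, 3.0, 5.0]))
# -> array([2., 5.])

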
class Average(FunctionExpr):
    funcname = 'avg'
    no_eval = ('expr', 'filter', 'weights')

    def compute(self, context, expr, filter=None, skip_na=True, weights=None):
        # FIXME: either take "contextual filter" into account here (by using
        # self._getfilter), or don't do it in sum & gini
        if filter is not None and weights is not None:
            filter_weight_expr = BinaryOp('*', filter, weights)
        elif filter is not None:
            filter_weight_expr = filter
        elif weights is not None:
            filter_weight_expr = weights
        else:
            filter_weight_expr = True

        filter_weight = expr_eval(filter_weight_expr, context)
        if filter_weight is not True:
            filter_weight = np.asarray(filter_weight)

        # we do not build an expression for values * filter_weight because we
        # will need filter_weight "alone" to compute numrows
        values = expr_eval(expr, context)
        if np.isscalar(values):
            values = np.array([values])
        else:
            values = np.asarray(values)

        sum_func = np.sum
        if skip_na:
            filter_weight = filter_weight * ispresent(values)
            # even though we already set filter_weight to 0 for na, we still
            # need to ignore nans because nan * 0 == nan
            sum_func = np.nansum

        # we cannot simply test "filter_weight is True" because it can also
        # be 1 (because of the "* ispresent(values)" above)
        if np.isscalar(filter_weight) and filter_weight:
            values_filter_weight = values
            numrows = len(values)
        else:
            values_filter_weight = values * filter_weight
            numrows = np.sum(filter_weight)
        if not numrows:
            return float('nan')
        return sum_func(values_filter_weight) / numrows

    dtype = always(float)


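# Illustrative sketch (not part of the library): why Average can fold a
# boolean filter into the weights. False rows contribute 0 to both the
# weighted sum and to numrows, so the result equals np.average on the
# filtered subset.
def _filtered_average_sketch(values, filter, weights):
    values = np.asarray(values, dtype=float)
    fw = np.asarray(filter) * np.asarray(weights, dtype=float)
    return np.sum(values * fw) / np.sum(fw)

# e.g. with values=[1., 2., 3.], filter=[True, False, True], weights=[2., 1., 1.]:
# _filtered_average_sketch(values, filter, weights)
# == np.average([1., 3.], weights=[2., 1.]) == 5 / 3

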
class Median(WeightedFilteredAggregateFunction):
    dtype = always(float)

    def compute(self, context, expr, filter=None, skip_na=True, weights=None,
                weights_type='sampling'):
        values, weights = self.get_filtered_values_weights(
            expr, filter_values=filter, weights=weights, skip_na=skip_na)
        if weights is None:
            return np.median(values)
        else:
            return wpercentile(values, weights, 50, weights_type=weights_type)


class TimeAverage(TimeFunction):
    funcname = 'tavg'

    def compute(self, context, expr):
        entity = context.entity
        baseperiod = entity.base_period
        period = context.period - 1
        res_size = len(entity.array)
        num_values = np.zeros(res_size, dtype=int)
        # current period
        last_period_wh_value = np.full(res_size, context.period, dtype=int)
        sum_values = np.zeros(res_size, dtype=float)
        id_to_rownum = context.id_to_rownum
        while period >= baseperiod:
            ids, values = self.value_for_period(expr, period, context,
                                                fill=None)
            # filter out lines which are present because there was a value for
            # that individual at that period but not for that column
            acceptable_rows = hasvalue(values)
            acceptable_ids = ids[acceptable_rows]
            if len(acceptable_ids):
                acceptable_values = values[acceptable_rows]
                value_rows = id_to_rownum[acceptable_ids]
                has_value = np.zeros(res_size, dtype=bool)
                safe_put(has_value, value_rows, True)
                period_value = np.zeros(res_size, dtype=float)
                safe_put(period_value, value_rows, acceptable_values)
                # each observed value is carried forward until the next
                # period with a value, so it is weighted by that duration
                duration = last_period_wh_value - period
                num_values += has_value * duration
                sum_values += period_value * duration
                last_period_wh_value[has_value] = period
            period -= 1
        return sum_values / num_values

    dtype = always(float)


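# Illustrative sketch (not part of the library): the carry-forward time
# average computed by TimeAverage above, for a single individual. `observed`
# is a hypothetical {period: value} dict; each value is assumed to hold until
# the next period with a value (or the current period).
def _tavg_sketch(observed, current_period):
    periods = sorted(observed)
    next_periods = periods[1:] + [current_period]
    durations = [nxt - p for p, nxt in zip(periods, next_periods)]
    total = sum(observed[p] * d for p, d in zip(periods, durations))
    return total / sum(durations)

# e.g. _tavg_sketch({2000: 10.0, 2002: 20.0}, 2003)
# -> (10.0 * 2 + 20.0 * 1) / 3 == 40.0 / 3

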
def make_np_class(baseclass, signature, dtypefunc):
    name, args = split_signature(signature)
    if isinstance(dtypefunc, type):
        dtypefunc = always(dtypefunc)
    evalfunc = getattr(np.random, name)

    # we need to explicitly set funcname, because the usual mechanism of
    # getting it from the class name during class creation (in the metaclass)
    # does not work because the class name is not set yet.
    class FuncClass(baseclass):
        np_func = evalfunc
        funcname = name
        argspec = argspec(args, **baseclass.kwonlyargs)
        if dtypefunc is not None:
            dtype = dtypefunc

    FuncClass.__name__ = name.capitalize()
    FuncClass.__doc__ = clean_docstring(evalfunc.__doc__)
    return FuncClass


class LogitScore(CompoundExpression):
    funcname = 'logit_score'

    def build_expr(self, context, expr):
        if isinstance(expr, basestring):
            # assume it is a filename
            expr = ExtExpr(expr)

        u = Uniform()
        # expr in (0, 0.0, False, '')
        if not isinstance(expr, Expr) and not expr:
            expr = u
        else:
            epsilon = Logit(u)
            # expr = logistic(expr - epsilon)
            expr = Logistic(BinaryOp('-', expr, epsilon))
        return expr

    dtype = always(float)


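# Illustrative sketch (not part of the library): why LogitRegr can threshold
# a logit score at 0.5. With u ~ Uniform(0, 1):
#   logistic(a - logit(u)) > 0.5  <=>  a - logit(u) > 0  <=>  u < logistic(a)
# so the comparison is a Bernoulli draw with probability logistic(a).
def _logit_score_sketch(a, size):
    def logistic(x):
        return 1 / (1 + np.exp(-x))

    def logit(p):
        return np.log(p / (1 - p))

    u = np.random.uniform(size=size)
    score = logistic(a - logit(u))
    return score > 0.5  # True with probability logistic(a)

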
class Count(FunctionExpr):
    def compute(self, context, filter=None, weights=None):
        if filter is not None:
            filter = np.asarray(filter)
            # TODO: check this at "compile" time (in __init__), though for
            # that we need to know the type of all temporary variables first
            if not np.issubdtype(filter.dtype, np.bool_):
                raise ValueError("count filter must be a boolean expression")
        if weights is not None:
            weights = np.asarray(weights)

        if filter is None and weights is None:
            return context_length(context)
        elif weights is None:
            return np.sum(filter)
        elif filter is None:
            return np.sum(weights)
        else:
            return np.sum(weights * filter)

    # FIXME: this is wrong when weights are floats
    dtype = always(int)


class Matching(FilteredExpression):
    """Base class for matching functions"""

    dtype = always(int)


class AlignmentAbsoluteValues(FilteredExpression):
    funcname = 'align_abs'
    no_eval = ('filter', 'secondary_axis', 'expressions', 'method')

    def __init__(self, *args, **kwargs):
        super(AlignmentAbsoluteValues, self).__init__(*args, **kwargs)

        need = self.args[1]
        if isinstance(need, basestring):
            fpath = os.path.join(config.input_directory, need)
            need = load_ndarray(fpath, float)
            # XXX: store args in a list so that we can modify it?
            # self.args[1] = load_ndarray(fpath, float)
            # XXX: but we should be able to do better than a list, eg.
            # self.args.need = load_ndarray(fpath, float)
            self.args = (self.args[0], need) + self.args[2:]
        self.past_error = None

    def collect_variables(self):
        # args[9] is the "link" argument
        # if self.args.link is None:
        if self.args[9] is None:
            return FilteredExpression.collect_variables(self)
        else:
            # in this case, it's tricky
            return set()

    def _eval_need(self, context, need, expressions, possible_values,
                   expressions_context=None):
        assert isinstance(need, (np.ndarray, la.LArray))
        if expressions_context is None:
            expressions_context = context
        # When given a 0d array, we convert it to 1d. This can happen e.g. for
        # >>> b = True; x = ne.evaluate('where(b, 0.1, 0.2)')
        # >>> isinstance(x, np.ndarray)
        # True
        # >>> x.shape
        # ()
        if not need.shape:
            need = np.array([need])

        if isinstance(need, la.LArray):
            if not expressions:
                expressions = [Variable(expressions_context.entity, name)
                               for name in need.axes.names]
            if not possible_values:
                possible_values = need.axes.labels

        assert isinstance(need, (np.ndarray, la.LArray))

        if len(expressions) != len(possible_values):
            raise Exception("align() expressions and possible_values "
                            "have different length: %d vs %d"
                            % (len(expressions), len(possible_values)))

        if 'period' in [str(e) for e in expressions]:
            period = context.period
            expressions, possible_values, need = \
                kill_axis('period', period, expressions, possible_values,
                          need)

        # kill any axis where the value is constant for all individuals
        # satisfying the filter
        # tokill = [(expr, column[0])
        #           for expr, column in zip(expressions, columns)
        #           if isconstant(column, filter_value)]
        # for expr, value in tokill:
        #     expressions, possible_values, need = \
        #         kill_axis(str(expr), value, expressions, possible_values,
        #                   need)

        return need, expressions, possible_values

    def _handle_frac_need(self, need, method):
        # handle the "fractional people problem"
        if not np.issubdtype(need.dtype, np.integer):
            if method == 'uniform':
                int_need = need.astype(int)
                if config.debug and config.log_level == "processes":
                    print()
                    print("random sequence position before:",
                          np.random.get_state()[2])
                u = np.random.uniform(size=need.shape)
                if config.debug and config.log_level == "processes":
                    print("random sequence position after:",
                          np.random.get_state()[2])
                need = int_need + (u < need - int_need)
            elif method == 'cutoff':
                int_need = need.astype(int)
                frac_need = need - int_need
                need = int_need
                # the sum of fractional objects is the number of extra
                # objects we want aligned
                extra_wanted = int(round(np.sum(frac_need)))
                if extra_wanted:
                    # search the cutoff point yielding:
                    # sum(frac_need >= cutoff) == extra_wanted
                    sorted_frac_need = frac_need.flatten()
                    sorted_frac_need.sort()
                    cutoff = sorted_frac_need[-extra_wanted]
                    extra = frac_need >= cutoff
                    if np.sum(extra) > extra_wanted:
                        # This case can only happen when several bins have
                        # the same frac_need. In this case we could try to be
                        # even closer to our target by randomly selecting X
                        # out of the Y bins which have a frac_need equal to
                        # the cutoff (in addition to all those which are
                        # strictly greater than it).
                        assert np.sum(frac_need == cutoff) > 1
                    need += extra
            elif method == 'round':
                # always use 0.5 as a cutoff point
                need = (need + 0.5).astype(int)
        assert np.issubdtype(need.dtype, np.integer)
        return need

    def _add_past_error(self, context, need, method='default'):
        if method == 'default':
            method = context.get('__on_align_error__')
        if method == 'carry':
            if self.past_error is None:
                # TODO: we should store this somewhere in the context instead
                # XXX: I wonder if past_error shouldn't be float (and
                # _add_past_error be called before _handle_frac_need
                # instead of after like it is now).
                self.past_error = np.zeros(need.shape, dtype=int)
            print("adding %d individuals from last period error"
                  % np.sum(self.past_error))
            need += self.past_error
        return need

    def _display_unaligned(self, expressions, ids, columns, unaligned):
        print("Warning: %d individual(s) do not fit in any alignment "
              "category" % np.sum(unaligned))
        header = ['id'] + [str(e) for e in expressions]
        columns = [ids] + columns
        num_rows = len(ids)
        print(PrettyTable([header] +
                          [[col[row] for col in columns]
                           for row in range(num_rows) if unaligned[row]]))

    def compute(self, context, score, need=None, filter=None, take=None,
                leave=None, expressions=None, possible_values=None,
                errors='default', frac_need='uniform', link=None,
                secondary_axis=None, method='bysorting'):

        if method not in ("bysorting", "sidewalk"):
            raise Exception("Method for alignment should be either "
                            "'bysorting' or 'sidewalk'")
        if method == 'bysorting' and need is None:
            raise Exception("need argument is required when using the "
                            "'bysorting' method (which is the default)")

        if method == "sidewalk":
            # need is calculated over score and we could think of
            # calculating it without leave_filter and take_filter
            if need is None:
                need = sum(score)

        # XXX: move this to _eval_need?
        # need is a single scalar
        if np.isscalar(need):
            need = [need]

        # need is a non-ndarray sequence
        if isinstance(need, (tuple, list)):
            need = np.array(need)
        assert isinstance(need, (np.ndarray, la.LArray))

        if expressions is None:
            expressions = []

        if possible_values is None:
            possible_values = []
        else:
            possible_values = [np.array(pv) for pv in possible_values]

        if frac_need not in ('uniform', 'cutoff', 'round'):
            cls = ValueError if isinstance(frac_need, basestring) else TypeError
            raise cls("frac_need should be one of: 'uniform', 'cutoff' or "
                      "'round'")

        if secondary_axis is not None and link is None:
            raise Exception("the 'secondary_axis' argument is only valid in "
                            "combination with the 'link' argument")
        if not isinstance(secondary_axis, (type(None), int, Variable)):
            raise Exception("'secondary_axis' should be either an integer or "
                            "an axis name (but got '%s' which is of type "
                            "'%s')" % (secondary_axis, type(secondary_axis)))

        func = self.align_no_link if link is None else self.align_link
        return func(context, score, need, filter, take, leave, expressions,
                    possible_values, errors, frac_need, link, secondary_axis,
                    method)

    def align_no_link(self, context, score, need, filter, take, leave,
                      expressions, possible_values, errors, frac_need, link,
                      secondary_axis, method):

        ctx_length = context_length(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values)

        filter_value = expr_eval(self._getfilter(context, filter), context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        # retrieve the columns we need to work with
        if expressions:
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum is not necessarily equal to len(a), because some individuals
        # might not fit in any group (eg if some alignment data is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        # noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        # FIXME: either handle past_error in no link (currently, the past
        # error is added... but never computed, so always 0!) or raise an
        # error in case errors='carry' is used with no link.
        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    score, take, leave, method)

    def align_link(self, context, score, need, filter, take, leave,
                   expressions, possible_values, errors, frac_need, link,
                   secondary_axis, method):
        target_context = link._target_context(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values,
                            target_context)

        # handle secondary axis
        if isinstance(secondary_axis, Expr):
            axis_name = str(secondary_axis)
            try:
                secondary_axis = need.axes.names.index(axis_name)
            except ValueError:
                raise ValueError("invalid value for secondary_axis: there is "
                                 "no axis named '%s' in the need array"
                                 % axis_name)
        elif isinstance(secondary_axis, int):
            if secondary_axis >= need.ndim:
                raise Exception("%d is an invalid value for secondary_axis: "
                                "it should be smaller than the number of "
                                "dimensions of the need array (%d)"
                                % (secondary_axis, need.ndim))
        else:
            assert secondary_axis is None

        # evaluate columns
        target_columns = [expr_eval(e, target_context) for e in expressions]
        # this is a one2many, so the link column is on the target side
        link_column = target_context[link._link_field]

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            reverse_link = Many2One("reverse", link._link_field,
                                    context.entity.name)
            target_filter = LinkGet(reverse_link, filter_expr, False)
            target_filter_value = expr_eval(target_filter, target_context)

            # It is often not a good idea to pre-filter columns like this
            # because we lose information about "indices", but in this case,
            # it is fine, because we do not need that information afterwards.
            filtered_columns = [col[target_filter_value]
                                if isinstance(col, np.ndarray) and col.shape
                                else [col]
                                for col in target_columns]

            link_column = link_column[target_filter_value]
        else:
            filtered_columns = target_columns
            target_filter_value = None

        # compute labels for filtered columns
        # -----------------------------------
        # We can't use _group_labels_light because group_labels assigns labels
        # on a first come, first served basis, not using the order they are
        # in pvalues
        fcols_labels = []
        filtered_length = len(filtered_columns[0])
        unaligned = np.zeros(filtered_length, dtype=bool)
        # XXX: probably needs to use "possible_values" instead
        for fcol, pvalues in zip(filtered_columns, need.axes.labels):
            pvalues_index = dict((v, i) for i, v in enumerate(pvalues))
            fcol_labels = np.empty(filtered_length, dtype=np.int32)
            for i in range(filtered_length):
                value_idx = pvalues_index.get(fcol[i], -1)
                if value_idx == -1:
                    unaligned[i] = True
                fcol_labels[i] = value_idx
            fcols_labels.append(fcol_labels)

        num_unaligned = np.sum(unaligned)
        if num_unaligned:
            # further filter label columns and link_column
            validlabels = ~unaligned
            fcols_labels = [labels[validlabels] for labels in fcols_labels]
            link_column = link_column[validlabels]

            # display who the evil ones are
            ids = target_context['id']
            if target_filter_value is not None:
                filtered_ids = ids[target_filter_value]
            else:
                filtered_ids = ids
            self._display_unaligned(expressions, filtered_ids,
                                    filtered_columns, unaligned)
        else:
            del unaligned

        id_to_rownum = context.id_to_rownum
        missing_int = missing_values[int]
        source_ids = link_column

        if len(id_to_rownum):
            source_rows = id_to_rownum[source_ids]
            # filter out missing values: those where the value of the link
            # points to nowhere (-1)
            source_rows[source_ids == missing_int] = missing_int
        else:
            assert np.all(source_ids == missing_int)
            source_rows = []

        # filtered_columns are not filtered further on invalid labels
        # (num_unaligned) but this is not a problem since those will be
        # ignored by GroupBy anyway.
        # TODO: the result of this is ugly because a groupby on *values*
        # returns an LArray with those values (ndarrays) as axes *names*. Ugh.
        groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

        # FIXME: target_context is not correct, as it is not filtered while
        # filtered_columns are. Since we do not use the context "columns" it
        # mostly works but I had to disable an assertion in utils.expand
        # because the length of the context is not correct.
        num_candidates = expr_eval(groupby_expr, target_context)

        # fetch the list of linked individuals for each local individual,
        # e.g. the list of person ids for each household
        hh = np.empty(context_length(context), dtype=object)
        # we can't use .fill([]) because it reuses the same list for all
        # objects
        for i in range(len(hh)):
            hh[i] = []

        # Even though this is highly sub-optimal, the time taken to create
        # those lists of ids is very small compared to the total time taken
        # for align_other (0.2s vs 4.26), so I shouldn't care too much about
        # it for now.

        # target_row (row of person) is an index valid for *filtered/label*
        # columns!
        for target_row, source_row in enumerate(source_rows):
            if source_row == -1:
                continue
            hh[source_row].append(target_row)

        class FakeContainer(object):
            def __init__(self, length):
                self.length = length

            def __len__(self):
                return self.length

        groups = [FakeContainer(g) for g in num_candidates]
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        aligned, error = align_link_nd(score, need, num_candidates, hh,
                                       fcols_labels, secondary_axis)
        self.past_error = error
        return aligned

    def _get_need_correction(self, groups, possible_values):
        return 1

    dtype = always(bool)


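# Illustrative sketch (not part of the library): the 'cutoff' strategy from
# _handle_frac_need above, as a standalone helper assuming only numpy. The
# fractional parts are rounded so that exactly round(sum(frac_need)) cells
# get one extra unit, awarded to the cells with the largest fractions.
def _cutoff_round_sketch(need):
    need = np.asarray(need, dtype=float)
    int_need = need.astype(int)
    frac_need = need - int_need
    extra_wanted = int(round(np.sum(frac_need)))
    if extra_wanted:
        # cutoff such that sum(frac_need >= cutoff) == extra_wanted
        cutoff = np.sort(frac_need.flatten())[-extra_wanted]
        int_need = int_need + (frac_need >= cutoff)
    return int_need

# e.g. _cutoff_round_sketch([1.2, 2.7, 0.1]) -> array([1, 3, 0])
# (total preserved: 1 + 3 + 0 == round(1.2 + 2.7 + 0.1))

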
class Exp(NumexprFunction):
    argspec = argspec('expr')
    dtype = always(float)


class New(FilteredExpression):
    no_eval = ('filter', 'kwargs')

    def _initial_values(self, array, to_give_birth, num_birth,
                        default_values):
        return get_default_array(num_birth, array.dtype, default_values)

    @classmethod
    def _collect_kwargs_variables(cls, kwargs):
        used_variables = set()
        # kwargs are stored as a list of (k, v) pairs
        for k, v in kwargs.items():
            used_variables.update(collect_variables(v))
        return used_variables

    def compute(self, context, entity_name=None, filter=None, number=None,
                **kwargs):
        if filter is not None and number is not None:
            # Having neither is allowed, though, as there can be a contextual
            # filter. Also, there is no reason to prevent the whole population
            # from giving birth, even though the usefulness of such usage
            # seems dubious.
            raise ValueError("new() 'filter' and 'number' arguments are "
                             "mutually exclusive")
        source_entity = context.entity
        if entity_name is None:
            target_entity = source_entity
        else:
            target_entity = context.entities[entity_name]

        # target context is the context where the new individuals will be
        # created
        if target_entity is source_entity:
            target_context = context
        else:
            # we do need to copy the data (.extra) because we will insert into
            # the entity.array anyway => fresh_data=True
            target_context = context.clone(fresh_data=True,
                                           entity_name=target_entity.name)

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            to_give_birth = expr_eval(filter_expr, context)
            num_birth = to_give_birth.sum()
        elif number is not None:
            to_give_birth = None
            num_birth = number
        else:
            to_give_birth = np.ones(len(context), dtype=bool)
            num_birth = len(context)

        array = target_entity.array
        default_values = target_entity.fields.default_values

        id_to_rownum = target_entity.id_to_rownum
        num_individuals = len(id_to_rownum)

        children = self._initial_values(array, to_give_birth, num_birth,
                                        default_values)
        if num_birth:
            children['id'] = np.arange(num_individuals,
                                       num_individuals + num_birth)
            children['period'] = context.period

            used_variables = [v.name
                              for v in self._collect_kwargs_variables(kwargs)]
            if to_give_birth is None:
                assert not used_variables
                child_context = context.empty(num_birth)
            else:
                child_context = context.subset(to_give_birth, used_variables,
                                               filter_expr)
            for k, v in kwargs.items():
                if k not in array.dtype.names:
                    print("WARNING: {} is unknown, ignoring it!".format(k))
                    continue
                children[k] = expr_eval(v, child_context)

        add_individuals(target_context, children)

        expr_cache.invalidate(context.period, context.entity_name)

        # result is the ids of the new individuals corresponding to the source
        # entity
        if to_give_birth is not None:
            result = np.full(context_length(context), -1, dtype=int)
            if source_entity is target_entity:
                extra_bools = np.zeros(num_birth, dtype=bool)
                to_give_birth = np.concatenate((to_give_birth, extra_bools))
            # Note that np.place is a bit faster, but is currently buggy when
            # working with columns of structured arrays.
            # See https://github.com/numpy/numpy/issues/2462
            result[to_give_birth] = children['id']
            return result
        else:
            return None

    dtype = always(int)


class Dump(TableExpression):
    no_eval = ('args',)
    kwonlyargs = {'filter': None, 'missing': None, 'header': True,
                  'limit': None}

    def compute(self, context, *args, **kwargs):
        filter_value = kwargs.pop('filter', None)
        missing = kwargs.pop('missing', None)
        # periods = kwargs.pop('periods', None)
        header = kwargs.pop('header', True)
        limit = kwargs.pop('limit', None)

        entity = context.entity

        if args:
            expressions = list(args)
        else:
            # extra=False because we don't want globals nor "system" variables
            # (nan, period, __xxx__)
            # FIXME: we should also somehow "traverse" expressions in this
            # case too (args is ()) => all keys in the current context
            expressions = [Variable(entity, name)
                           for name in context.keys(extra=False)]

        str_expressions = [str(e) for e in expressions]
        if 'id' not in str_expressions:
            str_expressions.insert(0, 'id')
            expressions.insert(0, Variable(entity, 'id'))
            id_pos = 0
        else:
            id_pos = str_expressions.index('id')

        # if (self.periods is not None and len(self.periods) and
        #         'period' not in str_expressions):
        #     str_expressions.insert(0, 'period')
        #     expressions.insert(0, Variable('period'))
        #     id_pos += 1

        columns = []
        for expr in expressions:
            if filter_value is False:
                # dtype does not matter much
                expr_value = np.empty(0)
            else:
                # TODO: set filter before evaluating expressions
                expr_value = expr_eval(expr, context)
                if (filter_value is not None
                        and isinstance(expr_value, np.ndarray)
                        and expr_value.shape):
                    expr_value = expr_value[filter_value]
            columns.append(expr_value)

        ids = columns[id_pos]
        if isinstance(ids, np.ndarray) and ids.shape:
            numrows = len(ids)
        else:
            # FIXME: we need a test for this case (no idea how this can
            # happen)
            numrows = 1

        # expand scalar columns to full columns in memory
        for idx, col in enumerate(columns):
            dtype = None
            if not isinstance(col, np.ndarray):
                dtype = type(col)
            elif not col.shape:
                dtype = col.dtype.type
            if dtype is not None:
                # TODO: try using itertools.repeat instead as it seems to be
                # a bit faster and would consume less memory (however, it
                # might not play very well with Pandas.to_csv)
                newcol = np.full(numrows, col, dtype=dtype)
                columns[idx] = newcol
            elif col.ndim > 1:
                # move the last axis (should be the id axis) first
                # np.moveaxis requires numpy >= 1.11
                # columns[idx] = np.moveaxis(col, -1, 0)
                columns[idx] = col.transpose((-1,) +
                                             tuple(range(col.ndim - 1)))

        assert all(isinstance(col, np.ndarray) for col in columns)
        bad_lengths = {str_expr: col.shape
                       for col, str_expr in zip(columns, str_expressions)
                       if col.shape[0] != numrows}
        if bad_lengths:
            raise ValueError("first dimension of some columns are not the "
                             "same length as the id column (%d): %s"
                             % (numrows, str(bad_lengths)))

        if limit is not None:
            assert isinstance(limit, (int, long))
            columns = [col[:limit] for col in columns]

        # Transform to Python lists of normal Python types (ie no numpy
        # types). On py2, csv.writer uses repr(value) for floats and
        # str(value) for other types, but on py3, since str(float) ==
        # repr(float), they switched to str(value) for everything, and
        # str(np.float64) does not have full precision (it is truncated at
        # the 12th decimal). Besides, this seems to be faster (but probably
        # takes more memory). Also, on python2, converting produces
        # nicer/shorter float strings (see issue #225).
        columns = [c.tolist() for c in columns]
        data = zip(*columns)
        if header:
            table = [str_expressions]
            table.extend(data)
        else:
            table = list(data)
        return PrettyTable(table, missing)

    dtype = always(None)


class All(NumpyAggregate):
    np_func = np.all
    dtype = always(bool)


class Any(NumpyAggregate):
    np_func = np.any
    dtype = always(bool)