def store_result(self, result, context):
    if isinstance(result, (np.ndarray, la.LArray)):
        res_type = result.dtype.type
    else:
        res_type = type(result)

    if self.temporary:
        target = self.entity.temp_variables
    else:
        # we cannot store/cache self.entity.array[self.name] because the
        # array object can change (eg when enlarging it due to births)
        target = self.entity.array
        result = np.asarray(result)
        # TODO: assert type for temporary variables too
        target_type_idx = type_to_idx[target[self.name].dtype.type]
        res_type_idx = type_to_idx[res_type]
        if res_type_idx > target_type_idx:
            raise Exception(
                "trying to store %s value into '%s' field which is of "
                "type %s" % (idx_to_type[res_type_idx].__name__,
                             self.name,
                             idx_to_type[target_type_idx].__name__))

    # the whole column is updated
    target[self.name] = result

    # invalidate cache
    period = context.period
    if isinstance(period, np.ndarray):
        assert np.isscalar(period) or not period.shape
        period = int(period)
    expr_cache.invalidate(period, context.entity_name,
                          Variable(self.entity, self.name))
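# Illustration (not part of the original source): the compatibility check
# above assumes type_to_idx ranks types from narrowest to widest (roughly
# bool < int < float), so a result of a wider type than the target field is
# rejected. With a hypothetical mapping:
# >>> type_to_idx = {bool: 0, int: 1, float: 2}
# >>> type_to_idx[float] > type_to_idx[int]
# True   # storing a float result into an int field raises an Exception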
def compute(self, context, coefficients):
    assert isinstance(coefficients, la.LArray)
    # XXX: change to "variable"? because we can use temporary variables too!
    #      or even to "expressions" if we want to support expressions.
    # FIXME013: in any case, it should be singular
    field_axis = coefficients.axes['fields']
    other_axes = coefficients.axes - field_axis

    expr = None
    # XXX: instead of retrieving labels along a dimension & splitting
    #      manually, we should have a "split" operation in LArray
    #      (opposite of stack)
    for name in field_axis.labels:
        coef_value = coefficients[name]
        # automatically index other (remaining) dimensions
        if other_axes:
            coef_value = index_array_by_variables(coef_value, context,
                                                  other_axes)
        coef_var = self.add_tmp_var(context, coef_value)
        if name != 'constant':
            # XXX: should I reuse variable instances defined in the entity
            #      at context.entity.variables[name]?
            # XXX: parse expressions instead of only simple Variable?
            term = _mul(Variable(context.entity, name), coef_var)
        else:
            term = coef_var
        if expr is None:
            expr = term
        else:
            expr = _plus(expr, term)
    return expr_eval(expr, context)
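# Illustration (not part of the original source): assuming a coefficients
# array whose 'fields' axis has labels ['constant', 'age', 'male'], the loop
# above builds (and then evaluates) an expression equivalent to:
# >>> t0 + age * t1 + male * t2
# where t0, t1 and t2 are temporary variables holding the (possibly
# individually-indexed) coefficient values and age/male are entity fields.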
def build_regression_expr(self, expr, mult=0.0, error_var=None):
    if error_var is not None:
        # expr += error_var
        expr = BinaryOp('+', expr, Variable(None, error_var))
    if mult:
        # expr += normal(0, 1) * mult
        expr = BinaryOp('+', expr, BinaryOp('*', Normal(0, 1), mult))
    return expr
def _eval_need(self, context, need, expressions, possible_values,
               expressions_context=None):
    assert isinstance(need, (np.ndarray, la.LArray))
    if expressions_context is None:
        expressions_context = context
    # When given a 0d array, we convert it to 1d. This can happen e.g. for
    # >>> b = True; x = ne.evaluate('where(b, 0.1, 0.2)')
    # >>> isinstance(x, np.ndarray)
    # True
    # >>> x.shape
    # ()
    if not need.shape:
        need = np.array([need])

    if isinstance(need, la.LArray):
        if not expressions:
            expressions = [Variable(expressions_context.entity, name)
                           for name in need.axes.names]
        if not possible_values:
            possible_values = need.axes.labels

    assert isinstance(need, (np.ndarray, la.LArray))

    if len(expressions) != len(possible_values):
        raise Exception("align() expressions and possible_values "
                        "have different length: %d vs %d"
                        % (len(expressions), len(possible_values)))

    if 'period' in [str(e) for e in expressions]:
        period = context.period
        expressions, possible_values, need = \
            kill_axis('period', period, expressions, possible_values, need)

    # kill any axis where the value is constant for all individuals
    # satisfying the filter
    # tokill = [(expr, column[0])
    #           for expr, column in zip(expressions, columns)
    #           if isconstant(column, filter_value)]
    # for expr, value in tokill:
    #     expressions, possible_values, need = \
    #         kill_axis(str(expr), value, expressions, possible_values,
    #                   need)
    return need, expressions, possible_values
def get(self, key, *args, **kwargs):
    if isinstance(key, basestring):
        entity = self._target_entity
        # We could use entity.variables instead but since local variables
        # are not in there (and links can currently point to them), we need
        # to special case that and it does not make things any simpler.
        if key in entity.links:
            key = entity.links[key]
        else:
            key = Variable(entity, key)
    return LinkGet(self, key, *args, **kwargs)
def __getattr__(self, key):
    if key in self.macros:
        raise Exception("Using macros with the 'other' link is not "
                        "supported yet")
        # macro = self.macros[key]
        # variables = macro.collect_variables()
        # renames = dict((name, self.prefix + name) for name in variables)
        # return macro.rename_variables(renames)
    if key in self.links:
        link = self.links[key]
        # noinspection PyProtectedMember
        return link.__class__(link._name,
                              self.prefix + link._link_field,
                              link._target_entity_name,
                              link._target_entity)
    return Variable(self.entity, self.prefix + key)
def get_group_context(context, varnames):
    ent_name = context['__entity__']
    entity = context['__entities__'][ent_name]
    group_context = context.copy()
    entity_context = group_context[ent_name].copy()
    # This creates a Variable for each name in varnames.
    # There is an obscure, subtle bug here. get_group_context is used both
    # for functions and for "code blocks". For functions, where varnames
    # represent arguments, it is probably fine to shadow global variables
    # with local variables, but for code blocks this means that a global
    # VariableMethodHybrid gets replaced by a simple Variable if you set the
    # value of that global VariableMethodHybrid anywhere in the method.
    # For example, this will fail:
    #     age: age + 1
    #     ageing:
    #         - age: age + 1
    #         - age()
    # I will not fix this though, as it is too obscure and
    # VariableMethodHybrids should not be used anyway.
    entity_context.update((name, Variable(entity, name))
                          for name in varnames)
    group_context[ent_name] = entity_context
    return group_context
def execute(self, s):
    entity = self.entity
    if entity is None:
        raise Exception(entity_required)
    period = self.period
    if period is None:
        raise Exception(period_required)
    entity_name = self.entity.name
    parse_ctx = self.parse_ctx.copy()
    local_parse_ctx = parse_ctx[entity_name].copy()
    # add all currently defined temp_variables because otherwise
    # local variables (defined within a function) wouldn't be available
    local_parse_ctx.update((name, Variable(entity, name))
                           for name in entity.temp_variables.keys())
    parse_ctx[entity_name] = local_parse_ctx
    expr = parse(s, parse_ctx, interactive=True)
    result = expr_eval(expr, self.eval_ctx)
    if result is None:
        print("done.")
    return result
def variables(self):
    if self._variables is None:
        if self.process_strings:
            processes = list(self.process_strings.items())
        else:
            processes = []

        # names of all processes (hybrid or not) of the entity
        process_names = set(k for k, v in processes if k is not None)

        # names of all entity variables (temporary or not) which are set
        # globally
        all_entity_variables = set(self.collect_predictors(processes))

        field_names = set(self.fields.names)

        # normal fields (non-callable/no hybrid variable-function for them)
        variables = dict((name, Variable(self, name, type_))
                         for name, type_ in self.fields.name_types
                         if name in field_names - process_names)
        if config.debug:
            print("hybrids (field and method):",
                  field_names & process_names)
        # callable fields (fields with a process of the same name)
        variables.update((name, VariableMethodHybrid(self, name, type_))
                         for name, type_ in self.fields.name_types
                         if name in field_names & process_names)
        if config.debug:
            print("hybrids (global temporary & method):",
                  all_entity_variables - field_names)
        # global temporaries (they are all callable)
        variables.update((name, VariableMethodHybrid(self, name))
                         for name in all_entity_variables - field_names)
        variables.update(self.links)
        self._variables = variables
    return self._variables
def compute(self, context, a, size=None, replace=True, p=None):
    if isinstance(a, la.LArray):
        assert p is None
        outcomes_axis = a.axes['outcomes']
        outcomes = outcomes_axis.labels
        other_axes = a.axes - outcomes_axis
        if other_axes:
            a = index_array_by_variables(a, context, other_axes)
            p = np.asarray(a.transpose('outcomes'))
        else:
            p = np.asarray(a)
        a = outcomes

    if isinstance(p, (list, np.ndarray)) and len(p) and not np.isscalar(p[0]):
        assert len(p) == len(a)
        assert all(len(px) == size for px in p)
        assert len(a) >= 2

        if isinstance(p, list) and any(isinstance(px, la.LArray)
                                       for px in p):
            p = [np.asarray(px) for px in p]
        ap = np.asarray(p)
        cdf = ap.cumsum(axis=0)

        # copied & adapted from numpy/random/mtrand/mtrand.pyx
        atol = np.sqrt(np.finfo(np.float64).eps)
        if np.issubdtype(ap.dtype, np.floating):
            atol = max(atol, np.sqrt(np.finfo(ap.dtype).eps))

        if np.any(np.abs(cdf[-1] - 1.) > atol):
            raise ValueError("probabilities do not sum to 1")
        cdf /= cdf[-1]

        # I have not found a way to do this without an explicit loop as
        # np.digitize only supports a 1d array for bins. What we do is
        # worse than a linear "search" since we always evaluate all
        # possibilities (there is no shortcut when the value is found).
        # It might be faster to rewrite this using numba + np.digitize
        # for each individual (assuming it has a low setup overhead).

        # the goal is to build something like:
        # if(u < proba1, outcome1,
        #    if(u < proba2, outcome2,
        #       outcome3))
        data = {'u': np.random.uniform(size=size)}
        expr = a[-1]
        # iterate in reverse and skip last
        pairs = zip(cdf[-2::-1], a[-2::-1])
        for i, (proba_x, outcome_x) in enumerate(pairs):
            data['p%d' % i] = proba_x
            expr = Where(ComparisonOp('<', Variable(None, 'u'),
                                      Variable(None, 'p%d' % i)),
                         outcome_x, expr)
        local_ctx = context.clone(fresh_data=True, entity_data=data)
        return expr.evaluate(local_ctx)
    else:
        return NumpyRandom.compute(self, context, a, size, replace, p)
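# Illustration (not part of the original source): a minimal numpy-only sketch
# of what the nested Where expression above computes, in the simple case of a
# single 1d probability vector shared by all individuals. Each uniform draw u
# picks the first outcome whose cumulative probability exceeds it:
# >>> import numpy as np
# >>> outcomes = np.array([1, 2, 3])
# >>> cdf = np.array([0.1, 0.2, 0.7]).cumsum()   # [0.1, 0.3, 1.0]
# >>> u = np.random.uniform(size=5)
# >>> outcomes[np.digitize(u, cdf)]              # 5 sampled outcomes
# The per-individual case uses a 2d cdf, which is exactly why the code above
# cannot call np.digitize directly and builds an expression instead.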
def traverse(self):
    # XXX: don't we also need the fields within the target expression?
    # noinspection PyProtectedMember
    yield Variable(self.link._entity, self.link._link_field)
    yield self
def compute(self, context, *args, **kwargs):
    filter_value = kwargs.pop('filter', None)
    missing = kwargs.pop('missing', None)
    # periods = kwargs.pop('periods', None)
    header = kwargs.pop('header', True)
    limit = kwargs.pop('limit', None)
    entity = context.entity

    if args:
        expressions = list(args)
    else:
        # extra=False because we don't want globals nor "system" variables
        # (nan, period, __xxx__)
        # FIXME: we should also somehow "traverse" expressions in this case
        # too (args is ()) => all keys in the current context
        expressions = [Variable(entity, name)
                       for name in context.keys(extra=False)]

    str_expressions = [str(e) for e in expressions]
    if 'id' not in str_expressions:
        str_expressions.insert(0, 'id')
        expressions.insert(0, Variable(entity, 'id'))
        id_pos = 0
    else:
        id_pos = str_expressions.index('id')

    # if (self.periods is not None and len(self.periods) and
    #         'period' not in str_expressions):
    #     str_expressions.insert(0, 'period')
    #     expressions.insert(0, Variable('period'))
    #     id_pos += 1

    columns = []
    for expr in expressions:
        if filter_value is False:
            # dtype does not matter much
            expr_value = np.empty(0)
        else:
            # TODO: set filter before evaluating expressions
            expr_value = expr_eval(expr, context)
            if (filter_value is not None and
                    isinstance(expr_value, np.ndarray) and
                    expr_value.shape):
                expr_value = expr_value[filter_value]
        columns.append(expr_value)

    ids = columns[id_pos]
    if isinstance(ids, np.ndarray) and ids.shape:
        numrows = len(ids)
    else:
        # FIXME: we need a test for this case (no idea how this can happen)
        numrows = 1

    # expand scalar columns to full columns in memory
    for idx, col in enumerate(columns):
        dtype = None
        if not isinstance(col, np.ndarray):
            dtype = type(col)
        elif not col.shape:
            dtype = col.dtype.type
        if dtype is not None:
            # TODO: try using itertools.repeat instead as it seems to be a
            # bit faster and would consume less memory (however, it might
            # not play very well with Pandas.to_csv)
            newcol = np.full(numrows, col, dtype=dtype)
            columns[idx] = newcol
        elif col.ndim > 1:
            # move last axis (should be id axis) first
            # np.moveaxis requires numpy >= 1.11
            # columns[idx] = np.moveaxis(col, -1, 0)
            columns[idx] = col.transpose((-1,) + tuple(range(col.ndim - 1)))

    assert all(isinstance(col, np.ndarray) for col in columns)
    bad_lengths = {str_expr: col.shape
                   for col, str_expr in zip(columns, str_expressions)
                   if col.shape[0] != numrows}
    if bad_lengths:
        raise ValueError(
            "first dimension of some columns are not the same length as "
            "the id column (%d): %s" % (numrows, str(bad_lengths)))

    if limit is not None:
        assert isinstance(limit, (int, long))
        columns = [col[:limit] for col in columns]

    # Transform to Python lists of normal Python types (ie no numpy types).
    # On py2, csv.writer uses repr(value) for floats and str(value) for
    # other types, but on py3, since str(float) == repr(float), it switched
    # to str(value) for everything. However, str(np.float64) does not have
    # full precision (it is truncated at the 12th decimal). Besides,
    # converting seems to be faster (but probably takes more memory).
    # Also, on python2, converting produces nicer/shorter float strings
    # (see issue #225).
    columns = [c.tolist() for c in columns]
    data = zip(*columns)
    if header:
        table = [str_expressions]
        table.extend(data)
    else:
        table = list(data)
    return PrettyTable(table, missing)
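# Illustration (not part of the original source): assuming a hypothetical
# call dump(age, weight) on an entity with three rows and header=True, the
# table passed to PrettyTable is a plain list of rows with the id column
# inserted first, e.g.:
# >>> table
# [['id', 'age', 'weight'],
#  [0, 27, 65.5],
#  [1, 32, 71.2],
#  [2, 45, 80.0]]
# PrettyTable then renders it, substituting `missing` for missing values.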