def load_period_data(self, period):
    if self.lag_fields:
        # TODO: use ColumnArray here
        # XXX: do we need np.empty? (but watch for alias problems)
        self.array_lag = np.empty(len(self.array),
                                  dtype=np.dtype(self.lag_fields))
        for field, _ in self.lag_fields:
            self.array_lag[field] = self.array[field]

    # if not self.indexed_input_table.has_period(period):
    #     # nothing needs to be done in that case
    #     return
    #
    # input_array = self.indexed_input_table.read(period)

    rows = self.input_rows.get(period)
    if rows is None:
        # nothing needs to be done in that case
        return

    start, stop = rows

    # It would be nice to use ColumnArray.from_table and adapt merge_arrays
    # to produce a ColumnArray in all cases, but it is not a huge priority
    # for now
    input_array = self.input_table.read(start, stop)

    self.array, self.id_to_rownum = \
        merge_arrays(self.array, input_array, result_fields='array1',
                     default_values=self.fields.default_values)
    # this can happen, depending on the layout of columns in input_array,
    # but the usual case (in retro) is that self.array is a superset of
    # input_array, in which case merge_arrays returns a ColumnArray
    if not isinstance(self.array, ColumnArray):
        self.array = ColumnArray(self.array)
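# The sketch below illustrates the lag-copy pattern used in load_period_data
# above: copying the lagged fields of a structured array into a freshly
# allocated np.empty yields an independent snapshot (a view or slice would
# alias the live array, which is what the XXX comment warns about). The
# function and field names here are hypothetical, not part of the codebase.

import numpy as np

def copy_lag_fields(array, lag_fields):
    """Return an independent copy of the (name, type) fields in lag_fields."""
    lagged = np.empty(len(array), dtype=np.dtype(lag_fields))
    for name, _ in lag_fields:
        lagged[name] = array[name]  # field-wise copy: no shared memory
    return lagged

# usage sketch: mutating the snapshot leaves the live array untouched
people = np.zeros(3, dtype=[('id', int), ('age', int)])
snapshot = copy_lag_fields(people, [('id', int), ('age', int)])
snapshot['age'] += 1
assert people['age'].sum() == 0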
def _autodiff(self, period, showdiffs=10, raiseondiff=False):
    fields = self._modified_fields
    if not fields:
        return

    fname, numrows = config.autodiff
    h5file = config.autodump_file
    tablepath = '/p{}/{}'.format(period, self._tablename(period))
    print("comparing with {}{} ...".format(fname, tablepath))
    if tablepath in h5file:
        table = h5file.getNode(tablepath)
        disk_array = ColumnArray.from_table(table, stop=numrows)
        diff_array(disk_array, ColumnArray(fields), showdiffs, raiseondiff)
    else:
        print(" SKIPPED (could not find table)")
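# Hedged sketch of the kind of comparison diff_array performs above: compare
# two structured arrays field by field and report up to `showdiffs`
# mismatching rows per field. This is an illustration under assumed
# semantics, not LIAM2's actual diff_array implementation.

import numpy as np

def diff_structured(a, b, showdiffs=10, raiseondiff=False):
    for name in a.dtype.names:
        diff_rows = np.where(a[name] != b[name])[0]
        if len(diff_rows):
            print("field '%s' differs on %d row(s), e.g. rows %s"
                  % (name, len(diff_rows), diff_rows[:showdiffs].tolist()))
            if raiseondiff:
                raise AssertionError("arrays differ on field '%s'" % name)

a = np.array([(1, 10)], dtype=[('id', int), ('age', int)])
b = np.array([(1, 11)], dtype=[('id', int), ('age', int)])
diff_structured(a, b)  # -> field 'age' differs on 1 row(s), e.g. rows [0]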
def load_period_data(self, period):
    if self.lag_fields:
        # TODO: use ColumnArray here
        # XXX: do we need np.empty? (but watch for alias problems)
        self.array_lag = np.empty(len(self.array),
                                  dtype=np.dtype(self.lag_fields))
        for field, _ in self.lag_fields:
            self.array_lag[field] = self.array[field]

    rows = self.input_rows.get(period)
    if rows is None:
        # nothing needs to be done in that case
        return

    start, stop = rows

    # It would be nice to use ColumnArray.from_table and adapt merge_arrays
    # to produce a ColumnArray in all cases, but it is not a huge priority
    # for now
    input_array = self.input_table.read(start, stop)

    self.array, self.id_to_rownum = \
        merge_arrays(self.array, input_array, result_fields='array1')
    # this can happen, depending on the layout of columns in input_array,
    # but the usual case (in retro) is that self.array is a superset of
    # input_array, in which case merge_arrays returns a ColumnArray
    if not isinstance(self.array, ColumnArray):
        self.array = ColumnArray(self.array)
def _autodiff(self, period, numdiff=10, raiseondiff=False):
    fields = self._modified_fields
    if not fields:
        return

    fname, numrows = config.autodiff
    h5file = config.autodump_file
    tablepath = '/{}/{}'.format(period, self._tablename(period))
    print("comparing with {}{} ...".format(fname, tablepath))
    if tablepath in h5file:
        table = h5file.getNode(tablepath)
        disk_array = ColumnArray.from_table(table, stop=numrows)
        diff_array(disk_array, ColumnArray(fields), numdiff, raiseondiff)
    else:
        print(" SKIPPED (could not find table)")
class Entity(object):
    """
    fields is a list of tuples (name, type)
    """

    def __init__(self, name, fields=None, missing_fields=None, links=None,
                 macro_strings=None, process_strings=None, array=None):
        self.name = name

        # we should have exactly one of either array or fields defined
        assert ((fields is None and array is not None) or
                (fields is not None and array is None))

        if array is not None:
            if fields is None:
                fields = get_fields(array)
            array_period = np.min(array['period'])
        else:
            array_period = None

        duplicate_names = [name
                           for name, num
                           in count_occurrences(fname for fname, _ in fields)
                           if num > 1]
        if duplicate_names:
            raise Exception("duplicate fields in entity '%s': %s"
                            % (self.name, ', '.join(duplicate_names)))
        fnames = [name for name, _ in fields]
        if 'id' not in fnames:
            fields.insert(0, ('id', int))
        if 'period' not in fnames:
            fields.insert(0, ('period', int))
        self.fields = fields

        # missing_fields is only used in data (to check that all "required"
        # fields are present in the input file).
        # One potential solution would be to split the fields argument and
        # attribute into input_fields and output_fields (regardless of
        # whether it is split in the simulation/yaml file). However, that
        # might be just a temporary solution as we will soon need more
        # arguments to fields (default values, ranges, etc...).
        # Another solution is to use a Field class, which seems like the
        # better long-term solution.
        self.missing_fields = missing_fields
        self.stored_fields = set(name for name, _ in fields)
        self.links = links

        self.macro_strings = macro_strings
        self.process_strings = process_strings
        self.processes = None

        self.expectedrows = tables.parameters.EXPECTED_ROWS_TABLE
        self.table = None
        self.input_table = None

        self.indexed_input_table = None
        self.indexed_output_table = None

        self.input_rows = {}
        # TODO: it is unnecessary to keep periods which have already been
        # simulated, because (currently) when we go back in time, we always
        # go back using the output table.
        self.input_index = {}

        self.output_rows = {}
        self.output_index = {}

        self.base_period = None
        # we need a separate field, instead of using array['period'], to be
        # able to get the period even when the array is empty
        self.array_period = array_period
        self.array = None

        self.lag_fields = []
        self.array_lag = None

        self.num_tmp = 0
        self.temp_variables = {}
        self.id_to_rownum = None
        self._variables = None
        self._methods = None

    @classmethod
    def from_yaml(cls, ent_name, entity_def):
        from links import Many2One, One2Many

        # YAML "ordered dict" syntax returns a list of dict and we want a
        # list of tuples
        # FIXME: if the "fields" key is present but no field is defined,
        # entity_def.get('fields', []) returns None and this breaks
        fields_def = [d.items()[0] for d in entity_def.get('fields', [])]

        fields = []
        missing_fields = []
        for name, fielddef in fields_def:
            if isinstance(fielddef, dict):
                strtype = fielddef['type']
                if not fielddef.get('initialdata', True):
                    missing_fields.append(name)
            else:
                strtype = fielddef
            fields.append((name,
                           field_str_to_type(strtype, "field '%s'" % name)))

        link_defs = entity_def.get('links', {})
        str2class = {'one2many': One2Many, 'many2one': Many2One}
        links = dict((name,
                      str2class[l['type']](name, l['field'], l['target']))
                     for name, l in link_defs.iteritems())

        return Entity(ent_name, fields, missing_fields, links,
                      entity_def.get('macros', {}),
                      entity_def.get('processes', {}))

    @property
    def local_var_names(self):
        return set(self.temp_variables.keys()) - set(self.variables.keys())

    @classmethod
    def from_table(cls, table):
        return Entity(table.name, get_fields(table), missing_fields=[],
                      links={}, macro_strings={}, process_strings={})

    @staticmethod
    def collect_predictors(items):
        # this excludes lists (procedures) and dicts (while, ...)
        return [k for k, v in items
                if k is not None and isinstance(v, (basestring, int, float))]

    @property
    def variables(self):
        if self._variables is None:
            processes = self.process_strings.items()

            # names of all processes (hybrid or not) of the entity
            process_names = set(k for k, v in processes if k is not None)

            # names of all entity variables (temporary or not) which are set
            # globally
            all_predictors = set(self.collect_predictors(processes))

            stored_fields = self.stored_fields

            # non-callable fields (no variable-procedure for them)
            variables = dict((name, Variable(name, type_))
                             for name, type_ in self.fields
                             if name in stored_fields - process_names)
            # callable fields
            variables.update((name, VariableMethodHybrid(name, self, type_))
                             for name, type_ in self.fields
                             if name in stored_fields & process_names)
            # temporary fields (they are all callable)
            variables.update((name, VariableMethodHybrid(name, self))
                             for name in all_predictors - stored_fields)
            variables.update(self.links)
            self._variables = variables
        return self._variables

    @staticmethod
    def ismethod(v):
        keys = ('args', 'code', 'return')
        return (isinstance(v, list) or
                (isinstance(v, dict) and any(key in v for key in keys)))

    @property
    def methods(self):
        if self._methods is None:
            # variable-method hybrids are handled by the self.variables
            # property
            self._methods = [(key, MethodSymbol(key, self))
                             for key, value
                             in self.process_strings.iteritems()
                             if self.ismethod(value) and
                             key not in self.stored_fields]
        return self._methods

    def check_links(self):
        for name, link in self.links.iteritems():
            # noinspection PyProtectedMember
            target_name = link._target_entity_name
            if target_name not in entity_registry:
                raise Exception("Target of '%s' link in entity '%s' is an "
                                "unknown entity (%s)"
                                % (name, self.name, target_name))

    def get_cond_context(self, entities_visited=None):
        """returns the conditional context: {link: variables}"""

        if entities_visited is None:
            entities_visited = set()
        else:
            entities_visited = entities_visited.copy()
        entities_visited.add(self)
        linked_entities = {}
        for k, link in self.links.items():
            # noinspection PyProtectedMember
            entity = link._target_entity()
            if entity not in entities_visited:
                linked_entities[k] = entity

        cond_context = {}
        # use a set of entities to compute the conditional context only once
        # per target entity
        for entity in set(linked_entities.values()):
            cond_context.update(entity.get_cond_context(entities_visited))

        # entities linked directly take priority over (override) farther ones
        cond_context.update((k, entity.variables)
                            for k, entity in linked_entities.items())
        return cond_context

    conditional_context = property(get_cond_context)

    def all_symbols(self, globals_def):
        from links import PrefixingLink

        symbols = global_variables(globals_def).copy()
        symbols = WarnOverrideDict(symbols)
        symbols.update(self.variables)
        cond_context = self.conditional_context
        macros = dict((k, parse(v, symbols, cond_context))
                      for k, v in self.macro_strings.iteritems())
        symbols.update(macros)
        symbols['other'] = PrefixingLink(macros, self.links, '__other_')
        symbols.update(self.methods)
        return symbols

    def parse_expr(self, k, v, variables, cond_context):
        if isinstance(v, (bool, int, float)):
            return Assignment(v)
        elif isinstance(v, basestring):
            expr = parse(v, variables, cond_context)
            if isinstance(expr, Process):
                return expr
            elif k is None:
                return Compute(expr)
            else:
                return Assignment(expr)
        else:
            # let's be explicit about it
            return None

    def parse_process_group(self, k, v, variables, cond_context, purge=True):
        # v is a procedure: it should be a list of dicts (assignments) or
        # strings (actions)
        group_expressions = [elem.items()[0] if isinstance(elem, dict)
                             else (None, elem)
                             for elem in v]
        group_predictors = self.collect_predictors(group_expressions)
        group_context = variables.copy()
        group_context.update((name, Variable(name))
                             for name in group_predictors)
        sub_processes = self.parse_expressions(group_expressions,
                                               group_context, cond_context)
        return ProcessGroup(k, sub_processes, purge)

    def parse_expressions(self, items, context, cond_context):
        """
        items -- a list of tuples (name, process_string)
        context -- a dict of all symbols available in the scope
        cond_context -- the conditional context ({link: variables})
        """
        processes = []
        for k, v in items:
            if k == 'while':
                if not isinstance(v, dict):
                    raise ValueError("while is a reserved keyword")
                cond = parse(v['cond'], context, cond_context)
                assert isinstance(cond, Expr)
                code = self.parse_process_group("while:code", v['code'],
                                                context, cond_context,
                                                purge=False)
                process = While(cond, code)
            else:
                process = self.parse_expr(k, v, context, cond_context)
                if process is None:
                    if self.ismethod(v):
                        if isinstance(v, list):
                            # v should be a list of dicts (assignments) or
                            # strings (actions)
                            argnames, code_def, result_def = [], v, None
                        else:
                            assert isinstance(v, dict)
                            args_def = v.get('args', '')
                            argnames = [a.strip()
                                        for a in args_def.split(',')
                                        if a != '']
                            code_def = v.get('code', [])
                            result_def = v.get('return')
                        method_context = context.copy()
                        method_context.update((name, Variable(name))
                                              for name in argnames)
                        code = self.parse_process_group("func:code", code_def,
                                                        method_context,
                                                        cond_context,
                                                        purge=False)
                        # TODO: use code.predictors instead (but it currently
                        # fails for some reason)
                        group_expressions = [
                            elem.items()[0] if isinstance(elem, dict)
                            else (None, elem)
                            for elem in code_def]
                        group_predictors = \
                            self.collect_predictors(group_expressions)
                        method_context.update((name, Variable(name))
                                              for name in group_predictors)
                        result = parse(result_def, method_context,
                                       cond_context)
                        assert result is None or isinstance(result, Expr)
                        process = Function(argnames, code, result)
                    elif isinstance(v, dict) and 'predictor' in v:
                        raise ValueError("Using the 'predictor' keyword is "
                                         "not supported anymore. "
                                         "If you need several processes to "
                                         "write to the same variable, you "
                                         "should rather use procedures.")
                    else:
                        raise Exception("unknown expression type for %s: %s"
                                        % (k, type(v)))
            processes.append((k, process))
        return processes

    def parse_processes(self, globals_def):
        processes = self.parse_expressions(self.process_strings.iteritems(),
                                           self.all_symbols(globals_def),
                                           self.conditional_context)

        # attach processes
        # TODO: make actions inherit from Expr instead of Process, and wrap
        # them in a Compute process so that I can kill attach
        for k, v in processes:
            v.attach(k, self)
        self.processes = dict(processes)

    def compute_lagged_fields(self):
        from tfunc import Lag
        from links import LinkValue

        lag_vars = set()
        for p in self.processes.itervalues():
            for expr in p.expressions():
                for node in expr.all_of(Lag):
                    for v in node.all_of(Variable):
                        if not isinstance(v, GlobalVariable):
                            lag_vars.add(v.name)
                    for lv in node.all_of(LinkValue):
                        # noinspection PyProtectedMember
                        lag_vars.add(lv.link._link_field)
                        # noinspection PyProtectedMember
                        target_entity = lv.link._target_entity()
                        if target_entity == self:
                            target_vars = \
                                lv.target_expression.all_of(Variable)
                            lag_vars.update(v.name for v in target_vars)

        if lag_vars:
            # make sure we have an 'id' column, and that it comes first
            # (makes debugging easier). 'id' is always necessary for lag
            # expressions to be able to "expand" the vector of values to the
            # "current" individuals.
            lag_vars.discard('id')
            lag_vars = ['id'] + sorted(lag_vars)

            field_type = dict(self.fields)
            self.lag_fields = [(v, field_type[v]) for v in lag_vars]

    def load_period_data(self, period):
        if self.lag_fields:
            # TODO: use ColumnArray here
            # XXX: do we need np.empty? (but watch for alias problems)
            self.array_lag = np.empty(len(self.array),
                                      dtype=np.dtype(self.lag_fields))
            for field, _ in self.lag_fields:
                self.array_lag[field] = self.array[field]

        rows = self.input_rows.get(period)
        if rows is None:
            # nothing needs to be done in that case
            return

        start, stop = rows

        # It would be nice to use ColumnArray.from_table and adapt
        # merge_arrays to produce a ColumnArray in all cases, but it is not a
        # huge priority for now
        input_array = self.input_table.read(start, stop)

        self.array, self.id_to_rownum = \
            merge_arrays(self.array, input_array, result_fields='array1')
        # this can happen, depending on the layout of columns in input_array,
        # but the usual case (in retro) is that self.array is a superset of
        # input_array, in which case merge_arrays returns a ColumnArray
        if not isinstance(self.array, ColumnArray):
            self.array = ColumnArray(self.array)

    def purge_locals(self):
        """purge all local variables"""
        global max_vars

        temp_vars = self.temp_variables
        local_var_names = self.local_var_names
        num_locals = len(local_var_names)
        if config.debug and num_locals:
            local_vars = [v for k, v in temp_vars.iteritems()
                          if k in local_var_names and
                          isinstance(v, np.ndarray)]
            max_vars = max(max_vars, num_locals)
            temp_mem = sum(v.nbytes for v in local_vars)
            avgsize = sum(v.dtype.itemsize for v in local_vars) / num_locals
            print(("purging {} variables (max {}), will free {} of memory "
                   "(avg field size: {} b)").format(num_locals, max_vars,
                                                    size2str(temp_mem),
                                                    avgsize))
        for var in local_var_names:
            del temp_vars[var]

    def store_period_data(self, period):
        if config.debug:
            temp_mem = sum(v.nbytes
                           for v in self.temp_variables.itervalues()
                           if isinstance(v, np.ndarray))
            main_mem = self.array.nbytes
            print("mem used: %s (main: %s / temp: %s)"
                  % (size2str(temp_mem + main_mem),
                     size2str(main_mem), size2str(temp_mem)))

        # erase all temporary variables which have been computed this period
        self.temp_variables = {}

        if period in self.output_rows:
            raise Exception("trying to modify already simulated rows")
        else:
            startrow = self.table.nrows
            self.array.append_to_table(self.table)
            self.output_rows[period] = (startrow, self.table.nrows)
            self.output_index[period] = self.id_to_rownum
        self.table.flush()

    # def compress_period_data(self, level):
    #     compressed = ca.ctable(self.array, cparams=ca.cparams(level))
    #     print "%d -> %d (%f)" % compressed._get_stats()

    @staticmethod
    def fill_missing_values(ids, values, context, filler='auto'):
        """
        ids: ids present in the past period
        values: values to expand to the current period
        context: current period context
        """
        if filler == 'auto':
            filler = get_missing_value(values)
        result = np.empty(context_length(context), dtype=values.dtype)
        result.fill(filler)
        if len(ids):
            id_to_rownum = context.id_to_rownum
            # if there were more objects in the past than in the current
            # period. Currently, remove() keeps old ids, so this never
            # happens, but if we ever change remove(), we will need to add
            # such a check everywhere we use id_to_rownum.
            # invalid_ids = ids > len(id_to_rownum)
            # if np.any(invalid_ids):
            #     fix ids
            rows = id_to_rownum[ids]
            safe_put(result, rows, values)
        return result

    def value_for_period(self, expr, period, context, fill='auto'):
        sub_context = EntityContext(
            self, {'period': period,
                   '__globals__': context['__globals__']})
        result = expr_eval(expr, sub_context)

        if isinstance(result, np.ndarray) and result.shape:
            ids = expr_eval(Variable('id'), sub_context)
            if fill is None:
                return ids, result
            else:
                # expand values to the current "outer" context
                return self.fill_missing_values(ids, result, context, fill)
        else:
            return result

    def __repr__(self):
        return "<Entity '%s'>" % self.name
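# Standalone sketch of the id -> row expansion that fill_missing_values does
# above: values observed for past-period ids are scattered into an array
# sized for the current period, with a filler for rows that have no past
# value. Here id_to_rownum maps an id to its row in the current array (-1
# when the id no longer exists), mimicking what safe_put guards against; the
# names and the -1 convention are assumptions for illustration.

import numpy as np

def expand_to_current(ids, values, id_to_rownum, current_len, filler):
    result = np.empty(current_len, dtype=values.dtype)
    result.fill(filler)
    rows = id_to_rownum[ids]       # current row of each past id
    valid = rows != -1             # ids which still exist this period
    result[rows[valid]] = values[valid]
    return result

# id 0 -> row 0, id 1 -> gone, id 2 -> row 1; current period has 3 rows
print(expand_to_current(np.array([0, 1, 2]), np.array([10., 20., 30.]),
                        np.array([0, -1, 1]), 3, np.nan))
# -> [10. 30. nan]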
class Entity(object):
    """
    fields is a list of tuples (name, type)
    """

    def __init__(self, name, fields=None, missing_fields=None, links=None,
                 macro_strings=None, process_strings=None, array=None):
        self.name = name

        # we should have exactly one of either array or fields defined
        assert ((fields is None and array is not None) or
                (fields is not None and array is None))

        if array is not None:
            if fields is None:
                fields = get_fields(array)
            array_period = np.min(array['period'])
        else:
            array_period = None

        duplicate_names = [name
                           for name, num
                           in count_occurrences(fname for fname, _ in fields)
                           if num > 1]
        if duplicate_names:
            raise Exception("duplicate fields in entity '%s': %s"
                            % (self.name, ', '.join(duplicate_names)))
        fnames = [name for name, _ in fields]
        if 'id' not in fnames:
            fields.insert(0, ('id', int))
        if 'period' not in fnames:
            fields.insert(0, ('period', int))
        self.fields = fields

        # missing_fields is only used in data (to check that all "required"
        # fields are present in the input file).
        # One potential solution would be to split the fields argument and
        # attribute into input_fields and output_fields (regardless of
        # whether it is split in the simulation/yaml file). However, that
        # might be just a temporary solution as we will soon need more
        # arguments to fields (default values, ranges, etc...).
        # Another solution is to use a Field class, which seems like the
        # better long-term solution.
        self.missing_fields = missing_fields
        self.stored_fields = set(name for name, _ in fields)
        self.links = links

        if macro_strings is None:
            macro_strings = {}
        self.macro_strings = macro_strings

        self.process_strings = process_strings
        self.processes = None

        self.expectedrows = tables.parameters.EXPECTED_ROWS_TABLE
        self.table = None
        self.input_table = None

        self.indexed_input_table = None
        self.indexed_output_table = None

        self.input_rows = {}
        # TODO: it is unnecessary to keep periods which have already been
        # simulated, because (currently) when we go back in time, we always
        # go back using the output table... but periods before the
        # start_period are only present in input_index
        # FIXME: the proper way to fix this is to copy the input_index into
        # the output_index during H5Data.run() and not keep input_index
        # beyond that point.
        self.input_index = {}

        self.output_rows = {}
        self.output_index = {}
        self.output_index_node = None

        self.base_period = None
        # we need a separate field, instead of using array['period'], to be
        # able to get the period even when the array is empty
        self.array_period = array_period
        self.array = array

        self.lag_fields = []
        self.array_lag = None

        self.num_tmp = 0
        self.temp_variables = {}

        self.id_to_rownum = None
        if array is not None:
            rows_per_period, index_per_period = index_table(array)
            self.input_rows = rows_per_period
            self.output_rows = rows_per_period
            self.input_index = index_per_period
            self.output_index = index_per_period
            self.id_to_rownum = index_per_period[array_period]
        self._variables = None
        self._methods = None

    @classmethod
    def from_yaml(cls, ent_name, entity_def):
        from links import Many2One, One2Many

        # YAML "ordered dict" syntax returns a list of dict and we want a
        # list of tuples
        # FIXME: if the "fields" key is present but no field is defined,
        # entity_def.get('fields', []) returns None and this breaks
        fields_def = [d.items()[0] for d in entity_def.get('fields', [])]

        fields = []
        missing_fields = []
        for name, fielddef in fields_def:
            if isinstance(fielddef, dict):
                strtype = fielddef['type']
                if not fielddef.get('initialdata', True):
                    missing_fields.append(name)
            else:
                strtype = fielddef
            fields.append((name,
                           field_str_to_type(strtype, "field '%s'" % name)))

        link_defs = entity_def.get('links', {})
        str2class = {'one2many': One2Many, 'many2one': Many2One}
        links = dict((name,
                      str2class[l['type']](name, l['field'], l['target']))
                     for name, l in link_defs.iteritems())

        return Entity(ent_name, fields, missing_fields, links,
                      entity_def.get('macros', {}),
                      entity_def.get('processes', {}))

    # noinspection PyProtectedMember
    def attach_and_resolve_links(self, entities):
        for link in self.links.itervalues():
            link._attach(self)
            link._resolve_target(entities)

    @property
    def local_var_names(self):
        return set(self.temp_variables.keys()) - set(self.variables.keys())

    @classmethod
    def from_table(cls, table):
        return Entity(table.name, get_fields(table), missing_fields=[],
                      links={}, macro_strings={}, process_strings={})

    @staticmethod
    def collect_predictors(items):
        # this excludes lists (procedures) and dicts (while, ...)
        return [k for k, v in items
                if k is not None and isinstance(v, (basestring, int, float))]

    @property
    def variables(self):
        if self._variables is None:
            if self.process_strings:
                processes = self.process_strings.items()
            else:
                processes = []

            # names of all processes (hybrid or not) of the entity
            process_names = set(k for k, v in processes if k is not None)

            # names of all entity variables (temporary or not) which are set
            # globally
            all_predictors = set(self.collect_predictors(processes))

            stored_fields = self.stored_fields

            # non-callable fields (no variable-procedure for them)
            variables = dict((name, Variable(self, name, type_))
                             for name, type_ in self.fields
                             if name in stored_fields - process_names)
            # callable fields
            variables.update((name, VariableMethodHybrid(self, name, type_))
                             for name, type_ in self.fields
                             if name in stored_fields & process_names)
            # global temporaries (they are all callable)
            variables.update((name, VariableMethodHybrid(self, name))
                             for name in all_predictors - stored_fields)
            variables.update(self.links)
            self._variables = variables
        return self._variables

    @staticmethod
    def ismethod(v):
        keys = ('args', 'code', 'return')
        return (isinstance(v, list) or
                (isinstance(v, dict) and any(key in v for key in keys)))

    @property
    def methods(self):
        if self._methods is None:
            if self.process_strings is None:
                self._methods = []
            else:
                # variable-method hybrids are handled by the self.variables
                # property
                self._methods = \
                    [(k, MethodSymbol(k, self))
                     for k, v in self.process_strings.iteritems()
                     if self.ismethod(v) and k not in self.stored_fields]
        return self._methods

    def all_symbols(self, global_context):
        from links import PrefixingLink

        symbols = WarnOverrideDict(self.variables.copy())
        local_context = global_context.copy()
        local_context[self.name] = symbols
        local_context['__entity__'] = self.name
        macros = dict((k, parse(v, local_context))
                      for k, v in self.macro_strings.iteritems())
        symbols.update(macros)
        symbols['other'] = PrefixingLink(self, macros, self.links,
                                         '__other_')
        symbols.update(self.methods)
        return symbols

    def parse_expr(self, k, v, context):
        if isinstance(v, (bool, int, float)):
            return Assignment(k, self, v)
        elif isinstance(v, basestring):
            return Assignment(k, self, parse(v, context))
        else:
            # let's be explicit about it
            return None

    @staticmethod
    def get_group_context(context, varnames):
        ent_name = context['__entity__']
        entity = context['__entities__'][ent_name]
        group_context = context.copy()
        entity_context = group_context[ent_name].copy()
        entity_context.update((name, Variable(entity, name))
                              for name in varnames)
        group_context[ent_name] = entity_context
        return group_context

    def parse_process_group(self, k, items, context, purge=True):
        # items is a list of [dict (assignment) or string (action)]
        if items is None:
            raise ValueError("no processes in '%s'" % k)
        group_expressions = [elem.items()[0] if isinstance(elem, dict)
                             else (None, elem)
                             for elem in items]
        group_predictors = self.collect_predictors(group_expressions)
        group_context = self.get_group_context(context, group_predictors)
        sub_processes = self.parse_expressions(group_expressions,
                                               group_context)
        return ProcessGroup(k, self, sub_processes, purge)

    def parse_expressions(self, items, context):
        """
        items -- a list of tuples (name, process_string)
        context -- parsing context: a dict of all symbols available for all
                   entities
        """
        processes = []
        for k, v in items:
            if k == 'while':
                if not isinstance(v, dict):
                    raise ValueError("while is a reserved keyword")
                cond = parse(v['cond'], context)
                assert isinstance(cond, Expr)
                code = self.parse_process_group("while:code", v['code'],
                                                context, purge=False)
                process = While(k, self, cond, code)
            else:
                process = self.parse_expr(k, v, context)
                if process is None:
                    if self.ismethod(v):
                        if isinstance(v, list):
                            # v should be a list of dicts (assignments) or
                            # strings (actions)
                            argnames, code_def, result_def = [], v, None
                        else:
                            assert isinstance(v, dict)
                            args_def = v.get('args', '')
                            argnames = [a.strip()
                                        for a in args_def.split(',')
                                        if a != '']
                            code_def = v.get('code', [])
                            result_def = v.get('return')
                        method_context = self.get_group_context(context,
                                                                argnames)
                        code = self.parse_process_group("func:code", code_def,
                                                        method_context,
                                                        purge=False)
                        # TODO: use code.predictors instead (but it currently
                        # fails for some reason) or at least factor this out
                        # with the code in parse_process_group
                        group_expressions = [
                            elem.items()[0] if isinstance(elem, dict)
                            else (None, elem)
                            for elem in code_def]
                        group_predictors = \
                            self.collect_predictors(group_expressions)
                        method_context = self.get_group_context(
                            method_context, group_predictors)
                        result = parse(result_def, method_context)
                        assert result is None or isinstance(result, Expr)
                        process = Function(k, self, argnames, code, result)
                    elif isinstance(v, dict) and 'predictor' in v:
                        raise ValueError("Using the 'predictor' keyword is "
                                         "not supported anymore. "
                                         "If you need several processes to "
                                         "write to the same variable, you "
                                         "should rather use procedures.")
                    else:
                        raise Exception("unknown expression type for %s: %s"
                                        % (k, type(v)))
            processes.append((k, process))
        return processes

    def parse_processes(self, context):
        processes = self.parse_expressions(self.process_strings.iteritems(),
                                           context)
        self.processes = dict(processes)
        # self.ssa()

    # def resolve_method_calls(self):
    #     for p in self.processes.itervalues():
    #         for expr in p.expressions():
    #             for node in expr.all_of(MethodCallToResolve):
    #                 # replace node in the parent node by the "resolved"
    #                 # node
    #                 # TODO: mimic ast.NodeTransformer
    #                 node.resolve()

    def ssa(self):
        fields_versions = collections.defaultdict(int)
        for p in self.processes.itervalues():
            if isinstance(p, ProcessGroup):
                p.ssa(fields_versions)

    def compute_lagged_fields(self):
        from tfunc import Lag
        from links import LinkGet

        lag_vars = set()
        for p in self.processes.itervalues():
            for expr in p.expressions():
                for node in expr.all_of(Lag):
                    for v in node.all_of(Variable):
                        if not isinstance(v, GlobalVariable):
                            lag_vars.add(v.name)
                    for lv in node.all_of(LinkGet):
                        # noinspection PyProtectedMember
                        lag_vars.add(lv.link._link_field)
                        # noinspection PyProtectedMember
                        target_entity = lv.link._target_entity
                        if target_entity == self:
                            target_vars = lv.target_expr.all_of(Variable)
                            lag_vars.update(v.name for v in target_vars)

        if lag_vars:
            # make sure we have an 'id' column, and that it comes first
            # (makes debugging easier). 'id' is always necessary for lag
            # expressions to be able to "expand" the vector of values to the
            # "current" individuals.
            lag_vars.discard('id')
            lag_vars = ['id'] + sorted(lag_vars)

            field_type = dict(self.fields)
            self.lag_fields = [(v, field_type[v]) for v in lag_vars]

    def load_period_data(self, period):
        if self.lag_fields:
            # TODO: use ColumnArray here
            # XXX: do we need np.empty? (but watch for alias problems)
            self.array_lag = np.empty(len(self.array),
                                      dtype=np.dtype(self.lag_fields))
            for field, _ in self.lag_fields:
                self.array_lag[field] = self.array[field]

        rows = self.input_rows.get(period)
        if rows is None:
            # nothing needs to be done in that case
            return

        start, stop = rows

        # It would be nice to use ColumnArray.from_table and adapt
        # merge_arrays to produce a ColumnArray in all cases, but it is not a
        # huge priority for now
        input_array = self.input_table.read(start, stop)

        self.array, self.id_to_rownum = \
            merge_arrays(self.array, input_array, result_fields='array1')
        # this can happen, depending on the layout of columns in input_array,
        # but the usual case (in retro) is that self.array is a superset of
        # input_array, in which case merge_arrays returns a ColumnArray
        if not isinstance(self.array, ColumnArray):
            self.array = ColumnArray(self.array)

    def purge_locals(self):
        """purge all local variables"""
        global max_vars

        temp_vars = self.temp_variables
        local_var_names = self.local_var_names
        num_locals = len(local_var_names)
        if config.debug and num_locals:
            local_vars = [v for k, v in temp_vars.iteritems()
                          if k in local_var_names and
                          isinstance(v, np.ndarray)]
            max_vars = max(max_vars, num_locals)
            temp_mem = sum(v.nbytes for v in local_vars)
            avgsize = sum(v.dtype.itemsize for v in local_vars) / num_locals
            if config.log_level in ("procedures", "processes"):
                print(("purging {} variables (max {}), will free {} of "
                       "memory (avg field size: {} b)")
                      .format(num_locals, max_vars, size2str(temp_mem),
                              avgsize))
        for var in local_var_names:
            del temp_vars[var]

    def flush_index(self, period):
        # keep an in-memory copy of the index for the current period
        self.output_index[period] = self.id_to_rownum

        # also flush it to disk
        h5file = self.output_index_node._v_file
        h5file.create_array(self.output_index_node, "_%d" % period,
                            self.id_to_rownum, "Period %d index" % period)

        # if an old index exists (this is not the case for the first
        # period!), point to the one on the disk, instead of the one in
        # memory, effectively clearing the one in memory
        idxname = '_%d' % (period - 1)
        if idxname in self.output_index_node:
            prev_disk_array = getattr(self.output_index_node, idxname)
            # DiskBackedArray is a workaround for pytables#360 (see above)
            self.output_index[period - 1] = DiskBackedArray(prev_disk_array)

    def store_period_data(self, period):
        if config.debug and config.log_level in ("procedures", "processes"):
            temp_mem = sum(v.nbytes
                           for v in self.temp_variables.itervalues()
                           if isinstance(v, np.ndarray))
            main_mem = self.array.nbytes
            print("mem used: %s (main: %s / temp: %s)"
                  % (size2str(temp_mem + main_mem),
                     size2str(main_mem), size2str(temp_mem)))

        # erase all temporary variables which have been computed this period
        self.temp_variables = {}

        if period in self.output_rows:
            raise Exception("trying to modify already simulated rows")

        startrow = self.table.nrows
        self.array.append_to_table(self.table)
        self.output_rows[period] = (startrow, self.table.nrows)
        self.flush_index(period)
        self.table.flush()

    # def compress_period_data(self, level):
    #     compressed = bcolz.ctable(self.array, cparams=bcolz.cparams(level))
    #     print "%d -> %d (%f)" % compressed._get_stats()

    def optimize_processes(self):
        """
        Common subexpression elimination
        """
        # XXX:
        # * we either need to do SSA first, or for each assignment process,
        #   "forget" all expressions containing the assigned variable.
        #   Doing it using SSA seems cleaner, but in the end it shouldn't
        #   change much.
        #   If we do not do SSA, we will need to "forget" about expressions
        #   which contain an assigned variable at *each step* of the process,
        #   including when simply counting the number of occurrences of
        #   expressions. In that case we also need to iterate on the
        #   processes in the same order as in the simulation!
        # * I don't know if it is a good idea to optimize cross-procedures.
        #   On one hand it offers many more possibilities for optimizations
        #   but, on the other hand, the optimization pass might just take too
        #   much time... If we do not do it globally, we should move the
        #   method to ProcessGroup instead. But let's try it cross-procedures
        #   first.
        # * cross-procedures might get tricky when we take function calls
        #   into account.
        # TODO:
        # * it will be simpler and better to do this in two passes: first
        #   find duplicated expressions and the number of occurrences of
        #   each, then proceed with the factorization
        expr_count = collections.Counter()
        for p in self.processes.itervalues():
            for expr in p.expressions():
                for subexpr in expr.traverse():
                    if isinstance(subexpr, Expr) and \
                            not isinstance(subexpr, Variable):
                        expr_count[subexpr] += 1
        print()
        print("most common expressions")
        print("=" * 20)
        print(expr_count.most_common(100))

        # if count(larger) <= count(smaller) <= count(larger) + 1:
        #     kill smaller
        # if count(smaller) > count(larger) + 1:
        #     do both (larger uses smaller)

        # seen = {}
        # for p in self.processes.itervalues():
        #     for expr in p.expressions():
        #         for subexpr in expr.traverse():
        #             if subexpr in seen:
        #                 original = seen[subexpr]
        #                 # 1) add an assignment process before the process
        #                 #    of the "original" expression to compute a
        #                 #    temporary variable
        #                 # 2) modify "original" expr to use the temp var
        #                 # 3) modify the current expr to use the temp var
        #             else:
        #                 seen[subexpr] = subexpr

    def __repr__(self):
        return "<Entity '%s'>" % self.name

    def __str__(self):
        return self.name
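# Illustrative sketch (not the real parsing API) of the context layering done
# by get_group_context above: the parsing context is copied and the current
# entity's symbol table is overlaid with one symbol per name assigned in the
# group, so later expressions in the same group can reference them without
# mutating the outer context. All names below are hypothetical.

def layered_context(context, ent_name, varnames, make_symbol):
    group_context = context.copy()
    entity_symbols = group_context[ent_name].copy()
    entity_symbols.update((name, make_symbol(name)) for name in varnames)
    group_context[ent_name] = entity_symbols
    return group_context

outer = {'person': {'age': '<Variable age>'}}
inner = layered_context(outer, 'person', ['income'],
                        lambda name: '<Variable %s>' % name)
assert 'income' in inner['person'] and 'income' not in outer['person']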
class Entity(object):
    def __init__(self, name, fields=None, links=None, macro_strings=None,
                 process_strings=None, array=None):
        """
        Parameters
        ----------
        name : str
        fields : list of tuples (name, type), optional
        links : {name: links.Link}, optional
        macro_strings : dict, optional
        process_strings : dict, optional
        array : numpy structured array, optional
        """
        self.name = name

        # we should have exactly one of either array or fields defined
        assert ((fields is None and array is not None) or
                (fields is not None and array is None))

        if array is not None:
            if fields is None:
                fields = get_fields(array)
            array_period = np.min(array['period'])
        else:
            array_period = None

        if not isinstance(fields, FieldCollection):
            def fdef2field(name, fielddef):
                initialdata = True
                output = True
                default_value = None
                if isinstance(fielddef, Field):
                    return fielddef
                elif isinstance(fielddef, (dict, str)):
                    if isinstance(fielddef, dict):
                        strtype = fielddef['type']
                        initialdata = fielddef.get('initialdata', True)
                        output = fielddef.get('output', True)
                        default_value = fielddef.get(
                            'default', default_value_by_strtype[strtype])
                    elif isinstance(fielddef, str):
                        strtype = fielddef
                        default_value = default_value_by_strtype[strtype]
                    else:
                        raise Exception('invalid field definition')
                    dtype = field_str_to_type(strtype, "field '%s'" % name)
                else:
                    assert isinstance(fielddef, type)
                    dtype = normalize_type(fielddef)
                return Field(name, dtype, initialdata, output, default_value)

            fields = FieldCollection(fdef2field(name, fdef)
                                     for name, fdef in fields)

        duplicate_names = [name
                           for name, num in count_occurrences(fields.names)
                           if num > 1]
        if duplicate_names:
            raise Exception("duplicate fields in entity '%s': %s"
                            % (self.name, ', '.join(duplicate_names)))
        fnames = set(fields.names)
        if 'id' not in fnames:
            fields.insert(0, Field('id', int))
        if 'period' not in fnames:
            fields.insert(0, Field('period', int))
        self.fields = fields
        self.links = links

        if macro_strings is None:
            macro_strings = {}
        self.macro_strings = macro_strings

        self.process_strings = process_strings
        self.processes = None

        self.expectedrows = tables.parameters.EXPECTED_ROWS_TABLE
        self.table = None
        self.input_table = None

        self.indexed_input_table = None
        self.indexed_output_table = None

        self.input_rows = {}
        # TODO: it is unnecessary to keep periods which have already been
        # simulated, because (currently) when we go back in time, we always
        # go back using the output table... but periods before the
        # start_period are only present in input_index
        self.input_index = {}

        self.output_rows = {}
        self.output_index = {}
        self.output_index_node = None

        self.base_period = None
        # we need a separate field, instead of using array['period'], to be
        # able to get the period even when the array is empty
        self.array_period = array_period
        self.array = array

        self.lag_fields = []
        self.array_lag = None

        self.num_tmp = 0
        self.temp_variables = {}

        self.id_to_rownum = None
        if array is not None:
            rows_per_period, index_per_period = index_table(array)
            self.input_rows = rows_per_period
            self.output_rows = rows_per_period
            self.input_index = index_per_period
            self.output_index = index_per_period
            self.id_to_rownum = index_per_period[array_period]
        self._variables = None
        self._methods = None

    @classmethod
    def from_yaml(cls, ent_name, entity_def):
        from links import Many2One, One2Many

        # YAML "ordered dict" syntax returns a list of dict and we want a
        # list of tuples
        # FIXME: if the "fields" key is present but no field is defined,
        # entity_def.get('fields', []) returns None and this breaks
        fields_def = [d.items()[0] for d in entity_def.get('fields', [])]

        link_defs = entity_def.get('links', {})
        str2class = {'one2many': One2Many, 'many2one': Many2One}
        links = dict((name,
                      str2class[l['type']](name, l['field'], l['target']))
                     for name, l in link_defs.iteritems())

        return Entity(ent_name, fields_def, links,
                      entity_def.get('macros', {}),
                      entity_def.get('processes', {}))

    # noinspection PyProtectedMember
    def attach_and_resolve_links(self, entities):
        for link in self.links.itervalues():
            link._attach(self)
            link._resolve_target(entities)

    @property
    def local_var_names(self):
        return set(self.temp_variables.keys()) - set(self.variables.keys())

    @classmethod
    def from_table(cls, table):
        return Entity(table.name, get_fields(table), links={},
                      macro_strings={}, process_strings={})

    @staticmethod
    def collect_predictors(items):
        # this excludes lists (functions) and dicts (while, ...)
        return [k for k, v in items
                if k is not None and isinstance(v, (basestring, int, float))]

    @property
    def variables(self):
        if self._variables is None:
            if self.process_strings:
                processes = self.process_strings.items()
            else:
                processes = []

            # names of all processes (hybrid or not) of the entity
            process_names = set(k for k, v in processes if k is not None)

            # names of all entity variables (temporary or not) which are set
            # globally
            all_predictors = set(self.collect_predictors(processes))

            field_names = set(self.fields.names)

            # normal fields (non-callable/no hybrid variable-function for
            # them)
            variables = dict((name, Variable(self, name, type_))
                             for name, type_ in self.fields.name_types
                             if name in field_names - process_names)
            # callable fields (fields with a process of the same name)
            variables.update((name, VariableMethodHybrid(self, name, type_))
                             for name, type_ in self.fields.name_types
                             if name in field_names & process_names)
            # global temporaries (they are all callable)
            variables.update((name, VariableMethodHybrid(self, name))
                             for name in all_predictors - field_names)
            variables.update(self.links)
            self._variables = variables
        return self._variables

    @staticmethod
    def ismethod(v):
        keys = ('args', 'code', 'return')
        return (isinstance(v, list) or
                (isinstance(v, dict) and any(key in v for key in keys)))

    @property
    def methods(self):
        if self._methods is None:
            pstrings = self.process_strings
            items = pstrings.iteritems() if pstrings is not None else ()
            # variable-method hybrids are handled by the self.variables
            # property
            stored_fields = set(self.fields.in_output.names)
            methodnames = [k for k, v in items
                           if self.ismethod(v) and k not in stored_fields]
            # factorial(n) -> factorial
            methodnames = [split_signature(name)[0] if '(' in name else name
                           for name in methodnames]
            self._methods = [(name, MethodSymbol(name, self))
                             for name in methodnames]
        return self._methods

    def all_symbols(self, global_context):
        from links import PrefixingLink

        symbols = WarnOverrideDict(self.variables.copy())
        local_context = global_context.copy()
        local_context[self.name] = symbols
        local_context['__entity__'] = self.name
        macros = dict((k, parse(v, local_context))
                      for k, v in self.macro_strings.iteritems())
        symbols.update(macros)
        symbols['other'] = PrefixingLink(self, macros, self.links,
                                         '__other_')
        symbols.update(self.methods)
        return symbols

    def parse_expr(self, k, v, context):
        if isinstance(v, (bool, int, float)):
            return Assignment(k, self, v)
        elif isinstance(v, basestring):
            return Assignment(k, self, parse(v, context))
        else:
            # let's be explicit about it
            return None

    @staticmethod
    def get_group_context(context, varnames):
        ent_name = context['__entity__']
        entity = context['__entities__'][ent_name]
        group_context = context.copy()
        entity_context = group_context[ent_name].copy()
        entity_context.update((name, Variable(entity, name))
                              for name in varnames)
        group_context[ent_name] = entity_context
        return group_context

    def parse_process_group(self, k, items, context, purge=True):
        # items is a list of [dict (assignment) or string (action)]
        if items is None:
            raise ValueError("no processes in '%s'" % k)
        group_expressions = [elem.items()[0] if isinstance(elem, dict)
                             else (None, elem)
                             for elem in items]
        group_predictors = self.collect_predictors(group_expressions)
        group_context = self.get_group_context(context, group_predictors)
        sub_processes = self.parse_expressions(group_expressions,
                                               group_context)
        return ProcessGroup(k, self, sub_processes, purge)

    # Once we can make it an error for non-function processes/statements,
    # we should probably split this method into parse_functions and
    # parse_function_body.
    def parse_expressions(self, items, context, functions_only=False):
        """
        items -- a list of tuples (name, process_string)
        context -- parsing context: a dict of all symbols available for all
                   entities
        functions_only -- whether processes defined outside of a function
                          should trigger a deprecation warning
        """
        processes = []
        for k, v in items:
            if k == 'while':
                if isinstance(v, dict):
                    raise SyntaxError("""
This syntax for while is not supported anymore:
  - while:
      cond: {cond_expr}
      code:
          - ...
Please use this instead:
  - while {cond_expr}:
      - ...
""".format(cond_expr=v['cond'])) else: raise ValueError("while is a reserved keyword") elif k is not None and k.startswith('while '): if not isinstance(v, list): raise SyntaxError("while is a reserved keyword") cond = parse(k[6:].strip(), context) assert isinstance(cond, Expr) code = self.parse_process_group("while_code", v, context, purge=False) process = While(k, self, cond, code) elif k == 'return': e = SyntaxError("return is a reserved keyword. To return " "from a function, use 'return expr' " "instead of 'return: expr'") e.liam2context = "while parsing: return: {}".format(v) raise e elif k is None and isinstance(v, str) and v.startswith('return'): assert len(v) == 6 or v[6] == ' ' if len(v) > 6: result_def = v[7:].strip() else: result_def = None result_expr = parse(result_def, context) process = Return(None, self, result_expr) else: process = self.parse_expr(k, v, context) if process is not None and functions_only: if k in self.fields.names: msg = """defining a process outside of a function is deprecated because it is ambiguous. You should: * wrap the '{name}: {expr}' assignment inside a function like this: compute_{name}: # you can name it any way you like but simply \ '{name}' is not recommended ! - {name}: {expr} * update the simulation.processes list to use 'compute_{name}' (the function \ name) instead of '{name}'. """ else: msg = """defining a process outside of a function is \ deprecated because it is ambiguous. 1) If '{name}: {expr}' is an assignment ('{name}' stores the result of \ '{expr}'), you should: * wrap the assignment inside a function, for example, like this: compute_{name}: # you can name it any way you like but simply \ '{name}' is not recommended ! - {name}: {expr} * update the simulation.processes list to use 'compute_{name}' (the function \ name) instead of '{name}'. * add '{name}' in the entities fields with 'output: False' 2) otherwise if '{expr}' is an expression which does not return any value, you \ can simply transform it into a function, like this: {name}: - {expr} """ warnings.warn(msg.format(name=k, expr=v), UserDeprecationWarning) if process is None: if self.ismethod(v): if isinstance(v, dict): args = v.get('args', '') code = v.get('code', '') result = v.get('return', '') oldargs = "\n args: {}".format(args) \ if args else '' oldcode = "\n code:\n - ..." \ if code else '' newcode = "\n - ..." 
                            newcode = "\n    - ..." if code else ''
                            oldresult = "\n        return: " + result \
                                if result else ''
                            newresult = "\n    - return " + result \
                                if result else ''
                            template = """
This syntax for defining functions with arguments or a return value is not
supported anymore:
    {funcname}:{oldargs}{oldcode}{oldresult}

Please use this instead:
    {funcname}({newargs}):{newcode}{newresult}"""
                            msg = template.format(funcname=k,
                                                  oldargs=oldargs,
                                                  oldcode=oldcode,
                                                  oldresult=oldresult,
                                                  newargs=args,
                                                  newcode=newcode,
                                                  newresult=newresult)
                            raise SyntaxError(msg)

                        assert isinstance(v, list)
                        # v should be a list of dicts (assignments) or
                        # strings (actions)
                        if "(" in k:
                            k, args = split_signature(k)
                            argnames = argspec(args).args
                            code_def, result_def = v, None
                        else:
                            argnames, code_def, result_def = [], v, None
                        method_context = self.get_group_context(context,
                                                                argnames)
                        code = self.parse_process_group(k + "_code",
                                                        code_def,
                                                        method_context,
                                                        purge=False)
                        # TODO: use code.predictors instead (but it currently
                        # fails for some reason) or at least factor this out
                        # with the code in parse_process_group
                        group_expressions = [
                            elem.items()[0] if isinstance(elem, dict)
                            else (None, elem)
                            for elem in code_def]
                        group_predictors = \
                            self.collect_predictors(group_expressions)
                        method_context = self.get_group_context(
                            method_context, group_predictors)
                        result_expr = parse(result_def, method_context)
                        assert result_expr is None or \
                            isinstance(result_expr, Expr)
                        process = Function(k, self, argnames, code,
                                           result_expr)
                    elif isinstance(v, dict) and 'predictor' in v:
                        raise ValueError("Using the 'predictor' keyword is "
                                         "not supported anymore. "
                                         "If you need several processes to "
                                         "write to the same variable, you "
                                         "should rather use functions.")
                    elif k is None and v is None:
                        raise ValueError("empty process found ('-')")
                    else:
                        raise Exception("unknown expression type for "
                                        "%s: %s (%s)" % (k, v, type(v)))
            processes.append((k, process))
        return processes

    def parse_processes(self, context):
        processes = self.parse_expressions(self.process_strings.iteritems(),
                                           context, functions_only=True)
        self.processes = dict(processes)
        # self.ssa()

    # def resolve_method_calls(self):
    #     for p in self.processes.itervalues():
    #         for expr in p.expressions():
    #             for node in expr.all_of(MethodCallToResolve):
    #                 # replace node in the parent node by the "resolved"
    #                 # node
    #                 # TODO: mimic ast.NodeTransformer
    #                 node.resolve()

    def ssa(self):
        fields_versions = collections.defaultdict(int)
        for p in self.processes.itervalues():
            if isinstance(p, ProcessGroup):
                p.ssa(fields_versions)

    def compute_lagged_fields(self, inspect_one_period=True):
        from tfunc import Lag
        from links import LinkGet

        lag_vars = collections.defaultdict(set)
        for p in self.processes.itervalues():
            for expr in p.expressions():
                for node in expr.all_of((Lag, ValueForPeriod)):
                    if isinstance(node, Lag):
                        num_periods = node.args[1]
                    else:
                        assert isinstance(node, ValueForPeriod)
                        period = node.args[1]
                        # is the period argument equal to "period - X"?
                        if (isinstance(period, BinaryOp) and
                                period.op == '-' and
                                isinstance(period.expr1, Variable) and
                                period.expr1.name == 'period'):
                            num_periods = period.expr2
                        else:
                            num_periods = None

                    # if num_periods is an Expr, we cannot really tell
                    # whether or not it is 1 or more, so we must always take
                    # the node
                    if num_periods is not None and np.isscalar(num_periods):
                        inspect_expr = ((num_periods == 1) ==
                                        inspect_one_period)
                    else:
                        # the safe thing is to take everything when not sure
                        inspect_expr = True

                    if inspect_expr:
                        expr_node = node.args[0]
                        for v in expr_node.all_of(Variable):
                            if not isinstance(v, GlobalVariable):
                                lag_vars[v.entity].add(v.name)
                        for lv in expr_node.all_of(LinkGet):
                            # noinspection PyProtectedMember
                            lag_vars[lv.link._entity].add(
                                lv.link._link_field)
                            target_vars = list(
                                lv.target_expr.all_of(Variable))
                            assert all(v.entity is not None
                                       for v in target_vars)
                            for v in target_vars:
                                lag_vars[v.entity].add(v.name)
        return lag_vars

    def build_period_array(self, start_period):
        self.array, self.id_to_rownum = \
            build_period_array(self.input_table,
                               list(self.fields.name_types),
                               self.input_rows, self.input_index,
                               start_period,
                               default_values=self.fields.default_values)
        assert isinstance(self.array, ColumnArray)
        self.array_period = start_period

    def load_period_data(self, period):
        if self.lag_fields:
            # TODO: use ColumnArray here
            # XXX: do we need np.empty? (but watch for alias problems)
            self.array_lag = np.empty(len(self.array),
                                      dtype=np.dtype(self.lag_fields))
            for field, _ in self.lag_fields:
                self.array_lag[field] = self.array[field]

        # if not self.indexed_input_table.has_period(period):
        #     # nothing needs to be done in that case
        #     return
        #
        # input_array = self.indexed_input_table.read(period)

        rows = self.input_rows.get(period)
        if rows is None:
            # nothing needs to be done in that case
            return

        start, stop = rows

        # It would be nice to use ColumnArray.from_table and adapt
        # merge_arrays to produce a ColumnArray in all cases, but it is not a
        # huge priority for now
        input_array = self.input_table.read(start, stop)

        self.array, self.id_to_rownum = \
            merge_arrays(self.array, input_array, result_fields='array1',
                         default_values=self.fields.default_values)
        # this can happen, depending on the layout of columns in input_array,
        # but the usual case (in retro) is that self.array is a superset of
        # input_array, in which case merge_arrays returns a ColumnArray
        if not isinstance(self.array, ColumnArray):
            self.array = ColumnArray(self.array)

    def purge_locals(self):
        """purge all local variables"""
        global max_vars

        temp_vars = self.temp_variables
        local_var_names = self.local_var_names
        num_locals = len(local_var_names)
        if config.debug and num_locals:
            local_vars = [v for k, v in temp_vars.iteritems()
                          if k in local_var_names]
            max_vars = max(max_vars, num_locals)
            temp_mem = sum(sys.getsizeof(v) +
                           (v.nbytes if isinstance(v, np.ndarray) else 0)
                           for v in local_vars)
            avgsize = sum(v.dtype.itemsize
                          if isinstance(v, np.ndarray) else 0
                          for v in local_vars) / num_locals
            if config.log_level in ("functions", "processes"):
                print(("purging {} variables (max {}), will free {} of "
                       "memory (avg field size: {} b)")
                      .format(num_locals, max_vars, size2str(temp_mem),
                              avgsize))
        for var in local_var_names:
            del temp_vars[var]

    def flush_index(self, period):
        # keep an in-memory copy of the index for the current period
        self.output_index[period] = self.id_to_rownum

        # also flush it to disk
        # noinspection PyProtectedMember
        h5file = self.output_index_node._v_file
        h5file.create_array(self.output_index_node, "_%d" % period,
                            self.id_to_rownum, "Period %d index" % period)

        # if an old index exists (this is not the case for the first
        # period!), point to the one on the disk, instead of the one in
        # memory, effectively clearing the one in memory
        idxname = '_%d' % (period - 1)
        if idxname in self.output_index_node:
            prev_disk_array = getattr(self.output_index_node, idxname)
            # DiskBackedArray is a workaround for pytables#360 (see above)
            self.output_index[period - 1] = DiskBackedArray(prev_disk_array)

    def store_period_data(self, period):
        if config.debug and config.log_level in ("functions", "processes"):
            temp_mem = sum(v.nbytes
                           for v in self.temp_variables.itervalues()
                           if isinstance(v, np.ndarray))
            main_mem = self.array.nbytes
            print("mem used: %s (main: %s / temp: %s)"
                  % (size2str(temp_mem + main_mem),
                     size2str(main_mem), size2str(temp_mem)))

        # erase all temporary variables which have been computed this period
        self.temp_variables = {}

        if period in self.output_rows:
            raise Exception("trying to modify already simulated rows")

        if self.table is not None:
            startrow = self.table.nrows
            self.array.append_to_table(self.table)
            self.output_rows[period] = (startrow, self.table.nrows)
            self.flush_index(period)
            self.table.flush()

    # def compress_period_data(self, level):
    #     compressed = bcolz.ctable(self.array,
    #                               cparams=bcolz.cparams(level))
    #     print "%d -> %d (%f)" % compressed._get_stats()

    def optimize_processes(self):
        """
        Common subexpression elimination
        """
        # XXX:
        # * we either need to do SSA first, or for each assignment process,
        #   "forget" all expressions containing the assigned variable.
        #   Doing it using SSA seems cleaner, but in the end it shouldn't
        #   change much. If we do not do SSA, we will need to "forget" about
        #   expressions which contain an assigned variable at *each step* of
        #   the process, including when simply counting the number of
        #   occurrences of expressions. In that case we also need to iterate
        #   on the processes in the same order as in the simulation!
        # * I don't know if it is a good idea to optimize cross-functions.
        #   On one hand it offers many more possibilities for optimizations
        #   but, on the other hand, the optimization pass might just take too
        #   much time... If we do not do it globally, we should move the
        #   method to ProcessGroup instead. But let's try it cross-functions
        #   first.
        # * cross-functions might get tricky when we take function calls
        #   into account.
        # TODO:
        # * it will be simpler and better to do this in two passes: first
        #   find duplicated expressions and the number of occurrences of
        #   each, then proceed with the factorization
        expr_count = collections.Counter()
        for p in self.processes.itervalues():
            for expr in p.expressions():
                for subexpr in expr.traverse():
                    if isinstance(subexpr, Expr) and \
                            not isinstance(subexpr, Variable):
                        expr_count[subexpr] += 1
        print()
        print("most common expressions")
        print("=" * 20)
        print(expr_count.most_common(100))

        # if count(larger) <= count(smaller) <= count(larger) + 1:
        #     kill smaller
        # if count(smaller) > count(larger) + 1:
        #     do both (larger uses smaller)

        # seen = {}
        # for p in self.processes.itervalues():
        #     for expr in p.expressions():
        #         for subexpr in expr.traverse():
        #             if subexpr in seen:
        #                 original = seen[subexpr]
        #                 # 1) add an assignment process before the process
        #                 #    of the "original" expression to compute a
        #                 #    temporary variable
        #                 # 2) modify "original" expr to use the temp var
        #                 # 3) modify the current expr to use the temp var
        #             else:
        #                 seen[subexpr] = subexpr

    def __repr__(self):
        return "<Entity '%s'>" % self.name

    def __str__(self):
        return self.name
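# Minimal, self-contained sketch of the expression-counting pass in
# optimize_processes above, using nested tuples as stand-in expression nodes
# (the real code walks Expr trees via .traverse()). Repeated subexpressions
# with high counts are the candidates for factorization into temporaries.

import collections

def subexprs(node):
    """Yield a tuple-based expression node and all of its sub-nodes."""
    yield node
    if isinstance(node, tuple):
        for child in node[1:]:       # node[0] is the operator
            for sub in subexprs(child):
                yield sub

exprs = [('*', ('+', 'a', 'b'), 'c'),
         ('-', ('+', 'a', 'b'), 'd')]
counts = collections.Counter(sub for expr in exprs for sub in subexprs(expr)
                             if isinstance(sub, tuple))
print(counts.most_common(1))  # [(('+', 'a', 'b'), 2)] -> a CSE candidate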
class Entity(object):
    """
    fields is a list of tuples (name, type)
    """

    def __init__(self, name, fields=None, links=None, macro_strings=None,
                 process_strings=None, array=None):
        self.name = name

        # we should have exactly one of either array or fields defined
        assert ((fields is None and array is not None) or
                (fields is not None and array is None))

        if array is not None:
            if fields is None:
                fields = get_fields(array)
            array_period = np.min(array['period'])
        else:
            array_period = None

        if not isinstance(fields, FieldCollection):
            def fdef2field(name, fielddef):
                initialdata = True
                output = True
                default_value = None
                if isinstance(fielddef, Field):
                    return fielddef
                elif isinstance(fielddef, (dict, str)):
                    if isinstance(fielddef, dict):
                        strtype = fielddef['type']
                        initialdata = fielddef.get('initialdata', True)
                        output = fielddef.get('output', True)
                        default_value = fielddef.get(
                            'default', default_value_by_strtype[strtype])
                    elif isinstance(fielddef, str):
                        strtype = fielddef
                        default_value = default_value_by_strtype[strtype]
                    else:
                        raise Exception('invalid field definition')
                    dtype = field_str_to_type(strtype, "field '%s'" % name)
                else:
                    assert isinstance(fielddef, type)
                    dtype = normalize_type(fielddef)
                return Field(name, dtype, initialdata, output, default_value)

            fields = FieldCollection(fdef2field(name, fdef)
                                     for name, fdef in fields)

        duplicate_names = [name
                           for name, num in count_occurrences(fields.names)
                           if num > 1]
        if duplicate_names:
            raise Exception("duplicate fields in entity '%s': %s"
                            % (self.name, ', '.join(duplicate_names)))
        fnames = set(fields.names)
        if 'id' not in fnames:
            fields.insert(0, Field('id', int))
        if 'period' not in fnames:
            fields.insert(0, Field('period', int))
        self.fields = fields
        self.links = links

        if macro_strings is None:
            macro_strings = {}
        self.macro_strings = macro_strings

        self.process_strings = process_strings
        self.processes = None

        self.expectedrows = tables.parameters.EXPECTED_ROWS_TABLE
        self.table = None
        self.input_table = None

        self.indexed_input_table = None
        self.indexed_output_table = None

        self.input_rows = {}
        # TODO: it is unnecessary to keep periods which have already been
        # simulated, because (currently) when we go back in time, we always
        # go back using the output table... but periods before the
        # start_period are only present in input_index
        self.input_index = {}

        self.output_rows = {}
        self.output_index = {}
        self.output_index_node = None

        self.base_period = None
        # we need a separate field, instead of using array['period'], to be
        # able to get the period even when the array is empty
self.array_period = array_period self.array = array self.lag_fields = [] self.array_lag = None self.num_tmp = 0 self.temp_variables = {} self.id_to_rownum = None if array is not None: rows_per_period, index_per_period = index_table(array) self.input_rows = rows_per_period self.output_rows = rows_per_period self.input_index = index_per_period self.output_index = index_per_period self.id_to_rownum = index_per_period[array_period] self._variables = None self._methods = None @classmethod def from_yaml(cls, ent_name, entity_def): from links import Many2One, One2Many # YAML "ordered dict" syntax returns a list of dict and we want a list # of tuples. Note that entity_def.get('fields', []) returns None (not # []) when the "fields" key is present but no field is defined, hence # the explicit "or []" guard. fields_def = [d.items()[0] for d in entity_def.get('fields') or []] link_defs = entity_def.get('links', {}) str2class = {'one2many': One2Many, 'many2one': Many2One} links = dict((name, str2class[l['type']](name, l['field'], l['target'])) for name, l in link_defs.iteritems()) return Entity(ent_name, fields_def, links, entity_def.get('macros', {}), entity_def.get('processes', {})) # noinspection PyProtectedMember def attach_and_resolve_links(self, entities): for link in self.links.itervalues(): link._attach(self) link._resolve_target(entities) @property def local_var_names(self): return set(self.temp_variables.keys()) - set(self.variables.keys()) @classmethod def from_table(cls, table): return Entity(table.name, get_fields(table), links={}, macro_strings={}, process_strings={}) @staticmethod def collect_predictors(items): # this excludes lists (functions) and dict (while, ...) return [k for k, v in items if k is not None and isinstance(v, (basestring, int, float))] @property def variables(self): if self._variables is None: if self.process_strings: processes = self.process_strings.items() else: processes = [] # names of all processes (hybrid or not) of the entity process_names = set(k for k, v in processes if k is not None) # names of all entity variables (temporary or not) which are set # globally all_predictors = set(self.collect_predictors(processes)) field_names = set(self.fields.names) # normal fields (non-callable/no hybrid variable-function for them) variables = dict((name, Variable(self, name, type_)) for name, type_ in self.fields.name_types if name in field_names - process_names) # callable fields (fields with a process of the same name) variables.update((name, VariableMethodHybrid(self, name, type_)) for name, type_ in self.fields.name_types if name in field_names & process_names) # global temporaries (they are all callable).
variables.update((name, VariableMethodHybrid(self, name)) for name in all_predictors - field_names) variables.update(self.links) self._variables = variables return self._variables @staticmethod def ismethod(v): keys = ('args', 'code', 'return') return (isinstance(v, list) or isinstance(v, dict) and any(key in v for key in keys)) @property def methods(self): if self._methods is None: pstrings = self.process_strings items = pstrings.iteritems() if pstrings is not None else () # variable-method hybrids are handled by the self.variables property stored_fields = set(self.fields.in_output.names) methodnames = [k for k, v in items if self.ismethod(v) and k not in stored_fields] # factorial(n) -> factorial methodnames = [split_signature(name)[0] if '(' in name else name for name in methodnames] self._methods = [(name, MethodSymbol(name, self)) for name in methodnames] return self._methods def all_symbols(self, global_context): from links import PrefixingLink symbols = WarnOverrideDict(self.variables.copy()) local_context = global_context.copy() local_context[self.name] = symbols local_context['__entity__'] = self.name macros = dict((k, parse(v, local_context)) for k, v in self.macro_strings.iteritems()) symbols.update(macros) symbols['other'] = PrefixingLink(self, macros, self.links, '__other_') symbols.update(self.methods) return symbols def parse_expr(self, k, v, context): if isinstance(v, (bool, int, float)): return Assignment(k, self, v) elif isinstance(v, basestring): return Assignment(k, self, parse(v, context)) else: # let's be explicit about it return None @staticmethod def get_group_context(context, varnames): ent_name = context['__entity__'] entity = context['__entities__'][ent_name] group_context = context.copy() entity_context = group_context[ent_name].copy() entity_context.update((name, Variable(entity, name)) for name in varnames) group_context[ent_name] = entity_context return group_context def parse_process_group(self, k, items, context, purge=True): # items is a list of [dict (assignment) or string (action)] if items is None: raise ValueError("no processes in '%s'" % k) group_expressions = [elem.items()[0] if isinstance(elem, dict) else (None, elem) for elem in items] group_predictors = self.collect_predictors(group_expressions) group_context = self.get_group_context(context, group_predictors) sub_processes = self.parse_expressions(group_expressions, group_context) return ProcessGroup(k, self, sub_processes, purge) # Once we can make it an error for non-function processes/statements, # we should probably split this method into parse_functions and # parse_function_body. def parse_expressions(self, items, context, functions_only=False): """ items -- a list of (name, process_string) tuples context -- parsing context: a dict of all symbols available for all entities functions_only -- if True, warn about any process which is not a function (defining processes outside functions is deprecated) """ processes = [] for k, v in items: if k == 'while': if isinstance(v, dict): raise SyntaxError(""" This syntax for while is not supported anymore: - while: cond: {cond_expr} code: - ... Please use this instead: - while {cond_expr}: - ...
""".format(cond_expr=v['cond'])) else: raise ValueError("while is a reserved keyword") elif k is not None and k.startswith('while '): if not isinstance(v, list): raise SyntaxError("while is a reserved keyword") cond = parse(k[6:].strip(), context) assert isinstance(cond, Expr) code = self.parse_process_group("while_code", v, context, purge=False) process = While(k, self, cond, code) elif k == 'return': e = SyntaxError("return is a reserved keyword. To return " "from a function, use 'return expr' " "instead of 'return: expr'") e.liam2context = "while parsing: return: {}".format(v) raise e elif k is None and isinstance(v, str) and v.startswith('return'): assert len(v) == 6 or v[6] == ' ' if len(v) > 6: result_def = v[7:].strip() else: result_def = None result_expr = parse(result_def, context) process = Return(None, self, result_expr) else: process = self.parse_expr(k, v, context) if process is not None and functions_only: if k in self.fields.names: msg = """defining a process outside of a function is deprecated because it is ambiguous. You should: * wrap the '{name}: {expr}' assignment inside a function like this: compute_{name}: # you can name it any way you like but simply \ '{name}' is not recommended ! - {name}: {expr} * update the simulation.processes list to use 'compute_{name}' (the function \ name) instead of '{name}'. """ else: msg = """defining a process outside of a function is \ deprecated because it is ambiguous. 1) If '{name}: {expr}' is an assignment ('{name}' stores the result of \ '{expr}'), you should: * wrap the assignment inside a function, for example, like this: compute_{name}: # you can name it any way you like but simply \ '{name}' is not recommended ! - {name}: {expr} * update the simulation.processes list to use 'compute_{name}' (the function \ name) instead of '{name}'. * add '{name}' in the entities fields with 'output: False' 2) otherwise if '{expr}' is an expression which does not return any value, you \ can simply transform it into a function, like this: {name}: - {expr} """ warnings.warn(msg.format(name=k, expr=v), UserDeprecationWarning) if process is None: if self.ismethod(v): if isinstance(v, dict): args = v.get('args', '') code = v.get('code', '') result = v.get('return', '') oldargs = "\n args: {}".format(args) \ if args else '' oldcode = "\n code:\n - ..." \ if code else '' newcode = "\n - ..." 
if code else '' oldresult = "\n return: " + result \ if result else '' newresult = "\n - return " + result \ if result else '' template = """ This syntax for defining functions with arguments or a return value is not supported anymore: {funcname}:{oldargs}{oldcode}{oldresult} Please use this instead: {funcname}({newargs}):{newcode}{newresult}""" msg = template.format(funcname=k, oldargs=oldargs, oldcode=oldcode, oldresult=oldresult, newargs=args, newcode=newcode, newresult=newresult) raise SyntaxError(msg) assert isinstance(v, list) # v should be a list of dicts (assignments) or # strings (actions) if "(" in k: k, args = split_signature(k) argnames = argspec(args).args code_def, result_def = v, None else: argnames, code_def, result_def = [], v, None method_context = self.get_group_context(context, argnames) code = self.parse_process_group(k + "_code", code_def, method_context, purge=False) # TODO: use code.predictors instead (but it currently # fails for some reason) or at least factor this out # with the code in parse_process_group group_expressions = [elem.items()[0] if isinstance(elem, dict) else (None, elem) for elem in code_def] group_predictors = \ self.collect_predictors(group_expressions) method_context = self.get_group_context( method_context, group_predictors) result_expr = parse(result_def, method_context) assert result_expr is None or \ isinstance(result_expr, Expr) process = Function(k, self, argnames, code, result_expr) elif isinstance(v, dict) and 'predictor' in v: raise ValueError("Using the 'predictor' keyword is " "not supported anymore. " "If you need several processes to " "write to the same variable, you " "should rather use functions.") elif k is None and v is None: raise ValueError("empty process found ('-')") else: raise Exception("unknown expression type for " "%s: %s (%s)" % (k, v, type(v))) processes.append((k, process)) return processes def parse_processes(self, context): processes = self.parse_expressions(self.process_strings.iteritems(), context, functions_only=True) self.processes = dict(processes) # self.ssa() # def resolve_method_calls(self): # for p in self.processes.itervalues(): # for expr in p.expressions(): # for node in expr.all_of(MethodCallToResolve): # # replace node in the parent node by the "resolved" node # # TODO: mimic ast.NodeTransformer # node.resolve() def ssa(self): fields_versions = collections.defaultdict(int) for p in self.processes.itervalues(): if isinstance(p, ProcessGroup): p.ssa(fields_versions) def compute_lagged_fields(self, inspect_one_period=True): from tfunc import Lag from links import LinkGet lag_vars = collections.defaultdict(set) for p in self.processes.itervalues(): for expr in p.expressions(): for node in expr.all_of((Lag, ValueForPeriod)): if isinstance(node, Lag): num_periods = node.args[1] else: assert isinstance(node, ValueForPeriod) period = node.args[1] # is the period argument equal to "period - X"? 
if (isinstance(period, BinaryOp) and period.op == '-' and isinstance(period.expr1, Variable) and period.expr1.name == 'period'): num_periods = period.expr2 else: num_periods = None # if num_periods is an Expr, we cannot really tell whether # or not it is 1 or more, so we must always take the node if num_periods is not None and np.isscalar(num_periods): inspect_expr = (num_periods == 1) == inspect_one_period else: # the safe thing is to take everything when not sure inspect_expr = True if inspect_expr: expr_node = node.args[0] for v in expr_node.all_of(Variable): if not isinstance(v, GlobalVariable): lag_vars[v.entity].add(v.name) for lv in expr_node.all_of(LinkGet): # noinspection PyProtectedMember lag_vars[lv.link._entity].add(lv.link._link_field) target_vars = list(lv.target_expr.all_of(Variable)) assert all(v.entity is not None for v in target_vars) for v in target_vars: lag_vars[v.entity].add(v.name) return lag_vars def build_period_array(self, start_period): self.array, self.id_to_rownum = \ build_period_array(self.input_table, list(self.fields.name_types), self.input_rows, self.input_index, start_period, default_values = self.fields.default_values) assert isinstance(self.array, ColumnArray) self.array_period = start_period def load_period_data(self, period): if self.lag_fields: # TODO: use ColumnArray here # XXX: do we need np.empty? (but watch for alias problems) self.array_lag = np.empty(len(self.array), dtype=np.dtype(self.lag_fields)) for field, _ in self.lag_fields: self.array_lag[field] = self.array[field] # if not self.indexed_input_table.has_period(period): # # nothing needs to be done in that case # return # # input_array = self.indexed_input_table.read(period) rows = self.input_rows.get(period) if rows is None: # nothing needs to be done in that case return start, stop = rows # It would be nice to use ColumnArray.from_table and adapt merge_arrays # to produce a ColumnArray in all cases, but it is not a huge priority # for now input_array = self.input_table.read(start, stop) self.array, self.id_to_rownum = \ merge_arrays(self.array, input_array, result_fields='array1', default_values=self.fields.default_values) # this can happen, depending on the layout of columns in input_array, # but the usual case (in retro) is that self.array is a superset of # input_array, in which case merge_arrays returns a ColumnArray if not isinstance(self.array, ColumnArray): self.array = ColumnArray(self.array) def purge_locals(self): """purge all local variables""" global max_vars temp_vars = self.temp_variables local_var_names = self.local_var_names num_locals = len(local_var_names) if config.debug and num_locals: local_vars = [v for k, v in temp_vars.iteritems() if k in local_var_names] max_vars = max(max_vars, num_locals) temp_mem = sum(sys.getsizeof(v) + (v.nbytes if isinstance(v, np.ndarray) else 0) for v in local_vars) avgsize = sum(v.dtype.itemsize if isinstance(v, np.ndarray) else 0 for v in local_vars) / num_locals if config.log_level in ("functions", "processes"): print(("purging {} variables (max {}), will free {} of memory " "(avg field size: {} b)".format(num_locals, max_vars, size2str(temp_mem), avgsize))) for var in local_var_names: del temp_vars[var] def flush_index(self, period): # keep an in-memory copy of the index for the current period self.output_index[period] = self.id_to_rownum # also flush it to disk # noinspection PyProtectedMember h5file = self.output_index_node._v_file h5file.create_array(self.output_index_node, "_%d" % period, self.id_to_rownum, "Period %d index" % 
period) # if an old index exists (this is not the case for the first period!), # point to the one on the disk, instead of the one in memory, # effectively clearing the one in memory idxname = '_%d' % (period - 1) if idxname in self.output_index_node: prev_disk_array = getattr(self.output_index_node, idxname) # DiskBackedArray is a workaround for pytables#360 (see above) self.output_index[period - 1] = DiskBackedArray(prev_disk_array) def store_period_data(self, period): if config.debug and config.log_level in ("functions", "processes"): temp_mem = sum(v.nbytes for v in self.temp_variables.itervalues() if isinstance(v, np.ndarray)) main_mem = self.array.nbytes print("mem used: %s (main: %s / temp: %s)" % (size2str(temp_mem + main_mem), size2str(main_mem), size2str(temp_mem))) # erase all temporary variables which have been computed this period self.temp_variables = {} if period in self.output_rows: raise Exception("trying to modify already simulated rows") if self.table is not None: startrow = self.table.nrows self.array.append_to_table(self.table) self.output_rows[period] = (startrow, self.table.nrows) self.flush_index(period) self.table.flush() # def compress_period_data(self, level): # compressed = bcolz.ctable(self.array, cparams=bcolz.cparams(level)) # print "%d -> %d (%f)" % compressed._get_stats() def optimize_processes(self): """ Common subexpression elimination """ # XXX: # * we either need to do SSA first, or for each assignment process, # "forget" all expressions containing the assigned variable. # Doing it using SSA seems cleaner, but in the end it shouldn't # change much. If we do not do SSA, we will need to "forget" about # expressions which contain an assigned variable at *each step* of # the process, including when simply counting the number of # occurrences of expressions. In that case we also need to iterate on # the processes in the same order as the simulation! # * I don't know whether it is a good idea to optimize across # functions. On one hand it offers many more possibilities for # optimization but, on the other hand, the optimization pass might # just take too much time... If we do not do it globally, we should # move the method to ProcessGroup instead. But let's try it across # functions first. # * optimizing across functions might get tricky when we take function # calls into account. # TODO: # * it will be simpler and better to do this in two passes: first # find duplicated expressions and the number of occurrences of each # expression, then proceed with the factorization expr_count = collections.Counter() for p in self.processes.itervalues(): for expr in p.expressions(): for subexpr in expr.traverse(): if isinstance(subexpr, Expr) and \ not isinstance(subexpr, Variable): expr_count[subexpr] += 1 print() print("most common expressions") print("=" * 20) print(expr_count.most_common(100)) # if count(larger) <= count(smaller) <= count(larger) + 1: kill smaller # if count(smaller) > count(larger) + 1: do both (larger uses smaller) # seen = {} # for p in self.processes.itervalues(): # for expr in p.expressions(): # for subexpr in expr.traverse(): # if subexpr in seen: # original = seen[subexpr] # # 1) add an assignment process before the process of # # the "original" expression to compute a temporary # # variable # # 2) modify "original" expr to use the temp var # # 3) modify the current expr to use the temp var # else: # seen[subexpr] = subexpr def __repr__(self): return "<Entity '%s'>" % self.name def __str__(self): return self.name
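# The two-pass strategy sketched in the TODO of optimize_processes() above
# can be illustrated standalone. Node and hoist_duplicates() below are
# hypothetical stand-ins for Expr and expr.traverse(), not part of the real
# code: pass 1 counts every non-leaf subexpression, pass 2 marks those seen
# more than once as candidates for a temporary variable.
import collections

class Node(object):
    """Tiny immutable expression node standing in for Expr."""
    def __init__(self, op, *children):
        self.op, self.children = op, children

    def traverse(self):
        yield self
        for child in self.children:
            for sub in child.traverse():
                yield sub

    def __eq__(self, other):
        return (isinstance(other, Node) and self.op == other.op and
                self.children == other.children)

    def __hash__(self):
        return hash((self.op, self.children))

    def __repr__(self):
        if not self.children:
            return self.op
        return "(%s %s)" % (self.op, " ".join(repr(c) for c in self.children))

def hoist_duplicates(exprs):
    # pass 1: count how many times each non-leaf subexpression occurs
    counts = collections.Counter(sub for expr in exprs
                                 for sub in expr.traverse()
                                 if sub.children)
    # pass 2: subexpressions seen more than once are candidates for a
    # temporary variable (a real pass would also have to respect
    # assignment order, as the XXX comment above explains)
    return [sub for sub, num in counts.most_common() if num > 1]

a, b = Node('a'), Node('b')
common = Node('+', a, b)
print(hoist_duplicates([Node('*', common, common), Node('-', common, b)]))
# -> [(+ a b)]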
class Entity(object): """ fields is a list of tuple (name, type, options) """ def __init__(self, name, fields=None, missing_fields=None, links=None, macro_strings=None, process_strings=None, array=None): self.name = name # we should have exactly one of either array or fields defined assert ((fields is None and array is not None) or (fields is not None and array is None)) if array is not None: if fields is None: fields = get_fields(array) array_period = np.min(array['period']) else: array_period = None duplicate_names = [name for name, num in count_occurrences(fname for fname, _ in fields) if num > 1] if duplicate_names: raise Exception("duplicate fields in entity '%s': %s" % (self.name, ', '.join(duplicate_names))) fnames = [name for name, _ in fields] if 'id' not in fnames: fields.insert(0, ('id', int)) if 'period' not in fnames: fields.insert(0, ('period', int)) self.fields = fields # only used in data (to check that all "required" fields are present # in the input file) # one potential solution would be to split the fields argument and # attribute in input_fields and output_fields (regardless of whether # it is split in the simulation/yaml file). # however that might be just a temporary solution as we will soon need # more arguments to fields (default values, ranges, etc...) # another solution is to use a Field class # seems like the better long term solution self.missing_fields = missing_fields self.period_individual_fnames = [name for name, _ in fields] self.links = links self.macro_strings = macro_strings self.process_strings = process_strings self.processes = None self.expectedrows = tables.parameters.EXPECTED_ROWS_TABLE self.table = None self.input_table = None self.indexed_input_table = None self.indexed_output_table = None self.input_rows = {} #TODO: it is unnecessary to keep periods which have already been # simulated, because (currently) when we go back in time, we always go # back using the output table. self.input_index = {} self.output_rows = {} self.output_index = {} self.base_period = None # we need a separate field, instead of using array['period'] to be able # to get the period even when the array is empty. 
self.array_period = array_period self.array = None self.lag_fields = [] self.array_lag = None self.num_tmp = 0 self.temp_variables = {} self.id_to_rownum = None self._variables = None @classmethod def from_yaml(cls, ent_name, entity_def): from links import Many2One, One2Many # YAML "ordered dict" syntax returns a list of dict and we want a list # of tuples #FIXME: if "fields" key is present but no field is defined, #entity_def.get('fields', []) returns None and this breaks fields_def = [d.items()[0] for d in entity_def.get('fields', [])] fields = [] missing_fields = [] for name, fielddef in fields_def: if isinstance(fielddef, dict): strtype = fielddef['type'] if not fielddef.get('initialdata', True): missing_fields.append(name) else: strtype = fielddef fields.append((name, field_str_to_type(strtype, "field '%s'" % name))) link_defs = entity_def.get('links', {}) str2class = {'one2many': One2Many, 'many2one': Many2One} links = dict((name, str2class[l['type']](name, l['field'], l['target'])) for name, l in link_defs.iteritems()) return Entity(ent_name, fields, missing_fields, links, entity_def.get('macros', {}), entity_def.get('processes', {})) @classmethod def from_table(cls, table): return Entity(table.name, get_fields(table), missing_fields=[], links={}, macro_strings={}, process_strings={}) @staticmethod def collect_predictors(items): predictors = [] for k, v in items: if k is None: continue # no need to test for bool since bool is a subclass of int if isinstance(v, (basestring, int, float)): predictors.append(k) elif isinstance(v, dict): predictors.append(v['predictor']) return predictors @property def variables(self): if self._variables is None: global_predictors = \ self.collect_predictors(self.process_strings.iteritems()) all_fields = set(global_predictors) stored_fields = set(self.period_individual_fnames) temporary_fields = all_fields - stored_fields variables = dict((name, Variable(name, type_)) for name, type_ in self.fields) variables.update((name, Variable(name)) for name in temporary_fields) variables.update(self.links) self._variables = variables return self._variables def check_links(self): for name, link in self.links.iteritems(): #noinspection PyProtectedMember target_name = link._target_entity_name if target_name not in entity_registry: raise Exception("Target of '%s' link in entity '%s' is an " "unknown entity (%s)" % (name, self.name, target_name)) def get_cond_context(self, entities_visited=None): """returns the conditional context: {link: variables}""" if entities_visited is None: entities_visited = set() else: entities_visited = entities_visited.copy() entities_visited.add(self) linked_entities = {} for k, link in self.links.items(): #noinspection PyProtectedMember entity = link._target_entity() if entity not in entities_visited: linked_entities[k] = entity cond_context = {} # use a set of entities to compute the conditional context only once # per target entity for entity in set(linked_entities.values()): cond_context.update(entity.get_cond_context(entities_visited)) # entities linked directly take priority over (override) farther ones cond_context.update((k, entity.variables) for k, entity in linked_entities.items()) return cond_context conditional_context = property(get_cond_context) def all_variables(self, globals_def): from links import PrefixingLink variables = global_variables(globals_def).copy() variables.update(self.variables) cond_context = self.conditional_context macros = dict((k, parse(v, variables, cond_context)) for k, v in 
self.macro_strings.iteritems()) variables.update(macros) variables['other'] = PrefixingLink(macros, self.links, '__other_') return variables def parse_expressions(self, items, variables, cond_context): processes = [] for k, v in items: if isinstance(v, (bool, int, float)): process = Assignment(v) elif isinstance(v, basestring): expr = parse(v, variables, cond_context) if not isinstance(expr, Process): if k is None: process = Compute(expr) else: process = Assignment(expr) else: process = expr elif isinstance(v, list): # v is a procedure # it should be a list of dict (assignment) or string (action) group_expressions = [] for element in v: if isinstance(element, dict): group_expressions.append(element.items()[0]) else: group_expressions.append((None, element)) group_predictors = \ self.collect_predictors(group_expressions) group_context = variables.copy() group_context.update((name, Variable(name)) for name in group_predictors) sub_processes = self.parse_expressions(group_expressions, group_context, cond_context) process = ProcessGroup(k, sub_processes) elif isinstance(v, dict): warnings.warn("Using the 'predictor' keyword is deprecated. " "If you need several processes to " "write to the same variable, you should " "rather use procedures.", UserDeprecationWarning) expr = parse(v['expr'], variables, cond_context) process = Assignment(expr) process.predictor = v['predictor'] else: raise Exception("unknown expression type for %s: %s" % (k, type(v))) processes.append((k, process)) return processes def parse_processes(self, globals_def): processes = self.parse_expressions(self.process_strings.iteritems(), self.all_variables(globals_def), self.conditional_context) processes = dict(processes) fnames = set(self.period_individual_fnames) def attach_processes(items): for k, v in items: if isinstance(v, ProcessGroup): v.entity = self attach_processes(v.subprocesses) elif isinstance(v, Assignment): predictor = v.predictor if v.predictor is not None else k if predictor in fnames: kind = 'period_individual' else: kind = None v.attach(k, self, kind) else: v.attach(k, self) attach_processes(processes.iteritems()) self.processes = processes def compute_lagged_fields(self): from tfunc import Lag from links import LinkValue lag_vars = set() for p in self.processes.itervalues(): for expr in p.expressions(): for node in expr.all_of(Lag): for v in node.all_of(Variable): if not isinstance(v, GlobalVariable): lag_vars.add(v.name) for lv in node.all_of(LinkValue): #noinspection PyProtectedMember lag_vars.add(lv.link._link_field) #noinspection PyProtectedMember target_entity = lv.link._target_entity() if target_entity == self: target_vars = lv.target_expression.all_of(Variable) lag_vars.update(v.name for v in target_vars) if lag_vars: # make sure we have an 'id' column, and that it comes first # (makes debugging easier). 'id' is always necessary for lag # expressions to be able to "expand" the vector of values to the # "current" individuals. lag_vars.discard('id') lag_vars = ['id'] + sorted(lag_vars) field_type = dict(self.fields) self.lag_fields = [(v, field_type[v]) for v in lag_vars] def load_period_data(self, period): if self.lag_fields: #TODO: use ColumnArray here #XXX: do we need np.empty? 
# (but watch for alias problems) self.array_lag = np.empty(len(self.array), dtype=np.dtype(self.lag_fields)) for field, _ in self.lag_fields: self.array_lag[field] = self.array[field] rows = self.input_rows.get(period) if rows is None: # nothing needs to be done in that case return start, stop = rows # It would be nice to use ColumnArray.from_table and adapt merge_arrays # to produce a ColumnArray in all cases, but it is not a huge priority # for now input_array = self.input_table.read(start, stop) self.array, self.id_to_rownum = \ merge_arrays(self.array, input_array, result_fields='array1') # this can happen, depending on the layout of columns in input_array, # but the usual case (in retro) is that self.array is a superset of # input_array, in which case merge_arrays returns a ColumnArray if not isinstance(self.array, ColumnArray): self.array = ColumnArray(self.array) def store_period_data(self, period): if config.debug: temp_mem = sum(v.nbytes for v in self.temp_variables.itervalues() if isinstance(v, np.ndarray)) main_mem = self.array.nbytes print("mem used: %s (main: %s / temp: %s)" % (size2str(temp_mem + main_mem), size2str(main_mem), size2str(temp_mem))) # erase all temporary variables which have been computed this period self.temp_variables = {} if period in self.output_rows: raise Exception("trying to modify already simulated rows") else: startrow = self.table.nrows self.array.append_to_table(self.table) self.output_rows[period] = (startrow, self.table.nrows) self.output_index[period] = self.id_to_rownum self.table.flush() # def compress_period_data(self, level): # compressed = ca.ctable(self.array, cparams=ca.cparams(level)) # print "%d -> %d (%f)" % compressed._get_stats() @staticmethod def fill_missing_values(ids, values, context, filler='auto'): """ ids: ids present in the past period values: past-period values, aligned with ids context: current period context filler: value used for rows whose id has no past value ('auto' uses the missing value for the dtype of values) """ if filler == 'auto': filler = get_missing_value(values) result = np.empty(context_length(context), dtype=values.dtype) result.fill(filler) if len(ids): id_to_rownum = context.id_to_rownum # if there were more objects in the past than in the current # period. Currently, remove() keeps old ids, so this never # happens, but if we ever change remove(), we'll need to add # such a check everywhere we use id_to_rownum # invalid_ids = ids > len(id_to_rownum) # if np.any(invalid_ids): # fix ids rows = id_to_rownum[ids] safe_put(result, rows, values) return result def value_for_period(self, expr, period, context, fill='auto'): sub_context = EntityContext(self, {'period': period, '__globals__': context['__globals__']}) result = expr_eval(expr, sub_context) if isinstance(result, np.ndarray) and result.shape: ids = expr_eval(Variable('id'), sub_context) if fill is None: return ids, result else: # expand values to the current "outer" context return self.fill_missing_values(ids, result, context, fill) else: return result def __repr__(self): return "<Entity '%s'>" % self.name
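# A minimal standalone illustration of what fill_missing_values() above
# does: values known for some past ids are expanded to the rows of the
# current period through an id_to_rownum mapping, with rows for unknown
# ids left at the filler value. expand_to_current() and the sample arrays
# are made up for the example, and the "-1 means no row" convention is an
# assumption about id_to_rownum; guarding against invalid rows is roughly
# what safe_put() takes care of in the real code.
import numpy as np

def expand_to_current(ids, values, id_to_rownum, num_rows, filler):
    result = np.empty(num_rows, dtype=values.dtype)
    result.fill(filler)
    rows = id_to_rownum[ids]
    # ids which no longer map to a row come out as -1: keep only valid
    # targets so they do not silently overwrite the last row
    valid = rows != -1
    result[rows[valid]] = values[valid]
    return result

# current period has 4 rows, holding ids 0, 2, 3 and 5
id_to_rownum = np.array([0, -1, 1, 2, -1, 3])
past_ids = np.array([0, 1, 3])            # id 1 has since been removed
past_values = np.array([1.5, 2.5, 3.5])
print(expand_to_current(past_ids, past_values, id_to_rownum, 4, -1.0))
# -> [ 1.5 -1.  -1.   3.5]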