def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
    """Match one set 1 individual with the best candidate of a random pool.

    The remaining set 2 candidates live in the global `local_ctx`. When
    there are more candidates than `pool_size`, only a random sample of
    `pool_size` of them is scored; otherwise all of them are. The best
    scoring candidate is paired (both ways) with the set 1 individual in
    `result` and removed from `local_ctx`.

    idx: unused
    sorted_idx: index, within set1, of the individual to match
    pool_size: maximum number of candidates to score

    Raises StopIteration when no candidate is left.
    """
    global local_ctx

    num_candidates = context_length(local_ctx)
    if not num_candidates:
        raise StopIteration

    if num_candidates <= pool_size:
        # few enough candidates: score them all
        pool = range(num_candidates)
    else:
        pool = random.sample(xrange(context_length(local_ctx)), pool_size)

    pool_ctx = context_subset(local_ctx, pool, None)
    # inject the values of the set 1 individual being matched
    individual1_values = ((name, set1[name][sorted_idx])
                          for name in ['id'] + used_variables1)
    pool_ctx.update(individual1_values)

    scores = expr_eval(score_expr, pool_ctx)
    # position of the winner inside the pool, then inside local_ctx
    best_in_pool = np.argmax(scores)
    best_idx = pool[best_in_pool]

    id1 = pool_ctx['id']
    id2 = local_ctx['__other_id'][best_idx]

    # a matched candidate is no longer available
    local_ctx = context_delete(local_ctx, best_idx)

    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
def evaluate(self, context):
    """Randomly pick one of `self.choices` for each row of `context`.

    Without bins, every choice has the same probability. With bins, a
    uniform draw is located in the bins to select the choice; bins which
    are expressions are first evaluated and converted through
    `self._weights_to_bins`. Choices which are expressions are evaluated
    before the final lookup.

    When `config.debug` is set, the position in the random sequence is
    printed before and after the draw.
    """
    if config.debug:
        print()
        print("random sequence position before:", np.random.get_state()[2])

    num = context_length(context)
    choices = self.choices

    if not num:
        choices_idx = []
    else:
        bins = self.bins
        if bins is not None:
            has_expr_bins = any(isinstance(b, Expr) for b in bins)
            if has_expr_bins:
                weights = [expr_eval(expr, context) for expr in bins]
                bins = self._weights_to_bins(weights)
            u = np.random.uniform(size=num)
            # XXX: np.choice uses searchsorted(bins, u) instead of digitize
            choices_idx = np.digitize(u, bins) - 1
        else:
            # all values have the same probability
            choices_idx = np.random.randint(len(choices), size=num)

    if config.debug:
        print("random sequence position after:", np.random.get_state()[2])

    has_expr_choices = any(isinstance(c, Expr) for c in choices)
    if has_expr_choices:
        choices = np.array([expr_eval(expr, context) for expr in choices])
    return choices[choices_idx]
def compute(self, context, set1filter, set2filter, orderby1, orderby2):
    """Pair individuals of two sets according to their respective ordering.

    Both filters are evaluated on `context`; the individuals with the
    highest `orderby1` values in set 1 are paired, in order, with those
    having the highest `orderby2` values in set 2. Only
    min(len(set1), len(set2)) individuals of each set are matched.

    Returns an integer array of the length of `context` holding, for each
    matched individual, the id of its partner and -1 for everybody else.
    """
    filter1 = expr_eval(self._getfilter(context, set1filter), context)
    filter2 = expr_eval(self._getfilter(context, set2filter), context)

    set1len = filter1.sum()
    set2len = filter2.sum()
    numtomatch = min(set1len, set2len)
    print("matching with %d/%d individuals" % (set1len, set2len))

    result = np.full(context_length(context), -1, dtype=int)
    if numtomatch:
        # keep the `numtomatch` highest ranked individuals of each set
        best1 = orderby1[filter1].argsort()[-numtomatch:]
        best2 = orderby2[filter2].argsort()[-numtomatch:]

        ids = context['id']
        id_to_rownum = context.id_to_rownum

        # best1/best2 are indices "local" to each filtered set, so we need
        # to go through the ids to find the rows in the whole context
        id1 = ids[filter1][best1]
        id2 = ids[filter2][best2]
        result[id_to_rownum[id1]] = id2
        result[id_to_rownum[id2]] = id1
    return result
def match_one_set1_individual(idx, sorted_idx):
    """Match one set 1 individual with the best remaining set 2 candidate.

    The values of the set 1 individual at `sorted_idx` are injected in the
    global `local_ctx` (which holds the remaining candidates), the score
    expression is evaluated for all candidates and the one with the highest
    score is paired (both ways) in `result`, then removed from `local_ctx`.

    idx: unused
    sorted_idx: index, within set1, of the individual to match

    Raises StopIteration when no candidate is left.
    """
    global local_ctx

    if not context_length(local_ctx):
        raise StopIteration

    individual1_values = ((name, set1[name][sorted_idx])
                          for name in ['id'] + used_variables1)
    local_ctx.update(individual1_values)

    # NOTE: a previous optimisation is disabled here: it simplified the
    # score expression for each combination of "pk_names" values of the
    # set 1 individual and cached the simplified expression.
    scores = expr_eval(score_expr, local_ctx)
    best_idx = np.argmax(scores)

    id1 = local_ctx['id']
    id2 = local_ctx['__other_id'][best_idx]

    # a matched candidate is no longer available
    local_ctx = context_delete(local_ctx, best_idx)

    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
def create_cost(idx, sorted_idx):
    """Append the scores of one set 1 individual to the global `cost` list.

    The values of the set 1 individual at `sorted_idx` are injected in
    `local_ctx`, the score expression is evaluated against it and the
    resulting scores are stored as a plain list.

    idx: unused
    sorted_idx: index, within set1, of the individual to score

    Raises StopIteration when `local_ctx` is empty.
    """
    global cost

    if not context_length(local_ctx):
        raise StopIteration

    individual1_values = [(name, set1[name][sorted_idx])
                          for name in used_variables1]
    local_ctx.update(individual1_values)

    scores = expr_eval(score_expr, local_ctx)
    cost.append(scores[:].tolist())
def build_context(self, context):
    """Store one uniform random number per row in `context` and return it.

    The numbers are stored under the `self.u_varname` key. When
    `config.debug` is set, the position in the random sequence is printed
    before and after the draw.
    """
    if config.debug:
        print()
        print("random sequence position before:", np.random.get_state()[2])

    num_rows = context_length(context)
    context[self.u_varname] = np.random.uniform(size=num_rows)

    if config.debug:
        print("random sequence position after:", np.random.get_state()[2])
    return context
def compute(self, context, filter=None):
    """Count the rows of `context`, or only those selected by `filter`.

    filter: optional boolean array; when given, the number of True values
            is returned instead of the length of the context.

    Raises ValueError if `filter` is not of a boolean dtype.
    """
    if filter is not None:
        # TODO: check this at "compile" time (in __init__), though for
        #       that we need to know the type of all temporary variables
        #       first
        if np.issubdtype(filter.dtype, bool):
            return np.sum(filter)
        raise ValueError("count filter must be a boolean expression")
    return context_length(context)
def compute(self, context, filter=None):
    """Return the number of rows in `context` which satisfy `filter`.

    Without a filter, this is simply the length of the context. With a
    filter (a boolean array), this is the number of True values in it.

    Raises ValueError if `filter` is not of a boolean dtype.
    """
    if filter is None:
        return context_length(context)

    # TODO: check this at "compile" time (in __init__), though for
    #       that we need to know the type of all temporary variables
    #       first
    is_boolean = np.issubdtype(filter.dtype, bool)
    if not is_boolean:
        raise ValueError("count filter must be a boolean expression")
    return np.sum(filter)
def evaluate(self, context):
    """Count the rows of `context`, or only those matching `self.filter`.

    Raises Exception if the filter expression is not of boolean type.
    """
    filter_expr = self.filter
    if filter_expr is None:
        return context_length(context)

    # TODO: check this at "compile" time (in __init__), though for
    #       that we need to know the type of all temporary variables
    #       first
    if getdtype(filter_expr, context) is not bool:
        raise Exception("count filter must be a boolean expression")
    filter_value = expr_eval(filter_expr, context)
    return np.sum(filter_value)
def evaluate(self, context):
    """Evaluate the arguments then delegate to `self.compute`.

    Positional and keyword arguments are evaluated against `context`. If
    the wrapped function has a 'size' argument which was not given as a
    keyword, it is set to the length of the context. The filter expression
    (if any) is evaluated too and everything is passed to `self.compute`
    along with the first element of `self.np_func`.
    """
    args = []
    for arg in self.args:
        args.append(expr_eval(arg, context))

    kwargs = {}
    for name, value_expr in self.kwargs.iteritems():
        kwargs[name] = expr_eval(value_expr, context)

    # by default, produce one value per row of the context
    size_missing = 'size' not in kwargs
    if size_missing and 'size' in self.arg_names:
        kwargs['size'] = context_length(context)

    filter_expr = self.filter_expr
    filter_value = (expr_eval(filter_expr, context)
                    if filter_expr is not None else None)

    return self.compute(self.np_func[0], args, kwargs, filter_value)
def align_no_link(self, context, score, need, filter, take, leave,
                  expressions, possible_values, errors, frac_need, link,
                  secondary_axis, method):
    """Select the individuals to align when no link is involved.

    The (filtered) individuals are partitioned in groups according to the
    values of `expressions`, individuals not fitting in any group are
    reported through `self._display_unaligned`, the need is corrected
    (need correction, fractional need, past error) and the final selection
    is delegated to `align_get_indices_nd`.

    `link` and `secondary_axis` are not used by this method.
    """
    ctx_length = context_length(context)

    need, expressions, possible_values = self._eval_need(
        context, need, expressions, possible_values)

    filter_value = expr_eval(self._getfilter(context, filter), context)
    has_filter = filter_value is not None
    num_to_align = np.sum(filter_value) if has_filter else ctx_length

    if not expressions:
        # no alignment expressions: everybody is in a single group
        columns = []
        if has_filter:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]
    else:
        # retrieve the columns we need to work with
        columns = [expr_eval(expr, context) for expr in expressions]
        partition_filter = filter_value if has_filter else True
        groups = partition_nd(columns, partition_filter, possible_values)

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    num_in_groups = sum(len(g) for g in groups)
    if num_in_groups < num_to_align:
        # start with everybody, then remove those filtered out and those
        # which belong to a group
        unaligned = np.ones(ctx_length, dtype=bool)
        if has_filter:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, frac_need)
    need = self._add_past_error(context, need, errors)
    need = np.asarray(need)

    # FIXME: either handle past_error in no link (currently, the past
    #        error is added... but never computed, so always 0 !) or raise
    #        an error in case errors='carry" is used with no link.
    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                score, take, leave, method)
def match_one_set1_individual(idx, sorted_idx):
    """Match one set 1 individual with the best remaining set 2 candidate.

    The variables of the set 1 individual at `sorted_idx` are injected in
    the global `local_ctx` (which holds the remaining candidates), the
    score expression is evaluated for all candidates and the one with the
    highest score is paired (both ways) in `result`, then removed from
    `local_ctx`.

    idx: unused
    sorted_idx: index, within set1, of the individual to match

    Raises StopIteration when no candidate is left.
    """
    global local_ctx

    if not context_length(local_ctx):
        raise StopIteration

    individual1_values = ((name, set1[name][sorted_idx])
                          for name in used_variables1)
    local_ctx.update(individual1_values)

    scores = expr_eval(score_expr, local_ctx)
    best_idx = np.argmax(scores)

    id1 = local_ctx['id']
    id2 = local_ctx['__other_id'][best_idx]

    # a matched candidate is no longer available
    local_ctx = context_delete(local_ctx, best_idx)

    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
def eval_rows(self, source_rows, expr_value, context):
    """Aggregate `expr_value` per target row using `self.aggregate_func`.

    source_rows: for each value, the row (in `context`) it contributes to;
                 values whose row is -1 are ignored
    expr_value: the values to aggregate

    Returns an array of the length of `context`; rows which receive no
    value keep the missing value for the dtype of `expr_value`.
    """
    aggregated = np.empty(context_length(context), dtype=expr_value.dtype)
    aggregated.fill(get_missing_value(expr_value))

    # groupby only groups consecutive items, so sort the pairs by row first
    order = np.argsort(source_rows)
    row_value_pairs = izip(source_rows[order], expr_value[order])

    aggregate = self.aggregate_func
    for target_row, pairs in groupby(row_value_pairs, key=itemgetter(0)):
        if target_row != -1:
            # Note that pair[n] is faster than using an itemgetter, even
            # with map
            aggregated[target_row] = aggregate(pair[1] for pair in pairs)
    return aggregated
def align_no_link(self, context):
    """Select the individuals to align when no link is involved.

    The scores, need, filter, take and leave filters are evaluated from
    the attributes of the object. The (filtered) individuals are
    partitioned in groups according to the alignment expressions,
    individuals not fitting in any group are reported through
    `self._display_unaligned`, the need is corrected and the final
    selection is delegated to `align_get_indices_nd`.
    """
    ctx_length = context_length(context)

    scores = expr_eval(self.expr, context)
    need, expressions, possible_values = self._eval_need(context)

    filter_value = expr_eval(self._getfilter(context), context)
    take_filter = expr_eval(self.take_filter, context)
    leave_filter = expr_eval(self.leave_filter, context)

    has_filter = filter_value is not None
    num_to_align = np.sum(filter_value) if has_filter else ctx_length

    if not expressions:
        # no alignment expressions: everybody is in a single group
        columns = []
        if has_filter:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]
    else:
        # retrieve the columns we need to work with
        columns = [expr_eval(expr, context) for expr in expressions]
        partition_filter = filter_value if has_filter else True
        groups = partition_nd(columns, partition_filter, possible_values)

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    num_in_groups = sum(len(g) for g in groups)
    if num_in_groups < num_to_align:
        # start with everybody, then remove those filtered out and those
        # which belong to a group
        unaligned = np.ones(ctx_length, dtype=bool)
        if has_filter:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need)
    need = self._add_past_error(need, context)

    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                scores, take_filter, leave_filter)
def _eval_args(self, context):
    """Evaluate the arguments, defaulting 'size' to the context length.

    The arguments are first evaluated by NumpyCreateArray. If the wrapped
    function has a 'size' argument and its value is None, it is replaced
    by the length of `context`.

    Returns an (args, kwargs) tuple.
    """
    args, kwargs = NumpyCreateArray._eval_args(self, context)

    arg_names = self.argspec.args
    if 'size' not in arg_names:
        return args, kwargs

    size_pos = arg_names.index('size')
    # The original functions return a scalar when size is None, and an
    # array of length one when size is 1.
    # TODO: users should have a way to have the "size=None" behavior. We
    #       could differentiate whether None was explicitly passed or comes
    #       from the default value (as we did previously: 'size' not in
    #       kwargs), but I do not think it is a good idea. Adding a new
    #       "sentinel" value (e.g. -1 or "scalar") is probably better.
    if args[size_pos] is None:
        before, after = args[:size_pos], args[size_pos + 1:]
        args = before + (context_length(context),) + after
    return args, kwargs
def eval_rows(self, source_rows, expr_value, context):
    """Count, for each row of `context`, the values pointing to it.

    source_rows: for each value, the row (in `context`) it points to.
                 This array is modified in place: missing rows are
                 replaced by the length of the context.
    expr_value: passed as-is to `self.count`

    Returns the result of `self.count`, resized (in place) to the length
    of `context`.
    """
    # Missing rows are redirected to an extra slot right after the last
    # real row: we can't use a negative value because that is not allowed
    # by bincount, and using a value too high will uselessly increase the
    # size of the array returned by bincount
    num_rows = context_length(context)

    # filter out missing values: those where the object pointed to does not
    # exist anymore (the id corresponds to -1 in id_to_rownum)
    # XXX: use np.putmask(source_rows, source_ids == missing_int,
    #                     missing_int)
    is_missing = source_rows == missing_values[int]
    source_rows[is_missing] = num_rows

    counts = self.count(source_rows, expr_value)
    # drop the extra slot(s) used for missing rows
    counts.resize(num_rows)
    return counts
def evaluate(self, context):
    """Pair individuals of two sets according to their respective rank.

    Each set is sorted on its rank expression (in descending order when
    the corresponding `ascending` flag is False) and the first
    min(len(set1), len(set2)) individuals of each set are paired in order.

    Returns an integer array of the length of `context` holding, for each
    matched individual, the id of its partner and -1 for everybody else.
    """
    ctx_filter = context.get('__filter__')
    id_to_rownum = context.id_to_rownum

    # at some point ctx_filter will be cached automatically, so we don't
    # need to take care of it manually here
    if ctx_filter is None:
        set1filter = expr_eval(self.set1filter, context)
        set2filter = expr_eval(self.set2filter, context)
    else:
        set1filter = expr_eval(ctx_filter & self.set1filter, context)
        set2filter = expr_eval(ctx_filter & self.set2filter, context)

    rank1_expr, rank2_expr = self.rank1_expr, self.rank2_expr

    # each subset only needs the variables used to rank it, plus the id
    used_variables1 = rank1_expr.collect_variables(context)
    used_variables2 = rank2_expr.collect_variables(context)
    used_variables1.add('id')
    used_variables2.add('id')
    set1 = context_subset(context, set1filter, used_variables1)
    set2 = context_subset(context, set2filter, used_variables2)

    set1len = set1filter.sum()
    set2len = set2filter.sum()
    tomatch = min(set1len, set2len)

    order1 = expr_eval(rank1_expr, context)
    order2 = expr_eval(rank2_expr, context)
    # argsort is ascending, so negate the values to reverse the sorting
    if not self.ascending1:
        order1 = - order1
    if not self.ascending2:
        order2 = - order2

    idx1 = order1[set1filter].argsort()[:tomatch]
    idx2 = order2[set2filter].argsort()[:tomatch]

    print("matching with %d/%d individuals" % (set1len, set2len))

    result = np.empty(context_length(context), dtype=int)
    result.fill(-1)

    id1 = set1['id'][idx1]
    id2 = set2['id'][idx2]
    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
    return result
def _eval_args(self, context):
    """Evaluate the arguments, using the context length as default size.

    After the evaluation done by NumpyCreateArray, a 'size' argument (if
    the wrapped function has one) whose value is None is replaced by the
    length of `context`.

    Returns an (args, kwargs) tuple.
    """
    args, kwargs = NumpyCreateArray._eval_args(self, context)

    arg_names = self.argspec.args
    if 'size' in arg_names:
        size_pos = arg_names.index('size')
        # The original functions return a scalar when size is None, and an
        # array of length one when size is 1.
        # TODO: users should have a way to have the "size=None" behavior.
        #       We could differentiate whether None was explicitly passed
        #       or comes from the default value (as we did previously:
        #       'size' not in kwargs), but I do not think it is a good
        #       idea. Adding a new "sentinel" value (e.g. -1 or "scalar")
        #       is probably better.
        size_is_unset = args[size_pos] is None
        if size_is_unset:
            default_size = (context_length(context), )
            args = args[:size_pos] + default_size + args[size_pos + 1:]
    return args, kwargs
def as_string(self, context):
    """Evaluate the expression, store its value in `context` under a
    temporary variable name and return that name.

    If the evaluation returns a dict with (non None) 'indices' and
    'values', it is first expanded to a full array of the length of the
    context: zeros everywhere except at `indices`, which receive `values`.
    """
    tmp_varname = get_tmp_varname()
    value = expr_eval(self, context)

    if isinstance(value, dict):
        sparse_indices, sparse_values = value['indices'], value['values']
    else:
        sparse_indices = None

    if sparse_indices is not None:
        # use the same type as the values for the expanded array
        is_array = isinstance(sparse_values, np.ndarray)
        res_type = sparse_values.dtype.type if is_array \
            else type(sparse_values)
        value = np.zeros(context_length(context), dtype=res_type)
        np.put(value, sparse_indices, sparse_values)

    context[tmp_varname] = value
    return tmp_varname
def fill_missing_values(self, ids, values, context, filler='auto'):
    """Align values of a past period on the rows of the current period.

    ids: ids present in past period
    values: values in past period (one per id)
    context: current period context
    filler: value used for the rows of the current period which have no
            corresponding id in `ids`. 'auto' (the default) uses the
            missing value corresponding to the dtype of `values`.

    Returns an array of the length of `context` and the dtype of `values`.
    """
    # compare by value: identity ("is") on a string literal only works by
    # accident of interning. The isinstance guard avoids comparing a
    # non-string filler (eg a numpy value) with a string.
    if isinstance(filler, str) and filler == 'auto':
        filler = get_missing_value(values)
    result = np.empty(context_length(context), dtype=values.dtype)
    result.fill(filler)
    if len(ids):
        id_to_rownum = context.id_to_rownum
        # if there was more objects in the past than in the current
        # period. Currently, remove() keeps old ids, so this never
        # happens, but if we ever change remove(), we'll need to add
        # such a check everywhere we use id_to_rownum
        # invalid_ids = ids > len(id_to_rownum)
        # if np.any(invalid_ids):
        #     fix ids
        rows = id_to_rownum[ids]
        safe_put(result, rows, values)
    return result
def eval_rows(self, source_rows, target_filter, context):
    """Aggregate the target expression per row of `context`.

    The target expression is evaluated in the target context (optionally
    restricted by `target_filter`) and its values are aggregated, using
    `self.aggregate_func`, per row given in `source_rows`.

    source_rows: for each (filtered) target value, the row of `context` it
                 contributes to; values whose row is -1 are ignored
    target_filter: optional filter applied to the target values

    Returns an array of the length of `context`; rows which receive no
    value keep the missing value for the dtype of the target values.
    """
    target_values = expr_eval(self.target_expr, self.target_context(context))
    if target_filter is not None:
        target_values = target_values[target_filter]
    assert len(source_rows) == len(target_values)

    aggregated = np.empty(context_length(context), dtype=target_values.dtype)
    aggregated.fill(get_missing_value(target_values))

    # groupby only groups consecutive items, so sort the pairs by row first
    order = np.argsort(source_rows)
    row_value_pairs = izip(source_rows[order], target_values[order])

    aggregate = self.aggregate_func
    for target_row, pairs in groupby(row_value_pairs, key=itemgetter(0)):
        if target_row != -1:
            aggregated[target_row] = aggregate(pair[1] for pair in pairs)
    return aggregated
def evaluate(self, context):
    """Randomly pick one of `self.choices` for each row of `context`.

    Without bins, every choice has the same probability. With bins, a
    uniform draw is located in the bins to select the choice; bins which
    are expressions are first evaluated and converted through
    `self._weights_to_bins`. Choices which are expressions are evaluated
    before the final lookup.
    """
    num = context_length(context)
    choices = self.choices

    if not num:
        # nothing to draw
        choices_idx = []
    else:
        bins = self.bins
        if bins is not None:
            if any(isinstance(b, Expr) for b in bins):
                # bins depend on the context: compute them now
                weights = [expr_eval(expr, context) for expr in bins]
                bins = self._weights_to_bins(weights)
            draws = np.random.uniform(size=num)
            choices_idx = np.digitize(draws, bins) - 1
        else:
            # all values have the same probability
            choices_idx = np.random.randint(len(choices), size=num)

    if any(isinstance(c, Expr) for c in choices):
        choices = np.array([expr_eval(expr, context) for expr in choices])
    return choices[choices_idx]
def fill_missing_values(ids, values, context, filler='auto'):
    """Align values of a past period on the rows of the current period.

    ids: ids present in past period
    values: values in past period
    context: current period context
    filler: value used for the rows of the current period which have no
            corresponding id in `ids`. 'auto' (the default) uses the
            default value corresponding to the dtype of `values`.

    Returns an array of the length of `context` and the dtype of `values`.
    """
    # compare by value: identity ("is") on a string literal only works by
    # accident of interning. The isinstance guard avoids comparing a
    # non-string filler (eg a numpy value) with a string.
    if isinstance(filler, str) and filler == 'auto':
        filler = get_default_value(values)
    result = np.full(context_length(context), filler, dtype=values.dtype)
    if len(ids):
        id_to_rownum = context.id_to_rownum
        # if there was more objects in the past than in the current
        # period. Currently, remove() keeps old ids, so this never
        # happens, but if we ever change remove(), we'll need to add
        # such a check everywhere we use id_to_rownum
        # invalid_ids = ids > len(id_to_rownum)
        # if np.any(invalid_ids):
        #     fix ids
        rows = id_to_rownum[ids]
        safe_put(result, rows, values)
    return result
def evaluate(self, context):
    """Randomly pick one of `self.choices` for each row of `context`.

    Both `self.bins` and `self.choices` can be given as a 2-item sequence
    whose first item is the string 'dynamic' and whose second item is a
    sequence of expressions to evaluate against `context`.

    Without bins, every choice has the same probability. With bins, a
    uniform draw is located in the bins to select the choice; dynamic bins
    are first evaluated and converted through `self._weights_to_bins`.
    """
    num = context_length(context)
    choices = self.choices
    dynamic_choices = len(choices) == 2 and choices[0] == 'dynamic'
    if num:
        bins = self.bins
        if bins is None:
            # all values have the same probability.
            # For dynamic choices, len(choices) is the length of the
            # ('dynamic', exprs) pair (always 2), not the number of
            # choices, so we need to count the expressions instead.
            num_choices = len(choices[1]) if dynamic_choices \
                else len(choices)
            choices_idx = np.random.randint(num_choices, size=num)
        else:
            if len(bins) == 2 and bins[0] == 'dynamic':
                weights = [expr_eval(expr, context) for expr in bins[1]]
                bins = self._weights_to_bins(weights)
            u = np.random.uniform(size=num)
            choices_idx = np.digitize(u, bins) - 1
    else:
        choices_idx = []

    if dynamic_choices:
        choices = np.array([expr_eval(expr, context)
                            for expr in choices[1]])
    return choices[choices_idx]
def match_cell(idx, sorted_idx, pool_size):
    """Match one cell of set 1 with the best scoring cell of set 2.

    A cell holds several ids (under '__ids__' for set 1 and
    '__other___ids__' for set 2). The set 2 cells still available are
    kept in the global `matching_ctx`.

    idx: not used, except to be passed along in the recursive call
    sorted_idx: index, within set1, of the cell to match
    pool_size: if not None and smaller than the number of set 2 cells
               left, only a random sample of that many cells is scored

    As many ids as possible are paired (both ways) in `result`. If the
    set 2 cell is exhausted, it is removed from `matching_ctx`, otherwise
    it only loses its matched ids. If the set 1 cell still has unmatched
    ids, the function calls itself to match them with another set 2 cell.

    Raises StopIteration when no set 2 cell is left.
    """
    global matching_ctx

    set2_size = context_length(matching_ctx)
    if not set2_size:
        raise StopIteration

    if pool_size is not None and set2_size > pool_size:
        pool = random.sample(xrange(set2_size), pool_size)
        local_ctx = context_subset(matching_ctx, pool)
    else:
        # work on a copy so that the set 1 values injected below do not
        # end up in matching_ctx
        local_ctx = matching_ctx.copy()

    # inject the values of the set 1 cell being matched
    local_ctx.update((k, set1[k][sorted_idx])
                     for k in {'__ids__'} | used_variables1)

    eval_ctx = context.clone(entity_data=local_ctx)
    set2_scores = expr_eval(score, eval_ctx)
    cell2_idx = set2_scores.argmax()

    cell1ids = local_ctx['__ids__']
    # at this point cell2_idx is still relative to local_ctx (ie to the
    # pool when there is one)
    cell2ids = local_ctx['__other___ids__'][cell2_idx]

    if pool_size is not None and set2_size > pool_size:
        # transform pool-local index to set/matching_ctx index
        cell2_idx = pool[cell2_idx]

    cell1size = len(cell1ids)
    cell2size = len(cell2ids)
    nb_match = min(cell1size, cell2size)

    # we could introduce a random choice here but it is not
    # much necessary. In that case, it should be done in group_context
    ids1 = cell1ids[:nb_match]
    ids2 = cell2ids[:nb_match]

    result[id_to_rownum[ids1]] = ids2
    result[id_to_rownum[ids2]] = ids1

    if nb_match == cell2size:
        # the set 2 cell is exhausted
        matching_ctx = context_delete(matching_ctx, cell2_idx)
    else:
        # other variables do not need to be modified since the cell
        # only got smaller and was not deleted
        matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

    # FIXME: the expr gets cached for the full matching_ctx at the
    # beginning and then when another women with the same values is
    # found, it thinks it can reuse the expr but it breaks because it
    # has not the correct length.

    # the current workaround is to invalidate the whole cache for the
    # current entity but this is not the right way to go.
    # * disable the cache for matching?
    # * use a local cache so that methods after matching() can use
    #   what was in the cache before matching(). Shouldn't the cache be
    #   stored inside the context anyway?
    expr_cache.invalidate(context.period, context.entity_name)

    if nb_match < cell1size:
        # some ids of the set 1 cell are still unmatched: keep only those
        # in the cell and try to match them with another set 2 cell
        set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
        match_cell(idx, sorted_idx, pool_size)
def evaluate(self, context):
    """Match the individuals of set 1 with those of set 2 using a score.

    The individuals of set 1 are processed in decreasing `orderby` order.
    Each one is paired with the set 2 individual (among those not matched
    yet) which maximises `self.score_expr`. In the score expression,
    variables prefixed by '__other_' refer to the set 2 individual.

    `self.orderby` is either an expression or one of the strings 'EDtM'
    and 'SDtOM', in which case the order is computed from the variables
    used in the score expression.

    If `self.pool_size` is not None, each set 1 individual is only
    compared to a random sample of at most `self.pool_size` candidates.

    Returns an integer array of the length of `context` holding, for each
    matched individual, the id of its partner and -1 for everybody else.
    """
    global local_ctx

    ctx_filter = context.get('__filter__')
    id_to_rownum = context.id_to_rownum

    # at some point ctx_filter will be cached automatically, so we don't
    # need to take care of it manually here
    if ctx_filter is not None:
        set1filter = expr_eval(ctx_filter & self.set1filter, context)
        set2filter = expr_eval(ctx_filter & self.set2filter, context)
    else:
        set1filter = expr_eval(self.set1filter, context)
        set2filter = expr_eval(self.set2filter, context)

    score_expr = self.score_expr

    used_variables = score_expr.collect_variables(context)
    used_variables1 = [v for v in used_variables
                       if not v.startswith('__other_')]
    # [8:] strips the '__other_' prefix
    used_variables2 = [v[8:] for v in used_variables
                       if v.startswith('__other_')]

    set1 = context_subset(context, set1filter, ['id'] + used_variables1)
    set2 = context_subset(context, set2filter, ['id'] + used_variables2)
    set1len = set1filter.sum()
    set2len = set2filter.sum()
    tomatch = min(set1len, set2len)

    orderby = self.orderby
    if not isinstance(orderby, str):
        order = expr_eval(orderby, context)
    else:
        # NOTE(review): `order` is an integer array, so the fractional
        # part of the values assigned below looks like it gets dropped --
        # confirm this is intended.
        order = np.zeros(context_length(context), dtype=int)
        if orderby == 'EDtM':
            # sum, over the set 1 variables, of the squared deviation from
            # the mean divided by the variance
            for var in used_variables1:
                column = set1[var]
                order[set1filter] += \
                    (column - column.mean())**2/column.var()
        if orderby == 'SDtOM':
            # score of each set 1 individual against the mean values of
            # the set 2 variables
            order_ctx = dict((k, v) for k, v in set1.iteritems())
            order_ctx.update(('__other_' + k, set2[k].mean())
                             for k in used_variables2)
            order[set1filter] = expr_eval(score_expr, order_ctx)

    sorted_set1_indices = order[set1filter].argsort()[::-1]
    set1tomatch = sorted_set1_indices[:tomatch]
    print("matching with %d/%d individuals" % (set1len, set2len))

    # TODO: compute pk_names automatically: variables which are either
    # boolean, or have very few possible values and which are used more
    # than once in the expression and/or which are used in boolean
    # expressions
    # pk_names = ('eduach', 'work')
    # optimized_exprs = {}

    result = np.empty(context_length(context), dtype=int)
    result.fill(-1)

    # the remaining candidates: set 2 with its variables prefixed by
    # '__other_' (like in the score expression)
    set2_names = ['id'] + used_variables2
    local_ctx = dict(('__other_' + k if k in set2_names else k, v)
                     for k, v in set2.iteritems())

    if self.pool_size is None:
        # noinspection PyUnusedLocal
        def match_one_set1_individual(idx, sorted_idx):
            """Match one set 1 individual with the best candidate."""
            global local_ctx

            if not context_length(local_ctx):
                raise StopIteration

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in ['id'] + used_variables1)

            # pk = tuple(individual1[fname] for fname in pk_names)
            # optimized_expr = optimized_exprs.get(pk)
            # if optimized_expr is None:
            #     for name in pk_names:
            #         fake_set1['__f_%s' % name].value = individual1[name]
            #     optimized_expr = str(symbolic_expr.simplify())
            #     optimized_exprs[pk] = optimized_expr
            # set2_scores = evaluate(optimized_expr, mm_dict, set2)
            set2_scores = expr_eval(score_expr, local_ctx)

            individual2_idx = np.argmax(set2_scores)

            id1 = local_ctx['id']
            id2 = local_ctx['__other_id'][individual2_idx]

            # a matched candidate is no longer available
            local_ctx = context_delete(local_ctx, individual2_idx)

            result[id_to_rownum[id1]] = id2
            result[id_to_rownum[id2]] = id1

        loop_wh_progress(match_one_set1_individual, set1tomatch)
    else:
        pool_size = self.pool_size

        # noinspection PyUnusedLocal
        def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
            """Match one set 1 individual with the best candidate of a
            random pool of at most `pool_size` candidates."""
            global local_ctx

            set2_size = context_length(local_ctx)
            if not set2_size:
                raise StopIteration

            if set2_size > pool_size:
                pool = random.sample(xrange(set2_size), pool_size)
            else:
                pool = range(set2_size)

            sub_local_ctx = context_subset(local_ctx, pool, None)
            sub_local_ctx.update((k, set1[k][sorted_idx])
                                 for k in ['id'] + used_variables1)

            set2_scores = expr_eval(score_expr, sub_local_ctx)

            # position of the winner inside the pool, then inside local_ctx
            individual2_pool_idx = np.argmax(set2_scores)
            individual2_idx = pool[individual2_pool_idx]

            id1 = sub_local_ctx['id']
            id2 = local_ctx['__other_id'][individual2_idx]

            # a matched candidate is no longer available
            local_ctx = context_delete(local_ctx, individual2_idx)

            result[id_to_rownum[id1]] = id2
            result[id_to_rownum[id2]] = id1

        # use the configured pool size (this used to be hard-coded to 10,
        # silently ignoring self.pool_size)
        loop_wh_progress(match_one_set1_individual_pool, set1tomatch,
                         pool_size=pool_size)
    return result
def compute(self, context, *expressions, **kwargs):
    """Evaluate an expression on each group defined by `expressions`.

    expressions: one or more expressions; individuals are grouped by each
                 combination of their values

    Keyword arguments:
    expr: expression evaluated on each group (defaults to Count())
    filter: optional filter; only selected individuals are grouped
    percent: if True, results are expressed as a percentage of the grand
             total (this forces totals to True)
    pvalues: possible values for each expression (defaults to the unique
             values found in the filtered columns)
    totals: whether to compute row and column totals (default True)

    Returns a LabeledArray with one dimension per expression.

    Raises TypeError if no expression is given or if one of them is a
    constant or a list/tuple.
    """
    if not expressions:
        raise TypeError("groupby() takes at least 1 argument")

    # TODO: allow lists/tuples of arguments to group by the combinations
    # of keys
    for expr in expressions:
        if isinstance(expr, (bool, int, float)):
            raise TypeError("groupby() does not work with constant "
                            "arguments")
        if isinstance(expr, (tuple, list)):
            raise TypeError("groupby() takes expressions as arguments, "
                            "not a list of expressions")

    # On python 3, we could clean up this code (keyword only arguments).
    expr = kwargs.pop('expr', None)
    if expr is None:
        expr = Count()

    # by = kwargs.pop('by', None)
    filter_value = kwargs.pop('filter', None)
    percent = kwargs.pop('percent', False)
    possible_values = kwargs.pop('pvalues', None)
    totals = kwargs.pop('totals', True)

    expr_vars = [v.name for v in collect_variables(expr)]
    labels = [str(e) for e in expressions]
    columns = [expr_eval(e, context) for e in expressions]
    columns = [expand(c, context_length(context)) for c in columns]

    if filter_value is not None:
        filtered_columns = [col[filter_value] for col in columns]
        # FIXME: use the actual filter_expr instead of not_hashable
        filtered_context = context.subset(filter_value, expr_vars,
                                          not_hashable)
    else:
        filtered_columns = columns
        filtered_context = context

    if possible_values is None:
        possible_values = [np.unique(col) for col in filtered_columns]

    # We pre-filtered columns instead of passing the filter to partition_nd
    # because it is a bit faster this way. The indices are still correct,
    # because we use them on a filtered_context.
    groups = partition_nd(filtered_columns, True, possible_values)
    if not groups:
        return LabeledArray([], labels, possible_values)

    # evaluate the expression on each group
    # we use not_hashable to avoid storing the subset in the cache
    contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in groups]
    data = [expr_eval(expr, c) for c in contexts]

    # TODO: use group_indices_nd directly to avoid using np.unique
    # this is twice as fast (unique is very slow) but breaks because
    # the rest of the code assumes all combinations are present
    # if self.filter is not None:
    #     filter_value = expr_eval(self.filter, context)
    # else:
    #     filter_value = True
    #
    # d = group_indices_nd(columns, filter_value)
    # pvalues = sorted(d.keys())
    # ndim = len(columns)
    # possible_values = [[pv[i] for pv in pvalues]
    #                    for i in range(ndim)]
    # groups = [d[k] for k in pvalues]

    # groups is a (flat) list of list.
    # the first variable is the outer-most "loop",
    # the last one the inner most.

    # add total for each row
    len_pvalues = [len(vals) for vals in possible_values]

    if percent:
        # percentages are computed relative to the grand total
        totals = True

    if totals:
        # view the flat list of groups as a 2D table: the last expression
        # gives the columns, all the other ones combined give the rows
        width = len_pvalues[-1]
        height = prod(len_pvalues[:-1])
        rows_indices = [np.concatenate([groups[y * width + x]
                                        for x in range(width)])
                        for y in range(height)]
        cols_indices = [np.concatenate([groups[y * width + x]
                                        for y in range(height)])
                        for x in range(width)]
        # the last "column" is the grand total (all the groups combined)
        cols_indices.append(np.concatenate(cols_indices))

        # evaluate the expression on each "combined" group (ie compute totals)
        row_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in rows_indices]
        row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
        col_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in cols_indices]
        col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]
    else:
        row_totals = None
        col_totals = None

    if percent:
        # convert to np.float64 to get +-inf if total_value is int(0)
        # instead of Python's built-in behaviour of raising an exception.
        # This can happen at least when using the default expr (count())
        # and the filter yields empty groups
        total_value = np.float64(col_totals[-1])
        data = [100.0 * value / total_value for value in data]
        row_totals = [100.0 * value / total_value for value in row_totals]
        col_totals = [100.0 * value / total_value for value in col_totals]

    # if self.by or self.percent:
    #     if self.percent:
    #         total_value = data[-1]
    #         divisors = [total_value for _ in data]
    #     else:
    #         num_by = len(self.by)
    #         inc = prod(len_pvalues[-num_by:])
    #         num_groups = len(groups)
    #         num_categories = prod(len_pvalues[:-num_by])
    #
    #         categories_groups_idx = [range(cat_idx, num_groups, inc)
    #                                  for cat_idx in range(num_categories)]
    #
    #         divisors = ...
    #
    #     data = [100.0 * value / divisor
    #             for value, divisor in izip(data, divisors)]

    # convert to a 1d array. We don't simply use data = np.array(data),
    # because if data is a list of ndarray (for example if we use
    # groupby(a, expr=id), *and* all the ndarrays have the same length,
    # the result is a 2d array instead of an array of ndarrays like we
    # need (at this point).
    arr = np.empty(len(data), dtype=type(data[0]))
    arr[:] = data
    data = arr

    # and reshape it
    data = data.reshape(len_pvalues)
    return LabeledArray(data, labels, possible_values,
                        row_totals, col_totals)
def evaluate(self, context):
    """Create new individuals in the target entity.

    The individuals to create are determined either by a filter (the
    combination of `self.filter` and the contextual '__filter__', one new
    individual per selected row) or by `self.number`.

    If `self.num_duplicate` is not None, it is the name of a column of the
    target array giving how many times each row (with a value > 0) is
    repeated. `self.expand` and `self.numerotation` trigger extra
    computations on the duplicated rows (new 'res' values and a sequence
    number stored in the `self.numerotation` column, respectively).

    Returns, when a filter was used:
    * if `self.return_option` is None: an array of the length of `context`
      with the id of the individual created for each selected row and -1
      elsewhere,
    * if `self.return_option` is 'father': an array holding, for the new
      rows, the id they had before being given a new one and -1 elsewhere.
    Returns None otherwise.

    Raises Exception if there is neither a filter nor a number.
    """
    source_entity = context['__entity__']
    if self.entity_name is None:
        target_entity = source_entity
    else:
        target_entity = entity_registry[self.entity_name]

    # target context is the context where the new individuals are created
    if target_entity is source_entity:
        target_context = context
    else:
        target_context = EntityContext(target_entity,
                                       {'period': context['period']})

    # combine the explicit filter with the contextual one
    ctx_filter = context.get('__filter__')
    if self.filter is not None and ctx_filter is not None:
        filter_expr = ctx_filter & self.filter
    elif self.filter is not None:
        filter_expr = self.filter
    elif ctx_filter is not None:
        filter_expr = ctx_filter
    else:
        filter_expr = None

    if filter_expr is not None:
        to_give_birth = expr_eval(filter_expr, context)
        num_birth = to_give_birth.sum()
    elif self.number is not None:
        to_give_birth = None
        num_birth = self.number
    else:
        raise Exception('no filter nor number in "new"')

    array = target_entity.array

    id_to_rownum = target_entity.id_to_rownum
    num_individuals = len(id_to_rownum)

    children = self._initial_values(array, to_give_birth, num_birth)

    # select real duplication case
    if self.num_duplicate is not None:
        # number of copies for each row having a positive number of copies
        number_rep = array[self.num_duplicate].compress( array[self.num_duplicate]>0 )
        children = children.repeat(number_rep,axis=0)
        num_birth = number_rep.sum()

        if self.expand==True:
            from numpy.lib.stride_tricks import as_strided

            # for each duplicated row, the sequence 0..number_rep-1: build
            # a (virtual) 2D array where each line is
            # arange(number_rep.max()) then keep only the first number_rep
            # items of each line
            id_add = np.arange(number_rep.max())
            id_add = as_strided(id_add ,
                                shape=number_rep.shape + id_add.shape,
                                strides=(0,) + id_add.strides)
            id_add = id_add[id_add < number_rep[:, None]]
            one_by_house = array['res'].compress( array[self.num_duplicate]>0 )
            # indices = np.unique(one_by_house)
            # size_by_id = np.bincount(one_by_house)
            # size_by_id = size_by_id.compress(size_by_id>0)
            # size_by_id = size_by_id.repeat(size_by_id)
            id_ini = one_by_house.repeat(number_rep,axis=0)
            decalage = np.zeros(len(one_by_house),dtype=int)
            # position of the first occurrence of each distinct 'res' value
            indices = np.unique(one_by_house,return_index=True)[1]
            # NOTE(review): the left-hand side selects one item less than
            # the right-hand side (indices[1:] vs indices) -- confirm this
            # works as intended when there are several distinct values.
            decalage[indices[1:]] = number_rep[indices]
            decalage = decalage.cumsum().repeat(number_rep,axis=0)
            # decalage = decalage - decalage[0]
            # new 'res' values start after the current maximum
            children['res'] = id_add+decalage+ array['res'].max()+1

    # ids of the children before they are given new ones below (used for
    # the 'father' return option)
    remember_id = children['id'].copy()

    if num_birth:
        # new ids are allocated after the existing ones
        children['id'] = np.arange(num_individuals,
                                   num_individuals + num_birth)
        children['period'] = context['period']

        used_variables = self._collect_kwargs_variables(context)
        # NOTE(review): this first child_context is always overwritten by
        # one of the two branches below.
        child_context = context_subset(context, to_give_birth, used_variables)
        if to_give_birth is None:
            child_context = new_context_like(context, length=num_birth)
        else:
            child_context = context_subset(context, to_give_birth,
                                           used_variables)
        for k, v in self.kwargs.iteritems():
            children[k] = expr_eval(v, child_context)

        if self.numerotation is not None:
            from numpy.lib.stride_tricks import as_strided
            initial = np.zeros(len(array), dtype=bool)
            # same construction as for id_add above but the sequence
            # starts at 1
            # NOTE(review): number_rep is only defined when
            # self.num_duplicate is not None.
            id_dup = np.arange(number_rep.max())
            id_dup = as_strided(id_dup ,
                                shape=number_rep.shape + id_dup.shape,
                                strides=(0,) + id_dup.strides)
            id_dup = id_dup[id_dup < number_rep[:, None]] +1
            children[self.numerotation] = id_dup

    add_individuals(target_context, children)

    # result is the ids of the new individuals corresponding to the source
    # entity
    # I change here to have the "father" name instead
    if to_give_birth is not None:
        if self.return_option is None:
            result = np.empty(context_length(context), dtype=int)
            result.fill(-1)
            # TODO: must change something to have father size correct with
            # target and not with source.
            if source_entity is target_entity:
                # the context got longer by num_birth rows: extend the
                # filter accordingly
                extra_bools = np.zeros(num_birth, dtype=bool)
                to_give_birth = np.concatenate((to_give_birth, extra_bools))
            # Note that np.place is a tad faster, but is currently buggy when
            # working with columns of structured arrays.
            # See http://projects.scipy.org/numpy/ticket/1869
            result[to_give_birth] = children['id']
            return result
        elif self.return_option=='father' :
            father = np.empty(context_length(context), dtype=int)
            father.fill(-1)
            # select the num_birth rows located after the len(array)
            # first ones
            list_children = np.ones(num_birth, dtype=bool)
            initial = np.zeros(len(array), dtype=bool)
            birth = np.concatenate((initial, list_children))
            father[birth] = remember_id
            return father
    else:
        return None
def compute(self, context, entity_name=None, filter=None, number=None,
            **kwargs):
    """Add new individuals to an entity and return their ids per parent.

    The new individuals go to ``entity_name`` (the entity of *context* when
    None). Parents are those selected by the filter returned by
    ``self._getfilter``; if there is none, either ``number`` individuals are
    created, or every individual of *context* gives birth. Each keyword
    argument is an expression evaluated for the children and stored in the
    field of the same name; unknown field names are reported and skipped.

    Returns None when ``number`` was used, otherwise an int array of the
    length of *context* with the id of the child for each parent and -1
    elsewhere.

    Raises ValueError when both ``filter`` and ``number`` are given.
    """
    if not (filter is None or number is None):
        # Having neither is allowed, though, as there can be a contextual
        # filter. Also, there is no reason to prevent the whole
        # population giving birth, even though the usefulness of such
        # usage seem dubious.
        raise ValueError("new() 'filter' and 'number' arguments are "
                         "mutually exclusive")

    source_entity = context.entity
    target_entity = (source_entity if entity_name is None
                     else context.entities[entity_name])
    same_entity = target_entity is source_entity

    # the target context is where the new individuals will be created
    if same_entity:
        target_context = context
    else:
        # we do need to copy the data (.extra) because we will insert into
        # the entity.array anyway => fresh_data=True
        target_context = context.clone(fresh_data=True,
                                       entity_name=target_entity.name)

    filter_expr = self._getfilter(context, filter)
    if filter_expr is None and number is None:
        parents = np.ones(len(context), dtype=bool)
        num_new = len(context)
    elif filter_expr is None:
        parents = None
        num_new = number
    else:
        parents = expr_eval(filter_expr, context)
        num_new = parents.sum()

    array = target_entity.array
    default_values = target_entity.fields.default_values
    first_new_id = len(target_entity.id_to_rownum)
    children = self._initial_values(array, parents, num_new, default_values)

    if num_new:
        children['id'] = np.arange(first_new_id, first_new_id + num_new)
        children['period'] = context.period

        used_variables = [
            var.name for var in self._collect_kwargs_variables(kwargs)
        ]
        if parents is not None:
            child_context = context.subset(parents, used_variables,
                                           filter_expr)
        else:
            assert not used_variables
            child_context = context.empty(num_new)

        known_fields = array.dtype.names
        for field, value_expr in kwargs.iteritems():
            if field in known_fields:
                children[field] = expr_eval(value_expr, child_context)
            else:
                print("WARNING: {} is unknown, ignoring it!".format(field))

    add_individuals(target_context, children)
    expr_cache.invalidate(context.period, context.entity_name)

    if parents is None:
        return None

    # ids of the new individuals, aligned on the source entity
    new_ids = np.full(context_length(context), -1, dtype=int)
    if same_entity:
        # the context grew by num_new rows: pad the mask accordingly
        padding = np.zeros(num_new, dtype=bool)
        parents = np.concatenate((parents, padding))
    # Note that np.place is a bit faster, but is currently buggy when
    # working with columns of structured arrays.
    # See https://github.com/numpy/numpy/issues/2462
    new_ids[parents] = children['id']
    return new_ids
def evaluate(self, context):
    """Match individuals of set 1 with individuals of set 2 by score.

    Both sets are selected by ``self.set1filter`` / ``self.set2filter``
    (combined with the contextual ``__filter__`` when there is one). Set 1
    is processed by decreasing value of ``self.orderby``.

    With ``self.option == "optimal"``, a cost matrix of the scores is built
    and handed to ``MunkresX.maxWeightMatching``. Otherwise each set 1
    individual is greedily matched with the highest-scoring remaining set 2
    individual, which is then removed from the candidates.

    Returns an int array of the length of *context* filled with -1 except
    for matched individuals, which hold the id of their match.

    Note: the candidates (``local_ctx``) and ``cost`` are module-level
    globals rebound by this method and its nested callbacks.
    """
    global local_ctx
    global cost

    ctx_filter = context.get('__filter__')

    id_to_rownum = context.id_to_rownum

    # at some point ctx_filter will be cached automatically, so we don't
    # need to take care of it manually here
    if ctx_filter is not None:
        set1filter = expr_eval(ctx_filter & self.set1filter, context)
        set2filter = expr_eval(ctx_filter & self.set2filter, context)
    else:
        set1filter = expr_eval(self.set1filter, context)
        set2filter = expr_eval(self.set2filter, context)

    score_expr = self.score_expr

    # variables prefixed with '__other_' refer to set 2; v[8:] strips
    # that 8-character prefix
    used_variables = score_expr.collect_variables(context)
    used_variables1 = ['id'] + [v for v in used_variables
                                if not v.startswith('__other_')]
    used_variables2 = ['id'] + [v[8:] for v in used_variables
                                if v.startswith('__other_')]

    set1 = context_subset(context, set1filter, used_variables1)
    set2 = context_subset(context, set2filter, used_variables2)
    orderby = expr_eval(self.orderby, context)
    # indices (local to set 1) by decreasing orderby value
    sorted_set1_indices = orderby[set1filter].argsort()[::-1]
    print "matching with %d/%d individuals" % (set1filter.sum(),
                                               set2filter.sum())

    #TODO: compute pk_names automatically: variables which are either
    # boolean, or have very few possible values and which are used more
    # than once in the expression and/or which are used in boolean
    # expressions
    # pk_names = ('eduach', 'work')
    # optimized_exprs = {}

    result = np.empty(context_length(context), dtype=int)
    result.fill(-1)

    # set 2 columns used by the score are exposed as '__other_<name>'
    local_ctx = dict(('__other_' + k if k in used_variables2 else k, v)
                     for k, v in set2.iteritems())
    # print local_ctx
    # test=local_ctx.copy()
    # test.update((k, set1[k]) for k in used_variables1)

    # ######## Munkres attempt
    if self.option == "optimal":
        cost = []

        def create_cost(idx, sorted_idx):
            # appends one row of scores (this set 1 individual against all
            # set 2 candidates) to the global cost matrix
            global cost
            if not context_length(local_ctx):
                raise StopIteration
            local_ctx.update((k, set1[k][sorted_idx])
                             for k in used_variables1)

            set2_scores = expr_eval(score_expr, local_ctx)
            cost.append(set2_scores[:].tolist())

        loop_wh_progress(create_cost, sorted_set1_indices)
        resultat = MunkresX.maxWeightMatching(cost)
        # NOTE(review): the keys/values returned by maxWeightMatching are
        # used directly as ids here -- confirm they are not matrix indices.
        for id1,id2 in resultat.items():
            result[id_to_rownum[id1]] = id2
            result[id_to_rownum[id2]] = id1
        return result
    else :
        def match_one_set1_individual(idx, sorted_idx):
            global local_ctx

            # stop as soon as there is no candidate left in set 2
            if not context_length(local_ctx):
                raise StopIteration

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in used_variables1)

            set2_scores = expr_eval(score_expr, local_ctx)
            # print set2_scores

            individual2_idx = np.argmax(set2_scores)

            id1 = local_ctx['id']
            id2 = local_ctx['__other_id'][individual2_idx]

            # the matched candidate is no longer available
            local_ctx = context_delete(local_ctx, individual2_idx)

            result[id_to_rownum[id1]] = id2
            result[id_to_rownum[id2]] = id1

        loop_wh_progress(match_one_set1_individual, sorted_set1_indices)
        return result
def build_context(self, context):
    """Store one uniform random draw per individual in *context*.

    The draws are stored under the key ``self.u_varname`` and the (mutated)
    context is returned.
    """
    num_individuals = context_length(context)
    context[self.u_varname] = np.random.uniform(size=num_individuals)
    return context
def align_link(self, context, score, need, filter, take, leave,
               expressions, possible_values, errors, frac_need, link,
               secondary_axis, method):
    """Align individuals of *context* on categories of linked individuals.

    The alignment columns (*expressions*) are evaluated in the target
    context of *link*; candidates are counted per category with a GroupBy
    and the selection itself is delegated to ``align_link_nd``.

    *secondary_axis* is either an expression naming one axis of the need
    array or an axis number. ``take``, ``leave`` and ``method`` are not
    used in this body.

    Returns the ``aligned`` value produced by ``align_link_nd``; the error
    it reports is stored in ``self.past_error``.

    Raises ValueError when the secondary axis name is not an axis of the
    need array and Exception when its number is out of range.
    """
    target_context = link._target_context(context)

    need, expressions, possible_values = \
        self._eval_need(context, need, expressions, possible_values,
                        target_context)

    # handle secondary axis
    if isinstance(secondary_axis, Expr):
        axis_name = str(secondary_axis)
        try:
            secondary_axis = need.dim_names.index(axis_name)
        except ValueError:
            raise ValueError("invalid value for secondary_axis: there is "
                             "no axis named '%s' in the need array"
                             % axis_name)
    else:
        if secondary_axis >= need.ndim:
            raise Exception("%d is an invalid value for secondary_axis: "
                            "it should be smaller than the number of "
                            "dimension of the need array (%d)"
                            % (secondary_axis, need.ndim))

    # evaluate columns
    target_columns = [expr_eval(e, target_context) for e in expressions]
    # this is a one2many, so the link column is on the target side
    link_column = target_context[link._link_field]

    filter_expr = self._getfilter(context, filter)
    if filter_expr is not None:
        # the filter is expressed on the source entity: evaluate it from
        # the target side through a reverse many2one link
        reverse_link = Many2One("reverse", link._link_field,
                                context.entity.name)
        target_filter = LinkGet(reverse_link, filter_expr, False)
        target_filter_value = expr_eval(target_filter, target_context)

        # It is often not a good idea to pre-filter columns like this
        # because we loose information about "indices", but in this case,
        # it is fine, because we do not need that information afterwards.
        filtered_columns = [
            col[target_filter_value]
            if isinstance(col, np.ndarray) and col.shape
            else [col]
            for col in target_columns
        ]

        link_column = link_column[target_filter_value]
    else:
        filtered_columns = target_columns
        target_filter_value = None

    # compute labels for filtered columns
    # -----------------------------------

    # We can't use _group_labels_light because group_labels assigns labels
    # on a first come, first served basis, not using the order they are
    # in pvalues
    fcols_labels = []
    filtered_length = len(filtered_columns[0])
    # True for rows having at least one value which is not in pvalues
    unaligned = np.zeros(filtered_length, dtype=bool)
    for fcol, pvalues in zip(filtered_columns, need.pvalues):
        pvalues_index = dict((v, i) for i, v in enumerate(pvalues))
        fcol_labels = np.empty(filtered_length, dtype=np.int32)
        for i in range(filtered_length):
            # -1 marks a value which is not one of the possible values
            value_idx = pvalues_index.get(fcol[i], -1)
            if value_idx == -1:
                unaligned[i] = True
            fcol_labels[i] = value_idx
        fcols_labels.append(fcol_labels)

    num_unaligned = np.sum(unaligned)
    if num_unaligned:
        # further filter label columns and link_column
        validlabels = ~unaligned
        fcols_labels = [labels[validlabels] for labels in fcols_labels]
        link_column = link_column[validlabels]

        # display who are the evil ones
        ids = target_context['id']
        if target_filter_value is not None:
            filtered_ids = ids[target_filter_value]
        else:
            filtered_ids = ids
        self._display_unaligned(expressions, filtered_ids,
                                filtered_columns, unaligned)
    else:
        del unaligned

    id_to_rownum = context.id_to_rownum
    missing_int = missing_values[int]
    source_ids = link_column

    if len(id_to_rownum):
        source_rows = id_to_rownum[source_ids]
        # filter out missing values: those where the value of the link
        # points to nowhere (-1)
        source_rows[source_ids == missing_int] = missing_int
    else:
        assert np.all(source_ids == missing_int)
        source_rows = []

    # filtered_columns are not filtered further on invalid labels
    # (num_unaligned) but this is not a problem since those will be
    # ignored by GroupBy anyway.

    # TODO: this is ugly because a groupby on "values", returns an LArray
    # with those values (ndarrays) as axes names. Ugh.
    groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

    # FIXME: target_context is not correct, as it is not filtered while
    # filtered_columns are. Since we do not use the context "columns" it
    # mostly works but I had to disable an assertion in utils.expand
    # because the length of the context is not correct.
    num_candidates = expr_eval(groupby_expr, target_context)

    # fetch the list of linked individuals for each local individual.
    # e.g. the list of person ids for each household
    hh = np.empty(context_length(context), dtype=object)
    # we can't use .fill([]) because it reuses the same list for all
    # objects
    for i in range(len(hh)):
        hh[i] = []

    # Even though this is highly sub-optimal, the time taken to create
    # those lists of ids is very small compared to the total time taken
    # for align_other (0.2s vs 4.26), so I shouldn't care too much about
    # it for now.

    # target_row (row of person) is an index valid for *filtered/label*
    # columns !
    for target_row, source_row in enumerate(source_rows):
        if source_row == -1:
            continue
        hh[source_row].append(target_row)

    class FakeContainer(object):
        # minimal sized object: only len() is defined, returning the
        # number of candidates of the group
        def __init__(self, length):
            self.length = length

        def __len__(self):
            return self.length

    groups = [FakeContainer(g) for g in num_candidates]
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, frac_need)
    need = self._add_past_error(context, need, errors)
    # need = np.asarray(need)
    need = np.asarray(need)
    aligned, error = \
        align_link_nd(score, need, num_candidates, hh, fcols_labels,
                      secondary_axis)
    # kept so that a later call can take this error into account
    # (presumably through _add_past_error -- verify)
    self.past_error = error
    return aligned
def compute(self, context, set1filter, set2filter, score, orderby,
            pool_size=None, algo='onebyone'):
    """Match the individuals of two sets according to a score expression.

    Set 1 "cells" are processed by decreasing order value; each one is
    matched with the set 2 cell having the highest score, which is removed
    from the candidates once it has no individual left.

    Args:
        set1filter, set2filter: filters selecting each set (processed by
            ``self._getfilter``).
        score: expression; variables prefixed by ``__other_`` refer to
            set 2, the others to set 1.
        orderby: expression giving the processing order of set 1, or the
            string 'EDtM', in which case the order value is the sum, over
            the set 1 variables of the score, of
            ``(column - mean) ** 2 / var``.
        pool_size: when not None and smaller than the number of remaining
            set 2 cells, scores are only computed on a random sample of
            that many cells.
        algo: 'onebyone' (one individual per cell) or 'byvalue' (cells
            built by ``group_context``).

    Returns:
        An int array of the length of *context*, -1 for unmatched
        individuals and the id of the partner otherwise.

    Note: the remaining candidates are kept in the module-level global
    ``matching_ctx``, which this method and its nested callback rebind.
    """
    global matching_ctx

    if pool_size is not None:
        assert isinstance(pool_size, int)
        assert pool_size > 0

    set1filterexpr = self._getfilter(context, set1filter)
    set1filtervalue = expr_eval(set1filterexpr, context)
    set2filterexpr = self._getfilter(context, set2filter)
    set2filtervalue = expr_eval(set2filterexpr, context)
    set1len = set1filtervalue.sum()
    set2len = set2filtervalue.sum()
    print("matching with %d/%d individuals" % (set1len, set2len), end='')

    # n[8:] strips the 8-character '__other_' prefix
    varnames = {v.name for v in score.collect_variables()}
    used_variables1 = {n for n in varnames if not n.startswith('__other_')}
    used_variables2 = {n[8:] for n in varnames if n.startswith('__other_')}

    if isinstance(orderby, str):
        assert orderby == 'EDtM'
        orderby_vars = used_variables1
    else:
        orderby_vars = {v.name for v in orderby.collect_variables()}

    if algo == 'onebyone':
        all_vars = {'id'} | used_variables1 | orderby_vars
        set1 = context.subset(set1filtervalue, all_vars, set1filterexpr)
        set2 = context.subset(set2filtervalue, {'id'} | used_variables2,
                              set2filterexpr)

        # subset creates a dict for the current entity, so .entity_data is a
        # dict
        set1 = set1.entity_data
        set2 = set2.entity_data

        # one cell per individual: a column of single-id rows
        set1['__ids__'] = set1['id'].reshape(set1len, 1)
        set2['__ids__'] = set2['id'].reshape(set2len, 1)

        print()
    else:
        # optimized matching by grouping sets by values, which usually
        # means smaller sets and improved running time.
        assert algo == 'byvalue'

        # if orderby contains variables that are not used in the score
        # expression, this will effectively add variables in the
        # matching context AND group by those variables. This is correct
        # because otherwise (if we did not group by them), we could have
        # groups containing individuals with different values of the
        # ordering variables (ie the ordering would not be respected).
        set1 = group_context(used_variables1 | orderby_vars,
                             set1filtervalue, context)
        set2 = group_context(used_variables2, set2filtervalue, context)

        # we cannot simply take the [:min(set1len, set2len)] indices like in
        # the non-optimized case and iterate over that because we don't know
        # how many groups we will need to match.
        print(" (%d/%d groups)"
              % (context_length(set1), context_length(set2)))

    if isinstance(orderby, str):
        orderbyvalue = np.zeros(context_length(set1))
        for name in used_variables1:
            column = set1[name]
            orderbyvalue += (column - column.mean()) ** 2 / column.var()
    else:
        orderbyvalue = expr_eval(orderby, context.clone(entity_data=set1))

    # Delete variables which are not in the score expression (but in the
    # orderby expr or possibly "id") because they are no longer needed and
    # would slow things down.
    context_keep(set1, used_variables1)
    context_keep(set2, used_variables2)

    # set 1 cells by decreasing order value
    sorted_set1_indices = orderbyvalue.argsort()[::-1]

    result = np.full(context_length(context), -1, dtype=int)
    id_to_rownum = context.id_to_rownum

    # prefix all keys except __len__
    matching_ctx = {'__other_' + k if k != '__len__' else k: v
                    for k, v in set2.iteritems()}

    def match_cell(idx, sorted_idx, pool_size):
        # matches the set 1 cell at sorted_idx with the best remaining
        # set 2 cell; calls itself again while the set 1 cell still has
        # unmatched individuals
        global matching_ctx

        set2_size = context_length(matching_ctx)
        # no candidate left: stop the whole loop
        if not set2_size:
            raise StopIteration

        if pool_size is not None and set2_size > pool_size:
            pool = random.sample(xrange(set2_size), pool_size)
            local_ctx = context_subset(matching_ctx, pool)
        else:
            local_ctx = matching_ctx.copy()

        local_ctx.update((k, set1[k][sorted_idx])
                         for k in {'__ids__'} | used_variables1)

        eval_ctx = context.clone(entity_data=local_ctx)
        set2_scores = expr_eval(score, eval_ctx)
        cell2_idx = set2_scores.argmax()

        cell1ids = local_ctx['__ids__']
        cell2ids = local_ctx['__other___ids__'][cell2_idx]

        if pool_size is not None and set2_size > pool_size:
            # transform pool-local index to set/matching_ctx index
            cell2_idx = pool[cell2_idx]

        cell1size = len(cell1ids)
        cell2size = len(cell2ids)
        nb_match = min(cell1size, cell2size)

        # we could introduce a random choice here but it is not
        # much necessary. In that case, it should be done in group_context
        ids1 = cell1ids[:nb_match]
        ids2 = cell2ids[:nb_match]

        result[id_to_rownum[ids1]] = ids2
        result[id_to_rownum[ids2]] = ids1

        if nb_match == cell2size:
            # the set 2 cell is exhausted
            matching_ctx = context_delete(matching_ctx, cell2_idx)
        else:
            # other variables do not need to be modified since the cell
            # only got smaller and was not deleted
            matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

        # FIXME: the expr gets cached for the full matching_ctx at the
        # beginning and then when another women with the same values is
        # found, it thinks it can reuse the expr but it breaks because it
        # has not the correct length.

        # the current workaround is to invalidate the whole cache for the
        # current entity but this is not the right way to go.
        # * disable the cache for matching?
        # * use a local cache so that methods after matching() can use
        # what was in the cache before matching(). Shouldn't the cache be
        # stored inside the context anyway?
        expr_cache.invalidate(context.period, context.entity_name)

        if nb_match < cell1size:
            # some individuals of the set 1 cell are still unmatched: keep
            # only those and look for another set 2 cell
            set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
            match_cell(idx, sorted_idx, pool_size)

    loop_wh_progress(match_cell, sorted_set1_indices, pool_size)
    return result
def align_link(self, context):
    """Align individuals of *context* on categories of linked individuals.

    Scores come from ``self.expr``. The alignment columns are evaluated in
    the target context of ``self.link``, candidates are counted per category
    with a GroupBy, and the selection is delegated to ``align_link_nd``.

    Returns the ``aligned`` value produced by ``align_link_nd``; the error
    it reports is stored in ``self.past_error``.
    """
    scores = expr_eval(self.expr, context)

    need, expressions, possible_values = self._eval_need(context)
    need = self._add_past_error(self._handle_frac_need(need), context)

    # resolve the secondary axis to an axis number
    secondary_axis = self.secondary_axis
    if not isinstance(secondary_axis, Expr):
        if secondary_axis >= need.ndim:
            raise Exception("%d is an invalid value for secondary_axis: "
                            "it should be smaller than the number of "
                            "dimension of the need array (%d)"
                            % (secondary_axis, need.ndim))
    else:
        axis_name = str(secondary_axis)
        try:
            secondary_axis = need.dim_names.index(axis_name)
        except ValueError:
            raise ValueError("invalid value for secondary_axis: there is "
                             "no axis named '%s' in the need array"
                             % axis_name)

    # columns are evaluated on the target side of the link
    target_context = self.link._target_context(context)
    target_columns = [expr_eval(e, target_context) for e in expressions]
    # this is a one2many, so the link column is on the target side
    link_column = expr_eval(Variable(self.link._link_field), target_context)

    filter_expr = self._getfilter(context)
    if filter_expr is None:
        target_filter_value = None
        filtered_columns = target_columns
    else:
        reverse_link = Many2One("reverse", self.link._link_field,
                                context['__entity__'].name)
        target_filter = LinkValue(reverse_link, filter_expr, False)
        target_filter_value = expr_eval(target_filter, target_context)

        # Pre-filtering columns loses the information about "indices", but
        # that information is not needed afterwards.
        filtered_columns = []
        for col in target_columns:
            if isinstance(col, np.ndarray) and col.shape:
                filtered_columns.append(col[target_filter_value])
            else:
                filtered_columns.append([col])
        link_column = link_column[target_filter_value]

    # compute labels for filtered columns
    # -----------------------------------
    # _group_labels_light cannot be used because group_labels assigns labels
    # on a first come, first served basis, not in the order of pvalues
    filtered_length = len(filtered_columns[0])
    unaligned = np.zeros(filtered_length, dtype=bool)
    fcols_labels = []
    for fcol, pvalues in zip(filtered_columns, need.pvalues):
        label_of_value = {value: label
                          for label, value in enumerate(pvalues)}
        fcol_labels = np.empty(filtered_length, dtype=np.int32)
        for row in range(filtered_length):
            label = label_of_value.get(fcol[row], -1)
            if label == -1:
                unaligned[row] = True
            fcol_labels[row] = label
        fcols_labels.append(fcol_labels)

    num_unaligned = np.sum(unaligned)
    if not num_unaligned:
        del unaligned
    else:
        # further filter label columns and link_column
        validlabels = ~unaligned
        fcols_labels = [labels[validlabels] for labels in fcols_labels]
        link_column = link_column[validlabels]

        # display who are the evil ones
        ids = target_context['id']
        filtered_ids = (ids if target_filter_value is None
                        else ids[target_filter_value])
        self._display_unaligned(expressions, filtered_ids,
                                filtered_columns, unaligned)

    id_to_rownum = context.id_to_rownum
    missing_int = missing_values[int]
    if not len(id_to_rownum):
        assert np.all(link_column == missing_int)
        source_rows = []
    else:
        source_rows = id_to_rownum[link_column]
        # links pointing to nowhere (-1) must stay missing
        source_rows[link_column == missing_int] = missing_int

    # filtered_columns are not filtered further on invalid labels
    # (num_unaligned) but this is not a problem since those will be
    # ignored by GroupBy anyway.
    groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

    # target_context is not technically correct, as it is not "filtered"
    # while filtered_columns are, but since we don't use the context
    # "columns", it does not matter.
    num_candidates = expr_eval(groupby_expr, target_context)

    # list of linked individuals for each local individual, e.g. the list
    # of person rows for each household. .fill([]) cannot be used because
    # it would share one single list between all cells.
    hh = np.empty(context_length(context), dtype=object)
    for row in range(len(hh)):
        hh[row] = []

    # target_row (row of person) is an index valid for *filtered/label*
    # columns !
    for target_row, source_row in enumerate(source_rows):
        if source_row != -1:
            hh[source_row].append(target_row)

    aligned, error = align_link_nd(scores, need, num_candidates, hh,
                                   fcols_labels, secondary_axis)
    self.past_error = error
    return aligned
def compute(self, context, *expressions, **kwargs):
    """Evaluate an expression on each combination of grouping values.

    Positional arguments are the grouping expressions. Recognised keyword
    arguments: ``expr`` (expression evaluated on each group, ``Count()`` by
    default), ``filter`` (already evaluated filter), ``percent`` (express
    results as a percentage of the grand total) and ``pvalues`` (possible
    values for each grouping expression, computed with ``np.unique`` when
    not given).

    Returns a LabeledArray with one cell per combination, plus row and
    column totals (except when there is no group at all).

    Raises TypeError when called without grouping expression, or with a
    constant or a list/tuple as grouping expression.
    """
    if len(expressions) == 0:
        raise TypeError("groupby() takes at least 1 argument")

    # TODO: allow lists/tuples of arguments to group by the combinations
    # of keys
    for arg in expressions:
        if isinstance(arg, (bool, int, float)):
            raise TypeError("groupby() does not work with constant "
                            "arguments")
        if isinstance(arg, (tuple, list)):
            raise TypeError("groupby() takes expressions as arguments, "
                            "not a list of expressions")

    # On python 3, we could clean up this code (keyword only arguments).
    expr = kwargs.pop('expr', None)
    if expr is None:
        expr = Count()
    filter_value = kwargs.pop('filter', None)
    percent = kwargs.pop('percent', False)
    possible_values = kwargs.pop('pvalues', None)

    expr_vars = [var.name for var in collect_variables(expr)]
    labels = [str(e) for e in expressions]

    columns = [expr_eval(e, context) for e in expressions]
    num_rows = context_length(context)
    columns = [expand(col, num_rows) for col in columns]

    if filter_value is None:
        filtered_columns = columns
        filtered_context = context
    else:
        filtered_columns = [col[filter_value] for col in columns]
        # FIXME: use the actual filter_expr instead of not_hashable
        filtered_context = context.subset(filter_value, expr_vars,
                                          not_hashable)

    if possible_values is None:
        possible_values = [np.unique(col) for col in filtered_columns]

    # Columns are pre-filtered instead of passing the filter to
    # partition_nd because it is a bit faster this way. The indices are
    # still correct because they are used on filtered_context.
    groups = partition_nd(filtered_columns, True, possible_values)
    if not groups:
        return LabeledArray([], labels, possible_values)

    def evaluate_on(index_lists):
        # not_hashable avoids storing the subsets in the cache
        subsets = [filtered_context.subset(indices, expr_vars, not_hashable)
                   for indices in index_lists]
        return [expr_eval(expr, subset) for subset in subsets]

    # evaluate the expression on each group
    data = evaluate_on(groups)

    # groups is a (flat) list of lists: the first variable is the
    # outer-most "loop", the last one the inner-most.
    len_pvalues = [len(values) for values in possible_values]
    width = len_pvalues[-1]
    height = prod(len_pvalues[:-1])
    grid = [[groups[y * width + x] for x in range(width)]
            for y in range(height)]

    rows_indices = [np.concatenate(row) for row in grid]
    cols_indices = [np.concatenate([grid[y][x] for y in range(height)])
                    for x in range(width)]
    # grand total: all the columns together
    cols_indices.append(np.concatenate(cols_indices))

    # evaluate the expression on each "combined" group (ie compute totals)
    row_totals = evaluate_on(rows_indices)
    col_totals = evaluate_on(cols_indices)

    if percent:
        # np.float64 gives +-inf if the total is int(0) instead of Python's
        # built-in behaviour of raising an exception. This can happen at
        # least when using the default expr (count()) and the filter yields
        # empty groups
        total_value = np.float64(col_totals[-1])

        def as_percent(values):
            return [100.0 * value / total_value for value in values]

        data = as_percent(data)
        row_totals = as_percent(row_totals)
        col_totals = as_percent(col_totals)

    # Convert to a 1d array. np.array(data) is not used because if data is
    # a list of ndarrays (for example with groupby(a, expr=id)) *and* they
    # all have the same length, the result would be a 2d array instead of
    # the array of ndarrays needed at this point.
    cells = np.empty(len(data), dtype=type(data[0]))
    cells[:] = data
    # and reshape it
    cells = cells.reshape(len_pvalues)
    return LabeledArray(cells, labels, possible_values,
                        row_totals, col_totals)
def compute(self, context, entity_name=None, filter=None, number=None,
            **kwargs):
    """Add new individuals to an entity and return their ids per parent.

    Args:
        context: evaluation context of the source entity.
        entity_name: name of the entity receiving the new individuals (the
            entity of *context* when None).
        filter: selects the individuals giving birth (processed by
            ``self._getfilter``).
        number: number of individuals to create when there is no filter.
        **kwargs: ``field=expression`` pairs evaluated for the children and
            stored in the corresponding field.

    Returns:
        None when ``number`` was used, otherwise an int array of the length
        of *context* holding the id of the child for each parent and -1
        elsewhere.

    Raises:
        ValueError: if both ``filter`` and ``number`` are given.
    """
    if filter is not None and number is not None:
        # Having neither is allowed, though, as there can be a contextual
        # filter. Also, there is no reason to prevent the whole
        # population giving birth, even though the usefulness of such
        # usage seem dubious.
        raise ValueError("new() 'filter' and 'number' arguments are "
                         "mutually exclusive")

    source_entity = context.entity
    if entity_name is None:
        target_entity = source_entity
    else:
        target_entity = context.entities[entity_name]

    # target context is the context where the new individuals will be
    # created
    if target_entity is source_entity:
        target_context = context
    else:
        # we do need to copy the data (.extra) because we will insert into
        # the entity.array anyway => fresh_data=True
        target_context = context.clone(fresh_data=True,
                                       entity_name=target_entity.name)

    filter_expr = self._getfilter(context, filter)
    if filter_expr is not None:
        to_give_birth = expr_eval(filter_expr, context)
        num_birth = to_give_birth.sum()
    elif number is not None:
        to_give_birth = None
        num_birth = number
    else:
        # neither filter nor number: everybody gives birth
        to_give_birth = np.ones(len(context), dtype=bool)
        num_birth = len(context)

    array = target_entity.array

    id_to_rownum = target_entity.id_to_rownum
    num_individuals = len(id_to_rownum)

    children = self._initial_values(array, to_give_birth, num_birth)
    if num_birth:
        # new ids follow the existing ones
        children['id'] = np.arange(num_individuals,
                                   num_individuals + num_birth)
        children['period'] = context.period

        used_variables = [v.name for v in
                          self._collect_kwargs_variables(kwargs)]
        if to_give_birth is None:
            # no parent: expressions cannot refer to any variable
            assert not used_variables
            child_context = context.empty(num_birth)
        else:
            child_context = context.subset(to_give_birth, used_variables,
                                           filter_expr)
        for k, v in kwargs.iteritems():
            children[k] = expr_eval(v, child_context)

    add_individuals(target_context, children)

    expr_cache.invalidate(context.period, context.entity_name)

    # result is the ids of the new individuals corresponding to the source
    # entity
    if to_give_birth is None:
        return None

    result = np.full(context_length(context), -1, dtype=int)
    if source_entity is target_entity:
        # the source context now also contains the children: pad the mask
        extra_bools = np.zeros(num_birth, dtype=bool)
        to_give_birth = np.concatenate((to_give_birth, extra_bools))
    # Note that np.place is a bit faster, but is currently buggy when
    # working with columns of structured arrays.
    # See https://github.com/numpy/numpy/issues/2462
    result[to_give_birth] = children['id']
    return result