def align_no_link(self, context, score, need, filter, take, leave,
                  expressions, possible_values, errors, frac_need, link,
                  secondary_axis, method):
    ctx_length = context_length(context)

    need, expressions, possible_values = \
        self._eval_need(context, need, expressions, possible_values)

    filter_value = expr_eval(self._getfilter(context, filter), context)
    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    # retrieve the columns we need to work with
    if expressions:
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the group sizes do not necessarily sum to num_to_align, because some
    # individuals might not fit in any group (eg if some alignment data is
    # missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, frac_need)
    need = self._add_past_error(context, need, errors)
    need = np.asarray(need)
    # FIXME: either handle past_error in no link (currently, the past
    # error is added... but never computed, so it is always 0!) or raise
    # an error when errors='carry' is used with no link.
    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                score, take, leave, method)
def align_no_link(self, context):
    ctx_length = context_length(context)

    scores = expr_eval(self.expr, context)

    need, expressions, possible_values = self._eval_need(context)

    filter_value = expr_eval(self._getfilter(context), context)
    take_filter = expr_eval(self.take_filter, context)
    leave_filter = expr_eval(self.leave_filter, context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    if expressions:
        # retrieve the columns we need to work with
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the group sizes do not necessarily sum to num_to_align, because some
    # individuals might not fit in any group (eg if some alignment data is
    # missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need)
    need = self._add_past_error(need, context)

    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                scores, take_filter, leave_filter)
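# A minimal standalone sketch (not the library's partition_nd) of the
# grouping step used above: split row indices into one group per possible
# value of a column. Names and data here are illustrative only.
import numpy as np

def partition_1d_sketch(column, possible_values):
    # one array of row indices per possible value, in pvalue order
    return [np.flatnonzero(column == value) for value in possible_values]

age_band = np.array([0, 1, 1, 0, 2, 1])
groups_sketch = partition_1d_sketch(age_band, [0, 1, 2])
# groups_sketch -> [array([0, 3]), array([1, 2, 5]), array([4])]
# a row whose value is missing from possible_values lands in no group,
# which is exactly the situation the "unaligned" check above reports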
def evaluate(self, context):
    expr = self.expr
    expr_vars = collect_variables(expr, context)

    expressions = self.expressions
    labels = [str(e) for e in expressions]
    columns = [expr_eval(e, context) for e in expressions]

    if self.filter is not None:
        filter_value = expr_eval(self.filter, context)
        # TODO: make a function out of this, I think we have this pattern
        # in several places
        filtered_columns = [col[filter_value]
                            if isinstance(col, np.ndarray) and col.shape
                            else [col]
                            for col in columns]
        filtered_context = context_subset(context, filter_value, expr_vars)
    else:
        filtered_columns = columns
        filtered_context = context

    possible_values = self.pvalues
    if possible_values is None:
        possible_values = [np.unique(col) for col in filtered_columns]

    # We pre-filtered columns instead of passing the filter to partition_nd
    # because it is a bit faster this way. The indices are still correct,
    # because we use them on a filtered_context.
    groups = partition_nd(filtered_columns, True, possible_values)
    if not groups:
        return LabeledArray([], labels, possible_values)

    # evaluate the expression on each group
    data = [expr_eval(expr, context_subset(filtered_context, indices,
                                           expr_vars))
            for indices in groups]

    # TODO: use group_indices_nd directly to avoid using np.unique
    # this is twice as fast (unique is very slow) but breaks because
    # the rest of the code assumes all combinations are present
    # if self.filter is not None:
    #     filter_value = expr_eval(self.filter, context)
    # else:
    #     filter_value = True
    #
    # d = group_indices_nd(columns, filter_value)
    # pvalues = sorted(d.keys())
    # ndim = len(columns)
    # possible_values = [[pv[i] for pv in pvalues]
    #                    for i in range(ndim)]
    # groups = [d[k] for k in pvalues]

    # groups is a (flat) list of lists. The first variable is the
    # outer-most "loop", the last one the innermost.

    # add a total for each row
    len_pvalues = [len(vals) for vals in possible_values]
    width = len_pvalues[-1]
    height = prod(len_pvalues[:-1])
    rows_indices = [np.concatenate([groups[y * width + x]
                                    for x in range(width)])
                    for y in range(height)]
    cols_indices = [np.concatenate([groups[y * width + x]
                                    for y in range(height)])
                    for x in range(width)]
    cols_indices.append(np.concatenate(cols_indices))

    # evaluate the expression on each "combined" group (ie compute totals)
    row_totals = [expr_eval(expr, context_subset(filtered_context, inds,
                                                 expr_vars))
                  for inds in rows_indices]
    col_totals = [expr_eval(expr, context_subset(filtered_context, inds,
                                                 expr_vars))
                  for inds in cols_indices]

    if self.percent:
        # convert to np.float64 to get +-inf if total_value is int(0)
        # instead of Python's built-in behaviour of raising an exception.
        # This can happen at least when using the default expr (count())
        # and the filter yields empty groups
        total_value = np.float64(col_totals[-1])
        data = [100.0 * value / total_value for value in data]
        row_totals = [100.0 * value / total_value for value in row_totals]
        col_totals = [100.0 * value / total_value for value in col_totals]

    # if self.by or self.percent:
    #     if self.percent:
    #         total_value = data[-1]
    #         divisors = [total_value for _ in data]
    #     else:
    #         num_by = len(self.by)
    #         inc = prod(len_pvalues[-num_by:])
    #         num_groups = len(groups)
    #         num_categories = prod(len_pvalues[:-num_by])
    #
    #         categories_groups_idx = [range(cat_idx, num_groups, inc)
    #                                  for cat_idx in range(num_categories)]
    #
    #         divisors = ...
    #
    #     data = [100.0 * value / divisor
    #             for value, divisor in izip(data, divisors)]

    # convert to a 1d array. We don't simply use data = np.array(data),
    # because if data is a list of ndarrays (for example if we use
    # groupby(a, expr=id)) *and* all the ndarrays have the same length,
    # the result would be a 2d array instead of the array of ndarrays we
    # need (at this point).
    arr = np.empty(len(data), dtype=type(data[0]))
    arr[:] = data
    data = arr

    # and reshape it
    data = data.reshape(len_pvalues)
    return LabeledArray(data, labels, possible_values,
                        row_totals, col_totals)
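# Standalone illustration of the final conversion above: np.array() would
# stack a list of equal-length ndarrays into a 2d array, so an object
# array is used instead. This sketch uses dtype=object explicitly (the
# code above derives it from type(data[0])) and fills cell by cell, which
# also sidesteps the broadcasting ambiguity of arr[:] = data on recent
# NumPy versions.
import numpy as np

data = [np.array([1, 2]), np.array([3, 4])]
stacked = np.array(data)
# stacked.shape == (2, 2): the groups were merged, which we don't want
arr = np.empty(len(data), dtype=object)
for i, group_result in enumerate(data):
    arr[i] = group_result
# arr.shape == (2,): one cell per group, each cell holding its own ndarray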
def compute(self, context, *expressions, **kwargs):
    if not expressions:
        raise TypeError("groupby() takes at least 1 argument")

    # TODO: allow lists/tuples of arguments to group by the combinations
    # of keys
    for expr in expressions:
        if isinstance(expr, (bool, int, float)):
            raise TypeError("groupby() does not work with constant "
                            "arguments")
        if isinstance(expr, (tuple, list)):
            raise TypeError("groupby() takes expressions as arguments, "
                            "not a list of expressions")

    # On Python 3, we could clean up this code (keyword-only arguments).
    expr = kwargs.pop('expr', None)
    if expr is None:
        expr = Count()
    # by = kwargs.pop('by', None)
    filter_value = kwargs.pop('filter', None)
    percent = kwargs.pop('percent', False)
    possible_values = kwargs.pop('pvalues', None)
    totals = kwargs.pop('totals', True)

    expr_vars = [v.name for v in collect_variables(expr)]
    labels = [str(e) for e in expressions]
    columns = [expr_eval(e, context) for e in expressions]
    columns = [expand(c, context_length(context)) for c in columns]

    if filter_value is not None:
        filtered_columns = [col[filter_value] for col in columns]
        # FIXME: use the actual filter_expr instead of not_hashable
        filtered_context = context.subset(filter_value, expr_vars,
                                          not_hashable)
    else:
        filtered_columns = columns
        filtered_context = context

    if possible_values is None:
        possible_values = [np.unique(col) for col in filtered_columns]

    # We pre-filtered columns instead of passing the filter to partition_nd
    # because it is a bit faster this way. The indices are still correct,
    # because we use them on a filtered_context.
    groups = partition_nd(filtered_columns, True, possible_values)
    if not groups:
        return LabeledArray([], labels, possible_values)

    # evaluate the expression on each group
    # we use not_hashable to avoid storing the subset in the cache
    contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in groups]
    data = [expr_eval(expr, c) for c in contexts]

    # TODO: use group_indices_nd directly to avoid using np.unique
    # this is twice as fast (unique is very slow) but breaks because
    # the rest of the code assumes all combinations are present
    # if self.filter is not None:
    #     filter_value = expr_eval(self.filter, context)
    # else:
    #     filter_value = True
    #
    # d = group_indices_nd(columns, filter_value)
    # pvalues = sorted(d.keys())
    # ndim = len(columns)
    # possible_values = [[pv[i] for pv in pvalues]
    #                    for i in range(ndim)]
    # groups = [d[k] for k in pvalues]

    # groups is a (flat) list of lists. The first variable is the
    # outer-most "loop", the last one the innermost.

    # add a total for each row
    len_pvalues = [len(vals) for vals in possible_values]
    if percent:
        totals = True

    if totals:
        width = len_pvalues[-1]
        height = prod(len_pvalues[:-1])
        rows_indices = [np.concatenate([groups[y * width + x]
                                        for x in range(width)])
                        for y in range(height)]
        cols_indices = [np.concatenate([groups[y * width + x]
                                        for y in range(height)])
                        for x in range(width)]
        cols_indices.append(np.concatenate(cols_indices))

        # evaluate the expression on each "combined" group (ie compute
        # totals)
        row_ctxs = [filtered_context.subset(indices, expr_vars,
                                            not_hashable)
                    for indices in rows_indices]
        row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
        col_ctxs = [filtered_context.subset(indices, expr_vars,
                                            not_hashable)
                    for indices in cols_indices]
        col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]
    else:
        row_totals = None
        col_totals = None

    if percent:
        # convert to np.float64 to get +-inf if total_value is int(0)
        # instead of Python's built-in behaviour of raising an exception.
        # This can happen at least when using the default expr (count())
        # and the filter yields empty groups
        total_value = np.float64(col_totals[-1])
        data = [100.0 * value / total_value for value in data]
        row_totals = [100.0 * value / total_value for value in row_totals]
        col_totals = [100.0 * value / total_value for value in col_totals]

    # if self.by or self.percent:
    #     if self.percent:
    #         total_value = data[-1]
    #         divisors = [total_value for _ in data]
    #     else:
    #         num_by = len(self.by)
    #         inc = prod(len_pvalues[-num_by:])
    #         num_groups = len(groups)
    #         num_categories = prod(len_pvalues[:-num_by])
    #
    #         categories_groups_idx = [range(cat_idx, num_groups, inc)
    #                                  for cat_idx in range(num_categories)]
    #
    #         divisors = ...
    #
    #     data = [100.0 * value / divisor
    #             for value, divisor in izip(data, divisors)]

    # convert to a 1d array. We don't simply use data = np.array(data),
    # because if data is a list of ndarrays (for example if we use
    # groupby(a, expr=id)) *and* all the ndarrays have the same length,
    # the result would be a 2d array instead of the array of ndarrays we
    # need (at this point).
    arr = np.empty(len(data), dtype=type(data[0]))
    arr[:] = data
    data = arr

    # and reshape it
    data = data.reshape(len_pvalues)
    return LabeledArray(data, labels, possible_values,
                        row_totals, col_totals)
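# Standalone illustration of the np.float64 cast in the percent branch
# above: NumPy float division by zero yields +-inf (with a RuntimeWarning)
# instead of raising, so a filter that leaves every group empty produces
# inf percentages rather than crashing the run.
import numpy as np

result = np.float64(5.0) / np.float64(0.0)  # inf, plus a RuntimeWarning
# plain Python integers would instead raise: 5 / 0 -> ZeroDivisionError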
def compute(self, context, *expressions, **kwargs):
    if not expressions:
        raise TypeError("groupby() takes at least 1 argument")

    # TODO: allow lists/tuples of arguments to group by the combinations
    # of keys
    for expr in expressions:
        if isinstance(expr, (bool, int, float)):
            raise TypeError("groupby() does not work with constant "
                            "arguments")
        if isinstance(expr, (tuple, list)):
            raise TypeError("groupby() takes expressions as arguments, "
                            "not a list of expressions")

    # On Python 3, we could clean up this code (keyword-only arguments).
    expr = kwargs.pop('expr', None)
    if expr is None:
        expr = Count()
    # by = kwargs.pop('by', None)
    filter_value = kwargs.pop('filter', None)
    percent = kwargs.pop('percent', False)
    possible_values = kwargs.pop('pvalues', None)

    expr_vars = [v.name for v in collect_variables(expr)]
    labels = [str(e) for e in expressions]
    columns = [expr_eval(e, context) for e in expressions]
    columns = [expand(c, context_length(context)) for c in columns]

    if filter_value is not None:
        filtered_columns = [col[filter_value] for col in columns]
        # FIXME: use the actual filter_expr instead of not_hashable
        filtered_context = context.subset(filter_value, expr_vars,
                                          not_hashable)
    else:
        filtered_columns = columns
        filtered_context = context

    if possible_values is None:
        possible_values = [np.unique(col) for col in filtered_columns]

    # We pre-filtered columns instead of passing the filter to partition_nd
    # because it is a bit faster this way. The indices are still correct,
    # because we use them on a filtered_context.
    groups = partition_nd(filtered_columns, True, possible_values)
    if not groups:
        return LabeledArray([], labels, possible_values)

    # evaluate the expression on each group
    # we use not_hashable to avoid storing the subset in the cache
    contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in groups]
    data = [expr_eval(expr, c) for c in contexts]

    # TODO: use group_indices_nd directly to avoid using np.unique
    # this is twice as fast (unique is very slow) but breaks because
    # the rest of the code assumes all combinations are present
    # if self.filter is not None:
    #     filter_value = expr_eval(self.filter, context)
    # else:
    #     filter_value = True
    #
    # d = group_indices_nd(columns, filter_value)
    # pvalues = sorted(d.keys())
    # ndim = len(columns)
    # possible_values = [[pv[i] for pv in pvalues]
    #                    for i in range(ndim)]
    # groups = [d[k] for k in pvalues]

    # groups is a (flat) list of lists. The first variable is the
    # outer-most "loop", the last one the innermost.

    # add a total for each row
    len_pvalues = [len(vals) for vals in possible_values]
    width = len_pvalues[-1]
    height = prod(len_pvalues[:-1])
    rows_indices = [np.concatenate([groups[y * width + x]
                                    for x in range(width)])
                    for y in range(height)]
    cols_indices = [np.concatenate([groups[y * width + x]
                                    for y in range(height)])
                    for x in range(width)]
    cols_indices.append(np.concatenate(cols_indices))

    # evaluate the expression on each "combined" group (ie compute totals)
    row_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in rows_indices]
    row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
    col_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in cols_indices]
    col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]

    if percent:
        # convert to np.float64 to get +-inf if total_value is int(0)
        # instead of Python's built-in behaviour of raising an exception.
        # This can happen at least when using the default expr (count())
        # and the filter yields empty groups
        total_value = np.float64(col_totals[-1])
        data = [100.0 * value / total_value for value in data]
        row_totals = [100.0 * value / total_value for value in row_totals]
        col_totals = [100.0 * value / total_value for value in col_totals]

    # if self.by or self.percent:
    #     if self.percent:
    #         total_value = data[-1]
    #         divisors = [total_value for _ in data]
    #     else:
    #         num_by = len(self.by)
    #         inc = prod(len_pvalues[-num_by:])
    #         num_groups = len(groups)
    #         num_categories = prod(len_pvalues[:-num_by])
    #
    #         categories_groups_idx = [range(cat_idx, num_groups, inc)
    #                                  for cat_idx in range(num_categories)]
    #
    #         divisors = ...
    #
    #     data = [100.0 * value / divisor
    #             for value, divisor in izip(data, divisors)]

    # convert to a 1d array. We don't simply use data = np.array(data),
    # because if data is a list of ndarrays (for example if we use
    # groupby(a, expr=id)) *and* all the ndarrays have the same length,
    # the result would be a 2d array instead of the array of ndarrays we
    # need (at this point).
    arr = np.empty(len(data), dtype=type(data[0]))
    arr[:] = data
    data = arr

    # and reshape it
    data = data.reshape(len_pvalues)
    return LabeledArray(data, labels, possible_values,
                        row_totals, col_totals)
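# The totals above rely on the flat group list being laid out row-major:
# the group for cell (y, x) of the (height, width) result sits at flat
# index y * width + x. A tiny standalone check of that mapping:
width, height = 3, 2
flat = [(y, x) for y in range(height) for x in range(width)]
row_0 = [flat[0 * width + x] for x in range(width)]
# row_0 -> [(0, 0), (0, 1), (0, 2)]   (all cells of row 0)
col_2 = [flat[y * width + 2] for y in range(height)]
# col_2 -> [(0, 2), (1, 2)]           (all cells of column 2)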
def compute(self, context, score, need, filter=None, take=None, leave=None,
            expressions=None, possible_values=None, errors='default',
            frac_need='uniform', link=None, secondary_axis=None):
    ctx_length = context_length(context)

    # need is a single scalar
    # if not isinstance(need, (tuple, list, np.ndarray)):
    if np.isscalar(need):
        need = [need]

    # need is a non-ndarray sequence
    if isinstance(need, (tuple, list)):
        need = np.array(need)
    assert isinstance(need, np.ndarray)

    if expressions is None:
        expressions = []

    if possible_values is None:
        possible_values = []
    else:
        possible_values = [np.array(pv) for pv in possible_values]

    if frac_need not in ('uniform', 'cutoff', 'round'):
        cls = ValueError if isinstance(frac_need, basestring) else TypeError
        raise cls("frac_need should be one of: 'uniform', 'cutoff' or "
                  "'round'")

    scores = expr_eval(self.expr, context)
    filter_value = expr_eval(self._getfilter(context), context)
    need, expressions, possible_values = \
        self._eval_need(context, scores, filter_value)
    take_filter = expr_eval(self.take_filter, context)
    leave_filter = expr_eval(self.leave_filter, context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    # retrieve the columns we need to work with
    if expressions:
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the group sizes do not necessarily sum to num_to_align, because some
    # individuals might not fit in any group (eg if some alignment data is
    # missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    periodicity = context['periodicity']
    if context['format_date'] == 'year0':
        periodicity = periodicity * 12
    # make sign(self.periodicity_given) match sign(periodicity), so that
    # periodicity / self.periodicity_given gives the right result
    # (whereas self.periodicity_given / 12 would not)
    self.periodicity_given = (self.periodicity_given
                              * (self.periodicity_given * periodicity)
                              / abs(self.periodicity_given * periodicity))
    if gcd(periodicity, self.periodicity_given) not in (
            periodicity, self.periodicity_given):
        raise ValueError("mix of quarter and triannual impossible")
    need = need * periodicity / self.periodicity_given
    if scores is not None:
        scores = scores * periodicity / self.periodicity_given

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, method=frac_need)
    need = self._add_past_error(context, need, method=errors)

    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                scores, take_filter, leave_filter,
                                method=self.method)
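# The sign-fixing assignment above reduces to multiplying by
# sign(periodicity_given * periodicity). With plain numbers (illustrative
# helper, not part of the codebase):
def match_sign(a, b):
    # flip the sign of a, if needed, so that it matches the sign of b
    return a * (a * b) / abs(a * b)

assert match_sign(12, 3) == 12
assert match_sign(12, -3) == -12
assert match_sign(-12, 3) == 12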
def align_no_link(self, context):
    ctx_length = context_length(context)

    scores = expr_eval(self.expr, context)
    filter_value = expr_eval(self._getfilter(context), context)
    need, expressions, possible_values = \
        self._eval_need(context, scores, filter_value)
    take_filter = expr_eval(self.take_filter, context)
    leave_filter = expr_eval(self.leave_filter, context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    if expressions:
        # retrieve the columns we need to work with
        columns = [expr_eval(expr, context) for expr in expressions]
        # # hack for age if we switch to a yyyymm format
        # str_expressions = [str(e) for e in expressions]
        # if 'age' in str_expressions:
        #     age_axis_num = str_expressions.index('age')
        #     columns[age_axis_num] = columns[age_axis_num] / 100
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the group sizes do not necessarily sum to num_to_align, because some
    # individuals might not fit in any group (eg if some alignment data is
    # missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    periodicity = context['periodicity']
    if context['format_date'] == 'year0':
        periodicity = periodicity * 12
    # make sign(self.periodicity_given) match sign(periodicity), so that
    # periodicity / self.periodicity_given gives the right result
    # (whereas self.periodicity_given / 12 would not)
    self.periodicity_given = (self.periodicity_given
                              * (self.periodicity_given * periodicity)
                              / abs(self.periodicity_given * periodicity))
    if gcd(periodicity, self.periodicity_given) not in (
            periodicity, self.periodicity_given):
        raise ValueError("mix of quarter and triannual impossible")
    need = need * periodicity / self.periodicity_given
    if scores is not None:
        scores = scores * periodicity / self.periodicity_given

    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need)
    need = self._add_past_error(need, context)

    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                scores, take_filter, leave_filter,
                                method=self.method)
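# The gcd test above only accepts pairs where one periodicity divides the
# other; need and scores are then rescaled by their ratio. A standalone
# check with illustrative values (math.gcd on Python 3; the module-level
# gcd used above is assumed to behave the same):
from math import gcd

for periodicity, periodicity_given in [(12, 3), (12, 12), (1, 12)]:
    # one of the two divides the other, so the check passes
    assert gcd(periodicity, periodicity_given) in (periodicity,
                                                   periodicity_given)
# quarterly (4) vs triannual (3): gcd is 1, neither divides the other,
# which is exactly the mix the code above rejects
assert gcd(4, 3) not in (4, 3)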