def compute(self, context, filter=None):
    """Remove the individuals for which *filter* is True from the entity.

    Shrinks the entity's main array and its array-typed temporary
    variables, rebuilds ``entity.id_to_rownum`` so that removed (and
    previously removed) ids map to -1 while surviving ids map to their
    new row numbers, then invalidates the expression cache for the
    current period/entity.

    filter: boolean array aligned with *context*, or None to remove
    every individual.
    """
    filter_value = filter
    if filter_value is None:
        # this is pretty inefficient, but remove without filter is not
        # common enough to bother
        filter_value = np.ones(len(context), dtype=bool)
    if not np.any(filter_value):
        # nothing to remove
        return
    not_removed = ~filter_value
    entity = context.entity
    len_before = len(entity.array)
    # Shrink array & temporaries. 99% of the function time is spent here.
    entity.array.keep(not_removed)
    temp_variables = entity.temp_variables
    for name, temp_value in temp_variables.items():
        # only shrink per-individual arrays (scalars have an empty .shape)
        if isinstance(temp_value, np.ndarray) and temp_value.shape:
            temp_variables[name] = temp_value[not_removed]
    # update id_to_rownum
    already_removed = entity.id_to_rownum == -1
    already_removed_indices = filter_to_indices(already_removed)
    # shift the indices so they are valid insert positions on the
    # *compacted* id_to_rownum built below
    already_removed_indices_shifted = \
        already_removed_indices - np.arange(len(already_removed_indices))
    id_to_rownum = np.arange(len_before)
    # each removal shifts the row numbers of all subsequent rows down by one
    id_to_rownum -= filter_value.cumsum()
    # XXX: use np.putmask(id_to_rownum, filter_value, -1)
    id_to_rownum[filter_value] = -1
    entity.id_to_rownum = np.insert(id_to_rownum,
                                    already_removed_indices_shifted,
                                    -1)
    # this version is cleaner and slightly faster but the result is also
    # slightly wrong: it eliminates ids for dead/removed individuals at
    # the end of the array and this cause bugs in time-related functions
    # ids = entity.array['id']
    # id_to_rownum = np.empty(np.max(ids) + 1, dtype=int)
    # id_to_rownum.fill(-1)
    # id_to_rownum[ids] = np.arange(len(ids), dtype=int)
    # entity.id_to_rownum = id_to_rownum
    if config.log_level == "processes":
        print("%d %s(s) removed (%d -> %d)"
              % (filter_value.sum(), entity.name, len_before,
                 len(entity.array)), end=' ')
    # TODO: in the case of remove(), we should update (take a subset of) all
    # the cache keys matching the entity, but with the current code,
    # it is most likely not worth it because the cache probably contains
    # mostly stuff we will never use.
    expr_cache.invalidate(context.period, context.entity_name)
def compute(self, context, filter=None):
    """Remove the individuals matching *filter* from the entity.

    Same contract as the sibling version above: shrink the entity array
    and ndarray temporaries, remap ``entity.id_to_rownum`` (removed ids
    become -1), and invalidate the expression cache for the current
    period/entity.

    filter: boolean array aligned with *context*, or None to remove all.
    """
    filter_value = filter
    if filter_value is None:
        # this is pretty inefficient, but remove without filter is not
        # common enough to bother
        filter_value = np.ones(len(context), dtype=bool)
    if not np.any(filter_value):
        # nothing matched: nothing to do
        return
    not_removed = ~filter_value
    entity = context.entity
    len_before = len(entity.array)
    # Shrink array & temporaries. 99% of the function time is spent here.
    entity.array.keep(not_removed)
    temp_variables = entity.temp_variables
    for name, temp_value in temp_variables.items():
        # only shrink per-individual arrays (scalars have an empty .shape)
        if isinstance(temp_value, np.ndarray) and temp_value.shape:
            temp_variables[name] = temp_value[not_removed]
    # update id_to_rownum
    already_removed = entity.id_to_rownum == -1
    already_removed_indices = filter_to_indices(already_removed)
    # shift the indices so they are valid insert positions on the
    # *compacted* id_to_rownum built below
    already_removed_indices_shifted = \
        already_removed_indices - np.arange(len(already_removed_indices))
    id_to_rownum = np.arange(len_before)
    # each removal shifts the row numbers of all subsequent rows down by one
    id_to_rownum -= filter_value.cumsum()
    # XXX: use np.putmask(id_to_rownum, filter_value, -1)
    id_to_rownum[filter_value] = -1
    entity.id_to_rownum = np.insert(id_to_rownum,
                                    already_removed_indices_shifted,
                                    -1)
    # this version is cleaner and slightly faster but the result is also
    # slightly wrong: it eliminates ids for dead/removed individuals at
    # the end of the array and this cause bugs in time-related functions
    # ids = entity.array['id']
    # id_to_rownum = np.full(np.max(ids) + 1, -1, dtype=int)
    # id_to_rownum[ids] = np.arange(len(ids), dtype=int)
    # entity.id_to_rownum = id_to_rownum
    if config.log_level == "processes":
        print("%d %s(s) removed (%d -> %d)"
              % (filter_value.sum(), entity.name, len_before,
                 len(entity.array)), end=' ')
    # TODO: in the case of remove(), we should update (take a subset of) all
    # the cache keys matching the entity, but with the current code,
    # it is most likely not worth it because the cache probably contains
    # mostly stuff we will never use.
    expr_cache.invalidate(context.period, context.entity_name)
def align_no_link(self, context, score, need, filter, take, leave,
                  expressions, possible_values, errors, frac_need, link,
                  secondary_axis, method):
    """Align the entity directly (without a link).

    Evaluates needs and the filter on *context*, partitions the filtered
    individuals into one group per cell of the need array, applies the
    need corrections (frac_need handling, past error carry-over) and
    delegates the actual selection to ``align_get_indices_nd``.

    Returns the boolean "aligned" array produced by
    ``align_get_indices_nd``.
    """
    ctx_length = context_length(context)

    need, expressions, possible_values = \
        self._eval_need(context, need, expressions, possible_values)

    filter_value = expr_eval(self._getfilter(context, filter), context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    # retrieve the columns we need to work with
    if expressions:
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, frac_need)
    need = self._add_past_error(context, need, errors)
    need = np.asarray(need)
    # FIXME: either handle past_error in no link (currently, the past
    # error is added... but never computed, so always 0 !) or raise
    # an error in case errors='carry" is used with no link.
    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                score, take, leave, method)
def align_no_link(self, context, score, need, filter, take, leave,
                  expressions, possible_values, errors, frac_need, link,
                  secondary_axis, method):
    """Perform alignment of the entity without going through a link.

    Steps: evaluate need/expressions/possible_values and the filter,
    split the filtered population into groups (one per need cell, or a
    single group when there are no expressions), warn about individuals
    that fit no group, correct the needs and hand over to
    ``align_get_indices_nd`` which returns the boolean selection array.
    """
    ctx_length = context_length(context)

    need, expressions, possible_values = \
        self._eval_need(context, need, expressions, possible_values)

    filter_value = expr_eval(self._getfilter(context, filter), context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    # retrieve the columns we need to work with
    if expressions:
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, frac_need)
    need = self._add_past_error(context, need, errors)
    need = np.asarray(need)
    # FIXME: either handle past_error in no link (currently, the past
    # error is added... but never computed, so always 0 !) or raise
    # an error in case errors='carry" is used with no link.
    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                score, take, leave, method)
def align_no_link(self, context):
    """Align the entity without a link (older, no-argument variant).

    Reads scores, needs and filters from ``self``/*context*, partitions
    the filtered individuals into one group per need cell and delegates
    the selection to ``align_get_indices_nd``.
    """
    ctx_length = context_length(context)

    scores = expr_eval(self.expr, context)

    need, expressions, possible_values = self._eval_need(context)

    filter_value = expr_eval(self._getfilter(context), context)
    take_filter = expr_eval(self.take_filter, context)
    leave_filter = expr_eval(self.leave_filter, context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    if expressions:
        # retrieve the columns we need to work with
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need)
    need = self._add_past_error(need, context)

    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                scores, take_filter, leave_filter)
def run(self, context):
    """Remove individuals matching ``self.filter`` from the context entity.

    Shrinks the entity array and ndarray temporaries, then remaps
    ``entity.id_to_rownum`` so removed ids point to -1 and surviving ids
    point to their new rows.

    Fixed: this block used Python 2-only constructs (``print`` statement,
    ``dict.iteritems()``) while the sibling versions in this file are
    Python 3; converted to ``print(...)`` / ``.items()`` with identical
    output (the trailing comma becomes ``end=' '``).
    """
    filter_value = expr_eval(self.filter, context)
    if not np.any(filter_value):
        # nothing matched: nothing to do
        return
    not_removed = ~filter_value

    entity = context['__entity__']
    len_before = len(entity.array)

    # FIXME: this allocates a new (slightly smaller) array. The old
    # array is only discarded when the gc does its job, effectively
    # doubling the peak memory usage for the main array for a while.
    # Seems like another good reason to store columns separately.

    # Shrink array & temporaries. 99% of the function time is spent here.
    entity.array = entity.array[not_removed]
    temp_variables = entity.temp_variables
    for name, temp_value in temp_variables.items():
        # only shrink per-individual arrays (scalars have an empty .shape)
        if isinstance(temp_value, np.ndarray) and temp_value.shape:
            temp_variables[name] = temp_value[not_removed]

    # update id_to_rownum
    already_removed = entity.id_to_rownum == -1
    already_removed_indices = filter_to_indices(already_removed)
    # shift indices so they are valid insert positions on the compacted map
    already_removed_indices_shifted = already_removed_indices - \
        np.arange(len(already_removed_indices))

    id_to_rownum = np.arange(len_before)
    # each removal shifts subsequent row numbers down by one
    id_to_rownum -= filter_value.cumsum()
    # XXX: use np.putmask(id_to_rownum, filter_value, -1)
    id_to_rownum[filter_value] = -1
    entity.id_to_rownum = np.insert(id_to_rownum,
                                    already_removed_indices_shifted,
                                    -1)
    # this version is cleaner and slightly faster but the result is also
    # slightly different: it eliminates ids for dead/removed individuals
    # and this cause bugs in time-related functions
    # ids = entity.array['id']
    # id_to_rownum = np.empty(np.max(ids) + 1, dtype=int)
    # id_to_rownum.fill(-1)
    # id_to_rownum[ids] = np.arange(len(ids), dtype=int)
    # entity.id_to_rownum = id_to_rownum
    print("%d %s(s) removed (%d -> %d)" % (filter_value.sum(), entity.name,
                                           len_before, len(entity.array)),
          end=' ')
def run(self, context):
    """Remove individuals matching ``self.filter`` from the context entity.

    Shrinks the entity's main array (in place via ``array.keep``) and its
    ndarray temporaries, then rebuilds ``entity.id_to_rownum`` so removed
    ids map to -1 and surviving ids map to their new row numbers.
    """
    filter_value = expr_eval(self.filter, context)
    if not np.any(filter_value):
        # nothing matched: nothing to do
        return
    not_removed = ~filter_value
    entity = context['__entity__']
    len_before = len(entity.array)
    # Shrink array & temporaries. 99% of the function time is spent here.
    entity.array.keep(not_removed)
    temp_variables = entity.temp_variables
    for name, temp_value in temp_variables.items():
        # only shrink per-individual arrays (scalars have an empty .shape)
        if isinstance(temp_value, np.ndarray) and temp_value.shape:
            temp_variables[name] = temp_value[not_removed]
    # update id_to_rownum
    already_removed = entity.id_to_rownum == -1
    already_removed_indices = filter_to_indices(already_removed)
    # shift indices so they are valid insert positions on the compacted map
    already_removed_indices_shifted = \
        already_removed_indices - np.arange(len(already_removed_indices))
    id_to_rownum = np.arange(len_before)
    # each removal shifts subsequent row numbers down by one
    id_to_rownum -= filter_value.cumsum()
    # XXX: use np.putmask(id_to_rownum, filter_value, -1)
    id_to_rownum[filter_value] = -1
    entity.id_to_rownum = np.insert(id_to_rownum,
                                    already_removed_indices_shifted,
                                    -1)
    # this version is cleaner and slightly faster but the result is also
    # slightly wrong: it eliminates ids for dead/removed individuals at
    # the end of the array and this cause bugs in time-related functions
    # ids = entity.array['id']
    # id_to_rownum = np.empty(np.max(ids) + 1, dtype=int)
    # id_to_rownum.fill(-1)
    # id_to_rownum[ids] = np.arange(len(ids), dtype=int)
    # entity.id_to_rownum = id_to_rownum
    if config.log_level == "processes":
        print("%d %s(s) removed (%d -> %d)"
              % (filter_value.sum(), entity.name, len_before,
                 len(entity.array)), end=' ')
def align_get_indices_nd(ctx_length, groups, need, filter_value, score,
                         take_filter=None, leave_filter=None):
    """Select, within each group, the number of individuals given by *need*.

    Returns a boolean array of length *ctx_length*, True for selected
    ("aligned") individuals.

    * groups: sequence of index arrays, one per cell of *need* (iterated
      flat).
    * score: ranking values; within a group the individuals with the
      highest scores are taken first. A non-array score means "no
      sorting" (the individuals created last win).
    * take_filter / leave_filter: boolean arrays forcing individuals in
      or out of the selection, respectively.

    Tracks and prints (in debug "processes" mode) underflow (not enough
    candidates) and overflow (more forced-in individuals than needed).
    """
    assert isinstance(need, np.ndarray) and \
        issubclass(need.dtype.type, np.integer)
    assert score is None or isinstance(score, (bool, int, float, np.ndarray))

    if filter_value is not None:
        bool_filter_value = filter_value.copy()
    else:
        bool_filter_value = True

    # "maybe" = individuals that are neither forced in nor forced out
    maybe_filter = bool_filter_value
    if take_filter is not None:
        # XXX: I wonder if users would prefer if filter_value was taken into
        # account or not. This only impacts what it displayed on the console,
        # but still...
        take = np.sum(take_filter)
        # XXX: it would probably be faster to leave the filters as boolean
        # vector and do
        # take_members = take_filter[member_indices]
        # group_always = member_indices[take_members]
        # instead of
        # group_always = np.intersect1d(members_indices, take_indices,
        #                               assume_unique=True)
        take_indices = filter_to_indices(take_filter & bool_filter_value)
        maybe_filter &= ~take_filter
    else:
        take = 0
        take_indices = None

    if leave_filter is not None:
        leave = np.sum(leave_filter)
        maybe_filter &= ~leave_filter
    else:
        leave = 0

    if take_filter is not None or leave_filter is not None:
        maybe_indices = filter_to_indices(maybe_filter)
    else:
        maybe_indices = None

    total_underflow = 0
    total_overflow = 0
    total_affected = 0

    aligned = np.zeros(ctx_length, dtype=bool)
    # NOTE(review): izip is presumably itertools.izip (Python 2) -- confirm
    # the import at the top of the file
    for members_indices, group_need in izip(groups, need.flat):
        if len(members_indices):
            affected = group_need
            total_affected += affected

            if take_indices is not None:
                # individuals forced in by take_filter
                group_always = np.intersect1d(members_indices, take_indices,
                                              assume_unique=True)
                num_always = len(group_always)
                aligned[group_always] = True
            else:
                num_always = 0

            if affected > num_always:
                if maybe_indices is not None:
                    group_maybe_indices = np.intersect1d(members_indices,
                                                         maybe_indices,
                                                         assume_unique=True)
                else:
                    group_maybe_indices = members_indices
                if isinstance(score, np.ndarray):
                    maybe_members_rank_value = score[group_maybe_indices]
                    # TODO: use np.partition (np1.8+)
                    sorted_local_indices = \
                        np.argsort(maybe_members_rank_value)
                    sorted_global_indices = \
                        group_maybe_indices[sorted_local_indices]
                else:
                    # if the score expression is a constant, we don't need to
                    # sort indices. In that case, the alignment will first take
                    # the individuals created last (highest id).
                    sorted_global_indices = group_maybe_indices

                # maybe_to_take is always > 0
                maybe_to_take = affected - num_always
                # take the last X individuals (ie those with the highest score)
                indices_to_take = sorted_global_indices[-maybe_to_take:]

                underflow = maybe_to_take - len(indices_to_take)
                if underflow > 0:
                    total_underflow += underflow
                aligned[indices_to_take] = True
            elif affected < num_always:
                # more individuals forced in than needed
                total_overflow += num_always - affected

    num_aligned = int(np.sum(aligned))
    # this assertion is only valid in the non weighted case
    assert num_aligned == total_affected + total_overflow - total_underflow
    num_partitioned = sum(len(g) for g in groups)
    if config.debug and config.log_level == "processes":
        print(" %d/%d" % (num_aligned, num_partitioned), end=" ")
        if (take_filter is not None) or (leave_filter is not None):
            print("[take %d, leave %d]" % (take, leave), end=" ")
        if total_underflow:
            print("UNDERFLOW: %d" % total_underflow, end=" ")
        if total_overflow:
            print("OVERFLOW: %d" % total_overflow, end=" ")

    return aligned
def align_get_indices_nd(ctx_length, groups, need, filter_value, score,
                         take_filter=None, leave_filter=None,
                         method="bysorting"):
    """Select, within each group, the number of individuals given by *need*.

    Returns a boolean array of length *ctx_length*, True for selected
    ("aligned") individuals.

    * groups: sequence of index arrays, one per cell of *need* (iterated
      flat).
    * score: 'bysorting' takes the highest-scoring individuals first;
      for 'sidewalk' the scores are interpreted as probabilities in
      [0, 1] and used for systematic (sidewalk) sampling.
    * take_filter / leave_filter: boolean arrays forcing individuals in
      or out of the selection, respectively.
    * method: 'bysorting' (deterministic) or 'sidewalk' (random).

    Prints under/overflow statistics when log_level is "processes".
    """
    assert isinstance(need, np.ndarray) and \
        np.issubdtype(need.dtype, np.integer)
    assert score is None or isinstance(score, (bool, int, float, np.ndarray))

    if filter_value is not None:
        bool_filter_value = filter_value.copy()
    else:
        bool_filter_value = True

    # "maybe" = individuals neither forced in nor forced out
    maybe_filter = bool_filter_value
    if take_filter is not None:
        take_intersect = take_filter & bool_filter_value
        take = np.sum(take_intersect)
        # XXX: it would probably be faster to leave the filters as boolean
        # vector and do
        # take_members = take_filter[member_indices]
        # group_always = member_indices[take_members]
        # instead of
        # group_always = np.intersect1d(members_indices, take_indices,
        #                               assume_unique=True)
        take_indices = filter_to_indices(take_intersect)
        maybe_filter &= ~take_filter
    else:
        take = 0
        take_indices = None

    if leave_filter is not None:
        leave = np.sum(leave_filter & bool_filter_value)
        maybe_filter &= ~leave_filter
    else:
        leave = 0

    if take_filter is not None or leave_filter is not None:
        maybe_indices = filter_to_indices(maybe_filter)
    else:
        maybe_indices = None

    total_underflow = 0
    total_overflow = 0
    total_affected = 0

    aligned = np.zeros(ctx_length, dtype=bool)
    if method == 'sidewalk':
        # sidewalk treats scores as probabilities: they must lie in [0, 1]
        score_max = max(score)
        score_min = min(score)
        if score_max > 1 or score_min < 0:
            raise Exception("""Score values are in the interval {} - {}.
Sidewalk alignment can only be used with a score between 0 and 1.
You may want to use a logistic function.
""".format(score_min, score_max))

    # NOTE(review): izip is presumably itertools.izip (Python 2) -- confirm
    # the import at the top of the file
    for members_indices, group_need in izip(groups, need.flat):
        if len(members_indices):
            affected = group_need
            total_affected += affected

            if take_indices is not None:
                # individuals forced in by take_filter
                group_always = np.intersect1d(members_indices, take_indices,
                                              assume_unique=True)
                num_always = len(group_always)
                aligned[group_always] = True
            else:
                num_always = 0

            if affected > num_always:
                if maybe_indices is not None:
                    group_maybe_indices = np.intersect1d(members_indices,
                                                         maybe_indices,
                                                         assume_unique=True)
                else:
                    group_maybe_indices = members_indices
                if isinstance(score, np.ndarray):
                    if method == 'bysorting':
                        maybe_members_rank_value = score[group_maybe_indices]
                        # TODO: use np.partition (np1.8+)
                        sorted_local_indices = np.argsort(
                            maybe_members_rank_value)
                        sorted_global_indices = \
                            group_maybe_indices[sorted_local_indices]
                    elif method == 'sidewalk':
                        # random order for the systematic sampling below
                        sorted_global_indices = \
                            np.random.permutation(group_maybe_indices)
                else:
                    # if the score expression is a constant, we don't need to
                    # sort indices. In that case, the alignment will first take
                    # the individuals created last (highest id).
                    sorted_global_indices = group_maybe_indices

                # maybe_to_take is always > 0
                maybe_to_take = affected - num_always
                if method == 'bysorting':
                    # take the last X individuals (ie those with the highest
                    # score)
                    indices_to_take = sorted_global_indices[-maybe_to_take:]
                elif method == 'sidewalk':
                    proba_sum = sum(score[sorted_global_indices])
                    if maybe_to_take > round(proba_sum):
                        raise ValueError(
                            "Cannot use 'sidewalk' with need = {} > sum of probabilities = {}"
                            .format(maybe_to_take, proba_sum))
                    u = np.random.uniform() + np.arange(maybe_to_take)
                    # on the random sample, score are cumulated and then, we
                    # extract indices of each value before each value of u
                    cum_score = np.cumsum(score[sorted_global_indices])
                    indices_to_take = \
                        sorted_global_indices[np.searchsorted(cum_score, u)]

                underflow = maybe_to_take - len(indices_to_take)
                if underflow > 0:
                    total_underflow += underflow
                aligned[indices_to_take] = True
            elif affected < num_always:
                # more individuals forced in than needed
                total_overflow += num_always - affected

    num_aligned = int(np.sum(aligned))
    # this assertion is only valid in the non weighted case
    assert num_aligned == total_affected + total_overflow - total_underflow
    num_partitioned = sum(len(g) for g in groups)
    if config.log_level == "processes":
        print(" %d/%d" % (num_aligned, num_partitioned), end=" ")
        if (take_filter is not None) or (leave_filter is not None):
            print("[take %d, leave %d]" % (take, leave), end=" ")
        if total_underflow:
            print("UNDERFLOW: %d" % total_underflow, end=" ")
        if total_overflow:
            print("OVERFLOW: %d" % total_overflow, end=" ")

    return aligned
def align_get_indices_nd(ctx_length, groups, need, filter_value, score,
                         take_filter=None, leave_filter=None,
                         method="bysorting"):
    """Select, per group, *need* individuals (duplicate of the version above).

    Returns a boolean array of length *ctx_length*, True for selected
    individuals. 'bysorting' takes the highest-scoring candidates;
    'sidewalk' performs systematic sampling using scores as
    probabilities in [0, 1]. take_filter/leave_filter force individuals
    in/out. Prints under/overflow stats when log_level is "processes".
    """
    assert isinstance(need, np.ndarray) and \
        np.issubdtype(need.dtype, np.integer)
    assert score is None or isinstance(score, (bool, int, float, np.ndarray))

    if filter_value is not None:
        bool_filter_value = filter_value.copy()
    else:
        bool_filter_value = True

    # "maybe" = individuals neither forced in nor forced out
    maybe_filter = bool_filter_value
    if take_filter is not None:
        take_intersect = take_filter & bool_filter_value
        take = np.sum(take_intersect)
        # XXX: it would probably be faster to leave the filters as boolean
        # vector and do
        # take_members = take_filter[member_indices]
        # group_always = member_indices[take_members]
        # instead of
        # group_always = np.intersect1d(members_indices, take_indices,
        #                               assume_unique=True)
        take_indices = filter_to_indices(take_intersect)
        maybe_filter &= ~take_filter
    else:
        take = 0
        take_indices = None

    if leave_filter is not None:
        leave = np.sum(leave_filter & bool_filter_value)
        maybe_filter &= ~leave_filter
    else:
        leave = 0

    if take_filter is not None or leave_filter is not None:
        maybe_indices = filter_to_indices(maybe_filter)
    else:
        maybe_indices = None

    total_underflow = 0
    total_overflow = 0
    total_affected = 0

    aligned = np.zeros(ctx_length, dtype=bool)
    if method == 'sidewalk':
        # sidewalk treats scores as probabilities: they must lie in [0, 1]
        score_max = max(score)
        score_min = min(score)
        if score_max > 1 or score_min < 0:
            raise Exception("""Score values are in the interval {} - {}.
Sidewalk alignment can only be used with a score between 0 and 1.
You may want to use a logistic function.
""".format(score_min, score_max))

    # NOTE(review): izip is presumably itertools.izip (Python 2) -- confirm
    # the import at the top of the file
    for members_indices, group_need in izip(groups, need.flat):
        if len(members_indices):
            affected = group_need
            total_affected += affected

            if take_indices is not None:
                # individuals forced in by take_filter
                group_always = np.intersect1d(members_indices, take_indices,
                                              assume_unique=True)
                num_always = len(group_always)
                aligned[group_always] = True
            else:
                num_always = 0

            if affected > num_always:
                if maybe_indices is not None:
                    group_maybe_indices = np.intersect1d(members_indices,
                                                         maybe_indices,
                                                         assume_unique=True)
                else:
                    group_maybe_indices = members_indices
                if isinstance(score, np.ndarray):
                    if method == 'bysorting':
                        maybe_members_rank_value = score[group_maybe_indices]
                        # TODO: use np.partition (np1.8+)
                        sorted_local_indices = \
                            np.argsort(maybe_members_rank_value)
                        sorted_global_indices = \
                            group_maybe_indices[sorted_local_indices]
                    elif method == 'sidewalk':
                        # random order for the systematic sampling below
                        sorted_global_indices = \
                            np.random.permutation(group_maybe_indices)
                else:
                    # if the score expression is a constant, we don't need to
                    # sort indices. In that case, the alignment will first take
                    # the individuals created last (highest id).
                    sorted_global_indices = group_maybe_indices

                # maybe_to_take is always > 0
                maybe_to_take = affected - num_always
                if method == 'bysorting':
                    # take the last X individuals (ie those with the highest
                    # score)
                    indices_to_take = sorted_global_indices[-maybe_to_take:]
                elif method == 'sidewalk':
                    proba_sum = sum(score[sorted_global_indices])
                    if maybe_to_take > round(proba_sum):
                        raise ValueError(
                            "Cannot use 'sidewalk' with need = {} > sum of probabilities = {}".format(
                                maybe_to_take, proba_sum
                            )
                        )
                    u = np.random.uniform() + np.arange(maybe_to_take)
                    # on the random sample, score are cumulated and then, we
                    # extract indices of each value before each value of u
                    cum_score = np.cumsum(score[sorted_global_indices])
                    indices_to_take = \
                        sorted_global_indices[np.searchsorted(cum_score, u)]

                underflow = maybe_to_take - len(indices_to_take)
                if underflow > 0:
                    total_underflow += underflow
                aligned[indices_to_take] = True
            elif affected < num_always:
                # more individuals forced in than needed
                total_overflow += num_always - affected

    num_aligned = int(np.sum(aligned))
    # this assertion is only valid in the non weighted case
    assert num_aligned == total_affected + total_overflow - total_underflow
    num_partitioned = sum(len(g) for g in groups)
    if config.log_level == "processes":
        print(" %d/%d" % (num_aligned, num_partitioned), end=" ")
        if (take_filter is not None) or (leave_filter is not None):
            print("[take %d, leave %d]" % (take, leave), end=" ")
        if total_underflow:
            print("UNDERFLOW: %d" % total_underflow, end=" ")
        if total_overflow:
            print("OVERFLOW: %d" % total_overflow, end=" ")

    return aligned
def align_get_indices_nd(ctx_length, groups, need, filter_value, score,
                         take_filter=None, leave_filter=None,
                         method="default"):
    """Select, within each group, the number of individuals given by *need*.

    Returns a boolean array of length *ctx_length*, True for selected
    ("aligned") individuals. 'default' takes the highest-scoring
    candidates; 'sidewalk' performs systematic sampling using scores as
    probabilities in [0, 1]. take_filter/leave_filter force individuals
    in/out of the selection.

    Fixed relative to the original prototype (cf. the corrected versions
    of this function elsewhere in this file):
    * sidewalk cumulated ``score[sorted_local_indices]`` -- local group
      positions used to index the *global* score array -- instead of
      ``score[sorted_global_indices]``;
    * a stray debug ``print(method)`` was removed;
    * the undefined ``random()`` is replaced by ``np.random.uniform()``.
    """
    assert isinstance(need, np.ndarray) and \
        issubclass(need.dtype.type, np.integer)
    assert score is None or isinstance(score, (bool, int, float, np.ndarray))
    # to delete as the check is done earlier ?
    assert method in ("default", "sidewalk")

    if filter_value is not None:
        bool_filter_value = filter_value.copy()
    else:
        bool_filter_value = True

    # "maybe" = individuals neither forced in nor forced out
    maybe_filter = bool_filter_value
    if take_filter is not None:
        # XXX: I wonder if users would prefer if filter_value was taken into
        # account or not. This only impacts what it displayed on the console,
        # but still...
        take = np.sum(take_filter)
        # XXX: it would probably be faster to leave the filters as boolean
        # vector and do
        # take_members = take_filter[member_indices]
        # group_always = member_indices[take_members]
        # instead of
        # group_always = np.intersect1d(members_indices, take_indices,
        #                               assume_unique=True)
        take_indices = filter_to_indices(take_filter & bool_filter_value)
        maybe_filter &= ~take_filter
    else:
        take = 0
        take_indices = None

    if leave_filter is not None:
        leave = np.sum(leave_filter)
        maybe_filter &= ~leave_filter
    else:
        leave = 0

    if take_filter is not None or leave_filter is not None:
        maybe_indices = filter_to_indices(maybe_filter)
    else:
        maybe_indices = None

    total_underflow = 0
    total_overflow = 0
    total_affected = 0

    aligned = np.zeros(ctx_length, dtype=bool)
    for members_indices, group_need in izip(groups, need.flat):
        if len(members_indices):
            affected = group_need
            total_affected += affected

            if take_indices is not None:
                # individuals forced in by take_filter
                group_always = np.intersect1d(members_indices, take_indices,
                                              assume_unique=True)
                num_always = len(group_always)
                aligned[group_always] = True
            else:
                num_always = 0

            if affected > num_always:
                if maybe_indices is not None:
                    group_maybe_indices = np.intersect1d(members_indices,
                                                         maybe_indices,
                                                         assume_unique=True)
                else:
                    group_maybe_indices = members_indices
                if isinstance(score, np.ndarray):
                    if method == 'default':
                        maybe_members_rank_value = score[group_maybe_indices]
                        sorted_local_indices = \
                            np.argsort(maybe_members_rank_value)
                        sorted_global_indices = \
                            group_maybe_indices[sorted_local_indices]
                    elif method == 'sidewalk':
                        if max(score[group_maybe_indices]) > 1 or \
                                min(score[group_maybe_indices]) < 0:
                            raise Exception("Sidewalk method can be used"
                                            " only with a score between 0"
                                            " and 1. You may want to use a"
                                            " logistic function ")
                        # random order for the systematic sampling below
                        # (equivalent to permuting local indices and then
                        # indexing group_maybe_indices with them)
                        sorted_global_indices = \
                            np.random.permutation(group_maybe_indices)
                else:
                    # if the score expression is a constant, we don't need to
                    # sort indices. In that case, the alignment will first take
                    # the individuals created last (highest id).
                    sorted_global_indices = group_maybe_indices

                # maybe_to_take is always > 0
                maybe_to_take = affected - num_always
                if method == 'default':
                    # take the last X individuals (ie those with the highest
                    # score)
                    indices_to_take = sorted_global_indices[-maybe_to_take:]
                elif method == 'sidewalk':
                    u = np.random.uniform() + np.arange(maybe_to_take)
                    # on the random sample, scores are cumulated and then we
                    # extract the index of the first individual reaching each
                    # value of u.
                    # BUG FIX: the cumulated scores must be those of the
                    # (permuted) *global* indices, not score indexed by the
                    # local positions within the group
                    cum_score = np.cumsum(score[sorted_global_indices])
                    indices_to_take = np.searchsorted(cum_score, u)
                    indices_to_take = sorted_global_indices[indices_to_take]

                underflow = maybe_to_take - len(indices_to_take)
                if underflow > 0:
                    total_underflow += underflow
                aligned[indices_to_take] = True
            elif affected < num_always:
                # more individuals forced in than needed
                total_overflow += num_always - affected

    num_aligned = int(np.sum(aligned))
    # this assertion is only valid in the non weighted case
    assert num_aligned == total_affected + total_overflow - total_underflow
    num_partitioned = sum(len(g) for g in groups)
    print(" %d/%d" % (num_aligned, num_partitioned), end=" ")
    if (take_filter is not None) or (leave_filter is not None):
        print("[take %d, leave %d]" % (take, leave), end=" ")
    if total_underflow:
        print("UNDERFLOW: %d" % total_underflow, end=" ")
    if total_overflow:
        print("OVERFLOW: %d" % total_overflow, end=" ")

    return aligned
def compute(self, context, score, need, filter=None, take=None, leave=None,
            expressions=None, possible_values=None, errors='default',
            frac_need='uniform', link=None, secondary_axis=None):
    """Align the entity, rescaling needs/scores by the period length.

    Normalizes *need* to an ndarray, validates *frac_need*, partitions
    the filtered population into groups, rescales needs and scores
    according to the context periodicity vs ``self.periodicity_given``,
    then delegates the selection to ``align_get_indices_nd``.

    Fixed relative to the original:
    * ``ctx_length`` was used but never defined (NameError) -- it is now
      computed with ``context_length(context)`` like in the sibling
      versions of this function;
    * ``raise("...")`` raised a bare string (itself a TypeError at
      runtime) -- replaced by ``raise ValueError(...)``;
    * Python 2-only ``basestring`` replaced by ``str``.
    """
    # need is a single scalar
    if np.isscalar(need):
        need = [need]
    # need is a non-ndarray sequence
    if isinstance(need, (tuple, list)):
        need = np.array(need)
    assert isinstance(need, np.ndarray)

    if expressions is None:
        expressions = []

    if possible_values is None:
        possible_values = []
    else:
        possible_values = [np.array(pv) for pv in possible_values]

    if frac_need not in ('uniform', 'cutoff', 'round'):
        # bad value -> ValueError, bad type -> TypeError
        cls = ValueError if isinstance(frac_need, str) else TypeError
        raise cls("frac_need should be one of: 'uniform', 'cutoff' or "
                  "'round'")

    # BUG FIX: ctx_length was referenced below but never assigned
    ctx_length = context_length(context)

    scores = expr_eval(self.expr, context)
    filter_value = expr_eval(self._getfilter(context), context)
    need, expressions, possible_values = \
        self._eval_need(context, scores, filter_value)
    take_filter = expr_eval(self.take_filter, context)
    leave_filter = expr_eval(self.leave_filter, context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    # retrieve the columns we need to work with
    if expressions:
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    periodicity = context['periodicity']
    if context['format_date'] == 'year0':
        periodicity = periodicity * 12
    # give right periodicity/self.periodicity_given whereas
    # self.periodicity_given/12 doesn't
    # sign(self.periodicity_given) = sign(periodicity)
    self.periodicity_given = \
        self.periodicity_given * (self.periodicity_given * periodicity) \
        / abs(self.periodicity_given * periodicity)
    if gcd(periodicity, self.periodicity_given) not in \
            [periodicity, self.periodicity_given]:
        # BUG FIX: was ``raise("...")`` which raises a bare string
        raise ValueError("mix of quarter and triannual impossible")
    need = need * periodicity / self.periodicity_given
    if scores is not None:
        scores = scores * periodicity / self.periodicity_given

    # noinspection PyAugmentAssignment
    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need, method=frac_need)
    need = self._add_past_error(context, need, method=errors)
    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                scores, take_filter, leave_filter,
                                method=self.method)
def align_no_link(self, context):
    """Align the entity without a link, rescaling by the period length.

    Evaluates scores, needs and filters on *context*, partitions the
    filtered individuals into groups, rescales needs and scores
    according to the context periodicity vs ``self.periodicity_given``,
    then delegates the selection to ``align_get_indices_nd``.

    Fixed relative to the original:
    * ``columns`` was undefined in the no-expressions branch although it
      is passed to ``_display_unaligned`` below -- now initialized to [];
    * ``raise("...")`` raised a bare string (a TypeError at runtime) --
      replaced by ``raise ValueError(...)``;
    * stray "zzzz" typo and dead commented-out experiment removed.
    """
    ctx_length = context_length(context)

    scores = expr_eval(self.expr, context)
    filter_value = expr_eval(self._getfilter(context), context)
    need, expressions, possible_values = \
        self._eval_need(context, scores, filter_value)
    take_filter = expr_eval(self.take_filter, context)
    leave_filter = expr_eval(self.leave_filter, context)

    if filter_value is not None:
        num_to_align = np.sum(filter_value)
    else:
        num_to_align = ctx_length

    if expressions:
        # retrieve the columns we need to work with
        columns = [expr_eval(expr, context) for expr in expressions]
        if filter_value is not None:
            groups = partition_nd(columns, filter_value, possible_values)
        else:
            groups = partition_nd(columns, True, possible_values)
    else:
        # BUG FIX: columns must be defined in this branch too since it is
        # passed to _display_unaligned below
        columns = []
        if filter_value is not None:
            groups = [filter_to_indices(filter_value)]
        else:
            groups = [np.arange(num_to_align)]

    # the sum is not necessarily equal to len(a), because some individuals
    # might not fit in any group (eg if some alignment data is missing)
    if sum(len(g) for g in groups) < num_to_align:
        unaligned = np.ones(ctx_length, dtype=bool)
        if filter_value is not None:
            unaligned[~filter_value] = False
        for member_indices in groups:
            unaligned[member_indices] = False
        self._display_unaligned(expressions, context['id'], columns,
                                unaligned)

    periodicity = context['periodicity']
    if context['format_date'] == 'year0':
        periodicity = periodicity * 12
    # give right periodicity/self.periodicity_given whereas
    # self.periodicity_given/12 doesn't
    # sign(self.periodicity_given) = sign(periodicity)
    self.periodicity_given = \
        self.periodicity_given * (self.periodicity_given * periodicity) \
        / abs(self.periodicity_given * periodicity)
    if gcd(periodicity, self.periodicity_given) not in \
            [periodicity, self.periodicity_given]:
        # BUG FIX: was ``raise("...")`` which raises a bare string
        raise ValueError("mix of quarter and triannual impossible")
    need = need * periodicity / self.periodicity_given
    if scores is not None:
        scores = scores * periodicity / self.periodicity_given

    need = need * self._get_need_correction(groups, possible_values)
    need = self._handle_frac_need(need)
    need = self._add_past_error(need, context)

    return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                scores, take_filter, leave_filter,
                                method=self.method)