Example #1
            def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
                global local_ctx
                
                set2_size = context_length(local_ctx)
                if not set2_size:
                    raise StopIteration
                
                if set2_size > pool_size:
                    pool = random.sample(xrange(context_length(local_ctx)), pool_size)
                else:
                    pool = range(set2_size)

                sub_local_ctx = context_subset(local_ctx, pool, None)
                sub_local_ctx.update((k, set1[k][sorted_idx]) for k in ['id'] + used_variables1)
                
                set2_scores = expr_eval(score_expr, sub_local_ctx)
    
                individual2_pool_idx = np.argmax(set2_scores)
                individual2_idx = pool[individual2_pool_idx]
                
                id1 = sub_local_ctx['id']
                id2 = local_ctx['__other_id'][individual2_idx]
    
                local_ctx = context_delete(local_ctx, individual2_idx)
    
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1
Example #2
    def evaluate(self, context):
        if config.debug:
            print()
            print("random sequence position before:", np.random.get_state()[2])
        num = context_length(context)
        choices = self.choices
        if num:
            bins = self.bins
            if bins is None:
                # all values have the same probability
                choices_idx = np.random.randint(len(choices), size=num)
            else:
                if any(isinstance(b, Expr) for b in bins):
                    weights = [expr_eval(expr, context) for expr in bins]
                    bins = self._weights_to_bins(weights)
                u = np.random.uniform(size=num)
                #XXX: np.choice uses searchsorted(bins, u) instead of digitize
                choices_idx = np.digitize(u, bins) - 1
        else:
            choices_idx = []

        if config.debug:
            print("random sequence position after:", np.random.get_state()[2])

        if any(isinstance(c, Expr) for c in choices):
            choices = np.array([expr_eval(expr, context) for expr in choices])
        return choices[choices_idx]
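The branch above draws uniform numbers and maps them to choice indices with np.digitize over cumulative probability bins. Below is a minimal, self-contained sketch of the same sampling pattern outside the framework; the function and variable names are invented for illustration only.

    import numpy as np

    def weighted_choice(choices, probabilities, size):
        # cumulative bins: [0, p0, p0+p1, ..., 1.0]
        bins = np.concatenate(([0.0], np.cumsum(probabilities)))
        u = np.random.uniform(size=size)
        # digitize returns 1-based bin indices, hence the -1
        idx = np.digitize(u, bins) - 1
        return np.asarray(choices)[idx]

    # draw 5 values from {0, 1, 2} with probabilities 0.2 / 0.3 / 0.5
    print(weighted_choice([0, 1, 2], [0.2, 0.3, 0.5], size=5))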
Example #3
    def compute(self, context, set1filter, set2filter, orderby1, orderby2):
        set1filterexpr = self._getfilter(context, set1filter)
        set1filtervalue = expr_eval(set1filterexpr, context)
        set2filterexpr = self._getfilter(context, set2filter)
        set2filtervalue = expr_eval(set2filterexpr, context)
        set1len = set1filtervalue.sum()
        set2len = set2filtervalue.sum()
        numtomatch = min(set1len, set2len)
        print("matching with %d/%d individuals" % (set1len, set2len))
        result = np.full(context_length(context), -1, dtype=int)
        if not numtomatch:
            return result

        sorted_set1_indices = orderby1[set1filtervalue].argsort()[-numtomatch:]
        sorted_set2_indices = orderby2[set2filtervalue].argsort()[-numtomatch:]

        set1ids = context['id'][set1filtervalue]
        set2ids = context['id'][set2filtervalue]

        id_to_rownum = context.id_to_rownum
        id1 = set1ids[sorted_set1_indices]
        id2 = set2ids[sorted_set2_indices]
        # cannot use sorted_setX_indices because those are "local" indices
        result[id_to_rownum[id1]] = id2
        result[id_to_rownum[id2]] = id1
        return result
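The last three lines of the example above show the pattern used throughout these snippets: a result vector sized with context_length(context), pre-filled with -1, then written symmetrically through id_to_rownum so that each matched individual points to its partner. A tiny standalone illustration follows; all values are invented for the demo.

    import numpy as np

    # hypothetical population of 6 individuals whose ids equal their row numbers
    id_to_rownum = np.arange(6)
    result = np.full(6, -1, dtype=int)

    id1 = np.array([0, 2])    # matched individuals from set 1
    id2 = np.array([5, 3])    # their partners from set 2

    result[id_to_rownum[id1]] = id2
    result[id_to_rownum[id2]] = id1
    print(result)             # [ 5 -1  3  2 -1  0]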
Example #4
         def match_one_set1_individual(idx, sorted_idx):
             global local_ctx
 
             if not context_length(local_ctx):
                 raise StopIteration
 
             local_ctx.update((k, set1[k][sorted_idx]) for k in ['id'] + used_variables1)
 
 #            pk = tuple(individual1[fname] for fname in pk_names)
 #            optimized_expr = optimized_exprs.get(pk)
 #            if optimized_expr is None:
 #                for name in pk_names:
 #                    fake_set1['__f_%s' % name].value = individual1[name]
 #                optimized_expr = str(symbolic_expr.simplify())
 #                optimized_exprs[pk] = optimized_expr
 #            set2_scores = evaluate(optimized_expr, mm_dict, set2)
 
             set2_scores = expr_eval(score_expr, local_ctx)
 
             individual2_idx = np.argmax(set2_scores)
 
             id1 = local_ctx['id']
             id2 = local_ctx['__other_id'][individual2_idx]
 
             local_ctx = context_delete(local_ctx, individual2_idx)
 
             result[id_to_rownum[id1]] = id2
             result[id_to_rownum[id2]] = id1            
Example #5
         def create_cost(idx, sorted_idx):
 
             global cost
             if not context_length(local_ctx):
                 raise StopIteration
             local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)
 
             set2_scores = expr_eval(score_expr, local_ctx)
             cost.append(set2_scores[:].tolist())
Example #6
 def build_context(self, context):
     if config.debug:
         print()
         print("random sequence position before:", np.random.get_state()[2])
     context[self.u_varname] = \
         np.random.uniform(size=context_length(context))
     if config.debug:
         print("random sequence position after:", np.random.get_state()[2])
     return context
Example #7
 def compute(self, context, filter=None):
     if filter is None:
         return context_length(context)
     else:
         #TODO: check this at "compile" time (in __init__), though for
         # that we need to know the type of all temporary variables
         # first
         if not np.issubdtype(filter.dtype, bool):
             raise ValueError("count filter must be a boolean expression")
         return np.sum(filter)
Example #8
 def compute(self, context, filter=None):
     if filter is None:
         return context_length(context)
     else:
         # TODO: check this at "compile" time (in __init__), though for
         # that we need to know the type of all temporary variables
         # first
         if not np.issubdtype(filter.dtype, bool):
             raise ValueError("count filter must be a boolean expression")
         return np.sum(filter)
Example #9
 def evaluate(self, context):
     if self.filter is None:
         return context_length(context)
     else:
         #TODO: check this at "compile" time (in __init__), though for
         # that we need to know the type of all temporary variables
         # first
         if getdtype(self.filter, context) is not bool:
             raise Exception("count filter must be a boolean expression")
         return np.sum(expr_eval(self.filter, context))
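The count() variants above fall back to context_length(context), i.e. the number of rows in the current entity context, when no filter is given. As a rough mental model only (the framework's actual context objects are richer and its implementation may differ), such a helper could look like this for a plain dict-of-columns context:

    import numpy as np

    def context_length(ctx):
        # assumed behavior: return the common length of the entity columns,
        # or 0 for an empty context
        for value in ctx.values():
            if isinstance(value, np.ndarray) and value.ndim:
                return len(value)
        return 0

    print(context_length({'id': np.arange(4), 'age': np.array([20, 31, 45, 60])}))  # 4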
Example #10
 def evaluate(self, context):
     args = [expr_eval(arg, context) for arg in self.args]
     kwargs = dict((k, expr_eval(v, context))
                   for k, v in self.kwargs.iteritems())
     if 'size' in self.arg_names and 'size' not in kwargs:
         kwargs['size'] = context_length(context)
     if self.filter_expr is None:
         filter_value = None
     else:
         filter_value = expr_eval(self.filter_expr, context)
     func = self.np_func[0]
     return self.compute(func, args, kwargs, filter_value)
Example #11
    def align_no_link(self, context, score, need, filter, take, leave,
                      expressions, possible_values, errors, frac_need, link,
                      secondary_axis, method):

        ctx_length = context_length(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values)

        filter_value = expr_eval(self._getfilter(context, filter), context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        # retrieve the columns we need to work with
        if expressions:
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum is not necessarily equal to len(a), because some individuals
        # might not fit in any group (eg if some alignment data is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        # noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        # FIXME: either handle past_error in no link (currently, the past
        #        error is added... but never computed, so always 0 !) or raise
        #        an error in case errors='carry" is used with no link.
        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    score, take, leave, method)
Example #12
    def align_no_link(self, context, score, need, filter, take, leave,
                      expressions, possible_values, errors, frac_need, link,
                      secondary_axis, method):

        ctx_length = context_length(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values)

        filter_value = expr_eval(self._getfilter(context, filter), context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        # retrieve the columns we need to work with
        if expressions:
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum is not necessarily equal to len(a), because some individuals
        # might not fit in any group (eg if some alignment data is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        # noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        # FIXME: either handle past_error in no link (currently, the past
        #        error is added... but never computed, so always 0 !) or raise
        #        an error in case errors='carry" is used with no link.
        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    score, take, leave, method)
Example #13
         def match_one_set1_individual(idx, sorted_idx):
             global local_ctx   
             if not context_length(local_ctx):
                 raise StopIteration    
             local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)
             set2_scores = expr_eval(score_expr, local_ctx)
 #            print set2_scores
             individual2_idx = np.argmax(set2_scores)   
             id1 = local_ctx['id']
             id2 = local_ctx['__other_id'][individual2_idx]    
             local_ctx = context_delete(local_ctx, individual2_idx)
 
             result[id_to_rownum[id1]] = id2
             result[id_to_rownum[id2]] = id1
Example #14
    def eval_rows(self, source_rows, expr_value, context):
        result = np.empty(context_length(context), dtype=expr_value.dtype)
        result.fill(get_missing_value(expr_value))

        id_sort_indices = np.argsort(source_rows)
        sorted_rownum = source_rows[id_sort_indices]
        sorted_values = expr_value[id_sort_indices]
        groups = groupby(izip(sorted_rownum, sorted_values), key=itemgetter(0))
        aggregate_func = self.aggregate_func
        for rownum, values in groups:
            if rownum == -1:
                continue
            # Note that v[n] is faster than using an itemgetter, even with map
            result[rownum] = aggregate_func(v[1] for v in values)
        return result
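The aggregation loop above sorts the link rows, groups equal row numbers together, applies aggregate_func once per target row and skips broken links (rownum == -1). The same idea in a minimal standalone form; the data and the sum aggregate below are made up for the demo.

    import numpy as np
    from itertools import groupby
    from operator import itemgetter

    # hypothetical links: four source values pointing to target rows 2, 0, -1, 0
    source_rows = np.array([2, 0, -1, 0])
    expr_value = np.array([10.0, 1.0, 99.0, 2.0])

    result = np.full(3, np.nan)               # one slot per target row
    order = np.argsort(source_rows)
    pairs = zip(source_rows[order], expr_value[order])
    for rownum, values in groupby(pairs, key=itemgetter(0)):
        if rownum == -1:
            continue                          # skip broken links
        result[rownum] = sum(v for _, v in values)
    print(result)                             # [ 3. nan 10.]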
Example #15
    def align_no_link(self, context):
        ctx_length = context_length(context)

        scores = expr_eval(self.expr, context)

        need, expressions, possible_values = self._eval_need(context)

        filter_value = expr_eval(self._getfilter(context), context)
        take_filter = expr_eval(self.take_filter, context)
        leave_filter = expr_eval(self.leave_filter, context)

        if filter_value is not None:
            num_to_align = np.sum(filter_value)
        else:
            num_to_align = ctx_length

        if expressions:
            # retrieve the columns we need to work with
            columns = [expr_eval(expr, context) for expr in expressions]
            if filter_value is not None:
                groups = partition_nd(columns, filter_value, possible_values)
            else:
                groups = partition_nd(columns, True, possible_values)
        else:
            columns = []
            if filter_value is not None:
                groups = [filter_to_indices(filter_value)]
            else:
                groups = [np.arange(num_to_align)]

        # the sum is not necessarily equal to len(a), because some individuals
        # might not fit in any group (eg if some alignment data is missing)
        if sum(len(g) for g in groups) < num_to_align:
            unaligned = np.ones(ctx_length, dtype=bool)
            if filter_value is not None:
                unaligned[~filter_value] = False
            for member_indices in groups:
                unaligned[member_indices] = False
            self._display_unaligned(expressions, context['id'], columns,
                                    unaligned)

        #noinspection PyAugmentAssignment
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need)
        need = self._add_past_error(need, context)

        return align_get_indices_nd(ctx_length, groups, need, filter_value,
                                    scores, take_filter, leave_filter)
Example #16
    def _eval_args(self, context):
        args, kwargs = NumpyCreateArray._eval_args(self, context)
        if 'size' in self.argspec.args:
            pos = self.argspec.args.index('size')
            size = args[pos]

            # The original functions return a scalar when size is None, and an
            # array of length one when size is 1.
            #TODO: users should have a way to have the "size=None" behavior. We
            # could differentiate whether None was explicitly passed or comes
            # from the default value (as we did previously: 'size' not in
            # kwargs), but I do not think it is a good idea. Adding a new
            # "sentinel" value (e.g. -1 or "scalar") is probably better.
            if size is None:
                args = args[:pos] + (context_length(context),) + args[pos + 1:]
        return args, kwargs
Example #17
    def eval_rows(self, source_rows, expr_value, context):
        # We can't use a negative value because that is not allowed by
        # bincount, and using a value too high will uselessly increase the size
        # of the array returned by bincount
        idx_for_missing = context_length(context)

        missing_int = missing_values[int]

        # filter out missing values: those where the object pointed to does not
        # exist anymore (the id corresponds to -1 in id_to_rownum)
        #XXX: use np.putmask(source_rows, source_ids == missing_int,
        #                    missing_int)
        source_rows[source_rows == missing_int] = idx_for_missing

        counts = self.count(source_rows, expr_value)
        counts.resize(idx_for_missing)
        return counts
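The trick above re-routes missing links to an extra slot equal to context_length(context) so that bincount accepts them, then drops that slot at the end. A small illustration of the indexing with invented data (a plain bincount stands in for the count() call):

    import numpy as np

    missing_int = -1
    # hypothetical links: rows 1, 1, missing, 3 on a target side of length 4
    source_rows = np.array([1, 1, missing_int, 3])

    idx_for_missing = 4                       # would be context_length(context)
    rows = source_rows.copy()
    rows[rows == missing_int] = idx_for_missing
    counts = np.bincount(rows, minlength=idx_for_missing + 1)[:idx_for_missing]
    print(counts)                             # [0 2 0 1]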
Example #18
    def evaluate(self, context):
        ctx_filter = context.get('__filter__')
        id_to_rownum = context.id_to_rownum

        # at some point ctx_filter will be cached automatically, so we don't
        # need to take care of it manually here
        if ctx_filter is not None:
            set1filter = expr_eval(ctx_filter & self.set1filter, context)
            set2filter = expr_eval(ctx_filter & self.set2filter, context)
        else:
            set1filter = expr_eval(self.set1filter, context)
            set2filter = expr_eval(self.set2filter, context)

        rank1_expr = self.rank1_expr
        rank2_expr = self.rank2_expr
        used_variables1 = rank1_expr.collect_variables(context)
        used_variables2 = rank2_expr.collect_variables(context)
        used_variables1.add('id')
        used_variables2.add('id')
        set1 = context_subset(context, set1filter, used_variables1)
        set2 = context_subset(context, set2filter, used_variables2)
        set1len = set1filter.sum()
        set2len = set2filter.sum()
        tomatch = min(set1len, set2len)
        order1 = expr_eval(rank1_expr, context)
        order2 = expr_eval(rank2_expr, context)
        if not self.ascending1:
            order1 = -order1        # reverse sorting
        if not self.ascending2:
            order2 = -order2        # reverse sorting

        sorted_set1_indices = order1[set1filter].argsort()
        sorted_set2_indices = order2[set2filter].argsort()
        idx1 = sorted_set1_indices[:tomatch]
        idx2 = sorted_set2_indices[:tomatch]
        print("matching with %d/%d individuals" % (set1len, set2len))
        
        result = np.empty(context_length(context), dtype=int)
        result.fill(-1)
        
        id1 = set1['id'][idx1]
        id2 = set2['id'][idx2]
        result[id_to_rownum[id1]] = id2
        result[id_to_rownum[id2]] = id1

        return result
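The rank-based matching above sorts both sets on their ordering expressions and pairs them position by position, keeping only the first tomatch individuals of each side. A toy version of that ranking step, with invented scores:

    import numpy as np

    # hypothetical ordering scores for the two sets
    order1 = np.array([3.0, 1.0, 2.0])            # set 1
    order2 = np.array([10.0, 30.0, 20.0, 5.0])    # set 2 (one extra individual)

    tomatch = min(len(order1), len(order2))
    idx1 = order1.argsort()[:tomatch]             # set 1 positions, lowest rank first
    idx2 = order2.argsort()[:tomatch]             # set 2 positions, lowest rank first
    print(list(zip(idx1.tolist(), idx2.tolist())))  # [(1, 3), (2, 0), (0, 2)]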
Example #19
    def _eval_args(self, context):
        args, kwargs = NumpyCreateArray._eval_args(self, context)
        if 'size' in self.argspec.args:
            pos = self.argspec.args.index('size')
            size = args[pos]

            # The original functions return a scalar when size is None, and an
            # array of length one when size is 1.
            # TODO: users should have a way to have the "size=None" behavior. We
            # could differentiate whether None was explicitly passed or comes
            # from the default value (as we did previously: 'size' not in
            # kwargs), but I do not think it is a good idea. Adding a new
            # "sentinel" value (e.g. -1 or "scalar") is probably better.
            if size is None:
                args = args[:pos] + (context_length(context), ) + args[pos +
                                                                       1:]
        return args, kwargs
Example #20
    def as_string(self, context):
        tmp_varname = get_tmp_varname()
        result = expr_eval(self, context)
        if isinstance(result, dict):
            indices = result['indices']
            values = result['values']
        else:
            indices = None

        if indices is not None:
            if isinstance(values, np.ndarray):
                res_type = values.dtype.type
            else:
                res_type = type(values)
            result = np.zeros(context_length(context), dtype=res_type)
            np.put(result, indices, values)
        context[tmp_varname] = result
        return tmp_varname
Example #21
    def fill_missing_values(self, ids, values, context, filler='auto'):
        '''ids: ids present in past period
           context: current period context'''
        if filler == 'auto':
            filler = get_missing_value(values)
        result = np.empty(context_length(context), dtype=values.dtype)
        result.fill(filler)
        if len(ids):
            id_to_rownum = context.id_to_rownum
            # if there were more objects in the past than in the current
            # period. Currently, remove() keeps old ids, so this never
            # happens, but if we ever change remove(), we'll need to add
            # such a check everywhere we use id_to_rownum
#            invalid_ids = ids > len(id_to_rownum)
#            if np.any(invalid_ids):
#                fix ids
            rows = id_to_rownum[ids]
            safe_put(result, rows, values)
        return result
Example #22
    def eval_rows(self, source_rows, target_filter, context):
        target_context = self.target_context(context)
        value_column = expr_eval(self.target_expr, target_context)
        if target_filter is not None:
            value_column = value_column[target_filter]
        assert len(source_rows) == len(value_column)

        result = np.empty(context_length(context), dtype=value_column.dtype)
        result.fill(get_missing_value(value_column))

        id_sort_indices = np.argsort(source_rows)
        sorted_rownum = source_rows[id_sort_indices]
        sorted_values = value_column[id_sort_indices]
        groups = groupby(izip(sorted_rownum, sorted_values), key=itemgetter(0))
        aggregate_func = self.aggregate_func
        for rownum, values in groups:
            if rownum == -1:
                continue
            result[rownum] = aggregate_func(v[1] for v in values)
        return result
Example #23
    def evaluate(self, context):
        num = context_length(context)
        choices = self.choices
        if num:
            bins = self.bins
            if bins is None:
                # all values have the same probability
                choices_idx = np.random.randint(len(choices), size=num)
            else:
                if any(isinstance(b, Expr) for b in bins):
                    weights = [expr_eval(expr, context) for expr in bins]
                    bins = self._weights_to_bins(weights)
                u = np.random.uniform(size=num)
                choices_idx = np.digitize(u, bins) - 1
        else:
            choices_idx = []

        if any(isinstance(c, Expr) for c in choices):
            choices = np.array([expr_eval(expr, context) for expr in choices])

        return choices[choices_idx]
Example #24
    def fill_missing_values(ids, values, context, filler='auto'):
        """
        ids: ids present in past period
        values: values in past period
        context: current period context
        """

        if filler == 'auto':
            filler = get_default_value(values)
        result = np.full(context_length(context), filler, dtype=values.dtype)
        if len(ids):
            id_to_rownum = context.id_to_rownum
            # if there were more objects in the past than in the current
            # period. Currently, remove() keeps old ids, so this never
            # happens, but if we ever change remove(), we'll need to add
            # such a check everywhere we use id_to_rownum
            # invalid_ids = ids > len(id_to_rownum)
            # if np.any(invalid_ids):
            #     fix ids
            rows = id_to_rownum[ids]
            safe_put(result, rows, values)
        return result
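A short usage sketch of the pattern above: values known for past ids are scattered into a result array sized for the current period, and every other row keeps the filler. The arrays below are invented for the demo, and plain fancy indexing stands in for safe_put.

    import numpy as np

    # hypothetical current period with 5 rows; only ids 0 and 3 carry past values
    id_to_rownum = np.array([0, -1, -1, 3, -1])
    past_ids = np.array([0, 3])
    past_values = np.array([1.5, 2.5])

    result = np.full(5, np.nan)               # filler for rows without a past value
    result[id_to_rownum[past_ids]] = past_values
    print(result)                             # [1.5 nan nan 2.5 nan]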
Example #25
    def evaluate(self, context):
        num = context_length(context)
        choices = self.choices
        if num:
            bins = self.bins
            if bins is None:
                # all values have the same probability
                choices_idx = np.random.randint(len(choices), size=num)
            else:
                if len(bins) == 2 and bins[0] == 'dynamic':
                    weights = [expr_eval(expr, context) for expr in bins[1]]
                    bins = self._weights_to_bins(weights)
                u = np.random.uniform(size=num)
                choices_idx = np.digitize(u, bins) - 1
        else:
            choices_idx = []

        if len(choices) == 2 and choices[0] == 'dynamic':
            choices = np.array([expr_eval(expr, context)
                                for expr in choices[1]])

        return choices[choices_idx]
Example #26
        def match_cell(idx, sorted_idx, pool_size):
            global matching_ctx

            set2_size = context_length(matching_ctx)
            if not set2_size:
                raise StopIteration

            if pool_size is not None and set2_size > pool_size:
                pool = random.sample(xrange(set2_size), pool_size)
                local_ctx = context_subset(matching_ctx, pool)
            else:
                local_ctx = matching_ctx.copy()

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in {'__ids__'} | used_variables1)

            eval_ctx = context.clone(entity_data=local_ctx)
            set2_scores = expr_eval(score, eval_ctx)
            cell2_idx = set2_scores.argmax()

            cell1ids = local_ctx['__ids__']
            cell2ids = local_ctx['__other___ids__'][cell2_idx]

            if pool_size is not None and set2_size > pool_size:
                # transform pool-local index to set/matching_ctx index
                cell2_idx = pool[cell2_idx]

            cell1size = len(cell1ids)
            cell2size = len(cell2ids)
            nb_match = min(cell1size, cell2size)

            # we could introduce a random choice here, but it is not really
            # necessary. If we did, it should be done in group_context
            ids1 = cell1ids[:nb_match]
            ids2 = cell2ids[:nb_match]

            result[id_to_rownum[ids1]] = ids2
            result[id_to_rownum[ids2]] = ids1
            
            if nb_match == cell2size:
                matching_ctx = context_delete(matching_ctx, cell2_idx)
            else:
                # other variables do not need to be modified since the cell
                # only got smaller and was not deleted
                matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

            # FIXME: the expr gets cached for the full matching_ctx at the
            # beginning, and then when another woman with the same values is
            # found, it thinks it can reuse the expr, but it breaks because it
            # does not have the correct length.

            # the current workaround is to invalidate the whole cache for the
            # current entity but this is not the right way to go.
            # * disable the cache for matching?
            # * use a local cache so that methods after matching() can use
            # what was in the cache before matching(). Shouldn't the cache be
            # stored inside the context anyway?
            expr_cache.invalidate(context.period, context.entity_name)

            if nb_match < cell1size:
                set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
                match_cell(idx, sorted_idx, pool_size)
Example #27
    def evaluate(self, context):
        global local_ctx

        ctx_filter = context.get('__filter__')

        id_to_rownum = context.id_to_rownum

        # at some point ctx_filter will be cached automatically, so we don't
        # need to take care of it manually here
        if ctx_filter is not None:
            set1filter = expr_eval(ctx_filter & self.set1filter, context)
            set2filter = expr_eval(ctx_filter & self.set2filter, context)
        else:
            set1filter = expr_eval(self.set1filter, context)
            set2filter = expr_eval(self.set2filter, context)

        score_expr = self.score_expr

        used_variables = score_expr.collect_variables(context)
        used_variables1 = [v for v in used_variables
                                    if not v.startswith('__other_')]
        used_variables2 = [v[8:] for v in used_variables
                                    if v.startswith('__other_')]

        set1 = context_subset(context, set1filter, ['id'] + used_variables1)
        set2 = context_subset(context, set2filter, ['id'] + used_variables2)
        set1len = set1filter.sum()
        set2len = set2filter.sum()
        tomatch = min(set1len, set2len)
        
        orderby = self.orderby
        if not isinstance(orderby, str):
            order = expr_eval(orderby, context)
        else: 
            order = np.zeros(context_length(context), dtype=int)
            if orderby == 'EDtM':
                for var in used_variables1:
                    order[set1filter] += ((set1[var] - set1[var].mean()) ** 2
                                          / set1[var].var())
            if orderby == 'SDtOM':
                order_ctx = dict((k if k in used_variables1 else k, v)
                                 for k, v in set1.iteritems())
                order_ctx.update(('__other_' + k, set2[k].mean())
                                 for k in used_variables2)
                order[set1filter] = expr_eval(score_expr, order_ctx)
        
        sorted_set1_indices = order[set1filter].argsort()[::-1]
        set1tomatch = sorted_set1_indices[:tomatch]
        print("matching with %d/%d individuals" % (set1len, set2len))

        #TODO: compute pk_names automatically: variables which are either
        # boolean, or have very few possible values and which are used more
        # than once in the expression and/or which are used in boolean
        # expressions
#        pk_names = ('eduach', 'work')
#        optimized_exprs = {}

        result = np.empty(context_length(context), dtype=int)
        result.fill(-1)

        local_ctx = dict(('__other_' + k if k in ['id'] + used_variables2 else k, v)
                         for k, v in set2.iteritems())

        if self.pool_size is None:
            #noinspection PyUnusedLocal
            def match_one_set1_individual(idx, sorted_idx):
                global local_ctx
    
                if not context_length(local_ctx):
                    raise StopIteration
    
                local_ctx.update((k, set1[k][sorted_idx]) for k in ['id'] + used_variables1)
    
    #            pk = tuple(individual1[fname] for fname in pk_names)
    #            optimized_expr = optimized_exprs.get(pk)
    #            if optimized_expr is None:
    #                for name in pk_names:
    #                    fake_set1['__f_%s' % name].value = individual1[name]
    #                optimized_expr = str(symbolic_expr.simplify())
    #                optimized_exprs[pk] = optimized_expr
    #            set2_scores = evaluate(optimized_expr, mm_dict, set2)
    
                set2_scores = expr_eval(score_expr, local_ctx)
    
                individual2_idx = np.argmax(set2_scores)
    
                id1 = local_ctx['id']
                id2 = local_ctx['__other_id'][individual2_idx]
    
                local_ctx = context_delete(local_ctx, individual2_idx)
    
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1            
            
            loop_wh_progress(match_one_set1_individual, set1tomatch)
        else:
            pool_size = self.pool_size
            #noinspection PyUnusedLocal
            def match_one_set1_individual_pool(idx, sorted_idx, pool_size):
                global local_ctx
                
                set2_size = context_length(local_ctx)
                if not set2_size:
                    raise StopIteration
                
                if set2_size > pool_size:
                    pool = random.sample(xrange(context_length(local_ctx)), pool_size)
                else:
                    pool = range(set2_size)

                sub_local_ctx = context_subset(local_ctx, pool, None)
                sub_local_ctx.update((k, set1[k][sorted_idx]) for k in ['id'] + used_variables1)
                
                set2_scores = expr_eval(score_expr, sub_local_ctx)
    
                individual2_pool_idx = np.argmax(set2_scores)
                individual2_idx = pool[individual2_pool_idx]
                
                id1 = sub_local_ctx['id']
                id2 = local_ctx['__other_id'][individual2_idx]
    
                local_ctx = context_delete(local_ctx, individual2_idx)
    
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1
                
            loop_wh_progress(match_one_set1_individual_pool, set1tomatch, pool_size=10)
            
        return result
Example #28
    def compute(self, context, *expressions, **kwargs):
        if not expressions:
            raise TypeError("groupby() takes at least 1 argument")

        # TODO: allow lists/tuples of arguments to group by the combinations
        # of keys
        for expr in expressions:
            if isinstance(expr, (bool, int, float)):
                raise TypeError("groupby() does not work with constant "
                                "arguments")
            if isinstance(expr, (tuple, list)):
                raise TypeError("groupby() takes expressions as arguments, "
                                "not a list of expressions")

        # On python 3, we could clean up this code (keyword only arguments).
        expr = kwargs.pop('expr', None)
        if expr is None:
            expr = Count()

#        by = kwargs.pop('by', None)
        filter_value = kwargs.pop('filter', None)
        percent = kwargs.pop('percent', False)
        possible_values = kwargs.pop('pvalues', None)
        totals = kwargs.pop('totals', True)

        expr_vars = [v.name for v in collect_variables(expr)]
        labels = [str(e) for e in expressions]
        columns = [expr_eval(e, context) for e in expressions]
        columns = [expand(c, context_length(context)) for c in columns]

        if filter_value is not None:
            filtered_columns = [col[filter_value] for col in columns]
            # FIXME: use the actual filter_expr instead of not_hashable
            filtered_context = context.subset(filter_value, expr_vars,
                                              not_hashable)
        else:
            filtered_columns = columns
            filtered_context = context

        if possible_values is None:
            possible_values = [np.unique(col) for col in filtered_columns]

        # We pre-filtered columns instead of passing the filter to partition_nd
        # because it is a bit faster this way. The indices are still correct,
        # because we use them on a filtered_context.
        groups = partition_nd(filtered_columns, True, possible_values)
        if not groups:
            return LabeledArray([], labels, possible_values)

        # evaluate the expression on each group
        # we use not_hashable to avoid storing the subset in the cache
        contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in groups]
        data = [expr_eval(expr, c) for c in contexts]

        # TODO: use group_indices_nd directly to avoid using np.unique
        # this is twice as fast (unique is very slow) but breaks because
        # the rest of the code assumes all combinations are present
#        if self.filter is not None:
#            filter_value = expr_eval(self.filter, context)
#        else:
#            filter_value = True
#
#        d = group_indices_nd(columns, filter_value)
#        pvalues = sorted(d.keys())
#        ndim = len(columns)
#        possible_values = [[pv[i] for pv in pvalues]
#                           for i in range(ndim)]
#        groups = [d[k] for k in pvalues]

        # groups is a (flat) list of list.
        # the first variable is the outer-most "loop",
        # the last one the inner most.

        # add total for each row
        len_pvalues = [len(vals) for vals in possible_values]

        if percent:
            totals = True

        if totals:
            width = len_pvalues[-1]
            height = prod(len_pvalues[:-1])
            rows_indices = [np.concatenate([groups[y * width + x]
                                            for x in range(width)])
                            for y in range(height)]
            cols_indices = [np.concatenate([groups[y * width + x]
                                            for y in range(height)])
                            for x in range(width)]
            cols_indices.append(np.concatenate(cols_indices))

            # evaluate the expression on each "combined" group (ie compute totals)
            row_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                        for indices in rows_indices]
            row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
            col_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                        for indices in cols_indices]
            col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]
        else:
            row_totals = None
            col_totals = None

        if percent:
            # convert to np.float64 to get +-inf if total_value is int(0)
            # instead of Python's built-in behaviour of raising an exception.
            # This can happen at least when using the default expr (count())
            # and the filter yields empty groups
            total_value = np.float64(col_totals[-1])
            data = [100.0 * value / total_value for value in data]
            row_totals = [100.0 * value / total_value for value in row_totals]
            col_totals = [100.0 * value / total_value for value in col_totals]

#        if self.by or self.percent:
#            if self.percent:
#                total_value = data[-1]
#                divisors = [total_value for _ in data]
#            else:
#                num_by = len(self.by)
#                inc = prod(len_pvalues[-num_by:])
#                num_groups = len(groups)
#                num_categories = prod(len_pvalues[:-num_by])
#
#                categories_groups_idx = [range(cat_idx, num_groups, inc)
#                                         for cat_idx in range(num_categories)]
#
#                divisors = ...
#
#            data = [100.0 * value / divisor
#                    for value, divisor in izip(data, divisors)]

        # convert to a 1d array. We don't simply use data = np.array(data),
        # because if data is a list of ndarray (for example if we use
        # groupby(a, expr=id), *and* all the ndarrays have the same length,
        # the result is a 2d array instead of an array of ndarrays like we
        # need (at this point).
        arr = np.empty(len(data), dtype=type(data[0]))
        arr[:] = data
        data = arr

        # and reshape it
        data = data.reshape(len_pvalues)
        return LabeledArray(data, labels, possible_values,
                            row_totals, col_totals)
Example #29
    def evaluate(self, context):
        source_entity = context['__entity__']
        if self.entity_name is None:
            target_entity = source_entity
        else:
            target_entity = entity_registry[self.entity_name]

        if target_entity is source_entity:
            target_context = context
        else:
            target_context = EntityContext(target_entity,
                                           {'period': context['period']})

        ctx_filter = context.get('__filter__')

        if self.filter is not None and ctx_filter is not None:
            filter_expr = ctx_filter & self.filter
        elif self.filter is not None:
            filter_expr = self.filter
        elif ctx_filter is not None:
            filter_expr = ctx_filter
        else:
            filter_expr = None

        if filter_expr is not None:
            to_give_birth = expr_eval(filter_expr, context)
            num_birth = to_give_birth.sum()
        elif self.number is not None:
            to_give_birth = None
            num_birth = self.number
        else:
            raise Exception('no filter nor number in "new"')

        array = target_entity.array

        id_to_rownum = target_entity.id_to_rownum
        num_individuals = len(id_to_rownum)

        children = self._initial_values(array, to_give_birth, num_birth)
        # select real duplication case
        if self.num_duplicate is not None:
            number_rep = array[self.num_duplicate].compress(
                array[self.num_duplicate] > 0)
            children = children.repeat(number_rep, axis=0)
            num_birth = number_rep.sum()

        if self.expand == True:
            from numpy.lib.stride_tricks import as_strided

            id_add = np.arange(number_rep.max())
            id_add = as_strided(id_add,
                                shape=number_rep.shape + id_add.shape,
                                strides=(0,) + id_add.strides)
            id_add = id_add[id_add < number_rep[:, None]]
            one_by_house = array['res'].compress(array[self.num_duplicate] > 0)
#            indices = np.unique(one_by_house)
#            size_by_id = np.bincount(one_by_house)
#            size_by_id = size_by_id.compress(size_by_id>0)
#            size_by_id = size_by_id.repeat(size_by_id)
            id_ini = one_by_house.repeat(number_rep, axis=0)
            decalage = np.zeros(len(one_by_house), dtype=int)
            indices = np.unique(one_by_house, return_index=True)[1]
            decalage[indices[1:]] = number_rep[indices]
            decalage = decalage.cumsum().repeat(number_rep, axis=0)
#            decalage = decalage - decalage[0]
            children['res'] = id_add + decalage + array['res'].max() + 1

        remember_id = children['id'].copy()
        
        if num_birth:
            children['id'] = np.arange(num_individuals,
                                       num_individuals + num_birth)
            children['period'] = context['period']

            used_variables = self._collect_kwargs_variables(context)
            if to_give_birth is None:
                child_context = new_context_like(context, length=num_birth)
            else:
                child_context = context_subset(context, to_give_birth,
                                               used_variables)
            for k, v in self.kwargs.iteritems():
                children[k] = expr_eval(v, child_context) 
                       
        if self.numerotation is not None:
            from numpy.lib.stride_tricks import as_strided
            initial = np.zeros(len(array), dtype=bool)
            id_dup = np.arange(number_rep.max())
            id_dup = as_strided(id_dup,
                                shape=number_rep.shape + id_dup.shape,
                                strides=(0,) + id_dup.strides)
            id_dup = id_dup[id_dup < number_rep[:, None]] + 1
            children[self.numerotation] = id_dup

        add_individuals(target_context, children)

        # result is the ids of the new individuals corresponding to the source
        # entity
        # I change here to have the "father" name instead
        if to_give_birth is not None:
            if self.return_option is None:
                result = np.empty(context_length(context), dtype=int)
                result.fill(-1)
                # TODO: must change something to have father size correct with
                # target and not with source.
                if source_entity is target_entity:               
                    extra_bools = np.zeros(num_birth, dtype=bool)
                    to_give_birth = np.concatenate((to_give_birth, extra_bools))
                    
                # Note that np.place is a tad faster, but is currently buggy when
                # working with columns of structured arrays.
                # See http://projects.scipy.org/numpy/ticket/1869
                result[to_give_birth] = children['id']

                return result
            elif self.return_option == 'father':
                father = np.empty(context_length(context), dtype=int)
                father.fill(-1)
                list_children = np.ones(num_birth, dtype=bool)
                initial = np.zeros(len(array), dtype=bool)
                birth = np.concatenate((initial, list_children))                              
                father[birth] = remember_id
                return father
        else:
            return None
Example #30
    def compute(self,
                context,
                entity_name=None,
                filter=None,
                number=None,
                **kwargs):
        if filter is not None and number is not None:
            # Having neither is allowed, though, as there can be a contextual
            # filter. Also, there is no reason to prevent the whole
            # population giving birth, even though the usefulness of such
            # usage seems dubious.
            raise ValueError("new() 'filter' and 'number' arguments are "
                             "mutually exclusive")
        source_entity = context.entity
        if entity_name is None:
            target_entity = source_entity
        else:
            target_entity = context.entities[entity_name]

        # target context is the context where the new individuals will be
        # created
        if target_entity is source_entity:
            target_context = context
        else:
            # we do need to copy the data (.extra) because we will insert into
            # the entity.array anyway => fresh_data=True
            target_context = context.clone(fresh_data=True,
                                           entity_name=target_entity.name)

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            to_give_birth = expr_eval(filter_expr, context)
            num_birth = to_give_birth.sum()
        elif number is not None:
            to_give_birth = None
            num_birth = number
        else:
            to_give_birth = np.ones(len(context), dtype=bool)
            num_birth = len(context)

        array = target_entity.array
        default_values = target_entity.fields.default_values

        id_to_rownum = target_entity.id_to_rownum
        num_individuals = len(id_to_rownum)

        children = self._initial_values(array, to_give_birth, num_birth,
                                        default_values)
        if num_birth:
            children['id'] = np.arange(num_individuals,
                                       num_individuals + num_birth)
            children['period'] = context.period

            used_variables = [
                v.name for v in self._collect_kwargs_variables(kwargs)
            ]
            if to_give_birth is None:
                assert not used_variables
                child_context = context.empty(num_birth)
            else:
                child_context = context.subset(to_give_birth, used_variables,
                                               filter_expr)
            for k, v in kwargs.iteritems():
                if k not in array.dtype.names:
                    print("WARNING: {} is unknown, ignoring it!".format(k))
                    continue
                children[k] = expr_eval(v, child_context)

        add_individuals(target_context, children)

        expr_cache.invalidate(context.period, context.entity_name)

        # result is the ids of the new individuals corresponding to the source
        # entity
        if to_give_birth is not None:
            result = np.full(context_length(context), -1, dtype=int)
            if source_entity is target_entity:
                extra_bools = np.zeros(num_birth, dtype=bool)
                to_give_birth = np.concatenate((to_give_birth, extra_bools))
            # Note that np.place is a bit faster, but is currently buggy when
            # working with columns of structured arrays.
            # See https://github.com/numpy/numpy/issues/2462
            result[to_give_birth] = children['id']
            return result
        else:
            return None
Example #31
    def evaluate(self, context):
        global local_ctx
        global cost

        ctx_filter = context.get('__filter__')

        id_to_rownum = context.id_to_rownum

        # at some point ctx_filter will be cached automatically, so we don't
        # need to take care of it manually here
        if ctx_filter is not None:
            set1filter = expr_eval(ctx_filter & self.set1filter, context)
            set2filter = expr_eval(ctx_filter & self.set2filter, context)
        else:
            set1filter = expr_eval(self.set1filter, context)
            set2filter = expr_eval(self.set2filter, context)

        score_expr = self.score_expr

        used_variables = score_expr.collect_variables(context)
        used_variables1 = ['id'] + [v for v in used_variables
                                    if not v.startswith('__other_')]
        used_variables2 = ['id'] + [v[8:] for v in used_variables
                                    if v.startswith('__other_')]

        set1 = context_subset(context, set1filter, used_variables1)
        set2 = context_subset(context, set2filter, used_variables2)
        orderby = expr_eval(self.orderby, context)
        sorted_set1_indices = orderby[set1filter].argsort()[::-1]
        print "matching with %d/%d individuals" % (set1filter.sum(),
                                                   set2filter.sum())

        #TODO: compute pk_names automatically: variables which are either
        # boolean, or have very few possible values and which are used more
        # than once in the expression and/or which are used in boolean
        # expressions
#        pk_names = ('eduach', 'work')
#        optimized_exprs = {}

        result = np.empty(context_length(context), dtype=int)
        result.fill(-1)

        local_ctx = dict(('__other_' + k if k in used_variables2 else k, v)
                         for k, v in set2.iteritems())
#        print local_ctx
#        test=local_ctx.copy()
#        test.update((k, set1[k]) for k in used_variables1)
#
#

        
######## Munkres attempt (optimal matching)
        
        if self.option == "optimal": 
            cost = []
            def create_cost(idx, sorted_idx):
    
                global cost
                if not context_length(local_ctx):
                    raise StopIteration
                local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)
    
                set2_scores = expr_eval(score_expr, local_ctx)
                cost.append(set2_scores[:].tolist())
                
            loop_wh_progress(create_cost, sorted_set1_indices)
            resultat = MunkresX.maxWeightMatching(cost)
            for id1, id2 in resultat.items():
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1
            return result

        else:
            def match_one_set1_individual(idx, sorted_idx):
                global local_ctx   
                if not context_length(local_ctx):
                    raise StopIteration    
                local_ctx.update((k, set1[k][sorted_idx]) for k in used_variables1)
                set2_scores = expr_eval(score_expr, local_ctx)
    #            print set2_scores
                individual2_idx = np.argmax(set2_scores)   
                id1 = local_ctx['id']
                id2 = local_ctx['__other_id'][individual2_idx]    
                local_ctx = context_delete(local_ctx, individual2_idx)
    
                result[id_to_rownum[id1]] = id2
                result[id_to_rownum[id2]] = id1
    
            loop_wh_progress(match_one_set1_individual, sorted_set1_indices)       
            return result
Example #32
 def build_context(self, context):
     context[self.u_varname] = \
         np.random.uniform(size=context_length(context))
     return context
Example #33
    def align_link(self, context, score, need, filter, take, leave,
                   expressions, possible_values, errors, frac_need, link,
                   secondary_axis, method):
        target_context = link._target_context(context)

        need, expressions, possible_values = \
            self._eval_need(context, need, expressions, possible_values,
                            target_context)

        # handle secondary axis
        if isinstance(secondary_axis, Expr):
            axis_name = str(secondary_axis)
            try:
                secondary_axis = need.dim_names.index(axis_name)
            except ValueError:
                raise ValueError("invalid value for secondary_axis: there is "
                                 "no axis named '%s' in the need array" %
                                 axis_name)
        else:
            if secondary_axis >= need.ndim:
                raise Exception("%d is an invalid value for secondary_axis: "
                                "it should be smaller than the number of "
                                "dimension of the need array (%d)" %
                                (secondary_axis, need.ndim))

        # evaluate columns
        target_columns = [expr_eval(e, target_context) for e in expressions]
        # this is a one2many, so the link column is on the target side
        link_column = target_context[link._link_field]

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            reverse_link = Many2One("reverse", link._link_field,
                                    context.entity.name)
            target_filter = LinkGet(reverse_link, filter_expr, False)
            target_filter_value = expr_eval(target_filter, target_context)

            # It is often not a good idea to pre-filter columns like this
            # because we loose information about "indices", but in this case,
            # it is fine, because we do not need that information afterwards.
            filtered_columns = [
                col[target_filter_value]
                if isinstance(col, np.ndarray) and col.shape else [col]
                for col in target_columns
            ]

            link_column = link_column[target_filter_value]
        else:
            filtered_columns = target_columns
            target_filter_value = None

        # compute labels for filtered columns
        # -----------------------------------
        # We can't use _group_labels_light because group_labels assigns labels
        # on a first come, first served basis, not using the order they are
        # in pvalues
        fcols_labels = []
        filtered_length = len(filtered_columns[0])
        unaligned = np.zeros(filtered_length, dtype=bool)
        for fcol, pvalues in zip(filtered_columns, need.pvalues):
            pvalues_index = dict((v, i) for i, v in enumerate(pvalues))
            fcol_labels = np.empty(filtered_length, dtype=np.int32)
            for i in range(filtered_length):
                value_idx = pvalues_index.get(fcol[i], -1)
                if value_idx == -1:
                    unaligned[i] = True
                fcol_labels[i] = value_idx
            fcols_labels.append(fcol_labels)

        num_unaligned = np.sum(unaligned)
        if num_unaligned:
            # further filter label columns and link_column
            validlabels = ~unaligned
            fcols_labels = [labels[validlabels] for labels in fcols_labels]
            link_column = link_column[validlabels]

            # display who are the evil ones
            ids = target_context['id']
            if target_filter_value is not None:
                filtered_ids = ids[target_filter_value]
            else:
                filtered_ids = ids
            self._display_unaligned(expressions, filtered_ids,
                                    filtered_columns, unaligned)
        else:
            del unaligned

        id_to_rownum = context.id_to_rownum
        missing_int = missing_values[int]
        source_ids = link_column

        if len(id_to_rownum):
            source_rows = id_to_rownum[source_ids]
            # filter out missing values: those where the value of the link
            # points to nowhere (-1)
            source_rows[source_ids == missing_int] = missing_int
        else:
            assert np.all(source_ids == missing_int)
            source_rows = []

        # filtered_columns are not filtered further on invalid labels
        # (num_unaligned) but this is not a problem since those will be
        # ignored by GroupBy anyway.
        # TODO: this is ugly because a groupby on "values" returns an LArray
        # with those values (ndarrays) as axes names. Ugh.
        groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

        # FIXME: target_context is not correct, as it is not filtered while
        # filtered_columns are. Since we do not use the context "columns" it
        # mostly works but I had to disable an assertion in utils.expand
        # because the length of the context is not correct.
        num_candidates = expr_eval(groupby_expr, target_context)

        # fetch the list of linked individuals for each local individual.
        # e.g. the list of person ids for each household
        hh = np.empty(context_length(context), dtype=object)
        # we can't use .fill([]) because it reuses the same list for all
        # objects
        for i in range(len(hh)):
            hh[i] = []

        # Even though this is highly sub-optimal, the time taken to create
        # those lists of ids is very small compared to the total time taken
        # for align_other (0.2s vs 4.26s), so I shouldn't care too much about
        # it for now.

        # target_row (row of person) is an index valid for *filtered/label*
        # columns !
        for target_row, source_row in enumerate(source_rows):
            if source_row == -1:
                continue
            hh[source_row].append(target_row)

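        # FakeContainer stands in for the real groups when computing the need
        # correction below: it only exposes each group's size via __len__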
        class FakeContainer(object):
            def __init__(self, length):
                self.length = length

            def __len__(self):
                return self.length

        groups = [FakeContainer(g) for g in num_candidates]
        need = need * self._get_need_correction(groups, possible_values)
        need = self._handle_frac_need(need, frac_need)
        need = self._add_past_error(context, need, errors)
        need = np.asarray(need)
        aligned, error = \
            align_link_nd(score, need, num_candidates, hh, fcols_labels,
                          secondary_axis)
        self.past_error = error
        return aligned
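
The block above maps each target row (e.g. a person) to its container row (e.g. a household) through the link column and id_to_rownum, then collects the per-container lists of candidate rows. Below is a minimal sketch of that indirection, using toy data and hypothetical names (person_hh_id, hh_ids):

    import numpy as np

    # toy data: each person row carries the id of its household (-1 = no link)
    person_hh_id = np.array([0, 0, 2, -1, 2])
    hh_ids = np.array([0, 2])                     # ids of the existing households

    # id -> rownum mapping, -1 for ids that do not exist
    id_to_rownum = np.full(hh_ids.max() + 1, -1)
    id_to_rownum[hh_ids] = np.arange(len(hh_ids))

    source_rows = id_to_rownum[person_hh_id]
    source_rows[person_hh_id == -1] = -1          # undo the -1 wrap-around

    # one distinct empty list per household; .fill([]) would share one list
    hh = np.empty(len(hh_ids), dtype=object)
    for i in range(len(hh)):
        hh[i] = []
    for person_row, hh_row in enumerate(source_rows):
        if hh_row != -1:
            hh[hh_row].append(person_row)

    # hh is now [[0, 1], [2, 4]]: persons 0 and 1 belong to household 0, etc.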
Example no. 34
    def compute(self, context, set1filter, set2filter, score, orderby,
                pool_size=None, algo='onebyone'):
        global matching_ctx

        if pool_size is not None:
            assert isinstance(pool_size, int)
            assert pool_size > 0

        set1filterexpr = self._getfilter(context, set1filter)
        set1filtervalue = expr_eval(set1filterexpr, context)
        set2filterexpr = self._getfilter(context, set2filter)
        set2filtervalue = expr_eval(set2filterexpr, context)
        set1len = set1filtervalue.sum()
        set2len = set2filtervalue.sum()
        print("matching with %d/%d individuals" % (set1len, set2len), end='')

        varnames = {v.name for v in score.collect_variables()}
        used_variables1 = {n for n in varnames if not n.startswith('__other_')}
        used_variables2 = {n[8:] for n in varnames if n.startswith('__other_')}

        if isinstance(orderby, str):
            assert orderby == 'EDtM'
            orderby_vars = used_variables1
        else:
            orderby_vars = {v.name for v in orderby.collect_variables()}

        if algo == 'onebyone':
            all_vars = {'id'} | used_variables1 | orderby_vars
            set1 = context.subset(set1filtervalue, all_vars, set1filterexpr)
            set2 = context.subset(set2filtervalue, {'id'} | used_variables2,
                                  set2filterexpr)

            # subset creates a dict for the current entity, so .entity_data is a
            # dict
            set1 = set1.entity_data
            set2 = set2.entity_data

            set1['__ids__'] = set1['id'].reshape(set1len, 1)
            set2['__ids__'] = set2['id'].reshape(set2len, 1)

            print()
        else:
            # optimized matching by grouping sets by values, which usually
            # means smaller sets and improved running time.
            assert algo == 'byvalue'

            # if orderby contains variables that are not used in the score
            # expression, this will effectively add variables in the
            # matching context AND group by those variables. This is correct
            # because otherwise (if we did not group by them), we could have
            # groups containing individuals with different values of the
            # ordering variables (ie the ordering would not be respected).
            set1 = group_context(used_variables1 | orderby_vars,
                                 set1filtervalue, context)
            set2 = group_context(used_variables2, set2filtervalue, context)

            # we cannot simply take the [:min(set1len, set2len)] indices like in
            # the non-optimized case and iterate over that because we don't know
            # how many groups we will need to match.
            print(" (%d/%d groups)"
                  % (context_length(set1), context_length(set2)))

        if isinstance(orderby, str):
            orderbyvalue = np.zeros(context_length(set1))
            for name in used_variables1:
                column = set1[name]
                orderbyvalue += (column - column.mean()) ** 2 / column.var()
        else:
            orderbyvalue = expr_eval(orderby, context.clone(entity_data=set1))

        # Delete variables which are not in the score expression (but in the
        # orderby expr or possibly "id") because they are no longer needed and
        # would slow things down.
        context_keep(set1, used_variables1)
        context_keep(set2, used_variables2)

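        # process set1 in decreasing order of the orderby value: with the
        # default 'EDtM' ordering, the most atypical (hardest to match)
        # individuals or groups come first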
        sorted_set1_indices = orderbyvalue.argsort()[::-1]

        result = np.full(context_length(context), -1, dtype=int)
        id_to_rownum = context.id_to_rownum

        # prefix all keys except __len__
        matching_ctx = {'__other_' + k if k != '__len__' else k: v
                        for k, v in set2.iteritems()}

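        # match_cell matches the set1 cell at sorted_idx with the best-scoring
        # remaining set2 cell (optionally drawn from a random pool of
        # candidates) and records the matched id pairs in result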
        def match_cell(idx, sorted_idx, pool_size):
            global matching_ctx

            set2_size = context_length(matching_ctx)
            if not set2_size:
                raise StopIteration

            if pool_size is not None and set2_size > pool_size:
                pool = random.sample(xrange(set2_size), pool_size)
                local_ctx = context_subset(matching_ctx, pool)
            else:
                local_ctx = matching_ctx.copy()

            local_ctx.update((k, set1[k][sorted_idx])
                             for k in {'__ids__'} | used_variables1)

            eval_ctx = context.clone(entity_data=local_ctx)
            set2_scores = expr_eval(score, eval_ctx)
            cell2_idx = set2_scores.argmax()

            cell1ids = local_ctx['__ids__']
            cell2ids = local_ctx['__other___ids__'][cell2_idx]

            if pool_size is not None and set2_size > pool_size:
                # transform pool-local index to set/matching_ctx index
                cell2_idx = pool[cell2_idx]

            cell1size = len(cell1ids)
            cell2size = len(cell2ids)
            nb_match = min(cell1size, cell2size)

            # we could introduce a random choice here but it is not really
            # necessary. If we did, it should be done in group_context
            ids1 = cell1ids[:nb_match]
            ids2 = cell2ids[:nb_match]

            result[id_to_rownum[ids1]] = ids2
            result[id_to_rownum[ids2]] = ids1
            
            if nb_match == cell2size:
                matching_ctx = context_delete(matching_ctx, cell2_idx)
            else:
                # other variables do not need to be modified since the cell
                # only got smaller and was not deleted
                matching_ctx['__other___ids__'][cell2_idx] = cell2ids[nb_match:]

            # FIXME: the expr gets cached for the full matching_ctx at the
            # beginning and then, when another individual with the same values
            # is found, it thinks it can reuse the expr but it breaks because
            # it does not have the correct length.

            # the current workaround is to invalidate the whole cache for the
            # current entity but this is not the right way to go.
            # * disable the cache for matching?
            # * use a local cache so that methods after matching() can use
            # what was in the cache before matching(). Shouldn't the cache be
            # stored inside the context anyway?
            expr_cache.invalidate(context.period, context.entity_name)

            if nb_match < cell1size:
                set1['__ids__'][sorted_idx] = cell1ids[nb_match:]
                match_cell(idx, sorted_idx, pool_size)
        loop_wh_progress(match_cell, sorted_set1_indices, pool_size)
        return result
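
When orderby is the string 'EDtM', the ordering value computed above is the sum, over the matching variables, of each individual's squared deviation from the mean divided by that variable's variance, so the most atypical individuals are matched first while many candidates are still available. A minimal sketch with toy data (hypothetical columns age and income):

    import numpy as np

    set1 = {'age': np.array([25., 40., 90.]),
            'income': np.array([1000., 1200., 5000.])}

    orderbyvalue = np.zeros(3)
    for column in set1.values():
        orderbyvalue += (column - column.mean()) ** 2 / column.var()

    # argsort()[::-1] puts the largest distance first: here order[0] == 2,
    # so the 90-year-old with the highest income is matched first
    order = orderbyvalue.argsort()[::-1]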
Example no. 35
    def align_link(self, context):
        scores = expr_eval(self.expr, context)

        need, expressions, possible_values = self._eval_need(context)
        need = self._handle_frac_need(need)
        need = self._add_past_error(need, context)

        # handle secondary axis
        secondary_axis = self.secondary_axis
        if isinstance(secondary_axis, Expr):
            axis_name = str(secondary_axis)
            try:
                secondary_axis = need.dim_names.index(axis_name)
            except ValueError:
                raise ValueError("invalid value for secondary_axis: there is "
                                 "no axis named '%s' in the need array"
                                 % axis_name)
        else:
            if secondary_axis >= need.ndim:
                raise Exception("%d is an invalid value for secondary_axis: "
                                "it should be smaller than the number of "
                                "dimension of the need array (%d)"
                                % (secondary_axis, need.ndim))

        # evaluate columns
        target_context = self.link._target_context(context)
        target_columns = [expr_eval(e, target_context) for e in expressions]
        # this is a one2many, so the link column is on the target side
        link_column = expr_eval(Variable(self.link._link_field),
                                target_context)

        filter_expr = self._getfilter(context)
        if filter_expr is not None:
            reverse_link = Many2One("reverse", self.link._link_field,
                                    context['__entity__'].name)
            target_filter = LinkValue(reverse_link, filter_expr, False)
            target_filter_value = expr_eval(target_filter, target_context)

            # It is often not a good idea to pre-filter columns like this
            # because we lose information about "indices", but in this case,
            # it is fine, because we do not need that information afterwards.
            filtered_columns = [
                col[target_filter_value]
                if isinstance(col, np.ndarray) and col.shape else [col]
                for col in target_columns
            ]

            link_column = link_column[target_filter_value]
        else:
            filtered_columns = target_columns
            target_filter_value = None

        # compute labels for filtered columns
        # -----------------------------------
        # We can't use _group_labels_light because group_labels assigns labels
        # on a first come, first served basis, not in the order they appear
        # in pvalues
        fcols_labels = []
        filtered_length = len(filtered_columns[0])
        unaligned = np.zeros(filtered_length, dtype=bool)
        for fcol, pvalues in zip(filtered_columns, need.pvalues):
            pvalues_index = dict((v, i) for i, v in enumerate(pvalues))
            fcol_labels = np.empty(filtered_length, dtype=np.int32)
            for i in range(filtered_length):
                value_idx = pvalues_index.get(fcol[i], -1)
                if value_idx == -1:
                    unaligned[i] = True
                fcol_labels[i] = value_idx
            fcols_labels.append(fcol_labels)

        num_unaligned = np.sum(unaligned)
        if num_unaligned:
            # further filter label columns and link_column
            validlabels = ~unaligned
            fcols_labels = [labels[validlabels] for labels in fcols_labels]
            link_column = link_column[validlabels]

            # display who are the evil ones
            ids = target_context['id']
            if target_filter_value is not None:
                filtered_ids = ids[target_filter_value]
            else:
                filtered_ids = ids
            self._display_unaligned(expressions, filtered_ids,
                                    filtered_columns, unaligned)
        else:
            del unaligned

        id_to_rownum = context.id_to_rownum
        missing_int = missing_values[int]
        source_ids = link_column

        if len(id_to_rownum):
            source_rows = id_to_rownum[source_ids]
            # filter out missing values: those where the value of the link
            # points to nowhere (-1)
            source_rows[source_ids == missing_int] = missing_int
        else:
            assert np.all(source_ids == missing_int)
            source_rows = []

        # filtered_columns are not filtered further on invalid labels
        # (num_unaligned) but this is not a problem since those will be
        # ignored by GroupBy anyway.
        groupby_expr = GroupBy(*filtered_columns, pvalues=possible_values)

        # target_context is not technically correct, as it is not "filtered"
        # while filtered_columns are, but since we don't use the context
        # "columns", it does not matter.
        num_candidates = expr_eval(groupby_expr, target_context)

        # fetch the list of linked individuals for each local individual.
        # e.g. the list of person ids for each household
        hh = np.empty(context_length(context), dtype=object)
        # we can't use .fill([]) because it reuses the same list for all
        # objects
        for i in range(len(hh)):
            hh[i] = []

        # Even though this is highly sub-optimal, the time taken to create
        # those lists of ids is very small compared to the total time taken
        # for align_other (0.2s vs 4.26s), so I shouldn't care too much about
        # it for now.

        # target_row (row of person) is an index valid for *filtered/label*
        # columns !
        for target_row, source_row in enumerate(source_rows):
            if source_row == -1:
                continue
            hh[source_row].append(target_row)

        aligned, error = \
            align_link_nd(scores, need, num_candidates, hh, fcols_labels,
                          secondary_axis)
        self.past_error = error
        return aligned
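
The label computation above turns each evaluated column into integer indices into the corresponding axis of the need array: a value's label is its position in pvalues, and values absent from pvalues are flagged as unaligned (label -1) and filtered out afterwards. A minimal sketch with toy values:

    import numpy as np

    pvalues = [True, False]                  # e.g. one axis of the need array
    fcol = np.array([False, True, None, True], dtype=object)

    pvalues_index = {v: i for i, v in enumerate(pvalues)}
    fcol_labels = np.empty(len(fcol), dtype=np.int32)
    unaligned = np.zeros(len(fcol), dtype=bool)
    for i, value in enumerate(fcol):
        idx = pvalues_index.get(value, -1)
        if idx == -1:
            unaligned[i] = True
        fcol_labels[i] = idx

    # fcol_labels is [1, 0, -1, 0] and unaligned is [False, False, True, False]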
Example no. 36
    def compute(self, context, *expressions, **kwargs):
        if not expressions:
            raise TypeError("groupby() takes at least 1 argument")

        # TODO: allow lists/tuples of arguments to group by the combinations
        # of keys
        for expr in expressions:
            if isinstance(expr, (bool, int, float)):
                raise TypeError("groupby() does not work with constant "
                                "arguments")
            if isinstance(expr, (tuple, list)):
                raise TypeError("groupby() takes expressions as arguments, "
                                "not a list of expressions")

        # On python 3, we could clean up this code (keyword only arguments).
        expr = kwargs.pop('expr', None)
        if expr is None:
            expr = Count()

#        by = kwargs.pop('by', None)
        filter_value = kwargs.pop('filter', None)
        percent = kwargs.pop('percent', False)
        possible_values = kwargs.pop('pvalues', None)

        expr_vars = [v.name for v in collect_variables(expr)]
        labels = [str(e) for e in expressions]
        columns = [expr_eval(e, context) for e in expressions]
        columns = [expand(c, context_length(context)) for c in columns]

        if filter_value is not None:
            filtered_columns = [col[filter_value] for col in columns]
            # FIXME: use the actual filter_expr instead of not_hashable
            filtered_context = context.subset(filter_value, expr_vars,
                                              not_hashable)
        else:
            filtered_columns = columns
            filtered_context = context

        if possible_values is None:
            possible_values = [np.unique(col) for col in filtered_columns]

        # We pre-filtered columns instead of passing the filter to partition_nd
        # because it is a bit faster this way. The indices are still correct,
        # because we use them on a filtered_context.
        groups = partition_nd(filtered_columns, True, possible_values)
        if not groups:
            return LabeledArray([], labels, possible_values)

        # evaluate the expression on each group
        # we use not_hashable to avoid storing the subset in the cache
        contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in groups]
        data = [expr_eval(expr, c) for c in contexts]

        # TODO: use group_indices_nd directly to avoid using np.unique
        # this is twice as fast (unique is very slow) but breaks because
        # the rest of the code assumes all combinations are present
#        if self.filter is not None:
#            filter_value = expr_eval(self.filter, context)
#        else:
#            filter_value = True
#
#        d = group_indices_nd(columns, filter_value)
#        pvalues = sorted(d.keys())
#        ndim = len(columns)
#        possible_values = [[pv[i] for pv in pvalues]
#                           for i in range(ndim)]
#        groups = [d[k] for k in pvalues]

        # groups is a (flat) list of lists: the first variable is the
        # outer-most "loop", the last one the inner-most.

        # add totals for each row and column
        len_pvalues = [len(vals) for vals in possible_values]
        width = len_pvalues[-1]
        height = prod(len_pvalues[:-1])

        rows_indices = [np.concatenate([groups[y * width + x]
                                        for x in range(width)])
                        for y in range(height)]
        cols_indices = [np.concatenate([groups[y * width + x]
                                        for y in range(height)])
                        for x in range(width)]
        cols_indices.append(np.concatenate(cols_indices))

        # evaluate the expression on each "combined" group (ie compute totals)
        row_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in rows_indices]
        row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
        col_ctxs = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in cols_indices]
        col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]

        if percent:
            # convert to np.float64 to get +-inf if total_value is int(0)
            # instead of Python's built-in behaviour of raising an exception.
            # This can happen at least when using the default expr (count())
            # and the filter yields empty groups
            total_value = np.float64(col_totals[-1])
            data = [100.0 * value / total_value for value in data]
            row_totals = [100.0 * value / total_value for value in row_totals]
            col_totals = [100.0 * value / total_value for value in col_totals]

#        if self.by or self.percent:
#            if self.percent:
#                total_value = data[-1]
#                divisors = [total_value for _ in data]
#            else:
#                num_by = len(self.by)
#                inc = prod(len_pvalues[-num_by:])
#                num_groups = len(groups)
#                num_categories = prod(len_pvalues[:-num_by])
#
#                categories_groups_idx = [range(cat_idx, num_groups, inc)
#                                         for cat_idx in range(num_categories)]
#
#                divisors = ...
#
#            data = [100.0 * value / divisor
#                    for value, divisor in izip(data, divisors)]

        # convert to a 1d array. We don't simply use data = np.array(data),
        # because if data is a list of ndarrays (for example if we use
        # groupby(a, expr=id)) *and* all the ndarrays have the same length,
        # the result is a 2d array instead of an array of ndarrays like we
        # need (at this point).
        arr = np.empty(len(data), dtype=type(data[0]))
        arr[:] = data
        data = arr

        # and reshape it
        data = data.reshape(len_pvalues)
        return LabeledArray(data, labels, possible_values,
                            row_totals, col_totals)
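
The row and column totals above are not derived from the per-cell results: the flat, row-major list of group indices is re-combined along each row and each column (plus a grand total) and the expression is re-evaluated on those combined subsets. A minimal sketch of the index bookkeeping for a 2 x 2 groupby, with made-up group indices:

    import numpy as np

    # flat, row-major group indices of a 2 x 2 groupby (height=2, width=2)
    groups = [np.array([0]), np.array([1, 2]),      # row 0: cells (0,0), (0,1)
              np.array([3]), np.array([4, 5, 6])]   # row 1: cells (1,0), (1,1)
    height, width = 2, 2

    rows_indices = [np.concatenate([groups[y * width + x] for x in range(width)])
                    for y in range(height)]
    cols_indices = [np.concatenate([groups[y * width + x] for y in range(height)])
                    for x in range(width)]
    cols_indices.append(np.concatenate(cols_indices))   # grand total

    # with the default Count() expression the totals are simply the lengths:
    # rows -> [3, 4], columns -> [2, 5], grand total -> 7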
Example no. 37
    def compute(self, context, entity_name=None, filter=None, number=None,
                **kwargs):
        if filter is not None and number is not None:
            # Having neither is allowed, though, as there can be a contextual
            # filter. Also, there is no reason to prevent the whole
            # population from giving birth, even though the usefulness of
            # such usage seems dubious.
            raise ValueError("new() 'filter' and 'number' arguments are "
                             "mutually exclusive")
        source_entity = context.entity
        if entity_name is None:
            target_entity = source_entity
        else:
            target_entity = context.entities[entity_name]

        # target context is the context where the new individuals will be
        # created
        if target_entity is source_entity:
            target_context = context
        else:
            # we do need to copy the data (.extra) because we will insert into
            # the entity.array anyway => fresh_data=True
            target_context = context.clone(fresh_data=True,
                                           entity_name=target_entity.name)

        filter_expr = self._getfilter(context, filter)
        if filter_expr is not None:
            to_give_birth = expr_eval(filter_expr, context)
            num_birth = to_give_birth.sum()
        elif number is not None:
            to_give_birth = None
            num_birth = number
        else:
            to_give_birth = np.ones(len(context), dtype=bool)
            num_birth = len(context)

        array = target_entity.array

        id_to_rownum = target_entity.id_to_rownum
        num_individuals = len(id_to_rownum)

        children = self._initial_values(array, to_give_birth, num_birth)
        if num_birth:
            children['id'] = np.arange(num_individuals,
                                       num_individuals + num_birth)
            children['period'] = context.period

            used_variables = [v.name for v in
                              self._collect_kwargs_variables(kwargs)]
            if to_give_birth is None:
                assert not used_variables
                child_context = context.empty(num_birth)
            else:
                child_context = context.subset(to_give_birth, used_variables,
                                               filter_expr)
            for k, v in kwargs.iteritems():
                children[k] = expr_eval(v, child_context)

        add_individuals(target_context, children)

        expr_cache.invalidate(context.period, context.entity_name)

        # result is the ids of the new individuals corresponding to the source
        # entity
        if to_give_birth is not None:
            result = np.empty(context_length(context), dtype=int)
            result.fill(-1)
            if source_entity is target_entity:
                extra_bools = np.zeros(num_birth, dtype=bool)
                to_give_birth = np.concatenate((to_give_birth, extra_bools))
            # Note that np.place is a bit faster, but is currently buggy when
            # working with columns of structured arrays.
            # See https://github.com/numpy/numpy/issues/2462
            result[to_give_birth] = children['id']
            return result
        else:
            return None
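
The final mapping above returns, for each row of the source entity, the id of the child it produced (or -1). When parent and child belong to the same entity, the filter is padded with False for the rows that were just appended, so the boolean indexing still lines up with the grown result array. A minimal sketch with toy data:

    import numpy as np

    to_give_birth = np.array([False, True, False, True])   # 2 births
    child_ids = np.array([10, 11])                          # ids given to the children

    # result covers the original rows plus the freshly appended child rows
    result = np.full(len(to_give_birth) + len(child_ids), -1, dtype=int)
    extra_bools = np.zeros(len(child_ids), dtype=bool)
    to_give_birth = np.concatenate((to_give_birth, extra_bools))
    result[to_give_birth] = child_ids

    # result is [-1, 10, -1, 11, -1, -1]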