Example #1
 def copyParentSyn(self, parent, keep_debug_info, zerosyn=False):
     """
     Set node SYN (optimized) histograms to those of the parent. Used when it's the only child.
     Also, when parent is empty, set the unrounded histogram
     :param parent:
     :param keep_debug_info:
     :param zerosyn:
     :return:
     """
     if not zerosyn:
         self.syn = parent.syn
         self.unit_syn = parent.unit_syn
     else:
         # Set to zeros if parent is zero
         self.syn = multiSparse(np.zeros(parent.syn.shape, dtype=int))
         self.unit_syn = multiSparse(
             np.zeros(parent.unit_syn.shape, dtype=int))
     if not keep_debug_info:
         # dp_queries (noisy measurements) take a lot of space / memory
         if self.dp_queries:
             self.dp_queries.clear()
         if self.unit_dp_queries:
             self.unit_dp_queries.clear()
     else:
         # If we want to keep dp_queries (noisy measurements) and unrounded results
         self.syn_unrounded = parent.syn_unrounded
     return self
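
# A minimal standalone sketch (an assumption, not from this codebase) of the
# zerosyn branch above: building an all-zero sparse histogram that matches the
# shape of an existing one, with scipy.sparse standing in for multiSparse.
import numpy as np
import scipy.sparse as ss

parent_syn = ss.csr_matrix(np.array([[1, 0, 2], [0, 3, 0]]))
zero_syn = ss.csr_matrix(parent_syn.shape, dtype=int)  # no stored entries
assert zero_syn.shape == parent_syn.shape
assert zero_syn.count_nonzero() == 0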
Example #2
def getToyGeounitData_GeounitNode(schema,
                                  geocodes=[
                                      '000', '001', '002', '003', '010', '011',
                                      '012', '020', '022'
                                  ],
                                  geocode_dict={
                                      3: 'block',
                                      2: 'county'
                                  },
                                  raw_params={
                                      'low': 0,
                                      'high': 2
                                  },
                                  syn_params={
                                      'low': 0,
                                      'high': 5
                                  }):
    geounits = []
    for geocode in du.aslist(geocodes):
        # Guard against unbound locals when a params dict is omitted
        raw, syn = None, None
        if raw_params is not None:
            raw = np.random.randint(low=raw_params['low'],
                                    high=raw_params['high'],
                                    size=schema.size).reshape(schema.shape)
        if syn_params is not None:
            syn = np.random.randint(low=syn_params['low'],
                                    high=syn_params['high'],
                                    size=schema.size).reshape(schema.shape)
        geounits.append(
            GeounitNode(geocode=geocode,
                        geocode_dict=geocode_dict,
                        raw=multiSparse(raw) if raw is not None else None,
                        syn=multiSparse(syn) if syn is not None else None))
    return geounits
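
# Hedged usage sketch: ToySchema is a hypothetical stand-in for the schema
# object assumed above (anything exposing .size and .shape works), showing the
# flat-draw-then-reshape pattern used for the raw/syn histograms.
from collections import namedtuple
import numpy as np

ToySchema = namedtuple("ToySchema", ["shape", "size"])
schema = ToySchema(shape=(2, 3), size=6)
raw = np.random.randint(low=0, high=2, size=schema.size).reshape(schema.shape)
assert raw.shape == schema.shape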
Example #3
def testdata_random_geounit_generator(geocode, schema, density=0.01, scale=10):
    raw_mat = np.round(
        ss.random(1, schema.size, format='csr', density=density) * scale)
    syn_mat = np.round(
        ss.random(1, schema.size, format='csr', density=density) * scale)
    raw_sparse = sp.multiSparse(raw_mat, schema.shape)
    syn_sparse = sp.multiSparse(syn_mat, schema.shape)
    return {'geocode': geocode, 'raw': raw_sparse, 'syn': syn_sparse}
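
# Standalone illustration (an assumption about intent): ss.random draws values
# uniformly in [0, 1), so scaling by 10 and rounding the dense view yields
# small non-negative integer counts on roughly density * size cells.
import numpy as np
import scipy.sparse as ss

mat = ss.random(1, 1000, format='csr', density=0.01, random_state=0)
counts = np.round(mat.toarray() * 10)
assert counts.shape == (1, 1000)
assert (counts >= 0).all() and np.count_nonzero(counts) <= 10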
def test_equal_float(datafloat):
    assert sparse.multiSparse(datafloat) == sparse.multiSparse(datafloat + 1e-6)
    shape = (6, 4, 5)
    size = np.prod(shape)
    a = np.arange(1, size + 1).reshape(shape).astype(float)  # np.float alias was removed from NumPy
    a[0, 0, 0] = np.nan
    b = a + 1e-6
    assert sparse.multiSparse(a) == sparse.multiSparse(b)
Example #5
def n():
    geocode_dict = {16: 'Block', 12: 'Block_Group', 11: 'Tract', 5: 'County'}
    histogram = sparse.multiSparse(
        np.array([[[[5, 0], [0, 4]], [[5, 0], [0, 4]]],
                  [[[5, 0], [0, 4]], [[5, 0], [0, 4]]]]))
    housing_hist = sparse.multiSparse(np.array([0, 1, 1, 0, 0, 0, 7, 2]))

    return nodes.GeounitNode(geocode='123456789abcdefg',
                             geocode_dict=geocode_dict,
                             raw=histogram,
                             raw_housing=housing_hist)
Example #6
def test_init():
    good_array = np.array([[0, 1, 2], [2, 0, 4]])
    bad_obj = {"a": "bad", "dict": "obj"}
    spar = multiSparse(good_array)
    assert spar.shape == (2, 3)
    assert isinstance(spar.sparse_array, ss.csr_matrix)
    assert spar.sparse_array.count_nonzero() == 4

    try:
        bad_spar = multiSparse(bad_obj)
        assert False
    except TypeError:
        assert True
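
# The try/except-assert pattern above predates pytest.raises; a sketch of the
# more idiomatic equivalent, assuming pytest is available to this test suite:
import pytest

def test_init_bad_obj():
    with pytest.raises(TypeError):
        multiSparse({"a": "bad", "dict": "obj"})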
    def makeBlockNode(self, person_unit_arrays):
        """
            This function makes block nodes from person unit arrays for a given geocode.

            Inputs:
                person_unit_arrays: an element of an RDD of (geocode, arrays), where arrays are the tables defined in the config

            Output:
                block_node: a nodes.GeounitNode object for the given geocode
        """

        geocode, arrays = person_unit_arrays

        # Assign arrays to table names in a dictionary {name:array} and fill in with zeros if array is non-existent
        assert len(arrays) == len(self.data_names)
        data_dict = {
            n: a if a is not None else np.zeros(self.shape_dict[n]).astype(
                int
            )  # TODO: Wonder if this creation of zeros takes too much time, maybe directly in multisparse?
            for n, a in zip(self.data_names, arrays)
        }

        # geocode is a tuple where the [1] entry is empty. We only want the [0] entry.
        geocode = geocode[0]
        logging.info(f"creating geocode: {geocode}")

        raw = sparse.multiSparse(
            data_dict[self.privacy_table_name],
            shape=self.shape_dict[self.privacy_table_name])
        raw_housing = sparse.multiSparse(
            data_dict[self.constraint_table_name],
            shape=self.shape_dict[self.constraint_table_name])

        # Make Invariants
        invariants_dict = self.setup.makeInvariants(
            raw=raw, raw_housing=raw_housing, invariant_names=self.invar_names)

        # Make Constraints
        constraints_dict = self.setup.makeConstraints(
            hist_shape=(self.setup.hist_shape, self.setup.unit_hist_shape),
            invariants=invariants_dict,
            constraint_names=self.cons_names)

        block_node = nodes.GeounitNode(geocode=geocode,
                                       geocode_dict=self.modified_geocode_dict,
                                       raw=raw,
                                       raw_housing=raw_housing,
                                       cons=constraints_dict,
                                       invar=invariants_dict)
        return block_node
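
# Standalone sketch (an assumption) of the zero-fill pattern in data_dict
# above: pair table names with arrays, substituting a zero histogram when a
# table is missing, so downstream code can rely on every name being present.
import numpy as np

data_names = ["persons", "units"]
shape_dict = {"persons": (2, 3), "units": (4,)}
arrays = [np.ones((2, 3), dtype=int), None]  # the "units" table is absent
data_dict = {n: a if a is not None else np.zeros(shape_dict[n], dtype=int)
             for n, a in zip(data_names, arrays)}
assert data_dict["units"].sum() == 0 and data_dict["persons"].sum() == 6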
Example #8
def test_toDense():
    good_array = np.array([[[0, 1], [2, 0], [0, 3]],
                           [[4, 0], [0, 5], [6, 0]]])
    spar = multiSparse(good_array)
    assert spar.shape == (2, 3, 2)
    undo = spar.toDense()
    assert (undo == good_array).all()
def data(spark):

    geocodeDict = {16: 'Block', 12: 'Block_Group', 11: 'Tract', 5: 'County'}
    bn1 = geounitNode(geocode='4400700010111000',
                      raw=multiSparse(np.array([[1, 2], [3, 4]])),
                      syn=multiSparse(np.array([[1, 1], [0, 7]])),
                      geocodeDict=geocodeDict)

    bn2 = geounitNode(geocode='4400700010111001',
                      raw=multiSparse(np.array([[3, 4], [2, 1]])),
                      syn=multiSparse(np.array([[2, 2], [1, 0]])),
                      geocodeDict=geocodeDict)

    sc = spark.sparkContext

    return sc.parallelize([bn1, bn2])
Example #10
def minSchematize(node, array_dims, add_over_margins):
    minSchemaQuery = cenquery.Query(array_dims=array_dims,
                                    subset=None,
                                    add_over_margins=add_over_margins)

    node.raw = sparse.multiSparse(
        minSchemaQuery.answer_original(node.raw.toDense()))
    minSchema_shape = node.raw.shape
    dims_keep = sorted(
        set(range(len(array_dims))).difference(set(add_over_margins)))

    constraint_keys = node.cons.keys()
    for key in constraint_keys:
        node.cons[key].query.array_dims = minSchema_shape
        node.cons[key].query.add_over_margins = tuple(sorted(
            set(dims_keep).intersection(
                set(node.cons[key].query.add_over_margins))))
        node.cons[key].query.subset_input = [
            node.cons[key].query.subset_input[x] for x in dims_keep
        ]
        node.cons[key].query.subset = np.ix_(
            *tuple(node.cons[key].query.subset_input))
        #axis_groupings = () ?? currently no axis groupings in constraints
        print(node.cons[key].query)
        node.cons[key].check_after_update()

    return node
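
# Standalone illustration (an assumption about the mechanics) of the np.ix_
# subsetting rebuilt for each constraint above: np.ix_ turns per-axis index
# lists into an open mesh that selects the cross-product of those indices.
import numpy as np

a = np.arange(24).reshape(2, 3, 4)
subset_input = [[0], [0, 2], [1, 3]]
sub = a[np.ix_(*subset_input)]
assert sub.shape == (1, 2, 2)
assert sub[0, 1, 1] == a[0, 2, 3]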
Example #11
def test_add2():
    good_array_A = np.array([[0, 1], [2, 3]])
    good_array_B = np.array([[3, 2], [1, 0]])
    bad_array_C = np.array([[2, 4, 4]])

    spar_A = multiSparse(good_array_A)
    spar_B = multiSparse(good_array_B)
    spar_C = multiSparse(bad_array_C)

    spar = spar_A + spar_B
    assert spar.shape == (2, 2)
    assert spar.sparse_array.count_nonzero() == 4
    assert (spar.toDense() != spar_A.toDense()).any()
    assert (spar.toDense() == np.array([[3, 3], [3, 3]])).all()

    try:
        bad_spar = spar_A + spar_C  # mismatched shapes should raise
        raised = False
    except AssertionError:
        raised = True
    assert raised
Example #12
 def Data(self):
     """
         Data in the shape of histograms for 1 Block. Histogram shape is (2,) (e.g., Male, Female).
     """
     b1 = multiSparse(np.array([1, 2]))
     block_nodes = [
         GeounitNode('b1',
                     raw=b1,
                     raw_housing=b1,
                     invar={},
                     cons={},
                     geocode_dict={2: "Block"}),
     ]
     return block_nodes
Example #13
def agg_func(config, parent_child_node):
    """
    This function takes a set of parent and child nodes, aggregates the children syn histograms and replaces the parent.syn with the aggregation.
    
    Inputs: 
        config: the config object
        parent_child_node: a list of a parent and it's children nodes
    
    Outputs:
        parent: the parent node
    """
    parent_child_node = list(parent_child_node)
    parent_geocode = parent_child_node[0]
    # a list of the node objects
    nodes = list(parent_child_node[1])

    # calculate the length of each of the geocodes (to determine which is the parent)
    geocode_lens = [len(node.geocode) for node in nodes]
    # the parent is the shortest geocode
    parent_index = np.argmin(geocode_lens)
    parent = nodes[parent_index]

    # subset the children nodes
    children = nodes[:parent_index] + nodes[parent_index + 1:]
    children = sorted(children,
                      key=lambda geocode_data: int(geocode_data.geocode))
    child_geos = [child.geocode for child in children]

    parent.backup_solve = children[0].parent_backup_solve
    syn_agg = sparse.multiSparse(np.zeros(parent.syn.shape))

    for child in children:
        syn_agg = syn_agg + child.syn
    parent.syn = syn_agg

    return parent
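
# Standalone sketch (an assumption) of the aggregation loop above: summing the
# children's sparse histograms entrywise reproduces the sum of their dense
# counterparts, without ever densifying.
import numpy as np
import scipy.sparse as ss

child_syns = [ss.csr_matrix(np.array([[1, 0], [0, 2]])),
              ss.csr_matrix(np.array([[0, 3], [4, 0]]))]
syn_agg = ss.csr_matrix((2, 2), dtype=int)
for child_syn in child_syns:
    syn_agg = syn_agg + child_syn
assert (syn_agg.toarray() == np.array([[1, 3], [4, 2]])).all()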
    def sample_histogram(node: GeounitNode, sample_target: int):
        """
        :param node: The input GeounitNode which will receive a new sampled histogram 
        :param sample_target: The size of the target sample population
        :return: The input node with its syn attribute set to the sampled histogram
        """
        assert all([
            node.raw is not None,
            isinstance(node.raw, multiSparse),
            node.raw.sparse_array is not None,
            node.raw.sparse_array.data is not None,
        ])

        # Record the dense histogram shape so the new multiSparse can be
        # rebuilt with it after resampling
        data_shape = node.raw.shape

        # Get the shape and indices of populated values in the sparse matrix to be able
        # to recreate a new one
        csr_shape = node.raw.sparse_array.shape
        indices = node.raw.sparse_array.indices
        indptr = node.raw.sparse_array.indptr

        # Get the probability vector
        pval = BootstrapEngine.compute_pval(node)

        # Sample from a multinomial of the pval
        sampled_data = numpy.random.multinomial(sample_target, pval)

        # Produce the new CSR matrix and histogram
        new_matrix = ss.csr_matrix((sampled_data, indices, indptr),
                                   shape=csr_shape)
        new_histogram: __HistData__ = multiSparse(new_matrix, shape=data_shape)

        # Set the node's syn attribute
        node.syn = new_histogram
        return node
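
# Standalone sketch (an assumption: BootstrapEngine.compute_pval is stood in
# for by plain normalization) of the resampling above: turn the stored CSR
# data into a probability vector, draw a multinomial of the target size, and
# rebuild a CSR matrix on the same sparsity pattern.
import numpy as np
import scipy.sparse as ss

raw = ss.csr_matrix(np.array([[0, 3, 0, 1, 0, 6]]))
pval = raw.data / raw.data.sum()            # probabilities on the nonzeros
sampled = np.random.multinomial(100, pval)  # resample to 100 "people"
resampled = ss.csr_matrix((sampled, raw.indices, raw.indptr), shape=raw.shape)
assert resampled.sum() == 100 and resampled.shape == raw.shape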
Example #15
 def mfUrData(self, setup_instance):
     """
     Data in the shape of histograms for 3 Rural Blocks in 1 Rural county and 3 Urban blocks in 1 Urban county, all in 1 state.
     Histogram is shape (2,) for sex, i.e. each block provides the number of males and the number of females.
     This is the same test example as in the JavaScript simulator.
     """
     rb1 = multiSparse(np.array([1, 2]))
     rb2 = multiSparse(np.array([3, 4]))
     rb3 = multiSparse(np.array([5, 6]))
     ub1 = multiSparse(np.array([101, 102]))
     ub2 = multiSparse(np.array([103, 104]))
     ub3 = multiSparse(np.array([105, 106]))
     block_nodes = []
     for block, geocode in zip(
         [rb1, rb2, rb3, ub1, ub2, ub3],
         ['1RB1', '1RB2', '1RB3', '1UB1', '1UB2', '1UB3']):
         invariants = setup_instance.makeInvariants(
             raw=block,
             raw_housing=block,
             invariant_names=setup_instance.inv_con_by_level['Block']
             ['invar_names'])
         constraints = setup_instance.makeConstraints(
             hist_shape=(2, ),
             invariants=invariants,
             constraint_names=setup_instance.inv_con_by_level['Block']
             ['cons_names'])
         block_nodes.append(
             GeounitNode(geocode,
                         raw=block,
                         raw_housing=block,
                         invar=invariants,
                         cons=constraints,
                         geocode_dict={
                             4: "Block",
                             3: "County",
                             1: "State"
                         }))
     return block_nodes
Example #16
    def test_makeAdditionalInvariantsConstraints(self, block_cons, state_cons,
                                                 county_cons):
        class TestSetup(DASDecennialSetup):
            def __init__(self):
                self.hist_shape = (2, )
                self.hist_vars = ("sex", )
                self.validate_input_data_constraints = False
                self.inv_con_by_level = {
                    'Block': {
                        'invar_names': ('tot', ) if block_cons else (),
                        'cons_names': ('total', ) if block_cons else (),
                    },
                    'County': {
                        'invar_names': ('tot', ) if county_cons else (),
                        'cons_names': ('total', ) if county_cons else (),
                    },
                    'State': {
                        'invar_names': ('tot', ) if state_cons else (),
                        'cons_names': ('total', ) if state_cons else ()
                    }
                }

            @staticmethod
            def makeInvariants(raw, raw_housing, invariant_names):
                inv_dict = {}
                if 'tot' in invariant_names:
                    inv_dict.update({'tot': np.sum(raw.toDense())})
                return inv_dict

            @staticmethod
            def makeConstraints(hist_shape, invariants, constraint_names):
                cons_dict = {}
                if 'total' in constraint_names:
                    cons_dict.update({
                        'total':
                            Constraint(
                                MultiHistQuery(
                                    (QueryFactory.makeTabularGroupQuery(
                                        (2, ), add_over_margins=(0, )),
                                     StubQuery((2, 1), "stub")),
                                    (1, 0)),
                                np.array(invariants['tot']), "=", "total")
                    })
                return cons_dict

        setup_instance = TestSetup()
        rb1 = sparse.multiSparse(np.array([1, 2]))
        rb2 = sparse.multiSparse(np.array([3, 4]))
        rb3 = sparse.multiSparse(np.array([5, 6]))
        ub1 = sparse.multiSparse(np.array([101, 102]))
        ub2 = sparse.multiSparse(np.array([103, 104]))
        ub3 = sparse.multiSparse(np.array([105, 106]))

        block_nodes = []
        for block, geocode in zip(
            [rb1, rb2, rb3, ub1, ub2, ub3],
            ['1RB1', '1RB2', '1RB3', '1UB1', '1UB2', '1UB3']):
            invariants = setup_instance.makeInvariants(
                raw=block,
                raw_housing=block,
                invariant_names=setup_instance.inv_con_by_level['Block']
                ['invar_names'])
            constraints = setup_instance.makeConstraints(
                hist_shape=(2, ),
                invariants=invariants,
                constraint_names=setup_instance.inv_con_by_level['Block']
                ['cons_names'])
            block_nodes.append(
                GeounitNode(geocode,
                            raw=block,
                            raw_housing=block,
                            invar=invariants,
                            cons=constraints,
                            geocode_dict={
                                4: "Block",
                                3: "County",
                                1: "State"
                            }))

        rc = block_nodes[0].addInReduce(block_nodes[1]).addInReduce(
            block_nodes[2]).shiftGeocodesUp()
        rc.makeAdditionalInvariantsConstraints(setup_instance)
        uc = block_nodes[3].addInReduce(block_nodes[4]).addInReduce(
            block_nodes[5]).shiftGeocodesUp()
        uc.makeAdditionalInvariantsConstraints(setup_instance)
        state = rc.addInReduce(uc).shiftGeocodesUp()
        state.makeAdditionalInvariantsConstraints(setup_instance)

        assert state.checkConstraints()
        assert rc.checkConstraints()
        assert uc.checkConstraints()
def geoimp_wrapper_nat(*,
                       config,
                       parent_shape,
                       nat_node: GeounitNode,
                       min_schema=None):
    """
    This function performs the Post-Processing Step of National to National level.
    It is called from engine_utils.py:topdown in a Spark map operation

    Inputs:
        config: configuration object
        nat_node: a GeounitNode object referring to entire nation

    Output:
        nat_node: a GeounitNode object referring to entire nation
    """

    # Make sure that the logger is set up on all of the nodes
    clogging.setup(level=logging.INFO,
                   syslog=True,
                   syslog_address=(das_utils.getMasterIp(), C.SYSLOG_UDP))
    parent_hist = None

    noisy_child = np.expand_dims(
        asDense(nat_node.dp.DPanswer),
        axis=len(nat_node.dp.DPanswer.shape)) if nat_node.dp else None
    noisy_child_weight = 1. / nat_node.dp.Var if nat_node.dp else None
    parent_geocode = "nat_to_nat"

    # TODO: Maybe filtering out the detailed query from node.dp_queries can be done more neatly
    dp_queries_comb = stackNodeProperties([
        nat_node,
    ], lambda node: node.dp_queries, cons_dpq.StackedDPquery,
                                          lambda name: name != C.DETAILED)
    query_weights = map(
        lambda sdpq: 1. / sdpq.Var, dp_queries_comb
    )  # We can get actual variance for each query if we want
    constraints_comb = stackNodeProperties([
        nat_node,
    ], lambda node: node.cons, cons_dpq.StackedConstraint)

    # Create an L2PlusRounderWithBackup object
    seq_opt = sequential_optimizers.L2PlusRounderWithBackup(
        das=None,
        parent=parent_hist,
        parent_shape=parent_shape,
        NoisyChild=noisy_child,
        childGeoLen=1,
        config=config,
        DPqueries=dp_queries_comb,
        constraints=constraints_comb,
        NoisyChild_weight=noisy_child_weight,
        query_weights=query_weights,
        identifier="nat_to_nat",
        min_schema=min_schema,
        stat_node=nat_node)

    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # get rid of extra dimension
    int_answer = int_answer.squeeze()
    l2_answer = l2_answer.squeeze()

    nat_node.syn = int_answer
    constraintsCheck(nat_node, parent_geocode)

    nat_node.syn = sparse.multiSparse(int_answer)
    nat_node.syn_unrounded = sparse.multiSparse(l2_answer)
    return nat_node
Example #18
 def conform2PL94(node: GeounitNode):
     DP_counts = node.getDenseSyn()
     PL94_counts = node.invar['pl94counts']
     # Cap each synthetic cell at the corresponding PL94 count
     node.syn = multiSparse(
         np.where(DP_counts > PL94_counts, PL94_counts, DP_counts))
     return node
def test_square(dataint, datafloat):
    assert sparse.multiSparse(dataint).square() == sparse.multiSparse(np.square(dataint))
    assert sparse.multiSparse(datafloat).square() == sparse.multiSparse(np.square(datafloat))
def test_sum(dataint, datafloat):
    assert sparse.multiSparse(dataint).sum() == np.sum(dataint)
    assert np.isclose(sparse.multiSparse(datafloat).sum(), np.sum(datafloat))
    assert np.array_equal(sparse.multiSparse(dataint).sum(dims=(1, 2)), dataint.sum((1, 2)))
    assert np.isclose(sparse.multiSparse(datafloat).sum(dims=(1, 2)), datafloat.sum((1, 2))).all()
Example #21
def makeInputsAndRunOptimizer(children,
                              config,
                              min_schema,
                              parent_hist,
                              parent_shape,
                              parent_geocode,
                              optimizers,
                              keep_debug_info=False,
                              aian=False):
    """
    Converts the data from nodes to the inputs taken by optimizer: multiarrays, StackedConstraints, StackedDPQueries etc.,
    creates the optimizer, runs it, and puts the optimized answers back into the nodes

    This is called from:
         * geoimp_wrapper_root().
         * geoimp_wrapper()
    :param optimizers:
    :param children: iterable (list or multiarray) of children noisy histograms (i.e. detailed query measurements, aka noisy counts)
    :param config: DAS config file
    :param min_schema: backup feasibility schema (reduced schema through which constraints can be expressed)
    :param parent_hist: optimized histogram of the parent node
    :param parent_shape: shape of the parent histogram (children have the same shape too)
    :param parent_geocode: parent geocode
    :param keep_debug_info: if True, keep DPqueries after optimization (they take a lot of space) and include the unrounded optimized data in the node; if False, clear them
    :return: list of optimized children nodes and accumulator count of backup feasibility triggers
    """

    if config.getboolean(section=CC.ENGINE,
                         option="reset_dpq_weights",
                         fallback=False):
        variances = []
        for child in children:
            variances.extend(child.getAllVariances())
        min_var = min(variances)
        children = [
            child.setDPQVar(func=lambda v: v / min_var) for child in children
        ]

    # # This is to make sure that the total constraint is not accidentally left on for AIAN and non-AIAN, but really should be taken care of in config
    # # Have to set up the total US population as invariant, and turn off State
    # if aian:
    #     for child in children:
    #         child.removeConstraintByName('total')

    child_groups = makeChildGroups(children) if aian else None

    # Get the stacked detailed dp_queries (if we've taken detailed measurements), as well as their weights. If only one child, just expand.
    noisy_child = np.stack(
        [child.stackDetailedDPAnswers(parent_shape) for child in children],
        axis=-1) if children[0].dp else None
    noisy_child_weights = [child.detailedWeight() for child in children]
    constraints_comb = stackNodeProperties(children, lambda node: node.cons,
                                           cons_dpq.StackedConstraint)
    dp_queries_comb = []
    # A loop over histograms. Each iteration goes over children (stackNodeProperties does that) and gets the dp_queries dict
    # corresponding to that histogram and stacks them
    for i in range(len(parent_shape)):
        dp_queries_comb.append(
            stackNodeProperties(children,
                                lambda node: node.querySets2Stack()[i],
                                cons_dpq.StackedDPquery))
    # TODO: Note that multipass rounder queries only support the main histogram currently (hence no loop below).
    #  May be necessary for full-scale DHCH to expand this to support the full histogram
    rounder_queries_comb = [
        stackNodeProperties(children, lambda node: node.rounder_queries,
                            cons_dpq.StackedQuery)
    ]

    opt_dict = {
        "Cons":
        stackNodeProperties(children, lambda node: node.opt_dict["Cons"],
                            cons_dpq.StackedConstraint),
        "npass_info":
        children[0].opt_dict["npass_info"],
    } if children[0].opt_dict is not None else None

    sequential_optimizers_dict = {
        CC.L2_PLUS_ROUNDER_WITH_BACKUP:
        sequential_optimizers.L2PlusRounderWithBackup,
        CC.L2_PLUS_ROUNDER_WITH_BACKUP_INTERLEAVED:
        sequential_optimizers.L2PlusRounderWithBackup_interleaved,
    }

    seq_opt_name, l2_opt, rounder = optimizers
    seq_opt_cls = sequential_optimizers_dict[seq_opt_name]

    try:
        l2c2o = children[0].query_ordering[CC.L2_CONSTRAIN_TO_QUERY_ORDERING]
    except KeyError:
        l2c2o = None

    # Create an appropriate sequential optimizer object
    seq_opt = seq_opt_cls(
        identifier=parent_geocode,
        child_geolevel=children[0].geolevel,
        parent=parent_hist,
        parent_shape=parent_shape,
        childGeoLen=len(children),
        constraints=constraints_comb,
        NoisyChild=noisy_child,
        noisy_child_weights=noisy_child_weights,
        DPqueries=dp_queries_comb,
        rounder_queries=rounder_queries_comb,
        min_schema=(min_schema, False),
        child_groups=child_groups,
        opt_dict=opt_dict,
        L2_DPqueryOrdering=children[0].query_ordering[CC.L2_QUERY_ORDERING],
        L2_Constrain_to_Ordering=l2c2o,
        Rounder_DPqueryOrdering=children[0].query_ordering[
            CC.ROUNDER_QUERY_ORDERING],
        optimizers=(l2_opt, rounder),
        das=None,
        config=config)

    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # Slice off the combined child solution to make lists of ndarrays, with one element for each child
    int_answer_list = np_utils.sliceArray(int_answer[0])
    unit_int_answer_list = np_utils.sliceArray(int_answer[1])
    l2_answer_list = np_utils.sliceArray(l2_answer[0])

    for i, child in enumerate(children):
        child.syn = int_answer_list[i]
        child.unit_syn = unit_int_answer_list[i]
    constraintsCheck(children)

    # Convert to sparse arrays for efficiency
    for i, child in enumerate(children):
        child.syn = sparse.multiSparse(int_answer_list[i])
        child.unit_syn = sparse.multiSparse(unit_int_answer_list[i])
        if keep_debug_info:
            child.syn_unrounded = sparse.multiSparse(l2_answer_list[i])
        else:
            child.dp_queries.clear()
    return children, backup_solve_status
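
# Standalone sketch (an assumption) of the stack-then-slice round trip above:
# children are stacked along a trailing geography axis for the optimizer and
# sliced back apart afterwards (the role np_utils.sliceArray plays here).
import numpy as np

child_hists = [np.arange(6).reshape(2, 3), np.arange(6, 12).reshape(2, 3)]
stacked = np.stack(child_hists, axis=-1)  # shape (2, 3, n_children)
unstacked = [stacked[..., i] for i in range(stacked.shape[-1])]
assert all((a == b).all() for a, b in zip(child_hists, unstacked))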
Example #22
def geoimp_wrapper_nat(config, nat_node):
    """
    This function performs the Post-Processing Step of National to National level.

    Inputs:
        config: configuration object
        nat_node: a geounitNode object referring to entire nation

    Output:
        nat_node: a geounitNode object referring to entire nation
    """
    import programs.engine.geoimpgbopt as geoimpgbopt
    parent_hist = None
    NoisyChild = np.expand_dims(nat_node.dp.DPanswer,
                                axis=len(nat_node.dp.DPanswer.shape))
    NoisyChild_weight = 1 / nat_node.dp.Var
    child_geos = nat_node.geocode
    parent_geocode = "nat_to_nat"
    # what if DPqueries is empty {}?
    DPqueries = nat_node.dp_queries.values()

    if not any(nat_node.cons):
        constraints = None
    else:
        constraints = nat_node.cons.values()

    query_weights = []

    # need to add a dimension for geography to the object
    for x in DPqueries:
        x.query.array_dims = NoisyChild.shape
        x.query.subset_input = tuple(list(x.query.subset_input) + [[0]])
        x.query.subset = np.ix_(*x.query.subset_input)
        x.DPanswer = np.expand_dims(x.DPanswer, axis=len(x.DPanswer.shape))
        x.check_after_update()
        weight = 1 / x.Var
        query_weights.append(weight)

    # if no DPqueries, change this to an empty list
    if not any(DPqueries):
        DPqueries = []
        query_weights = None

    # need to add a dimension for geography to the object
    if constraints is not None:
        for x in constraints:
            x.query.array_dims = NoisyChild.shape
            x.query.subset_input = tuple(list(x.query.subset_input) + [[0]])
            x.query.subset = np.ix_(*x.query.subset_input)
            x.rhs = np.expand_dims(x.rhs, axis=len(x.rhs.shape))
            x.check_after_update()

    #this is the actual post-processing optimization step
    l2_answer, int_answer, backup_solve_status = geoimpgbopt.L2geoimp_wrapper(
        config=config,
        parent=parent_hist,
        NoisyChild=NoisyChild,
        NoisyChild_weight=NoisyChild_weight,
        DPqueries=DPqueries,
        query_weights=query_weights,
        constraints=constraints,
        identifier="nat_to_nat")

    if constraints is not None:
        check = True
        for x in constraints:
            check = bool(np.prod(x.check(int_answer)) * check)
        print("constraints are ", check, "for parent geocode ", parent_geocode)

    #get rid of extra dimension
    nat_node.syn = sparse.multiSparse(int_answer.squeeze())
    nat_node.syn_unrounded = sparse.multiSparse(l2_answer.squeeze())

    return nat_node
Example #23
    def make_block_node(self, person_unit_arrays):
        """
            This function makes block nodes from person unit arrays for a given geocode.

            args:
                person_unit_arrays - a key, value pair of (geocode, arrays),
                                    where arrays are the histograms defined in the config

            returns: block_node - a nodes.geounitNode object for the given geocode
        """
        geocode, arrays = person_unit_arrays
        arrays = list(arrays)
        gqhhvacs = arrays[1].astype(int)
        arrays[1] = arrays[1][:-1]

        # Assign arrays to table names in a dictionary and fill in with zeros if array is non-existent
        assert len(arrays) == len(self.data_names)
        data_dict = {
            n: a.astype(int) if a is not None
            else np.zeros(self.person_hist_dimensions).astype(int)
            for n, a in zip(self.data_names, arrays)
        }

        # geocode is a tuple where the [1] entry is empty. We only want the [0] entry.
        geocode = geocode[0]
        logging.info("creating geocode: %s" % geocode)

        housing_table_name = self.housing_table_name
        privacy_table_name = self.privacy_table_name

        raw = sparse.multiSparse(data_dict[privacy_table_name].astype(int))
        raw_housing = sparse.multiSparse(data_dict[housing_table_name].astype(int))
        levels = tuple(self.config["geodict"]["geolevel_names"].split(","))

        invar_names = tuple(self.config[CONSTRAINTS][THE_INVARIANTS+"."+levels[0]].split(","))
        if invar_names == ("",):
            invariants_dict = {}
        else:
            invariants_dict = self.InvariantsCreator(raw=raw, raw_housing=raw_housing, invariant_names=invar_names).calculateInvariants().invariants_dict
        invariants_dict["gqhhvacs_vect"] = gqhhvacs #not used for constraints, but must be passed through. don't need to add hhvacs to node signature anymore this way.

        cons_names = tuple(self.config[CONSTRAINTS][THE_CONSTRAINTS+"."+levels[0]].split(","))
        

        # Make Constraints
        if cons_names == ("",):
            constraints_dict = {}
        else:
            constraints_dict = self.ConstraintsCreator(
                hist_shape=data_dict[self.privacy_table_name].shape,
                invariants=invariants_dict,
                constraint_names=cons_names).calculateConstraints().constraints_dict

        block_node = nodes.geounitNode(geocode=geocode, geocodeDict=self.geocodeDict, raw=raw, raw_housing=raw_housing,
                                       cons=constraints_dict, invar=invariants_dict)
        return block_node
def make_block_node(config, person_unit_arrays, dim):
    """
        This function makes block nodes from person unit arrays for a given geocode.

        Inputs:
            config: a configuration object
            person_unit_arrays: a RDD of (geocode, arrays), where arrays are the tables defined in the config

        Output:
            block_node: a nodes.geounitNode object for the given geocode
    """

    #import invariants_module
    (file,
     invariants_class_name) = config[CONSTRAINTS][INVARIANTS].rsplit(".", 1)
    invariants_module = __import__(file, fromlist=[invariants_class_name])

    #import constraints_module
    (file, class_name) = config[CONSTRAINTS][CONSTRAINTS].rsplit(".", 1)
    constraints_module = __import__(file, fromlist=[class_name])

    # Get the names of tables in person_unit_arrays.
    data_names = [config[READER][PTABLE]] + config[READER][CTABLES].split(",")
    geocode, arrays = person_unit_arrays

    data_dict = {}
    for i in range(len(arrays)):
        data_dict[data_names[i]] = arrays[i].astype(
            int) if arrays[i] is not None else np.zeros(dim).astype(int)

    # geocode is a tuple where the [1] entry is empty. We only want the [0] entry.
    geocode = geocode[0]
    logging.info("creating geocode: %s" % geocode)

    # Make Invariants
    invar_names = tuple(config[CONSTRAINTS][THEINVARIANTS].split(","))

    invariants_dict = getattr(invariants_module, invariants_class_name)(
        data_dict=data_dict,
        invariant_names=invar_names).calculateInvariants().invariants_dict

    # Make Constraints
    privacy_table_name = config[READER][PTABLE]
    cons_names = tuple(config[CONSTRAINTS]["theConstraints"].split(","))
    hist_shape = data_dict[privacy_table_name].shape
    constraints_dict = getattr(constraints_module, class_name)(
        hist_shape=hist_shape,
        invariants=invariants_dict,
        constraint_names=cons_names).calculateConstraints().constraints_dict

    raw = data_dict[privacy_table_name].astype(int)

    block_node = nodes.geounitNode(geocode=geocode,
                                   config=config,
                                   raw=sparse.multiSparse(raw),
                                   cons=constraints_dict,
                                   invar=invariants_dict)

    return block_node
def test_abs(dataint, datafloat):
    assert sparse.multiSparse(dataint).abs() == sparse.multiSparse(np.abs(dataint))
    assert sparse.multiSparse(datafloat).abs() == sparse.multiSparse(np.abs(datafloat))
def geoimp_wrapper(*, config, parent_child_node, accum, min_schema=None):
    """
    This function performs the Post-Processing Step for a generic parent to the Child geography.
    It is called from topdown_engine.py:topdown in a Spark map operation. 
    It runs on the CORE and TASK nodes, not on the MASTER.
    So there is no das object!
    
    Inputs:
        config: configuration object
        parent_child_node: a (k,v) RDD with key being a geocode and
            value being a tuple of GeounitNode objects containing one parent and multiple children
        accum: spark accumulator object which tracks the number of solves that use the backup solve

    Output:
        children: a list of Node objects for each of the children, after post-processing
    """

    # Make sure that the logger is set up on all the nodes
    clogging.setup(level=logging.INFO,
                   syslog=True,
                   syslog_address=(das_utils.getMasterIp(), C.SYSLOG_UDP))
    parent: GeounitNode
    children: List[GeounitNode]
    parent, children = findParentChildNodes(parent_child_node)

    n_children = len(children)

    #######
    # under certain circumstances we can skip the Gurobi optimization
    #######
    #
    # Only 1 child

    if n_children == 1:
        children[0].syn = parent.syn
        return children

    if parent.syn.sum() == 0:
        for child in children:
            child.syn = sparse.multiSparse(np.zeros(parent.syn.shape))
        return children

    #########
    # resume code for Gurobi optimization
    ########
    # stack the dp arrays on top of one another; if only 1 child just expand the axis

    if parent.dp:
        if n_children > 1:
            noisy_child = np.stack(
                [asDense(child.dp.DPanswer) for child in children], axis=-1)
        else:
            noisy_child = np.expand_dims(asDense(children[0].dp.DPanswer),
                                         axis=len(
                                             children[0].dp.DPanswer.shape))
    else:
        noisy_child = None

    noisy_child_weight = 1. / children[0].dp.Var if parent.dp else None

    # TODO: Maybe filtering out the detailed query from node.dp_queries can be done more neatly
    dp_queries_comb = stackNodeProperties(children,
                                          lambda node: node.dp_queries,
                                          cons_dpq.StackedDPquery,
                                          lambda name: name != C.DETAILED)
    query_weights = map(
        lambda sdpq: 1. / sdpq.Var, dp_queries_comb
    )  # We can get actual variance for each query if we want
    constraints_comb = stackNodeProperties(children, lambda node: node.cons,
                                           cons_dpq.StackedConstraint)
    parent_hist = parent.getDenseSyn()
    parent_geocode = parent.geocode

    seq_opt = sequential_optimizers.L2PlusRounderWithBackup(
        das=None,
        config=config,
        parent=parent_hist,
        parent_shape=parent_hist.shape,
        NoisyChild=noisy_child,
        childGeoLen=n_children,
        DPqueries=dp_queries_comb,
        constraints=constraints_comb,
        NoisyChild_weight=noisy_child_weight,
        query_weights=query_weights,
        identifier=parent_geocode,
        min_schema=min_schema,
        stat_node=children[0])

    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # slice off the combined child solution to make separate arrays for each child
    int_answer_list = np_utils.sliceArray(int_answer)
    l2_answer_list = np_utils.sliceArray(l2_answer)

    # check constraints
    for i, child in enumerate(children):
        child.syn = int_answer_list[i]
        constraintsCheck(child)

    # make sparse arrays
    for i, child in enumerate(children):
        child.syn = sparse.multiSparse(int_answer_list[i])
        child.syn_unrounded = sparse.multiSparse(l2_answer_list[i])

    if backup_solve_status:
        accum += 1

    return children
Example #27
def test_to_list_from_sparse():
    spar_obj = multiSparse(np.array([[2, 3, 0], [0, 0, 1]]))
    assert to_list_from_sparse(spar_obj) == [((0, 0), 2), ((0, 1), 3),
                                             ((1, 2), 1)]
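
# A minimal sketch (an assumption, not the source implementation) of what
# to_list_from_sparse evidently returns: ((multi-index), value) pairs for the
# nonzero cells in row-major order, here computed from the dense view.
import numpy as np

def to_list_from_sparse_sketch(dense):
    return [(idx, int(dense[idx])) for idx in zip(*np.nonzero(dense))]

assert to_list_from_sparse_sketch(np.array([[2, 3, 0], [0, 0, 1]])) == \
    [((0, 0), 2), ((0, 1), 3), ((1, 2), 1)]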
Example #28
def geoimp_wrapper(config, parent_child_node, accum):
    """
    This function performs the Post-Processing Step for a generic parent to the Child geography.
    
    Inputs:
        config: configuration object
        parent_child_node: a collection of geounitNode objects containing one parent and multiple child
        accum: spark accumulator object

    Output:
        children: a collection of geounitNode objects for each of the children, after post-processing
    """

    import programs.engine.geoimpgbopt as geoimpgbopt
    from itertools import compress

    parent_child_node = list(parent_child_node)
    parent_geocode = parent_child_node[0]
    print("parent geocode is", parent_geocode)
    # a list of the node objects
    nodes = list(parent_child_node[1])

    # calculate the length of each of the geocodes (to determine which is the parent)
    geocode_lens = [len(node.geocode) for node in nodes]
    # the parent is the shortest geocode
    parent_index = np.argmin(geocode_lens)
    parent = nodes[parent_index]

    # subset the children nodes
    children = nodes[:parent_index] + nodes[parent_index + 1:]
    children = sorted(children,
                      key=lambda geocode_data: int(geocode_data.geocode))
    child_geos = [child.geocode for child in children]
    n_children = len(child_geos)

    # stack the dp arrays on top of one another; if only 1 child just expand the axis
    if n_children > 1:
        NoisyChild = np.stack([child.dp.DPanswer for child in children],
                              axis=-1)
    else:
        NoisyChild = np.expand_dims(children[0].dp.DPanswer,
                                    axis=len(children[0].dp.DPanswer.shape))

    # combine DPqueries without geography into combined DPqueries with geography
    # if no DPqueries, change this to an empty list
    if not any(children[0].dp_queries):
        DPqueries_comb = []
    else:
        DPqueries = list(list(child.dp_queries.values()) for child in children)
        n_q = len(DPqueries[0])
        DPqueries_comb = []
        for i in range(n_q):
            subset_input = tuple(
                list(DPqueries[0][i].query.subset_input) +
                [range(NoisyChild.shape[-1])])
            query = cenquery.Query(
                array_dims=NoisyChild.shape,
                subset=subset_input,
                add_over_margins=DPqueries[0][i].query.add_over_margins)
            q_answer = np.stack([DPquery[i].DPanswer for DPquery in DPqueries],
                                axis=-1)
            DP_query = cenquery.DPquery(query=query, DPanswer=q_answer)
            DPqueries_comb.append(DP_query)

    # delete redundant union constraints
    # which gq cat are non-zero

    # combine cenquery.Constraint objects without geography to build a combined cenquery.Constraint
    constraints_comb = []
    # children may have different constraints; only combine the ones that match
    if not any(children[0].cons):
        constraints_comb = None
    else:
        all_keys = []
        for child in children:
            all_keys.extend(list(child.cons.keys()))
        # subset to unique names
        constraint_keys = tuple(set(all_keys))

        # children is a list of nodes
        for key in constraint_keys:
            # make a list of individual constraints for all children who have them
            # find which children have the key
            ind = [key in child.cons.keys() for child in children]
            # children_sub is the subset of children with that key
            children_sub = list(compress(children, ind))
            constraints = list(child.cons[key] for child in children_sub)

            # get the list of geos that have this constraint
            subset_geos = list(compress(range(NoisyChild.shape[-1]), ind))
            subset_input = tuple(
                list(constraints[0].query.subset_input) + [
                    subset_geos,
                ])
            query = cenquery.Query(
                array_dims=NoisyChild.shape,
                subset=subset_input,
                add_over_margins=constraints[0].query.add_over_margins)
            rhs = np.stack([con.rhs for con in constraints], axis=-1)
            constraint = cenquery.Constraint(query=query,
                                             rhs=rhs,
                                             sign=constraints[0].sign,
                                             name=constraints[0].name)
            constraints_comb.append(constraint)

    parent_hist = parent.syn.toDense()
    parent_geocode = parent.geocode
    parent_constraints = parent.cons  # for checking purposes

    #this is the actual post-processing optimization step
    l2_answer, int_answer, backup_solve_status = geoimpgbopt.L2geoimp_wrapper(
        config=config,
        parent=parent_hist,
        NoisyChild=NoisyChild,
        DPqueries=DPqueries_comb,
        constraints=constraints_comb,
        identifier=parent_geocode,
        parent_constraints=parent_constraints)

    #check constraints
    if constraints_comb is not None:
        check = True
        for x in constraints_comb:
            check = bool(np.prod(x.check(int_answer)) * check)
        print("constraints are ", check, "for parent geocode ", parent_geocode)

    temps = []
    for i in range(len(child_geos)):
        temp = int_answer[tuple(
            [
                slice(0, int_answer.shape[x])
                for x in range(len(int_answer.shape) - 1)
            ] + [slice(i, i + 1)]
        )]  #this is really ugly - feel free to improve, trying to subset to each geography
        temp = temp.squeeze()  #gets rid of dimensions of size 1
        temps.append(temp)

    #do this for unrounded too
    temps2 = []
    for i in range(len(child_geos)):
        temp2 = l2_answer[tuple(
            [
                slice(0, l2_answer.shape[x])
                for x in range(len(l2_answer.shape) - 1)
            ] + [slice(i, i + 1)]
        )]  #this is really ugly - feel free to improve, trying to subset to each geography
        temp2 = temp2.squeeze()  #gets rid of dimensions of size 1
        temps2.append(temp2)

    for i, geocode in enumerate(child_geos):
        children[i].syn = sparse.multiSparse(temps[i])
        children[i].syn_unrounded = sparse.multiSparse(temps2[i])

    if backup_solve_status:
        accum += 1

    return children
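
# Per the "feel free to improve" comments above, a standalone sketch (an
# assumption): when no non-geography axis has size 1, indexing the trailing
# geography axis directly replaces the slice()-tuple-plus-squeeze dance.
import numpy as np

int_answer = np.arange(24).reshape(2, 3, 4)  # last axis = geography
temps = [int_answer[..., i] for i in range(int_answer.shape[-1])]
assert temps[1].shape == (2, 3)
assert (temps[1] == int_answer[:, :, 1]).all()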
Example #29
def sparse_data(data):
    """ set up sparse data for testing """
    return sparse.multiSparse(data)
def test_sub(dataint, datafloat):
    assert sparse.multiSparse(dataint) - sparse.multiSparse(dataint) == sparse.multiSparse(dataint*0)
    assert sparse.multiSparse(datafloat) - sparse.multiSparse(datafloat) == sparse.multiSparse(datafloat * 0)