Example #1
    def optimizeWorkload(self):
        """
        Gathers the workload and optimizes to find the strategy
        """

        # Load previously calculated strategy if requested
        if self.getboolean(LOAD_HDMM_STRATEGY, section=CC.HDMM, default=False):
            saved_strategy_path = self.getconfig(LOAD_HDMM_STRATEGY_PATH,
                                                 section=CC.HDMM,
                                                 default='')
            if not saved_strategy_path:
                try:
                    saved_strategy_path = self.das.writer.output_path
                except (NoOptionError, NoSectionError) as err:
                    raise DASConfigError(
                        f"Load HDMM strategy is requested but neither [{CC.HDMM}]/{LOAD_HDMM_STRATEGY_PATH}, nor [{err.section}]/{err.option} are provided to find the saved strategy",
                        err.option, err.section)
            load_path = os.path.join(saved_strategy_path, SAVED_STRATEGY_NAME)
            self.log_and_print(f"Loading HDMM strategy from {load_path}...")
            try:
                strategy = das_utils.loadPickleFile(load_path)
                self.queries_dict["hdmm_strategy"] = self.strategy2query(
                    strategy)
                return
            except (FileNotFoundError, EOFError):
                self.log_and_print(
                    f"File with saved HDMM strategy {load_path} not found, re-calculating strategy...",
                    cui=False)

        # Calculate the strategy
        W = self.dict2workload(self.workload_queries_dict)
        if self.strategy_type == CC.PIDENTITY_STRATEGY:
            strategy = self.findPIdentityStrategy()
        elif self.strategy_type == CC.MARGINAL_STRATEGY:
            strategy = self.findMarginalStrategy()
        else:
            raise DASValueError(
                f"Strategy type '{self.strategy_type}' not implemented in das_decennial HDMM engine",
                self.strategy_type)

        # Run the optimization; the strategy object is updated in place
        strategy.optimize(W)

        # Save strategy if requested
        if self.getboolean(SAVE_HDMM_STRATEGY, section=CC.HDMM):
            try:
                output_path = self.das.writer.output_path
                save_path = os.path.join(output_path, SAVED_STRATEGY_NAME)
            except (NoOptionError, NoSectionError) as err:
                raise DASConfigError(
                    f"Save HDMM strategy is requested but saving path [{err.section}]/{err.option} is not provided",
                    err.option, err.section)
            das_utils.savePickleFile(save_path, strategy)

        self.queries_dict["hdmm_strategy"] = self.strategy2query(strategy)
Example #2
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

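        # HDMM answers its strategy queries with the basic DP primitive; no preset list of DP queries is used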
        self.mechanism = primitives.basic_dp_answer
        self.dp_queries = False
        try:
            self.strategy_type: str = self.getconfig(CC.HDMM_STRATEGY_TYPE,
                                                     section=CC.HDMM)
        except NoSectionError as err:
            raise DASConfigError(
                f"HDMM engine requires '{CC.HDMM_STRATEGY_TYPE}' to be set up in [{err.section}] section",
                None, err.section)
        except NoOptionError as err:
            raise DASConfigError(
                f"MHDMM engine requires '{err.option}' to be set up in [{err.section}] section",
                err.option, err.section)
        print("HDMM Strategy type: ", self.strategy_type)
Example #3
    def setWorkload(self):
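        """Read the workload query names from config and build the workload queries dict."""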
        try:
            self.workload = list(
                self.gettuple(CC.WORKLOAD,
                              section=CC.WORKLOAD,
                              sep=CC.REGEX_CONFIG_DELIM))
        except NoSectionError as err:
            raise DASConfigError(
                f"HDMM engine requires '{CC.WORKLOAD}' to be set up in [{err.section}] section",
                None, err.section)
        except NoOptionError as err:
            raise DASConfigError(
                f"HDMM engine requires '{err.option}' to be set up in [{err.section}] section",
                err.option, err.section)
        print("workload: ", self.workload)

        self.workload_queries_dict = make_workloads.WorkloadQueriesCreator(
            self.setup.schema_obj, self.workload).workload_queries_dict

        self.queries_dict = {}
Example #4
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data_shape = None
        tv = set(map(lambda v: v.name, self.variables))
        vv = set(
            map(lambda v: v.name,
                self.recode_variables)) if self.recode_variables else set()
        tv_and_vv = tv.union(vv)
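        # Every histogram variable must be declared either as a table variable or as a recode variable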
        for varname in self.histogram_variables:
            if varname not in tv_and_vv:
                msg = f"Histogram variable {varname} for {self.name} is neither in table variables nor in recode variables, check your config file"
                raise DASConfigError(
                    msg,
                    f"{self.name}.{CC.HISTOGRAM}/{self.name}.{CC.GEOGRAPHY}",
                    CC.READER)
Example #5
    def make_from_config(self, config):
        """
            This will set the attributes reading from the config file.
        """
        try:
            vtype = config.get(CC.READER, f"{self.name}.{CC.VAR_TYPE}")
            legal_values = config.get(CC.READER, f"{self.name}.{CC.LEGAL}")
        except NoOptionError as e:
            raise DASConfigError(
                f"Missing variable {self.name} specifications", *e.args)

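        # Store the parsed variable type and its legal values on the variable object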
        self.set_vtype(vtype)
        self.set_legal_values(legal_values)

        return self
Example #6
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.writer_names = self.gettuple(CC.MULTIWRITER_WRITERS)
        self.writers = []
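        # Build each configured writer; give each a writer-specific output file name,
        # and let only the first writer keep the overwrite flag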
        postfix = self.getconfig(CC.OUTPUT_DATAFILE_NAME, default='data')
        for i, wname in enumerate(self.writer_names):
            try:
                writer_class = self.SUPPORTED_WRITERS[wname]
            except KeyError:
                raise DASConfigError(
                    f"Writer {wname} is not supported in MultiWriter",
                    CC.MULTIWRITER_WRITERS, CC.WRITER)
            w = writer_class(name=CC.WRITER,
                             config=self.config,
                             setup=self.setup,
                             das=self.das)
            assert isinstance(w, DASDecennialWriter)

            w.setOutputFileDataName(f"{postfix}-{wname}")
            if i > 0:
                w.unsetOverwriteFlag()
            self.writers.append(w)
Example #7
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        if self.getboolean(CC.PRE_RELEASE, default=False):
            import programs.pre_release_datadict as datadict
        else:
            import programs.datadict as datadict

        self.datadict = datadict

        # Whether to run in spark (there is a local/serial mode, for testing and debugging)
        self.use_spark = self.getboolean(CC.SPARK,
                                         section=CC.ENGINE,
                                         default=True)

        # Geographical level names
        self.levels = self.gettuple(CC.GEODICT_GEOLEVELS, section=CC.GEODICT)

        # Bottom level geolevel we are interested in. Defaults to the first level in self.levels, which is the lowest
        self.geo_bottomlevel = self.getconfig(CC.GEO_BOTTOMLEVEL,
                                              section=CC.GEODICT,
                                              default=self.levels[0])

        self.only_dyadic_rationals = self.getboolean(CC.ONLY_DYADIC_RATIONALS,
                                                     section=CC.BUDGET,
                                                     default=False)
        # self.geolevel_prop_budgets = self.gettuple_of_fraction2floats(CC.GEOLEVEL_BUDGET_PROP, section=CC.BUDGET, sep=CC.REGEX_CONFIG_DELIM)
        self.geolevel_prop_budgets = self.gettuple_of_fractions(
            CC.GEOLEVEL_BUDGET_PROP,
            section=CC.BUDGET,
            sep=CC.REGEX_CONFIG_DELIM)
        if self.only_dyadic_rationals:
            checkDyadic(self.geolevel_prop_budgets, msg="across-geolevel")

        # Create geocode dict
        geolevel_leng = self.gettuple(CC.GEODICT_LENGTHS, section=CC.GEODICT)
        assert len(geolevel_leng) == len(
            self.levels), "Geolevel names and geolevel lengths differ in size"
        self.geocode_dict = {
            int(gl_length): gl_name
            for gl_name, gl_length in zip(self.levels, geolevel_leng)
        }

        self.spine_type = self.getconfig(CC.SPINE,
                                         section=CC.GEODICT,
                                         default="non_aian_spine")
        if self.spine_type not in CC.SPINE_TYPE_ALLOWED:
            raise DASConfigError(
                msg=f"spine type must be {'/'.join(CC.SPINE_TYPE_ALLOWED)} rather than {self.spine_type}.",
                option=CC.SPINE,
                section=CC.GEODICT)

        self.plb_allocation = None  # To be filled in the reader module if "opt_spine"

        self.privacy_framework = self.getconfig(key=CC.PRIVACY_FRAMEWORK,
                                                section=CC.BUDGET,
                                                default=CC.PURE_DP)
        self.dp_mechanism_name = self.getconfig(key=CC.DP_MECHANISM,
                                                section=CC.BUDGET,
                                                default=CC.GEOMETRIC_MECHANISM)
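        # Check that the configured DP mechanism is compatible with the privacy accounting framework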
        mechanism_not_implemented_msg = f"{self.dp_mechanism_name} not implemented for {self.privacy_framework}."
        if self.privacy_framework in (CC.ZCDP, ):
            assert self.dp_mechanism_name in (
                CC.DISCRETE_GAUSSIAN_MECHANISM,
                CC.ROUNDED_CONTINUOUS_GAUSSIAN_MECHANISM,
                CC.FLOAT_DISCRETE_GAUSSIAN_MECHANISM
            ), mechanism_not_implemented_msg
        elif self.privacy_framework in (CC.PURE_DP, ):
            assert self.dp_mechanism_name in (
                CC.GEOMETRIC_MECHANISM, ), mechanism_not_implemented_msg
        else:
            raise NotImplementedError(
                f"DP primitives/composition rules for {self.privacy_framework} not implemented."
            )
        self.log_and_print(f"Privacy mechanism: {self.dp_mechanism_name}")

        self.log_and_print(f"geolevels: {self.levels}")
        # schema keyword
        self.schema = self.getconfig(CC.SCHEMA, section=CC.SCHEMA)
        self.log_and_print(f"schema keyword: {self.schema}")
        self.schema_obj = SchemaMaker.fromName(self.schema)
        self.unit_schema_obj = SchemaMaker.fromName(
            _unit_schema_dict[self.schema])
        self.postprocess_only = self.getboolean(CC.POSTPROCESS_ONLY,
                                                section=CC.ENGINE,
                                                default=False)
        self.validate_input_data_constraints = self.getboolean(
            CC.VALIDATE_INPUT_DATA_CONSTRAINTS,
            section=CC.READER,
            default=True)

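        # Read the invariant and constraint name lists for each geolevel from the [constraints] section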
        self.inv_con_by_level = {}
        for level in self.levels:
            self.inv_con_by_level[level] = {
                "invar_names":
                self.gettuple(f"{CC.THEINVARIANTS}.{level}",
                              section=CC.CONSTRAINTS,
                              default=()),
                "cons_names":
                self.gettuple(f"{CC.THECONSTRAINTS}.{level}",
                              section=CC.CONSTRAINTS,
                              default=())
            }

        try:
            # Person table histogram shape (set here and then checked/set in the reader module init)
            self.hist_shape = self.schema_obj.shape
            self.unit_hist_shape = self.unit_schema_obj.shape
            # Person table histogram variables (set here and then checked/set in the reader module init)
            self.hist_vars = self.schema_obj.dimnames
        except AssertionError:
            self.log_warning_and_print(
                f"Schema {self.schema} is not supported")

        # Temporary directory with code and files shipped to spark, to delete later
        self.dir4sparkzip = None

        noisy_partitions_by_level = self.gettuple_of_ints(
            CC.NOISY_PARTITIONS_BY_LEVEL,
            section=CC.WRITER_SECTION,
            default=",".join(("0", ) * len(self.levels)))
        self.annotate(
            f'noisy_partitions_by_level: {noisy_partitions_by_level}')
        assert len(noisy_partitions_by_level) == len(self.levels), \
            f'Config Error: noisy_partitions_by_level should be the same length as the geolevels. ' \
            f'Found instead: self.levels: {self.levels}, noisy_partitions_by_level: {noisy_partitions_by_level}'

        self.noisy_partitions_dict = {
            self.levels[index]: noisy_partitions_by_level[index]
            for index in range(len(self.levels))
        }
        self.annotate(f'noisy_partitions_dict: {self.noisy_partitions_dict}')

        self.dvs_enabled = self.getboolean(CC.DVS_ENABLED,
                                           section=CC.DVS_SECTION,
                                           default=False)
Example #8
        def __init__(self, budget, **kwargs):
            super().__init__(**kwargs)

            try:
                strategy = StrategySelector.strategies[budget.getconfig(
                    CC.STRATEGY)].make(budget.levels)
            except (NoOptionError, NoSectionError):
                raise DASConfigError("DPQuery strategy has to be set",
                                     section=CC.BUDGET,
                                     option="strategy")

            self.dp_query_names = strategy[CC.DPQUERIES]
            self.dp_query_prop = strategy[CC.QUERIESPROP]
            self.unit_dp_query_names = strategy[CC.UNITDPQUERIES]
            self.unit_dp_query_prop = strategy[CC.UNITQUERIESPROP]

            # FILL QUERY DICT
            self.queries_dict = {}
            for geolevel in budget.geolevel_prop_budgets_dict:
                self.queries_dict.update(
                    budget.schema_obj.getQueries(
                        self.dp_query_names[geolevel]))
                self.queries_dict.update(
                    budget.unit_schema_obj.getQueries(
                        self.unit_dp_query_names[geolevel]))

            ## CHECKING

            assert len(self.dp_query_names) == len(budget.levels)
            assert len(self.dp_query_prop) == len(budget.levels)
            assert len(self.unit_dp_query_names) in (0, len(budget.levels))
            assert len(self.unit_dp_query_prop) in (0, len(budget.levels))

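            # Width of the longest query name, used to align the allocation printout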
            max_qname_len = max(map(len, self.queries_dict))

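            # Build per-geolevel allocation strings: qallocstr holds within-geolevel proportions,
            # qallocstr_gprop the overall proportions (multiplied by each geolevel's share)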
            qallocstr_gprop = ""
            for geolevel, gprop in budget.geolevel_prop_budgets_dict.items():

                # Make a list to check later if it sums up to 1.
                budget_per_each_query: list = []

                budget_per_each_query.extend(list(
                    self.dp_query_prop[geolevel]))

                self.checkUnique(self.dp_query_names[geolevel], CC.DPQUERIES)
                self.checkUnique(self.unit_dp_query_names[geolevel],
                                 CC.UNITDPQUERIES)

                budget.checkDyadic(self.dp_query_prop[geolevel], msg="queries")

                qallocstr = f"{geolevel}:\n\t" + "\n\t".join([
                    f"{query.name + ':' + ' ' * (max_qname_len - len(query.name))}  {qprop}"
                    for query, qprop in self.queryPropPairs(geolevel)
                ])
                qallocstr_gprop += f"{geolevel}:\n\t" + "\n\t".join([
                    f"{query.name + ':' + ' ' * (max_qname_len - len(query.name))}  {qprop * gprop}"
                    for query, qprop in self.queryPropPairs(geolevel)
                ])
                if self.unit_dp_query_names[geolevel]:
                    # Add the fractions of per-geolevel budgets dedicated to each query to the list that should sum up to 1.
                    budget_per_each_query.extend(
                        list(self.unit_dp_query_prop[geolevel]))
                    budget.checkDyadic(self.unit_dp_query_prop[geolevel],
                                       msg="unit queries")
                    qallocstr += "\n\t" + "\n\t".join([
                        f"{query.name + ':' + ' ' * (max_qname_len - len(query.name))}  {qprop}"
                        for query, qprop in self.unitQueryPropPairs(geolevel)
                    ])
                    qallocstr_gprop += "\n\t" + "\n\t".join([
                        f"{query.name + ':' + ' ' * (max_qname_len - len(query.name))}  {qprop * gprop}"
                        for query, qprop in self.unitQueryPropPairs(geolevel)
                    ])

                qallocstr_gprop += "\n"
                assertSumTo(budget_per_each_query,
                            msg="Within-geolevel Budget Proportion")
                assertEachPositive(budget_per_each_query, "queries")

                budget.log_and_print("Within-geolevel query allocations:")
                budget.log_and_print(qallocstr)

            logfilename = os.getenv('LOGFILE_NAME')
            df = print_alloc.makeDataFrame(budget.getconfig(CC.STRATEGY),
                                           budget.levels)
            self.allocation_df = df  # Saved so it can be printed outside the budget object
            self.printAllocTables(df, budget)
            self.saveQueryAllocations(df, "_wglev_query_allocations",
                                      logfilename)

            dftot = print_alloc.multiplyByGLBudgets(
                df.copy(deep=True), budget.geolevel_prop_budgets_dict.items())
            budget.log_and_print(
                "All query allocations (i.e. multiplied by geolevel proportion):"
            )
            budget.log_and_print(qallocstr_gprop)
            self.printAllocTables(dftot, budget)
            self.saveQueryAllocations(dftot, "_overall_query_allocations",
                                      logfilename)

            # Print all levels on which the measurements are taken:
            self.printLevelsOfMarginals(
                budget, set(reduce(add, self.dp_query_names.values())),
                budget.schema_obj, 'main histogram')
            nonempty_unit_dp_query_names = [
                udpqn for udpqn in self.unit_dp_query_names.values() if udpqn
            ]
            if nonempty_unit_dp_query_names:
                self.printLevelsOfMarginals(
                    budget, set(reduce(add,
                                       self.unit_dp_query_names.values())),
                    budget.unit_schema_obj, 'unit histogram')

            self.checkQueryImpactGaps(budget, self.queries_dict)
Example #9
    def setOptimizersAndQueryOrderings(self, levels):
        """
        For engines with queries set in config (e.g. topdown, bottomup).
        Read the optimization approaches and the query ordering from config, fill in the rounder and
        constrain-to orderings, and check that every measured DP query is targeted in L2 optimization.
        :return: (optimizers, query_ordering, rounder_query_names)
        """

        # If a multipass approach was specified for L2 or Rounder, get the order in which it specifies to optimize queries
        l2_optimization_approach = self.getconfig(
            CC.L2_OPTIMIZATION_APPROACH,
            section=CC.GUROBI_SECTION,
            default=CC.SINGLE_PASS_REGULAR)
        rounder_optimization_approach = self.getconfig(
            CC.ROUNDER_OPTIMIZATION_APPROACH,
            section=CC.GUROBI_SECTION,
            default=CC.CELLWISE_ROUNDER)

        seq_opt_name = self.getconfig(CC.SEQ_OPT_OPTIMIZATION_APPROACH,
                                      section=CC.GUROBI_SECTION,
                                      default=CC.L2_PLUS_ROUNDER_WITH_BACKUP)
        outer_pass = seq_opt_name == CC.L2_PLUS_ROUNDER_WITH_BACKUP_INTERLEAVED

        optimizers = (seq_opt_name, l2_optimization_approach,
                      rounder_optimization_approach)

        try:
            query_ordering_name = self.getconfig("query_ordering",
                                                 section=CC.BUDGET)
        except (NoOptionError, NoSectionError):
            #  Or just return empty query_ordering?
            raise DASConfigError("Query ordering has to be set",
                                 "query_ordering", CC.BUDGET)

        query_ordering = QueryOrderingSelector.query_orderings[
            query_ordering_name].make(levels)

        # Fill rounder_queries
        rounder_query_names = {}
        for geolevel, qo_dict_geolevel in query_ordering.items():
            rounder_query_ordering = qo_dict_geolevel[
                CC.ROUNDER_QUERY_ORDERING]
            if rounder_query_ordering is None:
                continue
            if not outer_pass:
                rounder_query_names[geolevel] = reduce(
                    add, rounder_query_ordering.values())
            else:
                rounder_query_names[geolevel] = reduce(
                    add,
                    map(lambda opd: reduce(add, opd.values()),
                        rounder_query_ordering.values()))

        # Fill constrain_to ordering if empty
        for geolevel, qo_glev in query_ordering.items():
            if CC.L2_CONSTRAIN_TO_QUERY_ORDERING not in qo_glev or not qo_glev[
                    CC.L2_CONSTRAIN_TO_QUERY_ORDERING]:
                query_ordering[geolevel][
                    CC.L2_CONSTRAIN_TO_QUERY_ORDERING] = query_ordering[
                        geolevel][CC.L2_QUERY_ORDERING]

        ### CHECKING
        assert len(query_ordering) == len(levels), \
            "Number of query ordering geolevels differs from engine/budget geolevels, check the strategy"
        for geolevel, qo_dict_geolevel in query_ordering.items():
            l2_dp_query_ordering = qo_dict_geolevel[CC.L2_QUERY_ORDERING]
            if l2_dp_query_ordering is None:
                continue
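            # Flatten the ordering (nested one level deeper when interleaved outer passes are used)
            # into the full list of queries targeted in L2 optimization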
            l2_target_queries = []
            options_list = []

            if not outer_pass:
                for pn, qnames in l2_dp_query_ordering.items():
                    l2_target_queries.extend(qnames)
                    options_list.append(f"L2_DPqueryPart{pn}")
            else:
                for opn in l2_dp_query_ordering.keys():
                    for ipn, qnames in l2_dp_query_ordering[opn].items():
                        l2_target_queries.extend(qnames)
                        options_list.append(f"L2_DPqueryPart{opn}_{ipn}")
            l2_target_queries = sortMarginalNames(l2_target_queries)

            # if len(l2_target_queries) > len(set(l2_target_queries)):
            #    raise DASConfigValdationError(f"Some queries {l2_target_queries} are targeted in L2 optimization more than once",
            #                                  section=CC.BUDGET, options=options_list)
            # # NOTE: this is no longer a requirement with constrain-to config specification.
            # l2_target_queries = sortMarginalNames(l2_target_queries)

            measured_dp_queries = sortMarginalNames(
                self.budget.query_budget.dp_query_names[geolevel])

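            # Every measured DP query must appear among the L2-targeted queries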
            if len(set(measured_dp_queries) - set(l2_target_queries)) > 0:
                raise ValueError(
                    f"In query ordering {query_ordering_name}, geolevel {geolevel}, some of the measured DP queries ({measured_dp_queries}) are not targeted in L2 optimization {l2_target_queries}"
                )

            print(
                f"Detected {geolevel} l2_dp_query_ordering: {query_ordering[geolevel][CC.L2_QUERY_ORDERING]}"
            )
            print(
                f"Detected {geolevel} l2_ConstrainTo_dp_query_ordering: {query_ordering[geolevel][CC.L2_CONSTRAIN_TO_QUERY_ORDERING]}"
            )
            print(
                f"Detected {geolevel} rounder_dp_query_ordering: {query_ordering[geolevel][CC.ROUNDER_QUERY_ORDERING]}"
            )

        return optimizers, query_ordering, rounder_query_names