def optimizeWorkload(self):
    """ Gathers the workload and optimizes to find the strategy """
    # Load previously calculated strategy if requested
    if self.getboolean(LOAD_HDMM_STRATEGY, section=CC.HDMM, default=False):
        saved_strategy_path = self.getconfig(LOAD_HDMM_STRATEGY_PATH, section=CC.HDMM, default='')
        if not saved_strategy_path:
            try:
                saved_strategy_path = self.das.writer.output_path
            except (NoOptionError, NoSectionError) as err:
                raise DASConfigError(
                    f"Load HDMM strategy is requested, but neither [{CC.HDMM}]/{LOAD_HDMM_STRATEGY_PATH} nor [{err.section}]/{err.option} is provided to find the saved strategy",
                    err.option, err.section)
        load_path = os.path.join(saved_strategy_path, SAVED_STRATEGY_NAME)
        self.log_and_print(f"Loading HDMM strategy from {load_path}...")
        try:
            strategy = das_utils.loadPickleFile(load_path)
            self.queries_dict["hdmm_strategy"] = self.strategy2query(strategy)
            return
        except (FileNotFoundError, EOFError):
            self.log_and_print(f"File with saved HDMM strategy {load_path} not found, re-calculating strategy...", cui=False)

    # Calculate the strategy
    W = self.dict2workload(self.workload_queries_dict)
    if self.strategy_type == CC.PIDENTITY_STRATEGY:
        strategy = self.findPIdentityStrategy()
    elif self.strategy_type == CC.MARGINAL_STRATEGY:
        strategy = self.findMarginalStrategy()
    else:
        raise DASValueError(
            f"Strategy type '{self.strategy_type}' not implemented in das_decennial HDMM engine",
            self.strategy_type)

    # Run the optimization
    ans = strategy.optimize(W)

    # Save the strategy if requested
    if self.getboolean(SAVE_HDMM_STRATEGY, section=CC.HDMM):
        try:
            output_path = self.das.writer.output_path
            save_path = os.path.join(output_path, SAVED_STRATEGY_NAME)
        except (NoOptionError, NoSectionError) as err:
            raise DASConfigError(
                f"Save HDMM strategy is requested, but the saving path [{err.section}]/{err.option} is not provided",
                err.option, err.section)
        das_utils.savePickleFile(save_path, strategy)

    self.queries_dict["hdmm_strategy"] = self.strategy2query(strategy)
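# A minimal sketch of the pickle round-trip that optimizeWorkload relies on
# via das_utils.savePickleFile/loadPickleFile. This is an assumption for
# illustration: the real das_utils helpers may also handle S3/HDFS paths, and
# save_pickle/load_pickle are hypothetical names, not the das_utils API.
import os
import pickle

def save_pickle(path, obj):
    # Ensure the parent directory exists before writing; callers join
    # SAVED_STRATEGY_NAME onto the writer's output path.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def load_pickle(path):
    # FileNotFoundError/EOFError propagate to the caller, which then falls
    # back to re-calculating the strategy (see optimizeWorkload above).
    with open(path, "rb") as f:
        return pickle.load(f)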
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.mechanism = primitives.basic_dp_answer
    self.dp_queries = False
    try:
        self.strategy_type: str = self.getconfig(CC.HDMM_STRATEGY_TYPE, section=CC.HDMM)
    except NoSectionError as err:
        raise DASConfigError(
            f"HDMM engine requires '{CC.HDMM_STRATEGY_TYPE}' to be set up in [{err.section}] section",
            None, err.section)
    except NoOptionError as err:
        raise DASConfigError(
            f"HDMM engine requires '{err.option}' to be set up in [{err.section}] section",
            err.option, err.section)
    print("HDMM Strategy type: ", self.strategy_type)
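# Hypothetical config excerpt that this __init__ reads. The literal section
# and option names behind CC.HDMM and CC.HDMM_STRATEGY_TYPE, and the value
# 'pidentity', are assumptions for illustration; the value must select one of
# the two strategy types handled in optimizeWorkload above.
#
#   [hdmm]
#   strategy_type: pidentity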
def setWorkload(self):
    try:
        self.workload = list(self.gettuple(CC.WORKLOAD, section=CC.WORKLOAD, sep=CC.REGEX_CONFIG_DELIM))
    except NoSectionError as err:
        raise DASConfigError(
            f"HDMM engine requires '{CC.WORKLOAD}' to be set up in [{err.section}] section",
            None, err.section)
    except NoOptionError as err:
        raise DASConfigError(
            f"HDMM engine requires '{err.option}' to be set up in [{err.section}] section",
            err.option, err.section)
    print("workload: ", self.workload)
    self.workload_queries_dict = make_workloads.WorkloadQueriesCreator(self.setup.schema_obj, self.workload).workload_queries_dict
    self.queries_dict = {}
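# Sketch of how the workload list above is assumed to be parsed: gettuple
# splits the option value on CC.REGEX_CONFIG_DELIM. Assuming a comma-style
# delimiter and illustrative query names, a [workload] section such as
#
#   [workload]
#   workload: total, hispanic * cenrace, detailed
#
# would yield self.workload == ['total', 'hispanic * cenrace', 'detailed'].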
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.data_shape = None
    tv = set(map(lambda v: v.name, self.variables))
    vv = set(map(lambda v: v.name, self.recode_variables)) if self.recode_variables else set()
    tv_and_vv = tv.union(vv)
    for varname in self.histogram_variables:
        if varname not in tv_and_vv:
            msg = f"Histogram variable {varname} for {self.name} is neither in table variables nor in recode variables; check your config file"
            raise DASConfigError(msg, f"{self.name}.{CC.HISTOGRAM}/{self.name}.{CC.GEOGRAPHY}", CC.READER)
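# Standalone illustration of the membership check above (the variable names
# are made up): a histogram variable must appear among either the table
# variables or the recode variables.
table_vars = {"age", "sex"}
recode_vars = {"agecat"}
histogram_vars = ["agecat", "race"]
missing = [v for v in histogram_vars if v not in table_vars | recode_vars]
assert missing == ["race"]  # "race" would trigger the DASConfigError above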
def make_from_config(self, config):
    """ Set the attributes by reading them from the config file. """
    try:
        vtype = config.get(CC.READER, f"{self.name}.{CC.VAR_TYPE}")
        legal_values = config.get(CC.READER, f"{self.name}.{CC.LEGAL}")
    except NoOptionError as e:
        raise DASConfigError(f"Missing variable {self.name} specifications", *e.args)
    self.set_vtype(vtype)
    self.set_legal_values(legal_values)
    return self
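# Hypothetical [reader] excerpt that make_from_config reads for a variable
# named "age". The literal option suffixes behind CC.VAR_TYPE and CC.LEGAL,
# and the value formats, are assumptions for illustration only.
#
#   [reader]
#   age.type: int
#   age.legal: 0-115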
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.writer_names = self.gettuple(CC.MULTIWRITER_WRITERS)
    self.writers = []
    for i, wname in enumerate(self.writer_names):
        try:
            writer_class = self.SUPPORTED_WRITERS[wname]
        except KeyError:
            raise DASConfigError(f"Writer {wname} is not supported in MultiWriter", CC.MULTIWRITER_WRITERS, CC.WRITER)
        postfix = self.getconfig(CC.OUTPUT_DATAFILE_NAME, default='data')
        w = writer_class(name=CC.WRITER, config=self.config, setup=self.setup, das=self.das)
        assert isinstance(w, DASDecennialWriter)
        w.setOutputFileDataName(f"{postfix}-{wname}")
        if i > 0:
            w.unsetOverwriteFlag()
        self.writers.append(w)
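# Standalone illustration of the per-writer naming above (the writer names
# are made up): each sub-writer's data file name becomes
# "<postfix>-<writer name>", and only the first writer keeps its overwrite
# flag set, so later writers cannot clobber the first writer's output.
writer_names = ("blocknodedicts", "mdf2020")
postfix = "data"
for i, wname in enumerate(writer_names):
    print(f"{postfix}-{wname}", "overwrite" if i == 0 else "no-overwrite")
# data-blocknodedicts overwrite
# data-mdf2020 no-overwrite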
def __init__(self, **kwargs):
    super().__init__(**kwargs)

    if self.getboolean(CC.PRE_RELEASE, default=False):
        import programs.pre_release_datadict as datadict
    else:
        import programs.datadict as datadict
    self.datadict = datadict

    # Whether to run in spark (there is a local/serial mode, for testing and debugging)
    self.use_spark = self.getboolean(CC.SPARK, section=CC.ENGINE, default=True)

    # Geographical level names
    self.levels = self.gettuple(CC.GEODICT_GEOLEVELS, section=CC.GEODICT)

    # Bottom geolevel we are interested in. Defaults to the first level in self.levels, which is the lowest
    self.geo_bottomlevel = self.getconfig(CC.GEO_BOTTOMLEVEL, section=CC.GEODICT, default=self.levels[0])

    self.only_dyadic_rationals = self.getboolean(CC.ONLY_DYADIC_RATIONALS, section=CC.BUDGET, default=False)

    # self.geolevel_prop_budgets = self.gettuple_of_fraction2floats(CC.GEOLEVEL_BUDGET_PROP, section=CC.BUDGET, sep=CC.REGEX_CONFIG_DELIM)
    self.geolevel_prop_budgets = self.gettuple_of_fractions(CC.GEOLEVEL_BUDGET_PROP, section=CC.BUDGET, sep=CC.REGEX_CONFIG_DELIM)
    if self.only_dyadic_rationals:
        checkDyadic(self.geolevel_prop_budgets, msg="across-geolevel")

    # Create geocode dict
    geolevel_leng = self.gettuple(CC.GEODICT_LENGTHS, section=CC.GEODICT)
    assert len(geolevel_leng) == len(self.levels), "Geolevel names and geolevel lengths differ in size"
    self.geocode_dict = {int(gl_length): gl_name for gl_name, gl_length in zip(self.levels, geolevel_leng)}

    self.spine_type = self.getconfig(CC.SPINE, section=CC.GEODICT, default="non_aian_spine")
    if self.spine_type not in CC.SPINE_TYPE_ALLOWED:
        raise DASConfigError(
            msg=f"spine type must be {'/'.join(CC.SPINE_TYPE_ALLOWED)} rather than {self.spine_type}.",
            option=CC.SPINE, section=CC.BUDGET)

    self.plb_allocation = None  # To be filled in the reader module if "opt_spine"

    self.privacy_framework = self.getconfig(key=CC.PRIVACY_FRAMEWORK, section=CC.BUDGET, default=CC.PURE_DP)
    self.dp_mechanism_name = self.getconfig(key=CC.DP_MECHANISM, section=CC.BUDGET, default=CC.GEOMETRIC_MECHANISM)
    mechanism_not_implemented_msg = f"{self.dp_mechanism_name} not implemented for {self.privacy_framework}."
    if self.privacy_framework in (CC.ZCDP,):
        assert self.dp_mechanism_name in (
            CC.DISCRETE_GAUSSIAN_MECHANISM,
            CC.ROUNDED_CONTINUOUS_GAUSSIAN_MECHANISM,
            CC.FLOAT_DISCRETE_GAUSSIAN_MECHANISM), mechanism_not_implemented_msg
    elif self.privacy_framework in (CC.PURE_DP,):
        assert self.dp_mechanism_name in (CC.GEOMETRIC_MECHANISM,), mechanism_not_implemented_msg
    else:
        raise NotImplementedError(f"DP primitives/composition rules for {self.privacy_framework} not implemented.")
    self.log_and_print(f"Privacy mechanism: {self.dp_mechanism_name}")

    self.log_and_print(f"geolevels: {self.levels}")

    # Schema keyword
    self.schema = self.getconfig(CC.SCHEMA, section=CC.SCHEMA)
    self.log_and_print(f"schema keyword: {self.schema}")
    self.schema_obj = SchemaMaker.fromName(self.schema)
    self.unit_schema_obj = SchemaMaker.fromName(_unit_schema_dict[self.schema])

    self.postprocess_only = self.getboolean(CC.POSTPROCESS_ONLY, section=CC.ENGINE, default=False)
    self.validate_input_data_constraints = self.getboolean(CC.VALIDATE_INPUT_DATA_CONSTRAINTS, section=CC.READER, default=True)

    self.inv_con_by_level = {}
    for level in self.levels:
        self.inv_con_by_level[level] = {
            "invar_names": self.gettuple(f"{CC.THEINVARIANTS}.{level}", section=CC.CONSTRAINTS, default=()),
            "cons_names": self.gettuple(f"{CC.THECONSTRAINTS}.{level}", section=CC.CONSTRAINTS, default=())
        }

    try:
        # Person table histogram shape (set here and then checked/set in the reader module init)
        self.hist_shape = self.schema_obj.shape
        self.unit_hist_shape = self.unit_schema_obj.shape
        # Person table histogram variables (set here and then checked/set in the reader module init)
        self.hist_vars = self.schema_obj.dimnames
    except AssertionError:
        self.log_warning_and_print(f"Schema {self.schema} is not supported")

    # Temporary directory with code and files shipped to spark, to delete later
    self.dir4sparkzip = None

    noisy_partitions_by_level = self.gettuple_of_ints(CC.NOISY_PARTITIONS_BY_LEVEL, section=CC.WRITER_SECTION, default=",".join(("0",) * len(self.levels)))
    self.annotate(f'noisy_partitions_by_level: {noisy_partitions_by_level}')
    assert len(noisy_partitions_by_level) == len(self.levels), \
        f'Config Error: noisy_partitions_by_level should be the same length as the geolevels. Found instead: self.levels: {self.levels}, noisy_partitions_by_level: {noisy_partitions_by_level}'
    self.noisy_partitions_dict = {self.levels[index]: noisy_partitions_by_level[index] for index in range(len(self.levels))}
    self.annotate(f'noisy_partitions_dict: {self.noisy_partitions_dict}')

    self.dvs_enabled = self.getboolean(CC.DVS_ENABLED, section=CC.DVS_SECTION, default=False)
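# Standalone illustration of the geocode_dict construction above: geolevel
# names are zipped with their geocode prefix lengths, keyed by length. The
# names and lengths below are illustrative, not the production geodict.
levels = ("Block", "Block_Group", "Tract", "County", "State")
geolevel_leng = ("16", "12", "11", "5", "2")
geocode_dict = {int(gl_length): gl_name for gl_name, gl_length in zip(levels, geolevel_leng)}
assert geocode_dict == {16: "Block", 12: "Block_Group", 11: "Tract", 5: "County", 2: "State"}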
def __init__(self, budget, **kwargs):
    super().__init__(**kwargs)
    try:
        strategy = StrategySelector.strategies[budget.getconfig(CC.STRATEGY)].make(budget.levels)
    except (NoOptionError, NoSectionError):
        raise DASConfigError("DPQuery strategy has to be set", section=CC.BUDGET, option="strategy")

    self.dp_query_names = strategy[CC.DPQUERIES]
    self.dp_query_prop = strategy[CC.QUERIESPROP]
    self.unit_dp_query_names = strategy[CC.UNITDPQUERIES]
    self.unit_dp_query_prop = strategy[CC.UNITQUERIESPROP]

    # FILL QUERY DICT
    self.queries_dict = {}
    for geolevel in budget.geolevel_prop_budgets_dict:
        self.queries_dict.update(budget.schema_obj.getQueries(self.dp_query_names[geolevel]))
        self.queries_dict.update(budget.unit_schema_obj.getQueries(self.unit_dp_query_names[geolevel]))

    ## CHECKING
    assert len(self.dp_query_names) == len(budget.levels)
    assert len(self.dp_query_prop) == len(budget.levels)
    assert len(self.unit_dp_query_names) in (0, len(budget.levels))
    assert len(self.unit_dp_query_prop) in (0, len(budget.levels))

    max_qname_len = max(map(len, self.queries_dict))
    qallocstr_gprop = ""
    for geolevel, gprop in budget.geolevel_prop_budgets_dict.items():
        # Make a list to check later if it sums up to 1.
        budget_per_each_query: list = []
        budget_per_each_query.extend(list(self.dp_query_prop[geolevel]))

        self.checkUnique(self.dp_query_names[geolevel], CC.DPQUERIES)
        self.checkUnique(self.unit_dp_query_names[geolevel], CC.UNITDPQUERIES)
        budget.checkDyadic(self.dp_query_prop[geolevel], msg="queries")

        qallocstr = f"{geolevel}:\n\t" + "\n\t".join([
            f"{query.name + ':' + ' ' * (max_qname_len - len(query.name))} {qprop}"
            for query, qprop in self.queryPropPairs(geolevel)
        ])
        qallocstr_gprop += f"{geolevel}:\n\t" + "\n\t".join([
            f"{query.name + ':' + ' ' * (max_qname_len - len(query.name))} {qprop * gprop}"
            for query, qprop in self.queryPropPairs(geolevel)
        ])

        if self.unit_dp_query_names[geolevel]:
            # Add the fractions of per-geolevel budgets dedicated to each query to the list that should sum up to 1.
            budget_per_each_query.extend(list(self.unit_dp_query_prop[geolevel]))
            budget.checkDyadic(self.unit_dp_query_prop[geolevel], msg="unit queries")
            qallocstr += "\n\t".join([
                f"{query.name + ':' + ' ' * (max_qname_len - len(query.name))} {qprop}"
                for query, qprop in self.unitQueryPropPairs(geolevel)
            ])
            qallocstr_gprop += "\n\t".join([
                f"{query.name + ':' + ' ' * (max_qname_len - len(query.name))} {qprop * gprop}"
                for query, qprop in self.unitQueryPropPairs(geolevel)
            ])
        qallocstr_gprop += "\n"

        assertSumTo(budget_per_each_query, msg="Within-geolevel Budget Proportion")
        assertEachPositive(budget_per_each_query, "queries")

        budget.log_and_print("Within-geolevel query allocations:")
        budget.log_and_print(qallocstr)

    logfilename = os.getenv('LOGFILE_NAME')
    df = print_alloc.makeDataFrame(budget.getconfig(CC.STRATEGY), budget.levels)
    self.allocation_df = df  # Save it for printing out of the budget object
    self.printAllocTables(df, budget)
    self.saveQueryAllocations(df, "_wglev_query_allocations", logfilename)

    dftot = print_alloc.multiplyByGLBudgets(df.copy(deep=True), budget.geolevel_prop_budgets_dict.items())
    budget.log_and_print("All query allocations (i.e. multiplied by geolevel proportion):")
    budget.log_and_print(qallocstr_gprop)
    self.printAllocTables(dftot, budget)
    self.saveQueryAllocations(dftot, "_overall_query_allocations", logfilename)

    # Print all levels on which the measurements are taken:
    self.printLevelsOfMarginals(budget, set(reduce(add, self.dp_query_names.values())), budget.schema_obj, 'main histogram')
    unique_unit_dp_query_names = [udpqn for udpqn in self.unit_dp_query_names.values() if udpqn]
    if unique_unit_dp_query_names:
        self.printLevelsOfMarginals(budget, set(reduce(add, self.unit_dp_query_names.values())), budget.unit_schema_obj, 'unit histogram')

    self.checkQueryImpactGaps(budget, self.queries_dict)
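# Sketch of the within-geolevel checks performed above, with illustrative
# dyadic proportions: the per-query fractions (main plus unit queries) must
# each be positive and sum to 1 within every geolevel. The power-of-2 reading
# of checkDyadic is an assumption based on its name and the
# only_dyadic_rationals option.
from fractions import Fraction

budget_per_each_query = [Fraction(1, 4), Fraction(1, 4), Fraction(1, 2)]
assert sum(budget_per_each_query) == 1            # what assertSumTo verifies
assert all(p > 0 for p in budget_per_each_query)  # what assertEachPositive verifies
# checkDyadic would then require each denominator to be a power of 2:
# 4, 4, and 2 all pass.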
def setOptimizersAndQueryOrderings(self, levels):
    """
    For engines with queries set in config (e.g. topdown, bottomup):
    read the queries from config and set their budget allocations.
    Check that allocation proportions sum to one.
    :return:
    """
    # If a multipass approach was specified for L2 or Rounder, get the order in which it specifies to optimize queries
    l2_optimization_approach = self.getconfig(CC.L2_OPTIMIZATION_APPROACH, section=CC.GUROBI_SECTION, default=CC.SINGLE_PASS_REGULAR)
    rounder_optimization_approach = self.getconfig(CC.ROUNDER_OPTIMIZATION_APPROACH, section=CC.GUROBI_SECTION, default=CC.CELLWISE_ROUNDER)
    seq_opt_name = self.getconfig(CC.SEQ_OPT_OPTIMIZATION_APPROACH, section=CC.GUROBI_SECTION, default=CC.L2_PLUS_ROUNDER_WITH_BACKUP)
    outer_pass = seq_opt_name == CC.L2_PLUS_ROUNDER_WITH_BACKUP_INTERLEAVED
    optimizers = (seq_opt_name, l2_optimization_approach, rounder_optimization_approach)

    try:
        query_ordering_name = self.getconfig("query_ordering", section=CC.BUDGET)
    except (NoOptionError, NoSectionError):
        # Or just return empty query_ordering?
        raise DASConfigError("Query ordering has to be set", "query_ordering", CC.BUDGET)

    query_ordering = QueryOrderingSelector.query_orderings[query_ordering_name].make(levels)

    # Fill rounder_queries
    rounder_query_names = {}
    for geolevel, qo_dict_geolevel in query_ordering.items():
        rounder_query_ordering = qo_dict_geolevel[CC.ROUNDER_QUERY_ORDERING]
        if rounder_query_ordering is None:
            continue
        if not outer_pass:
            rounder_query_names[geolevel] = reduce(add, rounder_query_ordering.values())
        else:
            rounder_query_names[geolevel] = reduce(add, map(lambda opd: reduce(add, opd.values()), rounder_query_ordering.values()))

    # Fill constrain_to ordering if empty
    for geolevel, qo_glev in query_ordering.items():
        if CC.L2_CONSTRAIN_TO_QUERY_ORDERING not in qo_glev or not qo_glev[CC.L2_CONSTRAIN_TO_QUERY_ORDERING]:
            query_ordering[geolevel][CC.L2_CONSTRAIN_TO_QUERY_ORDERING] = query_ordering[geolevel][CC.L2_QUERY_ORDERING]

    ### CHECKING
    assert len(query_ordering) == len(levels), "Query ordering geolevels length is different from engine/budget geolevels, check the strategy"

    for geolevel, qo_dict_geolevel in query_ordering.items():
        l2_dp_query_ordering = qo_dict_geolevel[CC.L2_QUERY_ORDERING]
        if l2_dp_query_ordering is None:
            continue
        l2_target_queries = []
        options_list = []
        if not outer_pass:
            for pn, qnames in l2_dp_query_ordering.items():
                l2_target_queries.extend(qnames)
                options_list.append(f"L2_DPqueryPart{pn}")
        else:
            for opn in l2_dp_query_ordering.keys():
                for ipn, qnames in l2_dp_query_ordering[opn].items():
                    l2_target_queries.extend(qnames)
                    options_list.append(f"L2_DPqueryPart{opn}_{ipn}")

        l2_target_queries = sortMarginalNames(l2_target_queries)
        # if len(l2_target_queries) > len(set(l2_target_queries)):
        #     raise DASConfigValdationError(f"Some queries {l2_target_queries} are targeted in L2 optimization more than once",
        #                                   section=CC.BUDGET, options=options_list)
        # NOTE: this is no longer a requirement with constrain-to config specification.
        # l2_target_queries = sortMarginalNames(l2_target_queries)

        measured_dp_queries = sortMarginalNames(self.budget.query_budget.dp_query_names[geolevel])
        if len(set(measured_dp_queries) - set(l2_target_queries)) > 0:
            raise ValueError(
                f"In query ordering {query_ordering_name}, geolevel {geolevel}, some of the measured DP queries ({measured_dp_queries}) are not targeted in L2 optimization {l2_target_queries}")

        print(f"Detected {geolevel} l2_dp_query_ordering: {query_ordering[geolevel][CC.L2_QUERY_ORDERING]}")
        print(f"Detected {geolevel} l2_ConstrainTo_dp_query_ordering: {query_ordering[geolevel][CC.L2_CONSTRAIN_TO_QUERY_ORDERING]}")
        print(f"Detected {geolevel} rounder_dp_query_ordering: {query_ordering[geolevel][CC.ROUNDER_QUERY_ORDERING]}")

    return optimizers, query_ordering, rounder_query_names
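# Standalone illustration of how rounder_query_names is flattened above
# (query names are illustrative). A non-interleaved ordering maps pass number
# to a list of query names, so one reduce(add, ...) flattens it; an
# interleaved (outer_pass) ordering adds an outer pass level, so two
# applications are needed. Dicts preserve insertion order in Python 3.7+.
from functools import reduce
from operator import add

rqo = {0: ["total"], 1: ["votingage * hispanic"]}
assert reduce(add, rqo.values()) == ["total", "votingage * hispanic"]

rqo_outer = {0: {0: ["total"]}, 1: {0: ["votingage"], 1: ["hispanic"]}}
flat = reduce(add, map(lambda opd: reduce(add, opd.values()), rqo_outer.values()))
assert flat == ["total", "votingage", "hispanic"]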