async def list_deliveries(
        self, step_context: WaterfallStepContext) -> DialogTurnResult:
    LOGGER.debug(msg=f"{ListDeliveriesDialog.__name__}: list deliveries")

    recipient: ChannelAccount = step_context.context.activity.recipient
    data = await self.storage.read([recipient.id])

    # Get this member's state.
    member_state = data.get(recipient.id, {})
    delivery_list: DeliveryList = member_state.get(
        Keys.DELIVERY_LIST_STATE.value)

    if delivery_list:
        deliveries: List[Delivery] = delivery_list.deliveries
        for delivery in deliveries:
            DeliveryCard["body"][0]["text"] = delivery.item
            DeliveryCard["body"][1]["text"] = delivery.destination
            DeliveryCard["body"][2]["text"] = delivery.time
            message = Activity(
                type=ActivityTypes.message,
                attachments=[CardFactory.adaptive_card(DeliveryCard)],
            )
            await step_context.context.send_activity(message)
    else:
        await step_context.context.send_activity(messages.NO_DELIVERIES)

    return await step_context.end_dialog()
async def confirm_step(self, step_context: WaterfallStepContext) -> DialogTurnResult:
    LOGGER.debug(msg=f"{CreateDeliveryDialog.__name__}: confirmation step.")

    # Set the delivery time to what they entered in response to the time prompt.
    delivery: Delivery = step_context.values[Keys.DELIVERY_DIALOG_STATE.value]

    # Capture the response from the previous step.
    delivery.time = step_context.result[0].value

    message_text = (
        f"{messages.DELIVERY_SCHEDULED % (delivery.item, delivery.destination, delivery.time)} "
        f"{messages.IS_THAT_ALL}"
    )
    prompt_options = PromptOptions(prompt=MessageFactory.text(message_text))

    DeliveryCard["body"][0]["text"] = f"Item: {delivery.item}"
    DeliveryCard["body"][1]["text"] = f"Destination: {delivery.destination}"
    DeliveryCard["body"][2]["text"] = f"Time: {delivery.time}"

    await step_context.context.send_activity(
        Activity(
            type=ActivityTypes.message,
            text=message_text,
            attachments=[CardFactory.adaptive_card(DeliveryCard)],
        )
    )
    return await step_context.prompt(ConfirmPrompt.__name__, prompt_options)
async def destination_step(self, step_context: WaterfallStepContext) -> DialogTurnResult:
    """
    If a delivery destination has not been provided, prompt for one.

    :param step_context:
    :return DialogTurnResult:
    """
    LOGGER.debug(msg=f"{CreateDeliveryDialog.__name__}: destination step.")

    # Set the delivery item to what they entered in response to the create delivery prompt.
    delivery: Delivery = step_context.values[Keys.DELIVERY_DIALOG_STATE.value]

    # Capture the response from the previous step.
    delivery.item = step_context.result

    if delivery.destination is None:
        message_text = messages.DELIVERY_DESTINATION_PROMPT % delivery.item
        prompt_options = PromptOptions(
            prompt=MessageFactory.text(
                message_text, message_text, InputHints.expecting_input
            )
        )
        return await step_context.prompt(TextPrompt.__name__, prompt_options)

    return await step_context.next(delivery.destination)
async def _create_delivery(self, step_context):
    recipient: ChannelAccount = step_context.context.activity.recipient
    delivery: Delivery = step_context.values[Keys.DELIVERY_DIALOG_STATE.value]

    data = await self.storage.read([recipient.id])

    # Get or initialize this member's state.
    member_state = data.get(recipient.id, {})

    delivery_list: DeliveryList = member_state.get(Keys.DELIVERY_LIST_STATE.value)
    if delivery_list:
        delivery_list.deliveries.append(delivery)
        delivery_list.turn_number += 1
    else:
        delivery_list = DeliveryList()
        delivery_list.deliveries.append(delivery)
        delivery_list.turn_number = 1

    member_state[Keys.DELIVERY_LIST_STATE.value] = delivery_list

    try:
        # Persist the state keyed by the member id, matching how it is read back.
        await self.storage.write({recipient.id: member_state})
        LOGGER.debug(msg="Delivery persisted.")
    except Exception as e:
        LOGGER.error(
            msg=f"An error='{e}' has occurred while trying to schedule a delivery")
        await step_context.context.send_activity(messages.SOMETHING_WENT_WRONG)
def train(self, data):
    """
    Train an MLE attack to reconstruct an unknown sensitive value from a vector of known attributes.

    :param data: type(DataFrame) A dataset of shape (n, k)
    """
    features = self._encode_data(data.drop(self.sensitiveAttribute, axis=1))
    labels = data[self.sensitiveAttribute].values

    n, k = features.shape

    # Center independent variables for better regression performance.
    self.scaleFactor = mean(features, axis=0)
    featuresScaled = features - self.scaleFactor
    # Append a column of ones to include an intercept in the beta vector.
    featuresScaled = concatenate([ones((n, 1)), featuresScaled], axis=1)

    # Get MLE for linear coefficients.
    self.PredictionModel.fit(featuresScaled, labels)
    self.coefficients = self.PredictionModel.coef_
    self.sigma = sum(
        (labels - featuresScaled.dot(self.coefficients)) ** 2) / (n - k)

    LOGGER.debug('Finished training regression model')
    self.trained = True
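# A minimal standalone sketch (not this class's API) of how the quantities
# fitted above would typically be used at attack time: predict the sensitive
# value as a linear combination of the centred, intercept-augmented features,
# and score a candidate secret under a Gaussian with variance sigma. The names
# `coefficients`, `sigma` and `scale_factor` mirror the attributes set in
# train(); everything else here is illustrative.
from numpy import concatenate, exp, ones, pi, sqrt

def predict_sensitive(features, coefficients, scale_factor):
    n = features.shape[0]
    features_scaled = concatenate([ones((n, 1)), features - scale_factor], axis=1)
    return features_scaled.dot(coefficients)

def gaussian_likelihood(candidate_secret, prediction, sigma):
    # Likelihood of the candidate secret under N(prediction, sigma).
    return exp(-(candidate_secret - prediction) ** 2 / (2 * sigma)) / sqrt(2 * pi * sigma)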
async def on_error(context: TurnContext, error: Exception):
    # This check writes out errors to the console log.
    # NOTE: In a production environment, you should consider logging this to
    # Azure Application Insights.
    LOGGER.error(
        msg=f"An unhandled error has occurred: '{error.__class__.__name__}: {str(error)}'"
    )

    # Send a message to the user.
    await context.send_activity(messages.SOMETHING_WENT_WRONG)

    # Send a trace activity if we're talking to the Bot Framework Emulator.
    if context.activity.channel_id == "emulator":
        # Create a trace activity that contains the error object.
        trace_activity = Activity(
            label="TurnError",
            name="on_turn_error Trace",
            timestamp=datetime.utcnow(),
            type=ActivityTypes.trace,
            value=f"{error}",
            value_type="https://www.botframework.com/schemas/error",
        )
        # Send a trace activity, which will be displayed in the Bot Framework Emulator.
        await context.send_activity(trace_activity)

    # Clear out state.
    nonlocal self
    await self._conversation_state.delete(context)
def fit(self, data):
    assert isinstance(
        data, self.datatype
    ), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}'
    assert len(
        list(data)
    ) >= 2, "BayesianNet requires at least 2 attributes (i.e., columns) in the dataset."

    LOGGER.debug(
        f'Start training BayesianNet on data of shape {data.shape}...')
    if self.trained:
        self.trained = False
        self.DataDescriber = None
        self.bayesian_network = None
        self.conditional_probabilities = None

    self.DataDescriber = DataDescriber(self.metadata, self.histogram_bins,
                                       self.infer_ranges)
    self.DataDescriber.describe(data)

    encoded_df = DataFrame(columns=self.DataDescriber.attr_names)
    for attr_name, column in self.DataDescriber.attr_dict.items():
        encoded_df[attr_name] = column.encode_values_into_bin_idx()

    self.bayesian_network = self._greedy_bayes_linear(encoded_df, self.degree)
    self.conditional_probabilities = self._construct_conditional_probabilities(
        self.bayesian_network, encoded_df)

    LOGGER.debug(f'Finished training Bayesian net')
    self.trained = True
async def action_step(
        self, step_context: WaterfallStepContext) -> DialogTurnResult:
    LOGGER.debug(msg="Main dialog action step")

    if not self.luis_recognizer.is_configured:
        # LUIS is not configured, so we just use the choice step.
        return await self._handle_action(
            step_context=step_context, action=step_context.result.value)

    # Call LUIS and gather any potential delivery details.
    # (Note the TurnContext has the response to the prompt.)
    intent, luis_result = await self.luis_recognizer.recognize(
        step_context.context)

    action: str = Action.UNKNOWN.value
    if intent == Intent.SALUTATION.value:
        action = Action.SALUTATION_ACKNOWLEDGEMENT.value
    elif intent == Intent.SALUTATION_ACKNOWLEDGEMENT.value:
        action = Action.ACTION_PROMPT.value
    elif intent == Intent.SCHEDULE_DELIVERY.value:
        action = Action.SCHEDULE_DELIVERY.value
    elif intent == Intent.LIST_DELIVERIES.value:
        action = Action.LIST_DELIVERIES.value
    elif intent == Intent.CANCEL.value:
        action = Action.EXIT.value

    return await self._handle_action(step_context=step_context, action=action)
async def time_step(self, step_context: WaterfallStepContext) -> DialogTurnResult:
    """
    If a delivery time has not been provided, prompt for one.

    :param step_context:
    :return DialogTurnResult:
    """
    LOGGER.debug(msg=f"{CreateDeliveryDialog.__name__}: time step.")

    # Set the delivery destination to what they entered in response to the destination prompt.
    delivery: Delivery = step_context.values[Keys.DELIVERY_DIALOG_STATE.value]

    # Capture the response from the previous step.
    delivery.destination = step_context.result

    if delivery.time is None:
        message_text = messages.DELIVERY_TIME_PROMPT % (delivery.item,
                                                        delivery.destination)
        prompt_options = PromptOptions(
            prompt=MessageFactory.text(
                message_text, message_text, InputHints.expecting_input
            ),
            retry_prompt=MessageFactory.text(messages.VALID_DELIVERY_TIME_PROMPT),
        )
        return await step_context.prompt(DateTimePrompt.__name__, prompt_options)

    return await step_context.next(delivery.time)
def generate_samples(self, nsamples):
    """Generate random samples from the fitted Gaussian distribution."""
    assert self.trained, "Model must first be fitted to some data."

    LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')
    synthetic_data = self.synthesiser.sample(nsamples)

    return synthetic_data
async def on_continue_dialog(self, inner_dc: DialogContext) -> DialogTurnResult:
    LOGGER.debug(msg=f"{CancelAndHelpDialog.__name__}: on_continue_dialog")

    result = await self.interrupt(inner_dc)
    if result is not None:
        return result

    return await super(CancelAndHelpDialog, self).on_continue_dialog(inner_dc)
def fit(self, data):
    """Train a generative adversarial network on tabular data.

    Input data is assumed to be of shape (n_samples, n_features).
    See https://github.com/DAI-Lab/SDGym for details."""
    assert isinstance(
        data, self.datatype
    ), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}'

    LOGGER.debug(
        f'Start fitting {self.__class__.__name__} to data of shape {data.shape}...')
    self.synthesiser.fit(data, self.metadata)

    LOGGER.debug(f'Finished fitting')
    self.trained = True
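# A minimal usage sketch for the fit()/generate_samples() wrapper interface
# above; `gen_model` can be any of the wrappers instantiated in the evaluation
# scripts (e.g. CTGAN(metadata, *params)), and `raw` a pandas DataFrame of the
# training data. Names and parameter values here are illustrative only.
from pandas import DataFrame

def synthesise(gen_model, raw: DataFrame, size: int) -> DataFrame:
    gen_model.fit(raw)                       # train the generative model on the raw table
    return gen_model.generate_samples(size)  # draw `size` synthetic rows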
async def on_members_added_activity(self, members_added: List[ChannelAccount],
                                    turn_context: TurnContext):
    for member in members_added:
        if member.id != turn_context.activity.recipient.id:
            await turn_context.send_activity(
                f"{messages.HELLO} {member.name}! {messages.BOT_INTRO_TEXT}."
            )
            LOGGER.debug(f"Welcome message sent to member='{member.id}'")
            return await DialogHelper.run_dialog(
                self.dialog,
                turn_context,
                self.conversation_state.create_property(DIALOG_STATE),
            )
async def acknowledgement_step(self, step_context: WaterfallStepContext) -> DialogTurnResult:
    LOGGER.debug(msg=f"{CreateDeliveryDialog.__name__}: acknowledgement step.")

    await self._create_delivery(step_context)

    if step_context.result:
        await step_context.context.send_activity(
            MessageFactory.text(messages.GOODBYE)
        )
        return await step_context.end_dialog()
    else:
        await step_context.context.send_activity(
            MessageFactory.text(messages.HAPPY_TO_HELP)
        )
        return await step_context.begin_dialog(self.id)
def generate_samples(self, nsamples):
    assert self.trained, "Model must be fitted to some data first"

    LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')
    synthetic_dataset = DataFrame(columns=self.DataDescriber.attr_names)
    for attr_name, Attr in self.DataDescriber.attr_dict.items():
        binning_indices = Attr.sample_binning_indices_in_independent_attribute_mode(
            nsamples)
        synthetic_dataset[attr_name] = Attr.sample_values_from_binning_indices(
            binning_indices)

    LOGGER.debug(f'Generated synthetic dataset of size {nsamples}')
    return synthetic_dataset
def fit(self, data):
    assert isinstance(
        data, self.datatype
    ), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}'

    LOGGER.debug(
        f'Start fitting IndependentHistogram model to data of shape {data.shape}...'
    )
    if self.trained:
        self.trained = False
        self.DataDescriber = None

    self.DataDescriber = DataDescriber(self.metadata, self.histogram_bins,
                                       self.infer_ranges)
    self.DataDescriber.describe(data)

    LOGGER.debug(f'Finished fitting IndependentHistogram')
    self.trained = True
async def intro_step(
        self, step_context: WaterfallStepContext) -> DialogTurnResult:
    LOGGER.debug(msg=f"Main dialog intro step")

    prompt_options = PromptOptions(
        prompt=MessageFactory.text(""),
        choices=[
            Choice(Action.SCHEDULE_DELIVERY.value),
            Choice(Action.LIST_DELIVERIES.value),
            Choice(Action.EXIT.value)
        ])

    if not self.luis_recognizer.is_configured or self.luis_recognizer.luis_is_disabled:
        return await self._handle_luis_not_configured(step_context, prompt_options)

    return await step_context.prompt(TextPrompt.__name__, prompt_options)
async def messages(req: Request) -> Response:
    # Main bot message handler.
    if JSON_CONTENT_TYPE in req.headers[CONTENT_TYPE_HEADER]:
        body = await req.json()
    else:
        return Response(status=415)

    activity = Activity().deserialize(body)
    auth_header = (req.headers[AUTHORIZATION_HEADER]
                   if AUTHORIZATION_HEADER in req.headers else "")
    try:
        await ERROR_ADAPTER.process_activity(activity, auth_header, BOT.on_turn)
        return Response(status=201)
    except Exception as exception:
        LOGGER.error(msg=f"An unexpected exception={exception} has occurred")
        raise exception
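# A minimal wiring sketch, assuming this handler is served with aiohttp as in
# the Bot Framework Python samples; the APP name, route and port below are
# illustrative, not taken from this repository.
from aiohttp import web

APP = web.Application()
APP.router.add_post("/api/messages", messages)

if __name__ == "__main__":
    web.run_app(APP, host="localhost", port=3978)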
def __init__(self, configuration: DefaultConfig):
    self._recognizer = None

    self.luis_is_disabled = configuration.LUIS_IS_DISABLED
    self.luis_is_configured = (configuration.LUIS_APP_ID
                               and configuration.LUIS_API_KEY
                               and configuration.LUIS_API_HOST_NAME)
    if self.luis_is_configured:
        # Set the recognizer options depending on which endpoint version you
        # want to use, e.g. v2 or v3.
        luis_application = LuisApplication(
            configuration.LUIS_APP_ID,
            configuration.LUIS_API_KEY,
            "https://" + configuration.LUIS_API_HOST_NAME,
        )
        self._recognizer = LuisRecognizer(luis_application)
        self._recognizer.luis_trace_label = DeliverySchedulingRecognizer.__name__
        LOGGER.debug(msg="LUIS application configured and initialized")
def train(self, data):
    """
    Train a classifier to reconstruct an unknown sensitive label from a vector of known attributes.

    :param data: type(DataFrame) A dataset of shape (n, k)
    """
    features = self._encode_data(data.drop(self.sensitiveAttribute, axis=1))
    labels = data[self.sensitiveAttribute].apply(lambda x: self.labels[x]).values

    # Feature normalisation.
    self.scaleFactor = mean(features, axis=0)
    featuresScaled = features - self.scaleFactor

    # Fit the classification model on the normalised features.
    self.PredictionModel.fit(featuresScaled, labels)

    LOGGER.debug('Finished training regression model')
    self.trained = True
def __init__(self, datatype, metadata, nbins=10, quids=None):
    assert datatype in [DataFrame], 'Unknown data type {}'.format(datatype)
    self.datatype = datatype
    self.nfeatures = 0

    self.cat_attributes = []
    self.num_attributes = []

    self.histogram_bins = {}
    self.category_codes = {}

    if quids is None:
        quids = []

    for cdict in metadata['columns']:
        attr_name = cdict['name']
        dtype = cdict['type']

        if dtype == FLOAT or dtype == INTEGER:
            if attr_name not in quids:
                self.num_attributes.append(attr_name)
                self.histogram_bins[attr_name] = linspace(
                    cdict['min'], cdict['max'], nbins + 1)
                self.nfeatures += nbins
            else:
                self.cat_attributes.append(attr_name)
                cat_bins = cdict['bins']
                cat_labels = [
                    f'({cat_bins[i]},{cat_bins[i + 1]}]'
                    for i in range(len(cat_bins) - 1)
                ]
                self.category_codes[attr_name] = cat_labels
                self.nfeatures += len(cat_labels)

        elif dtype == CATEGORICAL or dtype == ORDINAL:
            self.cat_attributes.append(attr_name)
            self.category_codes[attr_name] = cdict['i2s']
            self.nfeatures += len(cdict['i2s'])

    LOGGER.debug(f'Feature set will have length {self.nfeatures}')

    self.__name__ = 'Histogram'
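# An illustrative, standalone sketch of the feature extraction this
# constructor prepares for: one normalised histogram per numerical attribute
# and one category-frequency block per categorical attribute, concatenated
# into a vector of length nfeatures. This is a sketch of the idea under those
# assumptions, not the class's actual extract() implementation.
from numpy import concatenate, histogram
from pandas import DataFrame

def histogram_features(data: DataFrame, histogram_bins: dict, category_codes: dict):
    blocks = []
    for attr, bins in histogram_bins.items():
        counts, _ = histogram(data[attr], bins=bins)
        blocks.append(counts / len(data))  # normalised bin frequencies
    for attr, categories in category_codes.items():
        freqs = data[attr].value_counts(normalize=True)
        blocks.append(freqs.reindex(categories, fill_value=0).values)
    return concatenate(blocks)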
def __init__(
    self,
    conversation_state: ConversationState,
    dialog: Dialog,
    user_state: UserState,
):
    if conversation_state is None:
        error = "Missing parameter. conversation_state is required"
        LOGGER.error(msg=error)
        raise Exception(f"[DeliveryBot]: {error}")
    if user_state is None:
        error = "Missing parameter. user_state is required"
        LOGGER.error(msg=error)
        raise Exception(f"[DeliveryBot]: {error}")
    if dialog is None:
        error = "Missing parameter. dialog is required"
        LOGGER.error(msg=error)
        raise Exception(f"[DeliveryBot]: {error}")

    self.conversation_state = conversation_state
    self.dialog = dialog
    self.user_state = user_state
    self.user_state_accessor = self.user_state.create_property(DELIVERIES_HISTORY)
async def interrupt(self, inner_dc: DialogContext) -> DialogTurnResult:
    LOGGER.debug(msg=f"{CancelAndHelpDialog.__name__}: interrupt")

    if inner_dc.context.activity.type == ActivityTypes.message:
        text = inner_dc.context.activity.text.lower()
        message = Activity(
            type=ActivityTypes.message,
            attachments=[CardFactory.adaptive_card(HelpCard)])

        if text in (Prompts.HELP.value, Prompts.QUESTION_MARK.value):
            await inner_dc.context.send_activity(message)
            return DialogTurnResult(DialogTurnStatus.Waiting)

        if text in (Prompts.CANCEL.value, Prompts.END.value, Prompts.QUIT.value):
            cancel_message = MessageFactory.text(
                messages.CANCELLED, messages.CANCELLED, InputHints.ignoring_input)
            await inner_dc.context.send_activity(cancel_message)
            await inner_dc.cancel_all_dialogs()
            return await inner_dc.replace_dialog(self.initial_dialog_id)

    return None
def generate_samples(self, nsamples):
    LOGGER.debug(f'Generate synthetic dataset of size {nsamples}')
    assert self.trained, "Model must be fitted to some real data first"

    synthetic_data = DataFrame(columns=self.DataDescriber.attr_names)

    # Get samples for attributes modelled in the Bayesian net.
    encoded_dataset = self._generate_encoded_dataset(nsamples)

    for attr in self.DataDescriber.attr_names:
        column = self.DataDescriber.attr_dict[attr]
        if attr in encoded_dataset:
            synthetic_data[attr] = column.sample_values_from_binning_indices(
                encoded_dataset[attr])
        else:
            # For attributes not in the BN, use independent attribute mode.
            binning_indices = column.sample_binning_indices_in_independent_attribute_mode(
                nsamples)
            synthetic_data[attr] = column.sample_values_from_binning_indices(
                binning_indices)

    return synthetic_data
async def item_step(self, step_context: WaterfallStepContext) -> DialogTurnResult:
    """
    If a delivery item has not been provided, prompt for one.

    :param step_context:
    :return DialogTurnResult:
    """
    LOGGER.debug(msg=f"{CreateDeliveryDialog.__name__}: item step.")

    # Create an object in which to collect the delivery information within the dialog.
    step_context.values[Keys.DELIVERY_DIALOG_STATE.value] = Delivery()
    delivery: Delivery = step_context.values[Keys.DELIVERY_DIALOG_STATE.value]

    if delivery.item is None:
        prompt_options = PromptOptions(
            prompt=MessageFactory.text(
                messages.DELIVERY_ITEM_PROMPT,
                messages.DELIVERY_ITEM_PROMPT,
                InputHints.expecting_input
            )
        )
        return await step_context.prompt(TextPrompt.__name__, prompt_options)

    return await step_context.next(delivery.item)
async def salute(self, step_context: WaterfallStepContext) -> DialogTurnResult:
    LOGGER.debug(msg=f"{SalutationDialog.__name__}: salute")

    dialog_options: dict = step_context.options if step_context.options is not None else {}
    salutation_phase: SalutationPhase = dialog_options.get(
        Keys.SALUTATION_PHASE.value, SalutationPhase.INITIATE)

    message_text = ""
    if salutation_phase == SalutationPhase.INITIATE:
        message_text = f"{messages.HELLO}! {messages.HOW_ARE_YOU_DOING}"
    elif salutation_phase == SalutationPhase.ACKNOWLEDGE:
        message_text = f"{messages.SALUTATION_ACKNOWLEDGEMENT}. {messages.HOW_CAN_I_HELP}"
    elif salutation_phase == SalutationPhase.PROMPT:
        message_text = f"{messages.HOW_CAN_I_HELP}"

    await step_context.context.send_activity(
        MessageFactory.text(message_text, message_text, InputHints.ignoring_input))

    return await step_context.end_dialog(self.id)
async def execute_luis_query(
        luis_recognizer: Recognizer, turn_context: TurnContext) -> (Intent, object):
    """
    Returns an object with pre-formatted LUIS results for the bot's dialogs to consume.
    """
    result = None
    intent = None
    try:
        LOGGER.debug(msg="Executing LUIS query")
        recognizer_result = await luis_recognizer.recognize(turn_context)
        intent = get_intent(recognizer_result=recognizer_result)
        LOGGER.debug(msg="LUIS query execution succeeded")
    except Exception as exception:
        LOGGER.error(
            msg=f"Executing LUIS query failed with an error={exception}")

    return intent, result
def main():
    argparser = ArgumentParser()
    datasource = argparser.add_mutually_exclusive_group()
    datasource.add_argument('--s3name', '-S3', type=str,
                            choices=['adult', 'census', 'credit', 'alarm', 'insurance'],
                            help='Name of the dataset to run on')
    datasource.add_argument('--datapath', '-D', type=str,
                            help='Relative path to cwd of a local data file')
    argparser.add_argument('--runconfig', '-RC', default='runconfig_mia.json', type=str,
                           help='Path relative to cwd of runconfig file')
    argparser.add_argument('--outdir', '-O', default='outputs/test', type=str,
                           help='Path relative to cwd for storing output files')
    args = argparser.parse_args()

    seed(SEED)

    # Load runconfig
    with open(path.join(cwd, args.runconfig)) as f:
        runconfig = json.load(f)
    print('Runconfig:')
    print(runconfig)

    # Load data
    if args.s3name is not None:
        rawPop, metadata = load_s3_data_as_df(args.s3name)
        dname = args.s3name
    else:
        rawPop, metadata = load_local_data_as_df(path.join(cwd, args.datapath))
        dname = args.datapath.split('/')[-1]
    print(f'Loaded data {dname}:')
    print(rawPop.info())

    # Make sure outdir exists
    if not path.isdir(args.outdir):
        mkdir(args.outdir)

    ########################
    #### GAME INPUTS #######
    ########################
    # Train test split
    rawTrain = rawPop.query(runconfig['dataFilter']['train'])
    rawTest = rawPop.query(runconfig['dataFilter']['test'])

    # Pick targets
    targetIDs = choice(list(rawTrain.index), size=runconfig['nTargets'],
                       replace=False).tolist()

    # If specified: Add specific target records
    if runconfig['Targets'] is not None:
        targetIDs.extend(runconfig['Targets'])

    targets = rawTrain.loc[targetIDs, :]

    # Drop targets from population
    rawTrainWoTargets = rawTrain.drop(targetIDs)

    # Get test target records
    testRecordIDs = choice(list(rawTest.index), size=runconfig['nTargets'],
                           replace=False).tolist()

    # If specified: Add specific target records
    if runconfig['TestRecords'] is not None:
        testRecordIDs.extend(runconfig['TestRecords'])

    testRecords = rawTest.loc[testRecordIDs, :]

    # List of candidate generative models to evaluate
    gmList = []
    if 'generativeModels' in runconfig.keys():
        for gm, paramsList in runconfig['generativeModels'].items():
            if gm == 'IndependentHistogram':
                for params in paramsList:
                    gmList.append(IndependentHistogram(metadata, *params))
            elif gm == 'BayesianNet':
                for params in paramsList:
                    gmList.append(BayesianNet(metadata, *params))
            elif gm == 'PrivBayes':
                for params in paramsList:
                    gmList.append(PrivBayes(metadata, *params))
            elif gm == 'CTGAN':
                for params in paramsList:
                    gmList.append(CTGAN(metadata, *params))
            elif gm == 'PATEGAN':
                for params in paramsList:
                    gmList.append(PATEGAN(metadata, *params))
            else:
                raise ValueError(f'Unknown GM {gm}')

    # List of candidate sanitisation techniques to evaluate
    sanList = []
    if 'sanitisationTechniques' in runconfig.keys():
        for name, paramsList in runconfig['sanitisationTechniques'].items():
            if name == 'SanitiserNHS':
                for params in paramsList:
                    sanList.append(SanitiserNHS(metadata, *params))
            else:
                raise ValueError(f'Unknown sanitisation technique {name}')

    utilityTasks = []
    for taskName, paramsList in runconfig['utilityTasks'].items():
        if taskName == 'RandForestClass':
            for params in paramsList:
                utilityTasks.append(RandForestClassTask(metadata, *params))
        elif taskName == 'LogRegClass':
            for params in paramsList:
                utilityTasks.append(LogRegClassTask(metadata, *params))
        elif taskName == 'LinReg':
            for params in paramsList:
                utilityTasks.append(LinRegTask(metadata, *params))

    ##################################
    ######### EVALUATION #############
    ##################################
    resultsTargetUtility = {
        ut.__name__: {gm.__name__: {} for gm in gmList + sanList}
        for ut in utilityTasks
    }
    resultsAggUtility = {
        ut.__name__: {
            gm.__name__: {
                'TargetID': [],
                'Accuracy': []
            }
            for gm in gmList + sanList
        }
        for ut in utilityTasks
    }

    # Add entry for raw
    for ut in utilityTasks:
        resultsTargetUtility[ut.__name__]['Raw'] = {}
        resultsAggUtility[ut.__name__]['Raw'] = {
            'TargetID': [],
            'Accuracy': []
        }

    print('\n---- Start the game ----')
    for nr in range(runconfig['nIter']):
        print(f'\n--- Game iteration {nr + 1} ---')
        # Draw a raw dataset
        rIdx = choice(list(rawTrainWoTargets.index), size=runconfig['sizeRawT'],
                      replace=False).tolist()
        rawTout = rawTrain.loc[rIdx]

        LOGGER.info('Start: Utility evaluation on Raw...')
        # Get utility from raw without targets
        for ut in utilityTasks:
            resultsTargetUtility[ut.__name__]['Raw'][nr] = {}

            predErrorTargets = []
            predErrorAggr = []
            for _ in range(runconfig['nSynT']):
                ut.train(rawTout)
                predErrorTargets.append(ut.evaluate(testRecords))
                predErrorAggr.append(ut.evaluate(rawTest))

            resultsTargetUtility[ut.__name__]['Raw'][nr]['OUT'] = {
                'TestRecordID': testRecordIDs,
                'Accuracy': list(mean(predErrorTargets, axis=0))
            }
            resultsAggUtility[ut.__name__]['Raw']['TargetID'].append('OUT')
            resultsAggUtility[ut.__name__]['Raw']['Accuracy'].append(
                mean(predErrorAggr))

        # Get utility from raw with each target
        for tid in targetIDs:
            target = targets.loc[[tid]]
            rawIn = rawTout.append(target)

            for ut in utilityTasks:
                predErrorTargets = []
                predErrorAggr = []
                for _ in range(runconfig['nSynT']):
                    ut.train(rawIn)
                    predErrorTargets.append(ut.evaluate(testRecords))
                    predErrorAggr.append(ut.evaluate(rawTest))

                resultsTargetUtility[ut.__name__]['Raw'][nr][tid] = {
                    'TestRecordID': testRecordIDs,
                    'Accuracy': list(mean(predErrorTargets, axis=0))
                }
                resultsAggUtility[ut.__name__]['Raw']['TargetID'].append(tid)
                resultsAggUtility[ut.__name__]['Raw']['Accuracy'].append(
                    mean(predErrorAggr))

        LOGGER.info('Finished: Utility evaluation on Raw.')

        for GenModel in gmList:
            LOGGER.info(f'Start: Evaluation for model {GenModel.__name__}...')

            GenModel.fit(rawTout)
            synTwithoutTarget = [
                GenModel.generate_samples(runconfig['sizeSynT'])
                for _ in range(runconfig['nSynT'])
            ]

            # Util evaluation for synthetic without all targets
            for ut in utilityTasks:
                resultsTargetUtility[ut.__name__][GenModel.__name__][nr] = {}

                predErrorTargets = []
                predErrorAggr = []
                for syn in synTwithoutTarget:
                    ut.train(syn)
                    predErrorTargets.append(ut.evaluate(testRecords))
                    predErrorAggr.append(ut.evaluate(rawTest))

                resultsTargetUtility[ut.__name__][GenModel.__name__][nr]['OUT'] = {
                    'TestRecordID': testRecordIDs,
                    'Accuracy': list(mean(predErrorTargets, axis=0))
                }
                resultsAggUtility[ut.__name__][GenModel.__name__]['TargetID'].append('OUT')
                resultsAggUtility[ut.__name__][GenModel.__name__]['Accuracy'].append(
                    mean(predErrorAggr))

            for tid in targetIDs:
                LOGGER.info(f'Target: {tid}')
                target = targets.loc[[tid]]
                rawTin = rawTout.append(target)

                GenModel.fit(rawTin)
                synTwithTarget = [
                    GenModel.generate_samples(runconfig['sizeSynT'])
                    for _ in range(runconfig['nSynT'])
                ]

                # Util evaluation for synthetic with this target
                for ut in utilityTasks:
                    predErrorTargets = []
                    predErrorAggr = []
                    for syn in synTwithTarget:
                        ut.train(syn)
                        predErrorTargets.append(ut.evaluate(testRecords))
                        predErrorAggr.append(ut.evaluate(rawTest))

                    resultsTargetUtility[ut.__name__][GenModel.__name__][nr][tid] = {
                        'TestRecordID': testRecordIDs,
                        'Accuracy': list(mean(predErrorTargets, axis=0))
                    }
                    resultsAggUtility[ut.__name__][GenModel.__name__]['TargetID'].append(tid)
                    resultsAggUtility[ut.__name__][GenModel.__name__]['Accuracy'].append(
                        mean(predErrorAggr))

            del synTwithoutTarget, synTwithTarget

            LOGGER.info(f'Finished: Evaluation for model {GenModel.__name__}.')

        for San in sanList:
            LOGGER.info(f'Start: Evaluation for sanitiser {San.__name__}...')

            sanOut = San.sanitise(rawTout)

            for ut in utilityTasks:
                resultsTargetUtility[ut.__name__][San.__name__][nr] = {}

                predErrorTargets = []
                predErrorAggr = []
                for _ in range(runconfig['nSynT']):
                    ut.train(sanOut)
                    predErrorTargets.append(ut.evaluate(testRecords))
                    predErrorAggr.append(ut.evaluate(rawTest))

                resultsTargetUtility[ut.__name__][San.__name__][nr]['OUT'] = {
                    'TestRecordID': testRecordIDs,
                    'Accuracy': list(mean(predErrorTargets, axis=0))
                }
                resultsAggUtility[ut.__name__][San.__name__]['TargetID'].append('OUT')
                resultsAggUtility[ut.__name__][San.__name__]['Accuracy'].append(
                    mean(predErrorAggr))

            for tid in targetIDs:
                LOGGER.info(f'Target: {tid}')
                target = targets.loc[[tid]]
                rawTin = rawTout.append(target)
                sanIn = San.sanitise(rawTin)

                for ut in utilityTasks:
                    predErrorTargets = []
                    predErrorAggr = []
                    for _ in range(runconfig['nSynT']):
                        ut.train(sanIn)
                        predErrorTargets.append(ut.evaluate(testRecords))
                        predErrorAggr.append(ut.evaluate(rawTest))

                    resultsTargetUtility[ut.__name__][San.__name__][nr][tid] = {
                        'TestRecordID': testRecordIDs,
                        'Accuracy': list(mean(predErrorTargets, axis=0))
                    }
                    resultsAggUtility[ut.__name__][San.__name__]['TargetID'].append(tid)
                    resultsAggUtility[ut.__name__][San.__name__]['Accuracy'].append(
                        mean(predErrorAggr))

            del sanOut, sanIn

            LOGGER.info(f'Finished: Evaluation for model {San.__name__}.')

    outfile = f"ResultsUtilTargets_{dname}"
    LOGGER.info(
        f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}")
    with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f:
        json.dump(resultsTargetUtility, f, indent=2, default=json_numpy_serialzer)

    outfile = f"ResultsUtilAgg_{dname}"
    LOGGER.info(
        f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}")
    with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f:
        json.dump(resultsAggUtility, f, indent=2, default=json_numpy_serialzer)
def main():
    argparser = ArgumentParser()
    datasource = argparser.add_mutually_exclusive_group()
    datasource.add_argument('--s3name', '-S3', type=str,
                            choices=['adult', 'census', 'credit', 'alarm', 'insurance'],
                            help='Name of the dataset to run on')
    datasource.add_argument('--datapath', '-D', type=str,
                            help='Relative path to cwd of a local data file')
    argparser.add_argument('--runconfig', '-RC', default='runconfig_mia.json', type=str,
                           help='Path relative to cwd of runconfig file')
    argparser.add_argument('--outdir', '-O', default='tests', type=str,
                           help='Path relative to cwd for storing output files')
    args = argparser.parse_args()

    # Load runconfig
    with open(path.join(cwd, args.runconfig)) as f:
        runconfig = json.load(f)
    print('Runconfig:')
    print(runconfig)

    # Load data
    if args.s3name is not None:
        rawPop, metadata = load_s3_data_as_df(args.s3name)
        dname = args.s3name
    else:
        rawPop, metadata = load_local_data_as_df(path.join(cwd, args.datapath))
        dname = args.datapath.split('/')[-1]
    print(f'Loaded data {dname}:')
    print(rawPop.info())

    # Make sure outdir exists
    if not path.isdir(args.outdir):
        mkdir(args.outdir)

    seed(SEED)

    ########################
    #### GAME INPUTS #######
    ########################
    # Pick targets
    targetIDs = choice(list(rawPop.index), size=runconfig['nTargets'],
                       replace=False).tolist()

    # If specified: Add specific target records
    if runconfig['Targets'] is not None:
        targetIDs.extend(runconfig['Targets'])

    targets = rawPop.loc[targetIDs, :]

    # Drop targets from population
    rawPopDropTargets = rawPop.drop(targetIDs)

    # List of candidate generative models to evaluate
    gmList = []
    if 'generativeModels' in runconfig.keys():
        for gm, paramsList in runconfig['generativeModels'].items():
            if gm == 'IndependentHistogram':
                for params in paramsList:
                    gmList.append(IndependentHistogram(metadata, *params))
            elif gm == 'BayesianNet':
                for params in paramsList:
                    gmList.append(BayesianNet(metadata, *params))
            elif gm == 'PrivBayes':
                for params in paramsList:
                    gmList.append(PrivBayes(metadata, *params))
            elif gm == 'CTGAN':
                for params in paramsList:
                    gmList.append(CTGAN(metadata, *params))
            elif gm == 'PATEGAN':
                for params in paramsList:
                    gmList.append(PATEGAN(metadata, *params))
            else:
                raise ValueError(f'Unknown GM {gm}')

    # List of candidate sanitisation techniques to evaluate
    sanList = []
    if 'sanitisationTechniques' in runconfig.keys():
        for name, paramsList in runconfig['sanitisationTechniques'].items():
            if name == 'SanitiserNHS':
                for params in paramsList:
                    sanList.append(SanitiserNHS(metadata, *params))
            else:
                raise ValueError(f'Unknown sanitisation technique {name}')

    ##################################
    ######### EVALUATION #############
    ##################################
    resultsTargetPrivacy = {
        tid: {
            sa: {gm.__name__: {} for gm in gmList + sanList}
            for sa in runconfig['sensitiveAttributes']
        }
        for tid in targetIDs
    }
    # Add entry for raw
    for tid in targetIDs:
        for sa in runconfig['sensitiveAttributes']:
            resultsTargetPrivacy[tid][sa]['Raw'] = {}

    print('\n---- Start the game ----')
    for nr in range(runconfig['nIter']):
        print(f'\n--- Game iteration {nr + 1} ---')
        # Draw a raw dataset
        rIdx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawT'],
                      replace=False).tolist()
        rawTout = rawPopDropTargets.loc[rIdx]

        ###############
        ## ATTACKS ####
        ###############
        attacks = {}
        for sa, atype in runconfig['sensitiveAttributes'].items():
            if atype == 'LinReg':
                attacks[sa] = LinRegAttack(sensitiveAttribute=sa, metadata=metadata)
            elif atype == 'Classification':
                attacks[sa] = RandForestAttack(sensitiveAttribute=sa, metadata=metadata)

        #### Assess advantage raw
        for sa, Attack in attacks.items():
            Attack.train(rawTout)

            for tid in targetIDs:
                target = targets.loc[[tid]]
                targetAux = target.loc[[tid], Attack.knownAttributes]
                targetSecret = target.loc[tid, Attack.sensitiveAttribute]

                guess = Attack.attack(targetAux, attemptLinkage=True, data=rawTout)
                pCorrect = Attack.get_likelihood(targetAux, targetSecret,
                                                 attemptLinkage=True, data=rawTout)

                resultsTargetPrivacy[tid][sa]['Raw'][nr] = {
                    'AttackerGuess': [guess],
                    'ProbCorrect': [pCorrect],
                    'TargetPresence': [LABEL_OUT]
                }

        for tid in targetIDs:
            target = targets.loc[[tid]]
            rawTin = rawTout.append(target)

            for sa, Attack in attacks.items():
                targetAux = target.loc[[tid], Attack.knownAttributes]
                targetSecret = target.loc[tid, Attack.sensitiveAttribute]

                guess = Attack.attack(targetAux, attemptLinkage=True, data=rawTin)
                pCorrect = Attack.get_likelihood(targetAux, targetSecret,
                                                 attemptLinkage=True, data=rawTin)

                resultsTargetPrivacy[tid][sa]['Raw'][nr]['AttackerGuess'].append(guess)
                resultsTargetPrivacy[tid][sa]['Raw'][nr]['ProbCorrect'].append(pCorrect)
                resultsTargetPrivacy[tid][sa]['Raw'][nr]['TargetPresence'].append(LABEL_IN)

        ##### Assess advantage Syn
        for GenModel in gmList:
            LOGGER.info(f'Start: Evaluation for model {GenModel.__name__}...')

            GenModel.fit(rawTout)
            synTwithoutTarget = [
                GenModel.generate_samples(runconfig['sizeSynT'])
                for _ in range(runconfig['nSynT'])
            ]

            for sa, Attack in attacks.items():
                for tid in targetIDs:
                    resultsTargetPrivacy[tid][sa][GenModel.__name__][nr] = {
                        'AttackerGuess': [],
                        'ProbCorrect': [],
                        'TargetPresence': [LABEL_OUT for _ in range(runconfig['nSynT'])]
                    }

                for syn in synTwithoutTarget:
                    Attack.train(syn)

                    for tid in targetIDs:
                        target = targets.loc[[tid]]
                        targetAux = target.loc[[tid], Attack.knownAttributes]
                        targetSecret = target.loc[tid, Attack.sensitiveAttribute]

                        guess = Attack.attack(targetAux)
                        pCorrect = Attack.get_likelihood(targetAux, targetSecret)

                        resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['AttackerGuess'].append(guess)
                        resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['ProbCorrect'].append(pCorrect)

            del synTwithoutTarget

            for tid in targetIDs:
                LOGGER.info(f'Target: {tid}')
                target = targets.loc[[tid]]
                rawTin = rawTout.append(target)

                GenModel.fit(rawTin)
                synTwithTarget = [
                    GenModel.generate_samples(runconfig['sizeSynT'])
                    for _ in range(runconfig['nSynT'])
                ]

                for sa, Attack in attacks.items():
                    targetAux = target.loc[[tid], Attack.knownAttributes]
                    targetSecret = target.loc[tid, Attack.sensitiveAttribute]

                    for syn in synTwithTarget:
                        Attack.train(syn)

                        guess = Attack.attack(targetAux)
                        pCorrect = Attack.get_likelihood(targetAux, targetSecret)

                        resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['AttackerGuess'].append(guess)
                        resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['ProbCorrect'].append(pCorrect)
                        resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['TargetPresence'].append(LABEL_IN)

            del synTwithTarget

        for San in sanList:
            LOGGER.info(f'Start: Evaluation for sanitiser {San.__name__}...')

            attacks = {}
            for sa, atype in runconfig['sensitiveAttributes'].items():
                if atype == 'LinReg':
                    attacks[sa] = LinRegAttack(sensitiveAttribute=sa,
                                               metadata=metadata, quids=San.quids)
                elif atype == 'Classification':
                    attacks[sa] = RandForestAttack(sensitiveAttribute=sa,
                                                   metadata=metadata, quids=San.quids)

            sanOut = San.sanitise(rawTout)

            for sa, Attack in attacks.items():
                Attack.train(sanOut)

                for tid in targetIDs:
                    target = targets.loc[[tid]]
                    targetAux = target.loc[[tid], Attack.knownAttributes]
                    targetSecret = target.loc[tid, Attack.sensitiveAttribute]

                    guess = Attack.attack(targetAux, attemptLinkage=True, data=sanOut)
                    pCorrect = Attack.get_likelihood(targetAux, targetSecret,
                                                     attemptLinkage=True, data=sanOut)

                    resultsTargetPrivacy[tid][sa][San.__name__][nr] = {
                        'AttackerGuess': [guess],
                        'ProbCorrect': [pCorrect],
                        'TargetPresence': [LABEL_OUT]
                    }

            for tid in targetIDs:
                LOGGER.info(f'Target: {tid}')
                target = targets.loc[[tid]]
                rawTin = rawTout.append(target)
                sanIn = San.sanitise(rawTin)

                for sa, Attack in attacks.items():
                    targetAux = target.loc[[tid], Attack.knownAttributes]
                    targetSecret = target.loc[tid, Attack.sensitiveAttribute]

                    Attack.train(sanIn)

                    guess = Attack.attack(targetAux, attemptLinkage=True, data=sanIn)
                    pCorrect = Attack.get_likelihood(targetAux, targetSecret,
                                                     attemptLinkage=True, data=sanIn)

                    resultsTargetPrivacy[tid][sa][San.__name__][nr]['AttackerGuess'].append(guess)
                    resultsTargetPrivacy[tid][sa][San.__name__][nr]['ProbCorrect'].append(pCorrect)
                    resultsTargetPrivacy[tid][sa][San.__name__][nr]['TargetPresence'].append(LABEL_IN)

    outfile = f"ResultsMLEAI_{dname}"
    LOGGER.info(
        f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}")
    with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f:
        json.dump(resultsTargetPrivacy, f, indent=2, default=json_numpy_serialzer)
def main():
    argparser = ArgumentParser()
    datasource = argparser.add_mutually_exclusive_group()
    datasource.add_argument('--s3name', '-S3', type=str,
                            choices=['adult', 'census', 'credit', 'alarm', 'insurance'],
                            help='Name of the dataset to run on')
    datasource.add_argument('--datapath', '-D', type=str,
                            help='Relative path to cwd of a local data file')
    argparser.add_argument('--runconfig', '-RC', default='runconfig_mia.json', type=str,
                           help='Path relative to cwd of runconfig file')
    argparser.add_argument('--outdir', '-O', default='tests', type=str,
                           help='Path relative to cwd for storing output files')
    args = argparser.parse_args()

    # Load runconfig
    with open(path.join(cwd, args.runconfig)) as f:
        runconfig = json.load(f)
    print('Runconfig:')
    print(runconfig)

    # Load data
    if args.s3name is not None:
        rawPop, metadata = load_s3_data_as_df(args.s3name)
        dname = args.s3name
    else:
        rawPop, metadata = load_local_data_as_df(path.join(cwd, args.datapath))
        dname = args.datapath.split('/')[-1]
    print(f'Loaded data {dname}:')
    print(rawPop.info())

    # Make sure outdir exists
    if not path.isdir(args.outdir):
        mkdir(args.outdir)

    seed(SEED)

    ########################
    #### GAME INPUTS #######
    ########################
    # Pick targets
    targetIDs = choice(list(rawPop.index), size=runconfig['nTargets'],
                       replace=False).tolist()

    # If specified: Add specific target records
    if runconfig['Targets'] is not None:
        targetIDs.extend(runconfig['Targets'])

    targets = rawPop.loc[targetIDs, :]

    # Drop targets from population
    rawPopDropTargets = rawPop.drop(targetIDs)

    # Init adversary's prior knowledge
    rawAidx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawA'],
                     replace=False).tolist()
    rawA = rawPop.loc[rawAidx, :]

    # List of candidate generative models to evaluate
    gmList = []
    if 'generativeModels' in runconfig.keys():
        for gm, paramsList in runconfig['generativeModels'].items():
            if gm == 'IndependentHistogram':
                for params in paramsList:
                    gmList.append(IndependentHistogram(metadata, *params))
            elif gm == 'BayesianNet':
                for params in paramsList:
                    gmList.append(BayesianNet(metadata, *params))
            elif gm == 'PrivBayes':
                for params in paramsList:
                    gmList.append(PrivBayes(metadata, *params))
            elif gm == 'CTGAN':
                for params in paramsList:
                    gmList.append(CTGAN(metadata, *params))
            elif gm == 'PATEGAN':
                for params in paramsList:
                    gmList.append(PATEGAN(metadata, *params))
            else:
                raise ValueError(f'Unknown GM {gm}')

    # List of candidate sanitisation techniques to evaluate
    sanList = []
    if 'sanitisationTechniques' in runconfig.keys():
        for name, paramsList in runconfig['sanitisationTechniques'].items():
            if name == 'SanitiserNHS':
                for params in paramsList:
                    sanList.append(SanitiserNHS(metadata, *params))
            else:
                raise ValueError(f'Unknown sanitisation technique {name}')

    ###################################
    #### ATTACK TRAINING #############
    ##################################
    print('\n---- Attack training ----')
    attacks = {}

    for tid in targetIDs:
        print(f'\n--- Adversary picks target {tid} ---')
        target = targets.loc[[tid]]
        attacks[tid] = {}

        for San in sanList:
            LOGGER.info(f'Start: Attack training for {San.__name__}...')
            attacks[tid][San.__name__] = {}

            # Generate example datasets for training attack classifier
            sanA, labelsA = generate_mia_anon_data(
                San, target, rawA, runconfig['sizeRawT'],
                runconfig['nShadows'] * runconfig['nSynA'])

            # Train attack on shadow data
            for Feature in [
                    NaiveFeatureSet(DataFrame),
                    HistogramFeatureSet(DataFrame, metadata,
                                        nbins=San.histogram_size, quids=San.quids),
                    CorrelationsFeatureSet(DataFrame, metadata, quids=San.quids),
                    EnsembleFeatureSet(DataFrame, metadata,
                                       nbins=San.histogram_size,
                                       quasi_id_cols=San.quids)
            ]:
                Attack = MIAttackClassifierRandomForest(metadata=metadata,
                                                        FeatureSet=Feature,
                                                        quids=San.quids)
                Attack.train(sanA, labelsA)
                attacks[tid][San.__name__][f'{Feature.__name__}'] = Attack

            # Clean up
            del sanA, labelsA

            LOGGER.info(f'Finished: Attack training.')

        for GenModel in gmList:
            LOGGER.info(f'Start: Attack training for {GenModel.__name__}...')
            attacks[tid][GenModel.__name__] = {}

            # Generate shadow model data for training attacks on this target
            synA, labelsSA = generate_mia_shadow_data(
                GenModel, target, rawA, runconfig['sizeRawT'],
                runconfig['sizeSynT'], runconfig['nShadows'], runconfig['nSynA'])

            # Train attack on shadow data
            for Feature in [
                    NaiveFeatureSet(GenModel.datatype),
                    HistogramFeatureSet(GenModel.datatype, metadata),
                    CorrelationsFeatureSet(GenModel.datatype, metadata)
            ]:
                Attack = MIAttackClassifierRandomForest(metadata, Feature)
                Attack.train(synA, labelsSA)
                attacks[tid][GenModel.__name__][f'{Feature.__name__}'] = Attack

            # Clean up
            del synA, labelsSA

            LOGGER.info(f'Finished: Attack training.')

    ##################################
    ######### EVALUATION #############
    ##################################
    resultsTargetPrivacy = {
        tid: {gm.__name__: {} for gm in gmList + sanList}
        for tid in targetIDs
    }

    print('\n---- Start the game ----')
    for nr in range(runconfig['nIter']):
        print(f'\n--- Game iteration {nr + 1} ---')
        # Draw a raw dataset
        rIdx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawT'],
                      replace=False).tolist()
        rawTout = rawPopDropTargets.loc[rIdx]

        for GenModel in gmList:
            LOGGER.info(f'Start: Evaluation for model {GenModel.__name__}...')

            # Train a generative model
            GenModel.fit(rawTout)
            synTwithoutTarget = [
                GenModel.generate_samples(runconfig['sizeSynT'])
                for _ in range(runconfig['nSynT'])
            ]
            synLabelsOut = [LABEL_OUT for _ in range(runconfig['nSynT'])]

            for tid in targetIDs:
                LOGGER.info(f'Target: {tid}')
                target = targets.loc[[tid]]
                resultsTargetPrivacy[tid][f'{GenModel.__name__}'][nr] = {}

                rawTin = rawTout.append(target)
                GenModel.fit(rawTin)
                synTwithTarget = [
                    GenModel.generate_samples(runconfig['sizeSynT'])
                    for _ in range(runconfig['nSynT'])
                ]
                synLabelsIn = [LABEL_IN for _ in range(runconfig['nSynT'])]

                synT = synTwithoutTarget + synTwithTarget
                synTlabels = synLabelsOut + synLabelsIn

                # Run attacks
                for feature, Attack in attacks[tid][f'{GenModel.__name__}'].items():
                    # Produce a guess for each synthetic dataset
                    attackerGuesses = Attack.attack(synT)

                    resDict = {
                        'Secret': synTlabels,
                        'AttackerGuess': attackerGuesses
                    }
                    resultsTargetPrivacy[tid][f'{GenModel.__name__}'][nr][feature] = resDict

            del synT, synTwithoutTarget, synTwithTarget

            LOGGER.info(f'Finished: Evaluation for model {GenModel.__name__}.')

        for San in sanList:
            LOGGER.info(f'Start: Evaluation for sanitiser {San.__name__}...')

            sanOut = San.sanitise(rawTout)

            for tid in targetIDs:
                LOGGER.info(f'Target: {tid}')
                target = targets.loc[[tid]]
                resultsTargetPrivacy[tid][San.__name__][nr] = {}

                rawTin = rawTout.append(target)
                sanIn = San.sanitise(rawTin)

                sanT = [sanOut, sanIn]
                sanTLabels = [LABEL_OUT, LABEL_IN]

                # Run attacks
                for feature, Attack in attacks[tid][San.__name__].items():
                    # Produce a guess for each synthetic dataset
                    attackerGuesses = Attack.attack(sanT, attemptLinkage=True,
                                                    target=target)

                    resDict = {
                        'Secret': sanTLabels,
                        'AttackerGuess': attackerGuesses
                    }
                    resultsTargetPrivacy[tid][San.__name__][nr][feature] = resDict

            del sanT, sanOut, sanIn

            LOGGER.info(f'Finished: Evaluation for model {San.__name__}.')

    outfile = f"ResultsMIA_{dname}"
    LOGGER.info(
        f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}")
    with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f:
        json.dump(resultsTargetPrivacy, f, indent=2, default=json_numpy_serialzer)