def _attack_instance(
    self,
    inputs: JsonDict,
    instance: Instance,
    input_field_to_attack: str,
    grad_input_field: str,
    ignore_tokens: List[str],
):
    # Save fields that must be checked for equality
    fields_to_compare = utils.get_fields_to_compare(inputs, instance, input_field_to_attack)

    # Set num_ignore_tokens, which tells input reduction when to stop.
    # We keep at least one token for input reduction on classification/entailment/etc.
    if "tags" not in instance:
        num_ignore_tokens = 1
        tag_mask = None
    # Set num_ignore_tokens for NER and build token mask.
    else:
        num_ignore_tokens, tag_mask, original_tags = _get_ner_tags_and_mask(
            instance, input_field_to_attack, ignore_tokens
        )

    text_field: TextField = instance[input_field_to_attack]  # type: ignore
    current_tokens = deepcopy(text_field.tokens)
    candidates = [(instance, -1, tag_mask)]
    # keep removing tokens until prediction is about to change
    while len(current_tokens) > num_ignore_tokens and candidates:
        # sort current candidates by smallest length (we want to remove as many tokens as
        # possible); a toy illustration of this pruning step follows this function
        def get_length(input_instance: Instance):
            input_text_field: TextField = input_instance[input_field_to_attack]  # type: ignore
            return len(input_text_field.tokens)

        candidates = heapq.nsmallest(self.beam_size, candidates, key=lambda x: get_length(x[0]))

        beam_candidates = deepcopy(candidates)
        candidates = []
        for beam_instance, smallest_idx, tag_mask in beam_candidates:
            # get gradients and predictions
            beam_tag_mask = deepcopy(tag_mask)
            grads, outputs = self.predictor.get_gradients([beam_instance])

            for output in outputs:
                if isinstance(outputs[output], torch.Tensor):
                    outputs[output] = outputs[output].detach().cpu().numpy().squeeze().squeeze()
                elif isinstance(outputs[output], list):
                    outputs[output] = outputs[output][0]

            # Check if any fields have changed; if so, next beam
            if "tags" not in instance:
                # relabel beam_instance since last iteration removed an input token
                beam_instance = self.predictor.predictions_to_labeled_instances(
                    beam_instance, outputs
                )[0]
                if utils.instance_has_changed(beam_instance, fields_to_compare):
                    continue

            # special case for sentence tagging (we have tested NER)
            else:
                # remove the mask where you remove the input token from.
                if smallest_idx != -1:  # Don't delete on the very first iteration
                    del beam_tag_mask[smallest_idx]
                cur_tags = [
                    outputs["tags"][x] for x in range(len(outputs["tags"])) if beam_tag_mask[x]
                ]
                if cur_tags != original_tags:
                    continue

            # remove a token from the input
            text_field: TextField = beam_instance[input_field_to_attack]  # type: ignore
            current_tokens = deepcopy(text_field.tokens)
            reduced_instances_and_smallest = _remove_one_token(
                beam_instance,
                input_field_to_attack,
                grads[grad_input_field][0],
                ignore_tokens,
                self.beam_size,
                beam_tag_mask,
            )
            candidates.extend(reduced_instances_and_smallest)

    return current_tokens
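
# A toy illustration (assumed example, not library code) of the beam pruning used above:
# candidates are (instance, index_of_last_removed_token, tag_mask) tuples, and
# heapq.nsmallest keeps the beam_size candidates whose text fields are shortest.
def _beam_pruning_example():
    import heapq

    # pretend "instances" are just token lists here
    candidates = [
        (["a", "b", "c"], 0, None),
        (["a", "b"], 1, None),
        (["a", "b", "c", "d"], 2, None),
    ]
    beam_size = 2
    kept = heapq.nsmallest(beam_size, candidates, key=lambda x: len(x[0]))
    assert [len(x[0]) for x in kept] == [2, 3]
    return kept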
def _attack_instance(
    self,
    inputs: JsonDict,
    instance: Instance,
    input_field_to_attack: str,
    grad_input_field: str,
    ignore_tokens: List[str],
):
    # Save fields that must be checked for equality
    fields_to_compare = utils.get_fields_to_compare(inputs, instance, input_field_to_attack)

    # Set num_ignore_tokens, which tells input reduction when to stop.
    # We keep at least one token for input reduction on classification/entailment/etc.
    if "tags" not in instance:
        num_ignore_tokens = 1
        tag_mask = None
    # Set num_ignore_tokens for NER and build token mask.
    else:
        num_ignore_tokens, tag_mask, original_tags = _get_ner_tags_and_mask(
            instance, input_field_to_attack, ignore_tokens
        )

    text_field: TextField = instance[input_field_to_attack]  # type: ignore
    current_tokens = deepcopy(text_field.tokens)
    candidates = [(instance, -1, tag_mask)]
    # keep removing tokens until prediction is about to change
    while len(current_tokens) > num_ignore_tokens and candidates:
        # sort current candidates by smallest length (we want to remove as many tokens as possible)
        def get_length(input_instance: Instance):
            input_text_field: TextField = input_instance[input_field_to_attack]  # type: ignore
            return len(input_text_field.tokens)

        candidates = heapq.nsmallest(self.beam_size, candidates, key=lambda x: get_length(x[0]))

        # predictor.get_gradients is where the most expensive computation happens, so we're
        # going to do it in a batch, up front, before iterating over the results.
        copied_candidates = deepcopy(candidates)
        all_grads, all_outputs = self.predictor.get_gradients([x[0] for x in copied_candidates])

        # The output in `all_grads` and `all_outputs` is batched in a dictionary (e.g.,
        # {'grad_output_1': batched_tensor}). We need to split this into a list of non-batched
        # dictionaries that we can iterate over (a toy illustration of this splitting follows
        # this function).
        split_grads = []
        for i in range(len(copied_candidates)):
            split_grads.append({key: value[i] for key, value in all_grads.items()})
        split_outputs = []
        for i in range(len(copied_candidates)):
            instance_outputs = {}
            for key, value in all_outputs.items():
                if key == 'loss':
                    continue
                instance_outputs[key] = value[i]
            split_outputs.append(instance_outputs)

        beam_candidates = [
            (x[0], x[1], x[2], split_grads[i], split_outputs[i])
            for i, x in enumerate(copied_candidates)
        ]

        candidates = []
        for beam_instance, smallest_idx, tag_mask, grads, outputs in beam_candidates:
            beam_tag_mask = deepcopy(tag_mask)
            for output in outputs:
                if isinstance(outputs[output], torch.Tensor):
                    outputs[output] = outputs[output].detach().cpu().numpy().squeeze().squeeze()
                elif isinstance(outputs[output], list):
                    outputs[output] = outputs[output][0]

            # Check if any fields have changed; if so, next beam
            if "tags" not in instance:
                # relabel beam_instance since last iteration removed an input token
                beam_instance = self.predictor.predictions_to_labeled_instances(
                    beam_instance, outputs
                )[0]
                if utils.instance_has_changed(beam_instance, fields_to_compare):
                    continue

            # special case for sentence tagging (we have tested NER)
            else:
                # remove the mask where you remove the input token from.
                if smallest_idx != -1:  # Don't delete on the very first iteration
                    del beam_tag_mask[smallest_idx]
                cur_tags = [
                    outputs["tags"][x] for x in range(len(outputs["tags"])) if beam_tag_mask[x]
                ]
                if cur_tags != original_tags:
                    continue

            # remove a token from the input
            text_field: TextField = beam_instance[input_field_to_attack]  # type: ignore
            current_tokens = deepcopy(text_field.tokens)
            reduced_instances_and_smallest = _remove_one_token(
                beam_instance,
                input_field_to_attack,
                grads[grad_input_field],
                ignore_tokens,
                self.beam_size,
                beam_tag_mask,
            )
            candidates.extend(reduced_instances_and_smallest)

    return current_tokens
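
# A minimal sketch (assumed example, not part of the attacker) of the dictionary-splitting step
# above: batched values indexed along dimension 0 are turned into one small dictionary per
# candidate instance. The key name and shapes below are illustrative only.
def _split_batched_dict_example():
    import torch

    all_grads = {"grad_input_1": torch.zeros(3, 7, 5)}  # (batch, num_tokens, embedding_dim)
    split_grads = [{key: value[i] for key, value in all_grads.items()} for i in range(3)]
    assert split_grads[0]["grad_input_1"].shape == (7, 5)
    return split_grads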
def attack_instance(
    self,
    instance: Instance,
    inputs: JsonDict,
    input_field_to_attack: str = "tokens",
    grad_input_field: str = "grad_input_1",
    ignore_tokens: List[str] = None,
    target: JsonDict = None,
) -> Tuple[List[Token], JsonDict]:
    if self.embedding_matrix is None:
        self.initialize()
    ignore_tokens = DEFAULT_IGNORE_TOKENS if ignore_tokens is None else ignore_tokens

    # If `target` is `None`, we move away from the current prediction, otherwise we move
    # _towards_ the target.
    sign = -1 if target is None else 1

    # Gets a list of the fields that we want to check to see if they change.
    fields_to_compare = utils.get_fields_to_compare(inputs, instance, input_field_to_attack)

    # We'll be modifying the tokens in this text field below, and grabbing the modified
    # list after the `while` loop.
    text_field: TextField = instance[input_field_to_attack]  # type: ignore

    # Because we can save computation by getting grads and outputs at the same time, we do
    # them together at the end of the loop, even though we use grads at the beginning and
    # outputs at the end. This is our initial gradient for the beginning of the loop. The
    # output can be ignored here.
    grads, outputs = self.predictor.get_gradients([instance])

    # Ignore any token that is in the ignore_tokens list by setting the token to already
    # flipped.
    flipped: List[int] = []
    for index, token in enumerate(text_field.tokens):
        if token.text in ignore_tokens:
            flipped.append(index)

    if "clusters" in outputs:
        # Coref unfortunately needs a special case here. We don't want to flip words in
        # the same predicted coref cluster, but we can't really specify a list of tokens,
        # because, e.g., "he" could show up in several different clusters.
        # TODO(mattg): perhaps there's a way to get `predictions_to_labeled_instances` to
        # return the set of tokens that shouldn't be changed for each instance? E.g., you
        # could imagine setting a field on the `Token` object, that we could then read
        # here...
        for cluster in outputs["clusters"]:
            for mention in cluster:
                for index in range(mention[0], mention[1] + 1):
                    flipped.append(index)

    while True:
        # Compute L2 norm of all grads (a toy example of this selection rule follows this
        # function).
        grad = grads[grad_input_field][0]
        grads_magnitude = [g.dot(g) for g in grad]

        # only flip a token once
        for index in flipped:
            grads_magnitude[index] = -1

        # We flip the token with highest gradient norm.
        index_of_token_to_flip = numpy.argmax(grads_magnitude)
        if grads_magnitude[index_of_token_to_flip] == -1:
            # If we've already flipped all of the tokens, we give up.
            break
        flipped.append(index_of_token_to_flip)

        text_field_tensors = text_field.as_tensor(text_field.get_padding_lengths())
        input_tokens = util.get_token_ids_from_text_field_tensors(text_field_tensors)
        original_id_of_token_to_flip = input_tokens[index_of_token_to_flip]

        # Get new token using taylor approximation.
        new_id = self._first_order_taylor(
            grad[index_of_token_to_flip], original_id_of_token_to_flip, sign
        )

        # Flip token. We need to tell the instance to re-index itself, so the text field
        # will actually update.
        new_token = Token(self.vocab._index_to_token[self.namespace][new_id])  # type: ignore
        text_field.tokens[index_of_token_to_flip] = new_token
        instance.indexed = False

        # Get model predictions on instance, and then label the instances
        grads, outputs = self.predictor.get_gradients([instance])  # predictions
        for key, output in outputs.items():
            if isinstance(output, torch.Tensor):
                outputs[key] = output.detach().cpu().numpy().squeeze()
            elif isinstance(output, list):
                outputs[key] = output[0]

        # TODO(mattg): taking the first result here seems brittle, if we're in a case where
        # there are multiple predictions.
        labeled_instance = self.predictor.predictions_to_labeled_instances(instance, outputs)[0]

        # If we've met our stopping criterion, we stop.
        has_changed = utils.instance_has_changed(labeled_instance, fields_to_compare)
        if target is None and has_changed:
            # With no target, we just want to change the prediction.
            break
        if target is not None and not has_changed:
            # With a given target, we want to *match* the target, which we check by
            # `not has_changed`.
            break

    return text_field.tokens, outputs
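
# A small, self-contained illustration (assumed example, not library code) of the token-selection
# rule used in the loop above: score each token by the squared L2 norm of its embedding gradient,
# mask out positions that were already flipped, and take the argmax.
def _pick_token_to_flip_example():
    import numpy

    grad = numpy.array([[0.1, 0.2], [0.9, -0.4], [0.05, 0.0]])  # (num_tokens, embedding_dim)
    flipped = [1]  # indices we are not allowed to flip again
    grads_magnitude = [g.dot(g) for g in grad]
    for index in flipped:
        grads_magnitude[index] = -1
    index_of_token_to_flip = numpy.argmax(grads_magnitude)
    assert index_of_token_to_flip == 0
    return index_of_token_to_flip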
def attack_from_json(
    self,
    inputs: JsonDict,
    input_field_to_attack: str = "tokens",
    grad_input_field: str = "grad_input_1",
    ignore_tokens: List[str] = None,
    target: JsonDict = None,
) -> JsonDict:
    """
    Replaces one token at a time from the input until the model's prediction changes.
    ``input_field_to_attack`` is for example ``tokens``; it says what the input field is
    called.  ``grad_input_field`` is for example ``grad_input_1``, which is a key into a
    grads dictionary.

    The method computes the gradient w.r.t. the tokens, finds the token with the maximum
    gradient (by L2 norm), and replaces it with another token based on the first-order
    Taylor approximation of the loss.  This process is iteratively repeated until the
    prediction changes.  Once a token is replaced, it is not flipped again.  (A usage
    sketch follows this method's definition.)

    Parameters
    ----------
    inputs : ``JsonDict``
        The model inputs, the same as what is passed to a ``Predictor``.
    input_field_to_attack : ``str``, optional (default='tokens')
        The field that has the tokens that we're going to be flipping.  This must be a
        ``TextField``.
    grad_input_field : ``str``, optional (default='grad_input_1')
        If there is more than one field that gets embedded in your model (e.g., a question
        and a passage, or a premise and a hypothesis), this tells us the key to use to get
        the correct gradients.  This selects from the output of
        :func:`Predictor.get_gradients`.
    ignore_tokens : ``List[str]``, optional (default=DEFAULT_IGNORE_TOKENS)
        These tokens will not be flipped.  The default list includes some simple
        punctuation, OOV and padding tokens, and common control tokens for BERT, etc.
    target : ``JsonDict``, optional (default=None)
        If given, this will be a `targeted` hotflip attack, where instead of just trying to
        change a model's prediction from what it currently is predicting, we try to change
        it to a `specific` target value.  This is a ``JsonDict`` because it needs to
        specify the field name and target value.  For example, for a masked LM, this would
        be something like ``{"words": ["she"]}``, because ``"words"`` is the field name,
        there is one mask token (hence the list of length one), and we want to change the
        prediction from whatever it was to ``"she"``.
    """
    if self.embedding_matrix is None:
        self.initialize()
    ignore_tokens = DEFAULT_IGNORE_TOKENS if ignore_tokens is None else ignore_tokens

    # If `target` is `None`, we move away from the current prediction, otherwise we move
    # _towards_ the target.
    sign = -1 if target is None else 1
    instance = self.predictor._json_to_instance(inputs)
    if target is None:
        output_dict = self.predictor._model.forward_on_instance(instance)
    else:
        output_dict = target

    # This now holds the predictions that we want to change (either away from or towards,
    # depending on whether `target` was passed).  We'll use this in the loop below to check
    # for when we've met our stopping criterion.
    original_instances = self.predictor.predictions_to_labeled_instances(instance, output_dict)

    # This is just for ease of access in the UI, so we know the original tokens.  It's not
    # used in the logic below.
    original_text_field: TextField = original_instances[0][  # type: ignore
        input_field_to_attack
    ]
    original_tokens = deepcopy(original_text_field.tokens)

    final_tokens = []
    # `original_instances` is a list because there might be several different predictions
    # that we're trying to attack (e.g., all of the NER tags for an input sentence).  We
    # attack them one at a time.
    for instance in original_instances:
        # Gets a list of the fields that we want to check to see if they change.
        fields_to_compare = utils.get_fields_to_compare(inputs, instance, input_field_to_attack)

        # We'll be modifying the tokens in this text field below, and grabbing the modified
        # list after the `while` loop.
        text_field: TextField = instance[input_field_to_attack]  # type: ignore

        # Because we can save computation by getting grads and outputs at the same time, we
        # do them together at the end of the loop, even though we use grads at the beginning
        # and outputs at the end.  This is our initial gradient for the beginning of the
        # loop.  The output can be ignored here.
        grads, outputs = self.predictor.get_gradients([instance])

        # Ignore any token that is in the ignore_tokens list by setting the token to already
        # flipped.
        flipped: List[int] = []
        for index, token in enumerate(text_field.tokens):
            if token.text in ignore_tokens:
                flipped.append(index)

        if "clusters" in outputs:
            # Coref unfortunately needs a special case here.  We don't want to flip words in
            # the same predicted coref cluster, but we can't really specify a list of
            # tokens, because, e.g., "he" could show up in several different clusters.
            # TODO(mattg): perhaps there's a way to get `predictions_to_labeled_instances`
            # to return the set of tokens that shouldn't be changed for each instance?
            # E.g., you could imagine setting a field on the `Token` object, that we could
            # then read here...
            for cluster in outputs["clusters"]:
                for mention in cluster:
                    for index in range(mention[0], mention[1] + 1):
                        flipped.append(index)

        while True:
            # Compute L2 norm of all grads.
            grad = grads[grad_input_field][0]
            grads_magnitude = [g.dot(g) for g in grad]

            # only flip a token once
            for index in flipped:
                grads_magnitude[index] = -1

            # We flip the token with highest gradient norm.
            index_of_token_to_flip = numpy.argmax(grads_magnitude)
            if grads_magnitude[index_of_token_to_flip] == -1:
                # If we've already flipped all of the tokens, we give up.
                break
            flipped.append(index_of_token_to_flip)

            # TODO(mattg): This is quite a bit of a hack for getting the vocab id... I don't
            # have better ideas at the moment, though.
            indexer_name = self.namespace
            input_tokens = text_field._indexed_tokens[indexer_name]
            original_id_of_token_to_flip = input_tokens[index_of_token_to_flip]

            # Get new token using taylor approximation.
            new_id = self._first_order_taylor(
                grad[index_of_token_to_flip], original_id_of_token_to_flip, sign
            )

            # Flip token.  We need to tell the instance to re-index itself, so the text
            # field will actually update.
            new_token = Token(self.vocab._index_to_token[self.namespace][new_id])  # type: ignore
            text_field.tokens[index_of_token_to_flip] = new_token
            instance.indexed = False

            # Get model predictions on instance, and then label the instances
            grads, outputs = self.predictor.get_gradients([instance])  # predictions
            for key, output in outputs.items():
                if isinstance(output, torch.Tensor):
                    outputs[key] = output.detach().cpu().numpy().squeeze()
                elif isinstance(output, list):
                    outputs[key] = output[0]

            # TODO(mattg): taking the first result here seems brittle, if we're in a case
            # where there are multiple predictions.
            labeled_instance = self.predictor.predictions_to_labeled_instances(instance, outputs)[0]

            # If we've met our stopping criterion, we stop.
            has_changed = utils.instance_has_changed(labeled_instance, fields_to_compare)
            if target is None and has_changed:
                # With no target, we just want to change the prediction.
                break
            if target is not None and not has_changed:
                # With a given target, we want to *match* the target, which we check by
                # `not has_changed`.
                break

        final_tokens.append(text_field.tokens)

    return sanitize({"final": final_tokens, "original": original_tokens, "outputs": outputs})
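
# Usage sketch (an assumed example, not part of this module): driving the hotflip attack through
# a trained AllenNLP ``Predictor``.  The model archive path and the input JSON key are
# placeholders and depend on the predictor being attacked.
def _hotflip_usage_example():
    from allennlp.interpret.attackers import Hotflip
    from allennlp.predictors.predictor import Predictor

    predictor = Predictor.from_path("path/to/model.tar.gz")  # placeholder model archive
    hotflipper = Hotflip(predictor)
    hotflipper.initialize()  # precompute the embedding matrix used by _first_order_taylor
    attack = hotflipper.attack_from_json(
        {"sentence": "a very well-made, funny and entertaining picture."},
        input_field_to_attack="tokens",
        grad_input_field="grad_input_1",
    )
    # `attack` holds the flipped tokens, the original tokens, and the final model outputs.
    return attack["final"], attack["original"]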
def _first_order_taylor(self, grad: numpy.ndarray, token_idx: int, sign: int) -> int:
    """