示例#1
0
    def _attack_instance(self, inputs: JsonDict, instance: Instance,
                         input_field_to_attack: str, grad_input_field: str,
                         ignore_tokens: List[str]):
        """
        Run beam-search input reduction on a single labeled instance.

        Tokens are removed from ``instance[input_field_to_attack]`` one at a
        time.  A beam entry survives only while the prediction is unchanged:
        for instances without a ``"tags"`` field this is checked with
        ``utils.instance_has_changed`` against ``fields_to_compare``; for
        tagging instances the masked predicted tags must equal
        ``original_tags``.

        Returns the token list of the most recently processed beam entry
        whose prediction was still unchanged (or a copy of the original
        tokens if no entry ever passed the check).
        """
        # Fields whose values must stay equal for a reduction to be accepted.
        fields_to_compare = utils.get_fields_to_compare(
            inputs, instance, input_field_to_attack)

        # ``num_ignore_tokens`` is the length at which reduction stops.
        if "tags" in instance:
            # Tagging (tested on NER): the helper supplies the stopping
            # length, a per-token mask and the tags that must be preserved.
            num_ignore_tokens, tag_mask, original_tags = _get_ner_tags_and_mask(
                instance, input_field_to_attack, ignore_tokens)
        else:
            # Classification/entailment/etc.: always keep at least one token.
            num_ignore_tokens = 1
            tag_mask = None

        def token_count(candidate) -> int:
            # Each candidate is a tuple whose first element is the instance.
            candidate_field: TextField = candidate[0][
                input_field_to_attack]  # type: ignore
            return len(candidate_field.tokens)

        starting_field: TextField = instance[
            input_field_to_attack]  # type: ignore
        current_tokens = deepcopy(starting_field.tokens)

        # Entries are (instance, index removed to create it, tag mask);
        # -1 marks the unreduced starting instance.
        candidates = [(instance, -1, tag_mask)]

        # Keep removing tokens until the prediction is about to change.
        while len(current_tokens) > num_ignore_tokens and candidates:
            # Prefer the shortest candidates: we want to drop as many tokens
            # as possible.  Work on copies so the beam can be rebuilt freely.
            beam = deepcopy(
                heapq.nsmallest(self.beam_size, candidates, key=token_count))
            candidates = []

            for beam_instance, removed_idx, parent_mask in beam:
                beam_tag_mask = deepcopy(parent_mask)
                grads, outputs = self.predictor.get_gradients([beam_instance])

                # Unbatch the outputs: tensors become squeezed numpy arrays,
                # lists are replaced by their first element.
                for name, value in outputs.items():
                    if isinstance(value, torch.Tensor):
                        outputs[name] = value.detach().cpu().numpy().squeeze()
                    elif isinstance(value, list):
                        outputs[name] = value[0]

                if "tags" in instance:
                    # Sentence tagging special case.  Drop the mask entry of
                    # the token removed to build this candidate (nothing was
                    # removed for the starting instance).
                    if removed_idx != -1:
                        del beam_tag_mask[removed_idx]
                    cur_tags = [
                        tag for position, tag in enumerate(outputs["tags"])
                        if beam_tag_mask[position]
                    ]
                    if cur_tags != original_tags:
                        continue
                else:
                    # Relabel, since the previous round removed an input
                    # token, then abandon this beam if a compared field moved.
                    beam_instance = self.predictor.predictions_to_labeled_instances(
                        beam_instance, outputs)[0]
                    if utils.instance_has_changed(beam_instance,
                                                  fields_to_compare):
                        continue

                # This candidate still yields the original prediction:
                # remember its tokens and expand it by removing one more.
                surviving_field: TextField = beam_instance[
                    input_field_to_attack]  # type: ignore
                current_tokens = deepcopy(surviving_field.tokens)
                candidates.extend(
                    _remove_one_token(beam_instance, input_field_to_attack,
                                      grads[grad_input_field][0],
                                      ignore_tokens, self.beam_size,
                                      beam_tag_mask))
        return current_tokens
示例#2
0
    def _attack_instance(self, inputs: JsonDict, instance: Instance,
                         input_field_to_attack: str, grad_input_field: str,
                         ignore_tokens: List[str]):
        """
        Run beam-search input reduction on a single labeled instance, with
        gradients for the whole beam computed in one batched call.

        Tokens are removed from ``instance[input_field_to_attack]`` one at a
        time.  A beam entry survives only while the prediction is unchanged:
        for instances without a ``"tags"`` field this is checked with
        ``utils.instance_has_changed`` against ``fields_to_compare``; for
        tagging instances the masked predicted tags must equal
        ``original_tags``.

        Returns the token list of the most recently processed beam entry
        whose prediction was still unchanged (or a copy of the original
        tokens if no entry ever passed the check).
        """
        # Fields whose values must stay equal for a reduction to be accepted.
        fields_to_compare = utils.get_fields_to_compare(
            inputs, instance, input_field_to_attack)

        # ``num_ignore_tokens`` is the length at which reduction stops.
        if "tags" in instance:
            # Tagging (tested on NER): the helper supplies the stopping
            # length, a per-token mask and the tags that must be preserved.
            num_ignore_tokens, tag_mask, original_tags = _get_ner_tags_and_mask(
                instance, input_field_to_attack, ignore_tokens)
        else:
            # Classification/entailment/etc.: always keep at least one token.
            num_ignore_tokens = 1
            tag_mask = None

        def token_count(candidate) -> int:
            # Each candidate is a tuple whose first element is the instance.
            candidate_field: TextField = candidate[0][
                input_field_to_attack]  # type: ignore
            return len(candidate_field.tokens)

        starting_field: TextField = instance[
            input_field_to_attack]  # type: ignore
        current_tokens = deepcopy(starting_field.tokens)

        # Entries are (instance, index removed to create it, tag mask);
        # -1 marks the unreduced starting instance.
        candidates = [(instance, -1, tag_mask)]

        # Keep removing tokens until the prediction is about to change.
        while len(current_tokens) > num_ignore_tokens and candidates:
            # Prefer the shortest candidates: we want to drop as many tokens
            # as possible.
            candidates = heapq.nsmallest(self.beam_size,
                                         candidates,
                                         key=token_count)

            # ``get_gradients`` is the expensive step, so it is run once for
            # the whole beam before any per-candidate work.
            copied_candidates = deepcopy(candidates)
            all_grads, all_outputs = self.predictor.get_gradients(
                [candidate[0] for candidate in copied_candidates])

            # Both results are dictionaries of batched values.  Slice them
            # into one dictionary per candidate; the 'loss' entry is dropped
            # from the per-candidate outputs.
            beam_width = len(copied_candidates)
            split_grads = [{
                key: value[i]
                for key, value in all_grads.items()
            } for i in range(beam_width)]
            split_outputs = [{
                key: value[i]
                for key, value in all_outputs.items() if key != 'loss'
            } for i in range(beam_width)]
            beam_candidates = [
                (candidate[0], candidate[1], candidate[2], candidate_grads,
                 candidate_outputs)
                for candidate, candidate_grads, candidate_outputs in zip(
                    copied_candidates, split_grads, split_outputs)
            ]

            candidates = []
            for beam_instance, removed_idx, parent_mask, grads, outputs in beam_candidates:
                beam_tag_mask = deepcopy(parent_mask)

                # Tensors become squeezed numpy arrays; lists are replaced by
                # their first element.
                for name, value in outputs.items():
                    if isinstance(value, torch.Tensor):
                        outputs[name] = value.detach().cpu().numpy().squeeze()
                    elif isinstance(value, list):
                        outputs[name] = value[0]

                if "tags" in instance:
                    # Sentence tagging special case.  Drop the mask entry of
                    # the token removed to build this candidate (nothing was
                    # removed for the starting instance).
                    if removed_idx != -1:
                        del beam_tag_mask[removed_idx]
                    cur_tags = [
                        tag for position, tag in enumerate(outputs["tags"])
                        if beam_tag_mask[position]
                    ]
                    if cur_tags != original_tags:
                        continue
                else:
                    # Relabel, since the previous round removed an input
                    # token, then abandon this beam if a compared field moved.
                    beam_instance = self.predictor.predictions_to_labeled_instances(
                        beam_instance, outputs)[0]
                    if utils.instance_has_changed(beam_instance,
                                                  fields_to_compare):
                        continue

                # This candidate still yields the original prediction:
                # remember its tokens and expand it by removing one more.
                surviving_field: TextField = beam_instance[
                    input_field_to_attack]  # type: ignore
                current_tokens = deepcopy(surviving_field.tokens)
                candidates.extend(
                    _remove_one_token(beam_instance, input_field_to_attack,
                                      grads[grad_input_field], ignore_tokens,
                                      self.beam_size, beam_tag_mask))
        return current_tokens
示例#3
0
    def attack_instance(
        self,
        instance: Instance,
        inputs: JsonDict,
        input_field_to_attack: str = "tokens",
        grad_input_field: str = "grad_input_1",
        ignore_tokens: List[str] = None,
        target: JsonDict = None,
    ) -> Tuple[List[Token], JsonDict]:
        """
        Flip tokens of ``instance[input_field_to_attack]`` one at a time until
        the stopping criterion is met or no flippable token remains.

        Each round picks the not-yet-flipped token with the largest squared
        gradient norm and replaces it with the token chosen by
        ``self._first_order_taylor``.  Without ``target`` the loop stops once
        the labeled instance differs from ``fields_to_compare``; with
        ``target`` it stops once the labeled instance no longer differs.

        Returns the (modified in place) token list and the outputs of the
        last ``get_gradients`` call.
        """
        if self.embedding_matrix is None:
            self.initialize()

        if ignore_tokens is None:
            ignore_tokens = DEFAULT_IGNORE_TOKENS

        # Untargeted attacks move away from the current prediction (-1);
        # targeted attacks move towards the requested one (+1).
        targeted = target is not None
        sign = 1 if targeted else -1

        # The fields we watch to decide whether the prediction changed.
        fields_to_compare = utils.get_fields_to_compare(
            inputs, instance, input_field_to_attack)

        # Tokens of this field are edited in place and returned at the end.
        text_field: TextField = instance[input_field_to_attack]  # type: ignore

        # Gradients are consumed at the top of the loop and outputs at the
        # bottom, but both come from one call, so the loop is primed here.
        # Only "clusters" is read from these initial outputs.
        grads, outputs = self.predictor.get_gradients([instance])

        # Ignored tokens are treated as already flipped so they are never
        # selected.
        flipped: List[int] = [
            position for position, token in enumerate(text_field.tokens)
            if token.text in ignore_tokens
        ]
        if "clusters" in outputs:
            # Coref special case: every token inside a predicted mention span
            # (inclusive bounds) is protected from flipping.
            # TODO(mattg): perhaps `predictions_to_labeled_instances` could
            # report the tokens that must not change for each instance, e.g.
            # through a field set on the `Token` object.
            for cluster in outputs["clusters"]:
                for mention in cluster:
                    flipped.extend(range(mention[0], mention[1] + 1))

        while True:
            # Squared L2 norm of each token's gradient.
            grad = grads[grad_input_field][0]
            grads_magnitude = [g.dot(g) for g in grad]

            # A token may be flipped only once: -1 can never win the argmax
            # against a real (non-negative) squared norm.
            for position in flipped:
                grads_magnitude[position] = -1

            index_of_token_to_flip = numpy.argmax(grads_magnitude)
            if grads_magnitude[index_of_token_to_flip] == -1:
                # Every token has been flipped or is protected; give up.
                break
            flipped.append(index_of_token_to_flip)

            # Look up the vocabulary id currently sitting at that position.
            padding_lengths = text_field.get_padding_lengths()
            token_ids = util.get_token_ids_from_text_field_tensors(
                text_field.as_tensor(padding_lengths))
            original_id_of_token_to_flip = token_ids[index_of_token_to_flip]

            # Choose the replacement with the first-order Taylor approximation.
            new_id = self._first_order_taylor(grad[index_of_token_to_flip],
                                              original_id_of_token_to_flip,
                                              sign)

            # Swap the token in, and mark the instance as un-indexed so the
            # text field is re-indexed with the new token.
            replacement_text = self.vocab._index_to_token[
                self.namespace][new_id]  # type: ignore
            text_field.tokens[index_of_token_to_flip] = Token(replacement_text)
            instance.indexed = False

            # Re-run the model, then unbatch its outputs before labeling.
            grads, outputs = self.predictor.get_gradients([instance])
            for key, value in outputs.items():
                if isinstance(value, torch.Tensor):
                    outputs[key] = value.detach().cpu().numpy().squeeze()
                elif isinstance(value, list):
                    outputs[key] = value[0]

            # TODO(mattg): taking the first result here seems brittle, if
            # we're in a case where there are multiple predictions.
            labeled_instance = self.predictor.predictions_to_labeled_instances(
                instance, outputs)[0]

            has_changed = utils.instance_has_changed(labeled_instance,
                                                     fields_to_compare)
            if targeted:
                # With a target we want to *match* it, i.e. no difference.
                if not has_changed:
                    break
            elif has_changed:
                # Without a target, any change of prediction is a success.
                break
        return text_field.tokens, outputs
示例#4
0
    def attack_from_json(
        self,
        inputs: JsonDict,
        input_field_to_attack: str = "tokens",
        grad_input_field: str = "grad_input_1",
        ignore_tokens: List[str] = None,
        target: JsonDict = None,
    ) -> JsonDict:
        """
        Swaps input tokens, one per step, until the model's prediction changes (or, for a
        targeted attack, until it matches ``target``).

        At every step the gradient w.r.t. the tokens is computed, the token with the largest
        gradient (by L2 norm) is selected, and it is replaced by the token suggested by a
        first-order Taylor approximation of the loss.  A token that has been replaced is never
        flipped a second time.

        Parameters
        ----------
        inputs : ``JsonDict``
            The model inputs, exactly as they would be handed to a ``Predictor``.
        input_field_to_attack : ``str``, optional (default='tokens')
            Name of the input field holding the tokens to flip (for example ``tokens``).  It
            must be a ``TextField``.
        grad_input_field : ``str``, optional (default='grad_input_1')
            Key into the gradients returned by :func:`Predictor.get_gradients`.  When the model
            embeds several fields (a question and a passage, a premise and a hypothesis, ...),
            this picks the gradients belonging to the attacked field.
        ignore_tokens : ``List[str]``, optional (default=DEFAULT_IGNORE_TOKENS)
            Tokens that are never flipped.  The default list includes some simple punctuation,
            OOV and padding tokens, and common control tokens for BERT, etc.
        target : ``JsonDict``, optional (default=None)
            When given, the attack is `targeted`: instead of merely moving away from the current
            prediction, we try to reach a `specific` value.  It is a ``JsonDict`` because it has
            to name both the field and the target value.  For a masked LM this could be
            ``{"words": ["she"]}``: ``"words"`` is the field name, the list has length one
            because there is one mask token, and the desired prediction is ``"she"``.

        Returns
        -------
        The sanitized dictionary ``{"final": ..., "original": ..., "outputs": ...}``: the token
        list of every attacked instance after the attack, the original tokens, and the outputs
        of the last ``get_gradients`` call made for the last attacked instance.
        """
        if self.embedding_matrix is None:
            self.initialize()
        if ignore_tokens is None:
            ignore_tokens = DEFAULT_IGNORE_TOKENS

        # Untargeted attacks move away from the current prediction (-1); targeted attacks move
        # _towards_ the requested one (+1).
        targeted = target is not None
        sign = 1 if targeted else -1

        instance = self.predictor._json_to_instance(inputs)
        if targeted:
            output_dict = target
        else:
            output_dict = self.predictor._model.forward_on_instance(instance)

        # The labeled instances carry the predictions we want to move away from (or towards, for
        # a targeted attack); the stopping criterion below compares against them.
        original_instances = self.predictor.predictions_to_labeled_instances(instance, output_dict)

        # Kept only so the UI can show the original tokens; the attack logic does not read it.
        # NOTE(review): this indexes element 0, so an empty `original_instances` raises here.
        first_text_field: TextField = original_instances[0][  # type: ignore
            input_field_to_attack
        ]
        original_tokens = deepcopy(first_text_field.tokens)

        final_tokens = []
        # There may be several predictions to attack (e.g., all of the NER tags of a sentence),
        # hence a list of instances; they are attacked one after another.
        for labeled in original_instances:
            # The fields we watch to decide whether the prediction changed.
            fields_to_compare = utils.get_fields_to_compare(inputs, labeled, input_field_to_attack)

            # Tokens of this field are edited in place and collected after the `while` loop.
            text_field: TextField = labeled[input_field_to_attack]  # type: ignore

            # Gradients are consumed at the top of the loop and outputs at the bottom, but both
            # come from one call, so the loop is primed here.  Only "clusters" is read from
            # these initial outputs.
            grads, outputs = self.predictor.get_gradients([labeled])

            # Ignored tokens are treated as already flipped so they are never selected.
            flipped: List[int] = [
                position
                for position, token in enumerate(text_field.tokens)
                if token.text in ignore_tokens
            ]
            if "clusters" in outputs:
                # Coref special case: every token inside a predicted mention span (inclusive
                # bounds) is protected from flipping.
                # TODO(mattg): perhaps `predictions_to_labeled_instances` could report the
                # tokens that must not change for each instance, e.g. through a field set on
                # the `Token` object.
                for cluster in outputs["clusters"]:
                    for mention in cluster:
                        flipped.extend(range(mention[0], mention[1] + 1))

            while True:
                # Squared L2 norm of each token's gradient.
                grad = grads[grad_input_field][0]
                grads_magnitude = [g.dot(g) for g in grad]

                # A token may be flipped only once: -1 can never win the argmax against a real
                # (non-negative) squared norm.
                for position in flipped:
                    grads_magnitude[position] = -1

                index_of_token_to_flip = numpy.argmax(grads_magnitude)
                if grads_magnitude[index_of_token_to_flip] == -1:
                    # Every token has been flipped or is protected; give up.
                    break
                flipped.append(index_of_token_to_flip)

                # TODO(mattg): This is quite a bit of a hack for getting the vocab id...  I
                # don't have better ideas at the moment, though.
                input_tokens = text_field._indexed_tokens[self.namespace]
                original_id_of_token_to_flip = input_tokens[index_of_token_to_flip]

                # Choose the replacement with the first-order Taylor approximation.
                new_id = self._first_order_taylor(
                    grad[index_of_token_to_flip], original_id_of_token_to_flip, sign
                )

                # Swap the token in, and mark the instance as un-indexed so the text field is
                # re-indexed with the new token.
                replacement_text = self.vocab._index_to_token[self.namespace][
                    new_id
                ]  # type: ignore
                text_field.tokens[index_of_token_to_flip] = Token(replacement_text)
                labeled.indexed = False

                # Re-run the model, then unbatch its outputs before labeling.
                grads, outputs = self.predictor.get_gradients([labeled])
                for key, value in outputs.items():
                    if isinstance(value, torch.Tensor):
                        outputs[key] = value.detach().cpu().numpy().squeeze()
                    elif isinstance(value, list):
                        outputs[key] = value[0]

                # TODO(mattg): taking the first result here seems brittle, if we're in a case
                # where there are multiple predictions.
                labeled_instance = self.predictor.predictions_to_labeled_instances(
                    labeled, outputs
                )[0]

                has_changed = utils.instance_has_changed(labeled_instance, fields_to_compare)
                if targeted:
                    # With a target we want to *match* it, i.e. no difference.
                    if not has_changed:
                        break
                elif has_changed:
                    # Without a target, any change of prediction is a success.
                    break

            final_tokens.append(text_field.tokens)

        # NOTE(review): `outputs` is whatever the last loop iteration left behind, so only the
        # last attacked instance's outputs are reported.
        return sanitize({"final": final_tokens, "original": original_tokens, "outputs": outputs})
示例#5
0
文件: hotflip.py 项目: c4n/allennlp
                # Get model predictions on instance, and then label the instances
                grads, outputs = self.predictor.get_gradients([instance])  # predictions
                for key, output in outputs.items():
                    if isinstance(output, torch.Tensor):
                        outputs[key] = output.detach().cpu().numpy().squeeze()
                    elif isinstance(output, list):
                        outputs[key] = output[0]

                # TODO(mattg): taking the first result here seems brittle, if we're in a case where
                # there are multiple predictions.
                labeled_instance = self.predictor.predictions_to_labeled_instances(
                    instance, outputs
                )[0]

                # If we've met our stopping criterion, we stop.
                has_changed = utils.instance_has_changed(labeled_instance, fields_to_compare)
                if target is None and has_changed:
                    # With no target, we just want to change the prediction.
                    break
                if target is not None and not has_changed:
                    # With a given target, we want to *match* the target, which we check by
                    # `not has_changed`.
                    break

            final_tokens.append(text_field.tokens)

        return sanitize({"final": final_tokens, "original": original_tokens, "outputs": outputs})

    def _first_order_taylor(self, grad: numpy.ndarray, token_idx: int, sign: int) -> int:
>>>>>>> 1a6a857d... Remove unwanted blank spaces
        """