Пример #1
0
def build_baegarg2019(model_wrapper, threshold_cosine=0.936338023, query_budget=None, max_candidates=50):
    """Assemble a BAE-style masked-LM word-swap attack.

    Modified from https://github.com/QData/TextAttack/blob/04b7c6f79bdb5301b360555bd5458c15aa2b8695/textattack/attack_recipes/bae_garg_2019.py

    Args:
        model_wrapper: Victim model handed to ``UntargetedClassification``.
        threshold_cosine: ``threshold`` of the ``UniversalSentenceEncoder``
            constraint, which is configured with ``metric="cosine"``.
        query_budget: When not ``None``, stored on the goal function's
            ``query_budget`` attribute; otherwise that attribute is untouched.
        max_candidates: ``max_candidates`` forwarded to ``WordSwapMaskedLM``.

    Returns:
        The ``Attack`` composed of the goal function, constraints,
        transformation and search method created here.
    """
    swap = WordSwapMaskedLM(
        method="bae", max_candidates=max_candidates, min_confidence=0.0
    )

    # The constraint order matches the original append sequence.
    constraints = [
        RepeatModification(),
        StopwordModification(),
        PartOfSpeech(allow_verb_noun_swap=True),
        UniversalSentenceEncoder(
            threshold=threshold_cosine,
            metric="cosine",
            compare_against_original=True,
            window_size=15,
            skip_text_shorter_than_window=True,
        ),
    ]

    goal = UntargetedClassification(model_wrapper)
    # Only override the budget when the caller asked for one.
    if query_budget is not None:
        goal.query_budget = query_budget

    search = GreedyWordSwapWIR(wir_method="delete")
    return Attack(goal, constraints, swap, search)
def Seq2SickCheng2018BlackBox(model, goal_function="non_overlapping"):
    """Build a greedy, gradient-free variant of the Seq2Sick attack.

    Cheng, Minhao, et al.
    Seq2Sick: Evaluating the Robustness of Sequence-to-Sequence Models with
    Adversarial Examples
    https://arxiv.org/abs/1803.01128

    This is a greedy re-implementation of the seq2sick attack method. It does
    not use gradient descent.

    Args:
        model: Model passed to ``NonOverlappingOutput``.
        goal_function: Currently ignored: the body never reads this argument
            and always uses ``NonOverlappingOutput``.
            NOTE(review): confirm whether other values were meant to be
            supported.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # The goal is always non-overlapping output, whatever `goal_function` says.
    goal = NonOverlappingOutput(model)

    # TODO: implement the transformation / search method the way seq2sick does.
    swap = WordSwapEmbedding(max_candidates=50)

    constraints = [
        # Don't modify the same word twice or stopwords.
        RepeatModification(),
        StopwordModification(),
        # "In these experiments, we hold the maximum difference on edit
        # distance (epsilon) to a constant 30 for each sample."
        LevenshteinEditDistance(30),
    ]

    # Greedily swap words with "Word Importance Ranking".
    search = GreedyWordSwapWIR()

    return Attack(goal, constraints, swap, search)
Пример #3
0
 def build(model, max_num_word_swaps=1):
     """Build a character-level typo attack from four composed word swaps.

     Args:
         model: Model passed to ``UntargetedClassification``.
         max_num_word_swaps: ``max_num_words`` for ``MaxWordsPerturbed``.

     Returns:
         The ``Attack`` composed of the pieces built below.
     """
     # Every swap gets the same options; the first and last letter of each
     # word are ignored, as in the paper.
     char_opts = dict(random_one=False,
                      skip_first_char=True,
                      skip_last_char=True)
     typo_swaps = [
         WordSwapNeighboringCharacterSwap(**char_opts),
         WordSwapRandomCharacterDeletion(**char_opts),
         WordSwapRandomCharacterInsertion(**char_opts),
         WordSwapQWERTY(**char_opts),
     ]
     transformation = CompositeTransformation(typo_swaps)

     # Only edit words of length >= 4, and at most max_num_word_swaps words.
     # Because the same word is never edited twice, max_num_word_swaps is
     # really the max number of character changes that can be made. The
     # paper looks at 1 and 2 char attacks.
     min_length = MinWordLength(min_length=4)
     no_stopwords = StopwordModification()
     word_cap = MaxWordsPerturbed(max_num_words=max_num_word_swaps)
     no_repeats = RepeatModification()
     constraints = [min_length, no_stopwords, word_cap, no_repeats]

     # Untargeted attack driven by a greedy search.
     goal = UntargetedClassification(model)
     search = GreedySearch()
     return Attack(goal, constraints, transformation, search)
    def build(model):
        """Build a TextBugger-style attack from five "bug generation" methods.

        Args:
            model: Model passed to ``UntargetedClassification``.

        Returns:
            The ``Attack`` composed of the pieces built below.
        """
        # "we propose five bug generation methods for TEXTBUGGER"
        bug_generators = [
            # (1) Insert: insert a space into the word. Words are generally
            # segmented by spaces in English, so inserting spaces into words
            # can deceive classifiers.
            WordSwapRandomCharacterInsertion(
                random_one=True,
                letters_to_insert=" ",
                skip_first_char=True,
                skip_last_char=True,
            ),
            # (2) Delete: delete a random character of the word except for
            # the first and the last character.
            WordSwapRandomCharacterDeletion(
                random_one=True, skip_first_char=True, skip_last_char=True
            ),
            # (3) Swap: swap two random adjacent letters in the word without
            # altering the first or last letter; a common occurrence when
            # typing quickly.
            WordSwapNeighboringCharacterSwap(
                random_one=True, skip_first_char=True, skip_last_char=True
            ),
            # (4) Substitute-C (Sub-C): replace characters with visually
            # similar characters (e.g. "o" -> "0", "l" -> "1", "a" -> "@") or
            # with adjacent characters on the keyboard (e.g. "m" -> "n").
            WordSwapHomoglyphSwap(),
            # (5) Substitute-W (Sub-W): replace a word with its top-k nearest
            # neighbors in a context-aware word vector space. The paper uses
            # the pre-trained GloVe model provided by Stanford with topk = 5.
            WordSwapEmbedding(max_candidates=5),
        ]
        transformation = CompositeTransformation(bug_generators)

        constraints = [
            RepeatModification(),
            StopwordModification(),
            # The paper encodes sentences with the Universal Sentence Encoder
            # and measures cosine similarity between original and adversarial
            # texts: "the semantic similarity threshold \eps is set as 0.8 to
            # guarantee a good trade-off between quality and strength of the
            # generated adversarial text."
            UniversalSentenceEncoder(threshold=0.8),
        ]

        # Goal is untargeted classification.
        goal = UntargetedClassification(model)

        # Greedily swap words with "Word Importance Ranking".
        search = GreedyWordSwapWIR(wir_method="delete")

        return Attack(goal, constraints, transformation, search)
Пример #5
0
    def build(model_wrapper):
        """Build a HowNet word-swap attack searched with particle swarm optimization.

        Args:
            model_wrapper: Model passed to ``UntargetedClassification``.

        Returns:
            The ``Attack`` composed of the pieces built below.
        """
        # Swap words with their synonyms extracted based on the HowNet.
        swap = WordSwapHowNet()

        constraints = [
            # Don't modify the same word twice or stopwords.
            RepeatModification(),
            StopwordModification(),
            # During entailment only the hypothesis should be edited; the
            # premise is kept the same.
            InputColumnModification(["premise", "hypothesis"], {"premise"}),
        ]

        # Untargeted classification is used for demo purposes; it can be
        # switched to a targeted goal.
        goal = UntargetedClassification(model_wrapper)

        # Word substitution is driven by Particle Swarm Optimization (PSO).
        search = ParticleSwarmOptimization(pop_size=60, max_iters=20)

        return Attack(goal, constraints, swap, search)
Пример #6
0
def MorpheusTan2020(model):
    """Build the Morpheus inflectional-perturbation attack.

    Samson Tan, Shafiq Joty, Min-Yen Kan, Richard Socher.

    It's Morphin' Time! Combating Linguistic Discrimination with Inflectional Perturbations

    https://www.aclweb.org/anthology/2020.acl-main.263/

    Args:
        model: Model passed to ``MinimizeBleu``.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # Minimize the BLEU score between the model output for the perturbed
    # input sequence and the reference translation.
    bleu_goal = MinimizeBleu(model)

    # Candidate edits replace words with their inflections.
    inflection_swap = WordSwapInflections()

    # Don't modify the same word twice or stopwords.
    no_repeats = RepeatModification()
    no_stopwords = StopwordModification()

    # Greedily swap words (see pseudocode, Algorithm 1 of the paper).
    greedy = GreedySearch()

    return Attack(bleu_goal, [no_repeats, no_stopwords], inflection_swap, greedy)
Пример #7
0
    def build(model, ensemble: bool = False):
        """Build a BERT-Attack style masked-LM word-swap attack.

        Args:
            model: Model passed to ``UntargetedClassification``.
            ensemble: Forwarded as ``ensemble`` to ``GreedyWordSwapWIR``.

        Returns:
            The ``Attack`` composed of the pieces built below.
        """
        # [from correspondence with the author]
        # Candidate size K is set to 48 for all data-sets.
        swap = WordSwapMaskedLM(method="bert-attack", max_candidates=48)

        # "As used in TextFooler (Jin et al., 2019), we also use Universal
        # Sentence Encoder (Cer et al., 2018) to measure the semantic
        # consistency between the adversarial sample and the original
        # sequence. To balance between semantic preservation and attack
        # success rate, we set up a threshold of semantic similarity score to
        # filter the less similar examples."
        #
        # [from correspondence with the author]
        # "Over the full texts, after generating all the adversarial samples,
        # we filter out low USE score samples. Thus the success rate is lower
        # but the USE score can be higher. (actually USE score is not a golden
        # metric, so we simply measure the USE score over the final texts for
        # a comparison with TextFooler). For datasets like IMDB, we set a
        # higher threshold between 0.4-0.7; for datasets like MNLI, we set
        # threshold between 0-0.2."
        #
        # Since the threshold in the real world can't be determined from the
        # training data, a fixed threshold of 0.2 is used as the fairest choice.
        use_kwargs = dict(
            threshold=0.2,
            metric="cosine",
            compare_against_original=True,
            window_size=None,
        )

        constraints = [
            # Don't modify the same word twice or stopwords.
            RepeatModification(),
            StopwordModification(),
            # "We only take epsilon percent of the most important words since
            # we tend to keep perturbations minimum."
            #
            # [from correspondence with the author]
            # "Word percentage allowed to change is set to 0.4 for most
            # data-sets, this parameter is trivial since most attacks only
            # need a few changes. This epsilon is only used to avoid too much
            # queries on those very hard samples."
            MaxWordsPerturbed(max_percent=0.4),
            UniversalSentenceEncoder(**use_kwargs),
        ]

        # Goal is untargeted classification.
        goal = UntargetedClassification(model)

        # "We first select the words in the sequence which have a high
        # significance influence on the final output logit. Let
        # S = [w0, ..., wi, ...] denote the input sentence, and oy(S) denote
        # the logit output by the target model for correct label y, the
        # importance score Iwi is defined as Iwi = oy(S) - oy(S\wi), where
        # S\wi = [w0, ..., wi-1, [MASK], wi+1, ...] is the sentence after
        # replacing wi with [MASK]. Then we rank all the words according to
        # the ranking score Iwi in descending order to create word list L."
        search = GreedyWordSwapWIR(wir_method="unk", ensemble=ensemble)

        return Attack(goal, constraints, swap, search)
def InputReductionFeng2018(model):
    """Build the input-reduction attack.

    Feng, Wallace, Grissom, Iyyer, Rodriguez, Boyd-Graber. (2018).

    Pathologies of Neural Models Make Interpretations Difficult.

    ArXiv, abs/1804.07781.

    Args:
        model: Model passed to the ``InputReduction`` goal function.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # At each step, the word with the lowest importance value is removed
    # until the model changes its prediction.
    deletion = WordDeletion()

    constraints = [RepeatModification(), StopwordModification()]

    # The goal function is InputReduction, created as maximizable.
    reduction_goal = InputReduction(model, maximizable=True)

    # "For each word in an input sentence, we measure its importance by the
    # change in the confidence of the original prediction when we remove
    # that word from the sentence."
    #
    # "Instead of looking at the words with high importance values (what
    # interpretation methods commonly do) we take a complementary approach
    # and study how the model behaves when the supposedly unimportant words
    # are removed."
    ranked_search = GreedyWordSwapWIR(wir_method="delete")

    return Attack(reduction_goal, constraints, deletion, ranked_search)
Пример #9
0
 def build(model):
     """Build a WordNet synonym-swap attack ranked by weighted saliency.

     Args:
         model: Model passed to ``UntargetedClassification``.

     Returns:
         The ``Attack`` composed of the pieces built below.
     """
     synonym_swap = WordSwapWordNet()
     no_repeats = RepeatModification()
     no_stopwords = StopwordModification()
     goal = UntargetedClassification(model)
     # Words are searched based on a combination of their saliency score and
     # how efficient the WordSwap transform is.
     search = GreedyWordSwapWIR("weighted-saliency")
     return Attack(goal, [no_repeats, no_stopwords], synonym_swap, search)
Пример #10
0
def GeneticAlgorithmAlzantot2018(model):
    """Build the Alzantot et al. genetic-algorithm attack.

    Alzantot, M., Sharma, Y., Elgohary, A., Ho, B., Srivastava, M.B., &
    Chang, K. (2018).

    Generating Natural Language Adversarial Examples.

    https://arxiv.org/abs/1804.07998

    Args:
        model: Model passed to ``UntargetedClassification``.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # Swap words with their embedding nearest-neighbors.
    # Embedding: Counter-fitted Paragram Embeddings.
    # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
    swap = WordSwapEmbedding(max_candidates=8)

    constraints = [
        # Don't modify the same word twice or stopwords.
        RepeatModification(),
        StopwordModification(),
        # During entailment only the hypothesis should be edited; the premise
        # is kept the same.
        InputColumnModification(["premise", "hypothesis"], {"premise"}),
        # Maximum words perturbed percentage of 20%.
        MaxWordsPerturbed(max_percent=0.2),
        # Maximum word embedding euclidean distance of 0.5.
        WordEmbeddingDistance(max_mse_dist=0.5,
                              compare_against_original=False),
        # Language model constraint.
        Google1BillionWordsLanguageModel(top_n_per_index=4,
                                         compare_against_original=False),
    ]

    # Goal is untargeted classification.
    goal = UntargetedClassification(model)

    # Perform word substitution with a genetic algorithm.
    genetic_search = GeneticAlgorithm(pop_size=60,
                                      max_iters=20,
                                      post_crossover_check=False)

    return Attack(goal, constraints, swap, genetic_search)
    def build(model):
        """Build the Alzantot-style genetic attack without a language-model constraint.

        Args:
            model: Model passed to ``UntargetedClassification``.

        Returns:
            The ``Attack`` composed of the pieces built below.
        """
        # Swap words with their embedding nearest-neighbors.
        # Embedding: Counter-fitted Paragram Embeddings.
        # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
        swap = WordSwapEmbedding(max_candidates=8)

        # NOTE: the Google1BillionWordsLanguageModel constraint
        # (top_n_per_index=4, compare_against_original=False) was commented
        # out in this recipe, so it is not part of the list below.
        constraints = [
            # Don't modify the same word twice or stopwords.
            RepeatModification(),
            StopwordModification(),
            # During entailment only the hypothesis should be edited; the
            # premise is kept the same.
            InputColumnModification(["premise", "hypothesis"], {"premise"}),
            # Maximum words perturbed percentage of 20%.
            MaxWordsPerturbed(max_percent=0.2),
            # Maximum word embedding euclidean distance of 0.5.
            WordEmbeddingDistance(max_mse_dist=0.5,
                                  compare_against_original=False),
        ]

        # Goal is untargeted classification.
        goal = UntargetedClassification(model)

        # Perform word substitution with a genetic algorithm.
        genetic_search = AlzantotGeneticAlgorithm(pop_size=60,
                                                  max_iters=20,
                                                  post_crossover_check=False)

        return Attack(goal, constraints, swap, genetic_search)
Пример #12
0
def DeepWordBugGao2018(model, use_all_transformations=True):
    """Build the DeepWordBug character-level attack.

    Gao, Lanchantin, Soffa, Qi.

    Black-box Generation of Adversarial Text Sequences to Evade Deep Learning
    Classifiers.

    https://arxiv.org/abs/1801.04354

    Args:
        model: Model passed to ``UntargetedClassification``.
        use_all_transformations: When truthy, compose the four character-level
            swaps; otherwise use only ``WordSwapRandomCharacterSubstitution``.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # Swap characters out from words.
    if not use_all_transformations:
        # "We use the Combined Score and the Substitution Transformer to
        # generate adversarial samples, with the maximum edit distance
        # difference of 30 (ϵ = 30)."
        transformation = WordSwapRandomCharacterSubstitution()
    else:
        # "We propose four similar methods":
        four_swaps = [
            # (1) Swap: Swap two adjacent letters in the word.
            WordSwapNeighboringCharacterSwap(),
            # (2) Substitution: Substitute a letter in the word with a random letter.
            WordSwapRandomCharacterSubstitution(),
            # (3) Deletion: Delete a random letter from the word.
            WordSwapRandomCharacterDeletion(),
            # (4) Insertion: Insert a random letter in the word.
            WordSwapRandomCharacterInsertion(),
        ]
        transformation = CompositeTransformation(four_swaps)

    constraints = [
        # Don't modify the same word twice or stopwords.
        RepeatModification(),
        StopwordModification(),
        # "In these experiments, we hold the maximum difference on edit
        # distance (ϵ) to a constant 30 for each sample."
        LevenshteinEditDistance(30),
    ]

    # Goal is untargeted classification.
    goal = UntargetedClassification(model)

    # Greedily swap words with "Word Importance Ranking".
    search = GreedyWordSwapWIR()

    return Attack(goal, constraints, transformation, search)
Пример #13
0
def Kuleshov2017(model):
    """Build the Kuleshov et al. embedding-swap attack.

    Kuleshov, V. et al.

    Generating Natural Language Adversarial Examples.

    https://openreview.net/pdf?id=r1QZ3zbAZ.

    Args:
        model: Model passed to ``UntargetedClassification``.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # "Specifically, in all experiments, we used a target of τ = 0.7,
    # a neighborhood size of N = 15, and parameters λ_1 = 0.2 and δ = 0.5; we
    # set the syntactic bound to λ_2 = 2 nats for sentiment analysis"

    # Word swap with top-15 counter-fitted embedding neighbors.
    swap = WordSwapEmbedding(max_candidates=15)

    constraints = [
        # Don't modify the same word twice or stopwords.
        RepeatModification(),
        StopwordModification(),
        # Maximum of 50% of words perturbed (δ in the paper).
        MaxWordsPerturbed(max_percent=0.5),
        # Maximum thought vector Euclidean distance of λ_1 = 0.2. (eq. 4)
        ThoughtVector(embedding_type='paragramcf',
                      threshold=0.2,
                      metric='max_euclidean'),
        # Maximum language model log-probability difference of λ_2 = 2. (eq. 5)
        GPT2(max_log_prob_diff=2.0),
    ]

    # Goal is untargeted classification: reduce original probability score
    # to below τ = 0.7 (Algorithm 1).
    goal = UntargetedClassification(model, target_max_score=0.7)

    # The search method is GreedySearch (not a genetic algorithm).
    search = GreedySearch()

    return Attack(goal, constraints, swap, search)
Пример #14
0
def Pruthi2019(model, max_num_word_swaps=1):
    """Build the typo attack from Pruthi et al., 2019.

    An implementation of the attack used in "Combating Adversarial
    Misspellings with Robust Word Recognition", Pruthi et al., 2019.

    This attack focuses on a small number of character-level changes
    that simulate common typos. It combines:
        - Swapping neighboring characters
        - Deleting characters
        - Inserting characters
        - Swapping characters for adjacent keys on a QWERTY keyboard.

    https://arxiv.org/abs/1905.11268

    :param model: Model to attack.
    :param max_num_word_swaps: Maximum number of modifications to allow.
    :return: The ``Attack`` composed of the pieces built below.
    """
    # Four character-based transforms, instantiated in this order with the
    # same options; the first and last letter of each word are ignored, as in
    # the paper.
    swap_classes = (
        WordSwapNeighboringCharacterSwap,
        WordSwapRandomCharacterDeletion,
        WordSwapRandomCharacterInsertion,
        WordSwapQWERTY,
    )
    transformation = CompositeTransformation(
        [
            swap_cls(random_one=False, skip_first_char=True, skip_last_char=True)
            for swap_cls in swap_classes
        ]
    )

    # Only edit words of length >= 4, and at most max_num_word_swaps words.
    # Because the same word is never edited twice, max_num_word_swaps is
    # really the max number of character changes that can be made. The paper
    # looks at 1 and 2 char attacks.
    constraints = [MinWordLength(min_length=4), StopwordModification()]
    constraints.append(MaxWordsPerturbed(max_num_words=max_num_word_swaps))
    constraints.append(RepeatModification())

    # Untargeted attack driven by a greedy search.
    goal = UntargetedClassification(model)
    search = GreedySearch()
    return Attack(goal, constraints, transformation, search)
def HotFlipEbrahimi2017(model):
    """Build the word-level HotFlip attack.

    Ebrahimi, J. et al. (2017)

    HotFlip: White-Box Adversarial Examples for Text Classification

    https://arxiv.org/abs/1712.06751

    This is a reproduction of the HotFlip word-level attack (section 5 of the
    paper).

    Args:
        model: Model passed to both ``WordSwapGradientBased`` and
            ``UntargetedClassification``.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # "HotFlip ... uses the gradient with respect to a one-hot input
    # representation to efficiently estimate which individual change has the
    # highest estimated loss."
    gradient_swap = WordSwapGradientBased(model, top_n=1)

    constraints = [
        # Don't modify the same word twice or stopwords.
        RepeatModification(),
        StopwordModification(),
        # 0. "We were able to create only 41 examples (2% of the correctly-
        # classified instances of the SST test set) with one or two flips."
        MaxWordsPerturbed(max_num_words=2),
        # 1. "The cosine similarity between the embedding of words is bigger
        # than a threshold (0.8)."
        WordEmbeddingDistance(min_cos_sim=0.8),
        # 2. "The two words have the same part-of-speech."
        PartOfSpeech(),
    ]

    # Goal is untargeted classification.
    goal = UntargetedClassification(model)

    # "HotFlip ... uses a beam search to find a set of manipulations that work
    # well together to confuse a classifier ... The adversary uses a beam size
    # of 10."
    beam = BeamSearch(beam_width=10)

    return Attack(goal, constraints, gradient_swap, beam)
Пример #16
0
    def build(model_wrapper, mlm=False):
        """Build attack recipe.

        Args:
            model_wrapper (:class:`~textattack.models.wrappers.ModelWrapper`):
                Model wrapper containing both the model and the tokenizer.
            mlm (:obj:`bool`, `optional`, defaults to :obj:`False`):
                If :obj:`True`, load `A2T-MLM` attack. Otherwise, load regular `A2T` attack.

        Returns:
            :class:`~textattack.Attack`: A2T attack.
        """
        # Constraints shared by both variants: no repeated or stopword edits,
        # premise left untouched, part-of-speech check, modification-rate cap.
        constraints = [RepeatModification(), StopwordModification()]
        input_column_modification = InputColumnModification(
            ["premise", "hypothesis"], {"premise"})
        constraints.append(input_column_modification)
        constraints.append(PartOfSpeech(allow_verb_noun_swap=False))
        constraints.append(MaxModificationRate(max_rate=0.1, min_threshold=4))
        # Sentence-encoder constraint configured with a cosine threshold of 0.9.
        sent_encoder = BERT(model_name="stsb-distilbert-base",
                            threshold=0.9,
                            metric="cosine")
        constraints.append(sent_encoder)

        if mlm:
            # A2T-MLM: candidates come from a masked language model.
            # (The redundant chained self-assignment was dropped here.)
            transformation = WordSwapMaskedLM(
                method="bae",
                max_candidates=20,
                min_confidence=0.0,
                batch_size=16)
        else:
            # Regular A2T: embedding-based swaps plus a word-embedding
            # cosine-similarity constraint that only this branch adds.
            transformation = WordSwapEmbedding(max_candidates=20)
            constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))

        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model_wrapper,
                                                 model_batch_size=32)
        #
        # Greedily swap words with "Word Importance Ranking".
        #
        search_method = GreedyWordSwapWIR(wir_method="gradient")

        return Attack(goal_function, constraints, transformation,
                      search_method)
Пример #17
0
def PSOZang2020(model):
    """Build the sememe-based PSO attack of Zang et al.

    Zang, Y., Yang, C., Qi, F., Liu, Z., Zhang, M., Liu, Q., & Sun, M. (2019).

    Word-level Textual Adversarial Attacking as Combinatorial Optimization.

    https://www.aclweb.org/anthology/2020.acl-main.540.pdf

    Methodology description quoted from the paper:

    "We propose a novel word substitution-based textual attack model, which reforms
    both the aforementioned two steps. In the first step, we adopt a sememe-based word
    substitution strategy, which can generate more candidate adversarial examples with
    better semantic preservation. In the second step, we utilize particle swarm optimization
    (Eberhart and Kennedy, 1995) as the adversarial example searching algorithm."

    And "Following the settings in Alzantot et al. (2018), we set the max iteration time G to 20."

    Args:
        model: Model passed to ``UntargetedClassification``.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # Swap words with their synonyms extracted based on the HowNet.
    hownet_swap = WordSwapHowNet()

    constraints = [
        # Don't modify the same word twice or stopwords.
        RepeatModification(),
        StopwordModification(),
        # During entailment only the hypothesis should be edited; the premise
        # is kept the same.
        InputColumnModification(["premise", "hypothesis"], {"premise"}),
    ]

    # Untargeted classification is used for demo purposes; it can be switched
    # to a targeted goal.
    goal = UntargetedClassification(model)

    # Word substitution is driven by Particle Swarm Optimization (PSO).
    pso = ParticleSwarmOptimization(pop_size=60, max_iters=20)

    return Attack(goal, constraints, hownet_swap, pso)
Пример #18
0
def build_attack(model_wrapper, target_class=-1):
    """Build a bert-attack-like recipe that can optionally be targeted.

    Same as bert-attack except:
    - it is TargetedClassification instead of Untargeted when target_class != -1
    - using "bae" instead of "bert-attack" because of bert-attack's problem for subtokens
    Modified from https://github.com/QData/TextAttack/blob/36dfce6bdab933bdeed3a2093ae411e93018ebbf/textattack/attack_recipes/bert_attack_li_2020.py

    Args:
        model_wrapper: Model passed to the goal function.
        target_class: ``-1`` selects ``UntargetedClassification``; any other
            value is forwarded as ``target_class`` to ``TargetedClassification``.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # The upstream recipe used method="bert-attack" with max_candidates=48.
    swap = WordSwapMaskedLM(method="bae", max_candidates=100)

    constraints = [
        RepeatModification(),
        StopwordModification(),
        MaxWordsPerturbed(max_percent=0.4),
        UniversalSentenceEncoder(
            threshold=0.2,
            metric="cosine",
            compare_against_original=True,
            window_size=None,
        ),
    ]

    # -1 is the sentinel for "no target class"; anything else changes the goal.
    if target_class != -1:
        goal = TargetedClassification(model_wrapper, target_class=target_class)
    else:
        goal = UntargetedClassification(model_wrapper)

    search = GreedyWordSwapWIR(wir_method="unk")

    return Attack(goal, constraints, swap, search)


# def build_attack_2(model_wrapper, target_class):
#     """
#     Same as HotFlipEbrahimi2017 attack except:
#     - it is TargetedClassification instead of Untargeted
#     """
#     transformation = WordSwapGradientBased(model_wrapper, top_n=1)
#     constraints = [RepeatModification(), StopwordModification()]
#     constraints.append(MaxWordsPerturbed(max_num_words=2))
#     constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))
#     constraints.append(PartOfSpeech())
#     goal_function = TargetedClassification(model_wrapper)
    
#     search_method = BeamSearch(beam_width=10)

#     return Attack(goal_function, constraints, transformation, search_method)
Пример #19
0
def IGAWang2019(model):
    """Build the improved genetic algorithm (IGA) attack.

    Xiaosen Wang, Hao Jin, Kun He (2019).

    Natural Language Adversarial Attack and Defense in Word Level.

    http://arxiv.org/abs/1909.06723

    Args:
        model: Model passed to ``UntargetedClassification``.

    Returns:
        The ``Attack`` composed of the pieces built below.
    """
    # Swap words with their embedding nearest-neighbors.
    # Embedding: Counter-fitted Paragram Embeddings.
    # The hyperparameter N ("Unrestricted") is fixed to 50.
    swap = WordSwapEmbedding(max_candidates=50)

    constraints = [
        # Don't modify the stopwords. Unlike most recipes in this file there
        # is no RepeatModification constraint here.
        StopwordModification(),
        # Maximum words perturbed percentage of 20%.
        MaxWordsPerturbed(max_percent=0.2),
        # Maximum word embedding euclidean distance δ of 0.5.
        WordEmbeddingDistance(max_mse_dist=0.5,
                              compare_against_original=False),
    ]

    # Goal is untargeted classification.
    goal = UntargetedClassification(model)

    # Perform word substitution with an improved genetic algorithm.
    # The hyperparameter values are fixed to S = 60, M = 20, λ = 5.
    search_options = dict(
        pop_size=60,
        max_iters=20,
        improved_genetic_algorithm=True,
        max_replace_times_per_index=5,
        post_crossover_check=False,
    )
    search = GeneticAlgorithm(**search_options)

    return Attack(goal, constraints, swap, search)
Пример #20
0
def PWWSRen2019(model):
    """Build the Probability Weighted Word Saliency (PWWS) attack.

    Implements the method from "Generating Natural Language Adversarial
    Examples through Probability Weighted Word Saliency", Ren et al., 2019.

    https://www.aclweb.org/anthology/P19-1103/

    Words are ranked for a synonym swap by combining their saliency score
    with the maximum effectiveness of swapping them. The Named Entity
    adversarial swap from the original paper is not included, because it
    needs the full dataset and the ground-truth labels in advance.

    Args:
        model: Victim model, handed to ``UntargetedClassification``.

    Returns:
        The assembled ``Attack``.
    """
    # Synonym candidates come from WordNet.
    swap = WordSwapWordNet()
    # Each word is changed at most once, and stopwords are left alone.
    rules = [RepeatModification(), StopwordModification()]
    goal = UntargetedClassification(model)
    # "pwws" ranking: word saliency combined with how effective the swap is.
    search = GreedyWordSwapWIR("pwws")
    return Attack(goal, rules, swap, search)
Пример #21
0
def Alzantot2018(model):
    """Build the genetic-algorithm word-substitution attack.

    Recipe after Alzantot, M., Sharma, Y., Elgohary, A., Ho, B.,
    Srivastava, M.B., & Chang, K. (2018), "Generating Natural Language
    Adversarial Examples".

    https://arxiv.org/abs/1801.00554

    The paper states: "[We] fix the hyperparameter values to S = 60, N = 8,
    K = 4, and δ = 0.5".

    Args:
        model: Victim model, handed to ``UntargetedClassification``.

    Returns:
        The assembled ``Attack``.
    """
    # N = 8 embedding nearest neighbours per word (counter-fitted Paragram
    # embeddings).
    swap = WordSwapEmbedding(max_candidates=8)

    rules = [
        # Each word is changed at most once, and stopwords are left alone.
        RepeatModification(),
        StopwordModification(),
        # No more than 20% of the words may be perturbed.
        MaxWordsPerturbed(max_percent=0.2),
        # Word-embedding Euclidean distance of at most 0.5.
        WordEmbeddingDistance(max_mse_dist=0.5),
        # Language-model constraint, configured with top_n_per_index=4.
        Google1BillionWordsLanguageModel(top_n_per_index=4),
    ]

    # The goal is untargeted classification.
    goal = UntargetedClassification(model)

    # Word substitution is driven by a genetic algorithm.
    search = GeneticAlgorithm(pop_size=60, max_iters=20)

    return Attack(goal, rules, swap, search)
Пример #22
0
    def build(model, use_all_transformations=True, ensemble: bool=False):
        """Build a character-level word-perturbation attack.

        Args:
            model: Victim model, handed to ``UntargetedClassification``.
            use_all_transformations: When truthy, combine all four
                character-level transformations; otherwise use random
                character substitution only.
            ensemble: Forwarded unchanged to ``GreedyWordSwapWIR``.

        Returns:
            The assembled ``Attack``.
        """
        if not use_all_transformations:
            # "We use the Combined Score and the Substitution Transformer to
            # generate adversarial samples, with the maximum edit distance
            # difference of 30 (ϵ = 30)."
            swap = WordSwapRandomCharacterSubstitution()
        else:
            # The four proposed character edits, combined into one
            # transformation:
            #   (1) Swap: swap two adjacent letters in the word.
            #   (2) Substitution: replace a letter with a random letter.
            #   (3) Deletion: delete a random letter from the word.
            #   (4) Insertion: insert a random letter in the word.
            char_edits = [
                WordSwapNeighboringCharacterSwap(),
                WordSwapRandomCharacterSubstitution(),
                WordSwapRandomCharacterDeletion(),
                WordSwapRandomCharacterInsertion(),
            ]
            swap = CompositeTransformation(char_edits)

        rules = [
            # Each word is changed at most once, and stopwords are left alone.
            RepeatModification(),
            StopwordModification(),
            # The maximum edit distance (ϵ) is held at a constant 30 per
            # sample.
            LevenshteinEditDistance(30),
        ]

        # The goal is untargeted classification.
        goal = UntargetedClassification(model)

        # Greedy word swap guided by "Word Importance Ranking".
        search = GreedyWordSwapWIR(ensemble=ensemble)

        return Attack(goal, rules, swap, search)
Пример #23
0
    def build(model_wrapper):
        """Build the improved-genetic-algorithm word-substitution attack.

        Args:
            model_wrapper: Victim model wrapper, handed to
                ``UntargetedClassification``.

        Returns:
            The assembled ``Attack``.
        """
        # Candidate replacements are embedding nearest neighbours
        # (counter-fitted Paragram embeddings). "N = Unrestricted" is
        # realised as 50 candidates.
        swap = WordSwapEmbedding(max_candidates=50)

        rules = [
            # Stopwords are never modified.
            StopwordModification(),
            # No more than 20% of the words may be perturbed.
            MaxWordsPerturbed(max_percent=0.2),
            # Word-embedding Euclidean distance bound δ = 0.5.
            WordEmbeddingDistance(
                max_mse_dist=0.5,
                compare_against_original=False,
            ),
        ]

        # The goal is untargeted classification.
        goal = UntargetedClassification(model_wrapper)

        # Improved genetic algorithm with the hyperparameters fixed to
        # S = 60, M = 20, λ = 5.
        iga_settings = {
            "pop_size": 60,
            "max_iters": 20,
            "max_replace_times_per_index": 5,
            "post_crossover_check": False,
        }
        search = ImprovedGeneticAlgorithm(**iga_settings)

        return Attack(goal, rules, swap, search)
Пример #24
0
    def build(model):
        """Build the input-reduction attack.

        At each step the word with the lowest importance value is removed,
        until the model changes its prediction.

        Args:
            model: Victim model, handed to the ``InputReduction`` goal
                function.

        Returns:
            The assembled ``Attack``.
        """
        # The only edit available is deleting a word.
        deletion = WordDeletion()

        # Each word is changed at most once, and stopwords are left alone.
        rules = [RepeatModification(), StopwordModification()]

        # Goal function: InputReduction, created with maximizable=True.
        goal = InputReduction(model, maximizable=True)

        # "For each word in an input sentence, we measure its importance by
        # the change in the confidence of the original prediction when we
        # remove that word from the sentence."
        #
        # "Instead of looking at the words with high importance values—what
        # interpretation methods commonly do—we take a complementary approach
        # and study how the model behaves when the supposedly unimportant
        # words are removed."
        search = GreedyWordSwapWIR(wir_method="delete")

        return Attack(goal, rules, deletion, search)
Пример #25
0
    def build(model):
        """Build the inflection-swap attack that minimizes BLEU.

        Args:
            model: Victim model, handed to the ``MinimizeBleu`` goal function.

        Returns:
            The assembled ``Attack``.
        """
        # Drive down the BLEU score between the model's output for the
        # perturbed input sequence and the reference translation.
        goal = MinimizeBleu(model)

        # Words are replaced by their inflections.
        swap = WordSwapInflections()

        # Each word is changed at most once, and stopwords are left alone.
        rules = [RepeatModification(), StopwordModification()]

        # Greedy word swapping (see pseudocode, Algorithm 1 of the paper).
        search = GreedySearch()

        return Attack(goal, rules, swap, search)
    def build(model, goal_function="non_overlapping"):
        """Build the greedy attack whose goal is non-overlapping output.

        Args:
            model: Victim model, handed to ``NonOverlappingOutput``.
            goal_function: Accepted for interface compatibility only. Its
                value is never read; the goal is always
                ``NonOverlappingOutput``.

        Returns:
            The assembled ``Attack``.
        """
        # The goal is non-overlapping output, whatever ``goal_function`` was
        # passed in.
        goal = NonOverlappingOutput(model)

        # Up to 50 embedding-based replacement candidates per word.
        swap = WordSwapEmbedding(max_candidates=50)

        rules = [
            # Each word is changed at most once, and stopwords are left alone.
            RepeatModification(),
            StopwordModification(),
            # The maximum edit distance (ϵ) is held at a constant 30 per
            # sample.
            LevenshteinEditDistance(30),
        ]

        # Greedy word swap guided by "Word Importance Ranking", using the
        # "unk" ranking method.
        search = GreedyWordSwapWIR(wir_method="unk")

        return Attack(goal, rules, swap, search)
    'thru', 'thus', 'to', 'too', 'toward', 'towards', 'under', 'unless',
    'until', 'up', 'upon', 'used', 've', 'was', 'wasn', "wasn't", 'we', 'were',
    'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever',
    'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
    'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
    'whole', 'whom', 'whose', 'why', 'with', 'within', 'without', 'won',
    "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you', "you'd",
    "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'
])

# Lax Constraints
#
# Module-level settings consumed by the CONSTRAINTS list below.
# Passed to MaxWordIndexModification as max_length.
MAX_LENGTH = 256
# Threshold handed to the UniversalSentenceEncoder constraint (angular metric).
USE_THRESHOLD = 0.9
# Forwarded to PartOfSpeech(allow_verb_noun_swap=...).
ALLOW_VERB_NOUN_SWAP = False
# Forwarded to PartOfSpeech(tagger_type=...).
TAGGER_TYPE = "flair"

# Constraint instances built once at import time from the settings above.
# STOPWORDS is the module-level stopword collection defined earlier in the file.
CONSTRAINTS = [
    RepeatModification(),
    StopwordModification(stopwords=STOPWORDS),
    MaxWordIndexModification(max_length=MAX_LENGTH),
    # NOTE(review): presumably restricts edits to the hypothesis and keeps the
    # premise fixed for entailment inputs — confirm against the constraint docs.
    InputColumnModification(["premise", "hypothesis"], {"premise"}),
    UniversalSentenceEncoder(
        threshold=USE_THRESHOLD,
        metric="angular",
        compare_against_original=False,
        window_size=15,
        skip_text_shorter_than_window=True,
    ),
    PartOfSpeech(tagger_type=TAGGER_TYPE,
                 allow_verb_noun_swap=ALLOW_VERB_NOUN_SWAP)
]
    def build(model):
        """Build the genetic-algorithm attack with a language-model constraint.

        The allowed substitutions follow Alzantot et al. (2018) with the
        modifications quoted from the paper in the comments below.

        Args:
            model: Victim model, handed to ``UntargetedClassification``.

        Returns:
            The assembled ``Attack``.
        """
        # Section 5: Experiments (quoted from the paper)
        #
        # We base our sets of allowed word substitutions S(x, i) on the
        # substitutions allowed by Alzantot et al. (2018). They demonstrated
        # that their substitutions lead to adversarial examples that are
        # qualitatively similar to the original input and retain the original
        # label, as judged by humans. Alzantot et al. (2018) define the
        # neighbors N(w) of a word w as the n = 8 nearest neighbors of w in a
        # "counter-fitted" word vector space where antonyms are far apart
        # (Mrkšić et al., 2016). The neighbors must also lie within some
        # Euclidean distance threshold. They also use a language model
        # constraint to avoid nonsensical perturbations: they allow
        # substituting x_i with x̃_i ∈ N(x_i) if and only if it does not
        # decrease the log-likelihood of the text under a pre-trained language
        # model by more than some threshold.
        #
        # [Footnote text that had been interleaved with the quote: "Note that
        # the model itself classifies using a different set of pre-trained
        # word vectors; the counter-fitted vectors are only used to define the
        # set of allowed substitution words."]
        #
        # We make three modifications to this approach:
        #
        # First, in Alzantot et al. (2018), the adversary applies
        # substitutions one at a time, and the neighborhoods and language
        # model scores are computed relative to the current altered version of
        # the input. This results in a hard-to-define attack surface, as
        # changing one word can allow or disallow changes to other words. It
        # also requires recomputing language model scores at each iteration of
        # the genetic attack, which is inefficient. Moreover, the same word
        # can be substituted multiple times, leading to semantic drift. We
        # define allowed substitutions relative to the original sentence x,
        # and disallow repeated substitutions.
        #
        # Second, we use a faster language model that allows us to query
        # longer contexts; Alzantot et al. (2018) use a slower language model
        # and could only query it with short contexts.
        #
        # Finally, we use the language model constraint only at test time; the
        # model is trained against all perturbations in N(w). This encourages
        # the model to be robust to a larger space of perturbations, instead
        # of specializing for the particular choice of language model. See
        # Appendix A.3 for further details. [This is a model-specific
        # adjustment, so does not affect the attack recipe.]
        #
        # Appendix A.3:
        #
        # In Alzantot et al. (2018), the adversary applies replacements one at
        # a time, and the neighborhoods and language model scores are computed
        # relative to the current altered version of the input. This results
        # in a hard-to-define attack surface, as the same word can be replaced
        # many times, leading to semantic drift. We instead pre-compute the
        # allowed substitutions S(x, i) at index i based on the original x. We
        # define S(x, i) as the set of x̃_i ∈ N(x_i) such that [inequality
        # lost in extraction; presumably a bound of δ on the drop in
        # log-probability over a window of radius W — TODO confirm against
        # the paper], where probabilities are assigned by a pre-trained
        # language model, and the window radius W and threshold δ are
        # hyperparameters. We use W = 6 and δ = 5.

        # N = 8 embedding nearest neighbours per word (counter-fitted Paragram
        # embeddings).
        # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and
        # δ = 0.5"
        swap = WordSwapEmbedding(max_candidates=8)

        rules = [
            # Each word is changed at most once, and stopwords are left alone.
            RepeatModification(),
            StopwordModification(),
            # No more than 20% of the words may be perturbed.
            MaxWordsPerturbed(max_percent=0.2),
            # Word-embedding Euclidean distance of at most 0.5.
            WordEmbeddingDistance(max_mse_dist=0.5),
            # Language-model constraint with W = 6 and δ = 5, evaluated
            # against the original text.
            LearningToWriteLanguageModel(
                window_size=6,
                max_log_prob_diff=5.0,
                compare_against_original=True,
            ),
        ]

        # The goal is untargeted classification.
        goal = UntargetedClassification(model)

        # Word substitution is driven by a genetic algorithm.
        ga_settings = {
            "pop_size": 60,
            "max_iters": 20,
            "post_crossover_check": False,
        }
        search = AlzantotGeneticAlgorithm(**ga_settings)

        return Attack(goal, rules, swap, search)
Пример #29
0
def TextFoolerJin2019(model):
    """Build the TextFooler attack.

    Recipe after Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019),
    "Is BERT Really Robust? Natural Language Attack on Text Classification
    and Entailment".

    https://arxiv.org/abs/1907.11932

    Args:
        model: Victim model, handed to ``UntargetedClassification``.

    Returns:
        The assembled ``Attack``.
    """
    # Candidates: the 50 closest embedding neighbours of each word
    # (counter-fitted PARAGRAM-SL999 vectors).
    swap = WordSwapEmbedding(max_candidates=50)

    # Stopword list taken from the TextFooler public implementation.
    # fmt: off
    stopwords = {
        "a", "about", "above", "across", "after", "afterwards", "again",
        "against", "ain", "all", "almost", "alone", "along", "already", "also",
        "although", "am", "among", "amongst", "an", "and", "another", "any",
        "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "aren",
        "aren't", "around", "as", "at", "back", "been", "before", "beforehand",
        "behind", "being", "below", "beside", "besides", "between", "beyond",
        "both", "but", "by", "can", "cannot", "could", "couldn", "couldn't",
        "d", "didn", "didn't", "doesn", "doesn't", "don", "don't", "down",
        "due", "during", "either", "else", "elsewhere", "empty", "enough",
        "even", "ever", "everyone", "everything", "everywhere", "except",
        "first", "for", "former", "formerly", "from", "hadn", "hadn't", "hasn",
        "hasn't", "haven", "haven't", "he", "hence", "her", "here",
        "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him",
        "himself", "his", "how", "however", "hundred", "i", "if", "in",
        "indeed", "into", "is", "isn", "isn't", "it", "it's", "its", "itself",
        "just", "latter", "latterly", "least", "ll", "may", "me", "meanwhile",
        "mightn", "mightn't", "mine", "more", "moreover", "most", "mostly",
        "must", "mustn", "mustn't", "my", "myself", "namely", "needn",
        "needn't", "neither", "never", "nevertheless", "next", "no", "nobody",
        "none", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of",
        "off", "on", "once", "one", "only", "onto", "or", "other", "others",
        "otherwise", "our", "ours", "ourselves", "out", "over", "per",
        "please", "s", "same", "shan", "shan't", "she", "she's", "should've",
        "shouldn", "shouldn't", "somehow", "something", "sometime",
        "somewhere", "such", "t", "than", "that", "that'll", "the", "their",
        "theirs", "them", "themselves", "then", "thence", "there",
        "thereafter", "thereby", "therefore", "therein", "thereupon", "these",
        "they", "this", "those", "through", "throughout", "thru", "thus", "to",
        "too", "toward", "towards", "under", "unless", "until", "up", "upon",
        "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren",
        "weren't", "what", "whatever", "when", "whence", "whenever", "where",
        "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
        "whether", "which", "while", "whither", "who", "whoever", "whole",
        "whom", "whose", "why", "with", "within", "without", "won", "won't",
        "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd", "you'll",
        "you're", "you've", "your", "yours", "yourself", "yourselves",
    }
    # fmt: on

    rules = [
        # Each word is changed at most once; the stopwords above are never
        # touched.
        RepeatModification(),
        StopwordModification(stopwords=stopwords),
        # During entailment, we should only edit the hypothesis - keep the
        # premise the same.
        InputColumnModification(["premise", "hypothesis"], {"premise"}),
        # Minimum word embedding cosine similarity of 0.5.
        # (The paper claims 0.7, but analysis of the released code and some
        # empirical results show that it's 0.5.)
        WordEmbeddingDistance(min_cos_sim=0.5),
        # Only replace words with the same part of speech (or nouns with
        # verbs).
        PartOfSpeech(allow_verb_noun_swap=True),
        # Universal Sentence Encoder with a minimum angular similarity of
        # ε = 0.7.
        #
        # In the TextFooler code, they forget to divide the angle between the
        # two embeddings by pi. So if the original threshold was that
        # 1 - sim >= 0.7, the new threshold is 1 - (0.3) / pi = 0.90445
        #
        # NOTE(review): other recipes in this file spell the keyword
        # `compare_against_original`; confirm which name the installed
        # library version accepts before changing it.
        UniversalSentenceEncoder(
            threshold=0.904458599,
            metric="angular",
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=True,
        ),
    ]

    # The goal is untargeted classification.
    goal = UntargetedClassification(model)

    # Greedy word swap guided by "Word Importance Ranking".
    search = GreedyWordSwapWIR()

    return Attack(goal, rules, swap, search)
Пример #30
0
    def build(model):
        """Build the CLARE (ContextuaLized AdversaRial Example) attack.

        One distilled-RoBERTa masked language model and its tokenizer are
        loaded once and shared by the replace, insert and merge
        transformations.

        Args:
            model: Victim model, handed to ``UntargetedClassification``.

        Returns:
            The assembled ``Attack``.
        """
        # "This paper presents CLARE, a ContextuaLized AdversaRial Example
        # generation model that produces fluent and grammatical outputs
        # through a mask-then-infill procedure. CLARE builds on a pre-trained
        # masked language model and modifies the inputs in a context-aware
        # manner. We propose three contextualized perturbations, Replace,
        # Insert and Merge, allowing for generating outputs of varied
        # lengths."
        #
        # "We experiment with a distilled version of RoBERTa
        # (RoBERTa_{distill}; Sanh et al., 2019) as the masked language model
        # for contextualized infilling."
        #
        # Because BAE and CLARE both use similar replacement methods, we use
        # BAE's replacement method here.

        # The checkpoint is used purely for mask infilling, so load it through
        # the masked-LM auto class rather than the causal-LM one. This matches
        # the checkpoint's masked-LM head and avoids instantiating a
        # decoder-style model that is never used as a decoder.
        shared_masked_lm = transformers.AutoModelForMaskedLM.from_pretrained(
            "distilroberta-base")
        shared_tokenizer = transformers.AutoTokenizer.from_pretrained(
            "distilroberta-base")
        transformation = CompositeTransformation([
            # Replace: BAE-style masked-LM word swap.
            WordSwapMaskedLM(
                method="bae",
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=50,
                min_confidence=5e-4,
            ),
            # Insert: masked-LM word insertion.
            WordInsertionMaskedLM(
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=50,
                min_confidence=0.0,
            ),
            # Merge: masked-LM word merge.
            WordMergeMaskedLM(
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=50,
                min_confidence=5e-3,
            ),
        ])

        #
        # Don't modify the same word twice or stopwords.
        #
        constraints = [RepeatModification(), StopwordModification()]

        # "A common choice of sim(·,·) is to encode sentences using neural
        # networks, and calculate their cosine similarity in the embedding
        # space (Jin et al., 2020)."
        # The original implementation uses similarity of 0.7.
        use_constraint = UniversalSentenceEncoder(
            threshold=0.7,
            metric="cosine",
            compare_against_original=True,
            window_size=15,
            skip_text_shorter_than_window=True,
        )
        constraints.append(use_constraint)

        # Goal is untargeted classification.
        # "The score is then the negative probability of predicting the gold
        # label from f, using [x_{adv}] as the input"
        goal_function = UntargetedClassification(model)

        # "To achieve this, we iteratively apply the actions, and first select
        # those minimizing the probability of outputting the gold label y
        # from f."
        #
        # "Only one of the three actions can be applied at each position, and
        # we select the one with the highest score."
        #
        # "Actions are iteratively applied to the input, until an adversarial
        # example is found or a limit of actions T is reached. Each step
        # selects the highest-scoring action from the remaining ones."
        #
        search_method = GreedySearch()

        return Attack(goal_function, constraints, transformation,
                      search_method)