def build(model):
    """Assemble the TextBugger attack against ``model``.

    The candidate perturbations are the five "bug" generators proposed for
    TextBugger; candidates are filtered by a Universal Sentence Encoder
    similarity constraint and explored greedily by word importance.

    Args:
        model: the victim model handed to ``UntargetedClassification``.

    Returns:
        The ``Attack`` built from the goal function, constraints,
        transformation and search method below.
    """
    bug_generators = [
        # (1) Insert: put a space inside the word. English words are
        # segmented by spaces, so an inserted space can deceive classifiers.
        WordSwapRandomCharacterInsertion(
            random_one=True,
            letters_to_insert=" ",
            skip_first_char=True,
            skip_last_char=True,
        ),
        # (2) Delete: drop one random character, never the first or last.
        WordSwapRandomCharacterDeletion(
            random_one=True, skip_first_char=True, skip_last_char=True
        ),
        # (3) Swap: exchange two random adjacent letters while leaving the
        # first and last letter alone (a common fast-typing mistake).
        WordSwapNeighboringCharacterSwap(
            random_one=True, skip_first_char=True, skip_last_char=True
        ),
        # (4) Substitute-C (Sub-C): replace characters with visually similar
        # ones (e.g. “o” -> “0”, “l” -> “1”, “a” -> “@”) or with adjacent
        # keyboard characters (e.g. “m” -> “n”).
        WordSwapHomoglyphSwap(),
        # (5) Substitute-W (Sub-W): replace a word with one of its top-k
        # nearest neighbours in a context-aware word vector space. The paper
        # uses the pre-trained GloVe model from Stanford with topk = 5.
        WordSwapEmbedding(max_candidates=5),
    ]
    transformation = CompositeTransformation(bug_generators)

    # The paper encodes sentences with the Universal Sentence Encoder and
    # measures cosine similarity between original and adversarial text:
    # "Furthermore, the semantic similarity threshold \eps is set as 0.8 to
    # guarantee a good trade-off between quality and strength of the
    # generated adversarial text."
    constraints = [
        RepeatModification(),
        StopwordModification(),
        UniversalSentenceEncoder(threshold=0.8),
    ]

    # Goal is untargeted classification.
    goal_function = UntargetedClassification(model)

    # Greedily swap words ranked by "Word Importance Ranking".
    search_method = GreedyWordSwapWIR(wir_method="delete")

    return Attack(goal_function, constraints, transformation, search_method)
def build_baegarg2019(model_wrapper, threshold_cosine=0.936338023, query_budget=None, max_candidates=50):
    """Build a BAE-style masked-LM word-swap attack.

    Modified from
    https://github.com/QData/TextAttack/blob/04b7c6f79bdb5301b360555bd5458c15aa2b8695/textattack/attack_recipes/bae_garg_2019.py

    Args:
        model_wrapper: victim model passed to ``UntargetedClassification``.
        threshold_cosine: threshold given to the Universal Sentence Encoder
            constraint.
        query_budget: when not ``None``, assigned to the goal function's
            ``query_budget`` attribute.
        max_candidates: number of masked-LM candidates per swap.

    Returns:
        The assembled ``Attack``.
    """
    transformation = WordSwapMaskedLM(
        method="bae", max_candidates=max_candidates, min_confidence=0.0
    )

    constraints = [
        RepeatModification(),
        StopwordModification(),
        PartOfSpeech(allow_verb_noun_swap=True),
        UniversalSentenceEncoder(
            threshold=threshold_cosine,
            metric="cosine",
            compare_against_original=True,
            window_size=15,
            skip_text_shorter_than_window=True,
        ),
    ]

    goal_function = UntargetedClassification(model_wrapper)
    # Only override the goal function's budget when the caller supplied one.
    if query_budget is not None:
        goal_function.query_budget = query_budget

    search_method = GreedyWordSwapWIR(wir_method="delete")
    return Attack(goal_function, constraints, transformation, search_method)
def build(model, ensemble: bool = False):
    """Assemble the BERT-Attack recipe against ``model``.

    Args:
        model: victim model passed to ``UntargetedClassification``.
        ensemble: forwarded unchanged to ``GreedyWordSwapWIR``.

    Returns:
        The assembled ``Attack``.
    """
    # [from correspondence with the author]
    # Candidate size K is set to 48 for all data-sets.
    transformation = WordSwapMaskedLM(method="bert-attack", max_candidates=48)

    constraints = [
        # Don't modify the same word twice or stopwords.
        RepeatModification(),
        StopwordModification(),
        # "We only take ε percent of the most important words since we tend
        # to keep perturbations minimum."
        #
        # [from correspondence with the author]
        # "Word percentage allowed to change is set to 0.4 for most
        # data-sets, this parameter is trivial since most attacks only need a
        # few changes. This epsilon is only used to avoid too much queries on
        # those very hard samples."
        MaxWordsPerturbed(max_percent=0.4),
        # "As used in TextFooler (Jin et al., 2019), we also use Universal
        # Sentence Encoder (Cer et al., 2018) to measure the semantic
        # consistency between the adversarial sample and the original
        # sequence. To balance between semantic preservation and attack
        # success rate, we set up a threshold of semantic similarity score to
        # filter the less similar examples."
        #
        # [from correspondence with author]
        # "Over the full texts, after generating all the adversarial samples,
        # we filter out low USE score samples. Thus the success rate is lower
        # but the USE score can be higher. (actually USE score is not a
        # golden metric, so we simply measure the USE score over the final
        # texts for a comparison with TextFooler). For datasets like IMDB, we
        # set a higher threshold between 0.4-0.7; for datasets like MNLI, we
        # set threshold between 0-0.2."
        #
        # Since the threshold in the real world can't be determined from the
        # training data, the TextAttack implementation uses a fixed
        # threshold - determined to be 0.2 to be most fair.
        UniversalSentenceEncoder(
            threshold=0.2,
            metric="cosine",
            compare_against_original=True,
            window_size=None,
        ),
    ]

    # Goal is untargeted classification.
    goal_function = UntargetedClassification(model)

    # "We first select the words in the sequence which have a high
    # significance influence on the final output logit. Let
    # S = [w0, ··· , wi ··· ] denote the input sentence, and oy(S) denote the
    # logit output by the target model for correct label y, the importance
    # score Iwi is defined as Iwi = oy(S) − oy(S\wi), where
    # S\wi = [w0, ··· , wi−1, [MASK], wi+1, ···] is the sentence after
    # replacing wi with [MASK]. Then we rank all the words according to the
    # ranking score Iwi in descending order to create word list L."
    search_method = GreedyWordSwapWIR(wir_method="unk", ensemble=ensemble)

    return Attack(goal_function, constraints, transformation, search_method)
def __init__(self, model="distilroberta-base", tokenizer="distilroberta-base", **kwargs):
    """Set up a replace/insert/merge masked-LM transformation.

    One language model and one tokenizer are loaded and shared by the three
    component transformations. A Universal Sentence Encoder constraint is
    appended to ``DEFAULT_CONSTRAINTS`` before delegating to the parent
    constructor.

    Args:
        model: name or path given to ``from_pretrained`` for the masked
            language model.
        tokenizer: name or path given to ``from_pretrained`` for the
            tokenizer.
        **kwargs: forwarded unchanged to the parent ``__init__``.
    """
    import transformers

    from textattack.transformations import (
        CompositeTransformation,
        WordInsertionMaskedLM,
        WordMergeMaskedLM,
        WordSwapMaskedLM,
    )

    # The model is used to fill in masked positions, so load it with the
    # masked-LM auto class rather than the causal-LM one.
    shared_masked_lm = transformers.AutoModelForMaskedLM.from_pretrained(model)
    shared_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer)

    transformation = CompositeTransformation(
        [
            WordSwapMaskedLM(
                method="bae",
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=50,
                min_confidence=5e-4,
            ),
            WordInsertionMaskedLM(
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=50,
                min_confidence=0.0,
            ),
            WordMergeMaskedLM(
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=50,
                min_confidence=5e-3,
            ),
        ]
    )

    use_constraint = UniversalSentenceEncoder(
        threshold=0.7,
        metric="cosine",
        compare_against_original=True,
        window_size=15,
        skip_text_shorter_than_window=True,
    )
    constraints = DEFAULT_CONSTRAINTS + [use_constraint]

    super().__init__(transformation, constraints=constraints, **kwargs)
def TextFoolerJin2019Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    """Build the TextFooler attack with adjusted constraints.

    Args:
        model: victim model passed to ``UntargetedClassification``.
        SE_thresh: threshold for the sentence-encoder constraint.
        sentence_encoder: ``'bert'`` selects the ``BERT`` encoder; any other
            value selects ``UniversalSentenceEncoder``.

    Returns:
        The configured ``GreedyWordSwapWIR`` attack object.
    """
    # Swap words with their embedding nearest-neighbors.
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors, 50 candidates.
    transformation = WordSwapEmbedding(max_candidates=50, textfooler_stopwords=True)

    # Minimum word embedding cosine similarity of 0.9.
    constraints = [WordEmbeddingDistance(min_cos_sim=0.9)]

    # Sentence-level similarity: cosine metric with threshold SE_thresh,
    # using whichever encoder the caller selected.
    encoder_cls = BERT if sentence_encoder == 'bert' else UniversalSentenceEncoder
    constraints.append(
        encoder_cls(
            threshold=SE_thresh,
            metric='cosine',
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=False,
        )
    )

    # Do grammar checking.
    constraints.append(LanguageTool(0))

    # Untargeted attack.
    goal_function = UntargetedClassification(model)

    # Greedily swap words with "Word Importance Ranking".
    return GreedyWordSwapWIR(
        goal_function,
        transformation=transformation,
        constraints=constraints,
        max_depth=None,
    )
def TextFoolerJin2019(model):
    """Build the TextFooler attack against ``model``.

    Args:
        model: victim model passed to ``UntargetedClassification``.

    Returns:
        The configured ``GreedyWordSwapWIR`` attack object.
    """
    # Swap words with their embedding nearest-neighbors.
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    # 50 nearest-neighbors with a cosine similarity of at least 0.5.
    # (The paper claims 0.7, but analysis of the code and some empirical
    # results show that it's definitely 0.5.)
    transformation = WordSwapEmbedding(max_candidates=50, textfooler_stopwords=True)

    constraints = [
        # Minimum word embedding cosine similarity of 0.5.
        WordEmbeddingDistance(min_cos_sim=0.5),
        # Only replace words with the same part of speech (or nouns with
        # verbs).
        PartOfSpeech(allow_verb_noun_swap=True),
        # Universal Sentence Encoder with a minimum angular similarity of
        # ε = 0.7.
        #
        # In the TextFooler code, they forget to divide the angle between the
        # two embeddings by pi. So if the original threshold was that
        # 1 - sim >= 0.7, the new threshold is 1 - (0.3) / pi = 0.90445
        UniversalSentenceEncoder(
            threshold=0.904458599,
            metric='angular',
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=True,
        ),
    ]

    # Goal is untargeted classification.
    goal_function = UntargetedClassification(model)

    # Greedily swap words with "Word Importance Ranking".
    return GreedyWordSwapWIR(
        goal_function,
        transformation=transformation,
        constraints=constraints,
        max_depth=None,
    )
def build_attack(model_wrapper, target_class=-1):
    """Build a BERT-Attack-like recipe, optionally targeted.

    Same as bert-attack except:
    - it is TargetedClassification instead of Untargeted when
      target_class != -1
    - using "bae" instead of "bert-attack" because of bert-attack's problem
      for subtokens

    Modified from
    https://github.com/QData/TextAttack/blob/36dfce6bdab933bdeed3a2093ae411e93018ebbf/textattack/attack_recipes/bert_attack_li_2020.py

    Args:
        model_wrapper: victim model passed to the goal function.
        target_class: ``-1`` selects the untargeted goal; any other value is
            passed to ``TargetedClassification`` as ``target_class``.

    Returns:
        The assembled ``Attack``.
    """
    # Previously: WordSwapMaskedLM(method="bert-attack", max_candidates=48)
    transformation = WordSwapMaskedLM(method="bae", max_candidates=100)

    constraints = [
        RepeatModification(),
        StopwordModification(),
        MaxWordsPerturbed(max_percent=0.4),
        UniversalSentenceEncoder(
            threshold=0.2,
            metric="cosine",
            compare_against_original=True,
            window_size=None,
        ),
    ]

    # We modify the goal: a concrete target class switches to a targeted one.
    if target_class != -1:
        goal_function = TargetedClassification(model_wrapper, target_class=target_class)
    else:
        goal_function = UntargetedClassification(model_wrapper)

    search_method = GreedyWordSwapWIR(wir_method="unk")
    return Attack(goal_function, constraints, transformation, search_method)


# def build_attack_2(model_wrapper, target_class):
#     """
#     Same as HotFlipEbrahimi2017 attack except:
#     - it is TargetedClassification instead of Untargeted
#     """
#     transformation = WordSwapGradientBased(model_wrapper, top_n=1)
#     constraints = [RepeatModification(), StopwordModification()]
#     constraints.append(MaxWordsPerturbed(max_num_words=2))
#     constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))
#     constraints.append(PartOfSpeech())
#     goal_function = TargetedClassification(model_wrapper)
#     search_method = BeamSearch(beam_width=10)
#     return Attack(goal_function, constraints, transformation, search_method)
def Alzantot2018Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    """Build the Alzantot genetic-algorithm attack with adjusted constraints.

    Args:
        model: victim model passed to ``UntargetedClassification``.
        SE_thresh: threshold for the sentence-encoder constraint.
        sentence_encoder: ``'bert'`` selects the ``BERT`` encoder; any other
            value selects ``UniversalSentenceEncoder``.

    Returns:
        The configured ``GeneticAlgorithm`` attack object.
    """
    # Swap words with their embedding nearest-neighbors.
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and
    # δ = 0.5"
    transformation = WordSwapEmbedding(max_candidates=50, textfooler_stopwords=True)

    # Minimum word embedding cosine similarity of 0.9.
    constraints = [WordEmbeddingDistance(min_cos_sim=0.9)]

    # Sentence-level similarity: cosine metric with threshold SE_thresh,
    # using whichever encoder the caller selected.
    encoder_cls = BERT if sentence_encoder == 'bert' else UniversalSentenceEncoder
    constraints.append(
        encoder_cls(
            threshold=SE_thresh,
            metric='cosine',
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=False,
        )
    )

    # Do grammar checking.
    constraints.append(LanguageTool(0))

    # Goal is untargeted classification.
    goal_function = UntargetedClassification(model)

    # Search for substitutions with a genetic algorithm.
    return GeneticAlgorithm(
        goal_function,
        transformation=transformation,
        constraints=constraints,
        pop_size=60,
        max_iters=20,
    )
class USEMetric(Metric):
    """Average Universal Sentence Encoder similarity over successful attacks."""

    def __init__(self, **kwargs):
        """Create the encoder and the empty accumulators.

        Args:
            **kwargs: accepted but not used by this constructor.
        """
        self.use_obj = UniversalSentenceEncoder()
        # NOTE(review): a second encoder instance is stored as ``.model``;
        # kept exactly as in the original - confirm this is intended.
        self.use_obj.model = UniversalSentenceEncoder()
        # Original / perturbed texts, appended pairwise by ``calculate``.
        self.original_candidates = []
        self.successful_candidates = []
        self.all_metrics = {}

    def calculate(self, results):
        """Calculate the average USE similarity on all successful attacks.

        Args:
            results (``AttackResult`` objects):
                Attack results for each instance in dataset

        Returns:
            ``self.all_metrics``, with ``"avg_attack_use_score"`` set to the
            mean similarity rounded to two decimals, or ``0.0`` when there
            is no successful attack to average over.
        """
        self.results = results

        for result in self.results:
            # Failed and skipped attacks have no adversarial text to score.
            if isinstance(result, (FailedAttackResult, SkippedAttackResult)):
                continue
            self.original_candidates.append(result.original_result.attacked_text)
            self.successful_candidates.append(result.perturbed_result.attacked_text)

        use_scores = [
            self.use_obj._sim_score(original, perturbed).item()
            for original, perturbed in zip(
                self.original_candidates, self.successful_candidates
            )
        ]

        # Guard the mean: with no successful attack the original divided by
        # zero.
        if use_scores:
            average = round(sum(use_scores) / len(use_scores), 2)
        else:
            average = 0.0
        self.all_metrics["avg_attack_use_score"] = average

        return self.all_metrics
def __init__(self, **kwargs):
    """Create the sentence encoder and the empty result accumulators.

    Args:
        **kwargs: accepted but not used by this constructor.
    """
    encoder = UniversalSentenceEncoder()
    # NOTE(review): a second encoder instance is stored as ``.model``, as in
    # the original - confirm this is intended.
    encoder.model = UniversalSentenceEncoder()
    self.use_obj = encoder

    # Paired lists of original / perturbed texts, plus the metrics dict.
    self.original_candidates, self.successful_candidates = [], []
    self.all_metrics = {}
'thru', 'thus', 'to', 'too', 'toward', 'towards', 'under', 'unless', 'until', 'up', 'upon', 'used', 've', 'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'with', 'within', 'without', 'won', "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves' ]) # Lax Constraints MAX_LENGTH = 256 USE_THRESHOLD = 0.9 ALLOW_VERB_NOUN_SWAP = False TAGGER_TYPE = "flair" CONSTRAINTS = [ RepeatModification(), StopwordModification(stopwords=STOPWORDS), MaxWordIndexModification(max_length=MAX_LENGTH), InputColumnModification(["premise", "hypothesis"], {"premise"}), UniversalSentenceEncoder( threshold=USE_THRESHOLD, metric="angular", compare_against_original=False, window_size=15, skip_text_shorter_than_window=True, ), PartOfSpeech(tagger_type=TAGGER_TYPE, allow_verb_noun_swap=ALLOW_VERB_NOUN_SWAP) ]
def TextFoolerJin2019(model):
    """Build the TextFooler attack against ``model``.

    Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019).

    Is BERT Really Robust? Natural Language Attack on Text Classification
    and Entailment.

    https://arxiv.org/abs/1907.11932

    Args:
        model: victim model passed to ``UntargetedClassification``.

    Returns:
        The assembled ``Attack``.
    """
    # Swap words with their 50 closest embedding nearest-neighbors.
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    transformation = WordSwapEmbedding(max_candidates=50)

    # Stopwords defined in the TextFooler public implementation.
    # fmt: off
    stopwords = {
        "a", "about", "above", "across", "after", "afterwards", "again", "against", "ain", "all", "almost", "alone", "along", "already", "also", "although", "am", "among", "amongst", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "aren", "aren't", "around", "as", "at", "back", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "could", "couldn", "couldn't", "d", "didn", "didn't", "doesn", "doesn't", "don", "don't", "down", "due", "during", "either", "else", "elsewhere", "empty", "enough", "even", "ever", "everyone", "everything", "everywhere", "except", "first", "for", "former", "formerly", "from", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "i", "if", "in", "indeed", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "latter", "latterly", "least", "ll", "may", "me", "meanwhile", "mightn", "mightn't", "mine", "more", "moreover", "most", "mostly", "must", "mustn", "mustn't", "my", "myself", "namely", "needn", "needn't", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
        "out", "over", "per", "please", "s", "same", "shan", "shan't", "she", "she's", "should've", "shouldn", "shouldn't", "somehow", "something", "sometime", "somewhere", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "this", "those", "through", "throughout", "thru", "thus", "to", "too", "toward", "towards", "under", "unless", "until", "up", "upon", "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "with", "within", "without", "won", "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
    }
    # fmt: on

    constraints = [
        # Don't modify the same word twice or the stopwords above.
        RepeatModification(),
        StopwordModification(stopwords=stopwords),
        # During entailment, we should only edit the hypothesis - keep the
        # premise the same.
        InputColumnModification(["premise", "hypothesis"], {"premise"}),
        # Minimum word embedding cosine similarity of 0.5.
        # (The paper claims 0.7, but analysis of the released code and some
        # empirical results show that it's 0.5.)
        WordEmbeddingDistance(min_cos_sim=0.5),
        # Only replace words with the same part of speech (or nouns with
        # verbs).
        PartOfSpeech(allow_verb_noun_swap=True),
        # Universal Sentence Encoder with a minimum angular similarity of
        # ε = 0.7.
        #
        # In the TextFooler code, they forget to divide the angle between the
        # two embeddings by pi. So if the original threshold was that
        # 1 - sim >= 0.7, the new threshold is 1 - (0.3) / pi = 0.90445
        UniversalSentenceEncoder(
            threshold=0.904458599,
            metric="angular",
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=True,
        ),
    ]

    # Goal is untargeted classification.
    goal_function = UntargetedClassification(model)

    # Greedily swap words with "Word Importance Ranking".
    search_method = GreedyWordSwapWIR()

    return Attack(goal_function, constraints, transformation, search_method)
def build(model):
    """Assemble the CLARE attack against ``model``.

    Args:
        model: victim model passed to ``UntargetedClassification``.

    Returns:
        The assembled ``Attack``.
    """
    # "This paper presents CLARE, a ContextuaLized AdversaRial Example
    # generation model that produces fluent and grammatical outputs through a
    # mask-then-infill procedure. CLARE builds on a pre-trained masked
    # language model and modifies the inputs in a context-aware manner. We
    # propose three contextualized perturbations, Replace, Insert and Merge,
    # allowing for generating outputs of varied lengths."
    #
    # "We experiment with a distilled version of RoBERTa (RoBERTa_{distill};
    # Sanh et al., 2019) as the masked language model for contextualized
    # infilling."
    #
    # Because BAE and CLARE both use similar replacement methods, we use
    # BAE's replacement method here.
    #
    # The model fills in masked positions, so it is loaded with the masked-LM
    # auto class rather than the causal-LM one.
    shared_masked_lm = transformers.AutoModelForMaskedLM.from_pretrained(
        "distilroberta-base"
    )
    shared_tokenizer = transformers.AutoTokenizer.from_pretrained(
        "distilroberta-base"
    )

    transformation = CompositeTransformation(
        [
            WordSwapMaskedLM(
                method="bae",
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=50,
                min_confidence=5e-4,
            ),
            WordInsertionMaskedLM(
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=50,
                min_confidence=0.0,
            ),
            WordMergeMaskedLM(
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=50,
                min_confidence=5e-3,
            ),
        ]
    )

    # Don't modify the same word twice or stopwords.
    constraints = [RepeatModification(), StopwordModification()]

    # "A common choice of sim(·,·) is to encode sentences using neural
    # networks, and calculate their cosine similarity in the embedding space
    # (Jin et al., 2020)."
    # The original implementation uses similarity of 0.7.
    use_constraint = UniversalSentenceEncoder(
        threshold=0.7,
        metric="cosine",
        compare_against_original=True,
        window_size=15,
        skip_text_shorter_than_window=True,
    )
    constraints.append(use_constraint)

    # Goal is untargeted classification.
    # "The score is then the negative probability of predicting the gold
    # label from f, using [x_{adv}] as the input"
    goal_function = UntargetedClassification(model)

    # "To achieve this, we iteratively apply the actions, and first select
    # those minimizing the probability of outputting the gold label y from f."
    #
    # "Only one of the three actions can be applied at each position, and we
    # select the one with the highest score."
    #
    # "Actions are iteratively applied to the input, until an adversarial
    # example is found or a limit of actions T is reached. Each step selects
    # the highest-scoring action from the remaining ones."
    search_method = GreedySearch()

    return Attack(goal_function, constraints, transformation, search_method)
def TextFoolerJin2019Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    """Build the TextFooler attack with adjusted constraints.

    Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019).

    Is BERT Really Robust? Natural Language Attack on Text Classification
    and Entailment.

    https://arxiv.org/abs/1907.11932

    Constraints adjusted from paper to align with human evaluation.

    Args:
        model: victim model passed to ``UntargetedClassification``.
        SE_thresh: threshold for the sentence-encoder constraint.
        sentence_encoder: ``'bert'`` selects the ``BERT`` encoder; any other
            value selects ``UniversalSentenceEncoder``.

    Returns:
        The assembled ``Attack``.
    """
    # Swap words with their embedding nearest-neighbors.
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors, 50 candidates.
    transformation = WordSwapEmbedding(max_candidates=50)

    constraints = [
        # Don't modify the same word twice or stopwords.
        RepeatModification(),
        StopwordModification(),
        # Minimum word embedding cosine similarity of 0.9.
        WordEmbeddingDistance(min_cos_sim=0.9),
    ]

    # Sentence-level similarity: cosine metric with threshold SE_thresh,
    # using whichever encoder the caller selected.
    encoder_cls = BERT if sentence_encoder == 'bert' else UniversalSentenceEncoder
    constraints.append(
        encoder_cls(
            threshold=SE_thresh,
            metric='cosine',
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=False,
        )
    )

    # Do grammar checking.
    constraints.append(LanguageTool(0))

    # Untargeted attack.
    goal_function = UntargetedClassification(model)

    # Greedily swap words with "Word Importance Ranking".
    search_method = GreedyWordSwapWIR()

    return Attack(goal_function, constraints, transformation, search_method)
def build(model):
    """Assemble the BAE (BERT-based Adversarial Examples) attack.

    Args:
        model: victim model passed to ``UntargetedClassification``.

    Returns:
        The ``BAEGarg2019`` object built from the goal function,
        constraints, transformation and search method below.
    """
    # "In this paper, we present a simple yet novel technique: BAE
    # (BERT-based Adversarial Examples), which uses a language model (LM) for
    # token replacement to best fit the overall context. We perturb an input
    # sentence by either replacing a token or inserting a new token in the
    # sentence, by means of masking a part of the input and using a LM to
    # fill in the mask."
    #
    # We only consider the top K=50 synonyms from the MLM predictions.
    #
    # [from email correspondence with the author]
    # "When choosing the top-K candidates from the BERT masked LM, we filter
    # out the sub-words and only retain the whole words (by checking if they
    # are present in the GloVE vocabulary)"
    transformation = WordSwapMaskedLM(method="bae", max_candidates=50)

    constraints = [
        # Don't modify the same word twice or stopwords.
        RepeatModification(),
        StopwordModification(),
        # For the R operations we add an additional check for grammatical
        # correctness of the generated adversarial example by filtering out
        # predicted tokens that do not form the same part of speech (POS) as
        # the original token t_i in the sentence.
        PartOfSpeech(allow_verb_noun_swap=True),
        # "To ensure semantic similarity on introducing perturbations in the
        # input text, we filter the set of top-K masked tokens (K is a
        # pre-defined constant) predicted by BERT-MLM using a Universal
        # Sentence Encoder (USE) (Cer et al., 2018)-based sentence similarity
        # scorer."
        #
        # "[We] set a threshold of 0.8 for the cosine similarity between
        # USE-based embeddings of the adversarial and input text."
        #
        # [from email correspondence with the author]
        # "For a fair comparison of the benefits of using a BERT-MLM in our
        # paper, we retained the majority of TextFooler's specifications.
        # Thus we:
        # 1. Use the USE for comparison within a window of size 15 around the
        #    word being replaced/inserted.
        # 2. Set the similarity score threshold to 0.1 for inputs shorter
        #    than the window size (this translates roughly to almost always
        #    accepting the new text).
        # 3. Perform the USE similarity thresholding of 0.8 with respect to
        #    the text just before the replacement/insertion and not the
        #    original text (For example: at the 3rd R/I operation, we compute
        #    the USE score on a window of size 15 of the text obtained after
        #    the first 2 R/I operations and not the original text).
        # ...
        # To address point (3) from above, compare the USE with the original
        # text at each iteration instead of the current one (While doing this
        # change for the R-operation is trivial, doing it for the I-operation
        # with the window based USE comparison might be more involved)."
        #
        # Finally, since the BAE code is based on the TextFooler code, we
        # need to adjust the threshold to account for the missing / pi in the
        # cosine similarity comparison. So the final threshold is
        # 1 - (1 - 0.8) / pi = 1 - (0.2 / pi) = 0.936338023.
        UniversalSentenceEncoder(
            threshold=0.936338023,
            metric="cosine",
            compare_against_original=True,
            window_size=15,
            skip_text_shorter_than_window=True,
        ),
    ]

    # Goal is untargeted classification.
    goal_function = UntargetedClassification(model)

    # "We estimate the token importance Ii of each token
    # t_i ∈ S = [t1, . . . , tn], by deleting ti from S and computing the
    # decrease in probability of predicting the correct label y, similar to
    # (Jin et al., 2019).
    #
    # • "If there are multiple tokens can cause C to misclassify S when they
    #   replace the mask, we choose the token which makes Sadv most similar
    #   to the original S based on the USE score."
    # • "If no token causes misclassification, we choose the perturbation
    #   that decreases the prediction probability P(C(Sadv)=y) the most."
    search_method = GreedyWordSwapWIR(wir_method="delete")

    return BAEGarg2019(goal_function, constraints, transformation, search_method)
def BERTAttackLi2020(model):
    """Build the BERT-Attack recipe against ``model``.

    Li, L.., Ma, R., Guo, Q., Xiangyang, X., Xipeng, Q. (2020).

    BERT-ATTACK: Adversarial Attack Against BERT Using BERT

    https://arxiv.org/abs/2004.09984

    A warning is logged on every call because this implementation follows a
    preliminary draft of the paper.

    Args:
        model: victim model passed to ``UntargetedClassification``.

    Returns:
        The assembled ``Attack``.
    """
    from textattack.shared.utils import logger

    # ``Logger.warn`` is a deprecated alias (removed in Python 3.13); use
    # ``warning`` instead.
    logger.warning(
        "WARNING: This BERT-Attack implementation is based off of a"
        " preliminary draft of the paper, which lacked source code and"
        " did not include any hyperparameters. Attack results are likely to"
        " change."
    )

    # [from correspondence with the author]
    # Candidate size K is set to 48 for all data-sets.
    transformation = WordSwapMaskedLM(method="bert-attack", max_candidates=48)

    # Don't modify the same word twice or stopwords.
    constraints = [RepeatModification(), StopwordModification()]

    # "We only take ε percent of the most important words since we tend to
    # keep perturbations minimum."
    #
    # [from correspondence with the author]
    # "Word percentage allowed to change is set to 0.4 for most data-sets,
    # this parameter is trivial since most attacks only need a few changes.
    # This epsilon is only used to avoid too much queries on those very hard
    # samples."
    constraints.append(MaxWordsPerturbed(max_percent=0.4))

    # "As used in TextFooler (Jin et al., 2019), we also use Universal
    # Sentence Encoder (Cer et al., 2018) to measure the semantic consistency
    # between the adversarial sample and the original sequence. To balance
    # between semantic preservation and attack success rate, we set up a
    # threshold of semantic similarity score to filter the less similar
    # examples."
    #
    # [from correspondence with author]
    # "Over the full texts, after generating all the adversarial samples, we
    # filter out low USE score samples. Thus the success rate is lower but
    # the USE score can be higher. (actually USE score is not a golden
    # metric, so we simply measure the USE score over the final texts for a
    # comparison with TextFooler). For datasets like IMDB, we set a higher
    # threshold between 0.4-0.7; for datasets like MNLI, we set threshold
    # between 0-0.2."
    #
    # Since the threshold in the real world can't be determined from the
    # training data, the TextAttack implementation uses a fixed threshold -
    # determined to be 0.2 to be most fair.
    use_constraint = UniversalSentenceEncoder(
        threshold=0.2,
        metric="cosine",
        compare_with_original=True,
        window_size=None,
    )
    constraints.append(use_constraint)

    # Goal is untargeted classification.
    goal_function = UntargetedClassification(model)

    # "We first select the words in the sequence which have a high
    # significance influence on the final output logit. Let
    # S = [w0, ··· , wi ··· ] denote the input sentence, and oy(S) denote the
    # logit output by the target model for correct label y, the importance
    # score Iwi is defined as Iwi = oy(S) − oy(S\wi), where
    # S\wi = [w0, ··· , wi−1, [MASK], wi+1, ···] is the sentence after
    # replacing wi with [MASK]. Then we rank all the words according to the
    # ranking score Iwi in descending order to create word list L."
    search_method = GreedyWordSwapWIR(wir_method="unk")

    return Attack(goal_function, constraints, transformation, search_method)
def Alzantot2018Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    """Build the Alzantot genetic-algorithm attack with adjusted constraints.

    Alzantot, M., Sharma, Y., Elgohary, A., Ho, B., Srivastava, M.B.,
    & Chang, K. (2018).

    Generating Natural Language Adversarial Examples.

    https://arxiv.org/abs/1801.00554

    Constraints adjusted from paper to align with human evaluation.

    Args:
        model: victim model passed to ``UntargetedClassification``.
        SE_thresh: threshold for the sentence-encoder constraint.
        sentence_encoder: ``'bert'`` selects the ``BERT`` encoder; any other
            value selects ``UniversalSentenceEncoder``.

    Returns:
        The assembled ``Attack``.
    """
    # Swap words with their embedding nearest-neighbors.
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and
    # δ = 0.5"
    transformation = WordSwapEmbedding(max_candidates=50)

    # Don't modify the same word twice or stopwords. This list must not be
    # re-initialised afterwards, otherwise both constraints are dropped.
    constraints = [RepeatModification(), StopwordModification()]

    # Minimum word embedding cosine similarity of 0.9.
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.9))

    # Sentence-level similarity: cosine metric with threshold SE_thresh,
    # using whichever encoder the caller selected.
    if sentence_encoder == 'bert':
        se_constraint = BERT(
            threshold=SE_thresh,
            metric='cosine',
            compare_against_original=False,
            window_size=15,
            skip_text_shorter_than_window=False,
        )
    else:
        se_constraint = UniversalSentenceEncoder(
            threshold=SE_thresh,
            metric='cosine',
            compare_against_original=False,
            window_size=15,
            skip_text_shorter_than_window=False,
        )
    constraints.append(se_constraint)

    # Do grammar checking.
    constraints.append(LanguageTool(0))

    # Goal is untargeted classification.
    goal_function = UntargetedClassification(model)

    # Perform word substitution with a genetic algorithm.
    search_method = AlzantotGeneticAlgorithm(
        pop_size=60, max_iters=20, post_crossover_check=False
    )

    return Attack(goal_function, constraints, transformation, search_method)
def attack_from_queue(args, in_queue, out_queue):
    """Worker entry point: build a masked-LM attack and run it on queued examples.

    The attack combines three masked-language-model transformations (swap,
    insertion, merge) backed by a shared ``bert-base-chinese`` model, targets
    a 3-label sequence classifier loaded from a local checkpoint, and is
    searched greedily.

    Args:
        args: Namespace-like object; only ``args.random_seed`` is read here.
        in_queue: Queue yielding ``(i, text, output)`` tuples to attack.
        out_queue: Queue receiving ``(i, result)`` for each attacked example,
            or the raised ``Exception`` instance if an attack fails.

    Returns:
        None. The worker calls ``exit()`` after forwarding an exception.
    """
    # Index of this worker derived from the process identity.
    # NOTE(review): the "- 2" offset presumably maps pool worker ids onto
    # 0-based GPU ids -- confirm against how the pool is created.
    gpu_id = torch.multiprocessing.current_process()._identity[0] - 2
    set_env_variables(gpu_id)

    victim_checkpoint = './models/roberta/chinese-roberta-wwm-ext-OCNLI-2021-01-05-23-46-02-975289'

    # Tokenizer for the victim model. Kept under its own name so it is not
    # confused with the classifier config loaded below.
    tokenizer_config = BertConfig.from_pretrained("hfl/chinese-macbert-base")
    tokenizer_config.output_attentions = False
    tokenizer_config.output_token_type_ids = False
    tokenizer = BertTokenizer.from_pretrained(
        "hfl/chinese-macbert-base", config=tokenizer_config
    )

    # Victim classifier (3 labels) wrapped for TextAttack.
    model_config = AutoConfig.from_pretrained(victim_checkpoint, num_labels=3)
    model = AutoModelForSequenceClassification.from_pretrained(
        victim_checkpoint,
        config=model_config,
    )
    model_wrapper = HuggingFaceModelWrapper(model, tokenizer, batch_size=24)

    # One masked LM and tokenizer shared by all three transformations.
    shared_masked_lm = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")
    shared_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    transformation = CompositeTransformation(
        [
            WordSwapMaskedLM(
                method="bae",
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=5,
                min_confidence=5e-4,
            ),
            WordInsertionMaskedLM(
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=5,
                min_confidence=0.0,
            ),
            WordMergeMaskedLM(
                masked_language_model=shared_masked_lm,
                tokenizer=shared_tokenizer,
                max_candidates=5,
                min_confidence=5e-3,
            ),
        ]
    )

    # goal function
    goal_function = UntargetedClassification(model_wrapper)

    # constraints
    # Custom stopword list (Chinese entries mixed with English ones). It is
    # handed to StopwordModification below so these words are protected.
    stopwords = {
        "个", "关于", "之上", "across", "之后", "afterwards", "再次", "against",
        "ain", "全部", "几乎", "单独", "along", "早已", "也", "虽然", "是", "among",
        "amongst", "一个", "和", "其他", "任何", "anyhow", "任何人", "anything",
        "anyway", "anywhere", "are", "aren", "没有", "around", "as", "at", "后",
        "been", "之前", "beforehand", "behind", "being", "below", "beside",
        "besides", "之間", "beyond", "皆是", "但", "by", "可以", "不可以", "不是",
        "couldn't", "d", "didn", "didn't", "doesn", "doesn't", "don", "don't",
        "down", "due", "either", "之外", "elsewhere", "空", "足夠", "甚至", "ever",
        "everything", "everywhere", "except", "first", "for", "former",
        "formerly", "from", "hadn", "hadn't", "hasn", "hasn't", "haven",
        "haven't", "he", "hence", "her", "here", "hereafter", "hereby",
        "herein", "hereupon", "hers", "herself", "him", "himself", "his",
        "how", "however", "hundred", "i", "if", "in", "indeed", "into", "is",
        "isn", "isn't", "it", "it's", "its", "itself", "just", "latter",
        "latterly", "least", "ll", "may", "me", "meanwhile", "mightn",
        "mightn't", "mine", "more", "moreover", "most", "mostly", "must",
        "mustn", "mustn't", "my", "myself", "namely", "needn", "needn't",
        "neither", "never", "nevertheless", "next", "no", "nobody", "none",
        "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off",
        "on", "once", "one", "only", "onto", "or", "other", "others",
        "otherwise", "our", "ours", "ourselves", "out", "over", "per",
        "please", "s", "same", "shan", "shan't", "she", "she's", "should've",
        "shouldn", "shouldn't", "somehow", "something", "sometime",
        "somewhere", "such", "t", "than", "that", "that'll", "the", "their",
        "theirs", "them", "themselves", "then", "thence", "there",
        "thereafter", "thereby", "therefore", "therein", "thereupon", "these",
        "they", "this", "those", "through", "throughout", "thru", "thus",
        "to", "too", "toward", "towards", "under", "unless", "until", "up",
        "upon", "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren",
        "weren't", "what", "whatever", "when", "whence", "whenever", "where",
        "whereafter", "whereas", "whereby", "wherein", "whereupon",
        "wherever", "whether", "which", "while", "whither", "who", "whoever",
        "whole", "whom", "whose", "why", "with", "within", "without", "won",
        "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd",
        "you'll", "you're", "you've", "your", "yours", "yourself",
        "yourselves",
    }
    constraints = [RepeatModification(), StopwordModification(stopwords=stopwords)]
    # NOTE: InputColumnModification (premise/hypothesis),
    # WordEmbeddingDistance(min_cos_sim=0.5) and MaxWordsPerturbed(5) were
    # tried as additional constraints and are intentionally disabled.
    use_constraint = UniversalSentenceEncoder(
        threshold=0.7,
        metric="cosine",
        compare_against_original=True,
        window_size=15,
        skip_text_shorter_than_window=True,
    )
    constraints.append(use_constraint)

    # search method (GreedyWordSwapWIR(wir_method="delete") is the disabled
    # alternative)
    search_method = GreedySearch()

    textattack.shared.utils.set_seed(args.random_seed)
    attack = Attack(goal_function, constraints, transformation, search_method)

    # Print the attack configuration once, from the first worker only.
    if gpu_id == 0:
        print(attack, "\n")

    while not in_queue.empty():
        try:
            i, text, output = in_queue.get()
            # attack_dataset is consumed as an iterator; one example is
            # submitted, so a single next() retrieves its result.
            results_gen = attack.attack_dataset([(text, output)])
            result = next(results_gen)
            out_queue.put((i, result))
        except Exception as e:
            # Forward the failure through the result queue, then stop this
            # worker.
            out_queue.put(e)
            exit()