def _attack(self, initial_result):
    """Run the search method on ``initial_result`` and wrap its outcome.

    Args:
        initial_result: The ``GoalFunctionResult`` the search starts from.

    Returns:
        A ``SuccessfulAttackResult``, ``FailedAttackResult``, or
        ``MaximizedAttackResult``, selected by the goal status of the
        result returned by ``self.search_method``.

    Raises:
        ValueError: If the final goal status is none of ``SUCCEEDED``,
            ``SEARCHING`` or ``MAXIMIZING``.
    """
    final_result = self.search_method(initial_result)
    # The cache is cleared right after the search, before the status is
    # inspected, so it is cleared on the error path as well.
    self.clear_cache()

    status = final_result.goal_status
    if status == GoalFunctionResultStatus.SUCCEEDED:
        return SuccessfulAttackResult(initial_result, final_result)
    if status == GoalFunctionResultStatus.SEARCHING:
        # The search ended while the goal was still being searched for.
        return FailedAttackResult(initial_result, final_result)
    if status == GoalFunctionResultStatus.MAXIMIZING:
        return MaximizedAttackResult(initial_result, final_result)
    raise ValueError(
        f"Unrecognized goal status {final_result.goal_status}")
def attack_one(self, original_tokenized_text, correct_output):
    """Beam-search for a perturbation of ``original_tokenized_text``.

    Each round expands every beam entry with ``self.get_transformations``
    (restricted to the word indices not yet modified on that entry),
    scores all candidates with ``self.goal_function.get_results`` and keeps
    the ``self.beam_width`` highest-scoring candidates as the next beam.
    At most ``min(self.max_words_changed, number of words)`` rounds run.

    Args:
        original_tokenized_text: The text to perturb; must expose ``words``.
        correct_output: Passed through to ``self.goal_function.get_results``.

    Returns:
        ``SuccessfulAttackResult`` when the best-scoring candidate of a
        round reports ``succeeded``. Otherwise ``FailedAttackResult``:
        built from the original result alone when a round produces no
        candidates, or from the original result plus the best candidate of
        the last round (``None`` if no round ran) when the budget runs out.
    """
    num_words = len(original_tokenized_text.words)
    max_words_changed = min(self.max_words_changed, num_words)
    original_result = self.goal_function.get_results(
        [original_tokenized_text], correct_output)[0]

    # Every beam entry pairs a text with the word indices still untouched.
    beam = [(original_tokenized_text, list(range(num_words)))]
    num_words_changed = 0
    best_result = None

    while num_words_changed < max_words_changed:
        num_words_changed += 1

        potential_next_beam = []
        for text, unswapped_word_indices in beam:
            transformations = self.get_transformations(
                text,
                indices_to_replace=unswapped_word_indices,
                original_text=original_tokenized_text)
            for next_text in transformations:
                # Copy so that sibling candidates do not share one list.
                new_unswapped_word_indices = unswapped_word_indices.copy()
                modified_word_index = next_text.attack_attrs[
                    'modified_word_index']
                new_unswapped_word_indices.remove(modified_word_index)
                potential_next_beam.append(
                    (next_text, new_unswapped_word_indices))

        if not potential_next_beam:
            # If we did not find any possible perturbations, give up.
            return FailedAttackResult(original_result)

        transformed_text_candidates = [
            text for (text, _) in potential_next_beam
        ]
        results = self.goal_function.get_results(
            transformed_text_candidates, correct_output)
        scores = np.array([r.score for r in results])

        # If we succeeded, stop searching.
        best_result = results[scores.argmax()]
        if best_result.succeeded:
            break

        # Otherwise, refill the beam with the highest-scoring candidates.
        # The scores must be negated *before* argsort: unary minus binds
        # looser than the call and the slice, so ``-scores.argsort()[:k]``
        # would negate the indices of the k lowest scores instead.
        best_indices = (-scores).argsort()[:self.beam_width]
        beam = [potential_next_beam[i] for i in best_indices]

    # Exhausting the word budget without a succeeding candidate is a
    # failure, even though ``best_result`` is set by then.
    if best_result is not None and best_result.succeeded:
        return SuccessfulAttackResult(original_result, best_result)
    return FailedAttackResult(original_result, best_result)
def attack_one(self, initial_result):
    """Perturb the ``TokenizedText`` held by ``initial_result``.

    The search itself is delegated to ``self.search_method``; this method
    only wraps the result it returns.

    Args:
        initial_result: The starting ``GoalFunctionResult``.

    Returns:
        ``SuccessfulAttackResult`` if the searched result reports
        ``succeeded``, otherwise ``FailedAttackResult``.
    """
    searched = self.search_method(initial_result)
    outcome_cls = (
        SuccessfulAttackResult if searched.succeeded else FailedAttackResult
    )
    return outcome_cls(initial_result, searched)
def attack_one(self, tokenized_text, correct_output):
    """Search for a perturbation of ``tokenized_text`` with a genetic loop.

    A population from ``self._generate_population`` is scored with
    ``self.goal_function.get_results`` for up to ``self.max_iters``
    generations. Each generation keeps the best member unchanged and fills
    the remaining ``self.pop_size - 1`` slots with children produced by
    ``self._crossover`` from sampled parents and then passed to
    ``self._perturb``.

    Side effects: stores ``tokenized_text`` and ``correct_output`` on
    ``self`` and prints one progress line per generation.

    Args:
        tokenized_text: The text to perturb.
        correct_output: Passed through to ``self.goal_function.get_results``.

    Returns:
        ``SuccessfulAttackResult`` as soon as the best member of a
        generation reports ``succeeded``; otherwise ``FailedAttackResult``
        carrying the result of the first population member.
    """
    self.original_tokenized_text = tokenized_text
    self.correct_output = correct_output
    original_result = self.goal_function.get_results(
        [tokenized_text], correct_output)[0]
    neighbors_len = self._get_neighbors_len(tokenized_text)
    pop = self._generate_population(neighbors_len)
    cur_score = original_result.score

    for i in range(self.max_iters):
        pop_results = self.goal_function.get_results(
            [pm.tokenized_text for pm in pop], correct_output)
        for member, result in zip(pop, pop_results):
            member.result = result

        # Best member first.
        pop = sorted(pop, key=lambda x: -x.result.score)
        print('\t\t', i, ' -- ', float(pop[0].result.score))

        if pop[0].result.succeeded:
            return SuccessfulAttackResult(original_result, pop[0].result)

        if pop[0].result.score > cur_score:
            cur_score = pop[0].result.score
        elif self.give_up_if_no_improvement:
            break

        # Sampling weights must follow the *sorted* population, because the
        # sampled indices below are used to index ``pop`` after sorting.
        # Reading the scores from ``pop_results`` (pre-sort order) would
        # attach each probability to the wrong member.
        # NOTE(review): higher scores are treated as better everywhere else
        # in this method, yet the weights are exp(-score / temp), which
        # favors lower-scoring parents -- confirm the sign is intended.
        pop_scores = torch.Tensor([pm.result.score for pm in pop])
        logits = ((-pop_scores) / self.temp).exp()
        select_probs = (logits / logits.sum()).cpu().numpy()

        elite = [pop[0]]
        parent1_idx = np.random.choice(
            self.pop_size, size=self.pop_size - 1, p=select_probs)
        parent2_idx = np.random.choice(
            self.pop_size, size=self.pop_size - 1, p=select_probs)

        children = [
            self._crossover(pop[first], pop[second])
            for first, second in zip(parent1_idx, parent2_idx)
        ]
        for child in children:
            self._perturb(child)

        pop = elite + children

    return FailedAttackResult(original_result, pop[0].result)
def attack_one(self, tokenized_text, correct_output):
    """Greedily swap words, visiting them in order of importance.

    Word importance is estimated by replacing each word in turn with
    ``self.replacement_str`` and scoring the resulting texts; words are
    then visited from the highest such score to the lowest. For each
    visited word the candidates from ``self.get_transformations`` are
    scored, and the best one is adopted only if it improves on the best
    score so far.

    Args:
        tokenized_text: The text to perturb; must expose ``words`` and
            ``replace_word_at_index``.
        correct_output: Passed through to ``self.goal_function.get_results``.

    Returns:
        ``SuccessfulAttackResult`` once an improving swap succeeds. Among
        the succeeding candidates of that swap, the one with the highest
        ``attack_attrs['similarity_score']`` is returned; if a candidate
        has no such entry, the best one found up to that point is kept.
        Otherwise ``FailedAttackResult``, carrying the top candidate of
        the last evaluated swap, or only the original result when no
        candidates were ever evaluated.
    """
    original_tokenized_text = tokenized_text
    num_words_changed = 0

    original_result = self.goal_function.get_results(
        [tokenized_text], correct_output)[0]
    cur_score = original_result.score

    # Sort words by order of importance.
    len_text = len(tokenized_text.words)
    leave_one_texts = [
        tokenized_text.replace_word_at_index(i, self.replacement_str)
        for i in range(len_text)
    ]
    leave_one_scores = np.array([
        result.score for result in
        self.goal_function.get_results(leave_one_texts, correct_output)
    ])
    index_order = (-leave_one_scores).argsort()

    # Bound up front: the loop may never evaluate any candidates (empty
    # text, or no transformations for any word), and ``results`` is read
    # after the loop.
    results = []

    i = 0
    # NOTE(review): ``num_words_changed`` is also incremented for swaps
    # that are evaluated and then skipped, and ``<=`` lets one more
    # iteration start after it has reached ``max_depth`` -- confirm
    # whether ``<`` was intended.
    while ((self.max_depth is None)
           or num_words_changed <= self.max_depth) and i < len(index_order):
        transformed_text_candidates = self.get_transformations(
            tokenized_text,
            original_tokenized_text,
            indices_to_replace=[index_order[i]])
        i += 1
        if len(transformed_text_candidates) == 0:
            continue
        num_words_changed += 1

        # Highest score first.
        results = sorted(
            self.goal_function.get_results(
                transformed_text_candidates, correct_output),
            key=lambda x: -x.score)

        # Skip swaps which don't improve the score.
        if results[0].score > cur_score:
            cur_score = results[0].score
        else:
            continue

        if not results[0].succeeded:
            # Keep the improving swap and move on to the next word.
            tokenized_text = results[0].tokenized_text
            continue

        # We succeeded: return the succeeding candidate with the best
        # similarity.
        best_result = results[0]
        # @TODO: Use vectorwise operations
        max_similarity = -float('inf')
        for result in results:
            if not result.succeeded:
                # Sorted by score, so scanning stops at the first
                # non-succeeding candidate.
                break
            candidate = result.tokenized_text
            try:
                similarity_score = candidate.attack_attrs[
                    'similarity_score']
            except KeyError:
                # If the attack was run without any similarity metrics,
                # candidates won't have a similarity score. In this case,
                # stop and return the candidate that changed the original
                # score the most.
                break
            if similarity_score > max_similarity:
                max_similarity = similarity_score
                best_result = result
        return SuccessfulAttackResult(original_result, best_result)

    if results:
        return FailedAttackResult(original_result, results[0])
    return FailedAttackResult(original_result)