示例#1
0
    def attack(self, x_orig, target, max_change=0.4):
        """Population-based (genetic) word-substitution attack.

        Args:
            x_orig: 1-D numpy array of word ids (presumably 0-padded at the
                end — the np.sign trick below counts non-zero entries).
            target: class index the attack drives the model towards.
            max_change: unused in this variant; kept for interface
                compatibility with the greedy attack.

        Returns:
            The first population member the model classifies as `target`,
            or None if `max_iters` generations pass without success.
        """
        x_adv = x_orig.copy()
        # Number of non-zero (non-padding) tokens.
        x_len = np.sum(np.sign(x_orig))
        # Neigbhours for every word.
        tmp = [
            glove_utils.pick_most_similar_words(x_orig[i], self.dist_mat, 50,
                                                0.5) for i in range(x_len)
        ]
        neigbhours_list = [x[0] for x in tmp]
        neighbours_dist = [x[1] for x in tmp]
        # Positions with more candidate substitutes get proportionally
        # higher probability of being picked for perturbation.
        neighbours_len = [len(x) for x in neigbhours_list]
        for i in range(x_len):
            if (x_adv[i] < 27):
                # To prevent replacement of words like 'the', 'a', 'of', etc.
                neighbours_len[i] = 0
        w_select_probs = neighbours_len / np.sum(neighbours_len)
        # Recompute the neighbours, this time keeping only the top_n closest
        # words that are actually used as substitution candidates.
        tmp = [
            glove_utils.pick_most_similar_words(x_orig[i], self.dist_mat,
                                                self.top_n, 0.5)
            for i in range(x_len)
        ]
        neigbhours_list = [x[0] for x in tmp]
        neighbours_dist = [x[1] for x in tmp]
        pop = self.generate_population(x_orig, neigbhours_list,
                                       neighbours_dist, w_select_probs, target,
                                       self.pop_size)
        for i in range(self.max_iters):
            # print(i)
            # Fitness of each member = model probability of the target class.
            pop_preds = self.batch_model.predict(self.sess, np.array(pop))
            pop_scores = pop_preds[:, target]
            print('\t\t', i, ' -- ', np.max(pop_scores))
            pop_ranks = np.argsort(pop_scores)[::-1]
            top_attack = pop_ranks[0]

            # Temperature-scaled softmax turns fitness scores into
            # parent-sampling probabilities.
            logits = np.exp(pop_scores / self.temp)
            select_probs = logits / np.sum(logits)

            if np.argmax(pop_preds[top_attack, :]) == target:
                print("Finished genetic attack in {} iterations".format(i))
                return pop[top_attack]
            # Elitism: the best member survives unchanged into the next
            # generation.
            elite = [pop[top_attack]]  # elite
            # print(select_probs.shape)
            parent1_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)
            parent2_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)

            # Crossover pairs of parents, then mutate each child with a
            # single word substitution.
            childs = [
                self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
                for i in range(self.pop_size - 1)
            ]
            childs = [
                self.perturb(x, x_orig, neigbhours_list, neighbours_dist,
                             w_select_probs, target) for x in childs
            ]
            pop = elite + childs

        return None
 def perturb(self, x_cur, pos, x_orig, target):
     """Try to improve the attack by substituting the word at `pos`.

     Picks the 60 nearest GloVe neighbours of the current word and delegates
     to select_best_replacement to choose the one that helps most.
     """
     # Count non-zero (non-padding) tokens to sanity-check the position.
     num_words = np.sum(np.sign(x_cur))
     # Periodic progress report.
     if pos % 50 == 0:
         print(' --- {} / {} '.format(pos, num_words))
     assert pos < num_words, "invalid position"
     original_word = x_cur[pos]
     candidates, _ = glove_utils.pick_most_similar_words(
         original_word, self.dist_mat, 60)
     # Id 0 marks a missing neighbour; fall back to the original word there.
     candidates = [original_word if w == 0 else w for w in candidates]
     return self.select_best_replacement(pos, x_cur, x_orig, target,
                                         candidates)
 def attack(self, x_orig, target, max_change=0.4):
     """Greedy word-substitution attack.

     Repeatedly replaces one word of the document with its nearest GloVe
     neighbour, keeping the single substitution that maximises the model's
     score for `target`, until the model predicts `target` or more than
     `max_change` of the words have been updated.

     Args:
         x_orig: 1-D numpy array of word ids (np.sign counts the non-zero,
             i.e. non-padding, tokens).
         target: class index the attack drives the prediction towards.
         max_change: maximum fraction of the document allowed to change.

     Returns:
         The adversarial word-id array on success, otherwise None.
     """
     x_adv = x_orig.copy()
     doc_len = np.sum(np.sign(x_orig))
     num_updates = 0
     while ((num_updates / doc_len) < max_change):
         # pick some word
         W = []  # Set of candidate updates: (position, replacement id).
         list_x_new = []
         for i, x in enumerate(x_adv):
             # for each word in x_adv
             if x != self.dataset.dict["UNK"]:
                 # skip the UNK
                 x_list, _ = glove_utils.pick_most_similar_words(
                     x, self.dist_mat)
                 # TODO(malzantot) Score words in x_ based on the language model
                 # Use the closest neighbour that differs from the original
                 # word at this position.
                 for j in range(len(x_list)):
                     if x_list[j] != x_orig[i]:
                         # Bug fix: record the word actually substituted
                         # (x_list[j]); the original appended x_list[0].
                         W.append((i, x_list[j]))
                         x_new = x_adv.copy()
                         x_new[i] = x_list[j]
                         # print(self.inv_dict[x_orig[i]], ' -> ', self.inv_dict[x_new[i]])
                         list_x_new.append(x_new)
                         break
         if not list_x_new:
             # No position admits a substitution: without this guard the
             # argsort below raises IndexError (or the loop never ends).
             return None
         x_new_pred_probs = np.array([
             self.model.predict(self.sess, x[np.newaxis, :])[0]
             for x in list_x_new
         ])
         x_new_preds = np.argmax(x_new_pred_probs, axis=1)
         x_new_scores = x_new_pred_probs[:, target]
         # Keep the candidate with the highest target-class score.
         top_attack = np.argsort(x_new_scores)[-1]
         x_adv = list_x_new[top_attack]
         num_updates += 1
         if x_new_preds[top_attack] == target:
             return x_adv
     return None
示例#4
0
    def attack(self, x_orig, target):
        """Genetic attack for a two-input model; only the second input is perturbed.

        Args:
            x_orig: pair of word-id arrays; x_orig[0] is kept fixed,
                x_orig[1] is the sequence being attacked.
            target: class index the attack drives the model towards.

        Returns:
            (x1_orig, adversarial_x2) on success, None otherwise.
        """
        x1_adv = x_orig[0].copy().ravel()
        x2_adv = x_orig[1].copy().ravel()
        x1_orig = x_orig[0].ravel()
        x2_orig = x_orig[1].ravel()
        # Non-zero (non-padding) token counts.
        x1_len = np.sum(np.sign(x1_adv))
        x2_len = np.sum(np.sign(x2_adv))
        # Neighbour candidates per position; padding positions (id 0) get
        # an empty candidate list.
        tmp = [
            glove_utils.pick_most_similar_words(x2_adv[i], self.dist_mat, 50,
                                                0.5) if x2_adv[i] != 0 else
            ([], []) for i in range(len(x2_adv))
        ]
        neighbours_list = [x[0] for x in tmp]
        neighbours_dist = [x[1] for x in tmp]
        # Sampling probability of a position is proportional to how many
        # substitutes it has.
        neigbhours_len = [len(x) for x in neighbours_list]
        w_select_probs = neigbhours_len / np.sum(neigbhours_len)
        # Recompute neighbours limited to the n1 closest words actually used
        # for substitution.
        tmp = [
            glove_utils.pick_most_similar_words(
                x2_adv[i], self.dist_mat, self.n1, 0.5) if x2_adv[i] != 0 else
            ([], []) for i in range(len(x2_adv))
        ]
        neighbours_list = [x[0] for x in tmp]
        neighbours_dist = [x[1] for x in tmp]

        pop = np.array(
            self.generate_population(x2_adv, neighbours_list, w_select_probs,
                                     target, self.pop_size))
        pop = pop.reshape(self.pop_size, -1)
        # print(pop)
        # The fixed first input is replicated once per population member.
        pop_x1 = np.tile(x1_adv,
                         (self.pop_size, 1, 1)).reshape(self.pop_size, -1)
        for iter_idx in range(self.max_iters):
            # Fitness = model probability of the target class.
            pop_preds = self.model.predict([pop_x1, pop])
            pop_scores = pop_preds[:, target]
            pop_ranks = np.argsort(pop_scores)[::-1]
            top_attack = pop_ranks[0]
            if np.argmax(pop_preds[top_attack, :]) == target:
                return x1_orig, pop[top_attack]
            print(iter_idx, ' : ', np.max(pop_scores))
            # Temperature-scaled softmax for parent sampling.
            logits = np.exp(pop_scores / self.temp)
            pop_select_probs = logits / np.sum(logits)

            # Elitism: best member carries over unchanged.
            elite = [pop[top_attack]]
            parent1_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=pop_select_probs)
            parent2_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=pop_select_probs)

            # Crossover pairs of parents, then mutate each child.
            childs = [
                self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
                for i in range(self.pop_size - 1)
            ]
            childs = [
                self.perturb(x, x2_orig, neighbours_list, w_select_probs,
                             target) for x in childs
            ]
            pop = elite + childs
            pop = np.array(pop)
        return None
    def attack(self, seq, target, l, max_change=0.4):
        """Genetic word-substitution attack against a PyTorch model.

        Args:
            seq: torch tensor of word ids (squeezed to 1-D numpy below).
            target: class index the attack drives the model towards.
            l: torch scalar tensor holding the sequence length.
            max_change: unused in this variant; kept for interface
                compatibility.

        Returns:
            The first population member classified as `target`, or None if
            `max_iters` generations pass without success.
        """
        seq = seq.numpy().squeeze()
        seq_adv = seq.copy()
        # Non-zero (non-padding) token count.
        seq_len = np.sum(np.sign(seq))
        l = l.cpu()
        # To calculate the sampling probability
        tmp = [
            glove_utils.pick_most_similar_words(self.compute_dist(seq[i]),
                                                ret_count=50,
                                                threshold=0.5)
            for i in range(l)
        ]
        neighbour_list = [t[0] for t in tmp]
        neighbour_dist = [t[1] for t in tmp]
        neighbour_len = [len(i) for i in neighbour_list]
        for i in range(seq_len):
            if seq[i] < 27:
                # Ids below 27 are presumably frequent stop-words
                # ('the', 'a', ...) — they are never replaced.
                neighbour_len[i] = 0
        # Positions with more substitutes are sampled more often.
        prob_select = neighbour_len / np.sum(neighbour_len)
        # Recompute neighbours limited to the top_n1 closest words actually
        # used for substitution.
        tmp = [
            glove_utils.pick_most_similar_words(self.compute_dist(seq[i]),
                                                self.top_n1, 0.5)
            for i in range(l)
        ]
        neighbour_list = [t[0] for t in tmp]
        neighbour_dist = [t[1] for t in tmp]
        # Initial population: independent single-word perturbations of seq.
        pop = [
            self.perturb(seq_adv, seq, neighbour_list, neighbour_dist,
                         prob_select, seq_len, target, l)
            for _ in range(self.pop_size)
        ]

        # One length entry per population member, then stack the members
        # into a single (pop_size, seq_len) array.
        l_tensor = l * torch.ones([len(pop)])
        pop_np = np.expand_dims(pop[0], 0)
        for p in pop[1:]:
            pop_np = np.concatenate((pop_np, np.expand_dims(p, 0)), 0)

        for i in range(self.max_iters):
            pop_tensor = torch.tensor(pop_np).type(torch.LongTensor).to(
                self.device)
            l_tensor = l_tensor.to(self.device)
            self.batch_model.eval()
            with torch.no_grad():
                pop_preds = self.batch_model.pred(
                    pop_tensor, l_tensor)[1].cpu().detach().numpy()

            # Fitness = model probability of the target class.
            pop_scores = pop_preds[:, target]
            print('\t\t', i, ' -- ', np.max(pop_scores))
            pop_ranks = np.argsort(pop_scores)[::-1]
            top_attack = pop_ranks[0]

            # Temperature-scaled softmax for parent sampling.
            logits = np.exp(pop_scores / self.temp)
            select_probs = logits / np.sum(logits)

            if np.argmax(pop_preds[top_attack, :]) == target:
                print('Success and score: {:.4f}'.format(
                    pop_scores[top_attack]))
                return pop[top_attack]

            # Elitism: best member carries over unchanged.
            elite = [pop[top_attack]]  # elite
            # print(select_probs.shape)
            parent1_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)
            parent2_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)

            # Crossover pairs of parents, then mutate each child.
            childs = [
                self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
                for i in range(self.pop_size - 1)
            ]
            childs = [
                self.perturb(x, seq, neighbour_list, neighbour_dist,
                             prob_select, seq_len, target, l) for x in childs
            ]
            pop = elite + childs
            # Re-stack the new generation for the next model call.
            pop_np = np.expand_dims(pop[0], 0)
            for p in pop[1:]:
                pop_np = np.concatenate((pop_np, np.expand_dims(p, 0)), 0)

        return None
示例#6
0
    def attack(self, x_orig, target, max_change=0.4):
        """Genetic attack (PyTorch) with variance-normalized parent sampling.

        Args:
            x_orig: torch tensor of word ids (squeezed to 1-D numpy below).
            target: class index the attack drives the model towards.
            max_change: unused in this variant; kept for interface
                compatibility.

        Returns:
            The first population member classified as `target`, or None if
            `max_iters` generations pass without success.
        """
        x_orig = x_orig.numpy().squeeze()
        x_adv = x_orig.copy()
        # Non-zero (non-padding) token count.
        x_len = np.sum(np.sign(x_orig))
        # Neigbhours for every word.
        tmp = [
            glove_utils.pick_most_similar_words(x_orig[i], self.dist, 50, 0.5)
            for i in range(x_len)
        ]
        neigbhours_list = [x[0] for x in tmp]
        neighbours_dist = [x[1] for x in tmp]
        # Positions with more substitutes are sampled more often.
        neighbours_len = [len(x) for x in neigbhours_list]
        for i in range(x_len):
            if (x_adv[i] < 27):
                # To prevent replacement of words like 'the', 'a', 'of', etc.
                neighbours_len[i] = 0
        w_select_probs = neighbours_len / np.sum(neighbours_len)
        # Recompute neighbours limited to the top_n closest words actually
        # used for substitution.
        tmp = [
            glove_utils.pick_most_similar_words(x_orig[i], self.dist,
                                                self.top_n, 0.5)
            for i in range(x_len)
        ]
        neigbhours_list = [x[0] for x in tmp]
        neighbours_dist = [x[1] for x in tmp]
        pop = self.generate_population(x_orig, neigbhours_list,
                                       neighbours_dist, w_select_probs, target,
                                       self.pop_size)
        for i in range(self.max_iters):
            # print(i)
            # Stack the population and pair each member with its length.
            l_tensor = x_len * torch.ones([len(pop)])
            pop_np = np.expand_dims(pop[0], 0)
            for p in pop[1:]:
                pop_np = np.concatenate((pop_np, np.expand_dims(p, 0)), 0)

            pop_tensor = torch.tensor(pop_np).type(torch.LongTensor).to(
                self.device)
            l_tensor = l_tensor.to(self.device)
            self.batch_model.eval()
            with torch.no_grad():
                pop_preds = self.batch_model.pred(
                    pop_tensor, l_tensor, False)[1].cpu().detach().numpy()

#            pop_preds = self.batch_model.predict(self.sess, np.array(pop))
            # Fitness = model probability of the target class.
            pop_scores = pop_preds[:, target]
            print('\t\t', i, ' -- ', np.max(pop_scores))
            pop_ranks = np.argsort(pop_scores)[::-1]
            top_attack = pop_ranks[0]

            # Parent-sampling probabilities: standardize the temperature-
            # scaled scores when they have non-trivial variance, otherwise
            # just clamp very large values before the softmax to avoid
            # np.exp overflow.
            ampl = pop_scores / self.temp
            # print(ampl)
            # np.cov on a 1-D array yields the (scalar) sample variance.
            covariance = np.cov(ampl)
            # print(covariance)
            # NOTE(review): 10e-6 == 1e-5; confirm the intended threshold.
            if covariance > 10e-6:
                mean = np.mean(ampl)
                # print(mean)
                ampl_update = (ampl - mean) / np.sqrt(covariance + 0.001)
                # print(ampl_update)
                logits = np.exp(ampl_update)
            else:
                if np.max(ampl) > 100:
                    ampl = ampl / (np.max(ampl) / 5)
                logits = np.exp(ampl)
            # logits = np.exp(ampl)
            select_probs = logits / np.sum(logits)

            if np.argmax(pop_preds[top_attack, :]) == target:
                return pop[top_attack]
            # Elitism: best member carries over unchanged.
            elite = [pop[top_attack]]  # elite
            # print(select_probs.shape)
            parent1_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)
            parent2_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)

            # Crossover pairs of parents, then mutate each child.
            childs = [
                self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
                for i in range(self.pop_size - 1)
            ]
            childs = [
                self.perturb(x, x_orig, neigbhours_list, neighbours_dist,
                             w_select_probs, target) for x in childs
            ]
            pop = elite + childs

        return None
    Author: Moustafa Alzantot ([email protected])
    All rights reserved.
"""

import numpy as np
import tensorflow as tf
import glove_utils
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB_SIZE = 50000

# Precompute the pairwise squared-Euclidean-distance matrix between all
# embedding vectors: d(i, j) = |e_i|^2 + |e_j|^2 - 2 * (e_i . e_j).
embedding_matrix = np.load(
    ('aux_files/embeddings_counter_%d.npy' % (MAX_VOCAB_SIZE)))
missed = np.load(
    ('aux_files/missed_embeddings_counter_%d.npy' % (MAX_VOCAB_SIZE)))
# Row vector of squared norms, its transpose, and the cross-term.
a = (embedding_matrix ** 2).sum(axis=0).reshape((1, -1))
b = a.T
c_ = -2 * (embedding_matrix.T @ embedding_matrix)
dist = a + b + c_
np.save(('aux_files/dist_counter_%d.npy' % (MAX_VOCAB_SIZE)), dist)

# Try an example: look up the nearest neighbours of the word 'good'.
# NOTE(review): pickle.load is only safe because the aux file is locally
# generated and trusted.
with open('aux_files/dataset_%d.pkl' % MAX_VOCAB_SIZE, 'rb') as f:
    dataset = pickle.load(f)
src_word = dataset.dict['good']
neighbours, neighbours_dist = glove_utils.pick_most_similar_words(
    src_word, dist)
print('Closest words to `good` are :')
result_words = [dataset.inv_dict[x] for x in neighbours]
print(result_words)
    def attack(self, seq, target, l, max_change=0.5):
        """Genetic attack for a BERT-tokenized model.

        The population is kept as lists of word strings; before every model
        call each member is re-tokenized with the BERT tokenizer, wrapped in
        [CLS]/[SEP], padded to the batch maximum and sorted by length
        (descending, presumably for packed-sequence handling — confirm
        against batch_model.pred).

        Args:
            seq: torch tensor of token ids for the input sentence.
            target: class index the attack drives the model towards.
            l: torch scalar tensor holding the original sequence length.
            max_change: unused in this variant; kept for interface
                compatibility.

        Returns:
            (adversarial word list, original word list) on success,
            (None, seq_orig) otherwise.
        """
        # Move to host memory and drop the batch dimension.
        seq = seq.cpu().detach().numpy().squeeze(
        )  # convert the input tensor to a 1-D numpy id array
        # Recover the original word-id sequence, its word-string form and
        # its length from the token ids.
        seq_orig, seq_orig_string, l_orig = self.orig_sentence(seq)

        # print(seq_orig)
        # seq_adv = seq.copy()
        # seq_len = np.sum(np.sign(seq))
        l = l.cpu()
        # print(self.tokenizer.convert_ids_to_tokens(seq.tolist()))
        # To calculate the sampling probability
        tmp = [
            glove_utils.pick_most_similar_words(self.compute_dist(seq_orig[i]),
                                                50, 0.5) for i in range(l_orig)
        ]

        # tmp = [glove_utils.pick_most_similar_words(self.compute_dist(self.dataset.dict[self.tokenizer.convert_ids_to_tokens([seq[i]])[0]]), ret_count = 50, threshold = 0.5) if self.tokenizer.convert_ids_to_tokens([seq[i]])[0] in self.dataset.dict else ([], []) for i in range(l)]
        neighbour_list = [t[0] for t in tmp]
        neighbour_dist = [t[1] for t in tmp]
        neighbour_len = [len(i) for i in neighbour_list]
        for i in range(l_orig):
            if (seq_orig[i] < 27):
                # To prevent replacement of words like 'the', 'a', 'of', etc.
                neighbour_len[i] = 0
        # Positions with more substitutes are sampled more often.
        prob_select = neighbour_len / np.sum(neighbour_len)
        # print(prob_select)
        # tmp = [glove_utils.pick_most_similar_words(
        #     self.compute_dist(self.dataset.dict[self.tokenizer.convert_ids_to_tokens([seq[i]])[0]]), self.top_n1, 0.5
        # ) if self.tokenizer.convert_ids_to_tokens([seq[i]])[0] in self.dataset.dict else ([], []) for i in range(l)]
        # Recompute neighbours limited to the top_n1 closest words actually
        # used for substitution.
        tmp = [
            glove_utils.pick_most_similar_words(self.compute_dist(seq_orig[i]),
                                                self.top_n1, 0.5)
            for i in range(l_orig)
        ]

        neighbour_list = [t[0] for t in tmp]
        neighbour_dist = [t[1] for t in tmp]
        # print('synonyms')
        # print(tmp)
        # print([[self.dataset.inv_dict[j] for j in i if j in self.dataset.inv_dict] for i in neighbour_list])
        seq_adv = seq_orig_string.copy()
        # pop = [self.perturb(seq_adv, seq, seq_orig, l_orig, neighbour_list, neighbour_dist, prob_select, seq_len, target, l) for _ in range(self.pop_size)]
        # Initial population: independent single-word perturbations.
        pop = [
            self.perturb(seq_adv, seq_orig_string, l_orig, neighbour_list,
                         neighbour_dist, prob_select, target, l)
            for _ in range(self.pop_size)
        ]

        # Tokenize every member as [CLS] + wordpieces + [SEP], recording
        # the resulting per-member length.
        l_tensor = torch.ones([len(pop)]).type(torch.LongTensor)
        pop_np = [[self.tokenizer.cls_token_id] +
                  self.tokenizer.convert_tokens_to_ids(
                      self.tokenizer.tokenize(' '.join(pop[0]).strip())) +
                  [self.tokenizer.sep_token_id]]
        l_tensor[0] = len(pop_np[0])
        # print(l_tensor)
        for p in range(1, len(pop)):
            token_ids = [
                self.tokenizer.cls_token_id
            ] + self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(' '.join(
                    pop[p]).strip())) + [self.tokenizer.sep_token_id]
            pop_np.append(token_ids)
            l_tensor[p] = len(token_ids)
        l_max = torch.max(l_tensor)

        # print(l_max, l_tensor, len(pop_np))
        # Pad to the batch maximum and sort by length, descending.
        pop_np = pad_sequences(pop_np, maxlen=l_max.item(), padding='post')
        pop_tensor = torch.tensor(pop_np)

        # print(torch.tensor(pop_np))
        sort = torch.sort(l_tensor, descending=True)[1]
        # print(len(sort), sort)
        pop_tensor = pop_tensor[sort]
        l_tensor = l_tensor[sort]
        # Keep pop aligned with the sorted tensors.
        pop = np.array(pop)[sort].tolist()
        # print(l_tensor)
        for i in range(self.max_iters):

            pop_tensor = pop_tensor.type(torch.LongTensor).to(self.device)
            l_tensor = l_tensor.to(self.device)
            # print('pop_tensor:',pop_tensor)
            # print(pop_tensor.shape)
            # print(l_tensor)
            self.batch_model.eval()
            with torch.no_grad():
                pop_preds = self.batch_model.pred(
                    pop_tensor, l_tensor, False)[1].cpu().detach().numpy()
            # print(sort)
            # print(pop_preds)
            # print(pop_tensor)
            # Fitness = model probability of the target class.
            pop_scores = pop_preds[:, target]
            print('\t\t', i, ' -- ', np.max(pop_scores))
            pop_ranks = np.argsort(pop_scores)[::-1]
            # print(l_tensor)
            # print(pop_ranks)
            top_attack = pop_ranks[0]
            # print(top_attack)
            # Parent-sampling probabilities: standardize the temperature-
            # scaled scores when their variance is non-trivial, otherwise
            # clamp very large values before the softmax to avoid overflow.
            ampl = pop_scores / self.temp
            # print(ampl)
            # np.cov on a 1-D array yields the (scalar) sample variance.
            covariance = np.cov(ampl)
            # print('pop:', pop)
            print(covariance)
            # NOTE(review): 10e-6 == 1e-5; confirm the intended threshold.
            if covariance > 10e-6:
                mean = np.mean(ampl)
                # print(mean)
                ampl_update = (ampl - mean) / np.sqrt(covariance + 0.001)
                # print(ampl_update)
                logits = np.exp(ampl_update)
            else:

                if np.max(ampl) > 100:
                    ampl = ampl / (np.max(ampl) / 5)
                logits = np.exp(ampl)
            # logits = np.exp(ampl)
            select_probs = logits / np.sum(logits)
            # print('prob:', select_probs)
            # print([self.tokenizer.convert_ids_to_tokens([i]) for i in pop_np[top_attack]])
            if np.argmax(pop_preds[top_attack, :]) == target:
                print('Success and score: {:.4f}'.format(
                    pop_scores[top_attack]))

                print(seq_orig_string)
                print(pop[top_attack])

                return pop[top_attack], seq_orig_string

            # for i in pop:
            #   print(i)
            #   print('\t')

            # Elitism: best member carries over unchanged.
            elite = [pop[top_attack]]  # elite
            # print(elite)

            # print(select_probs.shape)
            parent1_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)
            parent2_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)

            # Crossover pairs of parents, then mutate each child.
            childs = [
                self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
                for i in range(self.pop_size - 1)
            ]
            childs = [
                self.perturb(x, seq_orig_string, l_orig, neighbour_list,
                             neighbour_dist, prob_select, target, l)
                for x in childs
            ]
            # print(childs)
            pop = elite + childs
            # print(len(pop))
            # print('pop:', pop)
            # Re-tokenize, pad and length-sort the new generation, exactly
            # as for the initial population above.
            l_tensor = torch.ones([len(pop)]).type(torch.LongTensor)
            pop_np = [[self.tokenizer.cls_token_id] +
                      self.tokenizer.convert_tokens_to_ids(
                          self.tokenizer.tokenize(' '.join(pop[0]).strip())) +
                      [self.tokenizer.sep_token_id]]
            l_tensor[0] = len(pop_np[0])
            # print(pop_np)
            # print(l_tensor)
            # print(pop_np)
            for p in range(1, len(pop)):
                token_ids = [
                    self.tokenizer.cls_token_id
                ] + self.tokenizer.convert_tokens_to_ids(
                    self.tokenizer.tokenize(' '.join(
                        pop[p]).strip())) + [self.tokenizer.sep_token_id]
                pop_np.append(token_ids)
                l_tensor[p] = len(token_ids)

            # print(l_tensor)
            # print(pop_np)
            l_max = torch.max(l_tensor)
            pop_np = pad_sequences(pop_np, maxlen=l_max.item(), padding='post')
            pop_tensor = torch.tensor(pop_np)

            # print(torch.tensor(pop_np))
            sort = torch.sort(l_tensor, descending=True)[1]
            # print(len(sort), sort)
            pop_tensor = pop_tensor[sort]
            l_tensor = l_tensor[sort]
            pop = np.array(pop)[sort].tolist()
            # print(np.array(pop).shape)

            # pop_np = np.expand_dims(pop[0], 0)
            # for p in pop[1:]:
            #   pop_np = np.concatenate((pop_np, np.expand_dims(p, 0)),0)

        # NOTE(review): the failure path returns seq_orig (word ids) while
        # the success path returns seq_orig_string (words) — confirm callers
        # handle both shapes.
        return None, seq_orig