Example #1
def attack(text_ls, true_label, predictor, model, batch_size=1):
    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        # print(text_ls)
        cw = CarliniL2(debug=False, targeted=False)
        cw.num_classes = 2
        num_queries = 1
        adv_seq, success = model.attack([text_ls], true_label, cw, batch_size)
        if adv_seq is not None:
            text_prime = model.dataset.transform_back_text(adv_seq)
            print("adv texts:", text_prime)
        else:
            print("optimize fail")
            text_prime = text_ls
        num_changed = 0
        return ' '.join(text_prime), num_changed, orig_label.cpu().item(), \
               torch.argmax(predictor([text_prime])).cpu().item(), num_queries
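
A minimal sketch of the early-exit check at the top of attack(): if the victim model already misclassifies the clean input, the sample is skipped and no attack is run. The tensor values below are illustrative, not taken from the example.

import torch

# illustrative stand-in for predictor([text_ls]).squeeze()
orig_probs = torch.tensor([0.2, 0.8])
orig_label = torch.argmax(orig_probs)   # predicted class: 1
true_label = 0
if true_label != orig_label.item():
    # already misclassified: return the original text with zero changes/queries
    result = ('', 0, orig_label.item(), orig_label.item(), 0)
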
Example #2
def cw_tree_attack(data_val, tree_data):
    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    tot = 0
    orig_append_correct = 0
    adv_pickle = []

    cw = CarliniL2(debug=args.debugging)
    embed = torch.load(args.word_vector)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    vocab = Vocab(filename=args.dictionary,
                  data=[PAD_WORD, UNK_WORD, EOS_WORD, SOS_WORD])
    generator = Generator(args.test_data,
                          vocab=vocab,
                          embed=embed,
                          data_set=data_val)
    bert_transfered_embedding = torch.load('bert_transfered_embedding.pth')
    transfer_emb = torch.nn.Embedding(
        bert_transfered_embedding.size(0),
        bert_transfered_embedding.size(1)).to(device)
    # transfer_emb = torch.nn.Embedding.from_pretrained(bert_transfered_embedding).to(device)
    transfer_emb.weight.data.copy_(bert_transfered_embedding)
    seqback = WrappedSeqback(embed,
                             device,
                             attack=True,
                             seqback_model=generator.seqback_model,
                             vocab=vocab,
                             transfer_emb=transfer_emb)
    treelstm = generator.tree_model
    generator.load_state_dict(torch.load(args.load_ae))

    class TreeModel(nn.Module):
        def __init__(self):
            super(TreeModel, self).__init__()

        def forward(self, hidden):
            self.embedding = seqback(hidden)
            return model(batch['data'],
                         batch['seq_len'],
                         perturbed=self.embedding)['pred']

        def set_temp(self, temp):
            seqback.temp = temp

        def get_embedding(self):
            return self.embedding

        def get_seqback(self):
            return seqback

    tree_model = TreeModel()
    for batch in get_tree_batch(data_val, tree_data, vocab):
        input_embedding = model.bert.embeddings.word_embeddings(batch['data'])
        batch['tree'] = [generator.get_tree(tree) for tree in batch['tree']]
        seqback.sentences = input_embedding.clone().detach()
        seqback.batch_trees = batch['tree']
        seqback.batch_add_sent = batch['ae_add_sents']
        seqback.start = batch['add_start']
        seqback.end = batch['add_end']
        seqback.adv_sent = []

        batch_tree_embedding = []
        for bi, append_sent in enumerate(batch['ae_add_sents']):
            sentences = [
                torch.tensor(append_sent, dtype=torch.long, device=device)
            ]
            trees = [batch['tree'][bi]]
            tree_embedding = treelstm(sentences, trees)[0][0].detach()
            batch_tree_embedding.append(tree_embedding)

        hidden = torch.cat(batch_tree_embedding, dim=0)
        cw.batch_info = batch

        adv_hidden = cw.run(tree_model,
                            hidden,
                            batch['attack_targets'],
                            batch_size=hidden.shape[0],
                            input_token=input_embedding)
        seqback.adv_sent = []

        adv_seq = batch['data'].clone().detach().to(device)
        for bi, (add_start, add_end) in enumerate(
                zip(batch['add_start'], batch['add_end'])):
            if bi in cw.o_best_sent:
                ae_words = cw.o_best_sent[bi]
                bert_tokens = tokenizer.convert_tokens_to_ids(ae_words)
                adv_seq[bi, add_start:add_end] = torch.LongTensor(bert_tokens)

        out = model(adv_seq, batch['seq_len'])['pred']
        prediction = torch.max(out, 1)[1]
        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == batch['label']).float()).item()
        targeted_success += torch.sum(
            (prediction == batch['attack_targets']).float()).item()
        untargeted_success += untargeted_success_rate(prediction,
                                                      batch['label'])
        tot += len(batch['label'])

        for i in range(len(batch['label'])):
            adv_pickle.append({
                'raw_text': transform(adv_seq[i]),
                'label': batch['label'][i].item()
            })
            try:
                logger.info(("orig:", transform(batch['add_sents'][i])))
                logger.info(("adv:", cw.o_best_sent[i]))
            except Exception:  # cw.o_best_sent may not have an entry for every sample
                continue

        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted successful rate:", targeted_success))
        logger.info(("untargetd successful rate:", untargeted_success))
        logger.info(("tot:", tot))
        joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("targeted successful rate:", targeted_success / tot))
    logger.info(("untargetd successful rate:", untargeted_success / tot))
Example #3
def cw_word_attack(data_val):
    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    orig_append_correct = 0
    tot = 0
    adv_pickle = []

    cw = CarliniL2(debug=args.debugging)
    for batch in get_batch(data_val):
        data = batch['data']
        seq_len = batch['seq_len']
        label = batch['label']
        batch_add_start = batch['add_start']
        batch_add_end = batch['add_end']
        attack_targets = batch['attack_targets']
        add_sents = batch['add_sents']
        tot += len(label)

        input_embedding = model.bert.embeddings.word_embeddings(data)
        cw_mask = np.zeros(input_embedding.shape).astype(np.float32)
        for bi, (add_start,
                 add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            cw_mask[bi, add_start:add_end] = 1
        cw_mask = torch.from_numpy(cw_mask).float().to(device)
        cw.wv = model.bert.embeddings.word_embeddings.weight
        cw.mask = cw_mask
        cw.seq = data
        cw.batch_info = batch
        cw.seq_len = seq_len
        adv_data = cw.run(model, input_embedding, attack_targets)

        adv_seq = batch['data'].clone().detach().to(device)
        for bi, (add_start,
                 add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            if bi in cw.o_best_sent:
                adv_seq.data[bi, add_start:add_end] = torch.LongTensor(
                    cw.o_best_sent[bi])
        out = model(adv_seq, seq_len)['pred']
        prediction = torch.max(out, 1)[1]
        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == label).float()).item()
        targeted_success += torch.sum(
            (prediction == attack_targets).float()).item()
        untargeted_success += untargeted_success_rate(prediction, label)

        for i in range(len(add_sents)):
            adv_pickle.append({
                'raw_text': transform(adv_seq[i]),
                'label': label[i].item()
            })
            try:
                logger.info(("orig:", transform(add_sents[i][1:])))
                logger.info(("adv:", transform(cw.o_best_sent[i])))
            except Exception:  # cw.o_best_sent may not have an entry for every sample
                continue

        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted successful rate:", targeted_success))
        logger.info(("untargetd successful rate:", untargeted_success))
        logger.info(("tot:", tot))
        joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("targeted successful rate:", targeted_success / tot))
    logger.info(("untargetd successful rate:", untargeted_success / tot))
Example #4
File: attack.py  Project: ml-lab/AdvCodec
def cw_tree_attack(data_val):
    init_attack()
    cw = CarliniL2()
    embed = torch.load(args.word_vector)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    generator = Generator(args.test_data, vocab=vocab, embed=embed, data_set=data_val)
    seqback = WrappedSeqback(embed, device, attack=True, seqback_model=generator.seqback_model, vocab=vocab,
                             transfer_emb=model.encoder.bilstm.encoder)
    treelstm = generator.tree_model
    generator.load_state_dict(torch.load(args.load_ae))

    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_append_correct = 0
    orig_correct = 0
    tot = 0
    adv_pickle = []

    class TreeModel(nn.Module):
        def __init__(self):
            super(TreeModel, self).__init__()

        def forward(self, hidden):
            self.embedding = seqback(hidden)
            return model(self.embedding)

        def set_temp(self, temp):
            seqback.temp = temp

        def get_embedding(self):
            return self.embedding

        def get_seqback(self):
            return seqback

    tree_model = TreeModel()
    for batch in get_batch(data_val, has_tree=True):
        seqback.sentences = batch['data']
        seqback.batch_trees = batch['tree']
        seqback.batch_masks = batch['mask']
        seqback.batch_splitted_sentences = batch['split_text']
        seqback.start = batch['add_start']
        seqback.end = batch['add_end']
        batch_add_start = batch['add_start']
        batch_add_end = batch['add_end']
        seqback.adv_sent = []
        batch_tree_embedding = []

        for bi, split_text in enumerate(batch['split_text']):
            # TODO: by default only the first split sentence's embedding is used?
            batch['split_text'][bi] = [torch.tensor(x, dtype=torch.long, device=device) for x in split_text]
            sentences = [batch['split_text'][bi][0]]
            trees = [batch['tree'][bi][0]]
            masks = [batch['mask'][bi][0]]
            tree_embedding = treelstm(sentences, trees, masks)[0][0].detach()
            batch_tree_embedding.append(tree_embedding)

        hidden = torch.cat(batch_tree_embedding, dim=0)
        data = batch['data']
        model.encoder.raw_inp = batch['data']
        model.init_hidden(data.size(1))
        model.encoder.bilstm.attack_mode = True
        input_embedding = model.encoder.bilstm.encoder(data)

        # np.save('tree_attack/input.npy', input_token.cpu().numpy())

        if args.baseline:
            modifier = torch.randn_like(hidden, device=device)
            modifier = F.normalize(modifier, p=2, dim=1) * 1e2
            adv_hidden = hidden + modifier
        else:
            with torch.autograd.detect_anomaly():
                adv_hidden = cw.run(tree_model, hidden, batch['attack_targets'], batch_size=hidden.shape[0],
                                    input_token=input_embedding)

        seqback.adv_sent = []

        adv_seq = data.clone().detach().to(device)
        for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            if bi in cw.o_best_sent:
                adv_seq[add_start:add_end, bi] = cw.o_best_sent[bi]

        for i in range(len(batch['label'])):
            adv_pickle.append({
                'raw_text': vocab.tensorConvertToLabels(adv_seq[:, i]),
                'label': batch['label'][i]
            })
            try:
                logger.info(("orig:", vocab.convertToLabels(batch['add_words'][i])))
                logger.info(("adv:", vocab.tensorConvertToLabels(cw.o_best_sent[i])))
            except Exception:  # cw.o_best_sent may not have an entry for every sample
                continue

        model.encoder.raw_inp = None
        model.encoder.bilstm.attack_mode = False
        output, attention = model(adv_seq)
        output_flat = output.view(data.size(1), -1)
        prediction = torch.max(output_flat, 1)[1]

        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == batch['targets']).float()).item()
        targeted_success += torch.sum((prediction == batch['attack_targets']).float()).item()
        untargeted_success += untargeted_success_rate(prediction, batch['label'])
        tot += len(batch['label'])

        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted successful rate:", targeted_success))
        logger.info(("untargetd successful rate:", untargeted_success))
        logger.info(("tot:", tot))
        joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("targeted successful rate:", targeted_success / tot))
    logger.info(("untargetd successful rate:", untargeted_success / tot))
Example #5
File: attack.py  Project: ml-lab/AdvCodec
def cw_seq_attack(data_val):
    init_attack()
    cw = CarliniL2()
    embed = torch.load(args.word_vector)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    encoder = EncoderRNN(vocab, embed.size(1), args.hidden_dim, device)
    decoder = Decoder(embed.size(1), args.hidden_dim, vocab.size(), dropout=0.0)
    generator = Seq2SeqGenerator(encoder, decoder, embed=embed).to(device)
    seqback = WrappedSeqDecoder(decoder, vocab)
    generator.load_state_dict(torch.load(args.load_ae))

    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    tot = 0

    def get_seq_hidden(batch_add_words):
        # get lstm hidden embedding
        encoder_output, hidden = encoder(batch_add_words)
        hidden = torch.stack(hidden).transpose_(0, 2).detach()
        encoder_output = encoder_output.detach()
        return encoder_output, hidden

    class SeqModel(nn.Module):
        def __init__(self):
            super(SeqModel, self).__init__()

        def forward(self, hidden):
            embedding = seqback(hidden)
            return model(embedding)

    seq_model = SeqModel()
    for batch in get_batch(data_val):
        batch_add_words = batch['add_words']
        sos = torch.tensor([vocab.getIndex(SOS_WORD)], dtype=torch.long)
        eos = torch.tensor([vocab.getIndex(EOS_WORD)], dtype=torch.long)
        for i, sentence in enumerate(batch_add_words):
            sentence = torch.tensor(sentence)
            sentence = torch.cat((sos, sentence, eos), 0)
            sentence = sentence.to(device)
            batch_add_words[i] = sentence
        from torch.nn.utils.rnn import pad_sequence
        batch_add_words = pad_sequence(batch_add_words, padding_value=vocab.getIndex(PAD_WORD))
        encoder_output, hidden = get_seq_hidden(batch_add_words)

        seqback.trg = batch_add_words
        seqback.encoder_output = encoder_output
        seqback.start = batch['add_start']
        seqback.end = batch['add_end']
        seqback.sentences = batch['data']
        seqback.adv_sent = []

        data = batch['data']
        model.encoder.raw_inp = batch['data']
        model.init_hidden(data.size(1))
        model.encoder.bilstm.attack_mode = True

        if args.baseline:
            modifier = torch.randn_like(hidden, device=device)
            modifier = F.normalize(modifier, p=2, dim=3) * 1e2
            adv_hidden = hidden + modifier
        else:
            adv_hidden = cw.run(seq_model, hidden, batch['attack_targets'], batch_size=hidden.shape[0])
            adv_hidden = torch.tensor(adv_hidden).to(device)

        seqback.adv_sent = []
        output, attention = seq_model(adv_hidden)

        output_flat = output.view(data.size(1), -1)
        prediction = torch.max(output_flat, 1)[1]

        orig_correct += batch['orig_correct'].item()
        adv_correct += torch.sum((prediction == batch['targets']).float()).item()
        targeted_success += torch.sum((prediction == batch['attack_targets']).float()).item()
        untargeted_success += untargeted_success_rate(prediction, batch['label'])
        tot += len(batch['label'])

        for adv, orig in zip(seqback.adv_sent, batch['add_words']):
            print("orig:", vocab.tensorConvertToLabels(orig[1:], vocab.getIndex(PAD_WORD))[:-1], file=adv_sent_file)
            print("adv:", adv[:-1], file=adv_sent_file)

        print("orig_correct:", orig_correct)
        print("adv_correct:", adv_correct)
        print("targeted successful rate:", targeted_success)
        print("untargetd successful rate:", untargeted_success)
        print("tot:", tot)

    print("orig_correct:", orig_correct / tot)
    print("adv_correct:", adv_correct / tot)
    print("targeted successful rate:", targeted_success / tot)
    print("untargetd successful rate:", untargeted_success / tot)
Example #6
File: attack.py  Project: ml-lab/AdvCodec
def cw_word_attack(data_val):
    init_attack()
    # fname = "/home/wbx/yelp/vectors.kv"
    fname = "full-vectors.kv"
    if not os.path.isfile(fname):
        # export the encoder's word embeddings once, in word2vec text format
        embed = model.encoder.bilstm.encoder.weight
        with open(fname, "a") as f:
            print(len(vocab.idxToLabel), embed.shape[1], file=f)
            for k, v in vocab.idxToLabel.items():
                vector = " ".join(str(x) for x in embed[k].cpu().numpy())
                print(v, vector, file=f)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    orig_append_correct = 0
    tot = 0
    adv_pickle = []

    cw = CarliniL2(debug=args.debugging)
    for batch in get_batch(data_val):
        data = batch['data']
        attack_targets = batch['attack_targets']
        batch_add_start = batch['add_start']
        batch_add_end = batch['add_end']
        text = batch['text']
        split_text = batch['split_text']
        label = batch['label']
        # convert text into embedding and attack in the embedding space
        model.encoder.raw_inp = data
        model.init_hidden(data.size(1))
        model.encoder.bilstm.attack_mode = True
        input_embedding = model.encoder.bilstm.encoder(data)

        cw_mask = np.zeros(input_embedding.shape).astype(np.float32)
        for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            cw_mask[add_start:add_end, bi] = 1
        cw_mask = torch.from_numpy(cw_mask).float()
        if args.cuda:
            cw_mask = cw_mask.cuda()
            cw.batch_info = batch
            cw.wv = model.encoder.bilstm.encoder.weight

        if args.baseline:
            modifier = torch.randn_like(input_embedding, device=device)  # perturb in embedding space, not the integer token ids
            modifier = F.normalize(modifier, p=2, dim=2) * 10
            adv_data = input_embedding + modifier * cw_mask
            adv_data = adv_data.cpu().detach().numpy()
        else:
            cw.mask = cw_mask
            adv_data = cw.run(model, input_embedding, attack_targets)
            # adv_hidden = torch.tensor(adv_data).to(device)

            adv_seq = data.clone().detach().to(device)
            for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)):
                if bi in cw.o_best_sent:
                    adv_seq.data[add_start:add_end, bi] = torch.LongTensor(cw.o_best_sent[bi])

            for i in range(len(split_text)):
                adv_pickle.append({
                    'raw_text': vocab.tensorConvertToLabels(adv_seq[:, i]),
                    'label': label[i]
                })
                try:
                    logger.info(("orig:", vocab.convertToLabels(split_text[i][0])))
                    logger.info(("adv:", vocab.convertToLabels(cw.o_best_sent[i])))
                except Exception:  # cw.o_best_sent may not have an entry for every sample
                    continue

            model.encoder.raw_inp = None
            model.encoder.bilstm.attack_mode = False
            output, attention = model(adv_seq)
            output_flat = output.view(data.size(1), -1)
            prediction = torch.max(output_flat, 1)[1]

            targets = batch['targets']
            orig_correct += batch['orig_correct'].item()
            orig_append_correct += batch['orig_append_correct'].item()
            adv_correct += torch.sum((prediction == targets).float()).item()
            targeted_success += torch.sum((prediction == attack_targets).float()).item()
            untargeted_success += untargeted_success_rate(prediction, label)
            tot += len(label)

            logger.info(("orig_correct:", orig_correct))
            logger.info(("orig_append_correct:", orig_append_correct))
            logger.info(("adv_correct:", adv_correct))
            logger.info(("targeted successful rate:", targeted_success))
            logger.info(("untargetd successful rate:", untargeted_success))
            logger.info(("tot:", tot))
            joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("targeted successful rate:", targeted_success / tot))
    logger.info(("untargetd successful rate:", untargeted_success / tot))