def main():
    # Load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub-sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300)
        word_embedding_dim = 300
    # Load pretrained word vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # Where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # If the model already exists (it has been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # Otherwise, train the model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32, sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # the RNN cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build a k-d tree if you are using the gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # Filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # Get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # the RNN cannot do backwards in eval mode

    # Initialize triggers, which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # Sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data, num_epochs=5, shuffle=True), group_size=1):
        # Get accuracy with the current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # the RNN cannot do backwards in eval mode

        # Get the gradient w.r.t. the trigger embeddings for the current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # Pass the gradients to a particular attack to generate token candidates for each trigger token.
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Try all of the candidates and keep the trigger sequence with the highest loss.
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # Print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
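
# For reference, a minimal sketch of the first-order (HotFlip-style) candidate selection that
# attacks.hotflip_attack performs: score every vocabulary entry by the dot product between its
# embedding and the averaged gradient of the loss w.r.t. each trigger embedding, then keep the
# top-k per trigger position. The function name and tensor shapes below are illustrative
# assumptions for this sketch, not the exact signature used in attacks.py.
def hotflip_candidates_sketch(averaged_grad, embedding_matrix, num_candidates=40):
    # averaged_grad: (num_trigger_tokens, embed_dim), already averaged over the batch
    # embedding_matrix: (vocab_size, embed_dim)
    gradient_dot_embedding = torch.einsum("td,vd->tv", averaged_grad, embedding_matrix)
    # With increase_loss=True, a larger dot product approximates a larger increase in the loss.
    _, best_k_ids = torch.topk(gradient_dot_embedding, num_candidates, dim=1)
    return best_k_ids  # (num_trigger_tokens, num_candidates) candidate token ids per position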
def main():
    # Load the SNLI dataset
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    tokenizer = WordTokenizer(end_tokens=["@@NULL@@"])  # add @@NULL@@ to the end of sentences
    reader = SnliReader(token_indexers={"tokens": single_id_indexer}, tokenizer=tokenizer)
    dev_dataset = reader.read("https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl")

    # Load the model and vocab
    model = load_archive("https://allennlp.s3-us-west-2.amazonaws.com/models/esim-glove-snli-2019.04.23.tar.gz").model
    model.eval().cuda()
    vocab = model.vocab

    # Add hooks for the embeddings so we can compute gradients w.r.t. the input tokens
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # save the word embedding matrix

    # Batches of examples to construct triggers
    universal_perturb_batch_size = 32
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Subsample the dataset to one class to do a universal attack on that class
    dataset_label_filter = "entailment"  # only entailment examples
    # dataset_label_filter = 'contradiction'  # only contradiction examples
    # dataset_label_filter = 'neutral'  # only neutral examples
    subset_dev_dataset = []
    for instance in dev_dataset:
        if instance["label"].label == dataset_label_filter:
            subset_dev_dataset.append(instance)

    # The attack is targeted towards a specific class
    # target_label = "0"  # flip to entailment
    target_label = "1"  # flip to contradiction
    # target_label = "2"  # flip to neutral

    # A k-d tree if you want to do gradient + nearest neighbors
    # tree = KDTree(embedding_weight.numpy())

    # Get the original accuracy before adding universal triggers
    utils.get_accuracy(model, subset_dev_dataset, vocab, trigger_token_ids=None, snli=True)
    model.train()  # the RNN cannot do backwards in eval mode

    # Initialize the triggers
    num_trigger_tokens = 1  # one token prepended
    trigger_token_ids = [vocab.get_token_index("a")] * num_trigger_tokens

    # Sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(subset_dev_dataset, num_epochs=10, shuffle=True), group_size=1):
        # Get model accuracy with the current triggers
        utils.get_accuracy(model, subset_dev_dataset, vocab, trigger_token_ids, snli=True)
        model.train()  # the RNN cannot do backwards in eval mode

        # Get the gradient of the triggers
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids, target_label, snli=True)

        # Find attack candidates using an attack method
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        num_candidates=40)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        decrease_prob=True)

        # Query the model to get the best candidates
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids,
                                                      snli=True)
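
# For reference, a rough sketch of the re-ranking step that utils.get_best_candidates performs on
# each batch: substitute every candidate token at every trigger position, query the model, and keep
# the trigger with the highest loss. This is a greedy (beam size 1) illustration of the idea;
# evaluate_batch_loss is a hypothetical callable standing in for the repo's actual batch-loss
# evaluation, not a function defined in utils.py.
def greedy_candidate_search_sketch(evaluate_batch_loss, batch, trigger_token_ids, cand_trigger_token_ids):
    best_trigger = list(trigger_token_ids)
    best_loss = evaluate_batch_loss(batch, best_trigger)
    for position in range(len(best_trigger)):
        for candidate in cand_trigger_token_ids[position]:
            trial = list(best_trigger)
            trial[position] = int(candidate)
            loss = evaluate_batch_loss(batch, trial)
            if loss > best_loss:  # keep the substitution that most increases the loss
                best_loss, best_trigger = loss, trial
    return best_trigger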
def main():
    # Read the SQuAD validation dataset using a word tokenizer
    single_id = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = SquadReader(token_indexers={'tokens': single_id})
    dev_dataset = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v1.1.json')

    # Load the model and its associated vocabulary.
    model = load_archive('https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-glove-2019.05.09.tar.gz').model
    vocab = model.vocab
    model.eval().cuda()

    # Filter to just certain `wh` questions
    who_questions_dev, what_questions_dev, where_questions_dev, when_questions_dev, \
        how_questions_dev, why_questions_dev, which_questions_dev, other_questions_dev = ([] for i in range(8))
    for item in dev_dataset:
        for word in item['question']:
            if word.text.lower() == 'who':
                who_questions_dev.append(item)
                break
            if word.text.lower() == 'what':
                what_questions_dev.append(item)
                break
            if word.text.lower() == 'where':
                where_questions_dev.append(item)
                break
            if word.text.lower() == 'when':
                when_questions_dev.append(item)
                break
            if word.text.lower() == 'how':
                how_questions_dev.append(item)
                break
            if word.text.lower() == 'why':
                why_questions_dev.append(item)
                break
            if word.text.lower() == 'which':
                which_questions_dev.append(item)
                break
        else:
            # no wh-word matched for this question
            other_questions_dev.append(item)

    # Use batches to craft the universal perturbations
    universal_perturb_batch_size = 32
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # We register a gradient hook on the embeddings.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # save the word embedding matrix

    # Initialize the trigger. The first option is an initialization with all "the" tokens.
    # You can customize it; make sure to set the fixed target answer and the question type.
    # The second option is a trigger found after running the attack, as reported in our paper.
    trigger_init = "the the the the donald trump the the the the"
    target_answer = "donald trump"
    subsampled_dev_dataset = who_questions_dev  # universal attack on `who` questions
    # trigger_init = "why how ; known because : to kill american people ."
    # target_answer = "to kill american people"
    # subsampled_dev_dataset = why_questions_dev  # universal attack on `why` questions

    # Tokenize the trigger and find the start/end span of the target answer.
    # Make sure the trigger tokens are space separated.
    trigger_token_ids = [vocab.get_token_index(t) for t in trigger_init.split(' ')]
    span_start = trigger_init.split(' ').index(target_answer.split(' ')[0])  # start of target_answer
    span_end = trigger_init.split(' ').index(target_answer.split(' ')[-1])
    # We ignore replacement at the positions of the answer (the answer is fixed)
    ignore_indices = [0] * span_start + \
        [1] * (span_end - span_start + 1) + \
        [0] * (len(trigger_token_ids) - 1 - span_end)

    # Larger values for these parameters give better results, but are slower
    num_candidates = 20
    beam_size = 5
    for _ in range(100):
        # Get the targeted accuracy
        squad_utils.get_accuracy_squad(model,
                                       subsampled_dev_dataset,
                                       vocab,
                                       trigger_token_ids,
                                       target_answer,
                                       span_start,
                                       span_end)
        model.train()

        # Get the gradient for the appended tokens averaged over the batch.
        averaged_grad = squad_utils.get_average_grad_squad(model,
                                                           vocab,
                                                           trigger_token_ids,
                                                           subsampled_dev_dataset,
                                                           span_start,
                                                           span_end)

        # Use an attack method to get the top candidates
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=num_candidates,
                                                        increase_loss=False)

        # Query the model with the top candidates to find the best tokens.
        trigger_token_ids = squad_utils.get_best_candidates_squad(model,
                                                                  trigger_token_ids,
                                                                  cand_trigger_token_ids,
                                                                  vocab,
                                                                  subsampled_dev_dataset,
                                                                  beam_size,
                                                                  ignore_indices,
                                                                  span_start,
                                                                  span_end)
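
# For reference, an approximate sketch of what utils.add_hooks does in all three attacks:
# locate the word-embedding module and register a backward hook that stores the gradient
# flowing out of it, so each attack step can read the gradient w.r.t. the trigger embeddings.
# The global list, function names, and module-matching check are illustrative assumptions;
# the repo's actual hook logic may differ.
extracted_grads_sketch = []

def add_embedding_hook_sketch(model):
    def save_grad(module, grad_in, grad_out):
        extracted_grads_sketch.append(grad_out[0])  # gradient w.r.t. the embedding output
    for module in model.modules():
        if isinstance(module, torch.nn.Embedding):
            module.weight.requires_grad = True
            module.register_backward_hook(save_grad)  # register_full_backward_hook on newer PyTorch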