import copy


def train_model(model, data_train, data_valid, vocab, num_epoch=10):
    # Train for num_epoch epochs and keep the weights of the epoch with the
    # best validation accuracy.
    mx_acc = None
    mx_model = None
    for i in range(num_epoch):
        loss = train_epoch(model, data_train, vocab)
        clsf = OpenAttack.PytorchClassifier(model, word2id=vocab)
        accuracy = len(data_valid.eval(clsf).correct()) / len(data_valid)
        print("Epoch %d: loss: %f, accuracy %f" % (i, loss, accuracy))
        if mx_acc is None or mx_acc < accuracy:
            mx_acc = accuracy
            # Deep-copy the state dict: state_dict() returns references to the
            # live tensors, which later training steps would overwrite.
            mx_model = copy.deepcopy(model.state_dict())
    model.load_state_dict(mx_model)
    return model
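# train_model() relies on a train_epoch() helper defined elsewhere in this
# script. For reference, here is a minimal sketch of what such a helper could
# look like; the field names ("tokens", "y"), the dict-style vocab lookup, and
# the padding scheme are illustrative assumptions, not this script's actual
# implementation.
import torch
import torch.nn.functional as F


def train_epoch_sketch(model, data_train, vocab, lr=1e-3, batch_size=32):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()
    total_loss, num_batches = 0.0, 0
    examples = list(data_train)
    for start in range(0, len(examples), batch_size):
        batch = examples[start:start + batch_size]
        # Map word tokens to ids (0 for out-of-vocabulary) and pad to a batch.
        ids = [torch.tensor([vocab.get(t, 0) for t in ex["tokens"]]) for ex in batch]
        x = torch.nn.utils.rnn.pad_sequence(ids, batch_first=True)
        y = torch.tensor([ex["y"] for ex in batch])
        loss = F.cross_entropy(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    return total_loss / max(num_batches, 1)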
def main():
    print("Loading data")
    train, valid, test, vocab = prepare_data()  # Load dataset
    model = make_model(len(vocab))  # Design a victim model

    print("Training")
    trained_model = train_model(model, train, valid, vocab)  # Train the victim model

    print("Generating adversarial samples (this step may take tens of minutes)")
    clsf = OpenAttack.PytorchClassifier(trained_model, word2id=vocab)  # Wrap the victim model
    adversarial_samples = attack(clsf, train)  # Conduct adversarial attacks and generate adversarial examples

    print("Adversarially training classifier")
    # Merge the clean training set with the adversarial examples; both carry
    # "x", "y", and "tokens" columns, which are concatenated column by column
    # into a new datasets.Dataset.
    new_dataset = {"x": [], "y": [], "tokens": []}
    for it in train:
        new_dataset["x"].append(it["x"])
        new_dataset["y"].append(it["y"])
        new_dataset["tokens"].append(it["tokens"])
    for it in adversarial_samples:
        new_dataset["x"].append(it["x"])
        new_dataset["y"].append(it["y"])
        new_dataset["tokens"].append(it["tokens"])
    finetune_model = train_model(
        trained_model, datasets.Dataset.from_dict(new_dataset), valid, vocab
    )  # Retrain the classifier with additional adversarial examples

    print("Testing enhanced model (this step may take tens of minutes)")
    clsf = OpenAttack.PytorchClassifier(finetune_model, word2id=vocab)  # Wrap the fine-tuned model
    attack(clsf, train)  # Re-attack the victim model to measure the effect of adversarial training
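# Standard entry-point guard so the whole pipeline runs when the script is
# executed directly but not when it is imported as a module.
if __name__ == "__main__":
    main()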