import os
import time

import numpy as np

# Project-level helpers (get_tokenizer, split_imdb_files, split_agnews_files,
# split_yahoo_files, word_process, char_process, write_origin_input_texts, the
# model constructors, ForwardGradWrapper, adversarial_paraphrase, and the
# parsed `args`/`opt` objects) are assumed to be imported from elsewhere in
# this repository.


def fool_text_classifier():
    clean_samples_cap = args.clean_samples_cap  # 1000
    print('clean_samples_cap:', clean_samples_cap)

    # get tokenizer
    dataset = args.dataset
    tokenizer = get_tokenizer(opt)

    # Read data set
    x_test = y_test = None
    test_texts = None
    if dataset == 'imdb':
        train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels = split_imdb_files(opt)
        if args.level == 'word':
            x_train, y_train, x_test, y_test = word_process(train_texts, train_labels, test_texts, test_labels, dataset)
        elif args.level == 'char':
            x_train, y_train, x_test, y_test = char_process(train_texts, train_labels, test_texts, test_labels, dataset)
    elif dataset == 'agnews':
        train_texts, train_labels, test_texts, test_labels = split_agnews_files()
        if args.level == 'word':
            x_train, y_train, x_test, y_test = word_process(train_texts, train_labels, test_texts, test_labels, dataset)
        elif args.level == 'char':
            x_train, y_train, x_test, y_test = char_process(train_texts, train_labels, test_texts, test_labels, dataset)
    elif dataset == 'yahoo':
        train_texts, train_labels, test_texts, test_labels = split_yahoo_files()
        if args.level == 'word':
            x_train, y_train, x_test, y_test = word_process(train_texts, train_labels, test_texts, test_labels, dataset)
        elif args.level == 'char':
            x_train, y_train, x_test, y_test = char_process(train_texts, train_labels, test_texts, test_labels, dataset)

    # Write clean examples into a txt file
    clean_texts_path = r'./fool_result/{}/clean_{}.txt'.format(dataset, str(clean_samples_cap))
    if not os.path.isfile(clean_texts_path):
        write_origin_input_texts(clean_texts_path, test_texts)

    # Select the model and load the trained weights
    assert args.model[:4] == args.level
    model = None
    if args.model == "word_cnn":
        model = word_cnn(dataset)
    elif args.model == "word_bdlstm":
        model = bd_lstm(dataset)
    elif args.model == "char_cnn":
        model = char_cnn(dataset)
    elif args.model == "word_lstm":
        model = lstm(dataset)
    model_path = r'./runs/{}/{}.dat'.format(dataset, args.model)
    model.load_weights(model_path)
    print('model path:', model_path)

    # Evaluate classification accuracy of the model on clean samples
    scores_origin = model.evaluate(x_test[:clean_samples_cap], y_test[:clean_samples_cap])
    print('clean samples origin test_loss: %f, accuracy: %f' % (scores_origin[0], scores_origin[1]))
    all_scores_origin = model.evaluate(x_test, y_test)
    print('all origin test_loss: %f, accuracy: %f' % (all_scores_origin[0], all_scores_origin[1]))

    grad_guide = ForwardGradWrapper(model)
    classes_prediction = grad_guide.predict_classes(x_test[:clean_samples_cap])

    print('Crafting adversarial examples...')
    successful_perturbations = 0
    failed_perturbations = 0
    sub_rate_list = []
    NE_rate_list = []
    start_cpu = time.process_time()  # time.clock() was removed in Python 3.8
    adv_text_path = r'./fool_result/{}/{}/adv_{}.txt'.format(dataset, args.model, str(clean_samples_cap))
    change_tuple_path = r'./fool_result/{}/{}/change_tuple_{}.txt'.format(dataset, args.model, str(clean_samples_cap))
    file_1 = open(adv_text_path, "a")
    file_2 = open(change_tuple_path, "a")
    for index, text in enumerate(test_texts[:clean_samples_cap]):
        sub_rate = 0
        NE_rate = 0
        change_tuple_list = []  # stays empty when the sample is already misclassified
        if np.argmax(y_test[index]) == classes_prediction[index]:
            # If the ground-truth label is the same as the predicted label,
            # craft an adversarial paraphrase
            adv_doc, adv_y, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
                input_text=text,
                true_y=np.argmax(y_test[index]),
                grad_guide=grad_guide,
                tokenizer=tokenizer,
                dataset=dataset,
                level=args.level)
            if adv_y != np.argmax(y_test[index]):
                successful_perturbations += 1
                print('{}. Successful example crafted.'.format(index))
            else:
                failed_perturbations += 1
                print('{}. Failure.'.format(index))
            text = adv_doc
        sub_rate_list.append(sub_rate)
        NE_rate_list.append(NE_rate)
        file_2.write(str(index) + str(change_tuple_list) + '\n')
        file_1.write(text + " sub_rate: " + str(sub_rate) + "; NE_rate: " + str(NE_rate) + "\n")
    end_cpu = time.process_time()
    print('CPU second:', end_cpu - start_cpu)
    mean_sub_rate = sum(sub_rate_list) / len(sub_rate_list)
    mean_NE_rate = sum(NE_rate_list) / len(NE_rate_list)
    print('mean substitution rate:', mean_sub_rate)
    print('mean NE rate:', mean_NE_rate)
    file_1.close()
    file_2.close()
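# The attack loop above only needs two things from ForwardGradWrapper: hard
# class predictions and class probabilities for ranking candidate word
# substitutions. The minimal sketch below is an assumption about that contract
# (the class name and the Keras-style `model.predict` call are illustrative,
# not the repository's actual implementation).
class _ForwardGradWrapperSketch:
    def __init__(self, model):
        self.model = model  # a trained Keras-style text classifier

    def predict_prob(self, input_vector):
        # One row of class probabilities per padded token-id sequence.
        return self.model.predict(input_vector)

    def predict_classes(self, input_vector):
        # Hard labels: argmax over the probability rows.
        return np.argmax(self.predict_prob(input_vector), axis=-1)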
def fool_text_classifier_pytorch(model, dataset='imdb'):
    clean_samples_cap = 100
    print('clean_samples_cap:', clean_samples_cap)

    # get tokenizer
    tokenizer = get_tokenizer(opt)

    # Read data set
    x_test = y_test = None
    test_texts = None
    if dataset == 'imdb':
        train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels = split_imdb_files(opt)
        x_train, y_train, x_test, y_test = word_process(train_texts, train_labels, test_texts, test_labels, dataset)
    elif dataset == 'agnews':
        train_texts, train_labels, test_texts, test_labels = split_agnews_files()
        x_train, y_train, x_test, y_test = word_process(train_texts, train_labels, test_texts, test_labels, dataset)
    elif dataset == 'yahoo':
        train_texts, train_labels, test_texts, test_labels = split_yahoo_files()
        x_train, y_train, x_test, y_test = word_process(train_texts, train_labels, test_texts, test_labels, dataset)

    grad_guide = ForwardGradWrapper_pytorch(model)
    classes_prediction = grad_guide.predict_classes(x_test[:clean_samples_cap])

    print('Crafting adversarial examples...')
    successful_perturbations = 0
    failed_perturbations = 0
    sub_rate_list = []
    NE_rate_list = []
    start_cpu = time.process_time()  # time.clock() was removed in Python 3.8
    adv_text_path = r'./fool_result/{}/adv_{}.txt'.format(dataset, str(clean_samples_cap))
    change_tuple_path = r'./fool_result/{}/change_tuple_{}.txt'.format(dataset, str(clean_samples_cap))
    file_1 = open(adv_text_path, "a")
    file_2 = open(change_tuple_path, "a")
    for index, text in enumerate(test_texts[:clean_samples_cap]):
        sub_rate = 0
        NE_rate = 0
        change_tuple_list = []  # stays empty when the sample is already misclassified
        if np.argmax(y_test[index]) == classes_prediction[index]:
            # If the ground-truth label is the same as the predicted label,
            # craft an adversarial paraphrase (word level only)
            adv_doc, adv_y, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
                input_text=text,
                true_y=np.argmax(y_test[index]),
                grad_guide=grad_guide,
                tokenizer=tokenizer,
                dataset=dataset,
                level='word')
            if adv_y != np.argmax(y_test[index]):
                successful_perturbations += 1
                print('{}. Successful example crafted.'.format(index))
            else:
                failed_perturbations += 1
                print('{}. Failure.'.format(index))
            text = adv_doc
        sub_rate_list.append(sub_rate)
        NE_rate_list.append(NE_rate)
        file_2.write(str(index) + str(change_tuple_list) + '\n')
        file_1.write(text + " sub_rate: " + str(sub_rate) + "; NE_rate: " + str(NE_rate) + "\n")
    end_cpu = time.process_time()
    print('CPU second:', end_cpu - start_cpu)
    mean_sub_rate = sum(sub_rate_list) / len(sub_rate_list)
    mean_NE_rate = sum(NE_rate_list) / len(NE_rate_list)
    print('mean substitution rate:', mean_sub_rate)
    print('mean NE rate:', mean_NE_rate)
    file_1.close()
    file_2.close()
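# Hypothetical driver for the PyTorch variant. The checkpoint path and the way
# the model was saved are assumptions for illustration; the repository's
# training script may store its weights differently.
if __name__ == '__main__':
    import torch

    net = torch.load('./runs/imdb/word_cnn.pt', map_location='cpu')  # assumed path
    net.eval()  # inference mode: freeze dropout/batch-norm behaviour
    fool_text_classifier_pytorch(net, dataset='imdb')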
# Choose some female tweets
tweets, = np.where(y_test[:test_samples_cap] == 0)

print('Crafting adversarial examples...')
successful_perturbations = 0
failed_perturbations = 0
adversarial_text_data = []
adversarial_preds = np.array(preds)
for index, doc in enumerate(docs_test[:test_samples_cap]):
    correctly_classified = False
    if y_test[index] == preds[index]:
        adv_doc, (y, adv_y) = adversarial_paraphrase(
            doc, grad_guide,
            target=1 - y_test[index],
            use_typos=use_typos,
            use_synonyms=use_synonyms,
            mode=mode)
        correctly_classified = True
    if correctly_classified:
        pred = np.round(adv_y)
        if pred != preds[index]:
            successful_perturbations += 1
            print('{}. Successful example crafted.'.format(index))
        else:
            failed_perturbations += 1
            print('{}. Failure.'.format(index))
        adversarial_preds[index] = pred
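# Illustrative summary of the two counters the loop maintains (an addition for
# clarity, not code from this file); the guard covers the case where no sample
# was correctly classified and the attack never ran.
attempted = successful_perturbations + failed_perturbations
if attempted > 0:
    print('Attack success rate: {:.2%}'.format(successful_perturbations / attempted))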
print('Model accuracy on test:', accuracy)

# Choose some female tweets
female_indices, = np.where(y_test[:test_samples_cap] == 0)

print('Crafting adversarial examples...')
successful_perturbations = 0
failed_perturbations = 0
adversarial_text_data = []
adversarial_preds = np.array(preds)
for index, doc in enumerate(docs_test[:test_samples_cap]):
    if y_test[index] == 0 and preds[index] == 0:
        # If model prediction is correct, and the true class is female,
        # craft adversarial text
        adv_doc, (y, adv_y) = adversarial_paraphrase(doc, grad_guide, target=1)
        pred = np.round(adv_y)
        if pred != preds[index]:
            successful_perturbations += 1
            print('{}. Successful example crafted.'.format(index))
        else:
            failed_perturbations += 1
            print('{}. Failure.'.format(index))
        adversarial_preds[index] = pred
        adversarial_text_data.append({
            'index': index,
            'doc': clean(doc),
            'adv': clean(adv_doc),
            'success': pred != preds[index],
        })
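# A hedged persistence sketch for the records collected above; the output path
# is illustrative, not one the repository necessarily uses. `default=str`
# covers numpy scalars (e.g. the np.bool_ stored under 'success'), which the
# stdlib JSON encoder otherwise rejects.
import json

with open('./fool_result/adversarial_tweets.json', 'w') as fout:
    json.dump(adversarial_text_data, fout, indent=2, default=str)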
def fool_text_classifier():
    dataset = args.dataset
    print('dataset: {}; model: {}; level: {}.'.format(dataset, args.model, args.level))
    clean_samples_cap = args.clean_samples_cap  # 1000
    print('clean_samples_cap:', clean_samples_cap)

    # get tokenizer
    tokenizer = get_tokenizer(dataset)

    # Load and process data set
    data_helper = DataHelper(dataset, args.level)
    train_texts, train_labels, test_texts, test_labels = data_helper.load_data()
    x_train, y_train, x_test, y_test = data_helper.processing(need_shuffle=False)

    # Write clean examples into a txt file
    clean_texts_path = r'./fool_result/{}/clean_{}.txt'.format(dataset, str(clean_samples_cap))
    if not os.path.isfile(clean_texts_path):
        write_origin_input_texts(clean_texts_path, test_texts)

    # Select the model and load the trained weights
    assert args.model[:4] == args.level
    model = None
    if args.model == "word_cnn":
        model = word_cnn(dataset)
    elif args.model == "word_bdlstm":
        model = bd_lstm(dataset)
    elif args.model == "char_cnn":
        model = char_cnn(dataset)
    elif args.model == "word_lstm":
        model = lstm(dataset)
    model_filename = r'./runs/{}/{}.dat'.format(dataset, args.model)
    model.load_weights(model_filename)
    print('model path:', model_filename)

    # Evaluate classification accuracy of the model on clean samples
    scores_origin = model.evaluate(x_test[:clean_samples_cap], y_test[:clean_samples_cap])
    print('clean samples origin test_loss: %f, accuracy: %f' % (scores_origin[0], scores_origin[1]))
    all_scores_origin = model.evaluate(x_test, y_test)
    print('all origin test_loss: %f, accuracy: %f' % (all_scores_origin[0], all_scores_origin[1]))

    grad_guide = ForwardGradWrapper(model)
    classes_prediction = grad_guide.predict_classes(x_test[:clean_samples_cap])

    print('Crafting adversarial examples...')
    successful_perturbations = 0
    failed_perturbations = 0
    sub_rate_list = []
    NE_rate_list = []
    adv_text_filename = r'./fool_result/{}/{}/adv_{}.txt'.format(dataset, args.model, str(clean_samples_cap))
    change_tuple_filename = r'./fool_result/{}/{}/change_tuple_{}.txt'.format(dataset, args.model, str(clean_samples_cap))
    fool_result_path = os.path.split(adv_text_filename)[0]
    if not os.path.exists(fool_result_path):
        os.makedirs(fool_result_path)
    start_cpu = time.process_time()  # time.clock() was removed in Python 3.8
    with open(adv_text_filename, "a") as f1, open(change_tuple_filename, "a") as f2:
        for index, text in enumerate(test_texts[:clean_samples_cap]):
            sub_rate = 0
            NE_rate = 0
            change_tuple_list = []  # stays empty when the sample is already misclassified
            if np.argmax(y_test[index]) == classes_prediction[index]:
                # If the ground-truth label is the same as the predicted label,
                # craft an adversarial paraphrase
                adv_doc, adv_y, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
                    input_text=text,
                    true_y=np.argmax(y_test[index]),
                    grad_guide=grad_guide,
                    tokenizer=tokenizer,
                    dataset=dataset,
                    level=args.level)
                if adv_y != np.argmax(y_test[index]):
                    successful_perturbations += 1
                    print('{}. Successful example crafted.'.format(index))
                else:
                    failed_perturbations += 1
                    print('{}. Failure.'.format(index))
                text = adv_doc
            sub_rate_list.append(sub_rate)
            NE_rate_list.append(NE_rate)
            f2.write(str(index) + str(change_tuple_list) + '\n')
            f1.write(text + " sub_rate: " + str(sub_rate) + "; NE_rate: " + str(NE_rate) + "\n")
    end_cpu = time.process_time()
    print('CPU second:', end_cpu - start_cpu)
    mean_sub_rate = sum(sub_rate_list) / len(sub_rate_list)
    mean_NE_rate = sum(NE_rate_list) / len(NE_rate_list)
    print('mean substitution rate:', mean_sub_rate)
    print('mean NE rate:', mean_NE_rate)
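# Sketch of the DataHelper contract that the refactored fool_text_classifier()
# relies on, inferred from its two call sites above; this is an assumption
# about the interface, not the repository's actual class.
class _DataHelperSketch:
    def __init__(self, dataset, level):
        self.dataset = dataset  # 'imdb', 'agnews', or 'yahoo'
        self.level = level      # 'word' or 'char'

    def load_data(self):
        # Returns raw (train_texts, train_labels, test_texts, test_labels).
        raise NotImplementedError

    def processing(self, need_shuffle=False):
        # Returns encoded (x_train, y_train, x_test, y_test) arrays, keeping
        # test order aligned with test_texts when need_shuffle is False.
        raise NotImplementedError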
# Choose some female tweets
female_indices, = np.where(y_test[:test_samples_cap] == 0)

print('Crafting adversarial examples...')
successful_perturbations = 0
failed_perturbations = 0
adversarial_text_data = []
adversarial_preds = np.array(preds)
for index, doc in enumerate(docs_test[:test_samples_cap]):
    if y_test[index] == 0 and preds[index] == 0:
        # If model prediction is correct, and the true class is female,
        # craft adversarial text
        adv_doc, (y, adv_y) = adversarial_paraphrase(doc, grad_guide,
                                                     target=1,
                                                     use_typos=args.use_typos)
        pred = np.round(adv_y)
        if pred != preds[index]:
            successful_perturbations += 1
            print('{}. Successful example crafted.'.format(index))
        else:
            failed_perturbations += 1
            print('{}. Failure.'.format(index))
        adversarial_preds[index] = pred
        adversarial_text_data.append({
            'index': index,
            'doc': clean(doc),
            'adv': clean(adv_doc),
        })
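# Illustrative follow-up (an addition, not code from this file): once the loop
# finishes, adversarial_preds can be compared against the labels to see how far
# the perturbations pushed the classifier below its clean accuracy.
adv_accuracy = np.mean(adversarial_preds[:test_samples_cap] == y_test[:test_samples_cap])
print('Model accuracy on adversarial test:', adv_accuracy)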