def get_embedding_matrix(dataset, num_words, embedding_dims):
    """Build and return an embedding matrix for the dataset's vocabulary.

    Row ``i`` of the matrix is the pretrained vector (looked up in the
    module-level ``embeddings_index``) for the word whose tokenizer index
    is ``i``; words without a pretrained vector keep all-zero rows.
    Side effect: rebinds the module-level ``embedding_matrix`` and
    ``word_index`` globals, which other parts of the file rely on.

    Args:
        dataset: dataset identifier forwarded to ``get_tokenizer``.
        num_words: highest word index kept; matrix has ``num_words + 1`` rows
            (row 0 is the padding index and stays zero).
        embedding_dims: dimensionality of each embedding vector.

    Returns:
        The ``(num_words + 1, embedding_dims)`` numpy array.
    """
    global embedding_matrix, word_index
    word_index = get_tokenizer(dataset).word_index
    print('Preparing embedding matrix.')
    embedding_matrix = np.zeros((num_words + 1, embedding_dims))
    for token, idx in word_index.items():
        if idx <= num_words:
            vector = embeddings_index.get(token)
            if vector is not None:
                embedding_matrix[idx] = vector
    return embedding_matrix
def fool_text_classifier():
    """Craft adversarial text examples against the Keras classifier chosen by ``args``.

    Loads the dataset named by ``args.dataset``, restores the trained model
    ``args.model``, then runs ``adversarial_paraphrase`` (with HowNet
    substitution candidates) on every correctly-classified clean test sample
    up to ``args.clean_samples_cap``. Adversarial texts and change tuples are
    appended to ``./fool_result/...`` files, and summary statistics are printed.

    Fixes vs. the previous version: ``time.clock()`` (removed in Python 3.8)
    replaced with ``time.process_time()``; output files managed with ``with``
    so they are closed even if the attack raises; ZeroDivisionError guarded
    when no clean sample is classified correctly.
    """
    clean_samples_cap = args.clean_samples_cap  # 1000
    print('clean_samples_cap:', clean_samples_cap)

    # get tokenizer
    dataset = args.dataset
    tokenizer = get_tokenizer(dataset)

    # Read data set. Every split_* helper returns
    # (train_texts, train_labels, test_texts, test_labels).
    x_test = y_test = None
    test_texts = None
    splitters = {
        'imdb': split_imdb_files,
        'agnews': split_agnews_files,
        'yahoo': split_yahoo_files,
        'yahoo_csv': split_yahoo_csv_files,
    }
    if dataset in splitters:
        train_texts, train_labels, test_texts, test_labels = splitters[dataset]()
        processor = {'word': word_process, 'char': char_process}.get(args.level)
        if processor is not None:
            x_train, y_train, x_test, y_test = processor(
                train_texts, train_labels, test_texts, test_labels, dataset)

    # Write clean examples into a txt file (only on the first run)
    clean_texts_path = r'./fool_result/{}/clean_{}.txt'.format(dataset, str(clean_samples_cap))
    if not os.path.isfile(clean_texts_path):
        write_origin_input_texts(clean_texts_path, test_texts)

    # Select the model and load the trained weights
    assert args.model[:4] == args.level
    model_builders = {
        "word_cnn": word_cnn,
        "word_bdlstm": bd_lstm,
        "char_cnn": char_cnn,
        "word_lstm": lstm,
    }
    model = None
    if args.model in model_builders:
        model = model_builders[args.model](dataset)
    model_path = r'./runs/{}/{}.dat'.format(dataset, args.model)
    model.load_weights(model_path)
    print('model path:', model_path)

    # evaluate classification accuracy of model on clean samples
    scores_origin = model.evaluate(x_test[:clean_samples_cap], y_test[:clean_samples_cap])
    print('clean samples origin test_loss: %f, accuracy: %f' % (scores_origin[0], scores_origin[1]))
    all_scores_origin = model.evaluate(x_test, y_test)
    print('all origin test_loss: %f, accuracy: %f' % (all_scores_origin[0], all_scores_origin[1]))

    grad_guide = ForwardGradWrapper(model)
    classes_prediction = grad_guide.predict_classes(x_test[: clean_samples_cap])

    # load HowNet candidates
    with open(os.path.join('hownet_candidates', dataset, 'dataset_50000.pkl'), 'rb') as f:
        dataset_dict = pickle.load(f)
    with open(os.path.join('hownet_candidates', dataset, 'word_candidates_sense_0515.pkl'), 'rb') as fp:
        word_candidate = pickle.load(fp)

    print('Crafting adversarial examples...')
    total_perturbated_text = 0
    successful_perturbations = 0
    failed_perturbations = 0
    sub_word_list = []
    sub_rate_list = []
    NE_rate_list = []
    # time.clock() was removed in Python 3.8; process_time() keeps CPU-time semantics.
    start_cpu = time.process_time()
    adv_text_path = r'./fool_result/{}/{}/adv_{}.txt'.format(dataset, args.model, str(clean_samples_cap))
    change_tuple_path = r'./fool_result/{}/{}/change_tuple_{}.txt'.format(dataset, args.model, str(clean_samples_cap))
    # Context managers guarantee the result files are closed even if
    # adversarial_paraphrase raises mid-loop (they previously leaked on error).
    with open(adv_text_path, "a") as file_1, open(change_tuple_path, "a") as file_2:
        for index, text in enumerate(test_texts[: clean_samples_cap]):
            true_y = np.argmax(y_test[index])
            # Only attack samples the model already classifies correctly.
            if true_y == classes_prediction[index]:
                total_perturbated_text += 1
                adv_doc, adv_y, sub_word, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
                    input_text=text,
                    true_y=true_y,
                    grad_guide=grad_guide,
                    tokenizer=tokenizer,
                    dataset=dataset,
                    dataset_dict=dataset_dict,
                    word_candidate=word_candidate,
                    level=args.level)
                if adv_y != true_y:
                    successful_perturbations += 1
                    print('{}. Successful example crafted.'.format(index))
                else:
                    failed_perturbations += 1
                    print('{}. Failure.'.format(index))
                text = adv_doc
                sub_word_list.append(sub_word)
                sub_rate_list.append(sub_rate)
                NE_rate_list.append(NE_rate)
                file_2.write(str(index) + str(change_tuple_list) + '\n')
            # The (possibly perturbed) text is written for every sample.
            file_1.write(text + "\n")
    end_cpu = time.process_time()
    print('CPU second:', end_cpu - start_cpu)

    # Guard against an empty attack set (every clean sample misclassified),
    # which previously raised ZeroDivisionError.
    if total_perturbated_text:
        mean_sub_word = sum(sub_word_list) / len(sub_word_list)
        mean_fool_rate = successful_perturbations / total_perturbated_text
        mean_sub_rate = sum(sub_rate_list) / len(sub_rate_list)
        mean_NE_rate = sum(NE_rate_list) / len(NE_rate_list)
        print('mean substitution word:', mean_sub_word)
        print('mean fooling rate:', mean_fool_rate)
        print('mean substitution rate:', mean_sub_rate)
        print('mean NE rate:', mean_NE_rate)
    else:
        print('no correctly-classified clean samples; nothing was perturbed')
def fool_text_classifier_pytorch(model, dataset='imdb'):
    """Craft adversarial text examples against a PyTorch classifier.

    Runs ``adversarial_paraphrase`` (word level) on the first
    ``clean_samples_cap`` test samples that ``model`` classifies correctly,
    appending results to ``./fool_result/<dataset>/...`` and printing
    summary statistics.

    Args:
        model: trained PyTorch classifier, wrapped by ForwardGradWrapper_pytorch.
        dataset: one of 'imdb', 'agnews', 'yahoo'.

    Fixes vs. the previous version: ``time.clock()`` (removed in Python 3.8)
    replaced with ``time.process_time()``; output files managed with ``with``;
    mean computations guarded against empty lists.
    """
    clean_samples_cap = 100
    print('clean_samples_cap:', clean_samples_cap)

    # get tokenizer (``opt`` is a module-level options object)
    tokenizer = get_tokenizer(opt)

    # Read data set
    x_test = y_test = None
    test_texts = None
    if dataset == 'imdb':
        # the imdb split also returns a dev set, unused here
        train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels = split_imdb_files(
            opt)
        x_train, y_train, x_test, y_test = word_process(
            train_texts, train_labels, test_texts, test_labels, dataset)
    elif dataset == 'agnews':
        train_texts, train_labels, test_texts, test_labels = split_agnews_files(
        )
        x_train, y_train, x_test, y_test = word_process(
            train_texts, train_labels, test_texts, test_labels, dataset)
    elif dataset == 'yahoo':
        train_texts, train_labels, test_texts, test_labels = split_yahoo_files(
        )
        x_train, y_train, x_test, y_test = word_process(
            train_texts, train_labels, test_texts, test_labels, dataset)

    grad_guide = ForwardGradWrapper_pytorch(model)
    classes_prediction = grad_guide.predict_classes(x_test[:clean_samples_cap])

    print('Crafting adversarial examples...')
    successful_perturbations = 0
    failed_perturbations = 0
    sub_rate_list = []
    NE_rate_list = []
    # time.clock() was removed in Python 3.8; process_time() keeps CPU-time semantics.
    start_cpu = time.process_time()
    adv_text_path = r'./fool_result/{}/adv_{}.txt'.format(
        dataset, str(clean_samples_cap))
    change_tuple_path = r'./fool_result/{}/change_tuple_{}.txt'.format(
        dataset, str(clean_samples_cap))
    # ``with`` closes the result files even if the attack raises mid-loop.
    with open(adv_text_path, "a") as file_1, open(change_tuple_path, "a") as file_2:
        for index, text in enumerate(test_texts[:clean_samples_cap]):
            # defaults used in file_1's log line when the sample is skipped
            sub_rate = 0
            NE_rate = 0
            true_y = np.argmax(y_test[index])
            # Only attack samples the model already classifies correctly.
            if true_y == classes_prediction[index]:
                adv_doc, adv_y, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
                    input_text=text,
                    true_y=true_y,
                    grad_guide=grad_guide,
                    tokenizer=tokenizer,
                    dataset=dataset,
                    level='word')
                if adv_y != true_y:
                    successful_perturbations += 1
                    print('{}. Successful example crafted.'.format(index))
                else:
                    failed_perturbations += 1
                    print('{}. Failure.'.format(index))
                text = adv_doc
                sub_rate_list.append(sub_rate)
                NE_rate_list.append(NE_rate)
                file_2.write(str(index) + str(change_tuple_list) + '\n')
            file_1.write(text + " sub_rate: " + str(sub_rate) +
                         "; NE_rate: " + str(NE_rate) + "\n")
    end_cpu = time.process_time()
    print('CPU second:', end_cpu - start_cpu)

    # Guard against empty stats lists (previously a ZeroDivisionError).
    if sub_rate_list:
        mean_sub_rate = sum(sub_rate_list) / len(sub_rate_list)
        mean_NE_rate = sum(NE_rate_list) / len(NE_rate_list)
        print('mean substitution rate:', mean_sub_rate)
        print('mean NE rate:', mean_NE_rate)
    else:
        print('no correctly-classified clean samples; nothing was perturbed')
words = text.split(' ') NE_rate = float(words[-1].replace('\n', '')) all_NE_rate.append(NE_rate) NE_rate_list.append((index, NE_rate)) mean_NE_rate = sum(all_NE_rate) / len(all_NE_rate) NE_rate_list.sort(key=lambda t: t[1], reverse=True) return mean_NE_rate if __name__ == '__main__': args = parser.parse_args() clean_samples_cap = args.clean_samples_cap # 1000 # get tokenizer dataset = args.dataset tokenizer = get_tokenizer(dataset) # Read data set x_train = y_train = x_test = y_test = None test_texts = None first_get_dataset = False if dataset == 'imdb': train_texts, train_labels, test_texts, test_labels = split_imdb_files() if args.level == 'word': x_train, y_train, x_test, y_test = word_process( train_texts, train_labels, test_texts, test_labels, dataset) elif args.level == 'char': x_train, y_train, x_test, y_test = char_process( train_texts, train_labels, test_texts, test_labels, dataset) elif dataset == 'agnews': train_texts, train_labels, test_texts, test_labels = split_agnews_files(
def fool_text_classifier():
    """Craft adversarial examples against the Keras classifier chosen by ``args``.

    This (Colab-oriented) variant caches the capped clean test set under a
    hard-coded Google Drive path, reloads it, attacks every
    correctly-classified sample with ``adversarial_paraphrase``, appends the
    results under ``./fool_result/...`` and prints summary statistics.

    Fixes vs. the previous version: ``time.clock()`` (removed in Python 3.8)
    replaced with ``time.process_time()``; output files managed with ``with``;
    the duplicate ``model.evaluate(x_test, y_test)`` call removed; divisions
    guarded against empty stats.
    """
    clean_samples_cap = args.clean_samples_cap  # 1000
    print('clean_samples_cap:', clean_samples_cap)

    # get tokenizer
    dataset = args.dataset
    tokenizer = get_tokenizer(dataset)

    # Read data set. Every split_* helper returns
    # (train_texts, train_labels, test_texts, test_labels).
    x_test = y_test = None
    test_texts = None
    splitters = {
        'imdb': split_imdb_files,
        'agnews': split_agnews_files,
        'yahoo': split_yahoo_files,
    }
    if dataset in splitters:
        train_texts, train_labels, test_texts, test_labels = splitters[dataset]()
        processor = {'word': word_process, 'char': char_process}.get(args.level)
        if processor is not None:
            x_train, y_train, x_test, y_test = processor(
                train_texts, train_labels, test_texts, test_labels, dataset)

    # Write clean examples into a txt file.
    # NOTE(review): hard-coded Colab/Drive path — parameterize before reuse.
    clean_path = '/content/drive/My Drive/PWWS_Pretrained'
    if not os.path.exists(clean_path):
        os.mkdir(clean_path)
    write_origin_input_texts(os.path.join(clean_path, 'clean_1000.txt'), test_texts)
    np.save(os.path.join(clean_path, 'x_processed_1000.npy'), x_test)
    np.save(os.path.join(clean_path, 'y_1000.npy'), y_test)

    # Reload the cached clean set so every run attacks exactly the same samples.
    with open(os.path.join(clean_path, 'clean_1000.txt'), 'r') as f:
        test_texts = f.readlines()
    x_test = np.load(os.path.join(clean_path, 'x_processed_1000.npy'))
    y_test = np.load(os.path.join(clean_path, 'y_1000.npy'))

    # Select the model and load the trained weights
    assert args.model[:4] == args.level
    model_builders = {
        "word_cnn": word_cnn,
        "word_bdlstm": bd_lstm,
        "char_cnn": char_cnn,
        "word_lstm": lstm,
    }
    model = None
    if args.model in model_builders:
        model = model_builders[args.model](dataset)
    model_path = r'runs/{}/{}.dat'.format(dataset, args.model)
    model.load_weights(model_path)
    print('model path:', model_path)

    # evaluate classification accuracy of model on clean samples.
    # Both original calls evaluated the identical (x_test, y_test) arrays,
    # so the result is computed once and printed twice.
    scores_origin = model.evaluate(x_test, y_test)
    print('clean samples origin test_loss: %f, accuracy: %f' % (scores_origin[0], scores_origin[1]))
    print('all origin test_loss: %f, accuracy: %f' % (scores_origin[0], scores_origin[1]))

    grad_guide = ForwardGradWrapper(model)
    classes_prediction = grad_guide.predict_classes(x_test)

    print('Crafting adversarial examples...')
    successful_perturbations = 0
    failed_perturbations = 0
    sub_rate_list = []
    NE_rate_list = []
    # time.clock() was removed in Python 3.8; process_time() keeps CPU-time semantics.
    start_cpu = time.process_time()
    adv_text_path = r'./fool_result/{}/{}/adv_{}.txt'.format(
        dataset, args.model, str(clean_samples_cap))
    change_tuple_path = r'./fool_result/{}/{}/change_tuple_{}.txt'.format(
        dataset, args.model, str(clean_samples_cap))
    # ``with`` closes the result files even if the attack raises mid-loop.
    with open(adv_text_path, "a") as file_1, open(change_tuple_path, "a") as file_2:
        for index, text in enumerate(test_texts[:clean_samples_cap]):
            # defaults used in file_1's log line when the sample is skipped
            sub_rate = 0
            NE_rate = 0
            true_y = np.argmax(y_test[index])
            # Only attack samples the model already classifies correctly.
            if true_y == classes_prediction[index]:
                adv_doc, adv_y, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
                    input_text=text,
                    true_y=true_y,
                    grad_guide=grad_guide,
                    tokenizer=tokenizer,
                    dataset=dataset,
                    level=args.level)
                if adv_y != true_y:
                    successful_perturbations += 1
                    print('{}. Successful example crafted.'.format(index))
                else:
                    failed_perturbations += 1
                    print('{}. Failure.'.format(index))
                text = adv_doc
                sub_rate_list.append(sub_rate)
                NE_rate_list.append(NE_rate)
                file_2.write(str(index) + str(change_tuple_list) + '\n')
            file_1.write(text + " sub_rate: " + str(sub_rate) +
                         "; NE_rate: " + str(NE_rate) + "\n")
    end_cpu = time.process_time()
    print('CPU second:', end_cpu - start_cpu)

    # Guard the summary statistics against an empty attack set
    # (previously a ZeroDivisionError).
    attacked = successful_perturbations + failed_perturbations
    if sub_rate_list:
        mean_sub_rate = sum(sub_rate_list) / len(sub_rate_list)
        mean_NE_rate = sum(NE_rate_list) / len(NE_rate_list)
        print('mean substitution rate:', mean_sub_rate)
        print('mean NE rate:', mean_NE_rate)
    print('successful perturbations:', successful_perturbations)
    print('failed perturbations:', failed_perturbations)
    if attacked:
        print('success rate:', successful_perturbations / attacked)
    else:
        print('no correctly-classified clean samples; nothing was perturbed')