Exemplo n.º 1
0
def get_embedding_matrix(dataset, num_words, embedding_dims):
    """Build the pretrained-embedding matrix for a dataset's vocabulary.

    As a side effect, stores the result in the module-level globals
    ``embedding_matrix`` and ``word_index``; the matrix is also returned.
    The matrix has ``num_words + 1`` rows (presumably index 0 is reserved
    for padding — confirm against the tokenizer setup); words without a
    vector in ``embeddings_index`` keep an all-zero row.
    """
    global embedding_matrix, word_index
    word_index = get_tokenizer(dataset).word_index
    print('Preparing embedding matrix.')
    embedding_matrix = np.zeros((num_words + 1, embedding_dims))
    for token, rank in word_index.items():
        # Skip words ranked beyond the vocabulary cap.
        if rank > num_words:
            continue
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[rank] = vector
    return embedding_matrix
Exemplo n.º 2
0
def fool_text_classifier():
    """Craft adversarial examples against a trained Keras text classifier.

    Reads dataset/model/level from the module-level ``args``, evaluates the
    model's clean accuracy, then runs ``adversarial_paraphrase`` on every
    correctly classified test sample (up to ``args.clean_samples_cap``),
    appending adversarial texts to ./fool_result/ and printing substitution
    statistics.
    """
    clean_samples_cap = args.clean_samples_cap  # e.g. 1000
    print('clean_samples_cap:', clean_samples_cap)

    # get tokenizer
    dataset = args.dataset
    tokenizer = get_tokenizer(dataset)

    # Read data set: pick the splitter for this dataset, then process at
    # the requested granularity (word or char).
    splitters = {
        'imdb': split_imdb_files,
        'agnews': split_agnews_files,
        'yahoo': split_yahoo_files,
        'yahoo_csv': split_yahoo_csv_files,
    }
    x_test = y_test = None
    test_texts = None
    if dataset in splitters:
        train_texts, train_labels, test_texts, test_labels = splitters[dataset]()
        if args.level == 'word':
            x_train, y_train, x_test, y_test = word_process(
                train_texts, train_labels, test_texts, test_labels, dataset)
        elif args.level == 'char':
            x_train, y_train, x_test, y_test = char_process(
                train_texts, train_labels, test_texts, test_labels, dataset)

    # Write clean examples into a txt file (only on the first run).
    clean_texts_path = r'./fool_result/{}/clean_{}.txt'.format(dataset, str(clean_samples_cap))
    if not os.path.isfile(clean_texts_path):
        write_origin_input_texts(clean_texts_path, test_texts)

    # Select the model and load the trained weights.
    assert args.model[:4] == args.level
    model_builders = {
        'word_cnn': word_cnn,
        'word_bdlstm': bd_lstm,
        'char_cnn': char_cnn,
        'word_lstm': lstm,
    }
    model = model_builders[args.model](dataset) if args.model in model_builders else None
    model_path = r'./runs/{}/{}.dat'.format(dataset, args.model)
    model.load_weights(model_path)
    print('model path:', model_path)

    # Evaluate classification accuracy of the model on clean samples.
    scores_origin = model.evaluate(x_test[:clean_samples_cap], y_test[:clean_samples_cap])
    print('clean samples origin test_loss: %f, accuracy: %f' % (scores_origin[0], scores_origin[1]))
    all_scores_origin = model.evaluate(x_test, y_test)
    print('all origin test_loss: %f, accuracy: %f' % (all_scores_origin[0], all_scores_origin[1]))

    grad_guide = ForwardGradWrapper(model)
    classes_prediction = grad_guide.predict_classes(x_test[: clean_samples_cap])

    # Load HowNet candidates.
    with open(os.path.join('hownet_candidates', dataset, 'dataset_50000.pkl'), 'rb') as f:
        dataset_dict = pickle.load(f)
    with open(os.path.join('hownet_candidates', dataset, 'word_candidates_sense_0515.pkl'), 'rb') as fp:
        word_candidate = pickle.load(fp)

    print('Crafting adversarial examples...')
    total_perturbated_text = 0
    successful_perturbations = 0
    failed_perturbations = 0
    sub_word_list = []
    sub_rate_list = []
    NE_rate_list = []

    # time.clock() was removed in Python 3.8; process_time() gives the same
    # CPU-time semantics (matches the 'CPU second' print below).
    start_cpu = time.process_time()
    adv_text_path = r'./fool_result/{}/{}/adv_{}.txt'.format(dataset, args.model, str(clean_samples_cap))
    change_tuple_path = r'./fool_result/{}/{}/change_tuple_{}.txt'.format(dataset, args.model, str(clean_samples_cap))
    # Context managers guarantee the output files are closed even if the
    # attack raises part-way through.
    with open(adv_text_path, "a") as file_1, open(change_tuple_path, "a") as file_2:
        for index, text in enumerate(test_texts[: clean_samples_cap]):
            if np.argmax(y_test[index]) == classes_prediction[index]:
                # Only attack samples the model already classifies correctly.
                total_perturbated_text += 1
                adv_doc, adv_y, sub_word, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
                    input_text=text,
                    true_y=np.argmax(y_test[index]),
                    grad_guide=grad_guide,
                    tokenizer=tokenizer,
                    dataset=dataset,
                    dataset_dict=dataset_dict,
                    word_candidate=word_candidate,
                    level=args.level)
                if adv_y != np.argmax(y_test[index]):
                    successful_perturbations += 1
                    print('{}. Successful example crafted.'.format(index))
                else:
                    failed_perturbations += 1
                    print('{}. Failure.'.format(index))

                text = adv_doc
                sub_word_list.append(sub_word)
                sub_rate_list.append(sub_rate)
                NE_rate_list.append(NE_rate)
                file_2.write(str(index) + str(change_tuple_list) + '\n')
            file_1.write(text + "\n")
    end_cpu = time.process_time()
    print('CPU second:', end_cpu - start_cpu)
    # Guard against division by zero when no sample was attacked (e.g. the
    # model misclassified every capped sample).
    if sub_word_list:
        print('mean substitution word:', sum(sub_word_list) / len(sub_word_list))
    if total_perturbated_text:
        print('mean fooling rate:', successful_perturbations / total_perturbated_text)
    if sub_rate_list:
        print('mean substitution rate:', sum(sub_rate_list) / len(sub_rate_list))
    if NE_rate_list:
        print('mean NE rate:', sum(NE_rate_list) / len(NE_rate_list))
Exemplo n.º 3
0
def fool_text_classifier_pytorch(model, dataset='imdb'):
    """Craft adversarial examples against a trained PyTorch text classifier.

    Attacks up to 100 correctly classified test samples of *dataset* with
    ``adversarial_paraphrase`` (word level only), appending the adversarial
    texts and per-sample substitution/NE rates to ./fool_result/.

    :param model: trained PyTorch classifier wrapped by ForwardGradWrapper_pytorch
    :param dataset: one of 'imdb', 'agnews', 'yahoo'
    """
    clean_samples_cap = 100
    print('clean_samples_cap:', clean_samples_cap)

    # get tokenizer (``opt`` is a module-level config object)
    tokenizer = get_tokenizer(opt)

    # Read data set (note: the imdb splitter also returns a dev split here).
    x_test = y_test = None
    test_texts = None
    if dataset == 'imdb':
        train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels = split_imdb_files(
            opt)
        x_train, y_train, x_test, y_test = word_process(
            train_texts, train_labels, test_texts, test_labels, dataset)

    elif dataset == 'agnews':
        train_texts, train_labels, test_texts, test_labels = split_agnews_files(
        )
        x_train, y_train, x_test, y_test = word_process(
            train_texts, train_labels, test_texts, test_labels, dataset)

    elif dataset == 'yahoo':
        train_texts, train_labels, test_texts, test_labels = split_yahoo_files(
        )
        x_train, y_train, x_test, y_test = word_process(
            train_texts, train_labels, test_texts, test_labels, dataset)

    grad_guide = ForwardGradWrapper_pytorch(model)
    classes_prediction = grad_guide.predict_classes(x_test[:clean_samples_cap])

    print('Crafting adversarial examples...')
    successful_perturbations = 0
    failed_perturbations = 0
    sub_rate_list = []
    NE_rate_list = []

    # time.clock() was removed in Python 3.8; process_time() gives the same
    # CPU-time semantics (matches the 'CPU second' print below).
    start_cpu = time.process_time()
    adv_text_path = r'./fool_result/{}/adv_{}.txt'.format(
        dataset, str(clean_samples_cap))
    change_tuple_path = r'./fool_result/{}/change_tuple_{}.txt'.format(
        dataset, str(clean_samples_cap))
    # Context managers guarantee the output files are closed even if the
    # attack raises part-way through.
    with open(adv_text_path, "a") as file_1, open(change_tuple_path, "a") as file_2:
        for index, text in enumerate(test_texts[:clean_samples_cap]):
            sub_rate = 0
            NE_rate = 0
            if np.argmax(y_test[index]) == classes_prediction[index]:
                # Only attack samples the model already classifies correctly.
                adv_doc, adv_y, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
                    input_text=text,
                    true_y=np.argmax(y_test[index]),
                    grad_guide=grad_guide,
                    tokenizer=tokenizer,
                    dataset=dataset,
                    level='word')
                if adv_y != np.argmax(y_test[index]):
                    successful_perturbations += 1
                    print('{}. Successful example crafted.'.format(index))
                else:
                    failed_perturbations += 1
                    print('{}. Failure.'.format(index))

                text = adv_doc
                sub_rate_list.append(sub_rate)
                NE_rate_list.append(NE_rate)
                file_2.write(str(index) + str(change_tuple_list) + '\n')
            file_1.write(text + " sub_rate: " + str(sub_rate) + "; NE_rate: " +
                         str(NE_rate) + "\n")
    end_cpu = time.process_time()
    print('CPU second:', end_cpu - start_cpu)
    # Guard against division by zero when no sample was attacked.
    if sub_rate_list:
        print('mean substitution rate:', sum(sub_rate_list) / len(sub_rate_list))
    if NE_rate_list:
        print('mean NE rate:', sum(NE_rate_list) / len(NE_rate_list))
Exemplo n.º 4
0
        words = text.split(' ')
        NE_rate = float(words[-1].replace('\n', ''))
        all_NE_rate.append(NE_rate)
        NE_rate_list.append((index, NE_rate))
    mean_NE_rate = sum(all_NE_rate) / len(all_NE_rate)
    NE_rate_list.sort(key=lambda t: t[1], reverse=True)
    return mean_NE_rate


if __name__ == '__main__':
    args = parser.parse_args()
    clean_samples_cap = args.clean_samples_cap  # 1000

    # get tokenizer
    dataset = args.dataset
    tokenizer = get_tokenizer(dataset)

    # Read data set
    x_train = y_train = x_test = y_test = None
    test_texts = None
    first_get_dataset = False
    if dataset == 'imdb':
        train_texts, train_labels, test_texts, test_labels = split_imdb_files()
        if args.level == 'word':
            x_train, y_train, x_test, y_test = word_process(
                train_texts, train_labels, test_texts, test_labels, dataset)
        elif args.level == 'char':
            x_train, y_train, x_test, y_test = char_process(
                train_texts, train_labels, test_texts, test_labels, dataset)
    elif dataset == 'agnews':
        train_texts, train_labels, test_texts, test_labels = split_agnews_files(
Exemplo n.º 5
0
def fool_text_classifier():
    """Craft adversarial examples against a trained Keras text classifier.

    Variant that caches the clean texts and processed test arrays under a
    fixed Google-Drive path so repeated (e.g. Colab) runs reuse them. Reads
    dataset/model/level from the module-level ``args``, attacks every
    correctly classified cached sample with ``adversarial_paraphrase``, and
    appends results plus per-sample rates to ./fool_result/.
    """
    clean_samples_cap = args.clean_samples_cap  # e.g. 1000
    print('clean_samples_cap:', clean_samples_cap)

    # get tokenizer
    dataset = args.dataset
    tokenizer = get_tokenizer(dataset)

    # Read data set: pick the splitter for this dataset, then process at
    # the requested granularity (word or char).
    splitters = {
        'imdb': split_imdb_files,
        'agnews': split_agnews_files,
        'yahoo': split_yahoo_files,
    }
    x_test = y_test = None
    test_texts = None
    if dataset in splitters:
        train_texts, train_labels, test_texts, test_labels = splitters[dataset]()
        if args.level == 'word':
            x_train, y_train, x_test, y_test = word_process(
                train_texts, train_labels, test_texts, test_labels, dataset)
        elif args.level == 'char':
            x_train, y_train, x_test, y_test = char_process(
                train_texts, train_labels, test_texts, test_labels, dataset)

    # Write clean examples and processed arrays once, then always reload
    # them so every run attacks the identical sample set.
    clean_path = '/content/drive/My Drive/PWWS_Pretrained'
    if not os.path.exists(clean_path):
        # makedirs also creates missing parent directories (os.mkdir would
        # fail if any parent does not exist yet).
        os.makedirs(clean_path)
        write_origin_input_texts(os.path.join(clean_path, 'clean_1000.txt'),
                                 test_texts)
        np.save(os.path.join(clean_path, 'x_processed_1000.npy'), x_test)
        np.save(os.path.join(clean_path, 'y_1000.npy'), y_test)

    with open(os.path.join(clean_path, 'clean_1000.txt'), 'r') as f:
        test_texts = f.readlines()
    x_test = np.load(os.path.join(clean_path, 'x_processed_1000.npy'))
    y_test = np.load(os.path.join(clean_path, 'y_1000.npy'))

    # Select the model and load the trained weights.
    assert args.model[:4] == args.level
    model_builders = {
        'word_cnn': word_cnn,
        'word_bdlstm': bd_lstm,
        'char_cnn': char_cnn,
        'word_lstm': lstm,
    }
    model = model_builders[args.model](dataset) if args.model in model_builders else None
    model_path = r'runs/{}/{}.dat'.format(dataset, args.model)
    model.load_weights(model_path)
    print('model path:', model_path)

    # Evaluate classification accuracy of the model on the cached samples.
    scores_origin = model.evaluate(x_test, y_test)
    print('clean samples origin test_loss: %f, accuracy: %f' %
          (scores_origin[0], scores_origin[1]))
    all_scores_origin = model.evaluate(x_test, y_test)
    print('all origin test_loss: %f, accuracy: %f' %
          (all_scores_origin[0], all_scores_origin[1]))

    grad_guide = ForwardGradWrapper(model)
    classes_prediction = grad_guide.predict_classes(x_test)

    print('Crafting adversarial examples...')
    successful_perturbations = 0
    failed_perturbations = 0
    sub_rate_list = []
    NE_rate_list = []

    # time.clock() was removed in Python 3.8; process_time() gives the same
    # CPU-time semantics (matches the 'CPU second' print below).
    start_cpu = time.process_time()
    adv_text_path = r'./fool_result/{}/{}/adv_{}.txt'.format(
        dataset, args.model, str(clean_samples_cap))
    change_tuple_path = r'./fool_result/{}/{}/change_tuple_{}.txt'.format(
        dataset, args.model, str(clean_samples_cap))
    # Context managers guarantee the output files are closed even if the
    # attack raises part-way through.
    with open(adv_text_path, "a") as file_1, open(change_tuple_path, "a") as file_2:
        for index, text in enumerate(test_texts[:clean_samples_cap]):
            sub_rate = 0
            NE_rate = 0
            if np.argmax(y_test[index]) == classes_prediction[index]:
                # Only attack samples the model already classifies correctly.
                adv_doc, adv_y, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
                    input_text=text,
                    true_y=np.argmax(y_test[index]),
                    grad_guide=grad_guide,
                    tokenizer=tokenizer,
                    dataset=dataset,
                    level=args.level)
                if adv_y != np.argmax(y_test[index]):
                    successful_perturbations += 1
                    print('{}. Successful example crafted.'.format(index))
                else:
                    failed_perturbations += 1
                    print('{}. Failure.'.format(index))

                text = adv_doc
                sub_rate_list.append(sub_rate)
                NE_rate_list.append(NE_rate)
                file_2.write(str(index) + str(change_tuple_list) + '\n')
            file_1.write(text + " sub_rate: " + str(sub_rate) + "; NE_rate: " +
                         str(NE_rate) + "\n")
    end_cpu = time.process_time()
    print('CPU second:', end_cpu - start_cpu)
    # Guard against division by zero when no sample was attacked.
    if sub_rate_list:
        print('mean substitution rate:', sum(sub_rate_list) / len(sub_rate_list))
    if NE_rate_list:
        print('mean NE rate:', sum(NE_rate_list) / len(NE_rate_list))
    print('successful perturbations:', successful_perturbations)
    print('failed perturbations:', failed_perturbations)
    attempts = successful_perturbations + failed_perturbations
    if attempts:
        print('success rate:', successful_perturbations / attempts)