Exemplo n.º 1
0
    def heuristic_fn(text, candidate):
        '''
        Return the difference between the classification probability of the original
        word and the candidate substitute synonym, which is defined in Eq.(4) and Eq.(5).

        ``level``, ``tokenizer``, ``dataset``, ``grad_guide`` and ``true_y`` are
        read from the enclosing scope.

        :param text: text the candidate substitution is applied to
        :param candidate: a single substitution; it is wrapped in a one-element
            list before being handed to ``_compile_perturbed_tokens``
        :return: ``P(true_y | text) - P(true_y | perturbed text)``
        '''
        doc = nlp(text)
        origin_vector = None
        perturbed_vector = None
        if level == 'word':
            origin_vector = text_to_vector(text, tokenizer, dataset)
            perturbed_tokens = _compile_perturbed_tokens(doc, [candidate])
            perturbed_doc = nlp(' '.join(perturbed_tokens))
            perturbed_vector = text_to_vector(perturbed_doc.text, tokenizer,
                                              dataset)
        elif level == 'char':
            max_len = config.char_max_len[dataset]
            origin_vector = doc_process(text, get_embedding_dict(),
                                        dataset).reshape(1, max_len)
            # Perturb the same parsed text the origin vector was built from,
            # exactly like the word branch does.  This used to re-parse the
            # enclosing function's input text, ignoring ``text`` altogether.
            perturbed_tokens = _compile_perturbed_tokens(doc, [candidate])
            perturbed_text = ' '.join(perturbed_tokens)
            perturbed_vector = doc_process(perturbed_text,
                                           get_embedding_dict(),
                                           dataset).reshape(1, max_len)
        # Any other ``level`` leaves both vectors as None and hands them on
        # to the model unchanged.

        origin_prob = grad_guide.predict_prob(input_vector=origin_vector)
        perturbed_prob = grad_guide.predict_prob(input_vector=perturbed_vector)

        return origin_prob[true_y] - perturbed_prob[true_y]
Exemplo n.º 2
0
def evaluate_word_saliency(doc, grad_guide, tokenizer, input_y, dataset,
                           level):
    '''
    Score every word by how much the probability of class ``input_y`` drops
    when the word's encoding is zeroed out (S(x, w_i), Eq.(6)).

    :param doc: parsed document; it is indexed by position and its tokens
        expose ``.text`` and ``.tag_``
    :param grad_guide: model wrapper; only ``predict_prob`` is called
    :param tokenizer: forwarded to ``text_to_vector`` (word level only)
    :param input_y: class index whose probability is tracked
    :param dataset: key into ``config.word_max_len`` / ``config.char_max_len``,
        also forwarded to the encoders
    :param level: 'word' or 'char'; any other value yields two empty lists
    :return position_word_list: list of ``(position, token)``
    :return word_saliency_list: list of ``(position, token, saliency, tag)``
    '''
    word_saliency_list = []

    # zero the code of the current word and calculate the amount of change in the classification probability
    if level == 'word':
        max_len = config.word_max_len[dataset]
        text = ' '.join(token.text for token in doc)
        origin_vector = text_to_vector(text, tokenizer, dataset)
        origin_prob = grad_guide.predict_prob(input_vector=origin_vector)
        # Positions at or beyond max_len are not scored.
        for position in range(min(len(doc), max_len)):
            # get x_i^(\hat)
            # NOTE(review): assumes token i is encoded at column i of the
            # vector (i.e. no left padding) -- confirm against text_to_vector.
            without_word_vector = copy.deepcopy(origin_vector)
            without_word_vector[0][position] = 0
            prob_without_word = grad_guide.predict_prob(
                input_vector=without_word_vector)

            # calculate S(x,w_i) defined in Eq.(6)
            word_saliency = origin_prob[input_y] - prob_without_word[input_y]
            token = doc[position]
            word_saliency_list.append(
                (position, token, word_saliency, token.tag_))

    elif level == 'char':
        max_len = config.char_max_len[dataset]
        embedding_dic = get_embedding_dict()
        origin_vector = doc_process(doc.text.lower(), embedding_dic,
                                    dataset).reshape(1, max_len)
        origin_prob = grad_guide.predict_prob(input_vector=origin_vector)

        word_position = 0
        without_word_vector = copy.deepcopy(origin_vector)
        # Only the first max_len characters are scanned.
        for i, c in enumerate(doc.text[:max_len]):
            # Compare by value: ``is not`` against a string literal tests
            # object identity and raises a SyntaxWarning on Python 3.8+.
            if c != ' ':
                # Still inside a word: blank out this character.
                without_word_vector[0][i] = 0
                continue

            # A space closes the current word: score it, then start the next
            # word from a fresh copy of the unmodified vector.
            prob_without_word = grad_guide.predict_prob(without_word_vector)
            word_saliency = origin_prob[input_y] - prob_without_word[input_y]
            token = doc[word_position]
            word_saliency_list.append(
                (word_position, token, word_saliency, token.tag_))
            word_position += 1
            without_word_vector = copy.deepcopy(origin_vector)
        # NOTE(review): a word is only scored when a space follows it, so the
        # final word of a text without trailing whitespace is never added.
        # Words are also counted by spaces while ``doc`` is indexed by token;
        # verify that the two stay aligned for the tokenizer in use.

    position_word_list = [(entry[0], entry[1]) for entry in word_saliency_list]

    return position_word_list, word_saliency_list
Exemplo n.º 3
0
    def origin_perturbed_vector_fn(text, substitute):
        '''
        Encode ``text`` and its perturbed variant as model inputs.

        ``level``, ``tokenizer`` and ``dataset`` are read from the enclosing
        scope.

        :param text: text the substitutions are applied to
        :param substitute: substitutions, forwarded unchanged to
            ``_compile_perturbed_tokens``
        :return: ``(origin_vector, perturbed_vector)``; both are None when
            ``level`` is neither 'word' nor 'char'
        '''
        doc = nlp(text)
        origin_vector = None
        perturbed_vector = None
        if level == 'word':
            origin_vector = text_to_vector(text, tokenizer, dataset)
            perturbed_tokens = _compile_perturbed_tokens(doc, substitute)
            perturbed_doc = nlp(' '.join(perturbed_tokens))
            perturbed_vector = text_to_vector(perturbed_doc.text, tokenizer,
                                              dataset)
        elif level == 'char':
            max_len = config.char_max_len[dataset]
            origin_vector = doc_process(text, get_embedding_dict(),
                                        dataset).reshape(1, max_len)
            # Perturb the same parsed text the origin vector was built from,
            # exactly like the word branch does.  This used to re-parse the
            # enclosing function's input text, ignoring ``text`` altogether.
            perturbed_tokens = _compile_perturbed_tokens(doc, substitute)
            perturbed_text = ' '.join(perturbed_tokens)
            perturbed_vector = doc_process(perturbed_text,
                                           get_embedding_dict(),
                                           dataset).reshape(1, max_len)

        return origin_vector, perturbed_vector
Exemplo n.º 4
0
 def halt_condition_fn(perturbed_text):
     '''
     Tell whether the model's predicted class for ``perturbed_text`` differs
     from ``true_y``, i.e. whether the search can stop.

     ``level``, ``tokenizer``, ``dataset``, ``grad_guide`` and ``true_y`` are
     read from the enclosing scope.

     :param perturbed_text: candidate adversarial text
     :return: True when the prediction is no longer ``true_y``, else False
     '''
     encoded = None
     if level == 'char':
         width = config.char_max_len[dataset]
         encoded = doc_process(perturbed_text, get_embedding_dict(),
                               dataset).reshape(1, width)
     elif level == 'word':
         encoded = text_to_vector(perturbed_text, tokenizer, dataset)
     # Any other level passes None on to the model.
     predicted = grad_guide.predict_classes(input_vector=encoded)
     return bool(predicted != true_y)
Exemplo n.º 5
0
    # Report which model checkpoint is being evaluated.
    print('model path:', model_path)

    # evaluate classification accuracy of model on clean samples
    # (first on the leading clean_samples_cap test samples, then on the
    # whole test set)
    scores_origin = model.evaluate(x_test[:clean_samples_cap],
                                   y_test[:clean_samples_cap])
    print('clean samples origin test_loss: %f, accuracy: %f' %
          (scores_origin[0], scores_origin[1]))
    all_scores_origin = model.evaluate(x_test, y_test)
    print('all origin test_loss: %f, accuracy: %f' %
          (all_scores_origin[0], all_scores_origin[1]))

    # evaluate classification accuracy of model on adversarial examples
    # The adversarial file is located by dataset, model name and sample cap.
    adv_text_path = r'./fool_result/{}/{}/adv_{}.txt'.format(
        dataset, args.model, str(clean_samples_cap))
    print('adversarial file:', adv_text_path)
    adv_text = read_adversarial_file(adv_text_path)

    # Encode the adversarial texts with the encoder matching args.level;
    # x_adv stays None for any other level.
    x_adv = None
    if args.level == 'word':
        x_adv = text_to_vector_for_all(adv_text, tokenizer, dataset)
    elif args.level == 'char':
        x_adv = doc_process_for_all(adv_text, get_embedding_dict(), dataset)
    # Scored against the labels of the same leading clean_samples_cap samples.
    score_adv = model.evaluate(x_adv[:clean_samples_cap],
                               y_test[:clean_samples_cap])
    print('adv test_loss: %f, accuracy: %f' % (score_adv[0], score_adv[1]))

    # Summary statistics computed from the adversarial file itself.
    mean_sub_rate = get_mean_sub_rate(adv_text_path)
    print('mean substitution rate:', mean_sub_rate)
    mean_NE_rate = get_mean_NE_rate(adv_text_path)
    print('mean NE rate:', mean_NE_rate)
Exemplo n.º 6
0
def adversarial_paraphrase(input_text,
                           true_y,
                           grad_guide,
                           tokenizer,
                           dataset,
                           dataset_dict,
                           word_candidate,
                           level,
                           verbose=True):
    '''
    Compute a perturbation, greedily choosing the synonym if it causes the most
    significant change in the classification probability after replacement.

    Word saliency is evaluated first, then the substitution search itself is
    delegated to ``BU_MHS`` together with the callbacks defined below.

    :param input_text: clean text to attack
    :param true_y: class index used to read probabilities and to detect a
        changed prediction
    :param grad_guide: model wrapper; ``predict_prob`` and ``predict_classes``
        are called on it
    :param tokenizer: forwarded to ``text_to_vector`` (word level only)
    :param dataset: dataset key forwarded to the encoders and used to index
        ``config.char_max_len``
    :param dataset_dict: forwarded unchanged to ``BU_MHS``
    :param word_candidate: forwarded unchanged to ``BU_MHS``
    :param level: 'word' or 'char'; selects the text encoder
    :param verbose: forwarded to ``BU_MHS``; when true the probability of
        ``true_y`` before and after the attack is also printed
    :return perturbed_text: generated adversarial examples
    :return perturbed_y: predicted class of perturbed_text
    :return sub_word: third value returned by ``BU_MHS``, passed through
    :return sub_rate: word replacement rate showed in Table 3
    :return NE_rate: fourth value returned by ``BU_MHS``, passed through
    :return change_tuple_list: list of substitute words
    '''
    def _vectorize(text):
        '''Encode ``text`` as a model input for the selected ``level``.'''
        if level == 'word':
            return text_to_vector(text, tokenizer, dataset)
        if level == 'char':
            max_len = config.char_max_len[dataset]
            return doc_process(text, get_embedding_dict(),
                               dataset).reshape(1, max_len)
        # Unsupported level: keep handing None on to the model, as before.
        return None

    def _perturbed_vector(text, substitute):
        '''Encode ``text`` after applying the ``substitute`` replacements.'''
        if level not in ('word', 'char'):
            return None
        # Always perturb the parse of ``text`` itself.  The char level used to
        # re-parse ``input_text`` here, so the origin and perturbed vectors
        # could be derived from two different texts.
        perturbed_text = ' '.join(
            _compile_perturbed_tokens(nlp(text), substitute))
        if level == 'word':
            # The word level has always round-tripped the text through nlp.
            perturbed_text = nlp(perturbed_text).text
        return _vectorize(perturbed_text)

    def halt_condition_fn(perturbed_text):
        '''
        Halt if model output is changed.
        '''
        adv_y = grad_guide.predict_classes(
            input_vector=_vectorize(perturbed_text))
        return bool(adv_y != true_y)

    def origin_perturbed_vector_fn(text, substitute):
        '''Return the encodings of ``text`` and of its perturbed variant.'''
        return _vectorize(text), _perturbed_vector(text, substitute)

    def delta_P_fn(origin_vector, perturbed_vector):
        '''Return the difference between the classification probability of
        the clean text and the perturbed text.
        '''
        origin_prob = grad_guide.predict_prob(input_vector=origin_vector)
        perturbed_prob = grad_guide.predict_prob(input_vector=perturbed_vector)
        return origin_prob[true_y] - perturbed_prob[true_y]

    def heuristic_fn(text, candidate):
        '''
        Return the difference between the classification probability of the original
        word and the candidate substitute synonym, which is defined in Eq.(4) and Eq.(5).
        '''
        # A single candidate is just a one-element substitution list.
        return delta_P_fn(*origin_perturbed_vector_fn(text, [candidate]))

    doc = nlp(input_text)

    # BU-MHS
    # Only the saliency list is consumed; the position/word pairs are unused.
    _, word_saliency_list = evaluate_word_saliency(
        doc, grad_guide, tokenizer, true_y, dataset, level)
    perturbed_text, sub_word, sub_rate, NE_rate, change_tuple_list = BU_MHS(
        doc,
        true_y,
        dataset,
        dataset_dict,
        word_candidate,
        word_saliency_list=word_saliency_list,
        heuristic_fn=heuristic_fn,
        halt_condition_fn=halt_condition_fn,
        origin_perturbed_vector_fn=origin_perturbed_vector_fn,
        delta_P_fn=delta_P_fn,
        verbose=verbose)

    perturbed_vector = _vectorize(perturbed_text)
    perturbed_y = grad_guide.predict_classes(input_vector=perturbed_vector)
    if verbose:
        # The clean text only needs encoding for this report.
        origin_prob = grad_guide.predict_prob(
            input_vector=_vectorize(input_text))
        perturbed_prob = grad_guide.predict_prob(input_vector=perturbed_vector)
        raw_score = origin_prob[true_y] - perturbed_prob[true_y]
        print('Prob before: ', origin_prob[true_y], '. Prob after: ',
              perturbed_prob[true_y], '. Prob shift: ', raw_score)
    return perturbed_text, perturbed_y, sub_word, sub_rate, NE_rate, change_tuple_list