def test_attack(sess,model,data,label):
    """Craft untargeted Carlini-Wagner L2 adversarial examples for every
    sample in `data`, then display, save, and report statistics for each."""
    sifts = []  # getSIFT(data)

    attacker = CarliniL2(sess, model, len(data[0][1]), NN.img_channels,
                         NN.nb_classes, sifts, batch_size=len(data),
                         max_iterations=1000, confidence=0, targeted=False)

    t_begin = time.time()
    adv = attacker.attack(data, getTargets(label))
    print(adv.shape)
    t_end = time.time()

    print("Took",t_end-t_begin,"seconds to run",len(data),"samples.")

    for i, perturbed in enumerate(adv):
        print("Valid:")
        show(data[i])
        print("Adversarial:")
        show(perturbed)

        # keep information for the original image
        (newClass,newConfident) = NN.predictWithImage(model,perturbed+0.5)
        newClassStr = dataBasics.LABELS(int(newClass))
        path0="%s/%s_converted_into_%s_with_confidence_%s.png"%(directory_pic_string,startIndexOfImage+i,newClassStr,newConfident)
        dataBasics.save(-1,np.squeeze(perturbed), path0)

        print("Classification:", model.predict(adv[i:i+1]+0.5))
        print("Total distortion:", np.sum((perturbed-data[i])**2)**.5)
        print("L1 distance:", l1Distance(data[i],perturbed))
 def __init__(self, sess, classifier, ord, confidence=None, **kwargs):
     """Select the attack implementation that matches the requested norm.

     An infinite `ord` selects the L-infinity attack (CarliniLi); any
     finite value selects the L2 attack (CarliniL2), which additionally
     accepts the `confidence` margin.
     """
     if not np.isinf(ord):
         self.attacker = CarliniL2(sess,
                                   classifier,
                                   confidence=confidence,
                                   **kwargs)
     else:
         self.attacker = CarliniLi(sess, classifier, **kwargs)
예제 #3
0
def attack(data, name):
    """Run a targeted Carlini-Wagner L2 attack against the Keras model stored
    at models/<name> and save the adversarial batch to /tmp/<name>.npy."""
    sess = K.get_session()
    model = load_model("models/"+name, custom_objects={'fn': fn})

    is_mnist = "mnist" in name

    class Wrap:
        # Minimal adapter exposing the attributes CarliniL2 expects.
        image_size = 28 if is_mnist else 32
        num_labels = 10
        num_channels = 1 if is_mnist else 3

        def predict(self, x):
            return model(x)

    cw = CarliniL2(sess, Wrap(), batch_size=100,
                   max_iterations=10000, binary_search_steps=5,
                   initial_const=1, targeted=True)

    originals = data.test_data[:100]
    adv = cw.attack(originals, get_labs(data.test_labels[:100]))
    np.save("/tmp/"+name, adv)

    # Mean L2 distortion over the 100 attacked samples.
    print(np.mean(np.sum((adv-originals)**2,axis=(1,2,3))**.5))
예제 #4
0
    def attack(self):
        """Run the Carlini-Wagner L2 attack on each configured instance,
        sweeping the trade-off constants in self.CONST_LIST and recording the
        resulting predictions and normalized distortions."""
        cw = CarliniL2(self.model.sess,
                       self.model,
                       batch_size=1,
                       max_iterations=self.MAX_ITERATION,
                       confidence=0,
                       direction=self.args.direction)

        first = self.args.start
        last = first + self.args.instances
        for i in range(first, last):
            image = self.data.get_image([i])[0]
            org_prediction = self.model.predict_image(image)[0][0]

            # Sentinel entries: zero distortion / original prediction.
            predictions = [org_prediction]
            distortions = [0]
            for j in self.CONST_LIST:
                t0 = time.time()
                inputs = self.data.get_image([i])

                adv = cw.attack(inputs, [j])
                new_prediction = self.model.predict_image(adv[0])[0][0]
                x_diff = adv[0] - inputs[0]

                # Squared L2 distortion, normalized by the image value range.
                value_range = image.max_value - image.min_value
                distortion = np.sum(np.square(x_diff.raw_image / value_range))
                distortions.append(distortion)
                predictions.append(new_prediction)

                print('Sample %d' % i, "Const %f" % j,
                      "Time used %.2f" % (time.time() - t0),
                      new_prediction, distortion)

                # Persist variables only once: last instance, last constant.
                is_last_instance = i >= last - 1
                is_last_const = j == self.CONST_LIST[-1]
                if self.args.save_data and is_last_instance and is_last_const:
                    self.save_variables([
                        inputs[0], adv[0], predictions[0], new_prediction,
                        distortion
                    ])

            self.write_results(predictions)
            self.write_results(distortions)
def main(_):
    """Drive adversarial attacks against an image-captioning model.

    For each of FLAGS.exp_num images: caption the original image, build an
    attack target (another image's caption for targeted attacks, an explicit
    FLAGS.input_feed, or keywords drawn from the target caption), run the
    attack across a search over the constant C, and append the results to CSV
    records under FLAGS.result_directory (fail logs go to fail_log/).
    """

    tf.set_random_seed(FLAGS.seed)
    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    beam_size = FLAGS.beam_size
    record_path = FLAGS.result_directory

    # we should use os.path.join!
    if record_path[-1] != "/":
        record_path += "/"

    with open(FLAGS.caption_file) as data_file:
        caption_file = json.load(data_file)
    caption_info = caption_file['annotations']

    print("using " + FLAGS.norm + " for attack")
    print("targeted?", FLAGS.targeted)
    print("attack confidence kappa", FLAGS.confidence)
    # Build the CSV header; its shape depends on the attack mode.
    if FLAGS.use_keywords:
        keywords_num = FLAGS.keywords_num
        header = ("target filename","attack filename",\
          "L2 distortion","L_inf distortion","loss","loss1","loss2",\
          "optimal C","attack successful?","target_sentence")
        header += tuple(["keywords"] * keywords_num) + tuple(["human caption"])
        header += tuple([
            val for pair in zip([
                "caption before attack " + str(i + 1) for i in range(beam_size)
            ], [
                "prob of caption before attack " + str(i + 1)
                for i in range(beam_size)
            ]) for val in pair
        ])
        header += tuple([
            val for pair in zip([
                "caption after attack " + str(i + 1) for i in range(beam_size)
            ], [
                "prob of caption after attack " + str(i + 1)
                for i in range(beam_size)
            ]) for val in pair
        ])
        with open('wordPOS/noun.txt') as noun_file:
            noun = noun_file.read().split()
        with open('wordPOS/verb.txt') as verb_file:
            verb = verb_file.read().split()
        with open('wordPOS/adjective.txt') as adjective_file:
            adjective = adjective_file.read().split()
        with open('wordPOS/adverb.txt') as adverb_file:
            adverb = adverb_file.read().split()
        # good words are noun, verb, adj or adv. We do not want words like "a" or "the" to be our keywords.
        # Those .txt files are generated by classifying the vocabulary list.
        good_words = set(noun + verb + adjective + adverb)
    else:
        header = ("target filename","attack filename","L2 distortion","L_inf distortion","loss","loss1","loss2",\
          "optimal C","attack successful?")
        header += tuple([
            val for pair in zip(
                ["target caption " + str(i + 1) for i in range(beam_size)], [
                    "prob of target caption " + str(i + 1)
                    for i in range(beam_size)
                ]) for val in pair
        ])
        header += tuple(["human caption"])
        header += tuple([
            val for pair in zip([
                "caption before attack " + str(i + 1) for i in range(beam_size)
            ], [
                "prob of caption before attack " + str(i + 1)
                for i in range(beam_size)
            ]) for val in pair
        ])
        header += tuple([
            val for pair in zip([
                "caption after attack " + str(i + 1) for i in range(beam_size)
            ], [
                "prob of caption after attack " + str(i + 1)
                for i in range(beam_size)
            ]) for val in pair
        ])

    # Write the header row to both the result record and the fail log.
    os.system("mkdir -p {}".format(os.path.join(record_path, "fail_log")))
    record = open(
        os.path.join(record_path, "record_" + str(FLAGS.offset) + ".csv"),
        "a+")
    writer = csv.writer(record)
    writer.writerow(header)
    record.close()

    fail_log = open(
        os.path.join(record_path,
                     "fail_log/fail_record_" + str(FLAGS.offset) + ".csv"),
        "a+")
    fail_log_writer = csv.writer(fail_log)
    fail_log_writer.writerow(header)
    fail_log.close()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    config = tf.ConfigProto(gpu_options=gpu_options)
    vocab = vocabulary.Vocabulary(FLAGS.vocab_file)

    # Inference graph: used to caption original and adversarial images.
    inference_graph = tf.Graph()
    with inference_graph.as_default():
        inf_model = inference_wrapper.InferenceWrapper()
        inf_restore_fn = inf_model.build_graph_from_config(
            configuration.ModelConfig(), FLAGS.checkpoint_path)
        inf_image_placeholder = tf.placeholder(dtype=tf.string, shape=[])
        inf_preprocessor = inf_model.model.process_image(inf_image_placeholder)
    inference_graph.finalize()
    inf_sess = tf.Session(graph=inference_graph, config=config)
    # Load the model from checkpoint.
    inf_restore_fn(inf_sess)
    inf_generator = caption_generator.CaptionGenerator(inf_model,
                                                       vocab,
                                                       beam_size=beam_size)

    # Separate graph/session used only to caption the target image.
    if FLAGS.targeted or FLAGS.use_keywords:
        target_g = tf.Graph()
        with target_g.as_default():
            target_model = inference_wrapper.InferenceWrapper()
            target_restore_fn = target_model.build_graph_from_config(
                configuration.ModelConfig(), FLAGS.checkpoint_path)
            target_image_placeholder = tf.placeholder(dtype=tf.string,
                                                      shape=[])
            target_preprocessor = target_model.model.process_image(
                target_image_placeholder)
        target_g.finalize()
        target_sess = tf.Session(graph=target_g, config=config)
        target_restore_fn(target_sess)
        target_generator = caption_generator.CaptionGenerator(
            target_model, vocab, beam_size=beam_size)

    attack_graph = tf.Graph()
    with attack_graph.as_default():
        model = attack_wrapper.AttackWrapper()
        sess = tf.Session(config=config)
        # build the attacker graph
        print("target:", FLAGS.targeted)
        attack = CarliniL2(sess,
                           inf_sess,
                           attack_graph,
                           inference_graph,
                           model,
                           inf_model,
                           targeted=FLAGS.targeted,
                           use_keywords=FLAGS.use_keywords,
                           use_logits=FLAGS.use_logits,
                           batch_size=1,
                           initial_const=FLAGS.C,
                           max_iterations=FLAGS.iters,
                           print_every=1,
                           confidence=FLAGS.confidence,
                           use_log=False,
                           norm=FLAGS.norm,
                           abort_early=False,
                           learning_rate=0.005)
        # compute graph for preprocessing
        image_placeholder = tf.placeholder(dtype=tf.string, shape=[])
        preprocessor = model.model.process_image(image_placeholder)

    # get all the files in the directory
    image_directory = FLAGS.image_directory
    filenames = [file for file in os.listdir(image_directory)]
    filenames.sort()
    random.shuffle(filenames)

    for j in range(FLAGS.exp_num):

        # Attack images are drawn from the end of the shuffled list,
        # target images (for targeted attacks) from the front.
        attack_filename = filenames[len(filenames) - 1 - j - FLAGS.offset]
        attack_image_id = int(
            re.match(r"^.*\_(.*)\..*$", attack_filename).group(1))
        human_cap = next((item for item in caption_info
                          if item["image_id"] == attack_image_id))
        human_cap = human_cap['caption']

        print("attack filename:", attack_filename)
        print("human's caption:", human_cap)
        with tf.gfile.GFile(image_directory + attack_filename, "rb") as f:
            image = f.read()
        raw_image = sess.run(preprocessor,
                             feed_dict={image_placeholder: image})

        show(raw_image, record_path,
             "original_" + attack_filename.replace(".jpg", ".png"))
        raw_filename = record_path + "original_" + attack_filename.replace(
            ".jpg", ".png.npy")
        # raw_image = np.squeeze(np.load(raw_filename))
        raw_captions = inf_generator.beam_search(inf_sess, raw_image)
        print("Captions for original image %s:" %
              os.path.basename(raw_filename))
        raw_sentences = []
        raw_probs = []
        for indx, raw_caption in enumerate(raw_captions):
            raw_sentence = [
                vocab.id_to_word(w) for w in raw_caption.sentence[1:-1]
            ]
            raw_sentence = " ".join(raw_sentence)
            print("  %d) %s (p=%f)" %
                  (1, raw_sentence, math.exp(raw_caption.logprob)))
            raw_sentences = raw_sentences + [raw_sentence]
            raw_probs = raw_probs + [math.exp(raw_caption.logprob)]

        if FLAGS.targeted:
            # If it's targeted attack, we pick another image as our target image to generate target caption for us.
            target_filename = filenames[j + FLAGS.offset]
            print("Captions for target image %s:" %
                  os.path.basename(target_filename))
            with tf.gfile.GFile(image_directory + target_filename, "rb") as f:
                target_image = f.read()
                target_image = target_sess.run(
                    target_preprocessor,
                    {target_image_placeholder: target_image})
            target_captions = target_generator.beam_search(
                target_sess, target_image)
            target_sentences = []
            target_probs = []
            for indx, target_caption in enumerate(target_captions):
                target_sentence = [
                    vocab.id_to_word(w) for w in target_caption.sentence[1:-1]
                ]
                target_sentence = " ".join(target_sentence)
                print("  %d) %s (p=%f)" %
                      (1, target_sentence, math.exp(target_caption.logprob)))
                target_sentences = target_sentences + [target_sentence]
                target_probs = target_probs + [
                    math.exp(target_caption.logprob)
                ]
        else:
            # If it's untargeted, our target sentence is the attack image's own original caption.
            target_sentences = raw_sentences
            target_probs = raw_probs
            target_filename = attack_filename

        if FLAGS.use_keywords:
            if FLAGS.input_feed:
                # If there is an input feed, we use input feed as our keywords.
                words = FLAGS.input_feed.split()
            else:
                # If there is no input feed, we select keywords from the target caption.
                target_sentences_words = set(target_sentences[0].split())
                raw_sentences_words = set(raw_sentences[0].split())
                if FLAGS.targeted:
                    # If targeted, we also need to exclude the words in the original caption.
                    word_candidates = list((target_sentences_words
                                            & good_words) -
                                           raw_sentences_words)
                    word_candidates.sort()
                else:
                    word_candidates = list(
                        (target_sentences_words & good_words))
                    word_candidates.sort()
                # BUG FIX: the candidate-count check and the random selection
                # apply only when keywords come from the target caption.
                # Previously they ran unconditionally, which raised a
                # NameError on `word_candidates` (and clobbered `words`)
                # whenever FLAGS.input_feed was set.
                if len(word_candidates) < keywords_num:
                    print("words not enough for this attack!")
                    print(
                        "****************************************** END OF THIS ATTACK ******************************************"
                    )
                    continue
                # Randomly select keywords from all candidates.
                words = list(
                    np.random.choice(word_candidates, keywords_num,
                                     replace=False))

        # run multiple attacks
        success = []
        C_val = [FLAGS.C]
        best_adv = None
        best_loss, best_loss1, best_loss2 = None, None, None
        l2_distortion_log = []
        linf_distortion_log = []
        best_l2_distortion = 1e10
        best_linf_distortion = 1e10
        adv_log = []
        loss1_log = []
        loss2_log = []
        loss_log = []

        for try_index in range(FLAGS.C_search_times):

            attack_const = C_val[try_index]
            max_caption_length = 20

            if FLAGS.use_keywords:
                # keywords based attack
                key_words = [vocab.word_to_id(word) for word in words]
                print("My key words are: ", words)
                key_words_mask = np.append(
                    np.ones(len(key_words)),
                    np.zeros(max_caption_length - len(key_words)))
                key_words = key_words + [vocab.end_id] * (max_caption_length -
                                                          len(key_words))
                adv, loss, loss1, loss2, _ = attack.attack(
                    np.array([raw_image]),
                    sess,
                    inf_sess,
                    model,
                    inf_model,
                    vocab,
                    key_words,
                    key_words_mask,
                    j,
                    try_index,
                    beam_size,
                    FLAGS.infer_per_iter,
                    attack_const=attack_const)
            else:
                # exact attack
                if FLAGS.targeted:
                    if FLAGS.input_feed:
                        new_sentence = FLAGS.input_feed
                    else:
                        new_sentence = target_sentences[0]
                else:
                    new_sentence = raw_sentences[0]
                # new_sentence = "a black and white photo of a train on a track ."
                new_sentence = new_sentence.split()
                print("My target sentence:", new_sentence)
                new_caption = [vocab.start_id
                               ] + [vocab.word_to_id(w)
                                    for w in new_sentence] + [vocab.end_id]
                true_cap_len = len(new_caption)
                new_caption = new_caption + [vocab.end_id] * (
                    max_caption_length - true_cap_len)
                print("My target id:", new_caption)
                new_mask = np.append(
                    np.ones(true_cap_len),
                    np.zeros(max_caption_length - true_cap_len))
                adv, loss, loss1, loss2, _ = attack.attack(
                    np.array([raw_image]),
                    sess,
                    inf_sess,
                    model,
                    inf_model,
                    vocab,
                    new_caption,
                    new_mask,
                    j,
                    try_index,
                    1,
                    attack_const=attack_const)
            # save information of this image to log array
            adv_log += [adv]
            loss_log += [loss]
            loss1_log += [loss1]
            loss2_log += [loss2]

            adv_captions = inf_generator.beam_search(inf_sess, np.squeeze(adv))
            print("Captions after this attempt:")
            adv_caption = adv_captions[0]
            adv_sentence = [
                vocab.id_to_word(w) for w in adv_caption.sentence[1:-1]
            ]
            adv_sentence = " ".join(adv_sentence)
            print("  %d) %s (p=%f)" %
                  (1, adv_sentence, math.exp(adv_caption.logprob)))

            if FLAGS.use_keywords:
                if FLAGS.targeted:
                    success += [set(words) < set(adv_sentence.split())]
                else:
                    success += [
                        not bool(set(words) & set(adv_sentence.split()))
                    ]
            else:
                if FLAGS.targeted:
                    success += [(adv_sentence == target_sentences[0])]
                else:
                    '''
          raw_split = [item.split() for item in raw_sentences]
          nltk_BLEU = nltk.translate.bleu_score.sentence_bleu(raw_split, adv_sentence.split())
          print("BLEU by nltk is:", nltk_BLEU)
          success += [nltk_BLEU<0.5]
          '''
                    # For untargeted and caption based attack, there is no simple criterion to determine an attack is successful or not. We need to calculate the scores.
                    # So here we always assume the attack is fail, then we save fail log for score calculation.
                    success += [False]

            print("Attack with this C is successful?", success[try_index])

            l2_distortion = np.sum((adv - raw_image)**2)**.5
            linf_distortion = np.max(np.abs(adv - raw_image))
            l2_distortion_log += [l2_distortion]
            linf_distortion_log += [linf_distortion]
            print("L2 distortion is", l2_distortion)
            print("L_inf distortion is", linf_distortion)
            if success[try_index]:
                # Among the successful attacks, we select the one with minimum distortion as our final result.
                # Note this one may not correspond to minimum C.
                if FLAGS.norm == "l2":
                    if l2_distortion < best_l2_distortion:
                        best_adv = adv
                        best_loss, best_loss1, best_loss2 = loss, loss1, loss2
                        best_l2_distortion = l2_distortion
                        best_linf_distortion = linf_distortion
                        final_C = C_val[try_index]
                elif FLAGS.norm == "inf":
                    if linf_distortion < best_linf_distortion:
                        best_adv = adv
                        best_loss, best_loss1, best_loss2 = loss, loss1, loss2
                        best_l2_distortion = l2_distortion
                        best_linf_distortion = linf_distortion
                        final_C = C_val[try_index]
                else:
                    raise ValueError("unsupported distance metric:" +
                                     FLAGS.norm)
            if FLAGS.targeted or FLAGS.use_keywords:
                # We do binary search to find next C.
                if try_index + 1 < FLAGS.C_search_times:
                    if success[try_index]:
                        if any(not _ for _ in success):
                            last_false = len(success) - success[::-1].index(
                                False) - 1
                            C_val += [
                                0.5 * (C_val[try_index] + C_val[last_false])
                            ]
                        else:
                            C_val += [C_val[try_index] * 0.5]
                    else:
                        if any(_ for _ in success):
                            last_true = len(success) - success[::-1].index(
                                True) - 1
                            C_val += [
                                0.5 * (C_val[try_index] + C_val[last_true])
                            ]
                        else:
                            C_val += [C_val[try_index] * 10.0]
            else:
                C_val += [C_val[try_index] * 10.0]

        print("results of each attempt:", success)
        print("C values of each attempt:", C_val)
        print("L2 distortion log is", l2_distortion_log)
        print("L_inf distortion log is", linf_distortion_log)
        final_success = any(_ for _ in success)

        if not final_success:
            # No attempt succeeded: keep the last attempt's result and dump
            # the full attempt history to the fail log for offline scoring.
            final_C = C_val[-1]
            best_adv = adv

            best_loss, best_loss1, best_loss2 = loss, loss1, loss2
            if FLAGS.use_keywords:
                target_info = {
                    "words": words,
                    "target_filename": target_filename,
                    "target_sentences": target_sentences
                }
            else:
                target_info = {
                    'target_filename': target_filename,
                    "target_sentences": target_sentences,
                    "target_probs": target_probs
                }
            save_fail_log(adv_log, loss_log, loss1_log, loss2_log, l2_distortion_log, linf_distortion_log, success, C_val, record_path, attack_filename, raw_image, human_cap,\
              raw_sentences, raw_probs, inf_sess, inf_generator, vocab, target_info)

        show(best_adv, record_path,
             "adversarial_" + attack_filename.replace(".jpg", ".png"))
        show(best_adv - raw_image, record_path,
             "diff_" + attack_filename.replace(".jpg", ".png"))

        best_l2_distortion = np.sum((best_adv - raw_image)**2)**.5
        best_linf_distortion = np.max(np.abs(best_adv - raw_image))
        print("best L2 distortion is", best_l2_distortion)
        print("best L_inf distortion is", best_linf_distortion)

        # Re-caption the saved adversarial image with the inference model.
        adv_filename = record_path + "adversarial_" + attack_filename.replace(
            ".jpg", ".png.npy")
        adv_image = np.squeeze(np.load(adv_filename))
        adv_captions = inf_generator.beam_search(inf_sess, adv_image)
        print("Captions for adversarial image %s:" %
              os.path.basename(adv_filename))
        adv_sentences = []
        adv_probs = []
        for indx, adv_caption in enumerate(adv_captions):
            adv_sentence = [
                vocab.id_to_word(w) for w in adv_caption.sentence[1:-1]
            ]
            adv_sentence = " ".join(adv_sentence)
            print("  %d) %s (p=%f)" %
                  (1, adv_sentence, math.exp(adv_caption.logprob)))
            adv_sentences = adv_sentences + [adv_sentence]
            adv_probs = adv_probs + [math.exp(adv_caption.logprob)]

        record = open(record_path + "record_" + str(FLAGS.offset) + ".csv",
                      "a+")
        writer = csv.writer(record)
        if FLAGS.use_keywords:

            row = (target_filename, attack_filename, best_l2_distortion,best_linf_distortion,\
              best_loss,best_loss1,best_loss2,final_C,str(final_success),target_sentences[0])
            row += tuple(words) + tuple([human_cap])
            row += tuple([
                val for pair in zip(raw_sentences, raw_probs) for val in pair
            ])
            row += tuple([
                val for pair in zip(adv_sentences, adv_probs) for val in pair
            ])
            writer.writerow(row)
        else:
            row = (target_filename, attack_filename, best_l2_distortion,
                   best_linf_distortion, best_loss, best_loss1, best_loss2,
                   final_C, str(final_success))
            row += tuple([
                val for pair in zip(target_sentences, target_probs)
                for val in pair
            ]) + tuple([human_cap])
            row += tuple([
                val for pair in zip(raw_sentences, raw_probs) for val in pair
            ])
            row += tuple([
                val for pair in zip(adv_sentences, adv_probs) for val in pair
            ])
            writer.writerow(row)
        record.close()
        print(
            "****************************************** END OF THIS ATTACK ******************************************"
        )

    sess.close()
    inf_sess.close()
    if FLAGS.use_keywords or FLAGS.targeted:
        target_sess.close()
예제 #6
0
def main(args):
    """Run a C&W-L2 or ADMM-based attack per `args` (a dict of options) and
    report success rates of the crafted samples on the original and modified
    models, plus test-set accuracy and distortion statistics."""

    with tf.Session() as sess:
        # Pick dataset and matching model; handpick/inception configure
        # how generate_data_ST selects samples.
        if args['dataset'] == 'mnist':
            data, model = MNIST(), MNISTModel("models/mnist", sess)
            handpick = False
            inception = False
        if args['dataset'] == "cifar":
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            handpick = True
            inception = False
        if args['dataset'] == "imagenet":
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(sess)
            handpick = True
            inception = True

        # NOTE(review): these overrides unconditionally replace the model
        # chosen above with an MNIST/CIFAR variant — presumably only used
        # with the matching dataset; verify against the caller.
        if args['adversarial'] != "none":
            model = MNISTModel("models/mnist_cwl2_admm" + str(args['adversarial']), sess)

        if args['temp'] and args['dataset'] == 'mnist':
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']), sess)
        if args['temp'] and args['dataset'] == 'cifar':
            model = CIFARModel("models/cifar-distilled-" + str(args['temp']), sess)

        # Targeted source/target pairs for the attack.
        inputs, targets, labels, true_ids = generate_data_ST(data, model, samples=args['numimg'],
                                                             samplesT=args['numimgT'], targeted=True,
                                        start=0, inception=inception, handpick=handpick, seed=args['seed'])
        #print(true_ids)
        if args['attack'] == 'L2C':
            attack = CarliniL2(sess, model, batch_size=args['batch_size'], max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['binary_steps'],
                               abort_early=args['abort_early'])

        if args['attack'] == 'L2LA2':
            attack = LADMML2re(sess, model, batch_size=args['batch_size'], max_iterations=args['maxiter'],
                               layernum=args['layer_number'], use_kernel=args['use_kernel'],
                               confidence=args['conf'], binary_search_steps=args['iteration_steps'], ro=args['ro'],
                               abort_early=args['abort_early'])


        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()

        print("Took", timeend - timestart, "seconds to run", len(inputs), "samples.\n")

        # High-confidence attacks are evaluated against the distilled model.
        if args['conf'] != 0:
            model = MNISTModel("models/mnist-distilled-100", sess)

        # NOTE(review): when args['kernel_bias'] is falsy, only EP is built —
        # `scores`, `scores2`, `test_scores`, `test_scores2`, and `l2` are
        # never assigned and the loops/prints below will raise NameError.
        # Verify kernel_bias is always truthy for this code path.
        if args['kernel_bias']:
            EP = evaluate_perturbation_kb(args, sess, model, inputs)
            scores, l2 = EP(inputs, targets, adv)
            EPT = evaluate_perturbation_testset(args, sess, model, data.test_data)
            test_scores = EPT(data.test_data, data.test_labels)
            EP2 = evaluate_perturbation_kb_restore(args, sess, model, inputs)
            scores2 = EP2(inputs, targets, adv)
            EPT2 = evaluate_perturbation_testset(args, sess, model, data.test_data)
            test_scores2 = EPT2(data.test_data, data.test_labels)
        else:
            EP = evaluate_perturbation(args, sess, model, inputs)
#        scores = EP(inputs, targets, adv)
#        scores2 = EP2(inputs, targets, adv)

        # Tally per-sample hits: score_count = adv classified as target,
        # score_count4 = same but only for the first numimg (S) images,
        # score_count3 = adv still classified as the original label,
        # score_count2 = restored-model classification vs original label.
        score_count = []
        score_count2 = []
        score_count3 = []

        score_count4 = []
        for e, (sc) in enumerate(scores):

            if np.argmax(sc) == np.argmax(targets[e]):
                score_count.append(1)
                if e < args['numimg']:
                    score_count4.append(1)
            else:
                score_count.append(0)
                if e < args['numimg']:
                    score_count4.append(0)

        for e, (sc) in enumerate(scores):
            if np.argmax(sc) == np.argmax(labels[e]):
                score_count3.append(1)
            else:
                score_count3.append(0)

        for e, (sc2) in enumerate(scores2):
            if np.argmax(sc2) == np.argmax(labels[e]):
                score_count2.append(1)
            else:
                score_count2.append(0)

        # Clean test-set accuracy under the modified and original models.
        test_score_count = []
        test_score_count2 = []

        for e, (tsc) in enumerate(test_scores):

            if np.argmax(tsc) == np.argmax(data.test_labels[e]):
                test_score_count.append(1)
            else:
                test_score_count.append(0)

        for e, (tsc2) in enumerate(test_scores2):

            if np.argmax(tsc2) == np.argmax(data.test_labels[e]):
                test_score_count2.append(1)
            else:
                test_score_count2.append(0)

        # l0s counts nonzero entries over the whole adversarial batch.
        l0s = np.count_nonzero(adv)
        successrate = np.mean(score_count)
        successrate2 = np.mean(score_count2)
        successrate3 = np.mean(score_count3)
        test_successrate = np.mean(test_score_count)
        test_successrate2 = np.mean(test_score_count2)

        print('original model, success rate of T images for the original labels:', successrate2)
        print('modified model, success rate of T images for the original labels:', successrate3)
        print('modified model, success rate of T images for the target labels:', successrate)
        print('modified model, success rate of S imges for the target labels:', np.mean(score_count4))

        print('modified model, success rate of test set for the original labels:', test_successrate)
        print('original model, success rate of test set for the original labels:', test_successrate2)
        print('l0 distance:', l0s)
        print('l2 distance:', l2)

# In[ ]:

if __name__ == "__main__":
    # Let TensorFlow allocate GPU memory on demand instead of reserving it all.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        modelPath = '%smodels/mnist' % (nn_robust_attack_root)
        data, model = MNIST(), MNISTModel(modelPath, sess)

        # Untargeted Carlini-Wagner L2 attack, one sample per batch,
        # binary search over the trade-off constant starting at 1.
        attack = CarliniL2(sess,
                           model,
                           batch_size=1,
                           max_iterations=2000,
                           confidence=0,
                           binary_search_steps=5,
                           initial_const=1.,
                           learning_rate=1e-1,
                           targeted=False)

        # 1000 untargeted samples drawn from the test set starting at index 5500.
        inputs, targets = generate_data(data,
                                        samples=1000,
                                        targeted=False,
                                        start=5500,
                                        inception=False)

        original_classified_wrong_number = 0  #number of benign samples that are misclassified
        disturbed_failure_number = 0  #number of samples that failed to craft corresponding adversarial samples
        test_number = 0  #number of adversarial samples that we generate
        TTP = 0
        # NOTE(review): the loop that consumes these counters (and defines the
        # meaning of TTP) is not visible here — this snippet appears truncated.
예제 #8
0
def main(args):
    """Craft adversarial examples with a C&W L2 or ADMM L0 attack and report distortions.

    args: dict of experiment options ('dataset', 'attack', 'batch_size',
    'maxiter', 'conf', 'binary_steps', 'numimg', 'seed', ...).

    Behaviour is unchanged from the original except that an unsupported
    dataset or attack name now raises ValueError immediately instead of
    failing later with an obscure NameError on the unbound `data`/`model`
    or `attack` variable.
    """
    with tf.Session() as sess:
        # Pick dataset + model; handpick/inception steer sample selection below.
        if args['dataset'] == 'mnist':
            data, model = MNIST(), MNISTModel("models/mnist", sess)
            handpick = False
            inception = False
        elif args['dataset'] == "cifar":
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            handpick = True
            inception = False
        elif args['dataset'] == "imagenet":
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(sess)
            handpick = True
            inception = True
        else:
            raise ValueError("unsupported dataset: %r" % args['dataset'])

        # Optional model overrides.
        # NOTE(review): both overrides replace the model regardless of the
        # chosen dataset, and the 'adversarial' one always loads an MNIST
        # model — confirm this is intended for non-MNIST runs.
        if args['adversarial'] != "none":
            model = MNISTModel(
                "models/mnist_cwl2_admm" + str(args['adversarial']), sess)

        if args['temp'] and args['dataset'] == 'mnist':
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']),
                               sess)
        if args['temp'] and args['dataset'] == 'cifar':
            model = CIFARModel("models/cifar-distilled-" + str(args['temp']),
                               sess)

        # Targeted sample/target selection for the attack.
        inputs, targets, labels, true_ids = generate_data(
            data,
            model,
            samples=args['numimg'],
            targeted=True,
            start=0,
            inception=inception,
            handpick=handpick,
            seed=args['seed'])

        #print(true_ids)
        # Build the requested attack; fail fast on an unknown name (the
        # original code would only crash at attack.attack(...) below).
        if args['attack'] == 'L2C':
            attack = CarliniL2(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['binary_steps'],
                               abort_early=args['abort_early'])
        elif args['attack'] == 'L0A':
            attack = ADMML0(sess,
                            model,
                            batch_size=args['batch_size'],
                            max_iterations=args['maxiter'],
                            confidence=args['conf'],
                            binary_search_steps=args['iteration_steps'],
                            ro=args['ro'],
                            abort_early=args['abort_early'])
        else:
            raise ValueError("unsupported attack: %r" % args['attack'])

        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()

        print("Took", timeend - timestart, "seconds to run", len(inputs),
              "samples.\n")

        # Optionally persist the crafted examples for later training runs.
        if args['train']:
            np.save('labels_train.npy', labels)
            np.save(str(args['attack']) + '_train.npy', adv)

        # With non-zero confidence, evaluate against the distilled model.
        if (args['conf'] != 0):
            model = MNISTModel("models/mnist-distilled-100", sess)

        # L0-family attacks get the L0 metric; everything else gets L1/L2/Linf.
        if args['attack'] not in ('L0A', 'L0AE', 'L0C', 'L0AE2'):
            l1_l2_li_computation(args, data, model, adv, inception, inputs,
                                 targets, labels, true_ids)
        else:
            l0_computation(args, data, model, adv, inception, inputs, targets,
                           labels, true_ids)
예제 #9
0
    def __init__(self, reformer):
        """Store the reformer whose model will pre-process inputs in predict()."""
        self.reformer = reformer

    def predict(self, x):
        # Pass x through the reformer's model first, then through the classifier.
        # NOTE(review): `classifier` is a module-level global, not an attribute —
        # confirm it is defined before this method is called.
        return classifier.model(self.reformer.model(x))


if attacking:
    # Detector thresholds, scaled 4x relative to the stored dr values.
    thrs = operator.get_thrs(dict((k, v * 4) for k, v in dr.items()))

    # Targeted C&W L2 attack against every reformer-wrapped predictor,
    # constrained by the detector thresholds; inputs assumed in [0, 1]
    # (boxmin/boxmax).
    attack = CarliniL2(sess, [Pred2(x) for x in reformer],
                       detector_dict,
                       thrs,
                       batch_size=100,
                       binary_search_steps=4,
                       learning_rate=1e-2,
                       max_iterations=10000,
                       targeted=True,
                       initial_const=1,
                       confidence=1,
                       boxmin=0,
                       boxmax=1)

    adv = attack.attack(dat, lab)
    # Cache the crafted examples so later runs can skip the attack.
    np.save("/tmp/" + dataset + ".npy", adv)
else:
    # Reuse adversarial examples crafted by a previous run.
    adv = np.load("/tmp/" + dataset + ".npy")
# Mean per-image L2 distortion over all crafted examples.
print('mean distortion', np.mean(np.sum((adv - dat)**2, axis=(1, 2, 3))**.5))

for i, ref in enumerate(reformer):
    print('reformer', i)
    predicted = np.argmax(classifier.model.predict(ref.model.predict(adv)),
예제 #10
0
def main(args):
    """Run a white-box (C&W L2) or black-box attack per image and report accuracy.

    args: dict of experiment options ('dataset', 'attack', 'maxiter', 'lr',
    'seed', 'save', ...).  For each correctly-classified test image the
    chosen attack is run, the adversarial image is re-encoded with the
    temperature encoder before re-classification, and original/adversarial
    images plus per-image stats are saved under args['save']/args['dataset'].
    """
    # Temperature encoder applied per colour channel to adversarial outputs.
    temp_encoder = encoder(level=args['level'])
    with tf.Session() as sess:
        use_log = not args['use_zvalue']
        is_inception = args['dataset'] == "imagenet"
        # load network
        print('Loading model', args['dataset'])
        if args['dataset'] == "mnist":
            data, model = MNIST(), MNISTModel("models/mnist", sess, use_log)
            # data, model =  MNIST(), MNISTModel("models/mnist-distilled-100", sess, use_log)
        elif args['dataset'] == "cifar10":
            #data, model = CIFAR(), CIFARModel("models/cifar", sess, use_log)
            # data, model = CIFAR(), CIFARModel("models/cifar-distilled-100", sess, use_log)
            data, model = CIFAR(), CIFAR_WIDE("models/wide_resnet", sess,
                                              use_log)
        elif args['dataset'] == "imagenet":
            data, model = ImageNet(), InceptionModel(sess, use_log)
        print('Done...')
        # numimg == 0 means "use every remaining test image after firstimg".
        if args['numimg'] == 0:
            args['numimg'] = len(data.test_labels) - args['firstimg']
        print('Using', args['numimg'], 'test images')
        # load attack module
        if args['attack'] == "white":
            # batch size 1, optimize on 1 image at a time, rather than optimizing images jointly
            attack = CarliniL2(sess,
                               model,
                               batch_size=1,
                               max_iterations=args['maxiter'],
                               print_every=args['print_every'],
                               early_stop_iters=args['early_stop_iters'],
                               confidence=0,
                               learning_rate=args['lr'],
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               targeted=not args['untargeted'],
                               use_log=use_log,
                               adam_beta1=args['adam_beta1'],
                               adam_beta2=args['adam_beta2'])
        else:
            # batch size 128, optimize on 128 coordinates of a single image
            attack = BlackBoxL2(sess,
                                model,
                                batch_size=128,
                                max_iterations=args['maxiter'],
                                print_every=args['print_every'],
                                early_stop_iters=args['early_stop_iters'],
                                confidence=0,
                                learning_rate=args['lr'],
                                initial_const=args['init_const'],
                                binary_search_steps=args['binary_steps'],
                                targeted=not args['untargeted'],
                                use_log=use_log,
                                use_tanh=args['use_tanh'],
                                use_resize=args['use_resize'],
                                adam_beta1=args['adam_beta1'],
                                adam_beta2=args['adam_beta2'],
                                reset_adam_after_found=args['reset_adam'],
                                solver=args['solver'],
                                save_ckpts=args['save_ckpts'],
                                load_checkpoint=args['load_ckpt'],
                                start_iter=args['start_iter'],
                                init_size=args['init_size'],
                                use_importance=not args['uniform'])

        # Seed both RNGs so data selection is reproducible.
        random.seed(args['seed'])
        np.random.seed(args['seed'])
        print('Generate data')
        all_inputs, all_targets, all_labels, all_true_ids, encoding_all = generate_data(
            data,
            samples=args['numimg'],
            targeted=not args['untargeted'],
            start=args['firstimg'],
            inception=is_inception)
        print('Done...')
        #print('all_inputs : ', all_inputs.shape)
        #print('encoding_all : ',encoding_all.shape)
        os.system("mkdir -p {}/{}".format(args['save'], args['dataset']))
        # Per-run aggregates.
        img_no = 0
        total_success = 0
        l2_total = 0.0
        origin_correct = 0
        adv_correct = 0
        for i in range(all_true_ids.size):
            print(' adversarial_image_no: ', i)
            # Keep a leading batch axis with the i:i+1 slices.
            inputs = all_inputs[i:i + 1]
            encoding_inputs = encoding_all[i:i + 1]
            #print('encoding_inputs shape: ', encoding_inputs)
            targets = all_targets[i:i + 1]
            labels = all_labels[i:i + 1]
            print("true labels:", np.argmax(labels), labels)
            print("target:", np.argmax(targets), targets)
            # test if the image is correctly classified
            original_predict = model.model.predict(encoding_inputs)
            original_predict = np.squeeze(original_predict)
            original_prob = np.sort(original_predict)
            original_class = np.argsort(original_predict)
            print("original probabilities:", original_prob[-1:-6:-1])
            print("original classification:", original_class[-1:-6:-1])
            print("original probabilities (most unlikely):", original_prob[:6])
            print("original classification (most unlikely):",
                  original_class[:6])
            # Skip images the model already misclassifies.
            if original_class[-1] != np.argmax(labels):
                print(
                    "skip wrongly classified image no. {}, original class {}, classified as {}"
                    .format(i, np.argmax(labels), original_class[-1]))
                continue
            origin_correct += np.argmax(labels, 1) == original_class[-1]

            img_no += 1
            timestart = time.time()
            adv, const = attack.attack_batch(inputs, targets)
            if type(const) is list:
                const = const[0]
            # Restore the batch axis if the attack returned a single image.
            if len(adv.shape) == 3:
                adv = adv.reshape((1, ) + adv.shape)
            timeend = time.time()
            l2_distortion = np.sum((adv - inputs)**2)**.5

            ##### llj
            # Re-encode the adversarial image channel-by-channel with the
            # temperature encoder (NHWC -> NCHW -> encode -> NHWC).
            encode_adv = np.transpose(adv, axes=(0, 3, 1, 2))
            channel0, channel1, channel2 = encode_adv[:,
                                                      0, :, :], encode_adv[:,
                                                                           1, :, :], encode_adv[:,
                                                                                                2, :, :]
            channel0, channel1, channel2 = temp_encoder.tempencoding(
                channel0), temp_encoder.tempencoding(
                    channel1), temp_encoder.tempencoding(channel2)
            encode_adv = np.concatenate([channel0, channel1, channel2], axis=1)
            encode_adv = np.transpose(encode_adv, axes=(0, 2, 3, 1))

            #### llj
            adversarial_predict = model.model.predict(encode_adv)
            adversarial_predict = np.squeeze(adversarial_predict)
            adversarial_prob = np.sort(adversarial_predict)
            adversarial_class = np.argsort(adversarial_predict)
            print("adversarial probabilities:", adversarial_prob[-1:-6:-1])
            print("adversarial classification:", adversarial_class[-1:-6:-1])

            adv_correct += np.argmax(labels, 1) == adversarial_class[-1]

            # Success criterion depends on targeted vs untargeted mode;
            # distortions above 20.0 are always counted as failures.
            success = False
            if args['untargeted']:
                if adversarial_class[-1] != original_class[-1]:
                    success = True
            else:
                if adversarial_class[-1] == np.argmax(targets):
                    success = True
            if l2_distortion > 20.0:
                success = False
            if success:
                total_success += 1
                l2_total += l2_distortion
            suffix = "id{}_seq{}_prev{}_adv{}_{}_dist{}".format(
                all_true_ids[i], i, original_class[-1], adversarial_class[-1],
                success, l2_distortion)
            print("Saving to", suffix)
            # Save original, adversarial, and difference images.
            show(
                inputs,
                "{}/{}/{}_original_{}.png".format(args['save'],
                                                  args['dataset'], img_no,
                                                  suffix))
            show(
                adv,
                "{}/{}/{}_adversarial_{}.png".format(args['save'],
                                                     args['dataset'], img_no,
                                                     suffix))
            show(
                adv - inputs,
                "{}/{}/{}_diff_{}.png".format(args['save'], args['dataset'],
                                              img_no, suffix))
            print(
                "[STATS][L1] total = {}, seq = {}, id = {}, time = {:.3f}, success = {}, const = {:.6f}, prev_class = {}, new_class = {}, distortion = {:.5f}, success_rate = {:.3f}, l2_avg = {:.5f}"
                .format(img_no, i, all_true_ids[i], timeend - timestart,
                        success, const, original_class[-1],
                        adversarial_class[-1], l2_distortion,
                        total_success / float(img_no),
                        0 if total_success == 0 else l2_total / total_success))
            sys.stdout.flush()

        print(' origin accuracy : ',
              100.0 * origin_correct / all_true_ids.size)
        print(' adv accuracy : ', 100.0 * adv_correct / all_true_ids.size)
예제 #11
0
            inputs.append(data.test_data[start + i])
            targets.append(data.test_labels[start + i])

    inputs = np.array(inputs)
    targets = np.array(targets)

    return inputs, targets


if __name__ == "__main__":
    with tf.Session() as sess:
        data, model = MNIST(), MNISTModel("models/mnist", sess)
        #data, model =  CIFAR(), CIFARModel("models/cifar", sess)
        attack = CarliniL2(sess,
                           model,
                           batch_size=9,
                           max_iterations=1000,
                           confidence=0)
        #attack = CarliniL0(sess, model, max_iterations=1000, initial_const=10,
        #                   largest_const=15)

        inputs, targets = generate_data(data,
                                        samples=1,
                                        targeted=True,
                                        start=0,
                                        inception=False)
        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()

        print("Took", timeend - timestart, "seconds to run", len(inputs),
예제 #12
0
def test_cw():
    """Exploratory (REPL-style) driver for the C&W L2/L0/Linf attacks.

    Loads MNIST and CIFAR models, crafts adversarial examples with all three
    attacks, and saves image grids.  Many statements below are bare
    expressions with no effect on program state — they look like they were
    pasted from an interactive session to be inspected line by line.
    """
    sess = tf.Session()
    # sess.run(tf.global_variables_initializer())

    # keras maintains a tf session. It must be set by either
    # keras.backend.set_session(sess), or use inside a context manager
    # sess.as_default()
    with sess.as_default():
        data, model = MNIST(), MNISTModel("models/mnist", sess)
    # NOTE(review): the MNIST pair loaded above is immediately overwritten
    # by the CIFAR pair below — confirm which dataset was intended.
    with sess.as_default():
        data, model = CIFAR(), CIFARModel("models/cifar", sess)

    # testing the model
    # NOTE(review): the first np.argmax result is discarded (no print).
    np.argmax(model.model.predict(data.test_data[:10]), axis=1)
    print(np.argmax(data.test_labels[:10], axis=1))

    #data, model =  CIFAR(), CIFARModel("models/cifar", sess)
    attack_l2 = CarliniL2(sess,
                          model,
                          batch_size=10,
                          max_iterations=1000,
                          confidence=0)
    attack_l0 = CarliniL0(sess,
                          model,
                          max_iterations=1000,
                          initial_const=10,
                          largest_const=15)
    attack_li = CarliniLi(sess, model)

    # NOTE(review): this first inputs/targets pair is discarded — it is
    # overwritten by generate_data_2 just below.
    inputs, targets = generate_data(data,
                                    samples=1,
                                    targeted=True,
                                    start=0,
                                    inception=False)
    # TODO find the first digits of each kind, try map it to the next digit
    inputs, targets = generate_data_2(data)

    # Craft adversarial examples under each norm.
    adv_l2 = attack_l2.attack(inputs, targets)
    adv_l0 = attack_l0.attack(inputs, targets)
    adv_li = attack_li.attack(inputs, targets)

    plt.tight_layout()
    plt.tight_layout(pad=1, w_pad=1, h_pad=1)

    # Save image grids for originals and each attack's outputs.
    grid_show_image(inputs, 10, 1, 'images/orig-mnist.png')
    grid_show_image(adv_l2, 10, 1, 'images/l2.png')
    grid_show_image(adv_l0, 10, 1, 'images/l0.png')
    grid_show_image(adv_li, 9, 2, 'images/li.png')

    # NOTE(review): the import below is never used; the bare name is a no-op.
    from contextlib import redirect_stdout
    redirect_stdout

    # The remaining bare expressions compute values that are never used or
    # printed — left as-is to preserve the original (side effects of the
    # predict/sess.run calls still occur).
    np.sum((adv_l2[0] - inputs[0])**2)

    # np.argmax(targets, axis=1)
    # import keras
    # keras.backend.set_session(sess)
    np.argmax(model.model.predict(inputs), axis=1)
    np.argmax(targets, axis=1)
    # # (((adv_l2 + 0.5)*255).round())

    np.argmax(model.model.predict(adv_l2), axis=1)
    np.argmax(model.model.predict(adv_l0), axis=1)
    np.argmax(model.model.predict(adv_li), axis=1)

    np.sum(model.model.predict(adv_l2), axis=1)

    np.sum(sess.run(tf.nn.softmax(model.model.predict(adv_l2))), axis=1)

    softmax_pred = sess.run(tf.nn.softmax(model.model.predict(adv_l2)))
    softmax_pred[0]
    np.argmax(softmax_pred, axis=1)

    # NOTE(review): `keras` is referenced here but its import is commented
    # out above — this line would raise NameError if reached; verify.
    keras.activations.softmax(model.model)

    model.model.predict(((adv_l2 + 0.5) * 255).round())
예제 #13
0
if __name__ == "__main__":
    with tf.Session() as sess:
        use_log = False
        print('Loading model...')
        # data, model =  MNIST(), MNISTModel("models/mnist", sess, use_log)
        # data, model =  MNIST(), MNISTModel("models/mnist-distilled-100", sess, use_log)
        # data, model = CIFAR(), CIFARModel("models/cifar", sess, use_log)
        data, model = ImageNet(), InceptionModel(sess, use_log)
        print('Done...')
        # Use a larger batch for Inception; single image otherwise.
        batch_size = 1
        if isinstance(model, InceptionModel):
            batch_size = 10
        attack = CarliniL2(sess,
                           model,
                           batch_size=batch_size,
                           initial_const=1.0,
                           max_iterations=1000,
                           confidence=0,
                           use_log=use_log)

        print('Generate data')
        inputs, targets = generate_data(data,
                                        samples=1,
                                        targeted=True,
                                        start=6,
                                        inception=isinstance(
                                            model, InceptionModel))
        print('Done...')
        print(inputs.shape)
        # Trim to one attack batch.
        inputs = inputs[0:batch_size]
        # NOTE(review): excerpt appears truncated here — the attack object is
        # built but never run in the visible code.
        targets = targets[0:batch_size]
예제 #14
0
def main(args):
    """Batch-run a CW or EADL1 attack on ImageNet and log per-image/aggregate stats.

    args: dict of experiment options ('dataset', 'attack', 'model_name',
    'numimg', 'seed', 'save', ...).  Crafts adversarial examples in batches,
    computes L0/L1/L2/Linf distortions for each, dumps images/targets/labels
    to disk, and writes tab-separated 'verbose' (per image) and 'aggre'
    (summary) report files.
    """

    with tf.Session() as sess:

        # Seed all RNGs for reproducible sample selection.
        random.seed(args["seed"])
        np.random.seed(args["seed"])
        tf.set_random_seed(args["seed"])

        print("seed = ", args["seed"])

        overall_timestart = time.time()

        use_log = not args['use_zvalue']

        print("use_log = ", use_log)

        data_map = {}

        model_map = {}

        # Only the imagenet dataset is populated here; other dataset names
        # would fail at the data_map lookup below.
        if args['dataset'] == "imagenet":
            if args['attack'] == "CW":
                model_map[args['model_name']] = ImageNetModel(
                    sess, use_log, args['model_name'], create_prediction=False)
            elif args['attack'] == "EADL1":
                model_map[args['model_name']] = ImageNetModel(
                    sess, use_log, args['model_name'], create_prediction=True)

            data_map['imagenet'] = ImageNet(
                model_map[args['model_name']].image_size,
                load_total_imgs=args['numimg_loaded'])

        print('Loading model', args['dataset'])
        data = data_map[args['dataset']]
        model = model_map[args['model_name']]

        # numimg == 0 means "use every remaining test image after firstimg".
        if args['numimg'] == 0:
            args['numimg'] = len(data.test_labels) - args['firstimg']
        print('Using', args['numimg'], 'test images')
        # load attack module

        print('args = ', args)

        targeted_flag = not args['untargeted']

        print("targeted_flag = ", targeted_flag)

        # load attack module
        # attack_predictor is used both for sample filtering and evaluation.
        if args['attack'] == "CW":

            attack = CarliniL2(sess, model, 100)
            attack_predictor = attack.predict
        elif args['attack'] == "EADL1":
            attack_predictor = model.model.predict

        # Re-seed so data generation is independent of model-loading RNG use.
        random.seed(args['seed'])
        np.random.seed(args['seed'])
        tf.set_random_seed(args['seed'])

        print('Generate data')
        model_name = args['model_name']

        # These architectures include a background class that must be removed.
        if 'vgg' in model_name or 'densenet' in model_name or 'alexnet' in model_name:
            remove_background_class_flag = True
        else:
            remove_background_class_flag = False

        sys.stdout.flush()

        all_inputs, all_targets, all_labels, all_true_ids, img_info = generate_data(
            data,
            samples=args['numimg'],
            targeted=targeted_flag,
            random_and_least_likely=True,
            predictor=attack_predictor,
            start=args['firstimg'],
            imagenet=isinstance(data, ImageNet),
            remove_background_class=remove_background_class_flag,
            target_type=args['target_type'],
            total_num_valid_samples=args['num_valid_test_imgs'])

        print('len(all_inputs) = ', len(all_inputs))
        print("all_inputs shape:", all_inputs.shape)
        print("all_targets shape:", all_targets.shape)

        # attack_batch_size == 0 means "one batch covering every sample".
        attack_batch_size = args['attack_batch_size']
        if attack_batch_size == 0:
            attack_batch_size = all_true_ids.size
        print("attack_batch_size = ", attack_batch_size)

        # The CW attack object was built above and is re-initialized here;
        # EADL1 is constructed fresh.
        if args['attack'] == 'CW':
            attack.init_attack(sess,
                               model,
                               targeted=targeted_flag,
                               batch_size=attack_batch_size,
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               max_iterations=args['maxiter'],
                               print_every=args['print_every'],
                               confidence=args['kappa'],
                               use_log=use_log)

        elif args['attack'] == 'EADL1':
            print("EADL1 attack")
            attack = EADL1(sess,
                           model,
                           targeted=targeted_flag,
                           batch_size=attack_batch_size,
                           initial_const=args['init_const'],
                           binary_search_steps=args['binary_steps'],
                           max_iterations=args['maxiter'],
                           confidence=args['kappa'],
                           print_every=args['print_every'])

        else:
            print("Invalid attack name, exit 1")
            return

        saved_path = "{}/{}/{}/targeted_{}".format(args['save'],
                                                   args['dataset'],
                                                   args['attack'],
                                                   targeted_flag)
        if not os.path.exists(saved_path):
            os.system("mkdir -p " + saved_path)

        # Per-run aggregates; distortion lists only collect successes.
        img_no = 0
        total_success = 0
        l0_list = []
        l1_list = []
        l2_list = []
        linf_list = []
        time_list = []

        # Tab-separated report files: per-image rows and final summary row.
        verbose_f = open(
            args['save'] + "/" + "_".join([
                args['dataset'], args['attack'],
                str(targeted_flag), "verbose.txt"
            ]), "w")
        aggre_f = open(
            args['save'] + "/" + "_".join([
                args['dataset'], args['attack'],
                str(targeted_flag), "aggre.txt"
            ]), "w")

        if targeted_flag == True:
            verbose_head_str = '\t'.join([
                'total', 'seq', 'id', 'time', 'success', 'prev_class',
                'target', 'new_class', 'l0_distortion', 'l1_distortion',
                'l2_distortion', 'linf_distortion'
            ])
        else:
            verbose_head_str = '\t'.join([
                'total', 'seq', 'id', 'time', 'success', 'prev_class',
                'new_class', 'l0_distortion', 'l1_distortion', 'l2_distortion',
                'linf_distortion'
            ])

        aggre_head_str = '\t'.join([
            'total_count', 'success_rate', 'l0_avg', 'l0_std', 'l1_avg',
            'l1_std', 'l2_avg', 'l2_std', 'linf_avg', 'linf_std', 'time_avg',
            'time_std'
        ])

        verbose_f.write(verbose_head_str + '\n')
        aggre_f.write(aggre_head_str + '\n')

        print("all_true_ids.size = ", all_true_ids.size)
        sys.stdout.flush()

        # Re-seed again so the attack itself is reproducible.
        random.seed(args['seed'])
        np.random.seed(args['seed'])
        tf.set_random_seed(args['seed'])

        # Attack in batches; the final batch may be smaller.
        for i in range(0, all_true_ids.size, attack_batch_size):

            if i + attack_batch_size > all_true_ids.size:
                actual_attack_batch_size = all_true_ids.size - i
            else:
                actual_attack_batch_size = attack_batch_size

            inputs = all_inputs[i:i + actual_attack_batch_size]
            targets = all_targets[i:i + actual_attack_batch_size]
            labels = all_labels[i:i + actual_attack_batch_size]

            timestart = time.time()
            """perform the attack"""
            print("perform the attack")
            adv = attack.attack(inputs, targets)

            timeend = time.time()

            time_used = timeend - timestart
            # NOTE(review): divides by the nominal batch size even for the
            # smaller final batch — per-image time is underestimated there.
            time_used_per_image = time_used / attack_batch_size

            # Evaluate each adversarial example in the batch.
            for j in range(len(adv)):

                print("=" * 10, "i = ", i, "=" * 10, "j=", j, "=" * 10)

                # original_predict = np.squeeze(attack.predict(np.array([inputs[j]])))
                original_predict = np.squeeze(
                    attack_predictor(np.array([inputs[j]])))

                original_prob = np.sort(original_predict)
                original_class = np.argsort(original_predict)
                # NOTE(review): these two labels appear swapped — the first
                # prints probabilities, the second prints class indices.
                print("Original Classification:", original_prob[-1:-6:-1])
                print("Original Probabilities/Logits:",
                      original_class[-1:-6:-1])
                sys.stdout.flush()

                true_label = np.argmax(labels[j])
                target_label = np.argmax(targets[j])
                attack_label = None
                success = False

                img_no += 1

                print("Target:", target_label)
                # if the array contains NaN, the solver did not return a solution
                if (np.any(np.isnan(adv[j]))):
                    print('Attack failed. (solver returned NaN)')
                    l0_distortion = l1_distortion = l2_distortion = linf_distortion = np.nan
                    adversarial_class = np.zeros(original_class.shape)

                else:
                    # Distortion under each norm between adversarial and input.
                    l0_distortion = l0_loss(adv[j], inputs[j])
                    l1_distortion = l1_loss(adv[j], inputs[j])
                    l2_distortion = l2_loss(adv[j], inputs[j])
                    linf_distortion = linf_loss(adv[j], inputs[j])
                    #adversarial_predict = np.squeeze(model.model.predict(np.array([adv[j]])))
                    # adversarial_predict = np.squeeze(attack.predict(np.array([adv[j]])))
                    adversarial_predict = np.squeeze(
                        attack_predictor(np.array([adv[j]])))

                    adversarial_prob = np.sort(adversarial_predict)
                    adversarial_class = np.argsort(adversarial_predict)
                    attack_label = np.argmax(adversarial_predict)

                    print("adversarial probabilities:",
                          adversarial_prob[-1:-11:-1])
                    print("adversarial classification:",
                          adversarial_class[-1:-11:-1])
                    sys.stdout.flush()

                    success = False
                    if targeted_flag:
                        success = np.argsort(
                            adversarial_predict)[-1] == target_label

                        # Tie-break: also count success when the target's score
                        # is within 0.001 of the top score.
                        # NOTE(review): the comprehension's `i` shadows the
                        # outer batch index, and the range excludes the last
                        # class index — confirm both are intentional.
                        candidates = set([
                            i for i in range(len(adversarial_predict) - 1)
                            if abs(adversarial_predict[i] -
                                   adversarial_prob[-1]) < 0.001
                        ])
                        if len(candidates) > 1 and target_label in candidates:
                            success = True
                    else:
                        success = np.argsort(
                            adversarial_predict)[-1] != target_label

                    if success:
                        print("Attack succeeded.")
                    else:
                        print("Attack failed.")

                    if success:
                        total_success += 1
                        l0_list.append(l0_distortion)
                        l1_list.append(l1_distortion)
                        l2_list.append(l2_distortion)
                        linf_list.append(linf_distortion)
                        time_list.append(time_used_per_image)

                suffix = "id={0}_seq={1}_prev={2}_adv={3}_res={4}".format(
                    all_true_ids[i + j], i, original_class[-1],
                    adversarial_class[-1], success)
                print("Saving to", suffix)
                sys.stdout.flush()

                # Persist the images plus their target/label vectors.
                dump(
                    inputs[j],
                    "{}/imgno={}_content={}_{}".format(saved_path, img_no,
                                                       'original', suffix))
                dump(
                    adv[j],
                    "{}/imgno={}_content={}_{}".format(saved_path, img_no,
                                                       'adversarial', suffix))
                # dump(adv[j] - inputs[j], "{}/imgno={}_content={}_{}".format(saved_path, img_no, 'noise', suffix))
                np.save(
                    "{}/imgno={}_content={}_{}".format(
                        saved_path, img_no, 'targets', suffix) + ".npy",
                    targets[j])
                np.save(
                    "{}/imgno={}_content={}_{}".format(
                        saved_path, img_no, 'labels', suffix) + ".npy",
                    labels[j])

                L1_debug_str = "[STATS][L1] total = {}, seq = {}, id = {}, time = {:.3f}, success = {}, " \
                               "prev_class = {}, new_class = {}, distortion = {:.5f}, success_rate = {:.3f}, " \
                               "l2_avg = {:.5f}".format(img_no, i+j, all_true_ids[i+j], time_used_per_image, success,
                                original_class[-1], adversarial_class[-1], l2_distortion,
                                total_success / float(img_no), 0 if total_success == 0 else np.mean(l2_list))

                print(L1_debug_str)
                sys.stdout.flush()

                # One tab-separated row per image; targeted runs include the
                # target column.
                if targeted_flag == True:
                    verbose_str = '\t'.join([
                        str(img_no),
                        str(i + j),
                        str(all_true_ids[i + j]),
                        str(time_used_per_image),
                        str(success),
                        str(original_class[-1]),
                        str(np.argmax(targets[j])),
                        str(adversarial_class[-1]),
                        str(l0_distortion),
                        str(l1_distortion),
                        str(l2_distortion),
                        str(linf_distortion)
                    ])
                else:
                    verbose_str = '\t'.join([
                        str(img_no),
                        str(i + j),
                        str(all_true_ids[i + j]),
                        str(time_used_per_image),
                        str(success),
                        str(original_class[-1]),
                        str(adversarial_class[-1]),
                        str(l0_distortion),
                        str(l1_distortion),
                        str(l2_distortion),
                        str(linf_distortion)
                    ])

                verbose_f.write(verbose_str + "\n")
                verbose_f.flush()
                print(verbose_head_str)
                print(verbose_str)

                sys.stdout.flush()

                overall_timeend_sofar = time.time()

                overall_time_used_sofar = overall_timeend_sofar - overall_timestart

                print("overall_time_used_sofar = ", overall_time_used_sofar)
                sys.stdout.flush()

        verbose_f.close()

        if img_no == 0:
            success_rate = 0.0
        else:
            success_rate = total_success / float(img_no)

        # Summary row: all-zero stats when nothing succeeded.
        if total_success == 0:
            aggre_str = "\t".join([
                str(img_no),
                str(success_rate),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0)
            ])
        else:
            aggre_str = "\t".join([
                str(img_no),
                str(success_rate),
                str(np.mean(l0_list)),
                str(np.std(l0_list)),
                str(np.mean(l1_list)),
                str(np.std(l1_list)),
                str(np.mean(l2_list)),
                str(np.std(l2_list)),
                str(np.mean(linf_list)),
                str(np.std(linf_list)),
                str(np.mean(time_list)),
                str(np.std(time_list))
            ])

        aggre_f.write(aggre_str + "\n")
        print(aggre_head_str)
        print(aggre_str)
        sys.stdout.flush()
        aggre_f.close()

        overall_timeend = time.time()

        overall_time_used = overall_timeend - overall_timestart

        print("overall_time_used = ", overall_time_used)
        sys.stdout.flush()

        print("ALL DONE!!!")
        return
예제 #15
0
def main(args):
    """Run the selected white-box attack against the chosen dataset/model.

    Builds a victim model for MNIST / CIFAR / ImageNet, generates targeted
    attack pairs, dispatches on args['attack'] to one of the C&W, ADMM,
    EAD, or (I)FGM attack classes, runs the attack, and reports
    L1/L2/Linf statistics via l1_l2_li_computation().

    Args:
        args: dict of parsed command-line options. Keys read here:
            'dataset', 'attack', 'adversarial', 'temp', 'numimg',
            'batch_size', 'maxiter', 'conf', 'binary_steps',
            'iteration_steps', 'ro', 'abort_early', 'beta', 'retrain',
            'seed', 'seed_imagenet', 'target_number', 'train'.
    """
    #   gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    with tf.Session() as sess:
        # Dataset / victim-model selection.
        # NOTE(review): these are sequential `if`s, not `elif`s — an
        # unrecognized dataset name leaves data/model/handpick/inception
        # undefined and raises NameError below; confirm intended.
        if args['dataset'] == 'mnist':
            data, model = MNIST(), MadryMNISTModel("models/secret/", sess)
            handpick = False
            inception = False
        if args['dataset'] == "cifar":
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            #data, model = CIFAR(), MadryCIFARModel("models/model_0/", sess)
            handpick = True
            inception = False
        if args['dataset'] == "imagenet":
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(
                sess, False)
            handpick = True
            inception = True

        # Optional override: adversarially-trained model.
        # NOTE(review): this loads an MNIST model regardless of
        # args['dataset'] — verify that is intentional.
        if args['adversarial'] != "none":
            model = MNISTModel(
                "models/mnist_cwl2_admm" + str(args['adversarial']), sess)

        # Optional override: defensively-distilled models at temperature T.
        if args['temp'] and args['dataset'] == 'mnist':
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']),
                               sess)
        if args['temp'] and args['dataset'] == 'cifar':
            model = MadryCIFARModel(
                "models/cifar-distilled-" + str(args['temp']), sess)

        # Build the targeted (input, target-label) attack set.
        inputs, targets, labels, true_ids = generate_data(
            data,
            model,
            samples=args['numimg'],
            targeted=True,
            target_num=args['target_number'],
            start=0,
            inception=inception,
            handpick=handpick,
            seed=args['seed'])

        #print(true_ids)
        # Attack dispatch — exactly one branch should match args['attack'];
        # an unknown value leaves `attack` undefined (NameError below).
        if args['attack'] == 'L2C':
            # Carlini & Wagner L2 attack.
            attack = CarliniL2(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['binary_steps'],
                               abort_early=args['abort_early'])
        if args['attack'] == 'LiCW':
            # Carlini & Wagner L-infinity attack.
            attack = CarliniLi(sess,
                               model,
                               max_iterations=args['maxiter'],
                               abort_early=args['abort_early'])

        if args['attack'] == 'L2A':
            # ADMM-based L2 attack.
            attack = ADMML2(sess,
                            model,
                            batch_size=args['batch_size'],
                            max_iterations=args['maxiter'],
                            confidence=args['conf'],
                            binary_search_steps=args['iteration_steps'],
                            ro=args['ro'],
                            abort_early=args['abort_early'])

        if args['attack'] == 'L2AE':
            # ADMM elastic-net variant.
            attack = ADMML2en(sess,
                              model,
                              batch_size=args['batch_size'],
                              max_iterations=args['maxiter'],
                              confidence=args['conf'],
                              binary_search_steps=args['binary_steps'],
                              ro=args['ro'],
                              iteration_steps=args['iteration_steps'],
                              abort_early=args['abort_early'])

        if args['attack'] == 'L2LA':
            # Linearized ADMM L2 attack.
            attack = LADMML2(sess,
                             model,
                             batch_size=args['batch_size'],
                             max_iterations=args['maxiter'],
                             confidence=args['conf'],
                             binary_search_steps=args['iteration_steps'],
                             ro=args['ro'],
                             abort_early=args['abort_early'])
        if args['attack'] == 'L2LAST':
            # Linearized ADMM with structured sparsity (optional retrain pass).
            attack = LADMMSTL2(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['iteration_steps'],
                               ro=args['ro'],
                               abort_early=args['abort_early'],
                               retrain=args['retrain'])

        # Fast-gradient baselines under different norms.
        if args['attack'] == 'LiIF':
            attack = IFGM(sess,
                          model,
                          batch_size=args['batch_size'],
                          ord=np.inf,
                          inception=inception)
        if args['attack'] == 'LiF':
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=np.inf,
                         inception=inception)

        if args['attack'] == 'L1':
            # Elastic-net attack, L1-oriented decision rule.
            attack = EADL1(sess,
                           model,
                           batch_size=args['batch_size'],
                           max_iterations=args['maxiter'],
                           confidence=args['conf'],
                           binary_search_steps=args['binary_steps'],
                           beta=args['beta'],
                           abort_early=args['abort_early'])

        if args['attack'] == 'L1EN':
            # Elastic-net attack, elastic-net decision rule.
            attack = EADEN(sess,
                           model,
                           batch_size=args['batch_size'],
                           max_iterations=args['maxiter'],
                           confidence=args['conf'],
                           binary_search_steps=args['binary_steps'],
                           beta=args['beta'],
                           abort_early=args['abort_early'])

        if args['attack'] == 'L1IFGM':
            attack = IFGM(sess,
                          model,
                          batch_size=args['batch_size'],
                          ord=1,
                          inception=inception)
        if args['attack'] == 'L2IFGM':
            attack = IFGM(sess,
                          model,
                          batch_size=args['batch_size'],
                          ord=2,
                          inception=inception)

        if args['attack'] == 'L1FGM':
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=1,
                         inception=inception)
        if args['attack'] == 'L2FGM':
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=2,
                         inception=inception)

        # Run the attack over the whole batch and time it.
        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()

        print("Took", timeend - timestart, "seconds to run", len(inputs),
              "samples.\n")

        # Optionally persist adversarial examples for adversarial training.
        if args['train']:
            np.save('labels_train.npy', labels)
            np.save(str(args['attack']) + '_train.npy', adv)

        #if (args['conf'] != 0):
        #    model = MNISTModel("models/mnist-distilled-100", sess)

        # Report per-norm distortion statistics.
        l1_l2_li_computation(args, data, model, adv, inception, inputs,
                             targets, labels, true_ids)
    # NOTE(review): the block below opens a second session and runs an
    # unrelated untargeted C&W L2 attack on CIFAR10 driven by sys.argv —
    # it looks like a separate script fragment fused into this function
    # (likely a copy/paste or scrape artifact); confirm it belongs here.
    with tf.Session() as sess:
        #data, model =  MNIST(), Classifier(sess)
        data = CIFAR10()
        
        # target model
        if sys.argv[1] == 'our':
            model = Classifier(input_shape=data.IMG_SHAPE, session=sess)
            model.restore('../Clf/models/cifar_classifier')
        elif sys.argv[1] == 'orgONLY':
            model = CIFARModel('models/cifar', sess)
        elif sys.argv[1] == 'orgDIS':
            model = CIFARModel('models/cifar-distilled-100', sess)
        else:
            print('Wrong Parameters')
            sys.exit()

        # init attack
        attack = CarliniL2(sess, model, targeted=False, max_iterations=1000, confidence=10, boxmin=0, boxmax=1)

        #inputs, targets = generate_data(data, samples=128, targeted=False, start=0, inception=False)
        inputs = data.X_test[:128]
        targets = data.y_test[:128]

        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()
        
        print("Took",timeend-timestart,"seconds to run",len(inputs),"samples.")

        # Persist the adversarial batch under the caller-supplied tag.
        np.save(('results/%s.npy' % sys.argv[2]), adv)
예제 #17
0
def main(args):
    """Run a C&W L2 or score-based ZO-ADMM (LADMMBB) attack and report stats.

    Selects a victim model for MNIST / CIFAR / ImageNet, generates
    (optionally targeted) attack pairs, runs the chosen attack, and
    reports distortion plus query-count statistics.

    Args:
        args: dict of parsed command-line options. Keys read here:
            'dataset', 'attack', 'adversarial', 'temp', 'targeted',
            'numimg', 'batch_size', 'maxiter', 'conf', 'binary_steps',
            'iteration_steps', 'ro', 'abort_early', 'gama', 'epi',
            'alpha', 'seed', 'seed_imagenet', 'train'.
    """
    #   gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    with tf.Session() as sess:
        # Dataset / victim-model selection.
        # NOTE(review): sequential `if`s — an unknown dataset leaves
        # data/model/handpick/inception undefined (NameError below).
        if args['dataset'] == 'mnist':
            data, model = MNIST(), MNISTModel("models/mnist", sess)
            handpick = False
            inception = False
        if args['dataset'] == "cifar":
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            handpick = True
            inception = False
        if args['dataset'] == "imagenet":
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(sess)
            handpick = True
            inception = True

        # Optional override: adversarially-trained MNIST model.
        # NOTE(review): applied regardless of args['dataset'] — confirm.
        if args['adversarial'] != "none":
            model = MNISTModel(
                "models/mnist_cwl2_admm" + str(args['adversarial']), sess)

        # Optional override: defensively-distilled models at temperature T.
        if args['temp'] and args['dataset'] == 'mnist':
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']),
                               sess)
        if args['temp'] and args['dataset'] == 'cifar':
            model = CIFARModel("models/cifar-distilled-" + str(args['temp']),
                               sess)

        # Build the attack set; targeted-ness comes from the CLI flag.
        inputs, targets, labels, true_ids = generate_data(
            data,
            model,
            samples=args['numimg'],
            targeted=args['targeted'],
            start=0,
            inception=inception,
            handpick=handpick,
            seed=args['seed'])

        #print(true_ids)
        if args['attack'] == 'L2C':
            # White-box Carlini & Wagner L2 attack.
            attack = CarliniL2(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               targeted=args['targeted'],
                               binary_search_steps=args['binary_steps'],
                               abort_early=args['abort_early'])

        if args['attack'] == 'L2BB':
            # score-based ZO-ADMM attack
            attack = LADMMBB(sess,
                             model,
                             batch_size=args['batch_size'],
                             max_iterations=args['maxiter'],
                             targeted=args['targeted'],
                             confidence=args['conf'],
                             binary_search_steps=args['iteration_steps'],
                             ro=args['ro'],
                             abort_early=args['abort_early'],
                             gama=args['gama'],
                             epi=args['epi'],
                             alpha=args['alpha'])

        timestart = time.time()
        #    adv = attack.attack(inputs, targets)
        # attack() here also returns per-example query counts and the L2
        # at which the query budget was reached.
        adv, querycount, queryl2 = attack.attack(inputs, targets)
        timeend = time.time()
        print("Took", timeend - timestart, "seconds to run", len(inputs),
              "samples.\n")

        # Optionally persist adversarial examples for adversarial training.
        if args['train']:
            np.save('labels_train.npy', labels)
            np.save(str(args['attack']) + '_train.npy', adv)

        # NOTE(review): nonzero confidence swaps in the distilled MNIST
        # model before evaluation (transferability check?) — confirm.
        if (args['conf'] != 0):
            model = MNISTModel("models/mnist-distilled-100", sess)

        # Report statistics; targeted runs get the fuller L1/L2/Linf report.
        if args['targeted']:
            l1_l2_li_computation(args, data, model, adv, inception, inputs,
                                 targets, labels, true_ids, querycount,
                                 queryl2)
        else:
            l2_computation(args, data, model, adv, inception, inputs, targets,
                           labels, true_ids, querycount, queryl2)
예제 #18
0
def main(args):
    """Run the white-box C&W L2 or black-box (ZOO-style) L2 attack per image.

    Iterates over test images one at a time, skips misclassified ones,
    attacks each, measures L2 distortion, saves original/adversarial/diff
    images, and appends a per-image stats line to <save>/report.txt.

    Args:
        args: dict of parsed command-line options. Keys read here:
            'dataset', 'attack', 'use_zvalue', 'numimg', 'firstimg',
            'maxiter', 'print_every', 'early_stop_iters', 'lr',
            'init_const', 'binary_steps', 'untargeted', 'adam_beta1',
            'adam_beta2', 'use_tanh', 'use_resize', 'reset_adam',
            'solver', 'save_ckpts', 'load_ckpt', 'start_iter',
            'init_size', 'uniform', 'seed', 'save'.
    """

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        # use_log: attack on log-probabilities rather than raw z-values.
        use_log = not args['use_zvalue']
        is_inception = args['dataset'] == "imagenet"
        # load network
        print('Loading model', args['dataset'])
        if args['dataset'] == "mnist":
            data, model = MNIST(), MNISTModel("models/mnist", sess, use_log)
            # data, model =  MNIST(), MNISTModel("models/mnist-distilled-100", sess, use_log)
        elif args['dataset'] == "cifar10":
            data, model = CIFAR(), CIFARModel("models/cifar", sess, use_log)
            # data, model = CIFAR(), CIFARModel("models/cifar-distilled-100", sess, use_log)
        elif args['dataset'] == "imagenet":
            data, model = ImageNet(), InceptionModel(sess, use_log)
        print('Done...')
        # numimg == 0 means "use every test image from firstimg onward".
        if args['numimg'] == 0:
            args['numimg'] = len(data.test_labels) - args['firstimg']
        print('Using', args['numimg'], 'test images')
        # load attack module
        if args['attack'] == "white":
            # batch size 1, optimize on 1 image at a time, rather than optimizing images jointly
            attack = CarliniL2(sess,
                               model,
                               batch_size=1,
                               max_iterations=args['maxiter'],
                               print_every=args['print_every'],
                               early_stop_iters=args['early_stop_iters'],
                               confidence=0,
                               learning_rate=args['lr'],
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               targeted=not args['untargeted'],
                               use_log=use_log,
                               adam_beta1=args['adam_beta1'],
                               adam_beta2=args['adam_beta2'])
        else:
            # batch size 128, optimize on 128 coordinates of a single image
            attack = BlackBoxL2(sess,
                                model,
                                batch_size=128,
                                max_iterations=args['maxiter'],
                                print_every=args['print_every'],
                                early_stop_iters=args['early_stop_iters'],
                                confidence=0,
                                learning_rate=args['lr'],
                                initial_const=args['init_const'],
                                binary_search_steps=args['binary_steps'],
                                targeted=not args['untargeted'],
                                use_log=use_log,
                                use_tanh=args['use_tanh'],
                                use_resize=args['use_resize'],
                                adam_beta1=args['adam_beta1'],
                                adam_beta2=args['adam_beta2'],
                                reset_adam_after_found=args['reset_adam'],
                                solver=args['solver'],
                                save_ckpts=args['save_ckpts'],
                                load_checkpoint=args['load_ckpt'],
                                start_iter=args['start_iter'],
                                init_size=args['init_size'],
                                use_importance=not args['uniform'])

        # Seed both RNGs so example selection is reproducible.
        random.seed(args['seed'])
        np.random.seed(args['seed'])
        print('Generate data')
        all_inputs, all_targets, all_labels, all_true_ids = generate_data(
            data,
            samples=args['numimg'],
            targeted=not args['untargeted'],
            start=args['firstimg'],
            inception=is_inception)
        print('Done...')
        os.system("mkdir -p {}/{}".format(args['save'], args['dataset']))
        img_no = 0          # count of images actually attacked
        total_success = 0   # count of successful attacks
        l2_total = 0.0      # sum of L2 distortions over successes
        for i in range(all_true_ids.size):
            # Slice with i:i+1 to keep the leading batch dimension.
            inputs = all_inputs[i:i + 1]
            targets = all_targets[i:i + 1]
            labels = all_labels[i:i + 1]
            print("true labels:", np.argmax(labels), labels)
            print("target:", np.argmax(targets), targets)
            # test if the image is correctly classified
            original_predict = model.model.predict(inputs)
            original_predict = np.squeeze(original_predict)
            original_prob = np.sort(original_predict)
            original_class = np.argsort(original_predict)
            print("original probabilities:", original_prob[-1:-6:-1])
            print("original classification:", original_class[-1:-6:-1])
            print("original probabilities (most unlikely):", original_prob[:6])
            print("original classification (most unlikely):",
                  original_class[:6])
            if original_class[-1] != np.argmax(labels):
                print(
                    "skip wrongly classified image no. {}, original class {}, classified as {}"
                    .format(i, np.argmax(labels), original_class[-1]))
                continue

            img_no += 1
            timestart = time.time()
            adv, const = attack.attack_batch(inputs, targets)
            if type(const) is list:
                const = const[0]
            if len(adv.shape) == 3:
                # Restore the batch dimension dropped by some attacks.
                adv = adv.reshape((1, ) + adv.shape)
            timeend = time.time()
            l2_distortion = np.sum((adv - inputs)**2)**.5
            adversarial_predict = model.model.predict(adv)
            adversarial_predict = np.squeeze(adversarial_predict)
            adversarial_prob = np.sort(adversarial_predict)
            adversarial_class = np.argsort(adversarial_predict)
            print("adversarial probabilities:", adversarial_prob[-1:-6:-1])
            print("adversarial classification:", adversarial_class[-1:-6:-1])
            success = False
            if args['untargeted']:
                if adversarial_class[-1] != original_class[-1]:
                    success = True
            else:
                if adversarial_class[-1] == np.argmax(targets):
                    success = True
            # NOTE(review): distortions above 20.0 are counted as failures —
            # presumably a perceptibility cutoff; confirm the threshold.
            if l2_distortion > 20.0:
                success = False
            if success:
                total_success += 1
                l2_total += l2_distortion
            suffix = "id{}_seq{}_prev{}_adv{}_{}_dist{}".format(
                all_true_ids[i], i, original_class[-1], adversarial_class[-1],
                success, l2_distortion)
            print("Saving to", suffix)
            # Save original, adversarial, and perturbation images.
            show(
                inputs,
                "{}/{}/{}_original_{}.png".format(args['save'],
                                                  args['dataset'], img_no,
                                                  suffix))
            show(
                adv,
                "{}/{}/{}_adversarial_{}.png".format(args['save'],
                                                     args['dataset'], img_no,
                                                     suffix))
            show(
                adv - inputs,
                "{}/{}/{}_diff_{}.png".format(args['save'], args['dataset'],
                                              img_no, suffix))
            print(
                "[STATS][L1] total = {}, seq = {}, id = {}, time = {:.3f}, success = {}, const = {:.6f}, prev_class = {}, new_class = {}, distortion = {:.5f}, success_rate = {:.3f}, l2_avg = {:.5f}"
                .format(img_no, i, all_true_ids[i], timeend - timestart,
                        success, const, original_class[-1],
                        adversarial_class[-1], l2_distortion,
                        total_success / float(img_no),
                        0 if total_success == 0 else l2_total / total_success))

            # Append the same stats line to a persistent report file.
            with open(args['save'] + "/report.txt", 'a') as f:
                f.write("*" * 20)
                to_write = "[STATS][L1] total = {}, seq = {}, id = {}, time = {:.3f}, success = {}, const = {:.6f}, prev_class = {}, new_class = {}, distortion = {:.5f}, success_rate = {:.3f}, l2_avg = {:.5f}".format(
                    img_no, i, all_true_ids[i], timeend - timestart, success,
                    const, original_class[-1], adversarial_class[-1],
                    l2_distortion, total_success / float(img_no),
                    0 if total_success == 0 else l2_total / total_success)
                f.write(to_write)
                f.write("*" * 20)
                f.write("\n\n")

            sys.stdout.flush()
예제 #19
0
def model_setup_carlini(rd, model_dict, X_train, y_train, X_test, y_test, X_val,
                        y_val, mean, ax=None, layer=None):
    """
    Main function to set up network (create, load, test, save).

    Optionally applies dimensionality reduction (rd), builds/loads a
    Lasagne network, mirrors it as a Keras model, runs the untargeted
    Carlini-Wagner L2 attack on the test set, and writes sorted
    adversarial distortions to an output file.

    Args:
        rd: target reduced dimension, or None to skip dimensionality
            reduction.
        model_dict: project config dict ('rev', 'dim_red', ... keys).
        X_train, y_train, X_test, y_test, X_val, y_val: dataset splits.
        mean: per-pixel training mean (X_test is mean-subtracted).
        ax: optional matplotlib axis for the (commented-out) histogram.
        layer: optional layer spec forwarded to model_creator/trainer.
    """

    rev = model_dict['rev']
    dim_red = model_dict['dim_red']
    if rd is not None:  # fix: identity comparison, not `!= None`
        # Doing dimensionality reduction on dataset
        print("Doing {} with rd={} over the training data".format(dim_red, rd))
        _, _, _, dr_alg = dr_wrapper(X_train, X_test, dim_red, rd, y_train, rev,
                                     X_val)
    else:
        dr_alg = None

    # Getting data parameters after dimensionality reduction
    data_dict = get_data_shape(X_train, X_test, X_val)
    no_of_dim = data_dict['no_of_dim']

    # Prepare Theano variables for inputs and targets.
    if no_of_dim == 2:
        # fix: T.tensor('inputs') is not a valid 2-D variable constructor
        # (theano.tensor.tensor expects dtype/broadcastable); the 2-D
        # counterpart of tensor3/tensor4 is T.matrix.
        input_var = T.matrix('inputs')
    elif no_of_dim == 3:
        input_var = T.tensor3('inputs')
    elif no_of_dim == 4:
        input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Check if model already exists
    if layer is not None:
        network, model_exist_flag, layers = model_creator(model_dict, data_dict,
                                            input_var, target_var, rd, layer)
    else:
        network, model_exist_flag = model_creator(model_dict, data_dict,
                                    input_var, target_var, rd, layer)

    # Defining symbolic variable for network output
    prediction = lasagne.layers.get_output(network)
    # Defining symbolic variable for network parameters
    params = lasagne.layers.get_all_params(network, trainable=True)
    # Defining symbolic variable for network output with dropout disabled
    test_prediction = lasagne.layers.get_output(network, deterministic=True)

    # Building or loading model depending on existence
    if model_exist_flag == 1:
        # Load the correct model:
        param_values = model_loader(model_dict, rd)
        #lasagne.layers.set_all_param_values(network, param_values)

        # Create a Keras mirror of the Lasagne MLP (the C&W attack code
        # expects a Keras model with .predict()).
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Activation, Flatten
        from keras.layers import Convolution2D, MaxPooling2D

        model = Sequential()
        if rd is not None:
            # First layer is the (bias-free) linear DR projection 784 -> rd.
            model.add(Dense(rd, activation=None,
                            input_shape=(784,), use_bias=False))
            model.add(Dense(100, activation='sigmoid'))
        else:
            model.add(Dense(100, activation='sigmoid', input_shape=(784,)))
        model.add(Dense(100, activation='sigmoid'))
        model.add(Dense(10, activation=None))

        if rd is not None:
            # Prepend the DR projection matrix so gradients flow through it.
            A = gradient_transform(model_dict, dr_alg)
            param_values = [A.T] + param_values

        # model.set_weights(param_values)
        # m_path = './keras/' + get_model_name(model_dict, rd)
        # model.save(m_path)
        # model.load_weights(m_path)

        # One-hot encode labels for the attack's target representation.
        y_onehot = np.zeros((len(y_test), 10))
        y_onehot[np.arange(len(y_test)), y_test] = 1
        # X_test was mean-subtracted before, now we add the mean back
        X_test_mean = (X_test + mean - 0.5).reshape(-1, 784)
        data = (X_test_mean, y_onehot)
        mean_flat = mean.reshape(-1, 784)

        # l2-Carlini Attack
        import tensorflow as tf
        import time
        from l2_attack import CarliniL2

        with tf.Session() as sess:
            attack = CarliniL2(sess, model, mean_flat, batch_size=10,
                               max_iterations=1000, confidence=0, targeted=False)

            inputs, targets = generate_data(data, samples=10000, targeted=False,
                                            start=0, inception=False)
            timestart = time.time()
            adv = attack.attack(inputs, targets, param_values)
            timeend = time.time()

            print("Took", timeend - timestart,
                  "seconds to run", len(inputs), "samples.")

            # Resolve absolute path to output directory
            abs_path_o = resolve_path_o(model_dict)

            fname = 'carlini_l2'
            fname += '_' + get_model_name(model_dict)

            if rd is not None:
                fname += '_' + model_dict['dim_red'] + str(rd)

            plotfile = open(abs_path_o + fname + '.txt', 'a')
            plotfile.write('\\\small{' + str(rd) + '}\n')

            # Collect L2 distortions only for originally-correct predictions.
            dists = []
            pred = model.predict(inputs + 0.5 - mean_flat)
            for i in range(len(adv)):
                dist = np.linalg.norm((adv[i] + mean_flat) - (inputs[i] + 0.5))
                if np.argmax(pred[i]) == y_test[i]:
                    dists.append(dist)
                if i < 50:
                    # Save original test and adversarial images
                    x_adv = (adv[i] + mean_flat).reshape((28, 28))
                    orig = (inputs[i] + 0.5).reshape((28, 28))
                    img.imsave('./carlini_images/{}_adv.png'.format(i),
                               x_adv * 255, vmin=0, vmax=255, cmap='gray')
                    img.imsave('./carlini_images/{}_orig.png'.format(i),
                               orig * 255, vmin=0, vmax=255, cmap='gray')

            # Test overall accuracy of the model
            pred = model.predict(inputs + 0.5 - mean_flat)
            correct = 0
            for i in range(pred.shape[0]):
                if np.argmax(pred[i]) == y_test[i]:
                    correct += 1
            print('Overall accuracy on test images: ',
                  correct / float(pred.shape[0]))

            pred = model.predict(adv)
            correct = 0
            for i in range(pred.shape[0]):
                if np.argmax(pred[i]) == y_test[i]:
                    correct += 1
            print('Overall accuracy on adversarial images: ',
                  correct / float(pred.shape[0]))

            dists_sorted = sorted(dists)

            for i in range(len(dists)):
                plotfile.write('{} {} \n'.format(i, dists_sorted[i]))

            # Plot histogram
            # import matplotlib.pyplot as plt
            # dists = np.array(dists)
            # ax.hist(dists, 50, normed=1, histtype='step', cumulative=True,label=str(rd))

    elif model_exist_flag == 0:
        # Launch the training loop.
        print("Starting training...")
        if layer is not None:
            model_trainer(input_var, target_var, prediction, test_prediction,
                          params, model_dict, X_train, y_train, X_val, y_val,
                          network, layers)
        else:
            model_trainer(input_var, target_var, prediction, test_prediction,
                          params, model_dict, X_train, y_train, X_val, y_val,
                          network)
        model_saver(network, model_dict, rd)
예제 #20
0
    inputs = np.array(inputs)
    targets = np.array(targets)

    return inputs, targets


if __name__ == "__main__":

    count = 0
    distortion = []
    testCount = 20
    for i in range(testCount):
        with tf.Session() as sess:
            data, model = RNN(), RNNModel("models\imdb_model.h5", sess)  #MNIST(), MNISTModel("models/mnist", sess)
            attack = CarliniL2(sess, model, batch_size=1, max_iterations=1000, confidence=0, targeted=False)
            inputs, targets = generate_data(data, samples=1, targeted=False,
                                                  start=0, inception=False)

            timestart = time.time()
            adv = attack.attack(inputs, targets)
            timeend = time.time()

            for i in range(len(adv)):
                # print("Valid:")
                # input = inputs[i]
                # input = np.reshape(input, (input.shape[0], -1))
                # print(inputs)
                # print("Adversarial:")
                # attack_input = adv[i]
                # attack_input = np.reshape(attack_input, (attack_input.shape[0], -1))
def main(args):
    """Benchmark an adversarial attack and report per-case distortion stats.

    Loads the dataset/model named in ``args`` (mnist, cifar or imagenet —
    optionally an adversarially-trained or distilled variant), generates
    attack inputs/targets, runs the selected attack (C&W L2, EAD L1/EN, or
    (I)FGM under the Linf/L1/L2 norms) or reloads previously saved
    adversarial examples, then prints best-/average-/worst-case success
    rates and L1/L2/Linf distortions, optionally against a second
    (transfer-target) model, and optionally writes example PNGs.

    Args:
        args: dict of command-line options — expected keys include
            'dataset', 'attack', 'batch_size', 'maxiter', 'conf',
            'init_const', 'binary_steps', 'untargeted', 'beta',
            'abort_early', 'eps', 'targetnum', 'numimg', 'train', 'seed',
            'seed_imagenet', 'restore_np', 'save_np', 'targetmodel',
            'show', 'save', 'adversarial', 'temp'.
    """
    with tf.Session() as sess:
        # ------- Dataset and source-model selection -----------------------
        if (args['dataset'] == 'mnist'):
            data = MNIST()
            inception = False
            if (args['adversarial'] != "none"):
                model = MNISTModel(
                    "models/mnist_cw" + str(args['adversarial']), sess)
            elif (args['temp']):
                model = MNISTModel(
                    "models/mnist-distilled-" + str(args['temp']), sess)
            else:
                model = MNISTModel("models/mnist", sess)
        if (args['dataset'] == "cifar"):
            data = CIFAR()
            inception = False
            if (args['adversarial'] != "none"):
                model = CIFARModel(
                    "models/cifar_cw" + str(args['adversarial']), sess)
            elif (args['temp']):
                model = CIFARModel(
                    "models/cifar-distilled-" + str(args['temp']), sess)
            else:
                model = CIFARModel("models/cifar", sess)
        if (args['dataset'] == "imagenet"):
            data, model = ImageNet(args['seed_imagenet'],
                                   2 * args['numimg']), InceptionModel(sess)
            inception = True

        # Build the attack batch.  NOTE(review): presumably each instance is
        # replicated once per target class ('targetnum') — verify against
        # generate_data, which is defined elsewhere in this file.
        inputs, targets, labels, true_ids = generate_data(
            data,
            model,
            samples=args['numimg'],
            targeted=not args['untargeted'],
            target_num=args['targetnum'],
            inception=inception,
            train=args['train'],
            seed=args['seed'])
        timestart = time.time()
        # Either reload previously computed adversarial examples from .npy ...
        if (args['restore_np']):
            if (args['train']):
                adv = np.load(
                    str(args['dataset']) + '_' + str(args['attack']) +
                    '_train.npy')
            else:
                adv = np.load(
                    str(args['dataset']) + '_' + str(args['attack']) + '.npy')
        # ... or run the attack selected by args['attack'].
        else:
            if (args['attack'] == 'L2'):
                attack = CarliniL2(sess,
                                   model,
                                   batch_size=args['batch_size'],
                                   max_iterations=args['maxiter'],
                                   confidence=args['conf'],
                                   initial_const=args['init_const'],
                                   binary_search_steps=args['binary_steps'],
                                   targeted=not args['untargeted'],
                                   beta=args['beta'],
                                   abort_early=args['abort_early'])
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'L1'):
                attack = EADL1(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               targeted=not args['untargeted'],
                               beta=args['beta'],
                               abort_early=args['abort_early'])
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'EN'):
                attack = EADEN(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               targeted=not args['untargeted'],
                               beta=args['beta'],
                               abort_early=args['abort_early'])
                adv = attack.attack(inputs, targets)
            """If untargeted, pass labels instead of targets"""
            if (args['attack'] == 'FGSM'):
                attack = FGM(sess,
                             model,
                             batch_size=args['batch_size'],
                             ord=np.inf,
                             eps=args['eps'],
                             inception=inception)
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'FGML1'):
                attack = FGM(sess,
                             model,
                             batch_size=args['batch_size'],
                             ord=1,
                             eps=args['eps'],
                             inception=inception)
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'FGML2'):
                attack = FGM(sess,
                             model,
                             batch_size=args['batch_size'],
                             ord=2,
                             eps=args['eps'],
                             inception=inception)
                adv = attack.attack(inputs, targets)

            if (args['attack'] == 'IFGSM'):
                attack = IFGM(sess,
                              model,
                              batch_size=args['batch_size'],
                              ord=np.inf,
                              eps=args['eps'],
                              inception=inception)
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'IFGML1'):
                attack = IFGM(sess,
                              model,
                              batch_size=args['batch_size'],
                              ord=1,
                              eps=args['eps'],
                              inception=inception)
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'IFGML2'):
                attack = IFGM(sess,
                              model,
                              batch_size=args['batch_size'],
                              ord=2,
                              eps=args['eps'],
                              inception=inception)
                adv = attack.attack(inputs, targets)

        timeend = time.time()

        # Untargeted runs have a single implicit target per instance; targeted
        # runs replicate each instance once per target class.
        if args['untargeted']:
            num_targets = 1
        else:
            num_targets = args['targetnum']
        print("Took", timeend - timestart, "seconds to run",
              len(inputs) / num_targets, "random instances.")

        # Optionally persist the adversarial examples (and, for training
        # splits, the true labels) as .npy files.
        if (args['save_np']):
            if (args['train']):
                np.save(str(args['dataset']) + '_labels_train.npy', labels)
                np.save(
                    str(args['dataset']) + '_' + str(args['attack']) +
                    '_train.npy', adv)
            else:
                np.save(
                    str(args['dataset']) + '_' + str(args['attack'] + '.npy'),
                    adv)

        # NOTE(review): these trailing-underscore aggregate lists are declared
        # but never appended to or read anywhere below.
        r_best_ = []
        d_best_l1_ = []
        d_best_l2_ = []
        d_best_linf_ = []
        r_average_ = []
        d_average_l1_ = []
        d_average_l2_ = []
        d_average_linf_ = []
        r_worst_ = []
        d_worst_l1_ = []
        d_worst_l2_ = []
        d_worst_linf_ = []

        #Transferability Tests
        # model_[0] is the source model; an optional second entry is the
        # transfer-target model the same adversarial examples are evaluated on.
        model_ = []
        model_.append(model)
        if (args['targetmodel'] != "same"):
            if (args['targetmodel'] == "dd_100"):
                model_.append(MNISTModel("models/mnist-distilled-100", sess))
        num_models = len(model_)

        if (args['show']):
            if not os.path.exists(
                    str(args['save']) + "/" + str(args['dataset']) + "/" +
                    str(args['attack'])):
                os.makedirs(
                    str(args['save']) + "/" + str(args['dataset']) + "/" +
                    str(args['attack']))
        # NOTE(review): this loop rebinds `model`, shadowing the source model
        # chosen above; all statistics below refer to the current `model`.
        for m, model in enumerate(model_):
            r_best = []
            d_best_l1 = []
            d_best_l2 = []
            d_best_linf = []
            r_average = []
            d_average_l1 = []
            d_average_l2 = []
            d_average_linf = []
            r_worst = []
            d_worst_l1 = []
            d_worst_l2 = []
            d_worst_linf = []
            # Each stride of `num_targets` rows is one original instance.
            for i in range(0, len(inputs), num_targets):
                pred = []
                for j in range(i, i + num_targets):
                    if inception:
                        pred.append(
                            np.reshape(model.model.predict(adv[j:j + 1]),
                                       (data.test_labels[0:1].shape)))
                    else:
                        pred.append(model.model.predict(adv[j:j + 1]))

                # ---- Best case: among successful targets, keep the minimum
                # distortion per norm.  1e10 doubles as both the "no success
                # yet" distance and index sentinel.
                dist_l1 = 1e10
                dist_l1_index = 1e10
                dist_linf = 1e10
                dist_linf_index = 1e10
                dist_l2 = 1e10
                dist_l2_index = 1e10
                for k, j in enumerate(range(i, i + num_targets)):
                    success = False
                    if (args['untargeted']):
                        if (np.argmax(pred[k], 1) != np.argmax(
                                targets[j:j + 1], 1)):
                            success = True
                    else:
                        if (np.argmax(pred[k],
                                      1) == np.argmax(targets[j:j + 1], 1)):
                            success = True
                    if (success):
                        if (np.sum(np.abs(adv[j] - inputs[j])) < dist_l1):
                            dist_l1 = np.sum(np.abs(adv[j] - inputs[j]))
                            dist_l1_index = j
                        if (np.amax(np.abs(adv[j] - inputs[j])) < dist_linf):
                            dist_linf = np.amax(np.abs(adv[j] - inputs[j]))
                            dist_linf_index = j
                        if ((np.sum((adv[j] - inputs[j])**2)**.5) < dist_l2):
                            dist_l2 = (np.sum((adv[j] - inputs[j])**2)**.5)
                            dist_l2_index = j
                # Gating on dist_l1_index alone is safe: any success sets all
                # three indices in the same iteration.
                if (dist_l1_index != 1e10):
                    d_best_l2.append((np.sum(
                        (adv[dist_l2_index] - inputs[dist_l2_index])**2)**.5))
                    d_best_l1.append(
                        np.sum(
                            np.abs(adv[dist_l1_index] -
                                   inputs[dist_l1_index])))
                    d_best_linf.append(
                        np.amax(
                            np.abs(adv[dist_linf_index] -
                                   inputs[dist_linf_index])))
                    r_best.append(1)
                else:
                    r_best.append(0)

                # ---- Average case: sample one random target for this
                # instance and record its success/distortion.
                rand_int = np.random.randint(i, i + num_targets)
                if inception:
                    pred_r = np.reshape(
                        model.model.predict(adv[rand_int:rand_int + 1]),
                        (data.test_labels[0:1].shape))
                else:
                    pred_r = model.model.predict(adv[rand_int:rand_int + 1])
                success_average = False
                if (args['untargeted']):
                    if (np.argmax(pred_r, 1) != np.argmax(
                            targets[rand_int:rand_int + 1], 1)):
                        success_average = True
                else:
                    if (np.argmax(pred_r, 1) == np.argmax(
                            targets[rand_int:rand_int + 1], 1)):
                        success_average = True
                if success_average:
                    r_average.append(1)
                    d_average_l2.append(
                        np.sum((adv[rand_int] - inputs[rand_int])**2)**.5)
                    d_average_l1.append(
                        np.sum(np.abs(adv[rand_int] - inputs[rand_int])))
                    d_average_linf.append(
                        np.amax(np.abs(adv[rand_int] - inputs[rand_int])))

                else:
                    r_average.append(0)

                # ---- Worst case: counts as success only if ALL targets
                # succeeded; records the maximum distortion per norm.  A single
                # failure aborts the scan and resets the index sentinels.
                dist_l1 = 0
                dist_l1_index = 1e10
                dist_linf = 0
                dist_linf_index = 1e10
                dist_l2 = 0
                dist_l2_index = 1e10
                for k, j in enumerate(range(i, i + num_targets)):
                    failure = True
                    if (args['untargeted']):
                        if (np.argmax(pred[k], 1) != np.argmax(
                                targets[j:j + 1], 1)):
                            failure = False
                    else:
                        if (np.argmax(pred[k],
                                      1) == np.argmax(targets[j:j + 1], 1)):
                            failure = False
                    if failure:
                        r_worst.append(0)
                        dist_l1_index = 1e10
                        dist_l2_index = 1e10
                        dist_linf_index = 1e10
                        break
                    else:
                        if (np.sum(np.abs(adv[j] - inputs[j])) > dist_l1):
                            dist_l1 = np.sum(np.abs(adv[j] - inputs[j]))
                            dist_l1_index = j
                        if (np.amax(np.abs(adv[j] - inputs[j])) > dist_linf):
                            dist_linf = np.amax(np.abs(adv[j] - inputs[j]))
                            dist_linf_index = j
                        if ((np.sum((adv[j] - inputs[j])**2)**.5) > dist_l2):
                            dist_l2 = (np.sum((adv[j] - inputs[j])**2)**.5)
                            dist_l2_index = j
                if (dist_l1_index != 1e10):
                    d_worst_l2.append((np.sum(
                        (adv[dist_l2_index] - inputs[dist_l2_index])**2)**.5))
                    d_worst_l1.append(
                        np.sum(
                            np.abs(adv[dist_l1_index] -
                                   inputs[dist_l1_index])))
                    d_worst_linf.append(
                        np.amax(
                            np.abs(adv[dist_linf_index] -
                                   inputs[dist_linf_index])))
                    r_worst.append(1)

                # ---- Optionally dump original/adversarial image pairs for
                # the last (transfer-target) model only.
                if (args['show'] and m == (num_models - 1)):
                    for j in range(i, i + num_targets):
                        target_id = np.argmax(targets[j:j + 1], 1)
                        label_id = np.argmax(labels[j:j + 1], 1)
                        prev_id = np.argmax(
                            np.reshape(model.model.predict(inputs[j:j + 1]),
                                       (data.test_labels[0:1].shape)), 1)
                        adv_id = np.argmax(
                            np.reshape(model.model.predict(adv[j:j + 1]),
                                       (data.test_labels[0:1].shape)), 1)
                        suffix = "id{}_seq{}_lbl{}_prev{}_adv{}_{}_l1_{:.3f}_l2_{:.3f}_linf_{:.3f}".format(
                            true_ids[i], target_id, label_id, prev_id, adv_id,
                            adv_id == target_id,
                            np.sum(np.abs(adv[j] - inputs[j])),
                            np.sum((adv[j] - inputs[j])**2)**.5,
                            np.amax(np.abs(adv[j] - inputs[j])))

                        show(
                            inputs[j:j + 1],
                            str(args['save']) + "/" + str(args['dataset']) +
                            "/" + str(args['attack']) +
                            "/original_{}.png".format(suffix))
                        show(
                            adv[j:j + 1],
                            str(args['save']) + "/" + str(args['dataset']) +
                            "/" + str(args['attack']) +
                            "/adversarial_{}.png".format(suffix))
            # Label the printout: intermediate models are sources ("Src_"),
            # the final model in model_ is the target ("Tgt_").
            if (m != (num_models - 1)):
                lbl = "Src_"
                if (num_models > 2):
                    lbl += str(m) + "_"
            else:
                lbl = "Tgt_"
            if (num_targets > 1):
                print(lbl + 'best_case_L1_mean', np.mean(d_best_l1))
                print(lbl + 'best_case_L2_mean', np.mean(d_best_l2))
                print(lbl + 'best_case_Linf_mean', np.mean(d_best_linf))
                print(lbl + 'best_case_prob', np.mean(r_best))
                print(lbl + 'average_case_L1_mean', np.mean(d_average_l1))
                print(lbl + 'average_case_L2_mean', np.mean(d_average_l2))
                print(lbl + 'average_case_Linf_mean', np.mean(d_average_linf))
                print(lbl + 'average_case_prob', np.mean(r_average))
                print(lbl + 'worst_case_L1_mean', np.mean(d_worst_l1))
                print(lbl + 'worst_case_L2_mean', np.mean(d_worst_l2))
                print(lbl + 'worst_case_Linf_mean', np.mean(d_worst_linf))
                print(lbl + 'worst_case_prob', np.mean(r_worst))
            else:
                # With a single target per instance the three cases coincide;
                # report the average-case numbers only.
                print(lbl + 'L1_mean', np.mean(d_average_l1))
                print(lbl + 'L2_mean', np.mean(d_average_l2))
                print(lbl + 'Linf_mean', np.mean(d_average_linf))
                print(lbl + 'success_prob', np.mean(r_average))
예제 #22
0
def find_adv(sess, face, face_stack_self, face_stack_target, FRmodel, file_name=None, margin=0, hinge_loss=True, model='triplet'):
    """Find a minimally-distorted adversarial face via binary search on the
    C&W trade-off constant.

    Runs up to 5 rounds of the Carlini-Wagner L2 attack.  After each round
    the constant's search interval is halved: on success the upper bound is
    lowered (seeking smaller distortion), on failure the lower bound is
    raised (weighting the adversarial loss more).

    Args:
        sess: TensorFlow session used by the attack and the recognizers.
        face: batch of input face images to perturb (first axis = batch).
        face_stack_self: gallery images of the true identity.
        face_stack_target: gallery images of the impersonation target.
        FRmodel: face-recognition model under attack.
        file_name: if given, the best result is written there via np.savez.
        margin: required score margin dist[0] - dist[1] to count as success.
        hinge_loss: forwarded to CarliniL2.
        model: 'triplet' (inputs in [0, 1]) or 'center' (inputs in [-1, 1]).

    Returns:
        Tuple (best_adv, best_delta, best_l2, best_const).  If no round
        succeeded, best_adv/best_delta/best_const are None and best_l2 is
        the 9999.0 sentinel.

    Raises:
        ValueError: if `model` is neither 'triplet' nor 'center'.
    """
    # Fail fast: the original code left boxmin/boxmax (and later `dist`)
    # undefined for unknown model names, crashing with a NameError mid-loop.
    if model not in ('triplet', 'center'):
        raise ValueError("model must be 'triplet' or 'center', got %r" % (model,))

    # Pixel-value box depends on the model's input normalisation; it is
    # loop-invariant, so compute it once up front.
    if model == 'center':
        boxmin, boxmax = -1, 1
    else:  # 'triplet'
        boxmin, boxmax = 0, 1

    const_high = 10.0   # upper bound of the binary-search interval
    const_low = 0.05    # lower bound of the binary-search interval
    const = 0.3         # initial trade-off constant
    ever_success = False
    best_l2 = 9999.0    # sentinel kept for compatibility with saved .npz files
    best_adv = None
    best_delta = None
    best_const = None

    batch_size = face.shape[0]
    self_size = face_stack_self.shape[0]
    target_size = face_stack_target.shape[0]

    for ii in range(5):
        print("Search #", ii, "with constant", const)
        attack = CarliniL2(sess, FRmodel, batch_size=batch_size,
                    learning_rate=0.01, hinge_loss=hinge_loss, targeted=True,
                    self_db_size=self_size, target_batch_size=target_size,
                    initial_const=const, max_iterations=500, confidence=margin,
                    boxmin=boxmin, boxmax=boxmax)
        adv, delta = attack.attack(face, face_stack_target, face_stack_self)
        if model == 'triplet':
            dist = face_recog(adv, face_stack_self, face_stack_target, FRmodel, sess)
        else:  # 'center'
            dist = face_recog_center(adv, face_stack_self, face_stack_target, FRmodel, sess)
            print(dist)
        if dist[0] - dist[1] >= margin:
            # Success: keep this example if it has the smallest L2 so far,
            # then shrink the interval from above to try a smaller constant.
            print('Success with const', const)
            ever_success = True
            adv_l2 = np.linalg.norm(delta)
            if adv_l2 < best_l2:
                best_l2 = adv_l2
                best_adv = adv
                best_delta = delta
                best_const = const
            const_high = const
        else:
            # Failure: raise the lower bound so the next constant is larger.
            print('Failure with const', const)
            const_low = const
        const = (const_high + const_low) / 2
        # Stop early once a success exists and the interval has converged.
        if ever_success and const_high - const_low < 0.02:
            break

    if ever_success:
        print('Successfully found adv example')
    else:
        print('Failed to find adv example')

    if file_name:
        np.savez(file_name, face=face, adv=best_adv, delta=best_delta, l2=best_l2)

    return best_adv, best_delta, best_l2, best_const
예제 #23
0
# Output-file naming.  The base name encodes the run parameters in the
# order classification+samples+batch_size+start (e.g. 95s40bs9start10).
filename = '95s40bs9start10.pkl'
# Pickles derived from the base name: untargeted results and adversarial images.
utfile = f'ut_{filename}'
advname = f'adv_{filename}'

if __name__ == "__main__":
    with tf.Session() as sess:

        # Build the ImageNet pipeline and the Inception classifier, then the
        # targeted attack batch.  `samples`, `start`, `bs`, `mi` and
        # `confidence` are module-level run parameters defined earlier.
        data, model = ImageNet(), InceptionModel(sess)
        inputs, targets = generate_data(model, data, samples=samples, targeted=True,
                                        start=start, inception=True, batch_size = bs)

        # Target class indices (unused in this block; kept for interactive use).
        tar = np.argmax(targets, axis = 1)

        attack = CarliniL2(sess, model, batch_size=bs, max_iterations = mi, confidence=confidence)
#        attack = CarliniL0(sess, model, max_iterations = mi)
#        attack = CarliniLi(sess, model)

        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()

        advnp = np.array(adv).astype(np.uint8)

        # BUG FIX: the original wrote `f.close` without parentheses, so the
        # file was never explicitly closed; a context manager guarantees the
        # pickle is flushed and the handle released even on error.
        with open(advname, 'wb') as f:
            pickle.dump(advnp, f)

        print("Took",timeend-timestart,"seconds to run",len(inputs),"samples.")
예제 #24
0
def main(_):
    """Craft adversarial images against an image-captioning model.

    Builds two TensorFlow graphs — an inference graph used to decode/score
    captions and an attack graph wrapping the captioner for the C&W L2
    attack — then, for every image matching FLAGS.input_files, perturbs the
    image so the model emits the caption (exact attack) or the keywords
    (keyword attack) supplied in FLAGS.input_feed, reports the L2/L-inf
    distortion, and writes original/adversarial/diff PNGs.
    """
    tf.set_random_seed(1234)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    config = tf.ConfigProto(gpu_options=gpu_options)
    vocab = vocabulary.Vocabulary(FLAGS.vocab_file)

    # Build the inference graph and restore its checkpoint.
    inference_graph = tf.Graph()
    with inference_graph.as_default():
        inf_model = inference_wrapper.InferenceWrapper()
        inf_restore_fn = inf_model.build_graph_from_config(
            configuration.ModelConfig(), FLAGS.checkpoint_path)
        # inf_image_placeholder = tf.placeholder(dtype=tf.string, shape=[], name="inf_image_placeholder")
        # inf_preprocessor = inf_model.model.process_image(inf_image_placeholder)
    inference_graph.finalize()
    inf_sess = tf.Session(graph=inference_graph, config=config)
    # Load the model from checkpoint.
    inf_restore_fn(inf_sess)

    # Build the attack graph (separate from the inference graph).
    attack_graph = tf.Graph()
    with attack_graph.as_default():
        model = attack_wrapper.AttackWrapper()
        sess = tf.Session(config=config)
        # build the attacker graph
        attack = CarliniL2(sess,
                           inf_sess,
                           attack_graph,
                           inference_graph,
                           model,
                           inf_model,
                           targeted=FLAGS.targeted,
                           use_keywords=FLAGS.use_keywords,
                           use_logits=FLAGS.use_logits,
                           batch_size=1,
                           initial_const=1.0,
                           max_iterations=1000,
                           print_every=1,
                           confidence=2,
                           use_log=False,
                           norm=FLAGS.norm,
                           abort_early=False,
                           learning_rate=0.005)
        # compute graph for preprocessing
        image_placeholder = tf.placeholder(dtype=tf.string, shape=[])
        preprocessor = model.model.process_image(image_placeholder)

    # Collect every image file matching the (comma-separated) glob patterns.
    filenames = []
    for file_pattern in FLAGS.input_files.split(","):
        filenames.extend(tf.gfile.Glob(file_pattern))
    tf.logging.info("Running caption generation on %d files matching %s",
                    len(filenames), FLAGS.input_files)

    for filename in filenames:

        with tf.gfile.GFile(filename, "rb") as f:
            image = f.read()

        raw_image = sess.run(preprocessor,
                             feed_dict={image_placeholder: image})

        print('raw image size:', raw_image.shape)

        # Build the target caption: <start> + word ids + <end>, padded with
        # <end> to max_caption_length, plus a mask over the true length.
        new_sentence = FLAGS.input_feed
        new_sentence = new_sentence.split()
        print("My new sentence:", new_sentence)
        max_caption_length = 20
        new_caption = [vocab.start_id
                       ] + [vocab.word_to_id(w)
                            for w in new_sentence] + [vocab.end_id]
        true_cap_len = len(new_caption)
        new_caption = new_caption + [vocab.end_id
                                     ] * (max_caption_length - true_cap_len)

        print("My new id:", new_caption)
        new_mask = np.append(np.ones(true_cap_len),
                             np.zeros(max_caption_length - true_cap_len))

        # Keyword targets: the same words as ids, padded and masked likewise.
        words = FLAGS.input_feed.split()
        key_words = [vocab.word_to_id(word) for word in words]
        print(key_words)
        key_words_mask = np.append(
            np.ones(len(key_words)),
            np.zeros(max_caption_length - len(key_words)))
        key_words = key_words + [vocab.end_id
                                 ] * (max_caption_length - len(key_words))

        if FLAGS.use_keywords:
            # keywords based attack
            adv = attack.attack(np.array([raw_image]), sess, inf_sess, model,
                                inf_model, vocab, key_words, key_words_mask, 1)
        else:
            # exact attack
            adv = attack.attack(np.array([raw_image]), sess, inf_sess, model,
                                inf_model, vocab, new_caption, new_mask, 1)

        l2_distortion = np.sum((adv - raw_image)**2)**.5
        linf_distortion = np.max(np.abs(adv - raw_image))
        print("L2 distortion is", l2_distortion)
        print("L_inf distortion is", linf_distortion)
        show(raw_image, "original.png")
        show(adv, "adversarial.png")
        show(adv - raw_image, "diff.png")
    # BUG FIX: this close() was inside the per-file loop, which shut down the
    # inference session after the first image and would break any subsequent
    # iteration; close it once after all files are processed.
    inf_sess.close()
예제 #25
0
def main(args):
    """Run the selected adversarial attack and report distortion statistics.

    Builds the dataset/model pair named by ``args['dataset']``, generates
    (input, target) pairs, runs one of the C&W L2 / EAD-L1 / EAD-EN /
    (iterated) fast-gradient attacks, then measures per-batch best-,
    average-, and worst-case L1/L2/Linf distortions of the successful
    adversarial examples and optionally saves images.

    Args:
        args: dict of parsed command-line options. Keys read here:
            'dataset', 'adversarial', 'temp', 'numimg', 'train', 'seed',
            'seed_imagenet', 'attack', 'batch_size', 'maxiter', 'conf',
            'binary_steps', 'beta', 'abort_early', 'show', 'save'.
    """
    with tf.Session() as sess:
        # Pick dataset + model; handpick/inception steer how generate_data
        # samples images and how raw predictions are reshaped below.
        if (args['dataset'] == 'mnist'):
            data, model = MNIST(), MNISTModel("models/mnist", sess)
            handpick = False
            inception = False
        if (args['dataset'] == "cifar"):
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            handpick = True
            inception = False
        if (args['dataset'] == "imagenet"):
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(sess)
            handpick = True
            inception = True

        # Optional model overrides: an adversarially trained MNIST model...
        if (args['adversarial'] != "none"):
            model = MNISTModel("models/mnist_cw" + str(args['adversarial']),
                               sess)

        # ...or a defensively distilled model at the given temperature.
        # NOTE(review): these unconditionally replace whatever model was
        # selected above — confirm the override precedence is intended.
        if (args['temp'] and args['dataset'] == 'mnist'):
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']),
                               sess)
        if (args['temp'] and args['dataset'] == 'cifar'):
            model = CIFARModel("models/cifar-distilled-" + str(args['temp']),
                               sess)

        inputs, targets, labels, true_ids = generate_data(
            data,
            model,
            samples=args['numimg'],
            inception=inception,
            handpick=handpick,
            train=args['train'],
            seed=args['seed'])
        timestart = time.time()
        # Dispatch on the requested attack; each branch builds the attack
        # object and produces the adversarial batch `adv`.
        if (args['attack'] == 'L2'):
            attack = CarliniL2(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['binary_steps'],
                               beta=args['beta'],
                               abort_early=args['abort_early'])
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'L1'):
            attack = EADL1(sess,
                           model,
                           batch_size=args['batch_size'],
                           max_iterations=args['maxiter'],
                           confidence=args['conf'],
                           binary_search_steps=args['binary_steps'],
                           beta=args['beta'],
                           abort_early=args['abort_early'])
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'EN'):
            attack = EADEN(sess,
                           model,
                           batch_size=args['batch_size'],
                           max_iterations=args['maxiter'],
                           confidence=args['conf'],
                           binary_search_steps=args['binary_steps'],
                           beta=args['beta'],
                           abort_early=args['abort_early'])
            adv = attack.attack(inputs, targets)
        """If untargeted, pass labels instead of targets"""
        # One-shot fast-gradient attacks under Linf / L1 / L2 norms.
        if (args['attack'] == 'FGSM'):
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=np.inf,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'FGML1'):
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=1,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'FGML2'):
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=2,
                         inception=inception)
            adv = attack.attack(inputs, targets)

        # Iterated fast-gradient variants under the same three norms.
        if (args['attack'] == 'IFGSM'):
            attack = IGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=np.inf,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'IFGML1'):
            attack = IGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=1,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'IFGML2'):
            attack = IGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=2,
                         inception=inception)
            adv = attack.attack(inputs, targets)

        timeend = time.time()
        print("Took", timeend - timestart, "seconds to run",
              len(inputs) / args['batch_size'], "random instances.")

        # In training mode just dump the adversarial examples and labels.
        if (args['train']):
            np.save('labels_train.npy', labels)
            np.save(str(args['attack']) + '_train.npy', adv)
            return

        # Per-batch statistics: best case (easiest target), average case
        # (randomly chosen target), worst case (hardest target; counted
        # only when every target in the batch succeeded).
        r_best = []
        d_best_l1 = []
        d_best_l2 = []
        d_best_linf = []
        r_average = []
        d_average_l1 = []
        d_average_l2 = []
        d_average_linf = []
        r_worst = []
        d_worst_l1 = []
        d_worst_l2 = []
        d_worst_linf = []

        # NOTE(review): with a nonzero confidence margin, success is
        # re-evaluated on the distilled MNIST model regardless of the
        # dataset chosen above — confirm this is intended.
        if (args['conf'] != 0):
            model = MNISTModel("models/mnist-distilled-100", sess)

        if (args['show']):
            if not os.path.exists(
                    str(args['save']) + "/" + str(args['dataset']) + "/" +
                    str(args['attack'])):
                os.makedirs(
                    str(args['save']) + "/" + str(args['dataset']) + "/" +
                    str(args['attack']))

        # Walk the results one target-batch at a time (batch_size targets
        # per source image).
        for i in range(0, len(inputs), args['batch_size']):

            # Predictions for every adversarial example in this batch.
            pred = []
            for j in range(i, i + args['batch_size']):
                if inception:
                    pred.append(
                        np.reshape(model.model.predict(adv[j:j + 1]),
                                   (data.test_labels[0:1].shape)))
                else:
                    pred.append(model.model.predict(adv[j:j + 1]))

            # 1e10 sentinels mean "no successful example seen yet".
            dist_l1 = 1e10
            dist_l2 = 1e10
            dist_linf = 1e10
            dist_l1_index = 1e10
            dist_l2_index = 1e10
            dist_linf_index = 1e10
            # Best case: minimum distortion among successful examples,
            # tracked independently per norm.
            for k, j in enumerate(range(i, i + args['batch_size'])):
                if (np.argmax(pred[k], 1) == np.argmax(targets[j:j + 1], 1)):
                    if (np.sum(np.abs(adv[j] - inputs[j])) < dist_l1):
                        dist_l1 = np.sum(np.abs(adv[j] - inputs[j]))
                        dist_l1_index = j
                    if (np.amax(np.abs(adv[j] - inputs[j])) < dist_linf):
                        dist_linf = np.amax(np.abs(adv[j] - inputs[j]))
                        dist_linf_index = j
                    if ((np.sum((adv[j] - inputs[j])**2)**.5) < dist_l2):
                        dist_l2 = (np.sum((adv[j] - inputs[j])**2)**.5)
                        dist_l2_index = j
            # Any success in the batch sets all three indices together.
            if (dist_l1_index != 1e10):
                d_best_l2.append((np.sum(
                    (adv[dist_l2_index] - inputs[dist_l2_index])**2)**.5))
                d_best_l1.append(
                    np.sum(np.abs(adv[dist_l1_index] - inputs[dist_l1_index])))
                d_best_linf.append(
                    np.amax(
                        np.abs(adv[dist_linf_index] -
                               inputs[dist_linf_index])))
                r_best.append(1)
            else:
                r_best.append(0)

            # Average case: one randomly chosen target from this batch.
            rand_int = np.random.randint(i, i + args['batch_size'])
            if inception:
                pred_r = np.reshape(
                    model.model.predict(adv[rand_int:rand_int + 1]),
                    (data.test_labels[0:1].shape))
            else:
                pred_r = model.model.predict(adv[rand_int:rand_int + 1])
            if (np.argmax(pred_r,
                          1) == np.argmax(targets[rand_int:rand_int + 1], 1)):
                r_average.append(1)
                d_average_l2.append(
                    np.sum((adv[rand_int] - inputs[rand_int])**2)**.5)
                d_average_l1.append(
                    np.sum(np.abs(adv[rand_int] - inputs[rand_int])))
                d_average_linf.append(
                    np.amax(np.abs(adv[rand_int] - inputs[rand_int])))

            else:
                r_average.append(0)

            # Worst case: maximum distortion, but only if *every* target
            # in the batch succeeded; a single failure aborts the batch.
            dist_l1 = 0
            dist_l1_index = 1e10
            dist_linf = 0
            dist_linf_index = 1e10
            dist_l2 = 0
            dist_l2_index = 1e10
            for k, j in enumerate(range(i, i + args['batch_size'])):
                if (np.argmax(pred[k], 1) != np.argmax(targets[j:j + 1], 1)):
                    r_worst.append(0)
                    dist_l1_index = 1e10
                    dist_l2_index = 1e10
                    dist_linf_index = 1e10
                    break
                else:
                    if (np.sum(np.abs(adv[j] - inputs[j])) > dist_l1):
                        dist_l1 = np.sum(np.abs(adv[j] - inputs[j]))
                        dist_l1_index = j
                    if (np.amax(np.abs(adv[j] - inputs[j])) > dist_linf):
                        dist_linf = np.amax(np.abs(adv[j] - inputs[j]))
                        dist_linf_index = j
                    if ((np.sum((adv[j] - inputs[j])**2)**.5) > dist_l2):
                        dist_l2 = (np.sum((adv[j] - inputs[j])**2)**.5)
                        dist_l2_index = j
            if (dist_l1_index != 1e10):
                d_worst_l2.append((np.sum(
                    (adv[dist_l2_index] - inputs[dist_l2_index])**2)**.5))
                d_worst_l1.append(
                    np.sum(np.abs(adv[dist_l1_index] - inputs[dist_l1_index])))
                d_worst_linf.append(
                    np.amax(
                        np.abs(adv[dist_linf_index] -
                               inputs[dist_linf_index])))
                r_worst.append(1)

            # Optionally save original/adversarial image pairs with a
            # descriptive filename suffix.
            if (args['show']):
                for j in range(i, i + args['batch_size']):
                    target_id = np.argmax(targets[j:j + 1], 1)
                    label_id = np.argmax(labels[j:j + 1], 1)
                    prev_id = np.argmax(
                        np.reshape(model.model.predict(inputs[j:j + 1]),
                                   (data.test_labels[0:1].shape)), 1)
                    adv_id = np.argmax(
                        np.reshape(model.model.predict(adv[j:j + 1]),
                                   (data.test_labels[0:1].shape)), 1)
                    # NOTE(review): suffix uses true_ids[i] (the batch start
                    # index), not true_ids[j] — verify this is intentional.
                    suffix = "id{}_seq{}_lbl{}_prev{}_adv{}_{}_l1_{:.3f}_l2_{:.3f}_linf_{:.3f}".format(
                        true_ids[i], target_id, label_id, prev_id,
                        adv_id, adv_id == target_id,
                        np.sum(np.abs(adv[j] - inputs[j])),
                        np.sum((adv[j] - inputs[j])**2)**.5,
                        np.amax(np.abs(adv[j] - inputs[j])))

                    show(
                        inputs[j:j + 1],
                        str(args['save']) + "/" + str(args['dataset']) + "/" +
                        str(args['attack']) +
                        "/original_{}.png".format(suffix))
                    show(
                        adv[j:j + 1],
                        str(args['save']) + "/" + str(args['dataset']) + "/" +
                        str(args['attack']) +
                        "/adversarial_{}.png".format(suffix))

        # Summary statistics across all batches.
        print('best_case_L1_mean', np.mean(d_best_l1))
        print('best_case_L2_mean', np.mean(d_best_l2))
        print('best_case_Linf_mean', np.mean(d_best_linf))
        print('best_case_prob', np.mean(r_best))
        print('average_case_L1_mean', np.mean(d_average_l1))
        print('average_case_L2_mean', np.mean(d_average_l2))
        print('average_case_Linf_mean', np.mean(d_average_linf))
        print('average_case_prob', np.mean(r_average))
        print('worst_case_L1_mean', np.mean(d_worst_l1))
        print('worst_case_L2_mean', np.mean(d_worst_l2))
        print('worst_case_Linf_mean', np.mean(d_worst_linf))
        print('worst_case_prob', np.mean(r_worst))
# ===== Example #26 (score: 0) =====
    classifier = Classifier("./models/cifar_example_classifier")

# Load whichever dataset the example was configured for.
data = MNIST() if dataset == "MNIST" else CIFAR()

class Pred2:
    """Adapter exposing the wrapped classifier through the attribute
    interface the attack expects (image_size, num_labels, num_channels,
    predict)."""

    # MNIST images are 28x28 single-channel; otherwise 32x32 three-channel.
    if dataset == "MNIST":
        image_size = 28
        num_channels = 1
    else:
        image_size = 32
        num_channels = 3
    num_labels = 10

    def predict(self, x):
        # Forward x through the wrapped classifier's model.
        return classifier.model(x)


# Evaluate in inference mode (no dropout / batch-norm training behavior).
keras.backend.set_learning_phase(False)
sess = keras.backend.get_session()

# Targeted Carlini & Wagner L2 attack on the wrapped classifier.
attack = CarliniL2(
    sess, [Pred2()], {}, {},
    batch_size=100,
    binary_search_steps=4,
    learning_rate=1e-2,
    max_iterations=10000,
    targeted=True,
    initial_const=1,
    confidence=1,
    boxmin=0,
    boxmax=1)

# First test-set index of each of the ten classes.
true_classes = np.argmax(data.test_labels, axis=1)
idx = [np.where(true_classes == i)[0][0] for i in range(10)]
# Repeat each selected image ten times: one copy per target class...
dat = np.array([data.test_data[i] for i in idx for _ in range(10)])
# ...with target labels cycling 0..9 for every source image.
lab = sess.run(tf.one_hot(np.tile(np.arange(10), 10), depth=10))

adv = attack.attack(dat, lab)
print('mean distortion', np.mean(np.sum((adv - dat) ** 2, axis=(1, 2, 3)) ** .5))