    def test(self, testX=None, testy=None):
        """Evaluate the detector; when no data is passed, fall back to the saved pristine and adversarial test sets."""
        prist_testX = None
        adv_testX = None
        if testX is None and testy is None:
            _, prist_testX, adv_testX = utils.read_joblib(
                config.get('dataset', 'dataX'))
            _, prist_testy, adv_testy = utils.read_joblib(
                config.get('dataset', 'datay'))

            testX = np.concatenate((prist_testX, adv_testX))
            testy = np.concatenate((prist_testy, adv_testy))
        if len(testX) == 0:
            print("No test data.")
            return

        self.mode = 'test'

        # rebuild the graph
        tf.reset_default_graph()
        self.model_graph()

        cur_checkpoint = tf.train.latest_checkpoint(self.save_dir)
        if cur_checkpoint is None:
            print("No saved parameters")
            return

        saver = tf.train.Saver()
        eval_dir = os.path.join(self.save_dir, 'eval')
        sess = tf.Session()
        with sess:
            saver.restore(sess, cur_checkpoint)
            accuracy, macro_f1_score = tester(sess, testX, testy, self,
                                              eval_dir)
            MSG = "The accuracy on the test dataset is {:.5f}%"
            print(MSG.format(accuracy * 100))
            logger.info(MSG.format(accuracy * 100))

            if prist_testX is not None and adv_testX is not None:
                print("Other evaluation metrics we may need:")
                prist_acc, prist_f1_socre = tester(sess, prist_testX,
                                                   prist_testy, self, eval_dir)
                adv_acc, adv_f1_score = tester(sess, adv_testX, adv_testy,
                                               self, eval_dir)
                harmonic_f1_score = utils.harmonic_mean(
                    prist_f1_socre, adv_f1_score)
                MSG = "The accuracy on pristine test datasest is {:.5f}% vs. {:.5f}% on adversarial data."
                print(MSG.format(prist_acc * 100, adv_acc * 100))
                logger.info(MSG.format(prist_acc * 100, adv_acc * 100))
                MSG = "The macro f1 score on pristine test datasest is {:.5f}% vs. {:.5f}% on adversarial data."
                print(MSG.format(prist_f1_socre * 100, adv_f1_score * 100))
                logger.info(
                    MSG.format(prist_f1_socre * 100, adv_f1_score * 100))
                MSG = "Harmonic macro F1 score is {:.5f}%"
                print(MSG.format(harmonic_f1_score * 100))
                logger.info(MSG.format(harmonic_f1_score * 100))

            sess.close()
        return accuracy
    def __init__(self,
                 hyper_params=None,
                 reuse=False,
                 is_saving=True,
                 init_graph=True,
                 mode='train',
                 name='JOINT_DEFENSE'):
        self.is_saving = is_saving
        self.init_graph = init_graph
        self.mode = mode

        if hyper_params is None:
            hyper_params = ADV_TRAIN_HP
        self.hp_params = utils.ParamWrapper(hyper_params)
        self.threshold = None  # get_median()

        # attack initialization
        if not (os.path.exists(config.get('dataset', 'dataX'))
                and os.path.exists(config.get('dataset', 'datay'))
                and os.path.exists(config.get('dataset', 'normalizer'))):
            dataX, datay = self.data_preprocess()
            utils.dump_joblib(dataX, config.get('dataset', 'dataX'))
            utils.dump_joblib(datay, config.get('dataset', 'datay'))

        self.normalizer = utils.read_joblib(config.get('dataset',
                                                       'normalizer'))
        input_dim = len(self.normalizer.data_min_)
        self.inner_maximizer = PGDAdam(self,
                                       input_dim,
                                       self.normalizer,
                                       verbose=False,
                                       **AUG_PARAM)
        super(JointDefense, self).__init__(hyper_params, reuse, self.is_saving,
                                           self.init_graph, self.mode, name)
def get_median():
    """Return the per-feature median of the training features, caching the result on disk."""
    if not os.path.exists(config.get('dataset', 'threshold')):
        trainX, _, _ = utils.read_joblib(config.get('dataset', 'dataX'))
        threshold = np.median(trainX, axis=0)
        utils.dumpdata_np(threshold, config.get('dataset', 'threshold'))
    threshold = utils.readdata_np(config.get('dataset', 'threshold'))
    return threshold
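
# Hypothetical usage sketch (not part of the original snippet), assuming the
# test() method above belongs to the JointDefense class whose __init__ is shown
# here: evaluate a trained model on the saved pristine + adversarial test split,
# optionally wiring in the median threshold that get_median() caches.
def _joint_defense_evaluation_demo():
    defense = JointDefense(mode='test')   # hyper-parameters default to ADV_TRAIN_HP
    defense.threshold = get_median()      # per-feature medians, hinted at in __init__
    return defense.test()                 # restores the latest checkpoint, reports accuracy/F1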
Example #4
    def __init__(self,
                 hyper_params=None,
                 reuse=False,
                 is_saving=True,
                 init_graph=True,
                 mode='train',
                 name='DAE_RPST_LEARN_DNN'):
        self.is_saving = is_saving
        self.init_graph = init_graph
        self.mode = mode

        if hyper_params is None:
            hyper_params = DAE_TRAIN_HP

        # initialization
        if not (os.path.exists(config.get('dataset', 'dataX')) and
                os.path.exists(config.get('dataset', 'datay')) and
                os.path.exists(config.get('dataset', 'normalizer'))):
            dataX, datay = self.data_preprocess()
            utils.dump_joblib(dataX, config.get('dataset', 'dataX'))
            utils.dump_joblib(datay, config.get('dataset', 'datay'))

        self.normalizer = utils.read_joblib(config.get('dataset', 'normalizer'))
        input_dim = len(self.normalizer.data_min_)
        self.inner_maximizer = PGDAdam(self, input_dim, self.normalizer, verbose=False, **AUG_PARAM)

        super(DAE_RPST_DNN, self).__init__(hyper_params, reuse,
                                           self.is_saving, self.init_graph, self.mode, name)
def normalize_inverse(X, normalizer=None):
    """Map min-max normalized features back to the original feature space."""
    try:
        if normalizer is None:
            normalizer = utils.read_joblib(config.get('dataset', 'normalizer'))
        if np.min(X) < 0. or np.max(X) > 1.:
            warnings.warn("The data is not within the range [0, 1].")
    except IOError as e:
        raise IOError("Unable to load the normalizer.") from e
    return normalizer.inverse_transform(X)
def normalize_data(X, is_fitting=False):
    """Normalize data using minmaxscalar"""
    if not os.path.exists(config.get('dataset', 'normalizer')) and is_fitting:
        minmax_norm = MinMaxScaler()
        normalizer = minmax_norm.fit(X)
        utils.dump_joblib(
            normalizer,
            config.get('dataset', 'normalizer'),
        )
    normalizer = utils.read_joblib(config.get('dataset', 'normalizer'))
    x_clipped = np.clip(X,
                        a_min=normalizer.data_min_,
                        a_max=normalizer.data_max_)
    X_normalized = normalizer.transform(x_clipped)
    return X_normalized
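
# Hypothetical usage sketch (not part of the original snippet), assuming the
# normalizer file has not been fitted yet: normalize_data fits and caches the
# MinMaxScaler, and normalize_inverse maps the result back to the raw space.
def _normalization_round_trip_demo():
    raw = np.random.rand(100, 20)                       # stand-in feature matrix
    normalized = normalize_data(raw, is_fitting=True)   # fit, cache, and apply the scaler
    restored = normalize_inverse(normalized)            # undo the min-max scaling
    assert np.allclose(raw, restored)
    return normalized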
    def __init__(self,
                 hyper_params=None,
                 reuse=False,
                 is_saving=True,
                 init_graph=True,
                 mode='train',
                 name='BASIC_DNN'):
        super(BasicDNN, self).__init__()
        self.is_saving = is_saving
        self.init_graph = init_graph
        self.reuse = reuse
        self.model_name = name

        if mode not in ('train', 'test'):
            raise ValueError("Expected mode to be either 'train' or 'test'.")
        self.mode = mode
        if hyper_params is not None:
            self.hp_params_dict = hyper_params
            self.hp_params = utils.ParamWrapper(hyper_params)
        else:
            self.hp_params_dict = DNN_HP
            self.hp_params = utils.ParamWrapper(DNN_HP)

        if self.is_saving:
            self.save_dir = config.get("experiments", self.model_name.lower())

        if not (os.path.exists(config.get('dataset', 'dataX'))
                and os.path.exists(config.get('dataset', 'datay'))
                and os.path.exists(config.get('dataset', 'normalizer'))):
            dataX, datay = self.data_preprocess()
            utils.dump_joblib(dataX, config.get('dataset', 'dataX'))
            utils.dump_joblib(datay, config.get('dataset', 'datay'))
        self.normalizer = utils.read_joblib(config.get('dataset',
                                                       'normalizer'))

        # DNN based model
        self.input_dim = len(self.normalizer.data_min_)

        self.hidden_layers = self.hp_params.hidden_units
        self.output_dim = self.hp_params.output_dim
        tf.set_random_seed(self.hp_params.random_seed)
        if self.init_graph:
            self.model_graph(reuse=reuse)
    def train(self, trainX=None, trainy=None, is_sampling=True):
        """train dnn based malware detector"""
        if trainX is None and trainy is None:
            trainX, _, _ = utils.read_joblib(config.get('dataset', 'dataX'))
            trainy, _, _ = utils.read_joblib(config.get('dataset', 'datay'))
        if is_sampling:
            trainX, trainy = random_over_sampling(trainX, trainy, ratio=0.3)

        # train submodel subsequently per mini-batch
        global_train_step = tf.train.get_or_create_global_step()
        saver = tf.train.Saver()

        # optimizers
        from collections import defaultdict
        optimizers_dict = defaultdict(list)
        for sub_m in range(self.hp_params.base_module_count):
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                optimizer_clf = tf.train.AdamOptimizer(self.hp_params.learning_rate).minimize(
                    self.sub_models[sub_m].cross_entropy,
                    global_step=global_train_step)
                optimizers_dict[sub_m] = [optimizer_clf]

        tf_cfg = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
        tf_cfg.gpu_options.allow_growth = True
        tf_cfg.gpu_options.per_process_gpu_memory_fraction = 1.
        sess = tf.Session(config=tf_cfg)

        with sess.as_default():
            # summary_writer = tf.summary.FileWriter(self.save_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            training_time = 0.0
            output_steps = 200
            for epoch_idx in range(self.hp_params.n_epochs):

                for sub_m in range(self.hp_params.base_module_count):
                    train_idx = range(len(trainX))
                    random.seed(self.random_seeds[sub_m])
                    sub_train_idx = random.sample(train_idx, int(len(train_idx) * self.training_sample_ratio))
                    train_input_supervised = utils.DataProducer(trainX[sub_train_idx], trainy[sub_train_idx],
                                                                self.hp_params.batch_size, n_epochs=1)
                    train_input_supervised.reset_cursor()

                    for step_idx, X_batch, y_batch in train_input_supervised.next_batch():

                        train_dict = {
                            self.x_input: X_batch,
                            self.y_input: y_batch,
                            self.is_training: True
                        }

                        start = default_timer()
                        if len(optimizers_dict[sub_m]) == 1:
                            sess.run(optimizers_dict[sub_m][0], feed_dict=train_dict)
                        else:
                            raise ValueError("Optimizer needs to be changed.")
                        end = default_timer()
                        training_time = training_time + end - start
                        iterations = epoch_idx * train_input_supervised.mini_batches + step_idx + 1

                        if iterations % output_steps == 0:
                            print("Sub model: ", sub_m)
                            print('Epoch {}/{}, Step {}/{}: {}'.format(
                                epoch_idx, self.hp_params.n_epochs,
                                step_idx + 1, train_input_supervised.steps,
                                datetime.now()))

                            _acc = sess.run(self.accuracy, feed_dict=train_dict)
                            print('    training accuracy {:.5f}%'.format(_acc * 100))

                            if not os.path.exists(self.save_dir):
                                os.makedirs(self.save_dir)
                            saver.save(sess, os.path.join(self.save_dir, 'checkpoint'),
                                       global_step=global_train_step)

        sess.close()
Example #9
    def train(self, trainX=None, trainy=None, is_sampling=False):
        """train dnn based malware detector"""
        if trainX is None and trainy is None:
            trainX, _, _ = utils.read_joblib(config.get('dataset', 'dataX'))
            trainy, _, _ = utils.read_joblib(config.get('dataset', 'datay'))

        if is_sampling:
            trainX, trainy = random_over_sampling(trainX, trainy, ratio=0.3)

        train_input_supervised = utils.DataProducer(trainX, trainy,
                                                    self.hp_params.batch_size,
                                                    n_epochs=self.hp_params.n_epochs)

        saver = tf.train.Saver(max_to_keep=10)
        tf.summary.scalar('accuracy', self.accuracy)
        tf.summary.scalar('loss', self.cross_entropy)
        merged_summaries = tf.summary.merge_all()
        global_train_step = tf.train.get_or_create_global_step()

        # optimizer
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            optimizer_clf = tf.train.AdamOptimizer(self.hp_params.learning_rate).minimize(self.cross_entropy,
                                                                                          global_step=global_train_step)
            optimizer_dae = tf.train.AdamOptimizer(self.hp_params.learning_rate).minimize(self.mse_dae)

        tf_cfg = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
        tf_cfg.gpu_options.allow_growth = True
        tf_cfg.gpu_options.per_process_gpu_memory_fraction = 1.
        sess = tf.Session(config=tf_cfg)

        with sess.as_default():
            summary_writer = tf.summary.FileWriter(self.save_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            training_time = 0.0
            train_input_supervised.reset_cursor()
            output_steps = 100
            for step_idx, X_batch, y_batch in train_input_supervised.next_batch():
                train_dict = {
                    self.x_input: X_batch,
                    self.y_input: y_batch,
                    self.is_training: True
                }

                if (step_idx + 1) % output_steps == 0:
                    print('Step {}/{}: {}'.format(step_idx + 1, train_input_supervised.steps, datetime.now()))
                    _acc = sess.run(self.accuracy, feed_dict=train_dict)
                    print("The accuracy on the training batch: {:.5f}%".format(_acc * 100))
                    if step_idx != 0:
                        print('    {} samples per second'.format(
                            output_steps * self.hp_params.batch_size / training_time))
                        training_time = 0.

                    summary = sess.run(merged_summaries, feed_dict=train_dict)
                    summary_writer.add_summary(summary, global_train_step.eval(sess))
                    if not os.path.exists(self.save_dir):
                        os.makedirs(self.save_dir)
                    saver.save(sess, os.path.join(self.save_dir, 'checkpoint'),
                               global_step=global_train_step)

                start = default_timer()
                sess.run(optimizer_dae, feed_dict=train_dict)
                sess.run(optimizer_clf, feed_dict=train_dict)
                end = default_timer()
                training_time = training_time + end - start
        sess.close()
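
# Hypothetical end-to-end sketch (not part of the original snippet), assuming
# this train() method and the test() routine from the earlier example belong to
# the DAE_RPST_DNN detector whose __init__ appears above: fit the model and then
# evaluate it on the saved test split.
def _train_and_evaluate_demo():
    model = DAE_RPST_DNN()          # hyper-parameters default to DAE_TRAIN_HP
    model.train(is_sampling=False)  # reads dataX/datay from the config when no data is passed
    return model.test()             # restores the latest checkpoint, reports accuracy/F1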