Example #1
    def load_data(
        self,
        original_scale=False,
    ):

        print("[INFO: ] Loading data...")

        X = load_mnist_images('%strain-images-idx3-ubyte.gz' % self.data_path)
        y = load_mnist_labels('%strain-labels-idx1-ubyte.gz' % self.data_path)
        X_test = load_mnist_images('%st10k-images-idx3-ubyte.gz' %
                                   self.data_path)
        y_test = load_mnist_labels('%st10k-labels-idx1-ubyte.gz' %
                                   self.data_path)

        if Cfg.ad_experiment:

            # set normal and anomalous class
            normal = []
            outliers = []

            if Cfg.mnist_normal == -1:
                normal = list(range(0, 10))
                normal.remove(Cfg.mnist_outlier)
            else:
                normal.append(Cfg.mnist_normal)

            if Cfg.mnist_outlier == -1:
                outliers = list(range(0, 10))
                outliers.remove(Cfg.mnist_normal)
            else:
                outliers.append(Cfg.mnist_outlier)
                print("[INFO:] The  label  of outlier  points are ",
                      Cfg.mnist_outlier)
                print("[INFO:] The  number of outlier  points are ",
                      len(outliers))

            print("[INFO:] The  label  of normal points are ",
                  Cfg.mnist_normal)
            # extract normal and anomalous class

            X_norm, X_out, y_norm, y_out = extract_norm_and_out(
                X, y, normal=normal, outlier=outliers)

            # reduce outliers to fraction defined
            n_norm = len(y_norm)
            n_out = int(np.ceil(Cfg.out_frac * n_norm / (1 - Cfg.out_frac)))
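            # derivation: requiring n_out / (n_norm + n_out) = out_frac and
            # solving for n_out gives n_out = out_frac * n_norm / (1 - out_frac);
            # e.g. out_frac = 0.1 and n_norm = 900 give n_out = ceil(90 / 0.9) = 100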
            # print("[INFO:] The number of normal data points is", n_norm)
            # print("[INFO:] The number of outlier data points is", n_out)

            # shuffle to obtain random validation splits
            print("[INFO:] Random Seed used is  ", Cfg.seed)
            np.random.seed(self.seed)
            perm_norm = np.random.permutation(len(y_norm))
            perm_out = np.random.permutation(len(y_out))

            # split into training and validation set
            n_norm_split = int(Cfg.mnist_val_frac * n_norm)
            n_out_split = int(Cfg.mnist_val_frac * n_out)

            X_norm_Training = X_norm[perm_norm[n_norm_split:]]
            X_out_Training = X_out[perm_out[:n_out][n_out_split:]]

            # print("[INFO:] The shape of normal data used in training ", X_norm_Training.shape)
            # print("[INFO:] The shape of outlier data used in training ", X_out_Training.shape)

            self._X_train = np.concatenate((X_norm_Training, X_out_Training))
            self._y_train = np.append(y_norm[perm_norm[n_norm_split:]],
                                      y_out[perm_out[:n_out][n_out_split:]])
            self._X_val = np.concatenate(
                (X_norm[perm_norm[:n_norm_split]],
                 X_out[perm_out[:n_out][:n_out_split]]))
            self._y_val = np.append(y_norm[perm_norm[:n_norm_split]],
                                    y_out[perm_out[:n_out][:n_out_split]])

            # print("[INFO:] The shape of Data used in [ Training  ] ", self._X_train.shape)
            # print("[INFO:] The shape of Data used in [ Validation ] ", self._X_val.shape)

            # shuffle data (since batches are extracted block-wise)
            self.n_train = len(self._y_train)
            self.n_val = len(self._y_val)
            perm_train = np.random.permutation(self.n_train)
            perm_val = np.random.permutation(self.n_val)
            self._X_train = self._X_train[perm_train]
            self._y_train = self._y_train[perm_train]
            self._X_val = self._X_val[perm_val]
            self._y_val = self._y_val[perm_val]

            # Subset train set such that we only get batches of the same size
            self.n_train = (self.n_train // Cfg.batch_size) * Cfg.batch_size
            subset = np.random.choice(len(self._X_train),
                                      int(self.n_train),
                                      replace=False)
            self._X_train = self._X_train[subset]
            self._y_train = self._y_train[subset]

            # Adjust number of batches
            Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))
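            # after the truncation above, n_train is an exact multiple of
            # batch_size, so the ceil is exact and n_batches = n_train // batch_size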

            # test set
            X_norm, X_out, y_norm, y_out = extract_norm_and_out(
                X_test, y_test, normal=normal, outlier=outliers)
            self._X_test = np.concatenate((X_norm, X_out))
            self._y_test = np.append(y_norm, y_out)
            perm_test = np.random.permutation(len(self._y_test))
            self._X_test = self._X_test[perm_test]
            self._y_test = self._y_test[perm_test]
            self.n_test = len(self._y_test)
            # print("[INFO:] The shape of normal instances used in testing ", X_norm.shape)
            # print("[INFO:] The shape of outlier instances used in testing ", X_out.shape)

        else:
            # split into training, validation, and test sets
            np.random.seed(self.seed)
            perm = np.random.permutation(len(X))

            self._X_train = X[perm[self.n_val:]]
            self._y_train = y[perm[self.n_val:]]
            self._X_val = X[perm[:self.n_val]]
            self._y_val = y[perm[:self.n_val]]
            self._X_test = X_test
            self._y_test = y_test

        # normalize data (if original scale should not be preserved)
        if not original_scale:

            # simple rescaling to [0,1]
            normalize_data(self._X_train,
                           self._X_val,
                           self._X_test,
                           scale=np.float32(255))

            # global contrast normalization
            if Cfg.gcn:
                global_contrast_normalization(self._X_train,
                                              self._X_val,
                                              self._X_test,
                                              scale=Cfg.unit_norm_used)

            # ZCA whitening
            if Cfg.zca_whitening:
                self._X_train, self._X_val, self._X_test = zca_whitening(
                    self._X_train, self._X_val, self._X_test)

            # rescale to [0,1] (w.r.t. min and max in train data)
            rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

            # PCA
            if Cfg.pca:
                self._X_train, self._X_val, self._X_test = pca(
                    self._X_train, self._X_val, self._X_test, 0.95)

        print("[INFO: ] Data loaded.")
Example #2
    def load_data(self, original_scale=True):

        print("[INFO:] Loading data...")

        # get train data
        X = readTrafficSigns(rootpath=self.data_path,
                             which_set="train",
                             label=14)

        # X = readTrafficSigns_asnparray(rootpath=self.data_path, which_set="train", label=14)

        # get (normal) test data: sub-sample 100 images from the train set
        print("The random seed used in the experiment is ", self.seed)
        self.seed = Cfg.seed
        np.random.seed(self.seed)
        perm = np.random.permutation(len(X))
        X_test_norm = X[perm[:100], ...]

        self._X_train = X[perm[100:], ...]
        self.n_train = len(self._X_train)
        self._y_train = np.zeros(self.n_train, dtype=np.uint8)

        # load (adversarial) test data
        print("[INFO:] Loading adversarial data...")
        X_test_adv = np.load(self.data_path + "/Images_150.npy")
        labels_adv = np.load(self.data_path + "/Labels_150.npy")
        # print("[INFO:] The number of Loading adversarial data...",len(X_test_adv))

        # X_test_adv , labels_adv = self.generate_AdversarialSigns(X_test_norm)

        # build the test set: label 0 = normal sign, label 1 = adversarial example
        self._X_test = np.concatenate(
            (X_test_norm, X_test_adv[labels_adv == 1]),
            axis=0).astype(np.float32)
        self._y_test = np.concatenate(
            (np.zeros(len(X_test_norm), dtype=np.uint8),
             np.ones(int(np.sum(labels_adv)), dtype=np.uint8)),
            axis=0)
        self.n_test = len(self._X_test)

        # since the val set is referenced at some points, initialize empty np arrays
        self._X_val = np.empty(shape=(0, 3, 32, 32), dtype=np.float32)
        self._y_val = np.empty(shape=(0), dtype=np.uint8)

        # shuffle
        np.random.seed(self.seed)
        perm_train = np.random.permutation(self.n_train)
        perm_test = np.random.permutation(self.n_test)
        self._X_train = self._X_train[perm_train, ...]
        self._y_train = self._y_train[perm_train]
        self._X_test = self._X_test[perm_test, ...]
        self._y_test = self._y_test[perm_test]

        # build the positive (normal) test pool: the first 170 training images
        # plus 100 normal test images
        # NOTE: those 170 training images therefore also appear in the test set
        positiveSamples_test = self._X_test[np.where(self._y_test == 0)]
        positiveSamples_test = positiveSamples_test[0:100]
        positiveSamples_test = np.concatenate(
            (self._X_train[0:170], positiveSamples_test))

        # build the negative (adversarial) test pool from the first 100 adversarial images
        negativeSamples_test = self._X_test[np.where(self._y_test == 1)]
        negativeSamples_test = negativeSamples_test[0:100]

        self._X_train = np.concatenate((self._X_train, positiveSamples_test))

        self._X_train = self._X_train[0:780]
        self._y_train = np.zeros(len(self._X_train))
        self.n_train = len(self._X_train)  # keep n_train in sync for the batch count below

        y_positiveSamples_test = np.zeros(len(positiveSamples_test))
        y_negativeSamples_test = np.ones(len(negativeSamples_test))

        self._X_test = np.concatenate(
            (positiveSamples_test, negativeSamples_test))
        self._y_test = np.concatenate(
            (y_positiveSamples_test, y_negativeSamples_test))

        # print("[INFO:] The number of train samples", len(self._X_train))
        # print("[INFO:] The number of test samples", len(self._X_test))
        print("[INFO:] Negative Y_test labels",
              len(self._y_test[np.where(self._y_test == 1)]))
        print("[INFO:] Positive Y_test labels",
              len(self._y_test[np.where(self._y_test == 0)]))

        # Adjust number of batches
        Cfg.n_batches = int(np.ceil(self.n_train * 1. / Cfg.batch_size))

        # normalize data (if original scale should not be preserved)
        if not original_scale:

            # simple rescaling to [0,1]
            normalize_data(self._X_train,
                           self._X_val,
                           self._X_test,
                           scale=np.float32(255))

            # global contrast normalization
            if Cfg.gcn:
                global_contrast_normalization(self._X_train,
                                              self._X_val,
                                              self._X_test,
                                              scale=Cfg.unit_norm_used)

            # ZCA whitening
            if Cfg.zca_whitening:
                self._X_train, self._X_val, self._X_test = zca_whitening(
                    self._X_train, self._X_val, self._X_test)

            # rescale to [0,1] (w.r.t. min and max in train data)
            rescale_to_unit_interval(self._X_train, self._X_val, self._X_test)

            # PCA
            if Cfg.pca:
                self._X_train, self._X_val, self._X_test = pca(
                    self._X_train, self._X_val, self._X_test, 0.95)

        flush_last_line()

        # self._X_train = np.concatenate((self._X_train, self._X_test))
        # self._y_train = np.concatenate((self._y_train, self._y_test))
        # Make sure the axis dimensions are aligned for training convolutional autoencoders
        self._X_train = np.moveaxis(self._X_train, 1, 3)
        self._X_test = np.moveaxis(self._X_test, 1, 3)

        # rescale to [0, 1]; note this assumes original_scale=True, otherwise
        # normalize_data above would already have divided by 255
        self._X_train = self._X_train / 255.0
        self._X_test = self._X_test / 255.0

        X_train = self._X_train
        X_test = self._X_test
        y_train = self._y_train
        y_test = self._y_test

        print("X_train,X_test====>", X_train.shape, X_test.shape)

        # Collect the positive (normal) data from the train and test sets
        trainXPos = X_train[np.where(y_train == 0)]
        trainYPos = np.zeros(len(trainXPos))
        testXPos = X_test[np.where(y_test == 0)]
        testYPos = np.zeros(len(testXPos))

        # Collect the negative (anomalous) data from the train and test sets
        trainXNeg = X_train[np.where(y_train == 1)]
        trainYNeg = np.ones(len(trainXNeg))
        testXNeg = X_test[np.where(y_test == 1)]
        testYNeg = np.ones(len(testXNeg))

        print("trainXPos,testXPos", trainXPos.shape, testXPos.shape)
        X_trainPOS = np.concatenate((trainXPos, testXPos))
        y_trainPOS = np.concatenate((trainYPos, testYPos))

        X_trainNEG = np.concatenate((trainXNeg, testXNeg))
        y_trainNEG = np.concatenate((trainYNeg, testYNeg))

        # Use 10% of the positive samples as the number of anomalies.
        num_of_anomalies = int(0.1 * len(X_trainPOS))
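        # e.g. 1,000 positive samples keep at most the first 100 negatives, so
        # anomalies make up 100 / 1,100, i.e. about 9% of the final combined set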

        X_trainNEG = X_trainNEG[0:num_of_anomalies]
        y_trainNEG = y_trainNEG[0:num_of_anomalies]

        X_train = np.concatenate((X_trainPOS, X_trainNEG))
        y_train = np.concatenate((y_trainPOS, y_trainNEG))

        self._X_train = X_train
        self._y_train = y_train

        # NOTE: the test set is set to the same combined data as the train set,
        # so evaluation below runs on the training pool itself
        self._X_test = X_train
        self._y_test = y_train

        self._X_test_beforegcn = X_train
        self._y_test_beforegcn = y_train

        self._X_test_beforegcn = np.reshape(
            self._X_test_beforegcn, (len(self._X_test_beforegcn), 32, 32, 3))

        # pick 5 fixed test samples and 5 random train samples for a quick visual check
        X_test_sample = self._X_test[-5:]
        import random
        random_list = random.sample(range(1, 700), 5)

        X_train_sample = self._X_train[random_list]

        print("[INFO:] The shape of self.data._X_train", self._X_train.shape)
        print("[INFO:] The shape of self.data._X_test", self._X_test.shape)

        # sample grid intended for the (commented-out) reconstruction check below
        X_test = np.concatenate((X_train_sample, X_test_sample))

        # X_train_sample = np.moveaxis(X_train_sample, 1, 3)
        # X_test_sample = np.moveaxis(X_test_sample, 1, 3)
        # X_train_sample = X_train_sample/255.0
        # X_test_sample = X_test_sample / 255.0

        # self.save_reconstructed_image(X_train_sample, X_train_sample)

        # global contrast normalization (alternative path, currently disabled)
        # if Cfg.gcn:
        #     self._X_train, self._X_val, self._X_test = global_contrast_normalization(
        #         self._X_train, self._X_val, self._X_test, scale=Cfg.unit_norm_used)
        #     self._X_test = self._X_train

        print("Data loaded.")