def fit(self, data): """ :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number of dimensions (features). :return: None """ N, d = data.shape if self.standardize: self.scaler = MinMaxScaler(feature_range=(-1, 1)).fit(data) data = self.scaler.transform(data) if self.shared_nearest_neighbors: self.data_train = data if self.n_neighbors is None: # Set number of nearest neighbors based on the data size and the neighborhood constant self.n_neighbors = int(np.ceil(N**self.neighborhood_constant)) # The distance statistic is averaged over this neighborhood range low = self.n_neighbors - int(np.floor(0.5 * (self.n_neighbors - 1))) high = self.n_neighbors + int(np.floor(0.5 * self.n_neighbors)) self.neighborhood_range = (low, high) logger.info("Number of samples: {:d}. Number of features: {:d}".format( N, d)) logger.info( "Range of nearest neighbors used for the averaged K-LPE statistic: ({:d}, {:d})" .format(low, high)) # Build the KNN graph self.index_knn = KNNIndex( data, n_neighbors=self.neighborhood_range[1], metric=self.metric, metric_kwargs=self.metric_kwargs, shared_nearest_neighbors=self.shared_nearest_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, low_memory=self.low_memory, seed_rng=self.seed_rng) # Compute the distance statistic for every data point self.dist_stat_nominal = self.distance_statistic(data, exclude_self=True)
def fit(self, data): """ Find the optimal projection matrix for the given data points. :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of dimensions. :return: """ N, d = data.shape if self.n_neighbors is None: # Set number of nearest neighbors based on the data size and the neighborhood constant self.n_neighbors = int(np.ceil(N**self.neighborhood_constant)) logger.info("Applying PCA as first-level dimension reduction step") data, self.mean_data, self.transform_pca = pca_wrapper( data, cutoff=self.pca_cutoff, seed_rng=self.seed_rng) # If `self.neighbors > data.shape[1]` (number of neighbors larger than the data dimension), then the # Gram matrix that comes up while solving for the neighborhood weights becomes singular. To avoid this, # we can set `self.neighbors = data.shape[1]` or add a small nonzero value to the diagonal elements of the # Gram matrix d = data.shape[1] if self.n_neighbors >= d: k = max(d - 1, 1) logger.info( "Reducing the number of neighbors from {:d} to {:d} to avoid singular Gram " "matrix while solving for neighborhood weights.".format( self.n_neighbors, k)) self.n_neighbors = k if self.dim_projection == 'auto': # Estimate the intrinsic dimension of the data and use that as the projected dimension id = estimate_intrinsic_dimension( data, method='two_nn', n_neighbors=self.n_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, seed_rng=self.seed_rng) self.dim_projection = int(np.ceil(id)) logger.info( "Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}." .format(id)) if self.dim_projection >= data.shape[1]: self.dim_projection = data.shape[1] logger.info("Dimension of the projected subspace = {:d}".format( self.dim_projection)) # Create a KNN index for all nearest neighbor tasks self.index_knn = KNNIndex( data, n_neighbors=self.n_neighbors, metric=self.metric, metric_kwargs=self.metric_kwargs, shared_nearest_neighbors=self.shared_nearest_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, seed_rng=self.seed_rng) # Create the adjacency matrix `W` based on the optimal reconstruction weights of neighboring points # (as done in locally linear embedding). # Then calculate the iterated graph Laplacian matrix `M = (I - W)^T (I - W)`. self.create_iterated_laplacian(data) # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest # eigenvalues as the columns of the projection matrix logger.info( "Solving the generalized eigenvalue problem to find the optimal projection matrix." ) data_trans = data.T # X^T M X lmat = sparse.csr_matrix.dot(data_trans, self.iterated_laplacian_matrix).dot(data) if self.orthogonal: # ONPP, the paper [2] recommends skipping the eigenvector corresponding to the smallest eigenvalue eig_values, eig_vectors = eigh(lmat, eigvals=(1, self.dim_projection)) else: # Standard NPP or NPE # X^T X rmat = np.dot(data_trans, data) eig_values, eig_vectors = eigh(lmat, b=rmat, eigvals=(0, self.dim_projection - 1)) # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered # according to increasing eigenvalues. # `eig_vectors` will have shape `(data.shape[1], self.dim_projection)` self.transform_npp = eig_vectors self.transform_comb = np.dot(self.transform_pca, self.transform_npp)
class NeighborhoodPreservingProjection:
    """
    Neighborhood preserving projection (NPP) method for dimensionality reduction. Also known as neighborhood
    preserving embedding (NPE) [1]. The orthogonal neighborhood preserving projection (ONPP) method is based on [2].

    1. He, Xiaofei, et al. "Neighborhood preserving embedding." Tenth IEEE International Conference on Computer
       Vision (ICCV'05) Volume 1. Vol. 2. IEEE, 2005.
    2. Kokiopoulou, Effrosyni, and Yousef Saad. "Orthogonal neighborhood preserving projections: A projection-based
       dimensionality reduction technique." IEEE Transactions on Pattern Analysis and Machine Intelligence,
       29.12 (2007): 2143-2156.
    """
    def __init__(self,
                 dim_projection='auto',                     # 'auto' or positive integer
                 orthogonal=False,                          # True to enable the orthogonal NPP (ONPP) method
                 pca_cutoff=1.0,
                 neighborhood_constant=NEIGHBORHOOD_CONST, n_neighbors=None,
                 # Specify one of `neighborhood_constant` or `n_neighbors`. If `n_neighbors` is specified,
                 # `neighborhood_constant` will be ignored.
                 shared_nearest_neighbors=False,
                 metric=METRIC_DEF, metric_kwargs=None,     # distance metric and its parameter dict (if any)
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 reg_eps=0.001,
                 seed_rng=SEED_DEFAULT):
        """
        :param dim_projection: Dimension of the data in the projected feature space. If set to 'auto', a suitable
                               reduced dimension will be chosen by estimating the intrinsic dimension of the data.
                               If an integer value is specified, it should be in the range `[1, dim - 1]`, where
                               `dim` is the observed dimension of the data.
        :param orthogonal: Set to True to select the ONPP method. It was shown to have better performance than
                           NPP in [2].
        :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                           in the projected, dimension-reduced data. PCA is applied as a first-level dimension
                           reduction, which also handles potential singularity of the data matrix. Set
                           `pca_cutoff = 1.0` in order to handle only the data matrix singularity.
        :param neighborhood_constant: float value in (0, 1) that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of
                                      samples, then the number of neighbors is set to `N^neighborhood_constant`.
                                      It is recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is
                            specified, `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance to
                                         find the K nearest neighbors. This is a secondary distance metric that is
                                         found to be better suited to high-dimensional data.
        :param metric: string or a callable that specifies the distance metric to be used for the SNN similarity
                       calculation.
        :param metric_kwargs: optional keyword arguments required by the distance metric, specified as a dictionary.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. This is recommended when the number of points
                                         is large and/or when the dimension of the data is high.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available CPU cores.
        :param reg_eps: small float value that multiplies the trace to regularize the Gram matrix, if it is close
                        to singular.
        :param seed_rng: int value specifying the seed for the random number generator.
""" self.dim_projection = dim_projection self.orthogonal = orthogonal self.pca_cutoff = pca_cutoff self.neighborhood_constant = neighborhood_constant self.n_neighbors = n_neighbors self.shared_nearest_neighbors = shared_nearest_neighbors self.metric = metric self.metric_kwargs = metric_kwargs self.approx_nearest_neighbors = approx_nearest_neighbors self.n_jobs = get_num_jobs(n_jobs) self.reg_eps = reg_eps self.seed_rng = seed_rng self.mean_data = None self.index_knn = None self.iterated_laplacian_matrix = None self.transform_pca = None self.transform_npp = None self.transform_comb = None def fit(self, data): """ Find the optimal projection matrix for the given data points. :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of dimensions. :return: """ N, d = data.shape if self.n_neighbors is None: # Set number of nearest neighbors based on the data size and the neighborhood constant self.n_neighbors = int(np.ceil(N**self.neighborhood_constant)) logger.info("Applying PCA as first-level dimension reduction step") data, self.mean_data, self.transform_pca = pca_wrapper( data, cutoff=self.pca_cutoff, seed_rng=self.seed_rng) # If `self.neighbors > data.shape[1]` (number of neighbors larger than the data dimension), then the # Gram matrix that comes up while solving for the neighborhood weights becomes singular. To avoid this, # we can set `self.neighbors = data.shape[1]` or add a small nonzero value to the diagonal elements of the # Gram matrix d = data.shape[1] if self.n_neighbors >= d: k = max(d - 1, 1) logger.info( "Reducing the number of neighbors from {:d} to {:d} to avoid singular Gram " "matrix while solving for neighborhood weights.".format( self.n_neighbors, k)) self.n_neighbors = k if self.dim_projection == 'auto': # Estimate the intrinsic dimension of the data and use that as the projected dimension id = estimate_intrinsic_dimension( data, method='two_nn', n_neighbors=self.n_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, seed_rng=self.seed_rng) self.dim_projection = int(np.ceil(id)) logger.info( "Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}." .format(id)) if self.dim_projection >= data.shape[1]: self.dim_projection = data.shape[1] logger.info("Dimension of the projected subspace = {:d}".format( self.dim_projection)) # Create a KNN index for all nearest neighbor tasks self.index_knn = KNNIndex( data, n_neighbors=self.n_neighbors, metric=self.metric, metric_kwargs=self.metric_kwargs, shared_nearest_neighbors=self.shared_nearest_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, seed_rng=self.seed_rng) # Create the adjacency matrix `W` based on the optimal reconstruction weights of neighboring points # (as done in locally linear embedding). # Then calculate the iterated graph Laplacian matrix `M = (I - W)^T (I - W)`. self.create_iterated_laplacian(data) # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest # eigenvalues as the columns of the projection matrix logger.info( "Solving the generalized eigenvalue problem to find the optimal projection matrix." 
) data_trans = data.T # X^T M X lmat = sparse.csr_matrix.dot(data_trans, self.iterated_laplacian_matrix).dot(data) if self.orthogonal: # ONPP, the paper [2] recommends skipping the eigenvector corresponding to the smallest eigenvalue eig_values, eig_vectors = eigh(lmat, eigvals=(1, self.dim_projection)) else: # Standard NPP or NPE # X^T X rmat = np.dot(data_trans, data) eig_values, eig_vectors = eigh(lmat, b=rmat, eigvals=(0, self.dim_projection - 1)) # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered # according to increasing eigenvalues. # `eig_vectors` will have shape `(data.shape[1], self.dim_projection)` self.transform_npp = eig_vectors self.transform_comb = np.dot(self.transform_pca, self.transform_npp) def transform(self, data, dim=None): """ Transform the given data by first subtracting the mean and then applying the linear projection. Optionally, you can specify the dimension of the transformed data using `dim`. This cannot be larger than `self.dim_projection`. :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions. :param dim: If set to `None`, the dimension of the transformed data is `self.dim_projection`. Else `dim` can be set to a value <= `self.dim_projection`. Doing this basically takes only the `dim` top eigenvectors. :return: - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data. """ if dim is None: data_trans = np.dot(data - self.mean_data, self.transform_comb) else: data_trans = np.dot(data - self.mean_data, self.transform_comb[:, 0:dim]) return data_trans def fit_transform(self, data, dim=None): """ Fit the model and transform the given data. :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions. :param dim: same as the `transform` method. :return: - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data. """ self.fit(data) return self.transform(data, dim=dim) def create_iterated_laplacian(self, data): """ Calculate the optimal edge weights corresponding to the nearest neighbors of each point. This is exactly the same as the first step of the locally linear embedding (LLE) method. :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions. :return: None """ # Find the `self.n_neighbors` nearest neighbors of each point nn_indices, nn_distances = self.index_knn.query_self( k=self.n_neighbors) N, K = nn_indices.shape if self.n_jobs == 1: w = [ helper_solve_lle(data, nn_indices, self.reg_eps, i) for i in range(N) ] else: helper_partial = partial(helper_solve_lle, data, nn_indices, self.reg_eps) pool_obj = multiprocessing.Pool(processes=self.n_jobs) w = [] _ = pool_obj.map_async(helper_partial, range(N), callback=w.extend) pool_obj.close() pool_obj.join() # Create a sparse matrix of size `(N, N)` for the adjacency matrix row_ind = np.array([[i] * (K + 1) for i in range(N)], dtype=np.int).ravel() col_ind = np.insert(nn_indices, 0, np.arange(N), axis=1).ravel() w = np.negative(w) vals = np.insert(w, 0, 1.0, axis=1).ravel() # Matrix `I - W` mat_tmp = sparse.csr_matrix((vals, (row_ind, col_ind)), shape=(N, N)) # Matrix `M = (I - W)^T (I - W)`, also referred to as the iterated graph Laplacian self.iterated_laplacian_matrix = sparse.csr_matrix.dot( mat_tmp.transpose(), mat_tmp)
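# Usage sketch (an illustrative addition, not part of the original code): a minimal end-to-end call of the
# NeighborhoodPreservingProjection class defined above on synthetic data. It assumes the class and its helper
# dependencies (KNNIndex, pca_wrapper, estimate_intrinsic_dimension, etc.) are available in the importing module.
import numpy as np

rng = np.random.RandomState(0)
data_example = rng.randn(500, 20)                      # 500 samples in 20 observed dimensions

# Standard NPP; the projected dimension is chosen from the intrinsic dimension estimate
npp = NeighborhoodPreservingProjection(dim_projection='auto', orthogonal=False)
data_low = npp.fit_transform(data_example)             # shape (500, npp.dim_projection)

# ONPP variant with a fixed projected dimension; `transform` can optionally keep fewer eigenvectors
onpp = NeighborhoodPreservingProjection(dim_projection=5, orthogonal=True)
onpp.fit(data_example)
data_low_onpp = onpp.transform(data_example, dim=3)    # keep only the top 3 eigenvectors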
def fit(self, data): """ Find the optimal projection matrix for the given data points. :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of dimensions. :return: """ N, d = data.shape if self.n_neighbors is None: # Set number of nearest neighbors based on the data size and the neighborhood constant self.n_neighbors = int(np.ceil(N**self.neighborhood_constant)) logger.info("Applying PCA as first-level dimension reduction step") data, self.mean_data, self.transform_pca = pca_wrapper( data, cutoff=self.pca_cutoff, seed_rng=self.seed_rng) if self.dim_projection == 'auto': # Estimate the intrinsic dimension of the data and use that as the projected dimension id = estimate_intrinsic_dimension( data, method='two_nn', n_neighbors=self.n_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, seed_rng=self.seed_rng) self.dim_projection = int(np.ceil(id)) logger.info( "Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}." .format(id)) if self.dim_projection >= data.shape[1]: self.dim_projection = data.shape[1] logger.info("Dimension of the projected subspace = {:d}".format( self.dim_projection)) # Create a KNN index for all nearest neighbor tasks self.index_knn = KNNIndex( data, n_neighbors=self.n_neighbors, metric=self.metric, metric_kwargs=self.metric_kwargs, shared_nearest_neighbors=self.shared_nearest_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, seed_rng=self.seed_rng) # Create the symmetric adjacency matrix, diagonal incidence matrix, and the graph Laplacian matrix # for the data points self.create_laplacian_matrix(data) # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest # eigenvalues as the columns of the projection matrix logger.info( "Solving the generalized eigenvalue problem to find the optimal projection matrix." ) data_trans = data.T # X^T L X lmat = sparse.csr_matrix.dot(data_trans, self.laplacian_matrix).dot(data) if self.orthogonal: # Orthogonal LPP eig_values, eig_vectors = eigh(lmat, eigvals=(0, self.dim_projection - 1)) else: # Standard LPP # X^T D X rmat = sparse.csr_matrix.dot(data_trans, self.incidence_matrix).dot(data) eig_values, eig_vectors = eigh(lmat, b=rmat, eigvals=(0, self.dim_projection - 1)) # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered # according to increasing eigenvalues. # `eig_vectors` will have shape `(data.shape[1], self.dim_projection)` self.transform_lpp = eig_vectors self.transform_comb = np.dot(self.transform_pca, self.transform_lpp)
class LocalityPreservingProjection: """ Locality preserving projection (LPP) method for dimensionality reduction [1, 2]. Orthogonal LPP (OLPP) method based on [3]. 1. He, Xiaofei, and Partha Niyogi. "Locality preserving projections." Advances in neural information processing systems. 2004. 2. He, Xiaofei, et al. "Face recognition using LaplacianFaces." IEEE Transactions on Pattern Analysis & Machine Intelligence 3 (2005): 328-340. 3. Kokiopoulou, Effrosyni, and Yousef Saad. "Orthogonal neighborhood preserving projections: A projection-based dimensionality reduction technique." IEEE Transactions on Pattern Analysis and Machine Intelligence, 29.12 (2007): 2143-2156. """ def __init__( self, dim_projection='auto', # 'auto' or positive integer orthogonal=False, # True to enable Orthogonal LPP (OLPP) pca_cutoff=1.0, neighborhood_constant=NEIGHBORHOOD_CONST, n_neighbors=None, # Specify one of them. If `n_neighbors` # is specified, `neighborhood_constant` will be ignored. shared_nearest_neighbors=False, edge_weights='SNN', # Choices are {'simple', 'SNN', 'heat_kernel'} heat_kernel_param=None, # Used only if `edge_weights = 'heat_kernel'` metric=METRIC_DEF, metric_kwargs=None, # distance metric and its parameter dict (if any) approx_nearest_neighbors=True, n_jobs=1, seed_rng=SEED_DEFAULT): """ :param dim_projection: Dimension of data in the projected feature space. If set to 'auto', a suitable reduced dimension will be chosen by estimating the intrinsic dimension of the data. If an integer value is specified, it should be in the range `[1, dim - 1]`, where `dim` is the observed dimension of the data. :param orthogonal: Set to True to select the OLPP method. It was shown to have better performance than LPP in [3]. :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve in the projected dimension-reduced data. PCA is applied as a first-level dimension reduction to handle potential data matrix singularity also. Set `pca_cutoff = 1.0` in order to handle only the data matrix singularity. :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a function of the number of samples (data size). If `N` is the number of samples, then the number of neighbors is set to `N^neighborhood_constant`. It is recommended to set this value in the range 0.4 to 0.5. :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified, the `neighborhood_constant` is ignored. It is sufficient to specify either `neighborhood_constant` or `n_neighbors`. :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance to find the K nearest neighbors. This is a secondary distance metric that is found to be better suited to high dimensional data. This will be set to True if `edge_weights = 'SNN'`. :param edge_weights: Weighting method to use for the edge weights. Valid choices are {'simple', 'SNN', 'heat_kernel'}. They are described below: - 'simple': the edge weight is set to one for every sample pair in the neighborhood. - 'SNN': the shared nearest neighbors (SNN) similarity score between two samples is used as the edge weight. This will be a value in [0, 1]. - 'heat_kernel': the heat (Gaussian) kernel with a suitable scale parameter defines the edge weight. :param heat_kernel_param: Heat kernel scale parameter. If set to `None`, this parameter is set automatically based on the median of the pairwise distances between samples. 
Else a positive real value can be specified. :param metric: string or a callable that specifies the distance metric to be used for the SNN similarity calculation. This is used only if `edge_weights = 'SNN'`. :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a dictionary. Again, this is used only if `edge_weights = 'SNN'`. :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to find the nearest neighbors. This is recommended when the number of points is large and/or when the dimension of the data is high. :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores. :param seed_rng: int value specifying the seed for the random number generator. """ self.dim_projection = dim_projection self.orthogonal = orthogonal self.pca_cutoff = pca_cutoff self.neighborhood_constant = neighborhood_constant self.n_neighbors = n_neighbors self.shared_nearest_neighbors = shared_nearest_neighbors self.edge_weights = edge_weights.lower() self.heat_kernel_param = heat_kernel_param self.metric = metric self.metric_kwargs = metric_kwargs self.approx_nearest_neighbors = approx_nearest_neighbors self.n_jobs = get_num_jobs(n_jobs) self.seed_rng = seed_rng if self.edge_weights not in {'simple', 'snn', 'heat_kernel'}: raise ValueError( "Invalid value '{}' for parameter 'edge_weights'".format( self.edge_weights)) if self.edge_weights == 'snn': self.shared_nearest_neighbors = True self.mean_data = None self.index_knn = None self.adjacency_matrix = None self.incidence_matrix = None self.laplacian_matrix = None self.transform_pca = None self.transform_lpp = None self.transform_comb = None def fit(self, data): """ Find the optimal projection matrix for the given data points. :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of dimensions. :return: """ N, d = data.shape if self.n_neighbors is None: # Set number of nearest neighbors based on the data size and the neighborhood constant self.n_neighbors = int(np.ceil(N**self.neighborhood_constant)) logger.info("Applying PCA as first-level dimension reduction step") data, self.mean_data, self.transform_pca = pca_wrapper( data, cutoff=self.pca_cutoff, seed_rng=self.seed_rng) if self.dim_projection == 'auto': # Estimate the intrinsic dimension of the data and use that as the projected dimension id = estimate_intrinsic_dimension( data, method='two_nn', n_neighbors=self.n_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, seed_rng=self.seed_rng) self.dim_projection = int(np.ceil(id)) logger.info( "Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}." 
.format(id)) if self.dim_projection >= data.shape[1]: self.dim_projection = data.shape[1] logger.info("Dimension of the projected subspace = {:d}".format( self.dim_projection)) # Create a KNN index for all nearest neighbor tasks self.index_knn = KNNIndex( data, n_neighbors=self.n_neighbors, metric=self.metric, metric_kwargs=self.metric_kwargs, shared_nearest_neighbors=self.shared_nearest_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, seed_rng=self.seed_rng) # Create the symmetric adjacency matrix, diagonal incidence matrix, and the graph Laplacian matrix # for the data points self.create_laplacian_matrix(data) # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest # eigenvalues as the columns of the projection matrix logger.info( "Solving the generalized eigenvalue problem to find the optimal projection matrix." ) data_trans = data.T # X^T L X lmat = sparse.csr_matrix.dot(data_trans, self.laplacian_matrix).dot(data) if self.orthogonal: # Orthogonal LPP eig_values, eig_vectors = eigh(lmat, eigvals=(0, self.dim_projection - 1)) else: # Standard LPP # X^T D X rmat = sparse.csr_matrix.dot(data_trans, self.incidence_matrix).dot(data) eig_values, eig_vectors = eigh(lmat, b=rmat, eigvals=(0, self.dim_projection - 1)) # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered # according to increasing eigenvalues. # `eig_vectors` will have shape `(data.shape[1], self.dim_projection)` self.transform_lpp = eig_vectors self.transform_comb = np.dot(self.transform_pca, self.transform_lpp) def transform(self, data, dim=None): """ Transform the given data by first subtracting the mean and then applying the linear projection. Optionally, you can specify the dimension of the transformed data using `dim`. This cannot be larger than `self.dim_projection`. :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions. :param dim: If set to `None`, the dimension of the transformed data is `self.dim_projection`. Else `dim` can be set to a value <= `self.dim_projection`. Doing this basically takes only the `dim` top eigenvectors. :return: - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data. """ if dim is None: data_trans = np.dot(data - self.mean_data, self.transform_comb) else: data_trans = np.dot(data - self.mean_data, self.transform_comb[:, 0:dim]) return data_trans def fit_transform(self, data, dim=None): """ Fit the model and transform the given data. :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions. :param dim: same as the `transform` method. :return: - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data. """ self.fit(data) return self.transform(data, dim=dim) def create_laplacian_matrix(self, data): """ Calculate the graph Laplacian matrix for the given data. :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of dimensions. :return: """ # Find the `self.n_neighbors` nearest neighbors of each point nn_indices, nn_distances = self.index_knn.query_self( k=self.n_neighbors) N, K = nn_indices.shape row_ind = np.array([[i] * K for i in range(N)], dtype=np.int).ravel() col_ind = nn_indices.ravel() if self.edge_weights == 'simple': vals = np.ones(N * K) elif self.edge_weights == 'snn': # SNN distance is the cosine-inverse of the SNN similarity. The range of SNN distances will # be [0, pi / 2]. 
            # Hence, the SNN similarity will be in the range [0, 1].
            vals = np.clip(np.cos(nn_distances).ravel(), 0., None)
        else:
            # Heat kernel
            vals = calculate_heat_kernel(data, nn_indices, self.heat_kernel_param, self.metric,
                                         metric_kwargs=self.metric_kwargs, n_jobs=self.n_jobs).ravel()

        # Adjacency or edge weight matrix (W)
        mat_tmp = sparse.csr_matrix((vals, (row_ind, col_ind)), shape=(N, N))
        self.adjacency_matrix = 0.5 * (mat_tmp + mat_tmp.transpose())

        # Incidence matrix (D)
        vals_diag = self.adjacency_matrix.sum(axis=1)
        vals_diag = np.array(vals_diag[:, 0]).ravel()
        ind = np.arange(N)
        self.incidence_matrix = sparse.csr_matrix((vals_diag, (ind, ind)), shape=(N, N))

        # Graph Laplacian matrix (L = D - W)
        self.laplacian_matrix = self.incidence_matrix - self.adjacency_matrix
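# Usage sketch (an illustrative addition, not part of the original code): applying the LocalityPreservingProjection
# class defined above to synthetic data, once with SNN edge weights and once with the heat kernel weights.
import numpy as np

rng = np.random.RandomState(1)
data_example = rng.randn(400, 15)                      # 400 samples in 15 observed dimensions

# Standard LPP with SNN edge weights (this also forces `shared_nearest_neighbors=True` internally)
lpp = LocalityPreservingProjection(dim_projection='auto', orthogonal=False, edge_weights='SNN')
data_low = lpp.fit_transform(data_example)

# Orthogonal LPP with heat kernel edge weights; the kernel scale is set automatically when left as None
olpp = LocalityPreservingProjection(dim_projection=4, orthogonal=True,
                                    edge_weights='heat_kernel', heat_kernel_param=None)
data_low_olpp = olpp.fit_transform(data_example)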
class KNNClassifier: """ Basic k nearest neighbors classifier that supports approximate nearest neighbor querying and custom distance metrics including shared nearest neighbors. """ def __init__(self, n_neighbors=1, metric=METRIC_DEF, metric_kwargs=None, shared_nearest_neighbors=False, approx_nearest_neighbors=True, n_jobs=1, low_memory=False, seed_rng=SEED_DEFAULT): """ :param n_neighbors: int value specifying the number of nearest neighbors. Should be >= 1. :param metric: string or a callable that specifies the distance metric. :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a dictionary. :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance. This is a secondary distance metric that is found to be better suited to high dimensional data. :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to find the nearest neighbors. This is recommended when the number of points is large and/or when the dimension of the data is high. :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores. :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this is likely to increase the running time. :param seed_rng: int value specifying the seed for the random number generator. """ self.n_neighbors = n_neighbors self.metric = metric self.metric_kwargs = metric_kwargs self.shared_nearest_neighbors = shared_nearest_neighbors self.approx_nearest_neighbors = approx_nearest_neighbors self.n_jobs = get_num_jobs(n_jobs) self.low_memory = low_memory self.seed_rng = seed_rng self.index_knn = None self.y_train = None self.n_classes = None self.labels_dtype = None self.label_enc = None self.label_dec = None def fit(self, X, y, y_unique=None): """ :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples and `d` is the dimension. :param y: numpy array of class labels of shape `(N, )`. :param y_unique: Allows the optional specification of the unique labels. Can be a tuple list, or numpy array of the unique labels. If this is not specified, then it is found using `numpy.unique`. :return: None """ self.labels_dtype = y.dtype # Labels are mapped to dtype int because `numba` does not handle generic numpy arrays if y_unique is None: y_unique = np.unique(y) self.n_classes = len(y_unique) ind = np.arange(self.n_classes) # Mapping from label values to integers and its inverse d = dict(zip(y_unique, ind)) self.label_enc = np.vectorize(d.__getitem__) d = dict(zip(ind, y_unique)) self.label_dec = np.vectorize(d.__getitem__) self.y_train = self.label_enc(y) self.index_knn = KNNIndex( X, n_neighbors=self.n_neighbors, metric=self.metric, metric_kwargs=self.metric_kwargs, shared_nearest_neighbors=self.shared_nearest_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, low_memory=self.low_memory, seed_rng=self.seed_rng ) def predict(self, X, is_train=False): """ Predict the class labels for the given inputs. :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples and `d` is the dimension. :param is_train: Set to True if prediction is being done on the same data used to train. :return: numpy array with the class predictions, of shape `(N, )`. 
""" # Get the indices of the nearest neighbors from the training set if is_train: nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors) else: nn_indices, nn_distances = self.index_knn.query(X, k=self.n_neighbors) labels_pred, _ = helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec, self.n_neighbors) return labels_pred def predict_multiple_k(self, X, k_list, is_train=False): """ Find the KNN predictions for multiple k values specified via the param `k_list`. This is done efficiently by querying for the maximum number of nearest neighbors once and using the results. It is assumed that the values in `k_list` are sorted in increasing order. This is useful while performing a search for the best `k` value using cross-validation. NOTE: The maximum value in `k_list` should be <= `self.n_neighbors`. :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples and `d` is the dimension. :param k_list: list or array of k values for which predictions are to be made. Each value should be an integer >= 1 and the values should be sorted in increasing order. For example, `k_list = [2, 4, 6, 8, 10]`. :param is_train: Set to True if prediction is being done on the same data used to train. :return: numpy array with the class predictions corresponding to each k value in `k_list`. Has shape `(len(k_list), N)`. """ if k_list[-1] > self.n_neighbors: raise ValueError("Invalid input: maximum value in `k_list` cannot be larger than {:d}.". format(self.n_neighbors)) # Query the maximum number of nearest neighbors from `k_list` if is_train: nn_indices, nn_distances = self.index_knn.query_self(k=k_list[-1]) else: nn_indices, nn_distances = self.index_knn.query(X, k=k_list[-1]) if self.n_jobs == 1 or len(k_list) == 1: labels_pred = np.array( [helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec, k)[0] for k in k_list], dtype=self.labels_dtype ) else: helper_partial = partial(helper_knn_predict, nn_indices, self.y_train, self.n_classes, self.label_dec) pool_obj = multiprocessing.Pool(processes=self.n_jobs) outputs = [] _ = pool_obj.map_async(helper_partial, k_list, callback=outputs.extend) pool_obj.close() pool_obj.join() labels_pred = np.array([tup[0] for tup in outputs], dtype=self.labels_dtype) return labels_pred def predict_proba(self, X, is_train=False): """ Estimate the probability of each class along with the predicted most-frequent class. :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples and `d` is the dimension. :param is_train: Set to True if prediction is being done on the same data used to train. :return: - numpy array with the class predictions, of shape `(N, )`. - numpy array with the estimated probability of each class, of shape `(N, self.n_classes)`. Each row should sum to 1. """ # Get the indices of the nearest neighbors from the training set if is_train: nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors) else: nn_indices, nn_distances = self.index_knn.query(X, k=self.n_neighbors) labels_pred, counts = helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec, self.n_neighbors) proba = counts / self.n_neighbors return labels_pred, proba def fit_predict(self, X, y): """ Fit a model and predict on the training data. :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples and `d` is the dimension. :param y: numpy array of class labels of shape `(N, )`. 
:return: numpy array with the class predictions, of shape `(N, )`. """ self.fit(X, y) return self.predict(X, is_train=True)
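# Usage sketch (an illustrative addition, not part of the original code): training the KNNClassifier defined above
# and obtaining predictions for several values of k in one pass with `predict_multiple_k`.
import numpy as np

rng = np.random.RandomState(2)
X_train = rng.randn(300, 10)
y_train = rng.randint(0, 3, size=300)                  # three classes with arbitrary integer labels
X_test = rng.randn(50, 10)

# `n_neighbors` should be at least as large as the largest k that will be queried later
knn = KNNClassifier(n_neighbors=10, approx_nearest_neighbors=False)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)                           # shape (50, )
y_pred_class, proba = knn.predict_proba(X_test)        # `proba` has shape (50, knn.n_classes)

# Predictions for several k values from a single neighbor query; `k_list` must be increasing and <= n_neighbors
y_pred_multi = knn.predict_multiple_k(X_test, k_list=[1, 3, 5, 10])    # shape (4, 50)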
def fit(self, layer_embeddings_normal, labels_normal, labels_pred_normal, layer_embeddings_adversarial, labels_pred_adversarial, layer_embeddings_noisy=None, labels_pred_noisy=None): """ Extract the LID feature vector for normal, noisy, and adversarial samples and train a logistic classifier to separate adversarial samples from (normal + noisy). Cross-validation is used to select the hyper-parameter `C` using area under the ROC curve as the validation metric. NOTE: True labels and predicted labels are required for the normal feature embeddings. Only predicted labels are needed for the noisy and adversarial feature embeddings. :param layer_embeddings_normal: list of numpy arrays with the layer embeddings for normal samples. Length of the list is equal to the number of layers. The numpy array at index `i` has shape `(n, d_i)`, where `n` is the number of samples and `d_i` is the dimension of the embeddings at layer `i`. :param labels_normal: numpy array of class labels for the normal samples. Should have shape `(n, )`, where `n` is the number of normal samples. :param labels_pred_normal: numpy array of DNN classifier predictions for the normal samples. Should have the same shape as `labels_normal`. :param layer_embeddings_adversarial: Same format as `layer_embeddings_normal`, but corresponding to the adversarial samples. :param labels_pred_adversarial: numpy array of DNN classifier predictions for the adversarial samples. Should have shape `(n, )`, where `n` is the number of adversarial samples. :param layer_embeddings_noisy: Same format as `layer_embeddings_normal`, but corresponding to the noisy samples. Can be set to `None` to exclude noisy data from training. :param labels_pred_noisy: numpy array of DNN classifier predictions for the noisy samples. Should have shape `(n, )`, where `n` is the number of noisy samples. Can be set to `None` to exclude noisy data from training. :return: (self, scores_normal, scores_adversarial) if layer_embeddings_noise is None (self, scores_normal, scores_adversarial, scores_noisy) otherwise. ------------------------------------------------------- - self: trained instance of the class. - scores_normal: numpy array with the scores (decision function of the logistic classifier) for normal samples. 1d array with the same number of samples as `layer_embeddings_normal`. - scores_noisy: scores corresponding to `layer_embeddings_noisy` if noisy training data is provided. - scores_adversarial: scores corresponding to `layer_embeddings_adversarial`. 
""" self.n_layers = len(layer_embeddings_normal) logger.info("Number of layer embeddings: {:d}.".format(self.n_layers)) if layer_embeddings_noisy is None: logger.info("Noisy training data not provided.") cond1 = False noisy_data = False else: cond1 = (len(layer_embeddings_noisy) != self.n_layers) noisy_data = True if labels_pred_noisy is None: raise ValueError("Class predictions are not provided for the noisy data") if cond1 or (len(layer_embeddings_adversarial) != self.n_layers): raise ValueError("The layer embeddings for noisy and attack samples must have the same length as that " "of normal samples") if labels_normal.shape != labels_pred_normal.shape: raise ValueError("Length of arrays 'labels_normal' and 'labels_pred_normal' is not equal") # Number of samples in each of the categories self.n_samples = [ layer_embeddings_normal[0].shape[0], layer_embeddings_noisy[0].shape[0] if noisy_data else 0, layer_embeddings_adversarial[0].shape[0] ] # Distinct class labels self.labels_unique = np.unique(labels_normal) for c in self.labels_unique: # Normal labeled samples from class `c` self.indices_true[c] = np.where(labels_normal == c)[0] # Normal samples predicted into class `c` self.indices_pred_normal[c] = np.where(labels_pred_normal == c)[0] # Adversarial samples predicted into class `c` self.indices_pred_adver[c] = np.where(labels_pred_adversarial == c)[0] if noisy_data: # Noisy samples predicted into class `c` self.indices_pred_noisy[c] = np.where(labels_pred_noisy == c)[0] # Number of nearest neighbors per class if self.n_neighbors is None: # Set based on the number of samples from this class and the neighborhood constant self.n_neighbors_per_class[c] = \ int(np.ceil(self.indices_true[c].shape[0] ** self.neighborhood_constant)) else: # Use the value specified as input self.n_neighbors_per_class[c] = self.n_neighbors # The data arrays at all layers should have the same number of samples if not all([layer_embeddings_normal[i].shape[0] == self.n_samples[0] for i in range(self.n_layers)]): raise ValueError("Input 'layer_embeddings_normal' does not have the expected format") if noisy_data: if not all([layer_embeddings_noisy[i].shape[0] == self.n_samples[1] for i in range(self.n_layers)]): raise ValueError("Input 'layer_embeddings_noisy' does not have the expected format") if not all([layer_embeddings_adversarial[i].shape[0] == self.n_samples[2] for i in range(self.n_layers)]): raise ValueError("Input 'layer_embeddings_adversarial' does not have the expected format") if self.save_knn_indices_to_file: # Create a temporary directory for saving the KNN indices self.temp_direc = tempfile.mkdtemp(dir=os.getcwd()) self.temp_knn_files = [''] * self.n_layers # KNN indices for the layer embeddings from each layer and each class self.index_knn = [dict() for _ in range(self.n_layers)] features_lid_normal = np.zeros((self.n_samples[0], self.n_layers)) features_lid_noisy = np.zeros((self.n_samples[1], self.n_layers)) features_lid_adversarial = np.zeros((self.n_samples[2], self.n_layers)) for i in range(self.n_layers): logger.info("Processing layer {:d}:".format(i + 1)) # Dimensionality reduction of the layer embeddings, if required if self.transform_models: data_normal = transform_data_from_model(layer_embeddings_normal[i], self.transform_models[i]) data_adver = transform_data_from_model(layer_embeddings_adversarial[i], self.transform_models[i]) if noisy_data: data_noisy = transform_data_from_model(layer_embeddings_noisy[i], self.transform_models[i]) else: data_noisy = None d1 = 
layer_embeddings_normal[i].shape[1] d2 = data_normal.shape[1] if d2 < d1: logger.info("Input dimension = {:d}, projected dimension = {:d}".format(d1, d2)) else: data_normal = layer_embeddings_normal[i] data_adver = layer_embeddings_adversarial[i] if noisy_data: data_noisy = layer_embeddings_noisy[i] else: data_noisy = None for c in self.labels_unique: logger.info("Building a KNN index on the feature embeddings of normal samples from class {}". format(c)) self.index_knn[i][c] = KNNIndex( data_normal[self.indices_true[c], :], n_neighbors=self.n_neighbors_per_class[c], metric=self.metric, metric_kwargs=self.metric_kwargs, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, low_memory=self.low_memory, seed_rng=self.seed_rng ) logger.info("Calculating LID estimates for the normal, noisy, and adversarial layer embeddings " "predicted into class {}".format(c)) # Distance to nearest neighbors of all labeled samples from class `c` _, nn_distances_temp = self.index_knn[i][c].query_self(k=self.n_neighbors_per_class[c]) n_pred_normal = self.indices_pred_normal[c].shape[0] n_pred_adver = self.indices_pred_adver[c].shape[0] if noisy_data: n_pred_noisy = self.indices_pred_noisy[c].shape[0] else: n_pred_noisy = 0 if n_pred_normal: # Distance to nearest neighbors of samples predicted into class `c` that are also labeled as # class `c`. These samples will be a part of the KNN index nn_distances = helper_knn_distance(self.indices_pred_normal[c], self.indices_true[c], nn_distances_temp) mask = (nn_distances[:, 0] < 0.) if np.any(mask): # Distance to nearest neighbors of samples predicted into class `c` that are not labeled as # class `c`. These samples will not be a part of the KNN index ind_comp = self.indices_pred_normal[c][mask] _, temp_arr = self.index_knn[i][c].query(data_normal[ind_comp, :], k=self.n_neighbors_per_class[c]) nn_distances[mask, :] = temp_arr # LID estimates for the normal feature embeddings predicted into class `c` features_lid_normal[self.indices_pred_normal[c], i] = lid_mle_amsaleg(nn_distances) # LID estimates for the noisy feature embeddings predicted into class `c` if n_pred_noisy: temp_arr = data_noisy[self.indices_pred_noisy[c], :] _, nn_distances = self.index_knn[i][c].query(temp_arr, k=self.n_neighbors_per_class[c]) features_lid_noisy[self.indices_pred_noisy[c], i] = lid_mle_amsaleg(nn_distances) # LID estimates for the adversarial feature embeddings predicted into class `c` if n_pred_adver: temp_arr = data_adver[self.indices_pred_adver[c], :] _, nn_distances = self.index_knn[i][c].query(temp_arr, k=self.n_neighbors_per_class[c]) features_lid_adversarial[self.indices_pred_adver[c], i] = lid_mle_amsaleg(nn_distances) if self.save_knn_indices_to_file: logger.info("Saving the KNN indices per class from layer {:d} to a pickle file".format(i + 1)) self.temp_knn_files[i] = os.path.join(self.temp_direc, 'knn_indices_layer_{:d}.pkl'.format(i + 1)) with open(self.temp_knn_files[i], 'wb') as fp: pickle.dump(self.index_knn[i], fp) # Free up the allocated memory self.index_knn[i] = None # LID feature vectors and labels for the binary logistic classifier. 
# Normal and noisy samples are given label 0 and adversarial samples are given label 1 n_pos = features_lid_adversarial.shape[0] if noisy_data: features_lid = np.concatenate([features_lid_normal, features_lid_noisy, features_lid_adversarial], axis=0) labels_bin = np.concatenate([np.zeros(features_lid_normal.shape[0], dtype=np.int), np.zeros(features_lid_noisy.shape[0], dtype=np.int), np.ones(n_pos, dtype=np.int)]) else: features_lid = np.concatenate([features_lid_normal, features_lid_adversarial], axis=0) labels_bin = np.concatenate([np.zeros(features_lid_normal.shape[0], dtype=np.int), np.ones(n_pos, dtype=np.int)]) pos_prop = n_pos / float(labels_bin.shape[0]) # Randomly shuffle the samples to avoid determinism ind_perm = np.random.permutation(labels_bin.shape[0]) features_lid = features_lid[ind_perm, :] labels_bin = labels_bin[ind_perm] # Min-max scaling for the LID features self.scaler = MinMaxScaler().fit(features_lid) features_lid = self.scaler.transform(features_lid) logger.info("Training a binary logistic classifier with {:d} samples and {:d} LID features.". format(*features_lid.shape)) logger.info("Using {:d}-fold cross-validation with area under ROC curve as the metric to select " "the best regularization hyperparameter.".format(self.n_cv_folds)) logger.info("Proportion of positive (adversarial or OOD) samples in the training data: {:.4f}". format(pos_prop)) class_weight = None if self.balanced_classification: if (pos_prop < 0.45) or (pos_prop > 0.55): class_weight = {0: 1.0 / (1 - pos_prop), 1: 1.0 / pos_prop} logger.info("Balancing the classes by assigning sample weight {:.4f} to class 0 and sample weight " "{:.4f} to class 1".format(class_weight[0], class_weight[1])) self.model_logistic = LogisticRegressionCV( Cs=self.c_search_values, cv=self.n_cv_folds, penalty='l2', scoring='roc_auc', multi_class='auto', class_weight=class_weight, max_iter=self.max_iter, refit=True, n_jobs=self.n_jobs, random_state=self.seed_rng ).fit(features_lid, labels_bin) # Larger values of this score correspond to a higher probability of predicting class 1 (adversarial) scores_normal = self.model_logistic.decision_function(self.scaler.transform(features_lid_normal)) scores_adversarial = self.model_logistic.decision_function(self.scaler.transform(features_lid_adversarial)) if noisy_data: scores_noisy = self.model_logistic.decision_function(self.scaler.transform(features_lid_noisy)) return self, scores_normal, scores_adversarial, scores_noisy else: return self, scores_normal, scores_adversarial
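# Illustrative sketch (not the original implementation): the maximum-likelihood LID estimator of Amsaleg et al.
# that the helper `lid_mle_amsaleg` used above presumably computes. For each sample, given the sorted distances
# r_1 <= ... <= r_k to its k nearest neighbors, the estimate is
#     LID_hat = - ( (1/k) * sum_{i=1..k} log(r_i / r_k) )^{-1}
import numpy as np

def lid_mle_sketch(nn_distances, epsilon=1e-12):
    """
    :param nn_distances: numpy array of shape `(N, k)` with the sorted nearest-neighbor distances of `N` samples.
    :return: numpy array of shape `(N, )` with the per-sample LID estimates.
    """
    k = nn_distances.shape[1]
    # Clip to avoid log(0) when duplicate points lead to zero distances
    dist = np.clip(nn_distances, epsilon, None)
    log_ratio = np.log(dist / dist[:, -1].reshape(-1, 1))
    return -1.0 / (np.sum(log_ratio, axis=1) / k)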
class averaged_KLPE_anomaly_detection: def __init__(self, neighborhood_constant=NEIGHBORHOOD_CONST, n_neighbors=None, standardize=True, metric=METRIC_DEF, metric_kwargs=None, shared_nearest_neighbors=False, approx_nearest_neighbors=True, n_jobs=1, low_memory=False, seed_rng=SEED_DEFAULT): """ :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a function of the number of samples (data size). If `N` is the number of samples, then the number of neighbors is set to `N^neighborhood_constant`. It is recommended to set this value in the range 0.4 to 0.5. :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified, the `neighborhood_constant` is ignored. It is sufficient to specify either `neighborhood_constant` or `n_neighbors`. :param standardize: Set to True to standardize the individual features to the range [-1, 1]. :param metric: string or a callable that specifies the distance metric. :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a dictionary. :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance. This is a secondary distance metric that is found to be better suited to high dimensional data. :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to find the nearest neighbors. This is recommended when the number of points is large and/or when the dimension of the data is high. :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores. :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this is likely to increase the running time. :param seed_rng: int value specifying the seed for the random number generator. """ self.neighborhood_constant = neighborhood_constant self.n_neighbors = n_neighbors self.standardize = standardize self.metric = metric self.metric_kwargs = metric_kwargs self.shared_nearest_neighbors = shared_nearest_neighbors self.approx_nearest_neighbors = approx_nearest_neighbors self.n_jobs = get_num_jobs(n_jobs) self.low_memory = low_memory self.seed_rng = seed_rng self.scaler = None self.data_train = None self.neighborhood_range = None self.index_knn = None self.dist_stat_nominal = None np.random.seed(self.seed_rng) def fit(self, data): """ :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number of dimensions (features). :return: None """ N, d = data.shape if self.standardize: self.scaler = MinMaxScaler(feature_range=(-1, 1)).fit(data) data = self.scaler.transform(data) if self.shared_nearest_neighbors: self.data_train = data if self.n_neighbors is None: # Set number of nearest neighbors based on the data size and the neighborhood constant self.n_neighbors = int(np.ceil(N**self.neighborhood_constant)) # The distance statistic is averaged over this neighborhood range low = self.n_neighbors - int(np.floor(0.5 * (self.n_neighbors - 1))) high = self.n_neighbors + int(np.floor(0.5 * self.n_neighbors)) self.neighborhood_range = (low, high) logger.info("Number of samples: {:d}. 
Number of features: {:d}".format( N, d)) logger.info( "Range of nearest neighbors used for the averaged K-LPE statistic: ({:d}, {:d})" .format(low, high)) # Build the KNN graph self.index_knn = KNNIndex( data, n_neighbors=self.neighborhood_range[1], metric=self.metric, metric_kwargs=self.metric_kwargs, shared_nearest_neighbors=self.shared_nearest_neighbors, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, low_memory=self.low_memory, seed_rng=self.seed_rng) # Compute the distance statistic for every data point self.dist_stat_nominal = self.distance_statistic(data, exclude_self=True) def score(self, data_test, exclude_self=False, return_distances=False): """ Calculate the anomaly score which is the negative log of the empirical p-value of the averaged KNN distance. :param data_test: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number of dimensions (features). :param exclude_self: Set to True if the points in `data` were already used to build the KNN index. :param return_distances: Set to True in order to include the distance statistics along with the negative log p-value scores in the returned tuple. :return score: numpy array of shape `(N, )` containing the score for each point. Points with higher score are more likely to be anomalous. Returned only if `return_distances` is set to True. dist: numpy array of shape `(N, )` containing the distance statistic for each point. """ # Calculate the k-nearest neighbors based distance statistic dist_stat_test = self.distance_statistic(data_test, exclude_self=exclude_self) # Negative log of the empirical p-value p = pvalue_score(self.dist_stat_nominal, dist_stat_test, log_transform=True, bootstrap=True) if return_distances: return p, dist_stat_test else: return p def distance_statistic(self, data, exclude_self=False): """ Calculate the average distance statistic by querying the nearest neighbors of the given set of points. :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number of dimensions (features). :param exclude_self: Set to True if the points in `data` were already used to build the KNN index. :return dist_stat: numpy array of distance statistic for each point. """ if exclude_self: # Data should be already scaled in the `fit` method nn_indices, nn_distances = self.index_knn.query_self( k=self.neighborhood_range[1]) else: if self.standardize: data = self.scaler.transform(data) nn_indices, nn_distances = self.index_knn.query( data, k=self.neighborhood_range[1]) if self.shared_nearest_neighbors: # The distance statistic is calculated based on the primary distance metric, but within the # neighborhood set found using the SNN distance. The idea is that for high-dimensional data, # the neighborhood found using SNN is more reliable dist_stat = self.distance_statistic_local( data, nn_indices, self.neighborhood_range[0]) else: dist_stat = np.mean(nn_distances[:, (self.neighborhood_range[0] - 1):], axis=1) return dist_stat def distance_statistic_local(self, data, nn_indices, k): """ Computes the mean distance statistic for each row of `data` within a local neighborhood specified by `nn_indices`. :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number of dimensions (features). :param nn_indices: numpy array of `p` nearest neighbor indices with shape `(N, p)`. :param k: start index of the neighbor from which the mean distance is computed. 
:return dist_array: numpy array of shape `(N, )` with the mean distance values. """ n = data.shape[0] if self.n_jobs == 1: dist_stat = [ helper_distance(data, self.data_train, nn_indices, self.metric, self.metric_kwargs, k, i) for i in range(n) ] else: helper_distance_partial = partial(helper_distance, data, self.data_train, nn_indices, self.metric, self.metric_kwargs, k) pool_obj = multiprocessing.Pool(processes=self.n_jobs) dist_stat = [] _ = pool_obj.map_async(helper_distance_partial, range(n), callback=dist_stat.extend) pool_obj.close() pool_obj.join() return np.array(dist_stat)
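# Usage sketch (an illustrative addition, not part of the original code): fitting the averaged_KLPE_anomaly_detection
# detector defined above on nominal data and scoring a test set; higher scores indicate more anomalous points.
import numpy as np

rng = np.random.RandomState(3)
data_nominal = rng.randn(1000, 8)                      # nominal training data
data_test = np.vstack([rng.randn(90, 8),               # mostly nominal test points
                       rng.randn(10, 8) + 6.0])        # a few shifted (anomalous) points

detector = averaged_KLPE_anomaly_detection(standardize=True, approx_nearest_neighbors=False)
detector.fit(data_nominal)

scores = detector.score(data_test)                                        # negative log p-values
scores2, dist_stat = detector.score(data_test, return_distances=True)     # also return the distance statistic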
    def fit(self, layer_embeddings, labels):
        """
        Estimate the parameters of the detection method given natural (non-adversarial) input data. Note that
        this data should be different from the data used to train the DNN classifier.

        NOTE: Inputs to this method can be obtained by calling the function `extract_layer_embeddings`.

        :param layer_embeddings: list of numpy arrays with the layer embedding data. The length of the list is
                                 equal to the number of layers. The numpy array at index `i` has shape
                                 `(n, d_i)`, where `n` is the number of samples and `d_i` is the dimension of
                                 the embeddings at layer `i`.
        :param labels: numpy array of labels for the classification problem addressed by the DNN. Should have
                       shape `(n, )`, where `n` is the number of samples.

        :return: Instance of the class with all parameters fit to the data.
        """
        self.n_layers = len(layer_embeddings)
        self.labels_unique = np.unique(labels)
        self.n_classes = len(self.labels_unique)
        self.n_samples = labels.shape[0]
        # Mapping from the original labels to the set {0, 1, ..., self.n_classes - 1}. This is needed by the
        # label count function
        d = dict(zip(self.labels_unique, np.arange(self.n_classes)))
        self.label_encoder = np.vectorize(d.__getitem__)

        # Number of nearest neighbors
        if self.n_neighbors is None:
            # Set the number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(self.n_samples ** self.neighborhood_constant))

        logger.info("Number of classes: {:d}.".format(self.n_classes))
        logger.info("Number of layer embeddings: {:d}.".format(self.n_layers))
        logger.info("Number of samples: {:d}.".format(self.n_samples))
        logger.info("Number of neighbors: {:d}.".format(self.n_neighbors))
        if not all([layer_embeddings[i].shape[0] == self.n_samples for i in range(self.n_layers)]):
            raise ValueError("Input 'layer_embeddings' does not have the expected format")

        self.labels_train_enc = self.label_encoder(labels)
        indices_true = dict()
        # `np.bool` is deprecated in recent numpy versions; the builtin `bool` is equivalent here
        self.mask_exclude = np.ones((self.n_classes, self.n_classes), dtype=bool)
        for j, c in enumerate(self.labels_unique):
            # Indices of labeled samples from class `c`
            indices_true[c] = np.where(labels == c)[0]
            self.mask_exclude[j, j] = False

        self.nonconformity_calib = np.zeros(self.n_samples)
        self.index_knn = [None for _ in range(self.n_layers)]
        for i in range(self.n_layers):
            logger.info("Processing layer {:d}:".format(i + 1))
            if self.transform_models:
                logger.info("Transforming the embeddings from layer {:d}.".format(i + 1))
                data_proj = transform_data_from_model(layer_embeddings[i], self.transform_models[i])
                logger.info("Input dimension = {:d}, projected dimension = {:d}".format(
                    layer_embeddings[i].shape[1], data_proj.shape[1]))
            else:
                data_proj = layer_embeddings[i]

            logger.info("Building a KNN index for nearest neighbor queries.")
            # Build a KNN index on the set of feature embeddings from normal samples at layer `i`
            self.index_knn[i] = KNNIndex(
                data_proj,
                n_neighbors=self.n_neighbors,
                metric=self.metric, metric_kwargs=self.metric_kwargs,
                approx_nearest_neighbors=self.approx_nearest_neighbors,
                n_jobs=self.n_jobs,
                low_memory=self.low_memory,
                seed_rng=self.seed_rng
            )
            # Indices of the nearest neighbors of each sample
            nn_indices, _ = self.index_knn[i].query_self(k=self.n_neighbors)

            logger.info("Calculating the class label counts and non-conformity scores in the neighborhood of "
                        "each sample.")
            _, nc_counts = neighbors_label_counts(nn_indices, self.labels_train_enc, self.n_classes)

            for j, c in enumerate(self.labels_unique):
                # Neighborhood counts of all classes except `c`
                nc_counts_slice = nc_counts[:, self.mask_exclude[j, :]]
                # Nonconformity from layer `i` for all labeled samples from class `c`
                self.nonconformity_calib[indices_true[c]] += np.sum(nc_counts_slice[indices_true[c], :], axis=1)

        return self
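# A standalone sketch (an assumption, not the project's `neighbors_label_counts`) of the per-sample
# nonconformity accumulated above: for each sample, count how many of its k nearest neighbors carry a
# class label different from the sample's own label.
import numpy as np

def nonconformity_from_neighbors(nn_indices, y_enc, n_classes):
    # `nn_indices`: integer array of shape (n, k) with neighbor indices into the training set
    # `y_enc`: integer-encoded labels of shape (n, ), values in {0, ..., n_classes - 1}
    neighbor_labels = y_enc[nn_indices]                                   # shape (n, k)
    counts = np.stack([(neighbor_labels == c).sum(axis=1)
                       for c in range(n_classes)], axis=1)                # shape (n, n_classes)
    # Nonconformity of each sample w.r.t. its own class: neighbors that do not share its label
    own = counts[np.arange(y_enc.shape[0]), y_enc]
    return counts.sum(axis=1) - own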
def estimate_intrinsic_dimension(data,
                                 method='two_nn',      # method choices are {'two_nn', 'lid_mle'}
                                 neighborhood_constant=NEIGHBORHOOD_CONST,
                                 n_neighbors=None,
                                 metric='euclidean',
                                 metric_kwargs=None,
                                 approx_nearest_neighbors=True,
                                 n_jobs=1,
                                 low_memory=False,
                                 seed_rng=SEED_DEFAULT):
    """
    Wrapper function for estimating the intrinsic dimension of the data.

    :param data: data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number of
                 features.
    :param method: method string. Valid choices are 'two_nn' and 'lid_mle'.
    :param neighborhood_constant: float value in (0, 1) that specifies the number of nearest neighbors as a
                                  function of the number of samples (data size). If `N` is the number of
                                  samples, then the number of neighbors is set to `N^neighborhood_constant`.
                                  It is recommended to set this value in the range 0.4 to 0.5.
    :param n_neighbors: None or an int value specifying the number of nearest neighbors. If this value is
                        specified, then `neighborhood_constant` is ignored. It is sufficient to specify either
                        `neighborhood_constant` or `n_neighbors`.
    :param metric: distance metric to use. Euclidean by default.
    :param metric_kwargs: optional keyword arguments for the distance metric, specified as a dict.
    :param approx_nearest_neighbors: Set to True to use an approximate nearest neighbor method. Usually the
                                     right choice unless both the number of samples and the number of features
                                     are small.
    :param n_jobs: number of CPU cores to use.
    :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                       is likely to increase the running time.
    :param seed_rng: seed for the random number generator.

    :return: positive float value specifying the estimated intrinsic dimension.
    """
    # Build a KNN graph index
    index_knn = KNNIndex(data,
                         neighborhood_constant=neighborhood_constant,
                         n_neighbors=n_neighbors,
                         metric=metric, metric_kwargs=metric_kwargs,
                         shared_nearest_neighbors=False,
                         approx_nearest_neighbors=approx_nearest_neighbors,
                         n_jobs=n_jobs,
                         low_memory=low_memory,
                         seed_rng=seed_rng)
    # Query the nearest neighbors of each point
    nn_indices, nn_distances = index_knn.query_self()

    method = method.lower()
    if method == 'two_nn':
        # Two nearest neighbors ID estimator
        id = id_two_nearest_neighbors(nn_distances)
    elif method == 'lid_mle':
        # Median of the local intrinsic dimension estimates around each point
        id = np.median(lid_mle_amsaleg(nn_distances))
    else:
        raise ValueError("Invalid value '{}' specified for argument 'method'".format(method))

    return id
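# A rough sketch (an assumption) of the Two-NN estimator (Facco et al., 2017) that `id_two_nearest_neighbors`
# is expected to implement: the ratio mu_i = r_2(i) / r_1(i) of the distances to the second and first nearest
# neighbor of each point follows a Pareto law whose exponent equals the intrinsic dimension, with
# maximum-likelihood estimate N / sum_i log(mu_i). The actual helper may differ, e.g. by fitting the
# empirical CDF of the ratios instead.
import numpy as np

def two_nn_dimension_sketch(nn_distances):
    # `nn_distances`: array of shape (N, k), k >= 2, columns sorted by increasing distance (self excluded)
    r1 = np.clip(nn_distances[:, 0], 1e-12, None)
    mu = np.clip(nn_distances[:, 1] / r1, 1.0 + 1e-12, None)
    return nn_distances.shape[0] / np.sum(np.log(mu))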
    def fit(self, data, labels, labels_pred):
        """
        Estimate the `1 - alpha` density level sets for each class using the given data, with true labels and
        classifier-predicted labels. These are used to calculate the trust score.

        :param data: numpy array with the feature vectors of shape `(n, d)`, where `n` and `d` are the number
                     of samples and the data dimension respectively.
        :param labels: numpy array of labels for the classification problem addressed by the DNN. Should have
                       shape `(n, )`, where `n` is the number of samples.
        :param labels_pred: numpy array similar to `labels`, but with the classes predicted by the classifier.

        :return: Instance of the class with all parameters fit to the data.
        """
        self.n_samples, dim = data.shape
        self.labels_unique = np.unique(labels)
        self.n_classes = len(self.labels_unique)
        if self.n_neighbors is None:
            # Set the number of nearest neighbors based on the maximum number of samples per class and the
            # neighborhood constant
            num = 0
            for c in self.labels_unique:
                ind = np.where(labels == c)[0]
                if ind.shape[0] > num:
                    num = ind.shape[0]

            self.n_neighbors = int(np.ceil(num ** self.neighborhood_constant))

        logger.info("Number of samples: {:d}. Data dimension = {:d}.".format(self.n_samples, dim))
        logger.info("Number of classes: {:d}.".format(self.n_classes))
        logger.info("Number of neighbors (k): {:d}.".format(self.n_neighbors))
        logger.info("Fraction of outliers (alpha): {:.4f}.".format(self.alpha))
        if self.model_dim_reduction:
            data = transform_data_from_model(data, self.model_dim_reduction)
            dim = data.shape[1]
            logger.info("Applying dimension reduction to the data. Projected dimension = {:d}.".format(dim))

        # Distance from each sample in `data` to the `1 - alpha` level sets corresponding to each class
        distance_level_sets = np.zeros((self.n_samples, self.n_classes))
        self.index_knn = dict()
        self.epsilon = dict()
        indices_sub = dict()
        for j, c in enumerate(self.labels_unique):
            logger.info("Processing data from class '{}':".format(c))
            logger.info("Building a KNN index for all the samples from class '{}'.".format(c))
            indices_sub[c] = np.where(labels == c)[0]
            data_sub = data[indices_sub[c], :]
            self.index_knn[c] = KNNIndex(
                data_sub,
                n_neighbors=self.n_neighbors,
                metric=self.metric, metric_kwargs=self.metric_kwargs,
                approx_nearest_neighbors=self.approx_nearest_neighbors,
                n_jobs=self.n_jobs,
                low_memory=self.low_memory,
                seed_rng=self.seed_rng
            )
            # Distances to the k nearest neighbors of each sample
            _, nn_distances = self.index_knn[c].query_self(k=self.n_neighbors)
            # Radius, i.e. the distance to the k-th nearest neighbor of each sample
            radius_arr = nn_distances[:, self.n_neighbors - 1]

            # Smallest radius `epsilon` such that only an `alpha` fraction of the samples from class `c` have
            # radius greater than `epsilon`
            if self.alpha > 0.:
                # Note: newer numpy versions use the keyword `method` instead of `interpolation`
                self.epsilon[c] = np.percentile(radius_arr, 100 * (1 - self.alpha), interpolation='midpoint')

                # Exclude the outliers and build a KNN index with the remaining samples
                mask_incl = radius_arr <= self.epsilon[c]
                mask_excl = np.logical_not(mask_incl)
                num_excl = mask_excl[mask_excl].shape[0]
            else:
                # Slightly larger value than the largest radius
                self.epsilon[c] = 1.0001 * np.max(radius_arr)

                # All samples are included in the density level set.
                # `np.bool` is deprecated in recent numpy versions; the builtin `bool` is equivalent here
                mask_incl = np.ones(indices_sub[c].shape[0], dtype=bool)
                mask_excl = np.logical_not(mask_incl)
                num_excl = 0

            if num_excl:
                logger.info("Excluding {:d} samples with radius larger than {:.6f} and building a KNN index "
                            "with the remaining samples.".format(num_excl, self.epsilon[c]))
                self.index_knn[c] = KNNIndex(
                    data_sub[mask_incl, :],
                    n_neighbors=self.n_neighbors,
                    metric=self.metric, metric_kwargs=self.metric_kwargs,
                    approx_nearest_neighbors=self.approx_nearest_neighbors,
                    n_jobs=self.n_jobs,
                    low_memory=self.low_memory,
                    seed_rng=self.seed_rng
                )
                # Distance to the nearest neighbor of each sample that is part of the KNN index
                _, dist_temp = self.index_knn[c].query_self(k=1)
                ind = indices_sub[c][mask_incl]
                distance_level_sets[ind, j] = dist_temp[:, 0]

                # Distance to the nearest neighbor of each sample that is not part of the KNN index (outliers)
                _, dist_temp = self.index_knn[c].query(data_sub[mask_excl, :], k=1)
                ind = indices_sub[c][mask_excl]
                distance_level_sets[ind, j] = dist_temp[:, 0]
            else:
                # No need to rebuild the KNN index because no samples are excluded.
                # Distance to the nearest neighbor of each sample
                distance_level_sets[indices_sub[c], j] = nn_distances[:, 0]

        logger.info("Calculating the trust score for the estimation data.")
        for c in self.labels_unique:
            # Compute the distance from each sample of class `c` to the level sets of the remaining classes
            data_sub = data[indices_sub[c], :]
            for j, c_hat in enumerate(self.labels_unique):
                if c_hat == c:
                    continue

                _, dist_temp = self.index_knn[c_hat].query(data_sub, k=1)
                distance_level_sets[indices_sub[c], j] = dist_temp[:, 0]

        self.scores_estim = self._score_helper(distance_level_sets, labels_pred)
        return self
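# A sketch (an assumption, not the project's `_score_helper`) of the trust score computed from the per-class
# level-set distances, following the definition in Jiang et al. (2018): the ratio of the distance to the
# closest level set of any class other than the predicted class, to the distance to the level set of the
# predicted class. `labels_pred_enc` is assumed to already be encoded as column indices into
# `distance_level_sets`; the small constant `eps` is a guess to avoid division by zero.
import numpy as np

def trust_score_sketch(distance_level_sets, labels_pred_enc, eps=1e-16):
    n = distance_level_sets.shape[0]
    d_pred = distance_level_sets[np.arange(n), labels_pred_enc]
    d_other = distance_level_sets.copy()
    d_other[np.arange(n), labels_pred_enc] = np.inf
    # Larger values indicate higher trust in the classifier's prediction
    return d_other.min(axis=1) / (d_pred + eps)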
def set_kernel_scale(layer_embeddings_train, layer_embeddings_test, metric='euclidean', n_neighbors=10,
                     n_jobs=1, search_size=20, alpha=0.5):
    # `layer_embeddings_train` and `layer_embeddings_test` are both lists of numpy arrays, one per layer
    n_layers = len(layer_embeddings_test)
    n_test = layer_embeddings_test[0].shape[0]
    # n_train = layer_embeddings_train[0].shape[0]

    # `1 - epsilon` values
    v = np.linspace(0.05, 0.95, num=search_size)
    sigma_multiplier = np.sqrt(-1. / np.log(v))
    sigma_per_layer = np.ones((n_test, n_layers))
    for i in range(n_layers):
        if metric == 'cosine':
            # For the cosine distance, scale the layer embedding vectors to have unit norm
            norm_train = np.linalg.norm(layer_embeddings_train[i], axis=1) + NORM_REG
            x_train = layer_embeddings_train[i] / norm_train[:, np.newaxis]
            norm_test = np.linalg.norm(layer_embeddings_test[i], axis=1) + NORM_REG
            x_test = layer_embeddings_test[i] / norm_test[:, np.newaxis]
        else:
            x_train = layer_embeddings_train[i]
            x_test = layer_embeddings_test[i]

        # Build a KNN index on the layer embeddings from the train split
        index_knn = KNNIndex(x_train, n_neighbors=n_neighbors, metric='euclidean',
                             approx_nearest_neighbors=True, n_jobs=n_jobs)
        # Query the nearest neighbors of the layer embeddings from the test split.
        # `nn_indices` and `nn_distances` both have shape `(n_test, n_neighbors)`
        nn_indices, nn_distances = index_knn.query(x_test, k=n_neighbors)

        # Candidate sigma values are obtained by multiplying `sqrt(\eta_k^2 - \eta_1^2)` of each test point
        # with the `sigma_multiplier` defined earlier. Here `\eta_k` and `\eta_1` denote the distance to the
        # k-th and the first nearest neighbor respectively.
        # `sigma_cand_vals` has shape `(n_test, search_size)`
        sigma_cand_vals = (np.sqrt(nn_distances[:, -1] ** 2 - nn_distances[:, 0] ** 2).reshape(n_test, 1) *
                           sigma_multiplier.reshape(1, search_size))

        # Pairwise distances between the points in `layer_embeddings_test` and `layer_embeddings_train`.
        # `dist_mat` has shape `(n_test, n_train)`
        dist_mat = pairwise_distances(x_test, Y=x_train, metric='euclidean', n_jobs=n_jobs)

        # Calculate the objective function to be maximized for the different candidate `sigma` values
        if n_jobs == 1:
            out = [helper_objective(nn_distances, dist_mat, alpha, sigma_cand_vals, t)
                   for t in range(search_size)]
        else:
            # Partial function called by multiprocessing
            helper_objective_partial = partial(helper_objective, nn_distances, dist_mat, alpha, sigma_cand_vals)
            pool_obj = multiprocessing.Pool(processes=n_jobs)
            out = []
            _ = pool_obj.map_async(helper_objective_partial, range(search_size), callback=out.extend)
            pool_obj.close()
            pool_obj.join()

        # `out` is a list of length `search_size`, where each element is a numpy array with the objective
        # function values for the `n_test` samples. Therefore `objec_arr` has shape `(search_size, n_test)`
        objec_arr = np.array(out)

        # Select the sigma value that maximizes the objective function for each test sample
        ind_max = np.argmax(objec_arr, axis=0)
        sigma_per_layer[:, i] = [sigma_cand_vals[j, ind_max[j]] for j in range(n_test)]

    return sigma_per_layer
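# Illustrative call of `set_kernel_scale` (shapes only; the embeddings here are random stand-ins, and the
# function's dependencies such as `KNNIndex` are assumed to be importable). Each list element holds one
# layer's embeddings; the returned array gives one kernel bandwidth per test point and per layer.
import numpy as np

rng = np.random.RandomState(0)
layer_embeddings_train = [rng.randn(500, 64), rng.randn(500, 128)]
layer_embeddings_test = [rng.randn(100, 64), rng.randn(100, 128)]

sigma = set_kernel_scale(layer_embeddings_train, layer_embeddings_test,
                         metric='euclidean', n_neighbors=10, n_jobs=1, search_size=20, alpha=0.5)
print(sigma.shape)   # (100, 2): one sigma per test point, per layer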