class NeighborhoodPreservingProjection:
    """
    Neighborhood preserving projection (NPP) method for dimensionality reduction. Also known as neighborhood
    preserving embedding (NPE) [1]. The orthogonal neighborhood preserving projection (ONPP) method is based
    on [2].

    1. He, Xiaofei, et al. "Neighborhood preserving embedding." Tenth IEEE International Conference on Computer
       Vision (ICCV'05) Volume 1. Vol. 2. IEEE, 2005.
    2. Kokiopoulou, Effrosyni, and Yousef Saad. "Orthogonal neighborhood preserving projections: A projection-based
       dimensionality reduction technique." IEEE Transactions on Pattern Analysis and Machine Intelligence,
       29.12 (2007): 2143-2156.
    """
    def __init__(self,
                 dim_projection='auto',                     # 'auto' or positive integer
                 orthogonal=False,                          # True to enable the orthogonal NPP (ONPP) method
                 pca_cutoff=1.0,
                 neighborhood_constant=NEIGHBORHOOD_CONST, n_neighbors=None,
                 # Specify one of them. If `n_neighbors` is specified, `neighborhood_constant` is ignored
                 shared_nearest_neighbors=False,
                 metric=METRIC_DEF, metric_kwargs=None,     # distance metric and its parameter dict (if any)
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 reg_eps=0.001,
                 seed_rng=SEED_DEFAULT):
        """
        :param dim_projection: Dimension of the data in the projected feature space. If set to 'auto', a suitable
                               reduced dimension is chosen by estimating the intrinsic dimension of the data. If
                               an integer value is specified, it should be in the range `[1, dim - 1]`, where
                               `dim` is the observed dimension of the data.
        :param orthogonal: Set to True to select the ONPP method. It was shown to have better performance than
                           NPP in [2].
        :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                           in the PCA dimension-reduced data. PCA is applied as a first-level dimension reduction
                           that also handles potential singularity of the data matrix. Set `pca_cutoff = 1.0` to
                           handle only the data matrix singularity.
        :param neighborhood_constant: float value in (0, 1) that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of
                                      samples, the number of neighbors is set to `N ** neighborhood_constant`.
                                      It is recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is
                            specified, `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param shared_nearest_neighbors: Set to True to use the shared nearest neighbor (SNN) distance to find
                                         the K nearest neighbors. This is a secondary distance metric that is
                                         found to be better suited to high-dimensional data.
        :param metric: string or callable that specifies the distance metric to be used for the SNN similarity
                       calculation.
        :param metric_kwargs: optional keyword arguments required by the distance metric, specified as a dict.
        :param approx_nearest_neighbors: Set to True to use an approximate nearest neighbor algorithm to find the
                                         nearest neighbors. This is recommended when the number of points is large
                                         and/or the dimension of the data is high.
        :param n_jobs: number of parallel jobs or processes. Set to -1 to use all the available CPU cores.
        :param reg_eps: small float value that multiplies the trace to regularize the Gram matrix, if it is close
                        to singular.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.dim_projection = dim_projection
        self.orthogonal = orthogonal
        self.pca_cutoff = pca_cutoff
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.reg_eps = reg_eps
        self.seed_rng = seed_rng

        self.mean_data = None
        self.index_knn = None
        self.iterated_laplacian_matrix = None
        self.transform_pca = None
        self.transform_npp = None
        self.transform_comb = None

    def fit(self, data):
        """
        Find the optimal projection matrix for the given data points.

        :param data: data matrix of shape `(N, d)`, where `N` is the number of samples and `d` is the number of
                     dimensions.
        :return: None
        """
        N, d = data.shape
        if self.n_neighbors is None:
            # Set the number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N ** self.neighborhood_constant))

        logger.info("Applying PCA as first-level dimension reduction step")
        data, self.mean_data, self.transform_pca = pca_wrapper(data, cutoff=self.pca_cutoff,
                                                               seed_rng=self.seed_rng)

        # If `self.n_neighbors >= data.shape[1]` (number of neighbors at least as large as the data dimension),
        # the Gram matrix that comes up while solving for the neighborhood weights becomes singular. To avoid
        # this, we reduce `self.n_neighbors` to `data.shape[1] - 1`; alternatively, a small nonzero value can be
        # added to the diagonal elements of the Gram matrix
        d = data.shape[1]
        if self.n_neighbors >= d:
            k = max(d - 1, 1)
            logger.info("Reducing the number of neighbors from {:d} to {:d} to avoid a singular Gram matrix "
                        "while solving for the neighborhood weights.".format(self.n_neighbors, k))
            self.n_neighbors = k

        if self.dim_projection == 'auto':
            # Estimate the intrinsic dimension of the data and use that as the projected dimension
            id = estimate_intrinsic_dimension(data,
                                              method='two_nn',
                                              n_neighbors=self.n_neighbors,
                                              approx_nearest_neighbors=self.approx_nearest_neighbors,
                                              n_jobs=self.n_jobs,
                                              seed_rng=self.seed_rng)
            self.dim_projection = int(np.ceil(id))
            logger.info("Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}.".format(id))

        if self.dim_projection >= data.shape[1]:
            self.dim_projection = data.shape[1]

        logger.info("Dimension of the projected subspace = {:d}".format(self.dim_projection))

        # Create a KNN index for all nearest neighbor tasks
        self.index_knn = KNNIndex(data,
                                  n_neighbors=self.n_neighbors,
                                  metric=self.metric, metric_kwargs=self.metric_kwargs,
                                  shared_nearest_neighbors=self.shared_nearest_neighbors,
                                  approx_nearest_neighbors=self.approx_nearest_neighbors,
                                  n_jobs=self.n_jobs,
                                  seed_rng=self.seed_rng)

        # Create the adjacency matrix `W` based on the optimal reconstruction weights of neighboring points
        # (as done in locally linear embedding). Then calculate the iterated graph Laplacian matrix
        # `M = (I - W)^T (I - W)`
        self.create_iterated_laplacian(data)

        # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest
        # eigenvalues as the columns of the projection matrix
        logger.info("Solving the generalized eigenvalue problem to find the optimal projection matrix.")
        data_trans = data.T
        # X^T M X
        lmat = sparse.csr_matrix.dot(data_trans, self.iterated_laplacian_matrix).dot(data)
        if self.orthogonal:
            # ONPP. The paper [2] recommends skipping the eigenvector corresponding to the smallest eigenvalue
            eig_values, eig_vectors = eigh(lmat, eigvals=(1, self.dim_projection))
        else:
            # Standard NPP or NPE
            # X^T X
            rmat = np.dot(data_trans, data)
            eig_values, eig_vectors = eigh(lmat, b=rmat, eigvals=(0, self.dim_projection - 1))

        # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered
        # according to increasing eigenvalues. `eig_vectors` will have shape
        # `(data.shape[1], self.dim_projection)`
        self.transform_npp = eig_vectors
        self.transform_comb = np.dot(self.transform_pca, self.transform_npp)

    def transform(self, data, dim=None):
        """
        Transform the given data by first subtracting the mean and then applying the linear projection.
        Optionally, the dimension of the transformed data can be specified via `dim`. This cannot be larger
        than `self.dim_projection`.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: If set to None, the dimension of the transformed data is `self.dim_projection`. Else `dim`
                    can be set to a value <= `self.dim_projection`, which amounts to taking only the top `dim`
                    eigenvectors.
        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        if dim is None:
            data_trans = np.dot(data - self.mean_data, self.transform_comb)
        else:
            data_trans = np.dot(data - self.mean_data, self.transform_comb[:, 0:dim])

        return data_trans

    def fit_transform(self, data, dim=None):
        """
        Fit the model and transform the given data.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: same as the `transform` method.
        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        self.fit(data)
        return self.transform(data, dim=dim)

    def create_iterated_laplacian(self, data):
        """
        Calculate the optimal edge weights corresponding to the nearest neighbors of each point. This is exactly
        the same as the first step of the locally linear embedding (LLE) method.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :return: None
        """
        # Find the `self.n_neighbors` nearest neighbors of each point
        nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors)
        N, K = nn_indices.shape

        if self.n_jobs == 1:
            w = [helper_solve_lle(data, nn_indices, self.reg_eps, i) for i in range(N)]
        else:
            helper_partial = partial(helper_solve_lle, data, nn_indices, self.reg_eps)
            pool_obj = multiprocessing.Pool(processes=self.n_jobs)
            w = []
            _ = pool_obj.map_async(helper_partial, range(N), callback=w.extend)
            pool_obj.close()
            pool_obj.join()

        # Create a sparse matrix of size `(N, N)` for the adjacency matrix
        row_ind = np.array([[i] * (K + 1) for i in range(N)], dtype=int).ravel()
        col_ind = np.insert(nn_indices, 0, np.arange(N), axis=1).ravel()
        w = np.negative(w)
        vals = np.insert(w, 0, 1.0, axis=1).ravel()

        # Matrix `I - W`
        mat_tmp = sparse.csr_matrix((vals, (row_ind, col_ind)), shape=(N, N))
        # Matrix `M = (I - W)^T (I - W)`, also referred to as the iterated graph Laplacian
        self.iterated_laplacian_matrix = sparse.csr_matrix.dot(mat_tmp.transpose(), mat_tmp)
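
# A minimal usage sketch for `NeighborhoodPreservingProjection` (an illustrative addition, not part of the
# original API; the function name and the synthetic data are hypothetical). It assumes the module's own
# dependencies (`KNNIndex`, `pca_wrapper`, `estimate_intrinsic_dimension`, etc.) are importable.
def _example_npp_usage():
    rng = np.random.RandomState(0)
    # 500 noisy samples lying close to a 5-dimensional linear subspace embedded in 20 dimensions
    data = np.dot(rng.randn(500, 5), rng.randn(5, 20)) + 0.01 * rng.randn(500, 20)

    model = NeighborhoodPreservingProjection(dim_projection='auto', orthogonal=True)
    data_proj = model.fit_transform(data)
    # `data_proj` has shape `(500, model.dim_projection)`; with `dim_projection='auto'` the projected
    # dimension is set from the intrinsic dimension estimate (expected to be close to 5 here)
    return data_proj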
class LocalityPreservingProjection:
    """
    Locality preserving projection (LPP) method for dimensionality reduction [1, 2]. The orthogonal LPP (OLPP)
    method is based on [3].

    1. He, Xiaofei, and Partha Niyogi. "Locality preserving projections." Advances in Neural Information
       Processing Systems. 2004.
    2. He, Xiaofei, et al. "Face recognition using LaplacianFaces." IEEE Transactions on Pattern Analysis &
       Machine Intelligence 3 (2005): 328-340.
    3. Kokiopoulou, Effrosyni, and Yousef Saad. "Orthogonal neighborhood preserving projections: A projection-based
       dimensionality reduction technique." IEEE Transactions on Pattern Analysis and Machine Intelligence,
       29.12 (2007): 2143-2156.
    """
    def __init__(self,
                 dim_projection='auto',                     # 'auto' or positive integer
                 orthogonal=False,                          # True to enable the orthogonal LPP (OLPP) method
                 pca_cutoff=1.0,
                 neighborhood_constant=NEIGHBORHOOD_CONST, n_neighbors=None,
                 # Specify one of them. If `n_neighbors` is specified, `neighborhood_constant` is ignored
                 shared_nearest_neighbors=False,
                 edge_weights='SNN',                        # Choices are {'simple', 'SNN', 'heat_kernel'}
                 heat_kernel_param=None,                    # Used only if `edge_weights = 'heat_kernel'`
                 metric=METRIC_DEF, metric_kwargs=None,     # distance metric and its parameter dict (if any)
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 seed_rng=SEED_DEFAULT):
        """
        :param dim_projection: Dimension of the data in the projected feature space. If set to 'auto', a suitable
                               reduced dimension is chosen by estimating the intrinsic dimension of the data. If
                               an integer value is specified, it should be in the range `[1, dim - 1]`, where
                               `dim` is the observed dimension of the data.
        :param orthogonal: Set to True to select the OLPP method. It was shown to have better performance than
                           LPP in [3].
        :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                           in the PCA dimension-reduced data. PCA is applied as a first-level dimension reduction
                           that also handles potential singularity of the data matrix. Set `pca_cutoff = 1.0` to
                           handle only the data matrix singularity.
        :param neighborhood_constant: float value in (0, 1) that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of
                                      samples, the number of neighbors is set to `N ** neighborhood_constant`.
                                      It is recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is
                            specified, `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param shared_nearest_neighbors: Set to True to use the shared nearest neighbor (SNN) distance to find
                                         the K nearest neighbors. This is a secondary distance metric that is
                                         found to be better suited to high-dimensional data. This will be set to
                                         True if `edge_weights = 'SNN'`.
        :param edge_weights: Weighting method to use for the edge weights. Valid choices are
                             {'simple', 'SNN', 'heat_kernel'}. They are described below:
                             - 'simple': the edge weight is set to one for every sample pair in the neighborhood.
                             - 'SNN': the shared nearest neighbors (SNN) similarity score between two samples is
                               used as the edge weight. This will be a value in [0, 1].
                             - 'heat_kernel': the heat (Gaussian) kernel with a suitable scale parameter defines
                               the edge weight.
        :param heat_kernel_param: Heat kernel scale parameter. If set to None, this parameter is set automatically
                                  based on the median of the pairwise distances between samples. Else a positive
                                  real value can be specified.
        :param metric: string or callable that specifies the distance metric to be used for the SNN similarity
                       calculation. This is used only if `edge_weights = 'SNN'`.
        :param metric_kwargs: optional keyword arguments required by the distance metric, specified as a dict.
                              Again, this is used only if `edge_weights = 'SNN'`.
        :param approx_nearest_neighbors: Set to True to use an approximate nearest neighbor algorithm to find the
                                         nearest neighbors. This is recommended when the number of points is large
                                         and/or the dimension of the data is high.
        :param n_jobs: number of parallel jobs or processes. Set to -1 to use all the available CPU cores.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.dim_projection = dim_projection
        self.orthogonal = orthogonal
        self.pca_cutoff = pca_cutoff
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.edge_weights = edge_weights.lower()
        self.heat_kernel_param = heat_kernel_param
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.seed_rng = seed_rng

        if self.edge_weights not in {'simple', 'snn', 'heat_kernel'}:
            raise ValueError("Invalid value '{}' for parameter 'edge_weights'".format(self.edge_weights))

        if self.edge_weights == 'snn':
            self.shared_nearest_neighbors = True

        self.mean_data = None
        self.index_knn = None
        self.adjacency_matrix = None
        self.incidence_matrix = None
        self.laplacian_matrix = None
        self.transform_pca = None
        self.transform_lpp = None
        self.transform_comb = None

    def fit(self, data):
        """
        Find the optimal projection matrix for the given data points.

        :param data: data matrix of shape `(N, d)`, where `N` is the number of samples and `d` is the number of
                     dimensions.
        :return: None
        """
        N, d = data.shape
        if self.n_neighbors is None:
            # Set the number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N ** self.neighborhood_constant))

        logger.info("Applying PCA as first-level dimension reduction step")
        data, self.mean_data, self.transform_pca = pca_wrapper(data, cutoff=self.pca_cutoff,
                                                               seed_rng=self.seed_rng)

        if self.dim_projection == 'auto':
            # Estimate the intrinsic dimension of the data and use that as the projected dimension
            id = estimate_intrinsic_dimension(data,
                                              method='two_nn',
                                              n_neighbors=self.n_neighbors,
                                              approx_nearest_neighbors=self.approx_nearest_neighbors,
                                              n_jobs=self.n_jobs,
                                              seed_rng=self.seed_rng)
            self.dim_projection = int(np.ceil(id))
            logger.info("Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}.".format(id))

        if self.dim_projection >= data.shape[1]:
            self.dim_projection = data.shape[1]

        logger.info("Dimension of the projected subspace = {:d}".format(self.dim_projection))

        # Create a KNN index for all nearest neighbor tasks
        self.index_knn = KNNIndex(data,
                                  n_neighbors=self.n_neighbors,
                                  metric=self.metric, metric_kwargs=self.metric_kwargs,
                                  shared_nearest_neighbors=self.shared_nearest_neighbors,
                                  approx_nearest_neighbors=self.approx_nearest_neighbors,
                                  n_jobs=self.n_jobs,
                                  seed_rng=self.seed_rng)

        # Create the symmetric adjacency matrix, diagonal incidence matrix, and the graph Laplacian matrix
        # for the data points
        self.create_laplacian_matrix(data)

        # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest
        # eigenvalues as the columns of the projection matrix
        logger.info("Solving the generalized eigenvalue problem to find the optimal projection matrix.")
        data_trans = data.T
        # X^T L X
        lmat = sparse.csr_matrix.dot(data_trans, self.laplacian_matrix).dot(data)
        if self.orthogonal:
            # Orthogonal LPP
            eig_values, eig_vectors = eigh(lmat, eigvals=(0, self.dim_projection - 1))
        else:
            # Standard LPP
            # X^T D X
            rmat = sparse.csr_matrix.dot(data_trans, self.incidence_matrix).dot(data)
            eig_values, eig_vectors = eigh(lmat, b=rmat, eigvals=(0, self.dim_projection - 1))

        # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered
        # according to increasing eigenvalues. `eig_vectors` will have shape
        # `(data.shape[1], self.dim_projection)`
        self.transform_lpp = eig_vectors
        self.transform_comb = np.dot(self.transform_pca, self.transform_lpp)

    def transform(self, data, dim=None):
        """
        Transform the given data by first subtracting the mean and then applying the linear projection.
        Optionally, the dimension of the transformed data can be specified via `dim`. This cannot be larger
        than `self.dim_projection`.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: If set to None, the dimension of the transformed data is `self.dim_projection`. Else `dim`
                    can be set to a value <= `self.dim_projection`, which amounts to taking only the top `dim`
                    eigenvectors.
        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        if dim is None:
            data_trans = np.dot(data - self.mean_data, self.transform_comb)
        else:
            data_trans = np.dot(data - self.mean_data, self.transform_comb[:, 0:dim])

        return data_trans

    def fit_transform(self, data, dim=None):
        """
        Fit the model and transform the given data.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: same as the `transform` method.
        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        self.fit(data)
        return self.transform(data, dim=dim)

    def create_laplacian_matrix(self, data):
        """
        Calculate the graph Laplacian matrix for the given data.

        :param data: data matrix of shape `(N, d)`, where `N` is the number of samples and `d` is the number of
                     dimensions.
        :return: None
        """
        # Find the `self.n_neighbors` nearest neighbors of each point
        nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors)
        N, K = nn_indices.shape

        row_ind = np.array([[i] * K for i in range(N)], dtype=int).ravel()
        col_ind = nn_indices.ravel()
        if self.edge_weights == 'simple':
            vals = np.ones(N * K)
        elif self.edge_weights == 'snn':
            # The SNN distance is the inverse cosine (arccosine) of the SNN similarity. The range of SNN
            # distances is [0, pi / 2]; hence, the SNN similarity will be in the range [0, 1]
            vals = np.clip(np.cos(nn_distances).ravel(), 0., None)
        else:
            # Heat kernel
            vals = calculate_heat_kernel(data, nn_indices, self.heat_kernel_param, self.metric,
                                         metric_kwargs=self.metric_kwargs, n_jobs=self.n_jobs).ravel()

        # Adjacency or edge weight matrix (W)
        mat_tmp = sparse.csr_matrix((vals, (row_ind, col_ind)), shape=(N, N))
        self.adjacency_matrix = 0.5 * (mat_tmp + mat_tmp.transpose())

        # Incidence matrix (D): a diagonal matrix with the degree (row sum of W) of each node on the diagonal
        vals_diag = self.adjacency_matrix.sum(axis=1)
        vals_diag = np.array(vals_diag[:, 0]).ravel()
        ind = np.arange(N)
        self.incidence_matrix = sparse.csr_matrix((vals_diag, (ind, ind)), shape=(N, N))

        # Graph Laplacian matrix (L = D - W)
        self.laplacian_matrix = self.incidence_matrix - self.adjacency_matrix
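
# A minimal usage sketch for `LocalityPreservingProjection` (an illustrative addition; the function name and
# synthetic data are hypothetical). Setting `dim_projection` explicitly skips the intrinsic dimension estimation.
def _example_lpp_usage():
    rng = np.random.RandomState(0)
    data = np.dot(rng.randn(500, 5), rng.randn(5, 20)) + 0.01 * rng.randn(500, 20)

    model = LocalityPreservingProjection(dim_projection=2, orthogonal=True, edge_weights='heat_kernel')
    data_proj = model.fit_transform(data)      # shape `(500, 2)`
    # Alternatively, fit once and transform to a lower dimension than `dim_projection`:
    # model.fit(data); data_proj_1d = model.transform(data, dim=1)
    return data_proj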
class averaged_KLPE_anomaly_detection:
    """
    Anomaly detection method based on an averaged K-nearest-neighbors distance statistic. The anomaly score of a
    test point is the negative log of the empirical p-value of its distance statistic relative to the nominal
    (training) data.
    """
    def __init__(self,
                 neighborhood_constant=NEIGHBORHOOD_CONST, n_neighbors=None,
                 standardize=True,
                 metric=METRIC_DEF, metric_kwargs=None,
                 shared_nearest_neighbors=False,
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 low_memory=False,
                 seed_rng=SEED_DEFAULT):
        """
        :param neighborhood_constant: float value in (0, 1) that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of
                                      samples, the number of neighbors is set to `N ** neighborhood_constant`.
                                      It is recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is
                            specified, `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param standardize: Set to True to scale the individual features to the range [-1, 1].
        :param metric: string or callable that specifies the distance metric.
        :param metric_kwargs: optional keyword arguments required by the distance metric, specified as a dict.
        :param shared_nearest_neighbors: Set to True to use the shared nearest neighbor (SNN) distance. This is
                                         a secondary distance metric that is found to be better suited to
                                         high-dimensional data.
        :param approx_nearest_neighbors: Set to True to use an approximate nearest neighbor algorithm to find the
                                         nearest neighbors. This is recommended when the number of points is large
                                         and/or the dimension of the data is high.
        :param n_jobs: number of parallel jobs or processes. Set to -1 to use all the available CPU cores.
        :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                           is likely to increase the running time.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.standardize = standardize
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.low_memory = low_memory
        self.seed_rng = seed_rng

        self.scaler = None
        self.data_train = None
        self.neighborhood_range = None
        self.index_knn = None
        self.dist_stat_nominal = None
        np.random.seed(self.seed_rng)

    def fit(self, data):
        """
        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the
                     number of dimensions (features).
        :return: None
        """
        N, d = data.shape
        if self.standardize:
            self.scaler = MinMaxScaler(feature_range=(-1, 1)).fit(data)
            data = self.scaler.transform(data)

        if self.shared_nearest_neighbors:
            self.data_train = data

        if self.n_neighbors is None:
            # Set the number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N ** self.neighborhood_constant))

        # The distance statistic is averaged over this neighborhood range
        low = self.n_neighbors - int(np.floor(0.5 * (self.n_neighbors - 1)))
        high = self.n_neighbors + int(np.floor(0.5 * self.n_neighbors))
        self.neighborhood_range = (low, high)
        logger.info("Number of samples: {:d}. Number of features: {:d}".format(N, d))
        logger.info("Range of nearest neighbors used for the averaged K-LPE statistic: ({:d}, {:d})"
                    .format(low, high))

        # Build the KNN graph
        self.index_knn = KNNIndex(data,
                                  n_neighbors=self.neighborhood_range[1],
                                  metric=self.metric, metric_kwargs=self.metric_kwargs,
                                  shared_nearest_neighbors=self.shared_nearest_neighbors,
                                  approx_nearest_neighbors=self.approx_nearest_neighbors,
                                  n_jobs=self.n_jobs,
                                  low_memory=self.low_memory,
                                  seed_rng=self.seed_rng)

        # Compute the distance statistic for every data point
        self.dist_stat_nominal = self.distance_statistic(data, exclude_self=True)

    def score(self, data_test, exclude_self=False, return_distances=False):
        """
        Calculate the anomaly score, which is the negative log of the empirical p-value of the averaged KNN
        distance.

        :param data_test: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the
                          number of dimensions (features).
        :param exclude_self: Set to True if the points in `data_test` were already used to build the KNN index.
        :param return_distances: Set to True to include the distance statistics along with the negative log
                                 p-value scores in the returned tuple.
        :return:
            - score: numpy array of shape `(N, )` containing the score for each point. Points with higher scores
                     are more likely to be anomalous.
            - dist: numpy array of shape `(N, )` containing the distance statistic for each point. Returned only
                    if `return_distances` is set to True.
        """
        # Calculate the k-nearest-neighbors-based distance statistic
        dist_stat_test = self.distance_statistic(data_test, exclude_self=exclude_self)

        # Negative log of the empirical p-value
        p = pvalue_score(self.dist_stat_nominal, dist_stat_test, log_transform=True, bootstrap=True)
        if return_distances:
            return p, dist_stat_test
        else:
            return p

    def distance_statistic(self, data, exclude_self=False):
        """
        Calculate the average distance statistic by querying the nearest neighbors of the given set of points.

        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the
                     number of dimensions (features).
        :param exclude_self: Set to True if the points in `data` were already used to build the KNN index.
        :return dist_stat: numpy array of the distance statistic for each point.
        """
        if exclude_self:
            # Data should already be scaled in the `fit` method
            nn_indices, nn_distances = self.index_knn.query_self(k=self.neighborhood_range[1])
        else:
            if self.standardize:
                data = self.scaler.transform(data)

            nn_indices, nn_distances = self.index_knn.query(data, k=self.neighborhood_range[1])

        if self.shared_nearest_neighbors:
            # The distance statistic is calculated based on the primary distance metric, but within the
            # neighborhood set found using the SNN distance. The idea is that for high-dimensional data, the
            # neighborhood found using SNN is more reliable
            dist_stat = self.distance_statistic_local(data, nn_indices, self.neighborhood_range[0])
        else:
            dist_stat = np.mean(nn_distances[:, (self.neighborhood_range[0] - 1):], axis=1)

        return dist_stat

    def distance_statistic_local(self, data, nn_indices, k):
        """
        Compute the mean distance statistic for each row of `data` within a local neighborhood specified by
        `nn_indices`.

        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the
                     number of dimensions (features).
        :param nn_indices: numpy array of `p` nearest neighbor indices with shape `(N, p)`.
        :param k: start index of the neighbor from which the mean distance is computed.
        :return dist_array: numpy array of shape `(N, )` with the mean distance values.
        """
        n = data.shape[0]
        if self.n_jobs == 1:
            dist_stat = [helper_distance(data, self.data_train, nn_indices, self.metric, self.metric_kwargs,
                                         k, i) for i in range(n)]
        else:
            helper_distance_partial = partial(helper_distance, data, self.data_train, nn_indices, self.metric,
                                              self.metric_kwargs, k)
            pool_obj = multiprocessing.Pool(processes=self.n_jobs)
            dist_stat = []
            _ = pool_obj.map_async(helper_distance_partial, range(n), callback=dist_stat.extend)
            pool_obj.close()
            pool_obj.join()

        return np.array(dist_stat)
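
# A minimal usage sketch for `averaged_KLPE_anomaly_detection` (an illustrative addition; the function name and
# synthetic data are hypothetical). The detector is fit on nominal data only; test points far from the nominal
# distribution should receive larger scores.
def _example_klpe_usage():
    rng = np.random.RandomState(0)
    data_nominal = rng.randn(1000, 10)
    data_test = np.concatenate([rng.randn(50, 10),           # nominal-like test points
                                rng.randn(10, 10) + 5.0])    # shifted, likely anomalous test points

    detector = averaged_KLPE_anomaly_detection(n_neighbors=20)
    detector.fit(data_nominal)
    # Negative log empirical p-values; higher scores indicate points more likely to be anomalous
    scores = detector.score(data_test)
    return scores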
class KNNClassifier:
    """
    Basic k nearest neighbors classifier that supports approximate nearest neighbor querying and custom
    distance metrics, including shared nearest neighbors.
    """
    def __init__(self,
                 n_neighbors=1,
                 metric=METRIC_DEF, metric_kwargs=None,
                 shared_nearest_neighbors=False,
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 low_memory=False,
                 seed_rng=SEED_DEFAULT):
        """
        :param n_neighbors: int value specifying the number of nearest neighbors. Should be >= 1.
        :param metric: string or callable that specifies the distance metric.
        :param metric_kwargs: optional keyword arguments required by the distance metric, specified as a dict.
        :param shared_nearest_neighbors: Set to True to use the shared nearest neighbor (SNN) distance. This is
                                         a secondary distance metric that is found to be better suited to
                                         high-dimensional data.
        :param approx_nearest_neighbors: Set to True to use an approximate nearest neighbor algorithm to find the
                                         nearest neighbors. This is recommended when the number of points is large
                                         and/or the dimension of the data is high.
        :param n_jobs: number of parallel jobs or processes. Set to -1 to use all the available CPU cores.
        :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                           is likely to increase the running time.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.low_memory = low_memory
        self.seed_rng = seed_rng

        self.index_knn = None
        self.y_train = None
        self.n_classes = None
        self.labels_dtype = None
        self.label_enc = None
        self.label_dec = None

    def fit(self, X, y, y_unique=None):
        """
        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param y: numpy array of class labels of shape `(N, )`.
        :param y_unique: Allows the optional specification of the unique labels. Can be a tuple, list, or numpy
                         array of the unique labels. If this is not specified, it is found using `numpy.unique`.
        :return: None
        """
        self.labels_dtype = y.dtype
        # Labels are mapped to dtype int because `numba` does not handle generic numpy arrays
        if y_unique is None:
            y_unique = np.unique(y)

        self.n_classes = len(y_unique)
        ind = np.arange(self.n_classes)
        # Mapping from label values to integers and its inverse
        d = dict(zip(y_unique, ind))
        self.label_enc = np.vectorize(d.__getitem__)

        d = dict(zip(ind, y_unique))
        self.label_dec = np.vectorize(d.__getitem__)
        self.y_train = self.label_enc(y)

        self.index_knn = KNNIndex(X,
                                  n_neighbors=self.n_neighbors,
                                  metric=self.metric, metric_kwargs=self.metric_kwargs,
                                  shared_nearest_neighbors=self.shared_nearest_neighbors,
                                  approx_nearest_neighbors=self.approx_nearest_neighbors,
                                  n_jobs=self.n_jobs,
                                  low_memory=self.low_memory,
                                  seed_rng=self.seed_rng)

    def predict(self, X, is_train=False):
        """
        Predict the class labels for the given inputs.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param is_train: Set to True if prediction is being done on the same data used to train.
        :return: numpy array with the class predictions, of shape `(N, )`.
        """
        # Get the indices of the nearest neighbors from the training set
        if is_train:
            nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors)
        else:
            nn_indices, nn_distances = self.index_knn.query(X, k=self.n_neighbors)

        labels_pred, _ = helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec,
                                            self.n_neighbors)
        return labels_pred

    def predict_multiple_k(self, X, k_list, is_train=False):
        """
        Find the KNN predictions for multiple k values specified via the param `k_list`. This is done
        efficiently by querying for the maximum number of nearest neighbors once and reusing the results. It is
        assumed that the values in `k_list` are sorted in increasing order. This is useful while searching for
        the best `k` value using cross-validation.

        NOTE: The maximum value in `k_list` should be <= `self.n_neighbors`.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param k_list: list or array of k values for which predictions are to be made. Each value should be an
                       integer >= 1 and the values should be sorted in increasing order. For example,
                       `k_list = [2, 4, 6, 8, 10]`.
        :param is_train: Set to True if prediction is being done on the same data used to train.
        :return: numpy array with the class predictions corresponding to each k value in `k_list`. Has shape
                 `(len(k_list), N)`.
        """
        if k_list[-1] > self.n_neighbors:
            raise ValueError("Invalid input: maximum value in `k_list` cannot be larger than {:d}."
                             .format(self.n_neighbors))

        # Query the maximum number of nearest neighbors from `k_list`
        if is_train:
            nn_indices, nn_distances = self.index_knn.query_self(k=k_list[-1])
        else:
            nn_indices, nn_distances = self.index_knn.query(X, k=k_list[-1])

        if self.n_jobs == 1 or len(k_list) == 1:
            labels_pred = np.array(
                [helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec, k)[0]
                 for k in k_list],
                dtype=self.labels_dtype
            )
        else:
            helper_partial = partial(helper_knn_predict, nn_indices, self.y_train, self.n_classes,
                                     self.label_dec)
            pool_obj = multiprocessing.Pool(processes=self.n_jobs)
            outputs = []
            _ = pool_obj.map_async(helper_partial, k_list, callback=outputs.extend)
            pool_obj.close()
            pool_obj.join()

            labels_pred = np.array([tup[0] for tup in outputs], dtype=self.labels_dtype)

        return labels_pred

    def predict_proba(self, X, is_train=False):
        """
        Estimate the probability of each class along with the predicted most-frequent class.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param is_train: Set to True if prediction is being done on the same data used to train.
        :return:
            - numpy array with the class predictions, of shape `(N, )`.
            - numpy array with the estimated probability of each class, of shape `(N, self.n_classes)`.
              Each row should sum to 1.
        """
        # Get the indices of the nearest neighbors from the training set
        if is_train:
            nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors)
        else:
            nn_indices, nn_distances = self.index_knn.query(X, k=self.n_neighbors)

        labels_pred, counts = helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec,
                                                 self.n_neighbors)
        proba = counts / self.n_neighbors

        return labels_pred, proba

    def fit_predict(self, X, y):
        """
        Fit a model and predict on the training data.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param y: numpy array of class labels of shape `(N, )`.
        :return: numpy array with the class predictions, of shape `(N, )`.
        """
        self.fit(X, y)
        return self.predict(X, is_train=True)
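
# A minimal usage sketch for `KNNClassifier` (an illustrative addition; the function name and synthetic data
# are hypothetical). It also shows `predict_multiple_k`, which reuses a single neighbor query for several
# values of `k`.
def _example_knn_classifier_usage():
    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = (X[:, 0] > 0).astype(int)   # a simple linearly separable labeling

    clf = KNNClassifier(n_neighbors=10)
    clf.fit(X, y)
    labels, proba = clf.predict_proba(X, is_train=True)
    # Predictions for several `k` values (sorted, with max <= `n_neighbors`) from one neighbor query
    labels_multi_k = clf.predict_multiple_k(X, [2, 5, 10], is_train=True)
    return labels, proba, labels_multi_k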
def estimate_intrinsic_dimension(data,
                                 method='two_nn',   # method choices are {'two_nn', 'lid_mle'}
                                 neighborhood_constant=NEIGHBORHOOD_CONST, n_neighbors=None,
                                 metric='euclidean', metric_kwargs=None,
                                 approx_nearest_neighbors=True,
                                 n_jobs=1,
                                 low_memory=False,
                                 seed_rng=SEED_DEFAULT):
    """
    Wrapper function for estimating the intrinsic dimension of the data.

    :param data: data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number of
                 features.
    :param method: method string. Valid choices are 'two_nn' and 'lid_mle'.
    :param neighborhood_constant: float value in (0, 1) that specifies the number of nearest neighbors as a
                                  function of the number of samples (data size). If `N` is the number of samples,
                                  the number of neighbors is set to `N ** neighborhood_constant`. It is
                                  recommended to set this value in the range 0.4 to 0.5.
    :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                        `neighborhood_constant` is ignored. It is sufficient to specify either
                        `neighborhood_constant` or `n_neighbors`.
    :param metric: distance metric to use. Euclidean by default.
    :param metric_kwargs: optional keyword arguments for the distance metric, specified as a dict.
    :param approx_nearest_neighbors: Set to True to use an approximate nearest neighbor method. Usually the right
                                     choice unless both the number of samples and the number of features are
                                     small.
    :param n_jobs: number of CPU cores to use.
    :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this is
                       likely to increase the running time.
    :param seed_rng: seed for the random number generator.
    :return: positive float value specifying the estimated intrinsic dimension.
    """
    # Build a KNN graph index
    index_knn = KNNIndex(data,
                         neighborhood_constant=neighborhood_constant, n_neighbors=n_neighbors,
                         metric=metric, metric_kwargs=metric_kwargs,
                         shared_nearest_neighbors=False,
                         approx_nearest_neighbors=approx_nearest_neighbors,
                         n_jobs=n_jobs,
                         low_memory=low_memory,
                         seed_rng=seed_rng)
    # Query the nearest neighbors of each point
    nn_indices, nn_distances = index_knn.query_self()

    method = method.lower()
    if method == 'two_nn':
        # Two nearest neighbors ID estimator
        id = id_two_nearest_neighbors(nn_distances)
    elif method == 'lid_mle':
        # Median of the local intrinsic dimension estimates around each point
        id = np.median(lid_mle_amsaleg(nn_distances))
    else:
        raise ValueError("Invalid value '{}' specified for argument 'method'".format(method))

    return id
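
# A minimal usage sketch for `estimate_intrinsic_dimension` (an illustrative addition; the function name and
# synthetic data are hypothetical). On data lying near a 3-dimensional linear subspace embedded in 25
# dimensions, both estimators should return a value close to 3.
def _example_id_estimation():
    rng = np.random.RandomState(0)
    data = np.dot(rng.randn(2000, 3), rng.randn(3, 25)) + 0.01 * rng.randn(2000, 25)

    id_two_nn = estimate_intrinsic_dimension(data, method='two_nn')
    id_lid = estimate_intrinsic_dimension(data, method='lid_mle')
    return id_two_nn, id_lid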