def fit(self, data):
        """
        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number
                     of dimensions (features).
        :return: None
        """
        N, d = data.shape
        if self.standardize:
            self.scaler = MinMaxScaler(feature_range=(-1, 1)).fit(data)
            data = self.scaler.transform(data)

        if self.shared_nearest_neighbors:
            self.data_train = data

        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N**self.neighborhood_constant))

        # The distance statistic is averaged over this neighborhood range
        low = self.n_neighbors - int(np.floor(0.5 * (self.n_neighbors - 1)))
        high = self.n_neighbors + int(np.floor(0.5 * self.n_neighbors))
        self.neighborhood_range = (low, high)
        logger.info("Number of samples: {:d}. Number of features: {:d}".format(
            N, d))
        logger.info(
            "Range of nearest neighbors used for the averaged K-LPE statistic: ({:d}, {:d})"
            .format(low, high))
        # Build the KNN graph
        self.index_knn = KNNIndex(
            data,
            n_neighbors=self.neighborhood_range[1],
            metric=self.metric,
            metric_kwargs=self.metric_kwargs,
            shared_nearest_neighbors=self.shared_nearest_neighbors,
            approx_nearest_neighbors=self.approx_nearest_neighbors,
            n_jobs=self.n_jobs,
            low_memory=self.low_memory,
            seed_rng=self.seed_rng)
        # Compute the distance statistic for every data point
        self.dist_stat_nominal = self.distance_statistic(data,
                                                         exclude_self=True)
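
# A minimal worked example (illustrative only, not part of the library) of how the averaging range for the
# K-LPE distance statistic is derived above. Assuming a neighborhood constant of 0.4 (the docstrings below
# recommend a value in the range 0.4 to 0.5) and 1000 training samples:
#   n_neighbors = ceil(1000 ** 0.4) = 16
#   low  = 16 - floor(0.5 * (16 - 1)) = 16 - 7 = 9
#   high = 16 + floor(0.5 * 16)       = 16 + 8 = 24
# so the distance statistic is averaged over the 9th through the 24th nearest neighbors.
import numpy as np

n_samples, nb_const = 1000, 0.4
k = int(np.ceil(n_samples ** nb_const))        # 16
k_low = k - int(np.floor(0.5 * (k - 1)))       # 9
k_high = k + int(np.floor(0.5 * k))            # 24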
class NeighborhoodPreservingProjection:
    """
    Neighborhood preserving projection (NPP) method for dimensionality reduction. Also known as neighborhood
    preserving embedding (NPE) [1].
    Orthogonal neighborhood preserving projection (ONPP) method is based on [2].

    1. He, Xiaofei, et al. "Neighborhood preserving embedding." Tenth IEEE International Conference on Computer
       Vision (ICCV'05) Volume 1. Vol. 2. IEEE, 2005.
    2. Kokiopoulou, Effrosyni, and Yousef Saad. "Orthogonal neighborhood preserving projections: A projection-based
       dimensionality reduction technique." IEEE Transactions on Pattern Analysis and Machine Intelligence
       29.12 (2007): 2143-2156.

    """
    def __init__(
            self,
            dim_projection='auto',  # 'auto' or positive integer
            orthogonal=False,  # True to enable Orthogonal NPP (ONPP) method
            pca_cutoff=1.0,
            neighborhood_constant=NEIGHBORHOOD_CONST,
            n_neighbors=None,  # If `n_neighbors` is specified, `neighborhood_constant` is ignored
            shared_nearest_neighbors=False,
            metric=METRIC_DEF,
            metric_kwargs=None,  # distance metric and its parameter dict (if any)
            approx_nearest_neighbors=True,
            n_jobs=1,
            reg_eps=0.001,
            seed_rng=SEED_DEFAULT):
        """
        :param dim_projection: Dimension of data in the projected feature space. If set to 'auto', a suitable reduced
                               dimension will be chosen by estimating the intrinsic dimension of the data. If an
                               integer value is specified, it should be in the range `[1, dim - 1]`, where `dim`
                               is the observed dimension of the data.
        :param orthogonal: Set to True to select the ONPP method. It was shown to have better performance than
                           NPP in [2].
        :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                           in the projected dimension-reduced data. PCA is applied as a first-level dimension
                           reduction to handle potential data matrix singularity also. Set `pca_cutoff = 1.0` in
                           order to handle only the data matrix singularity.
        :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of samples,
                                      then the number of neighbors is set to `N^neighborhood_constant`. It is
                                      recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                            the `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance to
                                         find the K nearest neighbors. This is a secondary distance metric that is
                                         found to be better suited to high dimensional data.
        :param metric: string or a callable that specifies the distance metric to be used for the SNN similarity
                       calculation.
        :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a
                              dictionary.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. This is recommended when the number of points is
                                         large and/or when the dimension of the data is high.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
        :param reg_eps: small float value that multiplies the trace to regularize the Gram matrix, if it is
                        close to singular.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.dim_projection = dim_projection
        self.orthogonal = orthogonal
        self.pca_cutoff = pca_cutoff
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.reg_eps = reg_eps
        self.seed_rng = seed_rng

        self.mean_data = None
        self.index_knn = None
        self.iterated_laplacian_matrix = None
        self.transform_pca = None
        self.transform_npp = None
        self.transform_comb = None

    def fit(self, data):
        """
        Find the optimal projection matrix for the given data points.

        :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of
                     dimensions.
        :return: None
        """
        N, d = data.shape
        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N**self.neighborhood_constant))

        logger.info("Applying PCA as first-level dimension reduction step")
        data, self.mean_data, self.transform_pca = pca_wrapper(
            data, cutoff=self.pca_cutoff, seed_rng=self.seed_rng)

        # If `self.n_neighbors >= data.shape[1]` (number of neighbors at least the data dimension), the Gram
        # matrix that comes up while solving for the neighborhood weights becomes singular. To avoid this, we
        # reduce `self.n_neighbors` to `data.shape[1] - 1`; alternatively, a small nonzero value could be added
        # to the diagonal elements of the Gram matrix.
        d = data.shape[1]
        if self.n_neighbors >= d:
            k = max(d - 1, 1)
            logger.info(
                "Reducing the number of neighbors from {:d} to {:d} to avoid singular Gram "
                "matrix while solving for neighborhood weights.".format(
                    self.n_neighbors, k))
            self.n_neighbors = k

        if self.dim_projection == 'auto':
            # Estimate the intrinsic dimension of the data and use that as the projected dimension
            id_est = estimate_intrinsic_dimension(
                data,
                method='two_nn',
                n_neighbors=self.n_neighbors,
                approx_nearest_neighbors=self.approx_nearest_neighbors,
                n_jobs=self.n_jobs,
                seed_rng=self.seed_rng)
            self.dim_projection = int(np.ceil(id_est))
            logger.info(
                "Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}."
                .format(id_est))

        if self.dim_projection >= data.shape[1]:
            self.dim_projection = data.shape[1]

        logger.info("Dimension of the projected subspace = {:d}".format(
            self.dim_projection))

        # Create a KNN index for all nearest neighbor tasks
        self.index_knn = KNNIndex(
            data,
            n_neighbors=self.n_neighbors,
            metric=self.metric,
            metric_kwargs=self.metric_kwargs,
            shared_nearest_neighbors=self.shared_nearest_neighbors,
            approx_nearest_neighbors=self.approx_nearest_neighbors,
            n_jobs=self.n_jobs,
            seed_rng=self.seed_rng)

        # Create the adjacency matrix `W` based on the optimal reconstruction weights of neighboring points
        # (as done in locally linear embedding).
        # Then calculate the iterated graph Laplacian matrix `M = (I - W)^T (I - W)`.
        self.create_iterated_laplacian(data)

        # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest
        # eigenvalues as the columns of the projection matrix
        logger.info(
            "Solving the generalized eigenvalue problem to find the optimal projection matrix."
        )
        data_trans = data.T
        # X^T M X
        lmat = sparse.csr_matrix.dot(data_trans,
                                     self.iterated_laplacian_matrix).dot(data)
        if self.orthogonal:
            # For ONPP, the paper [2] recommends skipping the eigenvector corresponding to the smallest eigenvalue
            eig_values, eig_vectors = eigh(lmat,
                                           eigvals=(1, self.dim_projection))
        else:
            # Standard NPP or NPE
            # X^T X
            rmat = np.dot(data_trans, data)
            eig_values, eig_vectors = eigh(lmat,
                                           b=rmat,
                                           eigvals=(0,
                                                    self.dim_projection - 1))

        # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered
        # according to increasing eigenvalues.
        # `eig_vectors` will have shape `(data.shape[1], self.dim_projection)`
        self.transform_npp = eig_vectors

        self.transform_comb = np.dot(self.transform_pca, self.transform_npp)

    def transform(self, data, dim=None):
        """
        Transform the given data by first subtracting the mean and then applying the linear projection.
        Optionally, you can specify the dimension of the transformed data using `dim`. This cannot be larger
        than `self.dim_projection`.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: If set to `None`, the dimension of the transformed data is `self.dim_projection`.
                    Else `dim` can be set to a value <= `self.dim_projection`, in which case only the top `dim`
                    eigenvectors are used.

        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        if dim is None:
            data_trans = np.dot(data - self.mean_data, self.transform_comb)
        else:
            data_trans = np.dot(data - self.mean_data,
                                self.transform_comb[:, 0:dim])

        return data_trans

    def fit_transform(self, data, dim=None):
        """
        Fit the model and transform the given data.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: same as the `transform` method.

        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        self.fit(data)
        return self.transform(data, dim=dim)

    def create_iterated_laplacian(self, data):
        """
        Calculate the optimal edge weights corresponding to the nearest neighbors of each point. This is exactly
        the same as the first step of the locally linear embedding (LLE) method.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :return: None
        """
        # Find the `self.n_neighbors` nearest neighbors of each point
        nn_indices, nn_distances = self.index_knn.query_self(
            k=self.n_neighbors)
        N, K = nn_indices.shape

        if self.n_jobs == 1:
            w = [
                helper_solve_lle(data, nn_indices, self.reg_eps, i)
                for i in range(N)
            ]
        else:
            helper_partial = partial(helper_solve_lle, data, nn_indices,
                                     self.reg_eps)
            pool_obj = multiprocessing.Pool(processes=self.n_jobs)
            w = []
            _ = pool_obj.map_async(helper_partial, range(N), callback=w.extend)
            pool_obj.close()
            pool_obj.join()

        # Create a sparse matrix of size `(N, N)` for the adjacency matrix
        row_ind = np.array([[i] * (K + 1) for i in range(N)],
                           dtype=int).ravel()
        col_ind = np.insert(nn_indices, 0, np.arange(N), axis=1).ravel()
        w = np.negative(w)
        vals = np.insert(w, 0, 1.0, axis=1).ravel()
        # Matrix `I - W`
        mat_tmp = sparse.csr_matrix((vals, (row_ind, col_ind)), shape=(N, N))

        # Matrix `M = (I - W)^T (I - W)`, also referred to as the iterated graph Laplacian
        self.iterated_laplacian_matrix = sparse.csr_matrix.dot(
            mat_tmp.transpose(), mat_tmp)
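
# A minimal usage sketch of the NeighborhoodPreservingProjection class defined above, on synthetic data.
# Illustrative only: it assumes the supporting helpers (`KNNIndex`, `pca_wrapper`,
# `estimate_intrinsic_dimension`, `helper_solve_lle`) are importable, as in the rest of this module.
import numpy as np

rng = np.random.RandomState(0)
# 500 samples lying close to a 2-dimensional subspace of a 20-dimensional space
latent = rng.randn(500, 2)
synthetic_data = np.dot(latent, rng.randn(2, 20)) + 0.05 * rng.randn(500, 20)

npp = NeighborhoodPreservingProjection(dim_projection='auto', orthogonal=True, n_jobs=1)
data_proj = npp.fit_transform(synthetic_data)
# `dim_projection='auto'` estimates the intrinsic dimension, so the projected dimension is data dependent
print(data_proj.shape)    # expected: (500, npp.dim_projection)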
class LocalityPreservingProjection:
    """
    Locality preserving projection (LPP) method for dimensionality reduction [1, 2].
    Orthogonal LPP (OLPP) method based on [3].

    1. He, Xiaofei, and Partha Niyogi. "Locality preserving projections." Advances in Neural Information
       Processing Systems. 2004.
    2. He, Xiaofei, et al. "Face recognition using Laplacianfaces." IEEE Transactions on Pattern Analysis and
       Machine Intelligence 27.3 (2005): 328-340.
    3. Kokiopoulou, Effrosyni, and Yousef Saad. "Orthogonal neighborhood preserving projections: A projection-based
       dimensionality reduction technique." IEEE Transactions on Pattern Analysis and Machine Intelligence
       29.12 (2007): 2143-2156.

    """
    def __init__(
            self,
            dim_projection='auto',  # 'auto' or positive integer
            orthogonal=False,  # True to enable Orthogonal LPP (OLPP)
            pca_cutoff=1.0,
            neighborhood_constant=NEIGHBORHOOD_CONST,
            n_neighbors=None,  # If `n_neighbors` is specified, `neighborhood_constant` is ignored
            shared_nearest_neighbors=False,
            edge_weights='SNN',  # Choices are {'simple', 'SNN', 'heat_kernel'}
            heat_kernel_param=None,  # Used only if `edge_weights = 'heat_kernel'`
            metric=METRIC_DEF,
            metric_kwargs=None,  # distance metric and its parameter dict (if any)
            approx_nearest_neighbors=True,
            n_jobs=1,
            seed_rng=SEED_DEFAULT):
        """
        :param dim_projection: Dimension of data in the projected feature space. If set to 'auto', a suitable reduced
                               dimension will be chosen by estimating the intrinsic dimension of the data. If an
                               integer value is specified, it should be in the range `[1, dim - 1]`, where `dim`
                               is the observed dimension of the data.
        :param orthogonal: Set to True to select the OLPP method. It was shown to have better performance than LPP
                           in [3].
        :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                           in the projected dimension-reduced data. PCA is applied as a first-level dimension
                           reduction to handle potential data matrix singularity also. Set `pca_cutoff = 1.0` in
                           order to handle only the data matrix singularity.
        :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of samples,
                                      then the number of neighbors is set to `N^neighborhood_constant`. It is
                                      recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                            the `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance to
                                         find the K nearest neighbors. This is a secondary distance metric that is
                                         found to be better suited to high dimensional data. This will be set to
                                         True if `edge_weights = 'SNN'`.
        :param edge_weights: Weighting method to use for the edge weights. Valid choices are {'simple', 'SNN',
                             'heat_kernel'}. They are described below:
                             - 'simple': the edge weight is set to one for every sample pair in the neighborhood.
                             - 'SNN': the shared nearest neighbors (SNN) similarity score between two samples is used
                             as the edge weight. This will be a value in [0, 1].
                             - 'heat_kernel': the heat (Gaussian) kernel with a suitable scale parameter defines the
                             edge weight.
        :param heat_kernel_param: Heat kernel scale parameter. If set to `None`, this parameter is set automatically
                                  based on the median of the pairwise distances between samples. Else a positive
                                  real value can be specified.
        :param metric: string or a callable that specifies the distance metric to be used for the SNN similarity
                       calculation. This is used only if `edge_weights = 'SNN'`.
        :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a
                              dictionary. Again, this is used only if `edge_weights = 'SNN'`.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. This is recommended when the number of points is
                                         large and/or when the dimension of the data is high.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.dim_projection = dim_projection
        self.orthogonal = orthogonal
        self.pca_cutoff = pca_cutoff
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.edge_weights = edge_weights.lower()
        self.heat_kernel_param = heat_kernel_param
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.seed_rng = seed_rng

        if self.edge_weights not in {'simple', 'snn', 'heat_kernel'}:
            raise ValueError(
                "Invalid value '{}' for parameter 'edge_weights'".format(
                    self.edge_weights))

        if self.edge_weights == 'snn':
            self.shared_nearest_neighbors = True

        self.mean_data = None
        self.index_knn = None
        self.adjacency_matrix = None
        self.incidence_matrix = None
        self.laplacian_matrix = None
        self.transform_pca = None
        self.transform_lpp = None
        self.transform_comb = None

    def fit(self, data):
        """
        Find the optimal projection matrix for the given data points.

        :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of
                     dimensions.
        :return: None
        """
        N, d = data.shape
        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N**self.neighborhood_constant))

        logger.info("Applying PCA as first-level dimension reduction step")
        data, self.mean_data, self.transform_pca = pca_wrapper(
            data, cutoff=self.pca_cutoff, seed_rng=self.seed_rng)
        if self.dim_projection == 'auto':
            # Estimate the intrinsic dimension of the data and use that as the projected dimension
            id_est = estimate_intrinsic_dimension(
                data,
                method='two_nn',
                n_neighbors=self.n_neighbors,
                approx_nearest_neighbors=self.approx_nearest_neighbors,
                n_jobs=self.n_jobs,
                seed_rng=self.seed_rng)
            self.dim_projection = int(np.ceil(id_est))
            logger.info(
                "Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}."
                .format(id_est))

        if self.dim_projection >= data.shape[1]:
            self.dim_projection = data.shape[1]

        logger.info("Dimension of the projected subspace = {:d}".format(
            self.dim_projection))

        # Create a KNN index for all nearest neighbor tasks
        self.index_knn = KNNIndex(
            data,
            n_neighbors=self.n_neighbors,
            metric=self.metric,
            metric_kwargs=self.metric_kwargs,
            shared_nearest_neighbors=self.shared_nearest_neighbors,
            approx_nearest_neighbors=self.approx_nearest_neighbors,
            n_jobs=self.n_jobs,
            seed_rng=self.seed_rng)

        # Create the symmetric adjacency matrix, diagonal incidence matrix, and the graph Laplacian matrix
        # for the data points
        self.create_laplacian_matrix(data)

        # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest
        # eigenvalues as the columns of the projection matrix
        logger.info(
            "Solving the generalized eigenvalue problem to find the optimal projection matrix."
        )
        data_trans = data.T
        # X^T L X
        lmat = sparse.csr_matrix.dot(data_trans,
                                     self.laplacian_matrix).dot(data)
        if self.orthogonal:
            # Orthogonal LPP
            eig_values, eig_vectors = eigh(lmat,
                                           eigvals=(0,
                                                    self.dim_projection - 1))
        else:
            # Standard LPP
            # X^T D X
            rmat = sparse.csr_matrix.dot(data_trans,
                                         self.incidence_matrix).dot(data)
            eig_values, eig_vectors = eigh(lmat,
                                           b=rmat,
                                           eigvals=(0,
                                                    self.dim_projection - 1))

        # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered
        # according to increasing eigenvalues.
        # `eig_vectors` will have shape `(data.shape[1], self.dim_projection)`
        self.transform_lpp = eig_vectors

        self.transform_comb = np.dot(self.transform_pca, self.transform_lpp)

    def transform(self, data, dim=None):
        """
        Transform the given data by first subtracting the mean and then applying the linear projection.
        Optionally, you can specify the dimension of the transformed data using `dim`. This cannot be larger
        than `self.dim_projection`.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: If set to `None`, the dimension of the transformed data is `self.dim_projection`.
                    Else `dim` can be set to a value <= `self.dim_projection`, in which case only the top `dim`
                    eigenvectors are used.

        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        if dim is None:
            data_trans = np.dot(data - self.mean_data, self.transform_comb)
        else:
            data_trans = np.dot(data - self.mean_data,
                                self.transform_comb[:, 0:dim])

        return data_trans

    def fit_transform(self, data, dim=None):
        """
        Fit the model and transform the given data.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: same as the `transform` method.

        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        self.fit(data)
        return self.transform(data, dim=dim)

    def create_laplacian_matrix(self, data):
        """
        Calculate the graph Laplacian matrix for the given data.

        :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of
                     dimensions.
        :return: None
        """
        # Find the `self.n_neighbors` nearest neighbors of each point
        nn_indices, nn_distances = self.index_knn.query_self(
            k=self.n_neighbors)

        N, K = nn_indices.shape
        row_ind = np.array([[i] * K for i in range(N)], dtype=int).ravel()
        col_ind = nn_indices.ravel()
        if self.edge_weights == 'simple':
            vals = np.ones(N * K)
        elif self.edge_weights == 'snn':
            # The SNN distance is the arc cosine of the SNN similarity, so the distances lie in
            # [0, pi / 2] and taking the cosine recovers a similarity value in the range [0, 1].
            vals = np.clip(np.cos(nn_distances).ravel(), 0., None)
        else:
            # Heat kernel
            vals = calculate_heat_kernel(data,
                                         nn_indices,
                                         self.heat_kernel_param,
                                         self.metric,
                                         metric_kwargs=self.metric_kwargs,
                                         n_jobs=self.n_jobs).ravel()

        # Adjacency or edge weight matrix (W)
        mat_tmp = sparse.csr_matrix((vals, (row_ind, col_ind)), shape=(N, N))
        self.adjacency_matrix = 0.5 * (mat_tmp + mat_tmp.transpose())

        # Incidence matrix (D)
        vals_diag = self.adjacency_matrix.sum(axis=1)
        vals_diag = np.array(vals_diag[:, 0]).ravel()
        ind = np.arange(N)
        self.incidence_matrix = sparse.csr_matrix((vals_diag, (ind, ind)),
                                                  shape=(N, N))

        # Graph laplacian matrix (L = D - W)
        self.laplacian_matrix = self.incidence_matrix - self.adjacency_matrix
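
# A minimal usage sketch of the LocalityPreservingProjection class defined above. Illustrative only: it
# assumes the supporting helpers (`KNNIndex`, `pca_wrapper`, `calculate_heat_kernel`) are importable, as in
# the rest of this module.
import numpy as np

rng = np.random.RandomState(0)
sample_data = rng.randn(400, 10)

lpp = LocalityPreservingProjection(dim_projection=3, orthogonal=False,
                                   edge_weights='heat_kernel', n_jobs=1)
data_proj = lpp.fit_transform(sample_data)        # expected shape (400, 3)
data_proj_2d = lpp.transform(sample_data, dim=2)  # keep only the top 2 eigenvectors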
class KNNClassifier:
    """
    Basic k nearest neighbors classifier that supports approximate nearest neighbor querying and custom distance
    metrics including shared nearest neighbors.
    """
    def __init__(self,
                 n_neighbors=1,
                 metric=METRIC_DEF, metric_kwargs=None,
                 shared_nearest_neighbors=False,
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 low_memory=False,
                 seed_rng=SEED_DEFAULT):
        """
        :param n_neighbors: int value specifying the number of nearest neighbors. Should be >= 1.
        :param metric: string or a callable that specifies the distance metric.
        :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a
                              dictionary.
        :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance.
                                         This is a secondary distance metric that is found to be better suited to
                                         high dimensional data.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. This is recommended when the number of points is
                                         large and/or when the dimension of the data is high.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
        :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                           is likely to increase the running time.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.low_memory = low_memory
        self.seed_rng = seed_rng

        self.index_knn = None
        self.y_train = None
        self.n_classes = None
        self.labels_dtype = None
        self.label_enc = None
        self.label_dec = None

    def fit(self, X, y, y_unique=None):
        """

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param y: numpy array of class labels of shape `(N, )`.
        :param y_unique: Allows the optional specification of the unique labels. Can be a tuple, list, or numpy
                         array of the unique labels. If this is not specified, then it is found using
                         `numpy.unique`.
        :return: None
        """
        self.labels_dtype = y.dtype
        # Labels are mapped to dtype int because `numba` does not handle generic numpy arrays
        if y_unique is None:
            y_unique = np.unique(y)

        self.n_classes = len(y_unique)
        ind = np.arange(self.n_classes)
        # Mapping from label values to integers and its inverse
        d = dict(zip(y_unique, ind))
        self.label_enc = np.vectorize(d.__getitem__)

        d = dict(zip(ind, y_unique))
        self.label_dec = np.vectorize(d.__getitem__)
        self.y_train = self.label_enc(y)

        self.index_knn = KNNIndex(
            X,
            n_neighbors=self.n_neighbors,
            metric=self.metric,
            metric_kwargs=self.metric_kwargs,
            shared_nearest_neighbors=self.shared_nearest_neighbors,
            approx_nearest_neighbors=self.approx_nearest_neighbors,
            n_jobs=self.n_jobs,
            low_memory=self.low_memory,
            seed_rng=self.seed_rng
        )

    def predict(self, X, is_train=False):
        """
        Predict the class labels for the given inputs.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param is_train: Set to True if prediction is being done on the same data used to train.

        :return: numpy array with the class predictions, of shape `(N, )`.
        """
        # Get the indices of the nearest neighbors from the training set
        if is_train:
            nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors)
        else:
            nn_indices, nn_distances = self.index_knn.query(X, k=self.n_neighbors)

        labels_pred, _ = helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec,
                                            self.n_neighbors)
        return labels_pred

    def predict_multiple_k(self, X, k_list, is_train=False):
        """
        Find the KNN predictions for multiple k values specified via the param `k_list`. This is done efficiently
        by querying for the maximum number of nearest neighbors once and using the results. It is assumed that the
        values in `k_list` are sorted in increasing order. This is useful while performing a search for the
        best `k` value using cross-validation.

        NOTE: The maximum value in `k_list` should be <= `self.n_neighbors`.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param k_list: list or array of k values for which predictions are to be made. Each value should be an
                       integer >= 1 and the values should be sorted in increasing order. For example,
                       `k_list = [2, 4, 6, 8, 10]`.
        :param is_train: Set to True if prediction is being done on the same data used to train.

        :return: numpy array with the class predictions corresponding to each k value in `k_list`.
                 Has shape `(len(k_list), N)`.
        """
        if k_list[-1] > self.n_neighbors:
            raise ValueError("Invalid input: maximum value in `k_list` cannot be larger than {:d}.".
                             format(self.n_neighbors))

        # Query the maximum number of nearest neighbors from `k_list`
        if is_train:
            nn_indices, nn_distances = self.index_knn.query_self(k=k_list[-1])
        else:
            nn_indices, nn_distances = self.index_knn.query(X, k=k_list[-1])

        if self.n_jobs == 1 or len(k_list) == 1:
            labels_pred = np.array(
                [helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec, k)[0] for k in k_list],
                dtype=self.labels_dtype
            )
        else:
            helper_partial = partial(helper_knn_predict, nn_indices, self.y_train, self.n_classes, self.label_dec)
            pool_obj = multiprocessing.Pool(processes=self.n_jobs)
            outputs = []
            _ = pool_obj.map_async(helper_partial, k_list, callback=outputs.extend)
            pool_obj.close()
            pool_obj.join()

            labels_pred = np.array([tup[0] for tup in outputs], dtype=self.labels_dtype)

        return labels_pred

    def predict_proba(self, X, is_train=False):
        """
        Estimate the probability of each class along with the predicted most-frequent class.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param is_train: Set to True if prediction is being done on the same data used to train.

        :return:
            - numpy array with the class predictions, of shape `(N, )`.
            - numpy array with the estimated probability of each class, of shape `(N, self.n_classes)`.
              Each row should sum to 1.
        """
        # Get the indices of the nearest neighbors from the training set
        if is_train:
            nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors)
        else:
            nn_indices, nn_distances = self.index_knn.query(X, k=self.n_neighbors)

        labels_pred, counts = helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec,
                                                 self.n_neighbors)
        proba = counts / self.n_neighbors

        return labels_pred, proba

    def fit_predict(self, X, y):
        """
        Fit a model and predict on the training data.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param y: numpy array of class labels of shape `(N, )`.
        :return: numpy array with the class predictions, of shape `(N, )`.
        """
        self.fit(X, y)
        return self.predict(X, is_train=True)
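
# A minimal usage sketch of the KNNClassifier defined above. Illustrative only: it assumes `KNNIndex` and
# the helper `helper_knn_predict` are importable, as in the rest of this module.
import numpy as np

rng = np.random.RandomState(0)
X_train = rng.randn(200, 5)
y_train = rng.randint(0, 3, size=200)
X_test = rng.randn(50, 5)

knn = KNNClassifier(n_neighbors=10, n_jobs=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)                                     # shape (50, )
y_pred_train = knn.fit_predict(X_train, y_train)                 # predictions on the training data
# Predictions for several values of k at once, e.g. while selecting k by cross-validation.
# The largest value in the list must not exceed `n_neighbors`.
y_pred_multi = knn.predict_multiple_k(X_test, [2, 4, 6, 8, 10])  # shape (5, 50)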
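
# The `fit` method below relies on a helper `lid_mle_amsaleg` (not shown in this excerpt) to compute local
# intrinsic dimensionality (LID) features from nearest neighbor distances. As a hedged reference, the
# standard maximum likelihood estimator of Amsaleg et al. (2015), which such a helper typically implements,
# is sketched here: given the sorted distances r_1 <= ... <= r_k of a point to its k nearest neighbors,
#   LID_hat = -( (1/k) * sum_{i=1}^{k} log(r_i / r_k) )^(-1)
# This sketch is illustrative only; the actual helper in the library may differ in details.
import numpy as np

def lid_mle_sketch(nn_distances, eps=1e-12):
    """nn_distances: numpy array of shape (N, k) with sorted distances to the k nearest neighbors."""
    r_k = nn_distances[:, -1:]                       # distance to the k-th (farthest) neighbor
    ratios = np.maximum(nn_distances, eps) / np.maximum(r_k, eps)
    return -1.0 / np.mean(np.log(ratios), axis=1)    # one LID estimate per point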
    def fit(self, layer_embeddings_normal, labels_normal, labels_pred_normal,
            layer_embeddings_adversarial, labels_pred_adversarial,
            layer_embeddings_noisy=None, labels_pred_noisy=None):
        """
        Extract the LID feature vector for normal, noisy, and adversarial samples and train a logistic classifier
        to separate adversarial samples from (normal + noisy). Cross-validation is used to select the hyper-parameter
        `C` using area under the ROC curve as the validation metric.

        NOTE:
        True labels and predicted labels are required for the normal feature embeddings.
        Only predicted labels are needed for the noisy and adversarial feature embeddings.

        :param layer_embeddings_normal: list of numpy arrays with the layer embeddings for normal samples.
                                        Length of the list is equal to the number of layers. The numpy array at
                                        index `i` has shape `(n, d_i)`, where `n` is the number of samples and `d_i`
                                        is the dimension of the embeddings at layer `i`.
        :param labels_normal: numpy array of class labels for the normal samples. Should have shape `(n, )`, where
                              `n` is the number of normal samples.
        :param labels_pred_normal: numpy array of DNN classifier predictions for the normal samples. Should have the
                                   same shape as `labels_normal`.
        :param layer_embeddings_adversarial: Same format as `layer_embeddings_normal`, but corresponding to
                                             the adversarial samples.
        :param labels_pred_adversarial: numpy array of DNN classifier predictions for the adversarial samples. Should
                                        have shape `(n, )`, where `n` is the number of adversarial samples.
        :param layer_embeddings_noisy: Same format as `layer_embeddings_normal`, but corresponding to the noisy
                                       samples. Can be set to `None` to exclude noisy data from training.
        :param labels_pred_noisy: numpy array of DNN classifier predictions for the noisy samples. Should have shape
                                  `(n, )`, where `n` is the number of noisy samples. Can be set to `None` to exclude
                                  noisy data from training.
        :return:
            (self, scores_normal, scores_adversarial) if layer_embeddings_noisy is None,
            (self, scores_normal, scores_adversarial, scores_noisy) otherwise.
            -------------------------------------------------------
            - self: trained instance of the class.
            - scores_normal: numpy array with the scores (decision function of the logistic classifier) for normal
                             samples. 1d array with the same number of samples as `layer_embeddings_normal`.
            - scores_adversarial: scores corresponding to `layer_embeddings_adversarial`.
            - scores_noisy: scores corresponding to `layer_embeddings_noisy`; returned only if noisy training
                            data is provided.
        """
        self.n_layers = len(layer_embeddings_normal)
        logger.info("Number of layer embeddings: {:d}.".format(self.n_layers))
        if layer_embeddings_noisy is None:
            logger.info("Noisy training data not provided.")
            cond1 = False
            noisy_data = False
        else:
            cond1 = (len(layer_embeddings_noisy) != self.n_layers)
            noisy_data = True
            if labels_pred_noisy is None:
                raise ValueError("Class predictions are not provided for the noisy data")

        if cond1 or (len(layer_embeddings_adversarial) != self.n_layers):
            raise ValueError("The layer embeddings for noisy and attack samples must have the same length as that "
                             "of normal samples")

        if labels_normal.shape != labels_pred_normal.shape:
            raise ValueError("Length of arrays 'labels_normal' and 'labels_pred_normal' is not equal")

        # Number of samples in each of the categories
        self.n_samples = [
            layer_embeddings_normal[0].shape[0],
            layer_embeddings_noisy[0].shape[0] if noisy_data else 0,
            layer_embeddings_adversarial[0].shape[0]
        ]
        # Distinct class labels
        self.labels_unique = np.unique(labels_normal)
        for c in self.labels_unique:
            # Normal labeled samples from class `c`
            self.indices_true[c] = np.where(labels_normal == c)[0]
            # Normal samples predicted into class `c`
            self.indices_pred_normal[c] = np.where(labels_pred_normal == c)[0]
            # Adversarial samples predicted into class `c`
            self.indices_pred_adver[c] = np.where(labels_pred_adversarial == c)[0]
            if noisy_data:
                # Noisy samples predicted into class `c`
                self.indices_pred_noisy[c] = np.where(labels_pred_noisy == c)[0]

            # Number of nearest neighbors per class
            if self.n_neighbors is None:
                # Set based on the number of samples from this class and the neighborhood constant
                self.n_neighbors_per_class[c] = \
                    int(np.ceil(self.indices_true[c].shape[0] ** self.neighborhood_constant))
            else:
                # Use the value specified as input
                self.n_neighbors_per_class[c] = self.n_neighbors

        # The data arrays at all layers should have the same number of samples
        if not all([layer_embeddings_normal[i].shape[0] == self.n_samples[0] for i in range(self.n_layers)]):
            raise ValueError("Input 'layer_embeddings_normal' does not have the expected format")

        if noisy_data:
            if not all([layer_embeddings_noisy[i].shape[0] == self.n_samples[1] for i in range(self.n_layers)]):
                raise ValueError("Input 'layer_embeddings_noisy' does not have the expected format")

        if not all([layer_embeddings_adversarial[i].shape[0] == self.n_samples[2] for i in range(self.n_layers)]):
            raise ValueError("Input 'layer_embeddings_adversarial' does not have the expected format")

        if self.save_knn_indices_to_file:
            # Create a temporary directory for saving the KNN indices
            self.temp_direc = tempfile.mkdtemp(dir=os.getcwd())
            self.temp_knn_files = [''] * self.n_layers

        # KNN indices for the layer embeddings from each layer and each class
        self.index_knn = [dict() for _ in range(self.n_layers)]
        features_lid_normal = np.zeros((self.n_samples[0], self.n_layers))
        features_lid_noisy = np.zeros((self.n_samples[1], self.n_layers))
        features_lid_adversarial = np.zeros((self.n_samples[2], self.n_layers))
        for i in range(self.n_layers):
            logger.info("Processing layer {:d}:".format(i + 1))
            # Dimensionality reduction of the layer embeddings, if required
            if self.transform_models:
                data_normal = transform_data_from_model(layer_embeddings_normal[i], self.transform_models[i])
                data_adver = transform_data_from_model(layer_embeddings_adversarial[i], self.transform_models[i])
                if noisy_data:
                    data_noisy = transform_data_from_model(layer_embeddings_noisy[i], self.transform_models[i])
                else:
                    data_noisy = None

                d1 = layer_embeddings_normal[i].shape[1]
                d2 = data_normal.shape[1]
                if d2 < d1:
                    logger.info("Input dimension = {:d}, projected dimension = {:d}".format(d1, d2))
            else:
                data_normal = layer_embeddings_normal[i]
                data_adver = layer_embeddings_adversarial[i]
                if noisy_data:
                    data_noisy = layer_embeddings_noisy[i]
                else:
                    data_noisy = None

            for c in self.labels_unique:
                logger.info("Building a KNN index on the feature embeddings of normal samples from class {}".
                            format(c))
                self.index_knn[i][c] = KNNIndex(
                    data_normal[self.indices_true[c], :], n_neighbors=self.n_neighbors_per_class[c],
                    metric=self.metric, metric_kwargs=self.metric_kwargs,
                    approx_nearest_neighbors=self.approx_nearest_neighbors,
                    n_jobs=self.n_jobs,
                    low_memory=self.low_memory,
                    seed_rng=self.seed_rng
                )
                logger.info("Calculating LID estimates for the normal, noisy, and adversarial layer embeddings "
                            "predicted into class {}".format(c))
                # Distance to nearest neighbors of all labeled samples from class `c`
                _, nn_distances_temp = self.index_knn[i][c].query_self(k=self.n_neighbors_per_class[c])

                n_pred_normal = self.indices_pred_normal[c].shape[0]
                n_pred_adver = self.indices_pred_adver[c].shape[0]
                if noisy_data:
                    n_pred_noisy = self.indices_pred_noisy[c].shape[0]
                else:
                    n_pred_noisy = 0

                if n_pred_normal:
                    # Distance to nearest neighbors of samples predicted into class `c` that are also labeled as
                    # class `c`. These samples will be a part of the KNN index
                    nn_distances = helper_knn_distance(self.indices_pred_normal[c], self.indices_true[c],
                                                       nn_distances_temp)
                    mask = (nn_distances[:, 0] < 0.)
                    if np.any(mask):
                        # Distance to nearest neighbors of samples predicted into class `c` that are not labeled as
                        # class `c`. These samples will not be a part of the KNN index
                        ind_comp = self.indices_pred_normal[c][mask]
                        _, temp_arr = self.index_knn[i][c].query(data_normal[ind_comp, :],
                                                                 k=self.n_neighbors_per_class[c])
                        nn_distances[mask, :] = temp_arr

                    # LID estimates for the normal feature embeddings predicted into class `c`
                    features_lid_normal[self.indices_pred_normal[c], i] = lid_mle_amsaleg(nn_distances)

                # LID estimates for the noisy feature embeddings predicted into class `c`
                if n_pred_noisy:
                    temp_arr = data_noisy[self.indices_pred_noisy[c], :]
                    _, nn_distances = self.index_knn[i][c].query(temp_arr, k=self.n_neighbors_per_class[c])
                    features_lid_noisy[self.indices_pred_noisy[c], i] = lid_mle_amsaleg(nn_distances)

                # LID estimates for the adversarial feature embeddings predicted into class `c`
                if n_pred_adver:
                    temp_arr = data_adver[self.indices_pred_adver[c], :]
                    _, nn_distances = self.index_knn[i][c].query(temp_arr, k=self.n_neighbors_per_class[c])
                    features_lid_adversarial[self.indices_pred_adver[c], i] = lid_mle_amsaleg(nn_distances)

            if self.save_knn_indices_to_file:
                logger.info("Saving the KNN indices per class from layer {:d} to a pickle file".format(i + 1))
                self.temp_knn_files[i] = os.path.join(self.temp_direc, 'knn_indices_layer_{:d}.pkl'.format(i + 1))
                with open(self.temp_knn_files[i], 'wb') as fp:
                    pickle.dump(self.index_knn[i], fp)

                # Free up the allocated memory
                self.index_knn[i] = None

        # LID feature vectors and labels for the binary logistic classifier.
        # Normal and noisy samples are given label 0 and adversarial samples are given label 1
        n_pos = features_lid_adversarial.shape[0]
        if noisy_data:
            features_lid = np.concatenate([features_lid_normal, features_lid_noisy, features_lid_adversarial],
                                          axis=0)
            labels_bin = np.concatenate([np.zeros(features_lid_normal.shape[0], dtype=int),
                                         np.zeros(features_lid_noisy.shape[0], dtype=int),
                                         np.ones(n_pos, dtype=int)])
        else:
            features_lid = np.concatenate([features_lid_normal, features_lid_adversarial], axis=0)
            labels_bin = np.concatenate([np.zeros(features_lid_normal.shape[0], dtype=int),
                                         np.ones(n_pos, dtype=int)])

        pos_prop = n_pos / float(labels_bin.shape[0])
        # Randomly shuffle the samples so that the positive and negative classes are not grouped in contiguous blocks
        ind_perm = np.random.permutation(labels_bin.shape[0])
        features_lid = features_lid[ind_perm, :]
        labels_bin = labels_bin[ind_perm]
        # Min-max scaling for the LID features
        self.scaler = MinMaxScaler().fit(features_lid)
        features_lid = self.scaler.transform(features_lid)
        logger.info("Training a binary logistic classifier with {:d} samples and {:d} LID features.".
                    format(*features_lid.shape))
        logger.info("Using {:d}-fold cross-validation with area under ROC curve as the metric to select "
                    "the best regularization hyperparameter.".format(self.n_cv_folds))
        logger.info("Proportion of positive (adversarial or OOD) samples in the training data: {:.4f}".
                    format(pos_prop))
        class_weight = None
        if self.balanced_classification:
            if (pos_prop < 0.45) or (pos_prop > 0.55):
                class_weight = {0: 1.0 / (1 - pos_prop),
                                1: 1.0 / pos_prop}
                logger.info("Balancing the classes by assigning sample weight {:.4f} to class 0 and sample weight "
                            "{:.4f} to class 1".format(class_weight[0], class_weight[1]))

        self.model_logistic = LogisticRegressionCV(
            Cs=self.c_search_values,
            cv=self.n_cv_folds,
            penalty='l2',
            scoring='roc_auc',
            multi_class='auto',
            class_weight=class_weight,
            max_iter=self.max_iter,
            refit=True,
            n_jobs=self.n_jobs,
            random_state=self.seed_rng
        ).fit(features_lid, labels_bin)

        # Larger values of this score correspond to a higher probability of predicting class 1 (adversarial)
        scores_normal = self.model_logistic.decision_function(self.scaler.transform(features_lid_normal))
        scores_adversarial = self.model_logistic.decision_function(self.scaler.transform(features_lid_adversarial))
        if noisy_data:
            scores_noisy = self.model_logistic.decision_function(self.scaler.transform(features_lid_noisy))
            return self, scores_normal, scores_adversarial, scores_noisy
        else:
            return self, scores_normal, scores_adversarial
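
# A minimal, illustrative sketch of the maximum-likelihood LID estimator of Amsaleg et al. (2015), which the
# `lid_mle_amsaleg` helper called above is assumed to implement elsewhere in this codebase. Given the sorted
# distances r_1 <= ... <= r_k from a point to its k nearest neighbors, the estimate is
# -(1/k * sum_i log(r_i / r_k))^{-1}. This stand-in is only for clarity and is not the repository's function.
def lid_mle_sketch(nn_distances, eps=1e-12):
    import numpy as np
    # `nn_distances` has shape `(n, k)` with the distances sorted in increasing order along each row
    r_k = np.clip(nn_distances[:, -1:], eps, None)
    log_ratio = np.log(np.clip(nn_distances, eps, None) / r_k)
    # One LID estimate per point; the last column contributes log(1) = 0
    return -1.0 / np.mean(log_ratio, axis=1)
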
class averaged_KLPE_anomaly_detection:
    def __init__(self,
                 neighborhood_constant=NEIGHBORHOOD_CONST,
                 n_neighbors=None,
                 standardize=True,
                 metric=METRIC_DEF,
                 metric_kwargs=None,
                 shared_nearest_neighbors=False,
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 low_memory=False,
                 seed_rng=SEED_DEFAULT):
        """

        :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of samples,
                                      then the number of neighbors is set to `N^neighborhood_constant`. It is
                                      recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                            the `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param standardize: Set to True to standardize the individual features to the range [-1, 1].
        :param metric: string or a callable that specifies the distance metric.
        :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a
                              dictionary.
        :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance.
                                         This is a secondary distance metric that is found to be better suited to
                                         high dimensional data.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. This is recommended when the number of points is
                                         large and/or when the dimension of the data is high.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
        :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                           is likely to increase the running time.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.standardize = standardize
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.low_memory = low_memory
        self.seed_rng = seed_rng

        self.scaler = None
        self.data_train = None
        self.neighborhood_range = None
        self.index_knn = None
        self.dist_stat_nominal = None
        np.random.seed(self.seed_rng)

    def fit(self, data):
        """
        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number
                     of dimensions (features).
        :return: None
        """
        N, d = data.shape
        if self.standardize:
            self.scaler = MinMaxScaler(feature_range=(-1, 1)).fit(data)
            data = self.scaler.transform(data)

        if self.shared_nearest_neighbors:
            self.data_train = data

        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N**self.neighborhood_constant))

        # The distance statistic is averaged over this neighborhood range
        low = self.n_neighbors - int(np.floor(0.5 * (self.n_neighbors - 1)))
        high = self.n_neighbors + int(np.floor(0.5 * self.n_neighbors))
        self.neighborhood_range = (low, high)
        logger.info("Number of samples: {:d}. Number of features: {:d}".format(
            N, d))
        logger.info(
            "Range of nearest neighbors used for the averaged K-LPE statistic: ({:d}, {:d})"
            .format(low, high))
        # Build the KNN graph
        self.index_knn = KNNIndex(
            data,
            n_neighbors=self.neighborhood_range[1],
            metric=self.metric,
            metric_kwargs=self.metric_kwargs,
            shared_nearest_neighbors=self.shared_nearest_neighbors,
            approx_nearest_neighbors=self.approx_nearest_neighbors,
            n_jobs=self.n_jobs,
            low_memory=self.low_memory,
            seed_rng=self.seed_rng)
        # Compute the distance statistic for every data point
        self.dist_stat_nominal = self.distance_statistic(data,
                                                         exclude_self=True)

    def score(self, data_test, exclude_self=False, return_distances=False):
        """
        Calculate the anomaly score which is the negative log of the empirical p-value of the averaged KNN distance.

        :param data_test: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the
                          number of dimensions (features).
        :param exclude_self: Set to True if the points in `data` were already used to build the KNN index.
        :param return_distances: Set to True in order to include the distance statistics along with the negative
                                 log p-value scores in the returned tuple.
        :return:
            score: numpy array of shape `(N, )` containing the score for each point. Points with a higher score
                   are more likely to be anomalous.
            dist: numpy array of shape `(N, )` containing the distance statistic for each point. Returned only
                  if `return_distances` is set to True.
        """
        # Calculate the k-nearest neighbors based distance statistic
        dist_stat_test = self.distance_statistic(data_test,
                                                 exclude_self=exclude_self)
        # Negative log of the empirical p-value
        p = pvalue_score(self.dist_stat_nominal,
                         dist_stat_test,
                         log_transform=True,
                         bootstrap=True)

        if return_distances:
            return p, dist_stat_test
        else:
            return p

    def distance_statistic(self, data, exclude_self=False):
        """
        Calculate the average distance statistic by querying the nearest neighbors of the given set of points.

        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number
                     of dimensions (features).
        :param exclude_self: Set to True if the points in `data` were already used to build the KNN index.
        :return dist_stat: numpy array of distance statistic for each point.
        """
        if exclude_self:
            # Data should be already scaled in the `fit` method
            nn_indices, nn_distances = self.index_knn.query_self(
                k=self.neighborhood_range[1])
        else:
            if self.standardize:
                data = self.scaler.transform(data)

            nn_indices, nn_distances = self.index_knn.query(
                data, k=self.neighborhood_range[1])

        if self.shared_nearest_neighbors:
            # The distance statistic is calculated based on the primary distance metric, but within the
            # neighborhood set found using the SNN distance. The idea is that for high-dimensional data,
            # the neighborhood found using SNN is more reliable
            dist_stat = self.distance_statistic_local(
                data, nn_indices, self.neighborhood_range[0])
        else:
            dist_stat = np.mean(nn_distances[:, (self.neighborhood_range[0] -
                                                 1):],
                                axis=1)

        return dist_stat

    def distance_statistic_local(self, data, nn_indices, k):
        """
        Computes the mean distance statistic for each row of `data` within a local neighborhood specified by
        `nn_indices`.

        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number
                     of dimensions (features).
        :param nn_indices: numpy array of `p` nearest neighbor indices with shape `(N, p)`.
        :param k: start index of the neighbor from which the mean distance is computed.
        :return dist_array: numpy array of shape `(N, )` with the mean distance values.
        """
        n = data.shape[0]
        if self.n_jobs == 1:
            dist_stat = [
                helper_distance(data, self.data_train, nn_indices, self.metric,
                                self.metric_kwargs, k, i) for i in range(n)
            ]
        else:
            helper_distance_partial = partial(helper_distance, data,
                                              self.data_train, nn_indices,
                                              self.metric, self.metric_kwargs,
                                              k)
            pool_obj = multiprocessing.Pool(processes=self.n_jobs)
            dist_stat = []
            _ = pool_obj.map_async(helper_distance_partial,
                                   range(n),
                                   callback=dist_stat.extend)
            pool_obj.close()
            pool_obj.join()

        return np.array(dist_stat)
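
# A minimal usage sketch (not part of the original code) for the averaged K-LPE detector defined above. It
# assumes the helpers referenced by the class (`KNNIndex`, `MinMaxScaler`, `pvalue_score`, `logger`, etc.) are
# importable from this codebase; the data below is synthetic.
def _demo_averaged_klpe():
    import numpy as np
    rng = np.random.RandomState(0)
    data_nominal = rng.randn(2000, 10)          # nominal (training) points
    data_test = rng.randn(100, 10) + 3.0        # shifted points that should receive higher anomaly scores
    det = averaged_KLPE_anomaly_detection(n_neighbors=20, standardize=True)
    det.fit(data_nominal)
    # Negative log of the empirical p-value; larger values indicate more anomalous points
    scores = det.score(data_test)
    return scores
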
    def fit(self, layer_embeddings, labels):
        """
        Estimate parameters of the detection method given natural (non-adversarial) input data. Note that this
        data should be different from that used to train the DNN classifier.
        NOTE: Inputs to this method can be obtained by calling the function `extract_layer_embeddings`.

        :param layer_embeddings: list of numpy arrays with the layer embedding data. Length of the list is equal to
                                 the number of layers. The numpy array at index `i` has shape `(n, d_i)`, where `n`
                                 is the number of samples and `d_i` is the dimension of the embeddings at layer `i`.
        :param labels: numpy array of labels for the classification problem addressed by the DNN. Should have shape
                       `(n, )`, where `n` is the number of samples.
        :return: Instance of the class with all parameters fit to the data.
        """
        self.n_layers = len(layer_embeddings)
        self.labels_unique = np.unique(labels)
        self.n_classes = len(self.labels_unique)
        self.n_samples = labels.shape[0]
        # Mapping from the original labels to the set {0, 1, ..., self.n_classes - 1}. This is needed by the
        # label count function
        d = dict(zip(self.labels_unique, np.arange(self.n_classes)))
        self.label_encoder = np.vectorize(d.__getitem__)

        # Number of nearest neighbors
        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(
                np.ceil(self.n_samples**self.neighborhood_constant))

        logger.info("Number of classes: {:d}.".format(self.n_classes))
        logger.info("Number of layer embeddings: {:d}.".format(self.n_layers))
        logger.info("Number of samples: {:d}.".format(self.n_samples))
        logger.info("Number of neighbors: {:d}.".format(self.n_neighbors))
        if not all([
                layer_embeddings[i].shape[0] == self.n_samples
                for i in range(self.n_layers)
        ]):
            raise ValueError(
                "Input 'layer_embeddings' does not have the expected format")

        self.labels_train_enc = self.label_encoder(labels)
        indices_true = dict()
        self.mask_exclude = np.ones((self.n_classes, self.n_classes), dtype=bool)
        for j, c in enumerate(self.labels_unique):
            # Index of labeled samples from class `c`
            indices_true[c] = np.where(labels == c)[0]
            self.mask_exclude[j, j] = False

        self.nonconformity_calib = np.zeros(self.n_samples)
        self.index_knn = [None for _ in range(self.n_layers)]
        for i in range(self.n_layers):
            logger.info("Processing layer {:d}:".format(i + 1))
            if self.transform_models:
                logger.info(
                    "Transforming the embeddings from layer {:d}.".format(i +
                                                                          1))
                data_proj = transform_data_from_model(layer_embeddings[i],
                                                      self.transform_models[i])
                logger.info(
                    "Input dimension = {:d}, projected dimension = {:d}".
                    format(layer_embeddings[i].shape[1], data_proj.shape[1]))
            else:
                data_proj = layer_embeddings[i]

            logger.info("Building a KNN index for nearest neighbor queries.")
            # Build a KNN index on the set of feature embeddings from normal samples from layer `i`
            self.index_knn[i] = KNNIndex(
                data_proj,
                n_neighbors=self.n_neighbors,
                metric=self.metric,
                metric_kwargs=self.metric_kwargs,
                approx_nearest_neighbors=self.approx_nearest_neighbors,
                n_jobs=self.n_jobs,
                low_memory=self.low_memory,
                seed_rng=self.seed_rng)
            # Indices of the nearest neighbors of each sample
            nn_indices, _ = self.index_knn[i].query_self(k=self.n_neighbors)
            logger.info(
                "Calculating the class label counts and non-conformity scores in the neighborhood of "
                "each sample.")
            _, nc_counts = neighbors_label_counts(nn_indices,
                                                  self.labels_train_enc,
                                                  self.n_classes)

            for j, c in enumerate(self.labels_unique):
                # Neighborhood counts of all classes except `c`
                nc_counts_slice = nc_counts[:, self.mask_exclude[j, :]]
                # Nonconformity from layer `i` for all labeled samples from class `c`
                self.nonconformity_calib[indices_true[c]] += np.sum(
                    nc_counts_slice[indices_true[c], :], axis=1)

        return self
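
# A minimal, illustrative sketch of what `neighbors_label_counts` (used in the `fit` method above and defined
# elsewhere in this codebase, typically with `numba` acceleration) is assumed to compute: for each sample, the
# number of nearest neighbors belonging to each class. Only the count matrix is used above; returning the
# majority label as the first value is an assumption made here for illustration.
def neighbors_label_counts_sketch(nn_indices, labels_enc, n_classes):
    import numpy as np
    n, k = nn_indices.shape
    counts = np.zeros((n, n_classes), dtype=int)
    for i in range(n):
        for j in range(k):
            counts[i, labels_enc[nn_indices[i, j]]] += 1

    majority_label = np.argmax(counts, axis=1)
    return majority_label, counts
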
def estimate_intrinsic_dimension(
        data,
        method='two_nn',  # method choices are {'two_nn', 'lid_mle'}
        neighborhood_constant=NEIGHBORHOOD_CONST,
        n_neighbors=None,
        metric='euclidean',
        metric_kwargs=None,
        approx_nearest_neighbors=True,
        n_jobs=1,
        low_memory=False,
        seed_rng=SEED_DEFAULT):
    """
    Wrapper function for estimating the intrinsic dimension of the data.

    :param data: data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number of features.
    :param method: method string. Valid choices are 'two_nn' and 'lid_mle'.
    :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a function
                                  of the number of samples (data size). If `N` is the number of samples, then the
                                  number of neighbors is set to `N^neighborhood_constant`. It is recommended to set
                                  this value in the range 0.4 to 0.5.
    :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                        the `neighborhood_constant` is ignored. It is sufficient to specify either
                        `neighborhood_constant` or `n_neighbors`.
    :param metric: distance metric to use. Euclidean by default.
    :param metric_kwargs: optional keyword arguments for the distance metric specified as a dict.
    :param approx_nearest_neighbors: Set to True to use an approximate nearest neighbor method. Usually the right
                                     choice unless both the number of samples and the number of features are small.
    :param n_jobs: number of CPU cores to use.
    :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                       is likely to increase the running time.
    :param seed_rng: seed for the random number generator.

    :return: positive float value specifying the estimated intrinsic dimension.
    """
    # Build a KNN graph index
    index_knn = KNNIndex(data,
                         neighborhood_constant=neighborhood_constant,
                         n_neighbors=n_neighbors,
                         metric=metric,
                         metric_kwargs=metric_kwargs,
                         shared_nearest_neighbors=False,
                         approx_nearest_neighbors=approx_nearest_neighbors,
                         n_jobs=n_jobs,
                         low_memory=low_memory,
                         seed_rng=seed_rng)
    # Query the nearest neighbors of each point
    nn_indices, nn_distances = index_knn.query_self()

    method = method.lower()
    if method == 'two_nn':
        # Two nearest neighbors ID estimator
        id = id_two_nearest_neighbors(nn_distances)
    elif method == 'lid_mle':
        # Median of the local intrinsic dimension estimates around each point
        id = np.median(lid_mle_amsaleg(nn_distances))
    else:
        raise ValueError(
            "Invalid value '{}' specified for argument 'method'".format(
                method))

    return id
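
# A minimal usage sketch (synthetic data) for `estimate_intrinsic_dimension` above; it assumes the helpers it
# depends on (`KNNIndex`, `id_two_nearest_neighbors`, `lid_mle_amsaleg`) are importable from this codebase.
# A 20-dimensional standard Gaussian has full intrinsic dimension, so both estimates should be close to 20
# (such estimators are typically biased somewhat downward).
def _demo_estimate_intrinsic_dimension():
    import numpy as np
    rng = np.random.RandomState(0)
    data = rng.randn(5000, 20)
    id_two_nn = estimate_intrinsic_dimension(data, method='two_nn')
    id_lid_mle = estimate_intrinsic_dimension(data, method='lid_mle')
    return id_two_nn, id_lid_mle
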
    def fit(self, data, labels, labels_pred):
        """
        Estimate the `1 - alpha` density level sets for each class using the given data, with true labels and
        classifier-predicted labels. This will be used to calculate the trust score.

        :param data: numpy array with the feature vectors of shape `(n, d)`, where `n` and `d` are the number of
                     samples and the data dimension respectively.
        :param labels: numpy array of labels for the classification problem addressed by the DNN. Should have shape
                       `(n, )`, where `n` is the number of samples.
        :param labels_pred: numpy array similar to `labels`, but with the classes predicted by the classifier.

        :return: Instance of the class with all parameters fit to the data.
        """
        self.n_samples, dim = data.shape
        self.labels_unique = np.unique(labels)
        self.n_classes = len(self.labels_unique)
        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the maximum number of samples per class and the neighborhood
            # constant
            num = 0
            for c in self.labels_unique:
                ind = np.where(labels == c)[0]
                if ind.shape[0] > num:
                    num = ind.shape[0]

            self.n_neighbors = int(np.ceil(num ** self.neighborhood_constant))

        logger.info("Number of samples: {:d}. Data dimension = {:d}.".format(self.n_samples, dim))
        logger.info("Number of classes: {:d}.".format(self.n_classes))
        logger.info("Number of neighbors (k): {:d}.".format(self.n_neighbors))
        logger.info("Fraction of outliers (alpha): {:.4f}.".format(self.alpha))
        if self.model_dim_reduction:
            data = transform_data_from_model(data, self.model_dim_reduction)
            dim = data.shape[1]
            logger.info("Applying dimension reduction to the data. Projected dimension = {:d}.".format(dim))

        # Distance from each sample in `data` to the `1 - alpha` level sets corresponding to each class
        distance_level_sets = np.zeros((self.n_samples, self.n_classes))
        self.index_knn = dict()
        self.epsilon = dict()
        indices_sub = dict()
        for j, c in enumerate(self.labels_unique):
            logger.info("Processing data from class '{}':".format(c))
            logger.info("Building a KNN index for all the samples from class '{}'.".format(c))
            indices_sub[c] = np.where(labels == c)[0]
            data_sub = data[indices_sub[c], :]
            self.index_knn[c] = KNNIndex(
                data_sub, n_neighbors=self.n_neighbors,
                metric=self.metric, metric_kwargs=self.metric_kwargs,
                approx_nearest_neighbors=self.approx_nearest_neighbors,
                n_jobs=self.n_jobs,
                low_memory=self.low_memory,
                seed_rng=self.seed_rng
            )
            # Distances to the k nearest neighbors of each sample
            _, nn_distances = self.index_knn[c].query_self(k=self.n_neighbors)
            # Radius or distance to the k-th nearest neighbor for each sample
            radius_arr = nn_distances[:, self.n_neighbors - 1]

            # Smallest radius `epsilon` such that only `alpha` fraction of the samples from class `c` have radius
            # greater than `epsilon`
            if self.alpha > 0.:
                self.epsilon[c] = np.percentile(radius_arr, 100 * (1 - self.alpha), interpolation='midpoint')

                # Exclude the outliers and build a KNN index with the remaining samples
                mask_incl = radius_arr <= self.epsilon[c]
                mask_excl = np.logical_not(mask_incl)
                num_excl = int(mask_excl.sum())
            else:
                # Slightly larger value than the largest radius
                self.epsilon[c] = 1.0001 * np.max(radius_arr)

                # All samples are included in the density level set
                mask_incl = np.ones(indices_sub[c].shape[0], dtype=bool)
                mask_excl = np.logical_not(mask_incl)
                num_excl = 0

            if num_excl:
                logger.info("Excluding {:d} samples with radius larger than {:.6f} and building a KNN index with "
                            "the remaining samples.".format(num_excl, self.epsilon[c]))
                self.index_knn[c] = KNNIndex(
                    data_sub[mask_incl, :], n_neighbors=self.n_neighbors,
                    metric=self.metric, metric_kwargs=self.metric_kwargs,
                    approx_nearest_neighbors=self.approx_nearest_neighbors,
                    n_jobs=self.n_jobs,
                    low_memory=self.low_memory,
                    seed_rng=self.seed_rng
                )
                # Distance to the nearest neighbor of each sample that is part of the KNN index
                _, dist_temp = self.index_knn[c].query_self(k=1)
                ind = indices_sub[c][mask_incl]
                distance_level_sets[ind, j] = dist_temp[:, 0]

                # Distance to the nearest neighbor of each sample that is not a part of the KNN index (outliers)
                _, dist_temp = self.index_knn[c].query(data_sub[mask_excl, :], k=1)
                ind = indices_sub[c][mask_excl]
                distance_level_sets[ind, j] = dist_temp[:, 0]
            else:
                # No need to rebuild the KNN index because no samples are excluded.
                # Distance to the nearest neighbor of each sample
                distance_level_sets[indices_sub[c], j] = nn_distances[:, 0]

        logger.info("Calculating the trust score for the estimation data.")
        for c in self.labels_unique:
            # Compute the distance from each sample from class `c` to the level sets from the remaining classes
            data_sub = data[indices_sub[c], :]
            for j, c_hat in enumerate(self.labels_unique):
                if c_hat == c:
                    continue

                _, dist_temp = self.index_knn[c_hat].query(data_sub, k=1)
                distance_level_sets[indices_sub[c], j] = dist_temp[:, 0]

        self.scores_estim = self._score_helper(distance_level_sets, labels_pred)
        return self
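
# A minimal, illustrative sketch of how a trust score is commonly computed (Jiang et al., 2018) from the
# per-class level-set distances estimated in the `fit` method above: the distance to the closest level set of
# any class other than the predicted class, divided by the distance to the level set of the predicted class.
# The actual `_score_helper` used by this class is defined elsewhere and may differ; the columns of
# `distance_level_sets` are assumed to follow the order of `labels_unique`.
def trust_score_sketch(distance_level_sets, labels_pred, labels_unique, reg=1e-16):
    import numpy as np
    label_to_col = {c: j for j, c in enumerate(labels_unique)}
    pred_cols = np.array([label_to_col[c] for c in labels_pred])
    n = distance_level_sets.shape[0]
    # Distance to the level set of the predicted class
    dist_pred = distance_level_sets[np.arange(n), pred_cols]
    # Distance to the closest level set among the remaining classes
    masked = distance_level_sets.copy()
    masked[np.arange(n), pred_cols] = np.inf
    dist_other = np.min(masked, axis=1)
    return dist_other / (dist_pred + reg)
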
def set_kernel_scale(layer_embeddings_train,
                     layer_embeddings_test,
                     metric='euclidean',
                     n_neighbors=10,
                     n_jobs=1,
                     search_size=20,
                     alpha=0.5):
    # `layer_embeddings_train` and `layer_embeddings_test` are each a list of numpy arrays, one array per layer
    n_layers = len(layer_embeddings_test)
    n_test = layer_embeddings_test[0].shape[0]
    # n_train = layer_embeddings_train[0].shape[0]

    # `1 - epsilon` values
    v = np.linspace(0.05, 0.95, num=search_size)
    sigma_multiplier = np.sqrt(-1. / np.log(v))
    sigma_per_layer = np.ones((n_test, n_layers))
    for i in range(n_layers):
        if metric == 'cosine':
            # For cosine distance, we scale the layer embedding vectors to have unit norm
            norm_train = np.linalg.norm(layer_embeddings_train[i],
                                        axis=1) + NORM_REG
            x_train = layer_embeddings_train[i] / norm_train[:, np.newaxis]
            norm_test = np.linalg.norm(layer_embeddings_test[i],
                                       axis=1) + NORM_REG
            x_test = layer_embeddings_test[i] / norm_test[:, np.newaxis]
        else:
            x_train = layer_embeddings_train[i]
            x_test = layer_embeddings_test[i]

        # Build a KNN index on the layer embeddings from the train split
        index_knn = KNNIndex(x_train,
                             n_neighbors=n_neighbors,
                             metric='euclidean',
                             approx_nearest_neighbors=True,
                             n_jobs=n_jobs)
        # Query the index of nearest neighbors of the layer embeddings from the test split
        nn_indices, nn_distances = index_knn.query(x_test, k=n_neighbors)
        # `nn_indices` and `nn_distances` should have shape `(n_test, n_neighbors)`

        # Candidate sigma values are obtained by multiplying `sqrt(\eta_k^2 - \eta_1^2)` of each test point with
        # the `sigma_multiplier` defined earlier. Here `eta_k` and `eta_1` denote distance to the k-th and the 1-st
        # nearest neighbor respectively
        sigma_cand_vals = (np.sqrt(nn_distances[:, -1]**2 -
                                   nn_distances[:, 0]**2).reshape(n_test, 1) *
                           sigma_multiplier.reshape(1, search_size))
        # `sigma_cand_vals` should have shape `(n_test, search_size)`

        # Compute pairwise distances between points in `layer_embeddings_test` and `layer_embeddings_train`
        dist_mat = pairwise_distances(x_test,
                                      Y=x_train,
                                      metric='euclidean',
                                      n_jobs=n_jobs)
        # `dist_mat` should have shape `(n_test, n_train)`
        # Calculate the objective function to maximize for different candidate `sigma` values
        if n_jobs == 1:
            out = [
                helper_objective(nn_distances, dist_mat, alpha,
                                 sigma_cand_vals, t)
                for t in range(search_size)
            ]
        else:
            # partial function called by multiprocessing
            helper_objective_partial = partial(helper_objective, nn_distances,
                                               dist_mat, alpha,
                                               sigma_cand_vals)
            pool_obj = multiprocessing.Pool(processes=n_jobs)
            out = []
            _ = pool_obj.map_async(helper_objective_partial,
                                   range(search_size),
                                   callback=out.extend)
            pool_obj.close()
            pool_obj.join()

        # `out` will be a list of length `search_size`, where each element is a numpy array with the objective
        # function values for the `n_test` samples.
        # `objec_arr` will have shape `(search_size, n_test)`
        objec_arr = np.array(out)
        # Find the sigma value corresponding to the maximum objective function for each test sample
        ind_max = np.argmax(objec_arr, axis=0)
        sigma_per_layer[:, i] = [
            sigma_cand_vals[j, ind_max[j]] for j in range(n_test)
        ]

    return sigma_per_layer
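
# A minimal usage sketch (not from the original code) for `set_kernel_scale` above. It assumes the module-level
# helpers it relies on (`KNNIndex`, `pairwise_distances`, `helper_objective`, `NORM_REG`) are available, and
# that `sigma_per_layer[j, i]` is meant to act as a Gaussian kernel bandwidth for test sample `j` at layer `i`,
# i.e. weights of the form exp(-(d / sigma)^2); the two-layer data below is synthetic.
def _demo_set_kernel_scale():
    import numpy as np
    rng = np.random.RandomState(0)
    layers_train = [rng.randn(500, 32), rng.randn(500, 16)]
    layers_test = [rng.randn(50, 32), rng.randn(50, 16)]
    sigma_per_layer = set_kernel_scale(layers_train, layers_test,
                                       metric='euclidean', n_neighbors=10, n_jobs=1)
    # Example: Gaussian kernel weight between the first test and first train sample at layer 0
    d = np.linalg.norm(layers_test[0][0] - layers_train[0][0])
    weight = np.exp(-(d / sigma_per_layer[0, 0]) ** 2)
    return sigma_per_layer, weight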