class NeighborhoodPreservingProjection:
    """
    Neighborhood preserving projection (NPP) method for dimensionality reduction, also known as neighborhood
    preserving embedding (NPE) [1]. The orthogonal neighborhood preserving projection (ONPP) method is based
    on [2].

    1. He, Xiaofei, et al. "Neighborhood preserving embedding." Tenth IEEE International Conference on Computer
       Vision (ICCV'05) Volume 1. Vol. 2. IEEE, 2005.
    2. Kokiopoulou, Effrosyni, and Yousef Saad. "Orthogonal neighborhood preserving projections: A projection-based
       dimensionality reduction technique." IEEE Transactions on Pattern Analysis and Machine Intelligence
       29.12 (2007): 2143-2156.

    """
    def __init__(
            self,
            dim_projection='auto',  # 'auto' or a positive integer
            orthogonal=False,  # True to enable the orthogonal NPP (ONPP) method
            pca_cutoff=1.0,
            neighborhood_constant=NEIGHBORHOOD_CONST,
            n_neighbors=None,  # Specify only one of `neighborhood_constant` and `n_neighbors`.
            # If `n_neighbors` is specified, `neighborhood_constant` will be ignored.
            shared_nearest_neighbors=False,
            metric=METRIC_DEF,
            metric_kwargs=None,  # distance metric and its parameter dict (if any)
            approx_nearest_neighbors=True,
            n_jobs=1,
            reg_eps=0.001,
            seed_rng=SEED_DEFAULT):
        """
        :param dim_projection: Dimension of data in the projected feature space. If set to 'auto', a suitable reduced
                               dimension will be chosen by estimating the intrinsic dimension of the data. If an
                               integer value is specified, it should be in the range `[1, dim - 1]`, where `dim`
                               is the observed dimension of the data.
        :param orthogonal: Set to True to select the orthogonal NPP (ONPP) method. It was shown to have better
                           performance than NPP in [2].
        :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                           in the projected dimension-reduced data. PCA is also applied as a first-level dimension
                           reduction step to handle potential singularity of the data matrix. Set `pca_cutoff = 1.0`
                           to handle only the data matrix singularity (i.e., no variance-based reduction).
        :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of samples,
                                      then the number of neighbors is set to `N^neighborhood_constant`. It is
                                      recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                            the `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance to
                                         find the K nearest neighbors. This is a secondary distance metric that is
                                         found to be better suited to high dimensional data.
        :param metric: string or a callable that specifies the distance metric to be used for the SNN similarity
                       calculation.
        :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a
                              dictionary.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. This is recommended when the number of points is
                                         large and/or when the dimension of the data is high.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
        :param reg_eps: small float value that multiplies the trace to regularize the Gram matrix, if it is
                        close to singular.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.dim_projection = dim_projection
        self.orthogonal = orthogonal
        self.pca_cutoff = pca_cutoff
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.reg_eps = reg_eps
        self.seed_rng = seed_rng

        self.mean_data = None
        self.index_knn = None
        self.iterated_laplacian_matrix = None
        self.transform_pca = None
        self.transform_npp = None
        self.transform_comb = None

    def fit(self, data):
        """
        Find the optimal projection matrix for the given data points.

        :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of
                     dimensions.
        :return:
        """
        N, d = data.shape
        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N**self.neighborhood_constant))

        logger.info("Applying PCA as first-level dimension reduction step")
        data, self.mean_data, self.transform_pca = pca_wrapper(
            data, cutoff=self.pca_cutoff, seed_rng=self.seed_rng)

        # If `self.n_neighbors >= data.shape[1]` (number of neighbors at least as large as the data dimension),
        # the Gram matrix that comes up while solving for the neighborhood weights becomes singular. To avoid
        # this, we reduce `self.n_neighbors` to `data.shape[1] - 1`; a small nonzero value (`reg_eps` times the
        # trace) is also added to the diagonal elements of the Gram matrix as regularization
        d = data.shape[1]
        if self.n_neighbors >= d:
            k = max(d - 1, 1)
            logger.info(
                "Reducing the number of neighbors from {:d} to {:d} to avoid singular Gram "
                "matrix while solving for neighborhood weights.".format(
                    self.n_neighbors, k))
            self.n_neighbors = k

        if self.dim_projection == 'auto':
            # Estimate the intrinsic dimension of the data and use that as the projected dimension
            id = estimate_intrinsic_dimension(
                data,
                method='two_nn',
                n_neighbors=self.n_neighbors,
                approx_nearest_neighbors=self.approx_nearest_neighbors,
                n_jobs=self.n_jobs,
                seed_rng=self.seed_rng)
            self.dim_projection = int(np.ceil(id))
            logger.info(
                "Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}."
                .format(id))

        if self.orthogonal:
            # ONPP skips the eigenvector corresponding to the smallest eigenvalue, so at most
            # `data.shape[1] - 1` eigenvectors are available
            self.dim_projection = min(self.dim_projection, data.shape[1] - 1)
        elif self.dim_projection >= data.shape[1]:
            self.dim_projection = data.shape[1]

        logger.info("Dimension of the projected subspace = {:d}".format(
            self.dim_projection))

        # Create a KNN index for all nearest neighbor tasks
        self.index_knn = KNNIndex(
            data,
            n_neighbors=self.n_neighbors,
            metric=self.metric,
            metric_kwargs=self.metric_kwargs,
            shared_nearest_neighbors=self.shared_nearest_neighbors,
            approx_nearest_neighbors=self.approx_nearest_neighbors,
            n_jobs=self.n_jobs,
            seed_rng=self.seed_rng)

        # Create the adjacency matrix `W` based on the optimal reconstruction weights of neighboring points
        # (as done in locally linear embedding).
        # Then calculate the iterated graph Laplacian matrix `M = (I - W)^T (I - W)`.
        self.create_iterated_laplacian(data)

        # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest
        # eigenvalues as the columns of the projection matrix
        logger.info(
            "Solving the generalized eigenvalue problem to find the optimal projection matrix."
        )
        data_trans = data.T
        # X^T M X
        lmat = sparse.csr_matrix.dot(data_trans,
                                     self.iterated_laplacian_matrix).dot(data)
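        # NPP finds projection directions `v` that minimize the Rayleigh quotient
        # `(v^T X^T M X v) / (v^T X^T X v)`. ONPP instead constrains the projection matrix to be
        # orthonormal, which turns the problem into a standard eigenvalue problem on `X^T M X`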
        if self.orthogonal:
            # For ONPP, the paper [2] recommends skipping the eigenvector corresponding to the smallest eigenvalue
            eig_values, eig_vectors = eigh(lmat,
                                           eigvals=(1, self.dim_projection))
        else:
            # Standard NPP or NPE
            # X^T X
            rmat = np.dot(data_trans, data)
            eig_values, eig_vectors = eigh(lmat,
                                           b=rmat,
                                           eigvals=(0,
                                                    self.dim_projection - 1))

        # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered
        # according to increasing eigenvalues.
        # `eig_vectors` will have shape `(data.shape[1], self.dim_projection)`
        self.transform_npp = eig_vectors

        self.transform_comb = np.dot(self.transform_pca, self.transform_npp)

    def transform(self, data, dim=None):
        """
        Transform the given data by first subtracting the mean and then applying the linear projection.
        Optionally, you can specify the dimension of the transformed data using `dim`. This cannot be larger
        than `self.dim_projection`.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: If set to `None`, the dimension of the transformed data is `self.dim_projection`.
                    Else `dim` can be set to a value <= `self.dim_projection`; this simply retains only the
                    first `dim` eigenvectors (those with the smallest eigenvalues).

        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        if dim is None:
            data_trans = np.dot(data - self.mean_data, self.transform_comb)
        else:
            data_trans = np.dot(data - self.mean_data,
                                self.transform_comb[:, 0:dim])

        return data_trans

    def fit_transform(self, data, dim=None):
        """
        Fit the model and transform the given data.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: same as the `transform` method.

        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        self.fit(data)
        return self.transform(data, dim=dim)

    def create_iterated_laplacian(self, data):
        """
        Calculate the optimal reconstruction weights corresponding to the nearest neighbors of each point
        (exactly the first step of the locally linear embedding (LLE) method), and form the iterated graph
        Laplacian matrix `M = (I - W)^T (I - W)` from them.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :return: None
        """
        # Find the `self.n_neighbors` nearest neighbors of each point
        nn_indices, nn_distances = self.index_knn.query_self(
            k=self.n_neighbors)
        N, K = nn_indices.shape

        if self.n_jobs == 1:
            w = [
                helper_solve_lle(data, nn_indices, self.reg_eps, i)
                for i in range(N)
            ]
        else:
            helper_partial = partial(helper_solve_lle, data, nn_indices,
                                     self.reg_eps)
            pool_obj = multiprocessing.Pool(processes=self.n_jobs)
            w = []
            _ = pool_obj.map_async(helper_partial, range(N), callback=w.extend)
            pool_obj.close()
            pool_obj.join()

        # Create a sparse matrix of size `(N, N)` for the adjacency matrix
        row_ind = np.array([[i] * (K + 1) for i in range(N)],
                           dtype=int).ravel()
        col_ind = np.insert(nn_indices, 0, np.arange(N), axis=1).ravel()
        w = np.negative(w)
        vals = np.insert(w, 0, 1.0, axis=1).ravel()
        # Matrix `I - W`
        mat_tmp = sparse.csr_matrix((vals, (row_ind, col_ind)), shape=(N, N))

        # Matrix `M = (I - W)^T (I - W)`, also referred to as the iterated graph Laplacian
        self.iterated_laplacian_matrix = sparse.csr_matrix.dot(
            mat_tmp.transpose(), mat_tmp)
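

# The helper `helper_solve_lle` used above is defined elsewhere in this module. The sketch below is a
# minimal reference implementation (an assumption based on the call signature, not the actual helper):
# for point `i`, it solves the regularized local Gram system for the LLE reconstruction weights and
# normalizes them to sum to one.
def _sketch_solve_lle_weights(data, nn_indices, reg_eps, i):
    # Vectors from point `i` to each of its `K` nearest neighbors
    z = data[nn_indices[i, :], :] - data[i, :]
    # Local Gram matrix of shape `(K, K)`
    gram = np.dot(z, z.T)
    # Regularize the diagonal with a small multiple of the trace; the Gram matrix is singular
    # whenever `K` exceeds the data dimension
    k = gram.shape[0]
    trace = np.trace(gram)
    gram.flat[::k + 1] += (reg_eps * trace / k) if trace > 0. else reg_eps
    # Solve `gram w = 1` and normalize the weights to sum to 1 (the LLE constraint)
    w = np.linalg.solve(gram, np.ones(k))
    return w / w.sum()

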
class LocalityPreservingProjection:
    """
    Locality preserving projection (LPP) method for dimensionality reduction [1, 2].
    Orthogonal LPP (OLPP) method based on [3].

    1. He, Xiaofei, and Partha Niyogi. "Locality preserving projections." Advances in Neural Information
       Processing Systems. 2004.
    2. He, Xiaofei, et al. "Face recognition using Laplacianfaces." IEEE Transactions on Pattern Analysis and
       Machine Intelligence 27.3 (2005): 328-340.
    3. Kokiopoulou, Effrosyni, and Yousef Saad. "Orthogonal neighborhood preserving projections: A projection-based
       dimensionality reduction technique." IEEE Transactions on Pattern Analysis and Machine Intelligence
       29.12 (2007): 2143-2156.

    """
    def __init__(
            self,
            dim_projection='auto',  # 'auto' or a positive integer
            orthogonal=False,  # True to enable the orthogonal LPP (OLPP) method
            pca_cutoff=1.0,
            neighborhood_constant=NEIGHBORHOOD_CONST,
            n_neighbors=None,  # Specify only one of `neighborhood_constant` and `n_neighbors`.
            # If `n_neighbors` is specified, `neighborhood_constant` will be ignored.
            shared_nearest_neighbors=False,
            edge_weights='SNN',  # Choices are {'simple', 'SNN', 'heat_kernel'}
            heat_kernel_param=None,  # Used only if `edge_weights = 'heat_kernel'`
            metric=METRIC_DEF,
            metric_kwargs=None,  # distance metric and its parameter dict (if any)
            approx_nearest_neighbors=True,
            n_jobs=1,
            seed_rng=SEED_DEFAULT):
        """
        :param dim_projection: Dimension of data in the projected feature space. If set to 'auto', a suitable reduced
                               dimension will be chosen by estimating the intrinsic dimension of the data. If an
                               integer value is specified, it should be in the range `[1, dim - 1]`, where `dim`
                               is the observed dimension of the data.
        :param orthogonal: Set to True to select the OLPP method. It was shown to have better performance than LPP
                           in [3].
        :param pca_cutoff: float value in (0, 1] specifying the proportion of cumulative data variance to preserve
                           in the projected dimension-reduced data. PCA is also applied as a first-level dimension
                           reduction step to handle potential singularity of the data matrix. Set `pca_cutoff = 1.0`
                           to handle only the data matrix singularity (i.e., no variance-based reduction).
        :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of samples,
                                      then the number of neighbors is set to `N^neighborhood_constant`. It is
                                      recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                            the `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance to
                                         find the K nearest neighbors. This is a secondary distance metric that is
                                         found to be better suited to high dimensional data. This will be set to
                                         True if `edge_weights = 'SNN'`.
        :param edge_weights: Weighting method to use for the edge weights. Valid choices are {'simple', 'SNN',
                             'heat_kernel'}. They are described below:
                             - 'simple': the edge weight is set to one for every sample pair in the neighborhood.
                             - 'SNN': the shared nearest neighbors (SNN) similarity score between two samples is used
                             as the edge weight. This will be a value in [0, 1].
                             - 'heat_kernel': the heat (Gaussian) kernel with a suitable scale parameter defines the
                             edge weight.
        :param heat_kernel_param: Heat kernel scale parameter. If set to `None`, this parameter is set automatically
                                  based on the median of the pairwise distances between samples. Else a positive
                                  real value can be specified.
        :param metric: string or a callable that specifies the distance metric to be used for the SNN similarity
                       calculation. This is used only if `edge_weights = 'SNN'`.
        :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a
                              dictionary. Again, this is used only if `edge_weights = 'SNN'`.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. This is recommended when the number of points is
                                         large and/or when the dimension of the data is high.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.dim_projection = dim_projection
        self.orthogonal = orthogonal
        self.pca_cutoff = pca_cutoff
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.edge_weights = edge_weights.lower()
        self.heat_kernel_param = heat_kernel_param
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.seed_rng = seed_rng

        if self.edge_weights not in {'simple', 'snn', 'heat_kernel'}:
            raise ValueError(
                "Invalid value '{}' for parameter 'edge_weights'".format(
                    self.edge_weights))

        if self.edge_weights == 'snn':
            self.shared_nearest_neighbors = True

        self.mean_data = None
        self.index_knn = None
        self.adjacency_matrix = None
        self.incidence_matrix = None
        self.laplacian_matrix = None
        self.transform_pca = None
        self.transform_lpp = None
        self.transform_comb = None

    def fit(self, data):
        """
        Find the optimal projection matrix for the given data points.

        :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of
                     dimensions.
        :return:
        """
        N, d = data.shape
        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N**self.neighborhood_constant))

        logger.info("Applying PCA as first-level dimension reduction step")
        data, self.mean_data, self.transform_pca = pca_wrapper(
            data, cutoff=self.pca_cutoff, seed_rng=self.seed_rng)
        if self.dim_projection == 'auto':
            # Estimate the intrinsic dimension of the data and use that as the projected dimension
            id = estimate_intrinsic_dimension(
                data,
                method='two_nn',
                n_neighbors=self.n_neighbors,
                approx_nearest_neighbors=self.approx_nearest_neighbors,
                n_jobs=self.n_jobs,
                seed_rng=self.seed_rng)
            self.dim_projection = int(np.ceil(id))
            logger.info(
                "Estimated intrinsic dimension of the (PCA-projected) data = {:.2f}."
                .format(id))

        if self.dim_projection >= data.shape[1]:
            self.dim_projection = data.shape[1]

        logger.info("Dimension of the projected subspace = {:d}".format(
            self.dim_projection))

        # Create a KNN index for all nearest neighbor tasks
        self.index_knn = KNNIndex(
            data,
            n_neighbors=self.n_neighbors,
            metric=self.metric,
            metric_kwargs=self.metric_kwargs,
            shared_nearest_neighbors=self.shared_nearest_neighbors,
            approx_nearest_neighbors=self.approx_nearest_neighbors,
            n_jobs=self.n_jobs,
            seed_rng=self.seed_rng)

        # Create the symmetric adjacency matrix, diagonal incidence matrix, and the graph Laplacian matrix
        # for the data points
        self.create_laplacian_matrix(data)

        # Solve the generalized eigenvalue problem and take the eigenvectors corresponding to the smallest
        # eigenvalues as the columns of the projection matrix
        logger.info(
            "Solving the generalized eigenvalue problem to find the optimal projection matrix."
        )
        data_trans = data.T
        # X^T L X
        lmat = sparse.csr_matrix.dot(data_trans,
                                     self.laplacian_matrix).dot(data)
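        # LPP finds projection directions `v` that minimize `v^T X^T L X v` subject to the scale
        # constraint `v^T X^T D X v = 1`. OLPP instead constrains the projection matrix to be
        # orthonormal, which turns the problem into a standard eigenvalue problem on `X^T L X`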
        if self.orthogonal:
            # Orthogonal LPP
            eig_values, eig_vectors = eigh(lmat,
                                           eigvals=(0,
                                                    self.dim_projection - 1))
        else:
            # Standard LPP
            # X^T D X
            rmat = sparse.csr_matrix.dot(data_trans,
                                         self.incidence_matrix).dot(data)
            eig_values, eig_vectors = eigh(lmat,
                                           b=rmat,
                                           eigvals=(0,
                                                    self.dim_projection - 1))

        # `eig_vectors` is a numpy array with each eigenvector along a column. The eigenvectors are ordered
        # according to increasing eigenvalues.
        # `eig_vectors` will have shape `(data.shape[1], self.dim_projection)`
        self.transform_lpp = eig_vectors

        self.transform_comb = np.dot(self.transform_pca, self.transform_lpp)

    def transform(self, data, dim=None):
        """
        Transform the given data by first subtracting the mean and then applying the linear projection.
        Optionally, you can specify the dimension of the transformed data using `dim`. This cannot be larger
        than `self.dim_projection`.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: If set to `None`, the dimension of the transformed data is `self.dim_projection`.
                    Else `dim` can be set to a value <= `self.dim_projection`; this simply retains only the
                    first `dim` eigenvectors (those with the smallest eigenvalues).

        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        if dim is None:
            data_trans = np.dot(data - self.mean_data, self.transform_comb)
        else:
            data_trans = np.dot(data - self.mean_data,
                                self.transform_comb[:, 0:dim])

        return data_trans

    def fit_transform(self, data, dim=None):
        """
        Fit the model and transform the given data.

        :param data: numpy array of shape `(N, d)` with `N` samples in `d` dimensions.
        :param dim: same as the `transform` method.

        :return:
            - data_trans: numpy array of shape `(N, dim)` with the transformed, dimension-reduced data.
        """
        self.fit(data)
        return self.transform(data, dim=dim)

    def create_laplacian_matrix(self, data):
        """
        Calculate the graph Laplacian matrix for the given data.

        :param data: data matrix of shape `(N, d)` where `N` is the number of samples and `d` is the number of
                     dimensions.
        :return:
        """
        # Find the `self.n_neighbors` nearest neighbors of each point
        nn_indices, nn_distances = self.index_knn.query_self(
            k=self.n_neighbors)

        N, K = nn_indices.shape
        row_ind = np.array([[i] * K for i in range(N)], dtype=int).ravel()
        col_ind = nn_indices.ravel()
        if self.edge_weights == 'simple':
            vals = np.ones(N * K)
        elif self.edge_weights == 'snn':
            # The SNN distance is the inverse cosine (arccos) of the SNN similarity. The range of SNN
            # distances is [0, pi / 2], so the SNN similarity `cos(distance)` lies in the range [0, 1].
            vals = np.clip(np.cos(nn_distances).ravel(), 0., None)
        else:
            # Heat kernel
            vals = calculate_heat_kernel(data,
                                         nn_indices,
                                         self.heat_kernel_param,
                                         self.metric,
                                         metric_kwargs=self.metric_kwargs,
                                         n_jobs=self.n_jobs).ravel()

        # Adjacency or edge weight matrix (W)
        mat_tmp = sparse.csr_matrix((vals, (row_ind, col_ind)), shape=(N, N))
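        # The kNN relation is not symmetric, so the edge weight matrix is symmetrized by averaging
        # it with its transpose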
        self.adjacency_matrix = 0.5 * (mat_tmp + mat_tmp.transpose())

        # Diagonal incidence (degree) matrix `D`, whose entries are the row sums of `W`
        vals_diag = self.adjacency_matrix.sum(axis=1)
        vals_diag = np.array(vals_diag[:, 0]).ravel()
        ind = np.arange(N)
        self.incidence_matrix = sparse.csr_matrix((vals_diag, (ind, ind)),
                                                  shape=(N, N))

        # Graph Laplacian matrix (L = D - W)
        self.laplacian_matrix = self.incidence_matrix - self.adjacency_matrix
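

# A minimal usage sketch for the two projection classes above. The synthetic data and parameter values
# are illustrative assumptions; the module-level defaults (`NEIGHBORHOOD_CONST`, `METRIC_DEF`,
# `SEED_DEFAULT`) are taken as defined elsewhere in this module.
def _demo_projection_methods():
    rng = np.random.RandomState(0)
    # 500 samples lying close to a 3-dimensional subspace of a 20-dimensional space
    data = np.dot(rng.randn(500, 3), rng.randn(3, 20)) + 0.01 * rng.randn(500, 20)

    # NPP with the projected dimension chosen automatically via intrinsic dimension estimation
    model_npp = NeighborhoodPreservingProjection(dim_projection='auto', orthogonal=False)
    data_npp = model_npp.fit_transform(data)

    # Orthogonal LPP with a fixed projected dimension and heat kernel edge weights
    model_olpp = LocalityPreservingProjection(dim_projection=2, orthogonal=True,
                                              edge_weights='heat_kernel')
    data_olpp = model_olpp.fit_transform(data)
    return data_npp, data_olpp

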
class averaged_KLPE_anomaly_detection:
    """
    Anomaly detection based on the averaged K nearest neighbor distance statistic and its localized p-value
    estimate (aK-LPE). The anomaly score of a test point is the negative log of the empirical p-value of its
    averaged KNN distance statistic relative to the nominal (training) data.
    """
    def __init__(self,
                 neighborhood_constant=NEIGHBORHOOD_CONST,
                 n_neighbors=None,
                 standardize=True,
                 metric=METRIC_DEF,
                 metric_kwargs=None,
                 shared_nearest_neighbors=False,
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 low_memory=False,
                 seed_rng=SEED_DEFAULT):
        """

        :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of samples,
                                      then the number of neighbors is set to `N^neighborhood_constant`. It is
                                      recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                            the `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param standardize: Set to True to scale the individual features to the range [-1, 1] using min-max
                            scaling.
        :param metric: string or a callable that specifies the distance metric.
        :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a
                              dictionary.
        :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance.
                                         This is a secondary distance metric that is found to be better suited to
                                         high dimensional data.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. This is recommended when the number of points is
                                         large and/or when the dimension of the data is high.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
        :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                           is likely to increase the running time.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.standardize = standardize
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.low_memory = low_memory
        self.seed_rng = seed_rng

        self.scaler = None
        self.data_train = None
        self.neighborhood_range = None
        self.index_knn = None
        self.dist_stat_nominal = None
        np.random.seed(self.seed_rng)

    def fit(self, data):
        """
        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number
                     of dimensions (features).
        :return: None
        """
        N, d = data.shape
        if self.standardize:
            self.scaler = MinMaxScaler(feature_range=(-1, 1)).fit(data)
            data = self.scaler.transform(data)

        if self.shared_nearest_neighbors:
            self.data_train = data

        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the data size and the neighborhood constant
            self.n_neighbors = int(np.ceil(N**self.neighborhood_constant))

        # The distance statistic is averaged over this neighborhood range
        low = self.n_neighbors - int(np.floor(0.5 * (self.n_neighbors - 1)))
        high = self.n_neighbors + int(np.floor(0.5 * self.n_neighbors))
        self.neighborhood_range = (low, high)
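        # For example, with `n_neighbors = 10` the range is `(6, 15)`, i.e., the distance statistic is
        # averaged over the 6th through the 15th nearest neighbors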
        logger.info("Number of samples: {:d}. Number of features: {:d}".format(
            N, d))
        logger.info(
            "Range of nearest neighbors used for the averaged K-LPE statistic: ({:d}, {:d})"
            .format(low, high))
        # Build the KNN graph
        self.index_knn = KNNIndex(
            data,
            n_neighbors=self.neighborhood_range[1],
            metric=self.metric,
            metric_kwargs=self.metric_kwargs,
            shared_nearest_neighbors=self.shared_nearest_neighbors,
            approx_nearest_neighbors=self.approx_nearest_neighbors,
            n_jobs=self.n_jobs,
            low_memory=self.low_memory,
            seed_rng=self.seed_rng)
        # Compute the distance statistic for every data point
        self.dist_stat_nominal = self.distance_statistic(data,
                                                         exclude_self=True)

    def score(self, data_test, exclude_self=False, return_distances=False):
        """
        Calculate the anomaly score which is the negative log of the empirical p-value of the averaged KNN distance.

        :param data_test: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the
                          number of dimensions (features).
        :param exclude_self: Set to True if the points in `data` were already used to build the KNN index.
        :param return_distances: Set to True in order to include the distance statistics along with the negative
                                 log p-value scores in the returned tuple.
        :return:
            - score: numpy array of shape `(N, )` containing the score for each point. Points with a higher
              score are more likely to be anomalous.
            - dist: numpy array of shape `(N, )` containing the distance statistic for each point. This is
              returned only if `return_distances` is set to True.
        """
        # Calculate the k-nearest neighbors based distance statistic
        dist_stat_test = self.distance_statistic(data_test,
                                                 exclude_self=exclude_self)
        # Negative log of the empirical p-value
        p = pvalue_score(self.dist_stat_nominal,
                         dist_stat_test,
                         log_transform=True,
                         bootstrap=True)

        if return_distances:
            return p, dist_stat_test
        else:
            return p

    def distance_statistic(self, data, exclude_self=False):
        """
        Calculate the average distance statistic by querying the nearest neighbors of the given set of points.

        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number
                     of dimensions (features).
        :param exclude_self: Set to True if the points in `data` were already used to build the KNN index.
        :return dist_stat: numpy array of distance statistic for each point.
        """
        if exclude_self:
            # Data should be already scaled in the `fit` method
            nn_indices, nn_distances = self.index_knn.query_self(
                k=self.neighborhood_range[1])
        else:
            if self.standardize:
                data = self.scaler.transform(data)

            nn_indices, nn_distances = self.index_knn.query(
                data, k=self.neighborhood_range[1])

        if self.shared_nearest_neighbors:
            # The distance statistic is calculated based on the primary distance metric, but within the
            # neighborhood set found using the SNN distance. The idea is that for high-dimensional data,
            # the neighborhood found using SNN is more reliable
            dist_stat = self.distance_statistic_local(
                data, nn_indices, self.neighborhood_range[0])
        else:
            dist_stat = np.mean(nn_distances[:, (self.neighborhood_range[0] -
                                                 1):],
                                axis=1)

        return dist_stat

    def distance_statistic_local(self, data, nn_indices, k):
        """
        Computes the mean distance statistic for each row of `data` within a local neighborhood specified by
        `nn_indices`.

        :param data: numpy data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number
                     of dimensions (features).
        :param nn_indices: numpy array of `p` nearest neighbor indices with shape `(N, p)`.
        :param k: start index of the neighbor from which the mean distance is computed.
        :return dist_array: numpy array of shape `(N, )` with the mean distance values.
        """
        n = data.shape[0]
        if self.n_jobs == 1:
            dist_stat = [
                helper_distance(data, self.data_train, nn_indices, self.metric,
                                self.metric_kwargs, k, i) for i in range(n)
            ]
        else:
            helper_distance_partial = partial(helper_distance, data,
                                              self.data_train, nn_indices,
                                              self.metric, self.metric_kwargs,
                                              k)
            pool_obj = multiprocessing.Pool(processes=self.n_jobs)
            dist_stat = []
            _ = pool_obj.map_async(helper_distance_partial,
                                   range(n),
                                   callback=dist_stat.extend)
            pool_obj.close()
            pool_obj.join()

        return np.array(dist_stat)
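

# A minimal usage sketch for the aK-LPE anomaly detector above. The toy data and the outlier shift are
# illustrative assumptions, not part of the original module.
def _demo_aklpe():
    rng = np.random.RandomState(0)
    # Nominal data from a standard Gaussian; test set with a few shifted (anomalous) points
    data_nominal = rng.randn(1000, 10)
    data_test = np.vstack([rng.randn(20, 10), rng.randn(5, 10) + 5.])

    detector = averaged_KLPE_anomaly_detection(n_neighbors=10)
    detector.fit(data_nominal)
    # Higher scores (negative log of the empirical p-values) indicate more anomalous points
    scores = detector.score(data_test)
    return scores

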
class KNNClassifier:
    """
    Basic k nearest neighbors classifier that supports approximate nearest neighbor querying and custom distance
    metrics including shared nearest neighbors.
    """
    def __init__(self,
                 n_neighbors=1,
                 metric=METRIC_DEF,
                 metric_kwargs=None,
                 shared_nearest_neighbors=False,
                 approx_nearest_neighbors=True,
                 n_jobs=1,
                 low_memory=False,
                 seed_rng=SEED_DEFAULT):
        """
        :param n_neighbors: int value specifying the number of nearest neighbors. Should be >= 1.
        :param metric: string or a callable that specifies the distance metric.
        :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a
                              dictionary.
        :param shared_nearest_neighbors: Set to True in order to use the shared nearest neighbor (SNN) distance.
                                         This is a secondary distance metric that is found to be better suited to
                                         high dimensional data.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. This is recommended when the number of points is
                                         large and/or when the dimension of the data is high.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
        :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                           is likely to increase the running time.
        :param seed_rng: int value specifying the seed for the random number generator.
        """
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.shared_nearest_neighbors = shared_nearest_neighbors
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.n_jobs = get_num_jobs(n_jobs)
        self.low_memory = low_memory
        self.seed_rng = seed_rng

        self.index_knn = None
        self.y_train = None
        self.n_classes = None
        self.labels_dtype = None
        self.label_enc = None
        self.label_dec = None

    def fit(self, X, y, y_unique=None):
        """

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param y: numpy array of class labels of shape `(N, )`.
        :param y_unique: Allows the optional specification of the unique labels. Can be a tuple, list, or numpy
                         array of the unique labels. If this is not specified, then it is found using
                         `numpy.unique`.
        :return: None
        """
        self.labels_dtype = y.dtype
        # Labels are mapped to dtype int because `numba` does not handle generic numpy arrays
        if y_unique is None:
            y_unique = np.unique(y)

        self.n_classes = len(y_unique)
        ind = np.arange(self.n_classes)
        # Mapping from label values to integers and its inverse
        d = dict(zip(y_unique, ind))
        self.label_enc = np.vectorize(d.__getitem__)

        d = dict(zip(ind, y_unique))
        self.label_dec = np.vectorize(d.__getitem__)
        self.y_train = self.label_enc(y)

        self.index_knn = KNNIndex(
            X,
            n_neighbors=self.n_neighbors,
            metric=self.metric,
            metric_kwargs=self.metric_kwargs,
            shared_nearest_neighbors=self.shared_nearest_neighbors,
            approx_nearest_neighbors=self.approx_nearest_neighbors,
            n_jobs=self.n_jobs,
            low_memory=self.low_memory,
            seed_rng=self.seed_rng
        )

    def predict(self, X, is_train=False):
        """
        Predict the class labels for the given inputs.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param is_train: Set to True if prediction is being done on the same data used to train.

        :return: numpy array with the class predictions, of shape `(N, )`.
        """
        # Get the indices of the nearest neighbors from the training set
        if is_train:
            nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors)
        else:
            nn_indices, nn_distances = self.index_knn.query(X, k=self.n_neighbors)

        labels_pred, _ = helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec,
                                            self.n_neighbors)
        return labels_pred

    def predict_multiple_k(self, X, k_list, is_train=False):
        """
        Find the KNN predictions for multiple k values specified via the param `k_list`. This is done efficiently
        by querying for the maximum number of nearest neighbors once and using the results. It is assumed that the
        values in `k_list` are sorted in increasing order. This is useful while performing a search for the
        best `k` value using cross-validation.

        NOTE: The maximum value in `k_list` should be <= `self.n_neighbors`.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param k_list: list or array of k values for which predictions are to be made. Each value should be an
                       integer >= 1 and the values should be sorted in increasing order. For example,
                       `k_list = [2, 4, 6, 8, 10]`.
        :param is_train: Set to True if prediction is being done on the same data used to train.

        :return: numpy array with the class predictions corresponding to each k value in `k_list`.
                 Has shape `(len(k_list), N)`.
        """
        if k_list[-1] > self.n_neighbors:
            raise ValueError("Invalid input: maximum value in `k_list` cannot be larger than {:d}.".
                             format(self.n_neighbors))

        # Query the maximum number of nearest neighbors from `k_list`
        if is_train:
            nn_indices, nn_distances = self.index_knn.query_self(k=k_list[-1])
        else:
            nn_indices, nn_distances = self.index_knn.query(X, k=k_list[-1])

        if self.n_jobs == 1 or len(k_list) == 1:
            labels_pred = np.array(
                [helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec, k)[0] for k in k_list],
                dtype=self.labels_dtype
            )
        else:
            helper_partial = partial(helper_knn_predict, nn_indices, self.y_train, self.n_classes, self.label_dec)
            pool_obj = multiprocessing.Pool(processes=self.n_jobs)
            outputs = []
            _ = pool_obj.map_async(helper_partial, k_list, callback=outputs.extend)
            pool_obj.close()
            pool_obj.join()

            labels_pred = np.array([tup[0] for tup in outputs], dtype=self.labels_dtype)

        return labels_pred

    def predict_proba(self, X, is_train=False):
        """
        Estimate the probability of each class along with the predicted most-frequent class.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param is_train: Set to True if prediction is being done on the same data used to train.

        :return:
            - numpy array with the class predictions, of shape `(N, )`.
            - numpy array with the estimated probability of each class, of shape `(N, self.n_classes)`.
              Each row should sum to 1.
        """
        # Get the indices of the nearest neighbors from the training set
        if is_train:
            nn_indices, nn_distances = self.index_knn.query_self(k=self.n_neighbors)
        else:
            nn_indices, nn_distances = self.index_knn.query(X, k=self.n_neighbors)

        labels_pred, counts = helper_knn_predict(nn_indices, self.y_train, self.n_classes, self.label_dec,
                                                 self.n_neighbors)
        proba = counts / self.n_neighbors

        return labels_pred, proba

    def fit_predict(self, X, y):
        """
        Fit a model and predict on the training data.

        :param X: numpy array with the feature vectors of shape `(N, d)`, where `N` is the number of samples
                  and `d` is the dimension.
        :param y: numpy array of class labels of shape `(N, )`.
        :return: numpy array with the class predictions, of shape `(N, )`.
        """
        self.fit(X, y)
        return self.predict(X, is_train=True)
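

# A minimal usage sketch for the KNN classifier above; the toy data is an illustrative assumption.
def _demo_knn_classifier():
    rng = np.random.RandomState(0)
    # Two Gaussian blobs with labels 0 and 1
    X = np.vstack([rng.randn(100, 5), rng.randn(100, 5) + 3.])
    y = np.repeat([0, 1], 100)

    knn = KNNClassifier(n_neighbors=10)
    knn.fit(X, y)
    labels, proba = knn.predict_proba(X, is_train=True)
    # Predictions for multiple (sorted) values of k <= `n_neighbors`, e.g. for a cross-validated search
    labels_multi = knn.predict_multiple_k(X, [2, 4, 6, 8, 10], is_train=True)
    return labels, proba, labels_multi

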
def estimate_intrinsic_dimension(
        data,
        method='two_nn',  # method choices are {'two_nn', 'lid_mle'}
        neighborhood_constant=NEIGHBORHOOD_CONST,
        n_neighbors=None,
        metric='euclidean',
        metric_kwargs=None,
        approx_nearest_neighbors=True,
        n_jobs=1,
        low_memory=False,
        seed_rng=SEED_DEFAULT):
    """
    Wrapper function for estimating the intrinsic dimension of the data.

    :param data: data array of shape `(N, d)`, where `N` is the number of samples and `d` is the number of features.
    :param method: method string. Valid choices are 'two_nn' and 'lid_mle'.
    :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a function
                                  of the number of samples (data size). If `N` is the number of samples, then the
                                  number of neighbors is set to `N^neighborhood_constant`. It is recommended to set
                                  this value in the range 0.4 to 0.5.
    :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                        the `neighborhood_constant` is ignored. It is sufficient to specify either
                        `neighborhood_constant` or `n_neighbors`.
    :param metric: distance metric to use. Euclidean by default.
    :param metric_kwargs: optional keyword arguments for the distance metric specified as a dict.
    :param approx_nearest_neighbors: Set to True to use an approximate nearest neighbor method. Usually the right
                                     choice unless both the number of samples and the number of features are small.
    :param n_jobs: number of CPU cores to use.
    :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                       is likely to increase the running time.
    :param seed_rng: seed for the random number generator.

    :return: positive float value specifying the estimated intrinsic dimension.
    """
    # Build a KNN graph index
    index_knn = KNNIndex(data,
                         neighborhood_constant=neighborhood_constant,
                         n_neighbors=n_neighbors,
                         metric=metric,
                         metric_kwargs=metric_kwargs,
                         shared_nearest_neighbors=False,
                         approx_nearest_neighbors=approx_nearest_neighbors,
                         n_jobs=n_jobs,
                         low_memory=low_memory,
                         seed_rng=seed_rng)
    # Query the nearest neighbors of each point
    nn_indices, nn_distances = index_knn.query_self()

    method = method.lower()
    if method == 'two_nn':
        # Two nearest neighbors ID estimator
        id = id_two_nearest_neighbors(nn_distances)
    elif method == 'lid_mle':
        # Median of the local intrinsic dimension estimates around each point
        id = np.median(lid_mle_amsaleg(nn_distances))
    else:
        raise ValueError(
            "Invalid value '{}' specified for argument 'method'".format(
                method))

    return id
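

# A minimal usage sketch for the intrinsic dimension estimator above. Data lying close to a
# 2-dimensional subspace of a 20-dimensional ambient space should yield an estimate near 2;
# the synthetic data here is an illustrative assumption.
def _demo_intrinsic_dimension():
    rng = np.random.RandomState(0)
    data = np.dot(rng.randn(2000, 2), rng.randn(2, 20)) + 0.001 * rng.randn(2000, 20)
    id_two_nn = estimate_intrinsic_dimension(data, method='two_nn')
    id_lid = estimate_intrinsic_dimension(data, method='lid_mle')
    return id_two_nn, id_lid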