def __init__(self, D: np.ndarray, secondary_distance_type: str,
             metric: str = 'distance', classes: np.ndarray = None,
             vectors: np.ndarray = None):
    """Set up a hubness experiment.

    Validates the arguments, stores them on the instance and creates
    empty placeholders for results that are computed later.

    Parameters
    ----------
    D : ndarray
        Primary distance matrix; its shape is validated with
        ``io.check_distance_matrix_shape``.
    secondary_distance_type : str
        Must be one of the keys of ``SEC_DIST``.
    metric : str, optional (default: 'distance')
        Validated with ``io.check_valid_metric_parameter``.
    classes : ndarray, optional (default: None)
        Labels; when given, they are checked against `D` with
        ``io.check_distance_matrix_shape_fits_labels``.
    vectors : ndarray, optional (default: None)
        Vector data; when given, it is checked against `D` with
        ``io.check_distance_matrix_shape_fits_vectors`` and
        ``vectors.shape[1]`` is stored as the embedding dimension.

    Raises
    ------
    ValueError
        If `secondary_distance_type` is not a key of ``SEC_DIST``.
    """
    # All validation happens before any state is stored.
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    if secondary_distance_type not in SEC_DIST.keys():
        raise ValueError("Requested secondary distance type unknown.")
    if classes is not None:
        io.check_distance_matrix_shape_fits_labels(D, classes)
    if vectors is not None:
        io.check_distance_matrix_shape_fits_vectors(D, vectors)

    # Without vectors there is no embedding dimension to report.
    self.embedding_dim = None if vectors is None else vectors.shape[1]

    # Inputs, stored as given.
    self.original_distance = D
    self.secondary_distance_type = secondary_distance_type
    self.classes = classes
    self.vectors = vectors
    self.metric = metric
    self.n = D.shape[0]

    # Results, obtained later through other methods.
    self.secondary_distance = None
    self.hubness = dict()
    self.anti_hubs = dict()
    self.max_hub_k_occurence = dict()
    self.knn_accuracy = dict()
    self.gk_index = None
def nicdm(D:np.ndarray, k:int=7, metric:str='distance',
          test_ind:np.ndarray=None, n_jobs:int=1):
    """Rescale a distance matrix with the local scaling variant NICDM.

    Computes secondary distances with the non-iterative contextual
    dissimilarity measure (NICDM) [1]_. Each distance is scaled by the
    mean distance of both involved points to their `k` nearest
    (training) neighbors.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance matrix. It is copied, the caller's
        array is not modified.
    k : int, optional (default: 7)
        Neighborhood radius: number of nearest neighbors that are averaged.
    metric : {'distance'}, optional (default: 'distance')
        Only distance matrices are supported.
    test_ind : ndarray, optional (default: None)
        Indices of points to hold out as a test set. Held-out points are
        not used as neighbors when the local scale is computed.
        If None, all points are used.
    n_jobs : int, optional (default: 1)
        Number of processes for parallel computation.

        - ``1``: no multiprocessing
        - ``-1``: use all CPUs

    Returns
    -------
    D_nicdm : ndarray
        Secondary distance NICDM matrix.

    Raises
    ------
    NotImplementedError
        If ``metric == 'similarity'``.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    # Input validation
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    if metric == 'similarity':
        raise NotImplementedError("NICDM does not support similarity matrices "
                                  "at the moment.")
    kth = np.arange(k)
    exclude = np.inf

    if n_jobs == -1:
        n_jobs = cpu_count()
    use_pool = n_jobs > 1

    D = np.copy(D)
    n = D.shape[0]
    train_ind = (slice(0, n) if test_ind is None
                 else np.setdiff1d(np.arange(n), test_ind))
    # Self distances must never count as nearest neighbors.
    np.fill_diagonal(D, exclude)

    if not use_pool:
        nearest = np.partition(D[:, train_ind], kth=kth, axis=1)[:, :k]
        r = nearest.mean(axis=1)
        r_geom = _local_geomean(r)
        D_nicdm = np.zeros_like(D)
        for i in range(n):
            # Only the upper triangle is computed, one broadcast row at a time.
            upper = slice(i + 1, None)
            D_nicdm[i, upper] = \
                (r_geom * D[i, upper]) / np.sqrt(r[i] * r[upper])
        # Mirror the upper triangle; the diagonal stays zero.
        D_nicdm += D_nicdm.T
        return D_nicdm

    # Multiprocessing: the workers write into shared buffers.
    r_ctype = RawArray(ctypes.c_double, n)
    r = np.frombuffer(r_ctype, dtype=np.float64)
    with Pool(processes=n_jobs,
              initializer=_nicdm_load_shared_data,
              initargs=(D, train_ind, r, r_ctype)) as pool:
        calc_r = partial(_nicdm_calculate_r, kth=kth, k=k)
        for _ in pool.imap(func=calc_r, iterable=range(n)):
            pass  # r is handled within func
    r_geom = _local_geomean(r)

    D_nicdm_ctype = RawArray(ctypes.c_double, D.size)
    D_nicdm = np.frombuffer(D_nicdm_ctype,
                            dtype=np.float64).reshape(D.shape)
    with Pool(processes=n_jobs,
              initializer=_nicdm_load_shared_data,
              initargs=(D, train_ind, r, r_ctype,
                        D_nicdm, D_nicdm_ctype)) as pool:
        calc_sec_dist = partial(_nicdm_calculate_sec_dist,
                                r_geom=r_geom, n=n, metric=metric)
        for _ in pool.imap(func=calc_sec_dist, iterable=range(n)):
            pass  # results handled within func
    return D_nicdm
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_ind:np.ndarray=None, n_jobs:int=1):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into a new one using local
    scaling [1]_ with the given `k`-th nearest neighbor. Each pairwise
    value is rescaled with the distances ``r[i]`` and ``r[j]`` of both
    points to their respective `k`-th nearest (training) neighbor:
    ``1 - exp(-D[i, j]**2 / (r[i] * r[j]))`` for distances, and
    ``exp(-D[i, j]**2 / (r[i] * r[j]))`` for similarities.

    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.
        Sparse input is only accepted for ``metric='similarity'``.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        Similarity support is flagged as experimental (a warning is logged).

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

        Reset to 1 (with a warning) for sparse similarity matrices.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix. For sparse input the
        result is converted with ``tocsr()`` before it is returned.

    Raises
    ------
    NotImplementedError
        If `D` is sparse and ``metric`` is not ``'similarity'``.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = ConsoleLogging()
    # Checking input
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    sparse = issparse(D)
    n = D.shape[0]
    if n_jobs == -1:
        n_jobs = cpu_count()
    # Metric-dependent constants:
    #   kth            - partition position of the k-th nearest neighbor
    #   exclude        - value that removes self entries from the neighbor search
    #   self_tmp_value - diagonal placeholder while only the triu is filled
    #   self_value     - final diagonal value of the dense result
    if metric == 'similarity':
        kth = n - k
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
        if sparse and n_jobs != 1:
            log.warning("Parallel processing not implemented for sparse "
                        "matrices. Using single process instead.")
            n_jobs = 1
    else: # metric == 'distance':
        kth = k - 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if sparse:
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.")
    # Work on a copy, because the diagonal is overwritten below.
    # NOTE(review): for sparse `D` this uses np.copy rather than the matrix's
    # own copy method - confirm that this yields a usable sparse matrix.
    D = np.copy(D)
    if test_ind is None:
        train_ind = slice(0, n) #take all
    else:
        train_ind = np.setdiff1d(np.arange(n), test_ind)
    # Step 1: local scale r[i], the value at partition position `kth`
    # among the training columns of row i.
    if sparse:
        r = np.zeros(n)
        for i in range(n):
            # NOTE(review): `toarray()` on a row slice presumably returns a
            # 2-D array; confirm that indexing with `i`/`kth` below addresses
            # the intended elements.
            di = D[i, train_ind].toarray()
            di[i] = exclude
            r[i] = np.partition(di, kth=kth)[kth]
        D_ls = lil_matrix(D.shape)
        # Number of nonzero cells per row
        nnz = D.getnnz(axis=1)
    else:
        np.fill_diagonal(D, exclude)
        if n_jobs > 1:
            # `r` lives in a shared buffer that the workers fill in place.
            r_ctype = RawArray(ctypes.c_double, n)
            r = np.frombuffer(r_ctype, dtype=np.float64)
            with Pool(processes=n_jobs,
                      initializer=_ls_load_shared_data,
                      initargs=(D, train_ind, r, r_ctype)) as pool:
                for _ in pool.imap(func=partial(_ls_calculate_r, kth=kth),
                                   iterable=range(n)):
                    pass # results handled within func
        else:
            r = np.partition(D[:, train_ind], kth=kth)[:, kth]
    # Step 2: secondary distances.
    if sparse or n_jobs == 1:
        # NOTE(review): this rebinds `D_ls` and thereby discards the
        # lil_matrix created in the sparse branch above, although the sparse
        # return path below calls `.tocsr()` - confirm intended behavior.
        D_ls = np.zeros_like(D)
        for i in range(n):
            # vectorized inner loop: calc only triu part
            tmp = np.empty(n-i)
            tmp[0] = self_tmp_value
            if metric == 'similarity':
                if sparse and nnz[i] <= k: # Don't rescale if there are
                    tmp[1:] = np.nan       # too few neighbors in row
                else:
                    tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            else:
                tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
            D_ls[i, i:] = tmp
        # copy triu to tril -> symmetric matrix (diag=zeros)
        # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
        D_ls += D_ls.T
    else:
        # NOTE(review): `r_ctype` is only defined when n_jobs > 1; this branch
        # is also reached for dense input with e.g. n_jobs == 0 - confirm that
        # such values are never passed.
        D_ls_ctype = RawArray(ctypes.c_double, D.size)
        D_ls = np.frombuffer(D_ls_ctype, dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_ls_load_shared_data,
                  initargs=(D, train_ind, r, r_ctype,
                            D_ls, D_ls_ctype)) as pool:
            for _ in pool.imap(func=partial(_ls_calculate_sec_dist,
                                            n=n, metric=metric,
                                            self_tmp_value=self_tmp_value),
                               iterable=range(n)):
                pass # results handled within func
        # triu is copied to tril within func
    if sparse:
        # Rows with too few neighbors keep their original values.
        for i, nz in enumerate(nnz):
            if nz <= k: # too few neighbors
                D_ls[i, :] = D[i, :]
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
def centering(X: np.ndarray, metric: str = 'vector',
              test_set_mask: np.ndarray = None):
    """Perform centering, i.e. shift the origin to the data centroid.

    In vector mode the mean of each feature is calculated and subtracted
    from each point [1]_. In kernel mode the Gram matrix `X` is centered
    as ``H.dot(X).dot(H)`` with ``H = I - 1/n``. It must be checked
    upstream that the matrix really is a Gram matrix as described below!

    Parameters
    ----------
    X : ndarray
        - An ``(n x m)`` vector data matrix with ``n`` objects in an
          ``m``-dimensional feature space, or
        - an ``(n x n)`` matrix of inner products of form ``K = X(X.T)``,
          if `X` is an ``(n x m)`` matrix; and of form ``K = (X.T)X``,
          if `X` is an ``(m x n)`` matrix.

        NOTE: The type must be defined via parameter `metric`!

    metric : {'vector', 'inner'}, optional (default: 'vector')
        Define, whether `X` is vector data or a Gram matrix of inner
        product similarities. The values 'similarity', 'distance' and
        'inner_product' are accepted as aliases of 'inner' for
        compatibility reasons (the kernel mode was previously
        erroneously called 'distance').

    test_set_mask : ndarray, optional (default: None)
        Hold back data as a test set and compute the centroid from the
        remaining data (training set) only. Either an integer array of
        test indices or a boolean mask that is True for test points.
        Only supported in vector mode.

    Returns
    -------
    X_cent : ndarray
        Centered vectors with shape (n, m), if given vector data.
    K_cent : ndarray
        Centered inner product similarities with shape (n, n),
        if given a Gram matrix.

    Raises
    ------
    NotImplementedError
        If `test_set_mask` is given in kernel mode.
    ValueError
        If `metric` is not one of the accepted values.

    References
    ----------
    .. [1] Suzuki, I., Hara, K., Shimbo, M., Saerens, M., & Fukumizu, K.
           (2013). Centering similarity measures to reduce hubs. In
           Proceedings of the 2013 Conference on Empirical Methods in
           Natural Language Processing (pp 613–623).
           Retrieved from https://www.aclweb.org/anthology/D/D13/D13-1058.pdf
    """
    # Kernel based centering requires inner product similarities,
    # NOT distances; the legacy names are kept for compatibility.
    if metric in ('similarity', 'distance', 'inner', 'inner_product'):
        if test_set_mask is not None:
            raise NotImplementedError("Kernel based centering does not "
                                      "support train/test splits so far.")
        io.check_distance_matrix_shape(X)
        n = X.shape[0]
        H = np.identity(n) - np.ones((n, n)) / n
        # K = X.T.X must be provided upstream
        return H.dot(X).dot(H)
    if metric == 'vector':
        if test_set_mask is None:
            # center among all data
            return X - np.mean(X, axis=0)
        held_out = np.asarray(test_set_mask)
        if held_out.dtype == bool:
            # A boolean mask must be turned into indices first; otherwise
            # True/False would be misread as the indices 1 and 0.
            held_out = np.flatnonzero(held_out)
        # center among training data
        train_ind = np.setdiff1d(np.arange(X.shape[0]), held_out)
        return X - np.mean(X[train_ind], axis=0)
    raise ValueError("Parameter 'metric' must be 'inner' or 'vector'.")
def simhub(D: np.ndarray, y: np.ndarray, train_ind: np.ndarray = None,
           test_ind: np.ndarray = None, s: int = 50,
           return_distances: bool = True, vect_usage: int = 0):
    """Calculate dissimilarity based on hubness-aware SNN distances [1]_.

    Parameters
    ----------
    D : ndarray
        The ``n x m`` distance matrix, where ``n`` and ``m``
        are the dataset and sample size, respectively.
    y : ndarray or None
        Class labels. Required for supervised simhub (simhubIN + simhubPUR).
        If None, calculate unsupervised simhubIN as per equation (6) in [1]_.
    train_ind : ndarray, optional, default: None
        The index array that determines, to which data points the columns in
        `D` correspond. Not required, if `D` is a quadratic all-against-all
        distance matrix.
    test_ind : ndarray, optional, default: None
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.
    s : int, optional, default: 50
        Neighborhood size. Can be optimized as to minimize hubness.
        Must satisfy ``0 < s < m``.
    return_distances : bool, optional, default: True
        If True, return distances (1 - similarities).
        Otherwise return similarities.
    vect_usage : int, optional, default: 0
        If > 0, always use vectorization for the inner simhub loop.
        If < 0, always use nested loops. If == 0, vectorization is used
        if the sample size is small (``m < 2000``), because the vectorized
        variant allocates an ``m x m`` temporary per row.

    Returns
    -------
    D_shi : ndarray
        Secondary distance (simhub) matrix. If `test_ind` is given,
        only the rows of the test points are returned.

    Raises
    ------
    ValueError
        If `s` is not in ``[1, m-1]``.

    References
    ----------
    .. [1] Tomašev, N., Mladenić, D. (2012).
           Hubness-aware shared neighbor distances for high-dimensional
           $$k$$ -nearest neighbor classification.
           Knowledge and Information Systems, 39(1), 89–122.
           http://doi.org/10.1007/s10115-012-0607-5
    """
    if train_ind is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, train_ind)

    # Assuming distances in D
    self_value = 0.
    sort_order = 1
    exclude = np.inf
    distance = D.copy()
    n, m = distance.shape
    if not 0 < s < m:
        raise ValueError("Neighbor hood size s, must be [1, {}-1], but "
                         "was {}.".format(m, s))
    n_ind = range(n) if test_ind is None else test_ind

    # Exclude self distances
    if train_ind is None:
        np.fill_diagonal(distance, exclude)
    else:
        for j, sample in enumerate(train_ind):
            distance[sample, j] = exclude

    # find nearest neighbors for each point
    knn = np.zeros_like(distance, bool)
    for i in range(n):
        # TODO change to np.partition for PERF
        nn = np.argsort(distance[i, :])[::sort_order]
        knn[i, nn[:s]] = True
    del distance

    # Reverse nearest neighbor count
    N_s = knn[:m, :].sum(axis=0)
    if y is not None:
        # Set of class labels
        C = np.unique(y)
        # Class specific reverse nearest neighbors
        N_sc = np.zeros((C.size, m))
        for c_idx, c_val in enumerate(C):
            N_sc[c_idx, :] = np.sum(
                knn[:m, :] * (y == c_val).reshape(-1, 1), axis=0)
        # np.all replaces np.alltrue, which was removed in NumPy 2.0.
        assert np.all(N_sc.sum(axis=0) == N_s), \
            "N_s,c(x) don't sum up to N_s(x)"
        # Account for each point being the 0th nearest neighbor
        N_sc += 1
    # In any case: the same for N_s
    N_s += 1

    if y is not None:
        # non-homogeneity (inconsistency) in occurrence
        N_sc /= N_s
        HR_s = -np.sum(N_sc * np.log(N_sc), axis=0)
        # Information gain
        max_H_s = np.log(C.size)
        info_gain = max_H_s - HR_s
    else:
        # set a dummy value for unsupervised mode
        info_gain = 1
    # "occurrence informativeness"
    I_n = np.log(m / N_s)

    # simhub calculation
    D_shi = np.zeros_like(D)
    # Column j of D belongs to data point sample_rows[j]; its neighborhood
    # is therefore row sample_rows[j] of knn.
    if train_ind is None:
        sample_rows = range(m)
        train_ind = ...
    else:
        sample_rows = train_ind
    if vect_usage > 0 or (vect_usage == 0 and m < 2000):
        # using vectorization and broadcasting
        for i in n_ind:
            x = np.logical_and(knn[i, :], knn[train_ind, :])
            D_shi[i, :] = np.sum(x * I_n * info_gain, axis=1)
    else:
        # use non-vectorized loops; must address the same knn rows as the
        # vectorized variant above (previously row j was used directly,
        # which is only correct without a separate sample).
        for i in n_ind:
            for j in range(m):
                x = np.logical_and(knn[i, :], knn[sample_rows[j], :])
                D_shi[i, j] = np.sum(x * I_n * info_gain)
    del knn

    # Normalization to [0, 1] range
    if y is None:
        D_shi /= (s * np.log(m))
    else:
        D_shi /= (s * np.log(m) * max_H_s)

    # Convert to distances
    if return_distances:
        D_shi *= -1
        D_shi += 1
    else:
        self_value = 1

    if test_ind is None:
        # Ensure correct self distances and return sec. dist. matrix
        np.fill_diagonal(D_shi, self_value)
        return D_shi
    # only return test-train-distances (there are no self distances here)
    return D_shi[test_ind]
def simhubIN(D: np.ndarray, train_ind: np.ndarray = None,
             test_ind: np.ndarray = None, s: int = 50,
             return_distances: bool = True, n_jobs: int = 1):
    """Calculate dissimilarity based on hubness-aware SNN distances [1]_.

    Unsupervised variant (simhubIN): shared neighbors are weighted by
    their occurrence informativeness only.

    Parameters
    ----------
    D : ndarray
        The ``n x m`` distance matrix, where ``n`` and ``m``
        are the dataset and sample size, respectively.
    train_ind : ndarray, optional, default: None
        The index array that determines, to which data points the columns in
        `D` correspond. Not required, if `D` is a quadratic all-against-all
        distance matrix.
    test_ind : ndarray, optional, default: None
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.
    s : int, optional, default: 50
        Neighborhood size. Can be optimized as to minimize hubness.
    return_distances : bool, optional, default: True
        If True, return distances (1 - similarities).
        Otherwise return similarities.
    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_shi : ndarray
        Secondary distance (simhubIN) matrix. If `test_ind` is given,
        only the rows of the test points are returned.

    References
    ----------
    .. [1] Tomašev, N., & Mladenić, D. (2012).
           Hubness-aware shared neighbor distances for high-dimensional
           $$k$$ -nearest neighbor classification.
           Knowledge and Information Systems, 39(1), 89–122.
           http://doi.org/10.1007/s10115-012-0607-5
    """
    if train_ind is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, train_ind)

    # Assuming distances in D
    self_value = 0.
    sort_order = 1
    exclude = np.inf
    distance = D.copy()
    n, m = distance.shape
    n_ind = range(n) if test_ind is None else test_ind

    # Exclude self distances
    if train_ind is None:
        np.fill_diagonal(distance, exclude)
    else:
        for j, sample in enumerate(train_ind):
            distance[sample, j] = exclude

    if n_jobs == -1:
        n_jobs = cpu_count()

    # find nearest neighbors for each point
    if n_jobs > 1:
        # The neighborhood matrix lives in shared memory, filled by workers.
        knn_ctype = RawArray(ctypes.c_bool, D.size)
        knn = np.frombuffer(knn_ctype, dtype=bool).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_shi_init_knn,
                  initargs=(distance, knn)) as pool:
            for _ in pool.imap(func=partial(_shi_hood, s=s,
                                            sort_order=sort_order),
                               iterable=range(n)):
                pass
    else:
        knn = np.zeros_like(distance, bool)
        for i in range(n):
            # TODO change to np.partition for PERF
            nn = np.argsort(distance[i, :])[::sort_order]
            knn[i, nn[:s]] = True
    del distance

    # "Occurence informativeness"; every point counts as its own neighbor.
    occ_inf_knn = knn[:m, :].copy()
    np.fill_diagonal(occ_inf_knn, True)
    N_s = occ_inf_knn.sum(axis=0)
    I_n = np.log(m / N_s)
    del occ_inf_knn

    # simhub calculation
    # Column j of D belongs to data point sample_rows[j]; its neighborhood
    # is therefore row sample_rows[j] of knn.
    if train_ind is None:
        sample_rows = range(m)
        train_ind = ...
    else:
        sample_rows = train_ind
    if n_jobs > 1:
        D_shi_ctype = RawArray(ctypes.c_double, D.size)
        D_shi = np.frombuffer(D_shi_ctype,
                              dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_shi_init_simhub,
                  initargs=(knn, train_ind, I_n, D_shi)) as pool:
            if m < 2000:
                for _ in pool.imap(func=partial(_shi_simhub_vect, s=s),
                                   iterable=n_ind):
                    pass
            else:
                for _ in pool.imap(func=partial(_shi_simhub, s=s, m=m),
                                   iterable=n_ind):
                    pass
    else:
        D_shi = np.zeros_like(D)
        if m < 2000:
            # using vectorization and broadcasting
            for i in n_ind:
                x = np.logical_and(knn[i, :], knn[train_ind, :])
                D_shi[i, :] = np.sum(x * I_n, axis=1)
        else:
            # use non-vectorized loops; must address the same knn rows as
            # the vectorized variant above (previously row j was used
            # directly, which is only correct without a separate sample).
            for i in n_ind:
                for j in range(m):
                    x = np.logical_and(knn[i, :], knn[sample_rows[j], :])
                    D_shi[i, j] = np.sum(x * I_n)
    del knn

    # Normalization to [0, 1] range
    D_shi /= (s * np.log(m))

    # Convert to distances
    if return_distances:
        D_shi *= -1
        D_shi += 1
    else:
        self_value = 1

    if test_ind is None:
        # Ensure correct self distances and return sec. dist. matrix
        np.fill_diagonal(D_shi, self_value)
        return D_shi
    # only return test-train-distances (there are no self distances here)
    return D_shi[test_ind]
def shared_nearest_neighbors(D: np.ndarray, k: int = 10,
                             metric='distance', n_jobs: int = 1):
    """Transform a distance matrix using shared nearest neighbors [1]_.

    The SNN similarity of two objects is the overlap of their `k` nearest
    neighbor sets, divided by `k`. SNN approaches try to symmetrize
    nearest neighbor relations using only rank and not distance
    information [2]_.

    Parameters
    ----------
    D : np.ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
    k : int, optional (default: 10)
        Neighborhood radius: The `k` nearest neighbors are used to
        calculate SNN.
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether the matrix `D` is a distance or similarity matrix.
    n_jobs : int, optional, default: 1
        Number of processes for parallel computations.

        - `1`: Don't use multiprocessing.
        - `-1`: Use all CPUs

    Returns
    -------
    D_snn : ndarray
        Secondary distance SNN matrix (``1 - overlap / k`` for distances,
        ``overlap / k`` for similarities).

    References
    ----------
    .. [1] R. Jarvis and E. A. Patrick, “Clustering using a similarity measure
           based on shared near neighbors,” IEEE Transactions on Computers,
           vol. 22, pp. 1025–1034, 1973.

    .. [2] Flexer, A., & Schnitzer, D. (2013). Can Shared Nearest Neighbors
           Reduce Hubness in High-Dimensional Spaces? 2013 IEEE 13th
           International Conference on Data Mining Workshops, 460–467.
           http://doi.org/10.1109/ICDMW.2013.101
    """
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)
    n = D.shape[0]

    # Metric-dependent constants: final diagonal value, ranking direction,
    # value that hides self entries, and partition position.
    if metric == 'distance':
        self_value, sort_order, exclude, kth = 0., 1, np.inf, k
    elif metric == 'similarity':
        self_value, sort_order, exclude, kth = 1., -1, -np.inf, n - k

    distance = D.copy()
    np.fill_diagonal(distance, exclude)

    if n_jobs == -1:
        n_jobs = cpu_count()

    if n_jobs > 1:
        # Shared buffers, filled by the worker processes.
        knn_ctype = RawArray(ctypes.c_bool, D.size)
        knn = np.frombuffer(knn_ctype, dtype=bool).reshape(D.shape)
        D_snn_ctype = RawArray(ctypes.c_double, D.size)
        D_snn = np.frombuffer(D_snn_ctype,
                              dtype=np.float64).reshape(D.shape)
        with Pool(processes=n_jobs,
                  initializer=_snn_init,
                  initargs=(distance, knn, D_snn)) as pool:
            find_hood = partial(_snn_my_hood, k=k, kth=kth,
                                sort_order=sort_order)
            for _ in pool.imap(func=find_hood, iterable=range(n)):
                pass
            overlap_hoods = partial(_snn_our_hood, k=k, metric=metric)
            for _ in pool.imap(func=overlap_hoods, iterable=range(n)):
                pass
    else:
        # Mark the k nearest neighbors of every point.
        knn = np.zeros_like(distance, bool)
        for row in range(n):
            ranked = np.argpartition(distance[row, :],
                                     kth=kth)[::sort_order]
            knn[row, ranked[0:k]] = True
        # Neighborhood overlap, upper triangle only (using broadcasting).
        D_snn = np.zeros_like(distance)
        for i in range(n):
            later = slice(i + 1, n)
            shared = np.logical_and(knn[i, :], knn[later, :]).sum(axis=1)
            if metric == 'distance':
                D_snn[i, later] = 1. - shared / k
            else:  # metric == 'similarity'
                D_snn[i, later] = shared / k

    # Mirror the upper triangle, then set the self values.
    D_snn += D_snn.T
    np.fill_diagonal(D_snn, self_value)
    return D_snn
def test_check_shape(self):
    """A non-square (2 x 3) matrix must be rejected with a TypeError."""
    non_square = np.empty((2, 3))
    with self.assertRaises(TypeError):
        io.check_distance_matrix_shape(non_square)
def r_precision(S:np.ndarray, y:np.ndarray, metric:str='distance',
                average:str='weighted', return_y_pred:int=0,
                verbose:int=0, n_jobs:int=1) -> dict:
    """Calculate R-Precision (recall at R-th position).

    Parameters
    ----------
    S : ndarray or CSR matrix
        Distance (similarity) matrix. Currently only sparse similarity
        matrices are supported.
    y : ndarray
        Target (ground truth) labels
    metric : 'distance' or 'similarity', optional, default: 'distance'
        Define, whether `S` is a distance or similarity matrix.
        NOTE: only ``'similarity'`` is implemented, so the default value
        raises ``NotImplementedError``.
    average : 'weighted', 'macro' or None, optional, default: 'weighted'
        Ignored. Weighted and macro precisions are returned.
    return_y_pred : int, optional, default: 0
        If > 0, return the labels of the `return_y_pred` nearest neighbors
    verbose : int, optional, default: 0
        Increasing level of output.
    n_jobs : int, optional, default: 1
        Number of parallel processes to use.

    Returns
    -------
    r_precision : dictionary with following keys:
        macro : float
            Macro R-Precision.
        weighted : float
            Weighted R-Precision.
        per_item : ndarray
            R-Precision at the object.
        relevant_items : ndarray
            Relevant items per class.
        y_true : ndarray
            Target labels (req. for weighting).
        y_pred : ndarray
            Labels of some k-nearest neighbors

    Raises
    ------
    NotImplementedError
        If `S` is not a sparse similarity matrix.
    """
    io.check_distance_matrix_shape(S)
    io.check_distance_matrix_shape_fits_labels(S, y)
    io.check_valid_metric_parameter(metric)
    log = ConsoleLogging()
    n, _ = S.shape
    S_is_sparse = issparse(S)
    if metric != 'similarity' or not S_is_sparse:
        raise NotImplementedError("Only sparse similarity matrices so far.")

    # Map labels to 0..n(labels)-1
    le = LabelEncoder()
    # Add int.min as an extra label for misclassifications. It is spelled
    # out explicitly, because casting NaN to int is platform dependent.
    incorr_orig = np.array([np.iinfo(int).min], dtype=int)
    le.fit(np.append(y, incorr_orig))
    y = le.transform(y)
    incorrect = le.transform(incorr_orig)
    # Number of relevant items, i.e. number of each label
    relevant_items = np.bincount(y) - 1  # one less for self class
    # R-Precision for each item (np.float was removed in NumPy 1.24)
    r_prec = np.zeros(n, dtype=float)

    # Classify each point in test set
    if verbose:
        log.message("Creating shared memory data.")
    n_random_pred = mp.Value(ctypes.c_int)
    n_random_pred.value = 0
    if verbose and log:
        log.message("Spawning processes for prediction.")
    y_pred = np.zeros((n, return_y_pred), dtype=float)
    kwargs = {'y_pred' : return_y_pred,
              'incorrect' : incorrect}
    with mp.Pool(processes=n_jobs,
                 initializer=_load_shared_csr,
                 initargs=(S, y, n_random_pred, relevant_items)) as pool:
        for i, r in enumerate(
                pool.imap(func=partial(_r_prec_worker, **kwargs),
                          iterable=range(n),
                          chunksize=int(1e2))):
            if verbose and ((i+1)%int(1e7 / 10**verbose) == 0 or i == n-1):
                log.message("Classification: {} of {} on {}.".format(
                    i+1, n, mp.current_process().name), flush=True)
            try:
                # Worker returned (precision, neighbor labels)
                r_prec[i] = r[0]
                y_pred[i, :] = r[1]
            except (TypeError, IndexError):
                # Worker returned the bare precision value, which cannot
                # be subscripted.
                r_prec[i] = r
    pool.join()

    if verbose and log:
        log.message("Retrieving nearest neighbors.")
    # Work-around for new scikit-learn requirement of 1D arrays for LabelEncoder
    y_pred = np.asarray([le.inverse_transform(col)
                         for col in y_pred.T.astype(int)]).T
    if verbose and log:
        log.message("Finishing.")
    if n_random_pred.value:
        log.warning(("{} queries were classified randomly, because all "
                     "distances were non-finite numbers or there were no other "
                     "objects in the same class.").format(n_random_pred.value))
    return_dict = {'macro' : r_prec.mean(),
                   'weighted' : np.average(r_prec, weights=relevant_items[y]),
                   'per_item' : r_prec,
                   'relevant_items' : relevant_items,
                   'y_true' : y,
                   'y_pred' : y_pred}
    return return_dict
def predict(D:np.ndarray, target:np.ndarray, k=5,
            metric:str='distance', test_ind:np.ndarray=None, verbose:int=0,
            sample_idx=None, return_cmat=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth) or
        ``n x c`` in case of ``c`` binarized multilabels

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification. For each value in `k`,
        one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to
        perform multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : array_like, optional (default: None)
        If given, `D` is validated with ``io.check_sample_shape_fits`` and
        its columns are treated as the sample points indexed by
        `sample_idx`. Cannot be combined with `test_ind`.

    return_cmat : bool, optional, default: True
        If False, only return the predictions `y_pred`.
        Otherwise also return the confusion matrices.

    Returns
    -------
    y_pred : ndarray (shape=(n_k, n, c), dtype=int)
        Predicted class labels (`n_k`... number of items in parameter `k`)

        HINT: Referring to the above example...
        ... ``y_pred[0]`` gives the predictions of the ``k=1`` experiment.
    cmat : ndarray (shape=(n_k x c x n_t x n_t), dtype=int)
        Confusion matrix (``n_t`` number of unique items in parameter target)

        HINT: ... ``cmat[2, 0, :, :]`` gives the confusion matrix of
        the first class in the ``k=20`` experiment in the following order:
            TN FP
            FN TP

    Raises
    ------
    NotImplementedError
        If both `test_ind` and `sample_idx` are given.
    """
    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    if not issparse(D):
        D = D.copy()
    target = target.astype(int)
    if target.ndim == 1:
        target = target[:, np.newaxis]

    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)  # dummy
        train_set_ind = n        # dummy
    else:
        # The parameter is called `test_ind`; the former code read an
        # undefined `test_set_ind` here and failed for every test set.
        test_set_ind = np.asarray(test_ind)
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples: all rows of D that are not part of
        # the test set (not just the first `n` rows).
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise

    cl = np.sort(np.unique(target))
    cmat = np.zeros((k_length, target.shape[1], len(cl), len(cl)), dtype=int)
    y_pred = np.zeros((k_length, *target.shape), dtype=int)

    classes = target.copy()
    for idx, cur_class in enumerate(np.array(cl).ravel()):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = idx
    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        # Exclude self distances of the sample points
        for j, sample in enumerate(sample_idx):
            D[sample, j] = d_self
    cl = range(len(cl))

    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)

        if issparse(D):
            row = D.getrow(i)
            ind = row.nonzero()[1]
            row = row.toarray().ravel()
        else:
            row = D[i, :]
        if sample_idx is None:
            row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if sample_idx is None:
            rp = train_set_ind
        else:
            if issparse(D):
                rp = ind
            else:
                rp = np.arange(len(sample_idx))
        rp = np.random.permutation(rp)
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        d2idx = d2idx[~np.isnan(d2[d2idx])]  # filter NaN values
        idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                log.warning("Query was classified randomly, because all "
                            "distances were non-finite numbers.")
            for l in range(target.shape[1]):
                l_classes = classes[:, l]
                if sample_idx is None:
                    nn_class = l_classes[idx[0:k[j]]][finite_val]
                else:
                    l_sample_classes = sample_classes[:, l]
                    nn_class = l_sample_classes[idx[0:k[j]]][finite_val]
                cs = np.bincount(nn_class.astype(int))
                max_cs = np.where(cs == np.max(cs))[0]
                seed_class = classes[i, l]
                # "tie": use nearest neighbor
                if len(max_cs) > 1:
                    y_pred[j, i, l] = nn_class[0]
                    cmat[j, l, seed_class, nn_class[0]] += 1
                # majority vote
                else:
                    y_pred[j, i, l] = cl[max_cs[0]]
                    cmat[j, l, seed_class, cl[max_cs[0]]] += 1

    if verbose:
        log.message("Finished k-NN experiment.")

    if return_cmat:
        return y_pred, cmat
    return y_pred
def score(D:np.ndarray, target:np.ndarray, k=5,
          metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0,
          sample_idx=None, filter_self=True):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter `test_set_ind`).
    Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
        Sparse matrices are detected via ``issparse`` and handled separately.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.

        HINT: Providing more than one value for `k` is a cheap means to perform
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit
          model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    sample_idx : array_like, optional (default: None)
        Indices of the sample points the columns of `D` refer to, i.e.
        column ``j`` of `D` holds the distances to point ``sample_idx[j]``.
        Neighbor labels are then looked up among the sample points only.
        Cannot be combined with `test_set_ind`.

    filter_self : bool, optional, default: True
        Remove self similarities from sparse ``D``.
        This assumes that the highest similarity per row is the self
        similarity.

        NOTE: Quadratic dense matrices are always filtered for self
        distances/similarities, even if `filter_self` is set to `False`.

    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)

        HINT: Referring to the above example...
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.
    corr : ndarray (shape=(n_k x n), dtype=int)
        Raw vectors of correctly classified items

        HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment.
    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int)
        Confusion matrix (``n_t`` number of unique items in parameter target)

        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of
        the ``k=20`` experiment.

        NOTE: For sparse `D` the matrix has two additional rows/columns;
        index ``n_t`` is used as label for objects without any neighbor.

    Raises
    ------
    NotImplementedError
        If both `test_set_ind` and `sample_idx` are provided.
    """

    # Check input sanity
    log = ConsoleLogging()
    if sample_idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, sample_idx)
    io.check_distance_matrix_shape_fits_labels(D, target)
    io.check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1

    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)
    D_is_sparse = issparse(D)

    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)    # dummy
        train_set_ind = n   # dummy (np.random.permutation(n) -> all points)
    else:
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples: all points of D that are not held
        # out. (Must be based on D.shape[0], NOT on the test set size.)
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)
        if sample_idx is not None:
            raise NotImplementedError("Sample k-NN does not support train/"
                                      "test splits at the moment.")
    # Number of k-NN parameters
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e

    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))

    cl = np.sort(np.unique(target))
    if D_is_sparse:
        # Add a label for unknown class (object w/o nonzero sim to any others)
        cl = np.append(cl, cl.max()+1)
        n_classes = len(cl) + 1
    else:
        n_classes = len(cl)
    cmat = np.zeros((k_length, n_classes, n_classes))

    classes = target.copy()
    for idx, cur_class in enumerate(cl):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = idx
    if sample_idx is not None:
        sample_classes = classes[sample_idx]
        # Exclude each sample point from its own neighborhood
        for col, sample in enumerate(sample_idx):
            D[sample, col] = d_self
    cl = range(len(cl))

    # Number of randomly classified queries per k value
    rnd_classif = np.zeros(k_length)
    # Classify each point in test set
    for i in test_set_ind:
        if verbose and ((i+1)%1000==0 or i+1==n):
            log.message("Prediction: {} of {}.".format(i+1, n), flush=True)
        seed_class = classes[i]

        if D_is_sparse:
            row = D.getrow(i)
        else:
            row = D[i, :]
            if sample_idx is None:
                row[i] = d_self

        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        if sample_idx is None:
            rp = train_set_ind
        else:
            rp = np.arange(len(sample_idx))
        if D_is_sparse:
            nnz = row.nnz
            rp = np.random.permutation(nnz)
            d2 = row.data[rp]
            # Partition for each k value
            kth = nnz - k - 1
            # sort the two highest similarities to end
            kth = np.append(kth, [nnz-2, nnz-1])
            # Clip negative indices (nnz < k)
            np.clip(kth, a_min=0, a_max=nnz-1, out=kth)
            # Remove duplicate k values and sort
            kth = np.unique(kth)
            d2idx = np.argpartition(d2, kth=kth)
            d2idx = d2idx[~np.isnan(d2[d2idx])][::-1]
            idx = row.nonzero()[1][rp[d2idx]]
            if filter_self:
                # The highest similarity is assumed to be the self similarity
                idx = idx[1:]
        else:
            rp = np.random.permutation(rp)
            d2 = row[rp]
            d2idx = np.argsort(d2, axis=0)[::sort_order]
            d2idx = d2idx[~np.isnan(d2[d2idx])]      # filter NaN values
            idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            # Make sure no inf/-inf/nan values are used for classification
            if D_is_sparse:
                finite_val = np.isfinite(row[0, idx[0:k[j]]].toarray().ravel())
            else:
                finite_val = np.isfinite(row[idx[0:k[j]]])
            # However, if no values are finite, classify randomly
            if finite_val.sum() == 0:
                idx = np.random.permutation(idx)
                finite_val = np.ones_like(finite_val)
                rnd_classif[j] += 1
            if sample_idx is None:
                nn_class = classes[idx[0:k[j]]][finite_val]
            else:
                nn_class = sample_classes[idx[0:k[j]]][finite_val]
            cs = np.bincount(nn_class.astype(int))
            if cs.size > 0:
                max_cs = np.where(cs == np.max(cs))[0]
            else:
                max_cs = np.array([len(cl) - 1]) # misclassification label

            # "tie": use nearest neighbor
            if len(max_cs) > 1:
                if seed_class == nn_class[0]:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, nn_class[0]] += 1
            # majority vote
            else:
                if cl[max_cs[0]] == seed_class:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, cl[max_cs[0]]] += 1

    if np.any(rnd_classif):
        for x in rnd_classif:
            log.warning(("{} queries were classified randomly, because all "
                         "distances were non-finite numbers.").format(x))
    if verbose:
        log.message("Finished k-NN experiment.")
    return acc, corr, cmat
def mutual_proximity_gammai(D: np.ndarray, metric: str = 'distance',
                            min_nnz: int = 30, test_set_ind: np.ndarray = None,
                            verbose: int = 0):
    """Rescale a distance/similarity matrix with Mutual Proximity (Gammai).

    Mutual Proximity (MP) [1]_ in the variant that models the distances of
    each object with an independent Gamma distribution (FAST).
    The rescaled matrix is expected to show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

        NOTE: Sparse `D` is handed over to a dedicated sparse
        implementation; zeros are interpreted as missing values there,
        so results may differ from using a dense version.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

    min_nnz : int, optional, default: 30
        Minimum number of values that must be present in both rows ``i``
        and ``j`` for MP to be calculated between them.
        Only forwarded to the sparse implementation; not used for dense `D`.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Estimate the distribution parameters from all rows
        - ndarray : Rows indexed in this array are excluded from the
          parameter estimation.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gammai matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    n_points = D.shape[0]
    logger = ConsoleLogging()

    # Validate the arguments
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)

    is_similarity = metric == 'similarity'
    diag_value = 1 if is_similarity else 0

    # Rows used for estimating the per-object distribution parameters
    if test_set_ind is not None:
        train_rows = np.setdiff1d(np.arange(n_points), test_set_ind)
    else:
        train_rows = slice(0, n_points)

    if verbose:
        logger.message('Mutual proximity Gammai rescaling started.',
                       flush=True)

    # Work on a copy, the diagonal is overwritten below
    dist = D.copy()
    if issparse(dist):
        return _mutual_proximity_gammai_sparse(dist, min_nnz, test_set_ind,
                                               verbose, logger)

    # Self distances must not influence the parameter estimation
    np.fill_diagonal(dist, np.nan)
    train_part = dist[train_rows]
    mean = np.nanmean(train_part, 0)
    variance = np.nanvar(train_part, 0, ddof=1)
    # Guard against division by zero below
    variance[variance == 0] = 1e-7
    gam_a = (mean**2) / variance
    gam_b = variance / mean

    rescaled = np.zeros_like(dist)

    # Only the upper triangle is computed, row by row
    for row in range(n_points):
        done = row + 1
        if verbose and (done % 1000 == 0 or done == n_points):
            logger.message("MP_gammai: {} of {}".format(done, n_points),
                           flush=True)
        rest = slice(done, n_points)

        cdf_row = _local_gamcdf(dist[row, rest], gam_a[row], gam_b[row])
        cdf_col = _local_gamcdf(dist[rest, row], gam_a[rest], gam_b[rest])
        if is_similarity:
            rescaled[row, rest] = (cdf_row * cdf_col).ravel()
        else:
            surv_row = 1 - cdf_row
            surv_col = 1 - cdf_col
            rescaled[row, rest] = (1 - surv_row * surv_col).ravel()

    # Copy the upper triangle to the lower one, then fix the diagonal
    rescaled += rescaled.T
    np.fill_diagonal(rescaled, diag_value)

    return rescaled
def mutual_proximity_gaussi(
        D: np.ndarray,
        metric: str = 'distance',
        sample_size: int = 0,
        min_nnz: int = 30,
        test_set_ind: np.ndarray = None,
        verbose: int = 0,
        idx: np.ndarray = None,
):
    """Transform distances with Mutual Proximity (indep. normal distributions).

    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gaussi
    variant assumes independent normal distributions (FAST).
    The resulting second. distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

        NOTE: In case of sparse `D`, zeros are interpreted as missing values
        and ignored during calculations. Thus, results may differ
        from using a dense version.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: In case of sparse `D`, only 'similarity' is supported.

    sample_size : int, optional (default: 0)
        Define sample size from which Gauss parameters are estimated.
        Use all data when set to ``0``. If larger than the number of
        available training rows, all training rows are used.

        Ignored in case of SampleMP (i.e. if provided `idx`).

    min_nnz : int, optional, default: 30
        Calculate MP between two objects `i` and `j`, iff at least ``min_nnz``
        values are present in both row ``i`` and ``j``. Otherwise, return the
        original similarity. Ignored, if `metric` is 'distance'.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

        Ignored in case of SampleMP (i.e. if provided `idx`).

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    idx : ndarray, optional (default: None)
        The index array that determines to which data points the columns in
        `D` correspond. Only required for SampleMP.

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gaussi matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    log = ConsoleLogging()

    # Checking input
    if idx is None:
        io.check_distance_matrix_shape(D)
    else:
        io.check_sample_shape_fits(D, idx)
    io.check_valid_metric_parameter(metric)
    n = D.shape[0]
    s = D.shape[1]
    if metric == 'similarity':
        self_value = 1
    else: # metric == 'distance':
        self_value = 0
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP Gaussi
    if verbose:
        log.message('Mutual Proximity Gaussi rescaling started.', flush=True)
    D = D.copy()

    if issparse(D):
        return _mutual_proximity_gaussi_sparse(D, sample_size, min_nnz,
                                               test_set_ind, verbose, log)

    # ignore self dist/sim for parameter estimation
    if idx is None:
        np.fill_diagonal(D, np.nan)
    else:
        for j, i in enumerate(idx):
            D[i, j] = np.nan

    # Calculate mean and std
    if idx is None:
        if sample_size == 0:
            mu = np.nanmean(D[train_set_ind], 0)
            sd = np.nanstd(D[train_set_ind], 0, ddof=0)
        else:
            # Draw a random subset of the training rows.
            # np.random.shuffle() cannot be used here: it shuffles in place,
            # returns None, and does not accept the slice that is used
            # when no test set is given.
            candidates = np.arange(n)[train_set_ind]
            samples = np.random.permutation(candidates)[0:sample_size]
            mu = np.nanmean(D[samples], 0)
            sd = np.nanstd(D[samples], 0, ddof=0)
    else:
        mu = np.nanmean(D, 1)
        sd = np.nanstd(D, 1, ddof=0)
    # Avoid downstream div/0 errors
    sd[sd == 0] = 1e-7

    # set self dist/sim back to self_value to avoid scipy warnings
    if idx is None:
        np.fill_diagonal(D, self_value)
    else:
        for j, i in enumerate(idx):
            D[i, j] = self_value

    # MP Gaussi
    D_mp = np.zeros_like(D)
    for i in range(n):
        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n):
            log.message("MP_gaussi: {} of {}.".format(i + 1, n), flush=True)
        if idx is None:
            # Quadratic matrix: only the upper triangle is calculated
            j = slice(i + 1, n)
            j_mom = j
        else:
            # SampleMP: all columns; moments belong to the sample points
            j = slice(0, s)
            j_mom = idx[j]

        if metric == 'similarity':
            p1 = norm.cdf(D[i, j], mu[i], sd[i])
            p2 = norm.cdf(D[i, j], mu[j_mom], sd[j_mom])
            D_mp[i, j] = (p1 * p2).ravel()
        else:
            # sf(.) := 1 - cdf(.)
            p1 = norm.sf(D[i, j], mu[i], sd[i])
            p2 = norm.sf(D[i, j], mu[j_mom], sd[j_mom])
            D_mp[i, j] = (1 - p1 * p2).ravel()

    if idx is None:
        # Mirror the upper triangle and set correct self dist/sim
        D_mp += D_mp.T
        np.fill_diagonal(D_mp, self_value)
    else:
        # Ensure correct self distances
        for j, sample in enumerate(idx):
            D_mp[sample, j] = self_value

    return D_mp
def _mutual_proximity_empiric_full(D: np.ndarray, metric: str = 'distance',
                                   test_set_ind: np.ndarray = None,
                                   min_nnz: int = 0, verbose: int = 0,
                                   n_jobs=None):
    """Rescale a distance/similarity matrix with Mutual Proximity (empiric).

    Mutual Proximity (MP) [1]_ based on the empiric data distribution
    (EXACT, rather SLOW). The rescaled matrix is expected to show lower
    hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

        NOTE: Sparse `D` is handed over to a dedicated sparse
        implementation and is only accepted for `metric` 'similarity'.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

    test_set_ind : ndarray, optional (default: None)
        Train/test splits are not implemented. The argument is only
        inspected in order to reject such requests (and is forwarded to
        the sparse implementation).

    min_nnz : int, optional, default: 0
        Only forwarded to the sparse implementation; not used for dense `D`.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : optional (default: None)
        Only forwarded to the sparse implementation.

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix.

    Raises
    ------
    ValueError
        If `D` is sparse and `metric` is not 'similarity'.
    NotImplementedError
        If the `test_set_ind` check detects a requested train/test split.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    n_points = D.shape[0]
    logger = ConsoleLogging()

    # Validate the arguments
    io.check_distance_matrix_shape(D)
    io.check_valid_metric_parameter(metric)

    is_similarity = metric == 'similarity'
    sparse_input = issparse(D)
    if is_similarity:
        diag_final, diag_excluded = 1, np.inf
    else:
        diag_final, diag_excluded = 0, -np.inf
        if sparse_input:
            raise ValueError("MP sparse only supports similarity matrices.")

    # NOTE(review): `~` is a bitwise NOT for integer index arrays, so this
    # check presumably expects a boolean mask - confirm intended input type.
    if test_set_ind is not None and not np.all(~test_set_ind):
        raise NotImplementedError("MP empiric does not yet support train/"
                                  "test splits.")

    if sparse_input:
        return _mutual_proximity_empiric_sparse(D, test_set_ind, min_nnz,
                                                verbose, logger, n_jobs)

    # Work on a copy; the diagonal gets a value that never counts
    # in the comparisons below (NOT done for sparse matrices!)
    dist = D.copy()
    np.fill_diagonal(dist, diag_excluded)

    rescaled = np.zeros_like(dist)

    # Only the upper triangle is computed, row by row
    n_rows = n_points - 1
    for row in range(n_rows):
        if verbose and ((row + 1) % 1000 == 0 or row == n_points - 2):
            logger.message("MP_empiric: {} of {}.".format(row + 1, n_rows),
                           flush=True)
        start = row + 1

        own = dist[row, :][np.newaxis, :]
        others = dist[start:n_points, :]
        pair = dist[start:n_points, row][:, np.newaxis]

        if is_similarity:
            counts = np.sum((own <= pair) & (others <= pair), 1)
            rescaled[row, start:] = counts / n_points
        else:
            counts = np.sum((own > pair) & (others > pair), 1)
            rescaled[row, start:] = 1 - (counts / n_points)

    # Copy the upper triangle to the lower one, then fix the diagonal
    rescaled += rescaled.T
    np.fill_diagonal(rescaled, diag_final)

    return rescaled