def __init__(self, D:np.ndarray, secondary_distance_type:str,
             metric:str='distance', classes:np.ndarray=None,
             vectors:np.ndarray=None):
    """Set up a hubness experiment on distance matrix `D`.

    Validates all inputs, stores them, and initializes the result
    attributes that are filled in later by the analysis functions.
    """
    # Input validation (each check raises on malformed input)
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if secondary_distance_type not in SEC_DIST.keys():
        raise ValueError("Requested secondary distance type unknown.")
    if classes is not None:
        IO._check_distance_matrix_shape_fits_labels(D, classes)
    if vectors is not None:  # got vectors
        IO._check_distance_matrix_shape_fits_vectors(D, vectors)
        self.embedding_dim = vectors.shape[1]
    else:
        self.embedding_dim = None
    # Store the experiment's inputs
    self.original_distance = D
    self.secondary_distance_type = secondary_distance_type
    self.metric = metric
    self.classes = classes
    self.vectors = vectors
    self.n = D.shape[0]
    # Results, computed later by the experiment functions
    self.secondary_distance = None
    self.hubness = {}
    self.anti_hubs = {}
    self.max_hub_k_occurence = {}
    self.knn_accuracy = {}
    self.gk_index = None
def load_dexter():
    """Load the dexter data set.

    .. note:: Deprecated in hub-toolbox 2.3
              Will be removed in hub-toolbox 3.0.
              Please use IO.load_dexter() instead.
    """
    # Kept only for backward compatibility: delegate to the canonical
    # implementation in the IO module.
    return IO.load_dexter()
def __init__(self, D:np.ndarray=None, classes:np.ndarray=None,
             vectors:np.ndarray=None, metric:str='distance'):
    """Set up a quick hubness analysis.

    Parameters
    ----------
    D : ndarray, optional (default: None)
        The n x n symmetric distance (similarity) matrix.
        Default: load example dataset (dexter).

    classes : ndarray, optional (default: None)
        The 1 x n class labels. Required for k-NN, GK.

    vectors : ndarray, optional (default: None)
        The m x n vector data. Required for IntrDim estimation.

    metric : {'distance', 'similarity'}
        Define whether `D` is a distance or similarity matrix.
    """
    self.has_class_data = False
    self.has_vector_data = False
    if D is None:
        # No data given: fall back to the bundled DEXTER example set.
        print('\n'
              'NO PARAMETERS GIVEN! Loading & evaluating DEXTER data set.'
              '\n'
              'DEXTER is a text classification problem in a bag-of-word \n'
              'representation. This is a two-class classification problem\n'
              'with sparse continuous input variables. \n'
              'This dataset is one of five datasets of the NIPS 2003\n'
              'feature selection challenge.\n'
              'http://archive.ics.uci.edu/ml/datasets/Dexter\n')
        self.D, self.classes, self.vectors = IO.load_dexter()
        self.has_class_data = True
        self.has_vector_data = True
        self.metric = 'distance'
    else:
        # Copy data and ensure correct type (not int16 etc.)
        self.D = np.copy(D).astype(np.float64)
        if classes is not None:
            self.classes = np.copy(classes).astype(np.float64)
            self.has_class_data = True
        else:
            self.classes = None
        if vectors is not None:
            self.vectors = np.copy(vectors).astype(np.float64)
            self.has_vector_data = True
        else:
            self.vectors = None
        self.metric = metric
    self.n = len(self.D)
    self.experiments = []
def __init__(self, D, k:int=5, isSimilarityMatrix:bool=False):
    """Set up hubness calculation for matrix `D` with neighborhood size `k`."""
    self.log = Logging.ConsoleLogging()
    # Memory-mapped input is used as-is; everything else goes through the
    # IO helper (read-only).
    if isinstance(D, np.memmap):
        self.D = D
    else:
        self.D = IO.copy_D_or_load_memmap(D, writeable=False)
    self.k = k
    if isSimilarityMatrix:
        # descending sort: interested in highest similarity
        self.d_self, self.sort_order = -np.inf, -1
    else:
        # ascending sort: interested in smallest distance
        self.d_self, self.sort_order = np.inf, 1
    np.random.seed()
def __init__(self, D, isSimilarityMatrix=False):
    """Deprecated Mutual Proximity wrapper class.

    .. note:: Deprecated in hub-toolbox 2.3
              Class will be removed in hub-toolbox 3.0.
              Please use static functions instead.
    """
    print("DEPRECATED: Please use the appropriate MutualProximity."
          "mutual_proximity_DISTRIBUTIONTYPE() function instead.",
          file=sys.stderr)
    self.D = IO.copy_D_or_load_memmap(D, writeable=True)
    self.log = Logging.ConsoleLogging()
    self.isSimilarityMatrix = isSimilarityMatrix
    # self similarity is 1, self distance is 0
    self.self_value = 1 if isSimilarityMatrix else 0
def __init__(self, D, isSimilarityMatrix=False, missing_values=None, tmp='/tmp/'):
    """Deprecated parallel Mutual Proximity wrapper class.

    .. note:: Deprecated in hub-toolbox 2.3
              Class will be removed in hub-toolbox 3.0.
              Please use static functions instead.
    """
    print("DEPRECATED: Please use the appropriate MutualProximity_parallel."
          "mutual_proximity_DISTRIBUTIONTYPE() function instead.",
          file=sys.stderr)
    self.D = IO.copy_D_or_load_memmap(D, writeable=True)
    self.log = Logging.ConsoleLogging()
    self.isSimilarityMatrix = isSimilarityMatrix
    # self similarity is 1, self distance is 0
    self.self_value = 1 if isSimilarityMatrix else 0
    self.tmp = tmp
    # Missing-value marker: explicit value wins; otherwise sparse zeros
    # count as missing, dense matrices have no missing values.
    if missing_values is not None:
        self.mv = missing_values
    elif issparse(D):
        self.mv = 0
    else:
        self.mv = None
def test_check_dist_vs_vectors(self):
    """A vector count mismatching `D` must raise a TypeError."""
    # Build the fixtures OUTSIDE the context manager: only the call under
    # test may run inside assertRaises, otherwise an exception raised
    # during fixture construction would make the test pass spuriously.
    D = np.zeros((5, 5))
    vectors = np.zeros((4, 5))
    with self.assertRaises(TypeError):
        IO._check_distance_matrix_shape_fits_vectors(D, vectors)
def test_check_dist_vs_classes(self):
    """A label count mismatching `D` must raise a TypeError."""
    # Fixtures outside the context manager, so only the checked call can
    # raise inside assertRaises (prevents spurious passes).
    D = np.empty((5, 5))
    classes = np.empty(4)
    with self.assertRaises(TypeError):
        IO._check_distance_matrix_shape_fits_labels(D, classes)
def test_check_shape(self):
    """A non-square matrix must raise a TypeError."""
    # Fixture outside the context manager, so only the checked call can
    # raise inside assertRaises (prevents spurious passes).
    d = np.empty((2, 3))
    with self.assertRaises(TypeError):
        IO._check_distance_matrix_shape(d)
def score(D:np.ndarray, target:np.ndarray, k=5,
          metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0):
    """Perform `k`-nearest neighbor classification.

    Use the ``n x n`` symmetric distance matrix `D` and target class
    labels `target` to perform a `k`-NN experiment (leave-one-out
    cross-validation or evaluation of test set; see parameter
    `test_set_ind`).  Ties are broken by the nearest neighbor.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).

    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.
        HINT: Providing more than one value for `k` is a cheap means to
        perform multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set.
          Fit model to remaining data. Evaluate model on test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)
        HINT: Referring to the above example... ``acc[0]`` gives the
        accuracy of the ``k=1`` experiment.

    corr : ndarray (shape=(n_k x n), dtype=int)
        Raw vectors of correctly classified items
        HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment.

    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int)
        Confusion matrix (``n_t`` number of unique items in parameter target)
        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of
        the ``k=20`` experiment.
    """
    # Check input sanity
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_distance_matrix_shape_fits_labels(D, target)
    IO._check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1       # ascending: want nearest points
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1      # descending: want most similar points

    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)

    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)     # dummy
        train_set_ind = n           # dummy (permutation of an int below
                                    # yields a permuted arange(n))
    else:
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples.
        # BUGFIX: the training set must be drawn from ALL points of the
        # data set (D.shape[0]); previously np.arange(n) with
        # n = test_set_ind.size silently truncated the training set.
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)
    # Number of k-NN parameters: accept scalar int or list of ints
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e

    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))
    cl = np.sort(np.unique(target))
    cmat = np.zeros((k_length, len(cl), len(cl)))
    classes = target.copy()
    for idx, cur_class in enumerate(cl):
        # relabel to 0, 1, ..., len(cl)-1 so labels can index cmat directly
        classes[target == cur_class] = idx
    cl = range(len(cl))

    # Classify each point in test set
    for i in test_set_ind:
        seed_class = classes[i]
        if issparse(D):
            row = D.getrow(i).toarray().ravel()
        else:
            row = D[i, :]
        row[i] = d_self
        # Sort points in training set according to distance.
        # Randomize first, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling).
        rp = train_set_ind
        rp = np.random.permutation(rp)
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        idx = rp[d2idx]

        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            nn_class = classes[idx[0:k[j]]]
            cs = np.bincount(nn_class.astype(int))
            max_cs = np.where(cs == np.max(cs))[0]
            if len(max_cs) > 1:
                # "tie": use nearest neighbor
                if seed_class == nn_class[0]:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, nn_class[0]] += 1
            else:
                # majority vote
                if cl[max_cs[0]] == seed_class:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, cl[max_cs[0]]] += 1
    if verbose:
        log.message("Finished k-NN experiment.")
    return acc, corr, cmat
def mutual_proximity_gammai(D:np.ndarray, metric:str='distance',
                            test_set_ind:np.ndarray=None, verbose:int=0,
                            n_jobs:int=-1, mv=None):
    """Transform a distance matrix with Mutual Proximity (indep. Gamma distr.).

    Applies Mutual Proximity (MP) [1]_ to `D`, modeling distances with
    independent Gamma distributions (FAST). The resulting secondary
    distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        NOTE: In case of sparse `D`, only 'similarity' is supported.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : int, optional (default: -1)
        Number of parallel processes to be used.
        NOTE: set ``n_jobs=-1`` to use all CPUs

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gammai matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    n = D.shape[0]
    sample_size = 0  # not implemented
    train_set_ind = (slice(0, n) if test_set_ind is None
                     else np.setdiff1d(np.arange(n), test_set_ind))
    if not issparse(D):
        # Dense input: no parallel implementation available yet, so fall
        # back to the serial routine in MutualProximity.
        log.warning("MP gammai does not support parallel execution for dense "
                    "matrices at the moment. Continuing with 1 process.")
        from hub_toolbox.MutualProximity import mutual_proximity_gammai
        return mutual_proximity_gammai(D, metric, test_set_ind, verbose)
    return _mutual_proximity_gammai_sparse(D, sample_size, train_set_ind,
                                           verbose, log, mv, n_jobs)
def mutual_proximity_gammai(D: np.ndarray, metric: str = "distance", test_set_ind: np.ndarray = None, verbose: int = 0): """Transform a distance matrix with Mutual Proximity (indep. Gamma distr.). Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gammai variant assumes independent Gamma distributed distances (FAST). The resulting second. distance/similarity matrix should show lower hubness. Parameters ---------- D : ndarray or csr_matrix - ndarray: The ``n x n`` symmetric distance or similarity matrix. - csr_matrix: The ``n x n`` symmetric similarity matrix. NOTE: In case of sparse `D`, zeros are interpreted as missing values and ignored during calculations. Thus, results may differ from using a dense version. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: In case of sparse `D`, only 'similarity' is supported. test_sed_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. verbose : int, optional (default: 0) Increasing level of output (progress report). Returns ------- D_mp : ndarray Secondary distance MP gammai matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. 
""" # Initialization n = D.shape[0] log = Logging.ConsoleLogging() # Checking input IO._check_distance_matrix_shape(D) IO._check_valid_metric_parameter(metric) if metric == "similarity": self_value = 1 else: # metric == 'distance': self_value = 0 if test_set_ind is None: train_set_ind = slice(0, n) else: train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) # Start MP if verbose: log.message("Mutual proximity Gammai rescaling started.", flush=True) D = D.copy() if issparse(D): return _mutual_proximity_gammai_sparse(D, test_set_ind, verbose, log) np.fill_diagonal(D, np.nan) mu = np.nanmean(D[train_set_ind], 0) va = np.nanvar(D[train_set_ind], 0, ddof=1) A = (mu ** 2) / va B = va / mu D_mp = np.zeros_like(D) # MP gammai for i in range(n): if verbose and ((i + 1) % 1000 == 0 or i + 1 == n): log.message("MP_gammai: {} of {}".format(i + 1, n), flush=True) j_idx = slice(i + 1, n) if metric == "similarity": p1 = _local_gamcdf(D[i, j_idx], A[i], B[i]) p2 = _local_gamcdf(D[j_idx, i], A[j_idx], B[j_idx]) D_mp[i, j_idx] = (p1 * p2).ravel() else: # distance p1 = 1 - _local_gamcdf(D[i, j_idx], A[i], B[i]) p2 = 1 - _local_gamcdf(D[j_idx, i], A[j_idx], B[j_idx]) D_mp[i, j_idx] = (1 - p1 * p2).ravel() # Mirroring the matrix D_mp += D_mp.T # set correct self dist/sim np.fill_diagonal(D_mp, self_value) return D_mp
def mutual_proximity_gauss(D: np.ndarray, metric: str = "distance",
                           test_set_ind: np.ndarray = None, verbose: int = 0):
    """Transform a distance matrix with Mutual Proximity (normal distribution).

    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gauss
    variant assumes dependent normal distributions (VERY SLOW).
    The resulting second. distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gauss matrix.

    Raises
    ------
    TypeError
        If `D` is a sparse matrix (not supported by MP Gauss).

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    n = D.shape[0]
    log = Logging.ConsoleLogging()

    # Checking input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == "similarity":
        self_value = 1
    else:  # metric == 'distance':
        self_value = 0
    if issparse(D):
        log.error("Sparse matrices not supported by MP Gauss.")
        raise TypeError("Sparse matrices not supported by MP Gauss.")
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP
    D = D.copy()

    # NOTE(review): self dist/sim is set to self_value BEFORE estimating the
    # per-column moments, so the diagonal contributes to mu/sd. The
    # commented-out nan variant below would exclude it — confirm which is
    # intended before changing.
    np.fill_diagonal(D, self_value)
    # np.fill_diagonal(D, np.nan)

    mu = np.mean(D[train_set_ind], 0)
    sd = np.std(D[train_set_ind], 0, ddof=0)
    # ===========================================================================
    # mu = np.nanmean(D[train_set_ind], 0)
    # sd = np.nanstd(D[train_set_ind], 0, ddof=0)
    # ===========================================================================

    # Code for the BadMatrixSigma error [derived from matlab]
    # ===========================================================================
    # eps = np.spacing(1)
    # epsmat = np.array([[1e5 * eps, 0], [0, 1e5 * eps]])
    # ===========================================================================

    D_mp = np.zeros_like(D)

    # MP Gauss: O(n^2) pairwise loop, each pair evaluates a bivariate
    # normal CDF — hence VERY SLOW.
    for i in range(n):
        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n):
            log.message("MP_gauss: {} of {}.".format(i + 1, n))
        for j in range(i + 1, n):
            # ===================================================================
            # mask = np.isnan(D[[i, j], :])
            # D_mask = np.ma.array(D[[i, j], :], mask=mask)
            # c = np.ma.cov(D_mask, ddof=0)
            # ===================================================================
            # Empirical 2x2 covariance of rows i and j
            c = np.cov(D[[i, j], :], ddof=0)
            x = np.array([D[i, j], D[j, i]])
            m = np.array([mu[i], mu[j]])

            # p12: bivariate normal CDF over (-inf, x] via scipy's mvnun
            low = np.tile(np.finfo(np.float32).min, 2)
            p12 = mvn.mvnun(low, x, m, c)[0]  # [0]...p, [1]...inform
            if np.isnan(p12):
                # ===============================================================
                # power = 7
                # while np.isnan(p12):
                #     c += epsmat * (10**power)
                #     p12 = mvn.mvnun(low, x, m, c)[0]
                #     power += 1
                # log.warning("p12 is NaN: i={}, j={}. Increased cov matrix by "
                #             "O({}).".format(i, j, epsmat[0, 0]*(10**power)))
                # ===============================================================
                # Fallback: treat a failed CDF evaluation as zero probability.
                p12 = 0.0
                log.warning("p12 is NaN: i={}, j={}. Set to zero.".format(i, j))

            if metric == "similarity":
                D_mp[i, j] = p12
            else:  # distance
                # Inclusion-exclusion with the two marginal normal CDFs
                p1 = norm.cdf(D[i, j], mu[i], sd[i])
                p2 = norm.cdf(D[i, j], mu[j], sd[j])
                D_mp[i, j] = p1 + p2 - p12
    # Mirror triu to tril and restore self dist/sim
    D_mp += D_mp.T
    np.fill_diagonal(D_mp, self_value)
    return D_mp
def hubness(D:np.ndarray, k:int=5, metric='distance', verbose:int=0,
            n_jobs:int=-1):
    """Compute hubness of a distance matrix.

    Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse
    nearest neighbor count, i.e. how often does a point occur in the
    `k`-nearest neighbor lists of other points).

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 5)
        Neighborhood size for `k`-occurence.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : int, optional (default: -1)
        Number of parallel processes spawned for hubness calculation.
        Default value (-1): number of available CPUs.

    Returns
    -------
    S_k : float
        Hubness (skewness of `k`-occurence distribution)
    D_k : ndarray
        `k`-nearest neighbor lists
    N_k : ndarray
        `k`-occurence list

    References
    ----------
    .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010).
           Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data.
           Journal of Machine Learning Research, 11, 2487–2531. Retrieved from
           http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/
           radovanovic10a.pdf
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1   # ascending: nearest points first
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1  # descending: most similar points first
    if verbose:
        log.message("Hubness calculation (skewness of {}-occurence)".format(k))

    # Initialization
    n = D.shape[0]
    D = D.copy()
    # D_k[:, i] holds the indices of the k nearest neighbors of point i
    D_k = np.zeros((k, D.shape[1]), dtype=np.float32 )

    if issparse(D):
        pass  # correct self-distance must be ensured upstream for sparse
    else:
        # Set self dist to inf
        np.fill_diagonal(D, d_self)
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self

    # Parallelization: partition the rows of D into one contiguous batch
    # per worker process.
    if n_jobs == -1:  # take all cpus
        NUMBER_OF_PROCESSES = mp.cpu_count()  # @UndefinedVariable
    else:
        NUMBER_OF_PROCESSES = n_jobs
    tasks = []

    batches = []
    batch_size = n // NUMBER_OF_PROCESSES
    for i in range(NUMBER_OF_PROCESSES-1):
        batches.append( np.arange(i*batch_size, (i+1)*batch_size) )
    # Last batch takes the remainder rows (may be larger than batch_size)
    batches.append( np.arange((NUMBER_OF_PROCESSES-1)*batch_size, n) )

    for idx, batch in enumerate(batches):
        submatrix = D[batch[0]:batch[-1]+1]
        tasks.append((_partial_hubness,
                      (k, d_self, log, sort_order,
                       batch, submatrix, idx, n, verbose)))

    # Producer/consumer queues: workers pull (function, args) tuples from
    # task_queue and push (rows, partial D_k) results to done_queue.
    task_queue = mp.Queue()  # @UndefinedVariable
    done_queue = mp.Queue()  # @UndefinedVariable

    for task in tasks:
        task_queue.put(task)

    for i in range(NUMBER_OF_PROCESSES):  # @UnusedVariable
        mp.Process(target=_worker, args=(task_queue, done_queue)).start()  # @UndefinedVariable

    # Collect the partial k-NN lists in whatever order workers finish
    for i in range(len(tasks)):  # @UnusedVariable
        rows, Dk_part = done_queue.get()
        D_k[:, rows[0]:rows[-1]+1] = Dk_part

    # 'STOP' sentinel terminates each worker's task loop
    for i in range(NUMBER_OF_PROCESSES):  # @UnusedVariable
        task_queue.put('STOP')

    # k-occurence
    # NOTE(review): unlike the serial hubness(), this bincount has no
    # minlength=n, so trailing points that never occur as neighbors are
    # dropped from N_k — confirm whether results should match the serial
    # implementation exactly.
    N_k = np.bincount(D_k.astype(int).ravel())
    # Hubness
    S_k = stats.skew(N_k)

    if verbose:
        log.message("Hubness calculation done.", flush=True)

    # return hubness, k-nearest neighbors, N occurence
    return S_k, D_k, N_k
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_set_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.

    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.

    Raises
    ------
    NotImplementedError
        If `D` is a sparse matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    # Checking input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'similarity':
        sort_order = -1        # descending: want highest similarities
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else:  # metric == 'distance':
        sort_order = 1         # ascending: want smallest distances
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
    if issparse(D):
        log.error("Sparse distance matrices are not supported.")
        raise NotImplementedError(
            "Sparse distance matrices are not supported.")
    # D is guaranteed dense from here on (sparse input raised above)
    D = np.copy(D)
    n = D.shape[0]
    if test_set_ind is None:
        train_set_ind = slice(0, n)  # take all
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Local scale r[i]: distance to point i's k-th nearest training neighbor
    r = np.zeros(n)
    for i in range(n):
        # BUGFIX: exclude the self-distance by its TRUE index in the full
        # row BEFORE restricting to the training set (same pattern as in
        # nicdm()). Previously, with a test/train split, ``di[i]`` indexed
        # into the already-restricted row and excluded the wrong point.
        di = D[i, :].copy()
        di[i] = exclude
        di = di[train_set_ind]
        nn = np.argsort(di)[::sort_order]
        r[i] = di[nn[k-1]]  # largest similarities or smallest distances

    D_ls = np.zeros_like(D)
    for i in range(n):
        # vectorized inner loop: calc only triu part
        tmp = np.empty(n-i)
        tmp[0] = self_tmp_value
        if metric == 'similarity':
            tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        else:
            tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        D_ls[i, i:] = tmp
    # copy triu to tril -> symmetric matrix (diag=zeros)
    # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
    D_ls += D_ls.T
    np.fill_diagonal(D_ls, self_value)
    return D_ls
def nicdm(D:np.ndarray, k:int=7, metric:str='distance',
          test_set_ind:np.ndarray=None):
    """Transform a distance matrix with local scaling variant NICDM.

    Rescales `D` with the non-iterative contextual dissimilarity
    measure [1]_, using the average distance to the `k` nearest
    neighbors as the local scale. Like the original local scaling and
    Mutual Proximity, NICDM reduces hubness in distance spaces.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 7)
        Neighborhood radius for local scaling.

    metric : {'distance'}, optional (default: 'distance')
        Currently, only distance matrices are supported.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    Returns
    -------
    D_nicdm : ndarray
        Secondary distance NICDM matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    # Input validation
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'similarity':
        raise NotImplementedError("NICDM does not support similarity matrices "
                                  "at the moment.")
    # Only 'distance' reaches this point, so the sort is ascending and the
    # self-distance is pushed to the end with +inf.
    sort_order, exclude = 1, np.inf
    D = np.copy(D)
    n = D.shape[0]
    train_set_ind = (slice(0, n) if test_set_ind is None
                     else np.setdiff1d(np.arange(n), test_set_ind))

    # knn[i]: the k smallest training distances of point i;
    # r[i]: their arithmetic mean (the local scale).
    knn = np.zeros((n, k))
    r = np.zeros(n)
    np.fill_diagonal(D, np.inf)
    for i in range(n):
        row = D[i, :].copy()
        row[i] = exclude
        row = row[train_set_ind]
        order = np.argsort(row)[::sort_order]
        knn[i, :] = row[order[0:k]]
        r[i] = np.mean(knn[i])
    # Global scale: geometric mean over all collected k-NN distances
    r_geom = _local_geomean(knn.ravel())

    D_nicdm = np.zeros_like(D)
    for i in range(n):
        # vectorized inner loop (triu only) for ~100x speed-up
        D_nicdm[i, i+1:] = (r_geom * D[i, i+1:]) / np.sqrt(r[i] * r[i+1:])
    # mirror triu to tril -> symmetric result
    D_nicdm += D_nicdm.T
    return D_nicdm
def hubness(D:np.ndarray, k:int=5, metric='distance', verbose:int=0):
    """Compute hubness of a distance matrix.

    Hubness [1]_ is the skewness of the `k`-occurrence histogram, i.e. of
    the distribution of how often each point appears in the `k`-nearest
    neighbor lists of the other points.

    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 5)
        Neighborhood size for `k`-occurence.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    Returns
    -------
    S_k : float
        Hubness (skewness of `k`-occurence distribution)
    D_k : ndarray
        `k`-nearest neighbor lists
    N_k : ndarray
        `k`-occurence list

    References
    ----------
    .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010).
           Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data.
           Journal of Machine Learning Research, 11, 2487–2531. Retrieved from
           http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/
           radovanovic10a.pdf
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'similarity':
        d_self, sort_order = -np.inf, -1  # descending: highest similarity
    else:  # metric == 'distance'
        d_self, sort_order = np.inf, 1    # ascending: smallest distance
    if verbose:
        log.message("Hubness calculation (skewness of {}-occurence)".format(k))
    D = D.copy()
    n = D.shape[0]
    # D_k[:, i] collects the indices of the k nearest neighbors of point i
    D_k = np.zeros((k, D.shape[1]), dtype=np.float32)

    if not issparse(D):
        # Self distances must never appear among the nearest neighbors
        np.fill_diagonal(D, d_self)
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self
    # for sparse D, the correct self-distance must be ensured upstream

    for i in range(n):
        if verbose and ((i+1) % 10000 == 0 or i+1 == n):
            log.message("NN: {} of {}.".format(i+1, n), flush=True)
        if issparse(D):
            row = D[i, :].toarray().ravel()  # dense copy of one row
        else:  # normal ndarray
            row = D[i, :]
        row[i] = d_self
        row[~np.isfinite(row)] = d_self
        # Shuffle before the argsort so that equal values are ranked
        # randomly; otherwise constant rows would fake high hubness
        # even where there is none.
        perm = np.random.permutation(n)
        order = np.argsort(row[perm], axis=0)[::sort_order]
        D_k[:, i] = perm[order[0:k]]

    # N-occurence: how often each point appears among the k-NN of others
    N_k = np.bincount(D_k.astype(int).ravel(), minlength=n)
    # Hubness is the skewness of that distribution
    S_k = stats.skew(N_k)

    if verbose:
        log.message("Hubness calculation done.", flush=True)
    # return k-hubness, k-nearest neighbors, k-occurence
    return S_k, D_k, N_k
def test_check_valid_metric(self):
    """An unknown metric string must raise a ValueError."""
    # Fixture prepared outside the context manager: only the call under
    # test should run inside assertRaises.
    metric = 'dissimilarity'
    with self.assertRaises(ValueError):
        IO._check_valid_metric_parameter(metric)
def mutual_proximity_empiric(D:np.ndarray, metric:str='distance',
                             test_set_ind:np.ndarray=None, verbose:int=0,
                             n_jobs:int=-1):
    """Transform a distance matrix with Mutual Proximity (empiric distribution).

    Applies Mutual Proximity (MP) [1]_ to `D` using the empiric data
    distribution (EXACT, rather SLOW). The resulting secondary
    distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        NOTE: In case of sparse `D`, only 'similarity' is supported.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : int, optional (default: -1)
        Number of parallel processes to be used.
        NOTE: set ``n_jobs=-1`` to use all CPUs

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    # DO NOT DELETE this comment, will be used upon parallel MP emp dist impl
    #===========================================================================
    # # Initialization
    # n = D.shape[0]
    #
    # # Check input
    # if D.shape[0] != D.shape[1]:
    #     raise TypeError("Distance/similarity matrix is not quadratic.")
    # if metric == 'similarity':
    #     self_value = 1
    # elif metric == 'distance':
    #     self_value = 0
    #     if issparse(D):
    #         raise ValueError("MP sparse only supports similarity matrices.")
    # else:
    #     raise ValueError("Parameter 'metric' must be 'distance' "
    #                      "or 'similarity'.")
    # if test_set_ind is None:
    #     pass # TODO implement
    #     #train_set_ind = slice(0, n)
    # elif not np.all(~test_set_ind):
    #     raise NotImplementedError("MP empiric does not yet support train/"
    #                               "test splits.")
    #     #train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
    #===========================================================================
    if not issparse(D):
        # Dense input: no parallel implementation exists yet, so delegate
        # to the serial routine in MutualProximity.
        log.warning("MP empiric does not support parallel execution for dense "
                    "matrices at the moment. Continuing with 1 process.")
        from hub_toolbox.MutualProximity import mutual_proximity_empiric
        return mutual_proximity_empiric(D, metric, test_set_ind, verbose)
    return _mutual_proximity_empiric_sparse(D, test_set_ind, verbose, log,
                                            n_jobs)
def shared_nearest_neighbors(D:np.ndarray, k:int=10, metric='distance'):
    """Transform distance matrix using shared nearest neighbors [1]_.

    SNN similarity is based on computing the overlap between the `k`
    nearest neighbors of two objects. SNN approaches try to symmetrize
    nearest neighbor relations using only rank and not distance
    information [2]_.

    Parameters
    ----------
    D : np.ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 10)
        Neighborhood radius: The `k` nearest neighbors are used to
        calculate SNN.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether the matrix `D` is a distance or similarity matrix

    Returns
    -------
    D_snn : ndarray
        Secondary distance SNN matrix

    References
    ----------
    .. [1] R. Jarvis and E. A. Patrick, “Clustering using a similarity
           measure based on shared near neighbors,” IEEE Transactions on
           Computers, vol. 22, pp. 1025–1034, 1973.

    .. [2] Flexer, A., & Schnitzer, D. (2013). Can Shared Nearest Neighbors
           Reduce Hubness in High-Dimensional Spaces? 2013 IEEE 13th
           International Conference on Data Mining Workshops, 460–467.
           http://doi.org/10.1109/ICDMW.2013.101
    """
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'similarity':
        self_value = 1.
        sort_order = -1   # descending: most similar first
        exclude = -np.inf
    else:  # metric == 'distance'
        self_value = 0.
        sort_order = 1    # ascending: nearest first
        exclude = np.inf
    dist = D.copy()
    # A point must never be its own nearest neighbor
    np.fill_diagonal(dist, exclude)
    n = dist.shape[0]

    # Boolean membership matrix: knn[i, j] is True iff j is among the
    # k nearest neighbors of i.
    knn = np.zeros_like(dist, bool)
    for i in range(n):
        ranks = np.argsort(dist[i, :])[::sort_order]
        knn[i, ranks[0:k]] = True

    D_snn = np.zeros_like(dist)
    for i in range(n):
        # Overlap of neighbor sets, triu part only (via broadcasting)
        upper = slice(i+1, n)
        overlap = np.sum(np.logical_and(knn[i, :], knn[upper, :]), 1)
        if metric == 'distance':
            D_snn[i, upper] = 1. - overlap / k
        else:  # metric == 'similarity'
            D_snn[i, upper] = overlap / k
    # mirror triu to tril and restore self dist/sim
    D_snn += D_snn.T
    np.fill_diagonal(D_snn, self_value)
    return D_snn
def mutual_proximity_empiric(
    D: np.ndarray, metric: str = "distance", test_set_ind: np.ndarray = None, verbose: int = 0
):
    """Transform a distance matrix with Mutual Proximity (empiric distribution).

    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using
    the empiric data distribution (EXACT, rather SLOW). The resulting
    secondary distance/similarity matrix should show lower hubness.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

        NOTE: In case of sparse ``D``, zeros are interpreted as missing values
        and ignored during calculations. Thus, results may differ
        from using a dense version.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: In case of sparse `D`, only 'similarity' is supported.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    n = D.shape[0]
    log = Logging.ConsoleLogging()

    # Check input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == "similarity":
        self_value = 1
        exclude_value = np.inf
    else:  # metric == 'distance':
        self_value = 0
        exclude_value = -np.inf
        # BUGFIX: this guard must apply only to DISTANCE matrices. It was
        # previously unindented, rejecting every sparse input and making the
        # sparse-similarity dispatch below unreachable dead code.
        if issparse(D):
            raise ValueError("MP sparse only supports similarity matrices.")
    if test_set_ind is None:
        pass  # TODO implement
        # train_set_ind = slice(0, n)
    elif not np.all(~test_set_ind):
        raise NotImplementedError("MP empiric does not yet support train/"
                                  "test splits.")
        # train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP
    D = D.copy()

    # Sparse similarity matrices are handled by the dedicated sparse routine.
    if issparse(D):
        return _mutual_proximity_empiric_sparse(D, test_set_ind, verbose, log)

    # ensure correct self distances (NOT done for sparse matrices!)
    # exclude_value keeps the diagonal out of every count below.
    np.fill_diagonal(D, exclude_value)

    D_mp = np.zeros_like(D)

    # Calculate MP empiric
    for i in range(n - 1):
        if verbose and ((i + 1) % 1000 == 0 or i == n - 2):
            log.message("MP_empiric: {} of {}.".format(i + 1, n - 1), flush=True)
        # Calculate only triu part of matrix
        j_idx = i + 1

        dI = D[i, :][np.newaxis, :]       # distances of object i to all others
        dJ = D[j_idx:n, :]                # distances of objects j > i to all
        d = D[j_idx:n, i][:, np.newaxis]  # the pairwise values d(i, j)

        if metric == "similarity":
            D_mp[i, j_idx:] = np.sum((dI <= d) & (dJ <= d), 1) / (n - 1)
        else:  # metric == 'distance':
            D_mp[i, j_idx:] = 1 - (np.sum((dI > d) & (dJ > d), 1) / (n - 1))

    # Mirror, so that matrix is symmetric
    D_mp += D_mp.T
    np.fill_diagonal(D_mp, self_value)

    return D_mp
def shared_nearest_neighbors(D: np.ndarray, k: int = 10, metric='similarity'):
    """Transform distance matrix using shared nearest neighbors [1]_.

    SNN similarity is based on computing the overlap between the `k`
    nearest neighbors of two objects. SNN approaches try to symmetrize
    nearest neighbor relations using only rank and not distance
    information [2]_.

    Parameters
    ----------
    D : np.ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 10)
        Neighborhood radius: The `k` nearest neighbors are used to
        calculate SNN.

    metric : {'distance', 'similarity'}, optional (default: 'similarity')
        Define, whether the matrix `D` is a distance or similarity matrix.
        BUGFIX(doc): the docstring previously claimed 'distance' as the
        default, contradicting the actual signature default 'similarity'.

    Returns
    -------
    D_snn : ndarray
        Secondary distance SNN matrix

    References
    ----------
    .. [1] R. Jarvis and E. A. Patrick, “Clustering using a similarity
           measure based on shared near neighbors,” IEEE Transactions on
           Computers, vol. 22, pp. 1025–1034, 1973.

    .. [2] Flexer, A., & Schnitzer, D. (2013). Can Shared Nearest Neighbors
           Reduce Hubness in High-Dimensional Spaces? 2013 IEEE 13th
           International Conference on Data Mining Workshops, 460–467.
           http://doi.org/10.1109/ICDMW.2013.101
    """
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'distance':
        self_value = 0.
        sort_order = 1
        exclude = np.inf
    if metric == 'similarity':
        self_value = 1.
        sort_order = -1
        exclude = -np.inf
    # Exclude self values from the neighborhood ranking.
    distance = D.copy()
    np.fill_diagonal(distance, exclude)
    n = np.shape(distance)[0]
    knn = np.zeros_like(distance, bool)

    # find nearest neighbors for each point
    for i in range(n):
        di = distance[i, :]
        nn = np.argsort(di)[::sort_order]
        knn[i, nn[0:k]] = True

    D_snn = np.zeros_like(distance)
    for i in range(n):
        knn_i = knn[i, :]
        j_idx = slice(i + 1, n)

        # using broadcasting
        Dij = np.sum(np.logical_and(knn_i, knn[j_idx, :]), 1)
        if metric == 'distance':
            D_snn[i, j_idx] = 1. - Dij / k
        else:  # metric == 'similarity':
            D_snn[i, j_idx] = Dij / k

    D_snn += D_snn.T
    np.fill_diagonal(D_snn, self_value)
    return D_snn
def mutual_proximity_gaussi(D:np.ndarray, metric:str='distance', sample_size:int=0, test_set_ind:np.ndarray=None, verbose:int=0, n_jobs:int=-1, mv=None): """Transform a distance matrix with Mutual Proximity (indep. normal distr.). Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gaussi variant assumes independent normal distributions (FAST). The resulting second. distance/similarity matrix should show lower hubness. Parameters ---------- D : ndarray or csr_matrix - ndarray: The ``n x n`` symmetric distance or similarity matrix. - csr_matrix: The ``n x n`` symmetric similarity matrix. metric : {'distance', 'similarity'}, optional (default: 'distance') Define, whether matrix `D` is a distance or similarity matrix. NOTE: In case of sparse `D`, only 'similarity' is supported. sample_size : int, optional (default: 0) Define sample size from which Gauss parameters are estimated. Use all data when set to ``0``. test_sed_ind : ndarray, optional (default: None) Define data points to be hold out as part of a test set. Can be: - None : Rescale all distances - ndarray : Hold out points indexed in this array as test set. verbose : int, optional (default: 0) Increasing level of output (progress report). n_jobs : int, optional (default: -1) Number of parallel processes to be used. NOTE: set ``n_jobs=-1`` to use all CPUs Returns ------- D_mp : ndarray Secondary distance MP gaussi matrix. References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). Local and global scaling reduce hubs in space. The Journal of Machine Learning Research, 13(1), 2871–2902. 
""" # Initialization n = D.shape[0] # @UnusedVariable log = Logging.ConsoleLogging() IO._check_distance_matrix_shape(D) IO._check_valid_metric_parameter(metric) # DO NOT DELETE comment #=========================================================================== # # Checking input # if D.shape[0] != D.shape[1]: # raise TypeError("Distance/similarity matrix is not quadratic.") # if metric == 'similarity': # self_value = 1 # elif metric == 'distance': # self_value = 0 # else: # raise ValueError("Parameter metric must be 'distance' or 'similarity'.") #=========================================================================== if test_set_ind is None: train_set_ind = slice(0, n) else: train_set_ind = np.setdiff1d(np.arange(n), test_set_ind) #=========================================================================== # # Start MP Gaussi # if verbose: # log.message('Mutual Proximity Gaussi rescaling started.', flush=True) # D = D.copy() #=========================================================================== if issparse(D): return _mutual_proximity_gaussi_sparse(D, sample_size, train_set_ind, verbose, log, mv, n_jobs) else: log.warning("MP gaussi does not support parallel execution for dense " "matrices at the moment. Continuing with 1 process.") from hub_toolbox.MutualProximity import mutual_proximity_gaussi return mutual_proximity_gaussi(D, metric, sample_size, test_set_ind, verbose)