예제 #1
0
    def __init__(self, D:np.ndarray, secondary_distance_type:str,
                 metric:str='distance', classes:np.ndarray=None,
                 vectors:np.ndarray=None):
        """Set up a single hubness experiment.

        Validates the distance matrix, labels and vectors before storing
        them; result attributes are populated later by other functions.
        """
        # Validate all inputs first (each check raises on bad arguments).
        IO._check_distance_matrix_shape(D)
        IO._check_valid_metric_parameter(metric)
        if secondary_distance_type not in SEC_DIST:
            raise ValueError("Requested secondary distance type unknown.")
        if classes is not None:
            IO._check_distance_matrix_shape_fits_labels(D, classes)
        if vectors is not None:
            IO._check_distance_matrix_shape_fits_vectors(D, vectors)
            self.embedding_dim = vectors.shape[1]
        else:
            self.embedding_dim = None
        # Store the validated inputs.
        self.original_distance = D
        self.secondary_distance_type = secondary_distance_type
        self.classes = classes
        self.vectors = vectors
        self.metric = metric
        self.n = D.shape[0]
        # Results, obtained later through functions:
        self.secondary_distance = None
        self.hubness = {}
        self.anti_hubs = {}
        self.max_hub_k_occurence = {}
        self.knn_accuracy = {}
        self.gk_index = None
예제 #2
0
def load_dexter():
    """Load the dexter data set.

    .. note:: Deprecated in hub-toolbox 2.3
              Will be removed in hub-toolbox 3.0.
              Please use IO.load_dexter() instead.
    """
    # Backward-compatibility shim: simply delegate to the IO module.
    return IO.load_dexter()
예제 #3
0
    def __init__(self, D:np.ndarray=None, classes:np.ndarray=None,
                 vectors:np.ndarray=None, metric:str='distance'):
        """Set up a quick hubness analysis.

        Parameters
        ----------
        D : ndarray, optional (default: None)
            The n x n symmetric distance (similarity) matrix.
            Default: load example dataset (dexter).

        classes : ndarray, optional (default: None)
            The 1 x n class labels. Required for k-NN, GK.

        vectors : ndarray, optional (default: None)
            The m x n vector data. Required for IntrDim estimation.

        metric : {'distance', 'similarity'}
            Define whether `D` is a distance or similarity matrix.
        """
        self.has_class_data = False
        self.has_vector_data = False
        if D is None:
            # No data given: fall back to the bundled example data set.
            print('\n'
                  'NO PARAMETERS GIVEN! Loading & evaluating DEXTER data set.'
                  '\n'
                  'DEXTER is a text classification problem in a bag-of-word \n'
                  'representation. This is a two-class classification problem\n'
                  'with sparse continuous input variables. \n'
                  'This dataset is one of five datasets of the NIPS 2003\n'
                  'feature selection challenge.\n'
                  'http://archive.ics.uci.edu/ml/datasets/Dexter\n')
            self.D, self.classes, self.vectors = IO.load_dexter()
            self.has_class_data = True
            self.has_vector_data = True
            self.metric = 'distance'
        else:
            # Work on float64 copies so the caller's arrays stay untouched
            # (and e.g. int16 input does not break later computations).
            self.D = np.copy(D).astype(np.float64)
            if classes is not None:
                self.classes = np.copy(classes).astype(np.float64)
                self.has_class_data = True
            else:
                self.classes = None
            if vectors is not None:
                self.vectors = np.copy(vectors).astype(np.float64)
                self.has_vector_data = True
            else:
                self.vectors = None
            self.metric = metric
        self.n = len(self.D)
        self.experiments = []
예제 #4
0
 def __init__(self, D, k:int=5, isSimilarityMatrix:bool=False):
     """Store matrix `D` and neighborhood size `k`; set sort direction
     according to whether `D` holds similarities or distances."""
     self.log = Logging.ConsoleLogging()
     self.k = k
     # Memory-mapped input is used as-is; anything else goes through IO,
     # which copies or loads it read-only.
     if not isinstance(D, np.memmap):
         self.D = IO.copy_D_or_load_memmap(D, writeable=False)
     else:
         self.D = D
     if isSimilarityMatrix:
         # descending sort, interested in highest similarity
         self.d_self = -np.inf
         self.sort_order = -1
     else:
         # ascending sort, interested in smallest distance
         self.d_self = np.inf
         self.sort_order = 1
     np.random.seed()
예제 #5
0
 def __init__(self, D, isSimilarityMatrix=False):
     """
     .. note:: Deprecated in hub-toolbox 2.3
               Class will be removed in hub-toolbox 3.0.
               Please use static functions instead.
     """
     print("DEPRECATED: Please use the appropriate MutualProximity."
           "mutual_proximity_DISTRIBUTIONTYPE() function instead.",
           file=sys.stderr)
     self.log = Logging.ConsoleLogging()
     self.D = IO.copy_D_or_load_memmap(D, writeable=True)
     self.isSimilarityMatrix = isSimilarityMatrix
     # Self "distance" is 1 for similarity matrices, 0 for distances.
     self.self_value = 1 if isSimilarityMatrix else 0
 def __init__(self, D, isSimilarityMatrix=False, missing_values=None, tmp='/tmp/'):
     """
     .. note:: Deprecated in hub-toolbox 2.3
               Class will be removed in hub-toolbox 3.0.
               Please use static functions instead.
     """
     print("DEPRECATED: Please use the appropriate MutualProximity_parallel."
           "mutual_proximity_DISTRIBUTIONTYPE() function instead.",
           file=sys.stderr)
     self.D = IO.copy_D_or_load_memmap(D, writeable=True)
     self.log = Logging.ConsoleLogging()
     self.isSimilarityMatrix = isSimilarityMatrix
     # Self "distance" is 1 for similarity matrices, 0 for distances.
     self.self_value = 1 if isSimilarityMatrix else 0
     self.tmp = tmp
     if missing_values is not None:
         self.mv = missing_values
     elif issparse(D):
         # For sparse input, zeros are treated as missing values.
         self.mv = 0
     else:
         self.mv = None
예제 #7
0
 def test_check_dist_vs_vectors(self):
     """IO must reject vectors whose count does not match D's rows."""
     D = np.zeros((5, 5))
     vectors = np.zeros((4, 5))
     with self.assertRaises(TypeError):
         IO._check_distance_matrix_shape_fits_vectors(D, vectors)
예제 #8
0
 def test_check_dist_vs_classes(self):
     """IO must reject labels whose count does not match D's rows."""
     D = np.empty((5, 5))
     classes = np.empty(4)
     with self.assertRaises(TypeError):
         IO._check_distance_matrix_shape_fits_labels(D, classes)
예제 #9
0
 def test_check_shape(self):
     """IO must reject a non-square distance matrix."""
     d = np.empty((2, 3))
     with self.assertRaises(TypeError):
         IO._check_distance_matrix_shape(d)
예제 #10
0
def score(D:np.ndarray, target:np.ndarray, k=5, 
          metric:str='distance', test_set_ind:np.ndarray=None, verbose:int=0):
    """Perform `k`-nearest neighbor classification.
    
    Use the ``n x n`` symmetric distance matrix `D` and target class 
    labels `target` to perform a `k`-NN experiment (leave-one-out 
    cross-validation or evaluation of test set; see parameter `test_set_ind`).
    Ties are broken by the nearest neighbor.
    
    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
    
    target : ndarray (of dtype=int)
        The ``n x 1`` target class labels (ground truth).
    
    k : int or array_like (of dtype=int), optional (default: 5)
        Neighborhood size for `k`-NN classification.
        For each value in `k`, one `k`-NN experiment is performed.
        
        HINT: Providing more than one value for `k` is a cheap means to perform 
        multiple `k`-NN experiments at once. Try e.g. ``k=[1, 5, 20]``.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix
    
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Perform a LOO-CV experiment
        - ndarray : Hold out points indexed in this array as test set. Fit 
          model to remaining data. Evaluate model on test set.
    
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
    
    Returns
    -------
    acc : ndarray (shape=(n_k x 1), dtype=float)
        Classification accuracy (`n_k`... number of items in parameter `k`)
        
        HINT: Refering to the above example... 
        ... ``acc[0]`` gives the accuracy of the ``k=1`` experiment.
    corr : ndarray (shape=(n_k x n), dtype=int)
        Raw vectors of correctly classified items
        
        HINT: ... ``corr[1, :]`` gives these items for the ``k=5`` experiment.
    cmat : ndarray (shape=(n_k x n_t x n_t), dtype=int) 
        Confusion matrix (``n_t`` number of unique items in parameter target)
        
        HINT: ... ``cmat[2, :, :]`` gives the confusion matrix of 
        the ``k=20`` experiment.
    """
    
    # Check input sanity
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_distance_matrix_shape_fits_labels(D, target)
    IO._check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    else:  # metric == 'similarity' (guaranteed by the check above)
        d_self = -np.inf
        sort_order = -1
    
    # Copy, because data is changed
    D = D.copy()
    target = target.astype(int)
    
    if verbose:
        log.message("Start k-NN experiment.")
    # Handle LOO-CV vs. test set mode
    if test_set_ind is None:
        n = D.shape[0]
        test_set_ind = range(n)    # dummy
        train_set_ind = n          # dummy; permuted to a full range below
    else:
        # number of points to be classified
        n = test_set_ind.size
        # Indices of training examples: all points NOT in the test set.
        # BUGFIX: must be taken w.r.t. the full data set size ``D.shape[0]``;
        # previously ``np.arange(n)`` used the (already reassigned) test set
        # size, yielding a wrong/truncated training set.
        train_set_ind = np.setdiff1d(np.arange(D.shape[0]), test_set_ind)
    # Number of k-NN parameters: normalize `k` to a 1-D ndarray
    try:
        k_length = k.size
    except AttributeError as e:
        if isinstance(k, int):
            k = np.array([k])
            k_length = k.size
        elif isinstance(k, list):
            k = np.array(k)
            k_length = k.size
        else:
            raise e
        
    acc = np.zeros((k_length, 1))
    corr = np.zeros((k_length, D.shape[0]))
        
    cl = np.sort(np.unique(target))
    cmat = np.zeros((k_length, len(cl), len(cl)))
    
    classes = target.copy()
    for idx, cur_class in enumerate(cl):
        # change labels to 0, 1, ..., len(cl)-1
        classes[target == cur_class] = idx
    
    cl = range(len(cl))
    
    # Classify each point in test set
    for i in test_set_ind:
        seed_class = classes[i]
        
        if issparse(D):
            row = D.getrow(i).toarray().ravel()
        else:
            row = D[i, :]
        # Exclude the point itself from its own neighborhood
        row[i] = d_self
        
        # Sort points in training set according to distance
        # Randomize, in case there are several points of same distance
        # (this is especially relevant for SNN rescaling)
        rp = train_set_ind
        rp = np.random.permutation(rp)  # int -> permuted arange in LOO mode
        d2 = row[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        idx = rp[d2idx]      
        
        # More than one k is useful for cheap multiple k-NN experiments at once
        for j in range(k_length):
            nn_class = classes[idx[0:k[j]]]
            cs = np.bincount(nn_class.astype(int))
            max_cs = np.where(cs == np.max(cs))[0]
            
            # "tie": use nearest neighbor
            if len(max_cs) > 1:
                if seed_class == nn_class[0]:
                    acc[j] += 1/n 
                    corr[j, i] = 1
                cmat[j, seed_class, nn_class[0]] += 1       
            # majority vote
            else:
                if cl[max_cs[0]] == seed_class:
                    acc[j] += 1/n
                    corr[j, i] = 1
                cmat[j, seed_class, cl[max_cs[0]]] += 1
                       
    if verbose:
        log.message("Finished k-NN experiment.")
        
    return acc, corr, cmat
def mutual_proximity_gammai(D:np.ndarray, metric:str='distance', 
                            test_set_ind:np.ndarray=None, verbose:int=0, 
                            n_jobs:int=-1, mv=None):
    """Transform a distance matrix with Mutual Proximity (indep. Gamma distr.).
    
    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gammai 
    variant assumes independent Gamma distributed distances (FAST).
    The resulting second. distance/similarity matrix should show lower hubness.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
        NOTE: In case of sparse `D`, only 'similarity' is supported.
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
        
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    n_jobs : int, optional (default: -1)
        Number of parallel processes to be used.
        
        NOTE: set ``n_jobs=-1`` to use all CPUs
        
    mv : optional (default: None)
        Missing-value marker, forwarded to the sparse implementation only.
        Ignored for dense `D`.
        
    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gammai matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    n = D.shape[0]
    sample_size = 0 # not implemented
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)    
    # Sparse input: use the parallel sparse implementation.
    if issparse(D):
        return _mutual_proximity_gammai_sparse(D, sample_size, train_set_ind, 
                                               verbose, log, mv, n_jobs)
    else:
        # Dense input: fall back to the single-process implementation.
        log.warning("MP gammai does not support parallel execution for dense "
                    "matrices at the moment. Continuing with 1 process.")
        from hub_toolbox.MutualProximity import mutual_proximity_gammai
        return mutual_proximity_gammai(D, metric, test_set_ind, verbose)
예제 #12
0
def mutual_proximity_gammai(D: np.ndarray, metric: str = "distance", test_set_ind: np.ndarray = None, verbose: int = 0):
    """Transform a distance matrix with Mutual Proximity (indep. Gamma distr.).
    
    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gammai 
    variant assumes independent Gamma distributed distances (FAST).
    The resulting second. distance/similarity matrix should show lower hubness.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.
        
        NOTE: In case of sparse `D`, zeros are interpreted as missing values 
        and ignored during calculations. Thus, results may differ 
        from using a dense version.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
        NOTE: In case of sparse `D`, only 'similarity' is supported.
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
        
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gammai matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    n = D.shape[0]
    log = Logging.ConsoleLogging()

    # Checking input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == "similarity":
        self_value = 1
    else:  # metric == 'distance':
        self_value = 0
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP
    if verbose:
        log.message("Mutual proximity Gammai rescaling started.", flush=True)
    # Work on a copy; the diagonal is overwritten below.
    D = D.copy()

    if issparse(D):
        return _mutual_proximity_gammai_sparse(D, test_set_ind, verbose, log)

    # Exclude self-distances from the moment estimates below.
    np.fill_diagonal(D, np.nan)

    # Per-column sample mean and (unbiased, ddof=1) variance over the
    # training rows.
    mu = np.nanmean(D[train_set_ind], 0)
    va = np.nanvar(D[train_set_ind], 0, ddof=1)
    # Method-of-moments Gamma parameters: shape A = mu^2/var, scale B = var/mu.
    A = (mu ** 2) / va
    B = va / mu

    D_mp = np.zeros_like(D)

    # MP gammai
    for i in range(n):
        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n):
            log.message("MP_gammai: {} of {}".format(i + 1, n), flush=True)
        # Upper triangle only; mirrored into the lower triangle below.
        j_idx = slice(i + 1, n)

        if metric == "similarity":
            p1 = _local_gamcdf(D[i, j_idx], A[i], B[i])
            p2 = _local_gamcdf(D[j_idx, i], A[j_idx], B[j_idx])
            D_mp[i, j_idx] = (p1 * p2).ravel()
        else:  # distance
            p1 = 1 - _local_gamcdf(D[i, j_idx], A[i], B[i])
            p2 = 1 - _local_gamcdf(D[j_idx, i], A[j_idx], B[j_idx])
            D_mp[i, j_idx] = (1 - p1 * p2).ravel()

    # Mirroring the matrix
    D_mp += D_mp.T
    # set correct self dist/sim
    np.fill_diagonal(D_mp, self_value)

    return D_mp
예제 #13
0
def mutual_proximity_gauss(D: np.ndarray, metric: str = "distance", test_set_ind: np.ndarray = None, verbose: int = 0):
    """Transform a distance matrix with Mutual Proximity (normal distribution).
    
    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gauss 
    variant assumes dependent normal distributions (VERY SLOW).
    The resulting second. distance/similarity matrix should show lower hubness.
    
    Parameters
    ----------
    D : ndarray
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
        
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gauss matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    n = D.shape[0]
    log = Logging.ConsoleLogging()

    # Checking input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == "similarity":
        self_value = 1
    else:  # metric == 'distance':
        self_value = 0
    if issparse(D):
        log.error("Sparse matrices not supported by MP Gauss.")
        raise TypeError("Sparse matrices not supported by MP Gauss.")
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP
    # Work on a copy; the diagonal is overwritten below.
    D = D.copy()

    np.fill_diagonal(D, self_value)
    # np.fill_diagonal(D, np.nan)

    # Per-column mean and population (ddof=0) standard deviation over the
    # training rows.
    mu = np.mean(D[train_set_ind], 0)
    sd = np.std(D[train_set_ind], 0, ddof=0)
    # ===========================================================================
    # mu = np.nanmean(D[train_set_ind], 0)
    # sd = np.nanstd(D[train_set_ind], 0, ddof=0)
    # ===========================================================================

    # Code for the BadMatrixSigma error [derived from matlab]
    # ===========================================================================
    # eps = np.spacing(1)
    # epsmat = np.array([[1e5 * eps, 0], [0, 1e5 * eps]])
    # ===========================================================================

    D_mp = np.zeros_like(D)

    # MP Gauss: pairwise bivariate-normal CDF over the upper triangle
    for i in range(n):
        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n):
            log.message("MP_gauss: {} of {}.".format(i + 1, n))
        for j in range(i + 1, n):
            # ===================================================================
            # mask = np.isnan(D[[i, j], :])
            # D_mask = np.ma.array(D[[i, j], :], mask=mask)
            # c = np.ma.cov(D_mask, ddof=0)
            # ===================================================================
            # Empirical 2x2 covariance of the two distance rows.
            c = np.cov(D[[i, j], :], ddof=0)
            x = np.array([D[i, j], D[j, i]])
            m = np.array([mu[i], mu[j]])

            # Joint probability P(X <= x) under the bivariate normal (m, c),
            # integrated from effectively -inf.
            low = np.tile(np.finfo(np.float32).min, 2)
            p12 = mvn.mvnun(low, x, m, c)[0]  # [0]...p, [1]...inform
            if np.isnan(p12):
                # ===============================================================
                # power = 7
                # while np.isnan(p12):
                #     c += epsmat * (10**power)
                #     p12 = mvn.mvnun(low, x, m, c)[0]
                #     power += 1
                # log.warning("p12 is NaN: i={}, j={}. Increased cov matrix by "
                #             "O({}).".format(i, j, epsmat[0, 0]*(10**power)))
                # ===============================================================

                p12 = 0.0
                log.warning("p12 is NaN: i={}, j={}. Set to zero.".format(i, j))

            if metric == "similarity":
                D_mp[i, j] = p12
            else:  # distance
                p1 = norm.cdf(D[i, j], mu[i], sd[i])
                p2 = norm.cdf(D[i, j], mu[j], sd[j])
                D_mp[i, j] = p1 + p2 - p12
    # Mirror the upper triangle and restore the self dist/sim value.
    D_mp += D_mp.T
    np.fill_diagonal(D_mp, self_value)
    return D_mp
예제 #14
0
def hubness(D:np.ndarray, k:int=5, metric='distance', 
            verbose:int=0, n_jobs:int=-1):
    """Compute hubness of a distance matrix.
    
    Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse 
    nearest neighbor count, i.e. how often does a point occur in the 
    `k`-nearest neighbor lists of other points).
    
    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
    
    k : int, optional (default: 5)
        Neighborhood size for `k`-occurrence.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix
    
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    n_jobs : int, optional (default: -1)
        Number of parallel processes spawned for hubness calculation.
        Default value (-1): number of available CPUs.
        
    Returns
    -------
    S_k : float
        Hubness (skewness of `k`-occurrence distribution)
    D_k : ndarray
        `k`-nearest neighbor lists
    N_k : ndarray
        `k`-occurrence list    
    
    References
    ----------
    .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010). 
           Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data. 
           Journal of Machine Learning Research, 11, 2487–2531. Retrieved from 
           http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/
           radovanovic10a.pdf
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    if metric == 'similarity':
        d_self = -np.inf
        sort_order = -1
    
    if verbose:
        log.message("Hubness calculation (skewness of {}-occurence)".format(k))
        
    # Initialization
    n = D.shape[0]
    D = D.copy()
    D_k = np.zeros((k, D.shape[1]), dtype=np.float32 )
    
    if issparse(D): 
        pass # correct self-distance must be ensured upstream for sparse
    else:
        # Set self dist to inf
        np.fill_diagonal(D, d_self)
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self
                        
    # Parallelization
    if n_jobs == -1: # take all cpus
        NUMBER_OF_PROCESSES = mp.cpu_count()  # @UndefinedVariable
    else:
        NUMBER_OF_PROCESSES = n_jobs
    tasks = []
    
    # Split the rows of D into one contiguous batch per process; the last
    # batch absorbs the remainder rows.
    # NOTE(review): if n < NUMBER_OF_PROCESSES, batch_size == 0 and the
    # empty batches would make batch[0] below fail — confirm callers
    # guarantee n >= number of processes.
    batches = []
    batch_size = n // NUMBER_OF_PROCESSES
    for i in range(NUMBER_OF_PROCESSES-1):
        batches.append( np.arange(i*batch_size, (i+1)*batch_size) )
    batches.append( np.arange((NUMBER_OF_PROCESSES-1)*batch_size, n) )
    
    for idx, batch in enumerate(batches):
        submatrix = D[batch[0]:batch[-1]+1]
        tasks.append((_partial_hubness, 
                     (k, d_self, log, sort_order, 
                      batch, submatrix, idx, n, verbose)))   
    
    task_queue = mp.Queue()  # @UndefinedVariable
    done_queue = mp.Queue()  # @UndefinedVariable
    
    for task in tasks:
        task_queue.put(task)
        
    for i in range(NUMBER_OF_PROCESSES):  # @UnusedVariable
        mp.Process(target=_worker, args=(task_queue, done_queue)).start()  # @UndefinedVariable
    
    # Collect the per-batch k-NN lists (arrive in arbitrary order; the row
    # indices returned with each part determine placement).
    for i in range(len(tasks)):  # @UnusedVariable
        rows, Dk_part = done_queue.get()
        D_k[:, rows[0]:rows[-1]+1] = Dk_part
        
    # Tell all workers to terminate.
    for i in range(NUMBER_OF_PROCESSES):  # @UnusedVariable
        task_queue.put('STOP')        
               
    # k-occurrence: how often each point appears among the k-NN lists
    N_k = np.bincount(D_k.astype(int).ravel())    
    # Hubness
    S_k = stats.skew(N_k)
     
    if verbose:
        log.message("Hubness calculation done.", flush=True)
        
    # return hubness, k-nearest neighbors, N occurrence
    return S_k, D_k, N_k
예제 #15
0
def local_scaling(D:np.ndarray, k:int=7, metric:str='distance',
                  test_set_ind:np.ndarray=None):
    """Transform a distance matrix with Local Scaling.
    
    Transforms the given distance matrix into new one using local scaling [1]_
    with the given `k`-th nearest neighbor. There are two types of local
    scaling methods implemented. The original one and NICDM, both reduce
    hubness in distance spaces, similarly to Mutual Proximity.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        The ``n x n`` symmetric distance (similarity) matrix.
    
    k : int, optional (default: 7)
        Neighborhood radius for local scaling.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
        NOTE: self similarities in sparse `D_ls` are set to ``np.inf``
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
        
    Returns
    -------
    D_ls : ndarray
        Secondary distance LocalScaling matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    # Checking input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'similarity':
        sort_order = -1
        exclude = -np.inf
        self_tmp_value = np.inf
        self_value = 1.
        log.warning("Similarity matrix support for LS is experimental.")
    else: # metric == 'distance':
        sort_order = 1
        exclude = np.inf
        self_value = 0
        self_tmp_value = self_value
        if issparse(D):
            log.error("Sparse distance matrices are not supported.")
            raise NotImplementedError(
                "Sparse distance matrices are not supported.") 
            
    D = np.copy(D)
    n = D.shape[0]
    if test_set_ind is None:
        train_set_ind = slice(0, n) #take all        
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
    
    # r[i]: distance (or similarity) of point i to its k-th nearest
    # training neighbor — the local scaling factor.
    r = np.zeros(n)
    for i in range(n):
        if issparse(D):
            # NOTE(review): .toarray() yields a 2-D (1, n) array here, so
            # the row indexing di[i] below only works for i == 0 — the
            # sparse path looks broken; verify (.ravel() may be missing).
            di = D[i, train_set_ind].toarray()
        else:
            di = D[i, train_set_ind]
        # NOTE(review): when test_set_ind is given, index i refers to the
        # already-reduced array, not to row i of the full matrix — confirm
        # this mapping is intended.
        di[i] = exclude
        nn = np.argsort(di)[::sort_order]
        r[i] = di[nn[k-1]] #largest similarities or smallest distances
    
    if issparse(D):
        D_ls = lil_matrix(D.shape)
    else:
        D_ls = np.zeros_like(D)
        
    for i in range(n):
        # vectorized inner loop: calc only triu part
        tmp = np.empty(n-i)
        tmp[0] = self_tmp_value
        if metric == 'similarity':
            tmp[1:] = np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        else:
            tmp[1:] = 1 - np.exp(-1 * D[i, i+1:]**2 / (r[i] * r[i+1:]))
        D_ls[i, i:] = tmp
    # copy triu to tril -> symmetric matrix (diag=zeros)
    # NOTE: does not affect self values, since inf+inf=inf and 0+0=0
    D_ls += D_ls.T
    
    if issparse(D):
        return D_ls.tocsr()
    else:
        np.fill_diagonal(D_ls, self_value)
        return D_ls
예제 #16
0
def nicdm(D:np.ndarray, k:int=7, metric:str='distance', 
          test_set_ind:np.ndarray=None):
    """Transform a distance matrix with local scaling variant NICDM.
    
    Transforms the given distance matrix into new one using NICDM [1]_
    with the given neighborhood radius `k` (average). There are two types of 
    local scaling methods implemented. The original one and the non-iterative 
    contextual dissimilarity measure, both reduce hubness in distance spaces, 
    similarly to Mutual Proximity.
    
    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
    
    k : int, optional (default: 7)
        Neighborhood radius for local scaling.
    
    metric : {'distance'}, optional (default: 'distance')
        Currently, only distance matrices are supported.
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
        
    Returns
    -------
    D_nicdm : ndarray
        Secondary distance NICDM matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    # Checking input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    if metric == 'similarity':
        raise NotImplementedError("NICDM does not support similarity matrices "
                                  "at the moment.")
    # Only metric == 'distance' reaches this point, so the former similarity
    # branch (sort_order = -1, exclude = -inf) was unreachable and is removed.
    sort_order = 1
    exclude = np.inf

    # Work on a copy; the diagonal is overwritten below.
    D = np.copy(D)
    n = D.shape[0]
    
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Scaling factors: r[i] is the mean distance of point i to its k nearest
    # training neighbors; r_geom is the geometric mean over all k-NN distances.
    knn = np.zeros((n, k))
    r = np.zeros(n)
    np.fill_diagonal(D, np.inf)  # never pick a point as its own neighbor
    for i in range(n):
        di = D[i, :].copy()
        di[i] = exclude
        di = di[train_set_ind]
        nn = np.argsort(di)[::sort_order]
        knn[i, :] = di[nn[0:k]] # smallest distances
        r[i] = np.mean(knn[i]) 
    r_geom = _local_geomean(knn.ravel())
     
    D_nicdm = np.zeros_like(D)
    for i in range(n):
        # vectorized inner loop for 100x speed-up (using broadcasting)
        D_nicdm[i, i+1:] = (r_geom * D[i, i+1:]) / np.sqrt(r[i] * r[i+1:])
    # Mirror triu to tril -> symmetric matrix (diagonal stays zero).
    D_nicdm += D_nicdm.T
     
    return D_nicdm
예제 #17
0
def hubness(D:np.ndarray, k:int=5, metric='distance', verbose:int=0):
    """Compute hubness of a distance matrix.
    
    Hubness [1]_ is the skewness of the `k`-occurrence histogram (reverse 
    nearest neighbor count, i.e. how often does a point occur in the 
    `k`-nearest neighbor lists of other points).
    
    Parameters
    ----------
    D : ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
    
    k : int, optional (default: 5)
        Neighborhood size for `k`-occurence.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix
    
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    Returns
    -------
    S_k : float
        Hubness (skewness of `k`-occurence distribution)
    D_k : ndarray
        `k`-nearest neighbor lists
    N_k : ndarray
        `k`-occurence list
    
    References
    ----------
    .. [1] Radovanović, M., Nanopoulos, A., & Ivanović, M. (2010). 
           Hubs in Space : Popular Nearest Neighbors in High-Dimensional Data. 
           Journal of Machine Learning Research, 11, 2487–2531. Retrieved from 
           http://jmlr.csail.mit.edu/papers/volume11/radovanovic10a/
           radovanovic10a.pdf
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    # if/elif/else instead of two independent ifs: an unexpected metric
    # value would otherwise leave d_self/sort_order undefined (NameError).
    if metric == 'distance':
        d_self = np.inf
        sort_order = 1
    elif metric == 'similarity':
        d_self = -np.inf
        sort_order = -1
    else:  # defensive; _check_valid_metric_parameter should have raised already
        raise ValueError("Parameter 'metric' must be 'distance' or 'similarity'.")

    if verbose:
        log.message("Hubness calculation (skewness of {}-occurence)".format(k))
    # Work on a copy so the caller's matrix is not modified.
    D = D.copy()
    # NOTE(review): indices are stored as float32 here; precision is only
    # guaranteed for n < 2**24 — confirm if larger data sets are expected.
    D_k = np.zeros((k, D.shape[1]), dtype=np.float32)
    n = D.shape[0]

    if issparse(D):
        pass # correct self-distance must be ensured upstream for sparse
    else:
        # Self-distances must never be selected as nearest neighbors:
        # set the diagonal so it sorts last.
        np.fill_diagonal(D, d_self)
        # make non-finite (NaN, Inf) appear at the end of the sorted list
        D[~np.isfinite(D)] = d_self

    for i in range(n):
        if verbose and ((i+1) % 10000 == 0 or i+1 == n):
            log.message("NN: {} of {}.".format(i+1, n), flush=True)
        if issparse(D):
            d = D[i, :].toarray().ravel() # dense copy of one row
        else: # normal ndarray
            d = D[i, :]
        d[i] = d_self
        d[~np.isfinite(d)] = d_self
        # Randomize equal values in the distance matrix rows to avoid the 
        # problem case if all numbers to sort are the same, which would yield 
        # high hubness, even if there is none.
        rp = np.random.permutation(n)
        d2 = d[rp]
        d2idx = np.argsort(d2, axis=0)[::sort_order]
        D_k[:, i] = rp[d2idx[0:k]]

    # N-occurence: how often each point appears among all k-NN lists
    N_k = np.bincount(D_k.astype(int).ravel(), minlength=n)
    # Hubness is the skewness of the k-occurrence distribution
    S_k = stats.skew(N_k)

    # return k-hubness, k-nearest neighbors, k-occurence
    if verbose:
        log.message("Hubness calculation done.", flush=True)
    return S_k, D_k, N_k
예제 #18
0
 def test_check_valid_metric(self):
     """An unknown metric name must raise a ValueError."""
     metric = 'dissimilarity'
     with self.assertRaises(ValueError):
         IO._check_valid_metric_parameter(metric)
def mutual_proximity_empiric(D:np.ndarray, metric:str='distance', 
                             test_set_ind:np.ndarray=None, verbose:int=0,
                             n_jobs:int=-1):
    """Transform a distance matrix with Mutual Proximity (empiric distribution).
    
    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using 
    the empiric data distribution (EXACT, rather SLOW). The resulting 
    secondary distance/similarity matrix should show lower hubness.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
        NOTE: In case of sparse `D`, only 'similarity' is supported.
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
        
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    n_jobs : int, optional (default: -1)
        Number of parallel processes to be used.
        
        NOTE: set ``n_jobs=-1`` to use all CPUs
        
    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    # DO NOT DELETE this comment, will be used upon parallel MP emp dist impl
    #===========================================================================
    # # Initialization
    # n = D.shape[0]
    # 
    # # Check input
    # if D.shape[0] != D.shape[1]:
    #     raise TypeError("Distance/similarity matrix is not quadratic.")        
    # if metric == 'similarity':
    #     self_value = 1
    # elif metric == 'distance':
    #     self_value = 0
    #     if issparse(D):
    #         raise ValueError("MP sparse only supports similarity matrices.")
    # else:
    #     raise ValueError("Parameter 'metric' must be 'distance' "
    #                      "or 'similarity'.")  
    # if test_set_ind is None:
    #     pass # TODO implement
    #     #train_set_ind = slice(0, n)
    # elif not np.all(~test_set_ind):
    #     raise NotImplementedError("MP empiric does not yet support train/"
    #                               "test splits.")
    #     #train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
    #===========================================================================
    # Dispatch: sparse input goes to the parallel sparse implementation;
    # dense input falls back to the single-process dense implementation.
    if issparse(D):
        return _mutual_proximity_empiric_sparse(D, test_set_ind, verbose, log, n_jobs)
    else:
        log.warning("MP empiric does not support parallel execution for dense "
                    "matrices at the moment. Continuing with 1 process.")
        from hub_toolbox.MutualProximity import mutual_proximity_empiric
        return mutual_proximity_empiric(D, metric, test_set_ind, verbose)
예제 #20
0
def shared_nearest_neighbors(D:np.ndarray, k:int=10, metric='distance'):
    """Transform distance matrix using shared nearest neighbors [1]_.
    
    SNN similarity is based on computing the overlap between the `k` nearest 
    neighbors of two objects. SNN approaches try to symmetrize nearest neighbor 
    relations using only rank and not distance information [2]_.
    
    Parameters
    ----------
    D : np.ndarray
        The ``n x n`` symmetric distance (similarity) matrix.
        
    k : int, optional (default: 10)
        Neighborhood radius: The `k` nearest neighbors are used to calculate SNN.
        
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether the matrix `D` is a distance or similarity matrix

    Returns
    -------
    D_snn : ndarray
        Secondary distance SNN matrix
        
    References
    ---------- 
    .. [1] R. Jarvis and E. A. Patrick, “Clustering using a similarity measure 
           based on shared near neighbors,” IEEE Transactions on Computers, 
           vol. 22, pp. 1025–1034, 1973.
    
    .. [2] Flexer, A., & Schnitzer, D. (2013). Can Shared Nearest Neighbors 
           Reduce Hubness in High-Dimensional Spaces? 2013 IEEE 13th 
           International Conference on Data Mining Workshops, 460–467. 
           http://doi.org/10.1109/ICDMW.2013.101
    """
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    # if/elif/else instead of two independent ifs: an unexpected metric
    # value would otherwise leave self_value/sort_order/exclude undefined.
    if metric == 'distance':
        self_value = 0.
        sort_order = 1
        exclude = np.inf
    elif metric == 'similarity':
        self_value = 1.
        sort_order = -1
        exclude = -np.inf
    else:  # defensive; _check_valid_metric_parameter should have raised already
        raise ValueError("Parameter 'metric' must be 'distance' or 'similarity'.")

    # Set the diagonal so self-comparisons sort last and are never
    # selected as nearest neighbors.
    distance = D.copy()
    np.fill_diagonal(distance, exclude)
    n = distance.shape[0]
    knn = np.zeros_like(distance, bool)

    # Boolean indicator matrix: knn[i, j] is True iff j is among the
    # k nearest neighbors of i.
    for i in range(n):
        di = distance[i, :]
        nn = np.argsort(di)[::sort_order]
        knn[i, nn[0:k]] = True

    D_snn = np.zeros_like(distance)
    for i in range(n):
        knn_i = knn[i, :]
        j_idx = slice(i+1, n)

        # Neighbor-set overlap for all j > i at once (broadcasting).
        Dij = np.sum(np.logical_and(knn_i, knn[j_idx, :]), 1)
        if metric == 'distance':
            D_snn[i, j_idx] = 1. - Dij / k
        else: # metric == 'similarity':
            D_snn[i, j_idx] = Dij / k

    # Mirror the upper triangle to make the matrix symmetric.
    D_snn += D_snn.T
    np.fill_diagonal(D_snn, self_value)
    return D_snn
예제 #21
0
def mutual_proximity_empiric(
    D: np.ndarray, metric: str = "distance", test_set_ind: np.ndarray = None, verbose: int = 0
):
    """Transform a distance matrix with Mutual Proximity (empiric distribution).
    
    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using 
    the empiric data distribution (EXACT, rather SLOW). The resulting 
    secondary distance/similarity matrix should show lower hubness.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.
          
        NOTE: In case of sparse ``D``, zeros are interpreted as missing values 
        and ignored during calculations. Thus, results may differ 
        from using a dense version.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
        NOTE: In case of sparse `D`, only 'similarity' is supported.
        
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
        
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
        
    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """
    # Initialization
    n = D.shape[0]
    log = Logging.ConsoleLogging()

    # Check input
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    # exclude_value is chosen so a self-comparison can never satisfy the
    # counting condition below (inf for <=, -inf for >).
    if metric == "similarity":
        self_value = 1
        exclude_value = np.inf
    else:  # metric == 'distance':
        self_value = 0
        exclude_value = -np.inf
        if issparse(D):
            raise ValueError("MP sparse only supports similarity matrices.")
    if test_set_ind is None:
        pass  # TODO implement
        # train_set_ind = slice(0, n)
    elif not np.all(~test_set_ind):
        raise NotImplementedError("MP empiric does not yet support train/" "test splits.")
        # train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)

    # Start MP: work on a copy, so the caller's matrix is not modified
    D = D.copy()

    if issparse(D):
        return _mutual_proximity_empiric_sparse(D, test_set_ind, verbose, log)

    # ensure correct self distances (NOT done for sparse matrices!)
    np.fill_diagonal(D, exclude_value)

    D_mp = np.zeros_like(D)

    # Calculate MP empiric
    for i in range(n - 1):
        if verbose and ((i + 1) % 1000 == 0 or i == n - 2):
            log.message("MP_empiric: {} of {}.".format(i + 1, n - 1), flush=True)
        # Calculate only triu part of matrix (mirrored below)
        j_idx = i + 1

        # dI: row i broadcast against all rows j > i (dJ);
        # d: the pairwise value D[j, i] each comparison is measured against
        dI = D[i, :][np.newaxis, :]
        dJ = D[j_idx:n, :]
        d = D[j_idx:n, i][:, np.newaxis]

        if metric == "similarity":
            D_mp[i, j_idx:] = np.sum((dI <= d) & (dJ <= d), 1) / (n - 1)
        else:  # metric == 'distance':
            D_mp[i, j_idx:] = 1 - (np.sum((dI > d) & (dJ > d), 1) / (n - 1))

    # Mirror, so that matrix is symmetric
    D_mp += D_mp.T
    np.fill_diagonal(D_mp, self_value)

    return D_mp
예제 #22
0
파일: popcom.py 프로젝트: wyjhxq/PopCorn
def shared_nearest_neighbors(D: np.ndarray, k: int = 10, metric='similarity'):
    """Transform distance matrix using shared nearest neighbors [1]_.

    SNN similarity is based on computing the overlap between the `k` nearest
    neighbors of two objects. SNN approaches try to symmetrize nearest neighbor
    relations using only rank and not distance information [2]_.

    Parameters
    ----------
    D : np.ndarray
        The ``n x n`` symmetric distance (similarity) matrix.

    k : int, optional (default: 10)
        Neighborhood radius: The `k` nearest neighbors are used to calculate SNN.

    metric : {'distance', 'similarity'}, optional (default: 'similarity')
        Define, whether the matrix `D` is a distance or similarity matrix

    Returns
    -------
    D_snn : ndarray
        Secondary distance SNN matrix

    References
    ----------
    .. [1] R. Jarvis and E. A. Patrick, “Clustering using a similarity measure
           based on shared near neighbors,” IEEE Transactions on Computers,
           vol. 22, pp. 1025–1034, 1973.

    .. [2] Flexer, A., & Schnitzer, D. (2013). Can Shared Nearest Neighbors
           Reduce Hubness in High-Dimensional Spaces? 2013 IEEE 13th
           International Conference on Data Mining Workshops, 460–467.
           http://doi.org/10.1109/ICDMW.2013.101
    """
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    # if/elif/else instead of two independent ifs: an unexpected metric
    # value would otherwise leave self_value/sort_order/exclude undefined.
    if metric == 'distance':
        self_value = 0.
        sort_order = 1
        exclude = np.inf
    elif metric == 'similarity':
        self_value = 1.
        sort_order = -1
        exclude = -np.inf
    else:  # defensive; _check_valid_metric_parameter should have raised already
        raise ValueError("Parameter 'metric' must be 'distance' or 'similarity'.")

    # Set the diagonal so self-comparisons sort last and are never
    # selected as nearest neighbors.
    distance = D.copy()
    np.fill_diagonal(distance, exclude)
    n = distance.shape[0]
    knn = np.zeros_like(distance, bool)

    # Boolean indicator matrix: knn[i, j] is True iff j is among the
    # k nearest neighbors of i.
    for i in range(n):
        di = distance[i, :]
        nn = np.argsort(di)[::sort_order]
        knn[i, nn[0:k]] = True

    D_snn = np.zeros_like(distance)
    for i in range(n):
        knn_i = knn[i, :]
        j_idx = slice(i + 1, n)

        # Neighbor-set overlap for all j > i at once (broadcasting).
        Dij = np.sum(np.logical_and(knn_i, knn[j_idx, :]), 1)
        if metric == 'distance':
            D_snn[i, j_idx] = 1. - Dij / k
        else:  # metric == 'similarity':
            D_snn[i, j_idx] = Dij / k

    # Mirror the upper triangle to make the matrix symmetric.
    D_snn += D_snn.T
    np.fill_diagonal(D_snn, self_value)
    return D_snn
def mutual_proximity_gaussi(D:np.ndarray, metric:str='distance', 
                            sample_size:int=0, test_set_ind:np.ndarray=None, 
                            verbose:int=0, n_jobs:int=-1, mv=None):
    """Transform a distance matrix with Mutual Proximity (indep. normal distr.).
    
    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix. Gaussi 
    variant assumes independent normal distributions (FAST).
    The resulting second. distance/similarity matrix should show lower hubness.
    
    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.
    
    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.
        
        NOTE: In case of sparse `D`, only 'similarity' is supported.
    
    sample_size : int, optional (default: 0)
        Define sample size from which Gauss parameters are estimated.
        Use all data when set to ``0``.
    
    test_set_ind : ndarray, optional (default: None)
        Define data points to be hold out as part of a test set. Can be:
        
        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set. 
    
    verbose : int, optional (default: 0)
        Increasing level of output (progress report).
    
    n_jobs : int, optional (default: -1)
        Number of parallel processes to be used.
        
        NOTE: set ``n_jobs=-1`` to use all CPUs
    
    mv : optional (default: None)
        Passed through to the sparse implementation only
        (presumably a missing-value marker — TODO confirm against
        ``_mutual_proximity_gaussi_sparse``).
    
    Returns
    -------
    D_mp : ndarray
        Secondary distance MP gaussi matrix.
    
    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). 
           Local and global scaling reduce hubs in space. The Journal of Machine 
           Learning Research, 13(1), 2871–2902.
    """    
    # Initialization   
    n = D.shape[0]  # number of samples; used for the train/test split below
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    # DO NOT DELETE comment
    #===========================================================================
    # # Checking input
    # if D.shape[0] != D.shape[1]:
    #     raise TypeError("Distance/similarity matrix is not quadratic.")        
    # if metric == 'similarity':
    #     self_value = 1
    # elif metric == 'distance':
    #     self_value = 0
    # else:
    #     raise ValueError("Parameter metric must be 'distance' or 'similarity'.") 
    #===========================================================================
    if test_set_ind is None:
        train_set_ind = slice(0, n)
    else:
        train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
     
    #===========================================================================
    # # Start MP Gaussi    
    # if verbose:
    #     log.message('Mutual Proximity Gaussi rescaling started.', flush=True)
    # D = D.copy()
    #===========================================================================

    # Dispatch: sparse input goes to the parallel sparse implementation;
    # dense input falls back to the single-process dense implementation.
    if issparse(D):
        return _mutual_proximity_gaussi_sparse(D, sample_size, train_set_ind, 
                                               verbose, log, mv, n_jobs)
    else:
        log.warning("MP gaussi does not support parallel execution for dense "
                    "matrices at the moment. Continuing with 1 process.")
        from hub_toolbox.MutualProximity import mutual_proximity_gaussi
        return mutual_proximity_gaussi(D, metric, sample_size, test_set_ind, verbose)