def mp_empiric(self, train_set_mask=None, verbose=False, empspex=False, n_jobs=-1):
    """Rescale ``self.D`` with empiric Mutual Proximity (deprecated wrapper).

    .. note:: Deprecated in hub-toolbox 2.3
              Class will be removed in hub-toolbox 3.0.
              Please use static functions instead.

    Parameters
    ----------
    train_set_mask : array-like, optional (default: None)
        Indices of the training points. Every row index of ``self.D`` that
        is not listed here is passed on as test set index. If None, no
        test set indices are passed on.
    verbose : bool or int, optional (default: False)
        Forwarded unchanged to ``mutual_proximity_empiric``.
    empspex : bool, optional (default: False)
        Not used by this method; kept so existing callers do not break.
    n_jobs : int, optional (default: -1)
        Forwarded unchanged to ``mutual_proximity_empiric``.

    Returns
    -------
    The result of ``mutual_proximity_empiric`` for ``self.D``.
    """
    metric = 'similarity' if self.isSimilarityMatrix else 'distance'
    if train_set_mask is None:
        # Previously this name was left unbound on the default call path,
        # which raised UnboundLocalError in the return statement below.
        test_set_ind = None
    else:
        all_ind = np.arange(self.D.shape[0])
        test_set_ind = np.setdiff1d(all_ind, train_set_mask)
    # n_jobs used to be accepted but silently dropped; pass it by keyword.
    return mutual_proximity_empiric(self.D, metric, test_set_ind, verbose,
                                    n_jobs=n_jobs)
def mutual_proximity_empiric(D:np.ndarray, metric:str='distance',
                             test_set_ind:np.ndarray=None, verbose:int=0,
                             n_jobs:int=-1):
    """Transform a distance matrix with Mutual Proximity (empiric distribution).

    Applies Mutual Proximity (MP) [1]_ on a distance/similarity matrix using
    the empiric data distribution (EXACT, rather SLOW). The resulting
    secondary distance/similarity matrix should show lower hubness.

    Sparse input is handed to the parallel sparse implementation; dense
    input is delegated to the single-process implementation in
    ``hub_toolbox.MutualProximity``.

    Parameters
    ----------
    D : ndarray or csr_matrix
        - ndarray: The ``n x n`` symmetric distance or similarity matrix.
        - csr_matrix: The ``n x n`` symmetric similarity matrix.

    metric : {'distance', 'similarity'}, optional (default: 'distance')
        Define, whether matrix `D` is a distance or similarity matrix.

        NOTE: In case of sparse `D`, only 'similarity' is supported.

    test_set_ind : ndarray, optional (default: None)
        Define data points to be held out as part of a test set. Can be:

        - None : Rescale all distances
        - ndarray : Hold out points indexed in this array as test set.

    verbose : int, optional (default: 0)
        Increasing level of output (progress report).

    n_jobs : int, optional (default: -1)
        Number of parallel processes to be used.

        NOTE: set ``n_jobs=-1`` to use all CPUs. Only forwarded for sparse
        `D`; dense matrices are processed without this parameter.

    Returns
    -------
    D_mp : ndarray
        Secondary distance MP empiric matrix.

    Raises
    ------
    ValueError
        If `D` is sparse and `metric` is 'distance'.

    References
    ----------
    .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
           Local and global scaling reduce hubs in space. The Journal of
           Machine Learning Research, 13(1), 2871–2902.
    """
    log = Logging.ConsoleLogging()
    IO._check_distance_matrix_shape(D)
    IO._check_valid_metric_parameter(metric)
    # DO NOT DELETE this comment, will be used upon parallel MP emp dist impl
    #===========================================================================
    # # Initialization
    # n = D.shape[0]
    #
    # # Check input
    # if D.shape[0] != D.shape[1]:
    #     raise TypeError("Distance/similarity matrix is not quadratic.")
    # if metric == 'similarity':
    #     self_value = 1
    # elif metric == 'distance':
    #     self_value = 0
    #     if issparse(D):
    #         raise ValueError("MP sparse only supports similarity matrices.")
    # else:
    #     raise ValueError("Parameter 'metric' must be 'distance' "
    #                      "or 'similarity'.")
    # if test_set_ind is None:
    #     pass # TODO implement
    #     #train_set_ind = slice(0, n)
    # elif not np.all(~test_set_ind):
    #     raise NotImplementedError("MP empiric does not yet support train/"
    #                               "test splits.")
    #     #train_set_ind = np.setdiff1d(np.arange(n), test_set_ind)
    #===========================================================================
    if issparse(D):
        # The sparse routine receives no metric argument, so a sparse
        # distance matrix would be rescaled as if it held similarities.
        # Fail loudly instead of returning a silently wrong result.
        if metric == 'distance':
            raise ValueError("MP sparse only supports similarity matrices.")
        return _mutual_proximity_empiric_sparse(D, test_set_ind, verbose,
                                                log, n_jobs)

    # Dense path: there is nothing to warn about when the caller already
    # asked for a single process.
    if n_jobs != 1:
        log.warning("MP empiric does not support parallel execution for dense "
                    "matrices at the moment. Continuing with 1 process.")
    # Imported under an alias so the local name does not shadow this function.
    from hub_toolbox.MutualProximity import (
        mutual_proximity_empiric as _mp_empiric_dense)
    return _mp_empiric_dense(D, metric, test_set_ind, verbose)
# Demo script fragment: prints hubness (and, when do == 'dexter', k-NN
# accuracy) of the DEXTER data before and after Mutual Proximity rescaling,
# once for the distance matrix D and once for the sparse similarity matrix S.
# NOTE(review): the original line breaks and indentation of this fragment
# were lost. The extent of each `if do == 'dexter':` body below is inferred
# (only the k-NN accuracy statements) -- confirm against the original file.
# NOTE(review): `do`, `load_dexter`, `score`, `hubness`, `csr_matrix` and
# `mutual_proximity_gaussi` are not defined in the visible portion.
print("DEXTER:")
print("-------")
# Only D and c are used in the visible portion; v is not referenced again.
D, c, v = load_dexter()
# The first value returned by score() is what gets reported as k-NN accuracy.
# presumably [5] is the list of k values to evaluate -- TODO confirm
acc_d, _, _ = score(D, c, [5], 'distance')
# Build a sparse similarity matrix from the distances as 1 - D.
S = csr_matrix(1 - D)
acc_s, _, _ = score(S, c, [5], 'similarity')
# The first value returned by hubness() is what gets reported as hubness.
Sn_d, _, _ = hubness(D, 5, 'distance')
Sn_s, _, _ = hubness(S, 5, 'similarity')
print("Orig. dist. hubness:", Sn_d)
print("Orig. sim. hubness:", Sn_s)
if do == 'dexter':
    print("Orig. dist. k-NN accuracy:", acc_d)
    print('Orig. sim. k-NN accuracy:', acc_s)

# Empiric MP on the dense distance matrix and on the sparse similarity matrix.
D_mp_emp_d = mutual_proximity_empiric(D)
D_mp_emp_s = mutual_proximity_empiric(S, 'similarity')
# No metric argument is passed for the distance variant here.
Sn_mp_emp_d, _, _ = hubness(D_mp_emp_d, 5)
Sn_mp_emp_s, _, _ = hubness(D_mp_emp_s, 5, 'similarity')
print("MP emp dist. hubness:", Sn_mp_emp_d)
print("MP emp sim. hubness:", Sn_mp_emp_s)
if do == 'dexter':
    acc_mp_emp_d, _, _ = score(D_mp_emp_d, c, [5], 'distance')
    acc_mp_emp_s, _, _ = score(D_mp_emp_s, c, [5], 'similarity')
    print("MP emp dist. k-NN accuracy:", acc_mp_emp_d)
    print("MP emp sim. k-NN accuracy:", acc_mp_emp_s)

# Same comparison with mutual_proximity_gaussi; the results are not printed
# within the visible portion of the script.
D_mp_gaussi_d = mutual_proximity_gaussi(D)
D_mp_gaussi_s = mutual_proximity_gaussi(S, 'similarity')
Sn_mp_gaussi_d, _, _ = hubness(D_mp_gaussi_d, 5)
Sn_mp_gaussi_s, _, _ = hubness(D_mp_gaussi_s, 5, 'similarity')