def test_intrinsic_dim_mle_levina_low_memory(self):
    """Check the Levina MLE estimate on DEXTER with ``mem_threshold=0``.

    Uses the same reference value (74.472) as the regular Levina test,
    but passes ``mem_threshold=0`` to invoke the estimator's
    speed-memory trade-off.
    """
    _dist, _labels, vectors = load_dexter()
    expected_id = 74.472
    estimated_id = intrinsic_dimension(
        vectors, 6, 12, 'levina', 'vector', None, mem_threshold=0)
    # Agreement is required up to three decimal places.
    return np.testing.assert_almost_equal(
        estimated_id, expected_id, decimal=3)
def test_intrinsic_dim_mle_levina(self):
    """Compare the Levina MLE estimate on DEXTER with the reference value.

    The expected value (74.472) was calculated by the MATLAB reference
    implementation.
    """
    _dist, _labels, vectors = load_dexter()
    expected_id = 74.472
    estimated_id = intrinsic_dimension(
        vectors, k1=6, k2=12, estimator='levina',
        metric='vector', trafo=None)
    # Agreement is required up to three decimal places.
    return np.testing.assert_almost_equal(
        estimated_id, expected_id, decimal=3)
def test_load_dexter(self):
    """Load DEXTER and check the shapes of distances, labels and vectors.

    Verifies that the distance matrix is square and that the distance
    matrix, the label array and the vector array all have the same
    number of rows.

    Each condition is asserted on its own. Chaining the three boolean
    results with ``==`` (``a == b == c``) would also evaluate to True
    when all three checks are False, so a completely wrong data set
    would pass the test.
    """
    self.dist, self.lab, self.vect = load_dexter()
    n_rows, n_cols = self.dist.shape[0], self.dist.shape[1]
    self.assertEqual(
        n_rows, n_cols,
        "distance matrix is not square")
    self.assertEqual(
        n_rows, self.vect.shape[0],
        "distance matrix and vectors differ in number of objects")
    self.assertEqual(
        self.lab.shape[0], self.vect.shape[0],
        "labels and vectors differ in number of objects")
def __init__(self, D: np.ndarray = None, classes: np.ndarray = None,
             vectors: np.ndarray = None, metric: str = 'distance'):
    """Initialize a quick hubness analysis.

    Parameters
    ----------
    D : ndarray, optional (default: None)
        The n x n symmetric distance (similarity) matrix.
        Default: load example dataset (dexter).

    classes : ndarray, optional (default: None)
        The 1 x n class labels. Required for k-NN, GK.

    vectors : ndarray, optional (default: None)
        The m x n vector data. Required for IntrDim estimation.

    metric : {'distance', 'similarity'}
        Define whether `D` is a distance or similarity matrix.

    Notes
    -----
    When `D` is None, `classes`, `vectors` and `metric` are ignored:
    the DEXTER example data is loaded and `metric` is set to
    'distance'. Otherwise every provided array is stored as an
    independent float64 copy, so the caller's data is never modified.
    """
    self.has_class_data, self.has_vector_data = False, False
    if D is None:
        print(
            '\n'
            'NO PARAMETERS GIVEN! Loading & evaluating DEXTER data set.'
            '\n'
            'DEXTER is a text classification problem in a bag-of-word \n'
            'representation. This is a two-class classification problem\n'
            'with sparse continuous input variables. \n'
            'This dataset is one of five datasets of the NIPS 2003\n'
            'feature selection challenge.\n'
            'http://archive.ics.uci.edu/ml/datasets/Dexter\n')
        self.D, self.classes, self.vectors = io.load_dexter()
        self.has_class_data, self.has_vector_data = True, True
        self.metric = 'distance'
    else:
        # Copy the data and ensure a correct type (not int16 etc.).
        # np.array(..., dtype=...) copies exactly once, whereas
        # np.copy(...).astype(...) would allocate two full copies.
        self.D = np.array(D, dtype=np.float64)
        if classes is None:
            self.classes = None
        else:
            self.classes = np.array(classes, dtype=np.float64)
            self.has_class_data = True
        if vectors is None:
            self.vectors = None
        else:
            self.vectors = np.array(vectors, dtype=np.float64)
            self.has_vector_data = True
        self.metric = metric
    self.n = len(self.D)
    self.experiments = []
def setUp(self):
    """Shuffle DEXTER and split it into train and test partitions.

    The labels and vectors are permuted with one shared random order.
    The first ``int(len(y) / 10 * 9)`` objects form the training
    partition and the remainder the test partition.

    NOTE(review): the permutation is not seeded here, so the split
    differs between runs unless a seed is set elsewhere.
    """
    _dist, labels, vectors = load_dexter()
    order = np.random.permutation(labels.size)
    self.X = vectors[order, :]
    self.y = labels[order]

    n_samples = len(labels)
    split = int(n_samples / 10 * 9)

    self.X_train = self.X[0:split]
    self.X_test = self.X[split:n_samples]
    self.y_train = self.y[0:split]
    self.y_test = self.y[split:n_samples]
def setUp(self):
    """Load DEXTER and store its parts and the number of objects.

    Stores the three values returned by ``load_dexter`` as
    ``self.distance``, ``self.label`` and ``self.vector``, and the
    first dimension of the distance matrix as ``self.n``.
    """
    distance, label, vector = load_dexter()
    self.distance = distance
    self.label = label
    self.vector = vector
    self.n = distance.shape[0]
def setUp(self):
    """Load DEXTER and store its parts on the test case.

    Stores the three values returned by ``load_dexter`` as
    ``self.distance``, ``self.target`` and ``self.vectors``.
    """
    distance, target, vectors = load_dexter()
    self.distance = distance
    self.target = target
    self.vectors = vectors
# Gini index if k_occurrence.shape[0] > 10_000: limiting = 'space' else: limiting = 'time' self.gini_index_ = self._gini_index(k_occurrence, limiting) # Robin Hood index self.hood_index_ = self._hood_index(k_occurrence) # Atkinson index self.atkinson_index_ = self._atkinson_index(k_occurrence) # anti-hub occurrence self.antihubs_, self.antihub_occurrence_ = \ self._antihub_occurrence(k_occurrence) # hub occurrence self.hubs_, self.hub_occurrence_ = \ self._hub_occurrence(k=self.k, k_occurrence=k_occurrence, n_test=n_test, hub_size=self.hub_size) # Largest hub # TODO That should probably also be diveded by k... self.groupie_ratio_ = k_occurrence.max() / n_test return self if __name__ == '__main__': # Simple test case from hub_toolbox.io import load_dexter dexter_distance, l, v = load_dexter() Sn, Dk, Nk = hubness(dexter_distance) Snv, Dkv, Nkv = hubness_from_vectors(v, metric='cosine') print("Hubness =", Sn, Snv)
else: self_value = 1 if test_ind is None: # Ensure correct self distances and return sec. dist. matrix np.fill_diagonal(D_shi, self_value) return D_shi else: # only return test-train-distances (there are no self distances here) return D_shi[test_ind] if __name__ == '__main__': from hub_toolbox.hubness import hubness from hub_toolbox.knn_classification import score D, y, X = io.load_dexter() print("D", D.shape) print("y", y.shape) print("X", X.shape) D_shi = simhub(D, y=None) D_snn = shared_nearest_neighbors(D, k=50) h = hubness(D_shi, k=5) h_snn = hubness(D_snn, k=5) acc = score(D_shi, y, 5) acc_snn = score(D_snn, y, 5) D_sh = simhub(D=D, y=y) h_sh = hubness(D_sh, k=5) acc_sh = score(D_sh, y, 5) print("hubness SNN:", h_snn[0]) print("hubness SHI:", h[0])