예제 #1
0
 def test_intrinsic_dim_mle_levina_low_memory(self):
     """Levina MLE estimate on dexter, invoking the speed-memory trade-off.

     Calls ``intrinsic_dimension`` with ``mem_threshold=0`` and expects
     the reference value 74.472, matched to 3 decimals.
     """
     expected = 74.472
     vectors = load_dexter()[2]
     estimate = intrinsic_dimension(
         vectors,
         k1=6,
         k2=12,
         estimator='levina',
         metric='vector',
         trafo=None,
         mem_threshold=0,
     )
     np.testing.assert_almost_equal(estimate, expected, decimal=3)
예제 #2
0
 def test_intrinsic_dim_mle_levina(self):
     """Levina MLE estimate on dexter must match the matlab reference.

     The expected value (74.472) was calculated by the matlab reference
     implementation; agreement is required to 3 decimals.
     """
     expected = 74.472
     vectors = load_dexter()[2]
     estimate = intrinsic_dimension(
         vectors,
         k1=6,
         k2=12,
         estimator='levina',
         metric='vector',
         trafo=None,
     )
     np.testing.assert_almost_equal(estimate, expected, decimal=3)
예제 #3
0
 def test_load_dexter(self):
     """Load dexter and check the shapes of distances, labels and vectors.

     Verifies that the distance matrix is square, that it has one row per
     vector, and that there is one label per vector.  The loaded arrays
     are stored on the test instance as ``dist``, ``lab`` and ``vect``.
     """
     self.dist, self.lab, self.vect = load_dexter()
     n_vectors = self.vect.shape[0]
     # Each property is asserted on its own.  Chaining the three boolean
     # results with ``==`` would also evaluate to True when *all* checks
     # fail (False == False == False), letting a broken loader pass.
     self.assertEqual(self.dist.shape[0], self.dist.shape[1],
                      "distance matrix is not square")
     self.assertEqual(self.dist.shape[0], n_vectors,
                      "distance matrix and vectors disagree in size")
     self.assertEqual(self.lab.shape[0], n_vectors,
                      "labels and vectors disagree in size")
    def __init__(self,
                 D: np.ndarray = None,
                 classes: np.ndarray = None,
                 vectors: np.ndarray = None,
                 metric: str = 'distance'):
        """Set up a quick hubness analysis.

        Parameters
        ----------
        D : ndarray, optional (default: None)
            The n x n symmetric distance (similarity) matrix.
            If None, the example dataset (dexter) is loaded instead; in
            that case `classes`, `vectors` and `metric` are not used and
            the metric is set to 'distance'.

        classes : ndarray, optional (default: None)
            The 1 x n class labels. Required for k-NN, GK.

        vectors : ndarray, optional (default: None)
            The m x n vector data. Required for IntrDim estimation.

        metric : {'distance', 'similarity'}
            Define whether `D` is a distance or similarity matrix.
        """

        def _float_copy(data):
            # Independent float64 copy (guards against int16 etc. input);
            # a missing array stays None.
            if data is None:
                return None
            return np.copy(data).astype(np.float64)

        self.has_class_data = self.has_vector_data = False
        if D is not None:
            self.D = _float_copy(D)
            self.classes = _float_copy(classes)
            self.has_class_data = self.classes is not None
            self.vectors = _float_copy(vectors)
            self.has_vector_data = self.vectors is not None
            self.metric = metric
        else:
            notice = (
                '\n'
                'NO PARAMETERS GIVEN! Loading & evaluating DEXTER data set.'
                '\n'
                'DEXTER is a text classification problem in a bag-of-word \n'
                'representation. This is a two-class classification problem\n'
                'with sparse continuous input variables. \n'
                'This dataset is one of five datasets of the NIPS 2003\n'
                'feature selection challenge.\n'
                'http://archive.ics.uci.edu/ml/datasets/Dexter\n')
            print(notice)
            self.D, self.classes, self.vectors = io.load_dexter()
            self.has_class_data = True
            self.has_vector_data = True
            self.metric = 'distance'
        # Number of objects and the (initially empty) list of experiments.
        self.n = len(self.D)
        self.experiments = []
예제 #5
0
 def setUp(self):
     """Shuffle dexter and split it into train and test parts.

     The vectors and labels are permuted with one shared random
     permutation; the first 9/10 (rounded down) become the training
     split and the remainder the test split.
     """
     _, labels, vectors = load_dexter()
     order = np.random.permutation(labels.size)
     self.X = vectors[order, :]
     self.y = labels[order]
     n_total = len(labels)
     n_train = int(n_total / 10 * 9)
     self.X_train = self.X[:n_train]
     self.X_test = self.X[n_train:n_total]
     self.y_train = self.y[:n_train]
     self.y_test = self.y[n_train:n_total]
예제 #6
0
 def setUp(self):
     """Load dexter and remember the number of rows of its distances."""
     distance, label, vector = load_dexter()
     self.distance = distance
     self.label = label
     self.vector = vector
     self.n = distance.shape[0]
예제 #7
0
 def setUp(self):
     """Load dexter and keep distances, targets and vectors on the test."""
     distance, target, vectors = load_dexter()
     self.distance = distance
     self.target = target
     self.vectors = vectors
예제 #8
0
        # Gini index
        if k_occurrence.shape[0] > 10_000:
            limiting = 'space'
        else:
            limiting = 'time'
        self.gini_index_ = self._gini_index(k_occurrence, limiting)
        # Robin Hood index
        self.hood_index_ = self._hood_index(k_occurrence)
        # Atkinson index
        self.atkinson_index_ = self._atkinson_index(k_occurrence)
        # anti-hub occurrence
        self.antihubs_, self.antihub_occurrence_ = \
            self._antihub_occurrence(k_occurrence)
        # hub occurrence
        self.hubs_, self.hub_occurrence_ = \
            self._hub_occurrence(k=self.k, k_occurrence=k_occurrence,
                                 n_test=n_test, hub_size=self.hub_size)
        # Largest hub
        # TODO That should probably also be divided by k...
        self.groupie_ratio_ = k_occurrence.max() / n_test
        return self


if __name__ == '__main__':
    # Smoke test on the dexter example data: compute hubness once from the
    # distance matrix and once from the vectors (cosine metric), then print
    # the first value returned by each call.
    from hub_toolbox.io import load_dexter
    distances, _labels, vectors = load_dexter()
    hub_from_dist, _, _ = hubness(distances)
    hub_from_vect, _, _ = hubness_from_vectors(vectors, metric='cosine')
    print("Hubness =", hub_from_dist, hub_from_vect)
    else:
        self_value = 1

    if test_ind is None:
        # Ensure correct self distances and return sec. dist. matrix
        np.fill_diagonal(D_shi, self_value)
        return D_shi
    else:
        # only return test-train-distances (there are no self distances here)
        return D_shi[test_ind]


if __name__ == '__main__':
    # Demo on the dexter example data: build secondary distances with
    # simhub and shared nearest neighbors, then report their hubness.
    from hub_toolbox.hubness import hubness
    from hub_toolbox.knn_classification import score
    dist, labels, vectors = io.load_dexter()
    for tag, arr in (("D", dist), ("y", labels), ("X", vectors)):
        print(tag, arr.shape)

    # simhub without labels (y=None) and shared nearest neighbors (k=50).
    dist_shi = simhub(dist, y=None)
    dist_snn = shared_nearest_neighbors(dist, k=50)
    hub_shi = hubness(dist_shi, k=5)
    hub_snn = hubness(dist_snn, k=5)
    acc_shi = score(dist_shi, labels, 5)
    acc_snn = score(dist_snn, labels, 5)

    # simhub with labels.
    # NOTE: the accuracies and the label-informed simhub results are
    # computed but never printed.
    dist_sh = simhub(D=dist, y=labels)
    hub_sh = hubness(dist_sh, k=5)
    acc_sh = score(dist_sh, labels, 5)
    print("hubness SNN:", hub_snn[0])
    print("hubness SHI:", hub_shi[0])