def test_dis_sim_local(self):
    """Test whether hubness and k-NN accuracy improve for dexter.

    Compares the original distances in ``self.distance`` with the
    secondary distances returned by ``dis_sim_local(self.vectors, k=50)``.
    The test passes only if both criteria hold:

    * the hubness ratio ``h_orig / h_dsl`` exceeds 10, and
    * the accuracy gain ``acc_dsl - acc_orig`` exceeds 0.03.
    """
    # Baseline: first return value of hubness() and the [0, 0] entry of
    # the first return value of score() on the original distances.
    h_orig = hubness(self.distance)[0]
    acc_orig = score(self.distance, self.target)[0][0, 0]

    # Same two measurements on the rescaled distances.
    dist_dsl = dis_sim_local(self.vectors, k=50)
    h_dsl = hubness(dist_dsl)[0]
    acc_dsl = score(dist_dsl, self.target)[0][0, 0]

    # Assert each criterion separately so a failure reports the actual
    # value instead of an opaque "False is not true".
    self.assertGreater(h_orig / h_dsl, 10)
    self.assertGreater(acc_dsl - acc_orig, 0.03)
def test_dis_sim_global(self):
    """Test whether hubness and k-NN accuracy improve for dexter.

    Compares the original distances in ``self.distance`` with the
    secondary distances returned by ``dis_sim_global(self.vectors)``.
    The test passes only if both criteria hold:

    * the hubness ratio ``h_orig / h_dsg`` exceeds 2, and
    * the accuracy gain ``acc_dsg - acc_orig`` exceeds 0.07.
    """
    # Baseline: first return value of hubness() and the [0, 0] entry of
    # the first return value of score() on the original distances.
    h_orig = hubness(self.distance)[0]
    acc_orig = score(self.distance, self.target)[0][0, 0]

    # Same two measurements on the rescaled distances.
    dist_dsg = dis_sim_global(self.vectors)
    h_dsg = hubness(dist_dsg)[0]
    acc_dsg = score(dist_dsg, self.target)[0][0, 0]

    # Assert each criterion separately so a failure reports the actual
    # value instead of an opaque "False is not true".
    self.assertGreater(h_orig / h_dsg, 2)
    self.assertGreater(acc_dsg - acc_orig, 0.07)
def test_localized_centering(self):
    """Test whether hubness and k-NN accuracy improve for dexter.

    Compares the original distances in ``self.distance`` with the
    similarities returned by
    ``localized_centering(self.vectors, "cosine", 20, 1)``. Because the
    result is a similarity matrix, ``hubness`` and ``score`` are called
    with ``metric="similarity"`` for it. The test passes only if both
    criteria hold:

    * the hubness ratio ``h_orig / h_lcent`` exceeds 1.5, and
    * the accuracy gain ``acc_lcent - acc_orig`` exceeds 0.03.
    """
    # Baseline measurements on the original distances.
    h_orig = hubness(self.distance)[0]
    acc_orig = score(self.distance, self.target)[0][0, 0]

    # Measurements on the centered similarities.
    sim_lcent = localized_centering(self.vectors, "cosine", 20, 1)
    h_lcent = hubness(sim_lcent, metric="similarity")[0]
    acc_lcent = score(sim_lcent, self.target, metric="similarity")[0][0, 0]

    # Assert each criterion separately so a failure reports the actual
    # value instead of an opaque "False is not true".
    self.assertGreater(h_orig / h_lcent, 1.5)
    self.assertGreater(acc_lcent - acc_orig, 0.03)
def test_ls_dist_equals_sim(self):
    """Test for equal RANKS using dist. vs. sim. (LS_dist != 1-LS_sim).

    Hubness and k-NN accuracy are used as a proxy: local scaling is
    applied once to ``self.dist`` as distances and once to
    ``1 - self.dist`` as similarities, and both resulting matrices must
    yield (numerically close) identical hubness and accuracy.
    """
    self.setUpMod('rnd')

    ls_dist = local_scaling(self.dist, metric='distance')
    ls_sim = local_scaling(1 - self.dist, metric='similarity')

    h_dist, _, _ = hubness(ls_dist, metric='distance')
    h_sim, _, _ = hubness(ls_sim, metric='similarity')
    acc_dist, _, _ = score(ls_dist, self.label, metric='distance')
    acc_sim, _, _ = score(ls_sim, self.label, metric='similarity')

    # Check the two proxies separately (same np.allclose tolerances as
    # before) so a failure tells which one diverged and by how much.
    self.assertTrue(
        np.allclose(h_dist, h_sim),
        msg="hubness differs: {} (distance) vs {} (similarity)".format(
            h_dist, h_sim))
    self.assertTrue(
        np.allclose(acc_dist, acc_sim),
        msg="k-NN accuracy differs: {} (distance) vs {} (similarity)".format(
            acc_dist, acc_sim))
def _calc_hubness(self, k: int = 5):
    """Compute hubness statistics for neighborhood size `k` and store them.

    Calls ``hubness`` on ``self.secondary_distance`` using ``self.metric``
    and records three values under key `k`:

    * ``self.hubness[k]`` -- hubness (skewness of `k`-occurrence),
    * ``self.anti_hubs[k]`` -- percentage of anti hubs, i.e. entries
      whose `k`-occurrence equals 0, relative to ``self.n``,
    * ``self.max_hub_k_occurence[k]`` -- largest `k`-occurrence as a
      percentage of ``self.n``.

    Returns
    -------
    self
        The instance itself, allowing call chaining.
    """
    skewness, _, k_occurrence = hubness(
        D=self.secondary_distance, metric=self.metric, k=k)
    self.hubness[k] = skewness

    # Anti hubs never show up in any k-NN list.
    anti_hub_count = (k_occurrence == 0).sum()
    self.anti_hubs[k] = 100 * anti_hub_count / self.n

    # The largest hub is the entry with the highest k-occurrence.
    largest_k_occurrence = k_occurrence.max()
    self.max_hub_k_occurence[k] = 100 * largest_k_occurrence / self.n
    return self
# Demo script fragment: compares hubness (and, for dexter, k-NN accuracy)
# before and after empiric mutual proximity, once on a distance matrix D
# and once on the matching similarity matrix S.
# NOTE(review): this fragment may continue beyond what is visible here.

# Selects the data set: 'random' or 'dexter'.
do = 'dexter'
if do == 'random':
    print("RANDOM DATA:")
    print("------------")
    # Keep only the part above the main diagonal (offset 1) of a random
    # 1000 x 1000 matrix. Presumably `rand` is scipy.sparse.rand with
    # density 0.05, CSR format, float32 and seed 43 -- TODO confirm
    # against the file's imports.
    S = triu(rand(1000, 1000, 0.05, 'csr', np.float32, 43), 1)
    # Add the transpose to obtain a symmetric similarity matrix.
    S += S.T
    # Dense distances derived as 1 - similarity.
    D = 1. - S.toarray()
elif do == 'dexter':
    print("DEXTER:")
    print("-------")
    # presumably (distances, class labels, vectors) -- verify against
    # load_dexter; `v` is not used in the visible code.
    D, c, v = load_dexter()
    acc_d, _, _ = score(D, c, [5], 'distance')
    # Similarities derived as 1 - distance, stored as a CSR matrix.
    S = csr_matrix(1 - D)
    acc_s, _, _ = score(S, c, [5], 'similarity')

# Hubness of the original data, distance and similarity variant.
Sn_d, _, _ = hubness(D, 5, 'distance')
Sn_s, _, _ = hubness(S, 5, 'similarity')
print("Orig. dist. hubness:", Sn_d)
print("Orig. sim. hubness:", Sn_s)
# Accuracy is only available for dexter: the random branch defines no
# labels `c`, so acc_d / acc_s do not exist there.
if do == 'dexter':
    print("Orig. dist. k-NN accuracy:", acc_d)
    print('Orig. sim. k-NN accuracy:', acc_s)

# Empiric mutual proximity on distances and on similarities.
D_mp_emp_d = mutual_proximity_empiric(D)
D_mp_emp_s = mutual_proximity_empiric(S, 'similarity')
# No metric argument for the distance variant; presumably hubness()
# defaults to distances -- TODO confirm.
Sn_mp_emp_d, _, _ = hubness(D_mp_emp_d, 5)
Sn_mp_emp_s, _, _ = hubness(D_mp_emp_s, 5, 'similarity')
print("MP emp dist. hubness:", Sn_mp_emp_d)
print("MP emp sim. hubness:", Sn_mp_emp_s)
if do == 'dexter':
    acc_mp_emp_d, _, _ = score(D_mp_emp_d, c, [5], 'distance')