def _calc_secondary_distance(self):
    """Calculate secondary distances (e.g. Mutual Proximity).

    The function registered in ``SEC_DIST`` under
    ``self.secondary_distance_type`` is first called with the original
    distance matrix (``D=``) and ``metric=``. If that call raises
    ``TypeError``, the function is called with the vector data (``X=``)
    instead, and its result is post-processed depending on the type:

    * ``'cent'``, ``'wcent'``: passed through ``cosine_distance``
    * ``'lcent'``: subtracted from ``1.``
    * ``'dsg'``, ``'dsl'``: used unchanged

    The outcome is stored in ``self.secondary_distance``.

    Returns
    -------
    self
        The instance itself.

    Raises
    ------
    ValueError
        If the distance-based call raised ``TypeError`` and the type is
        none of the vector-based types listed above.
    """
    sd_type = self.secondary_distance_type
    transform = SEC_DIST[sd_type]
    try:
        self.secondary_distance = transform(
            D=self.original_distance, metric=self.metric)
    except TypeError:
        # Centering has no keyword 'D='; such functions are fed the
        # vectors via 'X=' instead.
        if sd_type not in ('cent', 'wcent', 'lcent', 'dsg', 'dsl'):
            raise ValueError("Erroneous secondary distance type: {}".
                             format(sd_type))
        from_vectors = transform(X=self.vectors)
        if sd_type in ('cent', 'wcent'):
            from_vectors = cosine_distance(from_vectors)
        elif sd_type == 'lcent':
            from_vectors = 1. - from_vectors
        self.secondary_distance = from_vectors
    return self
def load_dexter():
    """Load the example data set (dexter).

    The label and data files are read from the ``example_datasets``
    directory located next to this module.

    Returns
    -------
    D : ndarray
        Distance matrix, computed with ``cosine_distance`` from `vectors`
    classes : ndarray
        Class label vector
    vectors : ndarray
        Vector data matrix of shape (300, 20000)
    """
    n = 300
    dim = 20000

    # Resolve the data directory once and build both paths from it.
    dataset_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'example_datasets')

    # Read class labels
    classes = np.loadtxt(os.path.join(dataset_dir, 'dexter_train.labels'))

    # Read data
    vectors = np.zeros((n, dim))
    data_file = os.path.join(dataset_dir, 'dexter_train.data')
    with open(data_file, mode='r') as fid:
        # Stream the file line by line; each line fills one row.
        for row, line in enumerate(fid):
            # A line consists of whitespace-separated pairs of dim:val.
            for word in line.split():
                col, val = word.split(':')
                # The file's dim index is shifted by one to get the column.
                vectors[row, int(col) - 1] = int(val)

    # Calc distance
    D = cosine_distance(vectors)
    return D, classes, vectors
def test_cosine_dist_equal_to_scipy_pdist_cos(self):
    """Check that ``cosine_distance`` agrees with SciPy's cosine ``pdist``.

    Both distance matrices are computed from ``self.vectors`` and compared
    with ``np.allclose``.
    """
    actual = cosine_distance(self.vectors)
    # pdist yields the condensed form; squareform expands it to a matrix.
    expected = squareform(pdist(self.vectors, 'cosine'))
    return self.assertTrue(np.allclose(actual, expected))