def cluster_score(W, W_true, cost_dist='euclidean'):
    """Align ``W`` to ``W_true`` and compare the clusters they induce.

    The columns of ``W`` are reordered using the alignment returned by
    ``get_alignment``.  Every row of the aligned and of the true matrix is
    then labelled with the index of its largest entry (``argmax`` along
    axis 1) and the two label vectors are compared with several distances.

    Parameters
    ----------
    W : np.ndarray
        Estimated loading matrix.
    W_true : np.ndarray
        True loading matrix.
    cost_dist : str, optional
        Passed through unchanged to ``get_alignment``
        (default ``'euclidean'``).

    Returns
    -------
    distances : dict
        The ``'jaccard'``, ``'hamming'`` and ``'kulsinski'`` distances
        between the two label vectors.
    score : float
        The score returned by ``get_alignment`` (the optimal assignment
        cost).
    W_aligned : np.ndarray
        A copy of ``W`` with its columns rearranged by ``alignment[1]``.
    alignment : tuple of (np.ndarray, np.ndarray)
        The row and column indices of the alignment, as returned by
        ``get_alignment``.
    """
    alignment, score = get_alignment(W, W_true, cost_dist)

    # alignment[1] holds the column order that maps W onto W_true.
    column_order = alignment[1]
    W_aligned = W.copy()[:, column_order]

    # Hard cluster assignment: the strongest loading in every row.
    labels_estimated = W_aligned.argmax(1)
    labels_true = W_true.argmax(1)

    distances = {
        'jaccard': spd.jaccard(labels_estimated, labels_true),
        'hamming': spd.hamming(labels_estimated, labels_true),
        'kulsinski': spd.kulsinski(labels_estimated, labels_true),
    }
    return distances, score, W_aligned, alignment
def get_nearest_neighbor(self, x_test, k, sample_class):
    """Return the indices of the ``k`` samples closest to ``x_test``.

    The distance function is chosen by ``self.distance_calculator``.
    Samples that are element-wise identical to ``x_test`` are skipped.

    Args:
        x_test: The query vector.
        k: Number of neighbours wanted.  When fewer candidates are
            available, all of them are returned instead of failing.
        sample_class: Indexable collection of candidate vectors; entry
            ``sample_class[i]`` is compared with ``x_test``.

    Returns:
        list: Indices into ``sample_class``, ordered by increasing distance
        (ties are broken by the smaller index).

    Raises:
        ValueError: If ``self.distance_calculator`` is not a supported name.
    """
    # Configured name -> function name inside ``dis``.  The keys (including
    # their historical spellings) are what callers already pass, so they are
    # kept as they are.  Only the selected function is looked up, so a metric
    # missing from the installed library cannot affect the other choices.
    metric_names = {
        'jaccard': 'jaccard',
        'dice': 'dice',
        'correlation': 'correlation',
        'yule': 'yule',
        'russelo-rao': 'russellrao',
        'sokal-michener': 'sokalmichener',
        'rogers-tanimoto': 'rogerstanimoto',
        'kulzinsky': 'kulsinski',
    }
    try:
        metric_name = metric_names[self.distance_calculator]
    except (KeyError, TypeError):
        # Previously an unknown name surfaced as an UnboundLocalError from
        # inside the loop; report the real problem instead.
        raise ValueError(
            'Unsupported distance calculator: {!r}'.format(
                self.distance_calculator))
    metric = getattr(dis, metric_name)

    distances = []
    for i in range(len(sample_class)):
        sample = sample_class[i][:]
        # Do not count the query itself as one of its own neighbours.
        if (sample != x_test).any():
            distances.append([metric(x_test, sample), i])

    # Sorting [distance, index] pairs orders by distance, then by index.
    distances.sort()
    return [index for _, index in distances[:max(k, 0)]]
def test_kulsinski(self):
    """Compare the pairwise ``kulsinski`` result with SciPy, pair by pair.

    The reference matrix is filled with ``spd.kulsinski`` for every pair of
    rows of ``BOOL_MATRIX.T``.  The value under test is
    ``1 - kulsinski(self.data)`` converted to a dense array.
    """
    n_items = MATRIX.shape[1]
    item_vectors = BOOL_MATRIX.T

    expected = np.zeros((n_items, n_items))
    # np.ndindex walks the (i, j) grid in the same row-major order as two
    # nested range loops.
    for i, j in np.ndindex(n_items, n_items):
        expected[i, j] = spd.kulsinski(item_vectors[i], item_vectors[j])

    observed = 1 - kulsinski(self.data).toarray()
    self.assertTrue(np.allclose(expected, observed))
def kulsinski(self, x=None, y=None, w=None):
    """Kulsinski dissimilarity between two vectors.

    Every argument left as ``None`` falls back to the instance attribute of
    the same name (``self.x``, ``self.y``, ``self.w``).  The fallback tests
    ``is None`` rather than truthiness, so NumPy arrays (whose truth value
    is ambiguous) and legitimately empty or zero inputs are accepted as
    given.

    Example inputs::

        x = [1, 0, 0]
        y = [0, 1, 0]

    Args:
        x: First vector, or ``None`` to use ``self.x``.
        y: Second vector, or ``None`` to use ``self.y``.
        w: Optional weights, or ``None`` to use ``self.w``.

    Returns:
        The value returned by ``distance.kulsinski(x, y, w)``.
    """
    x = self.x if x is None else x
    y = self.y if y is None else y
    w = self.w if w is None else w
    return distance.kulsinski(x, y, w)
def calc_kulczynski(query_vec, num_of_docs):
    """Return the ``num_of_docs`` rows of ``data`` closest to ``query_vec``.

    Each row's ``'text'`` entry is compared with the dense form of
    ``query_vec`` using ``kulsinski``; smaller values rank first.

    Args:
        query_vec: Query vector exposing ``toarray()``.
        num_of_docs: Number of rows to keep.

    Returns:
        A copy of the selected rows of the module-level ``data`` frame,
        ordered by ascending distance, without the temporary score column.
    """
    scores = [
        kulsinski(query_vec.toarray(), row['text'])
        for _, row in data.iterrows()
    ]

    ranked = data.copy()
    ranked['kulsinski'] = scores
    # sort_values defaults to ascending order: the smaller, the better.
    closest = ranked.sort_values(by=['kulsinski']).head(num_of_docs)
    return closest.drop('kulsinski', axis=1)
def distances(W, W_true, G, G_true, alignment=None):
    """Collect clustering and covariance distances to the ground truth.

    Parameters
    ----------
    W, W_true : np.ndarray
        Estimated and true matrices; each row is labelled by the index of
        its largest entry (``argmax`` along axis 1) before comparison.
    G, G_true :
        Estimated and true inputs of ``covariance_mse``
        (``G`` is aligned together with ``W`` first).
    alignment : optional
        Permutation handed to ``align_from_permutation``.  Defaults to the
        identity ordering ``np.arange(W.shape[1])``.

    Returns
    -------
    dict
        Keys ``'jaccard'``, ``'hamming'``, ``'kulsinski'`` (label
        distances), ``'cov_mse'`` (output of ``covariance_mse``) and its
        ``'cov_mse_mean'`` / ``'cov_mse_max'`` summaries.
    """
    if alignment is None:
        alignment = np.arange(W.shape[1])

    W_aligned, G_aligned = align_from_permutation(W, G, alignment)

    labels_estimated = W_aligned.argmax(1)
    labels_true = W_true.argmax(1)
    result = {
        'jaccard': spd.jaccard(labels_estimated, labels_true),
        'hamming': spd.hamming(labels_estimated, labels_true),
        'kulsinski': spd.kulsinski(labels_estimated, labels_true),
    }

    cov_mse = covariance_mse(G_aligned, G_true)
    result['cov_mse'] = cov_mse
    result['cov_mse_mean'] = np.mean(cov_mse)
    result['cov_mse_max'] = np.max(cov_mse)
    return result
def test_kulsinski_similarity():
    """Check the theano Kulsinski similarity against the reference function.

    Two random binary vectors of length 10 are scored with the reference
    ``kulsinski`` and with the compiled theano expression
    ``tmetrics.classification.kulsinski_similarity``; both scores must agree
    within ``np.allclose`` tolerance.
    """
    true = np.double(np.random.binomial(n=1, p=.5, size=10))
    predicted = np.double(np.round(np.random.random(10)))
    refscore = kulsinski(true, predicted)

    yt = T.vector('yt')
    yp = T.vector('yp')
    f = theano.function(
        [yt, yp],
        tmetrics.classification.kulsinski_similarity(yt, yp),
        allow_input_downcast=True)
    score = f(true.astype('float32'), predicted.astype('float32'))

    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function; the bare
    # statement form is a syntax error on Python 3.
    print('true')
    print(true)
    print('predicted')
    print(predicted)
    print('refscore {}'.format(refscore))
    print('score {}'.format(score))

    assert np.allclose(refscore, score)
def distances(v1, v2):
    """Return several distances between two vectors exposing ``toarray()``.

    When either vector sums to zero no distance is computed: ``_NEAR`` is
    returned if both sums are equal, ``_FAR`` otherwise.

    Otherwise both vectors are densified and the result is an array holding,
    in this order: cosine distance of the dense values, then Dice, Hamming
    and Kulsinski distances of their ``> 0`` masks.
    """
    total_1, total_2 = v1.sum(), v2.sum()
    if total_1 == 0 or total_2 == 0:
        return _NEAR if total_1 == total_2 else _FAR

    dense_1 = v1.toarray()
    dense_2 = v2.toarray()
    # The boolean metrics only look at which entries are present.
    present_1 = dense_1 > 0
    present_2 = dense_2 > 0

    cosine = sp_dist.cosine(dense_1, dense_2)
    dice = sp_dist.dice(present_1, present_2)
    hamming = sp_dist.hamming(present_1, present_2)
    kulsinski = sp_dist.kulsinski(present_1, present_2)
    return np.asarray([cosine, dice, hamming, kulsinski])
def cross_channel_boolean_distance_features(mask):
    """Compute boolean distance features for every pair of channels.

    Each channel ``mask[:, :, c]`` is flattened to 1D and compared with
    every later channel using a fixed list of ``dist`` functions.

    Parameters
    ----------
    mask : 3D array, shape (M, N, C)
        The input mask with multiple channels.

    Returns
    -------
    features : dict
        One entry per metric and channel pair, keyed
        ``"<metric>_distance_Ch<i>_Ch<j>"`` with 1-based channel numbers
        and ``i < j``.  Empty when there are fewer than two channels.
    """
    # Order matters: it fixes the insertion order of the returned dict.
    metric_names = (
        "dice",
        "hamming",
        "jaccard",
        "kulsinski",
        "rogerstanimoto",
        "russellrao",
        "sokalmichener",
        "sokalsneath",
        "yule",
    )

    n_channels = mask.shape[2]
    features = {}
    for first in range(n_channels):
        for second in range(first + 1, n_channels):
            # Reshape both channels to 1D vectors.
            flat_first = mask[:, :, first].ravel()
            flat_second = mask[:, :, second].ravel()

            # Human-readable, 1-based channel pair suffix.
            suffix = "_Ch" + str(first + 1) + "_Ch" + str(second + 1)

            for metric in metric_names:
                features[metric + "_distance" + suffix] = getattr(
                    dist, metric)(flat_first, flat_second)
    return features
def calculate_pss(self, profile, ignore=None, method="pairwise"):
    """Calculate Profiles Similarity Score.

    Args:
        profile: The profile to compare with ``self``.  Both objects must
            have the same ``len()`` and expose ``profile`` (the values) and
            ``query`` (the elements the values belong to).
        ignore: Optional iterable of query elements to leave out of the
            comparison.  Positions are resolved through each profile's own
            ``query``.
        method: ``"pairwise"`` counts the positions at which both profiles
            hold equal values.  ``"jaccard"``, ``"yule"``, ``"dice"``,
            ``"hamming"``, ``"kulsinski"``, ``"rogerstanimoto"``,
            ``"russellrao"`` and ``"sokalmichener"`` return the ``dist``
            function of the same name applied to both profiles.

    Returns:
        The score for the requested method, or ``None`` when ``method`` is
        not one of the names above.

    Raises:
        ProfileError: If the profiles differ in length or an element of
            ``ignore`` cannot be located in a profile.
    """
    if len(self) != len(profile):
        raise ProfileError("Different profiles' lengths")

    # Work on local tuples: the stored profiles are left untouched, so
    # repeated calls always start from the full profiles.
    own_values = tuple(self.profile)
    other_values = tuple(profile.profile)

    if ignore:
        trimmed = []
        for owner, values in ((self, own_values), (profile, other_values)):
            try:
                # Resolve every position before removing anything, so that
                # removing one element cannot shift the position of the next.
                positions = set(owner.query.index(item) for item in ignore)
            except (ValueError, IndexError):
                # Sequence .index() signals a missing element with
                # ValueError.
                raise ProfileError("Element to ignore not in profile")
            if any(position >= len(values) for position in positions):
                raise ProfileError("Element to ignore not in profile")
            trimmed.append(tuple(
                value for position, value in enumerate(values)
                if position not in positions))
        own_values, other_values = trimmed

    if method == "pairwise":
        return sum(a == b for a, b in zip(own_values, other_values))

    scipy_methods = (
        "jaccard",
        "yule",
        "dice",
        "hamming",
        "kulsinski",
        "rogerstanimoto",
        "russellrao",
        "sokalmichener",
    )
    if method in scipy_methods:
        # Looked up on demand so only the requested metric has to exist.
        return getattr(dist, method)(own_values, other_values)

    # Unknown methods have always yielded None; keep that contract.
    return None
def kulsinski(app1SyscallsVector, app2SyscallsVector):
    """Return the Kulsinski dissimilarity of two syscall vectors.

    Thin wrapper that forwards both vectors, in the given order, to
    ``spDist.kulsinski``.
    """
    dissimilarity = spDist.kulsinski(app1SyscallsVector, app2SyscallsVector)
    return dissimilarity
def exec_similarity(dct, algorithm):
    """Score every answer in ``dct`` against the question with one metric.

    For each entry of ``dct['answers']`` the selected metric is applied to
    ``ndarray_dict(dct['tf_idf'])`` and ``ndarray_dict(answer['tf_idf'])``,
    and the result is stored on the answer under the key ``algorithm``.

    Args:
        dct: Mapping with a ``'tf_idf'`` entry and an ``'answers'`` list
            whose items each carry their own ``'tf_idf'`` entry.
        algorithm: Name of the metric to use.

    Returns:
        ``{}`` when ``validate_similarity_algorithms(dct, algorithm)`` is
        truthy; ``None`` when ``algorithm`` is not one of the handled names
        (``'minkowski'`` is deliberately not handled); otherwise a list with
        the return value of each ``answer.update(...)`` call.  The scores
        themselves live on the answers.
    """
    if validate_similarity_algorithms(dct, algorithm):
        return {}

    # Zero-argument thunks keep the name lookup lazy: a metric is only
    # resolved when it is the one requested and there is an answer to score.
    metric_lookup = {
        'braycurtis': lambda: braycurtis,
        'canberra': lambda: canberra,
        'chebyshev': lambda: chebyshev,
        'cityblock': lambda: cityblock,
        'correlation': lambda: correlation,
        'cosine': lambda: cosine,
        'euclidean': lambda: euclidean,
        'mahalanobis': lambda: mahalanobis,
        'seuclidean': lambda: seuclidean,
        'sqeuclidean': lambda: sqeuclidean,
        'wminkowski': lambda: wminkowski,
        'dice': lambda: dice,
        'hamming': lambda: hamming,
        'jaccard': lambda: jaccard,
        'kulsinski': lambda: kulsinski,
        'rogerstanimoto': lambda: rogerstanimoto,
        'russellrao': lambda: russellrao,
        'sokalmichener': lambda: sokalmichener,
        'sokalsneath': lambda: sokalsneath,
        'yule': lambda: yule,
    }
    resolve_metric = metric_lookup.get(algorithm)
    if resolve_metric is None:
        return None

    update_results = []
    for answer in dct['answers']:
        metric = resolve_metric()
        score = metric(ndarray_dict(dct['tf_idf']),
                       ndarray_dict(answer['tf_idf']))
        update_results.append(answer.update({algorithm: score}))
    return update_results
def distance(self, vector1, vector2, type_):
    """
    Calculate distance between two vectors.

    Args:
        vector1 (list of int/float/bool): Vector in vector space
        vector2 (list of int/float/bool): Vector in vector space
        type_ (str): Type of distance calculation. Allowed types are:

            * For numeric vectors *
            - braycurtis: Computes the Bray-Curtis distance between two arrays.
            - canberra: Computes the Canberra distance between two arrays.
            - chebyshev: Computes the Chebyshev distance.
            - cityblock: Computes the City Block (Manhattan) distance.
            - correlation: Computes the correlation distance between two arrays.
            - cosine: Computes the Cosine distance between arrays.
            - euclidean: Computes the Euclidean distance between two arrays.
            - sqeuclidean: Computes the squared Euclidean distance between two arrays.

            * For boolean vectors *
            - dice: Computes the Dice dissimilarity between two boolean arrays.
            - hamming: Computes the Hamming distance between two arrays.
            - jaccard: Computes the Jaccard-Needham dissimilarity between two boolean arrays.
            - kulsinski: Computes the Kulsinski dissimilarity between two boolean arrays.
            - rogerstanimoto: Computes the Rogers-Tanimoto dissimilarity between two boolean arrays.
            - russellrao: Computes the Russell-Rao dissimilarity between two boolean arrays.
            - sokalmichener: Computes the Sokal-Michener dissimilarity between two boolean arrays.
            - sokalsneath: Computes the Sokal-Sneath dissimilarity between two boolean arrays.
            - yule: Computes the Yule dissimilarity between two boolean arrays.

    Returns:
        float: Distance between vectors.

    Raises:
        ValueError: If ``type_`` is not one of the allowed types.
    """
    # Every allowed type is also the name of the function to call on the
    # module-level ``distance`` object, so one lookup replaces the branch
    # chain (which listed "kulsinski" twice; the second branch could never
    # run).  A tuple is used so membership is a plain equality scan.
    supported_types = (
        "braycurtis",
        "canberra",
        "chebyshev",
        "cityblock",
        "correlation",
        "cosine",
        "euclidean",
        "sqeuclidean",
        "dice",
        "hamming",
        "jaccard",
        "kulsinski",
        "rogerstanimoto",
        "russellrao",
        "sokalmichener",
        "sokalsneath",
        "yule",
    )
    if type_ in supported_types:
        # Resolved on demand so only the requested function has to exist.
        return getattr(distance, type_)(vector1, vector2)

    raise ValueError(
        "Wrong value for type_. Please enter one of supported values. "
        "Type help(distance) to see supported values.")
def main():
    """Print each ``Distance`` result next to SciPy's for a fixed input pair.

    For every metric a separator line is printed, followed by the value from
    the local ``Distance`` instance ("My ...") and the value from
    ``scipy.spatial.distance`` ("SciPy ..."); a final separator closes the
    report.
    """
    from scipy.spatial import distance

    a = np.array([1, 2, 43])
    b = np.array([3, 2, 1])
    d = Distance()

    separator = '-----------------------------------------------------------------'
    pair = (a, b)
    iv = np.array([[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 1]])

    # (printed label, function name, args for Distance, args for SciPy),
    # listed in output order.
    comparisons = [
        ('braycurtis', 'braycurtis', pair, pair),
        ('canberra', 'canberra', pair, pair),
        ('chebyshev', 'chebyshev', pair, pair),
        ('cityblock', 'cityblock', pair, pair),
        ('correlation', 'correlation', pair, pair),
        ('euclidean', 'euclidean', pair, pair),
        ('hamming', 'hamming', pair, pair),
        ('jaccard', 'jaccard', pair, pair),
        # "manhattan" is reported through the cityblock implementation.
        ('manhattan', 'cityblock', pair, pair),
        ('cosine', 'cosine', pair, pair),
        ('dice', 'dice', pair, pair),
        ('kulsinski', 'kulsinski', pair, pair),
        ('mahalanobis', 'mahalanobis', (a, b, iv), (a, b, iv)),
        ('seuclidean', 'seuclidean',
         (a, b, np.array([0.1, 0.1, 0.1])), (a, b, [0.1, 0.1, 0.1])),
        ('sokalmichener', 'sokalmichener', pair, pair),
        ('sokal_sneath', 'sokalsneath', pair, pair),
        ('sqeuclidean', 'sqeuclidean', pair, pair),
        ('minkowski', 'minkowski', (a, b, 2), (a, b, 2)),
        ('rogerstanimoto', 'rogerstanimoto', pair, pair),
        ('russellrao', 'russellrao', pair, pair),
        ('wminkowski', 'wminkowski',
         (a, b, 2, np.ones(3)), (a, b, 2, np.ones(3))),
        ('yule', 'yule', pair, pair),
    ]

    for label, function_name, my_args, scipy_args in comparisons:
        print(separator)
        print('My {}: {}'.format(
            label, getattr(d, function_name)(*my_args)))
        print('SciPy {}: {}'.format(
            label, getattr(distance, function_name)(*scipy_args)))
    print(separator)