def random_distribution(n):
    # make up some data
    data = np.random.normal(scale=n, size=(n, n))
    data[0:n // 2, 0:n // 2] += 75  # integer division so the slice indices stay ints
    data[n // 2:, n // 2:] = np.random.poisson(lam=n, size=data[n // 2:, n // 2:].shape)
    # cluster the rows
    row_dist = ssd.squareform(ssd.pdist(data))
    row_Z = sch.linkage(row_dist)
    row_idxing = sch.leaves_list(row_Z)
    row_labels = ['bar{}'.format(i) for i in range(n)]
    # cluster the columns
    col_dist = ssd.squareform(ssd.pdist(data.T))
    col_Z = sch.linkage(col_dist)
    col_idxing = sch.leaves_list(col_Z)
    # make the dendrogram
    col_labels = ['foo{}'.format(i) for i in range(n)]
    data = data[:, col_idxing][row_idxing, :]
    heatmap = pdh.DendroHeatMap(heat_map_data=data, left_dendrogram=row_Z,
                                top_dendrogram=col_Z,
                                heatmap_colors=("#ffeda0", "#feb24c", "#f03b20"),
                                window_size="auto", color_legend_displayed=False,
                                label_color="#777777")
    heatmap.row_labels = row_labels
    heatmap.col_labels = col_labels
    heatmap.title = 'An example heatmap'
    heatmap.show()  # heatmap.save("example.png")
def test_rsa_relatedness(self):
    ref_mat = loadmat('rsa_ref/debug_rsa_relatedness.mat')
    rdm_stack_all = ref_mat['rdm_stack_all']
    cand_rdm_stack_all = ref_mat['cand_rdm_stack_all']
    index_matrix_array = ref_mat['index_matrix_array']
    p_value_array = ref_mat['p_value_array']
    for i_case in range(p_value_array.shape[-1]):
        ref_rdms = rdm_stack_all[:, :, :, i_case]
        if i_case % 2 != 0:
            ref_rdms = ref_rdms[:, :, :1]  # check singular case.
        ref_rdms = np.array([squareform(ref_rdms[:, :, x])
                             for x in range(ref_rdms.shape[2])])
        cand_rdms = cand_rdm_stack_all[:, :, :, i_case]
        cand_rdms = np.array([squareform(cand_rdms[:, :, x])
                              for x in range(cand_rdms.shape[2])])
        # compute similarity.
        similarity_matrix_ref = rdm_similarity_batch(
            ref_rdms, cand_rdms, computation_method='spearmanr').mean(axis=1)
        p_val_this = rdm_relatedness_test(
            mean_ref_rdm=ref_rdms.mean(axis=0), model_rdms=cand_rdms,
            similarity_ref=similarity_matrix_ref, n=100,
            perm_idx_list=index_matrix_array[:, :, i_case].T - 1)
        p_val_ref = p_value_array[:, i_case]
        assert p_val_this.shape == p_val_ref.shape
        self.assertTrue(np.allclose(p_val_this, p_val_ref))
def compute_distance():
    '''
    Computes distances between congress members for a particular category and
    writes out the results in a text file. The web app reads these text files
    to show graphs.
    '''
    category_map = {1: 'Health Care', 2: 'National Security', 3: 'Economy',
                    4: 'Environment', 5: 'Domestic Issues'}
    vm = Voting_Matrix('114')
    for j in range(1, 6):
        votes, member_to_row = vm.generate_matrix(category=[j])
        y = pdist(votes, 'cosine')
        y_dist = squareform(y)
        normed_distances = np.zeros((len(y_dist), len(y_dist)))
        for i in range(len(y_dist)):
            min_value = min(y_dist[i, :])
            max_value = max(y_dist[i, :])
            normed_distances[i, :] = (y_dist[i, :] - min_value) / (max_value - min_value)
        np.savetxt("data/%s114Distance.csv" % category_map[j], normed_distances,
                   delimiter=",", fmt='%5.5f')
    votes, member_to_row = vm.generate_matrix(category=[1, 2, 3, 4, 5])
    y = pdist(votes, 'cosine')
    y_dist = squareform(y)
    normed_distances = np.zeros((len(y_dist), len(y_dist)))
    for i in range(len(y_dist)):
        min_value = min(y_dist[i, :])
        max_value = max(y_dist[i, :])
        normed_distances[i, :] = (y_dist[i, :] - min_value) / (max_value - min_value)
    np.savetxt("data/All Categories114Distance.csv", normed_distances,
               delimiter=",", fmt='%5.5f')
    df = pd.read_csv('../DataCollectionInsertion/Members/114Members.csv')
    row_nums = np.array([member_to_row[str(df.iloc[i]['person__id'])]
                         for i in range(len(df))])
    df['row_nums'] = row_nums
    df.to_csv('../DataCollectionInsertion/Members/114Members.csv', sep=',')
def calculate_cophenetic_correlation(connmat):
    Y = 1 - connmat
    Z = linkage(squareform(Y), method='average')
    c, d = cophenet(Z, squareform(Y))
    return (c, d)
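# Usage sketch for calculate_cophenetic_correlation above (assumes numpy and
# the linkage/cophenet/squareform imports used by the function are in scope);
# the input is a symmetric consensus matrix with a unit diagonal:
import numpy as np
from scipy.cluster.hierarchy import linkage, cophenet
from scipy.spatial.distance import squareform

rng = np.random.default_rng(0)
raw = rng.random((6, 6))
connmat = (raw + raw.T) / 2     # symmetrize
np.fill_diagonal(connmat, 1.0)  # self-consensus is 1, so 1 - connmat has a zero diagonal
c, d = calculate_cophenetic_correlation(connmat)
print(c)  # cophenetic correlation coefficient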
def initRTI(nodeLocs, delta_p, sigmax2, delta, excessPathLen):
    # Set up pixel locations as a grid.
    personLL = nodeLocs.min(axis=0)
    personUR = nodeLocs.max(axis=0)
    pixelCoords, xVals, yVals = calcGridPixelCoords(personLL, personUR, delta_p)
    pixels = pixelCoords.shape[0]

    # Find distances between pixels and transceivers
    DistPixels = dist.squareform(dist.pdist(pixelCoords))
    DistPixelAndNode = dist.cdist(pixelCoords, nodeLocs)
    DistNodes = dist.squareform(dist.pdist(nodeLocs))

    # Find the inverse of the covariance matrix between pixels
    CovPixelsInv = linalg.inv(sigmax2 * np.exp(-DistPixels / delta))

    # Calculate the weight matrix for each link.
    nodes = len(nodeLocs)
    links = nodes * (nodes - 1)
    W = np.zeros((links, pixels))
    for ln in range(links):
        txNum, rxNum = txRxForLinkNum(ln, nodes)
        ePL = DistPixelAndNode[:, txNum] + DistPixelAndNode[:, rxNum] - DistNodes[txNum, rxNum]
        inEllipseInd = np.argwhere(ePL < excessPathLen)
        pixelsIn = len(inEllipseInd)
        if pixelsIn > 0:
            W[ln, inEllipseInd] = 1.0 / float(pixelsIn)

    # Compute the projection matrix
    inversion = np.dot(linalg.inv(np.dot(W.T, W) + CovPixelsInv), W.T)
    return (inversion, xVals, yVals)
def mds_author_term(fname1='corr_2d_mds_authors_by_terms.png',
                    fname2='corr_2d_mds_terms_by_authors.png'):
    bib_data = get_bib_data()
    mat, authors, term_list, authors_cnt = get_author_by_term_mat(bib_data, tfreq=5, afreq=10)
    adist = dist.squareform(dist.pdist(mat, 'correlation'))
    coords, _ = mds(adist, dim=2)
    fig = plt.figure()
    fig.clf()
    plt.xlim(-15, 20)
    plt.ylim(-15, 20)
    for label, x, y in zip(authors, coords[:, 0], coords[:, 1]):
        plt.annotate(label, xy=(x * 20, y * 20))
    plt.axis('off')
    plt.savefig(fname1)

    mat = mat.T
    tdist = dist.squareform(dist.pdist(mat, 'correlation'))
    coords, _ = mds(tdist, dim=2)
    fig.clf()
    plt.xlim(-80, 100)
    plt.ylim(-100, 100)
    for label, x, y in zip(term_list, coords[:, 0], coords[:, 1]):
        plt.annotate(label, xy=(x * 500, y * 500))
    plt.axis('off')
    plt.savefig(fname2)
def dCorr(x, y):
    """Returns the distance correlation between x and y."""
    n = len(x)
    assert n == len(y), "Vectors must be of the same length"

    def dCov2(xM, yM):
        """Returns the squared distance covariance of x and y, given the
        double-centered distance matrices xM and yM."""
        return (1.0 / n**2) * np.sum(xM * yM)  # sum of all entries in the elementwise product

    A = distance.squareform(distance.pdist(np.array(x).reshape(n, -1)))
    B = distance.squareform(distance.pdist(np.array(y).reshape(n, -1)))
    # Double-center along both axes; the row means need a trailing axis so
    # that broadcasting subtracts them row-wise rather than column-wise:
    A = A - A.mean(axis=0) - A.mean(axis=1)[:, np.newaxis] + A.mean()
    B = B - B.mean(axis=0) - B.mean(axis=1)[:, np.newaxis] + B.mean()
    # Calculate distance covariances
    dcov = np.sqrt(dCov2(A, B))
    dvarx = np.sqrt(dCov2(A, A))
    dvary = np.sqrt(dCov2(B, B))
    toR = dcov / np.sqrt(dvarx * dvary)
    if np.isnan(toR):
        return 0.0
    return toR
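# Usage sketch for dCorr above (assumes numpy as np and scipy.spatial.distance
# as distance are in scope): an exact linear relation should give a distance
# correlation of ~1, while independent noise should give a value near 0.
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=200)
print(dCorr(x, 2 * x + 1))             # ~1.0
print(dCorr(x, rng.normal(size=200)))  # small, near 0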
def distcorr(X, Y):
    """ Compute the distance correlation function

    >>> a = [1,2,3,4,5]
    >>> b = np.array([1,2,9,4,4])
    >>> distcorr(a, b)
    0.762676242417
    """
    # drop any pair with a NaN in either vector
    X, Y = zip(*[v for i, v in enumerate(zip(X, Y)) if not np.any(np.isnan(v))])
    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != X.shape[0]:
        raise ValueError('Number of samples must match')
    a = squareform(pdist(X))
    b = squareform(pdist(Y))
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
    dcov2_xy = (A * B).sum() / float(n * n)
    dcov2_xx = (A * A).sum() / float(n * n)
    dcov2_yy = (B * B).sum() / float(n * n)
    dcor = np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
    return dcor
def getDistances(x, attr, var, cidx, didx, cheader):
    """ This creates the distance array for only discrete or continuous data
    with no missing data """
    from scipy.spatial.distance import pdist, squareform

    def pre_normalize(x):
        idx = 0
        for i in cheader:
            cmin = attr[i][2]
            diff = attr[i][3]
            x[:, idx] -= cmin
            x[:, idx] /= diff
            idx += 1
        return x

    dtype = var['dataType']
    numattr = var['NumAttributes']
    if dtype == 'discrete':
        return squareform(pdist(x, metric='hamming'))
    if dtype == 'mixed':
        d_dist = squareform(pdist(x[:, didx], metric='hamming'))
        xc = pre_normalize(x[:, cidx])
        c_dist = squareform(pdist(xc, metric='cityblock'))
        return np.add(d_dist, c_dist) / numattr
    else:  # dtype == 'continuous'
        return squareform(pdist(pre_normalize(x), metric='cityblock'))
def __init__(self, rng, matches_vec, batch_size, sample_diff_every_epoch=True,
             n_same_pairs=None):
    """
    If `n_same_pairs` is given, this number of same pairs is sampled,
    otherwise all same pairs are used.
    """
    self.rng = rng
    self.matches_vec = matches_vec
    self.batch_size = batch_size
    if n_same_pairs is None:
        # Use all pairs
        I, J = np.where(np.triu(distance.squareform(matches_vec)))  # indices of same pairs
    else:
        # Sample same pairs
        n_pairs = min(n_same_pairs, len(np.where(matches_vec == True)[0]))
        same_sample = self.rng.choice(
            np.where(matches_vec == True)[0], size=n_pairs, replace=False
            )
        same_vec = np.zeros(self.matches_vec.shape[0], dtype=bool)  # np.bool is removed in newer NumPy
        same_vec[same_sample] = True
        I, J = np.where(np.triu(distance.squareform(same_vec)))
    self.x1_same_indices = []
    self.x2_same_indices = []
    for i, j in zip(I, J):
        self.x1_same_indices.append(i)
        self.x2_same_indices.append(j)
    if not sample_diff_every_epoch:
        self.x1_diff_indices, self.x2_diff_indices = self._sample_diff_pairs()
    self.sample_diff_every_epoch = sample_diff_every_epoch
def test_grad_grad(x):
    # relies on module-level x_g, target, n_channels_in and filter_sz
    r_r = 7
    q = 0
    x_in = copy.deepcopy(x)
    in_dims = n_channels_in * (filter_sz**2)
    x = copy.deepcopy(x_g)
    x[r_r, q] = x_in
    N = x.shape[1]
    grad_s = np.zeros((in_dims, in_dims, N))
    x_mean = np.mean(x, axis=1)
    x_no_mean = x - x_mean[:, np.newaxis]
    corrs = 1 - pdist(x, 'correlation')
    corr_mat = squareform(corrs)
    target_mat = squareform(target)
    loss = np.std(corrs)  # np.sqrt(np.sum((corrs - corrs.mean())**2))
    d_sum_n = np.mean(x_no_mean, axis=1)
    d2_sum_sqrt = np.sqrt(np.sum(x_no_mean**2, axis=1))
    d2_sum_sqrt2 = d2_sum_sqrt**2
    d_minus_sum_n = x_no_mean - d_sum_n[:, np.newaxis]
    d_minus_sum_n_div = d_minus_sum_n / d2_sum_sqrt[:, np.newaxis]
    d_dot_dT = np.dot(x_no_mean, x_no_mean.T)
    for i in np.arange(in_dims):
        for j in np.arange(in_dims):
            if i != j:
                grad_s[i, j] = (d_minus_sum_n[j] * d2_sum_sqrt[i]
                                - d_dot_dT[i, j] * d_minus_sum_n_div[i]) / (d2_sum_sqrt[j] * d2_sum_sqrt2[i])
    grad_s_mean = grad_s.sum(1) / len(corrs)  # in_dims by N
    grad = np.sum((grad_s - grad_s_mean) * (corr_mat[r_r] - corrs.mean())[:, np.newaxis],
                  axis=1) / (loss * (N**2))
    return grad[r_r, q]
def loglike(x, A):
    import numpy as np
    from scipy.spatial.distance import squareform

    # Edge probabilities from the latent factors; drop the diagonal and
    # condense so that P and B align entrywise.
    P = x.dot(x.T)
    P = squareform(P - np.diag(np.diag(P)))
    B = squareform(A)
    # Assumption: the truncated return statement is completed here as the
    # Bernoulli log-likelihood of the observed adjacency under P.
    return np.sum(B * np.log(P) + (1 - B) * np.log(1 - P))
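# Usage sketch for the assumed completion of loglike above: sample a symmetric
# adjacency matrix A from the edge probabilities P = x.dot(x.T) and evaluate
# the log-likelihood (x is kept small so every probability stays in (0, 1)).
import numpy as np

rng = np.random.default_rng(0)
x = rng.random((6, 2)) * 0.5
P_full = x.dot(x.T)
A = (rng.random((6, 6)) < P_full).astype(float)
A = np.triu(A, 1)
A = A + A.T  # symmetric with a zero diagonal, as squareform requires
print(loglike(x, A))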
def test_PDist():
    targets = np.tile(range(3), 2)
    chunks = np.repeat(np.array((0, 1)), 3)
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)
    data_c = data - np.mean(data, 0)
    # DSM matrix elements should come out as samples of one feature
    # to be in line with what e.g. a classifier returns -- facilitates
    # collection in a searchlight ...
    euc = pdist(data, 'euclidean')[None].T
    pear = pdist(data, 'correlation')[None].T
    city = pdist(data, 'cityblock')[None].T
    center_sq = squareform(pdist(data_c, 'correlation'))

    # Now center each chunk separately
    dsm1 = PDist()
    dsm2 = PDist(pairwise_metric='euclidean')
    dsm3 = PDist(pairwise_metric='cityblock')
    dsm4 = PDist(center_data=True, square=True)
    assert_array_almost_equal(dsm1(ds).samples, pear)
    assert_array_almost_equal(dsm2(ds).samples, euc)

    dsm_res = dsm3(ds)
    assert_array_almost_equal(dsm_res.samples, city)
    # length corresponds to a single triangular matrix
    assert_equal(len(dsm_res.sa.pairs), len(ds) * (len(ds) - 1) / 2)
    # generated label pairs actually reflect the vectorform generated by
    # squareform()
    dsm_res_square = squareform(dsm_res.samples.T[0])
    for i, p in enumerate(dsm_res.sa.pairs):
        assert_equal(dsm_res_square[p[0], p[1]], dsm_res.samples[i, 0])

    dsm_res = dsm4(ds)
    assert_array_almost_equal(dsm_res.samples, center_sq)
    # sample attributes are carried over
    assert_almost_equal(ds.sa.targets, dsm_res.sa.targets)
def main():
    # fetch the distance matrix from the specified input file
    distMatFile = sys.argv[1]
    nameList, Dij_sq, N = fetchDistMat(distMatFile)

    # in scipy most routines operate on 'condensed' distance matrices, i.e.
    # upper triangular matrices; the function squareform in the
    # scipy.spatial.distance submodule can be used to switch from full
    # square to condensed matrices and vice versa
    Dij_cd = ssd.squareform(Dij_sq)

    # hierarchical clustering where the distance between two clusters is the
    # distance of the cluster averages;
    # clusterResult = 'top down view' of the hierarchical clustering
    clusterResult = sch.linkage(Dij_cd, method='average')

    # returns cophenetic distances
    # corr = cophenetic correlation
    # Cij_cd = condensed cophenetic distance matrix
    corr, Cij_cd = sch.cophenet(clusterResult, Dij_cd)
    Cij_sq = ssd.squareform(Cij_cd)

    # print dendrogram on top of the cophenetic distance matrix to the
    # standard output stream
    droPyt_distMat_dendrogram_sciPy(Cij_sq, clusterResult, N)
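# Minimal demonstration of the condensed <-> square conversion described in
# the comments above:
import numpy as np
import scipy.spatial.distance as ssd

sq = np.array([[0., 1., 2.],
               [1., 0., 3.],
               [2., 3., 0.]])
cd = ssd.squareform(sq)                     # condensed form: [1., 2., 3.]
assert np.allclose(ssd.squareform(cd), sq)  # and back to the full square form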
def vi_pairwise_matrix(segs, split=False):
    """Compute the pairwise VI distances within a set of segmentations.

    If 'split' is set to True, two matrices are returned, one for each
    direction of the conditional entropy. 0-labeled pixels are ignored.

    Parameters
    ----------
    segs : iterable of np.ndarray of int
        A list or iterable of segmentations. All arrays must have the same
        shape.
    split : bool, optional
        Should the split VI be returned, or just the VI itself (default)?

    Returns
    -------
    vi_sq : np.ndarray of float, shape (len(segs), len(segs))
        The distances between segmentations. If `split==False`, this is a
        symmetric square matrix of distances. Otherwise, the lower triangle
        of the output matrix is the false split distance, while the upper
        triangle is the false merge distance.
    """
    d = np.array([s.ravel() for s in segs])
    if split:
        def dmerge(x, y):
            return split_vi(x, y)[0]

        def dsplit(x, y):
            return split_vi(x, y)[1]

        merges, splits = [squareform(pdist(d, df)) for df in [dmerge, dsplit]]
        out = merges
        tri = np.tril(np.ones(splits.shape), -1).astype(bool)
        out[tri] = splits[tri]
    else:
        out = squareform(pdist(d, vi))
    return out
def kcca(self, X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel,
         eta=1.0):
    n, p = X.shape
    n, q = Y.shape
    Kx = DIST.squareform(DIST.pdist(X, kernel_x))
    Ky = DIST.squareform(DIST.pdist(Y, kernel_y))
    J = np.eye(n) - np.ones((n, n)) / n
    M = np.dot(np.dot(Kx.T, J), Ky) / n
    L = np.dot(np.dot(Kx.T, J), Kx) / n + eta * Kx
    N = np.dot(np.dot(Ky.T, J), Ky) / n + eta * Ky
    sqx = SLA.sqrtm(SLA.inv(L))
    sqy = SLA.sqrtm(SLA.inv(N))
    a = np.dot(np.dot(sqx, M), sqy.T)
    A, s, Bh = SLA.svd(a, full_matrices=False)
    B = Bh.T
    # U = np.dot(np.dot(A.T, sqx), X).T
    # V = np.dot(np.dot(B.T, sqy), Y).T
    print(s.shape)
    print(A.shape)
    print(B.shape)
    return s, A, B
def distcorr(X, Y, flip=True):
    """ Compute the distance correlation function

    >>> a = [1,2,3,4,5]
    >>> b = np.array([1,2,9,4,4])
    >>> distcorr(a, b)
    0.762676242417

    Taken from: https://gist.github.com/satra/aa3d19a12b74e9ab7941
    """
    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != X.shape[0]:
        raise ValueError('Number of samples must match')
    a = squareform(pdist(X))
    b = squareform(pdist(Y))
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
    dcov2_xy = (A * B).sum() / float(n * n)
    dcov2_xx = (A * A).sum() / float(n * n)
    dcov2_yy = (B * B).sum() / float(n * n)
    dcor = np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
    if flip:
        dcor = 1 - dcor
    return dcor
def kcca(X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel, eta=1.0):
    '''
    Kernel canonical correlation analysis
    http://staff.aist.go.jp/s.akaho/papers/ibis00.pdf
    '''
    n, p = X.shape
    n, q = Y.shape
    Kx = DIST.squareform(DIST.pdist(X, kernel_x))
    Ky = DIST.squareform(DIST.pdist(Y, kernel_y))
    J = np.eye(n) - np.ones((n, n)) / n
    M = np.dot(np.dot(Kx.T, J), Ky) / n
    L = np.dot(np.dot(Kx.T, J), Kx) / n + eta * Kx
    N = np.dot(np.dot(Ky.T, J), Ky) / n + eta * Ky
    sqx = LA.sqrtm(LA.inv(L))
    sqy = LA.sqrtm(LA.inv(N))
    a = np.dot(np.dot(sqx, M), sqy.T)
    A, s, Bh = LA.svd(a, full_matrices=False)
    B = Bh.T
    # U = np.dot(np.dot(A.T, sqx), X).T
    # V = np.dot(np.dot(B.T, sqy), Y).T
    return s, A, B
def K_SE(xs, ys=None, l=1, deriv=False, wrt='l'):
    l = asarray(l)
    sig = 1  # l[0]
    xs = ascolumn(xs)
    if ys is None:
        d = squareform(pdist(xs / l, 'sqeuclidean'))
    else:
        ys = ascolumn(ys)
        d = cdist(xs / l, ys / l, 'sqeuclidean')
    cov = exp(-d / 2)
    if not deriv:
        return sig * cov
    grads = []
    if wrt == 'l':
        # grads.append(cov)  # grad of sig
        for i in range(shape(xs)[1]):
            if ys is None:
                grad = sig * cov * squareform(pdist(ascolumn(xs[:, i]), 'sqeuclidean'))
            else:
                grad = sig * cov * cdist(ascolumn(xs[:, i]), ascolumn(ys[:, i]), 'sqeuclidean')
            grad /= l[i] ** 3
            grads.append(grad)
        return sig * cov, grads
    elif wrt == 'y':
        if shape(xs)[0] != 1:
            print('*** x not a row vector ***')
        jac = sig * cov * ((ys - xs) / l**2).T
        return sig * cov, jac
def kernelMatrixLaplacian(x, firstVar=None, grid=None, par=[1., 3], diff=False,
                          diff2=False, constant_plane=False, precomp=None):
    sig = par[0]
    ord = par[1]
    if precomp is None:
        precomp = kernelMatrixLaplacianPrecompute(x, firstVar, grid, par)
    u = precomp[0]
    expu = precomp[1]
    if firstVar is None and grid is None:
        if diff == False and diff2 == False:
            K = dfun.squareform(lapPol(u, ord) * expu)
            np.fill_diagonal(K, 1)
        elif diff2 == False:
            K = dfun.squareform(-lapPolDiff(u, ord) * expu / (2 * sig * sig))
            np.fill_diagonal(K, -1. / ((2 * ord - 1) * 2 * sig * sig))
        else:
            K = dfun.squareform(lapPolDiff2(u, ord) * expu / (4 * sig**4))
            np.fill_diagonal(K, 1. / (35 * 4 * sig**4))
    else:
        if diff == False and diff2 == False:
            K = lapPol(u, ord) * expu
        elif diff2 == False:
            K = -lapPolDiff(u, ord) * expu / (2 * sig * sig)
        else:
            K = lapPolDiff2(u, ord) * expu / (4 * sig**4)
    if constant_plane:
        uu = dfun.pdist(x[:, x.shape[1] - 1]) / sig
        K2 = dfun.squareform(lapPol(uu, ord) * np.exp(-uu))
        np.fill_diagonal(K2, 1)
        return K, K2, precomp
    else:
        return K, precomp
def covMatrix(X, Y, theta, symmetric=True,
              kernel=lambda u, theta: theta[0] * theta[0] * np.exp(-0.5 * u * u / (theta[1] * theta[1])),
              dist_f=None):
    if len(np.array(X).shape) == 1:
        _X = np.array([X]).T
    else:
        _X = np.array(X)
    if len(np.array(Y).shape) == 1:
        _Y = np.array([Y]).T
    else:
        _Y = np.array(Y)
    if dist_f is None:
        if symmetric:
            M = squareform(pdist(_X))
        else:
            M = cdist(_X, _Y)
    else:
        if symmetric:
            M = squareform(pdist(_X, dist_f))
        else:
            M = cdist(_X, _Y, dist_f)
    return kernel(M, theta)
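# Usage sketch for covMatrix above (assumes numpy as np plus the
# pdist/squareform/cdist imports used by the function): build a
# squared-exponential covariance on 1-D inputs with the default kernel, where
# theta[0] is the signal standard deviation and theta[1] the length scale.
import numpy as np

X = np.linspace(0, 1, 5)
K = covMatrix(X, X, theta=[1.0, 0.2])
print(K.shape)  # (5, 5)
print(K[0, 0])  # theta[0]**2 == 1.0 on the diagonal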
def getDistMatrixes(cls, distDict, distMeasure, linkageCriterion):
    """
    Find and return the correlation matrix, linkage matrix and distance
    matrix for the distance/correlation measure given by the distMeasure
    parameter.
    """
    from scipy.spatial.distance import squareform
    from numpy import ones, fill_diagonal
    from scipy.cluster.hierarchy import linkage

    if distMeasure == cls.CORR_PEARSON or distMeasure == cls.SIM_MCCONNAUGHEY:
        '''As these measures generate values between -1 and 1, need special handling'''
        # Cluster distances, i.e. convert correlation into distance between 0 and 1
        triangularCorrMatrix = distDict[distMeasure]
        triangularDistMatrix = ones(len(triangularCorrMatrix)) - [(x + 1) / 2 for x in triangularCorrMatrix]
        linkageMatrix = linkage(cls.removeNanDistances(triangularDistMatrix), linkageCriterion)

        # Make correlation matrix square
        correlationMatrix = squareform(triangularCorrMatrix)
        fill_diagonal(correlationMatrix, 1)
    else:
        # Cluster distances
        triangularDistMatrix = distDict[distMeasure]
        linkageMatrix = linkage(cls.removeNanDistances(triangularDistMatrix), linkageCriterion)

        # Convert triangular distances into square correlation matrix
        squareDistMatrix = squareform(triangularDistMatrix)
        squareSize = len(squareDistMatrix)
        correlationMatrix = ones((squareSize, squareSize)) - squareDistMatrix
    return correlationMatrix, linkageMatrix, triangularDistMatrix
def correlate_all(M):
    """Return all-pairs Pearson's correlation as a condensed matrix.

    Works best on numpy.array(dtype=float).
    TODO: this can be more efficient.

    Args:
      M: numpy.array row matrix

    Returns:
      condensed upper-triangle matrix of all-pairs correlation, in row
      order index.

    RUNTIME on random.rand(500,200): 21.2 ms (~200x faster than the naive
    formula)
    """
    m = np.size(M, 0)  # number of rows (variables)
    n = np.size(M, 1)  # number of columns (observations)
    sums = np.sum(M, 1).reshape(m, 1)
    stds = np.std(M, 1).reshape(m, 1)  # divided by n
    Dot = squareform(np.dot(M, M.T), checks=False)
    SumProd = squareform(np.dot(sums, sums.T), checks=False)
    StdProd = squareform(np.dot(stds, stds.T), checks=False)
    CorrMatrix = (Dot - (SumProd / n)) / (StdProd * n)  # correlation matrix
    return CorrMatrix
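# Usage sketch for correlate_all above: the condensed output should match the
# upper triangle of numpy.corrcoef on the same rows.
import numpy as np
from scipy.spatial.distance import squareform

M = np.random.rand(6, 40)
cond = correlate_all(M)
full = np.corrcoef(M)
assert np.allclose(cond, full[np.triu_indices(6, k=1)])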
def slRSA_m_1Ss(ds, model, omit, partial_dsm=None, radius=3, cmetric='pearson'):
    '''one subject

    Executes slRSA on a single subject and returns a tuple of arrays of
    1-p's [0] and Fisher-z-transformed r's [1]

    ds: pymvpa dataset for 1 subject
    model: model DSM to be correlated with neural DSMs per searchlight center
    partial_dsm: model DSM to be partialled out of the model-neural DSM correlation
    omit: list of targets omitted from pymvpa datasets
    radius: searchlight radius, default 3
    cmetric: default 'pearson', other option 'spearman'
    '''
    if __debug__:
        debug.active += ["SLC"]
    for om in omit:
        ds = ds[ds.sa.targets != om]  # cut out omits
        print('Target |%s| omitted from analysis' % om)
    ds = mean_group_sample(['targets'])(ds)  # make UT ds
    print('Mean group sample computed at size:', ds.shape, '...with UT:', ds.UT)

    print('Beginning slRSA analysis...')
    if partial_dsm is None:
        tdcm = rsa.TargetDissimilarityCorrelationMeasure(
            squareform(model), comparison_metric=cmetric)
    else:
        tdcm = rsa.TargetDissimilarityCorrelationMeasure(
            squareform(model), comparison_metric=cmetric,
            partial_dsm=squareform(partial_dsm))
    sl = sphere_searchlight(tdcm, radius=radius)
    slmap = sl(ds)
    if partial_dsm is None:
        print('slRSA complete with map of shape:', slmap.shape,
              '...p max/min:', slmap.samples[0].max(), slmap.samples[0].min(),
              '...r max/min', slmap.samples[1].max(), slmap.samples[1].min())
        return 1 - slmap.samples[1], np.arctanh(slmap.samples[0])
    else:
        print('slRSA complete with map of shape:', slmap.shape,
              '...r max/min:', slmap.samples[0].max(), slmap.samples[0].min())
        return 1 - slmap.samples[1], np.arctanh(slmap.samples[0])
def similarities(obj):
    """ Optional: similarities of entities. """
    phi = coo_matrix(np.load(str(obj.directory / 'phi.npy')))
    theta = coo_matrix(np.load(str(obj.directory / 'theta.npy')))

    with CsvWriter(obj.directory, DocumentSimilarity) as out:
        distances = squareform(pdist(theta.T, 'cosine'))
        out << (dict(a_id=i, b_id=sim_i, similarity=1 - row[sim_i])
                for i, row in enumerate(distances)
                for sim_i in row.argsort()[:31]  # first 30 similar docs
                if sim_i != i)

    with CsvWriter(obj.directory, TopicSimilarity) as out:
        distances = squareform(pdist(phi.T, 'cosine'))
        out << (dict(a_id=topic_id(1, i), b_id=topic_id(1, sim_i),
                     similarity=1 - row[sim_i])
                for i, row in enumerate(distances)
                for sim_i in row.argsort()[:]
                if sim_i != i)

    with CsvWriter(obj.directory, TermSimilarity) as out:
        distances = squareform(pdist(phi, 'cosine'))
        out << (dict(a_modality_id=1, a_id=i, b_modality_id=1, b_id=sim_i,
                     similarity=1 - row[sim_i])
                for i, row in enumerate(distances)
                for sim_i in row.argsort()[:21]  # first 20 similar terms
                if sim_i != i)
def bootstrap_correlations(df, cor, bootstraps=100, procs=1):
    # take the absolute value of all values in cor for calculating a
    # two-sided p-value
    abs_cor = np.abs(squareform(cor, checks=False))
    # create an empty array of significant value counts in the same shape as abs_cor
    n_sig = np.zeros(abs_cor.shape)
    if procs == 1:
        for i in range(bootstraps):
            n_sig += bootstrapped_correlation(i, df, abs_cor)
    else:
        import multiprocessing
        pool = multiprocessing.Pool(procs)
        print("Number of processors used: " + str(procs))
        # make a partial function for use in multiprocessing
        pfun = partial(bootstrapped_correlation, cor=abs_cor, df=df)
        # run multiprocessing
        multi_results = pool.map(pfun, range(bootstraps))
        pool.close()
        pool.join()
        # find the number of significant results across all bootstraps
        n_sig = np.sum(multi_results, axis=0)
    # get the p-values out
    p_val_square = squareform(1. * n_sig / bootstraps, checks=False)
    p_vals = []
    for i in range(p_val_square.shape[0]):
        for j in range(i + 1, p_val_square.shape[0]):
            p_vals.append(p_val_square[i, j])
    return p_vals
def _compute_AB(x, y, index):
    xa = np.atleast_2d(x)
    ya = np.atleast_2d(y)
    if xa.ndim > 2 or ya.ndim > 2:
        raise ValueError("x and y must be 1d or 2d array_like objects")
    if xa.shape[0] == 1:
        xa = xa.T
    if ya.shape[0] == 1:
        ya = ya.T
    if xa.shape[0] != ya.shape[0]:
        raise ValueError("x and y must have the same sample sizes")
    if index <= 0 or index > 2:
        raise ValueError("index must be in (0, 2]")

    # compute A
    a_kl = squareform(pdist(xa, 'euclidean')**index)
    a_k = np.mean(a_kl, axis=1).reshape(-1, 1)
    a_l = a_k.T
    a = np.mean(a_kl)
    A = a_kl - a_k - a_l + a

    # compute B
    b_kl = squareform(pdist(ya, 'euclidean')**index)
    b_k = np.mean(b_kl, axis=1).reshape(-1, 1)
    b_l = b_k.T
    b = np.mean(b_kl)
    B = b_kl - b_k - b_l + b

    return A, B
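# Usage sketch for _compute_AB above: the double-centered matrices A and B
# give the (squared) sample distance covariance as the mean of their
# elementwise product.
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=50)
y = 2 * x + rng.normal(size=50)
A, B = _compute_AB(x, y, index=1)
dcov2 = (A * B).mean()
print(np.sqrt(max(dcov2, 0.0)))  # sample distance covariance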
def test_pdist(self):
    for metric, argdict in self.scipy_metrics.items():
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            D_true = pdist(self.X1, metric, **kwargs)
            Dsq_true = squareform(D_true)
            dm = DistanceMetric(metric, **kwargs)
            for X in self.X1, self.spX1:
                yield self.check_pdist, metric, X, dm, Dsq_true, True
            for X in self.X1, self.spX1:
                yield self.check_pdist, metric, X, dm, D_true, False

    for rmetric, (metric, func) in self.reduced_metrics.items():
        argdict = self.scipy_metrics[metric]
        keys = list(argdict.keys())
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            D_true = func(pdist(self.X1, metric, **kwargs), **kwargs)
            Dsq_true = squareform(D_true)
            dm = DistanceMetric(rmetric, **kwargs)
            for X in self.X1, self.spX1:
                yield self.check_pdist, rmetric, X, dm, Dsq_true, True
            for X in self.X1, self.spX1:
                yield self.check_pdist, rmetric, X, dm, D_true, False
def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index)
                    for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(
                    n_clusters=k, linkage='average').fit(X)
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                centers = np.array([np.mean(X.loc[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index)
                    for i in range(k_optimal)]

    elif statistic == 'gap':
        X = np.array(df.loc[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
            gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
            k_optimal = list(gaps).index(max(gaps)) + 1
            clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
            return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise ValueError('only the db and gap statistics are supported')
def plot_clustering_similarity(results, plot_dir=None, verbose=False, ext='png'):
    HCA = results.HCA
    # get all clustering solutions
    clusterings = list(HCA.results.items())
    # plot cluster agreement across embedding spaces
    names = [k for k, v in clusterings]
    cluster_similarity = np.zeros((len(clusterings), len(clusterings)))
    cluster_similarity = pd.DataFrame(cluster_similarity, index=names, columns=names)
    distance_similarity = np.zeros((len(clusterings), len(clusterings)))
    distance_similarity = pd.DataFrame(distance_similarity, index=names, columns=names)
    for clustering1, clustering2 in combinations(clusterings, 2):
        name1 = clustering1[0].split('-')[-1]
        name2 = clustering2[0].split('-')[-1]
        # record similarity of distance_df
        dist_corr = np.corrcoef(squareform(clustering1[1]['distance_df']),
                                squareform(clustering2[1]['distance_df']))[1, 0]
        distance_similarity.loc[name1, name2] = dist_corr
        distance_similarity.loc[name2, name1] = dist_corr
        # record similarity of clustering of dendrogram
        clusters1 = clustering1[1]['labels']
        clusters2 = clustering2[1]['labels']
        rand_score = adjusted_rand_score(clusters1, clusters2)
        MI_score = adjusted_mutual_info_score(clusters1, clusters2)
        cluster_similarity.loc[name1, name2] = rand_score
        cluster_similarity.loc[name2, name1] = MI_score

    with sns.plotting_context(context='notebook', font_scale=1.4):
        clust_fig = plt.figure(figsize=(12, 12))
        sns.heatmap(cluster_similarity, square=True)
        plt.title('Cluster Similarity: TRIL: Adjusted MI, TRIU: Adjusted Rand', y=1.02)

        dist_fig = plt.figure(figsize=(12, 12))
        sns.heatmap(distance_similarity, square=True)
        plt.title('Distance Similarity, metric: %s' % HCA.dist_metric, y=1.02)

    if plot_dir is not None:
        save_figure(clust_fig,
                    path.join(plot_dir, 'cluster_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        save_figure(dist_fig,
                    path.join(plot_dir, 'distance_similarity_across_measures.%s' % ext),
                    {'bbox_inches': 'tight'})
        plt.close(clust_fig)
        plt.close(dist_fig)

    if verbose:
        # assess the relationship between the two measurements
        rand_scores = cluster_similarity.values[np.triu_indices_from(cluster_similarity, k=1)]
        MI_scores = cluster_similarity.T.values[np.triu_indices_from(cluster_similarity, k=1)]
        score_consistency = np.corrcoef(rand_scores, MI_scores)[0, 1]
        print('Correlation between measures of cluster consistency: %.2f'
              % score_consistency)
def hierarchical_consensus_cluster(
        df,
        k,
        distance__column_x_column=None,
        distance_function="euclidean",
        n_clustering=10,
        random_seed=20121020,
        linkage_method="ward",
        plot_df=True,
        directory_path=None,
):
    if distance__column_x_column is None:
        print("Computing distance with {} ...".format(distance_function))
        distance__column_x_column = DataFrame(
            squareform(pdist(df.values.T, distance_function)),
            index=df.columns,
            columns=df.columns,
        )

    print("HCC with K={} ...".format(k))
    clustering_x_column = full((n_clustering, distance__column_x_column.shape[1]), nan)
    n_per_print = max(1, n_clustering // 10)
    seed(random_seed)
    for clustering in range(n_clustering):
        if clustering % n_per_print == 0:
            print("\t(K={}) {}/{} ...".format(k, clustering + 1, n_clustering))
        random_columns_with_repeat = randint(
            0,
            high=distance__column_x_column.shape[0],
            size=distance__column_x_column.shape[0],
        )
        clustering_x_column[clustering, random_columns_with_repeat] = fcluster(
            linkage(
                squareform(distance__column_x_column.iloc[
                    random_columns_with_repeat, random_columns_with_repeat]),
                method=linkage_method,
            ),
            k,
            criterion="maxclust",
        )

    column_cluster, column_cluster__ccc = _cluster_clustering_x_element_and_compute_ccc(
        clustering_x_column, k, linkage_method)

    if directory_path is not None:
        cluster_x_column = make_membership_df_from_categorical_series(
            Series(column_cluster, index=df.columns))
        cluster_x_column.index = Index(
            ("C{}".format(cluster) for cluster in cluster_x_column.index),
            name="Cluster",
        )
        cluster_x_column.to_csv("{}/cluster_x_column.tsv".format(directory_path), sep="\t")

    if plot_df:
        print("Plotting df ...")
        file_name = "cluster.html"
        if directory_path is None:
            html_file_path = None
        else:
            html_file_path = "{}/{}".format(directory_path, file_name)
        plot_heat_map(
            df,
            normalization_axis=0,
            normalization_method="-0-",
            column_annotation=column_cluster,
            title="HCC K={} Column Cluster".format(k),
            xaxis_title=df.columns.name,
            yaxis_title=df.index.name,
            html_file_path=html_file_path,
        )

    return column_cluster, column_cluster__ccc
def average(self):
    self.unsaved_changes = True
    if hasattr(self, 'aActor'):
        self.ren.RemoveActor(self.aActor)
    self.ui.statLabel.setText("Averaging, applying grid . . .")
    QtWidgets.QApplication.processEvents()

    # temporarily shift all data such that it appears in the first cartesian quadrant
    tT = np.amin(self.rO, axis=0)
    self.rO, self.fO, self.rp, self.flp = \
        self.rO - tT, self.fO - tT, self.rp - tT, self.flp - tT

    # use max to get a 'window' for assessing grid spacing
    RefMax = np.amax(self.rO, axis=0)
    RefMin = np.amin(self.rO, axis=0)
    windowVerts = np.matrix([[0.25 * RefMin[0], 0.25 * RefMin[1]],
                             [0.25 * RefMin[0], 0.25 * RefMax[1]],
                             [0.25 * RefMax[1], 0.25 * RefMax[1]],
                             [0.25 * RefMax[0], 0.25 * RefMin[1]]])
    p = path.Path(windowVerts)
    inWindow = p.contains_points(self.rp[:, :2])  # first 2 columns of RefPoints are x and y
    windowed = self.rp[inWindow, :2]

    # populate grid size if the attribute doesn't exist
    if not hasattr(self, 'gsize'):
        gs = squareform(pdist(windowed, 'euclidean'))
        self.gsize = np.mean(np.sort(gs)[:, 1])
        self.ui.gridInd.setValue(self.gsize)
    else:
        self.gsize = self.ui.gridInd.value()

    # grid the reference based on gsize, bumping out the grid by 10% in either direction
    grid_x, grid_y = np.meshgrid(
        np.linspace(1.1 * RefMin[0], 1.1 * RefMax[0],
                    int((1.1 * RefMax[0] - 1.1 * RefMin[0]) / self.gsize)),
        np.linspace(1.1 * RefMin[1], 1.1 * RefMax[1],
                    int((1.1 * RefMax[1] - 1.1 * RefMin[1]) / self.gsize)),
        indexing='xy')

    # apply the grid to the reference data
    grid_Ref = griddata(self.rp[:, :2], self.rp[:, -1], (grid_x, grid_y), method='linear')
    # apply the grid to the aligned data
    grid_Align = griddata(self.flp[:, :2], self.flp[:, -1], (grid_x, grid_y), method='linear')

    self.ui.statLabel.setText("Averaging using grid . . .")
    QtWidgets.QApplication.processEvents()

    # average z values
    grid_Avg = (grid_Ref + grid_Align) / 2

    # make sure that there isn't anything averaged outside the floating outline
    p = path.Path(self.rO[:, :2])
    inTest = np.hstack((np.ravel(grid_x.T)[np.newaxis].T,
                        np.ravel(grid_y.T)[np.newaxis].T))
    inOutline = p.contains_points(inTest)

    # averaged points
    self.ap = np.hstack((inTest[inOutline, :],
                         np.ravel(grid_Avg.T)[np.newaxis].T[inOutline]))

    # move everything back to the original location
    self.rO, self.fO, self.rp, self.flp, self.ap = \
        self.rO + tT, self.fO + tT, self.rp + tT, self.flp + tT, self.ap + tT

    self.ui.statLabel.setText("Rendering . . .")
    QtWidgets.QApplication.processEvents()

    # show it
    color = (int(0.2784 * 255), int(0.6745 * 255), int(0.6941 * 255))
    _, self.aActor, _, = gen_point_cloud(self.ap, color, self.PointSize)
    self.ren.AddActor(self.aActor)
    s, nl, axs = self.get_scale()
    self.aActor.SetScale(s)
    self.aActor.Modified()

    # update
    self.ui.vtkWidget.update()
    self.ui.vtkWidget.setFocus()
    self.ui.statLabel.setText("Averaging complete.")
    self.averaged = True
    self.ui.averageButton.setStyleSheet("background-color :rgb(77, 209, 97);")
def proclus(X, k=2, l=3, minDeviation=0.1, A=30, B=3, niters=30, seed=1234):
    """
    Run PROCLUS on a database to obtain a set of clusters and the dimensions
    associated with each one.

    Parameters:
    ----------
    - X: the data set
    - k: the desired number of clusters
    - l: average number of dimensions per cluster
    - minDeviation: for selection of bad medoids
    - A: constant for the initial set of medoids
    - B: a smaller constant than A for the final set of medoids
    - niters: maximum number of iterations for the second phase
    - seed: seed for the RNG
    """
    np.random.seed(seed)

    N, d = X.shape

    if B > A:
        raise Exception("B has to be smaller than A.")
    if l < 2:
        raise Exception("l must be >= 2.")

    ###############################
    # 1.) Initialization phase
    ###############################
    # first find a superset of the set of k medoids by random sampling
    idxs = np.arange(N)
    np.random.shuffle(idxs)
    S = idxs[0:(A * k)]
    M = greedy(X, S, B * k)

    ###############################
    # 2.) Iterative phase
    ###############################
    BestObjective = np.inf

    # choose a random set of k medoids from M:
    Mcurr = np.random.permutation(M)[0:k]  # M current
    Mbest = None  # best set of medoids found

    D = squareform(pdist(X))  # precompute the euclidean distance matrix

    it = 0  # iteration counter
    L = []  # locality sets of the medoids, i.e., points within delta_i of m_i.
    Dis = []  # important dimensions for each cluster
    assigns = []  # cluster membership assignments

    while True:
        it += 1
        L = []
        for i in range(len(Mcurr)):
            mi = Mcurr[i]
            # compute delta_i, the distance to the nearest medoid of m_i:
            di = D[mi, np.setdiff1d(Mcurr, mi)].min()
            # compute L_i, the points in the sphere centered at m_i with radius d_i
            L.append(np.where(D[mi] <= di)[0])

        # find dimensions:
        Dis = findDimensions(X, k, l, L, Mcurr)

        # form the clusters:
        assigns = assignPoints(X, Mcurr, Dis)

        # evaluate the clusters:
        ObjectiveFunction = evaluateClusters(X, assigns, Dis, Mcurr)

        badM = []  # bad medoids
        Mold = Mcurr.copy()

        if ObjectiveFunction < BestObjective:
            BestObjective = ObjectiveFunction
            Mbest = Mcurr.copy()
            # compute the bad medoids in Mbest:
            badM = computeBadMedoids(X, assigns, Dis, Mcurr, minDeviation)
            print("bad medoids:")
            print(badM)

        if len(badM) > 0:
            # replace the bad medoids with random points from M:
            print("old mcurr:")
            print(Mcurr)
            Mavail = np.setdiff1d(M, Mbest)
            newSel = np.random.choice(Mavail, size=len(badM), replace=False)
            Mcurr = np.setdiff1d(Mbest, badM)
            Mcurr = np.union1d(Mcurr, newSel)
            print("new mcurr:")
            print(Mcurr)

        print("finished iter: %d" % it)

        if np.allclose(Mold, Mcurr) or it >= niters:
            break

    print("finished iterative phase...")

    ###############################
    # 3.) Refinement phase
    ###############################
    # compute a new L based on the assignments:
    L = []
    for i in range(len(Mcurr)):
        mi = Mcurr[i]
        L.append(np.where(assigns == mi)[0])

    Dis = findDimensions(X, k, l, L, Mcurr)
    assigns = assignPoints(X, Mcurr, Dis)

    # handle outliers:
    # smallest Manhattan segmental distance of m_i to all (k-1) other medoids
    # with respect to D_i:
    deltais = np.zeros(k)
    for i in range(k):
        minDist = np.inf
        for j in range(k):
            if j != i:
                dist = manhattanSegmentalDist(X[Mcurr[i]], X[Mcurr[j]], Dis[i])
                if dist < minDist:
                    minDist = dist
        deltais[i] = minDist

    # mark as outliers the points that are not within delta_i of any m_i:
    for i in range(len(assigns)):
        clustered = False
        for j in range(k):
            d = manhattanSegmentalDist(X[Mcurr[j]], X[i], Dis[j])
            if d <= deltais[j]:
                clustered = True
                break
        if not clustered:
            assigns[i] = -1

    return (Mcurr, Dis, assigns)
random.seed(seed)
np.random.seed(seed)
idx = np.random.permutation(data.index)

# Calculate indexes for the numpy array
cluster_index = int(len(col_names))
distance_cluster_index = cluster_index + 1

###################################################################
# LOCAL                                                           #
###################################################################
# Start timing
start_time = time.perf_counter()

D = squareform(pdist(data))
max_distance, [I_row, I_col] = np.nanmax(D), np.unravel_index(np.argmax(D), D.shape)
n_restrictions = (((len(restrictions.index)**2)
                   - (restrictions.isin([0]).sum().sum())) / 2) - data.shape[0]
lambda_value = (max_distance / n_restrictions) * lambda_var

# Generate neighbourhood
possible_changes = []
for i in range(len(data.index)):
    for w in range(k):
        possible_changes.append((i, w))
np.random.shuffle(possible_changes)

# Generate initial solution
data['cluster'] = np.random.randint(0, k, data.shape[0])
from scipy.sparse import csr_matrix
from scipy.spatial.distance import pdist, squareform
from scipy.spatial.qhull import QhullError
from sklearn.exceptions import NotFittedError

from gtda.homology import VietorisRipsPersistence, SparseRipsPersistence, \
    WeakAlphaPersistence, EuclideanCechPersistence, FlagserPersistence

pio.renderers.default = 'plotly_mimetype'

X_pc = np.array([[[2., 2.47942554],
                  [2.47942554, 2.84147098],
                  [2.98935825, 2.79848711],
                  [2.79848711, 2.41211849],
                  [2.41211849, 1.92484888]]])
X_pc_list = list(X_pc)
X_dist = np.array([squareform(pdist(x)) for x in X_pc])
X_dist_list = list(X_dist)
X_pc_sparse = [csr_matrix(x) for x in X_pc]
X_dist_sparse = [csr_matrix(x) for x in X_dist]

X_dist_disconnected = np.array([[[0, np.inf], [np.inf, 0]]])

# 8-point sampling of a noisy circle
X_circle = np.array([[[1.00399159, -0.00797583],
                      [0.70821787, 0.68571714],
                      [-0.73369765, -0.71298056],
                      [0.01110395, -1.03739883],
                      [-0.64968271, 0.7011624],
                      [0.03895963, 0.94494511],
                      [0.76291108, -0.68774373],
                      [-1.01932365, -0.05793851]]])


def test_vrp_params():
    np.save(abs_corr_array_path, abs_corr_array.data)
else:
    abs_corr_array = np.load(abs_corr_array_path)

print(abs_corr_array.shape)

# -- calculate linkage and clusters
if IS_CALCULATE_LINKAGE:
    # -- load the correlation matrix
    abs_corr_array = np.load(abs_corr_array_path)
    # -- transform the correlation matrix into a distance measure
    abs_corr_dist_arr = np.around(1 - abs_corr_array, 7)
    # -- transform the distance matrix into a condensed distance matrix
    dist_corr = spdst.squareform(abs_corr_dist_arr)
    # -- force calculation of the linkage
    is_force_calc_link_arr = True
else:
    # -- skip calculation and load the linkage from link_arr_path
    is_force_calc_link_arr = False
    abs_corr_dist_arr = None

# -- clusters of indices in the abs_corr_dist_arr array
cluster_lst, cluster_size_lst = fap.compute_clusters_from_dist(
    abs_corr_dist_arr=abs_corr_dist_arr,
    link_arr_path=link_arr_path,
    is_force_calc_link_arr=is_force_calc_link_arr)
def find_max_distance(A):
    """
    Returns the maximum pairwise distance within a set of points, where each
    point is represented by an x,y coordinate.
    """
    return nanmax(squareform(pdist(A)))
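# Usage sketch for find_max_distance above (assumes nanmax, pdist and
# squareform are in scope, as the function body requires):
import numpy as np

pts = np.array([[0., 0.], [3., 4.], [1., 1.]])
print(find_max_distance(pts))  # 5.0, from the (0,0)-(3,4) pair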
def calculate_distance(matrix, metric):
    distance_matrix = pdist(matrix, metric=metric)
    distance_matrix = squareform(distance_matrix)
    return distance_matrix
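# Usage sketch for calculate_distance above: a cosine distance matrix for
# three vectors (square, symmetric, zero diagonal).
import numpy as np

m = np.array([[1., 0.], [0., 1.], [1., 1.]])
D = calculate_distance(m, 'cosine')
print(D.shape)  # (3, 3)
print(D[0, 1])  # 1.0: orthogonal vectors have cosine distance 1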
def update_plot(self, x, y_target=None, n_bold=3, show_forward=True):
    plt.gcf().clear()
    x = self.unflatten_coeffs(np.array(x))
    points = self.trace_fourier_curves(x)
    for i in range(len(points)):
        plt.plot(points[i, :, 0], points[i, :, 1],
                 c=(0, 0, 0, min(1, 10 / len(points))))
        if i >= len(points) - n_bold:
            plt.plot(points[i, :, 0], points[i, :, 1], c=(0, 0, 0))
            if show_forward:
                if y_target is not None:
                    aspect_ratio, circularity, angle = y_target
                    # Visualize circularity
                    star = np.array((4, 4)) + .5 * star_with_given_circularity(circularity)
                    plt.plot(star[:, 0], star[:, 1], c=(0, 0, 0, .25), lw=1)
                    # Visualize aspect ratio and angle
                    rect = np.array((4, 2.5)) + .4 * rect_with_given_aspect_and_angle(aspect_ratio, angle)
                    plt.plot(rect[:, 0], rect[:, 1], c=(0, 0, 0, .25), lw=1)

                # Find the largest diameter of the shape
                d = squareform(pdist(points[i]))
                max_idx = np.unravel_index(d.argmax(), d.shape)
                p0, p1 = points[i, max_idx[0]], points[i, max_idx[1]]
                angle = np.arctan2((p1 - p0)[1], (p1 - p0)[0])
                max_diameter = d[max_idx]
                # Plot
                d0, d1 = points[i, max_idx[0]], points[i, max_idx[1]]
                plt.plot([d0[0], d1[0]], [d0[1], d1[1]], c=(0, 1, 0), ls='-', lw=1)
                plt.scatter([d0[0], d1[0]], [d0[1], d1[1]], c=[(0, 1, 0)], s=3, zorder=10)

                if y_target is not None:
                    # Find the largest width orthogonal to the diameter
                    c, s = np.cos(angle), np.sin(angle)
                    rotation = np.matrix([[c, s], [-s, c]])
                    p_rotated = np.dot(rotation, points[i].T).T
                    min_diameter = np.max(p_rotated[:, 1]) - np.min(p_rotated[:, 1])
                    # Aspect ratio & circularity
                    aspect_ratio = min_diameter / max_diameter
                    shape = geo.Polygon(points[i])
                    circularity = 4 * np.pi * shape.area / shape.length**2
                    # Visualize circularity
                    star = np.array((4, 4)) + .5 * star_with_given_circularity(circularity)
                    plt.plot(star[:, 0], star[:, 1], c=(0, 1, 0, .5), ls='-', lw=1)
                    # Visualize aspect ratio and angle
                    rect = np.array((4, 2.5)) + .4 * rect_with_given_aspect_and_angle(aspect_ratio, angle)
                    plt.plot(rect[:, 0], rect[:, 1], c=(0, 1, 0, .5), ls='-', lw=1)
    plt.axis('equal')
    plt.axis([min(-5, points[:, :, 0].min() - 1),
              max(5, points[:, :, 0].max() + 1),
              min(-5, points[:, :, 1].min() - 1),
              max(5, points[:, :, 1].max() + 1)])
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2,
                       stWin=0.05, LDAdim=35, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording
                           (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin (opt)      short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT (opt)       0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = audioBasicIO.readAudioFile(fileName)
    print(fileName)
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1,
     computeBEAT1] = aT.loadKNNModel(os.path.join("data", "knnSpeakerAll"))
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2,
     computeBEAT2] = aT.loadKNNModel(os.path.join("data", "knnSpeakerFemaleMale"))

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(
        x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2),
         MidTermFeatures.shape[1]))
    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001
    MidTermFeatures = MidTermFeatures2

    # SELECT FEATURES: SET 1A
    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41,
                       42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(
        distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])
        for i in range(numOfFeatures):  # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)

        mtFeaturesToReduce2 = numpy.zeros(
            (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2),
             mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]

        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T

        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []

    for iSpeakers in sRange:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_
        clsAll.append(cls)
        centersAll.append(means)
        silA = []
        silB = []
        for c in range(iSpeakers):  # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                # get the subset of feature vectors that belong to the cluster
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
                # compute the average distance between samples that belong to
                # the cluster (a values)
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)
                silA.append(numpy.mean(Yt) * clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):  # compute distances from samples of other clusters
                    if c2 != c:
                        clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e. the distance from the
                # "nearest" cluster)
                silB.append(min(silBs))
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):  # for each cluster (speaker)
            # compute the silhouette
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))
        silAll.append(numpy.mean(sil))  # keep the AVERAGE SILHOUETTE

    imax = numpy.argmax(silAll)  # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]  # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows; this is achieved by
    # giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows, ))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i - iNonOutLiers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: HMM smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(
            MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")  # hmm training
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)

    # Post-process method 2: median filtering
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]  # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground truth if available
    gtFile = fileName.replace('.wav', '.segments')  # the annotated file
    if os.path.isfile(gtFile):  # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)  # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)  # convert to flags

    if os.path.isfile(gtFile):
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
        print("{0:.1f}\t{1:.1f}".format(100 * purityClusterMean,
                                        100 * puritySpeakerMean))
    return cls
def rbf(X, sigma=0.5):
    pairwise_dists = squareform(pdist(X, 'euclidean'))
    A = np.exp(-pairwise_dists**2 / (2. * sigma**2))  # np.exp; scipy.exp is removed
    return A
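# Usage sketch for rbf above: an RBF affinity matrix, e.g. as input to
# spectral clustering (assumes pdist/squareform are imported, as the function
# body requires).
import numpy as np

X = np.random.rand(10, 3)
A = rbf(X, sigma=1.0)
print(A.shape)  # (10, 10); A[i, i] == 1 and all values lie in (0, 1]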
    for c_num in range(len(coords)):
        if c_num not in assigned:
            cluster_info[c_num] = {
                'members': [c_num],
                'centroid': coords[c_num],  # singleton cluster: its centroid is its own coordinate
                'int_time': 1.0
            }
    return cluster_info


coords, targets = load_targets('test_targets.dat')
labellist = targets['target']
seps = calc_separation(coords)
dist = ssd.squareform(seps)
linked = linkage(dist, method='complete', optimal_ordering=True)
all_clusters = extract_levels(linked, labellist)
cluster_info = do_clustering(coords, all_clusters, seps)

fig = plt.figure()
ax = fig.add_subplot(111)
for c_num in cluster_info.keys():
    cluster_data = cluster_info[c_num]
    indices = cluster_data['members']
    num_coords = len(indices)
print(type(soap_water))

# Average output
average_soap = SOAP(
    species=species,
    rcut=rcut,
    nmax=nmax,
    lmax=lmax,
    average=True,
    sparse=False
)

soap_water = average_soap.create(water)
print("average soap water", soap_water.shape)

methanol = molecule('CH3OH')
soap_methanol = average_soap.create(methanol)
print("average soap methanol", soap_methanol.shape)

h2o2 = molecule('H2O2')
soap_peroxide = average_soap.create(h2o2)

# Distance
from scipy.spatial.distance import pdist, squareform
import numpy as np

molecules = np.vstack([soap_water, soap_methanol, soap_peroxide])
distance = squareform(pdist(molecules))
print("distance matrix: water - methanol - H2O2")
print(distance)
def martin98(locations, E_incident, permittivity, location_sizes, wavelength,
             step_size):
    """ Basic implementation of the algorithm as proposed in [Olivier J. F.
    Martin and Nicolas B. Piller, Electromagnetic scattering in polarizable
    backgrounds].

    Parameters
    ----------
    locations : numpy array
        Array containing the locations where the E field must be evaluated.
    E_incident : numpy array
        Array containing the value of the incident E field at each location.
    permittivity : numpy array
        Array containing the permittivity at each location.
    location_sizes : numpy array
        Array containing the size of the samples at each location.
    wavelength : float
        Wavelength of the incident wave.
    step_size : float
        Minimal distance between samples.

    Returns
    -------
    E_r : numpy array
        Scattered E field at each location.
    """
    # Find the number of locations
    nloc = np.shape(locations)[0]
    # Relative permittivity of the background
    epsilon_B = 1
    # Wave number
    # k_0 = 2*np.pi*frequency/speed_of_light
    k_0 = 2 * np.pi / wavelength
    k_rho = k_0 * np.sqrt(epsilon_B)
    # Calculate the distance between all points in the plane
    varrho = pdist(locations, 'euclidean')
    # Calculate the G matrix
    G_condensed = 1j / 4 * hankel1(0, k_rho * varrho)
    # Convert the condensed G matrix to square form
    G = squareform(G_condensed)
    # Volume of each location
    V_mesh = np.square(location_sizes * step_size)
    # Self contribution to the electric field
    R_eff = np.sqrt(V_mesh / np.pi)  # effective radius
    beta = 1  # no coupling between TE and TM polarizations
    gamma = R_eff / k_rho * hankel1(1, k_rho * R_eff) + 2j / (np.pi * np.square(k_rho))
    M = 1j * np.pi / 2 * beta * gamma
    # Set the diagonal of the G matrix to the self term M / V_mesh
    np.fill_diagonal(G, M / V_mesh)
    # Difference between the background permittivity and the permittivity at
    # a specific location
    Delta_epsilon = permittivity - epsilon_B
    # Total E field (vector)
    E_r = np.linalg.inv(
        np.identity(nloc) - k_0**2 * G @ np.diag(Delta_epsilon * V_mesh)) @ E_incident
    return E_r
def query(oracle, query, trn_type=1, smooth=False, weight=0.5):
    """Return the closest path in the target oracle given a query sequence.

    Args:
        oracle: an oracle object already learned, the target.
        query: the query sequence in a matrix form such that the ith row is
            the feature at the ith time point.
        trn_type: which transition-creation function to use (1: self,
            2: sfx/rsfx, anything else: default).
        smooth: (off-line only) whether to smooth using pairwise distances.
        weight: smoothing weight.
    """
    N = len(query)
    K = oracle.num_clusters()
    P = [[0] * K for _i in range(N)]
    if smooth:
        D = dist.pdist(oracle.f_array[1:], 'sqeuclidean')
        D = dist.squareform(D, checks=False)
        map_k_outer = partial(_query_k, oracle=oracle, query=query,
                              smooth=smooth, D=D, weight=weight)
    else:
        map_k_outer = partial(_query_k, oracle=oracle, query=query)

    map_query = partial(_query_init, oracle=oracle, query=query[0])
    P[0], C = zip(*map(map_query, oracle.rsfx[0][:]))
    P[0] = list(P[0])
    C = np.array(C)

    if trn_type == 1:
        trn = _create_trn_self
    elif trn_type == 2:
        trn = _create_trn_sfx_rsfx
    else:
        trn = _create_trn

    argmin = np.argmin
    distance_cache = np.zeros(oracle.n_states)
    for i in range(1, N):  # iterate over the rest of the query
        state_cache = []
        dist_cache = distance_cache
        map_k_inner = partial(map_k_outer, i=i, P=P, trn=trn,
                              state_cache=state_cache, dist_cache=dist_cache)
        P[i], _c = zip(*map(map_k_inner, range(K)))
        P[i] = list(P[i])
        C += np.array(_c)

    i_hat = argmin(C)
    # list() is needed on Python 3, where map returns a lazy iterator
    P = list(map(list, zip(*P)))
    return P, C, i_hat
import os

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial import distance

np.set_printoptions(precision=1, suppress=True)  # limit printed decimals to 1

os.chdir('datos')

# Simple read of the already-cleaned data
df = pd.read_csv("tirosL.csv")  # the data is already clean
# df = df.sample(10000)
X = df.head(1000)

# Convert the distance vector to a square matrix
md = distance.squareform(distance.pdist(X, 'euclidean'))
print(md)

Z = linkage(X, 'complete')
plt.figure(figsize=(12, 5))
dendrogram(Z, truncate_mode='lastp', p=5, show_leaf_counts=True,
           leaf_font_size=14.)
# dendrogram(Z, leaf_font_size=14)
plt.show()
                                               facecolor=box_color,
                                               edgecolor=box_color,
                                               linewidth=basewidth,
                                               clip_on=False))
        loading_axes[task_i].hlines(i + .4, -2, -.5,
                                    color=box_color,
                                    clip_on=False,
                                    linewidth=basewidth,
                                    linestyle=':')

# ****************************************************************************
# Distance Matrices
# ****************************************************************************
participant_distances = squareform(abs_pdist(data.T))
# note: the line above is immediately replaced by the clustered version
participant_distances = results['task'].HCA.results['data']['clustered_df']
loading_distances = results['task'].HCA.results['EFA5_oblimin']['clustered_df']

sns.heatmap(participant_distances, ax=participant_distance,
            cmap=ListedColormap(sns.color_palette('gray', n_colors=100)),
            xticklabels=False, yticklabels=False,
            square=True, cbar=False, linewidth=0)
sns.heatmap(loading_distances, ax=loading_distance,
            xticklabels=False, yticklabels=False,
            square=True,
# print(cls.inertia_)
# labels = cls.labels_

###############################################################################
# Visualize the results on t-SNE-reduced data (PCA is only the initialization)
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
reduced_data = tsne.fit_transform(X)
plt.scatter(reduced_data[:, 0], reduced_data[:, 1])
plt.savefig(cur_file_dir + 'result/' + 'user_dr_tsne.png')
# plt.show()
plt.cla()
plt.clf()
plt.close()

# Compute DBSCAN
# D = distance.squareform(distance.pdist(X))            # high-dimensional data
D = distance.squareform(distance.pdist(reduced_data))   # low-dimensional data
D = np.sort(D, axis=0)
minPts = 10
nearest = D[1:(minPts + 1), :]
nearest = nearest.reshape(1, nearest.size)
sort_nearest = np.sort(nearest)
plt.plot(range(len(sort_nearest[0, :])), sort_nearest[0, :],
         linewidth=1.0, marker='x')
# plt.axis([-2, len(sort_nearest[0, :]) + 1000, -2, max(sort_nearest[0, :]) + 2])
plt.savefig(cur_file_dir + 'result/' + 'nearest.png')
plt.cla()
plt.clf()
plt.close()

# db = DBSCAN(eps=0.90, min_samples=minPts).fit(X)      # high-dimensional data
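# The sorted k-nearest-neighbour curve above is the usual heuristic for
# choosing DBSCAN's eps: pick a value near the "elbow" where distances start
# to jump. A self-contained sketch (toy data; reading eps off a fixed
# percentile instead of an eyeballed elbow is an assumption):
import numpy as np
from scipy.spatial import distance
from sklearn.cluster import DBSCAN

X_demo = np.random.rand(300, 2)
D_demo = np.sort(distance.squareform(distance.pdist(X_demo)), axis=0)
k = 10
kth_nn = np.sort(D_demo[k, :])               # each point's k-th neighbour distance
eps_guess = kth_nn[int(0.95 * len(kth_nn))]  # crude stand-in for the elbow
labels_demo = DBSCAN(eps=eps_guess, min_samples=k).fit_predict(X_demo)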
def _krige(X, y, coords, variogram_function, variogram_model_parameters,
           coordinates_type):
    """Sets up and solves the ordinary kriging system for the given
    coordinate pair. This function is only used for the statistics
    calculations.

    Parameters
    ----------
    X: ndarray
        float array [n_samples, n_dim], the input array of coordinates
    y: ndarray
        float array [n_samples], the input array of measurement values
    coords: ndarray
        float array [1, n_dim], point at which to evaluate the kriging system
    variogram_function: callable
        function that will be called to evaluate variogram model
    variogram_model_parameters: list
        user-specified parameters for variogram model
    coordinates_type: str
        type of coordinates in X array, can be 'euclidean' for standard
        rectangular coordinates or 'geographic' if the coordinates are lat/lon

    Returns
    -------
    zinterp: float
        kriging estimate at the specified point
    sigmasq: float
        mean square error of the kriging estimate
    """
    zero_index = None
    zero_value = False

    # calculate distance between points... need a square distance matrix
    # of inter-measurement-point distances and a vector of distances between
    # measurement points (X) and the kriging point (coords)
    if coordinates_type == 'euclidean':
        d = squareform(pdist(X, metric='euclidean'))
        bd = np.squeeze(cdist(X, coords[None, :], metric='euclidean'))

    # geographic coordinate distances still calculated in the old way...
    # assume X[:, 0] ('x') => lon, X[:, 1] ('y') => lat
    # also assume problem is 2D; check done earlier in initializing variogram
    elif coordinates_type == 'geographic':
        x1, x2 = np.meshgrid(X[:, 0], X[:, 0], sparse=True)
        y1, y2 = np.meshgrid(X[:, 1], X[:, 1], sparse=True)
        d = great_circle_distance(x1, y1, x2, y2)
        bd = great_circle_distance(X[:, 0], X[:, 1],
                                   coords[0] * np.ones(X.shape[0]),
                                   coords[1] * np.ones(X.shape[0]))

    # this check is done when initializing variogram, but kept here anyways...
    else:
        raise ValueError("Specified coordinate type '%s' "
                         "is not supported." % coordinates_type)

    # check if kriging point overlaps with measurement point
    if np.any(np.absolute(bd) <= 1e-10):
        zero_value = True
        zero_index = np.where(bd <= 1e-10)[0][0]

    # set up kriging matrix
    n = X.shape[0]
    a = np.zeros((n + 1, n + 1))
    a[:n, :n] = -variogram_function(variogram_model_parameters, d)
    np.fill_diagonal(a, 0.0)
    a[n, :] = 1.0
    a[:, n] = 1.0
    a[n, n] = 0.0

    # set up RHS
    b = np.zeros((n + 1, 1))
    b[:n, 0] = -variogram_function(variogram_model_parameters, bd)
    if zero_value:
        b[zero_index, 0] = 0.0
    b[n, 0] = 1.0

    # solve
    res = np.linalg.solve(a, b)
    zinterp = np.sum(res[:n, 0] * y)
    sigmasq = np.sum(res[:, 0] * -b[:, 0])

    return zinterp, sigmasq
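# A hedged sketch of calling _krige directly (it is normally internal to the
# kriging statistics routines). The linear variogram and toy data are
# assumptions for illustration; np, pdist, squareform, and cdist are assumed
# imported at module level as the function requires.
import numpy as np

def linear_variogram(params, d):
    # variogram_function is called as f(parameter_list, distances)
    slope, nugget = params
    return slope * d + nugget

X_demo = np.random.rand(25, 2)
y_demo = np.sin(3 * X_demo[:, 0]) + np.cos(3 * X_demo[:, 1])
point = np.array([0.5, 0.5])
z_hat, ss = _krige(X_demo, y_demo, point, linear_variogram, [1.0, 0.0],
                   'euclidean')
print(z_hat, ss)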
    def _calc_max_dist(self):
        # Simplest possible max-distance measure: the largest pairwise
        # distance between any two points
        return distance.squareform(distance.pdist(self.points)).max()
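# An observation, not a change to the class above: the squareform round-trip
# is unnecessary for the maximum, since the condensed vector from pdist
# already contains every pairwise distance exactly once:
# return distance.pdist(self.points).max()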
df = pd.DataFrame({'Max/Min topics': column1, 'Nights': column2,
                   'Number of topics': column3, 'Topics': column4})
print(df)  # show the data frame

# create an excel file from the data frame using the xlsxwriter engine
# (xlwt itself is a module, not a writer, and only handles legacy .xls)
writer = pd.ExcelWriter('table of max and min number topics.xlsx',
                        engine='xlsxwriter')
workbook = writer.book  # define the excel workbook
df.to_excel(writer, 'Sheet1')  # place the data frame on the first sheet
worksheet = writer.sheets['Sheet1']  # define the worksheet
# set the column width for columns B up to Q, so all cell text is visible
worksheet.set_column('B:Q', 35)
writer.save()

###########################################
# Hierarchical clustering with topic model
###########################################
# 'cosine' is one of the metrics pdist can use to measure distances between
# documents; we use it because it works well for topic clustering
dm = squareform(pdist(X, 'cosine'))

# creating a linkage matrix
linkage_object = linkage(dm, method='ward', metric='euclidean')
print(linkage_object)
# linkage_object[i] tells us which clusters were merged in the i-th pass

# calculate a full dendrogram
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(linkage_object, leaf_rotation=90., leaf_font_size=8.)
plt.show()

# we create a truncated dendrogram, which only shows the last p=15 of our
# 989 merges.
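# Caveat on the linkage call above: given a square matrix, scipy's linkage
# treats each row as a raw observation rather than as precomputed distances.
# To cluster on the cosine distances themselves, pass the condensed vector
# directly; a runnable sketch on toy data (whether this was the intent in
# the original is an assumption, and 'average' linkage is used because ward
# formally expects Euclidean distances):
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage

X_demo = np.random.rand(10, 4)
Z_demo = linkage(pdist(X_demo, 'cosine'), method='average')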
def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
                            force_all_finite=True, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.
      ['nan_euclidean'] is also valid but does not yet support sparse
      matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The
        callable should take two arrays from X as input and return a value
        indicating the distance between them.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.
        - 'allow-nan': accept only np.nan values in array. Values cannot
          be infinite.

        .. versionadded:: 0.22

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix,
        in order to limit memory usage.
    paired_distances : Computes the distances between corresponding
        elements of two arrays
    """
    if (metric not in _VALID_METRICS and not callable(metric)
            and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True,
                                     force_all_finite=force_all_finite)
        whom = ("`pairwise_distances`. Precomputed distances "
                "need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X
    elif ((metric == 'cosine') and (Y is None) and (not issparse(X))
            and X.dtype == np.float64):
        return _daal4py_cosine_distance_dense(X)
    elif ((metric == 'correlation') and (Y is None) and (not issparse(X))
            and X.dtype == np.float64):
        return _daal4py_correlation_distance_dense(X)
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric,
                       force_all_finite=force_all_finite, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if (dtype == bool and
                (X.dtype != bool or (Y is not None and Y.dtype != bool))):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype,
                                     force_all_finite=force_all_finite)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
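# A minimal call sketch for daal_pairwise_distances (toy data; assumes the
# daal4py helpers and sklearn utilities imported by the surrounding module
# are available):
import numpy as np

X_demo = np.random.rand(6, 3).astype(np.float64)
D_cos = daal_pairwise_distances(X_demo, metric='cosine')     # daal4py fast path
D_euc = daal_pairwise_distances(X_demo, metric='euclidean')  # sklearn path
print(D_cos.shape, D_euc.shape)  # both (6, 6)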
def omeClust(data, metadata=config.metadata, resolution=config.resolution,
             output_dir=config.output_dir,
             estimated_number_of_clusters=config.estimated_number_of_clusters,
             linkage_method=config.linkage_method,
             plot=config.plot, size_to_plot=None,
             enrichment_method="nmi"):
    # read input files
    data = pd.read_table(data, index_col=0, header=0)

    if metadata is not None:
        metadata = pd.read_table(metadata, index_col=0, header=0)
        ind = metadata.index.intersection(data.index)
        if len(ind) != data.shape[0]:
            print("the data and metadata have different numbers of rows; "
                  "the number of common rows is:", len(ind))
            print("The number of missing metadata are:",
                  data.shape[0] - len(ind))
            # pad the metadata with empty rows for samples that lack metadata
            diff_rows = data.index.difference(metadata.index)
            empty_section_metadata = pd.DataFrame(index=diff_rows,
                                                  columns=metadata.columns)
            metadata = pd.concat([metadata, empty_section_metadata])
            metadata = metadata.loc[data.index, :]

    config.output_dir = output_dir
    check_requirements()

    # if the input is already a square table (columns match the index),
    # treat it as a distance matrix; otherwise compute pairwise distances
    data_flag = True
    if all(a == b for (a, b) in zip(data.columns, data.index)):
        df_distance = data
        data_flag = False
    else:
        df_distance = pd.DataFrame(
            squareform(pdist(data, metric=distance.pDistance)),
            index=data.index, columns=data.index)
    df_distance = df_distance[df_distance.values.sum(axis=1) != 0]
    df_distance = df_distance[df_distance.values.sum(axis=0) != 0]
    df_distance.to_csv(output_dir + '/adist.txt', sep='\t')

    clusters = main_run(
        distance_matrix=df_distance,
        number_of_estimated_clusters=estimated_number_of_clusters,
        linkage_method=linkage_method,
        output_dir=output_dir,
        do_plot=plot,
        resolution=resolution)

    omeClust_enrichment_scores, sorted_keys = None, None
    shapeby = None
    if metadata is not None:
        omeClust_enrichment_scores, sorted_keys = \
            utilities.omeClust_enrichment_score(clusters, metadata,
                                                method=enrichment_method)
        if len(sorted_keys) > 3:
            shapeby = sorted_keys[3]
            print(shapeby, " is the most influential metadata in clusters")
    else:
        omeClust_enrichment_scores, sorted_keys = \
            utilities.omeClust_enrichment_score(clusters, metadata,
                                                method=enrichment_method)

    dataprocess.write_output(clusters, output_dir, df_distance,
                             omeClust_enrichment_scores, sorted_keys)
    feature2cluster = dataprocess.feature2cluster(clusters, df_distance)
    feature2cluster_map = pd.DataFrame.from_dict(feature2cluster,
                                                 orient='index',
                                                 columns=['Cluster'])
    feature2cluster_map = feature2cluster_map.loc[data.index, :]
    feature2cluster_map.to_csv(output_dir + '/feature_cluster_label.txt',
                               sep='\t')

    if plot:
        if size_to_plot is None:
            size_to_plot = config.size_to_plot
        # each ordination plot is optional; failures are skipped silently
        for ord_plot in (viz.pcoa_ord, viz.tsne_ord, viz.pca_ord,
                         viz.mds_ord):
            try:
                ord_plot(df_distance,
                         cluster_members=dataprocess.cluster2dict(clusters),
                         size_tobe_colored=size_to_plot,
                         metadata=metadata,
                         shapeby=shapeby)
            except Exception:
                pass

        # draw network
        max_dist = max(omeClust_enrichment_scores['branch_condensed_distance'])
        min_weight = df_distance.max().max() - max_dist
        viz.network_plot(
            D=df_distance,
            partition=dataprocess.feature2cluster(clusters, D=df_distance),
            min_weight=min_weight)

    return feature2cluster_map
axs = axs.ravel()
colors = cm.rainbow(np.linspace(0, 1, T))
for i in range(T):
    axs[i].scatter(x_latent_time[i, :, 0], x_latent_time[i, :, 1],
                   color=colors[i], alpha=0.3)
    axs[i].set_title(str(i))
plt.show()

# **** plot some individual paths (by initial state?)
xdata_time = np.concatenate((dataset['state'], dataset['control']), axis=1)
xdata_time = xdata_time[:N, :, :]
xdata_time = np.transpose(xdata_time, (2, 0, 1))  # obs x knots x dim

# get distance metric of initial conditions
xdata_x0 = np.squeeze(xdata_time[1, :, 0:4])  # drop control
from scipy.spatial.distance import pdist, squareform
y = squareform(pdist(xdata_x0, 'euclidean'))

# now get examples from one point, and plot sorted by distance to example point
import random
ind = random.randint(0, N - 1)  # randint's upper bound is inclusive; N would index out of bounds
dist = y[ind, :]
idx = np.argsort(dist)

# plot vae examples
plt.scatter(x_latent[:, 0], x_latent[:, 1])
plt.xlabel('VAE 1')
plt.ylabel('VAE 2')
numex = 10
colors = cm.rainbow(np.linspace(0, 1, numex))
for i in range(numex):
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

mat = np.array([[0.0, 2.0, 6.0, 10.0, 9.0],
                [2.0, 0.0, 5.0, 9.0, 8.0],
                [6.0, 5.0, 0.0, 4.0, 5.0],
                [10.0, 9.0, 4.0, 0.0, 3.0],
                [9.0, 8.0, 5.0, 3.0, 0.0]])
# condense the symmetric distance matrix before handing it to linkage
dists = squareform(mat)
linkage_matrix = linkage(dists, "single")
dendrogram(linkage_matrix, labels=["0", "1", "2", "3", "4"])
plt.title("test")
plt.show()

# How to calculate a distance matrix directly
from scipy.spatial import distance_matrix
p = dataset.iloc[:5, [2, 4]].values
distance_matrix(p, p)

# cosine similarity from the Gram matrix: divide by the row norms
d = np.dot(p, p.T)
norm = np.sqrt((p ** 2).sum(1, keepdims=True))  # row norms, shape (5, 1)
d / norm / norm.T  # cosine similarity between every pair of rows
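# Once the linkage matrix exists, flat clusters can be read off with
# fcluster; a short sketch continuing from linkage_matrix above (the
# two-cluster cut is an illustrative assumption):
from scipy.cluster.hierarchy import fcluster
print(fcluster(linkage_matrix, t=2, criterion='maxclust'))
# [1 1 2 2 2]: points 0-1 and 2-4 form the two flat clusters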
def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.
      Also, ['masked_euclidean'] but it does not yet support sparse matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The
        callable should take two arrays from X as input and return a value
        indicating the distance between them.

    n_jobs : int
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix,
        in order to limit memory usage.
    paired_distances : Computes the distances between corresponding
        elements of two arrays
    """
    if (metric not in _VALID_METRICS and not callable(metric)
            and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric in _MASKED_METRICS or callable(metric):
        missing_values = kwds.get("missing_values") if kwds.get(
            "missing_values") is not None else np.nan

        if np.all(_get_mask(X.data if issparse(X) else X, missing_values)):
            raise ValueError(
                "One or more samples(s) only have missing values.")

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        return X
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype)

        if n_jobs == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
def CosineScore(M):
    # square matrix of pairwise cosine distances between the rows of M
    cos_M = squareform(pdist(M, 'cosine'))
    # column-wise softmax converts each column of distances into weights
    alpha_cos = softmax(cos_M, axis=0)
    # score each row by the total weight it receives across columns
    return np.sum(alpha_cos, axis=1)
plt.show()

H = linkage(dataset1, 'complete')
plt.figure(figsize=(10, 10))
dendro = dendrogram(H, leaf_font_size=30)
plt.title('Dendrogram on microstructural dataset using complete linkage')
plt.show()

H = linkage(dataset1, 'single', metric='correlation')
plt.figure(figsize=(10, 10))
dendro = dendrogram(H, leaf_font_size=30)
plt.title('Dendrogram on microstructural dataset using single linkage')
plt.show()

# Distance matrix
dm = squareform(pdist(dataset1))

# For euclidean
h = sns.clustermap(dm, metric='euclidean')
plt.show()
# For jaccard
h = sns.clustermap(dm, metric='jaccard')
plt.show()
# For correlation
h = sns.clustermap(dm, metric='correlation')
plt.show()
# For single linkage
h = sns.clustermap(dm, method='single')
plt.show()

# Gaussian mixture
g_m = GaussianMixture(n_components=72).fit(x)
def manhattenScore(M):
    # square matrix of pairwise Manhattan (cityblock) distances between rows
    man_M = squareform(pdist(M, 'cityblock'))
    # column-wise softmax converts each column of distances into weights
    alpha_man = softmax(man_M, axis=0)
    return np.sum(alpha_man, axis=1)
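# A toy run of the two score functions above, which rate each row by how
# distant it is from the others under the chosen metric (random data; the
# softmax import from scipy.special is an assumption about the original
# module's imports):
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.special import softmax

M_demo = np.random.rand(5, 8)
print(CosineScore(M_demo))     # cosine-distance-based scores
print(manhattenScore(M_demo))  # same idea under the cityblock metric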
def find_correlation_clusters(corr, corr_thresh):
    # convert correlations to dissimilarities; 1 - corr has a zero diagonal,
    # as squareform requires
    dissimilarity = 1.0 - corr
    hierarchy = linkage(squareform(dissimilarity), method='single')
    # cut the dendrogram at the dissimilarity matching the correlation threshold
    diss_thresh = 1.0 - corr_thresh
    labels = fcluster(hierarchy, diss_thresh, criterion='distance')
    return labels
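# A usage sketch: cluster correlated columns of a toy data set (the data and
# the 0.7 threshold are illustrative assumptions; linkage, fcluster, and
# squareform are assumed imported as the function requires):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
base = rng.normal(size=200)
df = pd.DataFrame({
    'a': base,
    'b': base + rng.normal(scale=0.1, size=200),  # strongly correlated with 'a'
    'c': rng.normal(size=200),                    # independent
})
labels = find_correlation_clusters(df.corr().values, corr_thresh=0.7)
print(labels)  # e.g. [1 1 2]: 'a' and 'b' land in the same cluster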