def _CalcMutualNearestNeighbors(hull_points, all_points):
    """Finds hull points that are not a mutual nearest neighbor of any point.

    For each point in hull_points, its 3 nearest neighbors (by pairwise
    distance over all_points) are computed. A point is "mutual" when at
    least one of its 3 nearest neighbors also lists it among its own 3
    nearest; points with no such reciprocal relation are returned.

    Args:
        hull_points: container of points (hashable, e.g. coordinate tuples);
            must be a subset of all_points.
        all_points: iterable of all points.

    Returns:
        Set of hull points that have no mutual nearest neighbor.
    """
    all_points_list = list(all_points)
    # BUG FIX: the original computed `ds = pdist(...)` and `std_d = p.std(ds)`
    # but never used std_d — dead O(n^2) work and a needless dependency on
    # the `p` alias. The square distance matrix is all that is needed.
    square_ds = distance.squareform(distance.pdist(all_points_list))
    nearest_neighbors = {}
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        # Distances from point i to every other point, tagged with indices;
        # ties are broken deterministically by index via tuple sort.
        my_ds = [(d, j) for j, d in enumerate(square_ds[i]) if j != i]
        my_ds.sort()
        nearest_neighbors[point] = set([j for d, j in my_ds[:3]])
    no_mutual = set()
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        no_nbrs = True
        for neighbor_index in nearest_neighbors.get(point, []):
            neighbor = all_points_list[neighbor_index]
            # Non-hull neighbors were never given an entry; treat as empty.
            neighbor_set = nearest_neighbors.get(neighbor, [])
            if i in neighbor_set:
                no_nbrs = False
        if no_nbrs:
            no_mutual.add(point)
    return no_mutual
def spectral_partition(W, q, method='complete', metric='cosine'):
    """Split W into q groups via spectral embedding plus hierarchical clustering.

    Returns a dict {'spectral': labels} with zero-based cluster labels.
    """
    n, m = W.shape
    K = Kmatrix(W)

    if n == m:
        # Square kernel: eigendecomposition. Older SciPy exposes `eigen`;
        # fall back to `eigs` when the old signature raises TypeError.
        try:
            vals, basis = linalg.eigen(K, q)
        except TypeError:
            vals, basis = linalg.eigs(K, q)
    else:
        # Rectangular kernel: singular value decomposition, stacking left
        # and right singular vectors into one observation matrix.
        try:
            left, vals, right = linalg.svds(K, q)
        except AttributeError:
            left, vals, right = linalg.svd(K, q)
        basis = np.concatenate((left, right.T), 0)

    # Drop the component belonging to the dominant eigen/singular value.
    basis = np.delete(basis, vals.argmax(), 1)
    embedding = np.real(basis)

    # Pairwise distances in the embedded space; clamp negatives to zero.
    D = distance.pdist(embedding, metric=metric)
    D = np.multiply(D >= 0, D)

    # Agglomerative clustering, cut into q flat clusters (labels 0..q-1).
    Z = linkage(D, method=method, metric=metric)
    labels = fcluster(Z, q, criterion='maxclust') - 1
    return {'spectral': labels}
def _CalcMutualNearestNeighbors(hull_points, all_points):
    """Finds hull points that are not a mutual nearest neighbor of any point.

    Each hull point's 3 nearest neighbors (by pairwise distance over
    all_points) are found; a hull point is returned when none of those
    neighbors reciprocally lists it among their own 3 nearest.

    Args:
        hull_points: container of points (hashable, e.g. coordinate tuples);
            must be a subset of all_points.
        all_points: iterable of all points.

    Returns:
        Set of hull points that have no mutual nearest neighbor.
    """
    all_points_list = list(all_points)
    # BUG FIX: dropped the unused `std_d = p.std(ds)` (dead O(n^2) work and
    # a dependency on the `p` alias) and the duplicate list() conversion.
    square_ds = distance.squareform(distance.pdist(all_points_list))
    nearest_neighbors = {}
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        # Indexed distances from point i; tuple sort breaks ties by index.
        my_ds = [(d, j) for j, d in enumerate(square_ds[i]) if j != i]
        my_ds.sort()
        nearest_neighbors[point] = set([j for d, j in my_ds[:3]])
    no_mutual = set()
    for i, point in enumerate(all_points_list):
        if point not in hull_points:
            continue
        no_nbrs = True
        for neighbor_index in nearest_neighbors.get(point, []):
            neighbor = all_points_list[neighbor_index]
            # Non-hull neighbors have no entry; treat as empty.
            neighbor_set = nearest_neighbors.get(neighbor, [])
            if i in neighbor_set:
                no_nbrs = False
        if no_nbrs:
            no_mutual.add(point)
    return no_mutual
def StdDist(points):
    """Returns the standard deviation of the pairwise distances.

    Args:
        points: an Nx2 matrix of points.
    """
    # Local import so the fix does not rely on the module's `p` alias.
    import numpy as np

    ds = distance.pdist(points)
    # BUG FIX: the body returned p.mean(ds), contradicting both the function
    # name and the docstring; the sibling helpers compute the std of the
    # pairwise distances, so return the standard deviation here as well.
    return np.std(ds)
def plt_cluster(source_path, result_path):
    """Hierarchically cluster the rows of a CSV and save the dendrogram.

    Args:
        source_path: path to a CSV file with a 'neighbor' index column.
        result_path: directory/prefix where 'plot_dendrogram.png' is saved.
    """
    # BUG FIX: the file handle was opened and never closed; use a context
    # manager so it is released even if read_csv raises.
    with open(source_path, 'rb') as o:
        data = pd.read_csv(o, index_col='neighbor')
    # Condensed matrix of pairwise Euclidean distances between rows.
    disMat = distance.pdist(data, metric='euclidean')
    # Average-linkage hierarchical clustering.
    Z = linkage(disMat, method='average')
    # Render the hierarchy as a dendrogram and save the figure.
    P = dendrogram(Z)
    plt.savefig(result_path + 'plot_dendrogram.png')
def main(assets, start_date, end_date, plot_original=False):
    """Draw the minimum-spanning-tree network for a set of assets.

    Parameters
    ----------
    assets: list
        list of assets
    start_date: string
        start date of asset prices
    end_date: string
        end date of asset prices
    plot_original: bool
        when True, also plot the full (un-pruned) network graph
    """
    price_data = asset_prices(assets, start_date, end_date)
    closes = price_data["Close"]
    # Daily log returns; recall log(x) - log(y) == log(x/y).
    log_returns = np.log(closes / closes.shift(1))[1:]
    # Turn the return correlations into a distance matrix.
    dist_df = np.sqrt(2 * (1 - log_returns.corr()))

    # Build the weighted graph over assets.
    net = Graph(dist_df)
    all_edges = net.weighted_edges()
    all_nodes = net.vertices()

    if plot_original:
        # Full network, before pruning down to the MST.
        fig, ax = plt.subplots(1, 1)
        fig.set_size_inches(14.5, 10.5)
        draw_graph(
            all_nodes,
            all_edges,
            ax,
            graph_layout=nx.spring_layout,
            title="Original Network Graph",
        )

    # Minimum spanning tree via Kruskal's algorithm.
    tree_edges = net.kruskal_mst()
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(14.5, 10.5)
    draw_graph(
        all_nodes,
        tree_edges,
        ax,
        graph_layout=nx.spring_layout,
        title="MST Network Graph",
    )

    # Hierarchical clustering of the same distances, as a dendrogram.
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20.5, 8.5)
    condensed = distance.pdist(dist_df.values)
    dendrogram(linkage(condensed, method="complete"), labels=dist_df.columns)
    ax.set_title("Dendogram of MST", fontsize=24)
    plt.show()
def linkage_matrix_rep(sim_matrix):
    """Choose the linkage method with the highest cophenetic correlation.

    Args:
        sim_matrix: observation matrix passed to scipy's linkage/pdist.

    Returns:
        Tuple (c_final, method_final, final_linkage, cd_final): the best
        cophenetic correlation, the method that achieved it (empty string
        when none scored above zero), its linkage matrix, and its cophenetic
        distances (None when no method scored above zero).
    """
    methods = ['average', 'single', 'complete', 'weighted']
    c_final = 0.0
    method_final = ''
    final_linkage = linkage(sim_matrix)
    # BUG FIX: cd_final was only assigned inside the `if c > c_final` branch,
    # so the return raised UnboundLocalError whenever no method beat 0.0.
    cd_final = None
    # Hoisted out of the loop: the condensed distances never change.
    pairwise = distance.pdist(sim_matrix)
    for method in methods:
        linkage_matrix = linkage(sim_matrix, method=method)
        c, coph_dists = cophenet(linkage_matrix, pairwise)
        if c > c_final:
            c_final = c
            final_linkage = linkage_matrix
            method_final = method
            cd_final = coph_dists
    return c_final, method_final, final_linkage, cd_final
def _CalcDensities(hull_points, all_points):
    """Computes a density score for each hull point.

    A point's density is the number of points (itself included, since its
    own distance is 0) lying within one standard deviation of the mean of
    all pairwise distances.

    Args:
        hull_points: container of points (hashable, e.g. coordinate tuples);
            must be a subset of all_points.
        all_points: iterable of all points.

    Returns:
        Tuple (densities, std_d): densities is a list of (density, point)
        pairs sorted densest-first; std_d is the standard deviation of all
        pairwise distances.
    """
    # Local import: keeps the fix self-contained.
    import numpy as np

    # Materialize once so generators are handled; the original iterated
    # all_points twice (pdist and enumerate).
    all_points = list(all_points)
    ds = distance.pdist(all_points)
    # NOTE(review): the original used `p.std(ds)`; `p` is presumably a
    # pylab/numpy alias, making np.std equivalent — confirm against imports.
    std_d = np.std(ds)
    square_ds = distance.squareform(ds)
    densities = {}
    for i, point in enumerate(all_points):
        if point not in hull_points:
            continue
        my_ds = square_ds[i]
        # Renamed the comprehension variable (was `i`, shadowing the index).
        density = len([1 for dist_ij in my_ds if dist_ij <= std_d])
        densities[point] = density
    # BUG FIX: dict.iteritems() is Python 2 only; items() works on Python 3.
    tmp_densities = [(d, pt) for pt, d in densities.items()]
    tmp_densities.sort(reverse=True)
    return tmp_densities, std_d
def _CalcDensities(hull_points, all_points):
    """Computes a density score for each hull point.

    A point's density is the number of points (itself included — its own
    distance is 0) lying within one standard deviation of the pairwise
    distances.

    Args:
        hull_points: container of points (hashable, e.g. coordinate tuples);
            must be a subset of all_points.
        all_points: iterable of all points.

    Returns:
        Tuple (densities, std_d): densities is a list of (density, point)
        pairs sorted densest-first; std_d is the standard deviation of all
        pairwise distances.
    """
    # Local import: keeps the fix self-contained.
    import numpy as np

    # Materialize once so generators are handled; the original iterated
    # all_points twice (pdist and enumerate).
    all_points = list(all_points)
    ds = distance.pdist(all_points)
    # NOTE(review): the original used `p.std(ds)`; `p` is presumably a
    # pylab/numpy alias, making np.std equivalent — confirm against imports.
    std_d = np.std(ds)
    square_ds = distance.squareform(ds)
    densities = {}
    for i, point in enumerate(all_points):
        if point not in hull_points:
            continue
        my_ds = square_ds[i]
        # Renamed the comprehension variable (was `i`, shadowing the index).
        density = len([1 for dist_ij in my_ds if dist_ij <= std_d])
        densities[point] = density
    # BUG FIX: dict.iteritems() is Python 2 only; items() works on Python 3.
    tmp_densities = [(d, pt) for pt, d in densities.items()]
    tmp_densities.sort(reverse=True)
    return tmp_densities, std_d
def cluster(source_path, result_path):
    """Hierarchically cluster door-opening time-slot data.

    Saves a dendrogram of the full hierarchy, then cuts the tree into (at
    most) 6 flat clusters, writing one CSV and one line plot per cluster.

    Args:
        source_path: path to a CSV file with an 'address' index column and
            four time-slot columns.
        result_path: directory prefix for all output files.
    """
    # BUG FIX: the file handle was opened and never closed; use a context
    # manager so it is released even if read_csv raises.
    with open(source_path, 'rb') as o:
        data = pd.read_csv(o, index_col='address')
    # Condensed matrix of pairwise Euclidean distances between rows.
    disMat = distance.pdist(data, metric='euclidean')
    # Average-linkage hierarchical clustering.
    Z = linkage(disMat, method='average')
    # Save the dendrogram of the full hierarchy.
    P = dendrogram(Z)
    plt.savefig(result_path + 'plot_dendrogram.png')
    # Cut the linkage matrix Z into at most 6 flat clusters.
    cluster = fcluster(Z, t=6, criterion='maxclust')
    k = len(np.unique(cluster))  # number of clusters actually produced
    # Make sure the output directories exist.
    if not os.path.exists(result_path + 'imgs/'):
        os.makedirs(result_path + 'imgs/')
    if not os.path.exists(result_path + 'csv/'):
        os.makedirs(result_path + 'csv/')
    # Append each sample's cluster label as a new column.
    r = pd.concat([data, pd.Series(cluster, index=data.index)], axis=1)
    # Rename the header; the added column holds the cluster label.
    r.columns = list(data.columns) + [u'聚类类别']
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs
    style = ['ro-', 'go-', 'bo-', 'co-', 'mo-', 'yo-']
    xlabels = [u'工作日工作时段', u'工作日通勤时段', u'周末日间', u'凌晨']
    pic_output = result_path + 'imgs/type_'  # plot filename prefix
    for i in range(1, k + 1):  # one figure per cluster, distinct line style
        plt.figure()
        # Rows of cluster i, dropping the trailing label column.
        tmp = r[r[u'聚类类别'] == i].iloc[:, :4]
        # Write each cluster to its own CSV file.
        tmp.to_csv(result_path + 'csv/类别%s.csv' % (i))
        for j in range(len(tmp)):  # plot every sample in the cluster
            plt.plot(range(1, 5), tmp.iloc[j], style[i - 1])
        plt.xticks(range(1, 5), xlabels, rotation=20)  # axis tick labels
        plt.title(u'门洞类别%s' % (i))  # clusters numbered from 1
        plt.subplots_adjust(bottom=0.15)  # leave room for rotated labels
        plt.savefig(u'%s%s.png' % (pic_output, i))
def calc_distance_matrix(gene_informative, ignore_indels=True, metric='jaccard'):
    """
    Calculate a pairwise distance matrix from a pileup of reads across
    informative sites.

    Args:
        gene_informative: (n_reads, n_sites) array; indels are coded as -1.
        ignore_indels: when True, indel entries are masked as NaN so they
            do not contribute to the distances.
        metric: any metric accepted by scipy.spatial.distance.pdist.

    Returns:
        Condensed pairwise distance matrix (1-D array of length
        n_reads * (n_reads - 1) / 2).
    """
    n_reads, n_sites = gene_informative.shape
    if n_reads > 1000:
        print("Greater than 1000 reads!")
        print("... consider downsampling")
    if ignore_indels:
        # Mask indels (coded as -1) as NaN; requires a float copy.
        f = np.copy(gene_informative).astype("float")
        f[f == -1] = np.nan
    else:
        f = gene_informative
    # BUG FIX: pdist previously ran on gene_informative, so the mask in `f`
    # (and the ignore_indels flag) had no effect.
    X = distance.pdist(f, metric=metric)
    return X
# Demo: agglomerative clustering of random data with SciPy.
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, distance
from matplotlib import pyplot as plt
import scipy
import numpy as np

# Draw 10 samples, each with 4 features.
samples = np.random.randn(10, 4)
# Condensed matrix of pairwise Euclidean distances between samples.
condensed = distance.pdist(samples, 'euclidean')
print(condensed)
# Ward-linkage hierarchical clustering; Z records the merge sequence.
Z = linkage(condensed, method='ward')
print(Z)
# Cut the tree at distance threshold 2; different thresholds merge
# different sub-clusters. Returns one label per sample.
labels = fcluster(Z, 2, criterion='distance')
print(labels)
# Visualise the hierarchy as a dendrogram.
tree = dendrogram(Z)
plt.show()
d = d_sr[i] l1 = d * n2 / (n1 + n2) l2 = d * n1 / (n1 + n2) # the first branching if i == 2*n - 2: pos_df.loc[c1] = (l1, 0) pos_df.loc[c2] = (-l2, 0) elif d == 0: pos_df.loc[c1] = pos_df.loc[i] pos_df.loc[c2] = pos_df.loc[i] else: pos_s = pos_df.loc[sis_sr[i]] # sister node pos_i = pos_df.loc[i] L = np.linalg.norm(pos_s - pos_i) th = get_actual_theta(n1, n2, l1, l2, L) phi = np.angle(np.complex(*(pos_i - pos_s))) psi = phi + th - np.pi pos_df.loc[c1] = pos_i + [l1 * np.cos(psi), l1 * np.sin(psi)] pos_df.loc[c2] = pos_i - [l2 * np.cos(psi), l2 * np.sin(psi)] return pos_df.iloc[:n] if __name__ == '__main__': from scipy.cluster.hierarchy import distance, linkage X = np.random.randn(5,3) # data matrix Y = distance.pdist(X, metric='euclidean') # distance matrix Z = linkage(Y, method='average') # linkage matrix pos_df = branching_embedding(Z)
def getImpCat():
    """Cluster the "important" document vectors and dump the tree as JSON.

    Looks up vectors for the important ids in the module-level `docvecs`,
    runs Ward-linkage agglomerative clustering, converts the linkage matrix
    into an explicit parent/child tree, and writes a plain-dict version of
    that tree to `tree_path` via `writeJson`.
    """
    vecs = []
    ids = getImpIds(imp_ids_num)  # getAllIds()
    # Collect the document vector for each selected id.
    for _id in ids:
        vecs.append(docvecs[_id])
    print(len(vecs), len(vecs[0]))
    # print(points)
    # Condensed pairwise Euclidean distance matrix between the vectors.
    disMat = distance.pdist(vecs, 'euclidean')
    # Build the linkage matrix with Ward clustering on the
    # pre-computed distances.
    print('开始计算')  # progress marker ("starting computation")
    # Other methods ('average', etc.) would work here too.
    linkage_matrix = linkage(disMat, method='ward', optimal_ordering=True)

    def getTree(linkage_matrix):
        # Convert a SciPy linkage matrix into an id -> TreeNode mapping.

        class TreeNode(object):
            # One node per original sample or per merge step.
            def __init__(self, _id):
                self.id = _id        # sample index or merge-node index
                self.parent = None   # parent TreeNode; None for the root
                self.childs = set()  # direct child TreeNodes
                self.child_num = 0   # observation count under this node

        class NodeCompany():
            # Factory/cache: each id maps to exactly one TreeNode.
            def __init__(self):
                self.id2node = {}

            def get(self, _id):
                id2node = self.id2node
                if _id in id2node:
                    return id2node[_id]
                else:
                    id2node[_id] = TreeNode(_id)
                return id2node[_id]

        nodeCompany = NodeCompany()
        linkage_matrix = linkage_matrix.tolist()
        # print(linkage_matrix)
        # Cast the float linkage entries (child ids, leaf count) to ints.
        for item in linkage_matrix:
            node1 = int(item[0])
            node2 = int(item[1])
            # sim = item[2]
            num = int(item[3])
            item[0] = node1
            item[1] = node2
            item[3] = num
        # SciPy numbers merge nodes starting right after the last sample
        # index, so row k of the linkage matrix creates node k + l_length.
        l_length = imp_ids_num
        for index, item in enumerate(linkage_matrix):
            child_num = item[3]
            node1 = nodeCompany.get(item[0])
            node2 = nodeCompany.get(item[1])
            index += l_length  # id of the merge node created at this step
            parent_node = nodeCompany.get(index)
            node1.parent = parent_node
            node2.parent = parent_node
            parent_node.childs.add(node1)
            parent_node.childs.add(node2)
            parent_node.child_num = child_num
        id2node = nodeCompany.id2node
        # for key in id2node:
        #     node = id2node[key]
        #     print([sub.id for sub in node.childs], node.id)
        return id2node

    tree = getTree(linkage_matrix)
    # Serialise the tree into plain dicts for JSON output.
    result = {}
    for _id in tree:
        item = tree[_id]
        result[_id] = {
            # 'id': item.id,
            'child_num': item.child_num,
            'parent': None if (item.parent is None) else item.parent.id,
            'childs': [child.id for child in item.childs],
        }
    writeJson(tree_path, result)
#cosine distance doc_sim =1-cosine_similarity(tfidf_matrix) print (doc_sim) # clustering using hierarchical clustering linkage_matrix = linkage(doc_sim,method='centroid') #assignments = fcluster(linkage_matrix,1,criterion='distance') #assignments = fcluster(,4,'distance') print(linkage_matrix) c, coph_dists = cophenet(linkage_matrix, distance.pdist(doc_sim)) print (c) assignments =fcluster(linkage_matrix, 4, 'maxclust') cluster_doc = pd.DataFrame({'doc':doc_name , 'cluster':assignments}) print(cluster_doc) cluster_doc.to_csv('doc_cluster.csv',sep='\t') fig, ax = plt.subplots(figsize=(15, 20)) # set size ax = dendrogram(linkage_matrix, orientation="left", labels=doc_name); plt.tick_params(\ axis= 'x', # changes apply to the x-axis which='both', # both major and minor ticks are affected bottom='off', # ticks along the bottom edge are off