def test_all(n, dim):
    """Compare ``fc.linkage_vector`` with distance-matrix clustering on random data.

    Two random data sets with ``n`` observations in ``dim`` dimensions are
    generated: a boolean one for the boolean dissimilarity measures and an
    integer-valued one for the real-vector metrics.  For every metric the
    pairwise distances ``D`` are computed with ``pdist`` and the dendrogram
    ``Z2`` with ``fc.linkage_vector``; both are handed to the module-level
    checker ``test(Z2, method, D)`` (defined elsewhere in this file).

    Parameters
    ----------
    n : int
        Number of observations.
    dim : int
        Number of coordinates per observation.

    Raises
    ------
    AssertionError
        If ``fc.linkage_vector`` modified the boolean input array.
    """
    method = 'single'

    # metrics for boolean vectors
    # ``np.bool`` is gone from current NumPy; the builtin ``bool`` is the same
    # dtype.  ``randint(0, 2)`` draws from {0, 1} exactly like the deprecated
    # ``random_integers(0, 1)`` did.
    pcd = np.array(np.random.randint(0, 2, (n, dim)), dtype=bool)
    pcd2 = pcd.copy()
    for metric in ('hamming', 'jaccard', 'yule', 'matching', 'dice',
                   #'kulsinski',
                   'rogerstanimoto',
                   #'sokalmichener',
                   # exclude, bug in older Scipy versions
                   # http://projects.scipy.org/scipy/ticket/1486
                   'russellrao', 'sokalsneath',
                   #'kulsinski'
                   # exclude, bug in older Scipy versions
                   # http://projects.scipy.org/scipy/ticket/1484
                   ):
        sys.stdout.write("Metric: " + metric + "...")
        D = pdist(pcd, metric)
        Z2 = fc.linkage_vector(pcd, method, metric)
        # The clustering routine must leave its input untouched.
        if np.any(pcd2 != pcd):
            raise AssertionError('Input array was corrupted.', pcd)
        test(Z2, method, D)

    # metrics for real vectors
    # Truncate to int explicitly: the old ``random_integers`` call silently
    # truncated the float square root, ``randint`` should not be relied on to.
    bound = int(math.sqrt(n))
    pcd = np.random.randint(-bound, bound + 1, (n, dim))
    for metric in ['euclidean', 'sqeuclidean', 'cityblock', 'chebychev',
                   'minkowski', 'cosine', 'correlation', 'hamming', 'jaccard',
                   #'canberra',
                   # exclude, bug in older Scipy versions
                   # http://projects.scipy.org/scipy/ticket/1430
                   'braycurtis', 'seuclidean', 'mahalanobis', 'user']:
        sys.stdout.write("Metric: " + metric + "...")
        if metric == 'minkowski':
            p = np.random.uniform(1., 10.)
            sys.stdout.write("p: " + str(p) + "...")
            # Pass the exponent by keyword: newer SciPy no longer accepts it
            # positionally, older SciPy accepts both forms.
            D = pdist(pcd, metric, p=p)
            Z2 = fc.linkage_vector(pcd, method, metric, p)
        elif metric == 'user':
            # Euclidean metric as a user function
            fn = (lambda u, v: np.sqrt(((u - v) * (u - v).T).sum()))
            D = pdist(pcd, fn)
            Z2 = fc.linkage_vector(pcd, method, fn)
        else:
            D = pdist(pcd, metric)
            Z2 = fc.linkage_vector(pcd, method, metric)
        test(Z2, method, D)

    # Methods that are only defined for Euclidean data; reuse the last
    # integer-valued data set with the default (Euclidean) distances.
    D = pdist(pcd)
    for method in ['ward', 'centroid', 'median']:
        Z2 = fc.linkage_vector(pcd, method)
        test(Z2, method, D)
def linkage(self, title_clusters, method='ward'):
    """Return the fastcluster linkage matrix for the titles' word vectors.

    The vectors are read from ``title_clusters.word_vector`` (first element
    of the first element of every entry).  If an ``AttributeError`` is raised
    while doing so, ``apply_word_embedings`` is applied to ``title_clusters``
    first and the clustering is attempted once more.

    :param title_clusters: object exposing (or convertible to one exposing)
        a ``word_vector`` attribute.
    :param method: linkage method forwarded to ``fastcluster.linkage_vector``.
    :return: the linkage matrix produced by fastcluster.
    """
    def _cluster(source):
        vectors = np.array([entry[0][0] for entry in source.word_vector])
        return fastcluster.linkage_vector(vectors, method=method)

    try:
        return _cluster(title_clusters)
    except AttributeError:
        # No embeddings yet: compute them, then cluster.
        return _cluster(apply_word_embedings(title_clusters))
def linkage(self, title_clusters, method='ward', linkage_matrix=None):
    """Return a linkage matrix, either the supplied one or a freshly computed one.

    :param title_clusters: object exposing a ``word_vector`` attribute; if the
        attribute is missing, ``apply_word_embedings`` is called with this
        instance's ``model_name`` to create it.
    :param method: linkage method forwarded to ``fastcluster.linkage_vector``.
    :param linkage_matrix: optional precomputed linkage matrix.  When given it
        is stored on ``self.linkage_matrix`` and returned unchanged; nothing
        is clustered.
    :return: the linkage matrix.
    """
    if linkage_matrix is not None:
        self.linkage_matrix = linkage_matrix
        return linkage_matrix

    # Only the attribute lookup is guarded, so an AttributeError raised inside
    # NumPy or fastcluster is no longer mistaken for "embeddings missing".
    try:
        word_vectors = title_clusters.word_vector
    except AttributeError:
        title_clusters = apply_word_embedings(title_clusters,
                                              model_name=self.model_name)
        word_vectors = title_clusters.word_vector

    data = np.array([i[0][0] for i in word_vectors])
    return fastcluster.linkage_vector(data, method=method)
def cluster_finder(polygons):
    """Single-linkage clustering of ``polygons`` by their mutual ``distance``.

    The polygons themselves cannot be handed to fastcluster, so a column of
    their indices is clustered instead and the metric callback looks the
    polygons up by index.

    Returns a matrix Z as described in scipy.cluster.hierarchy.linkage.
    """
    def distfunc(u, v):
        first = polygons[int(u)]
        second = polygons[int(v)]
        return first.distance(second)

    # One "observation" per polygon: its position in the input sequence.
    index_column = np.expand_dims(np.arange(len(polygons)), axis=1)
    return fastcluster.linkage_vector(index_column, method='single',
                                      metric=distfunc)
def test():
    """Check that fastcluster raises ``FloatingPointError`` on NaN data.

    Three situations are exercised with randomly sized input: a condensed
    distance matrix containing one NaN, a distance matrix whose infinities
    turn into NaN during the distance updates, and vector data with one NaN
    coordinate.

    Returns
    -------
    bool
        ``True`` when every call raised ``FloatingPointError``.

    Raises
    ------
    AssertionError
        If any clustering call completes without detecting the NaN.
    """
    # ``random_integers`` is deprecated; ``randint`` excludes its upper bound,
    # so 101 keeps the original inclusive range 2..100.
    n = np.random.randint(2, 101)

    # Part 1: distance matrix input
    N = n * (n - 1) // 2
    D = np.random.rand(N)
    # Insert a single NaN value
    pos = np.random.randint(N)
    D[pos] = np.nan

    for method in ['single', 'complete', 'average', 'weighted', 'ward',
                   'centroid', 'median']:
        try:
            fastcluster.linkage(D, method=method)
            raise AssertionError('fastcluster did not detect a NaN value!')
        except FloatingPointError:
            pass

    # Next: the original array does not contain a NaN, but a NaN occurs
    # as an updated distance.
    for method in ['average', 'weighted', 'ward', 'centroid', 'median']:
        try:
            fastcluster.linkage([np.inf, -np.inf, -np.inf], method=method)
            raise AssertionError('fastcluster did not detect a NaN value!')
        except FloatingPointError:
            pass

    # Part 2: vector input
    # Inclusive range 2..12, as with the original ``random_integers(2, 12)``.
    dim = np.random.randint(2, 13)
    X = np.random.rand(n, dim)
    pos = (np.random.randint(n), np.random.randint(dim))
    # Insert a single NaN coordinate
    X[pos] = np.nan

    for method in ['single', 'ward', 'centroid', 'median']:
        try:
            fastcluster.linkage_vector(X, method=method)
            raise AssertionError('fastcluster did not detect a NaN value!')
        except FloatingPointError:
            pass

    return True
def fast_hierarchy(feat, distance, hmethod='single', **kwargs):
    """Cluster ``feat`` hierarchically and cut the tree at ``distance``.

    The dendrogram is built with ``fastcluster.linkage_vector`` using linkage
    method ``hmethod`` and flattened with SciPy's ``fcluster`` using the
    ``'distance'`` criterion.  Extra keyword arguments are accepted but
    ignored.

    Returns the flat cluster label of every observation.
    """
    import fastcluster
    import scipy.cluster

    flatten = scipy.cluster.hierarchy.fcluster
    dendrogram = fastcluster.linkage_vector(feat, method=hmethod)
    return flatten(dendrogram, distance, criterion='distance')
def test():
    """Make sure fastcluster refuses NaN input with ``FloatingPointError``.

    Covers a condensed distance matrix with one NaN entry, a distance matrix
    whose update step produces NaN from infinities, and vector data with one
    NaN coordinate.  Returns ``True`` if all checks pass and raises
    ``AssertionError`` as soon as one call fails to detect the NaN.
    """
    def expect_nan_error(cluster, data, method):
        # Succeeds only if the clustering call raises FloatingPointError.
        try:
            cluster(data, method=method)
        except FloatingPointError:
            return
        raise AssertionError('fastcluster did not detect a NaN value!')

    n = np.random.randint(2, 100)

    # Part 1: distance matrix input with a single NaN entry
    N = n * (n - 1) // 2
    D = np.random.rand(N)
    D[np.random.randint(N)] = np.nan
    for method in ('single', 'complete', 'average', 'weighted', 'ward',
                   'centroid', 'median'):
        expect_nan_error(fastcluster.linkage, D, method)

    # No NaN in the input here; it only appears as an updated distance.
    for method in ('average', 'weighted', 'ward', 'centroid', 'median'):
        expect_nan_error(fastcluster.linkage, [np.inf, -np.inf, -np.inf],
                         method)

    # Part 2: vector input with a single NaN coordinate
    dim = np.random.randint(2, 13)
    X = np.random.rand(n, dim)
    row = np.random.randint(n)
    col = np.random.randint(dim)
    X[row, col] = np.nan
    for method in ('single', 'ward', 'centroid', 'median'):
        expect_nan_error(fastcluster.linkage_vector, X, method)

    return True
def cal_cophenetic(C):
    """Calculate the cophenetic correlation coefficient of ``C``.

    ``C`` is clustered with ``fc.linkage_vector`` (its default method and
    metric); the coefficient is the correlation between the original pairwise
    distances (``fc.pdist``) and the cophenetic distances of the resulting
    dendrogram.
    """
    print("=== calculate cophenetic correlation coefficient ===")
    dendrogram = fc.linkage_vector(C)       # hierarchical clustering
    original = fc.pdist(C)                  # pairwise distances of the data
    cophenetic = cophenet(dendrogram)       # pairwise cophenetic distances
    correlation = np.corrcoef(original, cophenetic)
    # Off-diagonal entry of the 2x2 correlation matrix.
    return correlation[0, 1]
def linkage(self, x):
    """Run hierarchical clustering and keep the result on the instance.

    The linkage matrix is computed by ``fastcluster.linkage_vector`` with the
    instance's linkage method and the Euclidean metric, and stored in
    ``self._Z``.  Nothing is returned.

    :Parameters:
       x : 2d array_like object (N, P)
          vector data, N observations in R^P
    """
    options = dict(method=self._method, metric='euclidean', extraarg=None)
    self._Z = fastcluster.linkage_vector(X=x, **options)
def product_finder_fasttext(data_, k1= 30,topn_=4000, min_value_ = 0.95,expected_density = 1.1, sparse=False, clustering_algorithm = 'agglomerative'):
    """Group ads into products by clustering their word vectors.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Must provide a ``word_vector`` column (the vector is read as
        ``entry[0][0]``) and the columns ``ad_title``, ``ad_title_corpus`` and
        ``ad_id``, which are kept in the result.
    k1 : number
        Community mode only: edge weights are ``exp(k1 * similarity)``.
    topn_, min_value_, expected_density :
        Forwarded to ``pairwise_cosine_sparse_sim`` in community mode.  In
        agglomerative mode ``1 - min_value_`` is the distance at which the
        dendrogram is cut.
    sparse : bool
        Accepted for interface compatibility; the similarity helper is always
        called with ``sparse=False``.
    clustering_algorithm : {'agglomerative', 'community'}
        Which clustering back end to use.

    Returns
    -------
    dict
        ``'clustered_data'`` holds the frame with ``product_id`` and
        ``product_word_vector`` columns added.  Community mode adds
        ``'sim_matrix_density'``; agglomerative mode adds ``'linkage_matrix'``.

    Raises
    ------
    ValueError
        If ``clustering_algorithm`` is not one of the supported names.
        (Previously this surfaced later as a ``NameError``.)
    """
    if clustering_algorithm not in ('community', 'agglomerative'):
        raise ValueError(
            "clustering_algorithm must be 'community' or 'agglomerative', "
            "got %r" % (clustering_algorithm,))

    if clustering_algorithm == 'community':
        print('creating cv_matrix')
        data = [i[0][0] for i in data_['word_vector']]
        cv_matrix = csr_matrix(np.array(data))
        print('calculating similarity matrix')
        s = time.time()
        cosine_sim = pairwise_cosine_sparse_sim(cv_matrix, topn = topn_, min_value=min_value_,expected_density = expected_density, sparse= False)
        print (str(time.time()-s)+'s for cosine similarity computing')
        s = time.time()
        print('generating similarities graph')
        # Every non-zero similarity becomes an edge of the graph.
        sources, targets = cosine_sim.nonzero()
        g = Graph(list(zip(sources.tolist(), targets.tolist())))
        print (str(time.time()-s)+'s for graph generation')
        s= time.time()
        print('creating graph communities')
        clusters= g.community_multilevel(weights = np.exp(k1*cosine_sim.data))
        cluster_labels = clusters.membership
        Z = None
    else:
        data = np.array([i[0][0] for i in data_['word_vector']])
        print('calculating similarity matrix')
        s = time.time()
        # No similarity matrix is needed here: fastcluster works on the
        # vectors directly.  The timing output is kept for log compatibility.
        print (str(time.time()-s)+'s for cosine similarity computing')
        Z = fastcluster.linkage_vector(data)
        cluster_labels = scipy.cluster.hierarchy.fcluster(Z, 1-min_value_, criterion='distance', depth=2, R=None, monocrit=None)

    data_=data_.assign(product_id = cluster_labels)
    list_of_labels = list(set(data_['product_id']))

    # One representative vector per product: the member's own vector for
    # singletons, otherwise the element-wise average of all members.
    product_word_vector={}
    for i in list_of_labels:
        by_column_dic_i = data_[data_.product_id == i]
        word_vectors = [vector[0] for vector in by_column_dic_i['word_vector']]
        if len(word_vectors) == 1:
            product_word_vector[i] = word_vectors
        else:
            product_word_vector[i] = [np.average(np.array(word_vectors),axis=0)]

    data_= data_.assign(product_word_vector = 0)
    for i in product_word_vector:
        # Repeat the one-element list so that every row of the product
        # receives its own copy of the representative vector.
        data_[data_.product_id == i] = data_[data_.product_id == i].assign(product_word_vector = len(data_[data_.product_id == i])*product_word_vector[i])
    data_ = data_[['ad_title','ad_title_corpus','ad_id','product_id','word_vector','product_word_vector']]
    print (str(time.time()-s)+'s for clusters computation')

    if clustering_algorithm == 'community':
        return {'clustered_data':data_ , 'sim_matrix_density': cosine_sim.size/(cosine_sim.shape[0]*cosine_sim.shape[1])}
    return {'clustered_data':data_ , 'linkage_matrix': Z}
def fast_cluster(array, method, metric):
    """Compute a linkage matrix for ``array`` with fastcluster.

    ``fastcluster.linkage_vector`` is used for single linkage (any metric)
    and for centroid/median/ward linkage with the Euclidean metric; every
    other combination goes through ``fastcluster.linkage``.
    """
    import fastcluster

    is_euclidean_case = (metric == 'euclidean'
                         and method in ('centroid', 'median', 'ward'))
    use_vector_version = is_euclidean_case or method == 'single'
    cluster = (fastcluster.linkage_vector if use_vector_version
               else fastcluster.linkage)
    return cluster(array, method=method, metric=metric)
def _calculate_linkage_fastcluster(array, metric='euclidean', method='single'):
    """Return the fastcluster linkage matrix of ``array``.

    Fastcluster's vectorized routine saves memory but is only used here for
    single linkage, or for centroid/median/ward linkage combined with the
    Euclidean metric.  All other cases use ``fastcluster.linkage``.
    """
    euclidean_only = ('centroid', 'median', 'ward')
    vectorizable = metric == 'euclidean' and method in euclidean_only
    if not (vectorizable or method == 'single'):
        # General case: fall back to the non-vectorized routine.
        return fastcluster.linkage(array, method=method, metric=metric)
    return fastcluster.linkage_vector(array, method=method, metric=metric)
def _calculate_linkage_fastcluster(self):
    """Compute the linkage of ``self.array`` using fastcluster.

    Single linkage, and centroid/median/ward linkage with the Euclidean
    metric, use the memory-saving ``linkage_vector``.  Otherwise the pairwise
    distances are computed explicitly and passed to ``fastcluster.linkage``.
    """
    import fastcluster

    # Methods handled by the vectorized routine:
    # "single", "centroid", "median", "ward" (the last three Euclidean only).
    method, metric = self.method, self.metric
    euclidean_case = (metric == "euclidean"
                      and method in ("centroid", "median", "ward"))
    if euclidean_case or method == "single":
        return fastcluster.linkage_vector(self.array, method=method,
                                          metric=metric)

    condensed = distance.pdist(self.array, metric=metric)
    result = fastcluster.linkage(condensed, method=method)
    del condensed  # drop the distance matrix as soon as it is not needed
    return result
def _calculate_linkage_fastcluster(self):
    """Compute the linkage of ``self.array`` using fastcluster.

    The memory-saving ``linkage_vector`` is chosen for single linkage and for
    centroid/median/ward linkage with the Euclidean metric; everything else
    is delegated to ``fastcluster.linkage``.
    """
    import fastcluster

    call_args = {"method": self.method, "metric": self.metric}
    euclidean_case = (self.metric == "euclidean"
                      and self.method in ("centroid", "median", "ward"))
    if euclidean_case or self.method == "single":
        routine = fastcluster.linkage_vector
    else:
        routine = fastcluster.linkage
    return routine(self.array, **call_args)
def _calculate_linkage_fastcluster(self):
    """Return the fastcluster linkage matrix for ``self.array``.

    ``linkage_vector`` (the memory-saving variant) handles single linkage and
    the Euclidean-only methods centroid, median and ward; any other
    method/metric pair is handled by ``fastcluster.linkage``.
    """
    import fastcluster

    needs_euclidean = ('centroid', 'median', 'ward')
    vector_ok = self.metric == 'euclidean' and self.method in needs_euclidean
    vector_ok = vector_ok or self.method == 'single'
    if not vector_ok:
        return fastcluster.linkage(self.array, method=self.method,
                                   metric=self.metric)
    return fastcluster.linkage_vector(self.array, method=self.method,
                                      metric=self.metric)
def test_custom_linkage(self):
    """A user-supplied linkage must be used as-is by ``_DendrogramPlotter``.

    The reference linkage comes from fastcluster when it is installed and
    from SciPy otherwise.
    """
    plotter_kws = self.default_kws.copy()

    try:
        import fastcluster

        expected_linkage = fastcluster.linkage_vector(
            self.x_norm, method="single", metric="euclidean")
    except ImportError:
        pairwise = distance.pdist(self.x_norm, metric="euclidean")
        expected_linkage = hierarchy.linkage(pairwise, method="single")

    expected_dendrogram = hierarchy.dendrogram(
        expected_linkage, no_plot=True, color_list=["k"],
        color_threshold=-np.inf)

    plotter_kws["linkage"] = expected_linkage
    plotter = mat._DendrogramPlotter(self.df_norm, **plotter_kws)

    npt.assert_array_equal(plotter.linkage, expected_linkage)
    nt.assert_dict_equal(plotter.dendrogram, expected_dendrogram)
def applyHierarchicalClustering(X, n_clusters):
    """Cluster the rows of ``X`` with Ward linkage and return their labels.

    The dendrogram is built with ``fastcluster.linkage_vector`` (Ward method,
    Euclidean metric).  The cut distance comes from ``find_distance_thres``
    (defined elsewhere; presumably it picks the threshold that yields
    ``n_clusters`` clusters -- confirm against that helper).

    Parameters
    ----------
    X : pandas.DataFrame
        Observations; its index is reused for the result.
    n_clusters : int
        Target number of clusters, forwarded to ``find_distance_thres``.

    Returns
    -------
    pandas.DataFrame
        One column ``'cluster'`` holding the label of every row as a string.
    """
    # Each row of Z is: merged cluster one, merged cluster two, merge
    # distance, size of the new cluster.  (The unused DataFrame copy of Z
    # that used to be built here has been dropped.)
    Z = fastcluster.linkage_vector(X, method='ward', metric='euclidean')

    distance = find_distance_thres(n_clusters, Z, X)
    clusters = fcluster(Z, distance, criterion='distance')
    clusters = pd.DataFrame(data=clusters, index=X.index, columns=['cluster'])
    print("Number of distinct clusters: ", len(clusters['cluster'].unique()))

    # cluster number from int to string
    clusters['cluster'] = clusters['cluster'].apply(str)
    return clusters
def _calculate_linkage_fastcluster(self):
    """Return the fastcluster linkage matrix for ``self.array``.

    The vectorized, memory-saving routine is used whenever it applies (single
    linkage, or centroid/median/ward with the Euclidean metric).  In all
    other cases the condensed distance matrix is built first and clustered
    with ``fastcluster.linkage``.
    """
    import fastcluster

    # The vectorized routine supports 'single', 'centroid', 'median', 'ward'.
    euclidean = (self.metric == 'euclidean'
                 and self.method in ('centroid', 'median', 'ward'))
    if not (euclidean or self.method == 'single'):
        dists = distance.pdist(self.array, metric=self.metric)
        tree = fastcluster.linkage(dists, method=self.method)
        del dists  # release the distance matrix before returning
        return tree
    return fastcluster.linkage_vector(self.array, method=self.method,
                                      metric=self.metric)
def test_custom_linkage(self):
    """``_DendrogramPlotter`` must honour a linkage passed in by the caller.

    fastcluster provides the reference linkage if it can be imported; SciPy
    is the fallback.
    """
    options = self.default_kws.copy()

    try:
        import fastcluster

        reference = fastcluster.linkage_vector(self.x_norm, method='single',
                                               metric='euclidean')
    except ImportError:
        condensed = distance.pdist(self.x_norm, metric='euclidean')
        reference = hierarchy.linkage(condensed, method='single')

    reference_tree = hierarchy.dendrogram(reference, no_plot=True,
                                          color_threshold=-np.inf)

    options['linkage'] = reference
    plotter = mat._DendrogramPlotter(self.df_norm, **options)

    npt.assert_array_equal(plotter.linkage, reference)
    nt.assert_dict_equal(plotter.dendrogram, reference_tree)
def agglom_cluster(down, nclusters):
    """Perform agglomerative clustering on downsampled data.

    As per the paper, this is single linkage with the L1 (city block)
    distance metric.

    Parameters
    ----------
    down : array_like
        Downsampled observations, one per row.
    nclusters : int
        Maximum number of flat clusters to form.

    Returns
    -------
    ndarray
        Flat cluster label of every observation (``fcluster`` with the
        ``'maxclust'`` criterion).
    """
    # See http://www.jstatsoft.org/v53/i09/paper for details on fastcluster
    # by Daniel Müllner, out of Carlsson's group.

    # NOTE: The memory-saving ``linkage_vector`` is tried first because it
    # avoids forming the distance matrix explicitly.  If it is unavailable or
    # fails, the distance matrix is formed and handed to ``linkage``
    # (Minkowski with p=1 is the city block metric).
    try:
        Z = linkage_vector(down, method='single', metric='cityblock')
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed by the fallback.
        dist = pdist(down, metric='minkowski', p=1)
        Z = linkage(dist, method='single', preserve_input=False)
    return fcluster(Z, nclusters, criterion='maxclust')
def showModelPerformanceHierarchical(X_train, y_train):
    """Cluster ``X_train`` hierarchically and print how well clusters match labels.

    Ward linkage with the Euclidean metric is computed by fastcluster, the
    dendrogram is cut at the distance returned by
    ``find_hierarchical_clustering_distance_threshold`` and the resulting
    clusters are plotted and scored against ``y_train`` with
    ``analyzeCluster``.  Nothing is returned; all results are printed.
    """
    linkage_matrix = fastcluster.linkage_vector(X_train, method='ward',
                                                metric='euclidean')
    # the result is 174
    distance = find_hierarchical_clustering_distance_threshold(
        23, linkage_matrix, X_train)
    labels = fcluster(linkage_matrix, distance, criterion='distance')

    X_train_hierClustered = pd.DataFrame(data=labels, index=X_train.index,
                                         columns=['cluster'])
    print(X_train_hierClustered)
    distinct = X_train_hierClustered['cluster'].unique()
    print("Number of distinct clusters: ", len(distinct))

    showClusterDistribution(X_train, labels, 6, 'Evaporation', 'Rainfall')

    # Only the last two of the six results are reported.
    (_count_by_cluster, _count_by_label, _count_most_freq, _accuracy_df,
     overall_accuracy, accuracy_by_label) = analyzeCluster(
        X_train_hierClustered, y_train)

    print("Accuracy by cluster from hierarchical clustering: \n",
          accuracy_by_label)
    print("Overall accuracy from hierarchical clustering: ", overall_accuracy)
    print("Standard deviation from hierarchical clustering: ",
          accuracy_by_label.std())
def test_random_cluster(self):
    """``cluster_points_centroid`` must agree with the fastcluster/SciPy pipeline.

    1000 random point sets (x, y, relative speed) are clustered both ways
    with a distance threshold of 2.5; the partitions have to be the same.
    The time spent in each implementation is accumulated as well.
    """
    np.random.seed(1337)
    runs = 1000
    threshold = 2.5

    t_old = 0.
    t_new = 0.
    for _ in range(runs):
        count = int(np.random.uniform(2, 32))
        columns = [
            np.random.uniform(-10, 50, (count, 1)),   # x
            np.random.uniform(-5, 5, (count, 1)),     # y
            np.random.uniform(-5, 5, (count, 1)),     # vrel
        ]
        pts = np.hstack(columns)

        # Reference: centroid linkage cut at the distance threshold.
        started = time.time()
        reference_link = linkage_vector(pts, method='centroid')
        reference_idx = fcluster(reference_link, threshold,
                                 criterion='distance')
        t_old += time.time() - started

        # Implementation under test.
        started = time.time()
        candidate_idx = cluster_points_centroid(pts, threshold)
        t_new += time.time() - started

        self.assertTrue(same_clusters(reference_idx, candidate_idx))
def handle_unlabeled(self, data, max_product_id, clustering_algorithm='agglomerative'):
    """Embed unlabeled ads and group them into products.

    ``data`` is run through ``apply_word_embedings`` with this instance's
    model.  In ``'agglomerative'`` mode the vectors are clustered with Ward
    linkage and cut with ``fcluster`` at 0.2; in ``'community'`` mode the work
    is delegated to ``self.graph_communities``.  Any other value leaves the
    embedded data unclustered.

    NOTE(review): ``max_product_id`` is not used and nothing is returned by
    the code shown here -- confirm whether that is intended.
    """
    unknown_products = apply_word_embedings(data, model_name=self.model_name)

    if clustering_algorithm == 'community':
        unknown_products = self.graph_communities(
            unknown_products,
            min_value_=0.8,
            topn_=400,
            k1=50,
            expected_density=0.1,
            graph_communities_df=None)
    elif clustering_algorithm == 'agglomerative':
        vectors = [entry[0][0] for entry in unknown_products.word_vector]
        tree = fastcluster.linkage_vector(np.array(vectors), method='ward')
        labels = Cluster.hierarchy.fcluster(tree, 0.2)
        unknown_products = unknown_products.assign(product_id=labels)
def radard_thread(gctx=None):
    """Run the radar daemon loop; this function never returns.

    Every cycle (nominal rate 20 Hz) the loop:

    1. reads the radar points from the car-specific ``RadarInterface``,
    2. picks up the newest ``live100`` (car state) and ``model`` (vision)
       messages without blocking,
    3. feeds the vision lead estimate through a Kalman filter and adds it to
       the point set as ``VISION_POINT`` when the lead probability is high,
    4. updates one ``Track`` per point and lets vision score the tracks,
    5. clusters the tracks and extracts up to two lead cars,
    6. publishes ``live20`` (leads) and ``liveTracks`` (debugging).

    ``gctx`` is accepted but not used.

    NOTE: this is Python 2 code (print statements, ``xrange``, list-returning
    ``dict.keys()``).
    """
    set_realtime_priority(2)

    # wait for stats about the car to come in from controls
    cloudlog.info("radard is waiting for CarParams")
    CP = car.CarParams.from_bytes(Params().get("CarParams", block=True))
    mocked= CP.radarName == "mock"
    VM = VehicleModel(CP)
    cloudlog.info("radard got CarParams")

    # import the radar from the fingerprint
    cloudlog.info("radard is importing %s", CP.radarName)
    # NOTE(review): the module name is interpolated into code run by exec();
    # CP.radarName must be trusted. importlib would avoid executing a string.
    exec('from selfdrive.radar.'+CP.radarName+'.interface import RadarInterface')

    context = zmq.Context()

    # *** subscribe to features and model from visiond
    poller = zmq.Poller()
    model = messaging.sub_sock(context, service_list['model'].port, conflate=True, poller=poller)
    live100 = messaging.sub_sock(context, service_list['live100'].port, conflate=True, poller=poller)

    PP = PathPlanner()
    RI = RadarInterface()

    # timestamps of the last model / live100 messages, echoed in live20
    last_md_ts = 0
    last_l100_ts = 0

    # *** publish live20 and liveTracks
    live20 = messaging.pub_sock(context, service_list['live20'].port)
    liveTracks = messaging.pub_sock(context, service_list['liveTracks'].port)

    path_x = np.arange(0.0, 140.0, 0.1)    # 140 meters is max

    # Time-alignment
    rate = 20.   # model and radar are both at 20Hz
    tsv = 1./rate
    v_len = 20         # how many speed data points to remember for t alignment with rdr data

    # latest car state; stays at these defaults until the first live100 arrives
    active = 0
    steer_angle = 0.
    steer_override = False

    tracks = defaultdict(dict)

    # Kalman filter stuff:
    ekfv = EKFV1D()
    speedSensorV = SimpleSensor(XV, 1, 2)

    # v_ego
    v_ego = None
    # row 0: speed samples, row 1: the loop time (frame / rate) of each sample
    v_ego_array = np.zeros([2, v_len])
    v_ego_t_aligned = 0.

    rk = Ratekeeper(rate, print_delay_threshold=np.inf)
    while 1:
        rr = RI.update()

        # points of this cycle, keyed by track id:
        # [longitudinal distance, lateral offset, relative speed, measured flag]
        ar_pts = {}
        for pt in rr.points:
            ar_pts[pt.trackId] = [pt.dRel + RDR_TO_LDR, pt.yRel, pt.vRel, pt.measured]

        # receive the live100s
        l100 = None
        md = None

        # non-blocking poll: take whatever has arrived on either socket
        for socket, event in poller.poll(0):
            if socket is live100:
                l100 = messaging.recv_one(socket)
            elif socket is model:
                md = messaging.recv_one(socket)

        if l100 is not None:
            active = l100.live100.active
            v_ego = l100.live100.vEgo
            steer_angle = l100.live100.angleSteers
            steer_override = l100.live100.steerOverride

            # sliding window: append the newest sample, drop the oldest
            v_ego_array = np.append(v_ego_array, [[v_ego], [float(rk.frame)/rate]], 1)
            v_ego_array = v_ego_array[:, 1:]

            last_l100_ts = l100.logMonoTime

        # nothing can be computed before the first speed sample has arrived
        if v_ego is None:
            continue

        if md is not None:
            last_md_ts = md.logMonoTime

        # *** get path prediction from the model ***
        PP.update(v_ego, md)

        # run kalman filter only if prob is high enough
        if PP.lead_prob > 0.7:
            ekfv.update(speedSensorV.read(PP.lead_dist, covar=PP.lead_var))
            ekfv.predict(tsv)
            # vision lead as a pseudo radar point; its measured flag is False
            ar_pts[VISION_POINT] = (float(ekfv.state[XV]), np.polyval(PP.d_poly, float(ekfv.state[XV])),
                                    float(ekfv.state[SPEEDV]), False)
        else:
            # no confident lead: reset the filter to the raw model output
            ekfv.state[XV] = PP.lead_dist
            ekfv.covar = (np.diag([PP.lead_var, ekfv.var_init]))
            ekfv.state[SPEEDV] = 0.
            if VISION_POINT in ar_pts:
                del ar_pts[VISION_POINT]

        # *** compute the likely path_y ***
        if (active and not steer_override) or mocked:
            # use path from model (always when mocking as steering is too noisy)
            path_y = np.polyval(PP.d_poly, path_x)
        else:
            # use path from steer, set angle_offset to 0 it does not only report the physical offset
            path_y = calc_lookahead_offset(v_ego, steer_angle, path_x, VM, angle_offset=0)[0]

        # *** remove missing points from meta data ***
        # NOTE: popping while iterating is only safe because Python 2's
        # keys() returns a list copy.
        for ids in tracks.keys():
            if ids not in ar_pts:
                tracks.pop(ids, None)

        # *** compute the tracks ***
        for ids in ar_pts:
            # ignore standalone vision point, unless we are mocking the radar
            if ids == VISION_POINT and not mocked:
                continue
            rpt = ar_pts[ids]

            # align v_ego by a fixed time to align it with the radar measurement
            cur_time = float(rk.frame)/rate
            v_ego_t_aligned = np.interp(cur_time - RI.delay, v_ego_array[1], v_ego_array[0])
            # smallest distance between the point and the sampled path
            d_path = np.sqrt(np.amin((path_x - rpt[0]) ** 2 + (path_y - rpt[1]) ** 2))
            # add sign
            d_path *= np.sign(rpt[1] - np.interp(rpt[0], path_x, path_y))

            # create the track if it doesn't exist or it's a new track
            if ids not in tracks:
                tracks[ids] = Track()
            tracks[ids].update(rpt[0], rpt[1], rpt[2], d_path, v_ego_t_aligned, rpt[3], steer_override)

        # allow the vision model to remove the stationary flag if distance and rel speed roughly match
        if VISION_POINT in ar_pts:
            fused_id = None
            best_score = NO_FUSION_SCORE
            # the track with the lowest vision score below NO_FUSION_SCORE wins
            for ids in tracks:
                dist_to_vision = np.sqrt((0.5*(ar_pts[VISION_POINT][0] - tracks[ids].dRel)) ** 2 + (2*(ar_pts[VISION_POINT][1] - tracks[ids].yRel)) ** 2)
                rel_speed_diff = abs(ar_pts[VISION_POINT][2] - tracks[ids].vRel)
                tracks[ids].update_vision_score(dist_to_vision, rel_speed_diff)
                if best_score > tracks[ids].vision_score:
                    fused_id = ids
                    best_score = tracks[ids].vision_score

            if fused_id is not None:
                tracks[fused_id].vision_cnt += 1
                tracks[fused_id].update_vision_fusion()

        if DEBUG:
            print "NEW CYCLE"
            if VISION_POINT in ar_pts:
                print "vision", ar_pts[VISION_POINT]

        # idens[k] is the track id behind row k of track_pts
        idens = tracks.keys()
        track_pts = np.array([tracks[iden].get_key_for_cluster() for iden in idens])

        # If we have multiple points, cluster them
        if len(track_pts) > 1:
            link = linkage_vector(track_pts, method='centroid')
            cluster_idxs = fcluster(link, 2.5, criterion='distance')
            # the code assumes labels run from 1 to max(cluster_idxs)
            clusters = [None]*max(cluster_idxs)

            for idx in xrange(len(track_pts)):
                cluster_i = cluster_idxs[idx]-1

                if clusters[cluster_i] == None:
                    clusters[cluster_i] = Cluster()
                clusters[cluster_i].add(tracks[idens[idx]])
        elif len(track_pts) == 1:
            # TODO: why do we need this?
            clusters = [Cluster()]
            clusters[0].add(tracks[idens[0]])
        else:
            clusters = []

        if DEBUG:
            for i in clusters:
                print i

        # *** extract the lead car ***
        lead_clusters = [c for c in clusters
                         if c.is_potential_lead(v_ego)]
        # closest candidate first
        lead_clusters.sort(key=lambda x: x.dRel)
        lead_len = len(lead_clusters)

        # *** extract the second lead from the whole set of leads ***
        lead2_clusters = [c for c in lead_clusters
                          if c.is_potential_lead2(lead_clusters)]
        lead2_clusters.sort(key=lambda x: x.dRel)
        lead2_len = len(lead2_clusters)

        # *** publish live20 ***
        dat = messaging.new_message()
        dat.init('live20')
        dat.live20.mdMonoTime = last_md_ts
        dat.live20.canMonoTimes = list(rr.canMonoTimes)
        dat.live20.radarErrors = list(rr.errors)
        dat.live20.l100MonoTime = last_l100_ts
        if lead_len > 0:
            lead_clusters[0].toLive20(dat.live20.leadOne)
            if lead2_len > 0:
                lead2_clusters[0].toLive20(dat.live20.leadTwo)
            else:
                dat.live20.leadTwo.status = False
        else:
            # leadTwo.status is not written in this branch
            dat.live20.leadOne.status = False

        dat.live20.cumLagMs = -rk.remaining*1000.
        live20.send(dat.to_bytes())

        # *** publish tracks for UI debugging (keep last) ***
        dat = messaging.new_message()
        dat.init('liveTracks', len(tracks))

        for cnt, ids in enumerate(tracks.keys()):
            if DEBUG:
                print "id: %4.0f x: %4.1f y: %4.1f vr: %4.1f d: %4.1f va: %4.1f vl: %4.1f vlk: %4.1f alk: %4.1f s: %1.0f" % \
                    (ids, tracks[ids].dRel, tracks[ids].yRel, tracks[ids].vRel,
                     tracks[ids].dPath, tracks[ids].vLat,
                     tracks[ids].vLead, tracks[ids].vLeadK, tracks[ids].aLeadK, tracks[ids].stationary)
            dat.liveTracks[cnt].trackId = ids
            dat.liveTracks[cnt].dRel = float(tracks[ids].dRel)
            dat.liveTracks[cnt].yRel = float(tracks[ids].yRel)
            dat.liveTracks[cnt].vRel = float(tracks[ids].vRel)
            dat.liveTracks[cnt].aRel = float(tracks[ids].aRel)
            dat.liveTracks[cnt].stationary = tracks[ids].stationary
            dat.liveTracks[cnt].oncoming = tracks[ids].oncoming
        liveTracks.send(dat.to_bytes())

        # presumably paces the loop to `rate` -- confirm in Ratekeeper
        rk.monitor_time()
def radard_thread(gctx=None):
    """Run the radar daemon loop; this function never returns.

    Every cycle (nominal rate 20 Hz) the loop reads the radar points, picks
    up the latest ``live100`` and ``model`` messages, runs the vision lead
    through a Kalman filter, updates one ``Track`` per point, publishes
    ``liveTracks`` for debugging, clusters the tracks, extracts up to two
    lead cars and publishes them as ``live20``.

    ``gctx`` is accepted but not used.

    NOTE: this is Python 2 code (``xrange``, list-returning ``dict.keys()``).
    """
    #print "===>>> File: controls/radard.py; FUnction: radard_thread"
    set_realtime_priority(1)

    # wait for stats about the car to come in from controls
    cloudlog.info("radard is waiting for CarParams")
    CP = car.CarParams.from_bytes(Params().get("CarParams", block=True))
    cloudlog.info("radard got CarParams")

    # import the radar from the fingerprint
    cloudlog.info("radard is importing %s", CP.radarName)
    # NOTE(review): the module name is interpolated into code run by exec();
    # CP.radarName must be trusted. importlib would avoid executing a string.
    exec('from selfdrive.radar.'+CP.radarName+'.interface import RadarInterface')

    context = zmq.Context()

    # *** subscribe to features and model from visiond
    model = messaging.sub_sock(context, service_list['model'].port)
    live100 = messaging.sub_sock(context, service_list['live100'].port)

    PP = PathPlanner()
    RI = RadarInterface()

    # timestamps of the last model / live100 messages, echoed in live20
    last_md_ts = 0
    last_l100_ts = 0

    # *** publish live20 and liveTracks
    live20 = messaging.pub_sock(context, service_list['live20'].port)
    liveTracks = messaging.pub_sock(context, service_list['liveTracks'].port)

    path_x = np.arange(0.0, 140.0, 0.1)    # 140 meters is max

    # Time-alignment
    rate = 20.   # model and radar are both at 20Hz
    tsv = 1./rate
    rdr_delay = 0.10   # radar data delay in s
    v_len = 20         # how many speed data points to remember for t alignment with rdr data

    # latest car state; stays at these defaults until the first live100 arrives
    enabled = 0
    steer_angle = 0.

    tracks = defaultdict(dict)

    # Kalman filter stuff:
    ekfv = EKFV1D()
    speedSensorV = SimpleSensor(XV, 1, 2)

    # v_ego
    v_ego = None
    # row 0: speed samples, row 1: the loop time (frame / rate) of each sample
    v_ego_array = np.zeros([2, v_len])
    v_ego_t_aligned = 0.

    rk = Ratekeeper(rate, print_delay_threshold=np.inf)
    while 1:
        rr = RI.update()

        # points of this cycle, keyed by track id; the tuple layout is
        # [dRel, yRel, vRel, aRel, <unused here>, new-track flag, <unused here>]
        # (index 5 is the only one of the last three read below)
        ar_pts = {}
        for pt in rr.points:
            ar_pts[pt.trackId] = [pt.dRel + RDR_TO_LDR, pt.yRel, pt.vRel, pt.aRel, None, False, None]

        # receive the live100s
        l100 = messaging.recv_sock(live100)
        if l100 is not None:
            enabled = l100.live100.enabled
            v_ego = l100.live100.vEgo
            steer_angle = l100.live100.angleSteers

            # sliding window: append the newest sample, drop the oldest
            v_ego_array = np.append(v_ego_array, [[v_ego], [float(rk.frame)/rate]], 1)
            v_ego_array = v_ego_array[:, 1:]

            last_l100_ts = l100.logMonoTime

        # nothing can be computed before the first speed sample has arrived
        if v_ego is None:
            continue

        md = messaging.recv_sock(model)
        #print "============ RADAR Thread"
        #print md
        if md is not None:
            last_md_ts = md.logMonoTime

        # *** get path prediction from the model ***
        PP.update(sec_since_boot(), v_ego, md)

        # run kalman filter only if prob is high enough
        if PP.lead_prob > 0.7:
            ekfv.update(speedSensorV.read(PP.lead_dist, covar=PP.lead_var))
            ekfv.predict(tsv)
            # vision lead as a pseudo radar point
            ar_pts[VISION_POINT] = (float(ekfv.state[XV]), np.polyval(PP.d_poly, float(ekfv.state[XV])),
                                    float(ekfv.state[SPEEDV]), np.nan, last_md_ts, np.nan, sec_since_boot())
        else:
            # no confident lead: reset the filter to the raw model output
            ekfv.state[XV] = PP.lead_dist
            ekfv.covar = (np.diag([PP.lead_var, ekfv.var_init]))
            ekfv.state[SPEEDV] = 0.
            if VISION_POINT in ar_pts:
                del ar_pts[VISION_POINT]

        # *** compute the likely path_y ***
        if enabled:    # use path from model path_poly
            path_y = np.polyval(PP.d_poly, path_x)
        else:    # use path from steer, set angle_offset to 0 since calibration does not exactly report the physical offset
            path_y = calc_lookahead_offset(v_ego, steer_angle, path_x, CP, angle_offset=0)[0]

        # *** remove missing points from meta data ***
        # NOTE: popping while iterating is only safe because Python 2's
        # keys() returns a list copy.
        for ids in tracks.keys():
            if ids not in ar_pts:
                tracks.pop(ids, None)

        # *** compute the tracks ***
        for ids in ar_pts:
            # ignore the vision point for now
            # (with VISION_ONLY set it is the other way round: only the
            # vision point is tracked)
            if ids == VISION_POINT and not VISION_ONLY:
                continue
            elif ids != VISION_POINT and VISION_ONLY:
                continue
            rpt = ar_pts[ids]

            # align v_ego by a fixed time to align it with the radar measurement
            cur_time = float(rk.frame)/rate
            v_ego_t_aligned = np.interp(cur_time - rdr_delay, v_ego_array[1], v_ego_array[0])
            # smallest distance between the point and the sampled path (unsigned)
            d_path = np.sqrt(np.amin((path_x - rpt[0]) ** 2 + (path_y - rpt[1]) ** 2))

            # create the track if it doesn't exist or it's a new track
            if ids not in tracks or rpt[5] == 1:
                tracks[ids] = Track()
            tracks[ids].update(rpt[0], rpt[1], rpt[2], d_path, v_ego_t_aligned)

            # allow the vision model to remove the stationary flag if distance and rel speed roughly match
            if VISION_POINT in ar_pts:
                dist_to_vision = np.sqrt((0.5*(ar_pts[VISION_POINT][0] - rpt[0])) ** 2 + (2*(ar_pts[VISION_POINT][1] - rpt[1])) ** 2)
                rel_speed_diff = abs(ar_pts[VISION_POINT][2] - rpt[2])
                tracks[ids].mix_vision(dist_to_vision, rel_speed_diff)

        # publish tracks (debugging)
        dat = messaging.new_message()
        dat.init('liveTracks', len(tracks))

        for cnt, ids in enumerate(tracks.keys()):
            dat.liveTracks[cnt].trackId = ids
            dat.liveTracks[cnt].dRel = float(tracks[ids].dRel)
            dat.liveTracks[cnt].yRel = float(tracks[ids].yRel)
            dat.liveTracks[cnt].vRel = float(tracks[ids].vRel)
            dat.liveTracks[cnt].aRel = float(tracks[ids].aRel)
            dat.liveTracks[cnt].stationary = tracks[ids].stationary
            dat.liveTracks[cnt].oncoming = tracks[ids].oncoming
        liveTracks.send(dat.to_bytes())

        # idens[k] is the track id behind row k of track_pts
        idens = tracks.keys()
        track_pts = np.array([tracks[iden].get_key_for_cluster() for iden in idens])

        # If we have multiple points, cluster them
        if len(track_pts) > 1:
            link = linkage_vector(track_pts, method='centroid')
            cluster_idxs = fcluster(link, 2.5, criterion='distance')
            # the code assumes labels run from 1 to max(cluster_idxs)
            clusters = [None]*max(cluster_idxs)

            for idx in xrange(len(track_pts)):
                cluster_i = cluster_idxs[idx]-1

                if clusters[cluster_i] == None:
                    clusters[cluster_i] = Cluster()
                clusters[cluster_i].add(tracks[idens[idx]])
        elif len(track_pts) == 1:
            # TODO: why do we need this?
            clusters = [Cluster()]
            clusters[0].add(tracks[idens[0]])
        else:
            clusters = []

        # *** extract the lead car ***
        lead_clusters = [c for c in clusters
                         if c.is_potential_lead(v_ego)]
        # closest candidate first
        lead_clusters.sort(key=lambda x: x.dRel)
        lead_len = len(lead_clusters)

        # *** extract the second lead from the whole set of leads ***
        lead2_clusters = [c for c in lead_clusters
                          if c.is_potential_lead2(lead_clusters)]
        lead2_clusters.sort(key=lambda x: x.dRel)
        lead2_len = len(lead2_clusters)

        # *** publish live20 ***
        dat = messaging.new_message()
        dat.init('live20')
        dat.live20.mdMonoTime = last_md_ts
        dat.live20.canMonoTimes = list(rr.canMonoTimes)
        dat.live20.l100MonoTime = last_l100_ts
        if lead_len > 0:
            lead_clusters[0].toLive20(dat.live20.leadOne)
            if lead2_len > 0:
                lead2_clusters[0].toLive20(dat.live20.leadTwo)
            else:
                dat.live20.leadTwo.status = False
        else:
            # leadTwo.status is not written in this branch
            dat.live20.leadOne.status = False

        dat.live20.cumLagMs = -rk.remaining*1000.
        live20.send(dat.to_bytes())

        # presumably paces the loop to `rate` -- confirm in Ratekeeper
        rk.monitor_time()
# 少ない列を削除 print(X_train.shape) # drop_columns = X_train.columns[X_train.var(axis='index') <= 2]# 0.5 by bunsan のとき # X_train = X_train.drop(drop_columns, axis=1) # print(X_train.shape) X_train = X_train[:880] print(X_train.shape) # 正規化 # X_train = normalization(X_train) methods = ["ward"] # methods = ["ward", "single", "centroid", "median"] for method in methods: Z = fastcluster.linkage_vector(X_train, method=method, metric="euclidean") Z_dataFrame = pd.DataFrame( data=Z, columns=["clusterOne", "clusterTwo", "distance", "newClusterSize"]) # print(Z_dataFrame[:10]) BINARY_SEARCH = 0 LINER_SEARCH = 1 if BINARY_SEARCH: # クラスター数を要素数の半分とした時の閾値を決めて評価 ( distance_threshold, clusters, X_train_hierClustered,
def test_scipy_clustering(self):
    """Centroid clustering of ``TRACK_PTS`` must reproduce the stored results.

    Both the linkage matrix and the flat labels (cut at distance 2.5) are
    compared with the reference constants.
    """
    reference_link = linkage_vector(TRACK_PTS, method='centroid')
    reference_labels = fcluster(reference_link, 2.5, criterion='distance')

    for actual, expected in ((reference_link, CORRECT_LINK),
                             (reference_labels, CORRECT_LABELS)):
        np.testing.assert_allclose(actual, expected)
def heatmap(x, row_header, column_header, row_method, column_method, row_metric, column_metric, color_gradient, filename,
            other_data=None, log=False, trad=False,
            level_row=0.4, level_column=0.5, folder=os.getcwd(),
            range_normalization=(-2,2), colorbar_ticks=[-2, 0, 2],
            colorbar_ticklabels=[r'$ <\mu-2 \sigma$', r'$\mu$', r'$> \mu+2 \sigma$'],
            colorbar_title='Feature range', title=None, save=False, show=True):
    """Cluster the rows and/or columns of ``x`` and draw a clustered heatmap.

    The linkage is computed with ``fastcluster.linkage_vector`` directly on the
    observation vectors, so no full pairwise distance matrix is built here.
    The original author's notes say this variant exists for "big data"
    (starting around m=50,000 rows), where storing the distance matrix made
    the previous version crash. See the fastcluster documentation for the
    methods and metrics that ``linkage_vector`` accepts.

    Based in large part on these prototype methods:
    http://old.nabble.com/How-to-plot-heatmap-with-matplotlib--td32534593.html
    http://stackoverflow.com/questions/7664826/how-to-get-flat-clustering-corresponding-to-color-clusters-in-the-dendrogram-cre
    http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python/3011894#3011894

    Parameters:
        x: 2-D array, m rows (observations) by n columns.
        row_header, column_header: labels for the rows / columns; converted to
            ``str`` if any element is not already a plain ``str``.
        row_method, column_method: linkage method per axis, or None to leave
            that axis unclustered.
        row_metric, column_metric: metric per axis. ``row_metric=None`` uses
            the default metric of ``linkage_vector``.
        color_gradient: one of 'red_white_blue', 'red_black_sky', 'OrRd',
            'red_black_blue', 'red_black_green', 'yellow_black_blue',
            'seismic', 'green_white_purple', 'coolwarm', 'YlOrRd'.
        filename: tag used in the output file names (only its first 10
            characters are used for the PDF name).
        other_data: optional matrix to display instead of ``x``; its rows are
            reordered like the rows of ``x`` (its columns are not reordered).
            If ``x`` is (n_row, n_col), it should be (n_row, m_col).
        log, trad: accepted for backward compatibility; not read by this
            function.
        level_row, level_column: fraction of the largest merge distance used
            as the flat-cluster cut-off for rows / columns.
        folder: directory for the PDF. NOTE: the default is evaluated once,
            when the function is defined.
        range_normalization: (low, high) colour range. When ``x`` has no
            negative value and ``low`` is negative, 0 is used as the lower
            bound instead.
        colorbar_ticks, colorbar_ticklabels, colorbar_title: colour legend
            settings.
        title: optional x-axis label of the heatmap.
        save: if true, pickle ``[row_clusters, column_clusters]`` to
            'Flat_clusters_<filename>_<level_row>.pkl' in the working
            directory.
        show: if true, call ``pylab.show()`` after saving the figure.

    Returns:
        numpy array of the row flat-cluster labels in the input row order
        (an array of 'NA' strings when ``row_method`` is None).

    Raises:
        ValueError: if ``color_gradient`` is not one of the names above.
    """
    # Trailing space, no newline: the levels printed below continue this line.
    sys.stdout.write("\nPerforming hiearchical clustering using %s for columns and %s for rows " % (column_metric, row_metric))
    if numpy.any(numpy.isnan(x)):
        sys.stderr.write("WARNING, there are NaN values in the data. Hence distances with data elements that have NaN values will have value NaN, which might perturb the hierarchical clustering.")

    print('%s %s' % (level_column, level_row))

    # for export
    if any(type(s) is not str for s in row_header):
        row_header = [str(el) for el in row_header]
    if any(type(s) is not str for s in column_header):
        column_header = [str(el) for el in column_header]

    ### Define the color gradient to use based on the provided name
    if color_gradient == 'red_white_blue':
        cmap = pylab.cm.bwr
    elif color_gradient == 'red_black_sky':
        cmap = RedBlackSkyBlue()
    elif color_gradient == 'OrRd':
        cmap = pylab.cm.OrRd
    elif color_gradient == 'red_black_blue':
        cmap = RedBlackBlue()
    elif color_gradient == 'red_black_green':
        cmap = RedBlackGreen()
    elif color_gradient == 'yellow_black_blue':
        cmap = YellowBlackBlue()
    elif color_gradient == 'seismic':
        cmap = pylab.cm.seismic
    elif color_gradient == 'green_white_purple':
        cmap = pylab.cm.PiYG_r
    elif color_gradient == 'coolwarm':
        cmap = pylab.cm.coolwarm
    elif color_gradient == 'YlOrRd':
        cmap = pylab.cm.YlOrRd
    else:
        # Fail here rather than with a NameError on `cmap` much further down.
        raise ValueError("Unknown color_gradient: %r" % (color_gradient,))

    ### Colour scale: clamp the lower bound to 0 for data without negative values
    norm_low, norm_high = range_normalization[0], range_normalization[1]
    if norm_low < 0 and not numpy.any(x < 0):
        norm_low = 0
    norm = mpl.colors.Normalize(norm_low, norm_high)

    ### Scale the Matplotlib window size
    default_window_hight = 8.5
    default_window_width = 12
    fig = pylab.figure(figsize=(default_window_width, default_window_hight))
    color_bar_w = 0.015 ### Sufficient size to show

    ## calculate positions for all elements (figure-relative [x, y, w, h])
    # ax1, placement of dendrogram 1, on the left of the heatmap
    [ax1_x, ax1_y, ax1_w, ax1_h] = [0.05, 0.22, 0.2, 0.6] ### The second value controls the position of the matrix relative to the bottom of the view
    width_between_ax1_axr = 0.004
    height_between_ax1_axc = 0.004 ### distance between the top color bar axis and the matrix

    # axr, placement of row side colorbar
    axr_w = color_bar_w
    axr_x = ax1_x + ax1_w + width_between_ax1_axr
    axr_y = ax1_y
    axr_h = ax1_h
    width_between_axr_axm = 0.004

    # axc, placement of column side colorbar
    axc_w = 0.5
    axc_h = color_bar_w
    axc_x = axr_x + axr_w + width_between_axr_axm
    axc_y = ax1_y + ax1_h + height_between_ax1_axc
    height_between_axc_ax2 = 0.004

    # axm, placement of heatmap for the data matrix
    axm_x = axr_x + axr_w + width_between_axr_axm
    axm_y = ax1_y
    axm_h = ax1_h
    axm_w = axc_w

    # ax2, placement of dendrogram 2, on the top of the heatmap
    ax2_h = 0.15 ### controls hight of the dendrogram
    ax2_x = axr_x + axr_w + width_between_axr_axm
    ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
    ax2_w = axc_w

    # axcb - placement of the color legend
    [axcb_x, axcb_y, axcb_w, axcb_h] = [0.07, 0.88, 0.18, 0.04]

    # Compute and plot top dendrogram
    if column_method is not None:
        start_time = time.time()
        ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=True)
        Y2 = fastcluster.linkage_vector(x.T, method=column_method, metric=column_metric)
        Z2 = sch.dendrogram(Y2)
        # Cut the tree at a fraction of the largest merge distance.
        ind2 = sch.fcluster(Y2, level_column*max(Y2[:,2]), 'distance')
        ax2.set_xticks([]) ### Hides ticks
        ax2.set_yticks([])
        time_diff = str(round(time.time()-start_time, 1))
        print('Column clustering completed in %s seconds' % time_diff)
    else:
        ind2 = ['NA']*len(column_header) ### Used for exporting the flat cluster data

    # Compute and plot left dendrogram.
    if row_method is not None:
        start_time = time.time()
        ax1 = fig.add_axes([ax1_x, ax1_y, ax1_w, ax1_h], frame_on=True)
        if row_metric is None:
            Y1 = fastcluster.linkage_vector(x, method=row_method)
        else:
            Y1 = fastcluster.linkage_vector(x, method=row_method, metric=row_metric)
        Z1 = sch.dendrogram(Y1, orientation='right')
        ind1 = sch.fcluster(Y1, level_row*max(Y1[:,2]), 'distance')
        ax1.set_xticks([]) ### Hides ticks
        ax1.set_yticks([])
        time_diff = str(round(time.time()-start_time, 1))
        print('Row clustering completed in %s seconds' % time_diff)
    else:
        ind1 = ['NA']*len(row_header) ### Used for exporting the flat cluster data

    if save:
        flat_clusters_path = 'Flat_clusters_{}_{}.pkl'.format(filename, level_row)
        print('Saving flat clusters in %s' % flat_clusters_path)
        # Binary mode is what pickle expects; `with` closes the file on error too.
        with open(flat_clusters_path, 'wb') as f:
            pickle.dump([ind1, ind2], f)

    # Keep the labels in input order; ind1 is reordered to leaf order below.
    ind1_to_return = np.array(ind1)

    # Plot distance matrix.
    axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h]) # axes for the data matrix
    xt = x
    if column_method is not None:
        idx2 = Z2['leaves'] ### apply the clustering for the array-dendrograms to the actual matrix data
        xt = xt[:, idx2]
        ind2 = ind2[idx2] ### reorder the flat cluster to match the order of the leaves the dendrogram
    if row_method is not None:
        idx1 = Z1['leaves'] ### apply the clustering for the gene-dendrograms to the actual matrix data
        xt = xt[idx1, :] # xt is transformed x
        if other_data is not None:
            other_data = other_data[idx1, :]
        ind1 = ind1[idx1] ### reorder the flat cluster to match the order of the leaves the dendrogram

    # norm=norm scales the colouring to the requested range
    displayed = xt if other_data is None else other_data
    axm.matshow(displayed, aspect='auto', origin='lower', cmap=cmap, norm=norm)
    axm.set_xticks([]) ### Hides x-ticks
    axm.set_yticks([])

    # Add text: labels are only drawn for axes with fewer than 200 entries
    if len(row_header) < 200:
        for i in range(x.shape[0]):
            row_label = row_header[idx1[i]] if row_method is not None else row_header[i]
            axm.text(x.shape[1]-0.5, i, ' {}'.format(row_label), fontsize=6)

    if len(column_header) < 200:
        column_decider = x if other_data is None else other_data
        for i in range(column_decider.shape[1]):
            column_label = column_header[idx2[i]] if column_method is not None else column_header[i]
            axm.text(i, -0.9, '{}'.format(column_label), rotation=270, verticalalignment="top", fontsize=6)

    # Plot colside colors
    # axc --> axes for column side colorbar
    if column_method is not None:
        column_cluster_sizes = np.bincount(ind2)
        print('Number of clusters for columns  %s' % (column_cluster_sizes,))
        axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])
        # gradient colormap for the column side colorbar
        cm = ColorMap()
        cr = cm.makeColorRamp(256, ["#FFFF00", "#FF0000"])
        # Loop variable deliberately not named `x`: under Python 2 a list
        # comprehension variable leaks and would overwrite the data matrix.
        degrade = [cm.getColorFromMap(k, cr, 0, 10) for k in range(len(column_cluster_sizes))]
        cmap_c = mpl.colors.ListedColormap(degrade)
        dc = numpy.array(ind2, dtype=int)
        dc.shape = (1, len(ind2))
        axc.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c)
        axc.set_xticks([]) ### Hides ticks
        axc.set_yticks([])

    # Plot rowside colors
    # axr --> axes for row side colorbar
    if row_method is not None:
        print('Number of clusters for rows  %s' % (np.bincount(ind1),))
        axr = fig.add_axes([axr_x, axr_y, axr_w, axr_h])
        dr = numpy.array(ind1, dtype=int)
        dr.shape = (len(ind1), 1)
        # rainbow colormap for row side colorbar
        cmap_r = mpl.cm.gist_rainbow
        axr.matshow(dr, aspect='auto', origin='lower', cmap=cmap_r)
        axr.set_xticks([]) ### Hides ticks
        axr.set_yticks([])

    # Plot color legend
    axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h], frame_on=False) # axes for colorbar
    axcb.set_title(colorbar_title, fontsize=15)
    cb = mpl.colorbar.ColorbarBase(axcb, cmap=cmap, norm=norm, orientation='horizontal', ticks=colorbar_ticks)
    cb.ax.set_xticklabels(colorbar_ticklabels, fontsize=15)

    filename = '%s/Clust_%s_%s_%s.pdf' % (folder, filename[:10], column_method, row_method)

    ### Render the graphic
    pylab.rcParams['font.size'] = 15
    if title is not None:
        axm.set_xlabel(title)
    pylab.savefig(filename)
    print('Exporting: %s' % filename)
    if show:
        pylab.show()

    return ind1_to_return
class TestClustermap:
    """Tests for ``mat.ClusterGrid`` and ``mat.clustermap``.

    The class body builds shared fixtures once: a seeded 4x8 matrix
    (``x_norm``), the same data as a DataFrame with lettered columns
    (``df_norm``), default keyword sets, colour palettes, and (when scipy is
    available) a reference single-linkage dendrogram of the columns.
    """

    # Seeded so the fixture data is reproducible; also drawn from again in
    # test_row_col_colors_ignore_heatmap_kwargs.
    rs = np.random.RandomState(sum(map(ord, "clustermap")))

    x_norm = rs.randn(4, 8) + np.arange(8)
    x_norm = (x_norm.T + np.arange(4)).T
    letters = pd.Series(["A", "B", "C", "D", "E", "F", "G", "H"],
                        name="letters")

    df_norm = pd.DataFrame(x_norm, columns=letters)

    # Keyword arguments passed to ClusterGrid / clustermap by most tests.
    default_kws = dict(pivot_kws=None, z_score=None, standard_scale=None,
                       figsize=(10, 10), row_colors=None, col_colors=None,
                       dendrogram_ratio=.2, colors_ratio=.03,
                       cbar_pos=(0, .8, .05, .2))

    # Keyword arguments passed to ClusterGrid.plot.
    default_plot_kws = dict(metric='euclidean', method='average',
                            colorbar_kws=None,
                            row_cluster=True, col_cluster=True,
                            row_linkage=None, col_linkage=None,
                            tree_kws=None)

    # One colour per row / per column of df_norm.
    row_colors = color_palette('Set2', df_norm.shape[0])
    col_colors = color_palette('Dark2', df_norm.shape[1])

    if not _no_scipy:
        # Reference linkage over the columns: scipy when fastcluster is
        # unavailable, fastcluster otherwise.
        if _no_fastcluster:
            x_norm_distances = distance.pdist(x_norm.T, metric='euclidean')
            x_norm_linkage = hierarchy.linkage(x_norm_distances, method='single')
        else:
            x_norm_linkage = fastcluster.linkage_vector(x_norm.T,
                                                        metric='euclidean',
                                                        method='single')

        x_norm_dendrogram = hierarchy.dendrogram(x_norm_linkage, no_plot=True,
                                                 color_threshold=-np.inf)
        x_norm_leaves = x_norm_dendrogram['leaves']
        df_norm_leaves = np.asarray(df_norm.columns[x_norm_leaves])

    def test_ndarray_input(self):
        """A bare ndarray is stored as a DataFrame; four axes, no colour axes."""
        cg = mat.ClusterGrid(self.x_norm, **self.default_kws)
        pdt.assert_frame_equal(cg.data, pd.DataFrame(self.x_norm))
        assert len(cg.fig.axes) == 4
        assert cg.ax_row_colors is None
        assert cg.ax_col_colors is None

    def test_df_input(self):
        """A DataFrame input is stored unchanged in ``cg.data``."""
        cg = mat.ClusterGrid(self.df_norm, **self.default_kws)
        pdt.assert_frame_equal(cg.data, self.df_norm)

    def test_corr_df_input(self):
        """After plotting a correlation matrix, its diagonal is still all ones."""
        df = self.df_norm.corr()
        cg = mat.ClusterGrid(df, **self.default_kws)
        cg.plot(**self.default_plot_kws)
        diag = cg.data2d.values[np.diag_indices_from(cg.data2d)]
        npt.assert_array_almost_equal(diag, np.ones(cg.data2d.shape[0]))

    def test_pivot_input(self):
        """Long-form data plus ``pivot_kws`` reproduces the wide frame."""
        df_norm = self.df_norm.copy()
        df_norm.index.name = 'numbers'
        df_long = pd.melt(df_norm.reset_index(), var_name='letters',
                          id_vars='numbers')
        kws = self.default_kws.copy()
        kws['pivot_kws'] = dict(index='numbers', columns='letters',
                                values='value')
        cg = mat.ClusterGrid(df_long, **kws)

        pdt.assert_frame_equal(cg.data2d, df_norm)

    def test_colors_input(self):
        """Row/column colour lists are stored as given and add two axes (six total)."""
        kws = self.default_kws.copy()

        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cg = mat.ClusterGrid(self.df_norm, **kws)
        npt.assert_array_equal(cg.row_colors, self.row_colors)
        npt.assert_array_equal(cg.col_colors, self.col_colors)

        assert len(cg.fig.axes) == 6

    def test_categorical_colors_input(self):
        """Categorical colour Series are stored as their RGB tuples."""
        kws = self.default_kws.copy()

        row_colors = pd.Series(self.row_colors, dtype="category")
        col_colors = pd.Series(
            self.col_colors, dtype="category", index=self.df_norm.columns
        )

        kws['row_colors'] = row_colors
        kws['col_colors'] = col_colors

        exp_row_colors = list(map(mpl.colors.to_rgb, row_colors))
        exp_col_colors = list(map(mpl.colors.to_rgb, col_colors))

        cg = mat.ClusterGrid(self.df_norm, **kws)
        npt.assert_array_equal(cg.row_colors, exp_row_colors)
        npt.assert_array_equal(cg.col_colors, exp_col_colors)

        assert len(cg.fig.axes) == 6

    def test_nested_colors_input(self):
        """Nested (multi-level) colour lists are stored as given."""
        kws = self.default_kws.copy()

        row_colors = [self.row_colors, self.row_colors]
        col_colors = [self.col_colors, self.col_colors]
        kws['row_colors'] = row_colors
        kws['col_colors'] = col_colors

        cm = mat.ClusterGrid(self.df_norm, **kws)
        npt.assert_array_equal(cm.row_colors, row_colors)
        npt.assert_array_equal(cm.col_colors, col_colors)

        assert len(cm.fig.axes) == 6

    def test_colors_input_custom_cmap(self):
        """Passing a heatmap ``cmap`` does not alter the stored row/column colours."""
        kws = self.default_kws.copy()

        kws['cmap'] = mpl.cm.PRGn
        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cg = mat.clustermap(self.df_norm, **kws)
        npt.assert_array_equal(cg.row_colors, self.row_colors)
        npt.assert_array_equal(cg.col_colors, self.col_colors)

        assert len(cg.fig.axes) == 6

    def test_z_score(self):
        """``z_score=1`` matches standardising each column by hand."""
        df = self.df_norm.copy()
        df = (df - df.mean()) / df.std()
        kws = self.default_kws.copy()
        kws['z_score'] = 1

        cg = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cg.data2d, df)

    def test_z_score_axis0(self):
        """``z_score=0`` matches standardising each row by hand."""
        df = self.df_norm.copy()
        df = df.T
        df = (df - df.mean()) / df.std()
        df = df.T
        kws = self.default_kws.copy()
        kws['z_score'] = 0

        cg = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cg.data2d, df)

    def test_standard_scale(self):
        """``standard_scale=1`` matches min-max scaling each column by hand."""
        df = self.df_norm.copy()
        df = (df - df.min()) / (df.max() - df.min())
        kws = self.default_kws.copy()
        kws['standard_scale'] = 1

        cg = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cg.data2d, df)

    def test_standard_scale_axis0(self):
        """``standard_scale=0`` matches min-max scaling each row by hand."""
        df = self.df_norm.copy()
        df = df.T
        df = (df - df.min()) / (df.max() - df.min())
        df = df.T
        kws = self.default_kws.copy()
        kws['standard_scale'] = 0

        cg = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cg.data2d, df)

    def test_z_score_standard_scale(self):
        """Requesting both ``z_score`` and ``standard_scale`` raises ValueError."""
        kws = self.default_kws.copy()
        kws['z_score'] = True
        kws['standard_scale'] = True
        with pytest.raises(ValueError):
            mat.ClusterGrid(self.df_norm, **kws)

    def test_color_list_to_matrix_and_cmap(self):
        """With ``axis=0``, matrix row ``i`` maps back to the colour of leaf ``i``."""
        # Note this uses the attribute named col_colors but tests row colors
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            self.col_colors, self.x_norm_leaves, axis=0)

        for i, leaf in enumerate(self.x_norm_leaves):
            color = self.col_colors[leaf]
            assert_colors_equal(cmap(matrix[i, 0]), color)

    def test_nested_color_list_to_matrix_and_cmap(self):
        """Nested colour lists give one matrix column per colour level."""
        # Note this uses the attribute named col_colors but tests row colors
        colors = [self.col_colors, self.col_colors[::-1]]
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            colors, self.x_norm_leaves, axis=0)

        for i, leaf in enumerate(self.x_norm_leaves):
            for j, color_row in enumerate(colors):
                color = color_row[leaf]
                assert_colors_equal(cmap(matrix[i, j]), color)

    def test_color_list_to_matrix_and_cmap_axis1(self):
        """With ``axis=1``, matrix column ``j`` maps back to the colour of leaf ``j``."""
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            self.col_colors, self.x_norm_leaves, axis=1)

        for j, leaf in enumerate(self.x_norm_leaves):
            color = self.col_colors[leaf]
            assert_colors_equal(cmap(matrix[0, j]), color)

    def test_color_list_to_matrix_and_cmap_different_sizes(self):
        """Colour levels of different lengths raise ValueError."""
        colors = [self.col_colors, self.col_colors * 2]
        with pytest.raises(ValueError):
            matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
                colors, self.x_norm_leaves, axis=1)

    def test_savefig(self):
        """Smoke test: a plotted grid can be saved as PNG to a temporary file."""
        # Not sure if this is the right way to test....
        cg = mat.ClusterGrid(self.df_norm, **self.default_kws)
        cg.plot(**self.default_plot_kws)
        cg.savefig(tempfile.NamedTemporaryFile(), format='png')

    def test_plot_dendrograms(self):
        """Dendrogram axes hold one path per coordinate; data2d follows the leaf order."""
        cm = mat.clustermap(self.df_norm, **self.default_kws)

        assert len(cm.ax_row_dendrogram.collections[0].get_paths()) == len(
            cm.dendrogram_row.independent_coord
        )
        assert len(cm.ax_col_dendrogram.collections[0].get_paths()) == len(
            cm.dendrogram_col.independent_coord
        )
        data2d = self.df_norm.iloc[cm.dendrogram_row.reordered_ind,
                                   cm.dendrogram_col.reordered_ind]
        pdt.assert_frame_equal(cm.data2d, data2d)

    def test_cluster_false(self):
        """With clustering off on both axes, dendrogram axes are empty and data is unchanged."""
        kws = self.default_kws.copy()
        kws['row_cluster'] = False
        kws['col_cluster'] = False

        cm = mat.clustermap(self.df_norm, **kws)
        assert len(cm.ax_row_dendrogram.lines) == 0
        assert len(cm.ax_col_dendrogram.lines) == 0

        assert len(cm.ax_row_dendrogram.get_xticks()) == 0
        assert len(cm.ax_row_dendrogram.get_yticks()) == 0
        assert len(cm.ax_col_dendrogram.get_xticks()) == 0
        assert len(cm.ax_col_dendrogram.get_yticks()) == 0

        pdt.assert_frame_equal(cm.data2d, self.df_norm)

    def test_row_col_colors(self):
        """Each colour axis holds exactly one collection."""
        kws = self.default_kws.copy()
        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.clustermap(self.df_norm, **kws)

        assert len(cm.ax_row_colors.collections) == 1
        assert len(cm.ax_col_colors.collections) == 1

    def test_cluster_false_row_col_colors(self):
        """Colour axes are still drawn when clustering is off on both axes."""
        kws = self.default_kws.copy()
        kws['row_cluster'] = False
        kws['col_cluster'] = False
        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.clustermap(self.df_norm, **kws)
        assert len(cm.ax_row_dendrogram.lines) == 0
        assert len(cm.ax_col_dendrogram.lines) == 0

        assert len(cm.ax_row_dendrogram.get_xticks()) == 0
        assert len(cm.ax_row_dendrogram.get_yticks()) == 0
        assert len(cm.ax_col_dendrogram.get_xticks()) == 0
        assert len(cm.ax_col_dendrogram.get_yticks()) == 0
        assert len(cm.ax_row_colors.collections) == 1
        assert len(cm.ax_col_colors.collections) == 1

        pdt.assert_frame_equal(cm.data2d, self.df_norm)

    def test_row_col_colors_df(self):
        """DataFrame colours: column names become the colour labels and tick labels."""
        kws = self.default_kws.copy()
        kws['row_colors'] = pd.DataFrame(
            {
                'row_1': list(self.row_colors),
                'row_2': list(self.row_colors)
            },
            index=self.df_norm.index, columns=['row_1', 'row_2'])
        kws['col_colors'] = pd.DataFrame(
            {
                'col_1': list(self.col_colors),
                'col_2': list(self.col_colors)
            },
            index=self.df_norm.columns, columns=['col_1', 'col_2'])

        cm = mat.clustermap(self.df_norm, **kws)

        row_labels = [l.get_text() for l in
                      cm.ax_row_colors.get_xticklabels()]
        assert cm.row_color_labels == ['row_1', 'row_2']
        assert row_labels == cm.row_color_labels

        col_labels = [l.get_text() for l in
                      cm.ax_col_colors.get_yticklabels()]
        assert cm.col_color_labels == ['col_1', 'col_2']
        assert col_labels == cm.col_color_labels

    def test_row_col_colors_df_shuffled(self):
        """DataFrame colours given in shuffled index order are matched by label."""
        # Tests if colors are properly matched, even if given in wrong order

        m, n = self.df_norm.shape
        # Even positions first, then odd positions.
        shuffled_inds = [
            self.df_norm.index[i]
            for i in list(range(0, m, 2)) + list(range(1, m, 2))
        ]
        shuffled_cols = [
            self.df_norm.columns[i]
            for i in list(range(0, n, 2)) + list(range(1, n, 2))
        ]

        kws = self.default_kws.copy()

        row_colors = pd.DataFrame({'row_annot': list(self.row_colors)},
                                  index=self.df_norm.index)
        kws['row_colors'] = row_colors.loc[shuffled_inds]

        col_colors = pd.DataFrame({'col_annot': list(self.col_colors)},
                                  index=self.df_norm.columns)
        kws['col_colors'] = col_colors.loc[shuffled_cols]

        cm = mat.clustermap(self.df_norm, **kws)
        assert list(cm.col_colors)[0] == list(self.col_colors)
        assert list(cm.row_colors)[0] == list(self.row_colors)

    def test_row_col_colors_df_missing(self):
        """Entries missing from a colour DataFrame are filled with white."""
        kws = self.default_kws.copy()
        row_colors = pd.DataFrame({'row_annot': list(self.row_colors)},
                                  index=self.df_norm.index)
        kws['row_colors'] = row_colors.drop(self.df_norm.index[0])

        col_colors = pd.DataFrame({'col_annot': list(self.col_colors)},
                                  index=self.df_norm.columns)
        kws['col_colors'] = col_colors.drop(self.df_norm.columns[0])

        cm = mat.clustermap(self.df_norm, **kws)

        assert list(
            cm.col_colors)[0] == [(1.0, 1.0, 1.0)] + list(self.col_colors[1:])
        assert list(
            cm.row_colors)[0] == [(1.0, 1.0, 1.0)] + list(self.row_colors[1:])

    def test_row_col_colors_df_one_axis(self):
        """DataFrame colours work when given for only one of the two axes."""
        # Test case with only row annotation.
        kws1 = self.default_kws.copy()
        kws1['row_colors'] = pd.DataFrame(
            {
                'row_1': list(self.row_colors),
                'row_2': list(self.row_colors)
            },
            index=self.df_norm.index, columns=['row_1', 'row_2'])

        cm1 = mat.clustermap(self.df_norm, **kws1)

        row_labels = [
            l.get_text() for l in cm1.ax_row_colors.get_xticklabels()
        ]
        assert cm1.row_color_labels == ['row_1', 'row_2']
        assert row_labels == cm1.row_color_labels

        # Test case with only col annotation.
        kws2 = self.default_kws.copy()
        kws2['col_colors'] = pd.DataFrame(
            {
                'col_1': list(self.col_colors),
                'col_2': list(self.col_colors)
            },
            index=self.df_norm.columns, columns=['col_1', 'col_2'])

        cm2 = mat.clustermap(self.df_norm, **kws2)

        col_labels = [
            l.get_text() for l in cm2.ax_col_colors.get_yticklabels()
        ]
        assert cm2.col_color_labels == ['col_1', 'col_2']
        assert col_labels == cm2.col_color_labels

    def test_row_col_colors_series(self):
        """Series colours: the Series name becomes the colour label and tick label."""
        kws = self.default_kws.copy()
        kws['row_colors'] = pd.Series(list(self.row_colors), name='row_annot',
                                      index=self.df_norm.index)
        kws['col_colors'] = pd.Series(list(self.col_colors), name='col_annot',
                                      index=self.df_norm.columns)

        cm = mat.clustermap(self.df_norm, **kws)

        row_labels = [l.get_text() for l in
                      cm.ax_row_colors.get_xticklabels()]
        assert cm.row_color_labels == ['row_annot']
        assert row_labels == cm.row_color_labels

        col_labels = [l.get_text() for l in
                      cm.ax_col_colors.get_yticklabels()]
        assert cm.col_color_labels == ['col_annot']
        assert col_labels == cm.col_color_labels

    def test_row_col_colors_series_shuffled(self):
        """Series colours given in shuffled index order are matched by label."""
        # Tests if colors are properly matched, even if given in wrong order

        m, n = self.df_norm.shape
        # Even positions first, then odd positions.
        shuffled_inds = [
            self.df_norm.index[i]
            for i in list(range(0, m, 2)) + list(range(1, m, 2))
        ]
        shuffled_cols = [
            self.df_norm.columns[i]
            for i in list(range(0, n, 2)) + list(range(1, n, 2))
        ]

        kws = self.default_kws.copy()

        row_colors = pd.Series(list(self.row_colors), name='row_annot',
                               index=self.df_norm.index)
        kws['row_colors'] = row_colors.loc[shuffled_inds]

        col_colors = pd.Series(list(self.col_colors), name='col_annot',
                               index=self.df_norm.columns)
        kws['col_colors'] = col_colors.loc[shuffled_cols]

        cm = mat.clustermap(self.df_norm, **kws)

        assert list(cm.col_colors) == list(self.col_colors)
        assert list(cm.row_colors) == list(self.row_colors)

    def test_row_col_colors_series_missing(self):
        """Entries missing from a colour Series are filled with white."""
        kws = self.default_kws.copy()
        row_colors = pd.Series(list(self.row_colors), name='row_annot',
                               index=self.df_norm.index)
        kws['row_colors'] = row_colors.drop(self.df_norm.index[0])

        col_colors = pd.Series(list(self.col_colors), name='col_annot',
                               index=self.df_norm.columns)
        kws['col_colors'] = col_colors.drop(self.df_norm.columns[0])

        cm = mat.clustermap(self.df_norm, **kws)
        assert list(
            cm.col_colors) == [(1.0, 1.0, 1.0)] + list(self.col_colors[1:])
        assert list(
            cm.row_colors) == [(1.0, 1.0, 1.0)] + list(self.row_colors[1:])

    def test_row_col_colors_ignore_heatmap_kwargs(self):
        """Heatmap-only kwargs (cmap, norm, vmax) do not change the colour-axis face colours."""
        g = mat.clustermap(self.rs.uniform(0, 200, self.df_norm.shape),
                           row_colors=self.row_colors,
                           col_colors=self.col_colors,
                           cmap="Spectral",
                           norm=mpl.colors.LogNorm(),
                           vmax=100)

        assert np.array_equal(
            np.array(self.row_colors)[g.dendrogram_row.reordered_ind],
            g.ax_row_colors.collections[0].get_facecolors()[:, :3]
        )

        assert np.array_equal(
            np.array(self.col_colors)[g.dendrogram_col.reordered_ind],
            g.ax_col_colors.collections[0].get_facecolors()[:, :3]
        )

    def test_row_col_colors_raise_on_mixed_index_types(self):
        """Indexed colour Series combined with bare ndarray data raise TypeError."""
        row_colors = pd.Series(
            list(self.row_colors), name="row_annot", index=self.df_norm.index
        )

        col_colors = pd.Series(
            list(self.col_colors), name="col_annot", index=self.df_norm.columns
        )

        with pytest.raises(TypeError):
            mat.clustermap(self.x_norm, row_colors=row_colors)

        with pytest.raises(TypeError):
            mat.clustermap(self.x_norm, col_colors=col_colors)

    def test_mask_reorganization(self):
        """The mask is reordered in step with the clustered data."""
        kws = self.default_kws.copy()
        kws["mask"] = self.df_norm > 0

        g = mat.clustermap(self.df_norm, **kws)
        npt.assert_array_equal(g.data2d.index, g.mask.index)
        npt.assert_array_equal(g.data2d.columns, g.mask.columns)

        npt.assert_array_equal(g.mask.index,
                               self.df_norm.index[
                                   g.dendrogram_row.reordered_ind])
        npt.assert_array_equal(g.mask.columns,
                               self.df_norm.columns[
                                   g.dendrogram_col.reordered_ind])

    def test_ticklabel_reorganization(self):
        """Explicit tick labels are reordered to follow the dendrogram leaf order."""
        kws = self.default_kws.copy()
        xtl = np.arange(self.df_norm.shape[1])
        kws["xticklabels"] = list(xtl)
        ytl = self.letters.loc[:self.df_norm.shape[0]]
        kws["yticklabels"] = ytl

        g = mat.clustermap(self.df_norm, **kws)

        xtl_actual = [t.get_text() for t in g.ax_heatmap.get_xticklabels()]
        ytl_actual = [t.get_text() for t in g.ax_heatmap.get_yticklabels()]

        xtl_want = xtl[g.dendrogram_col.reordered_ind].astype("<U1")
        ytl_want = ytl[g.dendrogram_row.reordered_ind].astype("<U1")

        npt.assert_array_equal(xtl_actual, xtl_want)
        npt.assert_array_equal(ytl_actual, ytl_want)

    def test_noticklabels(self):
        """``xticklabels=False`` / ``yticklabels=False`` leave the heatmap without tick labels."""
        kws = self.default_kws.copy()
        kws["xticklabels"] = False
        kws["yticklabels"] = False

        g = mat.clustermap(self.df_norm, **kws)

        xtl_actual = [t.get_text() for t in g.ax_heatmap.get_xticklabels()]
        ytl_actual = [t.get_text() for t in g.ax_heatmap.get_yticklabels()]
        assert xtl_actual == []
        assert ytl_actual == []

    def test_size_ratios(self):
        """Larger dendrogram/colour ratios enlarge those axes and shrink the heatmap."""
        # The way that wspace/hspace work in GridSpec, the mapping from input
        # ratio to actual width/height of each axes is complicated, so this
        # test is just going to assert comparative relationships

        kws1 = self.default_kws.copy()
        kws1.update(dendrogram_ratio=.2, colors_ratio=.03,
                    col_colors=self.col_colors, row_colors=self.row_colors)

        kws2 = kws1.copy()
        kws2.update(dendrogram_ratio=.3, colors_ratio=.05)

        g1 = mat.clustermap(self.df_norm, **kws1)
        g2 = mat.clustermap(self.df_norm, **kws2)

        assert (g2.ax_col_dendrogram.get_position().height
                > g1.ax_col_dendrogram.get_position().height)

        assert (g2.ax_col_colors.get_position().height
                > g1.ax_col_colors.get_position().height)

        assert (g2.ax_heatmap.get_position().height
                < g1.ax_heatmap.get_position().height)

        assert (g2.ax_row_dendrogram.get_position().width
                > g1.ax_row_dendrogram.get_position().width)

        assert (g2.ax_row_colors.get_position().width
                > g1.ax_row_colors.get_position().width)

        assert (g2.ax_heatmap.get_position().width
                < g1.ax_heatmap.get_position().width)

        # Two levels of column colours take more height than one.
        kws1 = self.default_kws.copy()
        kws1.update(col_colors=self.col_colors)
        kws2 = kws1.copy()
        kws2.update(col_colors=[self.col_colors, self.col_colors])

        g1 = mat.clustermap(self.df_norm, **kws1)
        g2 = mat.clustermap(self.df_norm, **kws2)

        assert (g2.ax_col_colors.get_position().height
                > g1.ax_col_colors.get_position().height)

        # A tuple dendrogram_ratio: only the second entry differs here.
        kws1 = self.default_kws.copy()
        kws1.update(dendrogram_ratio=(.2, .2))

        kws2 = kws1.copy()
        kws2.update(dendrogram_ratio=(.2, .3))

        g1 = mat.clustermap(self.df_norm, **kws1)
        g2 = mat.clustermap(self.df_norm, **kws2)

        # Fails on pinned matplotlib?
        # assert (g2.ax_row_dendrogram.get_position().width
        #         == g1.ax_row_dendrogram.get_position().width)
        assert g1.gs.get_width_ratios() == g2.gs.get_width_ratios()

        assert (g2.ax_col_dendrogram.get_position().height
                > g1.ax_col_dendrogram.get_position().height)

    def test_cbar_pos(self):
        """``cbar_pos`` places the colourbar axes; ``None`` removes them."""
        kws = self.default_kws.copy()
        kws["cbar_pos"] = (.2, .1, .4, .3)

        g = mat.clustermap(self.df_norm, **kws)
        pos = g.ax_cbar.get_position()
        assert pytest.approx(tuple(pos.p0)) == kws["cbar_pos"][:2]
        assert pytest.approx(pos.width) == kws["cbar_pos"][2]
        assert pytest.approx(pos.height) == kws["cbar_pos"][3]

        kws["cbar_pos"] = None
        g = mat.clustermap(self.df_norm, **kws)
        assert g.ax_cbar is None

    def test_square_warning(self):
        """``square=True`` emits a UserWarning and leaves the heatmap position unchanged."""
        kws = self.default_kws.copy()
        g1 = mat.clustermap(self.df_norm, **kws)

        with pytest.warns(UserWarning):
            kws["square"] = True
            g2 = mat.clustermap(self.df_norm, **kws)

        g1_shape = g1.ax_heatmap.get_position().get_points()
        g2_shape = g2.ax_heatmap.get_position().get_points()
        assert np.array_equal(g1_shape, g2_shape)

    def test_clustermap_annotation(self):
        """Annotation texts follow the reordered data for ``annot=True`` and for an explicit frame."""
        g = mat.clustermap(self.df_norm, annot=True, fmt=".1f")
        for val, text in zip(np.asarray(g.data2d).flat, g.ax_heatmap.texts):
            assert text.get_text() == "{:.1f}".format(val)

        g = mat.clustermap(self.df_norm, annot=self.df_norm, fmt=".1f")
        for val, text in zip(np.asarray(g.data2d).flat, g.ax_heatmap.texts):
            assert text.get_text() == "{:.1f}".format(val)

    def test_tree_kws(self):
        """``tree_kws`` colour is applied to both dendrogram line collections."""
        rgb = (1, .5, .2)
        g = mat.clustermap(self.df_norm, tree_kws=dict(color=rgb))
        for ax in [g.ax_col_dendrogram, g.ax_row_dendrogram]:
            tree, = ax.collections
            assert tuple(tree.get_color().squeeze())[:3] == rgb
def get_single_inverse_image_clusters(inverseImage, coveringSetElements, dmax,
                                      debugMode, metricName='euclidean',
                                      clusterMethod='single'):
    """Cluster the points of one inverse image, cutting the tree at ``dmax``.

    The points are clustered hierarchically with
    ``fastcluster.linkage_vector`` and merges are replayed in order until the
    first merge whose distance is not below ``dmax``.

    Parameters
    ----------
    inverseImage : array-like with a ``shape`` attribute
        One row per data point; passed to ``fastcluster.linkage_vector``.
        Not touched when fewer than two elements are supplied.
    coveringSetElements : sequence
        Original (hashable) index of each row of ``inverseImage``, in row
        order.
    dmax : float
        Merges with a distance strictly below this value are applied.
    debugMode : int
        ``1`` prints diagnostic output; any other value is silent.
    metricName : str
        Metric forwarded to ``fastcluster.linkage_vector``.
    clusterMethod : str
        Linkage method forwarded to ``fastcluster.linkage_vector``.

    Returns
    -------
    list of set
        One set of original indices per cluster. ``[set()]`` for empty input
        and a single singleton set for one element.
    """
    if debugMode == 1:
        print("")
        print("(function: get_single_inverse_image_clusters) ")

    # Fewer than two points: there is nothing to cluster.
    numElements = len(coveringSetElements)
    if numElements == 0:
        return [set()]
    if numElements == 1:
        return [{coveringSetElements[0]}]

    # perform clustering in the inverse image
    links = fastcluster.linkage_vector(inverseImage, method=clusterMethod,
                                       metric=metricName)

    # total number of data points in this inverse image
    N = inverseImage.shape[0]

    # Labels 0..N-1 are the original points, matching the node labels used in
    # 'links'; each starts as a one-element list holding its original index.
    clusterDict = {i: [coveringSetElements[i]] for i in range(N)}

    # Column 2 of 'links' is the merge distance of each step. Replay merges
    # until the first distance that is not below dmax (or the end of 'links').
    # Stopping here, rather than skipping the row, avoids looking up a cluster
    # that was never created when a later row refers to a skipped merge.
    for index in range(N - 1):
        if not links[index][2] < dmax:
            break
        # 'links' stores node labels as floats; use ints as dictionary keys.
        p1 = int(links[index][0])
        p2 = int(links[index][1])
        # The cluster created at step 'index' is labelled N + index.
        clusterDict[N + index] = clusterDict.pop(p1) + clusterDict.pop(p2)

    # Convert every cluster (a list of original indices) to a set. Building a
    # new list works on Python 3, where dict.values() is not indexable.
    clusters = []
    for members in clusterDict.values():
        memberSet = set(members)
        # if lengths don't match, an index appeared twice in one cluster
        if len(members) != len(memberSet) and debugMode == 1:
            print("ERROR!! (function: get_single_inverse_image_clusters): listlength != setlength")
        clusters.append(memberSet)

    if debugMode == 1:
        print("links.shape = " + str(links.shape))
        print("number of data points in this inverse image = " + str(N))
        print("should = number of elements in coveringSetElements = " + str(len(coveringSetElements)))
        print("Number of clusters = " + str(len(clusters)))
        print("first merging distance = " + str(links[0][2]))
        print("last merging distance = " + str(links[N-2][2]))
        print("")

    return clusters
# NOTE(review): the statements up to the commented-out plt.text call read
# `data`, `algorithm`, `plot_kwds` and `start_time`, none of which are defined
# in this fragment; they look like the tail of a plotting helper whose header
# lies outside this chunk -- confirm the intended indentation.
labels = algorithm.labels_
end_time = time.time()
# One palette entry per label value; negative labels are drawn in black.
palette = sns.color_palette('deep', np.unique(labels).max() + 1)
colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]
plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
# Hide both axes of the current plot.
frame = plt.gca()
frame.axes.get_xaxis().set_visible(False)
frame.axes.get_yaxis().set_visible(False)
# plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
# plt.text(-0.5, 0.7, 'Clustering took {:.2f} s'.format(end_time - start_time), fontsize=14)

# clusterer = hdbscan.HDBSCAN(min_cluster_size=1000, min_samples=100).fit(X)
# plot_clusters(X[:,:5], clusterer)

# Ward linkage on X, then cut the tree into at most 8 flat clusters.
link_mat = fastcluster.linkage_vector(X, method='ward')
## fc_cluster
labels = fcluster(link_mat, 8, criterion='maxclust')
# plt.clf();plt.scatter(X.T[0], X.T[1], c=(labels==6).astype(np.int))

# Drop every sample assigned to cluster 6 from X and from the arrays that are
# indexed in parallel with it.
X = X[labels != 6, :]
## remove shots of sky for light normalization....
imgs = imgs[labels != 6]
imgs_paths = imgs_paths[labels != 6]

## generate additional features
# Each image is reshaped to (24, 38, 3) and flattened to one row per pixel, so
# the statistics below are taken per channel (3 values per image).
stdevs = [np.std(t.reshape((24, 38, 3)).reshape(-1, 3), axis=0) for t in imgs]
avg_vals = np.array(
    [np.mean(t.reshape((24, 38, 3)).reshape(-1, 3), axis=0) for t in imgs])

################ gmm ################
fastcluster.linkage(D, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Next: the original array does not contain a NaN, but a NaN occurs # as an updated distance. for method in ['average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage([np.inf,-np.inf,-np.inf], method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Part 2: vector input dim = np.random.random_integers(2,12) X = np.random.rand(n,dim) pos = (np.random.randint(n), np.random.randint(dim)) # Insert a single NaN coordinate X[pos] = np.nan for method in ['single', 'ward', 'centroid', 'median']: try: fastcluster.linkage_vector(X, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass print('OK.')
def test_all(n,dim):
    """Check ``fc.linkage_vector`` on random data for every supported metric.

    For each metric a random ``n`` x ``dim`` point cloud is clustered with
    ``fc.linkage_vector`` and the result is handed to ``test`` together with
    the condensed distance matrix computed by ``pdist``.

    Parameters
    ----------
    n : int
        Number of random points.
    dim : int
        Dimension of each point.

    Raises
    ------
    AssertionError
        If ``linkage_vector`` modifies its input, or reports a NaN although
        the distance matrix contains none.
    """
    method = 'single'

    # metrics for boolean vectors
    # randint(0, 2) replaces the deprecated random_integers(0, 1); builtin
    # bool replaces np.bool, which is missing from NumPy 1.24-1.26.
    pcd = np.random.randint(0, 2, size=(n,dim)).astype(bool)
    pcd2 = pcd.copy()
    for metric in ('hamming', 'jaccard', 'yule', 'matching', 'dice',
                   'rogerstanimoto',
                   #'sokalmichener',
                   # exclude, bug in Scipy
                   # http://projects.scipy.org/scipy/ticket/1486
                   'russellrao', 'sokalsneath',
                   #'kulsinski'
                   # exclude, bug in Scipy
                   # http://projects.scipy.org/scipy/ticket/1484
                   ):
        sys.stdout.write("Metric: " + metric + "...")
        D = pdist(pcd, metric)
        D = correct_for_zero_vectors(D, pcd, metric)

        try:
            Z2 = fc.linkage_vector(pcd, method, metric)
        except FloatingPointError:
            # If linkage_vector reported a NaN dissimilarity value,
            # check whether the distance matrix really contains NaN.
            if np.any(np.isnan(D)):
                print("Skip this test: NaN dissimilarity value.")
                continue
            else:
                raise AssertionError('"linkage_vector" erroneously reported NaN.')

        # linkage_vector must leave its input untouched.
        if np.any(pcd2!=pcd):
            raise AssertionError('Input array was corrupted.', pcd)
        test(Z2, method, D)

    # metrics for real vectors
    # Integer coordinates in [-bound, bound]; int() performs the truncation
    # that random_integers applied implicitly to the float square root.
    bound = int(math.sqrt(n))
    pcd = np.random.randint(-bound, bound + 1, (n,dim))
    for metric in ['euclidean', 'sqeuclidean', 'cityblock', 'chebychev',
                   'minkowski', 'cosine', 'correlation', 'hamming', 'jaccard',
                   'canberra',
                   # canberra: see bug in older Scipy versions
                   # http://projects.scipy.org/scipy/ticket/1430
                   'braycurtis', 'seuclidean', 'mahalanobis', 'user']:
        sys.stdout.write("Metric: " + metric + "...")
        if metric=='minkowski':
            p = np.random.uniform(1.,10.)
            sys.stdout.write("p: " + str(p) + "...")
            # p must be a keyword: current SciPy accepts only X and metric
            # positionally.
            D = pdist(pcd, metric, p=p)
            Z2 = fc.linkage_vector(pcd, method, metric, p)
        elif metric=='user':
            # Euclidean metric as a user function
            fn = (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum()))
            D = pdist(pcd, fn)
            Z2 = fc.linkage_vector(pcd, method, fn)
        else:
            D = pdist(pcd, metric)
            D = correct_for_zero_vectors(D, pcd, metric)
            try:
                Z2 = fc.linkage_vector(pcd, method, metric)
            except FloatingPointError:
                if np.any(np.isnan(D)):
                    print("Skip this test: NaN dissimilarity value.")
                    continue
                else:
                    raise AssertionError(
                        '"linkage_vector" erroneously reported NaN.')

        test(Z2, method, D)

    # Remaining linkage methods, checked against default pdist distances.
    D = pdist(pcd)
    for method in ['ward', 'centroid', 'median']:
        Z2 = fc.linkage_vector(pcd, method)
        test(Z2, method, D)
class TestDendrogram(object):
    """Tests for ``mat._DendrogramPlotter`` and ``mat.dendrogram``.

    The class body builds a fixed 4 x 8 data set plus the reference linkage,
    dendrogram and leaf order that the individual tests compare against.
    """

    # Seed derived from the test name so the fixture is reproducible.
    rs = np.random.RandomState(sum(map(ord, "dendrogram")))

    x_norm = rs.randn(4, 8) + np.arange(8)
    x_norm = (x_norm.T + np.arange(4)).T
    letters = pd.Series(["A", "B", "C", "D", "E", "F", "G", "H"],
                        name="letters")

    df_norm = pd.DataFrame(x_norm, columns=letters)

    # Reference linkage over the columns: fastcluster when it is installed,
    # otherwise the scipy equivalent.
    try:
        import fastcluster

        x_norm_linkage = fastcluster.linkage_vector(x_norm.T,
                                                    metric='euclidean',
                                                    method='single')
    except ImportError:
        x_norm_distances = distance.pdist(x_norm.T, metric='euclidean')
        x_norm_linkage = hierarchy.linkage(x_norm_distances, method='single')
    x_norm_dendrogram = hierarchy.dendrogram(x_norm_linkage, no_plot=True,
                                             color_list=['k'],
                                             color_threshold=-np.inf)
    x_norm_leaves = x_norm_dendrogram['leaves']
    df_norm_leaves = np.asarray(df_norm.columns[x_norm_leaves])

    default_kws = dict(linkage=None, metric='euclidean', method='single',
                       axis=1, label=True, rotate=False)

    def test_ndarray_input(self):
        """A plain ndarray is wrapped in a frame and clustered by column."""
        p = mat._DendrogramPlotter(self.x_norm, **self.default_kws)
        npt.assert_array_equal(p.array.T, self.x_norm)
        pdt.assert_frame_equal(p.data.T, pd.DataFrame(self.x_norm))

        npt.assert_array_equal(p.linkage, self.x_norm_linkage)
        nt.assert_dict_equal(p.dendrogram, self.x_norm_dendrogram)

        npt.assert_array_equal(p.reordered_ind, self.x_norm_leaves)

        npt.assert_array_equal(p.xticklabels, self.x_norm_leaves)
        npt.assert_array_equal(p.yticklabels, [])

        nt.assert_equal(p.xlabel, None)
        nt.assert_equal(p.ylabel, '')

    def test_df_input(self):
        """A DataFrame keeps its column labels and column-index name."""
        p = mat._DendrogramPlotter(self.df_norm, **self.default_kws)
        npt.assert_array_equal(p.array.T, np.asarray(self.df_norm))
        pdt.assert_frame_equal(p.data.T, self.df_norm)

        npt.assert_array_equal(p.linkage, self.x_norm_linkage)
        nt.assert_dict_equal(p.dendrogram, self.x_norm_dendrogram)

        npt.assert_array_equal(p.xticklabels,
                               np.asarray(self.df_norm.columns)[
                                   self.x_norm_leaves])
        npt.assert_array_equal(p.yticklabels, [])

        nt.assert_equal(p.xlabel, 'letters')
        nt.assert_equal(p.ylabel, '')

    def test_df_multindex_input(self):
        """MultiIndex levels are joined with '-' to form the tick labels."""
        df = self.df_norm.copy()
        index = pd.MultiIndex.from_tuples([("A", 1), ("B", 2),
                                           ("C", 3), ("D", 4)],
                                          names=["letter", "number"])
        index.name = "letter-number"
        df.index = index
        kws = self.default_kws.copy()
        kws['label'] = True

        p = mat._DendrogramPlotter(df.T, **kws)

        xticklabels = ["A-1", "B-2", "C-3", "D-4"]
        xticklabels = [xticklabels[i] for i in p.reordered_ind]
        npt.assert_array_equal(p.xticklabels, xticklabels)
        npt.assert_array_equal(p.yticklabels, [])
        nt.assert_equal(p.xlabel, "letter-number")

    def test_axis0_input(self):
        """axis=0 on the transposed frame gives the same clustering."""
        kws = self.default_kws.copy()
        kws['axis'] = 0
        p = mat._DendrogramPlotter(self.df_norm.T, **kws)

        npt.assert_array_equal(p.array, np.asarray(self.df_norm.T))
        pdt.assert_frame_equal(p.data, self.df_norm.T)

        npt.assert_array_equal(p.linkage, self.x_norm_linkage)
        nt.assert_dict_equal(p.dendrogram, self.x_norm_dendrogram)

        npt.assert_array_equal(p.xticklabels, self.df_norm_leaves)
        npt.assert_array_equal(p.yticklabels, [])

        nt.assert_equal(p.xlabel, 'letters')
        nt.assert_equal(p.ylabel, '')

    def test_rotate_input(self):
        """rotate=True moves the labels from the x axis to the y axis."""
        kws = self.default_kws.copy()
        kws['rotate'] = True
        p = mat._DendrogramPlotter(self.df_norm, **kws)
        npt.assert_array_equal(p.array.T, np.asarray(self.df_norm))
        pdt.assert_frame_equal(p.data.T, self.df_norm)

        npt.assert_array_equal(p.xticklabels, [])
        npt.assert_array_equal(p.yticklabels, self.df_norm_leaves)

        nt.assert_equal(p.xlabel, '')
        nt.assert_equal(p.ylabel, 'letters')

    def test_rotate_axis0_input(self):
        """Rotation does not change the leaf order for axis=0 input."""
        kws = self.default_kws.copy()
        kws['rotate'] = True
        kws['axis'] = 0
        p = mat._DendrogramPlotter(self.df_norm.T, **kws)

        npt.assert_array_equal(p.reordered_ind, self.x_norm_leaves)

    def test_custom_linkage(self):
        """A user-supplied linkage matrix is used as given."""
        kws = self.default_kws.copy()

        # Linkage over the rows here, unlike the column linkage of the
        # class-level fixture.
        try:
            import fastcluster

            linkage = fastcluster.linkage_vector(self.x_norm, method='single',
                                                 metric='euclidean')
        except ImportError:
            d = distance.pdist(self.x_norm, metric='euclidean')
            linkage = hierarchy.linkage(d, method='single')
        dendrogram = hierarchy.dendrogram(linkage, no_plot=True,
                                          color_list=['k'],
                                          color_threshold=-np.inf)
        kws['linkage'] = linkage
        p = mat._DendrogramPlotter(self.df_norm, **kws)

        npt.assert_array_equal(p.linkage, linkage)
        nt.assert_dict_equal(p.dendrogram, dendrogram)

    def test_label_false(self):
        """label=False yields no ticks, tick labels or axis labels."""
        kws = self.default_kws.copy()
        kws['label'] = False
        p = mat._DendrogramPlotter(self.df_norm, **kws)
        nt.assert_equal(p.xticks, [])
        nt.assert_equal(p.yticks, [])
        nt.assert_equal(p.xticklabels, [])
        nt.assert_equal(p.yticklabels, [])
        nt.assert_equal(p.xlabel, "")
        nt.assert_equal(p.ylabel, "")

    def test_linkage_scipy(self):
        """The scipy code path matches a direct pdist + linkage computation."""
        p = mat._DendrogramPlotter(self.x_norm, **self.default_kws)

        scipy_linkage = p._calculate_linkage_scipy()

        from scipy.spatial import distance
        from scipy.cluster import hierarchy

        dists = distance.pdist(self.x_norm.T,
                               metric=self.default_kws['metric'])
        linkage = hierarchy.linkage(dists, method=self.default_kws['method'])

        npt.assert_array_equal(scipy_linkage, linkage)

    @skipif(_no_fastcluster)
    def test_fastcluster_other_method(self):
        """With fastcluster, method='average' matches fastcluster.linkage."""
        import fastcluster

        kws = self.default_kws.copy()
        kws['method'] = 'average'
        linkage = fastcluster.linkage(self.x_norm.T, method='average',
                                      metric='euclidean')
        p = mat._DendrogramPlotter(self.x_norm, **kws)
        npt.assert_array_equal(p.linkage, linkage)

    @skipif(_no_fastcluster)
    def test_fastcluster_non_euclidean(self):
        """With fastcluster, a cosine metric matches fastcluster.linkage."""
        import fastcluster

        kws = self.default_kws.copy()
        kws['metric'] = 'cosine'
        kws['method'] = 'average'
        linkage = fastcluster.linkage(self.x_norm.T, method=kws['method'],
                                      metric=kws['metric'])
        p = mat._DendrogramPlotter(self.x_norm, **kws)
        npt.assert_array_equal(p.linkage, linkage)

    def test_dendrogram_plot(self):
        """The plot spans 10 units per leaf and draws one path per link."""
        d = mat.dendrogram(self.x_norm, **self.default_kws)

        ax = plt.gca()
        xlim = ax.get_xlim()
        # 10 comes from _plot_dendrogram in scipy.cluster.hierarchy
        xmax = len(d.reordered_ind) * 10

        nt.assert_equal(xlim[0], 0)
        nt.assert_equal(xlim[1], xmax)

        nt.assert_equal(len(ax.collections[0].get_paths()),
                        len(d.dependent_coord))

        plt.close('all')

    def test_dendrogram_rotate(self):
        """A rotated dendrogram uses an inverted y axis of the same extent."""
        kws = self.default_kws.copy()
        kws['rotate'] = True

        d = mat.dendrogram(self.x_norm, **kws)

        ax = plt.gca()
        ylim = ax.get_ylim()

        # 10 comes from _plot_dendrogram in scipy.cluster.hierarchy
        ymax = len(d.reordered_ind) * 10

        # Since y axis is inverted, ylim is (80, 0)
        # and therefore not (0, 80) as usual:
        nt.assert_equal(ylim[1], 0)
        nt.assert_equal(ylim[0], ymax)
        plt.close('all')

    def test_dendrogram_ticklabel_rotation(self):
        """Tick labels are rotated only when long labels need the room."""
        # Short labels: no rotation expected.
        f, ax = plt.subplots(figsize=(2, 2))
        mat.dendrogram(self.df_norm, ax=ax)

        for t in ax.get_xticklabels():
            nt.assert_equal(t.get_rotation(), 0)

        plt.close(f)

        # Labels made ten times longer: x tick labels are expected at 90.
        df = self.df_norm.copy()
        df.columns = [str(c) * 10 for c in df.columns]
        df.index = [i * 10 for i in df.index]

        f, ax = plt.subplots(figsize=(2, 2))
        mat.dendrogram(df, ax=ax)

        for t in ax.get_xticklabels():
            nt.assert_equal(t.get_rotation(), 90)

        plt.close(f)

        # Rotated dendrogram: the labels sit on the y axis, unrotated.
        # ax is passed explicitly so the labels inspected below belong to the
        # axes that was drawn on, rather than to whichever axes is current.
        f, ax = plt.subplots(figsize=(2, 2))
        mat.dendrogram(df.T, axis=0, rotate=True, ax=ax)
        for t in ax.get_yticklabels():
            nt.assert_equal(t.get_rotation(), 0)
        plt.close(f)
def search_engine(self,
                  raw_df,
                  centroids,
                  threshold=0.5,
                  min_sim=0.9,
                  model_name='model_fast_text_sg_40',
                  prod_id_column='product_id',
                  column_name_db='word_vector',
                  column_name_data='word_vector',
                  pre_computed_word_vectors=False,
                  min_amount_analogous=3,
                  clustering_algorithm='agglomerative'):
    '''
    Match ads against a reference DB and cluster the ads left unmatched.

    Ads are labelled by ``search_engine_fasttext``; those it leaves with
    ``product_id == -1`` are clustered into candidate new products.

    raw_df is the unlabeled data
    centroids is the reference DB (must have a ``product_id`` column)
    threshold is the hierarchical clustering threshold distance
    min_sim is the minimum similarity in order to assign an ad_title to a
        product_id tag
    model_name, column_name_db, column_name_data and
        pre_computed_word_vectors are forwarded to ``search_engine_fasttext``
    prod_id_column is accepted for compatibility; the body does not read it
    min_amount_analogous is the minimum number of unlabeled ads needed to
        attempt clustering, and the minimum ``counter`` a cluster needs to be
        kept as a new product
    clustering_algorithm is 'agglomerative' or 'community'

    Sets ``self.new_existing_products``, ``self.new_products`` and
    ``self.dumped_products`` and returns the labelled frame produced by the
    search, before any date handling.
    '''
    product_columns = [
        'product_id', 'starting_date', 'last_modified_date', 'category_id',
        'ad_title', 'counter', 'word_vector', 'ad_id'
    ]
    today = datetime.datetime.today().strftime('%Y-%m-%d')

    last_product_id = max(centroids.product_id)
    if 'category_id' not in raw_df.columns:
        raw_df = raw_df.assign(category_id=0)

    test = search_engine_fasttext(
        raw_df,
        centroids,
        min_sim=min_sim,
        model_name=model_name,
        column_name_db=column_name_db,
        column_name_data=column_name_data,
        pre_computed_word_vectors=pre_computed_word_vectors)
    test = test.rename(columns={'product_id_fasttext': 'product_id'})
    test = test[[
        'date_min', 'date_max', 'category_id', 'ad_title', 'word_vector',
        'ad_id', 'product_id'
    ]]
    # The returned value: later steps rebind `test` to new frames.
    data = test

    is_unlabeled = test.product_id == -1
    print('{} unlabeled ads'.format(len(test[is_unlabeled])))

    # Only a missing date column is tolerated here; any other failure now
    # propagates instead of being swallowed by a bare except.
    try:
        test = test.assign(last_modified_date=test.date_max,
                           starting_date=test.date_min)
    except (AttributeError, KeyError):
        test = test.assign(last_modified_date=today,
                           starting_date='2000-01-01')

    unknown_products = test[is_unlabeled]
    try:
        unknown_products = unknown_products.assign(
            starting_date=unknown_products.date_min)
    except (AttributeError, KeyError):
        print(test.product_id)
        unknown_products = unknown_products.assign(starting_date=today)

    # errors='ignore' drops the columns when present and is a no-op otherwise.
    test = test.drop(['date_max', 'date_min'], axis=1, errors='ignore')
    unknown_products = unknown_products.drop(['date_max', 'date_min'],
                                             axis=1, errors='ignore')

    print('groupying new products')
    self.new_existing_products = self.group_by_product(
        test[test.product_id != -1].assign(counter=1)[product_columns])

    if len(unknown_products) >= min_amount_analogous:
        # NOTE(review): any other clustering_algorithm value skips both
        # branches, so every unlabeled ad keeps product_id -1 and is grouped
        # into a single product -- confirm whether that should be rejected.
        if clustering_algorithm == 'agglomerative':
            unknown_data = np.array(
                [i[0][0] for i in unknown_products.word_vector])
            cluster_ = fastcluster.linkage_vector(unknown_data,
                                                  method='ward')
            cluster_labels = Cluster.hierarchy.fcluster(cluster_, threshold)
            unknown_products = unknown_products.assign(
                product_id=cluster_labels)
        elif clustering_algorithm == 'community':
            unknown_products = self.graph_communities(
                unknown_products,
                min_value_=0.8,
                topn_=400,
                k1=50,
                expected_density=0.1,
                graph_communities_df=None)

        title_clusters_joinned = self.group_by_product(
            unknown_products, prod_id_column='product_id')
        is_large = title_clusters_joinned.counter >= min_amount_analogous
        new_products = title_clusters_joinned[is_large]
        dumped_products = title_clusters_joinned[
            title_clusters_joinned.counter < min_amount_analogous]
        # Shift the cluster labels past the ids already in the reference DB.
        new_products = new_products.assign(
            product_id=new_products.product_id.apply(
                lambda x: x + last_product_id + 1))
        print(title_clusters_joinned)
    else:
        new_products = pd.DataFrame(columns=product_columns)
        # Reset as well, so that a previous call's dumped ads are not
        # reported again (or the attribute left undefined).
        dumped_products = pd.DataFrame(columns=product_columns)

    self.new_products = new_products
    self.dumped_products = dumped_products

    print(
        str(len(self.new_existing_products)) +
        ' products that already exist in data base')
    print(str(len(self.new_products)) + ' new products found')
    print(str(len(self.dumped_products)) + ' ads dumped')

    # Renumber each result frame 0..len-1.
    self.new_existing_products = self.new_existing_products.set_index(
        np.arange(len(self.new_existing_products)))
    self.new_products = self.new_products.set_index(
        np.arange(len(self.new_products)))
    self.dumped_products = self.dumped_products.set_index(
        np.arange(len(self.dumped_products)))
    return data
# Hierarchical agglomerative clustering
# The fastcluster package provides an implementation, and its website
# documents the complexity analysis and a comparison with other packages:
# http://danifold.net/fastcluster.html
import fastcluster

# linkage_vector is the memory-saving entry point, but it isn't magic:
# always measure before running it on a larger dataset.
from timeit import default_timer as timer

# Time the clustering of the first 1000 and the first 10000 rows, using the
# first 7 columns, and print the elapsed seconds for each.
for n_rows in (1000, 10000):
    start = timer()
    fastcluster.linkage_vector(mat_numeric[0:n_rows,:7])
    end = timer()
    print(end - start)

# The run time does not scale linearly, so continue with a random sample of
# 100 row indices (drawn with replacement, fixed seed for reproducibility).
np.random.seed(1)
samples = np.random.randint(0,df.shape[0], 100)
mat_sample = mat_numeric[samples, :]
def test_all(n, dim):
    """Check ``fc.linkage_vector`` on random data for every supported metric.

    For each metric a random ``n`` x ``dim`` point cloud is clustered with
    ``fc.linkage_vector`` and the result is handed to ``test`` together with
    the condensed distance matrix computed by ``pdist``.

    Parameters
    ----------
    n : int
        Number of random points.
    dim : int
        Dimension of each point.

    Raises
    ------
    AssertionError
        If ``linkage_vector`` modifies its input array.
    """
    method = 'single'

    # metrics for boolean vectors
    # randint(0, 2) replaces the deprecated random_integers(0, 1); builtin
    # bool replaces np.bool, which is missing from NumPy 1.24-1.26.
    pcd = np.random.randint(0, 2, size=(n, dim)).astype(bool)
    pcd2 = pcd.copy()
    for metric in (
            'hamming',
            'jaccard',
            'yule',
            'matching',
            'dice',
            #'kulsinski',
            'rogerstanimoto',
            #'sokalmichener',
            # exclude, bug in older Scipy versions
            # http://projects.scipy.org/scipy/ticket/1486
            'russellrao',
            'sokalsneath',
            #'kulsinski'
            # exclude, bug in older Scipy versions
            # http://projects.scipy.org/scipy/ticket/1484
    ):
        sys.stdout.write("Metric: " + metric + "...")
        D = pdist(pcd, metric)
        Z2 = fc.linkage_vector(pcd, method, metric)
        # linkage_vector must leave its input untouched.
        if np.any(pcd2 != pcd):
            raise AssertionError('Input array was corrupted.', pcd)
        test(Z2, method, D)

    # metrics for real vectors
    # Integer coordinates in [-bound, bound]; int() performs the truncation
    # that random_integers applied implicitly to the float square root.
    bound = int(math.sqrt(n))
    pcd = np.random.randint(-bound, bound + 1, (n, dim))
    for metric in [
            'euclidean',
            'sqeuclidean',
            'cityblock',
            'chebychev',
            'minkowski',
            'cosine',
            'correlation',
            'hamming',
            'jaccard',
            #'canberra',
            # exclude, bug in older Scipy versions
            # http://projects.scipy.org/scipy/ticket/1430
            'braycurtis',
            'seuclidean',
            'mahalanobis',
            'user'
    ]:
        sys.stdout.write("Metric: " + metric + "...")
        if metric == 'minkowski':
            p = np.random.uniform(1., 10.)
            sys.stdout.write("p: " + str(p) + "...")
            # p must be a keyword: current SciPy accepts only X and metric
            # positionally.
            D = pdist(pcd, metric, p=p)
            Z2 = fc.linkage_vector(pcd, method, metric, p)
        elif metric == 'user':
            # Euclidean metric as a user function
            fn = (lambda u, v: np.sqrt(((u - v) * (u - v).T).sum()))
            D = pdist(pcd, fn)
            Z2 = fc.linkage_vector(pcd, method, fn)
        else:
            D = pdist(pcd, metric)
            Z2 = fc.linkage_vector(pcd, method, metric)
        test(Z2, method, D)

    # Remaining linkage methods, checked against default pdist distances.
    D = pdist(pcd)
    for method in ['ward', 'centroid', 'median']:
        Z2 = fc.linkage_vector(pcd, method)
        test(Z2, method, D)
class TestClustermap(object):
    """Tests for ``mat.ClusterGrid`` and ``mat.clustermap``.

    The class body builds a fixed 4 x 8 data set, its reference column
    linkage/dendrogram, and the default keyword sets shared by the tests.
    """

    # Seed derived from the test name so the fixture is reproducible.
    rs = np.random.RandomState(sum(map(ord, "clustermap")))

    x_norm = rs.randn(4, 8) + np.arange(8)
    x_norm = (x_norm.T + np.arange(4)).T
    letters = pd.Series(["A", "B", "C", "D", "E", "F", "G", "H"],
                        name="letters")

    df_norm = pd.DataFrame(x_norm, columns=letters)

    # Reference linkage over the columns: fastcluster when it is installed,
    # otherwise the scipy equivalent.
    try:
        import fastcluster

        x_norm_linkage = fastcluster.linkage_vector(x_norm.T,
                                                    metric='euclidean',
                                                    method='single')
    except ImportError:
        x_norm_distances = distance.pdist(x_norm.T, metric='euclidean')
        x_norm_linkage = hierarchy.linkage(x_norm_distances, method='single')
    x_norm_dendrogram = hierarchy.dendrogram(x_norm_linkage, no_plot=True,
                                             color_list=['k'],
                                             color_threshold=-np.inf)
    x_norm_leaves = x_norm_dendrogram['leaves']
    df_norm_leaves = np.asarray(df_norm.columns[x_norm_leaves])

    default_kws = dict(pivot_kws=None, z_score=None, standard_scale=None,
                       figsize=None, row_colors=None, col_colors=None)

    default_plot_kws = dict(metric='euclidean', method='average',
                            colorbar_kws=None,
                            row_cluster=True, col_cluster=True,
                            row_linkage=None, col_linkage=None)

    # One color per row / per column of the fixture frame.
    row_colors = color_palette('Set2', df_norm.shape[0])
    col_colors = color_palette('Dark2', df_norm.shape[1])

    def test_ndarray_input(self):
        """An ndarray is wrapped in a frame; no color axes are created."""
        cm = mat.ClusterGrid(self.x_norm, **self.default_kws)
        pdt.assert_frame_equal(cm.data, pd.DataFrame(self.x_norm))
        nt.assert_equal(len(cm.fig.axes), 4)
        nt.assert_equal(cm.ax_row_colors, None)
        nt.assert_equal(cm.ax_col_colors, None)

        plt.close('all')

    def test_df_input(self):
        """A DataFrame is stored unchanged as ``data``."""
        cm = mat.ClusterGrid(self.df_norm, **self.default_kws)
        pdt.assert_frame_equal(cm.data, self.df_norm)

        plt.close('all')

    def test_corr_df_input(self):
        """Clustering a correlation matrix keeps the ones on its diagonal."""
        df = self.df_norm.corr()
        cg = mat.ClusterGrid(df, **self.default_kws)
        cg.plot(**self.default_plot_kws)
        diag = cg.data2d.values[np.diag_indices_from(cg.data2d)]
        npt.assert_array_equal(diag, np.ones(cg.data2d.shape[0]))

        plt.close('all')

    def test_pivot_input(self):
        """Long-form data is pivoted back to the wide frame via pivot_kws."""
        df_norm = self.df_norm.copy()
        df_norm.index.name = 'numbers'
        df_long = pd.melt(df_norm.reset_index(), var_name='letters',
                          id_vars='numbers')
        kws = self.default_kws.copy()
        kws['pivot_kws'] = dict(index='numbers', columns='letters',
                                values='value')
        cm = mat.ClusterGrid(df_long, **kws)

        pdt.assert_frame_equal(cm.data2d, df_norm)

        plt.close('all')

    def test_colors_input(self):
        """Row/column colors are stored and add two axes to the figure."""
        kws = self.default_kws.copy()

        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.ClusterGrid(self.df_norm, **kws)
        npt.assert_array_equal(cm.row_colors, self.row_colors)
        npt.assert_array_equal(cm.col_colors, self.col_colors)

        nt.assert_equal(len(cm.fig.axes), 6)

        plt.close('all')

    def test_nested_colors_input(self):
        """Nested color lists are stored as given, with the same axes count."""
        kws = self.default_kws.copy()

        row_colors = [self.row_colors, self.row_colors]
        col_colors = [self.col_colors, self.col_colors]
        kws['row_colors'] = row_colors
        kws['col_colors'] = col_colors

        cm = mat.ClusterGrid(self.df_norm, **kws)
        npt.assert_array_equal(cm.row_colors, row_colors)
        npt.assert_array_equal(cm.col_colors, col_colors)

        nt.assert_equal(len(cm.fig.axes), 6)

        plt.close('all')

    def test_colors_input_custom_cmap(self):
        """A custom heatmap cmap does not affect the stored colors."""
        kws = self.default_kws.copy()

        kws['cmap'] = mpl.cm.PRGn
        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.clustermap(self.df_norm, **kws)
        npt.assert_array_equal(cm.row_colors, self.row_colors)
        npt.assert_array_equal(cm.col_colors, self.col_colors)

        nt.assert_equal(len(cm.fig.axes), 6)

        plt.close('all')

    def test_z_score(self):
        """z_score=1 standardizes each column."""
        df = self.df_norm.copy()
        df = (df - df.mean()) / df.std()
        kws = self.default_kws.copy()
        kws['z_score'] = 1

        cm = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cm.data2d, df)

        plt.close('all')

    def test_z_score_axis0(self):
        """z_score=0 standardizes each row."""
        df = self.df_norm.copy()
        df = df.T
        df = (df - df.mean()) / df.std()
        df = df.T
        kws = self.default_kws.copy()
        kws['z_score'] = 0

        cm = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cm.data2d, df)

        plt.close('all')

    def test_standard_scale(self):
        """standard_scale=1 rescales each column to the range [0, 1]."""
        df = self.df_norm.copy()
        df = (df - df.min()) / (df.max() - df.min())
        kws = self.default_kws.copy()
        kws['standard_scale'] = 1

        cm = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cm.data2d, df)

        plt.close('all')

    def test_standard_scale_axis0(self):
        """standard_scale=0 rescales each row to the range [0, 1]."""
        df = self.df_norm.copy()
        df = df.T
        df = (df - df.min()) / (df.max() - df.min())
        df = df.T
        kws = self.default_kws.copy()
        kws['standard_scale'] = 0

        cm = mat.ClusterGrid(self.df_norm, **kws)
        pdt.assert_frame_equal(cm.data2d, df)

        plt.close('all')

    def test_z_score_standard_scale(self):
        """Requesting both z_score and standard_scale raises ValueError."""
        kws = self.default_kws.copy()
        kws['z_score'] = True
        kws['standard_scale'] = True
        with nt.assert_raises(ValueError):
            mat.ClusterGrid(self.df_norm, **kws)

        plt.close('all')

    def test_color_list_to_matrix_and_cmap(self):
        """A flat color list becomes a reordered column matrix plus a cmap."""
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            self.col_colors, self.x_norm_leaves)

        # Rebuild the expected result: colors map to their position in the
        # set of unique colors, reordered by the dendrogram leaves.
        colors_set = set(self.col_colors)
        col_to_value = dict((col, i) for i, col in enumerate(colors_set))
        matrix_test = np.array([col_to_value[col] for col in
                                self.col_colors])[self.x_norm_leaves]
        shape = len(self.col_colors), 1
        matrix_test = matrix_test.reshape(shape)
        cmap_test = mpl.colors.ListedColormap(colors_set)
        npt.assert_array_equal(matrix, matrix_test)
        npt.assert_array_equal(cmap.colors, cmap_test.colors)

        plt.close('all')

    def test_nested_color_list_to_matrix_and_cmap(self):
        """Nested color lists become one matrix column per list."""
        colors = [self.col_colors, self.col_colors]
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            colors, self.x_norm_leaves)

        all_colors = set(itertools.chain(*colors))
        color_to_value = dict((col, i) for i, col in enumerate(all_colors))
        matrix_test = np.array(
            [color_to_value[c] for color in colors for c in color])
        shape = len(colors), len(colors[0])
        matrix_test = matrix_test.reshape(shape)
        matrix_test = matrix_test[:, self.x_norm_leaves]
        matrix_test = matrix_test.T

        cmap_test = mpl.colors.ListedColormap(all_colors)
        npt.assert_array_equal(matrix, matrix_test)
        npt.assert_array_equal(cmap.colors, cmap_test.colors)

        plt.close('all')

    def test_color_list_to_matrix_and_cmap_axis1(self):
        """With axis=1 the color matrix is a single row."""
        matrix, cmap = mat.ClusterGrid.color_list_to_matrix_and_cmap(
            self.col_colors, self.x_norm_leaves, axis=1)

        colors_set = set(self.col_colors)
        col_to_value = dict((col, i) for i, col in enumerate(colors_set))
        matrix_test = np.array([col_to_value[col] for col in
                                self.col_colors])[self.x_norm_leaves]
        shape = 1, len(self.col_colors)
        matrix_test = matrix_test.reshape(shape)
        cmap_test = mpl.colors.ListedColormap(colors_set)
        npt.assert_array_equal(matrix, matrix_test)
        npt.assert_array_equal(cmap.colors, cmap_test.colors)

        plt.close('all')

    def test_savefig(self):
        """A plotted grid can be written to a file object as PNG."""
        # Not sure if this is the right way to test....
        cm = mat.ClusterGrid(self.df_norm, **self.default_kws)
        cm.plot(**self.default_plot_kws)
        # The context manager closes (and removes) the temporary file.
        with tempfile.NamedTemporaryFile() as tmp:
            cm.savefig(tmp, format='png')

        plt.close('all')

    def test_plot_dendrograms(self):
        """Both dendrograms are drawn and data2d follows their leaf order."""
        cm = mat.clustermap(self.df_norm, **self.default_kws)

        nt.assert_equal(len(cm.ax_row_dendrogram.collections[0].get_paths()),
                        len(cm.dendrogram_row.independent_coord))
        nt.assert_equal(len(cm.ax_col_dendrogram.collections[0].get_paths()),
                        len(cm.dendrogram_col.independent_coord))
        data2d = self.df_norm.iloc[cm.dendrogram_row.reordered_ind,
                                   cm.dendrogram_col.reordered_ind]
        pdt.assert_frame_equal(cm.data2d, data2d)

        plt.close('all')

    def test_cluster_false(self):
        """Without clustering the dendrogram axes are empty, data unchanged."""
        kws = self.default_kws.copy()
        kws['row_cluster'] = False
        kws['col_cluster'] = False

        cm = mat.clustermap(self.df_norm, **kws)
        nt.assert_equal(len(cm.ax_row_dendrogram.lines), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.lines), 0)

        nt.assert_equal(len(cm.ax_row_dendrogram.get_xticks()), 0)
        nt.assert_equal(len(cm.ax_row_dendrogram.get_yticks()), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.get_xticks()), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.get_yticks()), 0)

        pdt.assert_frame_equal(cm.data2d, self.df_norm)

        plt.close('all')

    def test_row_col_colors(self):
        """Each color axes holds exactly one collection."""
        kws = self.default_kws.copy()
        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.clustermap(self.df_norm, **kws)

        nt.assert_equal(len(cm.ax_row_colors.collections), 1)
        nt.assert_equal(len(cm.ax_col_colors.collections), 1)

        plt.close('all')

    def test_cluster_false_row_col_colors(self):
        """Color axes are still drawn when clustering is switched off."""
        kws = self.default_kws.copy()
        kws['row_cluster'] = False
        kws['col_cluster'] = False
        kws['row_colors'] = self.row_colors
        kws['col_colors'] = self.col_colors

        cm = mat.clustermap(self.df_norm, **kws)
        nt.assert_equal(len(cm.ax_row_dendrogram.lines), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.lines), 0)

        nt.assert_equal(len(cm.ax_row_dendrogram.get_xticks()), 0)
        nt.assert_equal(len(cm.ax_row_dendrogram.get_yticks()), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.get_xticks()), 0)
        nt.assert_equal(len(cm.ax_col_dendrogram.get_yticks()), 0)
        nt.assert_equal(len(cm.ax_row_colors.collections), 1)
        nt.assert_equal(len(cm.ax_col_colors.collections), 1)

        pdt.assert_frame_equal(cm.data2d, self.df_norm)

        plt.close('all')

    def test_mask_reorganization(self):
        """The mask is reordered in step with the clustered data."""
        kws = self.default_kws.copy()
        kws["mask"] = self.df_norm > 0

        g = mat.clustermap(self.df_norm, **kws)
        npt.assert_array_equal(g.data2d.index, g.mask.index)
        npt.assert_array_equal(g.data2d.columns, g.mask.columns)

        npt.assert_array_equal(g.mask.index,
                               self.df_norm.index[
                                   g.dendrogram_row.reordered_ind])
        npt.assert_array_equal(g.mask.columns,
                               self.df_norm.columns[
                                   g.dendrogram_col.reordered_ind])

        plt.close("all")

    def test_ticklabel_reorganization(self):
        """Custom tick labels are reordered in step with the clustered data."""
        kws = self.default_kws.copy()
        xtl = np.arange(self.df_norm.shape[1])
        kws["xticklabels"] = list(xtl)
        # .loc is the label-based equivalent of the removed Series.ix; on
        # this integer index the slice end is included, exactly as before.
        ytl = self.letters.loc[:self.df_norm.shape[0]]
        kws["yticklabels"] = ytl

        g = mat.clustermap(self.df_norm, **kws)

        xtl_actual = [t.get_text() for t in g.ax_heatmap.get_xticklabels()]
        ytl_actual = [t.get_text() for t in g.ax_heatmap.get_yticklabels()]

        xtl_want = xtl[g.dendrogram_col.reordered_ind].astype("<U1")
        # The y labels are expected in reverse dendrogram order.
        ytl_want = ytl[g.dendrogram_row.reordered_ind].astype("<U1")[::-1]

        npt.assert_array_equal(xtl_actual, xtl_want)
        npt.assert_array_equal(ytl_actual, ytl_want)

        plt.close("all")
def _linkage_vector_or_skip(pcd, method, metric, D):
    """Run ``fc.linkage_vector`` and tolerate a justified NaN report.

    Returns:
        The result of ``fc.linkage_vector(pcd, method, metric)``, or ``None``
        when the call raised ``FloatingPointError`` and the reference
        dissimilarities ``D`` really contain NaN (the case is then skipped).

    Raises:
        AssertionError: if ``FloatingPointError`` was raised although ``D``
            contains no NaN.
    """
    try:
        return fc.linkage_vector(pcd, method, metric)
    except FloatingPointError:
        # If linkage_vector reported a NaN dissimilarity value,
        # check whether the distance matrix really contains NaN.
        if np.any(np.isnan(D)):
            print("Skip this test: NaN dissimilarity value.")
            return None
        raise AssertionError('"linkage_vector" erroneously reported NaN.')


def test_all(n, dim):
    """Check ``fc.linkage_vector`` against ``pdist`` reference distances.

    Random data with ``n`` rows and ``dim`` columns is drawn from
    ``np.random``: first a boolean array, then an integer array. For every
    metric, the output of ``fc.linkage_vector`` and the dissimilarities
    computed by ``pdist`` are handed to ``check``. The metric sweeps use
    single linkage; afterwards 'ward', 'centroid' and 'median' are run on
    the integer data with the default metric of both functions.

    Args:
        n: number of random points (rows of the data array).
        dim: number of coordinates per point (columns of the data array).

    Raises:
        AssertionError: if ``fc.linkage_vector`` modified the boolean input
            array, or reported a NaN dissimilarity although the reference
            distances contain none.
    """
    method = 'single'

    # metrics for boolean vectors
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    pcd = np.random.randint(0, 2, size=(n, dim), dtype=bool)
    pcd2 = pcd.copy()  # pristine copy to detect in-place modification

    for metric in ('hamming', 'jaccard', 'yule', 'matching', 'dice',
                   'rogerstanimoto',
                   #'sokalmichener',
                   # exclude, bug in Scipy
                   # http://projects.scipy.org/scipy/ticket/1486
                   'russellrao', 'sokalsneath',
                   #'kulsinski'
                   # exclude, bug in Scipy
                   # http://projects.scipy.org/scipy/ticket/1484
                   ):
        sys.stdout.write("Metric: " + metric + "...")
        D = pdist(pcd, metric=metric)
        D = correct_for_zero_vectors(D, pcd, metric)

        Z2 = _linkage_vector_or_skip(pcd, method, metric, D)
        if Z2 is None:
            continue

        if np.any(pcd2 != pcd):
            raise AssertionError('Input array was corrupted.', pcd)
        check(Z2, method, D)

    # metrics for real vectors
    # Make the integer bound explicit instead of handing floats to randint;
    # the sampled range is [-bound, bound].
    bound = int(math.sqrt(n))
    pcd = np.random.randint(-bound, bound + 1, (n, dim))

    def euclidean_user(u, v):
        # Euclidean metric as a user function
        diff = u - v
        return np.sqrt((diff * diff).sum())

    for metric in ['euclidean', 'sqeuclidean', 'cityblock', 'chebychev',
                   'minkowski', 'cosine', 'correlation', 'hamming',
                   'jaccard',
                   'canberra',
                   # canberra: see bug in older Scipy versions
                   # http://projects.scipy.org/scipy/ticket/1430
                   'braycurtis', 'seuclidean', 'mahalanobis', 'user']:
        sys.stdout.write("Metric: " + metric + "...")
        if metric == 'minkowski':
            # Random exponent for the Minkowski metric, drawn per run.
            p = np.random.uniform(1., 10.)
            sys.stdout.write("p: " + str(p) + "...")
            D = pdist(pcd, metric=metric, p=p)
            Z2 = fc.linkage_vector(pcd, method, metric, p)
        elif metric == 'user':
            D = pdist(pcd, metric=euclidean_user)
            Z2 = fc.linkage_vector(pcd, method, euclidean_user)
        else:
            D = pdist(pcd, metric=metric)
            D = correct_for_zero_vectors(D, pcd, metric)
            Z2 = _linkage_vector_or_skip(pcd, method, metric, D)
            if Z2 is None:
                continue

        check(Z2, method, D)

    # Remaining linkage methods, compared against the default pdist metric.
    D = pdist(pcd)
    for method in ['ward', 'centroid', 'median']:
        Z2 = fc.linkage_vector(pcd, method)
        check(Z2, method, D)