def birch_algo(X, threshold=1.7, clustering=None):
    birch = Birch(threshold=threshold, n_clusters=clustering)
    t = time()
    birch.fit(X)
    time_ = time() - t
    labels = birch.labels_
    centroids = birch.subcluster_centers_
    n_clusters = np.unique(labels).size
    # report timing and return the results (the original computed them but
    # discarded them; the variant of this function below returns the same triple)
    print("The number of clusters is : %d (fit took %.3fs)" % (n_clusters, time_))
    return labels, centroids, n_clusters
def birchcluster(X):
    brc = Birch()
    brc.fit(X)
    # Plot result
    labels = brc.labels_
    centroids = brc.subcluster_centers_
    n_clusters = np.unique(labels).size
    print("n_clusters : %d" % n_clusters)
    return labels
def birch_algo(X, threshold=1.7, clustering=None):
    birch = Birch(threshold=threshold, n_clusters=clustering)
    birch.fit(X)
    labels = birch.labels_
    centroids = birch.subcluster_centers_
    labels_unique = np.unique(labels)
    n_clusters = labels_unique.size
    print("The number of clusters is : %d" % n_clusters)
    return labels, centroids, n_clusters
def birch(x, n_clusters=None, threshold=0.5, branching_factor=5):
    birch_model = Birch(
        threshold=threshold,
        n_clusters=n_clusters,
        branching_factor=branching_factor
    )
    birch_model.fit(x)
    centroids = birch_model.subcluster_centers_
    c = birch_model.labels_
    k = len(centroids)
    return birch_model, (centroids, c, k)
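# Illustrative usage sketch for the birch() wrapper above -- not part of the
# original source. make_blobs and Birch are standard scikit-learn APIs; the
# demo data and parameter values are assumptions made up for the example.
import numpy as np
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=200, centers=4, cluster_std=0.5, random_state=0)
model, (centroids, c, k) = birch(X_demo, n_clusters=4)
print("labels:", np.unique(c), "| subcluster centroids:", k)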
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    # t = time.clock()
    global quota_for_each_cluster
    global brc
    global v
    global quota
    global select
    quota = 10000
    result_arr = QLINK_URLS + UNKNOWN_URLS
    for i, url in enumerate(result_arr):
        result_arr[i] = urlparse.urlparse(unquote(url.strip()))
    # l_dict =
    v = DictVectorizer(sparse=False)
    data = v.fit_transform(extract_features(result_arr))
    ind_list = []
    ind_list_data = []
    low_bound = 8
    for col in xrange(data.shape[1]):
        if np.sum(data[:, col]) > low_bound:
            ind_list.append(1)
            ind_list_data.append(col)
        else:
            ind_list.append(0)
    v = v.restrict(ind_list)
    data = data[:, ind_list_data]
    # if start_url[0].find("wikipedia") != -1:
    #     out_data("som_data_wiki/qlink.tfxidf", data[:500], start_url[:500])
    #     out_data("som_data_wiki/notqlink.tfxidf", data[500:], start_url[500:])
    #     out_data("som_data_wiki/data.tfxidf", data, start_url)
    #     out_template("som_data_wiki/data_features.tv", v.get_feature_names(), len(data))
    #     out_template("som_data_wiki/qlink_features.tv", v.get_feature_names(), len(data) / 2)
    #     out_template("som_data_wiki/notqlink_features.tv", v.get_feature_names(), len(data) / 2)
    #     return 0
    best_cou_clusters = data.shape[1]
    # k_means = KMeans(n_clusters=best_cou_clusters, init='random')
    # clust = k_means.fit_predict(data)
    brc = Birch(branching_factor=50, n_clusters=best_cou_clusters,
                threshold=0.2, compute_labels=True)
    clust = brc.fit_predict(data)
    select = SelectKBest(k=min(data.shape[1], 30))
    data = select.fit_transform(data, clust)
    clust = brc.fit_predict(data)
    # print data.shape
    quota_for_each_cluster = np.zeros(best_cou_clusters)
    clust_qlink = list(clust[:500])
    for i in xrange(best_cou_clusters):
        quota_for_each_cluster[i] = clust_qlink.count(i) / 500.0 * QUOTA
    quota_for_each_cluster *= 2.0
def birch_cluster(init_ds, ts_flag=False):
    '''
    Parameters:
        init_ds - 2D list of data
        ts_flag - boolean specifying if the first column of init_ds is a
                  datetime object or not
    Returns:
        2D list with additional column denoting which cluster said row falls into
    '''
    if ts_flag:
        init_ds = [i[1:] for i in init_ds]
    brc = Birch()
    labels = brc.fit_predict(init_ds)
    return [init_ds[i] + [labels[i]] for i in range(len(init_ds))]
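# Hypothetical call (not from the original repo) showing birch_cluster's
# contract: plain 2D lists in, the same rows with a trailing cluster id out.
demo_rows = [[0.0, 0.1], [0.1, 0.0], [5.0, 5.1], [5.1, 5.0], [9.0, 9.1], [9.1, 9.0]]
for labelled_row in birch_cluster(demo_rows):
    print(labelled_row)  # e.g. [0.0, 0.1, 0] -- the last element is the cluster label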
def main():
    # remove sub folders
    removeSubFolders(path + algorithm + '\\')
    for file in os.listdir(path):
        if file.endswith("-d.txt"):
            text_file = open(path + file, 'r')
            ar = text_file.readline().split(' ')
            ar.remove('\n')
            if len(ar) > 0:
                # print map(int, ar)
                row = map(int, ar)
                data.append(row)
                fileNames.append(file)
                # print(row)
    # create np array
    npData = np.array(data)
    n_samples, n_features = npData.shape
    brc = Birch(branching_factor=50, n_clusters=n_digits, threshold=0.5,
                compute_labels=True)
    # kmeans = KMeans(init='random', n_clusters=n_digits, n_init=500)
    brc.fit(npData)
    list1 = brc.labels_
    list2 = fileNames
    print brc.labels_
    print fileNames
    list1, list2 = zip(*sorted(zip(list1, list2)))
    print list1
    print list2
    '''
    k = 0
    lim = len(list1) - 1
    for i in range(0, n_digits):
        while list1[k] == i:
            # want to copy these into folders
            copychar(list1[k], list2[k])
            print list1[k], list2[k]
            k += 1
            if k == lim:
                break
    '''
    for i in range(0, len(list1)):
        print list1[i], list2[i]
        copychar(list1[i], list2[i])
def obtainCodebook(self, sampled_x, x):
    print 'Obtaining codebook using Birch from sklearn...'
    scaled_x_sampled = StandardScaler().fit_transform(sampled_x)
    scaled_x = StandardScaler().fit_transform(x)
    brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters,
                threshold=self.threshold, compute_labels=True)
    # obtain the codebook and the projections of the images on the codebook
    # (clusters of words); note fit() returns the fitted estimator itself
    codebook = brc.fit(scaled_x_sampled)
    clusters = brc.predict(scaled_x)
    print 'Clusters obtained.'
    return codebook, clusters
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    # url to obj
    qlinks = map(parse_url, QLINK_URLS)
    ulinks = map(parse_url, UNKNOWN_URLS)
    # check netloc
    # print qlinks[0].netloc

    # extract features
    start = time.time()
    qlinks_f = [dict(Counter(zip(*extract_features([link], 0))[0])) for link in qlinks]
    ulinks_f = [dict(Counter(zip(*extract_features([link], 0))[0])) for link in ulinks]
    # print time.time() - start
    # start = time.time()
    v = DictVectorizer(sparse=False)
    x_ = v.fit_transform(qlinks_f + ulinks_f)
    best_features = np.sum(x_, axis=0) > 5
    m_features = np.sum(best_features)
    v = v.restrict(best_features)
    x_ = x_[:, best_features]
    clustering = Birch(branching_factor=BIRCH_BRANCHING_FACTOR, n_clusters=m_features,
                       threshold=BIRCH_THRESHOLD, compute_labels=True)
    y_ = clustering.fit_predict(x_)
    sel = SelectKBest(k=min(m_features, KBEST_K))
    x = sel.fit_transform(x_, y_)
    y = clustering.fit_predict(x)
    q_or_u = np.repeat([1, 0], [len(QLINK_URLS), len(UNKNOWN_URLS)])
    q_ = np.vstack((y, q_or_u)).T
    quota = zip(np.unique(y),
                (np.array([np.sum(q_[q_[:, 0] == c, 1]) for c in np.unique(y)]) /
                 float(len(QLINK_URLS))) * QUOTA * 2)
    quota = {c: int(q) for c, q in quota}
    algos[qlinks[0].netloc] = {
        "clustering": clustering,
        "quota": quota,
        "sel": sel,
        "vect": v,
        "total_quota": QUOTA,
    }
def obtainClusters(self, hist):
    print 'Obtaining clusters using Birch from sklearn...'
    hist = np.array(hist)
    hist = hist.astype(float)
    scaled_vec = StandardScaler().fit_transform(hist)
    brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters,
                threshold=self.threshold, compute_labels=True)
    # obtain the codebook and the projections of the images on the codebook
    # (clusters of words)
    codebook = brc.fit(scaled_vec)
    clusters = brc.predict(scaled_vec)
    print 'Clusters obtained.'
    return clusters
def runBirch(K_cluster, cluster_input):
    # clustering by topic-probability vector of each category
    t0 = time()
    bri = Birch(n_clusters=K_cluster)
    bri.fit(cluster_input)
    print("done in %0.3fs" % (time() - t0))
    with open('result/birch_cluster_' + str(K_cluster) + '.txt', 'w') as f:
        f.write("cluster_centers\n")
        f.write(str(bri.subcluster_centers_))
        f.write("\n==========\n")
        f.write("labels (sequence of cluster # which input belongs to)\n")
        f.write(str(bri.labels_))
        f.write("\n==========\n")
        # note: these are the subcluster labels; the original header said "inertia"
        f.write("subcluster labels\n")
        f.write(str(bri.subcluster_labels_))
        f.write("\n==========\n")
    return bri.labels_
def split_birch(self, branching_factor, threshold):
    # Extract dataset from files
    dataset = [f.dataset for f in self.files]
    # Initialize classifier
    classifier = Birch(branching_factor=branching_factor, n_clusters=None,
                       threshold=threshold)
    classifier.fit(dataset)
    # Get index
    index = classifier.predict(dataset)
    count = max(index) + 1
    # Create new clusters
    clusters = [Cluster(self.directory, self.name + '-' + str(i))
                for i in range(count)]
    for i in range(len(self.files)):
        clusters[index[i]].add_file(self.files[i])
    return clusters
def test_birch_with_depot_calculation():
    points = points_from_file('tsps/berlin52.txt')
    matrix = load_matrix(points)
    X = [[p[1], p[2]] for p in points]
    est = Birch(n_clusters=3)
    est.fit(X)
    labels = est.labels_
    hl_matrix, clusters, G = load_matrices_from_labels(points, labels)
    depots, C = compute_depots(clusters, matrix, G, per_cluster=True)
    depots_actual, _ = compute_depots(clusters, matrix, G)
    cluster_optimal_cost, R, hl_route = clustered_tsp_solve(points, 3,
                                                            labels=labels,
                                                            depots=depots)
    cluster_optimal_cost += C
    print(depots_actual)
    print(R, C)
    for depot in depots_actual:
        for r in R:
            if r[1][0] == depot:
                for point in r[1]:
                    print(matrix.points[point])
                print('')
def build_model(df, cluster_type="kmeans", seed=1):
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        model = mixture.GMM(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        res = []
        for i in lda_res:
            # for now - do hard clustering, take the highest probability
            res.append(i.argmax())
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)

    df_array = np.array(df)
    # compute a centroid per cluster as the mean of its members; the original
    # referenced an undefined `clusters_centers` and used `model.labels_`,
    # which GMM/LDA models do not expose -- using `res` covers every branch
    dis_dict = {}
    for i in range(N_CLUSTERS):
        members = df_array[np.array(res) == i]
        dis_dict[i] = members.mean(axis=0) if len(members) else np.zeros(df_array.shape[1])
    all_dist = []
    for line_idx in range(len(df_array)):
        label = res[line_idx]
        dist = calc_distance(df_array[line_idx], dis_dict[label])
        all_dist.append(dist)
    df["distance_from_cluster"] = all_dist
    # clusters = model.labels_.tolist()
    # print("clusters are:", clusters)
    print(""">>>> model is: %s, # of clusters:%s, and %s"""
          % (cluster_type, N_CLUSTERS, Counter(res)))
    res = [str(i) for i in res]
    docs_clusters = zip(df.index, res)
    return docs_clusters
def birch(self, n_clusters, threshold=0.5, lsi_components=None):
    """ Perform Birch clustering

    Parameters
    ----------
    n_clusters : int
        number of clusters
    lsi_components : int
        apply LSA before the clustering algorithm
    threshold : float
        birch threshold
    """
    from sklearn.cluster import Birch
    pars = {'threshold': threshold}
    if lsi_components is None:
        raise ValueError("lsi_components=None detected. You must use LSI with "
                         "Birch clustering for scaling reasons.")
    lsi = _generate_lsi(lsi_components)
    km = Birch(n_clusters=n_clusters, threshold=threshold)
    return self._cluster_func(n_clusters, km, pars, lsi=lsi)
def get_model(data, index):
    index += 1
    if index == 1:
        classifier = KMeans(n_clusters=2)
        classifier.fit(data)
    elif index == 2:
        classifier = svm.OneClassSVM(nu=params['alpha'] + 0.005, kernel="rbf",
                                     gamma=0.1)
        classifier.fit(data)
    elif index == 3:
        classifier = MeanShift(bin_seeding=True, n_jobs=-1)
        classifier.fit(data)
    elif index == 4:
        classifier = EllipticEnvelope(contamination=params['alpha'])
        classifier.fit(data)
    elif index == 5:
        classifier = IsolationForest(contamination=params['alpha'],
                                     random_state=None)
        classifier.fit(data)
    elif index == 6:
        classifier = Birch(n_clusters=2)
        classifier.fit(data)
    return classifier
def set_Cluster(self, algorithm, param_dict):
    self.algorithm_name = algorithm
    if algorithm == "KMeans":
        self.cluster = KMeans(param_dict[0], max_iter=param_dict[1])
    elif algorithm == "BIRCH":
        self.cluster = Birch(n_clusters=param_dict[0], threshold=param_dict[1])
    elif algorithm == "DBSCAN":
        self.cluster = DBSCAN(eps=param_dict[0], min_samples=param_dict[1])
    elif algorithm == "GMM":
        self.cluster = GMM(n_clusters=param_dict[0], max_iter=param_dict[1])
    elif algorithm == "OPTICS":
        self.cluster = OPTICS(min_samples=param_dict[0], max_eps=param_dict[1])
    elif algorithm == "MeanShift":
        self.cluster = MEANSHIFT(quantile=param_dict[0], n_samples=param_dict[1])
    elif algorithm == "CLIQUE":
        self.cluster = CLIQUE(intervals=param_dict[0], threshold=param_dict[1])
    else:
        print("No matching clusterer found")
        self.cluster.class_ = None
def find_anomalous_edges(self):
    for edge in self.edges:
        elapsed_time = np.array(
            list(self.trace_data[self.trace_data.path == edge]['elapsedTime']))
        normalized_time = preprocessing.normalize([elapsed_time]).reshape(-1, 1)
        if self.take_minute_averages_of_trace_data:
            birch = Birch(branching_factor=50, n_clusters=None,
                          threshold=0.05, compute_labels=True)
        else:
            birch = Birch(branching_factor=50, n_clusters=None,
                          threshold=0.001, compute_labels=True)
        birch.fit_predict(normalized_time)
        labels = birch.labels_
        if np.unique(labels).size > 1:
            self.anomalous_edges[edge.split('-')[1]] = edge
def birchModel(self):
    birch_model = Birch()
    birch_model.fit(self.X)
    # Plot result
    labels = birch_model.labels_
    centroids = birch_model.subcluster_centers_
    n_clusters = np.unique(labels).size
    print("n_clusters : %d" % n_clusters)
    # predict once up front instead of re-predicting on every loop iteration
    predictions = birch_model.predict(self.X)
    print(predictions)
    for i in range(1, self.X.shape[0]):
        if predictions[i] == 1:
            print(i)

# KMeansModel()
# linkageModel()
# agglomerativeClusteringModel()
# TSNETest()
# birchModel()
# decisiontree()
# model_predict()
# randomforest()
def train(feature, weights, cluster_num, feature_path=None):
    if feature_path is not None:
        feature = pd.read_csv(feature_path)
    X, Y = [], []
    print("Training...\n")
    for i in range(len(feature[feature.columns[0]])):
        f = np.array(feature.iloc[i][1:])
        f_w = combine(feature.iloc[i][1:], weights)
        print(f)
        print(f_w)
        X.append(f_w)
        Y.append(f)
    # clf = Birch(n_clusters=cluster_num)  # Birch variant; the original
    # immediately overwrote it with the KMeans below, which is the one used
    clf = KMeans(n_clusters=cluster_num)
    clf.fit(X)
    pred = clf.predict(X)
    joblib.dump(clf, 'curve_model_KMeans.pkl')
    rdf = RandomForestClassifier()
    rdf.fit(Y, pred)
    joblib.dump(rdf, 'rforest_model.pkl')
    print(pred)
    return pred
def create_graph(self):
    # creates weighted graph from the trace data
    print('Creating graph of %d edges:' % len(self.edges))
    for edge in self.edges:
        source, destination = edge.split('-')
        if source != 'Start':
            vector_of_time = self.dictionary_of_times[edge]
            reshaped_vector_of_time = np.reshape(vector_of_time, (-1, 1))
            if len(reshaped_vector_of_time) > 5000:
                k = len(reshaped_vector_of_time) // 5000 + 1
                rnge = np.arange(len(reshaped_vector_of_time))
                indices = (rnge % k) == 0
                reshaped_vector_of_time = reshaped_vector_of_time[indices]
            KDE = KernelDensity(kernel='gaussian',
                                bandwidth=1.0).fit(reshaped_vector_of_time)
            KDE_scores = KDE.score_samples(reshaped_vector_of_time)
            mean_of_KDE_scores = -np.mean(KDE_scores)
            normalized_vector_of_time = preprocessing.normalize(
                [vector_of_time]).reshape(-1, 1)
            birch = Birch(n_clusters=None, threshold=0.1, compute_labels=True)
            birch.fit(normalized_vector_of_time)
            birch.predict(normalized_vector_of_time)
            labels = birch.labels_
            birch_clustering_score = 100 * len(
                labels[np.where(labels != 0)]) / len(labels)
            total_weight = (mean_of_KDE_scores * birch_clustering_score
                            + mean_of_KDE_scores + birch_clustering_score)
            self.base_graph.add_edge(source, destination, weight=total_weight)
            print('Added edge: %s with weight %f, ' % (edge, total_weight)
                  + 'KDE performed on %d rows' % len(reshaped_vector_of_time))
    print('Finished creating graph.')
def callback(self, odom_msg, scan_msg):
    print('-----------------------------------------')
    start_time = time.time()

    # process odometry message
    rx = odom_msg.pose.pose.position.x
    ry = odom_msg.pose.pose.position.y
    q = odom_msg.pose.pose.orientation
    rth = arctan2(2 * q.x * q.y - 2 * q.z * q.w, 1 - 2 * q.y**2 - 2 * q.z**2)
    rth = 2 * pi - rth % (2 * pi)
    pose = np.array([rx, ry, rth])
    self.pose = pose.copy()

    # process scan message
    bearings = self.bearings.copy()
    ranges = np.array(scan_msg.ranges)
    inf_flag = (-1 * np.isinf(ranges).astype(int) + 1)
    ranges = np.nan_to_num(ranges) * inf_flag
    euc_coord_x = pose[0] + np.cos(bearings + pose[2]) * ranges
    euc_coord_y = pose[1] + np.sin(bearings + pose[2]) * ranges
    dist_flag = np.where(
        (euc_coord_x - pose[0])**2 +
        (euc_coord_y - pose[1])**2 != 0.0)[0]
    points = np.array([euc_coord_x, euc_coord_y]).T
    points = points[dist_flag]

    self.obsv = []
    if len(points) > 0:
        brc = Birch(n_clusters=None, threshold=0.05)
        brc.fit(points)
        labels = brc.predict(points)
        u_labels = np.unique(labels)
        for l in u_labels:
            seg_idx = np.where(labels == l)
            seg = points[seg_idx]
            if seg.shape[0] <= 1:
                fit_cov = 10
            else:
                fit_cov = np.trace(np.cov(seg.T))
            if fit_cov < 0.001 and seg.shape[0] >= 5:
                self.obsv.append(seg.mean(axis=0))
    print('odom: {}\nlandmarks:\n{}'.format(pose, self.obsv))

    # publish observed landmarks
    cube_list = Marker()
    cube_list.header.frame_id = 'odom'
    cube_list.header.stamp = rospy.Time.now()
    cube_list.ns = 'landmark_point'
    cube_list.action = Marker.ADD
    cube_list.pose.orientation.w = 1.0
    cube_list.id = 0
    cube_list.type = Marker.CUBE_LIST
    cube_list.scale.x = 0.05
    cube_list.scale.y = 0.05
    cube_list.scale.z = 0.5
    cube_list.color.b = 1.0
    cube_list.color.a = 1.0
    for landmark in self.obsv:
        p = Point()
        p.x = landmark[0]
        p.y = landmark[1]
        p.z = 0.25
        cube_list.points.append(p)
    self.obsv_pub.publish(cube_list)

    '''
    # send control
    ctrl = self.erg_ctrl(pose.copy())
    ctrl_lin = ctrl[0]
    ctrl_ang = ctrl[1]
    vel_msg = Twist()
    vel_msg.linear.x = ctrl_lin
    vel_msg.linear.y = 0.0
    vel_msg.linear.z = 0.0
    vel_msg.angular.x = 0.0
    vel_msg.angular.y = 0.0
    vel_msg.angular.z = ctrl_ang
    self.ctrl_pub.publish(vel_msg)
    '''

    # log
    self.log['count'] += 1
    self.log['traj'].append(pose.copy())
    # self.log['ctrls'].append(ctrl.copy())
    print('elapsed time: {}'.format(time.time() - start_time))
def birch_1(a, kwargs):
    # `kwargs` is a plain dict of Birch keyword arguments
    return Birch(**kwargs).fit_predict(a)
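# Demo driver (an assumption, not from the source): birch_1 takes the
# estimator's keyword arguments as a plain dict in its second parameter.
import numpy as np

rng = np.random.RandomState(0)
demo_a = np.vstack([rng.normal(0, 0.2, (50, 2)), rng.normal(5, 0.2, (50, 2))])
print(birch_1(demo_a, {"n_clusters": 2, "threshold": 0.3}))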
def __init__(self):
    self.wrapped = Birch(n_clusters=2)
    self.data = []
    self.indexes = []
ratio = 0.9
n_paa_segments = 18
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_mid = paa.fit_transform(stdData[:, :int(ratio * stdData.shape[1])])
paa_mid = paa_mid.reshape(paa_mid.shape[0], paa_mid.shape[1])
first_clus = paa_mid.copy()
for i in range(len(first_clus)):
    first_clus[i] = rankbased(paa_mid[i])

#################################################################
# First clustering pass: run Birch to get an initial partition, then refine it
# with KMeans. The input is the rank-based data.
# Possible improvement: use the raw data directly and tune the Birch threshold.
data = first_clus
s = time.time()
y_pre = Birch(n_clusters=None, threshold=getEpsilon(data, 0.8)).fit_predict(data)
y_pre = KMeans(n_clusters=max(y_pre) + 1, random_state=0).fit_predict(data)
e = time.time()

#################################################################
# Second clustering pass: choose k by gap statistics over values up to 10 in
# steps of 2. The clustering target is the residuals.
# Possible improvement: cluster either the residuals or the standardized data
# directly.
import pandas as pd


def optimalK(data, nrefs=3, maxClusters=15):
    """
    Calculates KMeans optimal K using Gap Statistic from Tibshirani, Walther, Hastie
    Params:
        data: ndarray of shape (n_samples, n_features)
        nrefs: number of sample reference datasets to create
from general_functions import *

if __name__ == '__main__':
    # hypothetical_goal = [0 if _ < 80 else 1 for _ in range(120)]
    MODEL_NAME = 'model/tf_idf_1.csv'
    N_ARRANGE = (1, 1)
    MODE = 'word'
    make_tf_idf_model(N_ARRANGE, MODEL_NAME, mode=MODE)
    data = pd.read_csv(MODEL_NAME, index_col=0)

    from sklearn.cluster import KMeans
    from sklearn.cluster import SpectralClustering
    from sklearn.decomposition import PCA
    from sklearn.cluster import Birch

    pca = PCA(n_components=3)
    data = pd.DataFrame(pca.fit_transform(data))
    spectral = SpectralClustering(2, random_state=0)
    k_means = KMeans(n_clusters=2, random_state=0)
    birch = Birch(threshold=0.1, n_clusters=2)
    train_and_show(data, spectral)
    train_and_show(data, k_means)
    train_and_show(data, birch)
for k in range_clusters:
    # fit data for k clusters
    spectral = Clustering(SpectralClustering(n_clusters=k))
    spectral.fit(data_df)
    # evaluate clustering through silhouette score
    score['Spectral'].append(spectral.evaluate())

# ------------------------------------------------------------------------------
# -- Birch Performance
# ------------------------------------------------------------------------------
for k in range_clusters:
    # fit data for k clusters
    birch = Clustering(Birch(n_clusters=k, threshold=0.36))
    birch.fit(data_df)
    # evaluate clustering through silhouette score
    score['Birch'].append(birch.evaluate())

# ------------------------------------------------------------------------------
# -- DBSCAN Performance
# ------------------------------------------------------------------------------
# DBSCAN
dbscan = Clustering(DBSCAN(eps=.5, min_samples=3))
dbscan.fit(data_df)
score['DBSCAN'].append(dbscan.evaluate())
print(data, citypos.shape)

# KMeans
km = KMeans(n_clusters=100, n_init=1)
itime = time.perf_counter()
kmlabels = km.fit_predict(citypos)
etime = time.perf_counter()
print('K-means Time = ', etime - itime)

# Minibatch Kmeans
itime = time.perf_counter()
mbkm = MiniBatchKMeans(n_clusters=100, batch_size=1000, n_init=1, max_iter=5000)
mbkmlabels = mbkm.fit_predict(citypos)
etime = time.perf_counter()
print('MB K-means Time = ', etime - itime)
print('Similarity Km vs MBKm', adjusted_mutual_info_score(kmlabels, mbkmlabels))

# Birch
itime = time.perf_counter()
birch = Birch(threshold=0.02, n_clusters=100, branching_factor=100)
birchlabels = birch.fit_predict(citypos)
etime = time.perf_counter()
print('BIRCH Time = ', etime - itime)
print('Similarity Km vs BIRCH', adjusted_mutual_info_score(kmlabels, birchlabels))
columns = ['RadPeer.Score', 'RadPeer.Significance.of.Errors',
           'Technical.Performance.Score', 'Percent.Error']
features = summary[columns]
# fig = pd.scatter_matrix(features, figsize=(18, 18), alpha=0.5, grid=True)
# sns.plt.savefig('features_scatter.png', bbox_inches='tight')

# scaling
mms = MinMaxScaler()
X = mms.fit_transform(features)

# set up clustering algorithms
db = DBSCAN(eps=0.3, min_samples=5)
ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='average')
# km = MiniBatchKMeans(n_clusters=2, random_state=1, n_init=15)
bc = Birch(n_clusters=2)
# sp = SpectralClustering(n_clusters=2, eigen_solver='arpack', random_state=1)
# bandwidth = estimate_bandwidth(X, quantile=0.3)
# ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
# ap = AffinityPropagation(damping=.9, preference=-200)

# y_km = km.fit_predict(X)
y_ac = ac.fit_predict(X)
utils.swap_label(y_ac)
y_bc = bc.fit_predict(X)
utils.swap_label(y_bc)
y_db = db.fit_predict(X)
y_db[y_db == -1] = 1
# print np.unique(y_db)
# y_sp = sp.fit_predict(X)
# y_ms = ms.fit_predict(X)
import numpy as np
from sklearn.cluster import Birch
# sklearn.datasets.samples_generator is deprecated; make_blobs now lives in
# sklearn.datasets
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from itertools import cycle

# Generates random vectors to cluster
n_samples = 50
centers = [[0, 1], [4, -2], [-2, 2], [0, -1]]
X, _ = make_blobs(n_samples=n_samples, centers=centers, cluster_std=0.2)

# Creates the Birch classifier and gives it the vectors
brc = Birch(branching_factor=50, n_clusters=None, threshold=0.8,
            compute_labels=True)
brc.fit(X)
labels = brc.labels_
cluster_centers = brc.subcluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# Prints the points generated
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
plt.axis([-4, 12, -4, 12])
plt.title('Estimated number of clusters: %d' % n_clusters_)
def birchclustering(datalist):
    brc = Birch(branching_factor=50, n_clusters=None, threshold=0.17,
                compute_labels=True)
    brc.fit(datalist)
    return brc
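# Sketch of consuming the fitted estimator that birchclustering returns; the
# random demo data is an assumption, not from the source.
import numpy as np

demo_data = np.random.RandomState(1).rand(100, 3)
fitted = birchclustering(demo_data)
print("subclusters found:", np.unique(fitted.predict(demo_data)).size)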
    event_array[i, 1] = dsp_dict["EVLO"]

station_array = np.array(station_list)
dsp_array = np.array(dsp_list)

# extract the unique station names
stations = np.unique(station_array)
print stations

for sta in stations:
    events = event_array[station_array == sta, :]
    dsp_shortlist = dsp_array[station_array == sta]
    print sta, events.shape, dsp_shortlist.shape
    # cluster on events so as to compare dispersion curves for nearby events
    brc = Birch(branching_factor=50, n_clusters=None, threshold=dist,
                compute_labels=True)
    brc.fit(events)
    labels = brc.predict(events)
    print np.max(labels)
    for lab in np.unique(labels):
        dsp_this_label_list = dsp_shortlist[labels == lab]
        cluster_name = os.path.join(dirname, "cluster_%s_%03d" % (sta, lab))
        plot_all_dsp(dsp_this_label_list, legend=False,
                     fname="%s_gvel.png" % cluster_name)
        plot_all_map(dsp_this_label_list, fname="%s_map.png" % cluster_name,
                     legend=False)
        f = open("%s_info.txt" % cluster_name, "w")
        for (dsp, dsp_dict) in dsp_this_label_list:
            f.write(
                "%s %s %d %03d %02d %02d %.3f %.3f\n" % (
                    dsp_dict["STA"],
                    dsp_dict["COMP"],
def __init__(self, num_clusters, feature_names, train_x, train_y, rep):
    ClusterModel.__init__(self, train_x, train_y, feature_names, rep)
    self.birch_model = Birch(n_clusters=num_clusters).fit(train_x)
    # labels_ is populated by fit(); the original also called predict(train_x)
    # and discarded the result
    self.labels = self.birch_model.labels_
    self.num_clusters = num_clusters
KMS = MiniBatchKMeans(n_clusters=6, init='k-means++', n_init=10, max_iter=300,
                      tol=0.0001).fit(Wine_Softmax)
SSE, SSB, SSE_cluster = calculateMeasures(Wine_Softmax, KMS.labels_,
                                          KMS.cluster_centers_)
print('SSB : %f' % (SSB))
print('SSE : %f' % (SSE))
KMS

# In[69]:

print('\nWine_Base')
KMS = Birch(n_clusters=6).fit(Wine_Base)
SSE, SSB, SSE_cluster = calculateMeasures(
    Wine_Base, KMS.labels_,
    updateCentroids(Wine_Base, KMS.labels_, np.zeros((k, Wine_Base.shape[1]))))
print('SSB : %f' % (SSB))
print('SSE : %f' % (SSE))

print('\nWine_Norm')
KMS = Birch(n_clusters=6).fit(Wine_Norm)
# measure on Wine_Norm; the original passed Wine_Base here, apparently a
# copy-paste slip from the block above
SSE, SSB, SSE_cluster = calculateMeasures(
    Wine_Norm, KMS.labels_,
    updateCentroids(Wine_Norm, KMS.labels_, np.zeros((k, Wine_Norm.shape[1]))))
print('SSB : %f' % (SSB))
print('SSE : %f' % (SSE))

print('\nWine_Softmax')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42)
print(X_train.shape, "\n\n", X_test.shape, "\n\n", y_train.shape, "\n\n",
      y_test.shape, "\n\n")

kms = KMeans(n_clusters=7, random_state=0).fit(X_train)
y_pred = kms.predict(X_test)
print("Accuracy using KMeans Clustering: ",
      metrics.accuracy_score(y_test, y_pred))

agg = AgglomerativeClustering(n_clusters=1).fit(X_train)
y_pred = agg.fit_predict(X_test)
print("Accuracy using Agglomerative Clustering: ",
      metrics.accuracy_score(y_test, y_pred))

brc = Birch(n_clusters=2).fit(X_train)  # the original called .fit(X_train) twice
y_pred = brc.predict(X_test)
print("Accuracy using Birch Clustering: ",
      metrics.accuracy_score(y_test, y_pred))

'''
Accuracy using KMeans Clustering:  0.17061611374407584
Accuracy using Agglomerative Clustering:  0.6777251184834123
Accuracy using Birch Clustering:  0.5450236966824644
'''
for item in range(len(affinity_propagation_valid_performance_metric_array)):
    affinity_propagation_valid_performance_metrics_for_plotting[item + 1] = \
        affinity_propagation_valid_performance_metric_array[item]
    affinity_propagation_test_performance_metrics_for_plotting[item + 1] = \
        affinity_propagation_test_performance_metric_array[item]

Figures.save_valid_test_performance_measures_vs_hyper_parameters_figure(
    affinity_propagation_parameter_search_space_for_plotting,
    affinity_propagation_valid_performance_metrics_for_plotting,
    affinity_propagation_test_performance_metrics_for_plotting,
    'Adjusted Mutual Information Score',
    'AffinityPropagation Clustering damping parameter',
    'Affinity_Propagation_Performance',
    0, 0.5, left_horizontal_limit=0.5)

# Do BIRCH, optimizing number of calls to partial_fit over a validation set
current_optimal_birch_number_of_calls = 1
initial_optimal_birch_clusterer = Birch()
initial_optimal_birch_clusterer.partial_fit(train_data_set)
initial_optimal_birch_clusterer.set_params(n_clusters=number_of_classes)
initial_birch_valid_predictions = initial_optimal_birch_clusterer.predict(valid_data_set)
initial_birch_test_predictions = initial_optimal_birch_clusterer.predict(test_data_set)

# Add one to the predictions to make them match up with the range of labels,
# then apply the Hungarian fix
for element in range(number_of_valid_observations):
    initial_birch_valid_predictions[element] += 1
for element in range(number_of_test_observations):
    initial_birch_test_predictions[element] += 1
initial_birch_valid_predictions = Clustering.Hungarian_Fix(
    initial_birch_valid_predictions, valid_labels).astype('int')
initial_birch_test_predictions = Clustering.Hungarian_Fix(
    initial_birch_test_predictions, test_labels).astype('int')
def main():
    # parameters
    write_whole_cluster = False
    perform_pca = False
    birch_thresh = 2.0
    count_thresh = 0.1
    eval_file_names = [
        'filtered_eval_three_event.csv', 'filtered_eval_five_event.csv',
        'filtered_eval_seven_event.csv'
    ]
    annotated_file_names = [
        'annotated_three_event.txt', 'annotated_five_event.txt',
        'annotated_seven_event.txt'
    ]
    for m in range(0, len(eval_file_names)):
        fileName = eval_file_names[m]
        file_prefix = 'output'
        print(fileName)
        for birch_thresh in np.arange(0.0, 4.1, 0.2):
            for count_thresh in np.arange(0.1, 1.1, 0.1):
                '''for i in range(1, 179):
                       if i not in temp:
                           print(i)
                '''
                df = pd.read_csv(fileName, header=None, encoding='latin-1')
                df.columns = [
                    'record_id', 'date', 'url', 'counts', 'themes', 'locations',
                    'persons', 'organizations', 'tone'
                ]

                # Retaining only those news which have non-null themes and locations
                df = df[pd.notnull(df['themes'])]
                df = df[pd.notnull(df['locations'])]
                df_locations = pd.DataFrame(df['locations'])

                # Reading actual class labels assigned by expert human assessor
                class_labels = [None] * len(df)
                temp = {}
                with open(annotated_file_names[m], "r") as ins:
                    label = 1
                    for line in ins:
                        line = line.strip()
                        if line.startswith("#"):
                            continue
                        if line:
                            line = line.split(',')
                            # print(line)
                            for item in line:
                                class_labels[int(item) - 1] = label
                                temp[int(item)] = True
                            label += 1

                row_dict = df.copy(deep=True)
                row_dict.fillna('', inplace=True)
                row_dict.index = range(len(row_dict))
                row_dict = row_dict.to_dict('index')  # dictionary that maps row number to row

                identifier_dict = {}  # dictionary that maps GKG Record Id to Row Number
                i = 0
                for index, row in df.iterrows():
                    identifier_dict[row['record_id']] = i
                    i += 1

                df = df[df.columns[[4]]]
                df.columns = ['themes']
                df = pd.DataFrame(df['themes'].str.split(';'))  # splitting themes
                df_locations = pd.DataFrame(
                    df_locations['locations'].str.split(';'))  # splitting locations

                for row in df_locations.itertuples():
                    for i in range(0, len(row.locations)):
                        try:
                            # for retaining only ADM1 Code
                            row.locations[i] = (row.locations[i].split('#'))[3]
                        except:
                            continue
                # merged = list(itertools.chain(*row.locations))
                # df_locations.loc[row.Index, 'locations'] = merged

                df = df[pd.notnull(df['themes'])]
                mlb = MultiLabelBinarizer(sparse_output=True)
                sparse_themes = mlb.fit_transform(df['themes'])
                df = sparse_themes
                # df = sparse_locations

                # Reducing dimensions through principal component analysis
                if perform_pca:
                    pca = PCA(n_components=None)
                    df = pd.DataFrame(pca.fit_transform(df))

                # print("Starting clustering")
                brc = Birch(branching_factor=50, n_clusters=None,
                            threshold=birch_thresh, compute_labels=True)
                predicted_labels = brc.fit_predict(df)

                clusters = {}
                n = 0
                for item in predicted_labels:
                    if item in clusters:
                        # since row_dict[n] is itself a dictionary
                        clusters[item].append(list((row_dict[n]).values()))
                    else:
                        clusters[item] = [list((row_dict[n]).values())]
                    n += 1

                # clustering within each cluster, on counts
                # count_clusters maps original_cluster_key to new clusters
                # within that cluster
                count_clusters = {}
                for item in clusters:
                    count_clusters[item] = {}
                    cluster_df = pd.DataFrame(clusters[item])
                    cluster_row_dict = cluster_df.copy(deep=True)
                    cluster_row_dict.fillna('', inplace=True)
                    cluster_row_dict.index = range(len(cluster_row_dict))
                    cluster_row_dict = cluster_row_dict.to_dict('index')
                    df_counts = pd.DataFrame(cluster_df[cluster_df.columns[[3]]])
                    df_counts.columns = ['counts']
                    df_counts = pd.DataFrame(df_counts['counts'].str.split(';'))  # splitting counts
                    df_locations = pd.DataFrame(cluster_df[cluster_df.columns[[5]]])
                    df_locations.columns = ['locations']
                    df_locations = pd.DataFrame(
                        df_locations['locations'].str.split(';'))

                    for row in df_locations.itertuples():
                        for i in range(0, len(row.locations)):
                            try:
                                # for retaining only ADM1 Code
                                row.locations[i] = (row.locations[i].split('#'))[3]
                            except:
                                continue

                    for row in df_counts.itertuples():
                        for i in range(0, len(row.counts)):
                            try:
                                temp_list = row.counts[i].split('#')
                                # for retaining only COUNT_TYPE, QUANTITY and
                                # LOCATION ADM1 Code
                                row.counts[i] = (temp_list[0] + '#' + temp_list[1]
                                                 + '#' + temp_list[5])
                            except:
                                continue
                        if len(row.counts) == 1 and row.counts[0] == '':
                            # so that news with no counts are clustered together
                            row.counts.append('#')
                            row.counts.pop(0)
                        if row.counts[len(row.counts) - 1] == '':
                            row.counts.pop()
                        # Removing CRISISLEX entries due to elevated false positive rate
                        row.counts[:] = [x for x in row.counts
                                         if not x.startswith('CRISISLEX')]

                    mlb4 = MultiLabelBinarizer(sparse_output=True)
                    sparse_counts = mlb4.fit_transform(df_counts['counts'])
                    mlb5 = MultiLabelBinarizer(sparse_output=True)
                    sparse_locations = mlb5.fit_transform(df_locations['locations'])
                    small_df = hstack([sparse_locations, sparse_counts])

                    # pca = PCA(n_components=2)
                    # df_counts = pd.DataFrame(pca.fit_transform(df_counts))
                    # print(df_counts.to_string())
                    # df_counts.to_csv('one_hot_encoded_counts.csv', sep=',')
                    # return

                    brc2 = Birch(branching_factor=50, n_clusters=None,
                                 threshold=count_thresh, compute_labels=True)
                    predicted_labels2 = brc2.fit_predict(small_df)

                    n2 = 0
                    for item2 in predicted_labels2:
                        if item2 in count_clusters[item]:
                            # since cluster_row_dict[n2] is itself a dictionary
                            count_clusters[item][item2].append(
                                list((cluster_row_dict[n2]).values()))
                        else:
                            count_clusters[item][item2] = [
                                list((cluster_row_dict[n2]).values())
                            ]
                        n2 += 1

                # if write_whole_cluster:
                #     with open('filtered_one/' + file + '.txt', 'w', encoding='utf-8') as file:
                #         for item in count_clusters:
                #             for item2 in count_clusters[item]:
                #                 file.write("\n\nCluster " + str(item) + ': ' + str(item2) + "\n")
                #                 for i in range(0, len(count_clusters[item][item2])):
                #                     file.write(count_clusters[item][item2][i][2] + '\n')  # appending url
                # else:
                #     with open('filtered_one/' + file + '.csv', 'w', newline='', encoding='utf-8') as file:
                #         writer = csv.writer(file, delimiter=",")
                #         for item in count_clusters:
                #             for item2 in count_clusters[item]:
                #                 writer.writerow(count_clusters[item][item2][0])

                test_dict = {}
                label = 1
                cluster_labels = [None] * n
                with open(file_prefix + '.txt', 'w', encoding='utf-8') as file:
                    for item in count_clusters:
                        for item2 in count_clusters[item]:
                            file.write("\n\nCluster " + str(item) + ': '
                                       + str(item2) + "\n")
                            for i in range(0, len(count_clusters[item][item2])):
                                gkg_record_id = count_clusters[item][item2][i][0]
                                if gkg_record_id in test_dict:
                                    print("yes")
                                    print(gkg_record_id)
                                    return
                                test_dict[gkg_record_id] = True
                                # file.write(str(identifier_dict[gkg_record_id] + 1) + '\n'
                                #            + count_clusters[item][item2][i][2] + '\n'
                                #            + count_clusters[item][item2][i][3] + '\n\n')  # appending url
                                file.write(str(identifier_dict[gkg_record_id] + 1) + '\n')
                                cluster_labels[identifier_dict[gkg_record_id]] = label
                            label += 1

                # print(cluster_labels)
                matrix = metrics.cluster.contingency_matrix(class_labels,
                                                            cluster_labels)
                rand_index, precision, recall, f1 = precision_recall_fmeasure(matrix)
                ari = metrics.cluster.adjusted_rand_score(class_labels,
                                                          cluster_labels)
                # print("AdjustedRI:", ari)
                nmi = metrics.normalized_mutual_info_score(class_labels,
                                                           cluster_labels)
                # print("NMI :", nmi)
                print(birch_thresh, ",", count_thresh, ",", rand_index,
                      ",", precision, ",", recall, ",", f1, ",", ari, ",", nmi)
def cluster_birch(self):
    print "Starting Birch clustering"
    brc = Birch(branching_factor=10, n_clusters=40,
                threshold=self.cluster_distance, compute_labels=False)
    brc.fit(self.all_frames_xy)
    clusters = brc.predict(self.all_frames_xy)
    return clusters
subsets_original.append(X_subset1)
subsets_original.append(X_subset2)
subsets_original.append(X_subset3)
# subsets_original.append(X_subset4)

# dictionaries for storing the results
metrics_CH = dict()
metrics_SC = dict()
cluster_predict_all = dict()
# k = len(set(cluster_predict)) to see how many clusters were obtained
# if k > 1 and name is not ward; otherwise set the metrics to 0

print("------ Declaring the algorithms")
k_means = KMeans(n_clusters=3, init='k-means++')
ward = AgglomerativeClustering(n_clusters=3, linkage='ward')
birch = Birch(n_clusters=3)
dbscan = DBSCAN(eps=0.01, min_samples=10)
spectral = SpectralClustering(n_clusters=3, affinity="nearest_neighbors")
# affinity_propagation = AffinityPropagation()
# ms = MeanShift()

clustering_algorithms = [("k-means", k_means), ("ward", ward), ("birch", birch),
                         ("dbscan", dbscan), ('spectral', spectral)]

index = 1
for subset in subsets:
    print("Working on subset {}".format(index))
    for name, algorithm in clustering_algorithms:
        print("{:7s}, ".format(name), end='')
        tiempo = time.time()
# create dendrogram
# dendrogram = sch.dendrogram(sch.linkage(points, method='ward'))
hc = ac(n_clusters=2, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(points)
f2 = plt.figure()
plt.scatter(points[y_hc == 0, 0], points[y_hc == 0, 1], c='red')
plt.scatter(points[y_hc == 1, 0], points[y_hc == 1, 1], c='blue')
plt.scatter(points[y_hc == 2, 0], points[y_hc == 2, 1], c='black')
plt.scatter(points[y_hc == 3, 0], points[y_hc == 3, 1], c='cyan')
plt.title('Hierarchical Clustering')
plt.show()

# Birch clustering
bir = Birch(n_clusters=2, threshold=0.8, branching_factor=200)
bir.fit(points)
y_bir = bir.fit_predict(points)
f3 = plt.figure()
plt.scatter(points[y_bir == 0, 0], points[y_bir == 0, 1], c='red')
plt.scatter(points[y_bir == 1, 0], points[y_bir == 1, 1], c='blue')
plt.scatter(points[y_bir == 2, 0], points[y_bir == 2, 1], c='black')
plt.scatter(points[y_bir == 3, 0], points[y_bir == 3, 1], c='cyan')
plt.title('Birch Clustering')
plt.show()

# DBSCAN
dbs = DBSCAN(eps=0.1, min_samples=5)
dbs.fit(points)
def getSecondClus_2(data):
    epsilon = getEpsilonFromtiny(data)
    y_pre = Birch(n_clusters=None, threshold=epsilon).fit_predict(data)
    return y_pre
                                   include_self=False)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ward = AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward',
                               connectivity=connectivity)
spectral = SpectralClustering(n_clusters=params['n_clusters'],
                              eigen_solver='arpack',
                              affinity="nearest_neighbors")
dbscan = DBSCAN(eps=params['eps'])
affinity_propagation = AffinityPropagation(damping=params['damping'],
                                           preference=params['preference'])
average_linkage = AgglomerativeClustering(linkage="average",
                                          affinity="cityblock",
                                          n_clusters=params['n_clusters'],
                                          connectivity=connectivity)
birch = Birch(n_clusters=params['n_clusters'])
gmm = GaussianMixture(n_components=params['n_clusters'],
                      covariance_type='full')

clustering_algorithms = (('AffinityPropagation', affinity_propagation),
                         ('MeanShift', ms),
                         ('SpectralClustering', spectral),
                         ('Ward', ward),
                         ('AgglomerativeClustering', average_linkage),
                         ('DBSCAN', dbscan),
                         ('Birch', birch),
                         ('GaussianMixture', gmm))

# now plot everything
f, ax = plt.subplots(2, 4, figsize=(20, 15))
for idx, (name, algorithm) in enumerate(clustering_algorithms):
    algorithm.fit(embedding)
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(int)  # the np.int alias is deprecated
    else:
        y_pred = algorithm.predict(embedding)
def birch(X):
    br = Birch(n_clusters=None, threshold=10).fit(X)
    print('br')
    print(silhouette_score(X, br.labels_))
    # the metric was renamed calinski_harabasz_score in scikit-learn >= 0.23
    print(calinski_harabasz_score(X, br.labels_))
    return br
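# Hedged smoke test for birch() above (the demo data is an assumption): with
# threshold=10 only well-separated groups should survive as distinct
# subclusters, and both scores require at least two distinct labels, so the
# blobs are drawn far apart.
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=3, cluster_std=1.0,
                       center_box=(-100.0, 100.0), random_state=2)
birch(X_demo)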
def clf_init(b_factor=50, threshold=0.8):
    return Birch(branching_factor=b_factor, n_clusters=None,
                 threshold=threshold, compute_labels=True)
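# Hypothetical smoke test for clf_init above (demo data is an assumption):
# with n_clusters=None the fitted labels_ are the raw subcluster assignments.
import numpy as np
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=150, centers=3, cluster_std=0.3, random_state=3)
clf = clf_init(threshold=0.5)
clf.fit(X_demo)
print("subclusters:", np.unique(clf.labels_).size)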
w2v = Counter(documents_tokens[doc])
row = []
for idx in all_words:
    if idx in w2v:
        row.append(w2v[idx])
    else:
        row.append(0)
matrix.append(row)

print('Matrix shape')
print(len(matrix), 'x', len(matrix[0]))

# Birch clustering
brc = Birch(branching_factor=20, n_clusters=7, threshold=0.5,
            compute_labels=True)

# Clustering
brc.fit(matrix)
document_labels = brc.predict(matrix)
print('Document labels: ', document_labels)

# Countplot
sns.countplot(document_labels)

# Jaccard similarity measure
from sklearn.cluster import Birch
import csv
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

X = np.loadtxt(fname='Dataset.txt', skiprows=1)
# print(X)
X = [list(i) for i in X]
for i in range(len(X)):
    for j in range(2):
        X[i][j] = X[i][j] / 1000000
print(X)
X = np.array(X)
plt.scatter(X[:, 0], X[:, 1], s=4, c='black')
plt.show()

brc = Birch(branching_factor=50, n_clusters=7, threshold=0.05,
            compute_labels=True)
cftree = brc.fit(X)
ans = brc.predict(X)
labs = np.unique(ans)
cmap = plt.get_cmap('jet', len(labs))
plt.scatter(X[:, 0], X[:, 1], c=ans, s=4, cmap=cmap)
plt.show()
import time

"""
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html
https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html#module-scipy.cluster.hierarchy
https://towardsdatascience.com/machine-learning-algorithms-part-12-hierarchical-agglomerative-clustering-example-in-python-1e18e0075019
https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
"""

print("Compute birch clustering...")
st = time.time()
X = np.stack([x1, x2], axis=1)
X = np.reshape(X, (-1, 2))
n_clusters = 3
birch = Birch(n_clusters=n_clusters, threshold=0.01, branching_factor=10)
birch.fit(X)
# label = birch.labels_
label = birch.predict(X)
print("Elapsed time: ", time.time() - st)
print("Number of clusters: ", np.unique(label).size)

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(x1, x2, c=label)
ax.set_xlabel(r"$x1$", fontsize=15)
def process(X, labels_num):
    print("Clustering using Birch")
    # use the labels_num argument as the target cluster count; the original
    # hard-coded n_clusters=32 and left the parameter unused
    brc = Birch(branching_factor=20, n_clusters=labels_num, threshold=10,
                compute_labels=True).fit(X)
    pred_label = brc.predict(X)
    return pred_label
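# Demo call (an assumption): random vectors scaled so that threshold=10 still
# yields many subclusters for the global step to merge into labels_num groups.
import numpy as np

demo_X = np.random.RandomState(4).rand(64, 8) * 100
print(process(demo_X, labels_num=8))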
data_thr = mask(data, 'orbit')  # rm too large values except for 'orbit'
np.random.seed(0)
X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB,
          data_thr.rateC, data_thr.rateCA]

Html_file = open("clustering_files/birch.html", "w")

scaler = StandardScaler()
X = scaler.fit_transform(X)

for n_clusters in range(2, 10):
    km = Birch(n_clusters=n_clusters)
    preds = km.fit_predict(X)
    print "components:", set(preds)
    print np.bincount(preds)

    data_thr['preds'] = pd.Series(preds).astype("category")
    color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
                 "brown", "green", "orange"] * 2  # Spectral9
    # color_key = color_key[:len(set(preds)) + 2]

    # single plot rateCA vs rate with predicted classes and ellipses:
    single_plot = bokeh_datashader_plot(data_thr, covs=None, means=None,
class chj_data(object):
    def __init__(self, data, target):
        self.data = data
        self.target = target


def chj_load_file(fdata, ftarget):
    res = chj_data(fdata, ftarget)
    return res


print(X_train)
print(X_train["Pclass"])
iris = chj_load_file(X_train, y_pred)

X_tsne = TSNE(n_components=2, learning_rate=100).fit_transform(iris.data)
plt.figure(figsize=(12, 6))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=iris.target)
plt.colorbar()
plt.show()

y_Birch = Birch(n_clusters=None).fit_predict(X_train)
iris_Birch = chj_load_file(X_train, y_Birch)
X_tsne_Birch = TSNE(n_components=2, learning_rate=100).fit_transform(iris_Birch.data)
plt.figure(figsize=(12, 6))
plt.scatter(X_tsne_Birch[:, 0], X_tsne_Birch[:, 1], c=iris_Birch.target)
plt.colorbar()
plt.show()
class Mini():
    def __init__(self, minis, mini_names, mini_finds, sample_freq):
        self.mini_names = mini_names
        self.minis = minis
        self.sample_freq = sample_freq
        self.mini_finds = mini_finds
        self.offsets = self.fit_paras = self.event_sizes = self.amplitudes = \
            self.fast_constants = self.slow_constants = self.a_constants = \
            self.cur_labels = None
        self.dict = ['mini_names', 'minis', 'offsets', 'fit_paras',
                     'event_sizes', 'amplitudes', 'fast_constants',
                     'slow_constants', 'a_constants', 'cur_labels', 'mini_finds']
        self.delete_index = set()

    def _delete_mini(self, index):
        # truly delete
        for name in self.dict:
            if hasattr(self, name):
                llist = getattr(self, name)
                if isinstance(llist, list):
                    llist.pop(index)
                    # print(llist == getattr(self, name))
                else:
                    print(name)
                    setattr(self, name, list(llist))
                    llist = getattr(self, name)
                    llist.pop(index)

    def mark_delete_mini(self, indexs):
        # delete candidate; indexs is a list, set or tuple
        self.delete_index = self.delete_index.union(indexs)

    def truly_delete_mini(self):
        print(self.delete_index)
        self.delete_index = list(self.delete_index)
        self.delete_index.sort(reverse=True)
        for number in self.delete_index:
            self._delete_mini(number)
        self.delete_index = set()  # clear the delete flush

    def reindex_mini(self):
        self.mini_reindex = {'label': {}, 'sweep': {}}
        # self.mini_reindex['label'] = func_base.list_to_dict(self.cur_labels, self.minis)
        self.mini_reindex['label'] = func_base.list_to_dict(
            self.cur_labels, range(len(self.cur_labels)))
        # self.mini_reindex['sweep'] = func_base.list_to_dict([x[0] for x in self.mini_finds], self.minis)
        self.mini_reindex['sweep'] = func_base.list_to_dict(
            [x[0] for x in self.mini_finds], range(len(self.mini_finds)))
        print(self.mini_reindex['label'])

    # self.minis_number, self.event_sizes, self.offsets, self.fast_constants, \
    #     self.slow_constants, self.rise_10_90s, self.decay_90_50s = mini_base.statis(self.minis)
    def statis(self):
        if not self.minis:
            print("couldn't find any minis")
            return
        # print(self.minis)
        self.mini_number = len(self.minis)

        def templete_func(x, a0, a1, tau1, tau2, t0):
            try:
                return np.piecewise(
                    x, [x >= t0, x < t0],
                    [lambda x: a0 + a1 * (1 - math.exp((x - t0) / tau1))
                     * (math.exp((x - t0) / tau2)), a0])
            except:
                print('xxx', x)

        self.fit_paras = []
        self.event_sizes = []
        self.amplitudes = []
        self.offsets = []
        self.fast_constants = []
        self.slow_constants = []
        self.a_constants = []
        # fit using a double-exponential function
        param_bounds = ([-np.inf, -np.inf, 0, 0, -np.inf],
                        [np.inf, 0, np.inf, np.inf, np.inf])
        # nn = 0
        for mini in self.minis:
            self.amplitudes.append(max(mini) - min(mini))
            minilen = len(mini)
            # if the trace is too long, curve_fit cannot cope; truncate it
            if minilen > 10000:
                minilen = 10000
                mini = mini[:minilen]
            x_label = np.arange(0, minilen) / self.sample_freq
            # nn += 1
            # print(len(x_label))
            try:
                paraments, pcov = curve_fit(templete_func, x_label, mini,
                                            bounds=param_bounds)
            except:
                # print(nn)
                print("mini", mini, "label", x_label)
                plt.figure()
                plt.plot(x_label, mini)
                plt.show()
                raise
            self.fit_paras.append(paraments)
            self.offsets.append(paraments[4])
            self.fast_constants.append(paraments[2])
            self.slow_constants.append(paraments[3])
            self.a_constants.append(paraments[1])
            fit_mini = templete_func(x_label, *paraments)
            self.event_sizes.append(max(fit_mini) - min(fit_mini))

    def mini_dim_reduce(self, dim=5):
        # PCA analysis
        pca = PCA(n_components=dim)
        # Convert Python sequence to NumPy array, filling missing values
        minis = np.array(list(itertools.zip_longest(*self.minis, fillvalue=0))).T
        # transform returns an array
        self.proced_minis = pca.fit_transform(minis)
        print('explained variance ratio (first two components): %s'
              % str(pca.explained_variance_ratio_))

    def get_mini_info(self, index):
        # print(locals())
        mini = self.minis[index]
        x_label = np.arange(len(mini)) / self.sample_freq
        return self.mini_names[index], mini, self.cur_labels[index], x_label

    def classify(self, n_cluster=5):
        # Using BIRCH clustering
        self.birch = Birch(threshold=0.5, n_clusters=n_cluster)
        self.birch.fit(self.proced_minis)
        self.ori_labels = self.birch.labels_
        self.ori_centroids = self.birch.subcluster_centers_
        self.ori_n_clusters = np.unique(self.ori_labels)
        self.ori_n_cluster = np.unique(self.ori_labels).size
        self.cur_labels = self.ori_labels
        self.cur_centroids = self.ori_centroids
        self.cur_n_cluster = self.ori_n_cluster
        self.cur_n_clusters = self.ori_n_clusters

    def set_n_cluster(self, n_cluster):
        self.birch.set_params(n_clusters=n_cluster)
        self.cur_labels = self.ori_labels = self.birch.predict(self.proced_minis)
        self.cur_n_cluster = np.unique(self.cur_labels).size
        self.cur_n_clusters = np.unique(self.cur_labels)
        self.cur_centroids = self.birch.subcluster_centers_
def cluster_junctions(juncs):
    birch_model = Birch(threshold=3, n_clusters=None)
    X = np.array(juncs)
    birch_model.fit(X)
    return birch_model.labels_
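# Illustrative input (an assumption): junctions as (x, y) coordinate pairs;
# with threshold=3, junctions within a 3-unit radius collapse into the same
# subcluster.
demo_juncs = [[0, 0], [1, 1], [50, 50], [51, 49], [200, 200]]
print(cluster_junctions(demo_juncs))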
def scan_callback(self, scan_msg):
    print('-----------------------------------------')
    start_time = time.time()

    # process scan message
    pose = self.pose.copy()
    bearings = self.bearings.copy()
    ranges = np.array(scan_msg.ranges)
    inf_flag = (-1 * np.isinf(ranges).astype(int) + 1)
    ranges = np.nan_to_num(ranges) * inf_flag
    euc_coord_x = pose[0] + np.cos(bearings + pose[2]) * ranges
    euc_coord_y = pose[1] + np.sin(bearings + pose[2]) * ranges
    dist_flag = np.where(
        (euc_coord_x - pose[0])**2 +
        (euc_coord_y - pose[1])**2 != 0.0)[0]
    points = np.array([euc_coord_x, euc_coord_y]).T
    points = points[dist_flag]

    self.obsv = []
    if len(points) > 0:
        brc = Birch(n_clusters=None, threshold=0.05)
        brc.fit(points)
        labels = brc.predict(points)
        u_labels = np.unique(labels)
        for l in u_labels:
            seg_idx = np.where(labels == l)
            seg = points[seg_idx]
            if seg.shape[0] <= 1:
                fit_cov = 10
            else:
                fit_cov = np.trace(np.cov(seg.T))
            if fit_cov < 0.001 and seg.shape[0] >= 4:
                self.obsv.append(seg.mean(axis=0))
    print('odom: {}\nlandmarks:\n{}'.format(pose, self.obsv))

    # publish observed landmarks
    cube_list = Marker()
    cube_list.header.frame_id = 'odom'
    cube_list.header.stamp = rospy.Time.now()
    cube_list.ns = 'landmark_point'
    cube_list.action = Marker.ADD
    cube_list.pose.orientation.w = 1.0
    cube_list.id = 0
    cube_list.type = Marker.CUBE_LIST
    cube_list.scale.x = 0.05
    cube_list.scale.y = 0.05
    cube_list.scale.z = 0.5
    cube_list.color.b = 1.0
    cube_list.color.a = 1.0
    for landmark in self.obsv:
        p = Point()
        p.x = landmark[0]
        p.y = landmark[1]
        p.z = 0.25
        cube_list.points.append(p)
    self.obsv_pub.publish(cube_list)

    print('elapsed time: {}'.format(time.time() - start_time))
import numpy as np
from sklearn.cluster import Birch
import cluster
import csv

clusters = 20
submit_file = 'submit_birch.csv'

X, plays = cluster.get_matrix()
X = np.array(X, dtype=float)
plays = np.array(plays, dtype=float)
# print X.shape

print "Running Birch on training data...",
# the original also created a bare Birch() here that was immediately
# overwritten by the configured instance below
brc = Birch(branching_factor=50, n_clusters=clusters, threshold=0.5,
            compute_labels=True)
labels = brc.fit_predict(X)
print "Done!"
print labels

# plays_sums = [0] * clusters
# cluster_size = [0] * clusters
plays_sums = {}

# Median
for idx, label in enumerate(labels):
    if label in plays_sums:
        plays_sums[label].append(plays[idx])
    else:
        plays_sums[label] = [plays[idx]]
    # cluster_size[label] += 1