def birch_ad_with_smoothing(latency_df, threshold, smoothing_window=12):
    # Anomaly detection on response time of service invocations.
    # input: response times of service invocations, threshold for BIRCH clustering
    # output: anomalous service invocations
    # (smoothing_window was an undefined free variable in the original; it is
    # now a parameter with a default)
    anomalies = []
    for svc, latency in latency_df.items():  # .iteritems() was removed in pandas 2.0
        # No anomaly detection on db/rabbitmq columns
        if svc != 'timestamp' and 'Unnamed' not in svc and 'rabbitmq' not in svc and 'db' not in svc:
            latency = latency.rolling(window=smoothing_window, min_periods=1).mean()
            x = np.array(latency)
            x = np.where(np.isnan(x), 0, x)
            normalized_x = preprocessing.normalize([x])
            X = normalized_x.reshape(-1, 1)
            # threshold = 0.05
            brc = Birch(branching_factor=50, n_clusters=None, threshold=threshold, compute_labels=True)
            brc.fit(X)
            labels = brc.labels_  # fit() with compute_labels=True already sets labels_
            # centroids = brc.subcluster_centers_
            n_clusters = np.unique(labels).size
            if n_clusters > 1:
                anomalies.append(svc)
    return anomalies
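# Hedged usage sketch (not part of the original code): exercises
# birch_ad_with_smoothing() on synthetic latency series. Assumes the same
# imports the function itself relies on: numpy as np, pandas as pd, and
# sklearn's preprocessing module and Birch class.
import numpy as np
import pandas as pd

def _demo_birch_ad_with_smoothing():
    rng = np.random.default_rng(0)
    steady = rng.normal(100.0, 1.0, 200)   # well-behaved service
    spiky = rng.normal(100.0, 1.0, 200)
    spiky[150:160] += 80.0                 # injected latency spike
    latency_df = pd.DataFrame({"svc_a": steady, "svc_b": spiky})
    # expect only 'svc_b' to come back as anomalous
    print(birch_ad_with_smoothing(latency_df, threshold=0.05))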
class Birch_(ClusterModel):
    def __init__(self, num_clusters, feature_names, train_x, train_y, rep):
        ClusterModel.__init__(self, train_x, train_y, feature_names, rep)
        self.birch_model = Birch(n_clusters=num_clusters).fit(train_x)
        self.labels = self.birch_model.labels_  # fit() already computes labels; no extra predict() needed
        self.num_clusters = num_clusters
def _get_centers(self, x):
    x = np.array(x)
    if self.hidden is None:
        # let BIRCH decide: use its subcluster centers as the center set
        brc = Birch()
        brc.fit(x)
        return brc.subcluster_centers_
    else:
        # otherwise sample `hidden` distinct rows at random as centers
        if x.shape[0] == 1:
            x = x.T
        print(x.shape)
        idx = np.random.choice(x.shape[0], self.hidden, replace=False)
        print(idx)
        return x[idx]
class Birch_algo_wrapper:
    def __init__(self):
        self.wrapped = Birch(n_clusters=None, threshold=0.5, branching_factor=50)

    def fit(self, data):
        return self.wrapped.fit(data)

    def fit_predict(self, data):
        # incremental variant: absorb this batch into the CF-tree, then label it
        self.wrapped = self.wrapped.partial_fit(data)
        return self.wrapped.predict(data)

    def predict(self, data):
        return self.wrapped.predict(data)
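# Hedged usage sketch (illustrative, not from the source repo): stream two
# mini-batches through the wrapper. partial_fit grows the CF-tree
# incrementally; predict labels each batch against the tree built so far.
import numpy as np

def _demo_birch_algo_wrapper():
    rng = np.random.default_rng(1)
    w = Birch_algo_wrapper()
    batch1 = rng.normal(0.0, 0.1, size=(50, 2))
    batch2 = rng.normal(2.0, 0.1, size=(50, 2))
    print(w.fit_predict(batch1)[:5])   # labels for the first batch
    print(w.fit_predict(batch2)[:5])   # tree updated in place, labels for batch 2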
def bclustering(matlist, numlist, thre):
    ids = []
    X = np.asarray(matlist, dtype=float)
    brc = Birch(branching_factor=80, n_clusters=300, threshold=thre, compute_labels=True)
    labels = brc.fit_predict(X)
    # group the ids in numlist by cluster label; BIRCH labels run 0..k-1
    # (unlike DBSCAN, there is no -1 "noise" label)
    for lab in np.unique(labels):
        list_id = np.asarray(numlist)[labels == lab]
        ids.append(list(list_id))
    return ids
def train(feature, weights, cluster_num, feature_path=None, down=0.006, up=0.0085, bf_index=2):
    if feature_path is not None:
        feature = pd.read_csv(feature_path)
    X = []
    print("Training...\n")
    for i in range(len(feature[feature.columns[0]])):
        f = np.array(feature.iloc[i][1:])
        key = f[bf_index]
        if key > up:
            f_w = combine(feature.iloc[i][1:], weights)
            X.append(f_w)
    # NOTE: the original overwrote this BIRCH model with a KMeans on the next
    # line yet still dumped the result as 'curve_model_Birch.pkl'; keep BIRCH
    # so the saved model matches the filename.
    clf = Birch(n_clusters=cluster_num)
    clf.fit(X)
    pred = []
    for i in range(len(feature[feature.columns[0]])):
        f = np.array(feature.iloc[i][1:])
        key = f[bf_index]
        if key > up:
            p = clf.predict([combine(f, weights)])
            pred.append(p[0])
        elif key < down:
            pred.append(cluster_num)
        else:  # down <= key <= up (the original skipped the boundary values)
            pred.append(cluster_num + 1)
    joblib.dump(clf, 'curve_model_Birch.pkl')
    print(pred)
    return pred
def birch_skm_part1_helper(data, m, k, delta):
    """
    Receives data, computes k centers with sklearn's Birch, and returns them
    together with their quantile radii.
    :param data: numpy array
    :param m: size of the data
    :param k: number of centers
    :param delta: int
    :return: tuple of two numpy arrays, (k_medoids, k_distances)
    """
    birch_instance = Birch(n_clusters=k, threshold=0.1)  # birch instance
    birch_instance.fit(data)  # run birch on the data
    labels = birch_instance.predict(data)  # cluster index for each point
    l_medoids = []
    # since birch does not return centers, compute one medoid per cluster
    for label in range(np.unique(labels).size):
        cluster = data[labels == label]
        kmedoids_instance_for_birch = kmedoids(cluster.tolist(), init_centers(cluster, 1))
        kmedoids_instance_for_birch.process()
        l_medoids.append(cluster[kmedoids_instance_for_birch.get_medoids()][0])
    l_medoids = np.array(l_medoids)
    q = calc_q(m, delta)  # calculate q
    # calculate the distance to the quantile points around each center
    l_distances = calc_quantile_radius_around_centers(data, l_medoids, q, k)
    return l_medoids, l_distances
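# Hedged pure-numpy alternative (an assumption, not part of the source): the
# helper above leans on pyclustering's kmedoids; the same per-cluster medoid
# (the point minimizing summed in-cluster distances) can be computed directly.
import numpy as np
from sklearn.cluster import Birch

def birch_medoids(data, k):
    data = np.asarray(data, dtype=float)
    labels = Birch(n_clusters=k, threshold=0.1).fit_predict(data)
    medoids = []
    for lab in np.unique(labels):
        cluster = data[labels == lab]
        # pairwise distance matrix within the cluster
        pairwise = np.linalg.norm(cluster[:, None] - cluster[None, :], axis=-1)
        medoids.append(cluster[pairwise.sum(axis=1).argmin()])
    return np.array(medoids)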
class ClusteringObjectClassifierModel(object):
    def __init__(self):
        self.learned_classes = dict()
        self.max_classes = 10
        self.estimator = Birch(n_clusters=None, threshold=10.0)

    def online_fit(self, X, class_name):
        # X is expected to hold a single sample here
        self.estimator.partial_fit(X)
        cluster_id = self.estimator.labels_.item()  # np.asscalar was removed in NumPy 1.23
        if cluster_id not in self.learned_classes:
            print("Assigning cluster id %d to class %s" % (cluster_id, class_name))
            self.learned_classes[cluster_id] = class_name
        return self.__pca_on_cluster_centers(self.estimator.subcluster_centers_)

    def __pca_on_cluster_centers(self, cluster_centers):
        pca = PCA(n_components=2)
        coords = np.atleast_2d(pca.fit_transform(cluster_centers))
        if len(coords) < 2:
            return np.zeros(1), np.zeros(1)
        return coords[:, 0], coords[:, 1]

    def predict_class(self, X):
        if not hasattr(self.estimator, "root_"):
            return False, False
        cluster_id = self.estimator.predict(X).item()
        if cluster_id not in self.learned_classes:
            return False, False
        return self.learned_classes[cluster_id], cluster_id
def birch_clusters(textdata, trained_doc2vec, n_clusters, start_alpha=0.025,
                   infer_epoch=100, branching_factor=10, threshold=0.01,
                   compute_labels=True, metric='cosine', **kwargs):
    infer_list = []
    for doc in textdata:
        # note: gensim >= 4.0 renamed infer_vector's `steps` kwarg to `epochs`
        infer_list.append(trained_doc2vec.infer_vector(doc, alpha=start_alpha,
                                                       steps=infer_epoch, **kwargs))
    brc = Birch(branching_factor=branching_factor, n_clusters=int(n_clusters),
                threshold=threshold, compute_labels=compute_labels)
    brc.fit(infer_list)
    clusters = brc.predict(infer_list)
    birch_labels = brc.labels_
    silhouette_score = metrics.silhouette_score(infer_list, birch_labels, metric=metric)
    return silhouette_score, clusters
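# Hedged helper (an assumption, not in the source): sweep n_clusters for
# birch_clusters() and keep the silhouette-optimal setting. Requires the same
# trained gensim Doc2Vec model and tokenized documents as above.
def best_birch_clusters(textdata, trained_doc2vec, candidates=(2, 4, 8, 16)):
    best = None
    for k in candidates:
        score, clusters = birch_clusters(textdata, trained_doc2vec, k)
        if best is None or score > best[0]:
            best = (score, k, clusters)
    return best  # (silhouette score, n_clusters, cluster assignments)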
def cluster_latlon(n_clusters, data):
    # split the data between "around NYC" and "other locations", basically our first two clusters
    in_nyc = ((data.longitude > -74.05) & (data.longitude < -73.75) &
              (data.latitude > 40.4) & (data.latitude < 40.9))
    data_c = data[in_nyc].copy()
    data_e = data[~in_nyc].copy()  # the original negated only the first condition
    # put it in matrix form (DataFrame.as_matrix was removed in pandas 1.0)
    coords = data_c[["latitude", "longitude"]].values
    brc = Birch(branching_factor=100, n_clusters=n_clusters, threshold=0.01, compute_labels=True)
    brc.fit(coords)
    clusters = brc.predict(coords)
    data_c["cluster_" + str(n_clusters)] = clusters
    data_e["cluster_" + str(n_clusters)] = -1  # assign cluster label -1 for the non-NYC listings
    data = pd.concat([data_c, data_e])
    plt.scatter(data_c["longitude"], data_c["latitude"],
                c=data_c["cluster_" + str(n_clusters)], s=10, linewidth=0.1)
    plt.title(str(n_clusters) + " Neighbourhoods from clustering")
    plt.show()
    return data
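# Hedged usage sketch (synthetic data, not from the source): cluster_latlon()
# expects a DataFrame with 'latitude'/'longitude' columns; listings outside
# the NYC bounding box come back with cluster label -1.
import numpy as np
import pandas as pd

def _demo_cluster_latlon():
    rng = np.random.default_rng(3)
    df = pd.DataFrame({
        "latitude": np.r_[rng.uniform(40.5, 40.8, 50), [34.05]],    # 50 NYC points + 1 LA point
        "longitude": np.r_[rng.uniform(-74.0, -73.8, 50), [-118.24]],
    })
    out = cluster_latlon(3, df)
    print(out["cluster_3"].value_counts())  # expect exactly one row labelled -1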
def do_BIRCH(nc=100):
    os.chdir("/home/admin123/Clustering_MD/Paper/clustering.experiments/")
    fp = "Jan_2016_Delays_Recoded.csv"
    df = pd.read_csv(fp)
    X = df.to_numpy()  # DataFrame.as_matrix was removed in pandas 1.0
    del df
    ipca = IncrementalPCA(n_components=2)
    X_ipca = ipca.fit_transform(X)
    del X
    logger.debug("Starting BIRCH on large dataset - " + str(X_ipca.shape[0]) + " rows!")
    brc = Birch(branching_factor=50, n_clusters=nc, threshold=0.25, compute_labels=True)
    brc = brc.fit(X_ipca)
    labels = brc.predict(X_ipca)
    logger.debug("Done with BIRCH !")
    # renamed from metrics.calinski_harabaz_score in scikit-learn 0.20
    chis = metrics.calinski_harabasz_score(X_ipca, labels)
    logger.debug("CH index score : " + str(chis))
    colors = cm.rainbow(np.linspace(0, 1, nc))
    ax = plt.gca()
    # zip the unique labels against the nc colours (the original zipped the
    # per-sample label array, re-plotting each cluster many times over)
    for l, c in zip(np.unique(labels), colors):
        mask = labels == l
        ax.plot(X_ipca[mask, 0], X_ipca[mask, 1], 'w', markerfacecolor=c, marker='.')
    ax.set_title("BIRCH Airline Delay for January 2016")
    ax.set_xlabel("Principal Component 1")
    ax.set_ylabel("Principal Component 2")
    plt.grid()
    plt.show()
    return
def birch(df):
    print(" ----------------------")
    print(" Birch Clustering")
    print(" ----------------------")
    # assumes each dfN below retains a single feature column, so the
    # (index, value) pairs built further down form an (n, 2) array
    df1 = df.drop(columns=[
        'Class Attribute', 'Semester type', 'Speaker Type', 'Course',
        'Course instructor'
    ])
    df2 = df.drop(columns=[
        'Class Attribute', 'Semester type', 'Speaker Type', 'Course',
        'Class Size'
    ])
    df3 = df.drop(columns=[
        'Class Attribute', 'Semester type', 'Speaker Type', 'Class Size',
        'Course instructor'
    ])
    p1 = df1.to_numpy()
    p2 = df2.to_numpy()
    p3 = df3.to_numpy()
    data = np.array(np.concatenate([p1, p2, p3]))
    x_range = range(len(data))
    x = np.array(list(zip(x_range, data))).reshape(len(x_range), 2)
    plt.scatter(x[:, 0], x[:, 1])
    plt.show()
    bclust = Birch(branching_factor=100, threshold=.5).fit(x)
    print(bclust)
    labels = bclust.predict(x)
    plt.scatter(x[:, 0], x[:, 1], c=labels)
    plt.show()
def detect_segments(data):
    # rho = [normal_to_angle(row[2], row[3]) for row in data]
    rho = np.arctan2(data[:, 3], data[:, 2])  # angle of each normal, wrapped to [0, 2*pi)
    rho[rho < 0] += 2 * math.pi
    # dist = [point_to_dist(row[0], row[1], row[2], row[3]) for row in data]
    dist = np.fabs(data[:, 0] * data[:, 2] + data[:, 1] * data[:, 3])
    X = list(zip(rho, dist))
    brc = Birch(branching_factor=50, n_clusters=None, threshold=0.5)
    brc.fit(X)
    labels = brc.predict(X)
    # append the label column, then sort rows by label
    sorted_data = np.zeros((data.shape[0], data.shape[1] + 1))
    sorted_data[:, 0:4] = data
    sorted_data[:, 4:5] = np.array([labels], dtype=float).T
    sorted_data = sorted_data[sorted_data[:, 4].argsort()]
    segments = extract_segments(sorted_data)
    filtered_data = list(filter(lambda row: row[4] not in segments, sorted_data))
    return segments, filtered_data
def density(df):
    print(" ------------------------------------")
    print(" Density Based Spatial Clustering")
    print(" ------------------------------------")
    df = df.drop(columns=['Class Attribute', 'Semester type', 'Speaker Type'])
    data = df.to_numpy()
    # NOTE: despite the banner, this routine actually clusters with BIRCH, and
    # the dataframe values are immediately replaced by synthetic random data.
    np.random.seed(12)
    p1 = np.random.randint(5, 21, 110)
    p2 = np.random.randint(20, 30, 120)
    p3 = np.random.randint(8, 21, 90)
    data = np.array(np.concatenate([p1, p2, p3]))
    x_range = range(len(data))
    x = np.array(list(zip(x_range, data))).reshape(len(x_range), 2)
    plt.scatter(x[:, 0], x[:, 1])
    plt.show()
    bclust = Birch(branching_factor=100, threshold=.5).fit(x)
    print(bclust)
    labels = bclust.predict(x)
    plt.scatter(x[:, 0], x[:, 1], c=labels)
    plt.show()
    print("--------------------------------------------------------------------------------------------------------")
def scan_callback(self, msg):
    pose = self.pose.copy()
    bearings = self.bearings.copy()
    ranges = np.array(msg.ranges)
    # zero out infinite range readings
    inf_flag = (-1 * np.isinf(ranges).astype(int) + 1)
    ranges = np.nan_to_num(ranges) * inf_flag
    # project scan points into world coordinates
    euc_coord_x = pose[0] + np.cos(bearings - pose[2]) * ranges
    euc_coord_y = pose[1] + np.sin(bearings - pose[2]) * ranges
    # discard points that collapse onto the robot's own position
    dist_flag = np.where(
        (euc_coord_x - pose[0])**2 +
        (euc_coord_y - pose[1])**2 != 0.0)[0]
    points = np.array([euc_coord_x, euc_coord_y]).T
    points = points[dist_flag]
    self.obsv = []
    if len(points) > 0:
        brc = Birch(n_clusters=None, threshold=0.05)
        brc.fit(points)
        labels = brc.predict(points)
        u_labels = np.unique(labels)
        for l in u_labels:
            seg_idx = np.where(labels == l)
            seg = points[seg_idx]
            # keep only tight (low-covariance) segments of >= 3 points as landmarks
            if seg.shape[0] <= 1:
                fit_cov = 10
            else:
                fit_cov = np.trace(np.cov(seg.T))
            if fit_cov < 0.001 and seg.shape[0] >= 3:
                self.obsv.append(seg.mean(axis=0))
    print(self.obsv)
def map_clusters(n_list, n_clusters):
    # `unique` and `where` are assumed to be imported from numpy
    # define the model
    model = Birch(threshold=0.01, n_clusters=n_clusters)
    # fit the model
    model.fit(n_list)
    # assign a cluster to each example
    yhat = model.predict(n_list)
    # retrieve unique clusters
    clusters = unique(yhat)
    dic = {}
    # map each cluster id to the row indexes of its samples
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        dic[cluster] = row_ix[0]
    return dic
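# Hedged usage sketch (assumed setup, not from the source): a minimal call to
# map_clusters() on synthetic latitude/longitude pairs.
import numpy as np
from numpy import unique, where

def _demo_map_clusters():
    pts = np.array([[28.59, 77.34], [28.58, 77.33],
                    [28.63, 77.20], [28.46, 76.99]])
    print(map_clusters(pts, n_clusters=2))  # {cluster_id: array of row indexes}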
def birch_clustering(principal_components, principal_df, number_of_clusters): final_df = pd.concat([principal_df], axis=1) model = Birch(threshold=0.01, n_clusters=number_of_clusters) # fit the model model.fit(principal_components) # assign a cluster to each example yhat = model.predict(principal_components) # retrieve unique clusters clusters = unique(yhat) final_df['Segment'] = model.labels_ # create scatter plot for samples from each cluster for cluster in clusters: # get row indexes for samples with this cluster row_ix = where(yhat == cluster) # create scatter of these samples plt.scatter(principal_components[row_ix, 0], principal_components[row_ix, 1], s=75) final_df.rename({ 0: 'PC1', 1: 'PC2', 2: 'PC3', 'y': 'Race' }, axis=1, inplace=True) plt.title("BIRCH Clustering") add_race_labels(final_df) calc_silhouette(data=principal_components, prediction=yhat, n_clusters=len(clusters)) return final_df
def clusteringReminMost(window):
    brc = Birch(branching_factor=50, n_clusters=3, threshold=0.5, compute_labels=True)
    brc.fit(window)
    Class = brc.predict(window)
    # Count the members of each of the three clusters, find the most populous
    # one, and keep those rows, reinforcing the historical data.
    num0 = num1 = num2 = 0
    for i in Class:
        if i == 0:
            num0 += 1
        elif i == 1:
            num1 += 1
        else:
            num2 += 1
    label = chooseMax(num0, num1, num2)
    # Rows belonging to the target cluster; all values are pandas data
    # structures. (DataFrame.append was removed in pandas 2.0, and a boolean
    # mask also avoids the original's off-by-one row indexing.)
    newwindow = window[Class == label]
    return newwindow
def add_cluster_column(train_df, test_df, n_clusters):
    train_df['source'] = 'train'
    test_df['source'] = 'test'
    total_rows = train_df.shape[0] + test_df.shape[0]
    data = pd.concat([train_df, test_df])
    # split the data between "around NYC" and "other locations"
    in_nyc = ((data.longitude > -74.05) & (data.longitude < -73.75) &
              (data.latitude > 40.4) & (data.latitude < 40.9))
    data_c = data[in_nyc].copy()
    data_e = data[~in_nyc].copy()
    # put it in matrix form (DataFrame.as_matrix was removed in pandas 1.0)
    coords = data_c[['latitude', 'longitude']].values
    brc = Birch(branching_factor=100, n_clusters=n_clusters, threshold=0.01, compute_labels=True)
    brc.fit(coords)
    clusters = brc.predict(coords)
    data_c["num_cluster_" + str(n_clusters)] = clusters
    data_e["num_cluster_" + str(n_clusters)] = -1  # assign cluster label -1 for the non-NYC listings
    data = pd.concat([data_c, data_e])
    print('lost: {}'.format(total_rows - data[data['source'] == 'train'].shape[0]
                            - data[data['source'] == 'test'].shape[0]))
    return data[data['source'] == 'train'], data[data['source'] == 'test']
def clusterize_birch(self, vectors): brc = Birch(branching_factor=8, n_clusters=(int(len(vectors) / 6))).fit(vectors) print('Fit ready') predictions = brc.predict(vectors) print('Predict ready') return predictions
def birch(data_train, data_test, label_train, label_test, args): print('birch') birch = Birch(n_clusters=10).fit(data_train) predict = birch.predict(data_test) print('birch done') compare_class(predict, label_test) if args.create_mean: create_images_from_rows('bi', mean_image(predict, data_test))
def cluster_birch(self):
    print("Starting Birch clustering")
    brc = Birch(branching_factor=10, n_clusters=40,
                threshold=self.cluster_distance, compute_labels=False)
    brc.fit(self.all_frames_xy)
    clusters = brc.predict(self.all_frames_xy)
    return clusters
def find_outliers(self, values, dodgy_node='hello'):
    # flag if a KPI is exhibiting anomalous behaviour
    if self.find_root_cause_with_KDE:
        X = np.reshape(values, (-1, 1))
        KDE = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(X)
        KDE_scores = KDE.score_samples(X)
        outliers = np.where(KDE_scores < np.percentile(KDE_scores, 1))[0]
        return (len(outliers) > 0), -np.mean(KDE_scores)
    else:
        normalized_values = preprocessing.normalize([values]).reshape(-1, 1)
        birch = Birch(n_clusters=None, threshold=0.06, compute_labels=True)
        birch.fit(normalized_values)
        labels = birch.labels_  # fit with compute_labels=True already sets labels_
        birch_clustering_score = len(labels[np.where(labels != 0)]) / len(labels)
        return (birch_clustering_score > 0), birch_clustering_score
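# Hedged standalone sketch of the BIRCH branch above (mirrors the scoring, not
# taken verbatim from the source): normalize a 1-D KPI series, cluster with a
# small threshold, and score by the fraction of points outside cluster 0.
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import Birch

def birch_outlier_score(values, threshold=0.06):
    X = preprocessing.normalize([np.asarray(values, dtype=float)]).reshape(-1, 1)
    birch = Birch(n_clusters=None, threshold=threshold, compute_labels=True)
    birch.fit(X)
    labels = birch.labels_
    score = np.count_nonzero(labels != 0) / len(labels)
    return score > 0, score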
def birch(self, number_of_clusters, output_file_path): print("birch Clustering in progress...") arr = np.array(self.__data_set) birch_clustering = Birch(branching_factor=50, n_clusters=number_of_clusters, threshold=20, compute_labels=False).fit(arr) labels = birch_clustering.predict(arr) print("Birch Clustering done!") print("generating Birch clustering csv") self.__generate_result_clustering_csv(labels, output_file_path) print("Birch clustering csv created successfully!")
def update_k_clusters(attrname, old, new): k_cluster = int(k_slider.value) brc = Birch(branching_factor=50, n_clusters=k_cluster, threshold=0.5, compute_labels=True) brc.fit(tweet_vecs) predictions = brc.predict(tweet_vecs) colors = get_colors(predictions) brc_data.data = dict(colors=colors, x=tsne_vecs[:, 0], y=tsne_vecs[:, 1])
def BIRCH2_duplicate_removal(dataframe, threshold=0.8):
    # Note this method takes a dataframe as input
    if len(dataframe) < 2:  # nothing to do
        return dataframe
    Crater_data = dataframe
    # extract axes
    x = Crater_data[0].values.tolist()
    y = Crater_data[1].values.tolist()
    r = Crater_data[2].values.tolist()
    p = Crater_data[3].values.tolist()
    Points = []
    X = np.column_stack((x, y))
    brc = Birch(branching_factor=50, n_clusters=int(threshold * len(x)),
                threshold=0.5, compute_labels=True)
    brc.fit(X)
    groups_pred = brc.predict(X)
    for c in set(groups_pred):
        # indexes of the detections that fell into cluster c (the original
        # also tested `if i in range(0, len(x))`, which is always true here)
        idx = [i for i, e in enumerate(groups_pred) if e == c]
        Group_x = [x[i] for i in idx]
        Group_y = [y[i] for i in idx]
        Group_r = [r[i] for i in idx]
        Group_p = [p[i] for i in idx]
        # after the group is defined, extract its elements from the lists
        Points.append([Group_x, Group_y, Group_r, Group_p])
    # now reduce groups: keep the detection with best prediction confidence
    center_size = []
    for Xs, Ys, Rr, Ps in Points:
        best_index = np.argmax(Ps)
        center_size.append([Xs[best_index], Ys[best_index], Rr[best_index], Ps[best_index]])
    return pd.DataFrame(center_size)
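# Hedged usage sketch (made-up detections, not from the source):
# BIRCH2_duplicate_removal() expects positional columns 0..3 =
# (x, y, radius, confidence); near-duplicate detections collapse to the
# highest-confidence one.
import pandas as pd

def _demo_duplicate_removal():
    detections = pd.DataFrame([[10.0, 10.0, 3.0, 0.9],
                               [10.1, 10.0, 3.1, 0.7],   # near-duplicate of row 0
                               [50.0, 40.0, 5.0, 0.8]])
    print(BIRCH2_duplicate_removal(detections, threshold=0.8))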
def compute_clusters(data: List) -> np.ndarray: print("--->Computing clusters") birch = Birch(branching_factor=50, n_clusters=5, threshold=0.3, copy=True, compute_labels=True) birch.fit(data) predictions = np.array(birch.predict(data)) return predictions
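# Hedged usage (illustrative): compute_clusters() on random 2-D points,
# partitioned into the five BIRCH clusters hard-coded above.
import numpy as np

def _demo_compute_clusters():
    points = np.random.rand(100, 2).tolist()
    print(compute_clusters(points)[:10])  # first ten cluster assignments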
def skitleanBirch():
    data = pd.read_csv("soy_rock.csv", header=None)
    X = data.values.tolist()
    branching = randint(5, 20)  # randint assumed imported from the random module
    brc = Birch(branching_factor=branching, n_clusters=4, threshold=0.1, compute_labels=True)
    brc.fit(X)
    pred = brc.predict(X)
    return pred
def cluster_sentences(sentences): X = vectorize(sentences) bcl = Birch(branching_factor=10, n_clusters=None).fit( X) # the algorithm figures out the clusters clusters = bcl.predict(X) labels = bcl.labels_ norm_X = normalize_vectors(X, labels) cluster_means = calculate_mean(norm_X, clusters) cluster_sentences = find_minimum_from_mean(cluster_means, norm_X) sents = vectors_to_sentences(cluster_sentences) print(sents) return sents
class Birch_algo_wrapper:
    def __init__(self):
        self.wrapped = Birch(n_clusters=2)
        self.data = []
        self.indexes = []

    def fit(self, data):
        self.wrapped.fit(data)
        self.data = data
        self.indexes = self.wrapped.labels_

    def predict(self, data):
        return self.wrapped.predict(data)
def obtainCodebook(self, sampled_x, x):
    print('Obtaining codebook using Birch from sklearn...')
    scaled_x_sampled = StandardScaler().fit_transform(sampled_x)
    scaled_x = StandardScaler().fit_transform(x)
    brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters,
                threshold=self.threshold, compute_labels=True)
    # obtain the codebook (the fitted model) and the projections of the images
    # on the codebook (clusters of words)
    codebook = brc.fit(scaled_x_sampled)
    clusters = brc.predict(scaled_x)
    print('Clusters obtained.')
    return codebook, clusters
def obtainClusters(self, hist):
    print('Obtaining clusters using Birch from sklearn...')
    hist = np.array(hist).astype(float)
    scaled_vec = StandardScaler().fit_transform(hist)
    brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters,
                threshold=self.threshold, compute_labels=True)
    # fit the codebook, then assign each histogram to a cluster of words
    brc.fit(scaled_vec)
    clusters = brc.predict(scaled_vec)
    print('Clusters obtained.')
    return clusters
def split_birch(self, branching_factor, threshold):
    # Extract dataset from files
    dataset = [f.dataset for f in self.files]
    # Initialize classifier
    classifier = Birch(branching_factor=branching_factor, n_clusters=None, threshold=threshold)
    classifier.fit(dataset)
    # Get index
    index = classifier.predict(dataset)
    count = max(index) + 1
    # Create new clusters
    clusters = [Cluster(self.directory, self.name + '-' + str(i)) for i in range(count)]
    for i in range(len(self.files)):
        clusters[index[i]].add_file(self.files[i])
    return clusters
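# Hedged standalone sketch of the same splitting idea without the
# Cluster/File classes (names here are illustrative assumptions): let BIRCH
# pick the subcluster count (n_clusters=None) and bucket rows by label.
import numpy as np
from sklearn.cluster import Birch

def split_by_birch(dataset, branching_factor=50, threshold=0.5):
    index = Birch(branching_factor=branching_factor, n_clusters=None,
                  threshold=threshold).fit_predict(np.asarray(dataset))
    buckets = [[] for _ in range(index.max() + 1)]
    for row, label in zip(dataset, index):
        buckets[label].append(row)
    return buckets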
def build_model(df, cluster_type="kmeans", seed=1):
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        # mixture.GMM was removed in scikit-learn 0.20; GaussianMixture replaces it
        model = mixture.GaussianMixture(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        res = []
        for i in lda_res:
            # for now - do hard clustering, take the highest probability
            res.append(i.argmax())
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    df_array = np.array(df)
    # distance-to-centroid only makes sense for the centroid-based models; the
    # original indexed an undefined `clusters_centers` here
    if hasattr(model, "cluster_centers_") and hasattr(model, "labels_"):
        dis_dict = {i: model.cluster_centers_[i] for i in range(N_CLUSTERS)}
        all_dist = []
        for line_idx in range(len(df_array)):
            label = model.labels_[line_idx]
            dist = calc_distance(df_array[line_idx], dis_dict[label])
            all_dist.append(dist)
        df["distance_from_cluster"] = all_dist
    #clusters = model.labels_.tolist()
    #print ("clusters are:",clusters)
    print(""">>>> model is: %s, # of clusters:%s, and %s""" % (cluster_type, N_CLUSTERS, Counter(res)))
    res = [str(i) for i in res]
    docs_clusters = zip(df.index, res)
    return docs_clusters
Figures.save_valid_test_performance_measures_vs_hyper_parameters_figure(
    affinity_propagation_parameter_search_space_for_plotting,
    affinity_propagation_valid_performance_metrics_for_plotting,
    affinity_propagation_test_performance_metrics_for_plotting,
    'Adjusted Mutual Information Score',
    'AffinityPropagation Clustering damping parameter',
    'Affinity_Propagation_Performance', 0, 0.5, left_horizontal_limit=0.5)

# Do BIRCH, optimizing the number of calls to partial_fit over a validation set
current_optimal_birch_number_of_calls = 1
initial_optimal_birch_clusterer = Birch()
initial_optimal_birch_clusterer.partial_fit(train_data_set)
initial_optimal_birch_clusterer.set_params(n_clusters=number_of_classes)
# after changing n_clusters, a no-argument partial_fit() reruns only the
# global clustering step so that predict() reflects the new cluster count
initial_optimal_birch_clusterer.partial_fit()
initial_birch_valid_predictions = initial_optimal_birch_clusterer.predict(valid_data_set)
initial_birch_test_predictions = initial_optimal_birch_clusterer.predict(test_data_set)
# Add one to the predictions to make them match up with the range of labels,
# then apply the Hungarian Fix
for element in range(number_of_valid_observations):
    initial_birch_valid_predictions[element] += 1
for element in range(number_of_test_observations):
    initial_birch_test_predictions[element] += 1
initial_birch_valid_predictions = Clustering.Hungarian_Fix(initial_birch_valid_predictions, valid_labels).astype('int')
initial_birch_test_predictions = Clustering.Hungarian_Fix(initial_birch_test_predictions, test_labels).astype('int')
# Set a starting point for optimality of the initial performance metric, to be possibly adjusted later
birch_number_of_calls_integer_search_space_start = current_optimal_birch_number_of_calls + 1
birch_number_of_calls_integer_search_space_stop = current_optimal_birch_number_of_calls + 9
dsp_array = np.array(dsp_list)
# extract the unique station names
stations = np.unique(station_array)
print(stations)
for sta in stations:
    events = event_array[station_array == sta, :]
    dsp_shortlist = dsp_array[station_array == sta]
    print(sta, events.shape, dsp_shortlist.shape)
    # cluster on events so as to compare dispersion curves for nearby events
    brc = Birch(branching_factor=50, n_clusters=None, threshold=dist, compute_labels=True)
    brc.fit(events)
    labels = brc.predict(events)
    print(np.max(labels))
    for lab in np.unique(labels):
        dsp_this_label_list = dsp_shortlist[labels == lab]
        cluster_name = os.path.join(dirname, "cluster_%s_%03d" % (sta, lab))
        plot_all_dsp(dsp_this_label_list, legend=False, fname="%s_gvel.png" % cluster_name)
        plot_all_map(dsp_this_label_list, fname="%s_map.png" % cluster_name, legend=False)
        f = open("%s_info.txt" % cluster_name, "w")
        for (dsp, dsp_dict) in dsp_this_label_list:
            f.write("%s %s %d %03d %02d %02d %.3f %.3f\n" % (
                dsp_dict["STA"], dsp_dict["COMP"], dsp_dict["YEAR"], dsp_dict["JDAY"],
for idx, label in enumerate(labels):
    if label in plays_sums:
        plays_sums[label].append(plays[idx])
    else:
        plays_sums[label] = [plays[idx]]
    # cluster_size[label] += 1
# use the median play count of each cluster as its prediction
for label in plays_sums:
    median = np.median(np.array(plays_sums[label]))
    plays_sums[label] = median
# for idx, size in enumerate(cluster_size):
#     plays_sums[idx] /= size
Y = cluster.get_test_matrix()
# print(len(Y))
Y = np.array(Y, dtype=float)
print("Running Birch on test data...", end=" ")
test_predicts = brc.predict(Y)
print("Done!")
print(test_predicts)
with open(submit_file, 'w') as submit_fh:
    submit_csv = csv.writer(submit_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    submit_csv.writerow(['Id', 'plays'])
    for idx, test_predict in enumerate(test_predicts):
        submit_csv.writerow([idx + 1, plays_sums[test_predict]])
        if idx % 10000 == 0:
            print("Row", idx)