def _build_cluster_model(method, n_clusters, max_iter):
    """Return an unfitted estimator for ``method`` with ``n_clusters`` groups."""
    if method == "kmeans":
        return cluster.KMeans(n_clusters=n_clusters, max_iter=max_iter)
    if method == "gmm":
        # This branch previously repeated the "kmeans" test and could never run.
        return mixture.GaussianMixture(n_components=n_clusters,
                                       max_iter=max_iter)
    if method == "bgmm":
        return mixture.BayesianGaussianMixture(n_components=n_clusters,
                                               max_iter=max_iter)
    # Any other method name falls back to Birch (max_iter is not used by it).
    return cluster.Birch(n_clusters=n_clusters, compute_labels=False)


def fit(method, X, n_clusters, samples_by_cluster, max_iter):
    """Fit a clustering/mixture model, lowering the cluster count on failure.

    Parameters
    ----------
    method : str
        ``"kmeans"``, ``"gmm"`` or ``"bgmm"``; anything else selects Birch.
    X : array-like with a ``shape`` attribute
        Training data; ``X.shape[0]`` is used as the number of samples.
    n_clusters : int
        Requested number of clusters/components.
    samples_by_cluster : number
        Minimum samples wanted per cluster; the cluster count is reduced to
        ``int(n_samples / samples_by_cluster)`` when the data is too small.
    max_iter : int
        Iteration limit passed to the KMeans and mixture estimators.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    Exception
        Whatever ``model.fit`` raised once a single cluster also fails.
    """
    n_samples = X.shape[0]
    if n_samples <= samples_by_cluster * n_clusters:
        n_clusters = int(n_samples / samples_by_cluster)
    if n_samples < n_clusters or n_clusters == 0:
        n_clusters = 1

    while True:
        # Rebuild the estimator on every attempt so that a reduced cluster
        # count is actually applied, whatever the method.
        model = _build_cluster_model(method, n_clusters, max_iter)
        try:
            model.fit(X)
        except Exception:
            if n_clusters <= 1:
                raise
            n_clusters -= 1
        else:
            return model
def configuraciones_birch():
    """Build three Birch estimators that differ only in ``threshold``.

    Returns
    -------
    tuple of (str, estimator)
        ``(label, cl.Birch)`` pairs with ``n_clusters=5`` for thresholds
        0.1, 0.05 and 0.01, in that order.
    """
    labelled_thresholds = (
        ('Birch thershold=0.1', 0.1),
        ('Birch thershold=0.05', 0.05),
        ('Birch thershold=0.01', 0.01),
    )
    # Collect the labelled estimators into a tuple.
    return tuple(
        (label, cl.Birch(n_clusters=5, threshold=threshold))
        for label, threshold in labelled_thresholds
    )
def configuraciones_birch2():
    """Build three ten-cluster Birch estimators with increasing thresholds.

    Returns
    -------
    tuple of (str, estimator)
        ``(label, cl.Birch)`` pairs with ``n_clusters=10`` for thresholds
        0.01, 0.05 and 0.07, in that order.
    """
    labelled_thresholds = (
        ('Birch-01', 0.01),
        ('Birch-05', 0.05),
        ('Birch-07', 0.07),
    )
    # Collect the labelled estimators into a tuple.
    return tuple(
        (label, cl.Birch(n_clusters=10, threshold=threshold))
        for label, threshold in labelled_thresholds
    )
def definition_clusters(subset):
    """Create the unfitted clustering estimators to compare on ``subset``.

    The data is L2-normalised only to estimate the MeanShift bandwidth; the
    estimators themselves are returned unfitted.

    Parameters
    ----------
    subset : array-like
        Samples passed to ``preprocessing.normalize``.

    Returns
    -------
    tuple of (str, estimator)
        K-Means, MiniBatchKMeans, MeanShift, Agglomerative (Ward) and Birch,
        in that order.
    """
    # Important: normalise the data set before deriving parameters from it.
    normalized_set = preprocessing.normalize(subset, norm='l2')
    print("-------- Definiendo los clusteres...")

    # Estimate the bandwidth for mean shift.
    bandwidth = cl.estimate_bandwidth(normalized_set, quantile=0.3)

    # NOTE: a k-nearest-neighbour connectivity matrix used to be computed
    # here but was never passed to the Ward estimator, so it was dead work.
    # Ward stays unstructured, exactly as before.
    return (
        ('K-Means', cl.KMeans(init='k-means++', n_clusters=5, n_init=100)),
        ('MiniBatchKMeans', cl.MiniBatchKMeans(n_clusters=5,
                                               init='k-means++')),
        ('MeanShift', cl.MeanShift(bandwidth=bandwidth)),
        ('Agglomerative', cl.AgglomerativeClustering(n_clusters=5,
                                                     linkage='ward')),
        ('Birch', cl.Birch(n_clusters=5, threshold=0.1)),
    )
def evaluate(df, thres_list=[0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5]):
    """Score Birch clusterings of document embeddings over several thresholds.

    Parameters
    ----------
    df : DataFrame
        Input rows; converted with ``df.T.to_dict().values()`` and handed to
        ``preprocessing``.
    thres_list : iterable of float
        Birch ``threshold`` values to try (only iterated, never modified).

    Returns
    -------
    list of dict
        One entry per threshold with keys ``"score"`` (average silhouette),
        ``"clrs"`` (labels), ``"thres"`` and ``"vectors"``. Every entry
        refers to the same ``vectors`` list.
    """
    # Preprocessing.
    x_train_list = preprocessing(df.T.to_dict().values())

    # Train the embedding model, then infer one vector per document.
    model_dm = algo.document_embeddings(x_train_list, vector_size=3,
                                        epochs=100)
    vectors = [model_dm.infer_vector(tokens)
               for tokens, _label in x_train_list]

    # Cluster at each threshold and record the average silhouette.
    results = []
    for thres in thres_list:
        clusterer = cluster.Birch(
            branching_factor=50,
            n_clusters=None,
            threshold=thres,
            compute_labels=True,
        )
        clrs = clusterer.fit_predict(vectors)
        logger.warning("clrs: {0}".format(clrs))
        silhouette_avg = metrics.silhouette_score(vectors, clrs)
        logger.warning("[thres {0}] silhouette_avg: {1}".format(
            thres, silhouette_avg))
        entry = {
            "score": silhouette_avg,
            "clrs": clrs,
            "thres": thres,
            "vectors": vectors,
        }
        results.append(entry)
    return results
def birch(self, data):
    """Cluster ``data`` with sklearn's Birch using this object's settings.

    The branching factor and threshold are read from
    ``self.B_branching_factor`` and ``self.B_threshold``; ``n_clusters`` is
    ``None`` and labels are computed.

    Parameters
    ----------
    data : numpy.array
        Points in order: [[x1,y1,z1], [x2,y2,z2], ....]

    Returns
    -------
    labels : ndarray
        The cluster labels

    Notes
    -----
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html
    """
    clusterer = cluster.Birch(
        branching_factor=self.B_branching_factor,
        threshold=self.B_threshold,
        n_clusters=None,
        compute_labels=True,
    )
    return clusterer.fit_predict(data)
def use_birch(mat, n_cluster):
    """Cluster ``mat`` with Birch and print the labels and their histogram.

    Parameters
    ----------
    mat : array-like
        Samples to cluster.
    n_cluster : int
        Number of clusters requested; also sets the histogram bins
        ``0 .. n_cluster``.

    Returns
    -------
    ndarray
        The cluster label of every sample.
    """
    clusters = cls.Birch(threshold=0.0005, n_clusters=n_cluster).fit(mat)
    labels = clusters.labels_
    hist, _bin_edges = np.histogram(labels, bins=np.arange(n_cluster + 1))
    # Single-argument print calls behave the same on Python 2 and 3; the old
    # print statements were a SyntaxError on Python 3.
    print('Birch clustering: %s' % (labels,))
    print(hist)
    return labels
def Birch(self):
    """
    Uses `sklearn's Birch <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html>`_

    **Defaults and var_params:** sklearn.cluster.Birch(threshold=0.5,
    branching_factor=50, compute_labels=True, copy=True), with
    ``n_clusters`` taken from ``self.K``.

    Other Parameters
    ----------------
    var_params: dict
        Pass variable params through constructor as dictionary pairs.
        Current default parameters are listed above.

    Returns
    -------
    labels: list of ints
        Solution of clustering labels for each object (stored in
        ``self.out``; nothing is returned to the caller).

    Raises
    ------
    ValueError
        If ``self.K`` is falsy.
    """
    n_clusters = self.K
    if not n_clusters:
        raise ValueError('Birch clustering requires an argument K=<intiger value>')

    defaults = {
        'distance': 'euclidean',  # marked "not mutable" by the original author
        'threshold': 0.5,
        'branching_factor': 50,
        'n_clusters': n_clusters,
        'compute_labels': True,
        'copy': True,
    }
    params = returnParams(self.var_params, defaults, 'Birch')

    # The estimator is fitted on the output of returnDistanceMatrix.
    distances = returnDistanceMatrix(self.data, params['distance'])
    solution = skc.Birch(
        threshold=params['threshold'],
        branching_factor=params['branching_factor'],
        n_clusters=params['n_clusters'],
        compute_labels=params['compute_labels'],
        copy=params['copy'],
    )
    solution.fit(distances)

    self.out = solution.labels_
    self.var_params = params
def get_train_test_idx(X, n_interp, do_clustering=True, do_birch=True):
    """Split sample indices into training points and the remaining test points.

    With ``do_clustering`` the standardised data is grouped into
    ``int(len(X) / n_interp)`` clusters (Birch or KMeans) and, per cluster,
    the sample closest to the cluster mean becomes a training index.
    Without it, every ``n_interp``-th index is used for training.

    Parameters
    ----------
    X : array-like
        Samples, one per row.
    n_interp : int
        Target number of samples per training point.
    do_clustering : bool
        Choose training points by clustering rather than regular striding.
    do_birch : bool
        Use Birch (halving its threshold until the requested number of
        clusters is found) instead of KMeans.

    Returns
    -------
    (list, list)
        ``idx_train`` and ``idx_test``; ``idx_test`` holds every index not in
        ``idx_train``, in set order.

    Raises
    ------
    RuntimeError
        If the Birch threshold underflows to zero before the requested number
        of clusters is obtained (previously an endless loop).
    """
    if do_clustering:
        Xscaled = prep.StandardScaler().fit_transform(X)
        n_clusters = int(len(Xscaled) / n_interp)
        if do_birch:
            threshold_birch = 1.
            while True:
                if threshold_birch == 0.0:
                    # Halving can no longer change anything from here on.
                    raise RuntimeError(
                        'Birch could not produce %d clusters for any '
                        'positive threshold' % n_clusters)
                try:
                    clus = cluster.Birch(threshold=threshold_birch,
                                         n_clusters=n_clusters)
                    clus.fit(Xscaled)
                except Exception:
                    # Not a bare except: Ctrl-C must be able to stop the loop.
                    threshold_birch *= 0.5
                    continue
                if len(np.unique(clus.labels_)) == n_clusters:
                    break
                threshold_birch *= 0.5
        else:
            clus = cluster.KMeans(n_clusters=n_clusters)
            clus.fit(Xscaled)

        idx_train = []
        for label in np.unique(clus.labels_):
            # Select members by label value so gaps in the label numbering
            # cannot pick the wrong cluster.
            cluster_center = Xscaled[clus.labels_ == label].mean(axis=0)
            dists = ((Xscaled - cluster_center) ** 2).sum(axis=1) ** 0.5
            idx_train.append(np.argmin(dists))
    else:
        idx_train = list(np.arange(0, len(X), n_interp))

    idx_test = list(set(np.arange(0, len(X))).difference(idx_train))
    return idx_train, idx_test
def cluster_model(newdata, data, model_name, input_param):
    """Run one clustering model on ``data`` and store its labels in ``newdata``.

    Parameters
    ----------
    newdata : DataFrame-like
        Receives the labels under the column ``model_name``.
    data : array-like
        Samples to cluster.
    model_name : str
        Case-insensitive name: KMeans, MiniBatchKMeans, SpectralClustering,
        MeanShift, DBSCAN, AffinityPropagation, Birch or GaussianMixture.
    input_param : dict
        Hyper-parameters; the keys read depend on the model
        (``n_clusters``, ``bandwidth``, ``eps``, ``damping``, ``preference``).

    Returns
    -------
    The same ``newdata`` object, with the new column added.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names (this used to
        surface as an UnboundLocalError).
    """
    params = input_param
    # Lazily-evaluated constructors: only the selected one reads its params.
    builders = {
        'kmeans': lambda: cluster.KMeans(n_clusters=params['n_clusters']),
        'minibatchkmeans': lambda: cluster.MiniBatchKMeans(
            n_clusters=params['n_clusters']),
        'spectralclustering': lambda: cluster.SpectralClustering(
            n_clusters=params['n_clusters']),
        'meanshift': lambda: cluster.MeanShift(bandwidth=params['bandwidth']),
        'dbscan': lambda: cluster.DBSCAN(eps=params['eps']),
        'affinitypropagation': lambda: cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference']),
        'birch': lambda: cluster.Birch(n_clusters=params['n_clusters']),
        'gaussianmixture': lambda: mixture.GaussianMixture(
            n_components=params['n_clusters'], covariance_type='full'),
    }

    key = str.lower(model_name)
    if key not in builders:
        raise ValueError('Unknown clustering model: %r' % (model_name,))
    cluster_obj = builders[key]()

    if key in ('affinitypropagation', 'gaussianmixture'):
        # These two are fitted first and then asked to predict.
        cluster_obj.fit(data)
        model_result = cluster_obj.predict(data)
    else:
        model_result = cluster_obj.fit_predict(data)

    newdata[model_name] = pd.DataFrame(model_result)
    return newdata
def update_data(self, attrname, old, new):
    """Refit every clustering model and refresh the colours of each plot.

    The ``attrname``, ``old`` and ``new`` arguments are not used by the body.
    Models are built from the current slider values, all fitted on
    ``self.X``, and then ``self.colors[i]`` and
    ``self.source[i].data['colors']`` are updated for model ``i``.
    """
    # Store the models here.
    models = [
        cluster.MiniBatchKMeans(n_clusters=self.k_means_slider.value),
        cluster.DBSCAN(eps=self.DBSCAN_slider.value),
        cluster.Birch(n_clusters=self.birch_slider.value),
        cluster.MeanShift(bandwidth=self.bandwidth, bin_seeding=True),
    ]

    # Fit everything first so a failing fit leaves all plots untouched.
    for model in models:
        model.fit(self.X)

    for i, model in enumerate(models):
        # Inspect the model being drawn; the old code tested the stale loop
        # variable left over from the fitting loop.
        if hasattr(model, 'labels_'):
            # Builtin int: the np.int alias no longer exists in NumPy.
            y_pred = model.labels_.astype(int)
        else:
            y_pred = model.predict(self.X)
        self.colors[i] = [Spectral6[f % 6] for f in y_pred]
        self.source[i].data['colors'] = self.colors[i]
def definition_clusters(subset):
    """Create the unfitted clustering estimators to compare on ``subset``.

    Parameters
    ----------
    subset : array-like
        Samples; L2-normalised here only to estimate the MeanShift bandwidth.

    Returns
    -------
    tuple of (str, estimator)
        K-Means, MeanShift, DBSCAN, Birch and SpectralClustering, in that
        order.
    """
    # Important: normalise the data set that is being used.
    normalized_set = preprocessing.normalize(subset, norm='l2')
    print("-------- Definiendo los clusteres...")

    # Estimate the bandwidth for mean shift.
    bandwidth = cl.estimate_bandwidth(normalized_set, quantile=0.3)

    # Collect the estimators into a tuple.
    return (
        ('K-Means', cl.KMeans(init='k-means++', n_clusters=5, n_init=100)),
        ('MeanShift', cl.MeanShift(bandwidth=bandwidth, bin_seeding=True)),
        ('DBSCAN', cl.DBSCAN(eps=0.1)),
        # Low threshold: the original author reports a warning from
        # fit_predict otherwise.
        ('Birch', cl.Birch(n_clusters=5, threshold=0.1)),
        # Use it for small case studies.
        ('SpectralClustering', cl.SpectralClustering(n_clusters=5,
                                                     affinity="rbf")),
    )
def main():
    """Cluster taxi stop coordinates with Birch and export the results.

    Reads the stops CSV, sorts it by latitude, scales latitude/longitude by
    ``6378137 * pi / 180``, clusters the points with a user-supplied Birch
    threshold, and writes ``centers.csv`` (one centre per label that occurs)
    and ``clusters.csv`` (points whose cluster has more than the requested
    number of members).
    """
    path = '/home/s/Documents/Taxi/Taxi-Stops/stops.csv'
    df = pd.read_csv(path)
    df = df.sort_values('Latitude')

    time1 = [t[1:-1].split(', ') for t in list(df['Time'])]
    # Positional list: after sort_values, `series[i]` would look up the row
    # whose *index label* is i, not the i-th sorted row used for `points`.
    vehicle1 = list(df['Vehicle_No'])
    n = len(df.index)

    lon = np.asarray(df['Longitude']).reshape(-1, 1)
    lat = np.asarray(df['Latitude']).reshape(-1, 1)
    # NOTE(review): longitude is scaled like latitude, with no cos(latitude)
    # factor -- confirm that this is acceptable for the area covered.
    points = np.concatenate((lat, lon), axis=1) * (6378137 / 180) * math.pi

    radius = input('Enter the raduis of cluster(Recommended 20-25 meters): ')
    # float() so that fractional radii such as "22.5" are accepted.
    clustering = cluster.Birch(threshold=float(radius), branching_factor=500,
                               n_clusters=None, compute_labels=True,
                               copy=True).fit(points)
    labels = clustering.labels_

    min_taxis = input(
        'Enter minimum number of Taxis for a location to be considered a stop: '
    )

    # Number of points assigned to each label.
    res = dict(collections.Counter(labels))

    cent_lat1 = []
    cent_lon1 = []
    for key in res:
        center = clustering.subcluster_centers_[key]
        cent_lat1.append(center[0] * (180 / (6378137 * math.pi)))
        cent_lon1.append(center[1] * (180 / (6378137 * math.pi)))
    num_clusters = len(cent_lat1)

    df = pd.DataFrame(list(zip(cent_lon1, cent_lat1)),
                      columns=['Longitude', 'Latitude'])
    df.to_csv('centers.csv', index=False)
    print('Cluster centers are stored in "centers.csv"')

    min_count = int(min_taxis)
    lat1 = []
    lon1 = []
    col = []
    # NOTE(review): these lists are indexed by label but sized by the number
    # of distinct labels; this assumes labels are contiguous from 0.
    time = [[] for _ in range(num_clusters)]
    vehicle = [[] for _ in range(num_clusters)]
    for i in range(n):
        label = labels[i]
        if res[label] > min_count:
            lat1.append(points[i][0] * (180 / (6378137 * math.pi)))
            lon1.append(points[i][1] * (180 / (6378137 * math.pi)))
            time[label].extend(time1[i])
            vehicle[label].append(vehicle1[i])
            col.append(label)
    print(time)

    df = pd.DataFrame(list(zip(lon1, lat1)), columns=['Longitude', 'Latitude'])
    df.to_csv('clusters.csv', index=False)
    print('Clusters are stored in "clusters.csv"')

    fig = go.Figure(data=go.Scatter(
        x=lat1, y=lon1, mode='markers', marker=dict(color=col), text=col))
    # NOTE(review): the export below is disabled, so the message that follows
    # is printed although no "stops.html" is written here.
    # plotly.offline.plot(fig, filename='stops.html')
    print('Plot is stored in "stops.html".')
def get_algorithm(algorithm_name: str, clusters: int) -> cluster:
    """Return an unfitted clustering estimator chosen by its display name.

    Parameters
    ----------
    algorithm_name
        ``"Birch"``, ``"Spectral Clustering"`` or ``"Affinity Propagation"``.
    clusters
        Number of clusters for Birch and Spectral Clustering; not used for
        Affinity Propagation.

    Raises
    ------
    NotImplementedError
        For any other ``algorithm_name``.
    """
    # Constructors are wrapped so that only the requested one is built.
    factories = {
        "Birch": lambda: cluster.Birch(n_clusters=clusters),
        "Spectral Clustering": lambda: cluster.SpectralClustering(
            n_clusters=clusters),
        'Affinity Propagation': lambda: cluster.AffinityPropagation(),
    }
    factory = factories.get(algorithm_name)
    if factory is None:
        raise NotImplementedError(f'algorithm: {algorithm_name} not implemented')
    return factory()
def birch():
    """Demo: cluster two overlapping Gaussian blobs with Birch and plot them.

    Draws 100 two-dimensional points around 0.0 and 100 around 0.1 (both
    with scale 0.1), asks Birch for two clusters and shows a scatter plot
    coloured by the predicted label.
    """
    blob_a = numpy.random.normal(loc=0.0, scale=0.1, size=[100, 2])
    blob_b = numpy.random.normal(loc=0.1, scale=0.1, size=[100, 2])
    data = numpy.concatenate([blob_a, blob_b], axis=0)

    model = cluster.Birch(threshold=0.05, branching_factor=50, n_clusters=2)
    y_pre = model.fit_predict(data)

    # First column on the x axis, second column on the y axis.
    x = [point[0] for point in data]
    y = [point[1] for point in data]
    plt.scatter(x, y, c=y_pre)
    plt.show()
def _symmetric_knn_connectivity(X, n_neighbors):
    """Return a symmetrised k-nearest-neighbour connectivity matrix for X."""
    connectivity = kneighbors_graph(X, n_neighbors=n_neighbors,
                                    include_self=False)
    return 0.5 * (connectivity + connectivity.T)


def clustering(X, algorithm, n_clusters=2):
    """Standardise the transposed data and cluster it with ``algorithm``.

    Parameters
    ----------
    X : array-like
        Input data; it is transposed before scaling.
    algorithm : str
        One of ``'KMeans'``, ``'Birch'``, ``'DBSCAN'``,
        ``'AffinityPropagation'``, ``'MeanShift'``, ``'SpectralClustering'``,
        ``'Ward'`` or ``'AgglomerativeClustering'``.
    n_clusters : int
        Cluster count for the algorithms that accept one.

    Returns
    -------
    (ndarray, ndarray)
        The scaled data and the predicted label of every row.

    Raises
    ------
    ValueError
        If ``algorithm`` is not supported (this used to surface as an
        UnboundLocalError after the preprocessing had already run).
    """
    supported = ('KMeans', 'Birch', 'DBSCAN', 'AffinityPropagation',
                 'MeanShift', 'SpectralClustering', 'Ward',
                 'AgglomerativeClustering')
    if algorithm not in supported:
        raise ValueError('Unsupported clustering algorithm: %r' % (algorithm,))

    # Normalize the dataset for easier parameter selection.
    X = StandardScaler().fit_transform(np.transpose(X))

    # The bandwidth and connectivity are only computed for the algorithms
    # that consume them.
    if algorithm == 'KMeans':
        model = cluster.KMeans(n_clusters=n_clusters, random_state=0)
    elif algorithm == 'Birch':
        model = cluster.Birch(n_clusters=n_clusters)
    elif algorithm == 'DBSCAN':
        model = cluster.DBSCAN(eps=.2)
    elif algorithm == 'AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    elif algorithm == 'MeanShift':
        bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=n_clusters,
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors")
    elif algorithm == 'Ward':
        model = cluster.AgglomerativeClustering(
            n_clusters=n_clusters, linkage='ward',
            connectivity=_symmetric_knn_connectivity(X, 5))
    else:  # 'AgglomerativeClustering'
        model = cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock", n_clusters=n_clusters,
            connectivity=_symmetric_knn_connectivity(X, 5))

    model.fit(X)
    if hasattr(model, 'labels_'):
        # Builtin int: the np.int alias no longer exists in NumPy.
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)
    return X, y_pred
def compute_clusters(vectors, clusters, algorithm='kmeans'):
    """Cluster ``vectors`` and print agreement metrics against ``clusters``.

    Parameters
    ----------
    vectors : array-like
        Samples to cluster.
    clusters : sequence
        Reference label of every sample. Its number of distinct values is
        the cluster count for kmeans, birch and spectral. String labels are
        encoded with ``LabelEncoder``; anything else is used as given.
    algorithm : str
        ``'kmeans'``, ``'dbscan'``, ``'optics'``, ``'birch'``, ``'spectral'``
        or ``'affinity'``.

    Returns
    -------
    (ndarray, estimator)
        The predicted labels and the fitted estimator.

    Raises
    ------
    NotImplementedError
        If ``algorithm`` is not one of the supported names.
    """
    # Select the clustering algorithm.
    if algorithm == 'kmeans':
        model = cluster.MiniBatchKMeans(n_clusters=len(set(clusters)))
    elif algorithm == 'dbscan':
        model = cluster.DBSCAN(eps=1.25, n_jobs=-1)
    elif algorithm == 'optics':
        model = cluster.OPTICS(min_samples=10, eps=10,
                               cluster_method='dbscan', n_jobs=-1)
    elif algorithm == 'birch':
        model = cluster.Birch(n_clusters=len(set(clusters)))
    elif algorithm == 'spectral':
        model = cluster.SpectralClustering(n_clusters=len(set(clusters)),
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors",
                                           n_jobs=-1)
    elif algorithm == 'affinity':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    else:
        raise NotImplementedError(f"Not implemented for algorithm {algorithm}")

    # Predict cluster memberships.
    model.fit(vectors)
    if hasattr(model, 'labels_'):
        # Builtin int: the np.int alias no longer exists in NumPy.
        labels = model.labels_.astype(int)
    else:
        labels = model.predict(vectors)

    # Transform categorical labels to digits; non-string labels (including
    # NumPy integers, which used to leave labels_true unbound) pass through.
    if isinstance(clusters[0], str):
        labels_true = LabelEncoder().fit_transform(clusters)
    else:
        labels_true = clusters

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(vectors, labels))
    return labels, model
def birch_clustering(options, all_text):
    """Vectorise ``all_text`` with TF-IDF and cluster it with Birch.

    Parameters
    ----------
    options
        Object providing ``num_clusters``, ``save_intermediate`` and, when
        saving, ``intermediate_out_directory``.
    all_text : iterable of str
        Documents to vectorise.

    Returns
    -------
    (sparse matrix, estimator)
        The TF-IDF matrix and the fitted Birch model.
    """
    print("Running Birch Clustering...")
    X = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2,
                        stop_words='english',
                        use_idf=True).fit_transform(all_text)
    c = cluster.Birch(n_clusters=options.num_clusters).fit(X)
    print("Label counts: ", Counter(c.labels_))

    if options.save_intermediate:
        out_dir = options.intermediate_out_directory
        # Context managers make sure both files are flushed and closed.
        with open(os.path.join(out_dir, 'cluster_birch.pkl'), 'wb') as handle:
            pickle.dump(c, handle)
        with open(os.path.join(out_dir, 'cluster_tfidf.pkl'), 'wb') as handle:
            pickle.dump(X, handle)
    return X, c
def select_n_clusters(data, data_pca, n_clusters_range):
    """Plot two quality scores of Birch against the number of clusters.

    For every ``n`` in ``n_clusters_range`` a Birch model is fitted on
    ``data_pca`` and scored with ``get_score(data, model)``; the
    ``'silhouette_score'`` and ``'calinski_harabaz_score'`` entries of each
    score are then drawn side by side.
    """
    scores = [
        get_score(data, cluster.Birch(n_clusters=n).fit(data_pca))
        for n in n_clusters_range
    ]

    metric_names = ('silhouette_score', 'calinski_harabaz_score')
    for position, score_function in enumerate(metric_names, start=1):
        plt.subplot(1, 2, position)
        plt.title(score_function)
        curve = [item[score_function] for item in scores]
        plt.plot(n_clusters_range, curve)
    plt.show()
def findClusters_Birch(data):
    '''
    Cluster ``data`` with the BIRCH algorithm and return the fitted model.

    The estimator uses a branching factor of 100, four clusters, computes
    labels and copies its input.
    '''
    # Create the estimator.
    settings = dict(branching_factor=100, n_clusters=4,
                    compute_labels=True, copy=True)
    estimator = cl.Birch(**settings)

    # Fit the data.
    return estimator.fit(data)
def __init__(self, conn, args, data, split_type, num_clusters):
    """Constructor for Cluster object.

    :param conn: database connection object.
    :param args: dict of arguments read from the arguments file; the keys
        ``'cluster_algorithm'`` and ``'seed'`` are read here.
    :param data: data to cluster.
    :param split_type: Split train test data randomly or by date to allow
        testing by specific date ranges.
    :param num_clusters: Number of clusters to create.
    :return: Cluster instance.
    """
    self.conn = conn
    self.args = args
    self.data = data
    self.split_type = split_type
    self.pca_model = None
    self.cluster_model = None
    self.algorithm = args['cluster_algorithm']

    seed = self.args['seed']
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
    # Each estimator is created exactly once (HDBSCAN used to be built twice).
    self.clustering_algorithms = {
        'MiniBatchKMeans': cluster.MiniBatchKMeans(n_clusters=num_clusters,
                                                   random_state=seed),
        'AffinityPropagation': cluster.AffinityPropagation(),
        'MeanShift': cluster.MeanShift(bin_seeding=True),
        'SpectralClustering': cluster.SpectralClustering(
            n_clusters=num_clusters, eigen_solver='arpack',
            affinity="nearest_neighbors", random_state=seed),
        'Ward': cluster.AgglomerativeClustering(n_clusters=num_clusters,
                                                linkage='ward'),
        'AgglomerativeClustering': cluster.AgglomerativeClustering(
            linkage="average", n_clusters=num_clusters),
        'DBSCAN': cluster.DBSCAN(),
        'Birch': cluster.Birch(n_clusters=num_clusters),
        'HDBSCAN': hdbscan.HDBSCAN(min_cluster_size=10),
        'KMeans': cluster.KMeans(n_clusters=num_clusters, random_state=seed),
    }
def fit(self, dataset):
    """Fit Birch on ``dataset`` and report the usable number of clusters.

    The requested count comes from ``self.configuration["NCLU"]``. The
    fitted model is stored in ``self.model``, and warnings raised while
    fitting are suppressed.

    Returns
    -------
    int
        The number of fitted sub-cluster centres when that is below the
        requested count, otherwise the requested count.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        wanted = self.configuration["NCLU"]
        self.model = cluster.Birch(n_clusters=wanted).fit(dataset)
        found = len(self.model.subcluster_centers_)
        # INFO: when found < wanted, Birch must have issued a warning.
        return found if found < wanted else wanted
def Birch(self, parameters):
    """Cluster MinMax-scaled data with Birch and return the labels.

    Parameters
    ----------
    parameters : dict
        ``'data'`` (required) holds the samples. ``'threshold'`` and
        ``'branching_factor'`` are optional and default to 3 each.

    Returns
    -------
    dict
        ``{'labels': ndarray}`` with the label of every sample.
    """
    threshold = 3
    branching_factor = 3

    data = np.array(parameters['data'])
    data = preprocessing.MinMaxScaler().fit_transform(data)

    if parameters.get('threshold') is not None:
        # float(), not int(): on data scaled to [0, 1] a fractional
        # threshold such as 0.5 must not be truncated to 0.
        threshold = float(parameters['threshold'])
    if parameters.get('branching_factor') is not None:
        branching_factor = int(parameters['branching_factor'])

    model = skc.Birch(threshold=threshold, branching_factor=branching_factor)
    clustering = model.fit(data)
    return {'labels': clustering.labels_}
def detection_with_birch(image_set):
    """
    Split the images into two Birch clusters and normalise the labels.

    :param image_set: The bottleneck values of the relevant images.
    :return: Predictions vector, i.e. the Birch labels after being passed
        through ``normalize_predictions``.
    """
    # The branching_factor might be fine-tuned for better results.
    model = cluster.Birch(n_clusters=2)
    model.fit(image_set)
    return normalize_predictions(model.labels_)
def birch(data):
    """Pick the Birch cluster count (3 to 10) with the best silhouette.

    For every candidate count the data is clustered once and scored with
    ``silhouette_score``. For the best candidate the sub-cluster centres are
    averaged per cluster label to give one centroid per cluster.

    Parameters
    ----------
    data : array-like
        Samples to cluster.

    Returns
    -------
    (ndarray, int or None)
        The centroids of the best clustering and its cluster count. If no
        candidate ever beats the initial score the array is empty and the
        count is ``None``.
    """
    maxsilh = float('-inf')
    kbest = None
    centroid_best = []

    for ncluster in range(3, 11):
        clusterer = skcluster.Birch(n_clusters=ncluster, compute_labels=True)
        # One fit_predict is enough; the model used to be fitted three times.
        cluster_labels = clusterer.fit_predict(data)
        silhouette_avg = silhouette_score(data, cluster_labels)
        if not silhouette_avg > maxsilh:
            continue

        maxsilh = silhouette_avg
        kbest = ncluster

        # Sum the sub-cluster centres of every cluster label, then average.
        center_sums = {}
        center_counts = {}
        for label, center in zip(clusterer.subcluster_labels_,
                                 clusterer.subcluster_centers_):
            if label in center_sums:
                center_sums[label] += np.array(center)
                center_counts[label] += 1
            else:
                center_sums[label] = np.array(center)
                center_counts[label] = 1
        centroid_best = [
            center_sums[label] / (center_counts[label] * 1.)
            for label in center_sums
        ]

    print(kbest, maxsilh)
    return np.array(centroid_best), kbest
def birch_clustering(data, filename):
    """Cluster ``data`` with Birch, then save the assignments and the model.

    The cluster count comes from the module-level ``_cluster_size``. The
    assignment of every index entry is written to
    ``results/birch_<name>_model.txt`` and the fitted model is dumped to
    ``models/birch_<name>_model.sav``, where ``<name>`` is ``filename[4:-4]``.
    Cluster sizes are printed.
    """
    print("Executing birch...")
    model = cluster.Birch(n_clusters=_cluster_size).fit(data.values)
    assignments = model.labels_
    stem = filename[4:-4]

    with open("results/birch_" + stem + "_model.txt", "w") as model_output:
        for gene, assigned in zip(data.index.values, assignments):
            model_output.write("%s: %s\n" % (gene, assigned))

    print("Counting cluster size...")
    # Start every expected cluster at zero so empty ones are reported too.
    cluster_size = dict.fromkeys(range(_cluster_size), 0)
    for assigned in assignments:
        cluster_size[assigned] += 1

    print("Final results...")
    for cid, csize in cluster_size.items():
        print("Size of %s is:\t%s" % (cid, csize))

    joblib.dump(model, "models/birch_" + stem + "_model.sav")
def main():
    """Compare KMeans, AffinityPropagation, DBSCAN and Birch on iris data.

    The data is read, converted and cleaned with the module's helpers,
    standardised, reduced to two principal components, clustered with each
    algorithm and scored through ``show_result``; the scores are finally
    handed to ``compare_scores``.

    Returns
    -------
    int
        Always 0.
    """
    data_origin = read_data('iris.data')
    data_converted = convert_data(data_origin, 0)
    true_labels = data_converted.iloc[:, -1]
    data_clean = clean_data(data_converted.iloc[:, :-2])
    plot_distribution(data_clean, size=[1, 3],
                      title='data_clean distribution')

    scaler = sk.preprocessing.StandardScaler().fit(data_clean)
    data_standard = scaler.transform(data_clean)
    plot_distribution(data_standard, size=[1, 3],
                      title='data_standard distribution')

    pca_components = 2
    pca = sk.decomposition.PCA(n_components=pca_components)
    # Fit once and keep the projection; the model used to be fitted twice.
    data_pca = pca.fit_transform(data_standard)
    print('The sum of explained_variance_ratio_ is": ',
          sum(pca.explained_variance_ratio_))
    plot_distribution(data_pca, size=[1, pca_components],
                      title='data_pca distribution')
    plt.scatter(data_pca[:, 0], data_pca[:, 1])
    plt.show()

    n_clusters = 3
    dimension_show = [1, 2]
    scores = dict()

    kmeans = cluster.KMeans(n_clusters=n_clusters).fit(data_pca)
    scores['kmeans'] = show_result(data_clean, data_pca, true_labels, kmeans,
                                   n_clusters, dimension_show)

    ap = cluster.AffinityPropagation(preference=-100).fit(data_pca)
    # AffinityPropagation picks its own cluster count.
    scores['ap'] = show_result(data_clean, data_pca, true_labels, ap,
                               max(ap.labels_) + 1, dimension_show)

    dbscan = cluster.DBSCAN(eps=0.38, min_samples=10).fit(data_pca)
    scores['dbscan'] = show_result(data_clean, data_pca, true_labels, dbscan,
                                   n_clusters, dimension_show)

    # Use the shared n_clusters rather than a second hard-coded 3.
    birch = cluster.Birch(n_clusters=n_clusters).fit(data_pca)
    scores['birch'] = show_result(data_clean, data_pca, true_labels, birch,
                                  n_clusters, dimension_show)

    compare_scores(scores)
    return 0
def call_algo(name, params):
    """Return an unfitted estimator selected by ``name``.

    Parameters
    ----------
    name : str
        ``"dbscan"``/``"DBSCAN"``, ``"spectral"``, ``"birch"``/``"Birch"`` or
        ``"gmm"``/``"GMM"``.
    params : dict
        Hyper-parameters; ``'eps'`` and ``'n_neighbors'`` for DBSCAN,
        ``'n_clusters'`` for the others.

    Raises
    ------
    SystemExit
        With status 0, after printing a message, when ``name`` is unknown
        (same outcome as the former ``exit(0)`` call).
    """
    if name == "dbscan" or name == "DBSCAN":
        return cluster.DBSCAN(eps=params['eps'],
                              min_samples=params['n_neighbors'])
    if name == "spectral":
        return cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    if name == "birch" or name == "Birch":
        return cluster.Birch(n_clusters=params['n_clusters'])
    if name == "gmm" or name == "GMM":
        return mixture.GaussianMixture(n_components=params['n_clusters'],
                                       covariance_type='full')

    # A single-argument print call is valid on Python 2 and 3; the old print
    # statement was a SyntaxError on Python 3.
    print("unknown algo; exit")
    # NOTE(review): status 0 signals success although this is an error path;
    # kept so that existing callers see the same exit status.
    raise SystemExit(0)
def get_algorithm(self):
    """Build the unfitted estimator named by ``self.algorithmName``.

    Hyper-parameters are read from ``self.parms`` and converted to
    ``int``/``float`` as each estimator needs; ``self.X`` is used for the
    mean-shift bandwidth and for the connectivity graphs.

    Returns
    -------
    The estimator, or ``None`` when the algorithm name is not recognised.
    """
    name = self.algorithmName
    parms = self.parms

    if name == "kmeans":
        return cluster.MiniBatchKMeans(n_clusters=int(parms['k']))

    if name == "mean_shift":
        bandwidth = cluster.estimate_bandwidth(
            self.X, quantile=float(parms['quantile']))
        return cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

    if name == "affinity_propagation":
        return cluster.AffinityPropagation(damping=float(parms['damping']))

    if name == "birch":
        return cluster.Birch(n_clusters=int(parms['k']))

    if name == "spectral":
        return cluster.SpectralClustering(n_clusters=int(parms['k']),
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")

    if name == "dbscan":
        return cluster.DBSCAN(eps=float(parms['eps']))

    if name in ("ward", "agglomerative"):
        # Both variants work on a symmetrised k-nearest-neighbour graph.
        graph = kneighbors_graph(self.X,
                                 n_neighbors=int(parms['n_neighbors']),
                                 include_self=False)
        connectivity = 0.5 * (graph + graph.T)
        if name == "ward":
            return cluster.AgglomerativeClustering(
                n_clusters=int(parms['k']), linkage='ward',
                connectivity=connectivity)
        return cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock",
            n_clusters=int(parms['k']), connectivity=connectivity)

    return None
def update_data(attrname, old, new):
    """Re-cluster the global data with the selected algorithm and recolour.

    The algorithm name is read from ``dropdown.value``. The module-level
    ``X``, ``bandwidth`` and ``connectivity`` feed the estimators, and the
    result is written to ``source.data['colors']`` and ``plot.title``. The
    three arguments are not used by the body.

    When the name is not recognised a message is printed and nothing is
    updated.
    """
    # Get the drop down value.
    algorithm = dropdown.value

    # Build the model for the selected algorithm.
    if algorithm == 'MiniBatchKMeans':
        model = cluster.MiniBatchKMeans(n_clusters=2)
    elif algorithm == 'AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    elif algorithm == 'MeanShift':
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=2,
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors")
    elif algorithm == 'Ward':
        model = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                                connectivity=connectivity)
    elif algorithm == 'AgglomerativeClustering':
        model = cluster.AgglomerativeClustering(linkage="average",
                                                affinity="cityblock",
                                                n_clusters=2,
                                                connectivity=connectivity)
    elif algorithm == 'Birch':
        model = cluster.Birch(n_clusters=2)
    elif algorithm == 'DBSCAN':
        model = cluster.DBSCAN(eps=.2)
    else:
        print('No Algorithm selected')
        # Stop here: there is no model to fit (this used to fall through and
        # fail on an unbound name).
        return

    model.fit(X)
    if hasattr(model, 'labels_'):
        # Builtin int: the np.int alias no longer exists in NumPy.
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)

    # Wrap around the palette so labels beyond its length cannot raise;
    # label -1 still maps to the last colour, as plain indexing did.
    palette_size = len(Spectral6)
    colors = [Spectral6[i % palette_size] for i in y_pred]
    source.data['colors'] = colors
    plot.title = algorithm