def templateClusterAllocation(self, path, cluster_sizes, number_clusters,
                              branching_factor=5, max_node_entries=5,
                              initial_diameter=0.1,
                              type_measurement=measurement_type.CENTROID_EUCLIDIAN_DISTANCE,
                              entry_size_limit=200, diameter_multiplier=1.5):
    """Cluster the sample stored at *path* with BIRCH and verify the result.

    Asserts that every input point is allocated to exactly one cluster and,
    when *cluster_sizes* is given, that the obtained cluster sizes match it
    (order-insensitive; both lists are sorted in place).
    """
    sample = read_sample(path)
    birch_instance = birch(sample, number_clusters, branching_factor,
                           max_node_entries, initial_diameter,
                           type_measurement, entry_size_limit,
                           diameter_multiplier)
    birch_instance.process()
    clusters = birch_instance.get_clusters()
    obtained_cluster_sizes = [len(cluster) for cluster in clusters]
    total_length = sum(obtained_cluster_sizes)
    # Every point must be assigned to exactly one cluster.
    assert total_length == len(sample)
    # Fix: identity comparison with None (`is not None`), not `!= None`.
    if cluster_sizes is not None:
        cluster_sizes.sort()
        obtained_cluster_sizes.sort()
        assert cluster_sizes == obtained_cluster_sizes
def template_clustering(number_clusters, path, branching_factor=50,
                        max_node_entries=100, initial_diameter=0.5,
                        type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
                        entry_size_limit=200, diameter_multiplier=1.5,
                        show_result=True):
    """Cluster the sample at *path* with BIRCH.

    Prints the sample path, optionally renders the allocated clusters,
    and returns the raw sample together with the clustering result.
    """
    print("Sample: ", path)
    points = read_sample(path)
    model = birch(points, number_clusters, branching_factor, max_node_entries,
                  initial_diameter, type_measurement, entry_size_limit,
                  diameter_multiplier)
    model.process()
    allocated = model.get_clusters()
    if show_result is True:
        canvas = cluster_visualizer()
        canvas.append_clusters(allocated, points)
        canvas.show()
    return points, allocated
def birchAlgo(filename, col_name):
    """Read column *col_name* from CSV *filename*, pair consecutive values
    into 2-D points, cluster them with BIRCH and save the visualization.

    Writes the cluster plot to a fixed PNG path; returns nothing.
    """
    df = pd.read_csv(filename, usecols=[col_name])
    data = df[col_name]
    # Fix: the original bumped the row count up when it was odd but left the
    # array unchanged, so the subsequent reshape raised a ValueError (101
    # values cannot fill a (51, 2) array).  Drop the trailing element instead
    # so the values always pair up cleanly; even-length input is unaffected.
    usable = len(data) - (len(data) % 2)
    input_data = np.asarray(data)[:usable].reshape(usable // 2, 2)
    print(input_data.dtype)
    print(input_data.shape)
    print(
        "----------------------------------------------------------------------------------------------------------------------"
    )
    # Create a BIRCH instance asking for ten clusters.
    birch_instance = birch(input_data.tolist(), 10)
    # Start processing - cluster analysis of the input data.
    birch_instance.process()
    # Allocate clusters.
    clusters = birch_instance.get_clusters()
    print(clusters)
    print(timeit.timeit('"-".join(str(n) for n in range(100))', number=10000))
    # Visualize clusters and persist the figure for the web app.
    visualizer = cluster_visualizer()
    visualizer.append_clusters(clusters, input_data)
    visualizer.show(display=False)
    plt.savefig(
        "C:/Users/Nupura Hajare/Desktop/flask_app/web/static/img/birch.png")
def template_clustering(number_clusters, path, branching_factor=5,
                        max_node_entries=5, initial_diameter=0.0,
                        type_measurement=measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE,
                        entry_size_limit=200, diameter_multiplier=1.5,
                        show_result=True):
    """Time one BIRCH run over the sample at *path*.

    Optionally visualizes the allocated clusters and returns the tuple
    (sample, clusters).
    """
    data = read_sample(path)
    model = birch(data, number_clusters, branching_factor, max_node_entries,
                  initial_diameter, type_measurement, entry_size_limit,
                  diameter_multiplier)
    elapsed, _ = timedcall(model.process)
    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")
    allocated = model.get_clusters()
    if show_result is True:
        canvas = cluster_visualizer()
        canvas.append_clusters(allocated, data)
        canvas.show()
    return (data, allocated)
def BIRCH_func(data, k):
    """Coerce *data* to numeric points and cluster them into *k* groups
    with BIRCH; return the list of clusters (index lists)."""
    frame = DataFrame(data).apply(pd.to_numeric)
    points = frame.values.tolist()
    model = birch(points, k, diameter=3.0)
    model.process()
    return model.get_clusters()
def template_clustering(number_clusters, path, branching_factor=5, max_node_entries=5,
                        initial_diameter=0.0,
                        type_measurement=measurement_type.CENTROID_EUCLIDIAN_DISTANCE,
                        entry_size_limit=200, ccore=True):
    """Run BIRCH over the sample at *path*, report the execution time and
    draw the allocated clusters."""
    points = read_sample(path)
    model = birch(points, number_clusters, branching_factor, max_node_entries,
                  initial_diameter, type_measurement, entry_size_limit, ccore)
    elapsed, _ = timedcall(model.process)
    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")
    draw_clusters(points, model.get_clusters())
def templateClusterAllocationOneDimensionData(self, branching_factor=5, max_node_entries=10,
                                              initial_diameter=1.0,
                                              type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
                                              entry_size_limit=20):
    """Four well-separated 1-D groups of 10 random points each must be
    recovered by BIRCH as exactly four clusters of size 10."""
    # Groups centered near 0, 4, 8 and 12 -- far apart relative to spread.
    points = [[random() + shift] for shift in (0, 4, 8, 12) for _ in range(10)]
    model = birch(points, 4, branching_factor, max_node_entries,
                  initial_diameter, type_measurement, entry_size_limit)
    model.process()
    allocated = model.get_clusters()
    assert len(allocated) == 4
    for group in allocated:
        assert len(group) == 10
def templateClusterAllocationOneDimensionData(self, branching_factor=5, max_node_entries=5,
                                              initial_diameter=0.1,
                                              type_measurement=measurement_type.CENTROID_EUCLIDIAN_DISTANCE,
                                              entry_size_limit=200, ccore=True):
    """BIRCH must split four well-separated 1-D groups (10 random points
    each, centered near 0, 3, 6 and 9) into four clusters of size 10."""
    points = [[random() + offset] for offset in (0, 3, 6, 9) for _ in range(10)]
    model = birch(points, 4, branching_factor, max_node_entries,
                  initial_diameter, type_measurement, entry_size_limit, ccore)
    model.process()
    found = model.get_clusters()
    assert len(found) == 4
    for group in found:
        assert len(group) == 10
def template_clustering(number_clusters, path, branching_factor=5, max_node_entries=5,
                        initial_diameter=0.0,
                        type_measurement=measurement_type.CENTROID_EUCLIDIAN_DISTANCE,
                        entry_size_limit=200, ccore=True):
    """Time a single BIRCH clustering of the sample at *path* and draw
    the resulting clusters."""
    data = read_sample(path)
    algorithm = birch(data, number_clusters, branching_factor, max_node_entries,
                      initial_diameter, type_measurement, entry_size_limit, ccore)
    ticks, _ = timedcall(algorithm.process)
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")
    draw_clusters(data, algorithm.get_clusters())
def get_modelo(self, algoritmo, eps, neig):
    """Build, run and post-process the clustering algorithm named *algoritmo*.

    Returns a numpy array with one group label per sample.  For the
    neighborhood-driven algorithms, when more groups than requested are
    produced the call retries recursively with a wider neighborhood
    (neig + 1).

    Raises:
        ValueError: if *algoritmo* is not a recognized algorithm name.
    """
    print(algoritmo + ' ' + str(eps) + ' - ' + str(neig))
    instance = None
    if algoritmo == 'AGNES':
        instance = agglomerative(self.amostras, self.numero_clusters, link=None)
    elif algoritmo == 'BIRCH':
        instance = birch(self.amostras, self.numero_clusters, entry_size_limit=10000)
    elif algoritmo == 'CLARANS':
        instance = clarans(self.amostras, self.numero_clusters, numlocal=100, maxneighbor=1)
    elif algoritmo == 'CURE':
        instance = cure(self.amostras, self.numero_clusters, number_represent_points=5, compression=0.5)
    elif algoritmo == 'DBSCAN':
        instance = dbscan(self.amostras, eps=eps, neighbors=neig)
    elif algoritmo == 'FCM':
        initial_centers = kmeans_plusplus_initializer(
            self.amostras, self.numero_clusters).initialize()
        instance = fcm(self.amostras, initial_centers)
    elif algoritmo == 'KMEANS':
        initial_centers = kmeans_plusplus_initializer(
            self.amostras, self.numero_clusters).initialize()
        instance = kmeans(self.amostras, initial_centers, tolerance=0.001)
    elif algoritmo == 'KMEDOIDS':
        # TODO: derive the initial medoids from self.numero_clusters instead
        # of this hard-coded list of seven zeros.
        instance = kmedoids(self.amostras,
                            initial_index_medoids=[0, 0, 0, 0, 0, 0, 0],
                            tolerance=0.0001)
    elif algoritmo == 'OPTICS':
        instance = optics(self.amostras, eps=eps, minpts=neig)
    elif algoritmo == 'ROCK':
        instance = rock(self.amostras, eps=eps,
                        number_clusters=self.numero_clusters, threshold=0.5)
    else:
        # Fix: the original fell through with instance = None and crashed
        # below with an unhelpful AttributeError.  Fail fast instead.
        raise ValueError('unknown clustering algorithm: ' + algoritmo)
    instance.process()
    lista_agrupada = self.get_lista_agrupada(instance.get_clusters())
    lista_agrupada = np.array(lista_agrupada)
    if neig != 0:
        n_grupos = len(np.unique(lista_agrupada))
        # Too many groups were found: widen the neighborhood and retry.
        if n_grupos > self.numero_clusters:
            lista_agrupada = self.get_modelo(algoritmo, eps, neig + 1)
    return lista_agrupada
def templateClusterAllocation(self, path, cluster_sizes, number_clusters,
                              branching_factor=5, max_node_entries=5, initial_diameter=0.1,
                              type_measurement=measurement_type.CENTROID_EUCLIDIAN_DISTANCE,
                              entry_size_limit=200, ccore=True):
    """Cluster the sample at *path* with BIRCH and check that the obtained
    cluster sizes match *cluster_sizes* (both lists sorted in place)."""
    sample = read_sample(path)
    # Renamed from cure_instance: this template runs BIRCH, not CURE.
    birch_instance = birch(sample, number_clusters, branching_factor, max_node_entries,
                           initial_diameter, type_measurement, entry_size_limit, ccore)
    birch_instance.process()
    allocated = birch_instance.get_clusters()
    obtained_cluster_sizes = [len(group) for group in allocated]
    # Every point must belong to exactly one cluster.
    assert sum(obtained_cluster_sizes) == len(sample)
    cluster_sizes.sort()
    obtained_cluster_sizes.sort()
    assert cluster_sizes == obtained_cluster_sizes
def template_clustering(number_clusters, path, branching_factor=5, max_node_entries=5,
                        initial_diameter=0.0,
                        type_measurement=measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE,
                        entry_size_limit=200, diameter_multiplier=1.5, show_result=True):
    """Time a BIRCH run over the sample at *path*, optionally visualize
    the clusters, and return (sample, clusters)."""
    data = read_sample(path)
    model = birch(data, number_clusters, branching_factor, max_node_entries,
                  initial_diameter, type_measurement, entry_size_limit,
                  diameter_multiplier)
    elapsed, _ = timedcall(model.process)
    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")
    allocated = model.get_clusters()
    if show_result is True:
        canvas = cluster_visualizer()
        canvas.append_clusters(allocated, data)
        canvas.show()
    return (data, allocated)
def templateClusterAllocation(self, path, cluster_sizes, number_clusters,
                              branching_factor=50, max_node_entries=100, initial_diameter=0.5,
                              type_measurement=measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
                              entry_size_limit=200, diameter_multiplier=1.5):
    """Run BIRCH on the sample at *path* and validate the clusters, the
    CF-tree state and the result encoding against expectations."""
    sample = read_sample(path)
    instance = birch(sample, number_clusters, branching_factor, max_node_entries,
                     initial_diameter, type_measurement, entry_size_limit,
                     diameter_multiplier)
    instance.process()
    allocated = instance.get_clusters()
    cf_clusters = instance.get_cf_cluster()
    cf_entries = instance.get_cf_entries()
    # Results must be encoded as index-list separation.
    self.assertEqual(instance.get_cluster_encoding(),
                     type_encoding.CLUSTER_INDEX_LIST_SEPARATION)
    # Both point clusters and CF clusters honour the requested amount.
    self.assertEqual(number_clusters, len(allocated))
    self.assertEqual(number_clusters, len(cf_clusters))
    # The CF-tree holds at least one entry and never exceeds its limit.
    self.assertGreater(len(cf_entries), 0)
    self.assertLessEqual(len(cf_entries), entry_size_limit)
    obtained_cluster_sizes = [len(group) for group in allocated]
    # Every point is allocated to exactly one cluster.
    self.assertEqual(sum(obtained_cluster_sizes), len(sample))
    if cluster_sizes is not None:
        cluster_sizes.sort()
        obtained_cluster_sizes.sort()
        self.assertEqual(cluster_sizes, obtained_cluster_sizes)
def doBirchClustering(i):
    """Cluster lung_number.csv into 7 groups with BIRCH, using *i* as both
    the branching factor and the maximum node entries; return one cluster
    label per data row (in row order)."""
    #data=pd.read_csv("soy_rock.csv",header=None,dtype='int')
    #data=pd.read_csv("votepreporcess.csv",header=None,dtype='int')
    frame = pd.read_csv("lung_number.csv", header=None, dtype='int')
    row_count = len(frame.index)
    sample = frame.values.tolist()
    model = birch(sample, 7, branching_factor=i, max_node_entries=i,
                  initial_diameter=0.1)
    model.process()
    clusters = model.get_clusters()
    # Map each row index to the index of the cluster that contains it.
    value = []
    for row in range(row_count):
        for label, members in enumerate(clusters):
            if row in members:
                value.append(label)
                break
    return value
import matplotlib.pyplot as plt
import pandas as pd
from numpy import genfromtxt, zeros, mean
from pylab import figure, subplot, hist, xlim, show, plot
# Fix: KMeans was imported twice; deduplicated.
from sklearn.cluster import KMeans
from sklearn.metrics import completeness_score, homogeneity_score, confusion_matrix
# Fix: sklearn.cross_validation was removed in scikit-learn 0.20; its
# contents now live in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# Load the co-occurrence data.  NOTE(review): dtype=str was dropped because
# BIRCH needs numeric coordinates; columns 0-2 are presumed numeric --
# confirm against the actual CSV.
actoridata = genfromtxt(
    'C:\\Users\\26087\\PycharmProjects\\untitled\\venv\\coo_times_arr.csv',
    encoding='utf-8', delimiter=',', usecols=(0, 1, 2))
print(actoridata)
# Fix: read_sample() expects a file path, not an ndarray -- pyclustering
# works on a plain list of points, so convert the loaded array directly.
sample = actoridata.tolist()
# Run BIRCH, asking for 128 clusters.
birch_instance = birch(sample, 128)
# Start the cluster analysis.
birch_instance.process()
# Fetch the clustering result (lists of point indices).
clusters = birch_instance.get_clusters()
# Visualize the allocated clusters.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, sample)
visualizer.show()
def process_birch(sample):
    """Measure and return how long one BIRCH clustering of *sample* takes."""
    elapsed, _ = timedcall(birch(sample, NUMBER_CLUSTERS).process)
    return elapsed
# In[26]: from pyclustering.cluster.birch import birch from pyclustering.cluster import cluster_visualizer from pyclustering.utils import read_sample from pyclustering.samples.definitions import FAMOUS_SAMPLES # list 형태로! X = finalDataFrame.iloc[:, [0, 1]].values.tolist() X birch for k in range(2, 10): birch_instance = birch(X, k, diameter=3.0) # Cluster analysis birch_instance.process() # Obtain results of clustering clusters = birch_instance.get_clusters() # Visualize allocated clusters visualizer = cluster_visualizer() visualizer.append_clusters(clusters, X) print("\nk=", k) visualizer.show() # In[27]: X = finalDataFrame.iloc[:, [0, 1]].values.tolist() for k in range(10, 20):
# An example of clustering by the BIRCH algorithm.
from pyclustering.cluster.birch import birch
# Fix: cluster_visualizer and FCPS_SAMPLES are used below but were never
# imported, so the snippet died with a NameError before doing any work.
from pyclustering.cluster import cluster_visualizer
from pyclustering.samples.definitions import FCPS_SAMPLES
from pyclustering.utils import read_sample

# Load data from the FCPS set that is provided by the library.
sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
# Create BIRCH algorithm for allocation of three clusters.
birch_instance = birch(sample, 3)
# Start processing - cluster analysis of the input data.
birch_instance.process()
# Allocate clusters.
clusters = birch_instance.get_clusters()
# Visualize obtained clusters.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, sample)
visualizer.show()
""" from pyclustering.cluster.birch import birch import os import time import shutil #import subprocess ############### Configure the below variables for a change in system ################# PROJECT='/root/Desktop/ram/' HADOOP_HOME='/user/root/in/' ##################################################################################### PROC_DIR= PROJECT + 'PROJECT/BIRCH_PROCESS/' HADOOP_PROC_DIR= PROJECT + 'PROJECT/HADOOP_PROCESS/' PROCESSED_DIR= PROJECT + 'PROJECT/PROCESSED/' LOGDIR= PROJECT + 'PROJECT/LOGDIR/logfile' birch1=birch([[]],10,[],initial_diameter=10) z=0 while True : while (os.listdir(PROC_DIR)==[]): time.sleep(30) for datefile in os.listdir(PROC_DIR): DATE=datefile HADOOP_PATH= HADOOP_HOME + DATE +'/' os.popen("touch "+HADOOP_PROC_DIR + "DONE") #Below loop is to check moving of files from Hadoop dir to Birch Dir is finished while(os.path.isfile(PROC_DIR+datefile) == True): time.sleep(10) DONEFILE=PROC_DIR+"DONE" os.remove(DONEFILE) d1={} d2=os.listdir(PROC_DIR)
def process_birch(sample):
    """Cluster *sample* once with BIRCH and report the elapsed ticks."""
    model = birch(sample, NUMBER_CLUSTERS)
    duration, _result = timedcall(model.process)
    return duration