def plot_map(lat, lon, color=None, size=10):
    """Render lat/lon points on an OpenStreetMap tile layer with bokeh.

    Parameters
    ----------
    lat, lon : sequences of coordinates in degrees; the first point is
        treated as the map center and drawn with a larger red marker.
    color : optional per-point values, colorized through the rainbow
        colormap; when None every point gets a single default color.
    size : base marker size (the center marker is drawn at ``size + 10``).
    """
    center = [lat[0], lon[0]]
    cmap = cm.rainbow
    wlat, wlong = latlng_to_meters(lat, lon)
    if color is not None:
        # Scale values to [0, 255] so each one can index the colormap LUT.
        scaled = MinMaxScaler(feature_range=(0, 255)).fit_transform(color)
        colors = [
            "#%02x%02x%02x" % tuple([int(j * 255) for j in cmap(int(i))[:3]])
            for i in scaled
        ]
    else:
        # BUG FIX: the original left ``colors`` with a single entry while
        # the coordinate arrays hold one entry per point; bokeh requires
        # the color list length to match the data length.
        colors = ['#010002'] * len(wlat)
    wlat0, wlong0 = latlng_to_meters(center[0], center[1])
    wlat = np.append(wlat, wlat0)
    wlong = np.append(wlong, wlong0)
    colors.append('#010002')  # color for the appended center point
    openmap_url = 'http://c.tile.openstreetmap.org/{Z}/{X}/{Y}.png'
    # Satellite alternative: 'http://otile1.mqcdn.com/tiles/1.0.0/sat/{Z}/{X}/{Y}.jpg'
    TILES = WMTSTileSource(url=openmap_url)
    tools = "pan,wheel_zoom,reset"
    p = figure(tools=tools, plot_width=700, plot_height=600,
               x_axis_type="mercator", y_axis_type="mercator")
    # Center marker drawn larger and in red, on top of the data points.
    p.circle(wlat0, wlong0, color='#dc3826', size=size + 10)
    p.circle(np.array(wlat), np.array(wlong), color=colors, size=size,
             alpha=0.5)
    p.add_tile(TILES)
    p.axis.visible = False
    pb = gridplot([[p]])
    show(pb)
def _add_color_column(fire_spots):
    """Attach a hex ``color`` column to ``fire_spots``.

    One shade is assigned per distinct ``n_reports`` value: a reversed
    0..1 ramp drives the red and blue channels (green stays 0), so higher
    report counts map to darker magenta-to-black tones.
    """
    n_colors = fire_spots.n_reports.unique().shape[0]
    # Reversed 0..1 ramp for the R and B channels.
    ramp = MinMaxScaler().fit_transform(
        np.arange(n_colors).astype(float).reshape(-1, 1))
    ramp = (ramp - 1.0) * (-1.0)
    rgb = np.hstack([ramp, np.zeros(n_colors).reshape(-1, 1), ramp])
    hex_colors = [
        '#%02x%02x%02x' % tuple((channels * 255).astype(int))
        for channels in rgb
    ]
    # Pair each sorted distinct n_reports value with its hex shade, then
    # join the palette back onto the original frame.
    unique_counts = sorted(fire_spots.groupby(['n_reports']).groups.keys())
    palette = pd.concat(
        [pd.DataFrame(unique_counts, columns=['n_reports']),
         pd.DataFrame(hex_colors, columns=['color'])],
        axis=1)
    return pd.merge(fire_spots, palette, on='n_reports')
def main():
    """Benchmark clustering algorithms over 28 datasets.

    For every dataset, algorithm, metric index (0..25) and k in [2, 10),
    runs 30 simulations, persisting centroids, cluster memberships, an
    error-bar plot and a CSV summary under ``results/metrics/``.
    """
    print("Loading dataset")
    os.chdir('../../../..')
    # Placeholder frames; every slot is overwritten with a real dataset.
    dfs = [pd.read_csv('data/empty.csv', header=None) for k in range(28)]
    # Local copies of UCI / shape-set benchmarks. The original fetched the
    # archive.ics.uci.edu URLs and discarded the results, so those calls
    # were removed; only the cached CSVs are used.
    dfs[0] = pd.read_csv('data/datasets_metrics/iris.csv', header=None)
    dfs[1] = pd.read_csv('data/datasets_metrics/wine.csv', header=None)
    dfs[2] = pd.read_csv('data/datasets_metrics/glass.csv', header=None)
    dfs[3] = pd.read_csv('data/datasets_metrics/breast-cancer-wisconsin.csv', header=None)
    dfs[4] = pd.read_csv('data/datasets_metrics/wdbc.csv', header=None)
    dfs[5] = pd.read_csv('data/datasets_metrics/bupa.csv', header=None)
    dfs[6] = pd.read_csv('data/datasets_metrics/cmc.csv', header=None)
    dfs[7] = pd.read_csv('data/datasets_metrics/new-thyroid.csv', header=None)
    dfs[8] = pd.read_csv('data/datasets_metrics/dermatology.csv', header=None)
    # egyptian skulls: http://www.dm.unibo.it/~simoncin/EgyptianSkulls.html
    dfs[9] = pd.read_csv('data/datasets_metrics/egyptian-skulls.csv', header=None)
    dfs[10] = pd.read_csv('data/datasets_metrics/heart.csv', header=None)
    dfs[11] = pd.read_csv('data/datasets_metrics/ionosphere.csv', header=None)
    dfs[12] = pd.read_csv('data/datasets_metrics/vehicle.csv', header=None)
    dfs[13] = pd.read_csv('data/datasets_metrics/balance-scale.csv', header=None)
    dfs[14] = pd.read_csv('data/datasets_metrics/sonar.csv', header=None)
    dfs[15] = pd.read_csv('data/datasets_metrics/zoo.csv', header=None)
    dfs[16] = pd.read_csv('data/datasets_metrics/isolet5.csv', header=None)
    dfs[17] = pd.read_csv('data/datasets_metrics/libras.csv', header=None)
    dfs[18] = pd.read_csv('data/datasets_metrics/cleveland.csv', header=None)
    dfs[19] = pd.read_csv('data/datasets_metrics/australian.csv', header=None)
    dfs[20] = pd.read_csv('data/shapesets/compound.csv', header=None)
    dfs[21] = pd.read_csv('data/shapesets/flame.csv', header=None)
    dfs[22] = pd.read_csv('data/shapesets/jain.csv', header=None)
    dfs[23] = pd.read_csv('data/shapesets/r15.csv', header=None)
    dfs[24] = pd.read_csv('data/shapesets/d31.csv', header=None)
    dfs[25] = pd.read_csv('data/shapesets/spiral.csv', header=None)
    dfs[26] = pd.read_csv('data/shapesets/pathbased.csv', header=None)
    dfs[27] = pd.read_csv('data/shapesets/agregation.csv', header=None)
    # Still missing: hill-valley, diabetes, olive, crude oil, musk v1,
    # landsat satellite, heart disease.

    SIMULATIONS = 30
    name = ['FCMeans']  # other options: 'KMeans', 'PSOC'
    for eachdataset in range(len(dfs)):
        df = dfs[eachdataset]
        # Drop the trailing label column, keep only rows without '?' marks.
        df = df.drop(len(df.columns) - 1, axis=1)
        x = df[df.apply(lambda row: sum(v == '?' for v in row) == 0, axis=1)]
        x = x.iloc[:, :].values.astype(float)
        print("Nomalizing dataset so that all dimenions are in the same scale")
        x = MinMaxScaler().fit_transform(x)
        x = x[np.random.permutation(len(x))]
        for i in range(len(name)):
            rng = range(2, 10)
            for metricNumber in range(0, 26):
                # BUG FIX: the original reused one ``df`` list across all
                # metrics and converted it to a DataFrame inside the loop,
                # so later ``df.append([...])`` calls hit the (copy-
                # returning) DataFrame.append and their rows were lost.
                metrics = []
                mean = []
                std = []
                results = []
                for k in rng:
                    print("Number of Clusters = " + str(k) + "\n")
                    for j in range(SIMULATIONS):
                        print("Run ====> " + str(j))
                        if name[i] == 'KMeans':
                            clf = KMeans(n_clusters=k)
                        elif name[i] == 'FCMeans':
                            clf = FCMeans(n_clusters=k, n_iter=1000)
                        elif name[i] == 'PSOC':
                            clf = PSOC(n_clusters=k, swarm_size=15,
                                       n_iter=500, w=0.72, c1=1.49, c2=1.49)
                        clf.fit(x)
                        out_dir = (
                            "results/metrics/dataset_{0}/metric_{1}/algorithm_{2}/"
                            .format(eachdataset, metricNumber, name[i]))
                        if not os.path.isdir(out_dir):
                            os.makedirs(out_dir)
                        sn = (out_dir +
                              "dataset_{0}_metric_{1}_algorithm_{2}_k_{3}_simulation_{4}"
                              .format(eachdataset, metricNumber, name[i], k, j))
                        pd.DataFrame(clf.centroids).transpose().to_csv(sn + "_centroids.csv")
                        # Assign every sample to its nearest centroid.
                        clusters = {c: [] for c in clf.centroids}
                        for xi in x:
                            dist = [np.linalg.norm(xi - clf.centroids[c])
                                    for c in clf.centroids]
                            clusters[dist.index(min(dist))].append(xi)
                        # One line with the cluster count, then per cluster
                        # its size followed by one line per member point.
                        # (``with`` replaces an unclosed builtin-shadowing
                        # ``file`` handle.)
                        with open(sn + "_clusters.csv", 'w') as cluster_file:
                            cluster_file.write(str(len(clf.centroids)) + '\n')
                            for c in range(len(clf.centroids)):
                                cluster_file.write(str(len(clusters[c])) + '\n')
                                for xi in range(len(clusters[c])):
                                    cluster_file.write(str(clusters[c][xi][0]))
                                    for xij in range(1, len(clusters[c][xi])):
                                        cluster_file.write(' ' + str(clusters[c][xi][xij]))
                                    cluster_file.write('\n')
                        c = clf.centroids[0][0]
                        metrics.append(Metrics.clustering_evaluation(
                            "{0}".format(metricNumber), centroids=clf.centroids,
                            data=x, clf=clf, m=2.0, number_neighbors=2,
                            centroidUnique=c))
                    mean.append(np.mean(metrics))
                    std.append(np.std(metrics))
                    results.append([name[i], k, np.mean(metrics), np.std(metrics)])
                    metrics = []
                plt.clf()
                plt.title(str(name[i]) + ' - Metric')
                plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b',
                             capthick=2, barsabove=True)
                plt.xlabel('Clusters')
                plt.ylabel('Metric')
                saveName = (
                    "results/metrics/dataset_{0}/metric_{1}/algorithm_{2}/"
                    "dataset_{0}_metric_{1}_algorithm_{2}"
                    .format(eachdataset, metricNumber, name[i]))
                plt.savefig(saveName + ".pdf")
                summary = pd.DataFrame(results,
                                       columns=['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD'])
                summary.to_csv(saveName + ".csv")
    plt.tight_layout()
def main():
    """Compare KMeans and FCMeans on the egyptian-skulls dataset.

    For k in [2, 4) runs 2 simulations per algorithm, collects the GAP
    statistic and plots mean +/- std per algorithm side by side.
    """
    print("Loading dataset")
    df = pd.read_csv('data/egyptian-skulls.csv', header=None)
    # Drop the trailing label column; keep only rows without '?' marks.
    df = df.drop(len(df.columns) - 1, axis=1)
    x = df[df.apply(lambda row: sum(v == '?' for v in row) == 0, axis=1)]
    x = x.iloc[:, :].values.astype(float)
    print("Nomalizing dataset so that all dimensions are in the same scale")
    scaler = MinMaxScaler()  # renamed: 'std' was reused for a list below
    x = scaler.fit_transform(x)
    # BUG FIX: 'print x' was a Python 2 statement and a SyntaxError under
    # Python 3, which the rest of this function targets.
    print(x)
    SIMULATIONS = 2
    results = []
    name = ['KMeans', 'FCMeans']
    for i in range(len(name)):
        metrics = []
        mean = []
        std = []
        rng = range(2, 4)
        for k in rng:
            print("Number of Clusters = " + str(k) + "\n")
            for j in range(SIMULATIONS):
                print("Run ====> " + str(j))
                if name[i] == 'KMeans':
                    clf = KMeans(n_clusters=k)
                elif name[i] == 'FCMeans':
                    clf = FCMeans(n_clusters=k)
                clf.fit(x)
                # (unused ``centroids = pd.DataFrame(clf.centroids)``
                # removed -- its CSV dump was commented out.)
                metrics.append(Metrics.Metrics.gap_statistic(data=x, clf=clf))
            mean.append(np.mean(metrics))
            std.append(np.std(metrics))
            results.append([name[i], k, np.mean(metrics), np.std(metrics)])
            metrics = []
        plt.subplot(130 + (i + 1))
        plt.title(str(name[i]) + ' - Metric')
        plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b',
                     capthick=2, barsabove=True)
        plt.xlabel('Clusters')
        plt.ylabel('Metric')
    plt.tight_layout()
    plt.show()
def main():
    """Cluster the booking-website survey data with swarm algorithms.

    Sweeps k in [2, 11) for each algorithm/metric pair over 30 executions,
    storing centroids, cluster memberships, an error-bar plot and a CSV
    summary under ``results/booking/``.
    """
    num_exec = 30
    swarm_size = 30
    num_iter = 1000
    names = ['PSC', 'PSOC', 'KMPSOC', 'PSOCKM']
    print("Loading dataset")
    os.chdir('../../..')
    df = pd.read_csv('data/booking_website/booking_website_without_empty_values.csv')
    df = df.drop(['id'], axis=1)
    df = df.drop(['sexo'], axis=1)
    x = df[df.apply(lambda row: sum(v == '?' for v in row) == 0, axis=1)]
    x = x.iloc[:, :].values.astype(float)
    print("Nomalizing dataset so that all dimenions are in the same scale")
    std = MinMaxScaler()
    x = std.fit_transform(x)
    x = x[np.random.permutation(len(x))]
    for i in range(len(names)):
        metrics = []
        rng = range(2, 11)
        for metricNumber in ["intraClusterStatistic", "quantizationError",
                             "sumInterClusterDistance"]:
            print("Algorithm: " + names[i])
            mean = []
            std = []
            dff = []
            for k in rng:
                for j in tqdm(range(num_exec)):
                    # BUG FIX: the original first branch tested 'KPSO',
                    # which is not in ``names``; for 'KMPSOC' no branch
                    # matched and ``clf`` silently reused the previous
                    # algorithm's instance.
                    if names[i] == 'KMPSOC':
                        clf = KMPSOC(n_clusters=k, swarm_size=swarm_size,
                                     n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'PSOC':
                        clf = PSOC(n_clusters=k, swarm_size=swarm_size,
                                   n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'PSC':
                        clf = PSC(swarm_size=k, n_iter=num_iter, w=0.95,
                                  c1=2.05, c2=2.05, c3=1.0, c4=1.0, v_max=0.001)
                    elif names[i] == 'PSOCKM':
                        clf = PSOCKM(n_clusters=k, swarm_size=swarm_size,
                                     n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'KMeans':
                        clf = KMeans(n_clusters=k, n_iter=num_iter,
                                     shuffle=True, tolerance=0.00001)
                    elif names[i] == 'FCMeans':
                        clf = FCMeans(n_clusters=k, n_iter=num_iter,
                                      fuzzy_c=2, tolerance=0.001)
                    clf.fit(x)
                    out_dir = "results/booking/algorithm_{}/metric_{}/".format(
                        names[i], metricNumber)
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)
                    file_name = out_dir + "{}_k_{}_exec_{}.csv".format('centroids', k, j)
                    pd.DataFrame(clf.centroids).transpose().to_csv(file_name)
                    # Assign every sample to its nearest centroid.
                    clusters = {c: [] for c in clf.centroids}
                    for xi in x:
                        dist = [np.linalg.norm(xi - clf.centroids[c])
                                for c in clf.centroids]
                        clusters[dist.index(min(dist))].append(xi)
                    # One line with the cluster count, then per cluster its
                    # size followed by one line per member point.
                    with open(out_dir + "{}_k_{}_exec_{}.csv".format('clusters', k, j),
                              'w') as clusters_file:
                        clusters_file.write(str(len(clf.centroids)) + '\n')
                        for c in range(len(clf.centroids)):
                            clusters_file.write(str(len(clusters[c])) + '\n')
                            for xi in range(len(clusters[c])):
                                clusters_file.write(str(clusters[c][xi][0]))
                                for xij in range(1, len(clusters[c][xi])):
                                    clusters_file.write(' ' + str(clusters[c][xi][xij]))
                                clusters_file.write('\n')
                    c = clf.centroids[0][0]
                    metrics.append(Metrics.clustering_evaluation(
                        "{}".format(metricNumber), centroids=clf.centroids,
                        data=x, clf=clf, m=2.0, number_neighbors=2,
                        centroidUnique=c))
                mean.append(np.mean(metrics))
                std.append(np.std(metrics))
                dff.append([names[i], k, np.mean(metrics), np.std(metrics)])
                metrics = []
            plt.figure()
            figure_name = "results/booking/algorithm_{}/metric_{}/plot.png".format(
                names[i], metricNumber)
            plt.title(str(names[i]) + ' - Metric ' + metricNumber)
            plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b',
                         capthick=2, barsabove=True)
            plt.xlabel('Clusters')
            plt.ylabel('Metric')
            plt.tight_layout()
            plt.savefig(figure_name)
            save_name = "results/booking/algorithm_{}/metric_{}/output.csv".format(
                names[i], metricNumber)
            dff = pd.DataFrame(dff)
            dff.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
            dff.to_csv(save_name)
def main():
    """Recompute the intra-cluster (QE) curve for saved KMPSOC centroids.

    Reads the centroids persisted by a previous LA-CCI simulation run
    (30 simulations per k in [2, 10)), evaluates the intra-cluster
    statistic on the normalized Saida dataset and plots mean +/- std.
    """
    pathname_dir_results = '/home/elliackin/Documents/Swarm-Intelligence-Research/' \
        'Simulation-2007-LA-CCI/2007_LA-CCI_ClusteringSimulation-ID-26-Mai-2017-20h:42m:02s'
    pathname_dataset = "/home/elliackin/Documents/Swarm-Intelligence-Research/" \
        "SRC-Swarm-Intelligence/clustering-optimization/data/Saidafinal4periodo.xlsx"
    # (A dead read of gap/gap.csv and its unused PSOC/KMPSOC/PSC slices
    # were removed -- the corresponding plotting block was commented out.)
    print("Loading dataset")
    # NOTE(review): 'sheetname' is the pre-0.21 pandas keyword; newer
    # pandas expects 'sheet_name' -- confirm the project's pandas version.
    df = pd.read_excel(io=pathname_dataset, sheetname='Plan1')
    df.drop(['ID', 'Nome', 'E-mail'], axis=1, inplace=True)
    x = df.iloc[:, :].values.astype(float)
    print("Nomalizing dataset so that all dimenions are in the same scale")
    scaler = MinMaxScaler()  # renamed: 'std' is reused for a list below
    data = scaler.fit_transform(x)
    plt.figure(figsize=(12, 4))
    mean = []
    std = []
    for k in range(2, 10):
        metrics = []
        for j in range(30):
            curr_directory = pathname_dir_results + '/gap/KMPSOC/' + str(k) + '-centroids'
            filename = curr_directory + '/centroid-simulation-' + str(j) + '.csv'
            df = pd.read_csv(filename)
            # BUG FIX: DataFrame.as_matrix() was removed from pandas;
            # .values is the portable equivalent.
            cluster = df.values
            # Saved files hold one column per centroid; transpose back to
            # one row per centroid.
            raw_centroids = cluster.transpose()
            centroids = {i: raw_centroids[i] for i in range(k)}
            qe_value = Metrics.intra_cluster_statistic(data, centroids)
            metrics.append(qe_value)
        mean.append(np.mean(metrics))
        std.append(np.std(metrics))
    plt.errorbar(range(2, 10), mean, yerr=std, linewidth=0.5,
                 elinewidth=0.5, color='b')
    plt.plot(range(2, 10), mean, color='b', marker='o', linewidth=0.5,
             markersize=5)
    plt.xticks(range(2, 10))
    plt.title('KMPSOC')
    plt.ylabel('QE Measure')
    plt.xlabel('K')
    plt.tight_layout()
    plt.show()
def main():
    """Sweep k in [2, 10) for KPSO, CPSO and PSC on the Saida dataset.

    Each run's centroids are written under ``inter/``; the inter-cluster
    statistic is aggregated per k (30 runs) and drawn as an error-bar
    subplot per algorithm, with the summary saved to ``intra.csv``.
    """
    frame = pd.read_excel(io='data/Saidafinal4periodo.xlsx', sheetname='Plan1')
    frame.drop(['ID', 'Nome', 'E-mail'], 1, inplace=True)
    samples = frame.iloc[:, :].values.astype(float)
    samples = MinMaxScaler().fit_transform(samples)
    rows = []
    algorithms = ['KPSO', 'CPSO', 'PSC']
    for position, algo in enumerate(algorithms):
        scores = []
        means = []
        deviations = []
        k_values = range(2, 10)
        for k in k_values:
            for run in range(30):
                if algo == 'KPSO':
                    clf = KPSO(n_clusters=k, swarm_size=15, n_iter=500,
                               w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                elif algo == 'CPSO':
                    clf = CPSO(n_clusters=k, swarm_size=15, n_iter=500,
                               w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                elif algo == 'PSC':
                    clf = PSC(minf=0, maxf=1, swarm_size=k, n_iter=500,
                              w=0.95, v_max=0.01)
                clf.fit(samples)
                # Persist this run's centroids before scoring.
                pd.DataFrame(clf.centroids).to_csv(
                    'inter/' + algo + '/' + str(k) +
                    '-centroids/centroid-simulation' + str(run) + '.csv',
                    index=False)
                scores.append(
                    Metrics.inter_cluster_statistic(centroids=clf.centroids))
            means.append(np.mean(scores))
            deviations.append(np.std(scores))
            rows.append([algo, k, np.mean(scores), np.std(scores)])
            scores = []
        plt.subplot(130 + (position + 1))
        plt.title(str(algo) + ' - INTRA')
        plt.errorbar(k_values, means, yerr=deviations, marker='o',
                     ecolor='b', capthick=2, barsabove=True)
        plt.xlabel('Clusters')
        plt.ylabel('INTER Statistic')
    summary = pd.DataFrame(rows)
    summary.columns = ['ALGORITHM', 'CLUSTERS', 'GAP MEAN', 'GAP STD']
    summary.to_csv('intra.csv', index=False)
    plt.tight_layout()
    plt.show()
def main():
    """Cluster the 500 Cities CDC indicators with four algorithms.

    Sweeps k in [2, 15) over 30 executions per algorithm/metric pair
    (gap, silhouette, Calinski-Harabasz), persisting centroids, cluster
    memberships, an error-bar plot and a CSV summary per pair.
    """
    dataset = pd.read_csv("500_Cities_CDC.csv")
    # Every 4th column from 4 through 112 holds an indicator of interest.
    X = dataset.iloc[:, list(range(4, 113, 4))].values
    # Bring all dimensions into the [0, 1] range.
    X = MinMaxScaler().fit_transform(X)
    techniques = ['k_means', 'FC_means', 'PSOC', 'ABCC']
    metrics = ['gap', 'silhouete', 'calinskiHarabaszIndex']
    num_exec = 30
    for tec in techniques:
        rng = range(2, 15)
        met_eval = []
        for met in metrics:
            mean = []
            std = []
            dff = []
            for k in rng:
                for j in tqdm(range(num_exec),
                              desc='{} - {} - k: {}'.format(tec, met, k)):
                    if tec == 'k_means':
                        clf = KMeans(k=k)
                    elif tec == 'FC_means':
                        clf = FCMeans(k=k)
                    elif tec == 'PSOC':
                        clf = PSOC(n_clusters=k)
                    elif tec == 'ABCC':
                        clf = ABCC(n_clusters=k)
                    clf.fit(data=X)
                    out_dir = "results/booking/algorithm_{}/metric_{}/".format(tec, met)
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)
                    # Centroids are written one column per centroid.
                    pd.DataFrame(clf.centroids).transpose().to_csv(
                        out_dir + "{}_k_{}_exec_{}.csv".format('centroids', k, j))
                    # Memberships: cluster count, then per cluster its size
                    # followed by one whitespace-separated line per point.
                    clusters_file = open(
                        out_dir + "{}_k_{}_exec_{}.csv".format('clusters', k, j), 'w')
                    clusters_file.write(str(len(clf.centroids)) + '\n')
                    for c in range(len(clf.centroids)):
                        clusters_file.write(str(len(clf.clusters[c])) + '\n')
                        for xi in range(len(clf.clusters[c])):
                            clusters_file.write(str(clf.clusters[c][xi][0]))
                            for xij in range(1, len(clf.clusters[c][xi])):
                                clusters_file.write(' ' + str(clf.clusters[c][xi][xij]))
                            clusters_file.write('\n')
                    clusters_file.close()
                    if met == 'gap':
                        # The gap statistic contrasts the fitted clusters
                        # against clusters of uniformly random data.
                        clusters = clf.clusters
                        random_data = np.random.uniform(0, 1, X.shape)
                        clf.fit(data=random_data)
                        met_eval.append(
                            Metrics.gap_statistic(clusters, clf.clusters))
                    elif met == 'silhouete':
                        met_eval.append(
                            Metrics.silhouette(clf.clusters, len(X)))
                    elif met == 'calinskiHarabaszIndex':
                        met_eval.append(
                            Metrics.variance_based_ch(X, clf.centroids))
                mean.append(np.mean(met_eval))
                std.append(np.std(met_eval))
                dff.append([tec, k, np.mean(met_eval), np.std(met_eval)])
                met_eval = []
            plt.figure()
            plt.title('{} - Metric {}'.format(tec, met))
            plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b',
                         capthick=2, barsabove=True)
            plt.xlabel('Clusters')
            plt.ylabel('Metric')
            plt.tight_layout()
            plt.savefig("results/booking/algorithm_{}/metric_{}/plot.png".format(tec, met))
            summary = pd.DataFrame(dff)
            summary.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
            summary.to_csv("results/booking/algorithm_{}/metric_{}/output.csv".format(tec, met))
def _gap_curve(label, subplot_id, make_clf, rng, n_runs, x, done_msg):
    """Run ``make_clf(k)`` ``n_runs`` times per k in ``rng``, collect the
    GAP statistic and draw the mean +/- std curve into ``subplot_id``."""
    print("Start " + label + "\n")
    gaps = []
    mean = []
    std = []
    for k in rng:
        print("\t Number K = " + str(k) + "\n")
        for i in range(n_runs):
            print("\t\t Run ====> " + str(i))
            clf = make_clf(k)
            clf.fit(x)
            gaps.append(Metrics.gap_statistic(x, clf.centroids))
        mean.append(np.mean(gaps))
        std.append(np.std(gaps))
        gaps = []
    plt.figure(0)
    plt.subplot(subplot_id)
    plt.title(label + ' - GAP')
    plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b',
                 capthick=2, barsabove=True)
    plt.xlabel('K')
    plt.ylabel('GAP Statistic')
    plot_gap_avaliation(mean, std, subplot_id, label)
    print(done_msg)


def main():
    """Evaluate KPSO, CPSO and PSC with the GAP statistic on the Saida
    dataset, k in [2, 10), and show the three curves side by side.

    The three previously duplicated per-algorithm sections were factored
    into ``_gap_curve``.
    """
    print("Loading dataset")
    df = pd.read_excel(io='data/Saidafinal4periodo.xlsx', sheetname='Plan1')
    df.drop(['ID', 'Nome', 'E-mail'], axis=1, inplace=True)
    print("Number of objects in the current dataset: " + str(len(df.index)))
    numberObjects = len(df.index)
    # NOTE(review): columns are sliced by the row count as well -- looks
    # suspicious but preserved; confirm the intended column range.
    x = df.iloc[0:numberObjects, 0:numberObjects].values.astype(float)
    print("Nomalizing dataset so that all dimenions are in the same scale")
    std = MinMaxScaler()
    x = std.fit_transform(x)
    print("Creating directory to store all clustering solutions")
    PATHNAME_CLUSTERS_SOL = "clusters_solutions"
    # BUG FIX: the original used os.path.dirname("clusters_solutions"),
    # which is "", so the stat/mkdir pair (guarded by a bare except) could
    # never create the directory.
    if not os.path.isdir(PATHNAME_CLUSTERS_SOL):
        os.mkdir(PATHNAME_CLUSTERS_SOL)
    NUMBER_RUNS = 5
    rng = range(2, 10)
    _gap_curve('KPSO', 131,
               lambda k: KPSO(n_clusters=k, swarm_size=15, n_iter=100,
                              w=0.72, lb_w=0.4, c1=1.49, c2=1.49),
               rng, NUMBER_RUNS, x, "KPSO executions are completed!\n\n")
    _gap_curve('CPSO', 132,
               lambda k: CPSO(n_clusters=k, swarm_size=15, n_iter=100,
                              w=0.72, lb_w=0.4, c1=1.49, c2=1.49),
               rng, NUMBER_RUNS, x, "CPSO executions are completed!\n\n")
    _gap_curve('PSC', 133,
               lambda k: PSC(minf=0, maxf=1, swarm_size=k, n_iter=200,
                             w=0.95, v_max=0.01),
               rng, NUMBER_RUNS, x, "PSC executions are completed!\n")
    plt.tight_layout()
    plt.show()