def main(): print("Loading dataset") os.chdir('../../../..') dfs = [pd.read_csv('data/empty.csv', header=None) for k in range(28)] # iris pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None) dfs[0] = pd.read_csv('data/datasets_metrics/iris.csv', header=None) # wine pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None) dfs[1] = pd.read_csv('data/datasets_metrics/wine.csv', header=None) # glass pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', header=None) dfs[2] = pd.read_csv('data/datasets_metrics/glass.csv', header=None) # breast cancer wincosin pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None) dfs[3] = pd.read_csv('data/datasets_metrics/breast-cancer-wisconsin.csv', header=None) # wdbc pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) dfs[4] = pd.read_csv('data/datasets_metrics/wdbc.csv', header=None) # liver disorders pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/liver-disorders/bupa.data', header=None) dfs[5] = pd.read_csv('data/datasets_metrics/bupa.csv', header=None) # contraceptive method choice pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data', header=None) dfs[6] = pd.read_csv('data/datasets_metrics/cmc.csv', header=None) # tiroide pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/new-thyroid.data', header=None) dfs[7] = pd.read_csv('data/datasets_metrics/new-thyroid.csv', header=None) # dematology pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data', header=None) dfs[8] = pd.read_csv('data/datasets_metrics/dermatology.csv', header=None) # egyptian skools http://www.dm.unibo.it/~simoncin/EgyptianSkulls.html dfs[9] = df = pd.read_csv('data/datasets_metrics/egyptian-skulls.csv', header=None) # heart statlog pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat', header=None) dfs[10] = df = pd.read_csv('data/datasets_metrics/heart.csv', header=None) # ionosphere dfs[11] = df = pd.read_csv('data/datasets_metrics/ionosphere.csv', header=None) # vehicle dfs[12] = df = pd.read_csv('data/datasets_metrics/vehicle.csv', header=None) # balance scale pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data', header=None) dfs[13] = df = pd.read_csv('data/datasets_metrics/balance-scale.csv', header=None) # sonar dfs[14] = df = pd.read_csv('data/datasets_metrics/sonar.csv', header=None) # zoo pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data', header=None) dfs[15] = df = pd.read_csv('data/datasets_metrics/zoo.csv', header=None) # isolet5 dfs[16] = df = pd.read_csv('data/datasets_metrics/isolet5.csv', header=None) # movement libras dfs[17] = df = pd.read_csv('data/datasets_metrics/libras.csv', header=None) # cleveland http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data dfs[18] = df = pd.read_csv('data/datasets_metrics/cleveland.csv', header=None) # australian dfs[19] = df = pd.read_csv('data/datasets_metrics/australian.csv', header=None) dfs[20] = df = pd.read_csv('data/shapesets/compound.csv', header=None) dfs[21] = df = pd.read_csv('data/shapesets/flame.csv', header=None) dfs[22] = df = 
pd.read_csv('data/shapesets/jain.csv', header=None) dfs[23] = df = pd.read_csv('data/shapesets/r15.csv', header=None) dfs[24] = df = pd.read_csv('data/shapesets/d31.csv', header=None) dfs[25] = df = pd.read_csv('data/shapesets/spiral.csv', header=None) dfs[26] = df = pd.read_csv('data/shapesets/pathbased.csv', header=None) dfs[27] = df = pd.read_csv('data/shapesets/agregation.csv', header=None) # hill-valley # diabetes # olive # crud oil # musk version 1 # landsat satellite # heart disease len(dfs) for eachdataset in range(0, len(dfs)): df = dfs[eachdataset] df = df.drop(len(df.columns) - 1, 1) x = df[df.apply(lambda x: sum([x_ == '?' for x_ in x]) == 0, axis=1)] x = x.iloc[:, :].values.astype(float) print("Nomalizing dataset so that all dimenions are in the same scale") std = MinMaxScaler() x = std.fit_transform(x) x = x[np.random.permutation(len(x))] SIMULATIONS = 30 df = [] # name = ['KMeans', 'FCMeans'] name = ['FCMeans'] # name = ['KPSO', 'CPSO', 'PSC'] for i in range(len(name)): metrics = [] mean = [] std = [] rng = range(2, 10) # 27 for metricNumber in range(0, 26): for k in rng: print("Number of Clusters = " + str(k) + "\n") for j in range(SIMULATIONS): print("Run ====> " + str(j)) # if (name[i] == 'KPSO'): # clf = KPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49) # elif (name[i] == 'CPSO'): # clf = CPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49) # elif (name[i] == 'PSC'): # clf = PSC(minf=0, maxf=1, swarm_size=k, n_iter=500, w=0.95, v_max=0.01) if (name[i] == 'KMeans'): clf = KMeans(n_clusters=k) elif (name[i] == 'FCMeans'): clf = FCMeans(n_clusters=k, n_iter=1000) elif (name[i] == 'PSOC'): clf = PSOC(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, c1=1.49, c2=1.49) clf.fit(x) if not os.path.isdir("results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i]))+"/"): os.makedirs("results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i]))+"/") sn = "results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i])+"/") sn = sn + "dataset_{0}".format(str(eachdataset)) sn = sn + "_metric_{0}".format(str(metricNumber)) sn = sn + "_algorithm_{0}".format(str(name[i])) sn = sn + "_k_{0}".format(str(k)) sn = sn + "_simulation_{0}".format(str(j)) savecentroids = pd.DataFrame(clf.centroids) savecentroids = savecentroids.transpose() savecentroids.to_csv(sn+"_centroids.csv") clusters = {} for c in clf.centroids: clusters[c] = [] for xi in x: dist = [np.linalg.norm(xi - clf.centroids[c]) for c in clf.centroids] class_ = dist.index(min(dist)) clusters[class_].append(xi) # precisa inverter? 
file = open(sn+"_clusters.csv", 'w') file.write(str(len(clf.centroids)) + '\n') for c in range(len(clf.centroids)): file.write(str(len(clusters[c])) + '\n') for xi in range(len(clusters[c])): file.write(str(clusters[c][xi][0])) for xij in range(1, len(clusters[c][xi])): file.write(' ' + str(clusters[c][xi][xij])) file.write('\n') file.close() # if not os.path.exists('gap/' + name[i] + '/' + str(k) + '-centroids'): # os.makedirs('gap/' + name[i] + '/' + str(k) + '-centroids') # centroids.to_csv('gap/' + name[i] + '/' + str(k) + '-centroids/centroid-simulation' + str(j) + '.csv', index=False) # metrics.append(Metrics.Metrics.inter_cluster_statistic(clf)) # metrics.append(Metrics.Metrics.cluster_separation_crisp(data=x, clf=clf)) # metrics.append(Metrics.Metrics.cluster_separation_fuzzy(data=x, clf=clf, m=2.0)) # metrics.append(Metrics.Metrics.abgss(data=x, clf=clf)) # metrics.append(Metrics.Metrics.edge_index(data=x, clf=clf, number_neighbors=4)) # metrics.append(Metrics.Metrics.cluster_connectedness(data=x, clf=clf, number_neighbors=4)) # metrics.append(Metrics.Metrics.intra_cluster_statistic(clf)) # metrics.append(Metrics.Metrics.ball_hall(data=x, clf=clf)) # metrics.append( Metrics.Metrics.j_index(data=x, clf=clf, m=2.0) ) # metrics.append( Metrics.Metrics.total_within_cluster_variance(data=x, clf=clf) ) # metrics.append(Metrics.Metrics.classification_entropy(data=x, clf=clf, m=2.0)) # metrics.append(Metrics.Metrics.intra_cluster_entropy(data=x, clf=clf)) # metrics.append(Metrics.Metrics.variance_based_ch(data=x, clf=clf)) # metrics.append(Metrics.Metrics.hartigan(data=x, clf=clf)) # metrics.append(Metrics.Metrics.xu(data=x, clf=clf)) # metrics.append(Metrics.Metrics.rl(data=x, clf=clf)) # metrics.append(Metrics.Metrics.wb(data=x, clf=clf)) # metrics.append(Metrics.Metrics.xie_beni(data=x, clf=clf, m=2.0)) c = clf.centroids[0][0] # metrics.append(Metrics.Metrics.i_index(data=x, clf=clf, centroidUnique=c)) # metrics.append(Metrics.Metrics.dunn_index(data=x, clf=clf)) # metrics.append(Metrics.Metrics.davies_bouldin(data=x, clf=clf)) # metrics.append(Metrics.Metrics.cs_index(data=x, clf=clf)) # metrics.append(Metrics.Metrics.silhouette(data=x, clf=clf)) # metrics.append(Metrics.Metrics.min_max_cut(data=x, clf=clf)) # metrics.append(Metrics.Metrics.gap_statistic(data=x, clf=clf)) metrics.append(Metrics.clustering_evaluation("{0}".format(str(metricNumber)), centroids=clf.centroids, data=x, clf=clf, m=2.0, number_neighbors=2, centroidUnique=c)) mean.append(np.mean(metrics)) std.append(np.std(metrics)) df.append([name[i], k, np.mean(metrics), np.std(metrics)]) metrics = [] # plt.subplot(130 + (i + 1)) plt.clf() plt.title(str(name[i]) + ' - Metric') plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b', capthick=2, barsabove=True) plt.xlabel('Clusters') plt.ylabel('Metric') saveName = "results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i])+"/") saveName = saveName + "dataset_{0}".format(str(eachdataset)) saveName = saveName + "_metric_{0}".format(str(metricNumber)) saveName = saveName + "_algorithm_{0}".format(str(name[i])) plt.savefig(saveName+".pdf") df = pd.DataFrame(df) df.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD'] df.to_csv(saveName+".csv") mean = [] std = [] plt.tight_layout()
def main():
    num_exec = 30
    swarm_size = 30
    num_iter = 1000

    # names = ['KMeans', 'FCMeans', 'PSOC', 'PSC', 'KMPSOC', 'PSOCKM']
    names = ['PSC', 'PSOC', 'KMPSOC', 'PSOCKM']
    # names = ['PSOC', 'PSC', 'KMPSOC', 'PSOCKM']

    print("Loading dataset")
    os.chdir('../../..')
    df = pd.read_csv('data/booking_website/booking_website_without_empty_values.csv')
    df = df.drop(['id'], axis=1)
    # df = df.drop(['idade'], axis=1)
    df = df.drop(['sexo'], axis=1)
    x = df[df.apply(lambda row: sum(v == '?' for v in row) == 0, axis=1)]
    x = x.iloc[:, :].values.astype(float)

    print("Normalizing dataset so that all dimensions are in the same scale")
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    x = x[np.random.permutation(len(x))]

    for i in range(len(names)):
        metrics = []
        rng = range(2, 11)
        for metricNumber in ["intraClusterStatistic", "quantizationError", "sumInterClusterDistance"]:
        # for metricNumber in ["gap"]:
            print("Algorithm: " + names[i])
            mean = []
            std = []
            dff = []
            for k in rng:
                # print("  Number of Clusters = " + str(k))
                for j in tqdm(range(num_exec)):
                    if names[i] == 'KMPSOC':
                        clf = KMPSOC(n_clusters=k, swarm_size=swarm_size, n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'PSOC':
                        clf = PSOC(n_clusters=k, swarm_size=swarm_size, n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'PSC':
                        clf = PSC(swarm_size=k, n_iter=num_iter, w=0.95, c1=2.05, c2=2.05, c3=1.0, c4=1.0, v_max=0.001)
                    elif names[i] == 'PSOCKM':
                        clf = PSOCKM(n_clusters=k, swarm_size=swarm_size, n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'KMeans':
                        clf = KMeans(n_clusters=k, n_iter=num_iter, shuffle=True, tolerance=0.00001)
                    elif names[i] == 'FCMeans':
                        clf = FCMeans(n_clusters=k, n_iter=num_iter, fuzzy_c=2, tolerance=0.001)
                    clf.fit(x)

                    out_dir = "results/booking/algorithm_{}/metric_{}/".format(names[i], metricNumber)
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)

                    file_name = out_dir + "{}_k_{}_exec_{}.csv".format('centroids', k, j)
                    save_centroids = pd.DataFrame(clf.centroids)
                    save_centroids = save_centroids.transpose()
                    save_centroids.to_csv(file_name)

                    # assign every point to its nearest centroid
                    clusters = {}
                    for c in clf.centroids:
                        clusters[c] = []
                    for xi in x:
                        dist = [np.linalg.norm(xi - clf.centroids[c]) for c in clf.centroids]
                        class_ = dist.index(min(dist))
                        clusters[class_].append(xi)

                    clusters_file = open(out_dir + "{}_k_{}_exec_{}.csv".format('clusters', k, j), 'w')
                    clusters_file.write(str(len(clf.centroids)) + '\n')
                    for c in range(len(clf.centroids)):
                        clusters_file.write(str(len(clusters[c])) + '\n')
                        for xi in range(len(clusters[c])):
                            clusters_file.write(str(clusters[c][xi][0]))
                            for xij in range(1, len(clusters[c][xi])):
                                clusters_file.write(' ' + str(clusters[c][xi][xij]))
                            clusters_file.write('\n')
                    clusters_file.close()

                    c = clf.centroids[0][0]
                    metrics.append(Metrics.clustering_evaluation(
                        "{}".format(metricNumber), centroids=clf.centroids, data=x, clf=clf,
                        m=2.0, number_neighbors=2, centroidUnique=c))

                mean.append(np.mean(metrics))
                std.append(np.std(metrics))
                dff.append([names[i], k, np.mean(metrics), np.std(metrics)])
                metrics = []

            # plt.subplot(130 + (i + 1))
            plt.figure()
            figure_name = "results/booking/algorithm_{}/metric_{}/plot.png".format(names[i], metricNumber)
            plt.title(str(names[i]) + ' - Metric ' + metricNumber)
            plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b', capthick=2, barsabove=True)
            plt.xlabel('Clusters')
            plt.ylabel('Metric')
            plt.tight_layout()
            plt.savefig(figure_name)

            save_name = "results/booking/algorithm_{}/metric_{}/output.csv".format(names[i], metricNumber)
            dff = pd.DataFrame(dff)
            dff.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
            dff.to_csv(save_name)
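# The "clusters" files written above follow a small ad-hoc format: the first line
# holds the number of clusters, and each cluster is stored as a count line followed
# by that many space-separated point rows. A minimal sketch of a reader for that
# format is given below; read_clusters_file is a hypothetical helper, not part of
# the repository.
def read_clusters_file(path):
    """Parse a clusters file written by the loop above into a list of numpy arrays."""
    with open(path) as f:
        n_clusters = int(f.readline())
        clusters = []
        for _ in range(n_clusters):
            n_points = int(f.readline())
            points = [np.array(f.readline().split(), dtype=float) for _ in range(n_points)]
            clusters.append(np.array(points))
    return clusters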
def main():
    pathname_dir_results = '/home/elliackin/Documents/Swarm-Intelligence-Research/' \
                           'Simulation-2007-LA-CCI/2007_LA-CCI_ClusteringSimulation-ID-26-Mai-2017-20h:42m:02s'
    pathname_dataset = '/home/elliackin/Documents/Swarm-Intelligence-Research/' \
                       'SRC-Swarm-Intelligence/clustering-optimization/data/Saidafinal4periodo.xlsx'

    df = pd.read_csv(pathname_dir_results + '/gap/gap.csv')

    psoc = df[df['ALGORITHM'] == 'PSOC']
    kmpsoc = df[df['ALGORITHM'] == 'KMPSOC']
    psc = df[df['ALGORITHM'] == 'PSC']

    # i = 1
    # algos = [psoc, kmpsoc, psc]
    # plt.figure(figsize=(12, 4))
    # for algo in algos:
    #     plt.subplot(130 + i)
    #     plt.errorbar(algo['CLUSTERS'], algo['GAP MEAN'], yerr=algo['GAP STD'], linewidth=0.5, elinewidth=0.5, color='b')
    #     plt.plot(algo['CLUSTERS'], algo['GAP MEAN'], color='b', marker='o', linewidth=0.5, markersize=5)
    #     plt.xticks(algo['CLUSTERS'])
    #     plt.title(algo['ALGORITHM'].values[0] + ' - GAP')
    #     plt.ylabel('GAP Measure')
    #     plt.xlabel('Number of Clusters')
    #     i += 1
    # plt.tight_layout()
    # plt.show()

    print("Loading dataset")
    df = pd.read_excel(io=pathname_dataset, sheet_name='Plan1')
    df.drop(['ID', 'Nome', 'E-mail'], axis=1, inplace=True)
    x = df.iloc[:, :].values.astype(float)

    print("Normalizing dataset so that all dimensions are in the same scale")
    scaler = MinMaxScaler()
    data = scaler.fit_transform(x)

    qe = []
    plt.figure(figsize=(12, 4))
    mean = []
    std = []
    for k in range(2, 10):
        metrics = []
        for j in range(30):
            curr_directory = pathname_dir_results + '/gap/KMPSOC/' + str(k) + '-centroids'
            filename = curr_directory + '/centroid-simulation-' + str(j) + '.csv'
            df = pd.read_csv(filename)
            cluster = df.values
            raw_centroids = cluster.transpose()
            centroids = {}
            for i in range(k):
                centroids[i] = raw_centroids[i]
            qe_value = Metrics.intra_cluster_statistic(data, centroids)
            metrics.append(qe_value)
        mean.append(np.mean(metrics))
        std.append(np.std(metrics))

    plt.errorbar(range(2, 10), mean, yerr=std, linewidth=0.5, elinewidth=0.5, color='b')
    plt.plot(range(2, 10), mean, color='b', marker='o', linewidth=0.5, markersize=5)
    plt.xticks(range(2, 10))
    plt.title('KMPSOC')
    plt.ylabel('QE Measure')
    plt.xlabel('K')
    plt.tight_layout()
    plt.show()
def main():
    data = pd.read_excel(
        '/home/elliackin/Documents/Swarm-Intelligence-Research/SRC-Swarm-Intelligence/'
        'clustering-optimization/data/Saidafinal4periodo.xlsx',
        sheet_name='Plan1')
    data.drop(['ID', 'Nome', 'E-mail'], axis=1, inplace=True)
    x = data.iloc[:, :].values.astype(float)

    print("Normalizing dataset so that all dimensions are in the same scale")
    min_max_scaler = MinMaxScaler()
    x = min_max_scaler.fit_transform(x)

    indices_attributes = np.array([1, 7, 8, 10, 12, 13, 16])
    indices_attributes = indices_attributes - 1
    x = x[:, indices_attributes]

    # FOLDER = KMPSOC | PSC | PSOC -> ALGOS
    algorithms = ['PSOC', 'KMPSOC', 'PSC']

    mean_metric_by_algorithm = {}
    std_metric_by_algorithm = {}
    for idx_alg in range(len(algorithms)):
        mean_metric_by_algorithm[algorithms[idx_alg]] = []
        std_metric_by_algorithm[algorithms[idx_alg]] = []

        total_mean = []
        total_std = []
        n_centroids = range(2, 10)
        for k in n_centroids:
            # each simulation inside the k-centroid folder
            simulation = []
            pathname = "/home/elliackin/Documents/Swarm-Intelligence-Research" \
                       "/Simulation-2007-LA-CCI/2017_LA-CCI_ClusteringSimulation-ID-10-Jun-2017-15h:01m:27s" \
                       "/data/" + algorithms[idx_alg] + "/clusters/" + str(k) + "-centroids/"
            for file in os.listdir(pathname):
                if os.path.join(pathname, file).endswith('.csv'):
                    pathname_temp = os.path.join(pathname, file)
                    # print(pathname_temp)
                    df = pd.read_csv(pathname_temp)
                    centroids = convert_df_to_dict(df)
                    simulation.append(Metrics.intra_cluster_statistic(x, centroids))
            total_mean.append(np.array(simulation).mean())
            total_std.append(np.array(simulation).std())

        mean_metric_by_algorithm[algorithms[idx_alg]] = total_mean
        std_metric_by_algorithm[algorithms[idx_alg]] = total_std

    min_value_mean = np.inf
    max_value_mean = -np.inf
    min_value_std = np.inf
    max_value_std = -np.inf
    for idx_alg in range(len(algorithms)):
        curr_min = np.amin(mean_metric_by_algorithm[algorithms[idx_alg]])
        curr_max = np.amax(mean_metric_by_algorithm[algorithms[idx_alg]])
        min_value_mean = np.minimum(min_value_mean, curr_min)
        max_value_mean = np.maximum(max_value_mean, curr_max)
        curr_min = np.amin(std_metric_by_algorithm[algorithms[idx_alg]])
        curr_max = np.amax(std_metric_by_algorithm[algorithms[idx_alg]])
        min_value_std = np.minimum(min_value_std, curr_min)
        max_value_std = np.maximum(max_value_std, curr_max)

    # plt.figure(figsize=(12, 4))
    pathname_out = "/home/elliackin/Documents/Swarm-Intelligence-Research/Simulation-2007-LA-CCI/Figuras LACCI/"
    for idx_alg in range(len(algorithms)):
        total_mean = mean_metric_by_algorithm[algorithms[idx_alg]]
        total_std = std_metric_by_algorithm[algorithms[idx_alg]]
        plt.figure(idx_alg)
        plt.errorbar(n_centroids, total_mean, yerr=total_std, linewidth=0.5, elinewidth=0.5, color='b')
        plt.plot(n_centroids, total_mean, color='b', marker='o', linewidth=0.5, markersize=5)
        plt.xticks(n_centroids)
        plt.title(algorithms[idx_alg] + ' - INTRA CLUSTER SUM')
        plt.ylabel('Intra Cluster Sum')
        plt.xlabel('Number of Clusters (k)')
        ymin = min_value_mean - min_value_std
        ymax = max_value_mean + max_value_std
        delta = ymax - ymin
        plt.ylim([ymin - 0.5 * delta, ymax + 0.5 * delta])
        plt.tight_layout()
        plt.savefig(pathname_out + algorithms[idx_alg] + "-SSW.pdf")
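# convert_df_to_dict is defined elsewhere in the repository; a minimal sketch of
# what it is assumed to do is given below, based on how the centroid CSVs are
# written by these scripts (one centroid per row after the transpose): it turns
# the DataFrame into a dict mapping each cluster index to its centroid vector,
# which is the shape Metrics.intra_cluster_statistic expects here.
def convert_df_to_dict(df):
    """Map each row of a centroid DataFrame to {cluster_index: centroid_vector}."""
    values = df.values.astype(float)
    return {i: values[i] for i in range(len(values))}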
def main(parameters_simulation=None):
    data_file = open(parameters_simulation)
    param_simultations = json.load(data_file)

    pathname_dataset = param_simultations["pathname_dataset"]
    pathname_dir_results = param_simultations["pathname_dir_results"]
    num_simulations = param_simultations["NUN_SIMULATIONS"]  # 30
    num_trials = param_simultations["NUM_TRIALS"]  # 50
    num_iterations = param_simultations["NUM_ITERATIONS"]  # 500
    swarm_size = param_simultations["SWARM_SIZE"]  # 15
    name_classifier = param_simultations["ALGORITHMS"]
    clustering_metrics = param_simultations["EVALUATION_METRICS"]

    criteria = []
    table_criteria = {}
    for idx_alg in range(len(clustering_metrics)):
        value = clustering_metrics[idx_alg]
        criteria.append(value['criteria'])
        table_criteria[value['criteria']] = (clustering_metrics[idx_alg]['name'],
                                             clustering_metrics[idx_alg]['file'])

    time.sleep(2)

    pathname_output = pathname_dir_results + '/2017_LA-CCI_ClusteringSimulation'
    currDirectory = (pathname_output + '-ID-' + datetime.now().strftime('%d-%b-%Y-%Hh:%Mm:%Ss'))
    pathname_output = currDirectory
    if not os.path.exists(pathname_output):
        os.makedirs(pathname_output)
    else:
        raise Exception("This simulation cannot execute!")

    time.sleep(1)

    timestamp = datetime.now().strftime('%a, %d %b %Y at %Hh:%Mm:%Ss')
    if not os.path.exists(pathname_output + '/timestamp.txt'):
        file_timestamp = open(pathname_output + '/timestamp.txt', 'a')
        file_timestamp.write('This simulation started on ' + timestamp)
        file_timestamp.close()
        print("This simulation started on " + timestamp)
    else:
        raise Exception("This simulation cannot execute!")

    print("Loading dataset")
    df = pd.read_excel(io=pathname_dataset, sheet_name='Plan1')
    df.drop(['ID', 'Nome', 'E-mail'], axis=1, inplace=True)
    print(pathname_dataset)
    x = df.iloc[:, :].values.astype(float)

    print("Normalizing dataset so that all dimensions are in the same scale")
    min_max_scaler = MinMaxScaler()
    x = min_max_scaler.fit_transform(x)

    indices_attributes_4p = np.array([1, 7, 8, 10, 12, 13, 16])
    indices_attributes_4p = indices_attributes_4p - 1
    indices_attributes_5p = np.array([1, 7, 8, 10, 12, 13])
    indices_attributes_5p = indices_attributes_5p - 1
    x = x[:, indices_attributes_4p]
    # x = np.array(x)

    # The term clf means classifier
    # name_classifier = ['PSOC']
    # name_classifier = ['PSC']

    KList = range(2, 10)

    idx_success_simulation = {}
    idx_trial = {}
    idx_fail_simulation = {}
    check_sim_its_over = {}
    for idx_alg in range(len(name_classifier)):
        idx_success_simulation[name_classifier[idx_alg]] = {}
        idx_trial[name_classifier[idx_alg]] = {}
        idx_fail_simulation[name_classifier[idx_alg]] = {}
        check_sim_its_over[name_classifier[idx_alg]] = {}
        for k in KList:
            idx_success_simulation[name_classifier[idx_alg]][k] = 0
            idx_trial[name_classifier[idx_alg]][k] = 0
            idx_fail_simulation[name_classifier[idx_alg]][k] = 0
            check_sim_its_over[name_classifier[idx_alg]][k] = False

    metrics_list_sim_by_algorithm_and_k = {}
    mean_metric_by_algorithm_and_k = {}
    std_metric_by_algorithm_and_k = {}
    for m in range(len(criteria)):
        mean_metric_by_algorithm_and_k[criteria[m]] = {}
        std_metric_by_algorithm_and_k[criteria[m]] = {}

    for m in range(len(criteria)):
        metrics_list_sim_by_algorithm_and_k[criteria[m]] = {}
        for idx_alg in range(len(name_classifier)):
            metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]] = {}
            for k in KList:
                metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][k] = []

    min_value_mean = {}
    max_value_mean = {}
    min_value_std = {}
    max_value_std = {}
    for m in range(len(criteria)):
        min_value_mean[criteria[m]] = np.inf
        max_value_mean[criteria[m]] = -np.inf
        min_value_std[criteria[m]] = np.inf
        max_value_std[criteria[m]] = -np.inf

    # Run the simulation while
    # (idx_success_simulation < num_simulations) and (idx_trial < num_trials)
    finished = False
    idx_fig = 1
    Round = 1
    while not finished:
        list_results_by_metric = {}
        for m in range(len(criteria)):
            list_results_by_metric[criteria[m]] = []

        for idx_alg in range(len(name_classifier)):
            for m in range(len(criteria)):
                mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]] = np.zeros((len(KList),))
                std_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]] = np.zeros((len(KList),))

            for k in KList:
                if not check_sim_its_over[name_classifier[idx_alg]][k]:
                    create_dir_algorithms_and_k(idx_alg, k, name_classifier, pathname_output)

                    pathname_output_clusters = pathname_output + '/data/' + name_classifier[idx_alg] + '/' \
                        + '/clusters/' + str(k) + '-centroids'
                    pathname_output_metrics = pathname_output + '/data/' + name_classifier[idx_alg] + '/' \
                        + '/metrics/' + str(k) + '-centroids'
                    pathname_output_evolution_success = pathname_output + '/data/' + name_classifier[idx_alg] + '/' \
                        + '/evolution/success/' + str(k) + '-centroids'
                    pathname_output_evolution_fail = pathname_output + '/data/' + name_classifier[idx_alg] + '/' \
                        + '/evolution/fail/' + str(k) + '-centroids'

                    if name_classifier[idx_alg] == 'KMPSOC':
                        clf = KMPSOC(n_clusters=k, swarm_size=swarm_size, n_iter=num_iterations, w=0.72, c1=1.49, c2=1.49)
                    elif name_classifier[idx_alg] == 'PSOC':
                        clf = PSOC(n_clusters=k, swarm_size=swarm_size, n_iter=num_iterations, w=0.72, c1=1.49, c2=1.49)
                    elif name_classifier[idx_alg] == 'PSOCKM':
                        clf = PSOCKM(n_clusters=k, swarm_size=swarm_size, n_iter=num_iterations, w=0.72, c1=1.49, c2=1.49)
                    elif name_classifier[idx_alg] == 'KMeans':
                        clf = KMeans(n_clusters=k)
                    elif name_classifier[idx_alg] == 'PSC':
                        clf = PSC(swarm_size=k, n_iter=num_iterations, w=0.95, c1=2.05, c2=2.05, c3=1.0, c4=1.0, v_max=0.01)

                    clf.fit(x)
                    centroids = clf.centroids
                    Round = Round + 1

                    if clf.solution.number_of_effective_clusters == k:
                        filename = pathname_output_clusters + '/centroid-' + str(k) + '-success-simulation-' + \
                            str(idx_success_simulation[name_classifier[idx_alg]][k] + 1) + '.csv'
                    else:
                        filename = pathname_output_clusters + '/centroid-' + str(k) + '-fail-simulation-' + \
                            str(idx_fail_simulation[name_classifier[idx_alg]][k] + 1) + '.csv'
                    dataframe_centroids = pd.DataFrame(centroids)
                    dataframe_centroids.transpose().to_csv(filename, sep=" ", index=False)

                    if clf.solution.number_of_effective_clusters == k:
                        filename = pathname_output_clusters + '/cluster-' + str(k) + '-success-simulation-' + \
                            str(idx_success_simulation[name_classifier[idx_alg]][k] + 1) + '.cluster'
                    else:
                        filename = pathname_output_clusters + '/cluster-' + str(k) + '-fail-simulation-' + \
                            str(idx_fail_simulation[name_classifier[idx_alg]][k] + 1) + '.cluster'

                    cluster_file = open(filename, 'w')
                    cluster_file.write(str(len(clf.centroids)) + '\n')
                    cluster_file.write(str(clf.solution.number_of_effective_clusters) + '\n')
                    for c in range(len(clf.centroids)):
                        if len(clf.solution.clusters[c]) > 0:
                            cluster_file.write(str(len(clf.solution.clusters[c])) + '\n')
                            for xi in range(len(clf.solution.clusters[c])):
                                cluster_file.write(str(clf.solution.clusters[c][xi][0]))
                                for xij in range(1, len(clf.solution.clusters[c][xi])):
                                    cluster_file.write(' ' + str(clf.solution.clusters[c][xi][xij]))
                                cluster_file.write('\n')
                    cluster_file.close()

                    if clf.solution.number_of_effective_clusters == k:
                        evol_directory = pathname_output_evolution_success + '/evolution-' + str(k) + \
                            '-success-simulation-' + str(idx_success_simulation[name_classifier[idx_alg]][k] + 1)
                        if not os.path.exists(evol_directory):
                            os.makedirs(evol_directory)
                        store_evolution(evol_directory, clf.debugger, k,
                                        idx_success_simulation[name_classifier[idx_alg]][k] + 1)
                    else:
                        evol_directory = pathname_output_evolution_fail + '/evolution-' + str(k) + \
                            '-fail-simulation-' + str(idx_fail_simulation[name_classifier[idx_alg]][k] + 1)
                        if not os.path.exists(evol_directory):
                            os.makedirs(evol_directory)
                        store_evolution(evol_directory, clf.debugger, k,
                                        idx_fail_simulation[name_classifier[idx_alg]][k] + 1)

                    if clf.solution.number_of_effective_clusters == k:
                        for m in range(len(criteria)):
                            value = Metrics.clustering_evaluation(criteria=criteria[m], centroids=centroids, data=x, clf=clf)
                            metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][k].append(value)
                        for m in range(len(criteria)):
                            pd.DataFrame(metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][k]).to_csv(
                                pathname_output_metrics + '/' + table_criteria[criteria[m]][1] + '.csv', sep=" ")

                    if clf.solution.number_of_effective_clusters == k:
                        idx_success_simulation[name_classifier[idx_alg]][k] = \
                            idx_success_simulation[name_classifier[idx_alg]][k] + 1
                    else:
                        idx_fail_simulation[name_classifier[idx_alg]][k] = \
                            idx_fail_simulation[name_classifier[idx_alg]][k] + 1

                    idx_trial[name_classifier[idx_alg]][k] = idx_trial[name_classifier[idx_alg]][k] + 1

                    if (idx_success_simulation[name_classifier[idx_alg]][k] >= num_simulations
                            or idx_trial[name_classifier[idx_alg]][k] >= num_trials):
                        check_sim_its_over[name_classifier[idx_alg]][k] = True

                    for d in check_sim_its_over.values():
                        if not all(d.values()):
                            break
                    else:
                        finished = True

                    print("Round(" + str(Round) + ") ........................ " +
                          "(SUCCESS = " + str(idx_success_simulation[name_classifier[idx_alg]][k]) + ", " +
                          "FAIL = " + str(idx_fail_simulation[name_classifier[idx_alg]][k]) + ", " +
                          "TRIAL = " + str(idx_trial[name_classifier[idx_alg]][k]) + ", " +
                          "CLF = " + name_classifier[idx_alg] + ", K = " + str(k) + ")" + "\n")

            for m in range(len(criteria)):
                idx_k = 0
                for ik in KList:
                    if len(metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][ik]) > 0:
                        mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][idx_k] = \
                            np.mean(metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][ik])
                        std_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][idx_k] = \
                            np.std(metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][ik])
                        list_results_by_metric[criteria[m]].append([
                            name_classifier[idx_alg],
                            ik,
                            np.mean(metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][ik]),
                            np.std(metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][ik])
                        ])
                    idx_k = idx_k + 1

            for m in range(len(criteria)):
                min_value_mean[criteria[m]] = np.inf
                max_value_mean[criteria[m]] = -np.inf
                max_value_std[criteria[m]] = -np.inf

            for m in range(len(criteria)):
                min_value_mean[criteria[m]] = np.amin(
                    mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]])
                max_value_mean[criteria[m]] = np.amax(
                    mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]])
                max_value_std[criteria[m]] = np.amax(
                    std_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]])

            for m in range(len(criteria)):
                # print(mean_metric_by_algorithm_and_k)
                plt.figure()
                plt.title(str(name_classifier[idx_alg]) + ' - ' + table_criteria[criteria[m]][0])
                # plt.errorbar(KList, mean_metric_by_algorithm_and_k[criteria[m]], yerr=std_metric_by_algorithm_and_k[criteria[m]], linewidth=0.5, elinewidth=0.5, color='b', capthick=2, barsabove=True)
                plt.errorbar(KList,
                             mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]],
                             yerr=(std_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]]),
                             linewidth=0.5, elinewidth=0.5, color='b')
                plt.plot(KList,
                         mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]],
                         color='b', marker='o', linewidth=0.5, markersize=5)
                plt.xlabel('K')
                plt.ylabel(table_criteria[criteria[m]][0])
                ymin = min_value_mean[criteria[m]] - max_value_std[criteria[m]]
                ymax = max_value_mean[criteria[m]] + max_value_std[criteria[m]]
                delta = ymax - ymin
                plt.ylim([ymin - 0.01 * delta, ymax + 0.01 * delta])
                plt.tight_layout()
                plt.savefig(pathname_output + '/data/' + name_classifier[idx_alg] + "/" +
                            name_classifier[idx_alg] + "-" + table_criteria[criteria[m]][1] + '.pdf')
                plt.close("all")

        for m in range(len(criteria)):
            df_by_metric = pd.DataFrame(list_results_by_metric[criteria[m]])
            df_by_metric.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
            df_by_metric.to_csv(pathname_output + '/data/' + table_criteria[criteria[m]][1] + '.csv', index=False)

    # mean_metric_by_algorithm[name_classifier[idx_alg]] = mean_metric_by_algorithm_and_k
    # std_metric_by_algorithm[name_classifier[idx_alg]] = std_metric_by_algorithm_and_k

    for m in range(len(criteria)):
        min_value_mean[criteria[m]] = np.inf
        max_value_mean[criteria[m]] = -np.inf
        max_value_std[criteria[m]] = -np.inf

    for m in range(len(criteria)):
        for idx_alg in range(len(name_classifier)):
            curr_min_mean = np.amin(mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]])
            curr_max_mean = np.amax(mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]])
            min_value_mean[criteria[m]] = np.minimum(min_value_mean[criteria[m]], curr_min_mean)
            max_value_mean[criteria[m]] = np.maximum(max_value_mean[criteria[m]], curr_max_mean)
            curr_max_std = np.amax(std_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]])
            max_value_std[criteria[m]] = np.maximum(max_value_std[criteria[m]], curr_max_std)

    for m in range(len(criteria)):
        for idx_alg in range(len(name_classifier)):
            plt.figure()
            plt.title(str(name_classifier[idx_alg]) + ' - ' + table_criteria[criteria[m]][0])
            plt.errorbar(KList,
                         mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]],
                         yerr=std_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]],
                         linewidth=0.5, elinewidth=0.5, color='b')
            plt.plot(KList,
                     mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]],
                     color='b', marker='o', linewidth=0.5, markersize=5)
            plt.xlabel('K')
            plt.ylabel(table_criteria[criteria[m]][0])
            ymin = min_value_mean[criteria[m]] - max_value_std[criteria[m]]
            ymax = max_value_mean[criteria[m]] + max_value_std[criteria[m]]
            delta = ymax - ymin
            plt.ylim([ymin - 0.01 * delta, ymax + 0.01 * delta])
            plt.tight_layout()
            plt.savefig(pathname_output + '/data/' + name_classifier[idx_alg] + "/" +
                        name_classifier[idx_alg] + "-" + table_criteria[criteria[m]][1] + '_final.pdf')
            plt.close("all")
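# main() above expects a JSON parameters file whose keys match the lookups at the
# top of the function. A minimal sketch of how such a file could be generated is
# shown below; the concrete values and the example metric entry are illustrative
# only (the inline comments in main() suggest 30 simulations, 50 trials, 500
# iterations and a swarm size of 15), and "quantizationError" is one of the
# criteria names used elsewhere in these scripts. The key "NUN_SIMULATIONS" is
# spelled exactly as the code reads it.
import json

example_parameters = {
    "pathname_dataset": "data/Saidafinal4periodo.xlsx",
    "pathname_dir_results": "results",
    "NUN_SIMULATIONS": 30,
    "NUM_TRIALS": 50,
    "NUM_ITERATIONS": 500,
    "SWARM_SIZE": 15,
    "ALGORITHMS": ["PSOC", "KMPSOC", "PSC"],
    "EVALUATION_METRICS": [
        {"criteria": "quantizationError",
         "name": "Quantization Error",
         "file": "quantization-error"}
    ]
}

with open("parameters_simulation.json", "w") as f:
    json.dump(example_parameters, f, indent=4)

# usage (hypothetical): main(parameters_simulation="parameters_simulation.json")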
def main(): print("Loading dataset") eachdataset = 072017 df = pd.read_excel(io='data/Saidafinal4periodo.xlsx', sheetname='Plan1') df.drop(['ID', 'Nome', 'E-mail'], 1, inplace=True) x = df.iloc[:, :].values.astype(float) print("Nomalizing dataset so that all dimenions are in the same scale") std_metric_by_algorithm_and_k = MinMaxScaler() x = std_metric_by_algorithm_and_k.fit_transform(x) indices_attributes = np.array([1,7,8,10,12,13,16]) indices_attributes = indices_attributes -1 x = x[:,indices_attributes] SIMULATIONS = 30 # name = ['KMeans', 'FCMeans'] name = ['PSOC'] # name = ['KPSO', 'CPSO', 'PSC'] for i in range(len(name)): metrics = [] mean = [] std = [] rng = range(2, 10) for metricNumber in range(1, 26): for k in rng: print("Number of Clusters = " + str(k) + "\n") for j in range(SIMULATIONS): print("Run ====> " + str(j)) # if (name[i] == 'KPSO'): # clf = KPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49) # elif (name[i] == 'CPSO'): # clf = CPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49) # elif (name[i] == 'PSC'): # clf = PSC(minf=0, maxf=1, swarm_size=k, n_iter=500, w=0.95, v_max=0.01) if (name[i] == 'KMeans'): clf = KMeans(n_clusters=k) elif (name[i] == 'FCMeans'): clf = FCMeans(n_clusters=k) elif (name[i] == 'PSOC'): clf = PSOC(n_clusters=k, swarm_size=30, n_iter=1000, w=0.72, c1=1.49, c2=1.49) clf.fit(x) if not os.path.isdir("results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i]))+"/"): os.makedirs("results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i]))+"/") sn = "results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i])+"/") sn = sn + "dataset_{0}".format(str(eachdataset)) sn = sn + "_metric_{0}".format(str(metricNumber)) sn = sn + "_algorithm_{0}".format(str(name[i])) sn = sn + "_k_{0}".format(str(k)) sn = sn + "_simulation_{0}".format(str(j)) savecentroids = pd.DataFrame(clf.centroids) savecentroids = savecentroids.transpose() savecentroids.to_csv(sn+"_centroids.csv") # clust = Metrics.get_clusters(x, clf.centroids) # # print np.array(clust[0]) # c = [] # for ii in range(len(clust)): # c.append(np.array(clust[ii])) # sc = pd.DataFrame(c) # # precisa inverter? 
# sc.to_csv(sn+"_clusters.csv") file = open(sn+"_clusters.csv", 'w') file.write(str(len(clf.centroids)) + '\n') file.write(str(clf.solution.number_of_effective_clusters)+ '\n') for c in range(len(clf.centroids)): file.write(str(len(clf.solution.clusters[c])) + '\n') for xi in range(len(clf.solution.clusters[c])): file.write(str(clf.solution.clusters[c][xi][0])) for xij in range(1,len(clf.solution.clusters[c][xi])): file.write(' ' + str(clf.solution.clusters[c][xi][xij])) file.write('\n') file.close() # if not os.path.exists('gap/' + name[i] + '/' + str(k) + '-centroids'): # os.makedirs('gap/' + name[i] + '/' + str(k) + '-centroids') # centroids.to_csv('gap/' + name[i] + '/' + str(k) + '-centroids/centroid-simulation' + str(j) + '.csv', index=False) # metrics.append(Metrics.Metrics.inter_cluster_statistic(clf)) # metrics.append(Metrics.Metrics.cluster_separation_crisp(data=x, clf=clf)) # metrics.append(Metrics.Metrics.cluster_separation_fuzzy(data=x, clf=clf, m=2.0)) # metrics.append(Metrics.Metrics.abgss(data=x, clf=clf)) # metrics.append(Metrics.Metrics.edge_index(data=x, clf=clf, number_neighbors=4)) # metrics.append(Metrics.Metrics.cluster_connectedness(data=x, clf=clf, number_neighbors=4)) # metrics.append(Metrics.Metrics.intra_cluster_statistic(clf)) # metrics.append(Metrics.Metrics.ball_hall(data=x, clf=clf)) # metrics.append( Metrics.Metrics.j_index(data=x, clf=clf, m=2.0) ) # metrics.append( Metrics.Metrics.total_within_cluster_variance(data=x, clf=clf) ) # metrics.append(Metrics.Metrics.classification_entropy(data=x, clf=clf, m=2.0)) # metrics.append(Metrics.Metrics.intra_cluster_entropy(data=x, clf=clf)) # metrics.append(Metrics.Metrics.variance_based_ch(data=x, clf=clf)) # metrics.append(Metrics.Metrics.hartigan(data=x, clf=clf)) # metrics.append(Metrics.Metrics.xu(data=x, clf=clf)) # metrics.append(Metrics.Metrics.rl(data=x, clf=clf)) # metrics.append(Metrics.Metrics.wb(data=x, clf=clf)) # metrics.append(Metrics.Metrics.xie_beni(data=x, clf=clf, m=2.0)) c = clf.centroids[0][0] # metrics.append(Metrics.Metrics.i_index(data=x, clf=clf, centroidUnique=c)) # metrics.append(Metrics.Metrics.dunn_index(data=x, clf=clf)) # metrics.append(Metrics.Metrics.davies_bouldin(data=x, clf=clf)) # metrics.append(Metrics.Metrics.cs_index(data=x, clf=clf)) # metrics.append(Metrics.Metrics.silhouette(data=x, clf=clf)) # metrics.append(Metrics.Metrics.min_max_cut(data=x, clf=clf)) # metrics.append(Metrics.Metrics.gap_statistic(data=x, clf=clf)) metrics.append(Metrics.clustering_evaluation("{0}".format(str(metricNumber)), centroids=clf.centroids, data=x, clf=clf, m=2.0, number_neighbors=2, centroidUnique=c)) mean.append(np.mean(metrics)) std.append(np.std(metrics)) df.append([name[i], k, np.mean(metrics), np.std(metrics)]) metrics = [] # plt.subplot(130 + (i + 1)) plt.clf() plt.title(str(name[i]) + ' - Metric') plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b', capthick=2, barsabove=True) plt.xlabel('Clusters') plt.ylabel('Metric') saveName = "results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i])+"/") saveName = saveName + "dataset_{0}".format(str(eachdataset)) saveName = saveName + "_metric_{0}".format(str(metricNumber)) saveName = saveName + "_algorithm_{0}".format(str(name[i])) plt.savefig(saveName+".pdf") df = pd.DataFrame(df) df.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD'] df.to_csv(saveName+".csv") mean = [] std = [] plt.tight_layout()