Example #1
import numpy as np
from matplotlib import cm
from sklearn.preprocessing import MinMaxScaler
from bokeh.plotting import figure, show
from bokeh.models import WMTSTileSource
from bokeh.layouts import gridplot
# latlng_to_meters is assumed to be a project helper that converts
# WGS84 lat/lon to Web Mercator meters

def plot_map(lat, lon, color=None, size=10):
    center = [lat[0], lon[0]]
    cmap = cm.rainbow
    wlat, wlong = latlng_to_meters(lat, lon)
    colors = []
    if color is not None:
        # map the color values onto colormap indices in [0, 255]
        scaled = MinMaxScaler(feature_range=(0, 255)).fit_transform(
            np.asarray(color).reshape(-1, 1))
        colors = [
            "#%02x%02x%02x" % tuple([int(j * 255) for j in cmap(int(i[0]))[:3]])
            for i in scaled
        ]
    wlat0, wlong0 = latlng_to_meters(center[0], center[1])
    wlat = np.append(wlat, wlat0)
    wlong = np.append(wlong, wlong0)
    if not colors:
        # with no color data, give every point the same default color so the
        # color list still matches the number of points
        colors = ['#1f77b4'] * (len(wlat) - 1)
    colors.append('#010002')  # near-black marker for the appended center point
    openmap_url = 'http://c.tile.openstreetmap.org/{Z}/{X}/{Y}.png'
    otile_url = 'http://otile1.mqcdn.com/tiles/1.0.0/sat/{Z}/{X}/{Y}.jpg'  # alternative satellite tiles (unused)
    TILES = WMTSTileSource(url=openmap_url)
    tools = "pan,wheel_zoom,reset"
    p = figure(tools=tools,
               plot_width=700,
               plot_height=600,
               x_axis_type="mercator",
               y_axis_type="mercator")
    p.circle(wlat0, wlong0, color='#dc3826', size=size + 10)
    p.circle(np.array(wlat),
             np.array(wlong),
             color=colors,
             size=size,
             alpha=0.5)
    p.add_tile(TILES)
    p.axis.visible = False
    pb = gridplot([[p]])
    show(pb)
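A minimal usage sketch for plot_map, using made-up coordinates; it assumes the imports above plus the latlng_to_meters helper are available:
lat = [51.5074, 51.5080, 51.5090]   # hypothetical latitudes
lon = [-0.1278, -0.1290, -0.1300]   # hypothetical longitudes
intensity = [1.0, 2.5, 4.0]         # any per-point scalar to color by
plot_map(lat, lon, color=intensity, size=12)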
Example #2
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def _add_color_column(fire_spots):
    n_colors = fire_spots.n_reports.unique().shape[0]

    # evenly spaced values in [0, 1], one per distinct n_reports value
    colors = MinMaxScaler().fit_transform(
        np.arange(n_colors).astype(float).reshape(-1, 1))
    colors = (colors - 1.0) * (-1.0)  # invert the scale: 0 -> 1, 1 -> 0
    # RGB triples with an empty green channel: magenta fading to black
    colors = np.hstack([colors, np.zeros(n_colors).reshape(-1, 1), colors])
    colors_df = pd.DataFrame(colors)

    colors = []
    for _, row in colors_df.iterrows():
        rgb_color = tuple((row.values * 255).astype(int))
        colors.append('#%02x%02x%02x' % rgb_color)

    cluster_freqs = list(fire_spots.groupby(['n_reports']).groups.keys())
    cluster_freqs = pd.DataFrame(sorted(cluster_freqs), columns=['n_reports'])

    colors_df = pd.DataFrame(colors, columns=['color'])
    cluster_colors = pd.concat([cluster_freqs, colors_df], axis=1)

    return pd.merge(fire_spots, cluster_colors, on='n_reports')
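A quick usage sketch for _add_color_column, assuming a fire_spots DataFrame with an n_reports column (the sample values are made up):
fire_spots = pd.DataFrame({
    'lat': [-8.05, -8.06, -8.07, -8.08],
    'lon': [-34.90, -34.91, -34.92, -34.93],
    'n_reports': [1, 2, 2, 3],
})
colored = _add_color_column(fire_spots)
print(colored[['n_reports', 'color']])  # one hex color per n_reports level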
Example #3
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# KMeans, FCMeans, PSOC and Metrics are assumed to be this project's own
# modules (the constructors below do not match scikit-learn's signatures)

def main():
    print("Loading dataset")
    os.chdir('../../../..')
    dfs = [pd.read_csv('data/empty.csv', header=None) for k in range(28)]
    # iris pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
    dfs[0] = pd.read_csv('data/datasets_metrics/iris.csv', header=None)
    # wine pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
    dfs[1] = pd.read_csv('data/datasets_metrics/wine.csv', header=None)
    # glass pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', header=None)
    dfs[2] = pd.read_csv('data/datasets_metrics/glass.csv', header=None)
    # breast cancer Wisconsin pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None)
    dfs[3] = pd.read_csv('data/datasets_metrics/breast-cancer-wisconsin.csv', header=None)
    # wdbc pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)
    dfs[4] = pd.read_csv('data/datasets_metrics/wdbc.csv', header=None)
    # liver disorders pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/liver-disorders/bupa.data', header=None)
    dfs[5] = pd.read_csv('data/datasets_metrics/bupa.csv', header=None)
    # contraceptive method choice pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data', header=None)
    dfs[6] = pd.read_csv('data/datasets_metrics/cmc.csv', header=None)
    # thyroid pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/new-thyroid.data', header=None)
    dfs[7] = pd.read_csv('data/datasets_metrics/new-thyroid.csv', header=None)
    # dermatology pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data', header=None)
    dfs[8] = pd.read_csv('data/datasets_metrics/dermatology.csv', header=None)
    # Egyptian skulls http://www.dm.unibo.it/~simoncin/EgyptianSkulls.html
    dfs[9] = pd.read_csv('data/datasets_metrics/egyptian-skulls.csv', header=None)
    # heart statlog pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat', header=None)
    dfs[10] = pd.read_csv('data/datasets_metrics/heart.csv', header=None)
    # ionosphere
    dfs[11] = pd.read_csv('data/datasets_metrics/ionosphere.csv', header=None)
    # vehicle
    dfs[12] = pd.read_csv('data/datasets_metrics/vehicle.csv', header=None)
    # balance scale pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data', header=None)
    dfs[13] = pd.read_csv('data/datasets_metrics/balance-scale.csv', header=None)
    # sonar
    dfs[14] = pd.read_csv('data/datasets_metrics/sonar.csv', header=None)
    # zoo pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data', header=None)
    dfs[15] = pd.read_csv('data/datasets_metrics/zoo.csv', header=None)
    # isolet5
    dfs[16] = pd.read_csv('data/datasets_metrics/isolet5.csv', header=None)
    # movement libras
    dfs[17] = pd.read_csv('data/datasets_metrics/libras.csv', header=None)
    # cleveland http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
    dfs[18] = pd.read_csv('data/datasets_metrics/cleveland.csv', header=None)
    # australian
    dfs[19] = pd.read_csv('data/datasets_metrics/australian.csv', header=None)
    dfs[20] = pd.read_csv('data/shapesets/compound.csv', header=None)
    dfs[21] = pd.read_csv('data/shapesets/flame.csv', header=None)
    dfs[22] = pd.read_csv('data/shapesets/jain.csv', header=None)
    dfs[23] = pd.read_csv('data/shapesets/r15.csv', header=None)
    dfs[24] = pd.read_csv('data/shapesets/d31.csv', header=None)
    dfs[25] = pd.read_csv('data/shapesets/spiral.csv', header=None)
    dfs[26] = pd.read_csv('data/shapesets/pathbased.csv', header=None)
    dfs[27] = pd.read_csv('data/shapesets/agregation.csv', header=None)
    # hill-valley
    # diabetes
    # olive
    # crude oil
    # musk version 1
    # landsat satellite
    # heart disease
    for eachdataset in range(len(dfs)):
        df = dfs[eachdataset]
        df = df.drop(len(df.columns) - 1, axis=1)  # drop the label column
        # keep only rows without missing values (marked '?')
        x = df[df.apply(lambda x: sum([x_ == '?' for x_ in x]) == 0, axis=1)]
        x = x.iloc[:, :].values.astype(float)
        print("Normalizing dataset so that all dimensions are on the same scale")
        std = MinMaxScaler()
        x = std.fit_transform(x)
        x = x[np.random.permutation(len(x))]
        SIMULATIONS = 30
        df = []
        # name = ['KMeans', 'FCMeans']
        name = ['FCMeans']
        # name = ['KPSO', 'CPSO', 'PSC']
        for i in range(len(name)):
            metrics = []
            mean = []
            std = []
            rng = range(2, 10)
            # 27
            for metricNumber in range(0, 26):
                for k in rng:
                    print("Number of Clusters = " + str(k) + "\n")
                    for j in range(SIMULATIONS):
                        print("Run ====> " + str(j))
                        # if (name[i] == 'KPSO'):
                        # clf = KPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                        # elif (name[i] == 'CPSO'):
                        # clf = CPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                        # elif (name[i] == 'PSC'):
                        #     clf = PSC(minf=0, maxf=1, swarm_size=k, n_iter=500, w=0.95, v_max=0.01)
                        if (name[i] == 'KMeans'):
                            clf = KMeans(n_clusters=k)
                        elif (name[i] == 'FCMeans'):
                            clf = FCMeans(n_clusters=k, n_iter=1000)
                        elif (name[i] == 'PSOC'):
                            clf = PSOC(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, c1=1.49, c2=1.49)
                        clf.fit(x)
                        out_dir = "results/metrics/dataset_{0}/metric_{1}/algorithm_{2}/".format(
                            eachdataset, metricNumber, name[i])
                        if not os.path.isdir(out_dir):
                            os.makedirs(out_dir)
                        sn = out_dir + "dataset_{0}_metric_{1}_algorithm_{2}_k_{3}_simulation_{4}".format(
                            eachdataset, metricNumber, name[i], k, j)
                        savecentroids = pd.DataFrame(clf.centroids)
                        savecentroids = savecentroids.transpose()
                        savecentroids.to_csv(sn + "_centroids.csv")
                        clusters = {}
                        for c in clf.centroids:
                            clusters[c] = []

                        for xi in x:
                            dist = [np.linalg.norm(xi - clf.centroids[c]) for c in clf.centroids]
                            class_ = dist.index(min(dist))
                            clusters[class_].append(xi)

                        # need to invert?
                        with open(sn + "_clusters.csv", 'w') as f:
                            f.write(str(len(clf.centroids)) + '\n')
                            for c in range(len(clf.centroids)):
                                f.write(str(len(clusters[c])) + '\n')
                                for xi in range(len(clusters[c])):
                                    f.write(str(clusters[c][xi][0]))
                                    for xij in range(1, len(clusters[c][xi])):
                                        f.write(' ' + str(clusters[c][xi][xij]))
                                    f.write('\n')
                        # if not os.path.exists('gap/' + name[i] + '/' + str(k) + '-centroids'):
                        #     os.makedirs('gap/' + name[i] + '/' + str(k) + '-centroids')
                        # centroids.to_csv('gap/' + name[i] + '/' + str(k) + '-centroids/centroid-simulation' + str(j) + '.csv', index=False)
                        # metrics.append(Metrics.Metrics.inter_cluster_statistic(clf))
                        # metrics.append(Metrics.Metrics.cluster_separation_crisp(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.cluster_separation_fuzzy(data=x, clf=clf, m=2.0))
                        # metrics.append(Metrics.Metrics.abgss(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.edge_index(data=x, clf=clf, number_neighbors=4))
                        # metrics.append(Metrics.Metrics.cluster_connectedness(data=x, clf=clf, number_neighbors=4))
                        # metrics.append(Metrics.Metrics.intra_cluster_statistic(clf))
                        # metrics.append(Metrics.Metrics.ball_hall(data=x, clf=clf))
                        # metrics.append( Metrics.Metrics.j_index(data=x, clf=clf, m=2.0) )
                        # metrics.append( Metrics.Metrics.total_within_cluster_variance(data=x, clf=clf) )
                        # metrics.append(Metrics.Metrics.classification_entropy(data=x, clf=clf, m=2.0))
                        # metrics.append(Metrics.Metrics.intra_cluster_entropy(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.variance_based_ch(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.hartigan(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.xu(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.rl(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.wb(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.xie_beni(data=x, clf=clf, m=2.0))
                        c = clf.centroids[0][0]
                        # metrics.append(Metrics.Metrics.i_index(data=x, clf=clf, centroidUnique=c))
                        # metrics.append(Metrics.Metrics.dunn_index(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.davies_bouldin(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.cs_index(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.silhouette(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.min_max_cut(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.gap_statistic(data=x, clf=clf))
                        metrics.append(
                            Metrics.clustering_evaluation("{0}".format(str(metricNumber)),
                                                          centroids=clf.centroids, data=x, clf=clf,
                                                          m=2.0, number_neighbors=2, centroidUnique=c))
                    mean.append(np.mean(metrics))
                    std.append(np.std(metrics))
                    df.append([name[i], k, np.mean(metrics), np.std(metrics)])
                    metrics = []

                # plt.subplot(130 + (i + 1))
                plt.clf()
                plt.title(str(name[i]) + ' - Metric')
                plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b', capthick=2, barsabove=True)
                plt.xlabel('Clusters')
                plt.ylabel('Metric')
                saveName = "results/metrics/dataset_{0}/metric_{1}/algorithm_{2}/".format(
                    eachdataset, metricNumber, name[i])
                saveName = saveName + "dataset_{0}_metric_{1}_algorithm_{2}".format(
                    eachdataset, metricNumber, name[i])
                plt.savefig(saveName+".pdf")
                results = pd.DataFrame(df)
                results.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
                results.to_csv(saveName + ".csv")
                # reset the accumulators; rebinding df to a DataFrame here would
                # silently break the list appends in the next metric iteration
                df = []
                mean = []
                std = []
                plt.tight_layout()
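The *_clusters.csv files written above use an ad-hoc plain-text layout: the first line holds the number of clusters, then each cluster contributes a count line followed by that many space-separated points. A minimal reader for that layout, as a sketch (load_clusters is our name, not the project's):
def load_clusters(path):
    # parse the clusters file written by main() above
    with open(path) as f:
        n_clusters = int(f.readline())
        clusters = []
        for _ in range(n_clusters):
            n_points = int(f.readline())
            clusters.append([list(map(float, f.readline().split()))
                             for _ in range(n_points)])
    return clusters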
Example #4
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# KMeans, FCMeans and Metrics are assumed to be this project's own modules

def main():
    print("Loading dataset")
    # df = pd.read_excel(io='data/Saidafinal4periodo.xlsx', sheetname='Plan1')
    df = pd.read_csv('data/egyptian-skulls.csv', header=None)
    df = df.drop(len(df.columns) - 1, axis=1)  # drop the label column
    # keep only rows without missing values (marked '?')
    x = df[df.apply(lambda x: sum([x_ == '?' for x_ in x]) == 0, axis=1)]
    x = x.iloc[:, :].values.astype(float)
    print("Normalizing dataset so that all dimensions are on the same scale")
    std = MinMaxScaler()
    x = std.fit_transform(x)
    print(x)
    SIMULATIONS = 2

    df = []
    name = ['KMeans', 'FCMeans']
    # name = ['KPSO', 'CPSO', 'PSC']
    for i in range(len(name)):
        metrics = []
        mean = []
        std = []
        rng = range(2, 4)
        for k in rng:
            print("Number of Clusters = " + str(k) + "\n")
            for j in range(SIMULATIONS):
                print("Run ====> " + str(j))
                # if (name[i] == 'KPSO'):
                # clf = KPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                # elif (name[i] == 'CPSO'):
                #     clf = CPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                # elif (name[i] == 'PSC'):
                #     clf = PSC(minf=0, maxf=1, swarm_size=k, n_iter=500, w=0.95, v_max=0.01)
                if (name[i] == 'KMeans'):
                    clf = KMeans(n_clusters=k)
                elif (name[i] == 'FCMeans'):
                    clf = FCMeans(n_clusters=k)
                clf.fit(x)
                centroids = pd.DataFrame(clf.centroids)
                # if not os.path.exists('gap/' + name[i] + '/' + str(k) + '-centroids'):
                #     os.makedirs('gap/' + name[i] + '/' + str(k) + '-centroids')
                # centroids.to_csv('gap/' + name[i] + '/' + str(k) + '-centroids/centroid-simulation' + str(j) + '.csv', index=False)
                # metrics.append(Metrics.Metrics.inter_cluster_statistic(clf))
                # metrics.append(Metrics.Metrics.cluster_separation_crisp(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.cluster_separation_fuzzy(data=x, clf=clf, m=2.0))
                # metrics.append(Metrics.Metrics.abgss(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.edge_index(data=x, clf=clf, number_neighbors=4))

                # metrics.append(Metrics.Metrics.cluster_connectedness(data=x, clf=clf, number_neighbors=4))

                # metrics.append(Metrics.Metrics.intra_cluster_statistic(clf))
                # metrics.append(Metrics.Metrics.ball_hall(data=x, clf=clf))
                # metrics.append( Metrics.Metrics.j_index(data=x, clf=clf, m=2.0) )
                # metrics.append( Metrics.Metrics.total_within_cluster_variance(data=x, clf=clf) )
                # metrics.append(Metrics.Metrics.classification_entropy(data=x, clf=clf, m=2.0))

                # metrics.append(Metrics.Metrics.intra_cluster_entropy(data=x, clf=clf))

                # metrics.append(Metrics.Metrics.variance_based_ch(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.hartigan(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.xu(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.rl(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.wb(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.xie_beni(data=x, clf=clf, m=2.0))
                # c = clf.centroids[0][0]
                # metrics.append(Metrics.Metrics.i_index(data=x, clf=clf, centroidUnique=c))
                # metrics.append(Metrics.Metrics.dunn_index(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.davies_bouldin(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.cs_index(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.silhouette(data=x, clf=clf))
                # metrics.append(Metrics.Metrics.min_max_cut(data=x, clf=clf))
                metrics.append(Metrics.Metrics.gap_statistic(data=x, clf=clf))
            mean.append(np.mean(metrics))
            std.append(np.std(metrics))
            df.append([name[i], k, np.mean(metrics), np.std(metrics)])
            metrics = []

        plt.subplot(130 + (i + 1))
        plt.title(str(name[i]) + ' - Metric')
        plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b', capthick=2, barsabove=True)
        plt.xlabel('Clusters')
        plt.ylabel('Metric')
    # df = pd.DataFrame(df)
    # df.columns = ['ALGORITHM', 'CLUSTERS', 'GAP MEAN' , 'GAP STD']
    # df.to_csv('gap.csv', index=False)

    plt.tight_layout()
    plt.show()
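Given the per-k gap means and standard deviations collected above, the usual selection rule from Tibshirani et al. picks the smallest k whose gap is within one spread of the next k's gap. A small helper sketch (pick_k_by_gap is our name, using the std lists gathered above as the tolerance):
def pick_k_by_gap(rng, mean, std):
    # smallest k with gap(k) >= gap(k+1) - std(k+1)
    ks = list(rng)
    for idx in range(len(ks) - 1):
        if mean[idx] >= mean[idx + 1] - std[idx + 1]:
            return ks[idx]
    return ks[-1]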
Example #5
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
# KMeans, FCMeans, PSC, PSOC, KMPSOC, PSOCKM and Metrics are assumed to be
# this project's own modules

def main():
    num_exec = 30
    swarm_size = 30
    num_iter = 1000
    # names = ['KMeans', 'FCMeans', 'PSOC', 'PSC', 'KMPSOC', 'PSOCKM']
    names = ['PSC', 'PSOC', 'KMPSOC', 'PSOCKM']
    # names = ['PSOC', 'PSC', 'KMPSOC', 'PSOCKM']

    print("Loading dataset")
    os.chdir('../../..')
    df = pd.read_csv('data/booking_website/booking_website_without_empty_values.csv')
    df = df.drop(['id'], axis=1)
    # df = df.drop(['idade'], axis=1)
    df = df.drop(['sexo'], axis=1)
    x = df[df.apply(lambda x: sum([x_ == '?' for x_ in x]) == 0, axis=1)]
    x = x.iloc[:, :].values.astype(float)

    print("Nomalizing dataset so that all dimenions are in the same scale")
    std = MinMaxScaler()
    x = std.fit_transform(x)
    x = x[np.random.permutation(len(x))]
    for i in range(len(names)):
        metrics = []
        rng = range(2, 11)
        for metricNumber in ["intraClusterStatistic", "quantizationError", "sumInterClusterDistance"]:
        # for metricNumber in ["gap"]:
            print("Algorithm: " + names[i])
            mean = []
            std = []
            dff = []
            for k in rng:
                # print(" Number of Clusters = " + str(k))
                for j in tqdm(range(num_exec)):
                    if names[i] == 'KMPSOC':
                        clf = KMPSOC(n_clusters=k, swarm_size=swarm_size, n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'PSOC':
                        clf = PSOC(n_clusters=k, swarm_size=swarm_size, n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'PSC':
                        clf = PSC(swarm_size=k, n_iter=num_iter, w=0.95, c1=2.05, c2=2.05, c3=1.0, c4=1.0, v_max=0.001)
                    elif names[i] == 'PSOCKM':
                        clf = PSOCKM(n_clusters=k, swarm_size=swarm_size, n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'KMeans':
                        clf = KMeans(n_clusters=k, n_iter=num_iter, shuffle=True, tolerance=0.00001)
                    elif names[i] == 'FCMeans':
                        clf = FCMeans(n_clusters=k, n_iter=num_iter, fuzzy_c=2, tolerance=0.001)

                    clf.fit(x)
                    out_dir = "results/booking/algorithm_{}/metric_{}/".format(names[i], metricNumber)
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)

                    file_name = out_dir + "{}_k_{}_exec_{}.csv".format('centroids', k, j)
                    save_centroids = pd.DataFrame(clf.centroids)
                    save_centroids = save_centroids.transpose()
                    save_centroids.to_csv(file_name)

                    clusters = {}
                    for c in clf.centroids:
                        clusters[c] = []

                    for xi in x:
                        dist = [np.linalg.norm(xi - clf.centroids[c]) for c in clf.centroids]
                        class_ = dist.index(min(dist))
                        clusters[class_].append(xi)

                    clusters_file = open(out_dir + "{}_k_{}_exec_{}.csv".format('clusters', k, j), 'w')
                    clusters_file.write(str(len(clf.centroids)) + '\n')

                    for c in range(len(clf.centroids)):
                        clusters_file.write(str(len(clusters[c])) + '\n')
                        for xi in range(len(clusters[c])):
                            clusters_file.write(str(clusters[c][xi][0]))
                            for xij in range(1, len(clusters[c][xi])):
                                clusters_file.write(' ' + str(clusters[c][xi][xij]))
                            clusters_file.write('\n')
                    clusters_file.close()

                    c = clf.centroids[0][0]

                    metrics.append(
                        Metrics.clustering_evaluation("{}".format(metricNumber), centroids=clf.centroids, data=x,
                                                      clf=clf, m=2.0, number_neighbors=2, centroidUnique=c))
                mean.append(np.mean(metrics))
                std.append(np.std(metrics))
                dff.append([names[i], k, np.mean(metrics), np.std(metrics)])
                metrics = []

            # plt.subplot(130 + (i + 1))
            plt.figure()

            figure_name = "results/booking/algorithm_{}/metric_{}/plot.png".format(names[i], metricNumber)
            plt.title(str(names[i]) + ' - Metric ' + metricNumber)
            plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b', capthick=2, barsabove=True)
            plt.xlabel('Clusters')
            plt.ylabel('Metric')
            plt.tight_layout()
            plt.savefig(figure_name)

            save_name = "results/booking/algorithm_{}/metric_{}/output.csv".format(names[i], metricNumber)
            dff = pd.DataFrame(dff)
            dff.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
            dff.to_csv(save_name)
Example #6
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# Metrics is assumed to be this project's own module

def main():
    pathname_dir_results = '/home/elliackin/Documents/Swarm-Intelligence-Research/' \
                           'Simulation-2007-LA-CCI/2007_LA-CCI_ClusteringSimulation-ID-26-Mai-2017-20h:42m:02s'

    pathname_dataset = "/home/elliackin/Documents/Swarm-Intelligence-Research/" \
                       "SRC-Swarm-Intelligence/clustering-optimization/data/Saidafinal4periodo.xlsx"

    df = pd.read_csv(pathname_dir_results + '/gap/gap.csv')
    psoc = df[df['ALGORITHM'] == 'PSOC']
    kmpsoc = df[df['ALGORITHM'] == 'KMPSOC']
    psc = df[df['ALGORITHM'] == 'PSC']

    # i = 1
    # algos = [psoc, kmpsoc, psc]
    # plt.figure(figsize=(12,4))
    # for algo in algos:
    #     plt.subplot(130 + i)
    #     plt.errorbar(algo['CLUSTERS'], algo['GAP MEAN'], yerr=algo['GAP STD'], linewidth=0.5, elinewidth=0.5, color='b')
    #     plt.plot(algo['CLUSTERS'], algo['GAP MEAN'], color='b', marker='o', linewidth=0.5, markersize=5)
    #     plt.xticks(algo['CLUSTERS'])
    #     plt.title(algo['ALGORITHM'].values[0] + ' - GAP')
    #     plt.ylabel('GAP Measure')
    #     plt.xlabel('Number of Clusters')
    #     i += 1
    #plt.tight_layout()
    #plt.show()

    print("Loading dataset")

    df = pd.read_excel(io=pathname_dataset, sheet_name='Plan1')
    df.drop(['ID', 'Nome', 'E-mail'], axis=1, inplace=True)

    x = df.iloc[:, :].values.astype(float)
    print("Nomalizing dataset so that all dimenions are in the same scale")
    std = MinMaxScaler()
    data = std.fit_transform(x)

    qe = []

    plt.figure(figsize=(12, 4))
    mean = []
    std = []
    for k in range(2, 10):
        metrics = []
        for j in range(30):
            curr_directory = pathname_dir_results + '/gap/KMPSOC/' + str(
                k) + '-centroids'
            filename = curr_directory + '/centroid-simulation-' + str(
                j) + '.csv'
            df = pd.read_csv(filename)
            cluster = df.values  # as_matrix() was removed in recent pandas
            raw_centroids = cluster.transpose()
            centroids = {}
            for i in range(k):
                centroids[i] = raw_centroids[i]
            qe_value = Metrics.intra_cluster_statistic(data, centroids)
            metrics.append(qe_value)
        mean.append(np.mean(metrics))
        std.append(np.std(metrics))
    plt.errorbar(range(2, 10),
                 mean,
                 yerr=std,
                 linewidth=0.5,
                 elinewidth=0.5,
                 color='b')
    plt.plot(range(2, 10),
             mean,
             color='b',
             marker='o',
             linewidth=0.5,
             markersize=5)
    plt.xticks(range(2, 10))
    plt.title('KMPSOC')
    plt.ylabel('QE Measure')
    plt.xlabel('K')
    plt.tight_layout()
    plt.show()
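Metrics.intra_cluster_statistic comes from the project's own module; as a rough sketch, a quantization-error style intra-cluster statistic is the mean distance from each point to its nearest centroid (our simplified reading, not necessarily the project's exact formula):
import numpy as np

def intra_cluster_statistic(data, centroids):
    # mean distance from each point to its nearest centroid
    total = 0.0
    for xi in data:
        total += min(np.linalg.norm(xi - centroids[c]) for c in centroids)
    return total / len(data)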
Example #7
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# KPSO, CPSO, PSC and Metrics are assumed to be this project's own modules

def main():
    df = pd.read_excel(io='data/Saidafinal4periodo.xlsx', sheet_name='Plan1')
    df.drop(['ID', 'Nome', 'E-mail'], axis=1, inplace=True)

    x = df.iloc[:, :].values.astype(float)
    std = MinMaxScaler()
    x = std.fit_transform(x)

    df = []
    name = ['KPSO', 'CPSO', 'PSC']
    for i in range(len(name)):
        metrics = []
        mean = []
        std = []
        rng = range(2, 10)
        for k in rng:
            for j in range(30):
                if (name[i] == 'KPSO'):
                    clf = KPSO(n_clusters=k,
                               swarm_size=15,
                               n_iter=500,
                               w=0.72,
                               lb_w=0.4,
                               c1=1.49,
                               c2=1.49)
                elif (name[i] == 'CPSO'):
                    clf = CPSO(n_clusters=k,
                               swarm_size=15,
                               n_iter=500,
                               w=0.72,
                               lb_w=0.4,
                               c1=1.49,
                               c2=1.49)
                elif (name[i] == 'PSC'):
                    clf = PSC(minf=0,
                              maxf=1,
                              swarm_size=k,
                              n_iter=500,
                              w=0.95,
                              v_max=0.01)
                clf.fit(x)
                centroids = pd.DataFrame(clf.centroids)
                centroids.to_csv('inter/' + name[i] + '/' + str(k) +
                                 '-centroids/centroid-simulation' + str(j) +
                                 '.csv',
                                 index=False)
                metrics.append(
                    Metrics.inter_cluster_statistic(centroids=clf.centroids))
            mean.append(np.mean(metrics))
            std.append(np.std(metrics))
            df.append([name[i], k, np.mean(metrics), np.std(metrics)])
            metrics = []

        plt.subplot(130 + (i + 1))
        plt.title(str(name[i]) + ' - INTRA')
        plt.errorbar(rng,
                     mean,
                     yerr=std,
                     marker='o',
                     ecolor='b',
                     capthick=2,
                     barsabove=True)
        plt.xlabel('Clusters')
        plt.ylabel('INTER Statistic')

    df = pd.DataFrame(df)
    df.columns = ['ALGORITHM', 'CLUSTERS', 'GAP MEAN', 'GAP STD']
    df.to_csv('intra.csv', index=False)

    plt.tight_layout()
    plt.show()
Example #8
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
# KMeans, FCMeans, PSOC, ABCC and Metrics are assumed to be this project's own modules

def main():
    # Importing dataset
    dataset = pd.read_csv("500_Cities_CDC.csv")

    # Select lines and columns
    X = dataset.iloc[:, [
        4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 68, 72,
        76, 80, 84, 88, 92, 96, 100, 104, 108, 112
    ]].values

    # Normalize the dimension value to a float value with range 0 - 1
    std = MinMaxScaler()
    X = std.fit_transform(X)

    techniques = ['k_means', 'FC_means', 'PSOC', 'ABCC']
    metrics = ['gap', 'silhouete', 'calinskiHarabaszIndex']
    num_exec = 30

    for tec in techniques:
        rng = range(2, 15)
        met_eval = []
        for met in metrics:
            mean = []
            std = []
            dff = []
            for k in rng:
                for j in tqdm(range(num_exec),
                              desc='{} - {} - k: {}'.format(tec, met, k)):
                    if tec == 'k_means':
                        clf = KMeans(k=k)
                    elif tec == 'FC_means':
                        clf = FCMeans(k=k)
                    elif tec == 'PSOC':
                        clf = PSOC(n_clusters=k)
                    elif tec == 'ABCC':
                        clf = ABCC(n_clusters=k)

                    clf.fit(data=X)  #run technique

                    out_dir = "results/booking/algorithm_{}/metric_{}/".format(
                        tec, met)  #create folder
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)

                    file_name = out_dir + "{}_k_{}_exec_{}.csv".format(
                        'centroids', k, j)  #save centroids
                    save_centroids = pd.DataFrame(clf.centroids)
                    save_centroids = save_centroids.transpose()
                    save_centroids.to_csv(file_name)

                    clusters_file = open(
                        out_dir +
                        "{}_k_{}_exec_{}.csv".format('clusters', k, j),
                        'w')  #save clusters
                    clusters_file.write(str(len(clf.centroids)) + '\n')
                    for c in range(len(clf.centroids)):
                        clusters_file.write(str(len(clf.clusters[c])) + '\n')
                        for xi in range(len(clf.clusters[c])):
                            clusters_file.write(str(clf.clusters[c][xi][0]))
                            for xij in range(1, len(clf.clusters[c][xi])):
                                clusters_file.write(
                                    ' ' + str(clf.clusters[c][xi][xij]))
                            clusters_file.write('\n')
                    clusters_file.close()

                    if met == 'gap':
                        clusters = clf.clusters

                        random_data = np.random.uniform(0, 1, X.shape)
                        clf.fit(data=random_data)
                        random_clusters = clf.clusters

                        met_eval.append(
                            Metrics.gap_statistic(clusters, random_clusters))
                    elif met == 'silhouete':
                        met_eval.append(
                            Metrics.silhouette(clf.clusters, len(X)))
                    elif met == 'calinskiHarabaszIndex':
                        met_eval.append(
                            Metrics.variance_based_ch(X, clf.centroids))

                mean.append(np.mean(met_eval))
                std.append(np.std(met_eval))
                dff.append([tec, k, np.mean(met_eval), np.std(met_eval)])
                met_eval = []

            plt.figure()  # one figure per metric (was re-created inside the k loop)

            figure_name = "results/booking/algorithm_{}/metric_{}/plot.png".format(
                tec, met)
            plt.title('{} - Metric {}'.format(tec, met))
            plt.errorbar(rng,
                         mean,
                         yerr=std,
                         marker='o',
                         ecolor='b',
                         capthick=2,
                         barsabove=True)
            plt.xlabel('Clusters')
            plt.ylabel('Metric')
            plt.tight_layout()
            plt.savefig(figure_name)

            save_name = "results/booking/algorithm_{}/metric_{}/output.csv".format(
                tec, met)
            dff = pd.DataFrame(dff)
            dff.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
            dff.to_csv(save_name)
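Metrics.variance_based_ch above is the project's Calinski-Harabasz style index. A sketch of that idea, assuming centroids is a dict of centroid vectors (the project's implementation may differ in detail):
import numpy as np

def variance_based_ch(data, centroids):
    # ratio of between- to within-cluster dispersion, with the usual
    # (k - 1) / (n - k) degrees-of-freedom scaling
    keys = list(centroids)
    labels = [min(keys, key=lambda c: np.linalg.norm(p - centroids[c])) for p in data]
    overall_mean = data.mean(axis=0)
    b = sum(labels.count(c) * np.linalg.norm(centroids[c] - overall_mean) ** 2
            for c in keys)
    w = sum(np.linalg.norm(p - centroids[labels[i]]) ** 2
            for i, p in enumerate(data))
    k, n = len(keys), len(data)
    return (b / (k - 1)) / (w / (n - k))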
Example #9
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# KPSO, CPSO, PSC and Metrics are assumed to be this project's own modules;
# plot_gap_avaliation is assumed to be a plotting helper defined elsewhere in the project

def main():

    print("Loading dataset")
    df = pd.read_excel(io='data/Saidafinal4periodo.xlsx', sheet_name='Plan1')
    df.drop(['ID', 'Nome', 'E-mail'], axis=1, inplace=True)

    print("Number of objects in the current dataset: " + str(len(df.index)))
    numberObjects = len(df.index)

    x = df.iloc[0:numberObjects, 0:numberObjects].values.astype(float)

    print("Nomalizing dataset so that all dimenions are in the same scale")
    std = MinMaxScaler()
    x = std.fit_transform(x)

    print("Creating directory to store all clustering solutions")
    PATHNAME_CLUSTERS_SOL = "clusters_solutions"
    directory = os.path.dirname(PATHNAME_CLUSTERS_SOL)

    try:
        os.stat(directory)
    except:
        os.mkdir(directory)

    NUMBER_RUNS = 5

    rng = range(2, 10)
    k_pso = []
    mean = []
    std = []
    print("Start KPSO\n")
    for k in rng:
        print("\t  Number K = " + str(k) + "\n")
        for i in range(NUMBER_RUNS):
            print("\t\t Run ====> " + str(i))
            clf = KPSO(n_clusters=k,
                       swarm_size=15,
                       n_iter=100,
                       w=0.72,
                       lb_w=0.4,
                       c1=1.49,
                       c2=1.49)
            clf.fit(x)
            k_pso.append(Metrics.gap_statistic(x, clf.centroids))
        mean.append(np.mean(k_pso))
        std.append(np.std(k_pso))
        k_pso = []

    plt.figure(0)
    plt.subplot(131)
    plt.title('KPSO - GAP')
    plt.errorbar(rng,
                 mean,
                 yerr=std,
                 marker='o',
                 ecolor='b',
                 capthick=2,
                 barsabove=True)
    plt.xlabel('K')
    plt.ylabel('GAP Statistic')

    plot_gap_avaliation(mean, std, 131, 'KPSO')
    print("KPSO executions are completed!\n\n")

    print("Start CPSO\n")

    cpso = []
    mean = []
    std = []
    for k in rng:
        print("\t  Number K = " + str(k) + "\n")
        for i in range(NUMBER_RUNS):
            print("\t\t Run ====> " + str(i))
            clf = CPSO(n_clusters=k,
                       swarm_size=15,
                       n_iter=100,
                       w=0.72,
                       lb_w=0.4,
                       c1=1.49,
                       c2=1.49)
            clf.fit(x)
            cpso.append(Metrics.gap_statistic(x, clf.centroids))
        mean.append(np.mean(cpso))
        std.append(np.std(cpso))
        cpso = []

    plt.figure(0)
    plt.subplot(132)
    plt.title('CPSO - GAP')
    plt.errorbar(rng,
                 mean,
                 yerr=std,
                 marker='o',
                 ecolor='b',
                 capthick=2,
                 barsabove=True)
    plt.xlabel('K')
    plt.ylabel('GAP Statistic')

    plot_gap_avaliation(mean, std, 132, 'CPSO')
    print("CPSO executions are completed!\n\n")

    print("Start PSC\n")

    psc = []
    mean = []
    std = []
    for k in rng:
        print("\t  Number K = " + str(k) + "\n")
        for i in range(NUMBER_RUNS):
            print("\t\t Run ====> " + str(i))
            clf = PSC(minf=0,
                      maxf=1,
                      swarm_size=k,
                      n_iter=200,
                      w=0.95,
                      v_max=0.01)
            clf.fit(x)
            psc.append(Metrics.gap_statistic(x, clf.centroids))
        mean.append(np.mean(psc))
        std.append(np.std(psc))
        psc = []

    plt.figure(0)
    plt.subplot(133)
    plt.title('PSC - GAP')
    plt.errorbar(rng,
                 mean,
                 yerr=std,
                 marker='o',
                 ecolor='b',
                 capthick=2,
                 barsabove=True)
    plt.xlabel('K')
    plt.ylabel('GAP Statistic')

    plot_gap_avaliation(mean, std, 133, 'PSC')
    print("PSC executions are completed!\n")

    plt.tight_layout()
    plt.show()
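Metrics.gap_statistic above is the project's own implementation. For reference, a crude sketch in the spirit of Tibshirani et al. (2001): compare the log within-cluster dispersion of the data against uniform reference samples drawn over the normalized feature space. Unlike the full procedure, this sketch reuses the fitted centroids instead of re-clustering each reference sample:
import numpy as np

def gap_statistic(data, centroids, n_refs=10, seed=None):
    # log of the within-cluster dispersion: summed squared distances
    # from each point to its nearest centroid
    def log_wk(points):
        wk = sum(min(np.linalg.norm(p - centroids[c]) ** 2 for c in centroids)
                 for p in points)
        return np.log(wk)

    rng = np.random.default_rng(seed)
    # the examples above MinMax-scale the data, so uniform(0, 1) matches its bounding box
    ref_logs = [log_wk(rng.uniform(0.0, 1.0, size=data.shape)) for _ in range(n_refs)]
    return float(np.mean(ref_logs) - log_wk(data))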