Пример #1
0
def main():
    print("Loading dataset")
    os.chdir('../../../..')
    dfs = [pd.read_csv('data/empty.csv', header=None) for k in range(28)]
    # iris pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
    dfs[0] = pd.read_csv('data/datasets_metrics/iris.csv', header=None)
    # wine pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
    dfs[1] = pd.read_csv('data/datasets_metrics/wine.csv', header=None)
    # glass pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', header=None)
    dfs[2] = pd.read_csv('data/datasets_metrics/glass.csv', header=None)
    # breast cancer wincosin pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header=None)
    dfs[3] = pd.read_csv('data/datasets_metrics/breast-cancer-wisconsin.csv', header=None)
    # wdbc pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)
    dfs[4] = pd.read_csv('data/datasets_metrics/wdbc.csv', header=None)
    # liver disorders pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/liver-disorders/bupa.data', header=None)
    dfs[5] = pd.read_csv('data/datasets_metrics/bupa.csv', header=None)
    # contraceptive method choice pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data', header=None)
    dfs[6] = pd.read_csv('data/datasets_metrics/cmc.csv', header=None)
    # tiroide pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/new-thyroid.data', header=None)
    dfs[7] = pd.read_csv('data/datasets_metrics/new-thyroid.csv', header=None)
    # dematology pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data', header=None)
    dfs[8] = pd.read_csv('data/datasets_metrics/dermatology.csv', header=None)
    # egyptian skools http://www.dm.unibo.it/~simoncin/EgyptianSkulls.html
    dfs[9] = df = pd.read_csv('data/datasets_metrics/egyptian-skulls.csv', header=None)
    # heart statlog pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat', header=None)
    dfs[10] = df = pd.read_csv('data/datasets_metrics/heart.csv', header=None)
    # ionosphere
    dfs[11] = df = pd.read_csv('data/datasets_metrics/ionosphere.csv', header=None)
    # vehicle
    dfs[12] = df = pd.read_csv('data/datasets_metrics/vehicle.csv', header=None)
    # balance scale pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data', header=None)
    dfs[13] = df = pd.read_csv('data/datasets_metrics/balance-scale.csv', header=None)
    # sonar
    dfs[14] = df = pd.read_csv('data/datasets_metrics/sonar.csv', header=None)
    # zoo pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data', header=None)
    dfs[15] = df = pd.read_csv('data/datasets_metrics/zoo.csv', header=None)
    # isolet5
    dfs[16] = df = pd.read_csv('data/datasets_metrics/isolet5.csv', header=None)
    # movement libras
    dfs[17] = df = pd.read_csv('data/datasets_metrics/libras.csv', header=None)
    # cleveland http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
    dfs[18] = df = pd.read_csv('data/datasets_metrics/cleveland.csv', header=None)
    # australian
    dfs[19] = df = pd.read_csv('data/datasets_metrics/australian.csv', header=None)
    dfs[20] = df = pd.read_csv('data/shapesets/compound.csv', header=None)
    dfs[21] = df = pd.read_csv('data/shapesets/flame.csv', header=None)
    dfs[22] = df = pd.read_csv('data/shapesets/jain.csv', header=None)
    dfs[23] = df = pd.read_csv('data/shapesets/r15.csv', header=None)
    dfs[24] = df = pd.read_csv('data/shapesets/d31.csv', header=None)
    dfs[25] = df = pd.read_csv('data/shapesets/spiral.csv', header=None)
    dfs[26] = df = pd.read_csv('data/shapesets/pathbased.csv', header=None)
    dfs[27] = df = pd.read_csv('data/shapesets/agregation.csv', header=None)
    # hill-valley
    # diabetes
    # olive
    # crud oil
    # musk version 1
    # landsat satellite
    # heart disease len(dfs)
    for eachdataset in range(0, len(dfs)):
        df = dfs[eachdataset]
        df = df.drop(len(df.columns) - 1, 1)
        x = df[df.apply(lambda x: sum([x_ == '?' for x_ in x]) == 0, axis=1)]
        x = x.iloc[:, :].values.astype(float)
        print("Nomalizing dataset so that all dimenions are in the same scale")
        std = MinMaxScaler()
        x = std.fit_transform(x)
        x = x[np.random.permutation(len(x))]
        SIMULATIONS = 30
        df = []
        # name = ['KMeans', 'FCMeans']
        name = ['FCMeans']
        # name = ['KPSO', 'CPSO', 'PSC']
        for i in range(len(name)):
            metrics = []
            mean = []
            std = []
            rng = range(2, 10)
            # 27
            for metricNumber in range(0, 26):
                for k in rng:
                    print("Number of Clusters = " + str(k) + "\n")
                    for j in range(SIMULATIONS):
                        print("Run ====> " + str(j))
                        # if (name[i] == 'KPSO'):
                        # clf = KPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                        # elif (name[i] == 'CPSO'):
                        # clf = CPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                        # elif (name[i] == 'PSC'):
                        #     clf = PSC(minf=0, maxf=1, swarm_size=k, n_iter=500, w=0.95, v_max=0.01)
                        if (name[i] == 'KMeans'):
                            clf = KMeans(n_clusters=k)
                        elif (name[i] == 'FCMeans'):
                            clf = FCMeans(n_clusters=k, n_iter=1000)
                        elif (name[i] == 'PSOC'):
                            clf = PSOC(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, c1=1.49, c2=1.49)
                        clf.fit(x)
                        if not os.path.isdir("results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i]))+"/"):
                            os.makedirs("results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i]))+"/")
                        sn = "results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i])+"/")
                        sn = sn + "dataset_{0}".format(str(eachdataset))
                        sn = sn + "_metric_{0}".format(str(metricNumber))
                        sn = sn + "_algorithm_{0}".format(str(name[i]))
                        sn = sn + "_k_{0}".format(str(k))
                        sn = sn + "_simulation_{0}".format(str(j))
                        savecentroids = pd.DataFrame(clf.centroids)
                        savecentroids = savecentroids.transpose()
                        savecentroids.to_csv(sn+"_centroids.csv")
                        clusters = {}
                        for c in clf.centroids:
                            clusters[c] = []

                        for xi in x:
                            dist = [np.linalg.norm(xi - clf.centroids[c]) for c in clf.centroids]
                            class_ = dist.index(min(dist))
                            clusters[class_].append(xi)

                        # precisa inverter?
                        file = open(sn+"_clusters.csv", 'w')
                        file.write(str(len(clf.centroids)) + '\n')
                        for c in range(len(clf.centroids)):
                            file.write(str(len(clusters[c])) + '\n')
                            for xi in range(len(clusters[c])):
                                file.write(str(clusters[c][xi][0]))
                                for xij in range(1, len(clusters[c][xi])):
                                    file.write(' ' + str(clusters[c][xi][xij]))
                                file.write('\n')
                        file.close()
                        # if not os.path.exists('gap/' + name[i] + '/' + str(k) + '-centroids'):
                        #     os.makedirs('gap/' + name[i] + '/' + str(k) + '-centroids')
                        # centroids.to_csv('gap/' + name[i] + '/' + str(k) + '-centroids/centroid-simulation' + str(j) + '.csv', index=False)
                        # metrics.append(Metrics.Metrics.inter_cluster_statistic(clf))
                        # metrics.append(Metrics.Metrics.cluster_separation_crisp(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.cluster_separation_fuzzy(data=x, clf=clf, m=2.0))
                        # metrics.append(Metrics.Metrics.abgss(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.edge_index(data=x, clf=clf, number_neighbors=4))
                        # metrics.append(Metrics.Metrics.cluster_connectedness(data=x, clf=clf, number_neighbors=4))
                        # metrics.append(Metrics.Metrics.intra_cluster_statistic(clf))
                        # metrics.append(Metrics.Metrics.ball_hall(data=x, clf=clf))
                        # metrics.append( Metrics.Metrics.j_index(data=x, clf=clf, m=2.0) )
                        # metrics.append( Metrics.Metrics.total_within_cluster_variance(data=x, clf=clf) )
                        # metrics.append(Metrics.Metrics.classification_entropy(data=x, clf=clf, m=2.0))
                        # metrics.append(Metrics.Metrics.intra_cluster_entropy(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.variance_based_ch(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.hartigan(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.xu(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.rl(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.wb(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.xie_beni(data=x, clf=clf, m=2.0))
                        c = clf.centroids[0][0]
                        # metrics.append(Metrics.Metrics.i_index(data=x, clf=clf, centroidUnique=c))
                        # metrics.append(Metrics.Metrics.dunn_index(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.davies_bouldin(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.cs_index(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.silhouette(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.min_max_cut(data=x, clf=clf))
                        # metrics.append(Metrics.Metrics.gap_statistic(data=x, clf=clf))
                        metrics.append(Metrics.clustering_evaluation("{0}".format(str(metricNumber)), centroids=clf.centroids, data=x, clf=clf, m=2.0, number_neighbors=2, centroidUnique=c))
                    mean.append(np.mean(metrics))
                    std.append(np.std(metrics))
                    df.append([name[i], k, np.mean(metrics), np.std(metrics)])
                    metrics = []

                # plt.subplot(130 + (i + 1))
                plt.clf()
                plt.title(str(name[i]) + ' - Metric')
                plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b', capthick=2, barsabove=True)
                plt.xlabel('Clusters')
                plt.ylabel('Metric')
                saveName = "results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i])+"/")
                saveName = saveName + "dataset_{0}".format(str(eachdataset))
                saveName = saveName + "_metric_{0}".format(str(metricNumber))
                saveName = saveName + "_algorithm_{0}".format(str(name[i]))
                plt.savefig(saveName+".pdf")
                df = pd.DataFrame(df)
                df.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
                df.to_csv(saveName+".csv")
                mean = []
                std = []
                plt.tight_layout()
Пример #2
0
def main(parameters_simulation=None):
    data_file = open(parameters_simulation)
    param_simultations = json.load(data_file)

    pathname_dataset = param_simultations["pathname_dataset"]
    pathname_dir_results = param_simultations["pathname_dir_results"]
    num_simulations = param_simultations["NUN_SIMULATIONS"]  # 30
    num_trials = param_simultations["NUM_TRIALS"]  # 50

    num_iterations = param_simultations["NUM_ITERATIONS"]  # 500
    swarm_size = param_simultations["SWARM_SIZE"]  # 15
    name_classifier = param_simultations["ALGORITHMS"]
    clustering_metrics = param_simultations["EVALUATION_METRICS"]

    criteria = []
    table_criteria = {}

    for idx_alg in range(len(clustering_metrics)):
        value = clustering_metrics[idx_alg]
        criteria.append(value['criteria'])
        table_criteria[value['criteria']] = (
            clustering_metrics[idx_alg]['name'],
            clustering_metrics[idx_alg]['file'])

    time.sleep(2)
    pathname_output = pathname_dir_results + '/2017_LA-CCI_ClusteringSimulation'
    currDirectory = (pathname_output + '-ID-' +
                     datetime.now().strftime('%d-%b-%Y-%Hh:%Mm:%Ss'))
    pathname_output = currDirectory

    if not os.path.exists(pathname_output):
        os.makedirs(pathname_output)
    else:
        raise Exception("This simulation cannot execute!")

    time.sleep(1)
    timestamp = datetime.now().strftime('%a, %d %b %Y at %Hh:%Mm:%Ss')
    if not os.path.exists(pathname_output + '/timestamp.txt'):
        file_timestamp = open(pathname_output + '/timestamp.txt', 'a')
        file_timestamp.write('This simulation started on ' + timestamp)
        file_timestamp.close()
        print("This simulation started on " + timestamp)
    else:
        raise Exception("This simulation cannot execute!")

    print("Loading dataset")

    df = pd.read_excel(io=pathname_dataset, sheetname='Plan1')
    df.drop(['ID', 'Nome', 'E-mail'], 1, inplace=True)

    print pathname_dataset

    x = df.iloc[:, :].values.astype(float)
    print("Normalizing dataset so that all dimenions are in the same scale")
    min_max_scaler = MinMaxScaler()
    x = min_max_scaler.fit_transform(x)

    indices_attributes_4p = np.array([1, 7, 8, 10, 12, 13, 16])
    indices_attributes_4p = indices_attributes_4p - 1

    indices_attributes_5p = np.array([1, 7, 8, 10, 12, 13])
    indices_attributes_5p = indices_attributes_5p - 1

    x = x[:, indices_attributes_4p]

    # x = np.array(x)
    # The term clf means classifier

    # name_classifier = ['PSOC']
    # name_classifier = ['PSC']

    KList = range(2, 10)

    idx_success_simulation = {}
    idx_trial = {}
    idx_fail_simulation = {}
    check_sim_its_over = {}

    for idx_alg in range(len(name_classifier)):
        idx_success_simulation[name_classifier[idx_alg]] = {}
        idx_trial[name_classifier[idx_alg]] = {}
        idx_fail_simulation[name_classifier[idx_alg]] = {}
        check_sim_its_over[name_classifier[idx_alg]] = {}
        for k in KList:
            idx_success_simulation[name_classifier[idx_alg]][k] = 0
            idx_trial[name_classifier[idx_alg]][k] = 0
            idx_fail_simulation[name_classifier[idx_alg]][k] = 0
            check_sim_its_over[name_classifier[idx_alg]][k] = False

    metrics_list_sim_by_algorithm_and_k = {}

    mean_metric_by_algorithm_and_k = {}
    std_metric_by_algorithm_and_k = {}

    for m in range(len(criteria)):
        mean_metric_by_algorithm_and_k[criteria[m]] = {}
        std_metric_by_algorithm_and_k[criteria[m]] = {}

    for m in range(len(criteria)):
        metrics_list_sim_by_algorithm_and_k[criteria[m]] = {}
        for idx_alg in range(len(name_classifier)):
            metrics_list_sim_by_algorithm_and_k[criteria[m]][
                name_classifier[idx_alg]] = {}
            for k in KList:
                metrics_list_sim_by_algorithm_and_k[criteria[m]][
                    name_classifier[idx_alg]][k] = []

    for m in range(len(criteria)):
        min_value_mean = {}
        max_value_mean = {}

        min_value_std = {}
        max_value_std = {}

        for m in range(len(criteria)):
            min_value_mean[criteria[m]] = np.inf
            max_value_mean[criteria[m]] = -np.inf

            min_value_std[criteria[m]] = np.inf
            max_value_std[criteria[m]] = -np.inf

    #First the simulation
    #(idx_success_simulation < num_simulations) and (idx_trial < num_trials)
    finished = False
    idx_fig = 1

    Round = 1

    while not finished:

        list_results_by_metric = {}

        for m in range(len(criteria)):
            list_results_by_metric[criteria[m]] = []

        for idx_alg in range(len(name_classifier)):

            for m in range(len(criteria)):
                mean_metric_by_algorithm_and_k[criteria[m]][
                    name_classifier[idx_alg]] = np.zeros((len(KList), ))
                std_metric_by_algorithm_and_k[criteria[m]][
                    name_classifier[idx_alg]] = np.zeros((len(KList), ))

            for k in KList:

                if not check_sim_its_over[name_classifier[idx_alg]][k]:

                    create_dir_algorithms_and_k(idx_alg, k, name_classifier,
                                                pathname_output)

                    pathname_output_clusters = pathname_output + '/data/' + name_classifier[idx_alg] + '/' \
                                               + '/clusters/' + str(k) + '-centroids'
                    pathname_output_metrics = pathname_output + '/data/' + name_classifier[idx_alg] + '/' \
                                              + '/metrics/' + str(k) + '-centroids'
                    pathname_output_evolution_success = pathname_output + '/data/' + name_classifier[idx_alg] + '/' \
                                                        + '/evolution/success/' + str(k) + '-centroids'
                    pathname_output_evolution_fail = pathname_output + '/data/' + name_classifier[idx_alg] + '/' \
                                                     + '/evolution/fail/' + str(k) + '-centroids'

                    if (name_classifier[idx_alg] == 'KMPSOC'):
                        clf = KMPSOC(n_clusters=k,
                                     swarm_size=swarm_size,
                                     n_iter=num_iterations,
                                     w=0.72,
                                     c1=1.49,
                                     c2=1.49)
                    elif (name_classifier[idx_alg] == 'PSOC'):
                        clf = PSOC(n_clusters=k,
                                   swarm_size=swarm_size,
                                   n_iter=num_iterations,
                                   w=0.72,
                                   c1=1.49,
                                   c2=1.49)
                    elif (name_classifier[idx_alg] == 'PSOCKM'):
                        clf = PSOCKM(n_clusters=k,
                                     swarm_size=swarm_size,
                                     n_iter=num_iterations,
                                     w=0.72,
                                     c1=1.49,
                                     c2=1.49)
                    elif (name_classifier[idx_alg] == 'KMeans'):
                        clf = KMeans(n_clusters=k)
                    elif (name_classifier[idx_alg] == 'PSC'):
                        clf = PSC(swarm_size=k,
                                  n_iter=num_iterations,
                                  w=0.95,
                                  c1=2.05,
                                  c2=2.05,
                                  c3=1.0,
                                  c4=1.0,
                                  v_max=0.01)

                    clf.fit(x)

                    centroids = clf.centroids

                    Round = Round + 1

                    if clf.solution.number_of_effective_clusters == k:
                        filename = pathname_output_clusters + '/centroid-' + str(k) + \
                                   '-success-simulation-' + str(
                            idx_success_simulation[name_classifier[idx_alg]][k]  + 1) + '.csv'
                    else:
                        filename = pathname_output_clusters + '/centroid-' + str(k) \
                                   + '-fail-simulation-' + str(idx_fail_simulation[name_classifier[idx_alg]][k]  + 1) + '.csv'

                    dataframe_centroids = pd.DataFrame(centroids)
                    dataframe_centroids.transpose().to_csv(filename,
                                                           sep=" ",
                                                           index=False)

                    if clf.solution.number_of_effective_clusters == k:
                        filename = pathname_output_clusters + '/cluster-' + str(k) \
                                   + '-success-simulation-' + str(idx_success_simulation[name_classifier[idx_alg]][k]  + 1) + '.cluster'
                    else:
                        filename = pathname_output_clusters + '/cluster-' + str(k) + \
                                   '-fail-simulation-' + str(idx_fail_simulation[name_classifier[idx_alg]][k]  + 1) + '.cluster'

                    file = open(filename, 'w')
                    file.write(str(len(clf.centroids)) + '\n')
                    file.write(
                        str(clf.solution.number_of_effective_clusters) + '\n')
                    for c in range(len(clf.centroids)):
                        if len(clf.solution.clusters[c]) > 0:
                            file.write(
                                str(len(clf.solution.clusters[c])) + '\n')
                            for xi in range(len(clf.solution.clusters[c])):
                                file.write(str(
                                    clf.solution.clusters[c][xi][0]))
                                for xij in range(
                                        1, len(clf.solution.clusters[c][xi])):
                                    file.write(
                                        ' ' +
                                        str(clf.solution.clusters[c][xi][xij]))
                                file.write('\n')
                    file.close()

                    if clf.solution.number_of_effective_clusters == k:
                        evol_directory = pathname_output_evolution_success + '/evolution-' + str(k) \
                                         + '-success-simulation-' + str(idx_success_simulation[name_classifier[idx_alg]][k]  + 1)

                        if not os.path.exists(evol_directory):
                            os.makedirs(evol_directory)

                        store_evolution(
                            evol_directory, clf.debugger, k,
                            idx_success_simulation[name_classifier[idx_alg]][k]
                            + 1)

                    else:
                        evol_directory = pathname_output_evolution_fail + '/evolution-' + str(k) + \
                                         '-fail-simulation-' + str(idx_fail_simulation[name_classifier[idx_alg]][k] + 1)

                        if not os.path.exists(evol_directory):
                            os.makedirs(evol_directory)

                        store_evolution(
                            evol_directory, clf.debugger, k,
                            idx_fail_simulation[name_classifier[idx_alg]][k] +
                            1)

                    if clf.solution.number_of_effective_clusters == k:

                        for m in range(len(criteria)):
                            value = Metrics.clustering_evaluation(
                                criteria=criteria[m],
                                centroids=centroids,
                                data=x,
                                clf=clf)
                            metrics_list_sim_by_algorithm_and_k[criteria[m]][
                                name_classifier[idx_alg]][k].append(value)

                    for m in range(len(criteria)):
                        pd.DataFrame(metrics_list_sim_by_algorithm_and_k[
                            criteria[m]][name_classifier[idx_alg]][k]).to_csv(
                                pathname_output_metrics + '/' +
                                table_criteria[criteria[m]][1] + '.csv',
                                sep=" ")

                    if clf.solution.number_of_effective_clusters == k:
                        idx_success_simulation[name_classifier[idx_alg]][
                            k] = idx_success_simulation[
                                name_classifier[idx_alg]][k] + 1
                    else:
                        idx_fail_simulation[
                            name_classifier[idx_alg]][k] = idx_fail_simulation[
                                name_classifier[idx_alg]][k] + 1

                    idx_trial[name_classifier[idx_alg]][k] = idx_trial[
                        name_classifier[idx_alg]][k] + 1

                    if (idx_success_simulation[name_classifier[idx_alg]][k] >=
                            num_simulations
                            or idx_trial[name_classifier[idx_alg]][k] >=
                            num_trials):

                        check_sim_its_over[name_classifier[idx_alg]][k] = True
                        for d in check_sim_its_over.values():
                            if not all(d.values()):
                                break
                        else:
                            finished = True

                    print(
                        "Round(" + str(Round) + ") ........................ " +
                        "(SUCCESS = " + str(idx_success_simulation[
                            name_classifier[idx_alg]][k]) + ", "
                        "FAIL = " +
                        str(idx_fail_simulation[name_classifier[idx_alg]][k]) +
                        ", " + "TRAIL = " +
                        str(idx_trial[name_classifier[idx_alg]][k]) + ", " +
                        "CLF = " + name_classifier[idx_alg] + ", K = " +
                        str(k) + ")" + "\n")

            for m in range(len(criteria)):
                idx_k = 0
                for ik in KList:
                    if len(metrics_list_sim_by_algorithm_and_k[criteria[m]][
                            name_classifier[idx_alg]][ik]) > 0:
                        mean_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][idx_k] = \
                            np.mean(metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][ik])

                        std_metric_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][idx_k]  = \
                            np.std(metrics_list_sim_by_algorithm_and_k[criteria[m]][name_classifier[idx_alg]][ik])

                        list_results_by_metric[criteria[m]].append([
                            name_classifier[idx_alg], ik,
                            np.mean(metrics_list_sim_by_algorithm_and_k[
                                criteria[m]][name_classifier[idx_alg]][ik]),
                            np.std(metrics_list_sim_by_algorithm_and_k[
                                criteria[m]][name_classifier[idx_alg]][ik])
                        ])
                    idx_k = idx_k + 1

            for m in range(len(criteria)):
                min_value_mean[criteria[m]] = np.inf
                max_value_mean[criteria[m]] = -np.inf

                max_value_std[criteria[m]] = -np.inf

            for m in range(len(criteria)):
                min_value_mean[criteria[m]] = np.amin(
                    mean_metric_by_algorithm_and_k[criteria[m]][
                        name_classifier[idx_alg]])
                max_value_mean[criteria[m]] = np.amax(
                    mean_metric_by_algorithm_and_k[criteria[m]][
                        name_classifier[idx_alg]])

                max_value_std[criteria[m]] = np.amax(
                    std_metric_by_algorithm_and_k[criteria[m]][
                        name_classifier[idx_alg]])

            for m in range(len(criteria)):
                #print mean_metric_by_algorithm_and_k
                plt.figure()
                plt.title(
                    str(name_classifier[idx_alg]) + ' - ' +
                    table_criteria[criteria[m]][0])
                # plt.errorbar(KList, mean_metric_by_algorithm_and_k[criteria[m]], yerr=std_metric_by_algorithm_and_k[criteria[m]], linewidth=0.5, elinewidth=0.5, color='b', capthick=2, barsabove=True)
                plt.errorbar(KList,
                             mean_metric_by_algorithm_and_k[criteria[m]][
                                 name_classifier[idx_alg]],
                             yerr=(std_metric_by_algorithm_and_k[criteria[m]][
                                 name_classifier[idx_alg]]),
                             linewidth=0.5,
                             elinewidth=0.5,
                             color='b')
                plt.plot(KList,
                         mean_metric_by_algorithm_and_k[criteria[m]][
                             name_classifier[idx_alg]],
                         color='b',
                         marker='o',
                         linewidth=0.5,
                         markersize=5)
                plt.xlabel('K')
                plt.ylabel(table_criteria[criteria[m]][0])
                ymin = min_value_mean[criteria[m]] - max_value_std[criteria[m]]
                ymax = max_value_mean[criteria[m]] + max_value_std[criteria[m]]
                delta = ymax - ymin
                plt.ylim([ymin - 0.01 * delta, ymax + 0.01 * delta])
                plt.tight_layout()
                plt.savefig(pathname_output + '/data/' +
                            name_classifier[idx_alg] + "/" +
                            name_classifier[idx_alg] + "-" +
                            table_criteria[criteria[m]][1] + '.pdf')
                plt.close("all")

        for m in range(len(criteria)):
            df_by_metric = pd.DataFrame(list_results_by_metric[criteria[m]])
            df_by_metric.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
            df_by_metric.to_csv(pathname_output + '/data/' +
                                table_criteria[criteria[m]][1] + '.csv',
                                index=False)

    # mean_metric_by_algorithm[name_classifier[idx_alg]] = mean_metric_by_algorithm_and_k
    # std_metric_by_algorithm[name_classifier[idx_alg]] = std_metric_by_algorithm_and_k
    #

    for m in range(len(criteria)):
        min_value_mean[criteria[m]] = np.inf
        max_value_mean[criteria[m]] = -np.inf

        max_value_std[criteria[m]] = -np.inf

    for m in range(len(criteria)):
        for idx_alg in range(len(name_classifier)):
            curr_min_mean = np.amin(mean_metric_by_algorithm_and_k[criteria[m]]
                                    [name_classifier[idx_alg]])
            curr_max_mean = np.amax(mean_metric_by_algorithm_and_k[criteria[m]]
                                    [name_classifier[idx_alg]])
            min_value_mean[criteria[m]] = np.minimum(
                min_value_mean[criteria[m]], curr_min_mean)
            max_value_mean[criteria[m]] = np.maximum(
                max_value_mean[criteria[m]], curr_max_mean)

            curr_max_std = np.amax(std_metric_by_algorithm_and_k[criteria[m]][
                name_classifier[idx_alg]])

            max_value_std[criteria[m]] = np.maximum(max_value_std[criteria[m]],
                                                    curr_max_std)

    for m in range(len(criteria)):
        for idx_alg in range(len(name_classifier)):
            plt.figure()
            plt.title(
                str(name_classifier[idx_alg]) + ' - ' +
                table_criteria[criteria[m]][0])
            # plt.errorbar(KList, mean_metric_by_algorithm_and_k[criteria[m]], yerr=std_metric_by_algorithm_and_k[criteria[m]], linewidth=0.5, elinewidth=0.5, color='b', capthick=2, barsabove=True)
            plt.errorbar(KList,
                         mean_metric_by_algorithm_and_k[criteria[m]][
                             name_classifier[idx_alg]],
                         yerr=std_metric_by_algorithm_and_k[criteria[m]][
                             name_classifier[idx_alg]],
                         linewidth=0.5,
                         elinewidth=0.5,
                         color='b')
            plt.plot(KList,
                     mean_metric_by_algorithm_and_k[criteria[m]][
                         name_classifier[idx_alg]],
                     color='b',
                     marker='o',
                     linewidth=0.5,
                     markersize=5)
            plt.xlabel('K')
            plt.ylabel(table_criteria[criteria[m]][0])
            ymin = min_value_mean[criteria[m]] - max_value_std[criteria[m]]
            ymax = max_value_mean[criteria[m]] + max_value_std[criteria[m]]
            delta = ymax - ymin
            plt.ylim([ymin - 0.01 * delta, ymax + 0.01 * delta])
            plt.tight_layout()
            plt.savefig(pathname_output + '/data/' + name_classifier[idx_alg] +
                        "/" + name_classifier[idx_alg] + "-" +
                        table_criteria[criteria[m]][1] + '_final.pdf')
            plt.close("all")
Пример #3
0
def main():
    num_exec = 30
    swarm_size = 30
    num_iter = 1000
    # names = ['KMeans', 'FCMeans', 'PSOC', 'PSC', 'KMPSOC', 'PSOCKM']
    names = ['PSC', 'PSOC', 'KMPSOC', 'PSOCKM']
    # names = ['PSOC', 'PSC', 'KMPSOC', 'PSOCKM']

    print("Loading dataset")
    os.chdir('../../..')
    df = pd.read_csv('data/booking_website/booking_website_without_empty_values.csv')
    df = df.drop(['id'], axis=1)
    # df = df.drop(['idade'], axis=1)
    df = df.drop(['sexo'], axis=1)
    x = df[df.apply(lambda x: sum([x_ == '?' for x_ in x]) == 0, axis=1)]
    x = x.iloc[:, :].values.astype(float)

    print("Nomalizing dataset so that all dimenions are in the same scale")
    std = MinMaxScaler()
    x = std.fit_transform(x)
    x = x[np.random.permutation(len(x))]
    for i in range(len(names)):
        metrics = []
        rng = range(2, 11)
        for metricNumber in ["intraClusterStatistic", "quantizationError", "sumInterClusterDistance"]:
        # for metricNumber in ["gap"]:
            print("Algorithm: " + names[i])
            mean = []
            std = []
            dff = []
            for k in rng:
                # print(" Number of Clusters = " + str(k))
                for j in tqdm(range(num_exec)):
                    if names[i] == 'KPSO':
                        clf = KMPSOC(n_clusters=k, swarm_size=swarm_size, n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'PSOC':
                        clf = PSOC(n_clusters=k, swarm_size=swarm_size, n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'PSC':
                        clf = PSC(swarm_size=k, n_iter=num_iter, w=0.95, c1=2.05, c2=2.05, c3=1.0, c4=1.0, v_max=0.001)
                    elif names[i] == 'PSOCKM':
                        clf = PSOCKM(n_clusters=k, swarm_size=swarm_size, n_iter=num_iter, w=0.72, c1=1.49, c2=1.49)
                    elif names[i] == 'KMeans':
                        clf = KMeans(n_clusters=k, n_iter=num_iter, shuffle=True, tolerance=0.00001)
                    elif names[i] == 'FCMeans':
                        clf = FCMeans(n_clusters=k, n_iter=num_iter, fuzzy_c=2, tolerance=0.001)

                    clf.fit(x)
                    out_dir = "results/booking/algorithm_{}/metric_{}/".format(names[i], metricNumber)
                    if not os.path.exists(out_dir):
                        os.makedirs(out_dir)

                    file_name = out_dir + "{}_k_{}_exec_{}.csv".format('centroids', k, j)
                    save_centroids = pd.DataFrame(clf.centroids)
                    save_centroids = save_centroids.transpose()
                    save_centroids.to_csv(file_name)

                    clusters = {}
                    for c in clf.centroids:
                        clusters[c] = []

                    for xi in x:
                        dist = [np.linalg.norm(xi - clf.centroids[c]) for c in clf.centroids]
                        class_ = dist.index(min(dist))
                        clusters[class_].append(xi)

                    clusters_file = open(out_dir + "{}_k_{}_exec_{}.csv".format('clusters', k, j), 'w')
                    clusters_file.write(str(len(clf.centroids)) + '\n')

                    for c in range(len(clf.centroids)):
                        clusters_file.write(str(len(clusters[c])) + '\n')
                        for xi in range(len(clusters[c])):
                            clusters_file.write(str(clusters[c][xi][0]))
                            for xij in range(1, len(clusters[c][xi])):
                                clusters_file.write(' ' + str(clusters[c][xi][xij]))
                            clusters_file.write('\n')
                    clusters_file.close()

                    c = clf.centroids[0][0]

                    metrics.append(
                        Metrics.clustering_evaluation("{}".format(metricNumber), centroids=clf.centroids, data=x,
                                                      clf=clf, m=2.0, number_neighbors=2, centroidUnique=c))
                mean.append(np.mean(metrics))
                std.append(np.std(metrics))
                dff.append([names[i], k, np.mean(metrics), np.std(metrics)])
                metrics = []

            # plt.subplot(130 + (i + 1))
            plt.figure()

            figure_name = "results/booking/algorithm_{}/metric_{}/plot.png".format(names[i], metricNumber)
            plt.title(str(names[i]) + ' - Metric ' + metricNumber)
            plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b', capthick=2, barsabove=True)
            plt.xlabel('Clusters')
            plt.ylabel('Metric')
            plt.tight_layout()
            plt.savefig(figure_name)

            save_name = "results/booking/algorithm_{}/metric_{}/output.csv".format(names[i], metricNumber)
            dff = pd.DataFrame(dff)
            dff.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
            dff.to_csv(save_name)
Пример #4
0
def main():
    print("Loading dataset")
    eachdataset = 072017
    df = pd.read_excel(io='data/Saidafinal4periodo.xlsx', sheetname='Plan1')
    df.drop(['ID', 'Nome', 'E-mail'], 1, inplace=True)

    x = df.iloc[:, :].values.astype(float)
    print("Nomalizing dataset so that all dimenions are in the same scale")
    std_metric_by_algorithm_and_k = MinMaxScaler()
    x = std_metric_by_algorithm_and_k.fit_transform(x)



    indices_attributes = np.array([1,7,8,10,12,13,16])
    indices_attributes = indices_attributes -1
    x = x[:,indices_attributes]
    SIMULATIONS = 30

    # name = ['KMeans', 'FCMeans']
    name = ['PSOC']
    # name = ['KPSO', 'CPSO', 'PSC']
    for i in range(len(name)):
        metrics = []
        mean = []
        std = []
        rng = range(2, 10)
        for metricNumber in range(1, 26):
            for k in rng:
                print("Number of Clusters = " + str(k) + "\n")
                for j in range(SIMULATIONS):
                    print("Run ====> " + str(j))
                    # if (name[i] == 'KPSO'):
                    # clf = KPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                    # elif (name[i] == 'CPSO'):
                    # clf = CPSO(n_clusters=k, swarm_size=15, n_iter=500, w=0.72, lb_w=0.4, c1=1.49, c2=1.49)
                    # elif (name[i] == 'PSC'):
                    #     clf = PSC(minf=0, maxf=1, swarm_size=k, n_iter=500, w=0.95, v_max=0.01)
                    if (name[i] == 'KMeans'):
                        clf = KMeans(n_clusters=k)
                    elif (name[i] == 'FCMeans'):
                        clf = FCMeans(n_clusters=k)
                    elif (name[i] == 'PSOC'):
                        clf = PSOC(n_clusters=k, swarm_size=30, n_iter=1000, w=0.72, c1=1.49, c2=1.49)
                    clf.fit(x)
                    if not os.path.isdir("results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i]))+"/"):
                        os.makedirs("results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i]))+"/")
                    sn = "results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i])+"/")
                    sn = sn + "dataset_{0}".format(str(eachdataset))
                    sn = sn + "_metric_{0}".format(str(metricNumber))
                    sn = sn + "_algorithm_{0}".format(str(name[i]))
                    sn = sn + "_k_{0}".format(str(k))
                    sn = sn + "_simulation_{0}".format(str(j))
                    savecentroids = pd.DataFrame(clf.centroids)
                    savecentroids = savecentroids.transpose()
                    savecentroids.to_csv(sn+"_centroids.csv")
                    # clust = Metrics.get_clusters(x, clf.centroids)
                    # # print np.array(clust[0])
                    # c = []
                    # for ii in range(len(clust)):
                    #     c.append(np.array(clust[ii]))
                    # sc = pd.DataFrame(c)
                    # # precisa inverter?
                    # sc.to_csv(sn+"_clusters.csv")

                    file = open(sn+"_clusters.csv", 'w')
                    file.write(str(len(clf.centroids)) + '\n')
                    file.write(str(clf.solution.number_of_effective_clusters)+ '\n')
                    for c in range(len(clf.centroids)):
                        file.write(str(len(clf.solution.clusters[c])) + '\n')
                        for xi in range(len(clf.solution.clusters[c])):
                            file.write(str(clf.solution.clusters[c][xi][0]))
                            for xij in range(1,len(clf.solution.clusters[c][xi])):
                                file.write(' ' + str(clf.solution.clusters[c][xi][xij]))
                            file.write('\n')
                    file.close()
                    # if not os.path.exists('gap/' + name[i] + '/' + str(k) + '-centroids'):
                    #     os.makedirs('gap/' + name[i] + '/' + str(k) + '-centroids')
                    # centroids.to_csv('gap/' + name[i] + '/' + str(k) + '-centroids/centroid-simulation' + str(j) + '.csv', index=False)
                    # metrics.append(Metrics.Metrics.inter_cluster_statistic(clf))
                    # metrics.append(Metrics.Metrics.cluster_separation_crisp(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.cluster_separation_fuzzy(data=x, clf=clf, m=2.0))
                    # metrics.append(Metrics.Metrics.abgss(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.edge_index(data=x, clf=clf, number_neighbors=4))
                    # metrics.append(Metrics.Metrics.cluster_connectedness(data=x, clf=clf, number_neighbors=4))
                    # metrics.append(Metrics.Metrics.intra_cluster_statistic(clf))
                    # metrics.append(Metrics.Metrics.ball_hall(data=x, clf=clf))
                    # metrics.append( Metrics.Metrics.j_index(data=x, clf=clf, m=2.0) )
                    # metrics.append( Metrics.Metrics.total_within_cluster_variance(data=x, clf=clf) )
                    # metrics.append(Metrics.Metrics.classification_entropy(data=x, clf=clf, m=2.0))
                    # metrics.append(Metrics.Metrics.intra_cluster_entropy(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.variance_based_ch(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.hartigan(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.xu(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.rl(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.wb(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.xie_beni(data=x, clf=clf, m=2.0))
                    c = clf.centroids[0][0]
                    # metrics.append(Metrics.Metrics.i_index(data=x, clf=clf, centroidUnique=c))
                    # metrics.append(Metrics.Metrics.dunn_index(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.davies_bouldin(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.cs_index(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.silhouette(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.min_max_cut(data=x, clf=clf))
                    # metrics.append(Metrics.Metrics.gap_statistic(data=x, clf=clf))
                    metrics.append(Metrics.clustering_evaluation("{0}".format(str(metricNumber)), centroids=clf.centroids, data=x, clf=clf, m=2.0, number_neighbors=2, centroidUnique=c))
                mean.append(np.mean(metrics))
                std.append(np.std(metrics))
                df.append([name[i], k, np.mean(metrics), np.std(metrics)])
                metrics = []

            # plt.subplot(130 + (i + 1))
            plt.clf()
            plt.title(str(name[i]) + ' - Metric')
            plt.errorbar(rng, mean, yerr=std, marker='o', ecolor='b', capthick=2, barsabove=True)
            plt.xlabel('Clusters')
            plt.ylabel('Metric')
            saveName = "results/metrics/"+"dataset_{0}".format(str(eachdataset))+"/metric_{0}".format(str(metricNumber))+"/algorithm_{0}".format(str(name[i])+"/")
            saveName = saveName + "dataset_{0}".format(str(eachdataset))
            saveName = saveName + "_metric_{0}".format(str(metricNumber))
            saveName = saveName + "_algorithm_{0}".format(str(name[i]))
            plt.savefig(saveName+".pdf")
            df = pd.DataFrame(df)
            df.columns = ['ALGORITHM', 'CLUSTERS', 'MEAN', 'STD']
            df.to_csv(saveName+".csv")
            mean = []
            std = []
            plt.tight_layout()