Exemplo n.º 1
0
def evaluate_KMeans():
    """Print and plot the average silhouette score of KMeans over a range of k.

    The data is obtained from ``input()`` (called with no arguments and
    unpacked into three values, so presumably a project-level loader that
    shadows the builtin -- defined outside this block).  For every candidate
    ``k`` a ``KMeans`` model with ``random_state=10`` is fitted on the
    transposed array, its average silhouette score is printed, and finally a
    line chart of score against ``k`` is shown.
    """
    data_array, data, data_shape = input()
    samples = data_array.transpose()

    # Starting guess for k, derived from the second dimension of the data.
    cluster_amount_init = int(math.sqrt(data_shape[1]) / 2)
    print('Initial value of k is %d' % cluster_amount_init)

    # The same k values are used for scoring and for the x axis of the chart.
    candidate_ks = range(2, cluster_amount_init * 2 - 2)

    scores = []
    for k in candidate_ks:
        labels = KMeans(n_clusters=k, random_state=10).fit_predict(samples)
        score = silhouette_score(samples, labels)
        scores.append(score)
        print('For n_clusters = %d, The average silhouette_score is: %f' % (k, score))

    # Chart: silhouette score as a function of k.
    plt.plot(candidate_ks, scores, 'bx-')
    plt.title('Silhouette_score-k line-chart')
    plt.xlabel('k')
    plt.ylabel('silhouette_score')
    plt.legend()
    plt.show()
Exemplo n.º 2
0
def gmm(n):
    """Return cluster labels from a spherical ``GMM`` with *n* components.

    The model is fitted on the transpose of the second value returned by
    ``input()`` (presumably a project-level loader; defined outside this
    block) and the result of ``fit_predict`` is returned unchanged.
    """
    # Only the second element of the loaded triple is needed here.
    _, frame, _ = input()

    mixture = GMM(n_components=n, covariance_type='spherical')
    return mixture.fit_predict(frame.transpose())
Exemplo n.º 3
0
def best_kmeans():
    """Return the labels of a two-cluster ``KMeans`` fit on the transposed array.

    The array is the first value returned by ``input()`` (presumably a
    project-level loader; defined outside this block).  ``random_state=10``
    is passed to ``KMeans``.
    """
    # Only the first element of the loaded triple is needed here.
    array, _, _ = input()

    model = KMeans(n_clusters=2, random_state=10)
    return model.fit_predict(array.transpose())
Exemplo n.º 4
0
def best_dbscan():
    """Return the ``labels_`` of a ``DBSCAN(eps=310, min_samples=4)`` fit.

    The model is fitted on the transpose of the second value returned by
    ``input()`` (presumably a project-level loader; defined outside this
    block).
    """
    # Only the second element of the loaded triple is needed here.
    _, frame, _ = input()

    fitted = DBSCAN(eps=310, min_samples=4).fit(frame.transpose())
    return fitted.labels_
Exemplo n.º 5
0
def validate(n):
    """Report whether KNN neighbours share the query's GMM cluster.

    A spherical ``GMM`` with *n* components is fitted on the transposed
    ``data``.  Then, for every combination of hash-size scale and ``k``,
    ``knn`` is run for one randomly chosen position; ``results[0]`` is treated
    as the query and every following entry as a neighbour, and each neighbour
    is printed as 'same' or 'not same.' depending on whether its cluster label
    equals the query's.

    Args:
        n: number of mixture components passed to ``GMM``.

    Returns:
        None.  All output is printed.
    """
    data_array, data, data_shape = input()
    data_transposed = data.transpose()

    # Pick one random position to query.
    # NOTE(review): if ``rd`` is the stdlib ``random`` module, randint()
    # includes its upper bound, so this may yield data_shape[1] itself --
    # confirm that knn() accepts that position.
    vipno_pos = rd.randint(0, data_shape[1])

    # Cluster every row of the transposed data.
    mixture = GMM(n_components=n, covariance_type='spherical')
    cluster_labels = mixture.fit_predict(data_transposed)

    # Map each row label of the transposed data to its cluster label.
    label_of = dict(zip(data_transposed.index, cluster_labels))

    for scale in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]:
        # hash_size depends only on the scale, not on k.
        hash_size = int(data_shape[1] * scale)

        for k in [1, 2, 3, 4, 5]:
            results = knn(data_array, data, hash_size, data_shape, vipno_pos,
                          k)

            if len(results) < 1:
                print('For n_component = %d, hash_size = %d, k = %d: no result from KNN.\n' % (
                    n, hash_size, k))
                continue

            print('For n_component = %d, hash_size = %d, k = %d, vipno_input = %d:' % (
                n, hash_size, k, results[0]))

            # Cluster of the query itself.
            cluster = label_of[results[0]]

            for result in results[1:]:
                # Look up the neighbour's own label (not the query's), so the
                # comparison actually says something about the neighbour.
                cluster_result = label_of[result]
                print('vipno_output: %d, result: %s' % (
                    result,
                    'same' if cluster == cluster_result else 'not same.'))

            print('')
Exemplo n.º 6
0
def _gmm_agreement(reference_labels, gmm_labels, sample_count):
    """Return the share of samples whose reference label equals the shifted GMM label.

    The GMM labels are shifted by subtracting the index of the most populated
    GMM component before the element-wise comparison.

    Args:
        reference_labels: label sequence to compare against.
        gmm_labels: non-negative integer labels produced by the GMM.
        sample_count: number of samples to compare; also the denominator.

    Returns:
        Matching samples divided by ``sample_count``, as a float.
    """
    main_cluster = np.argmax(np.bincount(gmm_labels))
    # NOTE(review): subtracting main_cluster is the only label alignment done
    # here; whether it maps GMM labels onto the reference labelling correctly
    # for every outcome should be confirmed.
    shifted = np.asarray(gmm_labels)[:sample_count] - main_cluster
    reference = np.asarray(reference_labels)[:sample_count]
    return float(np.count_nonzero(reference == shifted)) / sample_count


def evaluate_gmm():
    """Print how often a two-component GMM agrees with DBSCAN and with KMeans.

    Labels come from ``best_kmeans()``, ``best_dbscan()`` and ``gmm(2)``; the
    GMM is fitted separately for each comparison.  Every sample is compared,
    so the numerator and the ``data_shape[1]`` denominator cover the same
    set of samples.

    Returns:
        None.  Both accuracies are printed.
    """
    # Only the shape is needed here; it gives the number of samples.
    _, _, data_shape = input()
    sample_count = data_shape[1]

    result_kmeans = best_kmeans()
    result_dbscan = best_dbscan()

    # DBSCAN and GMM
    n_components = 2
    accuracy = _gmm_agreement(result_dbscan, gmm(n_components), sample_count)
    print('GMM accuracy in DBSCAN is: %f' % accuracy)

    # kMeans and GMM (uses its own GMM fit)
    n_components = 2
    accuracy = _gmm_agreement(result_kmeans, gmm(n_components), sample_count)
    print('GMM accuracy in kMeans is: %f' % accuracy)
Exemplo n.º 7
0
def validate(n):
    """Report whether KNN neighbours share the query's KMeans cluster.

    A ``KMeans`` model with *n* clusters (``random_state=10``) is fitted on
    the transposed array.  Then, for every combination of hash-size scale and
    ``k``, ``knn`` is run for one randomly chosen position; ``results[0]`` is
    treated as the query and every following entry as a neighbour, and each
    neighbour is printed as 'same' or 'not same.' depending on whether the
    model predicts the same cluster for it as for the query.

    Args:
        n: number of clusters passed to ``KMeans``.

    Returns:
        None.  All output is printed.
    """
    data_array, data, data_shape = input()
    data_array_transposed = data_array.transpose()
    # Transposed once up front; rows are looked up by label for prediction.
    data_transposed = data.transpose()

    # Pick one random position to query.
    # NOTE(review): if ``rd`` is the stdlib ``random`` module, randint()
    # includes its upper bound, so this may yield data_shape[1] itself --
    # confirm that knn() accepts that position.
    vipno_pos = rd.randint(0, data_shape[1])

    kmeans = KMeans(n_clusters=n, random_state=10).fit(data_array_transposed)

    def predict_cluster(vipno):
        # predict() is given a single sample, hence the reshape to one row.
        return kmeans.predict(
            data_transposed.loc[vipno].values.reshape(1, -1))

    for scale in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]:
        # hash_size depends only on the scale, not on k.
        hash_size = int(data_shape[1] * scale)

        for k in [1, 2, 3, 4, 5]:
            results = knn(data_array, data, hash_size, data_shape, vipno_pos,
                          k)

            if len(results) < 1:
                print('For n_cluster = %d, hash_size = %d, k = %d: no result from KNN.\n' % (
                    n, hash_size, k))
                continue

            print('For n_cluster = %d, hash_size = %d, k = %d, vipno_input = %d:' % (
                n, hash_size, k, results[0]))

            # Cluster of the query itself.
            cluster = predict_cluster(results[0])

            for result in results[1:]:
                cluster_result = predict_cluster(result)
                print('vipno_output: %d, result: %s' % (
                    result,
                    'same' if cluster == cluster_result else 'not same.'))

            print('')