def elbow_manual(n_clusters,X):
    """Plot elbow curves for the hand-written k-means and the sklearn one.

    For every cluster count ``k`` in ``range(1, n_clusters)`` (``n_clusters``
    itself is excluded) the data is clustered twice: once through
    ``labeling`` + ``ss_kmeans_pp`` and once through ``kmean_sklearn``. For
    each result, the sum over all samples of the Euclidean (non-squared)
    distance to the nearest center is recorded. Both curves are drawn with
    matplotlib, and a dashed vertical line marks the knee that
    ``KneeLocator`` finds on the sklearn curve.

    Args:
        n_clusters: Exclusive upper bound of the cluster counts to evaluate.
        X: Data set; it is passed through
            ``DataFrameImputer().fit_transform`` before any clustering.

    Returns:
        None. The function only draws and shows a figure.
    """
    e=10**(-10)

    X = DataFrameImputer().fit_transform(X)
    cost_manual=[]
    cost_sklearn=[]
    for k in range(1,n_clusters):
        # NOTE(review): 0.6 is forwarded to `labeling` as-is; presumably the
        # share of samples that receive a label -- confirm against `labeling`.
        Y=labeling(k,X,0.6)
        centers,labels=ss_kmeans_pp(Y,k,e)
        centers_sk,labels_sk=kmean_sklearn(k,X)

        # Cost of the hand-written (Lloyd) implementation.
        cost_manual.append(np.sum(np.min(cdist(X,centers,'euclidean'),axis=1)))
        # Cost of the sklearn implementation.
        cost_sklearn.append(np.sum(np.min(cdist(X,centers_sk,'euclidean'),axis=1)))

    K=np.arange(1,n_clusters)
    plt.plot(K,cost_manual,label='méthode manuel',color='blue')
    plt.plot(K,cost_sklearn,label='méthode sklearn',color='orange')
    plt.xticks(np.arange(1, n_clusters, 1))
    kn = KneeLocator(K, cost_sklearn, curve='convex', direction='decreasing')
    # Draw the dashed knee marker only when a knee was actually detected;
    # `knee` is None otherwise and cannot be used as an x position.
    if kn.knee is not None:
        y_low, y_high = plt.ylim()
        plt.vlines(kn.knee, y_low, y_high, linestyles='dashed')
    # The legend has to be attached before show(): once show() returns the
    # figure has already been displayed, so a later legend() is never seen.
    plt.legend()
    plt.show()
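# Example no. 2 / 0 -- separator text left over from the code-sample listing
# this file was extracted from (presumably an example index and a counter);
# it is not program code.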
# Main
# =============================================================================

if __name__ == "__main__":
    # Load the dataset; every column except the first is kept as a feature.
    base = pd.read_csv('BigML_Dataset.csv', sep=',')

    X = base.iloc[:, 1:]

    # Impute missing values (per the original note: mean for numeric columns,
    # most frequent value for string columns).
    X = DataFrameImputer().fit_transform(X)
    # fillna() returns a new frame, so its result must be kept; the original
    # call discarded it. numeric_only=True keeps mean() from failing on any
    # remaining string columns.
    X = X.fillna(X.mean(numeric_only=True))
    # NOTE(review): 'shops_used' is presumably the non-numeric column; it is
    # removed so that only quantitative features reach the clustering.
    quanti_X = X.drop(['shops_used'], axis='columns')

    # k: number of clusters; e: small tolerance forwarded to the helpers.
    k, e = 5, 10**(-10)

    # Time the hand-written kmeans++.
    t0 = time()
    centers, clusters = kmeans_pp(quanti_X, k, e)
    t1 = time()
    print('En utilisant kmeans++ : %f' % (t1 - t0))

    # Time sklearn's k-means on the same data for comparison.
    t2 = time()
    resultat = kmean_sklearn(k, quanti_X)
    t3 = time()
    print('En utilisant kmeans++ de sklearn : %f' % (t3 - t2))

    # NOTE(review): N is presumably the number of repetitions that
    # `inertie_intra` averages over -- confirm against its definition.
    N = 100
    distance_moyenne = inertie_intra(k, quanti_X, e, N)
    print('Inertie_intra en utilisant kmeans++:', distance_moyenne)
    # No `return` here: this is module-level code, where a `return` statement
    # is a SyntaxError. `centers` and `clusters` remain available as globals.


# =============================================================================
# Main
# =============================================================================

if __name__ == "__main__":
    # Load the dataset; every column except the first is kept as a feature.
    base = pd.read_csv('BigML_Dataset.csv', sep=',')

    X = base.iloc[:, 1:]

    # Impute missing values (per the original note: mean for numeric columns,
    # most frequent value for string columns).
    X = DataFrameImputer().fit_transform(X)
    # fillna() returns a new frame, so its result must be kept; the original
    # call discarded it. numeric_only=True keeps mean() from failing on any
    # remaining string columns.
    X = X.fillna(X.mean(numeric_only=True))

    # k: number of clusters; e: small tolerance forwarded to kmeans_pp.
    k, e = 5, 10**(-10)

    # Time the hand-written kmeans++.
    t0 = time()
    centers, clusters = kmeans_pp(X, k, e)
    t1 = time()
    print('En utilisant kmeans++ : %f' % (t1 - t0))

    # Time sklearn's k-means on the same data for comparison.
    t2 = time()
    resultat = kmean_sklearn(k, X)
    t3 = time()
    print('En utilisant kmeans++ de sklearn : %f' % (t3 - t2))