Example #1
def cluster(nclus, dfs, n_init=1, variables=None, algorithm='kmeans', ret_gmm=False):
    """
    Wrapper around Gaussian Mixture, KMeans and agglomerative clustering

    Parameters:
        nclus (int): number of clusters
        dfs (pd.DataFrame): the dataframe
        n_init (int): the number of times to initialize the clusterer
        variables (list): the list of variables to take from the dataframe
        algorithm (str): clustering type, `kmeans`, `gmm` or `hier`
        ret_gmm (bool): if True, also return the fitted mixture (only
            meaningful with `algorithm='gmm'`); otherwise return just the
            cluster labels

    Returns:
        clusdef, or (clusdef, gmm) when ret_gmm is True
    """
    import numpy as np
    from sklearn.cluster import KMeans, AgglomerativeClustering
    from sklearn.mixture import GaussianMixture
    # get the data for clustering:
    if variables is not None:
        nd_pts = dfs[variables].values
    else:
        nd_pts = dfs.values
    nclus = int(nclus)
    if algorithm.lower() == 'kmeans':
        clusdef = KMeans(n_clusters=nclus, n_init=n_init).fit(nd_pts).labels_
    elif algorithm.lower() == 'gmm':
        gmm = GaussianMixture(n_components=nclus, n_init=n_init)
        gmm = gmm.fit(nd_pts)
        clusdef = np.array(gmm.predict(nd_pts))
    elif algorithm.lower() == 'hier':
        hier = AgglomerativeClustering(n_clusters=nclus, linkage='ward')
        clusdef = hier.fit_predict(nd_pts)
    else:
        raise ValueError("unknown algorithm: " + algorithm)
    if ret_gmm:
        return clusdef, gmm
    else:
        return clusdef
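A minimal usage sketch (the toy dataframe and its column names are illustrative, not from the original):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(100, 2), columns=['var_a', 'var_b'])  # toy data

labels = cluster(3, df, n_init=5, variables=['var_a', 'var_b'])  # kmeans labels
labels, gmm = cluster(3, df, algorithm='gmm', ret_gmm=True)  # labels plus the fitted mixture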
Example #2
def a_algo(v, n):
    # agglomerative (Ward) clustering of the rows of v into n clusters
    from sklearn.cluster import AgglomerativeClustering
    return AgglomerativeClustering(n_clusters=n).fit_predict(v)
Example #3
File: main.py Project: ioanaandreeab/AD
# Snippet from a larger script: hclust and skhclust are presumably
# scipy.cluster.hierarchy and sklearn.cluster; grafice is a project module.
h = hclust.linkage(x, method=metoda)
print("Hierarchy matrix:", h, sep="\n")
# Plot the hierarchy - dendrogram chart
grafice.dendrograma(h, nume_instante,
                    "Hierarchy plot. Method: " + metoda + " Euclidean metric")
# grafice.show()

# Determine the optimal partition
m = np.shape(h)[0]  # number of merges; shape returns (rows, columns)

# Number of clusters in the optimal partition: cut where the jump between
# successive merge distances, d_{i+1} - d_i, is largest (the position of
# the jump matters, not its value)
k_opt = m - np.argmax(h[1:, 2] - h[:(m - 1), 2])
print("The optimal partition has " + str(k_opt) + " clusters")

# Build the sklearn model
model_clusterizare_sk = skhclust.AgglomerativeClustering(n_clusters=int(k_opt),
                                                         linkage=metoda)
model_clusterizare_sk.fit(x)
coduri = model_clusterizare_sk.labels_
partitie = np.array(["Cluster" + str(cod + 1) for cod in coduri])
tabel_partitie = pd.DataFrame(data={"Partition": partitie},
                              index=mortalitate.index)
print("Optimal partition:", tabel_partitie, sep="\n")

# Plot the partition in the principal axes
grafice.plot_partitie(x, partitie, nume_instante, "Optimal partition")
grafice.show()
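The cut rule above, placing the cut at the largest jump between successive merge distances in column 2 of the linkage matrix, can be sanity-checked on synthetic data; a minimal sketch, assuming scikit-learn's make_blobs:

import numpy as np
from scipy.cluster.hierarchy import linkage
from sklearn.datasets import make_blobs

pts, _ = make_blobs(n_samples=200, centers=4, random_state=0)
h = linkage(pts, method='ward')
m = h.shape[0]
# largest gap between consecutive merge distances marks the natural cut
k_opt = m - np.argmax(h[1:, 2] - h[:m - 1, 2])
print(k_opt)  # expected to recover roughly the 4 generated blobs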
Example #4
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn import datasets

"Import data"
data = pd.DataFrame(datasets.load_iris().data)
shuf = np.random.choice(len(data), size=len(data), replace=False)
y = pd.DataFrame(datasets.load_iris().target)
newData = data.iloc[shuf]
newY = y.iloc[shuf]

"Create dendogram using scipy"
dendogram = sch.dendrogram(sch.linkage(newData, method="ward"))

"Create AgglomerativeClustering model using sklearn"
model = hierarchical.AgglomerativeClustering(n_clusters=3,
                                             linkage="ward",
                                             affinity="euclidean")

"Fit data using created model"
model.fit(newData)
y_pred = model.fit_predict(newData)

"accuracy"
accuracy = sum(list(newY) == y_pred) / len(newY)
print(accuracy)
plt.show()
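Because cluster ids are permutation-invariant, a label-agnostic score such as scikit-learn's adjusted_rand_score is a safer check than raw label agreement; a short sketch reusing newY and y_pred from above:

from sklearn.metrics import adjusted_rand_score

ari = adjusted_rand_score(newY.values.ravel(), y_pred)
print(ari)  # 1.0 = identical partitions up to relabeling, ~0.0 = chance level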
Example #5
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch

data = pd.read_csv(r"C:\Users\Bharat Gupta\Downloads\Mall_Customers.csv")
x = data.iloc[:, [3, 4]]
# ward minimizes the variance within each cluster, much as k-means++ minimizes WCSS
dendo = sch.dendrogram(sch.linkage(x, method='ward'))
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distance')
plt.show()

#############################################

from sklearn.cluster import AgglomerativeClustering

model = AgglomerativeClustering(n_clusters=4)
y_sch = model.fit_predict(x)
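# A possible follow-up sketch (an assumption, not in the original source):
# visualize the four clusters on the two selected feature columns
plt.scatter(x.iloc[:, 0], x.iloc[:, 1], c=y_sch)
plt.title('Agglomerative clusters')
plt.show()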

###########################################

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
import seaborn as sns

data = pd.read_csv(r"C:\Users\Bharat Gupta\Desktop\Edge Download\Wine.csv")
data.info()

# data['Malic_Acid']=np.where(data["Malic_Acid"]>5,5,data["Malic_Acid"])
# sns.boxplot(data['Malic_Acid'])
Example #6
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

# Setup assumed from context: estimator1 is a KMeans model with 3 clusters
# fit on two iris features (it exposes labels_ and cluster_centers_)
iris = pd.DataFrame(datasets.load_iris().data)
estimator1 = KMeans(n_clusters=3).fit(iris.values[:, 1:3])

colors = {0: 'go', 1: 'ro', 2: 'bo'}
for i in range(150):
    # plot each point in the color of its assigned cluster
    plt.plot(iris.values[i, 1], iris.values[i, 2], colors[estimator1.labels_[i]])
# black points are centroids
plt.plot(estimator1.cluster_centers_[:, 0], estimator1.cluster_centers_[:, 1],
         'o', c='black')
plt.show()

# Clustering using the Hierarchical Clustering algorithm
estimator2 = AgglomerativeClustering(n_clusters=3)
estimator2.fit(iris.values[:, 1:3])

for i in range(150):
    plt.plot(iris.values[i, 1], iris.values[i, 2], colors[estimator2.labels_[i]])
plt.show()

# Clustering using the DBSCAN algorithm
estimator3 = DBSCAN()
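The fragment ends right after constructing the DBSCAN estimator; a minimal sketch of one way it might continue on the same two iris features (scikit-learn's default eps and min_samples, with label -1 marking noise):

labels3 = estimator3.fit_predict(iris.values[:, 1:3])
print(set(labels3))  # DBSCAN picks the number of clusters itself; -1 = noise

plt.scatter(iris.values[:, 1], iris.values[:, 2], c=labels3)
plt.show()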
Example #7
def h_agglomerative(values, n):
    # agglomerative (Ward) clustering of the rows of values into n clusters
    from sklearn.cluster import AgglomerativeClustering
    return AgglomerativeClustering(n_clusters=n).fit_predict(values)