def cluster(nclus, dfs, n_init=1, variables=None, algorithm='kmeans', ret_gmm=False):
    """Wrapper around GaussianMixture, KMeans and agglomerative clustering.

    Parameters:
        nclus (int): number of clusters
        dfs (pd.DataFrame): the dataframe
        n_init (int): the number of times to initialize the clusterer
        variables (list): the list of variables to take from the dataframe
        algorithm (str): clustering type, `kmeans`, `gmm` or `hier`
        ret_gmm (bool): if True (valid only with ``algorithm='gmm'``) also
            return the fitted mixture model

    Returns:
        clusdef, or (clusdef, gmm) when ret_gmm is True

    Raises:
        ValueError: for an unknown `algorithm`, or when `ret_gmm` is
            requested for a non-gmm algorithm (the original code raised an
            accidental NameError in both situations).
    """
    import numpy as np
    # `sklearn.cluster.hierarchical` was made private/removed in modern
    # scikit-learn; import the estimator directly instead.
    from sklearn.cluster import KMeans, AgglomerativeClustering
    from sklearn.mixture import GaussianMixture

    # get the data for clustering
    if variables is not None:
        nd_pts = dfs[variables].values
    else:
        nd_pts = dfs

    nclus = int(nclus)
    algorithm = algorithm.lower()
    gmm = None
    if algorithm == 'kmeans':
        clusdef = KMeans(n_clusters=nclus, n_init=n_init).fit(nd_pts).labels_
    elif algorithm == 'gmm':
        gmm = GaussianMixture(n_components=nclus, n_init=n_init).fit(nd_pts)
        clusdef = np.array(gmm.predict(nd_pts))
    elif algorithm == 'hier':
        hier = AgglomerativeClustering(n_clusters=nclus, linkage='ward')
        clusdef = hier.fit_predict(nd_pts)
    else:
        # Fail loudly instead of falling through to an undefined `clusdef`.
        raise ValueError("unknown algorithm: %r" % algorithm)

    if ret_gmm:
        if gmm is None:
            raise ValueError("ret_gmm=True is only valid with algorithm='gmm'")
        return clusdef, gmm
    return clusdef
def a_algo(v, n):
    """Cluster the samples in `v` into `n` groups with agglomerative
    clustering and return the integer cluster labels.

    Parameters:
        v: array-like of shape (n_samples, n_features)
        n (int): number of clusters

    Returns:
        ndarray of shape (n_samples,) with the cluster label of each sample.
    """
    # `sklearn.cluster.hierarchical` (used by the original) was made
    # private/removed in modern scikit-learn; import the estimator directly.
    from sklearn.cluster import AgglomerativeClustering
    return AgglomerativeClustering(n_clusters=n).fit_predict(v)
# Hierarchical clustering: build the linkage matrix, pick the optimal
# partition via the largest jump in merge distances, then reproduce the
# partition with sklearn and plot it in the principal axes.
h = hclust.linkage(x, method=metoda)
print("Matrice ierarhie:", h, sep="\n")

# Plot the hierarchy (dendrogram)
grafice.dendrograma(h, nume_instante,
                    "Plot ierarhie. Metoda: " + metoda + " Metrica euclidiana")
# grafice.show()

# Determine the optimal partition: the elbow is the largest gap between
# consecutive merge distances (column 2 of the linkage matrix).
m = np.shape(h)[0]  # number of merges = n_samples - 1
# BUG FIX: the original computed h[:1, 2] - h[:(m - 1), 2], which broadcasts
# the *first* distance against all of them (argmax then just finds the
# smallest merge distance); the consecutive differences d(i+1) - d(i) are
# what the elbow criterion needs.
k_opt = m - np.argmax(h[1:, 2] - h[:m - 1, 2])
print("Partitia optimala are " + str(k_opt) + " clusteri")

# Build the sklearn model using the optimal cluster count computed above.
# BUG FIX: the original hard-coded n_clusters=5 while printing the result
# as the "optimal" partition.
model_clusterizare_sk = skhclust.AgglomerativeClustering(n_clusters=int(k_opt),
                                                         linkage=metoda)
model_clusterizare_sk.fit(x)
coduri = model_clusterizare_sk.labels_
partitie = np.array(["Cluster" + str(cod + 1) for cod in coduri])
tabel_partitie = pd.DataFrame(data={"Partitie ": partitie},
                              index=mortalitate.index)
print("Partitia optimala: ", tabel_partitie, sep="\n")
# NOTE: the original re-assigned `partitie = np.array(<generator>)` here,
# which produces a useless 0-d object array and would break the plot call
# below; the reassignment is dropped so the real label array is plotted.

# Plot the partition in the principal axes
grafice.plot_partitie(x, partitie, nume_instante, "Partitie optimala")
grafice.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
# `sklearn.cluster.hierarchical` was made private/removed in modern
# scikit-learn; import the estimator directly.
from sklearn.cluster import AgglomerativeClustering
from sklearn import datasets

# Import data, then shuffle samples and targets with the same permutation.
data = pd.DataFrame(datasets.load_iris().data)
shuf = np.random.choice(len(data), size=len(data), replace=False)
y = pd.DataFrame(datasets.load_iris().target)
newData = data.iloc[shuf]
newY = y.iloc[shuf]

# Create dendrogram using scipy
dendogram = sch.dendrogram(sch.linkage(newData, method="ward"))

# Create AgglomerativeClustering model using sklearn.
# NOTE: the deprecated/removed `affinity="euclidean"` kwarg is dropped;
# euclidean is the default metric for ward linkage anyway.
model = AgglomerativeClustering(n_clusters=3, linkage="ward")

# Fit and predict in one step (the original called fit() and then
# fit_predict(), fitting the model twice).
y_pred = model.fit_predict(newData)

# "Accuracy": fraction of samples whose cluster id equals the true class id.
# BUG FIX: `list(newY)` iterates the DataFrame's *column labels*, so the
# original compared [0] against every prediction; compare the actual target
# values instead.
# NOTE(review): cluster ids are arbitrary, so this is a real accuracy only
# when the label permutation happens to match the class encoding — confirm.
accuracy = np.sum(newY.values.ravel() == y_pred) / len(newY)
print(accuracy)
plt.show()
# --- Mall customers: dendrogram + agglomerative clustering ---------------
data = pd.read_csv(r"C:\Users\Bharat Gupta\Downloads\Mall_Customers.csv")
x = data.iloc[0:, [3, 4]]  # columns 3 and 4 of the mall dataset

# ward minimizes the variance within cluster (KMeans++ minimizes WCSS)
dendo = sch.dendrogram(sch.linkage(x, method='ward'))
plt.title('Dendrogram')
plt.xlabel('CUstomers')
plt.ylabel("ED")
plt.show()

#############################################
# `sklearn.cluster.hierarchical` was made private/removed in modern
# scikit-learn; import the estimator directly.
from sklearn.cluster import AgglomerativeClustering

model = AgglomerativeClustering(n_clusters=4)
y_sch = model.fit_predict(x)

###########################################
# --- Wine dataset: load and inspect --------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
import seaborn as sns

data = pd.read_csv(r"C:\Users\Bharat Gupta\Desktop\Edge Download\Wine.csv")
# BUG FIX: the original called x.info(), but `x` still holds the mall
# columns; inspect the freshly loaded wine data instead.
data.info()
# data['Malic_Acid']=np.where(data["Malic_Acid"]>5,5,data["Malic_Acid"])
# sns.boxplot(data['Malic_Acid'])
# NOTE(review): this chunk starts inside a plotting loop whose `for` header
# is outside this view; `i`, `iris` and `estimator1` come from that
# enclosing scope. Each sample is colored by its KMeans cluster label and
# the cluster centers are overlaid in black.
if estimator1.labels_[i]==0:
    # NOTE(review): iris.values[i:, 1] slices from row i to the end, so each
    # iteration re-plots a whole tail of the data — iris.values[i, 1]
    # (a single point) was probably intended; confirm.
    plt.plot(iris.values[i:,1],iris.values[i:,2],'go')
    plt.plot(estimator1.cluster_centers_[:,0],estimator1.cluster_centers_[:,1],'o',c='black')
elif estimator1.labels_[i]==1:
    plt.plot(iris.values[i:,1],iris.values[i:,2],'ro')
    plt.plot(estimator1.cluster_centers_[:,0],estimator1.cluster_centers_[:,1],'o',c='black')
elif estimator1.labels_[i]==2:
    plt.plot(iris.values[i:,1],iris.values[i:,2],'bo')
    plt.plot(estimator1.cluster_centers_[:,0],estimator1.cluster_centers_[:,1],'o',c='black')
plt.show()
#Black points are centroids

# # Clustering using Hierarchical Clustering Algorithm
# NOTE(review): `hierarchical` here must be sklearn.cluster.hierarchical
# from an old scikit-learn; modern versions expose AgglomerativeClustering
# directly from sklearn.cluster — confirm the installed version.
estimator2 = hierarchical.AgglomerativeClustering(n_clusters=3)
# Fit on feature columns 1 and 2 only (150 = iris sample count).
estimator2.fit(iris.values[:,1:3])
for i in range(150):
    # Same coloring scheme as above; the same `[i:, ...]` slicing concern
    # applies here too.
    if estimator2.labels_[i]==0:
        plt.plot(iris.values[i:,1],iris.values[i:,2],'go')
    elif estimator2.labels_[i]==1:
        plt.plot(iris.values[i:,1],iris.values[i:,2],'ro')
    elif estimator2.labels_[i]==2:
        plt.plot(iris.values[i:,1],iris.values[i:,2],'bo')
plt.show()

# # Clustering using DBSCAN Clustering Algorithm
estimator3 = DBSCAN()
def h_agglomerative(values, n):
    """Partition `values` into `n` clusters with agglomerative clustering.

    Parameters:
        values: array-like of shape (n_samples, n_features)
        n (int): number of clusters

    Returns:
        ndarray of shape (n_samples,) with the cluster label of each sample.
    """
    # `sklearn.cluster.hierarchical` (used by the original) was made
    # private/removed in modern scikit-learn; import the estimator directly.
    from sklearn.cluster import AgglomerativeClustering
    return AgglomerativeClustering(n_clusters=n).fit_predict(values)