import random

import numpy as np
import pyclust
from sklearn.metrics import silhouette_score, silhouette_samples


def k_medoids_over_instances(self, dataset, cols, k, distance_metric, max_iters, n_inits=5, p=1):
    # If the metric is set to default, we use the pyclust package...
    temp_dataset = dataset[cols]
    if distance_metric == 'default':
        km = pyclust.KMedoids(n_clusters=k, n_trials=n_inits)
        km.fit(temp_dataset.values)
        cluster_assignment = km.labels_
    else:
        self.p = p
        cluster_assignment = []
        best_silhouette = -1

        # Compute all pairwise distances.
        D = self.compute_distance_matrix_instances(temp_dataset, distance_metric)

        for it in range(0, n_inits):
            # First select k random points as centers.
            centers = random.sample(range(0, len(dataset.index)), k)
            prev_centers = []
            n_iter = 0
            while (n_iter < max_iters) and not (centers == prev_centers):
                n_iter += 1
                prev_centers = centers

                # Assign points to clusters.
                points_to_centroid = D[centers].idxmin(axis=1)

                new_centers = []
                for i in range(0, k):
                    # And find the new center that minimizes the sum of the distances within the cluster.
                    best_center = D.loc[points_to_centroid == centers[i],
                                        points_to_centroid == centers[i]].sum().idxmin()
                    new_centers.append(best_center)
                centers = new_centers

            # Convert centroids to cluster numbers:
            points_to_centroid = D[centers].idxmin(axis=1)
            current_cluster_assignment = []
            for i in range(0, len(dataset.index)):
                current_cluster_assignment.append(centers.index(points_to_centroid.iloc[i]))

            silhouette_avg = silhouette_score(temp_dataset, np.array(current_cluster_assignment))
            if silhouette_avg > best_silhouette:
                cluster_assignment = current_cluster_assignment
                best_silhouette = silhouette_avg

    # And add the clusters and silhouette scores to the dataset.
    dataset['cluster'] = cluster_assignment
    silhouette_per_inst = silhouette_samples(temp_dataset, np.array(cluster_assignment))
    dataset['silhouette'] = silhouette_per_inst
    return dataset
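Both this snippet and the refactored version further down call a compute_distance_matrix_instances helper that is not shown. A minimal sketch of what such a helper could look like, assuming scipy's cdist, a Minkowski metric parameterized by self.p, and a DataFrame result indexed by instance position (all assumptions, since the original helper is not included):

import pandas as pd
from scipy.spatial.distance import cdist


def compute_distance_matrix_instances(self, dataset, distance_metric):
    # Hypothetical helper: pairwise distances between all instances, returned
    # as a DataFrame so that the D[centers] and D.loc[mask, mask] indexing
    # used above works on integer position labels.
    if distance_metric == 'minkowski':
        dists = cdist(dataset.values, dataset.values, metric='minkowski', p=self.p)
    else:
        dists = cdist(dataset.values, dataset.values, metric=distance_metric)
    n = len(dataset.index)
    return pd.DataFrame(dists, index=range(n), columns=range(n))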
        # input features
        feature_matrix[obser_count] = [month, date, hr, weekday, temp, cond_code] + factility
        # input truth
        truth_vector[obser_count] = truth
        obser_count += 1
    return [feature_matrix, truth_vector]


print("reading data...")
train_feature_matrix, train_truth_vector = read_obser_set('trainset_2014_with_facility.txt', SIZE)

print("Standardizing...")
train_feature_matrix = StandardScaler().fit_transform(train_feature_matrix)

print("clustering...")
kmd = pyclust.KMedoids(n_clusters=300000, n_trials=50)
kmd.fit(train_feature_matrix)

# ## K-medoids
#
# kmeans = KMeans(n_clusters=30000, init='k-means++', n_jobs=4, algorithm='auto').fit(train_feature_matrix)
# medoids, clusters = kMedoids(np.transpose(train_feature_matrix), 30000)

print("printing...")
# Recover the medoid row indices by matching the fitted centers back to the
# data; k-medoids centers are actual rows of the input matrix.
medoids = [np.where((train_feature_matrix == center).all(axis=1))[0][0]
           for center in kmd.centers_]
centers = open('training_subsample.txt', 'a+')
for idx in medoids:
    # Write each medoid's features followed by its truth label.
    centers.write(
        str(train_feature_matrix[idx]).replace("]", "").replace("[", "").replace('\'', "").replace(" ", ",")
        + "," + str(train_truth_vector[idx]) + "\n")
centers.close()
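The head of read_obser_set is cut off above. A hypothetical reconstruction consistent with the visible tail, assuming a whitespace-delimited file in which each line holds the six scalar fields, then the facility features, then the truth value (the actual file layout is an assumption):

import numpy as np


def read_obser_set(filename, size):
    # Hypothetical reconstruction of the truncated function head.
    feature_matrix = [None] * size
    truth_vector = [None] * size
    obser_count = 0
    with open(filename) as f:
        for line in f:
            if obser_count >= size:
                break
            fields = [float(x) for x in line.split()]
            month, date, hr, weekday, temp, cond_code = fields[:6]
            factility = fields[6:-1]  # facility features (assumed position)
            truth = fields[-1]
            # input features
            feature_matrix[obser_count] = [month, date, hr, weekday, temp, cond_code] + factility
            # input truth
            truth_vector[obser_count] = truth
            obser_count += 1
    return [feature_matrix, truth_vector]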
import pyclust


def kMedoids(X, k, d_metric='euclidean'):
    km = pyclust.KMedoids(n_clusters=k, distance=d_metric)
    return km.fit_predict(X)
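For reference, a quick way to exercise this wrapper on synthetic data; the blob parameters here are illustrative, not from the original project:

import numpy as np
from sklearn.datasets import make_blobs

# Two well-separated Gaussian blobs; the labels should split them cleanly.
X, _ = make_blobs(n_samples=100, centers=2, random_state=0)
labels = kMedoids(X, k=2)
print(np.bincount(labels))  # roughly 50 / 50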
import pyclust
from sklearn.manifold import TSNE


def k_medoids(g_distance, number_clusters):
    kmedoids = pyclust.KMedoids(n_clusters=number_clusters, n_trials=50,
                                random_state=0).fit_predict(g_distance)
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
    tsne_kmedoids = tsne_model.fit_transform(g_distance)
    return tsne_kmedoids, kmedoids
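A hedged usage sketch: g_distance is presumably a feature or pairwise-distance matrix; random data stands in for it here, which is an assumption about the original input:

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
g_distance = rng.rand(200, 10)  # placeholder for the real matrix

embedding, labels = k_medoids(g_distance, number_clusters=3)
# Plot the 2-D t-SNE embedding colored by medoid cluster.
plt.scatter(embedding[:, 0], embedding[:, 1], c=labels)
plt.show()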
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
import pyclust

np.random.seed(5)

iris = datasets.load_iris()
X = iris.data
y = iris.target
print(X)
print('size of data:', X.shape)
print(y)

est = pyclust.KMedoids(n_clusters=3, n_trials=100)
est.fit(X)
print('estimated labels:')
print(est.labels_)

#######
fig = plt.figure(figsize=(4, 3))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
plt.cla()  # clear current axes!!

ydict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
with plt.style.context('seaborn-whitegrid'):
    for lab, col in zip((0, 1, 2), ('blue', 'red', 'green')):
        ax.scatter(X[est.labels_ == lab, 3],
                   X[est.labels_ == lab, 0],
                   X[est.labels_ == lab, 2],
                   c=col, label=ydict[lab])
    ax.legend()
plt.show()
import pyclust


def kmedoid(kdata, n_clusters, n_trial):
    kmd = pyclust.KMedoids(n_clusters=n_clusters, n_trials=n_trial)
    kmd.fit(kdata.values)
    return kmd.labels_
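Since kdata.values is accessed, this wrapper expects a pandas DataFrame; a small illustrative call (the column names are made up):

import numpy as np
import pandas as pd

kdata = pd.DataFrame(np.random.rand(50, 4), columns=['f1', 'f2', 'f3', 'f4'])
labels = kmedoid(kdata, n_clusters=3, n_trial=10)
print(labels[:10])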
def k_medoids_over_instances(self, dataset: pd.DataFrame, cols: List[str], k: int, distance_metric: str,
                             max_iters: int, n_inits: int = 5, p: int = 1):
    """
    Apply k-medoids clustering using the self-implemented distance metrics.

    :param dataset: DataFrame to apply clustering on.
    :param cols: List of columns to use for clustering.
    :param k: Number of clusters.
    :param distance_metric: Distance metric to use for clustering.
    :param max_iters: Maximum number of iterations.
    :param n_inits: Number of random initializations.
    :param p: Optional parameter for the Minkowski distance metric.
    :return: Original DataFrame with cluster and silhouette score columns added.
    """

    # Select the appropriate columns
    temp_dataset = dataset[cols]

    # Use the pyclust package in case of the default distance metric
    if distance_metric == 'default':
        km = pyclust.KMedoids(n_clusters=k, n_trials=n_inits)
        km.fit(temp_dataset.values)
        cluster_assignment = km.labels_
    else:
        self.p = p
        cluster_assignment = []
        best_silhouette = -1

        # Compute all pairwise distances
        D = self.compute_distance_matrix_instances(temp_dataset, distance_metric)

        for it in range(0, n_inits):
            # Select k random points as centers first
            centers = random.sample(range(0, len(dataset.index)), k)
            prev_centers = []
            n_iter = 0
            while (n_iter < max_iters) and not (centers == prev_centers):
                n_iter += 1
                prev_centers = centers

                # Assign points to clusters
                points_to_centroid = D[centers].idxmin(axis=1)

                new_centers = []
                for i in range(0, k):
                    # Find the new center that minimizes the sum of the distances within the cluster
                    best_center = D.loc[points_to_centroid == centers[i],
                                        points_to_centroid == centers[i]].sum().idxmin()
                    new_centers.append(best_center)
                centers = new_centers

            # Convert centroids to cluster numbers
            points_to_centroid = D[centers].idxmin(axis=1)
            current_cluster_assignment = []
            for i in range(0, len(dataset.index)):
                current_cluster_assignment.append(centers.index(points_to_centroid.iloc[i]))

            silhouette_avg = silhouette_score(temp_dataset, np.array(current_cluster_assignment))
            if silhouette_avg > best_silhouette:
                cluster_assignment = current_cluster_assignment
                best_silhouette = silhouette_avg

    # Add the clusters and silhouette scores to the dataset
    dataset['cluster'] = cluster_assignment
    silhouette_per_inst = silhouette_samples(temp_dataset, np.array(cluster_assignment))
    dataset['silhouette'] = silhouette_per_inst

    return dataset
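A hedged example of calling this method with the default metric; the host class name used here is hypothetical, since the snippets never show the class definition:

import numpy as np
import pandas as pd

# Hypothetical host class holding k_medoids_over_instances and the
# distance-matrix helper sketched earlier.
clusterer = NonHierarchicalClustering()
df = pd.DataFrame(np.random.rand(60, 3), columns=['acc_x', 'acc_y', 'acc_z'])
result = clusterer.k_medoids_over_instances(df, ['acc_x', 'acc_y', 'acc_z'],
                                            k=3, distance_metric='default',
                                            max_iters=20, n_inits=5)
print(result[['cluster', 'silhouette']].head())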
import numpy as np
import pyclust

# d: the small test data array defined earlier in this test script (not shown).

# The snippet opens mid-call; the truncated line read:
#     ... membs, n_clusters=2, distance='euclidean')
# and its result was stored in cent_upd.
print(cent_upd)

rng = np.random.RandomState(1234)
print(pyclust._kmedoids._kmedoids_run(d, n_clusters=2, distance='euclidean',
                                      max_iter=20, tol=0.001, rng=rng))

kmd = pyclust.KMedoids(n_clusters=2)
kmd.fit(d)
print("Centers: ", kmd.centers_)
print("Labels: ", kmd.labels_)
print("SSE: ", kmd.sse_arr_)
print("N_ITER: ", kmd.n_iter_)

print('\n\n*** Testing RandomState: ***')
for i in range(3):
    kmd2 = pyclust.KMedoids(n_clusters=2, random_state=123)
    kmd2.fit(d)
    print("Centers: ", kmd2.centers_)
    print("Labels: ", kmd2.labels_)
    print("SSE: ", kmd2.sse_arr_)