示例#1
0
    def k_medoids_over_instances(self, dataset, cols, k, distance_metric, max_iters, n_inits=5, p=1):
        """
        Cluster the rows of ``dataset`` with k-medoids and store the result in it.

        With ``distance_metric == 'default'`` the work is delegated to
        ``pyclust.KMedoids``. For any other value the pairwise distances come
        from ``self.compute_distance_matrix_instances`` and an alternating
        assign/update procedure is run ``n_inits`` times from random starting
        medoids; the run with the highest average silhouette score is kept.

        :param dataset: DataFrame to cluster. It is modified in place: the
            columns 'cluster' and 'silhouette' are added.
        :param cols: Columns of ``dataset`` used as features.
        :param k: Number of clusters.
        :param distance_metric: 'default', or a value forwarded to
            ``compute_distance_matrix_instances``.
        :param max_iters: Maximum number of assign/update iterations per
            initialisation.
        :param n_inits: Number of random initialisations (passed to pyclust as
            ``n_trials`` in the default branch).
        :param p: Stored on ``self.p`` before the distances are computed
            (presumably the Minkowski order; confirm against the distance helper).
        :return: The same ``dataset`` object with the two columns added.
        :raises ValueError: If the custom-metric branch produced no clustering,
            e.g. because ``n_inits`` is smaller than 1.
        """
        temp_dataset = dataset[cols]
        if distance_metric == 'default':
            km = pyclust.KMedoids(n_clusters=k, n_trials=n_inits)
            km.fit(temp_dataset.values)
            cluster_assignment = km.labels_
        else:
            self.p = p
            cluster_assignment = []
            best_silhouette = -1

            # Pairwise distances between all instances. The sampled row positions
            # below are used as column labels of D, so this assumes D's rows and
            # columns are both labelled 0..n-1 -- NOTE(review): confirm against
            # compute_distance_matrix_instances.
            D = self.compute_distance_matrix_instances(temp_dataset, distance_metric)

            for _ in range(n_inits):
                # Start from k distinct random instances as medoids.
                centers = random.sample(range(len(dataset.index)), k)
                prev_centers = []

                n_iter = 0
                while n_iter < max_iters and centers != prev_centers:
                    n_iter += 1
                    prev_centers = centers
                    # Assign every instance to its closest medoid.
                    points_to_centroid = D[centers].idxmin(axis=1)

                    new_centers = []
                    for center in centers:
                        members = points_to_centroid == center
                        if not members.any():
                            # Nothing was assigned to this medoid (it ties with
                            # another one); keep it instead of taking idxmin
                            # of an empty Series, which raises.
                            new_centers.append(center)
                            continue
                        # New medoid: the member with the smallest summed
                        # distance to the other members. sum() returns a
                        # Series, so idxmin takes no axis argument.
                        new_centers.append(D.loc[members, members].sum().idxmin())
                    centers = new_centers

                # Convert medoid labels to cluster numbers 0..k-1.
                points_to_centroid = D[centers].idxmin(axis=1)
                current_cluster_assignment = [
                    centers.index(medoid) for medoid in points_to_centroid
                ]

                silhouette_avg = silhouette_score(temp_dataset, np.array(current_cluster_assignment))
                if silhouette_avg > best_silhouette:
                    cluster_assignment = current_cluster_assignment
                    best_silhouette = silhouette_avg

            if not cluster_assignment:
                raise ValueError(
                    'k-medoids produced no cluster assignment; '
                    'n_inits must be at least 1')

        # Add the clusters and the per-instance silhouette scores to the dataset.
        dataset['cluster'] = cluster_assignment
        dataset['silhouette'] = silhouette_samples(temp_dataset, np.array(cluster_assignment))

        return dataset
示例#2
0
            # input features
            feature_matrix[obser_count] = [
                month, date, hr, weekday, temp, cond_code
            ] + factility
            # input truth
            truth_vector[obser_count] = truth
            obser_count += 1
    return [feature_matrix, truth_vector]


# Pipeline: read the observation set, standardise the features, fit k-medoids
# and append the selected rows to 'training_subsample.txt'.
# The single-argument print(...) form behaves the same on Python 2 and 3.
print("reading data...")
train_feature_matrix, train_truth_vector = read_obser_set(
    'trainset_2014_with_facility.txt', SIZE)
print("Standardizing...")
train_feature_matrix = StandardScaler().fit_transform(train_feature_matrix)
print("clustering...")
# NOTE(review): n_clusters is 300000 here but 30000 in the commented-out
# alternatives below -- confirm which value is intended.
kmd = pyclust.KMedoids(n_clusters=300000, n_trials=50)
kmd.fit(train_feature_matrix)
# ## K-meoids
# # kmeans = KMeans(n_clusters=30000, init='k-means++', n_jobs=4, algorithm='auto').fit(train_feature_matrix)
# medoids, clusters = kMedoids(np.transpose(train_feature_matrix), 30000)
print("printing...")
# NOTE(review): `medoids` is only assigned in the commented-out line above, so
# unless it is defined elsewhere this loop raises NameError. The fitted `kmd`
# is not used to produce the indices.
# NOTE(review): every row is written with train_truth_vector[0]; presumably
# train_truth_vector[idx] was intended -- confirm before changing.
# The context manager closes the file even if the loop raises.
with open('training_subsample.txt', 'a+') as centers:
    for idx in medoids:
        centers.write(
            str(train_feature_matrix[idx]).replace("]", "").replace(
                "[", "").replace('\'', "").replace(" ", ",") + "," +
            str(train_truth_vector[0]) + "\n")
示例#3
0
def kMedoids(X, k, d_metric='euclidean'):
    """
    Run pyclust's k-medoids on ``X`` and return what ``fit_predict`` returns.

    :param X: Data handed unchanged to ``fit_predict``.
    :param k: Number of clusters (``n_clusters``).
    :param d_metric: Value passed as the estimator's ``distance`` argument.
    :return: The result of ``pyclust.KMedoids(...).fit_predict(X)``.
    """
    model = pyclust.KMedoids(distance=d_metric, n_clusters=k)
    assignments = model.fit_predict(X)
    return assignments
def k_medoids(g_distance, number_clusters):
    """
    Cluster ``g_distance`` with k-medoids and compute a 2-component TSNE embedding of it.

    Both estimators receive ``g_distance`` unchanged and are created with
    ``random_state=0``; the k-medoids estimator uses 50 trials.

    :param g_distance: Input handed to both ``fit_predict`` and ``fit_transform``.
    :param number_clusters: Number of clusters for k-medoids.
    :return: Tuple ``(embedding, labels)``: the TSNE ``fit_transform`` output
        followed by the k-medoids ``fit_predict`` output.
    """
    clusterer = pyclust.KMedoids(
        n_clusters=number_clusters,
        n_trials=50,
        random_state=0,
    )
    labels = clusterer.fit_predict(g_distance)

    embedder = TSNE(n_components=2, verbose=1, random_state=0)
    embedding = embedder.fit_transform(g_distance)

    return embedding, labels
示例#5
0
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
import pyclust
# Seed NumPy's global random generator before the estimator is created.
np.random.seed(5)

# Load the iris data: X holds the measurements, y the target class ids.
iris = datasets.load_iris()
X = iris.data
y = iris.target
print(X)
print('size of data:', X.shape)
print(y)

# Fit k-medoids with 3 clusters and 100 trials, then print the labels.
est = pyclust.KMedoids(n_clusters=3, n_trials=100)
est.fit(X)
print('estimated labels:')
print(est.labels_)
#######
fig = plt.figure(figsize=(4, 3))
plt.clf()
# Create the 3D axes through the figure rather than calling Axes3D(fig, ...)
# directly: recent Matplotlib releases no longer attach a directly constructed
# Axes3D to its figure, which leaves the plot empty. The Axes3D import at the
# top of the script is still what registers the '3d' projection on older
# Matplotlib versions.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)
plt.cla()  # clear the current axes

# Display names for the three target class ids.
ydict = {0: 'setosa', 1: 'versicolor', 2: 'Virginica'}
with plt.style.context('seaborn-whitegrid'):

    for lab, col in zip((0, 1, 2), ('blue', 'red', 'green')):
        ax.scatter(X[est.labels_ == lab, 3],
                   X[est.labels_ == lab, 0],
                   X[est.labels_ == lab, 2],
                   c=col,
示例#6
0
def kmedoid(kdata, n_clusters, n_trial):
    """
    Fit pyclust's k-medoids on ``kdata.values`` and return the fitted labels.

    :param kdata: Object exposing a ``.values`` attribute (presumably a pandas
        DataFrame; confirm against callers).
    :param n_clusters: Number of clusters.
    :param n_trial: Passed to the estimator as ``n_trials``.
    :return: The estimator's ``labels_`` attribute after fitting.
    """
    model = pyclust.KMedoids(n_trials=n_trial, n_clusters=n_clusters)
    features = kdata.values
    model.fit(features)
    return model.labels_
示例#7
0
    def k_medoids_over_instances(self,
                                 dataset: pd.DataFrame,
                                 cols: List[str],
                                 k: int,
                                 distance_metric: str,
                                 max_iters: int,
                                 n_inits: int = 5,
                                 p: int = 1) -> pd.DataFrame:
        """
        Apply k-medoids clustering using the self implemented distance metrics.

        With ``distance_metric == 'default'`` the clustering is delegated to
        ``pyclust.KMedoids``. Otherwise the distance matrix is obtained from
        ``self.compute_distance_matrix_instances`` and an alternating
        assign/update procedure is run ``n_inits`` times from random starting
        medoids; the run with the highest average silhouette score is kept.

        :param dataset: DataFrame to apply clustering on (modified in place).
        :param cols: List of columns to use for clustering.
        :param k: Number of clusters.
        :param distance_metric: Distance metrics to use for clustering.
        :param max_iters: Maximum number of iterations per initialisation.
        :param n_inits: Number of inits.
        :param p: Optional parameter for Minkowski distance metrics; stored on
            ``self.p`` before the distances are computed.
        :return: Original DataFrame with cluster and silhouette score columns added.
        :raises ValueError: If the custom-metric branch produced no clustering,
            e.g. because ``n_inits`` is smaller than 1.
        """

        # Select the appropriate columns
        temp_dataset = dataset[cols]
        # Use PyClust Package in case of default distance metric
        if distance_metric == 'default':
            km = pyclust.KMedoids(n_clusters=k, n_trials=n_inits)
            km.fit(temp_dataset.values)
            cluster_assignment = km.labels_
        else:
            self.p = p
            cluster_assignment = []
            best_silhouette = -1

            # Compute all distances. The sampled row positions below are used as
            # column labels of D, so this assumes D's rows and columns are both
            # labelled 0..n-1 -- NOTE(review): confirm against
            # compute_distance_matrix_instances.
            D = self.compute_distance_matrix_instances(temp_dataset,
                                                       distance_metric)

            for _ in range(n_inits):
                # Select k random points as centers first
                centers = random.sample(range(len(dataset.index)), k)
                prev_centers = []

                n_iter = 0
                while n_iter < max_iters and centers != prev_centers:
                    n_iter += 1
                    prev_centers = centers
                    # Assign points to clusters
                    points_to_centroid = D[centers].idxmin(axis=1)

                    new_centers = []
                    for center in centers:
                        members = points_to_centroid == center
                        if not members.any():
                            # Nothing was assigned to this medoid (it ties
                            # with another one); keep it rather than deriving
                            # a new center from an empty selection.
                            new_centers.append(center)
                            continue
                        # Candidates are restricted to the cluster's own
                        # members so a medoid always belongs to the cluster it
                        # represents. sum() returns a Series, so idxmin takes
                        # no axis argument.
                        new_centers.append(
                            D.loc[members, members].sum().idxmin())
                    centers = new_centers

                # Convert centroids to cluster numbers:
                points_to_centroid = D[centers].idxmin(axis=1)
                current_cluster_assignment = [
                    centers.index(medoid) for medoid in points_to_centroid
                ]

                silhouette_avg = silhouette_score(
                    temp_dataset, np.array(current_cluster_assignment))
                if silhouette_avg > best_silhouette:
                    cluster_assignment = current_cluster_assignment
                    best_silhouette = silhouette_avg

            if not cluster_assignment:
                raise ValueError(
                    'k-medoids produced no cluster assignment; '
                    'n_inits must be at least 1')

        # Add the clusters and silhouette scores to the dataset
        dataset['cluster'] = cluster_assignment
        dataset['silhouette'] = silhouette_samples(
            temp_dataset, np.array(cluster_assignment))

        return dataset
示例#8
0
                                             membs,
                                             n_clusters=2,
                                             distance='euclidean')

# Show the value bound to cent_upd by the preceding statement.
print(cent_upd)

# One call to the private k-medoids runner, driven by an explicitly seeded
# RandomState.
rng = np.random.RandomState(1234)
run_result = pyclust._kmedoids._kmedoids_run(
    d,
    n_clusters=2,
    distance='euclidean',
    max_iter=20,
    tol=0.001,
    rng=rng,
)
print(run_result)

# Public estimator with default settings: fit, then report each fitted
# attribute in turn.
kmd = pyclust.KMedoids(n_clusters=2)

kmd.fit(d)

for caption, attribute in (("Centers: ", "centers_"),
                           ("Labels: ", "labels_"),
                           ("SSE: ", "sse_arr_"),
                           ("N_ITER: ", "n_iter_")):
    print(caption, getattr(kmd, attribute))

# Fit three estimators that share the same random_state and print their
# fitted attributes so the runs can be compared.
print('\n\n*** Testing RandomState: ***')
for i in range(3):
    kmd2 = pyclust.KMedoids(n_clusters=2, random_state=123)
    kmd2.fit(d)
    for caption, attribute in (("Centers: ", "centers_"),
                               ("Labels: ", "labels_"),
                               ("SSE: ", "sse_arr_")):
        print(caption, getattr(kmd2, attribute))