import numpy as np
import pytest

from sklearn_extra.cluster import KMedoids


def test_seuclidean():
    # pytest.warns(None) records warnings; the assertion below checks that
    # none were raised (note: this idiom is deprecated in newer pytest releases).
    with pytest.warns(None) as record:
        km = KMedoids(2, metric="seuclidean", method="pam")
        km.fit(np.array([0, 0, 0, 1]).reshape((4, 1)))
        km.predict(np.array([0, 0, 0, 1]).reshape((4, 1)))
        km.transform(np.array([0, 0, 0, 1]).reshape((4, 1)))
    assert len(record) == 0
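For context, "seuclidean" is SciPy's standardised Euclidean distance: each squared coordinate difference is divided by that feature's variance. A quick hand check with SciPy directly (the values here are illustrative, not taken from the test):

from scipy.spatial.distance import seuclidean
import numpy as np

# sqrt((0 - 1)**2 / 0.25) == 2.0
print(seuclidean(np.array([0.0]), np.array([1.0]), np.array([0.25])))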
Example #2
def cross_validation(n=200):
    accuracy_sum_train, accuracy_sum_test, diff_sum_train, diff_sum_test = (
        0, ) * 4
    for i in range(n):
        model_sklearn = KMedoids(n_clusters=3, random_state=12412).fit(X_train)
        labels_sklearn_kmedoid = model_sklearn.predict(X_test)
        accuracy, diff = callculate_accuracy('K-medoid, Own Implementation',
                                             labels_sklearn_kmedoid)
        accuracy_sum_test += accuracy
        diff_sum_test += diff

        labels_sklearn_kmedoid = model_sklearn.predict(X_train)
        accuracy, diff = callculate_accuracy('K-medoid, Own Implementation',
                                             labels_sklearn_kmedoid)
        accuracy_sum_train += accuracy
        diff_sum_train += diff
    return (accuracy_sum_test / n, diff_sum_test / n, accuracy_sum_train / n,
            diff_sum_train / n)
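Note that random_state is fixed at 12412, so all n fits above are identical and the averages simply reproduce a single run. If the intent is to average over different medoid initialisations, one possible tweak (our assumption, not necessarily the author's) is to vary the seed per iteration:

# Hypothetical variant: a different seed each iteration makes the averaging meaningful.
model_sklearn = KMedoids(n_clusters=3, random_state=12412 + i).fit(X_train)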
def test_kmedoids_fit_predict_transform():
    # `seed` and `X` (100 samples) are module-level fixtures in the test file.
    rng = np.random.RandomState(seed)
    model = KMedoids(random_state=rng)

    labels1 = model.fit_predict(X)
    assert len(labels1) == 100
    assert_array_equal(labels1, model.labels_)

    labels2 = model.predict(X)
    assert_array_equal(labels1, labels2)

    Xt1 = model.fit_transform(X)
    assert_array_equal(Xt1.shape, (100, model.n_clusters))

    Xt2 = model.transform(X)
    assert_array_equal(Xt1, Xt2)
Example #4
    def vae_plot(self):
        # Dummy first row for the concatenation below; the sampling later
        # starts at index 1 to skip it.
        z_list = torch.Tensor(1, 2)
        poses = []
        for input, output in self.generator:
            for inp in input:
                poses.append(inp)
            mu, logvar = self.model.encode(input)
            z = self.model.reparameterize(mu, logvar)
            z2 = z[:, -1, :]
            z_list = torch.cat((z_list.double(), z2.double()), 0)

        indices = np.random.randint(low=1, high=z_list.shape[0], size=1000)
        coords = np.array([z_list[ind, :].detach().numpy() for ind in indices])

        # # k-means clustering for coloring
        # kmeans = KMeans(n_clusters=5).fit(coords)
        # y_kmeans = kmeans.predict(coords)
        # plt.scatter(coords[:,0], coords[:,1], c=y_kmeans, cmap='viridis')
        # plt.show()
        #
        # # draw each mean pose
        # centers = kmeans.cluster_centers_
        # recons = [self.model.decode(torch.from_numpy(center)).detach().numpy().reshape(19,2) for center in centers]

        # k-medoids clustering for coloring
        kmedoids = KMedoids(n_clusters=5).fit(coords)
        y_kmedoids = kmedoids.predict(coords)
        plt.scatter(coords[:, 0], coords[:, 1], c=y_kmedoids, cmap='viridis')
        plt.show()

        # Recover the original pose behind each medoid by matching its
        # coordinates back to the sampled points.
        recons = []
        for center in kmedoids.cluster_centers_:
            c = np.array(center)
            for i in range(len(coords)):
                if np.array_equal(c, coords[i]):
                    recons.append(poses[indices[i] -
                                        1].detach().numpy().reshape(19, 2))

        self.draw_poses(np.array(recons))
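Since KMedoids records where its medoids sit inside the training data, the coordinate-matching loop above can be collapsed into a direct lookup; a sketch reusing the same variable names:

# medoid_indices_ holds each medoid's row number in `coords`; the -1 again
# offsets the dummy first row of z_list.
recons = [poses[indices[i] - 1].detach().numpy().reshape(19, 2)
          for i in kmedoids.medoid_indices_]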
def test_precomputed():
    """Test the 'precomputed' distance metric."""
    rng = np.random.RandomState(seed)
    X_1 = [[1.0, 0.0], [1.1, 0.0], [0.0, 1.0], [0.0, 1.1]]
    D_1 = euclidean_distances(X_1)
    X_2 = [[1.1, 0.0], [0.0, 0.9]]
    D_2 = euclidean_distances(X_2, X_1)

    kmedoids = KMedoids(metric="precomputed", n_clusters=2, random_state=rng)
    kmedoids.fit(D_1)

    assert_allclose(kmedoids.inertia_, 0.2)
    assert_array_equal(kmedoids.medoid_indices_, [2, 0])
    assert_array_equal(kmedoids.labels_, [1, 1, 0, 0])
    assert kmedoids.cluster_centers_ is None

    med_1, med_2 = tuple(kmedoids.medoid_indices_)
    predictions = kmedoids.predict(D_2)
    # Integer division maps the medoid indices (2, 0) onto their cluster
    # labels (1, 0), which is what predict should return for X_2.
    assert_array_equal(predictions, [med_1 // 2, med_2 // 2])

    transformed = kmedoids.transform(D_2)
    assert_array_equal(transformed, D_2[:, kmedoids.medoid_indices_])
Example #6
# https://scikit-learn-extra.readthedocs.io/en/latest/generated/sklearn_extra.cluster.KMedoids.html
from sklearn_extra.cluster import KMedoids
import numpy as np

X = np.asarray([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
kmedoids = KMedoids(n_clusters=2, random_state=0).fit(X)
print("labels")
print(kmedoids.labels_)

# Predict the closest cluster for each sample in the list of arguments
print("predictions")
print(kmedoids.predict([[0, 0], [4, 4]]))

# Displays the cluster centers, here called medoids, which are points from the original dataset
print("Cluster centers")
print(kmedoids.cluster_centers_)

print("inertia")
print(kmedoids.inertia_)
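As a quick sanity check on what inertia_ reports, the sum of distances from each sample to its nearest medoid can be recomputed by hand (assuming the default Euclidean metric):

from sklearn.metrics import pairwise_distances

# Distance of every sample to each medoid; the row-wise minima sum to inertia_.
d = pairwise_distances(X, kmedoids.cluster_centers_)
print(d.min(axis=1).sum())  # should match kmedoids.inertia_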
Example #7
# train_indexes.append(151)
# train_indexes.append(152)

test_indexes = [x for x in list(range(0, 150)) if x not in list(train_indexes)]
X_train = X[train_indexes, :]
X_test = X[test_indexes, :]
y = y[test_indexes]

#k-medoid, own implementation
model = k_medoids(k=3, max_iter=200)
model.fit(X_train)
labels_own_kmedoid = model.predict(X_test)

#k-medoid, sklearn
model_sklearn = KMedoids(n_clusters=3, random_state=12412).fit(X_train)
labels_sklearn_kmedoid = model_sklearn.predict(X_test)

#kmeans, sklearn
model_kmeans = KMeans(n_clusters=3).fit(X_train)
labels_sklearn_kmeans = model_kmeans.predict(X_test)

# In[147]:

#Plot the identified clusters and compare with our result

fig, axes = plt.subplots(1, 4, figsize=(20, 7), dpi=200)
axes[0].scatter(X_test[:, 2], X_test[:, 3], c=y, cmap='Pastel1', edgecolor='k')
axes[1].scatter(X_test[:, 2],
                X_test[:, 3],
                c=labels_own_kmedoid,
                cmap='Accent',
                edgecolor='k')
Example #8
class ArgumentClusterer:
    english_clusterer = None
    greek_clusterer = None

    def __init__(self, n_components=2):
        # The PCA is (re)created in fit() with a data-dependent n_components,
        # so the constructor argument is currently unused.
        #self.__pca = PCA(n_components=n_components, random_state=0)
        self.__clusterer = None
        self.__medoid_texts = None

    def fit(self, x, output_filename_suffix='output.pdf'):
        x = np.array(x)
        num_samples, num_features = x.shape[0], x.shape[1]
        self.__pca = PCA(n_components=min(num_samples, num_features),
                         random_state=0)
        x_transformed = self.__pca.fit_transform(x)

        visualizer = KElbowVisualizer(KMedoids(random_state=0),
                                      k=(1, num_samples),
                                      timings=False,
                                      locate_elbow=True)
        visualizer.fit(x_transformed)
        best_n_clusters = visualizer.elbow_value_ if visualizer.elbow_value_ is not None else 1

        self.__clusterer = KMedoids(n_clusters=best_n_clusters, random_state=0)
        self.__clusterer.fit(x_transformed)

    def predict(self, x):
        x_transformed = self.__pca.transform(x)
        return self.__clusterer.predict(x_transformed)

    def get_medoid_indices(self):
        return self.__clusterer.medoid_indices_.tolist()

    # Sort different arguments into similar clusters.
    @staticmethod
    @counter
    def suggest_clusters(discussions, lang_det, en_nlp, el_nlp):

        # The workspace doesn't have enough discussions, early exit.
        if len(discussions) < 3:
            return {'greek_clusters': {}, 'english_clusters': {}}

        # Fit all clusterers for all discussions of a single workspace.
        ArgumentClusterer.fit_clusterers(discussions, lang_det, en_nlp, el_nlp)
        english_clusters = {
            label: {
                'nodes': [],
                'texts': [],
                'summary': '',
                'medoid_text': ''
            }
            for label in map(
                str, ArgumentClusterer.english_clusterer.__clusterer.labels_)
        } if ArgumentClusterer.english_clusterer is not None else {}
        greek_clusters = {
            label: {
                'nodes': [],
                'texts': [],
                'summary': '',
                'medoid_text': ''
            }
            for label in map(
                str, ArgumentClusterer.greek_clusterer.__clusterer.labels_)
        } if ArgumentClusterer.greek_clusterer is not None else {}

        for discussion in discussions:
            if discussion['Position'] in ['Issue', 'Solution']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                if ArgumentClusterer.english_clusterer is None:
                    continue
                predicted = str(
                    ArgumentClusterer.english_clusterer.predict(
                        [en_nlp.tokenizer(text).vector])[0])
                english_clusters[predicted]['nodes'].append(discussion['id'])
                english_clusters[predicted]['texts'].append(text)
                english_clusters[predicted][
                    'medoid_text'] = ArgumentClusterer.english_clusterer.__medoid_texts[
                        predicted]
            elif language == 'greek':
                if ArgumentClusterer.greek_clusterer is None:
                    continue
                predicted = str(
                    ArgumentClusterer.greek_clusterer.predict(
                        [el_nlp.tokenizer(text).vector])[0])
                greek_clusters[predicted]['nodes'].append(discussion['id'])
                greek_clusters[predicted]['texts'].append(text)
                greek_clusters[predicted][
                    'medoid_text'] = ArgumentClusterer.greek_clusterer.__medoid_texts[
                        predicted]

        # Run textrank on the non-empty aggregated text of each cluster, for
        # each language, once all discussions have been assigned.
        for en_cluster in english_clusters.keys():
            en_text = '. '.join(english_clusters[en_cluster]['texts'])
            if en_text != '':
                en_doc = run_textrank(en_text, en_nlp)
                english_clusters[en_cluster]['summary'] = text_summarization(
                    en_doc, en_nlp, config.top_n, config.top_sent)

        for el_cluster in greek_clusters.keys():
            el_text = '. '.join(greek_clusters[el_cluster]['texts'])
            if el_text != '':
                el_doc = run_textrank(el_text, el_nlp)
                greek_clusters[el_cluster]['summary'] = text_summarization(
                    el_doc, el_nlp, config.top_n, config.top_sent)

        return {
            'greek_clusters': greek_clusters,
            'english_clusters': english_clusters
        }

    @staticmethod
    @counter
    def fit_clusterers(discussions, lang_det, en_nlp, el_nlp):
        english_clusterer = None
        greek_clusterer = None

        english_texts, greek_texts = [], []
        for discussion in discussions:
            if discussion['Position'] in ['Issue']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                english_texts.append(text)
            elif language == 'greek':
                greek_texts.append(text)

        if len(english_texts) > 2:
            # Initialize the English Clusterer.
            english_clusterer = ArgumentClusterer()

            # Calculate the embeddings for each text of this discussion.
            english_embeddings = [
                en_nlp.tokenizer(text).vector for text in english_texts
            ]

            # Fit the clusterer using the textual embeddings of this discussion.
            english_clusterer.fit(english_embeddings, 'english.pdf')

            # Find the medoids of each cluster from each language.
            english_clusterer.__medoid_texts = {
                str(english_clusterer.__clusterer.labels_[i]): english_texts[i]
                for i in english_clusterer.__clusterer.medoid_indices_
            }

        if len(greek_texts) > 2:
            # Initialize the Greek Clusterer.
            greek_clusterer = ArgumentClusterer()

            # Calculate the embeddings for each text of this discussion.
            greek_embeddings = [
                el_nlp.tokenizer(text).vector for text in greek_texts
            ]

            # Fit the clusterer using the textual embeddings of this discussion.
            greek_clusterer.fit(greek_embeddings, 'greek.pdf')

            # Find the medoids of each cluster from each language.
            greek_clusterer.__medoid_texts = {
                str(greek_clusterer.__clusterer.labels_[i]): greek_texts[i]
                for i in greek_clusterer.__clusterer.medoid_indices_
            }

        ArgumentClusterer.english_clusterer = english_clusterer
        ArgumentClusterer.greek_clusterer = greek_clusterer
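The k selection in fit() leans on yellowbrick's elbow heuristic; a minimal standalone sketch of that step (the random data is ours, for illustration):

import numpy as np
from sklearn_extra.cluster import KMedoids
from yellowbrick.cluster import KElbowVisualizer

x = np.random.RandomState(0).rand(30, 4)
viz = KElbowVisualizer(KMedoids(random_state=0), k=(1, 10),
                       timings=False, locate_elbow=True)
viz.fit(x)
best_k = viz.elbow_value_ if viz.elbow_value_ is not None else 1  # fall back when no elbow is found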
Example #9
class DFKMedoids(BaseEstimator, ClusterMixin):
    def __init__(self, cluster_name='KMedoids', columns=None,
                 eval_inertia=False, eval_silhouette=False, eval_chi=False, eval_dbi=False, eval_sample_size=None,
                 **kwargs):
        self.cluster_name     = cluster_name
        self.columns          = columns
        self.model            = KMedoids(**kwargs)
        self.eval_inertia     = eval_inertia
        self.eval_silhouette  = eval_silhouette
        self.eval_chi         = eval_chi
        self.eval_dbi         = eval_dbi
        self.eval_sample_size = eval_sample_size
        self.transform_cols   = None
        self.eval_df          = None
        self.centroid_df      = None
        
    def fit(self, X, y=None):
        self.columns        = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]

        # Evaluation
        if any([self.eval_inertia, self.eval_silhouette, self.eval_chi, self.eval_dbi]):
            inertias    = []
            silhouettes = []
            chis        = []
            dbis        = []

            self.eval_df = pd.DataFrame({
                'n_cluster': [x+1 for x in range(self.model.n_clusters)],
            })
            self.eval_df['centroid'] = self.eval_df['n_cluster'].apply(lambda x: [])

            tmp_X = X[self.transform_cols].copy()
            index = 0
            for n_cluster in tqdm(self.eval_df['n_cluster'].values):
                model = copy.deepcopy(self.model)
                model.n_clusters = n_cluster
                model.fit(tmp_X)

                # Cluster centroid
                self.eval_df.at[index, 'centroid'] = model.cluster_centers_

                # Reference: https://blog.cambridgespark.com/how-to-determine-the-optimal-number-of-clusters-for-k-means-clustering-14f27070048f
                if self.eval_inertia:
                    inertias.append(model.inertia_)

                # Reference: https://towardsdatascience.com/clustering-metrics-better-than-the-elbow-method-6926e1f723a6
                if self.eval_silhouette:
                    silhouettes.append(np.nan if n_cluster <= 1 else silhouette_score(tmp_X, model.labels_, sample_size=self.eval_sample_size, metric='euclidean', random_state=model.random_state))

                # Reference: https://stats.stackexchange.com/questions/52838/what-is-an-acceptable-value-of-the-calinski-harabasz-ch-criterion
                if self.eval_chi:
                    chis.append(np.nan if n_cluster <= 1 else calinski_harabasz_score(tmp_X, model.labels_))

                # Reference: https://stackoverflow.com/questions/59279056/davies-bouldin-index-higher-or-lower-score-better
                if self.eval_dbi:
                    dbis.append(np.nan if n_cluster <= 1 else davies_bouldin_score(tmp_X, model.labels_))

                index += 1

            if self.eval_inertia:
                self.eval_df['inertia'] = inertias

            if self.eval_silhouette:
                self.eval_df['silhouette'] = silhouettes

            if self.eval_chi:
                self.eval_df['calinski_harabasz'] = chis

            if self.eval_dbi:
                self.eval_df['davies_bouldin'] = dbis

        # Train
        else:
            self.model.fit(X[self.transform_cols])

            self.centroid_df = pd.DataFrame(
                self.model.cluster_centers_,
                columns=self.transform_cols
            )
            self.centroid_df['Cluster'] = [f'Cluster {x}' for x in np.unique(self.model.labels_)]
            self.centroid_df.set_index('Cluster', inplace=True)
            self.centroid_df.index.name = None

        return self
    
    def predict(self, X):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X = X.copy()
        new_X[self.cluster_name] = self.model.predict(X[self.transform_cols])
        new_X[self.cluster_name] = 'Cluster ' + new_X[self.cluster_name].astype(str)

        return new_X

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

    def predict_proba(self, X):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        # Measure distance to centroid
        prob_df = pd.DataFrame(
            DistanceMetric.get_metric('euclidean').pairwise(X[self.transform_cols], self.centroid_df),
            columns=[f'{self.cluster_name} Cluster {x}' for x in range(len(self.centroid_df))]
        )
        # Convert distances to scores: row-normalise, then invert so that
        # nearer centroids score higher (note: each row sums to n_clusters - 1,
        # so these are similarity scores rather than true probabilities)
        prob_df = prob_df.divide(prob_df.sum(axis=1), axis=0)
        prob_df = 1 - prob_df

        new_X = pd.concat([X, prob_df], axis=1)

        return new_X
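A minimal usage sketch for DFKMedoids (the toy DataFrame and its column names are ours):

import pandas as pd

df = pd.DataFrame({'x': [1.0, 1.1, 0.0, 0.1],
                   'y': [0.0, 0.1, 1.0, 1.1]})
clusterer = DFKMedoids(n_clusters=2, random_state=0)  # extra kwargs are forwarded to KMedoids
labelled = clusterer.fit_predict(df)   # adds a 'KMedoids' column of 'Cluster N' labels
scores = clusterer.predict_proba(df)   # appends one inverted-distance column per cluster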
Example #10
XC = X[-recent_cluster_num:]
yC = y[-recent_cluster_num:]

num_test = 60
a_class, a_kmeans, a_kmedians, a_gmm, a_spectral, test_count = [], [], [], [], [], []
for m in range(1, num_test + 1):
    classification_pred, clustering_pred_kmeans, clustering_pred_kmedians, clustering_pred_gmm, clustering_pred_spectral, real = [], [], [], [], [], []
    XC = X[-recent_cluster_num:]
    yC = y[-recent_cluster_num:]
    for i in range(1, m + 1):
        # XC/yC do not change inside this loop, so the three models below are
        # refit to identical data each iteration; only the probe point XC_p varies.
        kmeans = KMeans(n_clusters=5, random_state=0).fit(XC)
        kmedians = KMedoids(n_clusters=5, random_state=0).fit(XC)
        gmm = GaussianMixture(n_components=5, random_state=0).fit(XC)
        XC_p = X[recent_cluster_num + i, :].reshape(1, 27)
        label_kmeans = kmeans.predict(XC_p)
        label_kmedians = kmedians.predict(XC_p)
        label_gmm = gmm.predict(XC_p)
        centroid_kmeans = kmeans.cluster_centers_[label_kmeans]
        centroid_kmedians = kmedians.cluster_centers_[label_kmedians]
        centroid_gmm = gmm.means_[label_gmm]
        clf = LinearDiscriminantAnalysis()
        class_p = clf.fit(X, y).predict(XC_p)
        clus_p_kmeans = clf.fit(XC, yC).predict(centroid_kmeans)
        clus_p_kmedians = clf.fit(XC, yC).predict(centroid_kmedians)
        clus_p_gmm = clf.fit(XC, yC).predict(centroid_gmm)
        classification_pred.append(class_p)
        clustering_pred_kmeans.append(clus_p_kmeans)
        clustering_pred_kmedians.append(clus_p_kmedians)
        clustering_pred_gmm.append(clus_p_gmm)
        real.append(y[recent_cluster_num + i])
Example #11
def kmedoids_clustering(country, dbData, thermalFields, clusters,
                        floor_area_outlier_borders,
                        energy_consumption_outlier_borders):

    # https://hackersandslackers.com/json-into-pandas-dataframes/
    # json_normalize uses '.' as its default separator; since our data contains
    # float values, we set the separator for the normalisation to '_' instead.
    data_df = pd.json_normalize(dbData, sep="_")
    print(data_df.describe().transpose())

    # https://datatofish.com/k-means-clustering-python/
    slim_data_df = pd.DataFrame(
        data_df,
        columns=[
            'ratedDwelling_spatialData_totalFloorArea_value', thermalFields
        ])

    print(slim_data_df.head())

    # Scaling the data matters: the clusters now come out roughly circular,
    # whereas the unscaled data formed stripe-like bands.
    slim_fitted_df = StandardScaler().fit_transform(slim_data_df)

    # Remove the outliers we had already spotted visually in an earlier k-means plot
    print("=== Outliers")
    # First, convert the scaled array back into a dataframe
    slim_data_df_optimised = pd.DataFrame(
        slim_fitted_df,
        columns=[
            'ratedDwelling_spatialData_totalFloorArea_value', thermalFields
        ])

    print(slim_data_df_optimised.shape)
    print(slim_data_df_optimised)

    print(country + ' outlier borders floor_area' +
          str(floor_area_outlier_borders))
    print(country + ' outlier borders energy_consumption' +
          str(energy_consumption_outlier_borders))

    print('Outliers')
    slim_data_df_optimised_floor_area = slim_data_df_optimised[
        slim_data_df_optimised['ratedDwelling_spatialData_totalFloorArea_value']
        >= floor_area_outlier_borders[0]]
    slim_data_df_optimised_energy_consumption = slim_data_df_optimised[
        slim_data_df_optimised[thermalFields] >
        energy_consumption_outlier_borders[0]]

    print(slim_data_df_optimised_floor_area)

    if country == 'England':
        slim_data_df_optimised_energy_consumption_lower = slim_data_df_optimised[
            slim_data_df_optimised[thermalFields] <
            energy_consumption_outlier_borders[1]]
        print(slim_data_df_optimised_energy_consumption_lower)
        frames = [
            slim_data_df_optimised_energy_consumption,
            slim_data_df_optimised_energy_consumption_lower
        ]
        slim_data_df_optimised_energy_consumption = pd.concat(frames,
                                                              sort=False)
    print(slim_data_df_optimised_energy_consumption)

    print("Outliers indexes")
    index_outliers_floorArea = slim_data_df_optimised_floor_area.index.values
    index_outliers_thermal = slim_data_df_optimised_energy_consumption.index.values

    print("index_outliers_floorArea")
    print(index_outliers_floorArea)
    print("index_outliers_thermal")
    print(index_outliers_thermal)

    # Removing the Outliers
    slim_data_df_optimised = slim_data_df_optimised.drop(
        index=index_outliers_floorArea)
    slim_data_df_optimised = slim_data_df_optimised.drop(
        index=index_outliers_thermal)
    print("scaled data after droping the outliers")
    print(slim_data_df_optimised.shape)

    print("rechecking")
    print(slim_data_df_optimised[slim_data_df_optimised_floor_area])
    print(slim_data_df_optimised[slim_data_df_optimised_energy_consumption])

    print("transforming the optimised dataset back to an array")
    slim_data_df_optimised_as_array = slim_data_df_optimised.to_numpy()
    print(slim_data_df_optimised_as_array.shape)
    print(type(slim_data_df_optimised_as_array))

    # remove the same index rows in the original data
    data_df = data_df.drop(index=index_outliers_floorArea)
    data_df = data_df.drop(index=index_outliers_thermal)
    print("original data after droping the outliers")
    print(data_df.shape)

    print("=== Outliers END")

    # preset the number of clusters
    kmedoids = KMedoids(n_clusters=clusters,
                        random_state=0).fit(slim_data_df_optimised_as_array)
    print("labels")
    print(kmedoids.labels_)

    # Displays the cluster centers, here called medoids, which are points from the original dataset
    print("Cluster centers")
    print(kmedoids.cluster_centers_)

    print("inertia")
    print(kmedoids.inertia_)

    print("predictions")
    print(kmedoids.predict([[-0.98965, -0.3211], [0.6, -0.300]]))

    scatter = plt.scatter(slim_data_df_optimised_as_array[:, 0],
                          slim_data_df_optimised_as_array[:, 1],
                          c=kmedoids.labels_.astype(float),
                          s=50,
                          alpha=0.5)

    centroids = kmedoids.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)

    plt.xlabel("floor area")
    plt.ylabel("energy consumption")

    plt.title(country + ":  " + str(len(slim_data_df_optimised_as_array) + 1) +
              " dwellings")

    plt.legend(*scatter.legend_elements(), loc='upper right', title="Clusters")

    here = os.path.dirname(os.path.abspath(__file__))

    filename = os.path.join(
        Path(here).parent, 'plots', 'kmedoid_plots',
        country + '_kmedoid_plot.png')
    plt.savefig(filename)

    # function that creates a dataframe for cluster centers with a column for cluster number

    P = pd_centers(
        ['ratedDwelling_spatialData_totalFloorArea_value', thermalFields],
        centroids)
    print('Centroids (x,y,label) for the fitted data')
    print(P)

    # we have the array with labels (cluster numbers)
    # so we add a new column with cluster numbers to the original data
    labels = kmedoids.labels_
    data_df['cluster_number'] = labels
    print('original data and the corresponding cluster numbers')
    print(data_df)

    # display the rating level from the original data and the cluster number from the fitted data
    rating_vs_cluster_data_df = pd.DataFrame(
        data_df, columns=['awardedRating_ratingLevel', 'cluster_number'])
    print(rating_vs_cluster_data_df)

    # group by rating level
    print(
        rating_vs_cluster_data_df.groupby(
            ["awardedRating_ratingLevel",
             "cluster_number"])['cluster_number'].count())

    # => there is no clear relation btw. rating_level and cluster number
    return 'the END'
    def kmedoids(self, score_df, col_name):
        kmedoids = KMedoids(n_clusters=self.clust_num, random_state=self.random_state)
        kmedoids.fit(score_df[[col_name]])
        res_cluster = kmedoids.predict(score_df[[col_name]])
        return res_cluster
Example #13
plt.scatter(p1, p2, c=y_kmeans, s=50)
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200)
plt.show()
print("sum of sq dist = ", kmeans.inertia_)
print("purity : ", purity_score(y, y_kmeans))
print("homogenousity score: ", hs(y, y_kmeans))
vals = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(X)
    vals.append(km.inertia_)
plt.plot(k, vals)
plt.show()
kmedoids = KMedoids(n_clusters=4, random_state=0).fit(X)
centers = kmedoids.cluster_centers_
labels = kmedoids.labels_
y_kmed = kmedoids.predict(X)
plt.scatter(p1, p2, c=y_kmed, s=50)
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200)
plt.show()
print("sum of sq dist = ", kmedoids.inertia_)
print("purity : ", purity_score(y, y_kmed))
print("homegenousity score : ", hs(y, y_kmed))
vals = []
for i in k:
    km = KMedoids(n_clusters=i, random_state=0).fit(X)
    vals.append(km.inertia_)
plt.plot(k, vals)
plt.show()

gmm = GMM(n_components=4).fit(X)