# Exemplo n.º 1
# 0
def elbow(X,
          range_clusters=range(2, 6),
          alg='kmeans',
          cat_features=None,
          random_state=42):
    """Plot the elbow curve (inertia vs. number of clusters) for one algorithm.

    Parameters
    ----------
    X : DataFrame
        Data to cluster; ``X.values`` is passed to each model's ``fit``.
    range_clusters : iterable of int
        Candidate cluster counts to evaluate.
    alg : str
        ``'kmeans'``, ``'kmodes'``; any other value selects K-Prototypes.
    cat_features : list of int, optional
        Indices of categorical columns (used by K-Prototypes only).
    random_state : int
        Seed forwarded to every model so runs are reproducible.
    """
    # Avoid the shared mutable-default-argument pitfall.
    cat_features = [] if cat_features is None else cat_features

    inertias = []
    ks = range_clusters
    for k in ks:
        if alg == 'kmeans':
            model = KMeans(n_clusters=k, random_state=random_state)
        elif alg == 'kmodes':
            # BUG FIX: this branch previously built KMeans too, so asking for
            # 'kmodes' silently ran k-means instead of k-modes.
            model = KModes(n_clusters=k, random_state=random_state)
        else:
            model = KPrototypes(n_clusters=k,
                                cat_features=cat_features,
                                random_state=random_state)

        model.fit(X.values)
        # NOTE(review): attribute is `inertia` (no trailing underscore) —
        # consistent with this project's custom cluster library, not sklearn.
        inertias.append(model.inertia)

    plt.plot(ks, inertias, '-o', color='black')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.title(alg)
    plt.xticks(ks)
    plt.show()
# Exemplo n.º 2
# 0
def get_clusterer(n_clusters,
                  cat_features=None,
                  alg='kmeans',
                  agglo_params=None,
                  random_state=10):
    """Build the clusterer selected by `alg` and return it with its metric.

    Parameters
    ----------
    n_clusters : int
        Number of clusters for the model.
    cat_features : list of int, optional
        Categorical column indices (used only by ``'kproto'``).
    alg : str
        One of ``'agglo'``, ``'kmeans'``, ``'kmodes'``, ``'fuzzy'``,
        ``'kproto'``.
    agglo_params : sequence, optional
        ``(affinity, linkage)`` pair; required when ``alg == 'agglo'``
        (the default ``None`` would raise TypeError on indexing).
    random_state : int
        Seed for the stochastic algorithms.

    Returns
    -------
    tuple
        ``(clusterer, metric)`` — the model instance (``None`` if `alg` is
        unrecognized) and the distance-metric name for evaluation.
    """
    # Avoid the shared mutable-default-argument pitfall.
    cat_features = [] if cat_features is None else cat_features

    clusterer = None
    metric = ''
    if alg == 'agglo':
        clusterer = AgglomerativeClustering(affinity=agglo_params[0],
                                            compute_full_tree='auto',
                                            linkage=agglo_params[1],
                                            memory=None,
                                            n_clusters=n_clusters,
                                            pooling_func='deprecated')
        metric = 'euclidean'
    elif alg == 'kmeans':
        clusterer = KMeans(n_clusters=n_clusters, random_state=random_state)
        metric = 'euclidean'
    elif alg == 'kmodes':
        clusterer = KModes(n_clusters=n_clusters, random_state=random_state)
        metric = 'manhattan'
    elif alg == 'fuzzy':
        clusterer = FuzzyCMeans(n_clusters=n_clusters,
                                random_state=random_state)
        metric = 'euclidean'
    elif alg == 'kproto':
        clusterer = KPrototypes(n_clusters=n_clusters,
                                cat_features=cat_features,
                                random_state=random_state)
        metric = 'manhattan'

    return clusterer, metric
# ## K-Prototypes
# 

# In[27]:


from cluster.kprototypes import KPrototypes

# Try 25 random seeds and keep the K-Prototypes run whose clustering agrees
# best with the ground-truth target (adjusted Rand index).
best_clusters, best_centroids, best_r = None, None, None
best_score = -9999
cat_features = list(range(len(features)))
for r in range(25):
    kp = KPrototypes(n_clusters=3, cat_features=cat_features, random_state=r)
    kp.fit(df_original[features].values)
    clusters = kp.labels
    score = adjusted_rand_score(df_original[target], clusters)
    if score > best_score:
        best_clusters, best_centroids = clusters, kp.centroids
        best_score, best_r = score, r
r_kproto = best_r
kprototypes_clusters = best_clusters
best_score



# In[28]:
# #### No PCA
#

# In[19]:

from cluster.kprototypes import KPrototypes
from cluster.metrics import get_metrics, rename_labels
from sklearn.metrics import adjusted_rand_score

# Seed search for K-Prototypes on the un-reduced data: keep the run with the
# highest adjusted Rand index against the true labels.
best_clusters = None
best_centroids = None
best_r = None
best_score = -9999
cat_features = list(range(len(features)))
for r in range(25):
    kp = KPrototypes(n_clusters=3, cat_features=cat_features, random_state=r)
    kp.fit(df_original[features].values)
    clusters = kp.labels
    score = adjusted_rand_score(df_original[target], clusters)
    if score <= best_score:
        continue  # not an improvement — try the next seed
    best_clusters = clusters
    best_centroids = kp.centroids
    best_score = score
    best_r = r
r_kproto = best_r
kprototypes_clusters = best_clusters
best_score

# In[20]:

# Project the scaled data onto its first three principal components.
n_comp = 3
pca = PCA(n_components=n_comp)
X_scaled_pca = pd.DataFrame(pca.fit_transform(X_scaled.values))
X_scaled_pca.head()

# In[138]:

# Explained-variance view for the chosen number of components.
graph_components(X_scaled, n_components=n_comp)

# In[139]:

from cluster.kprototypes import KPrototypes

# Columns from index 2 onward are treated as categorical.
cat_features = list(range(2, len(X_scaled.columns)))
kp = KPrototypes(n_clusters=3, cat_features=cat_features, random_state=5)
kp.fit(X_scaled.values)

# Scatter the first two PCA coordinates, colored by cluster assignment.
points = X_scaled_pca.values
plt.scatter(points[:, 0],
            points[:, 1],
            c=kp.labels,
            s=50,
            cmap='viridis')

# NOTE(review): this fits a fresh PCA on the centroids rather than reusing
# the transform fitted on the data — confirm that is intended.
centroids_pca = PCA(n_components=n_comp).fit_transform(kp.centroids)
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='x', c='r', s=200)
plt.title('K-prototypes')
plt.show()

# In[140]:
# Exemplo n.º 6
# 0
from decomposition.pca import PCA

# Reduce X to three components with the project's PCA and report the shapes.
print(f'Shape X: {X.values.shape}')
# print(f'Shape W: {W.shape}')

pca = PCA(n_components=3, random_state=5)
X_pca = pca.fit_transform(X.values)
print(f'Shape X_pca: {X_pca.shape}')

# In[19]:

from cluster.kprototypes import KPrototypes

# Cluster the PCA-reduced data (no categorical columns here) and plot the
# first two coordinates colored by assignment.
kp = KPrototypes(n_clusters=3, cat_features=[], random_state=8)
kp.fit(X_pca)

xs, ys = X_pca[:, 0], X_pca[:, 1]
plt.scatter(xs, ys, c=kp.labels, s=50, cmap='viridis')

# NOTE(review): a fresh PCA is fitted on the centroids for plotting —
# confirm that reusing the data PCA was not intended.
centroids_pca = PCA(n_components=3).fit_transform(kp.centroids)
plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='x', c='r', s=200)
plt.title('K-prototypes')
plt.show()

# **Metrics**

# In[20]:

from cluster.metrics import get_metrics

# Silhouette analysis for k-modes over a small range of cluster counts.
range_n_clusters = [2, 3, 4]
silhouette(X_cat, X_cat_pca, alg='kmodes', range_clusters=range_n_clusters)

# **K-prototypes**

# In[110]:

# best random state
from cluster.kprototypes import KPrototypes
best_clusters = None
best_centroids = None
best_r = None
best_score = -9999
for r in range(20):
    kp = KPrototypes(n_clusters=2, random_state=r)
    kp.fit(X_num_scaled.values)
    score = adjusted_rand_score(y, kp.labels)
    if score > best_score:
        best_clusters = kme.labels
        best_centroids = kme.centroids
        best_score = score
        best_r = r
fcm_clusters = best_clusters
print('Best score:', best_score)
print('Best random state value:', best_r)

# In[111]:

# Visaulization k-prototypes
from cluster.kprototypes import KPrototypes