def kproto(self):
    """Cluster the instance's data with K-Prototypes.

    Picks the number of clusters via silhouette analysis, fits a
    K-Prototypes model on the (numeric + categorical) data, appends the
    resulting labels as a 'labels' column, stores the labelled frame on
    ``self.data_clustered`` and returns it.

    Returns:
        pandas.DataFrame: copy of ``self.data`` with an added 'labels' column.
    """
    # TODO: solve clustering issue with PCA + K-means
    # Work on a copy: the original assigned `cluster_data = self.data`,
    # so adding the 'labels' column mutated self.data in place.
    cluster_data = self.data.copy()
    opt_k = self.silouhette_analysis(cluster_data, prototype=True)
    kp = KPrototypes(n_clusters=opt_k)
    kp.fit(cluster_data, categorical=self.categorical_features)
    labels = kp.predict(cluster_data, categorical=self.categorical_features)
    cluster_data['labels'] = labels
    self.data_clustered = cluster_data
    return cluster_data
# NOTE(review): fragment of an elbow-method segmentation script. It begins
# mid-loop (the enclosing `for k in ...` header is outside this view) and
# ends mid-loop, so the indentation of the first two and the final
# statements is assumed — confirm against the full file.
kproto.fit_predict(df_model, categorical=[0, 1, 2])
cost[k] = kproto.cost_  # total clustering cost for k clusters (elbow curve)

# Visualize the elbow plot
sns.pointplot(x=list(cost.keys()), y=list(cost.values()))
plt.show()

# Save the model with 5 clusters, chosen from the elbow plot
import pickle
kproto = KPrototypes(n_clusters=5, random_state=75)
kproto = kproto.fit(df_model, categorical=[0, 1, 2])
pickle.dump(kproto, open('best_cluster.pkl', 'wb'))

# Determine the segment of each customer
clusters = kproto.predict(df_model, categorical=[0, 1, 2])
print('segmen_pelanggan: {}\n'.format(clusters))

# Merge the original data with the customer segments
df_final = df.copy()
df_final['cluster'] = clusters
print(df_final.head())

# Display the customers belonging to each cluster
for i in range(0, 5):
    print('\nPelanggan cluster {}\n'.format(i))
    print(df_final[df_final['cluster'] == i])

# Box-plot visualization of the clustering result
# (the loop body continues past the end of this fragment)
for i in kolom_numerik:
    plt.figure(figsize=(6, 4))
class KPrototypesClustering(baseoperationclass.BaseOperationClass):
    """K-Prototypes clustering operation for mixed numeric/categorical data.

    Uses k-prototypes (Euclidean distance for numeric columns, Hamming/
    matching dissimilarity for categorical ones). If the selected data has
    no categorical columns at all, it falls back to plain K-Means.
    """

    _operation_name = 'K-Prototypes Clustering'
    _operation_code_name = 'KPrototypes'
    _type_of_operation = 'cluster'

    def __init__(self):
        super().__init__()
        self.cluster_number = CLUSTER_NUMBER          # number of clusters k
        self.categorical_weight = CATEGORICAL_WEIGHT  # gamma balancing categorical vs numeric
        self.selected_features = []                   # column subset to cluster on ([] = all)
        self.model = None    # fitted KPrototypes model (or KMeansClustering fallback)
        self.labels = None   # per-row cluster labels from the last run
        self.centers = None  # centroids with categorical columns re-inserted

    def _preprocessed_data(self, data):
        """Return `data` restricted to the selected features (unchanged if none selected)."""
        return data if not self.selected_features \
            else data.loc[:, self.selected_features]

    def set_parameters(self, cluster_number, categorical_weight=None,
                       features=None):
        """Update clustering parameters; `None` leaves a value unchanged."""
        if cluster_number is not None:
            self.cluster_number = cluster_number
        if categorical_weight is not None:
            self.categorical_weight = categorical_weight
        if features is not None and isinstance(features, (list, tuple)):
            self.selected_features = list(features)
        return True

    def get_parameters(self):
        """Return the current parameters as a serializable dict."""
        return {
            'cluster_number_KPrototypes': self.cluster_number,
            'categorical_data_weight_KPrototypes': self.categorical_weight,
            'features_KPrototypes': self.selected_features
        }

    def _get_initial_centers(self, dataset, categorical_indices):
        """Seed initial centroids k-means++-style, adapted to mixed data.

        The first centroid is a uniformly random row; each subsequent one is
        sampled with probability proportional to the (numeric Euclidean +
        gamma * categorical matching) distance to the nearest centroid
        chosen so far.

        Returns:
            list: [numeric_centroids, categorical_centroids] ndarrays,
            the init format expected by KPrototypes.
        """
        dataset_cat = dataset.take(categorical_indices, axis=1).values
        categorical_labels = [
            column for index, column in enumerate(dataset.columns)
            if index in categorical_indices
        ]
        dataset_num = dataset.drop(categorical_labels, axis=1).values

        categorical_weight = self.categorical_weight
        if categorical_weight is None or categorical_weight < 0:
            # Same default as the kmodes library: half the numeric std dev.
            categorical_weight = 0.5 * dataset_num.std()

        initial_centroids_num = np.zeros(
            (self.cluster_number, dataset_num.shape[1]))
        initial_centroids_cat = np.zeros(
            (self.cluster_number, dataset_cat.shape[1]))
        rand_index = randint(0, dataset.shape[0] - 1)
        initial_centroids_num[0], initial_centroids_cat[0] = dataset_num[
            rand_index], dataset_cat[rand_index]

        for i in range(1, self.cluster_number):
            # Row j holds each point's distance to already-chosen centroid j.
            distances_num_cat = [
                np.zeros((i, dataset.shape[0]), dtype=np.float64),
                np.zeros((i, dataset.shape[0]))
            ]
            for j in range(0, i):
                distances_num_cat[0][j] = dissimilarity_python.euclidean(
                    dataset_num, initial_centroids_num[j])
                distances_num_cat[1][j] = matching_dissim(
                    dataset_cat, initial_centroids_cat[j])
            # Distance to the *nearest* chosen centroid drives the sampling.
            distances = np.amin(distances_num_cat[0] +
                                categorical_weight * distances_num_cat[1],
                                axis=0)
            probabilities = distances / np.sum(distances)
            chosen_point = np.random.choice(range(0, dataset.shape[0]),
                                            p=probabilities)
            initial_centroids_num[i] = dataset_num[chosen_point]
            initial_centroids_cat[i] = dataset_cat[chosen_point]

        initial_centroids = [initial_centroids_num, initial_centroids_cat]
        return initial_centroids

    # Used if there's no categorical properties in the dataset
    def _fallback_algorithm(self, dataset):
        """Cluster with plain K-Means when the data is purely numeric."""
        from . import KMeansClustering
        self.model = KMeansClustering.KMeansClustering()
        self.model.set_parameters(self.cluster_number, self.selected_features)
        self.labels = self.model.get_labels(dataset)
        self.centers = self.model.centers
        return self.labels

    # By default, K-Prototypes uses euclidean distance for numerical data
    # and Hamming distance for categorical data.
    # n_init is the number of times the k-modes algorithm will be run with
    # different centroid seeds.
    # gamma is the weight to balance numerical data against categorical.
    # If None, it defaults to half of standard deviation for numerical data.
    def get_labels(self, data, reprocess=False):
        """Fit (or reuse) the model and return cluster labels for `data`.

        Args:
            data: pandas DataFrame to cluster.
            reprocess: force re-fitting even if a model already exists.

        Returns:
            Array-like of cluster labels, one per row of `data`.
        """
        data_original = data
        data = self._preprocessed_data(data)
        categorical_indices = get_categorical_indices(data)
        if not categorical_indices:
            # K-Prototypes degenerates to K-Means without categorical columns.
            return self._fallback_algorithm(data_original)
        if self.model is None or reprocess:
            data = encode_nominal_parameters(data)
            data = normalized_dataset(data, categorical_indices)
            initial_centers = self._get_initial_centers(
                data, categorical_indices)
            self.model = KPrototypes(n_clusters=self.cluster_number,
                                     max_iter=1000,
                                     init=initial_centers,
                                     n_init=10,
                                     gamma=self.categorical_weight,
                                     num_dissim=dissimilarity_python.euclidean,
                                     n_jobs=1)
            data = data.values
            self.model.fit(data, categorical=categorical_indices)
            self.labels = self.model.predict(data,
                                             categorical=categorical_indices)
            # cluster_centroids_ comes back as [numeric, categorical] parts;
            # re-insert the categorical columns at their original positions.
            self.centers = self.model.cluster_centroids_
            centers = self.centers[0]
            for index, cat_index in enumerate(categorical_indices):
                centers = np.insert(centers,
                                    cat_index,
                                    values=self.centers[1].transpose()[index],
                                    axis=1)
            self.centers = centers
        else:
            # BUGFIX: the previous version called self.model.predict(data)
            # on the raw DataFrame without the required `categorical=`
            # argument and without the encode/normalize/ndarray steps the
            # fit path applies — mirror that preprocessing here.
            data = encode_nominal_parameters(data)
            data = normalized_dataset(data, categorical_indices)
            self.labels = self.model.predict(
                data.values, categorical=categorical_indices)
        return self.labels

    # Legacy methods
    def print_parameters(self):
        return self.get_parameters()

    def save_parameters(self):
        return self.get_parameters()

    def load_parameters(self, parameters):
        """Restore parameters from a dict produced by save_parameters()."""
        self.set_parameters(
            cluster_number=parameters.get('cluster_number_KPrototypes')
            or CLUSTER_NUMBER,
            categorical_weight=parameters.get(
                'categorical_data_weight_KPrototypes') or CATEGORICAL_WEIGHT,
            features=parameters.get('features_KPrototypes') or [])
        return True

    def save_results(self):
        """Serialize labels, centers and the pickled model for storage."""
        return {
            # asarray tolerates plain-list labels from the K-Means fallback.
            'results': np.asarray(self.labels).tolist(),
            'centers': np.asarray(self.centers).tolist(),
            'dump': pickle.dumps(self.model).hex()
        }

    def load_results(self, results_dict):
        """Restore labels, centers and the model from a saved dict."""
        if results_dict.get("results") is not None:
            self.labels = np.array(results_dict['results'])
        if results_dict.get("centers") is not None:
            self.centers = np.array(results_dict['centers'])
        if results_dict.get("dump") is not None:
            # SECURITY: pickle.loads executes arbitrary code — only load
            # dumps produced by this application from a trusted store.
            self.model = pickle.loads(bytes.fromhex(results_dict['dump']))
        return True

    def process_data(self, data):
        return self.get_labels(data)

    def predict(self, data):
        return self.get_labels(data)