def exec_kprototypes(df, choices_obj):
    """Scale the numerical columns, cluster with k-prototypes and plot.

    For every header in ``choices_obj['numerical']`` a ``<header>_scaled``
    column is added to ``df`` (the result of ``whiten`` on that column).
    The scaled numerical columns and the categorical columns listed in
    ``choices_obj['categorical']`` are clustered together, and the resulting
    labels are stored in ``df['cluster_labels']``.  The number of clusters
    is read interactively from standard input.
    """
    numerical_headers = choices_obj['numerical']
    categorical_headers = choices_obj['categorical']

    print("Whitening data...", end='', flush=True)
    scaled_headers = []
    for header in numerical_headers:
        scaled_name = header + "_scaled"
        df[scaled_name] = whiten(df[header])
        scaled_headers.append(scaled_name)
    print("Done.")

    plain_categoricals = list(categorical_headers)
    numeric_part = df[scaled_headers].astype(float)
    categorical_part = df[plain_categoricals].astype(str)
    X = pd.concat([numeric_part, categorical_part], axis=1)

    k = int(input("Number of clusters:\n > "))
    kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)

    # The categorical columns were concatenated last, so they occupy the
    # trailing positions of X.
    n_columns = len(X.columns)
    first_categorical = n_columns - len(plain_categoricals)
    df['cluster_labels'] = kproto.fit_predict(
        X.values, categorical=list(range(first_categorical, n_columns)))

    if len(scaled_headers) < 2:
        return

    # Plot clusters
    print("Only showing 2 dimensions of data (picking first two headers)")
    sns.scatterplot(x=scaled_headers[0],
                    y=scaled_headers[1],
                    hue='cluster_labels',
                    data=df)
    plt.show()
def k_prototypes_fitness(self, individual):
    """Return the fitness of one weight vector as a one-element tuple.

    ``individual`` holds one weight per column; when ``self.add_target`` is
    set a leading weight of ``1`` is prepended.  The weighted vector is
    stored in ``self.individual``.

    Results are memoised in ``self.results`` as ``weights + [fitness]``.
    If the same weights (rounded to two decimals) were already evaluated,
    the stored fitness is returned without re-clustering.
    """
    self.individual = individual
    if self.add_target:
        self.individual = [1] + self.individual

    # Cache lookup, up to the 2nd decimal.  The stored rows contain the
    # weights exactly as kept in self.individual (including the prepended
    # target weight), so the comparison must use self.individual as well;
    # comparing against the raw argument never matches when add_target is
    # enabled.
    current = [round(float(w), 2) for w in self.individual]
    for previous in self.results:
        if [round(float(w), 2) for w in previous[:-1]] == current:
            print('já calculado')
            return float(previous[-1]),

    # Only copy the data once we know clustering is actually needed.
    df_cluster = self.X.copy()

    # Apply the weights to the numerical columns.
    for i in self.numerical_index:
        df_cluster.iloc[:, i] = self.individual[i] * df_cluster.iloc[:, i]

    random.seed(10)
    kproto = KPrototypes(
        n_clusters=self.cluster_param,
        cat_dissim=self.matching_dissim_weighted,
        max_iter=5,
        verbose=0,
        gamma=1,
        n_init=1,
        init='random',
        random_state=10)
    kproto.fit(df_cluster.values, categorical=self.categorical_index)

    ftnss = self.calculate_fitness(kproto.labels_, kproto)
    self.save_scoring(self.individual, ftnss, kproto)
    self.results.append(self.individual + [ftnss])
    return ftnss,
def predict(self):
    """Cluster the pickled dataset into 5 groups and persist the outcome.

    Reads the dataset from ``self.data_processed`` and the labels from
    ``self.label_file`` (both pickles), fits k-prototypes with column 1 as
    the categorical feature, and stores in ``self.result`` the first three
    columns of ``self.data_cleaned`` (minus its first row), the dataset and
    the cluster id, stacked column-wise.  The result is written as a pickle
    (``self.result_binary``) and as tab-separated text
    (``self.cluster_result``); the fitted model is pickled to
    ``kproto_res``.
    """
    with open(self.data_processed, 'rb') as dataset_file:
        self.dataset = pickle.load(dataset_file)
    with open(self.label_file, 'rb') as labels_file:
        self.label = pickle.load(labels_file)

    kproto = KPrototypes(n_clusters=5, init='Cao', verbose=2)
    clusters = kproto.fit_predict(self.dataset, categorical=[1])

    cleaned = np.loadtxt(fname=self.data_cleaned, dtype=object, delimiter=',')
    room_identity = cleaned[1:, :3]
    self.result = np.column_stack((room_identity, self.dataset, clusters))

    # Centroids followed by the training statistics.
    print(kproto.cluster_centroids_)
    print(kproto.cost_)
    print(kproto.n_iter_)

    with open(self.result_binary, 'wb') as result_file:
        pickle.dump(self.result, result_file)
    with open('kproto_res', 'wb') as model_file:
        pickle.dump(kproto, model_file)

    with open(self.cluster_result, 'w') as report:
        for row in self.result.tolist():
            report.write("\t".join(str(value) for value in row) + '\n')

    for s, c in zip(self.label, clusters):
        print("Room identity: {}, cluster:{}".format(s, c))
def get_knee_results(data, cluster_lims, cores, categorical):
    """Pick a cluster count from the knee of the k-prototypes cost curve.

    ``cluster_lims`` is unpacked into ``range(...)`` to produce the
    candidate cluster counts.  One model is fitted per candidate on
    ``data[cols]`` (``cols`` is not defined in this function; it is
    expected to exist at module level) and its cost recorded.  The knee is
    written to ``OUT_DIR / "n_clusters.txt"``, the full cost curve to
    ``OUT_DIR / "knee_results.csv"``, and the knee is returned.
    """
    candidate_ks = range(*cluster_lims)

    costs = []
    for k in tqdm(candidate_ks):
        model = KPrototypes(k, init="cao", random_state=0, n_jobs=cores)
        model.fit(data[cols], categorical=categorical)
        costs.append(model.cost_)

    locator = KneeLocator(
        candidate_ks,
        costs,
        curve_nature="convex",
        curve_direction="decreasing",
    )
    best_k = locator.knee

    with open(OUT_DIR / "n_clusters.txt", "w") as f:
        f.write(str(best_k))

    cost_curve = pd.Series(index=candidate_ks, data=costs)
    cost_curve.to_csv(OUT_DIR / "knee_results.csv", header=False)
    return best_k
def kprototypesCluster(features: np.ndarray, catCols: list, nClust: int):
    """Fit a k-prototypes model with ``nClust`` clusters and return it.

    ``catCols`` is forwarded as the ``categorical`` argument of the fit.
    The cluster assignments are not returned separately; they remain
    available on the returned model.
    """
    # NOTE(review): the original carried a reminder to convert continuous
    # features to float before fitting; no conversion is done here.
    model = KPrototypes(n_clusters=nClust, verbose=2)
    model.fit(features, categorical=catCols)
    return model
def kproto(self, K=20, N=int(1e5), MN=4, T=10, type='cao', save=True):
    """Cluster ``self.to_numpy()`` with k-prototypes.

    Args:
        K: number of clusters.
        N, MN, T: accepted for interface compatibility; not used here.
        type: initialisation scheme, ``'huang'`` or ``'cao'``.
        save: when true, the fitted model is passed to ``self.save`` under
            the name ``'Clustering_kproto_model'``.

    Returns:
        A tuple of three arrays: ``cluster_centroids_[0]``,
        ``cluster_centroids_[1]`` and the cluster label of every row.

    Raises:
        ValueError: if ``type`` is neither ``'huang'`` nor ``'cao'``
            (previously this surfaced as an UnboundLocalError).
    """
    data = self.to_numpy()

    if type == 'huang':
        model = KPrototypes(n_clusters=K, init='Huang', n_init=1, verbose=1)
    elif type == 'cao':
        model = KPrototypes(n_clusters=K, init='Cao', verbose=2,
                            max_iter=10000)
    else:
        raise ValueError(
            "unknown initialisation type {!r}; expected 'huang' or 'cao'"
            .format(type))

    # Positions of the categorical columns depend on the data layout
    # selected by self.cl_type.
    if self.cl_type == 'prop':
        categorical = [0, 3, 6, 8]
    else:
        categorical = [0, 2, 3, 9] + list(range(11, 36))

    clusters = model.fit_predict(data, categorical=categorical)

    if save:
        self.save(model, 'Clustering_kproto_model')

    return (np.array(model.cluster_centroids_[0]),
            np.array(model.cluster_centroids_[1]),
            np.array(clusters))
def making_model(self):
    """Fit a 5-cluster k-prototypes model on ``self.df_model`` and save it.

    Columns 0, 1 and 2 are treated as categorical.  The fitted model is
    pickled to ``cluster.pkl`` in the current working directory and kept on
    ``self.kproto``.
    """
    kproto = KPrototypes(n_clusters=5, random_state=75)
    kproto = kproto.fit(self.df_model, categorical=[0, 1, 2])

    # Save the model; the context manager guarantees the file is flushed
    # and closed even if pickling fails.
    with open('cluster.pkl', 'wb') as model_file:
        pickle.dump(kproto, model_file)

    self.kproto = kproto
def get_labels(data, n_clusters, cores, categorical):
    """Fit k-prototypes on ``data[cols]`` and return the cluster labels.

    ``cols`` is not defined in this function; it is expected to exist at
    module level.  The final cost of the fit is printed.
    """
    # NOTE(review): 'matching' is not one of the init values documented by
    # kmodes ('Huang', 'Cao', 'random') -- confirm this is accepted by the
    # installed version.
    model_options = dict(init="matching",
                         n_init=50,
                         random_state=0,
                         n_jobs=cores)
    model = KPrototypes(n_clusters, **model_options)
    model.fit(data[cols], categorical=categorical)
    print(model.cost_)
    return model.labels_
def find_optimalCluster(self):
    """Show an elbow plot of the k-prototypes cost for k = 2..9.

    One model is fitted per cluster count on ``self.df_model`` with
    columns 0, 1 and 2 treated as categorical.
    """
    # Iterate over the candidate cluster counts and collect the cost of
    # each fit.
    candidate_ks = list(range(2, 10))
    costs = []
    for k in candidate_ks:
        model = KPrototypes(n_clusters=k, random_state=75)
        model.fit_predict(self.df_model, categorical=[0, 1, 2])
        costs.append(model.cost_)

    # Visualise the elbow plot.
    sns.pointplot(x=candidate_ks, y=costs)
    plt.show()
def kprotoypes_cluster(df, n_clusters, category, hover_text):
    """Cluster every column except ``hover_text`` and build a scatter plot.

    Returns the result of ``scat2d`` when exactly two feature columns
    remain, otherwise the result of ``scat3d``.
    """
    features = df.loc[:, df.columns != hover_text]

    model = KPrototypes(n_clusters=n_clusters, init='Cao', verbose=2)
    labels = model.fit_predict(features, categorical=category)

    points = np.array(features.to_numpy().tolist())
    plotter = scat2d if len(features.columns) == 2 else scat3d
    return plotter(points, labels, hover_text, df)
def kproto(self):
    """Cluster ``self.data`` with k-prototypes and attach the labels.

    The cluster count comes from ``self.silouhette_analysis``.  The labels
    are written to a ``'labels'`` column of the very same object held in
    ``self.data`` (no copy is made); that object is also stored in
    ``self.data_clustered`` and returned.
    """
    # TODO- solve clustering issue with PCA + K-means
    frame = self.data
    n_clusters = self.silouhette_analysis(frame, prototype=True)

    model = KPrototypes(n_clusters=n_clusters)
    model.fit(frame, categorical=self.categorical_features)
    frame['labels'] = model.predict(
        frame, categorical=self.categorical_features)

    self.data_clustered = frame
    return frame
def KPrototypes_cluster(input_data, k_clusters):
    """Standardise the numeric columns and cluster with k-prototypes.

    The float/integer columns of ``input_data`` are standardised **in
    place** (the caller's DataFrame is modified).  Columns 0-7 are treated
    as categorical.

    Returns:
        The cluster label of every row.
    """
    from kmodes.kprototypes import KPrototypes

    # Normalise the numeric columns; skip when there are none, since the
    # scaler cannot be fitted on zero features.
    numeric_columns = input_data.select_dtypes(
        include=['float', 'integer']).columns
    if len(numeric_columns) > 0:
        scaler = preprocessing.StandardScaler()
        input_data[numeric_columns] = scaler.fit_transform(
            input_data[numeric_columns])

    # DataFrame.as_matrix() was removed from pandas; .values yields the
    # same ndarray and is available on every pandas version.
    matrix = input_data.values

    kproto = KPrototypes(n_clusters=k_clusters, init='Cao', verbose=2)
    return kproto.fit_predict(matrix,
                              categorical=[0, 1, 2, 3, 4, 5, 6, 7])
def cluster(summ, agg_classes=None):
    """Cluster edge ranks and mark the highest-ranked cluster as accepted.

    Every (flow, edge) pair of ``summ`` whose rank, rounded to two
    decimals, is at least 0.5 becomes one sample.  Without ``agg_classes``
    the ranks are split into two clusters with KMeans; with ``agg_classes``
    the classes of the edge's source and target are added as categorical
    features and K-Prototypes with three clusters is used.  Every policy
    in the cluster with the highest mean rank is reported through
    ``summ.mark_cluster_accepted``.

    Returns ``None``.  Nothing is marked when no edge reaches the
    threshold.
    """
    prop = {}
    ranks = []
    for flow, edge in summ.get_flowedges():
        rank = round(summ.get_edge_rank(flow, edge), 2)
        if rank < 0.5:
            continue
        policy = nopticon.ReachabilityPolicy({
            'flow': flow,
            'source': edge[0],
            'target': edge[1]
        })
        # Remember which row of `ranks` belongs to this policy.
        prop[policy] = len(ranks)
        if agg_classes is not None:
            ranks.append([rank, agg_classes[edge[0]], agg_classes[edge[1]]])
        else:
            ranks.append([rank])

    # Nothing passed the threshold: there is nothing to cluster or accept.
    if not ranks:
        return

    if agg_classes is not None:
        kproto = KPrototypes(n_clusters=3, init='Huang')
        # np.array builds the same 2-D array that np.matrix(...).A did,
        # without going through the discouraged matrix class.
        clust = kproto.fit_predict(np.array(ranks), categorical=[1, 2])
    else:
        # NOTE(review): the n_jobs argument of KMeans was removed in
        # scikit-learn 1.0 -- confirm the pinned version still accepts it.
        agg = KMeans(n_clusters=2, n_jobs=2)
        clust = agg.fit(ranks).labels_
    assert len(clust) == len(ranks)

    # Find the cluster with the highest mean rank.
    means = {}
    high = None
    for k in set(clust):
        kranks = [ranks[idx][0] for idx in prop.values() if clust[idx] == k]
        if not kranks:
            # Rows whose policy was overwritten by an equal policy are not
            # referenced from `prop`; a cluster made only of such rows has
            # no mean to compute.
            continue
        means[k] = sum(kranks) / len(kranks)
        if high is None or means[k] > means[high]:
            high = k

    for p, idx in prop.items():
        if clust[idx] == high:
            summ.mark_cluster_accepted(p.flow(), p.edge())
    return
def get_clusters(self, df, var_list, k_values, map_sa_districts, path_out,
                 cat_list=None):
    """Fit and plot one k-prototypes clustering per value in ``k_values``.

    For each ``k`` the model is fitted on ``df[var_list]`` with
    ``cat_list`` as the categorical column indices, the labels are written
    to ``df['KPrototype cluster labels']`` (overwriting the previous
    iteration's labels) and ``self.plot_clusters`` is called.

    Args:
        cat_list: categorical column indices; ``None`` (the default) means
            an empty list.

    Returns:
        ``df``, carrying the labels of the last ``k``.
    """
    # A fresh list per call avoids sharing one mutable default object
    # between calls.
    categorical = [] if cat_list is None else cat_list

    for k in k_values:
        # k prototype
        KPro_model = KPro(n_clusters=k)
        KPro_fit = KPro_model.fit(X=df[var_list], categorical=categorical)
        df['KPrototype cluster labels'] = KPro_fit.labels_
        # plot
        self.plot_clusters(k, df, var_list, map_sa_districts, path_out)
    return df
def kprototypes_compute_metrics_for_every_cluster_number(
        clusters_range_lower_bound,
        clusters_range_upper_bound,
        dataset,
        distance_algorithm,
        init_Cao_or_Huang_for_kprototypes,
        list_categorical_features_indeces_for_kprototypes,
        print_optimum_metrics=True):
    """Fit k-prototypes for a range of cluster counts and collect metrics.

    Cluster counts run from ``clusters_range_lower_bound`` (inclusive) to
    ``clusters_range_upper_bound`` (exclusive).

    Returns:
        A list with one dict per cluster count, with the keys
        ``'clusters'``, ``'silhouette'``, ``'error'`` (the model cost) and
        ``'num_jobs'``.  Despite its name, ``'num_jobs'`` holds the
        model's ``n_iter_`` value; the key is kept for compatibility.
    """
    kprototypes_list_metrics = []
    for num_of_clusters in range(clusters_range_lower_bound,
                                 clusters_range_upper_bound):
        kprototypes = KPrototypes(
            n_clusters=int(num_of_clusters),
            init=str(init_Cao_or_Huang_for_kprototypes),
            n_init=50,
            verbose=0)
        predictions = kprototypes.fit_predict(
            dataset,
            categorical=list_categorical_features_indeces_for_kprototypes)

        error_metric = kprototypes.cost_
        n_iterations = kprototypes.n_iter_
        # `metric` is keyword-only in current scikit-learn releases;
        # passing it by name works on old and new versions alike.
        silhouette = silhouette_score(dataset, predictions,
                                      metric=distance_algorithm)

        kprototypes_list_metrics.append({
            'clusters': num_of_clusters,
            'silhouette': silhouette,
            'error': error_metric,
            'num_jobs': n_iterations
        })
        if print_optimum_metrics:
            print(
                "For n_clusters = {}, silhouette score is {}, cluster_errors is {}, "
                "n_jobs {})".format(num_of_clusters, silhouette, error_metric,
                                    n_iterations))
    return kprototypes_list_metrics
def ClusterCreation(request, *args):
    """Cluster the training data and generate the cluster statistics CSV.

    The fitted model is stored in the module-level ``kproto`` variable.

    Returns:
        ``None`` when any extra positional argument is not ``None``;
        otherwise an ``HttpResponse`` confirming that the clustering ran.
    """
    global kproto

    # Get data from database and cast it to a numpy array.
    rows = get_training_data()
    rows_array = np.array(rows)

    # The first column holds the ids; only the remaining columns (the
    # variables that can cluster the client) are used for clustering.
    data_array = rows_array[:, 1:]

    # Clustering
    kproto = KPrototypes(n_clusters=3, init='Cao', verbose=2)
    kproto.fit(data_array, categorical=[1, 2, 3, 4])

    # Create CSV with cluster statistics
    clusterStatisticsCSV(kproto)

    if any(argument is not None for argument in args):
        return None
    return HttpResponse('Clustering realizado y CSV report generado')
def return_best_cluster(self, df_cluster, cluster_param):
    """Fit the final clustering model, save it and return it.

    For ``self.cluster_method == 'kprototypes'`` the numerical columns of
    ``df_cluster`` are multiplied **in place** by the weights in
    ``self.individual``.  If ``<self.folder>cluster_init.json`` exists,
    one model is fitted per stored initialisation and the one with the
    lowest fitness (sum over clusters of max(target) - min(target)) wins;
    otherwise a single model with ``'Cao'`` initialisation is fitted.
    For ``'hdbscan'`` an HDBSCAN clusterer is fitted instead.

    The winning model is dumped to ``<self.folder>best_model.joblib`` and
    its labels are written to ``self.df['cluster']``.

    Returns:
        The fitted model, or ``None`` for an unrecognised cluster method.

    Raises:
        ValueError: if the initialisation file exists but is empty.
    """
    if self.cluster_method == 'kprototypes':
        # Weights on the numerical columns.
        for i in self.numerical_index:
            df_cluster.iloc[:, i] = self.individual[i] * df_cluster.iloc[:, i]

        init_path = self.folder + 'cluster_init.json'
        if os.path.exists(init_path):
            with open(init_path) as f:
                cluster_init = json.load(f)
            if not cluster_init:
                raise ValueError(
                    'no initialisations found in ' + init_path)

            ftnss = 100000
            winner_model = None
            for init in cluster_init:
                init = [np.array(init[0]), np.array(init[1])]
                kproto = KPrototypes(
                    n_clusters=cluster_param,
                    cat_dissim=self.matching_dissim_weighted,
                    max_iter=5,
                    verbose=1,
                    gamma=1,
                    n_init=1,
                    init=init)
                kproto.fit(df_cluster.values,
                           categorical=self.categorical_index)

                x = pd.DataFrame([])
                x['cluster'] = kproto.labels_
                x['target'] = self.target
                target_by_cluster = x.groupby(['cluster'])['target']
                spread = target_by_cluster.max() - target_by_cluster.min()
                curr_ftnss = (spread.values).sum()

                print(ftnss)
                print(curr_ftnss < ftnss)
                # Keep the model only when it improves on the best so far.
                # (The model used to be stored unconditionally, so the
                # last initialisation always won.)
                if winner_model is None or curr_ftnss < ftnss:
                    ftnss = curr_ftnss
                    winner_model = kproto
        else:
            # NOTE(review): this branch reads self.cluster_param while the
            # other branches use the cluster_param argument -- confirm
            # which one is intended.
            kproto = KPrototypes(
                n_clusters=self.cluster_param,
                cat_dissim=self.matching_dissim_weighted,
                max_iter=5,
                verbose=1,
                gamma=1,
                n_init=1,
                init='Cao')
            kproto.fit(df_cluster.values, categorical=self.categorical_index)
            # The returned fitness is not used here; the call is kept in
            # case calculate_fitness has side effects.
            self.calculate_fitness(kproto.labels_)
            winner_model = kproto

        dump(winner_model, self.folder + 'best_model.joblib')
        self.df['cluster'] = winner_model.labels_
        return winner_model

    elif self.cluster_method == 'hdbscan':
        clusterer = hdb.HDBSCAN(min_cluster_size=cluster_param,
                                prediction_data=True)
        clusterer.fit(df_cluster)
        dump(clusterer, self.folder + 'best_model.joblib')
        self.df['cluster'] = clusterer.labels_
        return clusterer

    return None
def agruparDados(self, file):
    """Cluster the threat training data and attach the labels to ``file``.

    The CSV at the hard-coded path ``caminho`` is loaded (header row and
    first column skipped), clustered into 4 groups with k-prototypes, and
    the labels are stored under ``file['Clusters']``.  The centroids and
    training statistics are printed, then ``self.lerXML(file)`` is called.
    """
    style.use("ggplot")
    caminho = 'C:/Users/Teste/Desktop/10 semestre/tcc2/Arquivos de Logs/Arquivos de Logs/Ameaças/Novos/trainThreats.csv'

    X = np.genfromtxt(caminho, dtype=object, delimiter=',',
                      skip_header=1)[:, 1:]

    kproto = KPrototypes(n_clusters=4, init='Cao', verbose=2)
    clusters = kproto.fit_predict(
        X, categorical=[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14])
    file['Clusters'] = clusters

    # Print cluster centroids of the trained model.
    print(kproto.cluster_centroids_)
    # Print training statistics
    print(kproto.cost_)
    print(kproto.n_iter_)
    print(kproto.gamma)

    self.lerXML(file)
def get_labels(self, data, reprocess=False):
    """Return cluster labels for ``data`` using k-prototypes.

    When the preprocessed data has no categorical columns the work is
    delegated to ``self._fallback_algorithm`` on the original data.
    Otherwise a model is (re)fitted if none exists yet or ``reprocess`` is
    true; in that case ``self.model``, ``self.labels`` and ``self.centers``
    are updated.  With an existing model and ``reprocess`` false, only
    ``self.labels`` is refreshed from ``self.model.predict``.
    """
    data_original = data
    data = self._preprocessed_data(data)
    categorical_indices = get_categorical_indices(data)
    if not categorical_indices:
        return self._fallback_algorithm(data_original)
    if self.model is None or reprocess:
        data = encode_nominal_parameters(data)
        data = normalized_dataset(data, categorical_indices)
        initial_centers = self._get_initial_centers(
            data, categorical_indices)
        self.model = KPrototypes(n_clusters=self.cluster_number,
                                 max_iter=1000,
                                 init=initial_centers,
                                 n_init=10,
                                 gamma=self.categorical_weight,
                                 num_dissim=dissimilarity_python.euclidean,
                                 n_jobs=1)
        data = data.values
        self.model.fit(data, categorical=categorical_indices)
        self.labels = self.model.predict(data,
                                         categorical=categorical_indices)
        self.centers = self.model.cluster_centroids_
        # Merge the numerical centroids (element 0) and the categorical
        # centroids (element 1) into one array by inserting each
        # categorical column at its column index.
        # NOTE(review): presumably this restores the original column order
        # only if categorical_indices is ascending -- confirm.
        centers = self.centers[0]
        for index, cat_index in enumerate(categorical_indices):
            centers = np.insert(centers,
                                cat_index,
                                values=self.centers[1].transpose()[index],
                                axis=1)
        self.centers = centers
    else:
        # NOTE(review): unlike the fitting branch, the data here is neither
        # encoded/normalised nor converted with .values, and no
        # `categorical` argument is passed to predict -- verify this path
        # works with the installed kmodes version.
        self.labels = self.model.predict(data)
    return self.labels
def plot_costs(X, min_k, max_k):
    """Plot the k-prototypes cost for every k between min_k and max_k.

    Columns 1, 2 and 3 of ``X`` are treated as categorical.  The plot is
    saved to ``../image/kprototype_costs.png`` and then shown.

    Args:
        X: feature matrix
        min_k, max_k: smallest and largest k to plot the cost for
            (both inclusive)

    Returns:
        list of costs, one per k
    """
    k_values = range(min_k, max_k + 1)
    costs = []
    for k in k_values:
        kp = KPrototypes(n_clusters=k,
                         init='Cao',
                         n_init=22,
                         verbose=0,
                         random_state=4,
                         n_jobs=4)
        kp.fit_predict(X, categorical=[1, 2, 3])
        costs.append(kp.cost_)

    plt.plot(k_values, costs)
    plt.xlabel('k')
    plt.ylabel('costs')
    # Save before showing: once show() returns the figure may already be
    # closed, and saving afterwards typically writes an empty image.
    plt.savefig("../image/kprototype_costs.png")
    plt.show()
    return costs
def clusterization(data, clusters, method):
    """Cluster ``data`` into ``clusters`` groups with the chosen method.

    Args:
        data: samples to cluster.
        clusters: number of clusters.
        method: one of ``'kmeans'``, ``'agglomerative'``, ``'fuzzy'`` or
            ``'kprototypes'`` (column 8 is treated as categorical).

    Returns:
        The cluster label of every sample.

    Raises:
        ValueError: if ``method`` is not supported (previously a message
            was printed and an UnboundLocalError followed).
    """
    # Strings are compared with ==; `is` tests object identity and only
    # worked by accident of string interning.
    if method == 'kmeans':
        model = KMeans(n_clusters=clusters, init='random', algorithm='full')
        model.fit(data)
        return model.predict(data)

    if method == 'agglomerative':
        model = AgglomerativeClustering(linkage='ward', n_clusters=clusters)
        model.fit(data)
        return model.labels_

    if method == 'fuzzy':
        # The second element of the cmeans result is the membership matrix
        # `u`; each sample is assigned to its strongest cluster.
        cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(data.T,
                                                         clusters,
                                                         2,
                                                         error=0.005,
                                                         maxiter=10000,
                                                         init=None)
        return np.argmax(u, axis=0)

    if method == 'kprototypes':
        model = KPrototypes(n_clusters=clusters,
                            init='random',
                            gamma=0.1,
                            n_init=1)
        return model.fit_predict(data, categorical=[8])

    raise ValueError(
        "Unsupported method {!r}. The supported methods are: kmeans, "
        "agglomerative, fuzzy, kprototypes".format(method))
def create_elbowgraph(n, df, type="kmeans", categorical=None):
    """Draw an elbow graph for 1 .. n-1 clusters.

    Args:
        n: exclusive upper bound on the number of clusters.
        df: data to cluster.
        type: ``"kmeans"`` (plots the inertia) or ``"kproto"`` (plots the
            k-prototypes cost).
        categorical: categorical column indices for ``"kproto"``;
            defaults to ``[0]``.

    Raises:
        ValueError: if ``type`` is neither ``"kmeans"`` nor ``"kproto"``
            (previously this surfaced as an UnboundLocalError).
    """
    # A fresh list per call instead of one shared mutable default.
    if categorical is None:
        categorical = [0]

    cluster_counts = range(1, n)
    clusters = []
    if type == "kmeans":
        for i in cluster_counts:
            kmeans = KMeans(n_clusters=i, random_state=1).fit(df)
            clusters.append(kmeans.inertia_)
            print("Calculated kmeans with " + str(i) + " clusters")
    elif type == "kproto":
        for i in cluster_counts:
            kproto = KPrototypes(n_clusters=i, init='random',
                                 random_state=1).fit(df,
                                                     categorical=categorical)
            clusters.append(kproto.cost_)
            print("Calculated kproto with " + str(i) + " clusters")
    else:
        raise ValueError(
            "unknown type {!r}; expected 'kmeans' or 'kproto'".format(type))

    plt.plot(cluster_counts, clusters, 'go--')
    plt.title("Elbow graph")
    plt.xlabel("Number of cluster")
    plt.ylabel("within-cluster sum-of-squares (inertia)")
import numpy as np
from sklearn import datasets
from kmodes.kprototypes import KPrototypes

# Cluster the iris data set, using the target appended as the last column
# (index 4) as the only categorical feature.
iris = datasets.load_iris()
data = np.column_stack((iris['data'], iris['target']))

kp = KPrototypes(n_clusters=3, init='Huang', n_init=1, verbose=True)
kp.fit_predict(data, categorical=[4])

# Centroids first, then the label assigned to every sample.
print(kp.cluster_centroids_)
print(kp.labels_)
data=data, num_numerical=num_numerical_features, num_category=num_category_features, max_iters=10, mode=2) print("K_Means算法的Calinski-Harabaz Index值为:{}".format( metrics.calinski_harabasz_score(data, label_2))) label_3, center_numerical_3, center_category_3 = K_Prototypes( random_seed=2020, n=N, data=data, num_numerical=num_numerical_features, num_category=num_category_features, max_iters=10, mode=1) print("K_Modes算法的Calinski-Harabaz Index值为:{}".format( metrics.calinski_harabasz_score(data, label_3))) kp = KPrototypes(n_clusters=5, init='Huang', n_init=1, verbose=True, n_jobs=4, random_state=2020) KPrototypes_results = kp.fit_predict( data, categorical=list( range(num_numerical_features, num_numerical_features + num_category_features - 1))) print("K_Prototypes算法包的Calinski-Harabaz Index值为:{}".format( metrics.calinski_harabasz_score(data, KPrototypes_results)))
def cluster_clients(k=None, save_centroids=True, save_clusters=True):
    '''
    Runs k-prototypes clustering algorithm on preprocessed dataset
    :param k: Desired number of clusters; read from the config when None
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :return: A KPrototypes object that describes the best clustering of all the runs
    '''
    # Read the configuration; the context manager closes the file handle.
    with open(os.getcwd() + "/config.yml", 'r') as config_file:
        cfg = yaml.full_load(config_file)

    # Load preprocessed client data
    try:
        client_df = pd.read_csv(cfg['PATHS']['CLIENT_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['CLIENT_DATA'] + ". Running preprocessing of client data.")
        raw_df = load_raw_data(cfg)
        client_df = prepare_for_clustering(cfg, raw_df, save_df=False)
    excluded_feats = cfg['K-PROTOTYPES']['FEATS_TO_EXCLUDE']
    client_df.drop(excluded_feats, axis=1, inplace=True)  # Features we don't want to see in clustering
    client_feats_df = client_df.copy()
    client_ids = client_df.pop('CONTRACT_ACCOUNT').tolist()
    cat_feats = [f for f in cfg['DATA']['CATEGORICAL_FEATS'] if f not in excluded_feats]
    bool_feats = [f for f in cfg['DATA']['BOOLEAN_FEATS'] if f not in excluded_feats]
    ordinal_encoder = OrdinalEncoder()
    client_df[cat_feats] = ordinal_encoder.fit_transform(client_df[cat_feats])
    X = np.array(client_df)

    # Get list of categorical feature indices. Boolean feats are considered categorical for clustering
    cat_feat_idxs = [client_df.columns.get_loc(c) for c in cat_feats + bool_feats if c in client_df]
    numcl_feat_idxs = [i for i in range(len(client_df.columns)) if i not in cat_feat_idxs]

    # Normalize noncategorical features
    X_noncat = X[:, numcl_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X_noncat = std_scaler.transform(X_noncat)
    X[:, numcl_feat_idxs] = X_noncat

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k, verbose=1, n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'], init='Cao', num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    # Distance between two samples: numerical dissimilarity plus the
    # gamma-weighted categorical dissimilarity.
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[numcl_feat_idxs], axis=0),
                                np.expand_dims(x1[numcl_feat_idxs], axis=0)) + \
        k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                     np.expand_dims(x1[cat_feat_idxs], axis=0))
    # Enforce that cluster labels are integer range of [1, K].
    # NOTE(review): this is an in-place update, so if client_clusters is a
    # NumPy array, k_prototypes.labels (same object) becomes 1-based too --
    # confirm that this is intended.
    client_clusters += 1
    clusters_df = pd.DataFrame({'CONTRACT_ACCOUNT': client_ids, 'Cluster Membership': client_clusters})
    clusters_df = clusters_df.merge(client_feats_df, on='CONTRACT_ACCOUNT', how='left')
    # CONTRACT_ACCOUNT deliberately stays a regular column: the frame is
    # written with index=False below.  (The earlier set_index call discarded
    # its result and therefore had no effect.)

    # Get centroids of clusters
    cluster_centroids = np.empty((k_prototypes.cluster_centroids_[0].shape[0],
                                  k_prototypes.cluster_centroids_[0].shape[1] +
                                  k_prototypes.cluster_centroids_[1].shape[1]))
    cluster_centroids[:, numcl_feat_idxs] = k_prototypes.cluster_centroids_[0]  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = k_prototypes.cluster_centroids_[1]  # Categorical features

    # Scale noncategorical features of the centroids back to original range
    centroid_noncat_feats = cluster_centroids[:, numcl_feat_idxs]
    centroid_noncat_feats = std_scaler.inverse_transform(centroid_noncat_feats)
    cluster_centroids[:, numcl_feat_idxs] = centroid_noncat_feats

    # Create a DataFrame of cluster centroids
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(client_df.columns))
    for i, cat_feat in enumerate(cat_feats):
        # Map the ordinal codes back to the original category values.
        ordinal_dict = dict(enumerate(ordinal_encoder.categories_[i]))
        centroids_df[cat_feat] = centroids_df[cat_feat].map(ordinal_dict)
    centroids_df[bool_feats] = centroids_df[bool_feats].round()
    cluster_num_series = pd.Series(np.arange(1, cluster_centroids.shape[0] + 1))
    centroids_df.insert(0, 'Cluster', cluster_num_series)

    # Get fraction of clients in each cluster
    cluster_freqs = np.bincount(client_clusters) / float(client_clusters.shape[0])
    # Labels start at 1, so bin 0 is skipped.
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Save centroid features and cluster assignments to spreadsheet
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                            index_label=False, index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           index_label=False, index=False)
    return k_prototypes
#!/usr/bin/env python
import numpy as np
from kmodes.kprototypes import KPrototypes

# stocks with their market caps, sectors and countries
# The first CSV column holds the symbols; the remaining columns are the
# features, of which the first is converted to float.
syms = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('stocks.csv', dtype=object, delimiter=',')[:, 1:]
X[:, 0] = X[:, 0].astype(float)

# Feature columns 1 and 2 are treated as categorical.
kproto = KPrototypes(n_clusters=4, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=[1, 2])

# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)

# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

for s, c in zip(syms, clusters):
    print(f"Symbol: {s}, cluster:{c}")
def cao():
    """Run k-prototypes with 'Cao' initialisation on the enclosing ``data``.

    ``K``, ``data``, ``M`` and ``MN`` come from the enclosing scope; the
    columns ``M - MN`` to ``M - 1`` are treated as categorical.  The result
    is discarded.
    """
    model = KPrototypes(n_clusters=K, init='Cao', verbose=2)
    categorical_columns = list(range(M - MN, M))
    model.fit_predict(data, categorical=categorical_columns)
def huang():
    """Run k-prototypes with 'Huang' initialisation on the enclosing ``data``.

    ``K``, ``data``, ``M`` and ``MN`` come from the enclosing scope; the
    columns ``M - MN`` to ``M - 1`` are treated as categorical.  The result
    is discarded.
    """
    model = KPrototypes(n_clusters=K, init='Huang', n_init=1, verbose=2)
    categorical_columns = list(range(M - MN, M))
    model.fit_predict(data, categorical=categorical_columns)
#!/usr/bin/env python
import numpy as np
from kmodes.kprototypes import KPrototypes
import pandas as pd

# stocks with their market caps, sectors and countries
# The first CSV column holds the symbols; the remaining columns are the
# features, of which the first is converted to float.
syms = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('stocks.csv', dtype=object, delimiter=',')[:, 1:]
X[:, 0] = X[:, 0].astype(float)

kproto = KPrototypes(n_clusters=3, init='Cao', verbose=8)
# TC: define categorical variables here
clusters = kproto.fit_predict(X, categorical=[1, 2])

# Print the cluster centroids of the trained model followed by the
# training statistics, each under its own heading.
for heading, value in (
        ("\nCluster centroid", kproto.cluster_centroids_),
        ("\nCost", kproto.cost_),
        ("\nNumber of iterations", kproto.n_iter_),
):
    print(heading)
    print(value)

# One row per symbol with its assigned cluster.
print("\nClustering result")
df = pd.DataFrame(zip(syms, clusters))
df.columns = ["Symbol", "Cluster"]
print(df)
# Mean-shift clustering on the normalised numerical customer features.
ms.fit(df_cust_num_norm)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
# The column is read back as "labels" below, so it must be written under
# exactly that (lower-case) name.
df["labels"] = ms.predict(df_cust_num_norm)
cols = customer_related_num + ["labels"]
cc_mshift = df[cols].groupby("labels").mean()
sizes = df["labels"].value_counts()

######## Categorical ###########
### 1. Approach: K-Prototype with categorical and numerical Features
scaler = StandardScaler()
cust_norm = scaler.fit_transform(df[customer_related_num])
# Carry df's index over so that the join below aligns row by row even when
# df does not have a default 0..n-1 index.
df_num_norm = pd.DataFrame(cust_norm,
                           columns=customer_related_num,
                           index=df.index)
df_cust_norm = df_num_norm.join(df[customer_related_cat])

# create_elbowgraph(10, df_cust_norm, "kproto", [4,5,6,7,8] )
# NOTE(review): the categorical positions are hard-coded; presumably they
# match the columns contributed by customer_related_cat -- confirm.
kproto = KPrototypes(n_clusters=3, init='random', random_state=1)
model = kproto.fit(df_cust_norm, categorical=[4, 5, 6, 7, 8, 9])

# Inverse Normalization for Interpretation
cc_kproto_num = pd.DataFrame(
    scaler.inverse_transform(X=model.cluster_centroids_[0]))
cc_kproto = pd.concat(
    [cc_kproto_num, pd.DataFrame(model.cluster_centroids_[1])], axis=1)
cc_kproto.columns = customer_related

###### 2. Approach: Categorical Kmodes ########
kmodes = KModes(n_clusters=4)
temp_kmodes = kmodes.fit_predict(df[customer_related_cat])
kmcc = pd.DataFrame(kmodes.cluster_centroids_, columns=customer_related_cat)
df["cat_cluster"] = temp_kmodes
df_encode[col] = LabelEncoder().fit_transform(df_encode[col]) #Menggabungkan dataframe df_model = df_encode.merge(df_standar, left_index=True, right_index=True, how='left') from kmodes.kprototypes import KPrototypes import matplotlib.pyplot as plt import seaborn as sns #Melakukan Iterasi untuk mendapatkan nilai Cost cost = {} for k in range(2, 10): kproto = KPrototypes(n_clusters=k, random_state=75) kproto.fit_predict(df_model, categorical=[0, 1, 2]) cost[k] = kproto.cost_ #Visualisasi Elbow Plot sns.pointplot(x=list(cost.keys()), y=list(cost.values())) plt.show() #Menyimpan model dengan jumlah cluster 5 berdasarkan Elbow Plot import pickle kproto = KPrototypes(n_clusters=5, random_state=75) kproto = kproto.fit(df_model, categorical=[0, 1, 2]) pickle.dump(kproto, open('best_cluster.pkl', 'wb')) #Menentukan segmen tiap pelanggan