def find_optimalCluster(self):
    """Show an elbow plot of K-Prototypes cost to help choose the cluster count.

    One model is fitted on ``self.df_model`` for every candidate ``k`` from
    2 to 9 (columns 0, 1 and 2 are passed as categorical), and the final cost
    of each fit is plotted against ``k``.
    """
    # Collect the clustering cost for each candidate number of clusters.
    cost = {}
    for n_clusters in range(2, 10):
        model = KPrototypes(n_clusters=n_clusters, random_state=75)
        model.fit_predict(self.df_model, categorical=[0, 1, 2])
        cost[n_clusters] = model.cost_

    # Elbow plot: candidate k on the x axis, cost on the y axis.
    candidate_ks = list(cost.keys())
    costs = list(cost.values())
    sns.pointplot(x=candidate_ks, y=costs)
    plt.show()
def exec_kprototypes(df, choices_obj):
    """Interactively cluster ``df`` with K-Prototypes and plot the result.

    Each header listed in ``choices_obj['numerical']`` is whitened into a new
    ``<header>_scaled`` column of ``df``; the headers in
    ``choices_obj['categorical']`` are used as strings. The number of clusters
    is read from standard input, and the labels are written to
    ``df['cluster_labels']``. When at least two numerical headers exist, the
    first two scaled columns are shown as a scatter plot.
    """
    print("Whitening data...", end='', flush=True)
    nums_scaled = []
    for header in choices_obj['numerical']:
        scaled_name = header + "_scaled"
        df[scaled_name] = whiten(df[header])
        nums_scaled.append(scaled_name)
    print("Done.")

    cats_not_scaled = list(choices_obj['categorical'])

    # Numerical columns come first, categorical columns last.
    numeric_part = df[nums_scaled].astype(float)
    categorical_part = df[cats_not_scaled].astype(str)
    X = pd.concat([numeric_part, categorical_part], axis=1)

    k = int(input("Number of clusters:\n > "))
    kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)

    # The categorical columns are the trailing block of X.
    total_columns = len(X.columns)
    first_categorical = total_columns - len(cats_not_scaled)
    categorical_positions = list(range(first_categorical, total_columns))
    df['cluster_labels'] = kproto.fit_predict(
        X.values, categorical=categorical_positions)

    if len(nums_scaled) < 2:
        return

    # Plot clusters
    print("Only showing 2 dimensions of data (picking first two headers)")
    sns.scatterplot(x=nums_scaled[0],
                    y=nums_scaled[1],
                    hue='cluster_labels',
                    data=df)
    plt.show()
def kprototypesCluster(features: np.ndarray, catCols: list, nClust: int):
    """Fit a K-Prototypes model and return the fitted estimator.

    :param features: Feature matrix passed unchanged to ``fit_predict``.
        No dtype conversion happens here, so the caller is responsible for
        making continuous columns numeric beforehand.
    :param catCols: Column indices of ``features`` to treat as categorical.
    :param nClust: Number of clusters.
    :return: The fitted ``KPrototypes`` model (the labels themselves are not
        returned by this function).
    """
    model = KPrototypes(n_clusters=nClust, verbose=2)
    # Only the fitted model is returned, so the predicted labels are not bound.
    model.fit_predict(features, categorical=catCols)
    return model
def kproto(self, K=20, N=int(1e5), MN=4, T=10, type='cao', save=True):
    """Cluster this object's data with K-Prototypes.

    :param K: Number of clusters.
    :param N: Unused by this method; kept for interface compatibility.
    :param MN: Unused by this method; kept for interface compatibility.
    :param T: Unused by this method; kept for interface compatibility.
    :param type: Initialisation scheme, either ``'huang'`` or ``'cao'``.
    :param save: When true, the fitted model is stored through
        ``self.save(model, 'Clustering_kproto_model')``.
    :return: Tuple ``(centroids_0, centroids_1, labels)`` as NumPy arrays,
        where ``centroids_0`` / ``centroids_1`` are the first two entries of
        ``model.cluster_centroids_``.
    :raises ValueError: If ``type`` is neither ``'huang'`` nor ``'cao'``
        (previously this surfaced as an UnboundLocalError on ``model``).
    """
    # Constructor options for each supported initialisation scheme.
    init_options = {
        'huang': dict(init='Huang', n_init=1, verbose=1),
        'cao': dict(init='Cao', verbose=2, max_iter=10000),
    }
    if type not in init_options:
        raise ValueError(
            "type must be 'huang' or 'cao', got {!r}".format(type))

    data = self.to_numpy()
    model = KPrototypes(n_clusters=K, **init_options[type])

    # Categorical column indices depend on which data layout is in use.
    if self.cl_type == 'prop':
        categorical = [0, 3, 6, 8]
    else:
        categorical = [0, 2, 3, 9] + list(range(11, 36))
    clusters = model.fit_predict(data, categorical=categorical)

    if save:
        self.save(model, 'Clustering_kproto_model')
    return (np.array(model.cluster_centroids_[0]),
            np.array(model.cluster_centroids_[1]),
            np.array(clusters))
def predict(self):
    """Cluster the pickled dataset into five groups and persist the results.

    Loads ``self.dataset`` and ``self.label`` from their pickle files, fits
    K-Prototypes (column 1 categorical), joins the first three columns of the
    cleaned CSV (header row skipped) with the dataset and labels into
    ``self.result``, then writes the result pickle, the fitted model
    (``kproto_res``) and a tab-separated text dump, and prints the
    assignment of every label.
    """
    # NOTE(review): pickle.load executes arbitrary code; only use trusted files.
    with open(self.data_processed, 'rb') as dataset_file:
        self.dataset = pickle.load(dataset_file)
    with open(self.label_file, 'rb') as label_file:
        self.label = pickle.load(label_file)

    kproto = KPrototypes(n_clusters=5, init='Cao', verbose=2)
    clusters = kproto.fit_predict(self.dataset, categorical=[1])

    cleaned = np.loadtxt(fname=self.data_cleaned, dtype=object, delimiter=',')
    room_identity = cleaned[1:, :3]
    self.result = np.column_stack((room_identity, self.dataset, clusters))

    # Centroids followed by the training statistics.
    print(kproto.cluster_centroids_)
    print(kproto.cost_)
    print(kproto.n_iter_)

    with open(self.result_binary, 'wb') as result_file:
        pickle.dump(self.result, result_file)
    with open('kproto_res', 'wb') as model_file:
        pickle.dump(kproto, model_file)
    with open(self.cluster_result, 'w') as text_file:
        for row in self.result.tolist():
            fields = [str(value) for value in row]
            text_file.write("\t".join(fields) + '\n')

    for room, cluster_id in zip(self.label, clusters):
        print("Room identity: {}, cluster:{}".format(room, cluster_id))
def kprotoypes_cluster(df, n_clusters, category, hover_text):
    """Cluster ``df`` (minus the hover-text column) and return a scatter plot.

    :param df: Input frame, including the ``hover_text`` column.
    :param n_clusters: Number of clusters.
    :param category: Categorical column indices handed to ``fit_predict``.
    :param hover_text: Name of the column excluded from clustering.
    :return: Result of ``scat2d`` when exactly two feature columns remain,
        otherwise the result of ``scat3d``.
    """
    feature_mask = df.columns != hover_text
    datadf = df.loc[:, feature_mask]

    model = KPrototypes(n_clusters=n_clusters, init='Cao', verbose=2)
    clusters = model.fit_predict(datadf, categorical=category)

    data_array = np.array(datadf.to_numpy().tolist())
    plotter = scat2d if len(datadf.columns) == 2 else scat3d
    return plotter(data_array, clusters, hover_text, df)
def KPrototypes_cluster(input_data, k_clusters):
    """Standardise the numeric columns of ``input_data`` and cluster it.

    Note that the numeric columns of the caller's frame are overwritten in
    place with their standardised values.

    :param input_data: pandas DataFrame; columns 0-7 are treated as
        categorical by the clustering step.
    :param k_clusters: Number of clusters.
    :return: Array of cluster labels from ``fit_predict``.
    """
    from kmodes.kprototypes import KPrototypes

    # Standardise every float/integer column (selection computed once).
    numeric_columns = input_data.select_dtypes(
        include=['float', 'integer']).columns
    scaler = preprocessing.StandardScaler()
    input_data[numeric_columns] = scaler.fit_transform(
        input_data[numeric_columns])

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and new pandas.
    matrix = input_data.values

    kproto = KPrototypes(n_clusters=k_clusters, init='Cao', verbose=2)
    clus_kmeans_fit = kproto.fit_predict(matrix,
                                         categorical=[0, 1, 2, 3, 4, 5, 6, 7])
    return clus_kmeans_fit
def plot_costs(X, min_k, max_k):
    """Plot the K-Prototypes cost for every k between min_k and max_k.

    Args:
        - X - feature matrix (columns 1, 2 and 3 are treated as categorical)
        - min_k, max_k - smallest and largest k to fit, both inclusive

    return: list of costs, one per k in increasing order of k
    """
    k_values = range(min_k, max_k + 1)
    costs = []
    for k in k_values:
        kp = KPrototypes(n_clusters=k,
                         init='Cao',
                         n_init=22,
                         verbose=0,
                         random_state=4,
                         n_jobs=4)
        kp.fit_predict(X, categorical=[1, 2, 3])
        costs.append(kp.cost_)

    plt.plot(k_values, costs)
    plt.xlabel('k')
    plt.ylabel('costs')
    # Save before showing: once show() returns the figure may already be
    # closed, and savefig would then write an empty image.
    plt.savefig("../image/kprototype_costs.png")
    plt.show()
    return costs
def cluster(summ, agg_classes=None):
    """
    Clusters the edge ranks of a summary and accepts the highest-ranked cluster.

    Edges whose rank (rounded to 2 decimals) is below 0.5 are ignored. Without
    ``agg_classes`` the ranks are split into two groups with KMeans; with
    ``agg_classes`` the class of the source and target node is added as a
    categorical feature and K-Prototypes (3 clusters) is used instead. Every
    policy in the cluster with the highest mean rank is marked as accepted
    via ``summ.mark_cluster_accepted``.

    :param summ: Summary object providing ``get_flowedges``,
        ``get_edge_rank`` and ``mark_cluster_accepted``.
    :param agg_classes: Optional mapping from node to aggregate class.
    :return: None
    """
    prop = {}  # policy -> row index into ranks
    ranks = []
    for flow, edge in summ.get_flowedges():
        rank = round(summ.get_edge_rank(flow, edge), 2)
        if rank < 0.5:
            continue
        policy = nopticon.ReachabilityPolicy({
            'flow': flow,
            'source': edge[0],
            'target': edge[1]
        })
        prop[policy] = len(ranks)
        if agg_classes is not None:
            ranks.append([rank, agg_classes[edge[0]], agg_classes[edge[1]]])
        else:
            ranks.append([rank])

    # Nothing passed the rank cut-off: there is nothing to cluster or accept.
    if not ranks:
        return

    if agg_classes is not None:
        kproto = KPrototypes(n_clusters=3, init='Huang')
        # np.array builds the same 2-D array np.matrix(ranks).A did, without
        # going through the deprecated matrix class.
        clust = kproto.fit_predict(np.array(ranks), categorical=[1, 2])
    else:
        # n_jobs was removed from KMeans in scikit-learn 1.0 and never
        # affected the result, so it is no longer passed.
        agg = KMeans(n_clusters=2)
        clust = agg.fit(ranks).labels_
    assert len(clust) == len(ranks)

    # Mean rank of each cluster; the cluster with the highest mean wins.
    means = {}
    for k in set(clust):
        kranks = [ranks[idx][0] for idx in prop.values() if clust[idx] == k]
        means[k] = sum(kranks) / len(kranks)
    high = max(means, key=means.get)

    for p, idx in prop.items():
        if clust[idx] == high:
            summ.mark_cluster_accepted(p.flow(), p.edge())
    return
def kprototypes_compute_metrics_for_every_cluster_number(
        clusters_range_lower_bound,
        clusters_range_upper_bound,
        dataset,
        distance_algorithm,
        init_Cao_or_Huang_for_kprototypes,
        list_categorical_features_indeces_for_kprototypes,
        print_optimum_metrics=True):
    """Fit K-Prototypes for a range of cluster counts and collect metrics.

    :param clusters_range_lower_bound: First cluster count to try (inclusive).
    :param clusters_range_upper_bound: End of the range (exclusive).
    :param dataset: Data passed to ``fit_predict`` and ``silhouette_score``.
    :param distance_algorithm: Metric handed to ``silhouette_score``.
    :param init_Cao_or_Huang_for_kprototypes: Initialisation scheme name.
    :param list_categorical_features_indeces_for_kprototypes: Indices of the
        categorical columns of ``dataset``.
    :param print_optimum_metrics: Print one summary line per cluster count
        when this is exactly ``True``.
    :return: List of dicts with keys ``'clusters'``, ``'silhouette'``,
        ``'error'`` (the model cost) and ``'num_jobs'`` (which holds the
        model's ``n_iter_`` value despite its name).
    """
    kprototypes_list_metrics = []
    for num_of_clusters in range(clusters_range_lower_bound,
                                 clusters_range_upper_bound):
        kprototypes = KPrototypes(
            n_clusters=int(num_of_clusters),
            init=str(init_Cao_or_Huang_for_kprototypes),
            n_init=50,
            verbose=0)
        predictions = kprototypes.fit_predict(
            dataset,
            categorical=list_categorical_features_indeces_for_kprototypes)
        error_metric = kprototypes.cost_
        num_jobs = kprototypes.n_iter_
        # metric must be passed by keyword: it is keyword-only in current
        # scikit-learn releases.
        silhouette = silhouette_score(dataset,
                                      predictions,
                                      metric=distance_algorithm)
        kprototypes_list_metrics.append({
            'clusters': num_of_clusters,
            'silhouette': silhouette,
            'error': error_metric,
            'num_jobs': num_jobs
        })
        if print_optimum_metrics is True:
            print(
                "For n_clusters = {}, silhouette score is {}, cluster_errors is {}, "
                "n_jobs {})".format(num_of_clusters, silhouette, error_metric,
                                    num_jobs))
    return kprototypes_list_metrics
def agruparDados(
        self,
        file,
        caminho='C:/Users/Teste/Desktop/10 semestre/tcc2/Arquivos de Logs/Arquivos de Logs/Ameaças/Novos/trainThreats.csv'
):
    """Cluster the threat log CSV into four groups and tag ``file`` with them.

    :param file: Object supporting item assignment; receives the labels under
        the ``'Clusters'`` key and is then passed to ``self.lerXML``.
    :param caminho: Path of the CSV to cluster. Defaults to the location that
        used to be hard-coded so existing callers are unaffected.
    """
    style.use("ggplot")

    # Skip the header row and the first column; everything else is a feature.
    X = np.genfromtxt(caminho, dtype=object, delimiter=',',
                      skip_header=1)[:, 1:]

    kproto = KPrototypes(n_clusters=4, init='Cao', verbose=2)
    clusters = kproto.fit_predict(
        X, categorical=[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14])
    file['Clusters'] = clusters

    # Print cluster centroids of the trained model.
    print(kproto.cluster_centroids_)
    # Print training statistics
    print(kproto.cost_)
    print(kproto.n_iter_)
    print(kproto.gamma)

    self.lerXML(file)
# Feature frame: columns 1-4 of df, renamed to a, b, c, d.
X = df.iloc[:, 1:5]
X.columns = ['a', 'b', 'c', 'd']
X.head()

# Min-max scale the three columns b, c and d.
min_max_scaler = preprocessing.MinMaxScaler()
bcd = X.iloc[:, 1:4]
x_scaled = min_max_scaler.fit_transform(bcd)
X_scaled = pd.DataFrame(x_scaled, columns=bcd.columns)

# NOTE(review): column 'a' is read from df, not from the renamed X above --
# confirm df really has a column called 'a'.
X = pd.concat([df['a'], X_scaled], axis=1)
X_matrix = X.values

# Cost of K-Prototypes for 1 to 4 clusters (column 0 is categorical).
cost = []
for num_clusters in range(1, 5):
    kproto = KPrototypes(n_clusters=num_clusters, init='Cao')
    kproto.fit_predict(X_matrix, categorical=[0])
    cost.append(kproto.cost_)

plt.plot(cost)
pd.DataFrame(cost)

# Final model with a single cluster.
kproto = KPrototypes(n_clusters=1, init='Cao')
clusters = kproto.fit_predict(X_matrix, categorical=[0])

print('====== Centriods ======')
kproto.cluster_centroids_
print()
print('====== Cost ======')
kproto.cost_

# Second centroid block first, then the first block, side by side.
centroids = pd.concat(
    [
        pd.DataFrame(kproto.cluster_centroids_[1]),
        pd.DataFrame(kproto.cluster_centroids_[0]),
    ],
    axis=1,
)
centroids
# Names of all object-typed columns.
s = (nc_data_2015.dtypes == 'object')
object_cols = list(s[s].index)

# Append the boolean features:
object_cols.extend([
    'search_conducted', 'contraband_found', 'is_arrested',
    'drugs_related_stop'
])
print(object_cols)

# Create a data frame copy to label encode all categorical features:
nc_encoded = nc_data_2015.copy()
label_encoder = LabelEncoder()
for col in object_cols:
    # The encoder is refitted on every column, so codes are per column.
    nc_encoded[col] = label_encoder.fit_transform(nc_encoded[col])

# Create a subset using only age, gender, and race and use it to perform
# KPrototypes clustering. The explicit copy makes the later column assignment
# operate on an independent frame instead of a slice of nc_encoded, which
# avoids pandas' SettingWithCopyWarning.
X = nc_encoded.iloc[:, [1, 2, 3]].copy()
kp = KPrototypes(n_clusters=3, init="Cao", n_init=1, verbose=1)
cluster_labels = kp.fit_predict(X, categorical=[0, 2])
X['Cluster'] = cluster_labels

# Create a count plot showing clusters related to feature values:
plt.figure(figsize=(16, 10))
sns.countplot(x='Cluster', hue='driver_race_raw', data=X)
plt.legend(title='Driver Race',
           labels=[
               'Asian', 'Black Hispanic', 'Black', 'Other',
               'Unknown Hispanic', 'Unknown Non-Hispanic', 'White Hispanic',
               'White'
           ])
plt.xlabel("Cluster")
plt.ylabel("Count")
plt.title("Driver Race Count Per Cluster")
plt.show()

# Create a count plot showing clusters related to feature values:
plt.figure(figsize=(16, 10))
sns.countplot(x='Cluster', hue='driver_gender', data=X, palette='mako')
plt.legend(title='Driver Gender', labels=['Female', 'Male'])
import pandas as pd
from kmodes.kprototypes import KPrototypes
# Valid-Python spelling of the notebook magic "%matplotlib inline".
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns

# Standardise the numeric columns (population standard deviation, ddof=0,
# which is what np.std computed; using the Series method avoids relying on
# numpy, which this snippet never imports).
columns_to_normalize = ['RFM_Score', 'Age']
rfm_encoded[columns_to_normalize] = rfm_encoded[columns_to_normalize].apply(
    lambda x: (x - x.mean()) / x.std(ddof=0))

# DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
matrix = rfm_encoded.values

# Running K-Prototype clustering (column 2 is categorical)
kproto = KPrototypes(n_clusters=3, init='Cao')
clusters = kproto.fit_predict(matrix, categorical=[2])

print(kproto.cluster_centroids_)
print(kproto.cost_)
rfm_encoded['cluster_id'] = clusters

# add cluster_id column to rfm data frame for better understanding
rfm_table['cluster_number'] = rfm_encoded['cluster_id'].values

# Checking cluster count
cluster_count = pd.DataFrame(rfm_encoded['cluster_id'].value_counts())
print(cluster_count)
# Select the counts by position: the column is named 'cluster_id' on older
# pandas but 'count' from pandas 2.0 on.
sns.barplot(x=cluster_count.index, y=cluster_count.iloc[:, 0])
# Categorical columns by name. Earlier, wider selection kept for reference:
#colunas_cat_ = ["grau","NATUREZA","Porte","Estratificacao","RM_OU_RIDE","Atuacao_Vara"]
colunas_cat_ = ["Porte", "Estratificacao", "RM_OU_RIDE", "Atuacao_Vara"]

# The same categorical columns by position. Earlier selection:
#colunas_cat = [0, 1, 3, 4, 5, 6]
colunas_cat = [1, 2, 3, 4]

# Keep only the columns of interest and make the duration numeric.
dados = dados.filter(items=colunas)
dados['duracao_dias'] = pd.to_numeric(dados['duracao_dias'])
print(dados.head(10))
print(dados.dtypes)

# K-Prototypes handles the mixed numeric/categorical data. The purely
# categorical alternative was:
#km = KModes(n_clusters=6, init='Huang', n_init=10, verbose=1)
km = KPrototypes(n_clusters=6, init='Huang', n_init=2, verbose=1)

print(dados.shape[1])
dados_temp = dados[colunas_cat_]
print(dados_temp)

clusters = km.fit_predict(dados, categorical=colunas_cat)
#clusters = km.fit_predict(dados)

# Print the cluster centroids
print('Centroídes')
print(km.cluster_centroids_)
print('Clusters')
print(clusters)
#!/usr/bin/env python
import numpy as np
from kmodes.kprototypes import KPrototypes

# stocks with their market caps, sectors and countries
# Column 0 holds the ticker symbols; the remaining columns are the features.
syms = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('stocks.csv', dtype=object, delimiter=',')[:, 1:]
# The first feature column is numeric; columns 1 and 2 stay categorical.
X[:, 0] = X[:, 0].astype(float)

kproto = KPrototypes(n_clusters=4, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=[1, 2])

# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)

# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

# One line per symbol with its assigned cluster.
for s, c in zip(syms, clusters):
    print(f"Symbol: {s}, cluster:{c}")
#!/usr/bin/env python
import numpy as np
from kmodes.kprototypes import KPrototypes
import pandas as pd

# stocks with their market caps, sectors and countries
# Column 0 holds the ticker symbols; the remaining columns are the features.
syms = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('stocks.csv', dtype=object, delimiter=',')[:, 1:]
X[:, 0] = X[:, 0].astype(float)

kproto = KPrototypes(n_clusters=3, init='Cao', verbose=8)
# The categorical variables are declared here (feature columns 1 and 2).
clusters = kproto.fit_predict(X, categorical=[1, 2])

# Print cluster centroids of the trained model.
print("\nCluster centroid")
print(kproto.cluster_centroids_)

# Print training statistics
print("\nCost")
print(kproto.cost_)
print("\nNumber of iterations")
print(kproto.n_iter_)

"""for s, c in zip(syms, clusters):
    print("Symbol: {}, cluster:{}".format(s, c))"""

# Tabular view of the symbol -> cluster assignment.
print("\nClustering result")
df = pd.DataFrame(zip(syms, clusters))
df.columns = ["Symbol", "Cluster"]
print(df)
import pandas as pd
import numpy as np
from kmodes.kprototypes import KPrototypes
import sys

# Load the semicolon-separated data set.
df = pd.read_csv('dataset.txt', sep=";")
df_array = df.values
# Column 2 is the only numeric feature; 0, 1, 3 and 4 are categorical.
df_array[:, 2] = df_array[:, 2].astype(float)

kproto = KPrototypes(n_clusters=3, verbose=2, max_iter=20)
clusters = kproto.fit_predict(df_array, categorical=[0, 1, 3, 4])

# Attach the labels to the frame (kept as a plain list of label values).
cluster_dict = list(clusters)
df['cluster'] = cluster_dict

# One sub-frame per cluster.
c0 = df.loc[df['cluster'] == 0]
c1 = df.loc[df['cluster'] == 1]
c2 = df.loc[df['cluster'] == 2]
def cluster_clients(k=None,
                    save_centroids=True,
                    save_clusters=True,
                    explain_centroids=True):
    '''
    Runs k-prototype clustering algorithm on preprocessed dataset
    :param k: Desired number of clusters
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :param explain_centroids: Boolean indicating whether to compute LIME explanations for cluster centroids
    :return: A KPrototypes object that describes the best clustering of all the runs,
             or None when a required input file is missing
    '''
    # Files are opened through context managers so the handles are closed
    # as soon as their content has been parsed.
    with open(os.getcwd() + "/config.yml", 'r') as cfg_file:
        cfg = yaml.full_load(cfg_file)

    # Load preprocessed client data
    try:
        df = pd.read_csv(cfg['PATHS']['PROCESSED_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['PROCESSED_DATA'] +
              ". Run preprocessing script before running this script.")
        return
    client_ids = df.pop('ClientID').tolist()
    if cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp':
        dates = df.pop('Date').tolist()
    df.drop('GroundTruth', axis=1, inplace=True)
    # NOTE(review): if every remaining column is integer-typed, X becomes an
    # integer array and the standardised values written into it below would
    # be truncated -- confirm the frame always contains a float column.
    X = np.array(df)

    # Load feature info
    try:
        with open(cfg['PATHS']['DATA_INFO'], 'r') as info_file:
            data_info = yaml.full_load(info_file)
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['DATA_INFO'] +
              ". Run preprocessing script before running this script.")
        return

    # Get list of categorical feature indices
    noncat_feat_idxs = [
        df.columns.get_loc(c) for c in data_info['NON_CAT_FEATURES']
        if c in df
    ]
    noncat_idx_set = set(noncat_feat_idxs)  # O(1) membership tests
    cat_feat_idxs = [
        i for i in range(len(df.columns)) if i not in noncat_idx_set
    ]

    # Normalize noncategorical features
    X_noncat = X[:, noncat_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X_noncat = std_scaler.transform(X_noncat)
    X[:, noncat_feat_idxs] = X_noncat

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k,
                               verbose=1,
                               n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'],
                               init='Cao',
                               num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    # Distance between two samples: numeric dissimilarity plus the
    # gamma-weighted categorical dissimilarity.
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[noncat_feat_idxs], axis=0),
                                np.expand_dims(x1[noncat_feat_idxs], axis=0)) + \
        k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                     np.expand_dims(x1[cat_feat_idxs], axis=0))
    # In-place shift, so k_prototypes.labels (the same array) moves with it.
    client_clusters += 1  # Enforce that cluster labels are integer range of [1, K]

    # The frame keeps its default index: the previous set_index(...) calls
    # discarded their result, and the CSV below is written with index=False,
    # so the identifier columns have to remain ordinary columns.
    if cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp':
        clusters_df = pd.DataFrame({
            'ClientID': client_ids,
            'Date': dates,
            'Cluster Membership': client_clusters
        })
    else:
        clusters_df = pd.DataFrame({
            'ClientID': client_ids,
            'Cluster Membership': client_clusters
        })

    # Get centroids of clusters
    num_centroids, cat_centroids = (k_prototypes.cluster_centroids_[0],
                                    k_prototypes.cluster_centroids_[1])
    cluster_centroids = np.zeros(
        (num_centroids.shape[0],
         num_centroids.shape[1] + cat_centroids.shape[1]))
    cluster_centroids[:, noncat_feat_idxs] = num_centroids  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = cat_centroids  # Categorical features

    # Scale noncategorical features of the centroids back to original range
    centroid_noncat_feats = cluster_centroids[:, noncat_feat_idxs]
    centroid_noncat_feats = std_scaler.inverse_transform(centroid_noncat_feats)
    cluster_centroids[:, noncat_feat_idxs] = centroid_noncat_feats

    # Create a DataFrame of cluster centroids
    cluster_centroids = np.rint(
        cluster_centroids)  # Round centroids to nearest int
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(df.columns))
    # Replace ordinal codes of single-valued categorical features with the
    # original category values.
    for i, idx in enumerate(data_info['SV_CAT_FEATURE_IDXS']):
        ordinal_encoded_vals = cluster_centroids[:, idx].astype(int)
        original_vals = [
            data_info['SV_CAT_VALUES'][idx][v] for v in ordinal_encoded_vals
        ]
        centroids_df[data_info['SV_CAT_FEATURES'][i]] = original_vals
    cluster_num_series = pd.Series(np.arange(1, cluster_centroids.shape[0] + 1))
    centroids_df.insert(0, 'Cluster', cluster_num_series)

    # Get fraction of clients in each cluster
    cluster_freqs = np.bincount(client_clusters) / float(
        client_clusters.shape[0])
    # Labels start at 1, so bin 0 is always empty and is skipped.
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Load objects necessary for prediction and explanations
    try:
        scaler_ct = load(cfg['PATHS']['SCALER_COL_TRANSFORMER'])
        ohe_ct_sv = load(cfg['PATHS']['OHE_COL_TRANSFORMER_SV'])
        # NOTE(review): dill.load executes arbitrary code; only load
        # explainer files produced by this project.
        with open(cfg['PATHS']['LIME_EXPLAINER'], 'rb') as explainer_file:
            explainer = dill.load(explainer_file)
        model = load_model(cfg['PATHS']['MODEL_TO_LOAD'], compile=False)
    except FileNotFoundError as not_found_err:
        print(
            'File "' + not_found_err.filename +
            '" was not found. Ensure you have trained a model and run LIME before running this script.'
        )
        return

    # Add model's prediction of centroids (classes and prediction probabilities) to the DataFrame
    predicted_classes = []
    prediction_probs = []
    print("Obtaining model's predictions for cluster centroids.")
    for i in tqdm(range(len(cluster_centroids))):
        x = np.expand_dims(cluster_centroids[i], axis=0)
        y = np.squeeze(predict_instance(x, model, ohe_ct_sv, scaler_ct).T,
                       axis=1)  # Predict centroid
        prediction = 1 if y[1] >= cfg['PREDICTION'][
            'THRESHOLD'] else 0  # Model's classification
        predicted_class = cfg['PREDICTION']['CLASS_NAMES'][prediction]
        predicted_classes.append(predicted_class)
        prediction_probs.append(y[1] * 100)  # Include as a percentage
    centroids_df.insert(centroids_df.shape[1],
                        'At risk of chronic homelessness',
                        pd.Series(predicted_classes))
    centroids_df.insert(centroids_df.shape[1],
                        'Probability of chronic homelessness [%]',
                        pd.Series(prediction_probs))

    # Predict and explain the cluster centroids
    if explain_centroids:
        model_def = cfg['TRAIN']['MODEL_DEF'].upper()
        NUM_SAMPLES = cfg['LIME'][model_def]['NUM_SAMPLES']
        NUM_FEATURES = cfg['LIME'][model_def]['NUM_FEATURES']
        exp_rows = []
        explanations = []
        print('Creating explanations for cluster centroids.')
        for i in tqdm(range(cluster_centroids.shape[0])):
            row = []
            exp = predict_and_explain(cluster_centroids[i], model, explainer,
                                      ohe_ct_sv, scaler_ct, NUM_FEATURES,
                                      NUM_SAMPLES)
            explanations.append(exp)
            exp_tuples = exp.as_list()
            for exp_tuple in exp_tuples:
                row.extend(list(exp_tuple))
            if len(exp_tuples) < NUM_FEATURES:
                row.extend([''] * (2 * (NUM_FEATURES - len(exp_tuples)))
                           )  # Fill with empty space if explanation too small
            exp_rows.append(row)
        exp_col_names = []
        for i in range(NUM_FEATURES):
            exp_col_names.extend(
                ['Explanation ' + str(i + 1), 'Weight ' + str(i + 1)])
        exp_df = pd.DataFrame(exp_rows, columns=exp_col_names)
        centroids_df = pd.concat(
            [centroids_df, exp_df], axis=1,
            sort=False)  # Concatenate client features and explanations

        # Visualize clusters' LIME explanations
        predictions = centroids_df[[
            'At risk of chronic homelessness',
            'Probability of chronic homelessness [%]'
        ]].to_numpy()
        visualize_cluster_explanations(
            explanations, predictions, cluster_freqs,
            'Explanations for k-prototypes clusters',
            cfg['PATHS']['IMAGES'] + 'centroid_explanations_')

    # Save centroid features and explanations to spreadsheet
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] +
                            datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                            index_label=False,
                            index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] +
                           datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           index_label=False,
                           index=False)
    return k_prototypes
# load the data
md_df = pd.read_csv('markdown_group.csv')
no_md_df = pd.read_csv('no_markdown_group.csv')

# clear the first two columns in both groups
md_df = md_df.drop(columns=['Unnamed: 0', 'nb_id'])
no_md_df = no_md_df.drop(columns=['Unnamed: 0', 'nb_id'])

# perform k-prototypes clustering on markdown cell group:
# one fit per candidate k, recording the final cost of each
costs_md = []
K = range(1, 11)
for k in K:
    print("clustering with " + str(k) + " clusters")
    kproto = KPrototypes(n_clusters=k, init='Cao', n_jobs=4, verbose=0)
    clusters = kproto.fit_predict(
        md_df, categorical=[0, 1, 2, 3, 9, 10, 11, 13, 15, 19, 20, 22, 24])
    costs_md.append(kproto.cost_)

# save the costs plot
plt.plot(K, costs_md, 'bx-')
plt.xlabel('k')
plt.ylabel('cost')
plt.title('Cost Graph for Optimal k for Markdown Cell Group')
plt.savefig('figures/10-markdown-kproto.png')

# perform k-prototypes clustering on no markdown cell group
costs_no_md = []
K = range(1, 11)
for k in K:
    print("clustering with " + str(k) + " clusters")
    kproto = KPrototypes(n_clusters=k, init='Cao', n_jobs=4, verbose=0)
std = StandardScaler() for i in num: df_copy[i] = std.fit_transform(df_copy[i].values.reshape(-1, 1)) # taking indexes of categorical column cat_columns_index = [ df_copy.columns.get_loc(c) for c in cat.columns if c in df_copy.columns ] # K-Prototypes from kmodes.kprototypes import KPrototypes X = df_copy.values kproto = KPrototypes(n_clusters=12) clusters = kproto.fit_predict(X, categorical=cat_columns_index) # adding clusters to data df_copy['cluster'] = clusters # creating segments seg1 = df_copy[df_copy['cluster'] == 0].sort_values(['age'], axis=0, ascending=True) seg2 = df_copy[df_copy['cluster'] == 1].sort_values(['age'], axis=0, ascending=True) seg3 = df_copy[df_copy['cluster'] == 2].sort_values(['age'], axis=0, ascending=True) seg4 = df_copy[df_copy['cluster'] == 3].sort_values(['age'],
print("Columns with categorical data")
print(c_data)

# Rename every column to its positional index so that the categorical
# indices in `ind` line up with the column labels.
d = {column: position for position, column in enumerate(customer.columns)}
customer.rename(columns=d, inplace=True)

# Fit one model per candidate cluster count, keeping labels and cost.
c_list = []
cluster_dict = {}
for i in range(1, 11):
    kproto = KPrototypes(n_clusters=i, init='Cao', verbose=2)
    clusters = kproto.fit_predict(customer, categorical=ind)
    cluster_dict[i] = clusters
    c_list.append(kproto.cost_)
    print("------------------------------------------------------")

y = c_list
x = range(1, len(y) + 1)
# Plot against the real cluster counts (1..10); the x axis used to start at
# 0, which shifted the elbow by one.
sns.lineplot(y=y, x=list(x))

kn = KneeLocator(x, y, curve='convex', direction='decreasing')
print("Number of clusters : ", kn.knee)
if kn.knee is None:
    # Without this check the lookup below fails with an opaque KeyError.
    raise ValueError("KneeLocator found no knee in the cost curve")

final_cluster = cluster_dict[kn.knee]
# One empty bucket per cluster label.
cd = {label: [] for label in range(kn.knee)}
encoding='utf-8', index=True) # Estimation: K-Prototypes # # Testing Number of Clusters # K_MAX = 29 centroids_huang = [] centroids_cao = [] labels_huang = [] labels_cao = [] gamma_huang = [] gamma_cao = [] KK = range(1, K_MAX + 1) for k in KK: km = KPrototypes(n_clusters=k, init='Huang', n_init=10, verbose=1) km.fit_predict(df_norm.values, categorical=[39, 40, 41]) centroids_huang.append(km.cluster_centroids_) labels_huang.append(km.labels_) gamma_huang.append(km.gamma) km = KPrototypes(n_clusters=k, init='Cao', n_init=10, verbose=1) km.fit_predict(df_norm.values, categorical=[39, 40, 41]) centroids_cao.append(km.cluster_centroids_) labels_cao.append(km.labels_) gamma_cao.append(km.gamma) D_k_huang = [ gower_distance_tocentroid(df_norm, cent, 1) for cent in centroids_huang ] D_k_cao = [ gower_distance_tocentroid(df_norm, cent, 1) for cent in centroids_cao ] # axis=0: Horizontal. axis=1: Vertical
# Combine the encoded and the standardised data frames on their index.
df_model = df_encode.merge(df_standar,
                           left_index=True,
                           right_index=True,
                           how='left')

from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
import seaborn as sns

# Iterate over candidate cluster counts to collect the cost of each fit.
cost = {}
for k in range(2, 10):
    kproto = KPrototypes(n_clusters=k, random_state=75)
    kproto.fit_predict(df_model, categorical=[0, 1, 2])
    cost[k] = kproto.cost_

# Elbow plot
sns.pointplot(x=list(cost.keys()), y=list(cost.values()))
plt.show()

# Save the model with 5 clusters, chosen from the elbow plot.
import pickle

kproto = KPrototypes(n_clusters=5, random_state=75)
kproto = kproto.fit(df_model, categorical=[0, 1, 2])
# The context manager closes the file and flushes the pickle to disk.
with open('best_cluster.pkl', 'wb') as model_file:
    pickle.dump(kproto, model_file)

# Determine the segment of every customer.
clusters = kproto.predict(df_model, categorical=[0, 1, 2])
#!/usr/bin/env python
import numpy as np
from kmodes.kprototypes import KPrototypes

# people.csv: column 0 is the record label, the rest are the features
syms = np.genfromtxt('people.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('people.csv', dtype=object, delimiter=',')[:, 1:]
X[:, 0] = X[:, 0].astype(float)

# Four weights: the first two are 1, the last two are 100.
weights = [1, 1, 100, 100]

kproto = KPrototypes(n_clusters=3, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=[2], sample_weight=weights)

# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)

# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

# One line per record with its assigned cluster.
for s, c in zip(syms, clusters):
    print(f"Symbol: {s}, cluster:{c}")
data=data, num_numerical=num_numerical_features, num_category=num_category_features, max_iters=10, mode=2) print("K_Means算法的Calinski-Harabaz Index值为:{}".format( metrics.calinski_harabasz_score(data, label_2))) label_3, center_numerical_3, center_category_3 = K_Prototypes( random_seed=2020, n=N, data=data, num_numerical=num_numerical_features, num_category=num_category_features, max_iters=10, mode=1) print("K_Modes算法的Calinski-Harabaz Index值为:{}".format( metrics.calinski_harabasz_score(data, label_3))) kp = KPrototypes(n_clusters=5, init='Huang', n_init=1, verbose=True, n_jobs=4, random_state=2020) KPrototypes_results = kp.fit_predict( data, categorical=list( range(num_numerical_features, num_numerical_features + num_category_features - 1))) print("K_Prototypes算法包的Calinski-Harabaz Index值为:{}".format( metrics.calinski_harabasz_score(data, KPrototypes_results)))
# model parameters
# NOTE(review): this snippet was collapsed onto one line; the split between
# live code and commented-out code below is reconstructed from the '#'
# markers -- confirm against the original notebook.
# evalu = data_cats.copy()
evalu.drop(['GoodForKids'], axis=1, inplace=True)
evaluate_clusters(evalu, 50)

init = 'Huang'  # init can be 'Cao', 'Huang' or 'random'
n_clusters = 20  # how many clusters (hyper parameter)
max_iter = 100  # default 100

# get the model
# kproto = KPrototypes(n_clusters=n_clusters, init=init, verbose=2)
# # fit/predict
# clusters = kproto.fit_predict(data_cats_matrix, categorical=categoricals_indicies)
# # combine dataframe entries with resultant cluster_id
# proto_cluster_assignments = zip(data_cats_matrix, clusters)
# Instantiate dataframe to house new cluster data
# cluster_df = pd.DataFrame(columns=('GoodForKids', 'stars', 'RestaurantsPriceRange2', 'latitude', 'longitude', 'Fac1', 'Fac2', 'cluster_id'))
# # load arrays back into a dataframe
# for array in proto_cluster_assignments: cluster_df = cluster_df.append(
def cluster_clients(k=None, save_centroids=True, save_clusters=True):
    '''
    Runs k-prototypes clustering algorithm on preprocessed dataset
    :param k: Desired number of clusters
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :return: A KPrototypes object that describes the best clustering of all the runs
    '''
    # Context manager so the config file handle is closed after parsing.
    with open(os.getcwd() + "/config.yml", 'r') as cfg_file:
        cfg = yaml.full_load(cfg_file)

    # Load preprocessed client data
    try:
        client_df = pd.read_csv(cfg['PATHS']['CLIENT_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['CLIENT_DATA'] +
              ". Running preprocessing of client data.")
        raw_df = load_raw_data(cfg)
        client_df = prepare_for_clustering(cfg, raw_df, save_df=False)
    excluded_feats = cfg['K-PROTOTYPES']['FEATS_TO_EXCLUDE']
    client_df.drop(excluded_feats, axis=1,
                   inplace=True)  # Features we don't want to see in clustering
    # Snapshot taken before encoding; merged into the saved assignments.
    client_feats_df = client_df.copy()
    client_ids = client_df.pop('CONTRACT_ACCOUNT').tolist()
    cat_feats = [
        f for f in cfg['DATA']['CATEGORICAL_FEATS'] if f not in excluded_feats
    ]
    bool_feats = [
        f for f in cfg['DATA']['BOOLEAN_FEATS'] if f not in excluded_feats
    ]
    ordinal_encoder = OrdinalEncoder()
    client_df[cat_feats] = ordinal_encoder.fit_transform(client_df[cat_feats])
    X = np.array(client_df)

    # Get list of categorical feature indices. Boolean feats are considered categorical for clustering
    cat_feat_idxs = [
        client_df.columns.get_loc(c) for c in cat_feats + bool_feats
        if c in client_df
    ]
    cat_idx_set = set(cat_feat_idxs)  # O(1) membership tests
    numcl_feat_idxs = [
        i for i in range(len(client_df.columns)) if i not in cat_idx_set
    ]

    # Normalize noncategorical features
    X_noncat = X[:, numcl_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X_noncat = std_scaler.transform(X_noncat)
    X[:, numcl_feat_idxs] = X_noncat

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k,
                               verbose=1,
                               n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'],
                               init='Cao',
                               num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    # Distance between two samples: numeric dissimilarity plus the
    # gamma-weighted categorical dissimilarity.
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[numcl_feat_idxs], axis=0),
                                np.expand_dims(x1[numcl_feat_idxs], axis=0)) + \
        k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                     np.expand_dims(x1[cat_feat_idxs], axis=0))
    # In-place shift, so k_prototypes.labels (the same array) moves with it.
    client_clusters += 1  # Enforce that cluster labels are integer range of [1, K]

    # The frame keeps its default index: the previous set_index(...) call
    # discarded its result, and the CSV below is written with index=False,
    # so CONTRACT_ACCOUNT has to remain an ordinary column.
    clusters_df = pd.DataFrame({
        'CONTRACT_ACCOUNT': client_ids,
        'Cluster Membership': client_clusters
    })
    clusters_df = clusters_df.merge(client_feats_df,
                                    on='CONTRACT_ACCOUNT',
                                    how='left')

    # Get centroids of clusters
    num_centroids, cat_centroids = (k_prototypes.cluster_centroids_[0],
                                    k_prototypes.cluster_centroids_[1])
    # Every column is written below, so the uninitialised buffer is safe.
    cluster_centroids = np.empty(
        (num_centroids.shape[0],
         num_centroids.shape[1] + cat_centroids.shape[1]))
    cluster_centroids[:, numcl_feat_idxs] = num_centroids  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = cat_centroids  # Categorical features

    # Scale noncategorical features of the centroids back to original range
    centroid_noncat_feats = cluster_centroids[:, numcl_feat_idxs]
    centroid_noncat_feats = std_scaler.inverse_transform(centroid_noncat_feats)
    cluster_centroids[:, numcl_feat_idxs] = centroid_noncat_feats

    # Create a DataFrame of cluster centroids
    centroids_df = pd.DataFrame(cluster_centroids,
                                columns=list(client_df.columns))
    # Map ordinal codes back to the original category values, feature by
    # feature (categories_ is ordered like cat_feats).
    for feat, categories in zip(cat_feats, ordinal_encoder.categories_):
        ordinal_dict = dict(enumerate(categories))
        centroids_df[feat] = centroids_df[feat].map(ordinal_dict)
    centroids_df[bool_feats] = centroids_df[bool_feats].round()
    cluster_num_series = pd.Series(np.arange(1, cluster_centroids.shape[0] + 1))
    centroids_df.insert(0, 'Cluster', cluster_num_series)

    # Get fraction of clients in each cluster
    cluster_freqs = np.bincount(client_clusters) / float(
        client_clusters.shape[0])
    # Labels start at 1, so bin 0 is always empty and is skipped.
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Save centroid features and cluster assignments to spreadsheet
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] +
                            datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                            index_label=False,
                            index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] +
                           datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           index_label=False,
                           index=False)
    return k_prototypes
import numpy as np
from sklearn import datasets
from kmodes.kprototypes import KPrototypes

# Minimal k-prototypes demo on the iris data set: the measurements are used
# as numerical features and the class label as a categorical feature.
iris = datasets.load_iris()

# Glue the label vector onto the measurement matrix as one extra column.
data = np.column_stack((iris.data, iris.target))

kp = KPrototypes(n_clusters=3, init='Huang', n_init=1, verbose=True)
# Index 4 is passed as the only categorical column, i.e. the label column
# appended behind the iris measurements.
kp.fit_predict(data, categorical=[4])

# Show the fitted centroids and the cluster label assigned to every row.
print(kp.cluster_centroids_)
print(kp.labels_)
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from kmodes.kprototypes import KPrototypes

# Fit a 5-cluster k-prototypes model on 'model_data.csv' and persist it.
dataset = pd.read_csv('model_data.csv')
matrix = dataset.to_numpy()

kproto = KPrototypes(n_clusters=5, init='Cao')
# Columns 6 through 24 (inclusive) are passed as the categorical features.
kproto.fit_predict(matrix, categorical=list(range(6, 25)))

# The context manager closes the file deterministically, so the pickle is
# fully flushed to disk even if the interpreter exits right afterwards.
with open('model.pkl', 'wb+') as model_file:
    pickle.dump(kproto, model_file)
def infer_reachability(summaries, settings):
    """Infer reachability policies by clustering edge ranks per summary.

    For every summary, each (flow, edge) pair whose rounded rank exceeds
    ``settings.threshold`` becomes a candidate ``ReachabilityPolicy``. The
    candidates are split into two clusters and the cluster with the highest
    mean rank is kept as that summary's inferences.

    Args:
        summaries: Sequence of objects providing ``get_flows()``,
            ``get_edges(flow)`` and ``get_edge_rank(flow, edge)``.
        settings: Object providing ``precision`` (digits passed to
            ``round``), ``threshold`` (converted with ``float``) and
            ``equiv_classes``. When ``equiv_classes`` is truthy the rank is
            clustered together with the first element of each edge endpoint
            using k-prototypes; otherwise the ranks alone are clustered with
            Ward agglomerative clustering.

    Returns:
        A tuple ``(all_prop, inferences_per_summary)``. ``all_prop`` is the
        intersection of the inferred policy sets over all summaries, or
        ``None`` when ``summaries`` is empty. ``inferences_per_summary`` is a
        list with the inferred policy set of each summary, in input order.
    """
    all_prop = None
    inferences_per_summary = [set() for _ in summaries]

    for i, summary in enumerate(summaries):
        prop = {}   # candidate policy -> index of its row in ``ranks``
        ranks = []  # one feature row per candidate
        for flow in summary.get_flows():
            for edge in summary.get_edges(flow):
                rank = round(summary.get_edge_rank(flow, edge),
                             settings.precision)
                if rank > float(settings.threshold):
                    policy = nopticon.ReachabilityPolicy({
                        'flow': flow,
                        'source': edge[0],
                        'target': edge[1]
                    })
                    prop[policy] = len(ranks)
                    if settings.equiv_classes:
                        # TODO: genericize class
                        ranks.append([rank, edge[0][0], edge[1][0]])
                    else:
                        ranks.append([rank])

        if not ranks:
            # Nothing cleared the threshold, so there is nothing to cluster
            # and this summary contributes no inferences.
            props_to_isect = set()
        else:
            # NOTE(review): a single candidate cannot be split into two
            # clusters; the clusterer is expected to raise in that case.
            # Confirm the desired handling.
            if settings.equiv_classes:
                kproto = KPrototypes(n_clusters=2, init='cao')
                clust = kproto.fit_predict(np.asarray(ranks),
                                           categorical=[1, 2])
            else:
                agg = AgglomerativeClustering(n_clusters=2, linkage="ward")
                clust = agg.fit(ranks).labels_

            # Pick the cluster label whose candidates have the highest mean
            # rank.
            high = None
            high_mean = None
            for k in set(clust):
                kranks = [
                    ranks[idx][0] for idx in prop.values() if clust[idx] == k
                ]
                if not kranks:
                    # Presumably only reachable when equal policies replaced
                    # each other in ``prop``, leaving a cluster made solely
                    # of rows that are no longer referenced.
                    continue
                mean = sum(kranks) / len(kranks)
                if high is None or mean > high_mean:
                    high, high_mean = k, mean

            props_to_isect = {
                p for p, idx in prop.items() if clust[idx] == high
            }

        inferences_per_summary[i] = props_to_isect
        if all_prop is None:
            all_prop = props_to_isect
        else:
            all_prop = all_prop.intersection(props_to_isect)

    return (all_prop, inferences_per_summary)