def cluster_category_data(df, scale_data='minmax', dim_red_method='som',
                          use_elbow_method=True, cluster_method='hierarchical',
                          n_clusters=None, verbose=1, perplexity=None):
    """Cluster the columns of one category, optionally after dimensionality reduction.

    :param df: dataframe containing all the columns belonging to a category to
        be used in clustering
    :param scale_data: method used to scale the dataset: 'minmax', 'standard',
        or anything else to use the raw values
    :param dim_red_method: options are 'som', 'umap', 'tsne', None.
        If None, do clustering directly.
    :param use_elbow_method: if truthy, the elbow method is used to find the
        optimum number of clusters (KMeans is always used for the elbow search
        because of the time required). If falsy, ``n_clusters`` must be given.
        NOTE: the original default was the *string* ``'True'`` — always truthy,
        which silently ignored an intended boolean semantics; fixed to ``True``
        (same truthiness, so existing callers are unaffected).
    :param cluster_method: options are 'kmeans' and 'hierarchical'
    :param n_clusters: number of clusters when ``use_elbow_method`` is falsy
    :param verbose: if truthy, print progress of the clustering process
    :param perplexity: if dim_red_method is 'tsne', perplexity needs to be
        specified
    :return: tuple ``(labels, k)`` — cluster label per row of ``df`` and the
        chosen number of clusters
    """
    t = time.time()
    # Scale features first so no single column dominates the distance metric.
    if scale_data == 'minmax':
        X = MinMaxScaler().fit_transform(df)
    elif scale_data == 'standard':
        X = StandardScaler().fit_transform(df)
    else:
        X = df.values
    if verbose:
        print(f'number of features = {df.shape[1]}')

    if dim_red_method == 'som':
        if verbose:
            print(
                'Self Organising Maps is being used for dimensionality reduction...'
            )
        opt_k = 2
        max_s = -1
        for mapsize in [(30, 30)]:
            if verbose:
                print(f'map size = {mapsize}')
            sm = SOMFactory().build(X, normalization='var',
                                    initialization='pca', mapsize=mapsize)
            sm.train(n_job=1, verbose=False, train_rough_len=100,
                     train_finetune_len=500)
            if use_elbow_method:
                model = KElbowVisualizer(KMeans(), k=20, timings=False)
                elbow = model.fit(sm.codebook.matrix).elbow_value_
                if elbow and verbose:
                    print(f'elbow value = {elbow}')
                if not elbow:
                    if verbose:
                        print('elbow not found')
                    # Fall back to a silhouette sweep over the SOM codebook.
                    ms = -1
                    for k in range(2, 20):
                        km_labels = KMeans(k).fit_predict(sm.codebook.matrix)
                        s = silhouette_score(sm.codebook.matrix, km_labels)
                        if s > ms:
                            # BUG FIX: the original never updated ms, so the
                            # comparison was always against -1 and elbow ended
                            # up as the LAST k, not the best-scoring one.
                            ms = s
                            elbow = k
            else:
                elbow = n_clusters
            # Map every original sample to its BMU, then lift the BMU-level
            # cluster labels back to sample level.
            x = sm.project_data(X)
            labels, _, _ = sm.cluster(opt=elbow, cl_type=cluster_method)
            clabels = [labels[x[i]] for i in range(X.shape[0])]
            s_score = silhouette_score(X, clabels)
            if verbose:
                print(f'silhouette score = {round(s_score, 3)}')
            max_s = max(s_score, max_s)
            if max_s == s_score:
                # This mapsize is the best seen so far — remember it.
                opt_k = elbow
                opt_labels = clabels
                opt_size = mapsize
            if max_s > s_score:
                # Score got worse than the running best: stop searching.
                break
        if verbose:
            print(f'optimum mapsize = {opt_size}')
            print(
                f'optimum number of clusters = {opt_k} & silhouette score = {round(max_s,3)}'
            )
            print(f'time taken = {round(time.time()-t,1)}')
        return opt_labels, opt_k

    elif dim_red_method:
        # NOTE(review): any dim_red_method other than 'umap'/'tsne' leaves
        # `embedding` undefined and raises NameError — matches original
        # behavior; callers are expected to pass a documented option.
        if dim_red_method == 'umap':
            print('UMAP is being used for dimensionality reduction...')
            embedding = umap.UMAP(n_components=2, n_neighbors=5,
                                  min_dist=0.0001, metric='euclidean',
                                  random_state=1, spread=0.5,
                                  n_epochs=1000).fit_transform(X)
            print('UMAP embedding done...')
        elif dim_red_method == 'tsne':
            print('t-SNE is being used for dimensionality reduction...')
            embedding = TSNE(perplexity=perplexity).fit_transform(X)
            print('t-SNE embedding is done...')
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(embedding).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(embedding)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(embedding)
        if verbose:
            # Silhouette is scored on the ORIGINAL feature space, not the
            # low-dimensional embedding.
            s_score = silhouette_score(X, opt_labels)
            print(
                f'number of clusters = {elbow} and silhouette_score = {s_score}'
            )
        return opt_labels, elbow

    else:
        # No dimensionality reduction: cluster the (scaled) data directly.
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(X).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(X)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(X)
        print(f'silhouette score = {round(silhouette_score(X,opt_labels),3)}')
        return opt_labels, elbow
# NOTE(review): notebook-style script chunk — it relies on names defined
# elsewhere (sm, data, data_std, gdf, s_var, k_var, d_var, BmuHitsView, pd,
# os); verify those are in scope before running.
# Visualise how many samples hit each SOM neuron (BMU hits map).
vhts = BmuHitsView(5, 5, "Hits Map", text_size=11)
vhts.show(sm, anotate=True, onlyzeros=False, labelsize=9, cmap="plasma",
          logaritmic=False)
# Get the labels for each BMU
# in the SOM (15 * 10 neurons)
clabs = sm.cluster_labels
# Project the data on to the SOM
# so that we get the BMU for each
# of the original data points
bmus = sm.project_data(data)
# Turn the BMUs into cluster labels
# and append to the data frame
data_std[s_var] = pd.Series(clabs[bmus], index=data_std.index)
# presumably s_var == 'SOM' given the attribute access below — TODO confirm
print(data_std.SOM.value_counts())
# Join the labelled data back onto the geodataframe (inner join on index).
sdf = gdf.join(data_std, how='inner')
sdf.sample(5)[[k_var, d_var, s_var]]
from pysal.contrib.viz import mapping as maps
# Where will our shapefile be stored
shp_link = os.path.join('outputs', 'lsoas_som.shp')