def clustering(the_image_autoencoded, the_image_shape, number_of_clusters, extra_parameters=""):
    """Cluster samples with mean-shift and return the labels as a 2-D image.

    Fits ``MeanShift(bandwidth=2)`` on ``the_image_autoencoded`` and lays the
    resulting per-sample labels out row by row on a grid of
    ``the_image_shape[0]`` rows by ``the_image_shape[1]`` columns.

    References:
        https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68
        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html

    Args:
        the_image_autoencoded: Samples handed unchanged to ``MeanShift.fit``.
            Presumably one row per pixel in row-major order -- TODO confirm
            against the caller.
        the_image_shape: Sequence whose first two entries are the number of
            rows and columns of the returned grid.
        number_of_clusters: Not used by this function (mean-shift is not
            given a cluster count here); kept for interface compatibility.
        extra_parameters: Not used by this function; kept for interface
            compatibility.

    Returns:
        A float ``numpy.ndarray`` of shape
        ``(the_image_shape[0], the_image_shape[1])`` holding one cluster
        label per cell.

    Raises:
        IndexError: If the fit produced fewer labels than there are cells in
            the requested grid.
    """
    print()
    print("*** Mean-Shift clustering ***")
    print("---------------------------------")
    print("Image shape: ", the_image_shape)

    rows, cols = the_image_shape[0], the_image_shape[1]
    n_pixels = rows * cols

    print("Running fit function for mean-shift clustering")
    clust = MeanShift(bandwidth=2).fit(the_image_autoencoded)

    print("Creating list for clustered data")
    clustered_data = np.zeros((rows, cols))
    print("Clustered data shape: ", np.shape(clustered_data))

    # Only the first rows * cols labels are placed on the grid; surplus
    # labels are ignored, a shortfall is an error.
    labels = np.asarray(clust.labels_)[:n_pixels]
    if labels.size < n_pixels:
        raise IndexError(
            "mean-shift produced {} labels but the image needs {}".format(
                labels.size, n_pixels))
    # Row-major fill: label i lands at (i // cols, i % cols).
    clustered_data[...] = labels.reshape(rows, cols)

    # Parameters start
    print("Parameters for this estimation: ", clust.get_params())
    if labels.size:
        label_min = labels.min()
        label_max = labels.max()
        # Count the distinct labels; max - min would be one short.
        n_labels = np.unique(labels).size
    else:
        label_min = label_max = None
        n_labels = 0
    print("Labels from", label_min, " ,to", label_max, ". Number of labels: ", n_labels)
    # Parameters stop

    return clustered_data
def _mean_shift(table, input_cols, prediction_col='prediction', bandwidth=None,
                bin_seeding=False, min_bin_freq=1, cluster_all=True):
    """Fit mean-shift on selected columns and build a report plus labelled table.

    Args:
        table: Table indexed as ``table[input_cols]`` and copied with
            ``table.copy()``; presumably a pandas DataFrame -- TODO confirm.
        input_cols: Column names used as clustering features.
        prediction_col: Name of the column that receives the cluster labels
            in the output table.
        bandwidth: Forwarded to ``MeanShift``.
        bin_seeding: Forwarded to ``MeanShift``.
        min_bin_freq: Forwarded to ``MeanShift``.
        cluster_all: Forwarded to ``MeanShift``.

    Returns:
        A dict with two entries:
        ``'out_table'`` -- a copy of ``table`` with ``prediction_col`` set to
        the fitted labels;
        ``'model'`` -- the dict returned by ``_model_dict('mean_shift')``
        extended with ``'model'`` (the fitted estimator), ``'input_cols'``
        and ``'_repr_brtc_'`` (the report produced by ``BrtcReprBuilder``).
    """
    inputarr = table[input_cols]

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=bin_seeding,
                   min_bin_freq=min_bin_freq, cluster_all=cluster_all, n_jobs=1)
    ms.fit(inputarr)

    # Estimator parameter name -> label shown in the report.
    label_name = {
        'bandwidth': 'Bandwidth',
        'bin_seeding': 'Bin Seeding',
        'min_bin_freq': 'Minimum Bin Frequency',
        'cluster_all': 'Cluster All'
    }
    fitted_params = ms.get_params()
    # DataFrame.from_items was removed in pandas 1.0; the plain constructor
    # with an explicit column order builds the same two-column table.
    param_table = pd.DataFrame(
        {
            'Parameter': list(label_name.values()),
            'Value': [fitted_params[key] for key in label_name],
        },
        columns=['Parameter', 'Value'])

    cluster_centers = ms.cluster_centers_
    n_clusters = len(cluster_centers)
    # One colormap entry per cluster, evenly spaced over [0, 1).
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    labels = ms.labels_

    # The PCA projection is only produced when there is more than one
    # feature column.
    multi_feature = len(input_cols) > 1
    if multi_feature:
        pca2_model = PCA(n_components=2).fit(inputarr)
        pca2 = pca2_model.transform(inputarr)

    fig_centers = _mean_shift_centers_plot(input_cols, cluster_centers, colors)

    # Pass 100 as the sample argument for tables with more than 100 rows,
    # otherwise None.
    sample_size = 100 if len(table.index) > 100 else None
    fig_samples = _mean_shift_samples_plot(
        table, input_cols, sample_size, cluster_centers, colors)

    rb = BrtcReprBuilder()
    if multi_feature:
        fig_pca = _mean_shift_pca_plot(labels, cluster_centers, pca2_model, pca2, colors)
        rb.addMD(
            strip_margin("""
            | ## Mean Shift Result
            | - Coordinates of cluster centers
            | {fig_cluster_centers}
            | - Samples
            | {fig_pca}
            | {fig_samples}
            | ### Parameters
            | {params}
            """.format(fig_cluster_centers=fig_centers, fig_pca=fig_pca,
                       fig_samples=fig_samples, params=pandasDF2MD(param_table))))
    else:
        rb.addMD(
            strip_margin("""
            | ## Mean Shift Result
            | - Coordinates of cluster centers
            | {fig_cluster_centers}
            | - Samples
            | {fig_samples}
            | ### Parameters
            | {params}
            """.format(fig_cluster_centers=fig_centers, fig_samples=fig_samples,
                       params=pandasDF2MD(param_table))))

    model = _model_dict('mean_shift')
    model['model'] = ms
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels

    return {'out_table': out_table, 'model': model}
# Lay the flat mean-shift label vector out on the image grid, save it as a
# picture, then report the label range and the total run time.
print("Clustered data shape: ", np.shape(clustered_data))

n_rows = np.shape(clustered_data)[0]
n_cols = np.shape(clustered_data)[1]
n_pixels = n_rows * n_cols

# The first index advances fastest and wraps at the size of the first
# dimension (previously hard-coded as 100, which only worked for grids whose
# first dimension was exactly 100).
# NOTE(review): this is a column-major fill (label i -> [i % n_rows][i // n_rows]);
# confirm that matches how the samples were flattened before clustering.
for i in range(n_pixels):
    second, first = divmod(i, n_rows)
    clustered_data[first][second] = clust.labels_[i]

plt.imshow(clustered_data)
name = img_dir + 'img_mean_shift_clustering.png'
plt.savefig(name, bbox_inches='tight')

print("Parameters for this estimation: ", clust.get_params())

# Summarise only the labels that were placed on the grid.
placed_labels = np.asarray(clust.labels_)[:n_pixels]
if placed_labels.size:
    label_min = placed_labels.min()
    label_max = placed_labels.max()
    # Count the distinct labels; max - min would be one short.
    n_labels = np.unique(placed_labels).size
else:
    label_min = label_max = None
    n_labels = 0
print("Labels from", label_min, " ,to", label_max, ". Number of labels: ", n_labels)

print()
print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
end_time = time.time()
print("End time: ", time.ctime(end_time))
print("Duration: ", int(end_time - start_time), " seconds")