# First let us use non-hierarchical clustering.
clusteringNH = NonHierarchicalClustering()

# Let us look at k-means first. The candidate values are materialised as a
# list so that plotting receives a real sequence under Python 3 as well.
k_values = list(range(2, 10))
silhouette_values = []

# Do some initial runs to determine the right number for k: cluster once per
# candidate k and record the mean of the 'silhouette' column.
print('===== kmeans clustering =====')
for k in k_values:
    print('k = ', k)
    # A deep copy of the dataset is handed to every run.
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset), ['Bx', 'By', 'Bz'], k, 'default', 20, 10)
    silhouette_score = dataset_cluster['silhouette'].mean()
    print('silhouette = ', silhouette_score)
    silhouette_values.append(silhouette_score)

# Plot the mean silhouette score against k.
plot.plot(k_values, silhouette_values, 'b-')
plot.xlabel('k')
plot.ylabel('silhouette score')
plot.ylim([0, 1])
plot.show()

# And run k-means with the highest silhouette score; these are the starting
# values for the search over the recorded scores.
k = 0
best_silhouette_score = 0
# Non-hierarchical clustering, starting with k-means.
clusteringNH = NonHierarchicalClustering()

# Candidate cluster counts and the mean silhouette score recorded for each.
k_values = range(2, 10)
silhouette_values = []

# Columns handed to the clustering call.
attributes_to_cluster = [
    'Acceleration x (m/s^2)',
    'Acceleration y (m/s^2)',
    'Acceleration z (m/s^2)',
    'Gyroscope x (rad/s)',
    'Gyroscope y (rad/s)',
    'Gyroscope z (rad/s)',
]

# Trial runs to determine a suitable value for k.
print('===== kmeans clustering =====')
for k in k_values:
    print(f'k = {k}')
    # Every run receives its own deep copy of the dataset.
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset),
        attributes_to_cluster,
        k,
        'default',
        20,
        10,
    )
    silhouette_score = dataset_cluster['silhouette'].mean()
    print(f'silhouette = {silhouette_score}')
    silhouette_values += [silhouette_score]

# Mean silhouette score as a function of k.
DataViz.plot_xy(
    x=[k_values],
    y=[silhouette_values],
    xlabel='k',
    ylabel='silhouette score',
    ylim=[0, 1],
    line_styles=['b-'],
)

# Continue with the k whose mean silhouette score is highest; this argmax
# lookup replaced a previously hard-coded k = 6.
best_index = np.argmax(silhouette_values)
k = k_values[best_index]
def _silhouette_sweep(cluster_fn, dataset, cols, k_values):
    """Cluster once per candidate k and collect the mean silhouette scores.

    Args:
        cluster_fn: Clustering callable accepting the keyword arguments
            ``dataset``, ``cols``, ``k``, ``distance_metric``, ``max_iters``
            and ``n_inits`` and returning a table with a 'silhouette' column.
        dataset: Table to cluster; a deep copy is passed to every run.
        cols: Names of the columns to cluster on.
        k_values: Iterable of candidate cluster counts.

    Returns:
        List with the mean of the 'silhouette' column for each k, in the
        order of ``k_values``.
    """
    silhouette_values = []
    for k in k_values:
        print(f'k = {k}')
        dataset_cluster = cluster_fn(
            dataset=copy.deepcopy(dataset),
            cols=list(cols),  # fresh list per call, like the original literal
            k=k,
            distance_metric='default',
            max_iters=20,
            n_inits=10)
        silhouette_score = dataset_cluster['silhouette'].mean()
        print(f'silhouette = {silhouette_score}')
        silhouette_values.append(silhouette_score)
    return silhouette_values


def main():
    """Run the clustering experiment selected by ``FLAGS.mode``.

    Modes handled here:
        'kmeans':        sweep k for k-means, plot the silhouette scores and
                         report the best k.
        'kmediods':      sweep k for k-medoids, then re-run with the best k
                         and plot/print the resulting clusters.
        'agglomerative': sweep the maximum number of clusters for
                         agglomerative clustering and plot the scores.
        'final':         run k-means with ``FLAGS.k``, plot/print the result
                         and write the dataset to ``DATA_PATH / RESULT_FNAME``.

    Raises:
        IOError: re-raised when the input CSV cannot be read.
    """
    # Read the result from the previous chapter and convert the index to
    # datetime.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise

    # Create an instance of visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Create objects for clustering.
    clusteringNH = NonHierarchicalClustering()
    clusteringH = HierarchicalClustering()

    # Columns every mode clusters on; copied per call so that no callee can
    # alter the list seen by a later call.
    sensor_cols = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z']
    k_values = range(2, 10)

    if FLAGS.mode == 'kmeans':
        # Do some initial runs to determine the right number for k.
        print('Running k-means clustering')
        silhouette_values = _silhouette_sweep(
            clusteringNH.k_means_over_instances, dataset, sensor_cols,
            k_values)
        DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k',
                        ylabel='silhouette score', ylim=[0, 1],
                        line_styles=['b-'])

        # Report the k with the highest silhouette score.
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Means silhouette score: k = {k}')
        print('Use this value of k to run the --mode=final --k=?')

    # NOTE: the flag value is spelled 'kmediods'; it is kept as-is because it
    # is part of the command-line interface.
    if FLAGS.mode == 'kmediods':
        # Do some initial runs to determine the right number for k.
        print('Running k-medoids clustering')
        silhouette_values = _silhouette_sweep(
            clusteringNH.k_medoids_over_instances, dataset, sensor_cols,
            k_values)
        DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k',
                        ylabel='silhouette score', ylim=[0, 1],
                        line_styles=['b-'])

        # Run k-medoids with the highest silhouette score.
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Medoids silhouette score: k = {k}')

        dataset_kmed = clusteringNH.k_medoids_over_instances(
            dataset=copy.deepcopy(dataset),
            cols=list(sensor_cols),
            k=k,
            distance_metric='default',
            max_iters=20,
            n_inits=50)
        DataViz.plot_clusters_3d(
            data_table=dataset_kmed,
            data_cols=list(sensor_cols),
            cluster_col='cluster',
            label_cols=['label'])
        DataViz.plot_silhouette(data_table=dataset_kmed,
                                cluster_col='cluster',
                                silhouette_col='silhouette')
        util.print_latex_statistics_clusters(
            dataset=dataset_kmed,
            cluster_col='cluster',
            input_cols=list(sensor_cols),
            label_col='label')

    # Run hierarchical clustering.
    if FLAGS.mode == 'agglomerative':
        silhouette_values = []

        # Do some initial runs to determine the right number for the maximum
        # number of clusters.
        print('Running agglomerative clustering')
        for k in k_values:
            print(f'k = {k}')
            # The dataset itself (not a copy) is passed here, together with
            # use_prev_linkage=True, exactly as before.
            dataset_cluster, link = clusteringH.agglomerative_over_instances(
                dataset=dataset,
                cols=list(sensor_cols),
                max_clusters=k,
                distance_metric='euclidean',
                use_prev_linkage=True,
                link_function='ward')
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)
            # The dendrogram is only drawn for the first candidate k.
            if k == k_values[0]:
                DataViz.plot_dendrogram(dataset_cluster, link)

        # Plot the clustering results.
        DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k',
                        ylabel='silhouette score', ylim=[0, 1],
                        line_styles=['b-'])

    if FLAGS.mode == 'final':
        # Select the outcome dataset of the k-means clustering.
        dataset = clusteringNH.k_means_over_instances(
            dataset=dataset,
            cols=list(sensor_cols),
            k=FLAGS.k,
            distance_metric='default',
            max_iters=50,
            n_inits=50)

        # Plot the results.
        DataViz.plot_clusters_3d(dataset, list(sensor_cols), 'cluster',
                                 ['label'])
        DataViz.plot_silhouette(dataset, 'cluster', 'silhouette')

        # Print table statistics.
        util.print_latex_statistics_clusters(
            dataset, 'cluster', list(sensor_cols), 'label')

        # The silhouette column is dropped before the dataset is stored.
        del dataset['silhouette']

        # Store the final dataset.
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
# First let us use non-hierarchical clustering.
clusteringNH = NonHierarchicalClustering()

# Let us look at k-means first. The candidate values are materialised as a
# list so that plotting receives a real sequence under Python 3 as well.
k_values = list(range(2, 10))
silhouette_values = []

# Do some initial runs to determine the right number for k: cluster once per
# candidate k and record the mean of the 'silhouette' column.
print('===== kmeans clustering =====')
for k in k_values:
    print('k = ', k)
    # A deep copy of the dataset is handed to every run.
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset),
        ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
        k, 'default', 20, 10)
    silhouette_score = dataset_cluster['silhouette'].mean()
    print('silhouette = ', silhouette_score)
    silhouette_values.append(silhouette_score)

# Plot the mean silhouette score against k.
plot.plot(k_values, silhouette_values, 'b-')
plot.xlabel('k')
plot.ylabel('silhouette score')
plot.ylim([0, 1])
plot.show()

# And run k-means with the chosen number of clusters (hard-coded here).
k = 6
# First let us use non-hierarchical clustering.
clusteringNH = NonHierarchicalClustering()

# Let us look at k-means first. The candidate values are materialised as a
# list so that plotting receives a real sequence under Python 3 as well.
k_values = list(range(2, 10))
silhouette_values = []

# Do some initial runs to determine the right number for k: cluster once per
# candidate k and record the mean of the 'silhouette' column.
print('===== kmeans clustering =====')
for k in k_values:
    print('k = ', k)
    # A deep copy of the dataset is handed to every run.
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset), selected_columns, k, 'default', 20, 10)
    silhouette_score = dataset_cluster['silhouette'].mean()
    print('silhouette = ', silhouette_score)
    silhouette_values.append(silhouette_score)

# Plot the mean silhouette score against k.
plot.plot(k_values, silhouette_values, 'b-')
plot.xlabel('k')
plot.ylabel('silhouette score')
plot.ylim([0, 1])
plot.show()

# And run k-means with the highest silhouette score; these are the starting
# values for the search over the recorded scores.
highest_score = 0
k = 0
# Plotting helper used for the figures below.
DataViz = VisualizeDataset(__file__)

# Non-hierarchical clustering comes first, beginning with k-means.
clusteringNH = NonHierarchicalClustering()

# Candidate cluster counts and the mean silhouette score recorded for each.
k_values = range(2, 10)
silhouette_values = []

# Trial runs to determine a suitable value for k.
print('===== kmeans clustering =====')
for k in k_values:
    print(f'k = {k}')
    # Every run receives its own deep copy of the dataset.
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset),
        ['gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z'],
        k,
        'default',
        20,
        10,
    )
    silhouette_score = dataset_cluster['silhouette'].mean()
    print(f'silhouette = {silhouette_score}')
    silhouette_values += [silhouette_score]

# Mean silhouette score as a function of k.
DataViz.plot_xy(
    x=[k_values],
    y=[silhouette_values],
    xlabel='k',
    ylabel='silhouette score',
    ylim=[0, 1],
    line_styles=['b-'],
)

# Next: run k-means with the highest silhouette score.
# The former hard-coded "k = 6" was replaced with an np.argmax call over the
# silhouette scores.
clusteringNH = NonHierarchicalClustering()

# k-means: for every entry of `params`, sweep k and report the k with the
# highest mean silhouette score. Each entry is indexed as [0], [1], [2]
# (the columns to cluster on) and [3] (printed next to the result).
for printing in params:
    k_values = range(2, 10)
    silhouette_values = []
    cluster_columns = [printing[0], printing[1], printing[2]]

    for k in k_values:
        # Every run receives its own deep copy of the dataset.
        dataset_cluster = clusteringNH.k_means_over_instances(
            copy.deepcopy(dataset),
            list(cluster_columns),
            k,
            'default',
            20,
            10,
        )
        silhouette_score = dataset_cluster['silhouette'].mean()
        silhouette_values += [silhouette_score]

    # (Per-k progress printing and the silhouette-vs-k plot are disabled in
    # this variant.)

    # Pick the first k that reaches the maximum silhouette score.
    best_score = max(silhouette_values)
    k = k_values[silhouette_values.index(best_score)]
    print(printing[3], k, max(silhouette_values))
# Plotting helper used for the figures below.
DataViz = VisualizeDataset(__file__)

# Non-hierarchical clustering comes first, beginning with k-means.
clusteringNH = NonHierarchicalClustering()

# Candidate cluster counts and the mean silhouette score recorded for each.
k_values = range(2, 10)
silhouette_values = []

# Trial runs to determine a suitable value for k.
print('===== kmeans clustering =====')
for k in k_values:
    print(f'k = {k}')
    # Every run receives its own deep copy of the dataset.
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset),
        ['gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z'],
        k,
        'default',
        20,
        10,
    )
    silhouette_score = dataset_cluster['silhouette'].mean()
    print(f'silhouette = {silhouette_score}')
    silhouette_values += [silhouette_score]

# Mean silhouette score as a function of k.
DataViz.plot_xy(
    x=[k_values],
    y=[silhouette_values],
    xlabel='k',
    ylabel='silhouette score',
    ylim=[0, 1],
    line_styles=['b-'],
)

# Next: run k-means with the highest silhouette score.
# The former hard-coded "k = 6" was replaced with an np.argmax call over the
# silhouette scores.