# Entry point for the baseline k-means experiments. This main() appears to
# live in the same module as run_k_means_elbow and calculate_k_means_clusters.
# Imports assumed by this script (not shown in the original listing):
import numpy as np
import matplotlib.pyplot as plt

import process_data


def main():
    np.random.seed(311)

    # Import the datasets
    x_census_data, y_census_data = process_data.process_census_data(
        './../Datasets/Census_Income')
    x_stock_data, y_stock_data = process_data.process_stock_data(
        './../Datasets/Stocks', ["2016", "2017"])

    stock_params = {
        'n_clusters': [21],
        'elbow_graph': 'Stock Dataset Elbow Graph - K Means',
        'con_mat': 'Stock Dataset Confusion Matrix - K Means',
        'fig_size': (20, 5),
        'cluster_center_file': "stock_k_means_centers.txt"
    }
    census_params = {
        'n_clusters': [8],
        'elbow_graph': 'Census Dataset Elbow Graph - K Means',
        'con_mat': 'Census Dataset Confusion Matrix - K Means',
        'fig_size': (20, 5),
        'cluster_center_file': "census_k_means_centers.txt"
    }

    # Run the elbow technique on the stock dataset, then calculate the
    # clusters using the ideal cluster size and see what those clusters
    # look like.
    run_k_means_elbow(stock_params, x_stock_data)
    calculate_k_means_clusters(stock_params, x_stock_data, y_stock_data)

    plt.figure()

    # Repeat for the census dataset.
    run_k_means_elbow(census_params, x_census_data)
    calculate_k_means_clusters(census_params, x_census_data, y_census_data)
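# The definitions of run_k_means_elbow and calculate_k_means_clusters are not
# shown in this listing. As a rough illustration only, a minimal elbow helper
# built on scikit-learn might look like the sketch below
# (sketch_k_means_elbow is a hypothetical name, not part of this project):
from sklearn.cluster import KMeans


def sketch_k_means_elbow(params, x_data, max_clusters=30):
    # Fit k-means for k = 1..max_clusters and plot inertia against k; the
    # "elbow" in the curve suggests a reasonable cluster count.
    inertias = []
    cluster_range = range(1, max_clusters + 1)
    for k in cluster_range:
        inertias.append(KMeans(n_clusters=k).fit(x_data).inertia_)
    plt.figure()
    plt.plot(list(cluster_range), inertias, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia (within-cluster sum of squares)')
    plt.title(params.get('elbow_graph', 'K Means Elbow Graph'))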
# ANN experiments with cluster labels added as an extra feature. This appears
# to come from the neural-network script; it assumes process_data, k_means,
# numpy as np, and neural_network_learning are importable in this module.
def clusters_added_ann():
    x_census_data, y_census_data = process_data.process_census_data(
        './../Datasets/Census_Income')

    # Two ANNs here, one for each clustering algorithm, plus a baseline on
    # the original features.
    census_params_km = {
        'clusters': 8,
        "title": "Census Dataset - K Means Clusters Added as Feature",
        "path": "./ANN_Clustered_Data/",
        "file": "k_means_ann.txt",
        "cm_mat": "Census Dataset - K Means Clusters added Confusion Matrix"
    }
    census_params_em = {
        'clusters': 12,
        "title": "Census Dataset - EM Clusters Added as Feature",
        "path": "./ANN_Clustered_Data/",
        "file": "em_ann.txt",
        "cm_mat": "Census Dataset - EM Clusters added Confusion Matrix"
    }
    census_params_orig = {
        "title": "Census Dataset - Original",
        "path": "./ANN_Clustered_Data/",
        "file": "ann_orig.txt",
        "cm_mat": "Census Dataset - Original Confusion Matrix"
    }

    km_clusters = k_means.return_k_means_clusters(
        census_params_km, x_census_data).reshape((len(x_census_data), 1))
    # NOTE: despite the EM label, this also calls the k-means helper -- only
    # the cluster count (12) differs. If the EM module exposes an equivalent
    # helper, it would belong here instead.
    em_clusters = k_means.return_k_means_clusters(
        census_params_em, x_census_data).reshape((len(x_census_data), 1))

    # Append the cluster labels to the feature matrix as a new column.
    x_census_data_km = np.hstack([x_census_data, km_clusters])
    x_census_data_em = np.hstack([x_census_data, em_clusters])

    neural_network_learning(x_census_data_km, y_census_data, census_params_km)
    neural_network_learning(x_census_data_em, y_census_data, census_params_em)
    neural_network_learning(x_census_data, y_census_data, census_params_orig)
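# return_k_means_clusters is defined elsewhere in the project. Given that its
# result is reshaped into one label per row above, it plausibly fits k-means
# with params['clusters'] and returns the per-sample labels; a minimal sketch
# under that assumption (hypothetical name):
from sklearn.cluster import KMeans


def sketch_return_k_means_clusters(params, x_data):
    # Fit k-means and return the cluster label assigned to each sample.
    # Note: hstacking the raw integer labels treats cluster IDs as ordinal;
    # a one-hot encoding of the labels is a common alternative.
    return KMeans(n_clusters=params['clusters']).fit_predict(x_data)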
# Entry point for the random-projection analysis. This appears to be a
# separate script; determine_min_dim and create_random_guassian_projections
# are presumably defined in the same module, with process_data and numpy
# imported as above.
def main():
    np.random.seed(311)

    # Import the datasets
    x_census_data, y_census_data = process_data.process_census_data(
        './../Datasets/Census_Income')
    x_stock_data, y_stock_data = process_data.process_stock_data(
        './../Datasets/Stocks', ["2016", "2017"])

    stock_params = {
        'min_dim_graph': "Stock Dataset - Minimum Dimension vs EPS",
        'projection_loss_graph': "Stock Dataset - Projection Losses for Data Random Projections",
        'components': 100,
        'num_retry': 10
    }
    census_params = {
        'min_dim_graph': "Census Dataset - Minimum Dimension vs EPS",
        'projection_loss_graph': "Census Dataset - Projection Losses for Data Random Projections",
        'components': 4,
        'num_retry': 10
    }

    determine_min_dim(stock_params, x_stock_data)
    determine_min_dim(census_params, x_census_data)
    create_random_guassian_projections(stock_params, x_stock_data)
    create_random_guassian_projections(census_params, x_census_data)
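# determine_min_dim's definition is not shown. Given the "Minimum Dimension
# vs EPS" graph titles, it plausibly plots scikit-learn's
# Johnson-Lindenstrauss bound; a minimal sketch under that assumption
# (hypothetical name):
from sklearn.random_projection import johnson_lindenstrauss_min_dim


def sketch_min_dim_vs_eps(params, x_data):
    # For a fixed sample count, plot the minimum number of random-projection
    # components needed to preserve pairwise distances within (1 +/- eps).
    eps_values = np.linspace(0.1, 0.99, 50)
    min_dims = johnson_lindenstrauss_min_dim(len(x_data), eps=eps_values)
    plt.figure()
    plt.plot(eps_values, min_dims)
    plt.xlabel('eps (allowed distortion)')
    plt.ylabel('Minimum dimension')
    plt.title(params['min_dim_graph'])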
# Entry point for clustering the reduced-dimensionality data. This appears to
# be another separate script; it assumes imports of process_data, numpy as
# np, and the project modules principle_component_analysis,
# independant_component_analysis, random_projections, svd_projection,
# k_means, and expectation_maximization.
def main():
    np.random.seed(311)

    # Import the datasets
    x_census_data, y_census_data = process_data.process_census_data(
        './../Datasets/Census_Income')
    x_stock_data, y_stock_data = process_data.process_stock_data(
        './../Datasets/Stocks', ["2016", "2017"])

    stock_params = {
        'n_components': 100,
        'filename': None,
        'projection_loss_graph': None,
        'num_retry': 10,
        'ica_title': None,
        'components': 100
    }
    census_params = {
        'n_components': 4,
        'filename': None,
        'projection_loss_graph': None,
        'num_retry': 10,
        'ica_title': None,
        'components': 4
    }

    x_stock_data_reduced = []
    x_census_data_reduced = []

    # Build all the reduced-dimensionality datasets.
    # PCA
    x_stock_data_reduced.append(
        principle_component_analysis.run_pca(stock_params, x_stock_data))
    x_census_data_reduced.append(
        principle_component_analysis.run_pca(census_params, x_census_data))
    # ICA
    x_stock_data_reduced.append(
        independant_component_analysis.run_ica(stock_params, x_stock_data))
    x_census_data_reduced.append(
        independant_component_analysis.run_ica(census_params, x_census_data))
    # Random Projections
    x_stock_data_reduced.append(
        random_projections.create_random_guassian_projections(
            stock_params, x_stock_data))
    x_census_data_reduced.append(
        random_projections.create_random_guassian_projections(
            census_params, x_census_data))
    # SVD
    x_stock_data_reduced.append(
        svd_projection.run_svd(stock_params, x_stock_data))
    x_census_data_reduced.append(
        svd_projection.run_svd(census_params, x_census_data))

    # Make some rough elbow graphs to figure out how many clusters to use.
    km_stock_elbow_dict = {
        'elbow_graph': 'Stock Dataset K Means Elbow Graph Reduced Dimension',
        'path': './Reduced_Data_Clustering/'
    }
    k_means.run_k_means_elbow(km_stock_elbow_dict, x_stock_data_reduced[0])
    km_census_elbow_dict = {
        'elbow_graph': 'Census Dataset K Means Elbow Graph Reduced Dimension',
        'path': './Reduced_Data_Clustering/'
    }
    k_means.run_k_means_elbow(km_census_elbow_dict, x_census_data_reduced[0])

    # Run k-means and EM on every reduced dataset. The parameter dicts differ
    # only in their confusion-matrix titles, so build them in a loop.
    reductions = ["PCA Reduction", "ICA Reduction",
                  "Random Projections", "SVD Reduction"]

    def make_params(n_clusters, title_prefix):
        return [{'n_clusters': [n_clusters],
                 'con_mat': "{} {}".format(title_prefix, name),
                 'path': './Reduced_Data_Clustering/',
                 'fig_size': (20, 5)} for name in reductions]

    k_means_stock_params_km = make_params(
        23, "Stock Dataset Confusion Matrix - K-Means")
    k_means_census_params_km = make_params(
        8, "Census Dataset Confusion Matrix - K-Means")
    em_stock_params = make_params(
        8, "Stock Dataset Confusion Matrix - EM")
    em_census_params = make_params(
        12, "Census Dataset Confusion Matrix - EM")

    # Loop through all the parameters and datasets to do the analysis:
    # 4 reductions x 4 analyses = 16 runs in total.
    for i in range(4):
        k_means.calculate_k_means_clusters(
            k_means_stock_params_km[i], x_stock_data_reduced[i], y_stock_data)
        k_means.calculate_k_means_clusters(
            k_means_census_params_km[i], x_census_data_reduced[i],
            y_census_data)
        expectation_maximization.calculate_em_clusters(
            em_stock_params[i], x_stock_data_reduced[i], y_stock_data)
        expectation_maximization.calculate_em_clusters(
            em_census_params[i], x_census_data_reduced[i], y_census_data)
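# run_pca, run_ica, and run_svd live in project modules whose definitions are
# not shown. As an illustration of the apparent contract (fit a reducer with
# params['n_components'] and return the transformed data), a PCA-only sketch
# (hypothetical name):
from sklearn.decomposition import PCA


def sketch_run_pca(params, x_data):
    # Project the data onto its top principal components and return the
    # reduced feature matrix.
    return PCA(n_components=params['n_components']).fit_transform(x_data)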
# ANN experiments on the reduced-dimensionality census data; assumes the same
# module-level imports as clusters_added_ann plus the reduction modules.
def dimensionality_reduction_ann():
    x_census_data, y_census_data = process_data.process_census_data(
        './../Datasets/Census_Income')

    # NOTE: these two values are never referenced in this function; they are
    # presumably consumed inside neural_network_learning or are leftovers.
    train_size = 0.6
    random_state = 86

    x_census_data_reduced = []
    census_params = {
        'n_components': 4,
        'filename': None,
        'projection_loss_graph': None,
        'num_retry': 10,
        'ica_title': None,
        'components': 4
    }

    # Build all the reduced-dimensionality datasets.
    # PCA
    x_census_data_reduced.append(
        principle_component_analysis.run_pca(census_params, x_census_data))
    # ICA
    x_census_data_reduced.append(
        independant_component_analysis.run_ica(census_params, x_census_data))
    # Random Projections
    x_census_data_reduced.append(
        random_projections.create_random_guassian_projections(
            census_params, x_census_data))
    # SVD
    x_census_data_reduced.append(
        svd_projection.run_svd(census_params, x_census_data))

    # Rebind census_params to a list of per-experiment parameter dicts, one
    # per reduction technique.
    census_params = []
    census_params.append({
        "title": "Census Dataset - PCA Dimension Reduction",
        "path": "./ANN_Reduced_Data/",
        'file': 'pca_reduced_ann.txt',
        'cm_mat': "Census Dataset - PCA Dimension Reduction Confusion Matrix"
    })
    census_params.append({
        "title": "Census Dataset - ICA Dimension Reduction",
        "path": "./ANN_Reduced_Data/",
        'file': 'ica_reduced_ann.txt',
        'cm_mat': "Census Dataset - ICA Dimension Reduction Confusion Matrix"
    })
    census_params.append({
        "title": "Census Dataset - Random Projection Reduction",
        "path": "./ANN_Reduced_Data/",
        'file': 'random_reduced_ann.txt',
        'cm_mat': "Census Dataset - Random Projection Reduction Confusion Matrix"
    })
    census_params.append({
        "title": "Census Dataset - SVD Dimension Reduction",
        "path": "./ANN_Reduced_Data/",
        'file': 'svd_reduced_ann.txt',
        'cm_mat': "Census Dataset - SVD Dimension Reduction Confusion Matrix"
    })

    # Run 4 neural nets - one for each instance of the reduced data.
    for i in range(4):
        neural_network_learning(x_census_data_reduced[i], y_census_data,
                                census_params[i])
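# neural_network_learning is defined elsewhere; the otherwise-unused
# train_size and random_state above hint at a train/test split. A minimal
# sketch under that assumption, using scikit-learn's MLPClassifier (the
# hidden-layer size and iteration count are guesses, and the name is
# hypothetical):
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier


def sketch_neural_network_learning(x_data, y_data, params,
                                   train_size=0.6, random_state=86):
    # Split the data, fit a small MLP, and report test accuracy for the
    # experiment named in params['title'].
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, train_size=train_size, random_state=random_state)
    clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=500,
                        random_state=random_state)
    clf.fit(x_train, y_train)
    print(params['title'], 'test accuracy:', clf.score(x_test, y_test))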