def plot_pca_points(dataset):
    """Save a 3-D scatter plot of the PCA-transformed points, coloured by label.

    Labels are read with ``load_pulsar`` when ``dataset`` is ``"pulsar"`` and
    with ``load_intention`` for any other value. The first three columns of the
    PCA-transformed data are used as the x, y and z coordinates. The figure is
    written to ``part2_plot_dir`` as ``PCAPlot{dataset}.png`` and then closed.
    """
    loader = load_pulsar if dataset == "pulsar" else load_intention
    _, labels = loader()
    projected = get_transformed_data(dataset, "PCA", pca_dir)

    figure = plt.figure()
    axes = figure.add_subplot(111, projection="3d")

    # Two-colour map applied to the label values passed as ``c``.
    label_cmap = mpl.colors.ListedColormap(["red", "blue"])
    axes.scatter(
        projected[:, 0],
        projected[:, 1],
        zs=projected[:, 2],
        c=labels,
        cmap=label_cmap,
        alpha=0.2,
    )

    axis_labels = (
        (axes.set_xlabel, "PCA Dimension 1"),
        (axes.set_ylabel, "PCA Dimension 2"),
        (axes.set_zlabel, "PCA Dimension 3"),
    )
    for apply_label, text in axis_labels:
        apply_label(text)
    axes.set_title("PCA-Transformed Data Points by Label")

    destination = os.path.join(part2_plot_dir, f"PCAPlot{dataset}.png")
    plt.savefig(destination)
    plt.close()
def plot_principal_axes(dataset):
    """Bar-plot the first two PCA principal axes over the original features.

    The fitted PCA transformer ``{dataset}_PCA_transformer.pkl`` is fetched
    from ``pca_dir`` via ``get_transformer``. Its ``components_`` matrix is
    labelled with the dataset's feature names, restricted to the columns of
    interest, and the first two rows are drawn as a grouped bar chart. The
    figure is saved to ``part2_plot_dir`` as ``PrincipalAxes{dataset}.png``.

    Args:
        dataset: ``"pulsar"`` selects the pulsar data and plots every feature;
            any other value selects the intention data and plots only a fixed
            subset of its columns.
    """
    if dataset == "pulsar":
        X, _ = load_pulsar()
        cols_to_use = X.columns  # Use all
    else:
        X, _ = load_intention()
        cols_to_use = [
            "Administrative",
            "Administrative_Duration",
            "Informational",
            "Informational_Duration",
            "ProductRelated",
            "ProductRelated_Duration",
            "BounceRates",
            "ExitRates",
            "PageValues",
        ]

    pca_transformer_name = f"{dataset}_PCA_transformer.pkl"
    # NOTE(review): the transformed data is not used by this plot; the call is
    # kept only in case it has load-time side effects. Confirm and remove.
    get_transformed_data(dataset, "PCA", pca_dir)
    pca = get_transformer(pca_transformer_name, pca_dir)

    # One label per row of components_. Sizing the index from the feature
    # count instead would raise whenever the transformer kept fewer
    # components than there are features.
    index = [f"Component {i + 1}" for i in range(len(pca.components_))]
    components = pd.DataFrame(pca.components_, columns=X.columns, index=index)
    components = components[cols_to_use]
    first_two_components = components.iloc[:2]

    ax = first_two_components.plot(kind="bar")
    ax.set_ylabel("Value")
    ax.set_title(
        "Principal Axes in Feature Space resulting from PCA Decomposition")
    plt.savefig(os.path.join(part2_plot_dir, f"PrincipalAxes{dataset}.png"))
    plt.close()
def plot_ICA_components(dataset):
    """Draw one bar-chart panel per ICA component and save the grid as a PNG.

    The number of components is looked up in ``ica_components_map`` and the
    matching transformer ``{dataset}_ICA{n}_transformer.pkl`` is fetched from
    ``output/part2/ICA`` via ``get_transformer``. Each row of its
    ``components_`` matrix is plotted against the selected feature columns.
    The figure is written to ``plots/part2/ica_components{dataset}.png``.

    Args:
        dataset: ``"intention"`` selects the intention data and a fixed subset
            of its columns; any other value selects the pulsar data and all of
            its columns.

    Note:
        ``plt.rcParams["font.size"]`` is set to 5 while the panels are drawn
        and left at 14 afterwards, as in the original implementation.
    """
    if dataset == "intention":
        X, _ = load_intention()
        cols_to_use = [
            "Administrative",
            "Administrative_Duration",
            "Informational",
            "Informational_Duration",
            "ProductRelated",
            "ProductRelated_Duration",
            "BounceRates",
            "ExitRates",
            "PageValues",
        ]
    else:
        X, _ = load_pulsar()
        cols_to_use = X.columns

    n_components = ica_components_map[dataset]
    transformer_name = f"{dataset}_ICA{n_components}_transformer.pkl"
    folder = os.path.join("output", "part2", "ICA")
    transformer = get_transformer(transformer_name, folder)
    components = pd.DataFrame(transformer.components_, columns=X.columns)
    components = components[cols_to_use]

    # Near-square grid with room for every component. For perfect squares this
    # is the same rows x rows layout as before; other counts previously
    # produced an out-of-range or non-three-digit subplot spec.
    rows = max(1, int(np.ceil(np.sqrt(n_components))))
    cols = max(1, int(np.ceil(n_components / rows)))

    fig = plt.figure(figsize=(10, 10))
    plt.rcParams.update({"font.size": 5})
    for i in range(n_components):
        # Three-argument form: the packed-integer shorthand breaks as soon as
        # the panel index needs two digits.
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(f"Component {i+1}")
        components.loc[i, :].plot(kind="bar", ax=ax)
        # Hide feature names and x tick marks on every panel.
        ax.set_xticklabels([])
        ax.tick_params(axis="x", which="major", bottom=False, top=False)
    plt.subplots_adjust(wspace=1)
    plt.rcParams.update({"font.size": 14})
    fig.suptitle("ICA Components")

    plot_dir = os.path.join("plots", "part2", f"ica_components{dataset}.png")
    plt.savefig(plot_dir)
    plt.close()
def run_experiment_1():
    """Run the clustering experiment on both datasets and dump results to JSON.

    Both datasets are loaded up front. For each one, a progress line is
    printed, ``run_clustering_experiment`` is called on its features and
    labels, and the result is written under ``OUTPUT_DIR`` as
    ``exp1_pulsar_data.json`` or ``exp1_intention_data.json``.
    """
    pulsar_features, pulsar_labels = load_pulsar()
    intention_features, intention_labels = load_intention()

    jobs = (
        (
            "Running Exp1 on Pulsar Dataset",
            "exp1_pulsar_data.json",
            pulsar_features,
            pulsar_labels,
        ),
        (
            "Running Exp1 on Intention Dataset",
            "exp1_intention_data.json",
            intention_features,
            intention_labels,
        ),
    )
    for banner, basename, features, labels in jobs:
        print(banner)
        results = run_clustering_experiment(features, labels)
        target = os.path.join(OUTPUT_DIR, basename)
        with open(target, "w") as handle:
            json.dump(results, handle)
def part2(intention=True, pulsar=True):
    """Collect the part-2 transformations for the selected datasets.

    Output goes to the ``part2`` sub-directory of ``OUTPUT_DIR``.

    Args:
        intention: when truthy, load the intention data and pass it to
            ``save_all_transformations`` with that function's default ratios.
        pulsar: when truthy, load the pulsar data and pass it to
            ``save_all_transformations`` with the ICA, RP and LLE component
            ratios all set to 1.0.
    """
    target_dir = os.path.join(OUTPUT_DIR, "part2")

    if intention:
        features, _ = load_intention()
        print("Collecting Exp2 on Intention")
        save_all_transformations(features, "intention", output_dir=target_dir)

    if pulsar:
        features, _ = load_pulsar()
        print("Collecting Exp2 Data on Pulsar")
        full_ratios = {
            "ica_max_component_ratio": 1.0,
            "rp_component_ratio": 1.0,
            "lle_max_component_ratio": 1.0,
        }
        save_all_transformations(
            features, "pulsar", output_dir=target_dir, **full_ratios
        )
def plot_cluster_means(
    data_loader,
    transformer_path,
    dataset,
    output_dir,
    file_prefix,
    kmeans_clusters=2,
    em_clusters=2,
):
    """Plot K-Means and EM cluster centres after inverting a transformation.

    The data returned by ``data_loader`` is passed through
    ``get_inverse_transform`` with the pickled transformer, labelled with the
    untransformed dataset's column names, and clustered with ``KMeans`` and
    ``GaussianMixture`` (both with ``random_state=1``). The cluster centres
    and mixture means are drawn side by side as bar charts and saved as
    ``{file_prefix}_clusterprojections.png`` in ``output_dir``.

    Args:
        data_loader: zero-argument callable returning a ``(data, labels)``
            pair; only the data is used.
        transformer_path: path of the pickled transformer to load.
        dataset: ``"intention"`` takes column names from the intention data;
            any other value takes them from the pulsar data.
        output_dir: directory the figure is saved into.
        file_prefix: prefix of the saved figure's file name.
        kmeans_clusters: number of K-Means clusters.
        em_clusters: number of Gaussian mixture components.
    """
    # The loaders return a (features, labels) pair; only the features carry
    # the column names needed below.
    if dataset == "intention":
        X_untransformed, _ = load_intention()
    else:
        X_untransformed, _ = load_pulsar()

    Xtransformed, _ = data_loader()
    # NOTE: pickle.load can execute arbitrary code; transformer_path must
    # point to a trusted file.
    with open(transformer_path, "rb") as f:
        transformer = pickle.load(f)
    X = get_inverse_transform(transformer, Xtransformed)
    X = pd.DataFrame(X, columns=X_untransformed.columns)

    # NOTE(review): the column subset is keyed on the loader object, not on
    # ``dataset`` — confirm this is intended for transformed-data loaders.
    if data_loader is load_intention:
        X_plot = X[
            [
                "Administrative",
                "Administrative_Duration",
                "Informational",
                "Informational_Duration",
                "ProductRelated",
                "ProductRelated_Duration",
                "BounceRates",
                "ExitRates",
                "PageValues",
            ]
        ]
    else:
        X_plot = X

    fig, (ax1, ax2) = plt.subplots(1, 2)
    kmeans = KMeans(kmeans_clusters, random_state=1)
    em = GaussianMixture(n_components=em_clusters, random_state=1)
    # Both models are fitted on every column; only the plot is restricted.
    kmeans.fit(X)
    em.fit(X)

    kmeans_df = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
    kmeans_df = kmeans_df[X_plot.columns]
    em_df = pd.DataFrame(em.means_, columns=X.columns)
    em_df = em_df[X_plot.columns]

    kmeans_df.plot(kind="bar", ax=ax1)
    em_df.plot(kind="bar", ax=ax2)
    ax1.set_ylabel("Mean Value")
    ax2.set_ylabel("Mean Value")
    ax1.set_xlabel("Cluster")
    ax2.set_xlabel("Cluster")
    ax1.set_title("K-Means Cluster Centers")
    ax2.set_title("EM Cluster Centers")
    # Only the left panel's legend is removed; the right one stays.
    ax1.get_legend().remove()
    plt.savefig(os.path.join(output_dir, f"{file_prefix}_clusterprojections.png"))
    plt.close()
output_dir="output/part2", ica_max_component_ratio=0.3, rp_component_ratio=0.3, lle_max_component_ratio=0.3): pca_dir = os.path.join(output_dir, "PCA") ica_dir = os.path.join(output_dir, "ICA") rp_dir = os.path.join(output_dir, "RP") lle_dir = os.path.join(output_dir, "LLE") print(f"Running PCA Collection on {file_prefix}") # run_PCA_collection(data, file_prefix, pca_dir) print(f"Running ICA Collection on {file_prefix}") # run_ICA_collection(data, file_prefix, ica_dir, max_component_ratio=ica_max_component_ratio) print(f"Running RP Collection on {file_prefix}") run_RP_collection(data, file_prefix, rp_dir, max_component_ratio=rp_component_ratio) print(f"Running LLE Collection on {file_prefix}") # run_LLE_collection(data, file_prefix, lle_dir, max_component_ratio=lle_max_component_ratio) if __name__ == "__main__": X, y = load_intention() X2, y2 = load_pulsar() print("Collecting Exp2 on Intention") save_all_transformations(X, "intention") print("Collecting Exp2 Data on Pulsar") save_all_transformations(X2, "pulsar")