예제 #1
0
def tune_part5networks():
    """Grid-search MLP hyper-parameters on cluster-derived intention features.

    A 2-cluster K-Means model and a 2-component Gaussian mixture are fitted
    on the training split of the intention data. Four features are then
    derived for every sample: a per-centroid distance measure (``dist1``,
    ``dist2``) and the mixture membership probabilities (``prob1``,
    ``prob2``). An ``MLPClassifier`` grid search is run twice:

    * "minimal"   -- on the four derived features only;
    * "augmented" -- on the original features plus the four derived columns.

    Both fitted ``GridSearchCV`` objects are pickled to
    ``output/part5/clf_minimal.pkl`` and ``output/part5/clf_augmented.pkl``.
    """
    # TODO
    # NOTE(review): both searches below are fitted on every row, including
    # the 20% that the random_state=1 split holds out. Confirm that this is
    # intended before reporting held-out scores for these models.
    neural_params = {
        "hidden_layer_sizes": [(64, 64), (128, 128), (64, 64, 64), (128, 128, 128)],
        "learning_rate_init": [0.0001, 0.001, 0.01, 0.1],
    }
    X, y = load_intention()

    # Seed both clusterers: without a fixed seed the cluster/component order
    # can change between runs, which silently swaps dist1<->dist2 and
    # prob1<->prob2 relative to features generated in another run.
    kmeans = KMeans(2, random_state=1)
    em = GaussianMixture(2, random_state=1)

    # Only the training rows are used to fit the clusterers. Splitting X on
    # its own selects the same rows as splitting (X, y) with the same seed.
    Xtrain, _ = train_test_split(X, train_size=0.8, random_state=1)

    kmeans.fit(Xtrain)
    em.fit(Xtrain)

    cluster1, cluster2 = kmeans.cluster_centers_

    # NOTE(review): sqrt(d ** 2) is |d|, so these are mean absolute
    # per-feature differences rather than Euclidean distances. Kept as-is so
    # the generated features do not change.
    distance_from_cluster1 = (np.sqrt((X - cluster1) ** 2)).mean(axis=1)
    distance_from_cluster2 = (np.sqrt((X - cluster2) ** 2)).mean(axis=1)
    point_probabilities = em.predict_proba(X)

    X_minimal = pd.DataFrame(
        {
            "dist1": distance_from_cluster1,
            "dist2": distance_from_cluster2,
            "prob1": point_probabilities[:, 0],
            "prob2": point_probabilities[:, 1],
        }
    )
    X_augmented = X.copy()
    X_augmented[["dist1", "dist2", "prob1", "prob2"]] = X_minimal.copy()

    print("Tuning neural network on minimal dataset")
    clf_minimal = GridSearchCV(
        MLPClassifier(max_iter=2000, early_stopping=True), neural_params
    )
    clf_minimal.fit(X_minimal, y)

    print("Tuning neural network on augmented dataset")
    clf_augmented = GridSearchCV(
        MLPClassifier(max_iter=2000, early_stopping=True), neural_params
    )
    clf_augmented.fit(X_augmented, y)

    # Create the target directory up front so that a long tuning run is not
    # lost to a missing folder at the very last step.
    output_dir = os.path.join("output", "part5")
    os.makedirs(output_dir, exist_ok=True)
    minimal_outpath = os.path.join(output_dir, "clf_minimal.pkl")
    augmented_outpath = os.path.join(output_dir, "clf_augmented.pkl")

    with open(minimal_outpath, "wb") as f:
        pickle.dump(clf_minimal, f)

    with open(augmented_outpath, "wb") as f:
        pickle.dump(clf_augmented, f)
예제 #2
0
def generate_cluster_dfs():
    """Build the cluster-derived feature sets for the intention data.

    A 2-cluster K-Means model and a 2-component Gaussian mixture are fitted
    on the training split (80%, ``random_state=1``) and then applied to every
    sample.

    Returns:
        A ``(X_minimal, X_augmented, y)`` tuple where ``X_minimal`` holds the
        four derived columns ``dist1``, ``dist2``, ``prob1`` and ``prob2``,
        ``X_augmented`` is a copy of the original features with those four
        columns added, and ``y`` is the label object returned by
        ``load_intention``.
    """
    X, y = load_intention()

    # Seed both clusterers: without a fixed seed the cluster/component order
    # can change between calls, which silently swaps dist1<->dist2 and
    # prob1<->prob2 relative to features generated in another run.
    kmeans = KMeans(2, random_state=1)
    em = GaussianMixture(2, random_state=1)

    # Only the training rows are used to fit the clusterers. Splitting X on
    # its own selects the same rows as splitting (X, y) with the same seed.
    Xtrain, _ = train_test_split(X, train_size=0.8, random_state=1)

    kmeans.fit(Xtrain)
    em.fit(Xtrain)

    cluster1, cluster2 = kmeans.cluster_centers_

    # NOTE(review): sqrt(d ** 2) is |d|, so these are mean absolute
    # per-feature differences rather than Euclidean distances. Kept as-is so
    # the generated features do not change.
    distance_from_cluster1 = (np.sqrt((X - cluster1) ** 2)).mean(axis=1)
    distance_from_cluster2 = (np.sqrt((X - cluster2) ** 2)).mean(axis=1)
    point_probabilities = em.predict_proba(X)

    X_minimal = pd.DataFrame(
        {
            "dist1": distance_from_cluster1,
            "dist2": distance_from_cluster2,
            "prob1": point_probabilities[:, 0],
            "prob2": point_probabilities[:, 1],
        }
    )
    X_augmented = X.copy()
    X_augmented[["dist1", "dist2", "prob1", "prob2"]] = X_minimal.copy()
    return X_minimal, X_augmented, y
def plot_pca_points(dataset):
    """Save a 3-D scatter of the first three PCA dimensions, coloured by label.

    Args:
        dataset: ``"pulsar"`` selects the pulsar labels; any other value
            selects the intention labels. The value is also passed to
            ``get_transformed_data`` and embedded in the output file name.

    The figure is written to ``part2_plot_dir`` as ``PCAPlot{dataset}.png``
    and closed afterwards.
    """
    load_labels = load_pulsar if dataset == "pulsar" else load_intention
    _, labels = load_labels()

    projected = get_transformed_data(dataset, "PCA", pca_dir)

    axes3d = plt.figure().add_subplot(111, projection="3d")
    # Two-entry colormap: one colour per label value.
    label_cmap = mpl.colors.ListedColormap(["red", "blue"])
    axes3d.scatter(
        projected[:, 0],
        projected[:, 1],
        zs=projected[:, 2],
        c=labels,
        cmap=label_cmap,
        alpha=0.2,
    )

    axes3d.set_xlabel("PCA Dimension 1")
    axes3d.set_ylabel("PCA Dimension 2")
    axes3d.set_zlabel("PCA Dimension 3")
    axes3d.set_title("PCA-Transformed Data Points by Label")

    target = os.path.join(part2_plot_dir, f"PCAPlot{dataset}.png")
    plt.savefig(target)
    plt.close()
def plot_principal_axes(dataset):
    """Save a bar chart of the first two PCA principal axes.

    The fitted PCA transformer ``{dataset}_PCA_transformer.pkl`` is fetched
    from ``pca_dir`` and its ``components_`` are laid out with one row per
    component and one column per original feature. The first two rows are
    drawn as a bar chart.

    Args:
        dataset: ``"pulsar"`` plots every feature; any other value loads the
            intention data and restricts the plot to its nine page-activity
            columns.

    The figure is written to ``part2_plot_dir`` as
    ``PrincipalAxes{dataset}.png`` and closed afterwards.
    """
    if dataset == "pulsar":
        X, _ = load_pulsar()
        cols_to_use = X.columns  # Use all
    else:
        X, _ = load_intention()
        cols_to_use = [
            "Administrative",
            "Administrative_Duration",
            "Informational",
            "Informational_Duration",
            "ProductRelated",
            "ProductRelated_Duration",
            "BounceRates",
            "ExitRates",
            "PageValues",
        ]

    pca_transformer_name = f"{dataset}_PCA_transformer.pkl"
    pca = get_transformer(pca_transformer_name, pca_dir)

    # Label rows by the number of components that were actually fitted; a
    # PCA that kept fewer components than there are features would otherwise
    # make the DataFrame constructor fail on an index-length mismatch.
    index = [f"Component {i + 1}" for i in range(pca.components_.shape[0])]
    components = pd.DataFrame(pca.components_, columns=X.columns, index=index)

    components = components[cols_to_use]
    first_two_components = components.iloc[:2]
    ax = first_two_components.plot(kind="bar")
    ax.set_ylabel("Value")

    ax.set_title(
        "Principal Axes in Feature Space resulting from PCA Decomposition")
    plt.savefig(os.path.join(part2_plot_dir, f"PrincipalAxes{dataset}.png"))
    plt.close()
def plot_ICA_components(dataset):
    """Save a grid of bar charts, one per fitted ICA component.

    The number of components is read from ``ica_components_map[dataset]`` and
    the matching transformer ``{dataset}_ICA{n}_transformer.pkl`` is fetched
    from ``output/part2/ICA``. Each subplot shows one row of the
    transformer's ``components_`` across the selected feature columns.

    Args:
        dataset: ``"intention"`` restricts the plot to the nine
            page-activity columns; any other value loads the pulsar data and
            plots every feature.

    The figure is written to ``plots/part2/ica_components{dataset}.png`` and
    closed afterwards. As a side effect the global matplotlib ``font.size``
    is left at 14.
    """
    if dataset == "intention":
        X, _ = load_intention()
        cols_to_use = [
            "Administrative",
            "Administrative_Duration",
            "Informational",
            "Informational_Duration",
            "ProductRelated",
            "ProductRelated_Duration",
            "BounceRates",
            "ExitRates",
            "PageValues",
        ]
    else:
        X, _ = load_pulsar()
        cols_to_use = X.columns

    n_components = ica_components_map[dataset]
    transformer_name = f"{dataset}_ICA{n_components}_transformer.pkl"
    folder = os.path.join("output", "part2", "ICA")

    transformer = get_transformer(transformer_name, folder)
    components = pd.DataFrame(transformer.components_, columns=X.columns)
    components = components[cols_to_use]

    # Grid that always has room for every component: floor(sqrt(n)) rows and
    # as many columns as needed (ceiling division). For a perfect square this
    # is the same rows x rows layout as before.
    rows = max(1, int(np.sqrt(n_components)))
    cols = -(-n_components // rows)

    fig = plt.figure(figsize=(10, 10))
    plt.rcParams.update({"font.size": 5})
    for i in range(n_components):
        # The (rows, cols, index) form works for any grid size; the packed
        # three-digit integer form cannot express an index above 9.
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(f"Component {i+1}")
        components.loc[i, :].plot(kind="bar", ax=ax)

        # Hide x ticks and labels on every subplot that has another subplot
        # directly below it; only the bottom-most one in each column keeps
        # its feature names.
        if n_components - i > cols:
            ax.set_xticklabels([])
            ax.tick_params(axis="x", which="major", bottom=False, top=False)
    plt.subplots_adjust(wspace=1)
    plt.rcParams.update({"font.size": 14})
    fig.suptitle("ICA Components")

    plot_path = os.path.join("plots", "part2", f"ica_components{dataset}.png")
    plt.savefig(plot_path)
    plt.close()
예제 #6
0
def run_experiment_1():
    """Run the clustering experiment on both datasets and dump the results.

    Both datasets are loaded first. The pulsar data and then the intention
    data are passed to ``run_clustering_experiment``, and each result is
    written as JSON into ``OUTPUT_DIR`` (``exp1_pulsar_data.json`` and
    ``exp1_intention_data.json``).
    """
    pulsar_features, pulsar_labels = load_pulsar()
    intention_features, intention_labels = load_intention()

    # (progress message, features, labels, output file name), in run order.
    jobs = (
        (
            "Running Exp1 on Pulsar Dataset",
            pulsar_features,
            pulsar_labels,
            "exp1_pulsar_data.json",
        ),
        (
            "Running Exp1 on Intention Dataset",
            intention_features,
            intention_labels,
            "exp1_intention_data.json",
        ),
    )

    for banner, features, labels, basename in jobs:
        print(banner)
        results = run_clustering_experiment(features, labels)
        with open(os.path.join(OUTPUT_DIR, basename), "w") as out_file:
            json.dump(results, out_file)
예제 #7
0
def part2(intention=True, pulsar=True):
    """Collect the part-2 transformations for the selected datasets.

    Args:
        intention: When true, process the intention data with the default
            component ratios of ``save_all_transformations``.
        pulsar: When true, process the pulsar data with every component
            ratio set to ``1.0``.

    Results are written by ``save_all_transformations`` under
    ``OUTPUT_DIR/part2``.
    """
    transformations_dir = os.path.join(OUTPUT_DIR, "part2")

    if intention:
        intention_features, _ = load_intention()
        print("Collecting Exp2 on Intention")
        save_all_transformations(
            intention_features, "intention", output_dir=transformations_dir
        )

    if pulsar:
        pulsar_features, _ = load_pulsar()
        print("Collecting Exp2 Data on Pulsar")
        # Pulsar overrides all three component ratios with 1.0.
        full_ratios = {
            "ica_max_component_ratio": 1.0,
            "rp_component_ratio": 1.0,
            "lle_max_component_ratio": 1.0,
        }
        save_all_transformations(
            pulsar_features,
            "pulsar",
            output_dir=transformations_dir,
            **full_ratios,
        )
예제 #8
0
def evaluate_part4network(transformer):
    """Score the tuned part-4 network for one dimensionality reduction.

    Args:
        transformer: One of ``"None"``, ``"PCA"``, ``"ICA"``, ``"RP"`` or
            ``"LLE"``. Selects both the stored network and the matching
            (reduced) intention data.

    Returns:
        A dict with ``train_accuracy``, ``train_f1``, ``test_accuracy``,
        ``test_f1`` and ``train_time``. ``train_time`` is the
        ``mean_fit_time`` entry from ``cv_results_`` for the best-ranked
        parameter combination.

    Raises:
        ValueError: If ``transformer`` is not one of the supported names.
    """
    clf = load_part4_network(transformer)
    if transformer == "None":
        X, y = load_intention()
    elif transformer == "PCA":
        X, y = load_intention_PCA_reduced()
    elif transformer == "ICA":
        X, y = load_intention_ICA_reduced()
    elif transformer == "RP":
        X, y = load_intention_RP_reduced()
    elif transformer == "LLE":
        X, y = load_intention_LLE_reduced()
    else:
        # Fail with a clear message instead of an UnboundLocalError on X
        # a few lines further down.
        raise ValueError(
            f"Unknown transformer {transformer!r}; expected one of "
            "'None', 'PCA', 'ICA', 'RP', 'LLE'"
        )

    # Should be consistent due to random seed
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, train_size=0.8, random_state=1
    )

    train_preds = clf.predict(Xtrain)
    test_preds = clf.predict(Xtest)

    train_acc = accuracy_score(ytrain, train_preds)
    test_acc = accuracy_score(ytest, test_preds)

    train_f1 = f1_score(ytrain, train_preds)
    test_f1 = f1_score(ytest, test_preds)

    # Rank 1 is the best candidate, so argmin picks the winning parameter set.
    best_classifier_index = np.argmin(clf.cv_results_["rank_test_score"])

    train_time = clf.cv_results_["mean_fit_time"][best_classifier_index]

    return {
        "train_accuracy": train_acc,
        "train_f1": train_f1,
        "test_accuracy": test_acc,
        "test_f1": test_f1,
        "train_time": train_time,
    }
예제 #9
0
def plot_cluster_means(
    data_loader,
    transformer_path,
    dataset,
    output_dir,
    file_prefix,
    kmeans_clusters=2,
    em_clusters=2,
):
    """Plot K-Means and EM cluster centres expressed in the original features.

    The transformed data returned by ``data_loader`` is passed through
    ``get_inverse_transform`` with the pickled transformer, labelled with the
    original feature names, and clustered with K-Means and a Gaussian
    mixture (both seeded with ``random_state=1``). The cluster centres are
    drawn as two side-by-side bar charts.

    Args:
        data_loader: Callable returning ``(Xtransformed, y)``.
        transformer_path: Path to the pickled transformer that produced
            ``Xtransformed``.
        dataset: ``"intention"`` uses the intention column names and
            restricts the plot to the nine page-activity columns; any other
            value uses the pulsar column names and plots every column.
        output_dir: Directory the figure is saved to.
        file_prefix: Prefix of the output file name
            (``{file_prefix}_clusterprojections.png``).
        kmeans_clusters: Number of K-Means clusters.
        em_clusters: Number of Gaussian mixture components.
    """
    # The loaders return (features, labels); only the feature frame is needed
    # here, for its column names.
    if dataset == "intention":
        X_untransformed, _ = load_intention()
    else:
        X_untransformed, _ = load_pulsar()

    Xtransformed, y = data_loader()
    # NOTE: pickle.load executes arbitrary code; only point this at
    # transformer files produced by this project.
    with open(transformer_path, "rb") as f:
        transformer = pickle.load(f)

    X = get_inverse_transform(transformer, Xtransformed)
    X = pd.DataFrame(X, columns=X_untransformed.columns)

    # Select the columns by dataset name. data_loader loads *transformed*
    # data, so comparing it against load_intention does not identify the
    # intention dataset.
    if dataset == "intention":
        X_plot = X[
            [
                "Administrative",
                "Administrative_Duration",
                "Informational",
                "Informational_Duration",
                "ProductRelated",
                "ProductRelated_Duration",
                "BounceRates",
                "ExitRates",
                "PageValues",
            ]
        ]

    else:
        X_plot = X

    _, (ax1, ax2) = plt.subplots(1, 2)
    kmeans = KMeans(kmeans_clusters, random_state=1)
    em = GaussianMixture(n_components=em_clusters, random_state=1)
    # Cluster on every feature; the column subset only affects what is drawn.
    kmeans.fit(X)
    em.fit(X)

    kmeans_df = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
    kmeans_df = kmeans_df[X_plot.columns]

    em_df = pd.DataFrame(em.means_, columns=X.columns)
    em_df = em_df[X_plot.columns]

    kmeans_df.plot(kind="bar", ax=ax1)
    em_df.plot(kind="bar", ax=ax2)

    ax1.set_ylabel("Mean Value")
    ax2.set_ylabel("Mean Value")

    ax1.set_xlabel("Cluster")
    ax2.set_xlabel("Cluster")

    ax1.set_title("K-Means Cluster Centers")
    ax2.set_title("EM Cluster Centers")
    # Both panels share the same columns, so the right-hand legend suffices.
    ax1.get_legend().remove()
    plt.savefig(os.path.join(output_dir, f"{file_prefix}_clusterprojections.png"))
    plt.close()
                             output_dir="output/part2",
                             ica_max_component_ratio=0.3,
                             rp_component_ratio=0.3,
                             lle_max_component_ratio=0.3):
    pca_dir = os.path.join(output_dir, "PCA")
    ica_dir = os.path.join(output_dir, "ICA")
    rp_dir = os.path.join(output_dir, "RP")
    lle_dir = os.path.join(output_dir, "LLE")

    print(f"Running PCA Collection on {file_prefix}")
    # run_PCA_collection(data, file_prefix, pca_dir)
    print(f"Running ICA Collection on {file_prefix}")
    # run_ICA_collection(data, file_prefix, ica_dir, max_component_ratio=ica_max_component_ratio)
    print(f"Running RP Collection on {file_prefix}")
    run_RP_collection(data,
                      file_prefix,
                      rp_dir,
                      max_component_ratio=rp_component_ratio)
    print(f"Running LLE Collection on {file_prefix}")
    # run_LLE_collection(data, file_prefix, lle_dir, max_component_ratio=lle_max_component_ratio)


if __name__ == "__main__":
    # Load both datasets up front, then collect the transformations for each
    # one with the default output directory and component ratios.
    intention_features, _ = load_intention()
    pulsar_features, _ = load_pulsar()

    for banner, features, prefix in (
        ("Collecting Exp2 on Intention", intention_features, "intention"),
        ("Collecting Exp2 Data on Pulsar", pulsar_features, "pulsar"),
    ):
        print(banner)
        save_all_transformations(features, prefix)