示例#1
0
    # Run KMeans on the raw data and persist the label table it returns.
    res_df, k_labels_df = run_kmeans(X, y)
    labels_csv = f"data/kmeans_labels_{data_key}.csv"
    k_labels_df.to_csv(labels_csv, index=False)

    # Plot every column of the result table in its own subplot and save it.
    perf_title = f"KMeans performance vs n_clusters on {data_key} data"
    res_df.plot(subplots=True, style=".-", title=perf_title)
    plt.xlabel("n_clusters")
    plt.savefig(f"output/kmeans_{data_key}.png")
    plt.close()

    if "credit" in data_key:
        # Both figures use the columns selected by the first two entries of
        # TOP_FEATURES; axis names come from the "credit" feature table.
        top_two = TOP_FEATURES[:2]
        axis_names = DATA["credit"][0].columns[top_two]

        # First figure: the true labels.
        examine_credit_cluster(
            X.values[:, top_two],
            y,
            title="True Label",
            xylabel=axis_names,
            fname=f"output/true_cluster_{data_key}.png",
        )

        # Second figure: the KMeans label column keyed by the number of
        # distinct true labels.
        kmeans_labels = k_labels_df[y.nunique()]
        examine_credit_cluster(
            X.values[:, top_two],
            kmeans_labels,
            title="KMeans",
            xylabel=axis_names,
            fname=f"output/kmeans_cluster_{data_key}.png",
        )

    if data_key == "fashion":
        # Same idea for the fashion data: pick the label column whose key
        # equals the number of unique true labels.
        n_true_classes = len(np.unique(y))
        plot_fashion_cluster(
            X,
            k_labels_df[n_true_classes],
            fname="output/kmeans_cluster_fashion.png",
        )
示例#2
0
    # Fit the ICA model on X and keep the transformed data.
    X_ica = ica.fit_transform(X)
    # Kurtosis computed over X_ica with the default arguments of `kurtosis`
    # (presumably scipy.stats.kurtosis, i.e. one value per column -- the
    # import is outside this view, TODO confirm).
    kurt = kurtosis(X_ica)
    # Save a line plot of the kurtosis values against component index.
    plt.plot(kurt)
    plt.xlabel("IC")
    plt.ylabel("kurtosis")
    plt.title(f"ICA kurtosis on {data_key} data")
    plt.savefig(f"output/ica_kurtosis_{data_key}.png")
    plt.close()

    # Component indices ranked from highest to lowest signed kurtosis
    # (argsort of the negated values gives descending order).
    top_kurt_ind = (-kurt).argsort()

    # Pass the two highest-ranked independent components, together with the
    # labels y, to the cluster plot helper.
    examine_credit_cluster(
        X_ica[:, top_kurt_ind[:2]],
        y,
        title=f"ICA transformation on {data_key} data",
        xylabel=["IC1", "IC2"],
        fname=f"output/ica_ic_cluster_{data_key}.png",
    )

    if data_key == "fashion":
        # Hand the rows of ica.components_ selected by the first 25 entries
        # of the ranking to plot_fashion_cluster, with 0..24 in the label
        # position.
        plot_fashion_cluster(
            ica.components_[top_kurt_ind[:25], :],
            range(25),
            fname="output/ica_ic_fashion.png",
        )

        # reconstructed image
        X_recon = ica.inverse_transform(X_ica)
        plot_fashion_cluster(X_recon,
示例#3
0
    print(f"Running KMeans on {data_key} data")
    for algo_key in DATA[data_key]:
        print(f"..Running on {algo_key} transformed data")
        X, y = DATA[data_key][algo_key]

        # Run KMeans on this transformed data set and persist the labels.
        res_df, k_labels_df = run_kmeans(X, y)
        labels_csv = f"data/kmeans_labels_{algo_key}_{data_key}.csv"
        k_labels_df.to_csv(labels_csv, index=False)

        # Plot every column of the result table in its own subplot.
        perf_title = f"KMeans performance vs n_clusters on {algo_key} transformed {data_key} data"
        res_df.plot(subplots=True, style=".-", title=perf_title)
        plt.xlabel("n_clusters")
        plt.savefig(f"output/kmeans_{algo_key}_{data_key}.png")
        plt.close()

        # First two columns of the transformed data against the true labels.
        true_title = f"True label of {algo_key} transformed {data_key} data"
        true_png = f"output/true_cluster_{algo_key}_{data_key}.png"
        examine_credit_cluster(X[:, :2], y, title=true_title, fname=true_png)

        # Same two columns against the KMeans label column keyed by the
        # number of unique true labels.
        n_true_classes = len(np.unique(y))
        kmeans_title = f"KMeans on {algo_key} transformed {data_key} data"
        kmeans_png = f"output/kmeans_cluster_{algo_key}_{data_key}.png"
        examine_credit_cluster(
            X[:, :2],
            k_labels_df[n_true_classes],
            title=kmeans_title,
            fname=kmeans_png,
        )
示例#4
0
# Run EM on every data set in DATA that is also listed in RUN_DATA.
for data_key in DATA:
    if data_key not in RUN_DATA:
        continue
    print(f"Running EM on {data_key} data")
    X, y = DATA[data_key]

    # Run EM and persist the label table it returns.
    res_df, k_labels_df = run_em(X, y)
    labels_csv = f"data/EM_labels_{data_key}.csv"
    k_labels_df.to_csv(labels_csv, index=False)

    # Plot every column of the result table in its own subplot and save it.
    perf_title = f"EM performance vs n_clusters on {data_key} data"
    res_df.plot(subplots=True, style=".-", title=perf_title)
    plt.xlabel("n_clusters")
    plt.savefig(f"output/EM_{data_key}.png")
    plt.close()

    if "credit" in data_key:
        # Use the columns selected by the first two entries of TOP_FEATURES
        # and the EM label column keyed by the number of distinct true labels.
        top_two = TOP_FEATURES[:2]
        em_labels = k_labels_df[y.nunique()]
        examine_credit_cluster(
            X.values[:, top_two],
            em_labels,
            title="EM",
            xylabel=DATA["credit"][0].columns[top_two],
            fname=f"output/EM_cluster_{data_key}.png",
        )

    if data_key == "fashion":
        # Label column whose key equals the number of unique true labels.
        n_true_classes = len(np.unique(y))
        plot_fashion_cluster(
            X,
            k_labels_df[n_true_classes],
            fname="output/EM_cluster_fashion.png",
        )
示例#5
0
        index=NUM_CLUSTERS,
    )
    k_labels_df = pd.DataFrame(data=k_labels)
    return res_df, k_labels_df


# Run EM on every transformed variant of each data set listed in RUN_DATA.
for data_key in DATA:
    if data_key not in RUN_DATA:
        continue
    print(f"Running EM on {data_key} data")
    for algo_key in DATA[data_key]:
        print(f"..Running on {algo_key} transformed data")
        X, y = DATA[data_key][algo_key]

        # Run EM on this transformed data set and persist the labels.
        res_df, k_labels_df = run_em(X, y)
        labels_csv = f"data/EM_labels_{algo_key}_{data_key}.csv"
        k_labels_df.to_csv(labels_csv, index=False)

        # Plot every column of the result table in its own subplot.
        perf_title = f"EM performance vs n_clusters on {algo_key} transformed {data_key} data"
        res_df.plot(subplots=True, style=".-", title=perf_title)
        plt.xlabel("n_clusters")
        plt.savefig(f"output/EM_{algo_key}_{data_key}.png")
        plt.close()

        # First two columns of the transformed data against the EM label
        # column keyed by the number of unique true labels.
        n_true_classes = len(np.unique(y))
        em_title = f"EM on {algo_key} transformed {data_key} data"
        em_png = f"output/EM_cluster_{algo_key}_{data_key}.png"
        examine_credit_cluster(
            X[:, :2],
            k_labels_df[n_true_classes],
            title=em_title,
            fname=em_png,
        )
示例#6
0
    X_pca = pca.fit_transform(X)
    eigenvalues = pca.explained_variance_
    is_fashion = data_key == "fashion"

    # Explained-variance curve; the fashion data gets a log-scaled y axis.
    plt.plot(eigenvalues)
    if is_fashion:
        plt.yscale("log")
    plt.title(f"PCA eigenvalue on {data_key} data")
    plt.ylabel("eigenvalue")
    plt.xlabel("PC")
    plt.savefig(f"output/pca_eigenval_{data_key}.png")
    plt.close()

    # First two principal components against the labels y.
    pc_title = f"PCA transformation on {data_key} data"
    pc_png = f"output/pca_pc_cluster_{data_key}.png"
    examine_credit_cluster(
        X_pca[:, :2], y, title=pc_title, xylabel=["PC1", "PC2"], fname=pc_png
    )

    if is_fashion:
        # First 25 rows of pca.components_, with 0..24 in the label position.
        leading_components = pca.components_[:25, :]
        plot_fashion_cluster(
            leading_components,
            range(25),
            fname="output/pca_pc_fashion.png",
        )

        # Refit a PCA that keeps 95% of the variance (this rebinds `pca`),
        # then map the reduced data back to the original feature space.
        pca = PCA(n_components=0.95, whiten=True, random_state=0)
        X_reduced = pca.fit_transform(X)
        X_recon = pca.inverse_transform(X_reduced)
        print(f"...Keep {pca.n_components_} components for {data_key} data...")
        plot_fashion_cluster(
            X_recon,
            y,
            fname="output/pca_reconstructed_fashion.png",
        )
示例#7
0
from sklearn.manifold import TSNE

from load_data import DATA
from examine_cluster import examine_credit_cluster

# Data sets (keys of DATA) that this script processes.
RUN_DATA = ["credit", "fashion"]

# For every selected data set, compute a 2-D t-SNE embedding and save a
# scatter figure of it against the labels.
for data_key in DATA:
    if data_key not in RUN_DATA:
        continue
    print(f"Running TSNE on {data_key} data")
    X, y = DATA[data_key]

    # Embed directly into two dimensions. t-SNE axes are not ordered by
    # importance (unlike PCA), so fitting three components and plotting only
    # the first two would show an arbitrary projection of a 3-D layout rather
    # than a genuine 2-D embedding.
    tsne = TSNE(n_components=2, random_state=0, n_jobs=-1)
    X_tsne = tsne.fit_transform(X)

    # t-SNE axes carry no intrinsic meaning, hence the empty axis labels.
    examine_credit_cluster(
        X_tsne,
        y,
        title=f"TSNE on {data_key} data",
        xylabel=["", ""],
        fname=f"output/tsne_cluster_{data_key}.png",
    )