Example #1
def plot_cluster_distances(estimator, dataset, version):
    visualizer = InterclusterDistance(estimator)
    visualizer.fit(data.DATA[dataset][version]['x_train'])
    visualizer.show(
        f'{PLOTS_FOLDER}/{dataset}/{version}/{dataset}_{version}_{estimator.__class__.__name__}_cluster_distances_k{estimator.n_clusters}.png'
    )
    plt.clf()
Example #2
def cluster_distances(model, X, graph):
    visualizer = InterclusterDistance(
        model,
        legend=True,
        legend_loc='upper left',
        title=" KMeans Intercluster Distance Map for " + graph)
    visualizer.fit(X)
    visualizer.show()
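
These listings depend on project-specific data loaders and helper functions, so they do not run on their own. A minimal self-contained sketch of the same InterclusterDistance usage, built on synthetic data from sklearn.datasets.make_blobs (the cluster count and output file name are illustrative assumptions):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import InterclusterDistance

# synthetic data: 1000 points drawn around 5 centers
X, _ = make_blobs(n_samples=1000, centers=5, random_state=42)

model = KMeans(n_clusters=5, random_state=42)
visualizer = InterclusterDistance(model)
visualizer.fit(X)  # fit KMeans and embed the cluster centers in 2D
visualizer.show(outpath="intercluster_distance.png")  # render the figure and save it to disk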
Example #3
def ica(training_set, test_set, y_train):
    # https://www.ritchieng.com/machine-learning-dimensionality-reduction-feature-transform/
    ica_avg_kurtosis_curve(training_set)

    km = KMeans(n_clusters=2,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    ica = FastICA(n_components=10, random_state=RAND, max_iter=1000)

    X_train = ica.fit_transform(training_set)

    plot_silhouette(km, X_train, title="ICA(10), K=2")
    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    km = KMeans(n_clusters=5,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    ica = FastICA(n_components=10, random_state=RAND, max_iter=1000)

    X_train = ica.fit_transform(training_set)

    plot_silhouette(km, X_train, title="ICA(10), K=5")

    km = KMeans(n_clusters=3,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    ica = FastICA(n_components=10, random_state=RAND, max_iter=1000)

    X_train = ica.fit_transform(training_set)

    plot_silhouette(km, X_train, title="ICA(10)" ", K=3")

    km = KMeans(n_clusters=2,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)
    km.fit(X_train)

    hs = metrics.homogeneity_score(y_train, km.labels_)
    print("homogenatity score for K=2:", hs)

    y_train_inverse = (~y_train.astype(bool)).astype(int)

    hs = metrics.homogeneity_score(y_train_inverse, km.labels_)
    print("homogenatity score for K=2: (inverse)", hs)
Example #4
def cluster_distance_map(text, model, cv):
    path = 'models/{}'.format(model)
    pipe = load(path)
    kmeans = pipe.named_steps['kmeans']
    svd = pipe.named_steps['truncatedsvd']
    X = svd.fit_transform(cv)
    visualizer = InterclusterDistance(
        kmeans,
        embedding='mds',
    )
    visualizer.fit(X)
    visualizer.show(outpath="plots/ClusterMap.png")
    plt.close()
Example #5
        #Confusion matrix
        plot_confusion_matrix(data_target, pred, figsize=(7, 5), cmap="PuBuGn")
        bottom, top = plt.ylim()
        plt.ylim(bottom + 0.5, top - 0.5)
        st.pyplot()

        # Elbow Method
        visualizer = KElbowVisualizer(KmeansClus, k=(1, 10))
        visualizer.fit(data_feature)
        visualizer.show()
        st.pyplot()

        # Inter Cluster Distances
        visualizer_inter = InterclusterDistance(KmeansClus)
        visualizer_inter.fit(data_feature)
        visualizer_inter.show()
        st.pyplot()
    except:
        st.write("Fill all parameters.")

########################################
# Mini-Batch k-means
########################################
if ML_option == "Mini-Batch k-means":
    try:
        # Mini Batch parameters
        Nk = st.number_input("Number of clusters: ", min_value=1, step=1)
        MBatchClus = MiniBatchKMeans(n_clusters=Nk)
        MBatchClus.fit(data_feature)
        pred = MBatchClus.predict(data_feature)
Example #6
def kMeans():

    twitterX, twitterY, twitter_dataset, scaled_features = preprocess()

    gm = GaussianMixture(covariance_type='tied', n_components=18, n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    aic = []
    bic = []

    for i in range(10):
        gm = GaussianMixture(covariance_type='spherical', n_components=9, n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))

    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    twitter_trainingX, twitter_testingX, twitter_trainingY, twitter_testingY = train_test_split(twitterX, twitterY)

    error = []

    #citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn


    for i in range(1, 8):
        pca = FastICA(n_components=i)
        pca.fit(twitter_trainingX)
        U, S, VT = np.linalg.svd(twitter_trainingX - twitter_trainingX.mean(0))
        x_train_pca = pca.transform(twitter_trainingX)
        x_train_pca2 = (twitter_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((twitter_trainingX - x_projected) ** 2).mean()
        error.append(loss)

    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 8), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()


    clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1,
                        solver='lbfgs')
    clf.fit(twitter_trainingX, twitter_trainingY)
    y_pred = clf.predict(twitter_testingX)

    print("Accuracy Score Normal", accuracy_score(twitter_testingY, y_pred))

    kmeans = KMeans(
        init="random",
        n_clusters=3,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_testingX)

    print("Accuracy Score K-Means", accuracy_score(twitter_testingY, labels))

    for i in range(9):
        pca = PCA(n_components=i)
        pca.fit(scaled_features)
        cumsum = np.cumsum(pca.explained_variance_ratio_)

    plt.plot(cumsum, label="Explained Variance Ratio")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    # incremental PCA
    num_batches = 100
    inc_pca = IncrementalPCA(n_components=5)
    for X_batch in np.array_split(scaled_features, num_batches):
        inc_pca.partial_fit(X_batch)
    X_reduced_inc = inc_pca.transform(scaled_features)

    # randomized projections
    rnd_pca = PCA(n_components=5, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(twitterX.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=8)
    k_selected.fit(scaled_features_norm, twitterY)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45, scores, width=.2, label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()


    kmeans = KMeans(
        init="random",
        n_clusters=5,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_dataset)

    #the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    #final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    #num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    #labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init":"random",
        "n_init":10,
        "max_iter":300,
        "random_state":42,
    }

    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    model = KMeans(n_clusters=9)
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(twitterX)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(twitterX)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(twitterX)
    ic_visualizer.show()

    X = twitter_dataset[:, []]
    plt.scatter()
Example #7
def intercluster(X):
    model = KMeans(3)
    visualizer = InterclusterDistance(model)

    visualizer.fit(X)
    visualizer.show()
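
Several of the longer examples above pair InterclusterDistance with KElbowVisualizer and SilhouetteVisualizer on the same data. A compact, self-contained sketch of that combination (again on synthetic make_blobs data; the k range and cluster count are illustrative assumptions):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance

X, _ = make_blobs(n_samples=1000, centers=4, random_state=0)

# elbow curve over a range of k (the visualizer clones the estimator for each k)
KElbowVisualizer(KMeans(random_state=0), k=(2, 10)).fit(X).show()

# silhouette plot and intercluster distance map for a chosen k
model = KMeans(n_clusters=4, random_state=0)
SilhouetteVisualizer(model, colors='yellowbrick').fit(X).show()
InterclusterDistance(model).fit(X).show()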
Example #8
def kMeans():
    # citation: https://realpython.com/k-means-clustering-python/
    digits = load_digits()

    # features
    digits_features = digits.data[:, 0:-1]
    # label
    label = digits.data[:, -1]

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(digits_features)

    # citation: hands on machine learning
    gm = GaussianMixture(covariance_type='spherical',
                         n_components=8,
                         n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    aic = []
    bic = []

    for i in range(21):
        gm = GaussianMixture(covariance_type='spherical',
                             n_components=20,
                             n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))

    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    # x_centered = digits_features - digits_features.mean(axis=0)
    # U, s, Vt = np.linalg.svd(x_centered)
    # c1 = Vt.T[:, 0]
    # c2 = Vt.T[:, 1]

    # W2 = Vt.T[:, :2]
    # X2D = x_centered.dot(W2)

    # pca = PCA()
    # pca.fit(scaled_features)
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # pca = PCA(n_components=0.95)
    # X_reduced = pca.fit_transform(scaled_features)

    explained_variance = []
    for i in range(63):
        pca = PCA(n_components=i)
        pca.fit(scaled_features)
        cumsum = np.cumsum(pca.explained_variance_ratio_)

    plt.plot(cumsum, label="Explained Variance Ratio")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    digits_trainingX, digits_testingX, digits_trainingY, digits_testingY = train_test_split(
        digits_features, label)

    # pca reconstruction error
    # citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn

    error = []

    for i in range(1, 50):
        pca = PCA(n_components=i)
        pca.fit(digits_trainingX)
        U, S, VT = np.linalg.svd(digits_trainingX - digits_trainingX.mean(0))
        x_train_pca = pca.transform(digits_trainingX)
        x_train_pca2 = (digits_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((digits_trainingX - x_projected)**2).mean()
        error.append(loss)

    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 50), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()

    clf = MLPClassifier(alpha=0.001,
                        hidden_layer_sizes=(8, ),
                        random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)
    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))

    k_acc = []
    k_gm = []
    time_arr = []
    for k in range(1, 15):
        kmeans = KMeans(n_clusters=k)
        X_train = kmeans.fit_transform(digits_trainingX)
        X_test = kmeans.transform(digits_testingX)
        start_time = time.time()
        clf = MLPClassifier(alpha=0.001,
                            hidden_layer_sizes=(8, ),
                            random_state=1,
                            solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        total_time = time.time() - start_time
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        k_acc.append(score)
        time_arr.append(total_time)

    plt.plot(k_acc, label="K-Means")
    plt.plot(time_arr, label="Computation Time")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("k # of clusters")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()

    acc = []
    acc_ica = []
    acc_rca = []
    for i in range(1, 40):
        pca = PCA(n_components=i)
        X_train = pca.fit_transform(digits_trainingX)
        X_test = pca.transform(digits_testingX)
        clf = MLPClassifier(alpha=0.001,
                            hidden_layer_sizes=(8, ),
                            random_state=1,
                            solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        acc.append(score)

        ica = FastICA(n_components=i)
        x_train_i = ica.fit_transform(digits_trainingX)
        x_test_i = ica.transform(digits_testingX)
        clf.fit(x_train_i, digits_trainingY)
        y_pred_i = clf.predict(x_test_i)
        score_i = accuracy_score(digits_testingY, y_pred_i)
        acc_ica.append(score_i)

        rca = GaussianRandomProjection(n_components=i)
        x_train_r = rca.fit_transform(digits_trainingX)
        x_test_r = rca.transform(digits_testingX)
        clf.fit(x_train_r, digits_trainingY)
        y_pred_r = clf.predict(x_test_r)
        score_r = accuracy_score(digits_testingY, y_pred_r)
        acc_rca.append(score_r)

    plt.plot(acc, label="PCA")
    plt.plot(acc_ica, label="ICA")
    plt.plot(acc_rca, label="RCA")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Components")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # randomized projections
    rnd_pca = PCA(n_components=50, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(digits_features.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=50)
    k_selected.fit(scaled_features_norm, label)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45,
            scores,
            width=.2,
            label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()

    # X_reduced_inc is never defined earlier in this snippet; a minimal reconstruction with
    # IncrementalPCA is assumed here (the component count is an assumption)
    inc_pca = IncrementalPCA(n_components=8)
    X_reduced_inc = inc_pca.fit_transform(scaled_features)

    gm = GaussianMixture(covariance_type='spherical',
                         n_components=8,
                         n_init=10)
    gm.fit(X_reduced_inc)
    print("GM Converged - PCA Inc", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(X_reduced_inc)
    gm.predict_proba(X_reduced_inc)
    gm.score_samples(X_reduced_inc)

    kmeans = KMeans(init="random",
                    n_clusters=63,
                    n_init=10,
                    max_iter=300,
                    random_state=42)
    kmeans.fit(scaled_features)

    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }

    sse = []
    for k in range(1, 63):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    kl = KneeLocator(range(1, 63), sse, curve="convex", direction="decreasing")

    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    clf = MLPClassifier(alpha=0.001,
                        hidden_layer_sizes=(8, ),
                        random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)

    model = KMeans(n_clusters=5)
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(digits_testingX)

    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))
    print("Accuracy Score K-Means", accuracy_score(digits_testingY, labels))

    elbow_visualizer = KElbowVisualizer(model, k=(2, 63))
    elbow_visualizer.fit(digits_features)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(digits_features)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(digits_features)
    ic_visualizer.show()

    # gmm = GaussianMixture(n_components=7).fit(digits_features)
    # labels = gmm.predict(digits_features)
    # plt.scatter(digits_features[:, 0], digits_features[:, 1], c=labels, s=40, cmap='viridis')
    # plt.show()

    # digits_features_pd = pd.DataFrame(data=digits_features[1:, 1:],
    # index=digits_features[1:,0],
    # columns=digits_features[0,1:])

    # pd.plotting.scatter_matrix(digits_features_pd)

    # probs = GaussianMixture.predict_proba(digits_features)
    # print(probs[:5].round(3))

    kmeans = KMeans(init="random",
                    n_clusters=18,
                    n_init=10,
                    max_iter=300,
                    random_state=42)
    kmeans.fit(X_reduced_inc)

    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }

    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    kl = KneeLocator(range(1, 18), sse, curve="convex", direction="decreasing")

    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    model = KMeans()
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(X_reduced_inc)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(X_reduced_inc)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(X_reduced_inc)
    ic_visualizer.show()
Example #9
def pca(training_set, test_set):
    pca = PCA()

    pca.fit_transform(training_set)
    pca.transform(test_set)

    explained_variance = pca.explained_variance_ratio_
    components = 16
    print("for " + str(components) + " components")
    top_n = explained_variance[:components]
    print(top_n)
    print("captures ")
    print(np.sum(top_n))
    print("percent")

    pca_cum_variance(pca)

    pca = PCA(n_components=16)
    X_train = pca.fit_transform(training_set)
    X_test = pca.transform(test_set)

    distortions = []
    for i in range(1, 11):
        km = KMeans(n_clusters=i,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        km.fit(X_train)
        distortions.append(km.inertia_)

    plt.plot(range(1, 11), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.title("Distortion vs # Clusters PCA-20")

    plt.tight_layout()
    plt.show()

    km = KMeans(n_clusters=3,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    plot_silhouette(km, X_train, title="PCA20, K=3")

    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    km = KMeans(n_clusters=2,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    plot_silhouette(km, X_train, title="PCA20, K=2")

    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    km = KMeans(n_clusters=4,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    plot_silhouette(km, X_train, title="PCA20, K=4")

    km = KMeans(n_clusters=5,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    plot_silhouette(km, X_train, title="PCA20, K=5")
Example #10
def plain_clustering():
    distortions = []
    for i in range(1, 11):
        km = KMeans(n_clusters=i,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        km.fit(X_train)
        distortions.append(km.inertia_)

    plt.plot(range(1, 11), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.tight_layout()
    plt.show()

    km = KMeans(n_clusters=3,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)
    y_km = km.fit_predict(X_train)

    visualizer = InterclusterDistance(km)

    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    # cluster_labels = np.unique(y_km)
    # n_clusters = cluster_labels.shape[0]
    # silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
    # y_ax_lower, y_ax_upper = 0, 0
    # yticks = []
    # for i, c in enumerate(cluster_labels):
    #     c_silhouette_vals = silhouette_vals[y_km == c]
    #     c_silhouette_vals.sort()
    #     y_ax_upper += len(c_silhouette_vals)
    #     color = cm.jet(float(i) / n_clusters)
    #     plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
    #              edgecolor='none', color=color)
    #
    #     yticks.append((y_ax_lower + y_ax_upper) / 2.)
    #     y_ax_lower += len(c_silhouette_vals)
    #
    # silhouette_avg = np.mean(silhouette_vals)
    # plt.axvline(silhouette_avg, color="red", linestyle="--")
    #
    # plt.yticks(yticks, cluster_labels + 1)
    # plt.ylabel('Cluster')
    # plt.xlabel('Silhouette coefficient')
    #
    # plt.tight_layout()
    # # plt.savefig('images/11_04.png', dpi=300)
    # plt.show()

    km = KMeans(n_clusters=2,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)
    y_km = km.fit_predict(X_train)
    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
    y_ax_lower, y_ax_upper = 0, 0
    yticks = []
    for i, c in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[y_km == c]
        c_silhouette_vals.sort()
        y_ax_upper += len(c_silhouette_vals)
        color = cm.jet(float(i) / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper),
                 c_silhouette_vals,
                 height=1.0,
                 edgecolor='none',
                 color=color)

        yticks.append((y_ax_lower + y_ax_upper) / 2.)
        y_ax_lower += len(c_silhouette_vals)

    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg, color="red", linestyle="--")

    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')

    plt.tight_layout()
    # plt.savefig('images/11_04.png', dpi=300)
    plt.show()
Example #11
                visualizerRadViz = RadViz(classes=classes,
                                          features=features,
                                          title=' ')
                visualizerRadViz.fit(X, y)  # Fit the data to the visualizer
                visualizerRadViz.transform(X)  # Transform the data
                locationFileNameRVZ = os.path.join('/home/ak/Documents/Research/Papers/figures',str(symbols[symbolIdx]) \
                                                   +'_idx_'+str(idx)+'_label_'+str(labelsIdx)+'_date_'+str(dateIdx)+'_radviz.png')
                visualizerRadViz.show(outpath=locationFileNameRVZ)
                plt.show()

                ## MDS

                # Instantiate the clustering model and visualizer
                model = KMeans(6)
                plt.figure()
                plt.xlabel('features', fontsize=12)
                plt.ylabel('features', fontsize=12)

                plt.xticks(fontsize=14)
                plt.yticks(fontsize=12)
                visualizerID = InterclusterDistance(model)
                visualizerID.fit(X)  # Fit the data to the visualizer

                locationFileNameID = os.path.join(
                    '/home/ak/Documents/Research/Papers/figures',
                    str(symbols[symbolIdx]) + '_idx_' + str(idx) +
                    '_KMeans_MDS.png')
                visualizerID.show(outpath=locationFileNameID)  # Finalize and render the figure
                plt.show()
Example #12
def cluster_metrics(i_patches, a_patches, g_patches, city_names, K, save_path,
                    g_indices):
    # intra-cluster distances: ssd of samples to the nearest cluster centre
    sum_of_squared_distances = []
    silhouette_scores = []
    calinski_harabasz_scores = []
    davies_bouldin_scores = []
    k_mean_list = []
    for k in K:
        model, k_means, A = get_kmeans144_result(a_patches, k)
        k_mean_list.append(k_means)
        sum_of_squared_distances.append(k_means.inertia_)

        labels = k_means.labels_
        score = metrics.silhouette_score(A, labels, metric='euclidean')
        silhouette_scores.append(score)

        score = metrics.calinski_harabasz_score(A, labels)
        calinski_harabasz_scores.append(score)

        score = metrics.davies_bouldin_score(A, labels)
        davies_bouldin_scores.append(score)

        mydict = dict_cluster(i_patches, a_patches, g_patches, city_names,
                              k_means)
        save_path_k = '{}_{}'.format(save_path, k)
        gt_ratio = gt_metric(mydict, save_path_k)

    plot_figure(K, sum_of_squared_distances, save_path,
                'sum_of_squared_distances')
    plot_figure(K, silhouette_scores, save_path, 'silhouette_scores')
    plot_figure(K, calinski_harabasz_scores, save_path,
                'calinski_harabasz_scores')
    plot_figure(K, davies_bouldin_scores, save_path, 'davies_bouldin_score')

    ssd_best_index = sum_of_squared_distances.index(
        max(sum_of_squared_distances))
    sil_best_index = silhouette_scores.index(max(silhouette_scores))
    ch_best_index = calinski_harabasz_scores.index(
        max(calinski_harabasz_scores))
    db_best_index = davies_bouldin_scores.index(max(davies_bouldin_scores))
    #gtr_best_index = gt_ratio.index(max(gt_ratio))

    all_indices = [
        ssd_best_index, sil_best_index, ch_best_index, db_best_index
    ]  #, gtr_best_index] #, axis=None)
    best_k = np.array(K)[np.unique(all_indices)]

    for ind in range(len(K)):  #best_k:
        # Visualize output clusters of K means in 2D
        k_means = k_mean_list[ind]
        visualizer = InterclusterDistance(k_means)
        visualizer.fit(A)  # Fit the data to the visualizer
        #visualizer.show()  # Finalize and render the figure
        visualizer.show(
            outpath='{}_{}_InterclusterDistance.png'.format(save_path, ind))
        visualizer.poof()

        # Visualize through TSNE
    A_embedded = TSNE().fit_transform(A)
    plt.figure()
    palette = sns.color_palette("bright", 2)
    y_ = np.asarray(g_indices)
    y = y_.astype(np.float32)
    sns.scatterplot(A_embedded[:, 0],
                    A_embedded[:, 1],
                    hue=y,
                    legend='full',
                    palette=palette)
    plt.savefig('{}_tsne.png'.format(save_path))

    return
Example #13
print((accuracy_score(pred_y_train, Y_train_encoded)))

visualizer = InterclusterDistance(kmeans)
#visualizer.fit(X_train)
#visualizer.show()

pred_y_test = kmeans.fit_predict(X_validation)
print("K Cluster Test Accuracy")
homo_score_test = metrics.homogeneity_score(pred_y_test, Y_test_encoded)
print("Homogeneity Score")
print(homo_score_test)

print((accuracy_score(pred_y_test, Y_test_encoded)))

visualizer.fit(X_validation)
visualizer.show()

#Using K means Cluster as Features
X_train_cluster = pandas.DataFrame(X_train)
X_validation_cluster = pandas.DataFrame(X_validation)
print("@@@@@@@@@@@@@@@@@@   KMeans Labels@@@@@@@@@@@@@@@@@")

X_train_cluster['km'] = pred_y_train
X_validation_cluster['km'] = pred_y_test

#Run Neural Net on dataset with KMeans Cluster
nn_model = MLPClassifier(solver='lbfgs',
                         activation='relu',
                         alpha=1e-5,
                         hidden_layer_sizes=(60, ),
                         learning_rate='constant',