Example #1
    def clustering(self, k):
        word_vectors = self.__model_p__.wv
        KM_model = KMeans(n_clusters=k,
                          max_iter=1000,
                          random_state=1,
                          n_init=50).fit(X=word_vectors.vectors)

        center_closest = []
        for i in range(k):
            center_closest.append([
                el[0] for el in word_vectors.similar_by_vector(
                    KM_model.cluster_centers_[i], topn=15, restrict_vocab=None)
            ])

        metric_str = 'euclidean'
        score = silhouette_score(word_vectors.vectors,
                                 KM_model.predict(word_vectors.vectors),
                                 metric=metric_str)
        print("silhouette_score:", score)

        SVmodel = SilhouetteVisualizer(KM_model, is_fitted=True)
        SVmodel.fit(word_vectors.vectors)
        SVmodel.show()
        words = pd.DataFrame(word_vectors.vocab.keys(), columns=['words'])
        words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
        words['cluster'] = words.vectors.apply(
            lambda x: KM_model.predict([np.array(x)]))
        words.cluster = words.cluster.apply(lambda x: x[0])
        words['closeness_score'] = words.apply(
            lambda x: 1 / (KM_model.transform([x.vectors]).min()), axis=1)

        return KM_model, center_closest, score, words
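A minimal, self-contained sketch of the same pattern (KMeans over gensim word vectors, a silhouette score, and the vocabulary words closest to each centroid). The toy corpus, the cluster count, and the import locations are assumptions, not part of the example above:

from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# toy corpus (assumption); in practice this would be the preprocessed text
sentences = [["cat", "dog", "bird"], ["car", "truck", "bus"], ["cat", "bird", "dog"]] * 10
wv = Word2Vec(sentences, min_count=1, seed=1).wv

km = KMeans(n_clusters=2, n_init=10, random_state=1).fit(wv.vectors)
print("silhouette:", silhouette_score(wv.vectors, km.predict(wv.vectors)))

# the vocabulary words closest to each centroid, analogous to center_closest above
for center in km.cluster_centers_:
    print([word for word, _ in wv.similar_by_vector(center, topn=3)])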
Example #2
def silhouettevisual(model, X, graph):
    visualizer = SilhouetteVisualizer(
        model,
        colors='yellowbrick',
        title=" Silhouette Plot of KMeans Clustering for " + graph)
    visualizer.fit(X)
    visualizer.show()
Example #3
def kmeans_exp():

    with open('features_GMM.csv', mode='r') as feature_file:
        feature_reader = csv.reader(feature_file,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
        i = 0
        for fs in feature_reader:
            if i > 0:
                print(f"user n#{i}")
                print(fs)
                fs.pop()
                training_list, testing_dict, training_fs_list, validation_fs_dict = load_dataset(
                    i)
                training_list = training_list[0:16] + training_list[
                    21:25] + training_list[36:40]
                # DataFrame.append was removed in pandas 2.x; build the frame with concat instead
                df_training = pd.concat(training_list)

                # knn = KNeighborsClassifier()
                model = KMeans(n_clusters=6, random_state=42).fit(df_training)

                for signature in testing_dict['genuine']:
                    cluster = model.predict(signature)
                    print("testing signature:")
                    occurences = Counter(cluster)
                    print(occurences)

                visualizer = SilhouetteVisualizer(model)
                visualizer.fit(df_training)
                visualizer.show()

            i += 1
Example #4
def plot_cluster_silhouette(estimator, dataset, version):
    visualizer = SilhouetteVisualizer(estimator, colors='yellowbrick')
    visualizer.fit(data.DATA[dataset][version]['x_train'])
    visualizer.show(
        f'{PLOTS_FOLDER}/{dataset}/{version}/{dataset}_{version}_{estimator.__class__.__name__}_cluster_silhouettes_k{estimator.n_clusters}.png'
    )
    plt.clf()
Example #5
 def shiloutte_score_plot(self, directory):
     self._check_model()
     plt.figure(figsize=(10, 10))
     visualizer = SilhouetteVisualizer(
         self.best_estimator_.named_steps['clustering'],
         colors='yellowbrick',
         is_fitted=True)
     visualizer.fit(self.data_preprocessed)
     visualizer.show(directory + "/shiloutte_score.png")
     visualizer.finalize()
     plt.close()
Example #6
def silhouette_plot(text, model, cv):
    '''
    Loads in a saved model and produces a silhouette score plot
    '''
    path = 'models/{}'.format(model)
    pipe = load(path)
    kmeans = pipe.named_steps['kmeans']
    svd = pipe.named_steps['truncatedsvd']
    X = svd.fit_transform(cv)
    visualizer = SilhouetteVisualizer(kmeans, colors='sns_deep')
    visualizer.fit(X)
    visualizer.show(outpath="plots/Silhouette.png")
    plt.close()
Example #7
def main():
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data(
    )
    km = KMeans(4)
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick')
    visualizer.fit(xtrain1)
    visualizer.show()
    ytest = km.fit_predict(xtrain1)
    print(metrics.homogeneity_score(ytrain1, ytest))
    score(xtrain2, 20, ytrain2)
    elbowplot(xtrain2, 20, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat2",
              "figs/kmeans/kmeans_elbow_dat2.png")
    elbowplot(xtrain1, 100, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat1",
              "figs/kmeans/kmeans_elbow_dat1.png")
    elbowplot(xtrain2,
              40,
              "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat2",
              "figs/kmeans/kmeans_silhouette_dat2.png",
              elbow=False)
    elbowplot(xtrain1,
              100,
              "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat1",
              "figs/kmeans/kmeans_silhouette_dat1.png",
              elbow=False)
    elbowplot(
        xtrain2,
        20,
        "calinski_harabasz",
        "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat2",
        "figs/kmeans/kmeans_calinski_dat2.png",
        elbow=False)
    elbowplot(
        xtrain1,
        100,
        "calinski_harabasz",
        "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat1",
        "figs/kmeans/kmeans_calinski_dat1.png",
        elbow=False)
Example #8
def kMeans():

    twitterX, twitterY, twitter_dataset, scaled_features = preprocess()

    gm = GaussianMixture(covariance_type='tied', n_components=18, n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    aic = []
    bic = []

    for i in range(10):
        # vary the number of components so the AIC/BIC curve spans different cluster counts
        gm = GaussianMixture(covariance_type='spherical', n_components=i + 1, n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))

    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    twitter_trainingX, twitter_testingX, twitter_trainingY, twitter_testingY = train_test_split(twitterX, twitterY)

    error = []

    #citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn


    for i in range(1, 8):
        pca = FastICA(n_components=i)
        pca.fit(twitter_trainingX)
        U, S, VT = np.linalg.svd(twitter_trainingX - twitter_trainingX.mean(0))
        x_train_pca = pca.transform(twitter_trainingX)
        x_train_pca2 = (twitter_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((twitter_trainingX - x_projected) ** 2).mean()
        error.append(loss)

    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 8), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()


    clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1,
                        solver='lbfgs')
    clf.fit(twitter_trainingX, twitter_trainingY)
    y_pred = clf.predict(twitter_testingX)

    print("Accuracy Score Normal", accuracy_score(twitter_testingY, y_pred))

    kmeans = KMeans(
        init="random",
        n_clusters=3,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_testingX)

    print("Accuracy Score K-Means", accuracy_score(twitter_testingY, labels))

    for i in range(9):
        pca = PCA(n_components=i)
        pca.fit(scaled_features)
        cumsum = np.cumsum(pca.explained_variance_ratio_)

    plt.plot(cumsum, label="Explained Variance Ratio")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    # incremental PCA
    num_batches = 100
    inc_pca = IncrementalPCA(n_components=5)
    for X_batch in np.array_split(scaled_features, num_batches):
        inc_pca.partial_fit(X_batch)
    X_reduced_inc = inc_pca.transform(scaled_features)

    # randomized projections
    rnd_pca = PCA(n_components=5, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(twitterX.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=8)
    k_selected.fit(scaled_features_norm, twitterY)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45, scores, width=.2, label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()

    kmeans = KMeans(
        init="random",
        n_clusters=5,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_dataset)

    #the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    #final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    #num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    #labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init":"random",
        "n_init":10,
        "max_iter":300,
        "random_state":42,
    }

    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    model = KMeans(n_clusters=9)
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(twitterX)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(twitterX)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(twitterX)
    ic_visualizer.show()

    # incomplete: the column selection and plt.scatter() arguments were left unspecified
    # X = twitter_dataset[:, []]
    # plt.scatter()
Example #9
        model = KMeans(n_clusters=k, random_state=1)  # choose the model
        model.fit(X)  # train the model
        kmeans_models.append(model)
        inertias.append(model.inertia_)
        silhouettes.append(silhouette_score(X, model.labels_))
    print('inertias:', inertias)
    print('silhouettes:', silhouettes)

    # plot of silhouette score vs. k
    plt.plot(k_ranges[1:], silhouettes[1:], marker='o')
    plt.xlabel('k')
    plt.ylabel('silhouette score')
    plt.show()

    # silhouette score = (b - a) / max(a, b)
    # a: mean distance to the other samples in the same cluster
    # b: mean distance to the samples in the nearest neighboring cluster
    # -1 <= ss <= 1
    # the k whose model gets the highest silhouette score is a good choice of cluster count
    # ss = 1: samples sit tightly inside their own cluster
    #         and far away from the other clusters
    # ss = 0: samples lie on or near the cluster boundaries
    # ss = -1: samples have been assigned to the wrong cluster
    # (a short sketch of this formula follows this example)

    # silhouette diagram
    # pip install yellowbrick
    for model in kmeans_models[1:]:  # for each trained KMeans model
        visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
        visualizer.fit(X)
        visualizer.show()
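A short sketch of the formula from the comments above: it computes one sample's silhouette value by hand and compares it with sklearn's silhouette_samples. The toy data and k=2 are assumptions:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 0.1, (20, 2)), rng.normal(3, 0.1, (20, 2))])  # two toy blobs
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

i = 0  # index of the sample to check
same = labels == labels[i]
same[i] = False  # a excludes the sample itself
a = np.linalg.norm(X[same] - X[i], axis=1).mean()
# with k=2 the only other cluster is also the nearest one
b = np.linalg.norm(X[labels != labels[i]] - X[i], axis=1).mean()
print("by hand:", (b - a) / max(a, b))
print("sklearn:", silhouette_samples(X, labels)[i])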
Example #10
def explore_DBSCAN_clustering(
    df,
    num_cols=None,
    metric="euclidean",
    eps=[0.5],
    min_samples=[5],
    include_silhouette=True,
    include_PCA=True,
    random_state=None,
):
    """fit and plot DBSCAN clustering algorithms

    Parameters
    ----------
    df : pandas.DataFrame
        the dataset, should be transformed with StandardScaler
    num_cols : list, optional
        list of numeric column names, in case of None, get all numeric columns
    metric : str, optional
        distance metric passed to DBSCAN, by default "euclidean"
    eps : list, optional
        list of eps hyperparams, by default [0.5]
    min_samples: list, optional
        list of min_samples hyperparams, by default [5]
    include_silhouette : bool, optional
        whether Silhouette plots should be generated, by default True
    include_PCA : bool, optional
        whether PCA plots should be generated, by default True
    random_state : int, optional
        a number that determines random number generation for the PCA plots, by default None

    Returns
    -------
    Tuple
        list
            a list of n_clusters values returned by DBSCAN models
        dict
            a dictionary with key=type of plot, value=list of plots

    Examples
    -------
    >>> original_df = pd.read_csv("/data/menu.csv")
    >>> numeric_features = eda.get_numeric_columns(original_df)
    >>> numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
    >>> preprocessor = make_column_transformer(
    >>>     (numeric_transformer, numeric_features)
    >>> )
    >>> df = pd.DataFrame(
    >>>     data=preprocessor.fit_transform(original_df), columns=numeric_features
    >>> )
    >>> n_clusters, dbscan_plots = explore_DBSCAN_clustering(df)
    """
    if num_cols is None:
        num_cols = get_numeric_columns(df)
    else:
        _verify_numeric_cols(df, num_cols)

    x = df[num_cols]

    results = {}
    n_clusters = []

    s_plots = []
    pca_plots = []

    print("------------------------")
    print("DBSCAN CLUSTERING")
    print("------------------------")

    for e in eps:
        for ms in min_samples:
            dbscan = DBSCAN(eps=e, min_samples=ms, metric=metric)
            dbscan.fit(x)
            k = len(set(dbscan.labels_)) - 1  # excluding the -1 noise label
            n_clusters.append(k)
            print(f"eps={e}, min_samples={ms}, n_cluster={k}")
            if include_silhouette and k > 0:
                # generate the Silhouette plot; DBSCAN has no n_clusters or predict(),
                # so patch them onto the fitted model for SilhouetteVisualizer
                dbscan.n_clusters = k
                dbscan.predict = lambda x: dbscan.labels_
                fig, ax = plt.subplots()
                s_visualizer = SilhouetteVisualizer(dbscan, colors="yellowbrick", ax=ax)
                s_visualizer.fit(x)
                s_visualizer.show()
                s_plots.append(fig)
                # plt.clf()
                plt.close()
            else:
                s_plots.append(None)

            if include_PCA:
                # generate PCA plot
                p_lot = plot_pca_clusters(x, dbscan.labels_, random_state=random_state)
                pca_plots.append(p_lot)
            else:
                pca_plots.append(None)

    results["Silhouette"] = s_plots
    results["PCA"] = pca_plots

    return n_clusters, results
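A hedged usage sketch for the function above, assuming df has already been prepared as in the docstring example; the eps and min_samples grids are illustrative assumptions only:

eps_grid = [0.3, 0.5, 1.0]     # assumption: values to sweep
min_samples_grid = [5, 10]     # assumption: values to sweep
n_clusters, dbscan_plots = explore_DBSCAN_clustering(
    df,
    eps=eps_grid,
    min_samples=min_samples_grid,
    include_PCA=False,
)
print(n_clusters)                                     # one entry per (eps, min_samples) combination
first_silhouette_fig = dbscan_plots["Silhouette"][0]  # None when that run found no clusters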
Example #11
def explore_KMeans_clustering(
    df,
    num_cols=None,
    n_clusters=range(3, 5),
    include_silhouette=True,
    include_PCA=True,
    random_state=None,
):
    """create, fit and plot KMeans clustering on the dataset

    Parameters
    ----------
    df : pandas.DataFrame
        the dataset, should be transformed with StandardScaler
    num_cols : list, optional
        list of numeric column names, in case of None, get all numeric columns
    n_clusters : list, optional
        list of n_clusters hyperparams, by default range(3, 5)
    include_silhouette : bool, optional
        whether Silhouette plots should be generated, by default True
    include_PCA : bool, optional
        whether PCA plots should be generated, by default True
    random_state : int, optional
        a number that determines random number generation for centroid initialization, by default None

    Returns
    -------
    dict
        a dictionary with key=type of plot, value=list of plots

    Examples
    -------
    >>> original_df = pd.read_csv("/data/menu.csv")
    >>> numeric_features = eda.get_numeric_columns(original_df)
    >>> numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
    >>> preprocessor = make_column_transformer(
    >>>     (numeric_transformer, numeric_features)
    >>> )
    >>> df = pd.DataFrame(
    >>>     data=preprocessor.fit_transform(original_df), columns=numeric_features
    >>> )
    >>> explore_KMeans_clustering(df)
    """
    if num_cols is None:
        num_cols = get_numeric_columns(df)
    else:
        _verify_numeric_cols(df, num_cols)
    x = df[num_cols]
    results = {}
    if 1 in n_clusters:
        raise Exception("n_clusters cannot be 1")

    print("------------------------")
    print("K-MEANS CLUSTERING")
    print("------------------------")

    if len(n_clusters) > 1:
        print("Generating KElbow plot for KMeans.")
        # visualize using KElbowVisualizer
        kmeans = KMeans(random_state=random_state)

        plt.clf()
        fig, ax = plt.subplots()
        elbow_visualizer = KElbowVisualizer(kmeans, k=n_clusters, ax=ax)
        elbow_visualizer.fit(x)  # Fit the data to the visualizer
        elbow_visualizer.show()
        plt.close()
        elbow_visualizer.k = elbow_visualizer.elbow_value_  # fix printing issue
        results["KElbow"] = fig
    else:
        results["KElbow"] = None

    # visualize using SilhouetteVisualizer
    print("Generating Silhouette & PCA plots")
    silhouette_plots = []
    pca_plots = []
    for k in n_clusters:
        print(f"Number of clusters: {k}")

        kmeans = KMeans(k, random_state=random_state)

        if include_silhouette:
            fig, ax = plt.subplots()
            s_visualizer = SilhouetteVisualizer(kmeans, colors="yellowbrick", ax=ax)
            s_visualizer.fit(x)  # Fit the data to the visualizer
            s_visualizer.show()
            silhouette_plots.append(fig)
            # plt.clf()
            plt.close()

        else:
            silhouette_plots.append(None)

        # PCA plots
        if include_PCA:
            labels = kmeans.fit_predict(x)
            pca_fig = plot_pca_clusters(x, labels, random_state=random_state)
            pca_plots.append(pca_fig)
        else:
            pca_plots.append(None)

    results["Silhouette"] = silhouette_plots
    results["PCA"] = pca_plots

    return results
Example #12
X = Data
distorsions = []
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    distorsions.append(kmeans.inertia_)
k = range(2, 10)
fig = plt.figure(figsize=(15, 5))
plt.plot(k, distorsions)
plt.grid(True)
plt.title('Elbow curve')

# silhouette method
from sklearn.metrics import silhouette_samples, silhouette_score
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k)
    y_pred = kmeans.fit_predict(Data)
    score = silhouette_score(Data, y_pred)
    print("For k = {}, silhouette score is {})".format(k, score))

from yellowbrick.cluster import SilhouetteVisualizer
# Instantiate the clustering model and visualizer
for k in range(2, 10):
    model = KMeans(k, random_state=42)
    plt.subplot(221)
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer.fit(Data)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
    plt.subplot(222)
    plt.scatter(Data[:, 0], Data[:, 1], c=model.labels_, cmap='rainbow')  # labels of the model fitted just above
Example #13
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.savefig("Gap Values.png")
plt.show()

# =============================================================================

# =============================================================================
# Using the silhouette to find the optimal number of clusters

for n_clusters in range(4, 10):
    model = KMeans(n_clusters, init='k-means++')
    cluster_labels = model.fit_predict(X)
    visualizer = SilhouetteVisualizer(model)
    visualizer.fit(X)  # Fit the training data to the visualizer
    visualizer.show(outpath="BoW_Silhouette %d" % n_clusters)  # poof() is just a deprecated alias of show()
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :",
          silhouette_avg)
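A small follow-up sketch that keeps the averages from the loop above and picks the k with the highest score; it reuses X, KMeans and silhouette_score from this example, over the same 4-9 range:

avg_scores = {}
for n_clusters in range(4, 10):
    cluster_labels = KMeans(n_clusters, init='k-means++').fit_predict(X)
    avg_scores[n_clusters] = silhouette_score(X, cluster_labels)

best_k = max(avg_scores, key=avg_scores.get)
print("Best n_clusters by average silhouette:", best_k)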

# =============================================================================

# =============================================================================
# Clustering Using K-Means
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

# reduce the features to 2D
reduced_features = pca.fit_transform(X)
Example #14
score_lst.append(silhouette_score(Data_f_scaled, DBSCAN_1, metric=metric_str))
score_lst.append(silhouette_score(Data_f_scaled, DBSCAN_2, metric=metric_str))
score_lst.append(
    silhouette_score(Data_f_scaled, SpectralClustering_0, metric=metric_str))
score_lst.append(
    silhouette_score(Data_f_scaled, SpectralClustering_1, metric=metric_str))
score_lst.append(
    silhouette_score(Data_f_scaled, SpectralClustering_2, metric=metric_str))

print(score_lst)

for i in n_clust_vec:
    fig = plt.figure(figsize=(20, 40))
    model = SilhouetteVisualizer(KMeans(i, random_state=0))
    model.fit(Data_reduced_scaled)
    model.show()
    fig.savefig(path_join(directory_img, "k_m_silhouettes_{}.png".format(i)))
    plt.close()

Data_TSNE_2 = TSNE(n_components=2).fit_transform(Data_reduced_scaled)
Data_TSNE_3 = TSNE(n_components=3).fit_transform(Data_reduced_scaled)

fig = plt.figure(figsize=(20, 40))
Data_Video = Data_f[Data_f['Type_Video'] == 1]
ax = fig.add_subplot(2, 1, 1)
ax.scatter(Data_TSNE_3[:, 0],
           Data_TSNE_3[:, 1],
           c=k_means_results[0],
           cmap='viridis',
           marker='o',
           s=30)
Example #15
                n_init=10,
                max_iter=maxIter,
                tol=tol)
    visualizer = SilhouetteVisualizer(km, colors="yellowbrick")
    visualizer.fit(np.array(data))
    km.fit(data)
    distortions.append(km.inertia_)
    cluster_labels = km.predict(data)
    silhouette_avg = silhouette_score(data, cluster_labels)

    opath = str(1) + "-1d"
    path = os.path.join("./", opath)
    if not os.path.exists(path):
        os.mkdir(path)
    startEndPath = str(numClustersStart) + "-" + str(numClustersEnd)
    visualizer.show(outpath="./" + opath + "/" + str(n_clusters) +
                    ".png")  # TODO
    plt.cla()
    plt.clf()
    plt.close("all")
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )
    print("For n_clusters =", n_clusters, "The distortion is :", km.inertia_)
    dictionary = {}
    print(n_clusters, "clusters:")
    for i in range(n_clusters):
        dictionary[i] = []
    for i in range(len(cluster_labels)):
Example #16
def k_means_e_metodo_do_cotovelo(nome_arq_saida_todos_momentos,
                                 k_array,
                                 metodos_do_cotovelo,
                                 is_plotar=True,
                                 is_plotar_momentos_3d=False):
    df_momentos_familias = pd.read_csv(nome_arq_saida_todos_momentos, sep=",")
    df_momentos_familias = df_momentos_familias.apply(pd.to_numeric,
                                                      errors='coerce')
    df_momentos_familias = df_momentos_familias.dropna()
    df_momentos_familias = df_momentos_familias.reset_index(drop=True)
    np_elem_norm = df_momentos_familias.to_numpy()

    # "distorcao_km_inertia" method - sum of the squared distances (inertia)
    soma_das_dist_ao_quadrado = []
    arr_kmeans = []
    for cada_k in k_array:
        k_means_model = analisador_k_means(df_momentos_familias, np_elem_norm,
                                           cada_k, is_plotar,
                                           is_plotar_momentos_3d)
        arr_kmeans.append(k_means_model)
        soma_das_dist_ao_quadrado.append(k_means_model.inertia_)
        if "silhueta_yellowbrick" in metodos_do_cotovelo:
            visualizer = SilhouetteVisualizer(k_means_model,
                                              colors='yellowbrick')
            visualizer.fit(np_elem_norm)  # Fit the data to the visualizer
            visualizer.fig.savefig(
                "./silhueta_yellowbrick__k_{}.png".format(cada_k))
            if is_plotar:
                visualizer.show()
            plt.close('all')

    melhor_k = None  # only set by the "distorcao_yellowbrick" elbow method below
    for metodo_do_cotovelo in metodos_do_cotovelo:
        if metodo_do_cotovelo == "distorcao_km_inertia":
            plt.plot(k_array, soma_das_dist_ao_quadrado, 'bx-')
            plt.xlabel('k')
            plt.ylabel('Distorção')
            plt.title('Método do cotovelo para encontrar o melhor k')
            plt.savefig("./{}.png".format(metodo_do_cotovelo))
            if is_plotar:
                plt.show()
        elif metodo_do_cotovelo == "distorcao_yellowbrick":
            kmeans = KMeans(random_state=0)
            visualizer = KElbowVisualizer(kmeans,
                                          k=k_array,
                                          metric='distortion')
            visualizer.fit(np_elem_norm)  # Fit the data to the visualizer
            plt.savefig("./{}.png".format(metodo_do_cotovelo))
            melhor_k = visualizer.elbow_value_
            if is_plotar:
                visualizer.show()
        elif metodo_do_cotovelo == "calinski_harabasz_yellowbrick":
            kmeans = KMeans(random_state=0)
            visualizer = KElbowVisualizer(kmeans,
                                          k=k_array,
                                          metric='calinski_harabasz')
            visualizer.fit(np_elem_norm)  # Fit the data to the visualizer
            plt.savefig("./{}.png".format(metodo_do_cotovelo))
            if is_plotar:
                visualizer.show()
        plt.close('all')

    return arr_kmeans, melhor_k
Example #17
def silhouette(X):
    for i in range(2, 4):
        model = KMeans(i, random_state=42)
        visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
        visualizer.fit(X)  # Fit the data to the visualizer
        visualizer.show()  # Finalize and render the figure
Example #18
def kMeans():
    # citation: https://realpython.com/k-means-clustering-python/
    digits = load_digits()

    # features
    digits_features = digits.data[:, 0:-1]
    # label
    label = digits.data[:, -1]

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(digits_features)

    # citation: hands on machine learning
    gm = GaussianMixture(covariance_type='spherical',
                         n_components=8,
                         n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    aic = []
    bic = []

    for i in range(21):
        # vary the number of components so the AIC/BIC curve spans different cluster counts
        gm = GaussianMixture(covariance_type='spherical',
                             n_components=i + 1,
                             n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))

    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    # x_centered = digits_features - digits_features.mean(axis=0)
    # U, s, Vt = np.linalg.svd(x_centered)
    # c1 = Vt.T[:, 0]
    # c2 = Vt.T[:, 1]

    # W2 = Vt.T[:, :2]
    # X2D = x_centered.dot(W2)

    # pca = PCA()
    # pca.fit(scaled_features)
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # pca = PCA(n_components=0.95)
    # X_reduced = pca.fit_transform(scaled_features)

    explained_variance = []
    for i in range(63):
        pca = PCA(n_components=i)
        pca.fit(scaled_features)
        cumsum = np.cumsum(pca.explained_variance_ratio_)

    plt.plot(cumsum, label="Explained Variance Ratio")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    digits_trainingX, digits_testingX, digits_trainingY, digits_testingY = train_test_split(
        digits_features, label)

    # pca reconstruction error
    # citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn

    error = []

    for i in range(1, 50):
        pca = PCA(n_components=i)
        pca.fit(digits_trainingX)
        U, S, VT = np.linalg.svd(digits_trainingX - digits_trainingX.mean(0))
        x_train_pca = pca.transform(digits_trainingX)
        x_train_pca2 = (digits_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((digits_trainingX - x_projected)**2).mean()
        error.append(loss)

    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 50), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()

    clf = MLPClassifier(alpha=0.001,
                        hidden_layer_sizes=(8, ),
                        random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)
    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))

    k_acc = []
    k_gm = []
    time_arr = []
    for k in range(1, 15):
        kmeans = KMeans(n_clusters=k)
        X_train = kmeans.fit_transform(digits_trainingX)
        X_test = kmeans.transform(digits_testingX)
        start_time = time.time()
        clf = MLPClassifier(alpha=0.001,
                            hidden_layer_sizes=(8, ),
                            random_state=1,
                            solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        total_time = time.time() - start_time
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        k_acc.append(score)
        time_arr.append(total_time)

    plt.plot(k_acc, label="K-Means")
    plt.plot(time_arr, label="Computation Time")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("k # of clusters")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()

    acc = []
    acc_ica = []
    acc_rca = []
    for i in range(1, 40):
        pca = PCA(n_components=i)
        X_train = pca.fit_transform(digits_trainingX)
        X_test = pca.transform(digits_testingX)
        clf = MLPClassifier(alpha=0.001,
                            hidden_layer_sizes=(8, ),
                            random_state=1,
                            solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        acc.append(score)

        ica = FastICA(n_components=i)
        x_train_i = ica.fit_transform(digits_trainingX)
        x_test_i = ica.transform(digits_testingX)
        clf.fit(x_train_i, digits_trainingY)
        y_pred_i = clf.predict(x_test_i)
        score_i = accuracy_score(digits_testingY, y_pred_i)
        acc_ica.append(score_i)

        rca = GaussianRandomProjection(n_components=i)
        x_train_r = rca.fit_transform(digits_trainingX)
        x_test_r = rca.transform(digits_testingX)
        clf.fit(x_train_r, digits_trainingY)
        y_pred_r = clf.predict(x_test_r)
        score_r = accuracy_score(digits_testingY, y_pred_r)
        acc_rca.append(score_r)

    plt.plot(acc, label="PCA")
    plt.plot(acc_ica, label="ICA")
    plt.plot(acc_rca, label="RCA")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Components")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # randomized projections
    rnd_pca = PCA(n_components=50, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(digits_features.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=50)
    k_selected.fit(scaled_features_norm, label)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45,
            scores,
            width=.2,
            label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()

    # X_reduced_inc is used below: build it with an IncrementalPCA reduction first
    # (n_components=5 and the IncrementalPCA import are assumptions)
    inc_pca = IncrementalPCA(n_components=5)
    X_reduced_inc = inc_pca.fit_transform(scaled_features)

    gm = GaussianMixture(covariance_type='spherical',
                         n_components=8,
                         n_init=10)
    gm.fit(X_reduced_inc)
    print("GM Converged - PCA Inc", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(X_reduced_inc)
    gm.predict_proba(X_reduced_inc)
    gm.score_samples(X_reduced_inc)

    kmeans = KMeans(init="random",
                    n_clusters=63,
                    n_init=10,
                    max_iter=300,
                    random_state=42)
    kmeans.fit(scaled_features)

    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }

    sse = []
    for k in range(1, 63):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    kl = KneeLocator(range(1, 63), sse, curve="convex", direction="decreasing")

    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    clf = MLPClassifier(alpha=0.001,
                        hidden_layer_sizes=(8, ),
                        random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)

    model = KMeans(n_clusters=5)
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(digits_testingX)

    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))
    print("Accuracy Score K-Means", accuracy_score(digits_testingY, labels))

    elbow_visualizer = KElbowVisualizer(model, k=(2, 63))
    elbow_visualizer.fit(digits_features)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(digits_features)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(digits_features)
    ic_visualizer.show()

    # gmm = GaussianMixture(n_components=7).fit(digits_features)
    # labels = gmm.predict(digits_features)
    # plt.scatter(digits_features[:, 0], digits_features[:, 1], c=labels, s=40, cmap='viridis')
    # plt.show()

    # digits_features_pd = pd.DataFrame(data=digits_features[1:, 1:],
    # index=digits_features[1:,0],
    # columns=digits_features[0,1:])

    # pd.plotting.scatter_matrix(digits_features_pd)

    # probs = GaussianMixture.predict_proba(digits_features)
    # print(probs[:5].round(3))

    kmeans = KMeans(init="random",
                    n_clusters=18,
                    n_init=10,
                    max_iter=300,
                    random_state=42)
    kmeans.fit(X_reduced_inc)

    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }

    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    kl = KneeLocator(range(1, 18), sse, curve="convex", direction="decreasing")

    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    model = KMeans()
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(X_reduced_inc)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(X_reduced_inc)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(X_reduced_inc)
    ic_visualizer.show()
Example #19
score1_confinement = [word_vectors.similarity('confinement', el[0])
                      for el in g1]

"""
g2 = word_vectors.similar_by_vector(KM_model2.cluster_centers_[2], topn = 15,
                               restrict_vocab = None)
"""
metric_str = 'euclidean'
score = silhouette_score(word_vectors.vectors,
                         KM_model2.predict(word_vectors.vectors),
                         metric = metric_str)
print("silhouette_score:", score)

SVmodel = SilhouetteVisualizer(KM_model2, is_fitted = True)
SVmodel.fit(word_vectors.vectors)
SVmodel.show()  

words = pd.DataFrame(word_vectors.vocab.keys(), columns = ['words'])
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: KM_model2.predict(
    [np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])
words['cluster_value'] = [1 if i == 0 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(
    lambda x: 1/(KM_model2.transform([x.vectors]).min()), axis = 1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

clus_time = time.time() 
print("clustering time: %s seconds "
      % (clus_time - w2v_time))  
Example #20
 def visualize(self):
     model = KMeans(self.k)
     visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
     visualizer.fit(self.data)
     visualizer.show()