示例#1
0
def run_k_means_elbow(params, x_data):
    """
    Fit a K-means elbow visualizer (distortion metric, k=(2, 40)) and save the plot.

    :param params: mapping with key 'elbow_graph' (used both as the plot title
        and as the output file stem) and an optional key 'path' that is
        prepended to the file name as a plain string prefix
    :param x_data: data handed to the visualizer's ``fit``
    :return: None
    """
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(2, 40), metric='distortion')
    # A new figure is opened before fitting, as in the original call order.
    plt.figure()
    visualizer.fit(x_data)
    visualizer.set_title(params['elbow_graph'])

    file_name = params['elbow_graph'] + '.png'
    try:
        path = params['path'] + file_name
    except (KeyError, TypeError):
        # 'path' is missing or not a string: fall back to the bare file name.
        # Only these two errors are handled so that KeyboardInterrupt and
        # unrelated failures are no longer silently swallowed.
        path = file_name

    visualizer.show(outpath=path)
示例#2
0
def perform_elbow_method(x_scaled):
    """
    Show an elbow plot for K-means to help choose the number of clusters.

    Sets several global matplotlib font-size rcParams, fits a
    KElbowVisualizer with k=(1, 12) and displays the resulting figure.
    :param x_scaled: data passed to the visualizer's ``fit`` (the caller is
        expected to supply normalized values)
    :return: None
    """
    # Font sizes are applied globally through rcParams, in this order.
    font_settings = (
        ('xtick.labelsize', 12),
        ('ytick.labelsize', 12),
        ('axes.titlesize', 18),
        ('axes.labelsize', 14),
    )
    for rc_key, rc_value in font_settings:
        mpl.rcParams[rc_key] = rc_value

    elbow = KElbowVisualizer(KMeans(), k=(1, 12))
    elbow.fit(x_scaled)
    elbow.set_title("The Elbow Method")
    elbow.show()
示例#3
0
def perform_elbow_method(points, method):
    """
    Fit and display an elbow plot for the chosen clustering method.

    :param points: the data points passed to the visualizer's ``fit``
    :param method: either 'K means' or 'Hierarchical'
    :return: None
    :raises Exception: if ``method`` is neither of the two supported names
    """
    # Reject unsupported methods before building any estimator.
    if method not in ('K means', 'Hierarchical'):
        raise Exception(
            'This elbow method designed only for K means and Hierarchical')

    estimator = KMeans() if method == 'K means' else AgglomerativeClustering()

    elbow = KElbowVisualizer(estimator, k=(1, 12))
    elbow.fit(points)
    elbow.set_title("The Elbow Method")
    elbow.show()
示例#4
0
def _save_kelbow_plot(model, X, k_range, metric, title, out_path):
    """Fit a KElbowVisualizer for one metric and save the current figure to out_path."""
    visualizer = KElbowVisualizer(model, k=k_range, metric=metric, title=title)
    visualizer.fit(X)
    visualizer.set_title(title=title)
    plt.savefig(out_path)
    plt.close()


def elbow_kmeans(datasetDir, flag):
    """
    Produce K-means elbow plots for a directory of text documents.

    The documents are loaded with ``datasets.load_files``, turned into TF-IDF
    vectors, reduced with a TruncatedSVD(200) + Normalizer pipeline, and then
    four PNG files are written into "<cwd>/elbow for kmeans":
    three KElbowVisualizer plots (calinski_harabasz, silhouette, distortion)
    and one manually computed silhouette-score curve for k in 4..29.

    :param datasetDir: directory passed to ``datasets.load_files``
    :param flag: selects the file-name suffix: 0 -> "_full.png",
        1 -> "_stemming.png", 2 -> "_lemmatizing.png"
    :return: None
    :raises ValueError: if ``flag`` is not 0, 1 or 2
    """
    # Validate first: previously an unsupported flag only surfaced as an
    # UnboundLocalError after all the expensive fitting had been done.
    if flag not in (0, 1, 2):
        raise ValueError(
            "flag must be 0, 1 or 2, got {!r}".format(flag))
    suffix = {0: "_full.png", 1: "_stemming.png", 2: "_lemmatizing.png"}[flag]

    all_data = datasets.load_files(
        datasetDir,
        description=None, load_content=True, encoding='utf-8', shuffle=False)

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(raw_documents=all_data.data)

    # Dimensionality reduction followed by normalization of the vectors.
    lsa = make_pipeline(TruncatedSVD(n_components=200), Normalizer(copy=False))
    X = lsa.fit_transform(X)

    plot_dir = os.path.join(os.getcwd(), "elbow for kmeans")
    os.makedirs(plot_dir, exist_ok=True)

    # The same estimator instance is shared by the three visualizers.
    model = KMeans(random_state=42)
    elbow_plots = (
        ("calinski_harabasz", (4, 30),
         "Elbow calinski_harabasz Score of Kmeans clustering"),
        ("silhouette", (4, 30),
         "Elbow silhouette Score of Kmeans clustering"),
        ("distortion", (4, 20),
         "Elbow distortion of Kmeans clustering"),
    )
    for metric, k_range, title in elbow_plots:
        # os.path.join instead of a hard-coded "\\" so the files land inside
        # plot_dir on every platform, not only on Windows.
        _save_kelbow_plot(
            model, X, k_range, metric, title,
            os.path.join(plot_dir, title + suffix))

    # Silhouette curve computed directly, one fresh KMeans fit per k.
    cluster_counts = range(4, 30)
    sil = [
        silhouette_score(
            X,
            KMeans(random_state=42, n_clusters=n_cluster).fit(X).labels_,
            metric='euclidean')
        for n_cluster in cluster_counts
    ]
    plt.plot(list(cluster_counts), sil)
    plt.grid(True)
    # NOTE: the misspelled "sihouette" file name is kept as-is because other
    # tooling may look for it.
    plt.savefig(os.path.join(plot_dir, "sihouette" + suffix))
    plt.close()