"""
This script uses the elbow method to help identify a good value of 'k' to use
for k-means clustering.

@author: Chris McCormick
"""

from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans
from simsearch import SimSearch
import numpy as np
import matplotlib.pyplot as plt

# Load the pre-built SimSearch state from the './mhc_corpus/' directory.
# SimSearch.load returns a pair of objects; only `ssearch` is used in this
# part of the script.
print('Loading the saved SimSearch and corpus...')
(ksearch, ssearch) = SimSearch.load(save_dir='./mhc_corpus/')

# Get the dataset to be clustered.
# Note - The index is stored with all of the vectors *already normalized*,
# so no further normalization is applied here.
# NOTE(review): presumably X is a 2-D array with one vector per row -
# confirm against SimSearch.
X = ssearch.index.index

# If you needed to normalize the vectors (each row to unit L2 length), take
# the norm per row with axis=1; without `axis`, np.linalg.norm returns one
# norm for the whole matrix rather than one per row:
# norms = np.linalg.norm(X, axis=1)
# norms = norms.reshape(-1, 1)
# X = X / norms

# These lists will store the actual values to plot.
# NOTE(review): presumably `plotx` holds the x-axis values and `ploty1` /
# `ploty2` hold two y-series - confirm against the plotting code.
plotx = []
ploty1 = []
ploty2 = []