def main():
    """Visualize a digit subset and plot the average SC for several K.

    Reads 'digits-embedding.csv' through MyKmeans.readData, builds a subset
    for the digits 2, 4, 6 and 7 with createSubset, shows it with
    visualizeData, then calls iterClustering once per K in [2, 4, 8, 16]
    (5 trials, 50 passed as the iteration count) and hands the resulting
    averages to plotAvgSC.
    """
    # This instance is only needed to parse the data file.
    km = MyKmeans()
    parsedData = km.readData('digits-embedding.csv')

    # Digits selected from the data file for this experiment.
    sub_nums = [2, 4, 6, 7]
    subset = createSubset(parsedData, sub_nums)

    # --- The Tasks --- #
    # (1) Visualize the images based on the 2D features.
    visualizeData(subset, sub_nums)

    # (2) For each K, repeat the clustering 5 times and average the
    #     Silhouette Coefficient (SC) over those trials.
    K = [2, 4, 8, 16]
    trials, iterations = 5, 50
    avg_scores = [
        iterClustering(subset, parsedData, trials, iterations, k)
        for k in K
    ]

    # Plot the average SC against K.
    plotAvgSC(K, avg_scores)
def calAvgSC(clusters, parsedData):
    """Return the mean of MyKmeans.calculateSC over the entries of clusters.

    clusters must expose ``shape[0]`` and integer indexing (it is built with
    ``np.array`` by iterClustering); each entry is scored against parsedData
    with a fresh MyKmeans instance and the scores are averaged.

    Raises ZeroDivisionError when ``clusters.shape[0]`` is 0.
    """
    scorer = MyKmeans()
    trial_count = clusters.shape[0]

    # Accumulate left to right, starting from 0, one score per trial.
    total = 0
    for idx in range(trial_count):
        total += scorer.calculateSC(clusters[idx], parsedData)

    return total / trial_count
def iterClustering(subset, parsedData, iterNum, iterCount, K):
    """Cluster subset iterNum times and return the average SC of the runs.

    Each run calls ``MyKmeans.cluster(subset, iterCount, K, [])`` on the same
    local MyKmeans instance; the empty list is passed fresh on every call
    (presumably "no preset centroids" -- confirm against MyKmeans.cluster).
    The run results are packed into one NumPy array and scored by calAvgSC
    against parsedData.
    """
    km = MyKmeans()

    # One clustering result per trial, collected in call order.
    runs = [km.cluster(subset, iterCount, K, []) for _ in range(iterNum)]

    return calAvgSC(np.array(runs), parsedData)
def main():
    """Visualize all ten digits from 'digits-embedding.csv'.

    Reads the file through MyKmeans.readData, builds a subset for the digits
    0 through 9 with createSubset and passes it to visualizeData.
    """
    # This instance is only needed to parse the data file.
    km = MyKmeans()
    parsedData = km.readData('digits-embedding.csv')

    # Every digit is selected for this variant.
    sub_nums = list(range(10))
    subset = createSubset(parsedData, sub_nums)

    # --- The Tasks --- #
    # (1) Visualize the images based on the 2D features.
    visualizeData(subset, sub_nums)