def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Determine the optimal cluster count for a sample file via the elbow
    method, plot the WCE curve, and visualize the resulting k-means clustering.

    Keyword arguments:
        initializer: center-initializer class used by the elbow search
                     (defaults to kmeans_plusplus_initializer).
    """
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(sample_file_path)

    # Elbow search over K in [kmin, kmax).
    analyser = elbow(sample, kmin, kmax, initializer=initializer)
    analyser.process()
    amount_clusters = analyser.get_amount()
    wce = analyser.get_wce()

    # Final clustering with the amount suggested by the elbow method.
    centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    clusterer = kmeans(sample, centers)
    clusterer.process()
    clusters = clusterer.get_clusters()
    centers = clusterer.get_centers()

    print("Sample '%s': Obtained amount of clusters: '%d'." % (sample_file_path, amount_clusters))

    # WCE curve with the detected elbow point highlighted in red.
    figure = plt.figure(1)
    ax = figure.add_subplot(111)
    ax.plot(range(kmin, kmax), wce, color='b', marker='.')
    ax.plot(amount_clusters, wce[amount_clusters - kmin], color='r', marker='.', markersize=10)
    ax.annotate("Elbow", (amount_clusters + 0.1, wce[amount_clusters - kmin] + 5))
    ax.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(sample, clusters, centers)
def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
    """Verify that the elbow method finds the expected amount of clusters.

    The elbow method randomly chooses initial centers, so the check is
    retried up to `repeat` times before the test is declared failed.
    """
    repeat = 10
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)

    sample = read_sample(path_to_data)
    answer = answer_reader(path_to_answer)

    testing_result = False
    additional_info = []  # amounts produced by mismatching attempts, for diagnostics
    for _ in range(repeat):
        instance = elbow(sample, kmin, kmax, ccore=ccore, initializer=initializer)
        instance.process()
        actual_elbow = instance.get_amount()
        actual_wce = instance.get_wce()

        # Sanity checks on the elbow output itself.
        assertion.gt(actual_elbow, kmin)
        assertion.lt(actual_elbow, kmax)
        assertion.eq(len(actual_wce), kmax - kmin)
        # Small tolerance: first and last WCE may be numerically equal.
        assertion.lt(actual_wce[-1], actual_wce[0] + 0.0000001)

        if actual_elbow != len(answer.get_clusters()):
            additional_info.append(actual_elbow)
            continue

        testing_result = True
        break

    message = str(len(answer.get_clusters())) + ": " + str(additional_info)
    assertion.true(testing_result, message=message)
def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
    """Verify that the elbow method finds the expected amount of clusters.

    The elbow method randomly chooses initial centers, therefore the check
    is repeated up to `repeat` times before the test is declared failed.
    """
    repeat = 5
    testing_result = False
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(path_to_data)
    answer = answer_reader(path_to_answer)
    additional_info = []  # mismatching amounts, reported on final failure
    for _ in range(repeat):
        elbow_instance = elbow(sample, kmin, kmax, ccore=ccore, initializer=initializer)
        elbow_instance.process()
        actual_elbow = elbow_instance.get_amount()
        actual_wce = elbow_instance.get_wce()
        assertion.gt(actual_elbow, kmin)
        assertion.lt(actual_elbow, kmax)
        assertion.eq(len(actual_wce), kmax - kmin)
        # Fix: add the same tolerance the sibling calculate_elbow helpers use,
        # so numerically-equal first/last WCE values do not fail spuriously.
        assertion.lt(actual_wce[-1], actual_wce[0] + 0.0000001)
        if actual_elbow != len(answer.get_clusters()):
            additional_info.append(actual_elbow)
            continue
        testing_result = True
        break
    # Fix: attach a diagnostic message (expected amount + observed amounts)
    # instead of failing silently, consistent with the other helpers.
    message = str(len(answer.get_clusters())) + ": " + str(additional_info)
    assertion.true(testing_result, message=message)
def kmediansWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    """Cluster `nameData` with k-medians and compute quality scores.

    Computes silhouette, Davies-Bouldin and Calinski-Harabasz scores for a
    k-medians clustering, then runs the elbow method over [kmin, kmax).
    NOTE(review): the CSV-writing calls are commented out, so the scores are
    currently computed but not persisted; `nameSilhouetteMean`, `nameDBS`,
    `nameCHS` and `measure` are only used by that disabled code.
    """
    import os  # local import: portable path construction

    # Fix: use os.path.join instead of a hard-coded '\\' separator so the
    # path also works on non-Windows platforms.
    data = read_sample(os.path.join(str(root), nameData))

    initial_medians = kppi(data, k_clusters).initialize()
    kmedians_instance = kmedians(data, initial_medians)
    kmedians_instance.process()
    clusters = kmedians_instance.get_clusters()
    # final_medians = kmedians_instance.get_medians()
    predicted = kmedians_instance.predict(data)

    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    # witCSV(silhouetteScore, filenameSilhouette, '', root)
    # witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)

    dbsScore = dbs(data, predicted)
    # witCSV(dbsScore, nameDBS, '', root)
    chsScore = chs(data, predicted)
    # witCSV(chsScore, nameCHS, '', root)

    # Elbow analysis for the most probable amount of clusters.
    elbow_instance = elbow(data, kmin, kmax)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount()  # most probable amount of clusters
    wce = elbow_instance.get_wce()
def subcluster(dataset):
    """Cluster `dataset` with k-means, choosing the cluster count automatically.

    The amount of clusters is picked with the elbow method when the search
    range is wide enough; for very narrow ranges the midpoint is used instead.
    Returns the list of clusters (each a list of point indices).
    """
    kmin, kmax = 1, 20
    kmax = min(kmax, len(dataset))

    # Determining the number of clusters.
    # Might potentially be an inefficient technique: instead of the elbow
    # method one could repeat the main clustering over all K values and pick
    # the one with the lowest error from calcError, but that would be very
    # time intensive.
    if kmax - kmin <= 3:
        # Range too narrow for a meaningful elbow — take the midpoint.
        optimal_clusters = int((kmin + kmax) / 2)
    else:
        elbow_inst = elbow(dataset, kmin, kmax)
        elbow_inst.process()
        optimal_clusters = elbow_inst.get_amount()
    optimal_clusters = min(optimal_clusters, len(dataset))

    initial_centers = kmeans_plusplus_initializer(dataset, optimal_clusters).initialize()
    clusterer = kmeans(dataset, initial_centers,
                       metric=distance_metric(type_metric.EUCLIDEAN))
    clusterer.process()
    return clusterer.get_clusters()
def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
    """Assert that the elbow method detects the expected cluster amount.

    Initial centers are chosen randomly by the elbow method, so up to
    `attempts` runs are made before the assertion finally fails.
    """
    attempts = 10
    chosen_initializer = kwargs.get('initializer', kmeans_plusplus_initializer)

    points = read_sample(path_to_data)
    expected = len(answer_reader(path_to_answer).get_clusters())

    succeeded = False
    observed_amounts = []  # collected for the failure message
    for _ in range(attempts):
        analyser = elbow(points, kmin, kmax, ccore=ccore, initializer=chosen_initializer)
        analyser.process()
        found_amount = analyser.get_amount()
        found_wce = analyser.get_wce()

        # Structural checks on the elbow result.
        assertion.gt(found_amount, kmin)
        assertion.lt(found_amount, kmax)
        assertion.eq(len(found_wce), kmax - kmin)
        # Epsilon tolerates first == last WCE values.
        assertion.lt(found_wce[-1], found_wce[0] + 0.0000001)

        if found_amount == expected:
            succeeded = True
            break
        observed_amounts.append(found_amount)

    assertion.true(succeeded, message=str(expected) + ": " + str(observed_amounts))
def calculateAppropriateNumberOfClusters(data, minimum, maximum, specInit=random_center_initializer):
    """Run the elbow method over [minimum, maximum) and return the result.

    Returns a tuple (amount_of_clusters, wce_sequence).
    """
    analyser = elbow(data, minimum, maximum, initializer=specInit)
    analyser.process()
    return (analyser.get_amount(), analyser.get_wce())
def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Elbow-based cluster-count analysis for a sample file.

    Finds the most probable amount of clusters via the elbow method, reports
    it, plots the WCE curve with the elbow marked, and finally shows the
    k-means clustering produced with that amount.
    """
    init_class = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(sample_file_path)

    elbow_instance = elbow(sample, kmin, kmax, initializer=init_class)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount()
    wce = elbow_instance.get_wce()

    # Cluster with k-means++ seeding using the detected amount.
    centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    kmeans_instance = kmeans(sample, centers)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    print("Sample '%s': Obtained amount of clusters: '%d'." % (sample_file_path, amount_clusters))

    # Draw the WCE-vs-K curve; the elbow point gets a larger red marker.
    ax = plt.figure(1).add_subplot(111)
    elbow_index = amount_clusters - kmin
    ax.plot(range(kmin, kmax), wce, color='b', marker='.')
    ax.plot(amount_clusters, wce[elbow_index], color='r', marker='.', markersize=10)
    ax.annotate("Elbow", (amount_clusters + 0.1, wce[elbow_index] + 5))
    ax.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(sample, clusters, centers)
def calculateAppropriateNumberOfClusters(data, minimum, maximum):
    """Return the amount of clusters chosen by the elbow method.

    Uses random center initialization for the elbow search over
    [minimum, maximum).
    """
    analyser = elbow(data, minimum, maximum,
                     initializer=random_center_initializer)
    analyser.process()
    return analyser.get_amount()
def random_state_fixed(path_to_data, kmin, kmax, ccore, **kwargs):
    """Check that two elbow runs with identical arguments give identical results.

    Keyword arguments are forwarded to elbow() unchanged, except 'repeat'
    (number of times to perform the double-run comparison, default 1), which
    is consumed here.
    """
    # Fix: pop 'repeat' so it is not forwarded into elbow(**kwargs) —
    # it is a parameter of this helper, not of the elbow algorithm.
    repeat = kwargs.pop('repeat', 1)

    # Hoisted out of the loop: the sample file does not change between runs.
    sample = read_sample(path_to_data)

    for _ in range(repeat):
        first = elbow(sample, kmin, kmax, ccore=ccore, **kwargs).process()
        elbow_1 = first.get_amount()
        wce_1 = first.get_wce()

        second = elbow(sample, kmin, kmax, ccore=ccore, **kwargs).process()
        elbow_2 = second.get_amount()
        wce_2 = second.get_wce()

        # With a fixed random_state both runs must be fully reproducible.
        assertion.eq(elbow_1, elbow_2)
        assertion.eq(wce_1, wce_2)
def elbow_k_means(key_word, model_path):
    """Cluster the 100 words most similar to `key_word` from a word model.

    The word vectors are projected to 2-D with PCA, the amount of clusters
    is chosen by the elbow method, and the words are grouped with k-means.

    Returns:
        (amount_of_clusters, cluster_list) where cluster_list is a list of
        {"words": [{"word": {"text": ..., "correlation": ...}}, ...]} dicts.
    """
    logger = Logger(model_path)
    model = logger.model
    result = model.most_similar(key_word, topn=100)

    word_vectors = []
    word_names = []
    word_correlation = []
    for r in result:  # r is (word, similarity)
        word_vectors.append(model.wv[r[0]])
        word_names.append(r[0])
        word_correlation.append(r[1])

    # 2-D projection. NOTE: despite the historical variable name this is PCA,
    # not t-SNE.
    projector = PCA(n_components=2)
    X_2d = projector.fit_transform(word_vectors)

    # Elbow search for the most probable amount of clusters in [1, 10).
    kmin, kmax = 1, 10
    elbow_instance = elbow(X_2d, kmin, kmax)
    elbow_instance.process()
    amount_clusters = elbow_instance.get_amount()

    centers = kmeans_plusplus_initializer(
        X_2d, amount_clusters,
        amount_candidates=kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
    k_means_instance = kmeans(X_2d, centers)
    k_means_instance.process()
    clusters = k_means_instance.get_clusters()

    # Fix: removed unused locals from the original (num_clusters, wce, the
    # get_centers() reassignment, index_to_word, index_to_correlation, idx).
    cluster_list = []
    for c in clusters:
        words_list = []
        for i in c:
            word_dict = dict()
            word_dict["text"] = word_names[i]
            word_dict["correlation"] = word_correlation[i]
            t_dict = dict()
            t_dict["word"] = word_dict
            words_list.append(t_dict)
        words_dict = dict()
        words_dict["words"] = words_list
        cluster_list.append(words_dict)

    return len(clusters), cluster_list
def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
    """Assert that the elbow method finds the expected amount of clusters.

    `path_to_answer` may be None (no expectation), an int (expected amount),
    or a path to an answer file. Because initial centers are random, up to
    `repeat` attempts are made before failing.
    """
    repeat = 15
    kstep = kwargs.get('kstep', 1)
    sample = read_sample(path_to_data)

    # Resolve the expected amount of clusters, if any was provided.
    if path_to_answer is None:
        expected_clusters_amount = None
    elif isinstance(path_to_answer, int):
        expected_clusters_amount = path_to_answer
    else:
        expected_clusters_amount = len(answer_reader(path_to_answer).get_clusters())

    testing_result = False
    additional_info = []  # mismatching amounts, for the failure message
    for _ in range(repeat):
        instance = elbow(sample, kmin, kmax, ccore=ccore, **kwargs)
        instance.process()
        actual_elbow = instance.get_amount()
        actual_wce = instance.get_wce()

        assertion.gt(actual_elbow, kmin)
        assertion.lt(actual_elbow, kmax)
        # WCE is sampled every `kstep` values of K.
        assertion.eq(len(actual_wce), math.floor((kmax - kmin) / kstep + 1))
        # Epsilon tolerates numerically equal first/last WCE values.
        assertion.lt(actual_wce[-1], actual_wce[0] + 0.0000001)

        if (expected_clusters_amount is not None) and (actual_elbow != expected_clusters_amount):
            additional_info.append(actual_elbow)
            continue

        testing_result = True
        break

    message = None
    if expected_clusters_amount is not None:
        message = str(expected_clusters_amount) + ": " + str(additional_info)
    assertion.true(testing_result, message=message)
def run_elbow(data):
    """Choose a cluster count for `data` via the elbow method (K in [1, 10))
    and visualize the resulting k-means clustering."""
    kmin, kmax = 1, 10

    # Run the elbow analysis to obtain the most probable amount of clusters.
    analyser = elbow(data, kmin, kmax)
    analyser.process()
    amount_clusters = analyser.get_amount()

    # K-means with k-means++ seeding (farthest-center candidate strategy).
    centers = kmeans_plusplus_initializer(
        data, amount_clusters,
        amount_candidates=kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize()
    clusterer = kmeans(data, centers)
    clusterer.process()

    kmeans_visualizer.show_clusters(data, clusterer.get_clusters(), clusterer.get_centers())
def subcluster(dataset):
    """Cluster `dataset` with k-means, picking the cluster count automatically.

    The search range runs from the point dimensionality up to the dataset
    size; the elbow method selects the count unless the range is too narrow,
    in which case the midpoint is used. Returns the clusters as lists of
    point indices.
    """
    # NOTE(review): kmin is the dimensionality of a point here — presumably
    # intentional, but verify against the caller.
    kmin = len(dataset[0])
    kmax = len(dataset)

    if kmax - kmin <= 3:
        # Range too narrow for a meaningful elbow — use the midpoint.
        optimal_clusters = int((kmin + kmax) / 2)
    else:
        analyser = elbow(dataset, kmin, kmax)
        analyser.process()
        optimal_clusters = analyser.get_amount()

    # Never request more clusters than there are points.
    optimal_clusters = min(optimal_clusters, len(dataset))

    seeds = kmeans_plusplus_initializer(dataset, optimal_clusters).initialize()
    clusterer = kmeans(dataset, seeds,
                       metric=distance_metric(type_metric.EUCLIDEAN))
    clusterer.process()
    return clusterer.get_clusters()
def elbow_kmeans_optimizer(X, k=None, kmin=1, kmax=5, visualize=True):
    """k-means clustering with or without automatically determined cluster numbers.

    Reference: https://pyclustering.github.io/docs/0.8.2/html/d3/d70/classpyclustering_1_1cluster_1_1elbow_1_1elbow.html

    # Arguments:
        X (numpy array-like): Input data matrix.
        k: Fixed number of clusters; when None, the elbow method chooses it.
        kmin: Minimum number of clusters to consider. Defaults to 1.
        kmax: Maximum number of clusters to consider. Defaults to 5.
        visualize: Whether to perform k-means visualization or not.

    # Returns:
        numpy arraylike: Clusters.
        numpy arraylike: Cluster centers.
    """
    # Fix: removed imports the function never used (read_sample,
    # SIMPLE_SAMPLES, random_center_initializer, ccore_library, wrapper).
    from pyclustering.cluster.kmeans import kmeans
    from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
    from pyclustering.cluster.elbow import elbow
    from pyclustering.cluster.kmeans import kmeans_visualizer

    if k is not None:
        amount_clusters = k
    else:
        elbow_instance = elbow(X, kmin, kmax)
        elbow_instance.process()
        amount_clusters = elbow_instance.get_amount()

    centers = kmeans_plusplus_initializer(X, amount_clusters).initialize()
    kmeans_instance = kmeans(X, centers)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    # Fix: honor the `visualize` flag — previously the plot was always shown.
    if visualize:
        kmeans_visualizer.show_clusters(X, clusters, centers)

    return clusters, centers
c += 1 #Setting up training and testing data X = np.array(list(job_data.values())) job_data.clear() load.clear() rows.clear() X_normed = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) Y = np.array(list(wait_time.values())) R = np.array(list(run_time.values())) X_train, X_test = X_normed[:20000], X_normed[20000 + 1:60000] Y_train, Y_test = Y[:20000], Y[20000 + 1:60000] R_train, R_test = R[:20000], R[20000 + 1:60000] #Applying ELBOW elbow_instance = elbow(X_train, 1, 50) elbow_instance.process() amount_clusters = elbow_instance.get_amount( ) # most probable amount of clusters print(amount_clusters) kmeans = KMeans(n_clusters=amount_clusters, random_state=0).fit(X_train) joblib.dump(kmeans, 'XGB/Kmeans.pkl') reg_models = [] for i in range(amount_clusters): x = X_train[kmeans.labels_ == i] y = Y_train[kmeans.labels_ == i] x = x[:, :4] reg = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='mae', colsample_bytree=0.75, learning_rate=0.01,
def createvisual(fileNum, type):
    """Cluster the columns of a query-behavior matrix and render a before/after image.

    Reads the sample matrix for `fileNum`, clusters its rows with k-means
    (Hamming-like user metric when type == 0, Euclidean otherwise), reorders
    the columns of the transposed matrix so clustered columns are adjacent,
    compares the reordering error against the original via calcError, and
    saves a side-by-side image of both matrices.

    NOTE(review): `type` shadows the builtin, and `f` (the error-log file
    handle) is a global defined outside this function — confirm it is open
    when this is called.
    """
    sample = read_sample("TestData/QueryBehaviorText/" + str(fileNum) + ".txt")
    # Sample is simply matrix holding values, can be accessed for values just like any other

    # Elbow search over the full possible range of cluster counts.
    kmin, kmax = 1, len(sample)
    elbow_inst = elbow(sample, kmin, kmax)
    elbow_inst.process()
    optimal_clusters = elbow_inst.get_amount()
    initial_centers = kmeans_plusplus_initializer(sample, optimal_clusters).initialize()

    # Hamming-style distance: counts positions where the two points differ.
    # user_function = lambda point1, point2: sum(l1 != 12 for l1, l2 in zip(point1, point2))
    user_function = lambda point1, point2: np.count_nonzero(np.array(point1) != np.array(point2))
    metricUser = distance_metric(type_metric.USER_DEFINED, func=user_function)
    # print(metricUser([0, 1, 1], [0, 0, 1]))

    # type == 0 selects the user-defined (Hamming-like) metric; otherwise Euclidean.
    metric = distance_metric(type_metric.EUCLIDEAN)
    if type == 0:
        metric = distance_metric(type_metric.USER_DEFINED, func=user_function)

    kmeans_instance = kmeans(sample, initial_centers, metric=metric)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    print(fileNum)
    print(clusters)
    print("\n\n\n")
    final_centers = kmeans_instance.get_centers()

    # mockDataArr[i]  = which original column currently sits at position i.
    mockDataArr = []
    for i in range(len(sample)):
        mockDataArr.append(i)
    # mockDataPos[c] = current position of original column c.
    mockDataPos = {}
    for i in range(len(sample)):
        mockDataPos[i] = i
    # print("\n")
    # print("Position Mapping Hashmap: ", mockDataPos)
    # print("\n")
    # print("Initial Column Positions: ", mockDataArr, "\n")

    # Desired column order: cluster members laid out contiguously.
    mockDataClustered = []
    for cluster in clusters:
        mockDataClustered.extend(cluster)

    imageData = []
    origMulitDimen = np.array(sample, dtype=int)
    # print("Original Coordinates")
    # print(origMulitDimen)

    # Work on the transpose so clustered rows become columns of the image.
    numpyChar = np.transpose(origMulitDimen)
    originalSave = np.copy(numpyChar)
    printNumpy = np.insert(numpyChar, 0, mockDataArr, 0)

    # In-place column swaps to realize the clustered order, keeping the
    # position map and order array consistent after each swap.
    # NOTE(review): range stops at len-1 — the last column is never swapped
    # explicitly (it should already be in place by then); confirm intended.
    for i in range(len(mockDataArr) - 1):
        if i != mockDataPos[mockDataClustered[i]]:
            # Swap column i with the column currently holding the wanted one.
            temp = np.copy(numpyChar[:, i])
            realTemp = mockDataArr[i]
            mockDataArr[i] = mockDataArr[mockDataPos[mockDataClustered[i]]]
            mockDataArr[mockDataPos[mockDataClustered[i]]] = realTemp
            numpyChar[:, i] = numpyChar[:, mockDataPos[mockDataClustered[i]]]
            numpyChar[:, mockDataPos[mockDataClustered[i]]] = temp
            # Update the position map for both swapped columns.
            temp2 = mockDataPos[mockDataClustered[i]]
            mockDataPos[mockDataClustered[i]] = i
            mockDataPos[realTemp] = temp2

    printArray = np.insert(numpyChar, 0, np.array(mockDataArr), 0)

    # Compare reordering quality: clustered error vs original error.
    swappederror = calcError(numpyChar)
    defaulterror = calcError(originalSave)
    f.write(str(swappederror / defaulterror * 100) + "\n")
    print(swappederror / defaulterror)

    # Side-by-side rendering of the clustered and original matrices.
    fig, ax = plt.subplots(1, 2)
    fig.suptitle('Clusters: ' + str(len(clusters)), fontsize=20)
    fig.text(.5, .05, 'Clustered Columns: ' + str(clusters), ha='center')
    fig.text(.5, .1, 'Original Error: ' + str(defaulterror), ha='center')
    fig.text(.5, .15, 'Clustered Error: ' + str(swappederror), ha='center')
    ax[0].imshow(numpyChar, cmap=plt.cm.Greys)
    ax[1].imshow(originalSave, cmap=plt.cm.Greys)
    ax[0].title.set_text('Clustered Characteristic Matrix')
    ax[1].title.set_text('Original Charecteristic Matrix')
    fig.set_size_inches(10, 7)
    if type == 0:
        plt.savefig("TestData/QueryBehaviorVisualsHamming/" + str(fileNum) + ".png")
    else:
        plt.savefig("TestData/QueryBehaviorVisualsEuclidean/" + str(fileNum) + ".png")
print('Successfully loaded all modules') # load image encodings print('Loading encodings...') start = time.time() encodings = "encodings.npy" encodings = np.load(encodings) stop = time.time() print('Encodings loaded successfully', '[', round(stop - start, 2), 'seconds ]') # create elbow instance print('Creating elbow instance...') start = time.time() elbow_instance = elbow(encodings, 2, 100) stop = time.time() print('Elbow instance created', '[', round(stop - start, 2), 'seconds ]') # find the optimal value for K (no. of groups) using elbow algorithm print('Getting the optimal number of clusters using elbow...') start = time.time() elbow_instance.process() K = elbow_instance.get_amount() stop = time.time() print(K, 'clusters should be formed according to Elbow', '[', round(stop - start, 2), 'seconds ]') # load the distance matrix (similarity matrix) print('Loading the similarity (distance) matrix...') start = time.time()
def _optimal_cluster(self, kmax=50):
    """Run the elbow method on the trajectory values.

    Searches K in [1, kmax) and returns a tuple of the most probable
    amount of clusters and the within-cluster-error sequence.
    """
    analyser = elbow(self.traj.values, 1, kmax)
    analyser.process()
    return analyser.get_amount(), analyser.get_wce()
if maxClusters > len(mash_mat): print( "Warning: max number of clusters exceeds size of mash matrix. Reducing maxClusters." ) maxClusters = len(mash_mat) # collapse the distance matrix X = distance.pdist(mash_mat).reshape(-1, 1) # define the range of number of clusters to test range_n_clusters = range(minClusters, maxClusters + 1) # use the elbow method if clusterMethod == "elbow": elbow_instance = elbow(X, range_n_clusters[0], range_n_clusters[-1] + 1) elbow_instance.process() wce = elbow_instance.get_wce() chosen_nClusters = elbow_instance.get_amount() if interactive: plt.plot(range_n_clusters, wce, 'bx-') plt.xlabel('#Clusters') plt.ylabel("Distortion") plt.title("Elbow Method showing optimal K") plt.draw() plt.pause(0.001) manual_clusters = input( "Chose {} clusters. OK? [y/n] ".format(chosen_nClusters)) if manual_clusters in ["n", "N", "No", "no"]: