def templateClusterAllocationTheSameObjects(number_objects, number_clusters, ccore_flag = False):
    """Check that identical 1-D points are still split over the requested clusters.

    Every point shares one coordinate, so the algorithm must nevertheless
    report one medoid per cluster, all medoids unique, and assign each
    point to exactly one cluster.
    """
    coordinate = random()
    points = [[coordinate]] * number_objects

    # evenly spaced starting medoid indexes
    stride = int(math.floor(number_objects / number_clusters))
    start_medoids = [cluster_index * stride for cluster_index in range(number_clusters)]

    algorithm = kmedoids(points, start_medoids, ccore=ccore_flag)
    algorithm.process()

    found_clusters = algorithm.get_clusters()
    found_medoids = algorithm.get_medoids()

    assertion.eq(len(found_clusters), len(found_medoids))
    assertion.eq(len(set(found_medoids)), len(found_medoids))

    seen = [False] * number_objects
    total_assigned = 0
    for single_cluster in found_clusters:
        for point_index in single_cluster:
            # one object can be in only one cluster.
            assertion.eq(False, seen[point_index])
            seen[point_index] = True
            total_assigned += 1

    # number of allocated objects should be the same.
    assertion.eq(number_objects, total_assigned)
def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
    """Cluster a sample file with K-Medoids and verify cluster/medoid invariants.

    Parameters:
        path_to_file: path of the sample file to read.
        initial_medoids: indexes of starting medoids (re-drawn with k-means++
            when 'initialize_medoids' is passed through kwargs).
        expected_cluster_length: expected cluster sizes (order-insensitive) or None.
        metric: distance metric; None means squared Euclidean.
        ccore_flag: when True the C++ core implementation is used.

    Supported kwargs: data_type ('points'/'distance_matrix'),
    input_type ('list'/'numpy'), initialize_medoids (amount of medoids
    for the k-means++ initializer), itermax (default 200).
    """
    sample = read_sample(path_to_file)

    data_type = kwargs.get('data_type', 'points')
    input_type = kwargs.get('input_type', 'list')
    initialize_medoids = kwargs.get('initialize_medoids', None)
    itermax = kwargs.get('itermax', 200)

    if metric is None:
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    input_data = sample
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)

    if input_type == 'numpy':
        input_data = numpy.array(input_data)

    testing_result = False
    testing_attempts = 1
    if initialize_medoids is not None:
        # in case center initializer randomization appears
        testing_attempts = 10

    for _ in range(testing_attempts):
        if initialize_medoids is not None:
            initial_medoids = kmeans_plusplus_initializer(sample, initialize_medoids).initialize(return_index=True)

        kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore_flag, metric=metric, data_type=data_type, itermax=itermax)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        if itermax == 0:
            # zero iterations: nothing clustered, medoids untouched
            assertion.eq([], clusters)
            assertion.eq(medoids, initial_medoids)
            return

        if len(clusters) != len(medoids):
            continue

        if len(set(medoids)) != len(medoids):
            continue

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        if len(sample) != sum(obtained_cluster_sizes):
            continue

        if expected_cluster_length is not None:
            obtained_cluster_sizes.sort()
            # compare against a sorted copy so the caller's list is not
            # mutated (previously sorted in place)
            if obtained_cluster_sizes != sorted(expected_cluster_length):
                continue

        testing_result = True
        break    # all invariants hold - no need to burn the remaining attempts

    assertion.true(testing_result)
def templateClusterAllocationOneDimensionData(self):
    """Four well-separated 1-D groups of ten points must yield four clusters of ten."""
    input_data = []
    for offset in (0, 3, 5, 8):
        input_data += [[random() + offset] for _ in range(10)]

    # one starting medoid inside each band of ten points
    instance = kmedoids(input_data, [5, 15, 25, 35], 0.025)
    instance.process()

    allocated_clusters = instance.get_clusters()
    assert len(allocated_clusters) == 4
    for single_cluster in allocated_clusters:
        assert len(single_cluster) == 10
def templateClusterAllocationOneDimensionData(ccore_flag):
    """Four separated 1-D groups (10 points each) must form four clusters of ten."""
    input_data = [[random() + shift] for shift in (0, 3, 5, 8) for _ in range(10)]

    instance = kmedoids(input_data, [5, 15, 25, 35], 0.025, ccore_flag)
    instance.process()

    result_clusters = instance.get_clusters()
    assertion.eq(4, len(result_clusters))
    for single_cluster in result_clusters:
        assertion.eq(10, len(single_cluster))
def template_clustering(start_medoids, path, tolerance = 0.25):
    """Run K-Medoids over the sample stored at *path* and draw the clusters."""
    points = read_sample(path)

    algorithm = kmedoids(points, start_medoids, tolerance)
    (elapsed, _) = timedcall(algorithm.process)
    found_clusters = algorithm.get_clusters()

    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")
    draw_clusters(points, found_clusters)
def templateLengthProcessData(self, path_to_file, start_centers, expected_cluster_length):
    """Verify that clustering *path_to_file* yields the expected cluster sizes."""
    points = read_sample(path_to_file)

    algorithm = kmedoids(points, start_centers, 0.025)
    algorithm.process()

    actual_sizes = [len(single_cluster) for single_cluster in algorithm.get_clusters()]
    # every point must be allocated exactly once
    assert len(points) == sum(actual_sizes)

    actual_sizes.sort()
    expected_cluster_length.sort()
    assert actual_sizes == expected_cluster_length
def template_clustering(start_medoids, path, tolerance = 0.25, show = True):
    """Cluster the sample at *path*; optionally visualize; return (sample, clusters)."""
    sample = read_sample(path)

    algorithm = kmedoids(sample, start_medoids, tolerance)
    (elapsed, _) = timedcall(algorithm.process)
    found_clusters = algorithm.get_clusters()

    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")

    if show is True:
        view = cluster_visualizer(1)
        view.append_clusters(found_clusters, sample, 0)
        view.show()

    return (sample, found_clusters)
def template_clustering(start_medoids, path, tolerance = 0.25, show = True):
    """Cluster the sample at *path*, optionally plotting start and final medoids."""
    sample = read_sample(path)

    algorithm = kmedoids(sample, start_medoids, tolerance)
    (elapsed, _) = timedcall(algorithm.process)

    found_clusters = algorithm.get_clusters()
    final_medoids = algorithm.get_medoids()

    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")

    if show is True:
        view = cluster_visualizer(1)
        view.append_clusters(found_clusters, sample, 0)
        # starting medoids passed as coordinates, final medoids as indexes
        view.append_cluster([sample[index] for index in start_medoids], marker='*', markersize=15)
        view.append_cluster(final_medoids, data=sample, marker='*', markersize=15)
        view.show()

    return sample, found_clusters
def templateAllocateRequestedClusterAmount(data, amount_clusters, initial_medoids, ccore_flag):
    """Ensure K-Medoids returns exactly *amount_clusters* clusters covering all data."""
    if initial_medoids is None:
        # draw unique random starting medoids
        initial_medoids = []
        while len(initial_medoids) < amount_clusters:
            candidate = randint(0, len(data) - 1)
            if candidate not in initial_medoids:
                initial_medoids.append(candidate)

    algorithm = kmedoids(data, initial_medoids, 0.025, ccore = ccore_flag)
    algorithm.process()

    found_clusters = algorithm.get_clusters()
    assertion.eq(len(found_clusters), amount_clusters)

    amount_objects = sum(len(single_cluster) for single_cluster in found_clusters)
    assertion.eq(amount_objects, len(data))
def templateClusterAllocationTheSameObjects(self, number_objects, number_clusters, ccore_flag = False):
    """All points identical: every object must still land in exactly one cluster."""
    same_value = random()
    input_data = [[same_value]] * number_objects

    stride = math.floor(number_objects / number_clusters)
    initial_medoids = [index * stride for index in range(number_clusters)]

    algorithm = kmedoids(input_data, initial_medoids)
    algorithm.process()
    found_clusters = algorithm.get_clusters()

    assigned = [False] * number_objects
    total_assigned = 0
    for single_cluster in found_clusters:
        for object_index in single_cluster:
            # one object can be in only one cluster.
            assert assigned[object_index] == False
            assigned[object_index] = True
            total_assigned += 1

    # number of allocated objects should be the same.
    assert number_objects == total_assigned
def registration_icp(static, moving, points=20, pca=True, maxiter=100000, affine=[0, 0, 0, 0, 0, 0, 1], clustering=None, medoids=[0, 1, 2], k=3, beta=999, max_dist=40, dist='pc'): options = { 'maxcor': 10, 'ftol': 1e-7, 'gtol': 1e-5, 'eps': 1e-8, 'maxiter': maxiter } #options1 = {'xtol': 1e-6, 'ftol': 1e-6, 'maxiter': 1e6} if pca: moving = pca_transform_norm(static, moving, max_dist) else: mean_m = np.mean(np.concatenate(moving), axis=0) mean_s = np.mean(np.concatenate(static), axis=0) moving = [i - mean_m + mean_s for i in moving] original_moving = moving.copy() static = set_number_of_points(static, points) moving = set_number_of_points(moving, points) if clustering == 'kmeans': kmeans = KMeans(k).fit(np.concatenate(moving)) idx = {i: np.where(kmeans.labels_ == i)[0] for i in range(k)} #dist = Clustering().distance_pc_clustering_mean if dist == 'pc': dist_fun = distance_pc_clustering_mean else: dist_fun = distance_tract_clustering_mean args = (static, moving, kmeans, idx, beta, max_dist) print('kmeans') elif clustering == 'kmedoids': k_medoids = kmedoids(np.concatenate(moving), medoids) k_medoids.process() #dist = Clustering().distance_pc_clustering_medoids if dist == 'pc': dist_fun = distance_pc_clustering_medoids else: dist_fun = distance_tract_clustering_medoids args = (static, moving, k_medoids, beta, max_dist) print('kmedoids') else: if dist == 'pc': dist_fun = distance_pc args = (static, moving, beta, max_dist) else: dist_fun = distance_mdf args = (static, moving) print('Without Clustering') 'L-BFGS-B,Powell' m = Optimizer(dist_fun, affine, args=args, method='L-BFGS-B', options=options) #m = Optimizer(dist, affine,args=args,method='Powell',options=options1) m.print_summary() mat = compose_matrix44(m.xopt) return transform_streamlines(original_moving, mat)
def build_clusterer(data, nclusters, method, **kwargs):
    """
    A simple wrapper to various clustering approaches.
    Cluster the given data into nclusters by using the specified
    method. Depending on the specified method different packages
    may be required and different arguments are expected
    in the kwargs dict.
    """
    features = copy.deepcopy(kwargs["config"]["features"])
    print("{0} cluster features used {1}".format(INFO, features))
    windows = []

    # special-cased features are removed from the plain feature list
    # and handled individually when building each window's vector
    has_gc = False
    if 'gc' in features:
        features.pop(features.index('gc'))
        has_gc = True

    has_mean_ratio = False
    if 'mean_ratio' in features:
        features.pop(features.index('mean_ratio'))
        has_mean_ratio = True

    has_wga_mean = False
    if 'wga_mean' in features:
        features.pop(features.index('wga_mean'))
        has_wga_mean = True

    has_no_wga_mean = False
    if 'no_wga_mean' in features:
        features.pop(features.index('no_wga_mean'))
        has_no_wga_mean = True

    # build the feature vector for every window
    for window in data:
        if has_wga_mean:
            window_values = [window.get_feature(feature='mean', name=WindowType.WGA)]
        elif has_no_wga_mean:
            window_values = [window.get_feature(feature='mean', name=WindowType.NO_WGA)]
        else:
            window_values = window.get_features(features=features)

        if has_gc:
            window_values.append(window.get_feature(feature='gc', name=WindowType.WGA))

        if has_mean_ratio:
            means = window.get_features(features=['mean'])
            # +1 on both means guards against division by zero
            ratio = (means[0] + 1) / (means[1] + 1)
            window_values.append(ratio)

        windows.append(window_values)

    if method == "kmeans":
        from sklearn.cluster import KMeans
        clusterer = KMeans(n_clusters=nclusters)
        clusterer.fit(windows)
        return clusterer
    elif method == "kmedoids":
        from pyclustering.cluster.kmedoids import kmedoids

        metric = get_distance_metric(dist_metric=kwargs["config"]["metric"].upper(),
                                     degree=kwargs["config"]["metric_degree"]
                                     if 'metric_degree' in kwargs["config"] else 0)

        if kwargs["config"]["init_cluster_idx"] == "random_from_data":
            import random
            # BUGFIX: the previous retry loop gave up after ten collisions,
            # so it could silently hand fewer than nclusters initial medoids
            # to kmedoids; random.sample guarantees nclusters distinct indexes.
            initial_index_medoids = random.sample(range(len(windows)), nclusters)
        else:
            initial_index_medoids = kwargs["config"]["init_cluster_idx"]

        clusterer = kmedoids(data=windows, initial_index_medoids=initial_index_medoids, metric=metric)
        clusterer.process()
        return clusterer, initial_index_medoids

    raise Error("Invalid clustering method: " + method)
def LPAM(graph, k=2, threshold=0.5, distance="amp", seed=0):
    """
    Link Partitioning Around Medoids

    :param graph: a networkx object
    :param k: number of clusters
    :param threshold: merging threshold in [0,1], default 0.5
    :param distance: type of distance: "amp" - amplified commute distance, or
        "cm" - commute distance, or distance matrix between all edges as np ndarray
    :param seed: random seed for k-medoid heuristic
    :return: NodeClustering object

    :Example:

    >>> from cdlib import algorithms
    >>> import networkx as nx
    >>> G = nx.karate_club_graph()
    >>> coms = algorithms.lpam(G, k=2, threshold=0.4, distance = "amp")

    :References:

    Link Partitioning Around Medoids https://arxiv.org/abs/1907.08731
    Alexander Ponomarenko, Leonidas Pitsoulis, Marat Shamshetdinov
    """

    def getCommuteDistace(G):
        """
        Returns commute distance matrix
        """
        verts = list(G.nodes)
        n = len(verts)
        vol = nx.volume(G, verts)
        # use NetworkX to get Laplacian
        L = nx.laplacian_matrix(G)
        L = L.todense()
        Gamma = L + (1 / n) * np.ones([n, n])
        CM = np.zeros([n, n])
        # get Moore-Penrose pseudo inverse
        Gamma_pinv = np.linalg.pinv(Gamma, rcond=1e-4)
        for i in range(n):
            for j in range(i + 1, n):
                CM[i, j] = vol * (Gamma_pinv[i, i] + Gamma_pinv[j, j] - 2 * Gamma_pinv[i, j])
                CM[j, i] = CM[i, j]
        return CM

    def getAmp(G):
        """
        Returns amplified commute distance matrix
        """
        verts = list(G.nodes)
        n = len(verts)
        # get adj matrix
        A = nx.adjacency_matrix(G)
        A = A.todense()
        # use NetworkX to get Laplacian
        L = nx.laplacian_matrix(G)
        L = L.todense()
        Gamma = L + (1 / n) * np.ones([n, n])
        C_AMP = np.zeros([n, n])
        # get Moore-Penrose pseudo inverse
        Gamma_pinv = np.linalg.pinv(Gamma, rcond=1e-4)
        for i in range(n):
            for j in range(i + 1, n):
                # resistance dist
                r_ij = (Gamma_pinv[i, i] + Gamma_pinv[j, j] - 2 * Gamma_pinv[i, j])
                d_i = G.degree(list(G.nodes())[i])
                d_j = G.degree(list(G.nodes())[j])
                if d_i != 0 and d_j != 0:
                    s_ij = r_ij - (1 / d_i) - (1 / d_j)
                    w_ij = A[i, j]
                    w_ii = A[i, i]
                    w_jj = A[j, j]
                    u_ij = (((2 * w_ij) / (d_i * d_j)) - (w_ii /
                            (d_i**2)) - (w_jj / (d_j**2)))
                    C_AMP[i, j] = s_ij + u_ij
                    C_AMP[j, i] = s_ij + u_ij
                else:
                    # isolated endpoint: distance undefined
                    # (np.nan instead of np.NaN - the alias is gone in NumPy 2)
                    C_AMP[i, j] = np.nan
                    C_AMP[j, i] = np.nan
        return C_AMP

    # edges of the input graph become vertices of the line graph
    line_graph = nx.line_graph(graph)
    D = None
    distance_name = distance
    if distance == "amp":
        D = getAmp(line_graph)
    if distance == "cm":
        # BUGFIX: previously the function object itself was assigned
        # (D = getCommuteDistace), so D was never a distance matrix and
        # every downstream use failed; the function must be called.
        D = getCommuteDistace(line_graph)
    if isinstance(distance, np.ndarray):
        D = distance
        distance_name = "custom"
    if D is None:
        raise TypeError(
            'Parameter distance should be "amp"/"cm", or numpy.ndarray')

    _n = len(line_graph.nodes())
    # BUGFIX: honor the documented 'seed' parameter (was hard-coded to 0;
    # default seed=0 keeps prior behavior for existing callers).
    np.random.seed(seed)
    initial_medoids = np.random.choice(_n, k, replace=False)
    kmedoids_instance = kmedoids(D, initial_medoids, data_type="distance_matrix")
    # run cluster analysis and obtain results
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()

    # collect, per original node, the cluster id of every incident edge
    final_clusters = {}
    for c_i, c in enumerate(clusters):
        for line_vertex in c:
            source, target = list(line_graph.nodes())[line_vertex]
            if source not in final_clusters:
                final_clusters[source] = []
            final_clusters[source].append(c_i)
            if target not in final_clusters:
                final_clusters[target] = []
            final_clusters[target].append(c_i)

    # per node, fraction of its incident edges covered by each cluster
    res_clusters = {}
    for v, l in final_clusters.items():
        degree = len(l)
        res = defaultdict(list)
        for x in l:
            res[x].append(x)
        covering = np.zeros(k)
        for c_i, _l in res.items():
            covering[c_i] = len(_l) / degree
        res_clusters[v] = covering

    # a node joins every cluster covering at least 'threshold' of its edges
    _res_clusters = [[] for i in range(k)]
    for v, l in res_clusters.items():
        for i in range(k):
            if l[i] >= threshold:
                _res_clusters[i].append(v)

    return NodeClustering(
        communities=[c for c in _res_clusters if len(c) > 0],
        graph=graph,
        method_name="lpam " + distance_name,
        method_parameters={
            "k": k,
            "threshold": threshold,
            "distance": distance_name,
            "seed": seed,
        },
        overlap=True,
    )
def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
    """Cluster *data_file* with K-Medoids and validate against the answer file."""
    data_type = kwargs.get('data_type', 'points')
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

    original_data = read_sample(data_file)
    data = original_data
    if data_type == 'distance_matrix':
        data = calculate_distance_matrix(original_data, metric)

    reader = answer_reader(answer_file)
    amount_medoids = len(reader.get_clusters())

    # seed the algorithm with k-means++ picked medoid indexes
    initial_medoids = kmeans_plusplus_initializer(
        data, amount_medoids, **kwargs).initialize(return_index=True)
    algorithm = kmedoids(data, initial_medoids, 0.001, ccore, **kwargs)
    algorithm.process()

    clusters = algorithm.get_clusters()
    medoids = algorithm.get_medoids()

    expected_length_clusters = sorted(reader.get_cluster_lengths())

    total_allocated = sum(len(cluster) for cluster in clusters)
    assertion.eq(len(expected_length_clusters), len(medoids))
    assertion.eq(len(data), total_allocated)
    assertion.eq(sum(expected_length_clusters), total_allocated)

    # medoids must be pairwise distinct
    unique_medoids = set()
    for medoid in medoids:
        assertion.false(
            medoid in unique_medoids,
            message="Medoids '%s' is not unique (actual medoids: '%s')" %
            (str(medoid), str(unique_medoids)))
        unique_medoids.add(medoid)

    # every point belongs to exactly one cluster
    unique_points = set()
    for cluster in clusters:
        for point in cluster:
            assertion.false(
                point in unique_points,
                message=
                "Point '%s' is already assigned to one of the clusters." %
                str(point))
            unique_points.add(point)

    assertion.eq(expected_length_clusters,
                 sorted(len(cluster) for cluster in clusters))

    # every produced cluster must literally match one expected cluster
    expected_clusters = reader.get_clusters()
    for actual_cluster in clusters:
        cluster_found = any(actual_cluster == expected_cluster
                            for expected_cluster in expected_clusters)
        assertion.true(
            cluster_found,
            message="Actual cluster '%s' is not found among expected." %
            str(actual_cluster))
# NOTE(review): this 'while' loop and 'return' are the tail of a helper whose
# 'def' line lies outside the visible chunk; it keeps drawing random indexes
# until k_clusters distinct medoid indexes have been collected.
while len(medoidsToInit) < k_clusters:
    number = random.randrange(0, points_amount)
    if not number in medoidsToInit:
        medoidsToInit.append(number)
return medoidsToInit


def kmedoidsWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    """Cluster a sample with K-Medoids and compute validation scores.

    Reads a sample file, estimates the cluster amount via canoc(kmin, kmax),
    initializes medoids with rci, runs K-Medoids, then computes silhouette,
    Davies-Bouldin (dbs) and Calinski-Harabasz (chs) scores. The CSV-writing
    calls are currently commented out, so nothing is persisted or returned.

    NOTE(review): parameters 'k_clusters' and 'measure' are never used here,
    and the read below references 'filenameData' rather than the 'nameData'
    parameter - presumably a module-level global; confirm, otherwise this
    raises NameError.
    """
    data = read_sample(str(root)+'\\'+filenameData)
    # choose cluster amount automatically within [kmin, kmax]
    kClusters = canoc(data, kmin, kmax)
    initial_medoids = rci(data, kClusters).initialize()
    kmedoids_instance = kmedoids(data, initial_medoids)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    predicted = kmedoids_instance.predict(data)
    # per-point silhouette values, then their mean as a single score
    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    #wlitCSV(silhouetteScore, filenameSilhouette, '', root)
    #witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root)
    dbsScore = dbs(data, predicted)
    #witCSV(dbsScore, nameDBS, '', root)
    chsScore = chs(data, predicted)
    #witCSV(chsScore, nameCHS, '', root)
def process_kmedoids(sample):
    """Time one K-Medoids run seeded with evenly spaced medoid indexes."""
    start_medoids = [CURRENT_CLUSTER_SIZE * multiplier
                     for multiplier in range(NUMBER_CLUSTERS)]
    algorithm = kmedoids(sample, start_medoids)
    (ticks, _) = timedcall(algorithm.process)
    return ticks
'wine_servings'] == column['spirit_servings']: nova_coluna_numerica.append(3) nova_coluna_nominal.append('none') #Adicionando as novas colunas bebida_mundo['most_consumed_number'] = nova_coluna_numerica bebida_mundo['most_consumed_nominal'] = nova_coluna_nominal #Criação de uma variável com as colunas númericas da quantidade ingerida de cada classe bebida = bebida_mundo.iloc[:, 1:4].values #Criação de uma variável com as classes dos registros bebida_numero = bebida_mundo.iloc[:, 5].values #Faz o processo de achar o kmedoids autmoaticamente (executar os 2 comandos simultaneamente) cluster = kmedoids(bebida, [117, 68, 61]) cluster.get_medoids() #Faz o processamento de clusterização cluster.process() #A variavel previsoes determina o número de cluster que a maquina ja processou anteriormente com o clusters.process() previsoes = cluster.get_clusters() #A variavel medoides determina o medoide (centro de um cluster) medoides = cluster.get_medoids() #Gera um gráfico com os 3 grupos, onde a * é o centro dos medoides (executar os 4 comandos simultâneos) v = cluster_visualizer() v.append_clusters(previsoes, bebida) v.append_cluster(medoides, bebida, marker='*', markersize=100)
nx.draw(G, node_color=colors, with_labels=True) #plt.show() np_MST = np.array(MST) line_count = [] for i in range(noa): line = np.count_nonzero(np_MST[i]) line += np.count_nonzero(np_MST[:, i]) line_count.append(line) ''' add_all=0 for a in line_count: add_all+=a ''' kmedoids_instance = kmedoids(distnace_matrix, [random.randrange(0, noa) for i in range(10)], data_type='distance_matrix') kmedoids_instance.process() clusters = kmedoids_instance.get_clusters() medoids = kmedoids_instance.get_medoids() one_edge_in_each_cluster = [] for cluster in clusters: one_edge = [] for i in cluster: if line_count[i] == 1: one_edge.append(i) one_edge_in_each_cluster.append(one_edge) final_10_stocks = []
"""K-Medoids demo: cluster the 2-D points stored in 't4.8k' and visualize them."""
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES

# Load list of points for cluster analysis.
# NOTE(review): this sample is loaded but never used below ('inp' read from
# the text file is clustered instead) - kept to preserve existing behavior.
sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)

# Read 2-D points from the text file. BUGFIX: use a context manager so the
# file handle is closed (previously open() was never closed).
inp = []
with open("t4.8k", "r") as lines:
    for line in lines:
        cords = line.split()
        if len(cords) != 2:
            # skip blank or malformed lines
            continue
        inp.append([float(cords[0]), float(cords[1])])

# Set random initial medoids.
initial_medoids = [1, 800, 1400, 672, 763, 926]

# Create instance of K-Medoids algorithm.
kmedoids_instance = kmedoids(inp, initial_medoids)

# Run cluster analysis and obtain results.
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()

# Display clusters.
visualizer = cluster_visualizer()
visualizer.append_clusters(clusters, inp)
visualizer.show()
# define K initial medoids randomly
print('Choosing', K, 'initial medoids randomly...')
start = time.time()
# BUGFIX: K independent uniform draws could repeat an index; sampling with
# replace=False guarantees K *distinct* initial medoids.
initial_medoids = [
    int(index)
    for index in np.random.choice(distance_matrix.shape[0], size=K, replace=False)
]
stop = time.time()
print('Random medoids selected', '[', round(stop - start, 2), 'seconds ]')
print('Random medoids are', initial_medoids)

# execute the K-Medoids algorithm
print('Creating Kmediod instance...')
start = time.time()
kmedoids_instance = kmedoids(distance_matrix, initial_medoids, data_type='distance_matrix')
stop = time.time()
print('Created Kmedoid instance', '[', round(stop - start, 2), 'seconds ]')

# get actual K medoids and clusters
print('Get clusters and medoids using K-Medoids algorithm...')
start = time.time()
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()
medoids = kmedoids_instance.get_medoids()
stop = time.time()
print('Found clusters and medoids', '[', round(stop - start, 2), 'seconds ]')
print('Final medoids are', medoids)
def run_kmedoids(element_maps_with_binary_lead_l, binarized_lead_l_map, num_clusters, num_clusters_in_a_cluster):
    '''
    This function creates 2 clusters based on a binarized lead_l map, one with
    areas where lead is present and the other with area where lead is not
    present (under the threshold level)

    After the first 2-way split, each cluster is clustered again into
    num_clusters_in_a_cluster groups and a classification map is built
    for each. Returns the first-round clusters (lists of flat pixel indexes).

    NOTE(review): the 'num_clusters' parameter is never used in this body -
    the first round always uses the two hard-coded medoids [1, 30000];
    confirm whether that is intentional.
    '''
    row = binarized_lead_l_map.shape[0]
    col = binarized_lead_l_map.shape[1]
    # stack every element map along a third axis; the initial zero layer is
    # dropped again via [:, :, 1:] below
    combined_maps = np.zeros((row, col))
    # NOTE(review): loop variable 'map' shadows the builtin of the same name
    for map in list(element_maps_with_binary_lead_l.keys()):
        combined_maps = np.dstack(
            (combined_maps, element_maps_with_binary_lead_l[map]))
    unnormalized_data = combined_maps[:, :, 1:]
    num_chnl = unnormalized_data.shape[2]
    #this part normalizes each map in the patch
    normalized_data = np.zeros(
        (row, col, num_chnl)
    )  #each patch is normalized with different minimum and maximum values
    for i in range(num_chnl):
        normalized_data[:, :, i] = normalize(unnormalized_data[:, :, i])
    #reshapes the data to run kmeans
    data2D = np.reshape(normalized_data, (row * col, num_chnl))
    #data2D_PCA = PCA(data2D, 1) #reduces data2D to have 1 dimension only so that it can be given to the kmedoids function
    #data 2d size : 1345410,11
    initial_index_medoids = [1, 30000]
    kmed_round1 = kmed.kmedoids(data2D, initial_index_medoids)
    kmed_round1.process()
    result_1 = kmed_round1.get_clusters()
    # flat label image: 255 = unassigned, otherwise first-round cluster id
    classified_result_1 = np.full((row * col), 255)
    for i in range(len(result_1)):
        for j in range(len(result_1[i])):
            classified_result_1[result_1[i][j]] = i
    classified_result_1 = np.reshape(classified_result_1, (row, col))
    cluster_dict = dict()
    pixel_location_dict = dict()
    # stores in each of the two dictionaries above classification information for each pixel in the first
    # round of clustering and the location of the pixel
    for i in range(classified_result_1.shape[0]):
        for j in range(classified_result_1.shape[1]):
            if classified_result_1[i, j] not in cluster_dict:
                # one per-channel value list for every channel
                cluster_dict[classified_result_1[i, j]] = [
                    [] for num in range(num_chnl)
                ]
                pixel_location_dict[classified_result_1[i, j]] = []
            pixel_location_dict[classified_result_1[i, j]].append([i, j])
            for k in range(num_chnl):
                cluster_dict[classified_result_1[i, j]][k].append(
                    normalized_data[i, j, k])
    #runs the second round of classification on each of the clusters formed from the first classification
    for cluster in list(cluster_dict.keys()):
        cluster_dict[cluster] = np.array(cluster_dict[cluster], dtype='float32')
        cluster_dict[cluster] = np.transpose(
            cluster_dict[cluster]
        )  #each value for a cluster is a row*col , num_chnl
        clusters_in_a_cluster = cluster_clusters(cluster_dict[cluster], cluster, num_clusters_in_a_cluster)
        cluster_dict[cluster] = clusters_in_a_cluster
    #builds the classification map based on the second classification results
    for cluster in list(cluster_dict.keys()):
        classification_map = np.full((row, col), 255)
        for i in range(len(cluster_dict[cluster])):
            # pixel_location_dict and the second-round labels are index-aligned
            idx_pair = pixel_location_dict[cluster][i]
            row_idx = idx_pair[0]
            col_idx = idx_pair[1]
            classification_map[row_idx, col_idx] = cluster_dict[cluster][i]
        build_map(classification_map, cluster, num_clusters_in_a_cluster)
    return result_1
def compute_kmedoids(bboxes, cls, option='pyclustering', indices=15, max_clusters=35, max_limit=5000):
    """Cluster bounding boxes with k-medoids for k in [indices, max_clusters].

    Parameters:
        bboxes: array of bounding boxes (centralized before clustering).
        cls: class label stored with each result and used in the cache name.
        option: backend - 'pyclustering', 'pyclust' (KMedoids) or 'local' (kMedoids).
        indices: smallest cluster count to try.
        max_clusters: largest cluster count to try (inclusive).
        max_limit: maximum number of boxes kept (random subsample above this).

    Returns a list of dicts with keys 'n_clusters', 'medoids', 'class'.
    NOTE(review): the list is pre-filled with 'indices' empty dicts, so entry
    clustering[k] presumably corresponds to k clusters - confirm callers rely
    on this alignment before changing it.
    """
    print("Performing clustering using", option)
    clustering = [{} for _ in range(indices)]
    bboxes = centralize_bbox(bboxes)

    # subsample the number of bounding boxes so that it can fit in memory and is faster
    if bboxes.shape[0] > max_limit:
        sub_ind = np.random.choice(np.arange(bboxes.shape[0]),
                                   size=max_limit,
                                   replace=False)
        bboxes = bboxes[sub_ind]

    # pairwise distances are expensive: cache them on disk per class
    distances_cache = Path('distances_{0}.jbl'.format(cls))
    if distances_cache.exists():
        print("Loading distances")
        dist = joblib.load(distances_cache)
    else:
        dist = compute_distances(bboxes)
        joblib.dump(dist, distances_cache, compress=5)

    if option == 'pyclustering':
        for k in range(indices, max_clusters + 1):
            print(k, "clusters")
            # k distinct random starting medoids
            initial_medoids = np.random.choice(bboxes.shape[0],
                                               size=k,
                                               replace=False)
            kmedoids_instance = kmedoids(dist,
                                         initial_medoids,
                                         ccore=True,
                                         data_type='distance_matrix')
            print("Running KMedoids")
            t1 = datetime.now()
            kmedoids_instance.process()
            dt = datetime.now() - t1
            print("Total time taken for clustering {k} medoids: {0}min:{1}s".
                  format(dt.seconds // 60, dt.seconds % 60, k=k))
            # map medoid indexes back to actual bounding boxes
            medoids_idx = kmedoids_instance.get_medoids()
            medoids = bboxes[medoids_idx]
            clustering.append({
                'n_clusters': k,
                'medoids': medoids,
                'class': cls
            })
    elif option == 'pyclust':
        for k in range(indices, max_clusters + 1):
            print(k, "clusters")
            kmd = KMedoids(n_clusters=k,
                           distance=rect_dist,
                           n_trials=1,
                           max_iter=2)
            t1 = datetime.now()
            kmd.fit(bboxes)
            dt = datetime.now() - t1
            print("Total time taken for clustering {k} medoids: {0}min:{1}s".
                  format(dt.seconds // 60, dt.seconds % 60, k=k))
            medoids = kmd.centers_
            clustering.append({
                'n_clusters': k,
                'medoids': medoids,
                'class': cls
            })
    elif option == 'local':
        for k in range(indices, max_clusters + 1):
            print(k, "clusters")
            curr_medoids, cluster_idxs = kMedoids(dist, k=k)
            medoids = []
            for m in curr_medoids:
                medoids.append(bboxes[m, :])
            clustering.append({
                'n_clusters': k,
                'medoids': medoids,
                'class': cls
            })

    return clustering
def main(): st.title('Similarity Recommender') st.markdown("---") st.text("This is a lead generator according to a company's portfolio.") Choices = st.sidebar.selectbox( "Do you have a client you wish to generate leads from?", [" ", "Yes", "No"]) if Choices == "Yes": st.sidebar.title("Lead Generator") st.sidebar.markdown("---") loading_portfolios = st.sidebar.text('Loading the portfolios...') portfolios = load_portfolios() loading_portfolios.text( 'Loading complete!\nNow you can start using the app!') portfolio = st.sidebar.selectbox( "Select the portfolio of the company you want to look for leads.", list(portfolios.keys())) if portfolios[portfolio] is not None: load_database = st.text('Loading the database...') market_ID = load_market() load_database.text('Loading complete!') st.subheader("Market Database") st.dataframe(market_ID.head(5)) df_target = portfolios[portfolio] values = df_target.index.tolist() options = df_target['id'].tolist() dic = dict(zip(options, values)) Id = st.selectbox('Choose a client', options, format_func=lambda x: dic[x]) st.write(" **Id**: " + Id) n_top = st.slider( 'Select the number of leads you want to look for', 0, 5) st.text( 'For showcase purposes the maximum amount of leads was set to 5.' ) if n_top > 0: data_load_state = st.text( 'Searching for the nearest neighbours, this may take a while...' 
) NN_ID, leads = neighbours_search(Id, market_ID, df_target, n_top) data_load_state.text('Found them!') for i in range(0, n_top): st.subheader("Lead " + str(i + 1)) st.markdown('**Index**: ' + str(NN_ID.get('index')[i])) st.markdown('**Id**: ' + str(leads[i])) st.markdown('**Dissimalirity**: ' + str(round(NN_ID.get('values')[i], 5))) if Choices == "No": st.sidebar.title("Cluster Generator") st.sidebar.markdown("---") loading_portfolios = st.sidebar.text('Loading the portfolios...') portfolios = load_portfolios() loading_portfolios.text( 'Loading complete!\nNow you can start using the app!') portfolio = st.sidebar.selectbox( "Select the portfolio of the company to generate clusters.", list(portfolios.keys())) if portfolios[portfolio] is not None: load_database = st.text('Loading the database...') market_ID = load_market() load_database.text('Loading complete!') st.subheader("Market Database") st.dataframe(market_ID.head(5)) calculating = st.text( 'Calculating the dissimilarity matrix! This may take a while...' 
) dissimilarity_matrix = calculate_distance(portfolios[portfolio]) calculating.text('Phew, we finally finished the calculus!') X = dissimilarity_matrix metrics = st.text('Generating plots for evaluation metrics...') # creating the lists we'll want to save values to medoids_per_k = [] # medoids for each number of clusters clusters_per_k = [] # clusters for each number of clusters k_scores = [] # average silhouette score of k clusters wss = [] # the sum of dissimilarity of each cluster random.seed(42) for i, k in enumerate([2, 3, 4, 5, 6, 7]): # the medoids algorithm requires an initial point to start so we're setting it here initial_medoids_km = random.sample( range(1, portfolios[portfolio].shape[0]), k) # Run the Kmeans algorithm km = kmedoids(X, initial_medoids_km, data_type='distance_matrix') km.process() # saving the created clusters into a list clusters_km = km.get_clusters() clusters_per_k.append(clusters_km) # saving the medoids that were found medoids_km = km.get_medoids() # saving the medoids that were found per each number of clusters into a list medoids_per_k.append(medoids_km) # creating a dataframe with the labels of each cluster labels_km = pd.Series(0, index=range( 0, portfolios[portfolio].shape[0])) for i in range(0, len(clusters_km)): for n in range(0, len(clusters_km[i])): index = clusters_km[i][n] labels_km.iloc[index] = i # getting the sum of the dissimilarity per cluster clusters_distances = [] for n in range(0, len(clusters_km)): clusters_distances.append(X[medoids_km[n]][labels_km[ labels_km == n].index].sum()) # total sum of the dissimilarity wss.append(sum(clusters_distances)) # Get silhouette samples silhouette_vals = silhouette_samples(X, labels_km, metric='precomputed') # Silhouette plot fig = go.Figure() fig.update_layout(title={ 'text': 'Silhouette plot for ' + str(k) + ' clusters', 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, xaxis_title='Silhouette coefficient values', yaxis_title='Cluster labels', font=dict(family="Courier New, 
monospace", size=16, color="RebeccaPurple"), autosize=False, width=1000, height=600, margin=dict(l=50, r=50, b=100, t=100, pad=4), paper_bgcolor="LightGrey") y_lower, y_upper = 0, 0 annotations = [] for i, cluster in enumerate(np.unique(labels_km)): cluster_silhouette_vals = silhouette_vals[labels_km == cluster] cluster_silhouette_vals.sort() y_upper += len(cluster_silhouette_vals) fig.add_trace( go.Bar(x=cluster_silhouette_vals, y=np.array((range(y_lower, y_upper))), name=str(i + 1), orientation='h', showlegend=False)) annotations.append( dict(x=-0.03, y=(y_lower + y_upper) / 2, text=str(i + 1), showarrow=False)) y_lower += len(cluster_silhouette_vals) fig.update_layout(annotations=annotations) # Get the average silhouette score avg_score = np.mean(silhouette_vals) # saving the average silhouette score of k clusters in a list k_scores.append(avg_score) # plottting the average silhouette score fig.update_layout(shapes=[ dict(type='line', yref='paper', y0=0, y1=1, xref='x', x0=avg_score, x1=avg_score, line=dict(color='green', width=2, dash='dash')) ]) fig.update_yaxes(showticklabels=False) # plotting the graphs created in streamlit st.plotly_chart(fig) fig_wss = go.Figure() fig_wss.update_layout(title={ 'text': 'Dissimilarity plot - The Elbow Method', 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, xaxis_title='Number of Clusters', yaxis_title='Dissimilarity', font=dict(family="Courier New, monospace", size=16, color="RebeccaPurple"), autosize=False, width=1000, height=600, margin=dict(l=50, r=50, b=100, t=100, pad=4), paper_bgcolor="LightGrey") fig_wss.add_trace( go.Scatter(x=list(range(2, 8)), y=wss, mode='lines+markers')) st.plotly_chart(fig_wss) metrics.text("Metrics' plots generated.") st.markdown( "Now comes the fun part, I am going to challenge you to choose the best " "number of clusters!<br/>" "However I am going to help you by giving you a few tips:\n" " * The Silhouette Coefficient is bounded between -1 for incorrect clustering " "and +1 for highly 
dense " "clustering.<br/>" "Scores around zero indicate overlapping clusters.\n" " * You'll want to look for a couple of things in the Silhouette plot:\n" " * The plot with the less amount of negative values, representing incorrect " "labeled clients.\n" " * The plot where the clusters have a greater area above the mean silhouette score, " "which means clusters with higher density, in another words closer clients (or alike).\n" " * The elbow method consists in finding a inflection point in the plot. " "That is if you picture a bent arm you want to look at the point where the elbow is.<br/>\n" "I'll help you with an example: from 2 clusters to 3 the dissimilarity drops by 20k," " but from 3 to 4 only drops 5k. " "This means from 3 clusters onwards the dissimilarity 'gains' " "by having more clusters isn't significative.", unsafe_allow_html=True) list_clusters = [0, 2, 3, 4, 5, 6, 7] number_clusters = st.selectbox( "How many clusters do you want to use?", list_clusters) if number_clusters is not 0: graphics = st.text("Creating shiny plots...") medoids = medoids_per_k[ number_clusters - 2] # The medoids and clusters lists starts at index 0 which # is with 2 clusters, and finishes at 5, 7 clusters thus the -2 clusters = clusters_per_k[number_clusters - 2] fit_umap = umap.UMAP(n_neighbors=14, min_dist=0.1, n_components=3, metric='dice', random_state=42) p_umap = fit_umap.fit_transform( portfolios[portfolio].drop(columns=['id'])) # Visualising the clusters fig_umap = go.Figure() for i in range(0, number_clusters): fig_umap.add_trace( go.Scatter3d(x=p_umap[clusters[i], 0], y=p_umap[clusters[i], 1], z=p_umap[clusters[i], 2], name='Cluster ' + str(i), mode='markers')) fig_umap.add_trace( go.Scatter3d(x=p_umap[medoids, 0], y=p_umap[medoids, 1], z=p_umap[medoids, 2], name='Medoids', mode='markers', marker_color="rgb(255,255,0)", marker=dict(size=16))) fig_umap.update_layout(title={ 'text': 'Clusters with the Dice Distance', 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, 
font=dict( family="Courier New, monospace", size=16, color="RebeccaPurple"), autosize=False, width=1000, height=600, margin=dict(l=50, r=50, b=100, t=100, pad=4)) st.plotly_chart(fig_umap) fit_umap_man = umap.UMAP(n_neighbors=14, min_dist=0.1, n_components=3, metric='manhattan', random_state=42) p_umap_man = fit_umap_man.fit_transform( portfolios[portfolio].drop(columns=['id'])) fig_umap_man = go.Figure() for i in range(0, number_clusters): fig_umap_man.add_trace( go.Scatter3d(x=p_umap_man[clusters[i], 0], y=p_umap_man[clusters[i], 1], z=p_umap_man[clusters[i], 2], name='Cluster ' + str(i), mode='markers')) fig_umap_man.add_trace( go.Scatter3d(x=p_umap_man[medoids, 0], y=p_umap_man[medoids, 1], z=p_umap_man[medoids, 2], name='Medoids', mode='markers', marker_color="rgb(255,255,0)", marker=dict(size=16))) fig_umap_man.update_layout(title={ 'text': 'Clusters with the Manhattan Distance', 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top' }, font=dict( family="Courier New, monospace", size=16, color="RebeccaPurple"), autosize=False, width=1000, height=600, margin=dict(l=50, r=50, b=100, t=100, pad=4)) st.plotly_chart(fig_umap_man) graphics.text('3D clusters visualization complete!') st.markdown( "**Developer's notes**: <br/>UMAP doesn't have the Gower distance in-built," " however it has the Dice and Manhattan distances," "which are the distances used by the Gower distance.<br/>" "So I have shown the 3D visualization using both distances instead of the distance used" "to finds clusters.<br/>" "A future development would be coding the Gower distance as a custom " "distance in the UMAP method.", unsafe_allow_html=True) selection = st.selectbox( 'Choose a representative client(Medoid)', medoids) Id = portfolios[portfolio].loc[selection, 'id'] st.write("**Client " + str(selection) + " ID:** " + Id) n_top = st.slider( 'Select the number of leads you want to look for', 0, 5) st.text( 'For showcase purposes the maximum amount of leads was set to 5.' 
) df_target = portfolios[portfolio] if n_top > 0: data_load_state = st.text( 'Searching for the nearest neighbours, this may take a while...' ) NN_ID, leads = neighbours_search(Id, market_ID, df_target, n_top) data_load_state.text('Found them!') for i in range(0, n_top): st.subheader("Lead " + str(i + 1)) st.markdown('**Index**: ' + str(NN_ID.get('index')[i])) st.markdown('**Id**: ' + str(leads[i])) st.markdown('**Dissimalirity**: ' + str(round(NN_ID.get('values')[i], 5))) st.sidebar.title("Useful Links") st.sidebar.markdown("---") st.sidebar.markdown("[Github]" "(https://github.com/Rpinto02/Similarity_Recommender)") st.sidebar.markdown("[Linkedin]" "(https://www.linkedin.com/in/rpinto02/)") st.sidebar.markdown("[Codenation]" "(https://codenation.dev)")
def fit(self):
    """Recursively split k-medoids clusters until each cluster's target spread
    is within ``self.threshold``.

    Uses ``self.X`` (precomputed distance matrix, DataFrame), ``self.Y``
    (target values, Series), ``self.initial_medoids`` and ``self.threshold``.

    Returns:
        tuple: ``(final_medoids, final_clusters)`` — medoid indices and the
        matching lists of member indices into ``self.X``/``self.Y``.
    """
    K = 4  # number of sub-clusters to split an over-dispersed cluster into
    Final_cluster = []
    Final_medoids = []

    # Initial clustering over the full precomputed distance matrix.
    kmedoids_instance = kmedoids(self.X.values, self.initial_medoids,
                                 ccore=False, data_type='distance_matrix')
    kmedoids_instance.process()
    ToCheck_cluster = kmedoids_instance.get_clusters()
    Check_medoids = kmedoids_instance.get_medoids()

    # First pass: accept clusters whose target std is already small enough,
    # queue the rest for recursive splitting.
    Temp_cluster = []
    Temp_medoids = []
    for i in range(len(Check_medoids)):
        STD = np.std(self.Y.iloc[ToCheck_cluster[i]])  # scalar spread of cluster
        if STD <= self.threshold:
            Final_cluster.append(ToCheck_cluster[i])
            Final_medoids.append(Check_medoids[i])
        else:
            Temp_cluster.append(ToCheck_cluster[i])
            Temp_medoids.append(Check_medoids[i])
    ToCheck_cluster = Temp_cluster
    Check_medoids = Temp_medoids

    # Keep splitting rejected clusters until none remain to check.
    while ToCheck_cluster:
        L = len(ToCheck_cluster)
        Temp_cluster = []
        Temp_medoids = []
        for i in range(0, L):
            members = ToCheck_cluster[i]  # renamed from `list`: don't shadow the builtin
            if len(members) == 0:
                continue
            if len(members) <= 2:
                # Too small to split further; accept as-is.
                Final_cluster.append(members)
                Final_medoids.append(Check_medoids[i])
                continue
            STD = np.std(self.Y.iloc[members])
            if STD <= self.threshold:
                Final_cluster.append(members)
                Final_medoids.append(Check_medoids[i])
            else:
                # Re-cluster this subset on its own sub-distance-matrix.
                data = self.X.iloc[members, members]
                # BUGFIX: random.sample raised ValueError when the cluster had
                # 3 members (fewer than K=4 candidates); cap K at the size.
                new_medoids = random.sample(range(len(members)),
                                            min(K, len(members)))
                kmedoids_instance = kmedoids(data.values, new_medoids,
                                             ccore=False,
                                             data_type='distance_matrix')
                kmedoids_instance.process()
                cluster = kmedoids_instance.get_clusters()
                # Map local (subset) indices back to global row indices.
                # (Loop vars renamed: the original reused `i`, shadowing the
                # outer loop variable.)
                for ci in range(len(cluster)):
                    for j in range(len(cluster[ci])):
                        cluster[ci][j] = members[cluster[ci][j]]
                    Temp_cluster.append(cluster[ci])
                medoids = kmedoids_instance.get_medoids()
                for mi in range(len(medoids)):
                    Temp_medoids.append(members[medoids[mi]])
        ToCheck_cluster = Temp_cluster
        Check_medoids = Temp_medoids
    return Final_medoids, Final_cluster
from pyclustering.cluster.kmedoids import kmedoids from pyclustering.cluster import cluster_visualizer from pyclustering.utils import read_sample from pyclustering.samples.definitions import FCPS_SAMPLES fig, axs = plt.subplots(2, 4,figsize=(14, 10)) from sklearn.cluster import KMeans colors = ['b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen'] y_means = np.zeros(len(X)) for ncenters, ax in enumerate(axs.reshape(-1), 1): initial_medoids = [i for i in range(ncenters)] kmedoids_instance = kmedoids(X, initial_medoids) # Run cluster analysis and obtain results. kmedoids_instance.process() clusters = kmedoids_instance.get_clusters() # Show allocated clusters. medoids = kmedoids_instance.get_medoids() for i in range(len(clusters)): for j in range(len(clusters[i])): x_index = clusters[i][j] y_means[x_index] = i # Нарисовали точки по кластерам ax.set_title('Centers = {0}'.format(ncenters)) ax.scatter(X[:, 0], X[:, 1], c=y_means , s=50, cmap='viridis') centers = [] for i in range(len(medoids)):
# Feature extraction for each image #compute_BOW_descriptors() # Cluster images with kmedoids X = pd.read_csv(os.path.join(competitors_dir, "bow_images.pd"), index_col=0) # Select interesting images with open(COCO_train_graphs_subset_json_path) as f: graphs = json.load(f) selected_names = [f"{g['graph']['name']:012d}.jpg" for g in graphs] X = X.loc[selected_names] K = 9 km = kmedoids(X.to_numpy(), np.random.randint(0, len(X), K)) start_time = datetime.now() print("Start clustering process.") km.process() med = km.get_medoids() end_time = datetime.now() print('Done. Duration: ' + str(end_time - start_time)) images = [] for m in med: img = X.iloc[m].name images.append(img) print(images) with open(os.path.join(competitors_dir, out_file), 'w') as f: for el in images:
def testCoreInterfaceIntInputData(self):
    """Smoke-test the C-core path on integer 1-D points: two well-separated
    groups must come back as exactly two clusters."""
    sample = [[1], [2], [3], [20], [21], [22]]
    seeds = [2, 5]
    instance = kmedoids(sample, seeds, 0.025, True)
    instance.process()
    clusters = instance.get_clusters()
    assert len(clusters) == 2
# Demo: k-medoids clustering of the first two Iris features, visualized with
# pyclustering's cluster_visualizer.
import matplotlib.pyplot as plt
from scipy.stats import stats
import sklearn.datasets as datasets
from sklearn.metrics import accuracy_score, confusion_matrix
# pip install pyclustering
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer

iris = datasets.load_iris()
data = iris.data[:, 0:2]  # columns 0 and 1 only, so the cluster plot stays 2-D
classes = iris.target  # true labels; unused by the (unsupervised) clustering

# 3, 12, 20 are arbitrary row indices used as the initial medoid guesses.
model_cluster = kmedoids(data, [3, 12, 20])
model_cluster.get_medoids()  # before process(): still the initial [3, 12, 20]
model_cluster.process()

# Clusters come back as lists of row indices into `data`.
previsoes = model_cluster.get_clusters()
# Medoid row indices after convergence (e.g. [7, 67, 112] on this data).
medoids = model_cluster.get_medoids()

# Plot the clusters with the medoids highlighted as stars.
v = cluster_visualizer()
v.append_clusters(previsoes, data)
v.append_cluster(medoids, data=data, marker='*', markersize=15)
v.show()
n_lst = []
def __init__(self, data, initial_medoids):
    """Build and hold the underlying pyclustering k-medoids model.

    Args:
        data: sample matrix (list of points) passed straight to ``kmedoids``.
        initial_medoids: indices into ``data`` used as the starting medoids.
    """
    self.kmedoids_ = kmedoids(data, initial_medoids)
from pyclustering.cluster import cluster_visualizer, cluster_visualizer_multidim
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES, FAMOUS_SAMPLES

# Load list of points for cluster analysis.
# sample = read_sample(FCPS_SAMPLES.SAMPLE_TWO_DIAMONDS)
sample = read_sample(FAMOUS_SAMPLES.SAMPLE_IRIS)

# BUGFIX: the t4.8k file handle was opened and never closed; a context manager
# releases it even if a parse error is raised mid-file.
inp = []
with open("t4.8k", "r") as lines:
    for line in lines:
        cords = line.split()
        if len(cords) != 2:  # skip header/malformed rows
            continue
        inp.append([float(cords[0]), float(cords[1])])

# Set random initial medoids.
# initial_medoids = [1, 800, 1400, 672, 763, 926]
initial_medoids = [1, 500]

# Create instance of K-Medoids algorithm.
# kmedoids_instance = kmedoids(inp, initial_medoids)
kmedoids_instance = kmedoids(sample, initial_medoids)

# Run cluster analysis and obtain results.
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()

# Display clusters.
visualizer = cluster_visualizer_multidim()
# visualizer.append_clusters(clusters, inp)
visualizer.append_clusters(clusters, sample)
visualizer.show()
def main(args):
    """Cluster saved embedding data with k-medoids and score the result.

    Loads a pickled plot-data dict (keys used: "x", "y", "pca", "label"),
    builds a full KxK distance matrix with the metric selected by
    ``args.approach``, runs k-medoids with one cluster per distinct label,
    and returns the adjusted Rand score against the true labels.

    Raises:
        ValueError: if ``args.approach`` is not a recognized metric name.
    """
    VALID_APPROACHES = ("euclid-pca", "euclid-tsne", "cosine-pca",
                        "cosine-tsne", "kldiv-pca", "kldiv-tsne")
    # BUGFIX: an unknown approach previously surfaced only as a NameError on
    # `dist` deep inside the O(K^2) loop; fail fast with a clear message.
    if args.approach not in VALID_APPROACHES:
        raise ValueError("unknown approach: {0!r}".format(args.approach))

    with open(
            "saved/{0}_{1}_{2}".format(args.name, args.suffix,
                                       args.cluster_label), "rb") as handle:
        plot_data = pickle.load(handle)
    K = len(plot_data["label"])
    # print("full data len: ", K)
    print("dataset name: ", args.name)
    print("approach: ", args.approach)

    # 2-D t-SNE coordinates as a (K, 2) array.
    tsne_data = np.hstack([
        np.array(plot_data["x"]).reshape(-1, 1),
        np.array(plot_data["y"]).reshape(-1, 1)
    ])
    # Row-wise softmax so the KL-divergence variants operate on distributions.
    softmax_pca_data = softmax(plot_data["pca"], axis=1)
    softmax_tsne_data = softmax(tsne_data, axis=1)

    # Symmetric pairwise distance matrix.
    dp = [[0.0 for j in range(K)] for i in range(K)]
    for i in range(K):
        for j in range(i + 1, K):
            if args.approach == "euclid-pca":
                dist = distance.euclidean(plot_data["pca"][i],
                                          plot_data["pca"][j])
            elif args.approach == "euclid-tsne":
                dist = distance.euclidean(tsne_data[i], tsne_data[j])
            elif args.approach == "cosine-pca":
                dist = distance.cosine(plot_data["pca"][i],
                                       plot_data["pca"][j])
            elif args.approach == "cosine-tsne":
                dist = distance.cosine(tsne_data[i], tsne_data[j])
            elif args.approach == "kldiv-pca":
                # Symmetrized KL divergence.
                dist = sum(
                    rel_entr(softmax_pca_data[i], softmax_pca_data[j]) +
                    rel_entr(softmax_pca_data[j], softmax_pca_data[i]))
            elif args.approach == "kldiv-tsne":
                dist = sum(
                    rel_entr(softmax_tsne_data[i], softmax_tsne_data[j]) +
                    rel_entr(softmax_tsne_data[j], softmax_tsne_data[i]))
            dp[i][j] = dp[j][i] = dist
    print("created dist matrix")

    labels = plot_data["label"][:K]
    cluster_count = len(set(labels))  # one cluster per distinct true label
    print("num clusters: ", cluster_count)
    # `rng` is a module-level random generator — distinct starting medoids.
    inits = rng.choice(K, size=cluster_count, replace=False)
    # print("cluster inits:", inits)
    print("max iterations: ", args.itermax)
    km_instance = kmedoids(dp, inits, data_type="distance_matrix",
                           itermax=args.itermax)
    km_instance.process()
    clusters = km_instance.get_clusters()

    # Flatten the per-cluster index lists into one predicted label per point.
    predicts = [-1 for i in range(K)]
    for index, clust in enumerate(clusters):
        for pt in clust:
            predicts[pt] = index
    score = adjusted_rand_score(labels, predicts)
    print("adj. rand score: ", score)
    return score
def templateLengthProcessWithMetric(path_to_file, initial_medoids,
                                    expected_cluster_length, metric,
                                    ccore_flag, **kwargs):
    """Template test: cluster a sample file with k-medoids and validate the
    resulting cluster structure.

    Args:
        path_to_file: sample file readable by ``read_sample``.
        initial_medoids: starting medoid indices (ignored when
            ``initialize_medoids`` is given via kwargs).
        expected_cluster_length: expected sorted cluster sizes, or None.
        metric: distance metric; defaults to squared Euclidean when None.
        ccore_flag: whether to use the C-core implementation.
        **kwargs: data_type, input_type, initialize_medoids, itermax.
    """
    sample = read_sample(path_to_file)

    data_type = kwargs.get('data_type', 'points')
    input_type = kwargs.get('input_type', 'list')
    initialize_medoids = kwargs.get('initialize_medoids', None)
    itermax = kwargs.get('itermax', 200)

    if metric is None:
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    input_data = sample
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)
    if input_type == 'numpy':
        input_data = numpy.array(input_data)

    testing_result = False
    testing_attempts = 1
    if initialize_medoids is not None:  # randomized initializer: retry several times
        testing_attempts = 10

    for _ in range(testing_attempts):
        if initialize_medoids is not None:
            initial_medoids = kmeans_plusplus_initializer(
                sample, initialize_medoids).initialize(return_index=True)

        kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001,
                                     ccore=ccore_flag, metric=metric,
                                     data_type=data_type, itermax=itermax)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        if itermax == 0:
            # Zero iterations: nothing processed, medoids untouched.
            assertion.eq([], clusters)
            assertion.eq(medoids, initial_medoids)
            return

        if len(clusters) != len(medoids):
            continue
        if len(set(medoids)) != len(medoids):
            continue

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        if len(sample) != sum(obtained_cluster_sizes):
            continue

        # BUGFIX: the previous `for cluster in clusters: if len(cluster) == 0:
        # continue` only continued the inner loop — a no-op.  Retry the whole
        # attempt when any cluster came back empty, as intended.
        if any(len(cluster) == 0 for cluster in clusters):
            continue

        if expected_cluster_length is not None:
            obtained_cluster_sizes.sort()
            expected_cluster_length.sort()
            if obtained_cluster_sizes != expected_cluster_length:
                continue

        testing_result = True

    assertion.true(testing_result)
# One-hot feature construction: column i of `allfeatures` is set for every row
# whose product_title contains the keyword m[i] as a whole token.
for i in np.arange(m.shape[0]):
    allfeatures[df3['product_title'].agg(
        lambda x: sum([y == m[i] for y in x.split()]) > 0), i] = 1

df4 = df3.iloc[:, 0:1]  # first column only, kept alongside the new features
Complete_data = pd.concat([df4, pd.DataFrame(allfeatures)], 1)  # positional arg is axis=1
cm = Complete_data.values.tolist()

# Nine hand-picked starting medoids -> nine clusters.
initial_medoids = [1, 2, 3, 4, 5, 6, 7, 8, 9]
#initial_medoids=[1,2,3,4,5]
#metric=distance.euclidean
metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
#metric = DistanceMetric.get_metric('')

# NOTE(review): `kmedoids` is used here as a module (kmedoids.kmedoids), unlike
# the class-style usage elsewhere — confirm the import form matches.
kmedoids_instance = kmedoids.kmedoids(cm, initial_medoids, metric=metric)
kmedoids_instance.process()
cl = kmedoids_instance.get_clusters()

# One DataFrame of member indices per cluster, each tagged with its label.
zero = pd.DataFrame(cl[0])
zero['label'] = 0
one = pd.DataFrame(cl[1])
one['label'] = 1
two = pd.DataFrame(cl[2])
two['label'] = 2
three = pd.DataFrame(cl[3])
three['label'] = 3
four = pd.DataFrame(cl[4])
four['label'] = 4
five = pd.DataFrame(cl[5])
five['label'] = 5
# Benchmark k-medoids runtime as the intended number of seed medoids grows.
Seeds = [
    5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700,
    800, 900, 1000
]
for Nseeds in Seeds:
    print("----" + str(Nseeds) + "----")
    # initial_medoids = [random.randint(0, len(names_dt)) for i in range(Nseeds)]
    # metric = distance_metric(type_metric.USER_DEFINED, func=descriptors_similarity)
    # #initial_medoids = kmedoids_plusplus_initializer(dt, 8).initialize()
    # Create instance of K-Means algorithm with prepared centers.
    # NOTE(review): the per-Nseeds medoid/metric selection above is commented
    # out, so every iteration reuses the same outer-scope `initial_medoids`
    # and `metric`, and Nseeds only labels the printout — confirm intended.
    kmedoids_instance = kmedoids(dt, initial_medoids, metric=metric, itermax=10)
    # Run cluster analysis and time it.
    start = time.time()
    print("hello")
    kmedoids_instance.process()
    end = time.time()
    print(end - start)  # wall-clock seconds spent in process()
    # clusters = kmedoids_instance.get_clusters()
    final_medoids = kmedoids_instance.get_medoids()
    #final_centers = kmedoids_instance.get_centers()
    # run cluster analysis and obtain results
    # #names_dt[clusters[0]]
def create_clusters(df, keys, score, resp, ncluster=20, w=None,
                    type='kmenoids', tolerance=0.001):
    """Bucket the per-group mean of `score` into `ncluster` ordered clusters.

    Args:
        df: input DataFrame.
        keys: column name(s) to group by.
        score: column whose per-group mean is clustered.
        resp: response column used to order the final cluster ids.
        ncluster: number of clusters to produce.
        w: optional weight column; when None, group sizes are used as weights.
        type: 'kmenoids' (sic — k-medoids) or 'ward'.  NOTE(review): the
            parameter shadows the builtin `type` and carries the historical
            typo; both kept for interface compatibility.
        tolerance: k-medoids stop tolerance.

    Returns:
        numpy array with one reordered cluster id per row of `df`,
        ids increasing with the weighted mean of `resp`.
    """
    from pyclustering.cluster.kmedoids import kmedoids
    from sklearn.cluster import AgglomerativeClustering
    grouped = df.groupby(keys)[score].mean()
    if w is not None:
        grouped_w = df.groupby(keys)[w].sum()
        df_vs = keys + [resp, w]
    else:
        # No weight column supplied: fall back to group counts.
        w = 'count'
        grouped_w = df.groupby(keys)[score].count()
        grouped_w.name = w
        df_vs = keys + [resp]
    if type == 'kmenoids':
        # Seed the medoids at the weighted quantiles of the score so the
        # initial centers are spread across the distribution.
        calculate_init = pd.concat([grouped, grouped_w], axis=1)
        calculate_init['index'] = list(range(len(grouped)))
        calculate_init = calculate_init.sort_values(by=score)
        calculate_init['cw'] = calculate_init[w].cumsum().div(
            calculate_init[w].sum())  # cumulative weight share in [0, 1]
        quantiles = np.linspace(0, 1, ncluster + 2)[1:-1]
        init_centroid = list(
            map(
                lambda x: calculate_init[calculate_init['cw'] > x]['index'].
                iloc[0], quantiles))
        clustering = kmedoids(grouped.values.reshape(-1, 1).tolist(),
                              init_centroid,
                              tolerance=tolerance)
        clustering.process()
        clusters = clustering.get_clusters()
        # Map group position -> cluster id.
        cluster_mapping = {
            index: n
            for n, instance in enumerate(clusters) for index in instance
        }
    elif type == 'ward':
        ff = np.average  #lambda x: np.average(x, w=df[w].iloc[x.index])
        # NOTE(review): `pooling_func` was removed from sklearn's
        # AgglomerativeClustering in 0.22 — confirm the pinned version.
        clusters = AgglomerativeClustering(n_clusters=ncluster,
                                           pooling_func=ff)
        cluster_values = clusters.fit_predict(grouped.values.reshape(-1, 1))
        cluster_mapping = dict(zip(range(0, len(grouped)), cluster_values))
        clusters = range(0, ncluster)
    grouped = grouped.to_frame().reset_index()
    grouped['cluster'] = grouped.index.map(
        lambda x: cluster_mapping.get(x, None))
    # Attach the cluster id back onto every original row.
    merged = pd.merge(df[df_vs],
                      grouped,
                      left_on=keys,
                      right_on=keys,
                      how='left').reset_index(False)
    # Relabel clusters 0..ncluster-1 in increasing order of weighted mean resp.
    reoder_cluster = {
        i: n
        for n, i in enumerate(
            merged.groupby('cluster')[resp].aggregate(lambda x: np.average(
                x, weights=merged.loc[x.index, w])).sort_values().index)
    }
    return merged['cluster'].map(reoder_cluster).values
def build_clusterer(data, nclusters, method, **kwargs):
    """
    A simple wrapper to various clustering approaches.

    Cluster the given data into nclusters by using the specified method.
    Depending on the specified method different packages may be required and
    different arguments are expected in the kwargs dict.

    Args:
        data: iterable of window objects exposing ``get_rd_stats``.
        nclusters: number of clusters to form.
        method: "kmeans" or "kmedoids".
        **kwargs: must contain a "config" dict with "features" plus, for
            kmedoids, "metric", optional "metric_degree" and
            "init_cluster_idx".

    Returns:
        The fitted sklearn KMeans object for "kmeans", or a
        ``(kmedoids_instance, initial_index_medoids)`` tuple for "kmedoids".

    Raises:
        Error: if the method name is not recognized.
    """
    features = kwargs["config"]["features"]
    windows = []
    print("{0} cluster features used {1}".format(INFO, features))

    # Flatten each window's per-feature stats (both stat rows) into one vector.
    for window in data:
        window_data = window.get_rd_stats(statistics="all")
        window_values = []
        for feature in features:
            window_values.append(window_data[0][feature])
            window_values.append(window_data[1][feature])
        windows.append(window_values)

    if method == "kmeans":
        from sklearn.cluster import KMeans
        clusterer = KMeans(n_clusters=nclusters)
        clusterer.fit(windows)
        return clusterer
    elif method == "kmedoids":
        from pyclustering.cluster.kmedoids import kmedoids
        metric = get_distance_metric(
            dist_metric=kwargs["config"]["metric"].upper(),
            degree=kwargs["config"]["metric_degree"]
            if 'metric_degree' in kwargs["config"] else 0)

        if kwargs["config"]["init_cluster_idx"] == "random_from_data":
            import random
            # BUGFIX: the previous retry loop gave up after 10 duplicate draws
            # and could silently return fewer than nclusters medoids;
            # random.sample draws nclusters distinct indices in one shot.
            initial_index_medoids = random.sample(range(len(windows)),
                                                  nclusters)
        else:
            initial_index_medoids = kwargs["config"]["init_cluster_idx"]

        clusterer = kmedoids(data=windows,
                             initial_index_medoids=initial_index_medoids,
                             metric=metric)
        clusterer.process()
        return clusterer, initial_index_medoids

    raise Error("Invalid clustering method: " + method)
#k-medoids é um cluster principal caracteristica dele é que o centro onde s cluster são inicializados ao invés de serem #pontos aleatórios como no k-means o k-medoids usam pontos reais de dados from sklearn import datasets from sklearn.metrics import confusion_matrix import numpy as np from pyclustering.cluster.kmedoids import kmedoids from pyclustering.cluster import cluster_visualizer #carregamento dos dados iris = datasets.load_iris() #criando o objeto cluster / porém estão pegando as 2 primeiras colunas de iris(:, 0:2) para melhor intemdimento dos graficos do cluster, voce pode alterar para 4 se quisers # Configuração dos parâmetros do k-medoids, utilizando somente as duas primeiras colunas da base de dados por causa da visualização apenas # 3, 12 e 20 são índices aleatórios de registros da base de dados (inicialização) cluster = kmedoids( iris.data[:, 0:2], [3, 12, 20] ) #a sintaxe do pyto e de [:, 0:2] de 0 até 2 ignorando o último valor pegando 0 e 1 #visualização dos pontos escolhidos (3,12,20) aqui estou falando q vou usar o registro para iniciar dessas posições. apartir dali cluster.get_medoids() # Aplicação do algoritmo para o agrupamento, obtenção da previsões (grupo de cada registro) e visualização dos medoides cluster.process() previsoes = cluster.get_clusters() medoides = cluster.get_medoids() #lista de 3 elementos, com os indices dos registros do cluster previsoes #aqui são 3 lista olhando a imagen vc vê / é diferente você não vê o vetor de 0 e 1 ou vetor de propabilidade como era o c-means #visualização do agrupamento / gerar o grafico do cluster com o centroides v = cluster_visualizer() v.append_clusters(previsoes, iris.data[:, 0:2]) v.append_cluster(medoides, data=iris.data[:, 0:2], marker='*', markersize=20)
def testCoreInterfaceIntInputData(self):
    """Verify that integer 1-D input through the C-core interface produces
    the expected two clusters."""
    points = [[v] for v in (1, 2, 3, 20, 21, 22)]
    instance = kmedoids(points, [2, 5], 0.025, True)
    instance.process()
    assert len(instance.get_clusters()) == 2
# k-medoids on the first two Iris features, then build parallel lists of
# predicted cluster ids and true labels for later comparison.
from sklearn import datasets
from sklearn.metrics import confusion_matrix
import numpy as np
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer

iris = datasets.load_iris()
# Records 3, 12 and 20 serve as the initial medoids.
cluster = kmedoids(iris.data[:, 0:2], [3, 12, 20])
cluster.get_medoids()  # before process(): still [3, 12, 20]
cluster.process()
previsoes = cluster.get_clusters()  # lists of row indices, one list per cluster
medoides = cluster.get_medoids()  # converged medoid row indices

v = cluster_visualizer()
v.append_clusters(previsoes, iris.data[:, 0:2])
v.append_cluster(medoides, data = iris.data[:, 0:2], marker = '*', markersize = 15)
v.show()

# Flatten the clusters: for each member record the predicted cluster id and
# the true Iris species of that row.
lista_previsoes = []
lista_real = []
for i in range(len(previsoes)):
    print('----')
    print(i)
    print('----')
    for j in range(len(previsoes[i])):
        #print(j)
        print(previsoes[i][j])
        lista_previsoes.append(i)
        lista_real.append(iris.target[previsoes[i][j]])
def main():
    """Cluster fuzzer bug reports by error-message similarity (Python 2).

    Loads bug errors from MySQL, builds a pairwise distance matrix, runs
    k-medoids, iteratively splits clusters wider than a distance of 60, and
    writes the final assignments into the tamed_bugs table.
    """
    # NOTE(review): the parser is created but never given arguments nor asked
    # to parse — confirm whether CLI handling was meant to be wired up.
    argparse.ArgumentParser(description="P4Fuzz Bugs Tamer")
    # SECURITY: credentials are hard-coded (masked); move to config/env vars.
    cnx = mysql.connector.connect(user="******",
                                  password="******",
                                  host="localhost",
                                  database="fuzzer")
    cursor = cnx.cursor()
    caseIds = []
    caseErrors = []
    # Start from a clean slate of clustered ("tamed") bugs.
    cursor.execute("DELETE FROM tamed_bugs")
    cnx.commit()
    cursor.execute("SELECT id, error FROM bugs WHERE id < 3200")
    for (id, error) in cursor:  # `id` shadows the builtin; kept as-is
        caseIds.append(id)
        caseErrors.append(str(error))
    dt = datetime.now()
    print "Loading data from database..."
    dt2 = datetime.now()
    diff = dt2 - dt
    print str(diff.total_seconds() * 1000) + " Loaded data from database"
    # Pairwise error-message distances computed natively (tuple of rows).
    dist_tuple = p4fuzzclib.calc_distance_matrix(caseErrors)
    dist = [list(x) for x in dist_tuple]
    dt3 = datetime.now()
    diff = dt3 - dt2
    print str(diff.total_seconds()) + " Calculated distances using token"
    # dists = [edit_distance(caseErrors[i], caseErrors[j])
    #          for i in range(1, len(caseErrors))
    #          for j in range(0, i)]
    #
    # dt7 = datetime.now()
    # diff = dt7 - dt3
    # print str(diff.total_seconds()) + " Calculated distances using lev"
    # sys.exit()
    # First pass: two seed medoids (first and last case).
    initial_medoids = [0, len(caseErrors) - 1]
    kmedoids_instance = kmedoids(dist, initial_medoids, 10,
                                 data_type='distance_matrix')
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()
    print "Clustered #1 ..."
    # Iteratively split any cluster whose internal diameter exceeds 60 by
    # promoting its farthest point to a new medoid and re-clustering.
    has_large = True
    cnt = 1
    while has_large:
        cnt += 1
        has_large = False
        for i, cluster in enumerate(clusters):
            medoid = medoids[i]
            medoid_distances = dist[medoid]
            max_points = p4fuzzclib.calc_max_distance_cluster(
                dist_tuple, cluster)
            max_dist = dist[max_points[0]][max_points[1]]
            if max_dist > 60:
                has_large = True
                # New medoid: whichever endpoint of the widest pair lies
                # farther from the current medoid.
                new_medoid = max_points[0] if medoid_distances[max_points[
                    0]] > medoid_distances[max_points[1]] else max_points[1]
                initial_medoids = medoids
                initial_medoids.append(new_medoid)
                kmedoids_instance = kmedoids(dist, initial_medoids, 100,
                                             data_type='distance_matrix')
                kmedoids_instance.process()
                # NOTE(review): `clusters`/`medoids` are replaced while the
                # surrounding for-loop still iterates the old list — confirm
                # that finishing the pass on stale data is intended.
                clusters = kmedoids_instance.get_clusters()
                medoids = kmedoids_instance.get_medoids()
            else:
                print "Cluster " + str(i) + ": " + str(max_dist)
        print "Clustered #" + str(cnt) + " ..."
    dt4 = datetime.now()
    diff = dt4 - dt3
    print str(diff.total_seconds() * 1000) + " Clustering finished"
    # Persist each case's cluster assignment; medoid rows are flagged.
    for i, cluster in enumerate(clusters):
        medoid = medoids[i]
        for error_index in cluster:
            is_medoid = True if medoid == error_index else False
            data = (caseIds[error_index], i, is_medoid)
            cursor.execute(
                "INSERT INTO tamed_bugs (`bug_id`, `cluster`, `is_medoid`) VALUES (%s, %s, %s)",
                data)
    cnx.commit()
    dt5 = datetime.now()
    diff = dt5 - dt4
    print str(
        diff.total_seconds() * 1000
    ) + " Tamed bugs clusters inserted into database finished! All Done!"
    dt6 = datetime.now()
    diff = dt6 - dt
    print "Total time: " + str(diff.total_seconds())
def run_test_clustered(
    classes,
    rounds,
    n_aug_sample_points,
    n_train,
    n_jobs,
    cv,
    use_GPU,
    batch_size,
    dataset,
    aug_transformation,
    aug_kw_args,
    logistic_reg__C,
    CNN_extractor_max_iter,
    use_loss,
    experiment_configs,
    results_filename,
    model_filename,
    n_clusters,
    cluster_type="kmeans",
    #cluster_type="kmedoids",
):
    """Run augmentation-selection experiments with cluster-stratified sampling.

    Prepares train/test splits, augments ("poisons") them, extracts CNN
    features, assigns every sample a cluster ID (k-means or k-medoids),
    trains a logistic-regression classifier and a virtual-support-vector
    (VSV) baseline, then evaluates each augmentation-selection policy in
    ``experiment_configs`` with sampling stratified by the cluster IDs.
    Intermediate arrays are saved to ``results_filename + "_data.npz"``
    and the final accuracies to ``results_filename``.

    Parameters mirror the other ``run_test_*`` drivers in this module;
    the ones specific to clustering are:
        n_clusters   -- number of clusters for stratified sampling.
        cluster_type -- "kmeans" (default) or "kmedoids".
    """
    run_params = {
        "classes": classes,
        "rounds": rounds,
        "n_aug_sample_points": n_aug_sample_points,
        "n_train": n_train,
        "n_jobs": n_jobs,
        "cv": cv,
        "use_GPU": use_GPU,
        "batch_size": batch_size,
        "dataset": dataset.name,
        "aug_transformation": aug_transformation.name,
        "aug_kw_args": aug_kw_args,
        "logistic_reg__C": logistic_reg__C,
        "CNN_extractor_max_iter": CNN_extractor_max_iter,
        "use_loss": use_loss,
        "experiment_configs": experiment_configs,
        "results_filename": results_filename,
        "model_filename": model_filename,
        "n_clusters": n_clusters,
        "cluster_type": cluster_type,
    }
    pprint.pprint(run_params)
    # The experiment loop below is meaningless without sample points.
    assert n_aug_sample_points

    (x_train, y_train), (x_test, y_test) = experiments_util.prepare_dataset(
        dataset,
        classes,
        n_train,
    )
    print("Train class breakdown: {}".format(
        np.unique(y_train, return_counts=True)))
    print("Test class breakdown: {}".format(
        np.unique(y_test, return_counts=True)))

    aug_f = augmentations.get_transformation(aug_transformation)
    (orig_and_auged_x_train, orig_and_auged_y_train,
     orig_and_auged_idxs_train) = \
        experiments_util.poison_dataset(x_train, y_train, aug_f, aug_kw_args)
    (orig_and_auged_x_test, orig_and_auged_y_test,
     orig_and_auged_idxs_test) = \
        experiments_util.poison_dataset(x_test, y_test, aug_f, aug_kw_args)
    print("x_train", x_train.shape)
    print("orig_and_auged_x_train", orig_and_auged_x_train.shape)

    feature_clf = featurized_classifiers.build_featurized_ResNet_feature_clf(
        CNN_extractor_max_iter,
        use_GPU,
        batch_size,
        model_filename,
    )

    @mem.cache
    def transform_features(x, y, model_filename):
        # We need model filename to invalidate cache on model change
        return feature_clf.fit_transform(x, y=y)

    featurized_x_train = transform_features(
        x=x_train,
        y=y_train,
        model_filename=model_filename,
    )
    featurized_y_train = y_train
    featurized_x_test = transform_features(
        x=x_test,
        y=y_test,
        model_filename=model_filename,
    )
    featurized_y_test = y_test
    orig_and_auged_featurized_x_train = transform_features(
        x=orig_and_auged_x_train,
        y=orig_and_auged_y_train,
        model_filename=model_filename,
    )
    orig_and_auged_featurized_y_train = orig_and_auged_y_train
    orig_and_auged_featurized_x_train_to_source_idxs = \
        orig_and_auged_idxs_train
    orig_and_auged_featurized_x_test = transform_features(
        x=orig_and_auged_x_test,
        y=orig_and_auged_y_test,
        model_filename=model_filename,
    )
    orig_and_auged_featurized_y_test = orig_and_auged_y_test
    orig_and_auged_featurized_x_test_to_source_idxs = orig_and_auged_idxs_test

    if cluster_type == "kmeans":
        clustering_clf = sklearn.cluster.KMeans(n_clusters=n_clusters)
        train_cluster_IDs = clustering_clf.fit_predict(featurized_x_train)
        test_cluster_IDs = clustering_clf.predict(featurized_x_test)
    elif cluster_type == "kmedoids":
        from pyclustering.cluster.kmedoids import kmedoids
        from pyclustering.utils import timedcall
        import scipy.spatial

        # Using some code from kmedoids_examples.py from pyclustering
        clustering_clf = sklearn.cluster.KMeans(n_clusters=n_clusters)
        clustering_clf.fit(featurized_x_train)
        # BUG FIX: the original passed KMeans.fit_predict() output as the
        # initial medoids, but fit_predict returns one *label* per sample
        # (length n_samples, values 0..n_clusters-1), not n_clusters point
        # indices, which is not a valid k-medoids initialization.  Seed
        # instead with the training point nearest each k-means centroid.
        init_medoids = [
            int(np.argmin(
                np.linalg.norm(featurized_x_train - center, axis=1)))
            for center in clustering_clf.cluster_centers_
        ]
        #init_medoids = np.random.choice(len(featurized_x_train),
        #                                n_clusters,
        #                                replace=False)
        tolerance = 0.25
        kmedoids_instance = kmedoids(featurized_x_train, init_medoids,
                                     tolerance)
        (ticks, result) = timedcall(kmedoids_instance.process)  # Run
        cluster_IDs = kmedoids_instance.get_medoids(
        )  # index into training set
        clusters = featurized_x_train[cluster_IDs]
        # Assign each sample the ID of its nearest medoid via a KD-tree.
        tree = scipy.spatial.cKDTree(clusters)
        _, train_cluster_IDs = tree.query(featurized_x_train, 1)
        _, test_cluster_IDs = tree.query(featurized_x_test, 1)
    print("Train cluster IDs: {}".format(train_cluster_IDs))
    print("Test cluster IDs: {}".format(test_cluster_IDs))

    clf = featurized_classifiers.build_logistic_reg_clf(
        logistic_reg__C,
        cv,
    )

    svm__C = [0.01, 0.1, 1, 10, 100]
    svm_cv = 4
    is_SV = experiments_util.get_SV_raw(featurized_x_train,
                                        featurized_y_train,
                                        CNN_extractor_max_iter,
                                        use_GPU,
                                        batch_size,
                                        svm__C,
                                        svm_cv,
                                        n_jobs)
    # NOTE(review): SVM_losses is computed but never used below; kept in
    # case get_SVM_losses_raw warms a shared cache -- confirm before
    # removing this call.
    SVM_losses = experiments_util.get_SVM_losses_raw(featurized_x_train,
                                                     featurized_y_train,
                                                     CNN_extractor_max_iter,
                                                     use_GPU,
                                                     batch_size,
                                                     svm__C,
                                                     svm_cv,
                                                     n_jobs)
    print("Number of support vectors is: {}".format(np.sum(is_SV)))

    SV_idxs = np.where(is_SV)[0]
    # -1 marks "original (un-augmented) sample" in the source-index map,
    # so appending it keeps all originals in the mask below.
    orig_and_SV_idxs = np.concatenate([SV_idxs, [-1]])
    print("orig_and_SV_idxs", orig_and_SV_idxs)
    print("orig_and_SV_idxs", orig_and_SV_idxs.shape)
    SV_orig_and_auged_mask = np.isin(orig_and_auged_idxs_train,
                                     orig_and_SV_idxs)
    print("SV_orig_and_auged_mask count {}/{}".format(
        np.sum(SV_orig_and_auged_mask),
        len(SV_orig_and_auged_mask),
    ))
    SV_x_train = orig_and_auged_featurized_x_train[SV_orig_and_auged_mask]
    SV_y_train = orig_and_auged_featurized_y_train[SV_orig_and_auged_mask]
    # Virtual-support-vector baseline: train only on the (augmented) SVs.
    clf.fit(SV_x_train, SV_y_train)
    VSV_acc = clf.score(orig_and_auged_featurized_x_test,
                        orig_and_auged_featurized_y_test)
    print("VSV acc: {}".format(VSV_acc))

    np_data_dict = {
        "x_train": orig_and_auged_x_train,
        "y_train": orig_and_auged_y_train,
        "train_to_source_idxs": orig_and_auged_idxs_train,
        "featurized_x_train": orig_and_auged_featurized_x_train,
        "featurized_y_train": orig_and_auged_featurized_y_train,
        "x_test": orig_and_auged_x_test,
        "y_test": orig_and_auged_y_test,
        "test_to_source_idxs": orig_and_auged_idxs_test,
        "featurized_x_test": orig_and_auged_featurized_x_test,
        "featurized_y_test": orig_and_auged_featurized_y_test,
        "SV_x_train": orig_and_auged_x_train[SV_orig_and_auged_mask],
        "SV_y_train": orig_and_auged_y_train[SV_orig_and_auged_mask],
        "featurized_SV_x_train": SV_x_train,
        "featurized_SV_y_train": SV_y_train,
    }
    np_data_filename = results_filename + "_data.npz"
    np.savez(np_data_filename, **np_data_dict)

    (no_aug_no_poison_acc, poisoned_acc, all_aug_train_poisoned_acc,
     aug_scores, after_aug_scores, best_params, training_total_time) = \
        experiments_util.train_and_score_clf(
            clf,
            featurized_x_train,
            y_train,
            featurized_x_test,
            y_test,
            orig_and_auged_featurized_x_train,
            orig_and_auged_featurized_y_train,
            orig_and_auged_featurized_x_test,
            orig_and_auged_featurized_y_test,
            use_loss,
            cv,
        )
    training_end_time = time.time()

    experiment_results = {}
    for policy_name, update_score, downweight_points in experiment_configs:
        policy_f = selection_policy.get_policy_by_name(policy_name)
        # Deterministic policies yield the same result every round, so a
        # single round suffices.
        if "deterministic" in policy_name:
            _rounds = 1
        else:
            _rounds = rounds
        acc, idxs = experiments.precomputed_aug_experiment_rounds(
            clf=clf,
            auged_featurized_x_train=orig_and_auged_featurized_x_train,
            auged_featurized_y_train=orig_and_auged_featurized_y_train,
            auged_featurized_x_train_to_source_idxs=
            orig_and_auged_featurized_x_train_to_source_idxs,
            auged_featurized_x_test=orig_and_auged_featurized_x_test,
            auged_featurized_y_test=orig_and_auged_featurized_y_test,
            auged_featurized_x_test_to_source_idxs=
            orig_and_auged_featurized_x_test_to_source_idxs,
            aug_iter=policy_f,
            train_idxs_scores=aug_scores,
            n_aug_sample_points=n_aug_sample_points,
            rounds=_rounds,
            update_scores=update_score,
            weight_aug_samples=downweight_points,
            use_loss=use_loss,
            stratified_sampling_x_train_ks=train_cluster_IDs,
        )
        config_name = [policy_name]
        if update_score:
            config_name.append("update")
        if downweight_points:
            config_name.append("downweight")
        config_name = "_".join(config_name)
        experiment_results[config_name] = acc

    all_results = {
        "no_aug_no_poison_acc": no_aug_no_poison_acc,
        "poisoned_acc": poisoned_acc,
        "all_aug_train_poisoned_acc": all_aug_train_poisoned_acc,
        "is_SV": is_SV,
        "VSV_acc": VSV_acc,
        "best_params": best_params,
        "initial_aug_scores": aug_scores,
        "after_aug_scores": after_aug_scores,
        "experiment_results": experiment_results,
        "n_aug_sample_points": n_aug_sample_points,
        "run_parameters": run_params,
        "n_train": n_train,
        "rounds": rounds,
    }
    tests_total_time = time.time() - training_end_time
    all_results["tests_total_runtime"] = tests_total_time
    pprint.pprint(all_results)
    np.savez(
        results_filename,
        **all_results,
    )
    print("*" * 80)
    print("Training took {} seconds".format(training_total_time))
    print("All tests took {} seconds".format(tests_total_time))
    print("*" * 80)