def random_state(ccore_flag, kinitial, kmax, random_state):
    data = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE5)

    initial_centers = random_center_initializer(data, kinitial, random_state=random_state).initialize()

    xmeans_instance1 = xmeans(data, initial_centers, kmax, ccore=ccore_flag, random_state=random_state).process()
    xmeans_instance2 = xmeans(data, initial_centers, kmax, ccore=ccore_flag, random_state=random_state).process()

    assertion.eq(xmeans_instance1.get_total_wce(), xmeans_instance2.get_total_wce())
    assertion.eq(xmeans_instance1.get_centers(), xmeans_instance2.get_centers())
    assertion.eq(xmeans_instance1.get_clusters(), xmeans_instance2.get_clusters())
def x_means(X, num_init_clusters=8, visualize=True):
    from pyclustering.cluster.xmeans import xmeans
    from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
    from pyclustering.cluster import cluster_visualizer_multidim

    X = list(X)
    start_centers = kmeans_plusplus_initializer(X, num_init_clusters).initialize()

    # criterion=0 corresponds to splitting_type.BAYESIAN_INFORMATION_CRITERION.
    xmeans_instance = xmeans(X, start_centers, 32, ccore=True, criterion=0)

    # Run cluster analysis and obtain results.
    xmeans_instance.process()
    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    print('Number of cluster centers calculated:', len(centers))

    if visualize:
        visualizer = cluster_visualizer_multidim()
        visualizer.append_clusters(clusters, X)
        visualizer.show()

    return centers, clusters
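# Usage sketch for x_means above: random 3-D points stand in for real feature
# vectors, and the parameter values are illustrative only.
import numpy as np

centers, clusters = x_means(np.random.rand(300, 3).tolist(), num_init_clusters=4, visualize=False)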
def xmeans_cluster(self, domain_features):
    final_centers = None
    final_radiuses = None
    final_clusters = None
    for attempt in range(5):
        initial_centers = kmeans_plusplus_initializer(domain_features, 2).initialize()
        # Create an instance of the X-Means algorithm. The algorithm starts analysis
        # from 2 clusters; the maximum number of clusters is half the number of points.
        max_num = int(len(domain_features) / 2)
        xmeans_instance = xmeans(domain_features, initial_centers, max_num)
        xmeans_instance.process()
        centers = xmeans_instance.get_centers()

        # Keep the run that allocates the most clusters. (The original shadowed the
        # outer loop variable inside the radius computation; the indices are now distinct.)
        if attempt == 0 or len(centers) > len(final_centers):
            radiuses = []
            for cluster_index, cluster in enumerate(xmeans_instance.get_clusters()):
                radius_total = 0.0
                for point_index in cluster:
                    dist = np.linalg.norm(domain_features[point_index] - centers[cluster_index])
                    radius_total += dist
                radiuses.append(radius_total / len(cluster))
            final_centers = xmeans_instance.get_centers()
            final_radiuses = radiuses
            final_clusters = xmeans_instance.get_clusters()
    return final_centers, final_radiuses, final_clusters
def template_clustering(start_centers, path, tolerance=0.025,
                        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=True):
    sample = read_sample(path)

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore, repeat=5)
    (ticks, _) = timedcall(xmeans_instance.process)

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN INFORMATION CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM NOISELESS DESCRIPTION LENGTH"

    print("Sample: ", ntpath.basename(path), "\nInitial centers: '", (start_centers is not None),
          "', Execution time: '", ticks, "', Number of clusters:", len(clusters), ",", criterion_string, "\n")

    visualizer = cluster_visualizer()
    visualizer.set_canvas_title(criterion_string)
    visualizer.append_clusters(clusters, sample)
    visualizer.append_cluster(centers, None, marker='*')
    visualizer.show()
def templateLengthProcessData(input_sample, start_centers, expected_cluster_length, type_splitting, kmax, ccore):
    sample = None
    if isinstance(input_sample, str):
        sample = read_sample(input_sample)
    else:
        sample = input_sample

    xmeans_instance = xmeans(sample, start_centers, kmax, 0.025, type_splitting, ccore)
    xmeans_instance.process()

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()

    obtained_cluster_sizes = [len(cluster) for cluster in clusters]
    assert len(sample) == sum(obtained_cluster_sizes)
    assert len(clusters) == len(centers)
    assert len(centers) <= kmax

    if expected_cluster_length is not None:
        assert len(centers) == len(expected_cluster_length)
        obtained_cluster_sizes.sort()
        expected_cluster_length.sort()
        assert obtained_cluster_sizes == expected_cluster_length
def templateMaxAllocatedClusters(ccore_flag, amount_clusters, size_cluster, offset, kinitial, kmax):
    input_data = []
    for index in range(amount_clusters):
        for _ in range(size_cluster):
            input_data.append([random.random() * index * offset, random.random() * index * offset])

    initial_centers = random_center_initializer(input_data, kinitial).initialize()
    xmeans_instance = xmeans(input_data, initial_centers, kmax, 0.025,
                             splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore_flag)
    xmeans_instance.process()

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    if len(clusters) != len(centers):
        print(input_data)
        print(initial_centers)

    assertion.ge(kmax, len(clusters))
    assertion.ge(kmax, len(centers))
    assertion.eq(len(clusters), len(centers))
def template_clustering(start_centers, path, tolerance=0.025,
                        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=False):
    # Note: the sample is read from a hard-coded path; the `path` argument is unused.
    sample = read_sample(
        '/home/tengmo/crawler_to_server_set_time/crawler/source_code_python2.7/cluster/test.txt')

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore)
    (ticks, result) = timedcall(xmeans_instance.process)

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN INFORMATION CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM NOISELESS DESCRIPTION LENGTH"

    print({'length': len(clusters), 'clus': clusters, 'cen': centers})
def templateLengthProcessData(input_sample, start_centers, expected_cluster_length, type_splitting,
                              kmax, ccore, **kwargs):
    if isinstance(input_sample, str):
        sample = read_sample(input_sample)
    else:
        sample = input_sample

    xmeans_instance = xmeans(sample, start_centers, kmax, 0.025, type_splitting, ccore, **kwargs)
    xmeans_instance.process()

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    wce = xmeans_instance.get_total_wce()

    obtained_cluster_sizes = [len(cluster) for cluster in clusters]
    assertion.eq(len(sample), sum(obtained_cluster_sizes))
    assertion.eq(len(clusters), len(centers))
    assertion.le(len(centers), kmax)

    # The total within-cluster error should equal the sum of squared Euclidean
    # distances from each point to the center of its cluster.
    expected_wce = 0.0
    metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
    for index_cluster in range(len(clusters)):
        for index_point in clusters[index_cluster]:
            expected_wce += metric(sample[index_point], centers[index_cluster])
    assertion.eq(expected_wce, wce)

    if expected_cluster_length is not None:
        assertion.eq(len(centers), len(expected_cluster_length))
        obtained_cluster_sizes.sort()
        expected_cluster_length.sort()
        assertion.eq(obtained_cluster_sizes, expected_cluster_length)
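# A minimal invocation sketch for the template above, assuming the pyclustering
# test samples are available. The sample choice, the initial centers, and the
# expected cluster sizes are illustrative: SAMPLE_SIMPLE1 holds two clusters of
# 5 points each.
from pyclustering.samples.definitions import SIMPLE_SAMPLES
from pyclustering.cluster.xmeans import splitting_type

templateLengthProcessData(SIMPLE_SAMPLES.SAMPLE_SIMPLE1,
                          [[3.7, 5.5], [6.7, 7.5]],
                          [5, 5],
                          splitting_type.BAYESIAN_INFORMATION_CRITERION,
                          20, True)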
def template_clustering(start_centers, path, tolerance=0.025,
                        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=False):
    sample = read_sample(path)

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore)
    (ticks, result) = timedcall(xmeans_instance.process)

    clusters = xmeans_instance.get_clusters()

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN_INFORMATION_CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM_NOISELESS_DESCRIPTION_LENGTH"

    print("Sample: ", path, "\nInitial centers: '", (start_centers is not None),
          "', Execution time: '", ticks, "', Number of clusters:", len(clusters), ",", criterion_string, "\n")

    draw_clusters(sample, clusters)
def cluster_xMean_FixSize(binsList, amount_initial_centers=3, kmax=10):
    sample = []
    means = []
    variances = []
    slopes = []
    for data_bin in binsList:
        sample.append([data_bin.get_representation(), data_bin.get_variance(), data_bin.get_slope()])
        means.append(data_bin.get_representation())
        variances.append(data_bin.get_variance())
        slopes.append(data_bin.get_slope())

    # Prepare initial centers - the amount of initial centers defines the number of
    # clusters from which X-Means starts its analysis.
    initial_centers = kmeans_plusplus_initializer(sample, amount_initial_centers).initialize()

    # Create an instance of the X-Means algorithm. The analysis starts from
    # `amount_initial_centers` clusters; at most `kmax` clusters can be allocated.
    xmeans_instance = xmeans(sample, initial_centers, kmax)
    xmeans_instance.process()

    # Extract clustering results: clusters and their centers.
    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    print(len(clusters))
    return {"clusters": clusters, "centers": centers}
def process_xmeans(sample):
    # NUMBER_CLUSTERS is assumed to be a module-level constant; the initial
    # centers are scattered with a random jitter per cluster index.
    instance = xmeans(sample, [[random() + (multiplier * 5), random() + (multiplier + 5)]
                               for multiplier in range(NUMBER_CLUSTERS)])
    (ticks, _) = timedcall(instance.process)
    return ticks
def template_clustering(start_centers, path, tolerance=0.025,
                        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=False):
    sample = read_sample(path)

    # time.clock() was removed in Python 3.8; perf_counter() is used instead.
    start = time.perf_counter()
    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore)
    (ticks, _) = timedcall(xmeans_instance.process)

    centers = xmeans_instance.get_centers()
    end = time.perf_counter()
    print(end - start)

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN INFORMATION CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM NOISELESS DESCRIPTION LENGTH"

    print("Sample: ", ntpath.basename(path), "\nInitial centers: '", (start_centers is not None),
          "', Execution time: '", ticks, "', Number of clusters:", len(centers), ",", criterion_string, "\n")
def testCoreInterfaceIntInputData(self):
    xmeans_instance = xmeans([[1], [2], [3], [20], [21], [22]], [[2], [21]], 5, ccore=True)
    xmeans_instance.process()
    assert len(xmeans_instance.get_clusters()) == 2
def templateLengthProcessData(input_sample, start_centers, expected_cluster_length, type_splitting, kmax, ccore):
    if isinstance(input_sample, str):
        sample = read_sample(input_sample)
    else:
        sample = input_sample

    xmeans_instance = xmeans(sample, start_centers, kmax, 0.025, type_splitting, ccore)
    xmeans_instance.process()

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()

    obtained_cluster_sizes = [len(cluster) for cluster in clusters]
    assert len(sample) == sum(obtained_cluster_sizes)
    assert len(clusters) == len(centers)
    assert len(centers) <= kmax

    if expected_cluster_length is not None:
        assert len(centers) == len(expected_cluster_length)
        obtained_cluster_sizes.sort()
        expected_cluster_length.sort()
        assert obtained_cluster_sizes == expected_cluster_length
def plot_unnecessary_part_by_clustering(in_mask_path, in_img_path, out_img_path, show_flag=0):
    """Draw detail-mask regions onto an image as rectangles.

    Args:
        in_mask_path: path to the input detail-mask image
        in_img_path: path to the frame image on which rectangles are drawn
        out_img_path: path for the output image with rectangles drawn
        show_flag: whether to display the image after it is generated
            (0: do not display, 1: display)

    Returns:
        None
    """
    in_mask = cv2.imread(in_mask_path, cv2.IMREAD_GRAYSCALE)
    height = in_mask.shape[0]
    width = in_mask.shape[1]

    # Collect the coordinates of all masked (white) pixels.
    X = []
    print('Checking unnecessary part...')
    for i in tqdm(range(height)):
        for j in range(width):
            if in_mask[i, j] == 255:
                X.append([i, j])

    initializer = xmeans.kmeans_plusplus_initializer(data=X, amount_centers=2)
    initial_centers = initializer.initialize()
    xm = xmeans.xmeans(data=X, initial_centers=initial_centers)
    xm.process()
    clusters = xm.get_clusters()

    img_out = cv2.imread(in_img_path)
    mask = create_blank_mask(height, width)
    print('Clustering unnecessary part...')
    for cluster in tqdm(clusters):
        coordinates = [X[item] for item in cluster]
        # boundingRect returns (x, y, w, h); X stores points as (row, col).
        x, y, rect_w, rect_h = cv2.boundingRect(np.array(coordinates))
        img_out = cv2.rectangle(img_out, (y, x), (y + rect_h, x + rect_w), (0, 0, 255), 2)
        for i in range(y, y + rect_h):
            for j in range(x, x + rect_w):
                mask[j, i] = 255

    cv2.imwrite(out_img_path, img_out)
    if show_flag == 1:
        cv2.imshow('window', img_out)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
def score_embeddings(self, min_length, max_num_speakers, mode):
    """ Score embeddings.

    Args:
        min_length (int): minimal length of segment used for clustering in milliseconds
        max_num_speakers (int): maximal number of speakers
        mode (str): running mode, see examples/diarization.py for details

    Returns:
        dict: dictionary with scores for each file
    """
    result_dict = {}
    logger.info('Scoring using `{}`.'.format('PLDA' if self.plda is not None else 'cosine distance'))
    for embedding_set in self.embeddings:
        name = os.path.normpath(embedding_set.name)
        embeddings_all = embedding_set.get_all_embeddings()
        embeddings_long = embedding_set.get_longer_embeddings(min_length)
        if len(embeddings_long) == 0:
            logger.warning(f'No embeddings found longer than {min_length} for embedding set `{name}`.')
            continue
        size = len(embedding_set)
        if size > 0:
            logger.info(f'Clustering `{name}` using {len(embeddings_long)} long embeddings.')
            if mode == 'diarization':
                if embedding_set.num_speakers is not None:
                    num_speakers = embedding_set.num_speakers
                else:
                    # Estimate the number of speakers with X-Means when it is not known.
                    xm = xmeans(embeddings_long, kmax=max_num_speakers)
                    xm.process()
                    num_speakers = len(xm.get_clusters())
                centroids = self.run_clustering(num_speakers, embeddings_long)
                if self.norm is None:
                    if self.plda is None:
                        result_dict[name] = cosine_similarity(embeddings_all, centroids).T
                    else:
                        result_dict[name] = self.plda.score(embeddings_all, centroids)
                else:
                    result_dict[name] = self.norm.s_norm(embeddings_all, centroids)
            else:
                clusters = []
                for k in range(1, MAX_SRE_CLUSTERS + 1):
                    if size >= k:
                        centroids = self.run_clustering(k, embeddings_long)
                        clusters.extend(x for x in centroids)
                result_dict[name] = np.array(clusters)
        else:
            logger.warning(f'No embeddings to score in `{embedding_set.name}`.')
    return result_dict
def cl_xmeans(sample):
    initial_centers = kmeans_plusplus_initializer(sample, 2).initialize()
    xmeans_instance = xmeans(sample, initial_centers, 20)
    xmeans_instance.process()
    return xmeans_instance.get_clusters()
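# Usage sketch for cl_xmeans above, assuming pyclustering's bundled samples;
# the chosen sample is illustrative.
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES

sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN)
clusters = cl_xmeans(sample)
print('Allocated clusters:', len(clusters))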
def xmeansRoutine(self):
    self.initial_centers = kmeans_plusplus_initializer(self.datalist, self.amount_initial_centers).initialize()
    self.xmeans_instance = xmeans(self.datalist, self.initial_centers, self.amount_max_centers)
    self.xmeans_instance.process()
    self.clusters = self.xmeans_instance.get_clusters()
    self.centers = self.xmeans_instance.get_centers()
def score_ivec(self, min_length, max_num_speakers, num_threads):
    """ Score i-vectors.

    Args:
        min_length (int): minimal length of segment used for clustering in milliseconds
        max_num_speakers (int): maximal number of speakers
        num_threads (int): number of threads to use

    Returns:
        dict: dictionary with scores for each file
    """
    scores_dict = {}
    for ivecset in self.ivecs:
        name = os.path.normpath(ivecset.name)
        ivecs_all = ivecset.get_all()
        ivecs_long = ivecset.get_longer(min_length)
        loginfo('Scoring {} ...'.format(name))
        size = ivecset.size()
        if size > 0:
            if ivecset.num_speakers is not None:
                num_speakers = ivecset.num_speakers
                sklearnkmeans = sklearnKMeans(n_clusters=num_speakers, n_init=100, n_jobs=num_threads).fit(ivecs_long)
                if self.plda is None:
                    centroids = sklearnkmeans.cluster_centers_
                else:
                    centroids = PLDAKMeans(sklearnkmeans.cluster_centers_, num_speakers, self.plda).fit(ivecs_long)
            else:
                # Estimate the number of speakers with X-Means when it is not known.
                xm = xmeans(ivecs_long, kmax=max_num_speakers)
                xm.process()
                num_speakers = len(xm.get_clusters())
                sklearnkmeans = sklearnKMeans(n_clusters=num_speakers, n_init=100, n_jobs=num_threads).fit(ivecs_long)
                centroids = sklearnkmeans.cluster_centers_
            if self.norm is None:
                if self.plda is None:
                    ivecs_all = Utils.l2_norm(ivecs_all)
                    centroids = Utils.l2_norm(centroids)
                    scores_dict[name] = cosine_similarity(ivecs_all, centroids).T
                else:
                    scores_dict[name] = self.plda.score(ivecs_all, centroids)
            else:
                ivecs_all = Utils.l2_norm(ivecs_all)
                centroids = Utils.l2_norm(centroids)
                scores_dict[name] = self.norm.s_norm(ivecs_all, centroids)
        else:
            logwarning('No i-vectors to score in {}.'.format(ivecset.name))
    return scores_dict
def templateClusterAllocationOneDimensionData(self, ccore_flag):
    input_data = [[0.0] for _ in range(10)] + [[5.0] for _ in range(10)] \
               + [[10.0] for _ in range(10)] + [[15.0] for _ in range(10)]

    xmeans_instance = xmeans(input_data, [[0.5], [5.5], [10.5], [15.5]], 20, 0.025,
                             splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore_flag)
    xmeans_instance.process()
    clusters = xmeans_instance.get_clusters()

    assert len(clusters) == 4
    for cluster in clusters:
        assert len(cluster) == 10
def template_clustering_random_points_performance(cluster_length, amount_clusters, ccore_flag):
    sample = [[random.random(), random.random()] for _ in range(cluster_length)]
    for index in range(1, amount_clusters):
        default_offset = 5
        sample += [[random.random() + default_offset * index, random.random() + default_offset * index]
                   for _ in range(cluster_length)]

    initial_center = [[random.random(), random.random()], [random.random(), random.random()]]

    ticks_array = []
    amount_measures = 5
    for _ in range(amount_measures):
        xmeans_instance = xmeans(sample, initial_center, 20, 0.25,
                                 splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore_flag)
        (ticks, _) = timedcall(xmeans_instance.process)
        ticks_array.append(ticks)

    print("Random sample (size: " + str(len(sample)) + "), execution time:", sum(ticks_array) / amount_measures)
def __run_feature_xmeans(self, features, num_init_centers=10, max_centers=30,
                         clust_size_threshold=1, dist_threshold=10) -> list:
    # Run the X-Means algorithm.
    initial_centers = kmeans_plusplus_initializer(features, num_init_centers).initialize()
    algo = xmeans(features, initial_centers=initial_centers, kmax=max_centers)
    algo.process()
    centroids, clusters = algo.get_centers(), algo.get_clusters()

    # Pre-process centroids: round coordinates to integer pixel positions.
    p_centroids = []
    for coord in centroids:
        row, col = coord[0], coord[1]
        p_centroids.append((int(round(row)), int(round(col))))

    # Determine pairs of centroids that are closer than dist_threshold.
    comb_indices = set()
    for comb in itertools.combinations(range(len(p_centroids)), 2):
        cen, c_cen = p_centroids[comb[0]], p_centroids[comb[1]]
        dist = math.sqrt((cen[0] - c_cen[0])**2 + (cen[1] - c_cen[1])**2)
        if dist <= dist_threshold:
            comb_indices.add(frozenset(comb))

    # Find transitive centroid clusters (connected components of close pairs).
    trans_centroids = []
    for comb in comb_indices:
        added_flag = False
        for i in range(len(trans_centroids)):
            if len(trans_centroids[i].intersection(comb)):
                trans_centroids[i] = trans_centroids[i].union(comb)
                added_flag = True
                break
        if not added_flag:
            trans_centroids.append(frozenset(comb))

    # Combine each set of close transitive centroids into its mean position.
    c_centroids, added_indices = [], set()
    for combs in trans_centroids:
        n_centroid = [0, 0]
        for c_idx in combs:
            added_indices.add(c_idx)
            n_centroid[0] += centroids[c_idx][0]
            n_centroid[1] += centroids[c_idx][1]
        n_centroid[0] /= len(combs)
        n_centroid[1] /= len(combs)
        c_centroids.append(n_centroid)

    # Keep the remaining centroids, purging under-sized clusters.
    for c_idx in range(len(centroids)):
        if c_idx in added_indices or len(clusters[c_idx]) <= clust_size_threshold:
            continue
        c_centroids.append(centroids[c_idx])

    return c_centroids
def template_clustering_performance(start_centers, path, tolerance=0.025,
                                    criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=False):
    sample = read_sample(path)

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore)
    (ticks, _) = timedcall(xmeans_instance.process)

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN INFORMATION CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM NOISELESS DESCRIPTION LENGTH"

    print("Sample: ", ntpath.basename(path), "', Execution time: '", ticks, "',", criterion_string)
def est_num_clusters(embs, max_num, init_num):
    """Use X-Means to estimate the number of speakers."""
    embs_list = embs.tolist()
    initial_centers = kmeans_plusplus_initializer(embs_list, init_num).initialize()
    xm = xmeans(embs_list, initial_centers, kmax=max_num, ccore=True)
    xm.process()
    num_speakers = len(xm.get_clusters())
    print('Estimated number of speakers: ' + str(num_speakers))
    return num_speakers
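# Usage sketch for est_num_clusters above; the random embeddings stand in for
# real speaker embeddings, and the dimensions are arbitrary.
import numpy as np

embs = np.random.rand(200, 128)  # 200 embeddings of dimension 128
num_speakers = est_num_clusters(embs, max_num=10, init_num=2)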
def fit(self, X, y=None):
    initial_number_of_cluster_centers = self.min_clusters
    initial_centers = kmeans_plusplus_initializer(X, initial_number_of_cluster_centers).initialize()
    self.xmeans_instance = xmeans(X, initial_centers, self.max_clusters)
    self.xmeans_instance.process()
    return self
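# Usage sketch for the estimator-style wrapper above. `XMeansEstimator` is a
# hypothetical name for the class that carries the min_clusters/max_clusters
# attributes and the fit() method shown here; the data is a random placeholder.
import numpy as np

data = np.random.rand(100, 2).tolist()
estimator = XMeansEstimator(min_clusters=2, max_clusters=20)
estimator.fit(data)
clusters = estimator.xmeans_instance.get_clusters()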
def templatePredict(path_to_file, initial_centers, points, expected_amount, expected_closest_clusters, ccore, **kwargs):
    sample = read_sample(path_to_file)
    kmax = kwargs.get('kmax', 20)

    xmeans_instance = xmeans(sample, initial_centers, kmax, 0.025,
                             splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore)
    xmeans_instance.process()

    closest_clusters = xmeans_instance.predict(points)
    assertion.eq(expected_amount, len(xmeans_instance.get_clusters()))
    assertion.eq(len(expected_closest_clusters), len(closest_clusters))
    assertion.true(numpy.array_equal(numpy.array(expected_closest_clusters), closest_clusters))
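# Hypothetical invocation of the prediction template above; the sample, the
# initial centers, the query points, and the expected outputs are illustrative,
# mirroring the SAMPLE_SIMPLE1 layout of two clusters of 5 points.
from pyclustering.samples.definitions import SIMPLE_SAMPLES

templatePredict(SIMPLE_SAMPLES.SAMPLE_SIMPLE1,
                [[3.7, 5.5], [6.7, 7.5]],
                [[0.3, 0.2], [6.5, 8.2]],
                2,        # expected number of allocated clusters
                [0, 1],   # expected closest cluster for each query point
                True)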
def make_range_mask(in_mask_path, out_mask_path, show_flag=0):
    """Generate a range mask using X-Means.

    Args:
        in_mask_path: path to the input detail-mask image
        out_mask_path: path for the output range-mask image
        show_flag: whether to display the image after the mask is generated
            (0: do not display, 1: display)

    Returns:
        None
    """
    in_mask = cv2.imread(in_mask_path, cv2.IMREAD_GRAYSCALE)
    height = in_mask.shape[0]
    width = in_mask.shape[1]

    # Collect the coordinates of all masked (white) pixels.
    X = []
    print('Checking unnecessary part...')
    for i in tqdm(range(height)):
        for j in range(width):
            if in_mask[i, j] == 255:
                X.append([i, j])

    initializer = xmeans.kmeans_plusplus_initializer(data=X, amount_centers=2)
    initial_centers = initializer.initialize()
    xm = xmeans.xmeans(data=X, initial_centers=initial_centers)
    xm.process()
    clusters = xm.get_clusters()

    mask = create_blank_mask(height, width)
    print('Clustering unnecessary part...')
    for cluster in tqdm(clusters):
        coordinates = [X[item] for item in cluster]
        # boundingRect returns (x, y, w, h); X stores points as (row, col).
        x, y, rect_w, rect_h = cv2.boundingRect(np.array(coordinates))
        for i in range(y, y + rect_h):
            for j in range(x, x + rect_w):
                mask[j, i] = 255

    cv2.imwrite(out_mask_path, mask)
    if show_flag == 1:
        cv2.imshow('window', mask)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
def score_embeddings(self, min_length, max_num_speakers):
    """ Score embeddings.

    Args:
        min_length (int): minimal length of segment used for clustering in milliseconds
        max_num_speakers (int): maximal number of speakers

    Returns:
        dict: dictionary with scores for each file
    """
    scores_dict = {}
    logger.info('Scoring using `{}`.'.format('PLDA' if self.plda is not None else 'cosine distance'))
    for embedding_set in self.embeddings:
        name = os.path.normpath(embedding_set.name)
        embeddings_all = embedding_set.get_all_embeddings()
        embeddings_long = embedding_set.get_longer_embeddings(min_length)
        if len(embeddings_long) == 0:
            logger.warning('No embeddings found longer than {} for embedding set `{}`.'.format(min_length, name))
            continue
        size = len(embedding_set)
        if size > 0:
            logger.info('Clustering `{}` using {} long embeddings.'.format(name, len(embeddings_long)))
            if embedding_set.num_speakers is not None:
                num_speakers = embedding_set.num_speakers
                if self.use_l2_norm:
                    kmeans_clustering = SphericalKMeans(n_clusters=num_speakers, n_init=1000, n_jobs=1).fit(embeddings_long)
                else:
                    kmeans_clustering = sklearnKMeans(n_clusters=num_speakers, n_init=1000, n_jobs=1).fit(embeddings_long)
                if self.plda is None:
                    centroids = kmeans_clustering.cluster_centers_
                else:
                    centroids = PLDAKMeans(kmeans_clustering.cluster_centers_, num_speakers, self.plda).fit(embeddings_long)
            else:
                # Estimate the number of speakers with X-Means when it is not known.
                xm = xmeans(embeddings_long, kmax=max_num_speakers)
                xm.process()
                num_speakers = len(xm.get_clusters())
                kmeans_clustering = sklearnKMeans(n_clusters=num_speakers, n_init=100, n_jobs=1).fit(embeddings_long)
                centroids = kmeans_clustering.cluster_centers_
            if self.norm is None:
                if self.plda is None:
                    scores_dict[name] = cosine_similarity(embeddings_all, centroids).T
                else:
                    scores_dict[name] = self.plda.score(embeddings_all, centroids)
            else:
                scores_dict[name] = self.norm.s_norm(embeddings_all, centroids)
        else:
            logger.warning('No embeddings to score in `{}`.'.format(embedding_set.name))
    return scores_dict
def xmeans_model(self, sample):
    amount_initial_centers = 2
    initial_centers = kmeans_plusplus_initializer(sample, amount_initial_centers).initialize()

    # Create an instance of the X-Means algorithm. The algorithm starts analysis
    # from 2 clusters; the maximum number of clusters is half the sample size.
    max_num = int(len(sample) / 2)
    xmeans_instance = xmeans(sample, initial_centers, max_num)
    xmeans_instance.process()

    # Extract clustering results: clusters and their centers.
    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()
    return clusters, centers
def get_x_clusters(doc_vectors):
    # Prepare initial centers - the amount of initial centers defines the number of
    # clusters from which X-Means starts its analysis.
    amount_initial_centers = 2
    initial_centers = kmeans_plusplus_initializer(doc_vectors, amount_initial_centers).initialize()

    # Create an instance of the X-Means algorithm. The algorithm starts analysis from
    # 2 clusters; the maximum number of clusters that can be allocated is 20.
    xmeans_instance = xmeans(doc_vectors, initial_centers, 20)
    xmeans_instance.process()

    # Extract clustering results: the clusters.
    clusters = xmeans_instance.get_clusters()
    return clusters
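# Usage sketch for get_x_clusters above; the document vectors here are random
# placeholders for real embeddings.
import numpy as np

doc_vectors = np.random.rand(50, 16).tolist()
clusters = get_x_clusters(doc_vectors)
print('Number of document clusters:', len(clusters))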
def template_clustering(start_centers, path, tolerance=0.025,
                        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=False):
    sample = read_sample(path)

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore)
    (ticks, result) = timedcall(xmeans_instance.process)

    clusters = xmeans_instance.get_clusters()

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN_INFORMATION_CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM_NOISELESS_DESCRIPTION_LENGTH"

    print("Sample: ", path, "\tExecution time: ", ticks, "Number of clusters: ", len(clusters), criterion_string, "\n")

    draw_clusters(sample, clusters)
def templateLengthProcessData(self, path_to_file, start_centers, expected_cluster_length, type_splitting, ccore=False):
    sample = read_sample(path_to_file)

    xmeans_instance = xmeans(sample, start_centers, 20, 0.025, type_splitting, ccore)
    xmeans_instance.process()

    clusters = xmeans_instance.get_clusters()

    obtained_cluster_sizes = [len(cluster) for cluster in clusters]
    assert len(sample) == sum(obtained_cluster_sizes)

    obtained_cluster_sizes.sort()
    expected_cluster_length.sort()
    assert obtained_cluster_sizes == expected_cluster_length
def main():
    file_path = "/home/joli/Downloads/Av_CP343-CP348_ft.sp"
    frequencies, intensities = read_data(file_path)
    max_x = max(frequencies)
    max_y = max(intensities)
    avg = sum(intensities) / len(intensities)

    # Marie's function.
    plot.plot(frequencies, intensities, color='grey')
    # test(peak_finder, frequencies, intensities, 0.2)

    # Kyle's function.
    # plot.plot(frequencies, intensities, color='grey')
    # test(k_peak_finder, frequencies, intensities, 3)

    # Using PeakUtils.
    # plot.axhline(linewidth=4, y=avg, color='green', xmin=0, xmax=max_x)
    # plot.plot(frequencies, intensities, 'bo', color='grey')
    # test(peak_finder_peakutils, frequencies, intensities, 0.0001)

    # Optimized.
    # test(optimized_peak_finder, frequencies, intensities)

    # Test difference.
    # f1, i1 = peak_finder(frequencies, intensities, 1)
    f2, i2 = k_peak_finder(frequencies, intensities, 5)

    sample = read_sample("/home/joli/PycharmProjects/Experiments/168-175_pzf1.sp")
    instance = xmeans.xmeans(sample, [3.7, 5.5])
    instance.process()
    clusters = instance.get_clusters()
    print(clusters)

    # A = set(f1)
    # B = set(f2)
    # C = A & B
    # print(len(f1), len(f2), len(C))

    # plot.bar(f1, i1, color="purple")
    plot.bar(f2, i2, color="green", width=1)
    # plot.bar(C, [1] * len(C), color="yellow", bottom=-1)
    plot.show()
def template_clustering(start_centers, path, tolerance=0.025,
                        criterion=splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore=False):
    sample = read_sample(path)

    xmeans_instance = xmeans(sample, start_centers, 20, tolerance, criterion, ccore)
    (ticks, _) = timedcall(xmeans_instance.process)

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()

    criterion_string = "UNKNOWN"
    if criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
        criterion_string = "BAYESIAN INFORMATION CRITERION"
    elif criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
        criterion_string = "MINIMUM NOISELESS DESCRIPTION LENGTH"

    print("Sample: ", ntpath.basename(path), "\nInitial centers: '", (start_centers is not None),
          "', Execution time: '", ticks, "', Number of clusters:", len(clusters), ",", criterion_string, "\n")

    visualizer = cluster_visualizer()
    visualizer.set_canvas_title(criterion_string)
    visualizer.append_clusters(clusters, sample)
    visualizer.append_cluster(centers, None, marker='*')
    visualizer.show()