def do_evaluations(dataset_path): print("Doing evaluations for dataset %s" % dataset_path) algorithm_results = {'bv_kmeans': None, 'yinyang': None} for algorithm in algorithm_results: print("Executing k-means with algorithm: %s" % algorithm) km = kmeans.KMeans(n_jobs=1, no_clusters=300, algorithm=algorithm, init='random', seed=0, verbose=False) km.fit(dataset_path) algorithm_results[algorithm] = km.get_tracked_params() f, (a0, a1, a2) = plt.subplots(1, 3, gridspec_kw={'width_ratios': [1, 4, 4]}, figsize=(18, 8)) plot_overall_duration_subplot(a0, algorithm_results) plot_iteration_duration_subplot(a1, algorithm_results) plot_fulldist_calcs_subplot(a2, algorithm_results) f.tight_layout() destination_filename = join(dirname(__file__), "calculations_evaluation.png") plt.savefig(destination_filename) print("plot was saved in the current folder to: %s" % destination_filename)
def do_evaluations(dataset_path, dataset_name):
    # Load the dataset directly into csr matrix format; this way it only needs
    # to be converted once.
    data_as_csrmatrix = get_csr_matrix_from_object(dataset_path)

    print("Doing evaluations for dataset %s" % dataset_path)

    algorithm_results = {
        'kmeans_optimized': {},
        'yinyang': {},
        'fast_yinyang': {},
        'elkan': {}
    }

    clusters = [100, 250, 1000]
    #clusters = [2, 3]

    # Do the evaluations for every algorithm and every no_clusters
    for algorithm in sorted(algorithm_results.keys()):
        for no_clusters in clusters:
            print("Executing k-means with algorithm: %s and k=%d" %
                  (algorithm, no_clusters))
            km = kmeans.KMeans(n_jobs=1, no_clusters=no_clusters,
                               algorithm=algorithm, init='random', seed=0,
                               verbose=False)
            km.fit(data_as_csrmatrix)
            algorithm_results[algorithm][no_clusters] = km.get_tracked_params()

    # plot the results
    plot_overall_duration(algorithm_results, dataset_name)

def do_evaluations(datasets):
    dataset_results = {}

    for dataset_name, dataset_as_string in datasets:
        print("Doing evaluations for dataset %s" % dataset_name)
        dataset_results[dataset_name] = {
            'varying_k': {
                'avoided_calculations': []
            }
        }

        k_values = list(range(1, 102, 20)) + list(range(200, 1001, 200))
        bv_annz = 0.3
        algorithm = "kmeans_optimized"
        for k in k_values:
            print("Executing %s with k=%d (bv_annz: %f)" % (algorithm, k, bv_annz))
            km = kmeans.KMeans(n_jobs=1, no_clusters=k, algorithm=algorithm,
                               init='random',
                               additional_params={'bv_annz': bv_annz},
                               seed=0, verbose=False)
            km.fit(dataset_as_string)

            # percentage of full distance calculations that were avoided
            # thanks to successful block vector calculations
            tracked = km.get_tracked_params()
            bv_calcs_success = sum(tracked['iteration_bv_calcs_success'])
            full_distance_calcs = sum(tracked['iteration_full_distance_calcs'])
            dataset_results[dataset_name]['varying_k']['avoided_calculations'].append(
                bv_calcs_success * 100 / (bv_calcs_success + full_distance_calcs))

    for dataset_name in dataset_results:
        plt.plot(k_values,
                 dataset_results[dataset_name]['varying_k']['avoided_calculations'],
                 '-', linewidth=3, label=dataset_name)

    plt.legend()
    plt.grid(True)
    plt.xlabel('number of clusters')
    plt.ylabel('avoided full distance calculations (percent)')
    plt.title(r'Varying k and observing avoided full distance calculations (bv annz = 0.3)')

    destination_filename = join(dirname(__file__), "varying_k_evaluation.png")
    plt.savefig(destination_filename)
    print("plot was saved in the current folder to: %s" % destination_filename)

def do_kmeans(result_q, control_params, params):
    X = control_params['libsvm_dataset_path']
    data_as_csrmatrix = get_csr_matrix_from_object(X)
    no_samples, _ = data_as_csrmatrix.shape
    print("Number of samples: %d" % no_samples)

    info = params['info']
    task = params['task']

    output = "for %s with algorithm=%s run=%d k=%d" % (
        info['dataset_name'], info['algorithm'], task['run'], task['no_clusters'])

    if 'pca' in info['algorithm']:
        annz_input_matrix = data_as_csrmatrix.annz
        desired_no_eigenvectors = int(data_as_csrmatrix.annz *
                                      info['truncated_svd_annz_percentage'])
        print("Using TruncatedSVD to retrieve %d eigenvectors from input matrix with %d annz" %
              (desired_no_eigenvectors, annz_input_matrix))
        p = TruncatedSVD(n_components=desired_no_eigenvectors)
        start = time.time()
        scipy_csr_matrix = data_as_csrmatrix.to_numpy()
        p.fit(scipy_csr_matrix)
        # convert to millis
        fin = (time.time() - start) * 1000

        pca_projection_csrmatrix = get_csr_matrix_from_object(p.components_)
        (no_components, no_features) = p.components_.shape
        print("Time needed to complete getting %d eigenvectors with %d features with SVD:" %
              (no_components, no_features), fin,
              "(annz of the top eigenvectors:", pca_projection_csrmatrix.annz, ")")
        additional_algo_data = {info['algorithm']: {'data': pca_projection_csrmatrix,
                                                    'duration': fin}}
    else:
        additional_algo_data = {}

    print("Executing " + output)
    km = kmeans.KMeans(n_jobs=1, no_clusters=task['no_clusters'],
                       algorithm=info['algorithm'], init='random',
                       seed=task['run'], verbose=True,
                       additional_params=dict(task),
                       iteration_limit=task['iteration_limit'],
                       additional_info=dict(info))

    if info['algorithm'] in additional_algo_data:
        km.fit(data_as_csrmatrix,
               external_vectors=additional_algo_data[info['algorithm']]['data'])
        result = km.get_tracked_params()
        result['truncated_svd'] = {}
        result['truncated_svd']['no_components'] = no_components
        result['truncated_svd']['no_features'] = no_features
        result['truncated_svd']['duration'] = additional_algo_data[info['algorithm']]['duration']
    else:
        km.fit(data_as_csrmatrix)
        result = km.get_tracked_params()

    result_q.put(result)

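# A minimal sketch of how do_kmeans might be driven by an experiment runner.
# All concrete values below (dataset path, algorithm, task settings) are
# hypothetical placeholders; 'pca_*' algorithms would additionally require
# info['truncated_svd_annz_percentage'].
from multiprocessing import Queue

result_q = Queue()
control_params = {'libsvm_dataset_path': 'datasets/example.libsvm'}
params = {
    'info': {'dataset_name': 'example', 'algorithm': 'bv_kmeans'},
    'task': {'run': 0, 'no_clusters': 100, 'iteration_limit': 1000}
}
do_kmeans(result_q, control_params, params)
result = result_q.get()
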
def do_evaluations(dataset_path, dataset_name):
    # Load the dataset directly into csr matrix format; this way it only needs
    # to be converted once.
    data_as_csrmatrix = get_csr_matrix_from_object(dataset_path)

    # Convert data to numpy array
    data_as_numpy = data_as_csrmatrix.to_numpy()

    # sklearn
    # - uses dense vectors to store the cluster centers. This makes the
    #   calculation of the distance between sparse samples and dense clusters
    #   very fast. However, if the input data is very high dimensional,
    #   storing dense cluster centers gets very costly.
    #
    # fcl
    # - uses sparse vectors everywhere. Calculating distances between a sparse
    #   center and a sparse sample is a lot more expensive than using a dense
    #   center. However, it is possible to cluster very high dimensional data
    #   into many clusters on a regular machine while keeping the memory usage
    #   very low.

    algorithm_results = {}

    # These values generate the official repository plot.
    #algorithms = ["elkan", "bv_kmeans", "yinyang", "bv_yinyang", "kmeans"]
    #clusters = [10, 50, 100, 500, 1000]

    # These values are used in order to allow the tests to be more efficient
    algorithms = ["kmeans"]
    clusters = [2, 8]

    for no_clusters in clusters:
        for algorithm in algorithms:
            algorithm_name = "fcl_kmeans_" + algorithm
            if algorithm_name not in algorithm_results:
                algorithm_results[algorithm_name] = {}

            print("evaluating: fcl kmeans (%s) with k=%d and dataset %s" %
                  (algorithm, no_clusters, dataset_name))
            dur = timeit(kmeans.KMeans(n_jobs=1, no_clusters=no_clusters,
                                       algorithm=algorithm, init='random',
                                       seed=1, verbose=False),
                         data_as_csrmatrix)
            algorithm_results[algorithm_name][no_clusters] = dur

        algorithm_name = "sklearn_kmeans"
        if algorithm_name not in algorithm_results:
            algorithm_results[algorithm_name] = {}

        # Evaluate the speed of scikit-learn when clustering a sparse matrix
        print("evaluating: sklearn kmeans (sparse matrix) with k=%d and dataset %s" %
              (no_clusters, dataset_name))
        dur = timeit(sklearn.cluster.KMeans(n_init=1, n_jobs=1,
                                            n_clusters=no_clusters,
                                            algorithm='full', init='random',
                                            random_state=1),
                     data_as_numpy)
        algorithm_results[algorithm_name][no_clusters] = dur

    # plot the results
    plot_sklearn_comparison(algorithm_results, dataset_name)

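# The timeit helper used above is defined elsewhere in the examples. A minimal
# sketch of what it might look like (an assumption, not the repository's
# version): fit the given clusterer on the data and return the elapsed
# wall-clock time in seconds.
import time

def timeit(clusterer, data):
    start = time.time()
    clusterer.fit(data)
    return time.time() - start
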
def do_evaluations(datasets):
    dataset_results = {}

    for dataset_name, dataset_as_string in datasets:
        print("Doing evaluations for dataset %s" % dataset_name)
        dataset_results[dataset_name] = {
            'searching_best_b': {
                'durations': [],
                'bv_annz': []
            }
        }

        for i in range(1, 101, 10):
            bv_annz = float(i) / 100
            print("Executing k-means optimized with bv_annz: %f" % bv_annz)
            km = kmeans.KMeans(n_jobs=1, no_clusters=1000,
                               algorithm="kmeans_optimized", init='random',
                               additional_params={'bv_annz': bv_annz},
                               seed=0, verbose=False)
            km.fit(dataset_as_string)
            dataset_results[dataset_name]['searching_best_b']['durations'].append(
                km.get_tracked_params()['duration_kmeans'] / 1000)
            dataset_results[dataset_name]['searching_best_b']['bv_annz'].append(
                km.get_tracked_params()['additional_params']['bv_annz'])

    for dataset_name in dataset_results:
        plt.plot(dataset_results[dataset_name]['searching_best_b']['bv_annz'],
                 dataset_results[dataset_name]['searching_best_b']['durations'],
                 '-', linewidth=3, label=dataset_name)

    plt.legend()
    plt.grid(True)
    plt.xlabel('relative block vector size')
    plt.ylabel('time / s')
    plt.title(r'Varying the block vector size in algorithm kmeans-optimized')

    destination_filename = join(dirname(__file__), "bv_annz_evaluation.png")
    plt.savefig(destination_filename)
    print("plot was saved in the current folder to: %s" % destination_filename)

from __future__ import print_function
from fcl import kmeans
from pprint import pprint

if __name__ == "__main__":
    # Create dataset as string
    X = ('1 1:0.50 2:0.34\n' +
         '1 1:0.13 2:0.11\n' +
         '1 1:0.24 2:0.15\n' +
         '1 1:0.67 2:0.24\n' +
         '1 1:0.12 2:0.89\n' +
         '1 1:0.52 \n' +
         '1 1:0.21 2:0.97\n')

    # this example shows how to use tracking parameters
    km = kmeans.KMeans(algorithm='bv_kmeans', init='kmeans++', no_clusters=2, seed=0)
    km.fit(X)

    tracked_params = km.get_tracked_params()

    # tracked_params is a dict with various items:
    # tracked_params['general_params']                     # stores all params k-means was run with
    # tracked_params['general_params']['no_clusters']      # number of clusters requested
    # tracked_params['general_params']['algorithm']        # the algorithm used
    # tracked_params['general_params']['seed']             # the seed used for clustering
    # tracked_params['general_params']['remove_empty']     # if true, empty clusters are removed after clustering
    # tracked_params['general_params']['iteration_limit']  # after how many iterations to stop (if not converged by then)
    # tracked_params['general_params']['tol']              # if the objective does not improve more than 'tol', converge
    # tracked_params['general_params']['init']             # initialization strategy used
    # tracked_params['general_params']['no_cores_used']    # number of cores used for this experiment
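    # A minimal sketch of inspecting the tracked params listed above:
    pprint(tracked_params['general_params'])
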
# this specifies to which sample every cluster should initially be assigned
initialization_params[kmeans.INIT_PRMS_TOKEN_ASSIGNMENTS] = [
    0, 1, 1,  # closest to itself
    0, 0, 2,  # closest to itself
    1
]

# this example shows how to cluster a matrix read from a string.
# a file in libsvm format can be read and passed the same way.
km = kmeans.KMeans(no_clusters=2, seed=1,
                   initialization_params=initialization_params)
idx = km.fit_predict(X)

init_params_out = km.get_output_initialization_params()

# output the input initialization params
print()
print("Initialization input parameters:")
pprint.pprint(initialization_params)

# to be able to compare them with the output initialization params
print()
print("Initialization output parameters:")
pprint.pprint(init_params_out)

# Determine which samples fall into the same clusters
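# A sketch of grouping samples by cluster, following the same pattern as the
# other examples in this repository (idx assigns each sample its cluster index):
clusters = {}
for sample_id in range(len(idx)):
    cluster_id = idx[sample_id]
    if cluster_id not in clusters:
        clusters[cluster_id] = []
    clusters[cluster_id].append(sample_id)
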
def do_evaluations(dataset_path, dataset_name):
    # Load the dataset directly into csr matrix format; this way it only needs
    # to be converted once.
    data_as_csrmatrix = get_csr_matrix_from_object(dataset_path)

    print("Doing evaluations for dataset %s" % dataset_path)

    algorithm_results = {
        'bv_kmeans': {},
        'yinyang': {},
        'bv_yinyang': {},
        'elkan': {},
        'pca_kmeans': {},
        'pca_elkan': {},
        'pca_yinyang': {}
    }

    additional_algo_data = {}

    calculate_svd = False
    for algo in algorithm_results:
        if "pca" in algo:
            calculate_svd = True
            break

    if calculate_svd:
        p = TruncatedSVD(n_components=int(data_as_csrmatrix.annz * 0.1))
        start = time.time()
        p.fit(data_as_csrmatrix.to_numpy())
        # convert to millis
        fin = (time.time() - start) * 1000
        print("Time needed for TruncatedSVD: %f ms" % fin)
        pca_projection_csrmatrix = get_csr_matrix_from_object(p.components_)
        for algorithm in algorithm_results:
            if algorithm.startswith("pca_"):
                additional_algo_data[algorithm] = {
                    'data': pca_projection_csrmatrix,
                    'duration': fin
                }

    clusters = [100, 250, 1000]

    # Do the evaluations for every algorithm and every no_clusters
    for algorithm in sorted(algorithm_results.keys()):
        for no_clusters in clusters:
            print("Executing k-means with algorithm: %s and k=%d" %
                  (algorithm, no_clusters))
            km = kmeans.KMeans(n_jobs=1, no_clusters=no_clusters,
                               algorithm=algorithm, init='random', seed=0,
                               verbose=True)

            if algorithm not in additional_algo_data:
                km.fit(data_as_csrmatrix)
            else:
                km.fit(data_as_csrmatrix,
                       external_vectors=additional_algo_data[algorithm]['data'])

            algorithm_results[algorithm][no_clusters] = km.get_tracked_params()

            # the SVD duration counts towards the overall runtime of the
            # pca_* algorithms
            if algorithm in additional_algo_data:
                algorithm_results[algorithm][no_clusters]['duration_kmeans'] += \
                    additional_algo_data[algorithm]['duration']

    # plot the results
    plot_overall_duration(algorithm_results, dataset_name)

from __future__ import print_function
import fcl
import os

from fcl import kmeans
from fcl.datasets import load_example_dataset
from os.path import abspath, join, dirname

if __name__ == "__main__":
    # Download dataset and put it into ds_folder
    ds_folder = abspath(join(dirname(__file__), os.pardir, os.pardir,
                             os.pardir, 'datasets'))
    dataset_path = load_example_dataset(ds_folder)

    # this example shows how to cluster a dataset in libsvm format available
    # under dataset_path.
    km = kmeans.KMeans(no_clusters=10, seed=0)
    km.fit(dataset_path)

    # it is now also possible to directly predict a dataset on the filesystem.
    # If the dataset has M samples, then idx is an Mx1 array assigning each
    # sample the closest cluster index.
    idx = km.predict(dataset_path)

    # Determine which samples fall into the same clusters
    clusters = {}
    for sample_id in range(len(idx)):
        cluster_id = idx[sample_id]
        if cluster_id not in clusters:
            clusters[cluster_id] = []
        clusters[cluster_id].append(sample_id)

    # Show which samples are in the same cluster
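    # A minimal sketch of printing the cluster membership computed above:
    for cluster_id in sorted(clusters):
        print("cluster %d contains samples %s" % (cluster_id, clusters[cluster_id]))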