Example #1
def do_evaluations(dataset_path):
    print("Doing evaluations for dataset %s" % dataset_path)
    algorithm_results = {'bv_kmeans': None, 'yinyang': None}

    for algorithm in algorithm_results:
        print("Executing k-means with algorithm: %s" % algorithm)
        km = kmeans.KMeans(n_jobs=1,
                           no_clusters=300,
                           algorithm=algorithm,
                           init='random',
                           seed=0,
                           verbose=False)
        km.fit(dataset_path)
        algorithm_results[algorithm] = km.get_tracked_params()

    f, (a0, a1, a2) = plt.subplots(1,
                                   3,
                                   gridspec_kw={'width_ratios': [1, 4, 4]},
                                   figsize=(18, 8))

    plot_overall_duration_subplot(a0, algorithm_results)
    plot_iteration_duration_subplot(a1, algorithm_results)
    plot_fulldist_calcs_subplot(a2, algorithm_results)

    f.tight_layout()
    destination_filename = join(dirname(__file__),
                                "calculations_evaluation.png")
    plt.savefig(destination_filename)

    print("plot was saved in the current folder to: %s" % destination_filename)
Example #2
def do_evaluations(dataset_path, dataset_name):

    # Load the dataset directly into CSR matrix format; this way it only needs to be converted once
    data_as_csrmatrix = get_csr_matrix_from_object(dataset_path)

    print("Doing evaluations for dataset %s" % dataset_path)
    algorithm_results = {
        'kmeans_optimized': {},
        'yinyang': {},
        'fast_yinyang': {},
        'elkan': {}
    }
    clusters = [100, 250, 1000]
    #clusters = [2, 3]

    # Do the evaluations for every algorithm and every no_clusters
    for algorithm in sorted(algorithm_results.keys()):
        for no_clusters in clusters:
            print("Executing k-means with algorithm: %s and k=%d" %
                  (algorithm, no_clusters))
            km = kmeans.KMeans(n_jobs=1,
                               no_clusters=no_clusters,
                               algorithm=algorithm,
                               init='random',
                               seed=0,
                               verbose=False)
            km.fit(data_as_csrmatrix)
            algorithm_results[algorithm][no_clusters] = km.get_tracked_params()

    # plot the results
    plot_overall_duration(algorithm_results, dataset_name)
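For orientation, a hypothetical way to read the nested result structure built above (illustrative only; the 'duration_kmeans' key is the one example #6 reads from get_tracked_params()):

tracked = algorithm_results['yinyang'][250]        # tracked-params dict of one run
print("yinyang, k=250: %.1f s" % (tracked['duration_kmeans'] / 1000))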
Example #3
def do_evaluations(datasets):

    dataset_results = {}
    for dataset_name, dataset_as_string in datasets:
        print("Doing evaluations for dataset %s" % dataset_name)
        dataset_results[dataset_name] = {
            'varying_k': {
                'avoided_calculations': []
            }
        }
        k_values = list(range(1, 102, 20)) + list(range(200, 1001, 200))
        bv_annz = 0.3
        algorithm = "kmeans_optimized"

        for k in k_values:
            print("Executing %s with k=%d (bv_annz: %f)" %
                  (algorithm, k, bv_annz))
            km = kmeans.KMeans(n_jobs=1,
                               no_clusters=k,
                               algorithm=algorithm,
                               init='random',
                               additional_params={'bv_annz': bv_annz},
                               seed=0,
                               verbose=False)
            km.fit(dataset_as_string)
            tracked = km.get_tracked_params()
            bv_success = sum(tracked['iteration_bv_calcs_success'])
            full_calcs = sum(tracked['iteration_full_distance_calcs'])
            dataset_results[dataset_name]['varying_k'][
                'avoided_calculations'].append(
                    bv_success * 100 / (bv_success + full_calcs))

    for dataset_name in dataset_results:
        plt.plot(
            k_values,
            dataset_results[dataset_name]['varying_k']['avoided_calculations'],
            '-',
            linewidth=3,
            label=dataset_name)

    plt.legend()
    plt.grid(True)

    plt.xlabel('number of clusters')
    plt.ylabel('avoided full distance calculations (percent)')
    plt.title(
        r'Varying k and observing avoided full distance calculations (bv annz = 0.3)'
    )

    destination_filename = join(dirname(__file__), "varying_k_evaluation.png")
    plt.savefig(destination_filename)

    print("plot was saved in the current folder to: %s" % destination_filename)
Example #4
def do_kmeans(result_q, control_params, params):
    
    X = control_params['libsvm_dataset_path']
    
    data_as_csrmatrix = get_csr_matrix_from_object(X)
    no_samples, _ = data_as_csrmatrix.shape
    print("Number of samples: %d" % no_samples)
    
    info = params['info']
    task = params['task']
    
    output = "for %s with algorithm=%s run=%d k=%d"%(info['dataset_name'],
                                                              info['algorithm'],
                                                              task['run'],
                                                              task['no_clusters'])
    
    if 'pca' in info['algorithm']:
      
      annz_input_matrix = data_as_csrmatrix.annz
      desired_no_eigenvectors = int(data_as_csrmatrix.annz * info['truncated_svd_annz_percentage'])
      print("Using TruncatedSVD to retrieve %d eigenvectors from input matrix with %d annz" % (desired_no_eigenvectors,
                                                                                               annz_input_matrix))
      p = TruncatedSVD(n_components=desired_no_eigenvectors)
      start = time.time()
      scipy_csr_matrix = data_as_csrmatrix.to_numpy()
      p.fit(scipy_csr_matrix)
      # convert to millis
      fin = (time.time() - start) * 1000 
      pca_projection_csrmatrix = get_csr_matrix_from_object(p.components_)
      (no_components, no_features) = p.components_.shape
      print("Time needed to complete getting %d eigenvectors with %d features with SVD:" % (no_components, no_features),
            fin, "(annz of the top eigenvectors:", pca_projection_csrmatrix.annz, ")")
      additional_algo_data = {info['algorithm']: {'data': pca_projection_csrmatrix, 'duration': fin}}
    else:
      additional_algo_data = {}
    
    print("Executing " + output)
    km = kmeans.KMeans(n_jobs=1, no_clusters=task['no_clusters'], algorithm=info['algorithm'],
                       init='random', seed=task['run'], verbose=True, additional_params=dict(task),
                       iteration_limit=task['iteration_limit'], additional_info=dict(info))
    
    if info['algorithm'] in additional_algo_data:
      km.fit(data_as_csrmatrix, external_vectors=additional_algo_data[info['algorithm']]['data'])
      result = km.get_tracked_params()
      result['truncated_svd'] = {}
      result['truncated_svd']['no_components'] = no_components
      result['truncated_svd']['no_features'] = no_features
      result['truncated_svd']['duration'] = additional_algo_data[info['algorithm']]['duration']
    else:
      km.fit(data_as_csrmatrix)
      result = km.get_tracked_params()
    
    result_q.put(result)
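A hypothetical driver for do_kmeans, sketched from its signature: result_q is assumed to be a multiprocessing queue (the function ends with result_q.put), the dict keys mirror the ones read inside the function, and the dataset path is a placeholder.

from multiprocessing import Process, Queue

control_params = {'libsvm_dataset_path': '/path/to/dataset.libsvm'}
params = {'info': {'dataset_name': 'example_ds',
                   'algorithm': 'bv_kmeans',
                   'truncated_svd_annz_percentage': 0.1},
          'task': {'run': 0, 'no_clusters': 10, 'iteration_limit': 100}}

q = Queue()
p = Process(target=do_kmeans, args=(q, control_params, params))
p.start()
result = q.get()     # tracked-params dict put on the queue by do_kmeans
p.join()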
Example #5
def do_evaluations(dataset_path, dataset_name):
  
  # Load the dataset directly into CSR matrix format; this way it only needs to be converted once
  data_as_csrmatrix = get_csr_matrix_from_object(dataset_path)
  
  # Convert the data to a scipy matrix for the scikit-learn runs below
  data_as_numpy = data_as_csrmatrix.to_numpy()
  
  # sklearn
  # - uses dense vectors to store the cluster centers. This makes the calculation of the distance between
  #   sparse samples and dense clusters very fast. However, if the input data has very large dimensions,
  #   storing dense cluster centers gets very costly.
  #
  # fcl
  # - uses sparse vectors everywhere. Calculating distances between a sparse center and a sparse sample is
  #   a lot more expensive than using a dense center. However, it is possible to cluster very high dimensional
  #   data into many clusters on a regular machine while keeping the memory usage very low.
  #   (See the back-of-the-envelope sketch after this function.)
  algorithm_results = {}
  
  # These values generate the official repository plot.
  #algorithms = ["elkan", "bv_kmeans", "yinyang", "bv_yinyang", "kmeans"]
  #clusters = [10, 50, 100, 500, 1000]
  
  # These reduced values are used to keep the tests fast
  algorithms = ["kmeans"]
  clusters = [2, 8]
  
  for no_clusters in clusters:
    for algorithm in algorithms:
      algorithm_name = "fcl_kmeans_" + algorithm
      if algorithm_name not in algorithm_results:
        algorithm_results[algorithm_name] = {}
      print("evaluating: fcl kmeans (%s) with k=%d and dataset %s"%(algorithm, no_clusters, dataset_name))
      dur = timeit(kmeans.KMeans(n_jobs=1, no_clusters=no_clusters, algorithm=algorithm, init='random', seed = 1, verbose = False)
             , data_as_csrmatrix)
      algorithm_results[algorithm_name][no_clusters] = dur
    
    algorithm_name = "sklearn_kmeans"
    if algorithm_name not in algorithm_results:
      algorithm_results[algorithm_name] = {}
    
    # Evaluating the speed of scikit-learn when clustering a sparse matrix
    print("evaluating: sklearn kmeans (sparse matrix) with k=%d and dataset %s"%(no_clusters, dataset_name))
    dur = timeit(sklearn.cluster.KMeans(n_init = 1, n_jobs=1, n_clusters=no_clusters, algorithm='full', init='random', random_state=1)
               , data_as_numpy)
    algorithm_results[algorithm_name][no_clusters] = dur
    

  # plot the results
  plot_sklearn_comparison(algorithm_results, dataset_name)
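A back-of-the-envelope sketch of the memory argument in the comments above. All numbers are made up, and it assumes the cluster centers stay roughly as sparse as the input data:

# k dense float64 centers always need k * d * 8 bytes, independent of data sparsity.
d, k, density = 1000000, 1000, 0.0001                   # hypothetical high-dimensional corpus
dense_center_bytes = k * d * 8                          # dense storage (sklearn style)
sparse_center_bytes = int(k * d * density) * (8 + 4)    # rough CSR payload: 8-byte value + 4-byte column index
print("dense centers : %.1f GB" % (dense_center_bytes / 1e9))   # ~8.0 GB
print("sparse centers: %.1f MB" % (sparse_center_bytes / 1e6))  # ~1.2 MB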
Example #6
def do_evaluations(datasets):

    dataset_results = {}
    for dataset_name, dataset_as_string in datasets:
        print("Doing evaluations for dataset %s" % dataset_name)
        dataset_results[dataset_name] = {
            'searching_best_b': {
                'durations': [],
                'bv_annz': []
            }
        }

        for i in range(1, 101, 10):
            bv_annz = float(i) / 100
            print("Executing k-means optimized with bv_annz: %f" % bv_annz)
            km = kmeans.KMeans(n_jobs=1,
                               no_clusters=1000,
                               algorithm="kmeans_optimized",
                               init='random',
                               additional_params={'bv_annz': bv_annz},
                               seed=0,
                               verbose=False)
            km.fit(dataset_as_string)
            dataset_results[dataset_name]['searching_best_b'][
                'durations'].append(
                    km.get_tracked_params()['duration_kmeans'] / 1000)
            dataset_results[dataset_name]['searching_best_b'][
                'bv_annz'].append(
                    km.get_tracked_params()['additional_params']['bv_annz'])

    for dataset_name in dataset_results:
        plt.plot(
            dataset_results[dataset_name]['searching_best_b']['bv_annz'],
            dataset_results[dataset_name]['searching_best_b']['durations'],
            '-',
            linewidth=3,
            label=dataset_name)

    plt.legend()
    plt.grid(True)

    plt.xlabel('relative block vector size')
    plt.ylabel('time / s')
    plt.title(r'Varying the block vector size in algorithm kmeans-optimized')

    destination_filename = join(dirname(__file__), "bv_annz_evaluation.png")
    plt.savefig(destination_filename)

    print("plot was saved in the current folder to: %s" % destination_filename)
Example #7
from __future__ import print_function
from fcl import kmeans
from pprint import pprint

if __name__ == "__main__":
    # Create dataset as string
    X = ('1 1:0.50 2:0.34\n' + '1 1:0.13 2:0.11\n' + '1 1:0.24 2:0.15\n' +
         '1 1:0.67 2:0.24\n' + '1 1:0.12 2:0.89\n' + '1 1:0.52       \n' +
         '1 1:0.21 2:0.97\n')

    # this example shows how to use tracking parameters
    km = kmeans.KMeans(algorithm='bv_kmeans',
                       init='kmeans++',
                       no_clusters=2,
                       seed=0)
    km.fit(X)

    tracked_params = km.get_tracked_params()

    # tracked params is a dict with various items

    # tracked_params['general_params']                      # stores all params kmeans was run with
    # tracked_params['general_params']['no_clusters']       # number of clusters requested
    # tracked_params['general_params']['algorithm']         # the algorithm used
    # tracked_params['general_params']['seed']              # The seed used for clustering
    # tracked_params['general_params']['remove_empty']      # If true, empty clusters are removed after clustering
    # tracked_params['general_params']['iteration_limit']   # After how many iterations to stop (if not converged by then)
    # tracked_params['general_params']['tol']               # If objective does not improve more than 'tol', converge
    # tracked_params['general_params']['init']              # Initialization strategy used
    # tracked_params['general_params']['no_cores_used']     # Number of cores used for this experiment
    #
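A minimal sketch of reading a few of these tracked values (the key names are taken from the comment list above; the formatting is my own):

print("algorithm used: %s" % tracked_params['general_params']['algorithm'])
print("clusters requested: %d" % tracked_params['general_params']['no_clusters'])
pprint(tracked_params['general_params'])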
Example #8
    # this specifies the cluster to which each sample should initially be assigned
    initialization_params[kmeans.INIT_PRMS_TOKEN_ASSIGNMENTS] = [
        0,
        1,
        1,  # closest to itself
        0,
        0,
        2,  # closest to itself
        1
    ]

    # this example shows how to cluster a matrix read from a string.
    # a file in libsvm format can be passed the same way.
    km = kmeans.KMeans(no_clusters=2,
                       seed=1,
                       initialization_params=initialization_params)
    idx = km.fit_predict(X)
    init_params_out = km.get_output_initialization_params()

    # output the input initialization params
    print()
    print("Initialization input parameters:")
    pprint.pprint(initialization_params)

    # to be able to compare them with the output initialization params
    print()
    print("Initialization output parameters:")
    pprint.pprint(init_params_out)

    # Determine which samples fall into the same clusters
Example #9
def do_evaluations(dataset_path, dataset_name):

    # Load the dataset directly into CSR matrix format; this way it only needs to be converted once
    data_as_csrmatrix = get_csr_matrix_from_object(dataset_path)

    print("Doing evaluations for dataset %s" % dataset_path)

    algorithm_results = {
        'bv_kmeans': {},
        'yinyang': {},
        'bv_yinyang': {},
        'elkan': {},
        'pca_kmeans': {},
        'pca_elkan': {},
        'pca_yinyang': {}
    }

    additional_algo_data = {}

    calculate_svd = False
    for algo in algorithm_results:
        if "pca" in algo:
            calculate_svd = True
            break

    if calculate_svd:
        p = TruncatedSVD(n_components=int(data_as_csrmatrix.annz * 0.1))
        start = time.time()
        p.fit(data_as_csrmatrix.to_numpy())
        # convert to millis
        fin = (time.time() - start) * 1000
        print("TruncatedSVD took %f ms" % fin)
        pca_projection_csrmatrix = get_csr_matrix_from_object(p.components_)
        for algorithm in algorithm_results:
            if algorithm.startswith("pca_"):
                additional_algo_data[algorithm] = {
                    'data': pca_projection_csrmatrix,
                    'duration': fin
                }

    clusters = [100, 250, 1000]

    # Do the evaluations for every algorithm and every no_clusters
    for algorithm in sorted(algorithm_results.keys()):
        for no_clusters in clusters:
            print("Executing k-means with algorithm: %s and k=%d" %
                  (algorithm, no_clusters))
            km = kmeans.KMeans(n_jobs=1,
                               no_clusters=no_clusters,
                               algorithm=algorithm,
                               init='random',
                               seed=0,
                               verbose=True)
            if algorithm not in additional_algo_data:
                km.fit(data_as_csrmatrix)
            else:
                km.fit(
                    data_as_csrmatrix,
                    external_vectors=additional_algo_data[algorithm]['data'])

            algorithm_results[algorithm][no_clusters] = km.get_tracked_params()
            if algorithm in additional_algo_data:
                algorithm_results[algorithm][no_clusters][
                    'duration_kmeans'] += additional_algo_data[algorithm][
                        'duration']

    # plot the results
    plot_overall_duration(algorithm_results, dataset_name)
Example #10
from __future__ import print_function
import fcl
import os
from fcl import kmeans
from fcl.datasets import load_example_dataset
from os.path import abspath, join, dirname

if __name__ == "__main__":
    # Download dataset and put it into ds_folder
    ds_folder = abspath(
        join(dirname(__file__), os.pardir, os.pardir, os.pardir, 'datasets'))
    dataset_path = load_example_dataset(ds_folder)

    # this example shows how to cluster a dataset in libsvm format available under dataset_path.
    km = kmeans.KMeans(no_clusters=10, seed=0)
    km.fit(dataset_path)

    # it is now also possible to directly predict a dataset on the filesystem
    # If the dataset has M samples, then idx is an Mx1 array assigning each sample the closest cluster index.
    idx = km.predict(dataset_path)

    # Determine which samples fall into the same clusters
    clusters = {}
    for sample_id, cluster_id in enumerate(idx):
        if cluster_id not in clusters:
            clusters[cluster_id] = []
        clusters[cluster_id].append(sample_id)

    # Show which samples are in the same cluster
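    # (A minimal completion sketch, not part of the original snippet; the print
    # format below is my assumption.)
    for cluster_id in sorted(clusters):
        print("cluster %d contains samples: %s" % (cluster_id, clusters[cluster_id]))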