def result_evaluation_dataset_speed_comparison(out_folder, out_folder_csv):
    """Collect k-means timing results from every experiment DB in *out_folder*
    and emit the speed-comparison plot/CSV into *out_folder_csv*.

    For each (dataset, no_clusters, algorithm, run) the k-means duration is
    recorded; when a truncated-SVD preprocessing step was timed, its duration
    is added on top.  Incomplete result groups are pruned before plotting.

    Resulting structure:
        result_data[ds]['results'][no_clusters][alg][metric][run]
        result_data[ds]['infos'][info_key]
    """

    for fcnt, plotname in [('do_kmeans', 'kmeans_speeds')]:
        print(plotname)
        run_identifiers = ExperimentDB.get_identifiers(out_folder, fcnt)

        result_data = {}
        for run_identifier in run_identifiers:
            db = ExperimentDB(out_folder, fcnt, run_identifier)

            for resid in db.get_algorithm_run_ids():
                (control_params, params,
                 res) = db.get_experiment_result_from_run_id(resid)
                if res is None:
                    # Experiment did not produce a result; skip it.
                    continue
                ds = params['info']['dataset_name']
                alg = params['info']['algorithm']
                no_clusters = params['task']['no_clusters']
                run = params['task']['run']
                duration_kmeans = res['duration_kmeans']
                no_iterations = len(res['iteration_changes'])

                # Lazily build the nested result structure.
                ds_entry = result_data.setdefault(
                    ds, {'results': {}, 'infos': {}})
                alg_entry = ds_entry['results'].setdefault(
                    no_clusters, {}).setdefault(alg, {})
                alg_entry.setdefault('duration', {})
                alg_entry.setdefault('no_iterations', {})

                # Total duration includes SVD preprocessing time if present.
                alg_entry['duration'][run] = duration_kmeans
                if 'truncated_svd' in res:
                    alg_entry['duration'][run] += res['truncated_svd']['duration']

                ds_entry['infos']['input_dimension'] = res['input_dimension']
                ds_entry['infos']['input_samples'] = res['input_samples']
                ds_entry['infos']['input_annz'] = res['input_annz']
                alg_entry['no_iterations'][run] = no_iterations

            # Drop groups with missing runs after each DB is merged in.
            remove_incomplete_data(result_data)

            print("Result data:")
            pprint(result_data)

        create_plot(output_folder=out_folder_csv,
                    plot_name=plotname,
                    pdata=result_data)
# --- Example #2 (site-scrape artifact; original marker: "Пример #2" / "0") ---
def result_evaluation_minibatch_best_params(out_folder,
                                            out_folder_csv,
                                            remove_incomplete=False,
                                            ignore_datasets=None):
    """Collect minibatch parameter-sweep results from every experiment DB in
    *out_folder* and return them as a nested OrderedDict:

        result_data[ds]['results'][no_clusters][alg][metric][run][param_percent]
        result_data[ds]['infos'][info_key]

    ``param_percent`` is the swept parameter: the truncated-SVD annz
    percentage for pca algorithms, the block-vector annz for bv algorithms,
    and 0 otherwise.  Datasets listed in *ignore_datasets* are skipped; when
    *remove_incomplete* is true, incomplete groups are pruned before
    returning.
    """
    # BUG FIX: the original used a mutable default argument (ignore_datasets={}).
    if ignore_datasets is None:
        ignore_datasets = {}

    fcnt, plotname = ('do_minibatch_best_params', 'kmeans_params')
    print(plotname)
    run_identifiers = ExperimentDB.get_identifiers(out_folder, fcnt)

    result_data = OrderedDict()
    for run_identifier in run_identifiers:
        db = ExperimentDB(out_folder, fcnt, run_identifier)
        print(run_identifiers)
        for resid in db.get_algorithm_run_ids():
            (control_params, params,
             res) = db.get_experiment_result_from_run_id(resid)
            print(resid, control_params, params)
            if res is None:
                continue

            ds = params['info']['dataset_name']
            alg = params['info']['algorithm']
            no_clusters = params['task']['no_clusters']
            run = params['task']['run']
            duration_kmeans = res['duration_kmeans']
            no_iterations = len(res['iteration_changes'])
            iteration_durations = res['iteration_durations']
            iteration_changes = res['iteration_changes']
            iteration_wcssd = res['iteration_wcssd']

            if 'pca' in alg:
                param_percent = params['info']['truncated_svd_annz_percentage']
            elif 'bv' in alg:
                param_percent = params['task']['bv_annz']
            else:
                param_percent = 0

            if ds in ignore_datasets:
                continue

            if ds not in result_data:
                result_data[ds] = OrderedDict()
                result_data[ds]['results'] = OrderedDict()
                result_data[ds]['infos'] = OrderedDict()

            results_ds = result_data[ds]['results']
            if no_clusters not in results_ds:
                results_ds[no_clusters] = OrderedDict()
            if alg not in results_ds[no_clusters]:
                results_ds[no_clusters][alg] = OrderedDict()
            alg_data = results_ds[no_clusters][alg]

            for descr in ('iteration_durations', 'iteration_changes',
                          'iteration_wcssd', 'duration', 'no_iterations'):
                if descr not in alg_data:
                    alg_data[descr] = OrderedDict()

            # (The original repeated this per-run init a second time further
            # below; once is sufficient.)
            for descr in ('iteration_durations', 'iteration_changes',
                          'iteration_wcssd', 'duration'):
                if run not in alg_data[descr]:
                    alg_data[descr][run] = OrderedDict()

            kmeans_duration_this_run = duration_kmeans
            if 'truncated_svd' in res:
                kmeans_duration_this_run += res['truncated_svd']['duration']

            if param_percent in alg_data['duration'][run]:
                raise Exception(
                    "dataset=%s no_clusters=%s alg=%s duration run=%s already added !!! %s %s"
                    % (ds, str(no_clusters), alg, str(run), control_params,
                       params))

            alg_data['duration'][run][param_percent] = kmeans_duration_this_run

            result_data[ds]['infos']['input_dimension'] = res[
                'input_dimension']
            result_data[ds]['infos']['input_samples'] = res['input_samples']
            result_data[ds]['infos']['input_annz'] = res['input_annz']

            if run in alg_data['no_iterations']:
                # Sanity check: the same run must report the same iteration
                # count regardless of param_percent.
                if alg_data['no_iterations'][run] != no_iterations:
                    print(
                        alg, run, no_iterations,
                        alg_data['no_iterations'][run], ds,
                        no_clusters, param_percent, resid)
                    raise Exception(
                        "Number of iterations is not identical! len(res['iteration_changes']) = %d, no_iterations= %d resid=%d"
                        % (no_iterations,
                           alg_data['no_iterations'][run], resid))
            else:
                alg_data['no_iterations'][run] = no_iterations

            alg_data['iteration_durations'][run][
                param_percent] = iteration_durations
            alg_data['iteration_changes'][run][
                param_percent] = iteration_changes
            # BUG FIX: the original stored iteration_changes into the
            # iteration_wcssd slot as well.
            alg_data['iteration_wcssd'][run][
                param_percent] = iteration_wcssd

    if remove_incomplete:
        remove_incomplete_data(result_data)

    # BUG FIX: the original returned inside the run_identifier loop, silently
    # discarding all but the first identifier's results.
    return result_data
def _sparse_matrix_bytes(no_rows, no_stored_elements,
                         size_data=8, size_key=4, size_pointer=8):
    """Approximate CSR storage in bytes: each stored element needs a data
    value plus a column key, and a matrix with ``no_rows`` rows needs
    ``no_rows + 1`` row-pointer entries."""
    return (no_stored_elements * (size_data + size_key)
            + (no_rows + 1) * size_pointer)


def _estimate_mem_consumption_bytes(alg, res):
    """Estimate the additional working memory (in bytes) algorithm *alg*
    needed on top of the input data, based on the result record *res*.

    Raises Exception for an algorithm without a memory model.
    """
    size_of_data_storage_element = 8  # double precision values
    no_samples = res['input_samples']

    if alg == 'kmeans':
        # Plain k-means keeps no auxiliary structures.
        return 0

    no_clusters_remaining = res['no_clusters_remaining']

    # elkan stores two dense matrices:
    # 1. lower_bound_matrix = no_samples * no_clusters_remaining
    # 2. distance_between_clusters_matrix = no_clusters_remaining^2
    lower_bound_matrix_bytes = (no_samples * no_clusters_remaining
                                * size_of_data_storage_element)
    distance_between_clusters_bytes = (no_clusters_remaining
                                       * no_clusters_remaining
                                       * size_of_data_storage_element)

    if alg == 'elkan':
        return lower_bound_matrix_bytes + distance_between_clusters_bytes

    if alg in ('pca_elkan', 'pca_kmeans'):
        # Both keep the orthonormal basis and the projections of samples and
        # clusters onto it.  These matrices are stored as sparse matrices
        # although they are almost completely dense.
        no_components = res['truncated_svd']['no_components']
        orthonormal_basis_bytes = _sparse_matrix_bytes(
            no_components,
            no_components * res['truncated_svd']['no_features'])
        projected_samples_bytes = _sparse_matrix_bytes(
            no_samples, no_samples * no_components)
        # BUG FIX: the original sized the cluster matrix's row-pointer array
        # with no_samples + 1; a matrix with no_clusters_remaining rows needs
        # no_clusters_remaining + 1 pointers.
        projected_clusters_bytes = _sparse_matrix_bytes(
            no_clusters_remaining, no_clusters_remaining * no_components)
        total = (orthonormal_basis_bytes + projected_samples_bytes
                 + projected_clusters_bytes)
        if alg == 'pca_elkan':
            # pca_elkan additionally keeps the two dense elkan matrices.
            total += lower_bound_matrix_bytes + distance_between_clusters_bytes
        return total

    if alg in ('kmeans_optimized', 'fast_yinyang'):
        # Both keep block-vector projections of samples and clusters.
        annz_projected_samples = res['block_vector_data']['annz']
        # annz of the projected cluster matrix was not measured; use the
        # sample matrix annz as an approximation.
        annz_projected_clusters = annz_projected_samples
        projected_samples_bytes = _sparse_matrix_bytes(
            no_samples, annz_projected_samples * no_samples)
        # BUG FIX: row-pointer count corrected as for pca_* above.
        projected_clusters_bytes = _sparse_matrix_bytes(
            no_clusters_remaining,
            annz_projected_clusters * no_clusters_remaining)
        total = projected_samples_bytes + projected_clusters_bytes
        if alg == 'fast_yinyang':
            # fast_yinyang also keeps the dense yinyang lower-bound matrix
            # over the t cluster groups.
            t = no_clusters_remaining / 10
            total += no_samples * t * size_of_data_storage_element
        return total

    if alg == 'yinyang':
        # yinyang stores a dense matrix holding a lower bound to each of the
        # t cluster groups.
        t = no_clusters_remaining / 10
        return no_samples * t * size_of_data_storage_element

    raise Exception(
        "please provide details for the memory consumption of %s" % alg)


def result_evaluation_memory_consumption(out_folder, out_folder_csv):
    """Collect k-means results, pair each run's duration with an estimate of
    the algorithm's additional memory consumption (in MiB), and create the
    comparison plot/CSV in *out_folder_csv*.

    The 'duration' metric holds (mem_consumption_mib, duration_seconds)
    tuples keyed by run.
    """

    for fcnt, plotname in [('do_kmeans', 'kmeans_speeds')]:
        print(plotname)
        run_identifiers = ExperimentDB.get_identifiers(out_folder, fcnt)

        result_data = {}
        for run_identifier in run_identifiers:
            db = ExperimentDB(out_folder, fcnt, run_identifier)

            for resid in db.get_algorithm_run_ids():
                (control_params, params,
                 res) = db.get_experiment_result_from_run_id(resid)
                if res is None:
                    continue
                ds = params['info']['dataset_name']
                alg = params['info']['algorithm']
                no_clusters = params['task']['no_clusters']
                run = params['task']['run']
                duration_kmeans = res['duration_kmeans']
                no_iterations = len(res['iteration_changes'])

                # Lazily build the nested result structure.
                ds_entry = result_data.setdefault(
                    ds, {'results': {}, 'infos': {}})
                alg_entry = ds_entry['results'].setdefault(
                    no_clusters, {}).setdefault(alg, {})
                alg_entry.setdefault('duration', {})
                alg_entry.setdefault('no_iterations', {})

                mem_consumption = _estimate_mem_consumption_bytes(alg, res)

                kmeans_duration_this_run = duration_kmeans
                if 'truncated_svd' in res:
                    kmeans_duration_this_run += res['truncated_svd']['duration']

                # bytes -> MiB
                mem_consumption = (mem_consumption / 1024.0) / 1024.0
                # BUG FIX: the original indexed 'results' with
                # no_clusters_remaining here, although the structure is keyed
                # by no_clusters (and no_clusters_remaining was undefined for
                # plain kmeans, raising NameError).
                alg_entry['duration'][run] = (float(mem_consumption),
                                              kmeans_duration_this_run)

                ds_entry['infos']['input_dimension'] = res['input_dimension']
                ds_entry['infos']['input_samples'] = res['input_samples']
                ds_entry['infos']['input_annz'] = res['input_annz']
                alg_entry['no_iterations'][run] = no_iterations

            remove_incomplete_data(result_data)

            print("Result data:")
            pprint(result_data)

        create_plot(output_folder=out_folder_csv,
                    plot_name=plotname,
                    pdata=result_data)