Exemplo n.º 1
0
    
    if not os.path.isdir(ds_folder):
      os.makedirs(ds_folder)
    
    if not os.path.isdir(out_folder):
      os.makedirs(out_folder)
       
    params_general = collections.OrderedDict()
    params_general['calculate_kmeans'] = collections.OrderedDict()
    clusters_big = [100, 1000, 10000]
    clusters_medium = [100, 500, 5000]
    clusters_small = [100, 250, 1000]
    
    do_evaluations(load_usps_dataset(ds_folder), 'usps', out_folder, params_general, clusters_small)
    if not args.testmode:
      do_evaluations(load_sector_dataset(ds_folder), 'sector', out_folder, params_general, clusters_small)
      do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'real_sim.scaled.bz2'), 'realsim', out_folder, params_general, clusters_medium)    
      do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'mediamill_static_label_scaled.bz2'), 'mediamill', out_folder, params_general, clusters_medium)
      do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'caltech101.scaled.bz2'), 'caltech101', out_folder, params_general, clusters_big)  
      do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'e2006_static_label.scaled.bz2'), 'e2006', out_folder, params_general, clusters_small)
      do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'avira_201.scaled.bz2'), 'avira201', out_folder, params_general, clusters_big)
      do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'kdd.scaled.bz2'), 'kdd2001', out_folder, params_general, clusters_big)
      do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'mnist800k.scaled.bz2'), 'mnist800k', out_folder, params_general, clusters_big)
  else:
    if not os.path.isdir(out_folder):
      raise Exception("cannot do evaluation with nonexisting output dir %s" % out_folder)

  if not os.path.isdir(output_path_latex):
    os.makedirs(output_path_latex)

  result_evaluation_dataset_speed_comparison(out_folder, output_path_latex)
        if args.testmode:
            truncated_svd_annz_percentage = [
                float(x) / 100.0 for x in range(6, 10, 2)
            ]
            bv_annz = [float(x) / 100.0 for x in range(25, 35, 5)]
        else:
            truncated_svd_annz_percentage = [
                float(x) / 100.0 for x in range(2, 42, 2)
            ]
            bv_annz = [float(x) / 100.0 for x in range(5, 75, 5)]
        do_evaluations(load_usps_dataset(ds_folder), 'usps', out_folder,
                       params_general, clusters_small,
                       truncated_svd_annz_percentage, bv_annz)
        if not args.testmode:
            do_evaluations(load_sector_dataset(ds_folder), 'sector',
                           out_folder, params_general, clusters_small,
                           truncated_svd_annz_percentage, bv_annz)
            do_evaluations(
                load_and_extract_dataset_from_github('fcl_datasets2',
                                                     ds_folder,
                                                     'real_sim.scaled.bz2'),
                'realsim', out_folder, params_general, clusters_medium,
                truncated_svd_annz_percentage, bv_annz)
            do_evaluations(
                load_and_extract_dataset_from_github(
                    'fcl_datasets2', ds_folder,
                    'mediamill_static_label_scaled.bz2'), 'mediamill',
                out_folder, params_general, clusters_medium,
                truncated_svd_annz_percentage, bv_annz)
            do_evaluations(
            label=dataset_name)

    plt.legend()
    plt.grid(True)

    plt.xlabel('number of clusters')
    plt.ylabel('avoided full distance calculations (percent)')
    plt.title(
        r'Varying k and observing avoided full distance calculations (bv annz = 0.3)'
    )

    destination_filename = join(dirname(__file__), "varying_k_evaluation.png")
    plt.savefig(destination_filename)

    print("plot was saved in the current folder to: %s" % destination_filename)


if __name__ == "__main__":
    # The dataset directory is resolved relative to this script:
    # three directory levels up, then into 'datasets'.
    script_dir = dirname(__file__)
    ds_folder = abspath(
        join(script_dir, os.pardir, os.pardir, os.pardir, 'datasets'))

    # (label, loader) pairs, processed in this order. Each loader is called
    # with the dataset folder and its return value is handed to open().
    loader_table = (
        ('sector', load_sector_dataset),
        ('usps', load_usps_dataset),
    )

    # Collect (label, full file content as text) tuples for the evaluation.
    datasets = []
    for ds_label, fetch_dataset in loader_table:
        with open(fetch_dataset(ds_folder), 'r') as handle:
            datasets.append((ds_label, handle.read()))

    do_evaluations(datasets)