Пример #1
0
def speedup(n, clusters, dim):
    """Compare per-iteration timings of the reorder branch vs. the baseline.

    Clones and runs two nn_descent variants on the same dataset and returns
    (baseline_iteration_timings, reordered_iteration_timings, speedup_ratio),
    where the ratio is baseline median cycles over reordered median cycles.
    """
    # Dataset selection is driven by the global CLI arguments.
    if args.dataset == 'clustered':
        dataset = get_dataset(data_name='clustered', n=n, dim=dim,
                              clusters=clusters, noshuffle=False)
        print("Clustered Dataset")
    else:
        # NOTE(review): the n/clusters/dim parameters are ignored on this
        # path; MNIST is loaded with a fixed dim of 8 — confirm intent.
        dataset = get_dataset(data_name='mnist', n=None, dim=8)
        print("MNIST Dataset")

    # Baseline variant: blocked distances without data reordering.
    git_clone('blocked-distances-for-new-print-iterations')
    runs, baseline_timing = c_nearest_neighbors('tmp/nn_descent', dataset,
                                                20, 'l2', 2)
    baseline = extract_iteration_timing(runs[0].stdout)

    # Candidate variant: with data reordering.
    git_clone('reorder-data')
    runs, reorder_timing = c_nearest_neighbors('tmp/nn_descent', dataset,
                                               20, 'l2', 2)
    reordered = extract_iteration_timing(runs[0].stdout)

    ratio = baseline_timing.median_cycle / reorder_timing.median_cycle
    return baseline, reordered, ratio
Пример #2
0
def measure_costs(path, dataset, k, metric):
    """Run one gprof-instrumented repetition and parse its flat profile.

    The C implementation is compiled with profiling enabled and executed
    once; gprof's flat profile (-p) of the resulting binary is then wrapped
    in a Costdata object for the given metric.
    """
    # A single repetition is enough: we only need call counts, not timings.
    c_nearest_neighbors(path, dataset, k, metric, 1, gprof_compile=True)

    gprof = subprocess.run(['gprof', os.path.join(path, "a.out"), '-p'],
                           stdout=subprocess.PIPE,
                           universal_newlines=True)
    return Costdata(gprof.stdout, metric)
Пример #3
0
def benchmark(dataset_name, dim, path, k, metric, repetitions, n_start, n_end,
              n_res, prefix):
    """Benchmark the C implementation over dataset sizes and save the results.

    Sizes are taken from a log2 grid between 2**n_start and 2**n_end with
    n_res points per power of two.  For each size the run is timed, the
    number of similarity evaluations is measured via gprof, and all series
    are written out through save_data().
    """
    inputs, cycles, cycles_std, runtimes, sim_evals = [], [], [], [], []

    sizes = np.logspace(n_start, n_end,
                        num=n_res * (n_end - n_start + 1),
                        dtype=int, base=2)
    for n in sizes:
        dataset = get_dataset(dataset_name, n, dim)
        inputs.append(dataset.N)

        _, timing = c_nearest_neighbors(path, dataset, k, metric, repetitions)
        # Medians are recorded (rather than means) across repetitions.
        cycles.append(timing.median_cycle)
        cycles_std.append(timing.std_cycle)
        runtimes.append(timing.median_runtime)

        cost = measure_costs(path, dataset, k, metric)
        sim_evals.append(cost.metric_calls)
    print(sim_evals)

    # Flop count per L2 similarity evaluation: d subtractions, d squarings,
    # and d-1 additions, i.e. roughly 3d flops.
    # NOTE(review): the factor below evaluates to (3 - 1) = 2, not 3, which
    # contradicts the accounting above — confirm which is intended.
    # `dataset` is the last loop iteration's dataset; D is the same for all.
    flops = np.array(sim_evals) * dataset.D * (3 - 1)

    save_data(
        '{}_{}_dim{}_logn{}to{}_k{}'.format(prefix, dataset_name, dim, n_start,
                                            n_end, k), inputs, sim_evals,
        runtimes, cycles, cycles_std, flops)
Пример #4
0
def benchmark_dim(dataset_name, n, path, k, metric, repetitions, dim_start,
                  dim_end, dim_step, prefix):
    """Benchmark the C implementation over dimensions at a fixed dataset size.

    Sweeps dim from dim_start to dim_end (exclusive) in steps of dim_step.
    For each dimension the run is timed, the number of similarity
    evaluations is measured via gprof, a flop count is derived, and all
    series are written out through save_data().
    """
    inputs = []
    cycles = []
    cycles_std = []
    runtimes = []
    sim_evals = []
    flops = []

    for dim in np.arange(dim_start, dim_end, dim_step):
        inputs.append(dim)
        dataset = get_dataset(dataset_name, n, dim)

        nn_list, timing_data = c_nearest_neighbors(path, dataset, k, metric,
                                                   repetitions)
        # Medians are recorded (rather than means) across repetitions.
        cycles.append(timing_data.median_cycle)
        cycles_std.append(timing_data.std_cycle)
        runtimes.append(timing_data.median_runtime)

        cost_data = measure_costs(path, dataset, k, metric)
        sim_evals.append(cost_data.metric_calls)
        # Flop count per L2 similarity evaluation: d subtractions, d
        # squarings, d-1 additions, i.e. roughly 3d flops.
        # NOTE(review): the factor evaluates to (3 - 1) = 2, not 3, which
        # contradicts the accounting above — confirm which is intended.
        flops.append(cost_data.metric_calls * dataset.D * (3 - 1))
        # Report achieved flops/cycle for this dimension.
        print("Dim: ", dim, ", flops:", flops[-1] / cycles[-1])

    save_data(
        '{}_{}_n{}_dim{}to{}_k{}'.format(prefix, dataset_name, n, dim_start,
                                         dim_end, k), inputs, sim_evals,
        runtimes, cycles, cycles_std, flops)
Пример #5
0
    print(args)

# Script benchmarking the permutation produced by the greedy heuristic.
# A modified nn_descent (branch 'reorder-print-perm') is cloned into tmp;
# it prints the forward permutation and exits.

result = []
n = 2 ** 14
dim = 8
clusters = 8
dataset = get_dataset(data_name='clustered', n=n, dim=dim,
                      clusters=clusters, noshuffle=False)
git_clone('reorder-print-perm')
nn_list, no_reorder = c_nearest_neighbors('tmp/nn_descent', dataset, args.k,
                                          args.metric, args.repetitions)
s = nn_list[0].stdout

# The permutation is printed between two marker lines in stdout;
# extract the text between them and parse it as integers.
start = s.find("fwd_permutation\n") + len("fwd_permutation\n")
end = s.find("fwd_permutation_end")
substring = s[start:end]
fwd_perm = np.loadtxt(StringIO(substring), dtype=int)

# Apply the forward permutation: row i of X moves to index fwd_perm[i].
X = dataset.X
X_ = np.zeros(X.shape)
for i, target in enumerate(fwd_perm):
    X_[target] = X[i]

# Assign every permuted point to its nearest cluster mean.
nbrs = NearestNeighbors(n_neighbors=1).fit(dataset.means)
distances, indices = nbrs.kneighbors(X_)
Пример #6
0
                    help='print stdout of c code',
                    action='store_true',
                    dest='out')
# Echoing the C program's stdout is opt-in via the CLI flag parsed above.
parser.set_defaults(out=False)
args = parser.parse_args()
print(args)

# Build the dataset described entirely by the CLI arguments.
dataset = get_dataset(data_name=args.dataset,
                      n=args.n,
                      dim=args.dim,
                      clusters=args.clusters,
                      noshuffle=args.noshuffle)

# Time the C implementation and print its timing statistics.
nn_list, timing_data = c_nearest_neighbors(args.path,
                                           dataset,
                                           args.k,
                                           args.metric,
                                           args.repetitions,
                                           stdout=args.out)
timing_data.print()

# Optionally cross-check against the pure-Python implementation.
if args.verify or args.verifycmp:
    py_nn, py_timing_data = py_nearest_neighbors(dataset, args.k, args.metric,
                                                 args.repetitions)
    py_timing_data.print()

    if args.verifycmp:
        # Compare the recall of each C run against the first Python result.
        c_recall = list(map(py_nn[0].recall, nn_list))
        print("recall compared: ", c_recall)
    else:
        # NOTE(review): the ground-truth comparison appears to continue past
        # this chunk; only the exact-NN computation is visible here.
        true_nn = nearest_neighbors(dataset, args.k, args.metric)