def run(definition, dataset, count, run_count, batch):
    """Build the index for *definition* on *dataset* and run its query groups.

    Loads the train/test splits, applies the per-distance dataset transform,
    fits the algorithm once, then runs ``run_individual_query`` once per query
    argument group and persists every result set via ``store_results``.

    :param definition: algorithm definition (module/constructor/arguments and
        optional ``query_argument_groups``)
    :param dataset: dataset name passed to ``get_dataset``/``store_results``
    :param count: number of nearest neighbours requested per query
    :param run_count: how many query passes ``run_individual_query`` performs
    :param batch: whether downstream querying runs in batch mode
    """
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    # Removed leftover debugging output that printed type() of every local.
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    # Apply the distance-specific transform (e.g. normalisation) to both
    # splits before indexing/querying.
    X_train = dataset_transform[distance](X_train)
    X_test = dataset_transform[distance](X_test)

    try:
        # NOTE(review): `prepared_queries` is probed but never used below;
        # the call is kept in case implementations rely on it being invoked.
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        # Index size is the delta of the algorithm's self-reported memory
        # usage around fit().
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get
        # run once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..."
                  % (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(
                algo, X_train, X_test, distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(definition.algorithm, batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        # Always let the algorithm release its resources.
        algo.done()
def run(definition, dataset, count, run_count, batch):
    """Fit the algorithm described by *definition* and benchmark its queries.

    The index is built exactly once; every query argument group of the
    definition is then evaluated with ``run_individual_query`` and each
    result set is persisted through ``store_results``.  Cleanup via
    ``algo.done()`` is guaranteed even when fitting or querying fails.
    """
    algo = instantiate_algorithm(definition)
    assert (not definition.query_argument_groups
            or hasattr(algo, "set_query_arguments")), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        # Probe the (currently unused) prepared-queries capability; the call
        # is preserved in case implementations depend on being asked.
        prepared_queries = (algo.supports_prepared_queries()
                            if hasattr(algo, "supports_prepared_queries")
                            else False)

        fit_start = time.time()
        mem_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - fit_start
        # Index size = growth of the algorithm's self-reported memory usage.
        index_size = algo.get_memory_usage() - mem_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        # Algorithms without query argument groups still run once, using a
        # single empty, harmless group.
        groups = definition.query_argument_groups or [[]]

        for group_no, query_arguments in enumerate(groups, 1):
            print("Running query argument group %d of %d..."
                  % (group_no, len(groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(
                algo, X_train, X_test, distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
def run(definition, dataset, count, run_count, batch):
    """Build the index for *definition* on *dataset* and run all query groups.

    Like the other ``run`` variants, but additionally supports graph-building
    algorithms: for those, the test vectors are prepended to the training set
    before fitting, the neighbour count is fixed up-front via ``set_count``,
    and evaluation goes through ``check_graph`` instead of
    ``run_individual_query``.

    :param definition: algorithm definition (module/constructor/arguments and
        optional ``query_argument_groups``)
    :param dataset: dataset name for ``get_dataset``/``store_results``
    :param count: number of nearest neighbours requested per query
    :param run_count: number of query passes for ``run_individual_query``
    :param batch: whether downstream querying runs in batch mode
    """
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor, definition.arguments)
    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    if algo.builds_graph():
        # Test data first to avoid converting test set index to graph index
        X_train = numpy.concatenate((X_test, X_train))
        # The protocol expects the count to be given at query time, so it has
        # to be set as a parameter beforehand.
        algo.set_count(count)
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))
    try:
        # NOTE(review): `prepared_queries` is probed but never used below;
        # the call may matter to implementations, so it is left in place.
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()
        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        # Index size is the delta of the algorithm's self-reported memory
        # usage around fit().
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)
        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get run
        # once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]
        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..."
                  % (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            if algo.builds_graph():
                # Graph algorithms are validated rather than timed per query.
                descriptor, results = check_graph(algo, X_train, X_test,
                                                  distance, count)
            else:
                descriptor, results = run_individual_query(algo, X_train,
                                                           X_test, distance,
                                                           count, run_count,
                                                           batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(definition.algorithm,
                                                    batch)
            descriptor["dataset"] = dataset
            descriptor["count"] = int(count)
            descriptor["batch_mode"] = batch
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        # Always let the algorithm release its resources.
        algo.done()
def run(definition, dataset, count, run_count=3, force_single=False,
        use_batch_query=False):
    """Fit *definition* on *dataset* and time its queries over several runs.

    Performs ``run_count`` query passes (single-threaded, thread-pooled, or
    batched depending on the flags), keeps the best average search time, and
    stores the last pass's per-query results together with summary attributes.

    :param definition: algorithm definition to instantiate and benchmark
    :param dataset: dataset name for ``get_dataset``/``store_results``
    :param count: number of nearest neighbours requested per query
    :param run_count: number of timing passes (best one is recorded)
    :param force_single: disable the thread pool even if the algorithm
        supports threads
    :param use_batch_query: send all queries at once via ``batch_query``
    """
    algo = instantiate_algorithm(definition)
    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))
    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        # Index size is the growth of the algorithm's self-reported size.
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)
        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i + 1, run_count))
            # a bit dumb but can't be a scalar since of Python's scoping rules
            n_items_processed = [0]

            def single_query(v):
                # Time one query and attach true distances to the candidates.
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in candidates]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...'
                          % (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    # Fixed stray ')' at the end of the original message.
                    print('warning: algorithm %s returned %d results, '
                          'but count is only %d'
                          % (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                # Time the whole batch, then spread the cost evenly per query.
                start = time.time()
                # BUG FIX: the original stored the reply in `result` but then
                # zipped against the outer, not-yet-assigned name `results`,
                # raising NameError (or silently reusing stale data) on batch
                # runs.  Use the local reply.
                batch_results = algo.batch_query(X, count)
                total = (time.time() - start)
                candidates = [
                    [(int(idx),
                      float(metrics[distance]['distance'](v, X_train[idx])))
                     for idx in single_results]
                    for v, single_results in zip(X, batch_results)]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                # Context manager releases the workers (the original leaked
                # the ThreadPool).
                with multiprocessing.pool.ThreadPool() as pool:
                    results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            # Renamed generator variables so the `time` module isn't shadowed.
            total_time = sum(t for t, _ in results)
            total_candidates = sum(len(c) for _, c in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
            "distance": distance,
            "count": int(count),
            "algo": definition.algorithm,
            "dataset": dataset
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        # Always let the algorithm release its resources.
        algo.done()
def run(definition, dataset, count, run_count=3, force_single=False,
        use_batch_query=False):
    """Fit *definition* on *dataset* and time its queries over several runs.

    Performs ``run_count`` timing passes (single-threaded, thread-pooled, or
    batched depending on the flags), tracks the best average search time, and
    stores the final pass's per-query results plus summary attributes.

    :param definition: algorithm definition to instantiate and benchmark
    :param dataset: dataset name for ``get_dataset``/``store_results``
    :param count: number of nearest neighbours requested per query
    :param run_count: number of timing passes (best one is recorded)
    :param force_single: disable the thread pool even if the algorithm
        supports threads
    :param use_batch_query: send all queries at once via ``batch_query``
    """
    algo = instantiate_algorithm(definition)
    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))
    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        # Index size is the growth of the algorithm's self-reported size.
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)
        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i + 1, run_count))
            # a bit dumb but can't be a scalar since of Python's scoping rules
            n_items_processed = [0]

            def single_query(v):
                # Time one query and attach true distances to the candidates.
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in candidates]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...'
                          % (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    # Fixed stray ')' at the end of the original message.
                    print('warning: algorithm %s returned %d results, '
                          'but count is only %d'
                          % (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                # Time the whole batch, then spread the cost evenly per query.
                start = time.time()
                # BUG FIX: the original stored the reply in `result` but then
                # zipped against the outer, not-yet-assigned name `results`,
                # raising NameError (or silently reusing stale data) on batch
                # runs.  Use the local reply.
                batch_results = algo.batch_query(X, count)
                total = (time.time() - start)
                candidates = [
                    [(int(idx),
                      float(metrics[distance]['distance'](v, X_train[idx])))
                     for idx in single_results]
                    for v, single_results in zip(X, batch_results)]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                # Context manager releases the workers (the original leaked
                # the ThreadPool).
                with multiprocessing.pool.ThreadPool() as pool:
                    results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            # Renamed generator variables so the `time` module isn't shadowed.
            total_time = sum(t for t, _ in results)
            total_candidates = sum(len(c) for _, c in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        # Always let the algorithm release its resources.
        algo.done()