def load_all_results():
    """Read all result files and compute all metrics"""
    all_runs_by_dataset = {'batch': {}, 'non-batch': {}}
    all_runs_by_algorithm = {'batch': {}, 'non-batch': {}}
    cached_true_dist = []
    old_sdn = None
    for properties, f in results.load_all_results():
        sdn = get_run_desc(properties)
        if sdn != old_sdn:
            dataset = get_dataset(properties["dataset"])
            cached_true_dist = list(dataset["distances"])
            old_sdn = sdn
        algo = properties["algo"]
        ms = compute_all_metrics(cached_true_dist, f, properties,
                                 args.recompute)
        algo_ds = get_dataset_label(sdn)
        idx = "non-batch"
        if properties["batch_mode"]:
            idx = "batch"
        all_runs_by_algorithm[idx].setdefault(
            algo, {}).setdefault(algo_ds, []).append(ms)
        all_runs_by_dataset[idx].setdefault(
            sdn, {}).setdefault(algo, []).append(ms)
    return (all_runs_by_dataset, all_runs_by_algorithm)
def load_all_results():
    """Read all result files and compute all metrics"""
    all_runs_by_dataset = {'batch': {}, 'non-batch': {}}
    all_runs_by_algorithm = {'batch': {}, 'non-batch': {}}
    cached_true_dist = []
    old_sdn = None
    for mode in ["non-batch", "batch"]:
        for properties, f in results.load_all_results(
                batch_mode=(mode == "batch")):
            sdn = get_run_desc(properties)
            if sdn != old_sdn:
                dataset = get_dataset(properties["dataset"])
                cached_true_dist = list(dataset["distances"])
                old_sdn = sdn
            algo_ds = get_dataset_label(sdn)
            desc_suffix = ("-batch" if mode == "batch" else "")
            algo = properties["algo"] + desc_suffix
            sdn += desc_suffix
            ms = compute_all_metrics(cached_true_dist, f, properties,
                                     args.recompute)
            all_runs_by_algorithm[mode].setdefault(
                algo, {}).setdefault(algo_ds, []).append(ms)
            all_runs_by_dataset[mode].setdefault(
                sdn, {}).setdefault(algo, []).append(ms)
    return (all_runs_by_dataset, all_runs_by_algorithm)
def load_all_results():
    """Read all result files and compute all metrics"""
    all_runs_by_dataset = {}
    all_runs_by_algorithm = {}
    cached_true_dist = []
    old_sdn = None
    for f in results.load_all_results():
        properties = dict(f.attrs)
        # TODO Fix this properly. Sometimes the hdf5 file returns bytes;
        # this converts those bytes to strings before we work with them.
        for k in properties.keys():
            try:
                properties[k] = properties[k].decode()
            except AttributeError:
                pass
        sdn = get_run_desc(properties)
        if sdn != old_sdn:
            dataset = get_dataset(properties["dataset"])
            cached_true_dist = list(dataset["distances"])
            old_sdn = sdn
        algo = properties["algo"]
        ms = compute_all_metrics(cached_true_dist, f, properties["algo"])
        algo_ds = get_dataset_label(sdn)
        all_runs_by_algorithm.setdefault(
            algo, {}).setdefault(algo_ds, []).append(ms)
        all_runs_by_dataset.setdefault(
            sdn, {}).setdefault(algo, []).append(ms)
    return (all_runs_by_dataset, all_runs_by_algorithm)
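# The helpers get_run_desc and get_dataset_label are referenced by the
# load_all_results variants above but are not defined in these snippets.
# A minimal sketch of plausible implementations, assuming run descriptors
# of the form "<dataset>_<count>_<distance>" (both the format and these
# bodies are assumptions, not taken from the snippets):
def get_run_desc(properties):
    # Build a descriptor string from the run's stored properties.
    return "%(dataset)s_%(count)d_%(distance)s" % properties

def get_dataset_label(descriptor):
    # Turn a run descriptor back into a human-readable plot label.
    dataset_name, count = descriptor.split("_")[:2]
    return "{} (k = {})".format(dataset_name, count)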
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor,
               definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print("type D: ", type(D))
    print("type x_train: ", type(X_train))
    print("type x_test: ", type(X_test))
    print("type distance: ", type(distance))
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    X_train = dataset_transform[distance](X_train)
    X_test = dataset_transform[distance](X_test)

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get
        # run once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                  (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(
                algo, X_train, X_test, distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
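# dataset_transform is indexed by the distance metric in the run() variant
# above but is not defined in these snippets. A minimal sketch, assuming
# angular data gets L2-normalised and other metrics pass through unchanged
# (the mapping and the set of keys are assumptions):
import numpy

dataset_transform = {
    # Normalise each row to unit length so angular distance reduces to
    # a dot product.
    'angular': lambda X: X / numpy.linalg.norm(X, axis=1, keepdims=True),
    'euclidean': lambda X: X,
    'hamming': lambda X: X,
    'jaccard': lambda X: X,
}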
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor,
               definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get
        # run once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                  (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            descriptor, results = run_individual_query(
                algo, X_train, X_test, distance, count, run_count, batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
def run(definition, dataset, count, run_count, batch):
    algo = instantiate_algorithm(definition)
    assert not definition.query_argument_groups \
        or hasattr(algo, "set_query_arguments"), """\
error: query argument groups have been specified for %s.%s(%s), but the \
algorithm instantiated from it does not implement the set_query_arguments \
function""" % (definition.module, definition.constructor,
               definition.arguments)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    if algo.builds_graph():
        # Test data first to avoid converting test set index to graph index
        X_train = numpy.concatenate((X_test, X_train))
        # The protocol expects the count to be given at query time, so it
        # has to be set as a parameter beforehand.
        algo.set_count(count)
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        prepared_queries = False
        if hasattr(algo, "supports_prepared_queries"):
            prepared_queries = algo.supports_prepared_queries()

        t0 = time.time()
        memory_usage_before = algo.get_memory_usage()
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_memory_usage() - memory_usage_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        query_argument_groups = definition.query_argument_groups
        # Make sure that algorithms with no query argument groups still get
        # run once by providing them with a single, empty, harmless group
        if not query_argument_groups:
            query_argument_groups = [[]]

        for pos, query_arguments in enumerate(query_argument_groups, 1):
            print("Running query argument group %d of %d..." %
                  (pos, len(query_argument_groups)))
            if query_arguments:
                algo.set_query_arguments(*query_arguments)
            if algo.builds_graph():
                descriptor, results = check_graph(algo, X_train, X_test,
                                                  distance, count)
            else:
                descriptor, results = run_individual_query(
                    algo, X_train, X_test, distance, count, run_count,
                    batch)
            descriptor["build_time"] = build_time
            descriptor["index_size"] = index_size
            descriptor["algo"] = get_algorithm_name(
                definition.algorithm, batch)
            descriptor["dataset"] = dataset
            descriptor["count"] = int(count)
            descriptor["batch_mode"] = batch
            store_results(dataset, count, definition, query_arguments,
                          descriptor, results, batch)
    finally:
        algo.done()
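# positive_int is used as an argparse `type=` hook throughout the main()
# variants below but is never defined in these snippets. A minimal sketch
# of what such a validator could look like (an assumption, not taken from
# the snippets):
import argparse

def positive_int(s):
    # Reject anything that is not a strictly positive integer.
    i = None
    try:
        i = int(s)
    except ValueError:
        pass
    if i is None or i < 1:
        raise argparse.ArgumentTypeError("%r is not a positive integer" % s)
    return i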
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only'
             ' the best result',
        default=5)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or -1'
             ' if no timeout should be set',
        default=2 * 3600)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--batch',
        action='store_true',
        help='If set, algorithms get all queries at once')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')
    parser.add_argument(
        '--parallelism',
        type=positive_int,
        help='Number of Docker containers in parallel',
        default=1)

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("annb")

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset, dimension = get_dataset(args.dataset)
    point_type = dataset.attrs.get('point_type', 'float')
    distance = dataset.attrs['distance']
    definitions = get_definitions(args.definitions, dimension, point_type,
                                  distance, args.count)

    # Filter out, from the loaded definitions, all those query argument
    # groups that correspond to experiments that have already been run.
    # (This might mean removing a definition altogether, so we can't just
    # use a list comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(args.dataset, args.count, definition,
                                     query_arguments, args.batch)
            if args.force or not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        logger.info(f'running only {args.algorithm}')
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]
                docker_tags.add(tag)

        if args.docker_tag:
            logger.info(f'running only {args.docker_tag}')
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag
            ]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            logger.info(
                f'not all docker images available, only: {set(docker_tags)}')
            logger.info(
                f'missing docker images: '
                f'{str(set(d.docker_tag for d in definitions).difference(docker_tags))}'
            )
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags
            ]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a
            # constructor of the right name, then the definition is broken
            if status == InstantiationStatus.NO_CONSTRUCTOR:
                raise Exception(
                    "%s.%s(%s): error: the module '%s' does not"
                    " expose the named constructor" %
                    (df.module, df.constructor, df.arguments, df.module))
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because
                # of a missing dependency), print a warning and remove
                # this definition from the list of things to be run
                logging.warning(
                    "%s.%s(%s): the module '%s' could not be "
                    "loaded; skipping" %
                    (df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            logger.info(
                f'Not running disabled algorithms '
                f'{[d for d in definitions if d.disabled]}')
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        logger.info(f'Order: {definitions}')

    if args.parallelism > multiprocessing.cpu_count() - 1:
        raise Exception(
            'Parallelism larger than %d! (CPU count minus one)' %
            (multiprocessing.cpu_count() - 1))

    # Multiprocessing magic to farm this out to all CPUs
    queue = multiprocessing.Queue()
    for definition in definitions:
        queue.put(definition)
    if args.batch and args.parallelism > 1:
        raise Exception(
            f"Batch mode uses all available CPU resources, --parallelism "
            f"should be set to 1. (Was: {args.parallelism})")
    workers = [
        multiprocessing.Process(target=run_worker, args=(i + 1, args, queue))
        for i in range(args.parallelism)
    ]
    [worker.start() for worker in workers]
    [worker.join() for worker in workers]
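# The multiprocessing setup above hands each worker process the shared
# definition queue via run_worker, which is not defined in these snippets.
# A plausible sketch, assuming each worker drains the queue and dispatches
# to run() or run_docker() from the surrounding code; the exact run_docker
# signature here is an assumption:
def run_worker(cpu, args, queue):
    # queue.empty() is racy in general, but the workers here only ever
    # consume, so a spurious extra loop iteration is harmless.
    while not queue.empty():
        definition = queue.get()
        if args.local:
            run(definition, args.dataset, args.count, args.runs, args.batch)
        else:
            run_docker(definition, args.dataset, args.count, args.runs,
                       args.timeout, args.batch)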
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular')
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or '
             '-1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--single',
        help='run only a single algorithm instance at a time',
        action='store_true')
    parser.add_argument(
        '--batch',
        help='provide the query set to each algorithm as a single batch',
        action='store_true')
    parser.add_argument(
        '--no_save_index',
        help='do not save indices',
        action='store_true')
    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print('The following algorithms are supported...')
        for point in definitions:
            print('\t... for the point type "%s"...' % point)
            for metric in definitions[point]:
                print('\t\t... and the distance metric "%s":' % metric)
                for algorithm in definitions[point][metric]:
                    print('\t\t\t%s' % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    X_train = dataset['train']
    X_test = dataset['test']
    distance = dataset.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.count, distance):
            algos_already_run.add((run.attrs["library"], run.attrs["name"]))

    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = \
                [algo for algo in algos[args.algorithm]
                 if algo.name == args.sub_algorithm]

    algos_flat = []
    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)
    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, X_test, library, algo, distance,
                  send_pipe, args.runs, args.single, args.batch))
        p.start()
        send_pipe.close()
        timed_out = False
        try:
            r = recv_pipe.poll(args.timeout)
            if r:
                # If there's something waiting in the pipe at this point,
                # then the worker has begun sending us results and we
                # should receive them
                attrs, results = recv_pipe.recv()
                if "expect_extra" in attrs:
                    if attrs["expect_extra"]:
                        attrs["extra"] = recv_pipe.recv()
                    del attrs["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results,
                # then terminate the worker process (XXX: what should we do
                # about algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()
        if results:
            store_results(attrs, results, args.dataset, args.count, distance)
        elif timed_out:
            print('algorithm worker process took too long')
        else:
            print('algorithm worker process stopped unexpectedly')
    default='linear')
parser.add_argument(
    '--raw',
    help='Show raw results (not just Pareto frontier) in faded colours',
    action='store_true')
parser.add_argument(
    '--batch',
    help='Plot runs in batch mode',
    action='store_true')
parser.add_argument(
    '--recompute',
    help='Clears the cache and recomputes the metrics',
    action='store_true')
args = parser.parse_args()

if not args.output:
    args.output = 'results/%s.png' % (args.dataset +
                                      ('-batch' if args.batch else ''))
print('writing output to %s' % args.output)

dataset, _ = get_dataset(args.dataset)
count = int(args.count)
unique_algorithms = get_unique_algorithms()
results = load_all_results(args.dataset, count, args.batch)
linestyles = create_linestyles(sorted(unique_algorithms))
runs = compute_metrics(np.array(dataset["distances"]), results,
                       args.x_axis, args.y_axis, args.recompute)
if not runs:
    raise Exception('Nothing to plot')

create_plot(runs, args.raw, args.x_scale, args.y_scale, args.x_axis,
            args.y_axis, args.output, linestyles, args.batch)
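# Example invocation of the plotting entry point above. The script name is
# an assumption, and --x-axis/--y-axis are assumed to be defined in the
# truncated argparse section that precedes this fragment:
#
#     python plot.py --dataset glove-100-angular --count 10 \
#         --x-axis k-nn --y-axis qps --batch --recompute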
def run(definition, dataset, count, run_count=3, force_single=False,
        use_batch_query=False):
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i + 1, run_count))
            # a bit dumb, but it can't be a scalar because of Python's
            # scoping rules
            n_items_processed = [0]

            def single_query(v):
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in candidates
                ]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...' %
                          (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    print('warning: algorithm %s returned %d results, '
                          'but count is only %d' %
                          (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                start = time.time()
                result = algo.batch_query(X, count)
                total = (time.time() - start)
                # NB: zip over `result` (the neighbour lists just returned),
                # not the enclosing scope's `results`, which is unbound at
                # this point
                candidates = [
                    [(int(idx),
                      float(metrics[distance]['distance'](v, X_train[idx])))
                     for idx in single_results]
                    for v, single_results in zip(X, result)
                ]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(time for time, _ in results)
            total_candidates = sum(len(candidates)
                                   for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
            "distance": distance,
            "count": int(count),
            "algo": definition.algorithm,
            "dataset": dataset
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove')
    parser.add_argument(
        '--query-dataset',
        metavar='NAME',
        help='load query points from another dataset instead of choosing '
             'them randomly from the training dataset',
        default=None)
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--distance',
        help='the metric used to calculate the distance between points',
        default='angular')
    parser.add_argument(
        '--limit',
        help='the maximum number of points to load from the dataset, or '
             '-1 to load all of them',
        type=int,
        default=-1)
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or '
             '-1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--single',
        help='run only a single algorithm instance at a time',
        action='store_true')
    parser.add_argument(
        '--no_save_index',
        help='do not save indices',
        action='store_true')
    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print("The following algorithms are supported...")
        for point in definitions:
            print("\t... for the point type '%s'..." % point)
            for metric in definitions[point]:
                print("\t\t... and the distance metric '%s':" % metric)
                for algorithm in definitions[point][metric]:
                    print("\t\t\t%s" % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    manifest, X = get_dataset(args.dataset, args.limit)
    if not args.query_dataset:
        X_train, X_test = split_dataset(
            X, test_size=manifest['dataset']['test_size'])
    else:
        X_train = X
        query_manifest, X_test = get_dataset(args.query_dataset)
        assert manifest["dataset"] == query_manifest["dataset"], """\
error: the training dataset and query dataset have incompatible manifests"""

    queries_fn = get_query_cache_path(args.dataset, args.count, args.limit,
                                      args.distance, args.query_dataset)
    print('storing queries in', queries_fn)

    if not os.path.exists(queries_fn):
        queries = compute_distances(args.distance, args.count, X_train,
                                    X_test)
        with open(queries_fn, 'wb') as f:
            pickle.dump(queries, f)
    else:
        with open(queries_fn, 'rb') as f:
            queries = pickle.load(f)

    print('got', len(queries), 'queries')

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.limit, args.count,
                               args.distance, args.query_dataset):
            algos_already_run.add((run["library"], run["name"]))

    point_type = manifest['dataset']['point_type']
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, args.distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = \
                [algo for algo in algos[args.algorithm]
                 if algo.name == args.sub_algorithm]

    algos_flat = []
    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)
    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, queries, library, algo,
                  args.distance, send_pipe, args.runs, args.single))
        p.start()
        send_pipe.close()
        timed_out = False
        try:
            results = recv_pipe.poll(args.timeout)
            if results:
                # If there's something waiting in the pipe at this point,
                # then the worker has begun sending us results and we
                # should receive them
                results = recv_pipe.recv()
                if "expect_extra" in results:
                    if results["expect_extra"]:
                        results["extra"] = recv_pipe.recv()
                    del results["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results,
                # then terminate the worker process (XXX: what should we do
                # about algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()
        if results:
            store_results(results, args.dataset, args.limit, args.count,
                          args.distance, args.query_dataset)
        elif timed_out:
            print('(algorithm worker process took too long)')
        else:
            print('(algorithm worker process stopped unexpectedly)')
    help='Plot runs in batch mode',
    action='store_true')
parser.add_argument(
    '--output',
    help='Path to the output csv file')
parser.add_argument(
    '--recompute',
    action='store_true',
    help='Clears the cache and recomputes the metrics')
args = parser.parse_args()

count = int(args.count)
rows = []
for dataset_name in datasets:
    print("Looking at dataset", dataset_name)
    dataset = get_dataset(dataset_name)
    unique_algorithms = get_unique_algorithms()
    print('Loading results')
    results = load_all_results(dataset_name, count, True, args.batch)
    print('... done')
    results = compute_metrics_all_runs(list(dataset["distances"]), results,
                                       args.recompute)
    rows.extend(results)

print('Build dataframe')
data = pd.DataFrame(rows)
print('... done')
print(data.groupby(['dataset', 'count', 'algorithm', 'parameters']).count())
with open(args.output, 'w') as fp:
    data.to_csv(fp, index=False)
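# Example invocation for the CSV export above. The script name is an
# assumption; only --output is consumed unconditionally by this fragment:
#
#     python data_export.py --output results.csv --recompute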
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or '
             '-1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(args.definitions, dimension, point_type,
                                  distance, args.count)

    # Filter out, from the loaded definitions, all those query argument
    # groups that correspond to experiments that have already been run.
    # (This might mean removing a definition altogether, so we can't just
    # use a list comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(args.dataset, args.count, definition,
                                     query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag, _ = tag.split(':')
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [d for d in definitions
                           if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print('missing docker images:',
                  set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [d for d in definitions
                           if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a
            # constructor of the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, """\
%s.%s(%s): error: the module '%s' does not expose the named constructor""" % (
                df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                print("""\
%s.%s(%s): warning: the module '%s' could not be loaded; skipping""" % (
                    df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            print('Not running disabled algorithms:',
                  [d for d in definitions if d.disabled])
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except:
            traceback.print_exc()
def run(definition, dataset, count, run_count=3, force_single=False,
        use_batch_query=False):
    algo = instantiate_algorithm(definition)

    D = get_dataset(dataset)
    X_train = numpy.array(D['train'])
    X_test = numpy.array(D['test'])
    distance = D.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    try:
        t0 = time.time()
        index_size_before = algo.get_index_size("self")
        algo.fit(X_train)
        build_time = time.time() - t0
        index_size = algo.get_index_size("self") - index_size_before
        print('Built index in', build_time)
        print('Index size: ', index_size)

        best_search_time = float('inf')
        for i in range(run_count):
            print('Run %d/%d...' % (i + 1, run_count))
            # a bit dumb, but it can't be a scalar because of Python's
            # scoping rules
            n_items_processed = [0]

            def single_query(v):
                start = time.time()
                candidates = algo.query(v, count)
                total = (time.time() - start)
                candidates = [
                    (int(idx),
                     float(metrics[distance]['distance'](v, X_train[idx])))
                    for idx in candidates
                ]
                n_items_processed[0] += 1
                if n_items_processed[0] % 1000 == 0:
                    print('Processed %d/%d queries...' %
                          (n_items_processed[0], X_test.shape[0]))
                if len(candidates) > count:
                    print('warning: algorithm %s returned %d results, '
                          'but count is only %d' %
                          (algo.name, len(candidates), count))
                return (total, candidates)

            def batch_query(X):
                start = time.time()
                result = algo.batch_query(X, count)
                total = (time.time() - start)
                # NB: zip over `result` (the neighbour lists just returned),
                # not the enclosing scope's `results`, which is unbound at
                # this point
                candidates = [
                    [(int(idx),
                      float(metrics[distance]['distance'](v, X_train[idx])))
                     for idx in single_results]
                    for v, single_results in zip(X, result)
                ]
                return [(total / float(len(X)), v) for v in candidates]

            if use_batch_query:
                results = batch_query(X_test)
            elif algo.use_threads() and not force_single:
                pool = multiprocessing.pool.ThreadPool()
                results = pool.map(single_query, X_test)
            else:
                results = [single_query(x) for x in X_test]

            total_time = sum(time for time, _ in results)
            total_candidates = sum(len(candidates)
                                   for _, candidates in results)
            search_time = total_time / len(X_test)
            avg_candidates = total_candidates / len(X_test)
            best_search_time = min(best_search_time, search_time)

        verbose = hasattr(algo, "query_verbose")
        attrs = {
            "batch_mode": use_batch_query,
            "build_time": build_time,
            "best_search_time": best_search_time,
            "candidates": avg_candidates,
            "expect_extra": verbose,
            "index_size": index_size,
            "name": algo.name,
            "run_count": run_count,
            "run_alone": force_single,
        }
        store_results(dataset, count, definition, attrs, results)
    finally:
        algo.done()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or '
             '-1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if hasattr(args, "list_algorithms"):
        list_algorithms(args.definitions)
        sys.exit(0)

    # See which Docker images we have available
    docker_client = docker.from_env()
    docker_tags = set()
    for image in docker_client.images.list():
        for tag in image.tags:
            tag, _ = tag.split(':')
            docker_tags.add(tag)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(args.definitions, dimension, point_type,
                                  distance, args.count)

    # TODO(erikbern): should make this a helper function somewhere
    definitions = [
        definition for definition in definitions
        if not os.path.exists(
            get_result_filename(args.dataset, args.count, definition))
    ]

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if args.docker_tag:
        print('running only', args.docker_tag)
        definitions = [d for d in definitions
                       if d.docker_tag == args.docker_tag]

    if set(d.docker_tag for d in definitions).difference(docker_tags):
        print('not all docker images available, only:', set(docker_tags))
        print('missing docker images:',
              set(d.docker_tag for d in definitions).difference(docker_tags))
        definitions = [d for d in definitions if d.docker_tag in docker_tags]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    print('order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except:
            traceback.print_exc()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=2)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, or '
             '-1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')

    args = parser.parse_args()
    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(args.definitions, dimension, point_type,
                                  distance, args.count)

    # Filter out, from the loaded definitions, all those query argument
    # groups that correspond to experiments that have already been run.
    # (This might mean removing a definition altogether, so we can't just
    # use a list comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(args.dataset, args.count, definition,
                                     query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag, _ = tag.split(':')
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag
            ]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print('missing docker images:',
                  set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags
            ]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a
            # constructor of the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, """\
%s.%s(%s): error: the module '%s' does not expose the named constructor""" % (
                df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                print("""\
%s.%s(%s): warning: the module '%s' could not be loaded; skipping""" % (
                    df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            print('Not running disabled algorithms:',
                  [d for d in definitions if d.disabled])
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except:
            traceback.print_exc()
    action='store_true')
parser.add_argument(
    '-Y', '--y-log',
    help='Draw the Y-axis using a logarithmic scale',
    action='store_true')
parser.add_argument(
    '--raw',
    help='Show raw results (not just Pareto frontier) in faded colours',
    action='store_true')
args = parser.parse_args()

if not args.output:
    args.output = 'results/%s.png' % args.dataset
print('writing output to %s' % args.output)

dataset = get_dataset(args.dataset)
dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
point_type = 'float'  # TODO(erikbern): should look at the type of X_train
distance = dataset.attrs['distance']
count = int(args.count)
definitions = get_definitions(args.definitions, dimension, point_type,
                              distance, count)

unique_algorithms = get_unique_algorithms(args.definitions)
linestyles = create_linestyles(unique_algorithms)
results = load_results(args.dataset, count, definitions)
runs = compute_metrics(list(dataset["distances"]), results,
                       args.x_axis, args.y_axis)
if not runs:
    raise Exception('Nothing to plot')

create_plot(runs, args.raw, args.x_log, args.y_log, args.x_axis,
            args.y_axis, args.output, linestyles)
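# create_linestyles is used by both plotting fragments but is not defined
# in these snippets. A minimal sketch, assuming it maps each algorithm name
# to a distinct matplotlib colour/marker/linestyle combination (the concrete
# scheme and return format are assumptions):
import itertools
import matplotlib.pyplot as plt

def create_linestyles(unique_algorithms):
    # Cycle through a fixed marker set; colours come from a qualitative
    # colormap indexed by the algorithm's position.
    markers = itertools.cycle(('o', '<', '*', 'x', 'v', '^', '+'))
    return {
        algo: (plt.cm.tab20(i % 20), next(markers), '-')
        for i, algo in enumerate(unique_algorithms)
    }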