def test_all_algos():
    definitions = get_definitions("algos.yaml")
    for metric in ['angular', 'euclidean']:
        algos = get_algorithms(definitions, constructors,
                               len(X[0]), "float", metric, 10)
        for algo_key in algos.keys():
            algo = random.choice(algos[algo_key])  # Just pick one of each
            # pass name just so unittest can capture it
            yield check_algo, algo.name, algo
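
# For context: test_all_algos above is a nose-style test generator. Each
# yielded (callable, *args) tuple is collected by the runner and reported as
# a separate test case. Below is a minimal, self-contained sketch of the same
# pattern; check_algo and the algos dict here are hypothetical stand-ins for
# the project's real checker and registry, not part of this codebase.
import random

def check_algo(name, algo):
    assert algo is not None, '%s failed to instantiate' % name

def test_generator_demo():
    algos = {'bruteforce': ['bf'], 'annoy': ['annoy-10', 'annoy-50']}
    for key in algos:
        algo = random.choice(algos[key])  # just pick one of each
        yield check_algo, key, algo  # name first so the runner can label it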
    '--y-log',
    help='Draw the Y-axis using a logarithmic scale',
    action='store_true')
parser.add_argument(
    '--raw',
    help='Show raw results (not just Pareto frontier) in faded colours',
    action='store_true')
args = parser.parse_args()

if not args.output:
    args.output = 'results/%s.png' % args.dataset
    print('writing output to %s' % args.output)

dataset = get_dataset(args.dataset)
dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
point_type = 'float'  # TODO(erikbern): should look at the type of X_train
distance = dataset.attrs['distance']
count = int(args.count)
definitions = get_definitions(
    args.definitions, dimension, point_type, distance, count)
unique_algorithms = get_unique_algorithms(args.definitions)
linestyles = create_linestyles(unique_algorithms)
results = load_results(args.dataset, count, definitions)
runs = compute_metrics(list(dataset["distances"]), results,
                       args.x_axis, args.y_axis)
if not runs:
    raise Exception('Nothing to plot')

create_plot(runs, args.raw, args.x_log, args.y_log,
            args.x_axis, args.y_axis, args.output, linestyles)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=2)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(
                args.dataset, args.count, definition, query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print('missing docker images:',
                  set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, \
                "%s.%s(%s): error: the module '%s' does not expose the " \
                "named constructor" % (
                    df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                print("%s.%s(%s): warning: the module '%s' could not be "
                      "loaded; skipping" % (
                          df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            print('Not running disabled algorithms:',
                  [d for d in definitions if d.disabled])
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()
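
# The filtering loop above relies on namedtuple's _replace, which returns a
# copy of the tuple with the named fields swapped out (definitions are
# immutable tuples, so they can't be edited in place). A small illustration
# with a made-up Definition type, not the project's real one:
from collections import namedtuple

Definition = namedtuple('Definition', ['algorithm', 'query_argument_groups'])

d = Definition(algorithm='annoy', query_argument_groups=[[100], [200], [400]])
d2 = d._replace(query_argument_groups=[[200]])  # keep only the unrun groups
assert d.query_argument_groups == [[100], [200], [400]]  # original untouched
assert d2.algorithm == 'annoy'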
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only'
             ' the best result',
        default=5)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=2 * 3600)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--batch',
        action='store_true',
        help='If set, algorithms get all queries at once')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')
    parser.add_argument(
        '--parallelism',
        type=positive_int,
        help='Number of Docker containers in parallel',
        default=1)
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("annb")

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset, dimension = get_dataset(args.dataset)
    point_type = dataset.attrs.get('point_type', 'float')
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(args.dataset, args.count, definition,
                                     query_arguments, args.batch)
            if args.force or not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        logger.info(f'running only {args.algorithm}')
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]
                docker_tags.add(tag)

        if args.docker_tag:
            logger.info(f'running only {args.docker_tag}')
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            logger.info(
                f'not all docker images available, only: {set(docker_tags)}')
            logger.info(
                f'missing docker images: '
                f'{set(d.docker_tag for d in definitions).difference(docker_tags)}')
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            if status == InstantiationStatus.NO_CONSTRUCTOR:
                raise Exception(
                    "%s.%s(%s): error: the module '%s' does not"
                    " expose the named constructor" % (
                        df.module, df.constructor, df.arguments, df.module))
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                logging.warning(
                    "%s.%s(%s): the module '%s' could not be "
                    "loaded; skipping" % (
                        df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            logger.info('Not running disabled algorithms: '
                        f'{[d for d in definitions if d.disabled]}')
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        logger.info(f'Order: {definitions}')

    if args.parallelism > multiprocessing.cpu_count() - 1:
        raise Exception('Parallelism larger than %d! (CPU count minus one)' %
                        (multiprocessing.cpu_count() - 1))

    # Multiprocessing magic to farm this out to all CPUs
    queue = multiprocessing.Queue()
    for definition in definitions:
        queue.put(definition)

    if args.batch and args.parallelism > 1:
        raise Exception(
            'Batch mode uses all available CPU resources, --parallelism '
            f'should be set to 1. (Was: {args.parallelism})')

    workers = [
        multiprocessing.Process(target=run_worker, args=(i + 1, args, queue))
        for i in range(args.parallelism)]
    [worker.start() for worker in workers]
    [worker.join() for worker in workers]
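
# The parallel path above hands every worker process the same shared queue of
# definitions. run_worker itself is not part of this excerpt; the sketch
# below shows one plausible shape for it. The exact signatures of run and
# run_docker (e.g. whether they also take args.timeout or args.batch) are
# assumptions, not taken from this code.
import multiprocessing
from queue import Empty

def run_worker(worker_id, args, queue):
    # Drain the shared queue until it is empty, dispatching each definition
    # the same way the serial path does.
    while True:
        try:
            definition = queue.get(timeout=1)
        except Empty:
            break
        if args.local:
            run(definition, args.dataset, args.count, args.runs)
        else:
            run_docker(definition, args.dataset, args.count, args.runs)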
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    if hasattr(args, "list_algorithms"):
        list_algorithms(args.definitions)
        sys.exit(0)

    # See which Docker images we have available
    docker_client = docker.from_env()
    docker_tags = set()
    for image in docker_client.images.list():
        for tag in image.tags:
            tag = tag.split(':')[0]
            docker_tags.add(tag)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # TODO(erikbern): should make this a helper function somewhere
    definitions = [
        definition for definition in definitions
        if not os.path.exists(
            get_result_filename(args.dataset, args.count, definition))]

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if args.docker_tag:
        print('running only', args.docker_tag)
        definitions = [
            d for d in definitions if d.docker_tag == args.docker_tag]

    if set(d.docker_tag for d in definitions).difference(docker_tags):
        print('not all docker images available, only:', set(docker_tags))
        print('missing docker images:',
              set(d.docker_tag for d in definitions).difference(docker_tags))
        definitions = [d for d in definitions if d.docker_tag in docker_tags]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    print('order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular')
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--single',
        help='run only a single algorithm instance at a time',
        action='store_true')
    parser.add_argument(
        '--batch',
        help='Provide Queryset as Batch',
        action='store_true')
    parser.add_argument(
        '--no_save_index',
        help='do not save indices',
        action='store_true')
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print('The following algorithms are supported...')
        for point in definitions:
            print('\t... for the point type "%s"...' % point)
            for metric in definitions[point]:
                print('\t\t... and the distance metric "%s":' % metric)
                for algorithm in definitions[point][metric]:
                    print('\t\t\t%s' % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    X_train = dataset['train']
    X_test = dataset['test']
    distance = dataset.attrs['distance']
    print('got a train set of size (%d * %d)' % X_train.shape)
    print('got %d queries' % len(X_test))

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.count, distance):
            algos_already_run.add((run.attrs["library"], run.attrs["name"]))

    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    algos = get_algorithms(definitions, constructors,
                           len(X_train[0]), point_type, distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = [
                algo for algo in algos[args.algorithm]
                if algo.name == args.sub_algorithm]

    algos_flat = []
    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)
    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, X_test, library, algo, distance,
                  send_pipe, args.runs, args.single, args.batch))
        p.start()
        send_pipe.close()
        timed_out = False
        try:
            r = recv_pipe.poll(args.timeout)
            if r:
                # If there's something waiting in the pipe at this point,
                # then the worker has begun sending us results and we should
                # receive them
                attrs, results = recv_pipe.recv()
                if "expect_extra" in attrs:
                    if attrs["expect_extra"]:
                        attrs["extra"] = recv_pipe.recv()
                    del attrs["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results,
                # then terminate the worker process (XXX: what should we do
                # about algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()
        if results:
            store_results(attrs, results, args.dataset, args.count, distance)
        elif timed_out:
            print('algorithm worker process took too long')
        else:
            print('algorithm worker process stopped unexpectedly')
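
# The parent/worker handshake above implements a soft timeout with
# multiprocessing.Pipe plus poll(timeout): the parent waits for the first
# message for at most args.timeout seconds and terminates the worker if
# nothing arrives. Below is a self-contained demo of that pattern; all names
# in it are invented for the demo.
import multiprocessing
import time

def _demo_worker(conn, delay):
    time.sleep(delay)           # pretend to build an index and run queries
    conn.send({'ok': True})
    conn.close()

if __name__ == '__main__':
    recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
    p = multiprocessing.Process(target=_demo_worker, args=(send_pipe, 1))
    p.start()
    send_pipe.close()           # parent keeps only the receiving end
    if recv_pipe.poll(5):       # True as soon as data is waiting
        print('worker sent:', recv_pipe.recv())
    else:
        p.terminate()           # timed out: kill the worker, discard the run
        print('worker took too long')
    p.join()
    recv_pipe.close()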
    action='store_true')
parser.add_argument(
    '-Y', '--y-log',
    help='Draw the Y-axis using a logarithmic scale',
    action='store_true')
parser.add_argument(
    '--raw',
    help='Show raw results (not just Pareto frontier) in faded colours',
    action='store_true')
args = parser.parse_args()

if not args.output:
    args.output = 'results/%s.png' % args.dataset
    print('writing output to %s' % args.output)

dataset = get_dataset(args.dataset)
dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
point_type = 'float'  # TODO(erikbern): should look at the type of X_train
distance = dataset.attrs['distance']
count = int(args.count)
definitions = get_definitions(
    args.definitions, dimension, point_type, distance, count)
unique_algorithms = get_unique_algorithms(args.definitions)
linestyles = create_linestyles(unique_algorithms)
results = load_results(args.dataset, count, definitions)
runs = compute_metrics(list(dataset["distances"]), results,
                       args.x_axis, args.y_axis)
if not runs:
    raise Exception('Nothing to plot')

create_plot(runs, args.raw, args.x_log, args.y_log,
            args.x_axis, args.y_axis, args.output, linestyles)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove')
    parser.add_argument(
        '--query-dataset',
        metavar='NAME',
        help='load query points from another dataset instead of choosing '
             'them randomly from the training dataset',
        default=None)
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--distance',
        help='the metric used to calculate the distance between points',
        default='angular')
    parser.add_argument(
        '--limit',
        help='the maximum number of points to load from the dataset, '
             'or -1 to load all of them',
        type=int,
        default=-1)
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--sub-algorithm',
        metavar='NAME',
        help='run only the named instance of an algorithm (requires --algo)',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true',
        default=argparse.SUPPRESS)
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--single',
        help='run only a single algorithm instance at a time',
        action='store_true')
    parser.add_argument(
        '--no_save_index',
        help='do not save indices',
        action='store_true')
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    definitions = get_definitions(args.definitions)
    if hasattr(args, "list_algorithms"):
        print("The following algorithms are supported...")
        for point in definitions:
            print("\t... for the point type '%s'..." % point)
            for metric in definitions[point]:
                print("\t\t... and the distance metric '%s':" % metric)
                for algorithm in definitions[point][metric]:
                    print("\t\t\t%s" % algorithm)
        sys.exit(0)

    # Set resource limits to prevent memory bombs
    memory_limit = 12 * 2**30
    soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
    if soft == resource.RLIM_INFINITY or soft >= memory_limit:
        print('resetting memory limit from', soft, 'to', memory_limit)
        resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    manifest, X = get_dataset(args.dataset, args.limit)
    if not args.query_dataset:
        X_train, X_test = split_dataset(
            X, test_size=manifest['dataset']['test_size'])
    else:
        X_train = X
        query_manifest, X_test = get_dataset(args.query_dataset)
        assert manifest["dataset"] == query_manifest["dataset"], \
            "error: the training dataset and query dataset have " \
            "incompatible manifests"

    queries_fn = get_query_cache_path(
        args.dataset, args.count, args.limit, args.distance,
        args.query_dataset)
    print('storing queries in', queries_fn)

    if not os.path.exists(queries_fn):
        queries = compute_distances(args.distance, args.count,
                                    X_train, X_test)
        with open(queries_fn, 'wb') as f:  # binary mode for pickle
            pickle.dump(queries, f)
    else:
        with open(queries_fn, 'rb') as f:
            queries = pickle.load(f)

    print('got', len(queries), 'queries')

    algos_already_run = set()
    if not args.force:
        for run in get_results(args.dataset, args.limit, args.count,
                               args.distance, args.query_dataset):
            algos_already_run.add((run["library"], run["name"]))

    point_type = manifest['dataset']['point_type']
    algos = get_algorithms(definitions, constructors, len(X_train[0]),
                           point_type, args.distance, args.count)

    if args.algorithm:
        print('running only', args.algorithm)
        algos = {args.algorithm: algos[args.algorithm]}
        if args.sub_algorithm:
            algos[args.algorithm] = [
                algo for algo in algos[args.algorithm]
                if algo.name == args.sub_algorithm]

    algos_flat = []
    for library in algos.keys():
        for algo in algos[library]:
            if (library, algo.name) not in algos_already_run:
                algos_flat.append((library, algo))

    random.shuffle(algos_flat)
    print('order:', [a.name for l, a in algos_flat])

    for library, algo in algos_flat:
        recv_pipe, send_pipe = multiprocessing.Pipe(duplex=False)
        print(algo.name, '...')
        # Spawn a subprocess to force the memory to be reclaimed at the end
        p = multiprocessing.Process(
            target=run_algo,
            args=(args.count, X_train, queries, library, algo,
                  args.distance, send_pipe, args.runs, args.single))
        p.start()
        send_pipe.close()
        timed_out = False
        try:
            if recv_pipe.poll(args.timeout):
                # If there's something waiting in the pipe at this point,
                # then the worker has begun sending us results and we should
                # receive them
                results = recv_pipe.recv()
                if "expect_extra" in results:
                    if results["expect_extra"]:
                        results["extra"] = recv_pipe.recv()
                    del results["expect_extra"]
            else:
                # If we've exceeded the timeout and there are no results,
                # then terminate the worker process (XXX: what should we do
                # about algo.done() here?)
                p.terminate()
                timed_out = True
                results = None
        except EOFError:
            # The worker has crashed or otherwise failed to send us results
            results = None
        p.join()
        recv_pipe.close()
        if results:
            store_results(results, args.dataset, args.limit, args.count,
                          args.distance, args.query_dataset)
        elif timed_out:
            print('(algorithm worker process took too long)')
        else:
            print('(algorithm worker process stopped unexpectedly)')
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--dataset',
        metavar='NAME',
        help='the dataset to load training points from',
        default='glove-100-angular',
        choices=DATASETS.keys())
    parser.add_argument(
        "-k", "--count",
        default=10,
        type=positive_int,
        help="the number of near neighbours to search for")
    parser.add_argument(
        '--definitions',
        metavar='FILE',
        help='load algorithm definitions from FILE',
        default='algos.yaml')
    parser.add_argument(
        '--algorithm',
        metavar='NAME',
        help='run only the named algorithm',
        default=None)
    parser.add_argument(
        '--docker-tag',
        metavar='NAME',
        help='run only algorithms in a particular docker image',
        default=None)
    parser.add_argument(
        '--list-algorithms',
        help='print the names of all known algorithms and exit',
        action='store_true')
    parser.add_argument(
        '--force',
        help='re-run algorithms even if their results already exist',
        action='store_true')
    parser.add_argument(
        '--runs',
        metavar='COUNT',
        type=positive_int,
        help='run each algorithm instance %(metavar)s times and use only '
             'the best result',
        default=3)
    parser.add_argument(
        '--timeout',
        type=int,
        help='Timeout (in seconds) for each individual algorithm run, '
             'or -1 if no timeout should be set',
        default=-1)
    parser.add_argument(
        '--local',
        action='store_true',
        help='If set, then will run everything locally (inside the same '
             'process) rather than using Docker')
    parser.add_argument(
        '--max-n-algorithms',
        type=int,
        help='Max number of algorithms to run (just used for testing)',
        default=-1)
    parser.add_argument(
        '--run-disabled',
        help='run algorithms that are disabled in algos.yml',
        action='store_true')
    args = parser.parse_args()

    if args.timeout == -1:
        args.timeout = None

    if args.list_algorithms:
        list_algorithms(args.definitions)
        sys.exit(0)

    # Nmslib specific code
    # Remove old indices stored on disk
    if os.path.exists(INDEX_DIR):
        shutil.rmtree(INDEX_DIR)

    dataset = get_dataset(args.dataset)
    dimension = len(dataset['train'][0])  # TODO(erikbern): ugly
    point_type = 'float'  # TODO(erikbern): should look at the type of X_train
    distance = dataset.attrs['distance']
    definitions = get_definitions(
        args.definitions, dimension, point_type, distance, args.count)

    # Filter out, from the loaded definitions, all those query argument groups
    # that correspond to experiments that have already been run. (This might
    # mean removing a definition altogether, so we can't just use a list
    # comprehension.)
    filtered_definitions = []
    for definition in definitions:
        query_argument_groups = definition.query_argument_groups
        if not query_argument_groups:
            query_argument_groups = [[]]
        not_yet_run = []
        for query_arguments in query_argument_groups:
            fn = get_result_filename(
                args.dataset, args.count, definition, query_arguments)
            if not os.path.exists(fn):
                not_yet_run.append(query_arguments)
        if not_yet_run:
            if definition.query_argument_groups:
                definition = definition._replace(
                    query_argument_groups=not_yet_run)
            filtered_definitions.append(definition)
    definitions = filtered_definitions

    random.shuffle(definitions)

    if args.algorithm:
        print('running only', args.algorithm)
        definitions = [d for d in definitions if d.algorithm == args.algorithm]

    if not args.local:
        # See which Docker images we have available
        docker_client = docker.from_env()
        docker_tags = set()
        for image in docker_client.images.list():
            for tag in image.tags:
                tag = tag.split(':')[0]
                docker_tags.add(tag)

        if args.docker_tag:
            print('running only', args.docker_tag)
            definitions = [
                d for d in definitions if d.docker_tag == args.docker_tag]

        if set(d.docker_tag for d in definitions).difference(docker_tags):
            print('not all docker images available, only:', set(docker_tags))
            print('missing docker images:',
                  set(d.docker_tag for d in definitions).difference(docker_tags))
            definitions = [
                d for d in definitions if d.docker_tag in docker_tags]
    else:
        def _test(df):
            status = algorithm_status(df)
            # If the module was loaded but doesn't actually have a constructor
            # of the right name, then the definition is broken
            assert status != InstantiationStatus.NO_CONSTRUCTOR, \
                "%s.%s(%s): error: the module '%s' does not expose the " \
                "named constructor" % (
                    df.module, df.constructor, df.arguments, df.module)
            if status == InstantiationStatus.NO_MODULE:
                # If the module couldn't be loaded (presumably because of a
                # missing dependency), print a warning and remove this
                # definition from the list of things to be run
                print("%s.%s(%s): warning: the module '%s' could not be "
                      "loaded; skipping" % (
                          df.module, df.constructor, df.arguments, df.module))
                return False
            else:
                return True
        definitions = [d for d in definitions if _test(d)]

    if not args.run_disabled:
        if len([d for d in definitions if d.disabled]):
            print('Not running disabled algorithms:',
                  [d for d in definitions if d.disabled])
        definitions = [d for d in definitions if not d.disabled]

    if args.max_n_algorithms >= 0:
        definitions = definitions[:args.max_n_algorithms]

    if len(definitions) == 0:
        raise Exception('Nothing to run')
    else:
        print('Order:', definitions)

    for definition in definitions:
        print(definition, '...')
        try:
            if args.local:
                run(definition, args.dataset, args.count, args.runs)
            else:
                run_docker(definition, args.dataset, args.count, args.runs)
        except KeyboardInterrupt:
            break
        except Exception:
            traceback.print_exc()